diff options
Diffstat (limited to 'arch/arm64/kvm')
82 files changed, 11161 insertions, 4271 deletions
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 58f09370d17e..096e45acadb2 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -19,6 +19,7 @@ if VIRTUALIZATION menuconfig KVM bool "Kernel-based Virtual Machine (KVM) support" + depends on AS_HAS_ARMV8_4 select KVM_COMMON select KVM_GENERIC_HARDWARE_ENABLING select KVM_GENERIC_MMU_NOTIFIER @@ -65,4 +66,21 @@ config PROTECTED_NVHE_STACKTRACE If unsure, or not using protected nVHE (pKVM), say N. +config PTDUMP_STAGE2_DEBUGFS + bool "Present the stage-2 pagetables to debugfs" + depends on KVM + depends on DEBUG_KERNEL + depends on DEBUG_FS + depends on ARCH_HAS_PTDUMP + select PTDUMP + default n + help + Say Y here if you want to show the stage-2 kernel pagetables + layout in a debugfs file. This information is only useful for kernel developers + who are working in architecture specific areas of the kernel. + It is probably not a good idea to enable this feature in a production + kernel. + + If in doubt, say N. + endif # VIRTUALIZATION diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index c0c050e53157..209bc76263f1 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -3,26 +3,31 @@ # Makefile for Kernel-based Virtual Machine module # -ccflags-y += -I $(srctree)/$(src) +ccflags-y += -I $(src) include $(srctree)/virt/kvm/Makefile.kvm obj-$(CONFIG_KVM) += kvm.o obj-$(CONFIG_KVM) += hyp/ +CFLAGS_sys_regs.o += -Wno-override-init +CFLAGS_handle_exit.o += -Wno-override-init + kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \ inject_fault.o va_layout.o handle_exit.o \ guest.o debug.o reset.o sys_regs.o stacktrace.o \ vgic-sys-reg-v3.o fpsimd.o pkvm.o \ - arch_timer.o trng.o vmid.o emulate-nested.o nested.o \ + arch_timer.o trng.o vmid.o emulate-nested.o nested.o at.o \ vgic/vgic.o vgic/vgic-init.o \ vgic/vgic-irqfd.o vgic/vgic-v2.o \ vgic/vgic-v3.o vgic/vgic-v4.o \ vgic/vgic-mmio.o vgic/vgic-mmio-v2.o \ vgic/vgic-mmio-v3.o vgic/vgic-kvm-device.o \ - vgic/vgic-its.o vgic/vgic-debug.o + vgic/vgic-its.o vgic/vgic-debug.o vgic/vgic-v3-nested.o kvm-$(CONFIG_HW_PERF_EVENTS) += pmu-emul.o pmu.o +kvm-$(CONFIG_ARM64_PTR_AUTH) += pauth.o +kvm-$(CONFIG_PTDUMP_STAGE2_DEBUGFS) += ptdump.o always-y := hyp_constants.h hyp-constants.s @@ -30,7 +35,7 @@ define rule_gen_hyp_constants $(call filechk,offsets,__HYP_CONSTANTS_H__) endef -CFLAGS_hyp-constants.o = -I $(srctree)/$(src)/hyp/include +CFLAGS_hyp-constants.o = -I $(src)/hyp/include $(obj)/hyp-constants.s: $(src)/hyp/hyp-constants.c FORCE $(call if_changed_dep,cc_s_c) diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index 879982b1cc73..5133dcbfe9f7 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -30,6 +30,7 @@ static u32 host_vtimer_irq_flags; static u32 host_ptimer_irq_flags; static DEFINE_STATIC_KEY_FALSE(has_gic_active_state); +DEFINE_STATIC_KEY_FALSE(broken_cntvoff_key); static const u8 default_ppi[] = { [TIMER_PTIMER] = 30, @@ -101,21 +102,6 @@ u64 timer_get_cval(struct arch_timer_context *ctxt) } } -static u64 timer_get_offset(struct arch_timer_context *ctxt) -{ - u64 offset = 0; - - if (!ctxt) - return 0; - - if (ctxt->offset.vm_offset) - offset += *ctxt->offset.vm_offset; - if (ctxt->offset.vcpu_offset) - offset += *ctxt->offset.vcpu_offset; - - return offset; -} - static void timer_set_ctl(struct arch_timer_context *ctxt, u32 ctl) { struct kvm_vcpu *vcpu = ctxt->vcpu; @@ -206,8 +192,7 @@ void get_timer_map(struct kvm_vcpu *vcpu, struct timer_map *map) static inline bool userspace_irqchip(struct kvm *kvm) { - return static_branch_unlikely(&userspace_irqchip_in_use) && - unlikely(!irqchip_in_kernel(kvm)); + return unlikely(!irqchip_in_kernel(kvm)); } static void soft_timer_start(struct hrtimer *hrt, u64 ns) @@ -442,22 +427,39 @@ void kvm_timer_update_run(struct kvm_vcpu *vcpu) regs->device_irq_level |= KVM_ARM_DEV_EL1_PTIMER; } +static void kvm_timer_update_status(struct arch_timer_context *ctx, bool level) +{ + /* + * Paper over NV2 brokenness by publishing the interrupt status + * bit. This still results in a poor quality of emulation (guest + * writes will have no effect until the next exit). + * + * But hey, it's fast, right? + */ + if (is_hyp_ctxt(ctx->vcpu) && + (ctx == vcpu_vtimer(ctx->vcpu) || ctx == vcpu_ptimer(ctx->vcpu))) { + unsigned long val = timer_get_ctl(ctx); + __assign_bit(__ffs(ARCH_TIMER_CTRL_IT_STAT), &val, level); + timer_set_ctl(ctx, val); + } +} + static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level, struct arch_timer_context *timer_ctx) { - int ret; + kvm_timer_update_status(timer_ctx, new_level); timer_ctx->irq.level = new_level; trace_kvm_timer_update_irq(vcpu->vcpu_id, timer_irq(timer_ctx), timer_ctx->irq.level); - if (!userspace_irqchip(vcpu->kvm)) { - ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu, - timer_irq(timer_ctx), - timer_ctx->irq.level, - timer_ctx); - WARN_ON(ret); - } + if (userspace_irqchip(vcpu->kvm)) + return; + + kvm_vgic_inject_irq(vcpu->kvm, vcpu, + timer_irq(timer_ctx), + timer_ctx->irq.level, + timer_ctx); } /* Only called for a fully emulated timer */ @@ -467,10 +469,10 @@ static void timer_emulate(struct arch_timer_context *ctx) trace_kvm_timer_emulate(ctx, should_fire); - if (should_fire != ctx->irq.level) { + if (should_fire != ctx->irq.level) kvm_timer_update_irq(ctx->vcpu, should_fire, ctx); - return; - } + + kvm_timer_update_status(ctx, should_fire); /* * If the timer can fire now, we don't need to have a soft timer @@ -514,7 +516,12 @@ static void timer_save_state(struct arch_timer_context *ctx) case TIMER_VTIMER: case TIMER_HVTIMER: timer_set_ctl(ctx, read_sysreg_el0(SYS_CNTV_CTL)); - timer_set_cval(ctx, read_sysreg_el0(SYS_CNTV_CVAL)); + cval = read_sysreg_el0(SYS_CNTV_CVAL); + + if (has_broken_cntvoff()) + cval -= timer_get_offset(ctx); + + timer_set_cval(ctx, cval); /* Disable the timer */ write_sysreg_el0(0, SYS_CNTV_CTL); @@ -619,8 +626,15 @@ static void timer_restore_state(struct arch_timer_context *ctx) case TIMER_VTIMER: case TIMER_HVTIMER: - set_cntvoff(timer_get_offset(ctx)); - write_sysreg_el0(timer_get_cval(ctx), SYS_CNTV_CVAL); + cval = timer_get_cval(ctx); + offset = timer_get_offset(ctx); + if (has_broken_cntvoff()) { + set_cntvoff(0); + cval += offset; + } else { + set_cntvoff(offset); + } + write_sysreg_el0(cval, SYS_CNTV_CVAL); isb(); write_sysreg_el0(timer_get_ctl(ctx), SYS_CNTV_CTL); break; @@ -743,27 +757,12 @@ static void kvm_timer_vcpu_load_nested_switch(struct kvm_vcpu *vcpu, timer_irq(map->direct_ptimer), &arch_timer_irq_ops); WARN_ON_ONCE(ret); - - /* - * The virtual offset behaviour is "interesting", as it - * always applies when HCR_EL2.E2H==0, but only when - * accessed from EL1 when HCR_EL2.E2H==1. So make sure we - * track E2H when putting the HV timer in "direct" mode. - */ - if (map->direct_vtimer == vcpu_hvtimer(vcpu)) { - struct arch_timer_offset *offs = &map->direct_vtimer->offset; - - if (vcpu_el2_e2h_is_set(vcpu)) - offs->vcpu_offset = NULL; - else - offs->vcpu_offset = &__vcpu_sys_reg(vcpu, CNTVOFF_EL2); - } } } static void timer_set_traps(struct kvm_vcpu *vcpu, struct timer_map *map) { - bool tpt, tpc; + bool tvt, tpt, tvc, tpc, tvt02, tpt02; u64 clr, set; /* @@ -778,7 +777,29 @@ static void timer_set_traps(struct kvm_vcpu *vcpu, struct timer_map *map) * within this function, reality kicks in and we start adding * traps based on emulation requirements. */ - tpt = tpc = false; + tvt = tpt = tvc = tpc = false; + tvt02 = tpt02 = false; + + /* + * NV2 badly breaks the timer semantics by redirecting accesses to + * the EL1 timer state to memory, so let's call ECV to the rescue if + * available: we trap all CNT{P,V}_{CTL,CVAL,TVAL}_EL0 accesses. + * + * The treatment slightly varies depending whether we run a nVHE or + * VHE guest: nVHE will use the _EL0 registers directly, while VHE + * will use the _EL02 accessors. This translates in different trap + * bits. + * + * None of the trapping is required when running in non-HYP context, + * unless required by the L1 hypervisor settings once we advertise + * ECV+NV in the guest, or that we need trapping for other reasons. + */ + if (cpus_have_final_cap(ARM64_HAS_ECV) && is_hyp_ctxt(vcpu)) { + if (vcpu_el2_e2h_is_set(vcpu)) + tvt02 = tpt02 = true; + else + tvt = tpt = true; + } /* * We have two possibility to deal with a physical offset: @@ -794,9 +815,20 @@ static void timer_set_traps(struct kvm_vcpu *vcpu, struct timer_map *map) tpt = tpc = true; /* + * For the poor sods that could not correctly substract one value + * from another, trap the full virtual timer and counter. + */ + if (has_broken_cntvoff() && timer_get_offset(map->direct_vtimer)) + tvt = tvc = true; + + /* * Apply the enable bits that the guest hypervisor has requested for * its own guest. We can only add traps that wouldn't have been set * above. + * Implementation choices: we do not support NV when E2H=0 in the + * guest, and we don't support configuration where E2H is writable + * by the guest (either FEAT_VHE or FEAT_E2H0 is implemented, but + * not both). This simplifies the handling of the EL1NV* bits. */ if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) { u64 val = __vcpu_sys_reg(vcpu, CNTHCTL_EL2); @@ -807,6 +839,9 @@ static void timer_set_traps(struct kvm_vcpu *vcpu, struct timer_map *map) tpt |= !(val & (CNTHCTL_EL1PCEN << 10)); tpc |= !(val & (CNTHCTL_EL1PCTEN << 10)); + + tpt02 |= (val & CNTHCTL_EL1NVPCT); + tvt02 |= (val & CNTHCTL_EL1NVVCT); } /* @@ -818,6 +853,10 @@ static void timer_set_traps(struct kvm_vcpu *vcpu, struct timer_map *map) assign_clear_set_bit(tpt, CNTHCTL_EL1PCEN << 10, set, clr); assign_clear_set_bit(tpc, CNTHCTL_EL1PCTEN << 10, set, clr); + assign_clear_set_bit(tvt, CNTHCTL_EL1TVT, clr, set); + assign_clear_set_bit(tvc, CNTHCTL_EL1TVCT, clr, set); + assign_clear_set_bit(tvt02, CNTHCTL_EL1NVVCT, clr, set); + assign_clear_set_bit(tpt02, CNTHCTL_EL1NVPCT, clr, set); /* This only happens on VHE, so use the CNTHCTL_EL2 accessor. */ sysreg_clear_set(cnthctl_el2, clr, set); @@ -906,6 +945,44 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu) kvm_timer_blocking(vcpu); } +void kvm_timer_sync_nested(struct kvm_vcpu *vcpu) +{ + /* + * When NV2 is on, guest hypervisors have their EL1 timer register + * accesses redirected to the VNCR page. Any guest action taken on + * the timer is postponed until the next exit, leading to a very + * poor quality of emulation. + * + * This is an unmitigated disaster, only papered over by FEAT_ECV, + * which allows trapping of the timer registers even with NV2. + * Still, this is still worse than FEAT_NV on its own. Meh. + */ + if (!cpus_have_final_cap(ARM64_HAS_ECV)) { + /* + * For a VHE guest hypervisor, the EL2 state is directly + * stored in the host EL1 timers, while the emulated EL1 + * state is stored in the VNCR page. The latter could have + * been updated behind our back, and we must reset the + * emulation of the timers. + * + * A non-VHE guest hypervisor doesn't have any direct access + * to its timers: the EL2 registers trap despite being + * notionally direct (we use the EL1 HW, as for VHE), while + * the EL1 registers access memory. + * + * In both cases, process the emulated timers on each guest + * exit. Boo. + */ + struct timer_map map; + get_timer_map(vcpu, &map); + + soft_timer_cancel(&map.emul_vtimer->hrtimer); + soft_timer_cancel(&map.emul_ptimer->hrtimer); + timer_emulate(map.emul_vtimer); + timer_emulate(map.emul_ptimer); + } +} + /* * With a userspace irqchip we have to check if the guest de-asserted the * timer and if so, unmask the timer irq signal on the host interrupt @@ -993,8 +1070,7 @@ static void timer_context_init(struct kvm_vcpu *vcpu, int timerid) else ctxt->offset.vm_offset = &kvm->arch.timer_data.poffset; - hrtimer_init(&ctxt->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); - ctxt->hrtimer.function = kvm_hrtimer_expire; + hrtimer_setup(&ctxt->hrtimer, kvm_hrtimer_expire, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); switch (timerid) { case TIMER_PTIMER: @@ -1021,8 +1097,8 @@ void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) timer_set_offset(vcpu_ptimer(vcpu), 0); } - hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); - timer->bg_timer.function = kvm_bg_timer_expire; + hrtimer_setup(&timer->bg_timer, kvm_bg_timer_expire, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_HARD); } void kvm_timer_init_vm(struct kvm *kvm) @@ -1364,6 +1440,37 @@ static int kvm_irq_init(struct arch_timer_kvm_info *info) return 0; } +static void kvm_timer_handle_errata(void) +{ + u64 mmfr0, mmfr1, mmfr4; + + /* + * CNTVOFF_EL2 is broken on some implementations. For those, we trap + * all virtual timer/counter accesses, requiring FEAT_ECV. + * + * However, a hypervisor supporting nesting is likely to mitigate the + * erratum at L0, and not require other levels to mitigate it (which + * would otherwise be a terrible performance sink due to trap + * amplification). + * + * Given that the affected HW implements both FEAT_VHE and FEAT_E2H0, + * and that NV is likely not to (because of limitations of the + * architecture), only enable the workaround when FEAT_VHE and + * FEAT_E2H0 are both detected. Time will tell if this actually holds. + */ + mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); + mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); + mmfr4 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR4_EL1); + if (SYS_FIELD_GET(ID_AA64MMFR1_EL1, VH, mmfr1) && + !SYS_FIELD_GET(ID_AA64MMFR4_EL1, E2H0, mmfr4) && + SYS_FIELD_GET(ID_AA64MMFR0_EL1, ECV, mmfr0) && + (has_vhe() || has_hvhe()) && + cpus_have_final_cap(ARM64_WORKAROUND_QCOM_ORYON_CNTVOFF)) { + static_branch_enable(&broken_cntvoff_key); + kvm_info("Broken CNTVOFF_EL2, trapping virtual timer\n"); + } +} + int __init kvm_timer_hyp_init(bool has_gic) { struct arch_timer_kvm_info *info; @@ -1432,6 +1539,7 @@ int __init kvm_timer_hyp_init(bool has_gic) goto out_free_vtimer_irq; } + kvm_timer_handle_errata(); return 0; out_free_ptimer_irq: diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 3dee5490eea9..68fec8c95fee 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -35,21 +35,33 @@ #include <asm/virt.h> #include <asm/kvm_arm.h> #include <asm/kvm_asm.h> +#include <asm/kvm_emulate.h> #include <asm/kvm_mmu.h> #include <asm/kvm_nested.h> #include <asm/kvm_pkvm.h> -#include <asm/kvm_emulate.h> +#include <asm/kvm_ptrauth.h> #include <asm/sections.h> #include <kvm/arm_hypercalls.h> #include <kvm/arm_pmu.h> #include <kvm/arm_psci.h> +#include "sys_regs.h" + static enum kvm_mode kvm_mode = KVM_MODE_DEFAULT; +enum kvm_wfx_trap_policy { + KVM_WFX_NOTRAP_SINGLE_TASK, /* Default option */ + KVM_WFX_NOTRAP, + KVM_WFX_TRAP, +}; + +static enum kvm_wfx_trap_policy kvm_wfi_trap_policy __read_mostly = KVM_WFX_NOTRAP_SINGLE_TASK; +static enum kvm_wfx_trap_policy kvm_wfe_trap_policy __read_mostly = KVM_WFX_NOTRAP_SINGLE_TASK; + DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector); -DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); +DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_base); DECLARE_KVM_NVHE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); DECLARE_KVM_NVHE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); @@ -57,7 +69,6 @@ DECLARE_KVM_NVHE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); static bool vgic_present, kvm_arm_initialised; static DEFINE_PER_CPU(unsigned char, kvm_hyp_initialized); -DEFINE_STATIC_KEY_FALSE(userspace_irqchip_in_use); bool is_kvm_arm_initialised(void) { @@ -72,12 +83,14 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) { - int r; - u64 new_cap; + int r = -EINVAL; if (cap->flags) return -EINVAL; + if (kvm_vm_is_protected(kvm) && !kvm_pvm_ext_allowed(cap->cap)) + return -EINVAL; + switch (cap->cap) { case KVM_CAP_ARM_NISV_TO_USER: r = 0; @@ -86,9 +99,7 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, break; case KVM_CAP_ARM_MTE: mutex_lock(&kvm->lock); - if (!system_supports_mte() || kvm->created_vcpus) { - r = -EINVAL; - } else { + if (system_supports_mte() && !kvm->created_vcpus) { r = 0; set_bit(KVM_ARCH_FLAG_MTE_ENABLED, &kvm->arch.flags); } @@ -99,25 +110,30 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, set_bit(KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED, &kvm->arch.flags); break; case KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE: - new_cap = cap->args[0]; - mutex_lock(&kvm->slots_lock); /* * To keep things simple, allow changing the chunk * size only when no memory slots have been created. */ - if (!kvm_are_all_memslots_empty(kvm)) { - r = -EINVAL; - } else if (new_cap && !kvm_is_block_size_supported(new_cap)) { - r = -EINVAL; - } else { - r = 0; - kvm->arch.mmu.split_page_chunk_size = new_cap; + if (kvm_are_all_memslots_empty(kvm)) { + u64 new_cap = cap->args[0]; + + if (!new_cap || kvm_is_block_size_supported(new_cap)) { + r = 0; + kvm->arch.mmu.split_page_chunk_size = new_cap; + } } mutex_unlock(&kvm->slots_lock); break; + case KVM_CAP_ARM_WRITABLE_IMP_ID_REGS: + mutex_lock(&kvm->lock); + if (!kvm->created_vcpus) { + r = 0; + set_bit(KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS, &kvm->arch.flags); + } + mutex_unlock(&kvm->lock); + break; default: - r = -EINVAL; break; } @@ -132,6 +148,7 @@ static int kvm_arm_default_max_vcpus(void) /** * kvm_arch_init_vm - initializes a VM data structure * @kvm: pointer to the KVM struct + * @type: kvm device type */ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { @@ -147,6 +164,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) mutex_unlock(&kvm->lock); #endif + kvm_init_nested(kvm); + ret = kvm_share_hyp(kvm, kvm + 1); if (ret) return ret; @@ -193,6 +212,24 @@ vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) void kvm_arch_create_vm_debugfs(struct kvm *kvm) { kvm_sys_regs_create_debugfs(kvm); + kvm_s2_ptdump_create_debugfs(kvm); +} + +static void kvm_destroy_mpidr_data(struct kvm *kvm) +{ + struct kvm_mpidr_data *data; + + mutex_lock(&kvm->arch.config_lock); + + data = rcu_dereference_protected(kvm->arch.mpidr_data, + lockdep_is_held(&kvm->arch.config_lock)); + if (data) { + rcu_assign_pointer(kvm->arch.mpidr_data, NULL); + synchronize_rcu(); + kfree(data); + } + + mutex_unlock(&kvm->arch.config_lock); } /** @@ -209,7 +246,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm) if (is_protected_kvm_enabled()) pkvm_destroy_hyp_vm(kvm); - kfree(kvm->arch.mpidr_data); + kvm_destroy_mpidr_data(kvm); + kfree(kvm->arch.sysreg_masks); kvm_destroy_vcpus(kvm); @@ -218,9 +256,47 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kvm_arm_teardown_hypercalls(kvm); } +static bool kvm_has_full_ptr_auth(void) +{ + bool apa, gpa, api, gpi, apa3, gpa3; + u64 isar1, isar2, val; + + /* + * Check that: + * + * - both Address and Generic auth are implemented for a given + * algorithm (Q5, IMPDEF or Q3) + * - only a single algorithm is implemented. + */ + if (!system_has_full_ptr_auth()) + return false; + + isar1 = read_sanitised_ftr_reg(SYS_ID_AA64ISAR1_EL1); + isar2 = read_sanitised_ftr_reg(SYS_ID_AA64ISAR2_EL1); + + apa = !!FIELD_GET(ID_AA64ISAR1_EL1_APA_MASK, isar1); + val = FIELD_GET(ID_AA64ISAR1_EL1_GPA_MASK, isar1); + gpa = (val == ID_AA64ISAR1_EL1_GPA_IMP); + + api = !!FIELD_GET(ID_AA64ISAR1_EL1_API_MASK, isar1); + val = FIELD_GET(ID_AA64ISAR1_EL1_GPI_MASK, isar1); + gpi = (val == ID_AA64ISAR1_EL1_GPI_IMP); + + apa3 = !!FIELD_GET(ID_AA64ISAR2_EL1_APA3_MASK, isar2); + val = FIELD_GET(ID_AA64ISAR2_EL1_GPA3_MASK, isar2); + gpa3 = (val == ID_AA64ISAR2_EL1_GPA3_IMP); + + return (apa == gpa && api == gpi && apa3 == gpa3 && + (apa + api + apa3) == 1); +} + int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) { int r; + + if (kvm && kvm_vm_is_protected(kvm) && !kvm_pvm_ext_allowed(ext)) + return 0; + switch (ext) { case KVM_CAP_IRQCHIP: r = vgic_present; @@ -245,6 +321,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ARM_SYSTEM_SUSPEND: case KVM_CAP_IRQFD_RESAMPLE: case KVM_CAP_COUNTER_OFFSET: + case KVM_CAP_ARM_WRITABLE_IMP_ID_REGS: r = 1; break; case KVM_CAP_SET_GUEST_DEBUG2: @@ -298,7 +375,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = get_num_wrps(); break; case KVM_CAP_ARM_PMU_V3: - r = kvm_arm_support_pmu_v3(); + r = kvm_supports_guest_pmuv3(); break; case KVM_CAP_ARM_INJECT_SERROR_ESR: r = cpus_have_final_cap(ARM64_HAS_RAS_EXTN); @@ -311,7 +388,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) break; case KVM_CAP_ARM_PTRAUTH_ADDRESS: case KVM_CAP_ARM_PTRAUTH_GENERIC: - r = system_has_full_ptr_auth(); + r = kvm_has_full_ptr_auth(); break; case KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE: if (kvm) @@ -378,28 +455,31 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu->arch.mmu_page_cache.gfp_zero = __GFP_ZERO; - /* - * Default value for the FP state, will be overloaded at load - * time if we support FP (pretty likely) - */ - vcpu->arch.fp_state = FP_STATE_FREE; - /* Set up the timer */ kvm_timer_vcpu_init(vcpu); kvm_pmu_vcpu_init(vcpu); - kvm_arm_reset_debug_ptr(vcpu); - kvm_arm_pvtime_vcpu_init(&vcpu->arch); vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu; + /* + * This vCPU may have been created after mpidr_data was initialized. + * Throw out the pre-computed mappings if that is the case which forces + * KVM to fall back to iteratively searching the vCPUs. + */ + kvm_destroy_mpidr_data(vcpu->kvm); + err = kvm_vgic_vcpu_init(vcpu); if (err) return err; - return kvm_share_hyp(vcpu, vcpu + 1); + err = kvm_share_hyp(vcpu, vcpu + 1); + if (err) + kvm_vgic_vcpu_destroy(vcpu); + + return err; } void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) @@ -408,10 +488,10 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { - if (vcpu_has_run_once(vcpu) && unlikely(!irqchip_in_kernel(vcpu->kvm))) - static_branch_dec(&userspace_irqchip_in_use); - - kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); + if (!is_protected_kvm_enabled()) + kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); + else + free_hyp_memcache(&vcpu->arch.pkvm_memcache); kvm_timer_vcpu_terminate(vcpu); kvm_pmu_vcpu_destroy(vcpu); kvm_vgic_vcpu_destroy(vcpu); @@ -428,15 +508,81 @@ void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) } +static void vcpu_set_pauth_traps(struct kvm_vcpu *vcpu) +{ + if (vcpu_has_ptrauth(vcpu) && !is_protected_kvm_enabled()) { + /* + * Either we're running an L2 guest, and the API/APK bits come + * from L1's HCR_EL2, or API/APK are both set. + */ + if (unlikely(vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu))) { + u64 val; + + val = __vcpu_sys_reg(vcpu, HCR_EL2); + val &= (HCR_API | HCR_APK); + vcpu->arch.hcr_el2 &= ~(HCR_API | HCR_APK); + vcpu->arch.hcr_el2 |= val; + } else { + vcpu->arch.hcr_el2 |= (HCR_API | HCR_APK); + } + + /* + * Save the host keys if there is any chance for the guest + * to use pauth, as the entry code will reload the guest + * keys in that case. + */ + if (vcpu->arch.hcr_el2 & (HCR_API | HCR_APK)) { + struct kvm_cpu_context *ctxt; + + ctxt = this_cpu_ptr_hyp_sym(kvm_hyp_ctxt); + ptrauth_save_keys(ctxt); + } + } +} + +static bool kvm_vcpu_should_clear_twi(struct kvm_vcpu *vcpu) +{ + if (unlikely(kvm_wfi_trap_policy != KVM_WFX_NOTRAP_SINGLE_TASK)) + return kvm_wfi_trap_policy == KVM_WFX_NOTRAP; + + return single_task_running() && + (atomic_read(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vlpi_count) || + vcpu->kvm->arch.vgic.nassgireq); +} + +static bool kvm_vcpu_should_clear_twe(struct kvm_vcpu *vcpu) +{ + if (unlikely(kvm_wfe_trap_policy != KVM_WFX_NOTRAP_SINGLE_TASK)) + return kvm_wfe_trap_policy == KVM_WFX_NOTRAP; + + return single_task_running(); +} + void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct kvm_s2_mmu *mmu; int *last_ran; + if (is_protected_kvm_enabled()) + goto nommu; + + if (vcpu_has_nv(vcpu)) + kvm_vcpu_load_hw_mmu(vcpu); + mmu = vcpu->arch.hw_mmu; last_ran = this_cpu_ptr(mmu->last_vcpu_ran); /* + * Ensure a VMID is allocated for the MMU before programming VTTBR_EL2, + * which happens eagerly in VHE. + * + * Also, the VMID allocator only preserves VMIDs that are active at the + * time of rollover, so KVM might need to grab a new VMID for the MMU if + * this is called from kvm_sched_in(). + */ + kvm_arm_vmid_update(&mmu->vmid); + + /* * We guarantee that both TLBs and I-cache are private to each * vcpu. If detecting that a vcpu from the same VM has * previously run on the same physical CPU, call into the @@ -450,10 +596,16 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) *last_ran = vcpu->vcpu_idx; } +nommu: vcpu->cpu = cpu; - kvm_vgic_load(vcpu); + /* + * The timer must be loaded before the vgic to correctly set up physical + * interrupt deactivation in nested state (e.g. timer interrupt). + */ kvm_timer_vcpu_load(vcpu); + kvm_vgic_load(vcpu); + kvm_vcpu_load_debug(vcpu); if (has_vhe()) kvm_vcpu_load_vhe(vcpu); kvm_arch_vcpu_load_fp(vcpu); @@ -461,14 +613,25 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (kvm_arm_is_pvtime_enabled(&vcpu->arch)) kvm_make_request(KVM_REQ_RECORD_STEAL, vcpu); - if (single_task_running()) - vcpu_clear_wfx_traps(vcpu); + if (kvm_vcpu_should_clear_twe(vcpu)) + vcpu->arch.hcr_el2 &= ~HCR_TWE; else - vcpu_set_wfx_traps(vcpu); + vcpu->arch.hcr_el2 |= HCR_TWE; + + if (kvm_vcpu_should_clear_twi(vcpu)) + vcpu->arch.hcr_el2 &= ~HCR_TWI; + else + vcpu->arch.hcr_el2 |= HCR_TWI; + + vcpu_set_pauth_traps(vcpu); - if (vcpu_has_ptrauth(vcpu)) - vcpu_ptrauth_disable(vcpu); - kvm_arch_vcpu_load_debug_state_flags(vcpu); + if (is_protected_kvm_enabled()) { + kvm_call_hyp_nvhe(__pkvm_vcpu_load, + vcpu->kvm->arch.pkvm.handle, + vcpu->vcpu_idx, vcpu->arch.hcr_el2); + kvm_call_hyp(__vgic_v3_restore_vmcr_aprs, + &vcpu->arch.vgic_cpu.vgic_v3); + } if (!cpumask_test_cpu(cpu, vcpu->kvm->arch.supported_cpus)) vcpu_set_on_unsupported_cpu(vcpu); @@ -476,13 +639,21 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { - kvm_arch_vcpu_put_debug_state_flags(vcpu); + if (is_protected_kvm_enabled()) { + kvm_call_hyp(__vgic_v3_save_vmcr_aprs, + &vcpu->arch.vgic_cpu.vgic_v3); + kvm_call_hyp_nvhe(__pkvm_vcpu_put); + } + + kvm_vcpu_put_debug(vcpu); kvm_arch_vcpu_put_fp(vcpu); if (has_vhe()) kvm_vcpu_put_vhe(vcpu); kvm_timer_vcpu_put(vcpu); kvm_vgic_put(vcpu); kvm_vcpu_pmu_restore_host(vcpu); + if (vcpu_has_nv(vcpu)) + kvm_vcpu_put_hw_mmu(vcpu); kvm_arm_vmid_clear_active(); vcpu_clear_on_unsupported_cpu(vcpu); @@ -580,11 +751,6 @@ unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu) } #endif -static int kvm_vcpu_initialized(struct kvm_vcpu *vcpu) -{ - return vcpu_get_flag(vcpu, VCPU_INITIALIZED); -} - static void kvm_init_mpidr_data(struct kvm *kvm) { struct kvm_mpidr_data *data = NULL; @@ -594,7 +760,8 @@ static void kvm_init_mpidr_data(struct kvm *kvm) mutex_lock(&kvm->arch.config_lock); - if (kvm->arch.mpidr_data || atomic_read(&kvm->online_vcpus) == 1) + if (rcu_access_pointer(kvm->arch.mpidr_data) || + atomic_read(&kvm->online_vcpus) == 1) goto out; kvm_for_each_vcpu(c, vcpu, kvm) { @@ -631,7 +798,7 @@ static void kvm_init_mpidr_data(struct kvm *kvm) data->cmpidr_to_idx[index] = c; } - kvm->arch.mpidr_data = data; + rcu_assign_pointer(kvm->arch.mpidr_data, data); out: mutex_unlock(&kvm->arch.config_lock); } @@ -661,8 +828,6 @@ int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu) kvm_init_mpidr_data(kvm); - kvm_arm_vcpu_init_debug(vcpu); - if (likely(irqchip_in_kernel(kvm))) { /* * Map the VGIC hardware resources before running a vcpu the @@ -673,48 +838,42 @@ int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu) return ret; } + ret = kvm_finalize_sys_regs(vcpu); + if (ret) + return ret; + if (vcpu_has_nv(vcpu)) { - ret = kvm_init_nv_sysregs(vcpu->kvm); + ret = kvm_vgic_vcpu_nv_init(vcpu); if (ret) return ret; } /* - * This needs to happen after NV has imposed its own restrictions on - * the feature set + * This needs to happen after any restriction has been applied + * to the feature set. */ - kvm_init_sysreg(vcpu); + kvm_calculate_traps(vcpu); ret = kvm_timer_enable(vcpu); if (ret) return ret; - ret = kvm_arm_pmu_v3_enable(vcpu); - if (ret) - return ret; + if (kvm_vcpu_has_pmu(vcpu)) { + ret = kvm_arm_pmu_v3_enable(vcpu); + if (ret) + return ret; + } if (is_protected_kvm_enabled()) { ret = pkvm_create_hyp_vm(kvm); if (ret) return ret; - } - if (!irqchip_in_kernel(kvm)) { - /* - * Tell the rest of the code that there are userspace irqchip - * VMs in the wild. - */ - static_branch_inc(&userspace_irqchip_in_use); + ret = pkvm_create_hyp_vcpu(vcpu); + if (ret) + return ret; } - /* - * Initialize traps for protected VMs. - * NOTE: Move to run in EL2 directly, rather than via a hypercall, once - * the code is in place for first run initialization at EL2. - */ - if (kvm_vm_is_protected(kvm)) - kvm_call_hyp_nvhe(__pkvm_vcpu_init_traps, vcpu); - mutex_lock(&kvm->arch.config_lock); set_bit(KVM_ARCH_FLAG_HAS_RAN_ONCE, &kvm->arch.flags); mutex_unlock(&kvm->arch.config_lock); @@ -790,9 +949,8 @@ void kvm_vcpu_wfi(struct kvm_vcpu *vcpu) * doorbells to be signalled, should an interrupt become pending. */ preempt_disable(); - kvm_vgic_vmcr_sync(vcpu); vcpu_set_flag(vcpu, IN_WFI); - vgic_v4_put(vcpu); + kvm_vgic_put(vcpu); preempt_enable(); kvm_vcpu_halt(vcpu); @@ -800,7 +958,7 @@ void kvm_vcpu_wfi(struct kvm_vcpu *vcpu) preempt_disable(); vcpu_clear_flag(vcpu, IN_WFI); - vgic_v4_load(vcpu); + kvm_vgic_load(vcpu); preempt_enable(); } @@ -849,6 +1007,9 @@ static int kvm_vcpu_suspend(struct kvm_vcpu *vcpu) static int check_vcpu_requests(struct kvm_vcpu *vcpu) { if (kvm_request_pending(vcpu)) { + if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) + return -EIO; + if (kvm_check_request(KVM_REQ_SLEEP, vcpu)) kvm_vcpu_sleep(vcpu); @@ -883,6 +1044,8 @@ static int check_vcpu_requests(struct kvm_vcpu *vcpu) if (kvm_dirty_ring_check_request(vcpu)) return 0; + + check_nested_vcpu_requests(vcpu); } return 1; @@ -924,7 +1087,7 @@ static bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu, int *ret) * state gets updated in kvm_timer_update_run and * kvm_pmu_update_run below). */ - if (static_branch_unlikely(&userspace_irqchip_in_use)) { + if (unlikely(!irqchip_in_kernel(vcpu->kvm))) { if (kvm_timer_should_notify_user(vcpu) || kvm_pmu_should_notify_user(vcpu)) { *ret = -EINTR; @@ -980,13 +1143,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) if (run->exit_reason == KVM_EXIT_MMIO) { ret = kvm_handle_mmio_return(vcpu); - if (ret) + if (ret <= 0) return ret; } vcpu_load(vcpu); - if (run->immediate_exit) { + if (!vcpu->wants_to_run) { ret = -EINTR; goto out; } @@ -1014,19 +1177,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) */ preempt_disable(); - /* - * The VMID allocator only tracks active VMIDs per - * physical CPU, and therefore the VMID allocated may not be - * preserved on VMID roll-over if the task was preempted, - * making a thread's VMID inactive. So we need to call - * kvm_arm_vmid_update() in non-premptible context. - */ - if (kvm_arm_vmid_update(&vcpu->arch.hw_mmu->vmid) && - has_vhe()) - __load_stage2(vcpu->arch.hw_mmu, - vcpu->arch.hw_mmu->arch); - - kvm_pmu_flush_hwstate(vcpu); + if (kvm_vcpu_has_pmu(vcpu)) + kvm_pmu_flush_hwstate(vcpu); local_irq_disable(); @@ -1045,8 +1197,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) if (ret <= 0 || kvm_vcpu_exit_request(vcpu, &ret)) { vcpu->mode = OUTSIDE_GUEST_MODE; isb(); /* Ensure work in x_flush_hwstate is committed */ - kvm_pmu_sync_hwstate(vcpu); - if (static_branch_unlikely(&userspace_irqchip_in_use)) + if (kvm_vcpu_has_pmu(vcpu)) + kvm_pmu_sync_hwstate(vcpu); + if (unlikely(!irqchip_in_kernel(vcpu->kvm))) kvm_timer_sync_user(vcpu); kvm_vgic_sync_hwstate(vcpu); local_irq_enable(); @@ -1054,7 +1207,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) continue; } - kvm_arm_setup_debug(vcpu); kvm_arch_vcpu_ctxflush_fp(vcpu); /************************************************************** @@ -1071,14 +1223,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) * Back from guest *************************************************************/ - kvm_arm_clear_debug(vcpu); - /* * We must sync the PMU state before the vgic state so * that the vgic can properly sample the updated state of the * interrupt line. */ - kvm_pmu_sync_hwstate(vcpu); + if (kvm_vcpu_has_pmu(vcpu)) + kvm_pmu_sync_hwstate(vcpu); /* * Sync the vgic state before syncing the timer state because @@ -1092,9 +1243,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) * we don't want vtimer interrupts to race with syncing the * timer virtual interrupt state. */ - if (static_branch_unlikely(&userspace_irqchip_in_use)) + if (unlikely(!irqchip_in_kernel(vcpu->kvm))) kvm_timer_sync_user(vcpu); + if (is_hyp_ctxt(vcpu)) + kvm_timer_sync_nested(vcpu); + kvm_arch_vcpu_ctxsync_fp(vcpu); /* @@ -1264,13 +1418,13 @@ static unsigned long system_supported_vcpu_features(void) if (!cpus_have_final_cap(ARM64_HAS_32BIT_EL1)) clear_bit(KVM_ARM_VCPU_EL1_32BIT, &features); - if (!kvm_arm_support_pmu_v3()) + if (!kvm_supports_guest_pmuv3()) clear_bit(KVM_ARM_VCPU_PMU_V3, &features); if (!system_supports_sve()) clear_bit(KVM_ARM_VCPU_SVE, &features); - if (!system_has_full_ptr_auth()) { + if (!kvm_has_full_ptr_auth()) { clear_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, &features); clear_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, &features); } @@ -1306,11 +1460,6 @@ static int kvm_vcpu_init_check_features(struct kvm_vcpu *vcpu, test_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, &features)) return -EINVAL; - /* Disallow NV+SVE for the time being */ - if (test_bit(KVM_ARM_VCPU_HAS_EL2, &features) && - test_bit(KVM_ARM_VCPU_SVE, &features)) - return -EINVAL; - if (!test_bit(KVM_ARM_VCPU_EL1_32BIT, &features)) return 0; @@ -1346,6 +1495,10 @@ static int kvm_setup_vcpu(struct kvm_vcpu *vcpu) if (kvm_vcpu_has_pmu(vcpu) && !kvm->arch.arm_pmu) ret = kvm_arm_set_default_pmu(kvm); + /* Prepare for nested if required */ + if (!ret && vcpu_has_nv(vcpu)) + ret = kvm_vcpu_init_nested(vcpu); + return ret; } @@ -1439,7 +1592,6 @@ static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu, } vcpu_reset_hcr(vcpu); - vcpu->arch.cptr_el2 = kvm_get_reset_cptr_el2(vcpu); /* * Handle the "start in power-off" case. @@ -1818,6 +1970,11 @@ static unsigned long nvhe_percpu_order(void) return size ? get_order(size) : 0; } +static size_t pkvm_host_sve_state_order(void) +{ + return get_order(pkvm_host_sve_state_size()); +} + /* A lookup table holding the hypervisor VA for each vector slot */ static void *hyp_spectre_vector_selector[BP_HARDEN_EL2_SLOTS]; @@ -1853,7 +2010,6 @@ static int kvm_init_vector_slots(void) static void __init cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits) { struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu); - u64 mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); unsigned long tcr; /* @@ -1869,17 +2025,17 @@ static void __init cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits) tcr = read_sysreg(tcr_el1); if (cpus_have_final_cap(ARM64_KVM_HVHE)) { + tcr &= ~(TCR_HD | TCR_HA | TCR_A1 | TCR_T0SZ_MASK); tcr |= TCR_EPD1_MASK; } else { + unsigned long ips = FIELD_GET(TCR_IPS_MASK, tcr); + tcr &= TCR_EL2_MASK; - tcr |= TCR_EL2_RES1; + tcr |= TCR_EL2_RES1 | FIELD_PREP(TCR_EL2_PS_MASK, ips); + if (lpa2_is_enabled()) + tcr |= TCR_EL2_DS; } - tcr &= ~TCR_T0SZ_MASK; tcr |= TCR_T0SZ(hyp_va_bits); - tcr &= ~TCR_EL2_PS_MASK; - tcr |= FIELD_PREP(TCR_EL2_PS_MASK, kvm_get_parange(mmfr0)); - if (kvm_lpa2_is_enabled()) - tcr |= TCR_EL2_DS; params->tcr_el2 = tcr; params->pgd_pa = kvm_mmu_get_httbr(); @@ -1971,7 +2127,8 @@ static void cpu_set_hyp_vector(void) static void cpu_hyp_init_context(void) { - kvm_init_host_cpu_context(&this_cpu_ptr_hyp_sym(kvm_host_data)->host_ctxt); + kvm_init_host_cpu_context(host_data_ptr(host_ctxt)); + kvm_init_host_debug_data(); if (!is_kernel_in_hyp_mode()) cpu_init_hyp_mode(); @@ -1980,7 +2137,6 @@ static void cpu_hyp_init_context(void) static void cpu_hyp_init_features(void) { cpu_set_hyp_vector(); - kvm_arm_init_debug(); if (is_kernel_in_hyp_mode()) kvm_timer_init_vhe(); @@ -2012,7 +2168,7 @@ static void cpu_hyp_uninit(void *discard) } } -int kvm_arch_hardware_enable(void) +int kvm_arch_enable_virtualization_cpu(void) { /* * Most calls to this function are made with migration @@ -2032,7 +2188,7 @@ int kvm_arch_hardware_enable(void) return 0; } -void kvm_arch_hardware_disable(void) +void kvm_arch_disable_virtualization_cpu(void) { kvm_timer_cpu_down(); kvm_vgic_cpu_down(); @@ -2163,6 +2319,19 @@ static int __init init_subsystems(void) break; case -ENODEV: case -ENXIO: + /* + * No VGIC? No pKVM for you. + * + * Protected mode assumes that VGICv3 is present, so no point + * in trying to hobble along if vgic initialization fails. + */ + if (is_protected_kvm_enabled()) + goto out; + + /* + * Otherwise, userspace could choose to implement a GIC for its + * guest on non-cooperative hardware. + */ vgic_present = false; err = 0; break; @@ -2170,6 +2339,13 @@ static int __init init_subsystems(void) goto out; } + if (kvm_mode == KVM_MODE_NV && + !(vgic_present && kvm_vgic_global_state.type == VGIC_V3)) { + kvm_err("NV support requires GICv3, giving up\n"); + err = -EINVAL; + goto out; + } + /* * Init HYP architected timer support */ @@ -2197,12 +2373,20 @@ static void __init teardown_subsystems(void) static void __init teardown_hyp_mode(void) { + bool free_sve = system_supports_sve() && is_protected_kvm_enabled(); int cpu; free_hyp_pgds(); for_each_possible_cpu(cpu) { - free_page(per_cpu(kvm_arm_hyp_stack_page, cpu)); + free_pages(per_cpu(kvm_arm_hyp_stack_base, cpu), NVHE_STACK_SHIFT - PAGE_SHIFT); free_pages(kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu], nvhe_percpu_order()); + + if (free_sve) { + struct cpu_sve_state *sve_state; + + sve_state = per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state; + free_pages((unsigned long) sve_state, pkvm_host_sve_state_order()); + } } } @@ -2220,7 +2404,7 @@ static int __init do_pkvm_init(u32 hyp_va_bits) /* * The stub hypercalls are now disabled, so set our local flag to - * prevent a later re-init attempt in kvm_arch_hardware_enable(). + * prevent a later re-init attempt in kvm_arch_enable_virtualization_cpu(). */ __this_cpu_write(kvm_hyp_initialized, 1); preempt_enable(); @@ -2265,6 +2449,13 @@ static void kvm_hyp_init_symbols(void) kvm_nvhe_sym(id_aa64smfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64SMFR0_EL1); kvm_nvhe_sym(__icache_flags) = __icache_flags; kvm_nvhe_sym(kvm_arm_vmid_bits) = kvm_arm_vmid_bits; + + /* + * Flush entire BSS since part of its data containing init symbols is read + * while the MMU is off. + */ + kvm_flush_dcache_to_poc(kvm_ksym_ref(__hyp_bss_start), + kvm_ksym_ref(__hyp_bss_end) - kvm_ksym_ref(__hyp_bss_start)); } static int __init kvm_hyp_init_protection(u32 hyp_va_bits) @@ -2285,6 +2476,50 @@ static int __init kvm_hyp_init_protection(u32 hyp_va_bits) return 0; } +static int init_pkvm_host_sve_state(void) +{ + int cpu; + + if (!system_supports_sve()) + return 0; + + /* Allocate pages for host sve state in protected mode. */ + for_each_possible_cpu(cpu) { + struct page *page = alloc_pages(GFP_KERNEL, pkvm_host_sve_state_order()); + + if (!page) + return -ENOMEM; + + per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state = page_address(page); + } + + /* + * Don't map the pages in hyp since these are only used in protected + * mode, which will (re)create its own mapping when initialized. + */ + + return 0; +} + +/* + * Finalizes the initialization of hyp mode, once everything else is initialized + * and the initialziation process cannot fail. + */ +static void finalize_init_hyp_mode(void) +{ + int cpu; + + if (system_supports_sve() && is_protected_kvm_enabled()) { + for_each_possible_cpu(cpu) { + struct cpu_sve_state *sve_state; + + sve_state = per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state; + per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state = + kern_hyp_va(sve_state); + } + } +} + static void pkvm_hyp_init_ptrauth(void) { struct kvm_cpu_context *hyp_ctxt; @@ -2330,15 +2565,15 @@ static int __init init_hyp_mode(void) * Allocate stack pages for Hypervisor-mode */ for_each_possible_cpu(cpu) { - unsigned long stack_page; + unsigned long stack_base; - stack_page = __get_free_page(GFP_KERNEL); - if (!stack_page) { + stack_base = __get_free_pages(GFP_KERNEL, NVHE_STACK_SHIFT - PAGE_SHIFT); + if (!stack_base) { err = -ENOMEM; goto out_err; } - per_cpu(kvm_arm_hyp_stack_page, cpu) = stack_page; + per_cpu(kvm_arm_hyp_stack_base, cpu) = stack_base; } /* @@ -2407,9 +2642,9 @@ static int __init init_hyp_mode(void) */ for_each_possible_cpu(cpu) { struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu); - char *stack_page = (char *)per_cpu(kvm_arm_hyp_stack_page, cpu); + char *stack_base = (char *)per_cpu(kvm_arm_hyp_stack_base, cpu); - err = create_hyp_stack(__pa(stack_page), ¶ms->stack_hyp_va); + err = create_hyp_stack(__pa(stack_base), ¶ms->stack_hyp_va); if (err) { kvm_err("Cannot map hyp stack\n"); goto out_err; @@ -2421,7 +2656,7 @@ static int __init init_hyp_mode(void) * __hyp_pa() won't do the right thing there, since the stack * has been mapped in the flexible private VA space. */ - params->stack_pa = __pa(stack_page); + params->stack_pa = __pa(stack_base); } for_each_possible_cpu(cpu) { @@ -2453,6 +2688,10 @@ static int __init init_hyp_mode(void) goto out_err; } + err = init_pkvm_host_sve_state(); + if (err) + goto out_err; + err = kvm_hyp_init_protection(hyp_va_bits); if (err) { kvm_err("Failed to init hyp memory protection\n"); @@ -2470,21 +2709,27 @@ out_err: struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr) { - struct kvm_vcpu *vcpu; + struct kvm_vcpu *vcpu = NULL; + struct kvm_mpidr_data *data; unsigned long i; mpidr &= MPIDR_HWID_BITMASK; - if (kvm->arch.mpidr_data) { - u16 idx = kvm_mpidr_index(kvm->arch.mpidr_data, mpidr); + rcu_read_lock(); + data = rcu_dereference(kvm->arch.mpidr_data); - vcpu = kvm_get_vcpu(kvm, - kvm->arch.mpidr_data->cmpidr_to_idx[idx]); + if (data) { + u16 idx = kvm_mpidr_index(data, mpidr); + + vcpu = kvm_get_vcpu(kvm, data->cmpidr_to_idx[idx]); if (mpidr != kvm_vcpu_get_mpidr_aff(vcpu)) vcpu = NULL; + } + rcu_read_unlock(); + + if (vcpu) return vcpu; - } kvm_for_each_vcpu(i, vcpu, kvm) { if (mpidr == kvm_vcpu_get_mpidr_aff(vcpu)) @@ -2508,6 +2753,14 @@ int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, { struct kvm_kernel_irqfd *irqfd = container_of(cons, struct kvm_kernel_irqfd, consumer); + struct kvm_kernel_irq_routing_entry *irq_entry = &irqfd->irq_entry; + + /* + * The only thing we have a chance of directly-injecting is LPIs. Maybe + * one day... + */ + if (irq_entry->type != KVM_IRQ_ROUTING_MSI) + return 0; return kvm_vgic_v4_set_forwarding(irqfd->kvm, prod->irq, &irqfd->irq_entry); @@ -2517,6 +2770,10 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, { struct kvm_kernel_irqfd *irqfd = container_of(cons, struct kvm_kernel_irqfd, consumer); + struct kvm_kernel_irq_routing_entry *irq_entry = &irqfd->irq_entry; + + if (irq_entry->type != KVM_IRQ_ROUTING_MSI) + return; kvm_vgic_v4_unset_forwarding(irqfd->kvm, prod->irq, &irqfd->irq_entry); @@ -2597,14 +2854,12 @@ static __init int kvm_arm_init(void) if (err) goto out_hyp; - if (is_protected_kvm_enabled()) { - kvm_info("Protected nVHE mode initialized successfully\n"); - } else if (in_hyp_mode) { - kvm_info("VHE mode initialized successfully\n"); - } else { - char mode = cpus_have_final_cap(ARM64_KVM_HVHE) ? 'h' : 'n'; - kvm_info("Hyp mode (%cVHE) initialized successfully\n", mode); - } + kvm_info("%s%sVHE%s mode initialized successfully\n", + in_hyp_mode ? "" : (is_protected_kvm_enabled() ? + "Protected " : "Hyp "), + in_hyp_mode ? "" : (cpus_have_final_cap(ARM64_KVM_HVHE) ? + "h" : "n"), + cpus_have_final_cap(ARM64_HAS_NESTED_VIRT) ? "+NV2": ""); /* * FIXME: Do something reasonable if kvm_init() fails after pKVM @@ -2614,6 +2869,13 @@ static __init int kvm_arm_init(void) if (err) goto out_subs; + /* + * This should be called after initialization is done and failure isn't + * possible anymore. + */ + if (!in_hyp_mode) + finalize_init_hyp_mode(); + kvm_arm_initialised = true; return 0; @@ -2666,6 +2928,36 @@ static int __init early_kvm_mode_cfg(char *arg) } early_param("kvm-arm.mode", early_kvm_mode_cfg); +static int __init early_kvm_wfx_trap_policy_cfg(char *arg, enum kvm_wfx_trap_policy *p) +{ + if (!arg) + return -EINVAL; + + if (strcmp(arg, "trap") == 0) { + *p = KVM_WFX_TRAP; + return 0; + } + + if (strcmp(arg, "notrap") == 0) { + *p = KVM_WFX_NOTRAP; + return 0; + } + + return -EINVAL; +} + +static int __init early_kvm_wfi_trap_policy_cfg(char *arg) +{ + return early_kvm_wfx_trap_policy_cfg(arg, &kvm_wfi_trap_policy); +} +early_param("kvm-arm.wfi_trap_policy", early_kvm_wfi_trap_policy_cfg); + +static int __init early_kvm_wfe_trap_policy_cfg(char *arg) +{ + return early_kvm_wfx_trap_policy_cfg(arg, &kvm_wfe_trap_policy); +} +early_param("kvm-arm.wfe_trap_policy", early_kvm_wfe_trap_policy_cfg); + enum kvm_mode kvm_get_mode(void) { return kvm_mode; diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c new file mode 100644 index 000000000000..f74a66ce3064 --- /dev/null +++ b/arch/arm64/kvm/at.c @@ -0,0 +1,1446 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2017 - Linaro Ltd + * Author: Jintack Lim <jintack.lim@linaro.org> + */ + +#include <linux/kvm_host.h> + +#include <asm/esr.h> +#include <asm/kvm_hyp.h> +#include <asm/kvm_mmu.h> + +enum trans_regime { + TR_EL10, + TR_EL20, + TR_EL2, +}; + +struct s1_walk_info { + u64 baddr; + enum trans_regime regime; + unsigned int max_oa_bits; + unsigned int pgshift; + unsigned int txsz; + int sl; + bool hpd; + bool e0poe; + bool poe; + bool pan; + bool be; + bool s2; +}; + +struct s1_walk_result { + union { + struct { + u64 desc; + u64 pa; + s8 level; + u8 APTable; + bool UXNTable; + bool PXNTable; + bool uwxn; + bool uov; + bool ur; + bool uw; + bool ux; + bool pwxn; + bool pov; + bool pr; + bool pw; + bool px; + }; + struct { + u8 fst; + bool ptw; + bool s2; + }; + }; + bool failed; +}; + +static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool ptw, bool s2) +{ + wr->fst = fst; + wr->ptw = ptw; + wr->s2 = s2; + wr->failed = true; +} + +#define S1_MMU_DISABLED (-127) + +static int get_ia_size(struct s1_walk_info *wi) +{ + return 64 - wi->txsz; +} + +/* Return true if the IPA is out of the OA range */ +static bool check_output_size(u64 ipa, struct s1_walk_info *wi) +{ + return wi->max_oa_bits < 48 && (ipa & GENMASK_ULL(47, wi->max_oa_bits)); +} + +/* Return the translation regime that applies to an AT instruction */ +static enum trans_regime compute_translation_regime(struct kvm_vcpu *vcpu, u32 op) +{ + /* + * We only get here from guest EL2, so the translation + * regime AT applies to is solely defined by {E2H,TGE}. + */ + switch (op) { + case OP_AT_S1E2R: + case OP_AT_S1E2W: + case OP_AT_S1E2A: + return vcpu_el2_e2h_is_set(vcpu) ? TR_EL20 : TR_EL2; + break; + default: + return (vcpu_el2_e2h_is_set(vcpu) && + vcpu_el2_tge_is_set(vcpu)) ? TR_EL20 : TR_EL10; + } +} + +static bool s1pie_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime) +{ + if (!kvm_has_s1pie(vcpu->kvm)) + return false; + + switch (regime) { + case TR_EL2: + case TR_EL20: + return vcpu_read_sys_reg(vcpu, TCR2_EL2) & TCR2_EL2_PIE; + case TR_EL10: + return (__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TCR2En) && + (__vcpu_sys_reg(vcpu, TCR2_EL1) & TCR2_EL1_PIE); + default: + BUG(); + } +} + +static void compute_s1poe(struct kvm_vcpu *vcpu, struct s1_walk_info *wi) +{ + u64 val; + + if (!kvm_has_s1poe(vcpu->kvm)) { + wi->poe = wi->e0poe = false; + return; + } + + switch (wi->regime) { + case TR_EL2: + case TR_EL20: + val = vcpu_read_sys_reg(vcpu, TCR2_EL2); + wi->poe = val & TCR2_EL2_POE; + wi->e0poe = (wi->regime == TR_EL20) && (val & TCR2_EL2_E0POE); + break; + case TR_EL10: + if (__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TCR2En) { + wi->poe = wi->e0poe = false; + return; + } + + val = __vcpu_sys_reg(vcpu, TCR2_EL1); + wi->poe = val & TCR2_EL1_POE; + wi->e0poe = val & TCR2_EL1_E0POE; + } +} + +static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi, + struct s1_walk_result *wr, u64 va) +{ + u64 hcr, sctlr, tcr, tg, ps, ia_bits, ttbr; + unsigned int stride, x; + bool va55, tbi, lva, as_el0; + + hcr = __vcpu_sys_reg(vcpu, HCR_EL2); + + wi->regime = compute_translation_regime(vcpu, op); + as_el0 = (op == OP_AT_S1E0R || op == OP_AT_S1E0W); + wi->pan = (op == OP_AT_S1E1RP || op == OP_AT_S1E1WP) && + (*vcpu_cpsr(vcpu) & PSR_PAN_BIT); + + va55 = va & BIT(55); + + if (wi->regime == TR_EL2 && va55) + goto addrsz; + + wi->s2 = wi->regime == TR_EL10 && (hcr & (HCR_VM | HCR_DC)); + + switch (wi->regime) { + case TR_EL10: + sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1); + tcr = vcpu_read_sys_reg(vcpu, TCR_EL1); + ttbr = (va55 ? + vcpu_read_sys_reg(vcpu, TTBR1_EL1) : + vcpu_read_sys_reg(vcpu, TTBR0_EL1)); + break; + case TR_EL2: + case TR_EL20: + sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2); + tcr = vcpu_read_sys_reg(vcpu, TCR_EL2); + ttbr = (va55 ? + vcpu_read_sys_reg(vcpu, TTBR1_EL2) : + vcpu_read_sys_reg(vcpu, TTBR0_EL2)); + break; + default: + BUG(); + } + + tbi = (wi->regime == TR_EL2 ? + FIELD_GET(TCR_EL2_TBI, tcr) : + (va55 ? + FIELD_GET(TCR_TBI1, tcr) : + FIELD_GET(TCR_TBI0, tcr))); + + if (!tbi && (u64)sign_extend64(va, 55) != va) + goto addrsz; + + va = (u64)sign_extend64(va, 55); + + /* Let's put the MMU disabled case aside immediately */ + switch (wi->regime) { + case TR_EL10: + /* + * If dealing with the EL1&0 translation regime, 3 things + * can disable the S1 translation: + * + * - HCR_EL2.DC = 1 + * - HCR_EL2.{E2H,TGE} = {0,1} + * - SCTLR_EL1.M = 0 + * + * The TGE part is interesting. If we have decided that this + * is EL1&0, then it means that either {E2H,TGE} == {1,0} or + * {0,x}, and we only need to test for TGE == 1. + */ + if (hcr & (HCR_DC | HCR_TGE)) { + wr->level = S1_MMU_DISABLED; + break; + } + fallthrough; + case TR_EL2: + case TR_EL20: + if (!(sctlr & SCTLR_ELx_M)) + wr->level = S1_MMU_DISABLED; + break; + } + + if (wr->level == S1_MMU_DISABLED) { + if (va >= BIT(kvm_get_pa_bits(vcpu->kvm))) + goto addrsz; + + wr->pa = va; + return 0; + } + + wi->be = sctlr & SCTLR_ELx_EE; + + wi->hpd = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HPDS, IMP); + wi->hpd &= (wi->regime == TR_EL2 ? + FIELD_GET(TCR_EL2_HPD, tcr) : + (va55 ? + FIELD_GET(TCR_HPD1, tcr) : + FIELD_GET(TCR_HPD0, tcr))); + /* R_JHSVW */ + wi->hpd |= s1pie_enabled(vcpu, wi->regime); + + /* Do we have POE? */ + compute_s1poe(vcpu, wi); + + /* R_BVXDG */ + wi->hpd |= (wi->poe || wi->e0poe); + + /* Someone was silly enough to encode TG0/TG1 differently */ + if (va55) { + wi->txsz = FIELD_GET(TCR_T1SZ_MASK, tcr); + tg = FIELD_GET(TCR_TG1_MASK, tcr); + + switch (tg << TCR_TG1_SHIFT) { + case TCR_TG1_4K: + wi->pgshift = 12; break; + case TCR_TG1_16K: + wi->pgshift = 14; break; + case TCR_TG1_64K: + default: /* IMPDEF: treat any other value as 64k */ + wi->pgshift = 16; break; + } + } else { + wi->txsz = FIELD_GET(TCR_T0SZ_MASK, tcr); + tg = FIELD_GET(TCR_TG0_MASK, tcr); + + switch (tg << TCR_TG0_SHIFT) { + case TCR_TG0_4K: + wi->pgshift = 12; break; + case TCR_TG0_16K: + wi->pgshift = 14; break; + case TCR_TG0_64K: + default: /* IMPDEF: treat any other value as 64k */ + wi->pgshift = 16; break; + } + } + + /* R_PLCGL, R_YXNYW */ + if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR2_EL1, ST, 48_47)) { + if (wi->txsz > 39) + goto transfault_l0; + } else { + if (wi->txsz > 48 || (BIT(wi->pgshift) == SZ_64K && wi->txsz > 47)) + goto transfault_l0; + } + + /* R_GTJBY, R_SXWGM */ + switch (BIT(wi->pgshift)) { + case SZ_4K: + lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN4, 52_BIT); + lva &= tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS); + break; + case SZ_16K: + lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN16, 52_BIT); + lva &= tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS); + break; + case SZ_64K: + lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, VARange, 52); + break; + } + + if ((lva && wi->txsz < 12) || (!lva && wi->txsz < 16)) + goto transfault_l0; + + ia_bits = get_ia_size(wi); + + /* R_YYVYV, I_THCZK */ + if ((!va55 && va > GENMASK(ia_bits - 1, 0)) || + (va55 && va < GENMASK(63, ia_bits))) + goto transfault_l0; + + /* I_ZFSYQ */ + if (wi->regime != TR_EL2 && + (tcr & (va55 ? TCR_EPD1_MASK : TCR_EPD0_MASK))) + goto transfault_l0; + + /* R_BNDVG and following statements */ + if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, E0PD, IMP) && + as_el0 && (tcr & (va55 ? TCR_E0PD1 : TCR_E0PD0))) + goto transfault_l0; + + /* AArch64.S1StartLevel() */ + stride = wi->pgshift - 3; + wi->sl = 3 - (((ia_bits - 1) - wi->pgshift) / stride); + + ps = (wi->regime == TR_EL2 ? + FIELD_GET(TCR_EL2_PS_MASK, tcr) : FIELD_GET(TCR_IPS_MASK, tcr)); + + wi->max_oa_bits = min(get_kvm_ipa_limit(), ps_to_output_size(ps)); + + /* Compute minimal alignment */ + x = 3 + ia_bits - ((3 - wi->sl) * stride + wi->pgshift); + + wi->baddr = ttbr & TTBRx_EL1_BADDR; + + /* R_VPBBF */ + if (check_output_size(wi->baddr, wi)) + goto addrsz; + + wi->baddr &= GENMASK_ULL(wi->max_oa_bits - 1, x); + + return 0; + +addrsz: /* Address Size Fault level 0 */ + fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(0), false, false); + return -EFAULT; + +transfault_l0: /* Translation Fault level 0 */ + fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(0), false, false); + return -EFAULT; +} + +static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, + struct s1_walk_result *wr, u64 va) +{ + u64 va_top, va_bottom, baddr, desc; + int level, stride, ret; + + level = wi->sl; + stride = wi->pgshift - 3; + baddr = wi->baddr; + + va_top = get_ia_size(wi) - 1; + + while (1) { + u64 index, ipa; + + va_bottom = (3 - level) * stride + wi->pgshift; + index = (va & GENMASK_ULL(va_top, va_bottom)) >> (va_bottom - 3); + + ipa = baddr | index; + + if (wi->s2) { + struct kvm_s2_trans s2_trans = {}; + + ret = kvm_walk_nested_s2(vcpu, ipa, &s2_trans); + if (ret) { + fail_s1_walk(wr, + (s2_trans.esr & ~ESR_ELx_FSC_LEVEL) | level, + true, true); + return ret; + } + + if (!kvm_s2_trans_readable(&s2_trans)) { + fail_s1_walk(wr, ESR_ELx_FSC_PERM_L(level), + true, true); + + return -EPERM; + } + + ipa = kvm_s2_trans_output(&s2_trans); + } + + ret = kvm_read_guest(vcpu->kvm, ipa, &desc, sizeof(desc)); + if (ret) { + fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level), + true, false); + return ret; + } + + if (wi->be) + desc = be64_to_cpu((__force __be64)desc); + else + desc = le64_to_cpu((__force __le64)desc); + + /* Invalid descriptor */ + if (!(desc & BIT(0))) + goto transfault; + + /* Block mapping, check validity down the line */ + if (!(desc & BIT(1))) + break; + + /* Page mapping */ + if (level == 3) + break; + + /* Table handling */ + if (!wi->hpd) { + wr->APTable |= FIELD_GET(S1_TABLE_AP, desc); + wr->UXNTable |= FIELD_GET(PMD_TABLE_UXN, desc); + wr->PXNTable |= FIELD_GET(PMD_TABLE_PXN, desc); + } + + baddr = desc & GENMASK_ULL(47, wi->pgshift); + + /* Check for out-of-range OA */ + if (check_output_size(baddr, wi)) + goto addrsz; + + /* Prepare for next round */ + va_top = va_bottom - 1; + level++; + } + + /* Block mapping, check the validity of the level */ + if (!(desc & BIT(1))) { + bool valid_block = false; + + switch (BIT(wi->pgshift)) { + case SZ_4K: + valid_block = level == 1 || level == 2; + break; + case SZ_16K: + case SZ_64K: + valid_block = level == 2; + break; + } + + if (!valid_block) + goto transfault; + } + + if (check_output_size(desc & GENMASK(47, va_bottom), wi)) + goto addrsz; + + va_bottom += contiguous_bit_shift(desc, wi, level); + + wr->failed = false; + wr->level = level; + wr->desc = desc; + wr->pa = desc & GENMASK(47, va_bottom); + wr->pa |= va & GENMASK_ULL(va_bottom - 1, 0); + + return 0; + +addrsz: + fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(level), true, false); + return -EINVAL; +transfault: + fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(level), true, false); + return -ENOENT; +} + +struct mmu_config { + u64 ttbr0; + u64 ttbr1; + u64 tcr; + u64 mair; + u64 tcr2; + u64 pir; + u64 pire0; + u64 por_el0; + u64 por_el1; + u64 sctlr; + u64 vttbr; + u64 vtcr; + u64 hcr; +}; + +static void __mmu_config_save(struct mmu_config *config) +{ + config->ttbr0 = read_sysreg_el1(SYS_TTBR0); + config->ttbr1 = read_sysreg_el1(SYS_TTBR1); + config->tcr = read_sysreg_el1(SYS_TCR); + config->mair = read_sysreg_el1(SYS_MAIR); + if (cpus_have_final_cap(ARM64_HAS_TCR2)) { + config->tcr2 = read_sysreg_el1(SYS_TCR2); + if (cpus_have_final_cap(ARM64_HAS_S1PIE)) { + config->pir = read_sysreg_el1(SYS_PIR); + config->pire0 = read_sysreg_el1(SYS_PIRE0); + } + if (system_supports_poe()) { + config->por_el1 = read_sysreg_el1(SYS_POR); + config->por_el0 = read_sysreg_s(SYS_POR_EL0); + } + } + config->sctlr = read_sysreg_el1(SYS_SCTLR); + config->vttbr = read_sysreg(vttbr_el2); + config->vtcr = read_sysreg(vtcr_el2); + config->hcr = read_sysreg(hcr_el2); +} + +static void __mmu_config_restore(struct mmu_config *config) +{ + write_sysreg(config->hcr, hcr_el2); + + /* + * ARM errata 1165522 and 1530923 require TGE to be 1 before + * we update the guest state. + */ + asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT)); + + write_sysreg_el1(config->ttbr0, SYS_TTBR0); + write_sysreg_el1(config->ttbr1, SYS_TTBR1); + write_sysreg_el1(config->tcr, SYS_TCR); + write_sysreg_el1(config->mair, SYS_MAIR); + if (cpus_have_final_cap(ARM64_HAS_TCR2)) { + write_sysreg_el1(config->tcr2, SYS_TCR2); + if (cpus_have_final_cap(ARM64_HAS_S1PIE)) { + write_sysreg_el1(config->pir, SYS_PIR); + write_sysreg_el1(config->pire0, SYS_PIRE0); + } + if (system_supports_poe()) { + write_sysreg_el1(config->por_el1, SYS_POR); + write_sysreg_s(config->por_el0, SYS_POR_EL0); + } + } + write_sysreg_el1(config->sctlr, SYS_SCTLR); + write_sysreg(config->vttbr, vttbr_el2); + write_sysreg(config->vtcr, vtcr_el2); +} + +static bool at_s1e1p_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +{ + u64 host_pan; + bool fail; + + host_pan = read_sysreg_s(SYS_PSTATE_PAN); + write_sysreg_s(*vcpu_cpsr(vcpu) & PSTATE_PAN, SYS_PSTATE_PAN); + + switch (op) { + case OP_AT_S1E1RP: + fail = __kvm_at(OP_AT_S1E1RP, vaddr); + break; + case OP_AT_S1E1WP: + fail = __kvm_at(OP_AT_S1E1WP, vaddr); + break; + } + + write_sysreg_s(host_pan, SYS_PSTATE_PAN); + + return fail; +} + +#define MEMATTR(ic, oc) (MEMATTR_##oc << 4 | MEMATTR_##ic) +#define MEMATTR_NC 0b0100 +#define MEMATTR_Wt 0b1000 +#define MEMATTR_Wb 0b1100 +#define MEMATTR_WbRaWa 0b1111 + +#define MEMATTR_IS_DEVICE(m) (((m) & GENMASK(7, 4)) == 0) + +static u8 s2_memattr_to_attr(u8 memattr) +{ + memattr &= 0b1111; + + switch (memattr) { + case 0b0000: + case 0b0001: + case 0b0010: + case 0b0011: + return memattr << 2; + case 0b0100: + return MEMATTR(Wb, Wb); + case 0b0101: + return MEMATTR(NC, NC); + case 0b0110: + return MEMATTR(Wt, NC); + case 0b0111: + return MEMATTR(Wb, NC); + case 0b1000: + /* Reserved, assume NC */ + return MEMATTR(NC, NC); + case 0b1001: + return MEMATTR(NC, Wt); + case 0b1010: + return MEMATTR(Wt, Wt); + case 0b1011: + return MEMATTR(Wb, Wt); + case 0b1100: + /* Reserved, assume NC */ + return MEMATTR(NC, NC); + case 0b1101: + return MEMATTR(NC, Wb); + case 0b1110: + return MEMATTR(Wt, Wb); + case 0b1111: + return MEMATTR(Wb, Wb); + default: + unreachable(); + } +} + +static u8 combine_s1_s2_attr(u8 s1, u8 s2) +{ + bool transient; + u8 final = 0; + + /* Upgrade transient s1 to non-transient to simplify things */ + switch (s1) { + case 0b0001 ... 0b0011: /* Normal, Write-Through Transient */ + transient = true; + s1 = MEMATTR_Wt | (s1 & GENMASK(1,0)); + break; + case 0b0101 ... 0b0111: /* Normal, Write-Back Transient */ + transient = true; + s1 = MEMATTR_Wb | (s1 & GENMASK(1,0)); + break; + default: + transient = false; + } + + /* S2CombineS1AttrHints() */ + if ((s1 & GENMASK(3, 2)) == MEMATTR_NC || + (s2 & GENMASK(3, 2)) == MEMATTR_NC) + final = MEMATTR_NC; + else if ((s1 & GENMASK(3, 2)) == MEMATTR_Wt || + (s2 & GENMASK(3, 2)) == MEMATTR_Wt) + final = MEMATTR_Wt; + else + final = MEMATTR_Wb; + + if (final != MEMATTR_NC) { + /* Inherit RaWa hints form S1 */ + if (transient) { + switch (s1 & GENMASK(3, 2)) { + case MEMATTR_Wt: + final = 0; + break; + case MEMATTR_Wb: + final = MEMATTR_NC; + break; + } + } + + final |= s1 & GENMASK(1, 0); + } + + return final; +} + +#define ATTR_NSH 0b00 +#define ATTR_RSV 0b01 +#define ATTR_OSH 0b10 +#define ATTR_ISH 0b11 + +static u8 compute_sh(u8 attr, u64 desc) +{ + u8 sh; + + /* Any form of device, as well as NC has SH[1:0]=0b10 */ + if (MEMATTR_IS_DEVICE(attr) || attr == MEMATTR(NC, NC)) + return ATTR_OSH; + + sh = FIELD_GET(PTE_SHARED, desc); + if (sh == ATTR_RSV) /* Reserved, mapped to NSH */ + sh = ATTR_NSH; + + return sh; +} + +static u8 combine_sh(u8 s1_sh, u8 s2_sh) +{ + if (s1_sh == ATTR_OSH || s2_sh == ATTR_OSH) + return ATTR_OSH; + if (s1_sh == ATTR_ISH || s2_sh == ATTR_ISH) + return ATTR_ISH; + + return ATTR_NSH; +} + +static u64 compute_par_s12(struct kvm_vcpu *vcpu, u64 s1_par, + struct kvm_s2_trans *tr) +{ + u8 s1_parattr, s2_memattr, final_attr; + u64 par; + + /* If S2 has failed to translate, report the damage */ + if (tr->esr) { + par = SYS_PAR_EL1_RES1; + par |= SYS_PAR_EL1_F; + par |= SYS_PAR_EL1_S; + par |= FIELD_PREP(SYS_PAR_EL1_FST, tr->esr); + return par; + } + + s1_parattr = FIELD_GET(SYS_PAR_EL1_ATTR, s1_par); + s2_memattr = FIELD_GET(GENMASK(5, 2), tr->desc); + + if (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_FWB) { + if (!kvm_has_feat(vcpu->kvm, ID_AA64PFR2_EL1, MTEPERM, IMP)) + s2_memattr &= ~BIT(3); + + /* Combination of R_VRJSW and R_RHWZM */ + switch (s2_memattr) { + case 0b0101: + if (MEMATTR_IS_DEVICE(s1_parattr)) + final_attr = s1_parattr; + else + final_attr = MEMATTR(NC, NC); + break; + case 0b0110: + case 0b1110: + final_attr = MEMATTR(WbRaWa, WbRaWa); + break; + case 0b0111: + case 0b1111: + /* Preserve S1 attribute */ + final_attr = s1_parattr; + break; + case 0b0100: + case 0b1100: + case 0b1101: + /* Reserved, do something non-silly */ + final_attr = s1_parattr; + break; + default: + /* + * MemAttr[2]=0, Device from S2. + * + * FWB does not influence the way that stage 1 + * memory types and attributes are combined + * with stage 2 Device type and attributes. + */ + final_attr = min(s2_memattr_to_attr(s2_memattr), + s1_parattr); + } + } else { + /* Combination of R_HMNDG, R_TNHFM and R_GQFSF */ + u8 s2_parattr = s2_memattr_to_attr(s2_memattr); + + if (MEMATTR_IS_DEVICE(s1_parattr) || + MEMATTR_IS_DEVICE(s2_parattr)) { + final_attr = min(s1_parattr, s2_parattr); + } else { + /* At this stage, this is memory vs memory */ + final_attr = combine_s1_s2_attr(s1_parattr & 0xf, + s2_parattr & 0xf); + final_attr |= combine_s1_s2_attr(s1_parattr >> 4, + s2_parattr >> 4) << 4; + } + } + + if ((__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_CD) && + !MEMATTR_IS_DEVICE(final_attr)) + final_attr = MEMATTR(NC, NC); + + par = FIELD_PREP(SYS_PAR_EL1_ATTR, final_attr); + par |= tr->output & GENMASK(47, 12); + par |= FIELD_PREP(SYS_PAR_EL1_SH, + combine_sh(FIELD_GET(SYS_PAR_EL1_SH, s1_par), + compute_sh(final_attr, tr->desc))); + + return par; +} + +static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_result *wr, + enum trans_regime regime) +{ + u64 par; + + if (wr->failed) { + par = SYS_PAR_EL1_RES1; + par |= SYS_PAR_EL1_F; + par |= FIELD_PREP(SYS_PAR_EL1_FST, wr->fst); + par |= wr->ptw ? SYS_PAR_EL1_PTW : 0; + par |= wr->s2 ? SYS_PAR_EL1_S : 0; + } else if (wr->level == S1_MMU_DISABLED) { + /* MMU off or HCR_EL2.DC == 1 */ + par = SYS_PAR_EL1_NSE; + par |= wr->pa & GENMASK_ULL(47, 12); + + if (regime == TR_EL10 && + (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) { + par |= FIELD_PREP(SYS_PAR_EL1_ATTR, + MEMATTR(WbRaWa, WbRaWa)); + par |= FIELD_PREP(SYS_PAR_EL1_SH, ATTR_NSH); + } else { + par |= FIELD_PREP(SYS_PAR_EL1_ATTR, 0); /* nGnRnE */ + par |= FIELD_PREP(SYS_PAR_EL1_SH, ATTR_OSH); + } + } else { + u64 mair, sctlr; + u8 sh; + + par = SYS_PAR_EL1_NSE; + + mair = (regime == TR_EL10 ? + vcpu_read_sys_reg(vcpu, MAIR_EL1) : + vcpu_read_sys_reg(vcpu, MAIR_EL2)); + + mair >>= FIELD_GET(PTE_ATTRINDX_MASK, wr->desc) * 8; + mair &= 0xff; + + sctlr = (regime == TR_EL10 ? + vcpu_read_sys_reg(vcpu, SCTLR_EL1) : + vcpu_read_sys_reg(vcpu, SCTLR_EL2)); + + /* Force NC for memory if SCTLR_ELx.C is clear */ + if (!(sctlr & SCTLR_EL1_C) && !MEMATTR_IS_DEVICE(mair)) + mair = MEMATTR(NC, NC); + + par |= FIELD_PREP(SYS_PAR_EL1_ATTR, mair); + par |= wr->pa & GENMASK_ULL(47, 12); + + sh = compute_sh(mair, wr->desc); + par |= FIELD_PREP(SYS_PAR_EL1_SH, sh); + } + + return par; +} + +static bool pan3_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime) +{ + u64 sctlr; + + if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, PAN, PAN3)) + return false; + + if (s1pie_enabled(vcpu, regime)) + return true; + + if (regime == TR_EL10) + sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1); + else + sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2); + + return sctlr & SCTLR_EL1_EPAN; +} + +static void compute_s1_direct_permissions(struct kvm_vcpu *vcpu, + struct s1_walk_info *wi, + struct s1_walk_result *wr) +{ + bool wxn; + + /* Non-hierarchical part of AArch64.S1DirectBasePermissions() */ + if (wi->regime != TR_EL2) { + switch (FIELD_GET(PTE_USER | PTE_RDONLY, wr->desc)) { + case 0b00: + wr->pr = wr->pw = true; + wr->ur = wr->uw = false; + break; + case 0b01: + wr->pr = wr->pw = wr->ur = wr->uw = true; + break; + case 0b10: + wr->pr = true; + wr->pw = wr->ur = wr->uw = false; + break; + case 0b11: + wr->pr = wr->ur = true; + wr->pw = wr->uw = false; + break; + } + + /* We don't use px for anything yet, but hey... */ + wr->px = !((wr->desc & PTE_PXN) || wr->uw); + wr->ux = !(wr->desc & PTE_UXN); + } else { + wr->ur = wr->uw = wr->ux = false; + + if (!(wr->desc & PTE_RDONLY)) { + wr->pr = wr->pw = true; + } else { + wr->pr = true; + wr->pw = false; + } + + /* XN maps to UXN */ + wr->px = !(wr->desc & PTE_UXN); + } + + switch (wi->regime) { + case TR_EL2: + case TR_EL20: + wxn = (vcpu_read_sys_reg(vcpu, SCTLR_EL2) & SCTLR_ELx_WXN); + break; + case TR_EL10: + wxn = (__vcpu_sys_reg(vcpu, SCTLR_EL1) & SCTLR_ELx_WXN); + break; + } + + wr->pwxn = wr->uwxn = wxn; + wr->pov = wi->poe; + wr->uov = wi->e0poe; +} + +static void compute_s1_hierarchical_permissions(struct kvm_vcpu *vcpu, + struct s1_walk_info *wi, + struct s1_walk_result *wr) +{ + /* Hierarchical part of AArch64.S1DirectBasePermissions() */ + if (wi->regime != TR_EL2) { + switch (wr->APTable) { + case 0b00: + break; + case 0b01: + wr->ur = wr->uw = false; + break; + case 0b10: + wr->pw = wr->uw = false; + break; + case 0b11: + wr->pw = wr->ur = wr->uw = false; + break; + } + + wr->px &= !wr->PXNTable; + wr->ux &= !wr->UXNTable; + } else { + if (wr->APTable & BIT(1)) + wr->pw = false; + + /* XN maps to UXN */ + wr->px &= !wr->UXNTable; + } +} + +#define perm_idx(v, r, i) ((vcpu_read_sys_reg((v), (r)) >> ((i) * 4)) & 0xf) + +#define set_priv_perms(wr, r, w, x) \ + do { \ + (wr)->pr = (r); \ + (wr)->pw = (w); \ + (wr)->px = (x); \ + } while (0) + +#define set_unpriv_perms(wr, r, w, x) \ + do { \ + (wr)->ur = (r); \ + (wr)->uw = (w); \ + (wr)->ux = (x); \ + } while (0) + +#define set_priv_wxn(wr, v) \ + do { \ + (wr)->pwxn = (v); \ + } while (0) + +#define set_unpriv_wxn(wr, v) \ + do { \ + (wr)->uwxn = (v); \ + } while (0) + +/* Similar to AArch64.S1IndirectBasePermissions(), without GCS */ +#define set_perms(w, wr, ip) \ + do { \ + /* R_LLZDZ */ \ + switch ((ip)) { \ + case 0b0000: \ + set_ ## w ## _perms((wr), false, false, false); \ + break; \ + case 0b0001: \ + set_ ## w ## _perms((wr), true , false, false); \ + break; \ + case 0b0010: \ + set_ ## w ## _perms((wr), false, false, true ); \ + break; \ + case 0b0011: \ + set_ ## w ## _perms((wr), true , false, true ); \ + break; \ + case 0b0100: \ + set_ ## w ## _perms((wr), false, false, false); \ + break; \ + case 0b0101: \ + set_ ## w ## _perms((wr), true , true , false); \ + break; \ + case 0b0110: \ + set_ ## w ## _perms((wr), true , true , true ); \ + break; \ + case 0b0111: \ + set_ ## w ## _perms((wr), true , true , true ); \ + break; \ + case 0b1000: \ + set_ ## w ## _perms((wr), true , false, false); \ + break; \ + case 0b1001: \ + set_ ## w ## _perms((wr), true , false, false); \ + break; \ + case 0b1010: \ + set_ ## w ## _perms((wr), true , false, true ); \ + break; \ + case 0b1011: \ + set_ ## w ## _perms((wr), false, false, false); \ + break; \ + case 0b1100: \ + set_ ## w ## _perms((wr), true , true , false); \ + break; \ + case 0b1101: \ + set_ ## w ## _perms((wr), false, false, false); \ + break; \ + case 0b1110: \ + set_ ## w ## _perms((wr), true , true , true ); \ + break; \ + case 0b1111: \ + set_ ## w ## _perms((wr), false, false, false); \ + break; \ + } \ + \ + /* R_HJYGR */ \ + set_ ## w ## _wxn((wr), ((ip) == 0b0110)); \ + \ + } while (0) + +static void compute_s1_indirect_permissions(struct kvm_vcpu *vcpu, + struct s1_walk_info *wi, + struct s1_walk_result *wr) +{ + u8 up, pp, idx; + + idx = pte_pi_index(wr->desc); + + switch (wi->regime) { + case TR_EL10: + pp = perm_idx(vcpu, PIR_EL1, idx); + up = perm_idx(vcpu, PIRE0_EL1, idx); + break; + case TR_EL20: + pp = perm_idx(vcpu, PIR_EL2, idx); + up = perm_idx(vcpu, PIRE0_EL2, idx); + break; + case TR_EL2: + pp = perm_idx(vcpu, PIR_EL2, idx); + up = 0; + break; + } + + set_perms(priv, wr, pp); + + if (wi->regime != TR_EL2) + set_perms(unpriv, wr, up); + else + set_unpriv_perms(wr, false, false, false); + + wr->pov = wi->poe && !(pp & BIT(3)); + wr->uov = wi->e0poe && !(up & BIT(3)); + + /* R_VFPJF */ + if (wr->px && wr->uw) { + set_priv_perms(wr, false, false, false); + set_unpriv_perms(wr, false, false, false); + } +} + +static void compute_s1_overlay_permissions(struct kvm_vcpu *vcpu, + struct s1_walk_info *wi, + struct s1_walk_result *wr) +{ + u8 idx, pov_perms, uov_perms; + + idx = FIELD_GET(PTE_PO_IDX_MASK, wr->desc); + + switch (wi->regime) { + case TR_EL10: + pov_perms = perm_idx(vcpu, POR_EL1, idx); + uov_perms = perm_idx(vcpu, POR_EL0, idx); + break; + case TR_EL20: + pov_perms = perm_idx(vcpu, POR_EL2, idx); + uov_perms = perm_idx(vcpu, POR_EL0, idx); + break; + case TR_EL2: + pov_perms = perm_idx(vcpu, POR_EL2, idx); + uov_perms = 0; + break; + } + + if (pov_perms & ~POE_RWX) + pov_perms = POE_NONE; + + if (wi->poe && wr->pov) { + wr->pr &= pov_perms & POE_R; + wr->pw &= pov_perms & POE_W; + wr->px &= pov_perms & POE_X; + } + + if (uov_perms & ~POE_RWX) + uov_perms = POE_NONE; + + if (wi->e0poe && wr->uov) { + wr->ur &= uov_perms & POE_R; + wr->uw &= uov_perms & POE_W; + wr->ux &= uov_perms & POE_X; + } +} + +static void compute_s1_permissions(struct kvm_vcpu *vcpu, + struct s1_walk_info *wi, + struct s1_walk_result *wr) +{ + bool pan; + + if (!s1pie_enabled(vcpu, wi->regime)) + compute_s1_direct_permissions(vcpu, wi, wr); + else + compute_s1_indirect_permissions(vcpu, wi, wr); + + if (!wi->hpd) + compute_s1_hierarchical_permissions(vcpu, wi, wr); + + if (wi->poe || wi->e0poe) + compute_s1_overlay_permissions(vcpu, wi, wr); + + /* R_QXXPC */ + if (wr->pwxn) { + if (!wr->pov && wr->pw) + wr->px = false; + if (wr->pov && wr->px) + wr->pw = false; + } + + /* R_NPBXC */ + if (wr->uwxn) { + if (!wr->uov && wr->uw) + wr->ux = false; + if (wr->uov && wr->ux) + wr->uw = false; + } + + pan = wi->pan && (wr->ur || wr->uw || + (pan3_enabled(vcpu, wi->regime) && wr->ux)); + wr->pw &= !pan; + wr->pr &= !pan; +} + +static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +{ + struct s1_walk_result wr = {}; + struct s1_walk_info wi = {}; + bool perm_fail = false; + int ret, idx; + + ret = setup_s1_walk(vcpu, op, &wi, &wr, vaddr); + if (ret) + goto compute_par; + + if (wr.level == S1_MMU_DISABLED) + goto compute_par; + + idx = srcu_read_lock(&vcpu->kvm->srcu); + + ret = walk_s1(vcpu, &wi, &wr, vaddr); + + srcu_read_unlock(&vcpu->kvm->srcu, idx); + + if (ret) + goto compute_par; + + compute_s1_permissions(vcpu, &wi, &wr); + + switch (op) { + case OP_AT_S1E1RP: + case OP_AT_S1E1R: + case OP_AT_S1E2R: + perm_fail = !wr.pr; + break; + case OP_AT_S1E1WP: + case OP_AT_S1E1W: + case OP_AT_S1E2W: + perm_fail = !wr.pw; + break; + case OP_AT_S1E0R: + perm_fail = !wr.ur; + break; + case OP_AT_S1E0W: + perm_fail = !wr.uw; + break; + case OP_AT_S1E1A: + case OP_AT_S1E2A: + break; + default: + BUG(); + } + + if (perm_fail) + fail_s1_walk(&wr, ESR_ELx_FSC_PERM_L(wr.level), false, false); + +compute_par: + return compute_par_s1(vcpu, &wr, wi.regime); +} + +/* + * Return the PAR_EL1 value as the result of a valid translation. + * + * If the translation is unsuccessful, the value may only contain + * PAR_EL1.F, and cannot be taken at face value. It isn't an + * indication of the translation having failed, only that the fast + * path did not succeed, *unless* it indicates a S1 permission fault. + */ +static u64 __kvm_at_s1e01_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +{ + struct mmu_config config; + struct kvm_s2_mmu *mmu; + bool fail; + u64 par; + + par = SYS_PAR_EL1_F; + + /* + * We've trapped, so everything is live on the CPU. As we will + * be switching contexts behind everybody's back, disable + * interrupts while holding the mmu lock. + */ + guard(write_lock_irqsave)(&vcpu->kvm->mmu_lock); + + /* + * If HCR_EL2.{E2H,TGE} == {1,1}, the MMU context is already + * the right one (as we trapped from vEL2). If not, save the + * full MMU context. + */ + if (vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)) + goto skip_mmu_switch; + + /* + * Obtaining the S2 MMU for a L2 is horribly racy, and we may not + * find it (recycled by another vcpu, for example). When this + * happens, admit defeat immediately and use the SW (slow) path. + */ + mmu = lookup_s2_mmu(vcpu); + if (!mmu) + return par; + + __mmu_config_save(&config); + + write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR0_EL1), SYS_TTBR0); + write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR1_EL1), SYS_TTBR1); + write_sysreg_el1(vcpu_read_sys_reg(vcpu, TCR_EL1), SYS_TCR); + write_sysreg_el1(vcpu_read_sys_reg(vcpu, MAIR_EL1), SYS_MAIR); + if (kvm_has_tcr2(vcpu->kvm)) { + write_sysreg_el1(vcpu_read_sys_reg(vcpu, TCR2_EL1), SYS_TCR2); + if (kvm_has_s1pie(vcpu->kvm)) { + write_sysreg_el1(vcpu_read_sys_reg(vcpu, PIR_EL1), SYS_PIR); + write_sysreg_el1(vcpu_read_sys_reg(vcpu, PIRE0_EL1), SYS_PIRE0); + } + if (kvm_has_s1poe(vcpu->kvm)) { + write_sysreg_el1(vcpu_read_sys_reg(vcpu, POR_EL1), SYS_POR); + write_sysreg_s(vcpu_read_sys_reg(vcpu, POR_EL0), SYS_POR_EL0); + } + } + write_sysreg_el1(vcpu_read_sys_reg(vcpu, SCTLR_EL1), SYS_SCTLR); + __load_stage2(mmu, mmu->arch); + +skip_mmu_switch: + /* Clear TGE, enable S2 translation, we're rolling */ + write_sysreg((config.hcr & ~HCR_TGE) | HCR_VM, hcr_el2); + isb(); + + switch (op) { + case OP_AT_S1E1RP: + case OP_AT_S1E1WP: + fail = at_s1e1p_fast(vcpu, op, vaddr); + break; + case OP_AT_S1E1R: + fail = __kvm_at(OP_AT_S1E1R, vaddr); + break; + case OP_AT_S1E1W: + fail = __kvm_at(OP_AT_S1E1W, vaddr); + break; + case OP_AT_S1E0R: + fail = __kvm_at(OP_AT_S1E0R, vaddr); + break; + case OP_AT_S1E0W: + fail = __kvm_at(OP_AT_S1E0W, vaddr); + break; + case OP_AT_S1E1A: + fail = __kvm_at(OP_AT_S1E1A, vaddr); + break; + default: + WARN_ON_ONCE(1); + fail = true; + break; + } + + if (!fail) + par = read_sysreg_par(); + + if (!(vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu))) + __mmu_config_restore(&config); + + return par; +} + +static bool par_check_s1_perm_fault(u64 par) +{ + u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par); + + return ((fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_PERM && + !(par & SYS_PAR_EL1_S)); +} + +void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +{ + u64 par = __kvm_at_s1e01_fast(vcpu, op, vaddr); + + /* + * If PAR_EL1 reports that AT failed on a S1 permission fault, we + * know for sure that the PTW was able to walk the S1 tables and + * there's nothing else to do. + * + * If AT failed for any other reason, then we must walk the guest S1 + * to emulate the instruction. + */ + if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par)) + par = handle_at_slow(vcpu, op, vaddr); + + vcpu_write_sys_reg(vcpu, par, PAR_EL1); +} + +void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +{ + u64 par; + + /* + * We've trapped, so everything is live on the CPU. As we will be + * switching context behind everybody's back, disable interrupts... + */ + scoped_guard(write_lock_irqsave, &vcpu->kvm->mmu_lock) { + u64 val, hcr; + bool fail; + + val = hcr = read_sysreg(hcr_el2); + val &= ~HCR_TGE; + val |= HCR_VM; + + if (!vcpu_el2_e2h_is_set(vcpu)) + val |= HCR_NV | HCR_NV1; + + write_sysreg(val, hcr_el2); + isb(); + + par = SYS_PAR_EL1_F; + + switch (op) { + case OP_AT_S1E2R: + fail = __kvm_at(OP_AT_S1E1R, vaddr); + break; + case OP_AT_S1E2W: + fail = __kvm_at(OP_AT_S1E1W, vaddr); + break; + case OP_AT_S1E2A: + fail = __kvm_at(OP_AT_S1E1A, vaddr); + break; + default: + WARN_ON_ONCE(1); + fail = true; + } + + isb(); + + if (!fail) + par = read_sysreg_par(); + + write_sysreg(hcr, hcr_el2); + isb(); + } + + /* We failed the translation, let's replay it in slow motion */ + if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par)) + par = handle_at_slow(vcpu, op, vaddr); + + vcpu_write_sys_reg(vcpu, par, PAR_EL1); +} + +void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +{ + struct kvm_s2_trans out = {}; + u64 ipa, par; + bool write; + int ret; + + /* Do the stage-1 translation */ + switch (op) { + case OP_AT_S12E1R: + op = OP_AT_S1E1R; + write = false; + break; + case OP_AT_S12E1W: + op = OP_AT_S1E1W; + write = true; + break; + case OP_AT_S12E0R: + op = OP_AT_S1E0R; + write = false; + break; + case OP_AT_S12E0W: + op = OP_AT_S1E0W; + write = true; + break; + default: + WARN_ON_ONCE(1); + return; + } + + __kvm_at_s1e01(vcpu, op, vaddr); + par = vcpu_read_sys_reg(vcpu, PAR_EL1); + if (par & SYS_PAR_EL1_F) + return; + + /* + * If we only have a single stage of translation (E2H=0 or + * TGE=1), exit early. Same thing if {VM,DC}=={0,0}. + */ + if (!vcpu_el2_e2h_is_set(vcpu) || vcpu_el2_tge_is_set(vcpu) || + !(vcpu_read_sys_reg(vcpu, HCR_EL2) & (HCR_VM | HCR_DC))) + return; + + /* Do the stage-2 translation */ + ipa = (par & GENMASK_ULL(47, 12)) | (vaddr & GENMASK_ULL(11, 0)); + out.esr = 0; + ret = kvm_walk_nested_s2(vcpu, ipa, &out); + if (ret < 0) + return; + + /* Check the access permission */ + if (!out.esr && + ((!write && !out.readable) || (write && !out.writable))) + out.esr = ESR_ELx_FSC_PERM_L(out.level & 0x3); + + par = compute_par_s12(vcpu, par, &out); + vcpu_write_sys_reg(vcpu, par, PAR_EL1); +} diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index ce8886122ed3..0e4c805e7e89 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -3,7 +3,8 @@ * Debug and Guest Debug support * * Copyright (C) 2015 - Linaro Ltd - * Author: Alex Bennée <alex.bennee@linaro.org> + * Authors: Alex Bennée <alex.bennee@linaro.org> + * Oliver Upton <oliver.upton@linux.dev> */ #include <linux/kvm_host.h> @@ -14,72 +15,6 @@ #include <asm/kvm_arm.h> #include <asm/kvm_emulate.h> -#include "trace.h" - -/* These are the bits of MDSCR_EL1 we may manipulate */ -#define MDSCR_EL1_DEBUG_MASK (DBG_MDSCR_SS | \ - DBG_MDSCR_KDE | \ - DBG_MDSCR_MDE) - -static DEFINE_PER_CPU(u64, mdcr_el2); - -/* - * save/restore_guest_debug_regs - * - * For some debug operations we need to tweak some guest registers. As - * a result we need to save the state of those registers before we - * make those modifications. - * - * Guest access to MDSCR_EL1 is trapped by the hypervisor and handled - * after we have restored the preserved value to the main context. - * - * When single-step is enabled by userspace, we tweak PSTATE.SS on every - * guest entry. Preserve PSTATE.SS so we can restore the original value - * for the vcpu after the single-step is disabled. - */ -static void save_guest_debug_regs(struct kvm_vcpu *vcpu) -{ - u64 val = vcpu_read_sys_reg(vcpu, MDSCR_EL1); - - vcpu->arch.guest_debug_preserved.mdscr_el1 = val; - - trace_kvm_arm_set_dreg32("Saved MDSCR_EL1", - vcpu->arch.guest_debug_preserved.mdscr_el1); - - vcpu->arch.guest_debug_preserved.pstate_ss = - (*vcpu_cpsr(vcpu) & DBG_SPSR_SS); -} - -static void restore_guest_debug_regs(struct kvm_vcpu *vcpu) -{ - u64 val = vcpu->arch.guest_debug_preserved.mdscr_el1; - - vcpu_write_sys_reg(vcpu, val, MDSCR_EL1); - - trace_kvm_arm_set_dreg32("Restored MDSCR_EL1", - vcpu_read_sys_reg(vcpu, MDSCR_EL1)); - - if (vcpu->arch.guest_debug_preserved.pstate_ss) - *vcpu_cpsr(vcpu) |= DBG_SPSR_SS; - else - *vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS; -} - -/** - * kvm_arm_init_debug - grab what we need for debug - * - * Currently the sole task of this function is to retrieve the initial - * value of mdcr_el2 so we can preserve MDCR_EL2.HPMN which has - * presumably been set-up by some knowledgeable bootcode. - * - * It is called once per-cpu during CPU hyp initialisation. - */ - -void kvm_arm_init_debug(void) -{ - __this_cpu_write(mdcr_el2, kvm_call_hyp_ret(__kvm_get_mdcr_el2)); -} - /** * kvm_arm_setup_mdcr_el2 - configure vcpu mdcr_el2 value * @@ -95,11 +30,14 @@ void kvm_arm_init_debug(void) */ static void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu) { + preempt_disable(); + /* * This also clears MDCR_EL2_E2PB_MASK and MDCR_EL2_E2TB_MASK * to disable guest access to the profiling and trace buffers */ - vcpu->arch.mdcr_el2 = __this_cpu_read(mdcr_el2) & MDCR_EL2_HPMN_MASK; + vcpu->arch.mdcr_el2 = FIELD_PREP(MDCR_EL2_HPMN, + *host_data_ptr(nr_event_counters)); vcpu->arch.mdcr_el2 |= (MDCR_EL2_TPM | MDCR_EL2_TPMS | MDCR_EL2_TTRF | @@ -113,233 +51,215 @@ static void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu) vcpu->arch.mdcr_el2 |= MDCR_EL2_TDE; /* - * Trap debug register access when one of the following is true: - * - Userspace is using the hardware to debug the guest - * (KVM_GUESTDBG_USE_HW is set). - * - The guest is not using debug (DEBUG_DIRTY clear). - * - The guest has enabled the OS Lock (debug exceptions are blocked). + * Trap debug registers if the guest doesn't have ownership of them. */ - if ((vcpu->guest_debug & KVM_GUESTDBG_USE_HW) || - !vcpu_get_flag(vcpu, DEBUG_DIRTY) || - kvm_vcpu_os_lock_enabled(vcpu)) + if (!kvm_guest_owns_debug_regs(vcpu)) vcpu->arch.mdcr_el2 |= MDCR_EL2_TDA; - trace_kvm_arm_set_dreg32("MDCR_EL2", vcpu->arch.mdcr_el2); -} + /* Write MDCR_EL2 directly if we're already at EL2 */ + if (has_vhe()) + write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2); -/** - * kvm_arm_vcpu_init_debug - setup vcpu debug traps - * - * @vcpu: the vcpu pointer - * - * Set vcpu initial mdcr_el2 value. - */ -void kvm_arm_vcpu_init_debug(struct kvm_vcpu *vcpu) -{ - preempt_disable(); - kvm_arm_setup_mdcr_el2(vcpu); preempt_enable(); } -/** - * kvm_arm_reset_debug_ptr - reset the debug ptr to point to the vcpu state - * @vcpu: the vcpu pointer - */ - -void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu) +void kvm_init_host_debug_data(void) { - vcpu->arch.debug_ptr = &vcpu->arch.vcpu_debug_state; + u64 dfr0 = read_sysreg(id_aa64dfr0_el1); + + if (cpuid_feature_extract_signed_field(dfr0, ID_AA64DFR0_EL1_PMUVer_SHIFT) > 0) + *host_data_ptr(nr_event_counters) = FIELD_GET(ARMV8_PMU_PMCR_N, + read_sysreg(pmcr_el0)); + + *host_data_ptr(debug_brps) = SYS_FIELD_GET(ID_AA64DFR0_EL1, BRPs, dfr0); + *host_data_ptr(debug_wrps) = SYS_FIELD_GET(ID_AA64DFR0_EL1, WRPs, dfr0); + + if (has_vhe()) + return; + + if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_PMSVer_SHIFT) && + !(read_sysreg_s(SYS_PMBIDR_EL1) & PMBIDR_EL1_P)) + host_data_set_flag(HAS_SPE); + + if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_TraceFilt_SHIFT)) { + /* Force disable trace in protected mode in case of no TRBE */ + if (is_protected_kvm_enabled()) + host_data_set_flag(EL1_TRACING_CONFIGURED); + + if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_TraceBuffer_SHIFT) && + !(read_sysreg_s(SYS_TRBIDR_EL1) & TRBIDR_EL1_P)) + host_data_set_flag(HAS_TRBE); + } } -/** - * kvm_arm_setup_debug - set up debug related stuff +/* + * Configures the 'external' MDSCR_EL1 value for the guest, i.e. when the host + * has taken over MDSCR_EL1. * - * @vcpu: the vcpu pointer + * - Userspace is single-stepping the guest, and MDSCR_EL1.SS is forced to 1. * - * This is called before each entry into the hypervisor to setup any - * debug related registers. + * - Userspace is using the breakpoint/watchpoint registers to debug the + * guest, and MDSCR_EL1.MDE is forced to 1. * - * Additionally, KVM only traps guest accesses to the debug registers if - * the guest is not actively using them (see the DEBUG_DIRTY - * flag on vcpu->arch.iflags). Since the guest must not interfere - * with the hardware state when debugging the guest, we must ensure that - * trapping is enabled whenever we are debugging the guest using the - * debug registers. + * - The guest has enabled the OS Lock, and KVM is forcing MDSCR_EL1.MDE to 0, + * masking all debug exceptions affected by the OS Lock. */ - -void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) +static void setup_external_mdscr(struct kvm_vcpu *vcpu) { - unsigned long mdscr, orig_mdcr_el2 = vcpu->arch.mdcr_el2; + /* + * Use the guest's MDSCR_EL1 as a starting point, since there are + * several other features controlled by MDSCR_EL1 that are not relevant + * to the host. + * + * Clear the bits that KVM may use which also satisfies emulation of + * the OS Lock as MDSCR_EL1.MDE is cleared. + */ + u64 mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1) & ~(MDSCR_EL1_SS | + MDSCR_EL1_MDE | + MDSCR_EL1_KDE); - trace_kvm_arm_setup_debug(vcpu, vcpu->guest_debug); + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) + mdscr |= MDSCR_EL1_SS; - kvm_arm_setup_mdcr_el2(vcpu); + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) + mdscr |= MDSCR_EL1_MDE | MDSCR_EL1_KDE; - /* Check if we need to use the debug registers. */ + vcpu->arch.external_mdscr_el1 = mdscr; +} + +void kvm_vcpu_load_debug(struct kvm_vcpu *vcpu) +{ + u64 mdscr; + + /* Must be called before kvm_vcpu_load_vhe() */ + KVM_BUG_ON(vcpu_get_flag(vcpu, SYSREGS_ON_CPU), vcpu->kvm); + + /* + * Determine which of the possible debug states we're in: + * + * - VCPU_DEBUG_HOST_OWNED: KVM has taken ownership of the guest's + * breakpoint/watchpoint registers, or needs to use MDSCR_EL1 to do + * software step or emulate the effects of the OS Lock being enabled. + * + * - VCPU_DEBUG_GUEST_OWNED: The guest has debug exceptions enabled, and + * the breakpoint/watchpoint registers need to be loaded eagerly. + * + * - VCPU_DEBUG_FREE: Neither of the above apply, no breakpoint/watchpoint + * context needs to be loaded on the CPU. + */ if (vcpu->guest_debug || kvm_vcpu_os_lock_enabled(vcpu)) { - /* Save guest debug state */ - save_guest_debug_regs(vcpu); + vcpu->arch.debug_owner = VCPU_DEBUG_HOST_OWNED; + setup_external_mdscr(vcpu); /* - * Single Step (ARM ARM D2.12.3 The software step state - * machine) - * - * If we are doing Single Step we need to manipulate - * the guest's MDSCR_EL1.SS and PSTATE.SS. Once the - * step has occurred the hypervisor will trap the - * debug exception and we return to userspace. - * - * If the guest attempts to single step its userspace - * we would have to deal with a trapped exception - * while in the guest kernel. Because this would be - * hard to unwind we suppress the guest's ability to - * do so by masking MDSCR_EL.SS. - * - * This confuses guest debuggers which use - * single-step behind the scenes but everything - * returns to normal once the host is no longer - * debugging the system. + * Steal the guest's single-step state machine if userspace wants + * single-step the guest. */ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { - /* - * If the software step state at the last guest exit - * was Active-pending, we don't set DBG_SPSR_SS so - * that the state is maintained (to not run another - * single-step until the pending Software Step - * exception is taken). - */ - if (!vcpu_get_flag(vcpu, DBG_SS_ACTIVE_PENDING)) + if (*vcpu_cpsr(vcpu) & DBG_SPSR_SS) + vcpu_clear_flag(vcpu, GUEST_SS_ACTIVE_PENDING); + else + vcpu_set_flag(vcpu, GUEST_SS_ACTIVE_PENDING); + + if (!vcpu_get_flag(vcpu, HOST_SS_ACTIVE_PENDING)) *vcpu_cpsr(vcpu) |= DBG_SPSR_SS; else *vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS; - - mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1); - mdscr |= DBG_MDSCR_SS; - vcpu_write_sys_reg(vcpu, mdscr, MDSCR_EL1); - } else { - mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1); - mdscr &= ~DBG_MDSCR_SS; - vcpu_write_sys_reg(vcpu, mdscr, MDSCR_EL1); } + } else { + mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1); - trace_kvm_arm_set_dreg32("SPSR_EL2", *vcpu_cpsr(vcpu)); - - /* - * HW Breakpoints and watchpoints - * - * We simply switch the debug_ptr to point to our new - * external_debug_state which has been populated by the - * debug ioctl. The existing DEBUG_DIRTY mechanism ensures - * the registers are updated on the world switch. - */ - if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) { - /* Enable breakpoints/watchpoints */ - mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1); - mdscr |= DBG_MDSCR_MDE; - vcpu_write_sys_reg(vcpu, mdscr, MDSCR_EL1); - - vcpu->arch.debug_ptr = &vcpu->arch.external_debug_state; - vcpu_set_flag(vcpu, DEBUG_DIRTY); - - trace_kvm_arm_set_regset("BKPTS", get_num_brps(), - &vcpu->arch.debug_ptr->dbg_bcr[0], - &vcpu->arch.debug_ptr->dbg_bvr[0]); - - trace_kvm_arm_set_regset("WAPTS", get_num_wrps(), - &vcpu->arch.debug_ptr->dbg_wcr[0], - &vcpu->arch.debug_ptr->dbg_wvr[0]); - - /* - * The OS Lock blocks debug exceptions in all ELs when it is - * enabled. If the guest has enabled the OS Lock, constrain its - * effects to the guest. Emulate the behavior by clearing - * MDSCR_EL1.MDE. In so doing, we ensure that host debug - * exceptions are unaffected by guest configuration of the OS - * Lock. - */ - } else if (kvm_vcpu_os_lock_enabled(vcpu)) { - mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1); - mdscr &= ~DBG_MDSCR_MDE; - vcpu_write_sys_reg(vcpu, mdscr, MDSCR_EL1); - } + if (mdscr & (MDSCR_EL1_KDE | MDSCR_EL1_MDE)) + vcpu->arch.debug_owner = VCPU_DEBUG_GUEST_OWNED; + else + vcpu->arch.debug_owner = VCPU_DEBUG_FREE; } - BUG_ON(!vcpu->guest_debug && - vcpu->arch.debug_ptr != &vcpu->arch.vcpu_debug_state); - - /* If KDE or MDE are set, perform a full save/restore cycle. */ - if (vcpu_read_sys_reg(vcpu, MDSCR_EL1) & (DBG_MDSCR_KDE | DBG_MDSCR_MDE)) - vcpu_set_flag(vcpu, DEBUG_DIRTY); - - /* Write mdcr_el2 changes since vcpu_load on VHE systems */ - if (has_vhe() && orig_mdcr_el2 != vcpu->arch.mdcr_el2) - write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2); - - trace_kvm_arm_set_dreg32("MDSCR_EL1", vcpu_read_sys_reg(vcpu, MDSCR_EL1)); + kvm_arm_setup_mdcr_el2(vcpu); } -void kvm_arm_clear_debug(struct kvm_vcpu *vcpu) +void kvm_vcpu_put_debug(struct kvm_vcpu *vcpu) { - trace_kvm_arm_clear_debug(vcpu->guest_debug); + if (likely(!(vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP))) + return; /* - * Restore the guest's debug registers if we were using them. + * Save the host's software step state and restore the guest's before + * potentially returning to userspace. */ - if (vcpu->guest_debug || kvm_vcpu_os_lock_enabled(vcpu)) { - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { - if (!(*vcpu_cpsr(vcpu) & DBG_SPSR_SS)) - /* - * Mark the vcpu as ACTIVE_PENDING - * until Software Step exception is taken. - */ - vcpu_set_flag(vcpu, DBG_SS_ACTIVE_PENDING); - } - - restore_guest_debug_regs(vcpu); + if (!(*vcpu_cpsr(vcpu) & DBG_SPSR_SS)) + vcpu_set_flag(vcpu, HOST_SS_ACTIVE_PENDING); + else + vcpu_clear_flag(vcpu, HOST_SS_ACTIVE_PENDING); - /* - * If we were using HW debug we need to restore the - * debug_ptr to the guest debug state. - */ - if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) { - kvm_arm_reset_debug_ptr(vcpu); + if (vcpu_get_flag(vcpu, GUEST_SS_ACTIVE_PENDING)) + *vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS; + else + *vcpu_cpsr(vcpu) |= DBG_SPSR_SS; +} - trace_kvm_arm_set_regset("BKPTS", get_num_brps(), - &vcpu->arch.debug_ptr->dbg_bcr[0], - &vcpu->arch.debug_ptr->dbg_bvr[0]); +/* + * Updates ownership of the debug registers after a trapped guest access to a + * breakpoint/watchpoint register. Host ownership of the debug registers is of + * strictly higher priority, and it is the responsibility of the VMM to emulate + * guest debug exceptions in this configuration. + */ +void kvm_debug_set_guest_ownership(struct kvm_vcpu *vcpu) +{ + if (kvm_host_owns_debug_regs(vcpu)) + return; - trace_kvm_arm_set_regset("WAPTS", get_num_wrps(), - &vcpu->arch.debug_ptr->dbg_wcr[0], - &vcpu->arch.debug_ptr->dbg_wvr[0]); - } - } + vcpu->arch.debug_owner = VCPU_DEBUG_GUEST_OWNED; + kvm_arm_setup_mdcr_el2(vcpu); } -void kvm_arch_vcpu_load_debug_state_flags(struct kvm_vcpu *vcpu) +void kvm_debug_handle_oslar(struct kvm_vcpu *vcpu, u64 val) { - u64 dfr0; + if (val & OSLAR_EL1_OSLK) + __vcpu_sys_reg(vcpu, OSLSR_EL1) |= OSLSR_EL1_OSLK; + else + __vcpu_sys_reg(vcpu, OSLSR_EL1) &= ~OSLSR_EL1_OSLK; - /* For VHE, there is nothing to do */ - if (has_vhe()) + preempt_disable(); + kvm_arch_vcpu_put(vcpu); + kvm_arch_vcpu_load(vcpu, smp_processor_id()); + preempt_enable(); +} + +void kvm_enable_trbe(void) +{ + if (has_vhe() || is_protected_kvm_enabled() || + WARN_ON_ONCE(preemptible())) return; - dfr0 = read_sysreg(id_aa64dfr0_el1); - /* - * If SPE is present on this CPU and is available at current EL, - * we may need to check if the host state needs to be saved. - */ - if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_PMSVer_SHIFT) && - !(read_sysreg_s(SYS_PMBIDR_EL1) & BIT(PMBIDR_EL1_P_SHIFT))) - vcpu_set_flag(vcpu, DEBUG_STATE_SAVE_SPE); + host_data_set_flag(TRBE_ENABLED); +} +EXPORT_SYMBOL_GPL(kvm_enable_trbe); + +void kvm_disable_trbe(void) +{ + if (has_vhe() || is_protected_kvm_enabled() || + WARN_ON_ONCE(preemptible())) + return; - /* Check if we have TRBE implemented and available at the host */ - if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_TraceBuffer_SHIFT) && - !(read_sysreg_s(SYS_TRBIDR_EL1) & TRBIDR_EL1_P)) - vcpu_set_flag(vcpu, DEBUG_STATE_SAVE_TRBE); + host_data_clear_flag(TRBE_ENABLED); } +EXPORT_SYMBOL_GPL(kvm_disable_trbe); -void kvm_arch_vcpu_put_debug_state_flags(struct kvm_vcpu *vcpu) +void kvm_tracing_set_el1_configuration(u64 trfcr_while_in_guest) { - vcpu_clear_flag(vcpu, DEBUG_STATE_SAVE_SPE); - vcpu_clear_flag(vcpu, DEBUG_STATE_SAVE_TRBE); + if (is_protected_kvm_enabled() || WARN_ON_ONCE(preemptible())) + return; + + if (has_vhe()) { + write_sysreg_s(trfcr_while_in_guest, SYS_TRFCR_EL12); + return; + } + + *host_data_ptr(trfcr_while_in_guest) = trfcr_while_in_guest; + if (read_sysreg_s(SYS_TRFCR_EL1) != trfcr_while_in_guest) + host_data_set_flag(EL1_TRACING_CONFIGURED); + else + host_data_clear_flag(EL1_TRACING_CONFIGURED); } +EXPORT_SYMBOL_GPL(kvm_tracing_set_el1_configuration); diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c index 4697ba41b3a9..0fcfcc0478f9 100644 --- a/arch/arm64/kvm/emulate-nested.c +++ b/arch/arm64/kvm/emulate-nested.c @@ -16,9 +16,13 @@ enum trap_behaviour { BEHAVE_HANDLE_LOCALLY = 0, + BEHAVE_FORWARD_READ = BIT(0), BEHAVE_FORWARD_WRITE = BIT(1), - BEHAVE_FORWARD_ANY = BEHAVE_FORWARD_READ | BEHAVE_FORWARD_WRITE, + BEHAVE_FORWARD_RW = BEHAVE_FORWARD_READ | BEHAVE_FORWARD_WRITE, + + /* Traps that take effect in Host EL0, this is rare! */ + BEHAVE_FORWARD_IN_HOST_EL0 = BIT(2), }; struct trap_bits { @@ -79,25 +83,43 @@ enum cgt_group_id { CGT_MDCR_E2TB, CGT_MDCR_TDCC, + CGT_CPTR_TAM, + CGT_CPTR_TCPAC, + + CGT_HCRX_EnFPM, + CGT_HCRX_TCR2En, + + CGT_CNTHCTL_EL1TVT, + CGT_CNTHCTL_EL1TVCT, + + CGT_ICH_HCR_TC, + CGT_ICH_HCR_TALL0, + CGT_ICH_HCR_TALL1, + CGT_ICH_HCR_TDIR, + /* * Anything after this point is a combination of coarse trap * controls, which must all be evaluated to decide what to do. */ __MULTIPLE_CONTROL_BITS__, - CGT_HCR_IMO_FMO = __MULTIPLE_CONTROL_BITS__, + CGT_HCR_IMO_FMO_ICH_HCR_TC = __MULTIPLE_CONTROL_BITS__, CGT_HCR_TID2_TID4, CGT_HCR_TTLB_TTLBIS, CGT_HCR_TTLB_TTLBOS, CGT_HCR_TVM_TRVM, + CGT_HCR_TVM_TRVM_HCRX_TCR2En, CGT_HCR_TPU_TICAB, CGT_HCR_TPU_TOCU, CGT_HCR_NV1_nNV2_ENSCXT, CGT_MDCR_TPM_TPMCR, + CGT_MDCR_TPM_HPMN, CGT_MDCR_TDE_TDA, CGT_MDCR_TDE_TDOSA, CGT_MDCR_TDE_TDRA, CGT_MDCR_TDCC_TDE_TDA, + CGT_ICH_HCR_TC_TDIR, + /* * Anything after this point requires a callback evaluating a * complex trap condition. Ugly stuff. @@ -105,6 +127,11 @@ enum cgt_group_id { __COMPLEX_CONDITIONS__, CGT_CNTHCTL_EL1PCTEN = __COMPLEX_CONDITIONS__, CGT_CNTHCTL_EL1PTEN, + CGT_CNTHCTL_EL1NVPCT, + CGT_CNTHCTL_EL1NVVCT, + + CGT_CPTR_TTA, + CGT_MDCR_HPMN, /* Must be last */ __NR_CGT_GROUP_IDS__ @@ -121,7 +148,7 @@ static const struct trap_bits coarse_trap_bits[] = { .index = HCR_EL2, .value = HCR_TID2, .mask = HCR_TID2, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TID3] = { .index = HCR_EL2, @@ -145,37 +172,37 @@ static const struct trap_bits coarse_trap_bits[] = { .index = HCR_EL2, .value = HCR_TIDCP, .mask = HCR_TIDCP, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TACR] = { .index = HCR_EL2, .value = HCR_TACR, .mask = HCR_TACR, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TSW] = { .index = HCR_EL2, .value = HCR_TSW, .mask = HCR_TSW, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TPC] = { /* Also called TCPC when FEAT_DPB is implemented */ .index = HCR_EL2, .value = HCR_TPC, .mask = HCR_TPC, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TPU] = { .index = HCR_EL2, .value = HCR_TPU, .mask = HCR_TPU, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TTLB] = { .index = HCR_EL2, .value = HCR_TTLB, .mask = HCR_TTLB, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TVM] = { .index = HCR_EL2, @@ -187,7 +214,7 @@ static const struct trap_bits coarse_trap_bits[] = { .index = HCR_EL2, .value = HCR_TDZ, .mask = HCR_TDZ, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TRVM] = { .index = HCR_EL2, @@ -199,151 +226,213 @@ static const struct trap_bits coarse_trap_bits[] = { .index = HCR_EL2, .value = HCR_TLOR, .mask = HCR_TLOR, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TERR] = { .index = HCR_EL2, .value = HCR_TERR, .mask = HCR_TERR, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_APK] = { .index = HCR_EL2, .value = 0, .mask = HCR_APK, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_NV] = { .index = HCR_EL2, .value = HCR_NV, .mask = HCR_NV, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_NV_nNV2] = { .index = HCR_EL2, .value = HCR_NV, .mask = HCR_NV | HCR_NV2, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_NV1_nNV2] = { .index = HCR_EL2, .value = HCR_NV | HCR_NV1, .mask = HCR_NV | HCR_NV1 | HCR_NV2, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_AT] = { .index = HCR_EL2, .value = HCR_AT, .mask = HCR_AT, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_nFIEN] = { .index = HCR_EL2, .value = 0, .mask = HCR_FIEN, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TID4] = { .index = HCR_EL2, .value = HCR_TID4, .mask = HCR_TID4, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TICAB] = { .index = HCR_EL2, .value = HCR_TICAB, .mask = HCR_TICAB, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TOCU] = { .index = HCR_EL2, .value = HCR_TOCU, .mask = HCR_TOCU, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_ENSCXT] = { .index = HCR_EL2, .value = 0, .mask = HCR_ENSCXT, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TTLBIS] = { .index = HCR_EL2, .value = HCR_TTLBIS, .mask = HCR_TTLBIS, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TTLBOS] = { .index = HCR_EL2, .value = HCR_TTLBOS, .mask = HCR_TTLBOS, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_TPMCR] = { .index = MDCR_EL2, .value = MDCR_EL2_TPMCR, .mask = MDCR_EL2_TPMCR, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW | + BEHAVE_FORWARD_IN_HOST_EL0, }, [CGT_MDCR_TPM] = { .index = MDCR_EL2, .value = MDCR_EL2_TPM, .mask = MDCR_EL2_TPM, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW | + BEHAVE_FORWARD_IN_HOST_EL0, }, [CGT_MDCR_TDE] = { .index = MDCR_EL2, .value = MDCR_EL2_TDE, .mask = MDCR_EL2_TDE, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_TDA] = { .index = MDCR_EL2, .value = MDCR_EL2_TDA, .mask = MDCR_EL2_TDA, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_TDOSA] = { .index = MDCR_EL2, .value = MDCR_EL2_TDOSA, .mask = MDCR_EL2_TDOSA, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_TDRA] = { .index = MDCR_EL2, .value = MDCR_EL2_TDRA, .mask = MDCR_EL2_TDRA, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_E2PB] = { .index = MDCR_EL2, .value = 0, .mask = BIT(MDCR_EL2_E2PB_SHIFT), - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_TPMS] = { .index = MDCR_EL2, .value = MDCR_EL2_TPMS, .mask = MDCR_EL2_TPMS, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_TTRF] = { .index = MDCR_EL2, .value = MDCR_EL2_TTRF, .mask = MDCR_EL2_TTRF, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_E2TB] = { .index = MDCR_EL2, .value = 0, .mask = BIT(MDCR_EL2_E2TB_SHIFT), - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_TDCC] = { .index = MDCR_EL2, .value = MDCR_EL2_TDCC, .mask = MDCR_EL2_TDCC, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_CPTR_TAM] = { + .index = CPTR_EL2, + .value = CPTR_EL2_TAM, + .mask = CPTR_EL2_TAM, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_CPTR_TCPAC] = { + .index = CPTR_EL2, + .value = CPTR_EL2_TCPAC, + .mask = CPTR_EL2_TCPAC, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_HCRX_EnFPM] = { + .index = HCRX_EL2, + .value = 0, + .mask = HCRX_EL2_EnFPM, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_HCRX_TCR2En] = { + .index = HCRX_EL2, + .value = 0, + .mask = HCRX_EL2_TCR2En, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_CNTHCTL_EL1TVT] = { + .index = CNTHCTL_EL2, + .value = CNTHCTL_EL1TVT, + .mask = CNTHCTL_EL1TVT, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_CNTHCTL_EL1TVCT] = { + .index = CNTHCTL_EL2, + .value = CNTHCTL_EL1TVCT, + .mask = CNTHCTL_EL1TVCT, + .behaviour = BEHAVE_FORWARD_READ, + }, + [CGT_ICH_HCR_TC] = { + .index = ICH_HCR_EL2, + .value = ICH_HCR_EL2_TC, + .mask = ICH_HCR_EL2_TC, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_ICH_HCR_TALL0] = { + .index = ICH_HCR_EL2, + .value = ICH_HCR_EL2_TALL0, + .mask = ICH_HCR_EL2_TALL0, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_ICH_HCR_TALL1] = { + .index = ICH_HCR_EL2, + .value = ICH_HCR_EL2_TALL1, + .mask = ICH_HCR_EL2_TALL1, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_ICH_HCR_TDIR] = { + .index = ICH_HCR_EL2, + .value = ICH_HCR_EL2_TDIR, + .mask = ICH_HCR_EL2_TDIR, + .behaviour = BEHAVE_FORWARD_RW, }, }; @@ -354,19 +443,24 @@ static const struct trap_bits coarse_trap_bits[] = { } static const enum cgt_group_id *coarse_control_combo[] = { - MCB(CGT_HCR_IMO_FMO, CGT_HCR_IMO, CGT_HCR_FMO), MCB(CGT_HCR_TID2_TID4, CGT_HCR_TID2, CGT_HCR_TID4), MCB(CGT_HCR_TTLB_TTLBIS, CGT_HCR_TTLB, CGT_HCR_TTLBIS), MCB(CGT_HCR_TTLB_TTLBOS, CGT_HCR_TTLB, CGT_HCR_TTLBOS), MCB(CGT_HCR_TVM_TRVM, CGT_HCR_TVM, CGT_HCR_TRVM), + MCB(CGT_HCR_TVM_TRVM_HCRX_TCR2En, + CGT_HCR_TVM, CGT_HCR_TRVM, CGT_HCRX_TCR2En), MCB(CGT_HCR_TPU_TICAB, CGT_HCR_TPU, CGT_HCR_TICAB), MCB(CGT_HCR_TPU_TOCU, CGT_HCR_TPU, CGT_HCR_TOCU), MCB(CGT_HCR_NV1_nNV2_ENSCXT, CGT_HCR_NV1_nNV2, CGT_HCR_ENSCXT), MCB(CGT_MDCR_TPM_TPMCR, CGT_MDCR_TPM, CGT_MDCR_TPMCR), + MCB(CGT_MDCR_TPM_HPMN, CGT_MDCR_TPM, CGT_MDCR_HPMN), MCB(CGT_MDCR_TDE_TDA, CGT_MDCR_TDE, CGT_MDCR_TDA), MCB(CGT_MDCR_TDE_TDOSA, CGT_MDCR_TDE, CGT_MDCR_TDOSA), MCB(CGT_MDCR_TDE_TDRA, CGT_MDCR_TDE, CGT_MDCR_TDRA), MCB(CGT_MDCR_TDCC_TDE_TDA, CGT_MDCR_TDCC, CGT_MDCR_TDE, CGT_MDCR_TDA), + + MCB(CGT_HCR_IMO_FMO_ICH_HCR_TC, CGT_HCR_IMO, CGT_HCR_FMO, CGT_ICH_HCR_TC), + MCB(CGT_ICH_HCR_TC_TDIR, CGT_ICH_HCR_TC, CGT_ICH_HCR_TDIR), }; typedef enum trap_behaviour (*complex_condition_check)(struct kvm_vcpu *); @@ -399,7 +493,7 @@ static enum trap_behaviour check_cnthctl_el1pcten(struct kvm_vcpu *vcpu) if (get_sanitized_cnthctl(vcpu) & (CNTHCTL_EL1PCTEN << 10)) return BEHAVE_HANDLE_LOCALLY; - return BEHAVE_FORWARD_ANY; + return BEHAVE_FORWARD_RW; } static enum trap_behaviour check_cnthctl_el1pten(struct kvm_vcpu *vcpu) @@ -407,7 +501,74 @@ static enum trap_behaviour check_cnthctl_el1pten(struct kvm_vcpu *vcpu) if (get_sanitized_cnthctl(vcpu) & (CNTHCTL_EL1PCEN << 10)) return BEHAVE_HANDLE_LOCALLY; - return BEHAVE_FORWARD_ANY; + return BEHAVE_FORWARD_RW; +} + +static bool is_nested_nv2_guest(struct kvm_vcpu *vcpu) +{ + u64 val; + + val = __vcpu_sys_reg(vcpu, HCR_EL2); + return ((val & (HCR_E2H | HCR_TGE | HCR_NV2 | HCR_NV1 | HCR_NV)) == (HCR_E2H | HCR_NV2 | HCR_NV)); +} + +static enum trap_behaviour check_cnthctl_el1nvpct(struct kvm_vcpu *vcpu) +{ + if (!is_nested_nv2_guest(vcpu) || + !(__vcpu_sys_reg(vcpu, CNTHCTL_EL2) & CNTHCTL_EL1NVPCT)) + return BEHAVE_HANDLE_LOCALLY; + + return BEHAVE_FORWARD_RW; +} + +static enum trap_behaviour check_cnthctl_el1nvvct(struct kvm_vcpu *vcpu) +{ + if (!is_nested_nv2_guest(vcpu) || + !(__vcpu_sys_reg(vcpu, CNTHCTL_EL2) & CNTHCTL_EL1NVVCT)) + return BEHAVE_HANDLE_LOCALLY; + + return BEHAVE_FORWARD_RW; +} + +static enum trap_behaviour check_cptr_tta(struct kvm_vcpu *vcpu) +{ + u64 val = __vcpu_sys_reg(vcpu, CPTR_EL2); + + if (!vcpu_el2_e2h_is_set(vcpu)) + val = translate_cptr_el2_to_cpacr_el1(val); + + if (val & CPACR_EL1_TTA) + return BEHAVE_FORWARD_RW; + + return BEHAVE_HANDLE_LOCALLY; +} + +static enum trap_behaviour check_mdcr_hpmn(struct kvm_vcpu *vcpu) +{ + u32 sysreg = esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu)); + unsigned int idx; + + + switch (sysreg) { + case SYS_PMEVTYPERn_EL0(0) ... SYS_PMEVTYPERn_EL0(30): + case SYS_PMEVCNTRn_EL0(0) ... SYS_PMEVCNTRn_EL0(30): + idx = (sys_reg_CRm(sysreg) & 0x3) << 3 | sys_reg_Op2(sysreg); + break; + case SYS_PMXEVTYPER_EL0: + case SYS_PMXEVCNTR_EL0: + idx = SYS_FIELD_GET(PMSELR_EL0, SEL, + __vcpu_sys_reg(vcpu, PMSELR_EL0)); + break; + default: + /* Someone used this trap helper for something else... */ + KVM_BUG_ON(1, vcpu->kvm); + return BEHAVE_HANDLE_LOCALLY; + } + + if (kvm_pmu_counter_is_hyp(vcpu, idx)) + return BEHAVE_FORWARD_RW | BEHAVE_FORWARD_IN_HOST_EL0; + + return BEHAVE_HANDLE_LOCALLY; } #define CCC(id, fn) \ @@ -416,6 +577,10 @@ static enum trap_behaviour check_cnthctl_el1pten(struct kvm_vcpu *vcpu) static const complex_condition_check ccc[] = { CCC(CGT_CNTHCTL_EL1PCTEN, check_cnthctl_el1pcten), CCC(CGT_CNTHCTL_EL1PTEN, check_cnthctl_el1pten), + CCC(CGT_CNTHCTL_EL1NVPCT, check_cnthctl_el1nvpct), + CCC(CGT_CNTHCTL_EL1NVVCT, check_cnthctl_el1nvvct), + CCC(CGT_CPTR_TTA, check_cptr_tta), + CCC(CGT_MDCR_HPMN, check_mdcr_hpmn), }; /* @@ -487,9 +652,9 @@ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = { SR_TRAP(SYS_CSSELR_EL1, CGT_HCR_TID2_TID4), SR_RANGE_TRAP(SYS_ID_PFR0_EL1, sys_reg(3, 0, 0, 7, 7), CGT_HCR_TID3), - SR_TRAP(SYS_ICC_SGI0R_EL1, CGT_HCR_IMO_FMO), - SR_TRAP(SYS_ICC_ASGI1R_EL1, CGT_HCR_IMO_FMO), - SR_TRAP(SYS_ICC_SGI1R_EL1, CGT_HCR_IMO_FMO), + SR_TRAP(SYS_ICC_SGI0R_EL1, CGT_HCR_IMO_FMO_ICH_HCR_TC), + SR_TRAP(SYS_ICC_ASGI1R_EL1, CGT_HCR_IMO_FMO_ICH_HCR_TC), + SR_TRAP(SYS_ICC_SGI1R_EL1, CGT_HCR_IMO_FMO_ICH_HCR_TC), SR_RANGE_TRAP(sys_reg(3, 0, 11, 0, 0), sys_reg(3, 0, 11, 15, 7), CGT_HCR_TIDCP), SR_RANGE_TRAP(sys_reg(3, 1, 11, 0, 0), @@ -622,6 +787,11 @@ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = { SR_TRAP(SYS_MAIR_EL1, CGT_HCR_TVM_TRVM), SR_TRAP(SYS_AMAIR_EL1, CGT_HCR_TVM_TRVM), SR_TRAP(SYS_CONTEXTIDR_EL1, CGT_HCR_TVM_TRVM), + SR_TRAP(SYS_PIR_EL1, CGT_HCR_TVM_TRVM), + SR_TRAP(SYS_PIRE0_EL1, CGT_HCR_TVM_TRVM), + SR_TRAP(SYS_POR_EL0, CGT_HCR_TVM_TRVM), + SR_TRAP(SYS_POR_EL1, CGT_HCR_TVM_TRVM), + SR_TRAP(SYS_TCR2_EL1, CGT_HCR_TVM_TRVM_HCRX_TCR2En), SR_TRAP(SYS_DC_ZVA, CGT_HCR_TDZ), SR_TRAP(SYS_DC_GVA, CGT_HCR_TDZ), SR_TRAP(SYS_DC_GZVA, CGT_HCR_TDZ), @@ -725,17 +895,22 @@ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = { SYS_CNTHP_CVAL_EL2, CGT_HCR_NV), SR_RANGE_TRAP(SYS_CNTHV_TVAL_EL2, SYS_CNTHV_CVAL_EL2, CGT_HCR_NV), - /* All _EL02, _EL12 registers */ + /* All _EL02, _EL12 registers up to CNTKCTL_EL12*/ SR_RANGE_TRAP(sys_reg(3, 5, 0, 0, 0), sys_reg(3, 5, 10, 15, 7), CGT_HCR_NV), SR_RANGE_TRAP(sys_reg(3, 5, 12, 0, 0), - sys_reg(3, 5, 14, 15, 7), CGT_HCR_NV), + sys_reg(3, 5, 14, 1, 0), CGT_HCR_NV), + SR_TRAP(SYS_CNTP_CTL_EL02, CGT_CNTHCTL_EL1NVPCT), + SR_TRAP(SYS_CNTP_CVAL_EL02, CGT_CNTHCTL_EL1NVPCT), + SR_TRAP(SYS_CNTV_CTL_EL02, CGT_CNTHCTL_EL1NVVCT), + SR_TRAP(SYS_CNTV_CVAL_EL02, CGT_CNTHCTL_EL1NVVCT), SR_TRAP(OP_AT_S1E2R, CGT_HCR_NV), SR_TRAP(OP_AT_S1E2W, CGT_HCR_NV), SR_TRAP(OP_AT_S12E1R, CGT_HCR_NV), SR_TRAP(OP_AT_S12E1W, CGT_HCR_NV), SR_TRAP(OP_AT_S12E0R, CGT_HCR_NV), SR_TRAP(OP_AT_S12E0W, CGT_HCR_NV), + SR_TRAP(OP_AT_S1E2A, CGT_HCR_NV), SR_TRAP(OP_TLBI_IPAS2E1, CGT_HCR_NV), SR_TRAP(OP_TLBI_RIPAS2E1, CGT_HCR_NV), SR_TRAP(OP_TLBI_IPAS2LE1, CGT_HCR_NV), @@ -817,6 +992,7 @@ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = { SR_TRAP(OP_AT_S1E0W, CGT_HCR_AT), SR_TRAP(OP_AT_S1E1RP, CGT_HCR_AT), SR_TRAP(OP_AT_S1E1WP, CGT_HCR_AT), + SR_TRAP(OP_AT_S1E1A, CGT_HCR_AT), SR_TRAP(SYS_ERXPFGF_EL1, CGT_HCR_nFIEN), SR_TRAP(SYS_ERXPFGCTL_EL1, CGT_HCR_nFIEN), SR_TRAP(SYS_ERXPFGCDN_EL1, CGT_HCR_nFIEN), @@ -827,77 +1003,77 @@ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = { SR_TRAP(SYS_PMOVSCLR_EL0, CGT_MDCR_TPM), SR_TRAP(SYS_PMCEID0_EL0, CGT_MDCR_TPM), SR_TRAP(SYS_PMCEID1_EL0, CGT_MDCR_TPM), - SR_TRAP(SYS_PMXEVTYPER_EL0, CGT_MDCR_TPM), + SR_TRAP(SYS_PMXEVTYPER_EL0, CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMSWINC_EL0, CGT_MDCR_TPM), SR_TRAP(SYS_PMSELR_EL0, CGT_MDCR_TPM), - SR_TRAP(SYS_PMXEVCNTR_EL0, CGT_MDCR_TPM), + SR_TRAP(SYS_PMXEVCNTR_EL0, CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMCCNTR_EL0, CGT_MDCR_TPM), SR_TRAP(SYS_PMUSERENR_EL0, CGT_MDCR_TPM), SR_TRAP(SYS_PMINTENSET_EL1, CGT_MDCR_TPM), SR_TRAP(SYS_PMINTENCLR_EL1, CGT_MDCR_TPM), SR_TRAP(SYS_PMMIR_EL1, CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(0), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(1), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(2), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(3), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(4), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(5), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(6), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(7), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(8), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(9), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(10), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(11), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(12), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(13), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(14), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(15), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(16), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(17), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(18), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(19), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(20), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(21), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(22), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(23), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(24), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(25), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(26), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(27), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(28), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(29), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(30), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(0), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(1), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(2), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(3), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(4), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(5), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(6), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(7), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(8), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(9), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(10), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(11), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(12), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(13), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(14), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(15), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(16), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(17), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(18), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(19), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(20), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(21), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(22), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(23), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(24), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(25), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(26), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(27), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(28), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(29), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(30), CGT_MDCR_TPM), + SR_TRAP(SYS_PMEVCNTRn_EL0(0), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(1), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(2), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(3), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(4), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(5), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(6), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(7), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(8), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(9), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(10), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(11), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(12), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(13), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(14), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(15), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(16), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(17), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(18), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(19), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(20), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(21), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(22), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(23), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(24), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(25), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(26), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(27), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(28), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(29), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(30), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(0), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(1), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(2), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(3), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(4), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(5), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(6), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(7), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(8), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(9), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(10), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(11), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(12), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(13), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(14), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(15), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(16), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(17), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(18), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(19), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(20), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(21), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(22), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(23), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(24), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(25), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(26), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(27), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(28), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(29), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(30), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMCCFILTR_EL0, CGT_MDCR_TPM), SR_TRAP(SYS_MDCCSR_EL0, CGT_MDCR_TDCC_TDE_TDA), SR_TRAP(SYS_MDCCINT_EL1, CGT_MDCR_TDCC_TDE_TDA), @@ -1000,11 +1176,97 @@ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = { SR_TRAP(SYS_TRBPTR_EL1, CGT_MDCR_E2TB), SR_TRAP(SYS_TRBSR_EL1, CGT_MDCR_E2TB), SR_TRAP(SYS_TRBTRG_EL1, CGT_MDCR_E2TB), + SR_TRAP(SYS_CPACR_EL1, CGT_CPTR_TCPAC), + SR_TRAP(SYS_AMUSERENR_EL0, CGT_CPTR_TAM), + SR_TRAP(SYS_AMCFGR_EL0, CGT_CPTR_TAM), + SR_TRAP(SYS_AMCGCR_EL0, CGT_CPTR_TAM), + SR_TRAP(SYS_AMCNTENCLR0_EL0, CGT_CPTR_TAM), + SR_TRAP(SYS_AMCNTENCLR1_EL0, CGT_CPTR_TAM), + SR_TRAP(SYS_AMCNTENSET0_EL0, CGT_CPTR_TAM), + SR_TRAP(SYS_AMCNTENSET1_EL0, CGT_CPTR_TAM), + SR_TRAP(SYS_AMCR_EL0, CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR0_EL0(0), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR0_EL0(1), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR0_EL0(2), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR0_EL0(3), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(0), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(1), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(2), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(3), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(4), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(5), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(6), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(7), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(8), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(9), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(10), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(11), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(12), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(13), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(14), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(15), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER0_EL0(0), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER0_EL0(1), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER0_EL0(2), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER0_EL0(3), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(0), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(1), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(2), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(3), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(4), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(5), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(6), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(7), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(8), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(9), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(10), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(11), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(12), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(13), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(14), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(15), CGT_CPTR_TAM), + /* op0=2, op1=1, and CRn<0b1000 */ + SR_RANGE_TRAP(sys_reg(2, 1, 0, 0, 0), + sys_reg(2, 1, 7, 15, 7), CGT_CPTR_TTA), SR_TRAP(SYS_CNTP_TVAL_EL0, CGT_CNTHCTL_EL1PTEN), SR_TRAP(SYS_CNTP_CVAL_EL0, CGT_CNTHCTL_EL1PTEN), SR_TRAP(SYS_CNTP_CTL_EL0, CGT_CNTHCTL_EL1PTEN), SR_TRAP(SYS_CNTPCT_EL0, CGT_CNTHCTL_EL1PCTEN), SR_TRAP(SYS_CNTPCTSS_EL0, CGT_CNTHCTL_EL1PCTEN), + SR_TRAP(SYS_CNTV_TVAL_EL0, CGT_CNTHCTL_EL1TVT), + SR_TRAP(SYS_CNTV_CVAL_EL0, CGT_CNTHCTL_EL1TVT), + SR_TRAP(SYS_CNTV_CTL_EL0, CGT_CNTHCTL_EL1TVT), + SR_TRAP(SYS_CNTVCT_EL0, CGT_CNTHCTL_EL1TVCT), + SR_TRAP(SYS_CNTVCTSS_EL0, CGT_CNTHCTL_EL1TVCT), + SR_TRAP(SYS_FPMR, CGT_HCRX_EnFPM), + /* + * IMPDEF choice: + * We treat ICC_SRE_EL2.{SRE,Enable) and ICV_SRE_EL1.SRE as + * RAO/WI. We therefore never consider ICC_SRE_EL2.Enable for + * ICC_SRE_EL1 access, and always handle it locally. + */ + SR_TRAP(SYS_ICC_AP0R0_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_AP0R1_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_AP0R2_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_AP0R3_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_AP1R0_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_AP1R1_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_AP1R2_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_AP1R3_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_BPR0_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_BPR1_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_CTLR_EL1, CGT_ICH_HCR_TC), + SR_TRAP(SYS_ICC_DIR_EL1, CGT_ICH_HCR_TC_TDIR), + SR_TRAP(SYS_ICC_EOIR0_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_EOIR1_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_HPPIR0_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_HPPIR1_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_IAR0_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_IAR1_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_IGRPEN0_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_IGRPEN1_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_PMR_EL1, CGT_ICH_HCR_TC), + SR_TRAP(SYS_ICC_RPR_EL1, CGT_ICH_HCR_TC), }; static DEFINE_XARRAY(sr_forward_xa); @@ -1071,6 +1333,7 @@ static const struct encoding_to_trap_config encoding_to_fgt[] __initconst = { SR_FGT(SYS_TPIDRRO_EL0, HFGxTR, TPIDRRO_EL0, 1), SR_FGT(SYS_TPIDR_EL1, HFGxTR, TPIDR_EL1, 1), SR_FGT(SYS_TCR_EL1, HFGxTR, TCR_EL1, 1), + SR_FGT(SYS_TCR2_EL1, HFGxTR, TCR_EL1, 1), SR_FGT(SYS_SCXTNUM_EL0, HFGxTR, SCXTNUM_EL0, 1), SR_FGT(SYS_SCXTNUM_EL1, HFGxTR, SCXTNUM_EL1, 1), SR_FGT(SYS_SCTLR_EL1, HFGxTR, SCTLR_EL1, 1), @@ -1846,7 +2109,8 @@ check_mcb: cgids = coarse_control_combo[id - __MULTIPLE_CONTROL_BITS__]; for (int i = 0; cgids[i] != __RESERVED__; i++) { - if (cgids[i] >= __MULTIPLE_CONTROL_BITS__) { + if (cgids[i] >= __MULTIPLE_CONTROL_BITS__ && + cgids[i] < __COMPLEX_CONDITIONS__) { kvm_err("Recursive MCB %d/%d\n", id, cgids[i]); ret = -EINVAL; } @@ -1951,11 +2215,19 @@ static u64 kvm_get_sysreg_res0(struct kvm *kvm, enum vcpu_sysreg sr) return masks->mask[sr - __VNCR_START__].res0; } -static bool check_fgt_bit(struct kvm *kvm, bool is_read, +static bool check_fgt_bit(struct kvm_vcpu *vcpu, bool is_read, u64 val, const union trap_config tc) { + struct kvm *kvm = vcpu->kvm; enum vcpu_sysreg sr; + /* + * KVM doesn't know about any FGTs that apply to the host, and hopefully + * that'll remain the case. + */ + if (is_hyp_ctxt(vcpu)) + return false; + if (tc.pol) return (val & BIT(tc.bit)); @@ -2032,7 +2304,15 @@ bool triage_sysreg_trap(struct kvm_vcpu *vcpu, int *sr_index) * If we're not nesting, immediately return to the caller, with the * sysreg index, should we have it. */ - if (!vcpu_has_nv(vcpu) || is_hyp_ctxt(vcpu)) + if (!vcpu_has_nv(vcpu)) + goto local; + + /* + * There are a few traps that take effect InHost, but are constrained + * to EL0. Don't bother with computing the trap behaviour if the vCPU + * isn't in EL0. + */ + if (is_hyp_ctxt(vcpu) && !vcpu_is_host_el0(vcpu)) goto local; switch ((enum fgt_group_id)tc.fgt) { @@ -2078,12 +2358,14 @@ bool triage_sysreg_trap(struct kvm_vcpu *vcpu, int *sr_index) goto local; } - if (tc.fgt != __NO_FGT_GROUP__ && check_fgt_bit(vcpu->kvm, is_read, - val, tc)) + if (tc.fgt != __NO_FGT_GROUP__ && check_fgt_bit(vcpu, is_read, val, tc)) goto inject; b = compute_trap_behaviour(vcpu, tc); + if (!(b & BEHAVE_FORWARD_IN_HOST_EL0) && vcpu_is_host_el0(vcpu)) + goto local; + if (((b & BEHAVE_FORWARD_READ) && is_read) || ((b & BEHAVE_FORWARD_WRITE) && !is_read)) goto inject; @@ -2117,6 +2399,41 @@ inject: return true; } +static bool __forward_traps(struct kvm_vcpu *vcpu, unsigned int reg, u64 control_bit) +{ + bool control_bit_set; + + if (!vcpu_has_nv(vcpu)) + return false; + + control_bit_set = __vcpu_sys_reg(vcpu, reg) & control_bit; + if (!is_hyp_ctxt(vcpu) && control_bit_set) { + kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu)); + return true; + } + return false; +} + +static bool forward_hcr_traps(struct kvm_vcpu *vcpu, u64 control_bit) +{ + return __forward_traps(vcpu, HCR_EL2, control_bit); +} + +bool forward_smc_trap(struct kvm_vcpu *vcpu) +{ + return forward_hcr_traps(vcpu, HCR_TSC); +} + +static bool forward_mdcr_traps(struct kvm_vcpu *vcpu, u64 control_bit) +{ + return __forward_traps(vcpu, MDCR_EL2, control_bit); +} + +bool forward_debug_exception(struct kvm_vcpu *vcpu) +{ + return forward_mdcr_traps(vcpu, MDCR_EL2_TDE); +} + static u64 kvm_check_illegal_exception_return(struct kvm_vcpu *vcpu, u64 spsr) { u64 mode = spsr & PSR_MODE_MASK; @@ -2152,49 +2469,57 @@ static u64 kvm_check_illegal_exception_return(struct kvm_vcpu *vcpu, u64 spsr) void kvm_emulate_nested_eret(struct kvm_vcpu *vcpu) { - u64 spsr, elr, mode; - bool direct_eret; + u64 spsr, elr, esr; /* - * Going through the whole put/load motions is a waste of time - * if this is a VHE guest hypervisor returning to its own - * userspace, or the hypervisor performing a local exception - * return. No need to save/restore registers, no need to - * switch S2 MMU. Just do the canonical ERET. + * Forward this trap to the virtual EL2 if the virtual + * HCR_EL2.NV bit is set and this is coming from !EL2. */ + if (forward_hcr_traps(vcpu, HCR_NV)) + return; + spsr = vcpu_read_sys_reg(vcpu, SPSR_EL2); spsr = kvm_check_illegal_exception_return(vcpu, spsr); - mode = spsr & (PSR_MODE_MASK | PSR_MODE32_BIT); - - direct_eret = (mode == PSR_MODE_EL0t && - vcpu_el2_e2h_is_set(vcpu) && - vcpu_el2_tge_is_set(vcpu)); - direct_eret |= (mode == PSR_MODE_EL2h || mode == PSR_MODE_EL2t); - - if (direct_eret) { - *vcpu_pc(vcpu) = vcpu_read_sys_reg(vcpu, ELR_EL2); - *vcpu_cpsr(vcpu) = spsr; - trace_kvm_nested_eret(vcpu, *vcpu_pc(vcpu), spsr); - return; + /* Check for an ERETAx */ + esr = kvm_vcpu_get_esr(vcpu); + if (esr_iss_is_eretax(esr) && !kvm_auth_eretax(vcpu, &elr)) { + /* + * Oh no, ERETAx failed to authenticate. + * + * If we have FPACCOMBINE and we don't have a pending + * Illegal Execution State exception (which has priority + * over FPAC), deliver an exception right away. + * + * Otherwise, let the mangled ELR value trickle down the + * ERET handling, and the guest will have a little surprise. + */ + if (kvm_has_pauth(vcpu->kvm, FPACCOMBINE) && !(spsr & PSR_IL_BIT)) { + esr &= ESR_ELx_ERET_ISS_ERETA; + esr |= FIELD_PREP(ESR_ELx_EC_MASK, ESR_ELx_EC_FPAC); + kvm_inject_nested_sync(vcpu, esr); + return; + } } preempt_disable(); + vcpu_set_flag(vcpu, IN_NESTED_ERET); kvm_arch_vcpu_put(vcpu); - elr = __vcpu_sys_reg(vcpu, ELR_EL2); + if (!esr_iss_is_eretax(esr)) + elr = __vcpu_sys_reg(vcpu, ELR_EL2); trace_kvm_nested_eret(vcpu, elr, spsr); - /* - * Note that the current exception level is always the virtual EL2, - * since we set HCR_EL2.NV bit only when entering the virtual EL2. - */ *vcpu_pc(vcpu) = elr; *vcpu_cpsr(vcpu) = spsr; kvm_arch_vcpu_load(vcpu, smp_processor_id()); + vcpu_clear_flag(vcpu, IN_NESTED_ERET); preempt_enable(); + + if (kvm_vcpu_has_pmu(vcpu)) + kvm_pmu_nested_transition(vcpu); } static void kvm_inject_el2_exception(struct kvm_vcpu *vcpu, u64 esr_el2, @@ -2277,6 +2602,9 @@ static int kvm_inject_nested(struct kvm_vcpu *vcpu, u64 esr_el2, kvm_arch_vcpu_load(vcpu, smp_processor_id()); preempt_enable(); + if (kvm_vcpu_has_pmu(vcpu)) + kvm_pmu_nested_transition(vcpu); + return 1; } diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index 826307e19e3a..7f6e43d25691 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -14,19 +14,6 @@ #include <asm/kvm_mmu.h> #include <asm/sysreg.h> -void kvm_vcpu_unshare_task_fp(struct kvm_vcpu *vcpu) -{ - struct task_struct *p = vcpu->arch.parent_task; - struct user_fpsimd_state *fpsimd; - - if (!is_protected_kvm_enabled() || !p) - return; - - fpsimd = &p->thread.uw.fpsimd_state; - kvm_unshare_hyp(fpsimd, fpsimd + 1); - put_task_struct(p); -} - /* * Called on entry to KVM_RUN unless this vcpu previously ran at least * once and the most recent prior KVM_RUN for this vcpu was called from @@ -38,30 +25,18 @@ void kvm_vcpu_unshare_task_fp(struct kvm_vcpu *vcpu) */ int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu) { - int ret; - struct user_fpsimd_state *fpsimd = ¤t->thread.uw.fpsimd_state; + int ret; - kvm_vcpu_unshare_task_fp(vcpu); + /* pKVM has its own tracking of the host fpsimd state. */ + if (is_protected_kvm_enabled()) + return 0; /* Make sure the host task fpsimd state is visible to hyp: */ ret = kvm_share_hyp(fpsimd, fpsimd + 1); if (ret) return ret; - vcpu->arch.host_fpsimd_state = kern_hyp_va(fpsimd); - - /* - * We need to keep current's task_struct pinned until its data has been - * unshared with the hypervisor to make sure it is not re-used by the - * kernel and donated to someone else while already shared -- see - * kvm_vcpu_unshare_task_fp() for the matching put_task_struct(). - */ - if (is_protected_kvm_enabled()) { - get_task_struct(current); - vcpu->arch.parent_task = current; - } - return 0; } @@ -79,41 +54,18 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu) if (!system_supports_fpsimd()) return; - fpsimd_kvm_prepare(); - /* - * We will check TIF_FOREIGN_FPSTATE just before entering the - * guest in kvm_arch_vcpu_ctxflush_fp() and override this to - * FP_STATE_FREE if the flag set. + * Ensure that any host FPSIMD/SVE/SME state is saved and unbound such + * that the host kernel is responsible for restoring this state upon + * return to userspace, and the hyp code doesn't need to save anything. + * + * When the host may use SME, fpsimd_save_and_flush_cpu_state() ensures + * that PSTATE.{SM,ZA} == {0,0}. */ - vcpu->arch.fp_state = FP_STATE_HOST_OWNED; - - vcpu_clear_flag(vcpu, HOST_SVE_ENABLED); - if (read_sysreg(cpacr_el1) & CPACR_EL1_ZEN_EL0EN) - vcpu_set_flag(vcpu, HOST_SVE_ENABLED); - - if (system_supports_sme()) { - vcpu_clear_flag(vcpu, HOST_SME_ENABLED); - if (read_sysreg(cpacr_el1) & CPACR_EL1_SMEN_EL0EN) - vcpu_set_flag(vcpu, HOST_SME_ENABLED); + fpsimd_save_and_flush_cpu_state(); + *host_data_ptr(fp_owner) = FP_STATE_FREE; - /* - * If PSTATE.SM is enabled then save any pending FP - * state and disable PSTATE.SM. If we leave PSTATE.SM - * enabled and the guest does not enable SME via - * CPACR_EL1.SMEN then operations that should be valid - * may generate SME traps from EL1 to EL1 which we - * can't intercept and which would confuse the guest. - * - * Do the same for PSTATE.ZA in the case where there - * is state in the registers which has not already - * been saved, this is very unlikely to happen. - */ - if (read_sysreg_s(SYS_SVCR) & (SVCR_SM_MASK | SVCR_ZA_MASK)) { - vcpu->arch.fp_state = FP_STATE_FREE; - fpsimd_save_and_flush_cpu_state(); - } - } + WARN_ON_ONCE(system_supports_sme() && read_sysreg_s(SYS_SVCR)); } /* @@ -126,7 +78,7 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_ctxflush_fp(struct kvm_vcpu *vcpu) { if (test_thread_flag(TIF_FOREIGN_FPSTATE)) - vcpu->arch.fp_state = FP_STATE_FREE; + *host_data_ptr(fp_owner) = FP_STATE_FREE; } /* @@ -142,8 +94,7 @@ void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu) WARN_ON_ONCE(!irqs_disabled()); - if (vcpu->arch.fp_state == FP_STATE_GUEST_OWNED) { - + if (guest_owns_fp_regs()) { /* * Currently we do not support SME guests so SVCR is * always 0 and we just need a variable to point to. @@ -152,8 +103,8 @@ void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu) fp_state.sve_state = vcpu->arch.sve_state; fp_state.sve_vl = vcpu->arch.sve_max_vl; fp_state.sme_state = NULL; - fp_state.svcr = &vcpu->arch.svcr; - fp_state.fpmr = &vcpu->arch.fpmr; + fp_state.svcr = &__vcpu_sys_reg(vcpu, SVCR); + fp_state.fpmr = &__vcpu_sys_reg(vcpu, FPMR); fp_state.fp_type = &vcpu->arch.fp_type; if (vcpu_has_sve(vcpu)) @@ -179,46 +130,18 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu) local_irq_save(flags); - /* - * If we have VHE then the Hyp code will reset CPACR_EL1 to - * the default value and we need to reenable SME. - */ - if (has_vhe() && system_supports_sme()) { - /* Also restore EL0 state seen on entry */ - if (vcpu_get_flag(vcpu, HOST_SME_ENABLED)) - sysreg_clear_set(CPACR_EL1, 0, - CPACR_EL1_SMEN_EL0EN | - CPACR_EL1_SMEN_EL1EN); - else - sysreg_clear_set(CPACR_EL1, - CPACR_EL1_SMEN_EL0EN, - CPACR_EL1_SMEN_EL1EN); - isb(); - } - - if (vcpu->arch.fp_state == FP_STATE_GUEST_OWNED) { - if (vcpu_has_sve(vcpu)) { - __vcpu_sys_reg(vcpu, ZCR_EL1) = read_sysreg_el1(SYS_ZCR); - - /* Restore the VL that was saved when bound to the CPU */ - if (!has_vhe()) - sve_cond_update_zcr_vq(vcpu_sve_max_vq(vcpu) - 1, - SYS_ZCR_EL1); - } - - fpsimd_save_and_flush_cpu_state(); - } else if (has_vhe() && system_supports_sve()) { + if (guest_owns_fp_regs()) { /* - * The FPSIMD/SVE state in the CPU has not been touched, and we - * have SVE (and VHE): CPACR_EL1 (alias CPTR_EL2) has been - * reset by kvm_reset_cptr_el2() in the Hyp code, disabling SVE - * for EL0. To avoid spurious traps, restore the trap state - * seen by kvm_arch_vcpu_load_fp(): + * Flush (save and invalidate) the fpsimd/sve state so that if + * the host tries to use fpsimd/sve, it's not using stale data + * from the guest. + * + * Flushing the state sets the TIF_FOREIGN_FPSTATE bit for the + * context unconditionally, in both nVHE and VHE. This allows + * the kernel to restore the fpsimd/sve state, including ZCR_EL1 + * when needed. */ - if (vcpu_get_flag(vcpu, HOST_SVE_ENABLED)) - sysreg_clear_set(CPACR_EL1, 0, CPACR_EL1_ZEN_EL0EN); - else - sysreg_clear_set(CPACR_EL1, CPACR_EL1_ZEN_EL0EN, 0); + fpsimd_save_and_flush_cpu_state(); } local_irq_restore(flags); diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index e2f762d959bb..2196979a24a3 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -251,6 +251,7 @@ static int set_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) case PSR_AA32_MODE_SVC: case PSR_AA32_MODE_ABT: case PSR_AA32_MODE_UND: + case PSR_AA32_MODE_SYS: if (!vcpu_el1_is_32bit(vcpu)) return -EINVAL; break; @@ -276,7 +277,7 @@ static int set_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) if (*vcpu_cpsr(vcpu) & PSR_MODE32_BIT) { int i, nr_reg; - switch (*vcpu_cpsr(vcpu)) { + switch (*vcpu_cpsr(vcpu) & PSR_AA32_MODE_MASK) { /* * Either we are dealing with user mode, and only the * first 15 registers (+ PC) must be narrowed to 32bit. @@ -916,31 +917,24 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { - int ret = 0; - trace_kvm_set_guest_debug(vcpu, dbg->control); - if (dbg->control & ~KVM_GUESTDBG_VALID_MASK) { - ret = -EINVAL; - goto out; - } - - if (dbg->control & KVM_GUESTDBG_ENABLE) { - vcpu->guest_debug = dbg->control; - - /* Hardware assisted Break and Watch points */ - if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) { - vcpu->arch.external_debug_state = dbg->arch; - } + if (dbg->control & ~KVM_GUESTDBG_VALID_MASK) + return -EINVAL; - } else { - /* If not enabled clear all flags */ + if (!(dbg->control & KVM_GUESTDBG_ENABLE)) { vcpu->guest_debug = 0; - vcpu_clear_flag(vcpu, DBG_SS_ACTIVE_PENDING); + vcpu_clear_flag(vcpu, HOST_SS_ACTIVE_PENDING); + return 0; } -out: - return ret; + vcpu->guest_debug = dbg->control; + + /* Hardware assisted Break and Watch points */ + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) + vcpu->arch.external_debug_state = dbg->arch; + + return 0; } int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu, @@ -1044,50 +1038,64 @@ int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, mutex_lock(&kvm->slots_lock); + if (write && atomic_read(&kvm->nr_memslots_dirty_logging)) { + ret = -EBUSY; + goto out; + } + while (length > 0) { - kvm_pfn_t pfn = gfn_to_pfn_prot(kvm, gfn, write, NULL); + struct page *page = __gfn_to_page(kvm, gfn, write); void *maddr; unsigned long num_tags; - struct page *page; + struct folio *folio; - if (is_error_noslot_pfn(pfn)) { + if (!page) { ret = -EFAULT; goto out; } - page = pfn_to_online_page(pfn); - if (!page) { + if (!pfn_to_online_page(page_to_pfn(page))) { /* Reject ZONE_DEVICE memory */ + kvm_release_page_unused(page); ret = -EFAULT; goto out; } + folio = page_folio(page); maddr = page_address(page); if (!write) { - if (page_mte_tagged(page)) + if ((folio_test_hugetlb(folio) && + folio_test_hugetlb_mte_tagged(folio)) || + page_mte_tagged(page)) num_tags = mte_copy_tags_to_user(tags, maddr, MTE_GRANULES_PER_PAGE); else /* No tags in memory, so write zeros */ num_tags = MTE_GRANULES_PER_PAGE - clear_user(tags, MTE_GRANULES_PER_PAGE); - kvm_release_pfn_clean(pfn); + kvm_release_page_clean(page); } else { /* * Only locking to serialise with a concurrent * __set_ptes() in the VMM but still overriding the * tags, hence ignoring the return value. */ - try_page_mte_tagging(page); + if (folio_test_hugetlb(folio)) + folio_try_hugetlb_mte_tagging(folio); + else + try_page_mte_tagging(page); num_tags = mte_copy_tags_from_user(maddr, tags, MTE_GRANULES_PER_PAGE); /* uaccess failed, don't leave stale tags */ if (num_tags != MTE_GRANULES_PER_PAGE) mte_clear_page_tags(maddr); - set_page_mte_tagged(page); + if (folio_test_hugetlb(folio)) + folio_set_hugetlb_mte_tagged(folio); + else + set_page_mte_tagged(page); - kvm_release_pfn_dirty(pfn); + kvm_release_page_dirty(page); } if (num_tags != MTE_GRANULES_PER_PAGE) { diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index 617ae6dea5d5..b73dc26bc44b 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -56,6 +56,13 @@ static int handle_hvc(struct kvm_vcpu *vcpu) static int handle_smc(struct kvm_vcpu *vcpu) { /* + * Forward this trapped smc instruction to the virtual EL2 if + * the guest has asked for it. + */ + if (forward_smc_trap(vcpu)) + return 1; + + /* * "If an SMC instruction executed at Non-secure EL1 is * trapped to EL2 because HCR_EL2.TSC is 1, the exception is a * Trap exception, not a Secure Monitor Call exception [...]" @@ -87,11 +94,19 @@ static int handle_smc(struct kvm_vcpu *vcpu) } /* - * Guest access to FP/ASIMD registers are routed to this handler only - * when the system doesn't support FP/ASIMD. + * This handles the cases where the system does not support FP/ASIMD or when + * we are running nested virtualization and the guest hypervisor is trapping + * FP/ASIMD accesses by its guest guest. + * + * All other handling of guest vs. host FP/ASIMD register state is handled in + * fixup_guest_exit(). */ -static int handle_no_fpsimd(struct kvm_vcpu *vcpu) +static int kvm_handle_fpasimd(struct kvm_vcpu *vcpu) { + if (guest_hyp_fpsimd_traps_enabled(vcpu)) + return kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu)); + + /* This is the case when the system doesn't support FP/ASIMD. */ kvm_inject_undefined(vcpu); return 1; } @@ -114,8 +129,12 @@ static int handle_no_fpsimd(struct kvm_vcpu *vcpu) static int kvm_handle_wfx(struct kvm_vcpu *vcpu) { u64 esr = kvm_vcpu_get_esr(vcpu); + bool is_wfe = !!(esr & ESR_ELx_WFx_ISS_WFE); - if (esr & ESR_ELx_WFx_ISS_WFE) { + if (guest_hyp_wfx_traps_enabled(vcpu)) + return kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu)); + + if (is_wfe) { trace_kvm_wfx_arm64(*vcpu_pc(vcpu), true); vcpu->stat.wfe_exit_stat++; } else { @@ -168,6 +187,9 @@ static int kvm_handle_guest_debug(struct kvm_vcpu *vcpu) struct kvm_run *run = vcpu->run; u64 esr = kvm_vcpu_get_esr(vcpu); + if (!vcpu->guest_debug && forward_debug_exception(vcpu)) + return 1; + run->exit_reason = KVM_EXIT_DEBUG; run->debug.arch.hsr = lower_32_bits(esr); run->debug.arch.hsr_high = upper_32_bits(esr); @@ -178,7 +200,7 @@ static int kvm_handle_guest_debug(struct kvm_vcpu *vcpu) run->debug.arch.far = vcpu->arch.fault.far_el2; break; case ESR_ELx_EC_SOFTSTP_LOW: - vcpu_clear_flag(vcpu, DBG_SS_ACTIVE_PENDING); + *vcpu_cpsr(vcpu) |= DBG_SPSR_SS; break; } @@ -202,24 +224,48 @@ static int kvm_handle_unknown_ec(struct kvm_vcpu *vcpu) */ static int handle_sve(struct kvm_vcpu *vcpu) { + if (guest_hyp_sve_traps_enabled(vcpu)) + return kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu)); + kvm_inject_undefined(vcpu); return 1; } /* - * Guest usage of a ptrauth instruction (which the guest EL1 did not turn into - * a NOP). If we get here, it is that we didn't fixup ptrauth on exit, and all - * that we can do is give the guest an UNDEF. + * Two possibilities to handle a trapping ptrauth instruction: + * + * - Guest usage of a ptrauth instruction (which the guest EL1 did not + * turn into a NOP). If we get here, it is because we didn't enable + * ptrauth for the guest. This results in an UNDEF, as it isn't + * supposed to use ptrauth without being told it could. + * + * - Running an L2 NV guest while L1 has left HCR_EL2.API==0, and for + * which we reinject the exception into L1. + * + * Anything else is an emulation bug (hence the WARN_ON + UNDEF). */ static int kvm_handle_ptrauth(struct kvm_vcpu *vcpu) { + if (!vcpu_has_ptrauth(vcpu)) { + kvm_inject_undefined(vcpu); + return 1; + } + + if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) { + kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu)); + return 1; + } + + /* Really shouldn't be here! */ + WARN_ON_ONCE(1); kvm_inject_undefined(vcpu); return 1; } static int kvm_handle_eret(struct kvm_vcpu *vcpu) { - if (kvm_vcpu_get_esr(vcpu) & ESR_ELx_ERET_ISS_ERET) + if (esr_iss_is_eretax(kvm_vcpu_get_esr(vcpu)) && + !vcpu_has_ptrauth(vcpu)) return kvm_handle_ptrauth(vcpu); /* @@ -276,7 +322,7 @@ static exit_handle_fn arm_exit_handlers[] = { [ESR_ELx_EC_BREAKPT_LOW]= kvm_handle_guest_debug, [ESR_ELx_EC_BKPT32] = kvm_handle_guest_debug, [ESR_ELx_EC_BRK64] = kvm_handle_guest_debug, - [ESR_ELx_EC_FP_ASIMD] = handle_no_fpsimd, + [ESR_ELx_EC_FP_ASIMD] = kvm_handle_fpasimd, [ESR_ELx_EC_PAC] = kvm_handle_ptrauth, }; @@ -383,6 +429,20 @@ void handle_exit_early(struct kvm_vcpu *vcpu, int exception_index) kvm_handle_guest_serror(vcpu, kvm_vcpu_get_esr(vcpu)); } +static void print_nvhe_hyp_panic(const char *name, u64 panic_addr) +{ + kvm_err("nVHE hyp %s at: [<%016llx>] %pB!\n", name, panic_addr, + (void *)(panic_addr + kaslr_offset())); +} + +static void kvm_nvhe_report_cfi_failure(u64 panic_addr) +{ + print_nvhe_hyp_panic("CFI failure", panic_addr); + + if (IS_ENABLED(CONFIG_CFI_PERMISSIVE)) + kvm_err(" (CONFIG_CFI_PERMISSIVE ignored for hyp failures)\n"); +} + void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, u64 elr_virt, u64 elr_phys, u64 par, uintptr_t vcpu, @@ -395,7 +455,7 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, if (mode != PSR_MODE_EL2t && mode != PSR_MODE_EL2h) { kvm_err("Invalid host exception to nVHE hyp!\n"); } else if (ESR_ELx_EC(esr) == ESR_ELx_EC_BRK64 && - (esr & ESR_ELx_BRK64_ISS_COMMENT_MASK) == BUG_BRK_IMM) { + esr_brk_comment(esr) == BUG_BRK_IMM) { const char *file = NULL; unsigned int line = 0; @@ -411,11 +471,11 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, if (file) kvm_err("nVHE hyp BUG at: %s:%u!\n", file, line); else - kvm_err("nVHE hyp BUG at: [<%016llx>] %pB!\n", panic_addr, - (void *)(panic_addr + kaslr_offset())); + print_nvhe_hyp_panic("BUG", panic_addr); + } else if (IS_ENABLED(CONFIG_CFI_CLANG) && esr_is_cfi_brk(esr)) { + kvm_nvhe_report_cfi_failure(panic_addr); } else { - kvm_err("nVHE hyp panic at: [<%016llx>] %pB!\n", panic_addr, - (void *)(panic_addr + kaslr_offset())); + print_nvhe_hyp_panic("panic", panic_addr); } /* Dump the nVHE hypervisor backtrace */ diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile index a38dea6186c9..d61e44642f98 100644 --- a/arch/arm64/kvm/hyp/Makefile +++ b/arch/arm64/kvm/hyp/Makefile @@ -3,7 +3,7 @@ # Makefile for Kernel-based Virtual Machine module, HYP part # -incdir := $(srctree)/$(src)/include +incdir := $(src)/include subdir-asflags-y := -I$(incdir) subdir-ccflags-y := -I$(incdir) diff --git a/arch/arm64/kvm/hyp/aarch32.c b/arch/arm64/kvm/hyp/aarch32.c index 8d9670e6615d..449fa58cf3b6 100644 --- a/arch/arm64/kvm/hyp/aarch32.c +++ b/arch/arm64/kvm/hyp/aarch32.c @@ -50,9 +50,23 @@ bool kvm_condition_valid32(const struct kvm_vcpu *vcpu) u32 cpsr_cond; int cond; - /* Top two bits non-zero? Unconditional. */ - if (kvm_vcpu_get_esr(vcpu) >> 30) + /* + * These are the exception classes that could fire with a + * conditional instruction. + */ + switch (kvm_vcpu_trap_get_class(vcpu)) { + case ESR_ELx_EC_CP15_32: + case ESR_ELx_EC_CP15_64: + case ESR_ELx_EC_CP14_MR: + case ESR_ELx_EC_CP14_LS: + case ESR_ELx_EC_FP_ASIMD: + case ESR_ELx_EC_CP10_ID: + case ESR_ELx_EC_CP14_64: + case ESR_ELx_EC_SVC32: + break; + default: return true; + } /* Is condition field valid? */ cond = kvm_vcpu_get_condition(vcpu); diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S index f3aa7738b477..9f4e8d68ab50 100644 --- a/arch/arm64/kvm/hyp/entry.S +++ b/arch/arm64/kvm/hyp/entry.S @@ -44,6 +44,11 @@ alternative_if ARM64_HAS_RAS_EXTN alternative_else_nop_endif mrs x1, isr_el1 cbz x1, 1f + + // Ensure that __guest_enter() always provides a context + // synchronization event so that callers don't need ISBs for anything + // that would usually be synchonized by the ERET. + isb mov x0, #ARM_EXCEPTION_IRQ ret @@ -83,6 +88,14 @@ alternative_else_nop_endif eret sb +SYM_INNER_LABEL(__guest_exit_restore_elr_and_panic, SYM_L_GLOBAL) + // x2-x29,lr: vcpu regs + // vcpu x0-x1 on the stack + + adr_this_cpu x0, kvm_hyp_ctxt, x1 + ldr x0, [x0, #CPU_ELR_EL2] + msr elr_el2, x0 + SYM_INNER_LABEL(__guest_exit_panic, SYM_L_GLOBAL) // x2-x29,lr: vcpu regs // vcpu x0-x1 on the stack diff --git a/arch/arm64/kvm/hyp/fpsimd.S b/arch/arm64/kvm/hyp/fpsimd.S index 61e6f3ba7b7d..e950875e31ce 100644 --- a/arch/arm64/kvm/hyp/fpsimd.S +++ b/arch/arm64/kvm/hyp/fpsimd.S @@ -25,3 +25,9 @@ SYM_FUNC_START(__sve_restore_state) sve_load 0, x1, x2, 3 ret SYM_FUNC_END(__sve_restore_state) + +SYM_FUNC_START(__sve_save_state) + mov x2, #1 + sve_save 0, x1, x2, 3 + ret +SYM_FUNC_END(__sve_save_state) diff --git a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h index 961bbef104a6..502a5b73ee70 100644 --- a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h +++ b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h @@ -88,15 +88,26 @@ default: write_debug(ptr[0], reg, 0); \ } +static struct kvm_guest_debug_arch *__vcpu_debug_regs(struct kvm_vcpu *vcpu) +{ + switch (vcpu->arch.debug_owner) { + case VCPU_DEBUG_FREE: + WARN_ON_ONCE(1); + fallthrough; + case VCPU_DEBUG_GUEST_OWNED: + return &vcpu->arch.vcpu_debug_state; + case VCPU_DEBUG_HOST_OWNED: + return &vcpu->arch.external_debug_state; + } + + return NULL; +} + static void __debug_save_state(struct kvm_guest_debug_arch *dbg, struct kvm_cpu_context *ctxt) { - u64 aa64dfr0; - int brps, wrps; - - aa64dfr0 = read_sysreg(id_aa64dfr0_el1); - brps = (aa64dfr0 >> 12) & 0xf; - wrps = (aa64dfr0 >> 20) & 0xf; + int brps = *host_data_ptr(debug_brps); + int wrps = *host_data_ptr(debug_wrps); save_debug(dbg->dbg_bcr, dbgbcr, brps); save_debug(dbg->dbg_bvr, dbgbvr, brps); @@ -109,13 +120,8 @@ static void __debug_save_state(struct kvm_guest_debug_arch *dbg, static void __debug_restore_state(struct kvm_guest_debug_arch *dbg, struct kvm_cpu_context *ctxt) { - u64 aa64dfr0; - int brps, wrps; - - aa64dfr0 = read_sysreg(id_aa64dfr0_el1); - - brps = (aa64dfr0 >> 12) & 0xf; - wrps = (aa64dfr0 >> 20) & 0xf; + int brps = *host_data_ptr(debug_brps); + int wrps = *host_data_ptr(debug_wrps); restore_debug(dbg->dbg_bcr, dbgbcr, brps); restore_debug(dbg->dbg_bvr, dbgbvr, brps); @@ -132,13 +138,13 @@ static inline void __debug_switch_to_guest_common(struct kvm_vcpu *vcpu) struct kvm_guest_debug_arch *host_dbg; struct kvm_guest_debug_arch *guest_dbg; - if (!vcpu_get_flag(vcpu, DEBUG_DIRTY)) + if (!kvm_debug_regs_in_use(vcpu)) return; - host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; + host_ctxt = host_data_ptr(host_ctxt); guest_ctxt = &vcpu->arch.ctxt; - host_dbg = &vcpu->arch.host_debug_state.regs; - guest_dbg = kern_hyp_va(vcpu->arch.debug_ptr); + host_dbg = host_data_ptr(host_debug_state.regs); + guest_dbg = __vcpu_debug_regs(vcpu); __debug_save_state(host_dbg, host_ctxt); __debug_restore_state(guest_dbg, guest_ctxt); @@ -151,18 +157,16 @@ static inline void __debug_switch_to_host_common(struct kvm_vcpu *vcpu) struct kvm_guest_debug_arch *host_dbg; struct kvm_guest_debug_arch *guest_dbg; - if (!vcpu_get_flag(vcpu, DEBUG_DIRTY)) + if (!kvm_debug_regs_in_use(vcpu)) return; - host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; + host_ctxt = host_data_ptr(host_ctxt); guest_ctxt = &vcpu->arch.ctxt; - host_dbg = &vcpu->arch.host_debug_state.regs; - guest_dbg = kern_hyp_va(vcpu->arch.debug_ptr); + host_dbg = host_data_ptr(host_debug_state.regs); + guest_dbg = __vcpu_debug_regs(vcpu); __debug_save_state(guest_dbg, guest_ctxt); __debug_restore_state(host_dbg, host_ctxt); - - vcpu_clear_flag(vcpu, DEBUG_DIRTY); } #endif /* __ARM64_KVM_HYP_DEBUG_SR_H__ */ diff --git a/arch/arm64/kvm/hyp/include/hyp/fault.h b/arch/arm64/kvm/hyp/include/hyp/fault.h index 9e13c1bc2ad5..17df94570f03 100644 --- a/arch/arm64/kvm/hyp/include/hyp/fault.h +++ b/arch/arm64/kvm/hyp/include/hyp/fault.h @@ -14,6 +14,7 @@ static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar) { + int ret; u64 par, tmp; /* @@ -27,7 +28,9 @@ static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar) * saved the guest context yet, and we may return early... */ par = read_sysreg_par(); - if (!__kvm_at("s1e1r", far)) + ret = system_supports_poe() ? __kvm_at(OP_AT_S1E1A, far) : + __kvm_at(OP_AT_S1E1R, far); + if (!ret) tmp = read_sysreg_par(); else tmp = SYS_PAR_EL1_F; /* back to the guest */ diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index e3fcf8c4d5b4..b741ea6aefa5 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -39,12 +39,6 @@ struct kvm_exception_table_entry { extern struct kvm_exception_table_entry __start___kvm_ex_table; extern struct kvm_exception_table_entry __stop___kvm_ex_table; -/* Check whether the FP regs are owned by the guest */ -static inline bool guest_owns_fp_regs(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.fp_state == FP_STATE_GUEST_OWNED; -} - /* Save the 32-bit only FPSIMD system register state */ static inline void __fpsimd_save_fpexc32(struct kvm_vcpu *vcpu) { @@ -155,7 +149,7 @@ static inline bool cpu_has_amu(void) static inline void __activate_traps_hfgxtr(struct kvm_vcpu *vcpu) { - struct kvm_cpu_context *hctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; + struct kvm_cpu_context *hctxt = host_data_ptr(host_ctxt); struct kvm *kvm = kern_hyp_va(vcpu->kvm); CHECK_FGT_MASKS(HFGRTR_EL2); @@ -191,7 +185,7 @@ static inline void __activate_traps_hfgxtr(struct kvm_vcpu *vcpu) static inline void __deactivate_traps_hfgxtr(struct kvm_vcpu *vcpu) { - struct kvm_cpu_context *hctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; + struct kvm_cpu_context *hctxt = host_data_ptr(host_ctxt); struct kvm *kvm = kern_hyp_va(vcpu->kvm); if (!cpus_have_final_cap(ARM64_HAS_FGT)) @@ -210,6 +204,35 @@ static inline void __deactivate_traps_hfgxtr(struct kvm_vcpu *vcpu) __deactivate_fgt(hctxt, vcpu, kvm, HAFGRTR_EL2); } +static inline void __activate_traps_mpam(struct kvm_vcpu *vcpu) +{ + u64 r = MPAM2_EL2_TRAPMPAM0EL1 | MPAM2_EL2_TRAPMPAM1EL1; + + if (!system_supports_mpam()) + return; + + /* trap guest access to MPAMIDR_EL1 */ + if (system_supports_mpam_hcr()) { + write_sysreg_s(MPAMHCR_EL2_TRAP_MPAMIDR_EL1, SYS_MPAMHCR_EL2); + } else { + /* From v1.1 TIDR can trap MPAMIDR, set it unconditionally */ + r |= MPAM2_EL2_TIDR; + } + + write_sysreg_s(r, SYS_MPAM2_EL2); +} + +static inline void __deactivate_traps_mpam(void) +{ + if (!system_supports_mpam()) + return; + + write_sysreg_s(0, SYS_MPAM2_EL2); + + if (system_supports_mpam_hcr()) + write_sysreg_s(MPAMHCR_HOST_FLAGS, SYS_MPAMHCR_EL2); +} + static inline void __activate_traps_common(struct kvm_vcpu *vcpu) { /* Trap on AArch32 cp15 c15 (impdef sysregs) accesses (EL1 or EL0) */ @@ -221,18 +244,18 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu) * counter, which could make a PMXEVCNTR_EL0 access UNDEF at * EL1 instead of being trapped to EL2. */ - if (kvm_arm_support_pmu_v3()) { + if (system_supports_pmuv3()) { struct kvm_cpu_context *hctxt; write_sysreg(0, pmselr_el0); - hctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; + hctxt = host_data_ptr(host_ctxt); ctxt_sys_reg(hctxt, PMUSERENR_EL0) = read_sysreg(pmuserenr_el0); write_sysreg(ARMV8_PMU_USERENR_MASK, pmuserenr_el0); vcpu_set_flag(vcpu, PMUSERENR_ON_CPU); } - vcpu->arch.mdcr_el2_host = read_sysreg(mdcr_el2); + *host_data_ptr(host_debug_state.mdcr_el2) = read_sysreg(mdcr_el2); write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2); if (cpus_have_final_cap(ARM64_HAS_HCX)) { @@ -250,17 +273,18 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu) } __activate_traps_hfgxtr(vcpu); + __activate_traps_mpam(vcpu); } static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu) { - write_sysreg(vcpu->arch.mdcr_el2_host, mdcr_el2); + write_sysreg(*host_data_ptr(host_debug_state.mdcr_el2), mdcr_el2); write_sysreg(0, hstr_el2); - if (kvm_arm_support_pmu_v3()) { + if (system_supports_pmuv3()) { struct kvm_cpu_context *hctxt; - hctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; + hctxt = host_data_ptr(host_ctxt); write_sysreg(ctxt_sys_reg(hctxt, PMUSERENR_EL0), pmuserenr_el0); vcpu_clear_flag(vcpu, PMUSERENR_ON_CPU); } @@ -269,12 +293,11 @@ static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu) write_sysreg_s(HCRX_HOST_FLAGS, SYS_HCRX_EL2); __deactivate_traps_hfgxtr(vcpu); + __deactivate_traps_mpam(); } -static inline void ___activate_traps(struct kvm_vcpu *vcpu) +static inline void ___activate_traps(struct kvm_vcpu *vcpu, u64 hcr) { - u64 hcr = vcpu->arch.hcr_el2; - if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_TX2_219_TVM)) hcr |= HCR_TVM; @@ -303,7 +326,7 @@ static inline bool __populate_fault_info(struct kvm_vcpu *vcpu) return __get_fault_info(vcpu->arch.fault.esr_el2, &vcpu->arch.fault); } -static bool kvm_hyp_handle_mops(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline bool kvm_hyp_handle_mops(struct kvm_vcpu *vcpu, u64 *exit_code) { *vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR); arm64_mops_reset_regs(vcpu_gp_regs(vcpu), vcpu->arch.fault.esr_el2); @@ -321,23 +344,129 @@ static bool kvm_hyp_handle_mops(struct kvm_vcpu *vcpu, u64 *exit_code) static inline void __hyp_sve_restore_guest(struct kvm_vcpu *vcpu) { + /* + * The vCPU's saved SVE state layout always matches the max VL of the + * vCPU. Start off with the max VL so we can load the SVE state. + */ sve_cond_update_zcr_vq(vcpu_sve_max_vq(vcpu) - 1, SYS_ZCR_EL2); __sve_restore_state(vcpu_sve_pffr(vcpu), - &vcpu->arch.ctxt.fp_regs.fpsr); - write_sysreg_el1(__vcpu_sys_reg(vcpu, ZCR_EL1), SYS_ZCR); + &vcpu->arch.ctxt.fp_regs.fpsr, + true); + + /* + * The effective VL for a VM could differ from the max VL when running a + * nested guest, as the guest hypervisor could select a smaller VL. Slap + * that into hardware before wrapping up. + */ + if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) + sve_cond_update_zcr_vq(__vcpu_sys_reg(vcpu, ZCR_EL2), SYS_ZCR_EL2); + + write_sysreg_el1(__vcpu_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu)), SYS_ZCR); +} + +static inline void __hyp_sve_save_host(void) +{ + struct cpu_sve_state *sve_state = *host_data_ptr(sve_state); + + sve_state->zcr_el1 = read_sysreg_el1(SYS_ZCR); + write_sysreg_s(sve_vq_from_vl(kvm_host_sve_max_vl) - 1, SYS_ZCR_EL2); + __sve_save_state(sve_state->sve_regs + sve_ffr_offset(kvm_host_sve_max_vl), + &sve_state->fpsr, + true); +} + +static inline void fpsimd_lazy_switch_to_guest(struct kvm_vcpu *vcpu) +{ + u64 zcr_el1, zcr_el2; + + if (!guest_owns_fp_regs()) + return; + + if (vcpu_has_sve(vcpu)) { + /* A guest hypervisor may restrict the effective max VL. */ + if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) + zcr_el2 = __vcpu_sys_reg(vcpu, ZCR_EL2); + else + zcr_el2 = vcpu_sve_max_vq(vcpu) - 1; + + write_sysreg_el2(zcr_el2, SYS_ZCR); + + zcr_el1 = __vcpu_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu)); + write_sysreg_el1(zcr_el1, SYS_ZCR); + } +} + +static inline void fpsimd_lazy_switch_to_host(struct kvm_vcpu *vcpu) +{ + u64 zcr_el1, zcr_el2; + + if (!guest_owns_fp_regs()) + return; + + /* + * When the guest owns the FP regs, we know that guest+hyp traps for + * any FPSIMD/SVE/SME features exposed to the guest have been disabled + * by either fpsimd_lazy_switch_to_guest() or kvm_hyp_handle_fpsimd() + * prior to __guest_entry(). As __guest_entry() guarantees a context + * synchronization event, we don't need an ISB here to avoid taking + * traps for anything that was exposed to the guest. + */ + if (vcpu_has_sve(vcpu)) { + zcr_el1 = read_sysreg_el1(SYS_ZCR); + __vcpu_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu)) = zcr_el1; + + /* + * The guest's state is always saved using the guest's max VL. + * Ensure that the host has the guest's max VL active such that + * the host can save the guest's state lazily, but don't + * artificially restrict the host to the guest's max VL. + */ + if (has_vhe()) { + zcr_el2 = vcpu_sve_max_vq(vcpu) - 1; + write_sysreg_el2(zcr_el2, SYS_ZCR); + } else { + zcr_el2 = sve_vq_from_vl(kvm_host_sve_max_vl) - 1; + write_sysreg_el2(zcr_el2, SYS_ZCR); + + zcr_el1 = vcpu_sve_max_vq(vcpu) - 1; + write_sysreg_el1(zcr_el1, SYS_ZCR); + } + } +} + +static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu) +{ + /* + * Non-protected kvm relies on the host restoring its sve state. + * Protected kvm restores the host's sve state as not to reveal that + * fpsimd was used by a guest nor leak upper sve bits. + */ + if (system_supports_sve()) { + __hyp_sve_save_host(); + + /* Re-enable SVE traps if not supported for the guest vcpu. */ + if (!vcpu_has_sve(vcpu)) + cpacr_clear_set(CPACR_EL1_ZEN, 0); + + } else { + __fpsimd_save_state(host_data_ptr(host_ctxt.fp_regs)); + } + + if (kvm_has_fpmr(kern_hyp_va(vcpu->kvm))) + *host_data_ptr(fpmr) = read_sysreg_s(SYS_FPMR); } + /* * We trap the first access to the FP/SIMD to save the host context and * restore the guest context lazily. * If FP/SIMD is not implemented, handle the trap and inject an undefined * instruction exception to the guest. Similarly for trapped SVE accesses. */ -static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) { bool sve_guest; u8 esr_ec; - u64 reg; if (!system_supports_fpsimd()) return false; @@ -348,10 +477,19 @@ static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) /* Only handle traps the vCPU can support here: */ switch (esr_ec) { case ESR_ELx_EC_FP_ASIMD: + /* Forward traps to the guest hypervisor as required */ + if (guest_hyp_fpsimd_traps_enabled(vcpu)) + return false; break; + case ESR_ELx_EC_SYS64: + if (WARN_ON_ONCE(!is_hyp_ctxt(vcpu))) + return false; + fallthrough; case ESR_ELx_EC_SVE: if (!sve_guest) return false; + if (guest_hyp_sve_traps_enabled(vcpu)) + return false; break; default: return false; @@ -360,24 +498,15 @@ static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) /* Valid trap. Switch the context: */ /* First disable enough traps to allow us to update the registers */ - if (has_vhe() || has_hvhe()) { - reg = CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN; - if (sve_guest) - reg |= CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN; - - sysreg_clear_set(cpacr_el1, 0, reg); - } else { - reg = CPTR_EL2_TFP; - if (sve_guest) - reg |= CPTR_EL2_TZ; - - sysreg_clear_set(cptr_el2, reg, 0); - } + if (sve_guest || (is_protected_kvm_enabled() && system_supports_sve())) + cpacr_clear_set(0, CPACR_EL1_FPEN | CPACR_EL1_ZEN); + else + cpacr_clear_set(0, CPACR_EL1_FPEN); isb(); /* Write out the host state if it's in the registers */ - if (vcpu->arch.fp_state == FP_STATE_HOST_OWNED) - __fpsimd_save_state(vcpu->arch.host_fpsimd_state); + if (is_protected_kvm_enabled() && host_owns_fp_regs()) + kvm_hyp_save_fpsimd_host(vcpu); /* Restore the guest state */ if (sve_guest) @@ -385,11 +514,14 @@ static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) else __fpsimd_restore_state(&vcpu->arch.ctxt.fp_regs); + if (kvm_has_fpmr(kern_hyp_va(vcpu->kvm))) + write_sysreg_s(__vcpu_sys_reg(vcpu, FPMR), SYS_FPMR); + /* Skip restoring fpexc32 for AArch64 guests */ if (!(read_sysreg(hcr_el2) & HCR_RW)) write_sysreg(__vcpu_sys_reg(vcpu, FPEXC32_EL2), fpexc32_el2); - vcpu->arch.fp_state = FP_STATE_GUEST_OWNED; + *host_data_ptr(fp_owner) = FP_STATE_GUEST_OWNED; return true; } @@ -449,61 +581,25 @@ static inline bool handle_tx2_tvm(struct kvm_vcpu *vcpu) return true; } -static inline bool esr_is_ptrauth_trap(u64 esr) +/* Open-coded version of timer_get_offset() to allow for kern_hyp_va() */ +static inline u64 hyp_timer_get_offset(struct arch_timer_context *ctxt) { - switch (esr_sys64_to_sysreg(esr)) { - case SYS_APIAKEYLO_EL1: - case SYS_APIAKEYHI_EL1: - case SYS_APIBKEYLO_EL1: - case SYS_APIBKEYHI_EL1: - case SYS_APDAKEYLO_EL1: - case SYS_APDAKEYHI_EL1: - case SYS_APDBKEYLO_EL1: - case SYS_APDBKEYHI_EL1: - case SYS_APGAKEYLO_EL1: - case SYS_APGAKEYHI_EL1: - return true; - } - - return false; -} + u64 offset = 0; -#define __ptrauth_save_key(ctxt, key) \ - do { \ - u64 __val; \ - __val = read_sysreg_s(SYS_ ## key ## KEYLO_EL1); \ - ctxt_sys_reg(ctxt, key ## KEYLO_EL1) = __val; \ - __val = read_sysreg_s(SYS_ ## key ## KEYHI_EL1); \ - ctxt_sys_reg(ctxt, key ## KEYHI_EL1) = __val; \ -} while(0) + if (ctxt->offset.vm_offset) + offset += *kern_hyp_va(ctxt->offset.vm_offset); + if (ctxt->offset.vcpu_offset) + offset += *kern_hyp_va(ctxt->offset.vcpu_offset); -DECLARE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); + return offset; +} -static bool kvm_hyp_handle_ptrauth(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline u64 compute_counter_value(struct arch_timer_context *ctxt) { - struct kvm_cpu_context *ctxt; - u64 val; - - if (!vcpu_has_ptrauth(vcpu)) - return false; - - ctxt = this_cpu_ptr(&kvm_hyp_ctxt); - __ptrauth_save_key(ctxt, APIA); - __ptrauth_save_key(ctxt, APIB); - __ptrauth_save_key(ctxt, APDA); - __ptrauth_save_key(ctxt, APDB); - __ptrauth_save_key(ctxt, APGA); - - vcpu_ptrauth_enable(vcpu); - - val = read_sysreg(hcr_el2); - val |= (HCR_API | HCR_APK); - write_sysreg(val, hcr_el2); - - return true; + return arch_timer_read_cntpct_el0() - hyp_timer_get_offset(ctxt); } -static bool kvm_hyp_handle_cntpct(struct kvm_vcpu *vcpu) +static bool kvm_handle_cntxct(struct kvm_vcpu *vcpu) { struct arch_timer_context *ctxt; u32 sysreg; @@ -513,18 +609,19 @@ static bool kvm_hyp_handle_cntpct(struct kvm_vcpu *vcpu) * We only get here for 64bit guests, 32bit guests will hit * the long and winding road all the way to the standard * handling. Yes, it sucks to be irrelevant. + * + * Also, we only deal with non-hypervisor context here (either + * an EL1 guest, or a non-HYP context of an EL2 guest). */ + if (is_hyp_ctxt(vcpu)) + return false; + sysreg = esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu)); switch (sysreg) { case SYS_CNTPCT_EL0: case SYS_CNTPCTSS_EL0: if (vcpu_has_nv(vcpu)) { - if (is_hyp_ctxt(vcpu)) { - ctxt = vcpu_hptimer(vcpu); - break; - } - /* Check for guest hypervisor trapping */ val = __vcpu_sys_reg(vcpu, CNTHCTL_EL2); if (!vcpu_el2_e2h_is_set(vcpu)) @@ -536,16 +633,23 @@ static bool kvm_hyp_handle_cntpct(struct kvm_vcpu *vcpu) ctxt = vcpu_ptimer(vcpu); break; + case SYS_CNTVCT_EL0: + case SYS_CNTVCTSS_EL0: + if (vcpu_has_nv(vcpu)) { + /* Check for guest hypervisor trapping */ + val = __vcpu_sys_reg(vcpu, CNTHCTL_EL2); + + if (val & CNTHCTL_EL1TVCT) + return false; + } + + ctxt = vcpu_vtimer(vcpu); + break; default: return false; } - val = arch_timer_read_cntpct_el0(); - - if (ctxt->offset.vm_offset) - val -= *kern_hyp_va(ctxt->offset.vm_offset); - if (ctxt->offset.vcpu_offset) - val -= *kern_hyp_va(ctxt->offset.vcpu_offset); + val = compute_counter_value(ctxt); vcpu_set_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu), val); __kvm_skip_instr(vcpu); @@ -576,7 +680,7 @@ static bool handle_ampere1_tcr(struct kvm_vcpu *vcpu) return true; } -static bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code) { if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_TX2_219_TVM) && handle_tx2_tvm(vcpu)) @@ -590,16 +694,13 @@ static bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code) __vgic_v3_perform_cpuif_access(vcpu) == 1) return true; - if (esr_is_ptrauth_trap(kvm_vcpu_get_esr(vcpu))) - return kvm_hyp_handle_ptrauth(vcpu, exit_code); - - if (kvm_hyp_handle_cntpct(vcpu)) + if (kvm_handle_cntxct(vcpu)) return true; return false; } -static bool kvm_hyp_handle_cp15_32(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline bool kvm_hyp_handle_cp15_32(struct kvm_vcpu *vcpu, u64 *exit_code) { if (static_branch_unlikely(&vgic_v3_cpuif_trap) && __vgic_v3_perform_cpuif_access(vcpu) == 1) @@ -608,19 +709,18 @@ static bool kvm_hyp_handle_cp15_32(struct kvm_vcpu *vcpu, u64 *exit_code) return false; } -static bool kvm_hyp_handle_memory_fault(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline bool kvm_hyp_handle_memory_fault(struct kvm_vcpu *vcpu, + u64 *exit_code) { if (!__populate_fault_info(vcpu)) return true; return false; } -static bool kvm_hyp_handle_iabt_low(struct kvm_vcpu *vcpu, u64 *exit_code) - __alias(kvm_hyp_handle_memory_fault); -static bool kvm_hyp_handle_watchpt_low(struct kvm_vcpu *vcpu, u64 *exit_code) - __alias(kvm_hyp_handle_memory_fault); +#define kvm_hyp_handle_iabt_low kvm_hyp_handle_memory_fault +#define kvm_hyp_handle_watchpt_low kvm_hyp_handle_memory_fault -static bool kvm_hyp_handle_dabt_low(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline bool kvm_hyp_handle_dabt_low(struct kvm_vcpu *vcpu, u64 *exit_code) { if (kvm_hyp_handle_memory_fault(vcpu, exit_code)) return true; @@ -650,23 +750,16 @@ static bool kvm_hyp_handle_dabt_low(struct kvm_vcpu *vcpu, u64 *exit_code) typedef bool (*exit_handler_fn)(struct kvm_vcpu *, u64 *); -static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu); - -static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code); - /* * Allow the hypervisor to handle the exit with an exit handler if it has one. * * Returns true if the hypervisor handled the exit, and control should go back * to the guest, or false if it hasn't. */ -static inline bool kvm_hyp_handle_exit(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline bool kvm_hyp_handle_exit(struct kvm_vcpu *vcpu, u64 *exit_code, + const exit_handler_fn *handlers) { - const exit_handler_fn *handlers = kvm_get_exit_handler_array(vcpu); - exit_handler_fn fn; - - fn = handlers[kvm_vcpu_trap_get_class(vcpu)]; - + exit_handler_fn fn = handlers[kvm_vcpu_trap_get_class(vcpu)]; if (fn) return fn(vcpu, exit_code); @@ -696,20 +789,9 @@ static inline void synchronize_vcpu_pstate(struct kvm_vcpu *vcpu, u64 *exit_code * the guest, false when we should restore the host state and return to the * main run loop. */ -static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline bool __fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code, + const exit_handler_fn *handlers) { - /* - * Save PSTATE early so that we can evaluate the vcpu mode - * early on. - */ - synchronize_vcpu_pstate(vcpu, exit_code); - - /* - * Check whether we want to repaint the state one way or - * another. - */ - early_exit_filter(vcpu, exit_code); - if (ARM_EXCEPTION_CODE(*exit_code) != ARM_EXCEPTION_IRQ) vcpu->arch.fault.esr_el2 = read_sysreg_el2(SYS_ESR); @@ -739,7 +821,7 @@ static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) goto exit; /* Check if there's an exit handler and allow it to handle the exit. */ - if (kvm_hyp_handle_exit(vcpu, exit_code)) + if (kvm_hyp_handle_exit(vcpu, exit_code, handlers)) goto guest; exit: /* Return to the host kernel and handle the exit */ @@ -753,7 +835,7 @@ guest: static inline void __kvm_unexpected_el2_exception(void) { - extern char __guest_exit_panic[]; + extern char __guest_exit_restore_elr_and_panic[]; unsigned long addr, fixup; struct kvm_exception_table_entry *entry, *end; unsigned long elr_el2 = read_sysreg(elr_el2); @@ -775,7 +857,8 @@ static inline void __kvm_unexpected_el2_exception(void) } /* Trigger a panic after restoring the hyp context. */ - write_sysreg(__guest_exit_panic, elr_el2); + this_cpu_ptr(&kvm_hyp_ctxt)->sys_regs[ELR_EL2] = elr_el2; + write_sysreg(__guest_exit_restore_elr_and_panic, elr_el2); } #endif /* __ARM64_KVM_HYP_SWITCH_H__ */ diff --git a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h index 4be6a7fa0070..b9cff893bbe0 100644 --- a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h +++ b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h @@ -16,16 +16,7 @@ #include <asm/kvm_hyp.h> #include <asm/kvm_mmu.h> -static inline void __sysreg_save_common_state(struct kvm_cpu_context *ctxt) -{ - ctxt_sys_reg(ctxt, MDSCR_EL1) = read_sysreg(mdscr_el1); -} - -static inline void __sysreg_save_user_state(struct kvm_cpu_context *ctxt) -{ - ctxt_sys_reg(ctxt, TPIDR_EL0) = read_sysreg(tpidr_el0); - ctxt_sys_reg(ctxt, TPIDRRO_EL0) = read_sysreg(tpidrro_el0); -} +static inline bool ctxt_has_s1poe(struct kvm_cpu_context *ctxt); static inline struct kvm_vcpu *ctxt_to_vcpu(struct kvm_cpu_context *ctxt) { @@ -37,6 +28,47 @@ static inline struct kvm_vcpu *ctxt_to_vcpu(struct kvm_cpu_context *ctxt) return vcpu; } +static inline bool ctxt_is_guest(struct kvm_cpu_context *ctxt) +{ + return host_data_ptr(host_ctxt) != ctxt; +} + +static inline u64 *ctxt_mdscr_el1(struct kvm_cpu_context *ctxt) +{ + struct kvm_vcpu *vcpu = ctxt_to_vcpu(ctxt); + + if (ctxt_is_guest(ctxt) && kvm_host_owns_debug_regs(vcpu)) + return &vcpu->arch.external_mdscr_el1; + + return &ctxt_sys_reg(ctxt, MDSCR_EL1); +} + +static inline u64 ctxt_midr_el1(struct kvm_cpu_context *ctxt) +{ + struct kvm *kvm = kern_hyp_va(ctxt_to_vcpu(ctxt)->kvm); + + if (!(ctxt_is_guest(ctxt) && + test_bit(KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS, &kvm->arch.flags))) + return read_cpuid_id(); + + return kvm_read_vm_id_reg(kvm, SYS_MIDR_EL1); +} + +static inline void __sysreg_save_common_state(struct kvm_cpu_context *ctxt) +{ + *ctxt_mdscr_el1(ctxt) = read_sysreg(mdscr_el1); + + // POR_EL0 can affect uaccess, so must be saved/restored early. + if (ctxt_has_s1poe(ctxt)) + ctxt_sys_reg(ctxt, POR_EL0) = read_sysreg_s(SYS_POR_EL0); +} + +static inline void __sysreg_save_user_state(struct kvm_cpu_context *ctxt) +{ + ctxt_sys_reg(ctxt, TPIDR_EL0) = read_sysreg(tpidr_el0); + ctxt_sys_reg(ctxt, TPIDRRO_EL0) = read_sysreg(tpidrro_el0); +} + static inline bool ctxt_has_mte(struct kvm_cpu_context *ctxt) { struct kvm_vcpu *vcpu = ctxt_to_vcpu(ctxt); @@ -52,7 +84,29 @@ static inline bool ctxt_has_s1pie(struct kvm_cpu_context *ctxt) return false; vcpu = ctxt_to_vcpu(ctxt); - return kvm_has_feat(kern_hyp_va(vcpu->kvm), ID_AA64MMFR3_EL1, S1PIE, IMP); + return kvm_has_s1pie(kern_hyp_va(vcpu->kvm)); +} + +static inline bool ctxt_has_tcrx(struct kvm_cpu_context *ctxt) +{ + struct kvm_vcpu *vcpu; + + if (!cpus_have_final_cap(ARM64_HAS_TCR2)) + return false; + + vcpu = ctxt_to_vcpu(ctxt); + return kvm_has_tcr2(kern_hyp_va(vcpu->kvm)); +} + +static inline bool ctxt_has_s1poe(struct kvm_cpu_context *ctxt) +{ + struct kvm_vcpu *vcpu; + + if (!system_supports_poe()) + return false; + + vcpu = ctxt_to_vcpu(ctxt); + return kvm_has_s1poe(kern_hyp_va(vcpu->kvm)); } static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt) @@ -62,8 +116,17 @@ static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt) ctxt_sys_reg(ctxt, TTBR0_EL1) = read_sysreg_el1(SYS_TTBR0); ctxt_sys_reg(ctxt, TTBR1_EL1) = read_sysreg_el1(SYS_TTBR1); ctxt_sys_reg(ctxt, TCR_EL1) = read_sysreg_el1(SYS_TCR); - if (cpus_have_final_cap(ARM64_HAS_TCR2)) + if (ctxt_has_tcrx(ctxt)) { ctxt_sys_reg(ctxt, TCR2_EL1) = read_sysreg_el1(SYS_TCR2); + + if (ctxt_has_s1pie(ctxt)) { + ctxt_sys_reg(ctxt, PIR_EL1) = read_sysreg_el1(SYS_PIR); + ctxt_sys_reg(ctxt, PIRE0_EL1) = read_sysreg_el1(SYS_PIRE0); + } + + if (ctxt_has_s1poe(ctxt)) + ctxt_sys_reg(ctxt, POR_EL1) = read_sysreg_el1(SYS_POR); + } ctxt_sys_reg(ctxt, ESR_EL1) = read_sysreg_el1(SYS_ESR); ctxt_sys_reg(ctxt, AFSR0_EL1) = read_sysreg_el1(SYS_AFSR0); ctxt_sys_reg(ctxt, AFSR1_EL1) = read_sysreg_el1(SYS_AFSR1); @@ -73,10 +136,6 @@ static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt) ctxt_sys_reg(ctxt, CONTEXTIDR_EL1) = read_sysreg_el1(SYS_CONTEXTIDR); ctxt_sys_reg(ctxt, AMAIR_EL1) = read_sysreg_el1(SYS_AMAIR); ctxt_sys_reg(ctxt, CNTKCTL_EL1) = read_sysreg_el1(SYS_CNTKCTL); - if (ctxt_has_s1pie(ctxt)) { - ctxt_sys_reg(ctxt, PIR_EL1) = read_sysreg_el1(SYS_PIR); - ctxt_sys_reg(ctxt, PIRE0_EL1) = read_sysreg_el1(SYS_PIRE0); - } ctxt_sys_reg(ctxt, PAR_EL1) = read_sysreg_par(); ctxt_sys_reg(ctxt, TPIDR_EL1) = read_sysreg(tpidr_el1); @@ -106,7 +165,11 @@ static inline void __sysreg_save_el2_return_state(struct kvm_cpu_context *ctxt) static inline void __sysreg_restore_common_state(struct kvm_cpu_context *ctxt) { - write_sysreg(ctxt_sys_reg(ctxt, MDSCR_EL1), mdscr_el1); + write_sysreg(*ctxt_mdscr_el1(ctxt), mdscr_el1); + + // POR_EL0 can affect uaccess, so must be saved/restored early. + if (ctxt_has_s1poe(ctxt)) + write_sysreg_s(ctxt_sys_reg(ctxt, POR_EL0), SYS_POR_EL0); } static inline void __sysreg_restore_user_state(struct kvm_cpu_context *ctxt) @@ -115,9 +178,11 @@ static inline void __sysreg_restore_user_state(struct kvm_cpu_context *ctxt) write_sysreg(ctxt_sys_reg(ctxt, TPIDRRO_EL0), tpidrro_el0); } -static inline void __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt) +static inline void __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt, + u64 midr, u64 mpidr) { - write_sysreg(ctxt_sys_reg(ctxt, MPIDR_EL1), vmpidr_el2); + write_sysreg(midr, vpidr_el2); + write_sysreg(mpidr, vmpidr_el2); if (has_vhe() || !cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { @@ -138,8 +203,17 @@ static inline void __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt) write_sysreg_el1(ctxt_sys_reg(ctxt, CPACR_EL1), SYS_CPACR); write_sysreg_el1(ctxt_sys_reg(ctxt, TTBR0_EL1), SYS_TTBR0); write_sysreg_el1(ctxt_sys_reg(ctxt, TTBR1_EL1), SYS_TTBR1); - if (cpus_have_final_cap(ARM64_HAS_TCR2)) + if (ctxt_has_tcrx(ctxt)) { write_sysreg_el1(ctxt_sys_reg(ctxt, TCR2_EL1), SYS_TCR2); + + if (ctxt_has_s1pie(ctxt)) { + write_sysreg_el1(ctxt_sys_reg(ctxt, PIR_EL1), SYS_PIR); + write_sysreg_el1(ctxt_sys_reg(ctxt, PIRE0_EL1), SYS_PIRE0); + } + + if (ctxt_has_s1poe(ctxt)) + write_sysreg_el1(ctxt_sys_reg(ctxt, POR_EL1), SYS_POR); + } write_sysreg_el1(ctxt_sys_reg(ctxt, ESR_EL1), SYS_ESR); write_sysreg_el1(ctxt_sys_reg(ctxt, AFSR0_EL1), SYS_AFSR0); write_sysreg_el1(ctxt_sys_reg(ctxt, AFSR1_EL1), SYS_AFSR1); @@ -149,10 +223,6 @@ static inline void __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt) write_sysreg_el1(ctxt_sys_reg(ctxt, CONTEXTIDR_EL1), SYS_CONTEXTIDR); write_sysreg_el1(ctxt_sys_reg(ctxt, AMAIR_EL1), SYS_AMAIR); write_sysreg_el1(ctxt_sys_reg(ctxt, CNTKCTL_EL1), SYS_CNTKCTL); - if (ctxt_has_s1pie(ctxt)) { - write_sysreg_el1(ctxt_sys_reg(ctxt, PIR_EL1), SYS_PIR); - write_sysreg_el1(ctxt_sys_reg(ctxt, PIRE0_EL1), SYS_PIRE0); - } write_sysreg(ctxt_sys_reg(ctxt, PAR_EL1), par_el1); write_sysreg(ctxt_sys_reg(ctxt, TPIDR_EL1), tpidr_el1); @@ -240,7 +310,7 @@ static inline void __sysreg32_save_state(struct kvm_vcpu *vcpu) __vcpu_sys_reg(vcpu, DACR32_EL2) = read_sysreg(dacr32_el2); __vcpu_sys_reg(vcpu, IFSR32_EL2) = read_sysreg(ifsr32_el2); - if (has_vhe() || vcpu_get_flag(vcpu, DEBUG_DIRTY)) + if (has_vhe() || kvm_debug_regs_in_use(vcpu)) __vcpu_sys_reg(vcpu, DBGVCR32_EL2) = read_sysreg(dbgvcr32_el2); } @@ -257,7 +327,7 @@ static inline void __sysreg32_restore_state(struct kvm_vcpu *vcpu) write_sysreg(__vcpu_sys_reg(vcpu, DACR32_EL2), dacr32_el2); write_sysreg(__vcpu_sys_reg(vcpu, IFSR32_EL2), ifsr32_el2); - if (has_vhe() || vcpu_get_flag(vcpu, DEBUG_DIRTY)) + if (has_vhe() || kvm_debug_regs_in_use(vcpu)) write_sysreg(__vcpu_sys_reg(vcpu, DBGVCR32_EL2), dbgvcr32_el2); } diff --git a/arch/arm64/kvm/hyp/include/nvhe/ffa.h b/arch/arm64/kvm/hyp/include/nvhe/ffa.h index d9fd5e6c7d3c..146e0aebfa1c 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/ffa.h +++ b/arch/arm64/kvm/hyp/include/nvhe/ffa.h @@ -9,7 +9,7 @@ #include <asm/kvm_host.h> #define FFA_MIN_FUNC_NUM 0x60 -#define FFA_MAX_FUNC_NUM 0x7F +#define FFA_MAX_FUNC_NUM 0xFF int hyp_ffa_init(void *pages); bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt, u32 func_id); diff --git a/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h b/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h deleted file mode 100644 index 51f043649146..000000000000 --- a/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h +++ /dev/null @@ -1,223 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2021 Google LLC - * Author: Fuad Tabba <tabba@google.com> - */ - -#ifndef __ARM64_KVM_FIXED_CONFIG_H__ -#define __ARM64_KVM_FIXED_CONFIG_H__ - -#include <asm/sysreg.h> - -/* - * This file contains definitions for features to be allowed or restricted for - * guest virtual machines, depending on the mode KVM is running in and on the - * type of guest that is running. - * - * The ALLOW masks represent a bitmask of feature fields that are allowed - * without any restrictions as long as they are supported by the system. - * - * The RESTRICT_UNSIGNED masks, if present, represent unsigned fields for - * features that are restricted to support at most the specified feature. - * - * If a feature field is not present in either, than it is not supported. - * - * The approach taken for protected VMs is to allow features that are: - * - Needed by common Linux distributions (e.g., floating point) - * - Trivial to support, e.g., supporting the feature does not introduce or - * require tracking of additional state in KVM - * - Cannot be trapped or prevent the guest from using anyway - */ - -/* - * Allow for protected VMs: - * - Floating-point and Advanced SIMD - * - Data Independent Timing - * - Spectre/Meltdown Mitigation - */ -#define PVM_ID_AA64PFR0_ALLOW (\ - ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_FP) | \ - ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AdvSIMD) | \ - ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_DIT) | \ - ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2) | \ - ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3) \ - ) - -/* - * Restrict to the following *unsigned* features for protected VMs: - * - AArch64 guests only (no support for AArch32 guests): - * AArch32 adds complexity in trap handling, emulation, condition codes, - * etc... - * - RAS (v1) - * Supported by KVM - */ -#define PVM_ID_AA64PFR0_RESTRICT_UNSIGNED (\ - FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL0), ID_AA64PFR0_EL1_ELx_64BIT_ONLY) | \ - FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL1), ID_AA64PFR0_EL1_ELx_64BIT_ONLY) | \ - FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL2), ID_AA64PFR0_EL1_ELx_64BIT_ONLY) | \ - FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL3), ID_AA64PFR0_EL1_ELx_64BIT_ONLY) | \ - FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_RAS), ID_AA64PFR0_EL1_RAS_IMP) \ - ) - -/* - * Allow for protected VMs: - * - Branch Target Identification - * - Speculative Store Bypassing - */ -#define PVM_ID_AA64PFR1_ALLOW (\ - ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_BT) | \ - ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_SSBS) \ - ) - -#define PVM_ID_AA64PFR2_ALLOW 0ULL - -/* - * Allow for protected VMs: - * - Mixed-endian - * - Distinction between Secure and Non-secure Memory - * - Mixed-endian at EL0 only - * - Non-context synchronizing exception entry and exit - */ -#define PVM_ID_AA64MMFR0_ALLOW (\ - ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_BIGEND) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_SNSMEM) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_BIGENDEL0) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_EXS) \ - ) - -/* - * Restrict to the following *unsigned* features for protected VMs: - * - 40-bit IPA - * - 16-bit ASID - */ -#define PVM_ID_AA64MMFR0_RESTRICT_UNSIGNED (\ - FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_PARANGE), ID_AA64MMFR0_EL1_PARANGE_40) | \ - FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_ASIDBITS), ID_AA64MMFR0_EL1_ASIDBITS_16) \ - ) - -/* - * Allow for protected VMs: - * - Hardware translation table updates to Access flag and Dirty state - * - Number of VMID bits from CPU - * - Hierarchical Permission Disables - * - Privileged Access Never - * - SError interrupt exceptions from speculative reads - * - Enhanced Translation Synchronization - * - Control for cache maintenance permission - */ -#define PVM_ID_AA64MMFR1_ALLOW (\ - ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_HAFDBS) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_VMIDBits) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_HPDS) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_PAN) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_SpecSEI) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_ETS) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_CMOW) \ - ) - -/* - * Allow for protected VMs: - * - Common not Private translations - * - User Access Override - * - IESB bit in the SCTLR_ELx registers - * - Unaligned single-copy atomicity and atomic functions - * - ESR_ELx.EC value on an exception by read access to feature ID space - * - TTL field in address operations. - * - Break-before-make sequences when changing translation block size - * - E0PDx mechanism - */ -#define PVM_ID_AA64MMFR2_ALLOW (\ - ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_CnP) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_UAO) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_IESB) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_AT) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_IDS) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_TTL) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_BBM) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_E0PD) \ - ) - -#define PVM_ID_AA64MMFR3_ALLOW (0ULL) - -/* - * No support for Scalable Vectors for protected VMs: - * Requires additional support from KVM, e.g., context-switching and - * trapping at EL2 - */ -#define PVM_ID_AA64ZFR0_ALLOW (0ULL) - -/* - * No support for debug, including breakpoints, and watchpoints for protected - * VMs: - * The Arm architecture mandates support for at least the Armv8 debug - * architecture, which would include at least 2 hardware breakpoints and - * watchpoints. Providing that support to protected guests adds - * considerable state and complexity. Therefore, the reserved value of 0 is - * used for debug-related fields. - */ -#define PVM_ID_AA64DFR0_ALLOW (0ULL) -#define PVM_ID_AA64DFR1_ALLOW (0ULL) - -/* - * No support for implementation defined features. - */ -#define PVM_ID_AA64AFR0_ALLOW (0ULL) -#define PVM_ID_AA64AFR1_ALLOW (0ULL) - -/* - * No restrictions on instructions implemented in AArch64. - */ -#define PVM_ID_AA64ISAR0_ALLOW (\ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_AES) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SHA1) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SHA2) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_CRC32) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_ATOMIC) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_RDM) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SHA3) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SM3) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SM4) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_DP) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_FHM) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_TS) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_TLB) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_RNDR) \ - ) - -/* Restrict pointer authentication to the basic version. */ -#define PVM_ID_AA64ISAR1_RESTRICT_UNSIGNED (\ - FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA), ID_AA64ISAR1_EL1_APA_PAuth) | \ - FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API), ID_AA64ISAR1_EL1_API_PAuth) \ - ) - -#define PVM_ID_AA64ISAR2_RESTRICT_UNSIGNED (\ - FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_APA3), ID_AA64ISAR2_EL1_APA3_PAuth) \ - ) - -#define PVM_ID_AA64ISAR1_ALLOW (\ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_DPB) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_JSCVT) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_FCMA) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_LRCPC) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPA) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_FRINTTS) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_SB) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_SPECRES) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_BF16) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_DGH) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_I8MM) \ - ) - -#define PVM_ID_AA64ISAR2_ALLOW (\ - ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_ATS1A)| \ - ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_GPA3) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_MOPS) \ - ) - -u64 pvm_read_id_reg(const struct kvm_vcpu *vcpu, u32 id); -bool kvm_handle_pvm_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code); -bool kvm_handle_pvm_restricted(struct kvm_vcpu *vcpu, u64 *exit_code); -int kvm_check_pvm_sysreg_table(void); - -#endif /* __ARM64_KVM_FIXED_CONFIG_H__ */ diff --git a/arch/arm64/kvm/hyp/include/nvhe/gfp.h b/arch/arm64/kvm/hyp/include/nvhe/gfp.h index 97c527ef53c2..3766333bace9 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/gfp.h +++ b/arch/arm64/kvm/hyp/include/nvhe/gfp.h @@ -7,7 +7,7 @@ #include <nvhe/memory.h> #include <nvhe/spinlock.h> -#define HYP_NO_ORDER USHRT_MAX +#define HYP_NO_ORDER ((u8)(~0)) struct hyp_pool { /* @@ -19,11 +19,11 @@ struct hyp_pool { struct list_head free_area[NR_PAGE_ORDERS]; phys_addr_t range_start; phys_addr_t range_end; - unsigned short max_order; + u8 max_order; }; /* Allocation */ -void *hyp_alloc_pages(struct hyp_pool *pool, unsigned short order); +void *hyp_alloc_pages(struct hyp_pool *pool, u8 order); void hyp_split_page(struct hyp_page *page); void hyp_get_page(struct hyp_pool *pool, void *addr); void hyp_put_page(struct hyp_pool *pool, void *addr); diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index 0972faccc2af..ea0a704da9b8 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -11,40 +11,10 @@ #include <asm/kvm_mmu.h> #include <asm/kvm_pgtable.h> #include <asm/virt.h> +#include <nvhe/memory.h> #include <nvhe/pkvm.h> #include <nvhe/spinlock.h> -/* - * SW bits 0-1 are reserved to track the memory ownership state of each page: - * 00: The page is owned exclusively by the page-table owner. - * 01: The page is owned by the page-table owner, but is shared - * with another entity. - * 10: The page is shared with, but not owned by the page-table owner. - * 11: Reserved for future use (lending). - */ -enum pkvm_page_state { - PKVM_PAGE_OWNED = 0ULL, - PKVM_PAGE_SHARED_OWNED = KVM_PGTABLE_PROT_SW0, - PKVM_PAGE_SHARED_BORROWED = KVM_PGTABLE_PROT_SW1, - __PKVM_PAGE_RESERVED = KVM_PGTABLE_PROT_SW0 | - KVM_PGTABLE_PROT_SW1, - - /* Meta-states which aren't encoded directly in the PTE's SW bits */ - PKVM_NOPAGE, -}; - -#define PKVM_PAGE_STATE_PROT_MASK (KVM_PGTABLE_PROT_SW0 | KVM_PGTABLE_PROT_SW1) -static inline enum kvm_pgtable_prot pkvm_mkstate(enum kvm_pgtable_prot prot, - enum pkvm_page_state state) -{ - return (prot & ~PKVM_PAGE_STATE_PROT_MASK) | state; -} - -static inline enum pkvm_page_state pkvm_getstate(enum kvm_pgtable_prot prot) -{ - return prot & PKVM_PAGE_STATE_PROT_MASK; -} - struct host_mmu { struct kvm_arch arch; struct kvm_pgtable pgt; @@ -69,6 +39,13 @@ int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages); int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages); int __pkvm_host_share_ffa(u64 pfn, u64 nr_pages); int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages); +int __pkvm_host_share_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu, + enum kvm_pgtable_prot prot); +int __pkvm_host_unshare_guest(u64 gfn, struct pkvm_hyp_vm *hyp_vm); +int __pkvm_host_relax_perms_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu, enum kvm_pgtable_prot prot); +int __pkvm_host_wrprotect_guest(u64 gfn, struct pkvm_hyp_vm *hyp_vm); +int __pkvm_host_test_clear_young_guest(u64 gfn, bool mkold, struct pkvm_hyp_vm *vm); +int __pkvm_host_mkyoung_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu); bool addr_is_memory(phys_addr_t phys); int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot); @@ -79,7 +56,7 @@ void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt); int hyp_pin_shared_mem(void *from, void *to); void hyp_unpin_shared_mem(void *from, void *to); -void reclaim_guest_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc); +void reclaim_pgtable_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc); int refill_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages, struct kvm_hyp_memcache *host_mc); diff --git a/arch/arm64/kvm/hyp/include/nvhe/memory.h b/arch/arm64/kvm/hyp/include/nvhe/memory.h index ab205c4d6774..34233d586060 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/memory.h +++ b/arch/arm64/kvm/hyp/include/nvhe/memory.h @@ -7,9 +7,47 @@ #include <linux/types.h> +/* + * Bits 0-1 are reserved to track the memory ownership state of each page: + * 00: The page is owned exclusively by the page-table owner. + * 01: The page is owned by the page-table owner, but is shared + * with another entity. + * 10: The page is shared with, but not owned by the page-table owner. + * 11: Reserved for future use (lending). + */ +enum pkvm_page_state { + PKVM_PAGE_OWNED = 0ULL, + PKVM_PAGE_SHARED_OWNED = BIT(0), + PKVM_PAGE_SHARED_BORROWED = BIT(1), + __PKVM_PAGE_RESERVED = BIT(0) | BIT(1), + + /* Meta-states which aren't encoded directly in the PTE's SW bits */ + PKVM_NOPAGE = BIT(2), +}; +#define PKVM_PAGE_META_STATES_MASK (~__PKVM_PAGE_RESERVED) + +#define PKVM_PAGE_STATE_PROT_MASK (KVM_PGTABLE_PROT_SW0 | KVM_PGTABLE_PROT_SW1) +static inline enum kvm_pgtable_prot pkvm_mkstate(enum kvm_pgtable_prot prot, + enum pkvm_page_state state) +{ + prot &= ~PKVM_PAGE_STATE_PROT_MASK; + prot |= FIELD_PREP(PKVM_PAGE_STATE_PROT_MASK, state); + return prot; +} + +static inline enum pkvm_page_state pkvm_getstate(enum kvm_pgtable_prot prot) +{ + return FIELD_GET(PKVM_PAGE_STATE_PROT_MASK, prot); +} + struct hyp_page { - unsigned short refcount; - unsigned short order; + u16 refcount; + u8 order; + + /* Host (non-meta) state. Guarded by the host stage-2 lock. */ + enum pkvm_page_state host_state : 8; + + u32 host_share_guest_count; }; extern u64 __hyp_vmemmap; @@ -29,7 +67,13 @@ static inline phys_addr_t hyp_virt_to_phys(void *addr) #define hyp_phys_to_pfn(phys) ((phys) >> PAGE_SHIFT) #define hyp_pfn_to_phys(pfn) ((phys_addr_t)((pfn) << PAGE_SHIFT)) -#define hyp_phys_to_page(phys) (&hyp_vmemmap[hyp_phys_to_pfn(phys)]) + +static inline struct hyp_page *hyp_phys_to_page(phys_addr_t phys) +{ + BUILD_BUG_ON(sizeof(struct hyp_page) != sizeof(u64)); + return &hyp_vmemmap[hyp_phys_to_pfn(phys)]; +} + #define hyp_virt_to_page(virt) hyp_phys_to_page(__hyp_pa(virt)) #define hyp_virt_to_pfn(virt) hyp_phys_to_pfn(__hyp_pa(virt)) diff --git a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h index 82b3d62538a6..ce31d3b73603 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h +++ b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h @@ -20,6 +20,12 @@ struct pkvm_hyp_vcpu { /* Backpointer to the host's (untrusted) vCPU instance. */ struct kvm_vcpu *host_vcpu; + + /* + * If this hyp vCPU is loaded, then this is a backpointer to the + * per-cpu pointer tracking us. Otherwise, NULL if not loaded. + */ + struct pkvm_hyp_vcpu **loaded_hyp_vcpu; }; /* @@ -37,22 +43,28 @@ struct pkvm_hyp_vm { struct hyp_pool pool; hyp_spinlock_t lock; - /* - * The number of vcpus initialized and ready to run. - * Modifying this is protected by 'vm_table_lock'. - */ - unsigned int nr_vcpus; - /* Array of the hyp vCPU structures for this VM. */ struct pkvm_hyp_vcpu *vcpus[]; }; +extern hyp_spinlock_t vm_table_lock; + static inline struct pkvm_hyp_vm * pkvm_hyp_vcpu_to_hyp_vm(struct pkvm_hyp_vcpu *hyp_vcpu) { return container_of(hyp_vcpu->vcpu.kvm, struct pkvm_hyp_vm, kvm); } +static inline bool pkvm_hyp_vcpu_is_protected(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + return vcpu_is_protected(&hyp_vcpu->vcpu); +} + +static inline bool pkvm_hyp_vm_is_protected(struct pkvm_hyp_vm *hyp_vm) +{ + return kvm_vm_is_protected(&hyp_vm->kvm); +} + void pkvm_hyp_vm_table_init(void *tbl); int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva, @@ -64,5 +76,15 @@ int __pkvm_teardown_vm(pkvm_handle_t handle); struct pkvm_hyp_vcpu *pkvm_load_hyp_vcpu(pkvm_handle_t handle, unsigned int vcpu_idx); void pkvm_put_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu); +struct pkvm_hyp_vcpu *pkvm_get_loaded_hyp_vcpu(void); + +struct pkvm_hyp_vm *get_pkvm_hyp_vm(pkvm_handle_t handle); +struct pkvm_hyp_vm *get_np_pkvm_hyp_vm(pkvm_handle_t handle); +void put_pkvm_hyp_vm(struct pkvm_hyp_vm *hyp_vm); + +bool kvm_handle_pvm_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code); +bool kvm_handle_pvm_restricted(struct kvm_vcpu *vcpu, u64 *exit_code); +void kvm_init_pvm_id_regs(struct kvm_vcpu *vcpu); +int kvm_check_pvm_sysreg_table(void); #endif /* __ARM64_KVM_NVHE_PKVM_H__ */ diff --git a/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h b/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h index 45a84f0ade04..1e6d995968a1 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h +++ b/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h @@ -15,6 +15,4 @@ #define DECLARE_REG(type, name, ctxt, reg) \ type name = (type)cpu_reg(ctxt, (reg)) -void __pkvm_vcpu_init_traps(struct kvm_vcpu *vcpu); - #endif /* __ARM64_KVM_NVHE_TRAP_HANDLER_H__ */ diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile index 2250253a6429..b43426a493df 100644 --- a/arch/arm64/kvm/hyp/nvhe/Makefile +++ b/arch/arm64/kvm/hyp/nvhe/Makefile @@ -20,6 +20,8 @@ HOST_EXTRACFLAGS += -I$(objtree)/include lib-objs := clear_page.o copy_page.o memcpy.o memset.o lib-objs := $(addprefix ../../../lib/, $(lib-objs)) +CFLAGS_switch.nvhe.o += -Wno-override-init + hyp-obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o \ hyp-main.o hyp-smp.o psci-relay.o early_alloc.o page_alloc.o \ cache.o setup.o mm.o mem_protect.o sys_regs.o pkvm.o stacktrace.o ffa.o @@ -89,24 +91,11 @@ quiet_cmd_hyprel = HYPREL $@ quiet_cmd_hypcopy = HYPCOPY $@ cmd_hypcopy = $(OBJCOPY) --prefix-symbols=__kvm_nvhe_ $< $@ -# Remove ftrace, Shadow Call Stack, and CFI CFLAGS. -# This is equivalent to the 'notrace', '__noscs', and '__nocfi' annotations. -KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS) $(CC_FLAGS_CFI), $(KBUILD_CFLAGS)) +# Remove ftrace and Shadow Call Stack CFLAGS. +# This is equivalent to the 'notrace' and '__noscs' annotations. +KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS), $(KBUILD_CFLAGS)) # Starting from 13.0.0 llvm emits SHT_REL section '.llvm.call-graph-profile' # when profile optimization is applied. gen-hyprel does not support SHT_REL and # causes a build failure. Remove profile optimization flags. KBUILD_CFLAGS := $(filter-out -fprofile-sample-use=% -fprofile-use=%, $(KBUILD_CFLAGS)) KBUILD_CFLAGS += -fno-asynchronous-unwind-tables -fno-unwind-tables - -# KVM nVHE code is run at a different exception code with a different map, so -# compiler instrumentation that inserts callbacks or checks into the code may -# cause crashes. Just disable it. -GCOV_PROFILE := n -KASAN_SANITIZE := n -KCSAN_SANITIZE := n -UBSAN_SANITIZE := n -KCOV_INSTRUMENT := n - -# Skip objtool checking for this directory because nVHE code is compiled with -# non-standard build rules. -OBJECT_FILES_NON_STANDARD := y diff --git a/arch/arm64/kvm/hyp/nvhe/debug-sr.c b/arch/arm64/kvm/hyp/nvhe/debug-sr.c index 7746ea507b6f..2f4a4f5036bb 100644 --- a/arch/arm64/kvm/hyp/nvhe/debug-sr.c +++ b/arch/arm64/kvm/hyp/nvhe/debug-sr.c @@ -51,42 +51,55 @@ static void __debug_restore_spe(u64 pmscr_el1) write_sysreg_el1(pmscr_el1, SYS_PMSCR); } -static void __debug_save_trace(u64 *trfcr_el1) +static void __trace_do_switch(u64 *saved_trfcr, u64 new_trfcr) { - *trfcr_el1 = 0; + *saved_trfcr = read_sysreg_el1(SYS_TRFCR); + write_sysreg_el1(new_trfcr, SYS_TRFCR); +} - /* Check if the TRBE is enabled */ - if (!(read_sysreg_s(SYS_TRBLIMITR_EL1) & TRBLIMITR_EL1_E)) - return; - /* - * Prohibit trace generation while we are in guest. - * Since access to TRFCR_EL1 is trapped, the guest can't - * modify the filtering set by the host. - */ - *trfcr_el1 = read_sysreg_el1(SYS_TRFCR); - write_sysreg_el1(0, SYS_TRFCR); - isb(); - /* Drain the trace buffer to memory */ - tsb_csync(); +static bool __trace_needs_drain(void) +{ + if (is_protected_kvm_enabled() && host_data_test_flag(HAS_TRBE)) + return read_sysreg_s(SYS_TRBLIMITR_EL1) & TRBLIMITR_EL1_E; + + return host_data_test_flag(TRBE_ENABLED); } -static void __debug_restore_trace(u64 trfcr_el1) +static bool __trace_needs_switch(void) { - if (!trfcr_el1) - return; + return host_data_test_flag(TRBE_ENABLED) || + host_data_test_flag(EL1_TRACING_CONFIGURED); +} - /* Restore trace filter controls */ - write_sysreg_el1(trfcr_el1, SYS_TRFCR); +static void __trace_switch_to_guest(void) +{ + /* Unsupported with TRBE so disable */ + if (host_data_test_flag(TRBE_ENABLED)) + *host_data_ptr(trfcr_while_in_guest) = 0; + + __trace_do_switch(host_data_ptr(host_debug_state.trfcr_el1), + *host_data_ptr(trfcr_while_in_guest)); + + if (__trace_needs_drain()) { + isb(); + tsb_csync(); + } +} + +static void __trace_switch_to_host(void) +{ + __trace_do_switch(host_data_ptr(trfcr_while_in_guest), + *host_data_ptr(host_debug_state.trfcr_el1)); } void __debug_save_host_buffers_nvhe(struct kvm_vcpu *vcpu) { /* Disable and flush SPE data generation */ - if (vcpu_get_flag(vcpu, DEBUG_STATE_SAVE_SPE)) - __debug_save_spe(&vcpu->arch.host_debug_state.pmscr_el1); - /* Disable and flush Self-Hosted Trace generation */ - if (vcpu_get_flag(vcpu, DEBUG_STATE_SAVE_TRBE)) - __debug_save_trace(&vcpu->arch.host_debug_state.trfcr_el1); + if (host_data_test_flag(HAS_SPE)) + __debug_save_spe(host_data_ptr(host_debug_state.pmscr_el1)); + + if (__trace_needs_switch()) + __trace_switch_to_guest(); } void __debug_switch_to_guest(struct kvm_vcpu *vcpu) @@ -96,18 +109,13 @@ void __debug_switch_to_guest(struct kvm_vcpu *vcpu) void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu) { - if (vcpu_get_flag(vcpu, DEBUG_STATE_SAVE_SPE)) - __debug_restore_spe(vcpu->arch.host_debug_state.pmscr_el1); - if (vcpu_get_flag(vcpu, DEBUG_STATE_SAVE_TRBE)) - __debug_restore_trace(vcpu->arch.host_debug_state.trfcr_el1); + if (host_data_test_flag(HAS_SPE)) + __debug_restore_spe(*host_data_ptr(host_debug_state.pmscr_el1)); + if (__trace_needs_switch()) + __trace_switch_to_host(); } void __debug_switch_to_host(struct kvm_vcpu *vcpu) { __debug_switch_to_host_common(vcpu); } - -u64 __kvm_get_mdcr_el2(void) -{ - return read_sysreg(mdcr_el2); -} diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c index 320f2eaa14a9..e433dfab882a 100644 --- a/arch/arm64/kvm/hyp/nvhe/ffa.c +++ b/arch/arm64/kvm/hyp/nvhe/ffa.c @@ -67,6 +67,9 @@ struct kvm_ffa_buffers { */ static struct kvm_ffa_buffers hyp_buffers; static struct kvm_ffa_buffers host_buffers; +static u32 hyp_ffa_version; +static bool has_version_negotiated; +static hyp_spinlock_t version_lock; static void ffa_to_smccc_error(struct arm_smccc_res *res, u64 ffa_errno) { @@ -177,6 +180,14 @@ static void ffa_retrieve_req(struct arm_smccc_res *res, u32 len) res); } +static void ffa_rx_release(struct arm_smccc_res *res) +{ + arm_smccc_1_1_smc(FFA_RX_RELEASE, + 0, 0, + 0, 0, 0, 0, 0, + res); +} + static void do_ffa_rxtx_map(struct arm_smccc_res *res, struct kvm_cpu_context *ctxt) { @@ -415,9 +426,9 @@ out: return; } -static __always_inline void do_ffa_mem_xfer(const u64 func_id, - struct arm_smccc_res *res, - struct kvm_cpu_context *ctxt) +static void __do_ffa_mem_xfer(const u64 func_id, + struct arm_smccc_res *res, + struct kvm_cpu_context *ctxt) { DECLARE_REG(u32, len, ctxt, 1); DECLARE_REG(u32, fraglen, ctxt, 2); @@ -429,9 +440,6 @@ static __always_inline void do_ffa_mem_xfer(const u64 func_id, u32 offset, nr_ranges; int ret = 0; - BUILD_BUG_ON(func_id != FFA_FN64_MEM_SHARE && - func_id != FFA_FN64_MEM_LEND); - if (addr_mbz || npages_mbz || fraglen > len || fraglen > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE) { ret = FFA_RET_INVALID_PARAMETERS; @@ -450,11 +458,16 @@ static __always_inline void do_ffa_mem_xfer(const u64 func_id, goto out_unlock; } + if (len > ffa_desc_buf.len) { + ret = FFA_RET_NO_MEMORY; + goto out_unlock; + } + buf = hyp_buffers.tx; memcpy(buf, host_buffers.tx, fraglen); ep_mem_access = (void *)buf + - ffa_mem_desc_offset(buf, 0, FFA_VERSION_1_0); + ffa_mem_desc_offset(buf, 0, hyp_ffa_version); offset = ep_mem_access->composite_off; if (!offset || buf->ep_count != 1 || buf->sender_id != HOST_FFA_ID) { ret = FFA_RET_INVALID_PARAMETERS; @@ -501,6 +514,13 @@ err_unshare: goto out_unlock; } +#define do_ffa_mem_xfer(fid, res, ctxt) \ + do { \ + BUILD_BUG_ON((fid) != FFA_FN64_MEM_SHARE && \ + (fid) != FFA_FN64_MEM_LEND); \ + __do_ffa_mem_xfer((fid), (res), (ctxt)); \ + } while (0); + static void do_ffa_mem_reclaim(struct arm_smccc_res *res, struct kvm_cpu_context *ctxt) { @@ -533,7 +553,7 @@ static void do_ffa_mem_reclaim(struct arm_smccc_res *res, fraglen = res->a2; ep_mem_access = (void *)buf + - ffa_mem_desc_offset(buf, 0, FFA_VERSION_1_0); + ffa_mem_desc_offset(buf, 0, hyp_ffa_version); offset = ep_mem_access->composite_off; /* * We can trust the SPMD to get this right, but let's at least @@ -543,16 +563,19 @@ static void do_ffa_mem_reclaim(struct arm_smccc_res *res, if (WARN_ON(offset > len || fraglen > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE)) { ret = FFA_RET_ABORTED; + ffa_rx_release(res); goto out_unlock; } if (len > ffa_desc_buf.len) { ret = FFA_RET_NO_MEMORY; + ffa_rx_release(res); goto out_unlock; } buf = ffa_desc_buf.buf; memcpy(buf, hyp_buffers.rx, fraglen); + ffa_rx_release(res); for (fragoff = fraglen; fragoff < len; fragoff += fraglen) { ffa_mem_frag_rx(res, handle_lo, handle_hi, fragoff); @@ -563,6 +586,7 @@ static void do_ffa_mem_reclaim(struct arm_smccc_res *res, fraglen = res->a3; memcpy((void *)buf + fragoff, hyp_buffers.rx, fraglen); + ffa_rx_release(res); } ffa_mem_reclaim(res, handle_lo, handle_hi, flags); @@ -600,7 +624,6 @@ static bool ffa_call_supported(u64 func_id) case FFA_MSG_POLL: case FFA_MSG_WAIT: /* 32-bit variants of 64-bit calls */ - case FFA_MSG_SEND_DIRECT_REQ: case FFA_MSG_SEND_DIRECT_RESP: case FFA_RXTX_MAP: case FFA_MEM_DONATE: @@ -640,6 +663,132 @@ out_handled: return true; } +static int hyp_ffa_post_init(void) +{ + size_t min_rxtx_sz; + struct arm_smccc_res res; + + arm_smccc_1_1_smc(FFA_ID_GET, 0, 0, 0, 0, 0, 0, 0, &res); + if (res.a0 != FFA_SUCCESS) + return -EOPNOTSUPP; + + if (res.a2 != HOST_FFA_ID) + return -EINVAL; + + arm_smccc_1_1_smc(FFA_FEATURES, FFA_FN64_RXTX_MAP, + 0, 0, 0, 0, 0, 0, &res); + if (res.a0 != FFA_SUCCESS) + return -EOPNOTSUPP; + + switch (res.a2) { + case FFA_FEAT_RXTX_MIN_SZ_4K: + min_rxtx_sz = SZ_4K; + break; + case FFA_FEAT_RXTX_MIN_SZ_16K: + min_rxtx_sz = SZ_16K; + break; + case FFA_FEAT_RXTX_MIN_SZ_64K: + min_rxtx_sz = SZ_64K; + break; + default: + return -EINVAL; + } + + if (min_rxtx_sz > PAGE_SIZE) + return -EOPNOTSUPP; + + return 0; +} + +static void do_ffa_version(struct arm_smccc_res *res, + struct kvm_cpu_context *ctxt) +{ + DECLARE_REG(u32, ffa_req_version, ctxt, 1); + + if (FFA_MAJOR_VERSION(ffa_req_version) != 1) { + res->a0 = FFA_RET_NOT_SUPPORTED; + return; + } + + hyp_spin_lock(&version_lock); + if (has_version_negotiated) { + res->a0 = hyp_ffa_version; + goto unlock; + } + + /* + * If the client driver tries to downgrade the version, we need to ask + * first if TEE supports it. + */ + if (FFA_MINOR_VERSION(ffa_req_version) < FFA_MINOR_VERSION(hyp_ffa_version)) { + arm_smccc_1_1_smc(FFA_VERSION, ffa_req_version, 0, + 0, 0, 0, 0, 0, + res); + if (res->a0 == FFA_RET_NOT_SUPPORTED) + goto unlock; + + hyp_ffa_version = ffa_req_version; + } + + if (hyp_ffa_post_init()) + res->a0 = FFA_RET_NOT_SUPPORTED; + else { + has_version_negotiated = true; + res->a0 = hyp_ffa_version; + } +unlock: + hyp_spin_unlock(&version_lock); +} + +static void do_ffa_part_get(struct arm_smccc_res *res, + struct kvm_cpu_context *ctxt) +{ + DECLARE_REG(u32, uuid0, ctxt, 1); + DECLARE_REG(u32, uuid1, ctxt, 2); + DECLARE_REG(u32, uuid2, ctxt, 3); + DECLARE_REG(u32, uuid3, ctxt, 4); + DECLARE_REG(u32, flags, ctxt, 5); + u32 count, partition_sz, copy_sz; + + hyp_spin_lock(&host_buffers.lock); + if (!host_buffers.rx) { + ffa_to_smccc_res(res, FFA_RET_BUSY); + goto out_unlock; + } + + arm_smccc_1_1_smc(FFA_PARTITION_INFO_GET, uuid0, uuid1, + uuid2, uuid3, flags, 0, 0, + res); + + if (res->a0 != FFA_SUCCESS) + goto out_unlock; + + count = res->a2; + if (!count) + goto out_unlock; + + if (hyp_ffa_version > FFA_VERSION_1_0) { + /* Get the number of partitions deployed in the system */ + if (flags & 0x1) + goto out_unlock; + + partition_sz = res->a3; + } else { + /* FFA_VERSION_1_0 lacks the size in the response */ + partition_sz = FFA_1_0_PARTITON_INFO_SZ; + } + + copy_sz = partition_sz * count; + if (copy_sz > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE) { + ffa_to_smccc_res(res, FFA_RET_ABORTED); + goto out_unlock; + } + + memcpy(host_buffers.rx, hyp_buffers.rx, copy_sz); +out_unlock: + hyp_spin_unlock(&host_buffers.lock); +} + bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt, u32 func_id) { struct arm_smccc_res res; @@ -660,6 +809,11 @@ bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt, u32 func_id) if (!is_ffa_call(func_id)) return false; + if (!has_version_negotiated && func_id != FFA_VERSION) { + ffa_to_smccc_error(&res, FFA_RET_INVALID_PARAMETERS); + goto out_handled; + } + switch (func_id) { case FFA_FEATURES: if (!do_ffa_features(&res, host_ctxt)) @@ -686,6 +840,12 @@ bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt, u32 func_id) case FFA_MEM_FRAG_TX: do_ffa_mem_frag_tx(&res, host_ctxt); goto out_handled; + case FFA_VERSION: + do_ffa_version(&res, host_ctxt); + goto out_handled; + case FFA_PARTITION_INFO_GET: + do_ffa_part_get(&res, host_ctxt); + goto out_handled; } if (ffa_call_supported(func_id)) @@ -700,13 +860,12 @@ out_handled: int hyp_ffa_init(void *pages) { struct arm_smccc_res res; - size_t min_rxtx_sz; void *tx, *rx; if (kvm_host_psci_config.smccc_version < ARM_SMCCC_VERSION_1_2) return 0; - arm_smccc_1_1_smc(FFA_VERSION, FFA_VERSION_1_0, 0, 0, 0, 0, 0, 0, &res); + arm_smccc_1_1_smc(FFA_VERSION, FFA_VERSION_1_1, 0, 0, 0, 0, 0, 0, &res); if (res.a0 == FFA_RET_NOT_SUPPORTED) return 0; @@ -726,34 +885,10 @@ int hyp_ffa_init(void *pages) if (FFA_MAJOR_VERSION(res.a0) != 1) return -EOPNOTSUPP; - arm_smccc_1_1_smc(FFA_ID_GET, 0, 0, 0, 0, 0, 0, 0, &res); - if (res.a0 != FFA_SUCCESS) - return -EOPNOTSUPP; - - if (res.a2 != HOST_FFA_ID) - return -EINVAL; - - arm_smccc_1_1_smc(FFA_FEATURES, FFA_FN64_RXTX_MAP, - 0, 0, 0, 0, 0, 0, &res); - if (res.a0 != FFA_SUCCESS) - return -EOPNOTSUPP; - - switch (res.a2) { - case FFA_FEAT_RXTX_MIN_SZ_4K: - min_rxtx_sz = SZ_4K; - break; - case FFA_FEAT_RXTX_MIN_SZ_16K: - min_rxtx_sz = SZ_16K; - break; - case FFA_FEAT_RXTX_MIN_SZ_64K: - min_rxtx_sz = SZ_64K; - break; - default: - return -EINVAL; - } - - if (min_rxtx_sz > PAGE_SIZE) - return -EOPNOTSUPP; + if (FFA_MINOR_VERSION(res.a0) < FFA_MINOR_VERSION(FFA_VERSION_1_1)) + hyp_ffa_version = res.a0; + else + hyp_ffa_version = FFA_VERSION_1_1; tx = pages; pages += KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE; @@ -776,5 +911,6 @@ int hyp_ffa_init(void *pages) .lock = __HYP_SPIN_LOCK_UNLOCKED, }; + version_lock = __HYP_SPIN_LOCK_UNLOCKED; return 0; } diff --git a/arch/arm64/kvm/hyp/nvhe/gen-hyprel.c b/arch/arm64/kvm/hyp/nvhe/gen-hyprel.c index 6bc88a756cb7..b63f4e1c1033 100644 --- a/arch/arm64/kvm/hyp/nvhe/gen-hyprel.c +++ b/arch/arm64/kvm/hyp/nvhe/gen-hyprel.c @@ -50,6 +50,9 @@ #ifndef R_AARCH64_ABS64 #define R_AARCH64_ABS64 257 #endif +#ifndef R_AARCH64_ABS32 +#define R_AARCH64_ABS32 258 +#endif #ifndef R_AARCH64_PREL64 #define R_AARCH64_PREL64 260 #endif @@ -383,6 +386,9 @@ static void emit_rela_section(Elf64_Shdr *sh_rela) case R_AARCH64_ABS64: emit_rela_abs64(rela, sh_orig_name); break; + /* Allow 32-bit absolute relocation, for kCFI type hashes. */ + case R_AARCH64_ABS32: + break; /* Allow position-relative data relocations. */ case R_AARCH64_PREL64: case R_AARCH64_PREL32: diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S index 135cfb294ee5..58f0cb2298cc 100644 --- a/arch/arm64/kvm/hyp/nvhe/host.S +++ b/arch/arm64/kvm/hyp/nvhe/host.S @@ -188,21 +188,15 @@ SYM_FUNC_END(__host_hvc) /* * Test whether the SP has overflowed, without corrupting a GPR. - * nVHE hypervisor stacks are aligned so that the PAGE_SHIFT bit + * nVHE hypervisor stacks are aligned so that the NVHE_STACK_SHIFT bit * of SP should always be 1. */ add sp, sp, x0 // sp' = sp + x0 sub x0, sp, x0 // x0' = sp' - x0 = (sp + x0) - x0 = sp - tbz x0, #PAGE_SHIFT, .L__hyp_sp_overflow\@ + tbz x0, #NVHE_STACK_SHIFT, .L__hyp_sp_overflow\@ sub x0, sp, x0 // x0'' = sp' - x0' = (sp + x0) - sp = x0 sub sp, sp, x0 // sp'' = sp' - x0 = (sp + x0) - x0 = sp - /* If a guest is loaded, panic out of it. */ - stp x0, x1, [sp, #-16]! - get_loaded_vcpu x0, x1 - cbnz x0, __guest_exit_panic - add sp, sp, #16 - /* * The panic may not be clean if the exception is taken before the host * context has been saved by __host_exit or after the hyp context has diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S index 2994878d68ea..f8af11189572 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S +++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S @@ -5,6 +5,7 @@ */ #include <linux/arm-smccc.h> +#include <linux/cfi_types.h> #include <linux/linkage.h> #include <asm/alternative.h> @@ -23,28 +24,25 @@ .align 11 SYM_CODE_START(__kvm_hyp_init) - ventry __invalid // Synchronous EL2t - ventry __invalid // IRQ EL2t - ventry __invalid // FIQ EL2t - ventry __invalid // Error EL2t + ventry . // Synchronous EL2t + ventry . // IRQ EL2t + ventry . // FIQ EL2t + ventry . // Error EL2t - ventry __invalid // Synchronous EL2h - ventry __invalid // IRQ EL2h - ventry __invalid // FIQ EL2h - ventry __invalid // Error EL2h + ventry . // Synchronous EL2h + ventry . // IRQ EL2h + ventry . // FIQ EL2h + ventry . // Error EL2h ventry __do_hyp_init // Synchronous 64-bit EL1 - ventry __invalid // IRQ 64-bit EL1 - ventry __invalid // FIQ 64-bit EL1 - ventry __invalid // Error 64-bit EL1 + ventry . // IRQ 64-bit EL1 + ventry . // FIQ 64-bit EL1 + ventry . // Error 64-bit EL1 - ventry __invalid // Synchronous 32-bit EL1 - ventry __invalid // IRQ 32-bit EL1 - ventry __invalid // FIQ 32-bit EL1 - ventry __invalid // Error 32-bit EL1 - -__invalid: - b . + ventry . // Synchronous 32-bit EL1 + ventry . // IRQ 32-bit EL1 + ventry . // FIQ 32-bit EL1 + ventry . // Error 32-bit EL1 /* * Only uses x0..x3 so as to not clobber callee-saved SMCCC registers. @@ -76,6 +74,17 @@ __do_hyp_init: SYM_CODE_END(__kvm_hyp_init) /* + * Initialize EL2 CPU state to sane values. + * + * HCR_EL2.E2H must have been initialized already. + */ +SYM_CODE_START_LOCAL(__kvm_init_el2_state) + init_el2_state // Clobbers x0..x2 + finalise_el2_state + ret +SYM_CODE_END(__kvm_init_el2_state) + +/* * Initialize the hypervisor in EL2. * * Only uses x0..x2 so as to not clobber callee-saved SMCCC registers @@ -101,9 +110,12 @@ SYM_CODE_START_LOCAL(___kvm_hyp_init) // TPIDR_EL2 is used to preserve x0 across the macro maze... isb msr tpidr_el2, x0 - init_el2_state - finalise_el2_state + str lr, [x0, #NVHE_INIT_TMP] + + bl __kvm_init_el2_state + mrs x0, tpidr_el2 + ldr lr, [x0, #NVHE_INIT_TMP] 1: ldr x1, [x0, #NVHE_INIT_TPIDR_EL2] @@ -129,7 +141,7 @@ alternative_else_nop_endif /* Invalidate the stale TLBs from Bootloader */ tlbi alle2 - tlbi vmalls12e1 + tlbi alle1 dsb sy mov_q x0, INIT_SCTLR_EL2_MMU_ON @@ -198,10 +210,9 @@ SYM_CODE_START_LOCAL(__kvm_hyp_init_cpu) 2: msr SPsel, #1 // We want to use SP_EL{1,2} - /* Initialize EL2 CPU state to sane values. */ - init_el2_state // Clobbers x0..x2 - finalise_el2_state - __init_el2_nvhe_prepare_eret + init_el2_hcr 0 + + bl __kvm_init_el2_state /* Enable MMU, set vectors and stack. */ mov x0, x28 @@ -265,33 +276,38 @@ alternative_else_nop_endif SYM_CODE_END(__kvm_handle_stub_hvc) -SYM_FUNC_START(__pkvm_init_switch_pgd) +/* + * void __pkvm_init_switch_pgd(phys_addr_t pgd, unsigned long sp, + * void (*fn)(void)); + * + * SYM_TYPED_FUNC_START() allows C to call this ID-mapped function indirectly + * using a physical pointer without triggering a kCFI failure. + */ +SYM_TYPED_FUNC_START(__pkvm_init_switch_pgd) /* Turn the MMU off */ pre_disable_mmu_workaround - mrs x2, sctlr_el2 - bic x3, x2, #SCTLR_ELx_M - msr sctlr_el2, x3 + mrs x3, sctlr_el2 + bic x4, x3, #SCTLR_ELx_M + msr sctlr_el2, x4 isb tlbi alle2 /* Install the new pgtables */ - ldr x3, [x0, #NVHE_INIT_PGD_PA] - phys_to_ttbr x4, x3 + phys_to_ttbr x5, x0 alternative_if ARM64_HAS_CNP - orr x4, x4, #TTBR_CNP_BIT + orr x5, x5, #TTBR_CNP_BIT alternative_else_nop_endif - msr ttbr0_el2, x4 + msr ttbr0_el2, x5 /* Set the new stack pointer */ - ldr x0, [x0, #NVHE_INIT_STACK_HYP_VA] - mov sp, x0 + mov sp, x1 /* And turn the MMU back on! */ dsb nsh isb - set_sctlr_el2 x2 - ret x1 + set_sctlr_el2 x3 + ret x2 SYM_FUNC_END(__pkvm_init_switch_pgd) .popsection diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 2385fd03ed87..2c37680d954c 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -5,6 +5,7 @@ */ #include <hyp/adjust_pc.h> +#include <hyp/switch.h> #include <asm/pgtable-types.h> #include <asm/kvm_asm.h> @@ -23,26 +24,115 @@ DEFINE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); void __kvm_hyp_host_forward_smc(struct kvm_cpu_context *host_ctxt); +static void __hyp_sve_save_guest(struct kvm_vcpu *vcpu) +{ + __vcpu_sys_reg(vcpu, ZCR_EL1) = read_sysreg_el1(SYS_ZCR); + /* + * On saving/restoring guest sve state, always use the maximum VL for + * the guest. The layout of the data when saving the sve state depends + * on the VL, so use a consistent (i.e., the maximum) guest VL. + */ + sve_cond_update_zcr_vq(vcpu_sve_max_vq(vcpu) - 1, SYS_ZCR_EL2); + __sve_save_state(vcpu_sve_pffr(vcpu), &vcpu->arch.ctxt.fp_regs.fpsr, true); + write_sysreg_s(sve_vq_from_vl(kvm_host_sve_max_vl) - 1, SYS_ZCR_EL2); +} + +static void __hyp_sve_restore_host(void) +{ + struct cpu_sve_state *sve_state = *host_data_ptr(sve_state); + + /* + * On saving/restoring host sve state, always use the maximum VL for + * the host. The layout of the data when saving the sve state depends + * on the VL, so use a consistent (i.e., the maximum) host VL. + * + * Note that this constrains the PE to the maximum shared VL + * that was discovered, if we wish to use larger VLs this will + * need to be revisited. + */ + write_sysreg_s(sve_vq_from_vl(kvm_host_sve_max_vl) - 1, SYS_ZCR_EL2); + __sve_restore_state(sve_state->sve_regs + sve_ffr_offset(kvm_host_sve_max_vl), + &sve_state->fpsr, + true); + write_sysreg_el1(sve_state->zcr_el1, SYS_ZCR); +} + +static void fpsimd_sve_flush(void) +{ + *host_data_ptr(fp_owner) = FP_STATE_HOST_OWNED; +} + +static void fpsimd_sve_sync(struct kvm_vcpu *vcpu) +{ + bool has_fpmr; + + if (!guest_owns_fp_regs()) + return; + + cpacr_clear_set(0, CPACR_EL1_FPEN | CPACR_EL1_ZEN); + isb(); + + if (vcpu_has_sve(vcpu)) + __hyp_sve_save_guest(vcpu); + else + __fpsimd_save_state(&vcpu->arch.ctxt.fp_regs); + + has_fpmr = kvm_has_fpmr(kern_hyp_va(vcpu->kvm)); + if (has_fpmr) + __vcpu_sys_reg(vcpu, FPMR) = read_sysreg_s(SYS_FPMR); + + if (system_supports_sve()) + __hyp_sve_restore_host(); + else + __fpsimd_restore_state(host_data_ptr(host_ctxt.fp_regs)); + + if (has_fpmr) + write_sysreg_s(*host_data_ptr(fpmr), SYS_FPMR); + + *host_data_ptr(fp_owner) = FP_STATE_HOST_OWNED; +} + +static void flush_debug_state(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + + hyp_vcpu->vcpu.arch.debug_owner = host_vcpu->arch.debug_owner; + + if (kvm_guest_owns_debug_regs(&hyp_vcpu->vcpu)) + hyp_vcpu->vcpu.arch.vcpu_debug_state = host_vcpu->arch.vcpu_debug_state; + else if (kvm_host_owns_debug_regs(&hyp_vcpu->vcpu)) + hyp_vcpu->vcpu.arch.external_debug_state = host_vcpu->arch.external_debug_state; +} + +static void sync_debug_state(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + + if (kvm_guest_owns_debug_regs(&hyp_vcpu->vcpu)) + host_vcpu->arch.vcpu_debug_state = hyp_vcpu->vcpu.arch.vcpu_debug_state; + else if (kvm_host_owns_debug_regs(&hyp_vcpu->vcpu)) + host_vcpu->arch.external_debug_state = hyp_vcpu->vcpu.arch.external_debug_state; +} + static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) { struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + fpsimd_sve_flush(); + flush_debug_state(hyp_vcpu); + hyp_vcpu->vcpu.arch.ctxt = host_vcpu->arch.ctxt; hyp_vcpu->vcpu.arch.sve_state = kern_hyp_va(host_vcpu->arch.sve_state); - hyp_vcpu->vcpu.arch.sve_max_vl = host_vcpu->arch.sve_max_vl; - - hyp_vcpu->vcpu.arch.hw_mmu = host_vcpu->arch.hw_mmu; + /* Limit guest vector length to the maximum supported by the host. */ + hyp_vcpu->vcpu.arch.sve_max_vl = min(host_vcpu->arch.sve_max_vl, kvm_host_sve_max_vl); - hyp_vcpu->vcpu.arch.hcr_el2 = host_vcpu->arch.hcr_el2; hyp_vcpu->vcpu.arch.mdcr_el2 = host_vcpu->arch.mdcr_el2; - hyp_vcpu->vcpu.arch.cptr_el2 = host_vcpu->arch.cptr_el2; + hyp_vcpu->vcpu.arch.hcr_el2 &= ~(HCR_TWI | HCR_TWE); + hyp_vcpu->vcpu.arch.hcr_el2 |= READ_ONCE(host_vcpu->arch.hcr_el2) & + (HCR_TWI | HCR_TWE); hyp_vcpu->vcpu.arch.iflags = host_vcpu->arch.iflags; - hyp_vcpu->vcpu.arch.fp_state = host_vcpu->arch.fp_state; - - hyp_vcpu->vcpu.arch.debug_ptr = kern_hyp_va(host_vcpu->arch.debug_ptr); - hyp_vcpu->vcpu.arch.host_fpsimd_state = host_vcpu->arch.host_fpsimd_state; hyp_vcpu->vcpu.arch.vsesr_el2 = host_vcpu->arch.vsesr_el2; @@ -56,35 +146,74 @@ static void sync_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) struct vgic_v3_cpu_if *host_cpu_if = &host_vcpu->arch.vgic_cpu.vgic_v3; unsigned int i; + fpsimd_sve_sync(&hyp_vcpu->vcpu); + sync_debug_state(hyp_vcpu); + host_vcpu->arch.ctxt = hyp_vcpu->vcpu.arch.ctxt; host_vcpu->arch.hcr_el2 = hyp_vcpu->vcpu.arch.hcr_el2; - host_vcpu->arch.cptr_el2 = hyp_vcpu->vcpu.arch.cptr_el2; host_vcpu->arch.fault = hyp_vcpu->vcpu.arch.fault; host_vcpu->arch.iflags = hyp_vcpu->vcpu.arch.iflags; - host_vcpu->arch.fp_state = hyp_vcpu->vcpu.arch.fp_state; host_cpu_if->vgic_hcr = hyp_cpu_if->vgic_hcr; for (i = 0; i < hyp_cpu_if->used_lrs; ++i) host_cpu_if->vgic_lr[i] = hyp_cpu_if->vgic_lr[i]; } +static void handle___pkvm_vcpu_load(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); + DECLARE_REG(unsigned int, vcpu_idx, host_ctxt, 2); + DECLARE_REG(u64, hcr_el2, host_ctxt, 3); + struct pkvm_hyp_vcpu *hyp_vcpu; + + if (!is_protected_kvm_enabled()) + return; + + hyp_vcpu = pkvm_load_hyp_vcpu(handle, vcpu_idx); + if (!hyp_vcpu) + return; + + if (pkvm_hyp_vcpu_is_protected(hyp_vcpu)) { + /* Propagate WFx trapping flags */ + hyp_vcpu->vcpu.arch.hcr_el2 &= ~(HCR_TWE | HCR_TWI); + hyp_vcpu->vcpu.arch.hcr_el2 |= hcr_el2 & (HCR_TWE | HCR_TWI); + } +} + +static void handle___pkvm_vcpu_put(struct kvm_cpu_context *host_ctxt) +{ + struct pkvm_hyp_vcpu *hyp_vcpu; + + if (!is_protected_kvm_enabled()) + return; + + hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); + if (hyp_vcpu) + pkvm_put_hyp_vcpu(hyp_vcpu); +} + static void handle___kvm_vcpu_run(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(struct kvm_vcpu *, host_vcpu, host_ctxt, 1); int ret; - host_vcpu = kern_hyp_va(host_vcpu); - if (unlikely(is_protected_kvm_enabled())) { - struct pkvm_hyp_vcpu *hyp_vcpu; - struct kvm *host_kvm; + struct pkvm_hyp_vcpu *hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); + + /* + * KVM (and pKVM) doesn't support SME guests for now, and + * ensures that SME features aren't enabled in pstate when + * loading a vcpu. Therefore, if SME features enabled the host + * is misbehaving. + */ + if (unlikely(system_supports_sme() && read_sysreg_s(SYS_SVCR))) { + ret = -EINVAL; + goto out; + } - host_kvm = kern_hyp_va(host_vcpu->kvm); - hyp_vcpu = pkvm_load_hyp_vcpu(host_kvm->arch.pkvm.handle, - host_vcpu->vcpu_idx); if (!hyp_vcpu) { ret = -EINVAL; goto out; @@ -95,12 +224,145 @@ static void handle___kvm_vcpu_run(struct kvm_cpu_context *host_ctxt) ret = __kvm_vcpu_run(&hyp_vcpu->vcpu); sync_hyp_vcpu(hyp_vcpu); - pkvm_put_hyp_vcpu(hyp_vcpu); } else { + struct kvm_vcpu *vcpu = kern_hyp_va(host_vcpu); + /* The host is fully trusted, run its vCPU directly. */ - ret = __kvm_vcpu_run(host_vcpu); + fpsimd_lazy_switch_to_guest(vcpu); + ret = __kvm_vcpu_run(vcpu); + fpsimd_lazy_switch_to_host(vcpu); } +out: + cpu_reg(host_ctxt, 1) = ret; +} + +static int pkvm_refill_memcache(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + + return refill_memcache(&hyp_vcpu->vcpu.arch.pkvm_memcache, + host_vcpu->arch.pkvm_memcache.nr_pages, + &host_vcpu->arch.pkvm_memcache); +} +static void handle___pkvm_host_share_guest(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(u64, pfn, host_ctxt, 1); + DECLARE_REG(u64, gfn, host_ctxt, 2); + DECLARE_REG(enum kvm_pgtable_prot, prot, host_ctxt, 3); + struct pkvm_hyp_vcpu *hyp_vcpu; + int ret = -EINVAL; + + if (!is_protected_kvm_enabled()) + goto out; + + hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); + if (!hyp_vcpu || pkvm_hyp_vcpu_is_protected(hyp_vcpu)) + goto out; + + ret = pkvm_refill_memcache(hyp_vcpu); + if (ret) + goto out; + + ret = __pkvm_host_share_guest(pfn, gfn, hyp_vcpu, prot); +out: + cpu_reg(host_ctxt, 1) = ret; +} + +static void handle___pkvm_host_unshare_guest(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); + DECLARE_REG(u64, gfn, host_ctxt, 2); + struct pkvm_hyp_vm *hyp_vm; + int ret = -EINVAL; + + if (!is_protected_kvm_enabled()) + goto out; + + hyp_vm = get_np_pkvm_hyp_vm(handle); + if (!hyp_vm) + goto out; + + ret = __pkvm_host_unshare_guest(gfn, hyp_vm); + put_pkvm_hyp_vm(hyp_vm); +out: + cpu_reg(host_ctxt, 1) = ret; +} + +static void handle___pkvm_host_relax_perms_guest(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(u64, gfn, host_ctxt, 1); + DECLARE_REG(enum kvm_pgtable_prot, prot, host_ctxt, 2); + struct pkvm_hyp_vcpu *hyp_vcpu; + int ret = -EINVAL; + + if (!is_protected_kvm_enabled()) + goto out; + + hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); + if (!hyp_vcpu || pkvm_hyp_vcpu_is_protected(hyp_vcpu)) + goto out; + + ret = __pkvm_host_relax_perms_guest(gfn, hyp_vcpu, prot); +out: + cpu_reg(host_ctxt, 1) = ret; +} + +static void handle___pkvm_host_wrprotect_guest(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); + DECLARE_REG(u64, gfn, host_ctxt, 2); + struct pkvm_hyp_vm *hyp_vm; + int ret = -EINVAL; + + if (!is_protected_kvm_enabled()) + goto out; + + hyp_vm = get_np_pkvm_hyp_vm(handle); + if (!hyp_vm) + goto out; + + ret = __pkvm_host_wrprotect_guest(gfn, hyp_vm); + put_pkvm_hyp_vm(hyp_vm); +out: + cpu_reg(host_ctxt, 1) = ret; +} + +static void handle___pkvm_host_test_clear_young_guest(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); + DECLARE_REG(u64, gfn, host_ctxt, 2); + DECLARE_REG(bool, mkold, host_ctxt, 3); + struct pkvm_hyp_vm *hyp_vm; + int ret = -EINVAL; + + if (!is_protected_kvm_enabled()) + goto out; + + hyp_vm = get_np_pkvm_hyp_vm(handle); + if (!hyp_vm) + goto out; + + ret = __pkvm_host_test_clear_young_guest(gfn, mkold, hyp_vm); + put_pkvm_hyp_vm(hyp_vm); +out: + cpu_reg(host_ctxt, 1) = ret; +} + +static void handle___pkvm_host_mkyoung_guest(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(u64, gfn, host_ctxt, 1); + struct pkvm_hyp_vcpu *hyp_vcpu; + int ret = -EINVAL; + + if (!is_protected_kvm_enabled()) + goto out; + + hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); + if (!hyp_vcpu || pkvm_hyp_vcpu_is_protected(hyp_vcpu)) + goto out; + + ret = __pkvm_host_mkyoung_guest(gfn, hyp_vcpu); out: cpu_reg(host_ctxt, 1) = ret; } @@ -152,6 +414,22 @@ static void handle___kvm_tlb_flush_vmid(struct kvm_cpu_context *host_ctxt) __kvm_tlb_flush_vmid(kern_hyp_va(mmu)); } +static void handle___pkvm_tlb_flush_vmid(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); + struct pkvm_hyp_vm *hyp_vm; + + if (!is_protected_kvm_enabled()) + return; + + hyp_vm = get_np_pkvm_hyp_vm(handle); + if (!hyp_vm) + return; + + __kvm_tlb_flush_vmid(&hyp_vm->kvm.arch.mmu); + put_pkvm_hyp_vm(hyp_vm); +} + static void handle___kvm_flush_cpu_context(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1); @@ -178,38 +456,23 @@ static void handle___vgic_v3_get_gic_config(struct kvm_cpu_context *host_ctxt) cpu_reg(host_ctxt, 1) = __vgic_v3_get_gic_config(); } -static void handle___vgic_v3_read_vmcr(struct kvm_cpu_context *host_ctxt) -{ - cpu_reg(host_ctxt, 1) = __vgic_v3_read_vmcr(); -} - -static void handle___vgic_v3_write_vmcr(struct kvm_cpu_context *host_ctxt) -{ - __vgic_v3_write_vmcr(cpu_reg(host_ctxt, 1)); -} - static void handle___vgic_v3_init_lrs(struct kvm_cpu_context *host_ctxt) { __vgic_v3_init_lrs(); } -static void handle___kvm_get_mdcr_el2(struct kvm_cpu_context *host_ctxt) -{ - cpu_reg(host_ctxt, 1) = __kvm_get_mdcr_el2(); -} - -static void handle___vgic_v3_save_aprs(struct kvm_cpu_context *host_ctxt) +static void handle___vgic_v3_save_vmcr_aprs(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(struct vgic_v3_cpu_if *, cpu_if, host_ctxt, 1); - __vgic_v3_save_aprs(kern_hyp_va(cpu_if)); + __vgic_v3_save_vmcr_aprs(kern_hyp_va(cpu_if)); } -static void handle___vgic_v3_restore_aprs(struct kvm_cpu_context *host_ctxt) +static void handle___vgic_v3_restore_vmcr_aprs(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(struct vgic_v3_cpu_if *, cpu_if, host_ctxt, 1); - __vgic_v3_restore_aprs(kern_hyp_va(cpu_if)); + __vgic_v3_restore_vmcr_aprs(kern_hyp_va(cpu_if)); } static void handle___pkvm_init(struct kvm_cpu_context *host_ctxt) @@ -280,13 +543,6 @@ static void handle___pkvm_prot_finalize(struct kvm_cpu_context *host_ctxt) cpu_reg(host_ctxt, 1) = __pkvm_prot_finalize(); } -static void handle___pkvm_vcpu_init_traps(struct kvm_cpu_context *host_ctxt) -{ - DECLARE_REG(struct kvm_vcpu *, vcpu, host_ctxt, 1); - - __pkvm_vcpu_init_traps(kern_hyp_va(vcpu)); -} - static void handle___pkvm_init_vm(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(struct kvm *, host_kvm, host_ctxt, 1); @@ -320,7 +576,6 @@ typedef void (*hcall_t)(struct kvm_cpu_context *); static const hcall_t host_hcall[] = { /* ___kvm_hyp_init */ - HANDLE_FUNC(__kvm_get_mdcr_el2), HANDLE_FUNC(__pkvm_init), HANDLE_FUNC(__pkvm_create_private_mapping), HANDLE_FUNC(__pkvm_cpu_set_vector), @@ -331,6 +586,12 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__pkvm_host_share_hyp), HANDLE_FUNC(__pkvm_host_unshare_hyp), + HANDLE_FUNC(__pkvm_host_share_guest), + HANDLE_FUNC(__pkvm_host_unshare_guest), + HANDLE_FUNC(__pkvm_host_relax_perms_guest), + HANDLE_FUNC(__pkvm_host_wrprotect_guest), + HANDLE_FUNC(__pkvm_host_test_clear_young_guest), + HANDLE_FUNC(__pkvm_host_mkyoung_guest), HANDLE_FUNC(__kvm_adjust_pc), HANDLE_FUNC(__kvm_vcpu_run), HANDLE_FUNC(__kvm_flush_vm_context), @@ -340,14 +601,14 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__kvm_tlb_flush_vmid_range), HANDLE_FUNC(__kvm_flush_cpu_context), HANDLE_FUNC(__kvm_timer_set_cntvoff), - HANDLE_FUNC(__vgic_v3_read_vmcr), - HANDLE_FUNC(__vgic_v3_write_vmcr), - HANDLE_FUNC(__vgic_v3_save_aprs), - HANDLE_FUNC(__vgic_v3_restore_aprs), - HANDLE_FUNC(__pkvm_vcpu_init_traps), + HANDLE_FUNC(__vgic_v3_save_vmcr_aprs), + HANDLE_FUNC(__vgic_v3_restore_vmcr_aprs), HANDLE_FUNC(__pkvm_init_vm), HANDLE_FUNC(__pkvm_init_vcpu), HANDLE_FUNC(__pkvm_teardown_vm), + HANDLE_FUNC(__pkvm_vcpu_load), + HANDLE_FUNC(__pkvm_vcpu_put), + HANDLE_FUNC(__pkvm_tlb_flush_vmid), }; static void handle_host_hcall(struct kvm_cpu_context *host_ctxt) @@ -419,15 +680,6 @@ void handle_trap(struct kvm_cpu_context *host_ctxt) case ESR_ELx_EC_SMC64: handle_host_smc(host_ctxt); break; - case ESR_ELx_EC_SVE: - if (has_hvhe()) - sysreg_clear_set(cpacr_el1, 0, (CPACR_EL1_ZEN_EL1EN | - CPACR_EL1_ZEN_EL0EN)); - else - sysreg_clear_set(cptr_el2, CPTR_EL2_TZ, 0); - isb(); - sve_cond_update_zcr_vq(ZCR_ELx_LEN_MASK, SYS_ZCR_EL2); - break; case ESR_ELx_EC_IABT_LOW: case ESR_ELx_EC_DABT_LOW: handle_host_mem_abort(host_ctxt); diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 861c76021a25..f34f11c720d7 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -201,8 +201,8 @@ static void *guest_s2_zalloc_page(void *mc) memset(addr, 0, PAGE_SIZE); p = hyp_virt_to_page(addr); - memset(p, 0, sizeof(*p)); p->refcount = 1; + p->order = 0; return addr; } @@ -266,8 +266,9 @@ int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd) return 0; } -void reclaim_guest_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc) +void reclaim_pgtable_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc) { + struct hyp_page *page; void *addr; /* Dump all pgtable pages in the hyp_pool */ @@ -279,7 +280,9 @@ void reclaim_guest_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc) /* Drain the hyp_pool into the memcache */ addr = hyp_alloc_pages(&vm->pool, 0); while (addr) { - memset(hyp_virt_to_page(addr), 0, sizeof(struct hyp_page)); + page = hyp_virt_to_page(addr); + page->refcount = 0; + page->order = 0; push_hyp_memcache(mc, addr, hyp_virt_to_phys); WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(addr), 1)); addr = hyp_alloc_pages(&vm->pool, 0); @@ -382,19 +385,28 @@ bool addr_is_memory(phys_addr_t phys) return !!find_mem_range(phys, &range); } -static bool addr_is_allowed_memory(phys_addr_t phys) +static bool is_in_mem_range(u64 addr, struct kvm_mem_range *range) +{ + return range->start <= addr && addr < range->end; +} + +static int check_range_allowed_memory(u64 start, u64 end) { struct memblock_region *reg; struct kvm_mem_range range; - reg = find_mem_range(phys, &range); + /* + * Callers can't check the state of a range that overlaps memory and + * MMIO regions, so ensure [start, end[ is in the same kvm_mem_range. + */ + reg = find_mem_range(start, &range); + if (!is_in_mem_range(end - 1, &range)) + return -EINVAL; - return reg && !(reg->flags & MEMBLOCK_NOMAP); -} + if (!reg || reg->flags & MEMBLOCK_NOMAP) + return -EPERM; -static bool is_in_mem_range(u64 addr, struct kvm_mem_range *range) -{ - return range->start <= addr && addr < range->end; + return 0; } static bool range_is_memory(u64 start, u64 end) @@ -454,8 +466,10 @@ static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range) if (kvm_pte_valid(pte)) return -EAGAIN; - if (pte) + if (pte) { + WARN_ON(addr_is_memory(addr) && hyp_phys_to_page(addr)->host_state != PKVM_NOPAGE); return -EPERM; + } do { u64 granule = kvm_granule_size(level); @@ -477,10 +491,33 @@ int host_stage2_idmap_locked(phys_addr_t addr, u64 size, return host_stage2_try(__host_stage2_idmap, addr, addr + size, prot); } +static void __host_update_page_state(phys_addr_t addr, u64 size, enum pkvm_page_state state) +{ + phys_addr_t end = addr + size; + + for (; addr < end; addr += PAGE_SIZE) + hyp_phys_to_page(addr)->host_state = state; +} + int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id) { - return host_stage2_try(kvm_pgtable_stage2_set_owner, &host_mmu.pgt, - addr, size, &host_s2_pool, owner_id); + int ret; + + if (!addr_is_memory(addr)) + return -EPERM; + + ret = host_stage2_try(kvm_pgtable_stage2_set_owner, &host_mmu.pgt, + addr, size, &host_s2_pool, owner_id); + if (ret) + return ret; + + /* Don't forget to update the vmemmap tracking for the host */ + if (owner_id == PKVM_ID_HOST) + __host_update_page_state(addr, size, PKVM_PAGE_OWNED); + else + __host_update_page_state(addr, size, PKVM_NOPAGE); + + return 0; } static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot prot) @@ -533,46 +570,19 @@ void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt) int ret = 0; esr = read_sysreg_el2(SYS_ESR); - BUG_ON(!__get_fault_info(esr, &fault)); + if (!__get_fault_info(esr, &fault)) { + /* + * We've presumably raced with a page-table change which caused + * AT to fail, try again. + */ + return; + } addr = (fault.hpfar_el2 & HPFAR_MASK) << 8; ret = host_stage2_idmap(addr); BUG_ON(ret && ret != -EAGAIN); } -struct pkvm_mem_transition { - u64 nr_pages; - - struct { - enum pkvm_component_id id; - /* Address in the initiator's address space */ - u64 addr; - - union { - struct { - /* Address in the completer's address space */ - u64 completer_addr; - } host; - struct { - u64 completer_addr; - } hyp; - }; - } initiator; - - struct { - enum pkvm_component_id id; - } completer; -}; - -struct pkvm_mem_share { - const struct pkvm_mem_transition tx; - const enum kvm_pgtable_prot completer_prot; -}; - -struct pkvm_mem_donation { - const struct pkvm_mem_transition tx; -}; - struct check_walk_data { enum pkvm_page_state desired; enum pkvm_page_state (*get_page_state)(kvm_pte_t pte, u64 addr); @@ -598,115 +608,38 @@ static int check_page_state_range(struct kvm_pgtable *pgt, u64 addr, u64 size, return kvm_pgtable_walk(pgt, addr, size, &walker); } -static enum pkvm_page_state host_get_page_state(kvm_pte_t pte, u64 addr) -{ - if (!addr_is_allowed_memory(addr)) - return PKVM_NOPAGE; - - if (!kvm_pte_valid(pte) && pte) - return PKVM_NOPAGE; - - return pkvm_getstate(kvm_pgtable_stage2_pte_prot(pte)); -} - static int __host_check_page_state_range(u64 addr, u64 size, enum pkvm_page_state state) { - struct check_walk_data d = { - .desired = state, - .get_page_state = host_get_page_state, - }; + u64 end = addr + size; + int ret; + + ret = check_range_allowed_memory(addr, end); + if (ret) + return ret; hyp_assert_lock_held(&host_mmu.lock); - return check_page_state_range(&host_mmu.pgt, addr, size, &d); + for (; addr < end; addr += PAGE_SIZE) { + if (hyp_phys_to_page(addr)->host_state != state) + return -EPERM; + } + + return 0; } static int __host_set_page_state_range(u64 addr, u64 size, enum pkvm_page_state state) { - enum kvm_pgtable_prot prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, state); - - return host_stage2_idmap_locked(addr, size, prot); -} - -static int host_request_owned_transition(u64 *completer_addr, - const struct pkvm_mem_transition *tx) -{ - u64 size = tx->nr_pages * PAGE_SIZE; - u64 addr = tx->initiator.addr; - - *completer_addr = tx->initiator.host.completer_addr; - return __host_check_page_state_range(addr, size, PKVM_PAGE_OWNED); -} - -static int host_request_unshare(u64 *completer_addr, - const struct pkvm_mem_transition *tx) -{ - u64 size = tx->nr_pages * PAGE_SIZE; - u64 addr = tx->initiator.addr; - - *completer_addr = tx->initiator.host.completer_addr; - return __host_check_page_state_range(addr, size, PKVM_PAGE_SHARED_OWNED); -} - -static int host_initiate_share(u64 *completer_addr, - const struct pkvm_mem_transition *tx) -{ - u64 size = tx->nr_pages * PAGE_SIZE; - u64 addr = tx->initiator.addr; - - *completer_addr = tx->initiator.host.completer_addr; - return __host_set_page_state_range(addr, size, PKVM_PAGE_SHARED_OWNED); -} - -static int host_initiate_unshare(u64 *completer_addr, - const struct pkvm_mem_transition *tx) -{ - u64 size = tx->nr_pages * PAGE_SIZE; - u64 addr = tx->initiator.addr; - - *completer_addr = tx->initiator.host.completer_addr; - return __host_set_page_state_range(addr, size, PKVM_PAGE_OWNED); -} - -static int host_initiate_donation(u64 *completer_addr, - const struct pkvm_mem_transition *tx) -{ - u8 owner_id = tx->completer.id; - u64 size = tx->nr_pages * PAGE_SIZE; - - *completer_addr = tx->initiator.host.completer_addr; - return host_stage2_set_owner_locked(tx->initiator.addr, size, owner_id); -} - -static bool __host_ack_skip_pgtable_check(const struct pkvm_mem_transition *tx) -{ - return !(IS_ENABLED(CONFIG_NVHE_EL2_DEBUG) || - tx->initiator.id != PKVM_ID_HYP); -} - -static int __host_ack_transition(u64 addr, const struct pkvm_mem_transition *tx, - enum pkvm_page_state state) -{ - u64 size = tx->nr_pages * PAGE_SIZE; - - if (__host_ack_skip_pgtable_check(tx)) - return 0; - - return __host_check_page_state_range(addr, size, state); -} + if (hyp_phys_to_page(addr)->host_state == PKVM_NOPAGE) { + int ret = host_stage2_idmap_locked(addr, size, PKVM_HOST_MEM_PROT); -static int host_ack_donation(u64 addr, const struct pkvm_mem_transition *tx) -{ - return __host_ack_transition(addr, tx, PKVM_NOPAGE); -} + if (ret) + return ret; + } -static int host_complete_donation(u64 addr, const struct pkvm_mem_transition *tx) -{ - u64 size = tx->nr_pages * PAGE_SIZE; - u8 host_id = tx->completer.id; + __host_update_page_state(addr, size, state); - return host_stage2_set_owner_locked(addr, size, host_id); + return 0; } static enum pkvm_page_state hyp_get_page_state(kvm_pte_t pte, u64 addr) @@ -729,577 +662,425 @@ static int __hyp_check_page_state_range(u64 addr, u64 size, return check_page_state_range(&pkvm_pgtable, addr, size, &d); } -static int hyp_request_donation(u64 *completer_addr, - const struct pkvm_mem_transition *tx) +static enum pkvm_page_state guest_get_page_state(kvm_pte_t pte, u64 addr) { - u64 size = tx->nr_pages * PAGE_SIZE; - u64 addr = tx->initiator.addr; + if (!kvm_pte_valid(pte)) + return PKVM_NOPAGE; - *completer_addr = tx->initiator.hyp.completer_addr; - return __hyp_check_page_state_range(addr, size, PKVM_PAGE_OWNED); + return pkvm_getstate(kvm_pgtable_stage2_pte_prot(pte)); } -static int hyp_initiate_donation(u64 *completer_addr, - const struct pkvm_mem_transition *tx) +static int __guest_check_page_state_range(struct pkvm_hyp_vcpu *vcpu, u64 addr, + u64 size, enum pkvm_page_state state) { - u64 size = tx->nr_pages * PAGE_SIZE; - int ret; + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + struct check_walk_data d = { + .desired = state, + .get_page_state = guest_get_page_state, + }; - *completer_addr = tx->initiator.hyp.completer_addr; - ret = kvm_pgtable_hyp_unmap(&pkvm_pgtable, tx->initiator.addr, size); - return (ret != size) ? -EFAULT : 0; + hyp_assert_lock_held(&vm->lock); + return check_page_state_range(&vm->pgt, addr, size, &d); } -static bool __hyp_ack_skip_pgtable_check(const struct pkvm_mem_transition *tx) +int __pkvm_host_share_hyp(u64 pfn) { - return !(IS_ENABLED(CONFIG_NVHE_EL2_DEBUG) || - tx->initiator.id != PKVM_ID_HOST); -} + u64 phys = hyp_pfn_to_phys(pfn); + void *virt = __hyp_va(phys); + enum kvm_pgtable_prot prot; + u64 size = PAGE_SIZE; + int ret; -static int hyp_ack_share(u64 addr, const struct pkvm_mem_transition *tx, - enum kvm_pgtable_prot perms) -{ - u64 size = tx->nr_pages * PAGE_SIZE; + host_lock_component(); + hyp_lock_component(); - if (perms != PAGE_HYP) - return -EPERM; + ret = __host_check_page_state_range(phys, size, PKVM_PAGE_OWNED); + if (ret) + goto unlock; + if (IS_ENABLED(CONFIG_NVHE_EL2_DEBUG)) { + ret = __hyp_check_page_state_range((u64)virt, size, PKVM_NOPAGE); + if (ret) + goto unlock; + } + + prot = pkvm_mkstate(PAGE_HYP, PKVM_PAGE_SHARED_BORROWED); + WARN_ON(pkvm_create_mappings_locked(virt, virt + size, prot)); + WARN_ON(__host_set_page_state_range(phys, size, PKVM_PAGE_SHARED_OWNED)); - if (__hyp_ack_skip_pgtable_check(tx)) - return 0; +unlock: + hyp_unlock_component(); + host_unlock_component(); - return __hyp_check_page_state_range(addr, size, PKVM_NOPAGE); + return ret; } -static int hyp_ack_unshare(u64 addr, const struct pkvm_mem_transition *tx) +int __pkvm_host_unshare_hyp(u64 pfn) { - u64 size = tx->nr_pages * PAGE_SIZE; - - if (tx->initiator.id == PKVM_ID_HOST && hyp_page_count((void *)addr)) - return -EBUSY; + u64 phys = hyp_pfn_to_phys(pfn); + u64 virt = (u64)__hyp_va(phys); + u64 size = PAGE_SIZE; + int ret; - if (__hyp_ack_skip_pgtable_check(tx)) - return 0; + host_lock_component(); + hyp_lock_component(); - return __hyp_check_page_state_range(addr, size, - PKVM_PAGE_SHARED_BORROWED); -} + ret = __host_check_page_state_range(phys, size, PKVM_PAGE_SHARED_OWNED); + if (ret) + goto unlock; + ret = __hyp_check_page_state_range(virt, size, PKVM_PAGE_SHARED_BORROWED); + if (ret) + goto unlock; + if (hyp_page_count((void *)virt)) { + ret = -EBUSY; + goto unlock; + } -static int hyp_ack_donation(u64 addr, const struct pkvm_mem_transition *tx) -{ - u64 size = tx->nr_pages * PAGE_SIZE; + WARN_ON(kvm_pgtable_hyp_unmap(&pkvm_pgtable, virt, size) != size); + WARN_ON(__host_set_page_state_range(phys, size, PKVM_PAGE_OWNED)); - if (__hyp_ack_skip_pgtable_check(tx)) - return 0; +unlock: + hyp_unlock_component(); + host_unlock_component(); - return __hyp_check_page_state_range(addr, size, PKVM_NOPAGE); + return ret; } -static int hyp_complete_share(u64 addr, const struct pkvm_mem_transition *tx, - enum kvm_pgtable_prot perms) +int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages) { - void *start = (void *)addr, *end = start + (tx->nr_pages * PAGE_SIZE); + u64 phys = hyp_pfn_to_phys(pfn); + u64 size = PAGE_SIZE * nr_pages; + void *virt = __hyp_va(phys); enum kvm_pgtable_prot prot; + int ret; - prot = pkvm_mkstate(perms, PKVM_PAGE_SHARED_BORROWED); - return pkvm_create_mappings_locked(start, end, prot); -} + host_lock_component(); + hyp_lock_component(); -static int hyp_complete_unshare(u64 addr, const struct pkvm_mem_transition *tx) -{ - u64 size = tx->nr_pages * PAGE_SIZE; - int ret = kvm_pgtable_hyp_unmap(&pkvm_pgtable, addr, size); + ret = __host_check_page_state_range(phys, size, PKVM_PAGE_OWNED); + if (ret) + goto unlock; + if (IS_ENABLED(CONFIG_NVHE_EL2_DEBUG)) { + ret = __hyp_check_page_state_range((u64)virt, size, PKVM_NOPAGE); + if (ret) + goto unlock; + } - return (ret != size) ? -EFAULT : 0; -} + prot = pkvm_mkstate(PAGE_HYP, PKVM_PAGE_OWNED); + WARN_ON(pkvm_create_mappings_locked(virt, virt + size, prot)); + WARN_ON(host_stage2_set_owner_locked(phys, size, PKVM_ID_HYP)); -static int hyp_complete_donation(u64 addr, - const struct pkvm_mem_transition *tx) -{ - void *start = (void *)addr, *end = start + (tx->nr_pages * PAGE_SIZE); - enum kvm_pgtable_prot prot = pkvm_mkstate(PAGE_HYP, PKVM_PAGE_OWNED); +unlock: + hyp_unlock_component(); + host_unlock_component(); - return pkvm_create_mappings_locked(start, end, prot); + return ret; } -static int check_share(struct pkvm_mem_share *share) +int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages) { - const struct pkvm_mem_transition *tx = &share->tx; - u64 completer_addr; + u64 phys = hyp_pfn_to_phys(pfn); + u64 size = PAGE_SIZE * nr_pages; + u64 virt = (u64)__hyp_va(phys); int ret; - switch (tx->initiator.id) { - case PKVM_ID_HOST: - ret = host_request_owned_transition(&completer_addr, tx); - break; - default: - ret = -EINVAL; - } + host_lock_component(); + hyp_lock_component(); + ret = __hyp_check_page_state_range(virt, size, PKVM_PAGE_OWNED); if (ret) - return ret; - - switch (tx->completer.id) { - case PKVM_ID_HYP: - ret = hyp_ack_share(completer_addr, tx, share->completer_prot); - break; - case PKVM_ID_FFA: - /* - * We only check the host; the secure side will check the other - * end when we forward the FFA call. - */ - ret = 0; - break; - default: - ret = -EINVAL; + goto unlock; + if (IS_ENABLED(CONFIG_NVHE_EL2_DEBUG)) { + ret = __host_check_page_state_range(phys, size, PKVM_NOPAGE); + if (ret) + goto unlock; } + WARN_ON(kvm_pgtable_hyp_unmap(&pkvm_pgtable, virt, size) != size); + WARN_ON(host_stage2_set_owner_locked(phys, size, PKVM_ID_HOST)); + +unlock: + hyp_unlock_component(); + host_unlock_component(); + return ret; } -static int __do_share(struct pkvm_mem_share *share) +int hyp_pin_shared_mem(void *from, void *to) { - const struct pkvm_mem_transition *tx = &share->tx; - u64 completer_addr; + u64 cur, start = ALIGN_DOWN((u64)from, PAGE_SIZE); + u64 end = PAGE_ALIGN((u64)to); + u64 size = end - start; int ret; - switch (tx->initiator.id) { - case PKVM_ID_HOST: - ret = host_initiate_share(&completer_addr, tx); - break; - default: - ret = -EINVAL; - } + host_lock_component(); + hyp_lock_component(); + ret = __host_check_page_state_range(__hyp_pa(start), size, + PKVM_PAGE_SHARED_OWNED); if (ret) - return ret; - - switch (tx->completer.id) { - case PKVM_ID_HYP: - ret = hyp_complete_share(completer_addr, tx, share->completer_prot); - break; - case PKVM_ID_FFA: - /* - * We're not responsible for any secure page-tables, so there's - * nothing to do here. - */ - ret = 0; - break; - default: - ret = -EINVAL; - } + goto unlock; - return ret; -} + ret = __hyp_check_page_state_range(start, size, + PKVM_PAGE_SHARED_BORROWED); + if (ret) + goto unlock; -/* - * do_share(): - * - * The page owner grants access to another component with a given set - * of permissions. - * - * Initiator: OWNED => SHARED_OWNED - * Completer: NOPAGE => SHARED_BORROWED - */ -static int do_share(struct pkvm_mem_share *share) -{ - int ret; + for (cur = start; cur < end; cur += PAGE_SIZE) + hyp_page_ref_inc(hyp_virt_to_page(cur)); - ret = check_share(share); - if (ret) - return ret; +unlock: + hyp_unlock_component(); + host_unlock_component(); - return WARN_ON(__do_share(share)); + return ret; } -static int check_unshare(struct pkvm_mem_share *share) +void hyp_unpin_shared_mem(void *from, void *to) { - const struct pkvm_mem_transition *tx = &share->tx; - u64 completer_addr; - int ret; - - switch (tx->initiator.id) { - case PKVM_ID_HOST: - ret = host_request_unshare(&completer_addr, tx); - break; - default: - ret = -EINVAL; - } + u64 cur, start = ALIGN_DOWN((u64)from, PAGE_SIZE); + u64 end = PAGE_ALIGN((u64)to); - if (ret) - return ret; + host_lock_component(); + hyp_lock_component(); - switch (tx->completer.id) { - case PKVM_ID_HYP: - ret = hyp_ack_unshare(completer_addr, tx); - break; - case PKVM_ID_FFA: - /* See check_share() */ - ret = 0; - break; - default: - ret = -EINVAL; - } + for (cur = start; cur < end; cur += PAGE_SIZE) + hyp_page_ref_dec(hyp_virt_to_page(cur)); - return ret; + hyp_unlock_component(); + host_unlock_component(); } -static int __do_unshare(struct pkvm_mem_share *share) +int __pkvm_host_share_ffa(u64 pfn, u64 nr_pages) { - const struct pkvm_mem_transition *tx = &share->tx; - u64 completer_addr; + u64 phys = hyp_pfn_to_phys(pfn); + u64 size = PAGE_SIZE * nr_pages; int ret; - switch (tx->initiator.id) { - case PKVM_ID_HOST: - ret = host_initiate_unshare(&completer_addr, tx); - break; - default: - ret = -EINVAL; - } - - if (ret) - return ret; - - switch (tx->completer.id) { - case PKVM_ID_HYP: - ret = hyp_complete_unshare(completer_addr, tx); - break; - case PKVM_ID_FFA: - /* See __do_share() */ - ret = 0; - break; - default: - ret = -EINVAL; - } + host_lock_component(); + ret = __host_check_page_state_range(phys, size, PKVM_PAGE_OWNED); + if (!ret) + ret = __host_set_page_state_range(phys, size, PKVM_PAGE_SHARED_OWNED); + host_unlock_component(); return ret; } -/* - * do_unshare(): - * - * The page owner revokes access from another component for a range of - * pages which were previously shared using do_share(). - * - * Initiator: SHARED_OWNED => OWNED - * Completer: SHARED_BORROWED => NOPAGE - */ -static int do_unshare(struct pkvm_mem_share *share) +int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages) { + u64 phys = hyp_pfn_to_phys(pfn); + u64 size = PAGE_SIZE * nr_pages; int ret; - ret = check_unshare(share); - if (ret) - return ret; + host_lock_component(); + ret = __host_check_page_state_range(phys, size, PKVM_PAGE_SHARED_OWNED); + if (!ret) + ret = __host_set_page_state_range(phys, size, PKVM_PAGE_OWNED); + host_unlock_component(); - return WARN_ON(__do_unshare(share)); + return ret; } -static int check_donation(struct pkvm_mem_donation *donation) +int __pkvm_host_share_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu, + enum kvm_pgtable_prot prot) { - const struct pkvm_mem_transition *tx = &donation->tx; - u64 completer_addr; + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + u64 phys = hyp_pfn_to_phys(pfn); + u64 ipa = hyp_pfn_to_phys(gfn); + struct hyp_page *page; int ret; - switch (tx->initiator.id) { - case PKVM_ID_HOST: - ret = host_request_owned_transition(&completer_addr, tx); - break; - case PKVM_ID_HYP: - ret = hyp_request_donation(&completer_addr, tx); - break; - default: - ret = -EINVAL; - } + if (prot & ~KVM_PGTABLE_PROT_RWX) + return -EINVAL; + ret = check_range_allowed_memory(phys, phys + PAGE_SIZE); if (ret) return ret; - switch (tx->completer.id) { - case PKVM_ID_HOST: - ret = host_ack_donation(completer_addr, tx); - break; - case PKVM_ID_HYP: - ret = hyp_ack_donation(completer_addr, tx); - break; - default: - ret = -EINVAL; - } - - return ret; -} + host_lock_component(); + guest_lock_component(vm); -static int __do_donate(struct pkvm_mem_donation *donation) -{ - const struct pkvm_mem_transition *tx = &donation->tx; - u64 completer_addr; - int ret; + ret = __guest_check_page_state_range(vcpu, ipa, PAGE_SIZE, PKVM_NOPAGE); + if (ret) + goto unlock; - switch (tx->initiator.id) { - case PKVM_ID_HOST: - ret = host_initiate_donation(&completer_addr, tx); - break; - case PKVM_ID_HYP: - ret = hyp_initiate_donation(&completer_addr, tx); + page = hyp_phys_to_page(phys); + switch (page->host_state) { + case PKVM_PAGE_OWNED: + WARN_ON(__host_set_page_state_range(phys, PAGE_SIZE, PKVM_PAGE_SHARED_OWNED)); break; + case PKVM_PAGE_SHARED_OWNED: + if (page->host_share_guest_count) + break; + /* Only host to np-guest multi-sharing is tolerated */ + WARN_ON(1); + fallthrough; default: - ret = -EINVAL; + ret = -EPERM; + goto unlock; } - if (ret) - return ret; + WARN_ON(kvm_pgtable_stage2_map(&vm->pgt, ipa, PAGE_SIZE, phys, + pkvm_mkstate(prot, PKVM_PAGE_SHARED_BORROWED), + &vcpu->vcpu.arch.pkvm_memcache, 0)); + page->host_share_guest_count++; - switch (tx->completer.id) { - case PKVM_ID_HOST: - ret = host_complete_donation(completer_addr, tx); - break; - case PKVM_ID_HYP: - ret = hyp_complete_donation(completer_addr, tx); - break; - default: - ret = -EINVAL; - } +unlock: + guest_unlock_component(vm); + host_unlock_component(); return ret; } -/* - * do_donate(): - * - * The page owner transfers ownership to another component, losing access - * as a consequence. - * - * Initiator: OWNED => NOPAGE - * Completer: NOPAGE => OWNED - */ -static int do_donate(struct pkvm_mem_donation *donation) +static int __check_host_shared_guest(struct pkvm_hyp_vm *vm, u64 *__phys, u64 ipa) { + enum pkvm_page_state state; + struct hyp_page *page; + kvm_pte_t pte; + u64 phys; + s8 level; int ret; - ret = check_donation(donation); + ret = kvm_pgtable_get_leaf(&vm->pgt, ipa, &pte, &level); if (ret) return ret; + if (!kvm_pte_valid(pte)) + return -ENOENT; + if (level != KVM_PGTABLE_LAST_LEVEL) + return -E2BIG; - return WARN_ON(__do_donate(donation)); -} - -int __pkvm_host_share_hyp(u64 pfn) -{ - int ret; - u64 host_addr = hyp_pfn_to_phys(pfn); - u64 hyp_addr = (u64)__hyp_va(host_addr); - struct pkvm_mem_share share = { - .tx = { - .nr_pages = 1, - .initiator = { - .id = PKVM_ID_HOST, - .addr = host_addr, - .host = { - .completer_addr = hyp_addr, - }, - }, - .completer = { - .id = PKVM_ID_HYP, - }, - }, - .completer_prot = PAGE_HYP, - }; + state = guest_get_page_state(pte, ipa); + if (state != PKVM_PAGE_SHARED_BORROWED) + return -EPERM; - host_lock_component(); - hyp_lock_component(); + phys = kvm_pte_to_phys(pte); + ret = check_range_allowed_memory(phys, phys + PAGE_SIZE); + if (WARN_ON(ret)) + return ret; - ret = do_share(&share); + page = hyp_phys_to_page(phys); + if (page->host_state != PKVM_PAGE_SHARED_OWNED) + return -EPERM; + if (WARN_ON(!page->host_share_guest_count)) + return -EINVAL; - hyp_unlock_component(); - host_unlock_component(); + *__phys = phys; - return ret; + return 0; } -int __pkvm_host_unshare_hyp(u64 pfn) +int __pkvm_host_unshare_guest(u64 gfn, struct pkvm_hyp_vm *vm) { + u64 ipa = hyp_pfn_to_phys(gfn); + struct hyp_page *page; + u64 phys; int ret; - u64 host_addr = hyp_pfn_to_phys(pfn); - u64 hyp_addr = (u64)__hyp_va(host_addr); - struct pkvm_mem_share share = { - .tx = { - .nr_pages = 1, - .initiator = { - .id = PKVM_ID_HOST, - .addr = host_addr, - .host = { - .completer_addr = hyp_addr, - }, - }, - .completer = { - .id = PKVM_ID_HYP, - }, - }, - .completer_prot = PAGE_HYP, - }; host_lock_component(); - hyp_lock_component(); - - ret = do_unshare(&share); - - hyp_unlock_component(); - host_unlock_component(); - - return ret; -} + guest_lock_component(vm); -int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages) -{ - int ret; - u64 host_addr = hyp_pfn_to_phys(pfn); - u64 hyp_addr = (u64)__hyp_va(host_addr); - struct pkvm_mem_donation donation = { - .tx = { - .nr_pages = nr_pages, - .initiator = { - .id = PKVM_ID_HOST, - .addr = host_addr, - .host = { - .completer_addr = hyp_addr, - }, - }, - .completer = { - .id = PKVM_ID_HYP, - }, - }, - }; + ret = __check_host_shared_guest(vm, &phys, ipa); + if (ret) + goto unlock; - host_lock_component(); - hyp_lock_component(); + ret = kvm_pgtable_stage2_unmap(&vm->pgt, ipa, PAGE_SIZE); + if (ret) + goto unlock; - ret = do_donate(&donation); + page = hyp_phys_to_page(phys); + page->host_share_guest_count--; + if (!page->host_share_guest_count) + WARN_ON(__host_set_page_state_range(phys, PAGE_SIZE, PKVM_PAGE_OWNED)); - hyp_unlock_component(); +unlock: + guest_unlock_component(vm); host_unlock_component(); return ret; } -int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages) +static void assert_host_shared_guest(struct pkvm_hyp_vm *vm, u64 ipa) { + u64 phys; int ret; - u64 host_addr = hyp_pfn_to_phys(pfn); - u64 hyp_addr = (u64)__hyp_va(host_addr); - struct pkvm_mem_donation donation = { - .tx = { - .nr_pages = nr_pages, - .initiator = { - .id = PKVM_ID_HYP, - .addr = hyp_addr, - .hyp = { - .completer_addr = host_addr, - }, - }, - .completer = { - .id = PKVM_ID_HOST, - }, - }, - }; + + if (!IS_ENABLED(CONFIG_NVHE_EL2_DEBUG)) + return; host_lock_component(); - hyp_lock_component(); + guest_lock_component(vm); - ret = do_donate(&donation); + ret = __check_host_shared_guest(vm, &phys, ipa); - hyp_unlock_component(); + guest_unlock_component(vm); host_unlock_component(); - return ret; + WARN_ON(ret && ret != -ENOENT); } -int hyp_pin_shared_mem(void *from, void *to) +int __pkvm_host_relax_perms_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu, enum kvm_pgtable_prot prot) { - u64 cur, start = ALIGN_DOWN((u64)from, PAGE_SIZE); - u64 end = PAGE_ALIGN((u64)to); - u64 size = end - start; + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + u64 ipa = hyp_pfn_to_phys(gfn); int ret; - host_lock_component(); - hyp_lock_component(); - - ret = __host_check_page_state_range(__hyp_pa(start), size, - PKVM_PAGE_SHARED_OWNED); - if (ret) - goto unlock; - - ret = __hyp_check_page_state_range(start, size, - PKVM_PAGE_SHARED_BORROWED); - if (ret) - goto unlock; + if (pkvm_hyp_vm_is_protected(vm)) + return -EPERM; - for (cur = start; cur < end; cur += PAGE_SIZE) - hyp_page_ref_inc(hyp_virt_to_page(cur)); + if (prot & ~KVM_PGTABLE_PROT_RWX) + return -EINVAL; -unlock: - hyp_unlock_component(); - host_unlock_component(); + assert_host_shared_guest(vm, ipa); + guest_lock_component(vm); + ret = kvm_pgtable_stage2_relax_perms(&vm->pgt, ipa, prot, 0); + guest_unlock_component(vm); return ret; } -void hyp_unpin_shared_mem(void *from, void *to) +int __pkvm_host_wrprotect_guest(u64 gfn, struct pkvm_hyp_vm *vm) { - u64 cur, start = ALIGN_DOWN((u64)from, PAGE_SIZE); - u64 end = PAGE_ALIGN((u64)to); + u64 ipa = hyp_pfn_to_phys(gfn); + int ret; - host_lock_component(); - hyp_lock_component(); + if (pkvm_hyp_vm_is_protected(vm)) + return -EPERM; - for (cur = start; cur < end; cur += PAGE_SIZE) - hyp_page_ref_dec(hyp_virt_to_page(cur)); + assert_host_shared_guest(vm, ipa); + guest_lock_component(vm); + ret = kvm_pgtable_stage2_wrprotect(&vm->pgt, ipa, PAGE_SIZE); + guest_unlock_component(vm); - hyp_unlock_component(); - host_unlock_component(); + return ret; } -int __pkvm_host_share_ffa(u64 pfn, u64 nr_pages) +int __pkvm_host_test_clear_young_guest(u64 gfn, bool mkold, struct pkvm_hyp_vm *vm) { + u64 ipa = hyp_pfn_to_phys(gfn); int ret; - struct pkvm_mem_share share = { - .tx = { - .nr_pages = nr_pages, - .initiator = { - .id = PKVM_ID_HOST, - .addr = hyp_pfn_to_phys(pfn), - }, - .completer = { - .id = PKVM_ID_FFA, - }, - }, - }; - host_lock_component(); - ret = do_share(&share); - host_unlock_component(); + if (pkvm_hyp_vm_is_protected(vm)) + return -EPERM; + + assert_host_shared_guest(vm, ipa); + guest_lock_component(vm); + ret = kvm_pgtable_stage2_test_clear_young(&vm->pgt, ipa, PAGE_SIZE, mkold); + guest_unlock_component(vm); return ret; } -int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages) +int __pkvm_host_mkyoung_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu) { - int ret; - struct pkvm_mem_share share = { - .tx = { - .nr_pages = nr_pages, - .initiator = { - .id = PKVM_ID_HOST, - .addr = hyp_pfn_to_phys(pfn), - }, - .completer = { - .id = PKVM_ID_FFA, - }, - }, - }; + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + u64 ipa = hyp_pfn_to_phys(gfn); - host_lock_component(); - ret = do_unshare(&share); - host_unlock_component(); + if (pkvm_hyp_vm_is_protected(vm)) + return -EPERM; - return ret; + assert_host_shared_guest(vm, ipa); + guest_lock_component(vm); + kvm_pgtable_stage2_mkyoung(&vm->pgt, ipa, 0); + guest_unlock_component(vm); + + return 0; } diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c index 8850b591d775..f41c7440b34b 100644 --- a/arch/arm64/kvm/hyp/nvhe/mm.c +++ b/arch/arm64/kvm/hyp/nvhe/mm.c @@ -360,10 +360,10 @@ int pkvm_create_stack(phys_addr_t phys, unsigned long *haddr) prev_base = __io_map_base; /* - * Efficient stack verification using the PAGE_SHIFT bit implies + * Efficient stack verification using the NVHE_STACK_SHIFT bit implies * an alignment of our allocation on the order of the size. */ - size = PAGE_SIZE * 2; + size = NVHE_STACK_SIZE * 2; addr = ALIGN(__io_map_base, size); ret = __pkvm_alloc_private_va_range(addr, size); @@ -373,12 +373,12 @@ int pkvm_create_stack(phys_addr_t phys, unsigned long *haddr) * at the higher address and leave the lower guard page * unbacked. * - * Any valid stack address now has the PAGE_SHIFT bit as 1 + * Any valid stack address now has the NVHE_STACK_SHIFT bit as 1 * and addresses corresponding to the guard page have the - * PAGE_SHIFT bit as 0 - this is used for overflow detection. + * NVHE_STACK_SHIFT bit as 0 - this is used for overflow detection. */ - ret = kvm_pgtable_hyp_map(&pkvm_pgtable, addr + PAGE_SIZE, - PAGE_SIZE, phys, PAGE_HYP); + ret = kvm_pgtable_hyp_map(&pkvm_pgtable, addr + NVHE_STACK_SIZE, + NVHE_STACK_SIZE, phys, PAGE_HYP); if (ret) __io_map_base = prev_base; } diff --git a/arch/arm64/kvm/hyp/nvhe/page_alloc.c b/arch/arm64/kvm/hyp/nvhe/page_alloc.c index e691290d3765..a1eb27a1a747 100644 --- a/arch/arm64/kvm/hyp/nvhe/page_alloc.c +++ b/arch/arm64/kvm/hyp/nvhe/page_alloc.c @@ -32,7 +32,7 @@ u64 __hyp_vmemmap; */ static struct hyp_page *__find_buddy_nocheck(struct hyp_pool *pool, struct hyp_page *p, - unsigned short order) + u8 order) { phys_addr_t addr = hyp_page_to_phys(p); @@ -51,7 +51,7 @@ static struct hyp_page *__find_buddy_nocheck(struct hyp_pool *pool, /* Find a buddy page currently available for allocation */ static struct hyp_page *__find_buddy_avail(struct hyp_pool *pool, struct hyp_page *p, - unsigned short order) + u8 order) { struct hyp_page *buddy = __find_buddy_nocheck(pool, p, order); @@ -94,7 +94,7 @@ static void __hyp_attach_page(struct hyp_pool *pool, struct hyp_page *p) { phys_addr_t phys = hyp_page_to_phys(p); - unsigned short order = p->order; + u8 order = p->order; struct hyp_page *buddy; memset(hyp_page_to_virt(p), 0, PAGE_SIZE << p->order); @@ -129,7 +129,7 @@ insert: static struct hyp_page *__hyp_extract_page(struct hyp_pool *pool, struct hyp_page *p, - unsigned short order) + u8 order) { struct hyp_page *buddy; @@ -183,7 +183,7 @@ void hyp_get_page(struct hyp_pool *pool, void *addr) void hyp_split_page(struct hyp_page *p) { - unsigned short order = p->order; + u8 order = p->order; unsigned int i; p->order = 0; @@ -195,10 +195,10 @@ void hyp_split_page(struct hyp_page *p) } } -void *hyp_alloc_pages(struct hyp_pool *pool, unsigned short order) +void *hyp_alloc_pages(struct hyp_pool *pool, u8 order) { - unsigned short i = order; struct hyp_page *p; + u8 i = order; hyp_spin_lock(&pool->lock); diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 26dd9a20ad6e..5a335a51deca 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -6,7 +6,9 @@ #include <linux/kvm_host.h> #include <linux/mm.h> -#include <nvhe/fixed_config.h> + +#include <asm/kvm_emulate.h> + #include <nvhe/mem_protect.h> #include <nvhe/memory.h> #include <nvhe/pkvm.h> @@ -18,198 +20,170 @@ unsigned long __icache_flags; /* Used by kvm_get_vttbr(). */ unsigned int kvm_arm_vmid_bits; +unsigned int kvm_host_sve_max_vl; + /* - * Set trap register values based on features in ID_AA64PFR0. + * The currently loaded hyp vCPU for each physical CPU. Used only when + * protected KVM is enabled, but for both protected and non-protected VMs. */ -static void pvm_init_traps_aa64pfr0(struct kvm_vcpu *vcpu) -{ - const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64PFR0_EL1); - u64 hcr_set = HCR_RW; - u64 hcr_clear = 0; - u64 cptr_set = 0; - u64 cptr_clear = 0; +static DEFINE_PER_CPU(struct pkvm_hyp_vcpu *, loaded_hyp_vcpu); - /* Protected KVM does not support AArch32 guests. */ - BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL0), - PVM_ID_AA64PFR0_RESTRICT_UNSIGNED) != ID_AA64PFR0_EL1_ELx_64BIT_ONLY); - BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL1), - PVM_ID_AA64PFR0_RESTRICT_UNSIGNED) != ID_AA64PFR0_EL1_ELx_64BIT_ONLY); - - /* - * Linux guests assume support for floating-point and Advanced SIMD. Do - * not change the trapping behavior for these from the KVM default. - */ - BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_FP), - PVM_ID_AA64PFR0_ALLOW)); - BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AdvSIMD), - PVM_ID_AA64PFR0_ALLOW)); +static void pkvm_vcpu_reset_hcr(struct kvm_vcpu *vcpu) +{ + vcpu->arch.hcr_el2 = HCR_GUEST_FLAGS; if (has_hvhe()) - hcr_set |= HCR_E2H; + vcpu->arch.hcr_el2 |= HCR_E2H; - /* Trap RAS unless all current versions are supported */ - if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_RAS), feature_ids) < - ID_AA64PFR0_EL1_RAS_V1P1) { - hcr_set |= HCR_TERR | HCR_TEA; - hcr_clear |= HCR_FIEN; + if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) { + /* route synchronous external abort exceptions to EL2 */ + vcpu->arch.hcr_el2 |= HCR_TEA; + /* trap error record accesses */ + vcpu->arch.hcr_el2 |= HCR_TERR; } - /* Trap AMU */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AMU), feature_ids)) { - hcr_clear |= HCR_AMVOFFEN; - cptr_set |= CPTR_EL2_TAM; - } + if (cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) + vcpu->arch.hcr_el2 |= HCR_FWB; - /* Trap SVE */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE), feature_ids)) { - if (has_hvhe()) - cptr_clear |= CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN; - else - cptr_set |= CPTR_EL2_TZ; - } + if (cpus_have_final_cap(ARM64_HAS_EVT) && + !cpus_have_final_cap(ARM64_MISMATCHED_CACHE_TYPE) && + kvm_read_vm_id_reg(vcpu->kvm, SYS_CTR_EL0) == read_cpuid(CTR_EL0)) + vcpu->arch.hcr_el2 |= HCR_TID4; + else + vcpu->arch.hcr_el2 |= HCR_TID2; + + if (vcpu_has_ptrauth(vcpu)) + vcpu->arch.hcr_el2 |= (HCR_API | HCR_APK); - vcpu->arch.hcr_el2 |= hcr_set; - vcpu->arch.hcr_el2 &= ~hcr_clear; - vcpu->arch.cptr_el2 |= cptr_set; - vcpu->arch.cptr_el2 &= ~cptr_clear; + if (kvm_has_mte(vcpu->kvm)) + vcpu->arch.hcr_el2 |= HCR_ATA; } -/* - * Set trap register values based on features in ID_AA64PFR1. - */ -static void pvm_init_traps_aa64pfr1(struct kvm_vcpu *vcpu) +static void pvm_init_traps_hcr(struct kvm_vcpu *vcpu) { - const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64PFR1_EL1); - u64 hcr_set = 0; - u64 hcr_clear = 0; + struct kvm *kvm = vcpu->kvm; + u64 val = vcpu->arch.hcr_el2; + + /* No support for AArch32. */ + val |= HCR_RW; + + /* + * Always trap: + * - Feature id registers: to control features exposed to guests + * - Implementation-defined features + */ + val |= HCR_TACR | HCR_TIDCP | HCR_TID3 | HCR_TID1; - /* Memory Tagging: Trap and Treat as Untagged if not supported. */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE), feature_ids)) { - hcr_set |= HCR_TID5; - hcr_clear |= HCR_DCT | HCR_ATA; + if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, RAS, IMP)) { + val |= HCR_TERR | HCR_TEA; + val &= ~(HCR_FIEN); } - vcpu->arch.hcr_el2 |= hcr_set; - vcpu->arch.hcr_el2 &= ~hcr_clear; -} + if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, AMU, IMP)) + val &= ~(HCR_AMVOFFEN); -/* - * Set trap register values based on features in ID_AA64DFR0. - */ -static void pvm_init_traps_aa64dfr0(struct kvm_vcpu *vcpu) -{ - const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64DFR0_EL1); - u64 mdcr_set = 0; - u64 mdcr_clear = 0; - u64 cptr_set = 0; - - /* Trap/constrain PMU */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), feature_ids)) { - mdcr_set |= MDCR_EL2_TPM | MDCR_EL2_TPMCR; - mdcr_clear |= MDCR_EL2_HPME | MDCR_EL2_MTPME | - MDCR_EL2_HPMN_MASK; + if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, MTE, IMP)) { + val |= HCR_TID5; + val &= ~(HCR_DCT | HCR_ATA); } - /* Trap Debug */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DebugVer), feature_ids)) - mdcr_set |= MDCR_EL2_TDRA | MDCR_EL2_TDA | MDCR_EL2_TDE; + if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, LO, IMP)) + val |= HCR_TLOR; + + vcpu->arch.hcr_el2 = val; +} - /* Trap OS Double Lock */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DoubleLock), feature_ids)) - mdcr_set |= MDCR_EL2_TDOSA; +static void pvm_init_traps_mdcr(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + u64 val = vcpu->arch.mdcr_el2; - /* Trap SPE */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMSVer), feature_ids)) { - mdcr_set |= MDCR_EL2_TPMS; - mdcr_clear |= MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMUVer, IMP)) { + val |= MDCR_EL2_TPM | MDCR_EL2_TPMCR; + val &= ~(MDCR_EL2_HPME | MDCR_EL2_MTPME | MDCR_EL2_HPMN_MASK); } - /* Trap Trace Filter */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_TraceFilt), feature_ids)) - mdcr_set |= MDCR_EL2_TTRF; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, DebugVer, IMP)) + val |= MDCR_EL2_TDRA | MDCR_EL2_TDA; + + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, DoubleLock, IMP)) + val |= MDCR_EL2_TDOSA; - /* Trap Trace */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_TraceVer), feature_ids)) { - if (has_hvhe()) - cptr_set |= CPACR_EL1_TTA; - else - cptr_set |= CPTR_EL2_TTA; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMSVer, IMP)) { + val |= MDCR_EL2_TPMS; + val &= ~MDCR_EL2_E2PB_MASK; } - /* Trap External Trace */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_ExtTrcBuff), feature_ids)) - mdcr_clear |= MDCR_EL2_E2TB_MASK << MDCR_EL2_E2TB_SHIFT; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceFilt, IMP)) + val |= MDCR_EL2_TTRF; - vcpu->arch.mdcr_el2 |= mdcr_set; - vcpu->arch.mdcr_el2 &= ~mdcr_clear; - vcpu->arch.cptr_el2 |= cptr_set; -} - -/* - * Set trap register values based on features in ID_AA64MMFR0. - */ -static void pvm_init_traps_aa64mmfr0(struct kvm_vcpu *vcpu) -{ - const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64MMFR0_EL1); - u64 mdcr_set = 0; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, ExtTrcBuff, IMP)) + val |= MDCR_EL2_E2TB_MASK; /* Trap Debug Communications Channel registers */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_FGT), feature_ids)) - mdcr_set |= MDCR_EL2_TDCC; + if (!kvm_has_feat(kvm, ID_AA64MMFR0_EL1, FGT, IMP)) + val |= MDCR_EL2_TDCC; - vcpu->arch.mdcr_el2 |= mdcr_set; + vcpu->arch.mdcr_el2 = val; } /* - * Set trap register values based on features in ID_AA64MMFR1. + * Check that cpu features that are neither trapped nor supported are not + * enabled for protected VMs. */ -static void pvm_init_traps_aa64mmfr1(struct kvm_vcpu *vcpu) +static int pkvm_check_pvm_cpu_features(struct kvm_vcpu *vcpu) { - const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64MMFR1_EL1); - u64 hcr_set = 0; + struct kvm *kvm = vcpu->kvm; + + /* Protected KVM does not support AArch32 guests. */ + if (kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL0, AARCH32) || + kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL1, AARCH32)) + return -EINVAL; - /* Trap LOR */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_LO), feature_ids)) - hcr_set |= HCR_TLOR; + /* + * Linux guests assume support for floating-point and Advanced SIMD. Do + * not change the trapping behavior for these from the KVM default. + */ + if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, FP, IMP) || + !kvm_has_feat(kvm, ID_AA64PFR0_EL1, AdvSIMD, IMP)) + return -EINVAL; + + /* No SME support in KVM right now. Check to catch if it changes. */ + if (kvm_has_feat(kvm, ID_AA64PFR1_EL1, SME, IMP)) + return -EINVAL; - vcpu->arch.hcr_el2 |= hcr_set; + return 0; } /* - * Set baseline trap register values. + * Initialize trap register values in protected mode. */ -static void pvm_init_trap_regs(struct kvm_vcpu *vcpu) +static int pkvm_vcpu_init_traps(struct pkvm_hyp_vcpu *hyp_vcpu) { - const u64 hcr_trap_feat_regs = HCR_TID3; - const u64 hcr_trap_impdef = HCR_TACR | HCR_TIDCP | HCR_TID1; + struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; + int ret; - /* - * Always trap: - * - Feature id registers: to control features exposed to guests - * - Implementation-defined features - */ - vcpu->arch.hcr_el2 |= hcr_trap_feat_regs | hcr_trap_impdef; - - /* Clear res0 and set res1 bits to trap potential new features. */ - vcpu->arch.hcr_el2 &= ~(HCR_RES0); - vcpu->arch.mdcr_el2 &= ~(MDCR_EL2_RES0); - if (!has_hvhe()) { - vcpu->arch.cptr_el2 |= CPTR_NVHE_EL2_RES1; - vcpu->arch.cptr_el2 &= ~(CPTR_NVHE_EL2_RES0); + vcpu->arch.mdcr_el2 = 0; + + pkvm_vcpu_reset_hcr(vcpu); + + if ((!pkvm_hyp_vcpu_is_protected(hyp_vcpu))) { + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + + /* Trust the host for non-protected vcpu features. */ + vcpu->arch.hcrx_el2 = host_vcpu->arch.hcrx_el2; + return 0; } -} -/* - * Initialize trap register values for protected VMs. - */ -void __pkvm_vcpu_init_traps(struct kvm_vcpu *vcpu) -{ - pvm_init_trap_regs(vcpu); - pvm_init_traps_aa64pfr0(vcpu); - pvm_init_traps_aa64pfr1(vcpu); - pvm_init_traps_aa64dfr0(vcpu); - pvm_init_traps_aa64mmfr0(vcpu); - pvm_init_traps_aa64mmfr1(vcpu); + ret = pkvm_check_pvm_cpu_features(vcpu); + if (ret) + return ret; + + pvm_init_traps_hcr(vcpu); + pvm_init_traps_mdcr(vcpu); + vcpu_set_hcrx(vcpu); + + return 0; } /* @@ -230,10 +204,10 @@ static pkvm_handle_t idx_to_vm_handle(unsigned int idx) /* * Spinlock for protecting state related to the VM table. Protects writes - * to 'vm_table' and 'nr_table_entries' as well as reads and writes to - * 'last_hyp_vcpu_lookup'. + * to 'vm_table', 'nr_table_entries', and other per-vm state on initialization. + * Also protects reads and writes to 'last_hyp_vcpu_lookup'. */ -static DEFINE_HYP_SPINLOCK(vm_table_lock); +DEFINE_HYP_SPINLOCK(vm_table_lock); /* * The table of VM entries for protected VMs in hyp. @@ -266,15 +240,32 @@ struct pkvm_hyp_vcpu *pkvm_load_hyp_vcpu(pkvm_handle_t handle, struct pkvm_hyp_vcpu *hyp_vcpu = NULL; struct pkvm_hyp_vm *hyp_vm; + /* Cannot load a new vcpu without putting the old one first. */ + if (__this_cpu_read(loaded_hyp_vcpu)) + return NULL; + hyp_spin_lock(&vm_table_lock); hyp_vm = get_vm_by_handle(handle); - if (!hyp_vm || hyp_vm->nr_vcpus <= vcpu_idx) + if (!hyp_vm || hyp_vm->kvm.created_vcpus <= vcpu_idx) goto unlock; hyp_vcpu = hyp_vm->vcpus[vcpu_idx]; + if (!hyp_vcpu) + goto unlock; + + /* Ensure vcpu isn't loaded on more than one cpu simultaneously. */ + if (unlikely(hyp_vcpu->loaded_hyp_vcpu)) { + hyp_vcpu = NULL; + goto unlock; + } + + hyp_vcpu->loaded_hyp_vcpu = this_cpu_ptr(&loaded_hyp_vcpu); hyp_page_ref_inc(hyp_virt_to_page(hyp_vm)); unlock: hyp_spin_unlock(&vm_table_lock); + + if (hyp_vcpu) + __this_cpu_write(loaded_hyp_vcpu, hyp_vcpu); return hyp_vcpu; } @@ -283,10 +274,98 @@ void pkvm_put_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) struct pkvm_hyp_vm *hyp_vm = pkvm_hyp_vcpu_to_hyp_vm(hyp_vcpu); hyp_spin_lock(&vm_table_lock); + hyp_vcpu->loaded_hyp_vcpu = NULL; + __this_cpu_write(loaded_hyp_vcpu, NULL); + hyp_page_ref_dec(hyp_virt_to_page(hyp_vm)); + hyp_spin_unlock(&vm_table_lock); +} + +struct pkvm_hyp_vcpu *pkvm_get_loaded_hyp_vcpu(void) +{ + return __this_cpu_read(loaded_hyp_vcpu); + +} + +struct pkvm_hyp_vm *get_pkvm_hyp_vm(pkvm_handle_t handle) +{ + struct pkvm_hyp_vm *hyp_vm; + + hyp_spin_lock(&vm_table_lock); + hyp_vm = get_vm_by_handle(handle); + if (hyp_vm) + hyp_page_ref_inc(hyp_virt_to_page(hyp_vm)); + hyp_spin_unlock(&vm_table_lock); + + return hyp_vm; +} + +void put_pkvm_hyp_vm(struct pkvm_hyp_vm *hyp_vm) +{ + hyp_spin_lock(&vm_table_lock); hyp_page_ref_dec(hyp_virt_to_page(hyp_vm)); hyp_spin_unlock(&vm_table_lock); } +struct pkvm_hyp_vm *get_np_pkvm_hyp_vm(pkvm_handle_t handle) +{ + struct pkvm_hyp_vm *hyp_vm = get_pkvm_hyp_vm(handle); + + if (hyp_vm && pkvm_hyp_vm_is_protected(hyp_vm)) { + put_pkvm_hyp_vm(hyp_vm); + hyp_vm = NULL; + } + + return hyp_vm; +} + +static void pkvm_init_features_from_host(struct pkvm_hyp_vm *hyp_vm, const struct kvm *host_kvm) +{ + struct kvm *kvm = &hyp_vm->kvm; + unsigned long host_arch_flags = READ_ONCE(host_kvm->arch.flags); + DECLARE_BITMAP(allowed_features, KVM_VCPU_MAX_FEATURES); + + /* CTR_EL0 is always under host control, even for protected VMs. */ + hyp_vm->kvm.arch.ctr_el0 = host_kvm->arch.ctr_el0; + + if (test_bit(KVM_ARCH_FLAG_MTE_ENABLED, &host_kvm->arch.flags)) + set_bit(KVM_ARCH_FLAG_MTE_ENABLED, &kvm->arch.flags); + + /* No restrictions for non-protected VMs. */ + if (!kvm_vm_is_protected(kvm)) { + hyp_vm->kvm.arch.flags = host_arch_flags; + + bitmap_copy(kvm->arch.vcpu_features, + host_kvm->arch.vcpu_features, + KVM_VCPU_MAX_FEATURES); + + if (test_bit(KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS, &host_arch_flags)) + hyp_vm->kvm.arch.midr_el1 = host_kvm->arch.midr_el1; + + return; + } + + bitmap_zero(allowed_features, KVM_VCPU_MAX_FEATURES); + + set_bit(KVM_ARM_VCPU_PSCI_0_2, allowed_features); + + if (kvm_pvm_ext_allowed(KVM_CAP_ARM_PMU_V3)) + set_bit(KVM_ARM_VCPU_PMU_V3, allowed_features); + + if (kvm_pvm_ext_allowed(KVM_CAP_ARM_PTRAUTH_ADDRESS)) + set_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, allowed_features); + + if (kvm_pvm_ext_allowed(KVM_CAP_ARM_PTRAUTH_GENERIC)) + set_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, allowed_features); + + if (kvm_pvm_ext_allowed(KVM_CAP_ARM_SVE)) { + set_bit(KVM_ARM_VCPU_SVE, allowed_features); + kvm->arch.flags |= host_arch_flags & BIT(KVM_ARCH_FLAG_GUEST_HAS_SVE); + } + + bitmap_and(kvm->arch.vcpu_features, host_kvm->arch.vcpu_features, + allowed_features, KVM_VCPU_MAX_FEATURES); +} + static void unpin_host_vcpu(struct kvm_vcpu *host_vcpu) { if (host_vcpu) @@ -298,8 +377,14 @@ static void unpin_host_vcpus(struct pkvm_hyp_vcpu *hyp_vcpus[], { int i; - for (i = 0; i < nr_vcpus; i++) - unpin_host_vcpu(hyp_vcpus[i]->host_vcpu); + for (i = 0; i < nr_vcpus; i++) { + struct pkvm_hyp_vcpu *hyp_vcpu = hyp_vcpus[i]; + + if (!hyp_vcpu) + continue; + + unpin_host_vcpu(hyp_vcpu->host_vcpu); + } } static void init_pkvm_hyp_vm(struct kvm *host_kvm, struct pkvm_hyp_vm *hyp_vm, @@ -308,31 +393,46 @@ static void init_pkvm_hyp_vm(struct kvm *host_kvm, struct pkvm_hyp_vm *hyp_vm, hyp_vm->host_kvm = host_kvm; hyp_vm->kvm.created_vcpus = nr_vcpus; hyp_vm->kvm.arch.mmu.vtcr = host_mmu.arch.mmu.vtcr; + hyp_vm->kvm.arch.pkvm.enabled = READ_ONCE(host_kvm->arch.pkvm.enabled); + hyp_vm->kvm.arch.flags = 0; + pkvm_init_features_from_host(hyp_vm, host_kvm); +} + +static void pkvm_vcpu_init_sve(struct pkvm_hyp_vcpu *hyp_vcpu, struct kvm_vcpu *host_vcpu) +{ + struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; + + if (!vcpu_has_feature(vcpu, KVM_ARM_VCPU_SVE)) + vcpu_clear_flag(vcpu, VCPU_SVE_FINALIZED); } static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, struct pkvm_hyp_vm *hyp_vm, - struct kvm_vcpu *host_vcpu, - unsigned int vcpu_idx) + struct kvm_vcpu *host_vcpu) { int ret = 0; if (hyp_pin_shared_mem(host_vcpu, host_vcpu + 1)) return -EBUSY; - if (host_vcpu->vcpu_idx != vcpu_idx) { - ret = -EINVAL; - goto done; - } - hyp_vcpu->host_vcpu = host_vcpu; hyp_vcpu->vcpu.kvm = &hyp_vm->kvm; hyp_vcpu->vcpu.vcpu_id = READ_ONCE(host_vcpu->vcpu_id); - hyp_vcpu->vcpu.vcpu_idx = vcpu_idx; + hyp_vcpu->vcpu.vcpu_idx = READ_ONCE(host_vcpu->vcpu_idx); hyp_vcpu->vcpu.arch.hw_mmu = &hyp_vm->kvm.arch.mmu; hyp_vcpu->vcpu.arch.cflags = READ_ONCE(host_vcpu->arch.cflags); + hyp_vcpu->vcpu.arch.mp_state.mp_state = KVM_MP_STATE_STOPPED; + + if (pkvm_hyp_vcpu_is_protected(hyp_vcpu)) + kvm_init_pvm_id_regs(&hyp_vcpu->vcpu); + + ret = pkvm_vcpu_init_traps(hyp_vcpu); + if (ret) + goto done; + + pkvm_vcpu_init_sve(hyp_vcpu, host_vcpu); done: if (ret) unpin_host_vcpu(host_vcpu); @@ -430,6 +530,7 @@ static void *map_donated_memory(unsigned long host_va, size_t size) static void __unmap_donated_memory(void *va, size_t size) { + kvm_flush_dcache_to_poc(va, size); WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(va), PAGE_ALIGN(size) >> PAGE_SHIFT)); } @@ -556,24 +657,27 @@ int __pkvm_init_vcpu(pkvm_handle_t handle, struct kvm_vcpu *host_vcpu, goto unlock; } - idx = hyp_vm->nr_vcpus; + ret = init_pkvm_hyp_vcpu(hyp_vcpu, hyp_vm, host_vcpu); + if (ret) + goto unlock; + + idx = hyp_vcpu->vcpu.vcpu_idx; if (idx >= hyp_vm->kvm.created_vcpus) { ret = -EINVAL; goto unlock; } - ret = init_pkvm_hyp_vcpu(hyp_vcpu, hyp_vm, host_vcpu, idx); - if (ret) + if (hyp_vm->vcpus[idx]) { + ret = -EINVAL; goto unlock; + } hyp_vm->vcpus[idx] = hyp_vcpu; - hyp_vm->nr_vcpus++; unlock: hyp_spin_unlock(&vm_table_lock); if (ret) unmap_donated_memory(hyp_vcpu, sizeof(*hyp_vcpu)); - return ret; } @@ -591,7 +695,7 @@ teardown_donated_memory(struct kvm_hyp_memcache *mc, void *addr, size_t size) int __pkvm_teardown_vm(pkvm_handle_t handle) { - struct kvm_hyp_memcache *mc; + struct kvm_hyp_memcache *mc, *stage2_mc; struct pkvm_hyp_vm *hyp_vm; struct kvm *host_kvm; unsigned int idx; @@ -619,12 +723,26 @@ int __pkvm_teardown_vm(pkvm_handle_t handle) /* Reclaim guest pages (including page-table pages) */ mc = &host_kvm->arch.pkvm.teardown_mc; - reclaim_guest_pages(hyp_vm, mc); - unpin_host_vcpus(hyp_vm->vcpus, hyp_vm->nr_vcpus); + stage2_mc = &host_kvm->arch.pkvm.stage2_teardown_mc; + reclaim_pgtable_pages(hyp_vm, stage2_mc); + unpin_host_vcpus(hyp_vm->vcpus, hyp_vm->kvm.created_vcpus); /* Push the metadata pages to the teardown memcache */ - for (idx = 0; idx < hyp_vm->nr_vcpus; ++idx) { + for (idx = 0; idx < hyp_vm->kvm.created_vcpus; ++idx) { struct pkvm_hyp_vcpu *hyp_vcpu = hyp_vm->vcpus[idx]; + struct kvm_hyp_memcache *vcpu_mc; + + if (!hyp_vcpu) + continue; + + vcpu_mc = &hyp_vcpu->vcpu.arch.pkvm_memcache; + + while (vcpu_mc->nr_pages) { + void *addr = pop_hyp_memcache(vcpu_mc, hyp_phys_to_virt); + + push_hyp_memcache(stage2_mc, addr, hyp_virt_to_phys); + unmap_donated_memory_noclear(addr, PAGE_SIZE); + } teardown_donated_memory(mc, hyp_vcpu, sizeof(*hyp_vcpu)); } diff --git a/arch/arm64/kvm/hyp/nvhe/psci-relay.c b/arch/arm64/kvm/hyp/nvhe/psci-relay.c index d57bcb6ab94d..c3e196fb8b18 100644 --- a/arch/arm64/kvm/hyp/nvhe/psci-relay.c +++ b/arch/arm64/kvm/hyp/nvhe/psci-relay.c @@ -205,7 +205,7 @@ asmlinkage void __noreturn __kvm_host_psci_cpu_entry(bool is_cpu_on) struct psci_boot_args *boot_args; struct kvm_cpu_context *host_ctxt; - host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; + host_ctxt = host_data_ptr(host_ctxt); if (is_cpu_on) boot_args = this_cpu_ptr(&cpu_on_args); @@ -218,6 +218,9 @@ asmlinkage void __noreturn __kvm_host_psci_cpu_entry(bool is_cpu_on) if (is_cpu_on) release_boot_args(boot_args); + write_sysreg_el1(INIT_SCTLR_EL1_MMU_OFF, SYS_SCTLR); + write_sysreg(INIT_PSTATE_EL1, SPSR_EL2); + __host_enter(host_ctxt); } @@ -265,6 +268,8 @@ static unsigned long psci_1_0_handler(u64 func_id, struct kvm_cpu_context *host_ case PSCI_1_0_FN_PSCI_FEATURES: case PSCI_1_0_FN_SET_SUSPEND_MODE: case PSCI_1_1_FN64_SYSTEM_RESET2: + case PSCI_1_3_FN_SYSTEM_OFF2: + case PSCI_1_3_FN64_SYSTEM_OFF2: return psci_forward(host_ctxt); case PSCI_1_0_FN64_SYSTEM_SUSPEND: return psci_system_suspend(func_id, host_ctxt); diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index bc58d1b515af..d62bcb5634a2 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -12,7 +12,6 @@ #include <nvhe/early_alloc.h> #include <nvhe/ffa.h> -#include <nvhe/fixed_config.h> #include <nvhe/gfp.h> #include <nvhe/memory.h> #include <nvhe/mem_protect.h> @@ -67,13 +66,34 @@ static int divide_memory_pool(void *virt, unsigned long size) return 0; } +static int pkvm_create_host_sve_mappings(void) +{ + void *start, *end; + int ret, i; + + if (!system_supports_sve()) + return 0; + + for (i = 0; i < hyp_nr_cpus; i++) { + struct kvm_host_data *host_data = per_cpu_ptr(&kvm_host_data, i); + struct cpu_sve_state *sve_state = host_data->sve_state; + + start = kern_hyp_va(sve_state); + end = start + PAGE_ALIGN(pkvm_host_sve_state_size()); + ret = pkvm_create_mappings(start, end, PAGE_HYP); + if (ret) + return ret; + } + + return 0; +} + static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, unsigned long *per_cpu_base, u32 hyp_va_bits) { void *start, *end, *virt = hyp_phys_to_virt(phys); unsigned long pgt_size = hyp_s1_pgtable_pages() << PAGE_SHIFT; - enum kvm_pgtable_prot prot; int ret, i; /* Recreate the hyp page-table using the early page allocator */ @@ -125,22 +145,7 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, return ret; } - /* - * Map the host sections RO in the hypervisor, but transfer the - * ownership from the host to the hypervisor itself to make sure they - * can't be donated or shared with another entity. - * - * The ownership transition requires matching changes in the host - * stage-2. This will be done later (see finalize_host_mappings()) once - * the hyp_vmemmap is addressable. - */ - prot = pkvm_mkstate(PAGE_HYP_RO, PKVM_PAGE_SHARED_OWNED); - ret = pkvm_create_mappings(&kvm_vgic_global_state, - &kvm_vgic_global_state + 1, prot); - if (ret) - return ret; - - return 0; + return pkvm_create_host_sve_mappings(); } static void update_nvhe_init_params(void) @@ -174,7 +179,6 @@ static void hpool_put_page(void *addr) static int fix_host_ownership_walker(const struct kvm_pgtable_visit_ctx *ctx, enum kvm_pgtable_walk_flags visit) { - enum kvm_pgtable_prot prot; enum pkvm_page_state state; phys_addr_t phys; @@ -197,16 +201,16 @@ static int fix_host_ownership_walker(const struct kvm_pgtable_visit_ctx *ctx, case PKVM_PAGE_OWNED: return host_stage2_set_owner_locked(phys, PAGE_SIZE, PKVM_ID_HYP); case PKVM_PAGE_SHARED_OWNED: - prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, PKVM_PAGE_SHARED_BORROWED); + hyp_phys_to_page(phys)->host_state = PKVM_PAGE_SHARED_BORROWED; break; case PKVM_PAGE_SHARED_BORROWED: - prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, PKVM_PAGE_SHARED_OWNED); + hyp_phys_to_page(phys)->host_state = PKVM_PAGE_SHARED_OWNED; break; default: return -EINVAL; } - return host_stage2_idmap_locked(phys, PAGE_SIZE, prot); + return 0; } static int fix_hyp_pgtable_refcnt_walker(const struct kvm_pgtable_visit_ctx *ctx, @@ -257,8 +261,7 @@ static int fix_hyp_pgtable_refcnt(void) void __noreturn __pkvm_init_finalise(void) { - struct kvm_host_data *host_data = this_cpu_ptr(&kvm_host_data); - struct kvm_cpu_context *host_ctxt = &host_data->host_ctxt; + struct kvm_cpu_context *host_ctxt = host_data_ptr(host_ctxt); unsigned long nr_pages, reserved_pages, pfn; int ret; @@ -316,7 +319,7 @@ int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long nr_cpus, { struct kvm_nvhe_init_params *params; void *virt = hyp_phys_to_virt(phys); - void (*fn)(phys_addr_t params_pa, void *finalize_fn_va); + typeof(__pkvm_init_switch_pgd) *fn; int ret; BUG_ON(kvm_check_pvm_sysreg_table()); @@ -340,7 +343,7 @@ int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long nr_cpus, /* Jump in the idmap page to switch to the new page-tables */ params = this_cpu_ptr(&kvm_init_params); fn = (typeof(fn))__hyp_pa(__pkvm_init_switch_pgd); - fn(__hyp_pa(params), __pkvm_init_finalise); + fn(params->pgd_pa, params->stack_hyp_va, __pkvm_init_finalise); unreachable(); } diff --git a/arch/arm64/kvm/hyp/nvhe/stacktrace.c b/arch/arm64/kvm/hyp/nvhe/stacktrace.c index ed6b58b19cfa..5b6eeab1a774 100644 --- a/arch/arm64/kvm/hyp/nvhe/stacktrace.c +++ b/arch/arm64/kvm/hyp/nvhe/stacktrace.c @@ -28,7 +28,7 @@ static void hyp_prepare_backtrace(unsigned long fp, unsigned long pc) struct kvm_nvhe_stacktrace_info *stacktrace_info = this_cpu_ptr(&kvm_stacktrace_info); struct kvm_nvhe_init_params *params = this_cpu_ptr(&kvm_init_params); - stacktrace_info->stack_base = (unsigned long)(params->stack_hyp_va - PAGE_SIZE); + stacktrace_info->stack_base = (unsigned long)(params->stack_hyp_va - NVHE_STACK_SIZE); stacktrace_info->overflow_stack_base = (unsigned long)this_cpu_ptr(overflow_stack); stacktrace_info->fp = fp; stacktrace_info->pc = pc; @@ -54,7 +54,7 @@ static struct stack_info stackinfo_get_hyp(void) { struct kvm_nvhe_init_params *params = this_cpu_ptr(&kvm_init_params); unsigned long high = params->stack_hyp_va; - unsigned long low = high - PAGE_SIZE; + unsigned long low = high - NVHE_STACK_SIZE; return (struct stack_info) { .low = low, diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index c50f8459e4fc..7d2ba6ef0261 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -26,7 +26,6 @@ #include <asm/debug-monitors.h> #include <asm/processor.h> -#include <nvhe/fixed_config.h> #include <nvhe/mem_protect.h> /* Non-VHE specific context */ @@ -36,34 +35,71 @@ DEFINE_PER_CPU(unsigned long, kvm_hyp_vector); extern void kvm_nvhe_prepare_backtrace(unsigned long fp, unsigned long pc); -static void __activate_traps(struct kvm_vcpu *vcpu) +static void __activate_cptr_traps(struct kvm_vcpu *vcpu) { - u64 val; + u64 val = CPTR_EL2_TAM; /* Same bit irrespective of E2H */ - ___activate_traps(vcpu); - __activate_traps_common(vcpu); + if (!guest_owns_fp_regs()) + __activate_traps_fpsimd32(vcpu); - val = vcpu->arch.cptr_el2; - val |= CPTR_EL2_TAM; /* Same bit irrespective of E2H */ - val |= has_hvhe() ? CPACR_EL1_TTA : CPTR_EL2_TTA; - if (cpus_have_final_cap(ARM64_SME)) { - if (has_hvhe()) - val &= ~(CPACR_EL1_SMEN_EL1EN | CPACR_EL1_SMEN_EL0EN); - else - val |= CPTR_EL2_TSM; + if (has_hvhe()) { + val |= CPACR_EL1_TTA; + + if (guest_owns_fp_regs()) { + val |= CPACR_EL1_FPEN; + if (vcpu_has_sve(vcpu)) + val |= CPACR_EL1_ZEN; + } + + write_sysreg(val, cpacr_el1); + } else { + val |= CPTR_EL2_TTA | CPTR_NVHE_EL2_RES1; + + /* + * Always trap SME since it's not supported in KVM. + * TSM is RES1 if SME isn't implemented. + */ + val |= CPTR_EL2_TSM; + + if (!vcpu_has_sve(vcpu) || !guest_owns_fp_regs()) + val |= CPTR_EL2_TZ; + + if (!guest_owns_fp_regs()) + val |= CPTR_EL2_TFP; + + write_sysreg(val, cptr_el2); } +} + +static void __deactivate_cptr_traps(struct kvm_vcpu *vcpu) +{ + if (has_hvhe()) { + u64 val = CPACR_EL1_FPEN; - if (!guest_owns_fp_regs(vcpu)) { - if (has_hvhe()) - val &= ~(CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN | - CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN); - else - val |= CPTR_EL2_TFP | CPTR_EL2_TZ; + if (cpus_have_final_cap(ARM64_SVE)) + val |= CPACR_EL1_ZEN; + if (cpus_have_final_cap(ARM64_SME)) + val |= CPACR_EL1_SMEN; - __activate_traps_fpsimd32(vcpu); + write_sysreg(val, cpacr_el1); + } else { + u64 val = CPTR_NVHE_EL2_RES1; + + if (!cpus_have_final_cap(ARM64_SVE)) + val |= CPTR_EL2_TZ; + if (!cpus_have_final_cap(ARM64_SME)) + val |= CPTR_EL2_TSM; + + write_sysreg(val, cptr_el2); } +} + +static void __activate_traps(struct kvm_vcpu *vcpu) +{ + ___activate_traps(vcpu, vcpu->arch.hcr_el2); + __activate_traps_common(vcpu); + __activate_cptr_traps(vcpu); - kvm_write_cptr_el2(val); write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el2); if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { @@ -108,7 +144,7 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu) write_sysreg(this_cpu_ptr(&kvm_init_params)->hcr_el2, hcr_el2); - kvm_reset_cptr_el2(vcpu); + __deactivate_cptr_traps(vcpu); write_sysreg(__kvm_hyp_host_vector, vbar_el2); } @@ -174,9 +210,8 @@ static void __pmu_switch_to_host(struct kvm_vcpu *vcpu) static bool kvm_handle_pvm_sys64(struct kvm_vcpu *vcpu, u64 *exit_code) { /* - * Make sure we handle the exit for workarounds and ptrauth - * before the pKVM handling, as the latter could decide to - * UNDEF. + * Make sure we handle the exit for workarounds before the pKVM + * handling, as the latter could decide to UNDEF. */ return (kvm_hyp_handle_sysreg(vcpu, exit_code) || kvm_handle_pvm_sysreg(vcpu, exit_code)); @@ -191,7 +226,6 @@ static const exit_handler_fn hyp_exit_handlers[] = { [ESR_ELx_EC_IABT_LOW] = kvm_hyp_handle_iabt_low, [ESR_ELx_EC_DABT_LOW] = kvm_hyp_handle_dabt_low, [ESR_ELx_EC_WATCHPT_LOW] = kvm_hyp_handle_watchpt_low, - [ESR_ELx_EC_PAC] = kvm_hyp_handle_ptrauth, [ESR_ELx_EC_MOPS] = kvm_hyp_handle_mops, }; @@ -203,34 +237,33 @@ static const exit_handler_fn pvm_exit_handlers[] = { [ESR_ELx_EC_IABT_LOW] = kvm_hyp_handle_iabt_low, [ESR_ELx_EC_DABT_LOW] = kvm_hyp_handle_dabt_low, [ESR_ELx_EC_WATCHPT_LOW] = kvm_hyp_handle_watchpt_low, - [ESR_ELx_EC_PAC] = kvm_hyp_handle_ptrauth, [ESR_ELx_EC_MOPS] = kvm_hyp_handle_mops, }; static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu) { - if (unlikely(kvm_vm_is_protected(kern_hyp_va(vcpu->kvm)))) + if (unlikely(vcpu_is_protected(vcpu))) return pvm_exit_handlers; return hyp_exit_handlers; } -/* - * Some guests (e.g., protected VMs) are not be allowed to run in AArch32. - * The ARMv8 architecture does not give the hypervisor a mechanism to prevent a - * guest from dropping to AArch32 EL0 if implemented by the CPU. If the - * hypervisor spots a guest in such a state ensure it is handled, and don't - * trust the host to spot or fix it. The check below is based on the one in - * kvm_arch_vcpu_ioctl_run(). - * - * Returns false if the guest ran in AArch32 when it shouldn't have, and - * thus should exit to the host, or true if a the guest run loop can continue. - */ -static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) { - struct kvm *kvm = kern_hyp_va(vcpu->kvm); + const exit_handler_fn *handlers = kvm_get_exit_handler_array(vcpu); + + synchronize_vcpu_pstate(vcpu, exit_code); - if (kvm_vm_is_protected(kvm) && vcpu_mode_is_32bit(vcpu)) { + /* + * Some guests (e.g., protected VMs) are not be allowed to run in + * AArch32. The ARMv8 architecture does not give the hypervisor a + * mechanism to prevent a guest from dropping to AArch32 EL0 if + * implemented by the CPU. If the hypervisor spots a guest in such a + * state ensure it is handled, and don't trust the host to spot or fix + * it. The check below is based on the one in + * kvm_arch_vcpu_ioctl_run(). + */ + if (unlikely(vcpu_is_protected(vcpu) && vcpu_mode_is_32bit(vcpu))) { /* * As we have caught the guest red-handed, decide that it isn't * fit for purpose anymore by making the vcpu invalid. The VMM @@ -242,6 +275,8 @@ static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code) *exit_code &= BIT(ARM_EXIT_WITH_SERROR_BIT); *exit_code |= ARM_EXCEPTION_IL; } + + return __fixup_guest_exit(vcpu, exit_code, handlers); } /* Switch to the guest for legacy non-VHE systems */ @@ -264,7 +299,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) pmr_sync(); } - host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; + host_ctxt = host_data_ptr(host_ctxt); host_ctxt->__hyp_running_vcpu = vcpu; guest_ctxt = &vcpu->arch.ctxt; @@ -337,7 +372,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) __sysreg_restore_state_nvhe(host_ctxt); - if (vcpu->arch.fp_state == FP_STATE_GUEST_OWNED) + if (guest_owns_fp_regs()) __fpsimd_save_fpexc32(vcpu); __debug_switch_to_host(vcpu); @@ -367,7 +402,7 @@ asmlinkage void __noreturn hyp_panic(void) struct kvm_cpu_context *host_ctxt; struct kvm_vcpu *vcpu; - host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; + host_ctxt = host_data_ptr(host_ctxt); vcpu = host_ctxt->__hyp_running_vcpu; if (vcpu) { diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c b/arch/arm64/kvm/hyp/nvhe/sys_regs.c index edd969a1f36b..1ddd9ed3cbb3 100644 --- a/arch/arm64/kvm/hyp/nvhe/sys_regs.c +++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c @@ -11,7 +11,7 @@ #include <hyp/adjust_pc.h> -#include <nvhe/fixed_config.h> +#include <nvhe/pkvm.h> #include "../../sys_regs.h" @@ -28,222 +28,255 @@ u64 id_aa64mmfr1_el1_sys_val; u64 id_aa64mmfr2_el1_sys_val; u64 id_aa64smfr0_el1_sys_val; -/* - * Inject an unknown/undefined exception to an AArch64 guest while most of its - * sysregs are live. - */ -static void inject_undef64(struct kvm_vcpu *vcpu) -{ - u64 esr = (ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT); - - *vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR); - *vcpu_cpsr(vcpu) = read_sysreg_el2(SYS_SPSR); - - kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC); - - __kvm_adjust_pc(vcpu); - - write_sysreg_el1(esr, SYS_ESR); - write_sysreg_el1(read_sysreg_el2(SYS_ELR), SYS_ELR); - write_sysreg_el2(*vcpu_pc(vcpu), SYS_ELR); - write_sysreg_el2(*vcpu_cpsr(vcpu), SYS_SPSR); -} - -/* - * Returns the restricted features values of the feature register based on the - * limitations in restrict_fields. - * A feature id field value of 0b0000 does not impose any restrictions. - * Note: Use only for unsigned feature field values. - */ -static u64 get_restricted_features_unsigned(u64 sys_reg_val, - u64 restrict_fields) -{ - u64 value = 0UL; - u64 mask = GENMASK_ULL(ARM64_FEATURE_FIELD_BITS - 1, 0); +struct pvm_ftr_bits { + bool sign; + u8 shift; + u8 width; + u8 max_val; + bool (*vm_supported)(const struct kvm *kvm); +}; - /* - * According to the Arm Architecture Reference Manual, feature fields - * use increasing values to indicate increases in functionality. - * Iterate over the restricted feature fields and calculate the minimum - * unsigned value between the one supported by the system, and what the - * value is being restricted to. - */ - while (sys_reg_val && restrict_fields) { - value |= min(sys_reg_val & mask, restrict_fields & mask); - sys_reg_val &= ~mask; - restrict_fields &= ~mask; - mask <<= ARM64_FEATURE_FIELD_BITS; +#define __MAX_FEAT_FUNC(id, fld, max, func, sgn) \ + { \ + .sign = sgn, \ + .shift = id##_##fld##_SHIFT, \ + .width = id##_##fld##_WIDTH, \ + .max_val = id##_##fld##_##max, \ + .vm_supported = func, \ } - return value; -} - -/* - * Functions that return the value of feature id registers for protected VMs - * based on allowed features, system features, and KVM support. - */ +#define MAX_FEAT_FUNC(id, fld, max, func) \ + __MAX_FEAT_FUNC(id, fld, max, func, id##_##fld##_SIGNED) -static u64 get_pvm_id_aa64pfr0(const struct kvm_vcpu *vcpu) -{ - u64 set_mask = 0; - u64 allow_mask = PVM_ID_AA64PFR0_ALLOW; +#define MAX_FEAT(id, fld, max) \ + MAX_FEAT_FUNC(id, fld, max, NULL) - set_mask |= get_restricted_features_unsigned(id_aa64pfr0_el1_sys_val, - PVM_ID_AA64PFR0_RESTRICT_UNSIGNED); +#define MAX_FEAT_ENUM(id, fld, max) \ + __MAX_FEAT_FUNC(id, fld, max, NULL, false) - return (id_aa64pfr0_el1_sys_val & allow_mask) | set_mask; -} +#define FEAT_END { .width = 0, } -static u64 get_pvm_id_aa64pfr1(const struct kvm_vcpu *vcpu) +static bool vm_has_ptrauth(const struct kvm *kvm) { - const struct kvm *kvm = (const struct kvm *)kern_hyp_va(vcpu->kvm); - u64 allow_mask = PVM_ID_AA64PFR1_ALLOW; - - if (!kvm_has_mte(kvm)) - allow_mask &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE); - - return id_aa64pfr1_el1_sys_val & allow_mask; -} - -static u64 get_pvm_id_aa64zfr0(const struct kvm_vcpu *vcpu) -{ - /* - * No support for Scalable Vectors, therefore, hyp has no sanitized - * copy of the feature id register. - */ - BUILD_BUG_ON(PVM_ID_AA64ZFR0_ALLOW != 0ULL); - return 0; -} - -static u64 get_pvm_id_aa64dfr0(const struct kvm_vcpu *vcpu) -{ - /* - * No support for debug, including breakpoints, and watchpoints, - * therefore, pKVM has no sanitized copy of the feature id register. - */ - BUILD_BUG_ON(PVM_ID_AA64DFR0_ALLOW != 0ULL); - return 0; -} - -static u64 get_pvm_id_aa64dfr1(const struct kvm_vcpu *vcpu) -{ - /* - * No support for debug, therefore, hyp has no sanitized copy of the - * feature id register. - */ - BUILD_BUG_ON(PVM_ID_AA64DFR1_ALLOW != 0ULL); - return 0; -} + if (!IS_ENABLED(CONFIG_ARM64_PTR_AUTH)) + return false; -static u64 get_pvm_id_aa64afr0(const struct kvm_vcpu *vcpu) -{ - /* - * No support for implementation defined features, therefore, hyp has no - * sanitized copy of the feature id register. - */ - BUILD_BUG_ON(PVM_ID_AA64AFR0_ALLOW != 0ULL); - return 0; + return (cpus_have_final_cap(ARM64_HAS_ADDRESS_AUTH) || + cpus_have_final_cap(ARM64_HAS_GENERIC_AUTH)) && + kvm_vcpu_has_feature(kvm, KVM_ARM_VCPU_PTRAUTH_GENERIC); } -static u64 get_pvm_id_aa64afr1(const struct kvm_vcpu *vcpu) +static bool vm_has_sve(const struct kvm *kvm) { - /* - * No support for implementation defined features, therefore, hyp has no - * sanitized copy of the feature id register. - */ - BUILD_BUG_ON(PVM_ID_AA64AFR1_ALLOW != 0ULL); - return 0; + return system_supports_sve() && kvm_vcpu_has_feature(kvm, KVM_ARM_VCPU_SVE); } -static u64 get_pvm_id_aa64isar0(const struct kvm_vcpu *vcpu) -{ - return id_aa64isar0_el1_sys_val & PVM_ID_AA64ISAR0_ALLOW; -} +/* + * Definitions for features to be allowed or restricted for protected guests. + * + * Each field in the masks represents the highest supported value for the + * feature. If a feature field is not present, it is not supported. Moreover, + * these are used to generate the guest's view of the feature registers. + * + * The approach for protected VMs is to at least support features that are: + * - Needed by common Linux distributions (e.g., floating point) + * - Trivial to support, e.g., supporting the feature does not introduce or + * require tracking of additional state in KVM + * - Cannot be trapped or prevent the guest from using anyway + */ -static u64 get_pvm_id_aa64isar1(const struct kvm_vcpu *vcpu) -{ - u64 allow_mask = PVM_ID_AA64ISAR1_ALLOW; +static const struct pvm_ftr_bits pvmid_aa64pfr0[] = { + MAX_FEAT(ID_AA64PFR0_EL1, EL0, IMP), + MAX_FEAT(ID_AA64PFR0_EL1, EL1, IMP), + MAX_FEAT(ID_AA64PFR0_EL1, EL2, IMP), + MAX_FEAT(ID_AA64PFR0_EL1, EL3, IMP), + MAX_FEAT(ID_AA64PFR0_EL1, FP, FP16), + MAX_FEAT(ID_AA64PFR0_EL1, AdvSIMD, FP16), + MAX_FEAT(ID_AA64PFR0_EL1, GIC, IMP), + MAX_FEAT_FUNC(ID_AA64PFR0_EL1, SVE, IMP, vm_has_sve), + MAX_FEAT(ID_AA64PFR0_EL1, RAS, IMP), + MAX_FEAT(ID_AA64PFR0_EL1, DIT, IMP), + MAX_FEAT(ID_AA64PFR0_EL1, CSV2, IMP), + MAX_FEAT(ID_AA64PFR0_EL1, CSV3, IMP), + FEAT_END +}; - if (!vcpu_has_ptrauth(vcpu)) - allow_mask &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA) | - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API) | - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPA) | - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI)); +static const struct pvm_ftr_bits pvmid_aa64pfr1[] = { + MAX_FEAT(ID_AA64PFR1_EL1, BT, IMP), + MAX_FEAT(ID_AA64PFR1_EL1, SSBS, SSBS2), + MAX_FEAT_ENUM(ID_AA64PFR1_EL1, MTE_frac, NI), + FEAT_END +}; - return id_aa64isar1_el1_sys_val & allow_mask; -} +static const struct pvm_ftr_bits pvmid_aa64mmfr0[] = { + MAX_FEAT_ENUM(ID_AA64MMFR0_EL1, PARANGE, 40), + MAX_FEAT_ENUM(ID_AA64MMFR0_EL1, ASIDBITS, 16), + MAX_FEAT(ID_AA64MMFR0_EL1, BIGEND, IMP), + MAX_FEAT(ID_AA64MMFR0_EL1, SNSMEM, IMP), + MAX_FEAT(ID_AA64MMFR0_EL1, BIGENDEL0, IMP), + MAX_FEAT(ID_AA64MMFR0_EL1, EXS, IMP), + FEAT_END +}; -static u64 get_pvm_id_aa64isar2(const struct kvm_vcpu *vcpu) -{ - u64 allow_mask = PVM_ID_AA64ISAR2_ALLOW; +static const struct pvm_ftr_bits pvmid_aa64mmfr1[] = { + MAX_FEAT(ID_AA64MMFR1_EL1, HAFDBS, DBM), + MAX_FEAT_ENUM(ID_AA64MMFR1_EL1, VMIDBits, 16), + MAX_FEAT(ID_AA64MMFR1_EL1, HPDS, HPDS2), + MAX_FEAT(ID_AA64MMFR1_EL1, PAN, PAN3), + MAX_FEAT(ID_AA64MMFR1_EL1, SpecSEI, IMP), + MAX_FEAT(ID_AA64MMFR1_EL1, ETS, IMP), + MAX_FEAT(ID_AA64MMFR1_EL1, CMOW, IMP), + FEAT_END +}; - if (!vcpu_has_ptrauth(vcpu)) - allow_mask &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_APA3) | - ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_GPA3)); +static const struct pvm_ftr_bits pvmid_aa64mmfr2[] = { + MAX_FEAT(ID_AA64MMFR2_EL1, CnP, IMP), + MAX_FEAT(ID_AA64MMFR2_EL1, UAO, IMP), + MAX_FEAT(ID_AA64MMFR2_EL1, IESB, IMP), + MAX_FEAT(ID_AA64MMFR2_EL1, AT, IMP), + MAX_FEAT_ENUM(ID_AA64MMFR2_EL1, IDS, 0x18), + MAX_FEAT(ID_AA64MMFR2_EL1, TTL, IMP), + MAX_FEAT(ID_AA64MMFR2_EL1, BBM, 2), + MAX_FEAT(ID_AA64MMFR2_EL1, E0PD, IMP), + FEAT_END +}; - return id_aa64isar2_el1_sys_val & allow_mask; -} +static const struct pvm_ftr_bits pvmid_aa64isar1[] = { + MAX_FEAT(ID_AA64ISAR1_EL1, DPB, DPB2), + MAX_FEAT_FUNC(ID_AA64ISAR1_EL1, APA, PAuth, vm_has_ptrauth), + MAX_FEAT_FUNC(ID_AA64ISAR1_EL1, API, PAuth, vm_has_ptrauth), + MAX_FEAT(ID_AA64ISAR1_EL1, JSCVT, IMP), + MAX_FEAT(ID_AA64ISAR1_EL1, FCMA, IMP), + MAX_FEAT(ID_AA64ISAR1_EL1, LRCPC, LRCPC3), + MAX_FEAT(ID_AA64ISAR1_EL1, GPA, IMP), + MAX_FEAT(ID_AA64ISAR1_EL1, GPI, IMP), + MAX_FEAT(ID_AA64ISAR1_EL1, FRINTTS, IMP), + MAX_FEAT(ID_AA64ISAR1_EL1, SB, IMP), + MAX_FEAT(ID_AA64ISAR1_EL1, SPECRES, COSP_RCTX), + MAX_FEAT(ID_AA64ISAR1_EL1, BF16, EBF16), + MAX_FEAT(ID_AA64ISAR1_EL1, DGH, IMP), + MAX_FEAT(ID_AA64ISAR1_EL1, I8MM, IMP), + FEAT_END +}; -static u64 get_pvm_id_aa64mmfr0(const struct kvm_vcpu *vcpu) -{ - u64 set_mask; +static const struct pvm_ftr_bits pvmid_aa64isar2[] = { + MAX_FEAT_FUNC(ID_AA64ISAR2_EL1, GPA3, IMP, vm_has_ptrauth), + MAX_FEAT_FUNC(ID_AA64ISAR2_EL1, APA3, PAuth, vm_has_ptrauth), + MAX_FEAT(ID_AA64ISAR2_EL1, ATS1A, IMP), + FEAT_END +}; - set_mask = get_restricted_features_unsigned(id_aa64mmfr0_el1_sys_val, - PVM_ID_AA64MMFR0_RESTRICT_UNSIGNED); +/* + * None of the features in ID_AA64DFR0_EL1 nor ID_AA64MMFR4_EL1 are supported. + * However, both have Not-Implemented values that are non-zero. Define them + * so they can be used when getting the value of these registers. + */ +#define ID_AA64DFR0_EL1_NONZERO_NI \ +( \ + SYS_FIELD_PREP_ENUM(ID_AA64DFR0_EL1, DoubleLock, NI) | \ + SYS_FIELD_PREP_ENUM(ID_AA64DFR0_EL1, MTPMU, NI) \ +) - return (id_aa64mmfr0_el1_sys_val & PVM_ID_AA64MMFR0_ALLOW) | set_mask; -} +#define ID_AA64MMFR4_EL1_NONZERO_NI \ + SYS_FIELD_PREP_ENUM(ID_AA64MMFR4_EL1, E2H0, NI) -static u64 get_pvm_id_aa64mmfr1(const struct kvm_vcpu *vcpu) +/* + * Returns the value of the feature registers based on the system register + * value, the vcpu support for the revelant features, and the additional + * restrictions for protected VMs. + */ +static u64 get_restricted_features(const struct kvm_vcpu *vcpu, + u64 sys_reg_val, + const struct pvm_ftr_bits restrictions[]) { - return id_aa64mmfr1_el1_sys_val & PVM_ID_AA64MMFR1_ALLOW; -} + u64 val = 0UL; + int i; + + for (i = 0; restrictions[i].width != 0; i++) { + bool (*vm_supported)(const struct kvm *) = restrictions[i].vm_supported; + bool sign = restrictions[i].sign; + int shift = restrictions[i].shift; + int width = restrictions[i].width; + u64 min_signed = (1UL << width) - 1UL; + u64 sign_bit = 1UL << (width - 1); + u64 mask = GENMASK_ULL(width + shift - 1, shift); + u64 sys_val = (sys_reg_val & mask) >> shift; + u64 pvm_max = restrictions[i].max_val; + + if (vm_supported && !vm_supported(vcpu->kvm)) + val |= (sign ? min_signed : 0) << shift; + else if (sign && (sys_val >= sign_bit || pvm_max >= sign_bit)) + val |= max(sys_val, pvm_max) << shift; + else + val |= min(sys_val, pvm_max) << shift; + } -static u64 get_pvm_id_aa64mmfr2(const struct kvm_vcpu *vcpu) -{ - return id_aa64mmfr2_el1_sys_val & PVM_ID_AA64MMFR2_ALLOW; + return val; } -/* Read a sanitized cpufeature ID register by its encoding */ -u64 pvm_read_id_reg(const struct kvm_vcpu *vcpu, u32 id) +static u64 pvm_calc_id_reg(const struct kvm_vcpu *vcpu, u32 id) { switch (id) { case SYS_ID_AA64PFR0_EL1: - return get_pvm_id_aa64pfr0(vcpu); + return get_restricted_features(vcpu, id_aa64pfr0_el1_sys_val, pvmid_aa64pfr0); case SYS_ID_AA64PFR1_EL1: - return get_pvm_id_aa64pfr1(vcpu); - case SYS_ID_AA64ZFR0_EL1: - return get_pvm_id_aa64zfr0(vcpu); - case SYS_ID_AA64DFR0_EL1: - return get_pvm_id_aa64dfr0(vcpu); - case SYS_ID_AA64DFR1_EL1: - return get_pvm_id_aa64dfr1(vcpu); - case SYS_ID_AA64AFR0_EL1: - return get_pvm_id_aa64afr0(vcpu); - case SYS_ID_AA64AFR1_EL1: - return get_pvm_id_aa64afr1(vcpu); + return get_restricted_features(vcpu, id_aa64pfr1_el1_sys_val, pvmid_aa64pfr1); case SYS_ID_AA64ISAR0_EL1: - return get_pvm_id_aa64isar0(vcpu); + return id_aa64isar0_el1_sys_val; case SYS_ID_AA64ISAR1_EL1: - return get_pvm_id_aa64isar1(vcpu); + return get_restricted_features(vcpu, id_aa64isar1_el1_sys_val, pvmid_aa64isar1); case SYS_ID_AA64ISAR2_EL1: - return get_pvm_id_aa64isar2(vcpu); + return get_restricted_features(vcpu, id_aa64isar2_el1_sys_val, pvmid_aa64isar2); case SYS_ID_AA64MMFR0_EL1: - return get_pvm_id_aa64mmfr0(vcpu); + return get_restricted_features(vcpu, id_aa64mmfr0_el1_sys_val, pvmid_aa64mmfr0); case SYS_ID_AA64MMFR1_EL1: - return get_pvm_id_aa64mmfr1(vcpu); + return get_restricted_features(vcpu, id_aa64mmfr1_el1_sys_val, pvmid_aa64mmfr1); case SYS_ID_AA64MMFR2_EL1: - return get_pvm_id_aa64mmfr2(vcpu); + return get_restricted_features(vcpu, id_aa64mmfr2_el1_sys_val, pvmid_aa64mmfr2); + case SYS_ID_AA64DFR0_EL1: + return ID_AA64DFR0_EL1_NONZERO_NI; + case SYS_ID_AA64MMFR4_EL1: + return ID_AA64MMFR4_EL1_NONZERO_NI; default: /* Unhandled ID register, RAZ */ return 0; } } +/* + * Inject an unknown/undefined exception to an AArch64 guest while most of its + * sysregs are live. + */ +static void inject_undef64(struct kvm_vcpu *vcpu) +{ + u64 esr = (ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT); + + *vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR); + *vcpu_cpsr(vcpu) = read_sysreg_el2(SYS_SPSR); + + kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC); + + __kvm_adjust_pc(vcpu); + + write_sysreg_el1(esr, SYS_ESR); + write_sysreg_el1(read_sysreg_el2(SYS_ELR), SYS_ELR); + write_sysreg_el2(*vcpu_pc(vcpu), SYS_ELR); + write_sysreg_el2(*vcpu_cpsr(vcpu), SYS_SPSR); +} + static u64 read_id_reg(const struct kvm_vcpu *vcpu, struct sys_reg_desc const *r) { - return pvm_read_id_reg(vcpu, reg_to_encoding(r)); + struct kvm *kvm = vcpu->kvm; + u32 reg = reg_to_encoding(r); + + if (WARN_ON_ONCE(!test_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags))) + return 0; + + if (reg >= sys_reg(3, 0, 0, 1, 0) && reg <= sys_reg(3, 0, 0, 7, 7)) + return kvm->arch.id_regs[IDREG_IDX(reg)]; + + return 0; } /* Handler to RAZ/WI sysregs */ @@ -271,13 +304,6 @@ static bool pvm_access_id_aarch32(struct kvm_vcpu *vcpu, return false; } - /* - * No support for AArch32 guests, therefore, pKVM has no sanitized copy - * of AArch32 feature id registers. - */ - BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL1), - PVM_ID_AA64PFR0_RESTRICT_UNSIGNED) > ID_AA64PFR0_EL1_ELx_64BIT_ONLY); - return pvm_access_raz_wi(vcpu, p, r); } @@ -449,6 +475,30 @@ static const struct sys_reg_desc pvm_sys_reg_descs[] = { }; /* + * Initializes feature registers for protected vms. + */ +void kvm_init_pvm_id_regs(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_arch *ka = &kvm->arch; + u32 r; + + hyp_assert_lock_held(&vm_table_lock); + + if (test_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags)) + return; + + /* + * Initialize only AArch64 id registers since AArch32 isn't supported + * for protected VMs. + */ + for (r = sys_reg(3, 0, 0, 4, 0); r <= sys_reg(3, 0, 0, 7, 7); r += sys_reg(0, 0, 0, 0, 1)) + ka->id_regs[IDREG_IDX(r)] = pvm_calc_id_reg(vcpu, r); + + set_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags); +} + +/* * Checks that the sysreg table is unique and in-order. * * Returns 0 if the table is consistent, or 1 otherwise. diff --git a/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c b/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c index 29305022bc04..3cc613cce5f5 100644 --- a/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c @@ -28,7 +28,9 @@ void __sysreg_save_state_nvhe(struct kvm_cpu_context *ctxt) void __sysreg_restore_state_nvhe(struct kvm_cpu_context *ctxt) { - __sysreg_restore_el1_state(ctxt); + u64 midr = ctxt_midr_el1(ctxt); + + __sysreg_restore_el1_state(ctxt, midr, ctxt_sys_reg(ctxt, MPIDR_EL1)); __sysreg_restore_common_state(ctxt); __sysreg_restore_user_state(ctxt); __sysreg_restore_el2_return_state(ctxt); diff --git a/arch/arm64/kvm/hyp/nvhe/timer-sr.c b/arch/arm64/kvm/hyp/nvhe/timer-sr.c index 3aaab20ae5b4..ff176f4ce7de 100644 --- a/arch/arm64/kvm/hyp/nvhe/timer-sr.c +++ b/arch/arm64/kvm/hyp/nvhe/timer-sr.c @@ -22,15 +22,16 @@ void __kvm_timer_set_cntvoff(u64 cntvoff) */ void __timer_disable_traps(struct kvm_vcpu *vcpu) { - u64 val, shift = 0; + u64 set, clr, shift = 0; if (has_hvhe()) shift = 10; /* Allow physical timer/counter access for the host */ - val = read_sysreg(cnthctl_el2); - val |= (CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN) << shift; - write_sysreg(val, cnthctl_el2); + set = (CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN) << shift; + clr = CNTHCTL_EL1TVT | CNTHCTL_EL1TVCT; + + sysreg_clear_set(cnthctl_el2, clr, set); } /* @@ -58,5 +59,12 @@ void __timer_enable_traps(struct kvm_vcpu *vcpu) set <<= 10; } + /* + * Trap the virtual counter/timer if we have a broken cntvoff + * implementation. + */ + if (has_broken_cntvoff()) + set |= CNTHCTL_EL1TVT | CNTHCTL_EL1TVCT; + sysreg_clear_set(cnthctl_el2, clr, set); } diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c index a60fb13e2192..48da9ca9763f 100644 --- a/arch/arm64/kvm/hyp/nvhe/tlb.c +++ b/arch/arm64/kvm/hyp/nvhe/tlb.c @@ -11,13 +11,23 @@ #include <nvhe/mem_protect.h> struct tlb_inv_context { - u64 tcr; + struct kvm_s2_mmu *mmu; + u64 tcr; + u64 sctlr; }; -static void __tlb_switch_to_guest(struct kvm_s2_mmu *mmu, - struct tlb_inv_context *cxt, - bool nsh) +static void enter_vmid_context(struct kvm_s2_mmu *mmu, + struct tlb_inv_context *cxt, + bool nsh) { + struct kvm_s2_mmu *host_s2_mmu = &host_mmu.arch.mmu; + struct kvm_cpu_context *host_ctxt; + struct kvm_vcpu *vcpu; + + host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; + vcpu = host_ctxt->__hyp_running_vcpu; + cxt->mmu = NULL; + /* * We have two requirements: * @@ -40,20 +50,55 @@ static void __tlb_switch_to_guest(struct kvm_s2_mmu *mmu, else dsb(ish); + /* + * If we're already in the desired context, then there's nothing to do. + */ + if (vcpu) { + /* + * We're in guest context. However, for this to work, this needs + * to be called from within __kvm_vcpu_run(), which ensures that + * __hyp_running_vcpu is set to the current guest vcpu. + */ + if (mmu == vcpu->arch.hw_mmu || WARN_ON(mmu != host_s2_mmu)) + return; + + cxt->mmu = vcpu->arch.hw_mmu; + } else { + /* We're in host context. */ + if (mmu == host_s2_mmu) + return; + + cxt->mmu = host_s2_mmu; + } + if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { u64 val; /* * For CPUs that are affected by ARM 1319367, we need to - * avoid a host Stage-1 walk while we have the guest's - * VMID set in the VTTBR in order to invalidate TLBs. - * We're guaranteed that the S1 MMU is enabled, so we can - * simply set the EPD bits to avoid any further TLB fill. + * avoid a Stage-1 walk with the old VMID while we have + * the new VMID set in the VTTBR in order to invalidate TLBs. + * We're guaranteed that the host S1 MMU is enabled, so + * we can simply set the EPD bits to avoid any further + * TLB fill. For guests, we ensure that the S1 MMU is + * temporarily enabled in the next context. */ val = cxt->tcr = read_sysreg_el1(SYS_TCR); val |= TCR_EPD1_MASK | TCR_EPD0_MASK; write_sysreg_el1(val, SYS_TCR); isb(); + + if (vcpu) { + val = cxt->sctlr = read_sysreg_el1(SYS_SCTLR); + if (!(val & SCTLR_ELx_M)) { + val |= SCTLR_ELx_M; + write_sysreg_el1(val, SYS_SCTLR); + isb(); + } + } else { + /* The host S1 MMU is always enabled. */ + cxt->sctlr = SCTLR_ELx_M; + } } /* @@ -62,18 +107,40 @@ static void __tlb_switch_to_guest(struct kvm_s2_mmu *mmu, * ensuring that we always have an ISB, but not two ISBs back * to back. */ - __load_stage2(mmu, kern_hyp_va(mmu->arch)); + if (vcpu) + __load_host_stage2(); + else + __load_stage2(mmu, kern_hyp_va(mmu->arch)); + asm(ALTERNATIVE("isb", "nop", ARM64_WORKAROUND_SPECULATIVE_AT)); } -static void __tlb_switch_to_host(struct tlb_inv_context *cxt) +static void exit_vmid_context(struct tlb_inv_context *cxt) { - __load_host_stage2(); + struct kvm_s2_mmu *mmu = cxt->mmu; + struct kvm_cpu_context *host_ctxt; + struct kvm_vcpu *vcpu; + + host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; + vcpu = host_ctxt->__hyp_running_vcpu; + + if (!mmu) + return; + + if (vcpu) + __load_stage2(mmu, kern_hyp_va(mmu->arch)); + else + __load_host_stage2(); + + /* Ensure write of the old VMID */ + isb(); if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { - /* Ensure write of the host VMID */ - isb(); - /* Restore the host's TCR_EL1 */ + if (!(cxt->sctlr & SCTLR_ELx_M)) { + write_sysreg_el1(cxt->sctlr, SYS_SCTLR); + isb(); + } + write_sysreg_el1(cxt->tcr, SYS_TCR); } } @@ -84,7 +151,7 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, struct tlb_inv_context cxt; /* Switch to requested VMID */ - __tlb_switch_to_guest(mmu, &cxt, false); + enter_vmid_context(mmu, &cxt, false); /* * We could do so much better if we had the VA as well. @@ -105,7 +172,7 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, dsb(ish); isb(); - __tlb_switch_to_host(&cxt); + exit_vmid_context(&cxt); } void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu, @@ -114,7 +181,7 @@ void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu, struct tlb_inv_context cxt; /* Switch to requested VMID */ - __tlb_switch_to_guest(mmu, &cxt, true); + enter_vmid_context(mmu, &cxt, true); /* * We could do so much better if we had the VA as well. @@ -135,7 +202,7 @@ void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu, dsb(nsh); isb(); - __tlb_switch_to_host(&cxt); + exit_vmid_context(&cxt); } void __kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu, @@ -152,16 +219,17 @@ void __kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu, start = round_down(start, stride); /* Switch to requested VMID */ - __tlb_switch_to_guest(mmu, &cxt, false); + enter_vmid_context(mmu, &cxt, false); - __flush_s2_tlb_range_op(ipas2e1is, start, pages, stride, 0); + __flush_s2_tlb_range_op(ipas2e1is, start, pages, stride, + TLBI_TTL_UNKNOWN); dsb(ish); __tlbi(vmalle1is); dsb(ish); isb(); - __tlb_switch_to_host(&cxt); + exit_vmid_context(&cxt); } void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu) @@ -169,13 +237,13 @@ void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu) struct tlb_inv_context cxt; /* Switch to requested VMID */ - __tlb_switch_to_guest(mmu, &cxt, false); + enter_vmid_context(mmu, &cxt, false); __tlbi(vmalls12e1is); dsb(ish); isb(); - __tlb_switch_to_host(&cxt); + exit_vmid_context(&cxt); } void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu) @@ -183,19 +251,19 @@ void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu) struct tlb_inv_context cxt; /* Switch to requested VMID */ - __tlb_switch_to_guest(mmu, &cxt, false); + enter_vmid_context(mmu, &cxt, false); __tlbi(vmalle1); asm volatile("ic iallu"); dsb(nsh); isb(); - __tlb_switch_to_host(&cxt); + exit_vmid_context(&cxt); } void __kvm_flush_vm_context(void) { - /* Same remark as in __tlb_switch_to_guest() */ + /* Same remark as in enter_vmid_context() */ dsb(ish); __tlbi(alle1is); dsb(ish); diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 3fae5830f8d2..df5cc74a7dd0 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -17,48 +17,6 @@ #define KVM_PTE_TYPE_PAGE 1 #define KVM_PTE_TYPE_TABLE 1 -#define KVM_PTE_LEAF_ATTR_LO GENMASK(11, 2) - -#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2) -#define KVM_PTE_LEAF_ATTR_LO_S1_AP GENMASK(7, 6) -#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO \ - ({ cpus_have_final_cap(ARM64_KVM_HVHE) ? 2 : 3; }) -#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW \ - ({ cpus_have_final_cap(ARM64_KVM_HVHE) ? 0 : 1; }) -#define KVM_PTE_LEAF_ATTR_LO_S1_SH GENMASK(9, 8) -#define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS 3 -#define KVM_PTE_LEAF_ATTR_LO_S1_AF BIT(10) - -#define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR GENMASK(5, 2) -#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R BIT(6) -#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W BIT(7) -#define KVM_PTE_LEAF_ATTR_LO_S2_SH GENMASK(9, 8) -#define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS 3 -#define KVM_PTE_LEAF_ATTR_LO_S2_AF BIT(10) - -#define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 50) - -#define KVM_PTE_LEAF_ATTR_HI_SW GENMASK(58, 55) - -#define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54) - -#define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54) - -#define KVM_PTE_LEAF_ATTR_HI_S1_GP BIT(50) - -#define KVM_PTE_LEAF_ATTR_S2_PERMS (KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \ - KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \ - KVM_PTE_LEAF_ATTR_HI_S2_XN) - -#define KVM_INVALID_PTE_OWNER_MASK GENMASK(9, 2) -#define KVM_MAX_OWNER_ID 1 - -/* - * Used to indicate a pte for which a 'break-before-make' sequence is in - * progress. - */ -#define KVM_INVALID_PTE_LOCKED BIT(10) - struct kvm_pgtable_walk_data { struct kvm_pgtable_walker *walker; @@ -77,14 +35,6 @@ static bool kvm_pgtable_walk_skip_cmo(const struct kvm_pgtable_visit_ctx *ctx) return unlikely(ctx->flags & KVM_PGTABLE_WALK_SKIP_CMO); } -static bool kvm_phys_is_valid(u64 phys) -{ - u64 parange_max = kvm_get_parange_max(); - u8 shift = id_aa64mmfr0_parange_to_phys_shift(parange_max); - - return phys < BIT(shift); -} - static bool kvm_block_mapping_supported(const struct kvm_pgtable_visit_ctx *ctx, u64 phys) { u64 granule = kvm_granule_size(ctx->level); @@ -95,7 +45,7 @@ static bool kvm_block_mapping_supported(const struct kvm_pgtable_visit_ctx *ctx, if (granule > (ctx->end - ctx->addr)) return false; - if (kvm_phys_is_valid(phys) && !IS_ALIGNED(phys, granule)) + if (!IS_ALIGNED(phys, granule)) return false; return IS_ALIGNED(ctx->addr, granule); @@ -528,7 +478,7 @@ static int hyp_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx, kvm_clear_pte(ctx->ptep); dsb(ishst); - __tlbi_level(vae2is, __TLBI_VADDR(ctx->addr, 0), ctx->level); + __tlbi_level(vae2is, __TLBI_VADDR(ctx->addr, 0), TLBI_TTL_UNKNOWN); } else { if (ctx->end - ctx->addr < granule) return -EINVAL; @@ -629,6 +579,9 @@ struct stage2_map_data { /* Force mappings to page granularity */ bool force_pte; + + /* Walk should update owner_id only */ + bool annotation; }; u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift) @@ -843,12 +796,15 @@ static bool stage2_try_break_pte(const struct kvm_pgtable_visit_ctx *ctx, * Perform the appropriate TLB invalidation based on the * evicted pte value (if any). */ - if (kvm_pte_table(ctx->old, ctx->level)) - kvm_tlb_flush_vmid_range(mmu, ctx->addr, - kvm_granule_size(ctx->level)); - else if (kvm_pte_valid(ctx->old)) + if (kvm_pte_table(ctx->old, ctx->level)) { + u64 size = kvm_granule_size(ctx->level); + u64 addr = ALIGN_DOWN(ctx->addr, size); + + kvm_tlb_flush_vmid_range(mmu, addr, size); + } else if (kvm_pte_valid(ctx->old)) { kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, ctx->level); + } } if (stage2_pte_is_counted(ctx->old)) @@ -896,9 +852,13 @@ static void stage2_unmap_put_pte(const struct kvm_pgtable_visit_ctx *ctx, if (kvm_pte_valid(ctx->old)) { kvm_clear_pte(ctx->ptep); - if (!stage2_unmap_defer_tlb_flush(pgt)) - kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, - ctx->addr, ctx->level); + if (kvm_pte_table(ctx->old, ctx->level)) { + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, + TLBI_TTL_UNKNOWN); + } else if (!stage2_unmap_defer_tlb_flush(pgt)) { + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, + ctx->level); + } } mm_ops->put_page(ctx->ptep); @@ -907,12 +867,12 @@ static void stage2_unmap_put_pte(const struct kvm_pgtable_visit_ctx *ctx, static bool stage2_pte_cacheable(struct kvm_pgtable *pgt, kvm_pte_t pte) { u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR; - return memattr == KVM_S2_MEMATTR(pgt, NORMAL); + return kvm_pte_valid(pte) && memattr == KVM_S2_MEMATTR(pgt, NORMAL); } static bool stage2_pte_executable(kvm_pte_t pte) { - return !(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN); + return kvm_pte_valid(pte) && !(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN); } static u64 stage2_map_walker_phys_addr(const struct kvm_pgtable_visit_ctx *ctx, @@ -920,18 +880,7 @@ static u64 stage2_map_walker_phys_addr(const struct kvm_pgtable_visit_ctx *ctx, { u64 phys = data->phys; - /* - * Stage-2 walks to update ownership data are communicated to the map - * walker using an invalid PA. Avoid offsetting an already invalid PA, - * which could overflow and make the address valid again. - */ - if (!kvm_phys_is_valid(phys)) - return phys; - - /* - * Otherwise, work out the correct PA based on how far the walk has - * gotten. - */ + /* Work out the correct PA based on how far the walk has gotten */ return phys + (ctx->addr - ctx->start); } @@ -943,6 +892,9 @@ static bool stage2_leaf_mapping_allowed(const struct kvm_pgtable_visit_ctx *ctx, if (data->force_pte && ctx->level < KVM_PGTABLE_LAST_LEVEL) return false; + if (data->annotation) + return true; + return kvm_block_mapping_supported(ctx, phys); } @@ -958,7 +910,7 @@ static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx, if (!stage2_leaf_mapping_allowed(ctx, data)) return -E2BIG; - if (kvm_phys_is_valid(phys)) + if (!data->annotation) new = kvm_init_valid_leaf_pte(phys, data->attr, ctx->level); else new = kvm_init_invalid_leaf_owner(data->owner_id); @@ -972,6 +924,21 @@ static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx, if (!stage2_pte_needs_update(ctx->old, new)) return -EAGAIN; + /* If we're only changing software bits, then store them and go! */ + if (!kvm_pgtable_walk_shared(ctx) && + !((ctx->old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW)) { + bool old_is_counted = stage2_pte_is_counted(ctx->old); + + if (old_is_counted != stage2_pte_is_counted(new)) { + if (old_is_counted) + mm_ops->put_page(ctx->ptep); + else + mm_ops->get_page(ctx->ptep); + } + WARN_ON_ONCE(!stage2_try_set_pte(ctx, new)); + return 0; + } + if (!stage2_try_break_pte(ctx, data->mmu)) return -EAGAIN; @@ -1105,11 +1072,11 @@ int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size, { int ret; struct stage2_map_data map_data = { - .phys = KVM_PHYS_INVALID, .mmu = pgt->mmu, .memcache = mc, .owner_id = owner_id, .force_pte = true, + .annotation = true, }; struct kvm_pgtable_walker walker = { .cb = stage2_map_walker, @@ -1265,19 +1232,15 @@ int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size) NULL, NULL, 0); } -kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr) +void kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr, + enum kvm_pgtable_walk_flags flags) { - kvm_pte_t pte = 0; int ret; ret = stage2_update_leaf_attrs(pgt, addr, 1, KVM_PTE_LEAF_ATTR_LO_S2_AF, 0, - &pte, NULL, - KVM_PGTABLE_WALK_HANDLE_FAULT | - KVM_PGTABLE_WALK_SHARED); + NULL, NULL, flags); if (!ret) dsb(ishst); - - return pte; } struct stage2_age_data { @@ -1331,7 +1294,7 @@ bool kvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr, } int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, - enum kvm_pgtable_prot prot) + enum kvm_pgtable_prot prot, enum kvm_pgtable_walk_flags flags) { int ret; s8 level; @@ -1349,9 +1312,7 @@ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, if (prot & KVM_PGTABLE_PROT_X) clr |= KVM_PTE_LEAF_ATTR_HI_S2_XN; - ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level, - KVM_PGTABLE_WALK_HANDLE_FAULT | - KVM_PGTABLE_WALK_SHARED); + ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level, flags); if (!ret || ret == -EAGAIN) kvm_call_hyp(__kvm_tlb_flush_vmid_ipa_nsh, pgt->mmu, addr, level); return ret; @@ -1363,7 +1324,7 @@ static int stage2_flush_walker(const struct kvm_pgtable_visit_ctx *ctx, struct kvm_pgtable *pgt = ctx->arg; struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops; - if (!kvm_pte_valid(ctx->old) || !stage2_pte_cacheable(pgt, ctx->old)) + if (!stage2_pte_cacheable(pgt, ctx->old)) return 0; if (mm_ops->dcache_clean_inval_poc) @@ -1525,7 +1486,6 @@ static int stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx, */ new = kvm_init_table_pte(childp, mm_ops); stage2_make_pte(ctx, new); - dsb(ishst); return 0; } @@ -1537,8 +1497,11 @@ int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size, .flags = KVM_PGTABLE_WALK_LEAF, .arg = mc, }; + int ret; - return kvm_pgtable_walk(pgt, addr, size, &walker); + ret = kvm_pgtable_walk(pgt, addr, size, &walker); + dsb(ishst); + return ret; } int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index 6cb638b184b1..ed363aa3027e 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -18,7 +18,7 @@ #define vtr_to_nr_pre_bits(v) ((((u32)(v) >> 26) & 7) + 1) #define vtr_to_nr_apr_regs(v) (1 << (vtr_to_nr_pre_bits(v) - 5)) -static u64 __gic_v3_get_lr(unsigned int lr) +u64 __gic_v3_get_lr(unsigned int lr) { switch (lr & 0xf) { case 0: @@ -218,7 +218,7 @@ void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if) elrsr = read_gicreg(ICH_ELRSR_EL2); - write_gicreg(cpu_if->vgic_hcr & ~ICH_HCR_EN, ICH_HCR_EL2); + write_gicreg(cpu_if->vgic_hcr & ~ICH_HCR_EL2_En, ICH_HCR_EL2); for (i = 0; i < used_lrs; i++) { if (elrsr & (1 << i)) @@ -268,8 +268,16 @@ void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if) * starting to mess with the rest of the GIC, and VMCR_EL2 in * particular. This logic must be called before * __vgic_v3_restore_state(). + * + * However, if the vgic is disabled (ICH_HCR_EL2.EN==0), no GIC is + * provisioned at all. In order to prevent illegal accesses to the + * system registers to trap to EL1 (duh), force ICC_SRE_EL1.SRE to 1 + * so that the trap bits can take effect. Yes, we *loves* the GIC. */ - if (!cpu_if->vgic_sre) { + if (!(cpu_if->vgic_hcr & ICH_HCR_EL2_En)) { + write_gicreg(ICC_SRE_EL1_SRE, ICC_SRE_EL1); + isb(); + } else if (!cpu_if->vgic_sre) { write_gicreg(0, ICC_SRE_EL1); isb(); write_gicreg(cpu_if->vgic_vmcr, ICH_VMCR_EL2); @@ -288,8 +296,9 @@ void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if) } /* - * Prevent the guest from touching the GIC system registers if - * SRE isn't enabled for GICv3 emulation. + * Prevent the guest from touching the ICC_SRE_EL1 system + * register. Note that this may not have any effect, as + * ICC_SRE_EL2.Enable being RAO/WI is a valid implementation. */ write_gicreg(read_gicreg(ICC_SRE_EL2) & ~ICC_SRE_EL2_ENABLE, ICC_SRE_EL2); @@ -297,10 +306,11 @@ void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if) /* * If we need to trap system registers, we must write * ICH_HCR_EL2 anyway, even if no interrupts are being - * injected, + * injected. Note that this also applies if we don't expect + * any system register access (no vgic at all). */ if (static_branch_unlikely(&vgic_v3_cpuif_trap) || - cpu_if->its_vpe.its_vm) + cpu_if->its_vpe.its_vm || !cpu_if->vgic_sre) write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2); } @@ -326,11 +336,11 @@ void __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if) * no interrupts were being injected, and we disable it again here. */ if (static_branch_unlikely(&vgic_v3_cpuif_trap) || - cpu_if->its_vpe.its_vm) + cpu_if->its_vpe.its_vm || !cpu_if->vgic_sre) write_gicreg(0, ICH_HCR_EL2); } -void __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if) +static void __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if) { u64 val; u32 nr_pre_bits; @@ -363,7 +373,7 @@ void __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if) } } -void __vgic_v3_restore_aprs(struct vgic_v3_cpu_if *cpu_if) +static void __vgic_v3_restore_aprs(struct vgic_v3_cpu_if *cpu_if) { u64 val; u32 nr_pre_bits; @@ -455,16 +465,35 @@ u64 __vgic_v3_get_gic_config(void) return val; } -u64 __vgic_v3_read_vmcr(void) +static u64 __vgic_v3_read_vmcr(void) { return read_gicreg(ICH_VMCR_EL2); } -void __vgic_v3_write_vmcr(u32 vmcr) +static void __vgic_v3_write_vmcr(u32 vmcr) { write_gicreg(vmcr, ICH_VMCR_EL2); } +void __vgic_v3_save_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if) +{ + __vgic_v3_save_aprs(cpu_if); + if (cpu_if->vgic_sre) + cpu_if->vgic_vmcr = __vgic_v3_read_vmcr(); +} + +void __vgic_v3_restore_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if) +{ + /* + * If dealing with a GICv2 emulation on GICv3, VMCR_EL2.VFIQen + * is dependent on ICC_SRE_EL1.SRE, and we have to perform the + * VMCR_EL2 save/restore in the world switch. + */ + if (cpu_if->vgic_sre) + __vgic_v3_write_vmcr(cpu_if->vgic_vmcr); + __vgic_v3_restore_aprs(cpu_if); +} + static int __vgic_v3_bpr_min(void) { /* See Pseudocode for VPriorityGroup */ @@ -723,7 +752,7 @@ static void __vgic_v3_bump_eoicount(void) u32 hcr; hcr = read_gicreg(ICH_HCR_EL2); - hcr += 1 << ICH_HCR_EOIcount_SHIFT; + hcr += 1 << ICH_HCR_EL2_EOIcount_SHIFT; write_gicreg(hcr, ICH_HCR_EL2); } @@ -983,9 +1012,6 @@ static void __vgic_v3_read_ctlr(struct kvm_vcpu *vcpu, u32 vmcr, int rt) val = ((vtr >> 29) & 7) << ICC_CTLR_EL1_PRI_BITS_SHIFT; /* IDbits */ val |= ((vtr >> 23) & 7) << ICC_CTLR_EL1_ID_BITS_SHIFT; - /* SEIS */ - if (kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_SEIS_MASK) - val |= BIT(ICC_CTLR_EL1_SEIS_SHIFT); /* A3V */ val |= ((vtr >> 21) & 1) << ICC_CTLR_EL1_A3V_SHIFT; /* EOImode */ @@ -1013,6 +1039,75 @@ static void __vgic_v3_write_ctlr(struct kvm_vcpu *vcpu, u32 vmcr, int rt) write_gicreg(vmcr, ICH_VMCR_EL2); } +static bool __vgic_v3_check_trap_forwarding(struct kvm_vcpu *vcpu, + u32 sysreg, bool is_read) +{ + u64 ich_hcr; + + if (!vcpu_has_nv(vcpu) || is_hyp_ctxt(vcpu)) + return false; + + ich_hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2); + + switch (sysreg) { + case SYS_ICC_IGRPEN0_EL1: + if (is_read && + (__vcpu_sys_reg(vcpu, HFGRTR_EL2) & HFGxTR_EL2_ICC_IGRPENn_EL1)) + return true; + + if (!is_read && + (__vcpu_sys_reg(vcpu, HFGWTR_EL2) & HFGxTR_EL2_ICC_IGRPENn_EL1)) + return true; + + fallthrough; + + case SYS_ICC_AP0Rn_EL1(0): + case SYS_ICC_AP0Rn_EL1(1): + case SYS_ICC_AP0Rn_EL1(2): + case SYS_ICC_AP0Rn_EL1(3): + case SYS_ICC_BPR0_EL1: + case SYS_ICC_EOIR0_EL1: + case SYS_ICC_HPPIR0_EL1: + case SYS_ICC_IAR0_EL1: + return ich_hcr & ICH_HCR_EL2_TALL0; + + case SYS_ICC_IGRPEN1_EL1: + if (is_read && + (__vcpu_sys_reg(vcpu, HFGRTR_EL2) & HFGxTR_EL2_ICC_IGRPENn_EL1)) + return true; + + if (!is_read && + (__vcpu_sys_reg(vcpu, HFGWTR_EL2) & HFGxTR_EL2_ICC_IGRPENn_EL1)) + return true; + + fallthrough; + + case SYS_ICC_AP1Rn_EL1(0): + case SYS_ICC_AP1Rn_EL1(1): + case SYS_ICC_AP1Rn_EL1(2): + case SYS_ICC_AP1Rn_EL1(3): + case SYS_ICC_BPR1_EL1: + case SYS_ICC_EOIR1_EL1: + case SYS_ICC_HPPIR1_EL1: + case SYS_ICC_IAR1_EL1: + return ich_hcr & ICH_HCR_EL2_TALL1; + + case SYS_ICC_DIR_EL1: + if (ich_hcr & ICH_HCR_EL2_TDIR) + return true; + + fallthrough; + + case SYS_ICC_RPR_EL1: + case SYS_ICC_CTLR_EL1: + case SYS_ICC_PMR_EL1: + return ich_hcr & ICH_HCR_EL2_TC; + + default: + return false; + } +} + int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu) { int rt; @@ -1022,6 +1117,9 @@ int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu) bool is_read; u32 sysreg; + if (kern_hyp_va(vcpu->kvm)->arch.vgic.vgic_model != KVM_DEV_TYPE_ARM_VGIC_V3) + return 0; + esr = kvm_vcpu_get_esr(vcpu); if (vcpu_mode_is_32bit(vcpu)) { if (!kvm_condition_valid(vcpu)) { @@ -1036,6 +1134,9 @@ int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu) is_read = (esr & ESR_ELx_SYS64_ISS_DIR_MASK) == ESR_ELx_SYS64_ISS_DIR_READ; + if (__vgic_v3_check_trap_forwarding(vcpu, sysreg, is_read)) + return 0; + switch (sysreg) { case SYS_ICC_IAR0_EL1: case SYS_ICC_IAR1_EL1: diff --git a/arch/arm64/kvm/hyp/vhe/Makefile b/arch/arm64/kvm/hyp/vhe/Makefile index 3b9e5464b5b3..afc4aed9231a 100644 --- a/arch/arm64/kvm/hyp/vhe/Makefile +++ b/arch/arm64/kvm/hyp/vhe/Makefile @@ -6,6 +6,8 @@ asflags-y := -D__KVM_VHE_HYPERVISOR__ ccflags-y := -D__KVM_VHE_HYPERVISOR__ +CFLAGS_switch.o += -Wno-override-init + obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \ ../fpsimd.o ../hyp-entry.o ../exception.o diff --git a/arch/arm64/kvm/hyp/vhe/debug-sr.c b/arch/arm64/kvm/hyp/vhe/debug-sr.c index 289689b2682d..0100339b09e0 100644 --- a/arch/arm64/kvm/hyp/vhe/debug-sr.c +++ b/arch/arm64/kvm/hyp/vhe/debug-sr.c @@ -19,8 +19,3 @@ void __debug_switch_to_host(struct kvm_vcpu *vcpu) { __debug_switch_to_host_common(vcpu); } - -u64 __kvm_get_mdcr_el2(void) -{ - return read_sysreg(mdcr_el2); -} diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index 1581df6aec87..731a0378ed13 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -33,11 +33,124 @@ DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data); DEFINE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); DEFINE_PER_CPU(unsigned long, kvm_hyp_vector); +/* + * HCR_EL2 bits that the NV guest can freely change (no RES0/RES1 + * semantics, irrespective of the configuration), but that cannot be + * applied to the actual HW as things would otherwise break badly. + * + * - TGE: we want the guest to use EL1, which is incompatible with + * this bit being set + * + * - API/APK: they are already accounted for by vcpu_load(), and can + * only take effect across a load/put cycle (such as ERET) + */ +#define NV_HCR_GUEST_EXCLUDE (HCR_TGE | HCR_API | HCR_APK) + +static u64 __compute_hcr(struct kvm_vcpu *vcpu) +{ + u64 hcr = vcpu->arch.hcr_el2; + + if (!vcpu_has_nv(vcpu)) + return hcr; + + if (is_hyp_ctxt(vcpu)) { + hcr |= HCR_NV | HCR_NV2 | HCR_AT | HCR_TTLB; + + if (!vcpu_el2_e2h_is_set(vcpu)) + hcr |= HCR_NV1; + + write_sysreg_s(vcpu->arch.ctxt.vncr_array, SYS_VNCR_EL2); + } + + return hcr | (__vcpu_sys_reg(vcpu, HCR_EL2) & ~NV_HCR_GUEST_EXCLUDE); +} + +static void __activate_cptr_traps(struct kvm_vcpu *vcpu) +{ + u64 cptr; + + /* + * With VHE (HCR.E2H == 1), accesses to CPACR_EL1 are routed to + * CPTR_EL2. In general, CPACR_EL1 has the same layout as CPTR_EL2, + * except for some missing controls, such as TAM. + * In this case, CPTR_EL2.TAM has the same position with or without + * VHE (HCR.E2H == 1) which allows us to use here the CPTR_EL2.TAM + * shift value for trapping the AMU accesses. + */ + u64 val = CPACR_EL1_TTA | CPTR_EL2_TAM; + + if (guest_owns_fp_regs()) { + val |= CPACR_EL1_FPEN; + if (vcpu_has_sve(vcpu)) + val |= CPACR_EL1_ZEN; + } else { + __activate_traps_fpsimd32(vcpu); + } + + if (!vcpu_has_nv(vcpu)) + goto write; + + /* + * The architecture is a bit crap (what a surprise): an EL2 guest + * writing to CPTR_EL2 via CPACR_EL1 can't set any of TCPAC or TTA, + * as they are RES0 in the guest's view. To work around it, trap the + * sucker using the very same bit it can't set... + */ + if (vcpu_el2_e2h_is_set(vcpu) && is_hyp_ctxt(vcpu)) + val |= CPTR_EL2_TCPAC; + + /* + * Layer the guest hypervisor's trap configuration on top of our own if + * we're in a nested context. + */ + if (is_hyp_ctxt(vcpu)) + goto write; + + cptr = vcpu_sanitised_cptr_el2(vcpu); + + /* + * Pay attention, there's some interesting detail here. + * + * The CPTR_EL2.xEN fields are 2 bits wide, although there are only two + * meaningful trap states when HCR_EL2.TGE = 0 (running a nested guest): + * + * - CPTR_EL2.xEN = x0, traps are enabled + * - CPTR_EL2.xEN = x1, traps are disabled + * + * In other words, bit[0] determines if guest accesses trap or not. In + * the interest of simplicity, clear the entire field if the guest + * hypervisor has traps enabled to dispel any illusion of something more + * complicated taking place. + */ + if (!(SYS_FIELD_GET(CPACR_EL1, FPEN, cptr) & BIT(0))) + val &= ~CPACR_EL1_FPEN; + if (!(SYS_FIELD_GET(CPACR_EL1, ZEN, cptr) & BIT(0))) + val &= ~CPACR_EL1_ZEN; + + if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, S2POE, IMP)) + val |= cptr & CPACR_EL1_E0POE; + + val |= cptr & CPTR_EL2_TCPAC; + +write: + write_sysreg(val, cpacr_el1); +} + +static void __deactivate_cptr_traps(struct kvm_vcpu *vcpu) +{ + u64 val = CPACR_EL1_FPEN | CPACR_EL1_ZEN_EL1EN; + + if (cpus_have_final_cap(ARM64_SME)) + val |= CPACR_EL1_SMEN_EL1EN; + + write_sysreg(val, cpacr_el1); +} + static void __activate_traps(struct kvm_vcpu *vcpu) { u64 val; - ___activate_traps(vcpu); + ___activate_traps(vcpu, __compute_hcr(vcpu)); if (has_cntpoff()) { struct timer_map map; @@ -59,31 +172,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu) } } - val = read_sysreg(cpacr_el1); - val |= CPACR_ELx_TTA; - val &= ~(CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN | - CPACR_EL1_SMEN_EL0EN | CPACR_EL1_SMEN_EL1EN); - - /* - * With VHE (HCR.E2H == 1), accesses to CPACR_EL1 are routed to - * CPTR_EL2. In general, CPACR_EL1 has the same layout as CPTR_EL2, - * except for some missing controls, such as TAM. - * In this case, CPTR_EL2.TAM has the same position with or without - * VHE (HCR.E2H == 1) which allows us to use here the CPTR_EL2.TAM - * shift value for trapping the AMU accesses. - */ - - val |= CPTR_EL2_TAM; - - if (guest_owns_fp_regs(vcpu)) { - if (vcpu_has_sve(vcpu)) - val |= CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN; - } else { - val &= ~(CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN); - __activate_traps_fpsimd32(vcpu); - } - - write_sysreg(val, cpacr_el1); + __activate_cptr_traps(vcpu); write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el1); } @@ -128,7 +217,7 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu) */ asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT)); - kvm_reset_cptr_el2(vcpu); + __deactivate_cptr_traps(vcpu); if (!arm64_kernel_unmapped_at_el0()) host_vectors = __this_cpu_read(this_cpu_vector); @@ -162,6 +251,8 @@ static void __vcpu_put_deactivate_traps(struct kvm_vcpu *vcpu) void kvm_vcpu_load_vhe(struct kvm_vcpu *vcpu) { + host_data_ptr(host_ctxt)->__hyp_running_vcpu = vcpu; + __vcpu_load_switch_sysregs(vcpu); __vcpu_load_activate_traps(vcpu); __load_stage2(vcpu->arch.hw_mmu, vcpu->arch.hw_mmu->arch); @@ -171,33 +262,315 @@ void kvm_vcpu_put_vhe(struct kvm_vcpu *vcpu) { __vcpu_put_deactivate_traps(vcpu); __vcpu_put_switch_sysregs(vcpu); + + host_data_ptr(host_ctxt)->__hyp_running_vcpu = NULL; +} + +static u64 compute_emulated_cntx_ctl_el0(struct kvm_vcpu *vcpu, + enum vcpu_sysreg reg) +{ + unsigned long ctl; + u64 cval, cnt; + bool stat; + + switch (reg) { + case CNTP_CTL_EL0: + cval = __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0); + ctl = __vcpu_sys_reg(vcpu, CNTP_CTL_EL0); + cnt = compute_counter_value(vcpu_ptimer(vcpu)); + break; + case CNTV_CTL_EL0: + cval = __vcpu_sys_reg(vcpu, CNTV_CVAL_EL0); + ctl = __vcpu_sys_reg(vcpu, CNTV_CTL_EL0); + cnt = compute_counter_value(vcpu_vtimer(vcpu)); + break; + default: + BUG(); + } + + stat = cval <= cnt; + __assign_bit(__ffs(ARCH_TIMER_CTRL_IT_STAT), &ctl, stat); + + return ctl; +} + +static bool kvm_hyp_handle_timer(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + u64 esr, val; + + /* + * Having FEAT_ECV allows for a better quality of timer emulation. + * However, this comes at a huge cost in terms of traps. Try and + * satisfy the reads from guest's hypervisor context without + * returning to the kernel if we can. + */ + if (!is_hyp_ctxt(vcpu)) + return false; + + esr = kvm_vcpu_get_esr(vcpu); + if ((esr & ESR_ELx_SYS64_ISS_DIR_MASK) != ESR_ELx_SYS64_ISS_DIR_READ) + return false; + + switch (esr_sys64_to_sysreg(esr)) { + case SYS_CNTP_CTL_EL02: + val = compute_emulated_cntx_ctl_el0(vcpu, CNTP_CTL_EL0); + break; + case SYS_CNTP_CTL_EL0: + if (vcpu_el2_e2h_is_set(vcpu)) + val = read_sysreg_el0(SYS_CNTP_CTL); + else + val = compute_emulated_cntx_ctl_el0(vcpu, CNTP_CTL_EL0); + break; + case SYS_CNTP_CVAL_EL02: + val = __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0); + break; + case SYS_CNTP_CVAL_EL0: + if (vcpu_el2_e2h_is_set(vcpu)) { + val = read_sysreg_el0(SYS_CNTP_CVAL); + + if (!has_cntpoff()) + val -= timer_get_offset(vcpu_hptimer(vcpu)); + } else { + val = __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0); + } + break; + case SYS_CNTPCT_EL0: + case SYS_CNTPCTSS_EL0: + val = compute_counter_value(vcpu_hptimer(vcpu)); + break; + case SYS_CNTV_CTL_EL02: + val = compute_emulated_cntx_ctl_el0(vcpu, CNTV_CTL_EL0); + break; + case SYS_CNTV_CTL_EL0: + if (vcpu_el2_e2h_is_set(vcpu)) + val = read_sysreg_el0(SYS_CNTV_CTL); + else + val = compute_emulated_cntx_ctl_el0(vcpu, CNTV_CTL_EL0); + break; + case SYS_CNTV_CVAL_EL02: + val = __vcpu_sys_reg(vcpu, CNTV_CVAL_EL0); + break; + case SYS_CNTV_CVAL_EL0: + if (vcpu_el2_e2h_is_set(vcpu)) + val = read_sysreg_el0(SYS_CNTV_CVAL); + else + val = __vcpu_sys_reg(vcpu, CNTV_CVAL_EL0); + break; + case SYS_CNTVCT_EL0: + case SYS_CNTVCTSS_EL0: + val = compute_counter_value(vcpu_hvtimer(vcpu)); + break; + default: + return false; + } + + vcpu_set_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu), val); + __kvm_skip_instr(vcpu); + + return true; +} + +static bool kvm_hyp_handle_eret(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + u64 esr = kvm_vcpu_get_esr(vcpu); + u64 spsr, elr, mode; + + /* + * Going through the whole put/load motions is a waste of time + * if this is a VHE guest hypervisor returning to its own + * userspace, or the hypervisor performing a local exception + * return. No need to save/restore registers, no need to + * switch S2 MMU. Just do the canonical ERET. + * + * Unless the trap has to be forwarded further down the line, + * of course... + */ + if ((__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_NV) || + (__vcpu_sys_reg(vcpu, HFGITR_EL2) & HFGITR_EL2_ERET)) + return false; + + spsr = read_sysreg_el1(SYS_SPSR); + mode = spsr & (PSR_MODE_MASK | PSR_MODE32_BIT); + + switch (mode) { + case PSR_MODE_EL0t: + if (!(vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu))) + return false; + break; + case PSR_MODE_EL2t: + mode = PSR_MODE_EL1t; + break; + case PSR_MODE_EL2h: + mode = PSR_MODE_EL1h; + break; + default: + return false; + } + + /* If ERETAx fails, take the slow path */ + if (esr_iss_is_eretax(esr)) { + if (!(vcpu_has_ptrauth(vcpu) && kvm_auth_eretax(vcpu, &elr))) + return false; + } else { + elr = read_sysreg_el1(SYS_ELR); + } + + spsr = (spsr & ~(PSR_MODE_MASK | PSR_MODE32_BIT)) | mode; + + write_sysreg_el2(spsr, SYS_SPSR); + write_sysreg_el2(elr, SYS_ELR); + + return true; +} + +static bool kvm_hyp_handle_tlbi_el2(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + int ret = -EINVAL; + u32 instr; + u64 val; + + /* + * Ideally, we would never trap on EL2 S1 TLB invalidations using + * the EL1 instructions when the guest's HCR_EL2.{E2H,TGE}=={1,1}. + * But "thanks" to FEAT_NV2, we don't trap writes to HCR_EL2, + * meaning that we can't track changes to the virtual TGE bit. So we + * have to leave HCR_EL2.TTLB set on the host. Oopsie... + * + * Try and handle these invalidation as quickly as possible, without + * fully exiting. Note that we don't need to consider any forwarding + * here, as having E2H+TGE set is the very definition of being + * InHost. + * + * For the lesser hypervisors out there that have failed to get on + * with the VHE program, we can also handle the nVHE style of EL2 + * invalidation. + */ + if (!(is_hyp_ctxt(vcpu))) + return false; + + instr = esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu)); + val = vcpu_get_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu)); + + if ((kvm_supported_tlbi_s1e1_op(vcpu, instr) && + vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)) || + kvm_supported_tlbi_s1e2_op (vcpu, instr)) + ret = __kvm_tlbi_s1e2(NULL, val, instr); + + if (ret) + return false; + + __kvm_skip_instr(vcpu); + + return true; +} + +static bool kvm_hyp_handle_cpacr_el1(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + u64 esr = kvm_vcpu_get_esr(vcpu); + int rt; + + if (!is_hyp_ctxt(vcpu) || esr_sys64_to_sysreg(esr) != SYS_CPACR_EL1) + return false; + + rt = kvm_vcpu_sys_get_rt(vcpu); + + if ((esr & ESR_ELx_SYS64_ISS_DIR_MASK) == ESR_ELx_SYS64_ISS_DIR_READ) { + vcpu_set_reg(vcpu, rt, __vcpu_sys_reg(vcpu, CPTR_EL2)); + } else { + vcpu_write_sys_reg(vcpu, vcpu_get_reg(vcpu, rt), CPTR_EL2); + __activate_cptr_traps(vcpu); + } + + __kvm_skip_instr(vcpu); + + return true; +} + +static bool kvm_hyp_handle_zcr_el2(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + u32 sysreg = esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu)); + + if (!vcpu_has_nv(vcpu)) + return false; + + if (sysreg != SYS_ZCR_EL2) + return false; + + if (guest_owns_fp_regs()) + return false; + + /* + * ZCR_EL2 traps are handled in the slow path, with the expectation + * that the guest's FP context has already been loaded onto the CPU. + * + * Load the guest's FP context and unconditionally forward to the + * slow path for handling (i.e. return false). + */ + kvm_hyp_handle_fpsimd(vcpu, exit_code); + return false; +} + +static bool kvm_hyp_handle_sysreg_vhe(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + if (kvm_hyp_handle_tlbi_el2(vcpu, exit_code)) + return true; + + if (kvm_hyp_handle_timer(vcpu, exit_code)) + return true; + + if (kvm_hyp_handle_cpacr_el1(vcpu, exit_code)) + return true; + + if (kvm_hyp_handle_zcr_el2(vcpu, exit_code)) + return true; + + return kvm_hyp_handle_sysreg(vcpu, exit_code); +} + +static bool kvm_hyp_handle_impdef(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + u64 iss; + + if (!cpus_have_final_cap(ARM64_WORKAROUND_PMUV3_IMPDEF_TRAPS)) + return false; + + /* + * Compute a synthetic ESR for a sysreg trap. Conveniently, AFSR1_EL2 + * is populated with a correct ISS for a sysreg trap. These fruity + * parts are 64bit only, so unconditionally set IL. + */ + iss = ESR_ELx_ISS(read_sysreg_s(SYS_AFSR1_EL2)); + vcpu->arch.fault.esr_el2 = FIELD_PREP(ESR_ELx_EC_MASK, ESR_ELx_EC_SYS64) | + FIELD_PREP(ESR_ELx_ISS_MASK, iss) | + ESR_ELx_IL; + return false; } static const exit_handler_fn hyp_exit_handlers[] = { [0 ... ESR_ELx_EC_MAX] = NULL, [ESR_ELx_EC_CP15_32] = kvm_hyp_handle_cp15_32, - [ESR_ELx_EC_SYS64] = kvm_hyp_handle_sysreg, + [ESR_ELx_EC_SYS64] = kvm_hyp_handle_sysreg_vhe, [ESR_ELx_EC_SVE] = kvm_hyp_handle_fpsimd, [ESR_ELx_EC_FP_ASIMD] = kvm_hyp_handle_fpsimd, [ESR_ELx_EC_IABT_LOW] = kvm_hyp_handle_iabt_low, [ESR_ELx_EC_DABT_LOW] = kvm_hyp_handle_dabt_low, [ESR_ELx_EC_WATCHPT_LOW] = kvm_hyp_handle_watchpt_low, - [ESR_ELx_EC_PAC] = kvm_hyp_handle_ptrauth, + [ESR_ELx_EC_ERET] = kvm_hyp_handle_eret, [ESR_ELx_EC_MOPS] = kvm_hyp_handle_mops, + + /* Apple shenanigans */ + [0x3F] = kvm_hyp_handle_impdef, }; -static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu) +static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) { - return hyp_exit_handlers; -} + synchronize_vcpu_pstate(vcpu, exit_code); -static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code) -{ /* * If we were in HYP context on entry, adjust the PSTATE view * so that the usual helpers work correctly. */ - if (unlikely(vcpu_get_flag(vcpu, VCPU_HYP_CONTEXT))) { + if (vcpu_has_nv(vcpu) && (read_sysreg(hcr_el2) & HCR_NV)) { u64 mode = *vcpu_cpsr(vcpu) & (PSR_MODE_MASK | PSR_MODE32_BIT); switch (mode) { @@ -212,6 +585,8 @@ static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code) *vcpu_cpsr(vcpu) &= ~(PSR_MODE_MASK | PSR_MODE32_BIT); *vcpu_cpsr(vcpu) |= mode; } + + return __fixup_guest_exit(vcpu, exit_code, hyp_exit_handlers); } /* Switch to the guest for VHE systems running in EL2 */ @@ -221,12 +596,13 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) struct kvm_cpu_context *guest_ctxt; u64 exit_code; - host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; - host_ctxt->__hyp_running_vcpu = vcpu; + host_ctxt = host_data_ptr(host_ctxt); guest_ctxt = &vcpu->arch.ctxt; sysreg_save_host_state_vhe(host_ctxt); + fpsimd_lazy_switch_to_guest(vcpu); + /* * Note that ARM erratum 1165522 requires us to configure both stage 1 * and stage 2 translation for the guest context before we clear @@ -240,11 +616,6 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) sysreg_restore_guest_state_vhe(guest_ctxt); __debug_switch_to_guest(vcpu); - if (is_hyp_ctxt(vcpu)) - vcpu_set_flag(vcpu, VCPU_HYP_CONTEXT); - else - vcpu_clear_flag(vcpu, VCPU_HYP_CONTEXT); - do { /* Jump in the fire! */ exit_code = __guest_enter(vcpu); @@ -256,9 +627,11 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) __deactivate_traps(vcpu); + fpsimd_lazy_switch_to_host(vcpu); + sysreg_restore_host_state_vhe(host_ctxt); - if (vcpu->arch.fp_state == FP_STATE_GUEST_OWNED) + if (guest_owns_fp_regs()) __fpsimd_save_fpexc32(vcpu); __debug_switch_to_host(vcpu); @@ -301,12 +674,12 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) return ret; } -static void __hyp_call_panic(u64 spsr, u64 elr, u64 par) +static void __noreturn __hyp_call_panic(u64 spsr, u64 elr, u64 par) { struct kvm_cpu_context *host_ctxt; struct kvm_vcpu *vcpu; - host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; + host_ctxt = host_data_ptr(host_ctxt); vcpu = host_ctxt->__hyp_running_vcpu; __deactivate_traps(vcpu); @@ -326,7 +699,6 @@ void __noreturn hyp_panic(void) u64 par = read_sysreg_par(); __hyp_call_panic(spsr, elr, par); - unreachable(); } asmlinkage void kvm_unexpected_el2_exception(void) diff --git a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c index a8b9ea496706..3814b0b2c937 100644 --- a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c @@ -15,6 +15,132 @@ #include <asm/kvm_hyp.h> #include <asm/kvm_nested.h> +static void __sysreg_save_vel2_state(struct kvm_vcpu *vcpu) +{ + /* These registers are common with EL1 */ + __vcpu_sys_reg(vcpu, PAR_EL1) = read_sysreg(par_el1); + __vcpu_sys_reg(vcpu, TPIDR_EL1) = read_sysreg(tpidr_el1); + + __vcpu_sys_reg(vcpu, ESR_EL2) = read_sysreg_el1(SYS_ESR); + __vcpu_sys_reg(vcpu, AFSR0_EL2) = read_sysreg_el1(SYS_AFSR0); + __vcpu_sys_reg(vcpu, AFSR1_EL2) = read_sysreg_el1(SYS_AFSR1); + __vcpu_sys_reg(vcpu, FAR_EL2) = read_sysreg_el1(SYS_FAR); + __vcpu_sys_reg(vcpu, MAIR_EL2) = read_sysreg_el1(SYS_MAIR); + __vcpu_sys_reg(vcpu, VBAR_EL2) = read_sysreg_el1(SYS_VBAR); + __vcpu_sys_reg(vcpu, CONTEXTIDR_EL2) = read_sysreg_el1(SYS_CONTEXTIDR); + __vcpu_sys_reg(vcpu, AMAIR_EL2) = read_sysreg_el1(SYS_AMAIR); + + /* + * In VHE mode those registers are compatible between EL1 and EL2, + * and the guest uses the _EL1 versions on the CPU naturally. + * So we save them into their _EL2 versions here. + * For nVHE mode we trap accesses to those registers, so our + * _EL2 copy in sys_regs[] is always up-to-date and we don't need + * to save anything here. + */ + if (vcpu_el2_e2h_is_set(vcpu)) { + u64 val; + + /* + * We don't save CPTR_EL2, as accesses to CPACR_EL1 + * are always trapped, ensuring that the in-memory + * copy is always up-to-date. A small blessing... + */ + __vcpu_sys_reg(vcpu, SCTLR_EL2) = read_sysreg_el1(SYS_SCTLR); + __vcpu_sys_reg(vcpu, TTBR0_EL2) = read_sysreg_el1(SYS_TTBR0); + __vcpu_sys_reg(vcpu, TTBR1_EL2) = read_sysreg_el1(SYS_TTBR1); + __vcpu_sys_reg(vcpu, TCR_EL2) = read_sysreg_el1(SYS_TCR); + + if (ctxt_has_tcrx(&vcpu->arch.ctxt)) { + __vcpu_sys_reg(vcpu, TCR2_EL2) = read_sysreg_el1(SYS_TCR2); + + if (ctxt_has_s1pie(&vcpu->arch.ctxt)) { + __vcpu_sys_reg(vcpu, PIRE0_EL2) = read_sysreg_el1(SYS_PIRE0); + __vcpu_sys_reg(vcpu, PIR_EL2) = read_sysreg_el1(SYS_PIR); + } + + if (ctxt_has_s1poe(&vcpu->arch.ctxt)) + __vcpu_sys_reg(vcpu, POR_EL2) = read_sysreg_el1(SYS_POR); + } + + /* + * The EL1 view of CNTKCTL_EL1 has a bunch of RES0 bits where + * the interesting CNTHCTL_EL2 bits live. So preserve these + * bits when reading back the guest-visible value. + */ + val = read_sysreg_el1(SYS_CNTKCTL); + val &= CNTKCTL_VALID_BITS; + __vcpu_sys_reg(vcpu, CNTHCTL_EL2) &= ~CNTKCTL_VALID_BITS; + __vcpu_sys_reg(vcpu, CNTHCTL_EL2) |= val; + } + + __vcpu_sys_reg(vcpu, SP_EL2) = read_sysreg(sp_el1); + __vcpu_sys_reg(vcpu, ELR_EL2) = read_sysreg_el1(SYS_ELR); + __vcpu_sys_reg(vcpu, SPSR_EL2) = read_sysreg_el1(SYS_SPSR); +} + +static void __sysreg_restore_vel2_state(struct kvm_vcpu *vcpu) +{ + u64 val; + + /* These registers are common with EL1 */ + write_sysreg(__vcpu_sys_reg(vcpu, PAR_EL1), par_el1); + write_sysreg(__vcpu_sys_reg(vcpu, TPIDR_EL1), tpidr_el1); + + write_sysreg(ctxt_midr_el1(&vcpu->arch.ctxt), vpidr_el2); + write_sysreg(__vcpu_sys_reg(vcpu, MPIDR_EL1), vmpidr_el2); + write_sysreg_el1(__vcpu_sys_reg(vcpu, MAIR_EL2), SYS_MAIR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, VBAR_EL2), SYS_VBAR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, CONTEXTIDR_EL2), SYS_CONTEXTIDR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, AMAIR_EL2), SYS_AMAIR); + + if (vcpu_el2_e2h_is_set(vcpu)) { + /* + * In VHE mode those registers are compatible between + * EL1 and EL2. + */ + write_sysreg_el1(__vcpu_sys_reg(vcpu, SCTLR_EL2), SYS_SCTLR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, CPTR_EL2), SYS_CPACR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, TTBR0_EL2), SYS_TTBR0); + write_sysreg_el1(__vcpu_sys_reg(vcpu, TTBR1_EL2), SYS_TTBR1); + write_sysreg_el1(__vcpu_sys_reg(vcpu, TCR_EL2), SYS_TCR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, CNTHCTL_EL2), SYS_CNTKCTL); + } else { + /* + * CNTHCTL_EL2 only affects EL1 when running nVHE, so + * no need to restore it. + */ + val = translate_sctlr_el2_to_sctlr_el1(__vcpu_sys_reg(vcpu, SCTLR_EL2)); + write_sysreg_el1(val, SYS_SCTLR); + val = translate_cptr_el2_to_cpacr_el1(__vcpu_sys_reg(vcpu, CPTR_EL2)); + write_sysreg_el1(val, SYS_CPACR); + val = translate_ttbr0_el2_to_ttbr0_el1(__vcpu_sys_reg(vcpu, TTBR0_EL2)); + write_sysreg_el1(val, SYS_TTBR0); + val = translate_tcr_el2_to_tcr_el1(__vcpu_sys_reg(vcpu, TCR_EL2)); + write_sysreg_el1(val, SYS_TCR); + } + + if (ctxt_has_tcrx(&vcpu->arch.ctxt)) { + write_sysreg_el1(__vcpu_sys_reg(vcpu, TCR2_EL2), SYS_TCR2); + + if (ctxt_has_s1pie(&vcpu->arch.ctxt)) { + write_sysreg_el1(__vcpu_sys_reg(vcpu, PIR_EL2), SYS_PIR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, PIRE0_EL2), SYS_PIRE0); + } + + if (ctxt_has_s1poe(&vcpu->arch.ctxt)) + write_sysreg_el1(__vcpu_sys_reg(vcpu, POR_EL2), SYS_POR); + } + + write_sysreg_el1(__vcpu_sys_reg(vcpu, ESR_EL2), SYS_ESR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, AFSR0_EL2), SYS_AFSR0); + write_sysreg_el1(__vcpu_sys_reg(vcpu, AFSR1_EL2), SYS_AFSR1); + write_sysreg_el1(__vcpu_sys_reg(vcpu, FAR_EL2), SYS_FAR); + write_sysreg(__vcpu_sys_reg(vcpu, SP_EL2), sp_el1); + write_sysreg_el1(__vcpu_sys_reg(vcpu, ELR_EL2), SYS_ELR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, SPSR_EL2), SYS_SPSR); +} + /* * VHE: Host and guest must save mdscr_el1 and sp_el0 (and the PC and * pstate, which are handled as part of the el2 return state) on every @@ -66,8 +192,9 @@ void __vcpu_load_switch_sysregs(struct kvm_vcpu *vcpu) { struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt; struct kvm_cpu_context *host_ctxt; + u64 midr, mpidr; - host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; + host_ctxt = host_data_ptr(host_ctxt); __sysreg_save_user_state(host_ctxt); /* @@ -89,7 +216,24 @@ void __vcpu_load_switch_sysregs(struct kvm_vcpu *vcpu) */ __sysreg32_restore_state(vcpu); __sysreg_restore_user_state(guest_ctxt); - __sysreg_restore_el1_state(guest_ctxt); + + if (unlikely(is_hyp_ctxt(vcpu))) { + __sysreg_restore_vel2_state(vcpu); + } else { + if (vcpu_has_nv(vcpu)) { + /* + * As we're restoring a nested guest, set the value + * provided by the guest hypervisor. + */ + midr = ctxt_sys_reg(guest_ctxt, VPIDR_EL2); + mpidr = ctxt_sys_reg(guest_ctxt, VMPIDR_EL2); + } else { + midr = ctxt_midr_el1(guest_ctxt); + mpidr = ctxt_sys_reg(guest_ctxt, MPIDR_EL1); + } + + __sysreg_restore_el1_state(guest_ctxt, midr, mpidr); + } vcpu_set_flag(vcpu, SYSREGS_ON_CPU); } @@ -110,9 +254,13 @@ void __vcpu_put_switch_sysregs(struct kvm_vcpu *vcpu) struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt; struct kvm_cpu_context *host_ctxt; - host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; + host_ctxt = host_data_ptr(host_ctxt); + + if (unlikely(is_hyp_ctxt(vcpu))) + __sysreg_save_vel2_state(vcpu); + else + __sysreg_save_el1_state(guest_ctxt); - __sysreg_save_el1_state(guest_ctxt); __sysreg_save_user_state(guest_ctxt); __sysreg32_save_state(vcpu); diff --git a/arch/arm64/kvm/hyp/vhe/tlb.c b/arch/arm64/kvm/hyp/vhe/tlb.c index b32e2940df7d..3d50a1bd2bdb 100644 --- a/arch/arm64/kvm/hyp/vhe/tlb.c +++ b/arch/arm64/kvm/hyp/vhe/tlb.c @@ -17,8 +17,8 @@ struct tlb_inv_context { u64 sctlr; }; -static void __tlb_switch_to_guest(struct kvm_s2_mmu *mmu, - struct tlb_inv_context *cxt) +static void enter_vmid_context(struct kvm_s2_mmu *mmu, + struct tlb_inv_context *cxt) { struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); u64 val; @@ -67,7 +67,7 @@ static void __tlb_switch_to_guest(struct kvm_s2_mmu *mmu, isb(); } -static void __tlb_switch_to_host(struct tlb_inv_context *cxt) +static void exit_vmid_context(struct tlb_inv_context *cxt) { /* * We're done with the TLB operation, let's restore the host's @@ -97,7 +97,7 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, dsb(ishst); /* Switch to requested VMID */ - __tlb_switch_to_guest(mmu, &cxt); + enter_vmid_context(mmu, &cxt); /* * We could do so much better if we had the VA as well. @@ -118,7 +118,7 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, dsb(ish); isb(); - __tlb_switch_to_host(&cxt); + exit_vmid_context(&cxt); } void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu, @@ -129,7 +129,7 @@ void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu, dsb(nshst); /* Switch to requested VMID */ - __tlb_switch_to_guest(mmu, &cxt); + enter_vmid_context(mmu, &cxt); /* * We could do so much better if we had the VA as well. @@ -150,7 +150,7 @@ void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu, dsb(nsh); isb(); - __tlb_switch_to_host(&cxt); + exit_vmid_context(&cxt); } void __kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu, @@ -169,16 +169,17 @@ void __kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu, dsb(ishst); /* Switch to requested VMID */ - __tlb_switch_to_guest(mmu, &cxt); + enter_vmid_context(mmu, &cxt); - __flush_s2_tlb_range_op(ipas2e1is, start, pages, stride, 0); + __flush_s2_tlb_range_op(ipas2e1is, start, pages, stride, + TLBI_TTL_UNKNOWN); dsb(ish); __tlbi(vmalle1is); dsb(ish); isb(); - __tlb_switch_to_host(&cxt); + exit_vmid_context(&cxt); } void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu) @@ -188,13 +189,13 @@ void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu) dsb(ishst); /* Switch to requested VMID */ - __tlb_switch_to_guest(mmu, &cxt); + enter_vmid_context(mmu, &cxt); __tlbi(vmalls12e1is); dsb(ish); isb(); - __tlb_switch_to_host(&cxt); + exit_vmid_context(&cxt); } void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu) @@ -202,14 +203,14 @@ void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu) struct tlb_inv_context cxt; /* Switch to requested VMID */ - __tlb_switch_to_guest(mmu, &cxt); + enter_vmid_context(mmu, &cxt); __tlbi(vmalle1); asm volatile("ic iallu"); dsb(nsh); isb(); - __tlb_switch_to_host(&cxt); + exit_vmid_context(&cxt); } void __kvm_flush_vm_context(void) @@ -218,3 +219,150 @@ void __kvm_flush_vm_context(void) __tlbi(alle1is); dsb(ish); } + +/* + * TLB invalidation emulation for NV. For any given instruction, we + * perform the following transformtions: + * + * - a TLBI targeting EL2 S1 is remapped to EL1 S1 + * - a non-shareable TLBI is upgraded to being inner-shareable + * - an outer-shareable TLBI is also mapped to inner-shareable + * - an nXS TLBI is upgraded to XS + */ +int __kvm_tlbi_s1e2(struct kvm_s2_mmu *mmu, u64 va, u64 sys_encoding) +{ + struct tlb_inv_context cxt; + int ret = 0; + + /* + * The guest will have provided its own DSB ISHST before trapping. + * If it hasn't, that's its own problem, and we won't paper over it + * (plus, there is plenty of extra synchronisation before we even + * get here...). + */ + + if (mmu) + enter_vmid_context(mmu, &cxt); + + switch (sys_encoding) { + case OP_TLBI_ALLE2: + case OP_TLBI_ALLE2IS: + case OP_TLBI_ALLE2OS: + case OP_TLBI_VMALLE1: + case OP_TLBI_VMALLE1IS: + case OP_TLBI_VMALLE1OS: + case OP_TLBI_ALLE2NXS: + case OP_TLBI_ALLE2ISNXS: + case OP_TLBI_ALLE2OSNXS: + case OP_TLBI_VMALLE1NXS: + case OP_TLBI_VMALLE1ISNXS: + case OP_TLBI_VMALLE1OSNXS: + __tlbi(vmalle1is); + break; + case OP_TLBI_VAE2: + case OP_TLBI_VAE2IS: + case OP_TLBI_VAE2OS: + case OP_TLBI_VAE1: + case OP_TLBI_VAE1IS: + case OP_TLBI_VAE1OS: + case OP_TLBI_VAE2NXS: + case OP_TLBI_VAE2ISNXS: + case OP_TLBI_VAE2OSNXS: + case OP_TLBI_VAE1NXS: + case OP_TLBI_VAE1ISNXS: + case OP_TLBI_VAE1OSNXS: + __tlbi(vae1is, va); + break; + case OP_TLBI_VALE2: + case OP_TLBI_VALE2IS: + case OP_TLBI_VALE2OS: + case OP_TLBI_VALE1: + case OP_TLBI_VALE1IS: + case OP_TLBI_VALE1OS: + case OP_TLBI_VALE2NXS: + case OP_TLBI_VALE2ISNXS: + case OP_TLBI_VALE2OSNXS: + case OP_TLBI_VALE1NXS: + case OP_TLBI_VALE1ISNXS: + case OP_TLBI_VALE1OSNXS: + __tlbi(vale1is, va); + break; + case OP_TLBI_ASIDE1: + case OP_TLBI_ASIDE1IS: + case OP_TLBI_ASIDE1OS: + case OP_TLBI_ASIDE1NXS: + case OP_TLBI_ASIDE1ISNXS: + case OP_TLBI_ASIDE1OSNXS: + __tlbi(aside1is, va); + break; + case OP_TLBI_VAAE1: + case OP_TLBI_VAAE1IS: + case OP_TLBI_VAAE1OS: + case OP_TLBI_VAAE1NXS: + case OP_TLBI_VAAE1ISNXS: + case OP_TLBI_VAAE1OSNXS: + __tlbi(vaae1is, va); + break; + case OP_TLBI_VAALE1: + case OP_TLBI_VAALE1IS: + case OP_TLBI_VAALE1OS: + case OP_TLBI_VAALE1NXS: + case OP_TLBI_VAALE1ISNXS: + case OP_TLBI_VAALE1OSNXS: + __tlbi(vaale1is, va); + break; + case OP_TLBI_RVAE2: + case OP_TLBI_RVAE2IS: + case OP_TLBI_RVAE2OS: + case OP_TLBI_RVAE1: + case OP_TLBI_RVAE1IS: + case OP_TLBI_RVAE1OS: + case OP_TLBI_RVAE2NXS: + case OP_TLBI_RVAE2ISNXS: + case OP_TLBI_RVAE2OSNXS: + case OP_TLBI_RVAE1NXS: + case OP_TLBI_RVAE1ISNXS: + case OP_TLBI_RVAE1OSNXS: + __tlbi(rvae1is, va); + break; + case OP_TLBI_RVALE2: + case OP_TLBI_RVALE2IS: + case OP_TLBI_RVALE2OS: + case OP_TLBI_RVALE1: + case OP_TLBI_RVALE1IS: + case OP_TLBI_RVALE1OS: + case OP_TLBI_RVALE2NXS: + case OP_TLBI_RVALE2ISNXS: + case OP_TLBI_RVALE2OSNXS: + case OP_TLBI_RVALE1NXS: + case OP_TLBI_RVALE1ISNXS: + case OP_TLBI_RVALE1OSNXS: + __tlbi(rvale1is, va); + break; + case OP_TLBI_RVAAE1: + case OP_TLBI_RVAAE1IS: + case OP_TLBI_RVAAE1OS: + case OP_TLBI_RVAAE1NXS: + case OP_TLBI_RVAAE1ISNXS: + case OP_TLBI_RVAAE1OSNXS: + __tlbi(rvaae1is, va); + break; + case OP_TLBI_RVAALE1: + case OP_TLBI_RVAALE1IS: + case OP_TLBI_RVAALE1OS: + case OP_TLBI_RVAALE1NXS: + case OP_TLBI_RVAALE1ISNXS: + case OP_TLBI_RVAALE1OSNXS: + __tlbi(rvaale1is, va); + break; + default: + ret = -EINVAL; + } + dsb(ish); + isb(); + + if (mmu) + exit_vmid_context(&cxt); + + return ret; +} diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c index 5763d979d8ca..569941eeb3fe 100644 --- a/arch/arm64/kvm/hypercalls.c +++ b/arch/arm64/kvm/hypercalls.c @@ -15,6 +15,8 @@ GENMASK(KVM_REG_ARM_STD_HYP_BMAP_BIT_COUNT - 1, 0) #define KVM_ARM_SMCCC_VENDOR_HYP_FEATURES \ GENMASK(KVM_REG_ARM_VENDOR_HYP_BMAP_BIT_COUNT - 1, 0) +#define KVM_ARM_SMCCC_VENDOR_HYP_FEATURES_2 \ + GENMASK(KVM_REG_ARM_VENDOR_HYP_BMAP_2_BIT_COUNT - 1, 0) static void kvm_ptp_get_time(struct kvm_vcpu *vcpu, u64 *val) { @@ -317,7 +319,7 @@ int kvm_smccc_call_handler(struct kvm_vcpu *vcpu) * to the guest, and hide SSBS so that the * guest stays protected. */ - if (cpus_have_final_cap(ARM64_SSBS)) + if (kvm_has_feat(vcpu->kvm, ID_AA64PFR1_EL1, SSBS, IMP)) break; fallthrough; case SPECTRE_UNAFFECTED: @@ -360,6 +362,8 @@ int kvm_smccc_call_handler(struct kvm_vcpu *vcpu) break; case ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID: val[0] = smccc_feat->vendor_hyp_bmap; + /* Function numbers 2-63 are reserved for pKVM for now */ + val[2] = smccc_feat->vendor_hyp_bmap_2; break; case ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID: kvm_ptp_get_time(vcpu, val); @@ -387,6 +391,7 @@ static const u64 kvm_arm_fw_reg_ids[] = { KVM_REG_ARM_STD_BMAP, KVM_REG_ARM_STD_HYP_BMAP, KVM_REG_ARM_VENDOR_HYP_BMAP, + KVM_REG_ARM_VENDOR_HYP_BMAP_2, }; void kvm_arm_init_hypercalls(struct kvm *kvm) @@ -428,7 +433,7 @@ int kvm_arm_copy_fw_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) * Convert the workaround level into an easy-to-compare number, where higher * values mean better protection. */ -static int get_kernel_wa_level(u64 regid) +static int get_kernel_wa_level(struct kvm_vcpu *vcpu, u64 regid) { switch (regid) { case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1: @@ -449,7 +454,7 @@ static int get_kernel_wa_level(u64 regid) * don't have any FW mitigation if SSBS is there at * all times. */ - if (cpus_have_final_cap(ARM64_SSBS)) + if (kvm_has_feat(vcpu->kvm, ID_AA64PFR1_EL1, SSBS, IMP)) return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL; fallthrough; case SPECTRE_UNAFFECTED: @@ -486,7 +491,7 @@ int kvm_arm_get_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1: case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2: case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3: - val = get_kernel_wa_level(reg->id) & KVM_REG_FEATURE_LEVEL_MASK; + val = get_kernel_wa_level(vcpu, reg->id) & KVM_REG_FEATURE_LEVEL_MASK; break; case KVM_REG_ARM_STD_BMAP: val = READ_ONCE(smccc_feat->std_bmap); @@ -497,6 +502,9 @@ int kvm_arm_get_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) case KVM_REG_ARM_VENDOR_HYP_BMAP: val = READ_ONCE(smccc_feat->vendor_hyp_bmap); break; + case KVM_REG_ARM_VENDOR_HYP_BMAP_2: + val = READ_ONCE(smccc_feat->vendor_hyp_bmap_2); + break; default: return -ENOENT; } @@ -527,6 +535,10 @@ static int kvm_arm_set_fw_reg_bmap(struct kvm_vcpu *vcpu, u64 reg_id, u64 val) fw_reg_bmap = &smccc_feat->vendor_hyp_bmap; fw_reg_features = KVM_ARM_SMCCC_VENDOR_HYP_FEATURES; break; + case KVM_REG_ARM_VENDOR_HYP_BMAP_2: + fw_reg_bmap = &smccc_feat->vendor_hyp_bmap_2; + fw_reg_features = KVM_ARM_SMCCC_VENDOR_HYP_FEATURES_2; + break; default: return -ENOENT; } @@ -575,6 +587,8 @@ int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) case KVM_ARM_PSCI_0_2: case KVM_ARM_PSCI_1_0: case KVM_ARM_PSCI_1_1: + case KVM_ARM_PSCI_1_2: + case KVM_ARM_PSCI_1_3: if (!wants_02) return -EINVAL; vcpu->kvm->arch.psci_version = val; @@ -588,7 +602,7 @@ int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) if (val & ~KVM_REG_FEATURE_LEVEL_MASK) return -EINVAL; - if (get_kernel_wa_level(reg->id) < val) + if (get_kernel_wa_level(vcpu, reg->id) < val) return -EINVAL; return 0; @@ -624,13 +638,14 @@ int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) * We can deal with NOT_AVAIL on NOT_REQUIRED, but not the * other way around. */ - if (get_kernel_wa_level(reg->id) < wa_level) + if (get_kernel_wa_level(vcpu, reg->id) < wa_level) return -EINVAL; return 0; case KVM_REG_ARM_STD_BMAP: case KVM_REG_ARM_STD_HYP_BMAP: case KVM_REG_ARM_VENDOR_HYP_BMAP: + case KVM_REG_ARM_VENDOR_HYP_BMAP_2: return kvm_arm_set_fw_reg_bmap(vcpu, reg->id, val); default: return -ENOENT; diff --git a/arch/arm64/kvm/mmio.c b/arch/arm64/kvm/mmio.c index 200c8019a82a..ab365e839874 100644 --- a/arch/arm64/kvm/mmio.c +++ b/arch/arm64/kvm/mmio.c @@ -72,6 +72,31 @@ unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len) return data; } +static bool kvm_pending_sync_exception(struct kvm_vcpu *vcpu) +{ + if (!vcpu_get_flag(vcpu, PENDING_EXCEPTION)) + return false; + + if (vcpu_el1_is_32bit(vcpu)) { + switch (vcpu_get_flag(vcpu, EXCEPT_MASK)) { + case unpack_vcpu_flag(EXCEPT_AA32_UND): + case unpack_vcpu_flag(EXCEPT_AA32_IABT): + case unpack_vcpu_flag(EXCEPT_AA32_DABT): + return true; + default: + return false; + } + } else { + switch (vcpu_get_flag(vcpu, EXCEPT_MASK)) { + case unpack_vcpu_flag(EXCEPT_AA64_EL1_SYNC): + case unpack_vcpu_flag(EXCEPT_AA64_EL2_SYNC): + return true; + default: + return false; + } + } +} + /** * kvm_handle_mmio_return -- Handle MMIO loads after user space emulation * or in-kernel IO emulation @@ -84,9 +109,12 @@ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu) unsigned int len; int mask; - /* Detect an already handled MMIO return */ - if (unlikely(!vcpu->mmio_needed)) - return 0; + /* + * Detect if the MMIO return was already handled or if userspace aborted + * the MMIO access. + */ + if (unlikely(!vcpu->mmio_needed || kvm_pending_sync_exception(vcpu))) + return 1; vcpu->mmio_needed = 0; @@ -117,7 +145,7 @@ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu) */ kvm_incr_pc(vcpu); - return 0; + return 1; } int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) @@ -133,11 +161,19 @@ int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) /* * No valid syndrome? Ask userspace for help if it has * volunteered to do so, and bail out otherwise. + * + * In the protected VM case, there isn't much userspace can do + * though, so directly deliver an exception to the guest. */ if (!kvm_vcpu_dabt_isvalid(vcpu)) { trace_kvm_mmio_nisv(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu), kvm_vcpu_get_hfar(vcpu), fault_ipa); + if (vcpu_is_protected(vcpu)) { + kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu)); + return 1; + } + if (test_bit(KVM_ARCH_FLAG_RETURN_NISV_IO_ABORT_TO_USER, &vcpu->kvm->arch.flags)) { run->exit_reason = KVM_EXIT_ARM_NISV; diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 18680771cdb0..2feb6c6b63af 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -15,6 +15,7 @@ #include <asm/kvm_arm.h> #include <asm/kvm_mmu.h> #include <asm/kvm_pgtable.h> +#include <asm/kvm_pkvm.h> #include <asm/kvm_ras.h> #include <asm/kvm_asm.h> #include <asm/kvm_emulate.h> @@ -29,8 +30,12 @@ static unsigned long __ro_after_init hyp_idmap_start; static unsigned long __ro_after_init hyp_idmap_end; static phys_addr_t __ro_after_init hyp_idmap_vector; +u32 __ro_after_init __hyp_va_bits; + static unsigned long __ro_after_init io_map_base; +#define KVM_PGT_FN(fn) (!is_protected_kvm_enabled() ? fn : p ## fn) + static phys_addr_t __stage2_range_addr_end(phys_addr_t addr, phys_addr_t end, phys_addr_t size) { @@ -147,7 +152,7 @@ static int kvm_mmu_split_huge_pages(struct kvm *kvm, phys_addr_t addr, return -EINVAL; next = __stage2_range_addr_end(addr, end, chunk_size); - ret = kvm_pgtable_stage2_split(pgt, addr, next - addr, cache); + ret = KVM_PGT_FN(kvm_pgtable_stage2_split)(pgt, addr, next - addr, cache); if (ret) break; } while (addr = next, addr != end); @@ -168,15 +173,23 @@ static bool memslot_is_logging(struct kvm_memory_slot *memslot) */ int kvm_arch_flush_remote_tlbs(struct kvm *kvm) { - kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu); + if (is_protected_kvm_enabled()) + kvm_call_hyp_nvhe(__pkvm_tlb_flush_vmid, kvm->arch.pkvm.handle); + else + kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu); return 0; } int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages) { - kvm_tlb_flush_vmid_range(&kvm->arch.mmu, - gfn << PAGE_SHIFT, nr_pages << PAGE_SHIFT); + u64 size = nr_pages << PAGE_SHIFT; + u64 addr = gfn << PAGE_SHIFT; + + if (is_protected_kvm_enabled()) + kvm_call_hyp_nvhe(__pkvm_tlb_flush_vmid, kvm->arch.pkvm.handle); + else + kvm_tlb_flush_vmid_range(&kvm->arch.mmu, addr, size); return 0; } @@ -225,7 +238,7 @@ static void stage2_free_unlinked_table_rcu_cb(struct rcu_head *head) void *pgtable = page_to_virt(page); s8 level = page_private(page); - kvm_pgtable_stage2_free_unlinked(&kvm_s2_mm_ops, pgtable, level); + KVM_PGT_FN(kvm_pgtable_stage2_free_unlinked)(&kvm_s2_mm_ops, pgtable, level); } static void stage2_free_unlinked_table(void *addr, s8 level) @@ -324,13 +337,19 @@ static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 lockdep_assert_held_write(&kvm->mmu_lock); WARN_ON(size & ~PAGE_MASK); - WARN_ON(stage2_apply_range(mmu, start, end, kvm_pgtable_stage2_unmap, + WARN_ON(stage2_apply_range(mmu, start, end, KVM_PGT_FN(kvm_pgtable_stage2_unmap), may_block)); } -static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size) +void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start, + u64 size, bool may_block) +{ + __unmap_stage2_range(mmu, start, size, may_block); +} + +void kvm_stage2_flush_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end) { - __unmap_stage2_range(mmu, start, size, true); + stage2_apply_range_resched(mmu, addr, end, KVM_PGT_FN(kvm_pgtable_stage2_flush)); } static void stage2_flush_memslot(struct kvm *kvm, @@ -339,7 +358,7 @@ static void stage2_flush_memslot(struct kvm *kvm, phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT; phys_addr_t end = addr + PAGE_SIZE * memslot->npages; - stage2_apply_range_resched(&kvm->arch.mmu, addr, end, kvm_pgtable_stage2_flush); + kvm_stage2_flush_range(&kvm->arch.mmu, addr, end); } /** @@ -362,6 +381,8 @@ static void stage2_flush_vm(struct kvm *kvm) kvm_for_each_memslot(memslot, bkt, slots) stage2_flush_memslot(kvm, memslot); + kvm_nested_s2_flush(kvm); + write_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, idx); } @@ -696,10 +717,10 @@ int create_hyp_stack(phys_addr_t phys_addr, unsigned long *haddr) mutex_lock(&kvm_hyp_pgd_mutex); /* - * Efficient stack verification using the PAGE_SHIFT bit implies + * Efficient stack verification using the NVHE_STACK_SHIFT bit implies * an alignment of our allocation on the order of the size. */ - size = PAGE_SIZE * 2; + size = NVHE_STACK_SIZE * 2; base = ALIGN_DOWN(io_map_base - size, size); ret = __hyp_alloc_private_va_range(base); @@ -716,12 +737,12 @@ int create_hyp_stack(phys_addr_t phys_addr, unsigned long *haddr) * at the higher address and leave the lower guard page * unbacked. * - * Any valid stack address now has the PAGE_SHIFT bit as 1 + * Any valid stack address now has the NVHE_STACK_SHIFT bit as 1 * and addresses corresponding to the guard page have the - * PAGE_SHIFT bit as 0 - this is used for overflow detection. + * NVHE_STACK_SHIFT bit as 0 - this is used for overflow detection. */ - ret = __create_hyp_mappings(base + PAGE_SIZE, PAGE_SIZE, phys_addr, - PAGE_HYP); + ret = __create_hyp_mappings(base + NVHE_STACK_SIZE, NVHE_STACK_SIZE, + phys_addr, PAGE_HYP); if (ret) kvm_err("Cannot map hyp stack\n"); @@ -855,21 +876,9 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = { .icache_inval_pou = invalidate_icache_guest_page, }; -/** - * kvm_init_stage2_mmu - Initialise a S2 MMU structure - * @kvm: The pointer to the KVM structure - * @mmu: The pointer to the s2 MMU structure - * @type: The machine type of the virtual machine - * - * Allocates only the stage-2 HW PGD level table(s). - * Note we don't need locking here as this is only called when the VM is - * created, which can only be done once. - */ -int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type) +static int kvm_init_ipa_range(struct kvm_s2_mmu *mmu, unsigned long type) { u32 kvm_ipa_limit = get_kvm_ipa_limit(); - int cpu, err; - struct kvm_pgtable *pgt; u64 mmfr0, mmfr1; u32 phys_shift; @@ -896,20 +905,64 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); mmu->vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift); + return 0; +} + +/** + * kvm_init_stage2_mmu - Initialise a S2 MMU structure + * @kvm: The pointer to the KVM structure + * @mmu: The pointer to the s2 MMU structure + * @type: The machine type of the virtual machine + * + * Allocates only the stage-2 HW PGD level table(s). + * Note we don't need locking here as this is only called in two cases: + * + * - when the VM is created, which can't race against anything + * + * - when secondary kvm_s2_mmu structures are initialised for NV + * guests, and the caller must hold kvm->lock as this is called on a + * per-vcpu basis. + */ +int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type) +{ + int cpu, err; + struct kvm_pgtable *pgt; + + /* + * If we already have our page tables in place, and that the + * MMU context is the canonical one, we have a bug somewhere, + * as this is only supposed to ever happen once per VM. + * + * Otherwise, we're building nested page tables, and that's + * probably because userspace called KVM_ARM_VCPU_INIT more + * than once on the same vcpu. Since that's actually legal, + * don't kick a fuss and leave gracefully. + */ if (mmu->pgt != NULL) { + if (kvm_is_nested_s2_mmu(kvm, mmu)) + return 0; + kvm_err("kvm_arch already initialized?\n"); return -EINVAL; } + err = kvm_init_ipa_range(mmu, type); + if (err) + return err; + pgt = kzalloc(sizeof(*pgt), GFP_KERNEL_ACCOUNT); if (!pgt) return -ENOMEM; mmu->arch = &kvm->arch; - err = kvm_pgtable_stage2_init(pgt, mmu, &kvm_s2_mm_ops); + err = KVM_PGT_FN(kvm_pgtable_stage2_init)(pgt, mmu, &kvm_s2_mm_ops); if (err) goto out_free_pgtable; + mmu->pgt = pgt; + if (is_protected_kvm_enabled()) + return 0; + mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran)); if (!mmu->last_vcpu_ran) { err = -ENOMEM; @@ -923,12 +976,15 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t mmu->split_page_chunk_size = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT; mmu->split_page_cache.gfp_zero = __GFP_ZERO; - mmu->pgt = pgt; mmu->pgd_phys = __pa(pgt->pgd); + + if (kvm_is_nested_s2_mmu(kvm, mmu)) + kvm_init_nested_s2_mmu(mmu); + return 0; out_destroy_pgtable: - kvm_pgtable_stage2_destroy(pgt); + KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt); out_free_pgtable: kfree(pgt); return err; @@ -976,7 +1032,7 @@ static void stage2_unmap_memslot(struct kvm *kvm, if (!(vma->vm_flags & VM_PFNMAP)) { gpa_t gpa = addr + (vm_start - memslot->userspace_addr); - unmap_stage2_range(&kvm->arch.mmu, gpa, vm_end - vm_start); + kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, vm_end - vm_start, true); } hva = vm_end; } while (hva < reg_end); @@ -1003,6 +1059,8 @@ void stage2_unmap_vm(struct kvm *kvm) kvm_for_each_memslot(memslot, bkt, slots) stage2_unmap_memslot(kvm, memslot); + kvm_nested_s2_unmap(kvm, true); + write_unlock(&kvm->mmu_lock); mmap_read_unlock(current->mm); srcu_read_unlock(&kvm->srcu, idx); @@ -1023,26 +1081,40 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu) write_unlock(&kvm->mmu_lock); if (pgt) { - kvm_pgtable_stage2_destroy(pgt); + KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt); kfree(pgt); } } -static void hyp_mc_free_fn(void *addr, void *unused) +static void hyp_mc_free_fn(void *addr, void *mc) { + struct kvm_hyp_memcache *memcache = mc; + + if (memcache->flags & HYP_MEMCACHE_ACCOUNT_STAGE2) + kvm_account_pgtable_pages(addr, -1); + free_page((unsigned long)addr); } -static void *hyp_mc_alloc_fn(void *unused) +static void *hyp_mc_alloc_fn(void *mc) { - return (void *)__get_free_page(GFP_KERNEL_ACCOUNT); + struct kvm_hyp_memcache *memcache = mc; + void *addr; + + addr = (void *)__get_free_page(GFP_KERNEL_ACCOUNT); + if (addr && memcache->flags & HYP_MEMCACHE_ACCOUNT_STAGE2) + kvm_account_pgtable_pages(addr, 1); + + return addr; } void free_hyp_memcache(struct kvm_hyp_memcache *mc) { - if (is_protected_kvm_enabled()) - __free_hyp_memcache(mc, hyp_mc_free_fn, - kvm_host_va, NULL); + if (!is_protected_kvm_enabled()) + return; + + kfree(mc->mapping); + __free_hyp_memcache(mc, hyp_mc_free_fn, kvm_host_va, mc); } int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages) @@ -1050,8 +1122,14 @@ int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages) if (!is_protected_kvm_enabled()) return 0; + if (!mc->mapping) { + mc->mapping = kzalloc(sizeof(struct pkvm_mapping), GFP_KERNEL_ACCOUNT); + if (!mc->mapping) + return -ENOMEM; + } + return __topup_hyp_memcache(mc, min_pages, hyp_mc_alloc_fn, - kvm_host_pa, NULL); + kvm_host_pa, mc); } /** @@ -1088,8 +1166,8 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, break; write_lock(&kvm->mmu_lock); - ret = kvm_pgtable_stage2_map(pgt, addr, PAGE_SIZE, pa, prot, - &cache, 0); + ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, addr, PAGE_SIZE, + pa, prot, &cache, 0); write_unlock(&kvm->mmu_lock); if (ret) break; @@ -1102,14 +1180,14 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, } /** - * stage2_wp_range() - write protect stage2 memory region range + * kvm_stage2_wp_range() - write protect stage2 memory region range * @mmu: The KVM stage-2 MMU pointer * @addr: Start address of range * @end: End address of range */ -static void stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end) +void kvm_stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end) { - stage2_apply_range_resched(mmu, addr, end, kvm_pgtable_stage2_wrprotect); + stage2_apply_range_resched(mmu, addr, end, KVM_PGT_FN(kvm_pgtable_stage2_wrprotect)); } /** @@ -1138,7 +1216,8 @@ static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; write_lock(&kvm->mmu_lock); - stage2_wp_range(&kvm->arch.mmu, start, end); + kvm_stage2_wp_range(&kvm->arch.mmu, start, end); + kvm_nested_s2_wp(kvm); write_unlock(&kvm->mmu_lock); kvm_flush_remote_tlbs_memslot(kvm, memslot); } @@ -1192,7 +1271,7 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, lockdep_assert_held_write(&kvm->mmu_lock); - stage2_wp_range(&kvm->arch.mmu, start, end); + kvm_stage2_wp_range(&kvm->arch.mmu, start, end); /* * Eager-splitting is done when manual-protect is set. We @@ -1204,6 +1283,8 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, */ if (kvm_dirty_log_manual_protect_and_init_set(kvm)) kvm_mmu_split_huge_pages(kvm, start, end); + + kvm_nested_s2_wp(kvm); } static void kvm_send_hwpoison_signal(unsigned long address, short lsb) @@ -1357,10 +1438,21 @@ static void sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn, { unsigned long i, nr_pages = size >> PAGE_SHIFT; struct page *page = pfn_to_page(pfn); + struct folio *folio = page_folio(page); if (!kvm_has_mte(kvm)) return; + if (folio_test_hugetlb(folio)) { + /* Hugetlb has MTE flags set on head page only */ + if (folio_try_hugetlb_mte_tagging(folio)) { + for (i = 0; i < nr_pages; i++, page++) + mte_clear_page_tags(page_address(page)); + folio_set_hugetlb_mte_tagged(folio); + } + return; + } + for (i = 0; i < nr_pages; i++, page++) { if (try_page_mte_tagging(page)) { mte_clear_page_tags(page_address(page)); @@ -1375,6 +1467,7 @@ static bool kvm_vma_mte_allowed(struct vm_area_struct *vma) } static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, + struct kvm_s2_trans *nested, struct kvm_memory_slot *memslot, unsigned long hva, bool fault_is_perm) { @@ -1383,16 +1476,19 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, bool exec_fault, mte_allowed; bool device = false, vfio_allow_any_uc = false; unsigned long mmu_seq; + phys_addr_t ipa = fault_ipa; struct kvm *kvm = vcpu->kvm; - struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache; struct vm_area_struct *vma; short vma_shift; + void *memcache; gfn_t gfn; kvm_pfn_t pfn; bool logging_active = memslot_is_logging(memslot); long vma_pagesize, fault_granule; enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; struct kvm_pgtable *pgt; + struct page *page; + enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED; if (fault_is_perm) fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu); @@ -1412,8 +1508,15 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * and a write fault needs to collapse a block entry into a table. */ if (!fault_is_perm || (logging_active && write_fault)) { - ret = kvm_mmu_topup_memory_cache(memcache, - kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu)); + int min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu); + + if (!is_protected_kvm_enabled()) { + memcache = &vcpu->arch.mmu_page_cache; + ret = kvm_mmu_topup_memory_cache(memcache, min_pages); + } else { + memcache = &vcpu->arch.pkvm_memcache; + ret = topup_hyp_memcache(memcache, min_pages); + } if (ret) return ret; } @@ -1434,7 +1537,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * logging_active is guaranteed to never be true for VM_PFNMAP * memslots. */ - if (logging_active) { + if (logging_active || is_protected_kvm_enabled()) { force_pte = true; vma_shift = PAGE_SHIFT; } else { @@ -1466,10 +1569,45 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, } vma_pagesize = 1UL << vma_shift; - if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) + + if (nested) { + unsigned long max_map_size; + + max_map_size = force_pte ? PAGE_SIZE : PUD_SIZE; + + ipa = kvm_s2_trans_output(nested); + + /* + * If we're about to create a shadow stage 2 entry, then we + * can only create a block mapping if the guest stage 2 page + * table uses at least as big a mapping. + */ + max_map_size = min(kvm_s2_trans_size(nested), max_map_size); + + /* + * Be careful that if the mapping size falls between + * two host sizes, take the smallest of the two. + */ + if (max_map_size >= PMD_SIZE && max_map_size < PUD_SIZE) + max_map_size = PMD_SIZE; + else if (max_map_size >= PAGE_SIZE && max_map_size < PMD_SIZE) + max_map_size = PAGE_SIZE; + + force_pte = (max_map_size == PAGE_SIZE); + vma_pagesize = min(vma_pagesize, (long)max_map_size); + } + + /* + * Both the canonical IPA and fault IPA must be hugepage-aligned to + * ensure we find the right PFN and lay down the mapping in the right + * place. + */ + if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) { fault_ipa &= ~(vma_pagesize - 1); + ipa &= ~(vma_pagesize - 1); + } - gfn = fault_ipa >> PAGE_SHIFT; + gfn = ipa >> PAGE_SHIFT; mte_allowed = kvm_vma_mte_allowed(vma); vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED; @@ -1479,7 +1617,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, /* * Read mmu_invalidate_seq so that KVM can detect if the results of - * vma_lookup() or __gfn_to_pfn_memslot() become stale prior to + * vma_lookup() or __kvm_faultin_pfn() become stale prior to * acquiring kvm->mmu_lock. * * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs @@ -1488,8 +1626,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, mmu_seq = vcpu->kvm->mmu_invalidate_seq; mmap_read_unlock(current->mm); - pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL, - write_fault, &writable, NULL); + pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0, + &writable, &page); if (pfn == KVM_PFN_ERR_HWPOISON) { kvm_send_hwpoison_signal(hva, vma_shift); return 0; @@ -1502,7 +1640,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * If the page was identified as device early by looking at * the VMA flags, vma_pagesize is already representing the * largest quantity we can map. If instead it was mapped - * via gfn_to_pfn_prot(), vma_pagesize is set to PAGE_SIZE + * via __kvm_faultin_pfn(), vma_pagesize is set to PAGE_SIZE * and must not be upgraded. * * In both cases, we don't let transparent_hugepage_adjust() @@ -1520,10 +1658,31 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (exec_fault && device) return -ENOEXEC; - read_lock(&kvm->mmu_lock); + /* + * Potentially reduce shadow S2 permissions to match the guest's own + * S2. For exec faults, we'd only reach this point if the guest + * actually allowed it (see kvm_s2_handle_perm_fault). + * + * Also encode the level of the original translation in the SW bits + * of the leaf entry as a proxy for the span of that translation. + * This will be retrieved on TLB invalidation from the guest and + * used to limit the invalidation scope if a TTL hint or a range + * isn't provided. + */ + if (nested) { + writable &= kvm_s2_trans_writable(nested); + if (!kvm_s2_trans_readable(nested)) + prot &= ~KVM_PGTABLE_PROT_R; + + prot |= kvm_encode_nested_level(nested); + } + + kvm_fault_lock(kvm); pgt = vcpu->arch.hw_mmu->pgt; - if (mmu_invalidate_retry(kvm, mmu_seq)) + if (mmu_invalidate_retry(kvm, mmu_seq)) { + ret = -EAGAIN; goto out_unlock; + } /* * If we are not forced to use page mapping, check if we are @@ -1564,7 +1723,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, prot |= KVM_PGTABLE_PROT_NORMAL_NC; else prot |= KVM_PGTABLE_PROT_DEVICE; - } else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC)) { + } else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC) && + (!nested || kvm_s2_trans_executable(nested))) { prot |= KVM_PGTABLE_PROT_X; } @@ -1573,42 +1733,42 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * permissions only if vma_pagesize equals fault_granule. Otherwise, * kvm_pgtable_stage2_map() should be called to change block size. */ - if (fault_is_perm && vma_pagesize == fault_granule) - ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot); - else - ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize, + if (fault_is_perm && vma_pagesize == fault_granule) { + /* + * Drop the SW bits in favour of those stored in the + * PTE, which will be preserved. + */ + prot &= ~KVM_NV_GUEST_MAP_SZ; + ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, fault_ipa, prot, flags); + } else { + ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, vma_pagesize, __pfn_to_phys(pfn), prot, - memcache, - KVM_PGTABLE_WALK_HANDLE_FAULT | - KVM_PGTABLE_WALK_SHARED); + memcache, flags); + } + +out_unlock: + kvm_release_faultin_page(kvm, page, !!ret, writable); + kvm_fault_unlock(kvm); /* Mark the page dirty only if the fault is handled successfully */ - if (writable && !ret) { - kvm_set_pfn_dirty(pfn); + if (writable && !ret) mark_page_dirty_in_slot(kvm, memslot, gfn); - } -out_unlock: - read_unlock(&kvm->mmu_lock); - kvm_release_pfn_clean(pfn); return ret != -EAGAIN ? ret : 0; } /* Resolve the access fault by making the page young again. */ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) { - kvm_pte_t pte; + enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED; struct kvm_s2_mmu *mmu; trace_kvm_access_fault(fault_ipa); read_lock(&vcpu->kvm->mmu_lock); mmu = vcpu->arch.hw_mmu; - pte = kvm_pgtable_stage2_mkyoung(mmu->pgt, fault_ipa); + KVM_PGT_FN(kvm_pgtable_stage2_mkyoung)(mmu->pgt, fault_ipa, flags); read_unlock(&vcpu->kvm->mmu_lock); - - if (kvm_pte_valid(pte)) - kvm_set_pfn_accessed(kvm_pte_to_pfn(pte)); } /** @@ -1624,8 +1784,10 @@ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) */ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) { + struct kvm_s2_trans nested_trans, *nested = NULL; unsigned long esr; - phys_addr_t fault_ipa; + phys_addr_t fault_ipa; /* The address we faulted on */ + phys_addr_t ipa; /* Always the IPA in the L1 guest phys space */ struct kvm_memory_slot *memslot; unsigned long hva; bool is_iabt, write_fault, writable; @@ -1634,10 +1796,10 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) esr = kvm_vcpu_get_esr(vcpu); - fault_ipa = kvm_vcpu_get_fault_ipa(vcpu); + ipa = fault_ipa = kvm_vcpu_get_fault_ipa(vcpu); is_iabt = kvm_vcpu_trap_is_iabt(vcpu); - if (esr_fsc_is_permission_fault(esr)) { + if (esr_fsc_is_translation_fault(esr)) { /* Beyond sanitised PARange (which is the IPA limit) */ if (fault_ipa >= BIT_ULL(get_kvm_ipa_limit())) { kvm_inject_size_fault(vcpu); @@ -1645,7 +1807,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) } /* Falls between the IPA range and the PARange? */ - if (fault_ipa >= BIT_ULL(vcpu->arch.hw_mmu->pgt->ia_bits)) { + if (fault_ipa >= BIT_ULL(VTCR_EL2_IPA(vcpu->arch.hw_mmu->vtcr))) { fault_ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0); if (is_iabt) @@ -1684,7 +1846,42 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) idx = srcu_read_lock(&vcpu->kvm->srcu); - gfn = fault_ipa >> PAGE_SHIFT; + /* + * We may have faulted on a shadow stage 2 page table if we are + * running a nested guest. In this case, we have to resolve the L2 + * IPA to the L1 IPA first, before knowing what kind of memory should + * back the L1 IPA. + * + * If the shadow stage 2 page table walk faults, then we simply inject + * this to the guest and carry on. + * + * If there are no shadow S2 PTs because S2 is disabled, there is + * nothing to walk and we treat it as a 1:1 before going through the + * canonical translation. + */ + if (kvm_is_nested_s2_mmu(vcpu->kvm,vcpu->arch.hw_mmu) && + vcpu->arch.hw_mmu->nested_stage2_enabled) { + u32 esr; + + ret = kvm_walk_nested_s2(vcpu, fault_ipa, &nested_trans); + if (ret) { + esr = kvm_s2_trans_esr(&nested_trans); + kvm_inject_s2_fault(vcpu, esr); + goto out_unlock; + } + + ret = kvm_s2_handle_perm_fault(vcpu, &nested_trans); + if (ret) { + esr = kvm_s2_trans_esr(&nested_trans); + kvm_inject_s2_fault(vcpu, esr); + goto out_unlock; + } + + ipa = kvm_s2_trans_output(&nested_trans); + nested = &nested_trans; + } + + gfn = ipa >> PAGE_SHIFT; memslot = gfn_to_memslot(vcpu->kvm, gfn); hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable); write_fault = kvm_is_write_fault(vcpu); @@ -1728,13 +1925,13 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) * faulting VA. This is always 12 bits, irrespective * of the page size. */ - fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1); - ret = io_mem_abort(vcpu, fault_ipa); + ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0); + ret = io_mem_abort(vcpu, ipa); goto out_unlock; } /* Userspace should not be able to register out-of-bounds IPAs */ - VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->arch.hw_mmu)); + VM_BUG_ON(ipa >= kvm_phys_size(vcpu->arch.hw_mmu)); if (esr_fsc_is_access_flag_fault(esr)) { handle_access_fault(vcpu, fault_ipa); @@ -1742,7 +1939,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) goto out_unlock; } - ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, + ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva, esr_fsc_is_permission_fault(esr)); if (ret == 0) ret = 1; @@ -1765,40 +1962,7 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) (range->end - range->start) << PAGE_SHIFT, range->may_block); - return false; -} - -bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) -{ - kvm_pfn_t pfn = pte_pfn(range->arg.pte); - - if (!kvm->arch.mmu.pgt) - return false; - - WARN_ON(range->end - range->start != 1); - - /* - * If the page isn't tagged, defer to user_mem_abort() for sanitising - * the MTE tags. The S2 pte should have been unmapped by - * mmu_notifier_invalidate_range_end(). - */ - if (kvm_has_mte(kvm) && !page_mte_tagged(pfn_to_page(pfn))) - return false; - - /* - * We've moved a page around, probably through CoW, so let's treat - * it just like a translation fault and the map handler will clean - * the cache to the PoC. - * - * The MMU notifiers will have unmapped a huge PMD before calling - * ->change_pte() (which in turn calls kvm_set_spte_gfn()) and - * therefore we never need to clear out a huge PMD through this - * calling path and a memcache is not required. - */ - kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, range->start << PAGE_SHIFT, - PAGE_SIZE, __pfn_to_phys(pfn), - KVM_PGTABLE_PROT_R, NULL, 0); - + kvm_nested_s2_unmap(kvm, range->may_block); return false; } @@ -1809,9 +1973,13 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) if (!kvm->arch.mmu.pgt) return false; - return kvm_pgtable_stage2_test_clear_young(kvm->arch.mmu.pgt, + return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt, range->start << PAGE_SHIFT, size, true); + /* + * TODO: Handle nested_mmu structures here using the reverse mapping in + * a later version of patch series. + */ } bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) @@ -1821,7 +1989,7 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) if (!kvm->arch.mmu.pgt) return false; - return kvm_pgtable_stage2_test_clear_young(kvm->arch.mmu.pgt, + return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt, range->start << PAGE_SHIFT, size, false); } @@ -1931,6 +2099,7 @@ int __init kvm_mmu_init(u32 *hyp_va_bits) goto out_destroy_pgtable; io_map_base = hyp_idmap_start; + __hyp_va_bits = *hyp_va_bits; return 0; out_destroy_pgtable: @@ -2054,11 +2223,6 @@ void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) { } -void kvm_arch_flush_shadow_all(struct kvm *kvm) -{ - kvm_uninit_stage2_mmu(kvm); -} - void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) { @@ -2066,7 +2230,8 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, phys_addr_t size = slot->npages << PAGE_SHIFT; write_lock(&kvm->mmu_lock); - unmap_stage2_range(&kvm->arch.mmu, gpa, size); + kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, size, true); + kvm_nested_s2_unmap(kvm, true); write_unlock(&kvm->mmu_lock); } diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index ced30c90521a..4a3fc11f7ecf 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -4,17 +4,802 @@ * Author: Jintack Lim <jintack.lim@linaro.org> */ +#include <linux/bitfield.h> #include <linux/kvm.h> #include <linux/kvm_host.h> +#include <asm/kvm_arm.h> #include <asm/kvm_emulate.h> +#include <asm/kvm_mmu.h> #include <asm/kvm_nested.h> #include <asm/sysreg.h> #include "sys_regs.h" -/* Protection against the sysreg repainting madness... */ -#define NV_FTR(r, f) ID_AA64##r##_EL1_##f +/* + * Ratio of live shadow S2 MMU per vcpu. This is a trade-off between + * memory usage and potential number of different sets of S2 PTs in + * the guests. Running out of S2 MMUs only affects performance (we + * will invalidate them more often). + */ +#define S2_MMU_PER_VCPU 2 + +void kvm_init_nested(struct kvm *kvm) +{ + kvm->arch.nested_mmus = NULL; + kvm->arch.nested_mmus_size = 0; +} + +static int init_nested_s2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu) +{ + /* + * We only initialise the IPA range on the canonical MMU, which + * defines the contract between KVM and userspace on where the + * "hardware" is in the IPA space. This affects the validity of MMIO + * exits forwarded to userspace, for example. + * + * For nested S2s, we use the PARange as exposed to the guest, as it + * is allowed to use it at will to expose whatever memory map it + * wants to its own guests as it would be on real HW. + */ + return kvm_init_stage2_mmu(kvm, mmu, kvm_get_pa_bits(kvm)); +} + +int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_s2_mmu *tmp; + int num_mmus, ret = 0; + + if (test_bit(KVM_ARM_VCPU_HAS_EL2_E2H0, kvm->arch.vcpu_features) && + !cpus_have_final_cap(ARM64_HAS_HCR_NV1)) + return -EINVAL; + + /* + * Let's treat memory allocation failures as benign: If we fail to + * allocate anything, return an error and keep the allocated array + * alive. Userspace may try to recover by intializing the vcpu + * again, and there is no reason to affect the whole VM for this. + */ + num_mmus = atomic_read(&kvm->online_vcpus) * S2_MMU_PER_VCPU; + tmp = kvrealloc(kvm->arch.nested_mmus, + size_mul(sizeof(*kvm->arch.nested_mmus), num_mmus), + GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (!tmp) + return -ENOMEM; + + swap(kvm->arch.nested_mmus, tmp); + + /* + * If we went through a realocation, adjust the MMU back-pointers in + * the previously initialised kvm_pgtable structures. + */ + if (kvm->arch.nested_mmus != tmp) + for (int i = 0; i < kvm->arch.nested_mmus_size; i++) + kvm->arch.nested_mmus[i].pgt->mmu = &kvm->arch.nested_mmus[i]; + + for (int i = kvm->arch.nested_mmus_size; !ret && i < num_mmus; i++) + ret = init_nested_s2_mmu(kvm, &kvm->arch.nested_mmus[i]); + + if (ret) { + for (int i = kvm->arch.nested_mmus_size; i < num_mmus; i++) + kvm_free_stage2_pgd(&kvm->arch.nested_mmus[i]); + + return ret; + } + + kvm->arch.nested_mmus_size = num_mmus; + + return 0; +} + +struct s2_walk_info { + int (*read_desc)(phys_addr_t pa, u64 *desc, void *data); + void *data; + u64 baddr; + unsigned int max_oa_bits; + unsigned int pgshift; + unsigned int sl; + unsigned int t0sz; + bool be; +}; + +static u32 compute_fsc(int level, u32 fsc) +{ + return fsc | (level & 0x3); +} + +static int esr_s2_fault(struct kvm_vcpu *vcpu, int level, u32 fsc) +{ + u32 esr; + + esr = kvm_vcpu_get_esr(vcpu) & ~ESR_ELx_FSC; + esr |= compute_fsc(level, fsc); + return esr; +} + +static int get_ia_size(struct s2_walk_info *wi) +{ + return 64 - wi->t0sz; +} + +static int check_base_s2_limits(struct s2_walk_info *wi, + int level, int input_size, int stride) +{ + int start_size, ia_size; + + ia_size = get_ia_size(wi); + + /* Check translation limits */ + switch (BIT(wi->pgshift)) { + case SZ_64K: + if (level == 0 || (level == 1 && ia_size <= 42)) + return -EFAULT; + break; + case SZ_16K: + if (level == 0 || (level == 1 && ia_size <= 40)) + return -EFAULT; + break; + case SZ_4K: + if (level < 0 || (level == 0 && ia_size <= 42)) + return -EFAULT; + break; + } + + /* Check input size limits */ + if (input_size > ia_size) + return -EFAULT; + + /* Check number of entries in starting level table */ + start_size = input_size - ((3 - level) * stride + wi->pgshift); + if (start_size < 1 || start_size > stride + 4) + return -EFAULT; + + return 0; +} + +/* Check if output is within boundaries */ +static int check_output_size(struct s2_walk_info *wi, phys_addr_t output) +{ + unsigned int output_size = wi->max_oa_bits; + + if (output_size != 48 && (output & GENMASK_ULL(47, output_size))) + return -1; + + return 0; +} + +/* + * This is essentially a C-version of the pseudo code from the ARM ARM + * AArch64.TranslationTableWalk function. I strongly recommend looking at + * that pseudocode in trying to understand this. + * + * Must be called with the kvm->srcu read lock held + */ +static int walk_nested_s2_pgd(phys_addr_t ipa, + struct s2_walk_info *wi, struct kvm_s2_trans *out) +{ + int first_block_level, level, stride, input_size, base_lower_bound; + phys_addr_t base_addr; + unsigned int addr_top, addr_bottom; + u64 desc; /* page table entry */ + int ret; + phys_addr_t paddr; + + switch (BIT(wi->pgshift)) { + default: + case SZ_64K: + case SZ_16K: + level = 3 - wi->sl; + first_block_level = 2; + break; + case SZ_4K: + level = 2 - wi->sl; + first_block_level = 1; + break; + } + + stride = wi->pgshift - 3; + input_size = get_ia_size(wi); + if (input_size > 48 || input_size < 25) + return -EFAULT; + + ret = check_base_s2_limits(wi, level, input_size, stride); + if (WARN_ON(ret)) + return ret; + + base_lower_bound = 3 + input_size - ((3 - level) * stride + + wi->pgshift); + base_addr = wi->baddr & GENMASK_ULL(47, base_lower_bound); + + if (check_output_size(wi, base_addr)) { + out->esr = compute_fsc(level, ESR_ELx_FSC_ADDRSZ); + return 1; + } + + addr_top = input_size - 1; + + while (1) { + phys_addr_t index; + + addr_bottom = (3 - level) * stride + wi->pgshift; + index = (ipa & GENMASK_ULL(addr_top, addr_bottom)) + >> (addr_bottom - 3); + + paddr = base_addr | index; + ret = wi->read_desc(paddr, &desc, wi->data); + if (ret < 0) + return ret; + + /* + * Handle reversedescriptors if endianness differs between the + * host and the guest hypervisor. + */ + if (wi->be) + desc = be64_to_cpu((__force __be64)desc); + else + desc = le64_to_cpu((__force __le64)desc); + + /* Check for valid descriptor at this point */ + if (!(desc & 1) || ((desc & 3) == 1 && level == 3)) { + out->esr = compute_fsc(level, ESR_ELx_FSC_FAULT); + out->desc = desc; + return 1; + } + + /* We're at the final level or block translation level */ + if ((desc & 3) == 1 || level == 3) + break; + + if (check_output_size(wi, desc)) { + out->esr = compute_fsc(level, ESR_ELx_FSC_ADDRSZ); + out->desc = desc; + return 1; + } + + base_addr = desc & GENMASK_ULL(47, wi->pgshift); + + level += 1; + addr_top = addr_bottom - 1; + } + + if (level < first_block_level) { + out->esr = compute_fsc(level, ESR_ELx_FSC_FAULT); + out->desc = desc; + return 1; + } + + if (check_output_size(wi, desc)) { + out->esr = compute_fsc(level, ESR_ELx_FSC_ADDRSZ); + out->desc = desc; + return 1; + } + + if (!(desc & BIT(10))) { + out->esr = compute_fsc(level, ESR_ELx_FSC_ACCESS); + out->desc = desc; + return 1; + } + + addr_bottom += contiguous_bit_shift(desc, wi, level); + + /* Calculate and return the result */ + paddr = (desc & GENMASK_ULL(47, addr_bottom)) | + (ipa & GENMASK_ULL(addr_bottom - 1, 0)); + out->output = paddr; + out->block_size = 1UL << ((3 - level) * stride + wi->pgshift); + out->readable = desc & (0b01 << 6); + out->writable = desc & (0b10 << 6); + out->level = level; + out->desc = desc; + return 0; +} + +static int read_guest_s2_desc(phys_addr_t pa, u64 *desc, void *data) +{ + struct kvm_vcpu *vcpu = data; + + return kvm_read_guest(vcpu->kvm, pa, desc, sizeof(*desc)); +} + +static void vtcr_to_walk_info(u64 vtcr, struct s2_walk_info *wi) +{ + wi->t0sz = vtcr & TCR_EL2_T0SZ_MASK; + + switch (vtcr & VTCR_EL2_TG0_MASK) { + case VTCR_EL2_TG0_4K: + wi->pgshift = 12; break; + case VTCR_EL2_TG0_16K: + wi->pgshift = 14; break; + case VTCR_EL2_TG0_64K: + default: /* IMPDEF: treat any other value as 64k */ + wi->pgshift = 16; break; + } + + wi->sl = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr); + /* Global limit for now, should eventually be per-VM */ + wi->max_oa_bits = min(get_kvm_ipa_limit(), + ps_to_output_size(FIELD_GET(VTCR_EL2_PS_MASK, vtcr))); +} + +int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa, + struct kvm_s2_trans *result) +{ + u64 vtcr = vcpu_read_sys_reg(vcpu, VTCR_EL2); + struct s2_walk_info wi; + int ret; + + result->esr = 0; + + if (!vcpu_has_nv(vcpu)) + return 0; + + wi.read_desc = read_guest_s2_desc; + wi.data = vcpu; + wi.baddr = vcpu_read_sys_reg(vcpu, VTTBR_EL2); + + vtcr_to_walk_info(vtcr, &wi); + + wi.be = vcpu_read_sys_reg(vcpu, SCTLR_EL2) & SCTLR_ELx_EE; + + ret = walk_nested_s2_pgd(gipa, &wi, result); + if (ret) + result->esr |= (kvm_vcpu_get_esr(vcpu) & ~ESR_ELx_FSC); + + return ret; +} + +static unsigned int ttl_to_size(u8 ttl) +{ + int level = ttl & 3; + int gran = (ttl >> 2) & 3; + unsigned int max_size = 0; + + switch (gran) { + case TLBI_TTL_TG_4K: + switch (level) { + case 0: + break; + case 1: + max_size = SZ_1G; + break; + case 2: + max_size = SZ_2M; + break; + case 3: + max_size = SZ_4K; + break; + } + break; + case TLBI_TTL_TG_16K: + switch (level) { + case 0: + case 1: + break; + case 2: + max_size = SZ_32M; + break; + case 3: + max_size = SZ_16K; + break; + } + break; + case TLBI_TTL_TG_64K: + switch (level) { + case 0: + case 1: + /* No 52bit IPA support */ + break; + case 2: + max_size = SZ_512M; + break; + case 3: + max_size = SZ_64K; + break; + } + break; + default: /* No size information */ + break; + } + + return max_size; +} + +/* + * Compute the equivalent of the TTL field by parsing the shadow PT. The + * granule size is extracted from the cached VTCR_EL2.TG0 while the level is + * retrieved from first entry carrying the level as a tag. + */ +static u8 get_guest_mapping_ttl(struct kvm_s2_mmu *mmu, u64 addr) +{ + u64 tmp, sz = 0, vtcr = mmu->tlb_vtcr; + kvm_pte_t pte; + u8 ttl, level; + + lockdep_assert_held_write(&kvm_s2_mmu_to_kvm(mmu)->mmu_lock); + + switch (vtcr & VTCR_EL2_TG0_MASK) { + case VTCR_EL2_TG0_4K: + ttl = (TLBI_TTL_TG_4K << 2); + break; + case VTCR_EL2_TG0_16K: + ttl = (TLBI_TTL_TG_16K << 2); + break; + case VTCR_EL2_TG0_64K: + default: /* IMPDEF: treat any other value as 64k */ + ttl = (TLBI_TTL_TG_64K << 2); + break; + } + + tmp = addr; + +again: + /* Iteratively compute the block sizes for a particular granule size */ + switch (vtcr & VTCR_EL2_TG0_MASK) { + case VTCR_EL2_TG0_4K: + if (sz < SZ_4K) sz = SZ_4K; + else if (sz < SZ_2M) sz = SZ_2M; + else if (sz < SZ_1G) sz = SZ_1G; + else sz = 0; + break; + case VTCR_EL2_TG0_16K: + if (sz < SZ_16K) sz = SZ_16K; + else if (sz < SZ_32M) sz = SZ_32M; + else sz = 0; + break; + case VTCR_EL2_TG0_64K: + default: /* IMPDEF: treat any other value as 64k */ + if (sz < SZ_64K) sz = SZ_64K; + else if (sz < SZ_512M) sz = SZ_512M; + else sz = 0; + break; + } + + if (sz == 0) + return 0; + + tmp &= ~(sz - 1); + if (kvm_pgtable_get_leaf(mmu->pgt, tmp, &pte, NULL)) + goto again; + if (!(pte & PTE_VALID)) + goto again; + level = FIELD_GET(KVM_NV_GUEST_MAP_SZ, pte); + if (!level) + goto again; + + ttl |= level; + + /* + * We now have found some level information in the shadow S2. Check + * that the resulting range is actually including the original IPA. + */ + sz = ttl_to_size(ttl); + if (addr < (tmp + sz)) + return ttl; + + return 0; +} + +unsigned long compute_tlb_inval_range(struct kvm_s2_mmu *mmu, u64 val) +{ + struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu); + unsigned long max_size; + u8 ttl; + + ttl = FIELD_GET(TLBI_TTL_MASK, val); + + if (!ttl || !kvm_has_feat(kvm, ID_AA64MMFR2_EL1, TTL, IMP)) { + /* No TTL, check the shadow S2 for a hint */ + u64 addr = (val & GENMASK_ULL(35, 0)) << 12; + ttl = get_guest_mapping_ttl(mmu, addr); + } + + max_size = ttl_to_size(ttl); + + if (!max_size) { + /* Compute the maximum extent of the invalidation */ + switch (mmu->tlb_vtcr & VTCR_EL2_TG0_MASK) { + case VTCR_EL2_TG0_4K: + max_size = SZ_1G; + break; + case VTCR_EL2_TG0_16K: + max_size = SZ_32M; + break; + case VTCR_EL2_TG0_64K: + default: /* IMPDEF: treat any other value as 64k */ + /* + * No, we do not support 52bit IPA in nested yet. Once + * we do, this should be 4TB. + */ + max_size = SZ_512M; + break; + } + } + + WARN_ON(!max_size); + return max_size; +} + +/* + * We can have multiple *different* MMU contexts with the same VMID: + * + * - S2 being enabled or not, hence differing by the HCR_EL2.VM bit + * + * - Multiple vcpus using private S2s (huh huh...), hence differing by the + * VBBTR_EL2.BADDR address + * + * - A combination of the above... + * + * We can always identify which MMU context to pick at run-time. However, + * TLB invalidation involving a VMID must take action on all the TLBs using + * this particular VMID. This translates into applying the same invalidation + * operation to all the contexts that are using this VMID. Moar phun! + */ +void kvm_s2_mmu_iterate_by_vmid(struct kvm *kvm, u16 vmid, + const union tlbi_info *info, + void (*tlbi_callback)(struct kvm_s2_mmu *, + const union tlbi_info *)) +{ + write_lock(&kvm->mmu_lock); + + for (int i = 0; i < kvm->arch.nested_mmus_size; i++) { + struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i]; + + if (!kvm_s2_mmu_valid(mmu)) + continue; + + if (vmid == get_vmid(mmu->tlb_vttbr)) + tlbi_callback(mmu, info); + } + + write_unlock(&kvm->mmu_lock); +} + +struct kvm_s2_mmu *lookup_s2_mmu(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + bool nested_stage2_enabled; + u64 vttbr, vtcr, hcr; + + lockdep_assert_held_write(&kvm->mmu_lock); + + vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2); + vtcr = vcpu_read_sys_reg(vcpu, VTCR_EL2); + hcr = vcpu_read_sys_reg(vcpu, HCR_EL2); + + nested_stage2_enabled = hcr & HCR_VM; + + /* Don't consider the CnP bit for the vttbr match */ + vttbr &= ~VTTBR_CNP_BIT; + + /* + * Two possibilities when looking up a S2 MMU context: + * + * - either S2 is enabled in the guest, and we need a context that is + * S2-enabled and matches the full VTTBR (VMID+BADDR) and VTCR, + * which makes it safe from a TLB conflict perspective (a broken + * guest won't be able to generate them), + * + * - or S2 is disabled, and we need a context that is S2-disabled + * and matches the VMID only, as all TLBs are tagged by VMID even + * if S2 translation is disabled. + */ + for (int i = 0; i < kvm->arch.nested_mmus_size; i++) { + struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i]; + + if (!kvm_s2_mmu_valid(mmu)) + continue; + + if (nested_stage2_enabled && + mmu->nested_stage2_enabled && + vttbr == mmu->tlb_vttbr && + vtcr == mmu->tlb_vtcr) + return mmu; + + if (!nested_stage2_enabled && + !mmu->nested_stage2_enabled && + get_vmid(vttbr) == get_vmid(mmu->tlb_vttbr)) + return mmu; + } + return NULL; +} + +static struct kvm_s2_mmu *get_s2_mmu_nested(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_s2_mmu *s2_mmu; + int i; + + lockdep_assert_held_write(&vcpu->kvm->mmu_lock); + + s2_mmu = lookup_s2_mmu(vcpu); + if (s2_mmu) + goto out; + + /* + * Make sure we don't always search from the same point, or we + * will always reuse a potentially active context, leaving + * free contexts unused. + */ + for (i = kvm->arch.nested_mmus_next; + i < (kvm->arch.nested_mmus_size + kvm->arch.nested_mmus_next); + i++) { + s2_mmu = &kvm->arch.nested_mmus[i % kvm->arch.nested_mmus_size]; + + if (atomic_read(&s2_mmu->refcnt) == 0) + break; + } + BUG_ON(atomic_read(&s2_mmu->refcnt)); /* We have struct MMUs to spare */ + + /* Set the scene for the next search */ + kvm->arch.nested_mmus_next = (i + 1) % kvm->arch.nested_mmus_size; + + /* Make sure we don't forget to do the laundry */ + if (kvm_s2_mmu_valid(s2_mmu)) + s2_mmu->pending_unmap = true; + + /* + * The virtual VMID (modulo CnP) will be used as a key when matching + * an existing kvm_s2_mmu. + * + * We cache VTCR at allocation time, once and for all. It'd be great + * if the guest didn't screw that one up, as this is not very + * forgiving... + */ + s2_mmu->tlb_vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2) & ~VTTBR_CNP_BIT; + s2_mmu->tlb_vtcr = vcpu_read_sys_reg(vcpu, VTCR_EL2); + s2_mmu->nested_stage2_enabled = vcpu_read_sys_reg(vcpu, HCR_EL2) & HCR_VM; + +out: + atomic_inc(&s2_mmu->refcnt); + + /* + * Set the vCPU request to perform an unmap, even if the pending unmap + * originates from another vCPU. This guarantees that the MMU has been + * completely unmapped before any vCPU actually uses it, and allows + * multiple vCPUs to lend a hand with completing the unmap. + */ + if (s2_mmu->pending_unmap) + kvm_make_request(KVM_REQ_NESTED_S2_UNMAP, vcpu); + + return s2_mmu; +} + +void kvm_init_nested_s2_mmu(struct kvm_s2_mmu *mmu) +{ + /* CnP being set denotes an invalid entry */ + mmu->tlb_vttbr = VTTBR_CNP_BIT; + mmu->nested_stage2_enabled = false; + atomic_set(&mmu->refcnt, 0); +} + +void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu) +{ + /* + * The vCPU kept its reference on the MMU after the last put, keep + * rolling with it. + */ + if (vcpu->arch.hw_mmu) + return; + + if (is_hyp_ctxt(vcpu)) { + vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu; + } else { + write_lock(&vcpu->kvm->mmu_lock); + vcpu->arch.hw_mmu = get_s2_mmu_nested(vcpu); + write_unlock(&vcpu->kvm->mmu_lock); + } +} + +void kvm_vcpu_put_hw_mmu(struct kvm_vcpu *vcpu) +{ + /* + * Keep a reference on the associated stage-2 MMU if the vCPU is + * scheduling out and not in WFI emulation, suggesting it is likely to + * reuse the MMU sometime soon. + */ + if (vcpu->scheduled_out && !vcpu_get_flag(vcpu, IN_WFI)) + return; + + if (kvm_is_nested_s2_mmu(vcpu->kvm, vcpu->arch.hw_mmu)) + atomic_dec(&vcpu->arch.hw_mmu->refcnt); + + vcpu->arch.hw_mmu = NULL; +} + +/* + * Returns non-zero if permission fault is handled by injecting it to the next + * level hypervisor. + */ +int kvm_s2_handle_perm_fault(struct kvm_vcpu *vcpu, struct kvm_s2_trans *trans) +{ + bool forward_fault = false; + + trans->esr = 0; + + if (!kvm_vcpu_trap_is_permission_fault(vcpu)) + return 0; + + if (kvm_vcpu_trap_is_iabt(vcpu)) { + forward_fault = !kvm_s2_trans_executable(trans); + } else { + bool write_fault = kvm_is_write_fault(vcpu); + + forward_fault = ((write_fault && !trans->writable) || + (!write_fault && !trans->readable)); + } + + if (forward_fault) + trans->esr = esr_s2_fault(vcpu, trans->level, ESR_ELx_FSC_PERM); + + return forward_fault; +} + +int kvm_inject_s2_fault(struct kvm_vcpu *vcpu, u64 esr_el2) +{ + vcpu_write_sys_reg(vcpu, vcpu->arch.fault.far_el2, FAR_EL2); + vcpu_write_sys_reg(vcpu, vcpu->arch.fault.hpfar_el2, HPFAR_EL2); + + return kvm_inject_nested_sync(vcpu, esr_el2); +} + +void kvm_nested_s2_wp(struct kvm *kvm) +{ + int i; + + lockdep_assert_held_write(&kvm->mmu_lock); + + for (i = 0; i < kvm->arch.nested_mmus_size; i++) { + struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i]; + + if (kvm_s2_mmu_valid(mmu)) + kvm_stage2_wp_range(mmu, 0, kvm_phys_size(mmu)); + } +} + +void kvm_nested_s2_unmap(struct kvm *kvm, bool may_block) +{ + int i; + + lockdep_assert_held_write(&kvm->mmu_lock); + + for (i = 0; i < kvm->arch.nested_mmus_size; i++) { + struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i]; + + if (kvm_s2_mmu_valid(mmu)) + kvm_stage2_unmap_range(mmu, 0, kvm_phys_size(mmu), may_block); + } +} + +void kvm_nested_s2_flush(struct kvm *kvm) +{ + int i; + + lockdep_assert_held_write(&kvm->mmu_lock); + + for (i = 0; i < kvm->arch.nested_mmus_size; i++) { + struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i]; + + if (kvm_s2_mmu_valid(mmu)) + kvm_stage2_flush_range(mmu, 0, kvm_phys_size(mmu)); + } +} + +void kvm_arch_flush_shadow_all(struct kvm *kvm) +{ + int i; + + for (i = 0; i < kvm->arch.nested_mmus_size; i++) { + struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i]; + + if (!WARN_ON(atomic_read(&mmu->refcnt))) + kvm_free_stage2_pgd(mmu); + } + kvfree(kvm->arch.nested_mmus); + kvm->arch.nested_mmus = NULL; + kvm->arch.nested_mmus_size = 0; + kvm_uninit_stage2_mmu(kvm); +} /* * Our emulated CPU doesn't support all the possible features. For the @@ -23,156 +808,162 @@ * This list should get updated as new features get added to the NV * support, and new extension to the architecture. */ -static u64 limit_nv_id_reg(u32 id, u64 val) +u64 limit_nv_id_reg(struct kvm *kvm, u32 reg, u64 val) { - u64 tmp; - - switch (id) { + switch (reg) { case SYS_ID_AA64ISAR0_EL1: - /* Support everything but TME, O.S. and Range TLBIs */ - val &= ~(NV_FTR(ISAR0, TLB) | - NV_FTR(ISAR0, TME)); + /* Support everything but TME */ + val &= ~ID_AA64ISAR0_EL1_TME; break; case SYS_ID_AA64ISAR1_EL1: - /* Support everything but PtrAuth and Spec Invalidation */ - val &= ~(GENMASK_ULL(63, 56) | - NV_FTR(ISAR1, SPECRES) | - NV_FTR(ISAR1, GPI) | - NV_FTR(ISAR1, GPA) | - NV_FTR(ISAR1, API) | - NV_FTR(ISAR1, APA)); + /* Support everything but LS64 and Spec Invalidation */ + val &= ~(ID_AA64ISAR1_EL1_LS64 | + ID_AA64ISAR1_EL1_SPECRES); break; case SYS_ID_AA64PFR0_EL1: - /* No AMU, MPAM, S-EL2, RAS or SVE */ - val &= ~(GENMASK_ULL(55, 52) | - NV_FTR(PFR0, AMU) | - NV_FTR(PFR0, MPAM) | - NV_FTR(PFR0, SEL2) | - NV_FTR(PFR0, RAS) | - NV_FTR(PFR0, SVE) | - NV_FTR(PFR0, EL3) | - NV_FTR(PFR0, EL2) | - NV_FTR(PFR0, EL1)); - /* 64bit EL1/EL2/EL3 only */ - val |= FIELD_PREP(NV_FTR(PFR0, EL1), 0b0001); - val |= FIELD_PREP(NV_FTR(PFR0, EL2), 0b0001); - val |= FIELD_PREP(NV_FTR(PFR0, EL3), 0b0001); + /* No RME, AMU, MPAM, S-EL2, or RAS */ + val &= ~(ID_AA64PFR0_EL1_RME | + ID_AA64PFR0_EL1_AMU | + ID_AA64PFR0_EL1_MPAM | + ID_AA64PFR0_EL1_SEL2 | + ID_AA64PFR0_EL1_RAS | + ID_AA64PFR0_EL1_EL3 | + ID_AA64PFR0_EL1_EL2 | + ID_AA64PFR0_EL1_EL1 | + ID_AA64PFR0_EL1_EL0); + /* 64bit only at any EL */ + val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL0, IMP); + val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL1, IMP); + val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL2, IMP); + val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL3, IMP); break; case SYS_ID_AA64PFR1_EL1: - /* Only support SSBS */ - val &= NV_FTR(PFR1, SSBS); + /* Only support BTI, SSBS, CSV2_frac */ + val &= (ID_AA64PFR1_EL1_BT | + ID_AA64PFR1_EL1_SSBS | + ID_AA64PFR1_EL1_CSV2_frac); break; case SYS_ID_AA64MMFR0_EL1: - /* Hide ECV, ExS, Secure Memory */ - val &= ~(NV_FTR(MMFR0, ECV) | - NV_FTR(MMFR0, EXS) | - NV_FTR(MMFR0, TGRAN4_2) | - NV_FTR(MMFR0, TGRAN16_2) | - NV_FTR(MMFR0, TGRAN64_2) | - NV_FTR(MMFR0, SNSMEM)); + /* Hide ExS, Secure Memory */ + val &= ~(ID_AA64MMFR0_EL1_EXS | + ID_AA64MMFR0_EL1_TGRAN4_2 | + ID_AA64MMFR0_EL1_TGRAN16_2 | + ID_AA64MMFR0_EL1_TGRAN64_2 | + ID_AA64MMFR0_EL1_SNSMEM); + + /* Hide CNTPOFF if present */ + val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64MMFR0_EL1, ECV, IMP); /* Disallow unsupported S2 page sizes */ switch (PAGE_SIZE) { case SZ_64K: - val |= FIELD_PREP(NV_FTR(MMFR0, TGRAN16_2), 0b0001); + val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN16_2, NI); fallthrough; case SZ_16K: - val |= FIELD_PREP(NV_FTR(MMFR0, TGRAN4_2), 0b0001); + val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN4_2, NI); fallthrough; case SZ_4K: /* Support everything */ break; } + /* - * Since we can't support a guest S2 page size smaller than - * the host's own page size (due to KVM only populating its - * own S2 using the kernel's page size), advertise the - * limitation using FEAT_GTG. + * Since we can't support a guest S2 page size smaller + * than the host's own page size (due to KVM only + * populating its own S2 using the kernel's page + * size), advertise the limitation using FEAT_GTG. */ switch (PAGE_SIZE) { case SZ_4K: - val |= FIELD_PREP(NV_FTR(MMFR0, TGRAN4_2), 0b0010); + val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN4_2, IMP); fallthrough; case SZ_16K: - val |= FIELD_PREP(NV_FTR(MMFR0, TGRAN16_2), 0b0010); + val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN16_2, IMP); fallthrough; case SZ_64K: - val |= FIELD_PREP(NV_FTR(MMFR0, TGRAN64_2), 0b0010); + val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN64_2, IMP); break; } + /* Cap PARange to 48bits */ - tmp = FIELD_GET(NV_FTR(MMFR0, PARANGE), val); - if (tmp > 0b0101) { - val &= ~NV_FTR(MMFR0, PARANGE); - val |= FIELD_PREP(NV_FTR(MMFR0, PARANGE), 0b0101); - } + val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64MMFR0_EL1, PARANGE, 48); break; case SYS_ID_AA64MMFR1_EL1: - val &= (NV_FTR(MMFR1, HCX) | - NV_FTR(MMFR1, PAN) | - NV_FTR(MMFR1, LO) | - NV_FTR(MMFR1, HPDS) | - NV_FTR(MMFR1, VH) | - NV_FTR(MMFR1, VMIDBits)); + val &= (ID_AA64MMFR1_EL1_HCX | + ID_AA64MMFR1_EL1_PAN | + ID_AA64MMFR1_EL1_LO | + ID_AA64MMFR1_EL1_HPDS | + ID_AA64MMFR1_EL1_VH | + ID_AA64MMFR1_EL1_VMIDBits); + /* FEAT_E2H0 implies no VHE */ + if (test_bit(KVM_ARM_VCPU_HAS_EL2_E2H0, kvm->arch.vcpu_features)) + val &= ~ID_AA64MMFR1_EL1_VH; break; case SYS_ID_AA64MMFR2_EL1: - val &= ~(NV_FTR(MMFR2, BBM) | - NV_FTR(MMFR2, TTL) | + val &= ~(ID_AA64MMFR2_EL1_BBM | + ID_AA64MMFR2_EL1_TTL | GENMASK_ULL(47, 44) | - NV_FTR(MMFR2, ST) | - NV_FTR(MMFR2, CCIDX) | - NV_FTR(MMFR2, VARange)); + ID_AA64MMFR2_EL1_ST | + ID_AA64MMFR2_EL1_CCIDX | + ID_AA64MMFR2_EL1_VARange); /* Force TTL support */ - val |= FIELD_PREP(NV_FTR(MMFR2, TTL), 0b0001); + val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR2_EL1, TTL, IMP); break; case SYS_ID_AA64MMFR4_EL1: - val = 0; - if (!cpus_have_final_cap(ARM64_HAS_HCR_NV1)) - val |= FIELD_PREP(NV_FTR(MMFR4, E2H0), - ID_AA64MMFR4_EL1_E2H0_NI_NV1); + /* + * You get EITHER + * + * - FEAT_VHE without FEAT_E2H0 + * - FEAT_NV limited to FEAT_NV2 + * - HCR_EL2.NV1 being RES0 + * + * OR + * + * - FEAT_E2H0 without FEAT_VHE nor FEAT_NV + * + * Life is too short for anything else. + */ + if (test_bit(KVM_ARM_VCPU_HAS_EL2_E2H0, kvm->arch.vcpu_features)) { + val = 0; + } else { + val = SYS_FIELD_PREP_ENUM(ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY); + val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR4_EL1, E2H0, NI_NV1); + } break; case SYS_ID_AA64DFR0_EL1: - /* Only limited support for PMU, Debug, BPs and WPs */ - val &= (NV_FTR(DFR0, PMUVer) | - NV_FTR(DFR0, WRPs) | - NV_FTR(DFR0, BRPs) | - NV_FTR(DFR0, DebugVer)); + /* Only limited support for PMU, Debug, BPs, WPs, and HPMN0 */ + val &= (ID_AA64DFR0_EL1_PMUVer | + ID_AA64DFR0_EL1_WRPs | + ID_AA64DFR0_EL1_BRPs | + ID_AA64DFR0_EL1_DebugVer| + ID_AA64DFR0_EL1_HPMN0); /* Cap Debug to ARMv8.1 */ - tmp = FIELD_GET(NV_FTR(DFR0, DebugVer), val); - if (tmp > 0b0111) { - val &= ~NV_FTR(DFR0, DebugVer); - val |= FIELD_PREP(NV_FTR(DFR0, DebugVer), 0b0111); - } - break; - - default: - /* Unknown register, just wipe it clean */ - val = 0; + val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64DFR0_EL1, DebugVer, VHE); break; } return val; } -u64 kvm_vcpu_sanitise_vncr_reg(const struct kvm_vcpu *vcpu, enum vcpu_sysreg sr) +u64 kvm_vcpu_apply_reg_masks(const struct kvm_vcpu *vcpu, + enum vcpu_sysreg sr, u64 v) { - u64 v = ctxt_sys_reg(&vcpu->arch.ctxt, sr); struct kvm_sysreg_masks *masks; masks = vcpu->kvm->arch.sysreg_masks; if (masks) { - sr -= __VNCR_START__; + sr -= __SANITISED_REG_START__; v &= ~masks->mask[sr].res0; v |= masks->mask[sr].res1; @@ -181,34 +972,32 @@ u64 kvm_vcpu_sanitise_vncr_reg(const struct kvm_vcpu *vcpu, enum vcpu_sysreg sr) return v; } -static void set_sysreg_masks(struct kvm *kvm, int sr, u64 res0, u64 res1) +static __always_inline void set_sysreg_masks(struct kvm *kvm, int sr, u64 res0, u64 res1) { - int i = sr - __VNCR_START__; + int i = sr - __SANITISED_REG_START__; + + BUILD_BUG_ON(!__builtin_constant_p(sr)); + BUILD_BUG_ON(sr < __SANITISED_REG_START__); + BUILD_BUG_ON(sr >= NR_SYS_REGS); kvm->arch.sysreg_masks->mask[i].res0 = res0; kvm->arch.sysreg_masks->mask[i].res1 = res1; } -int kvm_init_nv_sysregs(struct kvm *kvm) +int kvm_init_nv_sysregs(struct kvm_vcpu *vcpu) { + struct kvm *kvm = vcpu->kvm; u64 res0, res1; - int ret = 0; - mutex_lock(&kvm->arch.config_lock); + lockdep_assert_held(&kvm->arch.config_lock); if (kvm->arch.sysreg_masks) goto out; kvm->arch.sysreg_masks = kzalloc(sizeof(*(kvm->arch.sysreg_masks)), - GFP_KERNEL); - if (!kvm->arch.sysreg_masks) { - ret = -ENOMEM; - goto out; - } - - for (int i = 0; i < KVM_ARM_ID_REG_NUM; i++) - kvm->arch.id_regs[i] = limit_nv_id_reg(IDX_IDREG(i), - kvm->arch.id_regs[i]); + GFP_KERNEL_ACCOUNT); + if (!kvm->arch.sysreg_masks) + return -ENOMEM; /* VTTBR_EL2 */ res0 = res1 = 0; @@ -248,12 +1037,13 @@ int kvm_init_nv_sysregs(struct kvm *kvm) res0 |= HCR_FIEN; if (!kvm_has_feat(kvm, ID_AA64MMFR2_EL1, FWB, IMP)) res0 |= HCR_FWB; - if (!kvm_has_feat(kvm, ID_AA64MMFR2_EL1, NV, NV2)) - res0 |= HCR_NV2; - if (!kvm_has_feat(kvm, ID_AA64MMFR2_EL1, NV, IMP)) - res0 |= (HCR_AT | HCR_NV1 | HCR_NV); - if (!(__vcpu_has_feature(&kvm->arch, KVM_ARM_VCPU_PTRAUTH_ADDRESS) && - __vcpu_has_feature(&kvm->arch, KVM_ARM_VCPU_PTRAUTH_GENERIC))) + /* Implementation choice: NV2 is the only supported config */ + if (!kvm_has_feat(kvm, ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY)) + res0 |= (HCR_NV2 | HCR_NV | HCR_AT); + if (!kvm_has_feat(kvm, ID_AA64MMFR4_EL1, E2H0, NI)) + res0 |= HCR_NV1; + if (!(kvm_vcpu_has_feature(kvm, KVM_ARM_VCPU_PTRAUTH_ADDRESS) && + kvm_vcpu_has_feature(kvm, KVM_ARM_VCPU_PTRAUTH_GENERIC))) res0 |= (HCR_API | HCR_APK); if (!kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TME, IMP)) res0 |= BIT(39); @@ -261,6 +1051,8 @@ int kvm_init_nv_sysregs(struct kvm *kvm) res0 |= (HCR_TEA | HCR_TERR); if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, LO, IMP)) res0 |= HCR_TLOR; + if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, VH, IMP)) + res0 |= HCR_E2H; if (!kvm_has_feat(kvm, ID_AA64MMFR4_EL1, E2H0, IMP)) res1 |= HCR_E2H; set_sysreg_masks(kvm, HCR_EL2, res0, res1); @@ -286,7 +1078,7 @@ int kvm_init_nv_sysregs(struct kvm *kvm) res0 |= HCRX_EL2_PTTWI; if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, SCTLRX, IMP)) res0 |= HCRX_EL2_SCTLR2En; - if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, TCRX, IMP)) + if (!kvm_has_tcr2(kvm)) res0 |= HCRX_EL2_TCR2En; if (!kvm_has_feat(kvm, ID_AA64ISAR2_EL1, MOPS, IMP)) res0 |= (HCRX_EL2_MSCEn | HCRX_EL2_MCE2); @@ -309,8 +1101,8 @@ int kvm_init_nv_sysregs(struct kvm *kvm) /* HFG[RW]TR_EL2 */ res0 = res1 = 0; - if (!(__vcpu_has_feature(&kvm->arch, KVM_ARM_VCPU_PTRAUTH_ADDRESS) && - __vcpu_has_feature(&kvm->arch, KVM_ARM_VCPU_PTRAUTH_GENERIC))) + if (!(kvm_vcpu_has_feature(kvm, KVM_ARM_VCPU_PTRAUTH_ADDRESS) && + kvm_vcpu_has_feature(kvm, KVM_ARM_VCPU_PTRAUTH_GENERIC))) res0 |= (HFGxTR_EL2_APDAKey | HFGxTR_EL2_APDBKey | HFGxTR_EL2_APGAKey | HFGxTR_EL2_APIAKey | HFGxTR_EL2_APIBKey); @@ -337,9 +1129,9 @@ int kvm_init_nv_sysregs(struct kvm *kvm) res0 |= (HFGxTR_EL2_nSMPRI_EL1 | HFGxTR_EL2_nTPIDR2_EL0); if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, THE, IMP)) res0 |= HFGxTR_EL2_nRCWMASK_EL1; - if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, S1PIE, IMP)) + if (!kvm_has_s1pie(kvm)) res0 |= (HFGxTR_EL2_nPIRE0_EL1 | HFGxTR_EL2_nPIR_EL1); - if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, S1POE, IMP)) + if (!kvm_has_s1poe(kvm)) res0 |= (HFGxTR_EL2_nPOR_EL0 | HFGxTR_EL2_nPOR_EL1); if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, S2POE, IMP)) res0 |= HFGxTR_EL2_nS2POR_EL1; @@ -435,8 +1227,118 @@ int kvm_init_nv_sysregs(struct kvm *kvm) if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, AMU, V1P1)) res0 |= ~(res0 | res1); set_sysreg_masks(kvm, HAFGRTR_EL2, res0, res1); + + /* TCR2_EL2 */ + res0 = TCR2_EL2_RES0; + res1 = TCR2_EL2_RES1; + if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, D128, IMP)) + res0 |= (TCR2_EL2_DisCH0 | TCR2_EL2_DisCH1 | TCR2_EL2_D128); + if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, MEC, IMP)) + res0 |= TCR2_EL2_AMEC1 | TCR2_EL2_AMEC0; + if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, HAFDBS, HAFT)) + res0 |= TCR2_EL2_HAFT; + if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, THE, IMP)) + res0 |= TCR2_EL2_PTTWI | TCR2_EL2_PnCH; + if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, AIE, IMP)) + res0 |= TCR2_EL2_AIE; + if (!kvm_has_s1poe(kvm)) + res0 |= TCR2_EL2_POE | TCR2_EL2_E0POE; + if (!kvm_has_s1pie(kvm)) + res0 |= TCR2_EL2_PIE; + if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, VH, IMP)) + res0 |= (TCR2_EL2_E0POE | TCR2_EL2_D128 | + TCR2_EL2_AMEC1 | TCR2_EL2_DisCH0 | TCR2_EL2_DisCH1); + set_sysreg_masks(kvm, TCR2_EL2, res0, res1); + + /* SCTLR_EL1 */ + res0 = SCTLR_EL1_RES0; + res1 = SCTLR_EL1_RES1; + if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, PAN, PAN3)) + res0 |= SCTLR_EL1_EPAN; + set_sysreg_masks(kvm, SCTLR_EL1, res0, res1); + + /* MDCR_EL2 */ + res0 = MDCR_EL2_RES0; + res1 = MDCR_EL2_RES1; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMUVer, IMP)) + res0 |= (MDCR_EL2_HPMN | MDCR_EL2_TPMCR | + MDCR_EL2_TPM | MDCR_EL2_HPME); + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMSVer, IMP)) + res0 |= MDCR_EL2_E2PB | MDCR_EL2_TPMS; + if (!kvm_has_feat(kvm, ID_AA64DFR1_EL1, SPMU, IMP)) + res0 |= MDCR_EL2_EnSPM; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMUVer, V3P1)) + res0 |= MDCR_EL2_HPMD; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceFilt, IMP)) + res0 |= MDCR_EL2_TTRF; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMUVer, V3P5)) + res0 |= MDCR_EL2_HCCD | MDCR_EL2_HLP; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceBuffer, IMP)) + res0 |= MDCR_EL2_E2TB; + if (!kvm_has_feat(kvm, ID_AA64MMFR0_EL1, FGT, IMP)) + res0 |= MDCR_EL2_TDCC; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, MTPMU, IMP) || + kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL3, IMP)) + res0 |= MDCR_EL2_MTPME; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMUVer, V3P7)) + res0 |= MDCR_EL2_HPMFZO; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMSS, IMP)) + res0 |= MDCR_EL2_PMSSE; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMSVer, V1P2)) + res0 |= MDCR_EL2_HPMFZS; + if (!kvm_has_feat(kvm, ID_AA64DFR1_EL1, EBEP, IMP)) + res0 |= MDCR_EL2_PMEE; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, DebugVer, V8P9)) + res0 |= MDCR_EL2_EBWE; + if (!kvm_has_feat(kvm, ID_AA64DFR2_EL1, STEP, IMP)) + res0 |= MDCR_EL2_EnSTEPOP; + set_sysreg_masks(kvm, MDCR_EL2, res0, res1); + + /* CNTHCTL_EL2 */ + res0 = GENMASK(63, 20); + res1 = 0; + if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, RME, IMP)) + res0 |= CNTHCTL_CNTPMASK | CNTHCTL_CNTVMASK; + if (!kvm_has_feat(kvm, ID_AA64MMFR0_EL1, ECV, CNTPOFF)) { + res0 |= CNTHCTL_ECV; + if (!kvm_has_feat(kvm, ID_AA64MMFR0_EL1, ECV, IMP)) + res0 |= (CNTHCTL_EL1TVT | CNTHCTL_EL1TVCT | + CNTHCTL_EL1NVPCT | CNTHCTL_EL1NVVCT); + } + if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, VH, IMP)) + res0 |= GENMASK(11, 8); + set_sysreg_masks(kvm, CNTHCTL_EL2, res0, res1); + + /* ICH_HCR_EL2 */ + res0 = ICH_HCR_EL2_RES0; + res1 = ICH_HCR_EL2_RES1; + if (!(kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_EL2_TDS)) + res0 |= ICH_HCR_EL2_TDIR; + /* No GICv4 is presented to the guest */ + res0 |= ICH_HCR_EL2_DVIM | ICH_HCR_EL2_vSGIEOICount; + set_sysreg_masks(kvm, ICH_HCR_EL2, res0, res1); + out: - mutex_unlock(&kvm->arch.config_lock); + for (enum vcpu_sysreg sr = __SANITISED_REG_START__; sr < NR_SYS_REGS; sr++) + (void)__vcpu_sys_reg(vcpu, sr); - return ret; + return 0; +} + +void check_nested_vcpu_requests(struct kvm_vcpu *vcpu) +{ + if (kvm_check_request(KVM_REQ_NESTED_S2_UNMAP, vcpu)) { + struct kvm_s2_mmu *mmu = vcpu->arch.hw_mmu; + + write_lock(&vcpu->kvm->mmu_lock); + if (mmu->pending_unmap) { + kvm_stage2_unmap_range(mmu, 0, kvm_phys_size(mmu), true); + mmu->pending_unmap = false; + } + write_unlock(&vcpu->kvm->mmu_lock); + } + + /* Must be last, as may switch context! */ + if (kvm_check_request(KVM_REQ_GUEST_HYP_IRQ_PENDING, vcpu)) + kvm_inject_nested_irq(vcpu); } diff --git a/arch/arm64/kvm/pauth.c b/arch/arm64/kvm/pauth.c new file mode 100644 index 000000000000..d5eb3ae876be --- /dev/null +++ b/arch/arm64/kvm/pauth.c @@ -0,0 +1,206 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2024 - Google LLC + * Author: Marc Zyngier <maz@kernel.org> + * + * Primitive PAuth emulation for ERETAA/ERETAB. + * + * This code assumes that is is run from EL2, and that it is part of + * the emulation of ERETAx for a guest hypervisor. That's a lot of + * baked-in assumptions and shortcuts. + * + * Do no reuse for anything else! + */ + +#include <linux/kvm_host.h> + +#include <asm/gpr-num.h> +#include <asm/kvm_emulate.h> +#include <asm/pointer_auth.h> + +/* PACGA Xd, Xn, Xm */ +#define PACGA(d,n,m) \ + asm volatile(__DEFINE_ASM_GPR_NUMS \ + ".inst 0x9AC03000 |" \ + "(.L__gpr_num_%[Rd] << 0) |" \ + "(.L__gpr_num_%[Rn] << 5) |" \ + "(.L__gpr_num_%[Rm] << 16)\n" \ + : [Rd] "=r" ((d)) \ + : [Rn] "r" ((n)), [Rm] "r" ((m))) + +static u64 compute_pac(struct kvm_vcpu *vcpu, u64 ptr, + struct ptrauth_key ikey) +{ + struct ptrauth_key gkey; + u64 mod, pac = 0; + + preempt_disable(); + + if (!vcpu_get_flag(vcpu, SYSREGS_ON_CPU)) + mod = __vcpu_sys_reg(vcpu, SP_EL2); + else + mod = read_sysreg(sp_el1); + + gkey.lo = read_sysreg_s(SYS_APGAKEYLO_EL1); + gkey.hi = read_sysreg_s(SYS_APGAKEYHI_EL1); + + __ptrauth_key_install_nosync(APGA, ikey); + isb(); + + PACGA(pac, ptr, mod); + isb(); + + __ptrauth_key_install_nosync(APGA, gkey); + + preempt_enable(); + + /* PAC in the top 32bits */ + return pac; +} + +static bool effective_tbi(struct kvm_vcpu *vcpu, bool bit55) +{ + u64 tcr = vcpu_read_sys_reg(vcpu, TCR_EL2); + bool tbi, tbid; + + /* + * Since we are authenticating an instruction address, we have + * to take TBID into account. If E2H==0, ignore VA[55], as + * TCR_EL2 only has a single TBI/TBID. If VA[55] was set in + * this case, this is likely a guest bug... + */ + if (!vcpu_el2_e2h_is_set(vcpu)) { + tbi = tcr & BIT(20); + tbid = tcr & BIT(29); + } else if (bit55) { + tbi = tcr & TCR_TBI1; + tbid = tcr & TCR_TBID1; + } else { + tbi = tcr & TCR_TBI0; + tbid = tcr & TCR_TBID0; + } + + return tbi && !tbid; +} + +static int compute_bottom_pac(struct kvm_vcpu *vcpu, bool bit55) +{ + static const int maxtxsz = 39; // Revisit these two values once + static const int mintxsz = 16; // (if) we support TTST/LVA/LVA2 + u64 tcr = vcpu_read_sys_reg(vcpu, TCR_EL2); + int txsz; + + if (!vcpu_el2_e2h_is_set(vcpu) || !bit55) + txsz = FIELD_GET(TCR_T0SZ_MASK, tcr); + else + txsz = FIELD_GET(TCR_T1SZ_MASK, tcr); + + return 64 - clamp(txsz, mintxsz, maxtxsz); +} + +static u64 compute_pac_mask(struct kvm_vcpu *vcpu, bool bit55) +{ + int bottom_pac; + u64 mask; + + bottom_pac = compute_bottom_pac(vcpu, bit55); + + mask = GENMASK(54, bottom_pac); + if (!effective_tbi(vcpu, bit55)) + mask |= GENMASK(63, 56); + + return mask; +} + +static u64 to_canonical_addr(struct kvm_vcpu *vcpu, u64 ptr, u64 mask) +{ + bool bit55 = !!(ptr & BIT(55)); + + if (bit55) + return ptr | mask; + + return ptr & ~mask; +} + +static u64 corrupt_addr(struct kvm_vcpu *vcpu, u64 ptr) +{ + bool bit55 = !!(ptr & BIT(55)); + u64 mask, error_code; + int shift; + + if (effective_tbi(vcpu, bit55)) { + mask = GENMASK(54, 53); + shift = 53; + } else { + mask = GENMASK(62, 61); + shift = 61; + } + + if (esr_iss_is_eretab(kvm_vcpu_get_esr(vcpu))) + error_code = 2 << shift; + else + error_code = 1 << shift; + + ptr &= ~mask; + ptr |= error_code; + + return ptr; +} + +/* + * Authenticate an ERETAA/ERETAB instruction, returning true if the + * authentication succeeded and false otherwise. In all cases, *elr + * contains the VA to ERET to. Potential exception injection is left + * to the caller. + */ +bool kvm_auth_eretax(struct kvm_vcpu *vcpu, u64 *elr) +{ + u64 sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2); + u64 esr = kvm_vcpu_get_esr(vcpu); + u64 ptr, cptr, pac, mask; + struct ptrauth_key ikey; + + *elr = ptr = vcpu_read_sys_reg(vcpu, ELR_EL2); + + /* We assume we're already in the context of an ERETAx */ + if (esr_iss_is_eretab(esr)) { + if (!(sctlr & SCTLR_EL1_EnIB)) + return true; + + ikey.lo = __vcpu_sys_reg(vcpu, APIBKEYLO_EL1); + ikey.hi = __vcpu_sys_reg(vcpu, APIBKEYHI_EL1); + } else { + if (!(sctlr & SCTLR_EL1_EnIA)) + return true; + + ikey.lo = __vcpu_sys_reg(vcpu, APIAKEYLO_EL1); + ikey.hi = __vcpu_sys_reg(vcpu, APIAKEYHI_EL1); + } + + mask = compute_pac_mask(vcpu, !!(ptr & BIT(55))); + cptr = to_canonical_addr(vcpu, ptr, mask); + + pac = compute_pac(vcpu, cptr, ikey); + + /* + * Slightly deviate from the pseudocode: if we have a PAC + * match with the signed pointer, then it must be good. + * Anything after this point is pure error handling. + */ + if ((pac & mask) == (ptr & mask)) { + *elr = cptr; + return true; + } + + /* + * Authentication failed, corrupt the canonical address if + * PAuth2 isn't implemented, or some XORing if it is. + */ + if (!kvm_has_pauth(vcpu->kvm, PAuth2)) + cptr = corrupt_addr(vcpu, cptr); + else + cptr = ptr ^ (pac & mask); + + *elr = cptr; + return false; +} diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c index b7be96a53597..0f89157d31fd 100644 --- a/arch/arm64/kvm/pkvm.c +++ b/arch/arm64/kvm/pkvm.c @@ -7,6 +7,7 @@ #include <linux/init.h> #include <linux/kmemleak.h> #include <linux/kvm_host.h> +#include <asm/kvm_mmu.h> #include <linux/memblock.h> #include <linux/mutex.h> #include <linux/sort.h> @@ -110,6 +111,29 @@ static void __pkvm_destroy_hyp_vm(struct kvm *host_kvm) host_kvm->arch.pkvm.handle = 0; free_hyp_memcache(&host_kvm->arch.pkvm.teardown_mc); + free_hyp_memcache(&host_kvm->arch.pkvm.stage2_teardown_mc); +} + +static int __pkvm_create_hyp_vcpu(struct kvm_vcpu *vcpu) +{ + size_t hyp_vcpu_sz = PAGE_ALIGN(PKVM_HYP_VCPU_SIZE); + pkvm_handle_t handle = vcpu->kvm->arch.pkvm.handle; + void *hyp_vcpu; + int ret; + + vcpu->arch.pkvm_memcache.flags |= HYP_MEMCACHE_ACCOUNT_STAGE2; + + hyp_vcpu = alloc_pages_exact(hyp_vcpu_sz, GFP_KERNEL_ACCOUNT); + if (!hyp_vcpu) + return -ENOMEM; + + ret = kvm_call_hyp_nvhe(__pkvm_init_vcpu, handle, vcpu, hyp_vcpu); + if (!ret) + vcpu_set_flag(vcpu, VCPU_PKVM_FINALIZED); + else + free_pages_exact(hyp_vcpu, hyp_vcpu_sz); + + return ret; } /* @@ -124,11 +148,8 @@ static void __pkvm_destroy_hyp_vm(struct kvm *host_kvm) */ static int __pkvm_create_hyp_vm(struct kvm *host_kvm) { - size_t pgd_sz, hyp_vm_sz, hyp_vcpu_sz; - struct kvm_vcpu *host_vcpu; - pkvm_handle_t handle; + size_t pgd_sz, hyp_vm_sz; void *pgd, *hyp_vm; - unsigned long idx; int ret; if (host_kvm->created_vcpus < 1) @@ -160,40 +181,11 @@ static int __pkvm_create_hyp_vm(struct kvm *host_kvm) if (ret < 0) goto free_vm; - handle = ret; - - host_kvm->arch.pkvm.handle = handle; - - /* Donate memory for the vcpus at hyp and initialize it. */ - hyp_vcpu_sz = PAGE_ALIGN(PKVM_HYP_VCPU_SIZE); - kvm_for_each_vcpu(idx, host_vcpu, host_kvm) { - void *hyp_vcpu; - - /* Indexing of the vcpus to be sequential starting at 0. */ - if (WARN_ON(host_vcpu->vcpu_idx != idx)) { - ret = -EINVAL; - goto destroy_vm; - } - - hyp_vcpu = alloc_pages_exact(hyp_vcpu_sz, GFP_KERNEL_ACCOUNT); - if (!hyp_vcpu) { - ret = -ENOMEM; - goto destroy_vm; - } - - ret = kvm_call_hyp_nvhe(__pkvm_init_vcpu, handle, host_vcpu, - hyp_vcpu); - if (ret) { - free_pages_exact(hyp_vcpu, hyp_vcpu_sz); - goto destroy_vm; - } - } + host_kvm->arch.pkvm.handle = ret; + host_kvm->arch.pkvm.stage2_teardown_mc.flags |= HYP_MEMCACHE_ACCOUNT_STAGE2; + kvm_account_pgtable_pages(pgd, pgd_sz / PAGE_SIZE); return 0; - -destroy_vm: - __pkvm_destroy_hyp_vm(host_kvm); - return ret; free_vm: free_pages_exact(hyp_vm, hyp_vm_sz); free_pgd: @@ -213,6 +205,18 @@ int pkvm_create_hyp_vm(struct kvm *host_kvm) return ret; } +int pkvm_create_hyp_vcpu(struct kvm_vcpu *vcpu) +{ + int ret = 0; + + mutex_lock(&vcpu->kvm->arch.config_lock); + if (!vcpu_get_flag(vcpu, VCPU_PKVM_FINALIZED)) + ret = __pkvm_create_hyp_vcpu(vcpu); + mutex_unlock(&vcpu->kvm->arch.config_lock); + + return ret; +} + void pkvm_destroy_hyp_vm(struct kvm *host_kvm) { mutex_lock(&host_kvm->arch.config_lock); @@ -222,7 +226,6 @@ void pkvm_destroy_hyp_vm(struct kvm *host_kvm) int pkvm_init_host_vm(struct kvm *host_kvm) { - mutex_init(&host_kvm->lock); return 0; } @@ -259,6 +262,7 @@ static int __init finalize_pkvm(void) * at, which would end badly once inaccessible. */ kmemleak_free_part(__hyp_bss_start, __hyp_bss_end - __hyp_bss_start); + kmemleak_free_part(__hyp_rodata_start, __hyp_rodata_end - __hyp_rodata_start); kmemleak_free_part_phys(hyp_mem_base, hyp_mem_size); ret = pkvm_drop_host_privileges(); @@ -268,3 +272,203 @@ static int __init finalize_pkvm(void) return ret; } device_initcall_sync(finalize_pkvm); + +static int cmp_mappings(struct rb_node *node, const struct rb_node *parent) +{ + struct pkvm_mapping *a = rb_entry(node, struct pkvm_mapping, node); + struct pkvm_mapping *b = rb_entry(parent, struct pkvm_mapping, node); + + if (a->gfn < b->gfn) + return -1; + if (a->gfn > b->gfn) + return 1; + return 0; +} + +static struct rb_node *find_first_mapping_node(struct rb_root *root, u64 gfn) +{ + struct rb_node *node = root->rb_node, *prev = NULL; + struct pkvm_mapping *mapping; + + while (node) { + mapping = rb_entry(node, struct pkvm_mapping, node); + if (mapping->gfn == gfn) + return node; + prev = node; + node = (gfn < mapping->gfn) ? node->rb_left : node->rb_right; + } + + return prev; +} + +/* + * __tmp is updated to rb_next(__tmp) *before* entering the body of the loop to allow freeing + * of __map inline. + */ +#define for_each_mapping_in_range_safe(__pgt, __start, __end, __map) \ + for (struct rb_node *__tmp = find_first_mapping_node(&(__pgt)->pkvm_mappings, \ + ((__start) >> PAGE_SHIFT)); \ + __tmp && ({ \ + __map = rb_entry(__tmp, struct pkvm_mapping, node); \ + __tmp = rb_next(__tmp); \ + true; \ + }); \ + ) \ + if (__map->gfn < ((__start) >> PAGE_SHIFT)) \ + continue; \ + else if (__map->gfn >= ((__end) >> PAGE_SHIFT)) \ + break; \ + else + +int pkvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, + struct kvm_pgtable_mm_ops *mm_ops) +{ + pgt->pkvm_mappings = RB_ROOT; + pgt->mmu = mmu; + + return 0; +} + +void pkvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt) +{ + struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu); + pkvm_handle_t handle = kvm->arch.pkvm.handle; + struct pkvm_mapping *mapping; + struct rb_node *node; + + if (!handle) + return; + + node = rb_first(&pgt->pkvm_mappings); + while (node) { + mapping = rb_entry(node, struct pkvm_mapping, node); + kvm_call_hyp_nvhe(__pkvm_host_unshare_guest, handle, mapping->gfn); + node = rb_next(node); + rb_erase(&mapping->node, &pgt->pkvm_mappings); + kfree(mapping); + } +} + +int pkvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, + u64 phys, enum kvm_pgtable_prot prot, + void *mc, enum kvm_pgtable_walk_flags flags) +{ + struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu); + struct pkvm_mapping *mapping = NULL; + struct kvm_hyp_memcache *cache = mc; + u64 gfn = addr >> PAGE_SHIFT; + u64 pfn = phys >> PAGE_SHIFT; + int ret; + + if (size != PAGE_SIZE) + return -EINVAL; + + lockdep_assert_held_write(&kvm->mmu_lock); + ret = kvm_call_hyp_nvhe(__pkvm_host_share_guest, pfn, gfn, prot); + if (ret) { + /* Is the gfn already mapped due to a racing vCPU? */ + if (ret == -EPERM) + return -EAGAIN; + } + + swap(mapping, cache->mapping); + mapping->gfn = gfn; + mapping->pfn = pfn; + WARN_ON(rb_find_add(&mapping->node, &pgt->pkvm_mappings, cmp_mappings)); + + return ret; +} + +int pkvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size) +{ + struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu); + pkvm_handle_t handle = kvm->arch.pkvm.handle; + struct pkvm_mapping *mapping; + int ret = 0; + + lockdep_assert_held_write(&kvm->mmu_lock); + for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping) { + ret = kvm_call_hyp_nvhe(__pkvm_host_unshare_guest, handle, mapping->gfn); + if (WARN_ON(ret)) + break; + rb_erase(&mapping->node, &pgt->pkvm_mappings); + kfree(mapping); + } + + return ret; +} + +int pkvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size) +{ + struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu); + pkvm_handle_t handle = kvm->arch.pkvm.handle; + struct pkvm_mapping *mapping; + int ret = 0; + + lockdep_assert_held(&kvm->mmu_lock); + for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping) { + ret = kvm_call_hyp_nvhe(__pkvm_host_wrprotect_guest, handle, mapping->gfn); + if (WARN_ON(ret)) + break; + } + + return ret; +} + +int pkvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size) +{ + struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu); + struct pkvm_mapping *mapping; + + lockdep_assert_held(&kvm->mmu_lock); + for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping) + __clean_dcache_guest_page(pfn_to_kaddr(mapping->pfn), PAGE_SIZE); + + return 0; +} + +bool pkvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr, u64 size, bool mkold) +{ + struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu); + pkvm_handle_t handle = kvm->arch.pkvm.handle; + struct pkvm_mapping *mapping; + bool young = false; + + lockdep_assert_held(&kvm->mmu_lock); + for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping) + young |= kvm_call_hyp_nvhe(__pkvm_host_test_clear_young_guest, handle, mapping->gfn, + mkold); + + return young; +} + +int pkvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, enum kvm_pgtable_prot prot, + enum kvm_pgtable_walk_flags flags) +{ + return kvm_call_hyp_nvhe(__pkvm_host_relax_perms_guest, addr >> PAGE_SHIFT, prot); +} + +void pkvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr, + enum kvm_pgtable_walk_flags flags) +{ + WARN_ON(kvm_call_hyp_nvhe(__pkvm_host_mkyoung_guest, addr >> PAGE_SHIFT)); +} + +void pkvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, s8 level) +{ + WARN_ON_ONCE(1); +} + +kvm_pte_t *pkvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt, u64 phys, s8 level, + enum kvm_pgtable_prot prot, void *mc, bool force_pte) +{ + WARN_ON_ONCE(1); + return NULL; +} + +int pkvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size, + struct kvm_mmu_memory_cache *mc) +{ + WARN_ON_ONCE(1); + return -EINVAL; +} diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index a35ce10e0a9f..a1bc10d7116a 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -14,17 +14,21 @@ #include <asm/kvm_emulate.h> #include <kvm/arm_pmu.h> #include <kvm/arm_vgic.h> -#include <asm/arm_pmuv3.h> #define PERF_ATTR_CFG1_COUNTER_64BIT BIT(0) -DEFINE_STATIC_KEY_FALSE(kvm_arm_pmu_available); - static LIST_HEAD(arm_pmus); static DEFINE_MUTEX(arm_pmus_lock); static void kvm_pmu_create_perf_event(struct kvm_pmc *pmc); static void kvm_pmu_release_perf_event(struct kvm_pmc *pmc); +static bool kvm_pmu_counter_is_enabled(struct kvm_pmc *pmc); + +bool kvm_supports_guest_pmuv3(void) +{ + guard(mutex)(&arm_pmus_lock); + return !list_empty(&arm_pmus); +} static struct kvm_vcpu *kvm_pmc_to_vcpu(const struct kvm_pmc *pmc) { @@ -54,7 +58,7 @@ static u32 __kvm_pmu_event_mask(unsigned int pmuver) static u32 kvm_pmu_event_mask(struct kvm *kvm) { - u64 dfr0 = IDREG(kvm, SYS_ID_AA64DFR0_EL1); + u64 dfr0 = kvm_read_vm_id_reg(kvm, SYS_ID_AA64DFR0_EL1); u8 pmuver = SYS_FIELD_GET(ID_AA64DFR0_EL1, PMUVer, dfr0); return __kvm_pmu_event_mask(pmuver); @@ -90,7 +94,11 @@ static bool kvm_pmc_is_64bit(struct kvm_pmc *pmc) static bool kvm_pmc_has_64bit_overflow(struct kvm_pmc *pmc) { - u64 val = kvm_vcpu_read_pmcr(kvm_pmc_to_vcpu(pmc)); + struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); + u64 val = kvm_vcpu_read_pmcr(vcpu); + + if (kvm_pmu_counter_is_hyp(vcpu, pmc->idx)) + return __vcpu_sys_reg(vcpu, MDCR_EL2) & MDCR_EL2_HLP; return (pmc->idx < ARMV8_PMU_CYCLE_IDX && (val & ARMV8_PMU_PMCR_LP)) || (pmc->idx == ARMV8_PMU_CYCLE_IDX && (val & ARMV8_PMU_PMCR_LC)); @@ -112,6 +120,11 @@ static u32 counter_index_to_evtreg(u64 idx) return (idx == ARMV8_PMU_CYCLE_IDX) ? PMCCFILTR_EL0 : PMEVTYPER0_EL0 + idx; } +static u64 kvm_pmc_read_evtreg(const struct kvm_pmc *pmc) +{ + return __vcpu_sys_reg(kvm_pmc_to_vcpu(pmc), counter_index_to_evtreg(pmc->idx)); +} + static u64 kvm_pmu_get_pmc_value(struct kvm_pmc *pmc) { struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); @@ -141,9 +154,6 @@ static u64 kvm_pmu_get_pmc_value(struct kvm_pmc *pmc) */ u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx) { - if (!kvm_vcpu_has_pmu(vcpu)) - return 0; - return kvm_pmu_get_pmc_value(kvm_vcpu_idx_to_pmc(vcpu, select_idx)); } @@ -182,13 +192,23 @@ static void kvm_pmu_set_pmc_value(struct kvm_pmc *pmc, u64 val, bool force) */ void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val) { - if (!kvm_vcpu_has_pmu(vcpu)) - return; - kvm_pmu_set_pmc_value(kvm_vcpu_idx_to_pmc(vcpu, select_idx), val, false); } /** + * kvm_pmu_set_counter_value_user - set PMU counter value from user + * @vcpu: The vcpu pointer + * @select_idx: The counter index + * @val: The counter value + */ +void kvm_pmu_set_counter_value_user(struct kvm_vcpu *vcpu, u64 select_idx, u64 val) +{ + kvm_pmu_release_perf_event(kvm_vcpu_idx_to_pmc(vcpu, select_idx)); + __vcpu_sys_reg(vcpu, counter_index_to_reg(select_idx)) = val; + kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu); +} + +/** * kvm_pmu_release_perf_event - remove the perf event * @pmc: The PMU counter pointer */ @@ -234,25 +254,11 @@ void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu) int i; struct kvm_pmu *pmu = &vcpu->arch.pmu; - for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) + for (i = 0; i < KVM_ARMV8_PMU_MAX_COUNTERS; i++) pmu->pmc[i].idx = i; } /** - * kvm_pmu_vcpu_reset - reset pmu state for cpu - * @vcpu: The vcpu pointer - * - */ -void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu) -{ - unsigned long mask = kvm_pmu_valid_counter_mask(vcpu); - int i; - - for_each_set_bit(i, &mask, 32) - kvm_pmu_stop_counter(kvm_vcpu_idx_to_pmc(vcpu, i)); -} - -/** * kvm_pmu_vcpu_destroy - free perf event of PMU for cpu * @vcpu: The vcpu pointer * @@ -261,12 +267,55 @@ void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu) { int i; - for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) + for (i = 0; i < KVM_ARMV8_PMU_MAX_COUNTERS; i++) kvm_pmu_release_perf_event(kvm_vcpu_idx_to_pmc(vcpu, i)); irq_work_sync(&vcpu->arch.pmu.overflow_work); } -u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu) +static u64 kvm_pmu_hyp_counter_mask(struct kvm_vcpu *vcpu) +{ + unsigned int hpmn, n; + + if (!vcpu_has_nv(vcpu)) + return 0; + + hpmn = SYS_FIELD_GET(MDCR_EL2, HPMN, __vcpu_sys_reg(vcpu, MDCR_EL2)); + n = vcpu->kvm->arch.pmcr_n; + + /* + * Programming HPMN to a value greater than PMCR_EL0.N is + * CONSTRAINED UNPREDICTABLE. Make the implementation choice that an + * UNKNOWN number of counters (in our case, zero) are reserved for EL2. + */ + if (hpmn >= n) + return 0; + + /* + * Programming HPMN=0 is CONSTRAINED UNPREDICTABLE if FEAT_HPMN0 isn't + * implemented. Since KVM's ability to emulate HPMN=0 does not directly + * depend on hardware (all PMU registers are trapped), make the + * implementation choice that all counters are included in the second + * range reserved for EL2/EL3. + */ + return GENMASK(n - 1, hpmn); +} + +bool kvm_pmu_counter_is_hyp(struct kvm_vcpu *vcpu, unsigned int idx) +{ + return kvm_pmu_hyp_counter_mask(vcpu) & BIT(idx); +} + +u64 kvm_pmu_accessible_counter_mask(struct kvm_vcpu *vcpu) +{ + u64 mask = kvm_pmu_implemented_counter_mask(vcpu); + + if (!vcpu_has_nv(vcpu) || vcpu_is_el2(vcpu)) + return mask; + + return mask & ~kvm_pmu_hyp_counter_mask(vcpu); +} + +u64 kvm_pmu_implemented_counter_mask(struct kvm_vcpu *vcpu) { u64 val = FIELD_GET(ARMV8_PMU_PMCR_N, kvm_vcpu_read_pmcr(vcpu)); @@ -276,76 +325,70 @@ u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu) return GENMASK(val - 1, 0) | BIT(ARMV8_PMU_CYCLE_IDX); } -/** - * kvm_pmu_enable_counter_mask - enable selected PMU counters - * @vcpu: The vcpu pointer - * @val: the value guest writes to PMCNTENSET register - * - * Call perf_event_enable to start counting the perf event - */ -void kvm_pmu_enable_counter_mask(struct kvm_vcpu *vcpu, u64 val) +static void kvm_pmc_enable_perf_event(struct kvm_pmc *pmc) { - int i; - if (!kvm_vcpu_has_pmu(vcpu)) - return; - - if (!(kvm_vcpu_read_pmcr(vcpu) & ARMV8_PMU_PMCR_E) || !val) + if (!pmc->perf_event) { + kvm_pmu_create_perf_event(pmc); return; + } - for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) { - struct kvm_pmc *pmc; - - if (!(val & BIT(i))) - continue; - - pmc = kvm_vcpu_idx_to_pmc(vcpu, i); + perf_event_enable(pmc->perf_event); + if (pmc->perf_event->state != PERF_EVENT_STATE_ACTIVE) + kvm_debug("fail to enable perf event\n"); +} - if (!pmc->perf_event) { - kvm_pmu_create_perf_event(pmc); - } else { - perf_event_enable(pmc->perf_event); - if (pmc->perf_event->state != PERF_EVENT_STATE_ACTIVE) - kvm_debug("fail to enable perf event\n"); - } - } +static void kvm_pmc_disable_perf_event(struct kvm_pmc *pmc) +{ + if (pmc->perf_event) + perf_event_disable(pmc->perf_event); } -/** - * kvm_pmu_disable_counter_mask - disable selected PMU counters - * @vcpu: The vcpu pointer - * @val: the value guest writes to PMCNTENCLR register - * - * Call perf_event_disable to stop counting the perf event - */ -void kvm_pmu_disable_counter_mask(struct kvm_vcpu *vcpu, u64 val) +void kvm_pmu_reprogram_counter_mask(struct kvm_vcpu *vcpu, u64 val) { int i; - if (!kvm_vcpu_has_pmu(vcpu) || !val) + if (!val) return; - for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) { - struct kvm_pmc *pmc; + for (i = 0; i < KVM_ARMV8_PMU_MAX_COUNTERS; i++) { + struct kvm_pmc *pmc = kvm_vcpu_idx_to_pmc(vcpu, i); if (!(val & BIT(i))) continue; - pmc = kvm_vcpu_idx_to_pmc(vcpu, i); - - if (pmc->perf_event) - perf_event_disable(pmc->perf_event); + if (kvm_pmu_counter_is_enabled(pmc)) + kvm_pmc_enable_perf_event(pmc); + else + kvm_pmc_disable_perf_event(pmc); } + + kvm_vcpu_pmu_restore_guest(vcpu); } -static u64 kvm_pmu_overflow_status(struct kvm_vcpu *vcpu) +/* + * Returns the PMU overflow state, which is true if there exists an event + * counter where the values of the global enable control, PMOVSSET_EL0[n], and + * PMINTENSET_EL1[n] are all 1. + */ +static bool kvm_pmu_overflow_status(struct kvm_vcpu *vcpu) { - u64 reg = 0; + u64 reg = __vcpu_sys_reg(vcpu, PMOVSSET_EL0); - if ((kvm_vcpu_read_pmcr(vcpu) & ARMV8_PMU_PMCR_E)) { - reg = __vcpu_sys_reg(vcpu, PMOVSSET_EL0); - reg &= __vcpu_sys_reg(vcpu, PMCNTENSET_EL0); - reg &= __vcpu_sys_reg(vcpu, PMINTENSET_EL1); - } + reg &= __vcpu_sys_reg(vcpu, PMINTENSET_EL1); + + /* + * PMCR_EL0.E is the global enable control for event counters available + * to EL0 and EL1. + */ + if (!(kvm_vcpu_read_pmcr(vcpu) & ARMV8_PMU_PMCR_E)) + reg &= kvm_pmu_hyp_counter_mask(vcpu); + + /* + * Otherwise, MDCR_EL2.HPME is the global enable control for event + * counters reserved for EL2. + */ + if (!(vcpu_read_sys_reg(vcpu, MDCR_EL2) & MDCR_EL2_HPME)) + reg &= ~kvm_pmu_hyp_counter_mask(vcpu); return reg; } @@ -355,10 +398,7 @@ static void kvm_pmu_update_state(struct kvm_vcpu *vcpu) struct kvm_pmu *pmu = &vcpu->arch.pmu; bool overflow; - if (!kvm_vcpu_has_pmu(vcpu)) - return; - - overflow = !!kvm_pmu_overflow_status(vcpu); + overflow = kvm_pmu_overflow_status(vcpu); if (pmu->irq_level == overflow) return; @@ -553,41 +593,89 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) { int i; - if (!kvm_vcpu_has_pmu(vcpu)) - return; - /* Fixup PMCR_EL0 to reconcile the PMU version and the LP bit */ if (!kvm_has_feat(vcpu->kvm, ID_AA64DFR0_EL1, PMUVer, V3P5)) val &= ~ARMV8_PMU_PMCR_LP; + /* Request a reload of the PMU to enable/disable affected counters */ + if ((__vcpu_sys_reg(vcpu, PMCR_EL0) ^ val) & ARMV8_PMU_PMCR_E) + kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu); + /* The reset bits don't indicate any state, and shouldn't be saved. */ __vcpu_sys_reg(vcpu, PMCR_EL0) = val & ~(ARMV8_PMU_PMCR_C | ARMV8_PMU_PMCR_P); - if (val & ARMV8_PMU_PMCR_E) { - kvm_pmu_enable_counter_mask(vcpu, - __vcpu_sys_reg(vcpu, PMCNTENSET_EL0)); - } else { - kvm_pmu_disable_counter_mask(vcpu, - __vcpu_sys_reg(vcpu, PMCNTENSET_EL0)); - } - if (val & ARMV8_PMU_PMCR_C) kvm_pmu_set_counter_value(vcpu, ARMV8_PMU_CYCLE_IDX, 0); if (val & ARMV8_PMU_PMCR_P) { - unsigned long mask = kvm_pmu_valid_counter_mask(vcpu); - mask &= ~BIT(ARMV8_PMU_CYCLE_IDX); + /* + * Unlike other PMU sysregs, the controls in PMCR_EL0 always apply + * to the 'guest' range of counters and never the 'hyp' range. + */ + unsigned long mask = kvm_pmu_implemented_counter_mask(vcpu) & + ~kvm_pmu_hyp_counter_mask(vcpu) & + ~BIT(ARMV8_PMU_CYCLE_IDX); + for_each_set_bit(i, &mask, 32) kvm_pmu_set_pmc_value(kvm_vcpu_idx_to_pmc(vcpu, i), 0, true); } - kvm_vcpu_pmu_restore_guest(vcpu); } static bool kvm_pmu_counter_is_enabled(struct kvm_pmc *pmc) { struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); - return (kvm_vcpu_read_pmcr(vcpu) & ARMV8_PMU_PMCR_E) && - (__vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & BIT(pmc->idx)); + unsigned int mdcr = __vcpu_sys_reg(vcpu, MDCR_EL2); + + if (!(__vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & BIT(pmc->idx))) + return false; + + if (kvm_pmu_counter_is_hyp(vcpu, pmc->idx)) + return mdcr & MDCR_EL2_HPME; + + return kvm_vcpu_read_pmcr(vcpu) & ARMV8_PMU_PMCR_E; +} + +static bool kvm_pmc_counts_at_el0(struct kvm_pmc *pmc) +{ + u64 evtreg = kvm_pmc_read_evtreg(pmc); + bool nsu = evtreg & ARMV8_PMU_EXCLUDE_NS_EL0; + bool u = evtreg & ARMV8_PMU_EXCLUDE_EL0; + + return u == nsu; +} + +static bool kvm_pmc_counts_at_el1(struct kvm_pmc *pmc) +{ + u64 evtreg = kvm_pmc_read_evtreg(pmc); + bool nsk = evtreg & ARMV8_PMU_EXCLUDE_NS_EL1; + bool p = evtreg & ARMV8_PMU_EXCLUDE_EL1; + + return p == nsk; +} + +static bool kvm_pmc_counts_at_el2(struct kvm_pmc *pmc) +{ + struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); + u64 mdcr = __vcpu_sys_reg(vcpu, MDCR_EL2); + + if (!kvm_pmu_counter_is_hyp(vcpu, pmc->idx) && (mdcr & MDCR_EL2_HPMD)) + return false; + + return kvm_pmc_read_evtreg(pmc) & ARMV8_PMU_INCLUDE_EL2; +} + +static int kvm_map_pmu_event(struct kvm *kvm, unsigned int eventsel) +{ + struct arm_pmu *pmu = kvm->arch.arm_pmu; + + /* + * The CPU PMU likely isn't PMUv3; let the driver provide a mapping + * for the guest's PMUv3 event ID. + */ + if (unlikely(pmu->map_pmuv3_event)) + return pmu->map_pmuv3_event(eventsel); + + return eventsel; } /** @@ -600,17 +688,16 @@ static void kvm_pmu_create_perf_event(struct kvm_pmc *pmc) struct arm_pmu *arm_pmu = vcpu->kvm->arch.arm_pmu; struct perf_event *event; struct perf_event_attr attr; - u64 eventsel, reg, data; - bool p, u, nsk, nsu; + int eventsel; + u64 evtreg; - reg = counter_index_to_evtreg(pmc->idx); - data = __vcpu_sys_reg(vcpu, reg); + evtreg = kvm_pmc_read_evtreg(pmc); kvm_pmu_stop_counter(pmc); if (pmc->idx == ARMV8_PMU_CYCLE_IDX) eventsel = ARMV8_PMUV3_PERFCTR_CPU_CYCLES; else - eventsel = data & kvm_pmu_event_mask(vcpu->kvm); + eventsel = evtreg & kvm_pmu_event_mask(vcpu->kvm); /* * Neither SW increment nor chained events need to be backed @@ -628,23 +715,34 @@ static void kvm_pmu_create_perf_event(struct kvm_pmc *pmc) !test_bit(eventsel, vcpu->kvm->arch.pmu_filter)) return; - p = data & ARMV8_PMU_EXCLUDE_EL1; - u = data & ARMV8_PMU_EXCLUDE_EL0; - nsk = data & ARMV8_PMU_EXCLUDE_NS_EL1; - nsu = data & ARMV8_PMU_EXCLUDE_NS_EL0; + /* + * Don't create an event if we're running on hardware that requires + * PMUv3 event translation and we couldn't find a valid mapping. + */ + eventsel = kvm_map_pmu_event(vcpu->kvm, eventsel); + if (eventsel < 0) + return; memset(&attr, 0, sizeof(struct perf_event_attr)); attr.type = arm_pmu->pmu.type; attr.size = sizeof(attr); attr.pinned = 1; attr.disabled = !kvm_pmu_counter_is_enabled(pmc); - attr.exclude_user = (u != nsu); - attr.exclude_kernel = (p != nsk); + attr.exclude_user = !kvm_pmc_counts_at_el0(pmc); attr.exclude_hv = 1; /* Don't count EL2 events */ attr.exclude_host = 1; /* Don't count host events */ attr.config = eventsel; /* + * Filter events at EL1 (i.e. vEL2) when in a hyp context based on the + * guest's EL2 filter. + */ + if (unlikely(is_hyp_ctxt(vcpu))) + attr.exclude_kernel = !kvm_pmc_counts_at_el2(pmc); + else + attr.exclude_kernel = !kvm_pmc_counts_at_el1(pmc); + + /* * If counting with a 64bit counter, advertise it to the perf * code, carefully dealing with the initial sample period * which also depends on the overflow. @@ -682,9 +780,6 @@ void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data, struct kvm_pmc *pmc = kvm_vcpu_idx_to_pmc(vcpu, select_idx); u64 reg; - if (!kvm_vcpu_has_pmu(vcpu)) - return; - reg = counter_index_to_evtreg(pmc->idx); __vcpu_sys_reg(vcpu, reg) = data & kvm_pmu_evtyper_mask(vcpu->kvm); @@ -702,29 +797,23 @@ void kvm_host_pmu_init(struct arm_pmu *pmu) if (!pmuv3_implemented(kvm_arm_pmu_get_pmuver_limit())) return; - mutex_lock(&arm_pmus_lock); + guard(mutex)(&arm_pmus_lock); entry = kmalloc(sizeof(*entry), GFP_KERNEL); if (!entry) - goto out_unlock; + return; entry->arm_pmu = pmu; list_add_tail(&entry->entry, &arm_pmus); - - if (list_is_singular(&arm_pmus)) - static_branch_enable(&kvm_arm_pmu_available); - -out_unlock: - mutex_unlock(&arm_pmus_lock); } static struct arm_pmu *kvm_pmu_probe_armpmu(void) { - struct arm_pmu *tmp, *pmu = NULL; struct arm_pmu_entry *entry; + struct arm_pmu *pmu; int cpu; - mutex_lock(&arm_pmus_lock); + guard(mutex)(&arm_pmus_lock); /* * It is safe to use a stale cpu to iterate the list of PMUs so long as @@ -745,42 +834,62 @@ static struct arm_pmu *kvm_pmu_probe_armpmu(void) */ cpu = raw_smp_processor_id(); list_for_each_entry(entry, &arm_pmus, entry) { - tmp = entry->arm_pmu; + pmu = entry->arm_pmu; - if (cpumask_test_cpu(cpu, &tmp->supported_cpus)) { - pmu = tmp; - break; - } + if (cpumask_test_cpu(cpu, &pmu->supported_cpus)) + return pmu; } - mutex_unlock(&arm_pmus_lock); + return NULL; +} - return pmu; +static u64 __compute_pmceid(struct arm_pmu *pmu, bool pmceid1) +{ + u32 hi[2], lo[2]; + + bitmap_to_arr32(lo, pmu->pmceid_bitmap, ARMV8_PMUV3_MAX_COMMON_EVENTS); + bitmap_to_arr32(hi, pmu->pmceid_ext_bitmap, ARMV8_PMUV3_MAX_COMMON_EVENTS); + + return ((u64)hi[pmceid1] << 32) | lo[pmceid1]; +} + +static u64 compute_pmceid0(struct arm_pmu *pmu) +{ + u64 val = __compute_pmceid(pmu, 0); + + /* always support SW_INCR */ + val |= BIT(ARMV8_PMUV3_PERFCTR_SW_INCR); + /* always support CHAIN */ + val |= BIT(ARMV8_PMUV3_PERFCTR_CHAIN); + return val; +} + +static u64 compute_pmceid1(struct arm_pmu *pmu) +{ + u64 val = __compute_pmceid(pmu, 1); + + /* + * Don't advertise STALL_SLOT*, as PMMIR_EL0 is handled + * as RAZ + */ + val &= ~(BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT - 32) | + BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT_FRONTEND - 32) | + BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT_BACKEND - 32)); + return val; } u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1) { + struct arm_pmu *cpu_pmu = vcpu->kvm->arch.arm_pmu; unsigned long *bmap = vcpu->kvm->arch.pmu_filter; u64 val, mask = 0; int base, i, nr_events; - if (!kvm_vcpu_has_pmu(vcpu)) - return 0; - if (!pmceid1) { - val = read_sysreg(pmceid0_el0); - /* always support CHAIN */ - val |= BIT(ARMV8_PMUV3_PERFCTR_CHAIN); + val = compute_pmceid0(cpu_pmu); base = 0; } else { - val = read_sysreg(pmceid1_el0); - /* - * Don't advertise STALL_SLOT*, as PMMIR_EL0 is handled - * as RAZ - */ - val &= ~(BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT - 32) | - BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT_FRONTEND - 32) | - BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT_BACKEND - 32)); + val = compute_pmceid1(cpu_pmu); base = 32; } @@ -805,20 +914,17 @@ u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1) void kvm_vcpu_reload_pmu(struct kvm_vcpu *vcpu) { - u64 mask = kvm_pmu_valid_counter_mask(vcpu); - - kvm_pmu_handle_pmcr(vcpu, kvm_vcpu_read_pmcr(vcpu)); + u64 mask = kvm_pmu_implemented_counter_mask(vcpu); __vcpu_sys_reg(vcpu, PMOVSSET_EL0) &= mask; __vcpu_sys_reg(vcpu, PMINTENSET_EL1) &= mask; __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) &= mask; + + kvm_pmu_reprogram_counter_mask(vcpu, mask); } int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu) { - if (!kvm_vcpu_has_pmu(vcpu)) - return 0; - if (!vcpu->arch.pmu.created) return -EINVAL; @@ -841,9 +947,6 @@ int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu) return -EINVAL; } - /* One-off reload of the PMU on first run */ - kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu); - return 0; } @@ -911,10 +1014,17 @@ u8 kvm_arm_pmu_get_max_counters(struct kvm *kvm) struct arm_pmu *arm_pmu = kvm->arch.arm_pmu; /* - * The arm_pmu->num_events considers the cycle counter as well. - * Ignore that and return only the general-purpose counters. + * PMUv3 requires that all event counters are capable of counting any + * event, though the same may not be true of non-PMUv3 hardware. + */ + if (cpus_have_final_cap(ARM64_WORKAROUND_PMUV3_IMPDEF_TRAPS)) + return 1; + + /* + * The arm_pmu->cntr_mask considers the fixed counter(s) as well. + * Ignore those and return only the general-purpose counters. */ - return arm_pmu->num_events - 1; + return bitmap_weight(arm_pmu->cntr_mask, ARMV8_PMU_MAX_GENERAL_COUNTERS); } static void kvm_arm_set_pmu(struct kvm *kvm, struct arm_pmu *arm_pmu) @@ -1121,13 +1231,26 @@ int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) u8 kvm_arm_pmu_get_pmuver_limit(void) { - u64 tmp; + unsigned int pmuver; - tmp = read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1); - tmp = cpuid_feature_cap_perfmon_field(tmp, - ID_AA64DFR0_EL1_PMUVer_SHIFT, - ID_AA64DFR0_EL1_PMUVer_V3P5); - return FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), tmp); + pmuver = SYS_FIELD_GET(ID_AA64DFR0_EL1, PMUVer, + read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1)); + + /* + * Spoof a barebones PMUv3 implementation if the system supports IMPDEF + * traps of the PMUv3 sysregs + */ + if (cpus_have_final_cap(ARM64_WORKAROUND_PMUV3_IMPDEF_TRAPS)) + return ID_AA64DFR0_EL1_PMUVer_IMP; + + /* + * Otherwise, treat IMPLEMENTATION DEFINED functionality as + * unimplemented + */ + if (pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF) + return 0; + + return min(pmuver, ID_AA64DFR0_EL1_PMUVer_V3P5); } /** @@ -1140,3 +1263,29 @@ u64 kvm_vcpu_read_pmcr(struct kvm_vcpu *vcpu) return u64_replace_bits(pmcr, vcpu->kvm->arch.pmcr_n, ARMV8_PMU_PMCR_N); } + +void kvm_pmu_nested_transition(struct kvm_vcpu *vcpu) +{ + bool reprogrammed = false; + unsigned long mask; + int i; + + mask = __vcpu_sys_reg(vcpu, PMCNTENSET_EL0); + for_each_set_bit(i, &mask, 32) { + struct kvm_pmc *pmc = kvm_vcpu_idx_to_pmc(vcpu, i); + + /* + * We only need to reconfigure events where the filter is + * different at EL1 vs. EL2, as we're multiplexing the true EL1 + * event filter bit for nested. + */ + if (kvm_pmc_counts_at_el1(pmc) == kvm_pmc_counts_at_el2(pmc)) + continue; + + kvm_pmu_create_perf_event(pmc); + reprogrammed = true; + } + + if (reprogrammed) + kvm_vcpu_pmu_restore_guest(vcpu); +} diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c index a243934c5568..6b48a3d16d0d 100644 --- a/arch/arm64/kvm/pmu.c +++ b/arch/arm64/kvm/pmu.c @@ -5,6 +5,8 @@ */ #include <linux/kvm_host.h> #include <linux/perf_event.h> +#include <linux/perf/arm_pmu.h> +#include <linux/perf/arm_pmuv3.h> static DEFINE_PER_CPU(struct kvm_pmu_events, kvm_pmu_events); @@ -35,11 +37,11 @@ struct kvm_pmu_events *kvm_get_pmu_events(void) * Add events to track that we may want to switch at guest entry/exit * time. */ -void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr) +void kvm_set_pmu_events(u64 set, struct perf_event_attr *attr) { struct kvm_pmu_events *pmu = kvm_get_pmu_events(); - if (!kvm_arm_support_pmu_v3() || !kvm_pmu_switch_needed(attr)) + if (!system_supports_pmuv3() || !kvm_pmu_switch_needed(attr)) return; if (!attr->exclude_host) @@ -51,90 +53,43 @@ void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr) /* * Stop tracking events */ -void kvm_clr_pmu_events(u32 clr) +void kvm_clr_pmu_events(u64 clr) { struct kvm_pmu_events *pmu = kvm_get_pmu_events(); - if (!kvm_arm_support_pmu_v3()) + if (!system_supports_pmuv3()) return; pmu->events_host &= ~clr; pmu->events_guest &= ~clr; } -#define PMEVTYPER_READ_CASE(idx) \ - case idx: \ - return read_sysreg(pmevtyper##idx##_el0) - -#define PMEVTYPER_WRITE_CASE(idx) \ - case idx: \ - write_sysreg(val, pmevtyper##idx##_el0); \ - break - -#define PMEVTYPER_CASES(readwrite) \ - PMEVTYPER_##readwrite##_CASE(0); \ - PMEVTYPER_##readwrite##_CASE(1); \ - PMEVTYPER_##readwrite##_CASE(2); \ - PMEVTYPER_##readwrite##_CASE(3); \ - PMEVTYPER_##readwrite##_CASE(4); \ - PMEVTYPER_##readwrite##_CASE(5); \ - PMEVTYPER_##readwrite##_CASE(6); \ - PMEVTYPER_##readwrite##_CASE(7); \ - PMEVTYPER_##readwrite##_CASE(8); \ - PMEVTYPER_##readwrite##_CASE(9); \ - PMEVTYPER_##readwrite##_CASE(10); \ - PMEVTYPER_##readwrite##_CASE(11); \ - PMEVTYPER_##readwrite##_CASE(12); \ - PMEVTYPER_##readwrite##_CASE(13); \ - PMEVTYPER_##readwrite##_CASE(14); \ - PMEVTYPER_##readwrite##_CASE(15); \ - PMEVTYPER_##readwrite##_CASE(16); \ - PMEVTYPER_##readwrite##_CASE(17); \ - PMEVTYPER_##readwrite##_CASE(18); \ - PMEVTYPER_##readwrite##_CASE(19); \ - PMEVTYPER_##readwrite##_CASE(20); \ - PMEVTYPER_##readwrite##_CASE(21); \ - PMEVTYPER_##readwrite##_CASE(22); \ - PMEVTYPER_##readwrite##_CASE(23); \ - PMEVTYPER_##readwrite##_CASE(24); \ - PMEVTYPER_##readwrite##_CASE(25); \ - PMEVTYPER_##readwrite##_CASE(26); \ - PMEVTYPER_##readwrite##_CASE(27); \ - PMEVTYPER_##readwrite##_CASE(28); \ - PMEVTYPER_##readwrite##_CASE(29); \ - PMEVTYPER_##readwrite##_CASE(30) - /* * Read a value direct from PMEVTYPER<idx> where idx is 0-30 - * or PMCCFILTR_EL0 where idx is ARMV8_PMU_CYCLE_IDX (31). + * or PMxCFILTR_EL0 where idx is 31-32. */ static u64 kvm_vcpu_pmu_read_evtype_direct(int idx) { - switch (idx) { - PMEVTYPER_CASES(READ); - case ARMV8_PMU_CYCLE_IDX: - return read_sysreg(pmccfiltr_el0); - default: - WARN_ON(1); - } + if (idx == ARMV8_PMU_CYCLE_IDX) + return read_pmccfiltr(); + else if (idx == ARMV8_PMU_INSTR_IDX) + return read_pmicfiltr(); - return 0; + return read_pmevtypern(idx); } /* * Write a value direct to PMEVTYPER<idx> where idx is 0-30 - * or PMCCFILTR_EL0 where idx is ARMV8_PMU_CYCLE_IDX (31). + * or PMxCFILTR_EL0 where idx is 31-32. */ static void kvm_vcpu_pmu_write_evtype_direct(int idx, u32 val) { - switch (idx) { - PMEVTYPER_CASES(WRITE); - case ARMV8_PMU_CYCLE_IDX: - write_sysreg(val, pmccfiltr_el0); - break; - default: - WARN_ON(1); - } + if (idx == ARMV8_PMU_CYCLE_IDX) + write_pmccfiltr(val); + else if (idx == ARMV8_PMU_INSTR_IDX) + write_pmicfiltr(val); + else + write_pmevtypern(idx, val); } /* @@ -145,7 +100,7 @@ static void kvm_vcpu_pmu_enable_el0(unsigned long events) u64 typer; u32 counter; - for_each_set_bit(counter, &events, 32) { + for_each_set_bit(counter, &events, ARMPMU_MAX_HWEVENTS) { typer = kvm_vcpu_pmu_read_evtype_direct(counter); typer &= ~ARMV8_PMU_EXCLUDE_EL0; kvm_vcpu_pmu_write_evtype_direct(counter, typer); @@ -160,7 +115,7 @@ static void kvm_vcpu_pmu_disable_el0(unsigned long events) u64 typer; u32 counter; - for_each_set_bit(counter, &events, 32) { + for_each_set_bit(counter, &events, ARMPMU_MAX_HWEVENTS) { typer = kvm_vcpu_pmu_read_evtype_direct(counter); typer |= ARMV8_PMU_EXCLUDE_EL0; kvm_vcpu_pmu_write_evtype_direct(counter, typer); @@ -176,9 +131,9 @@ static void kvm_vcpu_pmu_disable_el0(unsigned long events) void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu) { struct kvm_pmu_events *pmu; - u32 events_guest, events_host; + u64 events_guest, events_host; - if (!kvm_arm_support_pmu_v3() || !has_vhe()) + if (!system_supports_pmuv3() || !has_vhe()) return; preempt_disable(); @@ -197,9 +152,9 @@ void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu) void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu) { struct kvm_pmu_events *pmu; - u32 events_guest, events_host; + u64 events_guest, events_host; - if (!kvm_arm_support_pmu_v3() || !has_vhe()) + if (!system_supports_pmuv3() || !has_vhe()) return; pmu = kvm_get_pmu_events(); @@ -225,14 +180,14 @@ bool kvm_set_pmuserenr(u64 val) struct kvm_cpu_context *hctxt; struct kvm_vcpu *vcpu; - if (!kvm_arm_support_pmu_v3() || !has_vhe()) + if (!system_supports_pmuv3() || !has_vhe()) return false; vcpu = kvm_get_running_vcpu(); if (!vcpu || !vcpu_get_flag(vcpu, PMUSERENR_ON_CPU)) return false; - hctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; + hctxt = host_data_ptr(host_ctxt); ctxt_sys_reg(hctxt, PMUSERENR_EL0) = val; return true; } diff --git a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c index 1f69b667332b..3b5dbe9a0a0e 100644 --- a/arch/arm64/kvm/psci.c +++ b/arch/arm64/kvm/psci.c @@ -194,6 +194,12 @@ static void kvm_psci_system_off(struct kvm_vcpu *vcpu) kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_SHUTDOWN, 0); } +static void kvm_psci_system_off2(struct kvm_vcpu *vcpu) +{ + kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_SHUTDOWN, + KVM_SYSTEM_EVENT_SHUTDOWN_FLAG_PSCI_OFF2); +} + static void kvm_psci_system_reset(struct kvm_vcpu *vcpu) { kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_RESET, 0); @@ -322,7 +328,7 @@ static int kvm_psci_1_x_call(struct kvm_vcpu *vcpu, u32 minor) switch(psci_fn) { case PSCI_0_2_FN_PSCI_VERSION: - val = minor == 0 ? KVM_ARM_PSCI_1_0 : KVM_ARM_PSCI_1_1; + val = PSCI_VERSION(1, minor); break; case PSCI_1_0_FN_PSCI_FEATURES: arg = smccc_get_arg1(vcpu); @@ -358,6 +364,11 @@ static int kvm_psci_1_x_call(struct kvm_vcpu *vcpu, u32 minor) if (minor >= 1) val = 0; break; + case PSCI_1_3_FN_SYSTEM_OFF2: + case PSCI_1_3_FN64_SYSTEM_OFF2: + if (minor >= 3) + val = PSCI_1_3_OFF_TYPE_HIBERNATE_OFF; + break; } break; case PSCI_1_0_FN_SYSTEM_SUSPEND: @@ -392,6 +403,33 @@ static int kvm_psci_1_x_call(struct kvm_vcpu *vcpu, u32 minor) break; } break; + case PSCI_1_3_FN_SYSTEM_OFF2: + kvm_psci_narrow_to_32bit(vcpu); + fallthrough; + case PSCI_1_3_FN64_SYSTEM_OFF2: + if (minor < 3) + break; + + arg = smccc_get_arg1(vcpu); + /* + * SYSTEM_OFF2 defaults to HIBERNATE_OFF if arg1 is zero. arg2 + * must be zero. + */ + if ((arg && arg != PSCI_1_3_OFF_TYPE_HIBERNATE_OFF) || + smccc_get_arg2(vcpu) != 0) { + val = PSCI_RET_INVALID_PARAMS; + break; + } + kvm_psci_system_off2(vcpu); + /* + * We shouldn't be going back to the guest after receiving a + * SYSTEM_OFF2 request. Preload a return value of + * INTERNAL_FAILURE should userspace ignore the exit and resume + * the vCPU. + */ + val = PSCI_RET_INTERNAL_FAILURE; + ret = 0; + break; default: return kvm_psci_0_2_call(vcpu); } @@ -449,6 +487,10 @@ int kvm_psci_call(struct kvm_vcpu *vcpu) } switch (version) { + case KVM_ARM_PSCI_1_3: + return kvm_psci_1_x_call(vcpu, 3); + case KVM_ARM_PSCI_1_2: + return kvm_psci_1_x_call(vcpu, 2); case KVM_ARM_PSCI_1_1: return kvm_psci_1_x_call(vcpu, 1); case KVM_ARM_PSCI_1_0: diff --git a/arch/arm64/kvm/ptdump.c b/arch/arm64/kvm/ptdump.c new file mode 100644 index 000000000000..098416d7e5c2 --- /dev/null +++ b/arch/arm64/kvm/ptdump.c @@ -0,0 +1,268 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Debug helper used to dump the stage-2 pagetables of the system and their + * associated permissions. + * + * Copyright (C) Google, 2024 + * Author: Sebastian Ene <sebastianene@google.com> + */ +#include <linux/debugfs.h> +#include <linux/kvm_host.h> +#include <linux/seq_file.h> + +#include <asm/kvm_mmu.h> +#include <asm/kvm_pgtable.h> +#include <asm/ptdump.h> + +#define MARKERS_LEN 2 +#define KVM_PGTABLE_MAX_LEVELS (KVM_PGTABLE_LAST_LEVEL + 1) + +struct kvm_ptdump_guest_state { + struct kvm *kvm; + struct ptdump_pg_state parser_state; + struct addr_marker ipa_marker[MARKERS_LEN]; + struct ptdump_pg_level level[KVM_PGTABLE_MAX_LEVELS]; + struct ptdump_range range[MARKERS_LEN]; +}; + +static const struct ptdump_prot_bits stage2_pte_bits[] = { + { + .mask = PTE_VALID, + .val = PTE_VALID, + .set = " ", + .clear = "F", + }, { + .mask = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | PTE_VALID, + .val = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | PTE_VALID, + .set = "R", + .clear = " ", + }, { + .mask = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | PTE_VALID, + .val = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | PTE_VALID, + .set = "W", + .clear = " ", + }, { + .mask = KVM_PTE_LEAF_ATTR_HI_S2_XN | PTE_VALID, + .val = PTE_VALID, + .set = " ", + .clear = "X", + }, { + .mask = KVM_PTE_LEAF_ATTR_LO_S2_AF | PTE_VALID, + .val = KVM_PTE_LEAF_ATTR_LO_S2_AF | PTE_VALID, + .set = "AF", + .clear = " ", + }, { + .mask = PMD_TYPE_MASK, + .val = PMD_TYPE_SECT, + .set = "BLK", + .clear = " ", + }, +}; + +static int kvm_ptdump_visitor(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) +{ + struct ptdump_pg_state *st = ctx->arg; + struct ptdump_state *pt_st = &st->ptdump; + + note_page(pt_st, ctx->addr, ctx->level, ctx->old); + + return 0; +} + +static int kvm_ptdump_build_levels(struct ptdump_pg_level *level, u32 start_lvl) +{ + u32 i; + u64 mask; + + if (WARN_ON_ONCE(start_lvl >= KVM_PGTABLE_LAST_LEVEL)) + return -EINVAL; + + mask = 0; + for (i = 0; i < ARRAY_SIZE(stage2_pte_bits); i++) + mask |= stage2_pte_bits[i].mask; + + for (i = start_lvl; i < KVM_PGTABLE_MAX_LEVELS; i++) { + snprintf(level[i].name, sizeof(level[i].name), "%u", i); + + level[i].num = ARRAY_SIZE(stage2_pte_bits); + level[i].bits = stage2_pte_bits; + level[i].mask = mask; + } + + return 0; +} + +static struct kvm_ptdump_guest_state *kvm_ptdump_parser_create(struct kvm *kvm) +{ + struct kvm_ptdump_guest_state *st; + struct kvm_s2_mmu *mmu = &kvm->arch.mmu; + struct kvm_pgtable *pgtable = mmu->pgt; + int ret; + + st = kzalloc(sizeof(struct kvm_ptdump_guest_state), GFP_KERNEL_ACCOUNT); + if (!st) + return ERR_PTR(-ENOMEM); + + ret = kvm_ptdump_build_levels(&st->level[0], pgtable->start_level); + if (ret) { + kfree(st); + return ERR_PTR(ret); + } + + st->ipa_marker[0].name = "Guest IPA"; + st->ipa_marker[1].start_address = BIT(pgtable->ia_bits); + st->range[0].end = BIT(pgtable->ia_bits); + + st->kvm = kvm; + st->parser_state = (struct ptdump_pg_state) { + .marker = &st->ipa_marker[0], + .level = -1, + .pg_level = &st->level[0], + .ptdump.range = &st->range[0], + .start_address = 0, + }; + + return st; +} + +static int kvm_ptdump_guest_show(struct seq_file *m, void *unused) +{ + int ret; + struct kvm_ptdump_guest_state *st = m->private; + struct kvm *kvm = st->kvm; + struct kvm_s2_mmu *mmu = &kvm->arch.mmu; + struct ptdump_pg_state *parser_state = &st->parser_state; + struct kvm_pgtable_walker walker = (struct kvm_pgtable_walker) { + .cb = kvm_ptdump_visitor, + .arg = parser_state, + .flags = KVM_PGTABLE_WALK_LEAF, + }; + + parser_state->seq = m; + + write_lock(&kvm->mmu_lock); + ret = kvm_pgtable_walk(mmu->pgt, 0, BIT(mmu->pgt->ia_bits), &walker); + write_unlock(&kvm->mmu_lock); + + return ret; +} + +static int kvm_ptdump_guest_open(struct inode *m, struct file *file) +{ + struct kvm *kvm = m->i_private; + struct kvm_ptdump_guest_state *st; + int ret; + + if (!kvm_get_kvm_safe(kvm)) + return -ENOENT; + + st = kvm_ptdump_parser_create(kvm); + if (IS_ERR(st)) { + ret = PTR_ERR(st); + goto err_with_kvm_ref; + } + + ret = single_open(file, kvm_ptdump_guest_show, st); + if (!ret) + return 0; + + kfree(st); +err_with_kvm_ref: + kvm_put_kvm(kvm); + return ret; +} + +static int kvm_ptdump_guest_close(struct inode *m, struct file *file) +{ + struct kvm *kvm = m->i_private; + void *st = ((struct seq_file *)file->private_data)->private; + + kfree(st); + kvm_put_kvm(kvm); + + return single_release(m, file); +} + +static const struct file_operations kvm_ptdump_guest_fops = { + .open = kvm_ptdump_guest_open, + .read = seq_read, + .llseek = seq_lseek, + .release = kvm_ptdump_guest_close, +}; + +static int kvm_pgtable_range_show(struct seq_file *m, void *unused) +{ + struct kvm_pgtable *pgtable = m->private; + + seq_printf(m, "%2u\n", pgtable->ia_bits); + return 0; +} + +static int kvm_pgtable_levels_show(struct seq_file *m, void *unused) +{ + struct kvm_pgtable *pgtable = m->private; + + seq_printf(m, "%1d\n", KVM_PGTABLE_MAX_LEVELS - pgtable->start_level); + return 0; +} + +static int kvm_pgtable_debugfs_open(struct inode *m, struct file *file, + int (*show)(struct seq_file *, void *)) +{ + struct kvm *kvm = m->i_private; + struct kvm_pgtable *pgtable; + int ret; + + if (!kvm_get_kvm_safe(kvm)) + return -ENOENT; + + pgtable = kvm->arch.mmu.pgt; + + ret = single_open(file, show, pgtable); + if (ret < 0) + kvm_put_kvm(kvm); + return ret; +} + +static int kvm_pgtable_range_open(struct inode *m, struct file *file) +{ + return kvm_pgtable_debugfs_open(m, file, kvm_pgtable_range_show); +} + +static int kvm_pgtable_levels_open(struct inode *m, struct file *file) +{ + return kvm_pgtable_debugfs_open(m, file, kvm_pgtable_levels_show); +} + +static int kvm_pgtable_debugfs_close(struct inode *m, struct file *file) +{ + struct kvm *kvm = m->i_private; + + kvm_put_kvm(kvm); + return single_release(m, file); +} + +static const struct file_operations kvm_pgtable_range_fops = { + .open = kvm_pgtable_range_open, + .read = seq_read, + .llseek = seq_lseek, + .release = kvm_pgtable_debugfs_close, +}; + +static const struct file_operations kvm_pgtable_levels_fops = { + .open = kvm_pgtable_levels_open, + .read = seq_read, + .llseek = seq_lseek, + .release = kvm_pgtable_debugfs_close, +}; + +void kvm_s2_ptdump_create_debugfs(struct kvm *kvm) +{ + debugfs_create_file("stage2_page_tables", 0400, kvm->debugfs_dentry, + kvm, &kvm_ptdump_guest_fops); + debugfs_create_file("ipa_range", 0400, kvm->debugfs_dentry, kvm, + &kvm_pgtable_range_fops); + debugfs_create_file("stage2_levels", 0400, kvm->debugfs_dentry, + kvm, &kvm_pgtable_levels_fops); +} diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 68d1d05672bd..f82fcc614e13 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -32,6 +32,7 @@ /* Maximum phys_shift supported for any VM on this host */ static u32 __ro_after_init kvm_ipa_limit; +unsigned int __ro_after_init kvm_host_sve_max_vl; /* * ARMv8 Reset Values @@ -51,6 +52,8 @@ int __init kvm_arm_init_sve(void) { if (system_supports_sve()) { kvm_sve_max_vl = sve_max_virtualisable_vl(); + kvm_host_sve_max_vl = sve_max_vl(); + kvm_nvhe_sym(kvm_host_sve_max_vl) = kvm_host_sve_max_vl; /* * The get_sve_reg()/set_sve_reg() ioctl interface will need @@ -82,7 +85,7 @@ static void kvm_vcpu_enable_sve(struct kvm_vcpu *vcpu) * KVM_REG_ARM64_SVE_VLS. Allocation is deferred until * kvm_arm_vcpu_finalize(), which freezes the configuration. */ - vcpu_set_flag(vcpu, GUEST_HAS_SVE); + set_bit(KVM_ARCH_FLAG_GUEST_HAS_SVE, &vcpu->kvm->arch.flags); } /* @@ -151,7 +154,6 @@ void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu) { void *sve_state = vcpu->arch.sve_state; - kvm_vcpu_unshare_task_fp(vcpu); kvm_unshare_hyp(vcpu, vcpu + 1); if (sve_state) kvm_unshare_hyp(sve_state, sve_state + vcpu_sve_state_size(vcpu)); @@ -165,11 +167,6 @@ static void kvm_vcpu_reset_sve(struct kvm_vcpu *vcpu) memset(vcpu->arch.sve_state, 0, vcpu_sve_state_size(vcpu)); } -static void kvm_vcpu_enable_ptrauth(struct kvm_vcpu *vcpu) -{ - vcpu_set_flag(vcpu, GUEST_HAS_PTRAUTH); -} - /** * kvm_reset_vcpu - sets core registers and sys_regs to reset value * @vcpu: The VCPU pointer @@ -199,9 +196,6 @@ void kvm_reset_vcpu(struct kvm_vcpu *vcpu) vcpu->arch.reset_state.reset = false; spin_unlock(&vcpu->arch.mp_state_lock); - /* Reset PMU outside of the non-preemptible section */ - kvm_pmu_vcpu_reset(vcpu); - preempt_disable(); loaded = (vcpu->cpu != -1); if (loaded) @@ -214,10 +208,6 @@ void kvm_reset_vcpu(struct kvm_vcpu *vcpu) kvm_vcpu_reset_sve(vcpu); } - if (vcpu_has_feature(vcpu, KVM_ARM_VCPU_PTRAUTH_ADDRESS) || - vcpu_has_feature(vcpu, KVM_ARM_VCPU_PTRAUTH_GENERIC)) - kvm_vcpu_enable_ptrauth(vcpu); - if (vcpu_el1_is_32bit(vcpu)) pstate = VCPU_RESET_PSTATE_SVC; else if (vcpu_has_nv(vcpu)) @@ -266,6 +256,12 @@ void kvm_reset_vcpu(struct kvm_vcpu *vcpu) preempt_enable(); } +u32 kvm_get_pa_bits(struct kvm *kvm) +{ + /* Fixed limit until we can configure ID_AA64MMFR0.PARange */ + return kvm_ipa_limit; +} + u32 get_kvm_ipa_limit(void) { return kvm_ipa_limit; diff --git a/arch/arm64/kvm/stacktrace.c b/arch/arm64/kvm/stacktrace.c index 3ace5b75813b..af5eec681127 100644 --- a/arch/arm64/kvm/stacktrace.c +++ b/arch/arm64/kvm/stacktrace.c @@ -19,6 +19,7 @@ #include <linux/kvm.h> #include <linux/kvm_host.h> +#include <asm/kvm_mmu.h> #include <asm/stacktrace/nvhe.h> static struct stack_info stackinfo_get_overflow(void) @@ -50,7 +51,7 @@ static struct stack_info stackinfo_get_hyp(void) struct kvm_nvhe_stacktrace_info *stacktrace_info = this_cpu_ptr_nvhe_sym(kvm_stacktrace_info); unsigned long low = (unsigned long)stacktrace_info->stack_base; - unsigned long high = low + PAGE_SIZE; + unsigned long high = low + NVHE_STACK_SIZE; return (struct stack_info) { .low = low, @@ -60,8 +61,8 @@ static struct stack_info stackinfo_get_hyp(void) static struct stack_info stackinfo_get_hyp_kern_va(void) { - unsigned long low = (unsigned long)*this_cpu_ptr(&kvm_arm_hyp_stack_page); - unsigned long high = low + PAGE_SIZE; + unsigned long low = (unsigned long)*this_cpu_ptr(&kvm_arm_hyp_stack_base); + unsigned long high = low + NVHE_STACK_SIZE; return (struct stack_info) { .low = low, @@ -145,7 +146,7 @@ static void unwind(struct unwind_state *state, */ static bool kvm_nvhe_dump_backtrace_entry(void *arg, unsigned long where) { - unsigned long va_mask = GENMASK_ULL(vabits_actual - 1, 0); + unsigned long va_mask = GENMASK_ULL(__hyp_va_bits - 1, 0); unsigned long hyp_offset = (unsigned long)arg; /* Mask tags and convert to kern addr */ diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index c9f4f387155f..005ad28f7306 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -17,7 +17,9 @@ #include <linux/mm.h> #include <linux/printk.h> #include <linux/uaccess.h> +#include <linux/irqchip/arm-gic-v3.h> +#include <asm/arm_pmuv3.h> #include <asm/cacheflush.h> #include <asm/cputype.h> #include <asm/debug-monitors.h> @@ -33,6 +35,7 @@ #include <trace/events/kvm.h> #include "sys_regs.h" +#include "vgic/vgic.h" #include "trace.h" @@ -46,6 +49,13 @@ static u64 sys_reg_to_index(const struct sys_reg_desc *reg); static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 val); +static bool undef_access(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + kvm_inject_undefined(vcpu); + return false; +} + static bool bad_trap(struct kvm_vcpu *vcpu, struct sys_reg_params *params, const struct sys_reg_desc *r, @@ -53,8 +63,7 @@ static bool bad_trap(struct kvm_vcpu *vcpu, { WARN_ONCE(1, "Unexpected %s\n", msg); print_sys_reg_instr(params); - kvm_inject_undefined(vcpu); - return false; + return undef_access(vcpu, params, r); } static bool read_from_write_only(struct kvm_vcpu *vcpu, @@ -102,6 +111,14 @@ static bool get_el2_to_el1_mapping(unsigned int reg, PURE_EL2_SYSREG( RVBAR_EL2 ); PURE_EL2_SYSREG( TPIDR_EL2 ); PURE_EL2_SYSREG( HPFAR_EL2 ); + PURE_EL2_SYSREG( HCRX_EL2 ); + PURE_EL2_SYSREG( HFGRTR_EL2 ); + PURE_EL2_SYSREG( HFGWTR_EL2 ); + PURE_EL2_SYSREG( HFGITR_EL2 ); + PURE_EL2_SYSREG( HDFGRTR_EL2 ); + PURE_EL2_SYSREG( HDFGWTR_EL2 ); + PURE_EL2_SYSREG( HAFGRTR_EL2 ); + PURE_EL2_SYSREG( CNTVOFF_EL2 ); PURE_EL2_SYSREG( CNTHCTL_EL2 ); MAPPED_EL2_SYSREG(SCTLR_EL2, SCTLR_EL1, translate_sctlr_el2_to_sctlr_el1 ); @@ -118,9 +135,15 @@ static bool get_el2_to_el1_mapping(unsigned int reg, MAPPED_EL2_SYSREG(ESR_EL2, ESR_EL1, NULL ); MAPPED_EL2_SYSREG(FAR_EL2, FAR_EL1, NULL ); MAPPED_EL2_SYSREG(MAIR_EL2, MAIR_EL1, NULL ); + MAPPED_EL2_SYSREG(TCR2_EL2, TCR2_EL1, NULL ); + MAPPED_EL2_SYSREG(PIR_EL2, PIR_EL1, NULL ); + MAPPED_EL2_SYSREG(PIRE0_EL2, PIRE0_EL1, NULL ); + MAPPED_EL2_SYSREG(POR_EL2, POR_EL1, NULL ); MAPPED_EL2_SYSREG(AMAIR_EL2, AMAIR_EL1, NULL ); MAPPED_EL2_SYSREG(ELR_EL2, ELR_EL1, NULL ); MAPPED_EL2_SYSREG(SPSR_EL2, SPSR_EL1, NULL ); + MAPPED_EL2_SYSREG(ZCR_EL2, ZCR_EL1, NULL ); + MAPPED_EL2_SYSREG(CONTEXTIDR_EL2, CONTEXTIDR_EL1, NULL ); default: return false; } @@ -140,6 +163,21 @@ u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg) goto memory_read; /* + * CNTHCTL_EL2 requires some special treatment to + * account for the bits that can be set via CNTKCTL_EL1. + */ + switch (reg) { + case CNTHCTL_EL2: + if (vcpu_el2_e2h_is_set(vcpu)) { + val = read_sysreg_el1(SYS_CNTKCTL); + val &= CNTKCTL_VALID_BITS; + val |= __vcpu_sys_reg(vcpu, reg) & ~CNTKCTL_VALID_BITS; + return val; + } + break; + } + + /* * If this register does not have an EL1 counterpart, * then read the stored EL2 version. */ @@ -156,6 +194,9 @@ u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg) /* Get the current version of the EL1 counterpart. */ WARN_ON(!__vcpu_read_sys_reg_from_cpu(el1r, &val)); + if (reg >= __SANITISED_REG_START__) + val = kvm_vcpu_apply_reg_masks(vcpu, reg, val); + return val; } @@ -189,6 +230,19 @@ void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg) */ __vcpu_sys_reg(vcpu, reg) = val; + switch (reg) { + case CNTHCTL_EL2: + /* + * If E2H=0, CNHTCTL_EL2 is a pure shadow register. + * Otherwise, some of the bits are backed by + * CNTKCTL_EL1, while the rest is kept in memory. + * Yes, this is fun stuff. + */ + if (vcpu_el2_e2h_is_set(vcpu)) + write_sysreg_el1(val, SYS_CNTKCTL); + return; + } + /* No EL1 counterpart? We're done here.? */ if (reg == el1r) return; @@ -344,10 +398,8 @@ static bool access_dcgsw(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { - if (!kvm_has_mte(vcpu->kvm)) { - kvm_inject_undefined(vcpu); - return false; - } + if (!kvm_has_mte(vcpu->kvm)) + return undef_access(vcpu, p, r); /* Treat MTE S/W ops as we treat the classic ones: with contempt */ return access_dcsw(vcpu, p, r); @@ -428,6 +480,9 @@ static bool access_gic_sgi(struct kvm_vcpu *vcpu, { bool g1; + if (!kvm_has_gicv3(vcpu->kvm)) + return undef_access(vcpu, p, r); + if (!p->is_write) return read_from_write_only(vcpu, p, r); @@ -471,10 +526,19 @@ static bool access_gic_sre(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { + if (!kvm_has_gicv3(vcpu->kvm)) + return undef_access(vcpu, p, r); + if (p->is_write) return ignore_write(vcpu, p); - p->regval = vcpu->arch.vgic_cpu.vgic_v3.vgic_sre; + if (p->Op1 == 4) { /* ICC_SRE_EL2 */ + p->regval = (ICC_SRE_EL2_ENABLE | ICC_SRE_EL2_SRE | + ICC_SRE_EL1_DIB | ICC_SRE_EL1_DFB); + } else { /* ICC_SRE_EL1 */ + p->regval = vcpu->arch.vgic_cpu.vgic_v3.vgic_sre; + } + return true; } @@ -488,14 +552,6 @@ static bool trap_raz_wi(struct kvm_vcpu *vcpu, return read_zero(vcpu, p); } -static bool trap_undef(struct kvm_vcpu *vcpu, - struct sys_reg_params *p, - const struct sys_reg_desc *r) -{ - kvm_inject_undefined(vcpu); - return false; -} - /* * ARMv8.1 mandates at least a trivial LORegion implementation, where all the * RW registers are RES0 (which we can implement as RAZ/WI). On an ARMv8.0 @@ -508,10 +564,8 @@ static bool trap_loregion(struct kvm_vcpu *vcpu, { u32 sr = reg_to_encoding(r); - if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, LO, IMP)) { - kvm_inject_undefined(vcpu); - return false; - } + if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, LO, IMP)) + return undef_access(vcpu, p, r); if (p->is_write && sr == SYS_LORID_EL1) return write_to_read_only(vcpu, p, r); @@ -523,17 +577,10 @@ static bool trap_oslar_el1(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { - u64 oslsr; - if (!p->is_write) return read_from_write_only(vcpu, p, r); - /* Forward the OSLK bit to OSLSR */ - oslsr = __vcpu_sys_reg(vcpu, OSLSR_EL1) & ~OSLSR_EL1_OSLK; - if (p->regval & OSLAR_EL1_OSLK) - oslsr |= OSLSR_EL1_OSLK; - - __vcpu_sys_reg(vcpu, OSLSR_EL1) = oslsr; + kvm_debug_handle_oslar(vcpu, p->regval); return true; } @@ -574,43 +621,13 @@ static bool trap_dbgauthstatus_el1(struct kvm_vcpu *vcpu, } } -/* - * We want to avoid world-switching all the DBG registers all the - * time: - * - * - If we've touched any debug register, it is likely that we're - * going to touch more of them. It then makes sense to disable the - * traps and start doing the save/restore dance - * - If debug is active (DBG_MDSCR_KDE or DBG_MDSCR_MDE set), it is - * then mandatory to save/restore the registers, as the guest - * depends on them. - * - * For this, we use a DIRTY bit, indicating the guest has modified the - * debug registers, used as follow: - * - * On guest entry: - * - If the dirty bit is set (because we're coming back from trapping), - * disable the traps, save host registers, restore guest registers. - * - If debug is actively in use (DBG_MDSCR_KDE or DBG_MDSCR_MDE set), - * set the dirty bit, disable the traps, save host registers, - * restore guest registers. - * - Otherwise, enable the traps - * - * On guest exit: - * - If the dirty bit is set, save guest registers, restore host - * registers and clear the dirty bit. This ensure that the host can - * now use the debug registers. - */ static bool trap_debug_regs(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { access_rw(vcpu, p, r); - if (p->is_write) - vcpu_set_flag(vcpu, DEBUG_DIRTY); - - trace_trap_reg(__func__, r->reg, p->is_write, p->regval); + kvm_debug_set_guest_ownership(vcpu); return true; } @@ -619,9 +636,6 @@ static bool trap_debug_regs(struct kvm_vcpu *vcpu, * * A 32 bit write to a debug register leave top bits alone * A 32 bit read from a debug register only returns the bottom bits - * - * All writes will set the DEBUG_DIRTY flag to ensure the hyp code - * switches between host and guest values in future. */ static void reg_to_dbg(struct kvm_vcpu *vcpu, struct sys_reg_params *p, @@ -636,8 +650,6 @@ static void reg_to_dbg(struct kvm_vcpu *vcpu, val &= ~mask; val |= (p->regval & (mask >> shift)) << shift; *dbg_reg = val; - - vcpu_set_flag(vcpu, DEBUG_DIRTY); } static void dbg_to_reg(struct kvm_vcpu *vcpu, @@ -651,152 +663,79 @@ static void dbg_to_reg(struct kvm_vcpu *vcpu, p->regval = (*dbg_reg & mask) >> shift; } -static bool trap_bvr(struct kvm_vcpu *vcpu, - struct sys_reg_params *p, - const struct sys_reg_desc *rd) -{ - u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm]; - - if (p->is_write) - reg_to_dbg(vcpu, p, rd, dbg_reg); - else - dbg_to_reg(vcpu, p, rd, dbg_reg); - - trace_trap_reg(__func__, rd->CRm, p->is_write, *dbg_reg); - - return true; -} - -static int set_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - u64 val) +static u64 *demux_wb_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { - vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm] = val; - return 0; -} + struct kvm_guest_debug_arch *dbg = &vcpu->arch.vcpu_debug_state; -static int get_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - u64 *val) -{ - *val = vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm]; - return 0; + switch (rd->Op2) { + case 0b100: + return &dbg->dbg_bvr[rd->CRm]; + case 0b101: + return &dbg->dbg_bcr[rd->CRm]; + case 0b110: + return &dbg->dbg_wvr[rd->CRm]; + case 0b111: + return &dbg->dbg_wcr[rd->CRm]; + default: + KVM_BUG_ON(1, vcpu->kvm); + return NULL; + } } -static u64 reset_bvr(struct kvm_vcpu *vcpu, - const struct sys_reg_desc *rd) +static bool trap_dbg_wb_reg(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *rd) { - vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm] = rd->val; - return rd->val; -} + u64 *reg = demux_wb_reg(vcpu, rd); -static bool trap_bcr(struct kvm_vcpu *vcpu, - struct sys_reg_params *p, - const struct sys_reg_desc *rd) -{ - u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm]; + if (!reg) + return false; if (p->is_write) - reg_to_dbg(vcpu, p, rd, dbg_reg); + reg_to_dbg(vcpu, p, rd, reg); else - dbg_to_reg(vcpu, p, rd, dbg_reg); - - trace_trap_reg(__func__, rd->CRm, p->is_write, *dbg_reg); + dbg_to_reg(vcpu, p, rd, reg); + kvm_debug_set_guest_ownership(vcpu); return true; } -static int set_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - u64 val) +static int set_dbg_wb_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, + u64 val) { - vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm] = val; - return 0; -} - -static int get_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - u64 *val) -{ - *val = vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm]; - return 0; -} + u64 *reg = demux_wb_reg(vcpu, rd); -static u64 reset_bcr(struct kvm_vcpu *vcpu, - const struct sys_reg_desc *rd) -{ - vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm] = rd->val; - return rd->val; -} - -static bool trap_wvr(struct kvm_vcpu *vcpu, - struct sys_reg_params *p, - const struct sys_reg_desc *rd) -{ - u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm]; - - if (p->is_write) - reg_to_dbg(vcpu, p, rd, dbg_reg); - else - dbg_to_reg(vcpu, p, rd, dbg_reg); - - trace_trap_reg(__func__, rd->CRm, p->is_write, - vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm]); - - return true; -} - -static int set_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - u64 val) -{ - vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm] = val; - return 0; -} + if (!reg) + return -EINVAL; -static int get_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - u64 *val) -{ - *val = vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm]; + *reg = val; return 0; } -static u64 reset_wvr(struct kvm_vcpu *vcpu, - const struct sys_reg_desc *rd) -{ - vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm] = rd->val; - return rd->val; -} - -static bool trap_wcr(struct kvm_vcpu *vcpu, - struct sys_reg_params *p, - const struct sys_reg_desc *rd) +static int get_dbg_wb_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, + u64 *val) { - u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm]; + u64 *reg = demux_wb_reg(vcpu, rd); - if (p->is_write) - reg_to_dbg(vcpu, p, rd, dbg_reg); - else - dbg_to_reg(vcpu, p, rd, dbg_reg); - - trace_trap_reg(__func__, rd->CRm, p->is_write, *dbg_reg); - - return true; -} + if (!reg) + return -EINVAL; -static int set_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - u64 val) -{ - vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm] = val; + *val = *reg; return 0; } -static int get_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - u64 *val) +static u64 reset_dbg_wb_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { - *val = vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm]; - return 0; -} + u64 *reg = demux_wb_reg(vcpu, rd); -static u64 reset_wcr(struct kvm_vcpu *vcpu, - const struct sys_reg_desc *rd) -{ - vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm] = rd->val; + /* + * Bail early if we couldn't find storage for the register, the + * KVM_BUG_ON() in demux_wb_reg() will prevent this VM from ever + * being run. + */ + if (!reg) + return 0; + + *reg = rd->val; return rd->val; } @@ -880,7 +819,7 @@ static u64 reset_pmevtyper(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) static u64 reset_pmselr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { reset_unknown(vcpu, r); - __vcpu_sys_reg(vcpu, r->reg) &= ARMV8_PMU_COUNTER_MASK; + __vcpu_sys_reg(vcpu, r->reg) &= PMSELR_EL0_SEL_MASK; return __vcpu_sys_reg(vcpu, r->reg); } @@ -972,7 +911,7 @@ static bool access_pmselr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, else /* return PMSELR.SEL field */ p->regval = __vcpu_sys_reg(vcpu, PMSELR_EL0) - & ARMV8_PMU_COUNTER_MASK; + & PMSELR_EL0_SEL_MASK; return true; } @@ -1028,6 +967,22 @@ static int get_pmu_evcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, return 0; } +static int set_pmu_evcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 val) +{ + u64 idx; + + if (r->CRn == 9 && r->CRm == 13 && r->Op2 == 0) + /* PMCCNTR_EL0 */ + idx = ARMV8_PMU_CYCLE_IDX; + else + /* PMEVCNTRn_EL0 */ + idx = ((r->CRm & 3) << 3) | (r->Op2 & 7); + + kvm_pmu_set_counter_value_user(vcpu, idx, val); + return 0; +} + static bool access_pmu_evcntr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) @@ -1040,8 +995,8 @@ static bool access_pmu_evcntr(struct kvm_vcpu *vcpu, if (pmu_access_event_counter_el0_disabled(vcpu)) return false; - idx = __vcpu_sys_reg(vcpu, PMSELR_EL0) - & ARMV8_PMU_COUNTER_MASK; + idx = SYS_FIELD_GET(PMSELR_EL0, SEL, + __vcpu_sys_reg(vcpu, PMSELR_EL0)); } else if (r->Op2 == 0) { /* PMCCNTR_EL0 */ if (pmu_access_cycle_counter_el0_disabled(vcpu)) @@ -1091,7 +1046,7 @@ static bool access_pmu_evtyper(struct kvm_vcpu *vcpu, struct sys_reg_params *p, if (r->CRn == 9 && r->CRm == 13 && r->Op2 == 1) { /* PMXEVTYPER_EL0 */ - idx = __vcpu_sys_reg(vcpu, PMSELR_EL0) & ARMV8_PMU_COUNTER_MASK; + idx = SYS_FIELD_GET(PMSELR_EL0, SEL, __vcpu_sys_reg(vcpu, PMSELR_EL0)); reg = PMEVTYPER0_EL0 + idx; } else if (r->CRn == 14 && (r->CRm & 12) == 12) { idx = ((r->CRm & 3) << 3) | (r->Op2 & 7); @@ -1119,32 +1074,17 @@ static bool access_pmu_evtyper(struct kvm_vcpu *vcpu, struct sys_reg_params *p, static int set_pmreg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 val) { - bool set; - - val &= kvm_pmu_valid_counter_mask(vcpu); - - switch (r->reg) { - case PMOVSSET_EL0: - /* CRm[1] being set indicates a SET register, and CLR otherwise */ - set = r->CRm & 2; - break; - default: - /* Op2[0] being set indicates a SET register, and CLR otherwise */ - set = r->Op2 & 1; - break; - } + u64 mask = kvm_pmu_accessible_counter_mask(vcpu); - if (set) - __vcpu_sys_reg(vcpu, r->reg) |= val; - else - __vcpu_sys_reg(vcpu, r->reg) &= ~val; + __vcpu_sys_reg(vcpu, r->reg) = val & mask; + kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu); return 0; } static int get_pmreg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 *val) { - u64 mask = kvm_pmu_valid_counter_mask(vcpu); + u64 mask = kvm_pmu_accessible_counter_mask(vcpu); *val = __vcpu_sys_reg(vcpu, r->reg) & mask; return 0; @@ -1158,19 +1098,17 @@ static bool access_pmcnten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, if (pmu_access_el0_disabled(vcpu)) return false; - mask = kvm_pmu_valid_counter_mask(vcpu); + mask = kvm_pmu_accessible_counter_mask(vcpu); if (p->is_write) { val = p->regval & mask; - if (r->Op2 & 0x1) { + if (r->Op2 & 0x1) /* accessing PMCNTENSET_EL0 */ __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) |= val; - kvm_pmu_enable_counter_mask(vcpu, val); - kvm_vcpu_pmu_restore_guest(vcpu); - } else { + else /* accessing PMCNTENCLR_EL0 */ __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) &= ~val; - kvm_pmu_disable_counter_mask(vcpu, val); - } + + kvm_pmu_reprogram_counter_mask(vcpu, val); } else { p->regval = __vcpu_sys_reg(vcpu, PMCNTENSET_EL0); } @@ -1181,7 +1119,7 @@ static bool access_pmcnten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, static bool access_pminten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { - u64 mask = kvm_pmu_valid_counter_mask(vcpu); + u64 mask = kvm_pmu_accessible_counter_mask(vcpu); if (check_pmu_access_disabled(vcpu, 0)) return false; @@ -1205,7 +1143,7 @@ static bool access_pminten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, static bool access_pmovs(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { - u64 mask = kvm_pmu_valid_counter_mask(vcpu); + u64 mask = kvm_pmu_accessible_counter_mask(vcpu); if (pmu_access_el0_disabled(vcpu)) return false; @@ -1235,7 +1173,7 @@ static bool access_pmswinc(struct kvm_vcpu *vcpu, struct sys_reg_params *p, if (pmu_write_swinc_el0_disabled(vcpu)) return false; - mask = kvm_pmu_valid_counter_mask(vcpu); + mask = kvm_pmu_accessible_counter_mask(vcpu); kvm_pmu_software_increment(vcpu, p->regval & mask); return true; } @@ -1244,10 +1182,8 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { if (p->is_write) { - if (!vcpu_mode_priv(vcpu)) { - kvm_inject_undefined(vcpu); - return false; - } + if (!vcpu_mode_priv(vcpu)) + return undef_access(vcpu, p, r); __vcpu_sys_reg(vcpu, PMUSERENR_EL0) = p->regval & ARMV8_PMU_USERENR_MASK; @@ -1301,19 +1237,25 @@ static int set_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, val |= ARMV8_PMU_PMCR_LC; __vcpu_sys_reg(vcpu, r->reg) = val; + kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu); + return 0; } /* Silly macro to expand the DBG{BCR,BVR,WVR,WCR}n_EL1 registers in one go */ #define DBG_BCR_BVR_WCR_WVR_EL1(n) \ { SYS_DESC(SYS_DBGBVRn_EL1(n)), \ - trap_bvr, reset_bvr, 0, 0, get_bvr, set_bvr }, \ + trap_dbg_wb_reg, reset_dbg_wb_reg, 0, 0, \ + get_dbg_wb_reg, set_dbg_wb_reg }, \ { SYS_DESC(SYS_DBGBCRn_EL1(n)), \ - trap_bcr, reset_bcr, 0, 0, get_bcr, set_bcr }, \ + trap_dbg_wb_reg, reset_dbg_wb_reg, 0, 0, \ + get_dbg_wb_reg, set_dbg_wb_reg }, \ { SYS_DESC(SYS_DBGWVRn_EL1(n)), \ - trap_wvr, reset_wvr, 0, 0, get_wvr, set_wvr }, \ + trap_dbg_wb_reg, reset_dbg_wb_reg, 0, 0, \ + get_dbg_wb_reg, set_dbg_wb_reg }, \ { SYS_DESC(SYS_DBGWCRn_EL1(n)), \ - trap_wcr, reset_wcr, 0, 0, get_wcr, set_wcr } + trap_dbg_wb_reg, reset_dbg_wb_reg, 0, 0, \ + get_dbg_wb_reg, set_dbg_wb_reg } #define PMU_SYS_REG(name) \ SYS_DESC(SYS_##name), .reset = reset_pmu_reg, \ @@ -1323,6 +1265,7 @@ static int set_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, #define PMU_PMEVCNTR_EL0(n) \ { PMU_SYS_REG(PMEVCNTRn_EL0(n)), \ .reset = reset_pmevcntr, .get_user = get_pmu_evcntr, \ + .set_user = set_pmu_evcntr, \ .access = access_pmu_evcntr, .reg = (PMEVCNTR0_EL0 + n), } /* Macro to expand the PMEVTYPERn_EL0 register */ @@ -1331,14 +1274,6 @@ static int set_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, .reset = reset_pmevtyper, \ .access = access_pmu_evtyper, .reg = (PMEVTYPER0_EL0 + n), } -static bool undef_access(struct kvm_vcpu *vcpu, struct sys_reg_params *p, - const struct sys_reg_desc *r) -{ - kvm_inject_undefined(vcpu); - - return false; -} - /* Macro to expand the AMU counter and type registers*/ #define AMU_AMEVCNTR0_EL0(n) { SYS_DESC(SYS_AMEVCNTR0_EL0(n)), undef_access } #define AMU_AMEVTYPER0_EL0(n) { SYS_DESC(SYS_AMEVTYPER0_EL0(n)), undef_access } @@ -1375,30 +1310,149 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu, switch (reg) { case SYS_CNTP_TVAL_EL0: + if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) + tmr = TIMER_HPTIMER; + else + tmr = TIMER_PTIMER; + treg = TIMER_REG_TVAL; + break; + + case SYS_CNTV_TVAL_EL0: + if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) + tmr = TIMER_HVTIMER; + else + tmr = TIMER_VTIMER; + treg = TIMER_REG_TVAL; + break; + case SYS_AARCH32_CNTP_TVAL: + case SYS_CNTP_TVAL_EL02: tmr = TIMER_PTIMER; treg = TIMER_REG_TVAL; break; + + case SYS_CNTV_TVAL_EL02: + tmr = TIMER_VTIMER; + treg = TIMER_REG_TVAL; + break; + + case SYS_CNTHP_TVAL_EL2: + tmr = TIMER_HPTIMER; + treg = TIMER_REG_TVAL; + break; + + case SYS_CNTHV_TVAL_EL2: + tmr = TIMER_HVTIMER; + treg = TIMER_REG_TVAL; + break; + case SYS_CNTP_CTL_EL0: + if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) + tmr = TIMER_HPTIMER; + else + tmr = TIMER_PTIMER; + treg = TIMER_REG_CTL; + break; + + case SYS_CNTV_CTL_EL0: + if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) + tmr = TIMER_HVTIMER; + else + tmr = TIMER_VTIMER; + treg = TIMER_REG_CTL; + break; + case SYS_AARCH32_CNTP_CTL: + case SYS_CNTP_CTL_EL02: tmr = TIMER_PTIMER; treg = TIMER_REG_CTL; break; + + case SYS_CNTV_CTL_EL02: + tmr = TIMER_VTIMER; + treg = TIMER_REG_CTL; + break; + + case SYS_CNTHP_CTL_EL2: + tmr = TIMER_HPTIMER; + treg = TIMER_REG_CTL; + break; + + case SYS_CNTHV_CTL_EL2: + tmr = TIMER_HVTIMER; + treg = TIMER_REG_CTL; + break; + case SYS_CNTP_CVAL_EL0: + if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) + tmr = TIMER_HPTIMER; + else + tmr = TIMER_PTIMER; + treg = TIMER_REG_CVAL; + break; + + case SYS_CNTV_CVAL_EL0: + if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) + tmr = TIMER_HVTIMER; + else + tmr = TIMER_VTIMER; + treg = TIMER_REG_CVAL; + break; + case SYS_AARCH32_CNTP_CVAL: + case SYS_CNTP_CVAL_EL02: tmr = TIMER_PTIMER; treg = TIMER_REG_CVAL; break; + + case SYS_CNTV_CVAL_EL02: + tmr = TIMER_VTIMER; + treg = TIMER_REG_CVAL; + break; + + case SYS_CNTHP_CVAL_EL2: + tmr = TIMER_HPTIMER; + treg = TIMER_REG_CVAL; + break; + + case SYS_CNTHV_CVAL_EL2: + tmr = TIMER_HVTIMER; + treg = TIMER_REG_CVAL; + break; + case SYS_CNTPCT_EL0: case SYS_CNTPCTSS_EL0: + if (is_hyp_ctxt(vcpu)) + tmr = TIMER_HPTIMER; + else + tmr = TIMER_PTIMER; + treg = TIMER_REG_CNT; + break; + case SYS_AARCH32_CNTPCT: + case SYS_AARCH32_CNTPCTSS: tmr = TIMER_PTIMER; treg = TIMER_REG_CNT; break; + + case SYS_CNTVCT_EL0: + case SYS_CNTVCTSS_EL0: + if (is_hyp_ctxt(vcpu)) + tmr = TIMER_HVTIMER; + else + tmr = TIMER_VTIMER; + treg = TIMER_REG_CNT; + break; + + case SYS_AARCH32_CNTVCT: + case SYS_AARCH32_CNTVCTSS: + tmr = TIMER_VTIMER; + treg = TIMER_REG_CNT; + break; + default: print_sys_reg_msg(p, "%s", "Unhandled trapped timer register"); - kvm_inject_undefined(vcpu); - return false; + return undef_access(vcpu, p, r); } if (p->is_write) @@ -1409,6 +1463,16 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu, return true; } +static bool access_hv_timer(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (!vcpu_el2_e2h_is_set(vcpu)) + return undef_access(vcpu, p, r); + + return access_arch_timer(vcpu, p, r); +} + static s64 kvm_arm64_ftr_safe_value(u32 id, const struct arm64_ftr_bits *ftrp, s64 new, s64 cur) { @@ -1513,6 +1577,9 @@ static u8 pmuver_to_perfmon(u8 pmuver) } } +static u64 sanitise_id_aa64pfr0_el1(const struct kvm_vcpu *vcpu, u64 val); +static u64 sanitise_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val); + /* Read a sanitised cpufeature ID register by sys_reg_desc */ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) @@ -1526,11 +1593,30 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu, val = read_sanitised_ftr_reg(id); switch (id) { + case SYS_ID_AA64DFR0_EL1: + val = sanitise_id_aa64dfr0_el1(vcpu, val); + break; + case SYS_ID_AA64PFR0_EL1: + val = sanitise_id_aa64pfr0_el1(vcpu, val); + break; case SYS_ID_AA64PFR1_EL1: if (!kvm_has_mte(vcpu->kvm)) val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE); val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_SME); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_RNDR_trap); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_NMI); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE_frac); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_GCS); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_THE); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTEX); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_DF2); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_PFAR); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MPAM_frac); + break; + case SYS_ID_AA64PFR2_EL1: + /* We only expose FPMR */ + val &= ID_AA64PFR2_EL1_FPMR; break; case SYS_ID_AA64ISAR1_EL1: if (!vcpu_has_ptrauth(vcpu)) @@ -1543,17 +1629,29 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu, if (!vcpu_has_ptrauth(vcpu)) val &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_APA3) | ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_GPA3)); - if (!cpus_have_final_cap(ARM64_HAS_WFXT)) + if (!cpus_have_final_cap(ARM64_HAS_WFXT) || + has_broken_cntvoff()) val &= ~ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_WFxT); break; + case SYS_ID_AA64ISAR3_EL1: + val &= ID_AA64ISAR3_EL1_FPRCVT | ID_AA64ISAR3_EL1_FAMINMAX; + break; case SYS_ID_AA64MMFR2_EL1: val &= ~ID_AA64MMFR2_EL1_CCIDX_MASK; + val &= ~ID_AA64MMFR2_EL1_NV; + break; + case SYS_ID_AA64MMFR3_EL1: + val &= ID_AA64MMFR3_EL1_TCRX | ID_AA64MMFR3_EL1_S1POE | + ID_AA64MMFR3_EL1_S1PIE; break; case SYS_ID_MMFR4_EL1: val &= ~ARM64_FEATURE_MASK(ID_MMFR4_EL1_CCIDX); break; } + if (vcpu_has_nv(vcpu)) + val = limit_nv_id_reg(vcpu->kvm, id, val); + return val; } @@ -1565,18 +1663,44 @@ static u64 kvm_read_sanitised_id_reg(struct kvm_vcpu *vcpu, static u64 read_id_reg(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { - return IDREG(vcpu->kvm, reg_to_encoding(r)); + return kvm_read_vm_id_reg(vcpu->kvm, reg_to_encoding(r)); +} + +static bool is_feature_id_reg(u32 encoding) +{ + return (sys_reg_Op0(encoding) == 3 && + (sys_reg_Op1(encoding) < 2 || sys_reg_Op1(encoding) == 3) && + sys_reg_CRn(encoding) == 0 && + sys_reg_CRm(encoding) <= 7); } /* * Return true if the register's (Op0, Op1, CRn, CRm, Op2) is - * (3, 0, 0, crm, op2), where 1<=crm<8, 0<=op2<8. + * (3, 0, 0, crm, op2), where 1<=crm<8, 0<=op2<8, which is the range of ID + * registers KVM maintains on a per-VM basis. + * + * Additionally, the implementation ID registers and CTR_EL0 are handled as + * per-VM registers. */ -static inline bool is_id_reg(u32 id) +static inline bool is_vm_ftr_id_reg(u32 id) { - return (sys_reg_Op0(id) == 3 && sys_reg_Op1(id) == 0 && - sys_reg_CRn(id) == 0 && sys_reg_CRm(id) >= 1 && - sys_reg_CRm(id) < 8); + switch (id) { + case SYS_CTR_EL0: + case SYS_MIDR_EL1: + case SYS_REVIDR_EL1: + case SYS_AIDR_EL1: + return true; + default: + return (sys_reg_Op0(id) == 3 && sys_reg_Op1(id) == 0 && + sys_reg_CRn(id) == 0 && sys_reg_CRm(id) >= 1 && + sys_reg_CRm(id) < 8); + + } +} + +static inline bool is_vcpu_ftr_id_reg(u32 id) +{ + return is_feature_id_reg(id) && !is_vm_ftr_id_reg(id); } static inline bool is_aa32_id_reg(u32 id) @@ -1645,11 +1769,26 @@ static unsigned int sve_visibility(const struct kvm_vcpu *vcpu, return REG_HIDDEN; } -static u64 read_sanitised_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, - const struct sys_reg_desc *rd) +static unsigned int sme_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) { - u64 val = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); + if (kvm_has_feat(vcpu->kvm, ID_AA64PFR1_EL1, SME, IMP)) + return 0; + return REG_HIDDEN; +} + +static unsigned int fp8_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + if (kvm_has_fpmr(vcpu->kvm)) + return 0; + + return REG_HIDDEN; +} + +static u64 sanitise_id_aa64pfr0_el1(const struct kvm_vcpu *vcpu, u64 val) +{ if (!vcpu_has_sve(vcpu)) val &= ~ID_AA64PFR0_EL1_SVE_MASK; @@ -1677,24 +1816,18 @@ static u64 read_sanitised_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, val &= ~ID_AA64PFR0_EL1_AMU_MASK; + /* + * MPAM is disabled by default as KVM also needs a set of PARTID to + * program the MPAMVPMx_EL2 PARTID remapping registers with. But some + * older kernels let the guest see the ID bit. + */ + val &= ~ID_AA64PFR0_EL1_MPAM_MASK; + return val; } -#define ID_REG_LIMIT_FIELD_ENUM(val, reg, field, limit) \ -({ \ - u64 __f_val = FIELD_GET(reg##_##field##_MASK, val); \ - (val) &= ~reg##_##field##_MASK; \ - (val) |= FIELD_PREP(reg##_##field##_MASK, \ - min(__f_val, \ - (u64)SYS_FIELD_VALUE(reg, field, limit))); \ - (val); \ -}) - -static u64 read_sanitised_id_aa64dfr0_el1(struct kvm_vcpu *vcpu, - const struct sys_reg_desc *rd) +static u64 sanitise_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val) { - u64 val = read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1); - val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64DFR0_EL1, DebugVer, V8P8); /* @@ -1708,6 +1841,9 @@ static u64 read_sanitised_id_aa64dfr0_el1(struct kvm_vcpu *vcpu, /* Hide SPE from guests */ val &= ~ID_AA64DFR0_EL1_PMSVer_MASK; + /* Hide BRBE from guests */ + val &= ~ID_AA64DFR0_EL1_BRBE_MASK; + return val; } @@ -1748,12 +1884,14 @@ static int set_id_aa64dfr0_el1(struct kvm_vcpu *vcpu, static u64 read_sanitised_id_dfr0_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { - u8 perfmon = pmuver_to_perfmon(kvm_arm_pmu_get_pmuver_limit()); + u8 perfmon; u64 val = read_sanitised_ftr_reg(SYS_ID_DFR0_EL1); val &= ~ID_DFR0_EL1_PerfMon_MASK; - if (kvm_vcpu_has_pmu(vcpu)) + if (kvm_vcpu_has_pmu(vcpu)) { + perfmon = pmuver_to_perfmon(kvm_arm_pmu_get_pmuver_limit()); val |= SYS_FIELD_PREP(ID_DFR0_EL1, PerfMon, perfmon); + } val = ID_REG_LIMIT_FIELD_ENUM(val, ID_DFR0_EL1, CopDbg, Debugv8p8); @@ -1787,6 +1925,101 @@ static int set_id_dfr0_el1(struct kvm_vcpu *vcpu, return set_id_reg(vcpu, rd, val); } +static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, u64 user_val) +{ + u64 hw_val = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); + u64 mpam_mask = ID_AA64PFR0_EL1_MPAM_MASK; + + /* + * Commit 011e5f5bf529f ("arm64/cpufeature: Add remaining feature bits + * in ID_AA64PFR0 register") exposed the MPAM field of AA64PFR0_EL1 to + * guests, but didn't add trap handling. KVM doesn't support MPAM and + * always returns an UNDEF for these registers. The guest must see 0 + * for this field. + * + * But KVM must also accept values from user-space that were provided + * by KVM. On CPUs that support MPAM, permit user-space to write + * the sanitizied value to ID_AA64PFR0_EL1.MPAM, but ignore this field. + */ + if ((hw_val & mpam_mask) == (user_val & mpam_mask)) + user_val &= ~ID_AA64PFR0_EL1_MPAM_MASK; + + return set_id_reg(vcpu, rd, user_val); +} + +static int set_id_aa64pfr1_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, u64 user_val) +{ + u64 hw_val = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1); + u64 mpam_mask = ID_AA64PFR1_EL1_MPAM_frac_MASK; + + /* See set_id_aa64pfr0_el1 for comment about MPAM */ + if ((hw_val & mpam_mask) == (user_val & mpam_mask)) + user_val &= ~ID_AA64PFR1_EL1_MPAM_frac_MASK; + + return set_id_reg(vcpu, rd, user_val); +} + +static int set_id_aa64mmfr0_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, u64 user_val) +{ + u64 sanitized_val = kvm_read_sanitised_id_reg(vcpu, rd); + u64 tgran2_mask = ID_AA64MMFR0_EL1_TGRAN4_2_MASK | + ID_AA64MMFR0_EL1_TGRAN16_2_MASK | + ID_AA64MMFR0_EL1_TGRAN64_2_MASK; + + if (vcpu_has_nv(vcpu) && + ((sanitized_val & tgran2_mask) != (user_val & tgran2_mask))) + return -EINVAL; + + return set_id_reg(vcpu, rd, user_val); +} + +static int set_id_aa64mmfr2_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, u64 user_val) +{ + u64 hw_val = read_sanitised_ftr_reg(SYS_ID_AA64MMFR2_EL1); + u64 nv_mask = ID_AA64MMFR2_EL1_NV_MASK; + + /* + * We made the mistake to expose the now deprecated NV field, + * so allow userspace to write it, but silently ignore it. + */ + if ((hw_val & nv_mask) == (user_val & nv_mask)) + user_val &= ~nv_mask; + + return set_id_reg(vcpu, rd, user_val); +} + +static int set_ctr_el0(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, u64 user_val) +{ + u8 user_L1Ip = SYS_FIELD_GET(CTR_EL0, L1Ip, user_val); + + /* + * Both AIVIVT (0b01) and VPIPT (0b00) are documented as reserved. + * Hence only allow to set VIPT(0b10) or PIPT(0b11) for L1Ip based + * on what hardware reports. + * + * Using a VIPT software model on PIPT will lead to over invalidation, + * but still correct. Hence, we can allow downgrading PIPT to VIPT, + * but not the other way around. This is handled via arm64_ftr_safe_value() + * as CTR_EL0 ftr_bits has L1Ip field with type FTR_EXACT and safe value + * set as VIPT. + */ + switch (user_L1Ip) { + case CTR_EL0_L1Ip_RESERVED_VPIPT: + case CTR_EL0_L1Ip_RESERVED_AIVIVT: + return -EINVAL; + case CTR_EL0_L1Ip_VIPT: + case CTR_EL0_L1Ip_PIPT: + return set_id_reg(vcpu, rd, user_val); + default: + return -ENOENT; + } +} + /* * cpufeature ID register user accessors * @@ -1837,7 +2070,7 @@ static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, ret = arm64_check_features(vcpu, rd, val); if (!ret) - IDREG(vcpu->kvm, id) = val; + kvm_set_vm_id_reg(vcpu->kvm, id, val); mutex_unlock(&vcpu->kvm->arch.config_lock); @@ -1853,6 +2086,18 @@ static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, return ret; } +void kvm_set_vm_id_reg(struct kvm *kvm, u32 reg, u64 val) +{ + u64 *p = __vm_id_reg(&kvm->arch, reg); + + lockdep_assert_held(&kvm->arch.config_lock); + + if (KVM_BUG_ON(kvm_vm_has_ran_once(kvm) || !p, kvm)) + return; + + *p = val; +} + static int get_raz_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 *val) { @@ -1872,7 +2117,7 @@ static bool access_ctr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, if (p->is_write) return write_to_read_only(vcpu, p, r); - p->regval = read_sanitised_ftr_reg(SYS_CTR_EL0); + p->regval = kvm_read_vm_id_reg(vcpu->kvm, SYS_CTR_EL0); return true; } @@ -1935,7 +2180,7 @@ static u64 reset_clidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) * one cache line. */ if (kvm_has_mte(vcpu->kvm)) - clidr |= 2 << CLIDR_TTYPE_SHIFT(loc); + clidr |= 2ULL << CLIDR_TTYPE_SHIFT(loc); __vcpu_sys_reg(vcpu, r->reg) = clidr; @@ -2045,29 +2290,18 @@ static bool bad_redir_trap(struct kvm_vcpu *vcpu, .val = v, \ } -#define EL2_REG_VNCR(name, rst, v) EL2_REG(name, bad_vncr_trap, rst, v) -#define EL2_REG_REDIR(name, rst, v) EL2_REG(name, bad_redir_trap, rst, v) - -/* - * EL{0,1}2 registers are the EL2 view on an EL0 or EL1 register when - * HCR_EL2.E2H==1, and only in the sysreg table for convenience of - * handling traps. Given that, they are always hidden from userspace. - */ -static unsigned int hidden_user_visibility(const struct kvm_vcpu *vcpu, - const struct sys_reg_desc *rd) -{ - return REG_HIDDEN_USER; -} - -#define EL12_REG(name, acc, rst, v) { \ - SYS_DESC(SYS_##name##_EL12), \ +#define EL2_REG_FILTERED(name, acc, rst, v, filter) { \ + SYS_DESC(SYS_##name), \ .access = acc, \ .reset = rst, \ - .reg = name##_EL1, \ + .reg = name, \ + .visibility = filter, \ .val = v, \ - .visibility = hidden_user_visibility, \ } +#define EL2_REG_VNCR(name, rst, v) EL2_REG(name, bad_vncr_trap, rst, v) +#define EL2_REG_REDIR(name, rst, v) EL2_REG(name, bad_redir_trap, rst, v) + /* * Since reset() callback and field val are not used for idregs, they will be * used for specific purposes for idregs. @@ -2079,50 +2313,53 @@ static unsigned int hidden_user_visibility(const struct kvm_vcpu *vcpu, * from userspace. */ +#define ID_DESC_DEFAULT_CALLBACKS \ + .access = access_id_reg, \ + .get_user = get_id_reg, \ + .set_user = set_id_reg, \ + .visibility = id_visibility, \ + .reset = kvm_read_sanitised_id_reg + #define ID_DESC(name) \ SYS_DESC(SYS_##name), \ - .access = access_id_reg, \ - .get_user = get_id_reg \ + ID_DESC_DEFAULT_CALLBACKS /* sys_reg_desc initialiser for known cpufeature ID registers */ #define ID_SANITISED(name) { \ ID_DESC(name), \ - .set_user = set_id_reg, \ - .visibility = id_visibility, \ - .reset = kvm_read_sanitised_id_reg, \ .val = 0, \ } /* sys_reg_desc initialiser for known cpufeature ID registers */ #define AA32_ID_SANITISED(name) { \ ID_DESC(name), \ - .set_user = set_id_reg, \ .visibility = aa32_id_visibility, \ - .reset = kvm_read_sanitised_id_reg, \ .val = 0, \ } /* sys_reg_desc initialiser for writable ID registers */ #define ID_WRITABLE(name, mask) { \ ID_DESC(name), \ - .set_user = set_id_reg, \ - .visibility = id_visibility, \ - .reset = kvm_read_sanitised_id_reg, \ .val = mask, \ } +/* sys_reg_desc initialiser for cpufeature ID registers that need filtering */ +#define ID_FILTERED(sysreg, name, mask) { \ + ID_DESC(sysreg), \ + .set_user = set_##name, \ + .val = (mask), \ +} + /* * sys_reg_desc initialiser for architecturally unallocated cpufeature ID * register with encoding Op0=3, Op1=0, CRn=0, CRm=crm, Op2=op2 * (1 <= crm < 8, 0 <= Op2 < 8). */ #define ID_UNALLOCATED(crm, op2) { \ + .name = "S3_0_0_" #crm "_" #op2, \ Op0(3), Op1(0), CRn(0), CRm(crm), Op2(op2), \ - .access = access_id_reg, \ - .get_user = get_id_reg, \ - .set_user = set_id_reg, \ + ID_DESC_DEFAULT_CALLBACKS, \ .visibility = raz_visibility, \ - .reset = kvm_read_sanitised_id_reg, \ .val = 0, \ } @@ -2133,9 +2370,7 @@ static unsigned int hidden_user_visibility(const struct kvm_vcpu *vcpu, */ #define ID_HIDDEN(name) { \ ID_DESC(name), \ - .set_user = set_id_reg, \ .visibility = raz_visibility, \ - .reset = kvm_read_sanitised_id_reg, \ .val = 0, \ } @@ -2175,6 +2410,18 @@ static bool access_spsr(struct kvm_vcpu *vcpu, return true; } +static bool access_cntkctl_el12(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) + __vcpu_sys_reg(vcpu, CNTKCTL_EL1) = p->regval; + else + p->regval = __vcpu_sys_reg(vcpu, CNTKCTL_EL1); + + return true; +} + static u64 reset_hcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { u64 val = r->val; @@ -2185,6 +2432,274 @@ static u64 reset_hcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) return __vcpu_sys_reg(vcpu, r->reg) = val; } +static unsigned int __el2_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, + unsigned int (*fn)(const struct kvm_vcpu *, + const struct sys_reg_desc *)) +{ + return el2_visibility(vcpu, rd) ?: fn(vcpu, rd); +} + +static unsigned int sve_el2_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + return __el2_visibility(vcpu, rd, sve_visibility); +} + +static bool access_zcr_el2(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + unsigned int vq; + + if (guest_hyp_sve_traps_enabled(vcpu)) { + kvm_inject_nested_sve_trap(vcpu); + return true; + } + + if (!p->is_write) { + p->regval = vcpu_read_sys_reg(vcpu, ZCR_EL2); + return true; + } + + vq = SYS_FIELD_GET(ZCR_ELx, LEN, p->regval) + 1; + vq = min(vq, vcpu_sve_max_vq(vcpu)); + vcpu_write_sys_reg(vcpu, vq - 1, ZCR_EL2); + + return true; +} + +static bool access_gic_vtr(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) + return write_to_read_only(vcpu, p, r); + + p->regval = kvm_vgic_global_state.ich_vtr_el2; + p->regval &= ~(ICH_VTR_EL2_DVIM | + ICH_VTR_EL2_A3V | + ICH_VTR_EL2_IDbits); + p->regval |= ICH_VTR_EL2_nV4; + + return true; +} + +static bool access_gic_misr(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) + return write_to_read_only(vcpu, p, r); + + p->regval = vgic_v3_get_misr(vcpu); + + return true; +} + +static bool access_gic_eisr(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) + return write_to_read_only(vcpu, p, r); + + p->regval = vgic_v3_get_eisr(vcpu); + + return true; +} + +static bool access_gic_elrsr(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) + return write_to_read_only(vcpu, p, r); + + p->regval = vgic_v3_get_elrsr(vcpu); + + return true; +} + +static unsigned int s1poe_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + if (kvm_has_s1poe(vcpu->kvm)) + return 0; + + return REG_HIDDEN; +} + +static unsigned int s1poe_el2_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + return __el2_visibility(vcpu, rd, s1poe_visibility); +} + +static unsigned int tcr2_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + if (kvm_has_tcr2(vcpu->kvm)) + return 0; + + return REG_HIDDEN; +} + +static unsigned int tcr2_el2_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + return __el2_visibility(vcpu, rd, tcr2_visibility); +} + +static unsigned int s1pie_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + if (kvm_has_s1pie(vcpu->kvm)) + return 0; + + return REG_HIDDEN; +} + +static unsigned int s1pie_el2_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + return __el2_visibility(vcpu, rd, s1pie_visibility); +} + +static bool access_mdcr(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u64 old = __vcpu_sys_reg(vcpu, MDCR_EL2); + + if (!access_rw(vcpu, p, r)) + return false; + + /* + * Request a reload of the PMU to enable/disable the counters affected + * by HPME. + */ + if ((old ^ __vcpu_sys_reg(vcpu, MDCR_EL2)) & MDCR_EL2_HPME) + kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu); + + return true; +} + +/* + * For historical (ahem ABI) reasons, KVM treated MIDR_EL1, REVIDR_EL1, and + * AIDR_EL1 as "invariant" registers, meaning userspace cannot change them. + * The values made visible to userspace were the register values of the boot + * CPU. + * + * At the same time, reads from these registers at EL1 previously were not + * trapped, allowing the guest to read the actual hardware value. On big-little + * machines, this means the VM can see different values depending on where a + * given vCPU got scheduled. + * + * These registers are now trapped as collateral damage from SME, and what + * follows attempts to give a user / guest view consistent with the existing + * ABI. + */ +static bool access_imp_id_reg(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) + return write_to_read_only(vcpu, p, r); + + /* + * Return the VM-scoped implementation ID register values if userspace + * has made them writable. + */ + if (test_bit(KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS, &vcpu->kvm->arch.flags)) + return access_id_reg(vcpu, p, r); + + /* + * Otherwise, fall back to the old behavior of returning the value of + * the current CPU. + */ + switch (reg_to_encoding(r)) { + case SYS_REVIDR_EL1: + p->regval = read_sysreg(revidr_el1); + break; + case SYS_AIDR_EL1: + p->regval = read_sysreg(aidr_el1); + break; + default: + WARN_ON_ONCE(1); + } + + return true; +} + +static u64 __ro_after_init boot_cpu_midr_val; +static u64 __ro_after_init boot_cpu_revidr_val; +static u64 __ro_after_init boot_cpu_aidr_val; + +static void init_imp_id_regs(void) +{ + boot_cpu_midr_val = read_sysreg(midr_el1); + boot_cpu_revidr_val = read_sysreg(revidr_el1); + boot_cpu_aidr_val = read_sysreg(aidr_el1); +} + +static u64 reset_imp_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +{ + switch (reg_to_encoding(r)) { + case SYS_MIDR_EL1: + return boot_cpu_midr_val; + case SYS_REVIDR_EL1: + return boot_cpu_revidr_val; + case SYS_AIDR_EL1: + return boot_cpu_aidr_val; + default: + KVM_BUG_ON(1, vcpu->kvm); + return 0; + } +} + +static int set_imp_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 val) +{ + struct kvm *kvm = vcpu->kvm; + u64 expected; + + guard(mutex)(&kvm->arch.config_lock); + + expected = read_id_reg(vcpu, r); + if (expected == val) + return 0; + + if (!test_bit(KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS, &kvm->arch.flags)) + return -EINVAL; + + /* + * Once the VM has started the ID registers are immutable. Reject the + * write if userspace tries to change it. + */ + if (kvm_vm_has_ran_once(kvm)) + return -EBUSY; + + /* + * Any value is allowed for the implementation ID registers so long as + * it is within the writable mask. + */ + if ((val & r->val) != val) + return -EINVAL; + + kvm_set_vm_id_reg(kvm, reg_to_encoding(r), val); + return 0; +} + +#define IMPLEMENTATION_ID(reg, mask) { \ + SYS_DESC(SYS_##reg), \ + .access = access_imp_id_reg, \ + .get_user = get_id_reg, \ + .set_user = set_imp_id_reg, \ + .reset = reset_imp_id_reg, \ + .val = mask, \ +} + /* * Architected system registers. * Important: Must be sorted ascending by Op0, Op1, CRn, CRm, Op2 @@ -2231,9 +2746,11 @@ static const struct sys_reg_desc sys_reg_descs[] = { // DBGDTR[TR]X_EL0 share the same encoding { SYS_DESC(SYS_DBGDTRTX_EL0), trap_raz_wi }, - { SYS_DESC(SYS_DBGVCR32_EL2), trap_undef, reset_val, DBGVCR32_EL2, 0 }, + { SYS_DESC(SYS_DBGVCR32_EL2), undef_access, reset_val, DBGVCR32_EL2, 0 }, + IMPLEMENTATION_ID(MIDR_EL1, GENMASK_ULL(31, 0)), { SYS_DESC(SYS_MPIDR_EL1), NULL, reset_mpidr, MPIDR_EL1 }, + IMPLEMENTATION_ID(REVIDR_EL1, GENMASK_ULL(63, 0)), /* * ID regs: all ID_SANITISED() entries here must have corresponding @@ -2280,34 +2797,52 @@ static const struct sys_reg_desc sys_reg_descs[] = { /* AArch64 ID registers */ /* CRm=4 */ - { SYS_DESC(SYS_ID_AA64PFR0_EL1), - .access = access_id_reg, - .get_user = get_id_reg, - .set_user = set_id_reg, - .reset = read_sanitised_id_aa64pfr0_el1, - .val = ~(ID_AA64PFR0_EL1_AMU | - ID_AA64PFR0_EL1_MPAM | - ID_AA64PFR0_EL1_SVE | - ID_AA64PFR0_EL1_RAS | - ID_AA64PFR0_EL1_GIC | - ID_AA64PFR0_EL1_AdvSIMD | - ID_AA64PFR0_EL1_FP), }, - ID_SANITISED(ID_AA64PFR1_EL1), - ID_UNALLOCATED(4,2), + ID_FILTERED(ID_AA64PFR0_EL1, id_aa64pfr0_el1, + ~(ID_AA64PFR0_EL1_AMU | + ID_AA64PFR0_EL1_MPAM | + ID_AA64PFR0_EL1_SVE | + ID_AA64PFR0_EL1_RAS | + ID_AA64PFR0_EL1_AdvSIMD | + ID_AA64PFR0_EL1_FP)), + ID_FILTERED(ID_AA64PFR1_EL1, id_aa64pfr1_el1, + ~(ID_AA64PFR1_EL1_PFAR | + ID_AA64PFR1_EL1_DF2 | + ID_AA64PFR1_EL1_MTEX | + ID_AA64PFR1_EL1_THE | + ID_AA64PFR1_EL1_GCS | + ID_AA64PFR1_EL1_MTE_frac | + ID_AA64PFR1_EL1_NMI | + ID_AA64PFR1_EL1_RNDR_trap | + ID_AA64PFR1_EL1_SME | + ID_AA64PFR1_EL1_RES0 | + ID_AA64PFR1_EL1_MPAM_frac | + ID_AA64PFR1_EL1_RAS_frac | + ID_AA64PFR1_EL1_MTE)), + ID_WRITABLE(ID_AA64PFR2_EL1, ID_AA64PFR2_EL1_FPMR), ID_UNALLOCATED(4,3), ID_WRITABLE(ID_AA64ZFR0_EL1, ~ID_AA64ZFR0_EL1_RES0), ID_HIDDEN(ID_AA64SMFR0_EL1), ID_UNALLOCATED(4,6), - ID_UNALLOCATED(4,7), + ID_WRITABLE(ID_AA64FPFR0_EL1, ~ID_AA64FPFR0_EL1_RES0), /* CRm=5 */ - { SYS_DESC(SYS_ID_AA64DFR0_EL1), - .access = access_id_reg, - .get_user = get_id_reg, - .set_user = set_id_aa64dfr0_el1, - .reset = read_sanitised_id_aa64dfr0_el1, - .val = ID_AA64DFR0_EL1_PMUVer_MASK | - ID_AA64DFR0_EL1_DebugVer_MASK, }, + /* + * Prior to FEAT_Debugv8.9, the architecture defines context-aware + * breakpoints (CTX_CMPs) as the highest numbered breakpoints (BRPs). + * KVM does not trap + emulate the breakpoint registers, and as such + * cannot support a layout that misaligns with the underlying hardware. + * While it may be possible to describe a subset that aligns with + * hardware, just prevent changes to BRPs and CTX_CMPs altogether for + * simplicity. + * + * See DDI0487K.a, section D2.8.3 Breakpoint types and linking + * of breakpoints for more details. + */ + ID_FILTERED(ID_AA64DFR0_EL1, id_aa64dfr0_el1, + ID_AA64DFR0_EL1_DoubleLock_MASK | + ID_AA64DFR0_EL1_WRPs_MASK | + ID_AA64DFR0_EL1_PMUVer_MASK | + ID_AA64DFR0_EL1_DebugVer_MASK), ID_SANITISED(ID_AA64DFR1_EL1), ID_UNALLOCATED(5,2), ID_UNALLOCATED(5,3), @@ -2325,32 +2860,34 @@ static const struct sys_reg_desc sys_reg_descs[] = { ID_WRITABLE(ID_AA64ISAR2_EL1, ~(ID_AA64ISAR2_EL1_RES0 | ID_AA64ISAR2_EL1_APA3 | ID_AA64ISAR2_EL1_GPA3)), - ID_UNALLOCATED(6,3), + ID_WRITABLE(ID_AA64ISAR3_EL1, (ID_AA64ISAR3_EL1_FPRCVT | + ID_AA64ISAR3_EL1_FAMINMAX)), ID_UNALLOCATED(6,4), ID_UNALLOCATED(6,5), ID_UNALLOCATED(6,6), ID_UNALLOCATED(6,7), /* CRm=7 */ - ID_WRITABLE(ID_AA64MMFR0_EL1, ~(ID_AA64MMFR0_EL1_RES0 | - ID_AA64MMFR0_EL1_TGRAN4_2 | - ID_AA64MMFR0_EL1_TGRAN64_2 | - ID_AA64MMFR0_EL1_TGRAN16_2)), + ID_FILTERED(ID_AA64MMFR0_EL1, id_aa64mmfr0_el1, + ~(ID_AA64MMFR0_EL1_RES0 | + ID_AA64MMFR0_EL1_ASIDBITS)), ID_WRITABLE(ID_AA64MMFR1_EL1, ~(ID_AA64MMFR1_EL1_RES0 | ID_AA64MMFR1_EL1_HCX | - ID_AA64MMFR1_EL1_XNX | ID_AA64MMFR1_EL1_TWED | ID_AA64MMFR1_EL1_XNX | ID_AA64MMFR1_EL1_VH | ID_AA64MMFR1_EL1_VMIDBits)), - ID_WRITABLE(ID_AA64MMFR2_EL1, ~(ID_AA64MMFR2_EL1_RES0 | + ID_FILTERED(ID_AA64MMFR2_EL1, + id_aa64mmfr2_el1, ~(ID_AA64MMFR2_EL1_RES0 | ID_AA64MMFR2_EL1_EVT | ID_AA64MMFR2_EL1_FWB | ID_AA64MMFR2_EL1_IDS | ID_AA64MMFR2_EL1_NV | ID_AA64MMFR2_EL1_CCIDX)), - ID_SANITISED(ID_AA64MMFR3_EL1), - ID_SANITISED(ID_AA64MMFR4_EL1), + ID_WRITABLE(ID_AA64MMFR3_EL1, (ID_AA64MMFR3_EL1_TCRX | + ID_AA64MMFR3_EL1_S1PIE | + ID_AA64MMFR3_EL1_S1POE)), + ID_WRITABLE(ID_AA64MMFR4_EL1, ID_AA64MMFR4_EL1_NV_frac), ID_UNALLOCATED(7,5), ID_UNALLOCATED(7,6), ID_UNALLOCATED(7,7), @@ -2369,7 +2906,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_TTBR0_EL1), access_vm_reg, reset_unknown, TTBR0_EL1 }, { SYS_DESC(SYS_TTBR1_EL1), access_vm_reg, reset_unknown, TTBR1_EL1 }, { SYS_DESC(SYS_TCR_EL1), access_vm_reg, reset_val, TCR_EL1, 0 }, - { SYS_DESC(SYS_TCR2_EL1), access_vm_reg, reset_val, TCR2_EL1, 0 }, + { SYS_DESC(SYS_TCR2_EL1), access_vm_reg, reset_val, TCR2_EL1, 0, + .visibility = tcr2_visibility }, PTRAUTH_KEY(APIA), PTRAUTH_KEY(APIB), @@ -2380,6 +2918,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_SPSR_EL1), access_spsr}, { SYS_DESC(SYS_ELR_EL1), access_elr}, + { SYS_DESC(SYS_ICC_PMR_EL1), undef_access }, + { SYS_DESC(SYS_AFSR0_EL1), access_vm_reg, reset_unknown, AFSR0_EL1 }, { SYS_DESC(SYS_AFSR1_EL1), access_vm_reg, reset_unknown, AFSR1_EL1 }, { SYS_DESC(SYS_ESR_EL1), access_vm_reg, reset_unknown, ESR_EL1 }, @@ -2421,31 +2961,51 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_PMMIR_EL1), trap_raz_wi }, { SYS_DESC(SYS_MAIR_EL1), access_vm_reg, reset_unknown, MAIR_EL1 }, - { SYS_DESC(SYS_PIRE0_EL1), NULL, reset_unknown, PIRE0_EL1 }, - { SYS_DESC(SYS_PIR_EL1), NULL, reset_unknown, PIR_EL1 }, + { SYS_DESC(SYS_PIRE0_EL1), NULL, reset_unknown, PIRE0_EL1, + .visibility = s1pie_visibility }, + { SYS_DESC(SYS_PIR_EL1), NULL, reset_unknown, PIR_EL1, + .visibility = s1pie_visibility }, + { SYS_DESC(SYS_POR_EL1), NULL, reset_unknown, POR_EL1, + .visibility = s1poe_visibility }, { SYS_DESC(SYS_AMAIR_EL1), access_vm_reg, reset_amair_el1, AMAIR_EL1 }, { SYS_DESC(SYS_LORSA_EL1), trap_loregion }, { SYS_DESC(SYS_LOREA_EL1), trap_loregion }, { SYS_DESC(SYS_LORN_EL1), trap_loregion }, { SYS_DESC(SYS_LORC_EL1), trap_loregion }, + { SYS_DESC(SYS_MPAMIDR_EL1), undef_access }, { SYS_DESC(SYS_LORID_EL1), trap_loregion }, + { SYS_DESC(SYS_MPAM1_EL1), undef_access }, + { SYS_DESC(SYS_MPAM0_EL1), undef_access }, { SYS_DESC(SYS_VBAR_EL1), access_rw, reset_val, VBAR_EL1, 0 }, { SYS_DESC(SYS_DISR_EL1), NULL, reset_val, DISR_EL1, 0 }, - { SYS_DESC(SYS_ICC_IAR0_EL1), write_to_read_only }, - { SYS_DESC(SYS_ICC_EOIR0_EL1), read_from_write_only }, - { SYS_DESC(SYS_ICC_HPPIR0_EL1), write_to_read_only }, - { SYS_DESC(SYS_ICC_DIR_EL1), read_from_write_only }, - { SYS_DESC(SYS_ICC_RPR_EL1), write_to_read_only }, + { SYS_DESC(SYS_ICC_IAR0_EL1), undef_access }, + { SYS_DESC(SYS_ICC_EOIR0_EL1), undef_access }, + { SYS_DESC(SYS_ICC_HPPIR0_EL1), undef_access }, + { SYS_DESC(SYS_ICC_BPR0_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP0R0_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP0R1_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP0R2_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP0R3_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP1R0_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP1R1_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP1R2_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP1R3_EL1), undef_access }, + { SYS_DESC(SYS_ICC_DIR_EL1), undef_access }, + { SYS_DESC(SYS_ICC_RPR_EL1), undef_access }, { SYS_DESC(SYS_ICC_SGI1R_EL1), access_gic_sgi }, { SYS_DESC(SYS_ICC_ASGI1R_EL1), access_gic_sgi }, { SYS_DESC(SYS_ICC_SGI0R_EL1), access_gic_sgi }, - { SYS_DESC(SYS_ICC_IAR1_EL1), write_to_read_only }, - { SYS_DESC(SYS_ICC_EOIR1_EL1), read_from_write_only }, - { SYS_DESC(SYS_ICC_HPPIR1_EL1), write_to_read_only }, + { SYS_DESC(SYS_ICC_IAR1_EL1), undef_access }, + { SYS_DESC(SYS_ICC_EOIR1_EL1), undef_access }, + { SYS_DESC(SYS_ICC_HPPIR1_EL1), undef_access }, + { SYS_DESC(SYS_ICC_BPR1_EL1), undef_access }, + { SYS_DESC(SYS_ICC_CTLR_EL1), undef_access }, { SYS_DESC(SYS_ICC_SRE_EL1), access_gic_sre }, + { SYS_DESC(SYS_ICC_IGRPEN0_EL1), undef_access }, + { SYS_DESC(SYS_ICC_IGRPEN1_EL1), undef_access }, { SYS_DESC(SYS_CONTEXTIDR_EL1), access_vm_reg, reset_val, CONTEXTIDR_EL1, 0 }, { SYS_DESC(SYS_TPIDR_EL1), NULL, reset_unknown, TPIDR_EL1 }, @@ -2458,12 +3018,19 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_CCSIDR_EL1), access_ccsidr }, { SYS_DESC(SYS_CLIDR_EL1), access_clidr, reset_clidr, CLIDR_EL1, - .set_user = set_clidr }, + .set_user = set_clidr, .val = ~CLIDR_EL1_RES0 }, { SYS_DESC(SYS_CCSIDR2_EL1), undef_access }, { SYS_DESC(SYS_SMIDR_EL1), undef_access }, + IMPLEMENTATION_ID(AIDR_EL1, GENMASK_ULL(63, 0)), { SYS_DESC(SYS_CSSELR_EL1), access_csselr, reset_unknown, CSSELR_EL1 }, - { SYS_DESC(SYS_CTR_EL0), access_ctr }, - { SYS_DESC(SYS_SVCR), undef_access }, + ID_FILTERED(CTR_EL0, ctr_el0, + CTR_EL0_DIC_MASK | + CTR_EL0_IDC_MASK | + CTR_EL0_DminLine_MASK | + CTR_EL0_L1Ip_MASK | + CTR_EL0_IminLine_MASK), + { SYS_DESC(SYS_SVCR), undef_access, reset_val, SVCR, 0, .visibility = sme_visibility }, + { SYS_DESC(SYS_FPMR), undef_access, reset_val, FPMR, 0, .visibility = fp8_visibility }, { PMU_SYS_REG(PMCR_EL0), .access = access_pmcr, .reset = reset_pmcr, .reg = PMCR_EL0, .get_user = get_pmcr, .set_user = set_pmcr }, @@ -2491,7 +3058,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { .access = access_pmceid, .reset = NULL }, { PMU_SYS_REG(PMCCNTR_EL0), .access = access_pmu_evcntr, .reset = reset_unknown, - .reg = PMCCNTR_EL0, .get_user = get_pmu_evcntr}, + .reg = PMCCNTR_EL0, .get_user = get_pmu_evcntr, + .set_user = set_pmu_evcntr }, { PMU_SYS_REG(PMXEVTYPER_EL0), .access = access_pmu_evtyper, .reset = NULL }, { PMU_SYS_REG(PMXEVCNTR_EL0), @@ -2506,6 +3074,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { .access = access_pmovs, .reg = PMOVSSET_EL0, .get_user = get_pmreg, .set_user = set_pmreg }, + { SYS_DESC(SYS_POR_EL0), NULL, reset_unknown, POR_EL0, + .visibility = s1poe_visibility }, { SYS_DESC(SYS_TPIDR_EL0), NULL, reset_unknown, TPIDR_EL0 }, { SYS_DESC(SYS_TPIDRRO_EL0), NULL, reset_unknown, TPIDRRO_EL0 }, { SYS_DESC(SYS_TPIDR2_EL0), undef_access }, @@ -2586,11 +3156,17 @@ static const struct sys_reg_desc sys_reg_descs[] = { AMU_AMEVTYPER1_EL0(15), { SYS_DESC(SYS_CNTPCT_EL0), access_arch_timer }, + { SYS_DESC(SYS_CNTVCT_EL0), access_arch_timer }, { SYS_DESC(SYS_CNTPCTSS_EL0), access_arch_timer }, + { SYS_DESC(SYS_CNTVCTSS_EL0), access_arch_timer }, { SYS_DESC(SYS_CNTP_TVAL_EL0), access_arch_timer }, { SYS_DESC(SYS_CNTP_CTL_EL0), access_arch_timer }, { SYS_DESC(SYS_CNTP_CVAL_EL0), access_arch_timer }, + { SYS_DESC(SYS_CNTV_TVAL_EL0), access_arch_timer }, + { SYS_DESC(SYS_CNTV_CTL_EL0), access_arch_timer }, + { SYS_DESC(SYS_CNTV_CVAL_EL0), access_arch_timer }, + /* PMEVCNTRn_EL0 */ PMU_PMEVCNTR_EL0(0), PMU_PMEVCNTR_EL0(1), @@ -2667,7 +3243,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { EL2_REG(SCTLR_EL2, access_rw, reset_val, SCTLR_EL2_RES1), EL2_REG(ACTLR_EL2, access_rw, reset_val, 0), EL2_REG_VNCR(HCR_EL2, reset_hcr, 0), - EL2_REG(MDCR_EL2, access_rw, reset_val, 0), + EL2_REG(MDCR_EL2, access_mdcr, reset_val, 0), EL2_REG(CPTR_EL2, access_rw, reset_val, CPTR_NVHE_EL2_RES1), EL2_REG_VNCR(HSTR_EL2, reset_val, 0), EL2_REG_VNCR(HFGRTR_EL2, reset_val, 0), @@ -2675,15 +3251,20 @@ static const struct sys_reg_desc sys_reg_descs[] = { EL2_REG_VNCR(HFGITR_EL2, reset_val, 0), EL2_REG_VNCR(HACR_EL2, reset_val, 0), + EL2_REG_FILTERED(ZCR_EL2, access_zcr_el2, reset_val, 0, + sve_el2_visibility), + EL2_REG_VNCR(HCRX_EL2, reset_val, 0), EL2_REG(TTBR0_EL2, access_rw, reset_val, 0), EL2_REG(TTBR1_EL2, access_rw, reset_val, 0), EL2_REG(TCR_EL2, access_rw, reset_val, TCR_EL2_RES1), + EL2_REG_FILTERED(TCR2_EL2, access_rw, reset_val, TCR2_EL2_RES1, + tcr2_el2_visibility), EL2_REG_VNCR(VTTBR_EL2, reset_val, 0), EL2_REG_VNCR(VTCR_EL2, reset_val, 0), - { SYS_DESC(SYS_DACR32_EL2), trap_undef, reset_unknown, DACR32_EL2 }, + { SYS_DESC(SYS_DACR32_EL2), undef_access, reset_unknown, DACR32_EL2 }, EL2_REG_VNCR(HDFGRTR_EL2, reset_val, 0), EL2_REG_VNCR(HDFGWTR_EL2, reset_val, 0), EL2_REG_VNCR(HAFGRTR_EL2, reset_val, 0), @@ -2692,55 +3273,584 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_SP_EL1), access_sp_el1}, /* AArch32 SPSR_* are RES0 if trapped from a NV guest */ - { SYS_DESC(SYS_SPSR_irq), .access = trap_raz_wi, - .visibility = hidden_user_visibility }, - { SYS_DESC(SYS_SPSR_abt), .access = trap_raz_wi, - .visibility = hidden_user_visibility }, - { SYS_DESC(SYS_SPSR_und), .access = trap_raz_wi, - .visibility = hidden_user_visibility }, - { SYS_DESC(SYS_SPSR_fiq), .access = trap_raz_wi, - .visibility = hidden_user_visibility }, - - { SYS_DESC(SYS_IFSR32_EL2), trap_undef, reset_unknown, IFSR32_EL2 }, + { SYS_DESC(SYS_SPSR_irq), .access = trap_raz_wi }, + { SYS_DESC(SYS_SPSR_abt), .access = trap_raz_wi }, + { SYS_DESC(SYS_SPSR_und), .access = trap_raz_wi }, + { SYS_DESC(SYS_SPSR_fiq), .access = trap_raz_wi }, + + { SYS_DESC(SYS_IFSR32_EL2), undef_access, reset_unknown, IFSR32_EL2 }, EL2_REG(AFSR0_EL2, access_rw, reset_val, 0), EL2_REG(AFSR1_EL2, access_rw, reset_val, 0), EL2_REG_REDIR(ESR_EL2, reset_val, 0), - { SYS_DESC(SYS_FPEXC32_EL2), trap_undef, reset_val, FPEXC32_EL2, 0x700 }, + { SYS_DESC(SYS_FPEXC32_EL2), undef_access, reset_val, FPEXC32_EL2, 0x700 }, EL2_REG_REDIR(FAR_EL2, reset_val, 0), EL2_REG(HPFAR_EL2, access_rw, reset_val, 0), EL2_REG(MAIR_EL2, access_rw, reset_val, 0), + EL2_REG_FILTERED(PIRE0_EL2, access_rw, reset_val, 0, + s1pie_el2_visibility), + EL2_REG_FILTERED(PIR_EL2, access_rw, reset_val, 0, + s1pie_el2_visibility), + EL2_REG_FILTERED(POR_EL2, access_rw, reset_val, 0, + s1poe_el2_visibility), EL2_REG(AMAIR_EL2, access_rw, reset_val, 0), + { SYS_DESC(SYS_MPAMHCR_EL2), undef_access }, + { SYS_DESC(SYS_MPAMVPMV_EL2), undef_access }, + { SYS_DESC(SYS_MPAM2_EL2), undef_access }, + { SYS_DESC(SYS_MPAMVPM0_EL2), undef_access }, + { SYS_DESC(SYS_MPAMVPM1_EL2), undef_access }, + { SYS_DESC(SYS_MPAMVPM2_EL2), undef_access }, + { SYS_DESC(SYS_MPAMVPM3_EL2), undef_access }, + { SYS_DESC(SYS_MPAMVPM4_EL2), undef_access }, + { SYS_DESC(SYS_MPAMVPM5_EL2), undef_access }, + { SYS_DESC(SYS_MPAMVPM6_EL2), undef_access }, + { SYS_DESC(SYS_MPAMVPM7_EL2), undef_access }, EL2_REG(VBAR_EL2, access_rw, reset_val, 0), EL2_REG(RVBAR_EL2, access_rw, reset_val, 0), - { SYS_DESC(SYS_RMR_EL2), trap_undef }, + { SYS_DESC(SYS_RMR_EL2), undef_access }, + + EL2_REG_VNCR(ICH_AP0R0_EL2, reset_val, 0), + EL2_REG_VNCR(ICH_AP0R1_EL2, reset_val, 0), + EL2_REG_VNCR(ICH_AP0R2_EL2, reset_val, 0), + EL2_REG_VNCR(ICH_AP0R3_EL2, reset_val, 0), + EL2_REG_VNCR(ICH_AP1R0_EL2, reset_val, 0), + EL2_REG_VNCR(ICH_AP1R1_EL2, reset_val, 0), + EL2_REG_VNCR(ICH_AP1R2_EL2, reset_val, 0), + EL2_REG_VNCR(ICH_AP1R3_EL2, reset_val, 0), + + { SYS_DESC(SYS_ICC_SRE_EL2), access_gic_sre }, + + EL2_REG_VNCR(ICH_HCR_EL2, reset_val, 0), + { SYS_DESC(SYS_ICH_VTR_EL2), access_gic_vtr }, + { SYS_DESC(SYS_ICH_MISR_EL2), access_gic_misr }, + { SYS_DESC(SYS_ICH_EISR_EL2), access_gic_eisr }, + { SYS_DESC(SYS_ICH_ELRSR_EL2), access_gic_elrsr }, + EL2_REG_VNCR(ICH_VMCR_EL2, reset_val, 0), + + EL2_REG_VNCR(ICH_LR0_EL2, reset_val, 0), + EL2_REG_VNCR(ICH_LR1_EL2, reset_val, 0), + EL2_REG_VNCR(ICH_LR2_EL2, reset_val, 0), + EL2_REG_VNCR(ICH_LR3_EL2, reset_val, 0), + EL2_REG_VNCR(ICH_LR4_EL2, reset_val, 0), + EL2_REG_VNCR(ICH_LR5_EL2, reset_val, 0), + EL2_REG_VNCR(ICH_LR6_EL2, reset_val, 0), + EL2_REG_VNCR(ICH_LR7_EL2, reset_val, 0), + EL2_REG_VNCR(ICH_LR8_EL2, reset_val, 0), + EL2_REG_VNCR(ICH_LR9_EL2, reset_val, 0), + EL2_REG_VNCR(ICH_LR10_EL2, reset_val, 0), + EL2_REG_VNCR(ICH_LR11_EL2, reset_val, 0), + EL2_REG_VNCR(ICH_LR12_EL2, reset_val, 0), + EL2_REG_VNCR(ICH_LR13_EL2, reset_val, 0), + EL2_REG_VNCR(ICH_LR14_EL2, reset_val, 0), + EL2_REG_VNCR(ICH_LR15_EL2, reset_val, 0), EL2_REG(CONTEXTIDR_EL2, access_rw, reset_val, 0), EL2_REG(TPIDR_EL2, access_rw, reset_val, 0), EL2_REG_VNCR(CNTVOFF_EL2, reset_val, 0), EL2_REG(CNTHCTL_EL2, access_rw, reset_val, 0), + { SYS_DESC(SYS_CNTHP_TVAL_EL2), access_arch_timer }, + EL2_REG(CNTHP_CTL_EL2, access_arch_timer, reset_val, 0), + EL2_REG(CNTHP_CVAL_EL2, access_arch_timer, reset_val, 0), - EL12_REG(CNTKCTL, access_rw, reset_val, 0), + { SYS_DESC(SYS_CNTHV_TVAL_EL2), access_hv_timer }, + EL2_REG(CNTHV_CTL_EL2, access_hv_timer, reset_val, 0), + EL2_REG(CNTHV_CVAL_EL2, access_hv_timer, reset_val, 0), + + { SYS_DESC(SYS_CNTKCTL_EL12), access_cntkctl_el12 }, + + { SYS_DESC(SYS_CNTP_TVAL_EL02), access_arch_timer }, + { SYS_DESC(SYS_CNTP_CTL_EL02), access_arch_timer }, + { SYS_DESC(SYS_CNTP_CVAL_EL02), access_arch_timer }, + + { SYS_DESC(SYS_CNTV_TVAL_EL02), access_arch_timer }, + { SYS_DESC(SYS_CNTV_CTL_EL02), access_arch_timer }, + { SYS_DESC(SYS_CNTV_CVAL_EL02), access_arch_timer }, EL2_REG(SP_EL2, NULL, reset_unknown, 0), }; +static bool handle_at_s1e01(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u32 op = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); + + __kvm_at_s1e01(vcpu, op, p->regval); + + return true; +} + +static bool handle_at_s1e2(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u32 op = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); + + /* There is no FGT associated with AT S1E2A :-( */ + if (op == OP_AT_S1E2A && + !kvm_has_feat(vcpu->kvm, ID_AA64ISAR2_EL1, ATS1A, IMP)) { + kvm_inject_undefined(vcpu); + return false; + } + + __kvm_at_s1e2(vcpu, op, p->regval); + + return true; +} + +static bool handle_at_s12(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u32 op = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); + + __kvm_at_s12(vcpu, op, p->regval); + + return true; +} + +static bool kvm_supported_tlbi_s12_op(struct kvm_vcpu *vpcu, u32 instr) +{ + struct kvm *kvm = vpcu->kvm; + u8 CRm = sys_reg_CRm(instr); + + if (sys_reg_CRn(instr) == TLBI_CRn_nXS && + !kvm_has_feat(kvm, ID_AA64ISAR1_EL1, XS, IMP)) + return false; + + if (CRm == TLBI_CRm_nROS && + !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, OS)) + return false; + + return true; +} + +static bool handle_alle1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); + + if (!kvm_supported_tlbi_s12_op(vcpu, sys_encoding)) + return undef_access(vcpu, p, r); + + write_lock(&vcpu->kvm->mmu_lock); + + /* + * Drop all shadow S2s, resulting in S1/S2 TLBIs for each of the + * corresponding VMIDs. + */ + kvm_nested_s2_unmap(vcpu->kvm, true); + + write_unlock(&vcpu->kvm->mmu_lock); + + return true; +} + +static bool kvm_supported_tlbi_ipas2_op(struct kvm_vcpu *vpcu, u32 instr) +{ + struct kvm *kvm = vpcu->kvm; + u8 CRm = sys_reg_CRm(instr); + u8 Op2 = sys_reg_Op2(instr); + + if (sys_reg_CRn(instr) == TLBI_CRn_nXS && + !kvm_has_feat(kvm, ID_AA64ISAR1_EL1, XS, IMP)) + return false; + + if (CRm == TLBI_CRm_IPAIS && (Op2 == 2 || Op2 == 6) && + !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, RANGE)) + return false; + + if (CRm == TLBI_CRm_IPAONS && (Op2 == 0 || Op2 == 4) && + !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, OS)) + return false; + + if (CRm == TLBI_CRm_IPAONS && (Op2 == 3 || Op2 == 7) && + !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, RANGE)) + return false; + + return true; +} + +/* Only defined here as this is an internal "abstraction" */ +union tlbi_info { + struct { + u64 start; + u64 size; + } range; + + struct { + u64 addr; + } ipa; + + struct { + u64 addr; + u32 encoding; + } va; +}; + +static void s2_mmu_unmap_range(struct kvm_s2_mmu *mmu, + const union tlbi_info *info) +{ + /* + * The unmap operation is allowed to drop the MMU lock and block, which + * means that @mmu could be used for a different context than the one + * currently being invalidated. + * + * This behavior is still safe, as: + * + * 1) The vCPU(s) that recycled the MMU are responsible for invalidating + * the entire MMU before reusing it, which still honors the intent + * of a TLBI. + * + * 2) Until the guest TLBI instruction is 'retired' (i.e. increment PC + * and ERET to the guest), other vCPUs are allowed to use stale + * translations. + * + * 3) Accidentally unmapping an unrelated MMU context is nonfatal, and + * at worst may cause more aborts for shadow stage-2 fills. + * + * Dropping the MMU lock also implies that shadow stage-2 fills could + * happen behind the back of the TLBI. This is still safe, though, as + * the L1 needs to put its stage-2 in a consistent state before doing + * the TLBI. + */ + kvm_stage2_unmap_range(mmu, info->range.start, info->range.size, true); +} + +static bool handle_vmalls12e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); + u64 limit, vttbr; + + if (!kvm_supported_tlbi_s12_op(vcpu, sys_encoding)) + return undef_access(vcpu, p, r); + + vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2); + limit = BIT_ULL(kvm_get_pa_bits(vcpu->kvm)); + + kvm_s2_mmu_iterate_by_vmid(vcpu->kvm, get_vmid(vttbr), + &(union tlbi_info) { + .range = { + .start = 0, + .size = limit, + }, + }, + s2_mmu_unmap_range); + + return true; +} + +static bool handle_ripas2e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); + u64 vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2); + u64 base, range, tg, num, scale; + int shift; + + if (!kvm_supported_tlbi_ipas2_op(vcpu, sys_encoding)) + return undef_access(vcpu, p, r); + + /* + * Because the shadow S2 structure doesn't necessarily reflect that + * of the guest's S2 (different base granule size, for example), we + * decide to ignore TTL and only use the described range. + */ + tg = FIELD_GET(GENMASK(47, 46), p->regval); + scale = FIELD_GET(GENMASK(45, 44), p->regval); + num = FIELD_GET(GENMASK(43, 39), p->regval); + base = p->regval & GENMASK(36, 0); + + switch(tg) { + case 1: + shift = 12; + break; + case 2: + shift = 14; + break; + case 3: + default: /* IMPDEF: handle tg==0 as 64k */ + shift = 16; + break; + } + + base <<= shift; + range = __TLBI_RANGE_PAGES(num, scale) << shift; + + kvm_s2_mmu_iterate_by_vmid(vcpu->kvm, get_vmid(vttbr), + &(union tlbi_info) { + .range = { + .start = base, + .size = range, + }, + }, + s2_mmu_unmap_range); + + return true; +} + +static void s2_mmu_unmap_ipa(struct kvm_s2_mmu *mmu, + const union tlbi_info *info) +{ + unsigned long max_size; + u64 base_addr; + + /* + * We drop a number of things from the supplied value: + * + * - NS bit: we're non-secure only. + * + * - IPA[51:48]: We don't support 52bit IPA just yet... + * + * And of course, adjust the IPA to be on an actual address. + */ + base_addr = (info->ipa.addr & GENMASK_ULL(35, 0)) << 12; + max_size = compute_tlb_inval_range(mmu, info->ipa.addr); + base_addr &= ~(max_size - 1); + + /* + * See comment in s2_mmu_unmap_range() for why this is allowed to + * reschedule. + */ + kvm_stage2_unmap_range(mmu, base_addr, max_size, true); +} + +static bool handle_ipas2e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); + u64 vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2); + + if (!kvm_supported_tlbi_ipas2_op(vcpu, sys_encoding)) + return undef_access(vcpu, p, r); + + kvm_s2_mmu_iterate_by_vmid(vcpu->kvm, get_vmid(vttbr), + &(union tlbi_info) { + .ipa = { + .addr = p->regval, + }, + }, + s2_mmu_unmap_ipa); + + return true; +} + +static void s2_mmu_tlbi_s1e1(struct kvm_s2_mmu *mmu, + const union tlbi_info *info) +{ + WARN_ON(__kvm_tlbi_s1e2(mmu, info->va.addr, info->va.encoding)); +} + +static bool handle_tlbi_el1(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); + u64 vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2); + + /* + * If we're here, this is because we've trapped on a EL1 TLBI + * instruction that affects the EL1 translation regime while + * we're running in a context that doesn't allow us to let the + * HW do its thing (aka vEL2): + * + * - HCR_EL2.E2H == 0 : a non-VHE guest + * - HCR_EL2.{E2H,TGE} == { 1, 0 } : a VHE guest in guest mode + * + * We don't expect these helpers to ever be called when running + * in a vEL1 context. + */ + + WARN_ON(!vcpu_is_el2(vcpu)); + + if (!kvm_supported_tlbi_s1e1_op(vcpu, sys_encoding)) + return undef_access(vcpu, p, r); + + kvm_s2_mmu_iterate_by_vmid(vcpu->kvm, get_vmid(vttbr), + &(union tlbi_info) { + .va = { + .addr = p->regval, + .encoding = sys_encoding, + }, + }, + s2_mmu_tlbi_s1e1); + + return true; +} + +#define SYS_INSN(insn, access_fn) \ + { \ + SYS_DESC(OP_##insn), \ + .access = (access_fn), \ + } + static struct sys_reg_desc sys_insn_descs[] = { { SYS_DESC(SYS_DC_ISW), access_dcsw }, { SYS_DESC(SYS_DC_IGSW), access_dcgsw }, { SYS_DESC(SYS_DC_IGDSW), access_dcgsw }, + + SYS_INSN(AT_S1E1R, handle_at_s1e01), + SYS_INSN(AT_S1E1W, handle_at_s1e01), + SYS_INSN(AT_S1E0R, handle_at_s1e01), + SYS_INSN(AT_S1E0W, handle_at_s1e01), + SYS_INSN(AT_S1E1RP, handle_at_s1e01), + SYS_INSN(AT_S1E1WP, handle_at_s1e01), + { SYS_DESC(SYS_DC_CSW), access_dcsw }, { SYS_DESC(SYS_DC_CGSW), access_dcgsw }, { SYS_DESC(SYS_DC_CGDSW), access_dcgsw }, { SYS_DESC(SYS_DC_CISW), access_dcsw }, { SYS_DESC(SYS_DC_CIGSW), access_dcgsw }, { SYS_DESC(SYS_DC_CIGDSW), access_dcgsw }, -}; -static const struct sys_reg_desc *first_idreg; + SYS_INSN(TLBI_VMALLE1OS, handle_tlbi_el1), + SYS_INSN(TLBI_VAE1OS, handle_tlbi_el1), + SYS_INSN(TLBI_ASIDE1OS, handle_tlbi_el1), + SYS_INSN(TLBI_VAAE1OS, handle_tlbi_el1), + SYS_INSN(TLBI_VALE1OS, handle_tlbi_el1), + SYS_INSN(TLBI_VAALE1OS, handle_tlbi_el1), + + SYS_INSN(TLBI_RVAE1IS, handle_tlbi_el1), + SYS_INSN(TLBI_RVAAE1IS, handle_tlbi_el1), + SYS_INSN(TLBI_RVALE1IS, handle_tlbi_el1), + SYS_INSN(TLBI_RVAALE1IS, handle_tlbi_el1), + + SYS_INSN(TLBI_VMALLE1IS, handle_tlbi_el1), + SYS_INSN(TLBI_VAE1IS, handle_tlbi_el1), + SYS_INSN(TLBI_ASIDE1IS, handle_tlbi_el1), + SYS_INSN(TLBI_VAAE1IS, handle_tlbi_el1), + SYS_INSN(TLBI_VALE1IS, handle_tlbi_el1), + SYS_INSN(TLBI_VAALE1IS, handle_tlbi_el1), + + SYS_INSN(TLBI_RVAE1OS, handle_tlbi_el1), + SYS_INSN(TLBI_RVAAE1OS, handle_tlbi_el1), + SYS_INSN(TLBI_RVALE1OS, handle_tlbi_el1), + SYS_INSN(TLBI_RVAALE1OS, handle_tlbi_el1), + + SYS_INSN(TLBI_RVAE1, handle_tlbi_el1), + SYS_INSN(TLBI_RVAAE1, handle_tlbi_el1), + SYS_INSN(TLBI_RVALE1, handle_tlbi_el1), + SYS_INSN(TLBI_RVAALE1, handle_tlbi_el1), + + SYS_INSN(TLBI_VMALLE1, handle_tlbi_el1), + SYS_INSN(TLBI_VAE1, handle_tlbi_el1), + SYS_INSN(TLBI_ASIDE1, handle_tlbi_el1), + SYS_INSN(TLBI_VAAE1, handle_tlbi_el1), + SYS_INSN(TLBI_VALE1, handle_tlbi_el1), + SYS_INSN(TLBI_VAALE1, handle_tlbi_el1), + + SYS_INSN(TLBI_VMALLE1OSNXS, handle_tlbi_el1), + SYS_INSN(TLBI_VAE1OSNXS, handle_tlbi_el1), + SYS_INSN(TLBI_ASIDE1OSNXS, handle_tlbi_el1), + SYS_INSN(TLBI_VAAE1OSNXS, handle_tlbi_el1), + SYS_INSN(TLBI_VALE1OSNXS, handle_tlbi_el1), + SYS_INSN(TLBI_VAALE1OSNXS, handle_tlbi_el1), + + SYS_INSN(TLBI_RVAE1ISNXS, handle_tlbi_el1), + SYS_INSN(TLBI_RVAAE1ISNXS, handle_tlbi_el1), + SYS_INSN(TLBI_RVALE1ISNXS, handle_tlbi_el1), + SYS_INSN(TLBI_RVAALE1ISNXS, handle_tlbi_el1), + + SYS_INSN(TLBI_VMALLE1ISNXS, handle_tlbi_el1), + SYS_INSN(TLBI_VAE1ISNXS, handle_tlbi_el1), + SYS_INSN(TLBI_ASIDE1ISNXS, handle_tlbi_el1), + SYS_INSN(TLBI_VAAE1ISNXS, handle_tlbi_el1), + SYS_INSN(TLBI_VALE1ISNXS, handle_tlbi_el1), + SYS_INSN(TLBI_VAALE1ISNXS, handle_tlbi_el1), + + SYS_INSN(TLBI_RVAE1OSNXS, handle_tlbi_el1), + SYS_INSN(TLBI_RVAAE1OSNXS, handle_tlbi_el1), + SYS_INSN(TLBI_RVALE1OSNXS, handle_tlbi_el1), + SYS_INSN(TLBI_RVAALE1OSNXS, handle_tlbi_el1), + + SYS_INSN(TLBI_RVAE1NXS, handle_tlbi_el1), + SYS_INSN(TLBI_RVAAE1NXS, handle_tlbi_el1), + SYS_INSN(TLBI_RVALE1NXS, handle_tlbi_el1), + SYS_INSN(TLBI_RVAALE1NXS, handle_tlbi_el1), + + SYS_INSN(TLBI_VMALLE1NXS, handle_tlbi_el1), + SYS_INSN(TLBI_VAE1NXS, handle_tlbi_el1), + SYS_INSN(TLBI_ASIDE1NXS, handle_tlbi_el1), + SYS_INSN(TLBI_VAAE1NXS, handle_tlbi_el1), + SYS_INSN(TLBI_VALE1NXS, handle_tlbi_el1), + SYS_INSN(TLBI_VAALE1NXS, handle_tlbi_el1), + + SYS_INSN(AT_S1E2R, handle_at_s1e2), + SYS_INSN(AT_S1E2W, handle_at_s1e2), + SYS_INSN(AT_S12E1R, handle_at_s12), + SYS_INSN(AT_S12E1W, handle_at_s12), + SYS_INSN(AT_S12E0R, handle_at_s12), + SYS_INSN(AT_S12E0W, handle_at_s12), + SYS_INSN(AT_S1E2A, handle_at_s1e2), + + SYS_INSN(TLBI_IPAS2E1IS, handle_ipas2e1is), + SYS_INSN(TLBI_RIPAS2E1IS, handle_ripas2e1is), + SYS_INSN(TLBI_IPAS2LE1IS, handle_ipas2e1is), + SYS_INSN(TLBI_RIPAS2LE1IS, handle_ripas2e1is), + + SYS_INSN(TLBI_ALLE2OS, undef_access), + SYS_INSN(TLBI_VAE2OS, undef_access), + SYS_INSN(TLBI_ALLE1OS, handle_alle1is), + SYS_INSN(TLBI_VALE2OS, undef_access), + SYS_INSN(TLBI_VMALLS12E1OS, handle_vmalls12e1is), + + SYS_INSN(TLBI_RVAE2IS, undef_access), + SYS_INSN(TLBI_RVALE2IS, undef_access), + + SYS_INSN(TLBI_ALLE1IS, handle_alle1is), + SYS_INSN(TLBI_VMALLS12E1IS, handle_vmalls12e1is), + SYS_INSN(TLBI_IPAS2E1OS, handle_ipas2e1is), + SYS_INSN(TLBI_IPAS2E1, handle_ipas2e1is), + SYS_INSN(TLBI_RIPAS2E1, handle_ripas2e1is), + SYS_INSN(TLBI_RIPAS2E1OS, handle_ripas2e1is), + SYS_INSN(TLBI_IPAS2LE1OS, handle_ipas2e1is), + SYS_INSN(TLBI_IPAS2LE1, handle_ipas2e1is), + SYS_INSN(TLBI_RIPAS2LE1, handle_ripas2e1is), + SYS_INSN(TLBI_RIPAS2LE1OS, handle_ripas2e1is), + SYS_INSN(TLBI_RVAE2OS, undef_access), + SYS_INSN(TLBI_RVALE2OS, undef_access), + SYS_INSN(TLBI_RVAE2, undef_access), + SYS_INSN(TLBI_RVALE2, undef_access), + SYS_INSN(TLBI_ALLE1, handle_alle1is), + SYS_INSN(TLBI_VMALLS12E1, handle_vmalls12e1is), + + SYS_INSN(TLBI_IPAS2E1ISNXS, handle_ipas2e1is), + SYS_INSN(TLBI_RIPAS2E1ISNXS, handle_ripas2e1is), + SYS_INSN(TLBI_IPAS2LE1ISNXS, handle_ipas2e1is), + SYS_INSN(TLBI_RIPAS2LE1ISNXS, handle_ripas2e1is), + + SYS_INSN(TLBI_ALLE2OSNXS, undef_access), + SYS_INSN(TLBI_VAE2OSNXS, undef_access), + SYS_INSN(TLBI_ALLE1OSNXS, handle_alle1is), + SYS_INSN(TLBI_VALE2OSNXS, undef_access), + SYS_INSN(TLBI_VMALLS12E1OSNXS, handle_vmalls12e1is), + + SYS_INSN(TLBI_RVAE2ISNXS, undef_access), + SYS_INSN(TLBI_RVALE2ISNXS, undef_access), + SYS_INSN(TLBI_ALLE2ISNXS, undef_access), + SYS_INSN(TLBI_VAE2ISNXS, undef_access), + + SYS_INSN(TLBI_ALLE1ISNXS, handle_alle1is), + SYS_INSN(TLBI_VALE2ISNXS, undef_access), + SYS_INSN(TLBI_VMALLS12E1ISNXS, handle_vmalls12e1is), + SYS_INSN(TLBI_IPAS2E1OSNXS, handle_ipas2e1is), + SYS_INSN(TLBI_IPAS2E1NXS, handle_ipas2e1is), + SYS_INSN(TLBI_RIPAS2E1NXS, handle_ripas2e1is), + SYS_INSN(TLBI_RIPAS2E1OSNXS, handle_ripas2e1is), + SYS_INSN(TLBI_IPAS2LE1OSNXS, handle_ipas2e1is), + SYS_INSN(TLBI_IPAS2LE1NXS, handle_ipas2e1is), + SYS_INSN(TLBI_RIPAS2LE1NXS, handle_ripas2e1is), + SYS_INSN(TLBI_RIPAS2LE1OSNXS, handle_ripas2e1is), + SYS_INSN(TLBI_RVAE2OSNXS, undef_access), + SYS_INSN(TLBI_RVALE2OSNXS, undef_access), + SYS_INSN(TLBI_RVAE2NXS, undef_access), + SYS_INSN(TLBI_RVALE2NXS, undef_access), + SYS_INSN(TLBI_ALLE2NXS, undef_access), + SYS_INSN(TLBI_VAE2NXS, undef_access), + SYS_INSN(TLBI_ALLE1NXS, handle_alle1is), + SYS_INSN(TLBI_VALE2NXS, undef_access), + SYS_INSN(TLBI_VMALLS12E1NXS, handle_vmalls12e1is), +}; static bool trap_dbgdidr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, @@ -2749,7 +3859,7 @@ static bool trap_dbgdidr(struct kvm_vcpu *vcpu, if (p->is_write) { return ignore_write(vcpu, p); } else { - u64 dfr = IDREG(vcpu->kvm, SYS_ID_AA64DFR0_EL1); + u64 dfr = kvm_read_vm_id_reg(vcpu->kvm, SYS_ID_AA64DFR0_EL1); u32 el3 = kvm_has_feat(vcpu->kvm, ID_AA64PFR0_EL1, EL3, IMP); p->regval = ((SYS_FIELD_GET(ID_AA64DFR0_EL1, WRPs, dfr) << 28) | @@ -2770,18 +3880,20 @@ static bool trap_dbgdidr(struct kvm_vcpu *vcpu, * None of the other registers share their location, so treat them as * if they were 64bit. */ -#define DBG_BCR_BVR_WCR_WVR(n) \ - /* DBGBVRn */ \ - { AA32(LO), Op1( 0), CRn( 0), CRm((n)), Op2( 4), trap_bvr, NULL, n }, \ - /* DBGBCRn */ \ - { Op1( 0), CRn( 0), CRm((n)), Op2( 5), trap_bcr, NULL, n }, \ - /* DBGWVRn */ \ - { Op1( 0), CRn( 0), CRm((n)), Op2( 6), trap_wvr, NULL, n }, \ - /* DBGWCRn */ \ - { Op1( 0), CRn( 0), CRm((n)), Op2( 7), trap_wcr, NULL, n } - -#define DBGBXVR(n) \ - { AA32(HI), Op1( 0), CRn( 1), CRm((n)), Op2( 1), trap_bvr, NULL, n } +#define DBG_BCR_BVR_WCR_WVR(n) \ + /* DBGBVRn */ \ + { AA32(LO), Op1( 0), CRn( 0), CRm((n)), Op2( 4), \ + trap_dbg_wb_reg, NULL, n }, \ + /* DBGBCRn */ \ + { Op1( 0), CRn( 0), CRm((n)), Op2( 5), trap_dbg_wb_reg, NULL, n }, \ + /* DBGWVRn */ \ + { Op1( 0), CRn( 0), CRm((n)), Op2( 6), trap_dbg_wb_reg, NULL, n }, \ + /* DBGWCRn */ \ + { Op1( 0), CRn( 0), CRm((n)), Op2( 7), trap_dbg_wb_reg, NULL, n } + +#define DBGBXVR(n) \ + { AA32(HI), Op1( 0), CRn( 1), CRm((n)), Op2( 1), \ + trap_dbg_wb_reg, NULL, n } /* * Trapped cp14 registers. We generally ignore most of the external @@ -2916,6 +4028,7 @@ static const struct sys_reg_desc cp15_regs[] = { /* TTBCR2 */ { AA32(HI), Op1( 0), CRn( 2), CRm( 0), Op2( 3), access_vm_reg, NULL, TCR_EL1 }, { Op1( 0), CRn( 3), CRm( 0), Op2( 0), access_vm_reg, NULL, DACR32_EL2 }, + { CP15_SYS_DESC(SYS_ICC_PMR_EL1), undef_access }, /* DFSR */ { Op1( 0), CRn( 5), CRm( 0), Op2( 0), access_vm_reg, NULL, ESR_EL1 }, { Op1( 0), CRn( 5), CRm( 0), Op2( 1), access_vm_reg, NULL, IFSR32_EL2 }, @@ -2965,8 +4078,28 @@ static const struct sys_reg_desc cp15_regs[] = { /* AMAIR1 */ { AA32(HI), Op1( 0), CRn(10), CRm( 3), Op2( 1), access_vm_reg, NULL, AMAIR_EL1 }, - /* ICC_SRE */ - { Op1( 0), CRn(12), CRm(12), Op2( 5), access_gic_sre }, + { CP15_SYS_DESC(SYS_ICC_IAR0_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_EOIR0_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_HPPIR0_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_BPR0_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_AP0R0_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_AP0R1_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_AP0R2_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_AP0R3_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_AP1R0_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_AP1R1_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_AP1R2_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_AP1R3_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_DIR_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_RPR_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_IAR1_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_EOIR1_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_HPPIR1_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_BPR1_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_CTLR_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_SRE_EL1), access_gic_sre }, + { CP15_SYS_DESC(SYS_ICC_IGRPEN0_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_IGRPEN1_EL1), undef_access }, { Op1( 0), CRn(13), CRm( 0), Op2( 1), access_vm_reg, NULL, CONTEXTIDR_EL1 }, @@ -3057,9 +4190,11 @@ static const struct sys_reg_desc cp15_64_regs[] = { { SYS_DESC(SYS_AARCH32_CNTPCT), access_arch_timer }, { Op1( 1), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, TTBR1_EL1 }, { Op1( 1), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_ASGI1R */ + { SYS_DESC(SYS_AARCH32_CNTVCT), access_arch_timer }, { Op1( 2), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_SGI0R */ { SYS_DESC(SYS_AARCH32_CNTP_CVAL), access_arch_timer }, { SYS_DESC(SYS_AARCH32_CNTPCTSS), access_arch_timer }, + { SYS_DESC(SYS_AARCH32_CNTVCTSS), access_arch_timer }, }; static bool check_sysreg_table(const struct sys_reg_desc *table, unsigned int n, @@ -3069,12 +4204,14 @@ static bool check_sysreg_table(const struct sys_reg_desc *table, unsigned int n, for (i = 0; i < n; i++) { if (!is_32 && table[i].reg && !table[i].reset) { - kvm_err("sys_reg table %pS entry %d lacks reset\n", &table[i], i); + kvm_err("sys_reg table %pS entry %d (%s) lacks reset\n", + &table[i], i, table[i].name); return false; } if (i && cmp_sys_reg(&table[i-1], &table[i]) >= 0) { - kvm_err("sys_reg table %pS entry %d out of order\n", &table[i - 1], i - 1); + kvm_err("sys_reg table %pS entry %d (%s -> %s) out of order\n", + &table[i], i, table[i - 1].name, table[i].name); return false; } } @@ -3377,9 +4514,13 @@ int kvm_handle_cp15_32(struct kvm_vcpu *vcpu) * Certain AArch32 ID registers are handled by rerouting to the AArch64 * system register table. Registers in the ID range where CRm=0 are * excluded from this scheme as they do not trivially map into AArch64 - * system register encodings. + * system register encodings, except for AIDR/REVIDR. */ - if (params.Op1 == 0 && params.CRn == 0 && params.CRm) + if (params.Op1 == 0 && params.CRn == 0 && + (params.CRm || params.Op2 == 6 /* REVIDR */)) + return kvm_emulate_cp15_id_reg(vcpu, ¶ms); + if (params.Op1 == 1 && params.CRn == 0 && + params.CRm == 0 && params.Op2 == 7 /* AIDR */) return kvm_emulate_cp15_id_reg(vcpu, ¶ms); return kvm_handle_cp_32(vcpu, ¶ms, cp15_regs, ARRAY_SIZE(cp15_regs)); @@ -3425,6 +4566,25 @@ static bool emulate_sys_reg(struct kvm_vcpu *vcpu, return false; } +static const struct sys_reg_desc *idregs_debug_find(struct kvm *kvm, u8 pos) +{ + unsigned long i, idreg_idx = 0; + + for (i = 0; i < ARRAY_SIZE(sys_reg_descs); i++) { + const struct sys_reg_desc *r = &sys_reg_descs[i]; + + if (!is_vm_ftr_id_reg(reg_to_encoding(r))) + continue; + + if (idreg_idx == pos) + return r; + + idreg_idx++; + } + + return NULL; +} + static void *idregs_debug_start(struct seq_file *s, loff_t *pos) { struct kvm *kvm = s->private; @@ -3436,7 +4596,7 @@ static void *idregs_debug_start(struct seq_file *s, loff_t *pos) if (test_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags) && *iter == (u8)~0) { *iter = *pos; - if (*iter >= KVM_ARM_ID_REG_NUM) + if (!idregs_debug_find(kvm, *iter)) iter = NULL; } else { iter = ERR_PTR(-EBUSY); @@ -3453,7 +4613,7 @@ static void *idregs_debug_next(struct seq_file *s, void *v, loff_t *pos) (*pos)++; - if ((kvm->arch.idreg_debugfs_iter + 1) < KVM_ARM_ID_REG_NUM) { + if (idregs_debug_find(kvm, kvm->arch.idreg_debugfs_iter + 1)) { kvm->arch.idreg_debugfs_iter++; return &kvm->arch.idreg_debugfs_iter; @@ -3478,16 +4638,16 @@ static void idregs_debug_stop(struct seq_file *s, void *v) static int idregs_debug_show(struct seq_file *s, void *v) { - struct kvm *kvm = s->private; const struct sys_reg_desc *desc; + struct kvm *kvm = s->private; - desc = first_idreg + kvm->arch.idreg_debugfs_iter; + desc = idregs_debug_find(kvm, kvm->arch.idreg_debugfs_iter); if (!desc->name) return 0; seq_printf(s, "%20s:\t%016llx\n", - desc->name, IDREG(kvm, IDX_IDREG(kvm->arch.idreg_debugfs_iter))); + desc->name, kvm_read_vm_id_reg(kvm, reg_to_encoding(desc))); return 0; } @@ -3509,26 +4669,24 @@ void kvm_sys_regs_create_debugfs(struct kvm *kvm) &idregs_debug_fops); } -static void kvm_reset_id_regs(struct kvm_vcpu *vcpu) +static void reset_vm_ftr_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *reg) { - const struct sys_reg_desc *idreg = first_idreg; - u32 id = reg_to_encoding(idreg); + u32 id = reg_to_encoding(reg); struct kvm *kvm = vcpu->kvm; if (test_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags)) return; - lockdep_assert_held(&kvm->arch.config_lock); - - /* Initialize all idregs */ - while (is_id_reg(id)) { - IDREG(kvm, id) = idreg->reset(vcpu, idreg); + kvm_set_vm_id_reg(kvm, id, reg->reset(vcpu, reg)); +} - idreg++; - id = reg_to_encoding(idreg); - } +static void reset_vcpu_ftr_id_reg(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *reg) +{ + if (kvm_vcpu_initialized(vcpu)) + return; - set_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags); + reg->reset(vcpu, reg); } /** @@ -3540,19 +4698,30 @@ static void kvm_reset_id_regs(struct kvm_vcpu *vcpu) */ void kvm_reset_sys_regs(struct kvm_vcpu *vcpu) { + struct kvm *kvm = vcpu->kvm; unsigned long i; - kvm_reset_id_regs(vcpu); - for (i = 0; i < ARRAY_SIZE(sys_reg_descs); i++) { const struct sys_reg_desc *r = &sys_reg_descs[i]; - if (is_id_reg(reg_to_encoding(r))) + if (!r->reset) continue; - if (r->reset) + if (is_vm_ftr_id_reg(reg_to_encoding(r))) + reset_vm_ftr_id_reg(vcpu, r); + else if (is_vcpu_ftr_id_reg(reg_to_encoding(r))) + reset_vcpu_ftr_id_reg(vcpu, r); + else r->reset(vcpu, r); + + if (r->reg >= __SANITISED_REG_START__ && r->reg < NR_SYS_REGS) + (void)__vcpu_sys_reg(vcpu, r->reg); } + + set_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags); + + if (kvm_vcpu_has_pmu(vcpu)) + kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu); } /** @@ -3658,72 +4827,6 @@ id_to_sys_reg_desc(struct kvm_vcpu *vcpu, u64 id, return r; } -/* - * These are the invariant sys_reg registers: we let the guest see the - * host versions of these, so they're part of the guest state. - * - * A future CPU may provide a mechanism to present different values to - * the guest, or a future kvm may trap them. - */ - -#define FUNCTION_INVARIANT(reg) \ - static u64 get_##reg(struct kvm_vcpu *v, \ - const struct sys_reg_desc *r) \ - { \ - ((struct sys_reg_desc *)r)->val = read_sysreg(reg); \ - return ((struct sys_reg_desc *)r)->val; \ - } - -FUNCTION_INVARIANT(midr_el1) -FUNCTION_INVARIANT(revidr_el1) -FUNCTION_INVARIANT(aidr_el1) - -static u64 get_ctr_el0(struct kvm_vcpu *v, const struct sys_reg_desc *r) -{ - ((struct sys_reg_desc *)r)->val = read_sanitised_ftr_reg(SYS_CTR_EL0); - return ((struct sys_reg_desc *)r)->val; -} - -/* ->val is filled in by kvm_sys_reg_table_init() */ -static struct sys_reg_desc invariant_sys_regs[] __ro_after_init = { - { SYS_DESC(SYS_MIDR_EL1), NULL, get_midr_el1 }, - { SYS_DESC(SYS_REVIDR_EL1), NULL, get_revidr_el1 }, - { SYS_DESC(SYS_AIDR_EL1), NULL, get_aidr_el1 }, - { SYS_DESC(SYS_CTR_EL0), NULL, get_ctr_el0 }, -}; - -static int get_invariant_sys_reg(u64 id, u64 __user *uaddr) -{ - const struct sys_reg_desc *r; - - r = get_reg_by_id(id, invariant_sys_regs, - ARRAY_SIZE(invariant_sys_regs)); - if (!r) - return -ENOENT; - - return put_user(r->val, uaddr); -} - -static int set_invariant_sys_reg(u64 id, u64 __user *uaddr) -{ - const struct sys_reg_desc *r; - u64 val; - - r = get_reg_by_id(id, invariant_sys_regs, - ARRAY_SIZE(invariant_sys_regs)); - if (!r) - return -ENOENT; - - if (get_user(val, uaddr)) - return -EFAULT; - - /* This is what we mean by invariant: you can't change it. */ - if (r->val != val) - return -EINVAL; - - return 0; -} - static int demux_c15_get(struct kvm_vcpu *vcpu, u64 id, void __user *uaddr) { u32 val; @@ -3786,7 +4889,7 @@ int kvm_sys_reg_get_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, int ret; r = id_to_sys_reg_desc(vcpu, reg->id, table, num); - if (!r || sysreg_hidden_user(vcpu, r)) + if (!r || sysreg_hidden(vcpu, r)) return -ENOENT; if (r->get_user) { @@ -3805,15 +4908,10 @@ int kvm_sys_reg_get_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { void __user *uaddr = (void __user *)(unsigned long)reg->addr; - int err; if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_DEMUX) return demux_c15_get(vcpu, reg->id, uaddr); - err = get_invariant_sys_reg(reg->id, uaddr); - if (err != -ENOENT) - return err; - return kvm_sys_reg_get_user(vcpu, reg, sys_reg_descs, ARRAY_SIZE(sys_reg_descs)); } @@ -3830,7 +4928,7 @@ int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, return -EFAULT; r = id_to_sys_reg_desc(vcpu, reg->id, table, num); - if (!r || sysreg_hidden_user(vcpu, r)) + if (!r || sysreg_hidden(vcpu, r)) return -ENOENT; if (sysreg_user_write_ignore(vcpu, r)) @@ -3849,15 +4947,10 @@ int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { void __user *uaddr = (void __user *)(unsigned long)reg->addr; - int err; if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_DEMUX) return demux_c15_set(vcpu, reg->id, uaddr); - err = set_invariant_sys_reg(reg->id, uaddr); - if (err != -ENOENT) - return err; - return kvm_sys_reg_set_user(vcpu, reg, sys_reg_descs, ARRAY_SIZE(sys_reg_descs)); } @@ -3916,7 +5009,7 @@ static int walk_one_sys_reg(const struct kvm_vcpu *vcpu, if (!(rd->reg || rd->get_user)) return 0; - if (sysreg_hidden_user(vcpu, rd)) + if (sysreg_hidden(vcpu, rd)) return 0; if (!copy_reg_to_user(rd, uind)) @@ -3946,23 +5039,14 @@ static int walk_sys_regs(struct kvm_vcpu *vcpu, u64 __user *uind) unsigned long kvm_arm_num_sys_reg_descs(struct kvm_vcpu *vcpu) { - return ARRAY_SIZE(invariant_sys_regs) - + num_demux_regs() + return num_demux_regs() + walk_sys_regs(vcpu, (u64 __user *)NULL); } int kvm_arm_copy_sys_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) { - unsigned int i; int err; - /* Then give them all the invariant registers' indices. */ - for (i = 0; i < ARRAY_SIZE(invariant_sys_regs); i++) { - if (put_user(sys_reg_to_index(&invariant_sys_regs[i]), uindices)) - return -EFAULT; - uindices++; - } - err = walk_sys_regs(vcpu, uindices); if (err < 0) return err; @@ -3978,14 +5062,6 @@ int kvm_arm_copy_sys_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) sys_reg_CRm(r), \ sys_reg_Op2(r)) -static bool is_feature_id_reg(u32 encoding) -{ - return (sys_reg_Op0(encoding) == 3 && - (sys_reg_Op1(encoding) < 2 || sys_reg_Op1(encoding) == 3) && - sys_reg_CRn(encoding) == 0 && - sys_reg_CRm(encoding) <= 7); -} - int kvm_vm_ioctl_get_reg_writable_masks(struct kvm *kvm, struct reg_mask_range *range) { const void *zero_page = page_to_virt(ZERO_PAGE(0)); @@ -4008,20 +5084,11 @@ int kvm_vm_ioctl_get_reg_writable_masks(struct kvm *kvm, struct reg_mask_range * if (!is_feature_id_reg(encoding) || !reg->set_user) continue; - /* - * For ID registers, we return the writable mask. Other feature - * registers return a full 64bit mask. That's not necessary - * compliant with a given revision of the architecture, but the - * RES0/RES1 definitions allow us to do that. - */ - if (is_id_reg(encoding)) { - if (!reg->val || - (is_aa32_id_reg(encoding) && !kvm_supports_32bit_el0())) - continue; - val = reg->val; - } else { - val = ~0UL; + if (!reg->val || + (is_aa32_id_reg(encoding) && !kvm_supports_32bit_el0())) { + continue; } + val = reg->val; if (put_user(val, (masks + KVM_ARM_FEATURE_ID_RANGE_INDEX(encoding)))) return -EFAULT; @@ -4030,11 +5097,34 @@ int kvm_vm_ioctl_get_reg_writable_masks(struct kvm *kvm, struct reg_mask_range * return 0; } -void kvm_init_sysreg(struct kvm_vcpu *vcpu) +static void vcpu_set_hcr(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; - mutex_lock(&kvm->arch.config_lock); + if (has_vhe() || has_hvhe()) + vcpu->arch.hcr_el2 |= HCR_E2H; + if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) { + /* route synchronous external abort exceptions to EL2 */ + vcpu->arch.hcr_el2 |= HCR_TEA; + /* trap error record accesses */ + vcpu->arch.hcr_el2 |= HCR_TERR; + } + + if (cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) + vcpu->arch.hcr_el2 |= HCR_FWB; + + if (cpus_have_final_cap(ARM64_HAS_EVT) && + !cpus_have_final_cap(ARM64_MISMATCHED_CACHE_TYPE) && + kvm_read_vm_id_reg(kvm, SYS_CTR_EL0) == read_sanitised_ftr_reg(SYS_CTR_EL0)) + vcpu->arch.hcr_el2 |= HCR_TID4; + else + vcpu->arch.hcr_el2 |= HCR_TID2; + + if (vcpu_el1_is_32bit(vcpu)) + vcpu->arch.hcr_el2 &= ~HCR_RW; + + if (kvm_has_mte(vcpu->kvm)) + vcpu->arch.hcr_el2 |= HCR_ATA; /* * In the absence of FGT, we cannot independently trap TLBI @@ -4043,13 +5133,16 @@ void kvm_init_sysreg(struct kvm_vcpu *vcpu) */ if (!kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, OS)) vcpu->arch.hcr_el2 |= HCR_TTLBOS; +} - if (cpus_have_final_cap(ARM64_HAS_HCX)) { - vcpu->arch.hcrx_el2 = HCRX_GUEST_FLAGS; +void kvm_calculate_traps(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; - if (kvm_has_feat(kvm, ID_AA64ISAR2_EL1, MOPS, IMP)) - vcpu->arch.hcrx_el2 |= (HCRX_EL2_MSCEn | HCRX_EL2_MCE2); - } + mutex_lock(&kvm->arch.config_lock); + vcpu_set_hcr(vcpu); + vcpu_set_ich_hcr(vcpu); + vcpu_set_hcrx(vcpu); if (test_bit(KVM_ARCH_FLAG_FGU_INITIALIZED, &kvm->arch.flags)) goto out; @@ -4057,8 +5150,6 @@ void kvm_init_sysreg(struct kvm_vcpu *vcpu) kvm->arch.fgu[HFGxTR_GROUP] = (HFGxTR_EL2_nAMAIR2_EL1 | HFGxTR_EL2_nMAIR2_EL1 | HFGxTR_EL2_nS2POR_EL1 | - HFGxTR_EL2_nPOR_EL1 | - HFGxTR_EL2_nPOR_EL0 | HFGxTR_EL2_nACCDATA_EL1 | HFGxTR_EL2_nSMPRI_EL1_MASK | HFGxTR_EL2_nTPIDR2_EL0_MASK); @@ -4089,22 +5180,70 @@ void kvm_init_sysreg(struct kvm_vcpu *vcpu) HFGITR_EL2_TLBIRVAAE1OS | HFGITR_EL2_TLBIRVAE1OS); - if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, S1PIE, IMP)) + if (!kvm_has_feat(kvm, ID_AA64ISAR2_EL1, ATS1A, IMP)) + kvm->arch.fgu[HFGITR_GROUP] |= HFGITR_EL2_ATS1E1A; + + if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, PAN, PAN2)) + kvm->arch.fgu[HFGITR_GROUP] |= (HFGITR_EL2_ATS1E1RP | + HFGITR_EL2_ATS1E1WP); + + if (!kvm_has_s1pie(kvm)) kvm->arch.fgu[HFGxTR_GROUP] |= (HFGxTR_EL2_nPIRE0_EL1 | HFGxTR_EL2_nPIR_EL1); + if (!kvm_has_s1poe(kvm)) + kvm->arch.fgu[HFGxTR_GROUP] |= (HFGxTR_EL2_nPOR_EL1 | + HFGxTR_EL2_nPOR_EL0); + if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, AMU, IMP)) kvm->arch.fgu[HAFGRTR_GROUP] |= ~(HAFGRTR_EL2_RES0 | HAFGRTR_EL2_RES1); + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, BRBE, IMP)) { + kvm->arch.fgu[HDFGRTR_GROUP] |= (HDFGRTR_EL2_nBRBDATA | + HDFGRTR_EL2_nBRBCTL | + HDFGRTR_EL2_nBRBIDR); + kvm->arch.fgu[HFGITR_GROUP] |= (HFGITR_EL2_nBRBINJ | + HFGITR_EL2_nBRBIALL); + } + set_bit(KVM_ARCH_FLAG_FGU_INITIALIZED, &kvm->arch.flags); out: mutex_unlock(&kvm->arch.config_lock); } +/* + * Perform last adjustments to the ID registers that are implied by the + * configuration outside of the ID regs themselves, as well as any + * initialisation that directly depend on these ID registers (such as + * RES0/RES1 behaviours). This is not the place to configure traps though. + * + * Because this can be called once per CPU, changes must be idempotent. + */ +int kvm_finalize_sys_regs(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + + guard(mutex)(&kvm->arch.config_lock); + + if (!(static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif) && + irqchip_in_kernel(kvm) && + kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)) { + kvm->arch.id_regs[IDREG_IDX(SYS_ID_AA64PFR0_EL1)] &= ~ID_AA64PFR0_EL1_GIC_MASK; + kvm->arch.id_regs[IDREG_IDX(SYS_ID_PFR1_EL1)] &= ~ID_PFR1_EL1_GIC_MASK; + } + + if (vcpu_has_nv(vcpu)) { + int ret = kvm_init_nv_sysregs(vcpu); + if (ret) + return ret; + } + + return 0; +} + int __init kvm_sys_reg_table_init(void) { - struct sys_reg_params params; bool valid = true; unsigned int i; int ret = 0; @@ -4115,21 +5254,12 @@ int __init kvm_sys_reg_table_init(void) valid &= check_sysreg_table(cp14_64_regs, ARRAY_SIZE(cp14_64_regs), true); valid &= check_sysreg_table(cp15_regs, ARRAY_SIZE(cp15_regs), true); valid &= check_sysreg_table(cp15_64_regs, ARRAY_SIZE(cp15_64_regs), true); - valid &= check_sysreg_table(invariant_sys_regs, ARRAY_SIZE(invariant_sys_regs), false); valid &= check_sysreg_table(sys_insn_descs, ARRAY_SIZE(sys_insn_descs), false); if (!valid) return -EINVAL; - /* We abuse the reset function to overwrite the table itself. */ - for (i = 0; i < ARRAY_SIZE(invariant_sys_regs); i++) - invariant_sys_regs[i].reset(NULL, &invariant_sys_regs[i]); - - /* Find the first idreg (SYS_ID_PFR0_EL1) in sys_reg_descs. */ - params = encoding_to_params(SYS_ID_PFR0_EL1); - first_idreg = find_reg(¶ms, sys_reg_descs, ARRAY_SIZE(sys_reg_descs)); - if (!first_idreg) - return -EINVAL; + init_imp_id_regs(); ret = populate_nv_trap_config(); diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h index 997eea21ba2a..cc6338d38766 100644 --- a/arch/arm64/kvm/sys_regs.h +++ b/arch/arm64/kvm/sys_regs.h @@ -95,9 +95,8 @@ struct sys_reg_desc { }; #define REG_HIDDEN (1 << 0) /* hidden from userspace and guest */ -#define REG_HIDDEN_USER (1 << 1) /* hidden from userspace only */ -#define REG_RAZ (1 << 2) /* RAZ from userspace and guest */ -#define REG_USER_WI (1 << 3) /* WI from userspace only */ +#define REG_RAZ (1 << 1) /* RAZ from userspace and guest */ +#define REG_USER_WI (1 << 2) /* WI from userspace only */ static __printf(2, 3) inline void print_sys_reg_msg(const struct sys_reg_params *p, @@ -165,15 +164,6 @@ static inline bool sysreg_hidden(const struct kvm_vcpu *vcpu, return sysreg_visibility(vcpu, r) & REG_HIDDEN; } -static inline bool sysreg_hidden_user(const struct kvm_vcpu *vcpu, - const struct sys_reg_desc *r) -{ - if (likely(!r->visibility)) - return false; - - return r->visibility(vcpu, r) & (REG_HIDDEN | REG_HIDDEN_USER); -} - static inline bool sysreg_visible_as_raz(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { @@ -235,6 +225,8 @@ int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, bool triage_sysreg_trap(struct kvm_vcpu *vcpu, int *sr_index); +int kvm_finalize_sys_regs(struct kvm_vcpu *vcpu); + #define AA32(_x) .aarch32_map = AA32_##_x #define Op0(_x) .Op0 = _x #define Op1(_x) .Op1 = _x @@ -248,4 +240,21 @@ bool triage_sysreg_trap(struct kvm_vcpu *vcpu, int *sr_index); CRn(sys_reg_CRn(reg)), CRm(sys_reg_CRm(reg)), \ Op2(sys_reg_Op2(reg)) +#define CP15_SYS_DESC(reg) \ + .name = #reg, \ + .aarch32_map = AA32_DIRECT, \ + Op0(0), Op1(sys_reg_Op1(reg)), \ + CRn(sys_reg_CRn(reg)), CRm(sys_reg_CRm(reg)), \ + Op2(sys_reg_Op2(reg)) + +#define ID_REG_LIMIT_FIELD_ENUM(val, reg, field, limit) \ +({ \ + u64 __f_val = FIELD_GET(reg##_##field##_MASK, val); \ + (val) &= ~reg##_##field##_MASK; \ + (val) |= FIELD_PREP(reg##_##field##_MASK, \ + min(__f_val, \ + (u64)SYS_FIELD_VALUE(reg, field, limit))); \ + (val); \ +}) + #endif /* __ARM64_KVM_SYS_REGS_LOCAL_H__ */ diff --git a/arch/arm64/kvm/trace_handle_exit.h b/arch/arm64/kvm/trace_handle_exit.h index 064a58c19f48..f85415db7713 100644 --- a/arch/arm64/kvm/trace_handle_exit.h +++ b/arch/arm64/kvm/trace_handle_exit.h @@ -46,38 +46,6 @@ TRACE_EVENT(kvm_hvc_arm64, __entry->vcpu_pc, __entry->r0, __entry->imm) ); -TRACE_EVENT(kvm_arm_setup_debug, - TP_PROTO(struct kvm_vcpu *vcpu, __u32 guest_debug), - TP_ARGS(vcpu, guest_debug), - - TP_STRUCT__entry( - __field(struct kvm_vcpu *, vcpu) - __field(__u32, guest_debug) - ), - - TP_fast_assign( - __entry->vcpu = vcpu; - __entry->guest_debug = guest_debug; - ), - - TP_printk("vcpu: %p, flags: 0x%08x", __entry->vcpu, __entry->guest_debug) -); - -TRACE_EVENT(kvm_arm_clear_debug, - TP_PROTO(__u32 guest_debug), - TP_ARGS(guest_debug), - - TP_STRUCT__entry( - __field(__u32, guest_debug) - ), - - TP_fast_assign( - __entry->guest_debug = guest_debug; - ), - - TP_printk("flags: 0x%08x", __entry->guest_debug) -); - /* * The dreg32 name is a leftover from a distant past. This will really * output a 64bit value... @@ -99,49 +67,6 @@ TRACE_EVENT(kvm_arm_set_dreg32, TP_printk("%s: 0x%llx", __entry->name, __entry->value) ); -TRACE_DEFINE_SIZEOF(__u64); - -TRACE_EVENT(kvm_arm_set_regset, - TP_PROTO(const char *type, int len, __u64 *control, __u64 *value), - TP_ARGS(type, len, control, value), - TP_STRUCT__entry( - __field(const char *, name) - __field(int, len) - __array(u64, ctrls, 16) - __array(u64, values, 16) - ), - TP_fast_assign( - __entry->name = type; - __entry->len = len; - memcpy(__entry->ctrls, control, len << 3); - memcpy(__entry->values, value, len << 3); - ), - TP_printk("%d %s CTRL:%s VALUE:%s", __entry->len, __entry->name, - __print_array(__entry->ctrls, __entry->len, sizeof(__u64)), - __print_array(__entry->values, __entry->len, sizeof(__u64))) -); - -TRACE_EVENT(trap_reg, - TP_PROTO(const char *fn, int reg, bool is_write, u64 write_value), - TP_ARGS(fn, reg, is_write, write_value), - - TP_STRUCT__entry( - __field(const char *, fn) - __field(int, reg) - __field(bool, is_write) - __field(u64, write_value) - ), - - TP_fast_assign( - __entry->fn = fn; - __entry->reg = reg; - __entry->is_write = is_write; - __entry->write_value = write_value; - ), - - TP_printk("%s %s reg %d (0x%016llx)", __entry->fn, __entry->is_write?"write to":"read from", __entry->reg, __entry->write_value) -); - TRACE_EVENT(kvm_handle_sys_reg, TP_PROTO(unsigned long hsr), TP_ARGS(hsr), diff --git a/arch/arm64/kvm/vgic-sys-reg-v3.c b/arch/arm64/kvm/vgic-sys-reg-v3.c index 9e7c486b48c2..5eacb4b3250a 100644 --- a/arch/arm64/kvm/vgic-sys-reg-v3.c +++ b/arch/arm64/kvm/vgic-sys-reg-v3.c @@ -35,12 +35,12 @@ static int set_gic_ctlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, vgic_v3_cpu->num_id_bits = host_id_bits; - host_seis = FIELD_GET(ICH_VTR_SEIS_MASK, kvm_vgic_global_state.ich_vtr_el2); + host_seis = FIELD_GET(ICH_VTR_EL2_SEIS, kvm_vgic_global_state.ich_vtr_el2); seis = FIELD_GET(ICC_CTLR_EL1_SEIS_MASK, val); if (host_seis != seis) return -EINVAL; - host_a3v = FIELD_GET(ICH_VTR_A3V_MASK, kvm_vgic_global_state.ich_vtr_el2); + host_a3v = FIELD_GET(ICH_VTR_EL2_A3V, kvm_vgic_global_state.ich_vtr_el2); a3v = FIELD_GET(ICC_CTLR_EL1_A3V_MASK, val); if (host_a3v != a3v) return -EINVAL; @@ -68,10 +68,10 @@ static int get_gic_ctlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, val |= FIELD_PREP(ICC_CTLR_EL1_PRI_BITS_MASK, vgic_v3_cpu->num_pri_bits - 1); val |= FIELD_PREP(ICC_CTLR_EL1_ID_BITS_MASK, vgic_v3_cpu->num_id_bits); val |= FIELD_PREP(ICC_CTLR_EL1_SEIS_MASK, - FIELD_GET(ICH_VTR_SEIS_MASK, + FIELD_GET(ICH_VTR_EL2_SEIS, kvm_vgic_global_state.ich_vtr_el2)); val |= FIELD_PREP(ICC_CTLR_EL1_A3V_MASK, - FIELD_GET(ICH_VTR_A3V_MASK, kvm_vgic_global_state.ich_vtr_el2)); + FIELD_GET(ICH_VTR_EL2_A3V, kvm_vgic_global_state.ich_vtr_el2)); /* * The VMCR.CTLR value is in ICC_CTLR_EL1 layout. * Extract it directly using ICC_CTLR_EL1 reg definitions. diff --git a/arch/arm64/kvm/vgic/vgic-debug.c b/arch/arm64/kvm/vgic/vgic-debug.c index 389025ce7749..afb018528bc3 100644 --- a/arch/arm64/kvm/vgic/vgic-debug.c +++ b/arch/arm64/kvm/vgic/vgic-debug.c @@ -28,27 +28,66 @@ struct vgic_state_iter { int nr_lpis; int dist_id; int vcpu_id; - int intid; + unsigned long intid; int lpi_idx; - u32 *lpi_array; }; -static void iter_next(struct vgic_state_iter *iter) +static void iter_next(struct kvm *kvm, struct vgic_state_iter *iter) { + struct vgic_dist *dist = &kvm->arch.vgic; + if (iter->dist_id == 0) { iter->dist_id++; return; } + /* + * Let the xarray drive the iterator after the last SPI, as the iterator + * has exhausted the sequentially-allocated INTID space. + */ + if (iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS - 1) && + iter->nr_lpis) { + if (iter->lpi_idx < iter->nr_lpis) + xa_find_after(&dist->lpi_xa, &iter->intid, + VGIC_LPI_MAX_INTID, + LPI_XA_MARK_DEBUG_ITER); + iter->lpi_idx++; + return; + } + iter->intid++; if (iter->intid == VGIC_NR_PRIVATE_IRQS && ++iter->vcpu_id < iter->nr_cpus) iter->intid = 0; +} - if (iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS)) { - if (iter->lpi_idx < iter->nr_lpis) - iter->intid = iter->lpi_array[iter->lpi_idx]; - iter->lpi_idx++; +static int iter_mark_lpis(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct vgic_irq *irq; + unsigned long intid; + int nr_lpis = 0; + + xa_for_each(&dist->lpi_xa, intid, irq) { + if (!vgic_try_get_irq_kref(irq)) + continue; + + xa_set_mark(&dist->lpi_xa, intid, LPI_XA_MARK_DEBUG_ITER); + nr_lpis++; + } + + return nr_lpis; +} + +static void iter_unmark_lpis(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct vgic_irq *irq; + unsigned long intid; + + xa_for_each_marked(&dist->lpi_xa, intid, irq, LPI_XA_MARK_DEBUG_ITER) { + xa_clear_mark(&dist->lpi_xa, intid, LPI_XA_MARK_DEBUG_ITER); + vgic_put_irq(kvm, irq); } } @@ -61,15 +100,12 @@ static void iter_init(struct kvm *kvm, struct vgic_state_iter *iter, iter->nr_cpus = nr_cpus; iter->nr_spis = kvm->arch.vgic.nr_spis; - if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { - iter->nr_lpis = vgic_copy_lpi_list(kvm, NULL, &iter->lpi_array); - if (iter->nr_lpis < 0) - iter->nr_lpis = 0; - } + if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) + iter->nr_lpis = iter_mark_lpis(kvm); /* Fast forward to the right position if needed */ while (pos--) - iter_next(iter); + iter_next(kvm, iter); } static bool end_of_vgic(struct vgic_state_iter *iter) @@ -77,7 +113,7 @@ static bool end_of_vgic(struct vgic_state_iter *iter) return iter->dist_id > 0 && iter->vcpu_id == iter->nr_cpus && iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS) && - iter->lpi_idx > iter->nr_lpis; + (!iter->nr_lpis || iter->lpi_idx > iter->nr_lpis); } static void *vgic_debug_start(struct seq_file *s, loff_t *pos) @@ -114,7 +150,7 @@ static void *vgic_debug_next(struct seq_file *s, void *v, loff_t *pos) struct vgic_state_iter *iter = kvm->arch.vgic.iter; ++*pos; - iter_next(iter); + iter_next(kvm, iter); if (end_of_vgic(iter)) iter = NULL; return iter; @@ -134,13 +170,14 @@ static void vgic_debug_stop(struct seq_file *s, void *v) mutex_lock(&kvm->arch.config_lock); iter = kvm->arch.vgic.iter; - kfree(iter->lpi_array); + iter_unmark_lpis(kvm); kfree(iter); kvm->arch.vgic.iter = NULL; mutex_unlock(&kvm->arch.config_lock); } -static void print_dist_state(struct seq_file *s, struct vgic_dist *dist) +static void print_dist_state(struct seq_file *s, struct vgic_dist *dist, + struct vgic_state_iter *iter) { bool v3 = dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3; @@ -149,7 +186,7 @@ static void print_dist_state(struct seq_file *s, struct vgic_dist *dist) seq_printf(s, "vgic_model:\t%s\n", v3 ? "GICv3" : "GICv2"); seq_printf(s, "nr_spis:\t%d\n", dist->nr_spis); if (v3) - seq_printf(s, "nr_lpis:\t%d\n", atomic_read(&dist->lpi_count)); + seq_printf(s, "nr_lpis:\t%d\n", iter->nr_lpis); seq_printf(s, "enabled:\t%d\n", dist->enabled); seq_printf(s, "\n"); @@ -236,7 +273,7 @@ static int vgic_debug_show(struct seq_file *s, void *v) unsigned long flags; if (iter->dist_id == 0) { - print_dist_state(s, &kvm->arch.vgic); + print_dist_state(s, &kvm->arch.vgic, iter); return 0; } @@ -246,11 +283,16 @@ static int vgic_debug_show(struct seq_file *s, void *v) if (iter->vcpu_id < iter->nr_cpus) vcpu = kvm_get_vcpu(kvm, iter->vcpu_id); - irq = vgic_get_irq(kvm, vcpu, iter->intid); - if (!irq) { - seq_printf(s, " LPI %4d freed\n", iter->intid); - return 0; - } + /* + * Expect this to succeed, as iter_mark_lpis() takes a reference on + * every LPI to be visited. + */ + if (iter->intid < VGIC_NR_PRIVATE_IRQS) + irq = vgic_get_vcpu_irq(vcpu, iter->intid); + else + irq = vgic_get_irq(kvm, iter->intid); + if (WARN_ON_ONCE(!irq)) + return -EINVAL; raw_spin_lock_irqsave(&irq->irq_lock, flags); print_irq_state(s, irq, vcpu); diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index f20941f83a07..1f33e71c2a73 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -34,9 +34,9 @@ * * CPU Interface: * - * - kvm_vgic_vcpu_init(): initialization of static data that - * doesn't depend on any sizing information or emulation type. No - * allocation is allowed there. + * - kvm_vgic_vcpu_init(): initialization of static data that doesn't depend + * on any sizing information. Private interrupts are allocated if not + * already allocated at vgic-creation time. */ /* EARLY INIT */ @@ -53,13 +53,13 @@ void kvm_vgic_early_init(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; - INIT_LIST_HEAD(&dist->lpi_translation_cache); - raw_spin_lock_init(&dist->lpi_list_lock); xa_init_flags(&dist->lpi_xa, XA_FLAGS_LOCK_IRQ); } /* CREATION */ +static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu, u32 type); + /** * kvm_vgic_create: triggered by the instantiation of the VGIC device by * user space, either through the legacy KVM_CREATE_IRQCHIP ioctl (v2 only) @@ -114,6 +114,22 @@ int kvm_vgic_create(struct kvm *kvm, u32 type) goto out_unlock; } + kvm_for_each_vcpu(i, vcpu, kvm) { + ret = vgic_allocate_private_irqs_locked(vcpu, type); + if (ret) + break; + } + + if (ret) { + kvm_for_each_vcpu(i, vcpu, kvm) { + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + kfree(vgic_cpu->private_irqs); + vgic_cpu->private_irqs = NULL; + } + + goto out_unlock; + } + kvm->arch.vgic.in_kernel = true; kvm->arch.vgic.vgic_model = type; @@ -182,27 +198,43 @@ static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis) return 0; } -/** - * kvm_vgic_vcpu_init() - Initialize static VGIC VCPU data - * structures and register VCPU-specific KVM iodevs - * - * @vcpu: pointer to the VCPU being created and initialized - * - * Only do initialization, but do not actually enable the - * VGIC CPU interface - */ -int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) +/* Default GICv3 Maintenance Interrupt INTID, as per SBSA */ +#define DEFAULT_MI_INTID 25 + +int kvm_vgic_vcpu_nv_init(struct kvm_vcpu *vcpu) +{ + int ret; + + guard(mutex)(&vcpu->kvm->arch.config_lock); + + /* + * Matching the tradition established with the timers, provide + * a default PPI for the maintenance interrupt. It makes + * things easier to reason about. + */ + if (vcpu->kvm->arch.vgic.mi_intid == 0) + vcpu->kvm->arch.vgic.mi_intid = DEFAULT_MI_INTID; + ret = kvm_vgic_set_owner(vcpu, vcpu->kvm->arch.vgic.mi_intid, vcpu); + + return ret; +} + +static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu, u32 type) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - int ret = 0; int i; - vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF; + lockdep_assert_held(&vcpu->kvm->arch.config_lock); - INIT_LIST_HEAD(&vgic_cpu->ap_list_head); - raw_spin_lock_init(&vgic_cpu->ap_list_lock); - atomic_set(&vgic_cpu->vgic_v3.its_vpe.vlpi_count, 0); + if (vgic_cpu->private_irqs) + return 0; + + vgic_cpu->private_irqs = kcalloc(VGIC_NR_PRIVATE_IRQS, + sizeof(struct vgic_irq), + GFP_KERNEL_ACCOUNT); + + if (!vgic_cpu->private_irqs) + return -ENOMEM; /* * Enable and configure all SGIs to be edge-triggered and @@ -225,11 +257,61 @@ int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) /* PPIs */ irq->config = VGIC_CONFIG_LEVEL; } + + switch (type) { + case KVM_DEV_TYPE_ARM_VGIC_V3: + irq->group = 1; + irq->mpidr = kvm_vcpu_get_mpidr_aff(vcpu); + break; + case KVM_DEV_TYPE_ARM_VGIC_V2: + irq->group = 0; + irq->targets = BIT(vcpu->vcpu_id); + break; + } } + return 0; +} + +static int vgic_allocate_private_irqs(struct kvm_vcpu *vcpu, u32 type) +{ + int ret; + + mutex_lock(&vcpu->kvm->arch.config_lock); + ret = vgic_allocate_private_irqs_locked(vcpu, type); + mutex_unlock(&vcpu->kvm->arch.config_lock); + + return ret; +} + +/** + * kvm_vgic_vcpu_init() - Initialize static VGIC VCPU data + * structures and register VCPU-specific KVM iodevs + * + * @vcpu: pointer to the VCPU being created and initialized + * + * Only do initialization, but do not actually enable the + * VGIC CPU interface + */ +int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + int ret = 0; + + vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF; + + INIT_LIST_HEAD(&vgic_cpu->ap_list_head); + raw_spin_lock_init(&vgic_cpu->ap_list_lock); + atomic_set(&vgic_cpu->vgic_v3.its_vpe.vlpi_count, 0); + if (!irqchip_in_kernel(vcpu->kvm)) return 0; + ret = vgic_allocate_private_irqs(vcpu, dist->vgic_model); + if (ret) + return ret; + /* * If we are creating a VCPU with a GICv3 we must also register the * KVM io device for the redistributor that belongs to this VCPU. @@ -263,7 +345,7 @@ int vgic_init(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; struct kvm_vcpu *vcpu; - int ret = 0, i; + int ret = 0; unsigned long idx; lockdep_assert_held(&kvm->arch.config_lock); @@ -283,31 +365,6 @@ int vgic_init(struct kvm *kvm) if (ret) goto out; - /* Initialize groups on CPUs created before the VGIC type was known */ - kvm_for_each_vcpu(idx, vcpu, kvm) { - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - - for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) { - struct vgic_irq *irq = &vgic_cpu->private_irqs[i]; - switch (dist->vgic_model) { - case KVM_DEV_TYPE_ARM_VGIC_V3: - irq->group = 1; - irq->mpidr = kvm_vcpu_get_mpidr_aff(vcpu); - break; - case KVM_DEV_TYPE_ARM_VGIC_V2: - irq->group = 0; - irq->targets = 1U << idx; - break; - default: - ret = -EINVAL; - goto out; - } - } - } - - if (vgic_has_its(kvm)) - vgic_lpi_translation_cache_init(kvm); - /* * If we have GICv4.1 enabled, unconditionally request enable the * v4 support so that we get HW-accelerated vSGIs. Otherwise, only @@ -355,15 +412,12 @@ static void kvm_vgic_dist_destroy(struct kvm *kvm) if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { list_for_each_entry_safe(rdreg, next, &dist->rd_regions, list) - vgic_v3_free_redist_region(rdreg); + vgic_v3_free_redist_region(kvm, rdreg); INIT_LIST_HEAD(&dist->rd_regions); } else { dist->vgic_cpu_base = VGIC_ADDR_UNDEF; } - if (vgic_has_its(kvm)) - vgic_lpi_translation_cache_destroy(kvm); - if (vgic_supports_direct_msis(kvm)) vgic_v4_teardown(kvm); @@ -381,8 +435,29 @@ static void __kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu) vgic_flush_pending_lpis(vcpu); INIT_LIST_HEAD(&vgic_cpu->ap_list_head); + kfree(vgic_cpu->private_irqs); + vgic_cpu->private_irqs = NULL; + if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { - vgic_unregister_redist_iodev(vcpu); + /* + * If this vCPU is being destroyed because of a failed creation + * then unregister the redistributor to avoid leaving behind a + * dangling pointer to the vCPU struct. + * + * vCPUs that have been successfully created (i.e. added to + * kvm->vcpu_array) get unregistered in kvm_vgic_destroy(), as + * this function gets called while holding kvm->arch.config_lock + * in the VM teardown path and would otherwise introduce a lock + * inversion w.r.t. kvm->srcu. + * + * vCPUs that failed creation are torn down outside of the + * kvm->arch.config_lock and do not get unregistered in + * kvm_vgic_destroy(), meaning it is both safe and necessary to + * do so here. + */ + if (kvm_get_vcpu_by_id(vcpu->kvm, vcpu->vcpu_id) != vcpu) + vgic_unregister_redist_iodev(vcpu); + vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF; } } @@ -402,17 +477,21 @@ void kvm_vgic_destroy(struct kvm *kvm) unsigned long i; mutex_lock(&kvm->slots_lock); + mutex_lock(&kvm->arch.config_lock); vgic_debug_destroy(kvm); kvm_for_each_vcpu(i, vcpu, kvm) __kvm_vgic_vcpu_destroy(vcpu); - mutex_lock(&kvm->arch.config_lock); - kvm_vgic_dist_destroy(kvm); mutex_unlock(&kvm->arch.config_lock); + + if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) + kvm_for_each_vcpu(i, vcpu, kvm) + vgic_unregister_redist_iodev(vcpu); + mutex_unlock(&kvm->slots_lock); } @@ -486,22 +565,31 @@ int kvm_vgic_map_resources(struct kvm *kvm) if (ret) goto out; - dist->ready = true; dist_base = dist->vgic_dist_base; mutex_unlock(&kvm->arch.config_lock); ret = vgic_register_dist_iodev(kvm, dist_base, type); - if (ret) + if (ret) { kvm_err("Unable to register VGIC dist MMIO regions\n"); + goto out_slots; + } + /* + * kvm_io_bus_register_dev() guarantees all readers see the new MMIO + * registration before returning through synchronize_srcu(), which also + * implies a full memory barrier. As such, marking the distributor as + * 'ready' here is guaranteed to be ordered after all vCPUs having seen + * a completely configured distributor. + */ + dist->ready = true; goto out_slots; out: mutex_unlock(&kvm->arch.config_lock); out_slots: - mutex_unlock(&kvm->slots_lock); - if (ret) - kvm_vgic_destroy(kvm); + kvm_vm_dead(kvm); + + mutex_unlock(&kvm->slots_lock); return ret; } @@ -521,12 +609,20 @@ void kvm_vgic_cpu_down(void) static irqreturn_t vgic_maintenance_handler(int irq, void *data) { + struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)data; + /* * We cannot rely on the vgic maintenance interrupt to be * delivered synchronously. This means we can only use it to * exit the VM, and we perform the handling of EOIed * interrupts on the exit path (see vgic_fold_lr_state). + * + * Of course, NV throws a wrench in this plan, and needs + * something special. */ + if (vcpu && vgic_state_is_nested(vcpu)) + vgic_v3_handle_nested_maint_irq(vcpu); + return IRQ_HANDLED; } diff --git a/arch/arm64/kvm/vgic/vgic-irqfd.c b/arch/arm64/kvm/vgic/vgic-irqfd.c index 8c711deb25aa..c314c016659a 100644 --- a/arch/arm64/kvm/vgic/vgic-irqfd.c +++ b/arch/arm64/kvm/vgic/vgic-irqfd.c @@ -9,7 +9,7 @@ #include <kvm/arm_vgic.h> #include "vgic.h" -/** +/* * vgic_irqfd_set_irq: inject the IRQ corresponding to the * irqchip routing entry * @@ -75,7 +75,8 @@ static void kvm_populate_msi(struct kvm_kernel_irq_routing_entry *e, msi->flags = e->msi.flags; msi->devid = e->msi.devid; } -/** + +/* * kvm_set_msi: inject the MSI corresponding to the * MSI routing entry * @@ -98,7 +99,7 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, return vgic_its_inject_msi(kvm, &msi); } -/** +/* * kvm_arch_set_irq_inatomic: fast-path for irqfd injection */ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index e85a495ada9c..fb96802799c6 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -23,12 +23,49 @@ #include "vgic.h" #include "vgic-mmio.h" +static struct kvm_device_ops kvm_arm_vgic_its_ops; + static int vgic_its_save_tables_v0(struct vgic_its *its); static int vgic_its_restore_tables_v0(struct vgic_its *its); static int vgic_its_commit_v0(struct vgic_its *its); static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq, struct kvm_vcpu *filter_vcpu, bool needs_inv); +#define vgic_its_read_entry_lock(i, g, valp, t) \ + ({ \ + int __sz = vgic_its_get_abi(i)->t##_esz; \ + struct kvm *__k = (i)->dev->kvm; \ + int __ret; \ + \ + BUILD_BUG_ON(NR_ITS_ABIS == 1 && \ + sizeof(*(valp)) != ABI_0_ESZ); \ + if (NR_ITS_ABIS > 1 && \ + KVM_BUG_ON(__sz != sizeof(*(valp)), __k)) \ + __ret = -EINVAL; \ + else \ + __ret = kvm_read_guest_lock(__k, (g), \ + valp, __sz); \ + __ret; \ + }) + +#define vgic_its_write_entry_lock(i, g, val, t) \ + ({ \ + int __sz = vgic_its_get_abi(i)->t##_esz; \ + struct kvm *__k = (i)->dev->kvm; \ + typeof(val) __v = (val); \ + int __ret; \ + \ + BUILD_BUG_ON(NR_ITS_ABIS == 1 && \ + sizeof(__v) != ABI_0_ESZ); \ + if (NR_ITS_ABIS > 1 && \ + KVM_BUG_ON(__sz != sizeof(__v), __k)) \ + __ret = -EINVAL; \ + else \ + __ret = vgic_write_guest_lock(__k, (g), \ + &__v, __sz); \ + __ret; \ + }) + /* * Creates a new (reference to a) struct vgic_irq for a given LPI. * If this LPI is already mapped on another ITS, we increase its refcount @@ -40,7 +77,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid, struct kvm_vcpu *vcpu) { struct vgic_dist *dist = &kvm->arch.vgic; - struct vgic_irq *irq = vgic_get_irq(kvm, NULL, intid), *oldirq; + struct vgic_irq *irq = vgic_get_irq(kvm, intid), *oldirq; unsigned long flags; int ret; @@ -67,7 +104,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid, irq->target_vcpu = vcpu; irq->group = 1; - raw_spin_lock_irqsave(&dist->lpi_list_lock, flags); + xa_lock_irqsave(&dist->lpi_xa, flags); /* * There could be a race with another vgic_add_lpi(), so we need to @@ -82,17 +119,14 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid, goto out_unlock; } - ret = xa_err(xa_store(&dist->lpi_xa, intid, irq, 0)); + ret = xa_err(__xa_store(&dist->lpi_xa, intid, irq, 0)); if (ret) { xa_release(&dist->lpi_xa, intid); kfree(irq); - goto out_unlock; } - atomic_inc(&dist->lpi_count); - out_unlock: - raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); + xa_unlock_irqrestore(&dist->lpi_xa, flags); if (ret) return ERR_PTR(ret); @@ -150,14 +184,6 @@ struct its_ite { u32 event_id; }; -struct vgic_translation_cache_entry { - struct list_head entry; - phys_addr_t db; - u32 devid; - u32 eventid; - struct vgic_irq *irq; -}; - /** * struct vgic_its_abi - ITS abi ops and settings * @cte_esz: collection table entry size @@ -252,8 +278,10 @@ static struct its_ite *find_ite(struct vgic_its *its, u32 device_id, #define GIC_LPI_OFFSET 8192 -#define VITS_TYPER_IDBITS 16 -#define VITS_TYPER_DEVBITS 16 +#define VITS_TYPER_IDBITS 16 +#define VITS_MAX_EVENTID (BIT(VITS_TYPER_IDBITS) - 1) +#define VITS_TYPER_DEVBITS 16 +#define VITS_MAX_DEVID (BIT(VITS_TYPER_DEVBITS) - 1) #define VITS_DTE_MAX_DEVID_OFFSET (BIT(14) - 1) #define VITS_ITE_MAX_EVENTID_OFFSET (BIT(16) - 1) @@ -316,53 +344,6 @@ static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq, return 0; } -#define GIC_LPI_MAX_INTID ((1 << INTERRUPT_ID_BITS_ITS) - 1) - -/* - * Create a snapshot of the current LPIs targeting @vcpu, so that we can - * enumerate those LPIs without holding any lock. - * Returns their number and puts the kmalloc'ed array into intid_ptr. - */ -int vgic_copy_lpi_list(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 **intid_ptr) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - XA_STATE(xas, &dist->lpi_xa, GIC_LPI_OFFSET); - struct vgic_irq *irq; - unsigned long flags; - u32 *intids; - int irq_count, i = 0; - - /* - * There is an obvious race between allocating the array and LPIs - * being mapped/unmapped. If we ended up here as a result of a - * command, we're safe (locks are held, preventing another - * command). If coming from another path (such as enabling LPIs), - * we must be careful not to overrun the array. - */ - irq_count = atomic_read(&dist->lpi_count); - intids = kmalloc_array(irq_count, sizeof(intids[0]), GFP_KERNEL_ACCOUNT); - if (!intids) - return -ENOMEM; - - raw_spin_lock_irqsave(&dist->lpi_list_lock, flags); - rcu_read_lock(); - - xas_for_each(&xas, irq, GIC_LPI_MAX_INTID) { - if (i == irq_count) - break; - /* We don't need to "get" the IRQ, as we hold the list lock. */ - if (vcpu && irq->target_vcpu != vcpu) - continue; - intids[i++] = irq->intid; - } - - rcu_read_unlock(); - raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); - - *intid_ptr = intids; - return i; -} - static int update_affinity(struct vgic_irq *irq, struct kvm_vcpu *vcpu) { int ret = 0; @@ -446,23 +427,18 @@ static u32 max_lpis_propbaser(u64 propbaser) static int its_sync_lpi_pending_table(struct kvm_vcpu *vcpu) { gpa_t pendbase = GICR_PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser); + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + unsigned long intid, flags; struct vgic_irq *irq; int last_byte_offset = -1; int ret = 0; - u32 *intids; - int nr_irqs, i; - unsigned long flags; u8 pendmask; - nr_irqs = vgic_copy_lpi_list(vcpu->kvm, vcpu, &intids); - if (nr_irqs < 0) - return nr_irqs; - - for (i = 0; i < nr_irqs; i++) { + xa_for_each(&dist->lpi_xa, intid, irq) { int byte_offset, bit_nr; - byte_offset = intids[i] / BITS_PER_BYTE; - bit_nr = intids[i] % BITS_PER_BYTE; + byte_offset = intid / BITS_PER_BYTE; + bit_nr = intid % BITS_PER_BYTE; /* * For contiguously allocated LPIs chances are we just read @@ -472,25 +448,23 @@ static int its_sync_lpi_pending_table(struct kvm_vcpu *vcpu) ret = kvm_read_guest_lock(vcpu->kvm, pendbase + byte_offset, &pendmask, 1); - if (ret) { - kfree(intids); + if (ret) return ret; - } + last_byte_offset = byte_offset; } - irq = vgic_get_irq(vcpu->kvm, NULL, intids[i]); + irq = vgic_get_irq(vcpu->kvm, intid); if (!irq) continue; raw_spin_lock_irqsave(&irq->irq_lock, flags); - irq->pending_latch = pendmask & (1U << bit_nr); + if (irq->target_vcpu == vcpu) + irq->pending_latch = pendmask & (1U << bit_nr); vgic_queue_irq_unlock(vcpu->kvm, irq, flags); vgic_put_irq(vcpu->kvm, irq); } - kfree(intids); - return ret; } @@ -566,51 +540,52 @@ static unsigned long vgic_mmio_read_its_idregs(struct kvm *kvm, return 0; } -static struct vgic_irq *__vgic_its_check_cache(struct vgic_dist *dist, - phys_addr_t db, - u32 devid, u32 eventid) +static struct vgic_its *__vgic_doorbell_to_its(struct kvm *kvm, gpa_t db) { - struct vgic_translation_cache_entry *cte; + struct kvm_io_device *kvm_io_dev; + struct vgic_io_device *iodev; - list_for_each_entry(cte, &dist->lpi_translation_cache, entry) { - /* - * If we hit a NULL entry, there is nothing after this - * point. - */ - if (!cte->irq) - break; + kvm_io_dev = kvm_io_bus_get_dev(kvm, KVM_MMIO_BUS, db); + if (!kvm_io_dev) + return ERR_PTR(-EINVAL); - if (cte->db != db || cte->devid != devid || - cte->eventid != eventid) - continue; + if (kvm_io_dev->ops != &kvm_io_gic_ops) + return ERR_PTR(-EINVAL); - /* - * Move this entry to the head, as it is the most - * recently used. - */ - if (!list_is_first(&cte->entry, &dist->lpi_translation_cache)) - list_move(&cte->entry, &dist->lpi_translation_cache); + iodev = container_of(kvm_io_dev, struct vgic_io_device, dev); + if (iodev->iodev_type != IODEV_ITS) + return ERR_PTR(-EINVAL); - return cte->irq; - } + return iodev->its; +} + +static unsigned long vgic_its_cache_key(u32 devid, u32 eventid) +{ + return (((unsigned long)devid) << VITS_TYPER_IDBITS) | eventid; - return NULL; } static struct vgic_irq *vgic_its_check_cache(struct kvm *kvm, phys_addr_t db, u32 devid, u32 eventid) { - struct vgic_dist *dist = &kvm->arch.vgic; + unsigned long cache_key = vgic_its_cache_key(devid, eventid); + struct vgic_its *its; struct vgic_irq *irq; - unsigned long flags; - raw_spin_lock_irqsave(&dist->lpi_list_lock, flags); + if (devid > VITS_MAX_DEVID || eventid > VITS_MAX_EVENTID) + return NULL; - irq = __vgic_its_check_cache(dist, db, devid, eventid); + its = __vgic_doorbell_to_its(kvm, db); + if (IS_ERR(its)) + return NULL; + + rcu_read_lock(); + + irq = xa_load(&its->translation_cache, cache_key); if (!vgic_try_get_irq_kref(irq)) irq = NULL; - raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); + rcu_read_unlock(); return irq; } @@ -619,81 +594,68 @@ static void vgic_its_cache_translation(struct kvm *kvm, struct vgic_its *its, u32 devid, u32 eventid, struct vgic_irq *irq) { - struct vgic_dist *dist = &kvm->arch.vgic; - struct vgic_translation_cache_entry *cte; - unsigned long flags; - phys_addr_t db; + unsigned long cache_key = vgic_its_cache_key(devid, eventid); + struct vgic_irq *old; /* Do not cache a directly injected interrupt */ if (irq->hw) return; - raw_spin_lock_irqsave(&dist->lpi_list_lock, flags); - - if (unlikely(list_empty(&dist->lpi_translation_cache))) - goto out; - /* - * We could have raced with another CPU caching the same - * translation behind our back, so let's check it is not in - * already + * The irq refcount is guaranteed to be nonzero while holding the + * its_lock, as the ITE (and the reference it holds) cannot be freed. */ - db = its->vgic_its_base + GITS_TRANSLATER; - if (__vgic_its_check_cache(dist, db, devid, eventid)) - goto out; + lockdep_assert_held(&its->its_lock); + vgic_get_irq_kref(irq); - /* Always reuse the last entry (LRU policy) */ - cte = list_last_entry(&dist->lpi_translation_cache, - typeof(*cte), entry); + old = xa_store(&its->translation_cache, cache_key, irq, GFP_KERNEL_ACCOUNT); /* - * Caching the translation implies having an extra reference - * to the interrupt, so drop the potential reference on what - * was in the cache, and increment it on the new interrupt. + * Put the reference taken on @irq if the store fails. Intentionally do + * not return the error as the translation cache is best effort. */ - if (cte->irq) - vgic_put_irq(kvm, cte->irq); + if (xa_is_err(old)) { + vgic_put_irq(kvm, irq); + return; + } /* - * The irq refcount is guaranteed to be nonzero while holding the - * its_lock, as the ITE (and the reference it holds) cannot be freed. + * We could have raced with another CPU caching the same + * translation behind our back, ensure we don't leak a + * reference if that is the case. */ - lockdep_assert_held(&its->its_lock); - vgic_get_irq_kref(irq); - - cte->db = db; - cte->devid = devid; - cte->eventid = eventid; - cte->irq = irq; + if (old) + vgic_put_irq(kvm, old); +} - /* Move the new translation to the head of the list */ - list_move(&cte->entry, &dist->lpi_translation_cache); +static void vgic_its_invalidate_cache(struct vgic_its *its) +{ + struct kvm *kvm = its->dev->kvm; + struct vgic_irq *irq; + unsigned long idx; -out: - raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); + xa_for_each(&its->translation_cache, idx, irq) { + xa_erase(&its->translation_cache, idx); + vgic_put_irq(kvm, irq); + } } -void vgic_its_invalidate_cache(struct kvm *kvm) +void vgic_its_invalidate_all_caches(struct kvm *kvm) { - struct vgic_dist *dist = &kvm->arch.vgic; - struct vgic_translation_cache_entry *cte; - unsigned long flags; + struct kvm_device *dev; + struct vgic_its *its; - raw_spin_lock_irqsave(&dist->lpi_list_lock, flags); + rcu_read_lock(); - list_for_each_entry(cte, &dist->lpi_translation_cache, entry) { - /* - * If we hit a NULL entry, there is nothing after this - * point. - */ - if (!cte->irq) - break; + list_for_each_entry_rcu(dev, &kvm->devices, vm_node) { + if (dev->ops != &kvm_arm_vgic_its_ops) + continue; - vgic_put_irq(kvm, cte->irq); - cte->irq = NULL; + its = dev->private; + vgic_its_invalidate_cache(its); } - raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); + rcu_read_unlock(); } int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its, @@ -725,8 +687,6 @@ int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its, struct vgic_its *vgic_msi_to_its(struct kvm *kvm, struct kvm_msi *msi) { u64 address; - struct kvm_io_device *kvm_io_dev; - struct vgic_io_device *iodev; if (!vgic_has_its(kvm)) return ERR_PTR(-ENODEV); @@ -736,18 +696,7 @@ struct vgic_its *vgic_msi_to_its(struct kvm *kvm, struct kvm_msi *msi) address = (u64)msi->address_hi << 32 | msi->address_lo; - kvm_io_dev = kvm_io_bus_get_dev(kvm, KVM_MMIO_BUS, address); - if (!kvm_io_dev) - return ERR_PTR(-EINVAL); - - if (kvm_io_dev->ops != &kvm_io_gic_ops) - return ERR_PTR(-EINVAL); - - iodev = container_of(kvm_io_dev, struct vgic_io_device, dev); - if (iodev->iodev_type != IODEV_ITS) - return ERR_PTR(-EINVAL); - - return iodev->its; + return __vgic_doorbell_to_its(kvm, address); } /* @@ -878,15 +827,19 @@ static int vgic_its_cmd_handle_discard(struct kvm *kvm, struct vgic_its *its, ite = find_ite(its, device_id, event_id); if (ite && its_is_collection_mapped(ite->collection)) { + struct its_device *device = find_its_device(its, device_id); + int ite_esz = vgic_its_get_abi(its)->ite_esz; + gpa_t gpa = device->itt_addr + ite->event_id * ite_esz; /* * Though the spec talks about removing the pending state, we * don't bother here since we clear the ITTE anyway and the * pending state is a property of the ITTE struct. */ - vgic_its_invalidate_cache(kvm); + vgic_its_invalidate_cache(its); its_free_ite(kvm, ite); - return 0; + + return vgic_its_write_entry_lock(its, gpa, 0ULL, ite); } return E_ITS_DISCARD_UNMAPPED_INTERRUPT; @@ -920,7 +873,7 @@ static int vgic_its_cmd_handle_movi(struct kvm *kvm, struct vgic_its *its, ite->collection = collection; vcpu = collection_to_vcpu(kvm, collection); - vgic_its_invalidate_cache(kvm); + vgic_its_invalidate_cache(its); return update_affinity(ite->irq, vcpu); } @@ -955,7 +908,7 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id, switch (type) { case GITS_BASER_TYPE_DEVICE: - if (id >= BIT_ULL(VITS_TYPER_DEVBITS)) + if (id > VITS_MAX_DEVID) return false; break; case GITS_BASER_TYPE_COLLECTION: @@ -1167,7 +1120,8 @@ static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its, } /* Requires the its_lock to be held. */ -static void vgic_its_free_device(struct kvm *kvm, struct its_device *device) +static void vgic_its_free_device(struct kvm *kvm, struct vgic_its *its, + struct its_device *device) { struct its_ite *ite, *temp; @@ -1179,7 +1133,7 @@ static void vgic_its_free_device(struct kvm *kvm, struct its_device *device) list_for_each_entry_safe(ite, temp, &device->itt_head, ite_list) its_free_ite(kvm, ite); - vgic_its_invalidate_cache(kvm); + vgic_its_invalidate_cache(its); list_del(&device->dev_list); kfree(device); @@ -1191,7 +1145,7 @@ static void vgic_its_free_device_list(struct kvm *kvm, struct vgic_its *its) struct its_device *cur, *temp; list_for_each_entry_safe(cur, temp, &its->device_list, dev_list) - vgic_its_free_device(kvm, cur); + vgic_its_free_device(kvm, its, cur); } /* its lock must be held */ @@ -1235,8 +1189,9 @@ static int vgic_its_cmd_handle_mapd(struct kvm *kvm, struct vgic_its *its, u8 num_eventid_bits = its_cmd_get_size(its_cmd); gpa_t itt_addr = its_cmd_get_ittaddr(its_cmd); struct its_device *device; + gpa_t gpa; - if (!vgic_its_check_id(its, its->baser_device_table, device_id, NULL)) + if (!vgic_its_check_id(its, its->baser_device_table, device_id, &gpa)) return E_ITS_MAPD_DEVICE_OOR; if (valid && num_eventid_bits > VITS_TYPER_IDBITS) @@ -1250,14 +1205,14 @@ static int vgic_its_cmd_handle_mapd(struct kvm *kvm, struct vgic_its *its, * by removing the mapping and re-establishing it. */ if (device) - vgic_its_free_device(kvm, device); + vgic_its_free_device(kvm, its, device); /* * The spec does not say whether unmapping a not-mapped device * is an error, so we are done in any case. */ if (!valid) - return 0; + return vgic_its_write_entry_lock(its, gpa, 0ULL, dte); device = vgic_its_alloc_device(its, device_id, itt_addr, num_eventid_bits); @@ -1281,7 +1236,7 @@ static int vgic_its_cmd_handle_mapc(struct kvm *kvm, struct vgic_its *its, if (!valid) { vgic_its_free_collection(its, coll_id); - vgic_its_invalidate_cache(kvm); + vgic_its_invalidate_cache(its); } else { struct kvm_vcpu *vcpu; @@ -1372,23 +1327,19 @@ static int vgic_its_cmd_handle_inv(struct kvm *kvm, struct vgic_its *its, int vgic_its_invall(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; - int irq_count, i = 0; - u32 *intids; - - irq_count = vgic_copy_lpi_list(kvm, vcpu, &intids); - if (irq_count < 0) - return irq_count; + struct vgic_dist *dist = &kvm->arch.vgic; + struct vgic_irq *irq; + unsigned long intid; - for (i = 0; i < irq_count; i++) { - struct vgic_irq *irq = vgic_get_irq(kvm, NULL, intids[i]); + xa_for_each(&dist->lpi_xa, intid, irq) { + irq = vgic_get_irq(kvm, intid); if (!irq) continue; + update_lpi_config(kvm, irq, vcpu, false); vgic_put_irq(kvm, irq); } - kfree(intids); - if (vcpu->arch.vgic_cpu.vgic_v3.its_vpe.its_vm) its_invall_vpe(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe); @@ -1431,10 +1382,10 @@ static int vgic_its_cmd_handle_invall(struct kvm *kvm, struct vgic_its *its, static int vgic_its_cmd_handle_movall(struct kvm *kvm, struct vgic_its *its, u64 *its_cmd) { + struct vgic_dist *dist = &kvm->arch.vgic; struct kvm_vcpu *vcpu1, *vcpu2; struct vgic_irq *irq; - u32 *intids; - int irq_count, i; + unsigned long intid; /* We advertise GITS_TYPER.PTA==0, making the address the vcpu ID */ vcpu1 = kvm_get_vcpu_by_id(kvm, its_cmd_get_target_addr(its_cmd)); @@ -1446,12 +1397,8 @@ static int vgic_its_cmd_handle_movall(struct kvm *kvm, struct vgic_its *its, if (vcpu1 == vcpu2) return 0; - irq_count = vgic_copy_lpi_list(kvm, vcpu1, &intids); - if (irq_count < 0) - return irq_count; - - for (i = 0; i < irq_count; i++) { - irq = vgic_get_irq(kvm, NULL, intids[i]); + xa_for_each(&dist->lpi_xa, intid, irq) { + irq = vgic_get_irq(kvm, intid); if (!irq) continue; @@ -1460,9 +1407,8 @@ static int vgic_its_cmd_handle_movall(struct kvm *kvm, struct vgic_its *its, vgic_put_irq(kvm, irq); } - vgic_its_invalidate_cache(kvm); + vgic_its_invalidate_cache(its); - kfree(intids); return 0; } @@ -1813,7 +1759,7 @@ static void vgic_mmio_write_its_ctlr(struct kvm *kvm, struct vgic_its *its, its->enabled = !!(val & GITS_CTLR_ENABLE); if (!its->enabled) - vgic_its_invalidate_cache(kvm); + vgic_its_invalidate_cache(its); /* * Try to process any pending commands. This function bails out early @@ -1914,47 +1860,6 @@ out: return ret; } -/* Default is 16 cached LPIs per vcpu */ -#define LPI_DEFAULT_PCPU_CACHE_SIZE 16 - -void vgic_lpi_translation_cache_init(struct kvm *kvm) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - unsigned int sz; - int i; - - if (!list_empty(&dist->lpi_translation_cache)) - return; - - sz = atomic_read(&kvm->online_vcpus) * LPI_DEFAULT_PCPU_CACHE_SIZE; - - for (i = 0; i < sz; i++) { - struct vgic_translation_cache_entry *cte; - - /* An allocation failure is not fatal */ - cte = kzalloc(sizeof(*cte), GFP_KERNEL_ACCOUNT); - if (WARN_ON(!cte)) - break; - - INIT_LIST_HEAD(&cte->entry); - list_add(&cte->entry, &dist->lpi_translation_cache); - } -} - -void vgic_lpi_translation_cache_destroy(struct kvm *kvm) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - struct vgic_translation_cache_entry *cte, *tmp; - - vgic_its_invalidate_cache(kvm); - - list_for_each_entry_safe(cte, tmp, - &dist->lpi_translation_cache, entry) { - list_del(&cte->entry); - kfree(cte); - } -} - #define INITIAL_BASER_VALUE \ (GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWb) | \ GIC_BASER_CACHEABILITY(GITS_BASER, OUTER, SameAsInner) | \ @@ -1987,8 +1892,6 @@ static int vgic_its_create(struct kvm_device *dev, u32 type) kfree(its); return ret; } - - vgic_lpi_translation_cache_init(dev->kvm); } mutex_init(&its->its_lock); @@ -2006,6 +1909,7 @@ static int vgic_its_create(struct kvm_device *dev, u32 type) INIT_LIST_HEAD(&its->device_list); INIT_LIST_HEAD(&its->collection_list); + xa_init(&its->translation_cache); dev->kvm->arch.vgic.msis_require_devid = true; dev->kvm->arch.vgic.has_its = true; @@ -2036,6 +1940,8 @@ static void vgic_its_destroy(struct kvm_device *kvm_dev) vgic_its_free_device_list(kvm, its); vgic_its_free_collection_list(kvm, its); + vgic_its_invalidate_cache(its); + xa_destroy(&its->translation_cache); mutex_unlock(&its->its_lock); kfree(its); @@ -2184,6 +2090,7 @@ typedef int (*entry_fn_t)(struct vgic_its *its, u32 id, void *entry, * @start_id: the ID of the first entry in the table * (non zero for 2d level tables) * @fn: function to apply on each entry + * @opaque: pointer to opaque data * * Return: < 0 on error, 0 if last element was identified, 1 otherwise * (the last element may not be found on second level tables) @@ -2223,13 +2130,12 @@ static int scan_its_table(struct vgic_its *its, gpa_t base, int size, u32 esz, return 1; } -/** +/* * vgic_its_save_ite - Save an interrupt translation entry at @gpa */ static int vgic_its_save_ite(struct vgic_its *its, struct its_device *dev, - struct its_ite *ite, gpa_t gpa, int ite_esz) + struct its_ite *ite, gpa_t gpa) { - struct kvm *kvm = its->dev->kvm; u32 next_offset; u64 val; @@ -2238,11 +2144,14 @@ static int vgic_its_save_ite(struct vgic_its *its, struct its_device *dev, ((u64)ite->irq->intid << KVM_ITS_ITE_PINTID_SHIFT) | ite->collection->collection_id; val = cpu_to_le64(val); - return vgic_write_guest_lock(kvm, gpa, &val, ite_esz); + + return vgic_its_write_entry_lock(its, gpa, val, ite); } /** * vgic_its_restore_ite - restore an interrupt translation entry + * + * @its: its handle * @event_id: id used for indexing * @ptr: pointer to the ITE entry * @opaque: pointer to the its_device @@ -2336,7 +2245,7 @@ static int vgic_its_save_itt(struct vgic_its *its, struct its_device *device) if (ite->irq->hw && !kvm_vgic_global_state.has_gicv4_1) return -EACCES; - ret = vgic_its_save_ite(its, device, ite, gpa, ite_esz); + ret = vgic_its_save_ite(its, device, ite, gpa); if (ret) return ret; } @@ -2377,9 +2286,8 @@ static int vgic_its_restore_itt(struct vgic_its *its, struct its_device *dev) * @ptr: GPA */ static int vgic_its_save_dte(struct vgic_its *its, struct its_device *dev, - gpa_t ptr, int dte_esz) + gpa_t ptr) { - struct kvm *kvm = its->dev->kvm; u64 val, itt_addr_field; u32 next_offset; @@ -2390,7 +2298,8 @@ static int vgic_its_save_dte(struct vgic_its *its, struct its_device *dev, (itt_addr_field << KVM_ITS_DTE_ITTADDR_SHIFT) | (dev->num_eventid_bits - 1)); val = cpu_to_le64(val); - return vgic_write_guest_lock(kvm, ptr, &val, dte_esz); + + return vgic_its_write_entry_lock(its, ptr, val, dte); } /** @@ -2438,7 +2347,7 @@ static int vgic_its_restore_dte(struct vgic_its *its, u32 id, ret = vgic_its_restore_itt(its, dev); if (ret) { - vgic_its_free_device(its->dev->kvm, dev); + vgic_its_free_device(its->dev->kvm, its, dev); return ret; } @@ -2457,7 +2366,7 @@ static int vgic_its_device_cmp(void *priv, const struct list_head *a, return 1; } -/** +/* * vgic_its_save_device_tables - Save the device table and all ITT * into guest RAM * @@ -2466,10 +2375,8 @@ static int vgic_its_device_cmp(void *priv, const struct list_head *a, */ static int vgic_its_save_device_tables(struct vgic_its *its) { - const struct vgic_its_abi *abi = vgic_its_get_abi(its); u64 baser = its->baser_device_table; struct its_device *dev; - int dte_esz = abi->dte_esz; if (!(baser & GITS_BASER_VALID)) return 0; @@ -2488,7 +2395,7 @@ static int vgic_its_save_device_tables(struct vgic_its *its) if (ret) return ret; - ret = vgic_its_save_dte(its, dev, eaddr, dte_esz); + ret = vgic_its_save_dte(its, dev, eaddr); if (ret) return ret; } @@ -2530,7 +2437,7 @@ static int handle_l1_dte(struct vgic_its *its, u32 id, void *addr, return ret; } -/** +/* * vgic_its_restore_device_tables - Restore the device table and all ITT * from guest RAM to internal data structs */ @@ -2569,7 +2476,7 @@ static int vgic_its_restore_device_tables(struct vgic_its *its) static int vgic_its_save_cte(struct vgic_its *its, struct its_collection *collection, - gpa_t gpa, int esz) + gpa_t gpa) { u64 val; @@ -2577,7 +2484,8 @@ static int vgic_its_save_cte(struct vgic_its *its, ((u64)collection->target_addr << KVM_ITS_CTE_RDBASE_SHIFT) | collection->collection_id); val = cpu_to_le64(val); - return vgic_write_guest_lock(its->dev->kvm, gpa, &val, esz); + + return vgic_its_write_entry_lock(its, gpa, val, cte); } /* @@ -2585,7 +2493,7 @@ static int vgic_its_save_cte(struct vgic_its *its, * Return +1 on success, 0 if the entry was invalid (which should be * interpreted as end-of-table), and a negative error value for generic errors. */ -static int vgic_its_restore_cte(struct vgic_its *its, gpa_t gpa, int esz) +static int vgic_its_restore_cte(struct vgic_its *its, gpa_t gpa) { struct its_collection *collection; struct kvm *kvm = its->dev->kvm; @@ -2593,8 +2501,7 @@ static int vgic_its_restore_cte(struct vgic_its *its, gpa_t gpa, int esz) u64 val; int ret; - BUG_ON(esz > sizeof(val)); - ret = kvm_read_guest_lock(kvm, gpa, &val, esz); + ret = vgic_its_read_entry_lock(its, gpa, &val, cte); if (ret) return ret; val = le64_to_cpu(val); @@ -2622,7 +2529,7 @@ static int vgic_its_restore_cte(struct vgic_its *its, gpa_t gpa, int esz) return 1; } -/** +/* * vgic_its_save_collection_table - Save the collection table into * guest RAM */ @@ -2632,7 +2539,6 @@ static int vgic_its_save_collection_table(struct vgic_its *its) u64 baser = its->baser_coll_table; gpa_t gpa = GITS_BASER_ADDR_48_to_52(baser); struct its_collection *collection; - u64 val; size_t max_size, filled = 0; int ret, cte_esz = abi->cte_esz; @@ -2642,7 +2548,7 @@ static int vgic_its_save_collection_table(struct vgic_its *its) max_size = GITS_BASER_NR_PAGES(baser) * SZ_64K; list_for_each_entry(collection, &its->collection_list, coll_list) { - ret = vgic_its_save_cte(its, collection, gpa, cte_esz); + ret = vgic_its_save_cte(its, collection, gpa); if (ret) return ret; gpa += cte_esz; @@ -2656,13 +2562,10 @@ static int vgic_its_save_collection_table(struct vgic_its *its) * table is not fully filled, add a last dummy element * with valid bit unset */ - val = 0; - BUG_ON(cte_esz > sizeof(val)); - ret = vgic_write_guest_lock(its->dev->kvm, gpa, &val, cte_esz); - return ret; + return vgic_its_write_entry_lock(its, gpa, 0ULL, cte); } -/** +/* * vgic_its_restore_collection_table - reads the collection table * in guest memory and restores the ITS internal state. Requires the * BASER registers to be restored before. @@ -2684,7 +2587,7 @@ static int vgic_its_restore_collection_table(struct vgic_its *its) max_size = GITS_BASER_NR_PAGES(baser) * SZ_64K; while (read < max_size) { - ret = vgic_its_restore_cte(its, gpa, cte_esz); + ret = vgic_its_restore_cte(its, gpa); if (ret <= 0) break; gpa += cte_esz; @@ -2700,7 +2603,7 @@ static int vgic_its_restore_collection_table(struct vgic_its *its) return ret; } -/** +/* * vgic_its_save_tables_v0 - Save the ITS tables into guest ARM * according to v0 ABI */ @@ -2715,7 +2618,7 @@ static int vgic_its_save_tables_v0(struct vgic_its *its) return vgic_its_save_collection_table(its); } -/** +/* * vgic_its_restore_tables_v0 - Restore the ITS tables from guest RAM * to internal data structs according to V0 ABI * diff --git a/arch/arm64/kvm/vgic/vgic-kvm-device.c b/arch/arm64/kvm/vgic/vgic-kvm-device.c index f48b8dab8b3d..359094f68c23 100644 --- a/arch/arm64/kvm/vgic/vgic-kvm-device.c +++ b/arch/arm64/kvm/vgic/vgic-kvm-device.c @@ -236,7 +236,12 @@ static int vgic_set_common_attr(struct kvm_device *dev, mutex_lock(&dev->kvm->arch.config_lock); - if (vgic_ready(dev->kvm) || dev->kvm->arch.vgic.nr_spis) + /* + * Either userspace has already configured NR_IRQS or + * the vgic has already been initialized and vgic_init() + * supplied a default amount of SPIs. + */ + if (dev->kvm->arch.vgic.nr_spis) ret = -EBUSY; else dev->kvm->arch.vgic.nr_spis = @@ -298,6 +303,12 @@ static int vgic_get_common_attr(struct kvm_device *dev, VGIC_NR_PRIVATE_IRQS, uaddr); break; } + case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ: { + u32 __user *uaddr = (u32 __user *)(long)attr->addr; + + r = put_user(dev->kvm->arch.vgic.mi_intid, uaddr); + break; + } } return r; @@ -338,12 +349,12 @@ int kvm_register_vgic_device(unsigned long type) int vgic_v2_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr, struct vgic_reg_attr *reg_attr) { - int cpuid; - - cpuid = FIELD_GET(KVM_DEV_ARM_VGIC_CPUID_MASK, attr->attr); + int cpuid = FIELD_GET(KVM_DEV_ARM_VGIC_CPUID_MASK, attr->attr); - reg_attr->vcpu = kvm_get_vcpu_by_id(dev->kvm, cpuid); reg_attr->addr = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK; + reg_attr->vcpu = kvm_get_vcpu_by_id(dev->kvm, cpuid); + if (!reg_attr->vcpu) + return -EINVAL; return 0; } @@ -512,7 +523,7 @@ static int vgic_v3_attr_regs_access(struct kvm_device *dev, struct vgic_reg_attr reg_attr; gpa_t addr; struct kvm_vcpu *vcpu; - bool uaccess; + bool uaccess, post_init = true; u32 val; int ret; @@ -528,6 +539,9 @@ static int vgic_v3_attr_regs_access(struct kvm_device *dev, /* Sysregs uaccess is performed by the sysreg handling code */ uaccess = false; break; + case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ: + post_init = false; + fallthrough; default: uaccess = true; } @@ -547,7 +561,7 @@ static int vgic_v3_attr_regs_access(struct kvm_device *dev, mutex_lock(&dev->kvm->arch.config_lock); - if (unlikely(!vgic_initialized(dev->kvm))) { + if (post_init != vgic_initialized(dev->kvm)) { ret = -EBUSY; goto out; } @@ -577,6 +591,19 @@ static int vgic_v3_attr_regs_access(struct kvm_device *dev, } break; } + case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ: + if (!is_write) { + val = dev->kvm->arch.vgic.mi_intid; + ret = 0; + break; + } + + ret = -EINVAL; + if ((val < VGIC_NR_PRIVATE_IRQS) && (val >= VGIC_NR_SGIS)) { + dev->kvm->arch.vgic.mi_intid = val; + ret = 0; + } + break; default: ret = -EINVAL; break; @@ -603,6 +630,7 @@ static int vgic_v3_set_attr(struct kvm_device *dev, case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: + case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ: return vgic_v3_attr_regs_access(dev, attr, true); default: return vgic_set_common_attr(dev, attr); @@ -617,6 +645,7 @@ static int vgic_v3_get_attr(struct kvm_device *dev, case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: + case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ: return vgic_v3_attr_regs_access(dev, attr, false); default: return vgic_get_common_attr(dev, attr); @@ -640,6 +669,7 @@ static int vgic_v3_has_attr(struct kvm_device *dev, case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: return vgic_v3_has_attr_regs(dev, attr); case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: + case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ: return 0; case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: { if (((attr->attr & KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK) >> diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v2.c b/arch/arm64/kvm/vgic/vgic-mmio-v2.c index e070cda86e12..f25fccb1f8e6 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio-v2.c +++ b/arch/arm64/kvm/vgic/vgic-mmio-v2.c @@ -148,7 +148,7 @@ static void vgic_mmio_write_sgir(struct kvm_vcpu *source_vcpu, if (!(targets & (1U << c))) continue; - irq = vgic_get_irq(source_vcpu->kvm, vcpu, intid); + irq = vgic_get_vcpu_irq(vcpu, intid); raw_spin_lock_irqsave(&irq->irq_lock, flags); irq->pending_latch = true; @@ -167,7 +167,7 @@ static unsigned long vgic_mmio_read_target(struct kvm_vcpu *vcpu, u64 val = 0; for (i = 0; i < len; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); val |= (u64)irq->targets << (i * 8); @@ -191,7 +191,7 @@ static void vgic_mmio_write_target(struct kvm_vcpu *vcpu, return; for (i = 0; i < len; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid + i); + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, intid + i); int target; raw_spin_lock_irqsave(&irq->irq_lock, flags); @@ -213,7 +213,7 @@ static unsigned long vgic_mmio_read_sgipend(struct kvm_vcpu *vcpu, u64 val = 0; for (i = 0; i < len; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); val |= (u64)irq->source << (i * 8); @@ -231,7 +231,7 @@ static void vgic_mmio_write_sgipendc(struct kvm_vcpu *vcpu, unsigned long flags; for (i = 0; i < len; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); raw_spin_lock_irqsave(&irq->irq_lock, flags); @@ -253,7 +253,7 @@ static void vgic_mmio_write_sgipends(struct kvm_vcpu *vcpu, unsigned long flags; for (i = 0; i < len; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); raw_spin_lock_irqsave(&irq->irq_lock, flags); diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c b/arch/arm64/kvm/vgic/vgic-mmio-v3.c index c15ee1df036a..ae4c0593d114 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio-v3.c +++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c @@ -194,7 +194,7 @@ static unsigned long vgic_mmio_read_irouter(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len) { int intid = VGIC_ADDR_TO_INTID(addr, 64); - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid); + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, intid); unsigned long ret = 0; if (!irq) @@ -220,7 +220,7 @@ static void vgic_mmio_write_irouter(struct kvm_vcpu *vcpu, if (addr & 4) return; - irq = vgic_get_irq(vcpu->kvm, NULL, intid); + irq = vgic_get_irq(vcpu->kvm, intid); if (!irq) return; @@ -277,7 +277,7 @@ static void vgic_mmio_write_v3r_ctlr(struct kvm_vcpu *vcpu, return; vgic_flush_pending_lpis(vcpu); - vgic_its_invalidate_cache(vcpu->kvm); + vgic_its_invalidate_all_caches(vcpu->kvm); atomic_set_release(&vgic_cpu->ctlr, 0); } else { ctlr = atomic_cmpxchg_acquire(&vgic_cpu->ctlr, 0, @@ -530,6 +530,7 @@ static void vgic_mmio_write_invlpi(struct kvm_vcpu *vcpu, unsigned long val) { struct vgic_irq *irq; + u32 intid; /* * If the guest wrote only to the upper 32bit part of the @@ -541,9 +542,13 @@ static void vgic_mmio_write_invlpi(struct kvm_vcpu *vcpu, if ((addr & 4) || !vgic_lpis_enabled(vcpu)) return; + intid = lower_32_bits(val); + if (intid < VGIC_MIN_LPI) + return; + vgic_set_rdist_busy(vcpu, true); - irq = vgic_get_irq(vcpu->kvm, NULL, lower_32_bits(val)); + irq = vgic_get_irq(vcpu->kvm, intid); if (irq) { vgic_its_inv_lpi(vcpu->kvm, irq); vgic_put_irq(vcpu->kvm, irq); @@ -919,8 +924,19 @@ free: return ret; } -void vgic_v3_free_redist_region(struct vgic_redist_region *rdreg) +void vgic_v3_free_redist_region(struct kvm *kvm, struct vgic_redist_region *rdreg) { + struct kvm_vcpu *vcpu; + unsigned long c; + + lockdep_assert_held(&kvm->arch.config_lock); + + /* Garbage collect the region */ + kvm_for_each_vcpu(c, vcpu, kvm) { + if (vcpu->arch.vgic_cpu.rdreg == rdreg) + vcpu->arch.vgic_cpu.rdreg = NULL; + } + list_del(&rdreg->list); kfree(rdreg); } @@ -945,7 +961,7 @@ int vgic_v3_set_redist_base(struct kvm *kvm, u32 index, u64 addr, u32 count) mutex_lock(&kvm->arch.config_lock); rdreg = vgic_v3_rdist_region_from_index(kvm, index); - vgic_v3_free_redist_region(rdreg); + vgic_v3_free_redist_region(kvm, rdreg); mutex_unlock(&kvm->arch.config_lock); return ret; } @@ -1009,7 +1025,7 @@ int vgic_v3_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr) static void vgic_v3_queue_sgi(struct kvm_vcpu *vcpu, u32 sgi, bool allow_group1) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, sgi); + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, sgi); unsigned long flags; raw_spin_lock_irqsave(&irq->irq_lock, flags); diff --git a/arch/arm64/kvm/vgic/vgic-mmio.c b/arch/arm64/kvm/vgic/vgic-mmio.c index cf76523a2194..e416e433baff 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio.c +++ b/arch/arm64/kvm/vgic/vgic-mmio.c @@ -50,7 +50,7 @@ unsigned long vgic_mmio_read_group(struct kvm_vcpu *vcpu, /* Loop over all IRQs affected by this read */ for (i = 0; i < len * 8; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); if (irq->group) value |= BIT(i); @@ -74,7 +74,7 @@ void vgic_mmio_write_group(struct kvm_vcpu *vcpu, gpa_t addr, unsigned long flags; for (i = 0; i < len * 8; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); raw_spin_lock_irqsave(&irq->irq_lock, flags); irq->group = !!(val & BIT(i)); @@ -102,7 +102,7 @@ unsigned long vgic_mmio_read_enable(struct kvm_vcpu *vcpu, /* Loop over all IRQs affected by this read */ for (i = 0; i < len * 8; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); if (irq->enabled) value |= (1U << i); @@ -122,7 +122,7 @@ void vgic_mmio_write_senable(struct kvm_vcpu *vcpu, unsigned long flags; for_each_set_bit(i, &val, len * 8) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); raw_spin_lock_irqsave(&irq->irq_lock, flags); if (irq->hw && vgic_irq_is_sgi(irq->intid)) { @@ -171,7 +171,7 @@ void vgic_mmio_write_cenable(struct kvm_vcpu *vcpu, unsigned long flags; for_each_set_bit(i, &val, len * 8) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); raw_spin_lock_irqsave(&irq->irq_lock, flags); if (irq->hw && vgic_irq_is_sgi(irq->intid) && irq->enabled) @@ -193,7 +193,7 @@ int vgic_uaccess_write_senable(struct kvm_vcpu *vcpu, unsigned long flags; for_each_set_bit(i, &val, len * 8) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); raw_spin_lock_irqsave(&irq->irq_lock, flags); irq->enabled = true; @@ -214,7 +214,7 @@ int vgic_uaccess_write_cenable(struct kvm_vcpu *vcpu, unsigned long flags; for_each_set_bit(i, &val, len * 8) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); raw_spin_lock_irqsave(&irq->irq_lock, flags); irq->enabled = false; @@ -236,7 +236,7 @@ static unsigned long __read_pending(struct kvm_vcpu *vcpu, /* Loop over all IRQs affected by this read */ for (i = 0; i < len * 8; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); unsigned long flags; bool val; @@ -309,7 +309,7 @@ static void __set_pending(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long flags; for_each_set_bit(i, &val, len * 8) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); /* GICD_ISPENDR0 SGI bits are WI when written from the guest. */ if (is_vgic_v2_sgi(vcpu, irq) && !is_user) { @@ -395,7 +395,7 @@ static void __clear_pending(struct kvm_vcpu *vcpu, unsigned long flags; for_each_set_bit(i, &val, len * 8) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); /* GICD_ICPENDR0 SGI bits are WI when written from the guest. */ if (is_vgic_v2_sgi(vcpu, irq) && !is_user) { @@ -494,7 +494,7 @@ static unsigned long __vgic_mmio_read_active(struct kvm_vcpu *vcpu, /* Loop over all IRQs affected by this read */ for (i = 0; i < len * 8; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); /* * Even for HW interrupts, don't evaluate the HW state as @@ -598,7 +598,7 @@ static void __vgic_mmio_write_cactive(struct kvm_vcpu *vcpu, int i; for_each_set_bit(i, &val, len * 8) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); vgic_mmio_change_active(vcpu, irq, false); vgic_put_irq(vcpu->kvm, irq); } @@ -635,7 +635,7 @@ static void __vgic_mmio_write_sactive(struct kvm_vcpu *vcpu, int i; for_each_set_bit(i, &val, len * 8) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); vgic_mmio_change_active(vcpu, irq, true); vgic_put_irq(vcpu->kvm, irq); } @@ -672,7 +672,7 @@ unsigned long vgic_mmio_read_priority(struct kvm_vcpu *vcpu, u64 val = 0; for (i = 0; i < len; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); val |= (u64)irq->priority << (i * 8); @@ -698,7 +698,7 @@ void vgic_mmio_write_priority(struct kvm_vcpu *vcpu, unsigned long flags; for (i = 0; i < len; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); raw_spin_lock_irqsave(&irq->irq_lock, flags); /* Narrow the priority range to what we actually support */ @@ -719,7 +719,7 @@ unsigned long vgic_mmio_read_config(struct kvm_vcpu *vcpu, int i; for (i = 0; i < len * 4; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); if (irq->config == VGIC_CONFIG_EDGE) value |= (2U << (i * 2)); @@ -750,7 +750,7 @@ void vgic_mmio_write_config(struct kvm_vcpu *vcpu, if (intid + i < VGIC_NR_PRIVATE_IRQS) continue; - irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + irq = vgic_get_irq(vcpu->kvm, intid + i); raw_spin_lock_irqsave(&irq->irq_lock, flags); if (test_bit(i * 2 + 1, &val)) @@ -775,7 +775,7 @@ u32 vgic_read_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid) if ((intid + i) < VGIC_NR_SGIS || (intid + i) >= nr_irqs) continue; - irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + irq = vgic_get_vcpu_irq(vcpu, intid + i); if (irq->config == VGIC_CONFIG_LEVEL && irq->line_level) val |= (1U << i); @@ -799,7 +799,7 @@ void vgic_write_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid, if ((intid + i) < VGIC_NR_SGIS || (intid + i) >= nr_irqs) continue; - irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + irq = vgic_get_vcpu_irq(vcpu, intid + i); /* * Line level is set irrespective of irq type diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c index 7e9cdb78f7ce..381673f03c39 100644 --- a/arch/arm64/kvm/vgic/vgic-v2.c +++ b/arch/arm64/kvm/vgic/vgic-v2.c @@ -72,7 +72,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) kvm_notify_acked_irq(vcpu->kvm, 0, intid - VGIC_NR_PRIVATE_IRQS); - irq = vgic_get_irq(vcpu->kvm, vcpu, intid); + irq = vgic_get_vcpu_irq(vcpu, intid); raw_spin_lock(&irq->irq_lock); @@ -464,17 +464,10 @@ void vgic_v2_load(struct kvm_vcpu *vcpu) kvm_vgic_global_state.vctrl_base + GICH_APR); } -void vgic_v2_vmcr_sync(struct kvm_vcpu *vcpu) -{ - struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; - - cpu_if->vgic_vmcr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VMCR); -} - void vgic_v2_put(struct kvm_vcpu *vcpu) { struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; - vgic_v2_vmcr_sync(vcpu); + cpu_if->vgic_vmcr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VMCR); cpu_if->vgic_apr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_APR); } diff --git a/arch/arm64/kvm/vgic/vgic-v3-nested.c b/arch/arm64/kvm/vgic/vgic-v3-nested.c new file mode 100644 index 000000000000..bfa5bde1f106 --- /dev/null +++ b/arch/arm64/kvm/vgic/vgic-v3-nested.c @@ -0,0 +1,409 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/cpu.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/interrupt.h> +#include <linux/io.h> +#include <linux/uaccess.h> + +#include <kvm/arm_vgic.h> + +#include <asm/kvm_arm.h> +#include <asm/kvm_emulate.h> +#include <asm/kvm_nested.h> + +#include "vgic.h" + +#define ICH_LRN(n) (ICH_LR0_EL2 + (n)) +#define ICH_AP0RN(n) (ICH_AP0R0_EL2 + (n)) +#define ICH_AP1RN(n) (ICH_AP1R0_EL2 + (n)) + +struct mi_state { + u16 eisr; + u16 elrsr; + bool pend; +}; + +/* + * The shadow registers loaded to the hardware when running a L2 guest + * with the virtual IMO/FMO bits set. + */ +struct shadow_if { + struct vgic_v3_cpu_if cpuif; + unsigned long lr_map; +}; + +static DEFINE_PER_CPU(struct shadow_if, shadow_if); + +/* + * Nesting GICv3 support + * + * On a non-nesting VM (only running at EL0/EL1), the host hypervisor + * completely controls the interrupts injected via the list registers. + * Consequently, most of the state that is modified by the guest (by ACK-ing + * and EOI-ing interrupts) is synced by KVM on each entry/exit, so that we + * keep a semi-consistent view of the interrupts. + * + * This still applies for a NV guest, but only while "InHost" (either + * running at EL2, or at EL0 with HCR_EL2.{E2H.TGE}=={1,1}. + * + * When running a L2 guest ("not InHost"), things are radically different, + * as the L1 guest is in charge of provisioning the interrupts via its own + * view of the ICH_LR*_EL2 registers, which conveniently live in the VNCR + * page. This means that the flow described above does work (there is no + * state to rebuild in the L0 hypervisor), and that most things happed on L2 + * load/put: + * + * - on L2 load: move the in-memory L1 vGIC configuration into a shadow, + * per-CPU data structure that is used to populate the actual LRs. This is + * an extra copy that we could avoid, but life is short. In the process, + * we remap any interrupt that has the HW bit set to the mapped interrupt + * on the host, should the host consider it a HW one. This allows the HW + * deactivation to take its course, such as for the timer. + * + * - on L2 put: perform the inverse transformation, so that the result of L2 + * running becomes visible to L1 in the VNCR-accessible registers. + * + * - there is nothing to do on L2 entry, as everything will have happened + * on load. However, this is the point where we detect that an interrupt + * targeting L1 and prepare the grand switcheroo. + * + * - on L2 exit: emulate the HW bit, and deactivate corresponding the L1 + * interrupt. The L0 active state will be cleared by the HW if the L1 + * interrupt was itself backed by a HW interrupt. + * + * Maintenance Interrupt (MI) management: + * + * Since the L2 guest runs the vgic in its full glory, MIs get delivered and + * used as a handover point between L2 and L1. + * + * - on delivery of a MI to L0 while L2 is running: make the L1 MI pending, + * and let it rip. This will initiate a vcpu_put() on L2, and allow L1 to + * run and process the MI. + * + * - L1 MI is a fully virtual interrupt, not linked to the host's MI. Its + * state must be computed at each entry/exit of the guest, much like we do + * it for the PMU interrupt. + * + * - because most of the ICH_*_EL2 registers live in the VNCR page, the + * quality of emulation is poor: L1 can setup the vgic so that an MI would + * immediately fire, and not observe anything until the next exit. Trying + * to read ICH_MISR_EL2 would do the trick, for example. + * + * System register emulation: + * + * We get two classes of registers: + * + * - those backed by memory (LRs, APRs, HCR, VMCR): L1 can freely access + * them, and L0 doesn't see a thing. + * + * - those that always trap (ELRSR, EISR, MISR): these are status registers + * that are built on the fly based on the in-memory state. + * + * Only L1 can access the ICH_*_EL2 registers. A non-NV L2 obviously cannot, + * and a NV L2 would either access the VNCR page provided by L1 (memory + * based registers), or see the access redirected to L1 (registers that + * trap) thanks to NV being set by L1. + */ + +bool vgic_state_is_nested(struct kvm_vcpu *vcpu) +{ + u64 xmo; + + if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) { + xmo = __vcpu_sys_reg(vcpu, HCR_EL2) & (HCR_IMO | HCR_FMO); + WARN_ONCE(xmo && xmo != (HCR_IMO | HCR_FMO), + "Separate virtual IRQ/FIQ settings not supported\n"); + + return !!xmo; + } + + return false; +} + +static struct shadow_if *get_shadow_if(void) +{ + return this_cpu_ptr(&shadow_if); +} + +static bool lr_triggers_eoi(u64 lr) +{ + return !(lr & (ICH_LR_STATE | ICH_LR_HW)) && (lr & ICH_LR_EOI); +} + +static void vgic_compute_mi_state(struct kvm_vcpu *vcpu, struct mi_state *mi_state) +{ + u16 eisr = 0, elrsr = 0; + bool pend = false; + + for (int i = 0; i < kvm_vgic_global_state.nr_lr; i++) { + u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i)); + + if (lr_triggers_eoi(lr)) + eisr |= BIT(i); + if (!(lr & ICH_LR_STATE)) + elrsr |= BIT(i); + pend |= (lr & ICH_LR_PENDING_BIT); + } + + mi_state->eisr = eisr; + mi_state->elrsr = elrsr; + mi_state->pend = pend; +} + +u16 vgic_v3_get_eisr(struct kvm_vcpu *vcpu) +{ + struct mi_state mi_state; + + vgic_compute_mi_state(vcpu, &mi_state); + return mi_state.eisr; +} + +u16 vgic_v3_get_elrsr(struct kvm_vcpu *vcpu) +{ + struct mi_state mi_state; + + vgic_compute_mi_state(vcpu, &mi_state); + return mi_state.elrsr; +} + +u64 vgic_v3_get_misr(struct kvm_vcpu *vcpu) +{ + struct mi_state mi_state; + u64 reg = 0, hcr, vmcr; + + hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2); + vmcr = __vcpu_sys_reg(vcpu, ICH_VMCR_EL2); + + vgic_compute_mi_state(vcpu, &mi_state); + + if (mi_state.eisr) + reg |= ICH_MISR_EL2_EOI; + + if (__vcpu_sys_reg(vcpu, ICH_HCR_EL2) & ICH_HCR_EL2_UIE) { + int used_lrs = kvm_vgic_global_state.nr_lr; + + used_lrs -= hweight16(mi_state.elrsr); + reg |= (used_lrs <= 1) ? ICH_MISR_EL2_U : 0; + } + + if ((hcr & ICH_HCR_EL2_LRENPIE) && FIELD_GET(ICH_HCR_EL2_EOIcount_MASK, hcr)) + reg |= ICH_MISR_EL2_LRENP; + + if ((hcr & ICH_HCR_EL2_NPIE) && !mi_state.pend) + reg |= ICH_MISR_EL2_NP; + + if ((hcr & ICH_HCR_EL2_VGrp0EIE) && (vmcr & ICH_VMCR_ENG0_MASK)) + reg |= ICH_MISR_EL2_VGrp0E; + + if ((hcr & ICH_HCR_EL2_VGrp0DIE) && !(vmcr & ICH_VMCR_ENG0_MASK)) + reg |= ICH_MISR_EL2_VGrp0D; + + if ((hcr & ICH_HCR_EL2_VGrp1EIE) && (vmcr & ICH_VMCR_ENG1_MASK)) + reg |= ICH_MISR_EL2_VGrp1E; + + if ((hcr & ICH_HCR_EL2_VGrp1DIE) && !(vmcr & ICH_VMCR_ENG1_MASK)) + reg |= ICH_MISR_EL2_VGrp1D; + + return reg; +} + +/* + * For LRs which have HW bit set such as timer interrupts, we modify them to + * have the host hardware interrupt number instead of the virtual one programmed + * by the guest hypervisor. + */ +static void vgic_v3_create_shadow_lr(struct kvm_vcpu *vcpu, + struct vgic_v3_cpu_if *s_cpu_if) +{ + unsigned long lr_map = 0; + int index = 0; + + for (int i = 0; i < kvm_vgic_global_state.nr_lr; i++) { + u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i)); + struct vgic_irq *irq; + + if (!(lr & ICH_LR_STATE)) + lr = 0; + + if (!(lr & ICH_LR_HW)) + goto next; + + /* We have the HW bit set, check for validity of pINTID */ + irq = vgic_get_vcpu_irq(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr)); + if (!irq || !irq->hw || irq->intid > VGIC_MAX_SPI ) { + /* There was no real mapping, so nuke the HW bit */ + lr &= ~ICH_LR_HW; + if (irq) + vgic_put_irq(vcpu->kvm, irq); + goto next; + } + + /* It is illegal to have the EOI bit set with HW */ + lr &= ~ICH_LR_EOI; + + /* Translate the virtual mapping to the real one */ + lr &= ~ICH_LR_PHYS_ID_MASK; + lr |= FIELD_PREP(ICH_LR_PHYS_ID_MASK, (u64)irq->hwintid); + + vgic_put_irq(vcpu->kvm, irq); + +next: + s_cpu_if->vgic_lr[index] = lr; + if (lr) { + lr_map |= BIT(i); + index++; + } + } + + container_of(s_cpu_if, struct shadow_if, cpuif)->lr_map = lr_map; + s_cpu_if->used_lrs = index; +} + +void vgic_v3_sync_nested(struct kvm_vcpu *vcpu) +{ + struct shadow_if *shadow_if = get_shadow_if(); + int i, index = 0; + + for_each_set_bit(i, &shadow_if->lr_map, kvm_vgic_global_state.nr_lr) { + u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i)); + struct vgic_irq *irq; + + if (!(lr & ICH_LR_HW) || !(lr & ICH_LR_STATE)) + goto next; + + /* + * If we had a HW lr programmed by the guest hypervisor, we + * need to emulate the HW effect between the guest hypervisor + * and the nested guest. + */ + irq = vgic_get_vcpu_irq(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr)); + if (WARN_ON(!irq)) /* Shouldn't happen as we check on load */ + goto next; + + lr = __gic_v3_get_lr(index); + if (!(lr & ICH_LR_STATE)) + irq->active = false; + + vgic_put_irq(vcpu->kvm, irq); + next: + index++; + } +} + +static void vgic_v3_create_shadow_state(struct kvm_vcpu *vcpu, + struct vgic_v3_cpu_if *s_cpu_if) +{ + struct vgic_v3_cpu_if *host_if = &vcpu->arch.vgic_cpu.vgic_v3; + u64 val = 0; + int i; + + /* + * If we're on a system with a broken vgic that requires + * trapping, propagate the trapping requirements. + * + * Ah, the smell of rotten fruits... + */ + if (static_branch_unlikely(&vgic_v3_cpuif_trap)) + val = host_if->vgic_hcr & (ICH_HCR_EL2_TALL0 | ICH_HCR_EL2_TALL1 | + ICH_HCR_EL2_TC | ICH_HCR_EL2_TDIR); + s_cpu_if->vgic_hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2) | val; + s_cpu_if->vgic_vmcr = __vcpu_sys_reg(vcpu, ICH_VMCR_EL2); + s_cpu_if->vgic_sre = host_if->vgic_sre; + + for (i = 0; i < 4; i++) { + s_cpu_if->vgic_ap0r[i] = __vcpu_sys_reg(vcpu, ICH_AP0RN(i)); + s_cpu_if->vgic_ap1r[i] = __vcpu_sys_reg(vcpu, ICH_AP1RN(i)); + } + + vgic_v3_create_shadow_lr(vcpu, s_cpu_if); +} + +void vgic_v3_load_nested(struct kvm_vcpu *vcpu) +{ + struct shadow_if *shadow_if = get_shadow_if(); + struct vgic_v3_cpu_if *cpu_if = &shadow_if->cpuif; + + BUG_ON(!vgic_state_is_nested(vcpu)); + + vgic_v3_create_shadow_state(vcpu, cpu_if); + + __vgic_v3_restore_vmcr_aprs(cpu_if); + __vgic_v3_activate_traps(cpu_if); + + __vgic_v3_restore_state(cpu_if); + + /* + * Propagate the number of used LRs for the benefit of the HYP + * GICv3 emulation code. Yes, this is a pretty sorry hack. + */ + vcpu->arch.vgic_cpu.vgic_v3.used_lrs = cpu_if->used_lrs; +} + +void vgic_v3_put_nested(struct kvm_vcpu *vcpu) +{ + struct shadow_if *shadow_if = get_shadow_if(); + struct vgic_v3_cpu_if *s_cpu_if = &shadow_if->cpuif; + u64 val; + int i; + + __vgic_v3_save_vmcr_aprs(s_cpu_if); + __vgic_v3_deactivate_traps(s_cpu_if); + __vgic_v3_save_state(s_cpu_if); + + /* + * Translate the shadow state HW fields back to the virtual ones + * before copying the shadow struct back to the nested one. + */ + val = __vcpu_sys_reg(vcpu, ICH_HCR_EL2); + val &= ~ICH_HCR_EL2_EOIcount_MASK; + val |= (s_cpu_if->vgic_hcr & ICH_HCR_EL2_EOIcount_MASK); + __vcpu_sys_reg(vcpu, ICH_HCR_EL2) = val; + __vcpu_sys_reg(vcpu, ICH_VMCR_EL2) = s_cpu_if->vgic_vmcr; + + for (i = 0; i < 4; i++) { + __vcpu_sys_reg(vcpu, ICH_AP0RN(i)) = s_cpu_if->vgic_ap0r[i]; + __vcpu_sys_reg(vcpu, ICH_AP1RN(i)) = s_cpu_if->vgic_ap1r[i]; + } + + for_each_set_bit(i, &shadow_if->lr_map, kvm_vgic_global_state.nr_lr) { + val = __vcpu_sys_reg(vcpu, ICH_LRN(i)); + + val &= ~ICH_LR_STATE; + val |= s_cpu_if->vgic_lr[i] & ICH_LR_STATE; + + __vcpu_sys_reg(vcpu, ICH_LRN(i)) = val; + s_cpu_if->vgic_lr[i] = 0; + } + + shadow_if->lr_map = 0; + vcpu->arch.vgic_cpu.vgic_v3.used_lrs = 0; +} + +/* + * If we exit a L2 VM with a pending maintenance interrupt from the GIC, + * then we need to forward this to L1 so that it can re-sync the appropriate + * LRs and sample level triggered interrupts again. + */ +void vgic_v3_handle_nested_maint_irq(struct kvm_vcpu *vcpu) +{ + bool state = read_sysreg_s(SYS_ICH_MISR_EL2); + + /* This will force a switch back to L1 if the level is high */ + kvm_vgic_inject_irq(vcpu->kvm, vcpu, + vcpu->kvm->arch.vgic.mi_intid, state, vcpu); + + sysreg_clear_set_s(SYS_ICH_HCR_EL2, ICH_HCR_EL2_En, 0); +} + +void vgic_v3_nested_update_mi(struct kvm_vcpu *vcpu) +{ + bool level; + + level = __vcpu_sys_reg(vcpu, ICH_HCR_EL2) & ICH_HCR_EL2_En; + if (level) + level &= vgic_v3_get_misr(vcpu); + kvm_vgic_inject_irq(vcpu->kvm, vcpu, + vcpu->kvm->arch.vgic.mi_intid, level, vcpu); +} diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 4ea3340786b9..b9ad7c42c5b0 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -6,6 +6,7 @@ #include <linux/kstrtox.h> #include <linux/kvm.h> #include <linux/kvm_host.h> +#include <linux/string_choices.h> #include <kvm/arm_vgic.h> #include <asm/kvm_hyp.h> #include <asm/kvm_mmu.h> @@ -23,7 +24,7 @@ void vgic_v3_set_underflow(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3; - cpuif->vgic_hcr |= ICH_HCR_UIE; + cpuif->vgic_hcr |= ICH_HCR_EL2_UIE; } static bool lr_signals_eoi_mi(u64 lr_val) @@ -41,7 +42,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); - cpuif->vgic_hcr &= ~ICH_HCR_UIE; + cpuif->vgic_hcr &= ~ICH_HCR_EL2_UIE; for (lr = 0; lr < cpuif->used_lrs; lr++) { u64 val = cpuif->vgic_lr[lr]; @@ -65,7 +66,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) kvm_notify_acked_irq(vcpu->kvm, 0, intid - VGIC_NR_PRIVATE_IRQS); - irq = vgic_get_irq(vcpu->kvm, vcpu, intid); + irq = vgic_get_vcpu_irq(vcpu, intid); if (!irq) /* An LPI could have been unmapped. */ continue; @@ -283,23 +284,34 @@ void vgic_v3_enable(struct kvm_vcpu *vcpu) vgic_v3->vgic_sre = 0; } - vcpu->arch.vgic_cpu.num_id_bits = (kvm_vgic_global_state.ich_vtr_el2 & - ICH_VTR_ID_BITS_MASK) >> - ICH_VTR_ID_BITS_SHIFT; - vcpu->arch.vgic_cpu.num_pri_bits = ((kvm_vgic_global_state.ich_vtr_el2 & - ICH_VTR_PRI_BITS_MASK) >> - ICH_VTR_PRI_BITS_SHIFT) + 1; + vcpu->arch.vgic_cpu.num_id_bits = FIELD_GET(ICH_VTR_EL2_IDbits, + kvm_vgic_global_state.ich_vtr_el2); + vcpu->arch.vgic_cpu.num_pri_bits = FIELD_GET(ICH_VTR_EL2_PRIbits, + kvm_vgic_global_state.ich_vtr_el2) + 1; /* Get the show on the road... */ - vgic_v3->vgic_hcr = ICH_HCR_EN; + vgic_v3->vgic_hcr = ICH_HCR_EL2_En; +} + +void vcpu_set_ich_hcr(struct kvm_vcpu *vcpu) +{ + struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3; + + /* Hide GICv3 sysreg if necessary */ + if (!kvm_has_gicv3(vcpu->kvm)) { + vgic_v3->vgic_hcr |= (ICH_HCR_EL2_TALL0 | ICH_HCR_EL2_TALL1 | + ICH_HCR_EL2_TC); + return; + } + if (group0_trap) - vgic_v3->vgic_hcr |= ICH_HCR_TALL0; + vgic_v3->vgic_hcr |= ICH_HCR_EL2_TALL0; if (group1_trap) - vgic_v3->vgic_hcr |= ICH_HCR_TALL1; + vgic_v3->vgic_hcr |= ICH_HCR_EL2_TALL1; if (common_trap) - vgic_v3->vgic_hcr |= ICH_HCR_TC; + vgic_v3->vgic_hcr |= ICH_HCR_EL2_TC; if (dir_trap) - vgic_v3->vgic_hcr |= ICH_HCR_TDIR; + vgic_v3->vgic_hcr |= ICH_HCR_EL2_TDIR; } int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq) @@ -370,7 +382,7 @@ static void map_all_vpes(struct kvm *kvm) dist->its_vm.vpes[i]->irq)); } -/** +/* * vgic_v3_save_pending_tables - Save the pending tables into guest RAM * kvm lock and all vcpu lock must be held */ @@ -619,8 +631,8 @@ static const struct midr_range broken_seis[] = { static bool vgic_v3_broken_seis(void) { - return ((kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_SEIS_MASK) && - is_midr_in_range_list(read_cpuid_id(), broken_seis)); + return ((kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_EL2_SEIS) && + is_midr_in_range_list(broken_seis)); } /** @@ -651,9 +663,9 @@ int vgic_v3_probe(const struct gic_kvm_info *info) if (info->has_v4) { kvm_vgic_global_state.has_gicv4 = gicv4_enable; kvm_vgic_global_state.has_gicv4_1 = info->has_v4_1 && gicv4_enable; - kvm_info("GICv4%s support %sabled\n", + kvm_info("GICv4%s support %s\n", kvm_vgic_global_state.has_gicv4_1 ? ".1" : "", - gicv4_enable ? "en" : "dis"); + str_enabled_disabled(gicv4_enable)); } kvm_vgic_global_state.vcpu_base = 0; @@ -693,10 +705,10 @@ int vgic_v3_probe(const struct gic_kvm_info *info) if (vgic_v3_broken_seis()) { kvm_info("GICv3 with broken locally generated SEI\n"); - kvm_vgic_global_state.ich_vtr_el2 &= ~ICH_VTR_SEIS_MASK; + kvm_vgic_global_state.ich_vtr_el2 &= ~ICH_VTR_EL2_SEIS; group0_trap = true; group1_trap = true; - if (ich_vtr_el2 & ICH_VTR_TDS_MASK) + if (ich_vtr_el2 & ICH_VTR_EL2_TDS) dir_trap = true; else common_trap = true; @@ -722,15 +734,14 @@ void vgic_v3_load(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - /* - * If dealing with a GICv2 emulation on GICv3, VMCR_EL2.VFIQen - * is dependent on ICC_SRE_EL1.SRE, and we have to perform the - * VMCR_EL2 save/restore in the world switch. - */ - if (likely(cpu_if->vgic_sre)) - kvm_call_hyp(__vgic_v3_write_vmcr, cpu_if->vgic_vmcr); + /* If the vgic is nested, perform the full state loading */ + if (vgic_state_is_nested(vcpu)) { + vgic_v3_load_nested(vcpu); + return; + } - kvm_call_hyp(__vgic_v3_restore_aprs, cpu_if); + if (likely(!is_protected_kvm_enabled())) + kvm_call_hyp(__vgic_v3_restore_vmcr_aprs, cpu_if); if (has_vhe()) __vgic_v3_activate_traps(cpu_if); @@ -738,23 +749,18 @@ void vgic_v3_load(struct kvm_vcpu *vcpu) WARN_ON(vgic_v4_load(vcpu)); } -void vgic_v3_vmcr_sync(struct kvm_vcpu *vcpu) -{ - struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - - if (likely(cpu_if->vgic_sre)) - cpu_if->vgic_vmcr = kvm_call_hyp_ret(__vgic_v3_read_vmcr); -} - void vgic_v3_put(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - WARN_ON(vgic_v4_put(vcpu)); - - vgic_v3_vmcr_sync(vcpu); + if (vgic_state_is_nested(vcpu)) { + vgic_v3_put_nested(vcpu); + return; + } - kvm_call_hyp(__vgic_v3_save_aprs, cpu_if); + if (likely(!is_protected_kvm_enabled())) + kvm_call_hyp(__vgic_v3_save_vmcr_aprs, cpu_if); + WARN_ON(vgic_v4_put(vcpu)); if (has_vhe()) __vgic_v3_deactivate_traps(cpu_if); diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c index 74a67ad87f29..c7de6154627c 100644 --- a/arch/arm64/kvm/vgic/vgic-v4.c +++ b/arch/arm64/kvm/vgic/vgic-v4.c @@ -123,7 +123,7 @@ static void vgic_v4_enable_vsgis(struct kvm_vcpu *vcpu) * IRQ. The SGI code will do its magic. */ for (i = 0; i < VGIC_NR_SGIS; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, i); + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, i); struct irq_desc *desc; unsigned long flags; int ret; @@ -160,7 +160,7 @@ static void vgic_v4_disable_vsgis(struct kvm_vcpu *vcpu) int i; for (i = 0; i < VGIC_NR_SGIS; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, i); + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, i); struct irq_desc *desc; unsigned long flags; int ret; @@ -336,6 +336,22 @@ void vgic_v4_teardown(struct kvm *kvm) its_vm->vpes = NULL; } +static inline bool vgic_v4_want_doorbell(struct kvm_vcpu *vcpu) +{ + if (vcpu_get_flag(vcpu, IN_WFI)) + return true; + + if (likely(!vcpu_has_nv(vcpu))) + return false; + + /* + * GICv4 hardware is only ever used for the L1. Mark the vPE (i.e. the + * L1 context) nonresident and request a doorbell to kick us out of the + * L2 when an IRQ becomes pending. + */ + return vcpu_get_flag(vcpu, IN_NESTED_ERET); +} + int vgic_v4_put(struct kvm_vcpu *vcpu) { struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe; @@ -343,7 +359,7 @@ int vgic_v4_put(struct kvm_vcpu *vcpu) if (!vgic_supports_direct_msis(vcpu->kvm) || !vpe->resident) return 0; - return its_make_vpe_non_resident(vpe, !!vcpu_get_flag(vcpu, IN_WFI)); + return its_make_vpe_non_resident(vpe, vgic_v4_want_doorbell(vcpu)); } int vgic_v4_load(struct kvm_vcpu *vcpu) @@ -415,7 +431,7 @@ int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int virq, struct vgic_irq *irq; struct its_vlpi_map map; unsigned long flags; - int ret; + int ret = 0; if (!vgic_supports_direct_msis(kvm)) return 0; @@ -430,10 +446,15 @@ int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int virq, mutex_lock(&its->its_lock); - /* Perform the actual DevID/EventID -> LPI translation. */ - ret = vgic_its_resolve_lpi(kvm, its, irq_entry->msi.devid, - irq_entry->msi.data, &irq); - if (ret) + /* + * Perform the actual DevID/EventID -> LPI translation. + * + * Silently exit if translation fails as the guest (or userspace!) has + * managed to do something stupid. Emulated LPI injection will still + * work if the guest figures itself out at a later time. + */ + if (vgic_its_resolve_lpi(kvm, its, irq_entry->msi.devid, + irq_entry->msi.data, &irq)) goto out; /* Silently exit if the vLPI is already mapped */ @@ -512,7 +533,7 @@ int kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int virq, if (ret) goto out; - WARN_ON(!(irq->hw && irq->host_irq == virq)); + WARN_ON(irq->hw && irq->host_irq != virq); if (irq->hw) { atomic_dec(&irq->target_vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vlpi_count); irq->hw = false; diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index 4ec93587c8cd..8f8096d48925 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -29,14 +29,18 @@ struct vgic_global kvm_vgic_global_state __ro_after_init = { * its->cmd_lock (mutex) * its->its_lock (mutex) * vgic_cpu->ap_list_lock must be taken with IRQs disabled - * kvm->lpi_list_lock must be taken with IRQs disabled - * vgic_dist->lpi_xa.xa_lock must be taken with IRQs disabled - * vgic_irq->irq_lock must be taken with IRQs disabled + * vgic_dist->lpi_xa.xa_lock must be taken with IRQs disabled + * vgic_irq->irq_lock must be taken with IRQs disabled * * As the ap_list_lock might be taken from the timer interrupt handler, * we have to disable IRQs before taking this lock and everything lower * than it. * + * The config_lock has additional ordering requirements: + * kvm->slots_lock + * kvm->srcu + * kvm->arch.config_lock + * * If you need to take multiple locks, always take the upper lock first, * then the lower ones, e.g. first take the its_lock, then the irq_lock. * If you are already holding a lock and need to take a higher one, you @@ -80,17 +84,11 @@ static struct vgic_irq *vgic_get_lpi(struct kvm *kvm, u32 intid) * struct vgic_irq. It also increases the refcount, so any caller is expected * to call vgic_put_irq() once it's finished with this IRQ. */ -struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, - u32 intid) +struct vgic_irq *vgic_get_irq(struct kvm *kvm, u32 intid) { - /* SGIs and PPIs */ - if (intid <= VGIC_MAX_PRIVATE) { - intid = array_index_nospec(intid, VGIC_MAX_PRIVATE + 1); - return &vcpu->arch.vgic_cpu.private_irqs[intid]; - } - /* SPIs */ - if (intid < (kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS)) { + if (intid >= VGIC_NR_PRIVATE_IRQS && + intid < (kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS)) { intid = array_index_nospec(intid, kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS); return &kvm->arch.vgic.spis[intid - VGIC_NR_PRIVATE_IRQS]; } @@ -102,6 +100,20 @@ struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, return NULL; } +struct vgic_irq *vgic_get_vcpu_irq(struct kvm_vcpu *vcpu, u32 intid) +{ + if (WARN_ON(!vcpu)) + return NULL; + + /* SGIs and PPIs */ + if (intid < VGIC_NR_PRIVATE_IRQS) { + intid = array_index_nospec(intid, VGIC_NR_PRIVATE_IRQS); + return &vcpu->arch.vgic_cpu.private_irqs[intid]; + } + + return vgic_get_irq(vcpu->kvm, intid); +} + /* * We can't do anything in here, because we lack the kvm pointer to * lock and remove the item from the lpi_list. So we keep this function @@ -126,7 +138,6 @@ void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq) __xa_erase(&dist->lpi_xa, irq->intid); xa_unlock_irqrestore(&dist->lpi_xa, flags); - atomic_dec(&dist->lpi_count); kfree_rcu(irq, rcu); } @@ -315,7 +326,7 @@ static bool vgic_validate_injection(struct vgic_irq *irq, bool level, void *owne * with all locks dropped. */ bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq, - unsigned long flags) + unsigned long flags) __releases(&irq->irq_lock) { struct kvm_vcpu *vcpu; @@ -434,7 +445,10 @@ int kvm_vgic_inject_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, trace_vgic_update_irq_pending(vcpu ? vcpu->vcpu_idx : 0, intid, level); - irq = vgic_get_irq(kvm, vcpu, intid); + if (intid < VGIC_NR_PRIVATE_IRQS) + irq = vgic_get_vcpu_irq(vcpu, intid); + else + irq = vgic_get_irq(kvm, intid); if (!irq) return -EINVAL; @@ -496,7 +510,7 @@ static inline void kvm_vgic_unmap_irq(struct vgic_irq *irq) int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, unsigned int host_irq, u32 vintid, struct irq_ops *ops) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, vintid); + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, vintid); unsigned long flags; int ret; @@ -521,7 +535,7 @@ int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, unsigned int host_irq, */ void kvm_vgic_reset_mapped_irq(struct kvm_vcpu *vcpu, u32 vintid) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, vintid); + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, vintid); unsigned long flags; if (!irq->hw) @@ -544,7 +558,7 @@ int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int vintid) if (!vgic_initialized(vcpu->kvm)) return -EAGAIN; - irq = vgic_get_irq(vcpu->kvm, vcpu, vintid); + irq = vgic_get_vcpu_irq(vcpu, vintid); BUG_ON(!irq); raw_spin_lock_irqsave(&irq->irq_lock, flags); @@ -557,7 +571,7 @@ int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int vintid) int kvm_vgic_get_map(struct kvm_vcpu *vcpu, unsigned int vintid) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, vintid); + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, vintid); unsigned long flags; int ret = -1; @@ -593,7 +607,7 @@ int kvm_vgic_set_owner(struct kvm_vcpu *vcpu, unsigned int intid, void *owner) if (!irq_is_ppi(intid) && !vgic_valid_spi(vcpu->kvm, intid)) return -EINVAL; - irq = vgic_get_irq(vcpu->kvm, vcpu, intid); + irq = vgic_get_vcpu_irq(vcpu, intid); raw_spin_lock_irqsave(&irq->irq_lock, flags); if (irq->owner && irq->owner != owner) ret = -EEXIST; @@ -858,6 +872,15 @@ void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) { int used_lrs; + /* If nesting, emulate the HW effect from L0 to L1 */ + if (vgic_state_is_nested(vcpu)) { + vgic_v3_sync_nested(vcpu); + return; + } + + if (vcpu_has_nv(vcpu)) + vgic_v3_nested_update_mi(vcpu); + /* An empty ap_list_head implies used_lrs == 0 */ if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head)) return; @@ -887,6 +910,35 @@ static inline void vgic_restore_state(struct kvm_vcpu *vcpu) void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) { /* + * If in a nested state, we must return early. Two possibilities: + * + * - If we have any pending IRQ for the guest and the guest + * expects IRQs to be handled in its virtual EL2 mode (the + * virtual IMO bit is set) and it is not already running in + * virtual EL2 mode, then we have to emulate an IRQ + * exception to virtual EL2. + * + * We do that by placing a request to ourselves which will + * abort the entry procedure and inject the exception at the + * beginning of the run loop. + * + * - Otherwise, do exactly *NOTHING*. The guest state is + * already loaded, and we can carry on with running it. + * + * If we have NV, but are not in a nested state, compute the + * maintenance interrupt state, as it may fire. + */ + if (vgic_state_is_nested(vcpu)) { + if (kvm_vgic_vcpu_pending_irq(vcpu)) + kvm_make_request(KVM_REQ_GUEST_HYP_IRQ_PENDING, vcpu); + + return; + } + + if (vcpu_has_nv(vcpu)) + vgic_v3_nested_update_mi(vcpu); + + /* * If there are no virtual interrupts active or pending for this * VCPU, then there is no work to do and we can bail out without * taking any lock. There is a potential race with someone injecting @@ -919,10 +971,13 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) void kvm_vgic_load(struct kvm_vcpu *vcpu) { - if (unlikely(!vgic_initialized(vcpu->kvm))) + if (unlikely(!irqchip_in_kernel(vcpu->kvm) || !vgic_initialized(vcpu->kvm))) { + if (has_vhe() && static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) + __vgic_v3_activate_traps(&vcpu->arch.vgic_cpu.vgic_v3); return; + } - if (kvm_vgic_global_state.type == VGIC_V2) + if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) vgic_v2_load(vcpu); else vgic_v3_load(vcpu); @@ -930,26 +985,18 @@ void kvm_vgic_load(struct kvm_vcpu *vcpu) void kvm_vgic_put(struct kvm_vcpu *vcpu) { - if (unlikely(!vgic_initialized(vcpu->kvm))) + if (unlikely(!irqchip_in_kernel(vcpu->kvm) || !vgic_initialized(vcpu->kvm))) { + if (has_vhe() && static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) + __vgic_v3_deactivate_traps(&vcpu->arch.vgic_cpu.vgic_v3); return; + } - if (kvm_vgic_global_state.type == VGIC_V2) + if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) vgic_v2_put(vcpu); else vgic_v3_put(vcpu); } -void kvm_vgic_vmcr_sync(struct kvm_vcpu *vcpu) -{ - if (unlikely(!irqchip_in_kernel(vcpu->kvm))) - return; - - if (kvm_vgic_global_state.type == VGIC_V2) - vgic_v2_vmcr_sync(vcpu); - else - vgic_v3_vmcr_sync(vcpu); -} - int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; @@ -1010,7 +1057,7 @@ bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int vintid) if (!vgic_initialized(vcpu->kvm)) return false; - irq = vgic_get_irq(vcpu->kvm, vcpu, vintid); + irq = vgic_get_vcpu_irq(vcpu, vintid); raw_spin_lock_irqsave(&irq->irq_lock, flags); map_is_active = irq->hw && irq->active; raw_spin_unlock_irqrestore(&irq->irq_lock, flags); diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index 0c2b82de8fa3..0c5a63712702 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -16,6 +16,7 @@ #define INTERRUPT_ID_BITS_SPIS 10 #define INTERRUPT_ID_BITS_ITS 16 +#define VGIC_LPI_MAX_INTID ((1 << INTERRUPT_ID_BITS_ITS) - 1) #define VGIC_PRI_BITS 5 #define vgic_irq_is_sgi(intid) ((intid) < VGIC_NR_SGIS) @@ -178,14 +179,14 @@ int vgic_v2_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr, const struct vgic_register_region * vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev, gpa_t addr, int len); -struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, - u32 intid); +struct vgic_irq *vgic_get_irq(struct kvm *kvm, u32 intid); +struct vgic_irq *vgic_get_vcpu_irq(struct kvm_vcpu *vcpu, u32 intid); void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq); bool vgic_get_phys_line_level(struct vgic_irq *irq); void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending); void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active); bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq, - unsigned long flags); + unsigned long flags) __releases(&irq->irq_lock); void vgic_kick_vcpus(struct kvm *kvm); void vgic_irq_handle_resampling(struct vgic_irq *irq, bool lr_deactivated, bool lr_pending); @@ -214,7 +215,6 @@ int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address, void vgic_v2_init_lrs(void); void vgic_v2_load(struct kvm_vcpu *vcpu); void vgic_v2_put(struct kvm_vcpu *vcpu); -void vgic_v2_vmcr_sync(struct kvm_vcpu *vcpu); void vgic_v2_save_state(struct kvm_vcpu *vcpu); void vgic_v2_restore_state(struct kvm_vcpu *vcpu); @@ -253,7 +253,6 @@ bool vgic_v3_check_base(struct kvm *kvm); void vgic_v3_load(struct kvm_vcpu *vcpu); void vgic_v3_put(struct kvm_vcpu *vcpu); -void vgic_v3_vmcr_sync(struct kvm_vcpu *vcpu); bool vgic_has_its(struct kvm *kvm); int kvm_vgic_register_its_device(void); @@ -317,7 +316,7 @@ vgic_v3_rd_region_size(struct kvm *kvm, struct vgic_redist_region *rdreg) struct vgic_redist_region *vgic_v3_rdist_region_from_index(struct kvm *kvm, u32 index); -void vgic_v3_free_redist_region(struct vgic_redist_region *rdreg); +void vgic_v3_free_redist_region(struct kvm *kvm, struct vgic_redist_region *rdreg); bool vgic_v3_rdist_overlap(struct kvm *kvm, gpa_t base, size_t size); @@ -330,14 +329,11 @@ static inline bool vgic_dist_overlap(struct kvm *kvm, gpa_t base, size_t size) } bool vgic_lpis_enabled(struct kvm_vcpu *vcpu); -int vgic_copy_lpi_list(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 **intid_ptr); int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its, u32 devid, u32 eventid, struct vgic_irq **irq); struct vgic_its *vgic_msi_to_its(struct kvm *kvm, struct kvm_msi *msi); int vgic_its_inject_cached_translation(struct kvm *kvm, struct kvm_msi *msi); -void vgic_lpi_translation_cache_init(struct kvm *kvm); -void vgic_lpi_translation_cache_destroy(struct kvm *kvm); -void vgic_its_invalidate_cache(struct kvm *kvm); +void vgic_its_invalidate_all_caches(struct kvm *kvm); /* GICv4.1 MMIO interface */ int vgic_its_inv_lpi(struct kvm *kvm, struct vgic_irq *irq); @@ -350,4 +346,17 @@ void vgic_v4_configure_vsgis(struct kvm *kvm); void vgic_v4_get_vlpi_state(struct vgic_irq *irq, bool *val); int vgic_v4_request_vpe_irq(struct kvm_vcpu *vcpu, int irq); +void vcpu_set_ich_hcr(struct kvm_vcpu *vcpu); + +static inline bool kvm_has_gicv3(struct kvm *kvm) +{ + return kvm_has_feat(kvm, ID_AA64PFR0_EL1, GIC, IMP); +} + +void vgic_v3_sync_nested(struct kvm_vcpu *vcpu); +void vgic_v3_load_nested(struct kvm_vcpu *vcpu); +void vgic_v3_put_nested(struct kvm_vcpu *vcpu); +void vgic_v3_handle_nested_maint_irq(struct kvm_vcpu *vcpu); +void vgic_v3_nested_update_mi(struct kvm_vcpu *vcpu); + #endif diff --git a/arch/arm64/kvm/vmid.c b/arch/arm64/kvm/vmid.c index 806223b7022a..7fe8ba1a2851 100644 --- a/arch/arm64/kvm/vmid.c +++ b/arch/arm64/kvm/vmid.c @@ -135,11 +135,10 @@ void kvm_arm_vmid_clear_active(void) atomic64_set(this_cpu_ptr(&active_vmids), VMID_ACTIVE_INVALID); } -bool kvm_arm_vmid_update(struct kvm_vmid *kvm_vmid) +void kvm_arm_vmid_update(struct kvm_vmid *kvm_vmid) { unsigned long flags; u64 vmid, old_active_vmid; - bool updated = false; vmid = atomic64_read(&kvm_vmid->id); @@ -157,21 +156,17 @@ bool kvm_arm_vmid_update(struct kvm_vmid *kvm_vmid) if (old_active_vmid != 0 && vmid_gen_match(vmid) && 0 != atomic64_cmpxchg_relaxed(this_cpu_ptr(&active_vmids), old_active_vmid, vmid)) - return false; + return; raw_spin_lock_irqsave(&cpu_vmid_lock, flags); /* Check that our VMID belongs to the current generation. */ vmid = atomic64_read(&kvm_vmid->id); - if (!vmid_gen_match(vmid)) { + if (!vmid_gen_match(vmid)) vmid = new_vmid(kvm_vmid); - updated = true; - } atomic64_set(this_cpu_ptr(&active_vmids), vmid); raw_spin_unlock_irqrestore(&cpu_vmid_lock, flags); - - return updated; } /* |