Diffstat (limited to 'arch/arm64/kvm')
-rw-r--r--  arch/arm64/kvm/Kconfig | 18
-rw-r--r--  arch/arm64/kvm/Makefile | 13
-rw-r--r--  arch/arm64/kvm/arch_timer.c | 212
-rw-r--r--  arch/arm64/kvm/arm.c | 574
-rw-r--r--  arch/arm64/kvm/at.c | 1446
-rw-r--r--  arch/arm64/kvm/debug.c | 416
-rw-r--r--  arch/arm64/kvm/emulate-nested.c | 600
-rw-r--r--  arch/arm64/kvm/fpsimd.c | 131
-rw-r--r--  arch/arm64/kvm/guest.c | 68
-rw-r--r--  arch/arm64/kvm/handle_exit.c | 90
-rw-r--r--  arch/arm64/kvm/hyp/Makefile | 2
-rw-r--r--  arch/arm64/kvm/hyp/aarch32.c | 18
-rw-r--r--  arch/arm64/kvm/hyp/entry.S | 13
-rw-r--r--  arch/arm64/kvm/hyp/fpsimd.S | 6
-rw-r--r--  arch/arm64/kvm/hyp/include/hyp/debug-sr.h | 50
-rw-r--r--  arch/arm64/kvm/hyp/include/hyp/fault.h | 5
-rw-r--r--  arch/arm64/kvm/hyp/include/hyp/switch.h | 353
-rw-r--r--  arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h | 122
-rw-r--r--  arch/arm64/kvm/hyp/include/nvhe/ffa.h | 2
-rw-r--r--  arch/arm64/kvm/hyp/include/nvhe/fixed_config.h | 223
-rw-r--r--  arch/arm64/kvm/hyp/include/nvhe/gfp.h | 6
-rw-r--r--  arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 41
-rw-r--r--  arch/arm64/kvm/hyp/include/nvhe/memory.h | 50
-rw-r--r--  arch/arm64/kvm/hyp/include/nvhe/pkvm.h | 34
-rw-r--r--  arch/arm64/kvm/hyp/include/nvhe/trap_handler.h | 2
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/Makefile | 21
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/debug-sr.c | 76
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/ffa.c | 214
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/gen-hyprel.c | 6
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/host.S | 10
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/hyp-init.S | 90
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/hyp-main.c | 374
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/mem_protect.c | 913
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/mm.c | 12
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/page_alloc.c | 14
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/pkvm.c | 460
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/psci-relay.c | 7
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/setup.c | 55
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/stacktrace.c | 4
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/switch.c | 125
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/sys_regs.c | 404
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/sysreg-sr.c | 4
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/timer-sr.c | 16
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/tlb.c | 120
-rw-r--r--  arch/arm64/kvm/hyp/pgtable.c | 141
-rw-r--r--  arch/arm64/kvm/hyp/vgic-v3-sr.c | 133
-rw-r--r--  arch/arm64/kvm/hyp/vhe/Makefile | 2
-rw-r--r--  arch/arm64/kvm/hyp/vhe/debug-sr.c | 5
-rw-r--r--  arch/arm64/kvm/hyp/vhe/switch.c | 464
-rw-r--r--  arch/arm64/kvm/hyp/vhe/sysreg-sr.c | 156
-rw-r--r--  arch/arm64/kvm/hyp/vhe/tlb.c | 176
-rw-r--r--  arch/arm64/kvm/hypercalls.c | 27
-rw-r--r--  arch/arm64/kvm/mmio.c | 44
-rw-r--r--  arch/arm64/kvm/mmu.c | 423
-rw-r--r--  arch/arm64/kvm/nested.c | 1130
-rw-r--r--  arch/arm64/kvm/pauth.c | 206
-rw-r--r--  arch/arm64/kvm/pkvm.c | 278
-rw-r--r--  arch/arm64/kvm/pmu-emul.c | 471
-rw-r--r--  arch/arm64/kvm/pmu.c | 99
-rw-r--r--  arch/arm64/kvm/psci.c | 44
-rw-r--r--  arch/arm64/kvm/ptdump.c | 268
-rw-r--r--  arch/arm64/kvm/reset.c | 24
-rw-r--r--  arch/arm64/kvm/stacktrace.c | 9
-rw-r--r--  arch/arm64/kvm/sys_regs.c | 2194
-rw-r--r--  arch/arm64/kvm/sys_regs.h | 33
-rw-r--r--  arch/arm64/kvm/trace_handle_exit.h | 75
-rw-r--r--  arch/arm64/kvm/vgic-sys-reg-v3.c | 8
-rw-r--r--  arch/arm64/kvm/vgic/vgic-debug.c | 90
-rw-r--r--  arch/arm64/kvm/vgic/vgic-init.c | 214
-rw-r--r--  arch/arm64/kvm/vgic/vgic-irqfd.c | 7
-rw-r--r--  arch/arm64/kvm/vgic/vgic-its.c | 461
-rw-r--r--  arch/arm64/kvm/vgic/vgic-kvm-device.c | 44
-rw-r--r--  arch/arm64/kvm/vgic/vgic-mmio-v2.c | 12
-rw-r--r--  arch/arm64/kvm/vgic/vgic-mmio-v3.c | 30
-rw-r--r--  arch/arm64/kvm/vgic/vgic-mmio.c | 38
-rw-r--r--  arch/arm64/kvm/vgic/vgic-v2.c | 11
-rw-r--r--  arch/arm64/kvm/vgic/vgic-v3-nested.c | 409
-rw-r--r--  arch/arm64/kvm/vgic/vgic-v3.c | 88
-rw-r--r--  arch/arm64/kvm/vgic/vgic-v4.c | 39
-rw-r--r--  arch/arm64/kvm/vgic/vgic.c | 119
-rw-r--r--  arch/arm64/kvm/vmid.c | 11
82 files changed, 11161 insertions, 4271 deletions
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index 58f09370d17e..096e45acadb2 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -19,6 +19,7 @@ if VIRTUALIZATION
menuconfig KVM
bool "Kernel-based Virtual Machine (KVM) support"
+ depends on AS_HAS_ARMV8_4
select KVM_COMMON
select KVM_GENERIC_HARDWARE_ENABLING
select KVM_GENERIC_MMU_NOTIFIER
@@ -65,4 +66,21 @@ config PROTECTED_NVHE_STACKTRACE
If unsure, or not using protected nVHE (pKVM), say N.
+config PTDUMP_STAGE2_DEBUGFS
+ bool "Present the stage-2 pagetables to debugfs"
+ depends on KVM
+ depends on DEBUG_KERNEL
+ depends on DEBUG_FS
+ depends on ARCH_HAS_PTDUMP
+ select PTDUMP
+ default n
+ help
+ Say Y here if you want to show the stage-2 kernel pagetables
+ layout in a debugfs file. This information is only useful for kernel developers
+ who are working in architecture specific areas of the kernel.
+ It is probably not a good idea to enable this feature in a production
+ kernel.
+
+ If in doubt, say N.
+
endif # VIRTUALIZATION
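
For reference, a minimal .config sketch that satisfies the dependency chain of the new option (ARCH_HAS_PTDUMP is normally selected by the architecture rather than set by hand, and PTDUMP is pulled in via the 'select' above):

    CONFIG_VIRTUALIZATION=y
    CONFIG_KVM=y
    CONFIG_DEBUG_KERNEL=y
    CONFIG_DEBUG_FS=y
    CONFIG_PTDUMP_STAGE2_DEBUGFS=y

With this enabled, the dump file is registered from kvm_arch_create_vm_debugfs() via kvm_s2_ptdump_create_debugfs() (see the arm.c hunks further down), so it lives under each VM's KVM debugfs directory.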
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index c0c050e53157..209bc76263f1 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -3,26 +3,31 @@
# Makefile for Kernel-based Virtual Machine module
#
-ccflags-y += -I $(srctree)/$(src)
+ccflags-y += -I $(src)
include $(srctree)/virt/kvm/Makefile.kvm
obj-$(CONFIG_KVM) += kvm.o
obj-$(CONFIG_KVM) += hyp/
+CFLAGS_sys_regs.o += -Wno-override-init
+CFLAGS_handle_exit.o += -Wno-override-init
+
kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \
inject_fault.o va_layout.o handle_exit.o \
guest.o debug.o reset.o sys_regs.o stacktrace.o \
vgic-sys-reg-v3.o fpsimd.o pkvm.o \
- arch_timer.o trng.o vmid.o emulate-nested.o nested.o \
+ arch_timer.o trng.o vmid.o emulate-nested.o nested.o at.o \
vgic/vgic.o vgic/vgic-init.o \
vgic/vgic-irqfd.o vgic/vgic-v2.o \
vgic/vgic-v3.o vgic/vgic-v4.o \
vgic/vgic-mmio.o vgic/vgic-mmio-v2.o \
vgic/vgic-mmio-v3.o vgic/vgic-kvm-device.o \
- vgic/vgic-its.o vgic/vgic-debug.o
+ vgic/vgic-its.o vgic/vgic-debug.o vgic/vgic-v3-nested.o
kvm-$(CONFIG_HW_PERF_EVENTS) += pmu-emul.o pmu.o
+kvm-$(CONFIG_ARM64_PTR_AUTH) += pauth.o
+kvm-$(CONFIG_PTDUMP_STAGE2_DEBUGFS) += ptdump.o
always-y := hyp_constants.h hyp-constants.s
@@ -30,7 +35,7 @@ define rule_gen_hyp_constants
$(call filechk,offsets,__HYP_CONSTANTS_H__)
endef
-CFLAGS_hyp-constants.o = -I $(srctree)/$(src)/hyp/include
+CFLAGS_hyp-constants.o = -I $(src)/hyp/include
$(obj)/hyp-constants.s: $(src)/hyp/hyp-constants.c FORCE
$(call if_changed_dep,cc_s_c)
diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c
index 879982b1cc73..5133dcbfe9f7 100644
--- a/arch/arm64/kvm/arch_timer.c
+++ b/arch/arm64/kvm/arch_timer.c
@@ -30,6 +30,7 @@ static u32 host_vtimer_irq_flags;
static u32 host_ptimer_irq_flags;
static DEFINE_STATIC_KEY_FALSE(has_gic_active_state);
+DEFINE_STATIC_KEY_FALSE(broken_cntvoff_key);
static const u8 default_ppi[] = {
[TIMER_PTIMER] = 30,
@@ -101,21 +102,6 @@ u64 timer_get_cval(struct arch_timer_context *ctxt)
}
}
-static u64 timer_get_offset(struct arch_timer_context *ctxt)
-{
- u64 offset = 0;
-
- if (!ctxt)
- return 0;
-
- if (ctxt->offset.vm_offset)
- offset += *ctxt->offset.vm_offset;
- if (ctxt->offset.vcpu_offset)
- offset += *ctxt->offset.vcpu_offset;
-
- return offset;
-}
-
static void timer_set_ctl(struct arch_timer_context *ctxt, u32 ctl)
{
struct kvm_vcpu *vcpu = ctxt->vcpu;
@@ -206,8 +192,7 @@ void get_timer_map(struct kvm_vcpu *vcpu, struct timer_map *map)
static inline bool userspace_irqchip(struct kvm *kvm)
{
- return static_branch_unlikely(&userspace_irqchip_in_use) &&
- unlikely(!irqchip_in_kernel(kvm));
+ return unlikely(!irqchip_in_kernel(kvm));
}
static void soft_timer_start(struct hrtimer *hrt, u64 ns)
@@ -442,22 +427,39 @@ void kvm_timer_update_run(struct kvm_vcpu *vcpu)
regs->device_irq_level |= KVM_ARM_DEV_EL1_PTIMER;
}
+static void kvm_timer_update_status(struct arch_timer_context *ctx, bool level)
+{
+ /*
+ * Paper over NV2 brokenness by publishing the interrupt status
+ * bit. This still results in a poor quality of emulation (guest
+ * writes will have no effect until the next exit).
+ *
+ * But hey, it's fast, right?
+ */
+ if (is_hyp_ctxt(ctx->vcpu) &&
+ (ctx == vcpu_vtimer(ctx->vcpu) || ctx == vcpu_ptimer(ctx->vcpu))) {
+ unsigned long val = timer_get_ctl(ctx);
+ __assign_bit(__ffs(ARCH_TIMER_CTRL_IT_STAT), &val, level);
+ timer_set_ctl(ctx, val);
+ }
+}
+
static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level,
struct arch_timer_context *timer_ctx)
{
- int ret;
+ kvm_timer_update_status(timer_ctx, new_level);
timer_ctx->irq.level = new_level;
trace_kvm_timer_update_irq(vcpu->vcpu_id, timer_irq(timer_ctx),
timer_ctx->irq.level);
- if (!userspace_irqchip(vcpu->kvm)) {
- ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu,
- timer_irq(timer_ctx),
- timer_ctx->irq.level,
- timer_ctx);
- WARN_ON(ret);
- }
+ if (userspace_irqchip(vcpu->kvm))
+ return;
+
+ kvm_vgic_inject_irq(vcpu->kvm, vcpu,
+ timer_irq(timer_ctx),
+ timer_ctx->irq.level,
+ timer_ctx);
}
/* Only called for a fully emulated timer */
@@ -467,10 +469,10 @@ static void timer_emulate(struct arch_timer_context *ctx)
trace_kvm_timer_emulate(ctx, should_fire);
- if (should_fire != ctx->irq.level) {
+ if (should_fire != ctx->irq.level)
kvm_timer_update_irq(ctx->vcpu, should_fire, ctx);
- return;
- }
+
+ kvm_timer_update_status(ctx, should_fire);
/*
* If the timer can fire now, we don't need to have a soft timer
@@ -514,7 +516,12 @@ static void timer_save_state(struct arch_timer_context *ctx)
case TIMER_VTIMER:
case TIMER_HVTIMER:
timer_set_ctl(ctx, read_sysreg_el0(SYS_CNTV_CTL));
- timer_set_cval(ctx, read_sysreg_el0(SYS_CNTV_CVAL));
+ cval = read_sysreg_el0(SYS_CNTV_CVAL);
+
+ if (has_broken_cntvoff())
+ cval -= timer_get_offset(ctx);
+
+ timer_set_cval(ctx, cval);
/* Disable the timer */
write_sysreg_el0(0, SYS_CNTV_CTL);
@@ -619,8 +626,15 @@ static void timer_restore_state(struct arch_timer_context *ctx)
case TIMER_VTIMER:
case TIMER_HVTIMER:
- set_cntvoff(timer_get_offset(ctx));
- write_sysreg_el0(timer_get_cval(ctx), SYS_CNTV_CVAL);
+ cval = timer_get_cval(ctx);
+ offset = timer_get_offset(ctx);
+ if (has_broken_cntvoff()) {
+ set_cntvoff(0);
+ cval += offset;
+ } else {
+ set_cntvoff(offset);
+ }
+ write_sysreg_el0(cval, SYS_CNTV_CVAL);
isb();
write_sysreg_el0(timer_get_ctl(ctx), SYS_CNTV_CTL);
break;
@@ -743,27 +757,12 @@ static void kvm_timer_vcpu_load_nested_switch(struct kvm_vcpu *vcpu,
timer_irq(map->direct_ptimer),
&arch_timer_irq_ops);
WARN_ON_ONCE(ret);
-
- /*
- * The virtual offset behaviour is "interesting", as it
- * always applies when HCR_EL2.E2H==0, but only when
- * accessed from EL1 when HCR_EL2.E2H==1. So make sure we
- * track E2H when putting the HV timer in "direct" mode.
- */
- if (map->direct_vtimer == vcpu_hvtimer(vcpu)) {
- struct arch_timer_offset *offs = &map->direct_vtimer->offset;
-
- if (vcpu_el2_e2h_is_set(vcpu))
- offs->vcpu_offset = NULL;
- else
- offs->vcpu_offset = &__vcpu_sys_reg(vcpu, CNTVOFF_EL2);
- }
}
}
static void timer_set_traps(struct kvm_vcpu *vcpu, struct timer_map *map)
{
- bool tpt, tpc;
+ bool tvt, tpt, tvc, tpc, tvt02, tpt02;
u64 clr, set;
/*
@@ -778,7 +777,29 @@ static void timer_set_traps(struct kvm_vcpu *vcpu, struct timer_map *map)
* within this function, reality kicks in and we start adding
* traps based on emulation requirements.
*/
- tpt = tpc = false;
+ tvt = tpt = tvc = tpc = false;
+ tvt02 = tpt02 = false;
+
+ /*
+ * NV2 badly breaks the timer semantics by redirecting accesses to
+ * the EL1 timer state to memory, so let's call ECV to the rescue if
+ * available: we trap all CNT{P,V}_{CTL,CVAL,TVAL}_EL0 accesses.
+ *
+ * The treatment slightly varies depending on whether we run a nVHE or
+ * VHE guest: nVHE will use the _EL0 registers directly, while VHE
+ * will use the _EL02 accessors. This translates into different trap
+ * bits.
+ *
+ * None of the trapping is required when running in non-HYP context,
+ * unless required by the L1 hypervisor settings once we advertise
+ * ECV+NV in the guest, or if we need trapping for other reasons.
+ */
+ if (cpus_have_final_cap(ARM64_HAS_ECV) && is_hyp_ctxt(vcpu)) {
+ if (vcpu_el2_e2h_is_set(vcpu))
+ tvt02 = tpt02 = true;
+ else
+ tvt = tpt = true;
+ }
/*
* We have two possibility to deal with a physical offset:
@@ -794,9 +815,20 @@ static void timer_set_traps(struct kvm_vcpu *vcpu, struct timer_map *map)
tpt = tpc = true;
/*
+ * For the poor sods that could not correctly subtract one value
+ * from another, trap the full virtual timer and counter.
+ */
+ if (has_broken_cntvoff() && timer_get_offset(map->direct_vtimer))
+ tvt = tvc = true;
+
+ /*
* Apply the enable bits that the guest hypervisor has requested for
* its own guest. We can only add traps that wouldn't have been set
* above.
+ * Implementation choices: we do not support NV when E2H=0 in the
+ * guest, and we don't support configuration where E2H is writable
+ * by the guest (either FEAT_VHE or FEAT_E2H0 is implemented, but
+ * not both). This simplifies the handling of the EL1NV* bits.
*/
if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) {
u64 val = __vcpu_sys_reg(vcpu, CNTHCTL_EL2);
@@ -807,6 +839,9 @@ static void timer_set_traps(struct kvm_vcpu *vcpu, struct timer_map *map)
tpt |= !(val & (CNTHCTL_EL1PCEN << 10));
tpc |= !(val & (CNTHCTL_EL1PCTEN << 10));
+
+ tpt02 |= (val & CNTHCTL_EL1NVPCT);
+ tvt02 |= (val & CNTHCTL_EL1NVVCT);
}
/*
@@ -818,6 +853,10 @@ static void timer_set_traps(struct kvm_vcpu *vcpu, struct timer_map *map)
assign_clear_set_bit(tpt, CNTHCTL_EL1PCEN << 10, set, clr);
assign_clear_set_bit(tpc, CNTHCTL_EL1PCTEN << 10, set, clr);
+ assign_clear_set_bit(tvt, CNTHCTL_EL1TVT, clr, set);
+ assign_clear_set_bit(tvc, CNTHCTL_EL1TVCT, clr, set);
+ assign_clear_set_bit(tvt02, CNTHCTL_EL1NVVCT, clr, set);
+ assign_clear_set_bit(tpt02, CNTHCTL_EL1NVPCT, clr, set);
/* This only happens on VHE, so use the CNTHCTL_EL2 accessor. */
sysreg_clear_set(cnthctl_el2, clr, set);
@@ -906,6 +945,44 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu)
kvm_timer_blocking(vcpu);
}
+void kvm_timer_sync_nested(struct kvm_vcpu *vcpu)
+{
+ /*
+ * When NV2 is on, guest hypervisors have their EL1 timer register
+ * accesses redirected to the VNCR page. Any guest action taken on
+ * the timer is postponed until the next exit, leading to a very
+ * poor quality of emulation.
+ *
+ * This is an unmitigated disaster, only papered over by FEAT_ECV,
+ * which allows trapping of the timer registers even with NV2.
+ * Still, this is still worse than FEAT_NV on its own. Meh.
+ */
+ if (!cpus_have_final_cap(ARM64_HAS_ECV)) {
+ /*
+ * For a VHE guest hypervisor, the EL2 state is directly
+ * stored in the host EL1 timers, while the emulated EL1
+ * state is stored in the VNCR page. The latter could have
+ * been updated behind our back, and we must reset the
+ * emulation of the timers.
+ *
+ * A non-VHE guest hypervisor doesn't have any direct access
+ * to its timers: the EL2 registers trap despite being
+ * notionally direct (we use the EL1 HW, as for VHE), while
+ * the EL1 registers access memory.
+ *
+ * In both cases, process the emulated timers on each guest
+ * exit. Boo.
+ */
+ struct timer_map map;
+ get_timer_map(vcpu, &map);
+
+ soft_timer_cancel(&map.emul_vtimer->hrtimer);
+ soft_timer_cancel(&map.emul_ptimer->hrtimer);
+ timer_emulate(map.emul_vtimer);
+ timer_emulate(map.emul_ptimer);
+ }
+}
+
/*
* With a userspace irqchip we have to check if the guest de-asserted the
* timer and if so, unmask the timer irq signal on the host interrupt
@@ -993,8 +1070,7 @@ static void timer_context_init(struct kvm_vcpu *vcpu, int timerid)
else
ctxt->offset.vm_offset = &kvm->arch.timer_data.poffset;
- hrtimer_init(&ctxt->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
- ctxt->hrtimer.function = kvm_hrtimer_expire;
+ hrtimer_setup(&ctxt->hrtimer, kvm_hrtimer_expire, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
switch (timerid) {
case TIMER_PTIMER:
@@ -1021,8 +1097,8 @@ void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu)
timer_set_offset(vcpu_ptimer(vcpu), 0);
}
- hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
- timer->bg_timer.function = kvm_bg_timer_expire;
+ hrtimer_setup(&timer->bg_timer, kvm_bg_timer_expire, CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS_HARD);
}
void kvm_timer_init_vm(struct kvm *kvm)
@@ -1364,6 +1440,37 @@ static int kvm_irq_init(struct arch_timer_kvm_info *info)
return 0;
}
+static void kvm_timer_handle_errata(void)
+{
+ u64 mmfr0, mmfr1, mmfr4;
+
+ /*
+ * CNTVOFF_EL2 is broken on some implementations. For those, we trap
+ * all virtual timer/counter accesses, requiring FEAT_ECV.
+ *
+ * However, a hypervisor supporting nesting is likely to mitigate the
+ * erratum at L0, and not require other levels to mitigate it (which
+ * would otherwise be a terrible performance sink due to trap
+ * amplification).
+ *
+ * Given that the affected HW implements both FEAT_VHE and FEAT_E2H0,
+ * and that NV is likely not to (because of limitations of the
+ * architecture), only enable the workaround when FEAT_VHE and
+ * FEAT_E2H0 are both detected. Time will tell if this actually holds.
+ */
+ mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
+ mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
+ mmfr4 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR4_EL1);
+ if (SYS_FIELD_GET(ID_AA64MMFR1_EL1, VH, mmfr1) &&
+ !SYS_FIELD_GET(ID_AA64MMFR4_EL1, E2H0, mmfr4) &&
+ SYS_FIELD_GET(ID_AA64MMFR0_EL1, ECV, mmfr0) &&
+ (has_vhe() || has_hvhe()) &&
+ cpus_have_final_cap(ARM64_WORKAROUND_QCOM_ORYON_CNTVOFF)) {
+ static_branch_enable(&broken_cntvoff_key);
+ kvm_info("Broken CNTVOFF_EL2, trapping virtual timer\n");
+ }
+}
+
int __init kvm_timer_hyp_init(bool has_gic)
{
struct arch_timer_kvm_info *info;
@@ -1432,6 +1539,7 @@ int __init kvm_timer_hyp_init(bool has_gic)
goto out_free_vtimer_irq;
}
+ kvm_timer_handle_errata();
return 0;
out_free_ptimer_irq:
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 3dee5490eea9..68fec8c95fee 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -35,21 +35,33 @@
#include <asm/virt.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_asm.h>
+#include <asm/kvm_emulate.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_nested.h>
#include <asm/kvm_pkvm.h>
-#include <asm/kvm_emulate.h>
+#include <asm/kvm_ptrauth.h>
#include <asm/sections.h>
#include <kvm/arm_hypercalls.h>
#include <kvm/arm_pmu.h>
#include <kvm/arm_psci.h>
+#include "sys_regs.h"
+
static enum kvm_mode kvm_mode = KVM_MODE_DEFAULT;
+enum kvm_wfx_trap_policy {
+ KVM_WFX_NOTRAP_SINGLE_TASK, /* Default option */
+ KVM_WFX_NOTRAP,
+ KVM_WFX_TRAP,
+};
+
+static enum kvm_wfx_trap_policy kvm_wfi_trap_policy __read_mostly = KVM_WFX_NOTRAP_SINGLE_TASK;
+static enum kvm_wfx_trap_policy kvm_wfe_trap_policy __read_mostly = KVM_WFX_NOTRAP_SINGLE_TASK;
+
DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector);
-DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
+DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_base);
DECLARE_KVM_NVHE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
DECLARE_KVM_NVHE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
@@ -57,7 +69,6 @@ DECLARE_KVM_NVHE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
static bool vgic_present, kvm_arm_initialised;
static DEFINE_PER_CPU(unsigned char, kvm_hyp_initialized);
-DEFINE_STATIC_KEY_FALSE(userspace_irqchip_in_use);
bool is_kvm_arm_initialised(void)
{
@@ -72,12 +83,14 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
struct kvm_enable_cap *cap)
{
- int r;
- u64 new_cap;
+ int r = -EINVAL;
if (cap->flags)
return -EINVAL;
+ if (kvm_vm_is_protected(kvm) && !kvm_pvm_ext_allowed(cap->cap))
+ return -EINVAL;
+
switch (cap->cap) {
case KVM_CAP_ARM_NISV_TO_USER:
r = 0;
@@ -86,9 +99,7 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
break;
case KVM_CAP_ARM_MTE:
mutex_lock(&kvm->lock);
- if (!system_supports_mte() || kvm->created_vcpus) {
- r = -EINVAL;
- } else {
+ if (system_supports_mte() && !kvm->created_vcpus) {
r = 0;
set_bit(KVM_ARCH_FLAG_MTE_ENABLED, &kvm->arch.flags);
}
@@ -99,25 +110,30 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
set_bit(KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED, &kvm->arch.flags);
break;
case KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE:
- new_cap = cap->args[0];
-
mutex_lock(&kvm->slots_lock);
/*
* To keep things simple, allow changing the chunk
* size only when no memory slots have been created.
*/
- if (!kvm_are_all_memslots_empty(kvm)) {
- r = -EINVAL;
- } else if (new_cap && !kvm_is_block_size_supported(new_cap)) {
- r = -EINVAL;
- } else {
- r = 0;
- kvm->arch.mmu.split_page_chunk_size = new_cap;
+ if (kvm_are_all_memslots_empty(kvm)) {
+ u64 new_cap = cap->args[0];
+
+ if (!new_cap || kvm_is_block_size_supported(new_cap)) {
+ r = 0;
+ kvm->arch.mmu.split_page_chunk_size = new_cap;
+ }
}
mutex_unlock(&kvm->slots_lock);
break;
+ case KVM_CAP_ARM_WRITABLE_IMP_ID_REGS:
+ mutex_lock(&kvm->lock);
+ if (!kvm->created_vcpus) {
+ r = 0;
+ set_bit(KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS, &kvm->arch.flags);
+ }
+ mutex_unlock(&kvm->lock);
+ break;
default:
- r = -EINVAL;
break;
}
@@ -132,6 +148,7 @@ static int kvm_arm_default_max_vcpus(void)
/**
* kvm_arch_init_vm - initializes a VM data structure
* @kvm: pointer to the KVM struct
+ * @type: kvm device type
*/
int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
{
@@ -147,6 +164,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
mutex_unlock(&kvm->lock);
#endif
+ kvm_init_nested(kvm);
+
ret = kvm_share_hyp(kvm, kvm + 1);
if (ret)
return ret;
@@ -193,6 +212,24 @@ vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
void kvm_arch_create_vm_debugfs(struct kvm *kvm)
{
kvm_sys_regs_create_debugfs(kvm);
+ kvm_s2_ptdump_create_debugfs(kvm);
+}
+
+static void kvm_destroy_mpidr_data(struct kvm *kvm)
+{
+ struct kvm_mpidr_data *data;
+
+ mutex_lock(&kvm->arch.config_lock);
+
+ data = rcu_dereference_protected(kvm->arch.mpidr_data,
+ lockdep_is_held(&kvm->arch.config_lock));
+ if (data) {
+ rcu_assign_pointer(kvm->arch.mpidr_data, NULL);
+ synchronize_rcu();
+ kfree(data);
+ }
+
+ mutex_unlock(&kvm->arch.config_lock);
}
/**
@@ -209,7 +246,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
if (is_protected_kvm_enabled())
pkvm_destroy_hyp_vm(kvm);
- kfree(kvm->arch.mpidr_data);
+ kvm_destroy_mpidr_data(kvm);
+
kfree(kvm->arch.sysreg_masks);
kvm_destroy_vcpus(kvm);
@@ -218,9 +256,47 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
kvm_arm_teardown_hypercalls(kvm);
}
+static bool kvm_has_full_ptr_auth(void)
+{
+ bool apa, gpa, api, gpi, apa3, gpa3;
+ u64 isar1, isar2, val;
+
+ /*
+ * Check that:
+ *
+ * - both Address and Generic auth are implemented for a given
+ * algorithm (Q5, IMPDEF or Q3)
+ * - only a single algorithm is implemented.
+ */
+ if (!system_has_full_ptr_auth())
+ return false;
+
+ isar1 = read_sanitised_ftr_reg(SYS_ID_AA64ISAR1_EL1);
+ isar2 = read_sanitised_ftr_reg(SYS_ID_AA64ISAR2_EL1);
+
+ apa = !!FIELD_GET(ID_AA64ISAR1_EL1_APA_MASK, isar1);
+ val = FIELD_GET(ID_AA64ISAR1_EL1_GPA_MASK, isar1);
+ gpa = (val == ID_AA64ISAR1_EL1_GPA_IMP);
+
+ api = !!FIELD_GET(ID_AA64ISAR1_EL1_API_MASK, isar1);
+ val = FIELD_GET(ID_AA64ISAR1_EL1_GPI_MASK, isar1);
+ gpi = (val == ID_AA64ISAR1_EL1_GPI_IMP);
+
+ apa3 = !!FIELD_GET(ID_AA64ISAR2_EL1_APA3_MASK, isar2);
+ val = FIELD_GET(ID_AA64ISAR2_EL1_GPA3_MASK, isar2);
+ gpa3 = (val == ID_AA64ISAR2_EL1_GPA3_IMP);
+
+ return (apa == gpa && api == gpi && apa3 == gpa3 &&
+ (apa + api + apa3) == 1);
+}
+
int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
{
int r;
+
+ if (kvm && kvm_vm_is_protected(kvm) && !kvm_pvm_ext_allowed(ext))
+ return 0;
+
switch (ext) {
case KVM_CAP_IRQCHIP:
r = vgic_present;
@@ -245,6 +321,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_ARM_SYSTEM_SUSPEND:
case KVM_CAP_IRQFD_RESAMPLE:
case KVM_CAP_COUNTER_OFFSET:
+ case KVM_CAP_ARM_WRITABLE_IMP_ID_REGS:
r = 1;
break;
case KVM_CAP_SET_GUEST_DEBUG2:
@@ -298,7 +375,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = get_num_wrps();
break;
case KVM_CAP_ARM_PMU_V3:
- r = kvm_arm_support_pmu_v3();
+ r = kvm_supports_guest_pmuv3();
break;
case KVM_CAP_ARM_INJECT_SERROR_ESR:
r = cpus_have_final_cap(ARM64_HAS_RAS_EXTN);
@@ -311,7 +388,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
break;
case KVM_CAP_ARM_PTRAUTH_ADDRESS:
case KVM_CAP_ARM_PTRAUTH_GENERIC:
- r = system_has_full_ptr_auth();
+ r = kvm_has_full_ptr_auth();
break;
case KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE:
if (kvm)
@@ -378,28 +455,31 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
vcpu->arch.mmu_page_cache.gfp_zero = __GFP_ZERO;
- /*
- * Default value for the FP state, will be overloaded at load
- * time if we support FP (pretty likely)
- */
- vcpu->arch.fp_state = FP_STATE_FREE;
-
/* Set up the timer */
kvm_timer_vcpu_init(vcpu);
kvm_pmu_vcpu_init(vcpu);
- kvm_arm_reset_debug_ptr(vcpu);
-
kvm_arm_pvtime_vcpu_init(&vcpu->arch);
vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu;
+ /*
+ * This vCPU may have been created after mpidr_data was initialized.
+ * Throw out the pre-computed mappings if that is the case which forces
+ * KVM to fall back to iteratively searching the vCPUs.
+ */
+ kvm_destroy_mpidr_data(vcpu->kvm);
+
err = kvm_vgic_vcpu_init(vcpu);
if (err)
return err;
- return kvm_share_hyp(vcpu, vcpu + 1);
+ err = kvm_share_hyp(vcpu, vcpu + 1);
+ if (err)
+ kvm_vgic_vcpu_destroy(vcpu);
+
+ return err;
}
void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
@@ -408,10 +488,10 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
{
- if (vcpu_has_run_once(vcpu) && unlikely(!irqchip_in_kernel(vcpu->kvm)))
- static_branch_dec(&userspace_irqchip_in_use);
-
- kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
+ if (!is_protected_kvm_enabled())
+ kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
+ else
+ free_hyp_memcache(&vcpu->arch.pkvm_memcache);
kvm_timer_vcpu_terminate(vcpu);
kvm_pmu_vcpu_destroy(vcpu);
kvm_vgic_vcpu_destroy(vcpu);
@@ -428,15 +508,81 @@ void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu)
}
+static void vcpu_set_pauth_traps(struct kvm_vcpu *vcpu)
+{
+ if (vcpu_has_ptrauth(vcpu) && !is_protected_kvm_enabled()) {
+ /*
+ * Either we're running an L2 guest, and the API/APK bits come
+ * from L1's HCR_EL2, or API/APK are both set.
+ */
+ if (unlikely(vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu))) {
+ u64 val;
+
+ val = __vcpu_sys_reg(vcpu, HCR_EL2);
+ val &= (HCR_API | HCR_APK);
+ vcpu->arch.hcr_el2 &= ~(HCR_API | HCR_APK);
+ vcpu->arch.hcr_el2 |= val;
+ } else {
+ vcpu->arch.hcr_el2 |= (HCR_API | HCR_APK);
+ }
+
+ /*
+ * Save the host keys if there is any chance for the guest
+ * to use pauth, as the entry code will reload the guest
+ * keys in that case.
+ */
+ if (vcpu->arch.hcr_el2 & (HCR_API | HCR_APK)) {
+ struct kvm_cpu_context *ctxt;
+
+ ctxt = this_cpu_ptr_hyp_sym(kvm_hyp_ctxt);
+ ptrauth_save_keys(ctxt);
+ }
+ }
+}
+
+static bool kvm_vcpu_should_clear_twi(struct kvm_vcpu *vcpu)
+{
+ if (unlikely(kvm_wfi_trap_policy != KVM_WFX_NOTRAP_SINGLE_TASK))
+ return kvm_wfi_trap_policy == KVM_WFX_NOTRAP;
+
+ return single_task_running() &&
+ (atomic_read(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vlpi_count) ||
+ vcpu->kvm->arch.vgic.nassgireq);
+}
+
+static bool kvm_vcpu_should_clear_twe(struct kvm_vcpu *vcpu)
+{
+ if (unlikely(kvm_wfe_trap_policy != KVM_WFX_NOTRAP_SINGLE_TASK))
+ return kvm_wfe_trap_policy == KVM_WFX_NOTRAP;
+
+ return single_task_running();
+}
+
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
struct kvm_s2_mmu *mmu;
int *last_ran;
+ if (is_protected_kvm_enabled())
+ goto nommu;
+
+ if (vcpu_has_nv(vcpu))
+ kvm_vcpu_load_hw_mmu(vcpu);
+
mmu = vcpu->arch.hw_mmu;
last_ran = this_cpu_ptr(mmu->last_vcpu_ran);
/*
+ * Ensure a VMID is allocated for the MMU before programming VTTBR_EL2,
+ * which happens eagerly in VHE.
+ *
+ * Also, the VMID allocator only preserves VMIDs that are active at the
+ * time of rollover, so KVM might need to grab a new VMID for the MMU if
+ * this is called from kvm_sched_in().
+ */
+ kvm_arm_vmid_update(&mmu->vmid);
+
+ /*
* We guarantee that both TLBs and I-cache are private to each
* vcpu. If detecting that a vcpu from the same VM has
* previously run on the same physical CPU, call into the
@@ -450,10 +596,16 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
*last_ran = vcpu->vcpu_idx;
}
+nommu:
vcpu->cpu = cpu;
- kvm_vgic_load(vcpu);
+ /*
+ * The timer must be loaded before the vgic to correctly set up physical
+ * interrupt deactivation in nested state (e.g. timer interrupt).
+ */
kvm_timer_vcpu_load(vcpu);
+ kvm_vgic_load(vcpu);
+ kvm_vcpu_load_debug(vcpu);
if (has_vhe())
kvm_vcpu_load_vhe(vcpu);
kvm_arch_vcpu_load_fp(vcpu);
@@ -461,14 +613,25 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (kvm_arm_is_pvtime_enabled(&vcpu->arch))
kvm_make_request(KVM_REQ_RECORD_STEAL, vcpu);
- if (single_task_running())
- vcpu_clear_wfx_traps(vcpu);
+ if (kvm_vcpu_should_clear_twe(vcpu))
+ vcpu->arch.hcr_el2 &= ~HCR_TWE;
else
- vcpu_set_wfx_traps(vcpu);
+ vcpu->arch.hcr_el2 |= HCR_TWE;
+
+ if (kvm_vcpu_should_clear_twi(vcpu))
+ vcpu->arch.hcr_el2 &= ~HCR_TWI;
+ else
+ vcpu->arch.hcr_el2 |= HCR_TWI;
+
+ vcpu_set_pauth_traps(vcpu);
- if (vcpu_has_ptrauth(vcpu))
- vcpu_ptrauth_disable(vcpu);
- kvm_arch_vcpu_load_debug_state_flags(vcpu);
+ if (is_protected_kvm_enabled()) {
+ kvm_call_hyp_nvhe(__pkvm_vcpu_load,
+ vcpu->kvm->arch.pkvm.handle,
+ vcpu->vcpu_idx, vcpu->arch.hcr_el2);
+ kvm_call_hyp(__vgic_v3_restore_vmcr_aprs,
+ &vcpu->arch.vgic_cpu.vgic_v3);
+ }
if (!cpumask_test_cpu(cpu, vcpu->kvm->arch.supported_cpus))
vcpu_set_on_unsupported_cpu(vcpu);
@@ -476,13 +639,21 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
- kvm_arch_vcpu_put_debug_state_flags(vcpu);
+ if (is_protected_kvm_enabled()) {
+ kvm_call_hyp(__vgic_v3_save_vmcr_aprs,
+ &vcpu->arch.vgic_cpu.vgic_v3);
+ kvm_call_hyp_nvhe(__pkvm_vcpu_put);
+ }
+
+ kvm_vcpu_put_debug(vcpu);
kvm_arch_vcpu_put_fp(vcpu);
if (has_vhe())
kvm_vcpu_put_vhe(vcpu);
kvm_timer_vcpu_put(vcpu);
kvm_vgic_put(vcpu);
kvm_vcpu_pmu_restore_host(vcpu);
+ if (vcpu_has_nv(vcpu))
+ kvm_vcpu_put_hw_mmu(vcpu);
kvm_arm_vmid_clear_active();
vcpu_clear_on_unsupported_cpu(vcpu);
@@ -580,11 +751,6 @@ unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu)
}
#endif
-static int kvm_vcpu_initialized(struct kvm_vcpu *vcpu)
-{
- return vcpu_get_flag(vcpu, VCPU_INITIALIZED);
-}
-
static void kvm_init_mpidr_data(struct kvm *kvm)
{
struct kvm_mpidr_data *data = NULL;
@@ -594,7 +760,8 @@ static void kvm_init_mpidr_data(struct kvm *kvm)
mutex_lock(&kvm->arch.config_lock);
- if (kvm->arch.mpidr_data || atomic_read(&kvm->online_vcpus) == 1)
+ if (rcu_access_pointer(kvm->arch.mpidr_data) ||
+ atomic_read(&kvm->online_vcpus) == 1)
goto out;
kvm_for_each_vcpu(c, vcpu, kvm) {
@@ -631,7 +798,7 @@ static void kvm_init_mpidr_data(struct kvm *kvm)
data->cmpidr_to_idx[index] = c;
}
- kvm->arch.mpidr_data = data;
+ rcu_assign_pointer(kvm->arch.mpidr_data, data);
out:
mutex_unlock(&kvm->arch.config_lock);
}
@@ -661,8 +828,6 @@ int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu)
kvm_init_mpidr_data(kvm);
- kvm_arm_vcpu_init_debug(vcpu);
-
if (likely(irqchip_in_kernel(kvm))) {
/*
* Map the VGIC hardware resources before running a vcpu the
@@ -673,48 +838,42 @@ int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu)
return ret;
}
+ ret = kvm_finalize_sys_regs(vcpu);
+ if (ret)
+ return ret;
+
if (vcpu_has_nv(vcpu)) {
- ret = kvm_init_nv_sysregs(vcpu->kvm);
+ ret = kvm_vgic_vcpu_nv_init(vcpu);
if (ret)
return ret;
}
/*
- * This needs to happen after NV has imposed its own restrictions on
- * the feature set
+ * This needs to happen after any restriction has been applied
+ * to the feature set.
*/
- kvm_init_sysreg(vcpu);
+ kvm_calculate_traps(vcpu);
ret = kvm_timer_enable(vcpu);
if (ret)
return ret;
- ret = kvm_arm_pmu_v3_enable(vcpu);
- if (ret)
- return ret;
+ if (kvm_vcpu_has_pmu(vcpu)) {
+ ret = kvm_arm_pmu_v3_enable(vcpu);
+ if (ret)
+ return ret;
+ }
if (is_protected_kvm_enabled()) {
ret = pkvm_create_hyp_vm(kvm);
if (ret)
return ret;
- }
- if (!irqchip_in_kernel(kvm)) {
- /*
- * Tell the rest of the code that there are userspace irqchip
- * VMs in the wild.
- */
- static_branch_inc(&userspace_irqchip_in_use);
+ ret = pkvm_create_hyp_vcpu(vcpu);
+ if (ret)
+ return ret;
}
- /*
- * Initialize traps for protected VMs.
- * NOTE: Move to run in EL2 directly, rather than via a hypercall, once
- * the code is in place for first run initialization at EL2.
- */
- if (kvm_vm_is_protected(kvm))
- kvm_call_hyp_nvhe(__pkvm_vcpu_init_traps, vcpu);
-
mutex_lock(&kvm->arch.config_lock);
set_bit(KVM_ARCH_FLAG_HAS_RAN_ONCE, &kvm->arch.flags);
mutex_unlock(&kvm->arch.config_lock);
@@ -790,9 +949,8 @@ void kvm_vcpu_wfi(struct kvm_vcpu *vcpu)
* doorbells to be signalled, should an interrupt become pending.
*/
preempt_disable();
- kvm_vgic_vmcr_sync(vcpu);
vcpu_set_flag(vcpu, IN_WFI);
- vgic_v4_put(vcpu);
+ kvm_vgic_put(vcpu);
preempt_enable();
kvm_vcpu_halt(vcpu);
@@ -800,7 +958,7 @@ void kvm_vcpu_wfi(struct kvm_vcpu *vcpu)
preempt_disable();
vcpu_clear_flag(vcpu, IN_WFI);
- vgic_v4_load(vcpu);
+ kvm_vgic_load(vcpu);
preempt_enable();
}
@@ -849,6 +1007,9 @@ static int kvm_vcpu_suspend(struct kvm_vcpu *vcpu)
static int check_vcpu_requests(struct kvm_vcpu *vcpu)
{
if (kvm_request_pending(vcpu)) {
+ if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu))
+ return -EIO;
+
if (kvm_check_request(KVM_REQ_SLEEP, vcpu))
kvm_vcpu_sleep(vcpu);
@@ -883,6 +1044,8 @@ static int check_vcpu_requests(struct kvm_vcpu *vcpu)
if (kvm_dirty_ring_check_request(vcpu))
return 0;
+
+ check_nested_vcpu_requests(vcpu);
}
return 1;
@@ -924,7 +1087,7 @@ static bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu, int *ret)
* state gets updated in kvm_timer_update_run and
* kvm_pmu_update_run below).
*/
- if (static_branch_unlikely(&userspace_irqchip_in_use)) {
+ if (unlikely(!irqchip_in_kernel(vcpu->kvm))) {
if (kvm_timer_should_notify_user(vcpu) ||
kvm_pmu_should_notify_user(vcpu)) {
*ret = -EINTR;
@@ -980,13 +1143,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
if (run->exit_reason == KVM_EXIT_MMIO) {
ret = kvm_handle_mmio_return(vcpu);
- if (ret)
+ if (ret <= 0)
return ret;
}
vcpu_load(vcpu);
- if (run->immediate_exit) {
+ if (!vcpu->wants_to_run) {
ret = -EINTR;
goto out;
}
@@ -1014,19 +1177,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
*/
preempt_disable();
- /*
- * The VMID allocator only tracks active VMIDs per
- * physical CPU, and therefore the VMID allocated may not be
- * preserved on VMID roll-over if the task was preempted,
- * making a thread's VMID inactive. So we need to call
- * kvm_arm_vmid_update() in non-premptible context.
- */
- if (kvm_arm_vmid_update(&vcpu->arch.hw_mmu->vmid) &&
- has_vhe())
- __load_stage2(vcpu->arch.hw_mmu,
- vcpu->arch.hw_mmu->arch);
-
- kvm_pmu_flush_hwstate(vcpu);
+ if (kvm_vcpu_has_pmu(vcpu))
+ kvm_pmu_flush_hwstate(vcpu);
local_irq_disable();
@@ -1045,8 +1197,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
if (ret <= 0 || kvm_vcpu_exit_request(vcpu, &ret)) {
vcpu->mode = OUTSIDE_GUEST_MODE;
isb(); /* Ensure work in x_flush_hwstate is committed */
- kvm_pmu_sync_hwstate(vcpu);
- if (static_branch_unlikely(&userspace_irqchip_in_use))
+ if (kvm_vcpu_has_pmu(vcpu))
+ kvm_pmu_sync_hwstate(vcpu);
+ if (unlikely(!irqchip_in_kernel(vcpu->kvm)))
kvm_timer_sync_user(vcpu);
kvm_vgic_sync_hwstate(vcpu);
local_irq_enable();
@@ -1054,7 +1207,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
continue;
}
- kvm_arm_setup_debug(vcpu);
kvm_arch_vcpu_ctxflush_fp(vcpu);
/**************************************************************
@@ -1071,14 +1223,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
* Back from guest
*************************************************************/
- kvm_arm_clear_debug(vcpu);
-
/*
* We must sync the PMU state before the vgic state so
* that the vgic can properly sample the updated state of the
* interrupt line.
*/
- kvm_pmu_sync_hwstate(vcpu);
+ if (kvm_vcpu_has_pmu(vcpu))
+ kvm_pmu_sync_hwstate(vcpu);
/*
* Sync the vgic state before syncing the timer state because
@@ -1092,9 +1243,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
* we don't want vtimer interrupts to race with syncing the
* timer virtual interrupt state.
*/
- if (static_branch_unlikely(&userspace_irqchip_in_use))
+ if (unlikely(!irqchip_in_kernel(vcpu->kvm)))
kvm_timer_sync_user(vcpu);
+ if (is_hyp_ctxt(vcpu))
+ kvm_timer_sync_nested(vcpu);
+
kvm_arch_vcpu_ctxsync_fp(vcpu);
/*
@@ -1264,13 +1418,13 @@ static unsigned long system_supported_vcpu_features(void)
if (!cpus_have_final_cap(ARM64_HAS_32BIT_EL1))
clear_bit(KVM_ARM_VCPU_EL1_32BIT, &features);
- if (!kvm_arm_support_pmu_v3())
+ if (!kvm_supports_guest_pmuv3())
clear_bit(KVM_ARM_VCPU_PMU_V3, &features);
if (!system_supports_sve())
clear_bit(KVM_ARM_VCPU_SVE, &features);
- if (!system_has_full_ptr_auth()) {
+ if (!kvm_has_full_ptr_auth()) {
clear_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, &features);
clear_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, &features);
}
@@ -1306,11 +1460,6 @@ static int kvm_vcpu_init_check_features(struct kvm_vcpu *vcpu,
test_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, &features))
return -EINVAL;
- /* Disallow NV+SVE for the time being */
- if (test_bit(KVM_ARM_VCPU_HAS_EL2, &features) &&
- test_bit(KVM_ARM_VCPU_SVE, &features))
- return -EINVAL;
-
if (!test_bit(KVM_ARM_VCPU_EL1_32BIT, &features))
return 0;
@@ -1346,6 +1495,10 @@ static int kvm_setup_vcpu(struct kvm_vcpu *vcpu)
if (kvm_vcpu_has_pmu(vcpu) && !kvm->arch.arm_pmu)
ret = kvm_arm_set_default_pmu(kvm);
+ /* Prepare for nested if required */
+ if (!ret && vcpu_has_nv(vcpu))
+ ret = kvm_vcpu_init_nested(vcpu);
+
return ret;
}
@@ -1439,7 +1592,6 @@ static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu,
}
vcpu_reset_hcr(vcpu);
- vcpu->arch.cptr_el2 = kvm_get_reset_cptr_el2(vcpu);
/*
* Handle the "start in power-off" case.
@@ -1818,6 +1970,11 @@ static unsigned long nvhe_percpu_order(void)
return size ? get_order(size) : 0;
}
+static size_t pkvm_host_sve_state_order(void)
+{
+ return get_order(pkvm_host_sve_state_size());
+}
+
/* A lookup table holding the hypervisor VA for each vector slot */
static void *hyp_spectre_vector_selector[BP_HARDEN_EL2_SLOTS];
@@ -1853,7 +2010,6 @@ static int kvm_init_vector_slots(void)
static void __init cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits)
{
struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu);
- u64 mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
unsigned long tcr;
/*
@@ -1869,17 +2025,17 @@ static void __init cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits)
tcr = read_sysreg(tcr_el1);
if (cpus_have_final_cap(ARM64_KVM_HVHE)) {
+ tcr &= ~(TCR_HD | TCR_HA | TCR_A1 | TCR_T0SZ_MASK);
tcr |= TCR_EPD1_MASK;
} else {
+ unsigned long ips = FIELD_GET(TCR_IPS_MASK, tcr);
+
tcr &= TCR_EL2_MASK;
- tcr |= TCR_EL2_RES1;
+ tcr |= TCR_EL2_RES1 | FIELD_PREP(TCR_EL2_PS_MASK, ips);
+ if (lpa2_is_enabled())
+ tcr |= TCR_EL2_DS;
}
- tcr &= ~TCR_T0SZ_MASK;
tcr |= TCR_T0SZ(hyp_va_bits);
- tcr &= ~TCR_EL2_PS_MASK;
- tcr |= FIELD_PREP(TCR_EL2_PS_MASK, kvm_get_parange(mmfr0));
- if (kvm_lpa2_is_enabled())
- tcr |= TCR_EL2_DS;
params->tcr_el2 = tcr;
params->pgd_pa = kvm_mmu_get_httbr();
@@ -1971,7 +2127,8 @@ static void cpu_set_hyp_vector(void)
static void cpu_hyp_init_context(void)
{
- kvm_init_host_cpu_context(&this_cpu_ptr_hyp_sym(kvm_host_data)->host_ctxt);
+ kvm_init_host_cpu_context(host_data_ptr(host_ctxt));
+ kvm_init_host_debug_data();
if (!is_kernel_in_hyp_mode())
cpu_init_hyp_mode();
@@ -1980,7 +2137,6 @@ static void cpu_hyp_init_context(void)
static void cpu_hyp_init_features(void)
{
cpu_set_hyp_vector();
- kvm_arm_init_debug();
if (is_kernel_in_hyp_mode())
kvm_timer_init_vhe();
@@ -2012,7 +2168,7 @@ static void cpu_hyp_uninit(void *discard)
}
}
-int kvm_arch_hardware_enable(void)
+int kvm_arch_enable_virtualization_cpu(void)
{
/*
* Most calls to this function are made with migration
@@ -2032,7 +2188,7 @@ int kvm_arch_hardware_enable(void)
return 0;
}
-void kvm_arch_hardware_disable(void)
+void kvm_arch_disable_virtualization_cpu(void)
{
kvm_timer_cpu_down();
kvm_vgic_cpu_down();
@@ -2163,6 +2319,19 @@ static int __init init_subsystems(void)
break;
case -ENODEV:
case -ENXIO:
+ /*
+ * No VGIC? No pKVM for you.
+ *
+ * Protected mode assumes that VGICv3 is present, so no point
+ * in trying to hobble along if vgic initialization fails.
+ */
+ if (is_protected_kvm_enabled())
+ goto out;
+
+ /*
+ * Otherwise, userspace could choose to implement a GIC for its
+ * guest on non-cooperative hardware.
+ */
vgic_present = false;
err = 0;
break;
@@ -2170,6 +2339,13 @@ static int __init init_subsystems(void)
goto out;
}
+ if (kvm_mode == KVM_MODE_NV &&
+ !(vgic_present && kvm_vgic_global_state.type == VGIC_V3)) {
+ kvm_err("NV support requires GICv3, giving up\n");
+ err = -EINVAL;
+ goto out;
+ }
+
/*
* Init HYP architected timer support
*/
@@ -2197,12 +2373,20 @@ static void __init teardown_subsystems(void)
static void __init teardown_hyp_mode(void)
{
+ bool free_sve = system_supports_sve() && is_protected_kvm_enabled();
int cpu;
free_hyp_pgds();
for_each_possible_cpu(cpu) {
- free_page(per_cpu(kvm_arm_hyp_stack_page, cpu));
+ free_pages(per_cpu(kvm_arm_hyp_stack_base, cpu), NVHE_STACK_SHIFT - PAGE_SHIFT);
free_pages(kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu], nvhe_percpu_order());
+
+ if (free_sve) {
+ struct cpu_sve_state *sve_state;
+
+ sve_state = per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state;
+ free_pages((unsigned long) sve_state, pkvm_host_sve_state_order());
+ }
}
}
@@ -2220,7 +2404,7 @@ static int __init do_pkvm_init(u32 hyp_va_bits)
/*
* The stub hypercalls are now disabled, so set our local flag to
- * prevent a later re-init attempt in kvm_arch_hardware_enable().
+ * prevent a later re-init attempt in kvm_arch_enable_virtualization_cpu().
*/
__this_cpu_write(kvm_hyp_initialized, 1);
preempt_enable();
@@ -2265,6 +2449,13 @@ static void kvm_hyp_init_symbols(void)
kvm_nvhe_sym(id_aa64smfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64SMFR0_EL1);
kvm_nvhe_sym(__icache_flags) = __icache_flags;
kvm_nvhe_sym(kvm_arm_vmid_bits) = kvm_arm_vmid_bits;
+
+ /*
+ * Flush entire BSS since part of its data containing init symbols is read
+ * while the MMU is off.
+ */
+ kvm_flush_dcache_to_poc(kvm_ksym_ref(__hyp_bss_start),
+ kvm_ksym_ref(__hyp_bss_end) - kvm_ksym_ref(__hyp_bss_start));
}
static int __init kvm_hyp_init_protection(u32 hyp_va_bits)
@@ -2285,6 +2476,50 @@ static int __init kvm_hyp_init_protection(u32 hyp_va_bits)
return 0;
}
+static int init_pkvm_host_sve_state(void)
+{
+ int cpu;
+
+ if (!system_supports_sve())
+ return 0;
+
+ /* Allocate pages for host sve state in protected mode. */
+ for_each_possible_cpu(cpu) {
+ struct page *page = alloc_pages(GFP_KERNEL, pkvm_host_sve_state_order());
+
+ if (!page)
+ return -ENOMEM;
+
+ per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state = page_address(page);
+ }
+
+ /*
+ * Don't map the pages in hyp since these are only used in protected
+ * mode, which will (re)create its own mapping when initialized.
+ */
+
+ return 0;
+}
+
+/*
+ * Finalizes the initialization of hyp mode, once everything else is initialized
+ * and the initialization process cannot fail.
+ */
+static void finalize_init_hyp_mode(void)
+{
+ int cpu;
+
+ if (system_supports_sve() && is_protected_kvm_enabled()) {
+ for_each_possible_cpu(cpu) {
+ struct cpu_sve_state *sve_state;
+
+ sve_state = per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state;
+ per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state =
+ kern_hyp_va(sve_state);
+ }
+ }
+}
+
static void pkvm_hyp_init_ptrauth(void)
{
struct kvm_cpu_context *hyp_ctxt;
@@ -2330,15 +2565,15 @@ static int __init init_hyp_mode(void)
* Allocate stack pages for Hypervisor-mode
*/
for_each_possible_cpu(cpu) {
- unsigned long stack_page;
+ unsigned long stack_base;
- stack_page = __get_free_page(GFP_KERNEL);
- if (!stack_page) {
+ stack_base = __get_free_pages(GFP_KERNEL, NVHE_STACK_SHIFT - PAGE_SHIFT);
+ if (!stack_base) {
err = -ENOMEM;
goto out_err;
}
- per_cpu(kvm_arm_hyp_stack_page, cpu) = stack_page;
+ per_cpu(kvm_arm_hyp_stack_base, cpu) = stack_base;
}
/*
@@ -2407,9 +2642,9 @@ static int __init init_hyp_mode(void)
*/
for_each_possible_cpu(cpu) {
struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu);
- char *stack_page = (char *)per_cpu(kvm_arm_hyp_stack_page, cpu);
+ char *stack_base = (char *)per_cpu(kvm_arm_hyp_stack_base, cpu);
- err = create_hyp_stack(__pa(stack_page), &params->stack_hyp_va);
+ err = create_hyp_stack(__pa(stack_base), &params->stack_hyp_va);
if (err) {
kvm_err("Cannot map hyp stack\n");
goto out_err;
@@ -2421,7 +2656,7 @@ static int __init init_hyp_mode(void)
* __hyp_pa() won't do the right thing there, since the stack
* has been mapped in the flexible private VA space.
*/
- params->stack_pa = __pa(stack_page);
+ params->stack_pa = __pa(stack_base);
}
for_each_possible_cpu(cpu) {
@@ -2453,6 +2688,10 @@ static int __init init_hyp_mode(void)
goto out_err;
}
+ err = init_pkvm_host_sve_state();
+ if (err)
+ goto out_err;
+
err = kvm_hyp_init_protection(hyp_va_bits);
if (err) {
kvm_err("Failed to init hyp memory protection\n");
@@ -2470,21 +2709,27 @@ out_err:
struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr)
{
- struct kvm_vcpu *vcpu;
+ struct kvm_vcpu *vcpu = NULL;
+ struct kvm_mpidr_data *data;
unsigned long i;
mpidr &= MPIDR_HWID_BITMASK;
- if (kvm->arch.mpidr_data) {
- u16 idx = kvm_mpidr_index(kvm->arch.mpidr_data, mpidr);
+ rcu_read_lock();
+ data = rcu_dereference(kvm->arch.mpidr_data);
- vcpu = kvm_get_vcpu(kvm,
- kvm->arch.mpidr_data->cmpidr_to_idx[idx]);
+ if (data) {
+ u16 idx = kvm_mpidr_index(data, mpidr);
+
+ vcpu = kvm_get_vcpu(kvm, data->cmpidr_to_idx[idx]);
if (mpidr != kvm_vcpu_get_mpidr_aff(vcpu))
vcpu = NULL;
+ }
+ rcu_read_unlock();
+
+ if (vcpu)
return vcpu;
- }
kvm_for_each_vcpu(i, vcpu, kvm) {
if (mpidr == kvm_vcpu_get_mpidr_aff(vcpu))
@@ -2508,6 +2753,14 @@ int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
{
struct kvm_kernel_irqfd *irqfd =
container_of(cons, struct kvm_kernel_irqfd, consumer);
+ struct kvm_kernel_irq_routing_entry *irq_entry = &irqfd->irq_entry;
+
+ /*
+ * The only thing we have a chance of directly-injecting is LPIs. Maybe
+ * one day...
+ */
+ if (irq_entry->type != KVM_IRQ_ROUTING_MSI)
+ return 0;
return kvm_vgic_v4_set_forwarding(irqfd->kvm, prod->irq,
&irqfd->irq_entry);
@@ -2517,6 +2770,10 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
{
struct kvm_kernel_irqfd *irqfd =
container_of(cons, struct kvm_kernel_irqfd, consumer);
+ struct kvm_kernel_irq_routing_entry *irq_entry = &irqfd->irq_entry;
+
+ if (irq_entry->type != KVM_IRQ_ROUTING_MSI)
+ return;
kvm_vgic_v4_unset_forwarding(irqfd->kvm, prod->irq,
&irqfd->irq_entry);
@@ -2597,14 +2854,12 @@ static __init int kvm_arm_init(void)
if (err)
goto out_hyp;
- if (is_protected_kvm_enabled()) {
- kvm_info("Protected nVHE mode initialized successfully\n");
- } else if (in_hyp_mode) {
- kvm_info("VHE mode initialized successfully\n");
- } else {
- char mode = cpus_have_final_cap(ARM64_KVM_HVHE) ? 'h' : 'n';
- kvm_info("Hyp mode (%cVHE) initialized successfully\n", mode);
- }
+ kvm_info("%s%sVHE%s mode initialized successfully\n",
+ in_hyp_mode ? "" : (is_protected_kvm_enabled() ?
+ "Protected " : "Hyp "),
+ in_hyp_mode ? "" : (cpus_have_final_cap(ARM64_KVM_HVHE) ?
+ "h" : "n"),
+ cpus_have_final_cap(ARM64_HAS_NESTED_VIRT) ? "+NV2": "");
/*
* FIXME: Do something reasonable if kvm_init() fails after pKVM
@@ -2614,6 +2869,13 @@ static __init int kvm_arm_init(void)
if (err)
goto out_subs;
+ /*
+ * This should be called after initialization is done and failure isn't
+ * possible anymore.
+ */
+ if (!in_hyp_mode)
+ finalize_init_hyp_mode();
+
kvm_arm_initialised = true;
return 0;
@@ -2666,6 +2928,36 @@ static int __init early_kvm_mode_cfg(char *arg)
}
early_param("kvm-arm.mode", early_kvm_mode_cfg);
+static int __init early_kvm_wfx_trap_policy_cfg(char *arg, enum kvm_wfx_trap_policy *p)
+{
+ if (!arg)
+ return -EINVAL;
+
+ if (strcmp(arg, "trap") == 0) {
+ *p = KVM_WFX_TRAP;
+ return 0;
+ }
+
+ if (strcmp(arg, "notrap") == 0) {
+ *p = KVM_WFX_NOTRAP;
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
+static int __init early_kvm_wfi_trap_policy_cfg(char *arg)
+{
+ return early_kvm_wfx_trap_policy_cfg(arg, &kvm_wfi_trap_policy);
+}
+early_param("kvm-arm.wfi_trap_policy", early_kvm_wfi_trap_policy_cfg);
+
+static int __init early_kvm_wfe_trap_policy_cfg(char *arg)
+{
+ return early_kvm_wfx_trap_policy_cfg(arg, &kvm_wfe_trap_policy);
+}
+early_param("kvm-arm.wfe_trap_policy", early_kvm_wfe_trap_policy_cfg);
+
enum kvm_mode kvm_get_mode(void)
{
return kvm_mode;
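
The early_param() hooks above accept only the strings "trap" and "notrap"; a purely illustrative kernel command line using the new parameters (the kvm-arm.mode setting is just an example) would be:

    kvm-arm.mode=nvhe kvm-arm.wfi_trap_policy=notrap kvm-arm.wfe_trap_policy=trap

If a parameter is omitted, the corresponding policy stays at the default KVM_WFX_NOTRAP_SINGLE_TASK, and the decision is then made per vCPU by kvm_vcpu_should_clear_twi()/kvm_vcpu_should_clear_twe() at load time, as shown earlier in this hunk.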
diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
new file mode 100644
index 000000000000..f74a66ce3064
--- /dev/null
+++ b/arch/arm64/kvm/at.c
@@ -0,0 +1,1446 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2017 - Linaro Ltd
+ * Author: Jintack Lim <jintack.lim@linaro.org>
+ */
+
+#include <linux/kvm_host.h>
+
+#include <asm/esr.h>
+#include <asm/kvm_hyp.h>
+#include <asm/kvm_mmu.h>
+
+enum trans_regime {
+ TR_EL10,
+ TR_EL20,
+ TR_EL2,
+};
+
+struct s1_walk_info {
+ u64 baddr;
+ enum trans_regime regime;
+ unsigned int max_oa_bits;
+ unsigned int pgshift;
+ unsigned int txsz;
+ int sl;
+ bool hpd;
+ bool e0poe;
+ bool poe;
+ bool pan;
+ bool be;
+ bool s2;
+};
+
+struct s1_walk_result {
+ union {
+ struct {
+ u64 desc;
+ u64 pa;
+ s8 level;
+ u8 APTable;
+ bool UXNTable;
+ bool PXNTable;
+ bool uwxn;
+ bool uov;
+ bool ur;
+ bool uw;
+ bool ux;
+ bool pwxn;
+ bool pov;
+ bool pr;
+ bool pw;
+ bool px;
+ };
+ struct {
+ u8 fst;
+ bool ptw;
+ bool s2;
+ };
+ };
+ bool failed;
+};
+
+static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool ptw, bool s2)
+{
+ wr->fst = fst;
+ wr->ptw = ptw;
+ wr->s2 = s2;
+ wr->failed = true;
+}
+
+#define S1_MMU_DISABLED (-127)
+
+static int get_ia_size(struct s1_walk_info *wi)
+{
+ return 64 - wi->txsz;
+}
+
+/* Return true if the IPA is out of the OA range */
+static bool check_output_size(u64 ipa, struct s1_walk_info *wi)
+{
+ return wi->max_oa_bits < 48 && (ipa & GENMASK_ULL(47, wi->max_oa_bits));
+}
+
+/* Return the translation regime that applies to an AT instruction */
+static enum trans_regime compute_translation_regime(struct kvm_vcpu *vcpu, u32 op)
+{
+ /*
+ * We only get here from guest EL2, so the translation
+ * regime AT applies to is solely defined by {E2H,TGE}.
+ */
+ switch (op) {
+ case OP_AT_S1E2R:
+ case OP_AT_S1E2W:
+ case OP_AT_S1E2A:
+ return vcpu_el2_e2h_is_set(vcpu) ? TR_EL20 : TR_EL2;
+ break;
+ default:
+ return (vcpu_el2_e2h_is_set(vcpu) &&
+ vcpu_el2_tge_is_set(vcpu)) ? TR_EL20 : TR_EL10;
+ }
+}
+
+static bool s1pie_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime)
+{
+ if (!kvm_has_s1pie(vcpu->kvm))
+ return false;
+
+ switch (regime) {
+ case TR_EL2:
+ case TR_EL20:
+ return vcpu_read_sys_reg(vcpu, TCR2_EL2) & TCR2_EL2_PIE;
+ case TR_EL10:
+ return (__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TCR2En) &&
+ (__vcpu_sys_reg(vcpu, TCR2_EL1) & TCR2_EL1_PIE);
+ default:
+ BUG();
+ }
+}
+
+static void compute_s1poe(struct kvm_vcpu *vcpu, struct s1_walk_info *wi)
+{
+ u64 val;
+
+ if (!kvm_has_s1poe(vcpu->kvm)) {
+ wi->poe = wi->e0poe = false;
+ return;
+ }
+
+ switch (wi->regime) {
+ case TR_EL2:
+ case TR_EL20:
+ val = vcpu_read_sys_reg(vcpu, TCR2_EL2);
+ wi->poe = val & TCR2_EL2_POE;
+ wi->e0poe = (wi->regime == TR_EL20) && (val & TCR2_EL2_E0POE);
+ break;
+ case TR_EL10:
+ if (!(__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TCR2En)) {
+ wi->poe = wi->e0poe = false;
+ return;
+ }
+
+ val = __vcpu_sys_reg(vcpu, TCR2_EL1);
+ wi->poe = val & TCR2_EL1_POE;
+ wi->e0poe = val & TCR2_EL1_E0POE;
+ }
+}
+
+static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi,
+ struct s1_walk_result *wr, u64 va)
+{
+ u64 hcr, sctlr, tcr, tg, ps, ia_bits, ttbr;
+ unsigned int stride, x;
+ bool va55, tbi, lva, as_el0;
+
+ hcr = __vcpu_sys_reg(vcpu, HCR_EL2);
+
+ wi->regime = compute_translation_regime(vcpu, op);
+ as_el0 = (op == OP_AT_S1E0R || op == OP_AT_S1E0W);
+ wi->pan = (op == OP_AT_S1E1RP || op == OP_AT_S1E1WP) &&
+ (*vcpu_cpsr(vcpu) & PSR_PAN_BIT);
+
+ va55 = va & BIT(55);
+
+ if (wi->regime == TR_EL2 && va55)
+ goto addrsz;
+
+ wi->s2 = wi->regime == TR_EL10 && (hcr & (HCR_VM | HCR_DC));
+
+ switch (wi->regime) {
+ case TR_EL10:
+ sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1);
+ tcr = vcpu_read_sys_reg(vcpu, TCR_EL1);
+ ttbr = (va55 ?
+ vcpu_read_sys_reg(vcpu, TTBR1_EL1) :
+ vcpu_read_sys_reg(vcpu, TTBR0_EL1));
+ break;
+ case TR_EL2:
+ case TR_EL20:
+ sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2);
+ tcr = vcpu_read_sys_reg(vcpu, TCR_EL2);
+ ttbr = (va55 ?
+ vcpu_read_sys_reg(vcpu, TTBR1_EL2) :
+ vcpu_read_sys_reg(vcpu, TTBR0_EL2));
+ break;
+ default:
+ BUG();
+ }
+
+ tbi = (wi->regime == TR_EL2 ?
+ FIELD_GET(TCR_EL2_TBI, tcr) :
+ (va55 ?
+ FIELD_GET(TCR_TBI1, tcr) :
+ FIELD_GET(TCR_TBI0, tcr)));
+
+ if (!tbi && (u64)sign_extend64(va, 55) != va)
+ goto addrsz;
+
+ va = (u64)sign_extend64(va, 55);
+
+ /* Let's put the MMU disabled case aside immediately */
+ switch (wi->regime) {
+ case TR_EL10:
+ /*
+ * If dealing with the EL1&0 translation regime, 3 things
+ * can disable the S1 translation:
+ *
+ * - HCR_EL2.DC = 1
+ * - HCR_EL2.{E2H,TGE} = {0,1}
+ * - SCTLR_EL1.M = 0
+ *
+ * The TGE part is interesting. If we have decided that this
+ * is EL1&0, then it means that either {E2H,TGE} == {1,0} or
+ * {0,x}, and we only need to test for TGE == 1.
+ */
+ if (hcr & (HCR_DC | HCR_TGE)) {
+ wr->level = S1_MMU_DISABLED;
+ break;
+ }
+ fallthrough;
+ case TR_EL2:
+ case TR_EL20:
+ if (!(sctlr & SCTLR_ELx_M))
+ wr->level = S1_MMU_DISABLED;
+ break;
+ }
+
+ if (wr->level == S1_MMU_DISABLED) {
+ if (va >= BIT(kvm_get_pa_bits(vcpu->kvm)))
+ goto addrsz;
+
+ wr->pa = va;
+ return 0;
+ }
+
+ wi->be = sctlr & SCTLR_ELx_EE;
+
+ wi->hpd = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HPDS, IMP);
+ wi->hpd &= (wi->regime == TR_EL2 ?
+ FIELD_GET(TCR_EL2_HPD, tcr) :
+ (va55 ?
+ FIELD_GET(TCR_HPD1, tcr) :
+ FIELD_GET(TCR_HPD0, tcr)));
+ /* R_JHSVW */
+ wi->hpd |= s1pie_enabled(vcpu, wi->regime);
+
+ /* Do we have POE? */
+ compute_s1poe(vcpu, wi);
+
+ /* R_BVXDG */
+ wi->hpd |= (wi->poe || wi->e0poe);
+
+ /* Someone was silly enough to encode TG0/TG1 differently */
+ if (va55) {
+ wi->txsz = FIELD_GET(TCR_T1SZ_MASK, tcr);
+ tg = FIELD_GET(TCR_TG1_MASK, tcr);
+
+ switch (tg << TCR_TG1_SHIFT) {
+ case TCR_TG1_4K:
+ wi->pgshift = 12; break;
+ case TCR_TG1_16K:
+ wi->pgshift = 14; break;
+ case TCR_TG1_64K:
+ default: /* IMPDEF: treat any other value as 64k */
+ wi->pgshift = 16; break;
+ }
+ } else {
+ wi->txsz = FIELD_GET(TCR_T0SZ_MASK, tcr);
+ tg = FIELD_GET(TCR_TG0_MASK, tcr);
+
+ switch (tg << TCR_TG0_SHIFT) {
+ case TCR_TG0_4K:
+ wi->pgshift = 12; break;
+ case TCR_TG0_16K:
+ wi->pgshift = 14; break;
+ case TCR_TG0_64K:
+ default: /* IMPDEF: treat any other value as 64k */
+ wi->pgshift = 16; break;
+ }
+ }
+
+ /* R_PLCGL, R_YXNYW */
+ if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR2_EL1, ST, 48_47)) {
+ if (wi->txsz > 39)
+ goto transfault_l0;
+ } else {
+ if (wi->txsz > 48 || (BIT(wi->pgshift) == SZ_64K && wi->txsz > 47))
+ goto transfault_l0;
+ }
+
+ /* R_GTJBY, R_SXWGM */
+ switch (BIT(wi->pgshift)) {
+ case SZ_4K:
+ lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN4, 52_BIT);
+ lva &= tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS);
+ break;
+ case SZ_16K:
+ lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN16, 52_BIT);
+ lva &= tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS);
+ break;
+ case SZ_64K:
+ lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, VARange, 52);
+ break;
+ }
+
+ if ((lva && wi->txsz < 12) || (!lva && wi->txsz < 16))
+ goto transfault_l0;
+
+ ia_bits = get_ia_size(wi);
+
+ /* R_YYVYV, I_THCZK */
+ if ((!va55 && va > GENMASK(ia_bits - 1, 0)) ||
+ (va55 && va < GENMASK(63, ia_bits)))
+ goto transfault_l0;
+
+ /* I_ZFSYQ */
+ if (wi->regime != TR_EL2 &&
+ (tcr & (va55 ? TCR_EPD1_MASK : TCR_EPD0_MASK)))
+ goto transfault_l0;
+
+ /* R_BNDVG and following statements */
+ if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, E0PD, IMP) &&
+ as_el0 && (tcr & (va55 ? TCR_E0PD1 : TCR_E0PD0)))
+ goto transfault_l0;
+
+ /* AArch64.S1StartLevel() */
+ stride = wi->pgshift - 3;
+ wi->sl = 3 - (((ia_bits - 1) - wi->pgshift) / stride);
+
+ ps = (wi->regime == TR_EL2 ?
+ FIELD_GET(TCR_EL2_PS_MASK, tcr) : FIELD_GET(TCR_IPS_MASK, tcr));
+
+ wi->max_oa_bits = min(get_kvm_ipa_limit(), ps_to_output_size(ps));
+
+ /* Compute minimal alignment */
+ x = 3 + ia_bits - ((3 - wi->sl) * stride + wi->pgshift);
+
+ wi->baddr = ttbr & TTBRx_EL1_BADDR;
+
+ /* R_VPBBF */
+ if (check_output_size(wi->baddr, wi))
+ goto addrsz;
+
+ wi->baddr &= GENMASK_ULL(wi->max_oa_bits - 1, x);
+
+ return 0;
+
+addrsz: /* Address Size Fault level 0 */
+ fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(0), false, false);
+ return -EFAULT;
+
+transfault_l0: /* Translation Fault level 0 */
+ fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(0), false, false);
+ return -EFAULT;
+}
+
+static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
+ struct s1_walk_result *wr, u64 va)
+{
+ u64 va_top, va_bottom, baddr, desc;
+ int level, stride, ret;
+
+ level = wi->sl;
+ stride = wi->pgshift - 3;
+ baddr = wi->baddr;
+
+ va_top = get_ia_size(wi) - 1;
+
+ while (1) {
+ u64 index, ipa;
+
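+ /*
+ * va_bottom is the lowest VA bit translated at this level; the
+ * ">> (va_bottom - 3)" below scales the extracted VA field by the
+ * 8-byte descriptor size to form the offset into the table.
+ */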
+ va_bottom = (3 - level) * stride + wi->pgshift;
+ index = (va & GENMASK_ULL(va_top, va_bottom)) >> (va_bottom - 3);
+
+ ipa = baddr | index;
+
+ if (wi->s2) {
+ struct kvm_s2_trans s2_trans = {};
+
+ ret = kvm_walk_nested_s2(vcpu, ipa, &s2_trans);
+ if (ret) {
+ fail_s1_walk(wr,
+ (s2_trans.esr & ~ESR_ELx_FSC_LEVEL) | level,
+ true, true);
+ return ret;
+ }
+
+ if (!kvm_s2_trans_readable(&s2_trans)) {
+ fail_s1_walk(wr, ESR_ELx_FSC_PERM_L(level),
+ true, true);
+
+ return -EPERM;
+ }
+
+ ipa = kvm_s2_trans_output(&s2_trans);
+ }
+
+ ret = kvm_read_guest(vcpu->kvm, ipa, &desc, sizeof(desc));
+ if (ret) {
+ fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level),
+ true, false);
+ return ret;
+ }
+
+ if (wi->be)
+ desc = be64_to_cpu((__force __be64)desc);
+ else
+ desc = le64_to_cpu((__force __le64)desc);
+
+ /* Invalid descriptor */
+ if (!(desc & BIT(0)))
+ goto transfault;
+
+ /* Block mapping, check validity down the line */
+ if (!(desc & BIT(1)))
+ break;
+
+ /* Page mapping */
+ if (level == 3)
+ break;
+
+ /* Table handling */
+ if (!wi->hpd) {
+ wr->APTable |= FIELD_GET(S1_TABLE_AP, desc);
+ wr->UXNTable |= FIELD_GET(PMD_TABLE_UXN, desc);
+ wr->PXNTable |= FIELD_GET(PMD_TABLE_PXN, desc);
+ }
+
+ baddr = desc & GENMASK_ULL(47, wi->pgshift);
+
+ /* Check for out-of-range OA */
+ if (check_output_size(baddr, wi))
+ goto addrsz;
+
+ /* Prepare for next round */
+ va_top = va_bottom - 1;
+ level++;
+ }
+
+ /* Block mapping, check the validity of the level */
+ if (!(desc & BIT(1))) {
+ bool valid_block = false;
+
+ switch (BIT(wi->pgshift)) {
+ case SZ_4K:
+ valid_block = level == 1 || level == 2;
+ break;
+ case SZ_16K:
+ case SZ_64K:
+ valid_block = level == 2;
+ break;
+ }
+
+ if (!valid_block)
+ goto transfault;
+ }
+
+ if (check_output_size(desc & GENMASK(47, va_bottom), wi))
+ goto addrsz;
+
+ va_bottom += contiguous_bit_shift(desc, wi, level);
+
+ wr->failed = false;
+ wr->level = level;
+ wr->desc = desc;
+ wr->pa = desc & GENMASK(47, va_bottom);
+ wr->pa |= va & GENMASK_ULL(va_bottom - 1, 0);
+
+ return 0;
+
+addrsz:
+ fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(level), true, false);
+ return -EINVAL;
+transfault:
+ fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(level), true, false);
+ return -ENOENT;
+}
+
+struct mmu_config {
+ u64 ttbr0;
+ u64 ttbr1;
+ u64 tcr;
+ u64 mair;
+ u64 tcr2;
+ u64 pir;
+ u64 pire0;
+ u64 por_el0;
+ u64 por_el1;
+ u64 sctlr;
+ u64 vttbr;
+ u64 vtcr;
+ u64 hcr;
+};
+
+static void __mmu_config_save(struct mmu_config *config)
+{
+ config->ttbr0 = read_sysreg_el1(SYS_TTBR0);
+ config->ttbr1 = read_sysreg_el1(SYS_TTBR1);
+ config->tcr = read_sysreg_el1(SYS_TCR);
+ config->mair = read_sysreg_el1(SYS_MAIR);
+ if (cpus_have_final_cap(ARM64_HAS_TCR2)) {
+ config->tcr2 = read_sysreg_el1(SYS_TCR2);
+ if (cpus_have_final_cap(ARM64_HAS_S1PIE)) {
+ config->pir = read_sysreg_el1(SYS_PIR);
+ config->pire0 = read_sysreg_el1(SYS_PIRE0);
+ }
+ if (system_supports_poe()) {
+ config->por_el1 = read_sysreg_el1(SYS_POR);
+ config->por_el0 = read_sysreg_s(SYS_POR_EL0);
+ }
+ }
+ config->sctlr = read_sysreg_el1(SYS_SCTLR);
+ config->vttbr = read_sysreg(vttbr_el2);
+ config->vtcr = read_sysreg(vtcr_el2);
+ config->hcr = read_sysreg(hcr_el2);
+}
+
+static void __mmu_config_restore(struct mmu_config *config)
+{
+ write_sysreg(config->hcr, hcr_el2);
+
+ /*
+ * ARM errata 1165522 and 1530923 require TGE to be 1 before
+ * we update the guest state.
+ */
+ asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT));
+
+ write_sysreg_el1(config->ttbr0, SYS_TTBR0);
+ write_sysreg_el1(config->ttbr1, SYS_TTBR1);
+ write_sysreg_el1(config->tcr, SYS_TCR);
+ write_sysreg_el1(config->mair, SYS_MAIR);
+ if (cpus_have_final_cap(ARM64_HAS_TCR2)) {
+ write_sysreg_el1(config->tcr2, SYS_TCR2);
+ if (cpus_have_final_cap(ARM64_HAS_S1PIE)) {
+ write_sysreg_el1(config->pir, SYS_PIR);
+ write_sysreg_el1(config->pire0, SYS_PIRE0);
+ }
+ if (system_supports_poe()) {
+ write_sysreg_el1(config->por_el1, SYS_POR);
+ write_sysreg_s(config->por_el0, SYS_POR_EL0);
+ }
+ }
+ write_sysreg_el1(config->sctlr, SYS_SCTLR);
+ write_sysreg(config->vttbr, vttbr_el2);
+ write_sysreg(config->vtcr, vtcr_el2);
+}
+
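+/*
+ * AT S1E1{R,W}P honour PSTATE.PAN, so temporarily mirror the guest's PAN
+ * bit onto the host before issuing the walk, and restore it afterwards.
+ */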
+static bool at_s1e1p_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
+{
+ u64 host_pan;
+ bool fail;
+
+ host_pan = read_sysreg_s(SYS_PSTATE_PAN);
+ write_sysreg_s(*vcpu_cpsr(vcpu) & PSTATE_PAN, SYS_PSTATE_PAN);
+
+ switch (op) {
+ case OP_AT_S1E1RP:
+ fail = __kvm_at(OP_AT_S1E1RP, vaddr);
+ break;
+ case OP_AT_S1E1WP:
+ fail = __kvm_at(OP_AT_S1E1WP, vaddr);
+ break;
+ }
+
+ write_sysreg_s(host_pan, SYS_PSTATE_PAN);
+
+ return fail;
+}
+
+#define MEMATTR(ic, oc) (MEMATTR_##oc << 4 | MEMATTR_##ic)
+#define MEMATTR_NC 0b0100
+#define MEMATTR_Wt 0b1000
+#define MEMATTR_Wb 0b1100
+#define MEMATTR_WbRaWa 0b1111
+
+#define MEMATTR_IS_DEVICE(m) (((m) & GENMASK(7, 4)) == 0)
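+
+/*
+ * The resulting 8-bit attribute follows the MAIR_ELx layout: outer
+ * attribute in bits [7:4], inner attribute in bits [3:0]. For example,
+ * MEMATTR(Wt, NC) is 0b0100_1000. An all-zero outer nibble denotes one
+ * of the Device memory types.
+ */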
+
+static u8 s2_memattr_to_attr(u8 memattr)
+{
+ memattr &= 0b1111;
+
+ switch (memattr) {
+ case 0b0000:
+ case 0b0001:
+ case 0b0010:
+ case 0b0011:
+ return memattr << 2;
+ case 0b0100:
+ return MEMATTR(Wb, Wb);
+ case 0b0101:
+ return MEMATTR(NC, NC);
+ case 0b0110:
+ return MEMATTR(Wt, NC);
+ case 0b0111:
+ return MEMATTR(Wb, NC);
+ case 0b1000:
+ /* Reserved, assume NC */
+ return MEMATTR(NC, NC);
+ case 0b1001:
+ return MEMATTR(NC, Wt);
+ case 0b1010:
+ return MEMATTR(Wt, Wt);
+ case 0b1011:
+ return MEMATTR(Wb, Wt);
+ case 0b1100:
+ /* Reserved, assume NC */
+ return MEMATTR(NC, NC);
+ case 0b1101:
+ return MEMATTR(NC, Wb);
+ case 0b1110:
+ return MEMATTR(Wt, Wb);
+ case 0b1111:
+ return MEMATTR(Wb, Wb);
+ default:
+ unreachable();
+ }
+}
+
+static u8 combine_s1_s2_attr(u8 s1, u8 s2)
+{
+ bool transient;
+ u8 final = 0;
+
+ /* Upgrade transient s1 to non-transient to simplify things */
+ switch (s1) {
+ case 0b0001 ... 0b0011: /* Normal, Write-Through Transient */
+ transient = true;
+ s1 = MEMATTR_Wt | (s1 & GENMASK(1, 0));
+ break;
+ case 0b0101 ... 0b0111: /* Normal, Write-Back Transient */
+ transient = true;
+ s1 = MEMATTR_Wb | (s1 & GENMASK(1, 0));
+ break;
+ default:
+ transient = false;
+ }
+
+ /* S2CombineS1AttrHints() */
+ if ((s1 & GENMASK(3, 2)) == MEMATTR_NC ||
+ (s2 & GENMASK(3, 2)) == MEMATTR_NC)
+ final = MEMATTR_NC;
+ else if ((s1 & GENMASK(3, 2)) == MEMATTR_Wt ||
+ (s2 & GENMASK(3, 2)) == MEMATTR_Wt)
+ final = MEMATTR_Wt;
+ else
+ final = MEMATTR_Wb;
+
+ if (final != MEMATTR_NC) {
+ /* Inherit RaWa hints from S1 */
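+ /*
+ * Transient encodings have bits [3:2] == 0b00 (Write-Through) or
+ * 0b01 (Write-Back); the switch below rebuilds such an encoding,
+ * and the RaWa hints are OR'ed back in afterwards.
+ */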
+ if (transient) {
+ switch (s1 & GENMASK(3, 2)) {
+ case MEMATTR_Wt:
+ final = 0;
+ break;
+ case MEMATTR_Wb:
+ final = MEMATTR_NC;
+ break;
+ }
+ }
+
+ final |= s1 & GENMASK(1, 0);
+ }
+
+ return final;
+}
+
+#define ATTR_NSH 0b00
+#define ATTR_RSV 0b01
+#define ATTR_OSH 0b10
+#define ATTR_ISH 0b11
+
+static u8 compute_sh(u8 attr, u64 desc)
+{
+ u8 sh;
+
+ /* Any form of device, as well as NC has SH[1:0]=0b10 */
+ if (MEMATTR_IS_DEVICE(attr) || attr == MEMATTR(NC, NC))
+ return ATTR_OSH;
+
+ sh = FIELD_GET(PTE_SHARED, desc);
+ if (sh == ATTR_RSV) /* Reserved, mapped to NSH */
+ sh = ATTR_NSH;
+
+ return sh;
+}
+
+static u8 combine_sh(u8 s1_sh, u8 s2_sh)
+{
+ if (s1_sh == ATTR_OSH || s2_sh == ATTR_OSH)
+ return ATTR_OSH;
+ if (s1_sh == ATTR_ISH || s2_sh == ATTR_ISH)
+ return ATTR_ISH;
+
+ return ATTR_NSH;
+}
+
+static u64 compute_par_s12(struct kvm_vcpu *vcpu, u64 s1_par,
+ struct kvm_s2_trans *tr)
+{
+ u8 s1_parattr, s2_memattr, final_attr;
+ u64 par;
+
+ /* If S2 has failed to translate, report the damage */
+ if (tr->esr) {
+ par = SYS_PAR_EL1_RES1;
+ par |= SYS_PAR_EL1_F;
+ par |= SYS_PAR_EL1_S;
+ par |= FIELD_PREP(SYS_PAR_EL1_FST, tr->esr);
+ return par;
+ }
+
+ s1_parattr = FIELD_GET(SYS_PAR_EL1_ATTR, s1_par);
+ s2_memattr = FIELD_GET(GENMASK(5, 2), tr->desc);
+
+ if (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_FWB) {
+ if (!kvm_has_feat(vcpu->kvm, ID_AA64PFR2_EL1, MTEPERM, IMP))
+ s2_memattr &= ~BIT(3);
+
+ /* Combination of R_VRJSW and R_RHWZM */
+ switch (s2_memattr) {
+ case 0b0101:
+ if (MEMATTR_IS_DEVICE(s1_parattr))
+ final_attr = s1_parattr;
+ else
+ final_attr = MEMATTR(NC, NC);
+ break;
+ case 0b0110:
+ case 0b1110:
+ final_attr = MEMATTR(WbRaWa, WbRaWa);
+ break;
+ case 0b0111:
+ case 0b1111:
+ /* Preserve S1 attribute */
+ final_attr = s1_parattr;
+ break;
+ case 0b0100:
+ case 0b1100:
+ case 0b1101:
+ /* Reserved, do something non-silly */
+ final_attr = s1_parattr;
+ break;
+ default:
+ /*
+ * MemAttr[2]=0, Device from S2.
+ *
+ * FWB does not influence the way that stage 1
+ * memory types and attributes are combined
+ * with stage 2 Device type and attributes.
+ */
+ final_attr = min(s2_memattr_to_attr(s2_memattr),
+ s1_parattr);
+ }
+ } else {
+ /* Combination of R_HMNDG, R_TNHFM and R_GQFSF */
+ u8 s2_parattr = s2_memattr_to_attr(s2_memattr);
+
+ if (MEMATTR_IS_DEVICE(s1_parattr) ||
+ MEMATTR_IS_DEVICE(s2_parattr)) {
+ final_attr = min(s1_parattr, s2_parattr);
+ } else {
+ /* At this stage, this is memory vs memory */
+ final_attr = combine_s1_s2_attr(s1_parattr & 0xf,
+ s2_parattr & 0xf);
+ final_attr |= combine_s1_s2_attr(s1_parattr >> 4,
+ s2_parattr >> 4) << 4;
+ }
+ }
+
+ if ((__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_CD) &&
+ !MEMATTR_IS_DEVICE(final_attr))
+ final_attr = MEMATTR(NC, NC);
+
+ par = FIELD_PREP(SYS_PAR_EL1_ATTR, final_attr);
+ par |= tr->output & GENMASK(47, 12);
+ par |= FIELD_PREP(SYS_PAR_EL1_SH,
+ combine_sh(FIELD_GET(SYS_PAR_EL1_SH, s1_par),
+ compute_sh(final_attr, tr->desc)));
+
+ return par;
+}
+
+static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_result *wr,
+ enum trans_regime regime)
+{
+ u64 par;
+
+ if (wr->failed) {
+ par = SYS_PAR_EL1_RES1;
+ par |= SYS_PAR_EL1_F;
+ par |= FIELD_PREP(SYS_PAR_EL1_FST, wr->fst);
+ par |= wr->ptw ? SYS_PAR_EL1_PTW : 0;
+ par |= wr->s2 ? SYS_PAR_EL1_S : 0;
+ } else if (wr->level == S1_MMU_DISABLED) {
+ /* MMU off or HCR_EL2.DC == 1 */
+ par = SYS_PAR_EL1_NSE;
+ par |= wr->pa & GENMASK_ULL(47, 12);
+
+ if (regime == TR_EL10 &&
+ (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) {
+ par |= FIELD_PREP(SYS_PAR_EL1_ATTR,
+ MEMATTR(WbRaWa, WbRaWa));
+ par |= FIELD_PREP(SYS_PAR_EL1_SH, ATTR_NSH);
+ } else {
+ par |= FIELD_PREP(SYS_PAR_EL1_ATTR, 0); /* nGnRnE */
+ par |= FIELD_PREP(SYS_PAR_EL1_SH, ATTR_OSH);
+ }
+ } else {
+ u64 mair, sctlr;
+ u8 sh;
+
+ par = SYS_PAR_EL1_NSE;
+
+ mair = (regime == TR_EL10 ?
+ vcpu_read_sys_reg(vcpu, MAIR_EL1) :
+ vcpu_read_sys_reg(vcpu, MAIR_EL2));
+
+ mair >>= FIELD_GET(PTE_ATTRINDX_MASK, wr->desc) * 8;
+ mair &= 0xff;
+
+ sctlr = (regime == TR_EL10 ?
+ vcpu_read_sys_reg(vcpu, SCTLR_EL1) :
+ vcpu_read_sys_reg(vcpu, SCTLR_EL2));
+
+ /* Force NC for memory if SCTLR_ELx.C is clear */
+ if (!(sctlr & SCTLR_EL1_C) && !MEMATTR_IS_DEVICE(mair))
+ mair = MEMATTR(NC, NC);
+
+ par |= FIELD_PREP(SYS_PAR_EL1_ATTR, mair);
+ par |= wr->pa & GENMASK_ULL(47, 12);
+
+ sh = compute_sh(mair, wr->desc);
+ par |= FIELD_PREP(SYS_PAR_EL1_SH, sh);
+ }
+
+ return par;
+}
+
+static bool pan3_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime)
+{
+ u64 sctlr;
+
+ if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, PAN, PAN3))
+ return false;
+
+ if (s1pie_enabled(vcpu, regime))
+ return true;
+
+ if (regime == TR_EL10)
+ sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1);
+ else
+ sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2);
+
+ return sctlr & SCTLR_EL1_EPAN;
+}
+
+static void compute_s1_direct_permissions(struct kvm_vcpu *vcpu,
+ struct s1_walk_info *wi,
+ struct s1_walk_result *wr)
+{
+ bool wxn;
+
+ /* Non-hierarchical part of AArch64.S1DirectBasePermissions() */
+ if (wi->regime != TR_EL2) {
+ switch (FIELD_GET(PTE_USER | PTE_RDONLY, wr->desc)) {
+ case 0b00:
+ wr->pr = wr->pw = true;
+ wr->ur = wr->uw = false;
+ break;
+ case 0b01:
+ wr->pr = wr->pw = wr->ur = wr->uw = true;
+ break;
+ case 0b10:
+ wr->pr = true;
+ wr->pw = wr->ur = wr->uw = false;
+ break;
+ case 0b11:
+ wr->pr = wr->ur = true;
+ wr->pw = wr->uw = false;
+ break;
+ }
+
+ /* We don't use px for anything yet, but hey... */
+ wr->px = !((wr->desc & PTE_PXN) || wr->uw);
+ wr->ux = !(wr->desc & PTE_UXN);
+ } else {
+ wr->ur = wr->uw = wr->ux = false;
+
+ if (!(wr->desc & PTE_RDONLY)) {
+ wr->pr = wr->pw = true;
+ } else {
+ wr->pr = true;
+ wr->pw = false;
+ }
+
+ /* XN maps to UXN */
+ wr->px = !(wr->desc & PTE_UXN);
+ }
+
+ switch (wi->regime) {
+ case TR_EL2:
+ case TR_EL20:
+ wxn = (vcpu_read_sys_reg(vcpu, SCTLR_EL2) & SCTLR_ELx_WXN);
+ break;
+ case TR_EL10:
+ wxn = (__vcpu_sys_reg(vcpu, SCTLR_EL1) & SCTLR_ELx_WXN);
+ break;
+ }
+
+ wr->pwxn = wr->uwxn = wxn;
+ wr->pov = wi->poe;
+ wr->uov = wi->e0poe;
+}
+
+static void compute_s1_hierarchical_permissions(struct kvm_vcpu *vcpu,
+ struct s1_walk_info *wi,
+ struct s1_walk_result *wr)
+{
+ /* Hierarchical part of AArch64.S1DirectBasePermissions() */
+ if (wi->regime != TR_EL2) {
+ switch (wr->APTable) {
+ case 0b00:
+ break;
+ case 0b01:
+ wr->ur = wr->uw = false;
+ break;
+ case 0b10:
+ wr->pw = wr->uw = false;
+ break;
+ case 0b11:
+ wr->pw = wr->ur = wr->uw = false;
+ break;
+ }
+
+ wr->px &= !wr->PXNTable;
+ wr->ux &= !wr->UXNTable;
+ } else {
+ if (wr->APTable & BIT(1))
+ wr->pw = false;
+
+ /* XN maps to UXN */
+ wr->px &= !wr->UXNTable;
+ }
+}
+
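+/*
+ * PIR_ELx/PIRE0_ELx/POR_ELx pack sixteen 4-bit permission fields; perm_idx()
+ * extracts the field selected by the descriptor's PI/PO index.
+ */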
+#define perm_idx(v, r, i) ((vcpu_read_sys_reg((v), (r)) >> ((i) * 4)) & 0xf)
+
+#define set_priv_perms(wr, r, w, x) \
+ do { \
+ (wr)->pr = (r); \
+ (wr)->pw = (w); \
+ (wr)->px = (x); \
+ } while (0)
+
+#define set_unpriv_perms(wr, r, w, x) \
+ do { \
+ (wr)->ur = (r); \
+ (wr)->uw = (w); \
+ (wr)->ux = (x); \
+ } while (0)
+
+#define set_priv_wxn(wr, v) \
+ do { \
+ (wr)->pwxn = (v); \
+ } while (0)
+
+#define set_unpriv_wxn(wr, v) \
+ do { \
+ (wr)->uwxn = (v); \
+ } while (0)
+
+/* Similar to AArch64.S1IndirectBasePermissions(), without GCS */
+#define set_perms(w, wr, ip) \
+ do { \
+ /* R_LLZDZ */ \
+ switch ((ip)) { \
+ case 0b0000: \
+ set_ ## w ## _perms((wr), false, false, false); \
+ break; \
+ case 0b0001: \
+ set_ ## w ## _perms((wr), true , false, false); \
+ break; \
+ case 0b0010: \
+ set_ ## w ## _perms((wr), false, false, true ); \
+ break; \
+ case 0b0011: \
+ set_ ## w ## _perms((wr), true , false, true ); \
+ break; \
+ case 0b0100: \
+ set_ ## w ## _perms((wr), false, false, false); \
+ break; \
+ case 0b0101: \
+ set_ ## w ## _perms((wr), true , true , false); \
+ break; \
+ case 0b0110: \
+ set_ ## w ## _perms((wr), true , true , true ); \
+ break; \
+ case 0b0111: \
+ set_ ## w ## _perms((wr), true , true , true ); \
+ break; \
+ case 0b1000: \
+ set_ ## w ## _perms((wr), true , false, false); \
+ break; \
+ case 0b1001: \
+ set_ ## w ## _perms((wr), true , false, false); \
+ break; \
+ case 0b1010: \
+ set_ ## w ## _perms((wr), true , false, true ); \
+ break; \
+ case 0b1011: \
+ set_ ## w ## _perms((wr), false, false, false); \
+ break; \
+ case 0b1100: \
+ set_ ## w ## _perms((wr), true , true , false); \
+ break; \
+ case 0b1101: \
+ set_ ## w ## _perms((wr), false, false, false); \
+ break; \
+ case 0b1110: \
+ set_ ## w ## _perms((wr), true , true , true ); \
+ break; \
+ case 0b1111: \
+ set_ ## w ## _perms((wr), false, false, false); \
+ break; \
+ } \
+ \
+ /* R_HJYGR */ \
+ set_ ## w ## _wxn((wr), ((ip) == 0b0110)); \
+ \
+ } while (0)
+
+static void compute_s1_indirect_permissions(struct kvm_vcpu *vcpu,
+ struct s1_walk_info *wi,
+ struct s1_walk_result *wr)
+{
+ u8 up, pp, idx;
+
+ idx = pte_pi_index(wr->desc);
+
+ switch (wi->regime) {
+ case TR_EL10:
+ pp = perm_idx(vcpu, PIR_EL1, idx);
+ up = perm_idx(vcpu, PIRE0_EL1, idx);
+ break;
+ case TR_EL20:
+ pp = perm_idx(vcpu, PIR_EL2, idx);
+ up = perm_idx(vcpu, PIRE0_EL2, idx);
+ break;
+ case TR_EL2:
+ pp = perm_idx(vcpu, PIR_EL2, idx);
+ up = 0;
+ break;
+ }
+
+ set_perms(priv, wr, pp);
+
+ if (wi->regime != TR_EL2)
+ set_perms(unpriv, wr, up);
+ else
+ set_unpriv_perms(wr, false, false, false);
+
+ wr->pov = wi->poe && !(pp & BIT(3));
+ wr->uov = wi->e0poe && !(up & BIT(3));
+
+ /* R_VFPJF */
+ if (wr->px && wr->uw) {
+ set_priv_perms(wr, false, false, false);
+ set_unpriv_perms(wr, false, false, false);
+ }
+}
+
+static void compute_s1_overlay_permissions(struct kvm_vcpu *vcpu,
+ struct s1_walk_info *wi,
+ struct s1_walk_result *wr)
+{
+ u8 idx, pov_perms, uov_perms;
+
+ idx = FIELD_GET(PTE_PO_IDX_MASK, wr->desc);
+
+ switch (wi->regime) {
+ case TR_EL10:
+ pov_perms = perm_idx(vcpu, POR_EL1, idx);
+ uov_perms = perm_idx(vcpu, POR_EL0, idx);
+ break;
+ case TR_EL20:
+ pov_perms = perm_idx(vcpu, POR_EL2, idx);
+ uov_perms = perm_idx(vcpu, POR_EL0, idx);
+ break;
+ case TR_EL2:
+ pov_perms = perm_idx(vcpu, POR_EL2, idx);
+ uov_perms = 0;
+ break;
+ }
+
+ if (pov_perms & ~POE_RWX)
+ pov_perms = POE_NONE;
+
+ if (wi->poe && wr->pov) {
+ wr->pr &= pov_perms & POE_R;
+ wr->pw &= pov_perms & POE_W;
+ wr->px &= pov_perms & POE_X;
+ }
+
+ if (uov_perms & ~POE_RWX)
+ uov_perms = POE_NONE;
+
+ if (wi->e0poe && wr->uov) {
+ wr->ur &= uov_perms & POE_R;
+ wr->uw &= uov_perms & POE_W;
+ wr->ux &= uov_perms & POE_X;
+ }
+}
+
+static void compute_s1_permissions(struct kvm_vcpu *vcpu,
+ struct s1_walk_info *wi,
+ struct s1_walk_result *wr)
+{
+ bool pan;
+
+ if (!s1pie_enabled(vcpu, wi->regime))
+ compute_s1_direct_permissions(vcpu, wi, wr);
+ else
+ compute_s1_indirect_permissions(vcpu, wi, wr);
+
+ if (!wi->hpd)
+ compute_s1_hierarchical_permissions(vcpu, wi, wr);
+
+ if (wi->poe || wi->e0poe)
+ compute_s1_overlay_permissions(vcpu, wi, wr);
+
+ /* R_QXXPC */
+ if (wr->pwxn) {
+ if (!wr->pov && wr->pw)
+ wr->px = false;
+ if (wr->pov && wr->px)
+ wr->pw = false;
+ }
+
+ /* R_NPBXC */
+ if (wr->uwxn) {
+ if (!wr->uov && wr->uw)
+ wr->ux = false;
+ if (wr->uov && wr->ux)
+ wr->uw = false;
+ }
+
+ pan = wi->pan && (wr->ur || wr->uw ||
+ (pan3_enabled(vcpu, wi->regime) && wr->ux));
+ wr->pw &= !pan;
+ wr->pr &= !pan;
+}
+
+static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
+{
+ struct s1_walk_result wr = {};
+ struct s1_walk_info wi = {};
+ bool perm_fail = false;
+ int ret, idx;
+
+ ret = setup_s1_walk(vcpu, op, &wi, &wr, vaddr);
+ if (ret)
+ goto compute_par;
+
+ if (wr.level == S1_MMU_DISABLED)
+ goto compute_par;
+
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+ ret = walk_s1(vcpu, &wi, &wr, vaddr);
+
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+ if (ret)
+ goto compute_par;
+
+ compute_s1_permissions(vcpu, &wi, &wr);
+
+ switch (op) {
+ case OP_AT_S1E1RP:
+ case OP_AT_S1E1R:
+ case OP_AT_S1E2R:
+ perm_fail = !wr.pr;
+ break;
+ case OP_AT_S1E1WP:
+ case OP_AT_S1E1W:
+ case OP_AT_S1E2W:
+ perm_fail = !wr.pw;
+ break;
+ case OP_AT_S1E0R:
+ perm_fail = !wr.ur;
+ break;
+ case OP_AT_S1E0W:
+ perm_fail = !wr.uw;
+ break;
+ case OP_AT_S1E1A:
+ case OP_AT_S1E2A:
+ break;
+ default:
+ BUG();
+ }
+
+ if (perm_fail)
+ fail_s1_walk(&wr, ESR_ELx_FSC_PERM_L(wr.level), false, false);
+
+compute_par:
+ return compute_par_s1(vcpu, &wr, wi.regime);
+}
+
+/*
+ * Return the PAR_EL1 value as the result of a valid translation.
+ *
+ * If the translation is unsuccessful, the value may only contain
+ * PAR_EL1.F, and cannot be taken at face value. It isn't an
+ * indication of the translation having failed, only that the fast
+ * path did not succeed, *unless* it indicates an S1 permission fault.
+ */
+static u64 __kvm_at_s1e01_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
+{
+ struct mmu_config config;
+ struct kvm_s2_mmu *mmu;
+ bool fail;
+ u64 par;
+
+ par = SYS_PAR_EL1_F;
+
+ /*
+ * We've trapped, so everything is live on the CPU. As we will
+ * be switching contexts behind everybody's back, disable
+ * interrupts while holding the mmu lock.
+ */
+ guard(write_lock_irqsave)(&vcpu->kvm->mmu_lock);
+
+ /*
+ * If HCR_EL2.{E2H,TGE} == {1,1}, the MMU context is already
+ * the right one (as we trapped from vEL2). If not, save the
+ * full MMU context.
+ */
+ if (vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu))
+ goto skip_mmu_switch;
+
+ /*
+ * Obtaining the S2 MMU for an L2 is horribly racy, and we may not
+ * find it (recycled by another vcpu, for example). When this
+ * happens, admit defeat immediately and use the SW (slow) path.
+ */
+ mmu = lookup_s2_mmu(vcpu);
+ if (!mmu)
+ return par;
+
+ __mmu_config_save(&config);
+
+ write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR0_EL1), SYS_TTBR0);
+ write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR1_EL1), SYS_TTBR1);
+ write_sysreg_el1(vcpu_read_sys_reg(vcpu, TCR_EL1), SYS_TCR);
+ write_sysreg_el1(vcpu_read_sys_reg(vcpu, MAIR_EL1), SYS_MAIR);
+ if (kvm_has_tcr2(vcpu->kvm)) {
+ write_sysreg_el1(vcpu_read_sys_reg(vcpu, TCR2_EL1), SYS_TCR2);
+ if (kvm_has_s1pie(vcpu->kvm)) {
+ write_sysreg_el1(vcpu_read_sys_reg(vcpu, PIR_EL1), SYS_PIR);
+ write_sysreg_el1(vcpu_read_sys_reg(vcpu, PIRE0_EL1), SYS_PIRE0);
+ }
+ if (kvm_has_s1poe(vcpu->kvm)) {
+ write_sysreg_el1(vcpu_read_sys_reg(vcpu, POR_EL1), SYS_POR);
+ write_sysreg_s(vcpu_read_sys_reg(vcpu, POR_EL0), SYS_POR_EL0);
+ }
+ }
+ write_sysreg_el1(vcpu_read_sys_reg(vcpu, SCTLR_EL1), SYS_SCTLR);
+ __load_stage2(mmu, mmu->arch);
+
+skip_mmu_switch:
+ /* Clear TGE, enable S2 translation, we're rolling */
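+ /*
+ * Use the live HCR_EL2 value here: 'config' is only populated when the
+ * skip_mmu_switch shortcut above was not taken.
+ */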
+ write_sysreg((read_sysreg(hcr_el2) & ~HCR_TGE) | HCR_VM, hcr_el2);
+ isb();
+
+ switch (op) {
+ case OP_AT_S1E1RP:
+ case OP_AT_S1E1WP:
+ fail = at_s1e1p_fast(vcpu, op, vaddr);
+ break;
+ case OP_AT_S1E1R:
+ fail = __kvm_at(OP_AT_S1E1R, vaddr);
+ break;
+ case OP_AT_S1E1W:
+ fail = __kvm_at(OP_AT_S1E1W, vaddr);
+ break;
+ case OP_AT_S1E0R:
+ fail = __kvm_at(OP_AT_S1E0R, vaddr);
+ break;
+ case OP_AT_S1E0W:
+ fail = __kvm_at(OP_AT_S1E0W, vaddr);
+ break;
+ case OP_AT_S1E1A:
+ fail = __kvm_at(OP_AT_S1E1A, vaddr);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ fail = true;
+ break;
+ }
+
+ if (!fail)
+ par = read_sysreg_par();
+
+ if (!(vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)))
+ __mmu_config_restore(&config);
+
+ return par;
+}
+
+static bool par_check_s1_perm_fault(u64 par)
+{
+ u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par);
+
+ return ((fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_PERM &&
+ !(par & SYS_PAR_EL1_S));
+}
+
+void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
+{
+ u64 par = __kvm_at_s1e01_fast(vcpu, op, vaddr);
+
+ /*
+ * If PAR_EL1 reports that AT failed on an S1 permission fault, we
+ * know for sure that the PTW was able to walk the S1 tables and
+ * there's nothing else to do.
+ *
+ * If AT failed for any other reason, then we must walk the guest S1
+ * to emulate the instruction.
+ */
+ if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par))
+ par = handle_at_slow(vcpu, op, vaddr);
+
+ vcpu_write_sys_reg(vcpu, par, PAR_EL1);
+}
+
+void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
+{
+ u64 par;
+
+ /*
+ * We've trapped, so everything is live on the CPU. As we will be
+ * switching context behind everybody's back, disable interrupts...
+ */
+ scoped_guard(write_lock_irqsave, &vcpu->kvm->mmu_lock) {
+ u64 val, hcr;
+ bool fail;
+
+ val = hcr = read_sysreg(hcr_el2);
+ val &= ~HCR_TGE;
+ val |= HCR_VM;
+
+ if (!vcpu_el2_e2h_is_set(vcpu))
+ val |= HCR_NV | HCR_NV1;
+
+ write_sysreg(val, hcr_el2);
+ isb();
+
+ par = SYS_PAR_EL1_F;
+
+ switch (op) {
+ case OP_AT_S1E2R:
+ fail = __kvm_at(OP_AT_S1E1R, vaddr);
+ break;
+ case OP_AT_S1E2W:
+ fail = __kvm_at(OP_AT_S1E1W, vaddr);
+ break;
+ case OP_AT_S1E2A:
+ fail = __kvm_at(OP_AT_S1E1A, vaddr);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ fail = true;
+ }
+
+ isb();
+
+ if (!fail)
+ par = read_sysreg_par();
+
+ write_sysreg(hcr, hcr_el2);
+ isb();
+ }
+
+ /* We failed the translation, let's replay it in slow motion */
+ if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par))
+ par = handle_at_slow(vcpu, op, vaddr);
+
+ vcpu_write_sys_reg(vcpu, par, PAR_EL1);
+}
+
+void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
+{
+ struct kvm_s2_trans out = {};
+ u64 ipa, par;
+ bool write;
+ int ret;
+
+ /* Do the stage-1 translation */
+ switch (op) {
+ case OP_AT_S12E1R:
+ op = OP_AT_S1E1R;
+ write = false;
+ break;
+ case OP_AT_S12E1W:
+ op = OP_AT_S1E1W;
+ write = true;
+ break;
+ case OP_AT_S12E0R:
+ op = OP_AT_S1E0R;
+ write = false;
+ break;
+ case OP_AT_S12E0W:
+ op = OP_AT_S1E0W;
+ write = true;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return;
+ }
+
+ __kvm_at_s1e01(vcpu, op, vaddr);
+ par = vcpu_read_sys_reg(vcpu, PAR_EL1);
+ if (par & SYS_PAR_EL1_F)
+ return;
+
+ /*
+ * If we only have a single stage of translation (E2H=0 or
+ * TGE=1), exit early. Same thing if {VM,DC}=={0,0}.
+ */
+ if (!vcpu_el2_e2h_is_set(vcpu) || vcpu_el2_tge_is_set(vcpu) ||
+ !(vcpu_read_sys_reg(vcpu, HCR_EL2) & (HCR_VM | HCR_DC)))
+ return;
+
+ /* Do the stage-2 translation */
+ ipa = (par & GENMASK_ULL(47, 12)) | (vaddr & GENMASK_ULL(11, 0));
+ out.esr = 0;
+ ret = kvm_walk_nested_s2(vcpu, ipa, &out);
+ if (ret < 0)
+ return;
+
+ /* Check the access permission */
+ if (!out.esr &&
+ ((!write && !out.readable) || (write && !out.writable)))
+ out.esr = ESR_ELx_FSC_PERM_L(out.level & 0x3);
+
+ par = compute_par_s12(vcpu, par, &out);
+ vcpu_write_sys_reg(vcpu, par, PAR_EL1);
+}
diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c
index ce8886122ed3..0e4c805e7e89 100644
--- a/arch/arm64/kvm/debug.c
+++ b/arch/arm64/kvm/debug.c
@@ -3,7 +3,8 @@
* Debug and Guest Debug support
*
* Copyright (C) 2015 - Linaro Ltd
- * Author: Alex Bennée <alex.bennee@linaro.org>
+ * Authors: Alex Bennée <alex.bennee@linaro.org>
+ * Oliver Upton <oliver.upton@linux.dev>
*/
#include <linux/kvm_host.h>
@@ -14,72 +15,6 @@
#include <asm/kvm_arm.h>
#include <asm/kvm_emulate.h>
-#include "trace.h"
-
-/* These are the bits of MDSCR_EL1 we may manipulate */
-#define MDSCR_EL1_DEBUG_MASK (DBG_MDSCR_SS | \
- DBG_MDSCR_KDE | \
- DBG_MDSCR_MDE)
-
-static DEFINE_PER_CPU(u64, mdcr_el2);
-
-/*
- * save/restore_guest_debug_regs
- *
- * For some debug operations we need to tweak some guest registers. As
- * a result we need to save the state of those registers before we
- * make those modifications.
- *
- * Guest access to MDSCR_EL1 is trapped by the hypervisor and handled
- * after we have restored the preserved value to the main context.
- *
- * When single-step is enabled by userspace, we tweak PSTATE.SS on every
- * guest entry. Preserve PSTATE.SS so we can restore the original value
- * for the vcpu after the single-step is disabled.
- */
-static void save_guest_debug_regs(struct kvm_vcpu *vcpu)
-{
- u64 val = vcpu_read_sys_reg(vcpu, MDSCR_EL1);
-
- vcpu->arch.guest_debug_preserved.mdscr_el1 = val;
-
- trace_kvm_arm_set_dreg32("Saved MDSCR_EL1",
- vcpu->arch.guest_debug_preserved.mdscr_el1);
-
- vcpu->arch.guest_debug_preserved.pstate_ss =
- (*vcpu_cpsr(vcpu) & DBG_SPSR_SS);
-}
-
-static void restore_guest_debug_regs(struct kvm_vcpu *vcpu)
-{
- u64 val = vcpu->arch.guest_debug_preserved.mdscr_el1;
-
- vcpu_write_sys_reg(vcpu, val, MDSCR_EL1);
-
- trace_kvm_arm_set_dreg32("Restored MDSCR_EL1",
- vcpu_read_sys_reg(vcpu, MDSCR_EL1));
-
- if (vcpu->arch.guest_debug_preserved.pstate_ss)
- *vcpu_cpsr(vcpu) |= DBG_SPSR_SS;
- else
- *vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS;
-}
-
-/**
- * kvm_arm_init_debug - grab what we need for debug
- *
- * Currently the sole task of this function is to retrieve the initial
- * value of mdcr_el2 so we can preserve MDCR_EL2.HPMN which has
- * presumably been set-up by some knowledgeable bootcode.
- *
- * It is called once per-cpu during CPU hyp initialisation.
- */
-
-void kvm_arm_init_debug(void)
-{
- __this_cpu_write(mdcr_el2, kvm_call_hyp_ret(__kvm_get_mdcr_el2));
-}
-
/**
* kvm_arm_setup_mdcr_el2 - configure vcpu mdcr_el2 value
*
@@ -95,11 +30,14 @@ void kvm_arm_init_debug(void)
*/
static void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu)
{
+ preempt_disable();
+
/*
* This also clears MDCR_EL2_E2PB_MASK and MDCR_EL2_E2TB_MASK
* to disable guest access to the profiling and trace buffers
*/
- vcpu->arch.mdcr_el2 = __this_cpu_read(mdcr_el2) & MDCR_EL2_HPMN_MASK;
+ vcpu->arch.mdcr_el2 = FIELD_PREP(MDCR_EL2_HPMN,
+ *host_data_ptr(nr_event_counters));
vcpu->arch.mdcr_el2 |= (MDCR_EL2_TPM |
MDCR_EL2_TPMS |
MDCR_EL2_TTRF |
@@ -113,233 +51,215 @@ static void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu)
vcpu->arch.mdcr_el2 |= MDCR_EL2_TDE;
/*
- * Trap debug register access when one of the following is true:
- * - Userspace is using the hardware to debug the guest
- * (KVM_GUESTDBG_USE_HW is set).
- * - The guest is not using debug (DEBUG_DIRTY clear).
- * - The guest has enabled the OS Lock (debug exceptions are blocked).
+ * Trap debug registers if the guest doesn't have ownership of them.
*/
- if ((vcpu->guest_debug & KVM_GUESTDBG_USE_HW) ||
- !vcpu_get_flag(vcpu, DEBUG_DIRTY) ||
- kvm_vcpu_os_lock_enabled(vcpu))
+ if (!kvm_guest_owns_debug_regs(vcpu))
vcpu->arch.mdcr_el2 |= MDCR_EL2_TDA;
- trace_kvm_arm_set_dreg32("MDCR_EL2", vcpu->arch.mdcr_el2);
-}
+ /* Write MDCR_EL2 directly if we're already at EL2 */
+ if (has_vhe())
+ write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2);
-/**
- * kvm_arm_vcpu_init_debug - setup vcpu debug traps
- *
- * @vcpu: the vcpu pointer
- *
- * Set vcpu initial mdcr_el2 value.
- */
-void kvm_arm_vcpu_init_debug(struct kvm_vcpu *vcpu)
-{
- preempt_disable();
- kvm_arm_setup_mdcr_el2(vcpu);
preempt_enable();
}
-/**
- * kvm_arm_reset_debug_ptr - reset the debug ptr to point to the vcpu state
- * @vcpu: the vcpu pointer
- */
-
-void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu)
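+/*
+ * Record the host's debug capabilities (number of PMU event counters,
+ * breakpoint/watchpoint counts, SPE/TRBE availability) in the per-CPU
+ * host data so they can be consulted without re-reading the ID registers.
+ */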
+void kvm_init_host_debug_data(void)
{
- vcpu->arch.debug_ptr = &vcpu->arch.vcpu_debug_state;
+ u64 dfr0 = read_sysreg(id_aa64dfr0_el1);
+
+ if (cpuid_feature_extract_signed_field(dfr0, ID_AA64DFR0_EL1_PMUVer_SHIFT) > 0)
+ *host_data_ptr(nr_event_counters) = FIELD_GET(ARMV8_PMU_PMCR_N,
+ read_sysreg(pmcr_el0));
+
+ *host_data_ptr(debug_brps) = SYS_FIELD_GET(ID_AA64DFR0_EL1, BRPs, dfr0);
+ *host_data_ptr(debug_wrps) = SYS_FIELD_GET(ID_AA64DFR0_EL1, WRPs, dfr0);
+
+ if (has_vhe())
+ return;
+
+ if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_PMSVer_SHIFT) &&
+ !(read_sysreg_s(SYS_PMBIDR_EL1) & PMBIDR_EL1_P))
+ host_data_set_flag(HAS_SPE);
+
+ if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_TraceFilt_SHIFT)) {
+ /* Force disable trace in protected mode in case of no TRBE */
+ if (is_protected_kvm_enabled())
+ host_data_set_flag(EL1_TRACING_CONFIGURED);
+
+ if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_TraceBuffer_SHIFT) &&
+ !(read_sysreg_s(SYS_TRBIDR_EL1) & TRBIDR_EL1_P))
+ host_data_set_flag(HAS_TRBE);
+ }
}
-/**
- * kvm_arm_setup_debug - set up debug related stuff
+/*
+ * Configures the 'external' MDSCR_EL1 value for the guest, i.e. when the host
+ * has taken over MDSCR_EL1.
*
- * @vcpu: the vcpu pointer
+ * - Userspace is single-stepping the guest, and MDSCR_EL1.SS is forced to 1.
*
- * This is called before each entry into the hypervisor to setup any
- * debug related registers.
+ * - Userspace is using the breakpoint/watchpoint registers to debug the
+ * guest, and MDSCR_EL1.MDE is forced to 1.
*
- * Additionally, KVM only traps guest accesses to the debug registers if
- * the guest is not actively using them (see the DEBUG_DIRTY
- * flag on vcpu->arch.iflags). Since the guest must not interfere
- * with the hardware state when debugging the guest, we must ensure that
- * trapping is enabled whenever we are debugging the guest using the
- * debug registers.
+ * - The guest has enabled the OS Lock, and KVM is forcing MDSCR_EL1.MDE to 0,
+ * masking all debug exceptions affected by the OS Lock.
*/
-
-void kvm_arm_setup_debug(struct kvm_vcpu *vcpu)
+static void setup_external_mdscr(struct kvm_vcpu *vcpu)
{
- unsigned long mdscr, orig_mdcr_el2 = vcpu->arch.mdcr_el2;
+ /*
+ * Use the guest's MDSCR_EL1 as a starting point, since there are
+ * several other features controlled by MDSCR_EL1 that are not relevant
+ * to the host.
+ *
+ * Clear the bits that KVM may use, which also satisfies emulation of
+ * the OS Lock as MDSCR_EL1.MDE is cleared.
+ */
+ u64 mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1) & ~(MDSCR_EL1_SS |
+ MDSCR_EL1_MDE |
+ MDSCR_EL1_KDE);
- trace_kvm_arm_setup_debug(vcpu, vcpu->guest_debug);
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+ mdscr |= MDSCR_EL1_SS;
- kvm_arm_setup_mdcr_el2(vcpu);
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW)
+ mdscr |= MDSCR_EL1_MDE | MDSCR_EL1_KDE;
- /* Check if we need to use the debug registers. */
+ vcpu->arch.external_mdscr_el1 = mdscr;
+}
+
+void kvm_vcpu_load_debug(struct kvm_vcpu *vcpu)
+{
+ u64 mdscr;
+
+ /* Must be called before kvm_vcpu_load_vhe() */
+ KVM_BUG_ON(vcpu_get_flag(vcpu, SYSREGS_ON_CPU), vcpu->kvm);
+
+ /*
+ * Determine which of the possible debug states we're in:
+ *
+ * - VCPU_DEBUG_HOST_OWNED: KVM has taken ownership of the guest's
+ * breakpoint/watchpoint registers, or needs to use MDSCR_EL1 to do
+ * software step or emulate the effects of the OS Lock being enabled.
+ *
+ * - VCPU_DEBUG_GUEST_OWNED: The guest has debug exceptions enabled, and
+ * the breakpoint/watchpoint registers need to be loaded eagerly.
+ *
+ * - VCPU_DEBUG_FREE: Neither of the above apply, no breakpoint/watchpoint
+ * context needs to be loaded on the CPU.
+ */
if (vcpu->guest_debug || kvm_vcpu_os_lock_enabled(vcpu)) {
- /* Save guest debug state */
- save_guest_debug_regs(vcpu);
+ vcpu->arch.debug_owner = VCPU_DEBUG_HOST_OWNED;
+ setup_external_mdscr(vcpu);
/*
- * Single Step (ARM ARM D2.12.3 The software step state
- * machine)
- *
- * If we are doing Single Step we need to manipulate
- * the guest's MDSCR_EL1.SS and PSTATE.SS. Once the
- * step has occurred the hypervisor will trap the
- * debug exception and we return to userspace.
- *
- * If the guest attempts to single step its userspace
- * we would have to deal with a trapped exception
- * while in the guest kernel. Because this would be
- * hard to unwind we suppress the guest's ability to
- * do so by masking MDSCR_EL.SS.
- *
- * This confuses guest debuggers which use
- * single-step behind the scenes but everything
- * returns to normal once the host is no longer
- * debugging the system.
+ * Steal the guest's single-step state machine if userspace wants to
+ * single-step the guest.
*/
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
- /*
- * If the software step state at the last guest exit
- * was Active-pending, we don't set DBG_SPSR_SS so
- * that the state is maintained (to not run another
- * single-step until the pending Software Step
- * exception is taken).
- */
- if (!vcpu_get_flag(vcpu, DBG_SS_ACTIVE_PENDING))
+ if (*vcpu_cpsr(vcpu) & DBG_SPSR_SS)
+ vcpu_clear_flag(vcpu, GUEST_SS_ACTIVE_PENDING);
+ else
+ vcpu_set_flag(vcpu, GUEST_SS_ACTIVE_PENDING);
+
+ if (!vcpu_get_flag(vcpu, HOST_SS_ACTIVE_PENDING))
*vcpu_cpsr(vcpu) |= DBG_SPSR_SS;
else
*vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS;
-
- mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1);
- mdscr |= DBG_MDSCR_SS;
- vcpu_write_sys_reg(vcpu, mdscr, MDSCR_EL1);
- } else {
- mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1);
- mdscr &= ~DBG_MDSCR_SS;
- vcpu_write_sys_reg(vcpu, mdscr, MDSCR_EL1);
}
+ } else {
+ mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1);
- trace_kvm_arm_set_dreg32("SPSR_EL2", *vcpu_cpsr(vcpu));
-
- /*
- * HW Breakpoints and watchpoints
- *
- * We simply switch the debug_ptr to point to our new
- * external_debug_state which has been populated by the
- * debug ioctl. The existing DEBUG_DIRTY mechanism ensures
- * the registers are updated on the world switch.
- */
- if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) {
- /* Enable breakpoints/watchpoints */
- mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1);
- mdscr |= DBG_MDSCR_MDE;
- vcpu_write_sys_reg(vcpu, mdscr, MDSCR_EL1);
-
- vcpu->arch.debug_ptr = &vcpu->arch.external_debug_state;
- vcpu_set_flag(vcpu, DEBUG_DIRTY);
-
- trace_kvm_arm_set_regset("BKPTS", get_num_brps(),
- &vcpu->arch.debug_ptr->dbg_bcr[0],
- &vcpu->arch.debug_ptr->dbg_bvr[0]);
-
- trace_kvm_arm_set_regset("WAPTS", get_num_wrps(),
- &vcpu->arch.debug_ptr->dbg_wcr[0],
- &vcpu->arch.debug_ptr->dbg_wvr[0]);
-
- /*
- * The OS Lock blocks debug exceptions in all ELs when it is
- * enabled. If the guest has enabled the OS Lock, constrain its
- * effects to the guest. Emulate the behavior by clearing
- * MDSCR_EL1.MDE. In so doing, we ensure that host debug
- * exceptions are unaffected by guest configuration of the OS
- * Lock.
- */
- } else if (kvm_vcpu_os_lock_enabled(vcpu)) {
- mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1);
- mdscr &= ~DBG_MDSCR_MDE;
- vcpu_write_sys_reg(vcpu, mdscr, MDSCR_EL1);
- }
+ if (mdscr & (MDSCR_EL1_KDE | MDSCR_EL1_MDE))
+ vcpu->arch.debug_owner = VCPU_DEBUG_GUEST_OWNED;
+ else
+ vcpu->arch.debug_owner = VCPU_DEBUG_FREE;
}
- BUG_ON(!vcpu->guest_debug &&
- vcpu->arch.debug_ptr != &vcpu->arch.vcpu_debug_state);
-
- /* If KDE or MDE are set, perform a full save/restore cycle. */
- if (vcpu_read_sys_reg(vcpu, MDSCR_EL1) & (DBG_MDSCR_KDE | DBG_MDSCR_MDE))
- vcpu_set_flag(vcpu, DEBUG_DIRTY);
-
- /* Write mdcr_el2 changes since vcpu_load on VHE systems */
- if (has_vhe() && orig_mdcr_el2 != vcpu->arch.mdcr_el2)
- write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2);
-
- trace_kvm_arm_set_dreg32("MDSCR_EL1", vcpu_read_sys_reg(vcpu, MDSCR_EL1));
+ kvm_arm_setup_mdcr_el2(vcpu);
}
-void kvm_arm_clear_debug(struct kvm_vcpu *vcpu)
+void kvm_vcpu_put_debug(struct kvm_vcpu *vcpu)
{
- trace_kvm_arm_clear_debug(vcpu->guest_debug);
+ if (likely(!(vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
+ return;
/*
- * Restore the guest's debug registers if we were using them.
+ * Save the host's software step state and restore the guest's before
+ * potentially returning to userspace.
*/
- if (vcpu->guest_debug || kvm_vcpu_os_lock_enabled(vcpu)) {
- if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
- if (!(*vcpu_cpsr(vcpu) & DBG_SPSR_SS))
- /*
- * Mark the vcpu as ACTIVE_PENDING
- * until Software Step exception is taken.
- */
- vcpu_set_flag(vcpu, DBG_SS_ACTIVE_PENDING);
- }
-
- restore_guest_debug_regs(vcpu);
+ if (!(*vcpu_cpsr(vcpu) & DBG_SPSR_SS))
+ vcpu_set_flag(vcpu, HOST_SS_ACTIVE_PENDING);
+ else
+ vcpu_clear_flag(vcpu, HOST_SS_ACTIVE_PENDING);
- /*
- * If we were using HW debug we need to restore the
- * debug_ptr to the guest debug state.
- */
- if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) {
- kvm_arm_reset_debug_ptr(vcpu);
+ if (vcpu_get_flag(vcpu, GUEST_SS_ACTIVE_PENDING))
+ *vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS;
+ else
+ *vcpu_cpsr(vcpu) |= DBG_SPSR_SS;
+}
- trace_kvm_arm_set_regset("BKPTS", get_num_brps(),
- &vcpu->arch.debug_ptr->dbg_bcr[0],
- &vcpu->arch.debug_ptr->dbg_bvr[0]);
+/*
+ * Updates ownership of the debug registers after a trapped guest access to a
+ * breakpoint/watchpoint register. Host ownership of the debug registers is of
+ * strictly higher priority, and it is the responsibility of the VMM to emulate
+ * guest debug exceptions in this configuration.
+ */
+void kvm_debug_set_guest_ownership(struct kvm_vcpu *vcpu)
+{
+ if (kvm_host_owns_debug_regs(vcpu))
+ return;
- trace_kvm_arm_set_regset("WAPTS", get_num_wrps(),
- &vcpu->arch.debug_ptr->dbg_wcr[0],
- &vcpu->arch.debug_ptr->dbg_wvr[0]);
- }
- }
+ vcpu->arch.debug_owner = VCPU_DEBUG_GUEST_OWNED;
+ kvm_arm_setup_mdcr_el2(vcpu);
}
-void kvm_arch_vcpu_load_debug_state_flags(struct kvm_vcpu *vcpu)
+void kvm_debug_handle_oslar(struct kvm_vcpu *vcpu, u64 val)
{
- u64 dfr0;
+ if (val & OSLAR_EL1_OSLK)
+ __vcpu_sys_reg(vcpu, OSLSR_EL1) |= OSLSR_EL1_OSLK;
+ else
+ __vcpu_sys_reg(vcpu, OSLSR_EL1) &= ~OSLSR_EL1_OSLK;
- /* For VHE, there is nothing to do */
- if (has_vhe())
+ preempt_disable();
+ kvm_arch_vcpu_put(vcpu);
+ kvm_arch_vcpu_load(vcpu, smp_processor_id());
+ preempt_enable();
+}
+
+void kvm_enable_trbe(void)
+{
+ if (has_vhe() || is_protected_kvm_enabled() ||
+ WARN_ON_ONCE(preemptible()))
return;
- dfr0 = read_sysreg(id_aa64dfr0_el1);
- /*
- * If SPE is present on this CPU and is available at current EL,
- * we may need to check if the host state needs to be saved.
- */
- if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_PMSVer_SHIFT) &&
- !(read_sysreg_s(SYS_PMBIDR_EL1) & BIT(PMBIDR_EL1_P_SHIFT)))
- vcpu_set_flag(vcpu, DEBUG_STATE_SAVE_SPE);
+ host_data_set_flag(TRBE_ENABLED);
+}
+EXPORT_SYMBOL_GPL(kvm_enable_trbe);
+
+void kvm_disable_trbe(void)
+{
+ if (has_vhe() || is_protected_kvm_enabled() ||
+ WARN_ON_ONCE(preemptible()))
+ return;
- /* Check if we have TRBE implemented and available at the host */
- if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_TraceBuffer_SHIFT) &&
- !(read_sysreg_s(SYS_TRBIDR_EL1) & TRBIDR_EL1_P))
- vcpu_set_flag(vcpu, DEBUG_STATE_SAVE_TRBE);
+ host_data_clear_flag(TRBE_ENABLED);
}
+EXPORT_SYMBOL_GPL(kvm_disable_trbe);
-void kvm_arch_vcpu_put_debug_state_flags(struct kvm_vcpu *vcpu)
+void kvm_tracing_set_el1_configuration(u64 trfcr_while_in_guest)
{
- vcpu_clear_flag(vcpu, DEBUG_STATE_SAVE_SPE);
- vcpu_clear_flag(vcpu, DEBUG_STATE_SAVE_TRBE);
+ if (is_protected_kvm_enabled() || WARN_ON_ONCE(preemptible()))
+ return;
+
+ if (has_vhe()) {
+ write_sysreg_s(trfcr_while_in_guest, SYS_TRFCR_EL12);
+ return;
+ }
+
+ *host_data_ptr(trfcr_while_in_guest) = trfcr_while_in_guest;
+ if (read_sysreg_s(SYS_TRFCR_EL1) != trfcr_while_in_guest)
+ host_data_set_flag(EL1_TRACING_CONFIGURED);
+ else
+ host_data_clear_flag(EL1_TRACING_CONFIGURED);
}
+EXPORT_SYMBOL_GPL(kvm_tracing_set_el1_configuration);
diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c
index 4697ba41b3a9..0fcfcc0478f9 100644
--- a/arch/arm64/kvm/emulate-nested.c
+++ b/arch/arm64/kvm/emulate-nested.c
@@ -16,9 +16,13 @@
enum trap_behaviour {
BEHAVE_HANDLE_LOCALLY = 0,
+
BEHAVE_FORWARD_READ = BIT(0),
BEHAVE_FORWARD_WRITE = BIT(1),
- BEHAVE_FORWARD_ANY = BEHAVE_FORWARD_READ | BEHAVE_FORWARD_WRITE,
+ BEHAVE_FORWARD_RW = BEHAVE_FORWARD_READ | BEHAVE_FORWARD_WRITE,
+
+ /* Traps that take effect in Host EL0; this is rare! */
+ BEHAVE_FORWARD_IN_HOST_EL0 = BIT(2),
};
struct trap_bits {
@@ -79,25 +83,43 @@ enum cgt_group_id {
CGT_MDCR_E2TB,
CGT_MDCR_TDCC,
+ CGT_CPTR_TAM,
+ CGT_CPTR_TCPAC,
+
+ CGT_HCRX_EnFPM,
+ CGT_HCRX_TCR2En,
+
+ CGT_CNTHCTL_EL1TVT,
+ CGT_CNTHCTL_EL1TVCT,
+
+ CGT_ICH_HCR_TC,
+ CGT_ICH_HCR_TALL0,
+ CGT_ICH_HCR_TALL1,
+ CGT_ICH_HCR_TDIR,
+
/*
* Anything after this point is a combination of coarse trap
* controls, which must all be evaluated to decide what to do.
*/
__MULTIPLE_CONTROL_BITS__,
- CGT_HCR_IMO_FMO = __MULTIPLE_CONTROL_BITS__,
+ CGT_HCR_IMO_FMO_ICH_HCR_TC = __MULTIPLE_CONTROL_BITS__,
CGT_HCR_TID2_TID4,
CGT_HCR_TTLB_TTLBIS,
CGT_HCR_TTLB_TTLBOS,
CGT_HCR_TVM_TRVM,
+ CGT_HCR_TVM_TRVM_HCRX_TCR2En,
CGT_HCR_TPU_TICAB,
CGT_HCR_TPU_TOCU,
CGT_HCR_NV1_nNV2_ENSCXT,
CGT_MDCR_TPM_TPMCR,
+ CGT_MDCR_TPM_HPMN,
CGT_MDCR_TDE_TDA,
CGT_MDCR_TDE_TDOSA,
CGT_MDCR_TDE_TDRA,
CGT_MDCR_TDCC_TDE_TDA,
+ CGT_ICH_HCR_TC_TDIR,
+
/*
* Anything after this point requires a callback evaluating a
* complex trap condition. Ugly stuff.
@@ -105,6 +127,11 @@ enum cgt_group_id {
__COMPLEX_CONDITIONS__,
CGT_CNTHCTL_EL1PCTEN = __COMPLEX_CONDITIONS__,
CGT_CNTHCTL_EL1PTEN,
+ CGT_CNTHCTL_EL1NVPCT,
+ CGT_CNTHCTL_EL1NVVCT,
+
+ CGT_CPTR_TTA,
+ CGT_MDCR_HPMN,
/* Must be last */
__NR_CGT_GROUP_IDS__
@@ -121,7 +148,7 @@ static const struct trap_bits coarse_trap_bits[] = {
.index = HCR_EL2,
.value = HCR_TID2,
.mask = HCR_TID2,
- .behaviour = BEHAVE_FORWARD_ANY,
+ .behaviour = BEHAVE_FORWARD_RW,
},
[CGT_HCR_TID3] = {
.index = HCR_EL2,
@@ -145,37 +172,37 @@ static const struct trap_bits coarse_trap_bits[] = {
.index = HCR_EL2,
.value = HCR_TIDCP,
.mask = HCR_TIDCP,
- .behaviour = BEHAVE_FORWARD_ANY,
+ .behaviour = BEHAVE_FORWARD_RW,
},
[CGT_HCR_TACR] = {
.index = HCR_EL2,
.value = HCR_TACR,
.mask = HCR_TACR,
- .behaviour = BEHAVE_FORWARD_ANY,
+ .behaviour = BEHAVE_FORWARD_RW,
},
[CGT_HCR_TSW] = {
.index = HCR_EL2,
.value = HCR_TSW,
.mask = HCR_TSW,
- .behaviour = BEHAVE_FORWARD_ANY,
+ .behaviour = BEHAVE_FORWARD_RW,
},
[CGT_HCR_TPC] = { /* Also called TCPC when FEAT_DPB is implemented */
.index = HCR_EL2,
.value = HCR_TPC,
.mask = HCR_TPC,
- .behaviour = BEHAVE_FORWARD_ANY,
+ .behaviour = BEHAVE_FORWARD_RW,
},
[CGT_HCR_TPU] = {
.index = HCR_EL2,
.value = HCR_TPU,
.mask = HCR_TPU,
- .behaviour = BEHAVE_FORWARD_ANY,
+ .behaviour = BEHAVE_FORWARD_RW,
},
[CGT_HCR_TTLB] = {
.index = HCR_EL2,
.value = HCR_TTLB,
.mask = HCR_TTLB,
- .behaviour = BEHAVE_FORWARD_ANY,
+ .behaviour = BEHAVE_FORWARD_RW,
},
[CGT_HCR_TVM] = {
.index = HCR_EL2,
@@ -187,7 +214,7 @@ static const struct trap_bits coarse_trap_bits[] = {
.index = HCR_EL2,
.value = HCR_TDZ,
.mask = HCR_TDZ,
- .behaviour = BEHAVE_FORWARD_ANY,
+ .behaviour = BEHAVE_FORWARD_RW,
},
[CGT_HCR_TRVM] = {
.index = HCR_EL2,
@@ -199,151 +226,213 @@ static const struct trap_bits coarse_trap_bits[] = {
.index = HCR_EL2,
.value = HCR_TLOR,
.mask = HCR_TLOR,
- .behaviour = BEHAVE_FORWARD_ANY,
+ .behaviour = BEHAVE_FORWARD_RW,
},
[CGT_HCR_TERR] = {
.index = HCR_EL2,
.value = HCR_TERR,
.mask = HCR_TERR,
- .behaviour = BEHAVE_FORWARD_ANY,
+ .behaviour = BEHAVE_FORWARD_RW,
},
[CGT_HCR_APK] = {
.index = HCR_EL2,
.value = 0,
.mask = HCR_APK,
- .behaviour = BEHAVE_FORWARD_ANY,
+ .behaviour = BEHAVE_FORWARD_RW,
},
[CGT_HCR_NV] = {
.index = HCR_EL2,
.value = HCR_NV,
.mask = HCR_NV,
- .behaviour = BEHAVE_FORWARD_ANY,
+ .behaviour = BEHAVE_FORWARD_RW,
},
[CGT_HCR_NV_nNV2] = {
.index = HCR_EL2,
.value = HCR_NV,
.mask = HCR_NV | HCR_NV2,
- .behaviour = BEHAVE_FORWARD_ANY,
+ .behaviour = BEHAVE_FORWARD_RW,
},
[CGT_HCR_NV1_nNV2] = {
.index = HCR_EL2,
.value = HCR_NV | HCR_NV1,
.mask = HCR_NV | HCR_NV1 | HCR_NV2,
- .behaviour = BEHAVE_FORWARD_ANY,
+ .behaviour = BEHAVE_FORWARD_RW,
},
[CGT_HCR_AT] = {
.index = HCR_EL2,
.value = HCR_AT,
.mask = HCR_AT,
- .behaviour = BEHAVE_FORWARD_ANY,
+ .behaviour = BEHAVE_FORWARD_RW,
},
[CGT_HCR_nFIEN] = {
.index = HCR_EL2,
.value = 0,
.mask = HCR_FIEN,
- .behaviour = BEHAVE_FORWARD_ANY,
+ .behaviour = BEHAVE_FORWARD_RW,
},
[CGT_HCR_TID4] = {
.index = HCR_EL2,
.value = HCR_TID4,
.mask = HCR_TID4,
- .behaviour = BEHAVE_FORWARD_ANY,
+ .behaviour = BEHAVE_FORWARD_RW,
},
[CGT_HCR_TICAB] = {
.index = HCR_EL2,
.value = HCR_TICAB,
.mask = HCR_TICAB,
- .behaviour = BEHAVE_FORWARD_ANY,
+ .behaviour = BEHAVE_FORWARD_RW,
},
[CGT_HCR_TOCU] = {
.index = HCR_EL2,
.value = HCR_TOCU,
.mask = HCR_TOCU,
- .behaviour = BEHAVE_FORWARD_ANY,
+ .behaviour = BEHAVE_FORWARD_RW,
},
[CGT_HCR_ENSCXT] = {
.index = HCR_EL2,
.value = 0,
.mask = HCR_ENSCXT,
- .behaviour = BEHAVE_FORWARD_ANY,
+ .behaviour = BEHAVE_FORWARD_RW,
},
[CGT_HCR_TTLBIS] = {
.index = HCR_EL2,
.value = HCR_TTLBIS,
.mask = HCR_TTLBIS,
- .behaviour = BEHAVE_FORWARD_ANY,
+ .behaviour = BEHAVE_FORWARD_RW,
},
[CGT_HCR_TTLBOS] = {
.index = HCR_EL2,
.value = HCR_TTLBOS,
.mask = HCR_TTLBOS,
- .behaviour = BEHAVE_FORWARD_ANY,
+ .behaviour = BEHAVE_FORWARD_RW,
},
[CGT_MDCR_TPMCR] = {
.index = MDCR_EL2,
.value = MDCR_EL2_TPMCR,
.mask = MDCR_EL2_TPMCR,
- .behaviour = BEHAVE_FORWARD_ANY,
+ .behaviour = BEHAVE_FORWARD_RW |
+ BEHAVE_FORWARD_IN_HOST_EL0,
},
[CGT_MDCR_TPM] = {
.index = MDCR_EL2,
.value = MDCR_EL2_TPM,
.mask = MDCR_EL2_TPM,
- .behaviour = BEHAVE_FORWARD_ANY,
+ .behaviour = BEHAVE_FORWARD_RW |
+ BEHAVE_FORWARD_IN_HOST_EL0,
},
[CGT_MDCR_TDE] = {
.index = MDCR_EL2,
.value = MDCR_EL2_TDE,
.mask = MDCR_EL2_TDE,
- .behaviour = BEHAVE_FORWARD_ANY,
+ .behaviour = BEHAVE_FORWARD_RW,
},
[CGT_MDCR_TDA] = {
.index = MDCR_EL2,
.value = MDCR_EL2_TDA,
.mask = MDCR_EL2_TDA,
- .behaviour = BEHAVE_FORWARD_ANY,
+ .behaviour = BEHAVE_FORWARD_RW,
},
[CGT_MDCR_TDOSA] = {
.index = MDCR_EL2,
.value = MDCR_EL2_TDOSA,
.mask = MDCR_EL2_TDOSA,
- .behaviour = BEHAVE_FORWARD_ANY,
+ .behaviour = BEHAVE_FORWARD_RW,
},
[CGT_MDCR_TDRA] = {
.index = MDCR_EL2,
.value = MDCR_EL2_TDRA,
.mask = MDCR_EL2_TDRA,
- .behaviour = BEHAVE_FORWARD_ANY,
+ .behaviour = BEHAVE_FORWARD_RW,
},
[CGT_MDCR_E2PB] = {
.index = MDCR_EL2,
.value = 0,
.mask = BIT(MDCR_EL2_E2PB_SHIFT),
- .behaviour = BEHAVE_FORWARD_ANY,
+ .behaviour = BEHAVE_FORWARD_RW,
},
[CGT_MDCR_TPMS] = {
.index = MDCR_EL2,
.value = MDCR_EL2_TPMS,
.mask = MDCR_EL2_TPMS,
- .behaviour = BEHAVE_FORWARD_ANY,
+ .behaviour = BEHAVE_FORWARD_RW,
},
[CGT_MDCR_TTRF] = {
.index = MDCR_EL2,
.value = MDCR_EL2_TTRF,
.mask = MDCR_EL2_TTRF,
- .behaviour = BEHAVE_FORWARD_ANY,
+ .behaviour = BEHAVE_FORWARD_RW,
},
[CGT_MDCR_E2TB] = {
.index = MDCR_EL2,
.value = 0,
.mask = BIT(MDCR_EL2_E2TB_SHIFT),
- .behaviour = BEHAVE_FORWARD_ANY,
+ .behaviour = BEHAVE_FORWARD_RW,
},
[CGT_MDCR_TDCC] = {
.index = MDCR_EL2,
.value = MDCR_EL2_TDCC,
.mask = MDCR_EL2_TDCC,
- .behaviour = BEHAVE_FORWARD_ANY,
+ .behaviour = BEHAVE_FORWARD_RW,
+ },
+ [CGT_CPTR_TAM] = {
+ .index = CPTR_EL2,
+ .value = CPTR_EL2_TAM,
+ .mask = CPTR_EL2_TAM,
+ .behaviour = BEHAVE_FORWARD_RW,
+ },
+ [CGT_CPTR_TCPAC] = {
+ .index = CPTR_EL2,
+ .value = CPTR_EL2_TCPAC,
+ .mask = CPTR_EL2_TCPAC,
+ .behaviour = BEHAVE_FORWARD_RW,
+ },
+ [CGT_HCRX_EnFPM] = {
+ .index = HCRX_EL2,
+ .value = 0,
+ .mask = HCRX_EL2_EnFPM,
+ .behaviour = BEHAVE_FORWARD_RW,
+ },
+ [CGT_HCRX_TCR2En] = {
+ .index = HCRX_EL2,
+ .value = 0,
+ .mask = HCRX_EL2_TCR2En,
+ .behaviour = BEHAVE_FORWARD_RW,
+ },
+ [CGT_CNTHCTL_EL1TVT] = {
+ .index = CNTHCTL_EL2,
+ .value = CNTHCTL_EL1TVT,
+ .mask = CNTHCTL_EL1TVT,
+ .behaviour = BEHAVE_FORWARD_RW,
+ },
+ [CGT_CNTHCTL_EL1TVCT] = {
+ .index = CNTHCTL_EL2,
+ .value = CNTHCTL_EL1TVCT,
+ .mask = CNTHCTL_EL1TVCT,
+ .behaviour = BEHAVE_FORWARD_READ,
+ },
+ [CGT_ICH_HCR_TC] = {
+ .index = ICH_HCR_EL2,
+ .value = ICH_HCR_EL2_TC,
+ .mask = ICH_HCR_EL2_TC,
+ .behaviour = BEHAVE_FORWARD_RW,
+ },
+ [CGT_ICH_HCR_TALL0] = {
+ .index = ICH_HCR_EL2,
+ .value = ICH_HCR_EL2_TALL0,
+ .mask = ICH_HCR_EL2_TALL0,
+ .behaviour = BEHAVE_FORWARD_RW,
+ },
+ [CGT_ICH_HCR_TALL1] = {
+ .index = ICH_HCR_EL2,
+ .value = ICH_HCR_EL2_TALL1,
+ .mask = ICH_HCR_EL2_TALL1,
+ .behaviour = BEHAVE_FORWARD_RW,
+ },
+ [CGT_ICH_HCR_TDIR] = {
+ .index = ICH_HCR_EL2,
+ .value = ICH_HCR_EL2_TDIR,
+ .mask = ICH_HCR_EL2_TDIR,
+ .behaviour = BEHAVE_FORWARD_RW,
},
};
@@ -354,19 +443,24 @@ static const struct trap_bits coarse_trap_bits[] = {
}
static const enum cgt_group_id *coarse_control_combo[] = {
- MCB(CGT_HCR_IMO_FMO, CGT_HCR_IMO, CGT_HCR_FMO),
MCB(CGT_HCR_TID2_TID4, CGT_HCR_TID2, CGT_HCR_TID4),
MCB(CGT_HCR_TTLB_TTLBIS, CGT_HCR_TTLB, CGT_HCR_TTLBIS),
MCB(CGT_HCR_TTLB_TTLBOS, CGT_HCR_TTLB, CGT_HCR_TTLBOS),
MCB(CGT_HCR_TVM_TRVM, CGT_HCR_TVM, CGT_HCR_TRVM),
+ MCB(CGT_HCR_TVM_TRVM_HCRX_TCR2En,
+ CGT_HCR_TVM, CGT_HCR_TRVM, CGT_HCRX_TCR2En),
MCB(CGT_HCR_TPU_TICAB, CGT_HCR_TPU, CGT_HCR_TICAB),
MCB(CGT_HCR_TPU_TOCU, CGT_HCR_TPU, CGT_HCR_TOCU),
MCB(CGT_HCR_NV1_nNV2_ENSCXT, CGT_HCR_NV1_nNV2, CGT_HCR_ENSCXT),
MCB(CGT_MDCR_TPM_TPMCR, CGT_MDCR_TPM, CGT_MDCR_TPMCR),
+ MCB(CGT_MDCR_TPM_HPMN, CGT_MDCR_TPM, CGT_MDCR_HPMN),
MCB(CGT_MDCR_TDE_TDA, CGT_MDCR_TDE, CGT_MDCR_TDA),
MCB(CGT_MDCR_TDE_TDOSA, CGT_MDCR_TDE, CGT_MDCR_TDOSA),
MCB(CGT_MDCR_TDE_TDRA, CGT_MDCR_TDE, CGT_MDCR_TDRA),
MCB(CGT_MDCR_TDCC_TDE_TDA, CGT_MDCR_TDCC, CGT_MDCR_TDE, CGT_MDCR_TDA),
+
+ MCB(CGT_HCR_IMO_FMO_ICH_HCR_TC, CGT_HCR_IMO, CGT_HCR_FMO, CGT_ICH_HCR_TC),
+ MCB(CGT_ICH_HCR_TC_TDIR, CGT_ICH_HCR_TC, CGT_ICH_HCR_TDIR),
};
typedef enum trap_behaviour (*complex_condition_check)(struct kvm_vcpu *);
@@ -399,7 +493,7 @@ static enum trap_behaviour check_cnthctl_el1pcten(struct kvm_vcpu *vcpu)
if (get_sanitized_cnthctl(vcpu) & (CNTHCTL_EL1PCTEN << 10))
return BEHAVE_HANDLE_LOCALLY;
- return BEHAVE_FORWARD_ANY;
+ return BEHAVE_FORWARD_RW;
}
static enum trap_behaviour check_cnthctl_el1pten(struct kvm_vcpu *vcpu)
@@ -407,7 +501,74 @@ static enum trap_behaviour check_cnthctl_el1pten(struct kvm_vcpu *vcpu)
if (get_sanitized_cnthctl(vcpu) & (CNTHCTL_EL1PCEN << 10))
return BEHAVE_HANDLE_LOCALLY;
- return BEHAVE_FORWARD_ANY;
+ return BEHAVE_FORWARD_RW;
+}
+
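+/*
+ * True when the L1 guest hypervisor runs with HCR_EL2.{E2H,NV,NV2} all set
+ * and {TGE,NV1} clear, i.e. a VHE guest hypervisor whose EL2 state is backed
+ * by memory via NV2.
+ */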
+static bool is_nested_nv2_guest(struct kvm_vcpu *vcpu)
+{
+ u64 val;
+
+ val = __vcpu_sys_reg(vcpu, HCR_EL2);
+ return ((val & (HCR_E2H | HCR_TGE | HCR_NV2 | HCR_NV1 | HCR_NV)) == (HCR_E2H | HCR_NV2 | HCR_NV));
+}
+
+static enum trap_behaviour check_cnthctl_el1nvpct(struct kvm_vcpu *vcpu)
+{
+ if (!is_nested_nv2_guest(vcpu) ||
+ !(__vcpu_sys_reg(vcpu, CNTHCTL_EL2) & CNTHCTL_EL1NVPCT))
+ return BEHAVE_HANDLE_LOCALLY;
+
+ return BEHAVE_FORWARD_RW;
+}
+
+static enum trap_behaviour check_cnthctl_el1nvvct(struct kvm_vcpu *vcpu)
+{
+ if (!is_nested_nv2_guest(vcpu) ||
+ !(__vcpu_sys_reg(vcpu, CNTHCTL_EL2) & CNTHCTL_EL1NVVCT))
+ return BEHAVE_HANDLE_LOCALLY;
+
+ return BEHAVE_FORWARD_RW;
+}
+
+static enum trap_behaviour check_cptr_tta(struct kvm_vcpu *vcpu)
+{
+ u64 val = __vcpu_sys_reg(vcpu, CPTR_EL2);
+
+ if (!vcpu_el2_e2h_is_set(vcpu))
+ val = translate_cptr_el2_to_cpacr_el1(val);
+
+ if (val & CPACR_EL1_TTA)
+ return BEHAVE_FORWARD_RW;
+
+ return BEHAVE_HANDLE_LOCALLY;
+}
+
+static enum trap_behaviour check_mdcr_hpmn(struct kvm_vcpu *vcpu)
+{
+ u32 sysreg = esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu));
+ unsigned int idx;
+
+
+ switch (sysreg) {
+ case SYS_PMEVTYPERn_EL0(0) ... SYS_PMEVTYPERn_EL0(30):
+ case SYS_PMEVCNTRn_EL0(0) ... SYS_PMEVCNTRn_EL0(30):
+ idx = (sys_reg_CRm(sysreg) & 0x3) << 3 | sys_reg_Op2(sysreg);
+ break;
+ case SYS_PMXEVTYPER_EL0:
+ case SYS_PMXEVCNTR_EL0:
+ idx = SYS_FIELD_GET(PMSELR_EL0, SEL,
+ __vcpu_sys_reg(vcpu, PMSELR_EL0));
+ break;
+ default:
+ /* Someone used this trap helper for something else... */
+ KVM_BUG_ON(1, vcpu->kvm);
+ return BEHAVE_HANDLE_LOCALLY;
+ }
+
+ if (kvm_pmu_counter_is_hyp(vcpu, idx))
+ return BEHAVE_FORWARD_RW | BEHAVE_FORWARD_IN_HOST_EL0;
+
+ return BEHAVE_HANDLE_LOCALLY;
}
#define CCC(id, fn) \
@@ -416,6 +577,10 @@ static enum trap_behaviour check_cnthctl_el1pten(struct kvm_vcpu *vcpu)
static const complex_condition_check ccc[] = {
CCC(CGT_CNTHCTL_EL1PCTEN, check_cnthctl_el1pcten),
CCC(CGT_CNTHCTL_EL1PTEN, check_cnthctl_el1pten),
+ CCC(CGT_CNTHCTL_EL1NVPCT, check_cnthctl_el1nvpct),
+ CCC(CGT_CNTHCTL_EL1NVVCT, check_cnthctl_el1nvvct),
+ CCC(CGT_CPTR_TTA, check_cptr_tta),
+ CCC(CGT_MDCR_HPMN, check_mdcr_hpmn),
};
/*
@@ -487,9 +652,9 @@ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = {
SR_TRAP(SYS_CSSELR_EL1, CGT_HCR_TID2_TID4),
SR_RANGE_TRAP(SYS_ID_PFR0_EL1,
sys_reg(3, 0, 0, 7, 7), CGT_HCR_TID3),
- SR_TRAP(SYS_ICC_SGI0R_EL1, CGT_HCR_IMO_FMO),
- SR_TRAP(SYS_ICC_ASGI1R_EL1, CGT_HCR_IMO_FMO),
- SR_TRAP(SYS_ICC_SGI1R_EL1, CGT_HCR_IMO_FMO),
+ SR_TRAP(SYS_ICC_SGI0R_EL1, CGT_HCR_IMO_FMO_ICH_HCR_TC),
+ SR_TRAP(SYS_ICC_ASGI1R_EL1, CGT_HCR_IMO_FMO_ICH_HCR_TC),
+ SR_TRAP(SYS_ICC_SGI1R_EL1, CGT_HCR_IMO_FMO_ICH_HCR_TC),
SR_RANGE_TRAP(sys_reg(3, 0, 11, 0, 0),
sys_reg(3, 0, 11, 15, 7), CGT_HCR_TIDCP),
SR_RANGE_TRAP(sys_reg(3, 1, 11, 0, 0),
@@ -622,6 +787,11 @@ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = {
SR_TRAP(SYS_MAIR_EL1, CGT_HCR_TVM_TRVM),
SR_TRAP(SYS_AMAIR_EL1, CGT_HCR_TVM_TRVM),
SR_TRAP(SYS_CONTEXTIDR_EL1, CGT_HCR_TVM_TRVM),
+ SR_TRAP(SYS_PIR_EL1, CGT_HCR_TVM_TRVM),
+ SR_TRAP(SYS_PIRE0_EL1, CGT_HCR_TVM_TRVM),
+ SR_TRAP(SYS_POR_EL0, CGT_HCR_TVM_TRVM),
+ SR_TRAP(SYS_POR_EL1, CGT_HCR_TVM_TRVM),
+ SR_TRAP(SYS_TCR2_EL1, CGT_HCR_TVM_TRVM_HCRX_TCR2En),
SR_TRAP(SYS_DC_ZVA, CGT_HCR_TDZ),
SR_TRAP(SYS_DC_GVA, CGT_HCR_TDZ),
SR_TRAP(SYS_DC_GZVA, CGT_HCR_TDZ),
@@ -725,17 +895,22 @@ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = {
SYS_CNTHP_CVAL_EL2, CGT_HCR_NV),
SR_RANGE_TRAP(SYS_CNTHV_TVAL_EL2,
SYS_CNTHV_CVAL_EL2, CGT_HCR_NV),
- /* All _EL02, _EL12 registers */
+ /* All _EL02, _EL12 registers up to CNTKCTL_EL12 */
SR_RANGE_TRAP(sys_reg(3, 5, 0, 0, 0),
sys_reg(3, 5, 10, 15, 7), CGT_HCR_NV),
SR_RANGE_TRAP(sys_reg(3, 5, 12, 0, 0),
- sys_reg(3, 5, 14, 15, 7), CGT_HCR_NV),
+ sys_reg(3, 5, 14, 1, 0), CGT_HCR_NV),
+ SR_TRAP(SYS_CNTP_CTL_EL02, CGT_CNTHCTL_EL1NVPCT),
+ SR_TRAP(SYS_CNTP_CVAL_EL02, CGT_CNTHCTL_EL1NVPCT),
+ SR_TRAP(SYS_CNTV_CTL_EL02, CGT_CNTHCTL_EL1NVVCT),
+ SR_TRAP(SYS_CNTV_CVAL_EL02, CGT_CNTHCTL_EL1NVVCT),
SR_TRAP(OP_AT_S1E2R, CGT_HCR_NV),
SR_TRAP(OP_AT_S1E2W, CGT_HCR_NV),
SR_TRAP(OP_AT_S12E1R, CGT_HCR_NV),
SR_TRAP(OP_AT_S12E1W, CGT_HCR_NV),
SR_TRAP(OP_AT_S12E0R, CGT_HCR_NV),
SR_TRAP(OP_AT_S12E0W, CGT_HCR_NV),
+ SR_TRAP(OP_AT_S1E2A, CGT_HCR_NV),
SR_TRAP(OP_TLBI_IPAS2E1, CGT_HCR_NV),
SR_TRAP(OP_TLBI_RIPAS2E1, CGT_HCR_NV),
SR_TRAP(OP_TLBI_IPAS2LE1, CGT_HCR_NV),
@@ -817,6 +992,7 @@ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = {
SR_TRAP(OP_AT_S1E0W, CGT_HCR_AT),
SR_TRAP(OP_AT_S1E1RP, CGT_HCR_AT),
SR_TRAP(OP_AT_S1E1WP, CGT_HCR_AT),
+ SR_TRAP(OP_AT_S1E1A, CGT_HCR_AT),
SR_TRAP(SYS_ERXPFGF_EL1, CGT_HCR_nFIEN),
SR_TRAP(SYS_ERXPFGCTL_EL1, CGT_HCR_nFIEN),
SR_TRAP(SYS_ERXPFGCDN_EL1, CGT_HCR_nFIEN),
@@ -827,77 +1003,77 @@ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = {
SR_TRAP(SYS_PMOVSCLR_EL0, CGT_MDCR_TPM),
SR_TRAP(SYS_PMCEID0_EL0, CGT_MDCR_TPM),
SR_TRAP(SYS_PMCEID1_EL0, CGT_MDCR_TPM),
- SR_TRAP(SYS_PMXEVTYPER_EL0, CGT_MDCR_TPM),
+ SR_TRAP(SYS_PMXEVTYPER_EL0, CGT_MDCR_TPM_HPMN),
SR_TRAP(SYS_PMSWINC_EL0, CGT_MDCR_TPM),
SR_TRAP(SYS_PMSELR_EL0, CGT_MDCR_TPM),
- SR_TRAP(SYS_PMXEVCNTR_EL0, CGT_MDCR_TPM),
+ SR_TRAP(SYS_PMXEVCNTR_EL0, CGT_MDCR_TPM_HPMN),
SR_TRAP(SYS_PMCCNTR_EL0, CGT_MDCR_TPM),
SR_TRAP(SYS_PMUSERENR_EL0, CGT_MDCR_TPM),
SR_TRAP(SYS_PMINTENSET_EL1, CGT_MDCR_TPM),
SR_TRAP(SYS_PMINTENCLR_EL1, CGT_MDCR_TPM),
SR_TRAP(SYS_PMMIR_EL1, CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVCNTRn_EL0(0), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVCNTRn_EL0(1), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVCNTRn_EL0(2), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVCNTRn_EL0(3), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVCNTRn_EL0(4), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVCNTRn_EL0(5), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVCNTRn_EL0(6), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVCNTRn_EL0(7), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVCNTRn_EL0(8), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVCNTRn_EL0(9), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVCNTRn_EL0(10), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVCNTRn_EL0(11), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVCNTRn_EL0(12), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVCNTRn_EL0(13), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVCNTRn_EL0(14), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVCNTRn_EL0(15), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVCNTRn_EL0(16), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVCNTRn_EL0(17), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVCNTRn_EL0(18), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVCNTRn_EL0(19), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVCNTRn_EL0(20), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVCNTRn_EL0(21), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVCNTRn_EL0(22), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVCNTRn_EL0(23), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVCNTRn_EL0(24), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVCNTRn_EL0(25), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVCNTRn_EL0(26), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVCNTRn_EL0(27), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVCNTRn_EL0(28), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVCNTRn_EL0(29), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVCNTRn_EL0(30), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVTYPERn_EL0(0), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVTYPERn_EL0(1), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVTYPERn_EL0(2), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVTYPERn_EL0(3), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVTYPERn_EL0(4), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVTYPERn_EL0(5), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVTYPERn_EL0(6), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVTYPERn_EL0(7), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVTYPERn_EL0(8), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVTYPERn_EL0(9), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVTYPERn_EL0(10), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVTYPERn_EL0(11), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVTYPERn_EL0(12), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVTYPERn_EL0(13), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVTYPERn_EL0(14), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVTYPERn_EL0(15), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVTYPERn_EL0(16), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVTYPERn_EL0(17), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVTYPERn_EL0(18), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVTYPERn_EL0(19), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVTYPERn_EL0(20), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVTYPERn_EL0(21), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVTYPERn_EL0(22), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVTYPERn_EL0(23), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVTYPERn_EL0(24), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVTYPERn_EL0(25), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVTYPERn_EL0(26), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVTYPERn_EL0(27), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVTYPERn_EL0(28), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVTYPERn_EL0(29), CGT_MDCR_TPM),
- SR_TRAP(SYS_PMEVTYPERn_EL0(30), CGT_MDCR_TPM),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(0), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(1), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(2), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(3), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(4), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(5), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(6), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(7), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(8), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(9), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(10), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(11), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(12), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(13), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(14), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(15), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(16), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(17), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(18), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(19), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(20), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(21), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(22), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(23), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(24), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(25), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(26), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(27), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(28), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(29), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(30), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(0), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(1), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(2), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(3), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(4), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(5), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(6), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(7), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(8), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(9), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(10), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(11), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(12), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(13), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(14), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(15), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(16), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(17), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(18), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(19), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(20), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(21), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(22), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(23), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(24), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(25), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(26), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(27), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(28), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(29), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(30), CGT_MDCR_TPM_HPMN),
SR_TRAP(SYS_PMCCFILTR_EL0, CGT_MDCR_TPM),
SR_TRAP(SYS_MDCCSR_EL0, CGT_MDCR_TDCC_TDE_TDA),
SR_TRAP(SYS_MDCCINT_EL1, CGT_MDCR_TDCC_TDE_TDA),
@@ -1000,11 +1176,97 @@ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = {
SR_TRAP(SYS_TRBPTR_EL1, CGT_MDCR_E2TB),
SR_TRAP(SYS_TRBSR_EL1, CGT_MDCR_E2TB),
SR_TRAP(SYS_TRBTRG_EL1, CGT_MDCR_E2TB),
+ SR_TRAP(SYS_CPACR_EL1, CGT_CPTR_TCPAC),
+ SR_TRAP(SYS_AMUSERENR_EL0, CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMCFGR_EL0, CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMCGCR_EL0, CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMCNTENCLR0_EL0, CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMCNTENCLR1_EL0, CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMCNTENSET0_EL0, CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMCNTENSET1_EL0, CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMCR_EL0, CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR0_EL0(0), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR0_EL0(1), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR0_EL0(2), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR0_EL0(3), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR1_EL0(0), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR1_EL0(1), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR1_EL0(2), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR1_EL0(3), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR1_EL0(4), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR1_EL0(5), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR1_EL0(6), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR1_EL0(7), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR1_EL0(8), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR1_EL0(9), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR1_EL0(10), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR1_EL0(11), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR1_EL0(12), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR1_EL0(13), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR1_EL0(14), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR1_EL0(15), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER0_EL0(0), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER0_EL0(1), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER0_EL0(2), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER0_EL0(3), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER1_EL0(0), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER1_EL0(1), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER1_EL0(2), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER1_EL0(3), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER1_EL0(4), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER1_EL0(5), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER1_EL0(6), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER1_EL0(7), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER1_EL0(8), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER1_EL0(9), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER1_EL0(10), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER1_EL0(11), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER1_EL0(12), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER1_EL0(13), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER1_EL0(14), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER1_EL0(15), CGT_CPTR_TAM),
+ /* op0=2, op1=1, and CRn<0b1000 */
+ SR_RANGE_TRAP(sys_reg(2, 1, 0, 0, 0),
+ sys_reg(2, 1, 7, 15, 7), CGT_CPTR_TTA),
SR_TRAP(SYS_CNTP_TVAL_EL0, CGT_CNTHCTL_EL1PTEN),
SR_TRAP(SYS_CNTP_CVAL_EL0, CGT_CNTHCTL_EL1PTEN),
SR_TRAP(SYS_CNTP_CTL_EL0, CGT_CNTHCTL_EL1PTEN),
SR_TRAP(SYS_CNTPCT_EL0, CGT_CNTHCTL_EL1PCTEN),
SR_TRAP(SYS_CNTPCTSS_EL0, CGT_CNTHCTL_EL1PCTEN),
+ SR_TRAP(SYS_CNTV_TVAL_EL0, CGT_CNTHCTL_EL1TVT),
+ SR_TRAP(SYS_CNTV_CVAL_EL0, CGT_CNTHCTL_EL1TVT),
+ SR_TRAP(SYS_CNTV_CTL_EL0, CGT_CNTHCTL_EL1TVT),
+ SR_TRAP(SYS_CNTVCT_EL0, CGT_CNTHCTL_EL1TVCT),
+ SR_TRAP(SYS_CNTVCTSS_EL0, CGT_CNTHCTL_EL1TVCT),
+ SR_TRAP(SYS_FPMR, CGT_HCRX_EnFPM),
+ /*
+ * IMPDEF choice:
+ * We treat ICC_SRE_EL2.{SRE,Enable} and ICV_SRE_EL1.SRE as
+ * RAO/WI. We therefore never consider ICC_SRE_EL2.Enable for
+ * ICC_SRE_EL1 access, and always handle it locally.
+ */
+ SR_TRAP(SYS_ICC_AP0R0_EL1, CGT_ICH_HCR_TALL0),
+ SR_TRAP(SYS_ICC_AP0R1_EL1, CGT_ICH_HCR_TALL0),
+ SR_TRAP(SYS_ICC_AP0R2_EL1, CGT_ICH_HCR_TALL0),
+ SR_TRAP(SYS_ICC_AP0R3_EL1, CGT_ICH_HCR_TALL0),
+ SR_TRAP(SYS_ICC_AP1R0_EL1, CGT_ICH_HCR_TALL1),
+ SR_TRAP(SYS_ICC_AP1R1_EL1, CGT_ICH_HCR_TALL1),
+ SR_TRAP(SYS_ICC_AP1R2_EL1, CGT_ICH_HCR_TALL1),
+ SR_TRAP(SYS_ICC_AP1R3_EL1, CGT_ICH_HCR_TALL1),
+ SR_TRAP(SYS_ICC_BPR0_EL1, CGT_ICH_HCR_TALL0),
+ SR_TRAP(SYS_ICC_BPR1_EL1, CGT_ICH_HCR_TALL1),
+ SR_TRAP(SYS_ICC_CTLR_EL1, CGT_ICH_HCR_TC),
+ SR_TRAP(SYS_ICC_DIR_EL1, CGT_ICH_HCR_TC_TDIR),
+ SR_TRAP(SYS_ICC_EOIR0_EL1, CGT_ICH_HCR_TALL0),
+ SR_TRAP(SYS_ICC_EOIR1_EL1, CGT_ICH_HCR_TALL1),
+ SR_TRAP(SYS_ICC_HPPIR0_EL1, CGT_ICH_HCR_TALL0),
+ SR_TRAP(SYS_ICC_HPPIR1_EL1, CGT_ICH_HCR_TALL1),
+ SR_TRAP(SYS_ICC_IAR0_EL1, CGT_ICH_HCR_TALL0),
+ SR_TRAP(SYS_ICC_IAR1_EL1, CGT_ICH_HCR_TALL1),
+ SR_TRAP(SYS_ICC_IGRPEN0_EL1, CGT_ICH_HCR_TALL0),
+ SR_TRAP(SYS_ICC_IGRPEN1_EL1, CGT_ICH_HCR_TALL1),
+ SR_TRAP(SYS_ICC_PMR_EL1, CGT_ICH_HCR_TC),
+ SR_TRAP(SYS_ICC_RPR_EL1, CGT_ICH_HCR_TC),
};
static DEFINE_XARRAY(sr_forward_xa);
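Editorial aside: the hunks above grow the coarse-grained trap table, and the triage changes later in this file fold per-register behaviour flags (BEHAVE_FORWARD_READ/WRITE plus the new BEHAVE_FORWARD_IN_HOST_EL0) into one forward-or-handle-locally decision. The following standalone sketch only illustrates that shape of decision; the enum values and function are hypothetical, not the kernel's definitions.

/*
 * Illustrative sketch only: deriving a forwarding decision from a
 * behaviour bitmask of the kind used by triage_sysreg_trap().
 */
#include <stdbool.h>

enum behave {
	FWD_READ	= 1 << 0,	/* forward reads to the guest hypervisor  */
	FWD_WRITE	= 1 << 1,	/* forward writes to the guest hypervisor */
	FWD_IN_HOST_EL0	= 1 << 2,	/* trap also applies while in host EL0    */
};

static bool should_forward(unsigned int b, bool is_read, bool in_host_el0)
{
	/* Traps constrained to EL0 are ignored unless running there. */
	if (in_host_el0 && !(b & FWD_IN_HOST_EL0))
		return false;

	return is_read ? (b & FWD_READ) : (b & FWD_WRITE);
}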
@@ -1071,6 +1333,7 @@ static const struct encoding_to_trap_config encoding_to_fgt[] __initconst = {
SR_FGT(SYS_TPIDRRO_EL0, HFGxTR, TPIDRRO_EL0, 1),
SR_FGT(SYS_TPIDR_EL1, HFGxTR, TPIDR_EL1, 1),
SR_FGT(SYS_TCR_EL1, HFGxTR, TCR_EL1, 1),
+ SR_FGT(SYS_TCR2_EL1, HFGxTR, TCR_EL1, 1),
SR_FGT(SYS_SCXTNUM_EL0, HFGxTR, SCXTNUM_EL0, 1),
SR_FGT(SYS_SCXTNUM_EL1, HFGxTR, SCXTNUM_EL1, 1),
SR_FGT(SYS_SCTLR_EL1, HFGxTR, SCTLR_EL1, 1),
@@ -1846,7 +2109,8 @@ check_mcb:
cgids = coarse_control_combo[id - __MULTIPLE_CONTROL_BITS__];
for (int i = 0; cgids[i] != __RESERVED__; i++) {
- if (cgids[i] >= __MULTIPLE_CONTROL_BITS__) {
+ if (cgids[i] >= __MULTIPLE_CONTROL_BITS__ &&
+ cgids[i] < __COMPLEX_CONDITIONS__) {
kvm_err("Recursive MCB %d/%d\n", id, cgids[i]);
ret = -EINVAL;
}
@@ -1951,11 +2215,19 @@ static u64 kvm_get_sysreg_res0(struct kvm *kvm, enum vcpu_sysreg sr)
return masks->mask[sr - __VNCR_START__].res0;
}
-static bool check_fgt_bit(struct kvm *kvm, bool is_read,
+static bool check_fgt_bit(struct kvm_vcpu *vcpu, bool is_read,
u64 val, const union trap_config tc)
{
+ struct kvm *kvm = vcpu->kvm;
enum vcpu_sysreg sr;
+ /*
+ * KVM doesn't know about any FGTs that apply to the host, and hopefully
+ * that'll remain the case.
+ */
+ if (is_hyp_ctxt(vcpu))
+ return false;
+
if (tc.pol)
return (val & BIT(tc.bit));
@@ -2032,7 +2304,15 @@ bool triage_sysreg_trap(struct kvm_vcpu *vcpu, int *sr_index)
* If we're not nesting, immediately return to the caller, with the
* sysreg index, should we have it.
*/
- if (!vcpu_has_nv(vcpu) || is_hyp_ctxt(vcpu))
+ if (!vcpu_has_nv(vcpu))
+ goto local;
+
+ /*
+ * There are a few traps that take effect InHost, but are constrained
+ * to EL0. Don't bother with computing the trap behaviour if the vCPU
+ * isn't in EL0.
+ */
+ if (is_hyp_ctxt(vcpu) && !vcpu_is_host_el0(vcpu))
goto local;
switch ((enum fgt_group_id)tc.fgt) {
@@ -2078,12 +2358,14 @@ bool triage_sysreg_trap(struct kvm_vcpu *vcpu, int *sr_index)
goto local;
}
- if (tc.fgt != __NO_FGT_GROUP__ && check_fgt_bit(vcpu->kvm, is_read,
- val, tc))
+ if (tc.fgt != __NO_FGT_GROUP__ && check_fgt_bit(vcpu, is_read, val, tc))
goto inject;
b = compute_trap_behaviour(vcpu, tc);
+ if (!(b & BEHAVE_FORWARD_IN_HOST_EL0) && vcpu_is_host_el0(vcpu))
+ goto local;
+
if (((b & BEHAVE_FORWARD_READ) && is_read) ||
((b & BEHAVE_FORWARD_WRITE) && !is_read))
goto inject;
@@ -2117,6 +2399,41 @@ inject:
return true;
}
+static bool __forward_traps(struct kvm_vcpu *vcpu, unsigned int reg, u64 control_bit)
+{
+ bool control_bit_set;
+
+ if (!vcpu_has_nv(vcpu))
+ return false;
+
+ control_bit_set = __vcpu_sys_reg(vcpu, reg) & control_bit;
+ if (!is_hyp_ctxt(vcpu) && control_bit_set) {
+ kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu));
+ return true;
+ }
+ return false;
+}
+
+static bool forward_hcr_traps(struct kvm_vcpu *vcpu, u64 control_bit)
+{
+ return __forward_traps(vcpu, HCR_EL2, control_bit);
+}
+
+bool forward_smc_trap(struct kvm_vcpu *vcpu)
+{
+ return forward_hcr_traps(vcpu, HCR_TSC);
+}
+
+static bool forward_mdcr_traps(struct kvm_vcpu *vcpu, u64 control_bit)
+{
+ return __forward_traps(vcpu, MDCR_EL2, control_bit);
+}
+
+bool forward_debug_exception(struct kvm_vcpu *vcpu)
+{
+ return forward_mdcr_traps(vcpu, MDCR_EL2_TDE);
+}
+
static u64 kvm_check_illegal_exception_return(struct kvm_vcpu *vcpu, u64 spsr)
{
u64 mode = spsr & PSR_MODE_MASK;
@@ -2152,49 +2469,57 @@ static u64 kvm_check_illegal_exception_return(struct kvm_vcpu *vcpu, u64 spsr)
void kvm_emulate_nested_eret(struct kvm_vcpu *vcpu)
{
- u64 spsr, elr, mode;
- bool direct_eret;
+ u64 spsr, elr, esr;
/*
- * Going through the whole put/load motions is a waste of time
- * if this is a VHE guest hypervisor returning to its own
- * userspace, or the hypervisor performing a local exception
- * return. No need to save/restore registers, no need to
- * switch S2 MMU. Just do the canonical ERET.
+ * Forward this trap to the virtual EL2 if the virtual
+ * HCR_EL2.NV bit is set and this is coming from !EL2.
*/
+ if (forward_hcr_traps(vcpu, HCR_NV))
+ return;
+
spsr = vcpu_read_sys_reg(vcpu, SPSR_EL2);
spsr = kvm_check_illegal_exception_return(vcpu, spsr);
- mode = spsr & (PSR_MODE_MASK | PSR_MODE32_BIT);
-
- direct_eret = (mode == PSR_MODE_EL0t &&
- vcpu_el2_e2h_is_set(vcpu) &&
- vcpu_el2_tge_is_set(vcpu));
- direct_eret |= (mode == PSR_MODE_EL2h || mode == PSR_MODE_EL2t);
-
- if (direct_eret) {
- *vcpu_pc(vcpu) = vcpu_read_sys_reg(vcpu, ELR_EL2);
- *vcpu_cpsr(vcpu) = spsr;
- trace_kvm_nested_eret(vcpu, *vcpu_pc(vcpu), spsr);
- return;
+ /* Check for an ERETAx */
+ esr = kvm_vcpu_get_esr(vcpu);
+ if (esr_iss_is_eretax(esr) && !kvm_auth_eretax(vcpu, &elr)) {
+ /*
+ * Oh no, ERETAx failed to authenticate.
+ *
+ * If we have FPACCOMBINE and we don't have a pending
+ * Illegal Execution State exception (which has priority
+ * over FPAC), deliver an exception right away.
+ *
+ * Otherwise, let the mangled ELR value trickle down the
+ * ERET handling, and the guest will have a little surprise.
+ */
+ if (kvm_has_pauth(vcpu->kvm, FPACCOMBINE) && !(spsr & PSR_IL_BIT)) {
+ esr &= ESR_ELx_ERET_ISS_ERETA;
+ esr |= FIELD_PREP(ESR_ELx_EC_MASK, ESR_ELx_EC_FPAC);
+ kvm_inject_nested_sync(vcpu, esr);
+ return;
+ }
}
preempt_disable();
+ vcpu_set_flag(vcpu, IN_NESTED_ERET);
kvm_arch_vcpu_put(vcpu);
- elr = __vcpu_sys_reg(vcpu, ELR_EL2);
+ if (!esr_iss_is_eretax(esr))
+ elr = __vcpu_sys_reg(vcpu, ELR_EL2);
trace_kvm_nested_eret(vcpu, elr, spsr);
- /*
- * Note that the current exception level is always the virtual EL2,
- * since we set HCR_EL2.NV bit only when entering the virtual EL2.
- */
*vcpu_pc(vcpu) = elr;
*vcpu_cpsr(vcpu) = spsr;
kvm_arch_vcpu_load(vcpu, smp_processor_id());
+ vcpu_clear_flag(vcpu, IN_NESTED_ERET);
preempt_enable();
+
+ if (kvm_vcpu_has_pmu(vcpu))
+ kvm_pmu_nested_transition(vcpu);
}
static void kvm_inject_el2_exception(struct kvm_vcpu *vcpu, u64 esr_el2,
@@ -2277,6 +2602,9 @@ static int kvm_inject_nested(struct kvm_vcpu *vcpu, u64 esr_el2,
kvm_arch_vcpu_load(vcpu, smp_processor_id());
preempt_enable();
+ if (kvm_vcpu_has_pmu(vcpu))
+ kvm_pmu_nested_transition(vcpu);
+
return 1;
}
diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c
index 826307e19e3a..7f6e43d25691 100644
--- a/arch/arm64/kvm/fpsimd.c
+++ b/arch/arm64/kvm/fpsimd.c
@@ -14,19 +14,6 @@
#include <asm/kvm_mmu.h>
#include <asm/sysreg.h>
-void kvm_vcpu_unshare_task_fp(struct kvm_vcpu *vcpu)
-{
- struct task_struct *p = vcpu->arch.parent_task;
- struct user_fpsimd_state *fpsimd;
-
- if (!is_protected_kvm_enabled() || !p)
- return;
-
- fpsimd = &p->thread.uw.fpsimd_state;
- kvm_unshare_hyp(fpsimd, fpsimd + 1);
- put_task_struct(p);
-}
-
/*
* Called on entry to KVM_RUN unless this vcpu previously ran at least
* once and the most recent prior KVM_RUN for this vcpu was called from
@@ -38,30 +25,18 @@ void kvm_vcpu_unshare_task_fp(struct kvm_vcpu *vcpu)
*/
int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu)
{
- int ret;
-
struct user_fpsimd_state *fpsimd = &current->thread.uw.fpsimd_state;
+ int ret;
- kvm_vcpu_unshare_task_fp(vcpu);
+ /* pKVM has its own tracking of the host fpsimd state. */
+ if (is_protected_kvm_enabled())
+ return 0;
/* Make sure the host task fpsimd state is visible to hyp: */
ret = kvm_share_hyp(fpsimd, fpsimd + 1);
if (ret)
return ret;
- vcpu->arch.host_fpsimd_state = kern_hyp_va(fpsimd);
-
- /*
- * We need to keep current's task_struct pinned until its data has been
- * unshared with the hypervisor to make sure it is not re-used by the
- * kernel and donated to someone else while already shared -- see
- * kvm_vcpu_unshare_task_fp() for the matching put_task_struct().
- */
- if (is_protected_kvm_enabled()) {
- get_task_struct(current);
- vcpu->arch.parent_task = current;
- }
-
return 0;
}
@@ -79,41 +54,18 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu)
if (!system_supports_fpsimd())
return;
- fpsimd_kvm_prepare();
-
/*
- * We will check TIF_FOREIGN_FPSTATE just before entering the
- * guest in kvm_arch_vcpu_ctxflush_fp() and override this to
- * FP_STATE_FREE if the flag set.
+ * Ensure that any host FPSIMD/SVE/SME state is saved and unbound such
+ * that the host kernel is responsible for restoring this state upon
+ * return to userspace, and the hyp code doesn't need to save anything.
+ *
+ * When the host may use SME, fpsimd_save_and_flush_cpu_state() ensures
+ * that PSTATE.{SM,ZA} == {0,0}.
*/
- vcpu->arch.fp_state = FP_STATE_HOST_OWNED;
-
- vcpu_clear_flag(vcpu, HOST_SVE_ENABLED);
- if (read_sysreg(cpacr_el1) & CPACR_EL1_ZEN_EL0EN)
- vcpu_set_flag(vcpu, HOST_SVE_ENABLED);
-
- if (system_supports_sme()) {
- vcpu_clear_flag(vcpu, HOST_SME_ENABLED);
- if (read_sysreg(cpacr_el1) & CPACR_EL1_SMEN_EL0EN)
- vcpu_set_flag(vcpu, HOST_SME_ENABLED);
+ fpsimd_save_and_flush_cpu_state();
+ *host_data_ptr(fp_owner) = FP_STATE_FREE;
- /*
- * If PSTATE.SM is enabled then save any pending FP
- * state and disable PSTATE.SM. If we leave PSTATE.SM
- * enabled and the guest does not enable SME via
- * CPACR_EL1.SMEN then operations that should be valid
- * may generate SME traps from EL1 to EL1 which we
- * can't intercept and which would confuse the guest.
- *
- * Do the same for PSTATE.ZA in the case where there
- * is state in the registers which has not already
- * been saved, this is very unlikely to happen.
- */
- if (read_sysreg_s(SYS_SVCR) & (SVCR_SM_MASK | SVCR_ZA_MASK)) {
- vcpu->arch.fp_state = FP_STATE_FREE;
- fpsimd_save_and_flush_cpu_state();
- }
- }
+ WARN_ON_ONCE(system_supports_sme() && read_sysreg_s(SYS_SVCR));
}
/*
@@ -126,7 +78,7 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu)
void kvm_arch_vcpu_ctxflush_fp(struct kvm_vcpu *vcpu)
{
if (test_thread_flag(TIF_FOREIGN_FPSTATE))
- vcpu->arch.fp_state = FP_STATE_FREE;
+ *host_data_ptr(fp_owner) = FP_STATE_FREE;
}
/*
@@ -142,8 +94,7 @@ void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu)
WARN_ON_ONCE(!irqs_disabled());
- if (vcpu->arch.fp_state == FP_STATE_GUEST_OWNED) {
-
+ if (guest_owns_fp_regs()) {
/*
* Currently we do not support SME guests so SVCR is
* always 0 and we just need a variable to point to.
@@ -152,8 +103,8 @@ void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu)
fp_state.sve_state = vcpu->arch.sve_state;
fp_state.sve_vl = vcpu->arch.sve_max_vl;
fp_state.sme_state = NULL;
- fp_state.svcr = &vcpu->arch.svcr;
- fp_state.fpmr = &vcpu->arch.fpmr;
+ fp_state.svcr = &__vcpu_sys_reg(vcpu, SVCR);
+ fp_state.fpmr = &__vcpu_sys_reg(vcpu, FPMR);
fp_state.fp_type = &vcpu->arch.fp_type;
if (vcpu_has_sve(vcpu))
@@ -179,46 +130,18 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu)
local_irq_save(flags);
- /*
- * If we have VHE then the Hyp code will reset CPACR_EL1 to
- * the default value and we need to reenable SME.
- */
- if (has_vhe() && system_supports_sme()) {
- /* Also restore EL0 state seen on entry */
- if (vcpu_get_flag(vcpu, HOST_SME_ENABLED))
- sysreg_clear_set(CPACR_EL1, 0,
- CPACR_EL1_SMEN_EL0EN |
- CPACR_EL1_SMEN_EL1EN);
- else
- sysreg_clear_set(CPACR_EL1,
- CPACR_EL1_SMEN_EL0EN,
- CPACR_EL1_SMEN_EL1EN);
- isb();
- }
-
- if (vcpu->arch.fp_state == FP_STATE_GUEST_OWNED) {
- if (vcpu_has_sve(vcpu)) {
- __vcpu_sys_reg(vcpu, ZCR_EL1) = read_sysreg_el1(SYS_ZCR);
-
- /* Restore the VL that was saved when bound to the CPU */
- if (!has_vhe())
- sve_cond_update_zcr_vq(vcpu_sve_max_vq(vcpu) - 1,
- SYS_ZCR_EL1);
- }
-
- fpsimd_save_and_flush_cpu_state();
- } else if (has_vhe() && system_supports_sve()) {
+ if (guest_owns_fp_regs()) {
/*
- * The FPSIMD/SVE state in the CPU has not been touched, and we
- * have SVE (and VHE): CPACR_EL1 (alias CPTR_EL2) has been
- * reset by kvm_reset_cptr_el2() in the Hyp code, disabling SVE
- * for EL0. To avoid spurious traps, restore the trap state
- * seen by kvm_arch_vcpu_load_fp():
+ * Flush (save and invalidate) the fpsimd/sve state so that if
+ * the host tries to use fpsimd/sve, it's not using stale data
+ * from the guest.
+ *
+ * Flushing the state sets the TIF_FOREIGN_FPSTATE bit for the
+ * context unconditionally, in both nVHE and VHE. This allows
+ * the kernel to restore the fpsimd/sve state, including ZCR_EL1
+ * when needed.
*/
- if (vcpu_get_flag(vcpu, HOST_SVE_ENABLED))
- sysreg_clear_set(CPACR_EL1, 0, CPACR_EL1_ZEN_EL0EN);
- else
- sysreg_clear_set(CPACR_EL1, CPACR_EL1_ZEN_EL0EN, 0);
+ fpsimd_save_and_flush_cpu_state();
}
local_irq_restore(flags);
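Editorial aside: the fpsimd.c rework above replaces a handful of per-vCPU flags with a single per-CPU ownership marker for the FP/SIMD register file. A minimal standalone sketch of that lifecycle follows; the names are hypothetical, the real tracking lives in per-CPU host data and also distinguishes a host-owned state.

/* Illustrative sketch only: FP register ownership across a vCPU run. */
enum fp_owner { FP_FREE, FP_GUEST_OWNED };

static enum fp_owner fp_owner;

static void on_vcpu_load(void)
{
	/* Host FP/SVE/SME state is saved and unbound up front. */
	fp_owner = FP_FREE;
}

static void on_guest_fp_trap(void)
{
	/* First guest FP access: restore guest registers lazily. */
	fp_owner = FP_GUEST_OWNED;
}

static void on_vcpu_put(void)
{
	/* Save and invalidate guest state so the host never sees it. */
	if (fp_owner == FP_GUEST_OWNED)
		fp_owner = FP_FREE;
}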
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index e2f762d959bb..2196979a24a3 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -251,6 +251,7 @@ static int set_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
case PSR_AA32_MODE_SVC:
case PSR_AA32_MODE_ABT:
case PSR_AA32_MODE_UND:
+ case PSR_AA32_MODE_SYS:
if (!vcpu_el1_is_32bit(vcpu))
return -EINVAL;
break;
@@ -276,7 +277,7 @@ static int set_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
if (*vcpu_cpsr(vcpu) & PSR_MODE32_BIT) {
int i, nr_reg;
- switch (*vcpu_cpsr(vcpu)) {
+ switch (*vcpu_cpsr(vcpu) & PSR_AA32_MODE_MASK) {
/*
* Either we are dealing with user mode, and only the
* first 15 registers (+ PC) must be narrowed to 32bit.
@@ -916,31 +917,24 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
struct kvm_guest_debug *dbg)
{
- int ret = 0;
-
trace_kvm_set_guest_debug(vcpu, dbg->control);
- if (dbg->control & ~KVM_GUESTDBG_VALID_MASK) {
- ret = -EINVAL;
- goto out;
- }
-
- if (dbg->control & KVM_GUESTDBG_ENABLE) {
- vcpu->guest_debug = dbg->control;
-
- /* Hardware assisted Break and Watch points */
- if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) {
- vcpu->arch.external_debug_state = dbg->arch;
- }
+ if (dbg->control & ~KVM_GUESTDBG_VALID_MASK)
+ return -EINVAL;
- } else {
- /* If not enabled clear all flags */
+ if (!(dbg->control & KVM_GUESTDBG_ENABLE)) {
vcpu->guest_debug = 0;
- vcpu_clear_flag(vcpu, DBG_SS_ACTIVE_PENDING);
+ vcpu_clear_flag(vcpu, HOST_SS_ACTIVE_PENDING);
+ return 0;
}
-out:
- return ret;
+ vcpu->guest_debug = dbg->control;
+
+ /* Hardware assisted Break and Watch points */
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW)
+ vcpu->arch.external_debug_state = dbg->arch;
+
+ return 0;
}
int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu,
@@ -1044,50 +1038,64 @@ int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm,
mutex_lock(&kvm->slots_lock);
+ if (write && atomic_read(&kvm->nr_memslots_dirty_logging)) {
+ ret = -EBUSY;
+ goto out;
+ }
+
while (length > 0) {
- kvm_pfn_t pfn = gfn_to_pfn_prot(kvm, gfn, write, NULL);
+ struct page *page = __gfn_to_page(kvm, gfn, write);
void *maddr;
unsigned long num_tags;
- struct page *page;
+ struct folio *folio;
- if (is_error_noslot_pfn(pfn)) {
+ if (!page) {
ret = -EFAULT;
goto out;
}
- page = pfn_to_online_page(pfn);
- if (!page) {
+ if (!pfn_to_online_page(page_to_pfn(page))) {
/* Reject ZONE_DEVICE memory */
+ kvm_release_page_unused(page);
ret = -EFAULT;
goto out;
}
+ folio = page_folio(page);
maddr = page_address(page);
if (!write) {
- if (page_mte_tagged(page))
+ if ((folio_test_hugetlb(folio) &&
+ folio_test_hugetlb_mte_tagged(folio)) ||
+ page_mte_tagged(page))
num_tags = mte_copy_tags_to_user(tags, maddr,
MTE_GRANULES_PER_PAGE);
else
/* No tags in memory, so write zeros */
num_tags = MTE_GRANULES_PER_PAGE -
clear_user(tags, MTE_GRANULES_PER_PAGE);
- kvm_release_pfn_clean(pfn);
+ kvm_release_page_clean(page);
} else {
/*
* Only locking to serialise with a concurrent
* __set_ptes() in the VMM but still overriding the
* tags, hence ignoring the return value.
*/
- try_page_mte_tagging(page);
+ if (folio_test_hugetlb(folio))
+ folio_try_hugetlb_mte_tagging(folio);
+ else
+ try_page_mte_tagging(page);
num_tags = mte_copy_tags_from_user(maddr, tags,
MTE_GRANULES_PER_PAGE);
/* uaccess failed, don't leave stale tags */
if (num_tags != MTE_GRANULES_PER_PAGE)
mte_clear_page_tags(maddr);
- set_page_mte_tagged(page);
+ if (folio_test_hugetlb(folio))
+ folio_set_hugetlb_mte_tagged(folio);
+ else
+ set_page_mte_tagged(page);
- kvm_release_pfn_dirty(pfn);
+ kvm_release_page_dirty(page);
}
if (num_tags != MTE_GRANULES_PER_PAGE) {
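Editorial aside on the tag-copy loop above: MTE associates one tag with each 16-byte granule, so the per-page count compared against num_tags is simply the page size divided by the granule size. A small illustrative helper, with an assumed constant rather than the kernel's macro:

/*
 * Illustrative only: MTE tags cover 16-byte granules, so a 4K page
 * carries 4096 / 16 = 256 tags, which is what MTE_GRANULES_PER_PAGE
 * expands to for 4K pages.
 */
#define GRANULE_SIZE	16UL

static unsigned long tags_per_page(unsigned long page_size)
{
	return page_size / GRANULE_SIZE;
}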
diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c
index 617ae6dea5d5..b73dc26bc44b 100644
--- a/arch/arm64/kvm/handle_exit.c
+++ b/arch/arm64/kvm/handle_exit.c
@@ -56,6 +56,13 @@ static int handle_hvc(struct kvm_vcpu *vcpu)
static int handle_smc(struct kvm_vcpu *vcpu)
{
/*
+ * Forward this trapped smc instruction to the virtual EL2 if
+ * the guest has asked for it.
+ */
+ if (forward_smc_trap(vcpu))
+ return 1;
+
+ /*
* "If an SMC instruction executed at Non-secure EL1 is
* trapped to EL2 because HCR_EL2.TSC is 1, the exception is a
* Trap exception, not a Secure Monitor Call exception [...]"
@@ -87,11 +94,19 @@ static int handle_smc(struct kvm_vcpu *vcpu)
}
/*
- * Guest access to FP/ASIMD registers are routed to this handler only
- * when the system doesn't support FP/ASIMD.
+ * This handles the cases where the system does not support FP/ASIMD or when
+ * we are running nested virtualization and the guest hypervisor is trapping
+ * FP/ASIMD accesses by its own guest.
+ *
+ * All other handling of guest vs. host FP/ASIMD register state is handled in
+ * fixup_guest_exit().
*/
-static int handle_no_fpsimd(struct kvm_vcpu *vcpu)
+static int kvm_handle_fpasimd(struct kvm_vcpu *vcpu)
{
+ if (guest_hyp_fpsimd_traps_enabled(vcpu))
+ return kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu));
+
+ /* This is the case when the system doesn't support FP/ASIMD. */
kvm_inject_undefined(vcpu);
return 1;
}
@@ -114,8 +129,12 @@ static int handle_no_fpsimd(struct kvm_vcpu *vcpu)
static int kvm_handle_wfx(struct kvm_vcpu *vcpu)
{
u64 esr = kvm_vcpu_get_esr(vcpu);
+ bool is_wfe = !!(esr & ESR_ELx_WFx_ISS_WFE);
- if (esr & ESR_ELx_WFx_ISS_WFE) {
+ if (guest_hyp_wfx_traps_enabled(vcpu))
+ return kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu));
+
+ if (is_wfe) {
trace_kvm_wfx_arm64(*vcpu_pc(vcpu), true);
vcpu->stat.wfe_exit_stat++;
} else {
@@ -168,6 +187,9 @@ static int kvm_handle_guest_debug(struct kvm_vcpu *vcpu)
struct kvm_run *run = vcpu->run;
u64 esr = kvm_vcpu_get_esr(vcpu);
+ if (!vcpu->guest_debug && forward_debug_exception(vcpu))
+ return 1;
+
run->exit_reason = KVM_EXIT_DEBUG;
run->debug.arch.hsr = lower_32_bits(esr);
run->debug.arch.hsr_high = upper_32_bits(esr);
@@ -178,7 +200,7 @@ static int kvm_handle_guest_debug(struct kvm_vcpu *vcpu)
run->debug.arch.far = vcpu->arch.fault.far_el2;
break;
case ESR_ELx_EC_SOFTSTP_LOW:
- vcpu_clear_flag(vcpu, DBG_SS_ACTIVE_PENDING);
+ *vcpu_cpsr(vcpu) |= DBG_SPSR_SS;
break;
}
@@ -202,24 +224,48 @@ static int kvm_handle_unknown_ec(struct kvm_vcpu *vcpu)
*/
static int handle_sve(struct kvm_vcpu *vcpu)
{
+ if (guest_hyp_sve_traps_enabled(vcpu))
+ return kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu));
+
kvm_inject_undefined(vcpu);
return 1;
}
/*
- * Guest usage of a ptrauth instruction (which the guest EL1 did not turn into
- * a NOP). If we get here, it is that we didn't fixup ptrauth on exit, and all
- * that we can do is give the guest an UNDEF.
+ * Two possibilities to handle a trapping ptrauth instruction:
+ *
+ * - Guest usage of a ptrauth instruction (which the guest EL1 did not
+ * turn into a NOP). If we get here, it is because we didn't enable
+ * ptrauth for the guest. This results in an UNDEF, as the guest isn't
+ * supposed to use ptrauth without being told it could.
+ *
+ * - Running an L2 NV guest while L1 has left HCR_EL2.API==0, and for
+ * which we reinject the exception into L1.
+ *
+ * Anything else is an emulation bug (hence the WARN_ON + UNDEF).
*/
static int kvm_handle_ptrauth(struct kvm_vcpu *vcpu)
{
+ if (!vcpu_has_ptrauth(vcpu)) {
+ kvm_inject_undefined(vcpu);
+ return 1;
+ }
+
+ if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) {
+ kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu));
+ return 1;
+ }
+
+ /* Really shouldn't be here! */
+ WARN_ON_ONCE(1);
kvm_inject_undefined(vcpu);
return 1;
}
static int kvm_handle_eret(struct kvm_vcpu *vcpu)
{
- if (kvm_vcpu_get_esr(vcpu) & ESR_ELx_ERET_ISS_ERET)
+ if (esr_iss_is_eretax(kvm_vcpu_get_esr(vcpu)) &&
+ !vcpu_has_ptrauth(vcpu))
return kvm_handle_ptrauth(vcpu);
/*
@@ -276,7 +322,7 @@ static exit_handle_fn arm_exit_handlers[] = {
[ESR_ELx_EC_BREAKPT_LOW]= kvm_handle_guest_debug,
[ESR_ELx_EC_BKPT32] = kvm_handle_guest_debug,
[ESR_ELx_EC_BRK64] = kvm_handle_guest_debug,
- [ESR_ELx_EC_FP_ASIMD] = handle_no_fpsimd,
+ [ESR_ELx_EC_FP_ASIMD] = kvm_handle_fpasimd,
[ESR_ELx_EC_PAC] = kvm_handle_ptrauth,
};
@@ -383,6 +429,20 @@ void handle_exit_early(struct kvm_vcpu *vcpu, int exception_index)
kvm_handle_guest_serror(vcpu, kvm_vcpu_get_esr(vcpu));
}
+static void print_nvhe_hyp_panic(const char *name, u64 panic_addr)
+{
+ kvm_err("nVHE hyp %s at: [<%016llx>] %pB!\n", name, panic_addr,
+ (void *)(panic_addr + kaslr_offset()));
+}
+
+static void kvm_nvhe_report_cfi_failure(u64 panic_addr)
+{
+ print_nvhe_hyp_panic("CFI failure", panic_addr);
+
+ if (IS_ENABLED(CONFIG_CFI_PERMISSIVE))
+ kvm_err(" (CONFIG_CFI_PERMISSIVE ignored for hyp failures)\n");
+}
+
void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr,
u64 elr_virt, u64 elr_phys,
u64 par, uintptr_t vcpu,
@@ -395,7 +455,7 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr,
if (mode != PSR_MODE_EL2t && mode != PSR_MODE_EL2h) {
kvm_err("Invalid host exception to nVHE hyp!\n");
} else if (ESR_ELx_EC(esr) == ESR_ELx_EC_BRK64 &&
- (esr & ESR_ELx_BRK64_ISS_COMMENT_MASK) == BUG_BRK_IMM) {
+ esr_brk_comment(esr) == BUG_BRK_IMM) {
const char *file = NULL;
unsigned int line = 0;
@@ -411,11 +471,11 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr,
if (file)
kvm_err("nVHE hyp BUG at: %s:%u!\n", file, line);
else
- kvm_err("nVHE hyp BUG at: [<%016llx>] %pB!\n", panic_addr,
- (void *)(panic_addr + kaslr_offset()));
+ print_nvhe_hyp_panic("BUG", panic_addr);
+ } else if (IS_ENABLED(CONFIG_CFI_CLANG) && esr_is_cfi_brk(esr)) {
+ kvm_nvhe_report_cfi_failure(panic_addr);
} else {
- kvm_err("nVHE hyp panic at: [<%016llx>] %pB!\n", panic_addr,
- (void *)(panic_addr + kaslr_offset()));
+ print_nvhe_hyp_panic("panic", panic_addr);
}
/* Dump the nVHE hypervisor backtrace */
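Editorial aside: the panic decoding above tells BUG() breakpoints apart from CFI-failure breakpoints by the 16-bit immediate that a BRK instruction reports in the low bits of the ESR ISS. A minimal sketch of that extraction (the kernel wraps it in esr_brk_comment()/esr_is_cfi_brk() helpers):

/*
 * Illustrative sketch: a BRK64 exception reports the instruction's
 * 16-bit immediate ("comment") in the low bits of ESR_ELx, which is
 * how BUG_BRK_IMM and the CFI break immediates are distinguished.
 */
static unsigned int brk_comment(unsigned long long esr)
{
	return esr & 0xffff;
}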
diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile
index a38dea6186c9..d61e44642f98 100644
--- a/arch/arm64/kvm/hyp/Makefile
+++ b/arch/arm64/kvm/hyp/Makefile
@@ -3,7 +3,7 @@
# Makefile for Kernel-based Virtual Machine module, HYP part
#
-incdir := $(srctree)/$(src)/include
+incdir := $(src)/include
subdir-asflags-y := -I$(incdir)
subdir-ccflags-y := -I$(incdir)
diff --git a/arch/arm64/kvm/hyp/aarch32.c b/arch/arm64/kvm/hyp/aarch32.c
index 8d9670e6615d..449fa58cf3b6 100644
--- a/arch/arm64/kvm/hyp/aarch32.c
+++ b/arch/arm64/kvm/hyp/aarch32.c
@@ -50,9 +50,23 @@ bool kvm_condition_valid32(const struct kvm_vcpu *vcpu)
u32 cpsr_cond;
int cond;
- /* Top two bits non-zero? Unconditional. */
- if (kvm_vcpu_get_esr(vcpu) >> 30)
+ /*
+ * These are the exception classes that could fire with a
+ * conditional instruction.
+ */
+ switch (kvm_vcpu_trap_get_class(vcpu)) {
+ case ESR_ELx_EC_CP15_32:
+ case ESR_ELx_EC_CP15_64:
+ case ESR_ELx_EC_CP14_MR:
+ case ESR_ELx_EC_CP14_LS:
+ case ESR_ELx_EC_FP_ASIMD:
+ case ESR_ELx_EC_CP10_ID:
+ case ESR_ELx_EC_CP14_64:
+ case ESR_ELx_EC_SVC32:
+ break;
+ default:
return true;
+ }
/* Is condition field valid? */
cond = kvm_vcpu_get_condition(vcpu);
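Editorial aside: the aarch32.c change above replaces the old shortcut (top two ESR bits non-zero means unconditional) with an explicit list of exception classes that can carry a condition code. For reference, when the CV bit (ESR_ELx[24]) is set, the condition itself sits in ESR_ELx[23:20]; a minimal sketch of that extraction, for illustration only:

/*
 * Illustrative only: extract the AArch32 condition code reported in
 * the ESR ISS when the CV (condition valid) bit is set.
 */
static int esr_aarch32_cond(unsigned long long esr)
{
	if (!(esr & (1ULL << 24)))	/* CV clear: no condition reported */
		return -1;

	return (esr >> 20) & 0xf;	/* COND field */
}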
diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S
index f3aa7738b477..9f4e8d68ab50 100644
--- a/arch/arm64/kvm/hyp/entry.S
+++ b/arch/arm64/kvm/hyp/entry.S
@@ -44,6 +44,11 @@ alternative_if ARM64_HAS_RAS_EXTN
alternative_else_nop_endif
mrs x1, isr_el1
cbz x1, 1f
+
+ // Ensure that __guest_enter() always provides a context
+ // synchronization event so that callers don't need ISBs for anything
+ // that would usually be synchronized by the ERET.
+ isb
mov x0, #ARM_EXCEPTION_IRQ
ret
@@ -83,6 +88,14 @@ alternative_else_nop_endif
eret
sb
+SYM_INNER_LABEL(__guest_exit_restore_elr_and_panic, SYM_L_GLOBAL)
+ // x2-x29,lr: vcpu regs
+ // vcpu x0-x1 on the stack
+
+ adr_this_cpu x0, kvm_hyp_ctxt, x1
+ ldr x0, [x0, #CPU_ELR_EL2]
+ msr elr_el2, x0
+
SYM_INNER_LABEL(__guest_exit_panic, SYM_L_GLOBAL)
// x2-x29,lr: vcpu regs
// vcpu x0-x1 on the stack
diff --git a/arch/arm64/kvm/hyp/fpsimd.S b/arch/arm64/kvm/hyp/fpsimd.S
index 61e6f3ba7b7d..e950875e31ce 100644
--- a/arch/arm64/kvm/hyp/fpsimd.S
+++ b/arch/arm64/kvm/hyp/fpsimd.S
@@ -25,3 +25,9 @@ SYM_FUNC_START(__sve_restore_state)
sve_load 0, x1, x2, 3
ret
SYM_FUNC_END(__sve_restore_state)
+
+SYM_FUNC_START(__sve_save_state)
+ mov x2, #1
+ sve_save 0, x1, x2, 3
+ ret
+SYM_FUNC_END(__sve_save_state)
diff --git a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h
index 961bbef104a6..502a5b73ee70 100644
--- a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h
+++ b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h
@@ -88,15 +88,26 @@
default: write_debug(ptr[0], reg, 0); \
}
+static struct kvm_guest_debug_arch *__vcpu_debug_regs(struct kvm_vcpu *vcpu)
+{
+ switch (vcpu->arch.debug_owner) {
+ case VCPU_DEBUG_FREE:
+ WARN_ON_ONCE(1);
+ fallthrough;
+ case VCPU_DEBUG_GUEST_OWNED:
+ return &vcpu->arch.vcpu_debug_state;
+ case VCPU_DEBUG_HOST_OWNED:
+ return &vcpu->arch.external_debug_state;
+ }
+
+ return NULL;
+}
+
static void __debug_save_state(struct kvm_guest_debug_arch *dbg,
struct kvm_cpu_context *ctxt)
{
- u64 aa64dfr0;
- int brps, wrps;
-
- aa64dfr0 = read_sysreg(id_aa64dfr0_el1);
- brps = (aa64dfr0 >> 12) & 0xf;
- wrps = (aa64dfr0 >> 20) & 0xf;
+ int brps = *host_data_ptr(debug_brps);
+ int wrps = *host_data_ptr(debug_wrps);
save_debug(dbg->dbg_bcr, dbgbcr, brps);
save_debug(dbg->dbg_bvr, dbgbvr, brps);
@@ -109,13 +120,8 @@ static void __debug_save_state(struct kvm_guest_debug_arch *dbg,
static void __debug_restore_state(struct kvm_guest_debug_arch *dbg,
struct kvm_cpu_context *ctxt)
{
- u64 aa64dfr0;
- int brps, wrps;
-
- aa64dfr0 = read_sysreg(id_aa64dfr0_el1);
-
- brps = (aa64dfr0 >> 12) & 0xf;
- wrps = (aa64dfr0 >> 20) & 0xf;
+ int brps = *host_data_ptr(debug_brps);
+ int wrps = *host_data_ptr(debug_wrps);
restore_debug(dbg->dbg_bcr, dbgbcr, brps);
restore_debug(dbg->dbg_bvr, dbgbvr, brps);
@@ -132,13 +138,13 @@ static inline void __debug_switch_to_guest_common(struct kvm_vcpu *vcpu)
struct kvm_guest_debug_arch *host_dbg;
struct kvm_guest_debug_arch *guest_dbg;
- if (!vcpu_get_flag(vcpu, DEBUG_DIRTY))
+ if (!kvm_debug_regs_in_use(vcpu))
return;
- host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
+ host_ctxt = host_data_ptr(host_ctxt);
guest_ctxt = &vcpu->arch.ctxt;
- host_dbg = &vcpu->arch.host_debug_state.regs;
- guest_dbg = kern_hyp_va(vcpu->arch.debug_ptr);
+ host_dbg = host_data_ptr(host_debug_state.regs);
+ guest_dbg = __vcpu_debug_regs(vcpu);
__debug_save_state(host_dbg, host_ctxt);
__debug_restore_state(guest_dbg, guest_ctxt);
@@ -151,18 +157,16 @@ static inline void __debug_switch_to_host_common(struct kvm_vcpu *vcpu)
struct kvm_guest_debug_arch *host_dbg;
struct kvm_guest_debug_arch *guest_dbg;
- if (!vcpu_get_flag(vcpu, DEBUG_DIRTY))
+ if (!kvm_debug_regs_in_use(vcpu))
return;
- host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
+ host_ctxt = host_data_ptr(host_ctxt);
guest_ctxt = &vcpu->arch.ctxt;
- host_dbg = &vcpu->arch.host_debug_state.regs;
- guest_dbg = kern_hyp_va(vcpu->arch.debug_ptr);
+ host_dbg = host_data_ptr(host_debug_state.regs);
+ guest_dbg = __vcpu_debug_regs(vcpu);
__debug_save_state(guest_dbg, guest_ctxt);
__debug_restore_state(host_dbg, host_ctxt);
-
- vcpu_clear_flag(vcpu, DEBUG_DIRTY);
}
#endif /* __ARM64_KVM_HYP_DEBUG_SR_H__ */
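Editorial aside on the brps/wrps values now cached in host data above: ID_AA64DFR0_EL1 encodes the number of implemented breakpoints minus one in bits [15:12] and watchpoints minus one in bits [23:20], which is why the save/restore macros walk registers 0..N inclusive. A minimal sketch of the field extraction, shown only to make the removed open-coded shifts easier to follow:

/*
 * Illustrative only: ID_AA64DFR0_EL1.BRPs (bits [15:12]) and .WRPs
 * (bits [23:20]) hold "number implemented minus one".
 */
static unsigned int dfr0_brps(unsigned long long dfr0)
{
	return (dfr0 >> 12) & 0xf;
}

static unsigned int dfr0_wrps(unsigned long long dfr0)
{
	return (dfr0 >> 20) & 0xf;
}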
diff --git a/arch/arm64/kvm/hyp/include/hyp/fault.h b/arch/arm64/kvm/hyp/include/hyp/fault.h
index 9e13c1bc2ad5..17df94570f03 100644
--- a/arch/arm64/kvm/hyp/include/hyp/fault.h
+++ b/arch/arm64/kvm/hyp/include/hyp/fault.h
@@ -14,6 +14,7 @@
static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar)
{
+ int ret;
u64 par, tmp;
/*
@@ -27,7 +28,9 @@ static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar)
* saved the guest context yet, and we may return early...
*/
par = read_sysreg_par();
- if (!__kvm_at("s1e1r", far))
+ ret = system_supports_poe() ? __kvm_at(OP_AT_S1E1A, far) :
+ __kvm_at(OP_AT_S1E1R, far);
+ if (!ret)
tmp = read_sysreg_par();
else
tmp = SYS_PAR_EL1_F; /* back to the guest */
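Editorial aside: the fault.h change above switches to AT S1E1A when FEAT_S1POE is present, but the success check is unchanged: after an AT S1E1* operation, PAR_EL1 bit 0 (the F bit) reports whether the walk faulted. A trivial illustration, not the kernel's helper:

/*
 * Illustrative only: PAR_EL1.F (bit 0) is set when the preceding
 * address translation instruction faulted; only when it is clear do
 * the upper bits hold a valid output address.
 */
static inline int par_faulted(unsigned long long par)
{
	return par & 1ULL;
}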
diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
index e3fcf8c4d5b4..b741ea6aefa5 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -39,12 +39,6 @@ struct kvm_exception_table_entry {
extern struct kvm_exception_table_entry __start___kvm_ex_table;
extern struct kvm_exception_table_entry __stop___kvm_ex_table;
-/* Check whether the FP regs are owned by the guest */
-static inline bool guest_owns_fp_regs(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.fp_state == FP_STATE_GUEST_OWNED;
-}
-
/* Save the 32-bit only FPSIMD system register state */
static inline void __fpsimd_save_fpexc32(struct kvm_vcpu *vcpu)
{
@@ -155,7 +149,7 @@ static inline bool cpu_has_amu(void)
static inline void __activate_traps_hfgxtr(struct kvm_vcpu *vcpu)
{
- struct kvm_cpu_context *hctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
+ struct kvm_cpu_context *hctxt = host_data_ptr(host_ctxt);
struct kvm *kvm = kern_hyp_va(vcpu->kvm);
CHECK_FGT_MASKS(HFGRTR_EL2);
@@ -191,7 +185,7 @@ static inline void __activate_traps_hfgxtr(struct kvm_vcpu *vcpu)
static inline void __deactivate_traps_hfgxtr(struct kvm_vcpu *vcpu)
{
- struct kvm_cpu_context *hctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
+ struct kvm_cpu_context *hctxt = host_data_ptr(host_ctxt);
struct kvm *kvm = kern_hyp_va(vcpu->kvm);
if (!cpus_have_final_cap(ARM64_HAS_FGT))
@@ -210,6 +204,35 @@ static inline void __deactivate_traps_hfgxtr(struct kvm_vcpu *vcpu)
__deactivate_fgt(hctxt, vcpu, kvm, HAFGRTR_EL2);
}
+static inline void __activate_traps_mpam(struct kvm_vcpu *vcpu)
+{
+ u64 r = MPAM2_EL2_TRAPMPAM0EL1 | MPAM2_EL2_TRAPMPAM1EL1;
+
+ if (!system_supports_mpam())
+ return;
+
+ /* trap guest access to MPAMIDR_EL1 */
+ if (system_supports_mpam_hcr()) {
+ write_sysreg_s(MPAMHCR_EL2_TRAP_MPAMIDR_EL1, SYS_MPAMHCR_EL2);
+ } else {
+ /* From v1.1 TIDR can trap MPAMIDR, set it unconditionally */
+ r |= MPAM2_EL2_TIDR;
+ }
+
+ write_sysreg_s(r, SYS_MPAM2_EL2);
+}
+
+static inline void __deactivate_traps_mpam(void)
+{
+ if (!system_supports_mpam())
+ return;
+
+ write_sysreg_s(0, SYS_MPAM2_EL2);
+
+ if (system_supports_mpam_hcr())
+ write_sysreg_s(MPAMHCR_HOST_FLAGS, SYS_MPAMHCR_EL2);
+}
+
static inline void __activate_traps_common(struct kvm_vcpu *vcpu)
{
/* Trap on AArch32 cp15 c15 (impdef sysregs) accesses (EL1 or EL0) */
@@ -221,18 +244,18 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu)
* counter, which could make a PMXEVCNTR_EL0 access UNDEF at
* EL1 instead of being trapped to EL2.
*/
- if (kvm_arm_support_pmu_v3()) {
+ if (system_supports_pmuv3()) {
struct kvm_cpu_context *hctxt;
write_sysreg(0, pmselr_el0);
- hctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
+ hctxt = host_data_ptr(host_ctxt);
ctxt_sys_reg(hctxt, PMUSERENR_EL0) = read_sysreg(pmuserenr_el0);
write_sysreg(ARMV8_PMU_USERENR_MASK, pmuserenr_el0);
vcpu_set_flag(vcpu, PMUSERENR_ON_CPU);
}
- vcpu->arch.mdcr_el2_host = read_sysreg(mdcr_el2);
+ *host_data_ptr(host_debug_state.mdcr_el2) = read_sysreg(mdcr_el2);
write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2);
if (cpus_have_final_cap(ARM64_HAS_HCX)) {
@@ -250,17 +273,18 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu)
}
__activate_traps_hfgxtr(vcpu);
+ __activate_traps_mpam(vcpu);
}
static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu)
{
- write_sysreg(vcpu->arch.mdcr_el2_host, mdcr_el2);
+ write_sysreg(*host_data_ptr(host_debug_state.mdcr_el2), mdcr_el2);
write_sysreg(0, hstr_el2);
- if (kvm_arm_support_pmu_v3()) {
+ if (system_supports_pmuv3()) {
struct kvm_cpu_context *hctxt;
- hctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
+ hctxt = host_data_ptr(host_ctxt);
write_sysreg(ctxt_sys_reg(hctxt, PMUSERENR_EL0), pmuserenr_el0);
vcpu_clear_flag(vcpu, PMUSERENR_ON_CPU);
}
@@ -269,12 +293,11 @@ static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu)
write_sysreg_s(HCRX_HOST_FLAGS, SYS_HCRX_EL2);
__deactivate_traps_hfgxtr(vcpu);
+ __deactivate_traps_mpam();
}
-static inline void ___activate_traps(struct kvm_vcpu *vcpu)
+static inline void ___activate_traps(struct kvm_vcpu *vcpu, u64 hcr)
{
- u64 hcr = vcpu->arch.hcr_el2;
-
if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_TX2_219_TVM))
hcr |= HCR_TVM;
@@ -303,7 +326,7 @@ static inline bool __populate_fault_info(struct kvm_vcpu *vcpu)
return __get_fault_info(vcpu->arch.fault.esr_el2, &vcpu->arch.fault);
}
-static bool kvm_hyp_handle_mops(struct kvm_vcpu *vcpu, u64 *exit_code)
+static inline bool kvm_hyp_handle_mops(struct kvm_vcpu *vcpu, u64 *exit_code)
{
*vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR);
arm64_mops_reset_regs(vcpu_gp_regs(vcpu), vcpu->arch.fault.esr_el2);
@@ -321,23 +344,129 @@ static bool kvm_hyp_handle_mops(struct kvm_vcpu *vcpu, u64 *exit_code)
static inline void __hyp_sve_restore_guest(struct kvm_vcpu *vcpu)
{
+ /*
+ * The vCPU's saved SVE state layout always matches the max VL of the
+ * vCPU. Start off with the max VL so we can load the SVE state.
+ */
sve_cond_update_zcr_vq(vcpu_sve_max_vq(vcpu) - 1, SYS_ZCR_EL2);
__sve_restore_state(vcpu_sve_pffr(vcpu),
- &vcpu->arch.ctxt.fp_regs.fpsr);
- write_sysreg_el1(__vcpu_sys_reg(vcpu, ZCR_EL1), SYS_ZCR);
+ &vcpu->arch.ctxt.fp_regs.fpsr,
+ true);
+
+ /*
+ * The effective VL for a VM could differ from the max VL when running a
+ * nested guest, as the guest hypervisor could select a smaller VL. Slap
+ * that into hardware before wrapping up.
+ */
+ if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu))
+ sve_cond_update_zcr_vq(__vcpu_sys_reg(vcpu, ZCR_EL2), SYS_ZCR_EL2);
+
+ write_sysreg_el1(__vcpu_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu)), SYS_ZCR);
+}
+
+static inline void __hyp_sve_save_host(void)
+{
+ struct cpu_sve_state *sve_state = *host_data_ptr(sve_state);
+
+ sve_state->zcr_el1 = read_sysreg_el1(SYS_ZCR);
+ write_sysreg_s(sve_vq_from_vl(kvm_host_sve_max_vl) - 1, SYS_ZCR_EL2);
+ __sve_save_state(sve_state->sve_regs + sve_ffr_offset(kvm_host_sve_max_vl),
+ &sve_state->fpsr,
+ true);
+}
+
+static inline void fpsimd_lazy_switch_to_guest(struct kvm_vcpu *vcpu)
+{
+ u64 zcr_el1, zcr_el2;
+
+ if (!guest_owns_fp_regs())
+ return;
+
+ if (vcpu_has_sve(vcpu)) {
+ /* A guest hypervisor may restrict the effective max VL. */
+ if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu))
+ zcr_el2 = __vcpu_sys_reg(vcpu, ZCR_EL2);
+ else
+ zcr_el2 = vcpu_sve_max_vq(vcpu) - 1;
+
+ write_sysreg_el2(zcr_el2, SYS_ZCR);
+
+ zcr_el1 = __vcpu_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu));
+ write_sysreg_el1(zcr_el1, SYS_ZCR);
+ }
+}
+
+static inline void fpsimd_lazy_switch_to_host(struct kvm_vcpu *vcpu)
+{
+ u64 zcr_el1, zcr_el2;
+
+ if (!guest_owns_fp_regs())
+ return;
+
+ /*
+ * When the guest owns the FP regs, we know that guest+hyp traps for
+ * any FPSIMD/SVE/SME features exposed to the guest have been disabled
+ * by either fpsimd_lazy_switch_to_guest() or kvm_hyp_handle_fpsimd()
+ * prior to __guest_entry(). As __guest_entry() guarantees a context
+ * synchronization event, we don't need an ISB here to avoid taking
+ * traps for anything that was exposed to the guest.
+ */
+ if (vcpu_has_sve(vcpu)) {
+ zcr_el1 = read_sysreg_el1(SYS_ZCR);
+ __vcpu_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu)) = zcr_el1;
+
+ /*
+ * The guest's state is always saved using the guest's max VL.
+ * Ensure that the host has the guest's max VL active such that
+ * the host can save the guest's state lazily, but don't
+ * artificially restrict the host to the guest's max VL.
+ */
+ if (has_vhe()) {
+ zcr_el2 = vcpu_sve_max_vq(vcpu) - 1;
+ write_sysreg_el2(zcr_el2, SYS_ZCR);
+ } else {
+ zcr_el2 = sve_vq_from_vl(kvm_host_sve_max_vl) - 1;
+ write_sysreg_el2(zcr_el2, SYS_ZCR);
+
+ zcr_el1 = vcpu_sve_max_vq(vcpu) - 1;
+ write_sysreg_el1(zcr_el1, SYS_ZCR);
+ }
+ }
+}
+
+static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Non-protected kvm relies on the host restoring its sve state.
+ * Protected kvm restores the host's sve state itself, so as not to
+ * reveal that fpsimd was used by a guest or to leak upper sve bits.
+ */
+ if (system_supports_sve()) {
+ __hyp_sve_save_host();
+
+ /* Re-enable SVE traps if not supported for the guest vcpu. */
+ if (!vcpu_has_sve(vcpu))
+ cpacr_clear_set(CPACR_EL1_ZEN, 0);
+
+ } else {
+ __fpsimd_save_state(host_data_ptr(host_ctxt.fp_regs));
+ }
+
+ if (kvm_has_fpmr(kern_hyp_va(vcpu->kvm)))
+ *host_data_ptr(fpmr) = read_sysreg_s(SYS_FPMR);
}
+
/*
* We trap the first access to the FP/SIMD to save the host context and
* restore the guest context lazily.
* If FP/SIMD is not implemented, handle the trap and inject an undefined
* instruction exception to the guest. Similarly for trapped SVE accesses.
*/
-static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code)
+static inline bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code)
{
bool sve_guest;
u8 esr_ec;
- u64 reg;
if (!system_supports_fpsimd())
return false;
@@ -348,10 +477,19 @@ static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code)
/* Only handle traps the vCPU can support here: */
switch (esr_ec) {
case ESR_ELx_EC_FP_ASIMD:
+ /* Forward traps to the guest hypervisor as required */
+ if (guest_hyp_fpsimd_traps_enabled(vcpu))
+ return false;
break;
+ case ESR_ELx_EC_SYS64:
+ if (WARN_ON_ONCE(!is_hyp_ctxt(vcpu)))
+ return false;
+ fallthrough;
case ESR_ELx_EC_SVE:
if (!sve_guest)
return false;
+ if (guest_hyp_sve_traps_enabled(vcpu))
+ return false;
break;
default:
return false;
@@ -360,24 +498,15 @@ static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code)
/* Valid trap. Switch the context: */
/* First disable enough traps to allow us to update the registers */
- if (has_vhe() || has_hvhe()) {
- reg = CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN;
- if (sve_guest)
- reg |= CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN;
-
- sysreg_clear_set(cpacr_el1, 0, reg);
- } else {
- reg = CPTR_EL2_TFP;
- if (sve_guest)
- reg |= CPTR_EL2_TZ;
-
- sysreg_clear_set(cptr_el2, reg, 0);
- }
+ if (sve_guest || (is_protected_kvm_enabled() && system_supports_sve()))
+ cpacr_clear_set(0, CPACR_EL1_FPEN | CPACR_EL1_ZEN);
+ else
+ cpacr_clear_set(0, CPACR_EL1_FPEN);
isb();
/* Write out the host state if it's in the registers */
- if (vcpu->arch.fp_state == FP_STATE_HOST_OWNED)
- __fpsimd_save_state(vcpu->arch.host_fpsimd_state);
+ if (is_protected_kvm_enabled() && host_owns_fp_regs())
+ kvm_hyp_save_fpsimd_host(vcpu);
/* Restore the guest state */
if (sve_guest)
@@ -385,11 +514,14 @@ static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code)
else
__fpsimd_restore_state(&vcpu->arch.ctxt.fp_regs);
+ if (kvm_has_fpmr(kern_hyp_va(vcpu->kvm)))
+ write_sysreg_s(__vcpu_sys_reg(vcpu, FPMR), SYS_FPMR);
+
/* Skip restoring fpexc32 for AArch64 guests */
if (!(read_sysreg(hcr_el2) & HCR_RW))
write_sysreg(__vcpu_sys_reg(vcpu, FPEXC32_EL2), fpexc32_el2);
- vcpu->arch.fp_state = FP_STATE_GUEST_OWNED;
+ *host_data_ptr(fp_owner) = FP_STATE_GUEST_OWNED;
return true;
}
@@ -449,61 +581,25 @@ static inline bool handle_tx2_tvm(struct kvm_vcpu *vcpu)
return true;
}
-static inline bool esr_is_ptrauth_trap(u64 esr)
+/* Open-coded version of timer_get_offset() to allow for kern_hyp_va() */
+static inline u64 hyp_timer_get_offset(struct arch_timer_context *ctxt)
{
- switch (esr_sys64_to_sysreg(esr)) {
- case SYS_APIAKEYLO_EL1:
- case SYS_APIAKEYHI_EL1:
- case SYS_APIBKEYLO_EL1:
- case SYS_APIBKEYHI_EL1:
- case SYS_APDAKEYLO_EL1:
- case SYS_APDAKEYHI_EL1:
- case SYS_APDBKEYLO_EL1:
- case SYS_APDBKEYHI_EL1:
- case SYS_APGAKEYLO_EL1:
- case SYS_APGAKEYHI_EL1:
- return true;
- }
-
- return false;
-}
+ u64 offset = 0;
-#define __ptrauth_save_key(ctxt, key) \
- do { \
- u64 __val; \
- __val = read_sysreg_s(SYS_ ## key ## KEYLO_EL1); \
- ctxt_sys_reg(ctxt, key ## KEYLO_EL1) = __val; \
- __val = read_sysreg_s(SYS_ ## key ## KEYHI_EL1); \
- ctxt_sys_reg(ctxt, key ## KEYHI_EL1) = __val; \
-} while(0)
+ if (ctxt->offset.vm_offset)
+ offset += *kern_hyp_va(ctxt->offset.vm_offset);
+ if (ctxt->offset.vcpu_offset)
+ offset += *kern_hyp_va(ctxt->offset.vcpu_offset);
-DECLARE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
+ return offset;
+}
-static bool kvm_hyp_handle_ptrauth(struct kvm_vcpu *vcpu, u64 *exit_code)
+static inline u64 compute_counter_value(struct arch_timer_context *ctxt)
{
- struct kvm_cpu_context *ctxt;
- u64 val;
-
- if (!vcpu_has_ptrauth(vcpu))
- return false;
-
- ctxt = this_cpu_ptr(&kvm_hyp_ctxt);
- __ptrauth_save_key(ctxt, APIA);
- __ptrauth_save_key(ctxt, APIB);
- __ptrauth_save_key(ctxt, APDA);
- __ptrauth_save_key(ctxt, APDB);
- __ptrauth_save_key(ctxt, APGA);
-
- vcpu_ptrauth_enable(vcpu);
-
- val = read_sysreg(hcr_el2);
- val |= (HCR_API | HCR_APK);
- write_sysreg(val, hcr_el2);
-
- return true;
+ return arch_timer_read_cntpct_el0() - hyp_timer_get_offset(ctxt);
}
-static bool kvm_hyp_handle_cntpct(struct kvm_vcpu *vcpu)
+static bool kvm_handle_cntxct(struct kvm_vcpu *vcpu)
{
struct arch_timer_context *ctxt;
u32 sysreg;
@@ -513,18 +609,19 @@ static bool kvm_hyp_handle_cntpct(struct kvm_vcpu *vcpu)
* We only get here for 64bit guests, 32bit guests will hit
* the long and winding road all the way to the standard
* handling. Yes, it sucks to be irrelevant.
+ *
+ * Also, we only deal with non-hypervisor context here (either
+ * an EL1 guest, or a non-HYP context of an EL2 guest).
*/
+ if (is_hyp_ctxt(vcpu))
+ return false;
+
sysreg = esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu));
switch (sysreg) {
case SYS_CNTPCT_EL0:
case SYS_CNTPCTSS_EL0:
if (vcpu_has_nv(vcpu)) {
- if (is_hyp_ctxt(vcpu)) {
- ctxt = vcpu_hptimer(vcpu);
- break;
- }
-
/* Check for guest hypervisor trapping */
val = __vcpu_sys_reg(vcpu, CNTHCTL_EL2);
if (!vcpu_el2_e2h_is_set(vcpu))
@@ -536,16 +633,23 @@ static bool kvm_hyp_handle_cntpct(struct kvm_vcpu *vcpu)
ctxt = vcpu_ptimer(vcpu);
break;
+ case SYS_CNTVCT_EL0:
+ case SYS_CNTVCTSS_EL0:
+ if (vcpu_has_nv(vcpu)) {
+ /* Check for guest hypervisor trapping */
+ val = __vcpu_sys_reg(vcpu, CNTHCTL_EL2);
+
+ if (val & CNTHCTL_EL1TVCT)
+ return false;
+ }
+
+ ctxt = vcpu_vtimer(vcpu);
+ break;
default:
return false;
}
- val = arch_timer_read_cntpct_el0();
-
- if (ctxt->offset.vm_offset)
- val -= *kern_hyp_va(ctxt->offset.vm_offset);
- if (ctxt->offset.vcpu_offset)
- val -= *kern_hyp_va(ctxt->offset.vcpu_offset);
+ val = compute_counter_value(ctxt);
vcpu_set_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu), val);
__kvm_skip_instr(vcpu);
@@ -576,7 +680,7 @@ static bool handle_ampere1_tcr(struct kvm_vcpu *vcpu)
return true;
}
-static bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code)
+static inline bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code)
{
if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_TX2_219_TVM) &&
handle_tx2_tvm(vcpu))
@@ -590,16 +694,13 @@ static bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code)
__vgic_v3_perform_cpuif_access(vcpu) == 1)
return true;
- if (esr_is_ptrauth_trap(kvm_vcpu_get_esr(vcpu)))
- return kvm_hyp_handle_ptrauth(vcpu, exit_code);
-
- if (kvm_hyp_handle_cntpct(vcpu))
+ if (kvm_handle_cntxct(vcpu))
return true;
return false;
}
-static bool kvm_hyp_handle_cp15_32(struct kvm_vcpu *vcpu, u64 *exit_code)
+static inline bool kvm_hyp_handle_cp15_32(struct kvm_vcpu *vcpu, u64 *exit_code)
{
if (static_branch_unlikely(&vgic_v3_cpuif_trap) &&
__vgic_v3_perform_cpuif_access(vcpu) == 1)
@@ -608,19 +709,18 @@ static bool kvm_hyp_handle_cp15_32(struct kvm_vcpu *vcpu, u64 *exit_code)
return false;
}
-static bool kvm_hyp_handle_memory_fault(struct kvm_vcpu *vcpu, u64 *exit_code)
+static inline bool kvm_hyp_handle_memory_fault(struct kvm_vcpu *vcpu,
+ u64 *exit_code)
{
if (!__populate_fault_info(vcpu))
return true;
return false;
}
-static bool kvm_hyp_handle_iabt_low(struct kvm_vcpu *vcpu, u64 *exit_code)
- __alias(kvm_hyp_handle_memory_fault);
-static bool kvm_hyp_handle_watchpt_low(struct kvm_vcpu *vcpu, u64 *exit_code)
- __alias(kvm_hyp_handle_memory_fault);
+#define kvm_hyp_handle_iabt_low kvm_hyp_handle_memory_fault
+#define kvm_hyp_handle_watchpt_low kvm_hyp_handle_memory_fault
-static bool kvm_hyp_handle_dabt_low(struct kvm_vcpu *vcpu, u64 *exit_code)
+static inline bool kvm_hyp_handle_dabt_low(struct kvm_vcpu *vcpu, u64 *exit_code)
{
if (kvm_hyp_handle_memory_fault(vcpu, exit_code))
return true;
@@ -650,23 +750,16 @@ static bool kvm_hyp_handle_dabt_low(struct kvm_vcpu *vcpu, u64 *exit_code)
typedef bool (*exit_handler_fn)(struct kvm_vcpu *, u64 *);
-static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu);
-
-static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code);
-
/*
* Allow the hypervisor to handle the exit with an exit handler if it has one.
*
* Returns true if the hypervisor handled the exit, and control should go back
* to the guest, or false if it hasn't.
*/
-static inline bool kvm_hyp_handle_exit(struct kvm_vcpu *vcpu, u64 *exit_code)
+static inline bool kvm_hyp_handle_exit(struct kvm_vcpu *vcpu, u64 *exit_code,
+ const exit_handler_fn *handlers)
{
- const exit_handler_fn *handlers = kvm_get_exit_handler_array(vcpu);
- exit_handler_fn fn;
-
- fn = handlers[kvm_vcpu_trap_get_class(vcpu)];
-
+ exit_handler_fn fn = handlers[kvm_vcpu_trap_get_class(vcpu)];
if (fn)
return fn(vcpu, exit_code);
@@ -696,20 +789,9 @@ static inline void synchronize_vcpu_pstate(struct kvm_vcpu *vcpu, u64 *exit_code
* the guest, false when we should restore the host state and return to the
* main run loop.
*/
-static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code)
+static inline bool __fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code,
+ const exit_handler_fn *handlers)
{
- /*
- * Save PSTATE early so that we can evaluate the vcpu mode
- * early on.
- */
- synchronize_vcpu_pstate(vcpu, exit_code);
-
- /*
- * Check whether we want to repaint the state one way or
- * another.
- */
- early_exit_filter(vcpu, exit_code);
-
if (ARM_EXCEPTION_CODE(*exit_code) != ARM_EXCEPTION_IRQ)
vcpu->arch.fault.esr_el2 = read_sysreg_el2(SYS_ESR);
@@ -739,7 +821,7 @@ static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code)
goto exit;
/* Check if there's an exit handler and allow it to handle the exit. */
- if (kvm_hyp_handle_exit(vcpu, exit_code))
+ if (kvm_hyp_handle_exit(vcpu, exit_code, handlers))
goto guest;
exit:
/* Return to the host kernel and handle the exit */
@@ -753,7 +835,7 @@ guest:
static inline void __kvm_unexpected_el2_exception(void)
{
- extern char __guest_exit_panic[];
+ extern char __guest_exit_restore_elr_and_panic[];
unsigned long addr, fixup;
struct kvm_exception_table_entry *entry, *end;
unsigned long elr_el2 = read_sysreg(elr_el2);
@@ -775,7 +857,8 @@ static inline void __kvm_unexpected_el2_exception(void)
}
/* Trigger a panic after restoring the hyp context. */
- write_sysreg(__guest_exit_panic, elr_el2);
+ this_cpu_ptr(&kvm_hyp_ctxt)->sys_regs[ELR_EL2] = elr_el2;
+ write_sysreg(__guest_exit_restore_elr_and_panic, elr_el2);
}
#endif /* __ARM64_KVM_HYP_SWITCH_H__ */
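/*
 * Illustrative sketch (not from the patch): a standalone, compilable model
 * of the refactor above, where kvm_hyp_handle_exit() takes an explicit
 * handler table rather than looking one up, presumably so the VHE and nVHE
 * switch code can each pass their own table. Every name below (vcpu_ctx,
 * EC_*, example_handlers, ...) is invented for the example.
 */
#include <stdbool.h>
#include <stdio.h>

enum exc_class { EC_WFX, EC_SYSREG, EC_DABT, EC_MAX };

struct vcpu_ctx { enum exc_class ec; };

typedef bool (*exit_handler_fn)(struct vcpu_ctx *, unsigned long *);

static bool handle_sysreg(struct vcpu_ctx *v, unsigned long *code)
{
	(void)v; (void)code;
	printf("sysreg trap handled at EL2\n");
	return true;				/* resume the guest */
}

/* Each mode supplies its own table, indexed by exception class. */
static const exit_handler_fn example_handlers[EC_MAX] = {
	[EC_SYSREG]	= handle_sysreg,
};

static bool handle_exit(struct vcpu_ctx *v, unsigned long *code,
			const exit_handler_fn *handlers)
{
	exit_handler_fn fn = handlers[v->ec];

	return fn ? fn(v, code) : false;	/* false: back to the host */
}

int main(void)
{
	struct vcpu_ctx v = { .ec = EC_SYSREG };
	unsigned long code = 0;

	return handle_exit(&v, &code, example_handlers) ? 0 : 1;
}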
diff --git a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
index 4be6a7fa0070..b9cff893bbe0 100644
--- a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
+++ b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
@@ -16,16 +16,7 @@
#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>
-static inline void __sysreg_save_common_state(struct kvm_cpu_context *ctxt)
-{
- ctxt_sys_reg(ctxt, MDSCR_EL1) = read_sysreg(mdscr_el1);
-}
-
-static inline void __sysreg_save_user_state(struct kvm_cpu_context *ctxt)
-{
- ctxt_sys_reg(ctxt, TPIDR_EL0) = read_sysreg(tpidr_el0);
- ctxt_sys_reg(ctxt, TPIDRRO_EL0) = read_sysreg(tpidrro_el0);
-}
+static inline bool ctxt_has_s1poe(struct kvm_cpu_context *ctxt);
static inline struct kvm_vcpu *ctxt_to_vcpu(struct kvm_cpu_context *ctxt)
{
@@ -37,6 +28,47 @@ static inline struct kvm_vcpu *ctxt_to_vcpu(struct kvm_cpu_context *ctxt)
return vcpu;
}
+static inline bool ctxt_is_guest(struct kvm_cpu_context *ctxt)
+{
+ return host_data_ptr(host_ctxt) != ctxt;
+}
+
+static inline u64 *ctxt_mdscr_el1(struct kvm_cpu_context *ctxt)
+{
+ struct kvm_vcpu *vcpu = ctxt_to_vcpu(ctxt);
+
+ if (ctxt_is_guest(ctxt) && kvm_host_owns_debug_regs(vcpu))
+ return &vcpu->arch.external_mdscr_el1;
+
+ return &ctxt_sys_reg(ctxt, MDSCR_EL1);
+}
+
+static inline u64 ctxt_midr_el1(struct kvm_cpu_context *ctxt)
+{
+ struct kvm *kvm = kern_hyp_va(ctxt_to_vcpu(ctxt)->kvm);
+
+ if (!(ctxt_is_guest(ctxt) &&
+ test_bit(KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS, &kvm->arch.flags)))
+ return read_cpuid_id();
+
+ return kvm_read_vm_id_reg(kvm, SYS_MIDR_EL1);
+}
+
+static inline void __sysreg_save_common_state(struct kvm_cpu_context *ctxt)
+{
+ *ctxt_mdscr_el1(ctxt) = read_sysreg(mdscr_el1);
+
+ // POR_EL0 can affect uaccess, so must be saved/restored early.
+ if (ctxt_has_s1poe(ctxt))
+ ctxt_sys_reg(ctxt, POR_EL0) = read_sysreg_s(SYS_POR_EL0);
+}
+
+static inline void __sysreg_save_user_state(struct kvm_cpu_context *ctxt)
+{
+ ctxt_sys_reg(ctxt, TPIDR_EL0) = read_sysreg(tpidr_el0);
+ ctxt_sys_reg(ctxt, TPIDRRO_EL0) = read_sysreg(tpidrro_el0);
+}
+
static inline bool ctxt_has_mte(struct kvm_cpu_context *ctxt)
{
struct kvm_vcpu *vcpu = ctxt_to_vcpu(ctxt);
@@ -52,7 +84,29 @@ static inline bool ctxt_has_s1pie(struct kvm_cpu_context *ctxt)
return false;
vcpu = ctxt_to_vcpu(ctxt);
- return kvm_has_feat(kern_hyp_va(vcpu->kvm), ID_AA64MMFR3_EL1, S1PIE, IMP);
+ return kvm_has_s1pie(kern_hyp_va(vcpu->kvm));
+}
+
+static inline bool ctxt_has_tcrx(struct kvm_cpu_context *ctxt)
+{
+ struct kvm_vcpu *vcpu;
+
+ if (!cpus_have_final_cap(ARM64_HAS_TCR2))
+ return false;
+
+ vcpu = ctxt_to_vcpu(ctxt);
+ return kvm_has_tcr2(kern_hyp_va(vcpu->kvm));
+}
+
+static inline bool ctxt_has_s1poe(struct kvm_cpu_context *ctxt)
+{
+ struct kvm_vcpu *vcpu;
+
+ if (!system_supports_poe())
+ return false;
+
+ vcpu = ctxt_to_vcpu(ctxt);
+ return kvm_has_s1poe(kern_hyp_va(vcpu->kvm));
}
static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt)
@@ -62,8 +116,17 @@ static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt)
ctxt_sys_reg(ctxt, TTBR0_EL1) = read_sysreg_el1(SYS_TTBR0);
ctxt_sys_reg(ctxt, TTBR1_EL1) = read_sysreg_el1(SYS_TTBR1);
ctxt_sys_reg(ctxt, TCR_EL1) = read_sysreg_el1(SYS_TCR);
- if (cpus_have_final_cap(ARM64_HAS_TCR2))
+ if (ctxt_has_tcrx(ctxt)) {
ctxt_sys_reg(ctxt, TCR2_EL1) = read_sysreg_el1(SYS_TCR2);
+
+ if (ctxt_has_s1pie(ctxt)) {
+ ctxt_sys_reg(ctxt, PIR_EL1) = read_sysreg_el1(SYS_PIR);
+ ctxt_sys_reg(ctxt, PIRE0_EL1) = read_sysreg_el1(SYS_PIRE0);
+ }
+
+ if (ctxt_has_s1poe(ctxt))
+ ctxt_sys_reg(ctxt, POR_EL1) = read_sysreg_el1(SYS_POR);
+ }
ctxt_sys_reg(ctxt, ESR_EL1) = read_sysreg_el1(SYS_ESR);
ctxt_sys_reg(ctxt, AFSR0_EL1) = read_sysreg_el1(SYS_AFSR0);
ctxt_sys_reg(ctxt, AFSR1_EL1) = read_sysreg_el1(SYS_AFSR1);
@@ -73,10 +136,6 @@ static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt)
ctxt_sys_reg(ctxt, CONTEXTIDR_EL1) = read_sysreg_el1(SYS_CONTEXTIDR);
ctxt_sys_reg(ctxt, AMAIR_EL1) = read_sysreg_el1(SYS_AMAIR);
ctxt_sys_reg(ctxt, CNTKCTL_EL1) = read_sysreg_el1(SYS_CNTKCTL);
- if (ctxt_has_s1pie(ctxt)) {
- ctxt_sys_reg(ctxt, PIR_EL1) = read_sysreg_el1(SYS_PIR);
- ctxt_sys_reg(ctxt, PIRE0_EL1) = read_sysreg_el1(SYS_PIRE0);
- }
ctxt_sys_reg(ctxt, PAR_EL1) = read_sysreg_par();
ctxt_sys_reg(ctxt, TPIDR_EL1) = read_sysreg(tpidr_el1);
@@ -106,7 +165,11 @@ static inline void __sysreg_save_el2_return_state(struct kvm_cpu_context *ctxt)
static inline void __sysreg_restore_common_state(struct kvm_cpu_context *ctxt)
{
- write_sysreg(ctxt_sys_reg(ctxt, MDSCR_EL1), mdscr_el1);
+ write_sysreg(*ctxt_mdscr_el1(ctxt), mdscr_el1);
+
+ // POR_EL0 can affect uaccess, so must be saved/restored early.
+ if (ctxt_has_s1poe(ctxt))
+ write_sysreg_s(ctxt_sys_reg(ctxt, POR_EL0), SYS_POR_EL0);
}
static inline void __sysreg_restore_user_state(struct kvm_cpu_context *ctxt)
@@ -115,9 +178,11 @@ static inline void __sysreg_restore_user_state(struct kvm_cpu_context *ctxt)
write_sysreg(ctxt_sys_reg(ctxt, TPIDRRO_EL0), tpidrro_el0);
}
-static inline void __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt)
+static inline void __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt,
+ u64 midr, u64 mpidr)
{
- write_sysreg(ctxt_sys_reg(ctxt, MPIDR_EL1), vmpidr_el2);
+ write_sysreg(midr, vpidr_el2);
+ write_sysreg(mpidr, vmpidr_el2);
if (has_vhe() ||
!cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) {
@@ -138,8 +203,17 @@ static inline void __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt)
write_sysreg_el1(ctxt_sys_reg(ctxt, CPACR_EL1), SYS_CPACR);
write_sysreg_el1(ctxt_sys_reg(ctxt, TTBR0_EL1), SYS_TTBR0);
write_sysreg_el1(ctxt_sys_reg(ctxt, TTBR1_EL1), SYS_TTBR1);
- if (cpus_have_final_cap(ARM64_HAS_TCR2))
+ if (ctxt_has_tcrx(ctxt)) {
write_sysreg_el1(ctxt_sys_reg(ctxt, TCR2_EL1), SYS_TCR2);
+
+ if (ctxt_has_s1pie(ctxt)) {
+ write_sysreg_el1(ctxt_sys_reg(ctxt, PIR_EL1), SYS_PIR);
+ write_sysreg_el1(ctxt_sys_reg(ctxt, PIRE0_EL1), SYS_PIRE0);
+ }
+
+ if (ctxt_has_s1poe(ctxt))
+ write_sysreg_el1(ctxt_sys_reg(ctxt, POR_EL1), SYS_POR);
+ }
write_sysreg_el1(ctxt_sys_reg(ctxt, ESR_EL1), SYS_ESR);
write_sysreg_el1(ctxt_sys_reg(ctxt, AFSR0_EL1), SYS_AFSR0);
write_sysreg_el1(ctxt_sys_reg(ctxt, AFSR1_EL1), SYS_AFSR1);
@@ -149,10 +223,6 @@ static inline void __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt)
write_sysreg_el1(ctxt_sys_reg(ctxt, CONTEXTIDR_EL1), SYS_CONTEXTIDR);
write_sysreg_el1(ctxt_sys_reg(ctxt, AMAIR_EL1), SYS_AMAIR);
write_sysreg_el1(ctxt_sys_reg(ctxt, CNTKCTL_EL1), SYS_CNTKCTL);
- if (ctxt_has_s1pie(ctxt)) {
- write_sysreg_el1(ctxt_sys_reg(ctxt, PIR_EL1), SYS_PIR);
- write_sysreg_el1(ctxt_sys_reg(ctxt, PIRE0_EL1), SYS_PIRE0);
- }
write_sysreg(ctxt_sys_reg(ctxt, PAR_EL1), par_el1);
write_sysreg(ctxt_sys_reg(ctxt, TPIDR_EL1), tpidr_el1);
@@ -240,7 +310,7 @@ static inline void __sysreg32_save_state(struct kvm_vcpu *vcpu)
__vcpu_sys_reg(vcpu, DACR32_EL2) = read_sysreg(dacr32_el2);
__vcpu_sys_reg(vcpu, IFSR32_EL2) = read_sysreg(ifsr32_el2);
- if (has_vhe() || vcpu_get_flag(vcpu, DEBUG_DIRTY))
+ if (has_vhe() || kvm_debug_regs_in_use(vcpu))
__vcpu_sys_reg(vcpu, DBGVCR32_EL2) = read_sysreg(dbgvcr32_el2);
}
@@ -257,7 +327,7 @@ static inline void __sysreg32_restore_state(struct kvm_vcpu *vcpu)
write_sysreg(__vcpu_sys_reg(vcpu, DACR32_EL2), dacr32_el2);
write_sysreg(__vcpu_sys_reg(vcpu, IFSR32_EL2), ifsr32_el2);
- if (has_vhe() || vcpu_get_flag(vcpu, DEBUG_DIRTY))
+ if (has_vhe() || kvm_debug_regs_in_use(vcpu))
write_sysreg(__vcpu_sys_reg(vcpu, DBGVCR32_EL2), dbgvcr32_el2);
}
diff --git a/arch/arm64/kvm/hyp/include/nvhe/ffa.h b/arch/arm64/kvm/hyp/include/nvhe/ffa.h
index d9fd5e6c7d3c..146e0aebfa1c 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/ffa.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/ffa.h
@@ -9,7 +9,7 @@
#include <asm/kvm_host.h>
#define FFA_MIN_FUNC_NUM 0x60
-#define FFA_MAX_FUNC_NUM 0x7F
+#define FFA_MAX_FUNC_NUM 0xFF
int hyp_ffa_init(void *pages);
bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt, u32 func_id);
diff --git a/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h b/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h
deleted file mode 100644
index 51f043649146..000000000000
--- a/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h
+++ /dev/null
@@ -1,223 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2021 Google LLC
- * Author: Fuad Tabba <tabba@google.com>
- */
-
-#ifndef __ARM64_KVM_FIXED_CONFIG_H__
-#define __ARM64_KVM_FIXED_CONFIG_H__
-
-#include <asm/sysreg.h>
-
-/*
- * This file contains definitions for features to be allowed or restricted for
- * guest virtual machines, depending on the mode KVM is running in and on the
- * type of guest that is running.
- *
- * The ALLOW masks represent a bitmask of feature fields that are allowed
- * without any restrictions as long as they are supported by the system.
- *
- * The RESTRICT_UNSIGNED masks, if present, represent unsigned fields for
- * features that are restricted to support at most the specified feature.
- *
- * If a feature field is not present in either, than it is not supported.
- *
- * The approach taken for protected VMs is to allow features that are:
- * - Needed by common Linux distributions (e.g., floating point)
- * - Trivial to support, e.g., supporting the feature does not introduce or
- * require tracking of additional state in KVM
- * - Cannot be trapped or prevent the guest from using anyway
- */
-
-/*
- * Allow for protected VMs:
- * - Floating-point and Advanced SIMD
- * - Data Independent Timing
- * - Spectre/Meltdown Mitigation
- */
-#define PVM_ID_AA64PFR0_ALLOW (\
- ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_FP) | \
- ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AdvSIMD) | \
- ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_DIT) | \
- ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2) | \
- ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3) \
- )
-
-/*
- * Restrict to the following *unsigned* features for protected VMs:
- * - AArch64 guests only (no support for AArch32 guests):
- * AArch32 adds complexity in trap handling, emulation, condition codes,
- * etc...
- * - RAS (v1)
- * Supported by KVM
- */
-#define PVM_ID_AA64PFR0_RESTRICT_UNSIGNED (\
- FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL0), ID_AA64PFR0_EL1_ELx_64BIT_ONLY) | \
- FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL1), ID_AA64PFR0_EL1_ELx_64BIT_ONLY) | \
- FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL2), ID_AA64PFR0_EL1_ELx_64BIT_ONLY) | \
- FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL3), ID_AA64PFR0_EL1_ELx_64BIT_ONLY) | \
- FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_RAS), ID_AA64PFR0_EL1_RAS_IMP) \
- )
-
-/*
- * Allow for protected VMs:
- * - Branch Target Identification
- * - Speculative Store Bypassing
- */
-#define PVM_ID_AA64PFR1_ALLOW (\
- ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_BT) | \
- ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_SSBS) \
- )
-
-#define PVM_ID_AA64PFR2_ALLOW 0ULL
-
-/*
- * Allow for protected VMs:
- * - Mixed-endian
- * - Distinction between Secure and Non-secure Memory
- * - Mixed-endian at EL0 only
- * - Non-context synchronizing exception entry and exit
- */
-#define PVM_ID_AA64MMFR0_ALLOW (\
- ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_BIGEND) | \
- ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_SNSMEM) | \
- ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_BIGENDEL0) | \
- ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_EXS) \
- )
-
-/*
- * Restrict to the following *unsigned* features for protected VMs:
- * - 40-bit IPA
- * - 16-bit ASID
- */
-#define PVM_ID_AA64MMFR0_RESTRICT_UNSIGNED (\
- FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_PARANGE), ID_AA64MMFR0_EL1_PARANGE_40) | \
- FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_ASIDBITS), ID_AA64MMFR0_EL1_ASIDBITS_16) \
- )
-
-/*
- * Allow for protected VMs:
- * - Hardware translation table updates to Access flag and Dirty state
- * - Number of VMID bits from CPU
- * - Hierarchical Permission Disables
- * - Privileged Access Never
- * - SError interrupt exceptions from speculative reads
- * - Enhanced Translation Synchronization
- * - Control for cache maintenance permission
- */
-#define PVM_ID_AA64MMFR1_ALLOW (\
- ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_HAFDBS) | \
- ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_VMIDBits) | \
- ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_HPDS) | \
- ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_PAN) | \
- ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_SpecSEI) | \
- ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_ETS) | \
- ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_CMOW) \
- )
-
-/*
- * Allow for protected VMs:
- * - Common not Private translations
- * - User Access Override
- * - IESB bit in the SCTLR_ELx registers
- * - Unaligned single-copy atomicity and atomic functions
- * - ESR_ELx.EC value on an exception by read access to feature ID space
- * - TTL field in address operations.
- * - Break-before-make sequences when changing translation block size
- * - E0PDx mechanism
- */
-#define PVM_ID_AA64MMFR2_ALLOW (\
- ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_CnP) | \
- ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_UAO) | \
- ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_IESB) | \
- ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_AT) | \
- ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_IDS) | \
- ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_TTL) | \
- ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_BBM) | \
- ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_E0PD) \
- )
-
-#define PVM_ID_AA64MMFR3_ALLOW (0ULL)
-
-/*
- * No support for Scalable Vectors for protected VMs:
- * Requires additional support from KVM, e.g., context-switching and
- * trapping at EL2
- */
-#define PVM_ID_AA64ZFR0_ALLOW (0ULL)
-
-/*
- * No support for debug, including breakpoints, and watchpoints for protected
- * VMs:
- * The Arm architecture mandates support for at least the Armv8 debug
- * architecture, which would include at least 2 hardware breakpoints and
- * watchpoints. Providing that support to protected guests adds
- * considerable state and complexity. Therefore, the reserved value of 0 is
- * used for debug-related fields.
- */
-#define PVM_ID_AA64DFR0_ALLOW (0ULL)
-#define PVM_ID_AA64DFR1_ALLOW (0ULL)
-
-/*
- * No support for implementation defined features.
- */
-#define PVM_ID_AA64AFR0_ALLOW (0ULL)
-#define PVM_ID_AA64AFR1_ALLOW (0ULL)
-
-/*
- * No restrictions on instructions implemented in AArch64.
- */
-#define PVM_ID_AA64ISAR0_ALLOW (\
- ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_AES) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SHA1) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SHA2) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_CRC32) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_ATOMIC) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_RDM) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SHA3) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SM3) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SM4) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_DP) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_FHM) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_TS) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_TLB) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_RNDR) \
- )
-
-/* Restrict pointer authentication to the basic version. */
-#define PVM_ID_AA64ISAR1_RESTRICT_UNSIGNED (\
- FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA), ID_AA64ISAR1_EL1_APA_PAuth) | \
- FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API), ID_AA64ISAR1_EL1_API_PAuth) \
- )
-
-#define PVM_ID_AA64ISAR2_RESTRICT_UNSIGNED (\
- FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_APA3), ID_AA64ISAR2_EL1_APA3_PAuth) \
- )
-
-#define PVM_ID_AA64ISAR1_ALLOW (\
- ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_DPB) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_JSCVT) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_FCMA) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_LRCPC) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPA) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_FRINTTS) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_SB) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_SPECRES) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_BF16) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_DGH) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_I8MM) \
- )
-
-#define PVM_ID_AA64ISAR2_ALLOW (\
- ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_ATS1A)| \
- ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_GPA3) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_MOPS) \
- )
-
-u64 pvm_read_id_reg(const struct kvm_vcpu *vcpu, u32 id);
-bool kvm_handle_pvm_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code);
-bool kvm_handle_pvm_restricted(struct kvm_vcpu *vcpu, u64 *exit_code);
-int kvm_check_pvm_sysreg_table(void);
-
-#endif /* __ARM64_KVM_FIXED_CONFIG_H__ */
diff --git a/arch/arm64/kvm/hyp/include/nvhe/gfp.h b/arch/arm64/kvm/hyp/include/nvhe/gfp.h
index 97c527ef53c2..3766333bace9 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/gfp.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/gfp.h
@@ -7,7 +7,7 @@
#include <nvhe/memory.h>
#include <nvhe/spinlock.h>
-#define HYP_NO_ORDER USHRT_MAX
+#define HYP_NO_ORDER ((u8)(~0))
struct hyp_pool {
/*
@@ -19,11 +19,11 @@ struct hyp_pool {
struct list_head free_area[NR_PAGE_ORDERS];
phys_addr_t range_start;
phys_addr_t range_end;
- unsigned short max_order;
+ u8 max_order;
};
/* Allocation */
-void *hyp_alloc_pages(struct hyp_pool *pool, unsigned short order);
+void *hyp_alloc_pages(struct hyp_pool *pool, u8 order);
void hyp_split_page(struct hyp_page *page);
void hyp_get_page(struct hyp_pool *pool, void *addr);
void hyp_put_page(struct hyp_pool *pool, void *addr);
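/*
 * Illustrative sketch (not from the patch): why HYP_NO_ORDER is redefined
 * together with the switch from "unsigned short order" to "u8 order" above.
 * A sentinel wider than the field silently truncates on assignment and then
 * never compares equal again. Plain C; the macro names are invented here.
 */
#include <limits.h>
#include <stdint.h>
#include <stdio.h>

#define OLD_NO_ORDER	USHRT_MAX	/* 0xffff: wider than a u8 field */
#define NEW_NO_ORDER	((uint8_t)(~0))	/* 0xff: fits the u8 field */

int main(void)
{
	uint8_t order;

	order = OLD_NO_ORDER;		/* truncates to 0xff */
	printf("old sentinel still matches: %d\n", order == OLD_NO_ORDER); /* 0 */

	order = NEW_NO_ORDER;
	printf("new sentinel still matches: %d\n", order == NEW_NO_ORDER); /* 1 */
	return 0;
}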
diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
index 0972faccc2af..ea0a704da9b8 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
@@ -11,40 +11,10 @@
#include <asm/kvm_mmu.h>
#include <asm/kvm_pgtable.h>
#include <asm/virt.h>
+#include <nvhe/memory.h>
#include <nvhe/pkvm.h>
#include <nvhe/spinlock.h>
-/*
- * SW bits 0-1 are reserved to track the memory ownership state of each page:
- * 00: The page is owned exclusively by the page-table owner.
- * 01: The page is owned by the page-table owner, but is shared
- * with another entity.
- * 10: The page is shared with, but not owned by the page-table owner.
- * 11: Reserved for future use (lending).
- */
-enum pkvm_page_state {
- PKVM_PAGE_OWNED = 0ULL,
- PKVM_PAGE_SHARED_OWNED = KVM_PGTABLE_PROT_SW0,
- PKVM_PAGE_SHARED_BORROWED = KVM_PGTABLE_PROT_SW1,
- __PKVM_PAGE_RESERVED = KVM_PGTABLE_PROT_SW0 |
- KVM_PGTABLE_PROT_SW1,
-
- /* Meta-states which aren't encoded directly in the PTE's SW bits */
- PKVM_NOPAGE,
-};
-
-#define PKVM_PAGE_STATE_PROT_MASK (KVM_PGTABLE_PROT_SW0 | KVM_PGTABLE_PROT_SW1)
-static inline enum kvm_pgtable_prot pkvm_mkstate(enum kvm_pgtable_prot prot,
- enum pkvm_page_state state)
-{
- return (prot & ~PKVM_PAGE_STATE_PROT_MASK) | state;
-}
-
-static inline enum pkvm_page_state pkvm_getstate(enum kvm_pgtable_prot prot)
-{
- return prot & PKVM_PAGE_STATE_PROT_MASK;
-}
-
struct host_mmu {
struct kvm_arch arch;
struct kvm_pgtable pgt;
@@ -69,6 +39,13 @@ int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages);
int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages);
int __pkvm_host_share_ffa(u64 pfn, u64 nr_pages);
int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages);
+int __pkvm_host_share_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu,
+ enum kvm_pgtable_prot prot);
+int __pkvm_host_unshare_guest(u64 gfn, struct pkvm_hyp_vm *hyp_vm);
+int __pkvm_host_relax_perms_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu, enum kvm_pgtable_prot prot);
+int __pkvm_host_wrprotect_guest(u64 gfn, struct pkvm_hyp_vm *hyp_vm);
+int __pkvm_host_test_clear_young_guest(u64 gfn, bool mkold, struct pkvm_hyp_vm *vm);
+int __pkvm_host_mkyoung_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu);
bool addr_is_memory(phys_addr_t phys);
int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot);
@@ -79,7 +56,7 @@ void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt);
int hyp_pin_shared_mem(void *from, void *to);
void hyp_unpin_shared_mem(void *from, void *to);
-void reclaim_guest_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc);
+void reclaim_pgtable_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc);
int refill_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages,
struct kvm_hyp_memcache *host_mc);
diff --git a/arch/arm64/kvm/hyp/include/nvhe/memory.h b/arch/arm64/kvm/hyp/include/nvhe/memory.h
index ab205c4d6774..34233d586060 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/memory.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/memory.h
@@ -7,9 +7,47 @@
#include <linux/types.h>
+/*
+ * Bits 0-1 are reserved to track the memory ownership state of each page:
+ * 00: The page is owned exclusively by the page-table owner.
+ * 01: The page is owned by the page-table owner, but is shared
+ * with another entity.
+ * 10: The page is shared with, but not owned by the page-table owner.
+ * 11: Reserved for future use (lending).
+ */
+enum pkvm_page_state {
+ PKVM_PAGE_OWNED = 0ULL,
+ PKVM_PAGE_SHARED_OWNED = BIT(0),
+ PKVM_PAGE_SHARED_BORROWED = BIT(1),
+ __PKVM_PAGE_RESERVED = BIT(0) | BIT(1),
+
+ /* Meta-states which aren't encoded directly in the PTE's SW bits */
+ PKVM_NOPAGE = BIT(2),
+};
+#define PKVM_PAGE_META_STATES_MASK (~__PKVM_PAGE_RESERVED)
+
+#define PKVM_PAGE_STATE_PROT_MASK (KVM_PGTABLE_PROT_SW0 | KVM_PGTABLE_PROT_SW1)
+static inline enum kvm_pgtable_prot pkvm_mkstate(enum kvm_pgtable_prot prot,
+ enum pkvm_page_state state)
+{
+ prot &= ~PKVM_PAGE_STATE_PROT_MASK;
+ prot |= FIELD_PREP(PKVM_PAGE_STATE_PROT_MASK, state);
+ return prot;
+}
+
+static inline enum pkvm_page_state pkvm_getstate(enum kvm_pgtable_prot prot)
+{
+ return FIELD_GET(PKVM_PAGE_STATE_PROT_MASK, prot);
+}
+
struct hyp_page {
- unsigned short refcount;
- unsigned short order;
+ u16 refcount;
+ u8 order;
+
+ /* Host (non-meta) state. Guarded by the host stage-2 lock. */
+ enum pkvm_page_state host_state : 8;
+
+ u32 host_share_guest_count;
};
extern u64 __hyp_vmemmap;
@@ -29,7 +67,13 @@ static inline phys_addr_t hyp_virt_to_phys(void *addr)
#define hyp_phys_to_pfn(phys) ((phys) >> PAGE_SHIFT)
#define hyp_pfn_to_phys(pfn) ((phys_addr_t)((pfn) << PAGE_SHIFT))
-#define hyp_phys_to_page(phys) (&hyp_vmemmap[hyp_phys_to_pfn(phys)])
+
+static inline struct hyp_page *hyp_phys_to_page(phys_addr_t phys)
+{
+ BUILD_BUG_ON(sizeof(struct hyp_page) != sizeof(u64));
+ return &hyp_vmemmap[hyp_phys_to_pfn(phys)];
+}
+
#define hyp_virt_to_page(virt) hyp_phys_to_page(__hyp_pa(virt))
#define hyp_virt_to_pfn(virt) hyp_phys_to_pfn(__hyp_pa(virt))
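/*
 * Illustrative sketch (not from the patch): a standalone model of the
 * pkvm_mkstate()/pkvm_getstate() helpers earlier in this hunk, which pack a
 * small ownership state into two "software" bits of a protection word using
 * FIELD_PREP()/FIELD_GET() semantics. The field helpers are re-implemented
 * minimally so the example builds without kernel headers, and the bit
 * positions are invented for the example.
 */
#include <assert.h>
#include <stdint.h>

#define SW0			(1ull << 55)	/* stand-ins for the PTE SW bits */
#define SW1			(1ull << 56)
#define PAGE_STATE_MASK		(SW0 | SW1)

enum page_state {
	PAGE_OWNED		= 0,
	PAGE_SHARED_OWNED	= 1u << 0,
	PAGE_SHARED_BORROWED	= 1u << 1,
};

/* Minimal FIELD_PREP()/FIELD_GET() for a contiguous mask. */
static inline uint64_t field_prep(uint64_t mask, uint64_t val)
{
	return (val * (mask & -mask)) & mask;
}

static inline uint64_t field_get(uint64_t mask, uint64_t reg)
{
	return (reg & mask) / (mask & -mask);
}

static inline uint64_t mkstate(uint64_t prot, enum page_state state)
{
	prot &= ~PAGE_STATE_MASK;
	return prot | field_prep(PAGE_STATE_MASK, state);
}

static inline enum page_state getstate(uint64_t prot)
{
	return (enum page_state)field_get(PAGE_STATE_MASK, prot);
}

int main(void)
{
	uint64_t prot = mkstate(0x3, PAGE_SHARED_BORROWED);

	assert(getstate(prot) == PAGE_SHARED_BORROWED);
	assert((prot & ~PAGE_STATE_MASK) == 0x3);  /* other bits untouched */
	return 0;
}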
diff --git a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h
index 82b3d62538a6..ce31d3b73603 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h
@@ -20,6 +20,12 @@ struct pkvm_hyp_vcpu {
/* Backpointer to the host's (untrusted) vCPU instance. */
struct kvm_vcpu *host_vcpu;
+
+ /*
+ * If this hyp vCPU is loaded, then this is a backpointer to the
+ * per-cpu pointer tracking us. Otherwise, NULL if not loaded.
+ */
+ struct pkvm_hyp_vcpu **loaded_hyp_vcpu;
};
/*
@@ -37,22 +43,28 @@ struct pkvm_hyp_vm {
struct hyp_pool pool;
hyp_spinlock_t lock;
- /*
- * The number of vcpus initialized and ready to run.
- * Modifying this is protected by 'vm_table_lock'.
- */
- unsigned int nr_vcpus;
-
/* Array of the hyp vCPU structures for this VM. */
struct pkvm_hyp_vcpu *vcpus[];
};
+extern hyp_spinlock_t vm_table_lock;
+
static inline struct pkvm_hyp_vm *
pkvm_hyp_vcpu_to_hyp_vm(struct pkvm_hyp_vcpu *hyp_vcpu)
{
return container_of(hyp_vcpu->vcpu.kvm, struct pkvm_hyp_vm, kvm);
}
+static inline bool pkvm_hyp_vcpu_is_protected(struct pkvm_hyp_vcpu *hyp_vcpu)
+{
+ return vcpu_is_protected(&hyp_vcpu->vcpu);
+}
+
+static inline bool pkvm_hyp_vm_is_protected(struct pkvm_hyp_vm *hyp_vm)
+{
+ return kvm_vm_is_protected(&hyp_vm->kvm);
+}
+
void pkvm_hyp_vm_table_init(void *tbl);
int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva,
@@ -64,5 +76,15 @@ int __pkvm_teardown_vm(pkvm_handle_t handle);
struct pkvm_hyp_vcpu *pkvm_load_hyp_vcpu(pkvm_handle_t handle,
unsigned int vcpu_idx);
void pkvm_put_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu);
+struct pkvm_hyp_vcpu *pkvm_get_loaded_hyp_vcpu(void);
+
+struct pkvm_hyp_vm *get_pkvm_hyp_vm(pkvm_handle_t handle);
+struct pkvm_hyp_vm *get_np_pkvm_hyp_vm(pkvm_handle_t handle);
+void put_pkvm_hyp_vm(struct pkvm_hyp_vm *hyp_vm);
+
+bool kvm_handle_pvm_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code);
+bool kvm_handle_pvm_restricted(struct kvm_vcpu *vcpu, u64 *exit_code);
+void kvm_init_pvm_id_regs(struct kvm_vcpu *vcpu);
+int kvm_check_pvm_sysreg_table(void);
#endif /* __ARM64_KVM_NVHE_PKVM_H__ */
diff --git a/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h b/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h
index 45a84f0ade04..1e6d995968a1 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h
@@ -15,6 +15,4 @@
#define DECLARE_REG(type, name, ctxt, reg) \
type name = (type)cpu_reg(ctxt, (reg))
-void __pkvm_vcpu_init_traps(struct kvm_vcpu *vcpu);
-
#endif /* __ARM64_KVM_NVHE_TRAP_HANDLER_H__ */
diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile
index 2250253a6429..b43426a493df 100644
--- a/arch/arm64/kvm/hyp/nvhe/Makefile
+++ b/arch/arm64/kvm/hyp/nvhe/Makefile
@@ -20,6 +20,8 @@ HOST_EXTRACFLAGS += -I$(objtree)/include
lib-objs := clear_page.o copy_page.o memcpy.o memset.o
lib-objs := $(addprefix ../../../lib/, $(lib-objs))
+CFLAGS_switch.nvhe.o += -Wno-override-init
+
hyp-obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o \
hyp-main.o hyp-smp.o psci-relay.o early_alloc.o page_alloc.o \
cache.o setup.o mm.o mem_protect.o sys_regs.o pkvm.o stacktrace.o ffa.o
@@ -89,24 +91,11 @@ quiet_cmd_hyprel = HYPREL $@
quiet_cmd_hypcopy = HYPCOPY $@
cmd_hypcopy = $(OBJCOPY) --prefix-symbols=__kvm_nvhe_ $< $@
-# Remove ftrace, Shadow Call Stack, and CFI CFLAGS.
-# This is equivalent to the 'notrace', '__noscs', and '__nocfi' annotations.
-KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS) $(CC_FLAGS_CFI), $(KBUILD_CFLAGS))
+# Remove ftrace and Shadow Call Stack CFLAGS.
+# This is equivalent to the 'notrace' and '__noscs' annotations.
+KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS), $(KBUILD_CFLAGS))
# Starting from 13.0.0 llvm emits SHT_REL section '.llvm.call-graph-profile'
# when profile optimization is applied. gen-hyprel does not support SHT_REL and
# causes a build failure. Remove profile optimization flags.
KBUILD_CFLAGS := $(filter-out -fprofile-sample-use=% -fprofile-use=%, $(KBUILD_CFLAGS))
KBUILD_CFLAGS += -fno-asynchronous-unwind-tables -fno-unwind-tables
-
-# KVM nVHE code is run at a different exception code with a different map, so
-# compiler instrumentation that inserts callbacks or checks into the code may
-# cause crashes. Just disable it.
-GCOV_PROFILE := n
-KASAN_SANITIZE := n
-KCSAN_SANITIZE := n
-UBSAN_SANITIZE := n
-KCOV_INSTRUMENT := n
-
-# Skip objtool checking for this directory because nVHE code is compiled with
-# non-standard build rules.
-OBJECT_FILES_NON_STANDARD := y
diff --git a/arch/arm64/kvm/hyp/nvhe/debug-sr.c b/arch/arm64/kvm/hyp/nvhe/debug-sr.c
index 7746ea507b6f..2f4a4f5036bb 100644
--- a/arch/arm64/kvm/hyp/nvhe/debug-sr.c
+++ b/arch/arm64/kvm/hyp/nvhe/debug-sr.c
@@ -51,42 +51,55 @@ static void __debug_restore_spe(u64 pmscr_el1)
write_sysreg_el1(pmscr_el1, SYS_PMSCR);
}
-static void __debug_save_trace(u64 *trfcr_el1)
+static void __trace_do_switch(u64 *saved_trfcr, u64 new_trfcr)
{
- *trfcr_el1 = 0;
+ *saved_trfcr = read_sysreg_el1(SYS_TRFCR);
+ write_sysreg_el1(new_trfcr, SYS_TRFCR);
+}
- /* Check if the TRBE is enabled */
- if (!(read_sysreg_s(SYS_TRBLIMITR_EL1) & TRBLIMITR_EL1_E))
- return;
- /*
- * Prohibit trace generation while we are in guest.
- * Since access to TRFCR_EL1 is trapped, the guest can't
- * modify the filtering set by the host.
- */
- *trfcr_el1 = read_sysreg_el1(SYS_TRFCR);
- write_sysreg_el1(0, SYS_TRFCR);
- isb();
- /* Drain the trace buffer to memory */
- tsb_csync();
+static bool __trace_needs_drain(void)
+{
+ if (is_protected_kvm_enabled() && host_data_test_flag(HAS_TRBE))
+ return read_sysreg_s(SYS_TRBLIMITR_EL1) & TRBLIMITR_EL1_E;
+
+ return host_data_test_flag(TRBE_ENABLED);
}
-static void __debug_restore_trace(u64 trfcr_el1)
+static bool __trace_needs_switch(void)
{
- if (!trfcr_el1)
- return;
+ return host_data_test_flag(TRBE_ENABLED) ||
+ host_data_test_flag(EL1_TRACING_CONFIGURED);
+}
- /* Restore trace filter controls */
- write_sysreg_el1(trfcr_el1, SYS_TRFCR);
+static void __trace_switch_to_guest(void)
+{
+ /* Unsupported with TRBE so disable */
+ if (host_data_test_flag(TRBE_ENABLED))
+ *host_data_ptr(trfcr_while_in_guest) = 0;
+
+ __trace_do_switch(host_data_ptr(host_debug_state.trfcr_el1),
+ *host_data_ptr(trfcr_while_in_guest));
+
+ if (__trace_needs_drain()) {
+ isb();
+ tsb_csync();
+ }
+}
+
+static void __trace_switch_to_host(void)
+{
+ __trace_do_switch(host_data_ptr(trfcr_while_in_guest),
+ *host_data_ptr(host_debug_state.trfcr_el1));
}
void __debug_save_host_buffers_nvhe(struct kvm_vcpu *vcpu)
{
/* Disable and flush SPE data generation */
- if (vcpu_get_flag(vcpu, DEBUG_STATE_SAVE_SPE))
- __debug_save_spe(&vcpu->arch.host_debug_state.pmscr_el1);
- /* Disable and flush Self-Hosted Trace generation */
- if (vcpu_get_flag(vcpu, DEBUG_STATE_SAVE_TRBE))
- __debug_save_trace(&vcpu->arch.host_debug_state.trfcr_el1);
+ if (host_data_test_flag(HAS_SPE))
+ __debug_save_spe(host_data_ptr(host_debug_state.pmscr_el1));
+
+ if (__trace_needs_switch())
+ __trace_switch_to_guest();
}
void __debug_switch_to_guest(struct kvm_vcpu *vcpu)
@@ -96,18 +109,13 @@ void __debug_switch_to_guest(struct kvm_vcpu *vcpu)
void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu)
{
- if (vcpu_get_flag(vcpu, DEBUG_STATE_SAVE_SPE))
- __debug_restore_spe(vcpu->arch.host_debug_state.pmscr_el1);
- if (vcpu_get_flag(vcpu, DEBUG_STATE_SAVE_TRBE))
- __debug_restore_trace(vcpu->arch.host_debug_state.trfcr_el1);
+ if (host_data_test_flag(HAS_SPE))
+ __debug_restore_spe(*host_data_ptr(host_debug_state.pmscr_el1));
+ if (__trace_needs_switch())
+ __trace_switch_to_host();
}
void __debug_switch_to_host(struct kvm_vcpu *vcpu)
{
__debug_switch_to_host_common(vcpu);
}
-
-u64 __kvm_get_mdcr_el2(void)
-{
- return read_sysreg(mdcr_el2);
-}
diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c
index 320f2eaa14a9..e433dfab882a 100644
--- a/arch/arm64/kvm/hyp/nvhe/ffa.c
+++ b/arch/arm64/kvm/hyp/nvhe/ffa.c
@@ -67,6 +67,9 @@ struct kvm_ffa_buffers {
*/
static struct kvm_ffa_buffers hyp_buffers;
static struct kvm_ffa_buffers host_buffers;
+static u32 hyp_ffa_version;
+static bool has_version_negotiated;
+static hyp_spinlock_t version_lock;
static void ffa_to_smccc_error(struct arm_smccc_res *res, u64 ffa_errno)
{
@@ -177,6 +180,14 @@ static void ffa_retrieve_req(struct arm_smccc_res *res, u32 len)
res);
}
+static void ffa_rx_release(struct arm_smccc_res *res)
+{
+ arm_smccc_1_1_smc(FFA_RX_RELEASE,
+ 0, 0,
+ 0, 0, 0, 0, 0,
+ res);
+}
+
static void do_ffa_rxtx_map(struct arm_smccc_res *res,
struct kvm_cpu_context *ctxt)
{
@@ -415,9 +426,9 @@ out:
return;
}
-static __always_inline void do_ffa_mem_xfer(const u64 func_id,
- struct arm_smccc_res *res,
- struct kvm_cpu_context *ctxt)
+static void __do_ffa_mem_xfer(const u64 func_id,
+ struct arm_smccc_res *res,
+ struct kvm_cpu_context *ctxt)
{
DECLARE_REG(u32, len, ctxt, 1);
DECLARE_REG(u32, fraglen, ctxt, 2);
@@ -429,9 +440,6 @@ static __always_inline void do_ffa_mem_xfer(const u64 func_id,
u32 offset, nr_ranges;
int ret = 0;
- BUILD_BUG_ON(func_id != FFA_FN64_MEM_SHARE &&
- func_id != FFA_FN64_MEM_LEND);
-
if (addr_mbz || npages_mbz || fraglen > len ||
fraglen > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE) {
ret = FFA_RET_INVALID_PARAMETERS;
@@ -450,11 +458,16 @@ static __always_inline void do_ffa_mem_xfer(const u64 func_id,
goto out_unlock;
}
+ if (len > ffa_desc_buf.len) {
+ ret = FFA_RET_NO_MEMORY;
+ goto out_unlock;
+ }
+
buf = hyp_buffers.tx;
memcpy(buf, host_buffers.tx, fraglen);
ep_mem_access = (void *)buf +
- ffa_mem_desc_offset(buf, 0, FFA_VERSION_1_0);
+ ffa_mem_desc_offset(buf, 0, hyp_ffa_version);
offset = ep_mem_access->composite_off;
if (!offset || buf->ep_count != 1 || buf->sender_id != HOST_FFA_ID) {
ret = FFA_RET_INVALID_PARAMETERS;
@@ -501,6 +514,13 @@ err_unshare:
goto out_unlock;
}
+#define do_ffa_mem_xfer(fid, res, ctxt) \
+ do { \
+ BUILD_BUG_ON((fid) != FFA_FN64_MEM_SHARE && \
+ (fid) != FFA_FN64_MEM_LEND); \
+ __do_ffa_mem_xfer((fid), (res), (ctxt)); \
+ } while (0)
+
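/*
 * Illustrative sketch (not from the patch): the same trick as the
 * do_ffa_mem_xfer() wrapper above, where a macro restores a compile-time
 * check on its argument after the __always_inline function became a plain
 * function. Standalone C11; FID_SHARE, FID_LEND and do_xfer() are invented
 * names standing in for the FF-A function IDs.
 */
#include <stdio.h>

#define FID_SHARE	0x74u
#define FID_LEND	0x75u

static void __do_xfer(unsigned int fid)
{
	printf("memory transfer, function id 0x%x\n", fid);
}

/* Only the two allowed function IDs may ever reach __do_xfer(). */
#define do_xfer(fid)							\
	do {								\
		_Static_assert((fid) == FID_SHARE || (fid) == FID_LEND,	\
			       "unexpected memory transfer id");	\
		__do_xfer(fid);						\
	} while (0)

int main(void)
{
	do_xfer(FID_SHARE);	/* builds and runs */
	/* do_xfer(0x99u); */	/* would fail to compile */
	return 0;
}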
static void do_ffa_mem_reclaim(struct arm_smccc_res *res,
struct kvm_cpu_context *ctxt)
{
@@ -533,7 +553,7 @@ static void do_ffa_mem_reclaim(struct arm_smccc_res *res,
fraglen = res->a2;
ep_mem_access = (void *)buf +
- ffa_mem_desc_offset(buf, 0, FFA_VERSION_1_0);
+ ffa_mem_desc_offset(buf, 0, hyp_ffa_version);
offset = ep_mem_access->composite_off;
/*
* We can trust the SPMD to get this right, but let's at least
@@ -543,16 +563,19 @@ static void do_ffa_mem_reclaim(struct arm_smccc_res *res,
if (WARN_ON(offset > len ||
fraglen > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE)) {
ret = FFA_RET_ABORTED;
+ ffa_rx_release(res);
goto out_unlock;
}
if (len > ffa_desc_buf.len) {
ret = FFA_RET_NO_MEMORY;
+ ffa_rx_release(res);
goto out_unlock;
}
buf = ffa_desc_buf.buf;
memcpy(buf, hyp_buffers.rx, fraglen);
+ ffa_rx_release(res);
for (fragoff = fraglen; fragoff < len; fragoff += fraglen) {
ffa_mem_frag_rx(res, handle_lo, handle_hi, fragoff);
@@ -563,6 +586,7 @@ static void do_ffa_mem_reclaim(struct arm_smccc_res *res,
fraglen = res->a3;
memcpy((void *)buf + fragoff, hyp_buffers.rx, fraglen);
+ ffa_rx_release(res);
}
ffa_mem_reclaim(res, handle_lo, handle_hi, flags);
@@ -600,7 +624,6 @@ static bool ffa_call_supported(u64 func_id)
case FFA_MSG_POLL:
case FFA_MSG_WAIT:
/* 32-bit variants of 64-bit calls */
- case FFA_MSG_SEND_DIRECT_REQ:
case FFA_MSG_SEND_DIRECT_RESP:
case FFA_RXTX_MAP:
case FFA_MEM_DONATE:
@@ -640,6 +663,132 @@ out_handled:
return true;
}
+static int hyp_ffa_post_init(void)
+{
+ size_t min_rxtx_sz;
+ struct arm_smccc_res res;
+
+ arm_smccc_1_1_smc(FFA_ID_GET, 0, 0, 0, 0, 0, 0, 0, &res);
+ if (res.a0 != FFA_SUCCESS)
+ return -EOPNOTSUPP;
+
+ if (res.a2 != HOST_FFA_ID)
+ return -EINVAL;
+
+ arm_smccc_1_1_smc(FFA_FEATURES, FFA_FN64_RXTX_MAP,
+ 0, 0, 0, 0, 0, 0, &res);
+ if (res.a0 != FFA_SUCCESS)
+ return -EOPNOTSUPP;
+
+ switch (res.a2) {
+ case FFA_FEAT_RXTX_MIN_SZ_4K:
+ min_rxtx_sz = SZ_4K;
+ break;
+ case FFA_FEAT_RXTX_MIN_SZ_16K:
+ min_rxtx_sz = SZ_16K;
+ break;
+ case FFA_FEAT_RXTX_MIN_SZ_64K:
+ min_rxtx_sz = SZ_64K;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (min_rxtx_sz > PAGE_SIZE)
+ return -EOPNOTSUPP;
+
+ return 0;
+}
+
+static void do_ffa_version(struct arm_smccc_res *res,
+ struct kvm_cpu_context *ctxt)
+{
+ DECLARE_REG(u32, ffa_req_version, ctxt, 1);
+
+ if (FFA_MAJOR_VERSION(ffa_req_version) != 1) {
+ res->a0 = FFA_RET_NOT_SUPPORTED;
+ return;
+ }
+
+ hyp_spin_lock(&version_lock);
+ if (has_version_negotiated) {
+ res->a0 = hyp_ffa_version;
+ goto unlock;
+ }
+
+ /*
+ * If the client driver tries to downgrade the version, we need to ask
+ * first if TEE supports it.
+ */
+ if (FFA_MINOR_VERSION(ffa_req_version) < FFA_MINOR_VERSION(hyp_ffa_version)) {
+ arm_smccc_1_1_smc(FFA_VERSION, ffa_req_version, 0,
+ 0, 0, 0, 0, 0,
+ res);
+ if (res->a0 == FFA_RET_NOT_SUPPORTED)
+ goto unlock;
+
+ hyp_ffa_version = ffa_req_version;
+ }
+
+ if (hyp_ffa_post_init())
+ res->a0 = FFA_RET_NOT_SUPPORTED;
+ else {
+ has_version_negotiated = true;
+ res->a0 = hyp_ffa_version;
+ }
+unlock:
+ hyp_spin_unlock(&version_lock);
+}
+
+static void do_ffa_part_get(struct arm_smccc_res *res,
+ struct kvm_cpu_context *ctxt)
+{
+ DECLARE_REG(u32, uuid0, ctxt, 1);
+ DECLARE_REG(u32, uuid1, ctxt, 2);
+ DECLARE_REG(u32, uuid2, ctxt, 3);
+ DECLARE_REG(u32, uuid3, ctxt, 4);
+ DECLARE_REG(u32, flags, ctxt, 5);
+ u32 count, partition_sz, copy_sz;
+
+ hyp_spin_lock(&host_buffers.lock);
+ if (!host_buffers.rx) {
+ ffa_to_smccc_res(res, FFA_RET_BUSY);
+ goto out_unlock;
+ }
+
+ arm_smccc_1_1_smc(FFA_PARTITION_INFO_GET, uuid0, uuid1,
+ uuid2, uuid3, flags, 0, 0,
+ res);
+
+ if (res->a0 != FFA_SUCCESS)
+ goto out_unlock;
+
+ count = res->a2;
+ if (!count)
+ goto out_unlock;
+
+ if (hyp_ffa_version > FFA_VERSION_1_0) {
+ /* Get the number of partitions deployed in the system */
+ if (flags & 0x1)
+ goto out_unlock;
+
+ partition_sz = res->a3;
+ } else {
+ /* FFA_VERSION_1_0 lacks the size in the response */
+ partition_sz = FFA_1_0_PARTITON_INFO_SZ;
+ }
+
+ copy_sz = partition_sz * count;
+ if (copy_sz > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE) {
+ ffa_to_smccc_res(res, FFA_RET_ABORTED);
+ goto out_unlock;
+ }
+
+ memcpy(host_buffers.rx, hyp_buffers.rx, copy_sz);
+out_unlock:
+ hyp_spin_unlock(&host_buffers.lock);
+}
+
bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt, u32 func_id)
{
struct arm_smccc_res res;
@@ -660,6 +809,11 @@ bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt, u32 func_id)
if (!is_ffa_call(func_id))
return false;
+ if (!has_version_negotiated && func_id != FFA_VERSION) {
+ ffa_to_smccc_error(&res, FFA_RET_INVALID_PARAMETERS);
+ goto out_handled;
+ }
+
switch (func_id) {
case FFA_FEATURES:
if (!do_ffa_features(&res, host_ctxt))
@@ -686,6 +840,12 @@ bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt, u32 func_id)
case FFA_MEM_FRAG_TX:
do_ffa_mem_frag_tx(&res, host_ctxt);
goto out_handled;
+ case FFA_VERSION:
+ do_ffa_version(&res, host_ctxt);
+ goto out_handled;
+ case FFA_PARTITION_INFO_GET:
+ do_ffa_part_get(&res, host_ctxt);
+ goto out_handled;
}
if (ffa_call_supported(func_id))
@@ -700,13 +860,12 @@ out_handled:
int hyp_ffa_init(void *pages)
{
struct arm_smccc_res res;
- size_t min_rxtx_sz;
void *tx, *rx;
if (kvm_host_psci_config.smccc_version < ARM_SMCCC_VERSION_1_2)
return 0;
- arm_smccc_1_1_smc(FFA_VERSION, FFA_VERSION_1_0, 0, 0, 0, 0, 0, 0, &res);
+ arm_smccc_1_1_smc(FFA_VERSION, FFA_VERSION_1_1, 0, 0, 0, 0, 0, 0, &res);
if (res.a0 == FFA_RET_NOT_SUPPORTED)
return 0;
@@ -726,34 +885,10 @@ int hyp_ffa_init(void *pages)
if (FFA_MAJOR_VERSION(res.a0) != 1)
return -EOPNOTSUPP;
- arm_smccc_1_1_smc(FFA_ID_GET, 0, 0, 0, 0, 0, 0, 0, &res);
- if (res.a0 != FFA_SUCCESS)
- return -EOPNOTSUPP;
-
- if (res.a2 != HOST_FFA_ID)
- return -EINVAL;
-
- arm_smccc_1_1_smc(FFA_FEATURES, FFA_FN64_RXTX_MAP,
- 0, 0, 0, 0, 0, 0, &res);
- if (res.a0 != FFA_SUCCESS)
- return -EOPNOTSUPP;
-
- switch (res.a2) {
- case FFA_FEAT_RXTX_MIN_SZ_4K:
- min_rxtx_sz = SZ_4K;
- break;
- case FFA_FEAT_RXTX_MIN_SZ_16K:
- min_rxtx_sz = SZ_16K;
- break;
- case FFA_FEAT_RXTX_MIN_SZ_64K:
- min_rxtx_sz = SZ_64K;
- break;
- default:
- return -EINVAL;
- }
-
- if (min_rxtx_sz > PAGE_SIZE)
- return -EOPNOTSUPP;
+ if (FFA_MINOR_VERSION(res.a0) < FFA_MINOR_VERSION(FFA_VERSION_1_1))
+ hyp_ffa_version = res.a0;
+ else
+ hyp_ffa_version = FFA_VERSION_1_1;
tx = pages;
pages += KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE;
@@ -776,5 +911,6 @@ int hyp_ffa_init(void *pages)
.lock = __HYP_SPIN_LOCK_UNLOCKED,
};
+ version_lock = __HYP_SPIN_LOCK_UNLOCKED;
return 0;
}
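/*
 * Illustrative sketch (not from the patch): the version handshake
 * implemented by do_ffa_version() and hyp_ffa_init() above, reduced to
 * plain C. The tee_supports() callback stands in for the FFA_VERSION call
 * to the firmware; all names here are invented for the example.
 */
#include <stdbool.h>
#include <stdio.h>

#define MAKE_VER(maj, min)	(((maj) << 16) | (min))
#define MAJOR(v)		((v) >> 16)
#define MINOR(v)		((v) & 0xffff)

static unsigned int negotiated;		/* 0 until the first handshake */

static bool tee_supports(unsigned int ver)
{
	(void)ver;
	return true;		/* pretend the firmware accepts downgrades */
}

/* Returns the version to report to the caller, or 0 for "not supported". */
static unsigned int negotiate(unsigned int ours, unsigned int req)
{
	if (MAJOR(req) != MAJOR(ours))
		return 0;

	if (negotiated)
		return negotiated;	/* the first negotiation wins */

	/* A downgrade request is only honoured if the firmware agrees. */
	if (MINOR(req) < MINOR(ours) && tee_supports(req))
		ours = req;

	negotiated = ours;
	return ours;
}

int main(void)
{
	unsigned int v = negotiate(MAKE_VER(1, 1), MAKE_VER(1, 0));

	printf("negotiated 1.%u\n", MINOR(v));	/* prints "negotiated 1.0" */
	return 0;
}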
diff --git a/arch/arm64/kvm/hyp/nvhe/gen-hyprel.c b/arch/arm64/kvm/hyp/nvhe/gen-hyprel.c
index 6bc88a756cb7..b63f4e1c1033 100644
--- a/arch/arm64/kvm/hyp/nvhe/gen-hyprel.c
+++ b/arch/arm64/kvm/hyp/nvhe/gen-hyprel.c
@@ -50,6 +50,9 @@
#ifndef R_AARCH64_ABS64
#define R_AARCH64_ABS64 257
#endif
+#ifndef R_AARCH64_ABS32
+#define R_AARCH64_ABS32 258
+#endif
#ifndef R_AARCH64_PREL64
#define R_AARCH64_PREL64 260
#endif
@@ -383,6 +386,9 @@ static void emit_rela_section(Elf64_Shdr *sh_rela)
case R_AARCH64_ABS64:
emit_rela_abs64(rela, sh_orig_name);
break;
+ /* Allow 32-bit absolute relocation, for kCFI type hashes. */
+ case R_AARCH64_ABS32:
+ break;
/* Allow position-relative data relocations. */
case R_AARCH64_PREL64:
case R_AARCH64_PREL32:
diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S
index 135cfb294ee5..58f0cb2298cc 100644
--- a/arch/arm64/kvm/hyp/nvhe/host.S
+++ b/arch/arm64/kvm/hyp/nvhe/host.S
@@ -188,21 +188,15 @@ SYM_FUNC_END(__host_hvc)
/*
* Test whether the SP has overflowed, without corrupting a GPR.
- * nVHE hypervisor stacks are aligned so that the PAGE_SHIFT bit
+ * nVHE hypervisor stacks are aligned so that the NVHE_STACK_SHIFT bit
* of SP should always be 1.
*/
add sp, sp, x0 // sp' = sp + x0
sub x0, sp, x0 // x0' = sp' - x0 = (sp + x0) - x0 = sp
- tbz x0, #PAGE_SHIFT, .L__hyp_sp_overflow\@
+ tbz x0, #NVHE_STACK_SHIFT, .L__hyp_sp_overflow\@
sub x0, sp, x0 // x0'' = sp' - x0' = (sp + x0) - sp = x0
sub sp, sp, x0 // sp'' = sp' - x0 = (sp + x0) - x0 = sp
- /* If a guest is loaded, panic out of it. */
- stp x0, x1, [sp, #-16]!
- get_loaded_vcpu x0, x1
- cbnz x0, __guest_exit_panic
- add sp, sp, #16
-
/*
* The panic may not be clean if the exception is taken before the host
* context has been saved by __host_exit or after the hyp context has
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
index 2994878d68ea..f8af11189572 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
@@ -5,6 +5,7 @@
*/
#include <linux/arm-smccc.h>
+#include <linux/cfi_types.h>
#include <linux/linkage.h>
#include <asm/alternative.h>
@@ -23,28 +24,25 @@
.align 11
SYM_CODE_START(__kvm_hyp_init)
- ventry __invalid // Synchronous EL2t
- ventry __invalid // IRQ EL2t
- ventry __invalid // FIQ EL2t
- ventry __invalid // Error EL2t
+ ventry . // Synchronous EL2t
+ ventry . // IRQ EL2t
+ ventry . // FIQ EL2t
+ ventry . // Error EL2t
- ventry __invalid // Synchronous EL2h
- ventry __invalid // IRQ EL2h
- ventry __invalid // FIQ EL2h
- ventry __invalid // Error EL2h
+ ventry . // Synchronous EL2h
+ ventry . // IRQ EL2h
+ ventry . // FIQ EL2h
+ ventry . // Error EL2h
ventry __do_hyp_init // Synchronous 64-bit EL1
- ventry __invalid // IRQ 64-bit EL1
- ventry __invalid // FIQ 64-bit EL1
- ventry __invalid // Error 64-bit EL1
+ ventry . // IRQ 64-bit EL1
+ ventry . // FIQ 64-bit EL1
+ ventry . // Error 64-bit EL1
- ventry __invalid // Synchronous 32-bit EL1
- ventry __invalid // IRQ 32-bit EL1
- ventry __invalid // FIQ 32-bit EL1
- ventry __invalid // Error 32-bit EL1
-
-__invalid:
- b .
+ ventry . // Synchronous 32-bit EL1
+ ventry . // IRQ 32-bit EL1
+ ventry . // FIQ 32-bit EL1
+ ventry . // Error 32-bit EL1
/*
* Only uses x0..x3 so as to not clobber callee-saved SMCCC registers.
@@ -76,6 +74,17 @@ __do_hyp_init:
SYM_CODE_END(__kvm_hyp_init)
/*
+ * Initialize EL2 CPU state to sane values.
+ *
+ * HCR_EL2.E2H must have been initialized already.
+ */
+SYM_CODE_START_LOCAL(__kvm_init_el2_state)
+ init_el2_state // Clobbers x0..x2
+ finalise_el2_state
+ ret
+SYM_CODE_END(__kvm_init_el2_state)
+
+/*
* Initialize the hypervisor in EL2.
*
* Only uses x0..x2 so as to not clobber callee-saved SMCCC registers
@@ -101,9 +110,12 @@ SYM_CODE_START_LOCAL(___kvm_hyp_init)
// TPIDR_EL2 is used to preserve x0 across the macro maze...
isb
msr tpidr_el2, x0
- init_el2_state
- finalise_el2_state
+ str lr, [x0, #NVHE_INIT_TMP]
+
+ bl __kvm_init_el2_state
+
mrs x0, tpidr_el2
+ ldr lr, [x0, #NVHE_INIT_TMP]
1:
ldr x1, [x0, #NVHE_INIT_TPIDR_EL2]
@@ -129,7 +141,7 @@ alternative_else_nop_endif
/* Invalidate the stale TLBs from Bootloader */
tlbi alle2
- tlbi vmalls12e1
+ tlbi alle1
dsb sy
mov_q x0, INIT_SCTLR_EL2_MMU_ON
@@ -198,10 +210,9 @@ SYM_CODE_START_LOCAL(__kvm_hyp_init_cpu)
2: msr SPsel, #1 // We want to use SP_EL{1,2}
- /* Initialize EL2 CPU state to sane values. */
- init_el2_state // Clobbers x0..x2
- finalise_el2_state
- __init_el2_nvhe_prepare_eret
+ init_el2_hcr 0
+
+ bl __kvm_init_el2_state
/* Enable MMU, set vectors and stack. */
mov x0, x28
@@ -265,33 +276,38 @@ alternative_else_nop_endif
SYM_CODE_END(__kvm_handle_stub_hvc)
-SYM_FUNC_START(__pkvm_init_switch_pgd)
+/*
+ * void __pkvm_init_switch_pgd(phys_addr_t pgd, unsigned long sp,
+ * void (*fn)(void));
+ *
+ * SYM_TYPED_FUNC_START() allows C to call this ID-mapped function indirectly
+ * using a physical pointer without triggering a kCFI failure.
+ */
+SYM_TYPED_FUNC_START(__pkvm_init_switch_pgd)
/* Turn the MMU off */
pre_disable_mmu_workaround
- mrs x2, sctlr_el2
- bic x3, x2, #SCTLR_ELx_M
- msr sctlr_el2, x3
+ mrs x3, sctlr_el2
+ bic x4, x3, #SCTLR_ELx_M
+ msr sctlr_el2, x4
isb
tlbi alle2
/* Install the new pgtables */
- ldr x3, [x0, #NVHE_INIT_PGD_PA]
- phys_to_ttbr x4, x3
+ phys_to_ttbr x5, x0
alternative_if ARM64_HAS_CNP
- orr x4, x4, #TTBR_CNP_BIT
+ orr x5, x5, #TTBR_CNP_BIT
alternative_else_nop_endif
- msr ttbr0_el2, x4
+ msr ttbr0_el2, x5
/* Set the new stack pointer */
- ldr x0, [x0, #NVHE_INIT_STACK_HYP_VA]
- mov sp, x0
+ mov sp, x1
/* And turn the MMU back on! */
dsb nsh
isb
- set_sctlr_el2 x2
- ret x1
+ set_sctlr_el2 x3
+ ret x2
SYM_FUNC_END(__pkvm_init_switch_pgd)
.popsection
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
index 2385fd03ed87..2c37680d954c 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -5,6 +5,7 @@
*/
#include <hyp/adjust_pc.h>
+#include <hyp/switch.h>
#include <asm/pgtable-types.h>
#include <asm/kvm_asm.h>
@@ -23,26 +24,115 @@ DEFINE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
void __kvm_hyp_host_forward_smc(struct kvm_cpu_context *host_ctxt);
+static void __hyp_sve_save_guest(struct kvm_vcpu *vcpu)
+{
+ __vcpu_sys_reg(vcpu, ZCR_EL1) = read_sysreg_el1(SYS_ZCR);
+ /*
+ * On saving/restoring guest sve state, always use the maximum VL for
+ * the guest. The layout of the data when saving the sve state depends
+ * on the VL, so use a consistent (i.e., the maximum) guest VL.
+ */
+ sve_cond_update_zcr_vq(vcpu_sve_max_vq(vcpu) - 1, SYS_ZCR_EL2);
+ __sve_save_state(vcpu_sve_pffr(vcpu), &vcpu->arch.ctxt.fp_regs.fpsr, true);
+ write_sysreg_s(sve_vq_from_vl(kvm_host_sve_max_vl) - 1, SYS_ZCR_EL2);
+}
+
+static void __hyp_sve_restore_host(void)
+{
+ struct cpu_sve_state *sve_state = *host_data_ptr(sve_state);
+
+ /*
+ * On saving/restoring host sve state, always use the maximum VL for
+ * the host. The layout of the data when saving the sve state depends
+ * on the VL, so use a consistent (i.e., the maximum) host VL.
+ *
+ * Note that this constrains the PE to the maximum shared VL
+ * that was discovered; if we wish to use larger VLs, this will
+ * need to be revisited.
+ */
+ write_sysreg_s(sve_vq_from_vl(kvm_host_sve_max_vl) - 1, SYS_ZCR_EL2);
+ __sve_restore_state(sve_state->sve_regs + sve_ffr_offset(kvm_host_sve_max_vl),
+ &sve_state->fpsr,
+ true);
+ write_sysreg_el1(sve_state->zcr_el1, SYS_ZCR);
+}
+
+static void fpsimd_sve_flush(void)
+{
+ *host_data_ptr(fp_owner) = FP_STATE_HOST_OWNED;
+}
+
+static void fpsimd_sve_sync(struct kvm_vcpu *vcpu)
+{
+ bool has_fpmr;
+
+ if (!guest_owns_fp_regs())
+ return;
+
+ cpacr_clear_set(0, CPACR_EL1_FPEN | CPACR_EL1_ZEN);
+ isb();
+
+ if (vcpu_has_sve(vcpu))
+ __hyp_sve_save_guest(vcpu);
+ else
+ __fpsimd_save_state(&vcpu->arch.ctxt.fp_regs);
+
+ has_fpmr = kvm_has_fpmr(kern_hyp_va(vcpu->kvm));
+ if (has_fpmr)
+ __vcpu_sys_reg(vcpu, FPMR) = read_sysreg_s(SYS_FPMR);
+
+ if (system_supports_sve())
+ __hyp_sve_restore_host();
+ else
+ __fpsimd_restore_state(host_data_ptr(host_ctxt.fp_regs));
+
+ if (has_fpmr)
+ write_sysreg_s(*host_data_ptr(fpmr), SYS_FPMR);
+
+ *host_data_ptr(fp_owner) = FP_STATE_HOST_OWNED;
+}
+
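/*
 * Illustrative sketch (not from the patch): the ownership model behind
 * fpsimd_sve_flush() and fpsimd_sve_sync() above. The FP registers are only
 * saved and restored on the return path, and only if the guest actually
 * took ownership of them while it ran. The register file is modelled as a
 * single integer and every name below is invented for the example.
 */
#include <stdio.h>

enum fp_owner { FP_HOST_OWNED, FP_GUEST_OWNED };

static enum fp_owner owner;
static int hw_fp_regs;		/* stand-in for the physical register file */
static int host_fp_regs;
static int guest_fp_regs;

static void flush(void)		/* entering the guest world */
{
	owner = FP_HOST_OWNED;	/* nothing saved yet; trap the first FP use */
}

static void guest_uses_fp(int val)	/* lazy switch on the first FP trap */
{
	host_fp_regs = hw_fp_regs;	/* save host state on demand */
	hw_fp_regs = val;
	owner = FP_GUEST_OWNED;
}

static void sync(void)		/* returning to the host world */
{
	if (owner != FP_GUEST_OWNED)
		return;			/* guest never touched FP */

	guest_fp_regs = hw_fp_regs;	/* save the guest's state ... */
	hw_fp_regs = host_fp_regs;	/* ... and hand the host its own back */
	owner = FP_HOST_OWNED;
}

int main(void)
{
	hw_fp_regs = 42;		/* host value */
	flush();
	guest_uses_fp(7);
	sync();
	printf("host=%d guest=%d\n", hw_fp_regs, guest_fp_regs);  /* 42 7 */
	return 0;
}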
+static void flush_debug_state(struct pkvm_hyp_vcpu *hyp_vcpu)
+{
+ struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu;
+
+ hyp_vcpu->vcpu.arch.debug_owner = host_vcpu->arch.debug_owner;
+
+ if (kvm_guest_owns_debug_regs(&hyp_vcpu->vcpu))
+ hyp_vcpu->vcpu.arch.vcpu_debug_state = host_vcpu->arch.vcpu_debug_state;
+ else if (kvm_host_owns_debug_regs(&hyp_vcpu->vcpu))
+ hyp_vcpu->vcpu.arch.external_debug_state = host_vcpu->arch.external_debug_state;
+}
+
+static void sync_debug_state(struct pkvm_hyp_vcpu *hyp_vcpu)
+{
+ struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu;
+
+ if (kvm_guest_owns_debug_regs(&hyp_vcpu->vcpu))
+ host_vcpu->arch.vcpu_debug_state = hyp_vcpu->vcpu.arch.vcpu_debug_state;
+ else if (kvm_host_owns_debug_regs(&hyp_vcpu->vcpu))
+ host_vcpu->arch.external_debug_state = hyp_vcpu->vcpu.arch.external_debug_state;
+}
+
static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu)
{
struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu;
+ fpsimd_sve_flush();
+ flush_debug_state(hyp_vcpu);
+
hyp_vcpu->vcpu.arch.ctxt = host_vcpu->arch.ctxt;
hyp_vcpu->vcpu.arch.sve_state = kern_hyp_va(host_vcpu->arch.sve_state);
- hyp_vcpu->vcpu.arch.sve_max_vl = host_vcpu->arch.sve_max_vl;
-
- hyp_vcpu->vcpu.arch.hw_mmu = host_vcpu->arch.hw_mmu;
+ /* Limit guest vector length to the maximum supported by the host. */
+ hyp_vcpu->vcpu.arch.sve_max_vl = min(host_vcpu->arch.sve_max_vl, kvm_host_sve_max_vl);
- hyp_vcpu->vcpu.arch.hcr_el2 = host_vcpu->arch.hcr_el2;
hyp_vcpu->vcpu.arch.mdcr_el2 = host_vcpu->arch.mdcr_el2;
- hyp_vcpu->vcpu.arch.cptr_el2 = host_vcpu->arch.cptr_el2;
+ hyp_vcpu->vcpu.arch.hcr_el2 &= ~(HCR_TWI | HCR_TWE);
+ hyp_vcpu->vcpu.arch.hcr_el2 |= READ_ONCE(host_vcpu->arch.hcr_el2) &
+ (HCR_TWI | HCR_TWE);
hyp_vcpu->vcpu.arch.iflags = host_vcpu->arch.iflags;
- hyp_vcpu->vcpu.arch.fp_state = host_vcpu->arch.fp_state;
-
- hyp_vcpu->vcpu.arch.debug_ptr = kern_hyp_va(host_vcpu->arch.debug_ptr);
- hyp_vcpu->vcpu.arch.host_fpsimd_state = host_vcpu->arch.host_fpsimd_state;
hyp_vcpu->vcpu.arch.vsesr_el2 = host_vcpu->arch.vsesr_el2;
@@ -56,35 +146,74 @@ static void sync_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu)
struct vgic_v3_cpu_if *host_cpu_if = &host_vcpu->arch.vgic_cpu.vgic_v3;
unsigned int i;
+ fpsimd_sve_sync(&hyp_vcpu->vcpu);
+ sync_debug_state(hyp_vcpu);
+
host_vcpu->arch.ctxt = hyp_vcpu->vcpu.arch.ctxt;
host_vcpu->arch.hcr_el2 = hyp_vcpu->vcpu.arch.hcr_el2;
- host_vcpu->arch.cptr_el2 = hyp_vcpu->vcpu.arch.cptr_el2;
host_vcpu->arch.fault = hyp_vcpu->vcpu.arch.fault;
host_vcpu->arch.iflags = hyp_vcpu->vcpu.arch.iflags;
- host_vcpu->arch.fp_state = hyp_vcpu->vcpu.arch.fp_state;
host_cpu_if->vgic_hcr = hyp_cpu_if->vgic_hcr;
for (i = 0; i < hyp_cpu_if->used_lrs; ++i)
host_cpu_if->vgic_lr[i] = hyp_cpu_if->vgic_lr[i];
}
+static void handle___pkvm_vcpu_load(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1);
+ DECLARE_REG(unsigned int, vcpu_idx, host_ctxt, 2);
+ DECLARE_REG(u64, hcr_el2, host_ctxt, 3);
+ struct pkvm_hyp_vcpu *hyp_vcpu;
+
+ if (!is_protected_kvm_enabled())
+ return;
+
+ hyp_vcpu = pkvm_load_hyp_vcpu(handle, vcpu_idx);
+ if (!hyp_vcpu)
+ return;
+
+ if (pkvm_hyp_vcpu_is_protected(hyp_vcpu)) {
+ /* Propagate WFx trapping flags */
+ hyp_vcpu->vcpu.arch.hcr_el2 &= ~(HCR_TWE | HCR_TWI);
+ hyp_vcpu->vcpu.arch.hcr_el2 |= hcr_el2 & (HCR_TWE | HCR_TWI);
+ }
+}
+
+static void handle___pkvm_vcpu_put(struct kvm_cpu_context *host_ctxt)
+{
+ struct pkvm_hyp_vcpu *hyp_vcpu;
+
+ if (!is_protected_kvm_enabled())
+ return;
+
+ hyp_vcpu = pkvm_get_loaded_hyp_vcpu();
+ if (hyp_vcpu)
+ pkvm_put_hyp_vcpu(hyp_vcpu);
+}
+
static void handle___kvm_vcpu_run(struct kvm_cpu_context *host_ctxt)
{
DECLARE_REG(struct kvm_vcpu *, host_vcpu, host_ctxt, 1);
int ret;
- host_vcpu = kern_hyp_va(host_vcpu);
-
if (unlikely(is_protected_kvm_enabled())) {
- struct pkvm_hyp_vcpu *hyp_vcpu;
- struct kvm *host_kvm;
+ struct pkvm_hyp_vcpu *hyp_vcpu = pkvm_get_loaded_hyp_vcpu();
+
+ /*
+ * KVM (and pKVM) doesn't support SME guests for now, and
+ * ensures that SME features aren't enabled in pstate when
+ * loading a vcpu. Therefore, if SME features are enabled, the
+ * host is misbehaving.
+ */
+ if (unlikely(system_supports_sme() && read_sysreg_s(SYS_SVCR))) {
+ ret = -EINVAL;
+ goto out;
+ }
- host_kvm = kern_hyp_va(host_vcpu->kvm);
- hyp_vcpu = pkvm_load_hyp_vcpu(host_kvm->arch.pkvm.handle,
- host_vcpu->vcpu_idx);
if (!hyp_vcpu) {
ret = -EINVAL;
goto out;
@@ -95,12 +224,145 @@ static void handle___kvm_vcpu_run(struct kvm_cpu_context *host_ctxt)
ret = __kvm_vcpu_run(&hyp_vcpu->vcpu);
sync_hyp_vcpu(hyp_vcpu);
- pkvm_put_hyp_vcpu(hyp_vcpu);
} else {
+ struct kvm_vcpu *vcpu = kern_hyp_va(host_vcpu);
+
/* The host is fully trusted, run its vCPU directly. */
- ret = __kvm_vcpu_run(host_vcpu);
+ fpsimd_lazy_switch_to_guest(vcpu);
+ ret = __kvm_vcpu_run(vcpu);
+ fpsimd_lazy_switch_to_host(vcpu);
}
+out:
+ cpu_reg(host_ctxt, 1) = ret;
+}
+
+static int pkvm_refill_memcache(struct pkvm_hyp_vcpu *hyp_vcpu)
+{
+ struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu;
+
+ return refill_memcache(&hyp_vcpu->vcpu.arch.pkvm_memcache,
+ host_vcpu->arch.pkvm_memcache.nr_pages,
+ &host_vcpu->arch.pkvm_memcache);
+}
+static void handle___pkvm_host_share_guest(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(u64, pfn, host_ctxt, 1);
+ DECLARE_REG(u64, gfn, host_ctxt, 2);
+ DECLARE_REG(enum kvm_pgtable_prot, prot, host_ctxt, 3);
+ struct pkvm_hyp_vcpu *hyp_vcpu;
+ int ret = -EINVAL;
+
+ if (!is_protected_kvm_enabled())
+ goto out;
+
+ hyp_vcpu = pkvm_get_loaded_hyp_vcpu();
+ if (!hyp_vcpu || pkvm_hyp_vcpu_is_protected(hyp_vcpu))
+ goto out;
+
+ ret = pkvm_refill_memcache(hyp_vcpu);
+ if (ret)
+ goto out;
+
+ ret = __pkvm_host_share_guest(pfn, gfn, hyp_vcpu, prot);
+out:
+ cpu_reg(host_ctxt, 1) = ret;
+}
+
+static void handle___pkvm_host_unshare_guest(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1);
+ DECLARE_REG(u64, gfn, host_ctxt, 2);
+ struct pkvm_hyp_vm *hyp_vm;
+ int ret = -EINVAL;
+
+ if (!is_protected_kvm_enabled())
+ goto out;
+
+ hyp_vm = get_np_pkvm_hyp_vm(handle);
+ if (!hyp_vm)
+ goto out;
+
+ ret = __pkvm_host_unshare_guest(gfn, hyp_vm);
+ put_pkvm_hyp_vm(hyp_vm);
+out:
+ cpu_reg(host_ctxt, 1) = ret;
+}
+
+static void handle___pkvm_host_relax_perms_guest(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(u64, gfn, host_ctxt, 1);
+ DECLARE_REG(enum kvm_pgtable_prot, prot, host_ctxt, 2);
+ struct pkvm_hyp_vcpu *hyp_vcpu;
+ int ret = -EINVAL;
+
+ if (!is_protected_kvm_enabled())
+ goto out;
+
+ hyp_vcpu = pkvm_get_loaded_hyp_vcpu();
+ if (!hyp_vcpu || pkvm_hyp_vcpu_is_protected(hyp_vcpu))
+ goto out;
+
+ ret = __pkvm_host_relax_perms_guest(gfn, hyp_vcpu, prot);
+out:
+ cpu_reg(host_ctxt, 1) = ret;
+}
+
+static void handle___pkvm_host_wrprotect_guest(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1);
+ DECLARE_REG(u64, gfn, host_ctxt, 2);
+ struct pkvm_hyp_vm *hyp_vm;
+ int ret = -EINVAL;
+
+ if (!is_protected_kvm_enabled())
+ goto out;
+
+ hyp_vm = get_np_pkvm_hyp_vm(handle);
+ if (!hyp_vm)
+ goto out;
+
+ ret = __pkvm_host_wrprotect_guest(gfn, hyp_vm);
+ put_pkvm_hyp_vm(hyp_vm);
+out:
+ cpu_reg(host_ctxt, 1) = ret;
+}
+
+static void handle___pkvm_host_test_clear_young_guest(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1);
+ DECLARE_REG(u64, gfn, host_ctxt, 2);
+ DECLARE_REG(bool, mkold, host_ctxt, 3);
+ struct pkvm_hyp_vm *hyp_vm;
+ int ret = -EINVAL;
+
+ if (!is_protected_kvm_enabled())
+ goto out;
+
+ hyp_vm = get_np_pkvm_hyp_vm(handle);
+ if (!hyp_vm)
+ goto out;
+
+ ret = __pkvm_host_test_clear_young_guest(gfn, mkold, hyp_vm);
+ put_pkvm_hyp_vm(hyp_vm);
+out:
+ cpu_reg(host_ctxt, 1) = ret;
+}
+
+static void handle___pkvm_host_mkyoung_guest(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(u64, gfn, host_ctxt, 1);
+ struct pkvm_hyp_vcpu *hyp_vcpu;
+ int ret = -EINVAL;
+
+ if (!is_protected_kvm_enabled())
+ goto out;
+
+ hyp_vcpu = pkvm_get_loaded_hyp_vcpu();
+ if (!hyp_vcpu || pkvm_hyp_vcpu_is_protected(hyp_vcpu))
+ goto out;
+
+ ret = __pkvm_host_mkyoung_guest(gfn, hyp_vcpu);
out:
cpu_reg(host_ctxt, 1) = ret;
}
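/*
 * Illustrative sketch (not from the patch): the calling convention shared
 * by the handle___pkvm_host_*_guest() handlers above. Arguments arrive in
 * the saved host GPRs, the result goes back in register 1, and a table
 * indexed by the hypercall number picks the handler. The struct and macros
 * below are simplified stand-ins, not the kernel definitions.
 */
#include <stdio.h>

struct cpu_context { unsigned long regs[8]; };

#define cpu_reg(ctxt, r)	((ctxt)->regs[(r)])
#define DECLARE_REG(type, name, ctxt, r) \
	type name = (type)cpu_reg(ctxt, (r))

static void handle_share_page(struct cpu_context *ctxt)
{
	DECLARE_REG(unsigned long, pfn, ctxt, 1);
	DECLARE_REG(unsigned long, gfn, ctxt, 2);
	int ret = (pfn && gfn) ? 0 : -22;	/* -EINVAL */

	/* A real handler would update the stage-2 tables here. */
	cpu_reg(ctxt, 1) = (unsigned long)ret;	/* return value to the caller */
}

typedef void (*hcall_fn)(struct cpu_context *);

static const hcall_fn hcall_table[] = { handle_share_page };

int main(void)
{
	struct cpu_context ctxt = { .regs = { 0, 0x1234, 0x88 } };

	hcall_table[0](&ctxt);		/* dispatch hypercall #0 */
	printf("ret = %ld\n", (long)cpu_reg(&ctxt, 1));	/* prints "ret = 0" */
	return 0;
}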
@@ -152,6 +414,22 @@ static void handle___kvm_tlb_flush_vmid(struct kvm_cpu_context *host_ctxt)
__kvm_tlb_flush_vmid(kern_hyp_va(mmu));
}
+static void handle___pkvm_tlb_flush_vmid(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1);
+ struct pkvm_hyp_vm *hyp_vm;
+
+ if (!is_protected_kvm_enabled())
+ return;
+
+ hyp_vm = get_np_pkvm_hyp_vm(handle);
+ if (!hyp_vm)
+ return;
+
+ __kvm_tlb_flush_vmid(&hyp_vm->kvm.arch.mmu);
+ put_pkvm_hyp_vm(hyp_vm);
+}
+
static void handle___kvm_flush_cpu_context(struct kvm_cpu_context *host_ctxt)
{
DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1);
@@ -178,38 +456,23 @@ static void handle___vgic_v3_get_gic_config(struct kvm_cpu_context *host_ctxt)
cpu_reg(host_ctxt, 1) = __vgic_v3_get_gic_config();
}

-static void handle___vgic_v3_read_vmcr(struct kvm_cpu_context *host_ctxt)
-{
- cpu_reg(host_ctxt, 1) = __vgic_v3_read_vmcr();
-}
-
-static void handle___vgic_v3_write_vmcr(struct kvm_cpu_context *host_ctxt)
-{
- __vgic_v3_write_vmcr(cpu_reg(host_ctxt, 1));
-}
-
static void handle___vgic_v3_init_lrs(struct kvm_cpu_context *host_ctxt)
{
__vgic_v3_init_lrs();
}

-static void handle___kvm_get_mdcr_el2(struct kvm_cpu_context *host_ctxt)
-{
- cpu_reg(host_ctxt, 1) = __kvm_get_mdcr_el2();
-}
-
-static void handle___vgic_v3_save_aprs(struct kvm_cpu_context *host_ctxt)
+static void handle___vgic_v3_save_vmcr_aprs(struct kvm_cpu_context *host_ctxt)
{
DECLARE_REG(struct vgic_v3_cpu_if *, cpu_if, host_ctxt, 1);

- __vgic_v3_save_aprs(kern_hyp_va(cpu_if));
+ __vgic_v3_save_vmcr_aprs(kern_hyp_va(cpu_if));
}

-static void handle___vgic_v3_restore_aprs(struct kvm_cpu_context *host_ctxt)
+static void handle___vgic_v3_restore_vmcr_aprs(struct kvm_cpu_context *host_ctxt)
{
DECLARE_REG(struct vgic_v3_cpu_if *, cpu_if, host_ctxt, 1);

- __vgic_v3_restore_aprs(kern_hyp_va(cpu_if));
+ __vgic_v3_restore_vmcr_aprs(kern_hyp_va(cpu_if));
}

static void handle___pkvm_init(struct kvm_cpu_context *host_ctxt)
@@ -280,13 +543,6 @@ static void handle___pkvm_prot_finalize(struct kvm_cpu_context *host_ctxt)
cpu_reg(host_ctxt, 1) = __pkvm_prot_finalize();
}

-static void handle___pkvm_vcpu_init_traps(struct kvm_cpu_context *host_ctxt)
-{
- DECLARE_REG(struct kvm_vcpu *, vcpu, host_ctxt, 1);
-
- __pkvm_vcpu_init_traps(kern_hyp_va(vcpu));
-}
-
static void handle___pkvm_init_vm(struct kvm_cpu_context *host_ctxt)
{
DECLARE_REG(struct kvm *, host_kvm, host_ctxt, 1);
DECLARE_REG(struct kvm *, host_kvm, host_ctxt, 1);
@@ -320,7 +576,6 @@ typedef void (*hcall_t)(struct kvm_cpu_context *);
static const hcall_t host_hcall[] = {
/* ___kvm_hyp_init */
- HANDLE_FUNC(__kvm_get_mdcr_el2),
HANDLE_FUNC(__pkvm_init),
HANDLE_FUNC(__pkvm_create_private_mapping),
HANDLE_FUNC(__pkvm_cpu_set_vector),
@@ -331,6 +586,12 @@ static const hcall_t host_hcall[] = {
HANDLE_FUNC(__pkvm_host_share_hyp),
HANDLE_FUNC(__pkvm_host_unshare_hyp),
+ HANDLE_FUNC(__pkvm_host_share_guest),
+ HANDLE_FUNC(__pkvm_host_unshare_guest),
+ HANDLE_FUNC(__pkvm_host_relax_perms_guest),
+ HANDLE_FUNC(__pkvm_host_wrprotect_guest),
+ HANDLE_FUNC(__pkvm_host_test_clear_young_guest),
+ HANDLE_FUNC(__pkvm_host_mkyoung_guest),
HANDLE_FUNC(__kvm_adjust_pc),
HANDLE_FUNC(__kvm_vcpu_run),
HANDLE_FUNC(__kvm_flush_vm_context),
@@ -340,14 +601,14 @@ static const hcall_t host_hcall[] = {
HANDLE_FUNC(__kvm_tlb_flush_vmid_range),
HANDLE_FUNC(__kvm_flush_cpu_context),
HANDLE_FUNC(__kvm_timer_set_cntvoff),
- HANDLE_FUNC(__vgic_v3_read_vmcr),
- HANDLE_FUNC(__vgic_v3_write_vmcr),
- HANDLE_FUNC(__vgic_v3_save_aprs),
- HANDLE_FUNC(__vgic_v3_restore_aprs),
- HANDLE_FUNC(__pkvm_vcpu_init_traps),
+ HANDLE_FUNC(__vgic_v3_save_vmcr_aprs),
+ HANDLE_FUNC(__vgic_v3_restore_vmcr_aprs),
HANDLE_FUNC(__pkvm_init_vm),
HANDLE_FUNC(__pkvm_init_vcpu),
HANDLE_FUNC(__pkvm_teardown_vm),
+ HANDLE_FUNC(__pkvm_vcpu_load),
+ HANDLE_FUNC(__pkvm_vcpu_put),
+ HANDLE_FUNC(__pkvm_tlb_flush_vmid),
};
static void handle_host_hcall(struct kvm_cpu_context *host_ctxt)
@@ -419,15 +680,6 @@ void handle_trap(struct kvm_cpu_context *host_ctxt)
case ESR_ELx_EC_SMC64:
handle_host_smc(host_ctxt);
break;
- case ESR_ELx_EC_SVE:
- if (has_hvhe())
- sysreg_clear_set(cpacr_el1, 0, (CPACR_EL1_ZEN_EL1EN |
- CPACR_EL1_ZEN_EL0EN));
- else
- sysreg_clear_set(cptr_el2, CPTR_EL2_TZ, 0);
- isb();
- sve_cond_update_zcr_vq(ZCR_ELx_LEN_MASK, SYS_ZCR_EL2);
- break;
case ESR_ELx_EC_IABT_LOW:
case ESR_ELx_EC_DABT_LOW:
handle_host_mem_abort(host_ctxt);
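
The added handlers above all follow the same host-call convention: arguments are pulled out of the host's saved general-purpose registers with DECLARE_REG, the call is refused unless protected mode is enabled, and the result is written back into x1 through cpu_reg(). A standalone userspace sketch of that shape follows; the context structure, macros and handler names are simplified stand-ins for illustration, not the kernel's definitions.

/* Minimal userspace model of the nVHE host-hypercall dispatch shape.
 * All types, macros and names below are illustrative stand-ins. */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>

struct cpu_context { uint64_t regs[8]; };	/* models x0..x7 */

#define cpu_reg(ctxt, n)		((ctxt)->regs[n])
#define DECLARE_REG(type, name, ctxt, n)	type name = (type)cpu_reg(ctxt, n)

static int protected_mode = 1;			/* assumed global flag */

static void handle_share_page(struct cpu_context *host_ctxt)
{
	DECLARE_REG(uint64_t, pfn, host_ctxt, 1);
	DECLARE_REG(uint64_t, gfn, host_ctxt, 2);
	int ret = -EINVAL;

	if (!protected_mode)
		goto out;

	/* A real handler would perform the share here. */
	printf("sharing pfn %llu at gfn %llu\n",
	       (unsigned long long)pfn, (unsigned long long)gfn);
	ret = 0;
out:
	cpu_reg(host_ctxt, 1) = (uint64_t)ret;	/* result goes back in x1 */
}

typedef void (*hcall_t)(struct cpu_context *);
static const hcall_t host_hcall[] = { handle_share_page };

int main(void)
{
	struct cpu_context ctxt = { .regs = { 0, 42, 7 } };

	host_hcall[0](&ctxt);			/* x0 would select the handler */
	printf("returned %lld\n", (long long)(int64_t)cpu_reg(&ctxt, 1));
	return 0;
}
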
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
index 861c76021a25..f34f11c720d7 100644
--- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -201,8 +201,8 @@ static void *guest_s2_zalloc_page(void *mc)
memset(addr, 0, PAGE_SIZE);
p = hyp_virt_to_page(addr);
- memset(p, 0, sizeof(*p));
p->refcount = 1;
+ p->order = 0;
return addr;
}
@@ -266,8 +266,9 @@ int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd)
return 0;
}
-void reclaim_guest_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc)
+void reclaim_pgtable_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc)
{
+ struct hyp_page *page;
void *addr;
/* Dump all pgtable pages in the hyp_pool */
@@ -279,7 +280,9 @@ void reclaim_guest_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc)
/* Drain the hyp_pool into the memcache */
addr = hyp_alloc_pages(&vm->pool, 0);
while (addr) {
- memset(hyp_virt_to_page(addr), 0, sizeof(struct hyp_page));
+ page = hyp_virt_to_page(addr);
+ page->refcount = 0;
+ page->order = 0;
push_hyp_memcache(mc, addr, hyp_virt_to_phys);
WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(addr), 1));
addr = hyp_alloc_pages(&vm->pool, 0);
@@ -382,19 +385,28 @@ bool addr_is_memory(phys_addr_t phys)
return !!find_mem_range(phys, &range);
}
-static bool addr_is_allowed_memory(phys_addr_t phys)
+static bool is_in_mem_range(u64 addr, struct kvm_mem_range *range)
+{
+ return range->start <= addr && addr < range->end;
+}
+
+static int check_range_allowed_memory(u64 start, u64 end)
{
struct memblock_region *reg;
struct kvm_mem_range range;
- reg = find_mem_range(phys, &range);
+ /*
+ * Callers can't check the state of a range that overlaps memory and
+ * MMIO regions, so ensure [start, end[ is in the same kvm_mem_range.
+ */
+ reg = find_mem_range(start, &range);
+ if (!is_in_mem_range(end - 1, &range))
+ return -EINVAL;
- return reg && !(reg->flags & MEMBLOCK_NOMAP);
-}
+ if (!reg || reg->flags & MEMBLOCK_NOMAP)
+ return -EPERM;
-static bool is_in_mem_range(u64 addr, struct kvm_mem_range *range)
-{
- return range->start <= addr && addr < range->end;
+ return 0;
}
static bool range_is_memory(u64 start, u64 end)
@@ -454,8 +466,10 @@ static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range)
if (kvm_pte_valid(pte))
return -EAGAIN;
- if (pte)
+ if (pte) {
+ WARN_ON(addr_is_memory(addr) && hyp_phys_to_page(addr)->host_state != PKVM_NOPAGE);
return -EPERM;
+ }
do {
u64 granule = kvm_granule_size(level);
@@ -477,10 +491,33 @@ int host_stage2_idmap_locked(phys_addr_t addr, u64 size,
return host_stage2_try(__host_stage2_idmap, addr, addr + size, prot);
}
+static void __host_update_page_state(phys_addr_t addr, u64 size, enum pkvm_page_state state)
+{
+ phys_addr_t end = addr + size;
+
+ for (; addr < end; addr += PAGE_SIZE)
+ hyp_phys_to_page(addr)->host_state = state;
+}
+
int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id)
{
- return host_stage2_try(kvm_pgtable_stage2_set_owner, &host_mmu.pgt,
- addr, size, &host_s2_pool, owner_id);
+ int ret;
+
+ if (!addr_is_memory(addr))
+ return -EPERM;
+
+ ret = host_stage2_try(kvm_pgtable_stage2_set_owner, &host_mmu.pgt,
+ addr, size, &host_s2_pool, owner_id);
+ if (ret)
+ return ret;
+
+ /* Don't forget to update the vmemmap tracking for the host */
+ if (owner_id == PKVM_ID_HOST)
+ __host_update_page_state(addr, size, PKVM_PAGE_OWNED);
+ else
+ __host_update_page_state(addr, size, PKVM_NOPAGE);
+
+ return 0;
}
static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot prot)
@@ -533,46 +570,19 @@ void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt)
int ret = 0;
esr = read_sysreg_el2(SYS_ESR);
- BUG_ON(!__get_fault_info(esr, &fault));
+ if (!__get_fault_info(esr, &fault)) {
+ /*
+ * We've presumably raced with a page-table change which caused
+ * AT to fail, try again.
+ */
+ return;
+ }
addr = (fault.hpfar_el2 & HPFAR_MASK) << 8;
ret = host_stage2_idmap(addr);
BUG_ON(ret && ret != -EAGAIN);
}
-struct pkvm_mem_transition {
- u64 nr_pages;
-
- struct {
- enum pkvm_component_id id;
- /* Address in the initiator's address space */
- u64 addr;
-
- union {
- struct {
- /* Address in the completer's address space */
- u64 completer_addr;
- } host;
- struct {
- u64 completer_addr;
- } hyp;
- };
- } initiator;
-
- struct {
- enum pkvm_component_id id;
- } completer;
-};
-
-struct pkvm_mem_share {
- const struct pkvm_mem_transition tx;
- const enum kvm_pgtable_prot completer_prot;
-};
-
-struct pkvm_mem_donation {
- const struct pkvm_mem_transition tx;
-};
-
struct check_walk_data {
enum pkvm_page_state desired;
enum pkvm_page_state (*get_page_state)(kvm_pte_t pte, u64 addr);
@@ -598,115 +608,38 @@ static int check_page_state_range(struct kvm_pgtable *pgt, u64 addr, u64 size,
return kvm_pgtable_walk(pgt, addr, size, &walker);
}
-static enum pkvm_page_state host_get_page_state(kvm_pte_t pte, u64 addr)
-{
- if (!addr_is_allowed_memory(addr))
- return PKVM_NOPAGE;
-
- if (!kvm_pte_valid(pte) && pte)
- return PKVM_NOPAGE;
-
- return pkvm_getstate(kvm_pgtable_stage2_pte_prot(pte));
-}
-
static int __host_check_page_state_range(u64 addr, u64 size,
enum pkvm_page_state state)
{
- struct check_walk_data d = {
- .desired = state,
- .get_page_state = host_get_page_state,
- };
+ u64 end = addr + size;
+ int ret;
+
+ ret = check_range_allowed_memory(addr, end);
+ if (ret)
+ return ret;
hyp_assert_lock_held(&host_mmu.lock);
- return check_page_state_range(&host_mmu.pgt, addr, size, &d);
+ for (; addr < end; addr += PAGE_SIZE) {
+ if (hyp_phys_to_page(addr)->host_state != state)
+ return -EPERM;
+ }
+
+ return 0;
}
static int __host_set_page_state_range(u64 addr, u64 size,
enum pkvm_page_state state)
{
- enum kvm_pgtable_prot prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, state);
-
- return host_stage2_idmap_locked(addr, size, prot);
-}
-
-static int host_request_owned_transition(u64 *completer_addr,
- const struct pkvm_mem_transition *tx)
-{
- u64 size = tx->nr_pages * PAGE_SIZE;
- u64 addr = tx->initiator.addr;
-
- *completer_addr = tx->initiator.host.completer_addr;
- return __host_check_page_state_range(addr, size, PKVM_PAGE_OWNED);
-}
-
-static int host_request_unshare(u64 *completer_addr,
- const struct pkvm_mem_transition *tx)
-{
- u64 size = tx->nr_pages * PAGE_SIZE;
- u64 addr = tx->initiator.addr;
-
- *completer_addr = tx->initiator.host.completer_addr;
- return __host_check_page_state_range(addr, size, PKVM_PAGE_SHARED_OWNED);
-}
-
-static int host_initiate_share(u64 *completer_addr,
- const struct pkvm_mem_transition *tx)
-{
- u64 size = tx->nr_pages * PAGE_SIZE;
- u64 addr = tx->initiator.addr;
-
- *completer_addr = tx->initiator.host.completer_addr;
- return __host_set_page_state_range(addr, size, PKVM_PAGE_SHARED_OWNED);
-}
-
-static int host_initiate_unshare(u64 *completer_addr,
- const struct pkvm_mem_transition *tx)
-{
- u64 size = tx->nr_pages * PAGE_SIZE;
- u64 addr = tx->initiator.addr;
-
- *completer_addr = tx->initiator.host.completer_addr;
- return __host_set_page_state_range(addr, size, PKVM_PAGE_OWNED);
-}
-
-static int host_initiate_donation(u64 *completer_addr,
- const struct pkvm_mem_transition *tx)
-{
- u8 owner_id = tx->completer.id;
- u64 size = tx->nr_pages * PAGE_SIZE;
-
- *completer_addr = tx->initiator.host.completer_addr;
- return host_stage2_set_owner_locked(tx->initiator.addr, size, owner_id);
-}
-
-static bool __host_ack_skip_pgtable_check(const struct pkvm_mem_transition *tx)
-{
- return !(IS_ENABLED(CONFIG_NVHE_EL2_DEBUG) ||
- tx->initiator.id != PKVM_ID_HYP);
-}
-
-static int __host_ack_transition(u64 addr, const struct pkvm_mem_transition *tx,
- enum pkvm_page_state state)
-{
- u64 size = tx->nr_pages * PAGE_SIZE;
-
- if (__host_ack_skip_pgtable_check(tx))
- return 0;
-
- return __host_check_page_state_range(addr, size, state);
-}
+ if (hyp_phys_to_page(addr)->host_state == PKVM_NOPAGE) {
+ int ret = host_stage2_idmap_locked(addr, size, PKVM_HOST_MEM_PROT);
-static int host_ack_donation(u64 addr, const struct pkvm_mem_transition *tx)
-{
- return __host_ack_transition(addr, tx, PKVM_NOPAGE);
-}
+ if (ret)
+ return ret;
+ }
-static int host_complete_donation(u64 addr, const struct pkvm_mem_transition *tx)
-{
- u64 size = tx->nr_pages * PAGE_SIZE;
- u8 host_id = tx->completer.id;
+ __host_update_page_state(addr, size, state);
- return host_stage2_set_owner_locked(addr, size, host_id);
+ return 0;
}
static enum pkvm_page_state hyp_get_page_state(kvm_pte_t pte, u64 addr)
@@ -729,577 +662,425 @@ static int __hyp_check_page_state_range(u64 addr, u64 size,
return check_page_state_range(&pkvm_pgtable, addr, size, &d);
}
-static int hyp_request_donation(u64 *completer_addr,
- const struct pkvm_mem_transition *tx)
+static enum pkvm_page_state guest_get_page_state(kvm_pte_t pte, u64 addr)
{
- u64 size = tx->nr_pages * PAGE_SIZE;
- u64 addr = tx->initiator.addr;
+ if (!kvm_pte_valid(pte))
+ return PKVM_NOPAGE;
- *completer_addr = tx->initiator.hyp.completer_addr;
- return __hyp_check_page_state_range(addr, size, PKVM_PAGE_OWNED);
+ return pkvm_getstate(kvm_pgtable_stage2_pte_prot(pte));
}
-static int hyp_initiate_donation(u64 *completer_addr,
- const struct pkvm_mem_transition *tx)
+static int __guest_check_page_state_range(struct pkvm_hyp_vcpu *vcpu, u64 addr,
+ u64 size, enum pkvm_page_state state)
{
- u64 size = tx->nr_pages * PAGE_SIZE;
- int ret;
+ struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu);
+ struct check_walk_data d = {
+ .desired = state,
+ .get_page_state = guest_get_page_state,
+ };
- *completer_addr = tx->initiator.hyp.completer_addr;
- ret = kvm_pgtable_hyp_unmap(&pkvm_pgtable, tx->initiator.addr, size);
- return (ret != size) ? -EFAULT : 0;
+ hyp_assert_lock_held(&vm->lock);
+ return check_page_state_range(&vm->pgt, addr, size, &d);
}
-static bool __hyp_ack_skip_pgtable_check(const struct pkvm_mem_transition *tx)
+int __pkvm_host_share_hyp(u64 pfn)
{
- return !(IS_ENABLED(CONFIG_NVHE_EL2_DEBUG) ||
- tx->initiator.id != PKVM_ID_HOST);
-}
+ u64 phys = hyp_pfn_to_phys(pfn);
+ void *virt = __hyp_va(phys);
+ enum kvm_pgtable_prot prot;
+ u64 size = PAGE_SIZE;
+ int ret;
-static int hyp_ack_share(u64 addr, const struct pkvm_mem_transition *tx,
- enum kvm_pgtable_prot perms)
-{
- u64 size = tx->nr_pages * PAGE_SIZE;
+ host_lock_component();
+ hyp_lock_component();
- if (perms != PAGE_HYP)
- return -EPERM;
+ ret = __host_check_page_state_range(phys, size, PKVM_PAGE_OWNED);
+ if (ret)
+ goto unlock;
+ if (IS_ENABLED(CONFIG_NVHE_EL2_DEBUG)) {
+ ret = __hyp_check_page_state_range((u64)virt, size, PKVM_NOPAGE);
+ if (ret)
+ goto unlock;
+ }
+
+ prot = pkvm_mkstate(PAGE_HYP, PKVM_PAGE_SHARED_BORROWED);
+ WARN_ON(pkvm_create_mappings_locked(virt, virt + size, prot));
+ WARN_ON(__host_set_page_state_range(phys, size, PKVM_PAGE_SHARED_OWNED));
- if (__hyp_ack_skip_pgtable_check(tx))
- return 0;
+unlock:
+ hyp_unlock_component();
+ host_unlock_component();
- return __hyp_check_page_state_range(addr, size, PKVM_NOPAGE);
+ return ret;
}
-static int hyp_ack_unshare(u64 addr, const struct pkvm_mem_transition *tx)
+int __pkvm_host_unshare_hyp(u64 pfn)
{
- u64 size = tx->nr_pages * PAGE_SIZE;
-
- if (tx->initiator.id == PKVM_ID_HOST && hyp_page_count((void *)addr))
- return -EBUSY;
+ u64 phys = hyp_pfn_to_phys(pfn);
+ u64 virt = (u64)__hyp_va(phys);
+ u64 size = PAGE_SIZE;
+ int ret;
- if (__hyp_ack_skip_pgtable_check(tx))
- return 0;
+ host_lock_component();
+ hyp_lock_component();
- return __hyp_check_page_state_range(addr, size,
- PKVM_PAGE_SHARED_BORROWED);
-}
+ ret = __host_check_page_state_range(phys, size, PKVM_PAGE_SHARED_OWNED);
+ if (ret)
+ goto unlock;
+ ret = __hyp_check_page_state_range(virt, size, PKVM_PAGE_SHARED_BORROWED);
+ if (ret)
+ goto unlock;
+ if (hyp_page_count((void *)virt)) {
+ ret = -EBUSY;
+ goto unlock;
+ }
-static int hyp_ack_donation(u64 addr, const struct pkvm_mem_transition *tx)
-{
- u64 size = tx->nr_pages * PAGE_SIZE;
+ WARN_ON(kvm_pgtable_hyp_unmap(&pkvm_pgtable, virt, size) != size);
+ WARN_ON(__host_set_page_state_range(phys, size, PKVM_PAGE_OWNED));
- if (__hyp_ack_skip_pgtable_check(tx))
- return 0;
+unlock:
+ hyp_unlock_component();
+ host_unlock_component();
- return __hyp_check_page_state_range(addr, size, PKVM_NOPAGE);
+ return ret;
}
-static int hyp_complete_share(u64 addr, const struct pkvm_mem_transition *tx,
- enum kvm_pgtable_prot perms)
+int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages)
{
- void *start = (void *)addr, *end = start + (tx->nr_pages * PAGE_SIZE);
+ u64 phys = hyp_pfn_to_phys(pfn);
+ u64 size = PAGE_SIZE * nr_pages;
+ void *virt = __hyp_va(phys);
enum kvm_pgtable_prot prot;
+ int ret;
- prot = pkvm_mkstate(perms, PKVM_PAGE_SHARED_BORROWED);
- return pkvm_create_mappings_locked(start, end, prot);
-}
+ host_lock_component();
+ hyp_lock_component();
-static int hyp_complete_unshare(u64 addr, const struct pkvm_mem_transition *tx)
-{
- u64 size = tx->nr_pages * PAGE_SIZE;
- int ret = kvm_pgtable_hyp_unmap(&pkvm_pgtable, addr, size);
+ ret = __host_check_page_state_range(phys, size, PKVM_PAGE_OWNED);
+ if (ret)
+ goto unlock;
+ if (IS_ENABLED(CONFIG_NVHE_EL2_DEBUG)) {
+ ret = __hyp_check_page_state_range((u64)virt, size, PKVM_NOPAGE);
+ if (ret)
+ goto unlock;
+ }
- return (ret != size) ? -EFAULT : 0;
-}
+ prot = pkvm_mkstate(PAGE_HYP, PKVM_PAGE_OWNED);
+ WARN_ON(pkvm_create_mappings_locked(virt, virt + size, prot));
+ WARN_ON(host_stage2_set_owner_locked(phys, size, PKVM_ID_HYP));
-static int hyp_complete_donation(u64 addr,
- const struct pkvm_mem_transition *tx)
-{
- void *start = (void *)addr, *end = start + (tx->nr_pages * PAGE_SIZE);
- enum kvm_pgtable_prot prot = pkvm_mkstate(PAGE_HYP, PKVM_PAGE_OWNED);
+unlock:
+ hyp_unlock_component();
+ host_unlock_component();
- return pkvm_create_mappings_locked(start, end, prot);
+ return ret;
}
-static int check_share(struct pkvm_mem_share *share)
+int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages)
{
- const struct pkvm_mem_transition *tx = &share->tx;
- u64 completer_addr;
+ u64 phys = hyp_pfn_to_phys(pfn);
+ u64 size = PAGE_SIZE * nr_pages;
+ u64 virt = (u64)__hyp_va(phys);
int ret;
- switch (tx->initiator.id) {
- case PKVM_ID_HOST:
- ret = host_request_owned_transition(&completer_addr, tx);
- break;
- default:
- ret = -EINVAL;
- }
+ host_lock_component();
+ hyp_lock_component();
+ ret = __hyp_check_page_state_range(virt, size, PKVM_PAGE_OWNED);
if (ret)
- return ret;
-
- switch (tx->completer.id) {
- case PKVM_ID_HYP:
- ret = hyp_ack_share(completer_addr, tx, share->completer_prot);
- break;
- case PKVM_ID_FFA:
- /*
- * We only check the host; the secure side will check the other
- * end when we forward the FFA call.
- */
- ret = 0;
- break;
- default:
- ret = -EINVAL;
+ goto unlock;
+ if (IS_ENABLED(CONFIG_NVHE_EL2_DEBUG)) {
+ ret = __host_check_page_state_range(phys, size, PKVM_NOPAGE);
+ if (ret)
+ goto unlock;
}
+ WARN_ON(kvm_pgtable_hyp_unmap(&pkvm_pgtable, virt, size) != size);
+ WARN_ON(host_stage2_set_owner_locked(phys, size, PKVM_ID_HOST));
+
+unlock:
+ hyp_unlock_component();
+ host_unlock_component();
+
return ret;
}
-static int __do_share(struct pkvm_mem_share *share)
+int hyp_pin_shared_mem(void *from, void *to)
{
- const struct pkvm_mem_transition *tx = &share->tx;
- u64 completer_addr;
+ u64 cur, start = ALIGN_DOWN((u64)from, PAGE_SIZE);
+ u64 end = PAGE_ALIGN((u64)to);
+ u64 size = end - start;
int ret;
- switch (tx->initiator.id) {
- case PKVM_ID_HOST:
- ret = host_initiate_share(&completer_addr, tx);
- break;
- default:
- ret = -EINVAL;
- }
+ host_lock_component();
+ hyp_lock_component();
+ ret = __host_check_page_state_range(__hyp_pa(start), size,
+ PKVM_PAGE_SHARED_OWNED);
if (ret)
- return ret;
-
- switch (tx->completer.id) {
- case PKVM_ID_HYP:
- ret = hyp_complete_share(completer_addr, tx, share->completer_prot);
- break;
- case PKVM_ID_FFA:
- /*
- * We're not responsible for any secure page-tables, so there's
- * nothing to do here.
- */
- ret = 0;
- break;
- default:
- ret = -EINVAL;
- }
+ goto unlock;
- return ret;
-}
+ ret = __hyp_check_page_state_range(start, size,
+ PKVM_PAGE_SHARED_BORROWED);
+ if (ret)
+ goto unlock;
-/*
- * do_share():
- *
- * The page owner grants access to another component with a given set
- * of permissions.
- *
- * Initiator: OWNED => SHARED_OWNED
- * Completer: NOPAGE => SHARED_BORROWED
- */
-static int do_share(struct pkvm_mem_share *share)
-{
- int ret;
+ for (cur = start; cur < end; cur += PAGE_SIZE)
+ hyp_page_ref_inc(hyp_virt_to_page(cur));
- ret = check_share(share);
- if (ret)
- return ret;
+unlock:
+ hyp_unlock_component();
+ host_unlock_component();
- return WARN_ON(__do_share(share));
+ return ret;
}
-static int check_unshare(struct pkvm_mem_share *share)
+void hyp_unpin_shared_mem(void *from, void *to)
{
- const struct pkvm_mem_transition *tx = &share->tx;
- u64 completer_addr;
- int ret;
-
- switch (tx->initiator.id) {
- case PKVM_ID_HOST:
- ret = host_request_unshare(&completer_addr, tx);
- break;
- default:
- ret = -EINVAL;
- }
+ u64 cur, start = ALIGN_DOWN((u64)from, PAGE_SIZE);
+ u64 end = PAGE_ALIGN((u64)to);
- if (ret)
- return ret;
+ host_lock_component();
+ hyp_lock_component();
- switch (tx->completer.id) {
- case PKVM_ID_HYP:
- ret = hyp_ack_unshare(completer_addr, tx);
- break;
- case PKVM_ID_FFA:
- /* See check_share() */
- ret = 0;
- break;
- default:
- ret = -EINVAL;
- }
+ for (cur = start; cur < end; cur += PAGE_SIZE)
+ hyp_page_ref_dec(hyp_virt_to_page(cur));
- return ret;
+ hyp_unlock_component();
+ host_unlock_component();
}
-static int __do_unshare(struct pkvm_mem_share *share)
+int __pkvm_host_share_ffa(u64 pfn, u64 nr_pages)
{
- const struct pkvm_mem_transition *tx = &share->tx;
- u64 completer_addr;
+ u64 phys = hyp_pfn_to_phys(pfn);
+ u64 size = PAGE_SIZE * nr_pages;
int ret;
- switch (tx->initiator.id) {
- case PKVM_ID_HOST:
- ret = host_initiate_unshare(&completer_addr, tx);
- break;
- default:
- ret = -EINVAL;
- }
-
- if (ret)
- return ret;
-
- switch (tx->completer.id) {
- case PKVM_ID_HYP:
- ret = hyp_complete_unshare(completer_addr, tx);
- break;
- case PKVM_ID_FFA:
- /* See __do_share() */
- ret = 0;
- break;
- default:
- ret = -EINVAL;
- }
+ host_lock_component();
+ ret = __host_check_page_state_range(phys, size, PKVM_PAGE_OWNED);
+ if (!ret)
+ ret = __host_set_page_state_range(phys, size, PKVM_PAGE_SHARED_OWNED);
+ host_unlock_component();
return ret;
}
-/*
- * do_unshare():
- *
- * The page owner revokes access from another component for a range of
- * pages which were previously shared using do_share().
- *
- * Initiator: SHARED_OWNED => OWNED
- * Completer: SHARED_BORROWED => NOPAGE
- */
-static int do_unshare(struct pkvm_mem_share *share)
+int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages)
{
+ u64 phys = hyp_pfn_to_phys(pfn);
+ u64 size = PAGE_SIZE * nr_pages;
int ret;
- ret = check_unshare(share);
- if (ret)
- return ret;
+ host_lock_component();
+ ret = __host_check_page_state_range(phys, size, PKVM_PAGE_SHARED_OWNED);
+ if (!ret)
+ ret = __host_set_page_state_range(phys, size, PKVM_PAGE_OWNED);
+ host_unlock_component();
- return WARN_ON(__do_unshare(share));
+ return ret;
}
-static int check_donation(struct pkvm_mem_donation *donation)
+int __pkvm_host_share_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu,
+ enum kvm_pgtable_prot prot)
{
- const struct pkvm_mem_transition *tx = &donation->tx;
- u64 completer_addr;
+ struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu);
+ u64 phys = hyp_pfn_to_phys(pfn);
+ u64 ipa = hyp_pfn_to_phys(gfn);
+ struct hyp_page *page;
int ret;
- switch (tx->initiator.id) {
- case PKVM_ID_HOST:
- ret = host_request_owned_transition(&completer_addr, tx);
- break;
- case PKVM_ID_HYP:
- ret = hyp_request_donation(&completer_addr, tx);
- break;
- default:
- ret = -EINVAL;
- }
+ if (prot & ~KVM_PGTABLE_PROT_RWX)
+ return -EINVAL;
+ ret = check_range_allowed_memory(phys, phys + PAGE_SIZE);
if (ret)
return ret;
- switch (tx->completer.id) {
- case PKVM_ID_HOST:
- ret = host_ack_donation(completer_addr, tx);
- break;
- case PKVM_ID_HYP:
- ret = hyp_ack_donation(completer_addr, tx);
- break;
- default:
- ret = -EINVAL;
- }
-
- return ret;
-}
+ host_lock_component();
+ guest_lock_component(vm);
-static int __do_donate(struct pkvm_mem_donation *donation)
-{
- const struct pkvm_mem_transition *tx = &donation->tx;
- u64 completer_addr;
- int ret;
+ ret = __guest_check_page_state_range(vcpu, ipa, PAGE_SIZE, PKVM_NOPAGE);
+ if (ret)
+ goto unlock;
- switch (tx->initiator.id) {
- case PKVM_ID_HOST:
- ret = host_initiate_donation(&completer_addr, tx);
- break;
- case PKVM_ID_HYP:
- ret = hyp_initiate_donation(&completer_addr, tx);
+ page = hyp_phys_to_page(phys);
+ switch (page->host_state) {
+ case PKVM_PAGE_OWNED:
+ WARN_ON(__host_set_page_state_range(phys, PAGE_SIZE, PKVM_PAGE_SHARED_OWNED));
break;
+ case PKVM_PAGE_SHARED_OWNED:
+ if (page->host_share_guest_count)
+ break;
+ /* Only host to np-guest multi-sharing is tolerated */
+ WARN_ON(1);
+ fallthrough;
default:
- ret = -EINVAL;
+ ret = -EPERM;
+ goto unlock;
}
- if (ret)
- return ret;
+ WARN_ON(kvm_pgtable_stage2_map(&vm->pgt, ipa, PAGE_SIZE, phys,
+ pkvm_mkstate(prot, PKVM_PAGE_SHARED_BORROWED),
+ &vcpu->vcpu.arch.pkvm_memcache, 0));
+ page->host_share_guest_count++;
- switch (tx->completer.id) {
- case PKVM_ID_HOST:
- ret = host_complete_donation(completer_addr, tx);
- break;
- case PKVM_ID_HYP:
- ret = hyp_complete_donation(completer_addr, tx);
- break;
- default:
- ret = -EINVAL;
- }
+unlock:
+ guest_unlock_component(vm);
+ host_unlock_component();
return ret;
}
-/*
- * do_donate():
- *
- * The page owner transfers ownership to another component, losing access
- * as a consequence.
- *
- * Initiator: OWNED => NOPAGE
- * Completer: NOPAGE => OWNED
- */
-static int do_donate(struct pkvm_mem_donation *donation)
+static int __check_host_shared_guest(struct pkvm_hyp_vm *vm, u64 *__phys, u64 ipa)
{
+ enum pkvm_page_state state;
+ struct hyp_page *page;
+ kvm_pte_t pte;
+ u64 phys;
+ s8 level;
int ret;
- ret = check_donation(donation);
+ ret = kvm_pgtable_get_leaf(&vm->pgt, ipa, &pte, &level);
if (ret)
return ret;
+ if (!kvm_pte_valid(pte))
+ return -ENOENT;
+ if (level != KVM_PGTABLE_LAST_LEVEL)
+ return -E2BIG;
- return WARN_ON(__do_donate(donation));
-}
-
-int __pkvm_host_share_hyp(u64 pfn)
-{
- int ret;
- u64 host_addr = hyp_pfn_to_phys(pfn);
- u64 hyp_addr = (u64)__hyp_va(host_addr);
- struct pkvm_mem_share share = {
- .tx = {
- .nr_pages = 1,
- .initiator = {
- .id = PKVM_ID_HOST,
- .addr = host_addr,
- .host = {
- .completer_addr = hyp_addr,
- },
- },
- .completer = {
- .id = PKVM_ID_HYP,
- },
- },
- .completer_prot = PAGE_HYP,
- };
+ state = guest_get_page_state(pte, ipa);
+ if (state != PKVM_PAGE_SHARED_BORROWED)
+ return -EPERM;
- host_lock_component();
- hyp_lock_component();
+ phys = kvm_pte_to_phys(pte);
+ ret = check_range_allowed_memory(phys, phys + PAGE_SIZE);
+ if (WARN_ON(ret))
+ return ret;
- ret = do_share(&share);
+ page = hyp_phys_to_page(phys);
+ if (page->host_state != PKVM_PAGE_SHARED_OWNED)
+ return -EPERM;
+ if (WARN_ON(!page->host_share_guest_count))
+ return -EINVAL;
- hyp_unlock_component();
- host_unlock_component();
+ *__phys = phys;
- return ret;
+ return 0;
}
-int __pkvm_host_unshare_hyp(u64 pfn)
+int __pkvm_host_unshare_guest(u64 gfn, struct pkvm_hyp_vm *vm)
{
+ u64 ipa = hyp_pfn_to_phys(gfn);
+ struct hyp_page *page;
+ u64 phys;
int ret;
- u64 host_addr = hyp_pfn_to_phys(pfn);
- u64 hyp_addr = (u64)__hyp_va(host_addr);
- struct pkvm_mem_share share = {
- .tx = {
- .nr_pages = 1,
- .initiator = {
- .id = PKVM_ID_HOST,
- .addr = host_addr,
- .host = {
- .completer_addr = hyp_addr,
- },
- },
- .completer = {
- .id = PKVM_ID_HYP,
- },
- },
- .completer_prot = PAGE_HYP,
- };
host_lock_component();
- hyp_lock_component();
-
- ret = do_unshare(&share);
-
- hyp_unlock_component();
- host_unlock_component();
-
- return ret;
-}
+ guest_lock_component(vm);
-int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages)
-{
- int ret;
- u64 host_addr = hyp_pfn_to_phys(pfn);
- u64 hyp_addr = (u64)__hyp_va(host_addr);
- struct pkvm_mem_donation donation = {
- .tx = {
- .nr_pages = nr_pages,
- .initiator = {
- .id = PKVM_ID_HOST,
- .addr = host_addr,
- .host = {
- .completer_addr = hyp_addr,
- },
- },
- .completer = {
- .id = PKVM_ID_HYP,
- },
- },
- };
+ ret = __check_host_shared_guest(vm, &phys, ipa);
+ if (ret)
+ goto unlock;
- host_lock_component();
- hyp_lock_component();
+ ret = kvm_pgtable_stage2_unmap(&vm->pgt, ipa, PAGE_SIZE);
+ if (ret)
+ goto unlock;
- ret = do_donate(&donation);
+ page = hyp_phys_to_page(phys);
+ page->host_share_guest_count--;
+ if (!page->host_share_guest_count)
+ WARN_ON(__host_set_page_state_range(phys, PAGE_SIZE, PKVM_PAGE_OWNED));
- hyp_unlock_component();
+unlock:
+ guest_unlock_component(vm);
host_unlock_component();
return ret;
}
-int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages)
+static void assert_host_shared_guest(struct pkvm_hyp_vm *vm, u64 ipa)
{
+ u64 phys;
int ret;
- u64 host_addr = hyp_pfn_to_phys(pfn);
- u64 hyp_addr = (u64)__hyp_va(host_addr);
- struct pkvm_mem_donation donation = {
- .tx = {
- .nr_pages = nr_pages,
- .initiator = {
- .id = PKVM_ID_HYP,
- .addr = hyp_addr,
- .hyp = {
- .completer_addr = host_addr,
- },
- },
- .completer = {
- .id = PKVM_ID_HOST,
- },
- },
- };
+
+ if (!IS_ENABLED(CONFIG_NVHE_EL2_DEBUG))
+ return;
host_lock_component();
- hyp_lock_component();
+ guest_lock_component(vm);
- ret = do_donate(&donation);
+ ret = __check_host_shared_guest(vm, &phys, ipa);
- hyp_unlock_component();
+ guest_unlock_component(vm);
host_unlock_component();
- return ret;
+ WARN_ON(ret && ret != -ENOENT);
}
-int hyp_pin_shared_mem(void *from, void *to)
+int __pkvm_host_relax_perms_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu, enum kvm_pgtable_prot prot)
{
- u64 cur, start = ALIGN_DOWN((u64)from, PAGE_SIZE);
- u64 end = PAGE_ALIGN((u64)to);
- u64 size = end - start;
+ struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu);
+ u64 ipa = hyp_pfn_to_phys(gfn);
int ret;
- host_lock_component();
- hyp_lock_component();
-
- ret = __host_check_page_state_range(__hyp_pa(start), size,
- PKVM_PAGE_SHARED_OWNED);
- if (ret)
- goto unlock;
-
- ret = __hyp_check_page_state_range(start, size,
- PKVM_PAGE_SHARED_BORROWED);
- if (ret)
- goto unlock;
+ if (pkvm_hyp_vm_is_protected(vm))
+ return -EPERM;
- for (cur = start; cur < end; cur += PAGE_SIZE)
- hyp_page_ref_inc(hyp_virt_to_page(cur));
+ if (prot & ~KVM_PGTABLE_PROT_RWX)
+ return -EINVAL;
-unlock:
- hyp_unlock_component();
- host_unlock_component();
+ assert_host_shared_guest(vm, ipa);
+ guest_lock_component(vm);
+ ret = kvm_pgtable_stage2_relax_perms(&vm->pgt, ipa, prot, 0);
+ guest_unlock_component(vm);
return ret;
}
-void hyp_unpin_shared_mem(void *from, void *to)
+int __pkvm_host_wrprotect_guest(u64 gfn, struct pkvm_hyp_vm *vm)
{
- u64 cur, start = ALIGN_DOWN((u64)from, PAGE_SIZE);
- u64 end = PAGE_ALIGN((u64)to);
+ u64 ipa = hyp_pfn_to_phys(gfn);
+ int ret;
- host_lock_component();
- hyp_lock_component();
+ if (pkvm_hyp_vm_is_protected(vm))
+ return -EPERM;
- for (cur = start; cur < end; cur += PAGE_SIZE)
- hyp_page_ref_dec(hyp_virt_to_page(cur));
+ assert_host_shared_guest(vm, ipa);
+ guest_lock_component(vm);
+ ret = kvm_pgtable_stage2_wrprotect(&vm->pgt, ipa, PAGE_SIZE);
+ guest_unlock_component(vm);
- hyp_unlock_component();
- host_unlock_component();
+ return ret;
}
-int __pkvm_host_share_ffa(u64 pfn, u64 nr_pages)
+int __pkvm_host_test_clear_young_guest(u64 gfn, bool mkold, struct pkvm_hyp_vm *vm)
{
+ u64 ipa = hyp_pfn_to_phys(gfn);
int ret;
- struct pkvm_mem_share share = {
- .tx = {
- .nr_pages = nr_pages,
- .initiator = {
- .id = PKVM_ID_HOST,
- .addr = hyp_pfn_to_phys(pfn),
- },
- .completer = {
- .id = PKVM_ID_FFA,
- },
- },
- };
- host_lock_component();
- ret = do_share(&share);
- host_unlock_component();
+ if (pkvm_hyp_vm_is_protected(vm))
+ return -EPERM;
+
+ assert_host_shared_guest(vm, ipa);
+ guest_lock_component(vm);
+ ret = kvm_pgtable_stage2_test_clear_young(&vm->pgt, ipa, PAGE_SIZE, mkold);
+ guest_unlock_component(vm);
return ret;
}
-int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages)
+int __pkvm_host_mkyoung_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu)
{
- int ret;
- struct pkvm_mem_share share = {
- .tx = {
- .nr_pages = nr_pages,
- .initiator = {
- .id = PKVM_ID_HOST,
- .addr = hyp_pfn_to_phys(pfn),
- },
- .completer = {
- .id = PKVM_ID_FFA,
- },
- },
- };
+ struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu);
+ u64 ipa = hyp_pfn_to_phys(gfn);
- host_lock_component();
- ret = do_unshare(&share);
- host_unlock_component();
+ if (pkvm_hyp_vm_is_protected(vm))
+ return -EPERM;
- return ret;
+ assert_host_shared_guest(vm, ipa);
+ guest_lock_component(vm);
+ kvm_pgtable_stage2_mkyoung(&vm->pgt, ipa, 0);
+ guest_unlock_component(vm);
+
+ return 0;
}
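
The mem_protect.c rewrite above drops the generic pkvm_mem_transition plumbing in favour of direct helpers, and tracks the host's view of each page in the vmemmap (host_state) together with a host_share_guest_count so the same page may be shared with several non-protected guests. Below is a toy model of that bookkeeping, with simplified state names and no locking; it only shows the intended transitions, not the kernel's data structures.

/* Toy model of per-page host_state plus a share-with-guest refcount.
 * State names and fields mirror the idea only; not the kernel ABI. */
#include <assert.h>
#include <stdio.h>

enum page_state { PAGE_OWNED, PAGE_SHARED_OWNED, PAGE_NOPAGE };

struct page_meta {
	enum page_state host_state;
	unsigned int host_share_guest_count;
};

static int host_share_guest(struct page_meta *p)
{
	switch (p->host_state) {
	case PAGE_OWNED:		/* first share: host keeps ownership */
		p->host_state = PAGE_SHARED_OWNED;
		break;
	case PAGE_SHARED_OWNED:		/* already shared with another guest? */
		if (!p->host_share_guest_count)
			return -1;	/* shared with hyp/FF-A instead: refuse */
		break;
	default:
		return -1;
	}
	p->host_share_guest_count++;
	return 0;
}

static int host_unshare_guest(struct page_meta *p)
{
	if (p->host_state != PAGE_SHARED_OWNED || !p->host_share_guest_count)
		return -1;
	if (!--p->host_share_guest_count)	/* last borrower gone */
		p->host_state = PAGE_OWNED;
	return 0;
}

int main(void)
{
	struct page_meta pg = { .host_state = PAGE_OWNED };

	assert(!host_share_guest(&pg));
	assert(!host_share_guest(&pg));	/* host to np-guest multi-share is fine */
	assert(!host_unshare_guest(&pg));
	assert(!host_unshare_guest(&pg));
	assert(pg.host_state == PAGE_OWNED);
	printf("page returned to host-owned\n");
	return 0;
}
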
diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c
index 8850b591d775..f41c7440b34b 100644
--- a/arch/arm64/kvm/hyp/nvhe/mm.c
+++ b/arch/arm64/kvm/hyp/nvhe/mm.c
@@ -360,10 +360,10 @@ int pkvm_create_stack(phys_addr_t phys, unsigned long *haddr)
prev_base = __io_map_base;
/*
- * Efficient stack verification using the PAGE_SHIFT bit implies
+ * Efficient stack verification using the NVHE_STACK_SHIFT bit implies
* an alignment of our allocation on the order of the size.
*/
- size = PAGE_SIZE * 2;
+ size = NVHE_STACK_SIZE * 2;
addr = ALIGN(__io_map_base, size);
ret = __pkvm_alloc_private_va_range(addr, size);
@@ -373,12 +373,12 @@ int pkvm_create_stack(phys_addr_t phys, unsigned long *haddr)
* at the higher address and leave the lower guard page
* unbacked.
*
- * Any valid stack address now has the PAGE_SHIFT bit as 1
+ * Any valid stack address now has the NVHE_STACK_SHIFT bit as 1
* and addresses corresponding to the guard page have the
- * PAGE_SHIFT bit as 0 - this is used for overflow detection.
+ * NVHE_STACK_SHIFT bit as 0 - this is used for overflow detection.
*/
- ret = kvm_pgtable_hyp_map(&pkvm_pgtable, addr + PAGE_SIZE,
- PAGE_SIZE, phys, PAGE_HYP);
+ ret = kvm_pgtable_hyp_map(&pkvm_pgtable, addr + NVHE_STACK_SIZE,
+ NVHE_STACK_SIZE, phys, PAGE_HYP);
if (ret)
__io_map_base = prev_base;
}
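
The mm.c hunk keeps the existing guard-page trick for the hypervisor stacks and only generalises it from PAGE_SIZE to NVHE_STACK_SIZE: the private VA range is twice the stack size and aligned to it, the stack sits in the upper half, and a single address bit then distinguishes valid stack addresses from the unbacked guard region. A small sketch of that check is below; the 16K stack size is an assumption for illustration only.

/* Guard-page overflow detection via one address bit.
 * NVHE_STACK_SHIFT is an assumed value for the sketch, not a kernel config. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NVHE_STACK_SHIFT	14ULL		/* assume 16K stacks */
#define NVHE_STACK_SIZE		(1ULL << NVHE_STACK_SHIFT)

/*
 * The private VA range is 2 * NVHE_STACK_SIZE and aligned to its size:
 *   [base, base + SIZE)          guard page(s), unbacked, bit == 0
 *   [base + SIZE, base + 2*SIZE) the mapped stack,        bit == 1
 */
static bool hyp_stack_addr_is_valid(uint64_t sp)
{
	return sp & NVHE_STACK_SIZE;
}

int main(void)
{
	uint64_t base = 0x100000000ULL;			/* already 2*SIZE aligned */
	uint64_t sp_ok = base + 2 * NVHE_STACK_SIZE - 16;	/* inside the stack */
	uint64_t sp_bad = base + NVHE_STACK_SIZE - 16;	/* spilled into the guard */

	printf("in-stack address valid:   %d\n", hyp_stack_addr_is_valid(sp_ok));
	printf("overflowed address valid: %d\n", hyp_stack_addr_is_valid(sp_bad));
	return 0;
}
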
diff --git a/arch/arm64/kvm/hyp/nvhe/page_alloc.c b/arch/arm64/kvm/hyp/nvhe/page_alloc.c
index e691290d3765..a1eb27a1a747 100644
--- a/arch/arm64/kvm/hyp/nvhe/page_alloc.c
+++ b/arch/arm64/kvm/hyp/nvhe/page_alloc.c
@@ -32,7 +32,7 @@ u64 __hyp_vmemmap;
*/
static struct hyp_page *__find_buddy_nocheck(struct hyp_pool *pool,
struct hyp_page *p,
- unsigned short order)
+ u8 order)
{
phys_addr_t addr = hyp_page_to_phys(p);
@@ -51,7 +51,7 @@ static struct hyp_page *__find_buddy_nocheck(struct hyp_pool *pool,
/* Find a buddy page currently available for allocation */
static struct hyp_page *__find_buddy_avail(struct hyp_pool *pool,
struct hyp_page *p,
- unsigned short order)
+ u8 order)
{
struct hyp_page *buddy = __find_buddy_nocheck(pool, p, order);
@@ -94,7 +94,7 @@ static void __hyp_attach_page(struct hyp_pool *pool,
struct hyp_page *p)
{
phys_addr_t phys = hyp_page_to_phys(p);
- unsigned short order = p->order;
+ u8 order = p->order;
struct hyp_page *buddy;
memset(hyp_page_to_virt(p), 0, PAGE_SIZE << p->order);
@@ -129,7 +129,7 @@ insert:
static struct hyp_page *__hyp_extract_page(struct hyp_pool *pool,
struct hyp_page *p,
- unsigned short order)
+ u8 order)
{
struct hyp_page *buddy;
@@ -183,7 +183,7 @@ void hyp_get_page(struct hyp_pool *pool, void *addr)
void hyp_split_page(struct hyp_page *p)
{
- unsigned short order = p->order;
+ u8 order = p->order;
unsigned int i;
p->order = 0;
@@ -195,10 +195,10 @@ void hyp_split_page(struct hyp_page *p)
}
}
-void *hyp_alloc_pages(struct hyp_pool *pool, unsigned short order)
+void *hyp_alloc_pages(struct hyp_pool *pool, u8 order)
{
- unsigned short i = order;
struct hyp_page *p;
+ u8 i = order;
hyp_spin_lock(&pool->lock);
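
The page_alloc.c hunks only narrow the buddy order stored in the allocator's vmemmap metadata to a u8; the buddy of an order-n block is still located by flipping a single address bit, as in the sketch below (PAGE_SHIFT and the example address are assumptions for illustration).

/* The buddy of an order-n block differs from it in exactly one address bit.
 * PAGE_SHIFT and the example address are assumptions for this sketch. */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1ULL << PAGE_SHIFT)

static uint64_t buddy_of(uint64_t phys, uint8_t order)
{
	return phys ^ (PAGE_SIZE << order);
}

int main(void)
{
	uint64_t phys = 0x80004000ULL;

	for (unsigned int order = 0; order < 4; order++)
		printf("order %u: buddy of %#llx is %#llx\n", order,
		       (unsigned long long)phys,
		       (unsigned long long)buddy_of(phys, (uint8_t)order));
	return 0;
}
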
diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c
index 26dd9a20ad6e..5a335a51deca 100644
--- a/arch/arm64/kvm/hyp/nvhe/pkvm.c
+++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c
@@ -6,7 +6,9 @@
#include <linux/kvm_host.h>
#include <linux/mm.h>
-#include <nvhe/fixed_config.h>
+
+#include <asm/kvm_emulate.h>
+
#include <nvhe/mem_protect.h>
#include <nvhe/memory.h>
#include <nvhe/pkvm.h>
@@ -18,198 +20,170 @@ unsigned long __icache_flags;
/* Used by kvm_get_vttbr(). */
unsigned int kvm_arm_vmid_bits;
+unsigned int kvm_host_sve_max_vl;
+
/*
- * Set trap register values based on features in ID_AA64PFR0.
+ * The currently loaded hyp vCPU for each physical CPU. Used only when
+ * protected KVM is enabled, but for both protected and non-protected VMs.
*/
-static void pvm_init_traps_aa64pfr0(struct kvm_vcpu *vcpu)
-{
- const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64PFR0_EL1);
- u64 hcr_set = HCR_RW;
- u64 hcr_clear = 0;
- u64 cptr_set = 0;
- u64 cptr_clear = 0;
+static DEFINE_PER_CPU(struct pkvm_hyp_vcpu *, loaded_hyp_vcpu);
- /* Protected KVM does not support AArch32 guests. */
- BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL0),
- PVM_ID_AA64PFR0_RESTRICT_UNSIGNED) != ID_AA64PFR0_EL1_ELx_64BIT_ONLY);
- BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL1),
- PVM_ID_AA64PFR0_RESTRICT_UNSIGNED) != ID_AA64PFR0_EL1_ELx_64BIT_ONLY);
-
- /*
- * Linux guests assume support for floating-point and Advanced SIMD. Do
- * not change the trapping behavior for these from the KVM default.
- */
- BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_FP),
- PVM_ID_AA64PFR0_ALLOW));
- BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AdvSIMD),
- PVM_ID_AA64PFR0_ALLOW));
+static void pkvm_vcpu_reset_hcr(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.hcr_el2 = HCR_GUEST_FLAGS;
if (has_hvhe())
- hcr_set |= HCR_E2H;
+ vcpu->arch.hcr_el2 |= HCR_E2H;
- /* Trap RAS unless all current versions are supported */
- if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_RAS), feature_ids) <
- ID_AA64PFR0_EL1_RAS_V1P1) {
- hcr_set |= HCR_TERR | HCR_TEA;
- hcr_clear |= HCR_FIEN;
+ if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) {
+ /* route synchronous external abort exceptions to EL2 */
+ vcpu->arch.hcr_el2 |= HCR_TEA;
+ /* trap error record accesses */
+ vcpu->arch.hcr_el2 |= HCR_TERR;
}
- /* Trap AMU */
- if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AMU), feature_ids)) {
- hcr_clear |= HCR_AMVOFFEN;
- cptr_set |= CPTR_EL2_TAM;
- }
+ if (cpus_have_final_cap(ARM64_HAS_STAGE2_FWB))
+ vcpu->arch.hcr_el2 |= HCR_FWB;
- /* Trap SVE */
- if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE), feature_ids)) {
- if (has_hvhe())
- cptr_clear |= CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN;
- else
- cptr_set |= CPTR_EL2_TZ;
- }
+ if (cpus_have_final_cap(ARM64_HAS_EVT) &&
+ !cpus_have_final_cap(ARM64_MISMATCHED_CACHE_TYPE) &&
+ kvm_read_vm_id_reg(vcpu->kvm, SYS_CTR_EL0) == read_cpuid(CTR_EL0))
+ vcpu->arch.hcr_el2 |= HCR_TID4;
+ else
+ vcpu->arch.hcr_el2 |= HCR_TID2;
+
+ if (vcpu_has_ptrauth(vcpu))
+ vcpu->arch.hcr_el2 |= (HCR_API | HCR_APK);
- vcpu->arch.hcr_el2 |= hcr_set;
- vcpu->arch.hcr_el2 &= ~hcr_clear;
- vcpu->arch.cptr_el2 |= cptr_set;
- vcpu->arch.cptr_el2 &= ~cptr_clear;
+ if (kvm_has_mte(vcpu->kvm))
+ vcpu->arch.hcr_el2 |= HCR_ATA;
}
-/*
- * Set trap register values based on features in ID_AA64PFR1.
- */
-static void pvm_init_traps_aa64pfr1(struct kvm_vcpu *vcpu)
+static void pvm_init_traps_hcr(struct kvm_vcpu *vcpu)
{
- const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64PFR1_EL1);
- u64 hcr_set = 0;
- u64 hcr_clear = 0;
+ struct kvm *kvm = vcpu->kvm;
+ u64 val = vcpu->arch.hcr_el2;
+
+ /* No support for AArch32. */
+ val |= HCR_RW;
+
+ /*
+ * Always trap:
+ * - Feature id registers: to control features exposed to guests
+ * - Implementation-defined features
+ */
+ val |= HCR_TACR | HCR_TIDCP | HCR_TID3 | HCR_TID1;
- /* Memory Tagging: Trap and Treat as Untagged if not supported. */
- if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE), feature_ids)) {
- hcr_set |= HCR_TID5;
- hcr_clear |= HCR_DCT | HCR_ATA;
+ if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, RAS, IMP)) {
+ val |= HCR_TERR | HCR_TEA;
+ val &= ~(HCR_FIEN);
}
- vcpu->arch.hcr_el2 |= hcr_set;
- vcpu->arch.hcr_el2 &= ~hcr_clear;
-}
+ if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, AMU, IMP))
+ val &= ~(HCR_AMVOFFEN);
-/*
- * Set trap register values based on features in ID_AA64DFR0.
- */
-static void pvm_init_traps_aa64dfr0(struct kvm_vcpu *vcpu)
-{
- const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64DFR0_EL1);
- u64 mdcr_set = 0;
- u64 mdcr_clear = 0;
- u64 cptr_set = 0;
-
- /* Trap/constrain PMU */
- if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), feature_ids)) {
- mdcr_set |= MDCR_EL2_TPM | MDCR_EL2_TPMCR;
- mdcr_clear |= MDCR_EL2_HPME | MDCR_EL2_MTPME |
- MDCR_EL2_HPMN_MASK;
+ if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, MTE, IMP)) {
+ val |= HCR_TID5;
+ val &= ~(HCR_DCT | HCR_ATA);
}
- /* Trap Debug */
- if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DebugVer), feature_ids))
- mdcr_set |= MDCR_EL2_TDRA | MDCR_EL2_TDA | MDCR_EL2_TDE;
+ if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, LO, IMP))
+ val |= HCR_TLOR;
+
+ vcpu->arch.hcr_el2 = val;
+}
- /* Trap OS Double Lock */
- if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DoubleLock), feature_ids))
- mdcr_set |= MDCR_EL2_TDOSA;
+static void pvm_init_traps_mdcr(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+ u64 val = vcpu->arch.mdcr_el2;
- /* Trap SPE */
- if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMSVer), feature_ids)) {
- mdcr_set |= MDCR_EL2_TPMS;
- mdcr_clear |= MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT;
+ if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMUVer, IMP)) {
+ val |= MDCR_EL2_TPM | MDCR_EL2_TPMCR;
+ val &= ~(MDCR_EL2_HPME | MDCR_EL2_MTPME | MDCR_EL2_HPMN_MASK);
}
- /* Trap Trace Filter */
- if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_TraceFilt), feature_ids))
- mdcr_set |= MDCR_EL2_TTRF;
+ if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, DebugVer, IMP))
+ val |= MDCR_EL2_TDRA | MDCR_EL2_TDA;
+
+ if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, DoubleLock, IMP))
+ val |= MDCR_EL2_TDOSA;
- /* Trap Trace */
- if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_TraceVer), feature_ids)) {
- if (has_hvhe())
- cptr_set |= CPACR_EL1_TTA;
- else
- cptr_set |= CPTR_EL2_TTA;
+ if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMSVer, IMP)) {
+ val |= MDCR_EL2_TPMS;
+ val &= ~MDCR_EL2_E2PB_MASK;
}
- /* Trap External Trace */
- if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_ExtTrcBuff), feature_ids))
- mdcr_clear |= MDCR_EL2_E2TB_MASK << MDCR_EL2_E2TB_SHIFT;
+ if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceFilt, IMP))
+ val |= MDCR_EL2_TTRF;
- vcpu->arch.mdcr_el2 |= mdcr_set;
- vcpu->arch.mdcr_el2 &= ~mdcr_clear;
- vcpu->arch.cptr_el2 |= cptr_set;
-}
-
-/*
- * Set trap register values based on features in ID_AA64MMFR0.
- */
-static void pvm_init_traps_aa64mmfr0(struct kvm_vcpu *vcpu)
-{
- const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64MMFR0_EL1);
- u64 mdcr_set = 0;
+ if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, ExtTrcBuff, IMP))
+ val |= MDCR_EL2_E2TB_MASK;
/* Trap Debug Communications Channel registers */
- if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_FGT), feature_ids))
- mdcr_set |= MDCR_EL2_TDCC;
+ if (!kvm_has_feat(kvm, ID_AA64MMFR0_EL1, FGT, IMP))
+ val |= MDCR_EL2_TDCC;
- vcpu->arch.mdcr_el2 |= mdcr_set;
+ vcpu->arch.mdcr_el2 = val;
}
/*
- * Set trap register values based on features in ID_AA64MMFR1.
+ * Check that cpu features that are neither trapped nor supported are not
+ * enabled for protected VMs.
*/
-static void pvm_init_traps_aa64mmfr1(struct kvm_vcpu *vcpu)
+static int pkvm_check_pvm_cpu_features(struct kvm_vcpu *vcpu)
{
- const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64MMFR1_EL1);
- u64 hcr_set = 0;
+ struct kvm *kvm = vcpu->kvm;
+
+ /* Protected KVM does not support AArch32 guests. */
+ if (kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL0, AARCH32) ||
+ kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL1, AARCH32))
+ return -EINVAL;
- /* Trap LOR */
- if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_LO), feature_ids))
- hcr_set |= HCR_TLOR;
+ /*
+ * Linux guests assume support for floating-point and Advanced SIMD. Do
+ * not change the trapping behavior for these from the KVM default.
+ */
+ if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, FP, IMP) ||
+ !kvm_has_feat(kvm, ID_AA64PFR0_EL1, AdvSIMD, IMP))
+ return -EINVAL;
+
+ /* No SME support in KVM right now. Check to catch if it changes. */
+ if (kvm_has_feat(kvm, ID_AA64PFR1_EL1, SME, IMP))
+ return -EINVAL;
- vcpu->arch.hcr_el2 |= hcr_set;
+ return 0;
}
/*
- * Set baseline trap register values.
+ * Initialize trap register values in protected mode.
*/
-static void pvm_init_trap_regs(struct kvm_vcpu *vcpu)
+static int pkvm_vcpu_init_traps(struct pkvm_hyp_vcpu *hyp_vcpu)
{
- const u64 hcr_trap_feat_regs = HCR_TID3;
- const u64 hcr_trap_impdef = HCR_TACR | HCR_TIDCP | HCR_TID1;
+ struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu;
+ int ret;
- /*
- * Always trap:
- * - Feature id registers: to control features exposed to guests
- * - Implementation-defined features
- */
- vcpu->arch.hcr_el2 |= hcr_trap_feat_regs | hcr_trap_impdef;
-
- /* Clear res0 and set res1 bits to trap potential new features. */
- vcpu->arch.hcr_el2 &= ~(HCR_RES0);
- vcpu->arch.mdcr_el2 &= ~(MDCR_EL2_RES0);
- if (!has_hvhe()) {
- vcpu->arch.cptr_el2 |= CPTR_NVHE_EL2_RES1;
- vcpu->arch.cptr_el2 &= ~(CPTR_NVHE_EL2_RES0);
+ vcpu->arch.mdcr_el2 = 0;
+
+ pkvm_vcpu_reset_hcr(vcpu);
+
+ if ((!pkvm_hyp_vcpu_is_protected(hyp_vcpu))) {
+ struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu;
+
+ /* Trust the host for non-protected vcpu features. */
+ vcpu->arch.hcrx_el2 = host_vcpu->arch.hcrx_el2;
+ return 0;
}
-}
-/*
- * Initialize trap register values for protected VMs.
- */
-void __pkvm_vcpu_init_traps(struct kvm_vcpu *vcpu)
-{
- pvm_init_trap_regs(vcpu);
- pvm_init_traps_aa64pfr0(vcpu);
- pvm_init_traps_aa64pfr1(vcpu);
- pvm_init_traps_aa64dfr0(vcpu);
- pvm_init_traps_aa64mmfr0(vcpu);
- pvm_init_traps_aa64mmfr1(vcpu);
+ ret = pkvm_check_pvm_cpu_features(vcpu);
+ if (ret)
+ return ret;
+
+ pvm_init_traps_hcr(vcpu);
+ pvm_init_traps_mdcr(vcpu);
+ vcpu_set_hcrx(vcpu);
+
+ return 0;
}
/*
@@ -230,10 +204,10 @@ static pkvm_handle_t idx_to_vm_handle(unsigned int idx)
/*
* Spinlock for protecting state related to the VM table. Protects writes
- * to 'vm_table' and 'nr_table_entries' as well as reads and writes to
- * 'last_hyp_vcpu_lookup'.
+ * to 'vm_table', 'nr_table_entries', and other per-vm state on initialization.
+ * Also protects reads and writes to 'last_hyp_vcpu_lookup'.
*/
-static DEFINE_HYP_SPINLOCK(vm_table_lock);
+DEFINE_HYP_SPINLOCK(vm_table_lock);
/*
* The table of VM entries for protected VMs in hyp.
@@ -266,15 +240,32 @@ struct pkvm_hyp_vcpu *pkvm_load_hyp_vcpu(pkvm_handle_t handle,
struct pkvm_hyp_vcpu *hyp_vcpu = NULL;
struct pkvm_hyp_vm *hyp_vm;
+ /* Cannot load a new vcpu without putting the old one first. */
+ if (__this_cpu_read(loaded_hyp_vcpu))
+ return NULL;
+
hyp_spin_lock(&vm_table_lock);
hyp_vm = get_vm_by_handle(handle);
- if (!hyp_vm || hyp_vm->nr_vcpus <= vcpu_idx)
+ if (!hyp_vm || hyp_vm->kvm.created_vcpus <= vcpu_idx)
goto unlock;
hyp_vcpu = hyp_vm->vcpus[vcpu_idx];
+ if (!hyp_vcpu)
+ goto unlock;
+
+ /* Ensure vcpu isn't loaded on more than one cpu simultaneously. */
+ if (unlikely(hyp_vcpu->loaded_hyp_vcpu)) {
+ hyp_vcpu = NULL;
+ goto unlock;
+ }
+
+ hyp_vcpu->loaded_hyp_vcpu = this_cpu_ptr(&loaded_hyp_vcpu);
hyp_page_ref_inc(hyp_virt_to_page(hyp_vm));
unlock:
hyp_spin_unlock(&vm_table_lock);
+
+ if (hyp_vcpu)
+ __this_cpu_write(loaded_hyp_vcpu, hyp_vcpu);
return hyp_vcpu;
}
@@ -283,10 +274,98 @@ void pkvm_put_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu)
struct pkvm_hyp_vm *hyp_vm = pkvm_hyp_vcpu_to_hyp_vm(hyp_vcpu);
hyp_spin_lock(&vm_table_lock);
+ hyp_vcpu->loaded_hyp_vcpu = NULL;
+ __this_cpu_write(loaded_hyp_vcpu, NULL);
+ hyp_page_ref_dec(hyp_virt_to_page(hyp_vm));
+ hyp_spin_unlock(&vm_table_lock);
+}
+
+struct pkvm_hyp_vcpu *pkvm_get_loaded_hyp_vcpu(void)
+{
+ return __this_cpu_read(loaded_hyp_vcpu);
+
+}
+
+struct pkvm_hyp_vm *get_pkvm_hyp_vm(pkvm_handle_t handle)
+{
+ struct pkvm_hyp_vm *hyp_vm;
+
+ hyp_spin_lock(&vm_table_lock);
+ hyp_vm = get_vm_by_handle(handle);
+ if (hyp_vm)
+ hyp_page_ref_inc(hyp_virt_to_page(hyp_vm));
+ hyp_spin_unlock(&vm_table_lock);
+
+ return hyp_vm;
+}
+
+void put_pkvm_hyp_vm(struct pkvm_hyp_vm *hyp_vm)
+{
+ hyp_spin_lock(&vm_table_lock);
hyp_page_ref_dec(hyp_virt_to_page(hyp_vm));
hyp_spin_unlock(&vm_table_lock);
}
+struct pkvm_hyp_vm *get_np_pkvm_hyp_vm(pkvm_handle_t handle)
+{
+ struct pkvm_hyp_vm *hyp_vm = get_pkvm_hyp_vm(handle);
+
+ if (hyp_vm && pkvm_hyp_vm_is_protected(hyp_vm)) {
+ put_pkvm_hyp_vm(hyp_vm);
+ hyp_vm = NULL;
+ }
+
+ return hyp_vm;
+}
+
+static void pkvm_init_features_from_host(struct pkvm_hyp_vm *hyp_vm, const struct kvm *host_kvm)
+{
+ struct kvm *kvm = &hyp_vm->kvm;
+ unsigned long host_arch_flags = READ_ONCE(host_kvm->arch.flags);
+ DECLARE_BITMAP(allowed_features, KVM_VCPU_MAX_FEATURES);
+
+ /* CTR_EL0 is always under host control, even for protected VMs. */
+ hyp_vm->kvm.arch.ctr_el0 = host_kvm->arch.ctr_el0;
+
+ if (test_bit(KVM_ARCH_FLAG_MTE_ENABLED, &host_kvm->arch.flags))
+ set_bit(KVM_ARCH_FLAG_MTE_ENABLED, &kvm->arch.flags);
+
+ /* No restrictions for non-protected VMs. */
+ if (!kvm_vm_is_protected(kvm)) {
+ hyp_vm->kvm.arch.flags = host_arch_flags;
+
+ bitmap_copy(kvm->arch.vcpu_features,
+ host_kvm->arch.vcpu_features,
+ KVM_VCPU_MAX_FEATURES);
+
+ if (test_bit(KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS, &host_arch_flags))
+ hyp_vm->kvm.arch.midr_el1 = host_kvm->arch.midr_el1;
+
+ return;
+ }
+
+ bitmap_zero(allowed_features, KVM_VCPU_MAX_FEATURES);
+
+ set_bit(KVM_ARM_VCPU_PSCI_0_2, allowed_features);
+
+ if (kvm_pvm_ext_allowed(KVM_CAP_ARM_PMU_V3))
+ set_bit(KVM_ARM_VCPU_PMU_V3, allowed_features);
+
+ if (kvm_pvm_ext_allowed(KVM_CAP_ARM_PTRAUTH_ADDRESS))
+ set_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, allowed_features);
+
+ if (kvm_pvm_ext_allowed(KVM_CAP_ARM_PTRAUTH_GENERIC))
+ set_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, allowed_features);
+
+ if (kvm_pvm_ext_allowed(KVM_CAP_ARM_SVE)) {
+ set_bit(KVM_ARM_VCPU_SVE, allowed_features);
+ kvm->arch.flags |= host_arch_flags & BIT(KVM_ARCH_FLAG_GUEST_HAS_SVE);
+ }
+
+ bitmap_and(kvm->arch.vcpu_features, host_kvm->arch.vcpu_features,
+ allowed_features, KVM_VCPU_MAX_FEATURES);
+}
+
static void unpin_host_vcpu(struct kvm_vcpu *host_vcpu)
{
if (host_vcpu)
@@ -298,8 +377,14 @@ static void unpin_host_vcpus(struct pkvm_hyp_vcpu *hyp_vcpus[],
{
int i;
- for (i = 0; i < nr_vcpus; i++)
- unpin_host_vcpu(hyp_vcpus[i]->host_vcpu);
+ for (i = 0; i < nr_vcpus; i++) {
+ struct pkvm_hyp_vcpu *hyp_vcpu = hyp_vcpus[i];
+
+ if (!hyp_vcpu)
+ continue;
+
+ unpin_host_vcpu(hyp_vcpu->host_vcpu);
+ }
}
static void init_pkvm_hyp_vm(struct kvm *host_kvm, struct pkvm_hyp_vm *hyp_vm,
@@ -308,31 +393,46 @@ static void init_pkvm_hyp_vm(struct kvm *host_kvm, struct pkvm_hyp_vm *hyp_vm,
hyp_vm->host_kvm = host_kvm;
hyp_vm->kvm.created_vcpus = nr_vcpus;
hyp_vm->kvm.arch.mmu.vtcr = host_mmu.arch.mmu.vtcr;
+ hyp_vm->kvm.arch.pkvm.enabled = READ_ONCE(host_kvm->arch.pkvm.enabled);
+ hyp_vm->kvm.arch.flags = 0;
+ pkvm_init_features_from_host(hyp_vm, host_kvm);
+}
+
+static void pkvm_vcpu_init_sve(struct pkvm_hyp_vcpu *hyp_vcpu, struct kvm_vcpu *host_vcpu)
+{
+ struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu;
+
+ if (!vcpu_has_feature(vcpu, KVM_ARM_VCPU_SVE))
+ vcpu_clear_flag(vcpu, VCPU_SVE_FINALIZED);
}
static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu,
struct pkvm_hyp_vm *hyp_vm,
- struct kvm_vcpu *host_vcpu,
- unsigned int vcpu_idx)
+ struct kvm_vcpu *host_vcpu)
{
int ret = 0;
if (hyp_pin_shared_mem(host_vcpu, host_vcpu + 1))
return -EBUSY;
- if (host_vcpu->vcpu_idx != vcpu_idx) {
- ret = -EINVAL;
- goto done;
- }
-
hyp_vcpu->host_vcpu = host_vcpu;
hyp_vcpu->vcpu.kvm = &hyp_vm->kvm;
hyp_vcpu->vcpu.vcpu_id = READ_ONCE(host_vcpu->vcpu_id);
- hyp_vcpu->vcpu.vcpu_idx = vcpu_idx;
+ hyp_vcpu->vcpu.vcpu_idx = READ_ONCE(host_vcpu->vcpu_idx);
hyp_vcpu->vcpu.arch.hw_mmu = &hyp_vm->kvm.arch.mmu;
hyp_vcpu->vcpu.arch.cflags = READ_ONCE(host_vcpu->arch.cflags);
+ hyp_vcpu->vcpu.arch.mp_state.mp_state = KVM_MP_STATE_STOPPED;
+
+ if (pkvm_hyp_vcpu_is_protected(hyp_vcpu))
+ kvm_init_pvm_id_regs(&hyp_vcpu->vcpu);
+
+ ret = pkvm_vcpu_init_traps(hyp_vcpu);
+ if (ret)
+ goto done;
+
+ pkvm_vcpu_init_sve(hyp_vcpu, host_vcpu);
done:
if (ret)
unpin_host_vcpu(host_vcpu);
@@ -430,6 +530,7 @@ static void *map_donated_memory(unsigned long host_va, size_t size)
static void __unmap_donated_memory(void *va, size_t size)
{
+ kvm_flush_dcache_to_poc(va, size);
WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(va),
PAGE_ALIGN(size) >> PAGE_SHIFT));
}
@@ -556,24 +657,27 @@ int __pkvm_init_vcpu(pkvm_handle_t handle, struct kvm_vcpu *host_vcpu,
goto unlock;
}
- idx = hyp_vm->nr_vcpus;
+ ret = init_pkvm_hyp_vcpu(hyp_vcpu, hyp_vm, host_vcpu);
+ if (ret)
+ goto unlock;
+
+ idx = hyp_vcpu->vcpu.vcpu_idx;
if (idx >= hyp_vm->kvm.created_vcpus) {
ret = -EINVAL;
goto unlock;
}
- ret = init_pkvm_hyp_vcpu(hyp_vcpu, hyp_vm, host_vcpu, idx);
- if (ret)
+ if (hyp_vm->vcpus[idx]) {
+ ret = -EINVAL;
goto unlock;
+ }
hyp_vm->vcpus[idx] = hyp_vcpu;
- hyp_vm->nr_vcpus++;
unlock:
hyp_spin_unlock(&vm_table_lock);
if (ret)
unmap_donated_memory(hyp_vcpu, sizeof(*hyp_vcpu));
-
return ret;
}
@@ -591,7 +695,7 @@ teardown_donated_memory(struct kvm_hyp_memcache *mc, void *addr, size_t size)
int __pkvm_teardown_vm(pkvm_handle_t handle)
{
- struct kvm_hyp_memcache *mc;
+ struct kvm_hyp_memcache *mc, *stage2_mc;
struct pkvm_hyp_vm *hyp_vm;
struct kvm *host_kvm;
unsigned int idx;
@@ -619,12 +723,26 @@ int __pkvm_teardown_vm(pkvm_handle_t handle)
/* Reclaim guest pages (including page-table pages) */
mc = &host_kvm->arch.pkvm.teardown_mc;
- reclaim_guest_pages(hyp_vm, mc);
- unpin_host_vcpus(hyp_vm->vcpus, hyp_vm->nr_vcpus);
+ stage2_mc = &host_kvm->arch.pkvm.stage2_teardown_mc;
+ reclaim_pgtable_pages(hyp_vm, stage2_mc);
+ unpin_host_vcpus(hyp_vm->vcpus, hyp_vm->kvm.created_vcpus);
/* Push the metadata pages to the teardown memcache */
- for (idx = 0; idx < hyp_vm->nr_vcpus; ++idx) {
+ for (idx = 0; idx < hyp_vm->kvm.created_vcpus; ++idx) {
struct pkvm_hyp_vcpu *hyp_vcpu = hyp_vm->vcpus[idx];
+ struct kvm_hyp_memcache *vcpu_mc;
+
+ if (!hyp_vcpu)
+ continue;
+
+ vcpu_mc = &hyp_vcpu->vcpu.arch.pkvm_memcache;
+
+ while (vcpu_mc->nr_pages) {
+ void *addr = pop_hyp_memcache(vcpu_mc, hyp_phys_to_virt);
+
+ push_hyp_memcache(stage2_mc, addr, hyp_virt_to_phys);
+ unmap_donated_memory_noclear(addr, PAGE_SIZE);
+ }
teardown_donated_memory(mc, hyp_vcpu, sizeof(*hyp_vcpu));
}
diff --git a/arch/arm64/kvm/hyp/nvhe/psci-relay.c b/arch/arm64/kvm/hyp/nvhe/psci-relay.c
index d57bcb6ab94d..c3e196fb8b18 100644
--- a/arch/arm64/kvm/hyp/nvhe/psci-relay.c
+++ b/arch/arm64/kvm/hyp/nvhe/psci-relay.c
@@ -205,7 +205,7 @@ asmlinkage void __noreturn __kvm_host_psci_cpu_entry(bool is_cpu_on)
struct psci_boot_args *boot_args;
struct kvm_cpu_context *host_ctxt;
- host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
+ host_ctxt = host_data_ptr(host_ctxt);
if (is_cpu_on)
boot_args = this_cpu_ptr(&cpu_on_args);
@@ -218,6 +218,9 @@ asmlinkage void __noreturn __kvm_host_psci_cpu_entry(bool is_cpu_on)
if (is_cpu_on)
release_boot_args(boot_args);
+ write_sysreg_el1(INIT_SCTLR_EL1_MMU_OFF, SYS_SCTLR);
+ write_sysreg(INIT_PSTATE_EL1, SPSR_EL2);
+
__host_enter(host_ctxt);
}
@@ -265,6 +268,8 @@ static unsigned long psci_1_0_handler(u64 func_id, struct kvm_cpu_context *host_
case PSCI_1_0_FN_PSCI_FEATURES:
case PSCI_1_0_FN_SET_SUSPEND_MODE:
case PSCI_1_1_FN64_SYSTEM_RESET2:
+ case PSCI_1_3_FN_SYSTEM_OFF2:
+ case PSCI_1_3_FN64_SYSTEM_OFF2:
return psci_forward(host_ctxt);
case PSCI_1_0_FN64_SYSTEM_SUSPEND:
return psci_system_suspend(func_id, host_ctxt);
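
The psci-relay.c hunk adds the PSCI 1.3 SYSTEM_OFF2 function IDs to the set of calls that are simply forwarded to EL3. The allow-list pattern looks roughly like the sketch below; the numeric IDs are placeholders, not the real SMCCC/PSCI encoding.

/* Allow-list forwarding of PSCI function IDs. The numeric IDs below are
 * placeholders for the sketch, not the real SMCCC/PSCI encodings. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum {
	FN_PSCI_VERSION = 1,
	FN_SYSTEM_OFF2_32 = 2,	/* PSCI 1.3 SYSTEM_OFF2, 32-bit convention */
	FN_SYSTEM_OFF2_64 = 3,	/* PSCI 1.3 SYSTEM_OFF2, 64-bit convention */
	FN_CPU_ON = 4,
};

static bool psci_call_is_forwarded(uint32_t func_id)
{
	switch (func_id) {
	case FN_PSCI_VERSION:
	case FN_SYSTEM_OFF2_32:		/* newly allowed through to EL3 */
	case FN_SYSTEM_OFF2_64:
		return true;
	default:
		return false;		/* handled (or rejected) at EL2 */
	}
}

int main(void)
{
	printf("SYSTEM_OFF2 forwarded: %d\n",
	       psci_call_is_forwarded(FN_SYSTEM_OFF2_64));
	printf("CPU_ON forwarded: %d\n", psci_call_is_forwarded(FN_CPU_ON));
	return 0;
}
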
diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c
index bc58d1b515af..d62bcb5634a2 100644
--- a/arch/arm64/kvm/hyp/nvhe/setup.c
+++ b/arch/arm64/kvm/hyp/nvhe/setup.c
@@ -12,7 +12,6 @@
#include <nvhe/early_alloc.h>
#include <nvhe/ffa.h>
-#include <nvhe/fixed_config.h>
#include <nvhe/gfp.h>
#include <nvhe/memory.h>
#include <nvhe/mem_protect.h>
@@ -67,13 +66,34 @@ static int divide_memory_pool(void *virt, unsigned long size)
return 0;
}
+static int pkvm_create_host_sve_mappings(void)
+{
+ void *start, *end;
+ int ret, i;
+
+ if (!system_supports_sve())
+ return 0;
+
+ for (i = 0; i < hyp_nr_cpus; i++) {
+ struct kvm_host_data *host_data = per_cpu_ptr(&kvm_host_data, i);
+ struct cpu_sve_state *sve_state = host_data->sve_state;
+
+ start = kern_hyp_va(sve_state);
+ end = start + PAGE_ALIGN(pkvm_host_sve_state_size());
+ ret = pkvm_create_mappings(start, end, PAGE_HYP);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size,
unsigned long *per_cpu_base,
u32 hyp_va_bits)
{
void *start, *end, *virt = hyp_phys_to_virt(phys);
unsigned long pgt_size = hyp_s1_pgtable_pages() << PAGE_SHIFT;
- enum kvm_pgtable_prot prot;
int ret, i;
/* Recreate the hyp page-table using the early page allocator */
@@ -125,22 +145,7 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size,
return ret;
}
- /*
- * Map the host sections RO in the hypervisor, but transfer the
- * ownership from the host to the hypervisor itself to make sure they
- * can't be donated or shared with another entity.
- *
- * The ownership transition requires matching changes in the host
- * stage-2. This will be done later (see finalize_host_mappings()) once
- * the hyp_vmemmap is addressable.
- */
- prot = pkvm_mkstate(PAGE_HYP_RO, PKVM_PAGE_SHARED_OWNED);
- ret = pkvm_create_mappings(&kvm_vgic_global_state,
- &kvm_vgic_global_state + 1, prot);
- if (ret)
- return ret;
-
- return 0;
+ return pkvm_create_host_sve_mappings();
}
static void update_nvhe_init_params(void)
@@ -174,7 +179,6 @@ static void hpool_put_page(void *addr)
static int fix_host_ownership_walker(const struct kvm_pgtable_visit_ctx *ctx,
enum kvm_pgtable_walk_flags visit)
{
- enum kvm_pgtable_prot prot;
enum pkvm_page_state state;
phys_addr_t phys;
@@ -197,16 +201,16 @@ static int fix_host_ownership_walker(const struct kvm_pgtable_visit_ctx *ctx,
case PKVM_PAGE_OWNED:
return host_stage2_set_owner_locked(phys, PAGE_SIZE, PKVM_ID_HYP);
case PKVM_PAGE_SHARED_OWNED:
- prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, PKVM_PAGE_SHARED_BORROWED);
+ hyp_phys_to_page(phys)->host_state = PKVM_PAGE_SHARED_BORROWED;
break;
case PKVM_PAGE_SHARED_BORROWED:
- prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, PKVM_PAGE_SHARED_OWNED);
+ hyp_phys_to_page(phys)->host_state = PKVM_PAGE_SHARED_OWNED;
break;
default:
return -EINVAL;
}
- return host_stage2_idmap_locked(phys, PAGE_SIZE, prot);
+ return 0;
}
static int fix_hyp_pgtable_refcnt_walker(const struct kvm_pgtable_visit_ctx *ctx,
@@ -257,8 +261,7 @@ static int fix_hyp_pgtable_refcnt(void)
void __noreturn __pkvm_init_finalise(void)
{
- struct kvm_host_data *host_data = this_cpu_ptr(&kvm_host_data);
- struct kvm_cpu_context *host_ctxt = &host_data->host_ctxt;
+ struct kvm_cpu_context *host_ctxt = host_data_ptr(host_ctxt);
unsigned long nr_pages, reserved_pages, pfn;
int ret;
@@ -316,7 +319,7 @@ int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long nr_cpus,
{
struct kvm_nvhe_init_params *params;
void *virt = hyp_phys_to_virt(phys);
- void (*fn)(phys_addr_t params_pa, void *finalize_fn_va);
+ typeof(__pkvm_init_switch_pgd) *fn;
int ret;
BUG_ON(kvm_check_pvm_sysreg_table());
@@ -340,7 +343,7 @@ int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long nr_cpus,
/* Jump in the idmap page to switch to the new page-tables */
params = this_cpu_ptr(&kvm_init_params);
fn = (typeof(fn))__hyp_pa(__pkvm_init_switch_pgd);
- fn(__hyp_pa(params), __pkvm_init_finalise);
+ fn(params->pgd_pa, params->stack_hyp_va, __pkvm_init_finalise);
unreachable();
}
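
The fix_host_ownership_walker() change above stops re-encoding the shared/borrowed state into the host stage-2 prot bits and instead records it in the hyp vmemmap's per-page metadata (hyp_phys_to_page()->host_state). A toy model of that bookkeeping, assuming a flat page array; all MODEL_* names are invented for illustration.

#include <stdint.h>

#define MODEL_PAGE_SHIFT	12

enum model_page_state {
	MODEL_PAGE_OWNED,
	MODEL_PAGE_SHARED_OWNED,
	MODEL_PAGE_SHARED_BORROWED,
};

struct model_hyp_page {
	enum model_page_state host_state;
};

/* Covers 4MiB of physical memory in this toy model. */
static struct model_hyp_page model_vmemmap[1024];

static struct model_hyp_page *model_phys_to_page(uint64_t phys)
{
	return &model_vmemmap[phys >> MODEL_PAGE_SHIFT];
}

/* A page the hyp shares back to the host is only annotated, not remapped. */
static void model_mark_shared_with_host(uint64_t phys)
{
	model_phys_to_page(phys)->host_state = MODEL_PAGE_SHARED_BORROWED;
}
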
diff --git a/arch/arm64/kvm/hyp/nvhe/stacktrace.c b/arch/arm64/kvm/hyp/nvhe/stacktrace.c
index ed6b58b19cfa..5b6eeab1a774 100644
--- a/arch/arm64/kvm/hyp/nvhe/stacktrace.c
+++ b/arch/arm64/kvm/hyp/nvhe/stacktrace.c
@@ -28,7 +28,7 @@ static void hyp_prepare_backtrace(unsigned long fp, unsigned long pc)
struct kvm_nvhe_stacktrace_info *stacktrace_info = this_cpu_ptr(&kvm_stacktrace_info);
struct kvm_nvhe_init_params *params = this_cpu_ptr(&kvm_init_params);
- stacktrace_info->stack_base = (unsigned long)(params->stack_hyp_va - PAGE_SIZE);
+ stacktrace_info->stack_base = (unsigned long)(params->stack_hyp_va - NVHE_STACK_SIZE);
stacktrace_info->overflow_stack_base = (unsigned long)this_cpu_ptr(overflow_stack);
stacktrace_info->fp = fp;
stacktrace_info->pc = pc;
@@ -54,7 +54,7 @@ static struct stack_info stackinfo_get_hyp(void)
{
struct kvm_nvhe_init_params *params = this_cpu_ptr(&kvm_init_params);
unsigned long high = params->stack_hyp_va;
- unsigned long low = high - PAGE_SIZE;
+ unsigned long low = high - NVHE_STACK_SIZE;
return (struct stack_info) {
.low = low,
diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c
index c50f8459e4fc..7d2ba6ef0261 100644
--- a/arch/arm64/kvm/hyp/nvhe/switch.c
+++ b/arch/arm64/kvm/hyp/nvhe/switch.c
@@ -26,7 +26,6 @@
#include <asm/debug-monitors.h>
#include <asm/processor.h>
-#include <nvhe/fixed_config.h>
#include <nvhe/mem_protect.h>
/* Non-VHE specific context */
@@ -36,34 +35,71 @@ DEFINE_PER_CPU(unsigned long, kvm_hyp_vector);
extern void kvm_nvhe_prepare_backtrace(unsigned long fp, unsigned long pc);
-static void __activate_traps(struct kvm_vcpu *vcpu)
+static void __activate_cptr_traps(struct kvm_vcpu *vcpu)
{
- u64 val;
+ u64 val = CPTR_EL2_TAM; /* Same bit irrespective of E2H */
- ___activate_traps(vcpu);
- __activate_traps_common(vcpu);
+ if (!guest_owns_fp_regs())
+ __activate_traps_fpsimd32(vcpu);
- val = vcpu->arch.cptr_el2;
- val |= CPTR_EL2_TAM; /* Same bit irrespective of E2H */
- val |= has_hvhe() ? CPACR_EL1_TTA : CPTR_EL2_TTA;
- if (cpus_have_final_cap(ARM64_SME)) {
- if (has_hvhe())
- val &= ~(CPACR_EL1_SMEN_EL1EN | CPACR_EL1_SMEN_EL0EN);
- else
- val |= CPTR_EL2_TSM;
+ if (has_hvhe()) {
+ val |= CPACR_EL1_TTA;
+
+ if (guest_owns_fp_regs()) {
+ val |= CPACR_EL1_FPEN;
+ if (vcpu_has_sve(vcpu))
+ val |= CPACR_EL1_ZEN;
+ }
+
+ write_sysreg(val, cpacr_el1);
+ } else {
+ val |= CPTR_EL2_TTA | CPTR_NVHE_EL2_RES1;
+
+ /*
+ * Always trap SME since it's not supported in KVM.
+ * TSM is RES1 if SME isn't implemented.
+ */
+ val |= CPTR_EL2_TSM;
+
+ if (!vcpu_has_sve(vcpu) || !guest_owns_fp_regs())
+ val |= CPTR_EL2_TZ;
+
+ if (!guest_owns_fp_regs())
+ val |= CPTR_EL2_TFP;
+
+ write_sysreg(val, cptr_el2);
}
+}
+
+static void __deactivate_cptr_traps(struct kvm_vcpu *vcpu)
+{
+ if (has_hvhe()) {
+ u64 val = CPACR_EL1_FPEN;
- if (!guest_owns_fp_regs(vcpu)) {
- if (has_hvhe())
- val &= ~(CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN |
- CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN);
- else
- val |= CPTR_EL2_TFP | CPTR_EL2_TZ;
+ if (cpus_have_final_cap(ARM64_SVE))
+ val |= CPACR_EL1_ZEN;
+ if (cpus_have_final_cap(ARM64_SME))
+ val |= CPACR_EL1_SMEN;
- __activate_traps_fpsimd32(vcpu);
+ write_sysreg(val, cpacr_el1);
+ } else {
+ u64 val = CPTR_NVHE_EL2_RES1;
+
+ if (!cpus_have_final_cap(ARM64_SVE))
+ val |= CPTR_EL2_TZ;
+ if (!cpus_have_final_cap(ARM64_SME))
+ val |= CPTR_EL2_TSM;
+
+ write_sysreg(val, cptr_el2);
}
+}
+
+static void __activate_traps(struct kvm_vcpu *vcpu)
+{
+ ___activate_traps(vcpu, vcpu->arch.hcr_el2);
+ __activate_traps_common(vcpu);
+ __activate_cptr_traps(vcpu);
- kvm_write_cptr_el2(val);
write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el2);
if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) {
@@ -108,7 +144,7 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu)
write_sysreg(this_cpu_ptr(&kvm_init_params)->hcr_el2, hcr_el2);
- kvm_reset_cptr_el2(vcpu);
+ __deactivate_cptr_traps(vcpu);
write_sysreg(__kvm_hyp_host_vector, vbar_el2);
}
@@ -174,9 +210,8 @@ static void __pmu_switch_to_host(struct kvm_vcpu *vcpu)
static bool kvm_handle_pvm_sys64(struct kvm_vcpu *vcpu, u64 *exit_code)
{
/*
- * Make sure we handle the exit for workarounds and ptrauth
- * before the pKVM handling, as the latter could decide to
- * UNDEF.
+ * Make sure we handle the exit for workarounds before the pKVM
+ * handling, as the latter could decide to UNDEF.
*/
return (kvm_hyp_handle_sysreg(vcpu, exit_code) ||
kvm_handle_pvm_sysreg(vcpu, exit_code));
@@ -191,7 +226,6 @@ static const exit_handler_fn hyp_exit_handlers[] = {
[ESR_ELx_EC_IABT_LOW] = kvm_hyp_handle_iabt_low,
[ESR_ELx_EC_DABT_LOW] = kvm_hyp_handle_dabt_low,
[ESR_ELx_EC_WATCHPT_LOW] = kvm_hyp_handle_watchpt_low,
- [ESR_ELx_EC_PAC] = kvm_hyp_handle_ptrauth,
[ESR_ELx_EC_MOPS] = kvm_hyp_handle_mops,
};
@@ -203,34 +237,33 @@ static const exit_handler_fn pvm_exit_handlers[] = {
[ESR_ELx_EC_IABT_LOW] = kvm_hyp_handle_iabt_low,
[ESR_ELx_EC_DABT_LOW] = kvm_hyp_handle_dabt_low,
[ESR_ELx_EC_WATCHPT_LOW] = kvm_hyp_handle_watchpt_low,
- [ESR_ELx_EC_PAC] = kvm_hyp_handle_ptrauth,
[ESR_ELx_EC_MOPS] = kvm_hyp_handle_mops,
};
static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu)
{
- if (unlikely(kvm_vm_is_protected(kern_hyp_va(vcpu->kvm))))
+ if (unlikely(vcpu_is_protected(vcpu)))
return pvm_exit_handlers;
return hyp_exit_handlers;
}
-/*
- * Some guests (e.g., protected VMs) are not be allowed to run in AArch32.
- * The ARMv8 architecture does not give the hypervisor a mechanism to prevent a
- * guest from dropping to AArch32 EL0 if implemented by the CPU. If the
- * hypervisor spots a guest in such a state ensure it is handled, and don't
- * trust the host to spot or fix it. The check below is based on the one in
- * kvm_arch_vcpu_ioctl_run().
- *
- * Returns false if the guest ran in AArch32 when it shouldn't have, and
- * thus should exit to the host, or true if a the guest run loop can continue.
- */
-static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code)
+static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code)
{
- struct kvm *kvm = kern_hyp_va(vcpu->kvm);
+ const exit_handler_fn *handlers = kvm_get_exit_handler_array(vcpu);
+
+ synchronize_vcpu_pstate(vcpu, exit_code);
- if (kvm_vm_is_protected(kvm) && vcpu_mode_is_32bit(vcpu)) {
+ /*
+ * Some guests (e.g., protected VMs) are not allowed to run in
+ * AArch32. The ARMv8 architecture does not give the hypervisor a
+ * mechanism to prevent a guest from dropping to AArch32 EL0 if
+ * implemented by the CPU. If the hypervisor spots a guest in such a
+ * state ensure it is handled, and don't trust the host to spot or fix
+ * it. The check below is based on the one in
+ * kvm_arch_vcpu_ioctl_run().
+ */
+ if (unlikely(vcpu_is_protected(vcpu) && vcpu_mode_is_32bit(vcpu))) {
/*
* As we have caught the guest red-handed, decide that it isn't
* fit for purpose anymore by making the vcpu invalid. The VMM
@@ -242,6 +275,8 @@ static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code)
*exit_code &= BIT(ARM_EXIT_WITH_SERROR_BIT);
*exit_code |= ARM_EXCEPTION_IL;
}
+
+ return __fixup_guest_exit(vcpu, exit_code, handlers);
}
/* Switch to the guest for legacy non-VHE systems */
@@ -264,7 +299,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
pmr_sync();
}
- host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
+ host_ctxt = host_data_ptr(host_ctxt);
host_ctxt->__hyp_running_vcpu = vcpu;
guest_ctxt = &vcpu->arch.ctxt;
@@ -337,7 +372,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
__sysreg_restore_state_nvhe(host_ctxt);
- if (vcpu->arch.fp_state == FP_STATE_GUEST_OWNED)
+ if (guest_owns_fp_regs())
__fpsimd_save_fpexc32(vcpu);
__debug_switch_to_host(vcpu);
@@ -367,7 +402,7 @@ asmlinkage void __noreturn hyp_panic(void)
struct kvm_cpu_context *host_ctxt;
struct kvm_vcpu *vcpu;
- host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
+ host_ctxt = host_data_ptr(host_ctxt);
vcpu = host_ctxt->__hyp_running_vcpu;
if (vcpu) {
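
To summarise the non-hVHE side of the split above: trace and AMU accesses are always trapped, SME is always trapped (it is not supported by KVM, and TSM is RES1 without SME), and FP/SIMD and SVE are only left untrapped while the guest owns the FP registers. A compact decision model, using invented flag values rather than the architectural CPTR_EL2/CPACR_EL1 encodings:

#include <stdbool.h>

#define TRAP_TRACE	(1u << 0)	/* TTA          */
#define TRAP_AMU	(1u << 1)	/* TAM          */
#define TRAP_FPSIMD	(1u << 2)	/* TFP / !FPEN  */
#define TRAP_SVE	(1u << 3)	/* TZ  / !ZEN   */
#define TRAP_SME	(1u << 4)	/* TSM / !SMEN  */

static unsigned int nvhe_guest_traps(bool guest_owns_fp_regs,
				     bool vcpu_has_sve)
{
	/* Trace, AMU and SME accesses are trapped unconditionally. */
	unsigned int traps = TRAP_TRACE | TRAP_AMU | TRAP_SME;

	if (!guest_owns_fp_regs)
		traps |= TRAP_FPSIMD;

	if (!vcpu_has_sve || !guest_owns_fp_regs)
		traps |= TRAP_SVE;

	return traps;
}
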
diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c b/arch/arm64/kvm/hyp/nvhe/sys_regs.c
index edd969a1f36b..1ddd9ed3cbb3 100644
--- a/arch/arm64/kvm/hyp/nvhe/sys_regs.c
+++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c
@@ -11,7 +11,7 @@
#include <hyp/adjust_pc.h>
-#include <nvhe/fixed_config.h>
+#include <nvhe/pkvm.h>
#include "../../sys_regs.h"
@@ -28,222 +28,255 @@ u64 id_aa64mmfr1_el1_sys_val;
u64 id_aa64mmfr2_el1_sys_val;
u64 id_aa64smfr0_el1_sys_val;
-/*
- * Inject an unknown/undefined exception to an AArch64 guest while most of its
- * sysregs are live.
- */
-static void inject_undef64(struct kvm_vcpu *vcpu)
-{
- u64 esr = (ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT);
-
- *vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR);
- *vcpu_cpsr(vcpu) = read_sysreg_el2(SYS_SPSR);
-
- kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC);
-
- __kvm_adjust_pc(vcpu);
-
- write_sysreg_el1(esr, SYS_ESR);
- write_sysreg_el1(read_sysreg_el2(SYS_ELR), SYS_ELR);
- write_sysreg_el2(*vcpu_pc(vcpu), SYS_ELR);
- write_sysreg_el2(*vcpu_cpsr(vcpu), SYS_SPSR);
-}
-
-/*
- * Returns the restricted features values of the feature register based on the
- * limitations in restrict_fields.
- * A feature id field value of 0b0000 does not impose any restrictions.
- * Note: Use only for unsigned feature field values.
- */
-static u64 get_restricted_features_unsigned(u64 sys_reg_val,
- u64 restrict_fields)
-{
- u64 value = 0UL;
- u64 mask = GENMASK_ULL(ARM64_FEATURE_FIELD_BITS - 1, 0);
+struct pvm_ftr_bits {
+ bool sign;
+ u8 shift;
+ u8 width;
+ u8 max_val;
+ bool (*vm_supported)(const struct kvm *kvm);
+};
- /*
- * According to the Arm Architecture Reference Manual, feature fields
- * use increasing values to indicate increases in functionality.
- * Iterate over the restricted feature fields and calculate the minimum
- * unsigned value between the one supported by the system, and what the
- * value is being restricted to.
- */
- while (sys_reg_val && restrict_fields) {
- value |= min(sys_reg_val & mask, restrict_fields & mask);
- sys_reg_val &= ~mask;
- restrict_fields &= ~mask;
- mask <<= ARM64_FEATURE_FIELD_BITS;
+#define __MAX_FEAT_FUNC(id, fld, max, func, sgn) \
+ { \
+ .sign = sgn, \
+ .shift = id##_##fld##_SHIFT, \
+ .width = id##_##fld##_WIDTH, \
+ .max_val = id##_##fld##_##max, \
+ .vm_supported = func, \
}
- return value;
-}
-
-/*
- * Functions that return the value of feature id registers for protected VMs
- * based on allowed features, system features, and KVM support.
- */
+#define MAX_FEAT_FUNC(id, fld, max, func) \
+ __MAX_FEAT_FUNC(id, fld, max, func, id##_##fld##_SIGNED)
-static u64 get_pvm_id_aa64pfr0(const struct kvm_vcpu *vcpu)
-{
- u64 set_mask = 0;
- u64 allow_mask = PVM_ID_AA64PFR0_ALLOW;
+#define MAX_FEAT(id, fld, max) \
+ MAX_FEAT_FUNC(id, fld, max, NULL)
- set_mask |= get_restricted_features_unsigned(id_aa64pfr0_el1_sys_val,
- PVM_ID_AA64PFR0_RESTRICT_UNSIGNED);
+#define MAX_FEAT_ENUM(id, fld, max) \
+ __MAX_FEAT_FUNC(id, fld, max, NULL, false)
- return (id_aa64pfr0_el1_sys_val & allow_mask) | set_mask;
-}
+#define FEAT_END { .width = 0, }
-static u64 get_pvm_id_aa64pfr1(const struct kvm_vcpu *vcpu)
+static bool vm_has_ptrauth(const struct kvm *kvm)
{
- const struct kvm *kvm = (const struct kvm *)kern_hyp_va(vcpu->kvm);
- u64 allow_mask = PVM_ID_AA64PFR1_ALLOW;
-
- if (!kvm_has_mte(kvm))
- allow_mask &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE);
-
- return id_aa64pfr1_el1_sys_val & allow_mask;
-}
-
-static u64 get_pvm_id_aa64zfr0(const struct kvm_vcpu *vcpu)
-{
- /*
- * No support for Scalable Vectors, therefore, hyp has no sanitized
- * copy of the feature id register.
- */
- BUILD_BUG_ON(PVM_ID_AA64ZFR0_ALLOW != 0ULL);
- return 0;
-}
-
-static u64 get_pvm_id_aa64dfr0(const struct kvm_vcpu *vcpu)
-{
- /*
- * No support for debug, including breakpoints, and watchpoints,
- * therefore, pKVM has no sanitized copy of the feature id register.
- */
- BUILD_BUG_ON(PVM_ID_AA64DFR0_ALLOW != 0ULL);
- return 0;
-}
-
-static u64 get_pvm_id_aa64dfr1(const struct kvm_vcpu *vcpu)
-{
- /*
- * No support for debug, therefore, hyp has no sanitized copy of the
- * feature id register.
- */
- BUILD_BUG_ON(PVM_ID_AA64DFR1_ALLOW != 0ULL);
- return 0;
-}
+ if (!IS_ENABLED(CONFIG_ARM64_PTR_AUTH))
+ return false;
-static u64 get_pvm_id_aa64afr0(const struct kvm_vcpu *vcpu)
-{
- /*
- * No support for implementation defined features, therefore, hyp has no
- * sanitized copy of the feature id register.
- */
- BUILD_BUG_ON(PVM_ID_AA64AFR0_ALLOW != 0ULL);
- return 0;
+ return (cpus_have_final_cap(ARM64_HAS_ADDRESS_AUTH) ||
+ cpus_have_final_cap(ARM64_HAS_GENERIC_AUTH)) &&
+ kvm_vcpu_has_feature(kvm, KVM_ARM_VCPU_PTRAUTH_GENERIC);
}
-static u64 get_pvm_id_aa64afr1(const struct kvm_vcpu *vcpu)
+static bool vm_has_sve(const struct kvm *kvm)
{
- /*
- * No support for implementation defined features, therefore, hyp has no
- * sanitized copy of the feature id register.
- */
- BUILD_BUG_ON(PVM_ID_AA64AFR1_ALLOW != 0ULL);
- return 0;
+ return system_supports_sve() && kvm_vcpu_has_feature(kvm, KVM_ARM_VCPU_SVE);
}
-static u64 get_pvm_id_aa64isar0(const struct kvm_vcpu *vcpu)
-{
- return id_aa64isar0_el1_sys_val & PVM_ID_AA64ISAR0_ALLOW;
-}
+/*
+ * Definitions for features to be allowed or restricted for protected guests.
+ *
+ * Each entry describes the highest supported value for the corresponding
+ * feature field. If a feature field is not listed here, it is not supported.
+ * Moreover, these entries are used to generate the guest's view of the
+ * feature registers.
+ *
+ * The approach for protected VMs is to support at least the features that are:
+ * - Needed by common Linux distributions (e.g., floating point)
+ * - Trivial to support, e.g., supporting the feature does not introduce or
+ * require tracking of additional state in KVM
+ * - Impossible to trap, or impossible to prevent the guest from using anyway
+ */
-static u64 get_pvm_id_aa64isar1(const struct kvm_vcpu *vcpu)
-{
- u64 allow_mask = PVM_ID_AA64ISAR1_ALLOW;
+static const struct pvm_ftr_bits pvmid_aa64pfr0[] = {
+ MAX_FEAT(ID_AA64PFR0_EL1, EL0, IMP),
+ MAX_FEAT(ID_AA64PFR0_EL1, EL1, IMP),
+ MAX_FEAT(ID_AA64PFR0_EL1, EL2, IMP),
+ MAX_FEAT(ID_AA64PFR0_EL1, EL3, IMP),
+ MAX_FEAT(ID_AA64PFR0_EL1, FP, FP16),
+ MAX_FEAT(ID_AA64PFR0_EL1, AdvSIMD, FP16),
+ MAX_FEAT(ID_AA64PFR0_EL1, GIC, IMP),
+ MAX_FEAT_FUNC(ID_AA64PFR0_EL1, SVE, IMP, vm_has_sve),
+ MAX_FEAT(ID_AA64PFR0_EL1, RAS, IMP),
+ MAX_FEAT(ID_AA64PFR0_EL1, DIT, IMP),
+ MAX_FEAT(ID_AA64PFR0_EL1, CSV2, IMP),
+ MAX_FEAT(ID_AA64PFR0_EL1, CSV3, IMP),
+ FEAT_END
+};
- if (!vcpu_has_ptrauth(vcpu))
- allow_mask &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA) |
- ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API) |
- ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPA) |
- ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI));
+static const struct pvm_ftr_bits pvmid_aa64pfr1[] = {
+ MAX_FEAT(ID_AA64PFR1_EL1, BT, IMP),
+ MAX_FEAT(ID_AA64PFR1_EL1, SSBS, SSBS2),
+ MAX_FEAT_ENUM(ID_AA64PFR1_EL1, MTE_frac, NI),
+ FEAT_END
+};
- return id_aa64isar1_el1_sys_val & allow_mask;
-}
+static const struct pvm_ftr_bits pvmid_aa64mmfr0[] = {
+ MAX_FEAT_ENUM(ID_AA64MMFR0_EL1, PARANGE, 40),
+ MAX_FEAT_ENUM(ID_AA64MMFR0_EL1, ASIDBITS, 16),
+ MAX_FEAT(ID_AA64MMFR0_EL1, BIGEND, IMP),
+ MAX_FEAT(ID_AA64MMFR0_EL1, SNSMEM, IMP),
+ MAX_FEAT(ID_AA64MMFR0_EL1, BIGENDEL0, IMP),
+ MAX_FEAT(ID_AA64MMFR0_EL1, EXS, IMP),
+ FEAT_END
+};
-static u64 get_pvm_id_aa64isar2(const struct kvm_vcpu *vcpu)
-{
- u64 allow_mask = PVM_ID_AA64ISAR2_ALLOW;
+static const struct pvm_ftr_bits pvmid_aa64mmfr1[] = {
+ MAX_FEAT(ID_AA64MMFR1_EL1, HAFDBS, DBM),
+ MAX_FEAT_ENUM(ID_AA64MMFR1_EL1, VMIDBits, 16),
+ MAX_FEAT(ID_AA64MMFR1_EL1, HPDS, HPDS2),
+ MAX_FEAT(ID_AA64MMFR1_EL1, PAN, PAN3),
+ MAX_FEAT(ID_AA64MMFR1_EL1, SpecSEI, IMP),
+ MAX_FEAT(ID_AA64MMFR1_EL1, ETS, IMP),
+ MAX_FEAT(ID_AA64MMFR1_EL1, CMOW, IMP),
+ FEAT_END
+};
- if (!vcpu_has_ptrauth(vcpu))
- allow_mask &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_APA3) |
- ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_GPA3));
+static const struct pvm_ftr_bits pvmid_aa64mmfr2[] = {
+ MAX_FEAT(ID_AA64MMFR2_EL1, CnP, IMP),
+ MAX_FEAT(ID_AA64MMFR2_EL1, UAO, IMP),
+ MAX_FEAT(ID_AA64MMFR2_EL1, IESB, IMP),
+ MAX_FEAT(ID_AA64MMFR2_EL1, AT, IMP),
+ MAX_FEAT_ENUM(ID_AA64MMFR2_EL1, IDS, 0x18),
+ MAX_FEAT(ID_AA64MMFR2_EL1, TTL, IMP),
+ MAX_FEAT(ID_AA64MMFR2_EL1, BBM, 2),
+ MAX_FEAT(ID_AA64MMFR2_EL1, E0PD, IMP),
+ FEAT_END
+};
- return id_aa64isar2_el1_sys_val & allow_mask;
-}
+static const struct pvm_ftr_bits pvmid_aa64isar1[] = {
+ MAX_FEAT(ID_AA64ISAR1_EL1, DPB, DPB2),
+ MAX_FEAT_FUNC(ID_AA64ISAR1_EL1, APA, PAuth, vm_has_ptrauth),
+ MAX_FEAT_FUNC(ID_AA64ISAR1_EL1, API, PAuth, vm_has_ptrauth),
+ MAX_FEAT(ID_AA64ISAR1_EL1, JSCVT, IMP),
+ MAX_FEAT(ID_AA64ISAR1_EL1, FCMA, IMP),
+ MAX_FEAT(ID_AA64ISAR1_EL1, LRCPC, LRCPC3),
+ MAX_FEAT(ID_AA64ISAR1_EL1, GPA, IMP),
+ MAX_FEAT(ID_AA64ISAR1_EL1, GPI, IMP),
+ MAX_FEAT(ID_AA64ISAR1_EL1, FRINTTS, IMP),
+ MAX_FEAT(ID_AA64ISAR1_EL1, SB, IMP),
+ MAX_FEAT(ID_AA64ISAR1_EL1, SPECRES, COSP_RCTX),
+ MAX_FEAT(ID_AA64ISAR1_EL1, BF16, EBF16),
+ MAX_FEAT(ID_AA64ISAR1_EL1, DGH, IMP),
+ MAX_FEAT(ID_AA64ISAR1_EL1, I8MM, IMP),
+ FEAT_END
+};
-static u64 get_pvm_id_aa64mmfr0(const struct kvm_vcpu *vcpu)
-{
- u64 set_mask;
+static const struct pvm_ftr_bits pvmid_aa64isar2[] = {
+ MAX_FEAT_FUNC(ID_AA64ISAR2_EL1, GPA3, IMP, vm_has_ptrauth),
+ MAX_FEAT_FUNC(ID_AA64ISAR2_EL1, APA3, PAuth, vm_has_ptrauth),
+ MAX_FEAT(ID_AA64ISAR2_EL1, ATS1A, IMP),
+ FEAT_END
+};
- set_mask = get_restricted_features_unsigned(id_aa64mmfr0_el1_sys_val,
- PVM_ID_AA64MMFR0_RESTRICT_UNSIGNED);
+/*
+ * None of the features in ID_AA64DFR0_EL1 or ID_AA64MMFR4_EL1 are supported.
+ * However, both have Not-Implemented values that are non-zero. Define them
+ * so they can be used when getting the value of these registers.
+ */
+#define ID_AA64DFR0_EL1_NONZERO_NI \
+( \
+ SYS_FIELD_PREP_ENUM(ID_AA64DFR0_EL1, DoubleLock, NI) | \
+ SYS_FIELD_PREP_ENUM(ID_AA64DFR0_EL1, MTPMU, NI) \
+)
- return (id_aa64mmfr0_el1_sys_val & PVM_ID_AA64MMFR0_ALLOW) | set_mask;
-}
+#define ID_AA64MMFR4_EL1_NONZERO_NI \
+ SYS_FIELD_PREP_ENUM(ID_AA64MMFR4_EL1, E2H0, NI)
-static u64 get_pvm_id_aa64mmfr1(const struct kvm_vcpu *vcpu)
+/*
+ * Returns the value of the feature registers based on the system register
+ * value, the vcpu support for the relevant features, and the additional
+ * restrictions for protected VMs.
+ */
+static u64 get_restricted_features(const struct kvm_vcpu *vcpu,
+ u64 sys_reg_val,
+ const struct pvm_ftr_bits restrictions[])
{
- return id_aa64mmfr1_el1_sys_val & PVM_ID_AA64MMFR1_ALLOW;
-}
+ u64 val = 0UL;
+ int i;
+
+ for (i = 0; restrictions[i].width != 0; i++) {
+ bool (*vm_supported)(const struct kvm *) = restrictions[i].vm_supported;
+ bool sign = restrictions[i].sign;
+ int shift = restrictions[i].shift;
+ int width = restrictions[i].width;
+ u64 min_signed = (1UL << width) - 1UL;
+ u64 sign_bit = 1UL << (width - 1);
+ u64 mask = GENMASK_ULL(width + shift - 1, shift);
+ u64 sys_val = (sys_reg_val & mask) >> shift;
+ u64 pvm_max = restrictions[i].max_val;
+
+ if (vm_supported && !vm_supported(vcpu->kvm))
+ val |= (sign ? min_signed : 0) << shift;
+ else if (sign && (sys_val >= sign_bit || pvm_max >= sign_bit))
+ val |= max(sys_val, pvm_max) << shift;
+ else
+ val |= min(sys_val, pvm_max) << shift;
+ }
-static u64 get_pvm_id_aa64mmfr2(const struct kvm_vcpu *vcpu)
-{
- return id_aa64mmfr2_el1_sys_val & PVM_ID_AA64MMFR2_ALLOW;
+ return val;
}
-/* Read a sanitized cpufeature ID register by its encoding */
-u64 pvm_read_id_reg(const struct kvm_vcpu *vcpu, u32 id)
+static u64 pvm_calc_id_reg(const struct kvm_vcpu *vcpu, u32 id)
{
switch (id) {
case SYS_ID_AA64PFR0_EL1:
- return get_pvm_id_aa64pfr0(vcpu);
+ return get_restricted_features(vcpu, id_aa64pfr0_el1_sys_val, pvmid_aa64pfr0);
case SYS_ID_AA64PFR1_EL1:
- return get_pvm_id_aa64pfr1(vcpu);
- case SYS_ID_AA64ZFR0_EL1:
- return get_pvm_id_aa64zfr0(vcpu);
- case SYS_ID_AA64DFR0_EL1:
- return get_pvm_id_aa64dfr0(vcpu);
- case SYS_ID_AA64DFR1_EL1:
- return get_pvm_id_aa64dfr1(vcpu);
- case SYS_ID_AA64AFR0_EL1:
- return get_pvm_id_aa64afr0(vcpu);
- case SYS_ID_AA64AFR1_EL1:
- return get_pvm_id_aa64afr1(vcpu);
+ return get_restricted_features(vcpu, id_aa64pfr1_el1_sys_val, pvmid_aa64pfr1);
case SYS_ID_AA64ISAR0_EL1:
- return get_pvm_id_aa64isar0(vcpu);
+ return id_aa64isar0_el1_sys_val;
case SYS_ID_AA64ISAR1_EL1:
- return get_pvm_id_aa64isar1(vcpu);
+ return get_restricted_features(vcpu, id_aa64isar1_el1_sys_val, pvmid_aa64isar1);
case SYS_ID_AA64ISAR2_EL1:
- return get_pvm_id_aa64isar2(vcpu);
+ return get_restricted_features(vcpu, id_aa64isar2_el1_sys_val, pvmid_aa64isar2);
case SYS_ID_AA64MMFR0_EL1:
- return get_pvm_id_aa64mmfr0(vcpu);
+ return get_restricted_features(vcpu, id_aa64mmfr0_el1_sys_val, pvmid_aa64mmfr0);
case SYS_ID_AA64MMFR1_EL1:
- return get_pvm_id_aa64mmfr1(vcpu);
+ return get_restricted_features(vcpu, id_aa64mmfr1_el1_sys_val, pvmid_aa64mmfr1);
case SYS_ID_AA64MMFR2_EL1:
- return get_pvm_id_aa64mmfr2(vcpu);
+ return get_restricted_features(vcpu, id_aa64mmfr2_el1_sys_val, pvmid_aa64mmfr2);
+ case SYS_ID_AA64DFR0_EL1:
+ return ID_AA64DFR0_EL1_NONZERO_NI;
+ case SYS_ID_AA64MMFR4_EL1:
+ return ID_AA64MMFR4_EL1_NONZERO_NI;
default:
/* Unhandled ID register, RAZ */
return 0;
}
}
+/*
+ * Inject an unknown/undefined exception to an AArch64 guest while most of its
+ * sysregs are live.
+ */
+static void inject_undef64(struct kvm_vcpu *vcpu)
+{
+ u64 esr = (ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT);
+
+ *vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR);
+ *vcpu_cpsr(vcpu) = read_sysreg_el2(SYS_SPSR);
+
+ kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC);
+
+ __kvm_adjust_pc(vcpu);
+
+ write_sysreg_el1(esr, SYS_ESR);
+ write_sysreg_el1(read_sysreg_el2(SYS_ELR), SYS_ELR);
+ write_sysreg_el2(*vcpu_pc(vcpu), SYS_ELR);
+ write_sysreg_el2(*vcpu_cpsr(vcpu), SYS_SPSR);
+}
+
static u64 read_id_reg(const struct kvm_vcpu *vcpu,
struct sys_reg_desc const *r)
{
- return pvm_read_id_reg(vcpu, reg_to_encoding(r));
+ struct kvm *kvm = vcpu->kvm;
+ u32 reg = reg_to_encoding(r);
+
+ if (WARN_ON_ONCE(!test_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags)))
+ return 0;
+
+ if (reg >= sys_reg(3, 0, 0, 1, 0) && reg <= sys_reg(3, 0, 0, 7, 7))
+ return kvm->arch.id_regs[IDREG_IDX(reg)];
+
+ return 0;
}
/* Handler to RAZ/WI sysregs */
@@ -271,13 +304,6 @@ static bool pvm_access_id_aarch32(struct kvm_vcpu *vcpu,
return false;
}
- /*
- * No support for AArch32 guests, therefore, pKVM has no sanitized copy
- * of AArch32 feature id registers.
- */
- BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL1),
- PVM_ID_AA64PFR0_RESTRICT_UNSIGNED) > ID_AA64PFR0_EL1_ELx_64BIT_ONLY);
-
return pvm_access_raz_wi(vcpu, p, r);
}
@@ -449,6 +475,30 @@ static const struct sys_reg_desc pvm_sys_reg_descs[] = {
};
/*
+ * Initializes feature registers for protected VMs.
+ */
+void kvm_init_pvm_id_regs(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_arch *ka = &kvm->arch;
+ u32 r;
+
+ hyp_assert_lock_held(&vm_table_lock);
+
+ if (test_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags))
+ return;
+
+ /*
+ * Initialize only AArch64 id registers since AArch32 isn't supported
+ * for protected VMs.
+ */
+ for (r = sys_reg(3, 0, 0, 4, 0); r <= sys_reg(3, 0, 0, 7, 7); r += sys_reg(0, 0, 0, 0, 1))
+ ka->id_regs[IDREG_IDX(r)] = pvm_calc_id_reg(vcpu, r);
+
+ set_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags);
+}
+
+/*
* Checks that the sysreg table is unique and in-order.
*
* Returns 0 if the table is consistent, or 1 otherwise.
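
The heart of get_restricted_features() is a per-field clamp: unsigned ID register fields grow monotonically with functionality, so the guest is shown min(hardware value, pKVM maximum), while signed fields additionally treat a negative (not-implemented) encoding as the floor. A self-contained sketch of the unsigned case; shift, width and the maximum come from the pvm_ftr_bits tables above, and the helper name is invented:

#include <stdint.h>

/*
 * Clamp one unsigned ID register field to the maximum a protected VM is
 * allowed to see. Field values increase with functionality, so min() picks
 * the lesser of "what the hardware has" and "what pKVM exposes".
 */
static uint64_t clamp_unsigned_field(uint64_t sys_reg_val,
				     unsigned int shift, unsigned int width,
				     uint64_t pvm_max)
{
	uint64_t mask = ((1ULL << width) - 1) << shift;
	uint64_t sys_val = (sys_reg_val & mask) >> shift;

	return (sys_val < pvm_max ? sys_val : pvm_max) << shift;
}
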
diff --git a/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c b/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c
index 29305022bc04..3cc613cce5f5 100644
--- a/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c
+++ b/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c
@@ -28,7 +28,9 @@ void __sysreg_save_state_nvhe(struct kvm_cpu_context *ctxt)
void __sysreg_restore_state_nvhe(struct kvm_cpu_context *ctxt)
{
- __sysreg_restore_el1_state(ctxt);
+ u64 midr = ctxt_midr_el1(ctxt);
+
+ __sysreg_restore_el1_state(ctxt, midr, ctxt_sys_reg(ctxt, MPIDR_EL1));
__sysreg_restore_common_state(ctxt);
__sysreg_restore_user_state(ctxt);
__sysreg_restore_el2_return_state(ctxt);
diff --git a/arch/arm64/kvm/hyp/nvhe/timer-sr.c b/arch/arm64/kvm/hyp/nvhe/timer-sr.c
index 3aaab20ae5b4..ff176f4ce7de 100644
--- a/arch/arm64/kvm/hyp/nvhe/timer-sr.c
+++ b/arch/arm64/kvm/hyp/nvhe/timer-sr.c
@@ -22,15 +22,16 @@ void __kvm_timer_set_cntvoff(u64 cntvoff)
*/
void __timer_disable_traps(struct kvm_vcpu *vcpu)
{
- u64 val, shift = 0;
+ u64 set, clr, shift = 0;
if (has_hvhe())
shift = 10;
/* Allow physical timer/counter access for the host */
- val = read_sysreg(cnthctl_el2);
- val |= (CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN) << shift;
- write_sysreg(val, cnthctl_el2);
+ set = (CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN) << shift;
+ clr = CNTHCTL_EL1TVT | CNTHCTL_EL1TVCT;
+
+ sysreg_clear_set(cnthctl_el2, clr, set);
}
/*
@@ -58,5 +59,12 @@ void __timer_enable_traps(struct kvm_vcpu *vcpu)
set <<= 10;
}
+ /*
+ * Trap the virtual counter/timer if we have a broken cntvoff
+ * implementation.
+ */
+ if (has_broken_cntvoff())
+ set |= CNTHCTL_EL1TVT | CNTHCTL_EL1TVCT;
+
sysreg_clear_set(cnthctl_el2, clr, set);
}
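
__timer_disable_traps() now uses the clear/set helper rather than an open-coded read/modify/write, which also guarantees that the EL1TVT/EL1TVCT traps installed for a broken CNTVOFF implementation are dropped on the way back to the host. The helper boils down to the following read-modify-write, shown here as a plain sketch rather than the arch code:

#include <stdint.h>

/* Clear @clr first, then apply @set, exactly once per register update. */
static uint64_t clear_set(uint64_t reg, uint64_t clr, uint64_t set)
{
	return (reg & ~clr) | set;
}
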
diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c
index a60fb13e2192..48da9ca9763f 100644
--- a/arch/arm64/kvm/hyp/nvhe/tlb.c
+++ b/arch/arm64/kvm/hyp/nvhe/tlb.c
@@ -11,13 +11,23 @@
#include <nvhe/mem_protect.h>
struct tlb_inv_context {
- u64 tcr;
+ struct kvm_s2_mmu *mmu;
+ u64 tcr;
+ u64 sctlr;
};
-static void __tlb_switch_to_guest(struct kvm_s2_mmu *mmu,
- struct tlb_inv_context *cxt,
- bool nsh)
+static void enter_vmid_context(struct kvm_s2_mmu *mmu,
+ struct tlb_inv_context *cxt,
+ bool nsh)
{
+ struct kvm_s2_mmu *host_s2_mmu = &host_mmu.arch.mmu;
+ struct kvm_cpu_context *host_ctxt;
+ struct kvm_vcpu *vcpu;
+
+ host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
+ vcpu = host_ctxt->__hyp_running_vcpu;
+ cxt->mmu = NULL;
+
/*
* We have two requirements:
*
@@ -40,20 +50,55 @@ static void __tlb_switch_to_guest(struct kvm_s2_mmu *mmu,
else
dsb(ish);
+ /*
+ * If we're already in the desired context, then there's nothing to do.
+ */
+ if (vcpu) {
+ /*
+ * We're in guest context. However, for this to work, this needs
+ * to be called from within __kvm_vcpu_run(), which ensures that
+ * __hyp_running_vcpu is set to the current guest vcpu.
+ */
+ if (mmu == vcpu->arch.hw_mmu || WARN_ON(mmu != host_s2_mmu))
+ return;
+
+ cxt->mmu = vcpu->arch.hw_mmu;
+ } else {
+ /* We're in host context. */
+ if (mmu == host_s2_mmu)
+ return;
+
+ cxt->mmu = host_s2_mmu;
+ }
+
if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) {
u64 val;
/*
* For CPUs that are affected by ARM 1319367, we need to
- * avoid a host Stage-1 walk while we have the guest's
- * VMID set in the VTTBR in order to invalidate TLBs.
- * We're guaranteed that the S1 MMU is enabled, so we can
- * simply set the EPD bits to avoid any further TLB fill.
+ * avoid a Stage-1 walk with the old VMID while we have
+ * the new VMID set in the VTTBR in order to invalidate TLBs.
+ * We're guaranteed that the host S1 MMU is enabled, so
+ * we can simply set the EPD bits to avoid any further
+ * TLB fill. For guests, we ensure that the S1 MMU is
+ * temporarily enabled in the next context.
*/
val = cxt->tcr = read_sysreg_el1(SYS_TCR);
val |= TCR_EPD1_MASK | TCR_EPD0_MASK;
write_sysreg_el1(val, SYS_TCR);
isb();
+
+ if (vcpu) {
+ val = cxt->sctlr = read_sysreg_el1(SYS_SCTLR);
+ if (!(val & SCTLR_ELx_M)) {
+ val |= SCTLR_ELx_M;
+ write_sysreg_el1(val, SYS_SCTLR);
+ isb();
+ }
+ } else {
+ /* The host S1 MMU is always enabled. */
+ cxt->sctlr = SCTLR_ELx_M;
+ }
}
/*
@@ -62,18 +107,40 @@ static void __tlb_switch_to_guest(struct kvm_s2_mmu *mmu,
* ensuring that we always have an ISB, but not two ISBs back
* to back.
*/
- __load_stage2(mmu, kern_hyp_va(mmu->arch));
+ if (vcpu)
+ __load_host_stage2();
+ else
+ __load_stage2(mmu, kern_hyp_va(mmu->arch));
+
asm(ALTERNATIVE("isb", "nop", ARM64_WORKAROUND_SPECULATIVE_AT));
}
-static void __tlb_switch_to_host(struct tlb_inv_context *cxt)
+static void exit_vmid_context(struct tlb_inv_context *cxt)
{
- __load_host_stage2();
+ struct kvm_s2_mmu *mmu = cxt->mmu;
+ struct kvm_cpu_context *host_ctxt;
+ struct kvm_vcpu *vcpu;
+
+ host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
+ vcpu = host_ctxt->__hyp_running_vcpu;
+
+ if (!mmu)
+ return;
+
+ if (vcpu)
+ __load_stage2(mmu, kern_hyp_va(mmu->arch));
+ else
+ __load_host_stage2();
+
+ /* Ensure write of the old VMID */
+ isb();
if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) {
- /* Ensure write of the host VMID */
- isb();
- /* Restore the host's TCR_EL1 */
+ if (!(cxt->sctlr & SCTLR_ELx_M)) {
+ write_sysreg_el1(cxt->sctlr, SYS_SCTLR);
+ isb();
+ }
+
write_sysreg_el1(cxt->tcr, SYS_TCR);
}
}
@@ -84,7 +151,7 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu,
struct tlb_inv_context cxt;
/* Switch to requested VMID */
- __tlb_switch_to_guest(mmu, &cxt, false);
+ enter_vmid_context(mmu, &cxt, false);
/*
* We could do so much better if we had the VA as well.
@@ -105,7 +172,7 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu,
dsb(ish);
isb();
- __tlb_switch_to_host(&cxt);
+ exit_vmid_context(&cxt);
}
void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu,
@@ -114,7 +181,7 @@ void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu,
struct tlb_inv_context cxt;
/* Switch to requested VMID */
- __tlb_switch_to_guest(mmu, &cxt, true);
+ enter_vmid_context(mmu, &cxt, true);
/*
* We could do so much better if we had the VA as well.
@@ -135,7 +202,7 @@ void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu,
dsb(nsh);
isb();
- __tlb_switch_to_host(&cxt);
+ exit_vmid_context(&cxt);
}
void __kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu,
@@ -152,16 +219,17 @@ void __kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu,
start = round_down(start, stride);
/* Switch to requested VMID */
- __tlb_switch_to_guest(mmu, &cxt, false);
+ enter_vmid_context(mmu, &cxt, false);
- __flush_s2_tlb_range_op(ipas2e1is, start, pages, stride, 0);
+ __flush_s2_tlb_range_op(ipas2e1is, start, pages, stride,
+ TLBI_TTL_UNKNOWN);
dsb(ish);
__tlbi(vmalle1is);
dsb(ish);
isb();
- __tlb_switch_to_host(&cxt);
+ exit_vmid_context(&cxt);
}
void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu)
@@ -169,13 +237,13 @@ void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu)
struct tlb_inv_context cxt;
/* Switch to requested VMID */
- __tlb_switch_to_guest(mmu, &cxt, false);
+ enter_vmid_context(mmu, &cxt, false);
__tlbi(vmalls12e1is);
dsb(ish);
isb();
- __tlb_switch_to_host(&cxt);
+ exit_vmid_context(&cxt);
}
void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu)
@@ -183,19 +251,19 @@ void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu)
struct tlb_inv_context cxt;
/* Switch to requested VMID */
- __tlb_switch_to_guest(mmu, &cxt, false);
+ enter_vmid_context(mmu, &cxt, false);
__tlbi(vmalle1);
asm volatile("ic iallu");
dsb(nsh);
isb();
- __tlb_switch_to_host(&cxt);
+ exit_vmid_context(&cxt);
}
void __kvm_flush_vm_context(void)
{
- /* Same remark as in __tlb_switch_to_guest() */
+ /* Same remark as in enter_vmid_context() */
dsb(ish);
__tlbi(alle1is);
dsb(ish);
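
The rename from __tlb_switch_to_guest()/__tlb_switch_to_host() to enter/exit_vmid_context() reflects a real behavioural change: TLB maintenance can now target the host's stage-2 while a guest VMID is live, and vice versa. The helper therefore has to work out whether any switch is needed and which context to restore afterwards. A decision model with invented types; cxt->mmu in the real code plays the role of the returned pointer:

#include <stdbool.h>
#include <stddef.h>

struct mmu_model { int id; };

/*
 * Returns the context to restore on exit, or NULL when we are already
 * running under the requested VMID. "in_guest" stands for
 * __hyp_running_vcpu being non-NULL.
 */
static struct mmu_model *pick_saved_context(bool in_guest,
					    struct mmu_model *target,
					    struct mmu_model *guest_mmu,
					    struct mmu_model *host_mmu)
{
	if (in_guest) {
		/* Guest VMID is live: only a host-targeted flush switches. */
		if (target == guest_mmu)
			return NULL;
		return guest_mmu;	/* restore the guest context on exit */
	}

	/* Host VMID is live: only a guest-targeted flush switches. */
	if (target == host_mmu)
		return NULL;
	return host_mmu;		/* restore the host context on exit */
}
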
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 3fae5830f8d2..df5cc74a7dd0 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -17,48 +17,6 @@
#define KVM_PTE_TYPE_PAGE 1
#define KVM_PTE_TYPE_TABLE 1
-#define KVM_PTE_LEAF_ATTR_LO GENMASK(11, 2)
-
-#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2)
-#define KVM_PTE_LEAF_ATTR_LO_S1_AP GENMASK(7, 6)
-#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO \
- ({ cpus_have_final_cap(ARM64_KVM_HVHE) ? 2 : 3; })
-#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW \
- ({ cpus_have_final_cap(ARM64_KVM_HVHE) ? 0 : 1; })
-#define KVM_PTE_LEAF_ATTR_LO_S1_SH GENMASK(9, 8)
-#define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS 3
-#define KVM_PTE_LEAF_ATTR_LO_S1_AF BIT(10)
-
-#define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR GENMASK(5, 2)
-#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R BIT(6)
-#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W BIT(7)
-#define KVM_PTE_LEAF_ATTR_LO_S2_SH GENMASK(9, 8)
-#define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS 3
-#define KVM_PTE_LEAF_ATTR_LO_S2_AF BIT(10)
-
-#define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 50)
-
-#define KVM_PTE_LEAF_ATTR_HI_SW GENMASK(58, 55)
-
-#define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54)
-
-#define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54)
-
-#define KVM_PTE_LEAF_ATTR_HI_S1_GP BIT(50)
-
-#define KVM_PTE_LEAF_ATTR_S2_PERMS (KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \
- KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \
- KVM_PTE_LEAF_ATTR_HI_S2_XN)
-
-#define KVM_INVALID_PTE_OWNER_MASK GENMASK(9, 2)
-#define KVM_MAX_OWNER_ID 1
-
-/*
- * Used to indicate a pte for which a 'break-before-make' sequence is in
- * progress.
- */
-#define KVM_INVALID_PTE_LOCKED BIT(10)
-
struct kvm_pgtable_walk_data {
struct kvm_pgtable_walker *walker;
@@ -77,14 +35,6 @@ static bool kvm_pgtable_walk_skip_cmo(const struct kvm_pgtable_visit_ctx *ctx)
return unlikely(ctx->flags & KVM_PGTABLE_WALK_SKIP_CMO);
}
-static bool kvm_phys_is_valid(u64 phys)
-{
- u64 parange_max = kvm_get_parange_max();
- u8 shift = id_aa64mmfr0_parange_to_phys_shift(parange_max);
-
- return phys < BIT(shift);
-}
-
static bool kvm_block_mapping_supported(const struct kvm_pgtable_visit_ctx *ctx, u64 phys)
{
u64 granule = kvm_granule_size(ctx->level);
@@ -95,7 +45,7 @@ static bool kvm_block_mapping_supported(const struct kvm_pgtable_visit_ctx *ctx,
if (granule > (ctx->end - ctx->addr))
return false;
- if (kvm_phys_is_valid(phys) && !IS_ALIGNED(phys, granule))
+ if (!IS_ALIGNED(phys, granule))
return false;
return IS_ALIGNED(ctx->addr, granule);
@@ -528,7 +478,7 @@ static int hyp_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx,
kvm_clear_pte(ctx->ptep);
dsb(ishst);
- __tlbi_level(vae2is, __TLBI_VADDR(ctx->addr, 0), ctx->level);
+ __tlbi_level(vae2is, __TLBI_VADDR(ctx->addr, 0), TLBI_TTL_UNKNOWN);
} else {
if (ctx->end - ctx->addr < granule)
return -EINVAL;
@@ -629,6 +579,9 @@ struct stage2_map_data {
/* Force mappings to page granularity */
bool force_pte;
+
+ /* Walk should update owner_id only */
+ bool annotation;
};
u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift)
@@ -843,12 +796,15 @@ static bool stage2_try_break_pte(const struct kvm_pgtable_visit_ctx *ctx,
* Perform the appropriate TLB invalidation based on the
* evicted pte value (if any).
*/
- if (kvm_pte_table(ctx->old, ctx->level))
- kvm_tlb_flush_vmid_range(mmu, ctx->addr,
- kvm_granule_size(ctx->level));
- else if (kvm_pte_valid(ctx->old))
+ if (kvm_pte_table(ctx->old, ctx->level)) {
+ u64 size = kvm_granule_size(ctx->level);
+ u64 addr = ALIGN_DOWN(ctx->addr, size);
+
+ kvm_tlb_flush_vmid_range(mmu, addr, size);
+ } else if (kvm_pte_valid(ctx->old)) {
kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu,
ctx->addr, ctx->level);
+ }
}
if (stage2_pte_is_counted(ctx->old))
@@ -896,9 +852,13 @@ static void stage2_unmap_put_pte(const struct kvm_pgtable_visit_ctx *ctx,
if (kvm_pte_valid(ctx->old)) {
kvm_clear_pte(ctx->ptep);
- if (!stage2_unmap_defer_tlb_flush(pgt))
- kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu,
- ctx->addr, ctx->level);
+ if (kvm_pte_table(ctx->old, ctx->level)) {
+ kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr,
+ TLBI_TTL_UNKNOWN);
+ } else if (!stage2_unmap_defer_tlb_flush(pgt)) {
+ kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr,
+ ctx->level);
+ }
}
mm_ops->put_page(ctx->ptep);
@@ -907,12 +867,12 @@ static void stage2_unmap_put_pte(const struct kvm_pgtable_visit_ctx *ctx,
static bool stage2_pte_cacheable(struct kvm_pgtable *pgt, kvm_pte_t pte)
{
u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR;
- return memattr == KVM_S2_MEMATTR(pgt, NORMAL);
+ return kvm_pte_valid(pte) && memattr == KVM_S2_MEMATTR(pgt, NORMAL);
}
static bool stage2_pte_executable(kvm_pte_t pte)
{
- return !(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN);
+ return kvm_pte_valid(pte) && !(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN);
}
static u64 stage2_map_walker_phys_addr(const struct kvm_pgtable_visit_ctx *ctx,
@@ -920,18 +880,7 @@ static u64 stage2_map_walker_phys_addr(const struct kvm_pgtable_visit_ctx *ctx,
{
u64 phys = data->phys;
- /*
- * Stage-2 walks to update ownership data are communicated to the map
- * walker using an invalid PA. Avoid offsetting an already invalid PA,
- * which could overflow and make the address valid again.
- */
- if (!kvm_phys_is_valid(phys))
- return phys;
-
- /*
- * Otherwise, work out the correct PA based on how far the walk has
- * gotten.
- */
+ /* Work out the correct PA based on how far the walk has gotten */
return phys + (ctx->addr - ctx->start);
}
@@ -943,6 +892,9 @@ static bool stage2_leaf_mapping_allowed(const struct kvm_pgtable_visit_ctx *ctx,
if (data->force_pte && ctx->level < KVM_PGTABLE_LAST_LEVEL)
return false;
+ if (data->annotation)
+ return true;
+
return kvm_block_mapping_supported(ctx, phys);
}
@@ -958,7 +910,7 @@ static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx,
if (!stage2_leaf_mapping_allowed(ctx, data))
return -E2BIG;
- if (kvm_phys_is_valid(phys))
+ if (!data->annotation)
new = kvm_init_valid_leaf_pte(phys, data->attr, ctx->level);
else
new = kvm_init_invalid_leaf_owner(data->owner_id);
@@ -972,6 +924,21 @@ static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx,
if (!stage2_pte_needs_update(ctx->old, new))
return -EAGAIN;
+ /* If we're only changing software bits, then store them and go! */
+ if (!kvm_pgtable_walk_shared(ctx) &&
+ !((ctx->old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW)) {
+ bool old_is_counted = stage2_pte_is_counted(ctx->old);
+
+ if (old_is_counted != stage2_pte_is_counted(new)) {
+ if (old_is_counted)
+ mm_ops->put_page(ctx->ptep);
+ else
+ mm_ops->get_page(ctx->ptep);
+ }
+ WARN_ON_ONCE(!stage2_try_set_pte(ctx, new));
+ return 0;
+ }
+
if (!stage2_try_break_pte(ctx, data->mmu))
return -EAGAIN;
@@ -1105,11 +1072,11 @@ int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size,
{
int ret;
struct stage2_map_data map_data = {
- .phys = KVM_PHYS_INVALID,
.mmu = pgt->mmu,
.memcache = mc,
.owner_id = owner_id,
.force_pte = true,
+ .annotation = true,
};
struct kvm_pgtable_walker walker = {
.cb = stage2_map_walker,
@@ -1265,19 +1232,15 @@ int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size)
NULL, NULL, 0);
}
-kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr)
+void kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr,
+ enum kvm_pgtable_walk_flags flags)
{
- kvm_pte_t pte = 0;
int ret;
ret = stage2_update_leaf_attrs(pgt, addr, 1, KVM_PTE_LEAF_ATTR_LO_S2_AF, 0,
- &pte, NULL,
- KVM_PGTABLE_WALK_HANDLE_FAULT |
- KVM_PGTABLE_WALK_SHARED);
+ NULL, NULL, flags);
if (!ret)
dsb(ishst);
-
- return pte;
}
struct stage2_age_data {
@@ -1331,7 +1294,7 @@ bool kvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr,
}
int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
- enum kvm_pgtable_prot prot)
+ enum kvm_pgtable_prot prot, enum kvm_pgtable_walk_flags flags)
{
int ret;
s8 level;
@@ -1349,9 +1312,7 @@ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
if (prot & KVM_PGTABLE_PROT_X)
clr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;
- ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level,
- KVM_PGTABLE_WALK_HANDLE_FAULT |
- KVM_PGTABLE_WALK_SHARED);
+ ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level, flags);
if (!ret || ret == -EAGAIN)
kvm_call_hyp(__kvm_tlb_flush_vmid_ipa_nsh, pgt->mmu, addr, level);
return ret;
@@ -1363,7 +1324,7 @@ static int stage2_flush_walker(const struct kvm_pgtable_visit_ctx *ctx,
struct kvm_pgtable *pgt = ctx->arg;
struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops;
- if (!kvm_pte_valid(ctx->old) || !stage2_pte_cacheable(pgt, ctx->old))
+ if (!stage2_pte_cacheable(pgt, ctx->old))
return 0;
if (mm_ops->dcache_clean_inval_poc)
@@ -1525,7 +1486,6 @@ static int stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx,
*/
new = kvm_init_table_pte(childp, mm_ops);
stage2_make_pte(ctx, new);
- dsb(ishst);
return 0;
}
@@ -1537,8 +1497,11 @@ int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
.flags = KVM_PGTABLE_WALK_LEAF,
.arg = mc,
};
+ int ret;
- return kvm_pgtable_walk(pgt, addr, size, &walker);
+ ret = kvm_pgtable_walk(pgt, addr, size, &walker);
+ dsb(ishst);
+ return ret;
}
int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
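
The new early-out in stage2_map_walker_try_leaf() deserves spelling out: when the old and new descriptors differ only in the software-defined bits (58:55, per the KVM_PTE_LEAF_ATTR_HI_SW definition removed above), the walker stores the new value in place and skips the break-before-make sequence and its TLB invalidation. The predicate, as a standalone sketch with an invented helper name:

#include <stdbool.h>
#include <stdint.h>

/* Bits 58:55 of a stage-2 descriptor are reserved for software use. */
#define MODEL_PTE_SW_BITS	(0xfULL << 55)

/*
 * True when old and new descriptors differ only in software bits, in which
 * case no break-before-make and no TLB invalidation are needed.
 */
static bool only_sw_bits_changed(uint64_t old_pte, uint64_t new_pte)
{
	return ((old_pte ^ new_pte) & ~MODEL_PTE_SW_BITS) == 0;
}
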
diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c
index 6cb638b184b1..ed363aa3027e 100644
--- a/arch/arm64/kvm/hyp/vgic-v3-sr.c
+++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c
@@ -18,7 +18,7 @@
#define vtr_to_nr_pre_bits(v) ((((u32)(v) >> 26) & 7) + 1)
#define vtr_to_nr_apr_regs(v) (1 << (vtr_to_nr_pre_bits(v) - 5))
-static u64 __gic_v3_get_lr(unsigned int lr)
+u64 __gic_v3_get_lr(unsigned int lr)
{
switch (lr & 0xf) {
case 0:
@@ -218,7 +218,7 @@ void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if)
elrsr = read_gicreg(ICH_ELRSR_EL2);
- write_gicreg(cpu_if->vgic_hcr & ~ICH_HCR_EN, ICH_HCR_EL2);
+ write_gicreg(cpu_if->vgic_hcr & ~ICH_HCR_EL2_En, ICH_HCR_EL2);
for (i = 0; i < used_lrs; i++) {
if (elrsr & (1 << i))
@@ -268,8 +268,16 @@ void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if)
* starting to mess with the rest of the GIC, and VMCR_EL2 in
* particular. This logic must be called before
* __vgic_v3_restore_state().
+ *
+ * However, if the vgic is disabled (ICH_HCR_EL2.EN==0), no GIC is
+ * provisioned at all. In order to prevent illegal accesses to the
+ * system registers from trapping to EL1 (duh), force ICC_SRE_EL1.SRE to 1
+ * so that the trap bits can take effect. Yes, we *loves* the GIC.
*/
- if (!cpu_if->vgic_sre) {
+ if (!(cpu_if->vgic_hcr & ICH_HCR_EL2_En)) {
+ write_gicreg(ICC_SRE_EL1_SRE, ICC_SRE_EL1);
+ isb();
+ } else if (!cpu_if->vgic_sre) {
write_gicreg(0, ICC_SRE_EL1);
isb();
write_gicreg(cpu_if->vgic_vmcr, ICH_VMCR_EL2);
@@ -288,8 +296,9 @@ void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if)
}
/*
- * Prevent the guest from touching the GIC system registers if
- * SRE isn't enabled for GICv3 emulation.
+ * Prevent the guest from touching the ICC_SRE_EL1 system
+ * register. Note that this may not have any effect, as
+ * ICC_SRE_EL2.Enable being RAO/WI is a valid implementation.
*/
write_gicreg(read_gicreg(ICC_SRE_EL2) & ~ICC_SRE_EL2_ENABLE,
ICC_SRE_EL2);
@@ -297,10 +306,11 @@ void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if)
/*
* If we need to trap system registers, we must write
* ICH_HCR_EL2 anyway, even if no interrupts are being
- * injected,
+ * injected. Note that this also applies if we don't expect
+ * any system register access (no vgic at all).
*/
if (static_branch_unlikely(&vgic_v3_cpuif_trap) ||
- cpu_if->its_vpe.its_vm)
+ cpu_if->its_vpe.its_vm || !cpu_if->vgic_sre)
write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2);
}
@@ -326,11 +336,11 @@ void __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if)
* no interrupts were being injected, and we disable it again here.
*/
if (static_branch_unlikely(&vgic_v3_cpuif_trap) ||
- cpu_if->its_vpe.its_vm)
+ cpu_if->its_vpe.its_vm || !cpu_if->vgic_sre)
write_gicreg(0, ICH_HCR_EL2);
}
-void __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if)
+static void __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if)
{
u64 val;
u32 nr_pre_bits;
@@ -363,7 +373,7 @@ void __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if)
}
}
-void __vgic_v3_restore_aprs(struct vgic_v3_cpu_if *cpu_if)
+static void __vgic_v3_restore_aprs(struct vgic_v3_cpu_if *cpu_if)
{
u64 val;
u32 nr_pre_bits;
@@ -455,16 +465,35 @@ u64 __vgic_v3_get_gic_config(void)
return val;
}
-u64 __vgic_v3_read_vmcr(void)
+static u64 __vgic_v3_read_vmcr(void)
{
return read_gicreg(ICH_VMCR_EL2);
}
-void __vgic_v3_write_vmcr(u32 vmcr)
+static void __vgic_v3_write_vmcr(u32 vmcr)
{
write_gicreg(vmcr, ICH_VMCR_EL2);
}
+void __vgic_v3_save_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if)
+{
+ __vgic_v3_save_aprs(cpu_if);
+ if (cpu_if->vgic_sre)
+ cpu_if->vgic_vmcr = __vgic_v3_read_vmcr();
+}
+
+void __vgic_v3_restore_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if)
+{
+ /*
+ * If dealing with a GICv2 emulation on GICv3, VMCR_EL2.VFIQen
+ * is dependent on ICC_SRE_EL1.SRE, and we have to perform the
+ * VMCR_EL2 save/restore in the world switch.
+ */
+ if (cpu_if->vgic_sre)
+ __vgic_v3_write_vmcr(cpu_if->vgic_vmcr);
+ __vgic_v3_restore_aprs(cpu_if);
+}
+
static int __vgic_v3_bpr_min(void)
{
/* See Pseudocode for VPriorityGroup */
@@ -723,7 +752,7 @@ static void __vgic_v3_bump_eoicount(void)
u32 hcr;
hcr = read_gicreg(ICH_HCR_EL2);
- hcr += 1 << ICH_HCR_EOIcount_SHIFT;
+ hcr += 1 << ICH_HCR_EL2_EOIcount_SHIFT;
write_gicreg(hcr, ICH_HCR_EL2);
}
@@ -983,9 +1012,6 @@ static void __vgic_v3_read_ctlr(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
val = ((vtr >> 29) & 7) << ICC_CTLR_EL1_PRI_BITS_SHIFT;
/* IDbits */
val |= ((vtr >> 23) & 7) << ICC_CTLR_EL1_ID_BITS_SHIFT;
- /* SEIS */
- if (kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_SEIS_MASK)
- val |= BIT(ICC_CTLR_EL1_SEIS_SHIFT);
/* A3V */
val |= ((vtr >> 21) & 1) << ICC_CTLR_EL1_A3V_SHIFT;
/* EOImode */
@@ -1013,6 +1039,75 @@ static void __vgic_v3_write_ctlr(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
write_gicreg(vmcr, ICH_VMCR_EL2);
}
+static bool __vgic_v3_check_trap_forwarding(struct kvm_vcpu *vcpu,
+ u32 sysreg, bool is_read)
+{
+ u64 ich_hcr;
+
+ if (!vcpu_has_nv(vcpu) || is_hyp_ctxt(vcpu))
+ return false;
+
+ ich_hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2);
+
+ switch (sysreg) {
+ case SYS_ICC_IGRPEN0_EL1:
+ if (is_read &&
+ (__vcpu_sys_reg(vcpu, HFGRTR_EL2) & HFGxTR_EL2_ICC_IGRPENn_EL1))
+ return true;
+
+ if (!is_read &&
+ (__vcpu_sys_reg(vcpu, HFGWTR_EL2) & HFGxTR_EL2_ICC_IGRPENn_EL1))
+ return true;
+
+ fallthrough;
+
+ case SYS_ICC_AP0Rn_EL1(0):
+ case SYS_ICC_AP0Rn_EL1(1):
+ case SYS_ICC_AP0Rn_EL1(2):
+ case SYS_ICC_AP0Rn_EL1(3):
+ case SYS_ICC_BPR0_EL1:
+ case SYS_ICC_EOIR0_EL1:
+ case SYS_ICC_HPPIR0_EL1:
+ case SYS_ICC_IAR0_EL1:
+ return ich_hcr & ICH_HCR_EL2_TALL0;
+
+ case SYS_ICC_IGRPEN1_EL1:
+ if (is_read &&
+ (__vcpu_sys_reg(vcpu, HFGRTR_EL2) & HFGxTR_EL2_ICC_IGRPENn_EL1))
+ return true;
+
+ if (!is_read &&
+ (__vcpu_sys_reg(vcpu, HFGWTR_EL2) & HFGxTR_EL2_ICC_IGRPENn_EL1))
+ return true;
+
+ fallthrough;
+
+ case SYS_ICC_AP1Rn_EL1(0):
+ case SYS_ICC_AP1Rn_EL1(1):
+ case SYS_ICC_AP1Rn_EL1(2):
+ case SYS_ICC_AP1Rn_EL1(3):
+ case SYS_ICC_BPR1_EL1:
+ case SYS_ICC_EOIR1_EL1:
+ case SYS_ICC_HPPIR1_EL1:
+ case SYS_ICC_IAR1_EL1:
+ return ich_hcr & ICH_HCR_EL2_TALL1;
+
+ case SYS_ICC_DIR_EL1:
+ if (ich_hcr & ICH_HCR_EL2_TDIR)
+ return true;
+
+ fallthrough;
+
+ case SYS_ICC_RPR_EL1:
+ case SYS_ICC_CTLR_EL1:
+ case SYS_ICC_PMR_EL1:
+ return ich_hcr & ICH_HCR_EL2_TC;
+
+ default:
+ return false;
+ }
+}
+
int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu)
{
int rt;
@@ -1022,6 +1117,9 @@ int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu)
bool is_read;
u32 sysreg;
+ if (kern_hyp_va(vcpu->kvm)->arch.vgic.vgic_model != KVM_DEV_TYPE_ARM_VGIC_V3)
+ return 0;
+
esr = kvm_vcpu_get_esr(vcpu);
if (vcpu_mode_is_32bit(vcpu)) {
if (!kvm_condition_valid(vcpu)) {
@@ -1036,6 +1134,9 @@ int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu)
is_read = (esr & ESR_ELx_SYS64_ISS_DIR_MASK) == ESR_ELx_SYS64_ISS_DIR_READ;
+ if (__vgic_v3_check_trap_forwarding(vcpu, sysreg, is_read))
+ return 0;
+
switch (sysreg) {
case SYS_ICC_IAR0_EL1:
case SYS_ICC_IAR1_EL1:
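
__vgic_v3_check_trap_forwarding() decides whether a trapped ICC_* access from a nested guest should be reflected to the guest hypervisor instead of being emulated here, based on the L1's ICH_HCR_EL2 controls (TALL0, TALL1, TDIR, TC) and the fine-grained IGRPEN traps. Stripped of the register decoding, the decision reduces to roughly the following; the enum and parameter names are invented for illustration:

#include <stdbool.h>

enum icc_reg_class {
	ICC_GROUP0_REG,		/* IAR0/EOIR0/BPR0/AP0Rn/HPPIR0 ...	*/
	ICC_GROUP1_REG,		/* IAR1/EOIR1/BPR1/AP1Rn/HPPIR1 ...	*/
	ICC_COMMON_REG,		/* RPR/CTLR/PMR (and DIR without TDIR)	*/
};

static bool forward_to_guest_hyp(enum icc_reg_class class,
				 bool tall0, bool tall1, bool tc)
{
	switch (class) {
	case ICC_GROUP0_REG:
		return tall0;
	case ICC_GROUP1_REG:
		return tall1;
	case ICC_COMMON_REG:
		return tc;
	}
	return false;
}
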
diff --git a/arch/arm64/kvm/hyp/vhe/Makefile b/arch/arm64/kvm/hyp/vhe/Makefile
index 3b9e5464b5b3..afc4aed9231a 100644
--- a/arch/arm64/kvm/hyp/vhe/Makefile
+++ b/arch/arm64/kvm/hyp/vhe/Makefile
@@ -6,6 +6,8 @@
asflags-y := -D__KVM_VHE_HYPERVISOR__
ccflags-y := -D__KVM_VHE_HYPERVISOR__
+CFLAGS_switch.o += -Wno-override-init
+
obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o
obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \
../fpsimd.o ../hyp-entry.o ../exception.o
diff --git a/arch/arm64/kvm/hyp/vhe/debug-sr.c b/arch/arm64/kvm/hyp/vhe/debug-sr.c
index 289689b2682d..0100339b09e0 100644
--- a/arch/arm64/kvm/hyp/vhe/debug-sr.c
+++ b/arch/arm64/kvm/hyp/vhe/debug-sr.c
@@ -19,8 +19,3 @@ void __debug_switch_to_host(struct kvm_vcpu *vcpu)
{
__debug_switch_to_host_common(vcpu);
}
-
-u64 __kvm_get_mdcr_el2(void)
-{
- return read_sysreg(mdcr_el2);
-}
diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c
index 1581df6aec87..731a0378ed13 100644
--- a/arch/arm64/kvm/hyp/vhe/switch.c
+++ b/arch/arm64/kvm/hyp/vhe/switch.c
@@ -33,11 +33,124 @@ DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data);
DEFINE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
DEFINE_PER_CPU(unsigned long, kvm_hyp_vector);
+/*
+ * HCR_EL2 bits that the NV guest can freely change (no RES0/RES1
+ * semantics, irrespective of the configuration), but that cannot be
+ * applied to the actual HW as things would otherwise break badly.
+ *
+ * - TGE: we want the guest to use EL1, which is incompatible with
+ * this bit being set
+ *
+ * - API/APK: they are already accounted for by vcpu_load(), and can
+ * only take effect across a load/put cycle (such as ERET)
+ */
+#define NV_HCR_GUEST_EXCLUDE (HCR_TGE | HCR_API | HCR_APK)
+
+static u64 __compute_hcr(struct kvm_vcpu *vcpu)
+{
+ u64 hcr = vcpu->arch.hcr_el2;
+
+ if (!vcpu_has_nv(vcpu))
+ return hcr;
+
+ if (is_hyp_ctxt(vcpu)) {
+ hcr |= HCR_NV | HCR_NV2 | HCR_AT | HCR_TTLB;
+
+ if (!vcpu_el2_e2h_is_set(vcpu))
+ hcr |= HCR_NV1;
+
+ write_sysreg_s(vcpu->arch.ctxt.vncr_array, SYS_VNCR_EL2);
+ }
+
+ return hcr | (__vcpu_sys_reg(vcpu, HCR_EL2) & ~NV_HCR_GUEST_EXCLUDE);
+}
+
+static void __activate_cptr_traps(struct kvm_vcpu *vcpu)
+{
+ u64 cptr;
+
+ /*
+ * With VHE (HCR.E2H == 1), accesses to CPACR_EL1 are routed to
+ * CPTR_EL2. In general, CPACR_EL1 has the same layout as CPTR_EL2,
+ * except for some missing controls, such as TAM.
+ * In this case, CPTR_EL2.TAM has the same position with or without
+ * VHE (HCR.E2H == 1) which allows us to use here the CPTR_EL2.TAM
+ * shift value for trapping the AMU accesses.
+ */
+ u64 val = CPACR_EL1_TTA | CPTR_EL2_TAM;
+
+ if (guest_owns_fp_regs()) {
+ val |= CPACR_EL1_FPEN;
+ if (vcpu_has_sve(vcpu))
+ val |= CPACR_EL1_ZEN;
+ } else {
+ __activate_traps_fpsimd32(vcpu);
+ }
+
+ if (!vcpu_has_nv(vcpu))
+ goto write;
+
+ /*
+ * The architecture is a bit crap (what a surprise): an EL2 guest
+ * writing to CPTR_EL2 via CPACR_EL1 can't set any of TCPAC or TTA,
+ * as they are RES0 in the guest's view. To work around it, trap the
+ * sucker using the very same bit it can't set...
+ */
+ if (vcpu_el2_e2h_is_set(vcpu) && is_hyp_ctxt(vcpu))
+ val |= CPTR_EL2_TCPAC;
+
+ /*
+ * Layer the guest hypervisor's trap configuration on top of our own if
+ * we're in a nested context.
+ */
+ if (is_hyp_ctxt(vcpu))
+ goto write;
+
+ cptr = vcpu_sanitised_cptr_el2(vcpu);
+
+ /*
+ * Pay attention, there's some interesting detail here.
+ *
+ * The CPTR_EL2.xEN fields are 2 bits wide, although there are only two
+ * meaningful trap states when HCR_EL2.TGE = 0 (running a nested guest):
+ *
+ * - CPTR_EL2.xEN = x0, traps are enabled
+ * - CPTR_EL2.xEN = x1, traps are disabled
+ *
+ * In other words, bit[0] determines if guest accesses trap or not. In
+ * the interest of simplicity, clear the entire field if the guest
+ * hypervisor has traps enabled to dispel any illusion of something more
+ * complicated taking place.
+ */
+ if (!(SYS_FIELD_GET(CPACR_EL1, FPEN, cptr) & BIT(0)))
+ val &= ~CPACR_EL1_FPEN;
+ if (!(SYS_FIELD_GET(CPACR_EL1, ZEN, cptr) & BIT(0)))
+ val &= ~CPACR_EL1_ZEN;
+
+ if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, S2POE, IMP))
+ val |= cptr & CPACR_EL1_E0POE;
+
+ val |= cptr & CPTR_EL2_TCPAC;
+
+write:
+ write_sysreg(val, cpacr_el1);
+}
+
+static void __deactivate_cptr_traps(struct kvm_vcpu *vcpu)
+{
+ u64 val = CPACR_EL1_FPEN | CPACR_EL1_ZEN_EL1EN;
+
+ if (cpus_have_final_cap(ARM64_SME))
+ val |= CPACR_EL1_SMEN_EL1EN;
+
+ write_sysreg(val, cpacr_el1);
+}
+
static void __activate_traps(struct kvm_vcpu *vcpu)
{
u64 val;
- ___activate_traps(vcpu);
+ ___activate_traps(vcpu, __compute_hcr(vcpu));
if (has_cntpoff()) {
struct timer_map map;
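
__compute_hcr() above layers the L1 hypervisor's HCR_EL2 value on top of KVM's own configuration, masking out the few bits (TGE, API, APK) that must not reach the hardware directly. Reduced to its essence, with invented bit values rather than the architectural positions:

#include <stdint.h>

#define MODEL_HCR_TGE		(1ULL << 0)
#define MODEL_HCR_API		(1ULL << 1)
#define MODEL_HCR_APK		(1ULL << 2)
#define MODEL_HCR_EXCLUDE	(MODEL_HCR_TGE | MODEL_HCR_API | MODEL_HCR_APK)

/* KVM's own configuration wins; everything else follows the guest hyp. */
static uint64_t model_compute_hcr(uint64_t kvm_hcr, uint64_t guest_hcr)
{
	return kvm_hcr | (guest_hcr & ~MODEL_HCR_EXCLUDE);
}
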
@@ -59,31 +172,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu)
}
}
- val = read_sysreg(cpacr_el1);
- val |= CPACR_ELx_TTA;
- val &= ~(CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN |
- CPACR_EL1_SMEN_EL0EN | CPACR_EL1_SMEN_EL1EN);
-
- /*
- * With VHE (HCR.E2H == 1), accesses to CPACR_EL1 are routed to
- * CPTR_EL2. In general, CPACR_EL1 has the same layout as CPTR_EL2,
- * except for some missing controls, such as TAM.
- * In this case, CPTR_EL2.TAM has the same position with or without
- * VHE (HCR.E2H == 1) which allows us to use here the CPTR_EL2.TAM
- * shift value for trapping the AMU accesses.
- */
-
- val |= CPTR_EL2_TAM;
-
- if (guest_owns_fp_regs(vcpu)) {
- if (vcpu_has_sve(vcpu))
- val |= CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN;
- } else {
- val &= ~(CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN);
- __activate_traps_fpsimd32(vcpu);
- }
-
- write_sysreg(val, cpacr_el1);
+ __activate_cptr_traps(vcpu);
write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el1);
}
@@ -128,7 +217,7 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu)
*/
asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT));
- kvm_reset_cptr_el2(vcpu);
+ __deactivate_cptr_traps(vcpu);
if (!arm64_kernel_unmapped_at_el0())
host_vectors = __this_cpu_read(this_cpu_vector);
@@ -162,6 +251,8 @@ static void __vcpu_put_deactivate_traps(struct kvm_vcpu *vcpu)
void kvm_vcpu_load_vhe(struct kvm_vcpu *vcpu)
{
+ host_data_ptr(host_ctxt)->__hyp_running_vcpu = vcpu;
+
__vcpu_load_switch_sysregs(vcpu);
__vcpu_load_activate_traps(vcpu);
__load_stage2(vcpu->arch.hw_mmu, vcpu->arch.hw_mmu->arch);
@@ -171,33 +262,315 @@ void kvm_vcpu_put_vhe(struct kvm_vcpu *vcpu)
{
__vcpu_put_deactivate_traps(vcpu);
__vcpu_put_switch_sysregs(vcpu);
+
+ host_data_ptr(host_ctxt)->__hyp_running_vcpu = NULL;
+}
+
+static u64 compute_emulated_cntx_ctl_el0(struct kvm_vcpu *vcpu,
+ enum vcpu_sysreg reg)
+{
+ unsigned long ctl;
+ u64 cval, cnt;
+ bool stat;
+
+ switch (reg) {
+ case CNTP_CTL_EL0:
+ cval = __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0);
+ ctl = __vcpu_sys_reg(vcpu, CNTP_CTL_EL0);
+ cnt = compute_counter_value(vcpu_ptimer(vcpu));
+ break;
+ case CNTV_CTL_EL0:
+ cval = __vcpu_sys_reg(vcpu, CNTV_CVAL_EL0);
+ ctl = __vcpu_sys_reg(vcpu, CNTV_CTL_EL0);
+ cnt = compute_counter_value(vcpu_vtimer(vcpu));
+ break;
+ default:
+ BUG();
+ }
+
+ stat = cval <= cnt;
+ __assign_bit(__ffs(ARCH_TIMER_CTRL_IT_STAT), &ctl, stat);
+
+ return ctl;
+}
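
The emulated ISTATUS bit above is simply "CVAL <= counter" folded into bit 2 of the control register. A hedged standalone sketch of that computation (ARCH_TIMER_CTRL_IT_STAT is bit 2 architecturally; the counter-offset handling done by compute_counter_value() is assumed away here):

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

#define TIMER_CTRL_IT_STAT	(1u << 2)	/* ISTATUS */

/* Fold the "timer condition met" state into the emulated CTL value. */
static uint64_t emulate_ctl(uint64_t ctl, uint64_t cval, uint64_t cnt)
{
	bool stat = cval <= cnt;

	ctl &= ~(uint64_t)TIMER_CTRL_IT_STAT;
	if (stat)
		ctl |= TIMER_CTRL_IT_STAT;

	return ctl;
}

int main(void)
{
	printf("%#llx\n", (unsigned long long)emulate_ctl(0x1, 100, 99));  /* timer not fired: 0x1 */
	printf("%#llx\n", (unsigned long long)emulate_ctl(0x1, 100, 100)); /* fired: 0x5 */
	return 0;
}
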
+
+static bool kvm_hyp_handle_timer(struct kvm_vcpu *vcpu, u64 *exit_code)
+{
+ u64 esr, val;
+
+ /*
+ * Having FEAT_ECV allows for a better quality of timer emulation.
+ * However, this comes at a huge cost in terms of traps. Try and
+ * satisfy the reads from the guest's hypervisor context without
+ * returning to the kernel if we can.
+ */
+ if (!is_hyp_ctxt(vcpu))
+ return false;
+
+ esr = kvm_vcpu_get_esr(vcpu);
+ if ((esr & ESR_ELx_SYS64_ISS_DIR_MASK) != ESR_ELx_SYS64_ISS_DIR_READ)
+ return false;
+
+ switch (esr_sys64_to_sysreg(esr)) {
+ case SYS_CNTP_CTL_EL02:
+ val = compute_emulated_cntx_ctl_el0(vcpu, CNTP_CTL_EL0);
+ break;
+ case SYS_CNTP_CTL_EL0:
+ if (vcpu_el2_e2h_is_set(vcpu))
+ val = read_sysreg_el0(SYS_CNTP_CTL);
+ else
+ val = compute_emulated_cntx_ctl_el0(vcpu, CNTP_CTL_EL0);
+ break;
+ case SYS_CNTP_CVAL_EL02:
+ val = __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0);
+ break;
+ case SYS_CNTP_CVAL_EL0:
+ if (vcpu_el2_e2h_is_set(vcpu)) {
+ val = read_sysreg_el0(SYS_CNTP_CVAL);
+
+ if (!has_cntpoff())
+ val -= timer_get_offset(vcpu_hptimer(vcpu));
+ } else {
+ val = __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0);
+ }
+ break;
+ case SYS_CNTPCT_EL0:
+ case SYS_CNTPCTSS_EL0:
+ val = compute_counter_value(vcpu_hptimer(vcpu));
+ break;
+ case SYS_CNTV_CTL_EL02:
+ val = compute_emulated_cntx_ctl_el0(vcpu, CNTV_CTL_EL0);
+ break;
+ case SYS_CNTV_CTL_EL0:
+ if (vcpu_el2_e2h_is_set(vcpu))
+ val = read_sysreg_el0(SYS_CNTV_CTL);
+ else
+ val = compute_emulated_cntx_ctl_el0(vcpu, CNTV_CTL_EL0);
+ break;
+ case SYS_CNTV_CVAL_EL02:
+ val = __vcpu_sys_reg(vcpu, CNTV_CVAL_EL0);
+ break;
+ case SYS_CNTV_CVAL_EL0:
+ if (vcpu_el2_e2h_is_set(vcpu))
+ val = read_sysreg_el0(SYS_CNTV_CVAL);
+ else
+ val = __vcpu_sys_reg(vcpu, CNTV_CVAL_EL0);
+ break;
+ case SYS_CNTVCT_EL0:
+ case SYS_CNTVCTSS_EL0:
+ val = compute_counter_value(vcpu_hvtimer(vcpu));
+ break;
+ default:
+ return false;
+ }
+
+ vcpu_set_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu), val);
+ __kvm_skip_instr(vcpu);
+
+ return true;
+}
+
+static bool kvm_hyp_handle_eret(struct kvm_vcpu *vcpu, u64 *exit_code)
+{
+ u64 esr = kvm_vcpu_get_esr(vcpu);
+ u64 spsr, elr, mode;
+
+ /*
+ * Going through the whole put/load motions is a waste of time
+ * if this is a VHE guest hypervisor returning to its own
+ * userspace, or the hypervisor performing a local exception
+ * return. No need to save/restore registers, no need to
+ * switch S2 MMU. Just do the canonical ERET.
+ *
+ * Unless the trap has to be forwarded further down the line,
+ * of course...
+ */
+ if ((__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_NV) ||
+ (__vcpu_sys_reg(vcpu, HFGITR_EL2) & HFGITR_EL2_ERET))
+ return false;
+
+ spsr = read_sysreg_el1(SYS_SPSR);
+ mode = spsr & (PSR_MODE_MASK | PSR_MODE32_BIT);
+
+ switch (mode) {
+ case PSR_MODE_EL0t:
+ if (!(vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)))
+ return false;
+ break;
+ case PSR_MODE_EL2t:
+ mode = PSR_MODE_EL1t;
+ break;
+ case PSR_MODE_EL2h:
+ mode = PSR_MODE_EL1h;
+ break;
+ default:
+ return false;
+ }
+
+ /* If ERETAx fails, take the slow path */
+ if (esr_iss_is_eretax(esr)) {
+ if (!(vcpu_has_ptrauth(vcpu) && kvm_auth_eretax(vcpu, &elr)))
+ return false;
+ } else {
+ elr = read_sysreg_el1(SYS_ELR);
+ }
+
+ spsr = (spsr & ~(PSR_MODE_MASK | PSR_MODE32_BIT)) | mode;
+
+ write_sysreg_el2(spsr, SYS_SPSR);
+ write_sysreg_el2(elr, SYS_ELR);
+
+ return true;
+}
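
The core of the fast ERET path is the PSTATE.M rewrite, where a virtual-EL2 target mode is demoted to the corresponding EL1 mode before being written back into SPSR_EL2. A small sketch of just that remapping, with the architectural mode encodings; the AArch32 and ERETAx checks performed by the function above are omitted:

#include <stdint.h>
#include <stdio.h>

#define PSR_MODE_MASK	0xf
#define PSR_MODE_EL0t	0x0
#define PSR_MODE_EL1t	0x4
#define PSR_MODE_EL1h	0x5
#define PSR_MODE_EL2t	0x8
#define PSR_MODE_EL2h	0x9

/* Returns -1 when the trap must go down the slow path instead. */
static int remap_eret_mode(uint64_t spsr)
{
	switch (spsr & PSR_MODE_MASK) {
	case PSR_MODE_EL2t: return PSR_MODE_EL1t;
	case PSR_MODE_EL2h: return PSR_MODE_EL1h;
	case PSR_MODE_EL0t: return PSR_MODE_EL0t;	/* only legal if E2H+TGE */
	default:	    return -1;
	}
}

int main(void)
{
	printf("EL2h -> %#x\n", remap_eret_mode(PSR_MODE_EL2h));	/* 0x5 */
	printf("EL1h -> %d\n", remap_eret_mode(PSR_MODE_EL1h));	/* -1: slow path */
	return 0;
}
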
+
+static bool kvm_hyp_handle_tlbi_el2(struct kvm_vcpu *vcpu, u64 *exit_code)
+{
+ int ret = -EINVAL;
+ u32 instr;
+ u64 val;
+
+ /*
+ * Ideally, we would never trap on EL2 S1 TLB invalidations using
+ * the EL1 instructions when the guest's HCR_EL2.{E2H,TGE}=={1,1}.
+ * But "thanks" to FEAT_NV2, we don't trap writes to HCR_EL2,
+ * meaning that we can't track changes to the virtual TGE bit. So we
+ * have to leave HCR_EL2.TTLB set on the host. Oopsie...
+ *
+ * Try and handle these invalidations as quickly as possible, without
+ * fully exiting. Note that we don't need to consider any forwarding
+ * here, as having E2H+TGE set is the very definition of being
+ * InHost.
+ *
+ * For the lesser hypervisors out there that have failed to get on
+ * with the VHE program, we can also handle the nVHE style of EL2
+ * invalidation.
+ */
+ if (!(is_hyp_ctxt(vcpu)))
+ return false;
+
+ instr = esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu));
+ val = vcpu_get_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu));
+
+ if ((kvm_supported_tlbi_s1e1_op(vcpu, instr) &&
+ vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)) ||
+ kvm_supported_tlbi_s1e2_op(vcpu, instr))
+ ret = __kvm_tlbi_s1e2(NULL, val, instr);
+
+ if (ret)
+ return false;
+
+ __kvm_skip_instr(vcpu);
+
+ return true;
+}
+
+static bool kvm_hyp_handle_cpacr_el1(struct kvm_vcpu *vcpu, u64 *exit_code)
+{
+ u64 esr = kvm_vcpu_get_esr(vcpu);
+ int rt;
+
+ if (!is_hyp_ctxt(vcpu) || esr_sys64_to_sysreg(esr) != SYS_CPACR_EL1)
+ return false;
+
+ rt = kvm_vcpu_sys_get_rt(vcpu);
+
+ if ((esr & ESR_ELx_SYS64_ISS_DIR_MASK) == ESR_ELx_SYS64_ISS_DIR_READ) {
+ vcpu_set_reg(vcpu, rt, __vcpu_sys_reg(vcpu, CPTR_EL2));
+ } else {
+ vcpu_write_sys_reg(vcpu, vcpu_get_reg(vcpu, rt), CPTR_EL2);
+ __activate_cptr_traps(vcpu);
+ }
+
+ __kvm_skip_instr(vcpu);
+
+ return true;
+}
+
+static bool kvm_hyp_handle_zcr_el2(struct kvm_vcpu *vcpu, u64 *exit_code)
+{
+ u32 sysreg = esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu));
+
+ if (!vcpu_has_nv(vcpu))
+ return false;
+
+ if (sysreg != SYS_ZCR_EL2)
+ return false;
+
+ if (guest_owns_fp_regs())
+ return false;
+
+ /*
+ * ZCR_EL2 traps are handled in the slow path, with the expectation
+ * that the guest's FP context has already been loaded onto the CPU.
+ *
+ * Load the guest's FP context and unconditionally forward to the
+ * slow path for handling (i.e. return false).
+ */
+ kvm_hyp_handle_fpsimd(vcpu, exit_code);
+ return false;
+}
+
+static bool kvm_hyp_handle_sysreg_vhe(struct kvm_vcpu *vcpu, u64 *exit_code)
+{
+ if (kvm_hyp_handle_tlbi_el2(vcpu, exit_code))
+ return true;
+
+ if (kvm_hyp_handle_timer(vcpu, exit_code))
+ return true;
+
+ if (kvm_hyp_handle_cpacr_el1(vcpu, exit_code))
+ return true;
+
+ if (kvm_hyp_handle_zcr_el2(vcpu, exit_code))
+ return true;
+
+ return kvm_hyp_handle_sysreg(vcpu, exit_code);
+}
+
+static bool kvm_hyp_handle_impdef(struct kvm_vcpu *vcpu, u64 *exit_code)
+{
+ u64 iss;
+
+ if (!cpus_have_final_cap(ARM64_WORKAROUND_PMUV3_IMPDEF_TRAPS))
+ return false;
+
+ /*
+ * Compute a synthetic ESR for a sysreg trap. Conveniently, AFSR1_EL2
+ * is populated with a correct ISS for a sysreg trap. These fruity
+ * parts are 64bit only, so unconditionally set IL.
+ */
+ iss = ESR_ELx_ISS(read_sysreg_s(SYS_AFSR1_EL2));
+ vcpu->arch.fault.esr_el2 = FIELD_PREP(ESR_ELx_EC_MASK, ESR_ELx_EC_SYS64) |
+ FIELD_PREP(ESR_ELx_ISS_MASK, iss) |
+ ESR_ELx_IL;
+ return false;
}
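
A standalone sketch of assembling such a synthetic sysreg ESR from an ISS value, mirroring the FIELD_PREP calls above (EC sits in bits [31:26], IL in bit 25, ISS in bits [24:0], and ESR_ELx_EC_SYS64 is 0x18; the ISS value in the example is made up):

#include <stdint.h>
#include <stdio.h>

#define ESR_EC_SHIFT	26
#define ESR_IL		(UINT64_C(1) << 25)
#define ESR_ISS_MASK	((UINT64_C(1) << 25) - 1)
#define ESR_EC_SYS64	UINT64_C(0x18)

static uint64_t synth_sysreg_esr(uint64_t iss)
{
	return (ESR_EC_SYS64 << ESR_EC_SHIFT) | ESR_IL | (iss & ESR_ISS_MASK);
}

int main(void)
{
	/* an invented ISS describing some 64-bit sysreg access */
	printf("%#llx\n", (unsigned long long)synth_sysreg_esr(0x300400));
	return 0;
}
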
static const exit_handler_fn hyp_exit_handlers[] = {
[0 ... ESR_ELx_EC_MAX] = NULL,
[ESR_ELx_EC_CP15_32] = kvm_hyp_handle_cp15_32,
- [ESR_ELx_EC_SYS64] = kvm_hyp_handle_sysreg,
+ [ESR_ELx_EC_SYS64] = kvm_hyp_handle_sysreg_vhe,
[ESR_ELx_EC_SVE] = kvm_hyp_handle_fpsimd,
[ESR_ELx_EC_FP_ASIMD] = kvm_hyp_handle_fpsimd,
[ESR_ELx_EC_IABT_LOW] = kvm_hyp_handle_iabt_low,
[ESR_ELx_EC_DABT_LOW] = kvm_hyp_handle_dabt_low,
[ESR_ELx_EC_WATCHPT_LOW] = kvm_hyp_handle_watchpt_low,
- [ESR_ELx_EC_PAC] = kvm_hyp_handle_ptrauth,
+ [ESR_ELx_EC_ERET] = kvm_hyp_handle_eret,
[ESR_ELx_EC_MOPS] = kvm_hyp_handle_mops,
+
+ /* Apple shenanigans */
+ [0x3F] = kvm_hyp_handle_impdef,
};
-static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu)
+static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code)
{
- return hyp_exit_handlers;
-}
+ synchronize_vcpu_pstate(vcpu, exit_code);
-static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code)
-{
/*
* If we were in HYP context on entry, adjust the PSTATE view
* so that the usual helpers work correctly.
*/
- if (unlikely(vcpu_get_flag(vcpu, VCPU_HYP_CONTEXT))) {
+ if (vcpu_has_nv(vcpu) && (read_sysreg(hcr_el2) & HCR_NV)) {
u64 mode = *vcpu_cpsr(vcpu) & (PSR_MODE_MASK | PSR_MODE32_BIT);
switch (mode) {
@@ -212,6 +585,8 @@ static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code)
*vcpu_cpsr(vcpu) &= ~(PSR_MODE_MASK | PSR_MODE32_BIT);
*vcpu_cpsr(vcpu) |= mode;
}
+
+ return __fixup_guest_exit(vcpu, exit_code, hyp_exit_handlers);
}
/* Switch to the guest for VHE systems running in EL2 */
@@ -221,12 +596,13 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
struct kvm_cpu_context *guest_ctxt;
u64 exit_code;
- host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
- host_ctxt->__hyp_running_vcpu = vcpu;
+ host_ctxt = host_data_ptr(host_ctxt);
guest_ctxt = &vcpu->arch.ctxt;
sysreg_save_host_state_vhe(host_ctxt);
+ fpsimd_lazy_switch_to_guest(vcpu);
+
/*
* Note that ARM erratum 1165522 requires us to configure both stage 1
* and stage 2 translation for the guest context before we clear
@@ -240,11 +616,6 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
sysreg_restore_guest_state_vhe(guest_ctxt);
__debug_switch_to_guest(vcpu);
- if (is_hyp_ctxt(vcpu))
- vcpu_set_flag(vcpu, VCPU_HYP_CONTEXT);
- else
- vcpu_clear_flag(vcpu, VCPU_HYP_CONTEXT);
-
do {
/* Jump in the fire! */
exit_code = __guest_enter(vcpu);
@@ -256,9 +627,11 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
__deactivate_traps(vcpu);
+ fpsimd_lazy_switch_to_host(vcpu);
+
sysreg_restore_host_state_vhe(host_ctxt);
- if (vcpu->arch.fp_state == FP_STATE_GUEST_OWNED)
+ if (guest_owns_fp_regs())
__fpsimd_save_fpexc32(vcpu);
__debug_switch_to_host(vcpu);
@@ -301,12 +674,12 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
return ret;
}
-static void __hyp_call_panic(u64 spsr, u64 elr, u64 par)
+static void __noreturn __hyp_call_panic(u64 spsr, u64 elr, u64 par)
{
struct kvm_cpu_context *host_ctxt;
struct kvm_vcpu *vcpu;
- host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
+ host_ctxt = host_data_ptr(host_ctxt);
vcpu = host_ctxt->__hyp_running_vcpu;
__deactivate_traps(vcpu);
@@ -326,7 +699,6 @@ void __noreturn hyp_panic(void)
u64 par = read_sysreg_par();
__hyp_call_panic(spsr, elr, par);
- unreachable();
}
asmlinkage void kvm_unexpected_el2_exception(void)
diff --git a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c
index a8b9ea496706..3814b0b2c937 100644
--- a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c
+++ b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c
@@ -15,6 +15,132 @@
#include <asm/kvm_hyp.h>
#include <asm/kvm_nested.h>
+static void __sysreg_save_vel2_state(struct kvm_vcpu *vcpu)
+{
+ /* These registers are common with EL1 */
+ __vcpu_sys_reg(vcpu, PAR_EL1) = read_sysreg(par_el1);
+ __vcpu_sys_reg(vcpu, TPIDR_EL1) = read_sysreg(tpidr_el1);
+
+ __vcpu_sys_reg(vcpu, ESR_EL2) = read_sysreg_el1(SYS_ESR);
+ __vcpu_sys_reg(vcpu, AFSR0_EL2) = read_sysreg_el1(SYS_AFSR0);
+ __vcpu_sys_reg(vcpu, AFSR1_EL2) = read_sysreg_el1(SYS_AFSR1);
+ __vcpu_sys_reg(vcpu, FAR_EL2) = read_sysreg_el1(SYS_FAR);
+ __vcpu_sys_reg(vcpu, MAIR_EL2) = read_sysreg_el1(SYS_MAIR);
+ __vcpu_sys_reg(vcpu, VBAR_EL2) = read_sysreg_el1(SYS_VBAR);
+ __vcpu_sys_reg(vcpu, CONTEXTIDR_EL2) = read_sysreg_el1(SYS_CONTEXTIDR);
+ __vcpu_sys_reg(vcpu, AMAIR_EL2) = read_sysreg_el1(SYS_AMAIR);
+
+ /*
+ * In VHE mode those registers are compatible between EL1 and EL2,
+ * and the guest uses the _EL1 versions on the CPU naturally.
+ * So we save them into their _EL2 versions here.
+ * For nVHE mode we trap accesses to those registers, so our
+ * _EL2 copy in sys_regs[] is always up-to-date and we don't need
+ * to save anything here.
+ */
+ if (vcpu_el2_e2h_is_set(vcpu)) {
+ u64 val;
+
+ /*
+ * We don't save CPTR_EL2, as accesses to CPACR_EL1
+ * are always trapped, ensuring that the in-memory
+ * copy is always up-to-date. A small blessing...
+ */
+ __vcpu_sys_reg(vcpu, SCTLR_EL2) = read_sysreg_el1(SYS_SCTLR);
+ __vcpu_sys_reg(vcpu, TTBR0_EL2) = read_sysreg_el1(SYS_TTBR0);
+ __vcpu_sys_reg(vcpu, TTBR1_EL2) = read_sysreg_el1(SYS_TTBR1);
+ __vcpu_sys_reg(vcpu, TCR_EL2) = read_sysreg_el1(SYS_TCR);
+
+ if (ctxt_has_tcrx(&vcpu->arch.ctxt)) {
+ __vcpu_sys_reg(vcpu, TCR2_EL2) = read_sysreg_el1(SYS_TCR2);
+
+ if (ctxt_has_s1pie(&vcpu->arch.ctxt)) {
+ __vcpu_sys_reg(vcpu, PIRE0_EL2) = read_sysreg_el1(SYS_PIRE0);
+ __vcpu_sys_reg(vcpu, PIR_EL2) = read_sysreg_el1(SYS_PIR);
+ }
+
+ if (ctxt_has_s1poe(&vcpu->arch.ctxt))
+ __vcpu_sys_reg(vcpu, POR_EL2) = read_sysreg_el1(SYS_POR);
+ }
+
+ /*
+ * The EL1 view of CNTKCTL_EL1 has a bunch of RES0 bits where
+ * the interesting CNTHCTL_EL2 bits live. So preserve these
+ * bits when reading back the guest-visible value.
+ */
+ val = read_sysreg_el1(SYS_CNTKCTL);
+ val &= CNTKCTL_VALID_BITS;
+ __vcpu_sys_reg(vcpu, CNTHCTL_EL2) &= ~CNTKCTL_VALID_BITS;
+ __vcpu_sys_reg(vcpu, CNTHCTL_EL2) |= val;
+ }
+
+ __vcpu_sys_reg(vcpu, SP_EL2) = read_sysreg(sp_el1);
+ __vcpu_sys_reg(vcpu, ELR_EL2) = read_sysreg_el1(SYS_ELR);
+ __vcpu_sys_reg(vcpu, SPSR_EL2) = read_sysreg_el1(SYS_SPSR);
+}
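
The CNTHCTL_EL2 handling above is a masked read-modify-write: only the bits visible through the EL1 view of CNTKCTL are refreshed, while the EL2-only bits are preserved. A generic sketch of that pattern, with arbitrary example values:

#include <stdint.h>
#include <stdio.h>

/* Refresh only the bits covered by @mask, preserving the rest of @stored. */
static uint64_t merge_masked(uint64_t stored, uint64_t hw, uint64_t mask)
{
	return (stored & ~mask) | (hw & mask);
}

int main(void)
{
	uint64_t stored = 0xf0f0, hw = 0x1234, mask = 0x00ff;

	/* keeps 0xf000 from stored, takes 0x34 from hw -> 0xf034 */
	printf("%#llx\n", (unsigned long long)merge_masked(stored, hw, mask));
	return 0;
}
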
+
+static void __sysreg_restore_vel2_state(struct kvm_vcpu *vcpu)
+{
+ u64 val;
+
+ /* These registers are common with EL1 */
+ write_sysreg(__vcpu_sys_reg(vcpu, PAR_EL1), par_el1);
+ write_sysreg(__vcpu_sys_reg(vcpu, TPIDR_EL1), tpidr_el1);
+
+ write_sysreg(ctxt_midr_el1(&vcpu->arch.ctxt), vpidr_el2);
+ write_sysreg(__vcpu_sys_reg(vcpu, MPIDR_EL1), vmpidr_el2);
+ write_sysreg_el1(__vcpu_sys_reg(vcpu, MAIR_EL2), SYS_MAIR);
+ write_sysreg_el1(__vcpu_sys_reg(vcpu, VBAR_EL2), SYS_VBAR);
+ write_sysreg_el1(__vcpu_sys_reg(vcpu, CONTEXTIDR_EL2), SYS_CONTEXTIDR);
+ write_sysreg_el1(__vcpu_sys_reg(vcpu, AMAIR_EL2), SYS_AMAIR);
+
+ if (vcpu_el2_e2h_is_set(vcpu)) {
+ /*
+ * In VHE mode those registers are compatible between
+ * EL1 and EL2.
+ */
+ write_sysreg_el1(__vcpu_sys_reg(vcpu, SCTLR_EL2), SYS_SCTLR);
+ write_sysreg_el1(__vcpu_sys_reg(vcpu, CPTR_EL2), SYS_CPACR);
+ write_sysreg_el1(__vcpu_sys_reg(vcpu, TTBR0_EL2), SYS_TTBR0);
+ write_sysreg_el1(__vcpu_sys_reg(vcpu, TTBR1_EL2), SYS_TTBR1);
+ write_sysreg_el1(__vcpu_sys_reg(vcpu, TCR_EL2), SYS_TCR);
+ write_sysreg_el1(__vcpu_sys_reg(vcpu, CNTHCTL_EL2), SYS_CNTKCTL);
+ } else {
+ /*
+ * CNTHCTL_EL2 only affects EL1 when running nVHE, so
+ * no need to restore it.
+ */
+ val = translate_sctlr_el2_to_sctlr_el1(__vcpu_sys_reg(vcpu, SCTLR_EL2));
+ write_sysreg_el1(val, SYS_SCTLR);
+ val = translate_cptr_el2_to_cpacr_el1(__vcpu_sys_reg(vcpu, CPTR_EL2));
+ write_sysreg_el1(val, SYS_CPACR);
+ val = translate_ttbr0_el2_to_ttbr0_el1(__vcpu_sys_reg(vcpu, TTBR0_EL2));
+ write_sysreg_el1(val, SYS_TTBR0);
+ val = translate_tcr_el2_to_tcr_el1(__vcpu_sys_reg(vcpu, TCR_EL2));
+ write_sysreg_el1(val, SYS_TCR);
+ }
+
+ if (ctxt_has_tcrx(&vcpu->arch.ctxt)) {
+ write_sysreg_el1(__vcpu_sys_reg(vcpu, TCR2_EL2), SYS_TCR2);
+
+ if (ctxt_has_s1pie(&vcpu->arch.ctxt)) {
+ write_sysreg_el1(__vcpu_sys_reg(vcpu, PIR_EL2), SYS_PIR);
+ write_sysreg_el1(__vcpu_sys_reg(vcpu, PIRE0_EL2), SYS_PIRE0);
+ }
+
+ if (ctxt_has_s1poe(&vcpu->arch.ctxt))
+ write_sysreg_el1(__vcpu_sys_reg(vcpu, POR_EL2), SYS_POR);
+ }
+
+ write_sysreg_el1(__vcpu_sys_reg(vcpu, ESR_EL2), SYS_ESR);
+ write_sysreg_el1(__vcpu_sys_reg(vcpu, AFSR0_EL2), SYS_AFSR0);
+ write_sysreg_el1(__vcpu_sys_reg(vcpu, AFSR1_EL2), SYS_AFSR1);
+ write_sysreg_el1(__vcpu_sys_reg(vcpu, FAR_EL2), SYS_FAR);
+ write_sysreg(__vcpu_sys_reg(vcpu, SP_EL2), sp_el1);
+ write_sysreg_el1(__vcpu_sys_reg(vcpu, ELR_EL2), SYS_ELR);
+ write_sysreg_el1(__vcpu_sys_reg(vcpu, SPSR_EL2), SYS_SPSR);
+}
+
/*
* VHE: Host and guest must save mdscr_el1 and sp_el0 (and the PC and
* pstate, which are handled as part of the el2 return state) on every
@@ -66,8 +192,9 @@ void __vcpu_load_switch_sysregs(struct kvm_vcpu *vcpu)
{
struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt;
struct kvm_cpu_context *host_ctxt;
+ u64 midr, mpidr;
- host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
+ host_ctxt = host_data_ptr(host_ctxt);
__sysreg_save_user_state(host_ctxt);
/*
@@ -89,7 +216,24 @@ void __vcpu_load_switch_sysregs(struct kvm_vcpu *vcpu)
*/
__sysreg32_restore_state(vcpu);
__sysreg_restore_user_state(guest_ctxt);
- __sysreg_restore_el1_state(guest_ctxt);
+
+ if (unlikely(is_hyp_ctxt(vcpu))) {
+ __sysreg_restore_vel2_state(vcpu);
+ } else {
+ if (vcpu_has_nv(vcpu)) {
+ /*
+ * As we're restoring a nested guest, set the value
+ * provided by the guest hypervisor.
+ */
+ midr = ctxt_sys_reg(guest_ctxt, VPIDR_EL2);
+ mpidr = ctxt_sys_reg(guest_ctxt, VMPIDR_EL2);
+ } else {
+ midr = ctxt_midr_el1(guest_ctxt);
+ mpidr = ctxt_sys_reg(guest_ctxt, MPIDR_EL1);
+ }
+
+ __sysreg_restore_el1_state(guest_ctxt, midr, mpidr);
+ }
vcpu_set_flag(vcpu, SYSREGS_ON_CPU);
}
@@ -110,9 +254,13 @@ void __vcpu_put_switch_sysregs(struct kvm_vcpu *vcpu)
struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt;
struct kvm_cpu_context *host_ctxt;
- host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
+ host_ctxt = host_data_ptr(host_ctxt);
+
+ if (unlikely(is_hyp_ctxt(vcpu)))
+ __sysreg_save_vel2_state(vcpu);
+ else
+ __sysreg_save_el1_state(guest_ctxt);
- __sysreg_save_el1_state(guest_ctxt);
__sysreg_save_user_state(guest_ctxt);
__sysreg32_save_state(vcpu);
diff --git a/arch/arm64/kvm/hyp/vhe/tlb.c b/arch/arm64/kvm/hyp/vhe/tlb.c
index b32e2940df7d..3d50a1bd2bdb 100644
--- a/arch/arm64/kvm/hyp/vhe/tlb.c
+++ b/arch/arm64/kvm/hyp/vhe/tlb.c
@@ -17,8 +17,8 @@ struct tlb_inv_context {
u64 sctlr;
};
-static void __tlb_switch_to_guest(struct kvm_s2_mmu *mmu,
- struct tlb_inv_context *cxt)
+static void enter_vmid_context(struct kvm_s2_mmu *mmu,
+ struct tlb_inv_context *cxt)
{
struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
u64 val;
@@ -67,7 +67,7 @@ static void __tlb_switch_to_guest(struct kvm_s2_mmu *mmu,
isb();
}
-static void __tlb_switch_to_host(struct tlb_inv_context *cxt)
+static void exit_vmid_context(struct tlb_inv_context *cxt)
{
/*
* We're done with the TLB operation, let's restore the host's
@@ -97,7 +97,7 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu,
dsb(ishst);
/* Switch to requested VMID */
- __tlb_switch_to_guest(mmu, &cxt);
+ enter_vmid_context(mmu, &cxt);
/*
* We could do so much better if we had the VA as well.
@@ -118,7 +118,7 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu,
dsb(ish);
isb();
- __tlb_switch_to_host(&cxt);
+ exit_vmid_context(&cxt);
}
void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu,
@@ -129,7 +129,7 @@ void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu,
dsb(nshst);
/* Switch to requested VMID */
- __tlb_switch_to_guest(mmu, &cxt);
+ enter_vmid_context(mmu, &cxt);
/*
* We could do so much better if we had the VA as well.
@@ -150,7 +150,7 @@ void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu,
dsb(nsh);
isb();
- __tlb_switch_to_host(&cxt);
+ exit_vmid_context(&cxt);
}
void __kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu,
@@ -169,16 +169,17 @@ void __kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu,
dsb(ishst);
/* Switch to requested VMID */
- __tlb_switch_to_guest(mmu, &cxt);
+ enter_vmid_context(mmu, &cxt);
- __flush_s2_tlb_range_op(ipas2e1is, start, pages, stride, 0);
+ __flush_s2_tlb_range_op(ipas2e1is, start, pages, stride,
+ TLBI_TTL_UNKNOWN);
dsb(ish);
__tlbi(vmalle1is);
dsb(ish);
isb();
- __tlb_switch_to_host(&cxt);
+ exit_vmid_context(&cxt);
}
void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu)
@@ -188,13 +189,13 @@ void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu)
dsb(ishst);
/* Switch to requested VMID */
- __tlb_switch_to_guest(mmu, &cxt);
+ enter_vmid_context(mmu, &cxt);
__tlbi(vmalls12e1is);
dsb(ish);
isb();
- __tlb_switch_to_host(&cxt);
+ exit_vmid_context(&cxt);
}
void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu)
@@ -202,14 +203,14 @@ void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu)
struct tlb_inv_context cxt;
/* Switch to requested VMID */
- __tlb_switch_to_guest(mmu, &cxt);
+ enter_vmid_context(mmu, &cxt);
__tlbi(vmalle1);
asm volatile("ic iallu");
dsb(nsh);
isb();
- __tlb_switch_to_host(&cxt);
+ exit_vmid_context(&cxt);
}
void __kvm_flush_vm_context(void)
@@ -218,3 +219,150 @@ void __kvm_flush_vm_context(void)
__tlbi(alle1is);
dsb(ish);
}
+
+/*
+ * TLB invalidation emulation for NV. For any given instruction, we
+ * perform the following transformations:
+ *
+ * - a TLBI targeting EL2 S1 is remapped to EL1 S1
+ * - a non-shareable TLBI is upgraded to being inner-shareable
+ * - an outer-shareable TLBI is also mapped to inner-shareable
+ * - an nXS TLBI is upgraded to XS
+ */
+int __kvm_tlbi_s1e2(struct kvm_s2_mmu *mmu, u64 va, u64 sys_encoding)
+{
+ struct tlb_inv_context cxt;
+ int ret = 0;
+
+ /*
+ * The guest will have provided its own DSB ISHST before trapping.
+ * If it hasn't, that's its own problem, and we won't paper over it
+ * (plus, there is plenty of extra synchronisation before we even
+ * get here...).
+ */
+
+ if (mmu)
+ enter_vmid_context(mmu, &cxt);
+
+ switch (sys_encoding) {
+ case OP_TLBI_ALLE2:
+ case OP_TLBI_ALLE2IS:
+ case OP_TLBI_ALLE2OS:
+ case OP_TLBI_VMALLE1:
+ case OP_TLBI_VMALLE1IS:
+ case OP_TLBI_VMALLE1OS:
+ case OP_TLBI_ALLE2NXS:
+ case OP_TLBI_ALLE2ISNXS:
+ case OP_TLBI_ALLE2OSNXS:
+ case OP_TLBI_VMALLE1NXS:
+ case OP_TLBI_VMALLE1ISNXS:
+ case OP_TLBI_VMALLE1OSNXS:
+ __tlbi(vmalle1is);
+ break;
+ case OP_TLBI_VAE2:
+ case OP_TLBI_VAE2IS:
+ case OP_TLBI_VAE2OS:
+ case OP_TLBI_VAE1:
+ case OP_TLBI_VAE1IS:
+ case OP_TLBI_VAE1OS:
+ case OP_TLBI_VAE2NXS:
+ case OP_TLBI_VAE2ISNXS:
+ case OP_TLBI_VAE2OSNXS:
+ case OP_TLBI_VAE1NXS:
+ case OP_TLBI_VAE1ISNXS:
+ case OP_TLBI_VAE1OSNXS:
+ __tlbi(vae1is, va);
+ break;
+ case OP_TLBI_VALE2:
+ case OP_TLBI_VALE2IS:
+ case OP_TLBI_VALE2OS:
+ case OP_TLBI_VALE1:
+ case OP_TLBI_VALE1IS:
+ case OP_TLBI_VALE1OS:
+ case OP_TLBI_VALE2NXS:
+ case OP_TLBI_VALE2ISNXS:
+ case OP_TLBI_VALE2OSNXS:
+ case OP_TLBI_VALE1NXS:
+ case OP_TLBI_VALE1ISNXS:
+ case OP_TLBI_VALE1OSNXS:
+ __tlbi(vale1is, va);
+ break;
+ case OP_TLBI_ASIDE1:
+ case OP_TLBI_ASIDE1IS:
+ case OP_TLBI_ASIDE1OS:
+ case OP_TLBI_ASIDE1NXS:
+ case OP_TLBI_ASIDE1ISNXS:
+ case OP_TLBI_ASIDE1OSNXS:
+ __tlbi(aside1is, va);
+ break;
+ case OP_TLBI_VAAE1:
+ case OP_TLBI_VAAE1IS:
+ case OP_TLBI_VAAE1OS:
+ case OP_TLBI_VAAE1NXS:
+ case OP_TLBI_VAAE1ISNXS:
+ case OP_TLBI_VAAE1OSNXS:
+ __tlbi(vaae1is, va);
+ break;
+ case OP_TLBI_VAALE1:
+ case OP_TLBI_VAALE1IS:
+ case OP_TLBI_VAALE1OS:
+ case OP_TLBI_VAALE1NXS:
+ case OP_TLBI_VAALE1ISNXS:
+ case OP_TLBI_VAALE1OSNXS:
+ __tlbi(vaale1is, va);
+ break;
+ case OP_TLBI_RVAE2:
+ case OP_TLBI_RVAE2IS:
+ case OP_TLBI_RVAE2OS:
+ case OP_TLBI_RVAE1:
+ case OP_TLBI_RVAE1IS:
+ case OP_TLBI_RVAE1OS:
+ case OP_TLBI_RVAE2NXS:
+ case OP_TLBI_RVAE2ISNXS:
+ case OP_TLBI_RVAE2OSNXS:
+ case OP_TLBI_RVAE1NXS:
+ case OP_TLBI_RVAE1ISNXS:
+ case OP_TLBI_RVAE1OSNXS:
+ __tlbi(rvae1is, va);
+ break;
+ case OP_TLBI_RVALE2:
+ case OP_TLBI_RVALE2IS:
+ case OP_TLBI_RVALE2OS:
+ case OP_TLBI_RVALE1:
+ case OP_TLBI_RVALE1IS:
+ case OP_TLBI_RVALE1OS:
+ case OP_TLBI_RVALE2NXS:
+ case OP_TLBI_RVALE2ISNXS:
+ case OP_TLBI_RVALE2OSNXS:
+ case OP_TLBI_RVALE1NXS:
+ case OP_TLBI_RVALE1ISNXS:
+ case OP_TLBI_RVALE1OSNXS:
+ __tlbi(rvale1is, va);
+ break;
+ case OP_TLBI_RVAAE1:
+ case OP_TLBI_RVAAE1IS:
+ case OP_TLBI_RVAAE1OS:
+ case OP_TLBI_RVAAE1NXS:
+ case OP_TLBI_RVAAE1ISNXS:
+ case OP_TLBI_RVAAE1OSNXS:
+ __tlbi(rvaae1is, va);
+ break;
+ case OP_TLBI_RVAALE1:
+ case OP_TLBI_RVAALE1IS:
+ case OP_TLBI_RVAALE1OS:
+ case OP_TLBI_RVAALE1NXS:
+ case OP_TLBI_RVAALE1ISNXS:
+ case OP_TLBI_RVAALE1OSNXS:
+ __tlbi(rvaale1is, va);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+ dsb(ish);
+ isb();
+
+ if (mmu)
+ exit_vmid_context(&cxt);
+
+ return ret;
+}
diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c
index 5763d979d8ca..569941eeb3fe 100644
--- a/arch/arm64/kvm/hypercalls.c
+++ b/arch/arm64/kvm/hypercalls.c
@@ -15,6 +15,8 @@
GENMASK(KVM_REG_ARM_STD_HYP_BMAP_BIT_COUNT - 1, 0)
#define KVM_ARM_SMCCC_VENDOR_HYP_FEATURES \
GENMASK(KVM_REG_ARM_VENDOR_HYP_BMAP_BIT_COUNT - 1, 0)
+#define KVM_ARM_SMCCC_VENDOR_HYP_FEATURES_2 \
+ GENMASK(KVM_REG_ARM_VENDOR_HYP_BMAP_2_BIT_COUNT - 1, 0)
static void kvm_ptp_get_time(struct kvm_vcpu *vcpu, u64 *val)
{
@@ -317,7 +319,7 @@ int kvm_smccc_call_handler(struct kvm_vcpu *vcpu)
* to the guest, and hide SSBS so that the
* guest stays protected.
*/
- if (cpus_have_final_cap(ARM64_SSBS))
+ if (kvm_has_feat(vcpu->kvm, ID_AA64PFR1_EL1, SSBS, IMP))
break;
fallthrough;
case SPECTRE_UNAFFECTED:
@@ -360,6 +362,8 @@ int kvm_smccc_call_handler(struct kvm_vcpu *vcpu)
break;
case ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID:
val[0] = smccc_feat->vendor_hyp_bmap;
+ /* Function numbers 2-63 are reserved for pKVM for now */
+ val[2] = smccc_feat->vendor_hyp_bmap_2;
break;
case ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID:
kvm_ptp_get_time(vcpu, val);
@@ -387,6 +391,7 @@ static const u64 kvm_arm_fw_reg_ids[] = {
KVM_REG_ARM_STD_BMAP,
KVM_REG_ARM_STD_HYP_BMAP,
KVM_REG_ARM_VENDOR_HYP_BMAP,
+ KVM_REG_ARM_VENDOR_HYP_BMAP_2,
};
void kvm_arm_init_hypercalls(struct kvm *kvm)
@@ -428,7 +433,7 @@ int kvm_arm_copy_fw_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices)
* Convert the workaround level into an easy-to-compare number, where higher
* values mean better protection.
*/
-static int get_kernel_wa_level(u64 regid)
+static int get_kernel_wa_level(struct kvm_vcpu *vcpu, u64 regid)
{
switch (regid) {
case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1:
@@ -449,7 +454,7 @@ static int get_kernel_wa_level(u64 regid)
* don't have any FW mitigation if SSBS is there at
* all times.
*/
- if (cpus_have_final_cap(ARM64_SSBS))
+ if (kvm_has_feat(vcpu->kvm, ID_AA64PFR1_EL1, SSBS, IMP))
return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL;
fallthrough;
case SPECTRE_UNAFFECTED:
@@ -486,7 +491,7 @@ int kvm_arm_get_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1:
case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2:
case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3:
- val = get_kernel_wa_level(reg->id) & KVM_REG_FEATURE_LEVEL_MASK;
+ val = get_kernel_wa_level(vcpu, reg->id) & KVM_REG_FEATURE_LEVEL_MASK;
break;
case KVM_REG_ARM_STD_BMAP:
val = READ_ONCE(smccc_feat->std_bmap);
@@ -497,6 +502,9 @@ int kvm_arm_get_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
case KVM_REG_ARM_VENDOR_HYP_BMAP:
val = READ_ONCE(smccc_feat->vendor_hyp_bmap);
break;
+ case KVM_REG_ARM_VENDOR_HYP_BMAP_2:
+ val = READ_ONCE(smccc_feat->vendor_hyp_bmap_2);
+ break;
default:
return -ENOENT;
}
@@ -527,6 +535,10 @@ static int kvm_arm_set_fw_reg_bmap(struct kvm_vcpu *vcpu, u64 reg_id, u64 val)
fw_reg_bmap = &smccc_feat->vendor_hyp_bmap;
fw_reg_features = KVM_ARM_SMCCC_VENDOR_HYP_FEATURES;
break;
+ case KVM_REG_ARM_VENDOR_HYP_BMAP_2:
+ fw_reg_bmap = &smccc_feat->vendor_hyp_bmap_2;
+ fw_reg_features = KVM_ARM_SMCCC_VENDOR_HYP_FEATURES_2;
+ break;
default:
return -ENOENT;
}
@@ -575,6 +587,8 @@ int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
case KVM_ARM_PSCI_0_2:
case KVM_ARM_PSCI_1_0:
case KVM_ARM_PSCI_1_1:
+ case KVM_ARM_PSCI_1_2:
+ case KVM_ARM_PSCI_1_3:
if (!wants_02)
return -EINVAL;
vcpu->kvm->arch.psci_version = val;
@@ -588,7 +602,7 @@ int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
if (val & ~KVM_REG_FEATURE_LEVEL_MASK)
return -EINVAL;
- if (get_kernel_wa_level(reg->id) < val)
+ if (get_kernel_wa_level(vcpu, reg->id) < val)
return -EINVAL;
return 0;
@@ -624,13 +638,14 @@ int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
* We can deal with NOT_AVAIL on NOT_REQUIRED, but not the
* other way around.
*/
- if (get_kernel_wa_level(reg->id) < wa_level)
+ if (get_kernel_wa_level(vcpu, reg->id) < wa_level)
return -EINVAL;
return 0;
case KVM_REG_ARM_STD_BMAP:
case KVM_REG_ARM_STD_HYP_BMAP:
case KVM_REG_ARM_VENDOR_HYP_BMAP:
+ case KVM_REG_ARM_VENDOR_HYP_BMAP_2:
return kvm_arm_set_fw_reg_bmap(vcpu, reg->id, val);
default:
return -ENOENT;
diff --git a/arch/arm64/kvm/mmio.c b/arch/arm64/kvm/mmio.c
index 200c8019a82a..ab365e839874 100644
--- a/arch/arm64/kvm/mmio.c
+++ b/arch/arm64/kvm/mmio.c
@@ -72,6 +72,31 @@ unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len)
return data;
}
+static bool kvm_pending_sync_exception(struct kvm_vcpu *vcpu)
+{
+ if (!vcpu_get_flag(vcpu, PENDING_EXCEPTION))
+ return false;
+
+ if (vcpu_el1_is_32bit(vcpu)) {
+ switch (vcpu_get_flag(vcpu, EXCEPT_MASK)) {
+ case unpack_vcpu_flag(EXCEPT_AA32_UND):
+ case unpack_vcpu_flag(EXCEPT_AA32_IABT):
+ case unpack_vcpu_flag(EXCEPT_AA32_DABT):
+ return true;
+ default:
+ return false;
+ }
+ } else {
+ switch (vcpu_get_flag(vcpu, EXCEPT_MASK)) {
+ case unpack_vcpu_flag(EXCEPT_AA64_EL1_SYNC):
+ case unpack_vcpu_flag(EXCEPT_AA64_EL2_SYNC):
+ return true;
+ default:
+ return false;
+ }
+ }
+}
+
/**
* kvm_handle_mmio_return -- Handle MMIO loads after user space emulation
* or in-kernel IO emulation
@@ -84,9 +109,12 @@ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu)
unsigned int len;
int mask;
- /* Detect an already handled MMIO return */
- if (unlikely(!vcpu->mmio_needed))
- return 0;
+ /*
+ * Detect if the MMIO return was already handled or if userspace aborted
+ * the MMIO access.
+ */
+ if (unlikely(!vcpu->mmio_needed || kvm_pending_sync_exception(vcpu)))
+ return 1;
vcpu->mmio_needed = 0;
@@ -117,7 +145,7 @@ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu)
*/
kvm_incr_pc(vcpu);
- return 0;
+ return 1;
}
int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
@@ -133,11 +161,19 @@ int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
/*
* No valid syndrome? Ask userspace for help if it has
* volunteered to do so, and bail out otherwise.
+ *
+ * In the protected VM case, there isn't much userspace can do
+ * though, so directly deliver an exception to the guest.
*/
if (!kvm_vcpu_dabt_isvalid(vcpu)) {
trace_kvm_mmio_nisv(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu),
kvm_vcpu_get_hfar(vcpu), fault_ipa);
+ if (vcpu_is_protected(vcpu)) {
+ kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
+ return 1;
+ }
+
if (test_bit(KVM_ARCH_FLAG_RETURN_NISV_IO_ABORT_TO_USER,
&vcpu->kvm->arch.flags)) {
run->exit_reason = KVM_EXIT_ARM_NISV;
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 18680771cdb0..2feb6c6b63af 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -15,6 +15,7 @@
#include <asm/kvm_arm.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_pgtable.h>
+#include <asm/kvm_pkvm.h>
#include <asm/kvm_ras.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
@@ -29,8 +30,12 @@ static unsigned long __ro_after_init hyp_idmap_start;
static unsigned long __ro_after_init hyp_idmap_end;
static phys_addr_t __ro_after_init hyp_idmap_vector;
+u32 __ro_after_init __hyp_va_bits;
+
static unsigned long __ro_after_init io_map_base;
+#define KVM_PGT_FN(fn) (!is_protected_kvm_enabled() ? fn : p ## fn)
+
static phys_addr_t __stage2_range_addr_end(phys_addr_t addr, phys_addr_t end,
phys_addr_t size)
{
@@ -147,7 +152,7 @@ static int kvm_mmu_split_huge_pages(struct kvm *kvm, phys_addr_t addr,
return -EINVAL;
next = __stage2_range_addr_end(addr, end, chunk_size);
- ret = kvm_pgtable_stage2_split(pgt, addr, next - addr, cache);
+ ret = KVM_PGT_FN(kvm_pgtable_stage2_split)(pgt, addr, next - addr, cache);
if (ret)
break;
} while (addr = next, addr != end);
@@ -168,15 +173,23 @@ static bool memslot_is_logging(struct kvm_memory_slot *memslot)
*/
int kvm_arch_flush_remote_tlbs(struct kvm *kvm)
{
- kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu);
+ if (is_protected_kvm_enabled())
+ kvm_call_hyp_nvhe(__pkvm_tlb_flush_vmid, kvm->arch.pkvm.handle);
+ else
+ kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu);
return 0;
}
int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm,
gfn_t gfn, u64 nr_pages)
{
- kvm_tlb_flush_vmid_range(&kvm->arch.mmu,
- gfn << PAGE_SHIFT, nr_pages << PAGE_SHIFT);
+ u64 size = nr_pages << PAGE_SHIFT;
+ u64 addr = gfn << PAGE_SHIFT;
+
+ if (is_protected_kvm_enabled())
+ kvm_call_hyp_nvhe(__pkvm_tlb_flush_vmid, kvm->arch.pkvm.handle);
+ else
+ kvm_tlb_flush_vmid_range(&kvm->arch.mmu, addr, size);
return 0;
}
@@ -225,7 +238,7 @@ static void stage2_free_unlinked_table_rcu_cb(struct rcu_head *head)
void *pgtable = page_to_virt(page);
s8 level = page_private(page);
- kvm_pgtable_stage2_free_unlinked(&kvm_s2_mm_ops, pgtable, level);
+ KVM_PGT_FN(kvm_pgtable_stage2_free_unlinked)(&kvm_s2_mm_ops, pgtable, level);
}
static void stage2_free_unlinked_table(void *addr, s8 level)
@@ -324,13 +337,19 @@ static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64
lockdep_assert_held_write(&kvm->mmu_lock);
WARN_ON(size & ~PAGE_MASK);
- WARN_ON(stage2_apply_range(mmu, start, end, kvm_pgtable_stage2_unmap,
+ WARN_ON(stage2_apply_range(mmu, start, end, KVM_PGT_FN(kvm_pgtable_stage2_unmap),
may_block));
}
-static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size)
+void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start,
+ u64 size, bool may_block)
+{
+ __unmap_stage2_range(mmu, start, size, may_block);
+}
+
+void kvm_stage2_flush_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
{
- __unmap_stage2_range(mmu, start, size, true);
+ stage2_apply_range_resched(mmu, addr, end, KVM_PGT_FN(kvm_pgtable_stage2_flush));
}
static void stage2_flush_memslot(struct kvm *kvm,
@@ -339,7 +358,7 @@ static void stage2_flush_memslot(struct kvm *kvm,
phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
- stage2_apply_range_resched(&kvm->arch.mmu, addr, end, kvm_pgtable_stage2_flush);
+ kvm_stage2_flush_range(&kvm->arch.mmu, addr, end);
}
/**
@@ -362,6 +381,8 @@ static void stage2_flush_vm(struct kvm *kvm)
kvm_for_each_memslot(memslot, bkt, slots)
stage2_flush_memslot(kvm, memslot);
+ kvm_nested_s2_flush(kvm);
+
write_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, idx);
}
@@ -696,10 +717,10 @@ int create_hyp_stack(phys_addr_t phys_addr, unsigned long *haddr)
mutex_lock(&kvm_hyp_pgd_mutex);
/*
- * Efficient stack verification using the PAGE_SHIFT bit implies
+ * Efficient stack verification using the NVHE_STACK_SHIFT bit implies
* an alignment of our allocation on the order of the size.
*/
- size = PAGE_SIZE * 2;
+ size = NVHE_STACK_SIZE * 2;
base = ALIGN_DOWN(io_map_base - size, size);
ret = __hyp_alloc_private_va_range(base);
@@ -716,12 +737,12 @@ int create_hyp_stack(phys_addr_t phys_addr, unsigned long *haddr)
* at the higher address and leave the lower guard page
* unbacked.
*
- * Any valid stack address now has the PAGE_SHIFT bit as 1
+ * Any valid stack address now has the NVHE_STACK_SHIFT bit as 1
* and addresses corresponding to the guard page have the
- * PAGE_SHIFT bit as 0 - this is used for overflow detection.
+ * NVHE_STACK_SHIFT bit as 0 - this is used for overflow detection.
*/
- ret = __create_hyp_mappings(base + PAGE_SIZE, PAGE_SIZE, phys_addr,
- PAGE_HYP);
+ ret = __create_hyp_mappings(base + NVHE_STACK_SIZE, NVHE_STACK_SIZE,
+ phys_addr, PAGE_HYP);
if (ret)
kvm_err("Cannot map hyp stack\n");
@@ -855,21 +876,9 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
.icache_inval_pou = invalidate_icache_guest_page,
};
-/**
- * kvm_init_stage2_mmu - Initialise a S2 MMU structure
- * @kvm: The pointer to the KVM structure
- * @mmu: The pointer to the s2 MMU structure
- * @type: The machine type of the virtual machine
- *
- * Allocates only the stage-2 HW PGD level table(s).
- * Note we don't need locking here as this is only called when the VM is
- * created, which can only be done once.
- */
-int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type)
+static int kvm_init_ipa_range(struct kvm_s2_mmu *mmu, unsigned long type)
{
u32 kvm_ipa_limit = get_kvm_ipa_limit();
- int cpu, err;
- struct kvm_pgtable *pgt;
u64 mmfr0, mmfr1;
u32 phys_shift;
@@ -896,20 +905,64 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t
mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
mmu->vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift);
+ return 0;
+}
+
+/**
+ * kvm_init_stage2_mmu - Initialise a S2 MMU structure
+ * @kvm: The pointer to the KVM structure
+ * @mmu: The pointer to the s2 MMU structure
+ * @type: The machine type of the virtual machine
+ *
+ * Allocates only the stage-2 HW PGD level table(s).
+ * Note we don't need locking here as this is only called in two cases:
+ *
+ * - when the VM is created, which can't race against anything
+ *
+ * - when secondary kvm_s2_mmu structures are initialised for NV
+ * guests, and the caller must hold kvm->lock as this is called on a
+ * per-vcpu basis.
+ */
+int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type)
+{
+ int cpu, err;
+ struct kvm_pgtable *pgt;
+
+ /*
+ * If we already have our page tables in place, and the MMU
+ * context is the canonical one, we have a bug somewhere,
+ * as this is only supposed to ever happen once per VM.
+ *
+ * Otherwise, we're building nested page tables, and that's
+ * probably because userspace called KVM_ARM_VCPU_INIT more
+ * than once on the same vcpu. Since that's actually legal,
+ * don't kick up a fuss and leave gracefully.
+ */
if (mmu->pgt != NULL) {
+ if (kvm_is_nested_s2_mmu(kvm, mmu))
+ return 0;
+
kvm_err("kvm_arch already initialized?\n");
return -EINVAL;
}
+ err = kvm_init_ipa_range(mmu, type);
+ if (err)
+ return err;
+
pgt = kzalloc(sizeof(*pgt), GFP_KERNEL_ACCOUNT);
if (!pgt)
return -ENOMEM;
mmu->arch = &kvm->arch;
- err = kvm_pgtable_stage2_init(pgt, mmu, &kvm_s2_mm_ops);
+ err = KVM_PGT_FN(kvm_pgtable_stage2_init)(pgt, mmu, &kvm_s2_mm_ops);
if (err)
goto out_free_pgtable;
+ mmu->pgt = pgt;
+ if (is_protected_kvm_enabled())
+ return 0;
+
mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran));
if (!mmu->last_vcpu_ran) {
err = -ENOMEM;
@@ -923,12 +976,15 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t
mmu->split_page_chunk_size = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT;
mmu->split_page_cache.gfp_zero = __GFP_ZERO;
- mmu->pgt = pgt;
mmu->pgd_phys = __pa(pgt->pgd);
+
+ if (kvm_is_nested_s2_mmu(kvm, mmu))
+ kvm_init_nested_s2_mmu(mmu);
+
return 0;
out_destroy_pgtable:
- kvm_pgtable_stage2_destroy(pgt);
+ KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt);
out_free_pgtable:
kfree(pgt);
return err;
@@ -976,7 +1032,7 @@ static void stage2_unmap_memslot(struct kvm *kvm,
if (!(vma->vm_flags & VM_PFNMAP)) {
gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
- unmap_stage2_range(&kvm->arch.mmu, gpa, vm_end - vm_start);
+ kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, vm_end - vm_start, true);
}
hva = vm_end;
} while (hva < reg_end);
@@ -1003,6 +1059,8 @@ void stage2_unmap_vm(struct kvm *kvm)
kvm_for_each_memslot(memslot, bkt, slots)
stage2_unmap_memslot(kvm, memslot);
+ kvm_nested_s2_unmap(kvm, true);
+
write_unlock(&kvm->mmu_lock);
mmap_read_unlock(current->mm);
srcu_read_unlock(&kvm->srcu, idx);
@@ -1023,26 +1081,40 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
write_unlock(&kvm->mmu_lock);
if (pgt) {
- kvm_pgtable_stage2_destroy(pgt);
+ KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt);
kfree(pgt);
}
}
-static void hyp_mc_free_fn(void *addr, void *unused)
+static void hyp_mc_free_fn(void *addr, void *mc)
{
+ struct kvm_hyp_memcache *memcache = mc;
+
+ if (memcache->flags & HYP_MEMCACHE_ACCOUNT_STAGE2)
+ kvm_account_pgtable_pages(addr, -1);
+
free_page((unsigned long)addr);
}
-static void *hyp_mc_alloc_fn(void *unused)
+static void *hyp_mc_alloc_fn(void *mc)
{
- return (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
+ struct kvm_hyp_memcache *memcache = mc;
+ void *addr;
+
+ addr = (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
+ if (addr && memcache->flags & HYP_MEMCACHE_ACCOUNT_STAGE2)
+ kvm_account_pgtable_pages(addr, 1);
+
+ return addr;
}
void free_hyp_memcache(struct kvm_hyp_memcache *mc)
{
- if (is_protected_kvm_enabled())
- __free_hyp_memcache(mc, hyp_mc_free_fn,
- kvm_host_va, NULL);
+ if (!is_protected_kvm_enabled())
+ return;
+
+ kfree(mc->mapping);
+ __free_hyp_memcache(mc, hyp_mc_free_fn, kvm_host_va, mc);
}
int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages)
@@ -1050,8 +1122,14 @@ int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages)
if (!is_protected_kvm_enabled())
return 0;
+ if (!mc->mapping) {
+ mc->mapping = kzalloc(sizeof(struct pkvm_mapping), GFP_KERNEL_ACCOUNT);
+ if (!mc->mapping)
+ return -ENOMEM;
+ }
+
return __topup_hyp_memcache(mc, min_pages, hyp_mc_alloc_fn,
- kvm_host_pa, NULL);
+ kvm_host_pa, mc);
}
/**
@@ -1088,8 +1166,8 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
break;
write_lock(&kvm->mmu_lock);
- ret = kvm_pgtable_stage2_map(pgt, addr, PAGE_SIZE, pa, prot,
- &cache, 0);
+ ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, addr, PAGE_SIZE,
+ pa, prot, &cache, 0);
write_unlock(&kvm->mmu_lock);
if (ret)
break;
@@ -1102,14 +1180,14 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
}
/**
- * stage2_wp_range() - write protect stage2 memory region range
+ * kvm_stage2_wp_range() - write protect stage2 memory region range
* @mmu: The KVM stage-2 MMU pointer
* @addr: Start address of range
* @end: End address of range
*/
-static void stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
+void kvm_stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
{
- stage2_apply_range_resched(mmu, addr, end, kvm_pgtable_stage2_wrprotect);
+ stage2_apply_range_resched(mmu, addr, end, KVM_PGT_FN(kvm_pgtable_stage2_wrprotect));
}
/**
@@ -1138,7 +1216,8 @@ static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
write_lock(&kvm->mmu_lock);
- stage2_wp_range(&kvm->arch.mmu, start, end);
+ kvm_stage2_wp_range(&kvm->arch.mmu, start, end);
+ kvm_nested_s2_wp(kvm);
write_unlock(&kvm->mmu_lock);
kvm_flush_remote_tlbs_memslot(kvm, memslot);
}
@@ -1192,7 +1271,7 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
lockdep_assert_held_write(&kvm->mmu_lock);
- stage2_wp_range(&kvm->arch.mmu, start, end);
+ kvm_stage2_wp_range(&kvm->arch.mmu, start, end);
/*
* Eager-splitting is done when manual-protect is set. We
@@ -1204,6 +1283,8 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
*/
if (kvm_dirty_log_manual_protect_and_init_set(kvm))
kvm_mmu_split_huge_pages(kvm, start, end);
+
+ kvm_nested_s2_wp(kvm);
}
static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
@@ -1357,10 +1438,21 @@ static void sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn,
{
unsigned long i, nr_pages = size >> PAGE_SHIFT;
struct page *page = pfn_to_page(pfn);
+ struct folio *folio = page_folio(page);
if (!kvm_has_mte(kvm))
return;
+ if (folio_test_hugetlb(folio)) {
+ /* Hugetlb has MTE flags set on head page only */
+ if (folio_try_hugetlb_mte_tagging(folio)) {
+ for (i = 0; i < nr_pages; i++, page++)
+ mte_clear_page_tags(page_address(page));
+ folio_set_hugetlb_mte_tagged(folio);
+ }
+ return;
+ }
+
for (i = 0; i < nr_pages; i++, page++) {
if (try_page_mte_tagging(page)) {
mte_clear_page_tags(page_address(page));
@@ -1375,6 +1467,7 @@ static bool kvm_vma_mte_allowed(struct vm_area_struct *vma)
}
static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
+ struct kvm_s2_trans *nested,
struct kvm_memory_slot *memslot, unsigned long hva,
bool fault_is_perm)
{
@@ -1383,16 +1476,19 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
bool exec_fault, mte_allowed;
bool device = false, vfio_allow_any_uc = false;
unsigned long mmu_seq;
+ phys_addr_t ipa = fault_ipa;
struct kvm *kvm = vcpu->kvm;
- struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
struct vm_area_struct *vma;
short vma_shift;
+ void *memcache;
gfn_t gfn;
kvm_pfn_t pfn;
bool logging_active = memslot_is_logging(memslot);
long vma_pagesize, fault_granule;
enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
struct kvm_pgtable *pgt;
+ struct page *page;
+ enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED;
if (fault_is_perm)
fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu);
@@ -1412,8 +1508,15 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
* and a write fault needs to collapse a block entry into a table.
*/
if (!fault_is_perm || (logging_active && write_fault)) {
- ret = kvm_mmu_topup_memory_cache(memcache,
- kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu));
+ int min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu);
+
+ if (!is_protected_kvm_enabled()) {
+ memcache = &vcpu->arch.mmu_page_cache;
+ ret = kvm_mmu_topup_memory_cache(memcache, min_pages);
+ } else {
+ memcache = &vcpu->arch.pkvm_memcache;
+ ret = topup_hyp_memcache(memcache, min_pages);
+ }
if (ret)
return ret;
}
@@ -1434,7 +1537,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
* logging_active is guaranteed to never be true for VM_PFNMAP
* memslots.
*/
- if (logging_active) {
+ if (logging_active || is_protected_kvm_enabled()) {
force_pte = true;
vma_shift = PAGE_SHIFT;
} else {
@@ -1466,10 +1569,45 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
}
vma_pagesize = 1UL << vma_shift;
- if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE)
+
+ if (nested) {
+ unsigned long max_map_size;
+
+ max_map_size = force_pte ? PAGE_SIZE : PUD_SIZE;
+
+ ipa = kvm_s2_trans_output(nested);
+
+ /*
+ * If we're about to create a shadow stage 2 entry, then we
+ * can only create a block mapping if the guest stage 2 page
+ * table uses at least as big a mapping.
+ */
+ max_map_size = min(kvm_s2_trans_size(nested), max_map_size);
+
+ /*
+ * If the mapping size falls between two host sizes,
+ * take the smaller of the two.
+ */
+ if (max_map_size >= PMD_SIZE && max_map_size < PUD_SIZE)
+ max_map_size = PMD_SIZE;
+ else if (max_map_size >= PAGE_SIZE && max_map_size < PMD_SIZE)
+ max_map_size = PAGE_SIZE;
+
+ force_pte = (max_map_size == PAGE_SIZE);
+ vma_pagesize = min(vma_pagesize, (long)max_map_size);
+ }
+
+ /*
+ * Both the canonical IPA and fault IPA must be hugepage-aligned to
+ * ensure we find the right PFN and lay down the mapping in the right
+ * place.
+ */
+ if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) {
fault_ipa &= ~(vma_pagesize - 1);
+ ipa &= ~(vma_pagesize - 1);
+ }
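
A minimal sketch of the size-clamping rule described in the comments above: cap the host mapping size by the guest's own stage-2 mapping size, then round down to a supported host block size (the 4K-granule PAGE/PMD/PUD sizes are assumed here for illustration only):

#include <stdint.h>
#include <stdio.h>

#define SZ_PAGE	(UINT64_C(1) << 12)
#define SZ_PMD	(UINT64_C(1) << 21)
#define SZ_PUD	(UINT64_C(1) << 30)

static uint64_t clamp_map_size(uint64_t host_size, uint64_t guest_s2_size,
			       int force_pte)
{
	uint64_t max = force_pte ? SZ_PAGE : SZ_PUD;

	if (guest_s2_size < max)
		max = guest_s2_size;

	/* If the result falls between two host sizes, take the smaller. */
	if (max >= SZ_PMD && max < SZ_PUD)
		max = SZ_PMD;
	else if (max >= SZ_PAGE && max < SZ_PMD)
		max = SZ_PAGE;

	return host_size < max ? host_size : max;
}

int main(void)
{
	/* 2M host mapping, guest S2 maps the range with a 32M block -> 2M */
	printf("%#llx\n",
	       (unsigned long long)clamp_map_size(SZ_PMD, 16 * SZ_PMD, 0));
	return 0;
}
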
- gfn = fault_ipa >> PAGE_SHIFT;
+ gfn = ipa >> PAGE_SHIFT;
mte_allowed = kvm_vma_mte_allowed(vma);
vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
@@ -1479,7 +1617,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
/*
* Read mmu_invalidate_seq so that KVM can detect if the results of
- * vma_lookup() or __gfn_to_pfn_memslot() become stale prior to
+ * vma_lookup() or __kvm_faultin_pfn() become stale prior to
* acquiring kvm->mmu_lock.
*
* Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
@@ -1488,8 +1626,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
mmu_seq = vcpu->kvm->mmu_invalidate_seq;
mmap_read_unlock(current->mm);
- pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL,
- write_fault, &writable, NULL);
+ pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0,
+ &writable, &page);
if (pfn == KVM_PFN_ERR_HWPOISON) {
kvm_send_hwpoison_signal(hva, vma_shift);
return 0;
@@ -1502,7 +1640,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
* If the page was identified as device early by looking at
* the VMA flags, vma_pagesize is already representing the
* largest quantity we can map. If instead it was mapped
- * via gfn_to_pfn_prot(), vma_pagesize is set to PAGE_SIZE
+ * via __kvm_faultin_pfn(), vma_pagesize is set to PAGE_SIZE
* and must not be upgraded.
*
* In both cases, we don't let transparent_hugepage_adjust()
@@ -1520,10 +1658,31 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
if (exec_fault && device)
return -ENOEXEC;
- read_lock(&kvm->mmu_lock);
+ /*
+ * Potentially reduce shadow S2 permissions to match the guest's own
+ * S2. For exec faults, we'd only reach this point if the guest
+ * actually allowed it (see kvm_s2_handle_perm_fault).
+ *
+ * Also encode the level of the original translation in the SW bits
+ * of the leaf entry as a proxy for the span of that translation.
+ * This will be retrieved on TLB invalidation from the guest and
+ * used to limit the invalidation scope if a TTL hint or a range
+ * isn't provided.
+ */
+ if (nested) {
+ writable &= kvm_s2_trans_writable(nested);
+ if (!kvm_s2_trans_readable(nested))
+ prot &= ~KVM_PGTABLE_PROT_R;
+
+ prot |= kvm_encode_nested_level(nested);
+ }
+
+ kvm_fault_lock(kvm);
pgt = vcpu->arch.hw_mmu->pgt;
- if (mmu_invalidate_retry(kvm, mmu_seq))
+ if (mmu_invalidate_retry(kvm, mmu_seq)) {
+ ret = -EAGAIN;
goto out_unlock;
+ }
/*
* If we are not forced to use page mapping, check if we are
@@ -1564,7 +1723,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
prot |= KVM_PGTABLE_PROT_NORMAL_NC;
else
prot |= KVM_PGTABLE_PROT_DEVICE;
- } else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC)) {
+ } else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC) &&
+ (!nested || kvm_s2_trans_executable(nested))) {
prot |= KVM_PGTABLE_PROT_X;
}
@@ -1573,42 +1733,42 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
* permissions only if vma_pagesize equals fault_granule. Otherwise,
* kvm_pgtable_stage2_map() should be called to change block size.
*/
- if (fault_is_perm && vma_pagesize == fault_granule)
- ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot);
- else
- ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize,
+ if (fault_is_perm && vma_pagesize == fault_granule) {
+ /*
+ * Drop the SW bits in favour of those stored in the
+ * PTE, which will be preserved.
+ */
+ prot &= ~KVM_NV_GUEST_MAP_SZ;
+ ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, fault_ipa, prot, flags);
+ } else {
+ ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, vma_pagesize,
__pfn_to_phys(pfn), prot,
- memcache,
- KVM_PGTABLE_WALK_HANDLE_FAULT |
- KVM_PGTABLE_WALK_SHARED);
+ memcache, flags);
+ }
+
+out_unlock:
+ kvm_release_faultin_page(kvm, page, !!ret, writable);
+ kvm_fault_unlock(kvm);
/* Mark the page dirty only if the fault is handled successfully */
- if (writable && !ret) {
- kvm_set_pfn_dirty(pfn);
+ if (writable && !ret)
mark_page_dirty_in_slot(kvm, memslot, gfn);
- }
-out_unlock:
- read_unlock(&kvm->mmu_lock);
- kvm_release_pfn_clean(pfn);
return ret != -EAGAIN ? ret : 0;
}
/* Resolve the access fault by making the page young again. */
static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
{
- kvm_pte_t pte;
+ enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED;
struct kvm_s2_mmu *mmu;
trace_kvm_access_fault(fault_ipa);
read_lock(&vcpu->kvm->mmu_lock);
mmu = vcpu->arch.hw_mmu;
- pte = kvm_pgtable_stage2_mkyoung(mmu->pgt, fault_ipa);
+ KVM_PGT_FN(kvm_pgtable_stage2_mkyoung)(mmu->pgt, fault_ipa, flags);
read_unlock(&vcpu->kvm->mmu_lock);
-
- if (kvm_pte_valid(pte))
- kvm_set_pfn_accessed(kvm_pte_to_pfn(pte));
}
/**
@@ -1624,8 +1784,10 @@ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
*/
int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
{
+ struct kvm_s2_trans nested_trans, *nested = NULL;
unsigned long esr;
- phys_addr_t fault_ipa;
+ phys_addr_t fault_ipa; /* The address we faulted on */
+ phys_addr_t ipa; /* Always the IPA in the L1 guest phys space */
struct kvm_memory_slot *memslot;
unsigned long hva;
bool is_iabt, write_fault, writable;
@@ -1634,10 +1796,10 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
esr = kvm_vcpu_get_esr(vcpu);
- fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
+ ipa = fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
- if (esr_fsc_is_permission_fault(esr)) {
+ if (esr_fsc_is_translation_fault(esr)) {
/* Beyond sanitised PARange (which is the IPA limit) */
if (fault_ipa >= BIT_ULL(get_kvm_ipa_limit())) {
kvm_inject_size_fault(vcpu);
@@ -1645,7 +1807,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
}
/* Falls between the IPA range and the PARange? */
- if (fault_ipa >= BIT_ULL(vcpu->arch.hw_mmu->pgt->ia_bits)) {
+ if (fault_ipa >= BIT_ULL(VTCR_EL2_IPA(vcpu->arch.hw_mmu->vtcr))) {
fault_ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);
if (is_iabt)
@@ -1684,7 +1846,42 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
idx = srcu_read_lock(&vcpu->kvm->srcu);
- gfn = fault_ipa >> PAGE_SHIFT;
+ /*
+ * We may have faulted on a shadow stage 2 page table if we are
+ * running a nested guest. In this case, we have to resolve the L2
+ * IPA to the L1 IPA first, before knowing what kind of memory should
+ * back the L1 IPA.
+ *
+ * If the shadow stage 2 page table walk faults, then we simply inject
+ * this to the guest and carry on.
+ *
+ * If there are no shadow S2 PTs because S2 is disabled, there is
+ * nothing to walk and we treat it as a 1:1 before going through the
+ * canonical translation.
+ */
+ if (kvm_is_nested_s2_mmu(vcpu->kvm, vcpu->arch.hw_mmu) &&
+ vcpu->arch.hw_mmu->nested_stage2_enabled) {
+ u32 esr;
+
+ ret = kvm_walk_nested_s2(vcpu, fault_ipa, &nested_trans);
+ if (ret) {
+ esr = kvm_s2_trans_esr(&nested_trans);
+ kvm_inject_s2_fault(vcpu, esr);
+ goto out_unlock;
+ }
+
+ ret = kvm_s2_handle_perm_fault(vcpu, &nested_trans);
+ if (ret) {
+ esr = kvm_s2_trans_esr(&nested_trans);
+ kvm_inject_s2_fault(vcpu, esr);
+ goto out_unlock;
+ }
+
+ ipa = kvm_s2_trans_output(&nested_trans);
+ nested = &nested_trans;
+ }
+
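
Conceptually, the nested path above is a two-step resolution: walk the guest's stage-2 to turn the faulting L2 IPA into an L1 IPA, forwarding the fault back to the guest if that walk fails, and only then back the L1 IPA with host memory. A deliberately toy-level sketch of that control flow (the identity map with a hole at 1G is invented purely to exercise both branches):

#include <stdint.h>
#include <stdio.h>

typedef struct { int fault; uint64_t out; } s2_trans;

/* Toy guest-S2 walker: identity map with a hole at 1G, for illustration. */
static s2_trans walk_guest_s2(uint64_t l2_ipa)
{
	s2_trans t = { .fault = (l2_ipa >> 30) == 1, .out = l2_ipa };
	return t;
}

static int resolve_fault_ipa(uint64_t fault_ipa, uint64_t *l1_ipa)
{
	s2_trans t = walk_guest_s2(fault_ipa);

	if (t.fault)
		return -1;	/* inject the S2 fault back into the guest */

	*l1_ipa = t.out;	/* back this L1 IPA with host memory */
	return 0;
}

int main(void)
{
	uint64_t ipa;

	printf("%d\n", resolve_fault_ipa(0x80000000, &ipa));	/*  0: resolved */
	printf("%d\n", resolve_fault_ipa(0x40001000, &ipa));	/* -1: forward the fault */
	return 0;
}
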
+ gfn = ipa >> PAGE_SHIFT;
memslot = gfn_to_memslot(vcpu->kvm, gfn);
hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
write_fault = kvm_is_write_fault(vcpu);
@@ -1728,13 +1925,13 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
* faulting VA. This is always 12 bits, irrespective
* of the page size.
*/
- fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
- ret = io_mem_abort(vcpu, fault_ipa);
+ ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);
+ ret = io_mem_abort(vcpu, ipa);
goto out_unlock;
}
/* Userspace should not be able to register out-of-bounds IPAs */
- VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->arch.hw_mmu));
+ VM_BUG_ON(ipa >= kvm_phys_size(vcpu->arch.hw_mmu));
if (esr_fsc_is_access_flag_fault(esr)) {
handle_access_fault(vcpu, fault_ipa);
@@ -1742,7 +1939,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
goto out_unlock;
}
- ret = user_mem_abort(vcpu, fault_ipa, memslot, hva,
+ ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva,
esr_fsc_is_permission_fault(esr));
if (ret == 0)
ret = 1;
@@ -1765,40 +1962,7 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
(range->end - range->start) << PAGE_SHIFT,
range->may_block);
- return false;
-}
-
-bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
-{
- kvm_pfn_t pfn = pte_pfn(range->arg.pte);
-
- if (!kvm->arch.mmu.pgt)
- return false;
-
- WARN_ON(range->end - range->start != 1);
-
- /*
- * If the page isn't tagged, defer to user_mem_abort() for sanitising
- * the MTE tags. The S2 pte should have been unmapped by
- * mmu_notifier_invalidate_range_end().
- */
- if (kvm_has_mte(kvm) && !page_mte_tagged(pfn_to_page(pfn)))
- return false;
-
- /*
- * We've moved a page around, probably through CoW, so let's treat
- * it just like a translation fault and the map handler will clean
- * the cache to the PoC.
- *
- * The MMU notifiers will have unmapped a huge PMD before calling
- * ->change_pte() (which in turn calls kvm_set_spte_gfn()) and
- * therefore we never need to clear out a huge PMD through this
- * calling path and a memcache is not required.
- */
- kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, range->start << PAGE_SHIFT,
- PAGE_SIZE, __pfn_to_phys(pfn),
- KVM_PGTABLE_PROT_R, NULL, 0);
-
+ kvm_nested_s2_unmap(kvm, range->may_block);
return false;
}
@@ -1809,9 +1973,13 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
if (!kvm->arch.mmu.pgt)
return false;
- return kvm_pgtable_stage2_test_clear_young(kvm->arch.mmu.pgt,
+ return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt,
range->start << PAGE_SHIFT,
size, true);
+ /*
+ * TODO: Handle nested_mmu structures here using the reverse mapping in
+ * a later version of the patch series.
+ */
}
bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
@@ -1821,7 +1989,7 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
if (!kvm->arch.mmu.pgt)
return false;
- return kvm_pgtable_stage2_test_clear_young(kvm->arch.mmu.pgt,
+ return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt,
range->start << PAGE_SHIFT,
size, false);
}
@@ -1931,6 +2099,7 @@ int __init kvm_mmu_init(u32 *hyp_va_bits)
goto out_destroy_pgtable;
io_map_base = hyp_idmap_start;
+ __hyp_va_bits = *hyp_va_bits;
return 0;
out_destroy_pgtable:
@@ -2054,11 +2223,6 @@ void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
{
}
-void kvm_arch_flush_shadow_all(struct kvm *kvm)
-{
- kvm_uninit_stage2_mmu(kvm);
-}
-
void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot)
{
@@ -2066,7 +2230,8 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
phys_addr_t size = slot->npages << PAGE_SHIFT;
write_lock(&kvm->mmu_lock);
- unmap_stage2_range(&kvm->arch.mmu, gpa, size);
+ kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, size, true);
+ kvm_nested_s2_unmap(kvm, true);
write_unlock(&kvm->mmu_lock);
}
diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
index ced30c90521a..4a3fc11f7ecf 100644
--- a/arch/arm64/kvm/nested.c
+++ b/arch/arm64/kvm/nested.c
@@ -4,17 +4,802 @@
* Author: Jintack Lim <jintack.lim@linaro.org>
*/
+#include <linux/bitfield.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
+#include <asm/kvm_arm.h>
#include <asm/kvm_emulate.h>
+#include <asm/kvm_mmu.h>
#include <asm/kvm_nested.h>
#include <asm/sysreg.h>
#include "sys_regs.h"
-/* Protection against the sysreg repainting madness... */
-#define NV_FTR(r, f) ID_AA64##r##_EL1_##f
+/*
+ * Ratio of live shadow S2 MMU per vcpu. This is a trade-off between
+ * memory usage and potential number of different sets of S2 PTs in
+ * the guests. Running out of S2 MMUs only affects performance (we
+ * will invalidate them more often).
+ */
+#define S2_MMU_PER_VCPU 2
+
+void kvm_init_nested(struct kvm *kvm)
+{
+ kvm->arch.nested_mmus = NULL;
+ kvm->arch.nested_mmus_size = 0;
+}
+
+static int init_nested_s2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
+{
+ /*
+ * We only initialise the IPA range on the canonical MMU, which
+ * defines the contract between KVM and userspace on where the
+ * "hardware" is in the IPA space. This affects the validity of MMIO
+ * exits forwarded to userspace, for example.
+ *
+ * For nested S2s, we use the PARange as exposed to the guest, as it
+ * is allowed to use it at will to expose whatever memory map it
+ * wants to its own guests as it would be on real HW.
+ */
+ return kvm_init_stage2_mmu(kvm, mmu, kvm_get_pa_bits(kvm));
+}
+
+int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_s2_mmu *tmp;
+ int num_mmus, ret = 0;
+
+ if (test_bit(KVM_ARM_VCPU_HAS_EL2_E2H0, kvm->arch.vcpu_features) &&
+ !cpus_have_final_cap(ARM64_HAS_HCR_NV1))
+ return -EINVAL;
+
+ /*
+ * Let's treat memory allocation failures as benign: If we fail to
+ * allocate anything, return an error and keep the allocated array
+	 * alive. Userspace may try to recover by initializing the vcpu
+ * again, and there is no reason to affect the whole VM for this.
+ */
+ num_mmus = atomic_read(&kvm->online_vcpus) * S2_MMU_PER_VCPU;
+ tmp = kvrealloc(kvm->arch.nested_mmus,
+ size_mul(sizeof(*kvm->arch.nested_mmus), num_mmus),
+ GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!tmp)
+ return -ENOMEM;
+
+ swap(kvm->arch.nested_mmus, tmp);
+
+ /*
+	 * If we went through a reallocation, adjust the MMU back-pointers in
+ * the previously initialised kvm_pgtable structures.
+ */
+ if (kvm->arch.nested_mmus != tmp)
+ for (int i = 0; i < kvm->arch.nested_mmus_size; i++)
+ kvm->arch.nested_mmus[i].pgt->mmu = &kvm->arch.nested_mmus[i];
+
+ for (int i = kvm->arch.nested_mmus_size; !ret && i < num_mmus; i++)
+ ret = init_nested_s2_mmu(kvm, &kvm->arch.nested_mmus[i]);
+
+ if (ret) {
+ for (int i = kvm->arch.nested_mmus_size; i < num_mmus; i++)
+ kvm_free_stage2_pgd(&kvm->arch.nested_mmus[i]);
+
+ return ret;
+ }
+
+ kvm->arch.nested_mmus_size = num_mmus;
+
+ return 0;
+}
+
+struct s2_walk_info {
+ int (*read_desc)(phys_addr_t pa, u64 *desc, void *data);
+ void *data;
+ u64 baddr;
+ unsigned int max_oa_bits;
+ unsigned int pgshift;
+ unsigned int sl;
+ unsigned int t0sz;
+ bool be;
+};
+
+static u32 compute_fsc(int level, u32 fsc)
+{
+ return fsc | (level & 0x3);
+}
+
+static int esr_s2_fault(struct kvm_vcpu *vcpu, int level, u32 fsc)
+{
+ u32 esr;
+
+ esr = kvm_vcpu_get_esr(vcpu) & ~ESR_ELx_FSC;
+ esr |= compute_fsc(level, fsc);
+ return esr;
+}
+
+static int get_ia_size(struct s2_walk_info *wi)
+{
+ return 64 - wi->t0sz;
+}
+
+static int check_base_s2_limits(struct s2_walk_info *wi,
+ int level, int input_size, int stride)
+{
+ int start_size, ia_size;
+
+ ia_size = get_ia_size(wi);
+
+ /* Check translation limits */
+ switch (BIT(wi->pgshift)) {
+ case SZ_64K:
+ if (level == 0 || (level == 1 && ia_size <= 42))
+ return -EFAULT;
+ break;
+ case SZ_16K:
+ if (level == 0 || (level == 1 && ia_size <= 40))
+ return -EFAULT;
+ break;
+ case SZ_4K:
+ if (level < 0 || (level == 0 && ia_size <= 42))
+ return -EFAULT;
+ break;
+ }
+
+ /* Check input size limits */
+ if (input_size > ia_size)
+ return -EFAULT;
+
+ /* Check number of entries in starting level table */
+ start_size = input_size - ((3 - level) * stride + wi->pgshift);
+ if (start_size < 1 || start_size > stride + 4)
+ return -EFAULT;
+
+ return 0;
+}
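
Illustrative only, not part of the patch: a standalone sketch of the
starting-level check above for a 4K granule and a 40-bit input range,
showing why a level-1 start (which needs concatenated tables) is accepted
while a level-2 start is not.

#include <stdio.h>

int main(void)
{
	const int pgshift = 12, stride = 9;	/* 4K granule */
	const int input_size = 40;		/* T0SZ = 24 */

	for (int level = 1; level <= 2; level++) {
		int start_size = input_size - ((3 - level) * stride + pgshift);

		/* Up to 'stride' bits: one table; up to stride + 4: concatenated tables */
		printf("start level %d: %2d index bits -> %s\n", level, start_size,
		       (start_size >= 1 && start_size <= stride + 4) ? "ok" : "fault");
	}
	return 0;
}
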
+
+/* Check if output is within boundaries */
+static int check_output_size(struct s2_walk_info *wi, phys_addr_t output)
+{
+ unsigned int output_size = wi->max_oa_bits;
+
+ if (output_size != 48 && (output & GENMASK_ULL(47, output_size)))
+ return -1;
+
+ return 0;
+}
+
+/*
+ * This is essentially a C-version of the pseudo code from the ARM ARM
+ * AArch64.TranslationTableWalk function. I strongly recommend looking at
+ * that pseudocode in trying to understand this.
+ *
+ * Must be called with the kvm->srcu read lock held
+ */
+static int walk_nested_s2_pgd(phys_addr_t ipa,
+ struct s2_walk_info *wi, struct kvm_s2_trans *out)
+{
+ int first_block_level, level, stride, input_size, base_lower_bound;
+ phys_addr_t base_addr;
+ unsigned int addr_top, addr_bottom;
+ u64 desc; /* page table entry */
+ int ret;
+ phys_addr_t paddr;
+
+ switch (BIT(wi->pgshift)) {
+ default:
+ case SZ_64K:
+ case SZ_16K:
+ level = 3 - wi->sl;
+ first_block_level = 2;
+ break;
+ case SZ_4K:
+ level = 2 - wi->sl;
+ first_block_level = 1;
+ break;
+ }
+
+ stride = wi->pgshift - 3;
+ input_size = get_ia_size(wi);
+ if (input_size > 48 || input_size < 25)
+ return -EFAULT;
+
+ ret = check_base_s2_limits(wi, level, input_size, stride);
+ if (WARN_ON(ret))
+ return ret;
+
+ base_lower_bound = 3 + input_size - ((3 - level) * stride +
+ wi->pgshift);
+ base_addr = wi->baddr & GENMASK_ULL(47, base_lower_bound);
+
+ if (check_output_size(wi, base_addr)) {
+ out->esr = compute_fsc(level, ESR_ELx_FSC_ADDRSZ);
+ return 1;
+ }
+
+ addr_top = input_size - 1;
+
+ while (1) {
+ phys_addr_t index;
+
+ addr_bottom = (3 - level) * stride + wi->pgshift;
+ index = (ipa & GENMASK_ULL(addr_top, addr_bottom))
+ >> (addr_bottom - 3);
+
+ paddr = base_addr | index;
+ ret = wi->read_desc(paddr, &desc, wi->data);
+ if (ret < 0)
+ return ret;
+
+ /*
+		 * Byte-swap the descriptor if the endianness differs between
+		 * the host and the guest hypervisor.
+ */
+ if (wi->be)
+ desc = be64_to_cpu((__force __be64)desc);
+ else
+ desc = le64_to_cpu((__force __le64)desc);
+
+ /* Check for valid descriptor at this point */
+ if (!(desc & 1) || ((desc & 3) == 1 && level == 3)) {
+ out->esr = compute_fsc(level, ESR_ELx_FSC_FAULT);
+ out->desc = desc;
+ return 1;
+ }
+
+ /* We're at the final level or block translation level */
+ if ((desc & 3) == 1 || level == 3)
+ break;
+
+ if (check_output_size(wi, desc)) {
+ out->esr = compute_fsc(level, ESR_ELx_FSC_ADDRSZ);
+ out->desc = desc;
+ return 1;
+ }
+
+ base_addr = desc & GENMASK_ULL(47, wi->pgshift);
+
+ level += 1;
+ addr_top = addr_bottom - 1;
+ }
+
+ if (level < first_block_level) {
+ out->esr = compute_fsc(level, ESR_ELx_FSC_FAULT);
+ out->desc = desc;
+ return 1;
+ }
+
+ if (check_output_size(wi, desc)) {
+ out->esr = compute_fsc(level, ESR_ELx_FSC_ADDRSZ);
+ out->desc = desc;
+ return 1;
+ }
+
+ if (!(desc & BIT(10))) {
+ out->esr = compute_fsc(level, ESR_ELx_FSC_ACCESS);
+ out->desc = desc;
+ return 1;
+ }
+
+ addr_bottom += contiguous_bit_shift(desc, wi, level);
+
+ /* Calculate and return the result */
+ paddr = (desc & GENMASK_ULL(47, addr_bottom)) |
+ (ipa & GENMASK_ULL(addr_bottom - 1, 0));
+ out->output = paddr;
+ out->block_size = 1UL << ((3 - level) * stride + wi->pgshift);
+ out->readable = desc & (0b01 << 6);
+ out->writable = desc & (0b10 << 6);
+ out->level = level;
+ out->desc = desc;
+ return 0;
+}
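
Illustrative only, not part of the patch: a standalone sketch of the
per-level table-offset computation used in the walk loop above, assuming a
4K granule and a 39-bit input range (T0SZ = 25) with the walk starting at
level 1; GENMASK_ULL is redefined locally to mirror the kernel macro.

#include <stdint.h>
#include <stdio.h>

#define GENMASK_ULL(h, l) \
	((~0ULL << (l)) & (~0ULL >> (63 - (h))))

int main(void)
{
	const unsigned int pgshift = 12, stride = 9;	/* 4K granule */
	unsigned int addr_top = 38;			/* input_size - 1 for T0SZ = 25 */
	uint64_t ipa = 0x40201000ULL;

	for (int level = 1; level <= 3; level++) {
		unsigned int addr_bottom = (3 - level) * stride + pgshift;
		/* Byte offset of the descriptor within this level's table */
		uint64_t off = (ipa & GENMASK_ULL(addr_top, addr_bottom)) >>
			       (addr_bottom - 3);

		printf("level %d: descriptor at table offset 0x%llx\n",
		       level, (unsigned long long)off);
		addr_top = addr_bottom - 1;
	}
	return 0;
}
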
+
+static int read_guest_s2_desc(phys_addr_t pa, u64 *desc, void *data)
+{
+ struct kvm_vcpu *vcpu = data;
+
+ return kvm_read_guest(vcpu->kvm, pa, desc, sizeof(*desc));
+}
+
+static void vtcr_to_walk_info(u64 vtcr, struct s2_walk_info *wi)
+{
+ wi->t0sz = vtcr & TCR_EL2_T0SZ_MASK;
+
+ switch (vtcr & VTCR_EL2_TG0_MASK) {
+ case VTCR_EL2_TG0_4K:
+ wi->pgshift = 12; break;
+ case VTCR_EL2_TG0_16K:
+ wi->pgshift = 14; break;
+ case VTCR_EL2_TG0_64K:
+ default: /* IMPDEF: treat any other value as 64k */
+ wi->pgshift = 16; break;
+ }
+
+ wi->sl = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr);
+ /* Global limit for now, should eventually be per-VM */
+ wi->max_oa_bits = min(get_kvm_ipa_limit(),
+ ps_to_output_size(FIELD_GET(VTCR_EL2_PS_MASK, vtcr)));
+}
+
+int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa,
+ struct kvm_s2_trans *result)
+{
+ u64 vtcr = vcpu_read_sys_reg(vcpu, VTCR_EL2);
+ struct s2_walk_info wi;
+ int ret;
+
+ result->esr = 0;
+
+ if (!vcpu_has_nv(vcpu))
+ return 0;
+
+ wi.read_desc = read_guest_s2_desc;
+ wi.data = vcpu;
+ wi.baddr = vcpu_read_sys_reg(vcpu, VTTBR_EL2);
+
+ vtcr_to_walk_info(vtcr, &wi);
+
+ wi.be = vcpu_read_sys_reg(vcpu, SCTLR_EL2) & SCTLR_ELx_EE;
+
+ ret = walk_nested_s2_pgd(gipa, &wi, result);
+ if (ret)
+ result->esr |= (kvm_vcpu_get_esr(vcpu) & ~ESR_ELx_FSC);
+
+ return ret;
+}
+
+static unsigned int ttl_to_size(u8 ttl)
+{
+ int level = ttl & 3;
+ int gran = (ttl >> 2) & 3;
+ unsigned int max_size = 0;
+
+ switch (gran) {
+ case TLBI_TTL_TG_4K:
+ switch (level) {
+ case 0:
+ break;
+ case 1:
+ max_size = SZ_1G;
+ break;
+ case 2:
+ max_size = SZ_2M;
+ break;
+ case 3:
+ max_size = SZ_4K;
+ break;
+ }
+ break;
+ case TLBI_TTL_TG_16K:
+ switch (level) {
+ case 0:
+ case 1:
+ break;
+ case 2:
+ max_size = SZ_32M;
+ break;
+ case 3:
+ max_size = SZ_16K;
+ break;
+ }
+ break;
+ case TLBI_TTL_TG_64K:
+ switch (level) {
+ case 0:
+ case 1:
+ /* No 52bit IPA support */
+ break;
+ case 2:
+ max_size = SZ_512M;
+ break;
+ case 3:
+ max_size = SZ_64K;
+ break;
+ }
+ break;
+ default: /* No size information */
+ break;
+ }
+
+ return max_size;
+}
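
Illustrative only, not part of the patch: the TTL hint packs the granule in
bits [3:2] and the level in bits [1:0]; a standalone sketch for a 4K granule
(the local TG_4K value of 1 matches the architected encoding).

#include <stdio.h>

enum { TG_4K = 1 };	/* architected TTL granule encoding for 4K */

int main(void)
{
	/* 4K granule: levels 1, 2 and 3 map 1 GiB, 2 MiB and 4 KiB respectively */
	static const unsigned long max_sz[4] = { 0, 1UL << 30, 1UL << 21, 1UL << 12 };

	for (int level = 1; level <= 3; level++) {
		unsigned char ttl = (TG_4K << 2) | level;

		printf("ttl 0x%02x -> invalidate at most %lu bytes\n",
		       ttl, max_sz[ttl & 3]);
	}
	return 0;
}
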
+
+/*
+ * Compute the equivalent of the TTL field by parsing the shadow PT. The
+ * granule size is extracted from the cached VTCR_EL2.TG0 while the level is
+ * retrieved from the first entry carrying the level as a tag.
+ */
+static u8 get_guest_mapping_ttl(struct kvm_s2_mmu *mmu, u64 addr)
+{
+ u64 tmp, sz = 0, vtcr = mmu->tlb_vtcr;
+ kvm_pte_t pte;
+ u8 ttl, level;
+
+ lockdep_assert_held_write(&kvm_s2_mmu_to_kvm(mmu)->mmu_lock);
+
+ switch (vtcr & VTCR_EL2_TG0_MASK) {
+ case VTCR_EL2_TG0_4K:
+ ttl = (TLBI_TTL_TG_4K << 2);
+ break;
+ case VTCR_EL2_TG0_16K:
+ ttl = (TLBI_TTL_TG_16K << 2);
+ break;
+ case VTCR_EL2_TG0_64K:
+ default: /* IMPDEF: treat any other value as 64k */
+ ttl = (TLBI_TTL_TG_64K << 2);
+ break;
+ }
+
+ tmp = addr;
+
+again:
+ /* Iteratively compute the block sizes for a particular granule size */
+ switch (vtcr & VTCR_EL2_TG0_MASK) {
+ case VTCR_EL2_TG0_4K:
+ if (sz < SZ_4K) sz = SZ_4K;
+ else if (sz < SZ_2M) sz = SZ_2M;
+ else if (sz < SZ_1G) sz = SZ_1G;
+ else sz = 0;
+ break;
+ case VTCR_EL2_TG0_16K:
+ if (sz < SZ_16K) sz = SZ_16K;
+ else if (sz < SZ_32M) sz = SZ_32M;
+ else sz = 0;
+ break;
+ case VTCR_EL2_TG0_64K:
+ default: /* IMPDEF: treat any other value as 64k */
+ if (sz < SZ_64K) sz = SZ_64K;
+ else if (sz < SZ_512M) sz = SZ_512M;
+ else sz = 0;
+ break;
+ }
+
+ if (sz == 0)
+ return 0;
+
+ tmp &= ~(sz - 1);
+ if (kvm_pgtable_get_leaf(mmu->pgt, tmp, &pte, NULL))
+ goto again;
+ if (!(pte & PTE_VALID))
+ goto again;
+ level = FIELD_GET(KVM_NV_GUEST_MAP_SZ, pte);
+ if (!level)
+ goto again;
+
+ ttl |= level;
+
+ /*
+	 * We have now found some level information in the shadow S2. Check
+	 * that the resulting range actually includes the original IPA.
+ */
+ sz = ttl_to_size(ttl);
+ if (addr < (tmp + sz))
+ return ttl;
+
+ return 0;
+}
+
+unsigned long compute_tlb_inval_range(struct kvm_s2_mmu *mmu, u64 val)
+{
+ struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
+ unsigned long max_size;
+ u8 ttl;
+
+ ttl = FIELD_GET(TLBI_TTL_MASK, val);
+
+ if (!ttl || !kvm_has_feat(kvm, ID_AA64MMFR2_EL1, TTL, IMP)) {
+ /* No TTL, check the shadow S2 for a hint */
+ u64 addr = (val & GENMASK_ULL(35, 0)) << 12;
+ ttl = get_guest_mapping_ttl(mmu, addr);
+ }
+
+ max_size = ttl_to_size(ttl);
+
+ if (!max_size) {
+ /* Compute the maximum extent of the invalidation */
+ switch (mmu->tlb_vtcr & VTCR_EL2_TG0_MASK) {
+ case VTCR_EL2_TG0_4K:
+ max_size = SZ_1G;
+ break;
+ case VTCR_EL2_TG0_16K:
+ max_size = SZ_32M;
+ break;
+ case VTCR_EL2_TG0_64K:
+ default: /* IMPDEF: treat any other value as 64k */
+ /*
+ * No, we do not support 52bit IPA in nested yet. Once
+ * we do, this should be 4TB.
+ */
+ max_size = SZ_512M;
+ break;
+ }
+ }
+
+ WARN_ON(!max_size);
+ return max_size;
+}
+
+/*
+ * We can have multiple *different* MMU contexts with the same VMID:
+ *
+ * - S2 being enabled or not, hence differing by the HCR_EL2.VM bit
+ *
+ * - Multiple vcpus using private S2s (huh huh...), hence differing by the
+ *   VTTBR_EL2.BADDR address
+ *
+ * - A combination of the above...
+ *
+ * We can always identify which MMU context to pick at run-time. However,
+ * TLB invalidation involving a VMID must take action on all the TLBs using
+ * this particular VMID. This translates into applying the same invalidation
+ * operation to all the contexts that are using this VMID. Moar phun!
+ */
+void kvm_s2_mmu_iterate_by_vmid(struct kvm *kvm, u16 vmid,
+ const union tlbi_info *info,
+ void (*tlbi_callback)(struct kvm_s2_mmu *,
+ const union tlbi_info *))
+{
+ write_lock(&kvm->mmu_lock);
+
+ for (int i = 0; i < kvm->arch.nested_mmus_size; i++) {
+ struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
+
+ if (!kvm_s2_mmu_valid(mmu))
+ continue;
+
+ if (vmid == get_vmid(mmu->tlb_vttbr))
+ tlbi_callback(mmu, info);
+ }
+
+ write_unlock(&kvm->mmu_lock);
+}
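
Illustrative only, not part of the patch: a hypothetical caller of the
helper above, written in the same kernel style (the names tlbi_s2_vmall and
emulate_tlbi_vmalls12e1 are made up; the real TLBI emulation lives in
sys_regs.c). It simply unmaps every shadow S2 tagged with the guest's VMID:

static void tlbi_s2_vmall(struct kvm_s2_mmu *mmu, const union tlbi_info *info)
{
	/* kvm_s2_mmu_iterate_by_vmid() already holds the mmu_lock for us */
	kvm_stage2_unmap_range(mmu, 0, kvm_phys_size(mmu), true);
}

static void emulate_tlbi_vmalls12e1(struct kvm_vcpu *vcpu)
{
	u64 vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2);

	/* No per-operation arguments are needed, so no tlbi_info is passed */
	kvm_s2_mmu_iterate_by_vmid(vcpu->kvm, get_vmid(vttbr), NULL, tlbi_s2_vmall);
}
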
+
+struct kvm_s2_mmu *lookup_s2_mmu(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+ bool nested_stage2_enabled;
+ u64 vttbr, vtcr, hcr;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2);
+ vtcr = vcpu_read_sys_reg(vcpu, VTCR_EL2);
+ hcr = vcpu_read_sys_reg(vcpu, HCR_EL2);
+
+ nested_stage2_enabled = hcr & HCR_VM;
+
+ /* Don't consider the CnP bit for the vttbr match */
+ vttbr &= ~VTTBR_CNP_BIT;
+
+ /*
+ * Two possibilities when looking up a S2 MMU context:
+ *
+ * - either S2 is enabled in the guest, and we need a context that is
+ * S2-enabled and matches the full VTTBR (VMID+BADDR) and VTCR,
+ * which makes it safe from a TLB conflict perspective (a broken
+ * guest won't be able to generate them),
+ *
+ * - or S2 is disabled, and we need a context that is S2-disabled
+ * and matches the VMID only, as all TLBs are tagged by VMID even
+ * if S2 translation is disabled.
+ */
+ for (int i = 0; i < kvm->arch.nested_mmus_size; i++) {
+ struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
+
+ if (!kvm_s2_mmu_valid(mmu))
+ continue;
+
+ if (nested_stage2_enabled &&
+ mmu->nested_stage2_enabled &&
+ vttbr == mmu->tlb_vttbr &&
+ vtcr == mmu->tlb_vtcr)
+ return mmu;
+
+ if (!nested_stage2_enabled &&
+ !mmu->nested_stage2_enabled &&
+ get_vmid(vttbr) == get_vmid(mmu->tlb_vttbr))
+ return mmu;
+ }
+ return NULL;
+}
+
+static struct kvm_s2_mmu *get_s2_mmu_nested(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_s2_mmu *s2_mmu;
+ int i;
+
+ lockdep_assert_held_write(&vcpu->kvm->mmu_lock);
+
+ s2_mmu = lookup_s2_mmu(vcpu);
+ if (s2_mmu)
+ goto out;
+
+ /*
+ * Make sure we don't always search from the same point, or we
+ * will always reuse a potentially active context, leaving
+ * free contexts unused.
+ */
+ for (i = kvm->arch.nested_mmus_next;
+ i < (kvm->arch.nested_mmus_size + kvm->arch.nested_mmus_next);
+ i++) {
+ s2_mmu = &kvm->arch.nested_mmus[i % kvm->arch.nested_mmus_size];
+
+ if (atomic_read(&s2_mmu->refcnt) == 0)
+ break;
+ }
+ BUG_ON(atomic_read(&s2_mmu->refcnt)); /* We have struct MMUs to spare */
+
+ /* Set the scene for the next search */
+ kvm->arch.nested_mmus_next = (i + 1) % kvm->arch.nested_mmus_size;
+
+ /* Make sure we don't forget to do the laundry */
+ if (kvm_s2_mmu_valid(s2_mmu))
+ s2_mmu->pending_unmap = true;
+
+ /*
+ * The virtual VMID (modulo CnP) will be used as a key when matching
+ * an existing kvm_s2_mmu.
+ *
+ * We cache VTCR at allocation time, once and for all. It'd be great
+ * if the guest didn't screw that one up, as this is not very
+ * forgiving...
+ */
+ s2_mmu->tlb_vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2) & ~VTTBR_CNP_BIT;
+ s2_mmu->tlb_vtcr = vcpu_read_sys_reg(vcpu, VTCR_EL2);
+ s2_mmu->nested_stage2_enabled = vcpu_read_sys_reg(vcpu, HCR_EL2) & HCR_VM;
+
+out:
+ atomic_inc(&s2_mmu->refcnt);
+
+ /*
+ * Set the vCPU request to perform an unmap, even if the pending unmap
+ * originates from another vCPU. This guarantees that the MMU has been
+ * completely unmapped before any vCPU actually uses it, and allows
+ * multiple vCPUs to lend a hand with completing the unmap.
+ */
+ if (s2_mmu->pending_unmap)
+ kvm_make_request(KVM_REQ_NESTED_S2_UNMAP, vcpu);
+
+ return s2_mmu;
+}
+
+void kvm_init_nested_s2_mmu(struct kvm_s2_mmu *mmu)
+{
+ /* CnP being set denotes an invalid entry */
+ mmu->tlb_vttbr = VTTBR_CNP_BIT;
+ mmu->nested_stage2_enabled = false;
+ atomic_set(&mmu->refcnt, 0);
+}
+
+void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu)
+{
+ /*
+ * The vCPU kept its reference on the MMU after the last put, keep
+ * rolling with it.
+ */
+ if (vcpu->arch.hw_mmu)
+ return;
+
+ if (is_hyp_ctxt(vcpu)) {
+ vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu;
+ } else {
+ write_lock(&vcpu->kvm->mmu_lock);
+ vcpu->arch.hw_mmu = get_s2_mmu_nested(vcpu);
+ write_unlock(&vcpu->kvm->mmu_lock);
+ }
+}
+
+void kvm_vcpu_put_hw_mmu(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Keep a reference on the associated stage-2 MMU if the vCPU is
+ * scheduling out and not in WFI emulation, suggesting it is likely to
+ * reuse the MMU sometime soon.
+ */
+ if (vcpu->scheduled_out && !vcpu_get_flag(vcpu, IN_WFI))
+ return;
+
+ if (kvm_is_nested_s2_mmu(vcpu->kvm, vcpu->arch.hw_mmu))
+ atomic_dec(&vcpu->arch.hw_mmu->refcnt);
+
+ vcpu->arch.hw_mmu = NULL;
+}
+
+/*
+ * Returns non-zero if permission fault is handled by injecting it to the next
+ * level hypervisor.
+ */
+int kvm_s2_handle_perm_fault(struct kvm_vcpu *vcpu, struct kvm_s2_trans *trans)
+{
+ bool forward_fault = false;
+
+ trans->esr = 0;
+
+ if (!kvm_vcpu_trap_is_permission_fault(vcpu))
+ return 0;
+
+ if (kvm_vcpu_trap_is_iabt(vcpu)) {
+ forward_fault = !kvm_s2_trans_executable(trans);
+ } else {
+ bool write_fault = kvm_is_write_fault(vcpu);
+
+ forward_fault = ((write_fault && !trans->writable) ||
+ (!write_fault && !trans->readable));
+ }
+
+ if (forward_fault)
+ trans->esr = esr_s2_fault(vcpu, trans->level, ESR_ELx_FSC_PERM);
+
+ return forward_fault;
+}
+
+int kvm_inject_s2_fault(struct kvm_vcpu *vcpu, u64 esr_el2)
+{
+ vcpu_write_sys_reg(vcpu, vcpu->arch.fault.far_el2, FAR_EL2);
+ vcpu_write_sys_reg(vcpu, vcpu->arch.fault.hpfar_el2, HPFAR_EL2);
+
+ return kvm_inject_nested_sync(vcpu, esr_el2);
+}
+
+void kvm_nested_s2_wp(struct kvm *kvm)
+{
+ int i;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ for (i = 0; i < kvm->arch.nested_mmus_size; i++) {
+ struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
+
+ if (kvm_s2_mmu_valid(mmu))
+ kvm_stage2_wp_range(mmu, 0, kvm_phys_size(mmu));
+ }
+}
+
+void kvm_nested_s2_unmap(struct kvm *kvm, bool may_block)
+{
+ int i;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ for (i = 0; i < kvm->arch.nested_mmus_size; i++) {
+ struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
+
+ if (kvm_s2_mmu_valid(mmu))
+ kvm_stage2_unmap_range(mmu, 0, kvm_phys_size(mmu), may_block);
+ }
+}
+
+void kvm_nested_s2_flush(struct kvm *kvm)
+{
+ int i;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ for (i = 0; i < kvm->arch.nested_mmus_size; i++) {
+ struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
+
+ if (kvm_s2_mmu_valid(mmu))
+ kvm_stage2_flush_range(mmu, 0, kvm_phys_size(mmu));
+ }
+}
+
+void kvm_arch_flush_shadow_all(struct kvm *kvm)
+{
+ int i;
+
+ for (i = 0; i < kvm->arch.nested_mmus_size; i++) {
+ struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
+
+ if (!WARN_ON(atomic_read(&mmu->refcnt)))
+ kvm_free_stage2_pgd(mmu);
+ }
+ kvfree(kvm->arch.nested_mmus);
+ kvm->arch.nested_mmus = NULL;
+ kvm->arch.nested_mmus_size = 0;
+ kvm_uninit_stage2_mmu(kvm);
+}
/*
* Our emulated CPU doesn't support all the possible features. For the
@@ -23,156 +808,162 @@
* This list should get updated as new features get added to the NV
* support, and new extension to the architecture.
*/
-static u64 limit_nv_id_reg(u32 id, u64 val)
+u64 limit_nv_id_reg(struct kvm *kvm, u32 reg, u64 val)
{
- u64 tmp;
-
- switch (id) {
+ switch (reg) {
case SYS_ID_AA64ISAR0_EL1:
- /* Support everything but TME, O.S. and Range TLBIs */
- val &= ~(NV_FTR(ISAR0, TLB) |
- NV_FTR(ISAR0, TME));
+ /* Support everything but TME */
+ val &= ~ID_AA64ISAR0_EL1_TME;
break;
case SYS_ID_AA64ISAR1_EL1:
- /* Support everything but PtrAuth and Spec Invalidation */
- val &= ~(GENMASK_ULL(63, 56) |
- NV_FTR(ISAR1, SPECRES) |
- NV_FTR(ISAR1, GPI) |
- NV_FTR(ISAR1, GPA) |
- NV_FTR(ISAR1, API) |
- NV_FTR(ISAR1, APA));
+ /* Support everything but LS64 and Spec Invalidation */
+ val &= ~(ID_AA64ISAR1_EL1_LS64 |
+ ID_AA64ISAR1_EL1_SPECRES);
break;
case SYS_ID_AA64PFR0_EL1:
- /* No AMU, MPAM, S-EL2, RAS or SVE */
- val &= ~(GENMASK_ULL(55, 52) |
- NV_FTR(PFR0, AMU) |
- NV_FTR(PFR0, MPAM) |
- NV_FTR(PFR0, SEL2) |
- NV_FTR(PFR0, RAS) |
- NV_FTR(PFR0, SVE) |
- NV_FTR(PFR0, EL3) |
- NV_FTR(PFR0, EL2) |
- NV_FTR(PFR0, EL1));
- /* 64bit EL1/EL2/EL3 only */
- val |= FIELD_PREP(NV_FTR(PFR0, EL1), 0b0001);
- val |= FIELD_PREP(NV_FTR(PFR0, EL2), 0b0001);
- val |= FIELD_PREP(NV_FTR(PFR0, EL3), 0b0001);
+ /* No RME, AMU, MPAM, S-EL2, or RAS */
+ val &= ~(ID_AA64PFR0_EL1_RME |
+ ID_AA64PFR0_EL1_AMU |
+ ID_AA64PFR0_EL1_MPAM |
+ ID_AA64PFR0_EL1_SEL2 |
+ ID_AA64PFR0_EL1_RAS |
+ ID_AA64PFR0_EL1_EL3 |
+ ID_AA64PFR0_EL1_EL2 |
+ ID_AA64PFR0_EL1_EL1 |
+ ID_AA64PFR0_EL1_EL0);
+ /* 64bit only at any EL */
+ val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL0, IMP);
+ val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL1, IMP);
+ val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL2, IMP);
+ val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL3, IMP);
break;
case SYS_ID_AA64PFR1_EL1:
- /* Only support SSBS */
- val &= NV_FTR(PFR1, SSBS);
+ /* Only support BTI, SSBS, CSV2_frac */
+ val &= (ID_AA64PFR1_EL1_BT |
+ ID_AA64PFR1_EL1_SSBS |
+ ID_AA64PFR1_EL1_CSV2_frac);
break;
case SYS_ID_AA64MMFR0_EL1:
- /* Hide ECV, ExS, Secure Memory */
- val &= ~(NV_FTR(MMFR0, ECV) |
- NV_FTR(MMFR0, EXS) |
- NV_FTR(MMFR0, TGRAN4_2) |
- NV_FTR(MMFR0, TGRAN16_2) |
- NV_FTR(MMFR0, TGRAN64_2) |
- NV_FTR(MMFR0, SNSMEM));
+ /* Hide ExS, Secure Memory */
+ val &= ~(ID_AA64MMFR0_EL1_EXS |
+ ID_AA64MMFR0_EL1_TGRAN4_2 |
+ ID_AA64MMFR0_EL1_TGRAN16_2 |
+ ID_AA64MMFR0_EL1_TGRAN64_2 |
+ ID_AA64MMFR0_EL1_SNSMEM);
+
+ /* Hide CNTPOFF if present */
+ val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64MMFR0_EL1, ECV, IMP);
/* Disallow unsupported S2 page sizes */
switch (PAGE_SIZE) {
case SZ_64K:
- val |= FIELD_PREP(NV_FTR(MMFR0, TGRAN16_2), 0b0001);
+ val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN16_2, NI);
fallthrough;
case SZ_16K:
- val |= FIELD_PREP(NV_FTR(MMFR0, TGRAN4_2), 0b0001);
+ val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN4_2, NI);
fallthrough;
case SZ_4K:
/* Support everything */
break;
}
+
/*
- * Since we can't support a guest S2 page size smaller than
- * the host's own page size (due to KVM only populating its
- * own S2 using the kernel's page size), advertise the
- * limitation using FEAT_GTG.
+ * Since we can't support a guest S2 page size smaller
+ * than the host's own page size (due to KVM only
+ * populating its own S2 using the kernel's page
+ * size), advertise the limitation using FEAT_GTG.
*/
switch (PAGE_SIZE) {
case SZ_4K:
- val |= FIELD_PREP(NV_FTR(MMFR0, TGRAN4_2), 0b0010);
+ val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN4_2, IMP);
fallthrough;
case SZ_16K:
- val |= FIELD_PREP(NV_FTR(MMFR0, TGRAN16_2), 0b0010);
+ val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN16_2, IMP);
fallthrough;
case SZ_64K:
- val |= FIELD_PREP(NV_FTR(MMFR0, TGRAN64_2), 0b0010);
+ val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN64_2, IMP);
break;
}
+
/* Cap PARange to 48bits */
- tmp = FIELD_GET(NV_FTR(MMFR0, PARANGE), val);
- if (tmp > 0b0101) {
- val &= ~NV_FTR(MMFR0, PARANGE);
- val |= FIELD_PREP(NV_FTR(MMFR0, PARANGE), 0b0101);
- }
+ val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64MMFR0_EL1, PARANGE, 48);
break;
case SYS_ID_AA64MMFR1_EL1:
- val &= (NV_FTR(MMFR1, HCX) |
- NV_FTR(MMFR1, PAN) |
- NV_FTR(MMFR1, LO) |
- NV_FTR(MMFR1, HPDS) |
- NV_FTR(MMFR1, VH) |
- NV_FTR(MMFR1, VMIDBits));
+ val &= (ID_AA64MMFR1_EL1_HCX |
+ ID_AA64MMFR1_EL1_PAN |
+ ID_AA64MMFR1_EL1_LO |
+ ID_AA64MMFR1_EL1_HPDS |
+ ID_AA64MMFR1_EL1_VH |
+ ID_AA64MMFR1_EL1_VMIDBits);
+ /* FEAT_E2H0 implies no VHE */
+ if (test_bit(KVM_ARM_VCPU_HAS_EL2_E2H0, kvm->arch.vcpu_features))
+ val &= ~ID_AA64MMFR1_EL1_VH;
break;
case SYS_ID_AA64MMFR2_EL1:
- val &= ~(NV_FTR(MMFR2, BBM) |
- NV_FTR(MMFR2, TTL) |
+ val &= ~(ID_AA64MMFR2_EL1_BBM |
+ ID_AA64MMFR2_EL1_TTL |
GENMASK_ULL(47, 44) |
- NV_FTR(MMFR2, ST) |
- NV_FTR(MMFR2, CCIDX) |
- NV_FTR(MMFR2, VARange));
+ ID_AA64MMFR2_EL1_ST |
+ ID_AA64MMFR2_EL1_CCIDX |
+ ID_AA64MMFR2_EL1_VARange);
/* Force TTL support */
- val |= FIELD_PREP(NV_FTR(MMFR2, TTL), 0b0001);
+ val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR2_EL1, TTL, IMP);
break;
case SYS_ID_AA64MMFR4_EL1:
- val = 0;
- if (!cpus_have_final_cap(ARM64_HAS_HCR_NV1))
- val |= FIELD_PREP(NV_FTR(MMFR4, E2H0),
- ID_AA64MMFR4_EL1_E2H0_NI_NV1);
+ /*
+ * You get EITHER
+ *
+ * - FEAT_VHE without FEAT_E2H0
+ * - FEAT_NV limited to FEAT_NV2
+ * - HCR_EL2.NV1 being RES0
+ *
+ * OR
+ *
+ * - FEAT_E2H0 without FEAT_VHE nor FEAT_NV
+ *
+ * Life is too short for anything else.
+ */
+ if (test_bit(KVM_ARM_VCPU_HAS_EL2_E2H0, kvm->arch.vcpu_features)) {
+ val = 0;
+ } else {
+ val = SYS_FIELD_PREP_ENUM(ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY);
+ val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR4_EL1, E2H0, NI_NV1);
+ }
break;
case SYS_ID_AA64DFR0_EL1:
- /* Only limited support for PMU, Debug, BPs and WPs */
- val &= (NV_FTR(DFR0, PMUVer) |
- NV_FTR(DFR0, WRPs) |
- NV_FTR(DFR0, BRPs) |
- NV_FTR(DFR0, DebugVer));
+ /* Only limited support for PMU, Debug, BPs, WPs, and HPMN0 */
+ val &= (ID_AA64DFR0_EL1_PMUVer |
+ ID_AA64DFR0_EL1_WRPs |
+ ID_AA64DFR0_EL1_BRPs |
+		ID_AA64DFR0_EL1_DebugVer |
+ ID_AA64DFR0_EL1_HPMN0);
/* Cap Debug to ARMv8.1 */
- tmp = FIELD_GET(NV_FTR(DFR0, DebugVer), val);
- if (tmp > 0b0111) {
- val &= ~NV_FTR(DFR0, DebugVer);
- val |= FIELD_PREP(NV_FTR(DFR0, DebugVer), 0b0111);
- }
- break;
-
- default:
- /* Unknown register, just wipe it clean */
- val = 0;
+ val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64DFR0_EL1, DebugVer, VHE);
break;
}
return val;
}
-u64 kvm_vcpu_sanitise_vncr_reg(const struct kvm_vcpu *vcpu, enum vcpu_sysreg sr)
+u64 kvm_vcpu_apply_reg_masks(const struct kvm_vcpu *vcpu,
+ enum vcpu_sysreg sr, u64 v)
{
- u64 v = ctxt_sys_reg(&vcpu->arch.ctxt, sr);
struct kvm_sysreg_masks *masks;
masks = vcpu->kvm->arch.sysreg_masks;
if (masks) {
- sr -= __VNCR_START__;
+ sr -= __SANITISED_REG_START__;
v &= ~masks->mask[sr].res0;
v |= masks->mask[sr].res1;
@@ -181,34 +972,32 @@ u64 kvm_vcpu_sanitise_vncr_reg(const struct kvm_vcpu *vcpu, enum vcpu_sysreg sr)
return v;
}
-static void set_sysreg_masks(struct kvm *kvm, int sr, u64 res0, u64 res1)
+static __always_inline void set_sysreg_masks(struct kvm *kvm, int sr, u64 res0, u64 res1)
{
- int i = sr - __VNCR_START__;
+ int i = sr - __SANITISED_REG_START__;
+
+ BUILD_BUG_ON(!__builtin_constant_p(sr));
+ BUILD_BUG_ON(sr < __SANITISED_REG_START__);
+ BUILD_BUG_ON(sr >= NR_SYS_REGS);
kvm->arch.sysreg_masks->mask[i].res0 = res0;
kvm->arch.sysreg_masks->mask[i].res1 = res1;
}
-int kvm_init_nv_sysregs(struct kvm *kvm)
+int kvm_init_nv_sysregs(struct kvm_vcpu *vcpu)
{
+ struct kvm *kvm = vcpu->kvm;
u64 res0, res1;
- int ret = 0;
- mutex_lock(&kvm->arch.config_lock);
+ lockdep_assert_held(&kvm->arch.config_lock);
if (kvm->arch.sysreg_masks)
goto out;
kvm->arch.sysreg_masks = kzalloc(sizeof(*(kvm->arch.sysreg_masks)),
- GFP_KERNEL);
- if (!kvm->arch.sysreg_masks) {
- ret = -ENOMEM;
- goto out;
- }
-
- for (int i = 0; i < KVM_ARM_ID_REG_NUM; i++)
- kvm->arch.id_regs[i] = limit_nv_id_reg(IDX_IDREG(i),
- kvm->arch.id_regs[i]);
+ GFP_KERNEL_ACCOUNT);
+ if (!kvm->arch.sysreg_masks)
+ return -ENOMEM;
/* VTTBR_EL2 */
res0 = res1 = 0;
@@ -248,12 +1037,13 @@ int kvm_init_nv_sysregs(struct kvm *kvm)
res0 |= HCR_FIEN;
if (!kvm_has_feat(kvm, ID_AA64MMFR2_EL1, FWB, IMP))
res0 |= HCR_FWB;
- if (!kvm_has_feat(kvm, ID_AA64MMFR2_EL1, NV, NV2))
- res0 |= HCR_NV2;
- if (!kvm_has_feat(kvm, ID_AA64MMFR2_EL1, NV, IMP))
- res0 |= (HCR_AT | HCR_NV1 | HCR_NV);
- if (!(__vcpu_has_feature(&kvm->arch, KVM_ARM_VCPU_PTRAUTH_ADDRESS) &&
- __vcpu_has_feature(&kvm->arch, KVM_ARM_VCPU_PTRAUTH_GENERIC)))
+ /* Implementation choice: NV2 is the only supported config */
+ if (!kvm_has_feat(kvm, ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY))
+ res0 |= (HCR_NV2 | HCR_NV | HCR_AT);
+ if (!kvm_has_feat(kvm, ID_AA64MMFR4_EL1, E2H0, NI))
+ res0 |= HCR_NV1;
+ if (!(kvm_vcpu_has_feature(kvm, KVM_ARM_VCPU_PTRAUTH_ADDRESS) &&
+ kvm_vcpu_has_feature(kvm, KVM_ARM_VCPU_PTRAUTH_GENERIC)))
res0 |= (HCR_API | HCR_APK);
if (!kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TME, IMP))
res0 |= BIT(39);
@@ -261,6 +1051,8 @@ int kvm_init_nv_sysregs(struct kvm *kvm)
res0 |= (HCR_TEA | HCR_TERR);
if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, LO, IMP))
res0 |= HCR_TLOR;
+ if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, VH, IMP))
+ res0 |= HCR_E2H;
if (!kvm_has_feat(kvm, ID_AA64MMFR4_EL1, E2H0, IMP))
res1 |= HCR_E2H;
set_sysreg_masks(kvm, HCR_EL2, res0, res1);
@@ -286,7 +1078,7 @@ int kvm_init_nv_sysregs(struct kvm *kvm)
res0 |= HCRX_EL2_PTTWI;
if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, SCTLRX, IMP))
res0 |= HCRX_EL2_SCTLR2En;
- if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, TCRX, IMP))
+ if (!kvm_has_tcr2(kvm))
res0 |= HCRX_EL2_TCR2En;
if (!kvm_has_feat(kvm, ID_AA64ISAR2_EL1, MOPS, IMP))
res0 |= (HCRX_EL2_MSCEn | HCRX_EL2_MCE2);
@@ -309,8 +1101,8 @@ int kvm_init_nv_sysregs(struct kvm *kvm)
/* HFG[RW]TR_EL2 */
res0 = res1 = 0;
- if (!(__vcpu_has_feature(&kvm->arch, KVM_ARM_VCPU_PTRAUTH_ADDRESS) &&
- __vcpu_has_feature(&kvm->arch, KVM_ARM_VCPU_PTRAUTH_GENERIC)))
+ if (!(kvm_vcpu_has_feature(kvm, KVM_ARM_VCPU_PTRAUTH_ADDRESS) &&
+ kvm_vcpu_has_feature(kvm, KVM_ARM_VCPU_PTRAUTH_GENERIC)))
res0 |= (HFGxTR_EL2_APDAKey | HFGxTR_EL2_APDBKey |
HFGxTR_EL2_APGAKey | HFGxTR_EL2_APIAKey |
HFGxTR_EL2_APIBKey);
@@ -337,9 +1129,9 @@ int kvm_init_nv_sysregs(struct kvm *kvm)
res0 |= (HFGxTR_EL2_nSMPRI_EL1 | HFGxTR_EL2_nTPIDR2_EL0);
if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, THE, IMP))
res0 |= HFGxTR_EL2_nRCWMASK_EL1;
- if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, S1PIE, IMP))
+ if (!kvm_has_s1pie(kvm))
res0 |= (HFGxTR_EL2_nPIRE0_EL1 | HFGxTR_EL2_nPIR_EL1);
- if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, S1POE, IMP))
+ if (!kvm_has_s1poe(kvm))
res0 |= (HFGxTR_EL2_nPOR_EL0 | HFGxTR_EL2_nPOR_EL1);
if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, S2POE, IMP))
res0 |= HFGxTR_EL2_nS2POR_EL1;
@@ -435,8 +1227,118 @@ int kvm_init_nv_sysregs(struct kvm *kvm)
if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, AMU, V1P1))
res0 |= ~(res0 | res1);
set_sysreg_masks(kvm, HAFGRTR_EL2, res0, res1);
+
+ /* TCR2_EL2 */
+ res0 = TCR2_EL2_RES0;
+ res1 = TCR2_EL2_RES1;
+ if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, D128, IMP))
+ res0 |= (TCR2_EL2_DisCH0 | TCR2_EL2_DisCH1 | TCR2_EL2_D128);
+ if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, MEC, IMP))
+ res0 |= TCR2_EL2_AMEC1 | TCR2_EL2_AMEC0;
+ if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, HAFDBS, HAFT))
+ res0 |= TCR2_EL2_HAFT;
+ if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, THE, IMP))
+ res0 |= TCR2_EL2_PTTWI | TCR2_EL2_PnCH;
+ if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, AIE, IMP))
+ res0 |= TCR2_EL2_AIE;
+ if (!kvm_has_s1poe(kvm))
+ res0 |= TCR2_EL2_POE | TCR2_EL2_E0POE;
+ if (!kvm_has_s1pie(kvm))
+ res0 |= TCR2_EL2_PIE;
+ if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, VH, IMP))
+ res0 |= (TCR2_EL2_E0POE | TCR2_EL2_D128 |
+ TCR2_EL2_AMEC1 | TCR2_EL2_DisCH0 | TCR2_EL2_DisCH1);
+ set_sysreg_masks(kvm, TCR2_EL2, res0, res1);
+
+ /* SCTLR_EL1 */
+ res0 = SCTLR_EL1_RES0;
+ res1 = SCTLR_EL1_RES1;
+ if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, PAN, PAN3))
+ res0 |= SCTLR_EL1_EPAN;
+ set_sysreg_masks(kvm, SCTLR_EL1, res0, res1);
+
+ /* MDCR_EL2 */
+ res0 = MDCR_EL2_RES0;
+ res1 = MDCR_EL2_RES1;
+ if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMUVer, IMP))
+ res0 |= (MDCR_EL2_HPMN | MDCR_EL2_TPMCR |
+ MDCR_EL2_TPM | MDCR_EL2_HPME);
+ if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMSVer, IMP))
+ res0 |= MDCR_EL2_E2PB | MDCR_EL2_TPMS;
+ if (!kvm_has_feat(kvm, ID_AA64DFR1_EL1, SPMU, IMP))
+ res0 |= MDCR_EL2_EnSPM;
+ if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMUVer, V3P1))
+ res0 |= MDCR_EL2_HPMD;
+ if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceFilt, IMP))
+ res0 |= MDCR_EL2_TTRF;
+ if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMUVer, V3P5))
+ res0 |= MDCR_EL2_HCCD | MDCR_EL2_HLP;
+ if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceBuffer, IMP))
+ res0 |= MDCR_EL2_E2TB;
+ if (!kvm_has_feat(kvm, ID_AA64MMFR0_EL1, FGT, IMP))
+ res0 |= MDCR_EL2_TDCC;
+ if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, MTPMU, IMP) ||
+ kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL3, IMP))
+ res0 |= MDCR_EL2_MTPME;
+ if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMUVer, V3P7))
+ res0 |= MDCR_EL2_HPMFZO;
+ if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMSS, IMP))
+ res0 |= MDCR_EL2_PMSSE;
+ if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMSVer, V1P2))
+ res0 |= MDCR_EL2_HPMFZS;
+ if (!kvm_has_feat(kvm, ID_AA64DFR1_EL1, EBEP, IMP))
+ res0 |= MDCR_EL2_PMEE;
+ if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, DebugVer, V8P9))
+ res0 |= MDCR_EL2_EBWE;
+ if (!kvm_has_feat(kvm, ID_AA64DFR2_EL1, STEP, IMP))
+ res0 |= MDCR_EL2_EnSTEPOP;
+ set_sysreg_masks(kvm, MDCR_EL2, res0, res1);
+
+ /* CNTHCTL_EL2 */
+ res0 = GENMASK(63, 20);
+ res1 = 0;
+ if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, RME, IMP))
+ res0 |= CNTHCTL_CNTPMASK | CNTHCTL_CNTVMASK;
+ if (!kvm_has_feat(kvm, ID_AA64MMFR0_EL1, ECV, CNTPOFF)) {
+ res0 |= CNTHCTL_ECV;
+ if (!kvm_has_feat(kvm, ID_AA64MMFR0_EL1, ECV, IMP))
+ res0 |= (CNTHCTL_EL1TVT | CNTHCTL_EL1TVCT |
+ CNTHCTL_EL1NVPCT | CNTHCTL_EL1NVVCT);
+ }
+ if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, VH, IMP))
+ res0 |= GENMASK(11, 8);
+ set_sysreg_masks(kvm, CNTHCTL_EL2, res0, res1);
+
+ /* ICH_HCR_EL2 */
+ res0 = ICH_HCR_EL2_RES0;
+ res1 = ICH_HCR_EL2_RES1;
+ if (!(kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_EL2_TDS))
+ res0 |= ICH_HCR_EL2_TDIR;
+ /* No GICv4 is presented to the guest */
+ res0 |= ICH_HCR_EL2_DVIM | ICH_HCR_EL2_vSGIEOICount;
+ set_sysreg_masks(kvm, ICH_HCR_EL2, res0, res1);
+
out:
- mutex_unlock(&kvm->arch.config_lock);
+ for (enum vcpu_sysreg sr = __SANITISED_REG_START__; sr < NR_SYS_REGS; sr++)
+ (void)__vcpu_sys_reg(vcpu, sr);
- return ret;
+ return 0;
+}
+
+void check_nested_vcpu_requests(struct kvm_vcpu *vcpu)
+{
+ if (kvm_check_request(KVM_REQ_NESTED_S2_UNMAP, vcpu)) {
+ struct kvm_s2_mmu *mmu = vcpu->arch.hw_mmu;
+
+ write_lock(&vcpu->kvm->mmu_lock);
+ if (mmu->pending_unmap) {
+ kvm_stage2_unmap_range(mmu, 0, kvm_phys_size(mmu), true);
+ mmu->pending_unmap = false;
+ }
+ write_unlock(&vcpu->kvm->mmu_lock);
+ }
+
+ /* Must be last, as may switch context! */
+ if (kvm_check_request(KVM_REQ_GUEST_HYP_IRQ_PENDING, vcpu))
+ kvm_inject_nested_irq(vcpu);
}
diff --git a/arch/arm64/kvm/pauth.c b/arch/arm64/kvm/pauth.c
new file mode 100644
index 000000000000..d5eb3ae876be
--- /dev/null
+++ b/arch/arm64/kvm/pauth.c
@@ -0,0 +1,206 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2024 - Google LLC
+ * Author: Marc Zyngier <maz@kernel.org>
+ *
+ * Primitive PAuth emulation for ERETAA/ERETAB.
+ *
+ * This code assumes that it is run from EL2, and that it is part of
+ * the emulation of ERETAx for a guest hypervisor. That's a lot of
+ * baked-in assumptions and shortcuts.
+ *
+ * Do no reuse for anything else!
+ */
+
+#include <linux/kvm_host.h>
+
+#include <asm/gpr-num.h>
+#include <asm/kvm_emulate.h>
+#include <asm/pointer_auth.h>
+
+/* PACGA Xd, Xn, Xm */
+#define PACGA(d,n,m) \
+ asm volatile(__DEFINE_ASM_GPR_NUMS \
+ ".inst 0x9AC03000 |" \
+ "(.L__gpr_num_%[Rd] << 0) |" \
+ "(.L__gpr_num_%[Rn] << 5) |" \
+ "(.L__gpr_num_%[Rm] << 16)\n" \
+ : [Rd] "=r" ((d)) \
+ : [Rn] "r" ((n)), [Rm] "r" ((m)))
+
+static u64 compute_pac(struct kvm_vcpu *vcpu, u64 ptr,
+ struct ptrauth_key ikey)
+{
+ struct ptrauth_key gkey;
+ u64 mod, pac = 0;
+
+ preempt_disable();
+
+ if (!vcpu_get_flag(vcpu, SYSREGS_ON_CPU))
+ mod = __vcpu_sys_reg(vcpu, SP_EL2);
+ else
+ mod = read_sysreg(sp_el1);
+
+ gkey.lo = read_sysreg_s(SYS_APGAKEYLO_EL1);
+ gkey.hi = read_sysreg_s(SYS_APGAKEYHI_EL1);
+
+ __ptrauth_key_install_nosync(APGA, ikey);
+ isb();
+
+ PACGA(pac, ptr, mod);
+ isb();
+
+ __ptrauth_key_install_nosync(APGA, gkey);
+
+ preempt_enable();
+
+ /* PAC in the top 32bits */
+ return pac;
+}
+
+static bool effective_tbi(struct kvm_vcpu *vcpu, bool bit55)
+{
+ u64 tcr = vcpu_read_sys_reg(vcpu, TCR_EL2);
+ bool tbi, tbid;
+
+ /*
+ * Since we are authenticating an instruction address, we have
+ * to take TBID into account. If E2H==0, ignore VA[55], as
+ * TCR_EL2 only has a single TBI/TBID. If VA[55] was set in
+ * this case, this is likely a guest bug...
+ */
+ if (!vcpu_el2_e2h_is_set(vcpu)) {
+ tbi = tcr & BIT(20);
+ tbid = tcr & BIT(29);
+ } else if (bit55) {
+ tbi = tcr & TCR_TBI1;
+ tbid = tcr & TCR_TBID1;
+ } else {
+ tbi = tcr & TCR_TBI0;
+ tbid = tcr & TCR_TBID0;
+ }
+
+ return tbi && !tbid;
+}
+
+static int compute_bottom_pac(struct kvm_vcpu *vcpu, bool bit55)
+{
+ static const int maxtxsz = 39; // Revisit these two values once
+ static const int mintxsz = 16; // (if) we support TTST/LVA/LVA2
+ u64 tcr = vcpu_read_sys_reg(vcpu, TCR_EL2);
+ int txsz;
+
+ if (!vcpu_el2_e2h_is_set(vcpu) || !bit55)
+ txsz = FIELD_GET(TCR_T0SZ_MASK, tcr);
+ else
+ txsz = FIELD_GET(TCR_T1SZ_MASK, tcr);
+
+ return 64 - clamp(txsz, mintxsz, maxtxsz);
+}
+
+static u64 compute_pac_mask(struct kvm_vcpu *vcpu, bool bit55)
+{
+ int bottom_pac;
+ u64 mask;
+
+ bottom_pac = compute_bottom_pac(vcpu, bit55);
+
+ mask = GENMASK(54, bottom_pac);
+ if (!effective_tbi(vcpu, bit55))
+ mask |= GENMASK(63, 56);
+
+ return mask;
+}
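
Illustrative only, not part of the patch: a standalone sketch of the
resulting PAC bit positions for TxSZ = 25 (bottom_pac = 39), with and
without an effective TBI; GENMASK_ULL is redefined locally to mirror the
kernel macro.

#include <stdint.h>
#include <stdio.h>

#define GENMASK_ULL(h, l) \
	((~0ULL << (l)) & (~0ULL >> (63 - (h))))

int main(void)
{
	int bottom_pac = 64 - 25;		/* TxSZ = 25 -> 39 */
	uint64_t mask = GENMASK_ULL(54, bottom_pac);

	printf("with TBI:    PAC mask %016llx\n", (unsigned long long)mask);
	printf("without TBI: PAC mask %016llx\n",
	       (unsigned long long)(mask | GENMASK_ULL(63, 56)));
	return 0;
}
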
+
+static u64 to_canonical_addr(struct kvm_vcpu *vcpu, u64 ptr, u64 mask)
+{
+ bool bit55 = !!(ptr & BIT(55));
+
+ if (bit55)
+ return ptr | mask;
+
+ return ptr & ~mask;
+}
+
+static u64 corrupt_addr(struct kvm_vcpu *vcpu, u64 ptr)
+{
+ bool bit55 = !!(ptr & BIT(55));
+ u64 mask, error_code;
+ int shift;
+
+ if (effective_tbi(vcpu, bit55)) {
+ mask = GENMASK(54, 53);
+ shift = 53;
+ } else {
+ mask = GENMASK(62, 61);
+ shift = 61;
+ }
+
+ if (esr_iss_is_eretab(kvm_vcpu_get_esr(vcpu)))
+ error_code = 2 << shift;
+ else
+ error_code = 1 << shift;
+
+ ptr &= ~mask;
+ ptr |= error_code;
+
+ return ptr;
+}
+
+/*
+ * Authenticate an ERETAA/ERETAB instruction, returning true if the
+ * authentication succeeded and false otherwise. In all cases, *elr
+ * contains the VA to ERET to. Potential exception injection is left
+ * to the caller.
+ */
+bool kvm_auth_eretax(struct kvm_vcpu *vcpu, u64 *elr)
+{
+ u64 sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2);
+ u64 esr = kvm_vcpu_get_esr(vcpu);
+ u64 ptr, cptr, pac, mask;
+ struct ptrauth_key ikey;
+
+ *elr = ptr = vcpu_read_sys_reg(vcpu, ELR_EL2);
+
+ /* We assume we're already in the context of an ERETAx */
+ if (esr_iss_is_eretab(esr)) {
+ if (!(sctlr & SCTLR_EL1_EnIB))
+ return true;
+
+ ikey.lo = __vcpu_sys_reg(vcpu, APIBKEYLO_EL1);
+ ikey.hi = __vcpu_sys_reg(vcpu, APIBKEYHI_EL1);
+ } else {
+ if (!(sctlr & SCTLR_EL1_EnIA))
+ return true;
+
+ ikey.lo = __vcpu_sys_reg(vcpu, APIAKEYLO_EL1);
+ ikey.hi = __vcpu_sys_reg(vcpu, APIAKEYHI_EL1);
+ }
+
+ mask = compute_pac_mask(vcpu, !!(ptr & BIT(55)));
+ cptr = to_canonical_addr(vcpu, ptr, mask);
+
+ pac = compute_pac(vcpu, cptr, ikey);
+
+ /*
+ * Slightly deviate from the pseudocode: if we have a PAC
+ * match with the signed pointer, then it must be good.
+ * Anything after this point is pure error handling.
+ */
+ if ((pac & mask) == (ptr & mask)) {
+ *elr = cptr;
+ return true;
+ }
+
+ /*
+ * Authentication failed, corrupt the canonical address if
+ * PAuth2 isn't implemented, or some XORing if it is.
+ */
+ if (!kvm_has_pauth(vcpu->kvm, PAuth2))
+ cptr = corrupt_addr(vcpu, cptr);
+ else
+ cptr = ptr ^ (pac & mask);
+
+ *elr = cptr;
+ return false;
+}
diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c
index b7be96a53597..0f89157d31fd 100644
--- a/arch/arm64/kvm/pkvm.c
+++ b/arch/arm64/kvm/pkvm.c
@@ -7,6 +7,7 @@
#include <linux/init.h>
#include <linux/kmemleak.h>
#include <linux/kvm_host.h>
+#include <asm/kvm_mmu.h>
#include <linux/memblock.h>
#include <linux/mutex.h>
#include <linux/sort.h>
@@ -110,6 +111,29 @@ static void __pkvm_destroy_hyp_vm(struct kvm *host_kvm)
host_kvm->arch.pkvm.handle = 0;
free_hyp_memcache(&host_kvm->arch.pkvm.teardown_mc);
+ free_hyp_memcache(&host_kvm->arch.pkvm.stage2_teardown_mc);
+}
+
+static int __pkvm_create_hyp_vcpu(struct kvm_vcpu *vcpu)
+{
+ size_t hyp_vcpu_sz = PAGE_ALIGN(PKVM_HYP_VCPU_SIZE);
+ pkvm_handle_t handle = vcpu->kvm->arch.pkvm.handle;
+ void *hyp_vcpu;
+ int ret;
+
+ vcpu->arch.pkvm_memcache.flags |= HYP_MEMCACHE_ACCOUNT_STAGE2;
+
+ hyp_vcpu = alloc_pages_exact(hyp_vcpu_sz, GFP_KERNEL_ACCOUNT);
+ if (!hyp_vcpu)
+ return -ENOMEM;
+
+ ret = kvm_call_hyp_nvhe(__pkvm_init_vcpu, handle, vcpu, hyp_vcpu);
+ if (!ret)
+ vcpu_set_flag(vcpu, VCPU_PKVM_FINALIZED);
+ else
+ free_pages_exact(hyp_vcpu, hyp_vcpu_sz);
+
+ return ret;
}
/*
@@ -124,11 +148,8 @@ static void __pkvm_destroy_hyp_vm(struct kvm *host_kvm)
*/
static int __pkvm_create_hyp_vm(struct kvm *host_kvm)
{
- size_t pgd_sz, hyp_vm_sz, hyp_vcpu_sz;
- struct kvm_vcpu *host_vcpu;
- pkvm_handle_t handle;
+ size_t pgd_sz, hyp_vm_sz;
void *pgd, *hyp_vm;
- unsigned long idx;
int ret;
if (host_kvm->created_vcpus < 1)
@@ -160,40 +181,11 @@ static int __pkvm_create_hyp_vm(struct kvm *host_kvm)
if (ret < 0)
goto free_vm;
- handle = ret;
-
- host_kvm->arch.pkvm.handle = handle;
-
- /* Donate memory for the vcpus at hyp and initialize it. */
- hyp_vcpu_sz = PAGE_ALIGN(PKVM_HYP_VCPU_SIZE);
- kvm_for_each_vcpu(idx, host_vcpu, host_kvm) {
- void *hyp_vcpu;
-
- /* Indexing of the vcpus to be sequential starting at 0. */
- if (WARN_ON(host_vcpu->vcpu_idx != idx)) {
- ret = -EINVAL;
- goto destroy_vm;
- }
-
- hyp_vcpu = alloc_pages_exact(hyp_vcpu_sz, GFP_KERNEL_ACCOUNT);
- if (!hyp_vcpu) {
- ret = -ENOMEM;
- goto destroy_vm;
- }
-
- ret = kvm_call_hyp_nvhe(__pkvm_init_vcpu, handle, host_vcpu,
- hyp_vcpu);
- if (ret) {
- free_pages_exact(hyp_vcpu, hyp_vcpu_sz);
- goto destroy_vm;
- }
- }
+ host_kvm->arch.pkvm.handle = ret;
+ host_kvm->arch.pkvm.stage2_teardown_mc.flags |= HYP_MEMCACHE_ACCOUNT_STAGE2;
+ kvm_account_pgtable_pages(pgd, pgd_sz / PAGE_SIZE);
return 0;
-
-destroy_vm:
- __pkvm_destroy_hyp_vm(host_kvm);
- return ret;
free_vm:
free_pages_exact(hyp_vm, hyp_vm_sz);
free_pgd:
@@ -213,6 +205,18 @@ int pkvm_create_hyp_vm(struct kvm *host_kvm)
return ret;
}
+int pkvm_create_hyp_vcpu(struct kvm_vcpu *vcpu)
+{
+ int ret = 0;
+
+ mutex_lock(&vcpu->kvm->arch.config_lock);
+ if (!vcpu_get_flag(vcpu, VCPU_PKVM_FINALIZED))
+ ret = __pkvm_create_hyp_vcpu(vcpu);
+ mutex_unlock(&vcpu->kvm->arch.config_lock);
+
+ return ret;
+}
+
void pkvm_destroy_hyp_vm(struct kvm *host_kvm)
{
mutex_lock(&host_kvm->arch.config_lock);
@@ -222,7 +226,6 @@ void pkvm_destroy_hyp_vm(struct kvm *host_kvm)
int pkvm_init_host_vm(struct kvm *host_kvm)
{
- mutex_init(&host_kvm->lock);
return 0;
}
@@ -259,6 +262,7 @@ static int __init finalize_pkvm(void)
* at, which would end badly once inaccessible.
*/
kmemleak_free_part(__hyp_bss_start, __hyp_bss_end - __hyp_bss_start);
+ kmemleak_free_part(__hyp_rodata_start, __hyp_rodata_end - __hyp_rodata_start);
kmemleak_free_part_phys(hyp_mem_base, hyp_mem_size);
ret = pkvm_drop_host_privileges();
@@ -268,3 +272,203 @@ static int __init finalize_pkvm(void)
return ret;
}
device_initcall_sync(finalize_pkvm);
+
+static int cmp_mappings(struct rb_node *node, const struct rb_node *parent)
+{
+ struct pkvm_mapping *a = rb_entry(node, struct pkvm_mapping, node);
+ struct pkvm_mapping *b = rb_entry(parent, struct pkvm_mapping, node);
+
+ if (a->gfn < b->gfn)
+ return -1;
+ if (a->gfn > b->gfn)
+ return 1;
+ return 0;
+}
+
+static struct rb_node *find_first_mapping_node(struct rb_root *root, u64 gfn)
+{
+ struct rb_node *node = root->rb_node, *prev = NULL;
+ struct pkvm_mapping *mapping;
+
+ while (node) {
+ mapping = rb_entry(node, struct pkvm_mapping, node);
+ if (mapping->gfn == gfn)
+ return node;
+ prev = node;
+ node = (gfn < mapping->gfn) ? node->rb_left : node->rb_right;
+ }
+
+ return prev;
+}
+
+/*
+ * __tmp is updated to rb_next(__tmp) *before* entering the body of the loop to allow freeing
+ * of __map inline.
+ */
+#define for_each_mapping_in_range_safe(__pgt, __start, __end, __map) \
+ for (struct rb_node *__tmp = find_first_mapping_node(&(__pgt)->pkvm_mappings, \
+ ((__start) >> PAGE_SHIFT)); \
+ __tmp && ({ \
+ __map = rb_entry(__tmp, struct pkvm_mapping, node); \
+ __tmp = rb_next(__tmp); \
+ true; \
+ }); \
+ ) \
+ if (__map->gfn < ((__start) >> PAGE_SHIFT)) \
+ continue; \
+ else if (__map->gfn >= ((__end) >> PAGE_SHIFT)) \
+ break; \
+ else
+
+int pkvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
+ struct kvm_pgtable_mm_ops *mm_ops)
+{
+ pgt->pkvm_mappings = RB_ROOT;
+ pgt->mmu = mmu;
+
+ return 0;
+}
+
+void pkvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
+{
+ struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu);
+ pkvm_handle_t handle = kvm->arch.pkvm.handle;
+ struct pkvm_mapping *mapping;
+ struct rb_node *node;
+
+ if (!handle)
+ return;
+
+ node = rb_first(&pgt->pkvm_mappings);
+ while (node) {
+ mapping = rb_entry(node, struct pkvm_mapping, node);
+ kvm_call_hyp_nvhe(__pkvm_host_unshare_guest, handle, mapping->gfn);
+ node = rb_next(node);
+ rb_erase(&mapping->node, &pgt->pkvm_mappings);
+ kfree(mapping);
+ }
+}
+
+int pkvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
+ u64 phys, enum kvm_pgtable_prot prot,
+ void *mc, enum kvm_pgtable_walk_flags flags)
+{
+ struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu);
+ struct pkvm_mapping *mapping = NULL;
+ struct kvm_hyp_memcache *cache = mc;
+ u64 gfn = addr >> PAGE_SHIFT;
+ u64 pfn = phys >> PAGE_SHIFT;
+ int ret;
+
+ if (size != PAGE_SIZE)
+ return -EINVAL;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+ ret = kvm_call_hyp_nvhe(__pkvm_host_share_guest, pfn, gfn, prot);
+ if (ret) {
+ /* Is the gfn already mapped due to a racing vCPU? */
+ if (ret == -EPERM)
+ return -EAGAIN;
+ }
+
+ swap(mapping, cache->mapping);
+ mapping->gfn = gfn;
+ mapping->pfn = pfn;
+ WARN_ON(rb_find_add(&mapping->node, &pgt->pkvm_mappings, cmp_mappings));
+
+ return ret;
+}
+
+int pkvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
+{
+ struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu);
+ pkvm_handle_t handle = kvm->arch.pkvm.handle;
+ struct pkvm_mapping *mapping;
+ int ret = 0;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+ for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping) {
+ ret = kvm_call_hyp_nvhe(__pkvm_host_unshare_guest, handle, mapping->gfn);
+ if (WARN_ON(ret))
+ break;
+ rb_erase(&mapping->node, &pgt->pkvm_mappings);
+ kfree(mapping);
+ }
+
+ return ret;
+}
+
+int pkvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size)
+{
+ struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu);
+ pkvm_handle_t handle = kvm->arch.pkvm.handle;
+ struct pkvm_mapping *mapping;
+ int ret = 0;
+
+ lockdep_assert_held(&kvm->mmu_lock);
+ for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping) {
+ ret = kvm_call_hyp_nvhe(__pkvm_host_wrprotect_guest, handle, mapping->gfn);
+ if (WARN_ON(ret))
+ break;
+ }
+
+ return ret;
+}
+
+int pkvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size)
+{
+ struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu);
+ struct pkvm_mapping *mapping;
+
+ lockdep_assert_held(&kvm->mmu_lock);
+ for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping)
+ __clean_dcache_guest_page(pfn_to_kaddr(mapping->pfn), PAGE_SIZE);
+
+ return 0;
+}
+
+bool pkvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr, u64 size, bool mkold)
+{
+ struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu);
+ pkvm_handle_t handle = kvm->arch.pkvm.handle;
+ struct pkvm_mapping *mapping;
+ bool young = false;
+
+ lockdep_assert_held(&kvm->mmu_lock);
+ for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping)
+ young |= kvm_call_hyp_nvhe(__pkvm_host_test_clear_young_guest, handle, mapping->gfn,
+ mkold);
+
+ return young;
+}
+
+int pkvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, enum kvm_pgtable_prot prot,
+ enum kvm_pgtable_walk_flags flags)
+{
+ return kvm_call_hyp_nvhe(__pkvm_host_relax_perms_guest, addr >> PAGE_SHIFT, prot);
+}
+
+void pkvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr,
+ enum kvm_pgtable_walk_flags flags)
+{
+ WARN_ON(kvm_call_hyp_nvhe(__pkvm_host_mkyoung_guest, addr >> PAGE_SHIFT));
+}
+
+void pkvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, s8 level)
+{
+ WARN_ON_ONCE(1);
+}
+
+kvm_pte_t *pkvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt, u64 phys, s8 level,
+ enum kvm_pgtable_prot prot, void *mc, bool force_pte)
+{
+ WARN_ON_ONCE(1);
+ return NULL;
+}
+
+int pkvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
+ struct kvm_mmu_memory_cache *mc)
+{
+ WARN_ON_ONCE(1);
+ return -EINVAL;
+}
diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c
index a35ce10e0a9f..a1bc10d7116a 100644
--- a/arch/arm64/kvm/pmu-emul.c
+++ b/arch/arm64/kvm/pmu-emul.c
@@ -14,17 +14,21 @@
#include <asm/kvm_emulate.h>
#include <kvm/arm_pmu.h>
#include <kvm/arm_vgic.h>
-#include <asm/arm_pmuv3.h>
#define PERF_ATTR_CFG1_COUNTER_64BIT BIT(0)
-DEFINE_STATIC_KEY_FALSE(kvm_arm_pmu_available);
-
static LIST_HEAD(arm_pmus);
static DEFINE_MUTEX(arm_pmus_lock);
static void kvm_pmu_create_perf_event(struct kvm_pmc *pmc);
static void kvm_pmu_release_perf_event(struct kvm_pmc *pmc);
+static bool kvm_pmu_counter_is_enabled(struct kvm_pmc *pmc);
+
+bool kvm_supports_guest_pmuv3(void)
+{
+ guard(mutex)(&arm_pmus_lock);
+ return !list_empty(&arm_pmus);
+}
static struct kvm_vcpu *kvm_pmc_to_vcpu(const struct kvm_pmc *pmc)
{
@@ -54,7 +58,7 @@ static u32 __kvm_pmu_event_mask(unsigned int pmuver)
static u32 kvm_pmu_event_mask(struct kvm *kvm)
{
- u64 dfr0 = IDREG(kvm, SYS_ID_AA64DFR0_EL1);
+ u64 dfr0 = kvm_read_vm_id_reg(kvm, SYS_ID_AA64DFR0_EL1);
u8 pmuver = SYS_FIELD_GET(ID_AA64DFR0_EL1, PMUVer, dfr0);
return __kvm_pmu_event_mask(pmuver);
@@ -90,7 +94,11 @@ static bool kvm_pmc_is_64bit(struct kvm_pmc *pmc)
static bool kvm_pmc_has_64bit_overflow(struct kvm_pmc *pmc)
{
- u64 val = kvm_vcpu_read_pmcr(kvm_pmc_to_vcpu(pmc));
+ struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc);
+ u64 val = kvm_vcpu_read_pmcr(vcpu);
+
+ if (kvm_pmu_counter_is_hyp(vcpu, pmc->idx))
+ return __vcpu_sys_reg(vcpu, MDCR_EL2) & MDCR_EL2_HLP;
return (pmc->idx < ARMV8_PMU_CYCLE_IDX && (val & ARMV8_PMU_PMCR_LP)) ||
(pmc->idx == ARMV8_PMU_CYCLE_IDX && (val & ARMV8_PMU_PMCR_LC));
@@ -112,6 +120,11 @@ static u32 counter_index_to_evtreg(u64 idx)
return (idx == ARMV8_PMU_CYCLE_IDX) ? PMCCFILTR_EL0 : PMEVTYPER0_EL0 + idx;
}
+static u64 kvm_pmc_read_evtreg(const struct kvm_pmc *pmc)
+{
+ return __vcpu_sys_reg(kvm_pmc_to_vcpu(pmc), counter_index_to_evtreg(pmc->idx));
+}
+
static u64 kvm_pmu_get_pmc_value(struct kvm_pmc *pmc)
{
struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc);
@@ -141,9 +154,6 @@ static u64 kvm_pmu_get_pmc_value(struct kvm_pmc *pmc)
*/
u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx)
{
- if (!kvm_vcpu_has_pmu(vcpu))
- return 0;
-
return kvm_pmu_get_pmc_value(kvm_vcpu_idx_to_pmc(vcpu, select_idx));
}
@@ -182,13 +192,23 @@ static void kvm_pmu_set_pmc_value(struct kvm_pmc *pmc, u64 val, bool force)
*/
void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val)
{
- if (!kvm_vcpu_has_pmu(vcpu))
- return;
-
kvm_pmu_set_pmc_value(kvm_vcpu_idx_to_pmc(vcpu, select_idx), val, false);
}
/**
+ * kvm_pmu_set_counter_value_user - set PMU counter value from user
+ * @vcpu: The vcpu pointer
+ * @select_idx: The counter index
+ * @val: The counter value
+ */
+void kvm_pmu_set_counter_value_user(struct kvm_vcpu *vcpu, u64 select_idx, u64 val)
+{
+ kvm_pmu_release_perf_event(kvm_vcpu_idx_to_pmc(vcpu, select_idx));
+ __vcpu_sys_reg(vcpu, counter_index_to_reg(select_idx)) = val;
+ kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu);
+}
+
+/**
* kvm_pmu_release_perf_event - remove the perf event
* @pmc: The PMU counter pointer
*/
@@ -234,25 +254,11 @@ void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu)
int i;
struct kvm_pmu *pmu = &vcpu->arch.pmu;
- for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++)
+ for (i = 0; i < KVM_ARMV8_PMU_MAX_COUNTERS; i++)
pmu->pmc[i].idx = i;
}
/**
- * kvm_pmu_vcpu_reset - reset pmu state for cpu
- * @vcpu: The vcpu pointer
- *
- */
-void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu)
-{
- unsigned long mask = kvm_pmu_valid_counter_mask(vcpu);
- int i;
-
- for_each_set_bit(i, &mask, 32)
- kvm_pmu_stop_counter(kvm_vcpu_idx_to_pmc(vcpu, i));
-}
-
-/**
* kvm_pmu_vcpu_destroy - free perf event of PMU for cpu
* @vcpu: The vcpu pointer
*
@@ -261,12 +267,55 @@ void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu)
{
int i;
- for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++)
+ for (i = 0; i < KVM_ARMV8_PMU_MAX_COUNTERS; i++)
kvm_pmu_release_perf_event(kvm_vcpu_idx_to_pmc(vcpu, i));
irq_work_sync(&vcpu->arch.pmu.overflow_work);
}
-u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu)
+static u64 kvm_pmu_hyp_counter_mask(struct kvm_vcpu *vcpu)
+{
+ unsigned int hpmn, n;
+
+ if (!vcpu_has_nv(vcpu))
+ return 0;
+
+ hpmn = SYS_FIELD_GET(MDCR_EL2, HPMN, __vcpu_sys_reg(vcpu, MDCR_EL2));
+ n = vcpu->kvm->arch.pmcr_n;
+
+ /*
+ * Programming HPMN to a value greater than PMCR_EL0.N is
+ * CONSTRAINED UNPREDICTABLE. Make the implementation choice that an
+ * UNKNOWN number of counters (in our case, zero) are reserved for EL2.
+ */
+ if (hpmn >= n)
+ return 0;
+
+ /*
+ * Programming HPMN=0 is CONSTRAINED UNPREDICTABLE if FEAT_HPMN0 isn't
+ * implemented. Since KVM's ability to emulate HPMN=0 does not directly
+ * depend on hardware (all PMU registers are trapped), make the
+ * implementation choice that all counters are included in the second
+ * range reserved for EL2/EL3.
+ */
+ return GENMASK(n - 1, hpmn);
+}
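
Illustrative only, not part of the patch: a standalone sketch of the HPMN
split for PMCR_EL0.N = 8 and MDCR_EL2.HPMN = 6, where counters 6 and 7 are
reserved for EL2 and the remaining event counters plus the cycle counter
stay accessible; GENMASK_ULL is redefined locally to mirror the kernel macro.

#include <stdint.h>
#include <stdio.h>

#define GENMASK_ULL(h, l) \
	((~0ULL << (l)) & (~0ULL >> (63 - (h))))

int main(void)
{
	unsigned int n = 8, hpmn = 6;
	uint64_t hyp = (hpmn < n) ? GENMASK_ULL(n - 1, hpmn) : 0;
	uint64_t impl = GENMASK_ULL(n - 1, 0) | (1ULL << 31);	/* events + cycle counter */

	printf("EL2-reserved counters: %#llx\n", (unsigned long long)hyp);
	printf("EL1-accessible mask:   %#llx\n", (unsigned long long)(impl & ~hyp));
	return 0;
}
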
+
+bool kvm_pmu_counter_is_hyp(struct kvm_vcpu *vcpu, unsigned int idx)
+{
+ return kvm_pmu_hyp_counter_mask(vcpu) & BIT(idx);
+}
+
+u64 kvm_pmu_accessible_counter_mask(struct kvm_vcpu *vcpu)
+{
+ u64 mask = kvm_pmu_implemented_counter_mask(vcpu);
+
+ if (!vcpu_has_nv(vcpu) || vcpu_is_el2(vcpu))
+ return mask;
+
+ return mask & ~kvm_pmu_hyp_counter_mask(vcpu);
+}
+
+u64 kvm_pmu_implemented_counter_mask(struct kvm_vcpu *vcpu)
{
u64 val = FIELD_GET(ARMV8_PMU_PMCR_N, kvm_vcpu_read_pmcr(vcpu));
@@ -276,76 +325,70 @@ u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu)
return GENMASK(val - 1, 0) | BIT(ARMV8_PMU_CYCLE_IDX);
}
-/**
- * kvm_pmu_enable_counter_mask - enable selected PMU counters
- * @vcpu: The vcpu pointer
- * @val: the value guest writes to PMCNTENSET register
- *
- * Call perf_event_enable to start counting the perf event
- */
-void kvm_pmu_enable_counter_mask(struct kvm_vcpu *vcpu, u64 val)
+static void kvm_pmc_enable_perf_event(struct kvm_pmc *pmc)
{
- int i;
- if (!kvm_vcpu_has_pmu(vcpu))
- return;
-
- if (!(kvm_vcpu_read_pmcr(vcpu) & ARMV8_PMU_PMCR_E) || !val)
+ if (!pmc->perf_event) {
+ kvm_pmu_create_perf_event(pmc);
return;
+ }
- for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) {
- struct kvm_pmc *pmc;
-
- if (!(val & BIT(i)))
- continue;
-
- pmc = kvm_vcpu_idx_to_pmc(vcpu, i);
+ perf_event_enable(pmc->perf_event);
+ if (pmc->perf_event->state != PERF_EVENT_STATE_ACTIVE)
+ kvm_debug("fail to enable perf event\n");
+}
- if (!pmc->perf_event) {
- kvm_pmu_create_perf_event(pmc);
- } else {
- perf_event_enable(pmc->perf_event);
- if (pmc->perf_event->state != PERF_EVENT_STATE_ACTIVE)
- kvm_debug("fail to enable perf event\n");
- }
- }
+static void kvm_pmc_disable_perf_event(struct kvm_pmc *pmc)
+{
+ if (pmc->perf_event)
+ perf_event_disable(pmc->perf_event);
}
-/**
- * kvm_pmu_disable_counter_mask - disable selected PMU counters
- * @vcpu: The vcpu pointer
- * @val: the value guest writes to PMCNTENCLR register
- *
- * Call perf_event_disable to stop counting the perf event
- */
-void kvm_pmu_disable_counter_mask(struct kvm_vcpu *vcpu, u64 val)
+void kvm_pmu_reprogram_counter_mask(struct kvm_vcpu *vcpu, u64 val)
{
int i;
- if (!kvm_vcpu_has_pmu(vcpu) || !val)
+ if (!val)
return;
- for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) {
- struct kvm_pmc *pmc;
+ for (i = 0; i < KVM_ARMV8_PMU_MAX_COUNTERS; i++) {
+ struct kvm_pmc *pmc = kvm_vcpu_idx_to_pmc(vcpu, i);
if (!(val & BIT(i)))
continue;
- pmc = kvm_vcpu_idx_to_pmc(vcpu, i);
-
- if (pmc->perf_event)
- perf_event_disable(pmc->perf_event);
+ if (kvm_pmu_counter_is_enabled(pmc))
+ kvm_pmc_enable_perf_event(pmc);
+ else
+ kvm_pmc_disable_perf_event(pmc);
}
+
+ kvm_vcpu_pmu_restore_guest(vcpu);
}
-static u64 kvm_pmu_overflow_status(struct kvm_vcpu *vcpu)
+/*
+ * Returns the PMU overflow state, which is true if there exists an event
+ * counter where the values of the global enable control, PMOVSSET_EL0[n], and
+ * PMINTENSET_EL1[n] are all 1.
+ */
+static bool kvm_pmu_overflow_status(struct kvm_vcpu *vcpu)
{
- u64 reg = 0;
+ u64 reg = __vcpu_sys_reg(vcpu, PMOVSSET_EL0);
- if ((kvm_vcpu_read_pmcr(vcpu) & ARMV8_PMU_PMCR_E)) {
- reg = __vcpu_sys_reg(vcpu, PMOVSSET_EL0);
- reg &= __vcpu_sys_reg(vcpu, PMCNTENSET_EL0);
- reg &= __vcpu_sys_reg(vcpu, PMINTENSET_EL1);
- }
+ reg &= __vcpu_sys_reg(vcpu, PMINTENSET_EL1);
+
+ /*
+ * PMCR_EL0.E is the global enable control for event counters available
+ * to EL0 and EL1.
+ */
+ if (!(kvm_vcpu_read_pmcr(vcpu) & ARMV8_PMU_PMCR_E))
+ reg &= kvm_pmu_hyp_counter_mask(vcpu);
+
+ /*
+ * Otherwise, MDCR_EL2.HPME is the global enable control for event
+ * counters reserved for EL2.
+ */
+ if (!(vcpu_read_sys_reg(vcpu, MDCR_EL2) & MDCR_EL2_HPME))
+ reg &= ~kvm_pmu_hyp_counter_mask(vcpu);
return reg;
}
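The same condition restated per counter, as a sketch; the helper name and the flattened bool parameters are illustrative, not KVM API:

#include <linux/bits.h>
#include <linux/types.h>

static bool example_counter_asserts_irq(u64 pmovsset, u64 pmintenset,
					bool pmcr_e, bool hpme,
					bool counter_is_hyp, unsigned int n)
{
	if (!(pmovsset & BIT(n)) || !(pmintenset & BIT(n)))
		return false;

	/* The relevant global enable depends on which range owns the counter. */
	return counter_is_hyp ? hpme : pmcr_e;
}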
@@ -355,10 +398,7 @@ static void kvm_pmu_update_state(struct kvm_vcpu *vcpu)
struct kvm_pmu *pmu = &vcpu->arch.pmu;
bool overflow;
- if (!kvm_vcpu_has_pmu(vcpu))
- return;
-
- overflow = !!kvm_pmu_overflow_status(vcpu);
+ overflow = kvm_pmu_overflow_status(vcpu);
if (pmu->irq_level == overflow)
return;
@@ -553,41 +593,89 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val)
{
int i;
- if (!kvm_vcpu_has_pmu(vcpu))
- return;
-
/* Fixup PMCR_EL0 to reconcile the PMU version and the LP bit */
if (!kvm_has_feat(vcpu->kvm, ID_AA64DFR0_EL1, PMUVer, V3P5))
val &= ~ARMV8_PMU_PMCR_LP;
+ /* Request a reload of the PMU to enable/disable affected counters */
+ if ((__vcpu_sys_reg(vcpu, PMCR_EL0) ^ val) & ARMV8_PMU_PMCR_E)
+ kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu);
+
/* The reset bits don't indicate any state, and shouldn't be saved. */
__vcpu_sys_reg(vcpu, PMCR_EL0) = val & ~(ARMV8_PMU_PMCR_C | ARMV8_PMU_PMCR_P);
- if (val & ARMV8_PMU_PMCR_E) {
- kvm_pmu_enable_counter_mask(vcpu,
- __vcpu_sys_reg(vcpu, PMCNTENSET_EL0));
- } else {
- kvm_pmu_disable_counter_mask(vcpu,
- __vcpu_sys_reg(vcpu, PMCNTENSET_EL0));
- }
-
if (val & ARMV8_PMU_PMCR_C)
kvm_pmu_set_counter_value(vcpu, ARMV8_PMU_CYCLE_IDX, 0);
if (val & ARMV8_PMU_PMCR_P) {
- unsigned long mask = kvm_pmu_valid_counter_mask(vcpu);
- mask &= ~BIT(ARMV8_PMU_CYCLE_IDX);
+ /*
+ * Unlike other PMU sysregs, the controls in PMCR_EL0 always apply
+ * to the 'guest' range of counters and never the 'hyp' range.
+ */
+ unsigned long mask = kvm_pmu_implemented_counter_mask(vcpu) &
+ ~kvm_pmu_hyp_counter_mask(vcpu) &
+ ~BIT(ARMV8_PMU_CYCLE_IDX);
+
for_each_set_bit(i, &mask, 32)
kvm_pmu_set_pmc_value(kvm_vcpu_idx_to_pmc(vcpu, i), 0, true);
}
- kvm_vcpu_pmu_restore_guest(vcpu);
}
static bool kvm_pmu_counter_is_enabled(struct kvm_pmc *pmc)
{
struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc);
- return (kvm_vcpu_read_pmcr(vcpu) & ARMV8_PMU_PMCR_E) &&
- (__vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & BIT(pmc->idx));
+ unsigned int mdcr = __vcpu_sys_reg(vcpu, MDCR_EL2);
+
+ if (!(__vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & BIT(pmc->idx)))
+ return false;
+
+ if (kvm_pmu_counter_is_hyp(vcpu, pmc->idx))
+ return mdcr & MDCR_EL2_HPME;
+
+ return kvm_vcpu_read_pmcr(vcpu) & ARMV8_PMU_PMCR_E;
+}
+
+static bool kvm_pmc_counts_at_el0(struct kvm_pmc *pmc)
+{
+ u64 evtreg = kvm_pmc_read_evtreg(pmc);
+ bool nsu = evtreg & ARMV8_PMU_EXCLUDE_NS_EL0;
+ bool u = evtreg & ARMV8_PMU_EXCLUDE_EL0;
+
+ return u == nsu;
+}
+
+static bool kvm_pmc_counts_at_el1(struct kvm_pmc *pmc)
+{
+ u64 evtreg = kvm_pmc_read_evtreg(pmc);
+ bool nsk = evtreg & ARMV8_PMU_EXCLUDE_NS_EL1;
+ bool p = evtreg & ARMV8_PMU_EXCLUDE_EL1;
+
+ return p == nsk;
+}
+
+static bool kvm_pmc_counts_at_el2(struct kvm_pmc *pmc)
+{
+ struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc);
+ u64 mdcr = __vcpu_sys_reg(vcpu, MDCR_EL2);
+
+ if (!kvm_pmu_counter_is_hyp(vcpu, pmc->idx) && (mdcr & MDCR_EL2_HPMD))
+ return false;
+
+ return kvm_pmc_read_evtreg(pmc) & ARMV8_PMU_INCLUDE_EL2;
+}
+
+static int kvm_map_pmu_event(struct kvm *kvm, unsigned int eventsel)
+{
+ struct arm_pmu *pmu = kvm->arch.arm_pmu;
+
+ /*
+ * The CPU PMU likely isn't PMUv3; let the driver provide a mapping
+ * for the guest's PMUv3 event ID.
+ */
+ if (unlikely(pmu->map_pmuv3_event))
+ return pmu->map_pmuv3_event(eventsel);
+
+ return eventsel;
}
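A hedged sketch of what a driver-supplied map_pmuv3_event() hook might look like; the IMPDEF encodings below are invented for illustration, only the PMUv3 event macros are real:

#include <linux/errno.h>
#include <linux/perf/arm_pmuv3.h>

static int example_map_pmuv3_event(unsigned int pmuv3_eventsel)
{
	switch (pmuv3_eventsel) {
	case ARMV8_PMUV3_PERFCTR_CPU_CYCLES:
		return 0x02;		/* assumed IMPDEF encoding */
	case ARMV8_PMUV3_PERFCTR_INST_RETIRED:
		return 0x08;		/* assumed IMPDEF encoding */
	default:
		return -EOPNOTSUPP;	/* no mapping: no perf event is created */
	}
}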
/**
@@ -600,17 +688,16 @@ static void kvm_pmu_create_perf_event(struct kvm_pmc *pmc)
struct arm_pmu *arm_pmu = vcpu->kvm->arch.arm_pmu;
struct perf_event *event;
struct perf_event_attr attr;
- u64 eventsel, reg, data;
- bool p, u, nsk, nsu;
+ int eventsel;
+ u64 evtreg;
- reg = counter_index_to_evtreg(pmc->idx);
- data = __vcpu_sys_reg(vcpu, reg);
+ evtreg = kvm_pmc_read_evtreg(pmc);
kvm_pmu_stop_counter(pmc);
if (pmc->idx == ARMV8_PMU_CYCLE_IDX)
eventsel = ARMV8_PMUV3_PERFCTR_CPU_CYCLES;
else
- eventsel = data & kvm_pmu_event_mask(vcpu->kvm);
+ eventsel = evtreg & kvm_pmu_event_mask(vcpu->kvm);
/*
* Neither SW increment nor chained events need to be backed
@@ -628,23 +715,34 @@ static void kvm_pmu_create_perf_event(struct kvm_pmc *pmc)
!test_bit(eventsel, vcpu->kvm->arch.pmu_filter))
return;
- p = data & ARMV8_PMU_EXCLUDE_EL1;
- u = data & ARMV8_PMU_EXCLUDE_EL0;
- nsk = data & ARMV8_PMU_EXCLUDE_NS_EL1;
- nsu = data & ARMV8_PMU_EXCLUDE_NS_EL0;
+ /*
+ * Don't create an event if we're running on hardware that requires
+ * PMUv3 event translation and we couldn't find a valid mapping.
+ */
+ eventsel = kvm_map_pmu_event(vcpu->kvm, eventsel);
+ if (eventsel < 0)
+ return;
memset(&attr, 0, sizeof(struct perf_event_attr));
attr.type = arm_pmu->pmu.type;
attr.size = sizeof(attr);
attr.pinned = 1;
attr.disabled = !kvm_pmu_counter_is_enabled(pmc);
- attr.exclude_user = (u != nsu);
- attr.exclude_kernel = (p != nsk);
+ attr.exclude_user = !kvm_pmc_counts_at_el0(pmc);
attr.exclude_hv = 1; /* Don't count EL2 events */
attr.exclude_host = 1; /* Don't count host events */
attr.config = eventsel;
/*
+ * Filter events at EL1 (i.e. vEL2) when in a hyp context based on the
+ * guest's EL2 filter.
+ */
+ if (unlikely(is_hyp_ctxt(vcpu)))
+ attr.exclude_kernel = !kvm_pmc_counts_at_el2(pmc);
+ else
+ attr.exclude_kernel = !kvm_pmc_counts_at_el1(pmc);
+
+ /*
* If counting with a 64bit counter, advertise it to the perf
* code, carefully dealing with the initial sample period
* which also depends on the overflow.
@@ -682,9 +780,6 @@ void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data,
struct kvm_pmc *pmc = kvm_vcpu_idx_to_pmc(vcpu, select_idx);
u64 reg;
- if (!kvm_vcpu_has_pmu(vcpu))
- return;
-
reg = counter_index_to_evtreg(pmc->idx);
__vcpu_sys_reg(vcpu, reg) = data & kvm_pmu_evtyper_mask(vcpu->kvm);
@@ -702,29 +797,23 @@ void kvm_host_pmu_init(struct arm_pmu *pmu)
if (!pmuv3_implemented(kvm_arm_pmu_get_pmuver_limit()))
return;
- mutex_lock(&arm_pmus_lock);
+ guard(mutex)(&arm_pmus_lock);
entry = kmalloc(sizeof(*entry), GFP_KERNEL);
if (!entry)
- goto out_unlock;
+ return;
entry->arm_pmu = pmu;
list_add_tail(&entry->entry, &arm_pmus);
-
- if (list_is_singular(&arm_pmus))
- static_branch_enable(&kvm_arm_pmu_available);
-
-out_unlock:
- mutex_unlock(&arm_pmus_lock);
}
static struct arm_pmu *kvm_pmu_probe_armpmu(void)
{
- struct arm_pmu *tmp, *pmu = NULL;
struct arm_pmu_entry *entry;
+ struct arm_pmu *pmu;
int cpu;
- mutex_lock(&arm_pmus_lock);
+ guard(mutex)(&arm_pmus_lock);
/*
* It is safe to use a stale cpu to iterate the list of PMUs so long as
@@ -745,42 +834,62 @@ static struct arm_pmu *kvm_pmu_probe_armpmu(void)
*/
cpu = raw_smp_processor_id();
list_for_each_entry(entry, &arm_pmus, entry) {
- tmp = entry->arm_pmu;
+ pmu = entry->arm_pmu;
- if (cpumask_test_cpu(cpu, &tmp->supported_cpus)) {
- pmu = tmp;
- break;
- }
+ if (cpumask_test_cpu(cpu, &pmu->supported_cpus))
+ return pmu;
}
- mutex_unlock(&arm_pmus_lock);
+ return NULL;
+}
- return pmu;
+static u64 __compute_pmceid(struct arm_pmu *pmu, bool pmceid1)
+{
+ u32 hi[2], lo[2];
+
+ bitmap_to_arr32(lo, pmu->pmceid_bitmap, ARMV8_PMUV3_MAX_COMMON_EVENTS);
+ bitmap_to_arr32(hi, pmu->pmceid_ext_bitmap, ARMV8_PMUV3_MAX_COMMON_EVENTS);
+
+ return ((u64)hi[pmceid1] << 32) | lo[pmceid1];
+}
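For reference, a sketch of the architected PMCEID layout these words are assembled into (common events 0x0000-0x003F in the low halves, the 0x4000-0x403F extension range in the high halves); the helper name is illustrative:

#include <linux/bits.h>
#include <linux/types.h>

static bool example_pmceid_has_event(u64 pmceid0, u64 pmceid1, unsigned int evt)
{
	if (evt < 0x20)
		return pmceid0 & BIT_ULL(evt);
	if (evt < 0x40)
		return pmceid1 & BIT_ULL(evt - 0x20);
	if (evt >= 0x4000 && evt < 0x4020)
		return pmceid0 & BIT_ULL(32 + evt - 0x4000);
	if (evt >= 0x4020 && evt < 0x4040)
		return pmceid1 & BIT_ULL(32 + evt - 0x4020);
	return false;
}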
+
+static u64 compute_pmceid0(struct arm_pmu *pmu)
+{
+ u64 val = __compute_pmceid(pmu, 0);
+
+ /* always support SW_INCR */
+ val |= BIT(ARMV8_PMUV3_PERFCTR_SW_INCR);
+ /* always support CHAIN */
+ val |= BIT(ARMV8_PMUV3_PERFCTR_CHAIN);
+ return val;
+}
+
+static u64 compute_pmceid1(struct arm_pmu *pmu)
+{
+ u64 val = __compute_pmceid(pmu, 1);
+
+ /*
+ * Don't advertise STALL_SLOT*, as PMMIR_EL0 is handled
+ * as RAZ
+ */
+ val &= ~(BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT - 32) |
+ BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT_FRONTEND - 32) |
+ BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT_BACKEND - 32));
+ return val;
}
u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1)
{
+ struct arm_pmu *cpu_pmu = vcpu->kvm->arch.arm_pmu;
unsigned long *bmap = vcpu->kvm->arch.pmu_filter;
u64 val, mask = 0;
int base, i, nr_events;
- if (!kvm_vcpu_has_pmu(vcpu))
- return 0;
-
if (!pmceid1) {
- val = read_sysreg(pmceid0_el0);
- /* always support CHAIN */
- val |= BIT(ARMV8_PMUV3_PERFCTR_CHAIN);
+ val = compute_pmceid0(cpu_pmu);
base = 0;
} else {
- val = read_sysreg(pmceid1_el0);
- /*
- * Don't advertise STALL_SLOT*, as PMMIR_EL0 is handled
- * as RAZ
- */
- val &= ~(BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT - 32) |
- BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT_FRONTEND - 32) |
- BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT_BACKEND - 32));
+ val = compute_pmceid1(cpu_pmu);
base = 32;
}
@@ -805,20 +914,17 @@ u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1)
void kvm_vcpu_reload_pmu(struct kvm_vcpu *vcpu)
{
- u64 mask = kvm_pmu_valid_counter_mask(vcpu);
-
- kvm_pmu_handle_pmcr(vcpu, kvm_vcpu_read_pmcr(vcpu));
+ u64 mask = kvm_pmu_implemented_counter_mask(vcpu);
__vcpu_sys_reg(vcpu, PMOVSSET_EL0) &= mask;
__vcpu_sys_reg(vcpu, PMINTENSET_EL1) &= mask;
__vcpu_sys_reg(vcpu, PMCNTENSET_EL0) &= mask;
+
+ kvm_pmu_reprogram_counter_mask(vcpu, mask);
}
int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu)
{
- if (!kvm_vcpu_has_pmu(vcpu))
- return 0;
-
if (!vcpu->arch.pmu.created)
return -EINVAL;
@@ -841,9 +947,6 @@ int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu)
return -EINVAL;
}
- /* One-off reload of the PMU on first run */
- kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu);
-
return 0;
}
@@ -911,10 +1014,17 @@ u8 kvm_arm_pmu_get_max_counters(struct kvm *kvm)
struct arm_pmu *arm_pmu = kvm->arch.arm_pmu;
/*
- * The arm_pmu->num_events considers the cycle counter as well.
- * Ignore that and return only the general-purpose counters.
+ * PMUv3 requires that all event counters are capable of counting any
+ * event, though the same may not be true of non-PMUv3 hardware.
+ */
+ if (cpus_have_final_cap(ARM64_WORKAROUND_PMUV3_IMPDEF_TRAPS))
+ return 1;
+
+ /*
+ * The arm_pmu->cntr_mask considers the fixed counter(s) as well.
+ * Ignore those and return only the general-purpose counters.
*/
- return arm_pmu->num_events - 1;
+ return bitmap_weight(arm_pmu->cntr_mask, ARMV8_PMU_MAX_GENERAL_COUNTERS);
}
static void kvm_arm_set_pmu(struct kvm *kvm, struct arm_pmu *arm_pmu)
@@ -1121,13 +1231,26 @@ int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
u8 kvm_arm_pmu_get_pmuver_limit(void)
{
- u64 tmp;
+ unsigned int pmuver;
- tmp = read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1);
- tmp = cpuid_feature_cap_perfmon_field(tmp,
- ID_AA64DFR0_EL1_PMUVer_SHIFT,
- ID_AA64DFR0_EL1_PMUVer_V3P5);
- return FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), tmp);
+ pmuver = SYS_FIELD_GET(ID_AA64DFR0_EL1, PMUVer,
+ read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1));
+
+ /*
+ * Spoof a barebones PMUv3 implementation if the system supports IMPDEF
+ * traps of the PMUv3 sysregs
+ */
+ if (cpus_have_final_cap(ARM64_WORKAROUND_PMUV3_IMPDEF_TRAPS))
+ return ID_AA64DFR0_EL1_PMUVer_IMP;
+
+ /*
+ * Otherwise, treat IMPLEMENTATION DEFINED functionality as
+ * unimplemented
+ */
+ if (pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF)
+ return 0;
+
+ return min(pmuver, ID_AA64DFR0_EL1_PMUVer_V3P5);
}
/**
@@ -1140,3 +1263,29 @@ u64 kvm_vcpu_read_pmcr(struct kvm_vcpu *vcpu)
return u64_replace_bits(pmcr, vcpu->kvm->arch.pmcr_n, ARMV8_PMU_PMCR_N);
}
+
+void kvm_pmu_nested_transition(struct kvm_vcpu *vcpu)
+{
+ bool reprogrammed = false;
+ unsigned long mask;
+ int i;
+
+ mask = __vcpu_sys_reg(vcpu, PMCNTENSET_EL0);
+ for_each_set_bit(i, &mask, 32) {
+ struct kvm_pmc *pmc = kvm_vcpu_idx_to_pmc(vcpu, i);
+
+ /*
+ * We only need to reconfigure events where the filter is
+ * different at EL1 vs. EL2, as we're multiplexing the true EL1
+ * event filter bit for nested.
+ */
+ if (kvm_pmc_counts_at_el1(pmc) == kvm_pmc_counts_at_el2(pmc))
+ continue;
+
+ kvm_pmu_create_perf_event(pmc);
+ reprogrammed = true;
+ }
+
+ if (reprogrammed)
+ kvm_vcpu_pmu_restore_guest(vcpu);
+}
diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c
index a243934c5568..6b48a3d16d0d 100644
--- a/arch/arm64/kvm/pmu.c
+++ b/arch/arm64/kvm/pmu.c
@@ -5,6 +5,8 @@
*/
#include <linux/kvm_host.h>
#include <linux/perf_event.h>
+#include <linux/perf/arm_pmu.h>
+#include <linux/perf/arm_pmuv3.h>
static DEFINE_PER_CPU(struct kvm_pmu_events, kvm_pmu_events);
@@ -35,11 +37,11 @@ struct kvm_pmu_events *kvm_get_pmu_events(void)
* Add events to track that we may want to switch at guest entry/exit
* time.
*/
-void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr)
+void kvm_set_pmu_events(u64 set, struct perf_event_attr *attr)
{
struct kvm_pmu_events *pmu = kvm_get_pmu_events();
- if (!kvm_arm_support_pmu_v3() || !kvm_pmu_switch_needed(attr))
+ if (!system_supports_pmuv3() || !kvm_pmu_switch_needed(attr))
return;
if (!attr->exclude_host)
@@ -51,90 +53,43 @@ void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr)
/*
* Stop tracking events
*/
-void kvm_clr_pmu_events(u32 clr)
+void kvm_clr_pmu_events(u64 clr)
{
struct kvm_pmu_events *pmu = kvm_get_pmu_events();
- if (!kvm_arm_support_pmu_v3())
+ if (!system_supports_pmuv3())
return;
pmu->events_host &= ~clr;
pmu->events_guest &= ~clr;
}
-#define PMEVTYPER_READ_CASE(idx) \
- case idx: \
- return read_sysreg(pmevtyper##idx##_el0)
-
-#define PMEVTYPER_WRITE_CASE(idx) \
- case idx: \
- write_sysreg(val, pmevtyper##idx##_el0); \
- break
-
-#define PMEVTYPER_CASES(readwrite) \
- PMEVTYPER_##readwrite##_CASE(0); \
- PMEVTYPER_##readwrite##_CASE(1); \
- PMEVTYPER_##readwrite##_CASE(2); \
- PMEVTYPER_##readwrite##_CASE(3); \
- PMEVTYPER_##readwrite##_CASE(4); \
- PMEVTYPER_##readwrite##_CASE(5); \
- PMEVTYPER_##readwrite##_CASE(6); \
- PMEVTYPER_##readwrite##_CASE(7); \
- PMEVTYPER_##readwrite##_CASE(8); \
- PMEVTYPER_##readwrite##_CASE(9); \
- PMEVTYPER_##readwrite##_CASE(10); \
- PMEVTYPER_##readwrite##_CASE(11); \
- PMEVTYPER_##readwrite##_CASE(12); \
- PMEVTYPER_##readwrite##_CASE(13); \
- PMEVTYPER_##readwrite##_CASE(14); \
- PMEVTYPER_##readwrite##_CASE(15); \
- PMEVTYPER_##readwrite##_CASE(16); \
- PMEVTYPER_##readwrite##_CASE(17); \
- PMEVTYPER_##readwrite##_CASE(18); \
- PMEVTYPER_##readwrite##_CASE(19); \
- PMEVTYPER_##readwrite##_CASE(20); \
- PMEVTYPER_##readwrite##_CASE(21); \
- PMEVTYPER_##readwrite##_CASE(22); \
- PMEVTYPER_##readwrite##_CASE(23); \
- PMEVTYPER_##readwrite##_CASE(24); \
- PMEVTYPER_##readwrite##_CASE(25); \
- PMEVTYPER_##readwrite##_CASE(26); \
- PMEVTYPER_##readwrite##_CASE(27); \
- PMEVTYPER_##readwrite##_CASE(28); \
- PMEVTYPER_##readwrite##_CASE(29); \
- PMEVTYPER_##readwrite##_CASE(30)
-
/*
* Read a value direct from PMEVTYPER<idx> where idx is 0-30
- * or PMCCFILTR_EL0 where idx is ARMV8_PMU_CYCLE_IDX (31).
+ * or PMxCFILTR_EL0 where idx is 31-32.
*/
static u64 kvm_vcpu_pmu_read_evtype_direct(int idx)
{
- switch (idx) {
- PMEVTYPER_CASES(READ);
- case ARMV8_PMU_CYCLE_IDX:
- return read_sysreg(pmccfiltr_el0);
- default:
- WARN_ON(1);
- }
+ if (idx == ARMV8_PMU_CYCLE_IDX)
+ return read_pmccfiltr();
+ else if (idx == ARMV8_PMU_INSTR_IDX)
+ return read_pmicfiltr();
- return 0;
+ return read_pmevtypern(idx);
}
/*
* Write a value direct to PMEVTYPER<idx> where idx is 0-30
- * or PMCCFILTR_EL0 where idx is ARMV8_PMU_CYCLE_IDX (31).
+ * or PMxCFILTR_EL0 where idx is 31-32.
*/
static void kvm_vcpu_pmu_write_evtype_direct(int idx, u32 val)
{
- switch (idx) {
- PMEVTYPER_CASES(WRITE);
- case ARMV8_PMU_CYCLE_IDX:
- write_sysreg(val, pmccfiltr_el0);
- break;
- default:
- WARN_ON(1);
- }
+ if (idx == ARMV8_PMU_CYCLE_IDX)
+ write_pmccfiltr(val);
+ else if (idx == ARMV8_PMU_INSTR_IDX)
+ write_pmicfiltr(val);
+ else
+ write_pmevtypern(idx, val);
}
/*
@@ -145,7 +100,7 @@ static void kvm_vcpu_pmu_enable_el0(unsigned long events)
u64 typer;
u32 counter;
- for_each_set_bit(counter, &events, 32) {
+ for_each_set_bit(counter, &events, ARMPMU_MAX_HWEVENTS) {
typer = kvm_vcpu_pmu_read_evtype_direct(counter);
typer &= ~ARMV8_PMU_EXCLUDE_EL0;
kvm_vcpu_pmu_write_evtype_direct(counter, typer);
@@ -160,7 +115,7 @@ static void kvm_vcpu_pmu_disable_el0(unsigned long events)
u64 typer;
u32 counter;
- for_each_set_bit(counter, &events, 32) {
+ for_each_set_bit(counter, &events, ARMPMU_MAX_HWEVENTS) {
typer = kvm_vcpu_pmu_read_evtype_direct(counter);
typer |= ARMV8_PMU_EXCLUDE_EL0;
kvm_vcpu_pmu_write_evtype_direct(counter, typer);
@@ -176,9 +131,9 @@ static void kvm_vcpu_pmu_disable_el0(unsigned long events)
void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu)
{
struct kvm_pmu_events *pmu;
- u32 events_guest, events_host;
+ u64 events_guest, events_host;
- if (!kvm_arm_support_pmu_v3() || !has_vhe())
+ if (!system_supports_pmuv3() || !has_vhe())
return;
preempt_disable();
@@ -197,9 +152,9 @@ void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu)
void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu)
{
struct kvm_pmu_events *pmu;
- u32 events_guest, events_host;
+ u64 events_guest, events_host;
- if (!kvm_arm_support_pmu_v3() || !has_vhe())
+ if (!system_supports_pmuv3() || !has_vhe())
return;
pmu = kvm_get_pmu_events();
@@ -225,14 +180,14 @@ bool kvm_set_pmuserenr(u64 val)
struct kvm_cpu_context *hctxt;
struct kvm_vcpu *vcpu;
- if (!kvm_arm_support_pmu_v3() || !has_vhe())
+ if (!system_supports_pmuv3() || !has_vhe())
return false;
vcpu = kvm_get_running_vcpu();
if (!vcpu || !vcpu_get_flag(vcpu, PMUSERENR_ON_CPU))
return false;
- hctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
+ hctxt = host_data_ptr(host_ctxt);
ctxt_sys_reg(hctxt, PMUSERENR_EL0) = val;
return true;
}
diff --git a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c
index 1f69b667332b..3b5dbe9a0a0e 100644
--- a/arch/arm64/kvm/psci.c
+++ b/arch/arm64/kvm/psci.c
@@ -194,6 +194,12 @@ static void kvm_psci_system_off(struct kvm_vcpu *vcpu)
kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_SHUTDOWN, 0);
}
+static void kvm_psci_system_off2(struct kvm_vcpu *vcpu)
+{
+ kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_SHUTDOWN,
+ KVM_SYSTEM_EVENT_SHUTDOWN_FLAG_PSCI_OFF2);
+}
+
static void kvm_psci_system_reset(struct kvm_vcpu *vcpu)
{
kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_RESET, 0);
@@ -322,7 +328,7 @@ static int kvm_psci_1_x_call(struct kvm_vcpu *vcpu, u32 minor)
switch(psci_fn) {
case PSCI_0_2_FN_PSCI_VERSION:
- val = minor == 0 ? KVM_ARM_PSCI_1_0 : KVM_ARM_PSCI_1_1;
+ val = PSCI_VERSION(1, minor);
break;
case PSCI_1_0_FN_PSCI_FEATURES:
arg = smccc_get_arg1(vcpu);
@@ -358,6 +364,11 @@ static int kvm_psci_1_x_call(struct kvm_vcpu *vcpu, u32 minor)
if (minor >= 1)
val = 0;
break;
+ case PSCI_1_3_FN_SYSTEM_OFF2:
+ case PSCI_1_3_FN64_SYSTEM_OFF2:
+ if (minor >= 3)
+ val = PSCI_1_3_OFF_TYPE_HIBERNATE_OFF;
+ break;
}
break;
case PSCI_1_0_FN_SYSTEM_SUSPEND:
@@ -392,6 +403,33 @@ static int kvm_psci_1_x_call(struct kvm_vcpu *vcpu, u32 minor)
break;
}
break;
+ case PSCI_1_3_FN_SYSTEM_OFF2:
+ kvm_psci_narrow_to_32bit(vcpu);
+ fallthrough;
+ case PSCI_1_3_FN64_SYSTEM_OFF2:
+ if (minor < 3)
+ break;
+
+ arg = smccc_get_arg1(vcpu);
+ /*
+ * SYSTEM_OFF2 defaults to HIBERNATE_OFF if arg1 is zero. arg2
+ * must be zero.
+ */
+ if ((arg && arg != PSCI_1_3_OFF_TYPE_HIBERNATE_OFF) ||
+ smccc_get_arg2(vcpu) != 0) {
+ val = PSCI_RET_INVALID_PARAMS;
+ break;
+ }
+ kvm_psci_system_off2(vcpu);
+ /*
+ * We shouldn't be going back to the guest after receiving a
+ * SYSTEM_OFF2 request. Preload a return value of
+ * INTERNAL_FAILURE should userspace ignore the exit and resume
+ * the vCPU.
+ */
+ val = PSCI_RET_INTERNAL_FAILURE;
+ ret = 0;
+ break;
default:
return kvm_psci_0_2_call(vcpu);
}
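A minimal guest-side sketch of the call being validated above, assuming the generic SMCCC 1.1 helper as the conduit; only the PSCI macros come from this series:

#include <linux/arm-smccc.h>
#include <uapi/linux/psci.h>

static void example_guest_system_off2(void)
{
	struct arm_smccc_res res;

	/* arg1 == 0 would also work: HIBERNATE_OFF is the default off type. */
	arm_smccc_1_1_invoke(PSCI_1_3_FN64_SYSTEM_OFF2,
			     PSCI_1_3_OFF_TYPE_HIBERNATE_OFF, 0, &res);

	/* Only reached on failure, e.g. PSCI_RET_INVALID_PARAMS in res.a0. */
}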
@@ -449,6 +487,10 @@ int kvm_psci_call(struct kvm_vcpu *vcpu)
}
switch (version) {
+ case KVM_ARM_PSCI_1_3:
+ return kvm_psci_1_x_call(vcpu, 3);
+ case KVM_ARM_PSCI_1_2:
+ return kvm_psci_1_x_call(vcpu, 2);
case KVM_ARM_PSCI_1_1:
return kvm_psci_1_x_call(vcpu, 1);
case KVM_ARM_PSCI_1_0:
diff --git a/arch/arm64/kvm/ptdump.c b/arch/arm64/kvm/ptdump.c
new file mode 100644
index 000000000000..098416d7e5c2
--- /dev/null
+++ b/arch/arm64/kvm/ptdump.c
@@ -0,0 +1,268 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Debug helper used to dump the stage-2 pagetables of the system and their
+ * associated permissions.
+ *
+ * Copyright (C) Google, 2024
+ * Author: Sebastian Ene <sebastianene@google.com>
+ */
+#include <linux/debugfs.h>
+#include <linux/kvm_host.h>
+#include <linux/seq_file.h>
+
+#include <asm/kvm_mmu.h>
+#include <asm/kvm_pgtable.h>
+#include <asm/ptdump.h>
+
+#define MARKERS_LEN 2
+#define KVM_PGTABLE_MAX_LEVELS (KVM_PGTABLE_LAST_LEVEL + 1)
+
+struct kvm_ptdump_guest_state {
+ struct kvm *kvm;
+ struct ptdump_pg_state parser_state;
+ struct addr_marker ipa_marker[MARKERS_LEN];
+ struct ptdump_pg_level level[KVM_PGTABLE_MAX_LEVELS];
+ struct ptdump_range range[MARKERS_LEN];
+};
+
+static const struct ptdump_prot_bits stage2_pte_bits[] = {
+ {
+ .mask = PTE_VALID,
+ .val = PTE_VALID,
+ .set = " ",
+ .clear = "F",
+ }, {
+ .mask = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | PTE_VALID,
+ .val = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | PTE_VALID,
+ .set = "R",
+ .clear = " ",
+ }, {
+ .mask = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | PTE_VALID,
+ .val = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | PTE_VALID,
+ .set = "W",
+ .clear = " ",
+ }, {
+ .mask = KVM_PTE_LEAF_ATTR_HI_S2_XN | PTE_VALID,
+ .val = PTE_VALID,
+ .set = " ",
+ .clear = "X",
+ }, {
+ .mask = KVM_PTE_LEAF_ATTR_LO_S2_AF | PTE_VALID,
+ .val = KVM_PTE_LEAF_ATTR_LO_S2_AF | PTE_VALID,
+ .set = "AF",
+ .clear = " ",
+ }, {
+ .mask = PMD_TYPE_MASK,
+ .val = PMD_TYPE_SECT,
+ .set = "BLK",
+ .clear = " ",
+ },
+};
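A sketch of how the generic ptdump core is expected to consume each entry (a masked compare selects the .set string, anything else the .clear string); the helper name is illustrative:

#include <asm/ptdump.h>

static const char *example_decode_bit(const struct ptdump_prot_bits *bit,
				      u64 pte)
{
	return ((pte & bit->mask) == bit->val) ? bit->set : bit->clear;
}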
+
+static int kvm_ptdump_visitor(const struct kvm_pgtable_visit_ctx *ctx,
+ enum kvm_pgtable_walk_flags visit)
+{
+ struct ptdump_pg_state *st = ctx->arg;
+ struct ptdump_state *pt_st = &st->ptdump;
+
+ note_page(pt_st, ctx->addr, ctx->level, ctx->old);
+
+ return 0;
+}
+
+static int kvm_ptdump_build_levels(struct ptdump_pg_level *level, u32 start_lvl)
+{
+ u32 i;
+ u64 mask;
+
+ if (WARN_ON_ONCE(start_lvl >= KVM_PGTABLE_LAST_LEVEL))
+ return -EINVAL;
+
+ mask = 0;
+ for (i = 0; i < ARRAY_SIZE(stage2_pte_bits); i++)
+ mask |= stage2_pte_bits[i].mask;
+
+ for (i = start_lvl; i < KVM_PGTABLE_MAX_LEVELS; i++) {
+ snprintf(level[i].name, sizeof(level[i].name), "%u", i);
+
+ level[i].num = ARRAY_SIZE(stage2_pte_bits);
+ level[i].bits = stage2_pte_bits;
+ level[i].mask = mask;
+ }
+
+ return 0;
+}
+
+static struct kvm_ptdump_guest_state *kvm_ptdump_parser_create(struct kvm *kvm)
+{
+ struct kvm_ptdump_guest_state *st;
+ struct kvm_s2_mmu *mmu = &kvm->arch.mmu;
+ struct kvm_pgtable *pgtable = mmu->pgt;
+ int ret;
+
+ st = kzalloc(sizeof(struct kvm_ptdump_guest_state), GFP_KERNEL_ACCOUNT);
+ if (!st)
+ return ERR_PTR(-ENOMEM);
+
+ ret = kvm_ptdump_build_levels(&st->level[0], pgtable->start_level);
+ if (ret) {
+ kfree(st);
+ return ERR_PTR(ret);
+ }
+
+ st->ipa_marker[0].name = "Guest IPA";
+ st->ipa_marker[1].start_address = BIT(pgtable->ia_bits);
+ st->range[0].end = BIT(pgtable->ia_bits);
+
+ st->kvm = kvm;
+ st->parser_state = (struct ptdump_pg_state) {
+ .marker = &st->ipa_marker[0],
+ .level = -1,
+ .pg_level = &st->level[0],
+ .ptdump.range = &st->range[0],
+ .start_address = 0,
+ };
+
+ return st;
+}
+
+static int kvm_ptdump_guest_show(struct seq_file *m, void *unused)
+{
+ int ret;
+ struct kvm_ptdump_guest_state *st = m->private;
+ struct kvm *kvm = st->kvm;
+ struct kvm_s2_mmu *mmu = &kvm->arch.mmu;
+ struct ptdump_pg_state *parser_state = &st->parser_state;
+ struct kvm_pgtable_walker walker = (struct kvm_pgtable_walker) {
+ .cb = kvm_ptdump_visitor,
+ .arg = parser_state,
+ .flags = KVM_PGTABLE_WALK_LEAF,
+ };
+
+ parser_state->seq = m;
+
+ write_lock(&kvm->mmu_lock);
+ ret = kvm_pgtable_walk(mmu->pgt, 0, BIT(mmu->pgt->ia_bits), &walker);
+ write_unlock(&kvm->mmu_lock);
+
+ return ret;
+}
+
+static int kvm_ptdump_guest_open(struct inode *m, struct file *file)
+{
+ struct kvm *kvm = m->i_private;
+ struct kvm_ptdump_guest_state *st;
+ int ret;
+
+ if (!kvm_get_kvm_safe(kvm))
+ return -ENOENT;
+
+ st = kvm_ptdump_parser_create(kvm);
+ if (IS_ERR(st)) {
+ ret = PTR_ERR(st);
+ goto err_with_kvm_ref;
+ }
+
+ ret = single_open(file, kvm_ptdump_guest_show, st);
+ if (!ret)
+ return 0;
+
+ kfree(st);
+err_with_kvm_ref:
+ kvm_put_kvm(kvm);
+ return ret;
+}
+
+static int kvm_ptdump_guest_close(struct inode *m, struct file *file)
+{
+ struct kvm *kvm = m->i_private;
+ void *st = ((struct seq_file *)file->private_data)->private;
+
+ kfree(st);
+ kvm_put_kvm(kvm);
+
+ return single_release(m, file);
+}
+
+static const struct file_operations kvm_ptdump_guest_fops = {
+ .open = kvm_ptdump_guest_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = kvm_ptdump_guest_close,
+};
+
+static int kvm_pgtable_range_show(struct seq_file *m, void *unused)
+{
+ struct kvm_pgtable *pgtable = m->private;
+
+ seq_printf(m, "%2u\n", pgtable->ia_bits);
+ return 0;
+}
+
+static int kvm_pgtable_levels_show(struct seq_file *m, void *unused)
+{
+ struct kvm_pgtable *pgtable = m->private;
+
+ seq_printf(m, "%1d\n", KVM_PGTABLE_MAX_LEVELS - pgtable->start_level);
+ return 0;
+}
+
+static int kvm_pgtable_debugfs_open(struct inode *m, struct file *file,
+ int (*show)(struct seq_file *, void *))
+{
+ struct kvm *kvm = m->i_private;
+ struct kvm_pgtable *pgtable;
+ int ret;
+
+ if (!kvm_get_kvm_safe(kvm))
+ return -ENOENT;
+
+ pgtable = kvm->arch.mmu.pgt;
+
+ ret = single_open(file, show, pgtable);
+ if (ret < 0)
+ kvm_put_kvm(kvm);
+ return ret;
+}
+
+static int kvm_pgtable_range_open(struct inode *m, struct file *file)
+{
+ return kvm_pgtable_debugfs_open(m, file, kvm_pgtable_range_show);
+}
+
+static int kvm_pgtable_levels_open(struct inode *m, struct file *file)
+{
+ return kvm_pgtable_debugfs_open(m, file, kvm_pgtable_levels_show);
+}
+
+static int kvm_pgtable_debugfs_close(struct inode *m, struct file *file)
+{
+ struct kvm *kvm = m->i_private;
+
+ kvm_put_kvm(kvm);
+ return single_release(m, file);
+}
+
+static const struct file_operations kvm_pgtable_range_fops = {
+ .open = kvm_pgtable_range_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = kvm_pgtable_debugfs_close,
+};
+
+static const struct file_operations kvm_pgtable_levels_fops = {
+ .open = kvm_pgtable_levels_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = kvm_pgtable_debugfs_close,
+};
+
+void kvm_s2_ptdump_create_debugfs(struct kvm *kvm)
+{
+ debugfs_create_file("stage2_page_tables", 0400, kvm->debugfs_dentry,
+ kvm, &kvm_ptdump_guest_fops);
+ debugfs_create_file("ipa_range", 0400, kvm->debugfs_dentry, kvm,
+ &kvm_pgtable_range_fops);
+ debugfs_create_file("stage2_levels", 0400, kvm->debugfs_dentry,
+ kvm, &kvm_pgtable_levels_fops);
+}
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index 68d1d05672bd..f82fcc614e13 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -32,6 +32,7 @@
/* Maximum phys_shift supported for any VM on this host */
static u32 __ro_after_init kvm_ipa_limit;
+unsigned int __ro_after_init kvm_host_sve_max_vl;
/*
* ARMv8 Reset Values
@@ -51,6 +52,8 @@ int __init kvm_arm_init_sve(void)
{
if (system_supports_sve()) {
kvm_sve_max_vl = sve_max_virtualisable_vl();
+ kvm_host_sve_max_vl = sve_max_vl();
+ kvm_nvhe_sym(kvm_host_sve_max_vl) = kvm_host_sve_max_vl;
/*
* The get_sve_reg()/set_sve_reg() ioctl interface will need
@@ -82,7 +85,7 @@ static void kvm_vcpu_enable_sve(struct kvm_vcpu *vcpu)
* KVM_REG_ARM64_SVE_VLS. Allocation is deferred until
* kvm_arm_vcpu_finalize(), which freezes the configuration.
*/
- vcpu_set_flag(vcpu, GUEST_HAS_SVE);
+ set_bit(KVM_ARCH_FLAG_GUEST_HAS_SVE, &vcpu->kvm->arch.flags);
}
/*
@@ -151,7 +154,6 @@ void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu)
{
void *sve_state = vcpu->arch.sve_state;
- kvm_vcpu_unshare_task_fp(vcpu);
kvm_unshare_hyp(vcpu, vcpu + 1);
if (sve_state)
kvm_unshare_hyp(sve_state, sve_state + vcpu_sve_state_size(vcpu));
@@ -165,11 +167,6 @@ static void kvm_vcpu_reset_sve(struct kvm_vcpu *vcpu)
memset(vcpu->arch.sve_state, 0, vcpu_sve_state_size(vcpu));
}
-static void kvm_vcpu_enable_ptrauth(struct kvm_vcpu *vcpu)
-{
- vcpu_set_flag(vcpu, GUEST_HAS_PTRAUTH);
-}
-
/**
* kvm_reset_vcpu - sets core registers and sys_regs to reset value
* @vcpu: The VCPU pointer
@@ -199,9 +196,6 @@ void kvm_reset_vcpu(struct kvm_vcpu *vcpu)
vcpu->arch.reset_state.reset = false;
spin_unlock(&vcpu->arch.mp_state_lock);
- /* Reset PMU outside of the non-preemptible section */
- kvm_pmu_vcpu_reset(vcpu);
-
preempt_disable();
loaded = (vcpu->cpu != -1);
if (loaded)
@@ -214,10 +208,6 @@ void kvm_reset_vcpu(struct kvm_vcpu *vcpu)
kvm_vcpu_reset_sve(vcpu);
}
- if (vcpu_has_feature(vcpu, KVM_ARM_VCPU_PTRAUTH_ADDRESS) ||
- vcpu_has_feature(vcpu, KVM_ARM_VCPU_PTRAUTH_GENERIC))
- kvm_vcpu_enable_ptrauth(vcpu);
-
if (vcpu_el1_is_32bit(vcpu))
pstate = VCPU_RESET_PSTATE_SVC;
else if (vcpu_has_nv(vcpu))
@@ -266,6 +256,12 @@ void kvm_reset_vcpu(struct kvm_vcpu *vcpu)
preempt_enable();
}
+u32 kvm_get_pa_bits(struct kvm *kvm)
+{
+ /* Fixed limit until we can configure ID_AA64MMFR0.PARange */
+ return kvm_ipa_limit;
+}
+
u32 get_kvm_ipa_limit(void)
{
return kvm_ipa_limit;
diff --git a/arch/arm64/kvm/stacktrace.c b/arch/arm64/kvm/stacktrace.c
index 3ace5b75813b..af5eec681127 100644
--- a/arch/arm64/kvm/stacktrace.c
+++ b/arch/arm64/kvm/stacktrace.c
@@ -19,6 +19,7 @@
#include <linux/kvm.h>
#include <linux/kvm_host.h>
+#include <asm/kvm_mmu.h>
#include <asm/stacktrace/nvhe.h>
static struct stack_info stackinfo_get_overflow(void)
@@ -50,7 +51,7 @@ static struct stack_info stackinfo_get_hyp(void)
struct kvm_nvhe_stacktrace_info *stacktrace_info
= this_cpu_ptr_nvhe_sym(kvm_stacktrace_info);
unsigned long low = (unsigned long)stacktrace_info->stack_base;
- unsigned long high = low + PAGE_SIZE;
+ unsigned long high = low + NVHE_STACK_SIZE;
return (struct stack_info) {
.low = low,
@@ -60,8 +61,8 @@ static struct stack_info stackinfo_get_hyp(void)
static struct stack_info stackinfo_get_hyp_kern_va(void)
{
- unsigned long low = (unsigned long)*this_cpu_ptr(&kvm_arm_hyp_stack_page);
- unsigned long high = low + PAGE_SIZE;
+ unsigned long low = (unsigned long)*this_cpu_ptr(&kvm_arm_hyp_stack_base);
+ unsigned long high = low + NVHE_STACK_SIZE;
return (struct stack_info) {
.low = low,
@@ -145,7 +146,7 @@ static void unwind(struct unwind_state *state,
*/
static bool kvm_nvhe_dump_backtrace_entry(void *arg, unsigned long where)
{
- unsigned long va_mask = GENMASK_ULL(vabits_actual - 1, 0);
+ unsigned long va_mask = GENMASK_ULL(__hyp_va_bits - 1, 0);
unsigned long hyp_offset = (unsigned long)arg;
/* Mask tags and convert to kern addr */
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index c9f4f387155f..005ad28f7306 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -17,7 +17,9 @@
#include <linux/mm.h>
#include <linux/printk.h>
#include <linux/uaccess.h>
+#include <linux/irqchip/arm-gic-v3.h>
+#include <asm/arm_pmuv3.h>
#include <asm/cacheflush.h>
#include <asm/cputype.h>
#include <asm/debug-monitors.h>
@@ -33,6 +35,7 @@
#include <trace/events/kvm.h>
#include "sys_regs.h"
+#include "vgic/vgic.h"
#include "trace.h"
@@ -46,6 +49,13 @@ static u64 sys_reg_to_index(const struct sys_reg_desc *reg);
static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
u64 val);
+static bool undef_access(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ kvm_inject_undefined(vcpu);
+ return false;
+}
+
static bool bad_trap(struct kvm_vcpu *vcpu,
struct sys_reg_params *params,
const struct sys_reg_desc *r,
@@ -53,8 +63,7 @@ static bool bad_trap(struct kvm_vcpu *vcpu,
{
WARN_ONCE(1, "Unexpected %s\n", msg);
print_sys_reg_instr(params);
- kvm_inject_undefined(vcpu);
- return false;
+ return undef_access(vcpu, params, r);
}
static bool read_from_write_only(struct kvm_vcpu *vcpu,
@@ -102,6 +111,14 @@ static bool get_el2_to_el1_mapping(unsigned int reg,
PURE_EL2_SYSREG( RVBAR_EL2 );
PURE_EL2_SYSREG( TPIDR_EL2 );
PURE_EL2_SYSREG( HPFAR_EL2 );
+ PURE_EL2_SYSREG( HCRX_EL2 );
+ PURE_EL2_SYSREG( HFGRTR_EL2 );
+ PURE_EL2_SYSREG( HFGWTR_EL2 );
+ PURE_EL2_SYSREG( HFGITR_EL2 );
+ PURE_EL2_SYSREG( HDFGRTR_EL2 );
+ PURE_EL2_SYSREG( HDFGWTR_EL2 );
+ PURE_EL2_SYSREG( HAFGRTR_EL2 );
+ PURE_EL2_SYSREG( CNTVOFF_EL2 );
PURE_EL2_SYSREG( CNTHCTL_EL2 );
MAPPED_EL2_SYSREG(SCTLR_EL2, SCTLR_EL1,
translate_sctlr_el2_to_sctlr_el1 );
@@ -118,9 +135,15 @@ static bool get_el2_to_el1_mapping(unsigned int reg,
MAPPED_EL2_SYSREG(ESR_EL2, ESR_EL1, NULL );
MAPPED_EL2_SYSREG(FAR_EL2, FAR_EL1, NULL );
MAPPED_EL2_SYSREG(MAIR_EL2, MAIR_EL1, NULL );
+ MAPPED_EL2_SYSREG(TCR2_EL2, TCR2_EL1, NULL );
+ MAPPED_EL2_SYSREG(PIR_EL2, PIR_EL1, NULL );
+ MAPPED_EL2_SYSREG(PIRE0_EL2, PIRE0_EL1, NULL );
+ MAPPED_EL2_SYSREG(POR_EL2, POR_EL1, NULL );
MAPPED_EL2_SYSREG(AMAIR_EL2, AMAIR_EL1, NULL );
MAPPED_EL2_SYSREG(ELR_EL2, ELR_EL1, NULL );
MAPPED_EL2_SYSREG(SPSR_EL2, SPSR_EL1, NULL );
+ MAPPED_EL2_SYSREG(ZCR_EL2, ZCR_EL1, NULL );
+ MAPPED_EL2_SYSREG(CONTEXTIDR_EL2, CONTEXTIDR_EL1, NULL );
default:
return false;
}
@@ -140,6 +163,21 @@ u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg)
goto memory_read;
/*
+ * CNTHCTL_EL2 requires some special treatment to
+ * account for the bits that can be set via CNTKCTL_EL1.
+ */
+ switch (reg) {
+ case CNTHCTL_EL2:
+ if (vcpu_el2_e2h_is_set(vcpu)) {
+ val = read_sysreg_el1(SYS_CNTKCTL);
+ val &= CNTKCTL_VALID_BITS;
+ val |= __vcpu_sys_reg(vcpu, reg) & ~CNTKCTL_VALID_BITS;
+ return val;
+ }
+ break;
+ }
+
+ /*
* If this register does not have an EL1 counterpart,
* then read the stored EL2 version.
*/
@@ -156,6 +194,9 @@ u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg)
/* Get the current version of the EL1 counterpart. */
WARN_ON(!__vcpu_read_sys_reg_from_cpu(el1r, &val));
+ if (reg >= __SANITISED_REG_START__)
+ val = kvm_vcpu_apply_reg_masks(vcpu, reg, val);
+
return val;
}
@@ -189,6 +230,19 @@ void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg)
*/
__vcpu_sys_reg(vcpu, reg) = val;
+ switch (reg) {
+ case CNTHCTL_EL2:
+ /*
+ * If E2H=0, CNTHCTL_EL2 is a pure shadow register.
+ * Otherwise, some of the bits are backed by
+ * CNTKCTL_EL1, while the rest is kept in memory.
+ * Yes, this is fun stuff.
+ */
+ if (vcpu_el2_e2h_is_set(vcpu))
+ write_sysreg_el1(val, SYS_CNTKCTL);
+ return;
+ }
+
/* No EL1 counterpart? We're done here. */
if (reg == el1r)
return;
@@ -344,10 +398,8 @@ static bool access_dcgsw(struct kvm_vcpu *vcpu,
struct sys_reg_params *p,
const struct sys_reg_desc *r)
{
- if (!kvm_has_mte(vcpu->kvm)) {
- kvm_inject_undefined(vcpu);
- return false;
- }
+ if (!kvm_has_mte(vcpu->kvm))
+ return undef_access(vcpu, p, r);
/* Treat MTE S/W ops as we treat the classic ones: with contempt */
return access_dcsw(vcpu, p, r);
@@ -428,6 +480,9 @@ static bool access_gic_sgi(struct kvm_vcpu *vcpu,
{
bool g1;
+ if (!kvm_has_gicv3(vcpu->kvm))
+ return undef_access(vcpu, p, r);
+
if (!p->is_write)
return read_from_write_only(vcpu, p, r);
@@ -471,10 +526,19 @@ static bool access_gic_sre(struct kvm_vcpu *vcpu,
struct sys_reg_params *p,
const struct sys_reg_desc *r)
{
+ if (!kvm_has_gicv3(vcpu->kvm))
+ return undef_access(vcpu, p, r);
+
if (p->is_write)
return ignore_write(vcpu, p);
- p->regval = vcpu->arch.vgic_cpu.vgic_v3.vgic_sre;
+ if (p->Op1 == 4) { /* ICC_SRE_EL2 */
+ p->regval = (ICC_SRE_EL2_ENABLE | ICC_SRE_EL2_SRE |
+ ICC_SRE_EL1_DIB | ICC_SRE_EL1_DFB);
+ } else { /* ICC_SRE_EL1 */
+ p->regval = vcpu->arch.vgic_cpu.vgic_v3.vgic_sre;
+ }
+
return true;
}
@@ -488,14 +552,6 @@ static bool trap_raz_wi(struct kvm_vcpu *vcpu,
return read_zero(vcpu, p);
}
-static bool trap_undef(struct kvm_vcpu *vcpu,
- struct sys_reg_params *p,
- const struct sys_reg_desc *r)
-{
- kvm_inject_undefined(vcpu);
- return false;
-}
-
/*
* ARMv8.1 mandates at least a trivial LORegion implementation, where all the
* RW registers are RES0 (which we can implement as RAZ/WI). On an ARMv8.0
@@ -508,10 +564,8 @@ static bool trap_loregion(struct kvm_vcpu *vcpu,
{
u32 sr = reg_to_encoding(r);
- if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, LO, IMP)) {
- kvm_inject_undefined(vcpu);
- return false;
- }
+ if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, LO, IMP))
+ return undef_access(vcpu, p, r);
if (p->is_write && sr == SYS_LORID_EL1)
return write_to_read_only(vcpu, p, r);
@@ -523,17 +577,10 @@ static bool trap_oslar_el1(struct kvm_vcpu *vcpu,
struct sys_reg_params *p,
const struct sys_reg_desc *r)
{
- u64 oslsr;
-
if (!p->is_write)
return read_from_write_only(vcpu, p, r);
- /* Forward the OSLK bit to OSLSR */
- oslsr = __vcpu_sys_reg(vcpu, OSLSR_EL1) & ~OSLSR_EL1_OSLK;
- if (p->regval & OSLAR_EL1_OSLK)
- oslsr |= OSLSR_EL1_OSLK;
-
- __vcpu_sys_reg(vcpu, OSLSR_EL1) = oslsr;
+ kvm_debug_handle_oslar(vcpu, p->regval);
return true;
}
@@ -574,43 +621,13 @@ static bool trap_dbgauthstatus_el1(struct kvm_vcpu *vcpu,
}
}
-/*
- * We want to avoid world-switching all the DBG registers all the
- * time:
- *
- * - If we've touched any debug register, it is likely that we're
- * going to touch more of them. It then makes sense to disable the
- * traps and start doing the save/restore dance
- * - If debug is active (DBG_MDSCR_KDE or DBG_MDSCR_MDE set), it is
- * then mandatory to save/restore the registers, as the guest
- * depends on them.
- *
- * For this, we use a DIRTY bit, indicating the guest has modified the
- * debug registers, used as follow:
- *
- * On guest entry:
- * - If the dirty bit is set (because we're coming back from trapping),
- * disable the traps, save host registers, restore guest registers.
- * - If debug is actively in use (DBG_MDSCR_KDE or DBG_MDSCR_MDE set),
- * set the dirty bit, disable the traps, save host registers,
- * restore guest registers.
- * - Otherwise, enable the traps
- *
- * On guest exit:
- * - If the dirty bit is set, save guest registers, restore host
- * registers and clear the dirty bit. This ensure that the host can
- * now use the debug registers.
- */
static bool trap_debug_regs(struct kvm_vcpu *vcpu,
struct sys_reg_params *p,
const struct sys_reg_desc *r)
{
access_rw(vcpu, p, r);
- if (p->is_write)
- vcpu_set_flag(vcpu, DEBUG_DIRTY);
-
- trace_trap_reg(__func__, r->reg, p->is_write, p->regval);
+ kvm_debug_set_guest_ownership(vcpu);
return true;
}
@@ -619,9 +636,6 @@ static bool trap_debug_regs(struct kvm_vcpu *vcpu,
*
* A 32 bit write to a debug register leave top bits alone
* A 32 bit read from a debug register only returns the bottom bits
- *
- * All writes will set the DEBUG_DIRTY flag to ensure the hyp code
- * switches between host and guest values in future.
*/
static void reg_to_dbg(struct kvm_vcpu *vcpu,
struct sys_reg_params *p,
@@ -636,8 +650,6 @@ static void reg_to_dbg(struct kvm_vcpu *vcpu,
val &= ~mask;
val |= (p->regval & (mask >> shift)) << shift;
*dbg_reg = val;
-
- vcpu_set_flag(vcpu, DEBUG_DIRTY);
}
static void dbg_to_reg(struct kvm_vcpu *vcpu,
@@ -651,152 +663,79 @@ static void dbg_to_reg(struct kvm_vcpu *vcpu,
p->regval = (*dbg_reg & mask) >> shift;
}
-static bool trap_bvr(struct kvm_vcpu *vcpu,
- struct sys_reg_params *p,
- const struct sys_reg_desc *rd)
-{
- u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm];
-
- if (p->is_write)
- reg_to_dbg(vcpu, p, rd, dbg_reg);
- else
- dbg_to_reg(vcpu, p, rd, dbg_reg);
-
- trace_trap_reg(__func__, rd->CRm, p->is_write, *dbg_reg);
-
- return true;
-}
-
-static int set_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
- u64 val)
+static u64 *demux_wb_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd)
{
- vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm] = val;
- return 0;
-}
+ struct kvm_guest_debug_arch *dbg = &vcpu->arch.vcpu_debug_state;
-static int get_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
- u64 *val)
-{
- *val = vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm];
- return 0;
+ switch (rd->Op2) {
+ case 0b100:
+ return &dbg->dbg_bvr[rd->CRm];
+ case 0b101:
+ return &dbg->dbg_bcr[rd->CRm];
+ case 0b110:
+ return &dbg->dbg_wvr[rd->CRm];
+ case 0b111:
+ return &dbg->dbg_wcr[rd->CRm];
+ default:
+ KVM_BUG_ON(1, vcpu->kvm);
+ return NULL;
+ }
}
-static u64 reset_bvr(struct kvm_vcpu *vcpu,
- const struct sys_reg_desc *rd)
+static bool trap_dbg_wb_reg(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+ const struct sys_reg_desc *rd)
{
- vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm] = rd->val;
- return rd->val;
-}
+ u64 *reg = demux_wb_reg(vcpu, rd);
-static bool trap_bcr(struct kvm_vcpu *vcpu,
- struct sys_reg_params *p,
- const struct sys_reg_desc *rd)
-{
- u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm];
+ if (!reg)
+ return false;
if (p->is_write)
- reg_to_dbg(vcpu, p, rd, dbg_reg);
+ reg_to_dbg(vcpu, p, rd, reg);
else
- dbg_to_reg(vcpu, p, rd, dbg_reg);
-
- trace_trap_reg(__func__, rd->CRm, p->is_write, *dbg_reg);
+ dbg_to_reg(vcpu, p, rd, reg);
+ kvm_debug_set_guest_ownership(vcpu);
return true;
}
-static int set_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
- u64 val)
+static int set_dbg_wb_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+ u64 val)
{
- vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm] = val;
- return 0;
-}
-
-static int get_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
- u64 *val)
-{
- *val = vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm];
- return 0;
-}
+ u64 *reg = demux_wb_reg(vcpu, rd);
-static u64 reset_bcr(struct kvm_vcpu *vcpu,
- const struct sys_reg_desc *rd)
-{
- vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm] = rd->val;
- return rd->val;
-}
-
-static bool trap_wvr(struct kvm_vcpu *vcpu,
- struct sys_reg_params *p,
- const struct sys_reg_desc *rd)
-{
- u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm];
-
- if (p->is_write)
- reg_to_dbg(vcpu, p, rd, dbg_reg);
- else
- dbg_to_reg(vcpu, p, rd, dbg_reg);
-
- trace_trap_reg(__func__, rd->CRm, p->is_write,
- vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm]);
-
- return true;
-}
-
-static int set_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
- u64 val)
-{
- vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm] = val;
- return 0;
-}
+ if (!reg)
+ return -EINVAL;
-static int get_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
- u64 *val)
-{
- *val = vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm];
+ *reg = val;
return 0;
}
-static u64 reset_wvr(struct kvm_vcpu *vcpu,
- const struct sys_reg_desc *rd)
-{
- vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm] = rd->val;
- return rd->val;
-}
-
-static bool trap_wcr(struct kvm_vcpu *vcpu,
- struct sys_reg_params *p,
- const struct sys_reg_desc *rd)
+static int get_dbg_wb_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+ u64 *val)
{
- u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm];
+ u64 *reg = demux_wb_reg(vcpu, rd);
- if (p->is_write)
- reg_to_dbg(vcpu, p, rd, dbg_reg);
- else
- dbg_to_reg(vcpu, p, rd, dbg_reg);
-
- trace_trap_reg(__func__, rd->CRm, p->is_write, *dbg_reg);
-
- return true;
-}
+ if (!reg)
+ return -EINVAL;
-static int set_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
- u64 val)
-{
- vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm] = val;
+ *val = *reg;
return 0;
}
-static int get_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
- u64 *val)
+static u64 reset_dbg_wb_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd)
{
- *val = vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm];
- return 0;
-}
+ u64 *reg = demux_wb_reg(vcpu, rd);
-static u64 reset_wcr(struct kvm_vcpu *vcpu,
- const struct sys_reg_desc *rd)
-{
- vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm] = rd->val;
+ /*
+ * Bail early if we couldn't find storage for the register, the
+ * KVM_BUG_ON() in demux_wb_reg() will prevent this VM from ever
+ * being run.
+ */
+ if (!reg)
+ return 0;
+
+ *reg = rd->val;
return rd->val;
}
@@ -880,7 +819,7 @@ static u64 reset_pmevtyper(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
static u64 reset_pmselr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
reset_unknown(vcpu, r);
- __vcpu_sys_reg(vcpu, r->reg) &= ARMV8_PMU_COUNTER_MASK;
+ __vcpu_sys_reg(vcpu, r->reg) &= PMSELR_EL0_SEL_MASK;
return __vcpu_sys_reg(vcpu, r->reg);
}
@@ -972,7 +911,7 @@ static bool access_pmselr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
else
/* return PMSELR.SEL field */
p->regval = __vcpu_sys_reg(vcpu, PMSELR_EL0)
- & ARMV8_PMU_COUNTER_MASK;
+ & PMSELR_EL0_SEL_MASK;
return true;
}
@@ -1028,6 +967,22 @@ static int get_pmu_evcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
return 0;
}
+static int set_pmu_evcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
+ u64 val)
+{
+ u64 idx;
+
+ if (r->CRn == 9 && r->CRm == 13 && r->Op2 == 0)
+ /* PMCCNTR_EL0 */
+ idx = ARMV8_PMU_CYCLE_IDX;
+ else
+ /* PMEVCNTRn_EL0 */
+ idx = ((r->CRm & 3) << 3) | (r->Op2 & 7);
+
+ kvm_pmu_set_counter_value_user(vcpu, idx, val);
+ return 0;
+}
+
static bool access_pmu_evcntr(struct kvm_vcpu *vcpu,
struct sys_reg_params *p,
const struct sys_reg_desc *r)
@@ -1040,8 +995,8 @@ static bool access_pmu_evcntr(struct kvm_vcpu *vcpu,
if (pmu_access_event_counter_el0_disabled(vcpu))
return false;
- idx = __vcpu_sys_reg(vcpu, PMSELR_EL0)
- & ARMV8_PMU_COUNTER_MASK;
+ idx = SYS_FIELD_GET(PMSELR_EL0, SEL,
+ __vcpu_sys_reg(vcpu, PMSELR_EL0));
} else if (r->Op2 == 0) {
/* PMCCNTR_EL0 */
if (pmu_access_cycle_counter_el0_disabled(vcpu))
@@ -1091,7 +1046,7 @@ static bool access_pmu_evtyper(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
if (r->CRn == 9 && r->CRm == 13 && r->Op2 == 1) {
/* PMXEVTYPER_EL0 */
- idx = __vcpu_sys_reg(vcpu, PMSELR_EL0) & ARMV8_PMU_COUNTER_MASK;
+ idx = SYS_FIELD_GET(PMSELR_EL0, SEL, __vcpu_sys_reg(vcpu, PMSELR_EL0));
reg = PMEVTYPER0_EL0 + idx;
} else if (r->CRn == 14 && (r->CRm & 12) == 12) {
idx = ((r->CRm & 3) << 3) | (r->Op2 & 7);
@@ -1119,32 +1074,17 @@ static bool access_pmu_evtyper(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
static int set_pmreg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 val)
{
- bool set;
-
- val &= kvm_pmu_valid_counter_mask(vcpu);
-
- switch (r->reg) {
- case PMOVSSET_EL0:
- /* CRm[1] being set indicates a SET register, and CLR otherwise */
- set = r->CRm & 2;
- break;
- default:
- /* Op2[0] being set indicates a SET register, and CLR otherwise */
- set = r->Op2 & 1;
- break;
- }
+ u64 mask = kvm_pmu_accessible_counter_mask(vcpu);
- if (set)
- __vcpu_sys_reg(vcpu, r->reg) |= val;
- else
- __vcpu_sys_reg(vcpu, r->reg) &= ~val;
+ __vcpu_sys_reg(vcpu, r->reg) = val & mask;
+ kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu);
return 0;
}
static int get_pmreg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 *val)
{
- u64 mask = kvm_pmu_valid_counter_mask(vcpu);
+ u64 mask = kvm_pmu_accessible_counter_mask(vcpu);
*val = __vcpu_sys_reg(vcpu, r->reg) & mask;
return 0;
@@ -1158,19 +1098,17 @@ static bool access_pmcnten(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
if (pmu_access_el0_disabled(vcpu))
return false;
- mask = kvm_pmu_valid_counter_mask(vcpu);
+ mask = kvm_pmu_accessible_counter_mask(vcpu);
if (p->is_write) {
val = p->regval & mask;
- if (r->Op2 & 0x1) {
+ if (r->Op2 & 0x1)
/* accessing PMCNTENSET_EL0 */
__vcpu_sys_reg(vcpu, PMCNTENSET_EL0) |= val;
- kvm_pmu_enable_counter_mask(vcpu, val);
- kvm_vcpu_pmu_restore_guest(vcpu);
- } else {
+ else
/* accessing PMCNTENCLR_EL0 */
__vcpu_sys_reg(vcpu, PMCNTENSET_EL0) &= ~val;
- kvm_pmu_disable_counter_mask(vcpu, val);
- }
+
+ kvm_pmu_reprogram_counter_mask(vcpu, val);
} else {
p->regval = __vcpu_sys_reg(vcpu, PMCNTENSET_EL0);
}
@@ -1181,7 +1119,7 @@ static bool access_pmcnten(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
static bool access_pminten(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
const struct sys_reg_desc *r)
{
- u64 mask = kvm_pmu_valid_counter_mask(vcpu);
+ u64 mask = kvm_pmu_accessible_counter_mask(vcpu);
if (check_pmu_access_disabled(vcpu, 0))
return false;
@@ -1205,7 +1143,7 @@ static bool access_pminten(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
static bool access_pmovs(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
const struct sys_reg_desc *r)
{
- u64 mask = kvm_pmu_valid_counter_mask(vcpu);
+ u64 mask = kvm_pmu_accessible_counter_mask(vcpu);
if (pmu_access_el0_disabled(vcpu))
return false;
@@ -1235,7 +1173,7 @@ static bool access_pmswinc(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
if (pmu_write_swinc_el0_disabled(vcpu))
return false;
- mask = kvm_pmu_valid_counter_mask(vcpu);
+ mask = kvm_pmu_accessible_counter_mask(vcpu);
kvm_pmu_software_increment(vcpu, p->regval & mask);
return true;
}
@@ -1244,10 +1182,8 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
const struct sys_reg_desc *r)
{
if (p->is_write) {
- if (!vcpu_mode_priv(vcpu)) {
- kvm_inject_undefined(vcpu);
- return false;
- }
+ if (!vcpu_mode_priv(vcpu))
+ return undef_access(vcpu, p, r);
__vcpu_sys_reg(vcpu, PMUSERENR_EL0) =
p->regval & ARMV8_PMU_USERENR_MASK;
@@ -1301,19 +1237,25 @@ static int set_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
val |= ARMV8_PMU_PMCR_LC;
__vcpu_sys_reg(vcpu, r->reg) = val;
+ kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu);
+
return 0;
}
/* Silly macro to expand the DBG{BCR,BVR,WVR,WCR}n_EL1 registers in one go */
#define DBG_BCR_BVR_WCR_WVR_EL1(n) \
{ SYS_DESC(SYS_DBGBVRn_EL1(n)), \
- trap_bvr, reset_bvr, 0, 0, get_bvr, set_bvr }, \
+ trap_dbg_wb_reg, reset_dbg_wb_reg, 0, 0, \
+ get_dbg_wb_reg, set_dbg_wb_reg }, \
{ SYS_DESC(SYS_DBGBCRn_EL1(n)), \
- trap_bcr, reset_bcr, 0, 0, get_bcr, set_bcr }, \
+ trap_dbg_wb_reg, reset_dbg_wb_reg, 0, 0, \
+ get_dbg_wb_reg, set_dbg_wb_reg }, \
{ SYS_DESC(SYS_DBGWVRn_EL1(n)), \
- trap_wvr, reset_wvr, 0, 0, get_wvr, set_wvr }, \
+ trap_dbg_wb_reg, reset_dbg_wb_reg, 0, 0, \
+ get_dbg_wb_reg, set_dbg_wb_reg }, \
{ SYS_DESC(SYS_DBGWCRn_EL1(n)), \
- trap_wcr, reset_wcr, 0, 0, get_wcr, set_wcr }
+ trap_dbg_wb_reg, reset_dbg_wb_reg, 0, 0, \
+ get_dbg_wb_reg, set_dbg_wb_reg }
#define PMU_SYS_REG(name) \
SYS_DESC(SYS_##name), .reset = reset_pmu_reg, \
@@ -1323,6 +1265,7 @@ static int set_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
#define PMU_PMEVCNTR_EL0(n) \
{ PMU_SYS_REG(PMEVCNTRn_EL0(n)), \
.reset = reset_pmevcntr, .get_user = get_pmu_evcntr, \
+ .set_user = set_pmu_evcntr, \
.access = access_pmu_evcntr, .reg = (PMEVCNTR0_EL0 + n), }
/* Macro to expand the PMEVTYPERn_EL0 register */
@@ -1331,14 +1274,6 @@ static int set_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
.reset = reset_pmevtyper, \
.access = access_pmu_evtyper, .reg = (PMEVTYPER0_EL0 + n), }
-static bool undef_access(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
- const struct sys_reg_desc *r)
-{
- kvm_inject_undefined(vcpu);
-
- return false;
-}
-
/* Macro to expand the AMU counter and type registers*/
#define AMU_AMEVCNTR0_EL0(n) { SYS_DESC(SYS_AMEVCNTR0_EL0(n)), undef_access }
#define AMU_AMEVTYPER0_EL0(n) { SYS_DESC(SYS_AMEVTYPER0_EL0(n)), undef_access }
@@ -1375,30 +1310,149 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu,
switch (reg) {
case SYS_CNTP_TVAL_EL0:
+ if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu))
+ tmr = TIMER_HPTIMER;
+ else
+ tmr = TIMER_PTIMER;
+ treg = TIMER_REG_TVAL;
+ break;
+
+ case SYS_CNTV_TVAL_EL0:
+ if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu))
+ tmr = TIMER_HVTIMER;
+ else
+ tmr = TIMER_VTIMER;
+ treg = TIMER_REG_TVAL;
+ break;
+
case SYS_AARCH32_CNTP_TVAL:
+ case SYS_CNTP_TVAL_EL02:
tmr = TIMER_PTIMER;
treg = TIMER_REG_TVAL;
break;
+
+ case SYS_CNTV_TVAL_EL02:
+ tmr = TIMER_VTIMER;
+ treg = TIMER_REG_TVAL;
+ break;
+
+ case SYS_CNTHP_TVAL_EL2:
+ tmr = TIMER_HPTIMER;
+ treg = TIMER_REG_TVAL;
+ break;
+
+ case SYS_CNTHV_TVAL_EL2:
+ tmr = TIMER_HVTIMER;
+ treg = TIMER_REG_TVAL;
+ break;
+
case SYS_CNTP_CTL_EL0:
+ if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu))
+ tmr = TIMER_HPTIMER;
+ else
+ tmr = TIMER_PTIMER;
+ treg = TIMER_REG_CTL;
+ break;
+
+ case SYS_CNTV_CTL_EL0:
+ if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu))
+ tmr = TIMER_HVTIMER;
+ else
+ tmr = TIMER_VTIMER;
+ treg = TIMER_REG_CTL;
+ break;
+
case SYS_AARCH32_CNTP_CTL:
+ case SYS_CNTP_CTL_EL02:
tmr = TIMER_PTIMER;
treg = TIMER_REG_CTL;
break;
+
+ case SYS_CNTV_CTL_EL02:
+ tmr = TIMER_VTIMER;
+ treg = TIMER_REG_CTL;
+ break;
+
+ case SYS_CNTHP_CTL_EL2:
+ tmr = TIMER_HPTIMER;
+ treg = TIMER_REG_CTL;
+ break;
+
+ case SYS_CNTHV_CTL_EL2:
+ tmr = TIMER_HVTIMER;
+ treg = TIMER_REG_CTL;
+ break;
+
case SYS_CNTP_CVAL_EL0:
+ if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu))
+ tmr = TIMER_HPTIMER;
+ else
+ tmr = TIMER_PTIMER;
+ treg = TIMER_REG_CVAL;
+ break;
+
+ case SYS_CNTV_CVAL_EL0:
+ if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu))
+ tmr = TIMER_HVTIMER;
+ else
+ tmr = TIMER_VTIMER;
+ treg = TIMER_REG_CVAL;
+ break;
+
case SYS_AARCH32_CNTP_CVAL:
+ case SYS_CNTP_CVAL_EL02:
tmr = TIMER_PTIMER;
treg = TIMER_REG_CVAL;
break;
+
+ case SYS_CNTV_CVAL_EL02:
+ tmr = TIMER_VTIMER;
+ treg = TIMER_REG_CVAL;
+ break;
+
+ case SYS_CNTHP_CVAL_EL2:
+ tmr = TIMER_HPTIMER;
+ treg = TIMER_REG_CVAL;
+ break;
+
+ case SYS_CNTHV_CVAL_EL2:
+ tmr = TIMER_HVTIMER;
+ treg = TIMER_REG_CVAL;
+ break;
+
case SYS_CNTPCT_EL0:
case SYS_CNTPCTSS_EL0:
+ if (is_hyp_ctxt(vcpu))
+ tmr = TIMER_HPTIMER;
+ else
+ tmr = TIMER_PTIMER;
+ treg = TIMER_REG_CNT;
+ break;
+
case SYS_AARCH32_CNTPCT:
+ case SYS_AARCH32_CNTPCTSS:
tmr = TIMER_PTIMER;
treg = TIMER_REG_CNT;
break;
+
+ case SYS_CNTVCT_EL0:
+ case SYS_CNTVCTSS_EL0:
+ if (is_hyp_ctxt(vcpu))
+ tmr = TIMER_HVTIMER;
+ else
+ tmr = TIMER_VTIMER;
+ treg = TIMER_REG_CNT;
+ break;
+
+ case SYS_AARCH32_CNTVCT:
+ case SYS_AARCH32_CNTVCTSS:
+ tmr = TIMER_VTIMER;
+ treg = TIMER_REG_CNT;
+ break;
+
default:
print_sys_reg_msg(p, "%s", "Unhandled trapped timer register");
- kvm_inject_undefined(vcpu);
- return false;
+ return undef_access(vcpu, p, r);
}
if (p->is_write)
@@ -1409,6 +1463,16 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu,
return true;
}
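The E2H redirection applied by the CTL/TVAL/CVAL cases above, restated as a sketch; the helper name is illustrative, and note the counter (CNT*CT) cases key off is_hyp_ctxt() alone:

#include <linux/kvm_host.h>
#include <asm/kvm_emulate.h>

static enum kvm_arch_timers example_timer_for_access(struct kvm_vcpu *vcpu,
						     bool virtual_timer)
{
	/* A vEL2 guest with HCR_EL2.E2H set really targets the EL2 timers. */
	if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu))
		return virtual_timer ? TIMER_HVTIMER : TIMER_HPTIMER;

	return virtual_timer ? TIMER_VTIMER : TIMER_PTIMER;
}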
+static bool access_hv_timer(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ if (!vcpu_el2_e2h_is_set(vcpu))
+ return undef_access(vcpu, p, r);
+
+ return access_arch_timer(vcpu, p, r);
+}
+
static s64 kvm_arm64_ftr_safe_value(u32 id, const struct arm64_ftr_bits *ftrp,
s64 new, s64 cur)
{
@@ -1513,6 +1577,9 @@ static u8 pmuver_to_perfmon(u8 pmuver)
}
}
+static u64 sanitise_id_aa64pfr0_el1(const struct kvm_vcpu *vcpu, u64 val);
+static u64 sanitise_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val);
+
/* Read a sanitised cpufeature ID register by sys_reg_desc */
static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu,
const struct sys_reg_desc *r)
@@ -1526,11 +1593,30 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu,
val = read_sanitised_ftr_reg(id);
switch (id) {
+ case SYS_ID_AA64DFR0_EL1:
+ val = sanitise_id_aa64dfr0_el1(vcpu, val);
+ break;
+ case SYS_ID_AA64PFR0_EL1:
+ val = sanitise_id_aa64pfr0_el1(vcpu, val);
+ break;
case SYS_ID_AA64PFR1_EL1:
if (!kvm_has_mte(vcpu->kvm))
val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE);
val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_SME);
+ val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_RNDR_trap);
+ val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_NMI);
+ val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE_frac);
+ val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_GCS);
+ val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_THE);
+ val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTEX);
+ val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_DF2);
+ val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_PFAR);
+ val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MPAM_frac);
+ break;
+ case SYS_ID_AA64PFR2_EL1:
+ /* We only expose FPMR */
+ val &= ID_AA64PFR2_EL1_FPMR;
break;
case SYS_ID_AA64ISAR1_EL1:
if (!vcpu_has_ptrauth(vcpu))
@@ -1543,17 +1629,29 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu,
if (!vcpu_has_ptrauth(vcpu))
val &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_APA3) |
ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_GPA3));
- if (!cpus_have_final_cap(ARM64_HAS_WFXT))
+ if (!cpus_have_final_cap(ARM64_HAS_WFXT) ||
+ has_broken_cntvoff())
val &= ~ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_WFxT);
break;
+ case SYS_ID_AA64ISAR3_EL1:
+ val &= ID_AA64ISAR3_EL1_FPRCVT | ID_AA64ISAR3_EL1_FAMINMAX;
+ break;
case SYS_ID_AA64MMFR2_EL1:
val &= ~ID_AA64MMFR2_EL1_CCIDX_MASK;
+ val &= ~ID_AA64MMFR2_EL1_NV;
+ break;
+ case SYS_ID_AA64MMFR3_EL1:
+ val &= ID_AA64MMFR3_EL1_TCRX | ID_AA64MMFR3_EL1_S1POE |
+ ID_AA64MMFR3_EL1_S1PIE;
break;
case SYS_ID_MMFR4_EL1:
val &= ~ARM64_FEATURE_MASK(ID_MMFR4_EL1_CCIDX);
break;
}
+ if (vcpu_has_nv(vcpu))
+ val = limit_nv_id_reg(vcpu->kvm, id, val);
+
return val;
}
@@ -1565,18 +1663,44 @@ static u64 kvm_read_sanitised_id_reg(struct kvm_vcpu *vcpu,
static u64 read_id_reg(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
- return IDREG(vcpu->kvm, reg_to_encoding(r));
+ return kvm_read_vm_id_reg(vcpu->kvm, reg_to_encoding(r));
+}
+
+static bool is_feature_id_reg(u32 encoding)
+{
+ return (sys_reg_Op0(encoding) == 3 &&
+ (sys_reg_Op1(encoding) < 2 || sys_reg_Op1(encoding) == 3) &&
+ sys_reg_CRn(encoding) == 0 &&
+ sys_reg_CRm(encoding) <= 7);
}
/*
* Return true if the register's (Op0, Op1, CRn, CRm, Op2) is
- * (3, 0, 0, crm, op2), where 1<=crm<8, 0<=op2<8.
+ * (3, 0, 0, crm, op2), where 1<=crm<8, 0<=op2<8, which is the range of ID
+ * registers KVM maintains on a per-VM basis.
+ *
+ * Additionally, the implementation ID registers and CTR_EL0 are handled as
+ * per-VM registers.
*/
-static inline bool is_id_reg(u32 id)
+static inline bool is_vm_ftr_id_reg(u32 id)
{
- return (sys_reg_Op0(id) == 3 && sys_reg_Op1(id) == 0 &&
- sys_reg_CRn(id) == 0 && sys_reg_CRm(id) >= 1 &&
- sys_reg_CRm(id) < 8);
+ switch (id) {
+ case SYS_CTR_EL0:
+ case SYS_MIDR_EL1:
+ case SYS_REVIDR_EL1:
+ case SYS_AIDR_EL1:
+ return true;
+ default:
+ return (sys_reg_Op0(id) == 3 && sys_reg_Op1(id) == 0 &&
+ sys_reg_CRn(id) == 0 && sys_reg_CRm(id) >= 1 &&
+ sys_reg_CRm(id) < 8);
+ }
+}
+
+static inline bool is_vcpu_ftr_id_reg(u32 id)
+{
+ return is_feature_id_reg(id) && !is_vm_ftr_id_reg(id);
}
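
/*
 * Editor's sketch, not part of the patch: a concrete instance of the
 * classification above. ID_AA64PFR0_EL1 encodes as (3, 0, 0, 4, 0) and is a
 * per-VM feature ID register, while TPIDR_EL0 (3, 3, 13, 0, 2) is not an ID
 * register at all. example_id_reg_classes() is a made-up helper.
 */
static inline bool example_id_reg_classes(void)
{
	return is_vm_ftr_id_reg(SYS_ID_AA64PFR0_EL1) &&	/* (3, 0, 0, 4, 0) */
	       !is_vm_ftr_id_reg(SYS_TPIDR_EL0);	/* (3, 3, 13, 0, 2) */
}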
static inline bool is_aa32_id_reg(u32 id)
@@ -1645,11 +1769,26 @@ static unsigned int sve_visibility(const struct kvm_vcpu *vcpu,
return REG_HIDDEN;
}
-static u64 read_sanitised_id_aa64pfr0_el1(struct kvm_vcpu *vcpu,
- const struct sys_reg_desc *rd)
+static unsigned int sme_visibility(const struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd)
{
- u64 val = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1);
+ if (kvm_has_feat(vcpu->kvm, ID_AA64PFR1_EL1, SME, IMP))
+ return 0;
+ return REG_HIDDEN;
+}
+
+static unsigned int fp8_visibility(const struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd)
+{
+ if (kvm_has_fpmr(vcpu->kvm))
+ return 0;
+
+ return REG_HIDDEN;
+}
+
+static u64 sanitise_id_aa64pfr0_el1(const struct kvm_vcpu *vcpu, u64 val)
+{
if (!vcpu_has_sve(vcpu))
val &= ~ID_AA64PFR0_EL1_SVE_MASK;
@@ -1677,24 +1816,18 @@ static u64 read_sanitised_id_aa64pfr0_el1(struct kvm_vcpu *vcpu,
val &= ~ID_AA64PFR0_EL1_AMU_MASK;
+ /*
+ * MPAM is disabled by default as KVM also needs a set of PARTIDs to
+ * program the MPAMVPMx_EL2 PARTID remapping registers with. But some
+ * older kernels let the guest see this ID field.
+ */
+ val &= ~ID_AA64PFR0_EL1_MPAM_MASK;
+
return val;
}
-#define ID_REG_LIMIT_FIELD_ENUM(val, reg, field, limit) \
-({ \
- u64 __f_val = FIELD_GET(reg##_##field##_MASK, val); \
- (val) &= ~reg##_##field##_MASK; \
- (val) |= FIELD_PREP(reg##_##field##_MASK, \
- min(__f_val, \
- (u64)SYS_FIELD_VALUE(reg, field, limit))); \
- (val); \
-})
-
-static u64 read_sanitised_id_aa64dfr0_el1(struct kvm_vcpu *vcpu,
- const struct sys_reg_desc *rd)
+static u64 sanitise_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val)
{
- u64 val = read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1);
-
val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64DFR0_EL1, DebugVer, V8P8);
/*
@@ -1708,6 +1841,9 @@ static u64 read_sanitised_id_aa64dfr0_el1(struct kvm_vcpu *vcpu,
/* Hide SPE from guests */
val &= ~ID_AA64DFR0_EL1_PMSVer_MASK;
+ /* Hide BRBE from guests */
+ val &= ~ID_AA64DFR0_EL1_BRBE_MASK;
+
return val;
}
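
/*
 * Editor's sketch, not part of the patch: ID_REG_LIMIT_FIELD_ENUM() (dropped
 * from this file above but still used by sanitise_id_aa64dfr0_el1()) clamps
 * an ID field to an upper bound. Capping DebugVer at V8P8 expands to roughly
 * the following; example_limit_debugver() is a made-up helper.
 */
static inline u64 example_limit_debugver(u64 val)
{
	u64 f = FIELD_GET(ID_AA64DFR0_EL1_DebugVer_MASK, val);

	val &= ~ID_AA64DFR0_EL1_DebugVer_MASK;
	val |= FIELD_PREP(ID_AA64DFR0_EL1_DebugVer_MASK,
			  min(f, (u64)ID_AA64DFR0_EL1_DebugVer_V8P8));

	return val;
}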
@@ -1748,12 +1884,14 @@ static int set_id_aa64dfr0_el1(struct kvm_vcpu *vcpu,
static u64 read_sanitised_id_dfr0_el1(struct kvm_vcpu *vcpu,
const struct sys_reg_desc *rd)
{
- u8 perfmon = pmuver_to_perfmon(kvm_arm_pmu_get_pmuver_limit());
+ u8 perfmon;
u64 val = read_sanitised_ftr_reg(SYS_ID_DFR0_EL1);
val &= ~ID_DFR0_EL1_PerfMon_MASK;
- if (kvm_vcpu_has_pmu(vcpu))
+ if (kvm_vcpu_has_pmu(vcpu)) {
+ perfmon = pmuver_to_perfmon(kvm_arm_pmu_get_pmuver_limit());
val |= SYS_FIELD_PREP(ID_DFR0_EL1, PerfMon, perfmon);
+ }
val = ID_REG_LIMIT_FIELD_ENUM(val, ID_DFR0_EL1, CopDbg, Debugv8p8);
@@ -1787,6 +1925,101 @@ static int set_id_dfr0_el1(struct kvm_vcpu *vcpu,
return set_id_reg(vcpu, rd, val);
}
+static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd, u64 user_val)
+{
+ u64 hw_val = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1);
+ u64 mpam_mask = ID_AA64PFR0_EL1_MPAM_MASK;
+
+ /*
+ * Commit 011e5f5bf529f ("arm64/cpufeature: Add remaining feature bits
+ * in ID_AA64PFR0 register") exposed the MPAM field of AA64PFR0_EL1 to
+ * guests, but didn't add trap handling. KVM doesn't support MPAM and
+ * always injects an UNDEF for accesses to these registers. The guest
+ * must see 0 for this field.
+ *
+ * But KVM must also accept values from user-space that were provided
+ * by KVM. On CPUs that support MPAM, permit user-space to write
+ * the sanitised value to ID_AA64PFR0_EL1.MPAM, but ignore this field.
+ */
+ if ((hw_val & mpam_mask) == (user_val & mpam_mask))
+ user_val &= ~ID_AA64PFR0_EL1_MPAM_MASK;
+
+ return set_id_reg(vcpu, rd, user_val);
+}
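
/*
 * Editor's sketch, not part of the patch: why the MPAM tolerance above
 * matters. A VM saved on an older kernel may carry the hardware MPAM value
 * in its ID_AA64PFR0_EL1 snapshot; writing it back is accepted (with the
 * field quietly dropped) instead of failing the restore.
 * example_restore_aa64pfr0() is a made-up helper.
 */
static int example_restore_aa64pfr0(struct kvm_vcpu *vcpu,
				    const struct sys_reg_desc *rd)
{
	u64 val = read_id_reg(vcpu, rd);

	/* An older, MPAM-exposing KVM would have included the hardware value */
	val |= read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1) &
	       ID_AA64PFR0_EL1_MPAM_MASK;

	return set_id_aa64pfr0_el1(vcpu, rd, val);
}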
+
+static int set_id_aa64pfr1_el1(struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd, u64 user_val)
+{
+ u64 hw_val = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1);
+ u64 mpam_mask = ID_AA64PFR1_EL1_MPAM_frac_MASK;
+
+ /* See set_id_aa64pfr0_el1 for comment about MPAM */
+ if ((hw_val & mpam_mask) == (user_val & mpam_mask))
+ user_val &= ~ID_AA64PFR1_EL1_MPAM_frac_MASK;
+
+ return set_id_reg(vcpu, rd, user_val);
+}
+
+static int set_id_aa64mmfr0_el1(struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd, u64 user_val)
+{
+ u64 sanitized_val = kvm_read_sanitised_id_reg(vcpu, rd);
+ u64 tgran2_mask = ID_AA64MMFR0_EL1_TGRAN4_2_MASK |
+ ID_AA64MMFR0_EL1_TGRAN16_2_MASK |
+ ID_AA64MMFR0_EL1_TGRAN64_2_MASK;
+
+ if (vcpu_has_nv(vcpu) &&
+ ((sanitized_val & tgran2_mask) != (user_val & tgran2_mask)))
+ return -EINVAL;
+
+ return set_id_reg(vcpu, rd, user_val);
+}
+
+static int set_id_aa64mmfr2_el1(struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd, u64 user_val)
+{
+ u64 hw_val = read_sanitised_ftr_reg(SYS_ID_AA64MMFR2_EL1);
+ u64 nv_mask = ID_AA64MMFR2_EL1_NV_MASK;
+
+ /*
+ * We made the mistake of exposing the now-deprecated NV field, so
+ * allow userspace to write it, but silently ignore it.
+ */
+ if ((hw_val & nv_mask) == (user_val & nv_mask))
+ user_val &= ~nv_mask;
+
+ return set_id_reg(vcpu, rd, user_val);
+}
+
+static int set_ctr_el0(struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd, u64 user_val)
+{
+ u8 user_L1Ip = SYS_FIELD_GET(CTR_EL0, L1Ip, user_val);
+
+ /*
+ * Both AIVIVT (0b01) and VPIPT (0b00) are documented as reserved.
+ * Hence only allow L1Ip to be set to VIPT (0b10) or PIPT (0b11),
+ * based on what the hardware reports.
+ *
+ * Using a VIPT software model on PIPT hardware leads to
+ * over-invalidation but is still correct, so downgrading PIPT to
+ * VIPT is allowed while the reverse is not. This is enforced via
+ * arm64_ftr_safe_value(): the CTR_EL0 ftr_bits describe L1Ip as
+ * FTR_EXACT with VIPT as the safe value.
+ */
+ switch (user_L1Ip) {
+ case CTR_EL0_L1Ip_RESERVED_VPIPT:
+ case CTR_EL0_L1Ip_RESERVED_AIVIVT:
+ return -EINVAL;
+ case CTR_EL0_L1Ip_VIPT:
+ case CTR_EL0_L1Ip_PIPT:
+ return set_id_reg(vcpu, rd, user_val);
+ default:
+ return -ENOENT;
+ }
+}
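
/*
 * Editor's sketch, not part of the patch: the one L1Ip transition the policy
 * above permits. On PIPT hardware userspace may describe the cache as VIPT
 * (over-invalidation is harmless); the opposite direction is refused.
 * example_downgrade_l1ip() is a made-up helper.
 */
static int example_downgrade_l1ip(struct kvm_vcpu *vcpu,
				  const struct sys_reg_desc *rd)
{
	u64 val = kvm_read_vm_id_reg(vcpu->kvm, SYS_CTR_EL0);

	val &= ~CTR_EL0_L1Ip_MASK;
	val |= SYS_FIELD_PREP_ENUM(CTR_EL0, L1Ip, VIPT);

	/* Accepted on PIPT hosts; a PIPT request on a VIPT host would fail */
	return set_ctr_el0(vcpu, rd, val);
}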
+
/*
* cpufeature ID register user accessors
*
@@ -1837,7 +2070,7 @@ static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
ret = arm64_check_features(vcpu, rd, val);
if (!ret)
- IDREG(vcpu->kvm, id) = val;
+ kvm_set_vm_id_reg(vcpu->kvm, id, val);
mutex_unlock(&vcpu->kvm->arch.config_lock);
@@ -1853,6 +2086,18 @@ static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
return ret;
}
+void kvm_set_vm_id_reg(struct kvm *kvm, u32 reg, u64 val)
+{
+ u64 *p = __vm_id_reg(&kvm->arch, reg);
+
+ lockdep_assert_held(&kvm->arch.config_lock);
+
+ if (KVM_BUG_ON(kvm_vm_has_ran_once(kvm) || !p, kvm))
+ return;
+
+ *p = val;
+}
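
/*
 * Editor's sketch, not part of the patch: kvm_set_vm_id_reg() is only valid
 * with the config_lock held and before the VM has run. A caller hiding the
 * PMU version might look like this; example_hide_pmu() is a made-up helper.
 */
static void example_hide_pmu(struct kvm *kvm)
{
	u64 val;

	lockdep_assert_held(&kvm->arch.config_lock);

	val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64DFR0_EL1);
	val &= ~ID_AA64DFR0_EL1_PMUVer_MASK;
	kvm_set_vm_id_reg(kvm, SYS_ID_AA64DFR0_EL1, val);
}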
+
static int get_raz_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
u64 *val)
{
@@ -1872,7 +2117,7 @@ static bool access_ctr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
if (p->is_write)
return write_to_read_only(vcpu, p, r);
- p->regval = read_sanitised_ftr_reg(SYS_CTR_EL0);
+ p->regval = kvm_read_vm_id_reg(vcpu->kvm, SYS_CTR_EL0);
return true;
}
@@ -1935,7 +2180,7 @@ static u64 reset_clidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
* one cache line.
*/
if (kvm_has_mte(vcpu->kvm))
- clidr |= 2 << CLIDR_TTYPE_SHIFT(loc);
+ clidr |= 2ULL << CLIDR_TTYPE_SHIFT(loc);
__vcpu_sys_reg(vcpu, r->reg) = clidr;
@@ -2045,29 +2290,18 @@ static bool bad_redir_trap(struct kvm_vcpu *vcpu,
.val = v, \
}
-#define EL2_REG_VNCR(name, rst, v) EL2_REG(name, bad_vncr_trap, rst, v)
-#define EL2_REG_REDIR(name, rst, v) EL2_REG(name, bad_redir_trap, rst, v)
-
-/*
- * EL{0,1}2 registers are the EL2 view on an EL0 or EL1 register when
- * HCR_EL2.E2H==1, and only in the sysreg table for convenience of
- * handling traps. Given that, they are always hidden from userspace.
- */
-static unsigned int hidden_user_visibility(const struct kvm_vcpu *vcpu,
- const struct sys_reg_desc *rd)
-{
- return REG_HIDDEN_USER;
-}
-
-#define EL12_REG(name, acc, rst, v) { \
- SYS_DESC(SYS_##name##_EL12), \
+#define EL2_REG_FILTERED(name, acc, rst, v, filter) { \
+ SYS_DESC(SYS_##name), \
.access = acc, \
.reset = rst, \
- .reg = name##_EL1, \
+ .reg = name, \
+ .visibility = filter, \
.val = v, \
- .visibility = hidden_user_visibility, \
}
+#define EL2_REG_VNCR(name, rst, v) EL2_REG(name, bad_vncr_trap, rst, v)
+#define EL2_REG_REDIR(name, rst, v) EL2_REG(name, bad_redir_trap, rst, v)
+
/*
* Since reset() callback and field val are not used for idregs, they will be
* used for specific purposes for idregs.
@@ -2079,50 +2313,53 @@ static unsigned int hidden_user_visibility(const struct kvm_vcpu *vcpu,
* from userspace.
*/
+#define ID_DESC_DEFAULT_CALLBACKS \
+ .access = access_id_reg, \
+ .get_user = get_id_reg, \
+ .set_user = set_id_reg, \
+ .visibility = id_visibility, \
+ .reset = kvm_read_sanitised_id_reg
+
#define ID_DESC(name) \
SYS_DESC(SYS_##name), \
- .access = access_id_reg, \
- .get_user = get_id_reg \
+ ID_DESC_DEFAULT_CALLBACKS
/* sys_reg_desc initialiser for known cpufeature ID registers */
#define ID_SANITISED(name) { \
ID_DESC(name), \
- .set_user = set_id_reg, \
- .visibility = id_visibility, \
- .reset = kvm_read_sanitised_id_reg, \
.val = 0, \
}
/* sys_reg_desc initialiser for known cpufeature ID registers */
#define AA32_ID_SANITISED(name) { \
ID_DESC(name), \
- .set_user = set_id_reg, \
.visibility = aa32_id_visibility, \
- .reset = kvm_read_sanitised_id_reg, \
.val = 0, \
}
/* sys_reg_desc initialiser for writable ID registers */
#define ID_WRITABLE(name, mask) { \
ID_DESC(name), \
- .set_user = set_id_reg, \
- .visibility = id_visibility, \
- .reset = kvm_read_sanitised_id_reg, \
.val = mask, \
}
+/* sys_reg_desc initialiser for cpufeature ID registers that need filtering */
+#define ID_FILTERED(sysreg, name, mask) { \
+ ID_DESC(sysreg), \
+ .set_user = set_##name, \
+ .val = (mask), \
+}
+
/*
* sys_reg_desc initialiser for architecturally unallocated cpufeature ID
* register with encoding Op0=3, Op1=0, CRn=0, CRm=crm, Op2=op2
* (1 <= crm < 8, 0 <= Op2 < 8).
*/
#define ID_UNALLOCATED(crm, op2) { \
+ .name = "S3_0_0_" #crm "_" #op2, \
Op0(3), Op1(0), CRn(0), CRm(crm), Op2(op2), \
- .access = access_id_reg, \
- .get_user = get_id_reg, \
- .set_user = set_id_reg, \
+ ID_DESC_DEFAULT_CALLBACKS, \
.visibility = raz_visibility, \
- .reset = kvm_read_sanitised_id_reg, \
.val = 0, \
}
@@ -2133,9 +2370,7 @@ static unsigned int hidden_user_visibility(const struct kvm_vcpu *vcpu,
*/
#define ID_HIDDEN(name) { \
ID_DESC(name), \
- .set_user = set_id_reg, \
.visibility = raz_visibility, \
- .reset = kvm_read_sanitised_id_reg, \
.val = 0, \
}
@@ -2175,6 +2410,18 @@ static bool access_spsr(struct kvm_vcpu *vcpu,
return true;
}
+static bool access_cntkctl_el12(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ if (p->is_write)
+ __vcpu_sys_reg(vcpu, CNTKCTL_EL1) = p->regval;
+ else
+ p->regval = __vcpu_sys_reg(vcpu, CNTKCTL_EL1);
+
+ return true;
+}
+
static u64 reset_hcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
u64 val = r->val;
@@ -2185,6 +2432,274 @@ static u64 reset_hcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
return __vcpu_sys_reg(vcpu, r->reg) = val;
}
+static unsigned int __el2_visibility(const struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd,
+ unsigned int (*fn)(const struct kvm_vcpu *,
+ const struct sys_reg_desc *))
+{
+ return el2_visibility(vcpu, rd) ?: fn(vcpu, rd);
+}
+
+static unsigned int sve_el2_visibility(const struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd)
+{
+ return __el2_visibility(vcpu, rd, sve_visibility);
+}
+
+static bool access_zcr_el2(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ unsigned int vq;
+
+ if (guest_hyp_sve_traps_enabled(vcpu)) {
+ kvm_inject_nested_sve_trap(vcpu);
+ return true;
+ }
+
+ if (!p->is_write) {
+ p->regval = vcpu_read_sys_reg(vcpu, ZCR_EL2);
+ return true;
+ }
+
+ vq = SYS_FIELD_GET(ZCR_ELx, LEN, p->regval) + 1;
+ vq = min(vq, vcpu_sve_max_vq(vcpu));
+ vcpu_write_sys_reg(vcpu, vq - 1, ZCR_EL2);
+
+ return true;
+}
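
/*
 * Editor's sketch, not part of the patch: the LEN clamp performed above.
 * ZCR_ELx.LEN encodes (VL / 128) - 1, so a guest hypervisor asking for
 * LEN = 15 (2048-bit vectors) on a host limited to 512-bit vectors ends up
 * with ZCR_EL2.LEN == 3. example_zcr_el2_len() is a made-up helper.
 */
static inline u64 example_zcr_el2_len(struct kvm_vcpu *vcpu, u64 guest_len)
{
	unsigned int vq = min_t(unsigned int, guest_len + 1,
				vcpu_sve_max_vq(vcpu));

	return vq - 1;
}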
+
+static bool access_gic_vtr(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ if (p->is_write)
+ return write_to_read_only(vcpu, p, r);
+
+ p->regval = kvm_vgic_global_state.ich_vtr_el2;
+ p->regval &= ~(ICH_VTR_EL2_DVIM |
+ ICH_VTR_EL2_A3V |
+ ICH_VTR_EL2_IDbits);
+ p->regval |= ICH_VTR_EL2_nV4;
+
+ return true;
+}
+
+static bool access_gic_misr(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ if (p->is_write)
+ return write_to_read_only(vcpu, p, r);
+
+ p->regval = vgic_v3_get_misr(vcpu);
+
+ return true;
+}
+
+static bool access_gic_eisr(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ if (p->is_write)
+ return write_to_read_only(vcpu, p, r);
+
+ p->regval = vgic_v3_get_eisr(vcpu);
+
+ return true;
+}
+
+static bool access_gic_elrsr(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ if (p->is_write)
+ return write_to_read_only(vcpu, p, r);
+
+ p->regval = vgic_v3_get_elrsr(vcpu);
+
+ return true;
+}
+
+static unsigned int s1poe_visibility(const struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd)
+{
+ if (kvm_has_s1poe(vcpu->kvm))
+ return 0;
+
+ return REG_HIDDEN;
+}
+
+static unsigned int s1poe_el2_visibility(const struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd)
+{
+ return __el2_visibility(vcpu, rd, s1poe_visibility);
+}
+
+static unsigned int tcr2_visibility(const struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd)
+{
+ if (kvm_has_tcr2(vcpu->kvm))
+ return 0;
+
+ return REG_HIDDEN;
+}
+
+static unsigned int tcr2_el2_visibility(const struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd)
+{
+ return __el2_visibility(vcpu, rd, tcr2_visibility);
+}
+
+static unsigned int s1pie_visibility(const struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd)
+{
+ if (kvm_has_s1pie(vcpu->kvm))
+ return 0;
+
+ return REG_HIDDEN;
+}
+
+static unsigned int s1pie_el2_visibility(const struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd)
+{
+ return __el2_visibility(vcpu, rd, s1pie_visibility);
+}
+
+static bool access_mdcr(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ u64 old = __vcpu_sys_reg(vcpu, MDCR_EL2);
+
+ if (!access_rw(vcpu, p, r))
+ return false;
+
+ /*
+ * Request a reload of the PMU to enable/disable the counters affected
+ * by HPME.
+ */
+ if ((old ^ __vcpu_sys_reg(vcpu, MDCR_EL2)) & MDCR_EL2_HPME)
+ kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu);
+
+ return true;
+}
+
+/*
+ * For historical (ahem ABI) reasons, KVM treated MIDR_EL1, REVIDR_EL1, and
+ * AIDR_EL1 as "invariant" registers, meaning userspace cannot change them.
+ * The values made visible to userspace were the register values of the boot
+ * CPU.
+ *
+ * At the same time, reads from these registers at EL1 previously were not
+ * trapped, allowing the guest to read the actual hardware value. On big-little
+ * machines, this means the VM can see different values depending on where a
+ * given vCPU got scheduled.
+ *
+ * These registers are now trapped as collateral damage from SME, and what
+ * follows attempts to give a user / guest view consistent with the existing
+ * ABI.
+ */
+static bool access_imp_id_reg(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ if (p->is_write)
+ return write_to_read_only(vcpu, p, r);
+
+ /*
+ * Return the VM-scoped implementation ID register values if userspace
+ * has made them writable.
+ */
+ if (test_bit(KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS, &vcpu->kvm->arch.flags))
+ return access_id_reg(vcpu, p, r);
+
+ /*
+ * Otherwise, fall back to the old behavior of returning the value of
+ * the current CPU.
+ */
+ switch (reg_to_encoding(r)) {
+ case SYS_REVIDR_EL1:
+ p->regval = read_sysreg(revidr_el1);
+ break;
+ case SYS_AIDR_EL1:
+ p->regval = read_sysreg(aidr_el1);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ }
+
+ return true;
+}
+
+static u64 __ro_after_init boot_cpu_midr_val;
+static u64 __ro_after_init boot_cpu_revidr_val;
+static u64 __ro_after_init boot_cpu_aidr_val;
+
+static void init_imp_id_regs(void)
+{
+ boot_cpu_midr_val = read_sysreg(midr_el1);
+ boot_cpu_revidr_val = read_sysreg(revidr_el1);
+ boot_cpu_aidr_val = read_sysreg(aidr_el1);
+}
+
+static u64 reset_imp_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+{
+ switch (reg_to_encoding(r)) {
+ case SYS_MIDR_EL1:
+ return boot_cpu_midr_val;
+ case SYS_REVIDR_EL1:
+ return boot_cpu_revidr_val;
+ case SYS_AIDR_EL1:
+ return boot_cpu_aidr_val;
+ default:
+ KVM_BUG_ON(1, vcpu->kvm);
+ return 0;
+ }
+}
+
+static int set_imp_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
+ u64 val)
+{
+ struct kvm *kvm = vcpu->kvm;
+ u64 expected;
+
+ guard(mutex)(&kvm->arch.config_lock);
+
+ expected = read_id_reg(vcpu, r);
+ if (expected == val)
+ return 0;
+
+ if (!test_bit(KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS, &kvm->arch.flags))
+ return -EINVAL;
+
+ /*
+ * Once the VM has started, the ID registers are immutable. Reject
+ * the write if userspace tries to change them.
+ */
+ if (kvm_vm_has_ran_once(kvm))
+ return -EBUSY;
+
+ /*
+ * Any value is allowed for the implementation ID registers so long as
+ * it is within the writable mask.
+ */
+ if ((val & r->val) != val)
+ return -EINVAL;
+
+ kvm_set_vm_id_reg(kvm, reg_to_encoding(r), val);
+ return 0;
+}
+
+#define IMPLEMENTATION_ID(reg, mask) { \
+ SYS_DESC(SYS_##reg), \
+ .access = access_imp_id_reg, \
+ .get_user = get_id_reg, \
+ .set_user = set_imp_id_reg, \
+ .reset = reset_imp_id_reg, \
+ .val = mask, \
+}
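
/*
 * Editor's sketch, not part of the patch: how the mask in IMPLEMENTATION_ID()
 * combines with set_imp_id_reg(). MIDR_EL1 only advertises bits [31:0] as
 * writable, so with KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS set (and the VM not
 * yet started) a 32-bit value is accepted while anything touching bits
 * [63:32] gets -EINVAL. example_set_midr() is a made-up helper and the MIDR
 * value is arbitrary.
 */
static int example_set_midr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd)
{
	int ret = set_imp_id_reg(vcpu, rd, 0x410fd4f0);		/* inside the mask */

	if (!ret)
		ret = set_imp_id_reg(vcpu, rd, BIT_ULL(32));	/* outside: -EINVAL */

	return ret;
}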
+
/*
* Architected system registers.
* Important: Must be sorted ascending by Op0, Op1, CRn, CRm, Op2
@@ -2231,9 +2746,11 @@ static const struct sys_reg_desc sys_reg_descs[] = {
// DBGDTR[TR]X_EL0 share the same encoding
{ SYS_DESC(SYS_DBGDTRTX_EL0), trap_raz_wi },
- { SYS_DESC(SYS_DBGVCR32_EL2), trap_undef, reset_val, DBGVCR32_EL2, 0 },
+ { SYS_DESC(SYS_DBGVCR32_EL2), undef_access, reset_val, DBGVCR32_EL2, 0 },
+ IMPLEMENTATION_ID(MIDR_EL1, GENMASK_ULL(31, 0)),
{ SYS_DESC(SYS_MPIDR_EL1), NULL, reset_mpidr, MPIDR_EL1 },
+ IMPLEMENTATION_ID(REVIDR_EL1, GENMASK_ULL(63, 0)),
/*
* ID regs: all ID_SANITISED() entries here must have corresponding
@@ -2280,34 +2797,52 @@ static const struct sys_reg_desc sys_reg_descs[] = {
/* AArch64 ID registers */
/* CRm=4 */
- { SYS_DESC(SYS_ID_AA64PFR0_EL1),
- .access = access_id_reg,
- .get_user = get_id_reg,
- .set_user = set_id_reg,
- .reset = read_sanitised_id_aa64pfr0_el1,
- .val = ~(ID_AA64PFR0_EL1_AMU |
- ID_AA64PFR0_EL1_MPAM |
- ID_AA64PFR0_EL1_SVE |
- ID_AA64PFR0_EL1_RAS |
- ID_AA64PFR0_EL1_GIC |
- ID_AA64PFR0_EL1_AdvSIMD |
- ID_AA64PFR0_EL1_FP), },
- ID_SANITISED(ID_AA64PFR1_EL1),
- ID_UNALLOCATED(4,2),
+ ID_FILTERED(ID_AA64PFR0_EL1, id_aa64pfr0_el1,
+ ~(ID_AA64PFR0_EL1_AMU |
+ ID_AA64PFR0_EL1_MPAM |
+ ID_AA64PFR0_EL1_SVE |
+ ID_AA64PFR0_EL1_RAS |
+ ID_AA64PFR0_EL1_AdvSIMD |
+ ID_AA64PFR0_EL1_FP)),
+ ID_FILTERED(ID_AA64PFR1_EL1, id_aa64pfr1_el1,
+ ~(ID_AA64PFR1_EL1_PFAR |
+ ID_AA64PFR1_EL1_DF2 |
+ ID_AA64PFR1_EL1_MTEX |
+ ID_AA64PFR1_EL1_THE |
+ ID_AA64PFR1_EL1_GCS |
+ ID_AA64PFR1_EL1_MTE_frac |
+ ID_AA64PFR1_EL1_NMI |
+ ID_AA64PFR1_EL1_RNDR_trap |
+ ID_AA64PFR1_EL1_SME |
+ ID_AA64PFR1_EL1_RES0 |
+ ID_AA64PFR1_EL1_MPAM_frac |
+ ID_AA64PFR1_EL1_RAS_frac |
+ ID_AA64PFR1_EL1_MTE)),
+ ID_WRITABLE(ID_AA64PFR2_EL1, ID_AA64PFR2_EL1_FPMR),
ID_UNALLOCATED(4,3),
ID_WRITABLE(ID_AA64ZFR0_EL1, ~ID_AA64ZFR0_EL1_RES0),
ID_HIDDEN(ID_AA64SMFR0_EL1),
ID_UNALLOCATED(4,6),
- ID_UNALLOCATED(4,7),
+ ID_WRITABLE(ID_AA64FPFR0_EL1, ~ID_AA64FPFR0_EL1_RES0),
/* CRm=5 */
- { SYS_DESC(SYS_ID_AA64DFR0_EL1),
- .access = access_id_reg,
- .get_user = get_id_reg,
- .set_user = set_id_aa64dfr0_el1,
- .reset = read_sanitised_id_aa64dfr0_el1,
- .val = ID_AA64DFR0_EL1_PMUVer_MASK |
- ID_AA64DFR0_EL1_DebugVer_MASK, },
+ /*
+ * Prior to FEAT_Debugv8.9, the architecture defines context-aware
+ * breakpoints (CTX_CMPs) as the highest numbered breakpoints (BRPs).
+ * KVM does not trap + emulate the breakpoint registers, and as such
+ * cannot support a layout that misaligns with the underlying hardware.
+ * While it may be possible to describe a subset that aligns with
+ * hardware, just prevent changes to BRPs and CTX_CMPs altogether for
+ * simplicity.
+ *
+ * See DDI0487K.a, section D2.8.3 Breakpoint types and linking
+ * of breakpoints for more details.
+ */
+ ID_FILTERED(ID_AA64DFR0_EL1, id_aa64dfr0_el1,
+ ID_AA64DFR0_EL1_DoubleLock_MASK |
+ ID_AA64DFR0_EL1_WRPs_MASK |
+ ID_AA64DFR0_EL1_PMUVer_MASK |
+ ID_AA64DFR0_EL1_DebugVer_MASK),
ID_SANITISED(ID_AA64DFR1_EL1),
ID_UNALLOCATED(5,2),
ID_UNALLOCATED(5,3),
@@ -2325,32 +2860,34 @@ static const struct sys_reg_desc sys_reg_descs[] = {
ID_WRITABLE(ID_AA64ISAR2_EL1, ~(ID_AA64ISAR2_EL1_RES0 |
ID_AA64ISAR2_EL1_APA3 |
ID_AA64ISAR2_EL1_GPA3)),
- ID_UNALLOCATED(6,3),
+ ID_WRITABLE(ID_AA64ISAR3_EL1, (ID_AA64ISAR3_EL1_FPRCVT |
+ ID_AA64ISAR3_EL1_FAMINMAX)),
ID_UNALLOCATED(6,4),
ID_UNALLOCATED(6,5),
ID_UNALLOCATED(6,6),
ID_UNALLOCATED(6,7),
/* CRm=7 */
- ID_WRITABLE(ID_AA64MMFR0_EL1, ~(ID_AA64MMFR0_EL1_RES0 |
- ID_AA64MMFR0_EL1_TGRAN4_2 |
- ID_AA64MMFR0_EL1_TGRAN64_2 |
- ID_AA64MMFR0_EL1_TGRAN16_2)),
+ ID_FILTERED(ID_AA64MMFR0_EL1, id_aa64mmfr0_el1,
+ ~(ID_AA64MMFR0_EL1_RES0 |
+ ID_AA64MMFR0_EL1_ASIDBITS)),
ID_WRITABLE(ID_AA64MMFR1_EL1, ~(ID_AA64MMFR1_EL1_RES0 |
ID_AA64MMFR1_EL1_HCX |
- ID_AA64MMFR1_EL1_XNX |
ID_AA64MMFR1_EL1_TWED |
ID_AA64MMFR1_EL1_XNX |
ID_AA64MMFR1_EL1_VH |
ID_AA64MMFR1_EL1_VMIDBits)),
- ID_WRITABLE(ID_AA64MMFR2_EL1, ~(ID_AA64MMFR2_EL1_RES0 |
+ ID_FILTERED(ID_AA64MMFR2_EL1,
+ id_aa64mmfr2_el1, ~(ID_AA64MMFR2_EL1_RES0 |
ID_AA64MMFR2_EL1_EVT |
ID_AA64MMFR2_EL1_FWB |
ID_AA64MMFR2_EL1_IDS |
ID_AA64MMFR2_EL1_NV |
ID_AA64MMFR2_EL1_CCIDX)),
- ID_SANITISED(ID_AA64MMFR3_EL1),
- ID_SANITISED(ID_AA64MMFR4_EL1),
+ ID_WRITABLE(ID_AA64MMFR3_EL1, (ID_AA64MMFR3_EL1_TCRX |
+ ID_AA64MMFR3_EL1_S1PIE |
+ ID_AA64MMFR3_EL1_S1POE)),
+ ID_WRITABLE(ID_AA64MMFR4_EL1, ID_AA64MMFR4_EL1_NV_frac),
ID_UNALLOCATED(7,5),
ID_UNALLOCATED(7,6),
ID_UNALLOCATED(7,7),
@@ -2369,7 +2906,8 @@ static const struct sys_reg_desc sys_reg_descs[] = {
{ SYS_DESC(SYS_TTBR0_EL1), access_vm_reg, reset_unknown, TTBR0_EL1 },
{ SYS_DESC(SYS_TTBR1_EL1), access_vm_reg, reset_unknown, TTBR1_EL1 },
{ SYS_DESC(SYS_TCR_EL1), access_vm_reg, reset_val, TCR_EL1, 0 },
- { SYS_DESC(SYS_TCR2_EL1), access_vm_reg, reset_val, TCR2_EL1, 0 },
+ { SYS_DESC(SYS_TCR2_EL1), access_vm_reg, reset_val, TCR2_EL1, 0,
+ .visibility = tcr2_visibility },
PTRAUTH_KEY(APIA),
PTRAUTH_KEY(APIB),
@@ -2380,6 +2918,8 @@ static const struct sys_reg_desc sys_reg_descs[] = {
{ SYS_DESC(SYS_SPSR_EL1), access_spsr},
{ SYS_DESC(SYS_ELR_EL1), access_elr},
+ { SYS_DESC(SYS_ICC_PMR_EL1), undef_access },
+
{ SYS_DESC(SYS_AFSR0_EL1), access_vm_reg, reset_unknown, AFSR0_EL1 },
{ SYS_DESC(SYS_AFSR1_EL1), access_vm_reg, reset_unknown, AFSR1_EL1 },
{ SYS_DESC(SYS_ESR_EL1), access_vm_reg, reset_unknown, ESR_EL1 },
@@ -2421,31 +2961,51 @@ static const struct sys_reg_desc sys_reg_descs[] = {
{ SYS_DESC(SYS_PMMIR_EL1), trap_raz_wi },
{ SYS_DESC(SYS_MAIR_EL1), access_vm_reg, reset_unknown, MAIR_EL1 },
- { SYS_DESC(SYS_PIRE0_EL1), NULL, reset_unknown, PIRE0_EL1 },
- { SYS_DESC(SYS_PIR_EL1), NULL, reset_unknown, PIR_EL1 },
+ { SYS_DESC(SYS_PIRE0_EL1), NULL, reset_unknown, PIRE0_EL1,
+ .visibility = s1pie_visibility },
+ { SYS_DESC(SYS_PIR_EL1), NULL, reset_unknown, PIR_EL1,
+ .visibility = s1pie_visibility },
+ { SYS_DESC(SYS_POR_EL1), NULL, reset_unknown, POR_EL1,
+ .visibility = s1poe_visibility },
{ SYS_DESC(SYS_AMAIR_EL1), access_vm_reg, reset_amair_el1, AMAIR_EL1 },
{ SYS_DESC(SYS_LORSA_EL1), trap_loregion },
{ SYS_DESC(SYS_LOREA_EL1), trap_loregion },
{ SYS_DESC(SYS_LORN_EL1), trap_loregion },
{ SYS_DESC(SYS_LORC_EL1), trap_loregion },
+ { SYS_DESC(SYS_MPAMIDR_EL1), undef_access },
{ SYS_DESC(SYS_LORID_EL1), trap_loregion },
+ { SYS_DESC(SYS_MPAM1_EL1), undef_access },
+ { SYS_DESC(SYS_MPAM0_EL1), undef_access },
{ SYS_DESC(SYS_VBAR_EL1), access_rw, reset_val, VBAR_EL1, 0 },
{ SYS_DESC(SYS_DISR_EL1), NULL, reset_val, DISR_EL1, 0 },
- { SYS_DESC(SYS_ICC_IAR0_EL1), write_to_read_only },
- { SYS_DESC(SYS_ICC_EOIR0_EL1), read_from_write_only },
- { SYS_DESC(SYS_ICC_HPPIR0_EL1), write_to_read_only },
- { SYS_DESC(SYS_ICC_DIR_EL1), read_from_write_only },
- { SYS_DESC(SYS_ICC_RPR_EL1), write_to_read_only },
+ { SYS_DESC(SYS_ICC_IAR0_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_EOIR0_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_HPPIR0_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_BPR0_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_AP0R0_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_AP0R1_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_AP0R2_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_AP0R3_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_AP1R0_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_AP1R1_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_AP1R2_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_AP1R3_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_DIR_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_RPR_EL1), undef_access },
{ SYS_DESC(SYS_ICC_SGI1R_EL1), access_gic_sgi },
{ SYS_DESC(SYS_ICC_ASGI1R_EL1), access_gic_sgi },
{ SYS_DESC(SYS_ICC_SGI0R_EL1), access_gic_sgi },
- { SYS_DESC(SYS_ICC_IAR1_EL1), write_to_read_only },
- { SYS_DESC(SYS_ICC_EOIR1_EL1), read_from_write_only },
- { SYS_DESC(SYS_ICC_HPPIR1_EL1), write_to_read_only },
+ { SYS_DESC(SYS_ICC_IAR1_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_EOIR1_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_HPPIR1_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_BPR1_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_CTLR_EL1), undef_access },
{ SYS_DESC(SYS_ICC_SRE_EL1), access_gic_sre },
+ { SYS_DESC(SYS_ICC_IGRPEN0_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_IGRPEN1_EL1), undef_access },
{ SYS_DESC(SYS_CONTEXTIDR_EL1), access_vm_reg, reset_val, CONTEXTIDR_EL1, 0 },
{ SYS_DESC(SYS_TPIDR_EL1), NULL, reset_unknown, TPIDR_EL1 },
@@ -2458,12 +3018,19 @@ static const struct sys_reg_desc sys_reg_descs[] = {
{ SYS_DESC(SYS_CCSIDR_EL1), access_ccsidr },
{ SYS_DESC(SYS_CLIDR_EL1), access_clidr, reset_clidr, CLIDR_EL1,
- .set_user = set_clidr },
+ .set_user = set_clidr, .val = ~CLIDR_EL1_RES0 },
{ SYS_DESC(SYS_CCSIDR2_EL1), undef_access },
{ SYS_DESC(SYS_SMIDR_EL1), undef_access },
+ IMPLEMENTATION_ID(AIDR_EL1, GENMASK_ULL(63, 0)),
{ SYS_DESC(SYS_CSSELR_EL1), access_csselr, reset_unknown, CSSELR_EL1 },
- { SYS_DESC(SYS_CTR_EL0), access_ctr },
- { SYS_DESC(SYS_SVCR), undef_access },
+ ID_FILTERED(CTR_EL0, ctr_el0,
+ CTR_EL0_DIC_MASK |
+ CTR_EL0_IDC_MASK |
+ CTR_EL0_DminLine_MASK |
+ CTR_EL0_L1Ip_MASK |
+ CTR_EL0_IminLine_MASK),
+ { SYS_DESC(SYS_SVCR), undef_access, reset_val, SVCR, 0, .visibility = sme_visibility },
+ { SYS_DESC(SYS_FPMR), undef_access, reset_val, FPMR, 0, .visibility = fp8_visibility },
{ PMU_SYS_REG(PMCR_EL0), .access = access_pmcr, .reset = reset_pmcr,
.reg = PMCR_EL0, .get_user = get_pmcr, .set_user = set_pmcr },
@@ -2491,7 +3058,8 @@ static const struct sys_reg_desc sys_reg_descs[] = {
.access = access_pmceid, .reset = NULL },
{ PMU_SYS_REG(PMCCNTR_EL0),
.access = access_pmu_evcntr, .reset = reset_unknown,
- .reg = PMCCNTR_EL0, .get_user = get_pmu_evcntr},
+ .reg = PMCCNTR_EL0, .get_user = get_pmu_evcntr,
+ .set_user = set_pmu_evcntr },
{ PMU_SYS_REG(PMXEVTYPER_EL0),
.access = access_pmu_evtyper, .reset = NULL },
{ PMU_SYS_REG(PMXEVCNTR_EL0),
@@ -2506,6 +3074,8 @@ static const struct sys_reg_desc sys_reg_descs[] = {
.access = access_pmovs, .reg = PMOVSSET_EL0,
.get_user = get_pmreg, .set_user = set_pmreg },
+ { SYS_DESC(SYS_POR_EL0), NULL, reset_unknown, POR_EL0,
+ .visibility = s1poe_visibility },
{ SYS_DESC(SYS_TPIDR_EL0), NULL, reset_unknown, TPIDR_EL0 },
{ SYS_DESC(SYS_TPIDRRO_EL0), NULL, reset_unknown, TPIDRRO_EL0 },
{ SYS_DESC(SYS_TPIDR2_EL0), undef_access },
@@ -2586,11 +3156,17 @@ static const struct sys_reg_desc sys_reg_descs[] = {
AMU_AMEVTYPER1_EL0(15),
{ SYS_DESC(SYS_CNTPCT_EL0), access_arch_timer },
+ { SYS_DESC(SYS_CNTVCT_EL0), access_arch_timer },
{ SYS_DESC(SYS_CNTPCTSS_EL0), access_arch_timer },
+ { SYS_DESC(SYS_CNTVCTSS_EL0), access_arch_timer },
{ SYS_DESC(SYS_CNTP_TVAL_EL0), access_arch_timer },
{ SYS_DESC(SYS_CNTP_CTL_EL0), access_arch_timer },
{ SYS_DESC(SYS_CNTP_CVAL_EL0), access_arch_timer },
+ { SYS_DESC(SYS_CNTV_TVAL_EL0), access_arch_timer },
+ { SYS_DESC(SYS_CNTV_CTL_EL0), access_arch_timer },
+ { SYS_DESC(SYS_CNTV_CVAL_EL0), access_arch_timer },
+
/* PMEVCNTRn_EL0 */
PMU_PMEVCNTR_EL0(0),
PMU_PMEVCNTR_EL0(1),
@@ -2667,7 +3243,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
EL2_REG(SCTLR_EL2, access_rw, reset_val, SCTLR_EL2_RES1),
EL2_REG(ACTLR_EL2, access_rw, reset_val, 0),
EL2_REG_VNCR(HCR_EL2, reset_hcr, 0),
- EL2_REG(MDCR_EL2, access_rw, reset_val, 0),
+ EL2_REG(MDCR_EL2, access_mdcr, reset_val, 0),
EL2_REG(CPTR_EL2, access_rw, reset_val, CPTR_NVHE_EL2_RES1),
EL2_REG_VNCR(HSTR_EL2, reset_val, 0),
EL2_REG_VNCR(HFGRTR_EL2, reset_val, 0),
@@ -2675,15 +3251,20 @@ static const struct sys_reg_desc sys_reg_descs[] = {
EL2_REG_VNCR(HFGITR_EL2, reset_val, 0),
EL2_REG_VNCR(HACR_EL2, reset_val, 0),
+ EL2_REG_FILTERED(ZCR_EL2, access_zcr_el2, reset_val, 0,
+ sve_el2_visibility),
+
EL2_REG_VNCR(HCRX_EL2, reset_val, 0),
EL2_REG(TTBR0_EL2, access_rw, reset_val, 0),
EL2_REG(TTBR1_EL2, access_rw, reset_val, 0),
EL2_REG(TCR_EL2, access_rw, reset_val, TCR_EL2_RES1),
+ EL2_REG_FILTERED(TCR2_EL2, access_rw, reset_val, TCR2_EL2_RES1,
+ tcr2_el2_visibility),
EL2_REG_VNCR(VTTBR_EL2, reset_val, 0),
EL2_REG_VNCR(VTCR_EL2, reset_val, 0),
- { SYS_DESC(SYS_DACR32_EL2), trap_undef, reset_unknown, DACR32_EL2 },
+ { SYS_DESC(SYS_DACR32_EL2), undef_access, reset_unknown, DACR32_EL2 },
EL2_REG_VNCR(HDFGRTR_EL2, reset_val, 0),
EL2_REG_VNCR(HDFGWTR_EL2, reset_val, 0),
EL2_REG_VNCR(HAFGRTR_EL2, reset_val, 0),
@@ -2692,55 +3273,584 @@ static const struct sys_reg_desc sys_reg_descs[] = {
{ SYS_DESC(SYS_SP_EL1), access_sp_el1},
/* AArch32 SPSR_* are RES0 if trapped from a NV guest */
- { SYS_DESC(SYS_SPSR_irq), .access = trap_raz_wi,
- .visibility = hidden_user_visibility },
- { SYS_DESC(SYS_SPSR_abt), .access = trap_raz_wi,
- .visibility = hidden_user_visibility },
- { SYS_DESC(SYS_SPSR_und), .access = trap_raz_wi,
- .visibility = hidden_user_visibility },
- { SYS_DESC(SYS_SPSR_fiq), .access = trap_raz_wi,
- .visibility = hidden_user_visibility },
-
- { SYS_DESC(SYS_IFSR32_EL2), trap_undef, reset_unknown, IFSR32_EL2 },
+ { SYS_DESC(SYS_SPSR_irq), .access = trap_raz_wi },
+ { SYS_DESC(SYS_SPSR_abt), .access = trap_raz_wi },
+ { SYS_DESC(SYS_SPSR_und), .access = trap_raz_wi },
+ { SYS_DESC(SYS_SPSR_fiq), .access = trap_raz_wi },
+
+ { SYS_DESC(SYS_IFSR32_EL2), undef_access, reset_unknown, IFSR32_EL2 },
EL2_REG(AFSR0_EL2, access_rw, reset_val, 0),
EL2_REG(AFSR1_EL2, access_rw, reset_val, 0),
EL2_REG_REDIR(ESR_EL2, reset_val, 0),
- { SYS_DESC(SYS_FPEXC32_EL2), trap_undef, reset_val, FPEXC32_EL2, 0x700 },
+ { SYS_DESC(SYS_FPEXC32_EL2), undef_access, reset_val, FPEXC32_EL2, 0x700 },
EL2_REG_REDIR(FAR_EL2, reset_val, 0),
EL2_REG(HPFAR_EL2, access_rw, reset_val, 0),
EL2_REG(MAIR_EL2, access_rw, reset_val, 0),
+ EL2_REG_FILTERED(PIRE0_EL2, access_rw, reset_val, 0,
+ s1pie_el2_visibility),
+ EL2_REG_FILTERED(PIR_EL2, access_rw, reset_val, 0,
+ s1pie_el2_visibility),
+ EL2_REG_FILTERED(POR_EL2, access_rw, reset_val, 0,
+ s1poe_el2_visibility),
EL2_REG(AMAIR_EL2, access_rw, reset_val, 0),
+ { SYS_DESC(SYS_MPAMHCR_EL2), undef_access },
+ { SYS_DESC(SYS_MPAMVPMV_EL2), undef_access },
+ { SYS_DESC(SYS_MPAM2_EL2), undef_access },
+ { SYS_DESC(SYS_MPAMVPM0_EL2), undef_access },
+ { SYS_DESC(SYS_MPAMVPM1_EL2), undef_access },
+ { SYS_DESC(SYS_MPAMVPM2_EL2), undef_access },
+ { SYS_DESC(SYS_MPAMVPM3_EL2), undef_access },
+ { SYS_DESC(SYS_MPAMVPM4_EL2), undef_access },
+ { SYS_DESC(SYS_MPAMVPM5_EL2), undef_access },
+ { SYS_DESC(SYS_MPAMVPM6_EL2), undef_access },
+ { SYS_DESC(SYS_MPAMVPM7_EL2), undef_access },
EL2_REG(VBAR_EL2, access_rw, reset_val, 0),
EL2_REG(RVBAR_EL2, access_rw, reset_val, 0),
- { SYS_DESC(SYS_RMR_EL2), trap_undef },
+ { SYS_DESC(SYS_RMR_EL2), undef_access },
+
+ EL2_REG_VNCR(ICH_AP0R0_EL2, reset_val, 0),
+ EL2_REG_VNCR(ICH_AP0R1_EL2, reset_val, 0),
+ EL2_REG_VNCR(ICH_AP0R2_EL2, reset_val, 0),
+ EL2_REG_VNCR(ICH_AP0R3_EL2, reset_val, 0),
+ EL2_REG_VNCR(ICH_AP1R0_EL2, reset_val, 0),
+ EL2_REG_VNCR(ICH_AP1R1_EL2, reset_val, 0),
+ EL2_REG_VNCR(ICH_AP1R2_EL2, reset_val, 0),
+ EL2_REG_VNCR(ICH_AP1R3_EL2, reset_val, 0),
+
+ { SYS_DESC(SYS_ICC_SRE_EL2), access_gic_sre },
+
+ EL2_REG_VNCR(ICH_HCR_EL2, reset_val, 0),
+ { SYS_DESC(SYS_ICH_VTR_EL2), access_gic_vtr },
+ { SYS_DESC(SYS_ICH_MISR_EL2), access_gic_misr },
+ { SYS_DESC(SYS_ICH_EISR_EL2), access_gic_eisr },
+ { SYS_DESC(SYS_ICH_ELRSR_EL2), access_gic_elrsr },
+ EL2_REG_VNCR(ICH_VMCR_EL2, reset_val, 0),
+
+ EL2_REG_VNCR(ICH_LR0_EL2, reset_val, 0),
+ EL2_REG_VNCR(ICH_LR1_EL2, reset_val, 0),
+ EL2_REG_VNCR(ICH_LR2_EL2, reset_val, 0),
+ EL2_REG_VNCR(ICH_LR3_EL2, reset_val, 0),
+ EL2_REG_VNCR(ICH_LR4_EL2, reset_val, 0),
+ EL2_REG_VNCR(ICH_LR5_EL2, reset_val, 0),
+ EL2_REG_VNCR(ICH_LR6_EL2, reset_val, 0),
+ EL2_REG_VNCR(ICH_LR7_EL2, reset_val, 0),
+ EL2_REG_VNCR(ICH_LR8_EL2, reset_val, 0),
+ EL2_REG_VNCR(ICH_LR9_EL2, reset_val, 0),
+ EL2_REG_VNCR(ICH_LR10_EL2, reset_val, 0),
+ EL2_REG_VNCR(ICH_LR11_EL2, reset_val, 0),
+ EL2_REG_VNCR(ICH_LR12_EL2, reset_val, 0),
+ EL2_REG_VNCR(ICH_LR13_EL2, reset_val, 0),
+ EL2_REG_VNCR(ICH_LR14_EL2, reset_val, 0),
+ EL2_REG_VNCR(ICH_LR15_EL2, reset_val, 0),
EL2_REG(CONTEXTIDR_EL2, access_rw, reset_val, 0),
EL2_REG(TPIDR_EL2, access_rw, reset_val, 0),
EL2_REG_VNCR(CNTVOFF_EL2, reset_val, 0),
EL2_REG(CNTHCTL_EL2, access_rw, reset_val, 0),
+ { SYS_DESC(SYS_CNTHP_TVAL_EL2), access_arch_timer },
+ EL2_REG(CNTHP_CTL_EL2, access_arch_timer, reset_val, 0),
+ EL2_REG(CNTHP_CVAL_EL2, access_arch_timer, reset_val, 0),
- EL12_REG(CNTKCTL, access_rw, reset_val, 0),
+ { SYS_DESC(SYS_CNTHV_TVAL_EL2), access_hv_timer },
+ EL2_REG(CNTHV_CTL_EL2, access_hv_timer, reset_val, 0),
+ EL2_REG(CNTHV_CVAL_EL2, access_hv_timer, reset_val, 0),
+
+ { SYS_DESC(SYS_CNTKCTL_EL12), access_cntkctl_el12 },
+
+ { SYS_DESC(SYS_CNTP_TVAL_EL02), access_arch_timer },
+ { SYS_DESC(SYS_CNTP_CTL_EL02), access_arch_timer },
+ { SYS_DESC(SYS_CNTP_CVAL_EL02), access_arch_timer },
+
+ { SYS_DESC(SYS_CNTV_TVAL_EL02), access_arch_timer },
+ { SYS_DESC(SYS_CNTV_CTL_EL02), access_arch_timer },
+ { SYS_DESC(SYS_CNTV_CVAL_EL02), access_arch_timer },
EL2_REG(SP_EL2, NULL, reset_unknown, 0),
};
+static bool handle_at_s1e01(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ u32 op = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2);
+
+ __kvm_at_s1e01(vcpu, op, p->regval);
+
+ return true;
+}
+
+static bool handle_at_s1e2(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ u32 op = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2);
+
+ /* There is no FGT associated with AT S1E2A :-( */
+ if (op == OP_AT_S1E2A &&
+ !kvm_has_feat(vcpu->kvm, ID_AA64ISAR2_EL1, ATS1A, IMP)) {
+ kvm_inject_undefined(vcpu);
+ return false;
+ }
+
+ __kvm_at_s1e2(vcpu, op, p->regval);
+
+ return true;
+}
+
+static bool handle_at_s12(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ u32 op = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2);
+
+ __kvm_at_s12(vcpu, op, p->regval);
+
+ return true;
+}
+
+static bool kvm_supported_tlbi_s12_op(struct kvm_vcpu *vcpu, u32 instr)
+{
+ struct kvm *kvm = vcpu->kvm;
+ u8 CRm = sys_reg_CRm(instr);
+
+ if (sys_reg_CRn(instr) == TLBI_CRn_nXS &&
+ !kvm_has_feat(kvm, ID_AA64ISAR1_EL1, XS, IMP))
+ return false;
+
+ if (CRm == TLBI_CRm_nROS &&
+ !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, OS))
+ return false;
+
+ return true;
+}
+
+static bool handle_alle1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2);
+
+ if (!kvm_supported_tlbi_s12_op(vcpu, sys_encoding))
+ return undef_access(vcpu, p, r);
+
+ write_lock(&vcpu->kvm->mmu_lock);
+
+ /*
+ * Drop all shadow S2s, resulting in S1/S2 TLBIs for each of the
+ * corresponding VMIDs.
+ */
+ kvm_nested_s2_unmap(vcpu->kvm, true);
+
+ write_unlock(&vcpu->kvm->mmu_lock);
+
+ return true;
+}
+
+static bool kvm_supported_tlbi_ipas2_op(struct kvm_vcpu *vcpu, u32 instr)
+{
+ struct kvm *kvm = vcpu->kvm;
+ u8 CRm = sys_reg_CRm(instr);
+ u8 Op2 = sys_reg_Op2(instr);
+
+ if (sys_reg_CRn(instr) == TLBI_CRn_nXS &&
+ !kvm_has_feat(kvm, ID_AA64ISAR1_EL1, XS, IMP))
+ return false;
+
+ if (CRm == TLBI_CRm_IPAIS && (Op2 == 2 || Op2 == 6) &&
+ !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, RANGE))
+ return false;
+
+ if (CRm == TLBI_CRm_IPAONS && (Op2 == 0 || Op2 == 4) &&
+ !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, OS))
+ return false;
+
+ if (CRm == TLBI_CRm_IPAONS && (Op2 == 3 || Op2 == 7) &&
+ !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, RANGE))
+ return false;
+
+ return true;
+}
+
+/* Only defined here as this is an internal "abstraction" */
+union tlbi_info {
+ struct {
+ u64 start;
+ u64 size;
+ } range;
+
+ struct {
+ u64 addr;
+ } ipa;
+
+ struct {
+ u64 addr;
+ u32 encoding;
+ } va;
+};
+
+static void s2_mmu_unmap_range(struct kvm_s2_mmu *mmu,
+ const union tlbi_info *info)
+{
+ /*
+ * The unmap operation is allowed to drop the MMU lock and block, which
+ * means that @mmu could be used for a different context than the one
+ * currently being invalidated.
+ *
+ * This behavior is still safe, as:
+ *
+ * 1) The vCPU(s) that recycled the MMU are responsible for invalidating
+ * the entire MMU before reusing it, which still honors the intent
+ * of a TLBI.
+ *
+ * 2) Until the guest TLBI instruction is 'retired' (i.e. increment PC
+ * and ERET to the guest), other vCPUs are allowed to use stale
+ * translations.
+ *
+ * 3) Accidentally unmapping an unrelated MMU context is nonfatal, and
+ * at worst may cause more aborts for shadow stage-2 fills.
+ *
+ * Dropping the MMU lock also implies that shadow stage-2 fills could
+ * happen behind the back of the TLBI. This is still safe, though, as
+ * the L1 needs to put its stage-2 in a consistent state before doing
+ * the TLBI.
+ */
+ kvm_stage2_unmap_range(mmu, info->range.start, info->range.size, true);
+}
+
+static bool handle_vmalls12e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2);
+ u64 limit, vttbr;
+
+ if (!kvm_supported_tlbi_s12_op(vcpu, sys_encoding))
+ return undef_access(vcpu, p, r);
+
+ vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2);
+ limit = BIT_ULL(kvm_get_pa_bits(vcpu->kvm));
+
+ kvm_s2_mmu_iterate_by_vmid(vcpu->kvm, get_vmid(vttbr),
+ &(union tlbi_info) {
+ .range = {
+ .start = 0,
+ .size = limit,
+ },
+ },
+ s2_mmu_unmap_range);
+
+ return true;
+}
+
+static bool handle_ripas2e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2);
+ u64 vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2);
+ u64 base, range, tg, num, scale;
+ int shift;
+
+ if (!kvm_supported_tlbi_ipas2_op(vcpu, sys_encoding))
+ return undef_access(vcpu, p, r);
+
+ /*
+ * Because the shadow S2 structure doesn't necessarily reflect that
+ * of the guest's S2 (different base granule size, for example), we
+ * decide to ignore TTL and only use the described range.
+ */
+ tg = FIELD_GET(GENMASK(47, 46), p->regval);
+ scale = FIELD_GET(GENMASK(45, 44), p->regval);
+ num = FIELD_GET(GENMASK(43, 39), p->regval);
+ base = p->regval & GENMASK(36, 0);
+
+ switch (tg) {
+ case 1:
+ shift = 12;
+ break;
+ case 2:
+ shift = 14;
+ break;
+ case 3:
+ default: /* IMPDEF: handle tg==0 as 64k */
+ shift = 16;
+ break;
+ }
+
+ base <<= shift;
+ range = __TLBI_RANGE_PAGES(num, scale) << shift;
+
+ kvm_s2_mmu_iterate_by_vmid(vcpu->kvm, get_vmid(vttbr),
+ &(union tlbi_info) {
+ .range = {
+ .start = base,
+ .size = range,
+ },
+ },
+ s2_mmu_unmap_range);
+
+ return true;
+}
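
/*
 * Editor's sketch, not part of the patch: a worked instance of the decode
 * above. __TLBI_RANGE_PAGES(num, scale) is (num + 1) << (5 * scale + 1)
 * pages (see the arm64 tlbflush definitions), so for TG = 0b01 (4KiB),
 * SCALE = 1, NUM = 3 the invalidated window is 256 pages, i.e. 1MiB.
 * example_rtlbi_bytes() is a made-up helper.
 */
static inline u64 example_rtlbi_bytes(void)
{
	u64 num = 3, scale = 1, shift = 12;	/* TG=0b01: 4KiB granule */

	return __TLBI_RANGE_PAGES(num, scale) << shift;	/* 0x100000 */
}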
+
+static void s2_mmu_unmap_ipa(struct kvm_s2_mmu *mmu,
+ const union tlbi_info *info)
+{
+ unsigned long max_size;
+ u64 base_addr;
+
+ /*
+ * We drop a number of things from the supplied value:
+ *
+ * - NS bit: we're non-secure only.
+ *
+ * - IPA[51:48]: We don't support 52-bit IPAs just yet...
+ *
+ * And of course, shift the IPA back into an actual address.
+ */
+ base_addr = (info->ipa.addr & GENMASK_ULL(35, 0)) << 12;
+ max_size = compute_tlb_inval_range(mmu, info->ipa.addr);
+ base_addr &= ~(max_size - 1);
+
+ /*
+ * See comment in s2_mmu_unmap_range() for why this is allowed to
+ * reschedule.
+ */
+ kvm_stage2_unmap_range(mmu, base_addr, max_size, true);
+}
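
/*
 * Editor's sketch, not part of the patch: the address reconstruction above.
 * TLBI IPAS2E1* carries IPA[47:12] in Xt[35:0], so the IPA is rebuilt with a
 * 12-bit shift before being aligned down to the shadow block size.
 * example_ipas2_base() is a made-up helper.
 */
static inline u64 example_ipas2_base(u64 xt)
{
	/* e.g. xt == 0x81234 describes IPA 0x81234000 */
	return (xt & GENMASK_ULL(35, 0)) << 12;
}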
+
+static bool handle_ipas2e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2);
+ u64 vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2);
+
+ if (!kvm_supported_tlbi_ipas2_op(vcpu, sys_encoding))
+ return undef_access(vcpu, p, r);
+
+ kvm_s2_mmu_iterate_by_vmid(vcpu->kvm, get_vmid(vttbr),
+ &(union tlbi_info) {
+ .ipa = {
+ .addr = p->regval,
+ },
+ },
+ s2_mmu_unmap_ipa);
+
+ return true;
+}
+
+static void s2_mmu_tlbi_s1e1(struct kvm_s2_mmu *mmu,
+ const union tlbi_info *info)
+{
+ WARN_ON(__kvm_tlbi_s1e2(mmu, info->va.addr, info->va.encoding));
+}
+
+static bool handle_tlbi_el1(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2);
+ u64 vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2);
+
+ /*
+ * If we're here, this is because we've trapped on an EL1 TLBI
+ * instruction that affects the EL1 translation regime while
+ * we're running in a context that doesn't allow us to let the
+ * HW do its thing (aka vEL2):
+ *
+ * - HCR_EL2.E2H == 0 : a non-VHE guest
+ * - HCR_EL2.{E2H,TGE} == { 1, 0 } : a VHE guest in guest mode
+ *
+ * We don't expect these helpers to ever be called when running
+ * in a vEL1 context.
+ */
+
+ WARN_ON(!vcpu_is_el2(vcpu));
+
+ if (!kvm_supported_tlbi_s1e1_op(vcpu, sys_encoding))
+ return undef_access(vcpu, p, r);
+
+ kvm_s2_mmu_iterate_by_vmid(vcpu->kvm, get_vmid(vttbr),
+ &(union tlbi_info) {
+ .va = {
+ .addr = p->regval,
+ .encoding = sys_encoding,
+ },
+ },
+ s2_mmu_tlbi_s1e1);
+
+ return true;
+}
+
+#define SYS_INSN(insn, access_fn) \
+ { \
+ SYS_DESC(OP_##insn), \
+ .access = (access_fn), \
+ }
+
static struct sys_reg_desc sys_insn_descs[] = {
{ SYS_DESC(SYS_DC_ISW), access_dcsw },
{ SYS_DESC(SYS_DC_IGSW), access_dcgsw },
{ SYS_DESC(SYS_DC_IGDSW), access_dcgsw },
+
+ SYS_INSN(AT_S1E1R, handle_at_s1e01),
+ SYS_INSN(AT_S1E1W, handle_at_s1e01),
+ SYS_INSN(AT_S1E0R, handle_at_s1e01),
+ SYS_INSN(AT_S1E0W, handle_at_s1e01),
+ SYS_INSN(AT_S1E1RP, handle_at_s1e01),
+ SYS_INSN(AT_S1E1WP, handle_at_s1e01),
+
{ SYS_DESC(SYS_DC_CSW), access_dcsw },
{ SYS_DESC(SYS_DC_CGSW), access_dcgsw },
{ SYS_DESC(SYS_DC_CGDSW), access_dcgsw },
{ SYS_DESC(SYS_DC_CISW), access_dcsw },
{ SYS_DESC(SYS_DC_CIGSW), access_dcgsw },
{ SYS_DESC(SYS_DC_CIGDSW), access_dcgsw },
-};
-static const struct sys_reg_desc *first_idreg;
+ SYS_INSN(TLBI_VMALLE1OS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VAE1OS, handle_tlbi_el1),
+ SYS_INSN(TLBI_ASIDE1OS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VAAE1OS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VALE1OS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VAALE1OS, handle_tlbi_el1),
+
+ SYS_INSN(TLBI_RVAE1IS, handle_tlbi_el1),
+ SYS_INSN(TLBI_RVAAE1IS, handle_tlbi_el1),
+ SYS_INSN(TLBI_RVALE1IS, handle_tlbi_el1),
+ SYS_INSN(TLBI_RVAALE1IS, handle_tlbi_el1),
+
+ SYS_INSN(TLBI_VMALLE1IS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VAE1IS, handle_tlbi_el1),
+ SYS_INSN(TLBI_ASIDE1IS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VAAE1IS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VALE1IS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VAALE1IS, handle_tlbi_el1),
+
+ SYS_INSN(TLBI_RVAE1OS, handle_tlbi_el1),
+ SYS_INSN(TLBI_RVAAE1OS, handle_tlbi_el1),
+ SYS_INSN(TLBI_RVALE1OS, handle_tlbi_el1),
+ SYS_INSN(TLBI_RVAALE1OS, handle_tlbi_el1),
+
+ SYS_INSN(TLBI_RVAE1, handle_tlbi_el1),
+ SYS_INSN(TLBI_RVAAE1, handle_tlbi_el1),
+ SYS_INSN(TLBI_RVALE1, handle_tlbi_el1),
+ SYS_INSN(TLBI_RVAALE1, handle_tlbi_el1),
+
+ SYS_INSN(TLBI_VMALLE1, handle_tlbi_el1),
+ SYS_INSN(TLBI_VAE1, handle_tlbi_el1),
+ SYS_INSN(TLBI_ASIDE1, handle_tlbi_el1),
+ SYS_INSN(TLBI_VAAE1, handle_tlbi_el1),
+ SYS_INSN(TLBI_VALE1, handle_tlbi_el1),
+ SYS_INSN(TLBI_VAALE1, handle_tlbi_el1),
+
+ SYS_INSN(TLBI_VMALLE1OSNXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VAE1OSNXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_ASIDE1OSNXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VAAE1OSNXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VALE1OSNXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VAALE1OSNXS, handle_tlbi_el1),
+
+ SYS_INSN(TLBI_RVAE1ISNXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_RVAAE1ISNXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_RVALE1ISNXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_RVAALE1ISNXS, handle_tlbi_el1),
+
+ SYS_INSN(TLBI_VMALLE1ISNXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VAE1ISNXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_ASIDE1ISNXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VAAE1ISNXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VALE1ISNXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VAALE1ISNXS, handle_tlbi_el1),
+
+ SYS_INSN(TLBI_RVAE1OSNXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_RVAAE1OSNXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_RVALE1OSNXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_RVAALE1OSNXS, handle_tlbi_el1),
+
+ SYS_INSN(TLBI_RVAE1NXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_RVAAE1NXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_RVALE1NXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_RVAALE1NXS, handle_tlbi_el1),
+
+ SYS_INSN(TLBI_VMALLE1NXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VAE1NXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_ASIDE1NXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VAAE1NXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VALE1NXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VAALE1NXS, handle_tlbi_el1),
+
+ SYS_INSN(AT_S1E2R, handle_at_s1e2),
+ SYS_INSN(AT_S1E2W, handle_at_s1e2),
+ SYS_INSN(AT_S12E1R, handle_at_s12),
+ SYS_INSN(AT_S12E1W, handle_at_s12),
+ SYS_INSN(AT_S12E0R, handle_at_s12),
+ SYS_INSN(AT_S12E0W, handle_at_s12),
+ SYS_INSN(AT_S1E2A, handle_at_s1e2),
+
+ SYS_INSN(TLBI_IPAS2E1IS, handle_ipas2e1is),
+ SYS_INSN(TLBI_RIPAS2E1IS, handle_ripas2e1is),
+ SYS_INSN(TLBI_IPAS2LE1IS, handle_ipas2e1is),
+ SYS_INSN(TLBI_RIPAS2LE1IS, handle_ripas2e1is),
+
+ SYS_INSN(TLBI_ALLE2OS, undef_access),
+ SYS_INSN(TLBI_VAE2OS, undef_access),
+ SYS_INSN(TLBI_ALLE1OS, handle_alle1is),
+ SYS_INSN(TLBI_VALE2OS, undef_access),
+ SYS_INSN(TLBI_VMALLS12E1OS, handle_vmalls12e1is),
+
+ SYS_INSN(TLBI_RVAE2IS, undef_access),
+ SYS_INSN(TLBI_RVALE2IS, undef_access),
+
+ SYS_INSN(TLBI_ALLE1IS, handle_alle1is),
+ SYS_INSN(TLBI_VMALLS12E1IS, handle_vmalls12e1is),
+ SYS_INSN(TLBI_IPAS2E1OS, handle_ipas2e1is),
+ SYS_INSN(TLBI_IPAS2E1, handle_ipas2e1is),
+ SYS_INSN(TLBI_RIPAS2E1, handle_ripas2e1is),
+ SYS_INSN(TLBI_RIPAS2E1OS, handle_ripas2e1is),
+ SYS_INSN(TLBI_IPAS2LE1OS, handle_ipas2e1is),
+ SYS_INSN(TLBI_IPAS2LE1, handle_ipas2e1is),
+ SYS_INSN(TLBI_RIPAS2LE1, handle_ripas2e1is),
+ SYS_INSN(TLBI_RIPAS2LE1OS, handle_ripas2e1is),
+ SYS_INSN(TLBI_RVAE2OS, undef_access),
+ SYS_INSN(TLBI_RVALE2OS, undef_access),
+ SYS_INSN(TLBI_RVAE2, undef_access),
+ SYS_INSN(TLBI_RVALE2, undef_access),
+ SYS_INSN(TLBI_ALLE1, handle_alle1is),
+ SYS_INSN(TLBI_VMALLS12E1, handle_vmalls12e1is),
+
+ SYS_INSN(TLBI_IPAS2E1ISNXS, handle_ipas2e1is),
+ SYS_INSN(TLBI_RIPAS2E1ISNXS, handle_ripas2e1is),
+ SYS_INSN(TLBI_IPAS2LE1ISNXS, handle_ipas2e1is),
+ SYS_INSN(TLBI_RIPAS2LE1ISNXS, handle_ripas2e1is),
+
+ SYS_INSN(TLBI_ALLE2OSNXS, undef_access),
+ SYS_INSN(TLBI_VAE2OSNXS, undef_access),
+ SYS_INSN(TLBI_ALLE1OSNXS, handle_alle1is),
+ SYS_INSN(TLBI_VALE2OSNXS, undef_access),
+ SYS_INSN(TLBI_VMALLS12E1OSNXS, handle_vmalls12e1is),
+
+ SYS_INSN(TLBI_RVAE2ISNXS, undef_access),
+ SYS_INSN(TLBI_RVALE2ISNXS, undef_access),
+ SYS_INSN(TLBI_ALLE2ISNXS, undef_access),
+ SYS_INSN(TLBI_VAE2ISNXS, undef_access),
+
+ SYS_INSN(TLBI_ALLE1ISNXS, handle_alle1is),
+ SYS_INSN(TLBI_VALE2ISNXS, undef_access),
+ SYS_INSN(TLBI_VMALLS12E1ISNXS, handle_vmalls12e1is),
+ SYS_INSN(TLBI_IPAS2E1OSNXS, handle_ipas2e1is),
+ SYS_INSN(TLBI_IPAS2E1NXS, handle_ipas2e1is),
+ SYS_INSN(TLBI_RIPAS2E1NXS, handle_ripas2e1is),
+ SYS_INSN(TLBI_RIPAS2E1OSNXS, handle_ripas2e1is),
+ SYS_INSN(TLBI_IPAS2LE1OSNXS, handle_ipas2e1is),
+ SYS_INSN(TLBI_IPAS2LE1NXS, handle_ipas2e1is),
+ SYS_INSN(TLBI_RIPAS2LE1NXS, handle_ripas2e1is),
+ SYS_INSN(TLBI_RIPAS2LE1OSNXS, handle_ripas2e1is),
+ SYS_INSN(TLBI_RVAE2OSNXS, undef_access),
+ SYS_INSN(TLBI_RVALE2OSNXS, undef_access),
+ SYS_INSN(TLBI_RVAE2NXS, undef_access),
+ SYS_INSN(TLBI_RVALE2NXS, undef_access),
+ SYS_INSN(TLBI_ALLE2NXS, undef_access),
+ SYS_INSN(TLBI_VAE2NXS, undef_access),
+ SYS_INSN(TLBI_ALLE1NXS, handle_alle1is),
+ SYS_INSN(TLBI_VALE2NXS, undef_access),
+ SYS_INSN(TLBI_VMALLS12E1NXS, handle_vmalls12e1is),
+};
static bool trap_dbgdidr(struct kvm_vcpu *vcpu,
struct sys_reg_params *p,
@@ -2749,7 +3859,7 @@ static bool trap_dbgdidr(struct kvm_vcpu *vcpu,
if (p->is_write) {
return ignore_write(vcpu, p);
} else {
- u64 dfr = IDREG(vcpu->kvm, SYS_ID_AA64DFR0_EL1);
+ u64 dfr = kvm_read_vm_id_reg(vcpu->kvm, SYS_ID_AA64DFR0_EL1);
u32 el3 = kvm_has_feat(vcpu->kvm, ID_AA64PFR0_EL1, EL3, IMP);
p->regval = ((SYS_FIELD_GET(ID_AA64DFR0_EL1, WRPs, dfr) << 28) |
@@ -2770,18 +3880,20 @@ static bool trap_dbgdidr(struct kvm_vcpu *vcpu,
* None of the other registers share their location, so treat them as
* if they were 64bit.
*/
-#define DBG_BCR_BVR_WCR_WVR(n) \
- /* DBGBVRn */ \
- { AA32(LO), Op1( 0), CRn( 0), CRm((n)), Op2( 4), trap_bvr, NULL, n }, \
- /* DBGBCRn */ \
- { Op1( 0), CRn( 0), CRm((n)), Op2( 5), trap_bcr, NULL, n }, \
- /* DBGWVRn */ \
- { Op1( 0), CRn( 0), CRm((n)), Op2( 6), trap_wvr, NULL, n }, \
- /* DBGWCRn */ \
- { Op1( 0), CRn( 0), CRm((n)), Op2( 7), trap_wcr, NULL, n }
-
-#define DBGBXVR(n) \
- { AA32(HI), Op1( 0), CRn( 1), CRm((n)), Op2( 1), trap_bvr, NULL, n }
+#define DBG_BCR_BVR_WCR_WVR(n) \
+ /* DBGBVRn */ \
+ { AA32(LO), Op1( 0), CRn( 0), CRm((n)), Op2( 4), \
+ trap_dbg_wb_reg, NULL, n }, \
+ /* DBGBCRn */ \
+ { Op1( 0), CRn( 0), CRm((n)), Op2( 5), trap_dbg_wb_reg, NULL, n }, \
+ /* DBGWVRn */ \
+ { Op1( 0), CRn( 0), CRm((n)), Op2( 6), trap_dbg_wb_reg, NULL, n }, \
+ /* DBGWCRn */ \
+ { Op1( 0), CRn( 0), CRm((n)), Op2( 7), trap_dbg_wb_reg, NULL, n }
+
+#define DBGBXVR(n) \
+ { AA32(HI), Op1( 0), CRn( 1), CRm((n)), Op2( 1), \
+ trap_dbg_wb_reg, NULL, n }
/*
* Trapped cp14 registers. We generally ignore most of the external
@@ -2916,6 +4028,7 @@ static const struct sys_reg_desc cp15_regs[] = {
/* TTBCR2 */
{ AA32(HI), Op1( 0), CRn( 2), CRm( 0), Op2( 3), access_vm_reg, NULL, TCR_EL1 },
{ Op1( 0), CRn( 3), CRm( 0), Op2( 0), access_vm_reg, NULL, DACR32_EL2 },
+ { CP15_SYS_DESC(SYS_ICC_PMR_EL1), undef_access },
/* DFSR */
{ Op1( 0), CRn( 5), CRm( 0), Op2( 0), access_vm_reg, NULL, ESR_EL1 },
{ Op1( 0), CRn( 5), CRm( 0), Op2( 1), access_vm_reg, NULL, IFSR32_EL2 },
@@ -2965,8 +4078,28 @@ static const struct sys_reg_desc cp15_regs[] = {
/* AMAIR1 */
{ AA32(HI), Op1( 0), CRn(10), CRm( 3), Op2( 1), access_vm_reg, NULL, AMAIR_EL1 },
- /* ICC_SRE */
- { Op1( 0), CRn(12), CRm(12), Op2( 5), access_gic_sre },
+ { CP15_SYS_DESC(SYS_ICC_IAR0_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_EOIR0_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_HPPIR0_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_BPR0_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_AP0R0_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_AP0R1_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_AP0R2_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_AP0R3_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_AP1R0_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_AP1R1_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_AP1R2_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_AP1R3_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_DIR_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_RPR_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_IAR1_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_EOIR1_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_HPPIR1_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_BPR1_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_CTLR_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_SRE_EL1), access_gic_sre },
+ { CP15_SYS_DESC(SYS_ICC_IGRPEN0_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_IGRPEN1_EL1), undef_access },
{ Op1( 0), CRn(13), CRm( 0), Op2( 1), access_vm_reg, NULL, CONTEXTIDR_EL1 },
@@ -3057,9 +4190,11 @@ static const struct sys_reg_desc cp15_64_regs[] = {
{ SYS_DESC(SYS_AARCH32_CNTPCT), access_arch_timer },
{ Op1( 1), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, TTBR1_EL1 },
{ Op1( 1), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_ASGI1R */
+ { SYS_DESC(SYS_AARCH32_CNTVCT), access_arch_timer },
{ Op1( 2), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_SGI0R */
{ SYS_DESC(SYS_AARCH32_CNTP_CVAL), access_arch_timer },
{ SYS_DESC(SYS_AARCH32_CNTPCTSS), access_arch_timer },
+ { SYS_DESC(SYS_AARCH32_CNTVCTSS), access_arch_timer },
};
static bool check_sysreg_table(const struct sys_reg_desc *table, unsigned int n,
@@ -3069,12 +4204,14 @@ static bool check_sysreg_table(const struct sys_reg_desc *table, unsigned int n,
for (i = 0; i < n; i++) {
if (!is_32 && table[i].reg && !table[i].reset) {
- kvm_err("sys_reg table %pS entry %d lacks reset\n", &table[i], i);
+ kvm_err("sys_reg table %pS entry %d (%s) lacks reset\n",
+ &table[i], i, table[i].name);
return false;
}
if (i && cmp_sys_reg(&table[i-1], &table[i]) >= 0) {
- kvm_err("sys_reg table %pS entry %d out of order\n", &table[i - 1], i - 1);
+ kvm_err("sys_reg table %pS entry %d (%s -> %s) out of order\n",
+ &table[i], i, table[i - 1].name, table[i].name);
return false;
}
}
@@ -3377,9 +4514,13 @@ int kvm_handle_cp15_32(struct kvm_vcpu *vcpu)
* Certain AArch32 ID registers are handled by rerouting to the AArch64
* system register table. Registers in the ID range where CRm=0 are
* excluded from this scheme as they do not trivially map into AArch64
- * system register encodings.
+ * system register encodings, except for AIDR/REVIDR.
*/
- if (params.Op1 == 0 && params.CRn == 0 && params.CRm)
+ if (params.Op1 == 0 && params.CRn == 0 &&
+ (params.CRm || params.Op2 == 6 /* REVIDR */))
+ return kvm_emulate_cp15_id_reg(vcpu, &params);
+ if (params.Op1 == 1 && params.CRn == 0 &&
+ params.CRm == 0 && params.Op2 == 7 /* AIDR */)
return kvm_emulate_cp15_id_reg(vcpu, &params);
return kvm_handle_cp_32(vcpu, &params, cp15_regs, ARRAY_SIZE(cp15_regs));
@@ -3425,6 +4566,25 @@ static bool emulate_sys_reg(struct kvm_vcpu *vcpu,
return false;
}
+static const struct sys_reg_desc *idregs_debug_find(struct kvm *kvm, u8 pos)
+{
+ unsigned long i, idreg_idx = 0;
+
+ for (i = 0; i < ARRAY_SIZE(sys_reg_descs); i++) {
+ const struct sys_reg_desc *r = &sys_reg_descs[i];
+
+ if (!is_vm_ftr_id_reg(reg_to_encoding(r)))
+ continue;
+
+ if (idreg_idx == pos)
+ return r;
+
+ idreg_idx++;
+ }
+
+ return NULL;
+}
+
static void *idregs_debug_start(struct seq_file *s, loff_t *pos)
{
struct kvm *kvm = s->private;
@@ -3436,7 +4596,7 @@ static void *idregs_debug_start(struct seq_file *s, loff_t *pos)
if (test_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags) &&
*iter == (u8)~0) {
*iter = *pos;
- if (*iter >= KVM_ARM_ID_REG_NUM)
+ if (!idregs_debug_find(kvm, *iter))
iter = NULL;
} else {
iter = ERR_PTR(-EBUSY);
@@ -3453,7 +4613,7 @@ static void *idregs_debug_next(struct seq_file *s, void *v, loff_t *pos)
(*pos)++;
- if ((kvm->arch.idreg_debugfs_iter + 1) < KVM_ARM_ID_REG_NUM) {
+ if (idregs_debug_find(kvm, kvm->arch.idreg_debugfs_iter + 1)) {
kvm->arch.idreg_debugfs_iter++;
return &kvm->arch.idreg_debugfs_iter;
@@ -3478,16 +4638,16 @@ static void idregs_debug_stop(struct seq_file *s, void *v)
static int idregs_debug_show(struct seq_file *s, void *v)
{
- struct kvm *kvm = s->private;
const struct sys_reg_desc *desc;
+ struct kvm *kvm = s->private;
- desc = first_idreg + kvm->arch.idreg_debugfs_iter;
+ desc = idregs_debug_find(kvm, kvm->arch.idreg_debugfs_iter);
if (!desc->name)
return 0;
seq_printf(s, "%20s:\t%016llx\n",
- desc->name, IDREG(kvm, IDX_IDREG(kvm->arch.idreg_debugfs_iter)));
+ desc->name, kvm_read_vm_id_reg(kvm, reg_to_encoding(desc)));
return 0;
}
@@ -3509,26 +4669,24 @@ void kvm_sys_regs_create_debugfs(struct kvm *kvm)
&idregs_debug_fops);
}
-static void kvm_reset_id_regs(struct kvm_vcpu *vcpu)
+static void reset_vm_ftr_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *reg)
{
- const struct sys_reg_desc *idreg = first_idreg;
- u32 id = reg_to_encoding(idreg);
+ u32 id = reg_to_encoding(reg);
struct kvm *kvm = vcpu->kvm;
if (test_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags))
return;
- lockdep_assert_held(&kvm->arch.config_lock);
-
- /* Initialize all idregs */
- while (is_id_reg(id)) {
- IDREG(kvm, id) = idreg->reset(vcpu, idreg);
+ kvm_set_vm_id_reg(kvm, id, reg->reset(vcpu, reg));
+}
- idreg++;
- id = reg_to_encoding(idreg);
- }
+static void reset_vcpu_ftr_id_reg(struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *reg)
+{
+ if (kvm_vcpu_initialized(vcpu))
+ return;
- set_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags);
+ reg->reset(vcpu, reg);
}
/**
@@ -3540,19 +4698,30 @@ static void kvm_reset_id_regs(struct kvm_vcpu *vcpu)
*/
void kvm_reset_sys_regs(struct kvm_vcpu *vcpu)
{
+ struct kvm *kvm = vcpu->kvm;
unsigned long i;
- kvm_reset_id_regs(vcpu);
-
for (i = 0; i < ARRAY_SIZE(sys_reg_descs); i++) {
const struct sys_reg_desc *r = &sys_reg_descs[i];
- if (is_id_reg(reg_to_encoding(r)))
+ if (!r->reset)
continue;
- if (r->reset)
+ if (is_vm_ftr_id_reg(reg_to_encoding(r)))
+ reset_vm_ftr_id_reg(vcpu, r);
+ else if (is_vcpu_ftr_id_reg(reg_to_encoding(r)))
+ reset_vcpu_ftr_id_reg(vcpu, r);
+ else
r->reset(vcpu, r);
+
+ if (r->reg >= __SANITISED_REG_START__ && r->reg < NR_SYS_REGS)
+ (void)__vcpu_sys_reg(vcpu, r->reg);
}
+
+ set_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags);
+
+ if (kvm_vcpu_has_pmu(vcpu))
+ kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu);
}
/**
@@ -3658,72 +4827,6 @@ id_to_sys_reg_desc(struct kvm_vcpu *vcpu, u64 id,
return r;
}
-/*
- * These are the invariant sys_reg registers: we let the guest see the
- * host versions of these, so they're part of the guest state.
- *
- * A future CPU may provide a mechanism to present different values to
- * the guest, or a future kvm may trap them.
- */
-
-#define FUNCTION_INVARIANT(reg) \
- static u64 get_##reg(struct kvm_vcpu *v, \
- const struct sys_reg_desc *r) \
- { \
- ((struct sys_reg_desc *)r)->val = read_sysreg(reg); \
- return ((struct sys_reg_desc *)r)->val; \
- }
-
-FUNCTION_INVARIANT(midr_el1)
-FUNCTION_INVARIANT(revidr_el1)
-FUNCTION_INVARIANT(aidr_el1)
-
-static u64 get_ctr_el0(struct kvm_vcpu *v, const struct sys_reg_desc *r)
-{
- ((struct sys_reg_desc *)r)->val = read_sanitised_ftr_reg(SYS_CTR_EL0);
- return ((struct sys_reg_desc *)r)->val;
-}
-
-/* ->val is filled in by kvm_sys_reg_table_init() */
-static struct sys_reg_desc invariant_sys_regs[] __ro_after_init = {
- { SYS_DESC(SYS_MIDR_EL1), NULL, get_midr_el1 },
- { SYS_DESC(SYS_REVIDR_EL1), NULL, get_revidr_el1 },
- { SYS_DESC(SYS_AIDR_EL1), NULL, get_aidr_el1 },
- { SYS_DESC(SYS_CTR_EL0), NULL, get_ctr_el0 },
-};
-
-static int get_invariant_sys_reg(u64 id, u64 __user *uaddr)
-{
- const struct sys_reg_desc *r;
-
- r = get_reg_by_id(id, invariant_sys_regs,
- ARRAY_SIZE(invariant_sys_regs));
- if (!r)
- return -ENOENT;
-
- return put_user(r->val, uaddr);
-}
-
-static int set_invariant_sys_reg(u64 id, u64 __user *uaddr)
-{
- const struct sys_reg_desc *r;
- u64 val;
-
- r = get_reg_by_id(id, invariant_sys_regs,
- ARRAY_SIZE(invariant_sys_regs));
- if (!r)
- return -ENOENT;
-
- if (get_user(val, uaddr))
- return -EFAULT;
-
- /* This is what we mean by invariant: you can't change it. */
- if (r->val != val)
- return -EINVAL;
-
- return 0;
-}
-
static int demux_c15_get(struct kvm_vcpu *vcpu, u64 id, void __user *uaddr)
{
u32 val;
@@ -3786,7 +4889,7 @@ int kvm_sys_reg_get_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg,
int ret;
r = id_to_sys_reg_desc(vcpu, reg->id, table, num);
- if (!r || sysreg_hidden_user(vcpu, r))
+ if (!r || sysreg_hidden(vcpu, r))
return -ENOENT;
if (r->get_user) {
@@ -3805,15 +4908,10 @@ int kvm_sys_reg_get_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg,
int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
{
void __user *uaddr = (void __user *)(unsigned long)reg->addr;
- int err;
if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_DEMUX)
return demux_c15_get(vcpu, reg->id, uaddr);
- err = get_invariant_sys_reg(reg->id, uaddr);
- if (err != -ENOENT)
- return err;
-
return kvm_sys_reg_get_user(vcpu, reg,
sys_reg_descs, ARRAY_SIZE(sys_reg_descs));
}
@@ -3830,7 +4928,7 @@ int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg,
return -EFAULT;
r = id_to_sys_reg_desc(vcpu, reg->id, table, num);
- if (!r || sysreg_hidden_user(vcpu, r))
+ if (!r || sysreg_hidden(vcpu, r))
return -ENOENT;
if (sysreg_user_write_ignore(vcpu, r))
@@ -3849,15 +4947,10 @@ int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg,
int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
{
void __user *uaddr = (void __user *)(unsigned long)reg->addr;
- int err;
if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_DEMUX)
return demux_c15_set(vcpu, reg->id, uaddr);
- err = set_invariant_sys_reg(reg->id, uaddr);
- if (err != -ENOENT)
- return err;
-
return kvm_sys_reg_set_user(vcpu, reg,
sys_reg_descs, ARRAY_SIZE(sys_reg_descs));
}
@@ -3916,7 +5009,7 @@ static int walk_one_sys_reg(const struct kvm_vcpu *vcpu,
if (!(rd->reg || rd->get_user))
return 0;
- if (sysreg_hidden_user(vcpu, rd))
+ if (sysreg_hidden(vcpu, rd))
return 0;
if (!copy_reg_to_user(rd, uind))
@@ -3946,23 +5039,14 @@ static int walk_sys_regs(struct kvm_vcpu *vcpu, u64 __user *uind)
unsigned long kvm_arm_num_sys_reg_descs(struct kvm_vcpu *vcpu)
{
- return ARRAY_SIZE(invariant_sys_regs)
- + num_demux_regs()
+ return num_demux_regs()
+ walk_sys_regs(vcpu, (u64 __user *)NULL);
}
int kvm_arm_copy_sys_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices)
{
- unsigned int i;
int err;
- /* Then give them all the invariant registers' indices. */
- for (i = 0; i < ARRAY_SIZE(invariant_sys_regs); i++) {
- if (put_user(sys_reg_to_index(&invariant_sys_regs[i]), uindices))
- return -EFAULT;
- uindices++;
- }
-
err = walk_sys_regs(vcpu, uindices);
if (err < 0)
return err;
@@ -3978,14 +5062,6 @@ int kvm_arm_copy_sys_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices)
sys_reg_CRm(r), \
sys_reg_Op2(r))
-static bool is_feature_id_reg(u32 encoding)
-{
- return (sys_reg_Op0(encoding) == 3 &&
- (sys_reg_Op1(encoding) < 2 || sys_reg_Op1(encoding) == 3) &&
- sys_reg_CRn(encoding) == 0 &&
- sys_reg_CRm(encoding) <= 7);
-}
-
int kvm_vm_ioctl_get_reg_writable_masks(struct kvm *kvm, struct reg_mask_range *range)
{
const void *zero_page = page_to_virt(ZERO_PAGE(0));
@@ -4008,20 +5084,11 @@ int kvm_vm_ioctl_get_reg_writable_masks(struct kvm *kvm, struct reg_mask_range *
if (!is_feature_id_reg(encoding) || !reg->set_user)
continue;
- /*
- * For ID registers, we return the writable mask. Other feature
- * registers return a full 64bit mask. That's not necessary
- * compliant with a given revision of the architecture, but the
- * RES0/RES1 definitions allow us to do that.
- */
- if (is_id_reg(encoding)) {
- if (!reg->val ||
- (is_aa32_id_reg(encoding) && !kvm_supports_32bit_el0()))
- continue;
- val = reg->val;
- } else {
- val = ~0UL;
+ if (!reg->val ||
+ (is_aa32_id_reg(encoding) && !kvm_supports_32bit_el0())) {
+ continue;
}
+ val = reg->val;
if (put_user(val, (masks + KVM_ARM_FEATURE_ID_RANGE_INDEX(encoding))))
return -EFAULT;
@@ -4030,11 +5097,34 @@ int kvm_vm_ioctl_get_reg_writable_masks(struct kvm *kvm, struct reg_mask_range *
return 0;
}
-void kvm_init_sysreg(struct kvm_vcpu *vcpu)
+static void vcpu_set_hcr(struct kvm_vcpu *vcpu)
{
struct kvm *kvm = vcpu->kvm;
- mutex_lock(&kvm->arch.config_lock);
+ if (has_vhe() || has_hvhe())
+ vcpu->arch.hcr_el2 |= HCR_E2H;
+ if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) {
+ /* route synchronous external abort exceptions to EL2 */
+ vcpu->arch.hcr_el2 |= HCR_TEA;
+ /* trap error record accesses */
+ vcpu->arch.hcr_el2 |= HCR_TERR;
+ }
+
+ if (cpus_have_final_cap(ARM64_HAS_STAGE2_FWB))
+ vcpu->arch.hcr_el2 |= HCR_FWB;
+
+ if (cpus_have_final_cap(ARM64_HAS_EVT) &&
+ !cpus_have_final_cap(ARM64_MISMATCHED_CACHE_TYPE) &&
+ kvm_read_vm_id_reg(kvm, SYS_CTR_EL0) == read_sanitised_ftr_reg(SYS_CTR_EL0))
+ vcpu->arch.hcr_el2 |= HCR_TID4;
+ else
+ vcpu->arch.hcr_el2 |= HCR_TID2;
+
+ if (vcpu_el1_is_32bit(vcpu))
+ vcpu->arch.hcr_el2 &= ~HCR_RW;
+
+ if (kvm_has_mte(vcpu->kvm))
+ vcpu->arch.hcr_el2 |= HCR_ATA;
/*
* In the absence of FGT, we cannot independently trap TLBI
@@ -4043,13 +5133,16 @@ void kvm_init_sysreg(struct kvm_vcpu *vcpu)
*/
if (!kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, OS))
vcpu->arch.hcr_el2 |= HCR_TTLBOS;
+}
- if (cpus_have_final_cap(ARM64_HAS_HCX)) {
- vcpu->arch.hcrx_el2 = HCRX_GUEST_FLAGS;
+void kvm_calculate_traps(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
- if (kvm_has_feat(kvm, ID_AA64ISAR2_EL1, MOPS, IMP))
- vcpu->arch.hcrx_el2 |= (HCRX_EL2_MSCEn | HCRX_EL2_MCE2);
- }
+ mutex_lock(&kvm->arch.config_lock);
+ vcpu_set_hcr(vcpu);
+ vcpu_set_ich_hcr(vcpu);
+ vcpu_set_hcrx(vcpu);
if (test_bit(KVM_ARCH_FLAG_FGU_INITIALIZED, &kvm->arch.flags))
goto out;
@@ -4057,8 +5150,6 @@ void kvm_init_sysreg(struct kvm_vcpu *vcpu)
kvm->arch.fgu[HFGxTR_GROUP] = (HFGxTR_EL2_nAMAIR2_EL1 |
HFGxTR_EL2_nMAIR2_EL1 |
HFGxTR_EL2_nS2POR_EL1 |
- HFGxTR_EL2_nPOR_EL1 |
- HFGxTR_EL2_nPOR_EL0 |
HFGxTR_EL2_nACCDATA_EL1 |
HFGxTR_EL2_nSMPRI_EL1_MASK |
HFGxTR_EL2_nTPIDR2_EL0_MASK);
@@ -4089,22 +5180,70 @@ void kvm_init_sysreg(struct kvm_vcpu *vcpu)
HFGITR_EL2_TLBIRVAAE1OS |
HFGITR_EL2_TLBIRVAE1OS);
- if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, S1PIE, IMP))
+ if (!kvm_has_feat(kvm, ID_AA64ISAR2_EL1, ATS1A, IMP))
+ kvm->arch.fgu[HFGITR_GROUP] |= HFGITR_EL2_ATS1E1A;
+
+ if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, PAN, PAN2))
+ kvm->arch.fgu[HFGITR_GROUP] |= (HFGITR_EL2_ATS1E1RP |
+ HFGITR_EL2_ATS1E1WP);
+
+ if (!kvm_has_s1pie(kvm))
kvm->arch.fgu[HFGxTR_GROUP] |= (HFGxTR_EL2_nPIRE0_EL1 |
HFGxTR_EL2_nPIR_EL1);
+ if (!kvm_has_s1poe(kvm))
+ kvm->arch.fgu[HFGxTR_GROUP] |= (HFGxTR_EL2_nPOR_EL1 |
+ HFGxTR_EL2_nPOR_EL0);
+
if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, AMU, IMP))
kvm->arch.fgu[HAFGRTR_GROUP] |= ~(HAFGRTR_EL2_RES0 |
HAFGRTR_EL2_RES1);
+ if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, BRBE, IMP)) {
+ kvm->arch.fgu[HDFGRTR_GROUP] |= (HDFGRTR_EL2_nBRBDATA |
+ HDFGRTR_EL2_nBRBCTL |
+ HDFGRTR_EL2_nBRBIDR);
+ kvm->arch.fgu[HFGITR_GROUP] |= (HFGITR_EL2_nBRBINJ |
+ HFGITR_EL2_nBRBIALL);
+ }
+
set_bit(KVM_ARCH_FLAG_FGU_INITIALIZED, &kvm->arch.flags);
out:
mutex_unlock(&kvm->arch.config_lock);
}
+/*
+ * Perform last adjustments to the ID registers that are implied by the
+ * configuration outside of the ID regs themselves, as well as any
+ * initialisation that directly depends on these ID registers (such as
+ * RES0/RES1 behaviours). This is not the place to configure traps though.
+ *
+ * Because this can be called once per CPU, changes must be idempotent.
+ */
+int kvm_finalize_sys_regs(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+
+ guard(mutex)(&kvm->arch.config_lock);
+
+ if (!(static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif) &&
+ irqchip_in_kernel(kvm) &&
+ kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)) {
+ kvm->arch.id_regs[IDREG_IDX(SYS_ID_AA64PFR0_EL1)] &= ~ID_AA64PFR0_EL1_GIC_MASK;
+ kvm->arch.id_regs[IDREG_IDX(SYS_ID_PFR1_EL1)] &= ~ID_PFR1_EL1_GIC_MASK;
+ }
+
+ if (vcpu_has_nv(vcpu)) {
+ int ret = kvm_init_nv_sysregs(vcpu);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
int __init kvm_sys_reg_table_init(void)
{
- struct sys_reg_params params;
bool valid = true;
unsigned int i;
int ret = 0;
@@ -4115,21 +5254,12 @@ int __init kvm_sys_reg_table_init(void)
valid &= check_sysreg_table(cp14_64_regs, ARRAY_SIZE(cp14_64_regs), true);
valid &= check_sysreg_table(cp15_regs, ARRAY_SIZE(cp15_regs), true);
valid &= check_sysreg_table(cp15_64_regs, ARRAY_SIZE(cp15_64_regs), true);
- valid &= check_sysreg_table(invariant_sys_regs, ARRAY_SIZE(invariant_sys_regs), false);
valid &= check_sysreg_table(sys_insn_descs, ARRAY_SIZE(sys_insn_descs), false);
if (!valid)
return -EINVAL;
- /* We abuse the reset function to overwrite the table itself. */
- for (i = 0; i < ARRAY_SIZE(invariant_sys_regs); i++)
- invariant_sys_regs[i].reset(NULL, &invariant_sys_regs[i]);
-
- /* Find the first idreg (SYS_ID_PFR0_EL1) in sys_reg_descs. */
- params = encoding_to_params(SYS_ID_PFR0_EL1);
- first_idreg = find_reg(&params, sys_reg_descs, ARRAY_SIZE(sys_reg_descs));
- if (!first_idreg)
- return -EINVAL;
+ init_imp_id_regs();
ret = populate_nv_trap_config();
diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h
index 997eea21ba2a..cc6338d38766 100644
--- a/arch/arm64/kvm/sys_regs.h
+++ b/arch/arm64/kvm/sys_regs.h
@@ -95,9 +95,8 @@ struct sys_reg_desc {
};
#define REG_HIDDEN (1 << 0) /* hidden from userspace and guest */
-#define REG_HIDDEN_USER (1 << 1) /* hidden from userspace only */
-#define REG_RAZ (1 << 2) /* RAZ from userspace and guest */
-#define REG_USER_WI (1 << 3) /* WI from userspace only */
+#define REG_RAZ (1 << 1) /* RAZ from userspace and guest */
+#define REG_USER_WI (1 << 2) /* WI from userspace only */
static __printf(2, 3)
inline void print_sys_reg_msg(const struct sys_reg_params *p,
@@ -165,15 +164,6 @@ static inline bool sysreg_hidden(const struct kvm_vcpu *vcpu,
return sysreg_visibility(vcpu, r) & REG_HIDDEN;
}
-static inline bool sysreg_hidden_user(const struct kvm_vcpu *vcpu,
- const struct sys_reg_desc *r)
-{
- if (likely(!r->visibility))
- return false;
-
- return r->visibility(vcpu, r) & (REG_HIDDEN | REG_HIDDEN_USER);
-}
-
static inline bool sysreg_visible_as_raz(const struct kvm_vcpu *vcpu,
const struct sys_reg_desc *r)
{
@@ -235,6 +225,8 @@ int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg,
bool triage_sysreg_trap(struct kvm_vcpu *vcpu, int *sr_index);
+int kvm_finalize_sys_regs(struct kvm_vcpu *vcpu);
+
#define AA32(_x) .aarch32_map = AA32_##_x
#define Op0(_x) .Op0 = _x
#define Op1(_x) .Op1 = _x
@@ -248,4 +240,21 @@ bool triage_sysreg_trap(struct kvm_vcpu *vcpu, int *sr_index);
CRn(sys_reg_CRn(reg)), CRm(sys_reg_CRm(reg)), \
Op2(sys_reg_Op2(reg))
+#define CP15_SYS_DESC(reg) \
+ .name = #reg, \
+ .aarch32_map = AA32_DIRECT, \
+ Op0(0), Op1(sys_reg_Op1(reg)), \
+ CRn(sys_reg_CRn(reg)), CRm(sys_reg_CRm(reg)), \
+ Op2(sys_reg_Op2(reg))
+
+#define ID_REG_LIMIT_FIELD_ENUM(val, reg, field, limit) \
+({ \
+ u64 __f_val = FIELD_GET(reg##_##field##_MASK, val); \
+ (val) &= ~reg##_##field##_MASK; \
+ (val) |= FIELD_PREP(reg##_##field##_MASK, \
+ min(__f_val, \
+ (u64)SYS_FIELD_VALUE(reg, field, limit))); \
+ (val); \
+})
+
#endif /* __ARM64_KVM_SYS_REGS_LOCAL_H__ */
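
Editor's note: as a reading aid for the new ID_REG_LIMIT_FIELD_ENUM() helper above, here is a minimal usage sketch (not part of the patch). The ID_AA64DFR0_EL1 DebugVer field/value names are assumed to come from the generated sysreg definitions used elsewhere in sys_regs.c:

/* Sketch only: clamp the advertised DebugVer field to the V8P8 value. */
static u64 example_limit_debugver(u64 val)
{
	/* FIELD_GET()s the current value, then writes back min(current, V8P8). */
	val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64DFR0_EL1, DebugVer, V8P8);
	return val;
}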
diff --git a/arch/arm64/kvm/trace_handle_exit.h b/arch/arm64/kvm/trace_handle_exit.h
index 064a58c19f48..f85415db7713 100644
--- a/arch/arm64/kvm/trace_handle_exit.h
+++ b/arch/arm64/kvm/trace_handle_exit.h
@@ -46,38 +46,6 @@ TRACE_EVENT(kvm_hvc_arm64,
__entry->vcpu_pc, __entry->r0, __entry->imm)
);
-TRACE_EVENT(kvm_arm_setup_debug,
- TP_PROTO(struct kvm_vcpu *vcpu, __u32 guest_debug),
- TP_ARGS(vcpu, guest_debug),
-
- TP_STRUCT__entry(
- __field(struct kvm_vcpu *, vcpu)
- __field(__u32, guest_debug)
- ),
-
- TP_fast_assign(
- __entry->vcpu = vcpu;
- __entry->guest_debug = guest_debug;
- ),
-
- TP_printk("vcpu: %p, flags: 0x%08x", __entry->vcpu, __entry->guest_debug)
-);
-
-TRACE_EVENT(kvm_arm_clear_debug,
- TP_PROTO(__u32 guest_debug),
- TP_ARGS(guest_debug),
-
- TP_STRUCT__entry(
- __field(__u32, guest_debug)
- ),
-
- TP_fast_assign(
- __entry->guest_debug = guest_debug;
- ),
-
- TP_printk("flags: 0x%08x", __entry->guest_debug)
-);
-
/*
* The dreg32 name is a leftover from a distant past. This will really
* output a 64bit value...
@@ -99,49 +67,6 @@ TRACE_EVENT(kvm_arm_set_dreg32,
TP_printk("%s: 0x%llx", __entry->name, __entry->value)
);
-TRACE_DEFINE_SIZEOF(__u64);
-
-TRACE_EVENT(kvm_arm_set_regset,
- TP_PROTO(const char *type, int len, __u64 *control, __u64 *value),
- TP_ARGS(type, len, control, value),
- TP_STRUCT__entry(
- __field(const char *, name)
- __field(int, len)
- __array(u64, ctrls, 16)
- __array(u64, values, 16)
- ),
- TP_fast_assign(
- __entry->name = type;
- __entry->len = len;
- memcpy(__entry->ctrls, control, len << 3);
- memcpy(__entry->values, value, len << 3);
- ),
- TP_printk("%d %s CTRL:%s VALUE:%s", __entry->len, __entry->name,
- __print_array(__entry->ctrls, __entry->len, sizeof(__u64)),
- __print_array(__entry->values, __entry->len, sizeof(__u64)))
-);
-
-TRACE_EVENT(trap_reg,
- TP_PROTO(const char *fn, int reg, bool is_write, u64 write_value),
- TP_ARGS(fn, reg, is_write, write_value),
-
- TP_STRUCT__entry(
- __field(const char *, fn)
- __field(int, reg)
- __field(bool, is_write)
- __field(u64, write_value)
- ),
-
- TP_fast_assign(
- __entry->fn = fn;
- __entry->reg = reg;
- __entry->is_write = is_write;
- __entry->write_value = write_value;
- ),
-
- TP_printk("%s %s reg %d (0x%016llx)", __entry->fn, __entry->is_write?"write to":"read from", __entry->reg, __entry->write_value)
-);
-
TRACE_EVENT(kvm_handle_sys_reg,
TP_PROTO(unsigned long hsr),
TP_ARGS(hsr),
diff --git a/arch/arm64/kvm/vgic-sys-reg-v3.c b/arch/arm64/kvm/vgic-sys-reg-v3.c
index 9e7c486b48c2..5eacb4b3250a 100644
--- a/arch/arm64/kvm/vgic-sys-reg-v3.c
+++ b/arch/arm64/kvm/vgic-sys-reg-v3.c
@@ -35,12 +35,12 @@ static int set_gic_ctlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
vgic_v3_cpu->num_id_bits = host_id_bits;
- host_seis = FIELD_GET(ICH_VTR_SEIS_MASK, kvm_vgic_global_state.ich_vtr_el2);
+ host_seis = FIELD_GET(ICH_VTR_EL2_SEIS, kvm_vgic_global_state.ich_vtr_el2);
seis = FIELD_GET(ICC_CTLR_EL1_SEIS_MASK, val);
if (host_seis != seis)
return -EINVAL;
- host_a3v = FIELD_GET(ICH_VTR_A3V_MASK, kvm_vgic_global_state.ich_vtr_el2);
+ host_a3v = FIELD_GET(ICH_VTR_EL2_A3V, kvm_vgic_global_state.ich_vtr_el2);
a3v = FIELD_GET(ICC_CTLR_EL1_A3V_MASK, val);
if (host_a3v != a3v)
return -EINVAL;
@@ -68,10 +68,10 @@ static int get_gic_ctlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
val |= FIELD_PREP(ICC_CTLR_EL1_PRI_BITS_MASK, vgic_v3_cpu->num_pri_bits - 1);
val |= FIELD_PREP(ICC_CTLR_EL1_ID_BITS_MASK, vgic_v3_cpu->num_id_bits);
val |= FIELD_PREP(ICC_CTLR_EL1_SEIS_MASK,
- FIELD_GET(ICH_VTR_SEIS_MASK,
+ FIELD_GET(ICH_VTR_EL2_SEIS,
kvm_vgic_global_state.ich_vtr_el2));
val |= FIELD_PREP(ICC_CTLR_EL1_A3V_MASK,
- FIELD_GET(ICH_VTR_A3V_MASK, kvm_vgic_global_state.ich_vtr_el2));
+ FIELD_GET(ICH_VTR_EL2_A3V, kvm_vgic_global_state.ich_vtr_el2));
/*
* The VMCR.CTLR value is in ICC_CTLR_EL1 layout.
* Extract it directly using ICC_CTLR_EL1 reg definitions.
diff --git a/arch/arm64/kvm/vgic/vgic-debug.c b/arch/arm64/kvm/vgic/vgic-debug.c
index 389025ce7749..afb018528bc3 100644
--- a/arch/arm64/kvm/vgic/vgic-debug.c
+++ b/arch/arm64/kvm/vgic/vgic-debug.c
@@ -28,27 +28,66 @@ struct vgic_state_iter {
int nr_lpis;
int dist_id;
int vcpu_id;
- int intid;
+ unsigned long intid;
int lpi_idx;
- u32 *lpi_array;
};
-static void iter_next(struct vgic_state_iter *iter)
+static void iter_next(struct kvm *kvm, struct vgic_state_iter *iter)
{
+ struct vgic_dist *dist = &kvm->arch.vgic;
+
if (iter->dist_id == 0) {
iter->dist_id++;
return;
}
+ /*
+ * Let the xarray drive the iterator after the last SPI, as the iterator
+ * has exhausted the sequentially-allocated INTID space.
+ */
+ if (iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS - 1) &&
+ iter->nr_lpis) {
+ if (iter->lpi_idx < iter->nr_lpis)
+ xa_find_after(&dist->lpi_xa, &iter->intid,
+ VGIC_LPI_MAX_INTID,
+ LPI_XA_MARK_DEBUG_ITER);
+ iter->lpi_idx++;
+ return;
+ }
+
iter->intid++;
if (iter->intid == VGIC_NR_PRIVATE_IRQS &&
++iter->vcpu_id < iter->nr_cpus)
iter->intid = 0;
+}
- if (iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS)) {
- if (iter->lpi_idx < iter->nr_lpis)
- iter->intid = iter->lpi_array[iter->lpi_idx];
- iter->lpi_idx++;
+static int iter_mark_lpis(struct kvm *kvm)
+{
+ struct vgic_dist *dist = &kvm->arch.vgic;
+ struct vgic_irq *irq;
+ unsigned long intid;
+ int nr_lpis = 0;
+
+ xa_for_each(&dist->lpi_xa, intid, irq) {
+ if (!vgic_try_get_irq_kref(irq))
+ continue;
+
+ xa_set_mark(&dist->lpi_xa, intid, LPI_XA_MARK_DEBUG_ITER);
+ nr_lpis++;
+ }
+
+ return nr_lpis;
+}
+
+static void iter_unmark_lpis(struct kvm *kvm)
+{
+ struct vgic_dist *dist = &kvm->arch.vgic;
+ struct vgic_irq *irq;
+ unsigned long intid;
+
+ xa_for_each_marked(&dist->lpi_xa, intid, irq, LPI_XA_MARK_DEBUG_ITER) {
+ xa_clear_mark(&dist->lpi_xa, intid, LPI_XA_MARK_DEBUG_ITER);
+ vgic_put_irq(kvm, irq);
}
}
@@ -61,15 +100,12 @@ static void iter_init(struct kvm *kvm, struct vgic_state_iter *iter,
iter->nr_cpus = nr_cpus;
iter->nr_spis = kvm->arch.vgic.nr_spis;
- if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
- iter->nr_lpis = vgic_copy_lpi_list(kvm, NULL, &iter->lpi_array);
- if (iter->nr_lpis < 0)
- iter->nr_lpis = 0;
- }
+ if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)
+ iter->nr_lpis = iter_mark_lpis(kvm);
/* Fast forward to the right position if needed */
while (pos--)
- iter_next(iter);
+ iter_next(kvm, iter);
}
static bool end_of_vgic(struct vgic_state_iter *iter)
@@ -77,7 +113,7 @@ static bool end_of_vgic(struct vgic_state_iter *iter)
return iter->dist_id > 0 &&
iter->vcpu_id == iter->nr_cpus &&
iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS) &&
- iter->lpi_idx > iter->nr_lpis;
+ (!iter->nr_lpis || iter->lpi_idx > iter->nr_lpis);
}
static void *vgic_debug_start(struct seq_file *s, loff_t *pos)
@@ -114,7 +150,7 @@ static void *vgic_debug_next(struct seq_file *s, void *v, loff_t *pos)
struct vgic_state_iter *iter = kvm->arch.vgic.iter;
++*pos;
- iter_next(iter);
+ iter_next(kvm, iter);
if (end_of_vgic(iter))
iter = NULL;
return iter;
@@ -134,13 +170,14 @@ static void vgic_debug_stop(struct seq_file *s, void *v)
mutex_lock(&kvm->arch.config_lock);
iter = kvm->arch.vgic.iter;
- kfree(iter->lpi_array);
+ iter_unmark_lpis(kvm);
kfree(iter);
kvm->arch.vgic.iter = NULL;
mutex_unlock(&kvm->arch.config_lock);
}
-static void print_dist_state(struct seq_file *s, struct vgic_dist *dist)
+static void print_dist_state(struct seq_file *s, struct vgic_dist *dist,
+ struct vgic_state_iter *iter)
{
bool v3 = dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3;
@@ -149,7 +186,7 @@ static void print_dist_state(struct seq_file *s, struct vgic_dist *dist)
seq_printf(s, "vgic_model:\t%s\n", v3 ? "GICv3" : "GICv2");
seq_printf(s, "nr_spis:\t%d\n", dist->nr_spis);
if (v3)
- seq_printf(s, "nr_lpis:\t%d\n", atomic_read(&dist->lpi_count));
+ seq_printf(s, "nr_lpis:\t%d\n", iter->nr_lpis);
seq_printf(s, "enabled:\t%d\n", dist->enabled);
seq_printf(s, "\n");
@@ -236,7 +273,7 @@ static int vgic_debug_show(struct seq_file *s, void *v)
unsigned long flags;
if (iter->dist_id == 0) {
- print_dist_state(s, &kvm->arch.vgic);
+ print_dist_state(s, &kvm->arch.vgic, iter);
return 0;
}
@@ -246,11 +283,16 @@ static int vgic_debug_show(struct seq_file *s, void *v)
if (iter->vcpu_id < iter->nr_cpus)
vcpu = kvm_get_vcpu(kvm, iter->vcpu_id);
- irq = vgic_get_irq(kvm, vcpu, iter->intid);
- if (!irq) {
- seq_printf(s, " LPI %4d freed\n", iter->intid);
- return 0;
- }
+ /*
+ * Expect this to succeed, as iter_mark_lpis() takes a reference on
+ * every LPI to be visited.
+ */
+ if (iter->intid < VGIC_NR_PRIVATE_IRQS)
+ irq = vgic_get_vcpu_irq(vcpu, iter->intid);
+ else
+ irq = vgic_get_irq(kvm, iter->intid);
+ if (WARN_ON_ONCE(!irq))
+ return -EINVAL;
raw_spin_lock_irqsave(&irq->irq_lock, flags);
print_irq_state(s, irq, vcpu);
diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c
index f20941f83a07..1f33e71c2a73 100644
--- a/arch/arm64/kvm/vgic/vgic-init.c
+++ b/arch/arm64/kvm/vgic/vgic-init.c
@@ -34,9 +34,9 @@
*
* CPU Interface:
*
- * - kvm_vgic_vcpu_init(): initialization of static data that
- * doesn't depend on any sizing information or emulation type. No
- * allocation is allowed there.
+ * - kvm_vgic_vcpu_init(): initialization of static data that doesn't depend
+ * on any sizing information. Private interrupts are allocated if not
+ * already allocated at vgic-creation time.
*/
/* EARLY INIT */
@@ -53,13 +53,13 @@ void kvm_vgic_early_init(struct kvm *kvm)
{
struct vgic_dist *dist = &kvm->arch.vgic;
- INIT_LIST_HEAD(&dist->lpi_translation_cache);
- raw_spin_lock_init(&dist->lpi_list_lock);
xa_init_flags(&dist->lpi_xa, XA_FLAGS_LOCK_IRQ);
}
/* CREATION */
+static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu, u32 type);
+
/**
* kvm_vgic_create: triggered by the instantiation of the VGIC device by
* user space, either through the legacy KVM_CREATE_IRQCHIP ioctl (v2 only)
@@ -114,6 +114,22 @@ int kvm_vgic_create(struct kvm *kvm, u32 type)
goto out_unlock;
}
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ ret = vgic_allocate_private_irqs_locked(vcpu, type);
+ if (ret)
+ break;
+ }
+
+ if (ret) {
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+ kfree(vgic_cpu->private_irqs);
+ vgic_cpu->private_irqs = NULL;
+ }
+
+ goto out_unlock;
+ }
+
kvm->arch.vgic.in_kernel = true;
kvm->arch.vgic.vgic_model = type;
@@ -182,27 +198,43 @@ static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis)
return 0;
}
-/**
- * kvm_vgic_vcpu_init() - Initialize static VGIC VCPU data
- * structures and register VCPU-specific KVM iodevs
- *
- * @vcpu: pointer to the VCPU being created and initialized
- *
- * Only do initialization, but do not actually enable the
- * VGIC CPU interface
- */
-int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu)
+/* Default GICv3 Maintenance Interrupt INTID, as per SBSA */
+#define DEFAULT_MI_INTID 25
+
+int kvm_vgic_vcpu_nv_init(struct kvm_vcpu *vcpu)
+{
+ int ret;
+
+ guard(mutex)(&vcpu->kvm->arch.config_lock);
+
+ /*
+ * Matching the tradition established with the timers, provide
+ * a default PPI for the maintenance interrupt. It makes
+ * things easier to reason about.
+ */
+ if (vcpu->kvm->arch.vgic.mi_intid == 0)
+ vcpu->kvm->arch.vgic.mi_intid = DEFAULT_MI_INTID;
+ ret = kvm_vgic_set_owner(vcpu, vcpu->kvm->arch.vgic.mi_intid, vcpu);
+
+ return ret;
+}
+
+static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu, u32 type)
{
struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
- int ret = 0;
int i;
- vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF;
+ lockdep_assert_held(&vcpu->kvm->arch.config_lock);
- INIT_LIST_HEAD(&vgic_cpu->ap_list_head);
- raw_spin_lock_init(&vgic_cpu->ap_list_lock);
- atomic_set(&vgic_cpu->vgic_v3.its_vpe.vlpi_count, 0);
+ if (vgic_cpu->private_irqs)
+ return 0;
+
+ vgic_cpu->private_irqs = kcalloc(VGIC_NR_PRIVATE_IRQS,
+ sizeof(struct vgic_irq),
+ GFP_KERNEL_ACCOUNT);
+
+ if (!vgic_cpu->private_irqs)
+ return -ENOMEM;
/*
* Enable and configure all SGIs to be edge-triggered and
@@ -225,11 +257,61 @@ int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu)
/* PPIs */
irq->config = VGIC_CONFIG_LEVEL;
}
+
+ switch (type) {
+ case KVM_DEV_TYPE_ARM_VGIC_V3:
+ irq->group = 1;
+ irq->mpidr = kvm_vcpu_get_mpidr_aff(vcpu);
+ break;
+ case KVM_DEV_TYPE_ARM_VGIC_V2:
+ irq->group = 0;
+ irq->targets = BIT(vcpu->vcpu_id);
+ break;
+ }
}
+ return 0;
+}
+
+static int vgic_allocate_private_irqs(struct kvm_vcpu *vcpu, u32 type)
+{
+ int ret;
+
+ mutex_lock(&vcpu->kvm->arch.config_lock);
+ ret = vgic_allocate_private_irqs_locked(vcpu, type);
+ mutex_unlock(&vcpu->kvm->arch.config_lock);
+
+ return ret;
+}
+
+/**
+ * kvm_vgic_vcpu_init() - Initialize static VGIC VCPU data
+ * structures and register VCPU-specific KVM iodevs
+ *
+ * @vcpu: pointer to the VCPU being created and initialized
+ *
+ * Only do initialization, but do not actually enable the
+ * VGIC CPU interface
+ */
+int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu)
+{
+ struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+ struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+ int ret = 0;
+
+ vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF;
+
+ INIT_LIST_HEAD(&vgic_cpu->ap_list_head);
+ raw_spin_lock_init(&vgic_cpu->ap_list_lock);
+ atomic_set(&vgic_cpu->vgic_v3.its_vpe.vlpi_count, 0);
+
if (!irqchip_in_kernel(vcpu->kvm))
return 0;
+ ret = vgic_allocate_private_irqs(vcpu, dist->vgic_model);
+ if (ret)
+ return ret;
+
/*
* If we are creating a VCPU with a GICv3 we must also register the
* KVM io device for the redistributor that belongs to this VCPU.
@@ -263,7 +345,7 @@ int vgic_init(struct kvm *kvm)
{
struct vgic_dist *dist = &kvm->arch.vgic;
struct kvm_vcpu *vcpu;
- int ret = 0, i;
+ int ret = 0;
unsigned long idx;
lockdep_assert_held(&kvm->arch.config_lock);
@@ -283,31 +365,6 @@ int vgic_init(struct kvm *kvm)
if (ret)
goto out;
- /* Initialize groups on CPUs created before the VGIC type was known */
- kvm_for_each_vcpu(idx, vcpu, kvm) {
- struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-
- for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) {
- struct vgic_irq *irq = &vgic_cpu->private_irqs[i];
- switch (dist->vgic_model) {
- case KVM_DEV_TYPE_ARM_VGIC_V3:
- irq->group = 1;
- irq->mpidr = kvm_vcpu_get_mpidr_aff(vcpu);
- break;
- case KVM_DEV_TYPE_ARM_VGIC_V2:
- irq->group = 0;
- irq->targets = 1U << idx;
- break;
- default:
- ret = -EINVAL;
- goto out;
- }
- }
- }
-
- if (vgic_has_its(kvm))
- vgic_lpi_translation_cache_init(kvm);
-
/*
* If we have GICv4.1 enabled, unconditionally request enable the
* v4 support so that we get HW-accelerated vSGIs. Otherwise, only
@@ -355,15 +412,12 @@ static void kvm_vgic_dist_destroy(struct kvm *kvm)
if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
list_for_each_entry_safe(rdreg, next, &dist->rd_regions, list)
- vgic_v3_free_redist_region(rdreg);
+ vgic_v3_free_redist_region(kvm, rdreg);
INIT_LIST_HEAD(&dist->rd_regions);
} else {
dist->vgic_cpu_base = VGIC_ADDR_UNDEF;
}
- if (vgic_has_its(kvm))
- vgic_lpi_translation_cache_destroy(kvm);
-
if (vgic_supports_direct_msis(kvm))
vgic_v4_teardown(kvm);
@@ -381,8 +435,29 @@ static void __kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu)
vgic_flush_pending_lpis(vcpu);
INIT_LIST_HEAD(&vgic_cpu->ap_list_head);
+ kfree(vgic_cpu->private_irqs);
+ vgic_cpu->private_irqs = NULL;
+
if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
- vgic_unregister_redist_iodev(vcpu);
+ /*
+ * If this vCPU is being destroyed because of a failed creation
+ * then unregister the redistributor to avoid leaving behind a
+ * dangling pointer to the vCPU struct.
+ *
+ * vCPUs that have been successfully created (i.e. added to
+ * kvm->vcpu_array) get unregistered in kvm_vgic_destroy(), as
+ * this function gets called while holding kvm->arch.config_lock
+ * in the VM teardown path and would otherwise introduce a lock
+ * inversion w.r.t. kvm->srcu.
+ *
+ * vCPUs that failed creation are torn down outside of the
+ * kvm->arch.config_lock and do not get unregistered in
+ * kvm_vgic_destroy(), meaning it is both safe and necessary to
+ * do so here.
+ */
+ if (kvm_get_vcpu_by_id(vcpu->kvm, vcpu->vcpu_id) != vcpu)
+ vgic_unregister_redist_iodev(vcpu);
+
vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF;
}
}
@@ -402,17 +477,21 @@ void kvm_vgic_destroy(struct kvm *kvm)
unsigned long i;
mutex_lock(&kvm->slots_lock);
+ mutex_lock(&kvm->arch.config_lock);
vgic_debug_destroy(kvm);
kvm_for_each_vcpu(i, vcpu, kvm)
__kvm_vgic_vcpu_destroy(vcpu);
- mutex_lock(&kvm->arch.config_lock);
-
kvm_vgic_dist_destroy(kvm);
mutex_unlock(&kvm->arch.config_lock);
+
+ if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ vgic_unregister_redist_iodev(vcpu);
+
mutex_unlock(&kvm->slots_lock);
}
@@ -486,22 +565,31 @@ int kvm_vgic_map_resources(struct kvm *kvm)
if (ret)
goto out;
- dist->ready = true;
dist_base = dist->vgic_dist_base;
mutex_unlock(&kvm->arch.config_lock);
ret = vgic_register_dist_iodev(kvm, dist_base, type);
- if (ret)
+ if (ret) {
kvm_err("Unable to register VGIC dist MMIO regions\n");
+ goto out_slots;
+ }
+ /*
+ * kvm_io_bus_register_dev() guarantees all readers see the new MMIO
+ * registration before returning through synchronize_srcu(), which also
+ * implies a full memory barrier. As such, marking the distributor as
+ * 'ready' here is guaranteed to be ordered after all vCPUs having seen
+ * a completely configured distributor.
+ */
+ dist->ready = true;
goto out_slots;
out:
mutex_unlock(&kvm->arch.config_lock);
out_slots:
- mutex_unlock(&kvm->slots_lock);
-
if (ret)
- kvm_vgic_destroy(kvm);
+ kvm_vm_dead(kvm);
+
+ mutex_unlock(&kvm->slots_lock);
return ret;
}
@@ -521,12 +609,20 @@ void kvm_vgic_cpu_down(void)
static irqreturn_t vgic_maintenance_handler(int irq, void *data)
{
+ struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)data;
+
/*
* We cannot rely on the vgic maintenance interrupt to be
* delivered synchronously. This means we can only use it to
* exit the VM, and we perform the handling of EOIed
* interrupts on the exit path (see vgic_fold_lr_state).
+ *
+ * Of course, NV throws a wrench in this plan, and needs
+ * something special.
*/
+ if (vcpu && vgic_state_is_nested(vcpu))
+ vgic_v3_handle_nested_maint_irq(vcpu);
+
return IRQ_HANDLED;
}
diff --git a/arch/arm64/kvm/vgic/vgic-irqfd.c b/arch/arm64/kvm/vgic/vgic-irqfd.c
index 8c711deb25aa..c314c016659a 100644
--- a/arch/arm64/kvm/vgic/vgic-irqfd.c
+++ b/arch/arm64/kvm/vgic/vgic-irqfd.c
@@ -9,7 +9,7 @@
#include <kvm/arm_vgic.h>
#include "vgic.h"
-/**
+/*
* vgic_irqfd_set_irq: inject the IRQ corresponding to the
* irqchip routing entry
*
@@ -75,7 +75,8 @@ static void kvm_populate_msi(struct kvm_kernel_irq_routing_entry *e,
msi->flags = e->msi.flags;
msi->devid = e->msi.devid;
}
-/**
+
+/*
* kvm_set_msi: inject the MSI corresponding to the
* MSI routing entry
*
@@ -98,7 +99,7 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
return vgic_its_inject_msi(kvm, &msi);
}
-/**
+/*
* kvm_arch_set_irq_inatomic: fast-path for irqfd injection
*/
int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c
index e85a495ada9c..fb96802799c6 100644
--- a/arch/arm64/kvm/vgic/vgic-its.c
+++ b/arch/arm64/kvm/vgic/vgic-its.c
@@ -23,12 +23,49 @@
#include "vgic.h"
#include "vgic-mmio.h"
+static struct kvm_device_ops kvm_arm_vgic_its_ops;
+
static int vgic_its_save_tables_v0(struct vgic_its *its);
static int vgic_its_restore_tables_v0(struct vgic_its *its);
static int vgic_its_commit_v0(struct vgic_its *its);
static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq,
struct kvm_vcpu *filter_vcpu, bool needs_inv);
+#define vgic_its_read_entry_lock(i, g, valp, t) \
+ ({ \
+ int __sz = vgic_its_get_abi(i)->t##_esz; \
+ struct kvm *__k = (i)->dev->kvm; \
+ int __ret; \
+ \
+ BUILD_BUG_ON(NR_ITS_ABIS == 1 && \
+ sizeof(*(valp)) != ABI_0_ESZ); \
+ if (NR_ITS_ABIS > 1 && \
+ KVM_BUG_ON(__sz != sizeof(*(valp)), __k)) \
+ __ret = -EINVAL; \
+ else \
+ __ret = kvm_read_guest_lock(__k, (g), \
+ valp, __sz); \
+ __ret; \
+ })
+
+#define vgic_its_write_entry_lock(i, g, val, t) \
+ ({ \
+ int __sz = vgic_its_get_abi(i)->t##_esz; \
+ struct kvm *__k = (i)->dev->kvm; \
+ typeof(val) __v = (val); \
+ int __ret; \
+ \
+ BUILD_BUG_ON(NR_ITS_ABIS == 1 && \
+ sizeof(__v) != ABI_0_ESZ); \
+ if (NR_ITS_ABIS > 1 && \
+ KVM_BUG_ON(__sz != sizeof(__v), __k)) \
+ __ret = -EINVAL; \
+ else \
+ __ret = vgic_write_guest_lock(__k, (g), \
+ &__v, __sz); \
+ __ret; \
+ })
+
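Editor's note on the two helpers above (not part of the patch): the trailing token (cte/dte/ite) is pasted into the ABI accessor to select the matching entry size, which is sanity-checked against the caller's value before guest memory is touched. A hedged caller sketch:

/* Sketch only: wipe one collection-table entry, sized per the live ABI. */
static int example_wipe_cte(struct vgic_its *its, gpa_t gpa)
{
	/* Expands to a vgic_write_guest_lock() of abi->cte_esz bytes. */
	return vgic_its_write_entry_lock(its, gpa, 0ULL, cte);
}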
/*
* Creates a new (reference to a) struct vgic_irq for a given LPI.
* If this LPI is already mapped on another ITS, we increase its refcount
@@ -40,7 +77,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid,
struct kvm_vcpu *vcpu)
{
struct vgic_dist *dist = &kvm->arch.vgic;
- struct vgic_irq *irq = vgic_get_irq(kvm, NULL, intid), *oldirq;
+ struct vgic_irq *irq = vgic_get_irq(kvm, intid), *oldirq;
unsigned long flags;
int ret;
@@ -67,7 +104,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid,
irq->target_vcpu = vcpu;
irq->group = 1;
- raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
+ xa_lock_irqsave(&dist->lpi_xa, flags);
/*
* There could be a race with another vgic_add_lpi(), so we need to
@@ -82,17 +119,14 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid,
goto out_unlock;
}
- ret = xa_err(xa_store(&dist->lpi_xa, intid, irq, 0));
+ ret = xa_err(__xa_store(&dist->lpi_xa, intid, irq, 0));
if (ret) {
xa_release(&dist->lpi_xa, intid);
kfree(irq);
- goto out_unlock;
}
- atomic_inc(&dist->lpi_count);
-
out_unlock:
- raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
+ xa_unlock_irqrestore(&dist->lpi_xa, flags);
if (ret)
return ERR_PTR(ret);
@@ -150,14 +184,6 @@ struct its_ite {
u32 event_id;
};
-struct vgic_translation_cache_entry {
- struct list_head entry;
- phys_addr_t db;
- u32 devid;
- u32 eventid;
- struct vgic_irq *irq;
-};
-
/**
* struct vgic_its_abi - ITS abi ops and settings
* @cte_esz: collection table entry size
@@ -252,8 +278,10 @@ static struct its_ite *find_ite(struct vgic_its *its, u32 device_id,
#define GIC_LPI_OFFSET 8192
-#define VITS_TYPER_IDBITS 16
-#define VITS_TYPER_DEVBITS 16
+#define VITS_TYPER_IDBITS 16
+#define VITS_MAX_EVENTID (BIT(VITS_TYPER_IDBITS) - 1)
+#define VITS_TYPER_DEVBITS 16
+#define VITS_MAX_DEVID (BIT(VITS_TYPER_DEVBITS) - 1)
#define VITS_DTE_MAX_DEVID_OFFSET (BIT(14) - 1)
#define VITS_ITE_MAX_EVENTID_OFFSET (BIT(16) - 1)
@@ -316,53 +344,6 @@ static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq,
return 0;
}
-#define GIC_LPI_MAX_INTID ((1 << INTERRUPT_ID_BITS_ITS) - 1)
-
-/*
- * Create a snapshot of the current LPIs targeting @vcpu, so that we can
- * enumerate those LPIs without holding any lock.
- * Returns their number and puts the kmalloc'ed array into intid_ptr.
- */
-int vgic_copy_lpi_list(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 **intid_ptr)
-{
- struct vgic_dist *dist = &kvm->arch.vgic;
- XA_STATE(xas, &dist->lpi_xa, GIC_LPI_OFFSET);
- struct vgic_irq *irq;
- unsigned long flags;
- u32 *intids;
- int irq_count, i = 0;
-
- /*
- * There is an obvious race between allocating the array and LPIs
- * being mapped/unmapped. If we ended up here as a result of a
- * command, we're safe (locks are held, preventing another
- * command). If coming from another path (such as enabling LPIs),
- * we must be careful not to overrun the array.
- */
- irq_count = atomic_read(&dist->lpi_count);
- intids = kmalloc_array(irq_count, sizeof(intids[0]), GFP_KERNEL_ACCOUNT);
- if (!intids)
- return -ENOMEM;
-
- raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
- rcu_read_lock();
-
- xas_for_each(&xas, irq, GIC_LPI_MAX_INTID) {
- if (i == irq_count)
- break;
- /* We don't need to "get" the IRQ, as we hold the list lock. */
- if (vcpu && irq->target_vcpu != vcpu)
- continue;
- intids[i++] = irq->intid;
- }
-
- rcu_read_unlock();
- raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
-
- *intid_ptr = intids;
- return i;
-}
-
static int update_affinity(struct vgic_irq *irq, struct kvm_vcpu *vcpu)
{
int ret = 0;
@@ -446,23 +427,18 @@ static u32 max_lpis_propbaser(u64 propbaser)
static int its_sync_lpi_pending_table(struct kvm_vcpu *vcpu)
{
gpa_t pendbase = GICR_PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser);
+ struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+ unsigned long intid, flags;
struct vgic_irq *irq;
int last_byte_offset = -1;
int ret = 0;
- u32 *intids;
- int nr_irqs, i;
- unsigned long flags;
u8 pendmask;
- nr_irqs = vgic_copy_lpi_list(vcpu->kvm, vcpu, &intids);
- if (nr_irqs < 0)
- return nr_irqs;
-
- for (i = 0; i < nr_irqs; i++) {
+ xa_for_each(&dist->lpi_xa, intid, irq) {
int byte_offset, bit_nr;
- byte_offset = intids[i] / BITS_PER_BYTE;
- bit_nr = intids[i] % BITS_PER_BYTE;
+ byte_offset = intid / BITS_PER_BYTE;
+ bit_nr = intid % BITS_PER_BYTE;
/*
* For contiguously allocated LPIs chances are we just read
@@ -472,25 +448,23 @@ static int its_sync_lpi_pending_table(struct kvm_vcpu *vcpu)
ret = kvm_read_guest_lock(vcpu->kvm,
pendbase + byte_offset,
&pendmask, 1);
- if (ret) {
- kfree(intids);
+ if (ret)
return ret;
- }
+
last_byte_offset = byte_offset;
}
- irq = vgic_get_irq(vcpu->kvm, NULL, intids[i]);
+ irq = vgic_get_irq(vcpu->kvm, intid);
if (!irq)
continue;
raw_spin_lock_irqsave(&irq->irq_lock, flags);
- irq->pending_latch = pendmask & (1U << bit_nr);
+ if (irq->target_vcpu == vcpu)
+ irq->pending_latch = pendmask & (1U << bit_nr);
vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
vgic_put_irq(vcpu->kvm, irq);
}
- kfree(intids);
-
return ret;
}
@@ -566,51 +540,52 @@ static unsigned long vgic_mmio_read_its_idregs(struct kvm *kvm,
return 0;
}
-static struct vgic_irq *__vgic_its_check_cache(struct vgic_dist *dist,
- phys_addr_t db,
- u32 devid, u32 eventid)
+static struct vgic_its *__vgic_doorbell_to_its(struct kvm *kvm, gpa_t db)
{
- struct vgic_translation_cache_entry *cte;
+ struct kvm_io_device *kvm_io_dev;
+ struct vgic_io_device *iodev;
- list_for_each_entry(cte, &dist->lpi_translation_cache, entry) {
- /*
- * If we hit a NULL entry, there is nothing after this
- * point.
- */
- if (!cte->irq)
- break;
+ kvm_io_dev = kvm_io_bus_get_dev(kvm, KVM_MMIO_BUS, db);
+ if (!kvm_io_dev)
+ return ERR_PTR(-EINVAL);
- if (cte->db != db || cte->devid != devid ||
- cte->eventid != eventid)
- continue;
+ if (kvm_io_dev->ops != &kvm_io_gic_ops)
+ return ERR_PTR(-EINVAL);
- /*
- * Move this entry to the head, as it is the most
- * recently used.
- */
- if (!list_is_first(&cte->entry, &dist->lpi_translation_cache))
- list_move(&cte->entry, &dist->lpi_translation_cache);
+ iodev = container_of(kvm_io_dev, struct vgic_io_device, dev);
+ if (iodev->iodev_type != IODEV_ITS)
+ return ERR_PTR(-EINVAL);
- return cte->irq;
- }
+ return iodev->its;
+}
+
+static unsigned long vgic_its_cache_key(u32 devid, u32 eventid)
+{
+ return (((unsigned long)devid) << VITS_TYPER_IDBITS) | eventid;
- return NULL;
}
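Editor's note (not part of the patch): with VITS_TYPER_IDBITS == 16, the device ID occupies the upper bits of the xarray key and the event ID the lower 16 bits, so each (devid, eventid) pair indexes a distinct translation-cache slot.

/* Sketch only: vgic_its_cache_key(0x12, 0x34) == (0x12UL << 16) | 0x34 == 0x120034. */
static unsigned long example_cache_key(void)
{
	return vgic_its_cache_key(0x12, 0x34);
}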
static struct vgic_irq *vgic_its_check_cache(struct kvm *kvm, phys_addr_t db,
u32 devid, u32 eventid)
{
- struct vgic_dist *dist = &kvm->arch.vgic;
+ unsigned long cache_key = vgic_its_cache_key(devid, eventid);
+ struct vgic_its *its;
struct vgic_irq *irq;
- unsigned long flags;
- raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
+ if (devid > VITS_MAX_DEVID || eventid > VITS_MAX_EVENTID)
+ return NULL;
- irq = __vgic_its_check_cache(dist, db, devid, eventid);
+ its = __vgic_doorbell_to_its(kvm, db);
+ if (IS_ERR(its))
+ return NULL;
+
+ rcu_read_lock();
+
+ irq = xa_load(&its->translation_cache, cache_key);
if (!vgic_try_get_irq_kref(irq))
irq = NULL;
- raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
+ rcu_read_unlock();
return irq;
}
@@ -619,81 +594,68 @@ static void vgic_its_cache_translation(struct kvm *kvm, struct vgic_its *its,
u32 devid, u32 eventid,
struct vgic_irq *irq)
{
- struct vgic_dist *dist = &kvm->arch.vgic;
- struct vgic_translation_cache_entry *cte;
- unsigned long flags;
- phys_addr_t db;
+ unsigned long cache_key = vgic_its_cache_key(devid, eventid);
+ struct vgic_irq *old;
/* Do not cache a directly injected interrupt */
if (irq->hw)
return;
- raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
-
- if (unlikely(list_empty(&dist->lpi_translation_cache)))
- goto out;
-
/*
- * We could have raced with another CPU caching the same
- * translation behind our back, so let's check it is not in
- * already
+ * The irq refcount is guaranteed to be nonzero while holding the
+ * its_lock, as the ITE (and the reference it holds) cannot be freed.
*/
- db = its->vgic_its_base + GITS_TRANSLATER;
- if (__vgic_its_check_cache(dist, db, devid, eventid))
- goto out;
+ lockdep_assert_held(&its->its_lock);
+ vgic_get_irq_kref(irq);
- /* Always reuse the last entry (LRU policy) */
- cte = list_last_entry(&dist->lpi_translation_cache,
- typeof(*cte), entry);
+ old = xa_store(&its->translation_cache, cache_key, irq, GFP_KERNEL_ACCOUNT);
/*
- * Caching the translation implies having an extra reference
- * to the interrupt, so drop the potential reference on what
- * was in the cache, and increment it on the new interrupt.
+ * Put the reference taken on @irq if the store fails. Intentionally do
+ * not return the error as the translation cache is best effort.
*/
- if (cte->irq)
- vgic_put_irq(kvm, cte->irq);
+ if (xa_is_err(old)) {
+ vgic_put_irq(kvm, irq);
+ return;
+ }
/*
- * The irq refcount is guaranteed to be nonzero while holding the
- * its_lock, as the ITE (and the reference it holds) cannot be freed.
+ * We could have raced with another CPU caching the same
+ * translation behind our back, so ensure we don't leak a
+ * reference if that is the case.
*/
- lockdep_assert_held(&its->its_lock);
- vgic_get_irq_kref(irq);
-
- cte->db = db;
- cte->devid = devid;
- cte->eventid = eventid;
- cte->irq = irq;
+ if (old)
+ vgic_put_irq(kvm, old);
+}
- /* Move the new translation to the head of the list */
- list_move(&cte->entry, &dist->lpi_translation_cache);
+static void vgic_its_invalidate_cache(struct vgic_its *its)
+{
+ struct kvm *kvm = its->dev->kvm;
+ struct vgic_irq *irq;
+ unsigned long idx;
-out:
- raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
+ xa_for_each(&its->translation_cache, idx, irq) {
+ xa_erase(&its->translation_cache, idx);
+ vgic_put_irq(kvm, irq);
+ }
}
-void vgic_its_invalidate_cache(struct kvm *kvm)
+void vgic_its_invalidate_all_caches(struct kvm *kvm)
{
- struct vgic_dist *dist = &kvm->arch.vgic;
- struct vgic_translation_cache_entry *cte;
- unsigned long flags;
+ struct kvm_device *dev;
+ struct vgic_its *its;
- raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
+ rcu_read_lock();
- list_for_each_entry(cte, &dist->lpi_translation_cache, entry) {
- /*
- * If we hit a NULL entry, there is nothing after this
- * point.
- */
- if (!cte->irq)
- break;
+ list_for_each_entry_rcu(dev, &kvm->devices, vm_node) {
+ if (dev->ops != &kvm_arm_vgic_its_ops)
+ continue;
- vgic_put_irq(kvm, cte->irq);
- cte->irq = NULL;
+ its = dev->private;
+ vgic_its_invalidate_cache(its);
}
- raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
+ rcu_read_unlock();
}
int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its,
@@ -725,8 +687,6 @@ int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its,
struct vgic_its *vgic_msi_to_its(struct kvm *kvm, struct kvm_msi *msi)
{
u64 address;
- struct kvm_io_device *kvm_io_dev;
- struct vgic_io_device *iodev;
if (!vgic_has_its(kvm))
return ERR_PTR(-ENODEV);
@@ -736,18 +696,7 @@ struct vgic_its *vgic_msi_to_its(struct kvm *kvm, struct kvm_msi *msi)
address = (u64)msi->address_hi << 32 | msi->address_lo;
- kvm_io_dev = kvm_io_bus_get_dev(kvm, KVM_MMIO_BUS, address);
- if (!kvm_io_dev)
- return ERR_PTR(-EINVAL);
-
- if (kvm_io_dev->ops != &kvm_io_gic_ops)
- return ERR_PTR(-EINVAL);
-
- iodev = container_of(kvm_io_dev, struct vgic_io_device, dev);
- if (iodev->iodev_type != IODEV_ITS)
- return ERR_PTR(-EINVAL);
-
- return iodev->its;
+ return __vgic_doorbell_to_its(kvm, address);
}
/*
@@ -878,15 +827,19 @@ static int vgic_its_cmd_handle_discard(struct kvm *kvm, struct vgic_its *its,
ite = find_ite(its, device_id, event_id);
if (ite && its_is_collection_mapped(ite->collection)) {
+ struct its_device *device = find_its_device(its, device_id);
+ int ite_esz = vgic_its_get_abi(its)->ite_esz;
+ gpa_t gpa = device->itt_addr + ite->event_id * ite_esz;
/*
* Though the spec talks about removing the pending state, we
* don't bother here since we clear the ITTE anyway and the
* pending state is a property of the ITTE struct.
*/
- vgic_its_invalidate_cache(kvm);
+ vgic_its_invalidate_cache(its);
its_free_ite(kvm, ite);
- return 0;
+
+ return vgic_its_write_entry_lock(its, gpa, 0ULL, ite);
}
return E_ITS_DISCARD_UNMAPPED_INTERRUPT;
@@ -920,7 +873,7 @@ static int vgic_its_cmd_handle_movi(struct kvm *kvm, struct vgic_its *its,
ite->collection = collection;
vcpu = collection_to_vcpu(kvm, collection);
- vgic_its_invalidate_cache(kvm);
+ vgic_its_invalidate_cache(its);
return update_affinity(ite->irq, vcpu);
}
@@ -955,7 +908,7 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id,
switch (type) {
case GITS_BASER_TYPE_DEVICE:
- if (id >= BIT_ULL(VITS_TYPER_DEVBITS))
+ if (id > VITS_MAX_DEVID)
return false;
break;
case GITS_BASER_TYPE_COLLECTION:
@@ -1167,7 +1120,8 @@ static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its,
}
/* Requires the its_lock to be held. */
-static void vgic_its_free_device(struct kvm *kvm, struct its_device *device)
+static void vgic_its_free_device(struct kvm *kvm, struct vgic_its *its,
+ struct its_device *device)
{
struct its_ite *ite, *temp;
@@ -1179,7 +1133,7 @@ static void vgic_its_free_device(struct kvm *kvm, struct its_device *device)
list_for_each_entry_safe(ite, temp, &device->itt_head, ite_list)
its_free_ite(kvm, ite);
- vgic_its_invalidate_cache(kvm);
+ vgic_its_invalidate_cache(its);
list_del(&device->dev_list);
kfree(device);
@@ -1191,7 +1145,7 @@ static void vgic_its_free_device_list(struct kvm *kvm, struct vgic_its *its)
struct its_device *cur, *temp;
list_for_each_entry_safe(cur, temp, &its->device_list, dev_list)
- vgic_its_free_device(kvm, cur);
+ vgic_its_free_device(kvm, its, cur);
}
/* its lock must be held */
@@ -1235,8 +1189,9 @@ static int vgic_its_cmd_handle_mapd(struct kvm *kvm, struct vgic_its *its,
u8 num_eventid_bits = its_cmd_get_size(its_cmd);
gpa_t itt_addr = its_cmd_get_ittaddr(its_cmd);
struct its_device *device;
+ gpa_t gpa;
- if (!vgic_its_check_id(its, its->baser_device_table, device_id, NULL))
+ if (!vgic_its_check_id(its, its->baser_device_table, device_id, &gpa))
return E_ITS_MAPD_DEVICE_OOR;
if (valid && num_eventid_bits > VITS_TYPER_IDBITS)
@@ -1250,14 +1205,14 @@ static int vgic_its_cmd_handle_mapd(struct kvm *kvm, struct vgic_its *its,
* by removing the mapping and re-establishing it.
*/
if (device)
- vgic_its_free_device(kvm, device);
+ vgic_its_free_device(kvm, its, device);
/*
* The spec does not say whether unmapping a not-mapped device
* is an error, so we are done in any case.
*/
if (!valid)
- return 0;
+ return vgic_its_write_entry_lock(its, gpa, 0ULL, dte);
device = vgic_its_alloc_device(its, device_id, itt_addr,
num_eventid_bits);
@@ -1281,7 +1236,7 @@ static int vgic_its_cmd_handle_mapc(struct kvm *kvm, struct vgic_its *its,
if (!valid) {
vgic_its_free_collection(its, coll_id);
- vgic_its_invalidate_cache(kvm);
+ vgic_its_invalidate_cache(its);
} else {
struct kvm_vcpu *vcpu;
@@ -1372,23 +1327,19 @@ static int vgic_its_cmd_handle_inv(struct kvm *kvm, struct vgic_its *its,
int vgic_its_invall(struct kvm_vcpu *vcpu)
{
struct kvm *kvm = vcpu->kvm;
- int irq_count, i = 0;
- u32 *intids;
-
- irq_count = vgic_copy_lpi_list(kvm, vcpu, &intids);
- if (irq_count < 0)
- return irq_count;
+ struct vgic_dist *dist = &kvm->arch.vgic;
+ struct vgic_irq *irq;
+ unsigned long intid;
- for (i = 0; i < irq_count; i++) {
- struct vgic_irq *irq = vgic_get_irq(kvm, NULL, intids[i]);
+ xa_for_each(&dist->lpi_xa, intid, irq) {
+ irq = vgic_get_irq(kvm, intid);
if (!irq)
continue;
+
update_lpi_config(kvm, irq, vcpu, false);
vgic_put_irq(kvm, irq);
}
- kfree(intids);
-
if (vcpu->arch.vgic_cpu.vgic_v3.its_vpe.its_vm)
its_invall_vpe(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe);
@@ -1431,10 +1382,10 @@ static int vgic_its_cmd_handle_invall(struct kvm *kvm, struct vgic_its *its,
static int vgic_its_cmd_handle_movall(struct kvm *kvm, struct vgic_its *its,
u64 *its_cmd)
{
+ struct vgic_dist *dist = &kvm->arch.vgic;
struct kvm_vcpu *vcpu1, *vcpu2;
struct vgic_irq *irq;
- u32 *intids;
- int irq_count, i;
+ unsigned long intid;
/* We advertise GITS_TYPER.PTA==0, making the address the vcpu ID */
vcpu1 = kvm_get_vcpu_by_id(kvm, its_cmd_get_target_addr(its_cmd));
@@ -1446,12 +1397,8 @@ static int vgic_its_cmd_handle_movall(struct kvm *kvm, struct vgic_its *its,
if (vcpu1 == vcpu2)
return 0;
- irq_count = vgic_copy_lpi_list(kvm, vcpu1, &intids);
- if (irq_count < 0)
- return irq_count;
-
- for (i = 0; i < irq_count; i++) {
- irq = vgic_get_irq(kvm, NULL, intids[i]);
+ xa_for_each(&dist->lpi_xa, intid, irq) {
+ irq = vgic_get_irq(kvm, intid);
if (!irq)
continue;
@@ -1460,9 +1407,8 @@ static int vgic_its_cmd_handle_movall(struct kvm *kvm, struct vgic_its *its,
vgic_put_irq(kvm, irq);
}
- vgic_its_invalidate_cache(kvm);
+ vgic_its_invalidate_cache(its);
- kfree(intids);
return 0;
}
@@ -1813,7 +1759,7 @@ static void vgic_mmio_write_its_ctlr(struct kvm *kvm, struct vgic_its *its,
its->enabled = !!(val & GITS_CTLR_ENABLE);
if (!its->enabled)
- vgic_its_invalidate_cache(kvm);
+ vgic_its_invalidate_cache(its);
/*
* Try to process any pending commands. This function bails out early
@@ -1914,47 +1860,6 @@ out:
return ret;
}
-/* Default is 16 cached LPIs per vcpu */
-#define LPI_DEFAULT_PCPU_CACHE_SIZE 16
-
-void vgic_lpi_translation_cache_init(struct kvm *kvm)
-{
- struct vgic_dist *dist = &kvm->arch.vgic;
- unsigned int sz;
- int i;
-
- if (!list_empty(&dist->lpi_translation_cache))
- return;
-
- sz = atomic_read(&kvm->online_vcpus) * LPI_DEFAULT_PCPU_CACHE_SIZE;
-
- for (i = 0; i < sz; i++) {
- struct vgic_translation_cache_entry *cte;
-
- /* An allocation failure is not fatal */
- cte = kzalloc(sizeof(*cte), GFP_KERNEL_ACCOUNT);
- if (WARN_ON(!cte))
- break;
-
- INIT_LIST_HEAD(&cte->entry);
- list_add(&cte->entry, &dist->lpi_translation_cache);
- }
-}
-
-void vgic_lpi_translation_cache_destroy(struct kvm *kvm)
-{
- struct vgic_dist *dist = &kvm->arch.vgic;
- struct vgic_translation_cache_entry *cte, *tmp;
-
- vgic_its_invalidate_cache(kvm);
-
- list_for_each_entry_safe(cte, tmp,
- &dist->lpi_translation_cache, entry) {
- list_del(&cte->entry);
- kfree(cte);
- }
-}
-
#define INITIAL_BASER_VALUE \
(GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWb) | \
GIC_BASER_CACHEABILITY(GITS_BASER, OUTER, SameAsInner) | \
@@ -1987,8 +1892,6 @@ static int vgic_its_create(struct kvm_device *dev, u32 type)
kfree(its);
return ret;
}
-
- vgic_lpi_translation_cache_init(dev->kvm);
}
mutex_init(&its->its_lock);
@@ -2006,6 +1909,7 @@ static int vgic_its_create(struct kvm_device *dev, u32 type)
INIT_LIST_HEAD(&its->device_list);
INIT_LIST_HEAD(&its->collection_list);
+ xa_init(&its->translation_cache);
dev->kvm->arch.vgic.msis_require_devid = true;
dev->kvm->arch.vgic.has_its = true;
@@ -2036,6 +1940,8 @@ static void vgic_its_destroy(struct kvm_device *kvm_dev)
vgic_its_free_device_list(kvm, its);
vgic_its_free_collection_list(kvm, its);
+ vgic_its_invalidate_cache(its);
+ xa_destroy(&its->translation_cache);
mutex_unlock(&its->its_lock);
kfree(its);
@@ -2184,6 +2090,7 @@ typedef int (*entry_fn_t)(struct vgic_its *its, u32 id, void *entry,
* @start_id: the ID of the first entry in the table
* (non zero for 2d level tables)
* @fn: function to apply on each entry
+ * @opaque: pointer to opaque data
*
* Return: < 0 on error, 0 if last element was identified, 1 otherwise
* (the last element may not be found on second level tables)
@@ -2223,13 +2130,12 @@ static int scan_its_table(struct vgic_its *its, gpa_t base, int size, u32 esz,
return 1;
}
-/**
+/*
* vgic_its_save_ite - Save an interrupt translation entry at @gpa
*/
static int vgic_its_save_ite(struct vgic_its *its, struct its_device *dev,
- struct its_ite *ite, gpa_t gpa, int ite_esz)
+ struct its_ite *ite, gpa_t gpa)
{
- struct kvm *kvm = its->dev->kvm;
u32 next_offset;
u64 val;
@@ -2238,11 +2144,14 @@ static int vgic_its_save_ite(struct vgic_its *its, struct its_device *dev,
((u64)ite->irq->intid << KVM_ITS_ITE_PINTID_SHIFT) |
ite->collection->collection_id;
val = cpu_to_le64(val);
- return vgic_write_guest_lock(kvm, gpa, &val, ite_esz);
+
+ return vgic_its_write_entry_lock(its, gpa, val, ite);
}
/**
* vgic_its_restore_ite - restore an interrupt translation entry
+ *
+ * @its: its handle
* @event_id: id used for indexing
* @ptr: pointer to the ITE entry
* @opaque: pointer to the its_device
@@ -2336,7 +2245,7 @@ static int vgic_its_save_itt(struct vgic_its *its, struct its_device *device)
if (ite->irq->hw && !kvm_vgic_global_state.has_gicv4_1)
return -EACCES;
- ret = vgic_its_save_ite(its, device, ite, gpa, ite_esz);
+ ret = vgic_its_save_ite(its, device, ite, gpa);
if (ret)
return ret;
}
@@ -2377,9 +2286,8 @@ static int vgic_its_restore_itt(struct vgic_its *its, struct its_device *dev)
* @ptr: GPA
*/
static int vgic_its_save_dte(struct vgic_its *its, struct its_device *dev,
- gpa_t ptr, int dte_esz)
+ gpa_t ptr)
{
- struct kvm *kvm = its->dev->kvm;
u64 val, itt_addr_field;
u32 next_offset;
@@ -2390,7 +2298,8 @@ static int vgic_its_save_dte(struct vgic_its *its, struct its_device *dev,
(itt_addr_field << KVM_ITS_DTE_ITTADDR_SHIFT) |
(dev->num_eventid_bits - 1));
val = cpu_to_le64(val);
- return vgic_write_guest_lock(kvm, ptr, &val, dte_esz);
+
+ return vgic_its_write_entry_lock(its, ptr, val, dte);
}
/**
@@ -2438,7 +2347,7 @@ static int vgic_its_restore_dte(struct vgic_its *its, u32 id,
ret = vgic_its_restore_itt(its, dev);
if (ret) {
- vgic_its_free_device(its->dev->kvm, dev);
+ vgic_its_free_device(its->dev->kvm, its, dev);
return ret;
}
@@ -2457,7 +2366,7 @@ static int vgic_its_device_cmp(void *priv, const struct list_head *a,
return 1;
}
-/**
+/*
* vgic_its_save_device_tables - Save the device table and all ITT
* into guest RAM
*
@@ -2466,10 +2375,8 @@ static int vgic_its_device_cmp(void *priv, const struct list_head *a,
*/
static int vgic_its_save_device_tables(struct vgic_its *its)
{
- const struct vgic_its_abi *abi = vgic_its_get_abi(its);
u64 baser = its->baser_device_table;
struct its_device *dev;
- int dte_esz = abi->dte_esz;
if (!(baser & GITS_BASER_VALID))
return 0;
@@ -2488,7 +2395,7 @@ static int vgic_its_save_device_tables(struct vgic_its *its)
if (ret)
return ret;
- ret = vgic_its_save_dte(its, dev, eaddr, dte_esz);
+ ret = vgic_its_save_dte(its, dev, eaddr);
if (ret)
return ret;
}
@@ -2530,7 +2437,7 @@ static int handle_l1_dte(struct vgic_its *its, u32 id, void *addr,
return ret;
}
-/**
+/*
* vgic_its_restore_device_tables - Restore the device table and all ITT
* from guest RAM to internal data structs
*/
@@ -2569,7 +2476,7 @@ static int vgic_its_restore_device_tables(struct vgic_its *its)
static int vgic_its_save_cte(struct vgic_its *its,
struct its_collection *collection,
- gpa_t gpa, int esz)
+ gpa_t gpa)
{
u64 val;
@@ -2577,7 +2484,8 @@ static int vgic_its_save_cte(struct vgic_its *its,
((u64)collection->target_addr << KVM_ITS_CTE_RDBASE_SHIFT) |
collection->collection_id);
val = cpu_to_le64(val);
- return vgic_write_guest_lock(its->dev->kvm, gpa, &val, esz);
+
+ return vgic_its_write_entry_lock(its, gpa, val, cte);
}
/*
@@ -2585,7 +2493,7 @@ static int vgic_its_save_cte(struct vgic_its *its,
* Return +1 on success, 0 if the entry was invalid (which should be
* interpreted as end-of-table), and a negative error value for generic errors.
*/
-static int vgic_its_restore_cte(struct vgic_its *its, gpa_t gpa, int esz)
+static int vgic_its_restore_cte(struct vgic_its *its, gpa_t gpa)
{
struct its_collection *collection;
struct kvm *kvm = its->dev->kvm;
@@ -2593,8 +2501,7 @@ static int vgic_its_restore_cte(struct vgic_its *its, gpa_t gpa, int esz)
u64 val;
int ret;
- BUG_ON(esz > sizeof(val));
- ret = kvm_read_guest_lock(kvm, gpa, &val, esz);
+ ret = vgic_its_read_entry_lock(its, gpa, &val, cte);
if (ret)
return ret;
val = le64_to_cpu(val);
@@ -2622,7 +2529,7 @@ static int vgic_its_restore_cte(struct vgic_its *its, gpa_t gpa, int esz)
return 1;
}
-/**
+/*
* vgic_its_save_collection_table - Save the collection table into
* guest RAM
*/
@@ -2632,7 +2539,6 @@ static int vgic_its_save_collection_table(struct vgic_its *its)
u64 baser = its->baser_coll_table;
gpa_t gpa = GITS_BASER_ADDR_48_to_52(baser);
struct its_collection *collection;
- u64 val;
size_t max_size, filled = 0;
int ret, cte_esz = abi->cte_esz;
@@ -2642,7 +2548,7 @@ static int vgic_its_save_collection_table(struct vgic_its *its)
max_size = GITS_BASER_NR_PAGES(baser) * SZ_64K;
list_for_each_entry(collection, &its->collection_list, coll_list) {
- ret = vgic_its_save_cte(its, collection, gpa, cte_esz);
+ ret = vgic_its_save_cte(its, collection, gpa);
if (ret)
return ret;
gpa += cte_esz;
@@ -2656,13 +2562,10 @@ static int vgic_its_save_collection_table(struct vgic_its *its)
* table is not fully filled, add a last dummy element
* with valid bit unset
*/
- val = 0;
- BUG_ON(cte_esz > sizeof(val));
- ret = vgic_write_guest_lock(its->dev->kvm, gpa, &val, cte_esz);
- return ret;
+ return vgic_its_write_entry_lock(its, gpa, 0ULL, cte);
}
-/**
+/*
* vgic_its_restore_collection_table - reads the collection table
* in guest memory and restores the ITS internal state. Requires the
* BASER registers to be restored before.
@@ -2684,7 +2587,7 @@ static int vgic_its_restore_collection_table(struct vgic_its *its)
max_size = GITS_BASER_NR_PAGES(baser) * SZ_64K;
while (read < max_size) {
- ret = vgic_its_restore_cte(its, gpa, cte_esz);
+ ret = vgic_its_restore_cte(its, gpa);
if (ret <= 0)
break;
gpa += cte_esz;
@@ -2700,7 +2603,7 @@ static int vgic_its_restore_collection_table(struct vgic_its *its)
return ret;
}
-/**
+/*
* vgic_its_save_tables_v0 - Save the ITS tables into guest ARM
* according to v0 ABI
*/
@@ -2715,7 +2618,7 @@ static int vgic_its_save_tables_v0(struct vgic_its *its)
return vgic_its_save_collection_table(its);
}
-/**
+/*
* vgic_its_restore_tables_v0 - Restore the ITS tables from guest RAM
* to internal data structs according to V0 ABI
*
diff --git a/arch/arm64/kvm/vgic/vgic-kvm-device.c b/arch/arm64/kvm/vgic/vgic-kvm-device.c
index f48b8dab8b3d..359094f68c23 100644
--- a/arch/arm64/kvm/vgic/vgic-kvm-device.c
+++ b/arch/arm64/kvm/vgic/vgic-kvm-device.c
@@ -236,7 +236,12 @@ static int vgic_set_common_attr(struct kvm_device *dev,
mutex_lock(&dev->kvm->arch.config_lock);
- if (vgic_ready(dev->kvm) || dev->kvm->arch.vgic.nr_spis)
+ /*
+ * Either userspace has already configured NR_IRQS or
+ * the vgic has already been initialized and vgic_init()
+ * supplied a default number of SPIs.
+ */
+ if (dev->kvm->arch.vgic.nr_spis)
ret = -EBUSY;
else
dev->kvm->arch.vgic.nr_spis =
@@ -298,6 +303,12 @@ static int vgic_get_common_attr(struct kvm_device *dev,
VGIC_NR_PRIVATE_IRQS, uaddr);
break;
}
+ case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ: {
+ u32 __user *uaddr = (u32 __user *)(long)attr->addr;
+
+ r = put_user(dev->kvm->arch.vgic.mi_intid, uaddr);
+ break;
+ }
}
return r;
@@ -338,12 +349,12 @@ int kvm_register_vgic_device(unsigned long type)
int vgic_v2_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr,
struct vgic_reg_attr *reg_attr)
{
- int cpuid;
-
- cpuid = FIELD_GET(KVM_DEV_ARM_VGIC_CPUID_MASK, attr->attr);
+ int cpuid = FIELD_GET(KVM_DEV_ARM_VGIC_CPUID_MASK, attr->attr);
- reg_attr->vcpu = kvm_get_vcpu_by_id(dev->kvm, cpuid);
reg_attr->addr = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
+ reg_attr->vcpu = kvm_get_vcpu_by_id(dev->kvm, cpuid);
+ if (!reg_attr->vcpu)
+ return -EINVAL;
return 0;
}
@@ -512,7 +523,7 @@ static int vgic_v3_attr_regs_access(struct kvm_device *dev,
struct vgic_reg_attr reg_attr;
gpa_t addr;
struct kvm_vcpu *vcpu;
- bool uaccess;
+ bool uaccess, post_init = true;
u32 val;
int ret;
@@ -528,6 +539,9 @@ static int vgic_v3_attr_regs_access(struct kvm_device *dev,
/* Sysregs uaccess is performed by the sysreg handling code */
uaccess = false;
break;
+ case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ:
+ post_init = false;
+ fallthrough;
default:
uaccess = true;
}
@@ -547,7 +561,7 @@ static int vgic_v3_attr_regs_access(struct kvm_device *dev,
mutex_lock(&dev->kvm->arch.config_lock);
- if (unlikely(!vgic_initialized(dev->kvm))) {
+ if (post_init != vgic_initialized(dev->kvm)) {
ret = -EBUSY;
goto out;
}
@@ -577,6 +591,19 @@ static int vgic_v3_attr_regs_access(struct kvm_device *dev,
}
break;
}
+ case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ:
+ if (!is_write) {
+ val = dev->kvm->arch.vgic.mi_intid;
+ ret = 0;
+ break;
+ }
+
+ ret = -EINVAL;
+ if ((val < VGIC_NR_PRIVATE_IRQS) && (val >= VGIC_NR_SGIS)) {
+ dev->kvm->arch.vgic.mi_intid = val;
+ ret = 0;
+ }
+ break;
default:
ret = -EINVAL;
break;
@@ -603,6 +630,7 @@ static int vgic_v3_set_attr(struct kvm_device *dev,
case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS:
case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS:
case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO:
+ case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ:
return vgic_v3_attr_regs_access(dev, attr, true);
default:
return vgic_set_common_attr(dev, attr);
@@ -617,6 +645,7 @@ static int vgic_v3_get_attr(struct kvm_device *dev,
case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS:
case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS:
case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO:
+ case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ:
return vgic_v3_attr_regs_access(dev, attr, false);
default:
return vgic_get_common_attr(dev, attr);
@@ -640,6 +669,7 @@ static int vgic_v3_has_attr(struct kvm_device *dev,
case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS:
return vgic_v3_has_attr_regs(dev, attr);
case KVM_DEV_ARM_VGIC_GRP_NR_IRQS:
+ case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ:
return 0;
case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: {
if (((attr->attr & KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK) >>
diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v2.c b/arch/arm64/kvm/vgic/vgic-mmio-v2.c
index e070cda86e12..f25fccb1f8e6 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio-v2.c
+++ b/arch/arm64/kvm/vgic/vgic-mmio-v2.c
@@ -148,7 +148,7 @@ static void vgic_mmio_write_sgir(struct kvm_vcpu *source_vcpu,
if (!(targets & (1U << c)))
continue;
- irq = vgic_get_irq(source_vcpu->kvm, vcpu, intid);
+ irq = vgic_get_vcpu_irq(vcpu, intid);
raw_spin_lock_irqsave(&irq->irq_lock, flags);
irq->pending_latch = true;
@@ -167,7 +167,7 @@ static unsigned long vgic_mmio_read_target(struct kvm_vcpu *vcpu,
u64 val = 0;
for (i = 0; i < len; i++) {
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i);
val |= (u64)irq->targets << (i * 8);
@@ -191,7 +191,7 @@ static void vgic_mmio_write_target(struct kvm_vcpu *vcpu,
return;
for (i = 0; i < len; i++) {
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid + i);
+ struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, intid + i);
int target;
raw_spin_lock_irqsave(&irq->irq_lock, flags);
@@ -213,7 +213,7 @@ static unsigned long vgic_mmio_read_sgipend(struct kvm_vcpu *vcpu,
u64 val = 0;
for (i = 0; i < len; i++) {
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i);
val |= (u64)irq->source << (i * 8);
@@ -231,7 +231,7 @@ static void vgic_mmio_write_sgipendc(struct kvm_vcpu *vcpu,
unsigned long flags;
for (i = 0; i < len; i++) {
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i);
raw_spin_lock_irqsave(&irq->irq_lock, flags);
@@ -253,7 +253,7 @@ static void vgic_mmio_write_sgipends(struct kvm_vcpu *vcpu,
unsigned long flags;
for (i = 0; i < len; i++) {
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i);
raw_spin_lock_irqsave(&irq->irq_lock, flags);
diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
index c15ee1df036a..ae4c0593d114 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
@@ -194,7 +194,7 @@ static unsigned long vgic_mmio_read_irouter(struct kvm_vcpu *vcpu,
gpa_t addr, unsigned int len)
{
int intid = VGIC_ADDR_TO_INTID(addr, 64);
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid);
+ struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, intid);
unsigned long ret = 0;
if (!irq)
@@ -220,7 +220,7 @@ static void vgic_mmio_write_irouter(struct kvm_vcpu *vcpu,
if (addr & 4)
return;
- irq = vgic_get_irq(vcpu->kvm, NULL, intid);
+ irq = vgic_get_irq(vcpu->kvm, intid);
if (!irq)
return;
@@ -277,7 +277,7 @@ static void vgic_mmio_write_v3r_ctlr(struct kvm_vcpu *vcpu,
return;
vgic_flush_pending_lpis(vcpu);
- vgic_its_invalidate_cache(vcpu->kvm);
+ vgic_its_invalidate_all_caches(vcpu->kvm);
atomic_set_release(&vgic_cpu->ctlr, 0);
} else {
ctlr = atomic_cmpxchg_acquire(&vgic_cpu->ctlr, 0,
@@ -530,6 +530,7 @@ static void vgic_mmio_write_invlpi(struct kvm_vcpu *vcpu,
unsigned long val)
{
struct vgic_irq *irq;
+ u32 intid;
/*
* If the guest wrote only to the upper 32bit part of the
@@ -541,9 +542,13 @@ static void vgic_mmio_write_invlpi(struct kvm_vcpu *vcpu,
if ((addr & 4) || !vgic_lpis_enabled(vcpu))
return;
+ intid = lower_32_bits(val);
+ if (intid < VGIC_MIN_LPI)
+ return;
+
vgic_set_rdist_busy(vcpu, true);
- irq = vgic_get_irq(vcpu->kvm, NULL, lower_32_bits(val));
+ irq = vgic_get_irq(vcpu->kvm, intid);
if (irq) {
vgic_its_inv_lpi(vcpu->kvm, irq);
vgic_put_irq(vcpu->kvm, irq);
@@ -919,8 +924,19 @@ free:
return ret;
}
-void vgic_v3_free_redist_region(struct vgic_redist_region *rdreg)
+void vgic_v3_free_redist_region(struct kvm *kvm, struct vgic_redist_region *rdreg)
{
+ struct kvm_vcpu *vcpu;
+ unsigned long c;
+
+ lockdep_assert_held(&kvm->arch.config_lock);
+
+ /* Garbage collect the region */
+ kvm_for_each_vcpu(c, vcpu, kvm) {
+ if (vcpu->arch.vgic_cpu.rdreg == rdreg)
+ vcpu->arch.vgic_cpu.rdreg = NULL;
+ }
+
list_del(&rdreg->list);
kfree(rdreg);
}
@@ -945,7 +961,7 @@ int vgic_v3_set_redist_base(struct kvm *kvm, u32 index, u64 addr, u32 count)
mutex_lock(&kvm->arch.config_lock);
rdreg = vgic_v3_rdist_region_from_index(kvm, index);
- vgic_v3_free_redist_region(rdreg);
+ vgic_v3_free_redist_region(kvm, rdreg);
mutex_unlock(&kvm->arch.config_lock);
return ret;
}
@@ -1009,7 +1025,7 @@ int vgic_v3_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr)
static void vgic_v3_queue_sgi(struct kvm_vcpu *vcpu, u32 sgi, bool allow_group1)
{
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, sgi);
+ struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, sgi);
unsigned long flags;
raw_spin_lock_irqsave(&irq->irq_lock, flags);
diff --git a/arch/arm64/kvm/vgic/vgic-mmio.c b/arch/arm64/kvm/vgic/vgic-mmio.c
index cf76523a2194..e416e433baff 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio.c
+++ b/arch/arm64/kvm/vgic/vgic-mmio.c
@@ -50,7 +50,7 @@ unsigned long vgic_mmio_read_group(struct kvm_vcpu *vcpu,
/* Loop over all IRQs affected by this read */
for (i = 0; i < len * 8; i++) {
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i);
if (irq->group)
value |= BIT(i);
@@ -74,7 +74,7 @@ void vgic_mmio_write_group(struct kvm_vcpu *vcpu, gpa_t addr,
unsigned long flags;
for (i = 0; i < len * 8; i++) {
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i);
raw_spin_lock_irqsave(&irq->irq_lock, flags);
irq->group = !!(val & BIT(i));
@@ -102,7 +102,7 @@ unsigned long vgic_mmio_read_enable(struct kvm_vcpu *vcpu,
/* Loop over all IRQs affected by this read */
for (i = 0; i < len * 8; i++) {
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i);
if (irq->enabled)
value |= (1U << i);
@@ -122,7 +122,7 @@ void vgic_mmio_write_senable(struct kvm_vcpu *vcpu,
unsigned long flags;
for_each_set_bit(i, &val, len * 8) {
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i);
raw_spin_lock_irqsave(&irq->irq_lock, flags);
if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
@@ -171,7 +171,7 @@ void vgic_mmio_write_cenable(struct kvm_vcpu *vcpu,
unsigned long flags;
for_each_set_bit(i, &val, len * 8) {
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i);
raw_spin_lock_irqsave(&irq->irq_lock, flags);
if (irq->hw && vgic_irq_is_sgi(irq->intid) && irq->enabled)
@@ -193,7 +193,7 @@ int vgic_uaccess_write_senable(struct kvm_vcpu *vcpu,
unsigned long flags;
for_each_set_bit(i, &val, len * 8) {
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i);
raw_spin_lock_irqsave(&irq->irq_lock, flags);
irq->enabled = true;
@@ -214,7 +214,7 @@ int vgic_uaccess_write_cenable(struct kvm_vcpu *vcpu,
unsigned long flags;
for_each_set_bit(i, &val, len * 8) {
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i);
raw_spin_lock_irqsave(&irq->irq_lock, flags);
irq->enabled = false;
@@ -236,7 +236,7 @@ static unsigned long __read_pending(struct kvm_vcpu *vcpu,
/* Loop over all IRQs affected by this read */
for (i = 0; i < len * 8; i++) {
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i);
unsigned long flags;
bool val;
@@ -309,7 +309,7 @@ static void __set_pending(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len,
unsigned long flags;
for_each_set_bit(i, &val, len * 8) {
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i);
/* GICD_ISPENDR0 SGI bits are WI when written from the guest. */
if (is_vgic_v2_sgi(vcpu, irq) && !is_user) {
@@ -395,7 +395,7 @@ static void __clear_pending(struct kvm_vcpu *vcpu,
unsigned long flags;
for_each_set_bit(i, &val, len * 8) {
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i);
/* GICD_ICPENDR0 SGI bits are WI when written from the guest. */
if (is_vgic_v2_sgi(vcpu, irq) && !is_user) {
@@ -494,7 +494,7 @@ static unsigned long __vgic_mmio_read_active(struct kvm_vcpu *vcpu,
/* Loop over all IRQs affected by this read */
for (i = 0; i < len * 8; i++) {
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i);
/*
* Even for HW interrupts, don't evaluate the HW state as
@@ -598,7 +598,7 @@ static void __vgic_mmio_write_cactive(struct kvm_vcpu *vcpu,
int i;
for_each_set_bit(i, &val, len * 8) {
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i);
vgic_mmio_change_active(vcpu, irq, false);
vgic_put_irq(vcpu->kvm, irq);
}
@@ -635,7 +635,7 @@ static void __vgic_mmio_write_sactive(struct kvm_vcpu *vcpu,
int i;
for_each_set_bit(i, &val, len * 8) {
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i);
vgic_mmio_change_active(vcpu, irq, true);
vgic_put_irq(vcpu->kvm, irq);
}
@@ -672,7 +672,7 @@ unsigned long vgic_mmio_read_priority(struct kvm_vcpu *vcpu,
u64 val = 0;
for (i = 0; i < len; i++) {
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i);
val |= (u64)irq->priority << (i * 8);
@@ -698,7 +698,7 @@ void vgic_mmio_write_priority(struct kvm_vcpu *vcpu,
unsigned long flags;
for (i = 0; i < len; i++) {
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i);
raw_spin_lock_irqsave(&irq->irq_lock, flags);
/* Narrow the priority range to what we actually support */
@@ -719,7 +719,7 @@ unsigned long vgic_mmio_read_config(struct kvm_vcpu *vcpu,
int i;
for (i = 0; i < len * 4; i++) {
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i);
if (irq->config == VGIC_CONFIG_EDGE)
value |= (2U << (i * 2));
@@ -750,7 +750,7 @@ void vgic_mmio_write_config(struct kvm_vcpu *vcpu,
if (intid + i < VGIC_NR_PRIVATE_IRQS)
continue;
- irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ irq = vgic_get_irq(vcpu->kvm, intid + i);
raw_spin_lock_irqsave(&irq->irq_lock, flags);
if (test_bit(i * 2 + 1, &val))
@@ -775,7 +775,7 @@ u32 vgic_read_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid)
if ((intid + i) < VGIC_NR_SGIS || (intid + i) >= nr_irqs)
continue;
- irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ irq = vgic_get_vcpu_irq(vcpu, intid + i);
if (irq->config == VGIC_CONFIG_LEVEL && irq->line_level)
val |= (1U << i);
@@ -799,7 +799,7 @@ void vgic_write_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid,
if ((intid + i) < VGIC_NR_SGIS || (intid + i) >= nr_irqs)
continue;
- irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ irq = vgic_get_vcpu_irq(vcpu, intid + i);
/*
* Line level is set irrespective of irq type
diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c
index 7e9cdb78f7ce..381673f03c39 100644
--- a/arch/arm64/kvm/vgic/vgic-v2.c
+++ b/arch/arm64/kvm/vgic/vgic-v2.c
@@ -72,7 +72,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
kvm_notify_acked_irq(vcpu->kvm, 0,
intid - VGIC_NR_PRIVATE_IRQS);
- irq = vgic_get_irq(vcpu->kvm, vcpu, intid);
+ irq = vgic_get_vcpu_irq(vcpu, intid);
raw_spin_lock(&irq->irq_lock);
@@ -464,17 +464,10 @@ void vgic_v2_load(struct kvm_vcpu *vcpu)
kvm_vgic_global_state.vctrl_base + GICH_APR);
}
-void vgic_v2_vmcr_sync(struct kvm_vcpu *vcpu)
-{
- struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
-
- cpu_if->vgic_vmcr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VMCR);
-}
-
void vgic_v2_put(struct kvm_vcpu *vcpu)
{
struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
- vgic_v2_vmcr_sync(vcpu);
+ cpu_if->vgic_vmcr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VMCR);
cpu_if->vgic_apr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_APR);
}
diff --git a/arch/arm64/kvm/vgic/vgic-v3-nested.c b/arch/arm64/kvm/vgic/vgic-v3-nested.c
new file mode 100644
index 000000000000..bfa5bde1f106
--- /dev/null
+++ b/arch/arm64/kvm/vgic/vgic-v3-nested.c
@@ -0,0 +1,409 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/cpu.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/uaccess.h>
+
+#include <kvm/arm_vgic.h>
+
+#include <asm/kvm_arm.h>
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_nested.h>
+
+#include "vgic.h"
+
+#define ICH_LRN(n) (ICH_LR0_EL2 + (n))
+#define ICH_AP0RN(n) (ICH_AP0R0_EL2 + (n))
+#define ICH_AP1RN(n) (ICH_AP1R0_EL2 + (n))
+
+struct mi_state {
+ u16 eisr;
+ u16 elrsr;
+ bool pend;
+};
+
+/*
+ * The shadow registers loaded to the hardware when running a L2 guest
+ * with the virtual IMO/FMO bits set.
+ */
+struct shadow_if {
+ struct vgic_v3_cpu_if cpuif;
+ unsigned long lr_map;
+};
+
+static DEFINE_PER_CPU(struct shadow_if, shadow_if);
+
+/*
+ * Nesting GICv3 support
+ *
+ * On a non-nesting VM (only running at EL0/EL1), the host hypervisor
+ * completely controls the interrupts injected via the list registers.
+ * Consequently, most of the state that is modified by the guest (by ACK-ing
+ * and EOI-ing interrupts) is synced by KVM on each entry/exit, so that we
+ * keep a semi-consistent view of the interrupts.
+ *
+ * This still applies for a NV guest, but only while "InHost" (either
+ * running at EL2, or at EL0 with HCR_EL2.{E2H,TGE}=={1,1}).
+ *
+ * When running a L2 guest ("not InHost"), things are radically different,
+ * as the L1 guest is in charge of provisioning the interrupts via its own
+ * view of the ICH_LR*_EL2 registers, which conveniently live in the VNCR
+ * page. This means that the flow described above does work (there is no
+ * state to rebuild in the L0 hypervisor), and that most things happen on L2
+ * load/put:
+ *
+ * - on L2 load: move the in-memory L1 vGIC configuration into a shadow,
+ * per-CPU data structure that is used to populate the actual LRs. This is
+ * an extra copy that we could avoid, but life is short. In the process,
+ * we remap any interrupt that has the HW bit set to the mapped interrupt
+ * on the host, should the host consider it a HW one. This allows the HW
+ * deactivation to take its course, such as for the timer.
+ *
+ * - on L2 put: perform the inverse transformation, so that the result of L2
+ * running becomes visible to L1 in the VNCR-accessible registers.
+ *
+ * - there is nothing to do on L2 entry, as everything will have happened
+ * on load. However, this is the point where we detect an interrupt
+ * targeting L1 and prepare the grand switcheroo.
+ *
+ * - on L2 exit: emulate the HW bit, and deactivate the corresponding L1
+ * interrupt. The L0 active state will be cleared by the HW if the L1
+ * interrupt was itself backed by a HW interrupt.
+ *
+ * Maintenance Interrupt (MI) management:
+ *
+ * Since the L2 guest runs the vgic in its full glory, MIs get delivered and
+ * used as a handover point between L2 and L1.
+ *
+ * - on delivery of an MI to L0 while L2 is running: make the L1 MI pending,
+ * and let it rip. This will initiate a vcpu_put() on L2, and allow L1 to
+ * run and process the MI.
+ *
+ * - L1 MI is a fully virtual interrupt, not linked to the host's MI. Its
+ * state must be computed at each entry/exit of the guest, much like we do
+ * it for the PMU interrupt.
+ *
+ * - because most of the ICH_*_EL2 registers live in the VNCR page, the
+ * quality of emulation is poor: L1 can set up the vgic so that an MI would
+ * immediately fire, and not observe anything until the next exit. Trying
+ * to read ICH_MISR_EL2 would do the trick, for example.
+ *
+ * System register emulation:
+ *
+ * We get two classes of registers:
+ *
+ * - those backed by memory (LRs, APRs, HCR, VMCR): L1 can freely access
+ * them, and L0 doesn't see a thing.
+ *
+ * - those that always trap (ELRSR, EISR, MISR): these are status registers
+ * that are built on the fly based on the in-memory state.
+ *
+ * Only L1 can access the ICH_*_EL2 registers. A non-NV L2 obviously cannot,
+ * and a NV L2 would either access the VNCR page provided by L1 (memory
+ * based registers), or see the access redirected to L1 (registers that
+ * trap) thanks to NV being set by L1.
+ */
+
+bool vgic_state_is_nested(struct kvm_vcpu *vcpu)
+{
+ u64 xmo;
+
+ if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) {
+ xmo = __vcpu_sys_reg(vcpu, HCR_EL2) & (HCR_IMO | HCR_FMO);
+ WARN_ONCE(xmo && xmo != (HCR_IMO | HCR_FMO),
+ "Separate virtual IRQ/FIQ settings not supported\n");
+
+ return !!xmo;
+ }
+
+ return false;
+}
+
+static struct shadow_if *get_shadow_if(void)
+{
+ return this_cpu_ptr(&shadow_if);
+}
+
+static bool lr_triggers_eoi(u64 lr)
+{
+ return !(lr & (ICH_LR_STATE | ICH_LR_HW)) && (lr & ICH_LR_EOI);
+}
+
+static void vgic_compute_mi_state(struct kvm_vcpu *vcpu, struct mi_state *mi_state)
+{
+ u16 eisr = 0, elrsr = 0;
+ bool pend = false;
+
+ for (int i = 0; i < kvm_vgic_global_state.nr_lr; i++) {
+ u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i));
+
+ if (lr_triggers_eoi(lr))
+ eisr |= BIT(i);
+ if (!(lr & ICH_LR_STATE))
+ elrsr |= BIT(i);
+ pend |= (lr & ICH_LR_PENDING_BIT);
+ }
+
+ mi_state->eisr = eisr;
+ mi_state->elrsr = elrsr;
+ mi_state->pend = pend;
+}
+
+u16 vgic_v3_get_eisr(struct kvm_vcpu *vcpu)
+{
+ struct mi_state mi_state;
+
+ vgic_compute_mi_state(vcpu, &mi_state);
+ return mi_state.eisr;
+}
+
+u16 vgic_v3_get_elrsr(struct kvm_vcpu *vcpu)
+{
+ struct mi_state mi_state;
+
+ vgic_compute_mi_state(vcpu, &mi_state);
+ return mi_state.elrsr;
+}
+
+u64 vgic_v3_get_misr(struct kvm_vcpu *vcpu)
+{
+ struct mi_state mi_state;
+ u64 reg = 0, hcr, vmcr;
+
+ hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2);
+ vmcr = __vcpu_sys_reg(vcpu, ICH_VMCR_EL2);
+
+ vgic_compute_mi_state(vcpu, &mi_state);
+
+ if (mi_state.eisr)
+ reg |= ICH_MISR_EL2_EOI;
+
+ if (__vcpu_sys_reg(vcpu, ICH_HCR_EL2) & ICH_HCR_EL2_UIE) {
+ int used_lrs = kvm_vgic_global_state.nr_lr;
+
+ used_lrs -= hweight16(mi_state.elrsr);
+ reg |= (used_lrs <= 1) ? ICH_MISR_EL2_U : 0;
+ }
+
+ if ((hcr & ICH_HCR_EL2_LRENPIE) && FIELD_GET(ICH_HCR_EL2_EOIcount_MASK, hcr))
+ reg |= ICH_MISR_EL2_LRENP;
+
+ if ((hcr & ICH_HCR_EL2_NPIE) && !mi_state.pend)
+ reg |= ICH_MISR_EL2_NP;
+
+ if ((hcr & ICH_HCR_EL2_VGrp0EIE) && (vmcr & ICH_VMCR_ENG0_MASK))
+ reg |= ICH_MISR_EL2_VGrp0E;
+
+ if ((hcr & ICH_HCR_EL2_VGrp0DIE) && !(vmcr & ICH_VMCR_ENG0_MASK))
+ reg |= ICH_MISR_EL2_VGrp0D;
+
+ if ((hcr & ICH_HCR_EL2_VGrp1EIE) && (vmcr & ICH_VMCR_ENG1_MASK))
+ reg |= ICH_MISR_EL2_VGrp1E;
+
+ if ((hcr & ICH_HCR_EL2_VGrp1DIE) && !(vmcr & ICH_VMCR_ENG1_MASK))
+ reg |= ICH_MISR_EL2_VGrp1D;
+
+ return reg;
+}
+
+/*
+ * For LRs which have HW bit set such as timer interrupts, we modify them to
+ * have the host hardware interrupt number instead of the virtual one programmed
+ * by the guest hypervisor.
+ */
+static void vgic_v3_create_shadow_lr(struct kvm_vcpu *vcpu,
+ struct vgic_v3_cpu_if *s_cpu_if)
+{
+ unsigned long lr_map = 0;
+ int index = 0;
+
+ for (int i = 0; i < kvm_vgic_global_state.nr_lr; i++) {
+ u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i));
+ struct vgic_irq *irq;
+
+ if (!(lr & ICH_LR_STATE))
+ lr = 0;
+
+ if (!(lr & ICH_LR_HW))
+ goto next;
+
+ /* We have the HW bit set, check for validity of pINTID */
+ irq = vgic_get_vcpu_irq(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr));
+ if (!irq || !irq->hw || irq->intid > VGIC_MAX_SPI) {
+ /* There was no real mapping, so nuke the HW bit */
+ lr &= ~ICH_LR_HW;
+ if (irq)
+ vgic_put_irq(vcpu->kvm, irq);
+ goto next;
+ }
+
+ /* It is illegal to have the EOI bit set with HW */
+ lr &= ~ICH_LR_EOI;
+
+ /* Translate the virtual mapping to the real one */
+ lr &= ~ICH_LR_PHYS_ID_MASK;
+ lr |= FIELD_PREP(ICH_LR_PHYS_ID_MASK, (u64)irq->hwintid);
+
+ vgic_put_irq(vcpu->kvm, irq);
+
+next:
+ s_cpu_if->vgic_lr[index] = lr;
+ if (lr) {
+ lr_map |= BIT(i);
+ index++;
+ }
+ }
+
+ container_of(s_cpu_if, struct shadow_if, cpuif)->lr_map = lr_map;
+ s_cpu_if->used_lrs = index;
+}
+
+void vgic_v3_sync_nested(struct kvm_vcpu *vcpu)
+{
+ struct shadow_if *shadow_if = get_shadow_if();
+ int i, index = 0;
+
+ for_each_set_bit(i, &shadow_if->lr_map, kvm_vgic_global_state.nr_lr) {
+ u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i));
+ struct vgic_irq *irq;
+
+ if (!(lr & ICH_LR_HW) || !(lr & ICH_LR_STATE))
+ goto next;
+
+ /*
+ * If we had a HW lr programmed by the guest hypervisor, we
+ * need to emulate the HW effect between the guest hypervisor
+ * and the nested guest.
+ */
+ irq = vgic_get_vcpu_irq(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr));
+ if (WARN_ON(!irq)) /* Shouldn't happen as we check on load */
+ goto next;
+
+ lr = __gic_v3_get_lr(index);
+ if (!(lr & ICH_LR_STATE))
+ irq->active = false;
+
+ vgic_put_irq(vcpu->kvm, irq);
+ next:
+ index++;
+ }
+}
+
+static void vgic_v3_create_shadow_state(struct kvm_vcpu *vcpu,
+ struct vgic_v3_cpu_if *s_cpu_if)
+{
+ struct vgic_v3_cpu_if *host_if = &vcpu->arch.vgic_cpu.vgic_v3;
+ u64 val = 0;
+ int i;
+
+ /*
+ * If we're on a system with a broken vgic that requires
+ * trapping, propagate the trapping requirements.
+ *
+ * Ah, the smell of rotten fruits...
+ */
+ if (static_branch_unlikely(&vgic_v3_cpuif_trap))
+ val = host_if->vgic_hcr & (ICH_HCR_EL2_TALL0 | ICH_HCR_EL2_TALL1 |
+ ICH_HCR_EL2_TC | ICH_HCR_EL2_TDIR);
+ s_cpu_if->vgic_hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2) | val;
+ s_cpu_if->vgic_vmcr = __vcpu_sys_reg(vcpu, ICH_VMCR_EL2);
+ s_cpu_if->vgic_sre = host_if->vgic_sre;
+
+ for (i = 0; i < 4; i++) {
+ s_cpu_if->vgic_ap0r[i] = __vcpu_sys_reg(vcpu, ICH_AP0RN(i));
+ s_cpu_if->vgic_ap1r[i] = __vcpu_sys_reg(vcpu, ICH_AP1RN(i));
+ }
+
+ vgic_v3_create_shadow_lr(vcpu, s_cpu_if);
+}
+
+void vgic_v3_load_nested(struct kvm_vcpu *vcpu)
+{
+ struct shadow_if *shadow_if = get_shadow_if();
+ struct vgic_v3_cpu_if *cpu_if = &shadow_if->cpuif;
+
+ BUG_ON(!vgic_state_is_nested(vcpu));
+
+ vgic_v3_create_shadow_state(vcpu, cpu_if);
+
+ __vgic_v3_restore_vmcr_aprs(cpu_if);
+ __vgic_v3_activate_traps(cpu_if);
+
+ __vgic_v3_restore_state(cpu_if);
+
+ /*
+ * Propagate the number of used LRs for the benefit of the HYP
+ * GICv3 emulation code. Yes, this is a pretty sorry hack.
+ */
+ vcpu->arch.vgic_cpu.vgic_v3.used_lrs = cpu_if->used_lrs;
+}
+
+void vgic_v3_put_nested(struct kvm_vcpu *vcpu)
+{
+ struct shadow_if *shadow_if = get_shadow_if();
+ struct vgic_v3_cpu_if *s_cpu_if = &shadow_if->cpuif;
+ u64 val;
+ int i;
+
+ __vgic_v3_save_vmcr_aprs(s_cpu_if);
+ __vgic_v3_deactivate_traps(s_cpu_if);
+ __vgic_v3_save_state(s_cpu_if);
+
+ /*
+ * Translate the shadow state HW fields back to the virtual ones
+ * before copying the shadow struct back to the nested one.
+ */
+ val = __vcpu_sys_reg(vcpu, ICH_HCR_EL2);
+ val &= ~ICH_HCR_EL2_EOIcount_MASK;
+ val |= (s_cpu_if->vgic_hcr & ICH_HCR_EL2_EOIcount_MASK);
+ __vcpu_sys_reg(vcpu, ICH_HCR_EL2) = val;
+ __vcpu_sys_reg(vcpu, ICH_VMCR_EL2) = s_cpu_if->vgic_vmcr;
+
+ for (i = 0; i < 4; i++) {
+ __vcpu_sys_reg(vcpu, ICH_AP0RN(i)) = s_cpu_if->vgic_ap0r[i];
+ __vcpu_sys_reg(vcpu, ICH_AP1RN(i)) = s_cpu_if->vgic_ap1r[i];
+ }
+
+ for_each_set_bit(i, &shadow_if->lr_map, kvm_vgic_global_state.nr_lr) {
+ val = __vcpu_sys_reg(vcpu, ICH_LRN(i));
+
+ val &= ~ICH_LR_STATE;
+ val |= s_cpu_if->vgic_lr[i] & ICH_LR_STATE;
+
+ __vcpu_sys_reg(vcpu, ICH_LRN(i)) = val;
+ s_cpu_if->vgic_lr[i] = 0;
+ }
+
+ shadow_if->lr_map = 0;
+ vcpu->arch.vgic_cpu.vgic_v3.used_lrs = 0;
+}
+
+/*
+ * If we exit a L2 VM with a pending maintenance interrupt from the GIC,
+ * then we need to forward this to L1 so that it can re-sync the appropriate
+ * LRs and sample level triggered interrupts again.
+ */
+void vgic_v3_handle_nested_maint_irq(struct kvm_vcpu *vcpu)
+{
+ bool state = read_sysreg_s(SYS_ICH_MISR_EL2);
+
+ /* This will force a switch back to L1 if the level is high */
+ kvm_vgic_inject_irq(vcpu->kvm, vcpu,
+ vcpu->kvm->arch.vgic.mi_intid, state, vcpu);
+
+ sysreg_clear_set_s(SYS_ICH_HCR_EL2, ICH_HCR_EL2_En, 0);
+}
+
+void vgic_v3_nested_update_mi(struct kvm_vcpu *vcpu)
+{
+ bool level;
+
+ level = __vcpu_sys_reg(vcpu, ICH_HCR_EL2) & ICH_HCR_EL2_En;
+ if (level)
+ level &= vgic_v3_get_misr(vcpu);
+ kvm_vgic_inject_irq(vcpu->kvm, vcpu,
+ vcpu->kvm->arch.vgic.mi_intid, level, vcpu);
+}
diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
index 4ea3340786b9..b9ad7c42c5b0 100644
--- a/arch/arm64/kvm/vgic/vgic-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-v3.c
@@ -6,6 +6,7 @@
#include <linux/kstrtox.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
+#include <linux/string_choices.h>
#include <kvm/arm_vgic.h>
#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>
@@ -23,7 +24,7 @@ void vgic_v3_set_underflow(struct kvm_vcpu *vcpu)
{
struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3;
- cpuif->vgic_hcr |= ICH_HCR_UIE;
+ cpuif->vgic_hcr |= ICH_HCR_EL2_UIE;
}
static bool lr_signals_eoi_mi(u64 lr_val)
@@ -41,7 +42,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
- cpuif->vgic_hcr &= ~ICH_HCR_UIE;
+ cpuif->vgic_hcr &= ~ICH_HCR_EL2_UIE;
for (lr = 0; lr < cpuif->used_lrs; lr++) {
u64 val = cpuif->vgic_lr[lr];
@@ -65,7 +66,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
kvm_notify_acked_irq(vcpu->kvm, 0,
intid - VGIC_NR_PRIVATE_IRQS);
- irq = vgic_get_irq(vcpu->kvm, vcpu, intid);
+ irq = vgic_get_vcpu_irq(vcpu, intid);
if (!irq) /* An LPI could have been unmapped. */
continue;
@@ -283,23 +284,34 @@ void vgic_v3_enable(struct kvm_vcpu *vcpu)
vgic_v3->vgic_sre = 0;
}
- vcpu->arch.vgic_cpu.num_id_bits = (kvm_vgic_global_state.ich_vtr_el2 &
- ICH_VTR_ID_BITS_MASK) >>
- ICH_VTR_ID_BITS_SHIFT;
- vcpu->arch.vgic_cpu.num_pri_bits = ((kvm_vgic_global_state.ich_vtr_el2 &
- ICH_VTR_PRI_BITS_MASK) >>
- ICH_VTR_PRI_BITS_SHIFT) + 1;
+ vcpu->arch.vgic_cpu.num_id_bits = FIELD_GET(ICH_VTR_EL2_IDbits,
+ kvm_vgic_global_state.ich_vtr_el2);
+ vcpu->arch.vgic_cpu.num_pri_bits = FIELD_GET(ICH_VTR_EL2_PRIbits,
+ kvm_vgic_global_state.ich_vtr_el2) + 1;
/* Get the show on the road... */
- vgic_v3->vgic_hcr = ICH_HCR_EN;
+ vgic_v3->vgic_hcr = ICH_HCR_EL2_En;
+}
+
+void vcpu_set_ich_hcr(struct kvm_vcpu *vcpu)
+{
+ struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3;
+
+ /* Hide GICv3 sysreg if necessary */
+ if (!kvm_has_gicv3(vcpu->kvm)) {
+ vgic_v3->vgic_hcr |= (ICH_HCR_EL2_TALL0 | ICH_HCR_EL2_TALL1 |
+ ICH_HCR_EL2_TC);
+ return;
+ }
+
if (group0_trap)
- vgic_v3->vgic_hcr |= ICH_HCR_TALL0;
+ vgic_v3->vgic_hcr |= ICH_HCR_EL2_TALL0;
if (group1_trap)
- vgic_v3->vgic_hcr |= ICH_HCR_TALL1;
+ vgic_v3->vgic_hcr |= ICH_HCR_EL2_TALL1;
if (common_trap)
- vgic_v3->vgic_hcr |= ICH_HCR_TC;
+ vgic_v3->vgic_hcr |= ICH_HCR_EL2_TC;
if (dir_trap)
- vgic_v3->vgic_hcr |= ICH_HCR_TDIR;
+ vgic_v3->vgic_hcr |= ICH_HCR_EL2_TDIR;
}
int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq)
@@ -370,7 +382,7 @@ static void map_all_vpes(struct kvm *kvm)
dist->its_vm.vpes[i]->irq));
}
-/**
+/*
* vgic_v3_save_pending_tables - Save the pending tables into guest RAM
* kvm lock and all vcpu lock must be held
*/
@@ -619,8 +631,8 @@ static const struct midr_range broken_seis[] = {
static bool vgic_v3_broken_seis(void)
{
- return ((kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_SEIS_MASK) &&
- is_midr_in_range_list(read_cpuid_id(), broken_seis));
+ return ((kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_EL2_SEIS) &&
+ is_midr_in_range_list(broken_seis));
}
/**
@@ -651,9 +663,9 @@ int vgic_v3_probe(const struct gic_kvm_info *info)
if (info->has_v4) {
kvm_vgic_global_state.has_gicv4 = gicv4_enable;
kvm_vgic_global_state.has_gicv4_1 = info->has_v4_1 && gicv4_enable;
- kvm_info("GICv4%s support %sabled\n",
+ kvm_info("GICv4%s support %s\n",
kvm_vgic_global_state.has_gicv4_1 ? ".1" : "",
- gicv4_enable ? "en" : "dis");
+ str_enabled_disabled(gicv4_enable));
}
kvm_vgic_global_state.vcpu_base = 0;
@@ -693,10 +705,10 @@ int vgic_v3_probe(const struct gic_kvm_info *info)
if (vgic_v3_broken_seis()) {
kvm_info("GICv3 with broken locally generated SEI\n");
- kvm_vgic_global_state.ich_vtr_el2 &= ~ICH_VTR_SEIS_MASK;
+ kvm_vgic_global_state.ich_vtr_el2 &= ~ICH_VTR_EL2_SEIS;
group0_trap = true;
group1_trap = true;
- if (ich_vtr_el2 & ICH_VTR_TDS_MASK)
+ if (ich_vtr_el2 & ICH_VTR_EL2_TDS)
dir_trap = true;
else
common_trap = true;
@@ -722,15 +734,14 @@ void vgic_v3_load(struct kvm_vcpu *vcpu)
{
struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
- /*
- * If dealing with a GICv2 emulation on GICv3, VMCR_EL2.VFIQen
- * is dependent on ICC_SRE_EL1.SRE, and we have to perform the
- * VMCR_EL2 save/restore in the world switch.
- */
- if (likely(cpu_if->vgic_sre))
- kvm_call_hyp(__vgic_v3_write_vmcr, cpu_if->vgic_vmcr);
+ /* If the vgic is nested, perform the full state loading */
+ if (vgic_state_is_nested(vcpu)) {
+ vgic_v3_load_nested(vcpu);
+ return;
+ }
- kvm_call_hyp(__vgic_v3_restore_aprs, cpu_if);
+ if (likely(!is_protected_kvm_enabled()))
+ kvm_call_hyp(__vgic_v3_restore_vmcr_aprs, cpu_if);
if (has_vhe())
__vgic_v3_activate_traps(cpu_if);
@@ -738,23 +749,18 @@ void vgic_v3_load(struct kvm_vcpu *vcpu)
WARN_ON(vgic_v4_load(vcpu));
}
-void vgic_v3_vmcr_sync(struct kvm_vcpu *vcpu)
-{
- struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
-
- if (likely(cpu_if->vgic_sre))
- cpu_if->vgic_vmcr = kvm_call_hyp_ret(__vgic_v3_read_vmcr);
-}
-
void vgic_v3_put(struct kvm_vcpu *vcpu)
{
struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
- WARN_ON(vgic_v4_put(vcpu));
-
- vgic_v3_vmcr_sync(vcpu);
+ if (vgic_state_is_nested(vcpu)) {
+ vgic_v3_put_nested(vcpu);
+ return;
+ }
- kvm_call_hyp(__vgic_v3_save_aprs, cpu_if);
+ if (likely(!is_protected_kvm_enabled()))
+ kvm_call_hyp(__vgic_v3_save_vmcr_aprs, cpu_if);
+ WARN_ON(vgic_v4_put(vcpu));
if (has_vhe())
__vgic_v3_deactivate_traps(cpu_if);
diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c
index 74a67ad87f29..c7de6154627c 100644
--- a/arch/arm64/kvm/vgic/vgic-v4.c
+++ b/arch/arm64/kvm/vgic/vgic-v4.c
@@ -123,7 +123,7 @@ static void vgic_v4_enable_vsgis(struct kvm_vcpu *vcpu)
* IRQ. The SGI code will do its magic.
*/
for (i = 0; i < VGIC_NR_SGIS; i++) {
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, i);
+ struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, i);
struct irq_desc *desc;
unsigned long flags;
int ret;
@@ -160,7 +160,7 @@ static void vgic_v4_disable_vsgis(struct kvm_vcpu *vcpu)
int i;
for (i = 0; i < VGIC_NR_SGIS; i++) {
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, i);
+ struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, i);
struct irq_desc *desc;
unsigned long flags;
int ret;
@@ -336,6 +336,22 @@ void vgic_v4_teardown(struct kvm *kvm)
its_vm->vpes = NULL;
}
+static inline bool vgic_v4_want_doorbell(struct kvm_vcpu *vcpu)
+{
+ if (vcpu_get_flag(vcpu, IN_WFI))
+ return true;
+
+ if (likely(!vcpu_has_nv(vcpu)))
+ return false;
+
+ /*
+ * GICv4 hardware is only ever used for the L1. Mark the vPE (i.e. the
+ * L1 context) nonresident and request a doorbell to kick us out of the
+ * L2 when an IRQ becomes pending.
+ */
+ return vcpu_get_flag(vcpu, IN_NESTED_ERET);
+}
+
int vgic_v4_put(struct kvm_vcpu *vcpu)
{
struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
@@ -343,7 +359,7 @@ int vgic_v4_put(struct kvm_vcpu *vcpu)
if (!vgic_supports_direct_msis(vcpu->kvm) || !vpe->resident)
return 0;
- return its_make_vpe_non_resident(vpe, !!vcpu_get_flag(vcpu, IN_WFI));
+ return its_make_vpe_non_resident(vpe, vgic_v4_want_doorbell(vcpu));
}
int vgic_v4_load(struct kvm_vcpu *vcpu)
@@ -415,7 +431,7 @@ int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int virq,
struct vgic_irq *irq;
struct its_vlpi_map map;
unsigned long flags;
- int ret;
+ int ret = 0;
if (!vgic_supports_direct_msis(kvm))
return 0;
@@ -430,10 +446,15 @@ int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int virq,
mutex_lock(&its->its_lock);
- /* Perform the actual DevID/EventID -> LPI translation. */
- ret = vgic_its_resolve_lpi(kvm, its, irq_entry->msi.devid,
- irq_entry->msi.data, &irq);
- if (ret)
+ /*
+ * Perform the actual DevID/EventID -> LPI translation.
+ *
+ * Silently exit if translation fails as the guest (or userspace!) has
+ * managed to do something stupid. Emulated LPI injection will still
+ * work if the guest figures itself out at a later time.
+ */
+ if (vgic_its_resolve_lpi(kvm, its, irq_entry->msi.devid,
+ irq_entry->msi.data, &irq))
goto out;
/* Silently exit if the vLPI is already mapped */
@@ -512,7 +533,7 @@ int kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int virq,
if (ret)
goto out;
- WARN_ON(!(irq->hw && irq->host_irq == virq));
+ WARN_ON(irq->hw && irq->host_irq != virq);
if (irq->hw) {
atomic_dec(&irq->target_vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vlpi_count);
irq->hw = false;
diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c
index 4ec93587c8cd..8f8096d48925 100644
--- a/arch/arm64/kvm/vgic/vgic.c
+++ b/arch/arm64/kvm/vgic/vgic.c
@@ -29,14 +29,18 @@ struct vgic_global kvm_vgic_global_state __ro_after_init = {
* its->cmd_lock (mutex)
* its->its_lock (mutex)
* vgic_cpu->ap_list_lock must be taken with IRQs disabled
- * kvm->lpi_list_lock must be taken with IRQs disabled
- * vgic_dist->lpi_xa.xa_lock must be taken with IRQs disabled
- * vgic_irq->irq_lock must be taken with IRQs disabled
+ * vgic_dist->lpi_xa.xa_lock must be taken with IRQs disabled
+ * vgic_irq->irq_lock must be taken with IRQs disabled
*
* As the ap_list_lock might be taken from the timer interrupt handler,
* we have to disable IRQs before taking this lock and everything lower
* than it.
*
+ * The config_lock has additional ordering requirements:
+ * kvm->slots_lock
+ * kvm->srcu
+ * kvm->arch.config_lock
+ *
* If you need to take multiple locks, always take the upper lock first,
* then the lower ones, e.g. first take the its_lock, then the irq_lock.
* If you are already holding a lock and need to take a higher one, you
@@ -80,17 +84,11 @@ static struct vgic_irq *vgic_get_lpi(struct kvm *kvm, u32 intid)
* struct vgic_irq. It also increases the refcount, so any caller is expected
* to call vgic_put_irq() once it's finished with this IRQ.
*/
-struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
- u32 intid)
+struct vgic_irq *vgic_get_irq(struct kvm *kvm, u32 intid)
{
- /* SGIs and PPIs */
- if (intid <= VGIC_MAX_PRIVATE) {
- intid = array_index_nospec(intid, VGIC_MAX_PRIVATE + 1);
- return &vcpu->arch.vgic_cpu.private_irqs[intid];
- }
-
/* SPIs */
- if (intid < (kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS)) {
+ if (intid >= VGIC_NR_PRIVATE_IRQS &&
+ intid < (kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS)) {
intid = array_index_nospec(intid, kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS);
return &kvm->arch.vgic.spis[intid - VGIC_NR_PRIVATE_IRQS];
}
@@ -102,6 +100,20 @@ struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
return NULL;
}
+struct vgic_irq *vgic_get_vcpu_irq(struct kvm_vcpu *vcpu, u32 intid)
+{
+ if (WARN_ON(!vcpu))
+ return NULL;
+
+ /* SGIs and PPIs */
+ if (intid < VGIC_NR_PRIVATE_IRQS) {
+ intid = array_index_nospec(intid, VGIC_NR_PRIVATE_IRQS);
+ return &vcpu->arch.vgic_cpu.private_irqs[intid];
+ }
+
+ return vgic_get_irq(vcpu->kvm, intid);
+}
+
/*
* We can't do anything in here, because we lack the kvm pointer to
* lock and remove the item from the lpi_list. So we keep this function
@@ -126,7 +138,6 @@ void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq)
__xa_erase(&dist->lpi_xa, irq->intid);
xa_unlock_irqrestore(&dist->lpi_xa, flags);
- atomic_dec(&dist->lpi_count);
kfree_rcu(irq, rcu);
}
@@ -315,7 +326,7 @@ static bool vgic_validate_injection(struct vgic_irq *irq, bool level, void *owne
* with all locks dropped.
*/
bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq,
- unsigned long flags)
+ unsigned long flags) __releases(&irq->irq_lock)
{
struct kvm_vcpu *vcpu;
@@ -434,7 +445,10 @@ int kvm_vgic_inject_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
trace_vgic_update_irq_pending(vcpu ? vcpu->vcpu_idx : 0, intid, level);
- irq = vgic_get_irq(kvm, vcpu, intid);
+ if (intid < VGIC_NR_PRIVATE_IRQS)
+ irq = vgic_get_vcpu_irq(vcpu, intid);
+ else
+ irq = vgic_get_irq(kvm, intid);
if (!irq)
return -EINVAL;
@@ -496,7 +510,7 @@ static inline void kvm_vgic_unmap_irq(struct vgic_irq *irq)
int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, unsigned int host_irq,
u32 vintid, struct irq_ops *ops)
{
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, vintid);
+ struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, vintid);
unsigned long flags;
int ret;
@@ -521,7 +535,7 @@ int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, unsigned int host_irq,
*/
void kvm_vgic_reset_mapped_irq(struct kvm_vcpu *vcpu, u32 vintid)
{
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, vintid);
+ struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, vintid);
unsigned long flags;
if (!irq->hw)
@@ -544,7 +558,7 @@ int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int vintid)
if (!vgic_initialized(vcpu->kvm))
return -EAGAIN;
- irq = vgic_get_irq(vcpu->kvm, vcpu, vintid);
+ irq = vgic_get_vcpu_irq(vcpu, vintid);
BUG_ON(!irq);
raw_spin_lock_irqsave(&irq->irq_lock, flags);
@@ -557,7 +571,7 @@ int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int vintid)
int kvm_vgic_get_map(struct kvm_vcpu *vcpu, unsigned int vintid)
{
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, vintid);
+ struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, vintid);
unsigned long flags;
int ret = -1;
@@ -593,7 +607,7 @@ int kvm_vgic_set_owner(struct kvm_vcpu *vcpu, unsigned int intid, void *owner)
if (!irq_is_ppi(intid) && !vgic_valid_spi(vcpu->kvm, intid))
return -EINVAL;
- irq = vgic_get_irq(vcpu->kvm, vcpu, intid);
+ irq = vgic_get_vcpu_irq(vcpu, intid);
raw_spin_lock_irqsave(&irq->irq_lock, flags);
if (irq->owner && irq->owner != owner)
ret = -EEXIST;
@@ -858,6 +872,15 @@ void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
{
int used_lrs;
+ /* If nesting, emulate the HW effect from L0 to L1 */
+ if (vgic_state_is_nested(vcpu)) {
+ vgic_v3_sync_nested(vcpu);
+ return;
+ }
+
+ if (vcpu_has_nv(vcpu))
+ vgic_v3_nested_update_mi(vcpu);
+
/* An empty ap_list_head implies used_lrs == 0 */
if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head))
return;
@@ -887,6 +910,35 @@ static inline void vgic_restore_state(struct kvm_vcpu *vcpu)
void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
{
/*
+ * If in a nested state, we must return early. Two possibilities:
+ *
+ * - If we have any pending IRQ for the guest and the guest
+ * expects IRQs to be handled in its virtual EL2 mode (the
+ * virtual IMO bit is set) and it is not already running in
+ * virtual EL2 mode, then we have to emulate an IRQ
+ * exception to virtual EL2.
+ *
+ * We do that by placing a request to ourselves which will
+ * abort the entry procedure and inject the exception at the
+ * beginning of the run loop.
+ *
+ * - Otherwise, do exactly *NOTHING*. The guest state is
+ * already loaded, and we can carry on with running it.
+ *
+ * If we have NV, but are not in a nested state, compute the
+ * maintenance interrupt state, as it may fire.
+ */
+ if (vgic_state_is_nested(vcpu)) {
+ if (kvm_vgic_vcpu_pending_irq(vcpu))
+ kvm_make_request(KVM_REQ_GUEST_HYP_IRQ_PENDING, vcpu);
+
+ return;
+ }
+
+ if (vcpu_has_nv(vcpu))
+ vgic_v3_nested_update_mi(vcpu);
+
+ /*
* If there are no virtual interrupts active or pending for this
* VCPU, then there is no work to do and we can bail out without
* taking any lock. There is a potential race with someone injecting
@@ -919,10 +971,13 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
void kvm_vgic_load(struct kvm_vcpu *vcpu)
{
- if (unlikely(!vgic_initialized(vcpu->kvm)))
+ if (unlikely(!irqchip_in_kernel(vcpu->kvm) || !vgic_initialized(vcpu->kvm))) {
+ if (has_vhe() && static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
+ __vgic_v3_activate_traps(&vcpu->arch.vgic_cpu.vgic_v3);
return;
+ }
- if (kvm_vgic_global_state.type == VGIC_V2)
+ if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
vgic_v2_load(vcpu);
else
vgic_v3_load(vcpu);
@@ -930,26 +985,18 @@ void kvm_vgic_load(struct kvm_vcpu *vcpu)
void kvm_vgic_put(struct kvm_vcpu *vcpu)
{
- if (unlikely(!vgic_initialized(vcpu->kvm)))
+ if (unlikely(!irqchip_in_kernel(vcpu->kvm) || !vgic_initialized(vcpu->kvm))) {
+ if (has_vhe() && static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
+ __vgic_v3_deactivate_traps(&vcpu->arch.vgic_cpu.vgic_v3);
return;
+ }
- if (kvm_vgic_global_state.type == VGIC_V2)
+ if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
vgic_v2_put(vcpu);
else
vgic_v3_put(vcpu);
}
-void kvm_vgic_vmcr_sync(struct kvm_vcpu *vcpu)
-{
- if (unlikely(!irqchip_in_kernel(vcpu->kvm)))
- return;
-
- if (kvm_vgic_global_state.type == VGIC_V2)
- vgic_v2_vmcr_sync(vcpu);
- else
- vgic_v3_vmcr_sync(vcpu);
-}
-
int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu)
{
struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
@@ -1010,7 +1057,7 @@ bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int vintid)
if (!vgic_initialized(vcpu->kvm))
return false;
- irq = vgic_get_irq(vcpu->kvm, vcpu, vintid);
+ irq = vgic_get_vcpu_irq(vcpu, vintid);
raw_spin_lock_irqsave(&irq->irq_lock, flags);
map_is_active = irq->hw && irq->active;
raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h
index 0c2b82de8fa3..0c5a63712702 100644
--- a/arch/arm64/kvm/vgic/vgic.h
+++ b/arch/arm64/kvm/vgic/vgic.h
@@ -16,6 +16,7 @@
#define INTERRUPT_ID_BITS_SPIS 10
#define INTERRUPT_ID_BITS_ITS 16
+#define VGIC_LPI_MAX_INTID ((1 << INTERRUPT_ID_BITS_ITS) - 1)
#define VGIC_PRI_BITS 5
#define vgic_irq_is_sgi(intid) ((intid) < VGIC_NR_SGIS)
@@ -178,14 +179,14 @@ int vgic_v2_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr,
const struct vgic_register_region *
vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev,
gpa_t addr, int len);
-struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
- u32 intid);
+struct vgic_irq *vgic_get_irq(struct kvm *kvm, u32 intid);
+struct vgic_irq *vgic_get_vcpu_irq(struct kvm_vcpu *vcpu, u32 intid);
void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq);
bool vgic_get_phys_line_level(struct vgic_irq *irq);
void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending);
void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active);
bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq,
- unsigned long flags);
+ unsigned long flags) __releases(&irq->irq_lock);
void vgic_kick_vcpus(struct kvm *kvm);
void vgic_irq_handle_resampling(struct vgic_irq *irq,
bool lr_deactivated, bool lr_pending);
@@ -214,7 +215,6 @@ int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address,
void vgic_v2_init_lrs(void);
void vgic_v2_load(struct kvm_vcpu *vcpu);
void vgic_v2_put(struct kvm_vcpu *vcpu);
-void vgic_v2_vmcr_sync(struct kvm_vcpu *vcpu);
void vgic_v2_save_state(struct kvm_vcpu *vcpu);
void vgic_v2_restore_state(struct kvm_vcpu *vcpu);
@@ -253,7 +253,6 @@ bool vgic_v3_check_base(struct kvm *kvm);
void vgic_v3_load(struct kvm_vcpu *vcpu);
void vgic_v3_put(struct kvm_vcpu *vcpu);
-void vgic_v3_vmcr_sync(struct kvm_vcpu *vcpu);
bool vgic_has_its(struct kvm *kvm);
int kvm_vgic_register_its_device(void);
@@ -317,7 +316,7 @@ vgic_v3_rd_region_size(struct kvm *kvm, struct vgic_redist_region *rdreg)
struct vgic_redist_region *vgic_v3_rdist_region_from_index(struct kvm *kvm,
u32 index);
-void vgic_v3_free_redist_region(struct vgic_redist_region *rdreg);
+void vgic_v3_free_redist_region(struct kvm *kvm, struct vgic_redist_region *rdreg);
bool vgic_v3_rdist_overlap(struct kvm *kvm, gpa_t base, size_t size);
@@ -330,14 +329,11 @@ static inline bool vgic_dist_overlap(struct kvm *kvm, gpa_t base, size_t size)
}
bool vgic_lpis_enabled(struct kvm_vcpu *vcpu);
-int vgic_copy_lpi_list(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 **intid_ptr);
int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its,
u32 devid, u32 eventid, struct vgic_irq **irq);
struct vgic_its *vgic_msi_to_its(struct kvm *kvm, struct kvm_msi *msi);
int vgic_its_inject_cached_translation(struct kvm *kvm, struct kvm_msi *msi);
-void vgic_lpi_translation_cache_init(struct kvm *kvm);
-void vgic_lpi_translation_cache_destroy(struct kvm *kvm);
-void vgic_its_invalidate_cache(struct kvm *kvm);
+void vgic_its_invalidate_all_caches(struct kvm *kvm);
/* GICv4.1 MMIO interface */
int vgic_its_inv_lpi(struct kvm *kvm, struct vgic_irq *irq);
@@ -350,4 +346,17 @@ void vgic_v4_configure_vsgis(struct kvm *kvm);
void vgic_v4_get_vlpi_state(struct vgic_irq *irq, bool *val);
int vgic_v4_request_vpe_irq(struct kvm_vcpu *vcpu, int irq);
+void vcpu_set_ich_hcr(struct kvm_vcpu *vcpu);
+
+static inline bool kvm_has_gicv3(struct kvm *kvm)
+{
+ return kvm_has_feat(kvm, ID_AA64PFR0_EL1, GIC, IMP);
+}
+
+void vgic_v3_sync_nested(struct kvm_vcpu *vcpu);
+void vgic_v3_load_nested(struct kvm_vcpu *vcpu);
+void vgic_v3_put_nested(struct kvm_vcpu *vcpu);
+void vgic_v3_handle_nested_maint_irq(struct kvm_vcpu *vcpu);
+void vgic_v3_nested_update_mi(struct kvm_vcpu *vcpu);
+
#endif
diff --git a/arch/arm64/kvm/vmid.c b/arch/arm64/kvm/vmid.c
index 806223b7022a..7fe8ba1a2851 100644
--- a/arch/arm64/kvm/vmid.c
+++ b/arch/arm64/kvm/vmid.c
@@ -135,11 +135,10 @@ void kvm_arm_vmid_clear_active(void)
atomic64_set(this_cpu_ptr(&active_vmids), VMID_ACTIVE_INVALID);
}
-bool kvm_arm_vmid_update(struct kvm_vmid *kvm_vmid)
+void kvm_arm_vmid_update(struct kvm_vmid *kvm_vmid)
{
unsigned long flags;
u64 vmid, old_active_vmid;
- bool updated = false;
vmid = atomic64_read(&kvm_vmid->id);
@@ -157,21 +156,17 @@ bool kvm_arm_vmid_update(struct kvm_vmid *kvm_vmid)
if (old_active_vmid != 0 && vmid_gen_match(vmid) &&
0 != atomic64_cmpxchg_relaxed(this_cpu_ptr(&active_vmids),
old_active_vmid, vmid))
- return false;
+ return;
raw_spin_lock_irqsave(&cpu_vmid_lock, flags);
/* Check that our VMID belongs to the current generation. */
vmid = atomic64_read(&kvm_vmid->id);
- if (!vmid_gen_match(vmid)) {
+ if (!vmid_gen_match(vmid))
vmid = new_vmid(kvm_vmid);
- updated = true;
- }
atomic64_set(this_cpu_ptr(&active_vmids), vmid);
raw_spin_unlock_irqrestore(&cpu_vmid_lock, flags);
-
- return updated;
}
/*