summaryrefslogtreecommitdiff
path: root/arch/arm64/kvm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/arm64/kvm')
-rw-r--r--arch/arm64/kvm/Kconfig34
-rw-r--r--arch/arm64/kvm/Makefile16
-rw-r--r--arch/arm64/kvm/arch_timer.c900
-rw-r--r--arch/arm64/kvm/arm.c1386
-rw-r--r--arch/arm64/kvm/at.c1795
-rw-r--r--arch/arm64/kvm/config.c1520
-rw-r--r--arch/arm64/kvm/debug.c432
-rw-r--r--arch/arm64/kvm/emulate-nested.c2854
-rw-r--r--arch/arm64/kvm/fpsimd.c143
-rw-r--r--arch/arm64/kvm/guest.c239
-rw-r--r--arch/arm64/kvm/handle_exit.c256
-rw-r--r--arch/arm64/kvm/hyp/Makefile2
-rw-r--r--arch/arm64/kvm/hyp/aarch32.c22
-rw-r--r--arch/arm64/kvm/hyp/entry.S15
-rw-r--r--arch/arm64/kvm/hyp/exception.c64
-rw-r--r--arch/arm64/kvm/hyp/fpsimd.S6
-rw-r--r--arch/arm64/kvm/hyp/hyp-entry.S8
-rw-r--r--arch/arm64/kvm/hyp/include/hyp/debug-sr.h50
-rw-r--r--arch/arm64/kvm/hyp/include/hyp/fault.h75
-rw-r--r--arch/arm64/kvm/hyp/include/hyp/switch.h700
-rw-r--r--arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h196
-rw-r--r--arch/arm64/kvm/hyp/include/nvhe/ffa.h17
-rw-r--r--arch/arm64/kvm/hyp/include/nvhe/fixed_config.h205
-rw-r--r--arch/arm64/kvm/hyp/include/nvhe/gfp.h8
-rw-r--r--arch/arm64/kvm/hyp/include/nvhe/mem_protect.h50
-rw-r--r--arch/arm64/kvm/hyp/include/nvhe/memory.h84
-rw-r--r--arch/arm64/kvm/hyp/include/nvhe/mm.h5
-rw-r--r--arch/arm64/kvm/hyp/include/nvhe/pkvm.h38
-rw-r--r--arch/arm64/kvm/hyp/include/nvhe/trap_handler.h5
-rw-r--r--arch/arm64/kvm/hyp/nvhe/Makefile32
-rw-r--r--arch/arm64/kvm/hyp/nvhe/debug-sr.c108
-rw-r--r--arch/arm64/kvm/hyp/nvhe/ffa.c993
-rw-r--r--arch/arm64/kvm/hyp/nvhe/gen-hyprel.c6
-rw-r--r--arch/arm64/kvm/hyp/nvhe/host.S60
-rw-r--r--arch/arm64/kvm/hyp/nvhe/hyp-init.S126
-rw-r--r--arch/arm64/kvm/hyp/nvhe/hyp-main.c408
-rw-r--r--arch/arm64/kvm/hyp/nvhe/hyp.lds.S2
-rw-r--r--arch/arm64/kvm/hyp/nvhe/list_debug.c10
-rw-r--r--arch/arm64/kvm/hyp/nvhe/mem_protect.c1216
-rw-r--r--arch/arm64/kvm/hyp/nvhe/mm.c184
-rw-r--r--arch/arm64/kvm/hyp/nvhe/page_alloc.c25
-rw-r--r--arch/arm64/kvm/hyp/nvhe/pkvm.c663
-rw-r--r--arch/arm64/kvm/hyp/nvhe/psci-relay.c12
-rw-r--r--arch/arm64/kvm/hyp/nvhe/setup.c130
-rw-r--r--arch/arm64/kvm/hyp/nvhe/stacktrace.c4
-rw-r--r--arch/arm64/kvm/hyp/nvhe/switch.c112
-rw-r--r--arch/arm64/kvm/hyp/nvhe/sys_regs.c424
-rw-r--r--arch/arm64/kvm/hyp/nvhe/sysreg-sr.c4
-rw-r--r--arch/arm64/kvm/hyp/nvhe/timer-sr.c46
-rw-r--r--arch/arm64/kvm/hyp/nvhe/tlb.c221
-rw-r--r--arch/arm64/kvm/hyp/pgtable.c728
-rw-r--r--arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c6
-rw-r--r--arch/arm64/kvm/hyp/vgic-v3-sr.c291
-rw-r--r--arch/arm64/kvm/hyp/vhe/Makefile2
-rw-r--r--arch/arm64/kvm/hyp/vhe/debug-sr.c5
-rw-r--r--arch/arm64/kvm/hyp/vhe/switch.c557
-rw-r--r--arch/arm64/kvm/hyp/vhe/sysreg-sr.c185
-rw-r--r--arch/arm64/kvm/hyp/vhe/tlb.c253
-rw-r--r--arch/arm64/kvm/hypercalls.c244
-rw-r--r--arch/arm64/kvm/inject_fault.c267
-rw-r--r--arch/arm64/kvm/mmio.c48
-rw-r--r--arch/arm64/kvm/mmu.c1314
-rw-r--r--arch/arm64/kvm/nested.c1919
-rw-r--r--arch/arm64/kvm/pauth.c206
-rw-r--r--arch/arm64/kvm/pkvm.c420
-rw-r--r--arch/arm64/kvm/pmu-emul.c719
-rw-r--r--arch/arm64/kvm/pmu.c140
-rw-r--r--arch/arm64/kvm/psci.c81
-rw-r--r--arch/arm64/kvm/ptdump.c287
-rw-r--r--arch/arm64/kvm/pvtime.c8
-rw-r--r--arch/arm64/kvm/reset.c166
-rw-r--r--arch/arm64/kvm/stacktrace.c9
-rw-r--r--arch/arm64/kvm/sys_regs.c4090
-rw-r--r--arch/arm64/kvm/sys_regs.h55
-rw-r--r--arch/arm64/kvm/trace_arm.h122
-rw-r--r--arch/arm64/kvm/trace_handle_exit.h77
-rw-r--r--arch/arm64/kvm/vgic-sys-reg-v3.c135
-rw-r--r--arch/arm64/kvm/vgic/vgic-debug.c339
-rw-r--r--arch/arm64/kvm/vgic/vgic-init.c404
-rw-r--r--arch/arm64/kvm/vgic/vgic-irqfd.c9
-rw-r--r--arch/arm64/kvm/vgic/vgic-its.c688
-rw-r--r--arch/arm64/kvm/vgic/vgic-kvm-device.c160
-rw-r--r--arch/arm64/kvm/vgic/vgic-mmio-v2.c36
-rw-r--r--arch/arm64/kvm/vgic/vgic-mmio-v3.c286
-rw-r--r--arch/arm64/kvm/vgic/vgic-mmio.c175
-rw-r--r--arch/arm64/kvm/vgic/vgic-mmio.h1
-rw-r--r--arch/arm64/kvm/vgic/vgic-v2.c300
-rw-r--r--arch/arm64/kvm/vgic/vgic-v3-nested.c407
-rw-r--r--arch/arm64/kvm/vgic/vgic-v3.c517
-rw-r--r--arch/arm64/kvm/vgic/vgic-v4.c161
-rw-r--r--arch/arm64/kvm/vgic/vgic-v5.c52
-rw-r--r--arch/arm64/kvm/vgic/vgic.c543
-rw-r--r--arch/arm64/kvm/vgic/vgic.h178
-rw-r--r--arch/arm64/kvm/vmid.c13
94 files changed, 26127 insertions, 6387 deletions
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index 05da3c8f7e88..4f803fd1c99a 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -3,7 +3,6 @@
# KVM configuration
#
-source "virt/lib/Kconfig"
source "virt/kvm/Kconfig"
menuconfig VIRTUALIZATION
@@ -20,29 +19,25 @@ if VIRTUALIZATION
menuconfig KVM
bool "Kernel-based Virtual Machine (KVM) support"
- depends on HAVE_KVM
- select MMU_NOTIFIER
- select PREEMPT_NOTIFIERS
+ select KVM_COMMON
+ select KVM_GENERIC_HARDWARE_ENABLING
+ select KVM_GENERIC_MMU_NOTIFIER
select HAVE_KVM_CPU_RELAX_INTERCEPT
- select HAVE_KVM_ARCH_TLB_FLUSH_ALL
select KVM_MMIO
select KVM_GENERIC_DIRTYLOG_READ_PROTECT
- select KVM_XFER_TO_GUEST_WORK
- select SRCU
+ select VIRT_XFER_TO_GUEST_WORK
select KVM_VFIO
- select HAVE_KVM_EVENTFD
- select HAVE_KVM_IRQFD
select HAVE_KVM_DIRTY_RING_ACQ_REL
select NEED_KVM_DIRTY_RING_WITH_BITMAP
select HAVE_KVM_MSI
select HAVE_KVM_IRQCHIP
select HAVE_KVM_IRQ_ROUTING
- select IRQ_BYPASS_MANAGER
select HAVE_KVM_IRQ_BYPASS
+ select HAVE_KVM_READONLY_MEM
select HAVE_KVM_VCPU_RUN_PID_CHANGE
select SCHED_INFO
select GUEST_PERF_EVENTS if PERF_EVENTS
- select INTERVAL_TREE
+ select KVM_GUEST_MEMFD
help
Support hosting virtualized guest machines.
@@ -71,4 +66,21 @@ config PROTECTED_NVHE_STACKTRACE
If unsure, or not using protected nVHE (pKVM), say N.
+config PTDUMP_STAGE2_DEBUGFS
+ bool "Present the stage-2 pagetables to debugfs"
+ depends on KVM
+ depends on DEBUG_KERNEL
+ depends on DEBUG_FS
+ depends on ARCH_HAS_PTDUMP
+ select PTDUMP
+ default n
+ help
+ Say Y here if you want to show the stage-2 kernel pagetables
+ layout in a debugfs file. This information is only useful for kernel developers
+ who are working in architecture specific areas of the kernel.
+ It is probably not a good idea to enable this feature in a production
+ kernel.
+
+ If in doubt, say N.
+
endif # VIRTUALIZATION
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index 5e33c2d4645a..3ebc0570345c 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -3,26 +3,32 @@
# Makefile for Kernel-based Virtual Machine module
#
-ccflags-y += -I $(srctree)/$(src)
+ccflags-y += -I $(src)
include $(srctree)/virt/kvm/Makefile.kvm
obj-$(CONFIG_KVM) += kvm.o
obj-$(CONFIG_KVM) += hyp/
+CFLAGS_sys_regs.o += -Wno-override-init
+CFLAGS_handle_exit.o += -Wno-override-init
+
kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \
- inject_fault.o va_layout.o handle_exit.o \
+ inject_fault.o va_layout.o handle_exit.o config.o \
guest.o debug.o reset.o sys_regs.o stacktrace.o \
vgic-sys-reg-v3.o fpsimd.o pkvm.o \
- arch_timer.o trng.o vmid.o \
+ arch_timer.o trng.o vmid.o emulate-nested.o nested.o at.o \
vgic/vgic.o vgic/vgic-init.o \
vgic/vgic-irqfd.o vgic/vgic-v2.o \
vgic/vgic-v3.o vgic/vgic-v4.o \
vgic/vgic-mmio.o vgic/vgic-mmio-v2.o \
vgic/vgic-mmio-v3.o vgic/vgic-kvm-device.o \
- vgic/vgic-its.o vgic/vgic-debug.o
+ vgic/vgic-its.o vgic/vgic-debug.o vgic/vgic-v3-nested.o \
+ vgic/vgic-v5.o
kvm-$(CONFIG_HW_PERF_EVENTS) += pmu-emul.o pmu.o
+kvm-$(CONFIG_ARM64_PTR_AUTH) += pauth.o
+kvm-$(CONFIG_PTDUMP_STAGE2_DEBUGFS) += ptdump.o
always-y := hyp_constants.h hyp-constants.s
@@ -30,7 +36,7 @@ define rule_gen_hyp_constants
$(call filechk,offsets,__HYP_CONSTANTS_H__)
endef
-CFLAGS_hyp-constants.o = -I $(srctree)/$(src)/hyp/include
+CFLAGS_hyp-constants.o = -I $(src)/hyp/include
$(obj)/hyp-constants.s: $(src)/hyp/hyp-constants.c FORCE
$(call if_changed_dep,cc_s_c)
diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c
index bb24a76b4224..99a07972068d 100644
--- a/arch/arm64/kvm/arch_timer.c
+++ b/arch/arm64/kvm/arch_timer.c
@@ -16,6 +16,7 @@
#include <asm/arch_timer.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_hyp.h>
+#include <asm/kvm_nested.h>
#include <kvm/arm_vgic.h>
#include <kvm/arm_arch_timer.h>
@@ -29,15 +30,13 @@ static u32 host_vtimer_irq_flags;
static u32 host_ptimer_irq_flags;
static DEFINE_STATIC_KEY_FALSE(has_gic_active_state);
+DEFINE_STATIC_KEY_FALSE(broken_cntvoff_key);
-static const struct kvm_irq_level default_ptimer_irq = {
- .irq = 30,
- .level = 1,
-};
-
-static const struct kvm_irq_level default_vtimer_irq = {
- .irq = 27,
- .level = 1,
+static const u8 default_ppi[] = {
+ [TIMER_PTIMER] = 30,
+ [TIMER_VTIMER] = 27,
+ [TIMER_HPTIMER] = 26,
+ [TIMER_HVTIMER] = 28,
};
static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx);
@@ -51,16 +50,33 @@ static void kvm_arm_timer_write(struct kvm_vcpu *vcpu,
static u64 kvm_arm_timer_read(struct kvm_vcpu *vcpu,
struct arch_timer_context *timer,
enum kvm_arch_timer_regs treg);
+static bool kvm_arch_timer_get_input_level(int vintid);
+
+static struct irq_ops arch_timer_irq_ops = {
+ .get_input_level = kvm_arch_timer_get_input_level,
+};
+
+static int nr_timers(struct kvm_vcpu *vcpu)
+{
+ if (!vcpu_has_nv(vcpu))
+ return NR_KVM_EL0_TIMERS;
+
+ return NR_KVM_TIMERS;
+}
u32 timer_get_ctl(struct arch_timer_context *ctxt)
{
- struct kvm_vcpu *vcpu = ctxt->vcpu;
+ struct kvm_vcpu *vcpu = timer_context_to_vcpu(ctxt);
switch(arch_timer_ctx_index(ctxt)) {
case TIMER_VTIMER:
return __vcpu_sys_reg(vcpu, CNTV_CTL_EL0);
case TIMER_PTIMER:
return __vcpu_sys_reg(vcpu, CNTP_CTL_EL0);
+ case TIMER_HVTIMER:
+ return __vcpu_sys_reg(vcpu, CNTHV_CTL_EL2);
+ case TIMER_HPTIMER:
+ return __vcpu_sys_reg(vcpu, CNTHP_CTL_EL2);
default:
WARN_ON(1);
return 0;
@@ -69,41 +85,39 @@ u32 timer_get_ctl(struct arch_timer_context *ctxt)
u64 timer_get_cval(struct arch_timer_context *ctxt)
{
- struct kvm_vcpu *vcpu = ctxt->vcpu;
+ struct kvm_vcpu *vcpu = timer_context_to_vcpu(ctxt);
switch(arch_timer_ctx_index(ctxt)) {
case TIMER_VTIMER:
return __vcpu_sys_reg(vcpu, CNTV_CVAL_EL0);
case TIMER_PTIMER:
return __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0);
+ case TIMER_HVTIMER:
+ return __vcpu_sys_reg(vcpu, CNTHV_CVAL_EL2);
+ case TIMER_HPTIMER:
+ return __vcpu_sys_reg(vcpu, CNTHP_CVAL_EL2);
default:
WARN_ON(1);
return 0;
}
}
-static u64 timer_get_offset(struct arch_timer_context *ctxt)
-{
- struct kvm_vcpu *vcpu = ctxt->vcpu;
-
- switch(arch_timer_ctx_index(ctxt)) {
- case TIMER_VTIMER:
- return __vcpu_sys_reg(vcpu, CNTVOFF_EL2);
- default:
- return 0;
- }
-}
-
static void timer_set_ctl(struct arch_timer_context *ctxt, u32 ctl)
{
- struct kvm_vcpu *vcpu = ctxt->vcpu;
+ struct kvm_vcpu *vcpu = timer_context_to_vcpu(ctxt);
switch(arch_timer_ctx_index(ctxt)) {
case TIMER_VTIMER:
- __vcpu_sys_reg(vcpu, CNTV_CTL_EL0) = ctl;
+ __vcpu_assign_sys_reg(vcpu, CNTV_CTL_EL0, ctl);
break;
case TIMER_PTIMER:
- __vcpu_sys_reg(vcpu, CNTP_CTL_EL0) = ctl;
+ __vcpu_assign_sys_reg(vcpu, CNTP_CTL_EL0, ctl);
+ break;
+ case TIMER_HVTIMER:
+ __vcpu_assign_sys_reg(vcpu, CNTHV_CTL_EL2, ctl);
+ break;
+ case TIMER_HPTIMER:
+ __vcpu_assign_sys_reg(vcpu, CNTHP_CTL_EL2, ctl);
break;
default:
WARN_ON(1);
@@ -112,30 +126,23 @@ static void timer_set_ctl(struct arch_timer_context *ctxt, u32 ctl)
static void timer_set_cval(struct arch_timer_context *ctxt, u64 cval)
{
- struct kvm_vcpu *vcpu = ctxt->vcpu;
+ struct kvm_vcpu *vcpu = timer_context_to_vcpu(ctxt);
switch(arch_timer_ctx_index(ctxt)) {
case TIMER_VTIMER:
- __vcpu_sys_reg(vcpu, CNTV_CVAL_EL0) = cval;
+ __vcpu_assign_sys_reg(vcpu, CNTV_CVAL_EL0, cval);
break;
case TIMER_PTIMER:
- __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0) = cval;
+ __vcpu_assign_sys_reg(vcpu, CNTP_CVAL_EL0, cval);
break;
- default:
- WARN_ON(1);
- }
-}
-
-static void timer_set_offset(struct arch_timer_context *ctxt, u64 offset)
-{
- struct kvm_vcpu *vcpu = ctxt->vcpu;
-
- switch(arch_timer_ctx_index(ctxt)) {
- case TIMER_VTIMER:
- __vcpu_sys_reg(vcpu, CNTVOFF_EL2) = offset;
+ case TIMER_HVTIMER:
+ __vcpu_assign_sys_reg(vcpu, CNTHV_CVAL_EL2, cval);
+ break;
+ case TIMER_HPTIMER:
+ __vcpu_assign_sys_reg(vcpu, CNTHP_CVAL_EL2, cval);
break;
default:
- WARN(offset, "timer %ld\n", arch_timer_ctx_index(ctxt));
+ WARN_ON(1);
}
}
@@ -144,15 +151,29 @@ u64 kvm_phys_timer_read(void)
return timecounter->cc->read(timecounter->cc);
}
-static void get_timer_map(struct kvm_vcpu *vcpu, struct timer_map *map)
+void get_timer_map(struct kvm_vcpu *vcpu, struct timer_map *map)
{
- if (has_vhe()) {
+ if (vcpu_has_nv(vcpu)) {
+ if (is_hyp_ctxt(vcpu)) {
+ map->direct_vtimer = vcpu_hvtimer(vcpu);
+ map->direct_ptimer = vcpu_hptimer(vcpu);
+ map->emul_vtimer = vcpu_vtimer(vcpu);
+ map->emul_ptimer = vcpu_ptimer(vcpu);
+ } else {
+ map->direct_vtimer = vcpu_vtimer(vcpu);
+ map->direct_ptimer = vcpu_ptimer(vcpu);
+ map->emul_vtimer = vcpu_hvtimer(vcpu);
+ map->emul_ptimer = vcpu_hptimer(vcpu);
+ }
+ } else if (has_vhe()) {
map->direct_vtimer = vcpu_vtimer(vcpu);
map->direct_ptimer = vcpu_ptimer(vcpu);
+ map->emul_vtimer = NULL;
map->emul_ptimer = NULL;
} else {
map->direct_vtimer = vcpu_vtimer(vcpu);
map->direct_ptimer = NULL;
+ map->emul_vtimer = NULL;
map->emul_ptimer = vcpu_ptimer(vcpu);
}
@@ -161,8 +182,7 @@ static void get_timer_map(struct kvm_vcpu *vcpu, struct timer_map *map)
static inline bool userspace_irqchip(struct kvm *kvm)
{
- return static_branch_unlikely(&userspace_irqchip_in_use) &&
- unlikely(!irqchip_in_kernel(kvm));
+ return unlikely(!irqchip_in_kernel(kvm));
}
static void soft_timer_start(struct hrtimer *hrt, u64 ns)
@@ -219,7 +239,7 @@ static u64 kvm_counter_compute_delta(struct arch_timer_context *timer_ctx,
ns = cyclecounter_cyc2ns(timecounter->cc,
val - now,
timecounter->mask,
- &timecounter->frac);
+ &timer_ctx->ns_frac);
return ns;
}
@@ -247,8 +267,10 @@ static bool vcpu_has_wfit_active(struct kvm_vcpu *vcpu)
static u64 wfit_delay_ns(struct kvm_vcpu *vcpu)
{
- struct arch_timer_context *ctx = vcpu_vtimer(vcpu);
u64 val = vcpu_get_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu));
+ struct arch_timer_context *ctx;
+
+ ctx = is_hyp_ctxt(vcpu) ? vcpu_hvtimer(vcpu) : vcpu_vtimer(vcpu);
return kvm_counter_compute_delta(ctx, val);
}
@@ -262,7 +284,7 @@ static u64 kvm_timer_earliest_exp(struct kvm_vcpu *vcpu)
u64 min_delta = ULLONG_MAX;
int i;
- for (i = 0; i < NR_KVM_TIMERS; i++) {
+ for (i = 0; i < nr_timers(vcpu); i++) {
struct arch_timer_context *ctx = &vcpu->arch.timer_cpu.timers[i];
WARN(ctx->loaded, "timer %d loaded\n", i);
@@ -311,7 +333,7 @@ static enum hrtimer_restart kvm_hrtimer_expire(struct hrtimer *hrt)
u64 ns;
ctx = container_of(hrt, struct arch_timer_context, hrtimer);
- vcpu = ctx->vcpu;
+ vcpu = timer_context_to_vcpu(ctx);
trace_kvm_timer_hrtimer_expire(ctx);
@@ -345,9 +367,11 @@ static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx)
switch (index) {
case TIMER_VTIMER:
+ case TIMER_HVTIMER:
cnt_ctl = read_sysreg_el0(SYS_CNTV_CTL);
break;
case TIMER_PTIMER:
+ case TIMER_HPTIMER:
cnt_ctl = read_sysreg_el0(SYS_CNTP_CTL);
break;
case NR_KVM_TIMERS:
@@ -393,22 +417,40 @@ void kvm_timer_update_run(struct kvm_vcpu *vcpu)
regs->device_irq_level |= KVM_ARM_DEV_EL1_PTIMER;
}
+static void kvm_timer_update_status(struct arch_timer_context *ctx, bool level)
+{
+ /*
+ * Paper over NV2 brokenness by publishing the interrupt status
+ * bit. This still results in a poor quality of emulation (guest
+ * writes will have no effect until the next exit).
+ *
+ * But hey, it's fast, right?
+ */
+ struct kvm_vcpu *vcpu = timer_context_to_vcpu(ctx);
+ if (is_hyp_ctxt(vcpu) &&
+ (ctx == vcpu_vtimer(vcpu) || ctx == vcpu_ptimer(vcpu))) {
+ unsigned long val = timer_get_ctl(ctx);
+ __assign_bit(__ffs(ARCH_TIMER_CTRL_IT_STAT), &val, level);
+ timer_set_ctl(ctx, val);
+ }
+}
+
static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level,
struct arch_timer_context *timer_ctx)
{
- int ret;
+ kvm_timer_update_status(timer_ctx, new_level);
timer_ctx->irq.level = new_level;
- trace_kvm_timer_update_irq(vcpu->vcpu_id, timer_ctx->irq.irq,
+ trace_kvm_timer_update_irq(vcpu->vcpu_id, timer_irq(timer_ctx),
timer_ctx->irq.level);
- if (!userspace_irqchip(vcpu->kvm)) {
- ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id,
- timer_ctx->irq.irq,
- timer_ctx->irq.level,
- timer_ctx);
- WARN_ON(ret);
- }
+ if (userspace_irqchip(vcpu->kvm))
+ return;
+
+ kvm_vgic_inject_irq(vcpu->kvm, vcpu,
+ timer_irq(timer_ctx),
+ timer_ctx->irq.level,
+ timer_ctx);
}
/* Only called for a fully emulated timer */
@@ -418,27 +460,36 @@ static void timer_emulate(struct arch_timer_context *ctx)
trace_kvm_timer_emulate(ctx, should_fire);
- if (should_fire != ctx->irq.level) {
- kvm_timer_update_irq(ctx->vcpu, should_fire, ctx);
- return;
- }
+ if (should_fire != ctx->irq.level)
+ kvm_timer_update_irq(timer_context_to_vcpu(ctx), should_fire, ctx);
+
+ kvm_timer_update_status(ctx, should_fire);
/*
* If the timer can fire now, we don't need to have a soft timer
* scheduled for the future. If the timer cannot fire at all,
* then we also don't need a soft timer.
*/
- if (!kvm_timer_irq_can_fire(ctx)) {
- soft_timer_cancel(&ctx->hrtimer);
+ if (should_fire || !kvm_timer_irq_can_fire(ctx))
return;
- }
soft_timer_start(&ctx->hrtimer, kvm_timer_compute_delta(ctx));
}
+static void set_cntvoff(u64 cntvoff)
+{
+ kvm_call_hyp(__kvm_timer_set_cntvoff, cntvoff);
+}
+
+static void set_cntpoff(u64 cntpoff)
+{
+ if (has_cntpoff())
+ write_sysreg_s(cntpoff, SYS_CNTPOFF_EL2);
+}
+
static void timer_save_state(struct arch_timer_context *ctx)
{
- struct arch_timer_cpu *timer = vcpu_timer(ctx->vcpu);
+ struct arch_timer_cpu *timer = vcpu_timer(timer_context_to_vcpu(ctx));
enum kvm_arch_timers index = arch_timer_ctx_index(ctx);
unsigned long flags;
@@ -451,23 +502,53 @@ static void timer_save_state(struct arch_timer_context *ctx)
goto out;
switch (index) {
+ u64 cval;
+
case TIMER_VTIMER:
+ case TIMER_HVTIMER:
timer_set_ctl(ctx, read_sysreg_el0(SYS_CNTV_CTL));
- timer_set_cval(ctx, read_sysreg_el0(SYS_CNTV_CVAL));
+ cval = read_sysreg_el0(SYS_CNTV_CVAL);
+
+ if (has_broken_cntvoff())
+ cval -= timer_get_offset(ctx);
+
+ timer_set_cval(ctx, cval);
/* Disable the timer */
write_sysreg_el0(0, SYS_CNTV_CTL);
isb();
+ /*
+ * The kernel may decide to run userspace after
+ * calling vcpu_put, so we reset cntvoff to 0 to
+ * ensure a consistent read between user accesses to
+ * the virtual counter and kernel access to the
+ * physical counter of non-VHE case.
+ *
+ * For VHE, the virtual counter uses a fixed virtual
+ * offset of zero, so no need to zero CNTVOFF_EL2
+ * register, but this is actually useful when switching
+ * between EL1/vEL2 with NV.
+ *
+ * Do it unconditionally, as this is either unavoidable
+ * or dirt cheap.
+ */
+ set_cntvoff(0);
break;
case TIMER_PTIMER:
+ case TIMER_HPTIMER:
timer_set_ctl(ctx, read_sysreg_el0(SYS_CNTP_CTL));
- timer_set_cval(ctx, read_sysreg_el0(SYS_CNTP_CVAL));
+ cval = read_sysreg_el0(SYS_CNTP_CVAL);
+
+ cval -= timer_get_offset(ctx);
+
+ timer_set_cval(ctx, cval);
/* Disable the timer */
write_sysreg_el0(0, SYS_CNTP_CTL);
isb();
+ set_cntpoff(0);
break;
case NR_KVM_TIMERS:
BUG();
@@ -498,6 +579,7 @@ static void kvm_timer_blocking(struct kvm_vcpu *vcpu)
*/
if (!kvm_timer_irq_can_fire(map.direct_vtimer) &&
!kvm_timer_irq_can_fire(map.direct_ptimer) &&
+ !kvm_timer_irq_can_fire(map.emul_vtimer) &&
!kvm_timer_irq_can_fire(map.emul_ptimer) &&
!vcpu_has_wfit_active(vcpu))
return;
@@ -518,7 +600,7 @@ static void kvm_timer_unblocking(struct kvm_vcpu *vcpu)
static void timer_restore_state(struct arch_timer_context *ctx)
{
- struct arch_timer_cpu *timer = vcpu_timer(ctx->vcpu);
+ struct arch_timer_cpu *timer = vcpu_timer(timer_context_to_vcpu(ctx));
enum kvm_arch_timers index = arch_timer_ctx_index(ctx);
unsigned long flags;
@@ -531,13 +613,29 @@ static void timer_restore_state(struct arch_timer_context *ctx)
goto out;
switch (index) {
+ u64 cval, offset;
+
case TIMER_VTIMER:
- write_sysreg_el0(timer_get_cval(ctx), SYS_CNTV_CVAL);
+ case TIMER_HVTIMER:
+ cval = timer_get_cval(ctx);
+ offset = timer_get_offset(ctx);
+ if (has_broken_cntvoff()) {
+ set_cntvoff(0);
+ cval += offset;
+ } else {
+ set_cntvoff(offset);
+ }
+ write_sysreg_el0(cval, SYS_CNTV_CVAL);
isb();
write_sysreg_el0(timer_get_ctl(ctx), SYS_CNTV_CTL);
break;
case TIMER_PTIMER:
- write_sysreg_el0(timer_get_cval(ctx), SYS_CNTP_CVAL);
+ case TIMER_HPTIMER:
+ cval = timer_get_cval(ctx);
+ offset = timer_get_offset(ctx);
+ set_cntpoff(offset);
+ cval += offset;
+ write_sysreg_el0(cval, SYS_CNTP_CVAL);
isb();
write_sysreg_el0(timer_get_ctl(ctx), SYS_CNTP_CTL);
break;
@@ -552,11 +650,6 @@ out:
local_irq_restore(flags);
}
-static void set_cntvoff(u64 cntvoff)
-{
- kvm_call_hyp(__kvm_timer_set_cntvoff, cntvoff);
-}
-
static inline void set_timer_irq_phys_active(struct arch_timer_context *ctx, bool active)
{
int r;
@@ -566,7 +659,7 @@ static inline void set_timer_irq_phys_active(struct arch_timer_context *ctx, boo
static void kvm_timer_vcpu_load_gic(struct arch_timer_context *ctx)
{
- struct kvm_vcpu *vcpu = ctx->vcpu;
+ struct kvm_vcpu *vcpu = timer_context_to_vcpu(ctx);
bool phys_active = false;
/*
@@ -575,10 +668,10 @@ static void kvm_timer_vcpu_load_gic(struct arch_timer_context *ctx)
* this point and the register restoration, we'll take the
* interrupt anyway.
*/
- kvm_timer_update_irq(ctx->vcpu, kvm_timer_should_fire(ctx), ctx);
+ kvm_timer_update_irq(vcpu, kvm_timer_should_fire(ctx), ctx);
if (irqchip_in_kernel(vcpu->kvm))
- phys_active = kvm_vgic_map_is_active(vcpu, ctx->irq.irq);
+ phys_active = kvm_vgic_map_is_active(vcpu, timer_irq(ctx));
phys_active |= ctx->irq.level;
@@ -613,6 +706,153 @@ static void kvm_timer_vcpu_load_nogic(struct kvm_vcpu *vcpu)
enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags);
}
+/* If _pred is true, set bit in _set, otherwise set it in _clr */
+#define assign_clear_set_bit(_pred, _bit, _clr, _set) \
+ do { \
+ if (_pred) \
+ (_set) |= (_bit); \
+ else \
+ (_clr) |= (_bit); \
+ } while (0)
+
+static void kvm_timer_vcpu_load_nested_switch(struct kvm_vcpu *vcpu,
+ struct timer_map *map)
+{
+ int hw, ret;
+
+ if (!irqchip_in_kernel(vcpu->kvm))
+ return;
+
+ /*
+ * We only ever unmap the vtimer irq on a VHE system that runs nested
+ * virtualization, in which case we have both a valid emul_vtimer,
+ * emul_ptimer, direct_vtimer, and direct_ptimer.
+ *
+ * Since this is called from kvm_timer_vcpu_load(), a change between
+ * vEL2 and vEL1/0 will have just happened, and the timer_map will
+ * represent this, and therefore we switch the emul/direct mappings
+ * below.
+ */
+ hw = kvm_vgic_get_map(vcpu, timer_irq(map->direct_vtimer));
+ if (hw < 0) {
+ kvm_vgic_unmap_phys_irq(vcpu, timer_irq(map->emul_vtimer));
+ kvm_vgic_unmap_phys_irq(vcpu, timer_irq(map->emul_ptimer));
+
+ ret = kvm_vgic_map_phys_irq(vcpu,
+ map->direct_vtimer->host_timer_irq,
+ timer_irq(map->direct_vtimer),
+ &arch_timer_irq_ops);
+ WARN_ON_ONCE(ret);
+ ret = kvm_vgic_map_phys_irq(vcpu,
+ map->direct_ptimer->host_timer_irq,
+ timer_irq(map->direct_ptimer),
+ &arch_timer_irq_ops);
+ WARN_ON_ONCE(ret);
+ }
+}
+
+static void timer_set_traps(struct kvm_vcpu *vcpu, struct timer_map *map)
+{
+ bool tvt, tpt, tvc, tpc, tvt02, tpt02;
+ u64 clr, set;
+
+ /*
+ * No trapping gets configured here with nVHE. See
+ * __timer_enable_traps(), which is where the stuff happens.
+ */
+ if (!has_vhe())
+ return;
+
+ /*
+ * Our default policy is not to trap anything. As we progress
+ * within this function, reality kicks in and we start adding
+ * traps based on emulation requirements.
+ */
+ tvt = tpt = tvc = tpc = false;
+ tvt02 = tpt02 = false;
+
+ /*
+ * NV2 badly breaks the timer semantics by redirecting accesses to
+ * the EL1 timer state to memory, so let's call ECV to the rescue if
+ * available: we trap all CNT{P,V}_{CTL,CVAL,TVAL}_EL0 accesses.
+ *
+ * The treatment slightly varies depending whether we run a nVHE or
+ * VHE guest: nVHE will use the _EL0 registers directly, while VHE
+ * will use the _EL02 accessors. This translates in different trap
+ * bits.
+ *
+ * None of the trapping is required when running in non-HYP context,
+ * unless required by the L1 hypervisor settings once we advertise
+ * ECV+NV in the guest, or that we need trapping for other reasons.
+ */
+ if (cpus_have_final_cap(ARM64_HAS_ECV) && is_hyp_ctxt(vcpu)) {
+ if (vcpu_el2_e2h_is_set(vcpu))
+ tvt02 = tpt02 = true;
+ else
+ tvt = tpt = true;
+ }
+
+ /*
+ * We have two possibility to deal with a physical offset:
+ *
+ * - Either we have CNTPOFF (yay!) or the offset is 0:
+ * we let the guest freely access the HW
+ *
+ * - or neither of these condition apply:
+ * we trap accesses to the HW, but still use it
+ * after correcting the physical offset
+ */
+ if (!has_cntpoff() && timer_get_offset(map->direct_ptimer))
+ tpt = tpc = true;
+
+ /*
+ * For the poor sods that could not correctly subtract one value
+ * from another, trap the full virtual timer and counter.
+ */
+ if (has_broken_cntvoff() && timer_get_offset(map->direct_vtimer))
+ tvt = tvc = true;
+
+ /*
+ * Apply the enable bits that the guest hypervisor has requested for
+ * its own guest. We can only add traps that wouldn't have been set
+ * above.
+ * Implementation choices: we do not support NV when E2H=0 in the
+ * guest, and we don't support configuration where E2H is writable
+ * by the guest (either FEAT_VHE or FEAT_E2H0 is implemented, but
+ * not both). This simplifies the handling of the EL1NV* bits.
+ */
+ if (is_nested_ctxt(vcpu)) {
+ u64 val = __vcpu_sys_reg(vcpu, CNTHCTL_EL2);
+
+ /* Use the VHE format for mental sanity */
+ if (!vcpu_el2_e2h_is_set(vcpu))
+ val = (val & (CNTHCTL_EL1PCEN | CNTHCTL_EL1PCTEN)) << 10;
+
+ tpt |= !(val & (CNTHCTL_EL1PCEN << 10));
+ tpc |= !(val & (CNTHCTL_EL1PCTEN << 10));
+
+ tpt02 |= (val & CNTHCTL_EL1NVPCT);
+ tvt02 |= (val & CNTHCTL_EL1NVVCT);
+ }
+
+ /*
+ * Now that we have collected our requirements, compute the
+ * trap and enable bits.
+ */
+ set = 0;
+ clr = 0;
+
+ assign_clear_set_bit(tpt, CNTHCTL_EL1PCEN << 10, set, clr);
+ assign_clear_set_bit(tpc, CNTHCTL_EL1PCTEN << 10, set, clr);
+ assign_clear_set_bit(tvt, CNTHCTL_EL1TVT, clr, set);
+ assign_clear_set_bit(tvc, CNTHCTL_EL1TVCT, clr, set);
+ assign_clear_set_bit(tvt02, CNTHCTL_EL1NVVCT, clr, set);
+ assign_clear_set_bit(tpt02, CNTHCTL_EL1NVPCT, clr, set);
+
+ /* This only happens on VHE, so use the CNTHCTL_EL2 accessor. */
+ sysreg_clear_set(cnthctl_el2, clr, set);
+}
+
void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu)
{
struct arch_timer_cpu *timer = vcpu_timer(vcpu);
@@ -624,6 +864,9 @@ void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu)
get_timer_map(vcpu, &map);
if (static_branch_likely(&has_gic_active_state)) {
+ if (vcpu_has_nv(vcpu))
+ kvm_timer_vcpu_load_nested_switch(vcpu, &map);
+
kvm_timer_vcpu_load_gic(map.direct_vtimer);
if (map.direct_ptimer)
kvm_timer_vcpu_load_gic(map.direct_ptimer);
@@ -631,16 +874,17 @@ void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu)
kvm_timer_vcpu_load_nogic(vcpu);
}
- set_cntvoff(timer_get_offset(map.direct_vtimer));
-
kvm_timer_unblocking(vcpu);
timer_restore_state(map.direct_vtimer);
if (map.direct_ptimer)
timer_restore_state(map.direct_ptimer);
-
+ if (map.emul_vtimer)
+ timer_emulate(map.emul_vtimer);
if (map.emul_ptimer)
timer_emulate(map.emul_ptimer);
+
+ timer_set_traps(vcpu, &map);
}
bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu)
@@ -683,20 +927,51 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu)
* In any case, we re-schedule the hrtimer for the physical timer when
* coming back to the VCPU thread in kvm_timer_vcpu_load().
*/
+ if (map.emul_vtimer)
+ soft_timer_cancel(&map.emul_vtimer->hrtimer);
if (map.emul_ptimer)
soft_timer_cancel(&map.emul_ptimer->hrtimer);
if (kvm_vcpu_is_blocking(vcpu))
kvm_timer_blocking(vcpu);
+}
+void kvm_timer_sync_nested(struct kvm_vcpu *vcpu)
+{
/*
- * The kernel may decide to run userspace after calling vcpu_put, so
- * we reset cntvoff to 0 to ensure a consistent read between user
- * accesses to the virtual counter and kernel access to the physical
- * counter of non-VHE case. For VHE, the virtual counter uses a fixed
- * virtual offset of zero, so no need to zero CNTVOFF_EL2 register.
+ * When NV2 is on, guest hypervisors have their EL1 timer register
+ * accesses redirected to the VNCR page. Any guest action taken on
+ * the timer is postponed until the next exit, leading to a very
+ * poor quality of emulation.
+ *
+ * This is an unmitigated disaster, only papered over by FEAT_ECV,
+ * which allows trapping of the timer registers even with NV2.
+ * Still, this is still worse than FEAT_NV on its own. Meh.
*/
- set_cntvoff(0);
+ if (!cpus_have_final_cap(ARM64_HAS_ECV)) {
+ /*
+ * For a VHE guest hypervisor, the EL2 state is directly
+ * stored in the host EL1 timers, while the emulated EL1
+ * state is stored in the VNCR page. The latter could have
+ * been updated behind our back, and we must reset the
+ * emulation of the timers.
+ *
+ * A non-VHE guest hypervisor doesn't have any direct access
+ * to its timers: the EL2 registers trap despite being
+ * notionally direct (we use the EL1 HW, as for VHE), while
+ * the EL1 registers access memory.
+ *
+ * In both cases, process the emulated timers on each guest
+ * exit. Boo.
+ */
+ struct timer_map map;
+ get_timer_map(vcpu, &map);
+
+ soft_timer_cancel(&map.emul_vtimer->hrtimer);
+ soft_timer_cancel(&map.emul_ptimer->hrtimer);
+ timer_emulate(map.emul_vtimer);
+ timer_emulate(map.emul_ptimer);
+ }
}
/*
@@ -728,7 +1003,7 @@ void kvm_timer_sync_user(struct kvm_vcpu *vcpu)
unmask_vtimer_irq_user(vcpu);
}
-int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu)
+void kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu)
{
struct arch_timer_cpu *timer = vcpu_timer(vcpu);
struct timer_map map;
@@ -741,113 +1016,100 @@ int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu)
* resets the timer to be disabled and unmasked and is compliant with
* the ARMv7 architecture.
*/
- timer_set_ctl(vcpu_vtimer(vcpu), 0);
- timer_set_ctl(vcpu_ptimer(vcpu), 0);
+ for (int i = 0; i < nr_timers(vcpu); i++)
+ timer_set_ctl(vcpu_get_timer(vcpu, i), 0);
+
+ /*
+ * A vcpu running at EL2 is in charge of the offset applied to
+ * the virtual timer, so use the physical VM offset, and point
+ * the vcpu offset to CNTVOFF_EL2.
+ */
+ if (vcpu_has_nv(vcpu)) {
+ struct arch_timer_offset *offs = &vcpu_vtimer(vcpu)->offset;
+
+ offs->vcpu_offset = __ctxt_sys_reg(&vcpu->arch.ctxt, CNTVOFF_EL2);
+ offs->vm_offset = &vcpu->kvm->arch.timer_data.poffset;
+ }
if (timer->enabled) {
- kvm_timer_update_irq(vcpu, false, vcpu_vtimer(vcpu));
- kvm_timer_update_irq(vcpu, false, vcpu_ptimer(vcpu));
+ for (int i = 0; i < nr_timers(vcpu); i++)
+ kvm_timer_update_irq(vcpu, false,
+ vcpu_get_timer(vcpu, i));
if (irqchip_in_kernel(vcpu->kvm)) {
- kvm_vgic_reset_mapped_irq(vcpu, map.direct_vtimer->irq.irq);
+ kvm_vgic_reset_mapped_irq(vcpu, timer_irq(map.direct_vtimer));
if (map.direct_ptimer)
- kvm_vgic_reset_mapped_irq(vcpu, map.direct_ptimer->irq.irq);
+ kvm_vgic_reset_mapped_irq(vcpu, timer_irq(map.direct_ptimer));
}
}
+ if (map.emul_vtimer)
+ soft_timer_cancel(&map.emul_vtimer->hrtimer);
if (map.emul_ptimer)
soft_timer_cancel(&map.emul_ptimer->hrtimer);
-
- return 0;
}
-/* Make the updates of cntvoff for all vtimer contexts atomic */
-static void update_vtimer_cntvoff(struct kvm_vcpu *vcpu, u64 cntvoff)
+static void timer_context_init(struct kvm_vcpu *vcpu, int timerid)
{
- unsigned long i;
+ struct arch_timer_context *ctxt = vcpu_get_timer(vcpu, timerid);
struct kvm *kvm = vcpu->kvm;
- struct kvm_vcpu *tmp;
- mutex_lock(&kvm->lock);
- kvm_for_each_vcpu(i, tmp, kvm)
- timer_set_offset(vcpu_vtimer(tmp), cntvoff);
+ ctxt->timer_id = timerid;
- /*
- * When called from the vcpu create path, the CPU being created is not
- * included in the loop above, so we just set it here as well.
- */
- timer_set_offset(vcpu_vtimer(vcpu), cntvoff);
- mutex_unlock(&kvm->lock);
+ if (timerid == TIMER_VTIMER)
+ ctxt->offset.vm_offset = &kvm->arch.timer_data.voffset;
+ else
+ ctxt->offset.vm_offset = &kvm->arch.timer_data.poffset;
+
+ hrtimer_setup(&ctxt->hrtimer, kvm_hrtimer_expire, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
+
+ switch (timerid) {
+ case TIMER_PTIMER:
+ case TIMER_HPTIMER:
+ ctxt->host_timer_irq = host_ptimer_irq;
+ break;
+ case TIMER_VTIMER:
+ case TIMER_HVTIMER:
+ ctxt->host_timer_irq = host_vtimer_irq;
+ break;
+ }
}
void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu)
{
struct arch_timer_cpu *timer = vcpu_timer(vcpu);
- struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
- struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
-
- vtimer->vcpu = vcpu;
- ptimer->vcpu = vcpu;
- /* Synchronize cntvoff across all vtimers of a VM. */
- update_vtimer_cntvoff(vcpu, kvm_phys_timer_read());
- timer_set_offset(ptimer, 0);
+ for (int i = 0; i < NR_KVM_TIMERS; i++)
+ timer_context_init(vcpu, i);
- hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
- timer->bg_timer.function = kvm_bg_timer_expire;
-
- hrtimer_init(&vtimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
- hrtimer_init(&ptimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
- vtimer->hrtimer.function = kvm_hrtimer_expire;
- ptimer->hrtimer.function = kvm_hrtimer_expire;
-
- vtimer->irq.irq = default_vtimer_irq.irq;
- ptimer->irq.irq = default_ptimer_irq.irq;
+ /* Synchronize offsets across timers of a VM if not already provided */
+ if (!test_bit(KVM_ARCH_FLAG_VM_COUNTER_OFFSET, &vcpu->kvm->arch.flags)) {
+ timer_set_offset(vcpu_vtimer(vcpu), kvm_phys_timer_read());
+ timer_set_offset(vcpu_ptimer(vcpu), 0);
+ }
- vtimer->host_timer_irq = host_vtimer_irq;
- ptimer->host_timer_irq = host_ptimer_irq;
+ hrtimer_setup(&timer->bg_timer, kvm_bg_timer_expire, CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS_HARD);
+}
- vtimer->host_timer_irq_flags = host_vtimer_irq_flags;
- ptimer->host_timer_irq_flags = host_ptimer_irq_flags;
+void kvm_timer_init_vm(struct kvm *kvm)
+{
+ for (int i = 0; i < NR_KVM_TIMERS; i++)
+ kvm->arch.timer_data.ppi[i] = default_ppi[i];
}
-static void kvm_timer_init_interrupt(void *info)
+void kvm_timer_cpu_up(void)
{
enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags);
- enable_percpu_irq(host_ptimer_irq, host_ptimer_irq_flags);
+ if (host_ptimer_irq)
+ enable_percpu_irq(host_ptimer_irq, host_ptimer_irq_flags);
}
-int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value)
+void kvm_timer_cpu_down(void)
{
- struct arch_timer_context *timer;
-
- switch (regid) {
- case KVM_REG_ARM_TIMER_CTL:
- timer = vcpu_vtimer(vcpu);
- kvm_arm_timer_write(vcpu, timer, TIMER_REG_CTL, value);
- break;
- case KVM_REG_ARM_TIMER_CNT:
- timer = vcpu_vtimer(vcpu);
- update_vtimer_cntvoff(vcpu, kvm_phys_timer_read() - value);
- break;
- case KVM_REG_ARM_TIMER_CVAL:
- timer = vcpu_vtimer(vcpu);
- kvm_arm_timer_write(vcpu, timer, TIMER_REG_CVAL, value);
- break;
- case KVM_REG_ARM_PTIMER_CTL:
- timer = vcpu_ptimer(vcpu);
- kvm_arm_timer_write(vcpu, timer, TIMER_REG_CTL, value);
- break;
- case KVM_REG_ARM_PTIMER_CVAL:
- timer = vcpu_ptimer(vcpu);
- kvm_arm_timer_write(vcpu, timer, TIMER_REG_CVAL, value);
- break;
-
- default:
- return -1;
- }
-
- return 0;
+ disable_percpu_irq(host_vtimer_irq);
+ if (host_ptimer_irq)
+ disable_percpu_irq(host_ptimer_irq);
}
static u64 read_timer_ctl(struct arch_timer_context *timer)
@@ -866,31 +1128,6 @@ static u64 read_timer_ctl(struct arch_timer_context *timer)
return ctl;
}
-u64 kvm_arm_timer_get_reg(struct kvm_vcpu *vcpu, u64 regid)
-{
- switch (regid) {
- case KVM_REG_ARM_TIMER_CTL:
- return kvm_arm_timer_read(vcpu,
- vcpu_vtimer(vcpu), TIMER_REG_CTL);
- case KVM_REG_ARM_TIMER_CNT:
- return kvm_arm_timer_read(vcpu,
- vcpu_vtimer(vcpu), TIMER_REG_CNT);
- case KVM_REG_ARM_TIMER_CVAL:
- return kvm_arm_timer_read(vcpu,
- vcpu_vtimer(vcpu), TIMER_REG_CVAL);
- case KVM_REG_ARM_PTIMER_CTL:
- return kvm_arm_timer_read(vcpu,
- vcpu_ptimer(vcpu), TIMER_REG_CTL);
- case KVM_REG_ARM_PTIMER_CNT:
- return kvm_arm_timer_read(vcpu,
- vcpu_ptimer(vcpu), TIMER_REG_CNT);
- case KVM_REG_ARM_PTIMER_CVAL:
- return kvm_arm_timer_read(vcpu,
- vcpu_ptimer(vcpu), TIMER_REG_CVAL);
- }
- return (u64)-1;
-}
-
static u64 kvm_arm_timer_read(struct kvm_vcpu *vcpu,
struct arch_timer_context *timer,
enum kvm_arch_timer_regs treg)
@@ -915,6 +1152,10 @@ static u64 kvm_arm_timer_read(struct kvm_vcpu *vcpu,
val = kvm_phys_timer_read() - timer_get_offset(timer);
break;
+ case TIMER_REG_VOFF:
+ val = *timer->offset.vcpu_offset;
+ break;
+
default:
BUG();
}
@@ -926,14 +1167,22 @@ u64 kvm_arm_timer_read_sysreg(struct kvm_vcpu *vcpu,
enum kvm_arch_timers tmr,
enum kvm_arch_timer_regs treg)
{
+ struct arch_timer_context *timer;
+ struct timer_map map;
u64 val;
+ get_timer_map(vcpu, &map);
+ timer = vcpu_get_timer(vcpu, tmr);
+
+ if (timer == map.emul_vtimer || timer == map.emul_ptimer)
+ return kvm_arm_timer_read(vcpu, timer, treg);
+
preempt_disable();
- kvm_timer_vcpu_put(vcpu);
+ timer_save_state(timer);
- val = kvm_arm_timer_read(vcpu, vcpu_get_timer(vcpu, tmr), treg);
+ val = kvm_arm_timer_read(vcpu, timer, treg);
- kvm_timer_vcpu_load(vcpu);
+ timer_restore_state(timer);
preempt_enable();
return val;
@@ -957,6 +1206,10 @@ static void kvm_arm_timer_write(struct kvm_vcpu *vcpu,
timer_set_cval(timer, val);
break;
+ case TIMER_REG_VOFF:
+ *timer->offset.vcpu_offset = val;
+ break;
+
default:
BUG();
}
@@ -967,25 +1220,22 @@ void kvm_arm_timer_write_sysreg(struct kvm_vcpu *vcpu,
enum kvm_arch_timer_regs treg,
u64 val)
{
- preempt_disable();
- kvm_timer_vcpu_put(vcpu);
-
- kvm_arm_timer_write(vcpu, vcpu_get_timer(vcpu, tmr), treg, val);
-
- kvm_timer_vcpu_load(vcpu);
- preempt_enable();
-}
-
-static int kvm_timer_starting_cpu(unsigned int cpu)
-{
- kvm_timer_init_interrupt(NULL);
- return 0;
-}
+ struct arch_timer_context *timer;
+ struct timer_map map;
-static int kvm_timer_dying_cpu(unsigned int cpu)
-{
- disable_percpu_irq(host_vtimer_irq);
- return 0;
+ get_timer_map(vcpu, &map);
+ timer = vcpu_get_timer(vcpu, tmr);
+ if (timer == map.emul_vtimer || timer == map.emul_ptimer) {
+ soft_timer_cancel(&timer->hrtimer);
+ kvm_arm_timer_write(vcpu, timer, treg, val);
+ timer_emulate(timer);
+ } else {
+ preempt_disable();
+ timer_save_state(timer);
+ kvm_arm_timer_write(vcpu, timer, treg, val);
+ timer_restore_state(timer);
+ preempt_enable();
+ }
}
static int timer_irq_set_vcpu_affinity(struct irq_data *d, void *vcpu)
@@ -1055,10 +1305,6 @@ static const struct irq_domain_ops timer_domain_ops = {
.free = timer_irq_domain_free,
};
-static struct irq_ops arch_timer_irq_ops = {
- .get_input_level = kvm_arch_timer_get_input_level,
-};
-
static void kvm_irq_fixup_flags(unsigned int virq, u32 *flags)
{
*flags = irq_get_trigger_type(virq);
@@ -1117,7 +1363,38 @@ static int kvm_irq_init(struct arch_timer_kvm_info *info)
return 0;
}
-int kvm_timer_hyp_init(bool has_gic)
+static void kvm_timer_handle_errata(void)
+{
+ u64 mmfr0, mmfr1, mmfr4;
+
+ /*
+ * CNTVOFF_EL2 is broken on some implementations. For those, we trap
+ * all virtual timer/counter accesses, requiring FEAT_ECV.
+ *
+ * However, a hypervisor supporting nesting is likely to mitigate the
+ * erratum at L0, and not require other levels to mitigate it (which
+ * would otherwise be a terrible performance sink due to trap
+ * amplification).
+ *
+ * Given that the affected HW implements both FEAT_VHE and FEAT_E2H0,
+ * and that NV is likely not to (because of limitations of the
+ * architecture), only enable the workaround when FEAT_VHE and
+ * FEAT_E2H0 are both detected. Time will tell if this actually holds.
+ */
+ mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
+ mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
+ mmfr4 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR4_EL1);
+ if (SYS_FIELD_GET(ID_AA64MMFR1_EL1, VH, mmfr1) &&
+ !SYS_FIELD_GET(ID_AA64MMFR4_EL1, E2H0, mmfr4) &&
+ SYS_FIELD_GET(ID_AA64MMFR0_EL1, ECV, mmfr0) &&
+ (has_vhe() || has_hvhe()) &&
+ cpus_have_final_cap(ARM64_WORKAROUND_QCOM_ORYON_CNTVOFF)) {
+ static_branch_enable(&broken_cntvoff_key);
+ kvm_info("Broken CNTVOFF_EL2, trapping virtual timer\n");
+ }
+}
+
+int __init kvm_timer_hyp_init(bool has_gic)
{
struct arch_timer_kvm_info *info;
int err;
@@ -1149,7 +1426,7 @@ int kvm_timer_hyp_init(bool has_gic)
kvm_get_running_vcpus());
if (err) {
kvm_err("kvm_arch_timer: error setting vcpu affinity\n");
- goto out_free_irq;
+ goto out_free_vtimer_irq;
}
static_branch_enable(&has_gic_active_state);
@@ -1165,7 +1442,7 @@ int kvm_timer_hyp_init(bool has_gic)
if (err) {
kvm_err("kvm_arch_timer: can't request ptimer interrupt %d (%d)\n",
host_ptimer_irq, err);
- return err;
+ goto out_free_vtimer_irq;
}
if (has_gic) {
@@ -1173,7 +1450,7 @@ int kvm_timer_hyp_init(bool has_gic)
kvm_get_running_vcpus());
if (err) {
kvm_err("kvm_arch_timer: error setting vcpu affinity\n");
- goto out_free_irq;
+ goto out_free_ptimer_irq;
}
}
@@ -1182,14 +1459,16 @@ int kvm_timer_hyp_init(bool has_gic)
kvm_err("kvm_arch_timer: invalid physical timer IRQ: %d\n",
info->physical_irq);
err = -ENODEV;
- goto out_free_irq;
+ goto out_free_vtimer_irq;
}
- cpuhp_setup_state(CPUHP_AP_KVM_ARM_TIMER_STARTING,
- "kvm/arm/timer:starting", kvm_timer_starting_cpu,
- kvm_timer_dying_cpu);
+ kvm_timer_handle_errata();
return 0;
-out_free_irq:
+
+out_free_ptimer_irq:
+ if (info->physical_irq > 0)
+ free_percpu_irq(host_ptimer_irq, kvm_get_running_vcpus());
+out_free_vtimer_irq:
free_percpu_irq(host_vtimer_irq, kvm_get_running_vcpus());
return err;
}
@@ -1203,44 +1482,56 @@ void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu)
static bool timer_irqs_are_valid(struct kvm_vcpu *vcpu)
{
- int vtimer_irq, ptimer_irq, ret;
- unsigned long i;
+ u32 ppis = 0;
+ bool valid;
- vtimer_irq = vcpu_vtimer(vcpu)->irq.irq;
- ret = kvm_vgic_set_owner(vcpu, vtimer_irq, vcpu_vtimer(vcpu));
- if (ret)
- return false;
+ mutex_lock(&vcpu->kvm->arch.config_lock);
- ptimer_irq = vcpu_ptimer(vcpu)->irq.irq;
- ret = kvm_vgic_set_owner(vcpu, ptimer_irq, vcpu_ptimer(vcpu));
- if (ret)
- return false;
+ for (int i = 0; i < nr_timers(vcpu); i++) {
+ struct arch_timer_context *ctx;
+ int irq;
- kvm_for_each_vcpu(i, vcpu, vcpu->kvm) {
- if (vcpu_vtimer(vcpu)->irq.irq != vtimer_irq ||
- vcpu_ptimer(vcpu)->irq.irq != ptimer_irq)
- return false;
+ ctx = vcpu_get_timer(vcpu, i);
+ irq = timer_irq(ctx);
+ if (kvm_vgic_set_owner(vcpu, irq, ctx))
+ break;
+
+ /*
+ * We know by construction that we only have PPIs, so
+ * all values are less than 32.
+ */
+ ppis |= BIT(irq);
}
- return true;
+ valid = hweight32(ppis) == nr_timers(vcpu);
+
+ if (valid)
+ set_bit(KVM_ARCH_FLAG_TIMER_PPIS_IMMUTABLE, &vcpu->kvm->arch.flags);
+
+ mutex_unlock(&vcpu->kvm->arch.config_lock);
+
+ return valid;
}
-bool kvm_arch_timer_get_input_level(int vintid)
+static bool kvm_arch_timer_get_input_level(int vintid)
{
struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
- struct arch_timer_context *timer;
if (WARN(!vcpu, "No vcpu context!\n"))
return false;
- if (vintid == vcpu_vtimer(vcpu)->irq.irq)
- timer = vcpu_vtimer(vcpu);
- else if (vintid == vcpu_ptimer(vcpu)->irq.irq)
- timer = vcpu_ptimer(vcpu);
- else
- BUG();
+ for (int i = 0; i < nr_timers(vcpu); i++) {
+ struct arch_timer_context *ctx;
+
+ ctx = vcpu_get_timer(vcpu, i);
+ if (timer_irq(ctx) == vintid)
+ return kvm_timer_should_fire(ctx);
+ }
+
+ /* A timer IRQ has fired, but no matching timer was found? */
+ WARN_RATELIMIT(1, "timer INTID%d unknown\n", vintid);
- return kvm_timer_should_fire(timer);
+ return false;
}
int kvm_timer_enable(struct kvm_vcpu *vcpu)
@@ -1269,7 +1560,7 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu)
ret = kvm_vgic_map_phys_irq(vcpu,
map.direct_vtimer->host_timer_irq,
- map.direct_vtimer->irq.irq,
+ timer_irq(map.direct_vtimer),
&arch_timer_irq_ops);
if (ret)
return ret;
@@ -1277,7 +1568,7 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu)
if (map.direct_ptimer) {
ret = kvm_vgic_map_phys_irq(vcpu,
map.direct_ptimer->host_timer_irq,
- map.direct_ptimer->irq.irq,
+ timer_irq(map.direct_ptimer),
&arch_timer_irq_ops);
}
@@ -1289,45 +1580,17 @@ no_vgic:
return 0;
}
-/*
- * On VHE system, we only need to configure the EL2 timer trap register once,
- * not for every world switch.
- * The host kernel runs at EL2 with HCR_EL2.TGE == 1,
- * and this makes those bits have no effect for the host kernel execution.
- */
+/* If we have CNTPOFF, permanently set ECV to enable it */
void kvm_timer_init_vhe(void)
{
- /* When HCR_EL2.E2H ==1, EL1PCEN and EL1PCTEN are shifted by 10 */
- u32 cnthctl_shift = 10;
- u64 val;
-
- /*
- * VHE systems allow the guest direct access to the EL1 physical
- * timer/counter.
- */
- val = read_sysreg(cnthctl_el2);
- val |= (CNTHCTL_EL1PCEN << cnthctl_shift);
- val |= (CNTHCTL_EL1PCTEN << cnthctl_shift);
- write_sysreg(val, cnthctl_el2);
-}
-
-static void set_timer_irqs(struct kvm *kvm, int vtimer_irq, int ptimer_irq)
-{
- struct kvm_vcpu *vcpu;
- unsigned long i;
-
- kvm_for_each_vcpu(i, vcpu, kvm) {
- vcpu_vtimer(vcpu)->irq.irq = vtimer_irq;
- vcpu_ptimer(vcpu)->irq.irq = ptimer_irq;
- }
+ if (cpus_have_final_cap(ARM64_HAS_ECV_CNTPOFF))
+ sysreg_clear_set(cnthctl_el2, 0, CNTHCTL_ECV);
}
int kvm_arm_timer_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
{
int __user *uaddr = (int __user *)(long)attr->addr;
- struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
- struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
- int irq;
+ int irq, idx, ret = 0;
if (!irqchip_in_kernel(vcpu->kvm))
return -EINVAL;
@@ -1338,21 +1601,42 @@ int kvm_arm_timer_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
if (!(irq_is_ppi(irq)))
return -EINVAL;
- if (vcpu->arch.timer_cpu.enabled)
- return -EBUSY;
+ mutex_lock(&vcpu->kvm->arch.config_lock);
+
+ if (test_bit(KVM_ARCH_FLAG_TIMER_PPIS_IMMUTABLE,
+ &vcpu->kvm->arch.flags)) {
+ ret = -EBUSY;
+ goto out;
+ }
switch (attr->attr) {
case KVM_ARM_VCPU_TIMER_IRQ_VTIMER:
- set_timer_irqs(vcpu->kvm, irq, ptimer->irq.irq);
+ idx = TIMER_VTIMER;
break;
case KVM_ARM_VCPU_TIMER_IRQ_PTIMER:
- set_timer_irqs(vcpu->kvm, vtimer->irq.irq, irq);
+ idx = TIMER_PTIMER;
+ break;
+ case KVM_ARM_VCPU_TIMER_IRQ_HVTIMER:
+ idx = TIMER_HVTIMER;
+ break;
+ case KVM_ARM_VCPU_TIMER_IRQ_HPTIMER:
+ idx = TIMER_HPTIMER;
break;
default:
- return -ENXIO;
+ ret = -ENXIO;
+ goto out;
}
- return 0;
+ /*
+ * We cannot validate the IRQ unicity before we run, so take it at
+ * face value. The verdict will be given on first vcpu run, for each
+ * vcpu. Yes this is late. Blame it on the stupid API.
+ */
+ vcpu->kvm->arch.timer_data.ppi[idx] = irq;
+
+out:
+ mutex_unlock(&vcpu->kvm->arch.config_lock);
+ return ret;
}
int kvm_arm_timer_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
@@ -1368,11 +1652,17 @@ int kvm_arm_timer_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
case KVM_ARM_VCPU_TIMER_IRQ_PTIMER:
timer = vcpu_ptimer(vcpu);
break;
+ case KVM_ARM_VCPU_TIMER_IRQ_HVTIMER:
+ timer = vcpu_hvtimer(vcpu);
+ break;
+ case KVM_ARM_VCPU_TIMER_IRQ_HPTIMER:
+ timer = vcpu_hptimer(vcpu);
+ break;
default:
return -ENXIO;
}
- irq = timer->irq.irq;
+ irq = timer_irq(timer);
return put_user(irq, uaddr);
}
@@ -1381,8 +1671,42 @@ int kvm_arm_timer_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
switch (attr->attr) {
case KVM_ARM_VCPU_TIMER_IRQ_VTIMER:
case KVM_ARM_VCPU_TIMER_IRQ_PTIMER:
+ case KVM_ARM_VCPU_TIMER_IRQ_HVTIMER:
+ case KVM_ARM_VCPU_TIMER_IRQ_HPTIMER:
return 0;
}
return -ENXIO;
}
+
+int kvm_vm_ioctl_set_counter_offset(struct kvm *kvm,
+ struct kvm_arm_counter_offset *offset)
+{
+ int ret = 0;
+
+ if (offset->reserved)
+ return -EINVAL;
+
+ mutex_lock(&kvm->lock);
+
+ if (!kvm_trylock_all_vcpus(kvm)) {
+ set_bit(KVM_ARCH_FLAG_VM_COUNTER_OFFSET, &kvm->arch.flags);
+
+ /*
+ * If userspace decides to set the offset using this
+ * API rather than merely restoring the counter
+ * values, the offset applies to both the virtual and
+ * physical views.
+ */
+ kvm->arch.timer_data.voffset = offset->counter_offset;
+ kvm->arch.timer_data.poffset = offset->counter_offset;
+
+ kvm_unlock_all_vcpus(kvm);
+ } else {
+ ret = -EBUSY;
+ }
+
+ mutex_unlock(&kvm->lock);
+
+ return ret;
+}
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 9c5573bc4614..4f80da0c0d1d 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -6,7 +6,6 @@
#include <linux/bug.h>
#include <linux/cpu_pm.h>
-#include <linux/entry-kvm.h>
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/kvm_host.h>
@@ -16,7 +15,6 @@
#include <linux/fs.h>
#include <linux/mman.h>
#include <linux/sched.h>
-#include <linux/kmemleak.h>
#include <linux/kvm.h>
#include <linux/kvm_irqfd.h>
#include <linux/irqbypass.h>
@@ -36,51 +34,62 @@
#include <asm/virt.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_asm.h>
+#include <asm/kvm_emulate.h>
#include <asm/kvm_mmu.h>
+#include <asm/kvm_nested.h>
#include <asm/kvm_pkvm.h>
-#include <asm/kvm_emulate.h>
+#include <asm/kvm_ptrauth.h>
#include <asm/sections.h>
#include <kvm/arm_hypercalls.h>
#include <kvm/arm_pmu.h>
#include <kvm/arm_psci.h>
+#include "sys_regs.h"
+
static enum kvm_mode kvm_mode = KVM_MODE_DEFAULT;
-DEFINE_STATIC_KEY_FALSE(kvm_protected_mode_initialized);
+
+enum kvm_wfx_trap_policy {
+ KVM_WFX_NOTRAP_SINGLE_TASK, /* Default option */
+ KVM_WFX_NOTRAP,
+ KVM_WFX_TRAP,
+};
+
+static enum kvm_wfx_trap_policy kvm_wfi_trap_policy __read_mostly = KVM_WFX_NOTRAP_SINGLE_TASK;
+static enum kvm_wfx_trap_policy kvm_wfe_trap_policy __read_mostly = KVM_WFX_NOTRAP_SINGLE_TASK;
DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector);
-DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
+DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_base);
DECLARE_KVM_NVHE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
-static bool vgic_present;
+DECLARE_KVM_NVHE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
-static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled);
-DEFINE_STATIC_KEY_FALSE(userspace_irqchip_in_use);
+static bool vgic_present, kvm_arm_initialised;
-int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
-{
- return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
-}
+static DEFINE_PER_CPU(unsigned char, kvm_hyp_initialized);
-int kvm_arch_hardware_setup(void *opaque)
+bool is_kvm_arm_initialised(void)
{
- return 0;
+ return kvm_arm_initialised;
}
-int kvm_arch_check_processor_compat(void *opaque)
+int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
{
- return 0;
+ return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
}
int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
struct kvm_enable_cap *cap)
{
- int r;
+ int r = -EINVAL;
if (cap->flags)
return -EINVAL;
+ if (kvm_vm_is_protected(kvm) && !kvm_pvm_ext_allowed(cap->cap))
+ return -EINVAL;
+
switch (cap->cap) {
case KVM_CAP_ARM_NISV_TO_USER:
r = 0;
@@ -89,9 +98,7 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
break;
case KVM_CAP_ARM_MTE:
mutex_lock(&kvm->lock);
- if (!system_supports_mte() || kvm->created_vcpus) {
- r = -EINVAL;
- } else {
+ if (system_supports_mte() && !kvm->created_vcpus) {
r = 0;
set_bit(KVM_ARCH_FLAG_MTE_ENABLED, &kvm->arch.flags);
}
@@ -101,8 +108,35 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
r = 0;
set_bit(KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED, &kvm->arch.flags);
break;
+ case KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE:
+ mutex_lock(&kvm->slots_lock);
+ /*
+ * To keep things simple, allow changing the chunk
+ * size only when no memory slots have been created.
+ */
+ if (kvm_are_all_memslots_empty(kvm)) {
+ u64 new_cap = cap->args[0];
+
+ if (!new_cap || kvm_is_block_size_supported(new_cap)) {
+ r = 0;
+ kvm->arch.mmu.split_page_chunk_size = new_cap;
+ }
+ }
+ mutex_unlock(&kvm->slots_lock);
+ break;
+ case KVM_CAP_ARM_WRITABLE_IMP_ID_REGS:
+ mutex_lock(&kvm->lock);
+ if (!kvm->created_vcpus) {
+ r = 0;
+ set_bit(KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS, &kvm->arch.flags);
+ }
+ mutex_unlock(&kvm->lock);
+ break;
+ case KVM_CAP_ARM_SEA_TO_USER:
+ r = 0;
+ set_bit(KVM_ARCH_FLAG_EXIT_SEA, &kvm->arch.flags);
+ break;
default:
- r = -EINVAL;
break;
}
@@ -114,39 +148,32 @@ static int kvm_arm_default_max_vcpus(void)
return vgic_present ? kvm_vgic_get_max_vcpus() : KVM_MAX_VCPUS;
}
-static void set_default_spectre(struct kvm *kvm)
-{
- /*
- * The default is to expose CSV2 == 1 if the HW isn't affected.
- * Although this is a per-CPU feature, we make it global because
- * asymmetric systems are just a nuisance.
- *
- * Userspace can override this as long as it doesn't promise
- * the impossible.
- */
- if (arm64_get_spectre_v2_state() == SPECTRE_UNAFFECTED)
- kvm->arch.pfr0_csv2 = 1;
- if (arm64_get_meltdown_state() == SPECTRE_UNAFFECTED)
- kvm->arch.pfr0_csv3 = 1;
-}
-
/**
* kvm_arch_init_vm - initializes a VM data structure
* @kvm: pointer to the KVM struct
+ * @type: kvm device type
*/
int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
{
int ret;
+ mutex_init(&kvm->arch.config_lock);
+
+#ifdef CONFIG_LOCKDEP
+ /* Clue in lockdep that the config_lock must be taken inside kvm->lock */
+ mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.config_lock);
+ mutex_unlock(&kvm->arch.config_lock);
+ mutex_unlock(&kvm->lock);
+#endif
+
+ kvm_init_nested(kvm);
+
ret = kvm_share_hyp(kvm, kvm + 1);
if (ret)
return ret;
- ret = pkvm_init_host_vm(kvm);
- if (ret)
- goto err_unshare_kvm;
-
- if (!zalloc_cpumask_var(&kvm->arch.supported_cpus, GFP_KERNEL)) {
+ if (!zalloc_cpumask_var(&kvm->arch.supported_cpus, GFP_KERNEL_ACCOUNT)) {
ret = -ENOMEM;
goto err_unshare_kvm;
}
@@ -156,19 +183,26 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
if (ret)
goto err_free_cpumask;
+ if (is_protected_kvm_enabled()) {
+ /*
+ * If any failures occur after this is successful, make sure to
+ * call __pkvm_unreserve_vm to unreserve the VM in hyp.
+ */
+ ret = pkvm_init_host_vm(kvm);
+ if (ret)
+ goto err_free_cpumask;
+ }
+
kvm_vgic_early_init(kvm);
+ kvm_timer_init_vm(kvm);
+
/* The maximum number of VCPUs is limited by the host's GIC model */
kvm->max_vcpus = kvm_arm_default_max_vcpus();
- set_default_spectre(kvm);
kvm_arm_init_hypercalls(kvm);
- /*
- * Initialise the default PMUver before there is a chance to
- * create an actual PMU.
- */
- kvm->arch.dfr0_pmuver.imp = kvm_arm_pmu_get_pmuver_limit();
+ bitmap_zero(kvm->arch.vcpu_features, KVM_VCPU_MAX_FEATURES);
return 0;
@@ -184,6 +218,28 @@ vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
return VM_FAULT_SIGBUS;
}
+void kvm_arch_create_vm_debugfs(struct kvm *kvm)
+{
+ kvm_sys_regs_create_debugfs(kvm);
+ kvm_s2_ptdump_create_debugfs(kvm);
+}
+
+static void kvm_destroy_mpidr_data(struct kvm *kvm)
+{
+ struct kvm_mpidr_data *data;
+
+ mutex_lock(&kvm->arch.config_lock);
+
+ data = rcu_dereference_protected(kvm->arch.mpidr_data,
+ lockdep_is_held(&kvm->arch.config_lock));
+ if (data) {
+ rcu_assign_pointer(kvm->arch.mpidr_data, NULL);
+ synchronize_rcu();
+ kfree(data);
+ }
+
+ mutex_unlock(&kvm->arch.config_lock);
+}
/**
* kvm_arch_destroy_vm - destroy the VM data structure
@@ -199,20 +255,62 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
if (is_protected_kvm_enabled())
pkvm_destroy_hyp_vm(kvm);
+ kvm_destroy_mpidr_data(kvm);
+
+ kfree(kvm->arch.sysreg_masks);
kvm_destroy_vcpus(kvm);
kvm_unshare_hyp(kvm, kvm + 1);
+
+ kvm_arm_teardown_hypercalls(kvm);
+}
+
+static bool kvm_has_full_ptr_auth(void)
+{
+ bool apa, gpa, api, gpi, apa3, gpa3;
+ u64 isar1, isar2, val;
+
+ /*
+ * Check that:
+ *
+ * - both Address and Generic auth are implemented for a given
+ * algorithm (Q5, IMPDEF or Q3)
+ * - only a single algorithm is implemented.
+ */
+ if (!system_has_full_ptr_auth())
+ return false;
+
+ isar1 = read_sanitised_ftr_reg(SYS_ID_AA64ISAR1_EL1);
+ isar2 = read_sanitised_ftr_reg(SYS_ID_AA64ISAR2_EL1);
+
+ apa = !!FIELD_GET(ID_AA64ISAR1_EL1_APA_MASK, isar1);
+ val = FIELD_GET(ID_AA64ISAR1_EL1_GPA_MASK, isar1);
+ gpa = (val == ID_AA64ISAR1_EL1_GPA_IMP);
+
+ api = !!FIELD_GET(ID_AA64ISAR1_EL1_API_MASK, isar1);
+ val = FIELD_GET(ID_AA64ISAR1_EL1_GPI_MASK, isar1);
+ gpi = (val == ID_AA64ISAR1_EL1_GPI_IMP);
+
+ apa3 = !!FIELD_GET(ID_AA64ISAR2_EL1_APA3_MASK, isar2);
+ val = FIELD_GET(ID_AA64ISAR2_EL1_GPA3_MASK, isar2);
+ gpa3 = (val == ID_AA64ISAR2_EL1_GPA3_IMP);
+
+ return (apa == gpa && api == gpi && apa3 == gpa3 &&
+ (apa + api + apa3) == 1);
}
int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
{
int r;
+
+ if (kvm && kvm_vm_is_protected(kvm) && !kvm_pvm_ext_allowed(ext))
+ return 0;
+
switch (ext) {
case KVM_CAP_IRQCHIP:
r = vgic_present;
break;
case KVM_CAP_IOEVENTFD:
- case KVM_CAP_DEVICE_CTRL:
case KVM_CAP_USER_MEMORY:
case KVM_CAP_SYNC_MMU:
case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
@@ -230,6 +328,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_VCPU_ATTRIBUTES:
case KVM_CAP_PTP_KVM:
case KVM_CAP_ARM_SYSTEM_SUSPEND:
+ case KVM_CAP_IRQFD_RESAMPLE:
+ case KVM_CAP_COUNTER_OFFSET:
+ case KVM_CAP_ARM_WRITABLE_IMP_ID_REGS:
+ case KVM_CAP_ARM_SEA_TO_USER:
r = 1;
break;
case KVM_CAP_SET_GUEST_DEBUG2:
@@ -274,7 +376,13 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = kvm_arm_pvtime_supported();
break;
case KVM_CAP_ARM_EL1_32BIT:
- r = cpus_have_const_cap(ARM64_HAS_32BIT_EL1);
+ r = cpus_have_final_cap(ARM64_HAS_32BIT_EL1);
+ break;
+ case KVM_CAP_ARM_EL2:
+ r = cpus_have_final_cap(ARM64_HAS_NESTED_VIRT);
+ break;
+ case KVM_CAP_ARM_EL2_E2H0:
+ r = cpus_have_final_cap(ARM64_HAS_HCR_NV1);
break;
case KVM_CAP_GUEST_DEBUG_HW_BPS:
r = get_num_brps();
@@ -283,10 +391,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = get_num_wrps();
break;
case KVM_CAP_ARM_PMU_V3:
- r = kvm_arm_support_pmu_v3();
+ r = kvm_supports_guest_pmuv3();
break;
case KVM_CAP_ARM_INJECT_SERROR_ESR:
- r = cpus_have_const_cap(ARM64_HAS_RAS_EXTN);
+ r = cpus_have_final_cap(ARM64_HAS_RAS_EXTN);
break;
case KVM_CAP_ARM_VM_IPA_SIZE:
r = get_kvm_ipa_limit();
@@ -296,8 +404,27 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
break;
case KVM_CAP_ARM_PTRAUTH_ADDRESS:
case KVM_CAP_ARM_PTRAUTH_GENERIC:
- r = system_has_full_ptr_auth();
+ r = kvm_has_full_ptr_auth();
break;
+ case KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE:
+ if (kvm)
+ r = kvm->arch.mmu.split_page_chunk_size;
+ else
+ r = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT;
+ break;
+ case KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES:
+ r = kvm_supported_block_sizes();
+ break;
+ case KVM_CAP_ARM_SUPPORTED_REG_MASK_RANGES:
+ r = BIT(0);
+ break;
+ case KVM_CAP_ARM_CACHEABLE_PFNMAP_SUPPORTED:
+ if (!kvm)
+ r = -EINVAL;
+ else
+ r = kvm_supports_cacheable_pfnmap();
+ break;
+
default:
r = 0;
}
@@ -318,7 +445,7 @@ struct kvm *kvm_arch_alloc_vm(void)
if (!has_vhe())
return kzalloc(sz, GFP_KERNEL_ACCOUNT);
- return __vmalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM | __GFP_ZERO);
+ return kvzalloc(sz, GFP_KERNEL_ACCOUNT);
}
int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
@@ -336,34 +463,46 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
{
int err;
+ spin_lock_init(&vcpu->arch.mp_state_lock);
+
+#ifdef CONFIG_LOCKDEP
+ /* Inform lockdep that the config_lock is acquired after vcpu->mutex */
+ mutex_lock(&vcpu->mutex);
+ mutex_lock(&vcpu->kvm->arch.config_lock);
+ mutex_unlock(&vcpu->kvm->arch.config_lock);
+ mutex_unlock(&vcpu->mutex);
+#endif
+
/* Force users to call KVM_ARM_VCPU_INIT */
- vcpu->arch.target = -1;
- bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES);
+ vcpu_clear_flag(vcpu, VCPU_INITIALIZED);
vcpu->arch.mmu_page_cache.gfp_zero = __GFP_ZERO;
- /*
- * Default value for the FP state, will be overloaded at load
- * time if we support FP (pretty likely)
- */
- vcpu->arch.fp_state = FP_STATE_FREE;
-
/* Set up the timer */
kvm_timer_vcpu_init(vcpu);
kvm_pmu_vcpu_init(vcpu);
- kvm_arm_reset_debug_ptr(vcpu);
-
kvm_arm_pvtime_vcpu_init(&vcpu->arch);
vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu;
+ /*
+ * This vCPU may have been created after mpidr_data was initialized.
+ * Throw out the pre-computed mappings if that is the case which forces
+ * KVM to fall back to iteratively searching the vCPUs.
+ */
+ kvm_destroy_mpidr_data(vcpu->kvm);
+
err = kvm_vgic_vcpu_init(vcpu);
if (err)
return err;
- return kvm_share_hyp(vcpu, vcpu + 1);
+ err = kvm_share_hyp(vcpu, vcpu + 1);
+ if (err)
+ kvm_vgic_vcpu_destroy(vcpu);
+
+ return err;
}
void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
@@ -372,13 +511,13 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
{
- if (vcpu_has_run_once(vcpu) && unlikely(!irqchip_in_kernel(vcpu->kvm)))
- static_branch_dec(&userspace_irqchip_in_use);
-
- kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
+ if (!is_protected_kvm_enabled())
+ kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
+ else
+ free_hyp_memcache(&vcpu->arch.pkvm_memcache);
kvm_timer_vcpu_terminate(vcpu);
kvm_pmu_vcpu_destroy(vcpu);
-
+ kvm_vgic_vcpu_destroy(vcpu);
kvm_arm_vcpu_destroy(vcpu);
}
@@ -392,15 +531,81 @@ void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu)
}
+static void vcpu_set_pauth_traps(struct kvm_vcpu *vcpu)
+{
+ if (vcpu_has_ptrauth(vcpu) && !is_protected_kvm_enabled()) {
+ /*
+ * Either we're running an L2 guest, and the API/APK bits come
+ * from L1's HCR_EL2, or API/APK are both set.
+ */
+ if (unlikely(is_nested_ctxt(vcpu))) {
+ u64 val;
+
+ val = __vcpu_sys_reg(vcpu, HCR_EL2);
+ val &= (HCR_API | HCR_APK);
+ vcpu->arch.hcr_el2 &= ~(HCR_API | HCR_APK);
+ vcpu->arch.hcr_el2 |= val;
+ } else {
+ vcpu->arch.hcr_el2 |= (HCR_API | HCR_APK);
+ }
+
+ /*
+ * Save the host keys if there is any chance for the guest
+ * to use pauth, as the entry code will reload the guest
+ * keys in that case.
+ */
+ if (vcpu->arch.hcr_el2 & (HCR_API | HCR_APK)) {
+ struct kvm_cpu_context *ctxt;
+
+ ctxt = this_cpu_ptr_hyp_sym(kvm_hyp_ctxt);
+ ptrauth_save_keys(ctxt);
+ }
+ }
+}
+
+static bool kvm_vcpu_should_clear_twi(struct kvm_vcpu *vcpu)
+{
+ if (unlikely(kvm_wfi_trap_policy != KVM_WFX_NOTRAP_SINGLE_TASK))
+ return kvm_wfi_trap_policy == KVM_WFX_NOTRAP;
+
+ return single_task_running() &&
+ (atomic_read(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vlpi_count) ||
+ vcpu->kvm->arch.vgic.nassgireq);
+}
+
+static bool kvm_vcpu_should_clear_twe(struct kvm_vcpu *vcpu)
+{
+ if (unlikely(kvm_wfe_trap_policy != KVM_WFX_NOTRAP_SINGLE_TASK))
+ return kvm_wfe_trap_policy == KVM_WFX_NOTRAP;
+
+ return single_task_running();
+}
+
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
struct kvm_s2_mmu *mmu;
int *last_ran;
+ if (is_protected_kvm_enabled())
+ goto nommu;
+
+ if (vcpu_has_nv(vcpu))
+ kvm_vcpu_load_hw_mmu(vcpu);
+
mmu = vcpu->arch.hw_mmu;
last_ran = this_cpu_ptr(mmu->last_vcpu_ran);
/*
+ * Ensure a VMID is allocated for the MMU before programming VTTBR_EL2,
+ * which happens eagerly in VHE.
+ *
+ * Also, the VMID allocator only preserves VMIDs that are active at the
+ * time of rollover, so KVM might need to grab a new VMID for the MMU if
+ * this is called from kvm_sched_in().
+ */
+ kvm_arm_vmid_update(&mmu->vmid);
+
+ /*
* We guarantee that both TLBs and I-cache are private to each
* vcpu. If detecting that a vcpu from the same VM has
* previously run on the same physical CPU, call into the
@@ -409,78 +614,110 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
* We might get preempted before the vCPU actually runs, but
* over-invalidation doesn't affect correctness.
*/
- if (*last_ran != vcpu->vcpu_id) {
+ if (*last_ran != vcpu->vcpu_idx) {
kvm_call_hyp(__kvm_flush_cpu_context, mmu);
- *last_ran = vcpu->vcpu_id;
+ *last_ran = vcpu->vcpu_idx;
}
+nommu:
vcpu->cpu = cpu;
- kvm_vgic_load(vcpu);
+ /*
+ * The timer must be loaded before the vgic to correctly set up physical
+ * interrupt deactivation in nested state (e.g. timer interrupt).
+ */
kvm_timer_vcpu_load(vcpu);
+ kvm_vgic_load(vcpu);
+ kvm_vcpu_load_debug(vcpu);
+ kvm_vcpu_load_fgt(vcpu);
if (has_vhe())
- kvm_vcpu_load_sysregs_vhe(vcpu);
+ kvm_vcpu_load_vhe(vcpu);
kvm_arch_vcpu_load_fp(vcpu);
kvm_vcpu_pmu_restore_guest(vcpu);
if (kvm_arm_is_pvtime_enabled(&vcpu->arch))
kvm_make_request(KVM_REQ_RECORD_STEAL, vcpu);
- if (single_task_running())
- vcpu_clear_wfx_traps(vcpu);
+ if (kvm_vcpu_should_clear_twe(vcpu))
+ vcpu->arch.hcr_el2 &= ~HCR_TWE;
else
- vcpu_set_wfx_traps(vcpu);
+ vcpu->arch.hcr_el2 |= HCR_TWE;
- if (vcpu_has_ptrauth(vcpu))
- vcpu_ptrauth_disable(vcpu);
- kvm_arch_vcpu_load_debug_state_flags(vcpu);
+ if (kvm_vcpu_should_clear_twi(vcpu))
+ vcpu->arch.hcr_el2 &= ~HCR_TWI;
+ else
+ vcpu->arch.hcr_el2 |= HCR_TWI;
+
+ vcpu_set_pauth_traps(vcpu);
+
+ if (is_protected_kvm_enabled()) {
+ kvm_call_hyp_nvhe(__pkvm_vcpu_load,
+ vcpu->kvm->arch.pkvm.handle,
+ vcpu->vcpu_idx, vcpu->arch.hcr_el2);
+ kvm_call_hyp(__vgic_v3_restore_vmcr_aprs,
+ &vcpu->arch.vgic_cpu.vgic_v3);
+ }
- if (!cpumask_test_cpu(smp_processor_id(), vcpu->kvm->arch.supported_cpus))
+ if (!cpumask_test_cpu(cpu, vcpu->kvm->arch.supported_cpus))
vcpu_set_on_unsupported_cpu(vcpu);
}
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
- kvm_arch_vcpu_put_debug_state_flags(vcpu);
+ if (is_protected_kvm_enabled()) {
+ kvm_call_hyp(__vgic_v3_save_aprs, &vcpu->arch.vgic_cpu.vgic_v3);
+ kvm_call_hyp_nvhe(__pkvm_vcpu_put);
+ }
+
+ kvm_vcpu_put_debug(vcpu);
kvm_arch_vcpu_put_fp(vcpu);
if (has_vhe())
- kvm_vcpu_put_sysregs_vhe(vcpu);
+ kvm_vcpu_put_vhe(vcpu);
kvm_timer_vcpu_put(vcpu);
kvm_vgic_put(vcpu);
kvm_vcpu_pmu_restore_host(vcpu);
+ if (vcpu_has_nv(vcpu))
+ kvm_vcpu_put_hw_mmu(vcpu);
kvm_arm_vmid_clear_active();
vcpu_clear_on_unsupported_cpu(vcpu);
vcpu->cpu = -1;
}
-void kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu)
+static void __kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu)
{
- vcpu->arch.mp_state.mp_state = KVM_MP_STATE_STOPPED;
+ WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_STOPPED);
kvm_make_request(KVM_REQ_SLEEP, vcpu);
kvm_vcpu_kick(vcpu);
}
+void kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu)
+{
+ spin_lock(&vcpu->arch.mp_state_lock);
+ __kvm_arm_vcpu_power_off(vcpu);
+ spin_unlock(&vcpu->arch.mp_state_lock);
+}
+
bool kvm_arm_vcpu_stopped(struct kvm_vcpu *vcpu)
{
- return vcpu->arch.mp_state.mp_state == KVM_MP_STATE_STOPPED;
+ return READ_ONCE(vcpu->arch.mp_state.mp_state) == KVM_MP_STATE_STOPPED;
}
static void kvm_arm_vcpu_suspend(struct kvm_vcpu *vcpu)
{
- vcpu->arch.mp_state.mp_state = KVM_MP_STATE_SUSPENDED;
+ WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_SUSPENDED);
kvm_make_request(KVM_REQ_SUSPEND, vcpu);
kvm_vcpu_kick(vcpu);
}
static bool kvm_arm_vcpu_suspended(struct kvm_vcpu *vcpu)
{
- return vcpu->arch.mp_state.mp_state == KVM_MP_STATE_SUSPENDED;
+ return READ_ONCE(vcpu->arch.mp_state.mp_state) == KVM_MP_STATE_SUSPENDED;
}
int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
struct kvm_mp_state *mp_state)
{
- *mp_state = vcpu->arch.mp_state;
+ *mp_state = READ_ONCE(vcpu->arch.mp_state);
return 0;
}
@@ -490,12 +727,14 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
{
int ret = 0;
+ spin_lock(&vcpu->arch.mp_state_lock);
+
switch (mp_state->mp_state) {
case KVM_MP_STATE_RUNNABLE:
- vcpu->arch.mp_state = *mp_state;
+ WRITE_ONCE(vcpu->arch.mp_state, *mp_state);
break;
case KVM_MP_STATE_STOPPED:
- kvm_arm_vcpu_power_off(vcpu);
+ __kvm_arm_vcpu_power_off(vcpu);
break;
case KVM_MP_STATE_SUSPENDED:
kvm_arm_vcpu_suspend(vcpu);
@@ -504,6 +743,8 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
ret = -EINVAL;
}
+ spin_unlock(&vcpu->arch.mp_state_lock);
+
return ret;
}
@@ -516,7 +757,8 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
*/
int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
{
- bool irq_lines = *vcpu_hcr(v) & (HCR_VI | HCR_VF);
+ bool irq_lines = *vcpu_hcr(v) & (HCR_VI | HCR_VF | HCR_VSE);
+
return ((irq_lines || kvm_vgic_vcpu_pending_irq(v))
&& !kvm_arm_vcpu_stopped(v) && !v->arch.pause);
}
@@ -533,9 +775,56 @@ unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu)
}
#endif
-static int kvm_vcpu_initialized(struct kvm_vcpu *vcpu)
+static void kvm_init_mpidr_data(struct kvm *kvm)
{
- return vcpu->arch.target >= 0;
+ struct kvm_mpidr_data *data = NULL;
+ unsigned long c, mask, nr_entries;
+ u64 aff_set = 0, aff_clr = ~0UL;
+ struct kvm_vcpu *vcpu;
+
+ mutex_lock(&kvm->arch.config_lock);
+
+ if (rcu_access_pointer(kvm->arch.mpidr_data) ||
+ atomic_read(&kvm->online_vcpus) == 1)
+ goto out;
+
+ kvm_for_each_vcpu(c, vcpu, kvm) {
+ u64 aff = kvm_vcpu_get_mpidr_aff(vcpu);
+ aff_set |= aff;
+ aff_clr &= aff;
+ }
+
+ /*
+ * A significant bit can be either 0 or 1, and will only appear in
+ * aff_set. Use aff_clr to weed out the useless stuff.
+ */
+ mask = aff_set ^ aff_clr;
+ nr_entries = BIT_ULL(hweight_long(mask));
+
+ /*
+ * Don't let userspace fool us. If we need more than a single page
+ * to describe the compressed MPIDR array, just fall back to the
+ * iterative method. Single vcpu VMs do not need this either.
+ */
+ if (struct_size(data, cmpidr_to_idx, nr_entries) <= PAGE_SIZE)
+ data = kzalloc(struct_size(data, cmpidr_to_idx, nr_entries),
+ GFP_KERNEL_ACCOUNT);
+
+ if (!data)
+ goto out;
+
+ data->mpidr_mask = mask;
+
+ kvm_for_each_vcpu(c, vcpu, kvm) {
+ u64 aff = kvm_vcpu_get_mpidr_aff(vcpu);
+ u16 index = kvm_mpidr_index(data, aff);
+
+ data->cmpidr_to_idx[index] = c;
+ }
+
+ rcu_assign_pointer(kvm->arch.mpidr_data, data);
+out:
+ mutex_unlock(&kvm->arch.config_lock);
}
/*
@@ -554,14 +843,10 @@ int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu)
if (!kvm_arm_vcpu_is_finalized(vcpu))
return -EPERM;
- ret = kvm_arch_vcpu_run_map_fp(vcpu);
- if (ret)
- return ret;
-
if (likely(vcpu_has_run_once(vcpu)))
return 0;
- kvm_arm_vcpu_init_debug(vcpu);
+ kvm_init_mpidr_data(kvm);
if (likely(irqchip_in_kernel(kvm))) {
/*
@@ -573,39 +858,49 @@ int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu)
return ret;
}
- ret = kvm_timer_enable(vcpu);
+ ret = kvm_finalize_sys_regs(vcpu);
if (ret)
return ret;
- ret = kvm_arm_pmu_v3_enable(vcpu);
+ if (vcpu_has_nv(vcpu)) {
+ ret = kvm_vcpu_allocate_vncr_tlb(vcpu);
+ if (ret)
+ return ret;
+
+ ret = kvm_vgic_vcpu_nv_init(vcpu);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * This needs to happen after any restriction has been applied
+ * to the feature set.
+ */
+ kvm_calculate_traps(vcpu);
+
+ ret = kvm_timer_enable(vcpu);
if (ret)
return ret;
+ if (kvm_vcpu_has_pmu(vcpu)) {
+ ret = kvm_arm_pmu_v3_enable(vcpu);
+ if (ret)
+ return ret;
+ }
+
if (is_protected_kvm_enabled()) {
ret = pkvm_create_hyp_vm(kvm);
if (ret)
return ret;
- }
- if (!irqchip_in_kernel(kvm)) {
- /*
- * Tell the rest of the code that there are userspace irqchip
- * VMs in the wild.
- */
- static_branch_inc(&userspace_irqchip_in_use);
+ ret = pkvm_create_hyp_vcpu(vcpu);
+ if (ret)
+ return ret;
}
- /*
- * Initialize traps for protected VMs.
- * NOTE: Move to run in EL2 directly, rather than via a hypercall, once
- * the code is in place for first run initialization at EL2.
- */
- if (kvm_vm_is_protected(kvm))
- kvm_call_hyp_nvhe(__pkvm_vcpu_init_traps, vcpu);
-
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.config_lock);
set_bit(KVM_ARCH_FLAG_HAS_RAN_ONCE, &kvm->arch.flags);
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.config_lock);
return ret;
}
@@ -678,15 +973,16 @@ void kvm_vcpu_wfi(struct kvm_vcpu *vcpu)
* doorbells to be signalled, should an interrupt become pending.
*/
preempt_disable();
- kvm_vgic_vmcr_sync(vcpu);
- vgic_v4_put(vcpu, true);
+ vcpu_set_flag(vcpu, IN_WFI);
+ kvm_vgic_put(vcpu);
preempt_enable();
kvm_vcpu_halt(vcpu);
vcpu_clear_flag(vcpu, IN_WFIT);
preempt_disable();
- vgic_v4_load(vcpu);
+ vcpu_clear_flag(vcpu, IN_WFI);
+ kvm_vgic_load(vcpu);
preempt_enable();
}
@@ -735,6 +1031,9 @@ static int kvm_vcpu_suspend(struct kvm_vcpu *vcpu)
static int check_vcpu_requests(struct kvm_vcpu *vcpu)
{
if (kvm_request_pending(vcpu)) {
+ if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu))
+ return -EIO;
+
if (kvm_check_request(KVM_REQ_SLEEP, vcpu))
kvm_vcpu_sleep(vcpu);
@@ -747,26 +1046,34 @@ static int check_vcpu_requests(struct kvm_vcpu *vcpu)
*/
kvm_check_request(KVM_REQ_IRQ_PENDING, vcpu);
+ /* Process interrupts deactivated through a trap */
+ if (kvm_check_request(KVM_REQ_VGIC_PROCESS_UPDATE, vcpu))
+ kvm_vgic_process_async_update(vcpu);
+
if (kvm_check_request(KVM_REQ_RECORD_STEAL, vcpu))
kvm_update_stolen_time(vcpu);
if (kvm_check_request(KVM_REQ_RELOAD_GICv4, vcpu)) {
/* The distributor enable bits were changed */
preempt_disable();
- vgic_v4_put(vcpu, false);
+ vgic_v4_put(vcpu);
vgic_v4_load(vcpu);
preempt_enable();
}
if (kvm_check_request(KVM_REQ_RELOAD_PMU, vcpu))
- kvm_pmu_handle_pmcr(vcpu,
- __vcpu_sys_reg(vcpu, PMCR_EL0));
+ kvm_vcpu_reload_pmu(vcpu);
+
+ if (kvm_check_request(KVM_REQ_RESYNC_PMU_EL0, vcpu))
+ kvm_vcpu_pmu_restore_guest(vcpu);
if (kvm_check_request(KVM_REQ_SUSPEND, vcpu))
return kvm_vcpu_suspend(vcpu);
if (kvm_dirty_ring_check_request(vcpu))
return 0;
+
+ check_nested_vcpu_requests(vcpu);
}
return 1;
@@ -777,6 +1084,9 @@ static bool vcpu_mode_is_bad_32bit(struct kvm_vcpu *vcpu)
if (likely(!vcpu_mode_is_32bit(vcpu)))
return false;
+ if (vcpu_has_nv(vcpu))
+ return true;
+
return !kvm_supports_32bit_el0();
}
@@ -805,7 +1115,7 @@ static bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu, int *ret)
* state gets updated in kvm_timer_update_run and
* kvm_pmu_update_run below).
*/
- if (static_branch_unlikely(&userspace_irqchip_in_use)) {
+ if (unlikely(!irqchip_in_kernel(vcpu->kvm))) {
if (kvm_timer_should_notify_user(vcpu) ||
kvm_pmu_should_notify_user(vcpu)) {
*ret = -EINTR;
@@ -861,13 +1171,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
if (run->exit_reason == KVM_EXIT_MMIO) {
ret = kvm_handle_mmio_return(vcpu);
- if (ret)
+ if (ret <= 0)
return ret;
}
vcpu_load(vcpu);
- if (run->immediate_exit) {
+ if (!vcpu->wants_to_run) {
ret = -EINTR;
goto out;
}
@@ -881,7 +1191,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
/*
* Check conditions before entering the guest
*/
- ret = xfer_to_guest_mode_handle_work(vcpu);
+ ret = kvm_xfer_to_guest_mode_handle_work(vcpu);
if (!ret)
ret = 1;
@@ -895,16 +1205,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
*/
preempt_disable();
- /*
- * The VMID allocator only tracks active VMIDs per
- * physical CPU, and therefore the VMID allocated may not be
- * preserved on VMID roll-over if the task was preempted,
- * making a thread's VMID inactive. So we need to call
- * kvm_arm_vmid_update() in non-premptible context.
- */
- kvm_arm_vmid_update(&vcpu->arch.hw_mmu->vmid);
+ kvm_nested_flush_hwstate(vcpu);
- kvm_pmu_flush_hwstate(vcpu);
+ if (kvm_vcpu_has_pmu(vcpu))
+ kvm_pmu_flush_hwstate(vcpu);
local_irq_disable();
@@ -923,8 +1227,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
if (ret <= 0 || kvm_vcpu_exit_request(vcpu, &ret)) {
vcpu->mode = OUTSIDE_GUEST_MODE;
isb(); /* Ensure work in x_flush_hwstate is committed */
- kvm_pmu_sync_hwstate(vcpu);
- if (static_branch_unlikely(&userspace_irqchip_in_use))
+ if (kvm_vcpu_has_pmu(vcpu))
+ kvm_pmu_sync_hwstate(vcpu);
+ if (unlikely(!irqchip_in_kernel(vcpu->kvm)))
kvm_timer_sync_user(vcpu);
kvm_vgic_sync_hwstate(vcpu);
local_irq_enable();
@@ -932,7 +1237,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
continue;
}
- kvm_arm_setup_debug(vcpu);
kvm_arch_vcpu_ctxflush_fp(vcpu);
/**************************************************************
@@ -949,14 +1253,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
* Back from guest
*************************************************************/
- kvm_arm_clear_debug(vcpu);
-
/*
* We must sync the PMU state before the vgic state so
* that the vgic can properly sample the updated state of the
* interrupt line.
*/
- kvm_pmu_sync_hwstate(vcpu);
+ if (kvm_vcpu_has_pmu(vcpu))
+ kvm_pmu_sync_hwstate(vcpu);
/*
* Sync the vgic state before syncing the timer state because
@@ -970,9 +1273,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
* we don't want vtimer interrupts to race with syncing the
* timer virtual interrupt state.
*/
- if (static_branch_unlikely(&userspace_irqchip_in_use))
+ if (unlikely(!irqchip_in_kernel(vcpu->kvm)))
kvm_timer_sync_user(vcpu);
+ if (is_hyp_ctxt(vcpu))
+ kvm_timer_sync_nested(vcpu);
+
kvm_arch_vcpu_ctxsync_fp(vcpu);
/*
@@ -1000,6 +1306,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
/* Exit types that need handling before we can be preempted */
handle_exit_early(vcpu, ret);
+ kvm_nested_sync_hwstate(vcpu);
+
preempt_enable();
/*
@@ -1017,7 +1325,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
* invalid. The VMM can try and fix it by issuing a
* KVM_ARM_VCPU_INIT if it really wants to.
*/
- vcpu->arch.target = -1;
+ vcpu_clear_flag(vcpu, VCPU_INITIALIZED);
ret = ARM_EXCEPTION_IL;
}
@@ -1086,27 +1394,23 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
bool line_status)
{
u32 irq = irq_level->irq;
- unsigned int irq_type, vcpu_idx, irq_num;
- int nrcpus = atomic_read(&kvm->online_vcpus);
+ unsigned int irq_type, vcpu_id, irq_num;
struct kvm_vcpu *vcpu = NULL;
bool level = irq_level->level;
irq_type = (irq >> KVM_ARM_IRQ_TYPE_SHIFT) & KVM_ARM_IRQ_TYPE_MASK;
- vcpu_idx = (irq >> KVM_ARM_IRQ_VCPU_SHIFT) & KVM_ARM_IRQ_VCPU_MASK;
- vcpu_idx += ((irq >> KVM_ARM_IRQ_VCPU2_SHIFT) & KVM_ARM_IRQ_VCPU2_MASK) * (KVM_ARM_IRQ_VCPU_MASK + 1);
+ vcpu_id = (irq >> KVM_ARM_IRQ_VCPU_SHIFT) & KVM_ARM_IRQ_VCPU_MASK;
+ vcpu_id += ((irq >> KVM_ARM_IRQ_VCPU2_SHIFT) & KVM_ARM_IRQ_VCPU2_MASK) * (KVM_ARM_IRQ_VCPU_MASK + 1);
irq_num = (irq >> KVM_ARM_IRQ_NUM_SHIFT) & KVM_ARM_IRQ_NUM_MASK;
- trace_kvm_irq_line(irq_type, vcpu_idx, irq_num, irq_level->level);
+ trace_kvm_irq_line(irq_type, vcpu_id, irq_num, irq_level->level);
switch (irq_type) {
case KVM_ARM_IRQ_TYPE_CPU:
if (irqchip_in_kernel(kvm))
return -ENXIO;
- if (vcpu_idx >= nrcpus)
- return -EINVAL;
-
- vcpu = kvm_get_vcpu(kvm, vcpu_idx);
+ vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
if (!vcpu)
return -EINVAL;
@@ -1118,17 +1422,14 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
if (!irqchip_in_kernel(kvm))
return -ENXIO;
- if (vcpu_idx >= nrcpus)
- return -EINVAL;
-
- vcpu = kvm_get_vcpu(kvm, vcpu_idx);
+ vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
if (!vcpu)
return -EINVAL;
if (irq_num < VGIC_NR_SGIS || irq_num >= VGIC_NR_PRIVATE_IRQS)
return -EINVAL;
- return kvm_vgic_inject_irq(kvm, vcpu->vcpu_id, irq_num, level, NULL);
+ return kvm_vgic_inject_irq(kvm, vcpu, irq_num, level, NULL);
case KVM_ARM_IRQ_TYPE_SPI:
if (!irqchip_in_kernel(kvm))
return -ENXIO;
@@ -1136,64 +1437,172 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
if (irq_num < VGIC_NR_PRIVATE_IRQS)
return -EINVAL;
- return kvm_vgic_inject_irq(kvm, 0, irq_num, level, NULL);
+ return kvm_vgic_inject_irq(kvm, NULL, irq_num, level, NULL);
}
return -EINVAL;
}
-static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
- const struct kvm_vcpu_init *init)
+static unsigned long system_supported_vcpu_features(void)
+{
+ unsigned long features = KVM_VCPU_VALID_FEATURES;
+
+ if (!cpus_have_final_cap(ARM64_HAS_32BIT_EL1))
+ clear_bit(KVM_ARM_VCPU_EL1_32BIT, &features);
+
+ if (!kvm_supports_guest_pmuv3())
+ clear_bit(KVM_ARM_VCPU_PMU_V3, &features);
+
+ if (!system_supports_sve())
+ clear_bit(KVM_ARM_VCPU_SVE, &features);
+
+ if (!kvm_has_full_ptr_auth()) {
+ clear_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, &features);
+ clear_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, &features);
+ }
+
+ if (!cpus_have_final_cap(ARM64_HAS_NESTED_VIRT))
+ clear_bit(KVM_ARM_VCPU_HAS_EL2, &features);
+
+ return features;
+}
+
+static int kvm_vcpu_init_check_features(struct kvm_vcpu *vcpu,
+ const struct kvm_vcpu_init *init)
{
- unsigned int i, ret;
- u32 phys_target = kvm_target_cpu();
+ unsigned long features = init->features[0];
+ int i;
+
+ if (features & ~KVM_VCPU_VALID_FEATURES)
+ return -ENOENT;
- if (init->target != phys_target)
+ for (i = 1; i < ARRAY_SIZE(init->features); i++) {
+ if (init->features[i])
+ return -ENOENT;
+ }
+
+ if (features & ~system_supported_vcpu_features())
return -EINVAL;
/*
- * Secondary and subsequent calls to KVM_ARM_VCPU_INIT must
- * use the same target.
+ * For now make sure that both address/generic pointer authentication
+ * features are requested by the userspace together.
*/
- if (vcpu->arch.target != -1 && vcpu->arch.target != init->target)
+ if (test_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, &features) !=
+ test_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, &features))
return -EINVAL;
- /* -ENOENT for unknown features, -EINVAL for invalid combinations. */
- for (i = 0; i < sizeof(init->features) * 8; i++) {
- bool set = (init->features[i / 32] & (1 << (i % 32)));
+ if (!test_bit(KVM_ARM_VCPU_EL1_32BIT, &features))
+ return 0;
- if (set && i >= KVM_VCPU_MAX_FEATURES)
- return -ENOENT;
+ /* MTE is incompatible with AArch32 */
+ if (kvm_has_mte(vcpu->kvm))
+ return -EINVAL;
- /*
- * Secondary and subsequent calls to KVM_ARM_VCPU_INIT must
- * use the same feature set.
- */
- if (vcpu->arch.target != -1 && i < KVM_VCPU_MAX_FEATURES &&
- test_bit(i, vcpu->arch.features) != set)
- return -EINVAL;
+ /* NV is incompatible with AArch32 */
+ if (test_bit(KVM_ARM_VCPU_HAS_EL2, &features))
+ return -EINVAL;
- if (set)
- set_bit(i, vcpu->arch.features);
- }
+ return 0;
+}
+
+static bool kvm_vcpu_init_changed(struct kvm_vcpu *vcpu,
+ const struct kvm_vcpu_init *init)
+{
+ unsigned long features = init->features[0];
- vcpu->arch.target = phys_target;
+ return !bitmap_equal(vcpu->kvm->arch.vcpu_features, &features,
+ KVM_VCPU_MAX_FEATURES);
+}
+
+static int kvm_setup_vcpu(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+ int ret = 0;
+
+ /*
+ * When the vCPU has a PMU, but no PMU is set for the guest
+ * yet, set the default one.
+ */
+ if (kvm_vcpu_has_pmu(vcpu) && !kvm->arch.arm_pmu)
+ ret = kvm_arm_set_default_pmu(kvm);
+
+ /* Prepare for nested if required */
+ if (!ret && vcpu_has_nv(vcpu))
+ ret = kvm_vcpu_init_nested(vcpu);
+
+ return ret;
+}
+
+static int __kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
+ const struct kvm_vcpu_init *init)
+{
+ unsigned long features = init->features[0];
+ struct kvm *kvm = vcpu->kvm;
+ int ret = -EINVAL;
+
+ mutex_lock(&kvm->arch.config_lock);
+
+ if (test_bit(KVM_ARCH_FLAG_VCPU_FEATURES_CONFIGURED, &kvm->arch.flags) &&
+ kvm_vcpu_init_changed(vcpu, init))
+ goto out_unlock;
+
+ bitmap_copy(kvm->arch.vcpu_features, &features, KVM_VCPU_MAX_FEATURES);
+
+ ret = kvm_setup_vcpu(vcpu);
+ if (ret)
+ goto out_unlock;
/* Now we know what it is, we can reset it. */
- ret = kvm_reset_vcpu(vcpu);
- if (ret) {
- vcpu->arch.target = -1;
- bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES);
- }
+ kvm_reset_vcpu(vcpu);
+ set_bit(KVM_ARCH_FLAG_VCPU_FEATURES_CONFIGURED, &kvm->arch.flags);
+ vcpu_set_flag(vcpu, VCPU_INITIALIZED);
+ ret = 0;
+out_unlock:
+ mutex_unlock(&kvm->arch.config_lock);
return ret;
}
+static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
+ const struct kvm_vcpu_init *init)
+{
+ int ret;
+
+ if (init->target != KVM_ARM_TARGET_GENERIC_V8 &&
+ init->target != kvm_target_cpu())
+ return -EINVAL;
+
+ ret = kvm_vcpu_init_check_features(vcpu, init);
+ if (ret)
+ return ret;
+
+ if (!kvm_vcpu_initialized(vcpu))
+ return __kvm_vcpu_set_target(vcpu, init);
+
+ if (kvm_vcpu_init_changed(vcpu, init))
+ return -EINVAL;
+
+ kvm_reset_vcpu(vcpu);
+ return 0;
+}
+
static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu,
struct kvm_vcpu_init *init)
{
+ bool power_off = false;
int ret;
+ /*
+ * Treat the power-off vCPU feature as ephemeral. Clear the bit to avoid
+ * reflecting it in the finalized feature set, thus limiting its scope
+ * to a single KVM_ARM_VCPU_INIT call.
+ */
+ if (init->features[0] & BIT(KVM_ARM_VCPU_POWER_OFF)) {
+ init->features[0] &= ~BIT(KVM_ARM_VCPU_POWER_OFF);
+ power_off = true;
+ }
+
ret = kvm_vcpu_set_target(vcpu, init);
if (ret)
return ret;
@@ -1215,15 +1624,18 @@ static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu,
}
vcpu_reset_hcr(vcpu);
- vcpu->arch.cptr_el2 = CPTR_EL2_DEFAULT;
/*
* Handle the "start in power-off" case.
*/
- if (test_bit(KVM_ARM_VCPU_POWER_OFF, vcpu->arch.features))
- kvm_arm_vcpu_power_off(vcpu);
+ spin_lock(&vcpu->arch.mp_state_lock);
+
+ if (power_off)
+ __kvm_arm_vcpu_power_off(vcpu);
else
- vcpu->arch.mp_state.mp_state = KVM_MP_STATE_RUNNABLE;
+ WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_RUNNABLE);
+
+ spin_unlock(&vcpu->arch.mp_state_lock);
return 0;
}
@@ -1391,6 +1803,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
case KVM_GET_VCPU_EVENTS: {
struct kvm_vcpu_events events;
+ if (!kvm_vcpu_initialized(vcpu))
+ return -ENOEXEC;
+
if (kvm_arm_vcpu_get_events(vcpu, &events))
return -EINVAL;
@@ -1402,6 +1817,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
case KVM_SET_VCPU_EVENTS: {
struct kvm_vcpu_events events;
+ if (!kvm_vcpu_initialized(vcpu))
+ return -ENOEXEC;
+
if (copy_from_user(&events, argp, sizeof(events)))
return -EFAULT;
@@ -1425,15 +1843,15 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
return r;
}
-void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
+long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl,
+ unsigned long arg)
{
-
+ return -ENOIOCTLCMD;
}
-void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
- const struct kvm_memory_slot *memslot)
+void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
{
- kvm_flush_remote_tlbs(kvm);
+
}
static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
@@ -1449,11 +1867,31 @@ static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
}
}
-long kvm_arch_vm_ioctl(struct file *filp,
- unsigned int ioctl, unsigned long arg)
+static int kvm_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
+{
+ switch (attr->group) {
+ case KVM_ARM_VM_SMCCC_CTRL:
+ return kvm_vm_smccc_has_attr(kvm, attr);
+ default:
+ return -ENXIO;
+ }
+}
+
+static int kvm_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr)
+{
+ switch (attr->group) {
+ case KVM_ARM_VM_SMCCC_CTRL:
+ return kvm_vm_smccc_set_attr(kvm, attr);
+ default:
+ return -ENXIO;
+ }
+}
+
+int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
{
struct kvm *kvm = filp->private_data;
void __user *argp = (void __user *)arg;
+ struct kvm_device_attr attr;
switch (ioctl) {
case KVM_CREATE_IRQCHIP: {
@@ -1473,9 +1911,9 @@ long kvm_arch_vm_ioctl(struct file *filp,
return kvm_vm_ioctl_set_device_addr(kvm, &dev_addr);
}
case KVM_ARM_PREFERRED_TARGET: {
- struct kvm_vcpu_init init;
-
- kvm_vcpu_preferred_target(&init);
+ struct kvm_vcpu_init init = {
+ .target = KVM_ARM_TARGET_GENERIC_V8,
+ };
if (copy_to_user(argp, &init, sizeof(init)))
return -EFAULT;
@@ -1489,6 +1927,32 @@ long kvm_arch_vm_ioctl(struct file *filp,
return -EFAULT;
return kvm_vm_ioctl_mte_copy_tags(kvm, &copy_tags);
}
+ case KVM_ARM_SET_COUNTER_OFFSET: {
+ struct kvm_arm_counter_offset offset;
+
+ if (copy_from_user(&offset, argp, sizeof(offset)))
+ return -EFAULT;
+ return kvm_vm_ioctl_set_counter_offset(kvm, &offset);
+ }
+ case KVM_HAS_DEVICE_ATTR: {
+ if (copy_from_user(&attr, argp, sizeof(attr)))
+ return -EFAULT;
+
+ return kvm_vm_has_attr(kvm, &attr);
+ }
+ case KVM_SET_DEVICE_ATTR: {
+ if (copy_from_user(&attr, argp, sizeof(attr)))
+ return -EFAULT;
+
+ return kvm_vm_set_attr(kvm, &attr);
+ }
+ case KVM_ARM_GET_REG_WRITABLE_MASKS: {
+ struct reg_mask_range range;
+
+ if (copy_from_user(&range, argp, sizeof(range)))
+ return -EFAULT;
+ return kvm_vm_ioctl_get_reg_writable_masks(kvm, &range);
+ }
default:
return -EINVAL;
}
@@ -1507,6 +1971,11 @@ static unsigned long nvhe_percpu_order(void)
return size ? get_order(size) : 0;
}
+static size_t pkvm_host_sve_state_order(void)
+{
+ return get_order(pkvm_host_sve_state_size());
+}
+
/* A lookup table holding the hypervisor VA for each vector slot */
static void *hyp_spectre_vector_selector[BP_HARDEN_EL2_SLOTS];
@@ -1539,7 +2008,7 @@ static int kvm_init_vector_slots(void)
return 0;
}
-static void cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits)
+static void __init cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits)
{
struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu);
unsigned long tcr;
@@ -1555,8 +2024,18 @@ static void cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits)
params->mair_el2 = read_sysreg(mair_el1);
- tcr = (read_sysreg(tcr_el1) & TCR_EL2_MASK) | TCR_EL2_RES1;
- tcr &= ~TCR_T0SZ_MASK;
+ tcr = read_sysreg(tcr_el1);
+ if (cpus_have_final_cap(ARM64_KVM_HVHE)) {
+ tcr &= ~(TCR_HD | TCR_HA | TCR_A1 | TCR_T0SZ_MASK);
+ tcr |= TCR_EPD1_MASK;
+ } else {
+ unsigned long ips = FIELD_GET(TCR_IPS_MASK, tcr);
+
+ tcr &= TCR_EL2_MASK;
+ tcr |= TCR_EL2_RES1 | FIELD_PREP(TCR_EL2_PS_MASK, ips);
+ if (lpa2_is_enabled())
+ tcr |= TCR_EL2_DS;
+ }
tcr |= TCR_T0SZ(hyp_va_bits);
params->tcr_el2 = tcr;
@@ -1565,6 +2044,8 @@ static void cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits)
params->hcr_el2 = HCR_HOST_NVHE_PROTECTED_FLAGS;
else
params->hcr_el2 = HCR_HOST_NVHE_FLAGS;
+ if (cpus_have_final_cap(ARM64_KVM_HVHE))
+ params->hcr_el2 |= HCR_E2H;
params->vttbr = params->vtcr = 0;
/*
@@ -1586,7 +2067,7 @@ static void hyp_install_host_vector(void)
* Call initialization code, and switch to the full blown HYP code.
* If the cpucaps haven't been finalized yet, something has gone very
* wrong, and hyp will crash and burn when it uses any
- * cpus_have_const_cap() wrapper.
+ * cpus_have_*_cap() wrapper.
*/
BUG_ON(!system_capabilities_finalized());
params = this_cpu_ptr_nvhe_sym(kvm_init_params);
@@ -1647,7 +2128,8 @@ static void cpu_set_hyp_vector(void)
static void cpu_hyp_init_context(void)
{
- kvm_init_host_cpu_context(&this_cpu_ptr_hyp_sym(kvm_host_data)->host_ctxt);
+ kvm_init_host_cpu_context(host_data_ptr(host_ctxt));
+ kvm_init_host_debug_data();
if (!is_kernel_in_hyp_mode())
cpu_init_hyp_mode();
@@ -1656,10 +2138,11 @@ static void cpu_hyp_init_context(void)
static void cpu_hyp_init_features(void)
{
cpu_set_hyp_vector();
- kvm_arm_init_debug();
- if (is_kernel_in_hyp_mode())
+ if (is_kernel_in_hyp_mode()) {
kvm_timer_init_vhe();
+ kvm_debug_init_vhe();
+ }
if (vgic_present)
kvm_vgic_init_cpu_hardware();
@@ -1672,32 +2155,49 @@ static void cpu_hyp_reinit(void)
cpu_hyp_init_features();
}
-static void _kvm_arch_hardware_enable(void *discard)
+static void cpu_hyp_init(void *discard)
{
- if (!__this_cpu_read(kvm_arm_hardware_enabled)) {
+ if (!__this_cpu_read(kvm_hyp_initialized)) {
cpu_hyp_reinit();
- __this_cpu_write(kvm_arm_hardware_enabled, 1);
+ __this_cpu_write(kvm_hyp_initialized, 1);
}
}
-int kvm_arch_hardware_enable(void)
+static void cpu_hyp_uninit(void *discard)
{
- _kvm_arch_hardware_enable(NULL);
- return 0;
+ if (!is_protected_kvm_enabled() && __this_cpu_read(kvm_hyp_initialized)) {
+ cpu_hyp_reset();
+ __this_cpu_write(kvm_hyp_initialized, 0);
+ }
}
-static void _kvm_arch_hardware_disable(void *discard)
+int kvm_arch_enable_virtualization_cpu(void)
{
- if (__this_cpu_read(kvm_arm_hardware_enabled)) {
- cpu_hyp_reset();
- __this_cpu_write(kvm_arm_hardware_enabled, 0);
- }
+ /*
+ * Most calls to this function are made with migration
+ * disabled, but not with preemption disabled. The former is
+ * enough to ensure correctness, but most of the helpers
+ * expect the later and will throw a tantrum otherwise.
+ */
+ preempt_disable();
+
+ cpu_hyp_init(NULL);
+
+ kvm_vgic_cpu_up();
+ kvm_timer_cpu_up();
+
+ preempt_enable();
+
+ return 0;
}
-void kvm_arch_hardware_disable(void)
+void kvm_arch_disable_virtualization_cpu(void)
{
+ kvm_timer_cpu_down();
+ kvm_vgic_cpu_down();
+
if (!is_protected_kvm_enabled())
- _kvm_arch_hardware_disable(NULL);
+ cpu_hyp_uninit(NULL);
}
#ifdef CONFIG_CPU_PM
@@ -1706,16 +2206,16 @@ static int hyp_init_cpu_pm_notifier(struct notifier_block *self,
void *v)
{
/*
- * kvm_arm_hardware_enabled is left with its old value over
+ * kvm_hyp_initialized is left with its old value over
* PM_ENTER->PM_EXIT. It is used to indicate PM_EXIT should
* re-enable hyp.
*/
switch (cmd) {
case CPU_PM_ENTER:
- if (__this_cpu_read(kvm_arm_hardware_enabled))
+ if (__this_cpu_read(kvm_hyp_initialized))
/*
- * don't update kvm_arm_hardware_enabled here
- * so that the hardware will be re-enabled
+ * don't update kvm_hyp_initialized here
+ * so that the hyp will be re-enabled
* when we resume. See below.
*/
cpu_hyp_reset();
@@ -1723,8 +2223,8 @@ static int hyp_init_cpu_pm_notifier(struct notifier_block *self,
return NOTIFY_OK;
case CPU_PM_ENTER_FAILED:
case CPU_PM_EXIT:
- if (__this_cpu_read(kvm_arm_hardware_enabled))
- /* The hardware was enabled before suspend. */
+ if (__this_cpu_read(kvm_hyp_initialized))
+ /* The hyp was enabled before suspend. */
cpu_hyp_reinit();
return NOTIFY_OK;
@@ -1738,26 +2238,26 @@ static struct notifier_block hyp_init_cpu_pm_nb = {
.notifier_call = hyp_init_cpu_pm_notifier,
};
-static void hyp_cpu_pm_init(void)
+static void __init hyp_cpu_pm_init(void)
{
if (!is_protected_kvm_enabled())
cpu_pm_register_notifier(&hyp_init_cpu_pm_nb);
}
-static void hyp_cpu_pm_exit(void)
+static void __init hyp_cpu_pm_exit(void)
{
if (!is_protected_kvm_enabled())
cpu_pm_unregister_notifier(&hyp_init_cpu_pm_nb);
}
#else
-static inline void hyp_cpu_pm_init(void)
+static inline void __init hyp_cpu_pm_init(void)
{
}
-static inline void hyp_cpu_pm_exit(void)
+static inline void __init hyp_cpu_pm_exit(void)
{
}
#endif
-static void init_cpu_logical_map(void)
+static void __init init_cpu_logical_map(void)
{
unsigned int cpu;
@@ -1774,7 +2274,7 @@ static void init_cpu_logical_map(void)
#define init_psci_0_1_impl_state(config, what) \
config.psci_0_1_ ## what ## _implemented = psci_ops.what
-static bool init_psci_relay(void)
+static bool __init init_psci_relay(void)
{
/*
* If PSCI has not been initialized, protected KVM cannot install
@@ -1786,6 +2286,7 @@ static bool init_psci_relay(void)
}
kvm_host_psci_config.version = psci_ops.get_version();
+ kvm_host_psci_config.smccc_version = arm_smccc_get_version();
if (kvm_host_psci_config.version == PSCI_VERSION(0, 1)) {
kvm_host_psci_config.function_ids_0_1 = get_psci_0_1_function_ids();
@@ -1797,14 +2298,14 @@ static bool init_psci_relay(void)
return true;
}
-static int init_subsystems(void)
+static int __init init_subsystems(void)
{
int err = 0;
/*
* Enable hardware so that subsystem initialisation can access EL2.
*/
- on_each_cpu(_kvm_arch_hardware_enable, NULL, 1);
+ on_each_cpu(cpu_hyp_init, NULL, 1);
/*
* Register CPU lower-power notifier
@@ -1821,6 +2322,19 @@ static int init_subsystems(void)
break;
case -ENODEV:
case -ENXIO:
+ /*
+ * No VGIC? No pKVM for you.
+ *
+ * Protected mode assumes that VGICv3 is present, so no point
+ * in trying to hobble along if vgic initialization fails.
+ */
+ if (is_protected_kvm_enabled())
+ goto out;
+
+ /*
+ * Otherwise, userspace could choose to implement a GIC for its
+ * guest on non-cooperative hardware.
+ */
vgic_present = false;
err = 0;
break;
@@ -1828,6 +2342,14 @@ static int init_subsystems(void)
goto out;
}
+ if (kvm_mode == KVM_MODE_NV &&
+ !(vgic_present && (kvm_vgic_global_state.type == VGIC_V3 ||
+ kvm_vgic_global_state.has_gcie_v3_compat))) {
+ kvm_err("NV support requires GICv3 or GICv5 with legacy support, giving up\n");
+ err = -EINVAL;
+ goto out;
+ }
+
/*
* Init HYP architected timer support
*/
@@ -1838,24 +2360,49 @@ static int init_subsystems(void)
kvm_register_perf_callbacks(NULL);
out:
+ if (err)
+ hyp_cpu_pm_exit();
+
if (err || !is_protected_kvm_enabled())
- on_each_cpu(_kvm_arch_hardware_disable, NULL, 1);
+ on_each_cpu(cpu_hyp_uninit, NULL, 1);
return err;
}
-static void teardown_hyp_mode(void)
+static void __init teardown_subsystems(void)
+{
+ kvm_unregister_perf_callbacks();
+ hyp_cpu_pm_exit();
+}
+
+static void __init teardown_hyp_mode(void)
{
+ bool free_sve = system_supports_sve() && is_protected_kvm_enabled();
int cpu;
free_hyp_pgds();
for_each_possible_cpu(cpu) {
- free_page(per_cpu(kvm_arm_hyp_stack_page, cpu));
+ if (per_cpu(kvm_hyp_initialized, cpu))
+ continue;
+
+ free_pages(per_cpu(kvm_arm_hyp_stack_base, cpu), NVHE_STACK_SHIFT - PAGE_SHIFT);
+
+ if (!kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu])
+ continue;
+
+ if (free_sve) {
+ struct cpu_sve_state *sve_state;
+
+ sve_state = per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state;
+ free_pages((unsigned long) sve_state, pkvm_host_sve_state_order());
+ }
+
free_pages(kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu], nvhe_percpu_order());
+
}
}
-static int do_pkvm_init(u32 hyp_va_bits)
+static int __init do_pkvm_init(u32 hyp_va_bits)
{
void *per_cpu_base = kvm_ksym_ref(kvm_nvhe_sym(kvm_arm_hyp_percpu_base));
int ret;
@@ -1869,17 +2416,41 @@ static int do_pkvm_init(u32 hyp_va_bits)
/*
* The stub hypercalls are now disabled, so set our local flag to
- * prevent a later re-init attempt in kvm_arch_hardware_enable().
+ * prevent a later re-init attempt in kvm_arch_enable_virtualization_cpu().
*/
- __this_cpu_write(kvm_arm_hardware_enabled, 1);
+ __this_cpu_write(kvm_hyp_initialized, 1);
preempt_enable();
return ret;
}
+static u64 get_hyp_id_aa64pfr0_el1(void)
+{
+ /*
+ * Track whether the system isn't affected by spectre/meltdown in the
+ * hypervisor's view of id_aa64pfr0_el1, used for protected VMs.
+ * Although this is per-CPU, we make it global for simplicity, e.g., not
+ * to have to worry about vcpu migration.
+ *
+ * Unlike for non-protected VMs, userspace cannot override this for
+ * protected VMs.
+ */
+ u64 val = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1);
+
+ val &= ~(ID_AA64PFR0_EL1_CSV2 |
+ ID_AA64PFR0_EL1_CSV3);
+
+ val |= FIELD_PREP(ID_AA64PFR0_EL1_CSV2,
+ arm64_get_spectre_v2_state() == SPECTRE_UNAFFECTED);
+ val |= FIELD_PREP(ID_AA64PFR0_EL1_CSV3,
+ arm64_get_meltdown_state() == SPECTRE_UNAFFECTED);
+
+ return val;
+}
+
static void kvm_hyp_init_symbols(void)
{
- kvm_nvhe_sym(id_aa64pfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1);
+ kvm_nvhe_sym(id_aa64pfr0_el1_sys_val) = get_hyp_id_aa64pfr0_el1();
kvm_nvhe_sym(id_aa64pfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1);
kvm_nvhe_sym(id_aa64isar0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR0_EL1);
kvm_nvhe_sym(id_aa64isar1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR1_EL1);
@@ -1887,11 +2458,32 @@ static void kvm_hyp_init_symbols(void)
kvm_nvhe_sym(id_aa64mmfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
kvm_nvhe_sym(id_aa64mmfr2_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR2_EL1);
+ kvm_nvhe_sym(id_aa64smfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64SMFR0_EL1);
kvm_nvhe_sym(__icache_flags) = __icache_flags;
kvm_nvhe_sym(kvm_arm_vmid_bits) = kvm_arm_vmid_bits;
+
+ /* Propagate the FGT state to the nVHE side */
+ kvm_nvhe_sym(hfgrtr_masks) = hfgrtr_masks;
+ kvm_nvhe_sym(hfgwtr_masks) = hfgwtr_masks;
+ kvm_nvhe_sym(hfgitr_masks) = hfgitr_masks;
+ kvm_nvhe_sym(hdfgrtr_masks) = hdfgrtr_masks;
+ kvm_nvhe_sym(hdfgwtr_masks) = hdfgwtr_masks;
+ kvm_nvhe_sym(hafgrtr_masks) = hafgrtr_masks;
+ kvm_nvhe_sym(hfgrtr2_masks) = hfgrtr2_masks;
+ kvm_nvhe_sym(hfgwtr2_masks) = hfgwtr2_masks;
+ kvm_nvhe_sym(hfgitr2_masks) = hfgitr2_masks;
+ kvm_nvhe_sym(hdfgrtr2_masks)= hdfgrtr2_masks;
+ kvm_nvhe_sym(hdfgwtr2_masks)= hdfgwtr2_masks;
+
+ /*
+ * Flush entire BSS since part of its data containing init symbols is read
+ * while the MMU is off.
+ */
+ kvm_flush_dcache_to_poc(kvm_ksym_ref(__hyp_bss_start),
+ kvm_ksym_ref(__hyp_bss_end) - kvm_ksym_ref(__hyp_bss_start));
}
-static int kvm_hyp_init_protection(u32 hyp_va_bits)
+static int __init kvm_hyp_init_protection(u32 hyp_va_bits)
{
void *addr = phys_to_virt(hyp_mem_base);
int ret;
@@ -1909,10 +2501,72 @@ static int kvm_hyp_init_protection(u32 hyp_va_bits)
return 0;
}
-/**
- * Inits Hyp-mode on all online CPUs
+static int init_pkvm_host_sve_state(void)
+{
+ int cpu;
+
+ if (!system_supports_sve())
+ return 0;
+
+ /* Allocate pages for host sve state in protected mode. */
+ for_each_possible_cpu(cpu) {
+ struct page *page = alloc_pages(GFP_KERNEL, pkvm_host_sve_state_order());
+
+ if (!page)
+ return -ENOMEM;
+
+ per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state = page_address(page);
+ }
+
+ /*
+ * Don't map the pages in hyp since these are only used in protected
+ * mode, which will (re)create its own mapping when initialized.
+ */
+
+ return 0;
+}
+
+/*
+ * Finalizes the initialization of hyp mode, once everything else is initialized
+ * and the initialziation process cannot fail.
*/
-static int init_hyp_mode(void)
+static void finalize_init_hyp_mode(void)
+{
+ int cpu;
+
+ if (system_supports_sve() && is_protected_kvm_enabled()) {
+ for_each_possible_cpu(cpu) {
+ struct cpu_sve_state *sve_state;
+
+ sve_state = per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state;
+ per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state =
+ kern_hyp_va(sve_state);
+ }
+ }
+}
+
+static void pkvm_hyp_init_ptrauth(void)
+{
+ struct kvm_cpu_context *hyp_ctxt;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ hyp_ctxt = per_cpu_ptr_nvhe_sym(kvm_hyp_ctxt, cpu);
+ hyp_ctxt->sys_regs[APIAKEYLO_EL1] = get_random_long();
+ hyp_ctxt->sys_regs[APIAKEYHI_EL1] = get_random_long();
+ hyp_ctxt->sys_regs[APIBKEYLO_EL1] = get_random_long();
+ hyp_ctxt->sys_regs[APIBKEYHI_EL1] = get_random_long();
+ hyp_ctxt->sys_regs[APDAKEYLO_EL1] = get_random_long();
+ hyp_ctxt->sys_regs[APDAKEYHI_EL1] = get_random_long();
+ hyp_ctxt->sys_regs[APDBKEYLO_EL1] = get_random_long();
+ hyp_ctxt->sys_regs[APDBKEYHI_EL1] = get_random_long();
+ hyp_ctxt->sys_regs[APGAKEYLO_EL1] = get_random_long();
+ hyp_ctxt->sys_regs[APGAKEYHI_EL1] = get_random_long();
+ }
+}
+
+/* Inits Hyp-mode on all online CPUs */
+static int __init init_hyp_mode(void)
{
u32 hyp_va_bits;
int cpu;
@@ -1936,15 +2590,15 @@ static int init_hyp_mode(void)
* Allocate stack pages for Hypervisor-mode
*/
for_each_possible_cpu(cpu) {
- unsigned long stack_page;
+ unsigned long stack_base;
- stack_page = __get_free_page(GFP_KERNEL);
- if (!stack_page) {
+ stack_base = __get_free_pages(GFP_KERNEL, NVHE_STACK_SHIFT - PAGE_SHIFT);
+ if (!stack_base) {
err = -ENOMEM;
goto out_err;
}
- per_cpu(kvm_arm_hyp_stack_page, cpu) = stack_page;
+ per_cpu(kvm_arm_hyp_stack_base, cpu) = stack_base;
}
/*
@@ -1975,6 +2629,13 @@ static int init_hyp_mode(void)
goto out_err;
}
+ err = create_hyp_mappings(kvm_ksym_ref(__hyp_data_start),
+ kvm_ksym_ref(__hyp_data_end), PAGE_HYP);
+ if (err) {
+ kvm_err("Cannot map .hyp.data section\n");
+ goto out_err;
+ }
+
err = create_hyp_mappings(kvm_ksym_ref(__hyp_rodata_start),
kvm_ksym_ref(__hyp_rodata_end), PAGE_HYP_RO);
if (err) {
@@ -2013,31 +2674,9 @@ static int init_hyp_mode(void)
*/
for_each_possible_cpu(cpu) {
struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu);
- char *stack_page = (char *)per_cpu(kvm_arm_hyp_stack_page, cpu);
- unsigned long hyp_addr;
+ char *stack_base = (char *)per_cpu(kvm_arm_hyp_stack_base, cpu);
- /*
- * Allocate a contiguous HYP private VA range for the stack
- * and guard page. The allocation is also aligned based on
- * the order of its size.
- */
- err = hyp_alloc_private_va_range(PAGE_SIZE * 2, &hyp_addr);
- if (err) {
- kvm_err("Cannot allocate hyp stack guard page\n");
- goto out_err;
- }
-
- /*
- * Since the stack grows downwards, map the stack to the page
- * at the higher address and leave the lower guard page
- * unbacked.
- *
- * Any valid stack address now has the PAGE_SHIFT bit as 1
- * and addresses corresponding to the guard page have the
- * PAGE_SHIFT bit as 0 - this is used for overflow detection.
- */
- err = __create_hyp_mappings(hyp_addr + PAGE_SIZE, PAGE_SIZE,
- __pa(stack_page), PAGE_HYP);
+ err = create_hyp_stack(__pa(stack_base), &params->stack_hyp_va);
if (err) {
kvm_err("Cannot map hyp stack\n");
goto out_err;
@@ -2049,9 +2688,7 @@ static int init_hyp_mode(void)
* __hyp_pa() won't do the right thing there, since the stack
* has been mapped in the flexible private VA space.
*/
- params->stack_pa = __pa(stack_page);
-
- params->stack_hyp_va = hyp_addr + (2 * PAGE_SIZE);
+ params->stack_pa = __pa(stack_base);
}
for_each_possible_cpu(cpu) {
@@ -2072,6 +2709,10 @@ static int init_hyp_mode(void)
kvm_hyp_init_symbols();
if (is_protected_kvm_enabled()) {
+ if (IS_ENABLED(CONFIG_ARM64_PTR_AUTH_KERNEL) &&
+ cpus_have_final_cap(ARM64_HAS_ADDRESS_AUTH))
+ pkvm_hyp_init_ptrauth();
+
init_cpu_logical_map();
if (!init_psci_relay()) {
@@ -2079,6 +2720,10 @@ static int init_hyp_mode(void)
goto out_err;
}
+ err = init_pkvm_host_sve_state();
+ if (err)
+ goto out_err;
+
err = kvm_hyp_init_protection(hyp_va_bits);
if (err) {
kvm_err("Failed to init hyp memory protection\n");
@@ -2094,47 +2739,30 @@ out_err:
return err;
}
-static void _kvm_host_prot_finalize(void *arg)
+struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr)
{
- int *err = arg;
+ struct kvm_vcpu *vcpu = NULL;
+ struct kvm_mpidr_data *data;
+ unsigned long i;
- if (WARN_ON(kvm_call_hyp_nvhe(__pkvm_prot_finalize)))
- WRITE_ONCE(*err, -EINVAL);
-}
+ mpidr &= MPIDR_HWID_BITMASK;
-static int pkvm_drop_host_privileges(void)
-{
- int ret = 0;
+ rcu_read_lock();
+ data = rcu_dereference(kvm->arch.mpidr_data);
- /*
- * Flip the static key upfront as that may no longer be possible
- * once the host stage 2 is installed.
- */
- static_branch_enable(&kvm_protected_mode_initialized);
- on_each_cpu(_kvm_host_prot_finalize, &ret, 1);
- return ret;
-}
+ if (data) {
+ u16 idx = kvm_mpidr_index(data, mpidr);
-static int finalize_hyp_mode(void)
-{
- if (!is_protected_kvm_enabled())
- return 0;
+ vcpu = kvm_get_vcpu(kvm, data->cmpidr_to_idx[idx]);
+ if (mpidr != kvm_vcpu_get_mpidr_aff(vcpu))
+ vcpu = NULL;
+ }
- /*
- * Exclude HYP sections from kmemleak so that they don't get peeked
- * at, which would end badly once inaccessible.
- */
- kmemleak_free_part(__hyp_bss_start, __hyp_bss_end - __hyp_bss_start);
- kmemleak_free_part_phys(hyp_mem_base, hyp_mem_size);
- return pkvm_drop_host_privileges();
-}
+ rcu_read_unlock();
-struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr)
-{
- struct kvm_vcpu *vcpu;
- unsigned long i;
+ if (vcpu)
+ return vcpu;
- mpidr &= MPIDR_HWID_BITMASK;
kvm_for_each_vcpu(i, vcpu, kvm) {
if (mpidr == kvm_vcpu_get_mpidr_aff(vcpu))
return vcpu;
@@ -2147,28 +2775,54 @@ bool kvm_arch_irqchip_in_kernel(struct kvm *kvm)
return irqchip_in_kernel(kvm);
}
-bool kvm_arch_has_irq_bypass(void)
-{
- return true;
-}
-
int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
struct irq_bypass_producer *prod)
{
struct kvm_kernel_irqfd *irqfd =
container_of(cons, struct kvm_kernel_irqfd, consumer);
+ struct kvm_kernel_irq_routing_entry *irq_entry = &irqfd->irq_entry;
+
+ /*
+ * The only thing we have a chance of directly-injecting is LPIs. Maybe
+ * one day...
+ */
+ if (irq_entry->type != KVM_IRQ_ROUTING_MSI)
+ return 0;
return kvm_vgic_v4_set_forwarding(irqfd->kvm, prod->irq,
&irqfd->irq_entry);
}
+
void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
struct irq_bypass_producer *prod)
{
struct kvm_kernel_irqfd *irqfd =
container_of(cons, struct kvm_kernel_irqfd, consumer);
+ struct kvm_kernel_irq_routing_entry *irq_entry = &irqfd->irq_entry;
+
+ if (irq_entry->type != KVM_IRQ_ROUTING_MSI)
+ return;
+
+ kvm_vgic_v4_unset_forwarding(irqfd->kvm, prod->irq);
+}
+
+void kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd,
+ struct kvm_kernel_irq_routing_entry *old,
+ struct kvm_kernel_irq_routing_entry *new)
+{
+ if (old->type == KVM_IRQ_ROUTING_MSI &&
+ new->type == KVM_IRQ_ROUTING_MSI &&
+ !memcmp(&old->msi, &new->msi, sizeof(new->msi)))
+ return;
- kvm_vgic_v4_unset_forwarding(irqfd->kvm, prod->irq,
- &irqfd->irq_entry);
+ /*
+ * Remapping the vLPI requires taking the its_lock mutex to resolve
+ * the new translation. We're in spinlock land at this point, so no
+ * chance of resolving the translation.
+ *
+ * Unmap the vLPI and fall back to software LPI injection.
+ */
+ return kvm_vgic_v4_unset_forwarding(irqfd->kvm, irqfd->producer->irq);
}
void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *cons)
@@ -2187,10 +2841,8 @@ void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *cons)
kvm_arm_resume_guest(irqfd->kvm);
}
-/**
- * Initialize Hyp-mode and memory mappings on all CPUs.
- */
-int kvm_arch_init(void *opaque)
+/* Initialize Hyp-mode and memory mappings on all CPUs */
+static __init int kvm_arm_init(void)
{
int err;
bool in_hyp_mode;
@@ -2241,33 +2893,42 @@ int kvm_arch_init(void *opaque)
err = kvm_init_vector_slots();
if (err) {
kvm_err("Cannot initialise vector slots\n");
- goto out_err;
+ goto out_hyp;
}
err = init_subsystems();
if (err)
goto out_hyp;
- if (!in_hyp_mode) {
- err = finalize_hyp_mode();
- if (err) {
- kvm_err("Failed to finalize Hyp protection\n");
- goto out_hyp;
- }
- }
+ kvm_info("%s%sVHE%s mode initialized successfully\n",
+ in_hyp_mode ? "" : (is_protected_kvm_enabled() ?
+ "Protected " : "Hyp "),
+ in_hyp_mode ? "" : (cpus_have_final_cap(ARM64_KVM_HVHE) ?
+ "h" : "n"),
+ cpus_have_final_cap(ARM64_HAS_NESTED_VIRT) ? "+NV2": "");
- if (is_protected_kvm_enabled()) {
- kvm_info("Protected nVHE mode initialized successfully\n");
- } else if (in_hyp_mode) {
- kvm_info("VHE mode initialized successfully\n");
- } else {
- kvm_info("Hyp mode initialized successfully\n");
- }
+ /*
+ * FIXME: Do something reasonable if kvm_init() fails after pKVM
+ * hypervisor protection is finalized.
+ */
+ err = kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE);
+ if (err)
+ goto out_subs;
+
+ /*
+ * This should be called after initialization is done and failure isn't
+ * possible anymore.
+ */
+ if (!in_hyp_mode)
+ finalize_init_hyp_mode();
+
+ kvm_arm_initialised = true;
return 0;
+out_subs:
+ teardown_subsystems();
out_hyp:
- hyp_cpu_pm_exit();
if (!in_hyp_mode)
teardown_hyp_mode();
out_err:
@@ -2275,12 +2936,6 @@ out_err:
return err;
}
-/* NOP: Compiling as a module not supported */
-void kvm_arch_exit(void)
-{
- kvm_unregister_perf_callbacks();
-}
-
static int __init early_kvm_mode_cfg(char *arg)
{
if (!arg)
@@ -2310,19 +2965,48 @@ static int __init early_kvm_mode_cfg(char *arg)
return 0;
}
+ if (strcmp(arg, "nested") == 0 && !WARN_ON(!is_kernel_in_hyp_mode())) {
+ kvm_mode = KVM_MODE_NV;
+ return 0;
+ }
+
return -EINVAL;
}
early_param("kvm-arm.mode", early_kvm_mode_cfg);
-enum kvm_mode kvm_get_mode(void)
+static int __init early_kvm_wfx_trap_policy_cfg(char *arg, enum kvm_wfx_trap_policy *p)
{
- return kvm_mode;
+ if (!arg)
+ return -EINVAL;
+
+ if (strcmp(arg, "trap") == 0) {
+ *p = KVM_WFX_TRAP;
+ return 0;
+ }
+
+ if (strcmp(arg, "notrap") == 0) {
+ *p = KVM_WFX_NOTRAP;
+ return 0;
+ }
+
+ return -EINVAL;
}
-static int arm_init(void)
+static int __init early_kvm_wfi_trap_policy_cfg(char *arg)
{
- int rc = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
- return rc;
+ return early_kvm_wfx_trap_policy_cfg(arg, &kvm_wfi_trap_policy);
+}
+early_param("kvm-arm.wfi_trap_policy", early_kvm_wfi_trap_policy_cfg);
+
+static int __init early_kvm_wfe_trap_policy_cfg(char *arg)
+{
+ return early_kvm_wfx_trap_policy_cfg(arg, &kvm_wfe_trap_policy);
+}
+early_param("kvm-arm.wfe_trap_policy", early_kvm_wfe_trap_policy_cfg);
+
+enum kvm_mode kvm_get_mode(void)
+{
+ return kvm_mode;
}
-module_init(arm_init);
+module_init(kvm_arm_init);
diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
new file mode 100644
index 000000000000..53bf70126f81
--- /dev/null
+++ b/arch/arm64/kvm/at.c
@@ -0,0 +1,1795 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2017 - Linaro Ltd
+ * Author: Jintack Lim <jintack.lim@linaro.org>
+ */
+
+#include <linux/kvm_host.h>
+
+#include <asm/esr.h>
+#include <asm/kvm_hyp.h>
+#include <asm/kvm_mmu.h>
+
+static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool s1ptw)
+{
+ wr->fst = fst;
+ wr->ptw = s1ptw;
+ wr->s2 = s1ptw;
+ wr->failed = true;
+}
+
+#define S1_MMU_DISABLED (-127)
+
+static int get_ia_size(struct s1_walk_info *wi)
+{
+ return 64 - wi->txsz;
+}
+
+/* Return true if the IPA is out of the OA range */
+static bool check_output_size(u64 ipa, struct s1_walk_info *wi)
+{
+ if (wi->pa52bit)
+ return wi->max_oa_bits < 52 && (ipa & GENMASK_ULL(51, wi->max_oa_bits));
+ return wi->max_oa_bits < 48 && (ipa & GENMASK_ULL(47, wi->max_oa_bits));
+}
+
+static bool has_52bit_pa(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, u64 tcr)
+{
+ switch (BIT(wi->pgshift)) {
+ case SZ_64K:
+ default: /* IMPDEF: treat any other value as 64k */
+ if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, PARANGE, 52))
+ return false;
+ return ((wi->regime == TR_EL2 ?
+ FIELD_GET(TCR_EL2_PS_MASK, tcr) :
+ FIELD_GET(TCR_IPS_MASK, tcr)) == 0b0110);
+ case SZ_16K:
+ if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN16, 52_BIT))
+ return false;
+ break;
+ case SZ_4K:
+ if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN4, 52_BIT))
+ return false;
+ break;
+ }
+
+ return (tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS));
+}
+
+static u64 desc_to_oa(struct s1_walk_info *wi, u64 desc)
+{
+ u64 addr;
+
+ if (!wi->pa52bit)
+ return desc & GENMASK_ULL(47, wi->pgshift);
+
+ switch (BIT(wi->pgshift)) {
+ case SZ_4K:
+ case SZ_16K:
+ addr = desc & GENMASK_ULL(49, wi->pgshift);
+ addr |= FIELD_GET(KVM_PTE_ADDR_51_50_LPA2, desc) << 50;
+ break;
+ case SZ_64K:
+ default: /* IMPDEF: treat any other value as 64k */
+ addr = desc & GENMASK_ULL(47, wi->pgshift);
+ addr |= FIELD_GET(KVM_PTE_ADDR_51_48, desc) << 48;
+ break;
+ }
+
+ return addr;
+}
+
+/* Return the translation regime that applies to an AT instruction */
+static enum trans_regime compute_translation_regime(struct kvm_vcpu *vcpu, u32 op)
+{
+ /*
+ * We only get here from guest EL2, so the translation
+ * regime AT applies to is solely defined by {E2H,TGE}.
+ */
+ switch (op) {
+ case OP_AT_S1E2R:
+ case OP_AT_S1E2W:
+ case OP_AT_S1E2A:
+ return vcpu_el2_e2h_is_set(vcpu) ? TR_EL20 : TR_EL2;
+ default:
+ return (vcpu_el2_e2h_is_set(vcpu) &&
+ vcpu_el2_tge_is_set(vcpu)) ? TR_EL20 : TR_EL10;
+ }
+}
+
+static u64 effective_tcr2(struct kvm_vcpu *vcpu, enum trans_regime regime)
+{
+ if (regime == TR_EL10) {
+ if (vcpu_has_nv(vcpu) &&
+ !(__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TCR2En))
+ return 0;
+
+ return vcpu_read_sys_reg(vcpu, TCR2_EL1);
+ }
+
+ return vcpu_read_sys_reg(vcpu, TCR2_EL2);
+}
+
+static bool s1pie_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime)
+{
+ if (!kvm_has_s1pie(vcpu->kvm))
+ return false;
+
+ /* Abuse TCR2_EL1_PIE and use it for EL2 as well */
+ return effective_tcr2(vcpu, regime) & TCR2_EL1_PIE;
+}
+
+static void compute_s1poe(struct kvm_vcpu *vcpu, struct s1_walk_info *wi)
+{
+ u64 val;
+
+ if (!kvm_has_s1poe(vcpu->kvm)) {
+ wi->poe = wi->e0poe = false;
+ return;
+ }
+
+ val = effective_tcr2(vcpu, wi->regime);
+
+ /* Abuse TCR2_EL1_* for EL2 */
+ wi->poe = val & TCR2_EL1_POE;
+ wi->e0poe = (wi->regime != TR_EL2) && (val & TCR2_EL1_E0POE);
+}
+
+static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
+ struct s1_walk_result *wr, u64 va)
+{
+ u64 hcr, sctlr, tcr, tg, ps, ia_bits, ttbr;
+ unsigned int stride, x;
+ bool va55, tbi, lva;
+
+ va55 = va & BIT(55);
+
+ if (vcpu_has_nv(vcpu)) {
+ hcr = __vcpu_sys_reg(vcpu, HCR_EL2);
+ wi->s2 = wi->regime == TR_EL10 && (hcr & (HCR_VM | HCR_DC));
+ } else {
+ WARN_ON_ONCE(wi->regime != TR_EL10);
+ wi->s2 = false;
+ hcr = 0;
+ }
+
+ switch (wi->regime) {
+ case TR_EL10:
+ sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1);
+ tcr = vcpu_read_sys_reg(vcpu, TCR_EL1);
+ ttbr = (va55 ?
+ vcpu_read_sys_reg(vcpu, TTBR1_EL1) :
+ vcpu_read_sys_reg(vcpu, TTBR0_EL1));
+ break;
+ case TR_EL2:
+ case TR_EL20:
+ sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2);
+ tcr = vcpu_read_sys_reg(vcpu, TCR_EL2);
+ ttbr = (va55 ?
+ vcpu_read_sys_reg(vcpu, TTBR1_EL2) :
+ vcpu_read_sys_reg(vcpu, TTBR0_EL2));
+ break;
+ default:
+ BUG();
+ }
+
+ /* Someone was silly enough to encode TG0/TG1 differently */
+ if (va55 && wi->regime != TR_EL2) {
+ wi->txsz = FIELD_GET(TCR_T1SZ_MASK, tcr);
+ tg = FIELD_GET(TCR_TG1_MASK, tcr);
+
+ switch (tg << TCR_TG1_SHIFT) {
+ case TCR_TG1_4K:
+ wi->pgshift = 12; break;
+ case TCR_TG1_16K:
+ wi->pgshift = 14; break;
+ case TCR_TG1_64K:
+ default: /* IMPDEF: treat any other value as 64k */
+ wi->pgshift = 16; break;
+ }
+ } else {
+ wi->txsz = FIELD_GET(TCR_T0SZ_MASK, tcr);
+ tg = FIELD_GET(TCR_TG0_MASK, tcr);
+
+ switch (tg << TCR_TG0_SHIFT) {
+ case TCR_TG0_4K:
+ wi->pgshift = 12; break;
+ case TCR_TG0_16K:
+ wi->pgshift = 14; break;
+ case TCR_TG0_64K:
+ default: /* IMPDEF: treat any other value as 64k */
+ wi->pgshift = 16; break;
+ }
+ }
+
+ wi->pa52bit = has_52bit_pa(vcpu, wi, tcr);
+
+ ia_bits = get_ia_size(wi);
+
+ /* AArch64.S1StartLevel() */
+ stride = wi->pgshift - 3;
+ wi->sl = 3 - (((ia_bits - 1) - wi->pgshift) / stride);
+
+ if (wi->regime == TR_EL2 && va55)
+ goto addrsz;
+
+ tbi = (wi->regime == TR_EL2 ?
+ FIELD_GET(TCR_EL2_TBI, tcr) :
+ (va55 ?
+ FIELD_GET(TCR_TBI1, tcr) :
+ FIELD_GET(TCR_TBI0, tcr)));
+
+ if (!tbi && (u64)sign_extend64(va, 55) != va)
+ goto addrsz;
+
+ wi->sh = (wi->regime == TR_EL2 ?
+ FIELD_GET(TCR_EL2_SH0_MASK, tcr) :
+ (va55 ?
+ FIELD_GET(TCR_SH1_MASK, tcr) :
+ FIELD_GET(TCR_SH0_MASK, tcr)));
+
+ va = (u64)sign_extend64(va, 55);
+
+ /* Let's put the MMU disabled case aside immediately */
+ switch (wi->regime) {
+ case TR_EL10:
+ /*
+ * If dealing with the EL1&0 translation regime, 3 things
+ * can disable the S1 translation:
+ *
+ * - HCR_EL2.DC = 1
+ * - HCR_EL2.{E2H,TGE} = {0,1}
+ * - SCTLR_EL1.M = 0
+ *
+ * The TGE part is interesting. If we have decided that this
+ * is EL1&0, then it means that either {E2H,TGE} == {1,0} or
+ * {0,x}, and we only need to test for TGE == 1.
+ */
+ if (hcr & (HCR_DC | HCR_TGE)) {
+ wr->level = S1_MMU_DISABLED;
+ break;
+ }
+ fallthrough;
+ case TR_EL2:
+ case TR_EL20:
+ if (!(sctlr & SCTLR_ELx_M))
+ wr->level = S1_MMU_DISABLED;
+ break;
+ }
+
+ if (wr->level == S1_MMU_DISABLED) {
+ if (va >= BIT(kvm_get_pa_bits(vcpu->kvm)))
+ goto addrsz;
+
+ wr->pa = va;
+ return 0;
+ }
+
+ wi->be = sctlr & SCTLR_ELx_EE;
+
+ wi->hpd = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HPDS, IMP);
+ wi->hpd &= (wi->regime == TR_EL2 ?
+ FIELD_GET(TCR_EL2_HPD, tcr) :
+ (va55 ?
+ FIELD_GET(TCR_HPD1, tcr) :
+ FIELD_GET(TCR_HPD0, tcr)));
+ /* R_JHSVW */
+ wi->hpd |= s1pie_enabled(vcpu, wi->regime);
+
+ /* Do we have POE? */
+ compute_s1poe(vcpu, wi);
+
+ /* R_BVXDG */
+ wi->hpd |= (wi->poe || wi->e0poe);
+
+ /* R_PLCGL, R_YXNYW */
+ if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR2_EL1, ST, 48_47)) {
+ if (wi->txsz > 39)
+ goto transfault;
+ } else {
+ if (wi->txsz > 48 || (BIT(wi->pgshift) == SZ_64K && wi->txsz > 47))
+ goto transfault;
+ }
+
+ /* R_GTJBY, R_SXWGM */
+ switch (BIT(wi->pgshift)) {
+ case SZ_4K:
+ case SZ_16K:
+ lva = wi->pa52bit;
+ break;
+ case SZ_64K:
+ lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, VARange, 52);
+ break;
+ }
+
+ if ((lva && wi->txsz < 12) || (!lva && wi->txsz < 16))
+ goto transfault;
+
+ /* R_YYVYV, I_THCZK */
+ if ((!va55 && va > GENMASK(ia_bits - 1, 0)) ||
+ (va55 && va < GENMASK(63, ia_bits)))
+ goto transfault;
+
+ /* I_ZFSYQ */
+ if (wi->regime != TR_EL2 &&
+ (tcr & (va55 ? TCR_EPD1_MASK : TCR_EPD0_MASK)))
+ goto transfault;
+
+ /* R_BNDVG and following statements */
+ if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, E0PD, IMP) &&
+ wi->as_el0 && (tcr & (va55 ? TCR_E0PD1 : TCR_E0PD0)))
+ goto transfault;
+
+ ps = (wi->regime == TR_EL2 ?
+ FIELD_GET(TCR_EL2_PS_MASK, tcr) : FIELD_GET(TCR_IPS_MASK, tcr));
+
+ wi->max_oa_bits = min(get_kvm_ipa_limit(), ps_to_output_size(ps, wi->pa52bit));
+
+ /* Compute minimal alignment */
+ x = 3 + ia_bits - ((3 - wi->sl) * stride + wi->pgshift);
+
+ wi->baddr = ttbr & TTBRx_EL1_BADDR;
+ if (wi->pa52bit) {
+ /*
+ * Force the alignment on 64 bytes for top-level tables
+ * smaller than 8 entries, since TTBR.BADDR[5:2] are used to
+ * store bits [51:48] of the first level of lookup.
+ */
+ x = max(x, 6);
+
+ wi->baddr |= FIELD_GET(GENMASK_ULL(5, 2), ttbr) << 48;
+ }
+
+ /* R_VPBBF */
+ if (check_output_size(wi->baddr, wi))
+ goto addrsz;
+
+ wi->baddr &= GENMASK_ULL(wi->max_oa_bits - 1, x);
+
+ wi->ha = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HAFDBS, AF);
+ wi->ha &= (wi->regime == TR_EL2 ?
+ FIELD_GET(TCR_EL2_HA, tcr) :
+ FIELD_GET(TCR_HA, tcr));
+
+ return 0;
+
+addrsz:
+ /*
+ * Address Size Fault level 0 to indicate it comes from TTBR.
+ * yes, this is an oddity.
+ */
+ fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(0), false);
+ return -EFAULT;
+
+transfault:
+ /* Translation Fault on start level */
+ fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(wi->sl), false);
+ return -EFAULT;
+}
+
+static int kvm_read_s1_desc(struct kvm_vcpu *vcpu, u64 pa, u64 *desc,
+ struct s1_walk_info *wi)
+{
+ u64 val;
+ int r;
+
+ r = kvm_read_guest(vcpu->kvm, pa, &val, sizeof(val));
+ if (r)
+ return r;
+
+ if (wi->be)
+ *desc = be64_to_cpu((__force __be64)val);
+ else
+ *desc = le64_to_cpu((__force __le64)val);
+
+ return 0;
+}
+
+static int kvm_swap_s1_desc(struct kvm_vcpu *vcpu, u64 pa, u64 old, u64 new,
+ struct s1_walk_info *wi)
+{
+ if (wi->be) {
+ old = (__force u64)cpu_to_be64(old);
+ new = (__force u64)cpu_to_be64(new);
+ } else {
+ old = (__force u64)cpu_to_le64(old);
+ new = (__force u64)cpu_to_le64(new);
+ }
+
+ return __kvm_at_swap_desc(vcpu->kvm, pa, old, new);
+}
+
+static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
+ struct s1_walk_result *wr, u64 va)
+{
+ u64 va_top, va_bottom, baddr, desc, new_desc, ipa;
+ int level, stride, ret;
+
+ level = wi->sl;
+ stride = wi->pgshift - 3;
+ baddr = wi->baddr;
+
+ va_top = get_ia_size(wi) - 1;
+
+ while (1) {
+ u64 index;
+
+ va_bottom = (3 - level) * stride + wi->pgshift;
+ index = (va & GENMASK_ULL(va_top, va_bottom)) >> (va_bottom - 3);
+
+ ipa = baddr | index;
+
+ if (wi->s2) {
+ struct kvm_s2_trans s2_trans = {};
+
+ ret = kvm_walk_nested_s2(vcpu, ipa, &s2_trans);
+ if (ret) {
+ fail_s1_walk(wr,
+ (s2_trans.esr & ~ESR_ELx_FSC_LEVEL) | level,
+ true);
+ return ret;
+ }
+
+ if (!kvm_s2_trans_readable(&s2_trans)) {
+ fail_s1_walk(wr, ESR_ELx_FSC_PERM_L(level),
+ true);
+
+ return -EPERM;
+ }
+
+ ipa = kvm_s2_trans_output(&s2_trans);
+ }
+
+ if (wi->filter) {
+ ret = wi->filter->fn(&(struct s1_walk_context)
+ {
+ .wi = wi,
+ .table_ipa = baddr,
+ .level = level,
+ }, wi->filter->priv);
+ if (ret)
+ return ret;
+ }
+
+ ret = kvm_read_s1_desc(vcpu, ipa, &desc, wi);
+ if (ret) {
+ fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level), false);
+ return ret;
+ }
+
+ new_desc = desc;
+
+ /* Invalid descriptor */
+ if (!(desc & BIT(0)))
+ goto transfault;
+
+ /* Block mapping, check validity down the line */
+ if (!(desc & BIT(1)))
+ break;
+
+ /* Page mapping */
+ if (level == 3)
+ break;
+
+ /* Table handling */
+ if (!wi->hpd) {
+ wr->APTable |= FIELD_GET(S1_TABLE_AP, desc);
+ wr->UXNTable |= FIELD_GET(PMD_TABLE_UXN, desc);
+ wr->PXNTable |= FIELD_GET(PMD_TABLE_PXN, desc);
+ }
+
+ baddr = desc_to_oa(wi, desc);
+
+ /* Check for out-of-range OA */
+ if (check_output_size(baddr, wi))
+ goto addrsz;
+
+ /* Prepare for next round */
+ va_top = va_bottom - 1;
+ level++;
+ }
+
+ /* Block mapping, check the validity of the level */
+ if (!(desc & BIT(1))) {
+ bool valid_block = false;
+
+ switch (BIT(wi->pgshift)) {
+ case SZ_4K:
+ valid_block = level == 1 || level == 2 || (wi->pa52bit && level == 0);
+ break;
+ case SZ_16K:
+ case SZ_64K:
+ valid_block = level == 2 || (wi->pa52bit && level == 1);
+ break;
+ }
+
+ if (!valid_block)
+ goto transfault;
+ }
+
+ baddr = desc_to_oa(wi, desc);
+ if (check_output_size(baddr & GENMASK(52, va_bottom), wi))
+ goto addrsz;
+
+ if (wi->ha)
+ new_desc |= PTE_AF;
+
+ if (new_desc != desc) {
+ ret = kvm_swap_s1_desc(vcpu, ipa, desc, new_desc, wi);
+ if (ret)
+ return ret;
+
+ desc = new_desc;
+ }
+
+ if (!(desc & PTE_AF)) {
+ fail_s1_walk(wr, ESR_ELx_FSC_ACCESS_L(level), false);
+ return -EACCES;
+ }
+
+ va_bottom += contiguous_bit_shift(desc, wi, level);
+
+ wr->failed = false;
+ wr->level = level;
+ wr->desc = desc;
+ wr->pa = baddr & GENMASK(52, va_bottom);
+ wr->pa |= va & GENMASK_ULL(va_bottom - 1, 0);
+
+ wr->nG = (wi->regime != TR_EL2) && (desc & PTE_NG);
+ if (wr->nG) {
+ u64 asid_ttbr, tcr;
+
+ switch (wi->regime) {
+ case TR_EL10:
+ tcr = vcpu_read_sys_reg(vcpu, TCR_EL1);
+ asid_ttbr = ((tcr & TCR_A1) ?
+ vcpu_read_sys_reg(vcpu, TTBR1_EL1) :
+ vcpu_read_sys_reg(vcpu, TTBR0_EL1));
+ break;
+ case TR_EL20:
+ tcr = vcpu_read_sys_reg(vcpu, TCR_EL2);
+ asid_ttbr = ((tcr & TCR_A1) ?
+ vcpu_read_sys_reg(vcpu, TTBR1_EL2) :
+ vcpu_read_sys_reg(vcpu, TTBR0_EL2));
+ break;
+ default:
+ BUG();
+ }
+
+ wr->asid = FIELD_GET(TTBR_ASID_MASK, asid_ttbr);
+ if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, ASIDBITS, 16) ||
+ !(tcr & TCR_ASID16))
+ wr->asid &= GENMASK(7, 0);
+ }
+
+ return 0;
+
+addrsz:
+ fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(level), false);
+ return -EINVAL;
+transfault:
+ fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(level), false);
+ return -ENOENT;
+}
+
+struct mmu_config {
+ u64 ttbr0;
+ u64 ttbr1;
+ u64 tcr;
+ u64 mair;
+ u64 tcr2;
+ u64 pir;
+ u64 pire0;
+ u64 por_el0;
+ u64 por_el1;
+ u64 sctlr;
+ u64 vttbr;
+ u64 vtcr;
+};
+
+static void __mmu_config_save(struct mmu_config *config)
+{
+ config->ttbr0 = read_sysreg_el1(SYS_TTBR0);
+ config->ttbr1 = read_sysreg_el1(SYS_TTBR1);
+ config->tcr = read_sysreg_el1(SYS_TCR);
+ config->mair = read_sysreg_el1(SYS_MAIR);
+ if (cpus_have_final_cap(ARM64_HAS_TCR2)) {
+ config->tcr2 = read_sysreg_el1(SYS_TCR2);
+ if (cpus_have_final_cap(ARM64_HAS_S1PIE)) {
+ config->pir = read_sysreg_el1(SYS_PIR);
+ config->pire0 = read_sysreg_el1(SYS_PIRE0);
+ }
+ if (system_supports_poe()) {
+ config->por_el1 = read_sysreg_el1(SYS_POR);
+ config->por_el0 = read_sysreg_s(SYS_POR_EL0);
+ }
+ }
+ config->sctlr = read_sysreg_el1(SYS_SCTLR);
+ config->vttbr = read_sysreg(vttbr_el2);
+ config->vtcr = read_sysreg(vtcr_el2);
+}
+
+static void __mmu_config_restore(struct mmu_config *config)
+{
+ /*
+ * ARM errata 1165522 and 1530923 require TGE to be 1 before
+ * we update the guest state.
+ */
+ asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT));
+
+ write_sysreg_el1(config->ttbr0, SYS_TTBR0);
+ write_sysreg_el1(config->ttbr1, SYS_TTBR1);
+ write_sysreg_el1(config->tcr, SYS_TCR);
+ write_sysreg_el1(config->mair, SYS_MAIR);
+ if (cpus_have_final_cap(ARM64_HAS_TCR2)) {
+ write_sysreg_el1(config->tcr2, SYS_TCR2);
+ if (cpus_have_final_cap(ARM64_HAS_S1PIE)) {
+ write_sysreg_el1(config->pir, SYS_PIR);
+ write_sysreg_el1(config->pire0, SYS_PIRE0);
+ }
+ if (system_supports_poe()) {
+ write_sysreg_el1(config->por_el1, SYS_POR);
+ write_sysreg_s(config->por_el0, SYS_POR_EL0);
+ }
+ }
+ write_sysreg_el1(config->sctlr, SYS_SCTLR);
+ write_sysreg(config->vttbr, vttbr_el2);
+ write_sysreg(config->vtcr, vtcr_el2);
+}
+
+static bool at_s1e1p_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
+{
+ u64 host_pan;
+ bool fail;
+
+ host_pan = read_sysreg_s(SYS_PSTATE_PAN);
+ write_sysreg_s(*vcpu_cpsr(vcpu) & PSTATE_PAN, SYS_PSTATE_PAN);
+
+ switch (op) {
+ case OP_AT_S1E1RP:
+ fail = __kvm_at(OP_AT_S1E1RP, vaddr);
+ break;
+ case OP_AT_S1E1WP:
+ fail = __kvm_at(OP_AT_S1E1WP, vaddr);
+ break;
+ }
+
+ write_sysreg_s(host_pan, SYS_PSTATE_PAN);
+
+ return fail;
+}
+
+#define MEMATTR(ic, oc) (MEMATTR_##oc << 4 | MEMATTR_##ic)
+#define MEMATTR_NC 0b0100
+#define MEMATTR_Wt 0b1000
+#define MEMATTR_Wb 0b1100
+#define MEMATTR_WbRaWa 0b1111
+
+#define MEMATTR_IS_DEVICE(m) (((m) & GENMASK(7, 4)) == 0)
+
+static u8 s2_memattr_to_attr(u8 memattr)
+{
+ memattr &= 0b1111;
+
+ switch (memattr) {
+ case 0b0000:
+ case 0b0001:
+ case 0b0010:
+ case 0b0011:
+ return memattr << 2;
+ case 0b0100:
+ return MEMATTR(Wb, Wb);
+ case 0b0101:
+ return MEMATTR(NC, NC);
+ case 0b0110:
+ return MEMATTR(Wt, NC);
+ case 0b0111:
+ return MEMATTR(Wb, NC);
+ case 0b1000:
+ /* Reserved, assume NC */
+ return MEMATTR(NC, NC);
+ case 0b1001:
+ return MEMATTR(NC, Wt);
+ case 0b1010:
+ return MEMATTR(Wt, Wt);
+ case 0b1011:
+ return MEMATTR(Wb, Wt);
+ case 0b1100:
+ /* Reserved, assume NC */
+ return MEMATTR(NC, NC);
+ case 0b1101:
+ return MEMATTR(NC, Wb);
+ case 0b1110:
+ return MEMATTR(Wt, Wb);
+ case 0b1111:
+ return MEMATTR(Wb, Wb);
+ default:
+ unreachable();
+ }
+}
+
+static u8 combine_s1_s2_attr(u8 s1, u8 s2)
+{
+ bool transient;
+ u8 final = 0;
+
+ /* Upgrade transient s1 to non-transient to simplify things */
+ switch (s1) {
+ case 0b0001 ... 0b0011: /* Normal, Write-Through Transient */
+ transient = true;
+ s1 = MEMATTR_Wt | (s1 & GENMASK(1,0));
+ break;
+ case 0b0101 ... 0b0111: /* Normal, Write-Back Transient */
+ transient = true;
+ s1 = MEMATTR_Wb | (s1 & GENMASK(1,0));
+ break;
+ default:
+ transient = false;
+ }
+
+ /* S2CombineS1AttrHints() */
+ if ((s1 & GENMASK(3, 2)) == MEMATTR_NC ||
+ (s2 & GENMASK(3, 2)) == MEMATTR_NC)
+ final = MEMATTR_NC;
+ else if ((s1 & GENMASK(3, 2)) == MEMATTR_Wt ||
+ (s2 & GENMASK(3, 2)) == MEMATTR_Wt)
+ final = MEMATTR_Wt;
+ else
+ final = MEMATTR_Wb;
+
+ if (final != MEMATTR_NC) {
+ /* Inherit RaWa hints form S1 */
+ if (transient) {
+ switch (s1 & GENMASK(3, 2)) {
+ case MEMATTR_Wt:
+ final = 0;
+ break;
+ case MEMATTR_Wb:
+ final = MEMATTR_NC;
+ break;
+ }
+ }
+
+ final |= s1 & GENMASK(1, 0);
+ }
+
+ return final;
+}
+
+#define ATTR_NSH 0b00
+#define ATTR_RSV 0b01
+#define ATTR_OSH 0b10
+#define ATTR_ISH 0b11
+
+static u8 compute_final_sh(u8 attr, u8 sh)
+{
+ /* Any form of device, as well as NC has SH[1:0]=0b10 */
+ if (MEMATTR_IS_DEVICE(attr) || attr == MEMATTR(NC, NC))
+ return ATTR_OSH;
+
+ if (sh == ATTR_RSV) /* Reserved, mapped to NSH */
+ sh = ATTR_NSH;
+
+ return sh;
+}
+
+static u8 compute_s1_sh(struct s1_walk_info *wi, struct s1_walk_result *wr,
+ u8 attr)
+{
+ u8 sh;
+
+ /*
+ * non-52bit and LPA have their basic shareability described in the
+ * descriptor. LPA2 gets it from the corresponding field in TCR,
+ * conveniently recorded in the walk info.
+ */
+ if (!wi->pa52bit || BIT(wi->pgshift) == SZ_64K)
+ sh = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S1_SH, wr->desc);
+ else
+ sh = wi->sh;
+
+ return compute_final_sh(attr, sh);
+}
+
+static u8 combine_sh(u8 s1_sh, u8 s2_sh)
+{
+ if (s1_sh == ATTR_OSH || s2_sh == ATTR_OSH)
+ return ATTR_OSH;
+ if (s1_sh == ATTR_ISH || s2_sh == ATTR_ISH)
+ return ATTR_ISH;
+
+ return ATTR_NSH;
+}
+
+static u64 compute_par_s12(struct kvm_vcpu *vcpu, u64 s1_par,
+ struct kvm_s2_trans *tr)
+{
+ u8 s1_parattr, s2_memattr, final_attr, s2_sh;
+ u64 par;
+
+ /* If S2 has failed to translate, report the damage */
+ if (tr->esr) {
+ par = SYS_PAR_EL1_RES1;
+ par |= SYS_PAR_EL1_F;
+ par |= SYS_PAR_EL1_S;
+ par |= FIELD_PREP(SYS_PAR_EL1_FST, tr->esr);
+ return par;
+ }
+
+ s1_parattr = FIELD_GET(SYS_PAR_EL1_ATTR, s1_par);
+ s2_memattr = FIELD_GET(GENMASK(5, 2), tr->desc);
+
+ if (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_FWB) {
+ if (!kvm_has_feat(vcpu->kvm, ID_AA64PFR2_EL1, MTEPERM, IMP))
+ s2_memattr &= ~BIT(3);
+
+ /* Combination of R_VRJSW and R_RHWZM */
+ switch (s2_memattr) {
+ case 0b0101:
+ if (MEMATTR_IS_DEVICE(s1_parattr))
+ final_attr = s1_parattr;
+ else
+ final_attr = MEMATTR(NC, NC);
+ break;
+ case 0b0110:
+ case 0b1110:
+ final_attr = MEMATTR(WbRaWa, WbRaWa);
+ break;
+ case 0b0111:
+ case 0b1111:
+ /* Preserve S1 attribute */
+ final_attr = s1_parattr;
+ break;
+ case 0b0100:
+ case 0b1100:
+ case 0b1101:
+ /* Reserved, do something non-silly */
+ final_attr = s1_parattr;
+ break;
+ default:
+ /*
+ * MemAttr[2]=0, Device from S2.
+ *
+ * FWB does not influence the way that stage 1
+ * memory types and attributes are combined
+ * with stage 2 Device type and attributes.
+ */
+ final_attr = min(s2_memattr_to_attr(s2_memattr),
+ s1_parattr);
+ }
+ } else {
+ /* Combination of R_HMNDG, R_TNHFM and R_GQFSF */
+ u8 s2_parattr = s2_memattr_to_attr(s2_memattr);
+
+ if (MEMATTR_IS_DEVICE(s1_parattr) ||
+ MEMATTR_IS_DEVICE(s2_parattr)) {
+ final_attr = min(s1_parattr, s2_parattr);
+ } else {
+ /* At this stage, this is memory vs memory */
+ final_attr = combine_s1_s2_attr(s1_parattr & 0xf,
+ s2_parattr & 0xf);
+ final_attr |= combine_s1_s2_attr(s1_parattr >> 4,
+ s2_parattr >> 4) << 4;
+ }
+ }
+
+ if ((__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_CD) &&
+ !MEMATTR_IS_DEVICE(final_attr))
+ final_attr = MEMATTR(NC, NC);
+
+ s2_sh = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S2_SH, tr->desc);
+
+ par = FIELD_PREP(SYS_PAR_EL1_ATTR, final_attr);
+ par |= tr->output & GENMASK(47, 12);
+ par |= FIELD_PREP(SYS_PAR_EL1_SH,
+ combine_sh(FIELD_GET(SYS_PAR_EL1_SH, s1_par),
+ compute_final_sh(final_attr, s2_sh)));
+
+ return par;
+}
+
+static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
+ struct s1_walk_result *wr)
+{
+ u64 par;
+
+ if (wr->failed) {
+ par = SYS_PAR_EL1_RES1;
+ par |= SYS_PAR_EL1_F;
+ par |= FIELD_PREP(SYS_PAR_EL1_FST, wr->fst);
+ par |= wr->ptw ? SYS_PAR_EL1_PTW : 0;
+ par |= wr->s2 ? SYS_PAR_EL1_S : 0;
+ } else if (wr->level == S1_MMU_DISABLED) {
+ /* MMU off or HCR_EL2.DC == 1 */
+ par = SYS_PAR_EL1_NSE;
+ par |= wr->pa & SYS_PAR_EL1_PA;
+
+ if (wi->regime == TR_EL10 && vcpu_has_nv(vcpu) &&
+ (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) {
+ par |= FIELD_PREP(SYS_PAR_EL1_ATTR,
+ MEMATTR(WbRaWa, WbRaWa));
+ par |= FIELD_PREP(SYS_PAR_EL1_SH, ATTR_NSH);
+ } else {
+ par |= FIELD_PREP(SYS_PAR_EL1_ATTR, 0); /* nGnRnE */
+ par |= FIELD_PREP(SYS_PAR_EL1_SH, ATTR_OSH);
+ }
+ } else {
+ u64 mair, sctlr;
+ u8 sh;
+
+ par = SYS_PAR_EL1_NSE;
+
+ mair = (wi->regime == TR_EL10 ?
+ vcpu_read_sys_reg(vcpu, MAIR_EL1) :
+ vcpu_read_sys_reg(vcpu, MAIR_EL2));
+
+ mair >>= FIELD_GET(PTE_ATTRINDX_MASK, wr->desc) * 8;
+ mair &= 0xff;
+
+ sctlr = (wi->regime == TR_EL10 ?
+ vcpu_read_sys_reg(vcpu, SCTLR_EL1) :
+ vcpu_read_sys_reg(vcpu, SCTLR_EL2));
+
+ /* Force NC for memory if SCTLR_ELx.C is clear */
+ if (!(sctlr & SCTLR_EL1_C) && !MEMATTR_IS_DEVICE(mair))
+ mair = MEMATTR(NC, NC);
+
+ par |= FIELD_PREP(SYS_PAR_EL1_ATTR, mair);
+ par |= wr->pa & SYS_PAR_EL1_PA;
+
+ sh = compute_s1_sh(wi, wr, mair);
+ par |= FIELD_PREP(SYS_PAR_EL1_SH, sh);
+ }
+
+ return par;
+}
+
+static bool pan3_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime)
+{
+ u64 sctlr;
+
+ if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, PAN, PAN3))
+ return false;
+
+ if (s1pie_enabled(vcpu, regime))
+ return true;
+
+ if (regime == TR_EL10)
+ sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1);
+ else
+ sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2);
+
+ return sctlr & SCTLR_EL1_EPAN;
+}
+
+static void compute_s1_direct_permissions(struct kvm_vcpu *vcpu,
+ struct s1_walk_info *wi,
+ struct s1_walk_result *wr)
+{
+ bool wxn;
+
+ /* Non-hierarchical part of AArch64.S1DirectBasePermissions() */
+ if (wi->regime != TR_EL2) {
+ switch (FIELD_GET(PTE_USER | PTE_RDONLY, wr->desc)) {
+ case 0b00:
+ wr->pr = wr->pw = true;
+ wr->ur = wr->uw = false;
+ break;
+ case 0b01:
+ wr->pr = wr->pw = wr->ur = wr->uw = true;
+ break;
+ case 0b10:
+ wr->pr = true;
+ wr->pw = wr->ur = wr->uw = false;
+ break;
+ case 0b11:
+ wr->pr = wr->ur = true;
+ wr->pw = wr->uw = false;
+ break;
+ }
+
+ /* We don't use px for anything yet, but hey... */
+ wr->px = !((wr->desc & PTE_PXN) || wr->uw);
+ wr->ux = !(wr->desc & PTE_UXN);
+ } else {
+ wr->ur = wr->uw = wr->ux = false;
+
+ if (!(wr->desc & PTE_RDONLY)) {
+ wr->pr = wr->pw = true;
+ } else {
+ wr->pr = true;
+ wr->pw = false;
+ }
+
+ /* XN maps to UXN */
+ wr->px = !(wr->desc & PTE_UXN);
+ }
+
+ switch (wi->regime) {
+ case TR_EL2:
+ case TR_EL20:
+ wxn = (vcpu_read_sys_reg(vcpu, SCTLR_EL2) & SCTLR_ELx_WXN);
+ break;
+ case TR_EL10:
+ wxn = (vcpu_read_sys_reg(vcpu, SCTLR_EL1) & SCTLR_ELx_WXN);
+ break;
+ }
+
+ wr->pwxn = wr->uwxn = wxn;
+ wr->pov = wi->poe;
+ wr->uov = wi->e0poe;
+}
+
+static void compute_s1_hierarchical_permissions(struct kvm_vcpu *vcpu,
+ struct s1_walk_info *wi,
+ struct s1_walk_result *wr)
+{
+ /* Hierarchical part of AArch64.S1DirectBasePermissions() */
+ if (wi->regime != TR_EL2) {
+ switch (wr->APTable) {
+ case 0b00:
+ break;
+ case 0b01:
+ wr->ur = wr->uw = false;
+ break;
+ case 0b10:
+ wr->pw = wr->uw = false;
+ break;
+ case 0b11:
+ wr->pw = wr->ur = wr->uw = false;
+ break;
+ }
+
+ wr->px &= !wr->PXNTable;
+ wr->ux &= !wr->UXNTable;
+ } else {
+ if (wr->APTable & BIT(1))
+ wr->pw = false;
+
+ /* XN maps to UXN */
+ wr->px &= !wr->UXNTable;
+ }
+}
+
+#define perm_idx(v, r, i) ((vcpu_read_sys_reg((v), (r)) >> ((i) * 4)) & 0xf)
+
+#define set_priv_perms(wr, r, w, x) \
+ do { \
+ (wr)->pr = (r); \
+ (wr)->pw = (w); \
+ (wr)->px = (x); \
+ } while (0)
+
+#define set_unpriv_perms(wr, r, w, x) \
+ do { \
+ (wr)->ur = (r); \
+ (wr)->uw = (w); \
+ (wr)->ux = (x); \
+ } while (0)
+
+#define set_priv_wxn(wr, v) \
+ do { \
+ (wr)->pwxn = (v); \
+ } while (0)
+
+#define set_unpriv_wxn(wr, v) \
+ do { \
+ (wr)->uwxn = (v); \
+ } while (0)
+
+/* Similar to AArch64.S1IndirectBasePermissions(), without GCS */
+#define set_perms(w, wr, ip) \
+ do { \
+ /* R_LLZDZ */ \
+ switch ((ip)) { \
+ case 0b0000: \
+ set_ ## w ## _perms((wr), false, false, false); \
+ break; \
+ case 0b0001: \
+ set_ ## w ## _perms((wr), true , false, false); \
+ break; \
+ case 0b0010: \
+ set_ ## w ## _perms((wr), false, false, true ); \
+ break; \
+ case 0b0011: \
+ set_ ## w ## _perms((wr), true , false, true ); \
+ break; \
+ case 0b0100: \
+ set_ ## w ## _perms((wr), false, false, false); \
+ break; \
+ case 0b0101: \
+ set_ ## w ## _perms((wr), true , true , false); \
+ break; \
+ case 0b0110: \
+ set_ ## w ## _perms((wr), true , true , true ); \
+ break; \
+ case 0b0111: \
+ set_ ## w ## _perms((wr), true , true , true ); \
+ break; \
+ case 0b1000: \
+ set_ ## w ## _perms((wr), true , false, false); \
+ break; \
+ case 0b1001: \
+ set_ ## w ## _perms((wr), true , false, false); \
+ break; \
+ case 0b1010: \
+ set_ ## w ## _perms((wr), true , false, true ); \
+ break; \
+ case 0b1011: \
+ set_ ## w ## _perms((wr), false, false, false); \
+ break; \
+ case 0b1100: \
+ set_ ## w ## _perms((wr), true , true , false); \
+ break; \
+ case 0b1101: \
+ set_ ## w ## _perms((wr), false, false, false); \
+ break; \
+ case 0b1110: \
+ set_ ## w ## _perms((wr), true , true , true ); \
+ break; \
+ case 0b1111: \
+ set_ ## w ## _perms((wr), false, false, false); \
+ break; \
+ } \
+ \
+ /* R_HJYGR */ \
+ set_ ## w ## _wxn((wr), ((ip) == 0b0110)); \
+ \
+ } while (0)
+
+static void compute_s1_indirect_permissions(struct kvm_vcpu *vcpu,
+ struct s1_walk_info *wi,
+ struct s1_walk_result *wr)
+{
+ u8 up, pp, idx;
+
+ idx = pte_pi_index(wr->desc);
+
+ switch (wi->regime) {
+ case TR_EL10:
+ pp = perm_idx(vcpu, PIR_EL1, idx);
+ up = perm_idx(vcpu, PIRE0_EL1, idx);
+ break;
+ case TR_EL20:
+ pp = perm_idx(vcpu, PIR_EL2, idx);
+ up = perm_idx(vcpu, PIRE0_EL2, idx);
+ break;
+ case TR_EL2:
+ pp = perm_idx(vcpu, PIR_EL2, idx);
+ up = 0;
+ break;
+ }
+
+ set_perms(priv, wr, pp);
+
+ if (wi->regime != TR_EL2)
+ set_perms(unpriv, wr, up);
+ else
+ set_unpriv_perms(wr, false, false, false);
+
+ wr->pov = wi->poe && !(pp & BIT(3));
+ wr->uov = wi->e0poe && !(up & BIT(3));
+
+ /* R_VFPJF */
+ if (wr->px && wr->uw) {
+ set_priv_perms(wr, false, false, false);
+ set_unpriv_perms(wr, false, false, false);
+ }
+}
+
+static void compute_s1_overlay_permissions(struct kvm_vcpu *vcpu,
+ struct s1_walk_info *wi,
+ struct s1_walk_result *wr)
+{
+ u8 idx, pov_perms, uov_perms;
+
+ idx = FIELD_GET(PTE_PO_IDX_MASK, wr->desc);
+
+ if (wr->pov) {
+ switch (wi->regime) {
+ case TR_EL10:
+ pov_perms = perm_idx(vcpu, POR_EL1, idx);
+ break;
+ case TR_EL20:
+ pov_perms = perm_idx(vcpu, POR_EL2, idx);
+ break;
+ case TR_EL2:
+ pov_perms = perm_idx(vcpu, POR_EL2, idx);
+ break;
+ }
+
+ if (pov_perms & ~POE_RWX)
+ pov_perms = POE_NONE;
+
+ /* R_QXXPC, S1PrivOverflow enabled */
+ if (wr->pwxn && (pov_perms & POE_X))
+ pov_perms &= ~POE_W;
+
+ wr->pr &= pov_perms & POE_R;
+ wr->pw &= pov_perms & POE_W;
+ wr->px &= pov_perms & POE_X;
+ }
+
+ if (wr->uov) {
+ switch (wi->regime) {
+ case TR_EL10:
+ uov_perms = perm_idx(vcpu, POR_EL0, idx);
+ break;
+ case TR_EL20:
+ uov_perms = perm_idx(vcpu, POR_EL0, idx);
+ break;
+ case TR_EL2:
+ uov_perms = 0;
+ break;
+ }
+
+ if (uov_perms & ~POE_RWX)
+ uov_perms = POE_NONE;
+
+ /* R_NPBXC, S1UnprivOverlay enabled */
+ if (wr->uwxn && (uov_perms & POE_X))
+ uov_perms &= ~POE_W;
+
+ wr->ur &= uov_perms & POE_R;
+ wr->uw &= uov_perms & POE_W;
+ wr->ux &= uov_perms & POE_X;
+ }
+}
+
+static void compute_s1_permissions(struct kvm_vcpu *vcpu,
+ struct s1_walk_info *wi,
+ struct s1_walk_result *wr)
+{
+ bool pan;
+
+ if (!s1pie_enabled(vcpu, wi->regime))
+ compute_s1_direct_permissions(vcpu, wi, wr);
+ else
+ compute_s1_indirect_permissions(vcpu, wi, wr);
+
+ if (!wi->hpd)
+ compute_s1_hierarchical_permissions(vcpu, wi, wr);
+
+ compute_s1_overlay_permissions(vcpu, wi, wr);
+
+ /* R_QXXPC, S1PrivOverlay disabled */
+ if (!wr->pov)
+ wr->px &= !(wr->pwxn && wr->pw);
+
+ /* R_NPBXC, S1UnprivOverlay disabled */
+ if (!wr->uov)
+ wr->ux &= !(wr->uwxn && wr->uw);
+
+ pan = wi->pan && (wr->ur || wr->uw ||
+ (pan3_enabled(vcpu, wi->regime) && wr->ux));
+ wr->pw &= !pan;
+ wr->pr &= !pan;
+}
+
+static int handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr, u64 *par)
+{
+ struct s1_walk_result wr = {};
+ struct s1_walk_info wi = {};
+ bool perm_fail = false;
+ int ret, idx;
+
+ wi.regime = compute_translation_regime(vcpu, op);
+ wi.as_el0 = (op == OP_AT_S1E0R || op == OP_AT_S1E0W);
+ wi.pan = (op == OP_AT_S1E1RP || op == OP_AT_S1E1WP) &&
+ (*vcpu_cpsr(vcpu) & PSR_PAN_BIT);
+
+ ret = setup_s1_walk(vcpu, &wi, &wr, vaddr);
+ if (ret)
+ goto compute_par;
+
+ if (wr.level == S1_MMU_DISABLED)
+ goto compute_par;
+
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+ ret = walk_s1(vcpu, &wi, &wr, vaddr);
+
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+ /*
+ * Race to update a descriptor -- restart the walk.
+ */
+ if (ret == -EAGAIN)
+ return ret;
+ if (ret)
+ goto compute_par;
+
+ compute_s1_permissions(vcpu, &wi, &wr);
+
+ switch (op) {
+ case OP_AT_S1E1RP:
+ case OP_AT_S1E1R:
+ case OP_AT_S1E2R:
+ perm_fail = !wr.pr;
+ break;
+ case OP_AT_S1E1WP:
+ case OP_AT_S1E1W:
+ case OP_AT_S1E2W:
+ perm_fail = !wr.pw;
+ break;
+ case OP_AT_S1E0R:
+ perm_fail = !wr.ur;
+ break;
+ case OP_AT_S1E0W:
+ perm_fail = !wr.uw;
+ break;
+ case OP_AT_S1E1A:
+ case OP_AT_S1E2A:
+ break;
+ default:
+ BUG();
+ }
+
+ if (perm_fail)
+ fail_s1_walk(&wr, ESR_ELx_FSC_PERM_L(wr.level), false);
+
+compute_par:
+ *par = compute_par_s1(vcpu, &wi, &wr);
+ return 0;
+}
+
+/*
+ * Return the PAR_EL1 value as the result of a valid translation.
+ *
+ * If the translation is unsuccessful, the value may only contain
+ * PAR_EL1.F, and cannot be taken at face value. It isn't an
+ * indication of the translation having failed, only that the fast
+ * path did not succeed, *unless* it indicates a S1 permission or
+ * access fault.
+ */
+static u64 __kvm_at_s1e01_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
+{
+ struct mmu_config config;
+ struct kvm_s2_mmu *mmu;
+ bool fail, mmu_cs;
+ u64 par;
+
+ par = SYS_PAR_EL1_F;
+
+ /*
+ * We've trapped, so everything is live on the CPU. As we will
+ * be switching contexts behind everybody's back, disable
+ * interrupts while holding the mmu lock.
+ */
+ guard(write_lock_irqsave)(&vcpu->kvm->mmu_lock);
+
+ /*
+ * If HCR_EL2.{E2H,TGE} == {1,1}, the MMU context is already
+ * the right one (as we trapped from vEL2). If not, save the
+ * full MMU context.
+ *
+ * We are also guaranteed to be in the correct context if
+ * we're not in a nested VM.
+ */
+ mmu_cs = (vcpu_has_nv(vcpu) &&
+ !(vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)));
+ if (!mmu_cs)
+ goto skip_mmu_switch;
+
+ /*
+ * Obtaining the S2 MMU for a L2 is horribly racy, and we may not
+ * find it (recycled by another vcpu, for example). When this
+ * happens, admit defeat immediately and use the SW (slow) path.
+ */
+ mmu = lookup_s2_mmu(vcpu);
+ if (!mmu)
+ return par;
+
+ __mmu_config_save(&config);
+
+ write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR0_EL1), SYS_TTBR0);
+ write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR1_EL1), SYS_TTBR1);
+ write_sysreg_el1(vcpu_read_sys_reg(vcpu, TCR_EL1), SYS_TCR);
+ write_sysreg_el1(vcpu_read_sys_reg(vcpu, MAIR_EL1), SYS_MAIR);
+ if (kvm_has_tcr2(vcpu->kvm)) {
+ write_sysreg_el1(vcpu_read_sys_reg(vcpu, TCR2_EL1), SYS_TCR2);
+ if (kvm_has_s1pie(vcpu->kvm)) {
+ write_sysreg_el1(vcpu_read_sys_reg(vcpu, PIR_EL1), SYS_PIR);
+ write_sysreg_el1(vcpu_read_sys_reg(vcpu, PIRE0_EL1), SYS_PIRE0);
+ }
+ if (kvm_has_s1poe(vcpu->kvm)) {
+ write_sysreg_el1(vcpu_read_sys_reg(vcpu, POR_EL1), SYS_POR);
+ write_sysreg_s(vcpu_read_sys_reg(vcpu, POR_EL0), SYS_POR_EL0);
+ }
+ }
+ write_sysreg_el1(vcpu_read_sys_reg(vcpu, SCTLR_EL1), SYS_SCTLR);
+ __load_stage2(mmu, mmu->arch);
+
+skip_mmu_switch:
+ /* Temporarily switch back to guest context */
+ write_sysreg_hcr(vcpu->arch.hcr_el2);
+ isb();
+
+ switch (op) {
+ case OP_AT_S1E1RP:
+ case OP_AT_S1E1WP:
+ fail = at_s1e1p_fast(vcpu, op, vaddr);
+ break;
+ case OP_AT_S1E1R:
+ fail = __kvm_at(OP_AT_S1E1R, vaddr);
+ break;
+ case OP_AT_S1E1W:
+ fail = __kvm_at(OP_AT_S1E1W, vaddr);
+ break;
+ case OP_AT_S1E0R:
+ fail = __kvm_at(OP_AT_S1E0R, vaddr);
+ break;
+ case OP_AT_S1E0W:
+ fail = __kvm_at(OP_AT_S1E0W, vaddr);
+ break;
+ case OP_AT_S1E1A:
+ fail = __kvm_at(OP_AT_S1E1A, vaddr);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ fail = true;
+ break;
+ }
+
+ if (!fail)
+ par = read_sysreg_par();
+
+ write_sysreg_hcr(HCR_HOST_VHE_FLAGS);
+
+ if (mmu_cs)
+ __mmu_config_restore(&config);
+
+ return par;
+}
+
+static bool par_check_s1_perm_fault(u64 par)
+{
+ u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par);
+
+ return ((fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_PERM &&
+ !(par & SYS_PAR_EL1_S));
+}
+
+static bool par_check_s1_access_fault(u64 par)
+{
+ u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par);
+
+ return ((fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_ACCESS &&
+ !(par & SYS_PAR_EL1_S));
+}
+
+int __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
+{
+ u64 par = __kvm_at_s1e01_fast(vcpu, op, vaddr);
+ int ret;
+
+ /*
+ * If PAR_EL1 reports that AT failed on a S1 permission or access
+ * fault, we know for sure that the PTW was able to walk the S1
+ * tables and there's nothing else to do.
+ *
+ * If AT failed for any other reason, then we must walk the guest S1
+ * to emulate the instruction.
+ */
+ if ((par & SYS_PAR_EL1_F) &&
+ !par_check_s1_perm_fault(par) &&
+ !par_check_s1_access_fault(par)) {
+ ret = handle_at_slow(vcpu, op, vaddr, &par);
+ if (ret)
+ return ret;
+ }
+
+ vcpu_write_sys_reg(vcpu, par, PAR_EL1);
+ return 0;
+}
+
+int __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
+{
+ u64 par;
+ int ret;
+
+ /*
+ * We've trapped, so everything is live on the CPU. As we will be
+ * switching context behind everybody's back, disable interrupts...
+ */
+ scoped_guard(write_lock_irqsave, &vcpu->kvm->mmu_lock) {
+ u64 val, hcr;
+ bool fail;
+
+ val = hcr = read_sysreg(hcr_el2);
+ val &= ~HCR_TGE;
+ val |= HCR_VM;
+
+ if (!vcpu_el2_e2h_is_set(vcpu))
+ val |= HCR_NV | HCR_NV1;
+
+ write_sysreg_hcr(val);
+ isb();
+
+ par = SYS_PAR_EL1_F;
+
+ switch (op) {
+ case OP_AT_S1E2R:
+ fail = __kvm_at(OP_AT_S1E1R, vaddr);
+ break;
+ case OP_AT_S1E2W:
+ fail = __kvm_at(OP_AT_S1E1W, vaddr);
+ break;
+ case OP_AT_S1E2A:
+ fail = __kvm_at(OP_AT_S1E1A, vaddr);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ fail = true;
+ }
+
+ isb();
+
+ if (!fail)
+ par = read_sysreg_par();
+
+ write_sysreg_hcr(hcr);
+ isb();
+ }
+
+ /* We failed the translation, let's replay it in slow motion */
+ if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par)) {
+ ret = handle_at_slow(vcpu, op, vaddr, &par);
+ if (ret)
+ return ret;
+ }
+
+ vcpu_write_sys_reg(vcpu, par, PAR_EL1);
+ return 0;
+}
+
+int __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
+{
+ struct kvm_s2_trans out = {};
+ u64 ipa, par;
+ bool write;
+ int ret;
+
+ /* Do the stage-1 translation */
+ switch (op) {
+ case OP_AT_S12E1R:
+ op = OP_AT_S1E1R;
+ write = false;
+ break;
+ case OP_AT_S12E1W:
+ op = OP_AT_S1E1W;
+ write = true;
+ break;
+ case OP_AT_S12E0R:
+ op = OP_AT_S1E0R;
+ write = false;
+ break;
+ case OP_AT_S12E0W:
+ op = OP_AT_S1E0W;
+ write = true;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return 0;
+ }
+
+ __kvm_at_s1e01(vcpu, op, vaddr);
+ par = vcpu_read_sys_reg(vcpu, PAR_EL1);
+ if (par & SYS_PAR_EL1_F)
+ return 0;
+
+ /*
+ * If we only have a single stage of translation (EL2&0), exit
+ * early. Same thing if {VM,DC}=={0,0}.
+ */
+ if (compute_translation_regime(vcpu, op) == TR_EL20 ||
+ !(vcpu_read_sys_reg(vcpu, HCR_EL2) & (HCR_VM | HCR_DC)))
+ return 0;
+
+ /* Do the stage-2 translation */
+ ipa = (par & GENMASK_ULL(47, 12)) | (vaddr & GENMASK_ULL(11, 0));
+ out.esr = 0;
+ ret = kvm_walk_nested_s2(vcpu, ipa, &out);
+ if (ret < 0)
+ return ret;
+
+ /* Check the access permission */
+ if (!out.esr &&
+ ((!write && !out.readable) || (write && !out.writable)))
+ out.esr = ESR_ELx_FSC_PERM_L(out.level & 0x3);
+
+ par = compute_par_s12(vcpu, par, &out);
+ vcpu_write_sys_reg(vcpu, par, PAR_EL1);
+ return 0;
+}
+
+/*
+ * Translate a VA for a given EL in a given translation regime, with
+ * or without PAN. This requires wi->{regime, as_el0, pan} to be
+ * set. The rest of the wi and wr should be 0-initialised.
+ */
+int __kvm_translate_va(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
+ struct s1_walk_result *wr, u64 va)
+{
+ int ret;
+
+ ret = setup_s1_walk(vcpu, wi, wr, va);
+ if (ret)
+ return ret;
+
+ if (wr->level == S1_MMU_DISABLED) {
+ wr->ur = wr->uw = wr->ux = true;
+ wr->pr = wr->pw = wr->px = true;
+ } else {
+ ret = walk_s1(vcpu, wi, wr, va);
+ if (ret)
+ return ret;
+
+ compute_s1_permissions(vcpu, wi, wr);
+ }
+
+ return 0;
+}
+
+struct desc_match {
+ u64 ipa;
+ int level;
+};
+
+static int match_s1_desc(struct s1_walk_context *ctxt, void *priv)
+{
+ struct desc_match *dm = priv;
+ u64 ipa = dm->ipa;
+
+ /* Use S1 granule alignment */
+ ipa &= GENMASK(51, ctxt->wi->pgshift);
+
+ /* Not the IPA we're looking for? Continue. */
+ if (ipa != ctxt->table_ipa)
+ return 0;
+
+ /* Note the level and interrupt the walk */
+ dm->level = ctxt->level;
+ return -EINTR;
+}
+
+int __kvm_find_s1_desc_level(struct kvm_vcpu *vcpu, u64 va, u64 ipa, int *level)
+{
+ struct desc_match dm = {
+ .ipa = ipa,
+ };
+ struct s1_walk_info wi = {
+ .filter = &(struct s1_walk_filter){
+ .fn = match_s1_desc,
+ .priv = &dm,
+ },
+ .as_el0 = false,
+ .pan = false,
+ };
+ struct s1_walk_result wr = {};
+ int ret;
+
+ if (is_hyp_ctxt(vcpu))
+ wi.regime = vcpu_el2_e2h_is_set(vcpu) ? TR_EL20 : TR_EL2;
+ else
+ wi.regime = TR_EL10;
+
+ ret = setup_s1_walk(vcpu, &wi, &wr, va);
+ if (ret)
+ return ret;
+
+ /* We really expect the S1 MMU to be on here... */
+ if (WARN_ON_ONCE(wr.level == S1_MMU_DISABLED)) {
+ *level = 0;
+ return 0;
+ }
+
+ /* Walk the guest's PT, looking for a match along the way */
+ ret = walk_s1(vcpu, &wi, &wr, va);
+ switch (ret) {
+ case -EINTR:
+ /* We interrupted the walk on a match, return the level */
+ *level = dm.level;
+ return 0;
+ case 0:
+ /* The walk completed, we failed to find the entry */
+ return -ENOENT;
+ default:
+ /* Any other error... */
+ return ret;
+ }
+}
+
+#ifdef CONFIG_ARM64_LSE_ATOMICS
+static int __lse_swap_desc(u64 __user *ptep, u64 old, u64 new)
+{
+ u64 tmp = old;
+ int ret = 0;
+
+ uaccess_enable_privileged();
+
+ asm volatile(__LSE_PREAMBLE
+ "1: cas %[old], %[new], %[addr]\n"
+ "2:\n"
+ _ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w[ret])
+ : [old] "+r" (old), [addr] "+Q" (*ptep), [ret] "+r" (ret)
+ : [new] "r" (new)
+ : "memory");
+
+ uaccess_disable_privileged();
+
+ if (ret)
+ return ret;
+ if (tmp != old)
+ return -EAGAIN;
+
+ return ret;
+}
+#else
+static int __lse_swap_desc(u64 __user *ptep, u64 old, u64 new)
+{
+ return -EINVAL;
+}
+#endif
+
+static int __llsc_swap_desc(u64 __user *ptep, u64 old, u64 new)
+{
+ int ret = 1;
+ u64 tmp;
+
+ uaccess_enable_privileged();
+
+ asm volatile("prfm pstl1strm, %[addr]\n"
+ "1: ldxr %[tmp], %[addr]\n"
+ "sub %[tmp], %[tmp], %[old]\n"
+ "cbnz %[tmp], 3f\n"
+ "2: stlxr %w[ret], %[new], %[addr]\n"
+ "3:\n"
+ _ASM_EXTABLE_UACCESS_ERR(1b, 3b, %w[ret])
+ _ASM_EXTABLE_UACCESS_ERR(2b, 3b, %w[ret])
+ : [ret] "+r" (ret), [addr] "+Q" (*ptep), [tmp] "=&r" (tmp)
+ : [old] "r" (old), [new] "r" (new)
+ : "memory");
+
+ uaccess_disable_privileged();
+
+ /* STLXR didn't update the descriptor, or the compare failed */
+ if (ret == 1)
+ return -EAGAIN;
+
+ return ret;
+}
+
+int __kvm_at_swap_desc(struct kvm *kvm, gpa_t ipa, u64 old, u64 new)
+{
+ struct kvm_memory_slot *slot;
+ unsigned long hva;
+ u64 __user *ptep;
+ bool writable;
+ int offset;
+ gfn_t gfn;
+ int r;
+
+ lockdep_assert(srcu_read_lock_held(&kvm->srcu));
+
+ gfn = ipa >> PAGE_SHIFT;
+ offset = offset_in_page(ipa);
+ slot = gfn_to_memslot(kvm, gfn);
+ hva = gfn_to_hva_memslot_prot(slot, gfn, &writable);
+ if (kvm_is_error_hva(hva))
+ return -EINVAL;
+ if (!writable)
+ return -EPERM;
+
+ ptep = (u64 __user *)hva + offset;
+ if (cpus_have_final_cap(ARM64_HAS_LSE_ATOMICS))
+ r = __lse_swap_desc(ptep, old, new);
+ else
+ r = __llsc_swap_desc(ptep, old, new);
+
+ if (r < 0)
+ return r;
+
+ mark_page_dirty_in_slot(kvm, slot, gfn);
+ return 0;
+}
diff --git a/arch/arm64/kvm/config.c b/arch/arm64/kvm/config.c
new file mode 100644
index 000000000000..24bb3f36e9d5
--- /dev/null
+++ b/arch/arm64/kvm/config.c
@@ -0,0 +1,1520 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2025 Google LLC
+ * Author: Marc Zyngier <maz@kernel.org>
+ */
+
+#include <linux/kvm_host.h>
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_nested.h>
+#include <asm/sysreg.h>
+
+/*
+ * Describes the dependencies between a set of bits (or the negation
+ * of a set of RES0 bits) and a feature. The flags indicate how the
+ * data is interpreted.
+ */
+struct reg_bits_to_feat_map {
+ union {
+ u64 bits;
+ u64 *res0p;
+ };
+
+#define NEVER_FGU BIT(0) /* Can trap, but never UNDEF */
+#define CALL_FUNC BIT(1) /* Needs to evaluate tons of crap */
+#define FIXED_VALUE BIT(2) /* RAZ/WI or RAO/WI in KVM */
+#define RES0_POINTER BIT(3) /* Pointer to RES0 value instead of bits */
+
+ unsigned long flags;
+
+ union {
+ struct {
+ u8 regidx;
+ u8 shift;
+ u8 width;
+ bool sign;
+ s8 lo_lim;
+ };
+ bool (*match)(struct kvm *);
+ bool (*fval)(struct kvm *, u64 *);
+ };
+};
+
+/*
+ * Describes the dependencies for a given register:
+ *
+ * @feat_map describes the dependency for the whole register. If the
+ * features the register depends on are not present, the whole
+ * register is effectively RES0.
+ *
+ * @bit_feat_map describes the dependencies for a set of bits in that
+ * register. If the features these bits depend on are not present, the
+ * bits are effectively RES0.
+ */
+struct reg_feat_map_desc {
+ const char *name;
+ const struct reg_bits_to_feat_map feat_map;
+ const struct reg_bits_to_feat_map *bit_feat_map;
+ const unsigned int bit_feat_map_sz;
+};
+
+#define __NEEDS_FEAT_3(m, f, w, id, fld, lim) \
+ { \
+ .w = (m), \
+ .flags = (f), \
+ .regidx = IDREG_IDX(SYS_ ## id), \
+ .shift = id ##_## fld ## _SHIFT, \
+ .width = id ##_## fld ## _WIDTH, \
+ .sign = id ##_## fld ## _SIGNED, \
+ .lo_lim = id ##_## fld ##_## lim \
+ }
+
+#define __NEEDS_FEAT_2(m, f, w, fun, dummy) \
+ { \
+ .w = (m), \
+ .flags = (f) | CALL_FUNC, \
+ .fval = (fun), \
+ }
+
+#define __NEEDS_FEAT_1(m, f, w, fun) \
+ { \
+ .w = (m), \
+ .flags = (f) | CALL_FUNC, \
+ .match = (fun), \
+ }
+
+#define __NEEDS_FEAT_FLAG(m, f, w, ...) \
+ CONCATENATE(__NEEDS_FEAT_, COUNT_ARGS(__VA_ARGS__))(m, f, w, __VA_ARGS__)
+
+#define NEEDS_FEAT_FLAG(m, f, ...) \
+ __NEEDS_FEAT_FLAG(m, f, bits, __VA_ARGS__)
+
+#define NEEDS_FEAT_FIXED(m, ...) \
+ __NEEDS_FEAT_FLAG(m, FIXED_VALUE, bits, __VA_ARGS__, 0)
+
+#define NEEDS_FEAT_RES0(p, ...) \
+ __NEEDS_FEAT_FLAG(p, RES0_POINTER, res0p, __VA_ARGS__)
+
+/*
+ * Declare the dependency between a set of bits and a set of features,
+ * generating a struct reg_bit_to_feat_map.
+ */
+#define NEEDS_FEAT(m, ...) NEEDS_FEAT_FLAG(m, 0, __VA_ARGS__)
+
+/*
+ * Declare the dependency between a non-FGT register, a set of
+ * feature, and the set of individual bits it contains. This generates
+ * a struct reg_feat_map_desc.
+ */
+#define DECLARE_FEAT_MAP(n, r, m, f) \
+ struct reg_feat_map_desc n = { \
+ .name = #r, \
+ .feat_map = NEEDS_FEAT(~r##_RES0, f), \
+ .bit_feat_map = m, \
+ .bit_feat_map_sz = ARRAY_SIZE(m), \
+ }
+
+/*
+ * Specialised version of the above for FGT registers that have their
+ * RES0 masks described as struct fgt_masks.
+ */
+#define DECLARE_FEAT_MAP_FGT(n, msk, m, f) \
+ struct reg_feat_map_desc n = { \
+ .name = #msk, \
+ .feat_map = NEEDS_FEAT_RES0(&msk.res0, f),\
+ .bit_feat_map = m, \
+ .bit_feat_map_sz = ARRAY_SIZE(m), \
+ }
+
+#define FEAT_SPE ID_AA64DFR0_EL1, PMSVer, IMP
+#define FEAT_SPE_FnE ID_AA64DFR0_EL1, PMSVer, V1P2
+#define FEAT_BRBE ID_AA64DFR0_EL1, BRBE, IMP
+#define FEAT_TRC_SR ID_AA64DFR0_EL1, TraceVer, IMP
+#define FEAT_PMUv3 ID_AA64DFR0_EL1, PMUVer, IMP
+#define FEAT_TRBE ID_AA64DFR0_EL1, TraceBuffer, IMP
+#define FEAT_TRBEv1p1 ID_AA64DFR0_EL1, TraceBuffer, TRBE_V1P1
+#define FEAT_DoubleLock ID_AA64DFR0_EL1, DoubleLock, IMP
+#define FEAT_TRF ID_AA64DFR0_EL1, TraceFilt, IMP
+#define FEAT_AA32EL0 ID_AA64PFR0_EL1, EL0, AARCH32
+#define FEAT_AA32EL1 ID_AA64PFR0_EL1, EL1, AARCH32
+#define FEAT_AA64EL1 ID_AA64PFR0_EL1, EL1, IMP
+#define FEAT_AA64EL2 ID_AA64PFR0_EL1, EL2, IMP
+#define FEAT_AA64EL3 ID_AA64PFR0_EL1, EL3, IMP
+#define FEAT_AIE ID_AA64MMFR3_EL1, AIE, IMP
+#define FEAT_S2POE ID_AA64MMFR3_EL1, S2POE, IMP
+#define FEAT_S1POE ID_AA64MMFR3_EL1, S1POE, IMP
+#define FEAT_S1PIE ID_AA64MMFR3_EL1, S1PIE, IMP
+#define FEAT_THE ID_AA64PFR1_EL1, THE, IMP
+#define FEAT_SME ID_AA64PFR1_EL1, SME, IMP
+#define FEAT_GCS ID_AA64PFR1_EL1, GCS, IMP
+#define FEAT_LS64 ID_AA64ISAR1_EL1, LS64, LS64
+#define FEAT_LS64_V ID_AA64ISAR1_EL1, LS64, LS64_V
+#define FEAT_LS64_ACCDATA ID_AA64ISAR1_EL1, LS64, LS64_ACCDATA
+#define FEAT_RAS ID_AA64PFR0_EL1, RAS, IMP
+#define FEAT_RASv2 ID_AA64PFR0_EL1, RAS, V2
+#define FEAT_GICv3 ID_AA64PFR0_EL1, GIC, IMP
+#define FEAT_LOR ID_AA64MMFR1_EL1, LO, IMP
+#define FEAT_SPEv1p2 ID_AA64DFR0_EL1, PMSVer, V1P2
+#define FEAT_SPEv1p4 ID_AA64DFR0_EL1, PMSVer, V1P4
+#define FEAT_SPEv1p5 ID_AA64DFR0_EL1, PMSVer, V1P5
+#define FEAT_ATS1A ID_AA64ISAR2_EL1, ATS1A, IMP
+#define FEAT_SPECRES2 ID_AA64ISAR1_EL1, SPECRES, COSP_RCTX
+#define FEAT_SPECRES ID_AA64ISAR1_EL1, SPECRES, IMP
+#define FEAT_TLBIRANGE ID_AA64ISAR0_EL1, TLB, RANGE
+#define FEAT_TLBIOS ID_AA64ISAR0_EL1, TLB, OS
+#define FEAT_PAN2 ID_AA64MMFR1_EL1, PAN, PAN2
+#define FEAT_DPB2 ID_AA64ISAR1_EL1, DPB, DPB2
+#define FEAT_AMUv1 ID_AA64PFR0_EL1, AMU, IMP
+#define FEAT_AMUv1p1 ID_AA64PFR0_EL1, AMU, V1P1
+#define FEAT_CMOW ID_AA64MMFR1_EL1, CMOW, IMP
+#define FEAT_D128 ID_AA64MMFR3_EL1, D128, IMP
+#define FEAT_DoubleFault2 ID_AA64PFR1_EL1, DF2, IMP
+#define FEAT_FPMR ID_AA64PFR2_EL1, FPMR, IMP
+#define FEAT_MOPS ID_AA64ISAR2_EL1, MOPS, IMP
+#define FEAT_NMI ID_AA64PFR1_EL1, NMI, IMP
+#define FEAT_SCTLR2 ID_AA64MMFR3_EL1, SCTLRX, IMP
+#define FEAT_SYSREG128 ID_AA64ISAR2_EL1, SYSREG_128, IMP
+#define FEAT_TCR2 ID_AA64MMFR3_EL1, TCRX, IMP
+#define FEAT_XS ID_AA64ISAR1_EL1, XS, IMP
+#define FEAT_EVT ID_AA64MMFR2_EL1, EVT, IMP
+#define FEAT_EVT_TTLBxS ID_AA64MMFR2_EL1, EVT, TTLBxS
+#define FEAT_MTE2 ID_AA64PFR1_EL1, MTE, MTE2
+#define FEAT_RME ID_AA64PFR0_EL1, RME, IMP
+#define FEAT_MPAM ID_AA64PFR0_EL1, MPAM, 1
+#define FEAT_S2FWB ID_AA64MMFR2_EL1, FWB, IMP
+#define FEAT_TME ID_AA64ISAR0_EL1, TME, IMP
+#define FEAT_TWED ID_AA64MMFR1_EL1, TWED, IMP
+#define FEAT_E2H0 ID_AA64MMFR4_EL1, E2H0, IMP
+#define FEAT_SRMASK ID_AA64MMFR4_EL1, SRMASK, IMP
+#define FEAT_PoPS ID_AA64MMFR4_EL1, PoPS, IMP
+#define FEAT_PFAR ID_AA64PFR1_EL1, PFAR, IMP
+#define FEAT_Debugv8p9 ID_AA64DFR0_EL1, PMUVer, V3P9
+#define FEAT_PMUv3_SS ID_AA64DFR0_EL1, PMSS, IMP
+#define FEAT_SEBEP ID_AA64DFR0_EL1, SEBEP, IMP
+#define FEAT_EBEP ID_AA64DFR1_EL1, EBEP, IMP
+#define FEAT_ITE ID_AA64DFR1_EL1, ITE, IMP
+#define FEAT_PMUv3_ICNTR ID_AA64DFR1_EL1, PMICNTR, IMP
+#define FEAT_SPMU ID_AA64DFR1_EL1, SPMU, IMP
+#define FEAT_SPE_nVM ID_AA64DFR2_EL1, SPE_nVM, IMP
+#define FEAT_STEP2 ID_AA64DFR2_EL1, STEP, IMP
+#define FEAT_CPA2 ID_AA64ISAR3_EL1, CPA, CPA2
+#define FEAT_ASID2 ID_AA64MMFR4_EL1, ASID2, IMP
+#define FEAT_MEC ID_AA64MMFR3_EL1, MEC, IMP
+#define FEAT_HAFT ID_AA64MMFR1_EL1, HAFDBS, HAFT
+#define FEAT_BTI ID_AA64PFR1_EL1, BT, IMP
+#define FEAT_ExS ID_AA64MMFR0_EL1, EXS, IMP
+#define FEAT_IESB ID_AA64MMFR2_EL1, IESB, IMP
+#define FEAT_LSE2 ID_AA64MMFR2_EL1, AT, IMP
+#define FEAT_LSMAOC ID_AA64MMFR2_EL1, LSM, IMP
+#define FEAT_MixedEnd ID_AA64MMFR0_EL1, BIGEND, IMP
+#define FEAT_MixedEndEL0 ID_AA64MMFR0_EL1, BIGENDEL0, IMP
+#define FEAT_MTE_ASYNC ID_AA64PFR1_EL1, MTE_frac, ASYNC
+#define FEAT_MTE_STORE_ONLY ID_AA64PFR2_EL1, MTESTOREONLY, IMP
+#define FEAT_PAN ID_AA64MMFR1_EL1, PAN, IMP
+#define FEAT_PAN3 ID_AA64MMFR1_EL1, PAN, PAN3
+#define FEAT_SSBS ID_AA64PFR1_EL1, SSBS, IMP
+#define FEAT_TIDCP1 ID_AA64MMFR1_EL1, TIDCP1, IMP
+#define FEAT_FGT ID_AA64MMFR0_EL1, FGT, IMP
+#define FEAT_FGT2 ID_AA64MMFR0_EL1, FGT, FGT2
+#define FEAT_MTPMU ID_AA64DFR0_EL1, MTPMU, IMP
+#define FEAT_HCX ID_AA64MMFR1_EL1, HCX, IMP
+
+static bool not_feat_aa64el3(struct kvm *kvm)
+{
+ return !kvm_has_feat(kvm, FEAT_AA64EL3);
+}
+
+static bool feat_nv2(struct kvm *kvm)
+{
+ return ((kvm_has_feat(kvm, ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY) &&
+ kvm_has_feat_enum(kvm, ID_AA64MMFR2_EL1, NV, NI)) ||
+ kvm_has_feat(kvm, ID_AA64MMFR2_EL1, NV, NV2));
+}
+
+static bool feat_nv2_e2h0_ni(struct kvm *kvm)
+{
+ return feat_nv2(kvm) && !kvm_has_feat(kvm, FEAT_E2H0);
+}
+
+static bool feat_rasv1p1(struct kvm *kvm)
+{
+ return (kvm_has_feat(kvm, ID_AA64PFR0_EL1, RAS, V1P1) ||
+ (kvm_has_feat_enum(kvm, ID_AA64PFR0_EL1, RAS, IMP) &&
+ kvm_has_feat(kvm, ID_AA64PFR1_EL1, RAS_frac, RASv1p1)));
+}
+
+static bool feat_csv2_2_csv2_1p2(struct kvm *kvm)
+{
+ return (kvm_has_feat(kvm, ID_AA64PFR0_EL1, CSV2, CSV2_2) ||
+ (kvm_has_feat(kvm, ID_AA64PFR1_EL1, CSV2_frac, CSV2_1p2) &&
+ kvm_has_feat_enum(kvm, ID_AA64PFR0_EL1, CSV2, IMP)));
+}
+
+static bool feat_pauth(struct kvm *kvm)
+{
+ return kvm_has_pauth(kvm, PAuth);
+}
+
+static bool feat_pauth_lr(struct kvm *kvm)
+{
+ return kvm_has_pauth(kvm, PAuth_LR);
+}
+
+static bool feat_aderr(struct kvm *kvm)
+{
+ return (kvm_has_feat(kvm, ID_AA64MMFR3_EL1, ADERR, FEAT_ADERR) &&
+ kvm_has_feat(kvm, ID_AA64MMFR3_EL1, SDERR, FEAT_ADERR));
+}
+
+static bool feat_anerr(struct kvm *kvm)
+{
+ return (kvm_has_feat(kvm, ID_AA64MMFR3_EL1, ANERR, FEAT_ANERR) &&
+ kvm_has_feat(kvm, ID_AA64MMFR3_EL1, SNERR, FEAT_ANERR));
+}
+
+static bool feat_sme_smps(struct kvm *kvm)
+{
+ /*
+ * Revists this if KVM ever supports SME -- this really should
+ * look at the guest's view of SMIDR_EL1. Funnily enough, this
+ * is not captured in the JSON file, but only as a note in the
+ * ARM ARM.
+ */
+ return (kvm_has_feat(kvm, FEAT_SME) &&
+ (read_sysreg_s(SYS_SMIDR_EL1) & SMIDR_EL1_SMPS));
+}
+
+static bool feat_spe_fds(struct kvm *kvm)
+{
+ /*
+ * Revists this if KVM ever supports SPE -- this really should
+ * look at the guest's view of PMSIDR_EL1.
+ */
+ return (kvm_has_feat(kvm, FEAT_SPEv1p4) &&
+ (read_sysreg_s(SYS_PMSIDR_EL1) & PMSIDR_EL1_FDS));
+}
+
+static bool feat_trbe_mpam(struct kvm *kvm)
+{
+ /*
+ * Revists this if KVM ever supports both MPAM and TRBE --
+ * this really should look at the guest's view of TRBIDR_EL1.
+ */
+ return (kvm_has_feat(kvm, FEAT_TRBE) &&
+ kvm_has_feat(kvm, FEAT_MPAM) &&
+ (read_sysreg_s(SYS_TRBIDR_EL1) & TRBIDR_EL1_MPAM));
+}
+
+static bool feat_asid2_e2h1(struct kvm *kvm)
+{
+ return kvm_has_feat(kvm, FEAT_ASID2) && !kvm_has_feat(kvm, FEAT_E2H0);
+}
+
+static bool feat_d128_e2h1(struct kvm *kvm)
+{
+ return kvm_has_feat(kvm, FEAT_D128) && !kvm_has_feat(kvm, FEAT_E2H0);
+}
+
+static bool feat_mec_e2h1(struct kvm *kvm)
+{
+ return kvm_has_feat(kvm, FEAT_MEC) && !kvm_has_feat(kvm, FEAT_E2H0);
+}
+
+static bool feat_ebep_pmuv3_ss(struct kvm *kvm)
+{
+ return kvm_has_feat(kvm, FEAT_EBEP) || kvm_has_feat(kvm, FEAT_PMUv3_SS);
+}
+
+static bool feat_mixedendel0(struct kvm *kvm)
+{
+ return kvm_has_feat(kvm, FEAT_MixedEnd) || kvm_has_feat(kvm, FEAT_MixedEndEL0);
+}
+
+static bool feat_mte_async(struct kvm *kvm)
+{
+ return kvm_has_feat(kvm, FEAT_MTE2) && kvm_has_feat_enum(kvm, FEAT_MTE_ASYNC);
+}
+
+#define check_pmu_revision(k, r) \
+ ({ \
+ (kvm_has_feat((k), ID_AA64DFR0_EL1, PMUVer, r) && \
+ !kvm_has_feat((k), ID_AA64DFR0_EL1, PMUVer, IMP_DEF)); \
+ })
+
+static bool feat_pmuv3p1(struct kvm *kvm)
+{
+ return check_pmu_revision(kvm, V3P1);
+}
+
+static bool feat_pmuv3p5(struct kvm *kvm)
+{
+ return check_pmu_revision(kvm, V3P5);
+}
+
+static bool feat_pmuv3p7(struct kvm *kvm)
+{
+ return check_pmu_revision(kvm, V3P7);
+}
+
+static bool feat_pmuv3p9(struct kvm *kvm)
+{
+ return check_pmu_revision(kvm, V3P9);
+}
+
+static bool compute_hcr_rw(struct kvm *kvm, u64 *bits)
+{
+ /* This is purely academic: AArch32 and NV are mutually exclusive */
+ if (bits) {
+ if (kvm_has_feat(kvm, FEAT_AA32EL1))
+ *bits &= ~HCR_EL2_RW;
+ else
+ *bits |= HCR_EL2_RW;
+ }
+
+ return true;
+}
+
+static bool compute_hcr_e2h(struct kvm *kvm, u64 *bits)
+{
+ if (bits) {
+ if (kvm_has_feat(kvm, FEAT_E2H0))
+ *bits &= ~HCR_EL2_E2H;
+ else
+ *bits |= HCR_EL2_E2H;
+ }
+
+ return true;
+}
+
+static const struct reg_bits_to_feat_map hfgrtr_feat_map[] = {
+ NEEDS_FEAT(HFGRTR_EL2_nAMAIR2_EL1 |
+ HFGRTR_EL2_nMAIR2_EL1,
+ FEAT_AIE),
+ NEEDS_FEAT(HFGRTR_EL2_nS2POR_EL1, FEAT_S2POE),
+ NEEDS_FEAT(HFGRTR_EL2_nPOR_EL1 |
+ HFGRTR_EL2_nPOR_EL0,
+ FEAT_S1POE),
+ NEEDS_FEAT(HFGRTR_EL2_nPIR_EL1 |
+ HFGRTR_EL2_nPIRE0_EL1,
+ FEAT_S1PIE),
+ NEEDS_FEAT(HFGRTR_EL2_nRCWMASK_EL1, FEAT_THE),
+ NEEDS_FEAT(HFGRTR_EL2_nTPIDR2_EL0 |
+ HFGRTR_EL2_nSMPRI_EL1,
+ FEAT_SME),
+ NEEDS_FEAT(HFGRTR_EL2_nGCS_EL1 |
+ HFGRTR_EL2_nGCS_EL0,
+ FEAT_GCS),
+ NEEDS_FEAT(HFGRTR_EL2_nACCDATA_EL1, FEAT_LS64_ACCDATA),
+ NEEDS_FEAT(HFGRTR_EL2_ERXADDR_EL1 |
+ HFGRTR_EL2_ERXMISCn_EL1 |
+ HFGRTR_EL2_ERXSTATUS_EL1 |
+ HFGRTR_EL2_ERXCTLR_EL1 |
+ HFGRTR_EL2_ERXFR_EL1 |
+ HFGRTR_EL2_ERRSELR_EL1 |
+ HFGRTR_EL2_ERRIDR_EL1,
+ FEAT_RAS),
+ NEEDS_FEAT(HFGRTR_EL2_ERXPFGCDN_EL1 |
+ HFGRTR_EL2_ERXPFGCTL_EL1 |
+ HFGRTR_EL2_ERXPFGF_EL1,
+ feat_rasv1p1),
+ NEEDS_FEAT(HFGRTR_EL2_ICC_IGRPENn_EL1, FEAT_GICv3),
+ NEEDS_FEAT(HFGRTR_EL2_SCXTNUM_EL0 |
+ HFGRTR_EL2_SCXTNUM_EL1,
+ feat_csv2_2_csv2_1p2),
+ NEEDS_FEAT(HFGRTR_EL2_LORSA_EL1 |
+ HFGRTR_EL2_LORN_EL1 |
+ HFGRTR_EL2_LORID_EL1 |
+ HFGRTR_EL2_LOREA_EL1 |
+ HFGRTR_EL2_LORC_EL1,
+ FEAT_LOR),
+ NEEDS_FEAT(HFGRTR_EL2_APIBKey |
+ HFGRTR_EL2_APIAKey |
+ HFGRTR_EL2_APGAKey |
+ HFGRTR_EL2_APDBKey |
+ HFGRTR_EL2_APDAKey,
+ feat_pauth),
+ NEEDS_FEAT_FLAG(HFGRTR_EL2_VBAR_EL1 |
+ HFGRTR_EL2_TTBR1_EL1 |
+ HFGRTR_EL2_TTBR0_EL1 |
+ HFGRTR_EL2_TPIDR_EL0 |
+ HFGRTR_EL2_TPIDRRO_EL0 |
+ HFGRTR_EL2_TPIDR_EL1 |
+ HFGRTR_EL2_TCR_EL1 |
+ HFGRTR_EL2_SCTLR_EL1 |
+ HFGRTR_EL2_REVIDR_EL1 |
+ HFGRTR_EL2_PAR_EL1 |
+ HFGRTR_EL2_MPIDR_EL1 |
+ HFGRTR_EL2_MIDR_EL1 |
+ HFGRTR_EL2_MAIR_EL1 |
+ HFGRTR_EL2_ISR_EL1 |
+ HFGRTR_EL2_FAR_EL1 |
+ HFGRTR_EL2_ESR_EL1 |
+ HFGRTR_EL2_DCZID_EL0 |
+ HFGRTR_EL2_CTR_EL0 |
+ HFGRTR_EL2_CSSELR_EL1 |
+ HFGRTR_EL2_CPACR_EL1 |
+ HFGRTR_EL2_CONTEXTIDR_EL1|
+ HFGRTR_EL2_CLIDR_EL1 |
+ HFGRTR_EL2_CCSIDR_EL1 |
+ HFGRTR_EL2_AMAIR_EL1 |
+ HFGRTR_EL2_AIDR_EL1 |
+ HFGRTR_EL2_AFSR1_EL1 |
+ HFGRTR_EL2_AFSR0_EL1,
+ NEVER_FGU, FEAT_AA64EL1),
+};
+
+
+static const DECLARE_FEAT_MAP_FGT(hfgrtr_desc, hfgrtr_masks,
+ hfgrtr_feat_map, FEAT_FGT);
+
+static const struct reg_bits_to_feat_map hfgwtr_feat_map[] = {
+ NEEDS_FEAT(HFGWTR_EL2_nAMAIR2_EL1 |
+ HFGWTR_EL2_nMAIR2_EL1,
+ FEAT_AIE),
+ NEEDS_FEAT(HFGWTR_EL2_nS2POR_EL1, FEAT_S2POE),
+ NEEDS_FEAT(HFGWTR_EL2_nPOR_EL1 |
+ HFGWTR_EL2_nPOR_EL0,
+ FEAT_S1POE),
+ NEEDS_FEAT(HFGWTR_EL2_nPIR_EL1 |
+ HFGWTR_EL2_nPIRE0_EL1,
+ FEAT_S1PIE),
+ NEEDS_FEAT(HFGWTR_EL2_nRCWMASK_EL1, FEAT_THE),
+ NEEDS_FEAT(HFGWTR_EL2_nTPIDR2_EL0 |
+ HFGWTR_EL2_nSMPRI_EL1,
+ FEAT_SME),
+ NEEDS_FEAT(HFGWTR_EL2_nGCS_EL1 |
+ HFGWTR_EL2_nGCS_EL0,
+ FEAT_GCS),
+ NEEDS_FEAT(HFGWTR_EL2_nACCDATA_EL1, FEAT_LS64_ACCDATA),
+ NEEDS_FEAT(HFGWTR_EL2_ERXADDR_EL1 |
+ HFGWTR_EL2_ERXMISCn_EL1 |
+ HFGWTR_EL2_ERXSTATUS_EL1 |
+ HFGWTR_EL2_ERXCTLR_EL1 |
+ HFGWTR_EL2_ERRSELR_EL1,
+ FEAT_RAS),
+ NEEDS_FEAT(HFGWTR_EL2_ERXPFGCDN_EL1 |
+ HFGWTR_EL2_ERXPFGCTL_EL1,
+ feat_rasv1p1),
+ NEEDS_FEAT(HFGWTR_EL2_ICC_IGRPENn_EL1, FEAT_GICv3),
+ NEEDS_FEAT(HFGWTR_EL2_SCXTNUM_EL0 |
+ HFGWTR_EL2_SCXTNUM_EL1,
+ feat_csv2_2_csv2_1p2),
+ NEEDS_FEAT(HFGWTR_EL2_LORSA_EL1 |
+ HFGWTR_EL2_LORN_EL1 |
+ HFGWTR_EL2_LOREA_EL1 |
+ HFGWTR_EL2_LORC_EL1,
+ FEAT_LOR),
+ NEEDS_FEAT(HFGWTR_EL2_APIBKey |
+ HFGWTR_EL2_APIAKey |
+ HFGWTR_EL2_APGAKey |
+ HFGWTR_EL2_APDBKey |
+ HFGWTR_EL2_APDAKey,
+ feat_pauth),
+ NEEDS_FEAT_FLAG(HFGWTR_EL2_VBAR_EL1 |
+ HFGWTR_EL2_TTBR1_EL1 |
+ HFGWTR_EL2_TTBR0_EL1 |
+ HFGWTR_EL2_TPIDR_EL0 |
+ HFGWTR_EL2_TPIDRRO_EL0 |
+ HFGWTR_EL2_TPIDR_EL1 |
+ HFGWTR_EL2_TCR_EL1 |
+ HFGWTR_EL2_SCTLR_EL1 |
+ HFGWTR_EL2_PAR_EL1 |
+ HFGWTR_EL2_MAIR_EL1 |
+ HFGWTR_EL2_FAR_EL1 |
+ HFGWTR_EL2_ESR_EL1 |
+ HFGWTR_EL2_CSSELR_EL1 |
+ HFGWTR_EL2_CPACR_EL1 |
+ HFGWTR_EL2_CONTEXTIDR_EL1|
+ HFGWTR_EL2_AMAIR_EL1 |
+ HFGWTR_EL2_AFSR1_EL1 |
+ HFGWTR_EL2_AFSR0_EL1,
+ NEVER_FGU, FEAT_AA64EL1),
+};
+
+static const DECLARE_FEAT_MAP_FGT(hfgwtr_desc, hfgwtr_masks,
+ hfgwtr_feat_map, FEAT_FGT);
+
+static const struct reg_bits_to_feat_map hdfgrtr_feat_map[] = {
+ NEEDS_FEAT(HDFGRTR_EL2_PMBIDR_EL1 |
+ HDFGRTR_EL2_PMSLATFR_EL1 |
+ HDFGRTR_EL2_PMSIRR_EL1 |
+ HDFGRTR_EL2_PMSIDR_EL1 |
+ HDFGRTR_EL2_PMSICR_EL1 |
+ HDFGRTR_EL2_PMSFCR_EL1 |
+ HDFGRTR_EL2_PMSEVFR_EL1 |
+ HDFGRTR_EL2_PMSCR_EL1 |
+ HDFGRTR_EL2_PMBSR_EL1 |
+ HDFGRTR_EL2_PMBPTR_EL1 |
+ HDFGRTR_EL2_PMBLIMITR_EL1,
+ FEAT_SPE),
+ NEEDS_FEAT(HDFGRTR_EL2_nPMSNEVFR_EL1, FEAT_SPE_FnE),
+ NEEDS_FEAT(HDFGRTR_EL2_nBRBDATA |
+ HDFGRTR_EL2_nBRBCTL |
+ HDFGRTR_EL2_nBRBIDR,
+ FEAT_BRBE),
+ NEEDS_FEAT(HDFGRTR_EL2_TRCVICTLR |
+ HDFGRTR_EL2_TRCSTATR |
+ HDFGRTR_EL2_TRCSSCSRn |
+ HDFGRTR_EL2_TRCSEQSTR |
+ HDFGRTR_EL2_TRCPRGCTLR |
+ HDFGRTR_EL2_TRCOSLSR |
+ HDFGRTR_EL2_TRCIMSPECn |
+ HDFGRTR_EL2_TRCID |
+ HDFGRTR_EL2_TRCCNTVRn |
+ HDFGRTR_EL2_TRCCLAIM |
+ HDFGRTR_EL2_TRCAUXCTLR |
+ HDFGRTR_EL2_TRCAUTHSTATUS |
+ HDFGRTR_EL2_TRC,
+ FEAT_TRC_SR),
+ NEEDS_FEAT(HDFGRTR_EL2_PMCEIDn_EL0 |
+ HDFGRTR_EL2_PMUSERENR_EL0 |
+ HDFGRTR_EL2_PMMIR_EL1 |
+ HDFGRTR_EL2_PMSELR_EL0 |
+ HDFGRTR_EL2_PMOVS |
+ HDFGRTR_EL2_PMINTEN |
+ HDFGRTR_EL2_PMCNTEN |
+ HDFGRTR_EL2_PMCCNTR_EL0 |
+ HDFGRTR_EL2_PMCCFILTR_EL0 |
+ HDFGRTR_EL2_PMEVTYPERn_EL0 |
+ HDFGRTR_EL2_PMEVCNTRn_EL0,
+ FEAT_PMUv3),
+ NEEDS_FEAT(HDFGRTR_EL2_TRBTRG_EL1 |
+ HDFGRTR_EL2_TRBSR_EL1 |
+ HDFGRTR_EL2_TRBPTR_EL1 |
+ HDFGRTR_EL2_TRBMAR_EL1 |
+ HDFGRTR_EL2_TRBLIMITR_EL1 |
+ HDFGRTR_EL2_TRBIDR_EL1 |
+ HDFGRTR_EL2_TRBBASER_EL1,
+ FEAT_TRBE),
+ NEEDS_FEAT_FLAG(HDFGRTR_EL2_OSDLR_EL1, NEVER_FGU,
+ FEAT_DoubleLock),
+ NEEDS_FEAT_FLAG(HDFGRTR_EL2_OSECCR_EL1 |
+ HDFGRTR_EL2_OSLSR_EL1 |
+ HDFGRTR_EL2_DBGPRCR_EL1 |
+ HDFGRTR_EL2_DBGAUTHSTATUS_EL1|
+ HDFGRTR_EL2_DBGCLAIM |
+ HDFGRTR_EL2_MDSCR_EL1 |
+ HDFGRTR_EL2_DBGWVRn_EL1 |
+ HDFGRTR_EL2_DBGWCRn_EL1 |
+ HDFGRTR_EL2_DBGBVRn_EL1 |
+ HDFGRTR_EL2_DBGBCRn_EL1,
+ NEVER_FGU, FEAT_AA64EL1)
+};
+
+static const DECLARE_FEAT_MAP_FGT(hdfgrtr_desc, hdfgrtr_masks,
+ hdfgrtr_feat_map, FEAT_FGT);
+
+static const struct reg_bits_to_feat_map hdfgwtr_feat_map[] = {
+ NEEDS_FEAT(HDFGWTR_EL2_PMSLATFR_EL1 |
+ HDFGWTR_EL2_PMSIRR_EL1 |
+ HDFGWTR_EL2_PMSICR_EL1 |
+ HDFGWTR_EL2_PMSFCR_EL1 |
+ HDFGWTR_EL2_PMSEVFR_EL1 |
+ HDFGWTR_EL2_PMSCR_EL1 |
+ HDFGWTR_EL2_PMBSR_EL1 |
+ HDFGWTR_EL2_PMBPTR_EL1 |
+ HDFGWTR_EL2_PMBLIMITR_EL1,
+ FEAT_SPE),
+ NEEDS_FEAT(HDFGWTR_EL2_nPMSNEVFR_EL1, FEAT_SPE_FnE),
+ NEEDS_FEAT(HDFGWTR_EL2_nBRBDATA |
+ HDFGWTR_EL2_nBRBCTL,
+ FEAT_BRBE),
+ NEEDS_FEAT(HDFGWTR_EL2_TRCVICTLR |
+ HDFGWTR_EL2_TRCSSCSRn |
+ HDFGWTR_EL2_TRCSEQSTR |
+ HDFGWTR_EL2_TRCPRGCTLR |
+ HDFGWTR_EL2_TRCOSLAR |
+ HDFGWTR_EL2_TRCIMSPECn |
+ HDFGWTR_EL2_TRCCNTVRn |
+ HDFGWTR_EL2_TRCCLAIM |
+ HDFGWTR_EL2_TRCAUXCTLR |
+ HDFGWTR_EL2_TRC,
+ FEAT_TRC_SR),
+ NEEDS_FEAT(HDFGWTR_EL2_PMUSERENR_EL0 |
+ HDFGWTR_EL2_PMCR_EL0 |
+ HDFGWTR_EL2_PMSWINC_EL0 |
+ HDFGWTR_EL2_PMSELR_EL0 |
+ HDFGWTR_EL2_PMOVS |
+ HDFGWTR_EL2_PMINTEN |
+ HDFGWTR_EL2_PMCNTEN |
+ HDFGWTR_EL2_PMCCNTR_EL0 |
+ HDFGWTR_EL2_PMCCFILTR_EL0 |
+ HDFGWTR_EL2_PMEVTYPERn_EL0 |
+ HDFGWTR_EL2_PMEVCNTRn_EL0,
+ FEAT_PMUv3),
+ NEEDS_FEAT(HDFGWTR_EL2_TRBTRG_EL1 |
+ HDFGWTR_EL2_TRBSR_EL1 |
+ HDFGWTR_EL2_TRBPTR_EL1 |
+ HDFGWTR_EL2_TRBMAR_EL1 |
+ HDFGWTR_EL2_TRBLIMITR_EL1 |
+ HDFGWTR_EL2_TRBBASER_EL1,
+ FEAT_TRBE),
+ NEEDS_FEAT_FLAG(HDFGWTR_EL2_OSDLR_EL1,
+ NEVER_FGU, FEAT_DoubleLock),
+ NEEDS_FEAT_FLAG(HDFGWTR_EL2_OSECCR_EL1 |
+ HDFGWTR_EL2_OSLAR_EL1 |
+ HDFGWTR_EL2_DBGPRCR_EL1 |
+ HDFGWTR_EL2_DBGCLAIM |
+ HDFGWTR_EL2_MDSCR_EL1 |
+ HDFGWTR_EL2_DBGWVRn_EL1 |
+ HDFGWTR_EL2_DBGWCRn_EL1 |
+ HDFGWTR_EL2_DBGBVRn_EL1 |
+ HDFGWTR_EL2_DBGBCRn_EL1,
+ NEVER_FGU, FEAT_AA64EL1),
+ NEEDS_FEAT(HDFGWTR_EL2_TRFCR_EL1, FEAT_TRF),
+};
+
+static const DECLARE_FEAT_MAP_FGT(hdfgwtr_desc, hdfgwtr_masks,
+ hdfgwtr_feat_map, FEAT_FGT);
+
+static const struct reg_bits_to_feat_map hfgitr_feat_map[] = {
+ NEEDS_FEAT(HFGITR_EL2_PSBCSYNC, FEAT_SPEv1p5),
+ NEEDS_FEAT(HFGITR_EL2_ATS1E1A, FEAT_ATS1A),
+ NEEDS_FEAT(HFGITR_EL2_COSPRCTX, FEAT_SPECRES2),
+ NEEDS_FEAT(HFGITR_EL2_nGCSEPP |
+ HFGITR_EL2_nGCSSTR_EL1 |
+ HFGITR_EL2_nGCSPUSHM_EL1,
+ FEAT_GCS),
+ NEEDS_FEAT(HFGITR_EL2_nBRBIALL |
+ HFGITR_EL2_nBRBINJ,
+ FEAT_BRBE),
+ NEEDS_FEAT(HFGITR_EL2_CPPRCTX |
+ HFGITR_EL2_DVPRCTX |
+ HFGITR_EL2_CFPRCTX,
+ FEAT_SPECRES),
+ NEEDS_FEAT(HFGITR_EL2_TLBIRVAALE1 |
+ HFGITR_EL2_TLBIRVALE1 |
+ HFGITR_EL2_TLBIRVAAE1 |
+ HFGITR_EL2_TLBIRVAE1 |
+ HFGITR_EL2_TLBIRVAALE1IS |
+ HFGITR_EL2_TLBIRVALE1IS |
+ HFGITR_EL2_TLBIRVAAE1IS |
+ HFGITR_EL2_TLBIRVAE1IS |
+ HFGITR_EL2_TLBIRVAALE1OS |
+ HFGITR_EL2_TLBIRVALE1OS |
+ HFGITR_EL2_TLBIRVAAE1OS |
+ HFGITR_EL2_TLBIRVAE1OS,
+ FEAT_TLBIRANGE),
+ NEEDS_FEAT(HFGITR_EL2_TLBIVAALE1OS |
+ HFGITR_EL2_TLBIVALE1OS |
+ HFGITR_EL2_TLBIVAAE1OS |
+ HFGITR_EL2_TLBIASIDE1OS |
+ HFGITR_EL2_TLBIVAE1OS |
+ HFGITR_EL2_TLBIVMALLE1OS,
+ FEAT_TLBIOS),
+ NEEDS_FEAT(HFGITR_EL2_ATS1E1WP |
+ HFGITR_EL2_ATS1E1RP,
+ FEAT_PAN2),
+ NEEDS_FEAT(HFGITR_EL2_DCCVADP, FEAT_DPB2),
+ NEEDS_FEAT_FLAG(HFGITR_EL2_DCCVAC |
+ HFGITR_EL2_SVC_EL1 |
+ HFGITR_EL2_SVC_EL0 |
+ HFGITR_EL2_ERET |
+ HFGITR_EL2_TLBIVAALE1 |
+ HFGITR_EL2_TLBIVALE1 |
+ HFGITR_EL2_TLBIVAAE1 |
+ HFGITR_EL2_TLBIASIDE1 |
+ HFGITR_EL2_TLBIVAE1 |
+ HFGITR_EL2_TLBIVMALLE1 |
+ HFGITR_EL2_TLBIVAALE1IS |
+ HFGITR_EL2_TLBIVALE1IS |
+ HFGITR_EL2_TLBIVAAE1IS |
+ HFGITR_EL2_TLBIASIDE1IS |
+ HFGITR_EL2_TLBIVAE1IS |
+ HFGITR_EL2_TLBIVMALLE1IS|
+ HFGITR_EL2_ATS1E0W |
+ HFGITR_EL2_ATS1E0R |
+ HFGITR_EL2_ATS1E1W |
+ HFGITR_EL2_ATS1E1R |
+ HFGITR_EL2_DCZVA |
+ HFGITR_EL2_DCCIVAC |
+ HFGITR_EL2_DCCVAP |
+ HFGITR_EL2_DCCVAU |
+ HFGITR_EL2_DCCISW |
+ HFGITR_EL2_DCCSW |
+ HFGITR_EL2_DCISW |
+ HFGITR_EL2_DCIVAC |
+ HFGITR_EL2_ICIVAU |
+ HFGITR_EL2_ICIALLU |
+ HFGITR_EL2_ICIALLUIS,
+ NEVER_FGU, FEAT_AA64EL1),
+};
+
+static const DECLARE_FEAT_MAP_FGT(hfgitr_desc, hfgitr_masks,
+ hfgitr_feat_map, FEAT_FGT);
+
+static const struct reg_bits_to_feat_map hafgrtr_feat_map[] = {
+ NEEDS_FEAT(HAFGRTR_EL2_AMEVTYPER115_EL0 |
+ HAFGRTR_EL2_AMEVTYPER114_EL0 |
+ HAFGRTR_EL2_AMEVTYPER113_EL0 |
+ HAFGRTR_EL2_AMEVTYPER112_EL0 |
+ HAFGRTR_EL2_AMEVTYPER111_EL0 |
+ HAFGRTR_EL2_AMEVTYPER110_EL0 |
+ HAFGRTR_EL2_AMEVTYPER19_EL0 |
+ HAFGRTR_EL2_AMEVTYPER18_EL0 |
+ HAFGRTR_EL2_AMEVTYPER17_EL0 |
+ HAFGRTR_EL2_AMEVTYPER16_EL0 |
+ HAFGRTR_EL2_AMEVTYPER15_EL0 |
+ HAFGRTR_EL2_AMEVTYPER14_EL0 |
+ HAFGRTR_EL2_AMEVTYPER13_EL0 |
+ HAFGRTR_EL2_AMEVTYPER12_EL0 |
+ HAFGRTR_EL2_AMEVTYPER11_EL0 |
+ HAFGRTR_EL2_AMEVTYPER10_EL0 |
+ HAFGRTR_EL2_AMEVCNTR115_EL0 |
+ HAFGRTR_EL2_AMEVCNTR114_EL0 |
+ HAFGRTR_EL2_AMEVCNTR113_EL0 |
+ HAFGRTR_EL2_AMEVCNTR112_EL0 |
+ HAFGRTR_EL2_AMEVCNTR111_EL0 |
+ HAFGRTR_EL2_AMEVCNTR110_EL0 |
+ HAFGRTR_EL2_AMEVCNTR19_EL0 |
+ HAFGRTR_EL2_AMEVCNTR18_EL0 |
+ HAFGRTR_EL2_AMEVCNTR17_EL0 |
+ HAFGRTR_EL2_AMEVCNTR16_EL0 |
+ HAFGRTR_EL2_AMEVCNTR15_EL0 |
+ HAFGRTR_EL2_AMEVCNTR14_EL0 |
+ HAFGRTR_EL2_AMEVCNTR13_EL0 |
+ HAFGRTR_EL2_AMEVCNTR12_EL0 |
+ HAFGRTR_EL2_AMEVCNTR11_EL0 |
+ HAFGRTR_EL2_AMEVCNTR10_EL0 |
+ HAFGRTR_EL2_AMCNTEN1 |
+ HAFGRTR_EL2_AMCNTEN0 |
+ HAFGRTR_EL2_AMEVCNTR03_EL0 |
+ HAFGRTR_EL2_AMEVCNTR02_EL0 |
+ HAFGRTR_EL2_AMEVCNTR01_EL0 |
+ HAFGRTR_EL2_AMEVCNTR00_EL0,
+ FEAT_AMUv1),
+};
+
+static const DECLARE_FEAT_MAP_FGT(hafgrtr_desc, hafgrtr_masks,
+ hafgrtr_feat_map, FEAT_FGT);
+
+static const struct reg_bits_to_feat_map hfgitr2_feat_map[] = {
+ NEEDS_FEAT(HFGITR2_EL2_nDCCIVAPS, FEAT_PoPS),
+ NEEDS_FEAT(HFGITR2_EL2_TSBCSYNC, FEAT_TRBEv1p1)
+};
+
+static const DECLARE_FEAT_MAP_FGT(hfgitr2_desc, hfgitr2_masks,
+ hfgitr2_feat_map, FEAT_FGT2);
+
+static const struct reg_bits_to_feat_map hfgrtr2_feat_map[] = {
+ NEEDS_FEAT(HFGRTR2_EL2_nPFAR_EL1, FEAT_PFAR),
+ NEEDS_FEAT(HFGRTR2_EL2_nERXGSR_EL1, FEAT_RASv2),
+ NEEDS_FEAT(HFGRTR2_EL2_nACTLRALIAS_EL1 |
+ HFGRTR2_EL2_nACTLRMASK_EL1 |
+ HFGRTR2_EL2_nCPACRALIAS_EL1 |
+ HFGRTR2_EL2_nCPACRMASK_EL1 |
+ HFGRTR2_EL2_nSCTLR2MASK_EL1 |
+ HFGRTR2_EL2_nSCTLRALIAS2_EL1 |
+ HFGRTR2_EL2_nSCTLRALIAS_EL1 |
+ HFGRTR2_EL2_nSCTLRMASK_EL1 |
+ HFGRTR2_EL2_nTCR2ALIAS_EL1 |
+ HFGRTR2_EL2_nTCR2MASK_EL1 |
+ HFGRTR2_EL2_nTCRALIAS_EL1 |
+ HFGRTR2_EL2_nTCRMASK_EL1,
+ FEAT_SRMASK),
+ NEEDS_FEAT(HFGRTR2_EL2_nRCWSMASK_EL1, FEAT_THE),
+};
+
+static const DECLARE_FEAT_MAP_FGT(hfgrtr2_desc, hfgrtr2_masks,
+ hfgrtr2_feat_map, FEAT_FGT2);
+
+static const struct reg_bits_to_feat_map hfgwtr2_feat_map[] = {
+ NEEDS_FEAT(HFGWTR2_EL2_nPFAR_EL1, FEAT_PFAR),
+ NEEDS_FEAT(HFGWTR2_EL2_nACTLRALIAS_EL1 |
+ HFGWTR2_EL2_nACTLRMASK_EL1 |
+ HFGWTR2_EL2_nCPACRALIAS_EL1 |
+ HFGWTR2_EL2_nCPACRMASK_EL1 |
+ HFGWTR2_EL2_nSCTLR2MASK_EL1 |
+ HFGWTR2_EL2_nSCTLRALIAS2_EL1 |
+ HFGWTR2_EL2_nSCTLRALIAS_EL1 |
+ HFGWTR2_EL2_nSCTLRMASK_EL1 |
+ HFGWTR2_EL2_nTCR2ALIAS_EL1 |
+ HFGWTR2_EL2_nTCR2MASK_EL1 |
+ HFGWTR2_EL2_nTCRALIAS_EL1 |
+ HFGWTR2_EL2_nTCRMASK_EL1,
+ FEAT_SRMASK),
+ NEEDS_FEAT(HFGWTR2_EL2_nRCWSMASK_EL1, FEAT_THE),
+};
+
+static const DECLARE_FEAT_MAP_FGT(hfgwtr2_desc, hfgwtr2_masks,
+ hfgwtr2_feat_map, FEAT_FGT2);
+
+static const struct reg_bits_to_feat_map hdfgrtr2_feat_map[] = {
+ NEEDS_FEAT(HDFGRTR2_EL2_nMDSELR_EL1, FEAT_Debugv8p9),
+ NEEDS_FEAT(HDFGRTR2_EL2_nPMECR_EL1, feat_ebep_pmuv3_ss),
+ NEEDS_FEAT(HDFGRTR2_EL2_nTRCITECR_EL1, FEAT_ITE),
+ NEEDS_FEAT(HDFGRTR2_EL2_nPMICFILTR_EL0 |
+ HDFGRTR2_EL2_nPMICNTR_EL0,
+ FEAT_PMUv3_ICNTR),
+ NEEDS_FEAT(HDFGRTR2_EL2_nPMUACR_EL1, feat_pmuv3p9),
+ NEEDS_FEAT(HDFGRTR2_EL2_nPMSSCR_EL1 |
+ HDFGRTR2_EL2_nPMSSDATA,
+ FEAT_PMUv3_SS),
+ NEEDS_FEAT(HDFGRTR2_EL2_nPMIAR_EL1, FEAT_SEBEP),
+ NEEDS_FEAT(HDFGRTR2_EL2_nPMSDSFR_EL1, feat_spe_fds),
+ NEEDS_FEAT(HDFGRTR2_EL2_nPMBMAR_EL1, FEAT_SPE_nVM),
+ NEEDS_FEAT(HDFGRTR2_EL2_nSPMACCESSR_EL1 |
+ HDFGRTR2_EL2_nSPMCNTEN |
+ HDFGRTR2_EL2_nSPMCR_EL0 |
+ HDFGRTR2_EL2_nSPMDEVAFF_EL1 |
+ HDFGRTR2_EL2_nSPMEVCNTRn_EL0 |
+ HDFGRTR2_EL2_nSPMEVTYPERn_EL0|
+ HDFGRTR2_EL2_nSPMID |
+ HDFGRTR2_EL2_nSPMINTEN |
+ HDFGRTR2_EL2_nSPMOVS |
+ HDFGRTR2_EL2_nSPMSCR_EL1 |
+ HDFGRTR2_EL2_nSPMSELR_EL0,
+ FEAT_SPMU),
+ NEEDS_FEAT(HDFGRTR2_EL2_nMDSTEPOP_EL1, FEAT_STEP2),
+ NEEDS_FEAT(HDFGRTR2_EL2_nTRBMPAM_EL1, feat_trbe_mpam),
+};
+
+static const DECLARE_FEAT_MAP_FGT(hdfgrtr2_desc, hdfgrtr2_masks,
+ hdfgrtr2_feat_map, FEAT_FGT2);
+
+static const struct reg_bits_to_feat_map hdfgwtr2_feat_map[] = {
+ NEEDS_FEAT(HDFGWTR2_EL2_nMDSELR_EL1, FEAT_Debugv8p9),
+ NEEDS_FEAT(HDFGWTR2_EL2_nPMECR_EL1, feat_ebep_pmuv3_ss),
+ NEEDS_FEAT(HDFGWTR2_EL2_nTRCITECR_EL1, FEAT_ITE),
+ NEEDS_FEAT(HDFGWTR2_EL2_nPMICFILTR_EL0 |
+ HDFGWTR2_EL2_nPMICNTR_EL0,
+ FEAT_PMUv3_ICNTR),
+ NEEDS_FEAT(HDFGWTR2_EL2_nPMUACR_EL1 |
+ HDFGWTR2_EL2_nPMZR_EL0,
+ feat_pmuv3p9),
+ NEEDS_FEAT(HDFGWTR2_EL2_nPMSSCR_EL1, FEAT_PMUv3_SS),
+ NEEDS_FEAT(HDFGWTR2_EL2_nPMIAR_EL1, FEAT_SEBEP),
+ NEEDS_FEAT(HDFGWTR2_EL2_nPMSDSFR_EL1, feat_spe_fds),
+ NEEDS_FEAT(HDFGWTR2_EL2_nPMBMAR_EL1, FEAT_SPE_nVM),
+ NEEDS_FEAT(HDFGWTR2_EL2_nSPMACCESSR_EL1 |
+ HDFGWTR2_EL2_nSPMCNTEN |
+ HDFGWTR2_EL2_nSPMCR_EL0 |
+ HDFGWTR2_EL2_nSPMEVCNTRn_EL0 |
+ HDFGWTR2_EL2_nSPMEVTYPERn_EL0|
+ HDFGWTR2_EL2_nSPMINTEN |
+ HDFGWTR2_EL2_nSPMOVS |
+ HDFGWTR2_EL2_nSPMSCR_EL1 |
+ HDFGWTR2_EL2_nSPMSELR_EL0,
+ FEAT_SPMU),
+ NEEDS_FEAT(HDFGWTR2_EL2_nMDSTEPOP_EL1, FEAT_STEP2),
+ NEEDS_FEAT(HDFGWTR2_EL2_nTRBMPAM_EL1, feat_trbe_mpam),
+};
+
+static const DECLARE_FEAT_MAP_FGT(hdfgwtr2_desc, hdfgwtr2_masks,
+ hdfgwtr2_feat_map, FEAT_FGT2);
+
+
+static const struct reg_bits_to_feat_map hcrx_feat_map[] = {
+ NEEDS_FEAT(HCRX_EL2_PACMEn, feat_pauth_lr),
+ NEEDS_FEAT(HCRX_EL2_EnFPM, FEAT_FPMR),
+ NEEDS_FEAT(HCRX_EL2_GCSEn, FEAT_GCS),
+ NEEDS_FEAT(HCRX_EL2_EnIDCP128, FEAT_SYSREG128),
+ NEEDS_FEAT(HCRX_EL2_EnSDERR, feat_aderr),
+ NEEDS_FEAT(HCRX_EL2_TMEA, FEAT_DoubleFault2),
+ NEEDS_FEAT(HCRX_EL2_EnSNERR, feat_anerr),
+ NEEDS_FEAT(HCRX_EL2_D128En, FEAT_D128),
+ NEEDS_FEAT(HCRX_EL2_PTTWI, FEAT_THE),
+ NEEDS_FEAT(HCRX_EL2_SCTLR2En, FEAT_SCTLR2),
+ NEEDS_FEAT(HCRX_EL2_TCR2En, FEAT_TCR2),
+ NEEDS_FEAT(HCRX_EL2_MSCEn |
+ HCRX_EL2_MCE2,
+ FEAT_MOPS),
+ NEEDS_FEAT(HCRX_EL2_CMOW, FEAT_CMOW),
+ NEEDS_FEAT(HCRX_EL2_VFNMI |
+ HCRX_EL2_VINMI |
+ HCRX_EL2_TALLINT,
+ FEAT_NMI),
+ NEEDS_FEAT(HCRX_EL2_SMPME, feat_sme_smps),
+ NEEDS_FEAT(HCRX_EL2_FGTnXS |
+ HCRX_EL2_FnXS,
+ FEAT_XS),
+ NEEDS_FEAT(HCRX_EL2_EnASR, FEAT_LS64_V),
+ NEEDS_FEAT(HCRX_EL2_EnALS, FEAT_LS64),
+ NEEDS_FEAT(HCRX_EL2_EnAS0, FEAT_LS64_ACCDATA),
+};
+
+
+static const DECLARE_FEAT_MAP(hcrx_desc, __HCRX_EL2,
+ hcrx_feat_map, FEAT_HCX);
+
+static const struct reg_bits_to_feat_map hcr_feat_map[] = {
+ NEEDS_FEAT(HCR_EL2_TID0, FEAT_AA32EL0),
+ NEEDS_FEAT_FIXED(HCR_EL2_RW, compute_hcr_rw),
+ NEEDS_FEAT(HCR_EL2_HCD, not_feat_aa64el3),
+ NEEDS_FEAT(HCR_EL2_AMO |
+ HCR_EL2_BSU |
+ HCR_EL2_CD |
+ HCR_EL2_DC |
+ HCR_EL2_FB |
+ HCR_EL2_FMO |
+ HCR_EL2_ID |
+ HCR_EL2_IMO |
+ HCR_EL2_MIOCNCE |
+ HCR_EL2_PTW |
+ HCR_EL2_SWIO |
+ HCR_EL2_TACR |
+ HCR_EL2_TDZ |
+ HCR_EL2_TGE |
+ HCR_EL2_TID1 |
+ HCR_EL2_TID2 |
+ HCR_EL2_TID3 |
+ HCR_EL2_TIDCP |
+ HCR_EL2_TPCP |
+ HCR_EL2_TPU |
+ HCR_EL2_TRVM |
+ HCR_EL2_TSC |
+ HCR_EL2_TSW |
+ HCR_EL2_TTLB |
+ HCR_EL2_TVM |
+ HCR_EL2_TWE |
+ HCR_EL2_TWI |
+ HCR_EL2_VF |
+ HCR_EL2_VI |
+ HCR_EL2_VM |
+ HCR_EL2_VSE,
+ FEAT_AA64EL1),
+ NEEDS_FEAT(HCR_EL2_AMVOFFEN, FEAT_AMUv1p1),
+ NEEDS_FEAT(HCR_EL2_EnSCXT, feat_csv2_2_csv2_1p2),
+ NEEDS_FEAT(HCR_EL2_TICAB |
+ HCR_EL2_TID4 |
+ HCR_EL2_TOCU,
+ FEAT_EVT),
+ NEEDS_FEAT(HCR_EL2_TTLBIS |
+ HCR_EL2_TTLBOS,
+ FEAT_EVT_TTLBxS),
+ NEEDS_FEAT(HCR_EL2_TLOR, FEAT_LOR),
+ NEEDS_FEAT(HCR_EL2_ATA |
+ HCR_EL2_DCT |
+ HCR_EL2_TID5,
+ FEAT_MTE2),
+ NEEDS_FEAT(HCR_EL2_AT | /* Ignore the original FEAT_NV */
+ HCR_EL2_NV2 |
+ HCR_EL2_NV,
+ feat_nv2),
+ NEEDS_FEAT(HCR_EL2_NV1, feat_nv2_e2h0_ni), /* Missing from JSON */
+ NEEDS_FEAT(HCR_EL2_API |
+ HCR_EL2_APK,
+ feat_pauth),
+ NEEDS_FEAT(HCR_EL2_TEA |
+ HCR_EL2_TERR,
+ FEAT_RAS),
+ NEEDS_FEAT(HCR_EL2_FIEN, feat_rasv1p1),
+ NEEDS_FEAT(HCR_EL2_GPF, FEAT_RME),
+ NEEDS_FEAT(HCR_EL2_FWB, FEAT_S2FWB),
+ NEEDS_FEAT(HCR_EL2_TME, FEAT_TME),
+ NEEDS_FEAT(HCR_EL2_TWEDEL |
+ HCR_EL2_TWEDEn,
+ FEAT_TWED),
+ NEEDS_FEAT_FIXED(HCR_EL2_E2H, compute_hcr_e2h),
+};
+
+static const DECLARE_FEAT_MAP(hcr_desc, HCR_EL2,
+ hcr_feat_map, FEAT_AA64EL2);
+
+static const struct reg_bits_to_feat_map sctlr2_feat_map[] = {
+ NEEDS_FEAT(SCTLR2_EL1_NMEA |
+ SCTLR2_EL1_EASE,
+ FEAT_DoubleFault2),
+ NEEDS_FEAT(SCTLR2_EL1_EnADERR, feat_aderr),
+ NEEDS_FEAT(SCTLR2_EL1_EnANERR, feat_anerr),
+ NEEDS_FEAT(SCTLR2_EL1_EnIDCP128, FEAT_SYSREG128),
+ NEEDS_FEAT(SCTLR2_EL1_EnPACM |
+ SCTLR2_EL1_EnPACM0,
+ feat_pauth_lr),
+ NEEDS_FEAT(SCTLR2_EL1_CPTA |
+ SCTLR2_EL1_CPTA0 |
+ SCTLR2_EL1_CPTM |
+ SCTLR2_EL1_CPTM0,
+ FEAT_CPA2),
+};
+
+static const DECLARE_FEAT_MAP(sctlr2_desc, SCTLR2_EL1,
+ sctlr2_feat_map, FEAT_SCTLR2);
+
+static const struct reg_bits_to_feat_map tcr2_el2_feat_map[] = {
+ NEEDS_FEAT(TCR2_EL2_FNG1 |
+ TCR2_EL2_FNG0 |
+ TCR2_EL2_A2,
+ feat_asid2_e2h1),
+ NEEDS_FEAT(TCR2_EL2_DisCH1 |
+ TCR2_EL2_DisCH0 |
+ TCR2_EL2_D128,
+ feat_d128_e2h1),
+ NEEDS_FEAT(TCR2_EL2_AMEC1, feat_mec_e2h1),
+ NEEDS_FEAT(TCR2_EL2_AMEC0, FEAT_MEC),
+ NEEDS_FEAT(TCR2_EL2_HAFT, FEAT_HAFT),
+ NEEDS_FEAT(TCR2_EL2_PTTWI |
+ TCR2_EL2_PnCH,
+ FEAT_THE),
+ NEEDS_FEAT(TCR2_EL2_AIE, FEAT_AIE),
+ NEEDS_FEAT(TCR2_EL2_POE |
+ TCR2_EL2_E0POE,
+ FEAT_S1POE),
+ NEEDS_FEAT(TCR2_EL2_PIE, FEAT_S1PIE),
+};
+
+static const DECLARE_FEAT_MAP(tcr2_el2_desc, TCR2_EL2,
+ tcr2_el2_feat_map, FEAT_TCR2);
+
+static const struct reg_bits_to_feat_map sctlr_el1_feat_map[] = {
+ NEEDS_FEAT(SCTLR_EL1_CP15BEN |
+ SCTLR_EL1_ITD |
+ SCTLR_EL1_SED,
+ FEAT_AA32EL0),
+ NEEDS_FEAT(SCTLR_EL1_BT0 |
+ SCTLR_EL1_BT1,
+ FEAT_BTI),
+ NEEDS_FEAT(SCTLR_EL1_CMOW, FEAT_CMOW),
+ NEEDS_FEAT(SCTLR_EL1_TSCXT, feat_csv2_2_csv2_1p2),
+ NEEDS_FEAT(SCTLR_EL1_EIS |
+ SCTLR_EL1_EOS,
+ FEAT_ExS),
+ NEEDS_FEAT(SCTLR_EL1_EnFPM, FEAT_FPMR),
+ NEEDS_FEAT(SCTLR_EL1_IESB, FEAT_IESB),
+ NEEDS_FEAT(SCTLR_EL1_EnALS, FEAT_LS64),
+ NEEDS_FEAT(SCTLR_EL1_EnAS0, FEAT_LS64_ACCDATA),
+ NEEDS_FEAT(SCTLR_EL1_EnASR, FEAT_LS64_V),
+ NEEDS_FEAT(SCTLR_EL1_nAA, FEAT_LSE2),
+ NEEDS_FEAT(SCTLR_EL1_LSMAOE |
+ SCTLR_EL1_nTLSMD,
+ FEAT_LSMAOC),
+ NEEDS_FEAT(SCTLR_EL1_EE, FEAT_MixedEnd),
+ NEEDS_FEAT(SCTLR_EL1_E0E, feat_mixedendel0),
+ NEEDS_FEAT(SCTLR_EL1_MSCEn, FEAT_MOPS),
+ NEEDS_FEAT(SCTLR_EL1_ATA0 |
+ SCTLR_EL1_ATA |
+ SCTLR_EL1_TCF0 |
+ SCTLR_EL1_TCF,
+ FEAT_MTE2),
+ NEEDS_FEAT(SCTLR_EL1_ITFSB, feat_mte_async),
+ NEEDS_FEAT(SCTLR_EL1_TCSO0 |
+ SCTLR_EL1_TCSO,
+ FEAT_MTE_STORE_ONLY),
+ NEEDS_FEAT(SCTLR_EL1_NMI |
+ SCTLR_EL1_SPINTMASK,
+ FEAT_NMI),
+ NEEDS_FEAT(SCTLR_EL1_SPAN, FEAT_PAN),
+ NEEDS_FEAT(SCTLR_EL1_EPAN, FEAT_PAN3),
+ NEEDS_FEAT(SCTLR_EL1_EnDA |
+ SCTLR_EL1_EnDB |
+ SCTLR_EL1_EnIA |
+ SCTLR_EL1_EnIB,
+ feat_pauth),
+ NEEDS_FEAT(SCTLR_EL1_EnTP2, FEAT_SME),
+ NEEDS_FEAT(SCTLR_EL1_EnRCTX, FEAT_SPECRES),
+ NEEDS_FEAT(SCTLR_EL1_DSSBS, FEAT_SSBS),
+ NEEDS_FEAT(SCTLR_EL1_TIDCP, FEAT_TIDCP1),
+ NEEDS_FEAT(SCTLR_EL1_TME0 |
+ SCTLR_EL1_TME |
+ SCTLR_EL1_TMT0 |
+ SCTLR_EL1_TMT,
+ FEAT_TME),
+ NEEDS_FEAT(SCTLR_EL1_TWEDEL |
+ SCTLR_EL1_TWEDEn,
+ FEAT_TWED),
+ NEEDS_FEAT(SCTLR_EL1_UCI |
+ SCTLR_EL1_EE |
+ SCTLR_EL1_E0E |
+ SCTLR_EL1_WXN |
+ SCTLR_EL1_nTWE |
+ SCTLR_EL1_nTWI |
+ SCTLR_EL1_UCT |
+ SCTLR_EL1_DZE |
+ SCTLR_EL1_I |
+ SCTLR_EL1_UMA |
+ SCTLR_EL1_SA0 |
+ SCTLR_EL1_SA |
+ SCTLR_EL1_C |
+ SCTLR_EL1_A |
+ SCTLR_EL1_M,
+ FEAT_AA64EL1),
+};
+
+static const DECLARE_FEAT_MAP(sctlr_el1_desc, SCTLR_EL1,
+ sctlr_el1_feat_map, FEAT_AA64EL1);
+
+static const struct reg_bits_to_feat_map mdcr_el2_feat_map[] = {
+ NEEDS_FEAT(MDCR_EL2_EBWE, FEAT_Debugv8p9),
+ NEEDS_FEAT(MDCR_EL2_TDOSA, FEAT_DoubleLock),
+ NEEDS_FEAT(MDCR_EL2_PMEE, FEAT_EBEP),
+ NEEDS_FEAT(MDCR_EL2_TDCC, FEAT_FGT),
+ NEEDS_FEAT(MDCR_EL2_MTPME, FEAT_MTPMU),
+ NEEDS_FEAT(MDCR_EL2_HPME |
+ MDCR_EL2_HPMN |
+ MDCR_EL2_TPMCR |
+ MDCR_EL2_TPM,
+ FEAT_PMUv3),
+ NEEDS_FEAT(MDCR_EL2_HPMD, feat_pmuv3p1),
+ NEEDS_FEAT(MDCR_EL2_HCCD |
+ MDCR_EL2_HLP,
+ feat_pmuv3p5),
+ NEEDS_FEAT(MDCR_EL2_HPMFZO, feat_pmuv3p7),
+ NEEDS_FEAT(MDCR_EL2_PMSSE, FEAT_PMUv3_SS),
+ NEEDS_FEAT(MDCR_EL2_E2PB |
+ MDCR_EL2_TPMS,
+ FEAT_SPE),
+ NEEDS_FEAT(MDCR_EL2_HPMFZS, FEAT_SPEv1p2),
+ NEEDS_FEAT(MDCR_EL2_EnSPM, FEAT_SPMU),
+ NEEDS_FEAT(MDCR_EL2_EnSTEPOP, FEAT_STEP2),
+ NEEDS_FEAT(MDCR_EL2_E2TB, FEAT_TRBE),
+ NEEDS_FEAT(MDCR_EL2_TTRF, FEAT_TRF),
+ NEEDS_FEAT(MDCR_EL2_TDA |
+ MDCR_EL2_TDE |
+ MDCR_EL2_TDRA,
+ FEAT_AA64EL1),
+};
+
+static const DECLARE_FEAT_MAP(mdcr_el2_desc, MDCR_EL2,
+ mdcr_el2_feat_map, FEAT_AA64EL2);
+
+static void __init check_feat_map(const struct reg_bits_to_feat_map *map,
+ int map_size, u64 res0, const char *str)
+{
+ u64 mask = 0;
+
+ for (int i = 0; i < map_size; i++)
+ mask |= map[i].bits;
+
+ if (mask != ~res0)
+ kvm_err("Undefined %s behaviour, bits %016llx\n",
+ str, mask ^ ~res0);
+}
+
+static u64 reg_feat_map_bits(const struct reg_bits_to_feat_map *map)
+{
+ return map->flags & RES0_POINTER ? ~(*map->res0p) : map->bits;
+}
+
+static void __init check_reg_desc(const struct reg_feat_map_desc *r)
+{
+ check_feat_map(r->bit_feat_map, r->bit_feat_map_sz,
+ ~reg_feat_map_bits(&r->feat_map), r->name);
+}
+
+void __init check_feature_map(void)
+{
+ check_reg_desc(&hfgrtr_desc);
+ check_reg_desc(&hfgwtr_desc);
+ check_reg_desc(&hfgitr_desc);
+ check_reg_desc(&hdfgrtr_desc);
+ check_reg_desc(&hdfgwtr_desc);
+ check_reg_desc(&hafgrtr_desc);
+ check_reg_desc(&hfgrtr2_desc);
+ check_reg_desc(&hfgwtr2_desc);
+ check_reg_desc(&hfgitr2_desc);
+ check_reg_desc(&hdfgrtr2_desc);
+ check_reg_desc(&hdfgwtr2_desc);
+ check_reg_desc(&hcrx_desc);
+ check_reg_desc(&hcr_desc);
+ check_reg_desc(&sctlr2_desc);
+ check_reg_desc(&tcr2_el2_desc);
+ check_reg_desc(&sctlr_el1_desc);
+ check_reg_desc(&mdcr_el2_desc);
+}
+
+static bool idreg_feat_match(struct kvm *kvm, const struct reg_bits_to_feat_map *map)
+{
+ u64 regval = kvm->arch.id_regs[map->regidx];
+ u64 regfld = (regval >> map->shift) & GENMASK(map->width - 1, 0);
+
+ if (map->sign) {
+ s64 sfld = sign_extend64(regfld, map->width - 1);
+ s64 slim = sign_extend64(map->lo_lim, map->width - 1);
+ return sfld >= slim;
+ } else {
+ return regfld >= map->lo_lim;
+ }
+}
+
+static u64 __compute_fixed_bits(struct kvm *kvm,
+ const struct reg_bits_to_feat_map *map,
+ int map_size,
+ u64 *fixed_bits,
+ unsigned long require,
+ unsigned long exclude)
+{
+ u64 val = 0;
+
+ for (int i = 0; i < map_size; i++) {
+ bool match;
+
+ if ((map[i].flags & require) != require)
+ continue;
+
+ if (map[i].flags & exclude)
+ continue;
+
+ if (map[i].flags & CALL_FUNC)
+ match = (map[i].flags & FIXED_VALUE) ?
+ map[i].fval(kvm, fixed_bits) :
+ map[i].match(kvm);
+ else
+ match = idreg_feat_match(kvm, &map[i]);
+
+ if (!match || (map[i].flags & FIXED_VALUE))
+ val |= reg_feat_map_bits(&map[i]);
+ }
+
+ return val;
+}
+
+static u64 compute_res0_bits(struct kvm *kvm,
+ const struct reg_bits_to_feat_map *map,
+ int map_size,
+ unsigned long require,
+ unsigned long exclude)
+{
+ return __compute_fixed_bits(kvm, map, map_size, NULL,
+ require, exclude | FIXED_VALUE);
+}
+
+static u64 compute_reg_res0_bits(struct kvm *kvm,
+ const struct reg_feat_map_desc *r,
+ unsigned long require, unsigned long exclude)
+
+{
+ u64 res0;
+
+ res0 = compute_res0_bits(kvm, r->bit_feat_map, r->bit_feat_map_sz,
+ require, exclude);
+
+ /*
+ * If computing FGUs, don't take RES0 or register existence
+ * into account -- we're not computing bits for the register
+ * itself.
+ */
+ if (!(exclude & NEVER_FGU)) {
+ res0 |= compute_res0_bits(kvm, &r->feat_map, 1, require, exclude);
+ res0 |= ~reg_feat_map_bits(&r->feat_map);
+ }
+
+ return res0;
+}
+
+static u64 compute_reg_fixed_bits(struct kvm *kvm,
+ const struct reg_feat_map_desc *r,
+ u64 *fixed_bits, unsigned long require,
+ unsigned long exclude)
+{
+ return __compute_fixed_bits(kvm, r->bit_feat_map, r->bit_feat_map_sz,
+ fixed_bits, require | FIXED_VALUE, exclude);
+}
+
+void compute_fgu(struct kvm *kvm, enum fgt_group_id fgt)
+{
+ u64 val = 0;
+
+ switch (fgt) {
+ case HFGRTR_GROUP:
+ val |= compute_reg_res0_bits(kvm, &hfgrtr_desc,
+ 0, NEVER_FGU);
+ val |= compute_reg_res0_bits(kvm, &hfgwtr_desc,
+ 0, NEVER_FGU);
+ break;
+ case HFGITR_GROUP:
+ val |= compute_reg_res0_bits(kvm, &hfgitr_desc,
+ 0, NEVER_FGU);
+ break;
+ case HDFGRTR_GROUP:
+ val |= compute_reg_res0_bits(kvm, &hdfgrtr_desc,
+ 0, NEVER_FGU);
+ val |= compute_reg_res0_bits(kvm, &hdfgwtr_desc,
+ 0, NEVER_FGU);
+ break;
+ case HAFGRTR_GROUP:
+ val |= compute_reg_res0_bits(kvm, &hafgrtr_desc,
+ 0, NEVER_FGU);
+ break;
+ case HFGRTR2_GROUP:
+ val |= compute_reg_res0_bits(kvm, &hfgrtr2_desc,
+ 0, NEVER_FGU);
+ val |= compute_reg_res0_bits(kvm, &hfgwtr2_desc,
+ 0, NEVER_FGU);
+ break;
+ case HFGITR2_GROUP:
+ val |= compute_reg_res0_bits(kvm, &hfgitr2_desc,
+ 0, NEVER_FGU);
+ break;
+ case HDFGRTR2_GROUP:
+ val |= compute_reg_res0_bits(kvm, &hdfgrtr2_desc,
+ 0, NEVER_FGU);
+ val |= compute_reg_res0_bits(kvm, &hdfgwtr2_desc,
+ 0, NEVER_FGU);
+ break;
+ default:
+ BUG();
+ }
+
+ kvm->arch.fgu[fgt] = val;
+}
+
+void get_reg_fixed_bits(struct kvm *kvm, enum vcpu_sysreg reg, u64 *res0, u64 *res1)
+{
+ u64 fixed = 0, mask;
+
+ switch (reg) {
+ case HFGRTR_EL2:
+ *res0 = compute_reg_res0_bits(kvm, &hfgrtr_desc, 0, 0);
+ *res1 = HFGRTR_EL2_RES1;
+ break;
+ case HFGWTR_EL2:
+ *res0 = compute_reg_res0_bits(kvm, &hfgwtr_desc, 0, 0);
+ *res1 = HFGWTR_EL2_RES1;
+ break;
+ case HFGITR_EL2:
+ *res0 = compute_reg_res0_bits(kvm, &hfgitr_desc, 0, 0);
+ *res1 = HFGITR_EL2_RES1;
+ break;
+ case HDFGRTR_EL2:
+ *res0 = compute_reg_res0_bits(kvm, &hdfgrtr_desc, 0, 0);
+ *res1 = HDFGRTR_EL2_RES1;
+ break;
+ case HDFGWTR_EL2:
+ *res0 = compute_reg_res0_bits(kvm, &hdfgwtr_desc, 0, 0);
+ *res1 = HDFGWTR_EL2_RES1;
+ break;
+ case HAFGRTR_EL2:
+ *res0 = compute_reg_res0_bits(kvm, &hafgrtr_desc, 0, 0);
+ *res1 = HAFGRTR_EL2_RES1;
+ break;
+ case HFGRTR2_EL2:
+ *res0 = compute_reg_res0_bits(kvm, &hfgrtr2_desc, 0, 0);
+ *res1 = HFGRTR2_EL2_RES1;
+ break;
+ case HFGWTR2_EL2:
+ *res0 = compute_reg_res0_bits(kvm, &hfgwtr2_desc, 0, 0);
+ *res1 = HFGWTR2_EL2_RES1;
+ break;
+ case HFGITR2_EL2:
+ *res0 = compute_reg_res0_bits(kvm, &hfgitr2_desc, 0, 0);
+ *res1 = HFGITR2_EL2_RES1;
+ break;
+ case HDFGRTR2_EL2:
+ *res0 = compute_reg_res0_bits(kvm, &hdfgrtr2_desc, 0, 0);
+ *res1 = HDFGRTR2_EL2_RES1;
+ break;
+ case HDFGWTR2_EL2:
+ *res0 = compute_reg_res0_bits(kvm, &hdfgwtr2_desc, 0, 0);
+ *res1 = HDFGWTR2_EL2_RES1;
+ break;
+ case HCRX_EL2:
+ *res0 = compute_reg_res0_bits(kvm, &hcrx_desc, 0, 0);
+ *res1 = __HCRX_EL2_RES1;
+ break;
+ case HCR_EL2:
+ mask = compute_reg_fixed_bits(kvm, &hcr_desc, &fixed, 0, 0);
+ *res0 = compute_reg_res0_bits(kvm, &hcr_desc, 0, 0);
+ *res0 |= (mask & ~fixed);
+ *res1 = HCR_EL2_RES1 | (mask & fixed);
+ break;
+ case SCTLR2_EL1:
+ case SCTLR2_EL2:
+ *res0 = compute_reg_res0_bits(kvm, &sctlr2_desc, 0, 0);
+ *res1 = SCTLR2_EL1_RES1;
+ break;
+ case TCR2_EL2:
+ *res0 = compute_reg_res0_bits(kvm, &tcr2_el2_desc, 0, 0);
+ *res1 = TCR2_EL2_RES1;
+ break;
+ case SCTLR_EL1:
+ *res0 = compute_reg_res0_bits(kvm, &sctlr_el1_desc, 0, 0);
+ *res1 = SCTLR_EL1_RES1;
+ break;
+ case MDCR_EL2:
+ *res0 = compute_reg_res0_bits(kvm, &mdcr_el2_desc, 0, 0);
+ *res1 = MDCR_EL2_RES1;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ *res0 = *res1 = 0;
+ break;
+ }
+}
+
+static __always_inline struct fgt_masks *__fgt_reg_to_masks(enum vcpu_sysreg reg)
+{
+ switch (reg) {
+ case HFGRTR_EL2:
+ return &hfgrtr_masks;
+ case HFGWTR_EL2:
+ return &hfgwtr_masks;
+ case HFGITR_EL2:
+ return &hfgitr_masks;
+ case HDFGRTR_EL2:
+ return &hdfgrtr_masks;
+ case HDFGWTR_EL2:
+ return &hdfgwtr_masks;
+ case HAFGRTR_EL2:
+ return &hafgrtr_masks;
+ case HFGRTR2_EL2:
+ return &hfgrtr2_masks;
+ case HFGWTR2_EL2:
+ return &hfgwtr2_masks;
+ case HFGITR2_EL2:
+ return &hfgitr2_masks;
+ case HDFGRTR2_EL2:
+ return &hdfgrtr2_masks;
+ case HDFGWTR2_EL2:
+ return &hdfgwtr2_masks;
+ default:
+ BUILD_BUG_ON(1);
+ }
+}
+
+static __always_inline void __compute_fgt(struct kvm_vcpu *vcpu, enum vcpu_sysreg reg)
+{
+ u64 fgu = vcpu->kvm->arch.fgu[__fgt_reg_to_group_id(reg)];
+ struct fgt_masks *m = __fgt_reg_to_masks(reg);
+ u64 clear = 0, set = 0, val = m->nmask;
+
+ set |= fgu & m->mask;
+ clear |= fgu & m->nmask;
+
+ if (is_nested_ctxt(vcpu)) {
+ u64 nested = __vcpu_sys_reg(vcpu, reg);
+ set |= nested & m->mask;
+ clear |= ~nested & m->nmask;
+ }
+
+ val |= set;
+ val &= ~clear;
+ *vcpu_fgt(vcpu, reg) = val;
+}
+
+static void __compute_hfgwtr(struct kvm_vcpu *vcpu)
+{
+ __compute_fgt(vcpu, HFGWTR_EL2);
+
+ if (cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38))
+ *vcpu_fgt(vcpu, HFGWTR_EL2) |= HFGWTR_EL2_TCR_EL1;
+}
+
+static void __compute_hdfgwtr(struct kvm_vcpu *vcpu)
+{
+ __compute_fgt(vcpu, HDFGWTR_EL2);
+
+ if (is_hyp_ctxt(vcpu))
+ *vcpu_fgt(vcpu, HDFGWTR_EL2) |= HDFGWTR_EL2_MDSCR_EL1;
+}
+
+void kvm_vcpu_load_fgt(struct kvm_vcpu *vcpu)
+{
+ if (!cpus_have_final_cap(ARM64_HAS_FGT))
+ return;
+
+ __compute_fgt(vcpu, HFGRTR_EL2);
+ __compute_hfgwtr(vcpu);
+ __compute_fgt(vcpu, HFGITR_EL2);
+ __compute_fgt(vcpu, HDFGRTR_EL2);
+ __compute_hdfgwtr(vcpu);
+ __compute_fgt(vcpu, HAFGRTR_EL2);
+
+ if (!cpus_have_final_cap(ARM64_HAS_FGT2))
+ return;
+
+ __compute_fgt(vcpu, HFGRTR2_EL2);
+ __compute_fgt(vcpu, HFGWTR2_EL2);
+ __compute_fgt(vcpu, HFGITR2_EL2);
+ __compute_fgt(vcpu, HDFGRTR2_EL2);
+ __compute_fgt(vcpu, HDFGWTR2_EL2);
+}
diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c
index fccf9ec01813..3ad6b7c6e4ba 100644
--- a/arch/arm64/kvm/debug.c
+++ b/arch/arm64/kvm/debug.c
@@ -3,7 +3,8 @@
* Debug and Guest Debug support
*
* Copyright (C) 2015 - Linaro Ltd
- * Author: Alex Bennée <alex.bennee@linaro.org>
+ * Authors: Alex Bennée <alex.bennee@linaro.org>
+ * Oliver Upton <oliver.upton@linux.dev>
*/
#include <linux/kvm_host.h>
@@ -14,70 +15,10 @@
#include <asm/kvm_arm.h>
#include <asm/kvm_emulate.h>
-#include "trace.h"
-
-/* These are the bits of MDSCR_EL1 we may manipulate */
-#define MDSCR_EL1_DEBUG_MASK (DBG_MDSCR_SS | \
- DBG_MDSCR_KDE | \
- DBG_MDSCR_MDE)
-
-static DEFINE_PER_CPU(u64, mdcr_el2);
-
-/**
- * save/restore_guest_debug_regs
- *
- * For some debug operations we need to tweak some guest registers. As
- * a result we need to save the state of those registers before we
- * make those modifications.
- *
- * Guest access to MDSCR_EL1 is trapped by the hypervisor and handled
- * after we have restored the preserved value to the main context.
- *
- * When single-step is enabled by userspace, we tweak PSTATE.SS on every
- * guest entry. Preserve PSTATE.SS so we can restore the original value
- * for the vcpu after the single-step is disabled.
- */
-static void save_guest_debug_regs(struct kvm_vcpu *vcpu)
-{
- u64 val = vcpu_read_sys_reg(vcpu, MDSCR_EL1);
-
- vcpu->arch.guest_debug_preserved.mdscr_el1 = val;
-
- trace_kvm_arm_set_dreg32("Saved MDSCR_EL1",
- vcpu->arch.guest_debug_preserved.mdscr_el1);
-
- vcpu->arch.guest_debug_preserved.pstate_ss =
- (*vcpu_cpsr(vcpu) & DBG_SPSR_SS);
-}
-
-static void restore_guest_debug_regs(struct kvm_vcpu *vcpu)
-{
- u64 val = vcpu->arch.guest_debug_preserved.mdscr_el1;
-
- vcpu_write_sys_reg(vcpu, val, MDSCR_EL1);
-
- trace_kvm_arm_set_dreg32("Restored MDSCR_EL1",
- vcpu_read_sys_reg(vcpu, MDSCR_EL1));
-
- if (vcpu->arch.guest_debug_preserved.pstate_ss)
- *vcpu_cpsr(vcpu) |= DBG_SPSR_SS;
- else
- *vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS;
-}
-
-/**
- * kvm_arm_init_debug - grab what we need for debug
- *
- * Currently the sole task of this function is to retrieve the initial
- * value of mdcr_el2 so we can preserve MDCR_EL2.HPMN which has
- * presumably been set-up by some knowledgeable bootcode.
- *
- * It is called once per-cpu during CPU hyp initialisation.
- */
-
-void kvm_arm_init_debug(void)
+static int cpu_has_spe(u64 dfr0)
{
- __this_cpu_write(mdcr_el2, kvm_call_hyp_ret(__kvm_get_mdcr_el2));
+ return cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_PMSVer_SHIFT) &&
+ !(read_sysreg_s(SYS_PMBIDR_EL1) & PMBIDR_EL1_P);
}
/**
@@ -95,11 +36,14 @@ void kvm_arm_init_debug(void)
*/
static void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu)
{
+ preempt_disable();
+
/*
* This also clears MDCR_EL2_E2PB_MASK and MDCR_EL2_E2TB_MASK
* to disable guest access to the profiling and trace buffers
*/
- vcpu->arch.mdcr_el2 = __this_cpu_read(mdcr_el2) & MDCR_EL2_HPMN_MASK;
+ vcpu->arch.mdcr_el2 = FIELD_PREP(MDCR_EL2_HPMN,
+ *host_data_ptr(nr_event_counters));
vcpu->arch.mdcr_el2 |= (MDCR_EL2_TPM |
MDCR_EL2_TPMS |
MDCR_EL2_TTRF |
@@ -113,232 +57,234 @@ static void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu)
vcpu->arch.mdcr_el2 |= MDCR_EL2_TDE;
/*
- * Trap debug register access when one of the following is true:
- * - Userspace is using the hardware to debug the guest
- * (KVM_GUESTDBG_USE_HW is set).
- * - The guest is not using debug (DEBUG_DIRTY clear).
- * - The guest has enabled the OS Lock (debug exceptions are blocked).
+ * Trap debug registers if the guest doesn't have ownership of them.
*/
- if ((vcpu->guest_debug & KVM_GUESTDBG_USE_HW) ||
- !vcpu_get_flag(vcpu, DEBUG_DIRTY) ||
- kvm_vcpu_os_lock_enabled(vcpu))
+ if (!kvm_guest_owns_debug_regs(vcpu))
vcpu->arch.mdcr_el2 |= MDCR_EL2_TDA;
- trace_kvm_arm_set_dreg32("MDCR_EL2", vcpu->arch.mdcr_el2);
-}
+ if (vcpu_has_nv(vcpu))
+ kvm_nested_setup_mdcr_el2(vcpu);
+
+ /* Write MDCR_EL2 directly if we're already at EL2 */
+ if (has_vhe())
+ write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2);
-/**
- * kvm_arm_vcpu_init_debug - setup vcpu debug traps
- *
- * @vcpu: the vcpu pointer
- *
- * Set vcpu initial mdcr_el2 value.
- */
-void kvm_arm_vcpu_init_debug(struct kvm_vcpu *vcpu)
-{
- preempt_disable();
- kvm_arm_setup_mdcr_el2(vcpu);
preempt_enable();
}
-/**
- * kvm_arm_reset_debug_ptr - reset the debug ptr to point to the vcpu state
- */
+void kvm_init_host_debug_data(void)
+{
+ u64 dfr0 = read_sysreg(id_aa64dfr0_el1);
+
+ if (cpuid_feature_extract_signed_field(dfr0, ID_AA64DFR0_EL1_PMUVer_SHIFT) > 0)
+ *host_data_ptr(nr_event_counters) = FIELD_GET(ARMV8_PMU_PMCR_N,
+ read_sysreg(pmcr_el0));
+
+ *host_data_ptr(debug_brps) = SYS_FIELD_GET(ID_AA64DFR0_EL1, BRPs, dfr0);
+ *host_data_ptr(debug_wrps) = SYS_FIELD_GET(ID_AA64DFR0_EL1, WRPs, dfr0);
+
+ if (cpu_has_spe(dfr0))
+ host_data_set_flag(HAS_SPE);
-void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu)
+ if (has_vhe())
+ return;
+
+ /* Check if we have BRBE implemented and available at the host */
+ if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_BRBE_SHIFT))
+ host_data_set_flag(HAS_BRBE);
+
+ if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_TraceFilt_SHIFT)) {
+ /* Force disable trace in protected mode in case of no TRBE */
+ if (is_protected_kvm_enabled())
+ host_data_set_flag(EL1_TRACING_CONFIGURED);
+
+ if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_TraceBuffer_SHIFT) &&
+ !(read_sysreg_s(SYS_TRBIDR_EL1) & TRBIDR_EL1_P))
+ host_data_set_flag(HAS_TRBE);
+ }
+}
+
+void kvm_debug_init_vhe(void)
{
- vcpu->arch.debug_ptr = &vcpu->arch.vcpu_debug_state;
+ /* Clear PMSCR_EL1.E{0,1}SPE which reset to UNKNOWN values. */
+ if (host_data_test_flag(HAS_SPE))
+ write_sysreg_el1(0, SYS_PMSCR);
}
-/**
- * kvm_arm_setup_debug - set up debug related stuff
+/*
+ * Configures the 'external' MDSCR_EL1 value for the guest, i.e. when the host
+ * has taken over MDSCR_EL1.
*
- * @vcpu: the vcpu pointer
+ * - Userspace is single-stepping the guest, and MDSCR_EL1.SS is forced to 1.
*
- * This is called before each entry into the hypervisor to setup any
- * debug related registers.
+ * - Userspace is using the breakpoint/watchpoint registers to debug the
+ * guest, and MDSCR_EL1.MDE is forced to 1.
*
- * Additionally, KVM only traps guest accesses to the debug registers if
- * the guest is not actively using them (see the DEBUG_DIRTY
- * flag on vcpu->arch.iflags). Since the guest must not interfere
- * with the hardware state when debugging the guest, we must ensure that
- * trapping is enabled whenever we are debugging the guest using the
- * debug registers.
+ * - The guest has enabled the OS Lock, and KVM is forcing MDSCR_EL1.MDE to 0,
+ * masking all debug exceptions affected by the OS Lock.
*/
+static void setup_external_mdscr(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Use the guest's MDSCR_EL1 as a starting point, since there are
+ * several other features controlled by MDSCR_EL1 that are not relevant
+ * to the host.
+ *
+ * Clear the bits that KVM may use which also satisfies emulation of
+ * the OS Lock as MDSCR_EL1.MDE is cleared.
+ */
+ u64 mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1) & ~(MDSCR_EL1_SS |
+ MDSCR_EL1_MDE |
+ MDSCR_EL1_KDE);
+
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+ mdscr |= MDSCR_EL1_SS;
+
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW)
+ mdscr |= MDSCR_EL1_MDE | MDSCR_EL1_KDE;
-void kvm_arm_setup_debug(struct kvm_vcpu *vcpu)
+ vcpu->arch.external_mdscr_el1 = mdscr;
+}
+
+void kvm_vcpu_load_debug(struct kvm_vcpu *vcpu)
{
- unsigned long mdscr, orig_mdcr_el2 = vcpu->arch.mdcr_el2;
+ u64 mdscr;
- trace_kvm_arm_setup_debug(vcpu, vcpu->guest_debug);
+ /* Must be called before kvm_vcpu_load_vhe() */
+ KVM_BUG_ON(vcpu_get_flag(vcpu, SYSREGS_ON_CPU), vcpu->kvm);
- kvm_arm_setup_mdcr_el2(vcpu);
+ if (has_vhe())
+ *host_data_ptr(host_debug_state.mdcr_el2) = read_sysreg(mdcr_el2);
- /* Check if we need to use the debug registers. */
+ /*
+ * Determine which of the possible debug states we're in:
+ *
+ * - VCPU_DEBUG_HOST_OWNED: KVM has taken ownership of the guest's
+ * breakpoint/watchpoint registers, or needs to use MDSCR_EL1 to do
+ * software step or emulate the effects of the OS Lock being enabled.
+ *
+ * - VCPU_DEBUG_GUEST_OWNED: The guest has debug exceptions enabled, and
+ * the breakpoint/watchpoint registers need to be loaded eagerly.
+ *
+ * - VCPU_DEBUG_FREE: Neither of the above apply, no breakpoint/watchpoint
+ * context needs to be loaded on the CPU.
+ */
if (vcpu->guest_debug || kvm_vcpu_os_lock_enabled(vcpu)) {
- /* Save guest debug state */
- save_guest_debug_regs(vcpu);
+ vcpu->arch.debug_owner = VCPU_DEBUG_HOST_OWNED;
+ setup_external_mdscr(vcpu);
/*
- * Single Step (ARM ARM D2.12.3 The software step state
- * machine)
- *
- * If we are doing Single Step we need to manipulate
- * the guest's MDSCR_EL1.SS and PSTATE.SS. Once the
- * step has occurred the hypervisor will trap the
- * debug exception and we return to userspace.
- *
- * If the guest attempts to single step its userspace
- * we would have to deal with a trapped exception
- * while in the guest kernel. Because this would be
- * hard to unwind we suppress the guest's ability to
- * do so by masking MDSCR_EL.SS.
- *
- * This confuses guest debuggers which use
- * single-step behind the scenes but everything
- * returns to normal once the host is no longer
- * debugging the system.
+ * Steal the guest's single-step state machine if userspace wants
+ * single-step the guest.
*/
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
- /*
- * If the software step state at the last guest exit
- * was Active-pending, we don't set DBG_SPSR_SS so
- * that the state is maintained (to not run another
- * single-step until the pending Software Step
- * exception is taken).
- */
- if (!vcpu_get_flag(vcpu, DBG_SS_ACTIVE_PENDING))
+ if (*vcpu_cpsr(vcpu) & DBG_SPSR_SS)
+ vcpu_clear_flag(vcpu, GUEST_SS_ACTIVE_PENDING);
+ else
+ vcpu_set_flag(vcpu, GUEST_SS_ACTIVE_PENDING);
+
+ if (!vcpu_get_flag(vcpu, HOST_SS_ACTIVE_PENDING))
*vcpu_cpsr(vcpu) |= DBG_SPSR_SS;
else
*vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS;
-
- mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1);
- mdscr |= DBG_MDSCR_SS;
- vcpu_write_sys_reg(vcpu, mdscr, MDSCR_EL1);
- } else {
- mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1);
- mdscr &= ~DBG_MDSCR_SS;
- vcpu_write_sys_reg(vcpu, mdscr, MDSCR_EL1);
}
+ } else {
+ mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1);
- trace_kvm_arm_set_dreg32("SPSR_EL2", *vcpu_cpsr(vcpu));
-
- /*
- * HW Breakpoints and watchpoints
- *
- * We simply switch the debug_ptr to point to our new
- * external_debug_state which has been populated by the
- * debug ioctl. The existing DEBUG_DIRTY mechanism ensures
- * the registers are updated on the world switch.
- */
- if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) {
- /* Enable breakpoints/watchpoints */
- mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1);
- mdscr |= DBG_MDSCR_MDE;
- vcpu_write_sys_reg(vcpu, mdscr, MDSCR_EL1);
-
- vcpu->arch.debug_ptr = &vcpu->arch.external_debug_state;
- vcpu_set_flag(vcpu, DEBUG_DIRTY);
-
- trace_kvm_arm_set_regset("BKPTS", get_num_brps(),
- &vcpu->arch.debug_ptr->dbg_bcr[0],
- &vcpu->arch.debug_ptr->dbg_bvr[0]);
-
- trace_kvm_arm_set_regset("WAPTS", get_num_wrps(),
- &vcpu->arch.debug_ptr->dbg_wcr[0],
- &vcpu->arch.debug_ptr->dbg_wvr[0]);
-
- /*
- * The OS Lock blocks debug exceptions in all ELs when it is
- * enabled. If the guest has enabled the OS Lock, constrain its
- * effects to the guest. Emulate the behavior by clearing
- * MDSCR_EL1.MDE. In so doing, we ensure that host debug
- * exceptions are unaffected by guest configuration of the OS
- * Lock.
- */
- } else if (kvm_vcpu_os_lock_enabled(vcpu)) {
- mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1);
- mdscr &= ~DBG_MDSCR_MDE;
- vcpu_write_sys_reg(vcpu, mdscr, MDSCR_EL1);
- }
+ if (mdscr & (MDSCR_EL1_KDE | MDSCR_EL1_MDE))
+ vcpu->arch.debug_owner = VCPU_DEBUG_GUEST_OWNED;
+ else
+ vcpu->arch.debug_owner = VCPU_DEBUG_FREE;
}
- BUG_ON(!vcpu->guest_debug &&
- vcpu->arch.debug_ptr != &vcpu->arch.vcpu_debug_state);
-
- /* If KDE or MDE are set, perform a full save/restore cycle. */
- if (vcpu_read_sys_reg(vcpu, MDSCR_EL1) & (DBG_MDSCR_KDE | DBG_MDSCR_MDE))
- vcpu_set_flag(vcpu, DEBUG_DIRTY);
-
- /* Write mdcr_el2 changes since vcpu_load on VHE systems */
- if (has_vhe() && orig_mdcr_el2 != vcpu->arch.mdcr_el2)
- write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2);
-
- trace_kvm_arm_set_dreg32("MDSCR_EL1", vcpu_read_sys_reg(vcpu, MDSCR_EL1));
+ kvm_arm_setup_mdcr_el2(vcpu);
}
-void kvm_arm_clear_debug(struct kvm_vcpu *vcpu)
+void kvm_vcpu_put_debug(struct kvm_vcpu *vcpu)
{
- trace_kvm_arm_clear_debug(vcpu->guest_debug);
+ if (has_vhe())
+ write_sysreg(*host_data_ptr(host_debug_state.mdcr_el2), mdcr_el2);
+
+ if (likely(!(vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
+ return;
/*
- * Restore the guest's debug registers if we were using them.
+ * Save the host's software step state and restore the guest's before
+ * potentially returning to userspace.
*/
- if (vcpu->guest_debug || kvm_vcpu_os_lock_enabled(vcpu)) {
- if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
- if (!(*vcpu_cpsr(vcpu) & DBG_SPSR_SS))
- /*
- * Mark the vcpu as ACTIVE_PENDING
- * until Software Step exception is taken.
- */
- vcpu_set_flag(vcpu, DBG_SS_ACTIVE_PENDING);
- }
+ if (!(*vcpu_cpsr(vcpu) & DBG_SPSR_SS))
+ vcpu_set_flag(vcpu, HOST_SS_ACTIVE_PENDING);
+ else
+ vcpu_clear_flag(vcpu, HOST_SS_ACTIVE_PENDING);
+
+ if (vcpu_get_flag(vcpu, GUEST_SS_ACTIVE_PENDING))
+ *vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS;
+ else
+ *vcpu_cpsr(vcpu) |= DBG_SPSR_SS;
+}
- restore_guest_debug_regs(vcpu);
+/*
+ * Updates ownership of the debug registers after a trapped guest access to a
+ * breakpoint/watchpoint register. Host ownership of the debug registers is of
+ * strictly higher priority, and it is the responsibility of the VMM to emulate
+ * guest debug exceptions in this configuration.
+ */
+void kvm_debug_set_guest_ownership(struct kvm_vcpu *vcpu)
+{
+ if (kvm_host_owns_debug_regs(vcpu))
+ return;
- /*
- * If we were using HW debug we need to restore the
- * debug_ptr to the guest debug state.
- */
- if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) {
- kvm_arm_reset_debug_ptr(vcpu);
+ vcpu->arch.debug_owner = VCPU_DEBUG_GUEST_OWNED;
+ kvm_arm_setup_mdcr_el2(vcpu);
+}
- trace_kvm_arm_set_regset("BKPTS", get_num_brps(),
- &vcpu->arch.debug_ptr->dbg_bcr[0],
- &vcpu->arch.debug_ptr->dbg_bvr[0]);
+void kvm_debug_handle_oslar(struct kvm_vcpu *vcpu, u64 val)
+{
+ if (val & OSLAR_EL1_OSLK)
+ __vcpu_rmw_sys_reg(vcpu, OSLSR_EL1, |=, OSLSR_EL1_OSLK);
+ else
+ __vcpu_rmw_sys_reg(vcpu, OSLSR_EL1, &=, ~OSLSR_EL1_OSLK);
- trace_kvm_arm_set_regset("WAPTS", get_num_wrps(),
- &vcpu->arch.debug_ptr->dbg_wcr[0],
- &vcpu->arch.debug_ptr->dbg_wvr[0]);
- }
- }
+ preempt_disable();
+ kvm_arch_vcpu_put(vcpu);
+ kvm_arch_vcpu_load(vcpu, smp_processor_id());
+ preempt_enable();
}
-void kvm_arch_vcpu_load_debug_state_flags(struct kvm_vcpu *vcpu)
+static bool skip_trbe_access(bool skip_condition)
{
- u64 dfr0;
+ return (WARN_ON_ONCE(preemptible()) || skip_condition ||
+ is_protected_kvm_enabled() || !is_kvm_arm_initialised());
+}
- /* For VHE, there is nothing to do */
- if (has_vhe())
- return;
+void kvm_enable_trbe(void)
+{
+ if (!skip_trbe_access(has_vhe()))
+ host_data_set_flag(TRBE_ENABLED);
+}
+EXPORT_SYMBOL_GPL(kvm_enable_trbe);
- dfr0 = read_sysreg(id_aa64dfr0_el1);
- /*
- * If SPE is present on this CPU and is available at current EL,
- * we may need to check if the host state needs to be saved.
- */
- if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_PMSVer_SHIFT) &&
- !(read_sysreg_s(SYS_PMBIDR_EL1) & BIT(SYS_PMBIDR_EL1_P_SHIFT)))
- vcpu_set_flag(vcpu, DEBUG_STATE_SAVE_SPE);
-
- /* Check if we have TRBE implemented and available at the host */
- if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_TraceBuffer_SHIFT) &&
- !(read_sysreg_s(SYS_TRBIDR_EL1) & TRBIDR_PROG))
- vcpu_set_flag(vcpu, DEBUG_STATE_SAVE_TRBE);
+void kvm_disable_trbe(void)
+{
+ if (!skip_trbe_access(has_vhe()))
+ host_data_clear_flag(TRBE_ENABLED);
}
+EXPORT_SYMBOL_GPL(kvm_disable_trbe);
-void kvm_arch_vcpu_put_debug_state_flags(struct kvm_vcpu *vcpu)
+void kvm_tracing_set_el1_configuration(u64 trfcr_while_in_guest)
{
- vcpu_clear_flag(vcpu, DEBUG_STATE_SAVE_SPE);
- vcpu_clear_flag(vcpu, DEBUG_STATE_SAVE_TRBE);
+ if (skip_trbe_access(false))
+ return;
+
+ if (has_vhe()) {
+ write_sysreg_s(trfcr_while_in_guest, SYS_TRFCR_EL12);
+ return;
+ }
+
+ *host_data_ptr(trfcr_while_in_guest) = trfcr_while_in_guest;
+ if (read_sysreg_s(SYS_TRFCR_EL1) != trfcr_while_in_guest)
+ host_data_set_flag(EL1_TRACING_CONFIGURED);
+ else
+ host_data_clear_flag(EL1_TRACING_CONFIGURED);
}
+EXPORT_SYMBOL_GPL(kvm_tracing_set_el1_configuration);
diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c
new file mode 100644
index 000000000000..834f13fb1fb7
--- /dev/null
+++ b/arch/arm64/kvm/emulate-nested.c
@@ -0,0 +1,2854 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2016 - Linaro and Columbia University
+ * Author: Jintack Lim <jintack.lim@linaro.org>
+ */
+
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_nested.h>
+
+#include "hyp/include/hyp/adjust_pc.h"
+
+#include "trace.h"
+
+enum trap_behaviour {
+ BEHAVE_HANDLE_LOCALLY = 0,
+
+ BEHAVE_FORWARD_READ = BIT(0),
+ BEHAVE_FORWARD_WRITE = BIT(1),
+ BEHAVE_FORWARD_RW = BEHAVE_FORWARD_READ | BEHAVE_FORWARD_WRITE,
+
+ /* Traps that take effect in Host EL0, this is rare! */
+ BEHAVE_FORWARD_IN_HOST_EL0 = BIT(2),
+};
+
+struct trap_bits {
+ const enum vcpu_sysreg index;
+ const enum trap_behaviour behaviour;
+ const u64 value;
+ const u64 mask;
+};
+
+/* Coarse Grained Trap definitions */
+enum cgt_group_id {
+ /* Indicates no coarse trap control */
+ __RESERVED__,
+
+ /*
+ * The first batch of IDs denote coarse trapping that are used
+ * on their own instead of being part of a combination of
+ * trap controls.
+ */
+ CGT_HCR_TID1,
+ CGT_HCR_TID2,
+ CGT_HCR_TID3,
+ CGT_HCR_IMO,
+ CGT_HCR_FMO,
+ CGT_HCR_TIDCP,
+ CGT_HCR_TACR,
+ CGT_HCR_TSW,
+ CGT_HCR_TPC,
+ CGT_HCR_TPU,
+ CGT_HCR_TTLB,
+ CGT_HCR_TVM,
+ CGT_HCR_TDZ,
+ CGT_HCR_TRVM,
+ CGT_HCR_TLOR,
+ CGT_HCR_TERR,
+ CGT_HCR_APK,
+ CGT_HCR_NV,
+ CGT_HCR_NV_nNV2,
+ CGT_HCR_NV1_nNV2,
+ CGT_HCR_AT,
+ CGT_HCR_nFIEN,
+ CGT_HCR_TID4,
+ CGT_HCR_TICAB,
+ CGT_HCR_TOCU,
+ CGT_HCR_ENSCXT,
+ CGT_HCR_TTLBIS,
+ CGT_HCR_TTLBOS,
+
+ CGT_MDCR_TPMCR,
+ CGT_MDCR_TPM,
+ CGT_MDCR_TDE,
+ CGT_MDCR_TDA,
+ CGT_MDCR_TDOSA,
+ CGT_MDCR_TDRA,
+ CGT_MDCR_E2PB,
+ CGT_MDCR_TPMS,
+ CGT_MDCR_TTRF,
+ CGT_MDCR_E2TB,
+ CGT_MDCR_TDCC,
+
+ CGT_CPTR_TAM,
+ CGT_CPTR_TCPAC,
+
+ CGT_HCRX_EnFPM,
+ CGT_HCRX_TCR2En,
+ CGT_HCRX_SCTLR2En,
+
+ CGT_CNTHCTL_EL1TVT,
+ CGT_CNTHCTL_EL1TVCT,
+
+ CGT_ICH_HCR_TC,
+ CGT_ICH_HCR_TALL0,
+ CGT_ICH_HCR_TALL1,
+ CGT_ICH_HCR_TDIR,
+
+ /*
+ * Anything after this point is a combination of coarse trap
+ * controls, which must all be evaluated to decide what to do.
+ */
+ __MULTIPLE_CONTROL_BITS__,
+ CGT_HCR_IMO_FMO_ICH_HCR_TC = __MULTIPLE_CONTROL_BITS__,
+ CGT_HCR_TID2_TID4,
+ CGT_HCR_TTLB_TTLBIS,
+ CGT_HCR_TTLB_TTLBOS,
+ CGT_HCR_TVM_TRVM,
+ CGT_HCR_TVM_TRVM_HCRX_TCR2En,
+ CGT_HCR_TVM_TRVM_HCRX_SCTLR2En,
+ CGT_HCR_TPU_TICAB,
+ CGT_HCR_TPU_TOCU,
+ CGT_HCR_NV1_nNV2_ENSCXT,
+ CGT_MDCR_TPM_TPMCR,
+ CGT_MDCR_TPM_HPMN,
+ CGT_MDCR_TDE_TDA,
+ CGT_MDCR_TDE_TDOSA,
+ CGT_MDCR_TDE_TDRA,
+ CGT_MDCR_TDCC_TDE_TDA,
+
+ CGT_ICH_HCR_TC_TDIR,
+
+ /*
+ * Anything after this point requires a callback evaluating a
+ * complex trap condition. Ugly stuff.
+ */
+ __COMPLEX_CONDITIONS__,
+ CGT_CNTHCTL_EL1PCTEN = __COMPLEX_CONDITIONS__,
+ CGT_CNTHCTL_EL1PTEN,
+ CGT_CNTHCTL_EL1NVPCT,
+ CGT_CNTHCTL_EL1NVVCT,
+
+ CGT_CPTR_TTA,
+ CGT_MDCR_HPMN,
+
+ /* Must be last */
+ __NR_CGT_GROUP_IDS__
+};
+
+static const struct trap_bits coarse_trap_bits[] = {
+ [CGT_HCR_TID1] = {
+ .index = HCR_EL2,
+ .value = HCR_TID1,
+ .mask = HCR_TID1,
+ .behaviour = BEHAVE_FORWARD_READ,
+ },
+ [CGT_HCR_TID2] = {
+ .index = HCR_EL2,
+ .value = HCR_TID2,
+ .mask = HCR_TID2,
+ .behaviour = BEHAVE_FORWARD_RW,
+ },
+ [CGT_HCR_TID3] = {
+ .index = HCR_EL2,
+ .value = HCR_TID3,
+ .mask = HCR_TID3,
+ .behaviour = BEHAVE_FORWARD_READ,
+ },
+ [CGT_HCR_IMO] = {
+ .index = HCR_EL2,
+ .value = HCR_IMO,
+ .mask = HCR_IMO,
+ .behaviour = BEHAVE_FORWARD_WRITE,
+ },
+ [CGT_HCR_FMO] = {
+ .index = HCR_EL2,
+ .value = HCR_FMO,
+ .mask = HCR_FMO,
+ .behaviour = BEHAVE_FORWARD_WRITE,
+ },
+ [CGT_HCR_TIDCP] = {
+ .index = HCR_EL2,
+ .value = HCR_TIDCP,
+ .mask = HCR_TIDCP,
+ .behaviour = BEHAVE_FORWARD_RW,
+ },
+ [CGT_HCR_TACR] = {
+ .index = HCR_EL2,
+ .value = HCR_TACR,
+ .mask = HCR_TACR,
+ .behaviour = BEHAVE_FORWARD_RW,
+ },
+ [CGT_HCR_TSW] = {
+ .index = HCR_EL2,
+ .value = HCR_TSW,
+ .mask = HCR_TSW,
+ .behaviour = BEHAVE_FORWARD_RW,
+ },
+ [CGT_HCR_TPC] = { /* Also called TCPC when FEAT_DPB is implemented */
+ .index = HCR_EL2,
+ .value = HCR_TPC,
+ .mask = HCR_TPC,
+ .behaviour = BEHAVE_FORWARD_RW,
+ },
+ [CGT_HCR_TPU] = {
+ .index = HCR_EL2,
+ .value = HCR_TPU,
+ .mask = HCR_TPU,
+ .behaviour = BEHAVE_FORWARD_RW,
+ },
+ [CGT_HCR_TTLB] = {
+ .index = HCR_EL2,
+ .value = HCR_TTLB,
+ .mask = HCR_TTLB,
+ .behaviour = BEHAVE_FORWARD_RW,
+ },
+ [CGT_HCR_TVM] = {
+ .index = HCR_EL2,
+ .value = HCR_TVM,
+ .mask = HCR_TVM,
+ .behaviour = BEHAVE_FORWARD_WRITE,
+ },
+ [CGT_HCR_TDZ] = {
+ .index = HCR_EL2,
+ .value = HCR_TDZ,
+ .mask = HCR_TDZ,
+ .behaviour = BEHAVE_FORWARD_RW,
+ },
+ [CGT_HCR_TRVM] = {
+ .index = HCR_EL2,
+ .value = HCR_TRVM,
+ .mask = HCR_TRVM,
+ .behaviour = BEHAVE_FORWARD_READ,
+ },
+ [CGT_HCR_TLOR] = {
+ .index = HCR_EL2,
+ .value = HCR_TLOR,
+ .mask = HCR_TLOR,
+ .behaviour = BEHAVE_FORWARD_RW,
+ },
+ [CGT_HCR_TERR] = {
+ .index = HCR_EL2,
+ .value = HCR_TERR,
+ .mask = HCR_TERR,
+ .behaviour = BEHAVE_FORWARD_RW,
+ },
+ [CGT_HCR_APK] = {
+ .index = HCR_EL2,
+ .value = 0,
+ .mask = HCR_APK,
+ .behaviour = BEHAVE_FORWARD_RW,
+ },
+ [CGT_HCR_NV] = {
+ .index = HCR_EL2,
+ .value = HCR_NV,
+ .mask = HCR_NV,
+ .behaviour = BEHAVE_FORWARD_RW,
+ },
+ [CGT_HCR_NV_nNV2] = {
+ .index = HCR_EL2,
+ .value = HCR_NV,
+ .mask = HCR_NV | HCR_NV2,
+ .behaviour = BEHAVE_FORWARD_RW,
+ },
+ [CGT_HCR_NV1_nNV2] = {
+ .index = HCR_EL2,
+ .value = HCR_NV | HCR_NV1,
+ .mask = HCR_NV | HCR_NV1 | HCR_NV2,
+ .behaviour = BEHAVE_FORWARD_RW,
+ },
+ [CGT_HCR_AT] = {
+ .index = HCR_EL2,
+ .value = HCR_AT,
+ .mask = HCR_AT,
+ .behaviour = BEHAVE_FORWARD_RW,
+ },
+ [CGT_HCR_nFIEN] = {
+ .index = HCR_EL2,
+ .value = 0,
+ .mask = HCR_FIEN,
+ .behaviour = BEHAVE_FORWARD_RW,
+ },
+ [CGT_HCR_TID4] = {
+ .index = HCR_EL2,
+ .value = HCR_TID4,
+ .mask = HCR_TID4,
+ .behaviour = BEHAVE_FORWARD_RW,
+ },
+ [CGT_HCR_TICAB] = {
+ .index = HCR_EL2,
+ .value = HCR_TICAB,
+ .mask = HCR_TICAB,
+ .behaviour = BEHAVE_FORWARD_RW,
+ },
+ [CGT_HCR_TOCU] = {
+ .index = HCR_EL2,
+ .value = HCR_TOCU,
+ .mask = HCR_TOCU,
+ .behaviour = BEHAVE_FORWARD_RW,
+ },
+ [CGT_HCR_ENSCXT] = {
+ .index = HCR_EL2,
+ .value = 0,
+ .mask = HCR_ENSCXT,
+ .behaviour = BEHAVE_FORWARD_RW,
+ },
+ [CGT_HCR_TTLBIS] = {
+ .index = HCR_EL2,
+ .value = HCR_TTLBIS,
+ .mask = HCR_TTLBIS,
+ .behaviour = BEHAVE_FORWARD_RW,
+ },
+ [CGT_HCR_TTLBOS] = {
+ .index = HCR_EL2,
+ .value = HCR_TTLBOS,
+ .mask = HCR_TTLBOS,
+ .behaviour = BEHAVE_FORWARD_RW,
+ },
+ [CGT_MDCR_TPMCR] = {
+ .index = MDCR_EL2,
+ .value = MDCR_EL2_TPMCR,
+ .mask = MDCR_EL2_TPMCR,
+ .behaviour = BEHAVE_FORWARD_RW |
+ BEHAVE_FORWARD_IN_HOST_EL0,
+ },
+ [CGT_MDCR_TPM] = {
+ .index = MDCR_EL2,
+ .value = MDCR_EL2_TPM,
+ .mask = MDCR_EL2_TPM,
+ .behaviour = BEHAVE_FORWARD_RW |
+ BEHAVE_FORWARD_IN_HOST_EL0,
+ },
+ [CGT_MDCR_TDE] = {
+ .index = MDCR_EL2,
+ .value = MDCR_EL2_TDE,
+ .mask = MDCR_EL2_TDE,
+ .behaviour = BEHAVE_FORWARD_RW,
+ },
+ [CGT_MDCR_TDA] = {
+ .index = MDCR_EL2,
+ .value = MDCR_EL2_TDA,
+ .mask = MDCR_EL2_TDA,
+ .behaviour = BEHAVE_FORWARD_RW,
+ },
+ [CGT_MDCR_TDOSA] = {
+ .index = MDCR_EL2,
+ .value = MDCR_EL2_TDOSA,
+ .mask = MDCR_EL2_TDOSA,
+ .behaviour = BEHAVE_FORWARD_RW,
+ },
+ [CGT_MDCR_TDRA] = {
+ .index = MDCR_EL2,
+ .value = MDCR_EL2_TDRA,
+ .mask = MDCR_EL2_TDRA,
+ .behaviour = BEHAVE_FORWARD_RW,
+ },
+ [CGT_MDCR_E2PB] = {
+ .index = MDCR_EL2,
+ .value = 0,
+ .mask = BIT(MDCR_EL2_E2PB_SHIFT),
+ .behaviour = BEHAVE_FORWARD_RW,
+ },
+ [CGT_MDCR_TPMS] = {
+ .index = MDCR_EL2,
+ .value = MDCR_EL2_TPMS,
+ .mask = MDCR_EL2_TPMS,
+ .behaviour = BEHAVE_FORWARD_RW,
+ },
+ [CGT_MDCR_TTRF] = {
+ .index = MDCR_EL2,
+ .value = MDCR_EL2_TTRF,
+ .mask = MDCR_EL2_TTRF,
+ .behaviour = BEHAVE_FORWARD_RW,
+ },
+ [CGT_MDCR_E2TB] = {
+ .index = MDCR_EL2,
+ .value = 0,
+ .mask = BIT(MDCR_EL2_E2TB_SHIFT),
+ .behaviour = BEHAVE_FORWARD_RW,
+ },
+ [CGT_MDCR_TDCC] = {
+ .index = MDCR_EL2,
+ .value = MDCR_EL2_TDCC,
+ .mask = MDCR_EL2_TDCC,
+ .behaviour = BEHAVE_FORWARD_RW,
+ },
+ [CGT_CPTR_TAM] = {
+ .index = CPTR_EL2,
+ .value = CPTR_EL2_TAM,
+ .mask = CPTR_EL2_TAM,
+ .behaviour = BEHAVE_FORWARD_RW,
+ },
+ [CGT_CPTR_TCPAC] = {
+ .index = CPTR_EL2,
+ .value = CPTR_EL2_TCPAC,
+ .mask = CPTR_EL2_TCPAC,
+ .behaviour = BEHAVE_FORWARD_RW,
+ },
+ [CGT_HCRX_EnFPM] = {
+ .index = HCRX_EL2,
+ .value = 0,
+ .mask = HCRX_EL2_EnFPM,
+ .behaviour = BEHAVE_FORWARD_RW,
+ },
+ [CGT_HCRX_TCR2En] = {
+ .index = HCRX_EL2,
+ .value = 0,
+ .mask = HCRX_EL2_TCR2En,
+ .behaviour = BEHAVE_FORWARD_RW,
+ },
+ [CGT_HCRX_SCTLR2En] = {
+ .index = HCRX_EL2,
+ .value = 0,
+ .mask = HCRX_EL2_SCTLR2En,
+ .behaviour = BEHAVE_FORWARD_RW,
+ },
+ [CGT_CNTHCTL_EL1TVT] = {
+ .index = CNTHCTL_EL2,
+ .value = CNTHCTL_EL1TVT,
+ .mask = CNTHCTL_EL1TVT,
+ .behaviour = BEHAVE_FORWARD_RW,
+ },
+ [CGT_CNTHCTL_EL1TVCT] = {
+ .index = CNTHCTL_EL2,
+ .value = CNTHCTL_EL1TVCT,
+ .mask = CNTHCTL_EL1TVCT,
+ .behaviour = BEHAVE_FORWARD_READ,
+ },
+ [CGT_ICH_HCR_TC] = {
+ .index = ICH_HCR_EL2,
+ .value = ICH_HCR_EL2_TC,
+ .mask = ICH_HCR_EL2_TC,
+ .behaviour = BEHAVE_FORWARD_RW,
+ },
+ [CGT_ICH_HCR_TALL0] = {
+ .index = ICH_HCR_EL2,
+ .value = ICH_HCR_EL2_TALL0,
+ .mask = ICH_HCR_EL2_TALL0,
+ .behaviour = BEHAVE_FORWARD_RW,
+ },
+ [CGT_ICH_HCR_TALL1] = {
+ .index = ICH_HCR_EL2,
+ .value = ICH_HCR_EL2_TALL1,
+ .mask = ICH_HCR_EL2_TALL1,
+ .behaviour = BEHAVE_FORWARD_RW,
+ },
+ [CGT_ICH_HCR_TDIR] = {
+ .index = ICH_HCR_EL2,
+ .value = ICH_HCR_EL2_TDIR,
+ .mask = ICH_HCR_EL2_TDIR,
+ .behaviour = BEHAVE_FORWARD_RW,
+ },
+};
+
+#define MCB(id, ...) \
+ [id - __MULTIPLE_CONTROL_BITS__] = \
+ (const enum cgt_group_id[]){ \
+ __VA_ARGS__, __RESERVED__ \
+ }
+
+static const enum cgt_group_id *coarse_control_combo[] = {
+ MCB(CGT_HCR_TID2_TID4, CGT_HCR_TID2, CGT_HCR_TID4),
+ MCB(CGT_HCR_TTLB_TTLBIS, CGT_HCR_TTLB, CGT_HCR_TTLBIS),
+ MCB(CGT_HCR_TTLB_TTLBOS, CGT_HCR_TTLB, CGT_HCR_TTLBOS),
+ MCB(CGT_HCR_TVM_TRVM, CGT_HCR_TVM, CGT_HCR_TRVM),
+ MCB(CGT_HCR_TVM_TRVM_HCRX_TCR2En,
+ CGT_HCR_TVM, CGT_HCR_TRVM, CGT_HCRX_TCR2En),
+ MCB(CGT_HCR_TVM_TRVM_HCRX_SCTLR2En,
+ CGT_HCR_TVM, CGT_HCR_TRVM, CGT_HCRX_SCTLR2En),
+ MCB(CGT_HCR_TPU_TICAB, CGT_HCR_TPU, CGT_HCR_TICAB),
+ MCB(CGT_HCR_TPU_TOCU, CGT_HCR_TPU, CGT_HCR_TOCU),
+ MCB(CGT_HCR_NV1_nNV2_ENSCXT, CGT_HCR_NV1_nNV2, CGT_HCR_ENSCXT),
+ MCB(CGT_MDCR_TPM_TPMCR, CGT_MDCR_TPM, CGT_MDCR_TPMCR),
+ MCB(CGT_MDCR_TPM_HPMN, CGT_MDCR_TPM, CGT_MDCR_HPMN),
+ MCB(CGT_MDCR_TDE_TDA, CGT_MDCR_TDE, CGT_MDCR_TDA),
+ MCB(CGT_MDCR_TDE_TDOSA, CGT_MDCR_TDE, CGT_MDCR_TDOSA),
+ MCB(CGT_MDCR_TDE_TDRA, CGT_MDCR_TDE, CGT_MDCR_TDRA),
+ MCB(CGT_MDCR_TDCC_TDE_TDA, CGT_MDCR_TDCC, CGT_MDCR_TDE, CGT_MDCR_TDA),
+
+ MCB(CGT_HCR_IMO_FMO_ICH_HCR_TC, CGT_HCR_IMO, CGT_HCR_FMO, CGT_ICH_HCR_TC),
+ MCB(CGT_ICH_HCR_TC_TDIR, CGT_ICH_HCR_TC, CGT_ICH_HCR_TDIR),
+};
+
+typedef enum trap_behaviour (*complex_condition_check)(struct kvm_vcpu *);
+
+/*
+ * Warning, maximum confusion ahead.
+ *
+ * When E2H=0, CNTHCTL_EL2[1:0] are defined as EL1PCEN:EL1PCTEN
+ * When E2H=1, CNTHCTL_EL2[11:10] are defined as EL1PTEN:EL1PCTEN
+ *
+ * Note the single letter difference? Yet, the bits have the same
+ * function despite a different layout and a different name.
+ *
+ * We don't try to reconcile this mess. We just use the E2H=0 bits
+ * to generate something that is in the E2H=1 format, and live with
+ * it. You're welcome.
+ */
+static u64 get_sanitized_cnthctl(struct kvm_vcpu *vcpu)
+{
+ u64 val = __vcpu_sys_reg(vcpu, CNTHCTL_EL2);
+
+ if (!vcpu_el2_e2h_is_set(vcpu))
+ val = (val & (CNTHCTL_EL1PCEN | CNTHCTL_EL1PCTEN)) << 10;
+
+ return val & ((CNTHCTL_EL1PCEN | CNTHCTL_EL1PCTEN) << 10);
+}
+
+static enum trap_behaviour check_cnthctl_el1pcten(struct kvm_vcpu *vcpu)
+{
+ if (get_sanitized_cnthctl(vcpu) & (CNTHCTL_EL1PCTEN << 10))
+ return BEHAVE_HANDLE_LOCALLY;
+
+ return BEHAVE_FORWARD_RW;
+}
+
+static enum trap_behaviour check_cnthctl_el1pten(struct kvm_vcpu *vcpu)
+{
+ if (get_sanitized_cnthctl(vcpu) & (CNTHCTL_EL1PCEN << 10))
+ return BEHAVE_HANDLE_LOCALLY;
+
+ return BEHAVE_FORWARD_RW;
+}
+
+static bool is_nested_nv2_guest(struct kvm_vcpu *vcpu)
+{
+ u64 val;
+
+ val = __vcpu_sys_reg(vcpu, HCR_EL2);
+ return ((val & (HCR_E2H | HCR_TGE | HCR_NV2 | HCR_NV1 | HCR_NV)) == (HCR_E2H | HCR_NV2 | HCR_NV));
+}
+
+static enum trap_behaviour check_cnthctl_el1nvpct(struct kvm_vcpu *vcpu)
+{
+ if (!is_nested_nv2_guest(vcpu) ||
+ !(__vcpu_sys_reg(vcpu, CNTHCTL_EL2) & CNTHCTL_EL1NVPCT))
+ return BEHAVE_HANDLE_LOCALLY;
+
+ return BEHAVE_FORWARD_RW;
+}
+
+static enum trap_behaviour check_cnthctl_el1nvvct(struct kvm_vcpu *vcpu)
+{
+ if (!is_nested_nv2_guest(vcpu) ||
+ !(__vcpu_sys_reg(vcpu, CNTHCTL_EL2) & CNTHCTL_EL1NVVCT))
+ return BEHAVE_HANDLE_LOCALLY;
+
+ return BEHAVE_FORWARD_RW;
+}
+
+static enum trap_behaviour check_cptr_tta(struct kvm_vcpu *vcpu)
+{
+ u64 val = __vcpu_sys_reg(vcpu, CPTR_EL2);
+
+ if (!vcpu_el2_e2h_is_set(vcpu))
+ val = translate_cptr_el2_to_cpacr_el1(val);
+
+ if (val & CPACR_EL1_TTA)
+ return BEHAVE_FORWARD_RW;
+
+ return BEHAVE_HANDLE_LOCALLY;
+}
+
+static enum trap_behaviour check_mdcr_hpmn(struct kvm_vcpu *vcpu)
+{
+ u32 sysreg = esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu));
+ unsigned int idx;
+
+
+ switch (sysreg) {
+ case SYS_PMEVTYPERn_EL0(0) ... SYS_PMEVTYPERn_EL0(30):
+ case SYS_PMEVCNTRn_EL0(0) ... SYS_PMEVCNTRn_EL0(30):
+ idx = (sys_reg_CRm(sysreg) & 0x3) << 3 | sys_reg_Op2(sysreg);
+ break;
+ case SYS_PMXEVTYPER_EL0:
+ case SYS_PMXEVCNTR_EL0:
+ idx = SYS_FIELD_GET(PMSELR_EL0, SEL,
+ __vcpu_sys_reg(vcpu, PMSELR_EL0));
+ break;
+ default:
+ /* Someone used this trap helper for something else... */
+ KVM_BUG_ON(1, vcpu->kvm);
+ return BEHAVE_HANDLE_LOCALLY;
+ }
+
+ if (kvm_pmu_counter_is_hyp(vcpu, idx))
+ return BEHAVE_FORWARD_RW | BEHAVE_FORWARD_IN_HOST_EL0;
+
+ return BEHAVE_HANDLE_LOCALLY;
+}
+
+#define CCC(id, fn) \
+ [id - __COMPLEX_CONDITIONS__] = fn
+
+static const complex_condition_check ccc[] = {
+ CCC(CGT_CNTHCTL_EL1PCTEN, check_cnthctl_el1pcten),
+ CCC(CGT_CNTHCTL_EL1PTEN, check_cnthctl_el1pten),
+ CCC(CGT_CNTHCTL_EL1NVPCT, check_cnthctl_el1nvpct),
+ CCC(CGT_CNTHCTL_EL1NVVCT, check_cnthctl_el1nvvct),
+ CCC(CGT_CPTR_TTA, check_cptr_tta),
+ CCC(CGT_MDCR_HPMN, check_mdcr_hpmn),
+};
+
+/*
+ * Bit assignment for the trap controls. We use a 64bit word with the
+ * following layout for each trapped sysreg:
+ *
+ * [9:0] enum cgt_group_id (10 bits)
+ * [13:10] enum fgt_group_id (4 bits)
+ * [19:14] bit number in the FGT register (6 bits)
+ * [20] trap polarity (1 bit)
+ * [25:21] FG filter (5 bits)
+ * [35:26] Main SysReg table index (10 bits)
+ * [62:36] Unused (27 bits)
+ * [63] RES0 - Must be zero, as lost on insertion in the xarray
+ */
+#define TC_CGT_BITS 10
+#define TC_FGT_BITS 4
+#define TC_FGF_BITS 5
+#define TC_SRI_BITS 10
+
+union trap_config {
+ u64 val;
+ struct {
+ unsigned long cgt:TC_CGT_BITS; /* Coarse Grained Trap id */
+ unsigned long fgt:TC_FGT_BITS; /* Fine Grained Trap id */
+ unsigned long bit:6; /* Bit number */
+ unsigned long pol:1; /* Polarity */
+ unsigned long fgf:TC_FGF_BITS; /* Fine Grained Filter */
+ unsigned long sri:TC_SRI_BITS; /* SysReg Index */
+ unsigned long unused:27; /* Unused, should be zero */
+ unsigned long mbz:1; /* Must Be Zero */
+ };
+};
+
+struct encoding_to_trap_config {
+ const u32 encoding;
+ const u32 end;
+ const union trap_config tc;
+ const unsigned int line;
+};
+
+/*
+ * WARNING: using ranges is a treacherous endeavour, as sysregs that
+ * are part of an architectural range are not necessarily contiguous
+ * in the [Op0,Op1,CRn,CRm,Ops] space. Tread carefully.
+ */
+#define SR_RANGE_TRAP(sr_start, sr_end, trap_id) \
+ { \
+ .encoding = sr_start, \
+ .end = sr_end, \
+ .tc = { \
+ .cgt = trap_id, \
+ }, \
+ .line = __LINE__, \
+ }
+
+#define SR_TRAP(sr, trap_id) SR_RANGE_TRAP(sr, sr, trap_id)
+
+/*
+ * Map encoding to trap bits for exception reported with EC=0x18.
+ * These must only be evaluated when running a nested hypervisor, but
+ * that the current context is not a hypervisor context. When the
+ * trapped access matches one of the trap controls, the exception is
+ * re-injected in the nested hypervisor.
+ */
+static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = {
+ SR_TRAP(SYS_REVIDR_EL1, CGT_HCR_TID1),
+ SR_TRAP(SYS_AIDR_EL1, CGT_HCR_TID1),
+ SR_TRAP(SYS_SMIDR_EL1, CGT_HCR_TID1),
+ SR_TRAP(SYS_CTR_EL0, CGT_HCR_TID2),
+ SR_TRAP(SYS_CCSIDR_EL1, CGT_HCR_TID2_TID4),
+ SR_TRAP(SYS_CCSIDR2_EL1, CGT_HCR_TID2_TID4),
+ SR_TRAP(SYS_CLIDR_EL1, CGT_HCR_TID2_TID4),
+ SR_TRAP(SYS_CSSELR_EL1, CGT_HCR_TID2_TID4),
+ SR_RANGE_TRAP(SYS_ID_PFR0_EL1,
+ sys_reg(3, 0, 0, 7, 7), CGT_HCR_TID3),
+ SR_TRAP(SYS_ICC_SGI0R_EL1, CGT_HCR_IMO_FMO_ICH_HCR_TC),
+ SR_TRAP(SYS_ICC_ASGI1R_EL1, CGT_HCR_IMO_FMO_ICH_HCR_TC),
+ SR_TRAP(SYS_ICC_SGI1R_EL1, CGT_HCR_IMO_FMO_ICH_HCR_TC),
+ SR_RANGE_TRAP(sys_reg(3, 0, 11, 0, 0),
+ sys_reg(3, 0, 11, 15, 7), CGT_HCR_TIDCP),
+ SR_RANGE_TRAP(sys_reg(3, 1, 11, 0, 0),
+ sys_reg(3, 1, 11, 15, 7), CGT_HCR_TIDCP),
+ SR_RANGE_TRAP(sys_reg(3, 2, 11, 0, 0),
+ sys_reg(3, 2, 11, 15, 7), CGT_HCR_TIDCP),
+ SR_RANGE_TRAP(sys_reg(3, 3, 11, 0, 0),
+ sys_reg(3, 3, 11, 15, 7), CGT_HCR_TIDCP),
+ SR_RANGE_TRAP(sys_reg(3, 4, 11, 0, 0),
+ sys_reg(3, 4, 11, 15, 7), CGT_HCR_TIDCP),
+ SR_RANGE_TRAP(sys_reg(3, 5, 11, 0, 0),
+ sys_reg(3, 5, 11, 15, 7), CGT_HCR_TIDCP),
+ SR_RANGE_TRAP(sys_reg(3, 6, 11, 0, 0),
+ sys_reg(3, 6, 11, 15, 7), CGT_HCR_TIDCP),
+ SR_RANGE_TRAP(sys_reg(3, 7, 11, 0, 0),
+ sys_reg(3, 7, 11, 15, 7), CGT_HCR_TIDCP),
+ SR_RANGE_TRAP(sys_reg(3, 0, 15, 0, 0),
+ sys_reg(3, 0, 15, 15, 7), CGT_HCR_TIDCP),
+ SR_RANGE_TRAP(sys_reg(3, 1, 15, 0, 0),
+ sys_reg(3, 1, 15, 15, 7), CGT_HCR_TIDCP),
+ SR_RANGE_TRAP(sys_reg(3, 2, 15, 0, 0),
+ sys_reg(3, 2, 15, 15, 7), CGT_HCR_TIDCP),
+ SR_RANGE_TRAP(sys_reg(3, 3, 15, 0, 0),
+ sys_reg(3, 3, 15, 15, 7), CGT_HCR_TIDCP),
+ SR_RANGE_TRAP(sys_reg(3, 4, 15, 0, 0),
+ sys_reg(3, 4, 15, 15, 7), CGT_HCR_TIDCP),
+ SR_RANGE_TRAP(sys_reg(3, 5, 15, 0, 0),
+ sys_reg(3, 5, 15, 15, 7), CGT_HCR_TIDCP),
+ SR_RANGE_TRAP(sys_reg(3, 6, 15, 0, 0),
+ sys_reg(3, 6, 15, 15, 7), CGT_HCR_TIDCP),
+ SR_RANGE_TRAP(sys_reg(3, 7, 15, 0, 0),
+ sys_reg(3, 7, 15, 15, 7), CGT_HCR_TIDCP),
+ SR_TRAP(SYS_ACTLR_EL1, CGT_HCR_TACR),
+ SR_TRAP(SYS_DC_ISW, CGT_HCR_TSW),
+ SR_TRAP(SYS_DC_CSW, CGT_HCR_TSW),
+ SR_TRAP(SYS_DC_CISW, CGT_HCR_TSW),
+ SR_TRAP(SYS_DC_IGSW, CGT_HCR_TSW),
+ SR_TRAP(SYS_DC_IGDSW, CGT_HCR_TSW),
+ SR_TRAP(SYS_DC_CGSW, CGT_HCR_TSW),
+ SR_TRAP(SYS_DC_CGDSW, CGT_HCR_TSW),
+ SR_TRAP(SYS_DC_CIGSW, CGT_HCR_TSW),
+ SR_TRAP(SYS_DC_CIGDSW, CGT_HCR_TSW),
+ SR_TRAP(SYS_DC_CIVAC, CGT_HCR_TPC),
+ SR_TRAP(SYS_DC_CVAC, CGT_HCR_TPC),
+ SR_TRAP(SYS_DC_CVAP, CGT_HCR_TPC),
+ SR_TRAP(SYS_DC_CVADP, CGT_HCR_TPC),
+ SR_TRAP(SYS_DC_IVAC, CGT_HCR_TPC),
+ SR_TRAP(SYS_DC_CIGVAC, CGT_HCR_TPC),
+ SR_TRAP(SYS_DC_CIGDVAC, CGT_HCR_TPC),
+ SR_TRAP(SYS_DC_IGVAC, CGT_HCR_TPC),
+ SR_TRAP(SYS_DC_IGDVAC, CGT_HCR_TPC),
+ SR_TRAP(SYS_DC_CGVAC, CGT_HCR_TPC),
+ SR_TRAP(SYS_DC_CGDVAC, CGT_HCR_TPC),
+ SR_TRAP(SYS_DC_CGVAP, CGT_HCR_TPC),
+ SR_TRAP(SYS_DC_CGDVAP, CGT_HCR_TPC),
+ SR_TRAP(SYS_DC_CGVADP, CGT_HCR_TPC),
+ SR_TRAP(SYS_DC_CGDVADP, CGT_HCR_TPC),
+ SR_TRAP(SYS_IC_IVAU, CGT_HCR_TPU_TOCU),
+ SR_TRAP(SYS_IC_IALLU, CGT_HCR_TPU_TOCU),
+ SR_TRAP(SYS_IC_IALLUIS, CGT_HCR_TPU_TICAB),
+ SR_TRAP(SYS_DC_CVAU, CGT_HCR_TPU_TOCU),
+ SR_TRAP(OP_TLBI_RVAE1, CGT_HCR_TTLB),
+ SR_TRAP(OP_TLBI_RVAAE1, CGT_HCR_TTLB),
+ SR_TRAP(OP_TLBI_RVALE1, CGT_HCR_TTLB),
+ SR_TRAP(OP_TLBI_RVAALE1, CGT_HCR_TTLB),
+ SR_TRAP(OP_TLBI_VMALLE1, CGT_HCR_TTLB),
+ SR_TRAP(OP_TLBI_VAE1, CGT_HCR_TTLB),
+ SR_TRAP(OP_TLBI_ASIDE1, CGT_HCR_TTLB),
+ SR_TRAP(OP_TLBI_VAAE1, CGT_HCR_TTLB),
+ SR_TRAP(OP_TLBI_VALE1, CGT_HCR_TTLB),
+ SR_TRAP(OP_TLBI_VAALE1, CGT_HCR_TTLB),
+ SR_TRAP(OP_TLBI_RVAE1NXS, CGT_HCR_TTLB),
+ SR_TRAP(OP_TLBI_RVAAE1NXS, CGT_HCR_TTLB),
+ SR_TRAP(OP_TLBI_RVALE1NXS, CGT_HCR_TTLB),
+ SR_TRAP(OP_TLBI_RVAALE1NXS, CGT_HCR_TTLB),
+ SR_TRAP(OP_TLBI_VMALLE1NXS, CGT_HCR_TTLB),
+ SR_TRAP(OP_TLBI_VAE1NXS, CGT_HCR_TTLB),
+ SR_TRAP(OP_TLBI_ASIDE1NXS, CGT_HCR_TTLB),
+ SR_TRAP(OP_TLBI_VAAE1NXS, CGT_HCR_TTLB),
+ SR_TRAP(OP_TLBI_VALE1NXS, CGT_HCR_TTLB),
+ SR_TRAP(OP_TLBI_VAALE1NXS, CGT_HCR_TTLB),
+ SR_TRAP(OP_TLBI_RVAE1IS, CGT_HCR_TTLB_TTLBIS),
+ SR_TRAP(OP_TLBI_RVAAE1IS, CGT_HCR_TTLB_TTLBIS),
+ SR_TRAP(OP_TLBI_RVALE1IS, CGT_HCR_TTLB_TTLBIS),
+ SR_TRAP(OP_TLBI_RVAALE1IS, CGT_HCR_TTLB_TTLBIS),
+ SR_TRAP(OP_TLBI_VMALLE1IS, CGT_HCR_TTLB_TTLBIS),
+ SR_TRAP(OP_TLBI_VAE1IS, CGT_HCR_TTLB_TTLBIS),
+ SR_TRAP(OP_TLBI_ASIDE1IS, CGT_HCR_TTLB_TTLBIS),
+ SR_TRAP(OP_TLBI_VAAE1IS, CGT_HCR_TTLB_TTLBIS),
+ SR_TRAP(OP_TLBI_VALE1IS, CGT_HCR_TTLB_TTLBIS),
+ SR_TRAP(OP_TLBI_VAALE1IS, CGT_HCR_TTLB_TTLBIS),
+ SR_TRAP(OP_TLBI_RVAE1ISNXS, CGT_HCR_TTLB_TTLBIS),
+ SR_TRAP(OP_TLBI_RVAAE1ISNXS, CGT_HCR_TTLB_TTLBIS),
+ SR_TRAP(OP_TLBI_RVALE1ISNXS, CGT_HCR_TTLB_TTLBIS),
+ SR_TRAP(OP_TLBI_RVAALE1ISNXS, CGT_HCR_TTLB_TTLBIS),
+ SR_TRAP(OP_TLBI_VMALLE1ISNXS, CGT_HCR_TTLB_TTLBIS),
+ SR_TRAP(OP_TLBI_VAE1ISNXS, CGT_HCR_TTLB_TTLBIS),
+ SR_TRAP(OP_TLBI_ASIDE1ISNXS, CGT_HCR_TTLB_TTLBIS),
+ SR_TRAP(OP_TLBI_VAAE1ISNXS, CGT_HCR_TTLB_TTLBIS),
+ SR_TRAP(OP_TLBI_VALE1ISNXS, CGT_HCR_TTLB_TTLBIS),
+ SR_TRAP(OP_TLBI_VAALE1ISNXS, CGT_HCR_TTLB_TTLBIS),
+ SR_TRAP(OP_TLBI_VMALLE1OS, CGT_HCR_TTLB_TTLBOS),
+ SR_TRAP(OP_TLBI_VAE1OS, CGT_HCR_TTLB_TTLBOS),
+ SR_TRAP(OP_TLBI_ASIDE1OS, CGT_HCR_TTLB_TTLBOS),
+ SR_TRAP(OP_TLBI_VAAE1OS, CGT_HCR_TTLB_TTLBOS),
+ SR_TRAP(OP_TLBI_VALE1OS, CGT_HCR_TTLB_TTLBOS),
+ SR_TRAP(OP_TLBI_VAALE1OS, CGT_HCR_TTLB_TTLBOS),
+ SR_TRAP(OP_TLBI_RVAE1OS, CGT_HCR_TTLB_TTLBOS),
+ SR_TRAP(OP_TLBI_RVAAE1OS, CGT_HCR_TTLB_TTLBOS),
+ SR_TRAP(OP_TLBI_RVALE1OS, CGT_HCR_TTLB_TTLBOS),
+ SR_TRAP(OP_TLBI_RVAALE1OS, CGT_HCR_TTLB_TTLBOS),
+ SR_TRAP(OP_TLBI_VMALLE1OSNXS, CGT_HCR_TTLB_TTLBOS),
+ SR_TRAP(OP_TLBI_VAE1OSNXS, CGT_HCR_TTLB_TTLBOS),
+ SR_TRAP(OP_TLBI_ASIDE1OSNXS, CGT_HCR_TTLB_TTLBOS),
+ SR_TRAP(OP_TLBI_VAAE1OSNXS, CGT_HCR_TTLB_TTLBOS),
+ SR_TRAP(OP_TLBI_VALE1OSNXS, CGT_HCR_TTLB_TTLBOS),
+ SR_TRAP(OP_TLBI_VAALE1OSNXS, CGT_HCR_TTLB_TTLBOS),
+ SR_TRAP(OP_TLBI_RVAE1OSNXS, CGT_HCR_TTLB_TTLBOS),
+ SR_TRAP(OP_TLBI_RVAAE1OSNXS, CGT_HCR_TTLB_TTLBOS),
+ SR_TRAP(OP_TLBI_RVALE1OSNXS, CGT_HCR_TTLB_TTLBOS),
+ SR_TRAP(OP_TLBI_RVAALE1OSNXS, CGT_HCR_TTLB_TTLBOS),
+ SR_TRAP(SYS_SCTLR_EL1, CGT_HCR_TVM_TRVM),
+ SR_TRAP(SYS_SCTLR2_EL1, CGT_HCR_TVM_TRVM_HCRX_SCTLR2En),
+ SR_TRAP(SYS_TTBR0_EL1, CGT_HCR_TVM_TRVM),
+ SR_TRAP(SYS_TTBR1_EL1, CGT_HCR_TVM_TRVM),
+ SR_TRAP(SYS_TCR_EL1, CGT_HCR_TVM_TRVM),
+ SR_TRAP(SYS_ESR_EL1, CGT_HCR_TVM_TRVM),
+ SR_TRAP(SYS_FAR_EL1, CGT_HCR_TVM_TRVM),
+ SR_TRAP(SYS_AFSR0_EL1, CGT_HCR_TVM_TRVM),
+ SR_TRAP(SYS_AFSR1_EL1, CGT_HCR_TVM_TRVM),
+ SR_TRAP(SYS_MAIR_EL1, CGT_HCR_TVM_TRVM),
+ SR_TRAP(SYS_AMAIR_EL1, CGT_HCR_TVM_TRVM),
+ SR_TRAP(SYS_CONTEXTIDR_EL1, CGT_HCR_TVM_TRVM),
+ SR_TRAP(SYS_PIR_EL1, CGT_HCR_TVM_TRVM),
+ SR_TRAP(SYS_PIRE0_EL1, CGT_HCR_TVM_TRVM),
+ SR_TRAP(SYS_POR_EL0, CGT_HCR_TVM_TRVM),
+ SR_TRAP(SYS_POR_EL1, CGT_HCR_TVM_TRVM),
+ SR_TRAP(SYS_TCR2_EL1, CGT_HCR_TVM_TRVM_HCRX_TCR2En),
+ SR_TRAP(SYS_DC_ZVA, CGT_HCR_TDZ),
+ SR_TRAP(SYS_DC_GVA, CGT_HCR_TDZ),
+ SR_TRAP(SYS_DC_GZVA, CGT_HCR_TDZ),
+ SR_TRAP(SYS_LORSA_EL1, CGT_HCR_TLOR),
+ SR_TRAP(SYS_LOREA_EL1, CGT_HCR_TLOR),
+ SR_TRAP(SYS_LORN_EL1, CGT_HCR_TLOR),
+ SR_TRAP(SYS_LORC_EL1, CGT_HCR_TLOR),
+ SR_TRAP(SYS_LORID_EL1, CGT_HCR_TLOR),
+ SR_TRAP(SYS_ERRIDR_EL1, CGT_HCR_TERR),
+ SR_TRAP(SYS_ERRSELR_EL1, CGT_HCR_TERR),
+ SR_TRAP(SYS_ERXADDR_EL1, CGT_HCR_TERR),
+ SR_TRAP(SYS_ERXCTLR_EL1, CGT_HCR_TERR),
+ SR_TRAP(SYS_ERXFR_EL1, CGT_HCR_TERR),
+ SR_TRAP(SYS_ERXMISC0_EL1, CGT_HCR_TERR),
+ SR_TRAP(SYS_ERXMISC1_EL1, CGT_HCR_TERR),
+ SR_TRAP(SYS_ERXMISC2_EL1, CGT_HCR_TERR),
+ SR_TRAP(SYS_ERXMISC3_EL1, CGT_HCR_TERR),
+ SR_TRAP(SYS_ERXSTATUS_EL1, CGT_HCR_TERR),
+ SR_TRAP(SYS_APIAKEYLO_EL1, CGT_HCR_APK),
+ SR_TRAP(SYS_APIAKEYHI_EL1, CGT_HCR_APK),
+ SR_TRAP(SYS_APIBKEYLO_EL1, CGT_HCR_APK),
+ SR_TRAP(SYS_APIBKEYHI_EL1, CGT_HCR_APK),
+ SR_TRAP(SYS_APDAKEYLO_EL1, CGT_HCR_APK),
+ SR_TRAP(SYS_APDAKEYHI_EL1, CGT_HCR_APK),
+ SR_TRAP(SYS_APDBKEYLO_EL1, CGT_HCR_APK),
+ SR_TRAP(SYS_APDBKEYHI_EL1, CGT_HCR_APK),
+ SR_TRAP(SYS_APGAKEYLO_EL1, CGT_HCR_APK),
+ SR_TRAP(SYS_APGAKEYHI_EL1, CGT_HCR_APK),
+ /* All _EL2 registers */
+ SR_TRAP(SYS_BRBCR_EL2, CGT_HCR_NV),
+ SR_TRAP(SYS_VPIDR_EL2, CGT_HCR_NV),
+ SR_TRAP(SYS_VMPIDR_EL2, CGT_HCR_NV),
+ SR_TRAP(SYS_SCTLR_EL2, CGT_HCR_NV),
+ SR_TRAP(SYS_ACTLR_EL2, CGT_HCR_NV),
+ SR_TRAP(SYS_SCTLR2_EL2, CGT_HCR_NV),
+ SR_RANGE_TRAP(SYS_HCR_EL2,
+ SYS_HCRX_EL2, CGT_HCR_NV),
+ SR_TRAP(SYS_SMPRIMAP_EL2, CGT_HCR_NV),
+ SR_TRAP(SYS_SMCR_EL2, CGT_HCR_NV),
+ SR_RANGE_TRAP(SYS_TTBR0_EL2,
+ SYS_TCR2_EL2, CGT_HCR_NV),
+ SR_TRAP(SYS_VTTBR_EL2, CGT_HCR_NV),
+ SR_TRAP(SYS_VTCR_EL2, CGT_HCR_NV),
+ SR_TRAP(SYS_VNCR_EL2, CGT_HCR_NV),
+ SR_RANGE_TRAP(SYS_HDFGRTR_EL2,
+ SYS_HAFGRTR_EL2, CGT_HCR_NV),
+ /* Skip the SP_EL1 encoding... */
+ SR_TRAP(SYS_SPSR_EL2, CGT_HCR_NV),
+ SR_TRAP(SYS_ELR_EL2, CGT_HCR_NV),
+ /* Skip SPSR_irq, SPSR_abt, SPSR_und, SPSR_fiq */
+ SR_TRAP(SYS_AFSR0_EL2, CGT_HCR_NV),
+ SR_TRAP(SYS_AFSR1_EL2, CGT_HCR_NV),
+ SR_TRAP(SYS_ESR_EL2, CGT_HCR_NV),
+ SR_TRAP(SYS_VSESR_EL2, CGT_HCR_NV),
+ SR_TRAP(SYS_TFSR_EL2, CGT_HCR_NV),
+ SR_TRAP(SYS_FAR_EL2, CGT_HCR_NV),
+ SR_TRAP(SYS_HPFAR_EL2, CGT_HCR_NV),
+ SR_TRAP(SYS_PMSCR_EL2, CGT_HCR_NV),
+ SR_TRAP(SYS_MAIR_EL2, CGT_HCR_NV),
+ SR_TRAP(SYS_AMAIR_EL2, CGT_HCR_NV),
+ SR_TRAP(SYS_MPAMHCR_EL2, CGT_HCR_NV),
+ SR_TRAP(SYS_MPAMVPMV_EL2, CGT_HCR_NV),
+ SR_TRAP(SYS_MPAM2_EL2, CGT_HCR_NV),
+ SR_RANGE_TRAP(SYS_MPAMVPM0_EL2,
+ SYS_MPAMVPM7_EL2, CGT_HCR_NV),
+ /*
+ * Note that the spec. describes a group of MEC registers
+ * whose access should not trap, therefore skip the following:
+ * MECID_A0_EL2, MECID_A1_EL2, MECID_P0_EL2,
+ * MECID_P1_EL2, MECIDR_EL2, VMECID_A_EL2,
+ * VMECID_P_EL2.
+ */
+ SR_RANGE_TRAP(SYS_VBAR_EL2,
+ SYS_RMR_EL2, CGT_HCR_NV),
+ SR_TRAP(SYS_VDISR_EL2, CGT_HCR_NV),
+ /* ICH_AP0R<m>_EL2 */
+ SR_RANGE_TRAP(SYS_ICH_AP0R0_EL2,
+ SYS_ICH_AP0R3_EL2, CGT_HCR_NV),
+ /* ICH_AP1R<m>_EL2 */
+ SR_RANGE_TRAP(SYS_ICH_AP1R0_EL2,
+ SYS_ICH_AP1R3_EL2, CGT_HCR_NV),
+ SR_TRAP(SYS_ICC_SRE_EL2, CGT_HCR_NV),
+ SR_RANGE_TRAP(SYS_ICH_HCR_EL2,
+ SYS_ICH_EISR_EL2, CGT_HCR_NV),
+ SR_TRAP(SYS_ICH_ELRSR_EL2, CGT_HCR_NV),
+ SR_TRAP(SYS_ICH_VMCR_EL2, CGT_HCR_NV),
+ /* ICH_LR<m>_EL2 */
+ SR_RANGE_TRAP(SYS_ICH_LR0_EL2,
+ SYS_ICH_LR15_EL2, CGT_HCR_NV),
+ SR_TRAP(SYS_CONTEXTIDR_EL2, CGT_HCR_NV),
+ SR_TRAP(SYS_TPIDR_EL2, CGT_HCR_NV),
+ SR_TRAP(SYS_SCXTNUM_EL2, CGT_HCR_NV),
+ /* AMEVCNTVOFF0<n>_EL2, AMEVCNTVOFF1<n>_EL2 */
+ SR_RANGE_TRAP(SYS_AMEVCNTVOFF0n_EL2(0),
+ SYS_AMEVCNTVOFF1n_EL2(15), CGT_HCR_NV),
+ /* CNT*_EL2 */
+ SR_TRAP(SYS_CNTVOFF_EL2, CGT_HCR_NV),
+ SR_TRAP(SYS_CNTPOFF_EL2, CGT_HCR_NV),
+ SR_TRAP(SYS_CNTHCTL_EL2, CGT_HCR_NV),
+ SR_RANGE_TRAP(SYS_CNTHP_TVAL_EL2,
+ SYS_CNTHP_CVAL_EL2, CGT_HCR_NV),
+ SR_RANGE_TRAP(SYS_CNTHV_TVAL_EL2,
+ SYS_CNTHV_CVAL_EL2, CGT_HCR_NV),
+ /* All _EL02, _EL12 registers up to CNTKCTL_EL12*/
+ SR_RANGE_TRAP(sys_reg(3, 5, 0, 0, 0),
+ sys_reg(3, 5, 10, 15, 7), CGT_HCR_NV),
+ SR_RANGE_TRAP(sys_reg(3, 5, 12, 0, 0),
+ sys_reg(3, 5, 14, 1, 0), CGT_HCR_NV),
+ SR_TRAP(SYS_CNTP_CTL_EL02, CGT_CNTHCTL_EL1NVPCT),
+ SR_TRAP(SYS_CNTP_CVAL_EL02, CGT_CNTHCTL_EL1NVPCT),
+ SR_TRAP(SYS_CNTV_CTL_EL02, CGT_CNTHCTL_EL1NVVCT),
+ SR_TRAP(SYS_CNTV_CVAL_EL02, CGT_CNTHCTL_EL1NVVCT),
+ SR_TRAP(OP_AT_S1E2R, CGT_HCR_NV),
+ SR_TRAP(OP_AT_S1E2W, CGT_HCR_NV),
+ SR_TRAP(OP_AT_S12E1R, CGT_HCR_NV),
+ SR_TRAP(OP_AT_S12E1W, CGT_HCR_NV),
+ SR_TRAP(OP_AT_S12E0R, CGT_HCR_NV),
+ SR_TRAP(OP_AT_S12E0W, CGT_HCR_NV),
+ SR_TRAP(OP_AT_S1E2A, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_IPAS2E1, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_RIPAS2E1, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_IPAS2LE1, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_RIPAS2LE1, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_RVAE2, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_RVALE2, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_ALLE2, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_VAE2, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_ALLE1, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_VALE2, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_VMALLS12E1, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_IPAS2E1NXS, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_RIPAS2E1NXS, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_IPAS2LE1NXS, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_RIPAS2LE1NXS, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_RVAE2NXS, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_RVALE2NXS, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_ALLE2NXS, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_VAE2NXS, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_ALLE1NXS, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_VALE2NXS, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_VMALLS12E1NXS, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_IPAS2E1IS, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_RIPAS2E1IS, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_IPAS2LE1IS, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_RIPAS2LE1IS, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_RVAE2IS, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_RVALE2IS, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_ALLE2IS, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_VAE2IS, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_ALLE1IS, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_VALE2IS, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_VMALLS12E1IS, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_IPAS2E1ISNXS, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_RIPAS2E1ISNXS, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_IPAS2LE1ISNXS, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_RIPAS2LE1ISNXS, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_RVAE2ISNXS, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_RVALE2ISNXS, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_ALLE2ISNXS, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_VAE2ISNXS, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_ALLE1ISNXS, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_VALE2ISNXS, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_VMALLS12E1ISNXS,CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_ALLE2OS, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_VAE2OS, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_ALLE1OS, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_VALE2OS, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_VMALLS12E1OS, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_IPAS2E1OS, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_RIPAS2E1OS, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_IPAS2LE1OS, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_RIPAS2LE1OS, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_RVAE2OS, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_RVALE2OS, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_ALLE2OSNXS, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_VAE2OSNXS, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_ALLE1OSNXS, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_VALE2OSNXS, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_VMALLS12E1OSNXS,CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_IPAS2E1OSNXS, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_RIPAS2E1OSNXS, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_IPAS2LE1OSNXS, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_RIPAS2LE1OSNXS, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_RVAE2OSNXS, CGT_HCR_NV),
+ SR_TRAP(OP_TLBI_RVALE2OSNXS, CGT_HCR_NV),
+ SR_TRAP(OP_CPP_RCTX, CGT_HCR_NV),
+ SR_TRAP(OP_DVP_RCTX, CGT_HCR_NV),
+ SR_TRAP(OP_CFP_RCTX, CGT_HCR_NV),
+ SR_TRAP(SYS_SP_EL1, CGT_HCR_NV_nNV2),
+ SR_TRAP(SYS_VBAR_EL1, CGT_HCR_NV1_nNV2),
+ SR_TRAP(SYS_ELR_EL1, CGT_HCR_NV1_nNV2),
+ SR_TRAP(SYS_SPSR_EL1, CGT_HCR_NV1_nNV2),
+ SR_TRAP(SYS_SCXTNUM_EL1, CGT_HCR_NV1_nNV2_ENSCXT),
+ SR_TRAP(SYS_SCXTNUM_EL0, CGT_HCR_ENSCXT),
+ SR_TRAP(OP_AT_S1E1R, CGT_HCR_AT),
+ SR_TRAP(OP_AT_S1E1W, CGT_HCR_AT),
+ SR_TRAP(OP_AT_S1E0R, CGT_HCR_AT),
+ SR_TRAP(OP_AT_S1E0W, CGT_HCR_AT),
+ SR_TRAP(OP_AT_S1E1RP, CGT_HCR_AT),
+ SR_TRAP(OP_AT_S1E1WP, CGT_HCR_AT),
+ SR_TRAP(OP_AT_S1E1A, CGT_HCR_AT),
+ SR_TRAP(SYS_ERXPFGF_EL1, CGT_HCR_nFIEN),
+ SR_TRAP(SYS_ERXPFGCTL_EL1, CGT_HCR_nFIEN),
+ SR_TRAP(SYS_ERXPFGCDN_EL1, CGT_HCR_nFIEN),
+ SR_TRAP(SYS_PMCR_EL0, CGT_MDCR_TPM_TPMCR),
+ SR_TRAP(SYS_PMCNTENSET_EL0, CGT_MDCR_TPM),
+ SR_TRAP(SYS_PMCNTENCLR_EL0, CGT_MDCR_TPM),
+ SR_TRAP(SYS_PMOVSSET_EL0, CGT_MDCR_TPM),
+ SR_TRAP(SYS_PMOVSCLR_EL0, CGT_MDCR_TPM),
+ SR_TRAP(SYS_PMCEID0_EL0, CGT_MDCR_TPM),
+ SR_TRAP(SYS_PMCEID1_EL0, CGT_MDCR_TPM),
+ SR_TRAP(SYS_PMXEVTYPER_EL0, CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMSWINC_EL0, CGT_MDCR_TPM),
+ SR_TRAP(SYS_PMSELR_EL0, CGT_MDCR_TPM),
+ SR_TRAP(SYS_PMXEVCNTR_EL0, CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMCCNTR_EL0, CGT_MDCR_TPM),
+ SR_TRAP(SYS_PMUSERENR_EL0, CGT_MDCR_TPM),
+ SR_TRAP(SYS_PMINTENSET_EL1, CGT_MDCR_TPM),
+ SR_TRAP(SYS_PMINTENCLR_EL1, CGT_MDCR_TPM),
+ SR_TRAP(SYS_PMMIR_EL1, CGT_MDCR_TPM),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(0), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(1), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(2), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(3), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(4), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(5), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(6), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(7), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(8), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(9), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(10), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(11), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(12), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(13), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(14), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(15), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(16), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(17), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(18), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(19), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(20), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(21), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(22), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(23), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(24), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(25), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(26), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(27), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(28), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(29), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVCNTRn_EL0(30), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(0), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(1), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(2), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(3), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(4), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(5), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(6), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(7), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(8), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(9), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(10), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(11), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(12), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(13), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(14), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(15), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(16), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(17), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(18), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(19), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(20), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(21), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(22), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(23), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(24), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(25), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(26), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(27), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(28), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(29), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMEVTYPERn_EL0(30), CGT_MDCR_TPM_HPMN),
+ SR_TRAP(SYS_PMCCFILTR_EL0, CGT_MDCR_TPM),
+ SR_TRAP(SYS_MDCCSR_EL0, CGT_MDCR_TDCC_TDE_TDA),
+ SR_TRAP(SYS_MDCCINT_EL1, CGT_MDCR_TDCC_TDE_TDA),
+ SR_TRAP(SYS_OSDTRRX_EL1, CGT_MDCR_TDCC_TDE_TDA),
+ SR_TRAP(SYS_OSDTRTX_EL1, CGT_MDCR_TDCC_TDE_TDA),
+ SR_TRAP(SYS_DBGDTR_EL0, CGT_MDCR_TDCC_TDE_TDA),
+ /*
+ * Also covers DBGDTRRX_EL0, which has the same encoding as
+ * SYS_DBGDTRTX_EL0...
+ */
+ SR_TRAP(SYS_DBGDTRTX_EL0, CGT_MDCR_TDCC_TDE_TDA),
+ SR_TRAP(SYS_MDSCR_EL1, CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_OSECCR_EL1, CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGBVRn_EL1(0), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGBVRn_EL1(1), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGBVRn_EL1(2), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGBVRn_EL1(3), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGBVRn_EL1(4), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGBVRn_EL1(5), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGBVRn_EL1(6), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGBVRn_EL1(7), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGBVRn_EL1(8), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGBVRn_EL1(9), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGBVRn_EL1(10), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGBVRn_EL1(11), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGBVRn_EL1(12), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGBVRn_EL1(13), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGBVRn_EL1(14), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGBVRn_EL1(15), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGBCRn_EL1(0), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGBCRn_EL1(1), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGBCRn_EL1(2), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGBCRn_EL1(3), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGBCRn_EL1(4), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGBCRn_EL1(5), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGBCRn_EL1(6), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGBCRn_EL1(7), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGBCRn_EL1(8), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGBCRn_EL1(9), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGBCRn_EL1(10), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGBCRn_EL1(11), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGBCRn_EL1(12), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGBCRn_EL1(13), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGBCRn_EL1(14), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGBCRn_EL1(15), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGWVRn_EL1(0), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGWVRn_EL1(1), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGWVRn_EL1(2), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGWVRn_EL1(3), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGWVRn_EL1(4), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGWVRn_EL1(5), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGWVRn_EL1(6), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGWVRn_EL1(7), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGWVRn_EL1(8), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGWVRn_EL1(9), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGWVRn_EL1(10), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGWVRn_EL1(11), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGWVRn_EL1(12), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGWVRn_EL1(13), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGWVRn_EL1(14), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGWVRn_EL1(15), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGWCRn_EL1(0), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGWCRn_EL1(1), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGWCRn_EL1(2), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGWCRn_EL1(3), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGWCRn_EL1(4), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGWCRn_EL1(5), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGWCRn_EL1(6), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGWCRn_EL1(7), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGWCRn_EL1(8), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGWCRn_EL1(9), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGWCRn_EL1(10), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGWCRn_EL1(11), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGWCRn_EL1(12), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGWCRn_EL1(13), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGWCRn_EL1(14), CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGCLAIMSET_EL1, CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGCLAIMCLR_EL1, CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_DBGAUTHSTATUS_EL1, CGT_MDCR_TDE_TDA),
+ SR_TRAP(SYS_OSLAR_EL1, CGT_MDCR_TDE_TDOSA),
+ SR_TRAP(SYS_OSLSR_EL1, CGT_MDCR_TDE_TDOSA),
+ SR_TRAP(SYS_OSDLR_EL1, CGT_MDCR_TDE_TDOSA),
+ SR_TRAP(SYS_DBGPRCR_EL1, CGT_MDCR_TDE_TDOSA),
+ SR_TRAP(SYS_MDRAR_EL1, CGT_MDCR_TDE_TDRA),
+ SR_TRAP(SYS_PMBLIMITR_EL1, CGT_MDCR_E2PB),
+ SR_TRAP(SYS_PMBPTR_EL1, CGT_MDCR_E2PB),
+ SR_TRAP(SYS_PMBSR_EL1, CGT_MDCR_E2PB),
+ SR_TRAP(SYS_PMSCR_EL1, CGT_MDCR_TPMS),
+ SR_TRAP(SYS_PMSEVFR_EL1, CGT_MDCR_TPMS),
+ SR_TRAP(SYS_PMSFCR_EL1, CGT_MDCR_TPMS),
+ SR_TRAP(SYS_PMSICR_EL1, CGT_MDCR_TPMS),
+ SR_TRAP(SYS_PMSIDR_EL1, CGT_MDCR_TPMS),
+ SR_TRAP(SYS_PMSIRR_EL1, CGT_MDCR_TPMS),
+ SR_TRAP(SYS_PMSLATFR_EL1, CGT_MDCR_TPMS),
+ SR_TRAP(SYS_PMSNEVFR_EL1, CGT_MDCR_TPMS),
+ SR_TRAP(SYS_PMSDSFR_EL1, CGT_MDCR_TPMS),
+ SR_TRAP(SYS_TRFCR_EL1, CGT_MDCR_TTRF),
+ SR_TRAP(SYS_TRBBASER_EL1, CGT_MDCR_E2TB),
+ SR_TRAP(SYS_TRBLIMITR_EL1, CGT_MDCR_E2TB),
+ SR_TRAP(SYS_TRBMAR_EL1, CGT_MDCR_E2TB),
+ SR_TRAP(SYS_TRBPTR_EL1, CGT_MDCR_E2TB),
+ SR_TRAP(SYS_TRBSR_EL1, CGT_MDCR_E2TB),
+ SR_TRAP(SYS_TRBTRG_EL1, CGT_MDCR_E2TB),
+ SR_TRAP(SYS_CPACR_EL1, CGT_CPTR_TCPAC),
+ SR_TRAP(SYS_AMUSERENR_EL0, CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMCFGR_EL0, CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMCGCR_EL0, CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMCNTENCLR0_EL0, CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMCNTENCLR1_EL0, CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMCNTENSET0_EL0, CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMCNTENSET1_EL0, CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMCR_EL0, CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR0_EL0(0), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR0_EL0(1), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR0_EL0(2), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR0_EL0(3), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR1_EL0(0), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR1_EL0(1), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR1_EL0(2), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR1_EL0(3), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR1_EL0(4), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR1_EL0(5), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR1_EL0(6), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR1_EL0(7), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR1_EL0(8), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR1_EL0(9), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR1_EL0(10), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR1_EL0(11), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR1_EL0(12), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR1_EL0(13), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR1_EL0(14), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR1_EL0(15), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER0_EL0(0), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER0_EL0(1), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER0_EL0(2), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER0_EL0(3), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER1_EL0(0), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER1_EL0(1), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER1_EL0(2), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER1_EL0(3), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER1_EL0(4), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER1_EL0(5), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER1_EL0(6), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER1_EL0(7), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER1_EL0(8), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER1_EL0(9), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER1_EL0(10), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER1_EL0(11), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER1_EL0(12), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER1_EL0(13), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER1_EL0(14), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER1_EL0(15), CGT_CPTR_TAM),
+ /* op0=2, op1=1, and CRn<0b1000 */
+ SR_RANGE_TRAP(sys_reg(2, 1, 0, 0, 0),
+ sys_reg(2, 1, 7, 15, 7), CGT_CPTR_TTA),
+ SR_TRAP(SYS_CNTP_TVAL_EL0, CGT_CNTHCTL_EL1PTEN),
+ SR_TRAP(SYS_CNTP_CVAL_EL0, CGT_CNTHCTL_EL1PTEN),
+ SR_TRAP(SYS_CNTP_CTL_EL0, CGT_CNTHCTL_EL1PTEN),
+ SR_TRAP(SYS_CNTPCT_EL0, CGT_CNTHCTL_EL1PCTEN),
+ SR_TRAP(SYS_CNTPCTSS_EL0, CGT_CNTHCTL_EL1PCTEN),
+ SR_TRAP(SYS_CNTV_TVAL_EL0, CGT_CNTHCTL_EL1TVT),
+ SR_TRAP(SYS_CNTV_CVAL_EL0, CGT_CNTHCTL_EL1TVT),
+ SR_TRAP(SYS_CNTV_CTL_EL0, CGT_CNTHCTL_EL1TVT),
+ SR_TRAP(SYS_CNTVCT_EL0, CGT_CNTHCTL_EL1TVCT),
+ SR_TRAP(SYS_CNTVCTSS_EL0, CGT_CNTHCTL_EL1TVCT),
+ SR_TRAP(SYS_FPMR, CGT_HCRX_EnFPM),
+ /*
+ * IMPDEF choice:
+ * We treat ICC_SRE_EL2.{SRE,Enable) and ICV_SRE_EL1.SRE as
+ * RAO/WI. We therefore never consider ICC_SRE_EL2.Enable for
+ * ICC_SRE_EL1 access, and always handle it locally.
+ */
+ SR_TRAP(SYS_ICC_AP0R0_EL1, CGT_ICH_HCR_TALL0),
+ SR_TRAP(SYS_ICC_AP0R1_EL1, CGT_ICH_HCR_TALL0),
+ SR_TRAP(SYS_ICC_AP0R2_EL1, CGT_ICH_HCR_TALL0),
+ SR_TRAP(SYS_ICC_AP0R3_EL1, CGT_ICH_HCR_TALL0),
+ SR_TRAP(SYS_ICC_AP1R0_EL1, CGT_ICH_HCR_TALL1),
+ SR_TRAP(SYS_ICC_AP1R1_EL1, CGT_ICH_HCR_TALL1),
+ SR_TRAP(SYS_ICC_AP1R2_EL1, CGT_ICH_HCR_TALL1),
+ SR_TRAP(SYS_ICC_AP1R3_EL1, CGT_ICH_HCR_TALL1),
+ SR_TRAP(SYS_ICC_BPR0_EL1, CGT_ICH_HCR_TALL0),
+ SR_TRAP(SYS_ICC_BPR1_EL1, CGT_ICH_HCR_TALL1),
+ SR_TRAP(SYS_ICC_CTLR_EL1, CGT_ICH_HCR_TC),
+ SR_TRAP(SYS_ICC_DIR_EL1, CGT_ICH_HCR_TC_TDIR),
+ SR_TRAP(SYS_ICC_EOIR0_EL1, CGT_ICH_HCR_TALL0),
+ SR_TRAP(SYS_ICC_EOIR1_EL1, CGT_ICH_HCR_TALL1),
+ SR_TRAP(SYS_ICC_HPPIR0_EL1, CGT_ICH_HCR_TALL0),
+ SR_TRAP(SYS_ICC_HPPIR1_EL1, CGT_ICH_HCR_TALL1),
+ SR_TRAP(SYS_ICC_IAR0_EL1, CGT_ICH_HCR_TALL0),
+ SR_TRAP(SYS_ICC_IAR1_EL1, CGT_ICH_HCR_TALL1),
+ SR_TRAP(SYS_ICC_IGRPEN0_EL1, CGT_ICH_HCR_TALL0),
+ SR_TRAP(SYS_ICC_IGRPEN1_EL1, CGT_ICH_HCR_TALL1),
+ SR_TRAP(SYS_ICC_PMR_EL1, CGT_ICH_HCR_TC),
+ SR_TRAP(SYS_ICC_RPR_EL1, CGT_ICH_HCR_TC),
+};
+
+static DEFINE_XARRAY(sr_forward_xa);
+
+enum fg_filter_id {
+ __NO_FGF__,
+ HCRX_FGTnXS,
+
+ /* Must be last */
+ __NR_FG_FILTER_IDS__
+};
+
+#define __FGT(g, b, p, f) \
+ { \
+ .fgt = g ## _GROUP, \
+ .bit = g ## _EL2_ ## b ## _SHIFT, \
+ .pol = p, \
+ .fgf = f, \
+ }
+
+#define FGT(g, b, p) __FGT(g, b, p, __NO_FGF__)
+
+/*
+ * See the warning next to SR_RANGE_TRAP(), and apply the same
+ * level of caution.
+ */
+#define SR_FGF_RANGE(sr, e, g, b, p, f) \
+ { \
+ .encoding = sr, \
+ .end = e, \
+ .tc = __FGT(g, b, p, f), \
+ .line = __LINE__, \
+ }
+
+#define SR_FGF(sr, g, b, p, f) SR_FGF_RANGE(sr, sr, g, b, p, f)
+#define SR_FGT(sr, g, b, p) SR_FGF_RANGE(sr, sr, g, b, p, __NO_FGF__)
+#define SR_FGT_RANGE(sr, end, g, b, p) \
+ SR_FGF_RANGE(sr, end, g, b, p, __NO_FGF__)
+
+static const struct encoding_to_trap_config encoding_to_fgt[] __initconst = {
+ /* HFGRTR_EL2, HFGWTR_EL2 */
+ SR_FGT(SYS_AMAIR2_EL1, HFGRTR, nAMAIR2_EL1, 0),
+ SR_FGT(SYS_MAIR2_EL1, HFGRTR, nMAIR2_EL1, 0),
+ SR_FGT(SYS_S2POR_EL1, HFGRTR, nS2POR_EL1, 0),
+ SR_FGT(SYS_POR_EL1, HFGRTR, nPOR_EL1, 0),
+ SR_FGT(SYS_POR_EL0, HFGRTR, nPOR_EL0, 0),
+ SR_FGT(SYS_PIR_EL1, HFGRTR, nPIR_EL1, 0),
+ SR_FGT(SYS_PIRE0_EL1, HFGRTR, nPIRE0_EL1, 0),
+ SR_FGT(SYS_RCWMASK_EL1, HFGRTR, nRCWMASK_EL1, 0),
+ SR_FGT(SYS_TPIDR2_EL0, HFGRTR, nTPIDR2_EL0, 0),
+ SR_FGT(SYS_SMPRI_EL1, HFGRTR, nSMPRI_EL1, 0),
+ SR_FGT(SYS_GCSCR_EL1, HFGRTR, nGCS_EL1, 0),
+ SR_FGT(SYS_GCSPR_EL1, HFGRTR, nGCS_EL1, 0),
+ SR_FGT(SYS_GCSCRE0_EL1, HFGRTR, nGCS_EL0, 0),
+ SR_FGT(SYS_GCSPR_EL0, HFGRTR, nGCS_EL0, 0),
+ SR_FGT(SYS_ACCDATA_EL1, HFGRTR, nACCDATA_EL1, 0),
+ SR_FGT(SYS_ERXADDR_EL1, HFGRTR, ERXADDR_EL1, 1),
+ SR_FGT(SYS_ERXPFGCDN_EL1, HFGRTR, ERXPFGCDN_EL1, 1),
+ SR_FGT(SYS_ERXPFGCTL_EL1, HFGRTR, ERXPFGCTL_EL1, 1),
+ SR_FGT(SYS_ERXPFGF_EL1, HFGRTR, ERXPFGF_EL1, 1),
+ SR_FGT(SYS_ERXMISC0_EL1, HFGRTR, ERXMISCn_EL1, 1),
+ SR_FGT(SYS_ERXMISC1_EL1, HFGRTR, ERXMISCn_EL1, 1),
+ SR_FGT(SYS_ERXMISC2_EL1, HFGRTR, ERXMISCn_EL1, 1),
+ SR_FGT(SYS_ERXMISC3_EL1, HFGRTR, ERXMISCn_EL1, 1),
+ SR_FGT(SYS_ERXSTATUS_EL1, HFGRTR, ERXSTATUS_EL1, 1),
+ SR_FGT(SYS_ERXCTLR_EL1, HFGRTR, ERXCTLR_EL1, 1),
+ SR_FGT(SYS_ERXFR_EL1, HFGRTR, ERXFR_EL1, 1),
+ SR_FGT(SYS_ERRSELR_EL1, HFGRTR, ERRSELR_EL1, 1),
+ SR_FGT(SYS_ERRIDR_EL1, HFGRTR, ERRIDR_EL1, 1),
+ SR_FGT(SYS_ICC_IGRPEN0_EL1, HFGRTR, ICC_IGRPENn_EL1, 1),
+ SR_FGT(SYS_ICC_IGRPEN1_EL1, HFGRTR, ICC_IGRPENn_EL1, 1),
+ SR_FGT(SYS_VBAR_EL1, HFGRTR, VBAR_EL1, 1),
+ SR_FGT(SYS_TTBR1_EL1, HFGRTR, TTBR1_EL1, 1),
+ SR_FGT(SYS_TTBR0_EL1, HFGRTR, TTBR0_EL1, 1),
+ SR_FGT(SYS_TPIDR_EL0, HFGRTR, TPIDR_EL0, 1),
+ SR_FGT(SYS_TPIDRRO_EL0, HFGRTR, TPIDRRO_EL0, 1),
+ SR_FGT(SYS_TPIDR_EL1, HFGRTR, TPIDR_EL1, 1),
+ SR_FGT(SYS_TCR_EL1, HFGRTR, TCR_EL1, 1),
+ SR_FGT(SYS_TCR2_EL1, HFGRTR, TCR_EL1, 1),
+ SR_FGT(SYS_SCXTNUM_EL0, HFGRTR, SCXTNUM_EL0, 1),
+ SR_FGT(SYS_SCXTNUM_EL1, HFGRTR, SCXTNUM_EL1, 1),
+ SR_FGT(SYS_SCTLR_EL1, HFGRTR, SCTLR_EL1, 1),
+ SR_FGT(SYS_SCTLR2_EL1, HFGRTR, SCTLR_EL1, 1),
+ SR_FGT(SYS_REVIDR_EL1, HFGRTR, REVIDR_EL1, 1),
+ SR_FGT(SYS_PAR_EL1, HFGRTR, PAR_EL1, 1),
+ SR_FGT(SYS_MPIDR_EL1, HFGRTR, MPIDR_EL1, 1),
+ SR_FGT(SYS_MIDR_EL1, HFGRTR, MIDR_EL1, 1),
+ SR_FGT(SYS_MAIR_EL1, HFGRTR, MAIR_EL1, 1),
+ SR_FGT(SYS_LORSA_EL1, HFGRTR, LORSA_EL1, 1),
+ SR_FGT(SYS_LORN_EL1, HFGRTR, LORN_EL1, 1),
+ SR_FGT(SYS_LORID_EL1, HFGRTR, LORID_EL1, 1),
+ SR_FGT(SYS_LOREA_EL1, HFGRTR, LOREA_EL1, 1),
+ SR_FGT(SYS_LORC_EL1, HFGRTR, LORC_EL1, 1),
+ SR_FGT(SYS_ISR_EL1, HFGRTR, ISR_EL1, 1),
+ SR_FGT(SYS_FAR_EL1, HFGRTR, FAR_EL1, 1),
+ SR_FGT(SYS_ESR_EL1, HFGRTR, ESR_EL1, 1),
+ SR_FGT(SYS_DCZID_EL0, HFGRTR, DCZID_EL0, 1),
+ SR_FGT(SYS_CTR_EL0, HFGRTR, CTR_EL0, 1),
+ SR_FGT(SYS_CSSELR_EL1, HFGRTR, CSSELR_EL1, 1),
+ SR_FGT(SYS_CPACR_EL1, HFGRTR, CPACR_EL1, 1),
+ SR_FGT(SYS_CONTEXTIDR_EL1, HFGRTR, CONTEXTIDR_EL1, 1),
+ SR_FGT(SYS_CLIDR_EL1, HFGRTR, CLIDR_EL1, 1),
+ SR_FGT(SYS_CCSIDR_EL1, HFGRTR, CCSIDR_EL1, 1),
+ SR_FGT(SYS_APIBKEYLO_EL1, HFGRTR, APIBKey, 1),
+ SR_FGT(SYS_APIBKEYHI_EL1, HFGRTR, APIBKey, 1),
+ SR_FGT(SYS_APIAKEYLO_EL1, HFGRTR, APIAKey, 1),
+ SR_FGT(SYS_APIAKEYHI_EL1, HFGRTR, APIAKey, 1),
+ SR_FGT(SYS_APGAKEYLO_EL1, HFGRTR, APGAKey, 1),
+ SR_FGT(SYS_APGAKEYHI_EL1, HFGRTR, APGAKey, 1),
+ SR_FGT(SYS_APDBKEYLO_EL1, HFGRTR, APDBKey, 1),
+ SR_FGT(SYS_APDBKEYHI_EL1, HFGRTR, APDBKey, 1),
+ SR_FGT(SYS_APDAKEYLO_EL1, HFGRTR, APDAKey, 1),
+ SR_FGT(SYS_APDAKEYHI_EL1, HFGRTR, APDAKey, 1),
+ SR_FGT(SYS_AMAIR_EL1, HFGRTR, AMAIR_EL1, 1),
+ SR_FGT(SYS_AIDR_EL1, HFGRTR, AIDR_EL1, 1),
+ SR_FGT(SYS_AFSR1_EL1, HFGRTR, AFSR1_EL1, 1),
+ SR_FGT(SYS_AFSR0_EL1, HFGRTR, AFSR0_EL1, 1),
+
+ /* HFGRTR2_EL2, HFGWTR2_EL2 */
+ SR_FGT(SYS_ACTLRALIAS_EL1, HFGRTR2, nACTLRALIAS_EL1, 0),
+ SR_FGT(SYS_ACTLRMASK_EL1, HFGRTR2, nACTLRMASK_EL1, 0),
+ SR_FGT(SYS_CPACRALIAS_EL1, HFGRTR2, nCPACRALIAS_EL1, 0),
+ SR_FGT(SYS_CPACRMASK_EL1, HFGRTR2, nCPACRMASK_EL1, 0),
+ SR_FGT(SYS_PFAR_EL1, HFGRTR2, nPFAR_EL1, 0),
+ SR_FGT(SYS_RCWSMASK_EL1, HFGRTR2, nRCWSMASK_EL1, 0),
+ SR_FGT(SYS_SCTLR2ALIAS_EL1, HFGRTR2, nSCTLRALIAS2_EL1, 0),
+ SR_FGT(SYS_SCTLR2MASK_EL1, HFGRTR2, nSCTLR2MASK_EL1, 0),
+ SR_FGT(SYS_SCTLRALIAS_EL1, HFGRTR2, nSCTLRALIAS_EL1, 0),
+ SR_FGT(SYS_SCTLRMASK_EL1, HFGRTR2, nSCTLRMASK_EL1, 0),
+ SR_FGT(SYS_TCR2ALIAS_EL1, HFGRTR2, nTCR2ALIAS_EL1, 0),
+ SR_FGT(SYS_TCR2MASK_EL1, HFGRTR2, nTCR2MASK_EL1, 0),
+ SR_FGT(SYS_TCRALIAS_EL1, HFGRTR2, nTCRALIAS_EL1, 0),
+ SR_FGT(SYS_TCRMASK_EL1, HFGRTR2, nTCRMASK_EL1, 0),
+ SR_FGT(SYS_ERXGSR_EL1, HFGRTR2, nERXGSR_EL1, 0),
+
+ /* HFGITR_EL2 */
+ SR_FGT(OP_AT_S1E1A, HFGITR, ATS1E1A, 1),
+ SR_FGT(OP_COSP_RCTX, HFGITR, COSPRCTX, 1),
+ SR_FGT(OP_GCSPUSHX, HFGITR, nGCSEPP, 0),
+ SR_FGT(OP_GCSPOPX, HFGITR, nGCSEPP, 0),
+ SR_FGT(OP_GCSPUSHM, HFGITR, nGCSPUSHM_EL1, 0),
+ SR_FGT(OP_BRB_IALL, HFGITR, nBRBIALL, 0),
+ SR_FGT(OP_BRB_INJ, HFGITR, nBRBINJ, 0),
+ SR_FGT(SYS_DC_CVAC, HFGITR, DCCVAC, 1),
+ SR_FGT(SYS_DC_CGVAC, HFGITR, DCCVAC, 1),
+ SR_FGT(SYS_DC_CGDVAC, HFGITR, DCCVAC, 1),
+ SR_FGT(OP_CPP_RCTX, HFGITR, CPPRCTX, 1),
+ SR_FGT(OP_DVP_RCTX, HFGITR, DVPRCTX, 1),
+ SR_FGT(OP_CFP_RCTX, HFGITR, CFPRCTX, 1),
+ SR_FGT(OP_TLBI_VAALE1, HFGITR, TLBIVAALE1, 1),
+ SR_FGT(OP_TLBI_VALE1, HFGITR, TLBIVALE1, 1),
+ SR_FGT(OP_TLBI_VAAE1, HFGITR, TLBIVAAE1, 1),
+ SR_FGT(OP_TLBI_ASIDE1, HFGITR, TLBIASIDE1, 1),
+ SR_FGT(OP_TLBI_VAE1, HFGITR, TLBIVAE1, 1),
+ SR_FGT(OP_TLBI_VMALLE1, HFGITR, TLBIVMALLE1, 1),
+ SR_FGT(OP_TLBI_RVAALE1, HFGITR, TLBIRVAALE1, 1),
+ SR_FGT(OP_TLBI_RVALE1, HFGITR, TLBIRVALE1, 1),
+ SR_FGT(OP_TLBI_RVAAE1, HFGITR, TLBIRVAAE1, 1),
+ SR_FGT(OP_TLBI_RVAE1, HFGITR, TLBIRVAE1, 1),
+ SR_FGT(OP_TLBI_RVAALE1IS, HFGITR, TLBIRVAALE1IS, 1),
+ SR_FGT(OP_TLBI_RVALE1IS, HFGITR, TLBIRVALE1IS, 1),
+ SR_FGT(OP_TLBI_RVAAE1IS, HFGITR, TLBIRVAAE1IS, 1),
+ SR_FGT(OP_TLBI_RVAE1IS, HFGITR, TLBIRVAE1IS, 1),
+ SR_FGT(OP_TLBI_VAALE1IS, HFGITR, TLBIVAALE1IS, 1),
+ SR_FGT(OP_TLBI_VALE1IS, HFGITR, TLBIVALE1IS, 1),
+ SR_FGT(OP_TLBI_VAAE1IS, HFGITR, TLBIVAAE1IS, 1),
+ SR_FGT(OP_TLBI_ASIDE1IS, HFGITR, TLBIASIDE1IS, 1),
+ SR_FGT(OP_TLBI_VAE1IS, HFGITR, TLBIVAE1IS, 1),
+ SR_FGT(OP_TLBI_VMALLE1IS, HFGITR, TLBIVMALLE1IS, 1),
+ SR_FGT(OP_TLBI_RVAALE1OS, HFGITR, TLBIRVAALE1OS, 1),
+ SR_FGT(OP_TLBI_RVALE1OS, HFGITR, TLBIRVALE1OS, 1),
+ SR_FGT(OP_TLBI_RVAAE1OS, HFGITR, TLBIRVAAE1OS, 1),
+ SR_FGT(OP_TLBI_RVAE1OS, HFGITR, TLBIRVAE1OS, 1),
+ SR_FGT(OP_TLBI_VAALE1OS, HFGITR, TLBIVAALE1OS, 1),
+ SR_FGT(OP_TLBI_VALE1OS, HFGITR, TLBIVALE1OS, 1),
+ SR_FGT(OP_TLBI_VAAE1OS, HFGITR, TLBIVAAE1OS, 1),
+ SR_FGT(OP_TLBI_ASIDE1OS, HFGITR, TLBIASIDE1OS, 1),
+ SR_FGT(OP_TLBI_VAE1OS, HFGITR, TLBIVAE1OS, 1),
+ SR_FGT(OP_TLBI_VMALLE1OS, HFGITR, TLBIVMALLE1OS, 1),
+ /* nXS variants must be checked against HCRX_EL2.FGTnXS */
+ SR_FGF(OP_TLBI_VAALE1NXS, HFGITR, TLBIVAALE1, 1, HCRX_FGTnXS),
+ SR_FGF(OP_TLBI_VALE1NXS, HFGITR, TLBIVALE1, 1, HCRX_FGTnXS),
+ SR_FGF(OP_TLBI_VAAE1NXS, HFGITR, TLBIVAAE1, 1, HCRX_FGTnXS),
+ SR_FGF(OP_TLBI_ASIDE1NXS, HFGITR, TLBIASIDE1, 1, HCRX_FGTnXS),
+ SR_FGF(OP_TLBI_VAE1NXS, HFGITR, TLBIVAE1, 1, HCRX_FGTnXS),
+ SR_FGF(OP_TLBI_VMALLE1NXS, HFGITR, TLBIVMALLE1, 1, HCRX_FGTnXS),
+ SR_FGF(OP_TLBI_RVAALE1NXS, HFGITR, TLBIRVAALE1, 1, HCRX_FGTnXS),
+ SR_FGF(OP_TLBI_RVALE1NXS, HFGITR, TLBIRVALE1, 1, HCRX_FGTnXS),
+ SR_FGF(OP_TLBI_RVAAE1NXS, HFGITR, TLBIRVAAE1, 1, HCRX_FGTnXS),
+ SR_FGF(OP_TLBI_RVAE1NXS, HFGITR, TLBIRVAE1, 1, HCRX_FGTnXS),
+ SR_FGF(OP_TLBI_RVAALE1ISNXS, HFGITR, TLBIRVAALE1IS, 1, HCRX_FGTnXS),
+ SR_FGF(OP_TLBI_RVALE1ISNXS, HFGITR, TLBIRVALE1IS, 1, HCRX_FGTnXS),
+ SR_FGF(OP_TLBI_RVAAE1ISNXS, HFGITR, TLBIRVAAE1IS, 1, HCRX_FGTnXS),
+ SR_FGF(OP_TLBI_RVAE1ISNXS, HFGITR, TLBIRVAE1IS, 1, HCRX_FGTnXS),
+ SR_FGF(OP_TLBI_VAALE1ISNXS, HFGITR, TLBIVAALE1IS, 1, HCRX_FGTnXS),
+ SR_FGF(OP_TLBI_VALE1ISNXS, HFGITR, TLBIVALE1IS, 1, HCRX_FGTnXS),
+ SR_FGF(OP_TLBI_VAAE1ISNXS, HFGITR, TLBIVAAE1IS, 1, HCRX_FGTnXS),
+ SR_FGF(OP_TLBI_ASIDE1ISNXS, HFGITR, TLBIASIDE1IS, 1, HCRX_FGTnXS),
+ SR_FGF(OP_TLBI_VAE1ISNXS, HFGITR, TLBIVAE1IS, 1, HCRX_FGTnXS),
+ SR_FGF(OP_TLBI_VMALLE1ISNXS, HFGITR, TLBIVMALLE1IS, 1, HCRX_FGTnXS),
+ SR_FGF(OP_TLBI_RVAALE1OSNXS, HFGITR, TLBIRVAALE1OS, 1, HCRX_FGTnXS),
+ SR_FGF(OP_TLBI_RVALE1OSNXS, HFGITR, TLBIRVALE1OS, 1, HCRX_FGTnXS),
+ SR_FGF(OP_TLBI_RVAAE1OSNXS, HFGITR, TLBIRVAAE1OS, 1, HCRX_FGTnXS),
+ SR_FGF(OP_TLBI_RVAE1OSNXS, HFGITR, TLBIRVAE1OS, 1, HCRX_FGTnXS),
+ SR_FGF(OP_TLBI_VAALE1OSNXS, HFGITR, TLBIVAALE1OS, 1, HCRX_FGTnXS),
+ SR_FGF(OP_TLBI_VALE1OSNXS, HFGITR, TLBIVALE1OS, 1, HCRX_FGTnXS),
+ SR_FGF(OP_TLBI_VAAE1OSNXS, HFGITR, TLBIVAAE1OS, 1, HCRX_FGTnXS),
+ SR_FGF(OP_TLBI_ASIDE1OSNXS, HFGITR, TLBIASIDE1OS, 1, HCRX_FGTnXS),
+ SR_FGF(OP_TLBI_VAE1OSNXS, HFGITR, TLBIVAE1OS, 1, HCRX_FGTnXS),
+ SR_FGF(OP_TLBI_VMALLE1OSNXS, HFGITR, TLBIVMALLE1OS, 1, HCRX_FGTnXS),
+ SR_FGT(OP_AT_S1E1WP, HFGITR, ATS1E1WP, 1),
+ SR_FGT(OP_AT_S1E1RP, HFGITR, ATS1E1RP, 1),
+ SR_FGT(OP_AT_S1E0W, HFGITR, ATS1E0W, 1),
+ SR_FGT(OP_AT_S1E0R, HFGITR, ATS1E0R, 1),
+ SR_FGT(OP_AT_S1E1W, HFGITR, ATS1E1W, 1),
+ SR_FGT(OP_AT_S1E1R, HFGITR, ATS1E1R, 1),
+ SR_FGT(SYS_DC_ZVA, HFGITR, DCZVA, 1),
+ SR_FGT(SYS_DC_GVA, HFGITR, DCZVA, 1),
+ SR_FGT(SYS_DC_GZVA, HFGITR, DCZVA, 1),
+ SR_FGT(SYS_DC_CIVAC, HFGITR, DCCIVAC, 1),
+ SR_FGT(SYS_DC_CIGVAC, HFGITR, DCCIVAC, 1),
+ SR_FGT(SYS_DC_CIGDVAC, HFGITR, DCCIVAC, 1),
+ SR_FGT(SYS_DC_CVADP, HFGITR, DCCVADP, 1),
+ SR_FGT(SYS_DC_CGVADP, HFGITR, DCCVADP, 1),
+ SR_FGT(SYS_DC_CGDVADP, HFGITR, DCCVADP, 1),
+ SR_FGT(SYS_DC_CVAP, HFGITR, DCCVAP, 1),
+ SR_FGT(SYS_DC_CGVAP, HFGITR, DCCVAP, 1),
+ SR_FGT(SYS_DC_CGDVAP, HFGITR, DCCVAP, 1),
+ SR_FGT(SYS_DC_CVAU, HFGITR, DCCVAU, 1),
+ SR_FGT(SYS_DC_CISW, HFGITR, DCCISW, 1),
+ SR_FGT(SYS_DC_CIGSW, HFGITR, DCCISW, 1),
+ SR_FGT(SYS_DC_CIGDSW, HFGITR, DCCISW, 1),
+ SR_FGT(SYS_DC_CSW, HFGITR, DCCSW, 1),
+ SR_FGT(SYS_DC_CGSW, HFGITR, DCCSW, 1),
+ SR_FGT(SYS_DC_CGDSW, HFGITR, DCCSW, 1),
+ SR_FGT(SYS_DC_ISW, HFGITR, DCISW, 1),
+ SR_FGT(SYS_DC_IGSW, HFGITR, DCISW, 1),
+ SR_FGT(SYS_DC_IGDSW, HFGITR, DCISW, 1),
+ SR_FGT(SYS_DC_IVAC, HFGITR, DCIVAC, 1),
+ SR_FGT(SYS_DC_IGVAC, HFGITR, DCIVAC, 1),
+ SR_FGT(SYS_DC_IGDVAC, HFGITR, DCIVAC, 1),
+ SR_FGT(SYS_IC_IVAU, HFGITR, ICIVAU, 1),
+ SR_FGT(SYS_IC_IALLU, HFGITR, ICIALLU, 1),
+ SR_FGT(SYS_IC_IALLUIS, HFGITR, ICIALLUIS, 1),
+
+ /* HFGITR2_EL2 */
+ SR_FGT(SYS_DC_CIGDVAPS, HFGITR2, nDCCIVAPS, 0),
+ SR_FGT(SYS_DC_CIVAPS, HFGITR2, nDCCIVAPS, 0),
+
+ /* HDFGRTR_EL2 */
+ SR_FGT(SYS_PMBIDR_EL1, HDFGRTR, PMBIDR_EL1, 1),
+ SR_FGT(SYS_PMSNEVFR_EL1, HDFGRTR, nPMSNEVFR_EL1, 0),
+ SR_FGT(SYS_BRBINF_EL1(0), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBINF_EL1(1), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBINF_EL1(2), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBINF_EL1(3), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBINF_EL1(4), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBINF_EL1(5), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBINF_EL1(6), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBINF_EL1(7), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBINF_EL1(8), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBINF_EL1(9), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBINF_EL1(10), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBINF_EL1(11), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBINF_EL1(12), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBINF_EL1(13), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBINF_EL1(14), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBINF_EL1(15), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBINF_EL1(16), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBINF_EL1(17), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBINF_EL1(18), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBINF_EL1(19), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBINF_EL1(20), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBINF_EL1(21), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBINF_EL1(22), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBINF_EL1(23), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBINF_EL1(24), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBINF_EL1(25), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBINF_EL1(26), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBINF_EL1(27), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBINF_EL1(28), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBINF_EL1(29), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBINF_EL1(30), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBINF_EL1(31), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBINFINJ_EL1, HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBSRC_EL1(0), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBSRC_EL1(1), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBSRC_EL1(2), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBSRC_EL1(3), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBSRC_EL1(4), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBSRC_EL1(5), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBSRC_EL1(6), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBSRC_EL1(7), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBSRC_EL1(8), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBSRC_EL1(9), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBSRC_EL1(10), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBSRC_EL1(11), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBSRC_EL1(12), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBSRC_EL1(13), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBSRC_EL1(14), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBSRC_EL1(15), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBSRC_EL1(16), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBSRC_EL1(17), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBSRC_EL1(18), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBSRC_EL1(19), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBSRC_EL1(20), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBSRC_EL1(21), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBSRC_EL1(22), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBSRC_EL1(23), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBSRC_EL1(24), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBSRC_EL1(25), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBSRC_EL1(26), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBSRC_EL1(27), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBSRC_EL1(28), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBSRC_EL1(29), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBSRC_EL1(30), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBSRC_EL1(31), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBSRCINJ_EL1, HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBTGT_EL1(0), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBTGT_EL1(1), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBTGT_EL1(2), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBTGT_EL1(3), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBTGT_EL1(4), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBTGT_EL1(5), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBTGT_EL1(6), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBTGT_EL1(7), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBTGT_EL1(8), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBTGT_EL1(9), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBTGT_EL1(10), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBTGT_EL1(11), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBTGT_EL1(12), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBTGT_EL1(13), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBTGT_EL1(14), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBTGT_EL1(15), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBTGT_EL1(16), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBTGT_EL1(17), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBTGT_EL1(18), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBTGT_EL1(19), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBTGT_EL1(20), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBTGT_EL1(21), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBTGT_EL1(22), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBTGT_EL1(23), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBTGT_EL1(24), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBTGT_EL1(25), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBTGT_EL1(26), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBTGT_EL1(27), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBTGT_EL1(28), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBTGT_EL1(29), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBTGT_EL1(30), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBTGT_EL1(31), HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBTGTINJ_EL1, HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBTS_EL1, HDFGRTR, nBRBDATA, 0),
+ SR_FGT(SYS_BRBCR_EL1, HDFGRTR, nBRBCTL, 0),
+ SR_FGT(SYS_BRBFCR_EL1, HDFGRTR, nBRBCTL, 0),
+ SR_FGT(SYS_BRBIDR0_EL1, HDFGRTR, nBRBIDR, 0),
+ SR_FGT(SYS_PMCEID0_EL0, HDFGRTR, PMCEIDn_EL0, 1),
+ SR_FGT(SYS_PMCEID1_EL0, HDFGRTR, PMCEIDn_EL0, 1),
+ SR_FGT(SYS_PMUSERENR_EL0, HDFGRTR, PMUSERENR_EL0, 1),
+ SR_FGT(SYS_TRBTRG_EL1, HDFGRTR, TRBTRG_EL1, 1),
+ SR_FGT(SYS_TRBSR_EL1, HDFGRTR, TRBSR_EL1, 1),
+ SR_FGT(SYS_TRBPTR_EL1, HDFGRTR, TRBPTR_EL1, 1),
+ SR_FGT(SYS_TRBMAR_EL1, HDFGRTR, TRBMAR_EL1, 1),
+ SR_FGT(SYS_TRBLIMITR_EL1, HDFGRTR, TRBLIMITR_EL1, 1),
+ SR_FGT(SYS_TRBIDR_EL1, HDFGRTR, TRBIDR_EL1, 1),
+ SR_FGT(SYS_TRBBASER_EL1, HDFGRTR, TRBBASER_EL1, 1),
+ SR_FGT(SYS_TRCVICTLR, HDFGRTR, TRCVICTLR, 1),
+ SR_FGT(SYS_TRCSTATR, HDFGRTR, TRCSTATR, 1),
+ SR_FGT(SYS_TRCSSCSR(0), HDFGRTR, TRCSSCSRn, 1),
+ SR_FGT(SYS_TRCSSCSR(1), HDFGRTR, TRCSSCSRn, 1),
+ SR_FGT(SYS_TRCSSCSR(2), HDFGRTR, TRCSSCSRn, 1),
+ SR_FGT(SYS_TRCSSCSR(3), HDFGRTR, TRCSSCSRn, 1),
+ SR_FGT(SYS_TRCSSCSR(4), HDFGRTR, TRCSSCSRn, 1),
+ SR_FGT(SYS_TRCSSCSR(5), HDFGRTR, TRCSSCSRn, 1),
+ SR_FGT(SYS_TRCSSCSR(6), HDFGRTR, TRCSSCSRn, 1),
+ SR_FGT(SYS_TRCSSCSR(7), HDFGRTR, TRCSSCSRn, 1),
+ SR_FGT(SYS_TRCSEQSTR, HDFGRTR, TRCSEQSTR, 1),
+ SR_FGT(SYS_TRCPRGCTLR, HDFGRTR, TRCPRGCTLR, 1),
+ SR_FGT(SYS_TRCOSLSR, HDFGRTR, TRCOSLSR, 1),
+ SR_FGT(SYS_TRCIMSPEC(0), HDFGRTR, TRCIMSPECn, 1),
+ SR_FGT(SYS_TRCIMSPEC(1), HDFGRTR, TRCIMSPECn, 1),
+ SR_FGT(SYS_TRCIMSPEC(2), HDFGRTR, TRCIMSPECn, 1),
+ SR_FGT(SYS_TRCIMSPEC(3), HDFGRTR, TRCIMSPECn, 1),
+ SR_FGT(SYS_TRCIMSPEC(4), HDFGRTR, TRCIMSPECn, 1),
+ SR_FGT(SYS_TRCIMSPEC(5), HDFGRTR, TRCIMSPECn, 1),
+ SR_FGT(SYS_TRCIMSPEC(6), HDFGRTR, TRCIMSPECn, 1),
+ SR_FGT(SYS_TRCIMSPEC(7), HDFGRTR, TRCIMSPECn, 1),
+ SR_FGT(SYS_TRCDEVARCH, HDFGRTR, TRCID, 1),
+ SR_FGT(SYS_TRCDEVID, HDFGRTR, TRCID, 1),
+ SR_FGT(SYS_TRCIDR0, HDFGRTR, TRCID, 1),
+ SR_FGT(SYS_TRCIDR1, HDFGRTR, TRCID, 1),
+ SR_FGT(SYS_TRCIDR2, HDFGRTR, TRCID, 1),
+ SR_FGT(SYS_TRCIDR3, HDFGRTR, TRCID, 1),
+ SR_FGT(SYS_TRCIDR4, HDFGRTR, TRCID, 1),
+ SR_FGT(SYS_TRCIDR5, HDFGRTR, TRCID, 1),
+ SR_FGT(SYS_TRCIDR6, HDFGRTR, TRCID, 1),
+ SR_FGT(SYS_TRCIDR7, HDFGRTR, TRCID, 1),
+ SR_FGT(SYS_TRCIDR8, HDFGRTR, TRCID, 1),
+ SR_FGT(SYS_TRCIDR9, HDFGRTR, TRCID, 1),
+ SR_FGT(SYS_TRCIDR10, HDFGRTR, TRCID, 1),
+ SR_FGT(SYS_TRCIDR11, HDFGRTR, TRCID, 1),
+ SR_FGT(SYS_TRCIDR12, HDFGRTR, TRCID, 1),
+ SR_FGT(SYS_TRCIDR13, HDFGRTR, TRCID, 1),
+ SR_FGT(SYS_TRCCNTVR(0), HDFGRTR, TRCCNTVRn, 1),
+ SR_FGT(SYS_TRCCNTVR(1), HDFGRTR, TRCCNTVRn, 1),
+ SR_FGT(SYS_TRCCNTVR(2), HDFGRTR, TRCCNTVRn, 1),
+ SR_FGT(SYS_TRCCNTVR(3), HDFGRTR, TRCCNTVRn, 1),
+ SR_FGT(SYS_TRCCLAIMCLR, HDFGRTR, TRCCLAIM, 1),
+ SR_FGT(SYS_TRCCLAIMSET, HDFGRTR, TRCCLAIM, 1),
+ SR_FGT(SYS_TRCAUXCTLR, HDFGRTR, TRCAUXCTLR, 1),
+ SR_FGT(SYS_TRCAUTHSTATUS, HDFGRTR, TRCAUTHSTATUS, 1),
+ SR_FGT(SYS_TRCACATR(0), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCACATR(1), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCACATR(2), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCACATR(3), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCACATR(4), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCACATR(5), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCACATR(6), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCACATR(7), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCACATR(8), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCACATR(9), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCACATR(10), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCACATR(11), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCACATR(12), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCACATR(13), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCACATR(14), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCACATR(15), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCACVR(0), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCACVR(1), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCACVR(2), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCACVR(3), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCACVR(4), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCACVR(5), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCACVR(6), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCACVR(7), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCACVR(8), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCACVR(9), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCACVR(10), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCACVR(11), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCACVR(12), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCACVR(13), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCACVR(14), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCACVR(15), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCBBCTLR, HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCCCCTLR, HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCCIDCCTLR0, HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCCIDCCTLR1, HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCCIDCVR(0), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCCIDCVR(1), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCCIDCVR(2), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCCIDCVR(3), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCCIDCVR(4), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCCIDCVR(5), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCCIDCVR(6), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCCIDCVR(7), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCCNTCTLR(0), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCCNTCTLR(1), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCCNTCTLR(2), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCCNTCTLR(3), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCCNTRLDVR(0), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCCNTRLDVR(1), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCCNTRLDVR(2), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCCNTRLDVR(3), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCCONFIGR, HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCEVENTCTL0R, HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCEVENTCTL1R, HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCEXTINSELR(0), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCEXTINSELR(1), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCEXTINSELR(2), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCEXTINSELR(3), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCQCTLR, HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCRSCTLR(2), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCRSCTLR(3), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCRSCTLR(4), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCRSCTLR(5), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCRSCTLR(6), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCRSCTLR(7), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCRSCTLR(8), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCRSCTLR(9), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCRSCTLR(10), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCRSCTLR(11), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCRSCTLR(12), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCRSCTLR(13), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCRSCTLR(14), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCRSCTLR(15), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCRSCTLR(16), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCRSCTLR(17), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCRSCTLR(18), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCRSCTLR(19), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCRSCTLR(20), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCRSCTLR(21), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCRSCTLR(22), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCRSCTLR(23), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCRSCTLR(24), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCRSCTLR(25), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCRSCTLR(26), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCRSCTLR(27), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCRSCTLR(28), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCRSCTLR(29), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCRSCTLR(30), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCRSCTLR(31), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCRSR, HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCSEQEVR(0), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCSEQEVR(1), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCSEQEVR(2), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCSEQRSTEVR, HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCSSCCR(0), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCSSCCR(1), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCSSCCR(2), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCSSCCR(3), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCSSCCR(4), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCSSCCR(5), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCSSCCR(6), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCSSCCR(7), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCSSPCICR(0), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCSSPCICR(1), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCSSPCICR(2), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCSSPCICR(3), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCSSPCICR(4), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCSSPCICR(5), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCSSPCICR(6), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCSSPCICR(7), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCSTALLCTLR, HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCSYNCPR, HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCTRACEIDR, HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCTSCTLR, HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCVIIECTLR, HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCVIPCSSCTLR, HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCVISSCTLR, HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCVMIDCCTLR0, HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCVMIDCCTLR1, HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCVMIDCVR(0), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCVMIDCVR(1), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCVMIDCVR(2), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCVMIDCVR(3), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCVMIDCVR(4), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCVMIDCVR(5), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCVMIDCVR(6), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_TRCVMIDCVR(7), HDFGRTR, TRC, 1),
+ SR_FGT(SYS_PMSLATFR_EL1, HDFGRTR, PMSLATFR_EL1, 1),
+ SR_FGT(SYS_PMSIRR_EL1, HDFGRTR, PMSIRR_EL1, 1),
+ SR_FGT(SYS_PMSIDR_EL1, HDFGRTR, PMSIDR_EL1, 1),
+ SR_FGT(SYS_PMSICR_EL1, HDFGRTR, PMSICR_EL1, 1),
+ SR_FGT(SYS_PMSFCR_EL1, HDFGRTR, PMSFCR_EL1, 1),
+ SR_FGT(SYS_PMSEVFR_EL1, HDFGRTR, PMSEVFR_EL1, 1),
+ SR_FGT(SYS_PMSCR_EL1, HDFGRTR, PMSCR_EL1, 1),
+ SR_FGT(SYS_PMBSR_EL1, HDFGRTR, PMBSR_EL1, 1),
+ SR_FGT(SYS_PMBPTR_EL1, HDFGRTR, PMBPTR_EL1, 1),
+ SR_FGT(SYS_PMBLIMITR_EL1, HDFGRTR, PMBLIMITR_EL1, 1),
+ SR_FGT(SYS_PMMIR_EL1, HDFGRTR, PMMIR_EL1, 1),
+ SR_FGT(SYS_PMSELR_EL0, HDFGRTR, PMSELR_EL0, 1),
+ SR_FGT(SYS_PMOVSCLR_EL0, HDFGRTR, PMOVS, 1),
+ SR_FGT(SYS_PMOVSSET_EL0, HDFGRTR, PMOVS, 1),
+ SR_FGT(SYS_PMINTENCLR_EL1, HDFGRTR, PMINTEN, 1),
+ SR_FGT(SYS_PMINTENSET_EL1, HDFGRTR, PMINTEN, 1),
+ SR_FGT(SYS_PMCNTENCLR_EL0, HDFGRTR, PMCNTEN, 1),
+ SR_FGT(SYS_PMCNTENSET_EL0, HDFGRTR, PMCNTEN, 1),
+ SR_FGT(SYS_PMCCNTR_EL0, HDFGRTR, PMCCNTR_EL0, 1),
+ SR_FGT(SYS_PMCCFILTR_EL0, HDFGRTR, PMCCFILTR_EL0, 1),
+ SR_FGT_RANGE(SYS_PMEVTYPERn_EL0(0),
+ SYS_PMEVTYPERn_EL0(30),
+ HDFGRTR, PMEVTYPERn_EL0, 1),
+ SR_FGT_RANGE(SYS_PMEVCNTRn_EL0(0),
+ SYS_PMEVCNTRn_EL0(30),
+ HDFGRTR, PMEVCNTRn_EL0, 1),
+ SR_FGT(SYS_OSDLR_EL1, HDFGRTR, OSDLR_EL1, 1),
+ SR_FGT(SYS_OSECCR_EL1, HDFGRTR, OSECCR_EL1, 1),
+ SR_FGT(SYS_OSLSR_EL1, HDFGRTR, OSLSR_EL1, 1),
+ SR_FGT(SYS_DBGPRCR_EL1, HDFGRTR, DBGPRCR_EL1, 1),
+ SR_FGT(SYS_DBGAUTHSTATUS_EL1, HDFGRTR, DBGAUTHSTATUS_EL1, 1),
+ SR_FGT(SYS_DBGCLAIMSET_EL1, HDFGRTR, DBGCLAIM, 1),
+ SR_FGT(SYS_DBGCLAIMCLR_EL1, HDFGRTR, DBGCLAIM, 1),
+ SR_FGT(SYS_MDSCR_EL1, HDFGRTR, MDSCR_EL1, 1),
+ /*
+ * The trap bits capture *64* debug registers per bit, but the
+ * ARM ARM only describes the encoding for the first 16, and
+ * we don't really support more than that anyway.
+ */
+ SR_FGT(SYS_DBGWVRn_EL1(0), HDFGRTR, DBGWVRn_EL1, 1),
+ SR_FGT(SYS_DBGWVRn_EL1(1), HDFGRTR, DBGWVRn_EL1, 1),
+ SR_FGT(SYS_DBGWVRn_EL1(2), HDFGRTR, DBGWVRn_EL1, 1),
+ SR_FGT(SYS_DBGWVRn_EL1(3), HDFGRTR, DBGWVRn_EL1, 1),
+ SR_FGT(SYS_DBGWVRn_EL1(4), HDFGRTR, DBGWVRn_EL1, 1),
+ SR_FGT(SYS_DBGWVRn_EL1(5), HDFGRTR, DBGWVRn_EL1, 1),
+ SR_FGT(SYS_DBGWVRn_EL1(6), HDFGRTR, DBGWVRn_EL1, 1),
+ SR_FGT(SYS_DBGWVRn_EL1(7), HDFGRTR, DBGWVRn_EL1, 1),
+ SR_FGT(SYS_DBGWVRn_EL1(8), HDFGRTR, DBGWVRn_EL1, 1),
+ SR_FGT(SYS_DBGWVRn_EL1(9), HDFGRTR, DBGWVRn_EL1, 1),
+ SR_FGT(SYS_DBGWVRn_EL1(10), HDFGRTR, DBGWVRn_EL1, 1),
+ SR_FGT(SYS_DBGWVRn_EL1(11), HDFGRTR, DBGWVRn_EL1, 1),
+ SR_FGT(SYS_DBGWVRn_EL1(12), HDFGRTR, DBGWVRn_EL1, 1),
+ SR_FGT(SYS_DBGWVRn_EL1(13), HDFGRTR, DBGWVRn_EL1, 1),
+ SR_FGT(SYS_DBGWVRn_EL1(14), HDFGRTR, DBGWVRn_EL1, 1),
+ SR_FGT(SYS_DBGWVRn_EL1(15), HDFGRTR, DBGWVRn_EL1, 1),
+ SR_FGT(SYS_DBGWCRn_EL1(0), HDFGRTR, DBGWCRn_EL1, 1),
+ SR_FGT(SYS_DBGWCRn_EL1(1), HDFGRTR, DBGWCRn_EL1, 1),
+ SR_FGT(SYS_DBGWCRn_EL1(2), HDFGRTR, DBGWCRn_EL1, 1),
+ SR_FGT(SYS_DBGWCRn_EL1(3), HDFGRTR, DBGWCRn_EL1, 1),
+ SR_FGT(SYS_DBGWCRn_EL1(4), HDFGRTR, DBGWCRn_EL1, 1),
+ SR_FGT(SYS_DBGWCRn_EL1(5), HDFGRTR, DBGWCRn_EL1, 1),
+ SR_FGT(SYS_DBGWCRn_EL1(6), HDFGRTR, DBGWCRn_EL1, 1),
+ SR_FGT(SYS_DBGWCRn_EL1(7), HDFGRTR, DBGWCRn_EL1, 1),
+ SR_FGT(SYS_DBGWCRn_EL1(8), HDFGRTR, DBGWCRn_EL1, 1),
+ SR_FGT(SYS_DBGWCRn_EL1(9), HDFGRTR, DBGWCRn_EL1, 1),
+ SR_FGT(SYS_DBGWCRn_EL1(10), HDFGRTR, DBGWCRn_EL1, 1),
+ SR_FGT(SYS_DBGWCRn_EL1(11), HDFGRTR, DBGWCRn_EL1, 1),
+ SR_FGT(SYS_DBGWCRn_EL1(12), HDFGRTR, DBGWCRn_EL1, 1),
+ SR_FGT(SYS_DBGWCRn_EL1(13), HDFGRTR, DBGWCRn_EL1, 1),
+ SR_FGT(SYS_DBGWCRn_EL1(14), HDFGRTR, DBGWCRn_EL1, 1),
+ SR_FGT(SYS_DBGWCRn_EL1(15), HDFGRTR, DBGWCRn_EL1, 1),
+ SR_FGT(SYS_DBGBVRn_EL1(0), HDFGRTR, DBGBVRn_EL1, 1),
+ SR_FGT(SYS_DBGBVRn_EL1(1), HDFGRTR, DBGBVRn_EL1, 1),
+ SR_FGT(SYS_DBGBVRn_EL1(2), HDFGRTR, DBGBVRn_EL1, 1),
+ SR_FGT(SYS_DBGBVRn_EL1(3), HDFGRTR, DBGBVRn_EL1, 1),
+ SR_FGT(SYS_DBGBVRn_EL1(4), HDFGRTR, DBGBVRn_EL1, 1),
+ SR_FGT(SYS_DBGBVRn_EL1(5), HDFGRTR, DBGBVRn_EL1, 1),
+ SR_FGT(SYS_DBGBVRn_EL1(6), HDFGRTR, DBGBVRn_EL1, 1),
+ SR_FGT(SYS_DBGBVRn_EL1(7), HDFGRTR, DBGBVRn_EL1, 1),
+ SR_FGT(SYS_DBGBVRn_EL1(8), HDFGRTR, DBGBVRn_EL1, 1),
+ SR_FGT(SYS_DBGBVRn_EL1(9), HDFGRTR, DBGBVRn_EL1, 1),
+ SR_FGT(SYS_DBGBVRn_EL1(10), HDFGRTR, DBGBVRn_EL1, 1),
+ SR_FGT(SYS_DBGBVRn_EL1(11), HDFGRTR, DBGBVRn_EL1, 1),
+ SR_FGT(SYS_DBGBVRn_EL1(12), HDFGRTR, DBGBVRn_EL1, 1),
+ SR_FGT(SYS_DBGBVRn_EL1(13), HDFGRTR, DBGBVRn_EL1, 1),
+ SR_FGT(SYS_DBGBVRn_EL1(14), HDFGRTR, DBGBVRn_EL1, 1),
+ SR_FGT(SYS_DBGBVRn_EL1(15), HDFGRTR, DBGBVRn_EL1, 1),
+ SR_FGT(SYS_DBGBCRn_EL1(0), HDFGRTR, DBGBCRn_EL1, 1),
+ SR_FGT(SYS_DBGBCRn_EL1(1), HDFGRTR, DBGBCRn_EL1, 1),
+ SR_FGT(SYS_DBGBCRn_EL1(2), HDFGRTR, DBGBCRn_EL1, 1),
+ SR_FGT(SYS_DBGBCRn_EL1(3), HDFGRTR, DBGBCRn_EL1, 1),
+ SR_FGT(SYS_DBGBCRn_EL1(4), HDFGRTR, DBGBCRn_EL1, 1),
+ SR_FGT(SYS_DBGBCRn_EL1(5), HDFGRTR, DBGBCRn_EL1, 1),
+ SR_FGT(SYS_DBGBCRn_EL1(6), HDFGRTR, DBGBCRn_EL1, 1),
+ SR_FGT(SYS_DBGBCRn_EL1(7), HDFGRTR, DBGBCRn_EL1, 1),
+ SR_FGT(SYS_DBGBCRn_EL1(8), HDFGRTR, DBGBCRn_EL1, 1),
+ SR_FGT(SYS_DBGBCRn_EL1(9), HDFGRTR, DBGBCRn_EL1, 1),
+ SR_FGT(SYS_DBGBCRn_EL1(10), HDFGRTR, DBGBCRn_EL1, 1),
+ SR_FGT(SYS_DBGBCRn_EL1(11), HDFGRTR, DBGBCRn_EL1, 1),
+ SR_FGT(SYS_DBGBCRn_EL1(12), HDFGRTR, DBGBCRn_EL1, 1),
+ SR_FGT(SYS_DBGBCRn_EL1(13), HDFGRTR, DBGBCRn_EL1, 1),
+ SR_FGT(SYS_DBGBCRn_EL1(14), HDFGRTR, DBGBCRn_EL1, 1),
+ SR_FGT(SYS_DBGBCRn_EL1(15), HDFGRTR, DBGBCRn_EL1, 1),
+
+ /* HDFGRTR2_EL2 */
+ SR_FGT(SYS_MDSELR_EL1, HDFGRTR2, nMDSELR_EL1, 0),
+ SR_FGT(SYS_MDSTEPOP_EL1, HDFGRTR2, nMDSTEPOP_EL1, 0),
+ SR_FGT(SYS_PMCCNTSVR_EL1, HDFGRTR2, nPMSSDATA, 0),
+ SR_FGT_RANGE(SYS_PMEVCNTSVRn_EL1(0),
+ SYS_PMEVCNTSVRn_EL1(30),
+ HDFGRTR2, nPMSSDATA, 0),
+ SR_FGT(SYS_PMICNTSVR_EL1, HDFGRTR2, nPMSSDATA, 0),
+ SR_FGT(SYS_PMECR_EL1, HDFGRTR2, nPMECR_EL1, 0),
+ SR_FGT(SYS_PMIAR_EL1, HDFGRTR2, nPMIAR_EL1, 0),
+ SR_FGT(SYS_PMICFILTR_EL0, HDFGRTR2, nPMICFILTR_EL0, 0),
+ SR_FGT(SYS_PMICNTR_EL0, HDFGRTR2, nPMICNTR_EL0, 0),
+ SR_FGT(SYS_PMSSCR_EL1, HDFGRTR2, nPMSSCR_EL1, 0),
+ SR_FGT(SYS_PMUACR_EL1, HDFGRTR2, nPMUACR_EL1, 0),
+ SR_FGT(SYS_SPMACCESSR_EL1, HDFGRTR2, nSPMACCESSR_EL1, 0),
+ SR_FGT(SYS_SPMCFGR_EL1, HDFGRTR2, nSPMID, 0),
+ SR_FGT(SYS_SPMDEVARCH_EL1, HDFGRTR2, nSPMID, 0),
+ SR_FGT(SYS_SPMCGCRn_EL1(0), HDFGRTR2, nSPMID, 0),
+ SR_FGT(SYS_SPMCGCRn_EL1(1), HDFGRTR2, nSPMID, 0),
+ SR_FGT(SYS_SPMIIDR_EL1, HDFGRTR2, nSPMID, 0),
+ SR_FGT(SYS_SPMCNTENCLR_EL0, HDFGRTR2, nSPMCNTEN, 0),
+ SR_FGT(SYS_SPMCNTENSET_EL0, HDFGRTR2, nSPMCNTEN, 0),
+ SR_FGT(SYS_SPMCR_EL0, HDFGRTR2, nSPMCR_EL0, 0),
+ SR_FGT(SYS_SPMDEVAFF_EL1, HDFGRTR2, nSPMDEVAFF_EL1, 0),
+ /*
+ * We have up to 64 of these registers in ranges of 16, banked via
+ * SPMSELR_EL0.BANK. We're only concerned with the accessors here,
+ * not the architectural registers.
+ */
+ SR_FGT_RANGE(SYS_SPMEVCNTRn_EL0(0),
+ SYS_SPMEVCNTRn_EL0(15),
+ HDFGRTR2, nSPMEVCNTRn_EL0, 0),
+ SR_FGT_RANGE(SYS_SPMEVFILT2Rn_EL0(0),
+ SYS_SPMEVFILT2Rn_EL0(15),
+ HDFGRTR2, nSPMEVTYPERn_EL0, 0),
+ SR_FGT_RANGE(SYS_SPMEVFILTRn_EL0(0),
+ SYS_SPMEVFILTRn_EL0(15),
+ HDFGRTR2, nSPMEVTYPERn_EL0, 0),
+ SR_FGT_RANGE(SYS_SPMEVTYPERn_EL0(0),
+ SYS_SPMEVTYPERn_EL0(15),
+ HDFGRTR2, nSPMEVTYPERn_EL0, 0),
+ SR_FGT(SYS_SPMINTENCLR_EL1, HDFGRTR2, nSPMINTEN, 0),
+ SR_FGT(SYS_SPMINTENSET_EL1, HDFGRTR2, nSPMINTEN, 0),
+ SR_FGT(SYS_SPMOVSCLR_EL0, HDFGRTR2, nSPMOVS, 0),
+ SR_FGT(SYS_SPMOVSSET_EL0, HDFGRTR2, nSPMOVS, 0),
+ SR_FGT(SYS_SPMSCR_EL1, HDFGRTR2, nSPMSCR_EL1, 0),
+ SR_FGT(SYS_SPMSELR_EL0, HDFGRTR2, nSPMSELR_EL0, 0),
+ SR_FGT(SYS_TRCITECR_EL1, HDFGRTR2, nTRCITECR_EL1, 0),
+ SR_FGT(SYS_PMBMAR_EL1, HDFGRTR2, nPMBMAR_EL1, 0),
+ SR_FGT(SYS_PMSDSFR_EL1, HDFGRTR2, nPMSDSFR_EL1, 0),
+ SR_FGT(SYS_TRBMPAM_EL1, HDFGRTR2, nTRBMPAM_EL1, 0),
+
+ /*
+ * HDFGWTR_EL2
+ *
+ * Although HDFGRTR_EL2 and HDFGWTR_EL2 registers largely
+ * overlap in their bit assignment, there are a number of bits
+ * that are RES0 on one side, and an actual trap bit on the
+ * other. The policy chosen here is to describe all the
+ * read-side mappings, and only the write-side mappings that
+ * differ from the read side, and the trap handler will pick
+ * the correct shadow register based on the access type.
+ *
+ * Same model applies to the FEAT_FGT2 registers.
+ */
+ SR_FGT(SYS_TRFCR_EL1, HDFGWTR, TRFCR_EL1, 1),
+ SR_FGT(SYS_TRCOSLAR, HDFGWTR, TRCOSLAR, 1),
+ SR_FGT(SYS_PMCR_EL0, HDFGWTR, PMCR_EL0, 1),
+ SR_FGT(SYS_PMSWINC_EL0, HDFGWTR, PMSWINC_EL0, 1),
+ SR_FGT(SYS_OSLAR_EL1, HDFGWTR, OSLAR_EL1, 1),
+
+ /* HDFGWTR2_EL2 */
+ SR_FGT(SYS_PMZR_EL0, HDFGWTR2, nPMZR_EL0, 0),
+ SR_FGT(SYS_SPMZR_EL0, HDFGWTR2, nSPMEVCNTRn_EL0, 0),
+
+ /*
+ * HAFGRTR_EL2
+ */
+ SR_FGT(SYS_AMEVTYPER1_EL0(15), HAFGRTR, AMEVTYPER115_EL0, 1),
+ SR_FGT(SYS_AMEVTYPER1_EL0(14), HAFGRTR, AMEVTYPER114_EL0, 1),
+ SR_FGT(SYS_AMEVTYPER1_EL0(13), HAFGRTR, AMEVTYPER113_EL0, 1),
+ SR_FGT(SYS_AMEVTYPER1_EL0(12), HAFGRTR, AMEVTYPER112_EL0, 1),
+ SR_FGT(SYS_AMEVTYPER1_EL0(11), HAFGRTR, AMEVTYPER111_EL0, 1),
+ SR_FGT(SYS_AMEVTYPER1_EL0(10), HAFGRTR, AMEVTYPER110_EL0, 1),
+ SR_FGT(SYS_AMEVTYPER1_EL0(9), HAFGRTR, AMEVTYPER19_EL0, 1),
+ SR_FGT(SYS_AMEVTYPER1_EL0(8), HAFGRTR, AMEVTYPER18_EL0, 1),
+ SR_FGT(SYS_AMEVTYPER1_EL0(7), HAFGRTR, AMEVTYPER17_EL0, 1),
+ SR_FGT(SYS_AMEVTYPER1_EL0(6), HAFGRTR, AMEVTYPER16_EL0, 1),
+ SR_FGT(SYS_AMEVTYPER1_EL0(5), HAFGRTR, AMEVTYPER15_EL0, 1),
+ SR_FGT(SYS_AMEVTYPER1_EL0(4), HAFGRTR, AMEVTYPER14_EL0, 1),
+ SR_FGT(SYS_AMEVTYPER1_EL0(3), HAFGRTR, AMEVTYPER13_EL0, 1),
+ SR_FGT(SYS_AMEVTYPER1_EL0(2), HAFGRTR, AMEVTYPER12_EL0, 1),
+ SR_FGT(SYS_AMEVTYPER1_EL0(1), HAFGRTR, AMEVTYPER11_EL0, 1),
+ SR_FGT(SYS_AMEVTYPER1_EL0(0), HAFGRTR, AMEVTYPER10_EL0, 1),
+ SR_FGT(SYS_AMEVCNTR1_EL0(15), HAFGRTR, AMEVCNTR115_EL0, 1),
+ SR_FGT(SYS_AMEVCNTR1_EL0(14), HAFGRTR, AMEVCNTR114_EL0, 1),
+ SR_FGT(SYS_AMEVCNTR1_EL0(13), HAFGRTR, AMEVCNTR113_EL0, 1),
+ SR_FGT(SYS_AMEVCNTR1_EL0(12), HAFGRTR, AMEVCNTR112_EL0, 1),
+ SR_FGT(SYS_AMEVCNTR1_EL0(11), HAFGRTR, AMEVCNTR111_EL0, 1),
+ SR_FGT(SYS_AMEVCNTR1_EL0(10), HAFGRTR, AMEVCNTR110_EL0, 1),
+ SR_FGT(SYS_AMEVCNTR1_EL0(9), HAFGRTR, AMEVCNTR19_EL0, 1),
+ SR_FGT(SYS_AMEVCNTR1_EL0(8), HAFGRTR, AMEVCNTR18_EL0, 1),
+ SR_FGT(SYS_AMEVCNTR1_EL0(7), HAFGRTR, AMEVCNTR17_EL0, 1),
+ SR_FGT(SYS_AMEVCNTR1_EL0(6), HAFGRTR, AMEVCNTR16_EL0, 1),
+ SR_FGT(SYS_AMEVCNTR1_EL0(5), HAFGRTR, AMEVCNTR15_EL0, 1),
+ SR_FGT(SYS_AMEVCNTR1_EL0(4), HAFGRTR, AMEVCNTR14_EL0, 1),
+ SR_FGT(SYS_AMEVCNTR1_EL0(3), HAFGRTR, AMEVCNTR13_EL0, 1),
+ SR_FGT(SYS_AMEVCNTR1_EL0(2), HAFGRTR, AMEVCNTR12_EL0, 1),
+ SR_FGT(SYS_AMEVCNTR1_EL0(1), HAFGRTR, AMEVCNTR11_EL0, 1),
+ SR_FGT(SYS_AMEVCNTR1_EL0(0), HAFGRTR, AMEVCNTR10_EL0, 1),
+ SR_FGT(SYS_AMCNTENCLR1_EL0, HAFGRTR, AMCNTEN1, 1),
+ SR_FGT(SYS_AMCNTENSET1_EL0, HAFGRTR, AMCNTEN1, 1),
+ SR_FGT(SYS_AMCNTENCLR0_EL0, HAFGRTR, AMCNTEN0, 1),
+ SR_FGT(SYS_AMCNTENSET0_EL0, HAFGRTR, AMCNTEN0, 1),
+ SR_FGT(SYS_AMEVCNTR0_EL0(3), HAFGRTR, AMEVCNTR03_EL0, 1),
+ SR_FGT(SYS_AMEVCNTR0_EL0(2), HAFGRTR, AMEVCNTR02_EL0, 1),
+ SR_FGT(SYS_AMEVCNTR0_EL0(1), HAFGRTR, AMEVCNTR01_EL0, 1),
+ SR_FGT(SYS_AMEVCNTR0_EL0(0), HAFGRTR, AMEVCNTR00_EL0, 1),
+};
+
+/*
+ * Additional FGTs that do not fire with ESR_EL2.EC==0x18. This table
+ * isn't used for exception routing, but only as a promise that the
+ * trap is handled somewhere else.
+ */
+static const union trap_config non_0x18_fgt[] __initconst = {
+ FGT(HFGITR, PSBCSYNC, 1),
+ FGT(HFGITR, nGCSSTR_EL1, 0),
+ FGT(HFGITR, SVC_EL1, 1),
+ FGT(HFGITR, SVC_EL0, 1),
+ FGT(HFGITR, ERET, 1),
+ FGT(HFGITR2, TSBCSYNC, 1),
+};
+
+static union trap_config get_trap_config(u32 sysreg)
+{
+ return (union trap_config) {
+ .val = xa_to_value(xa_load(&sr_forward_xa, sysreg)),
+ };
+}
+
+static __init void print_nv_trap_error(const struct encoding_to_trap_config *tc,
+ const char *type, int err)
+{
+ kvm_err("%s line %d encoding range "
+ "(%d, %d, %d, %d, %d) - (%d, %d, %d, %d, %d) (err=%d)\n",
+ type, tc->line,
+ sys_reg_Op0(tc->encoding), sys_reg_Op1(tc->encoding),
+ sys_reg_CRn(tc->encoding), sys_reg_CRm(tc->encoding),
+ sys_reg_Op2(tc->encoding),
+ sys_reg_Op0(tc->end), sys_reg_Op1(tc->end),
+ sys_reg_CRn(tc->end), sys_reg_CRm(tc->end),
+ sys_reg_Op2(tc->end),
+ err);
+}
+
+static u32 encoding_next(u32 encoding)
+{
+ u8 op0, op1, crn, crm, op2;
+
+ op0 = sys_reg_Op0(encoding);
+ op1 = sys_reg_Op1(encoding);
+ crn = sys_reg_CRn(encoding);
+ crm = sys_reg_CRm(encoding);
+ op2 = sys_reg_Op2(encoding);
+
+ if (op2 < Op2_mask)
+ return sys_reg(op0, op1, crn, crm, op2 + 1);
+ if (crm < CRm_mask)
+ return sys_reg(op0, op1, crn, crm + 1, 0);
+ if (crn < CRn_mask)
+ return sys_reg(op0, op1, crn + 1, 0, 0);
+ if (op1 < Op1_mask)
+ return sys_reg(op0, op1 + 1, 0, 0, 0);
+
+ return sys_reg(op0 + 1, 0, 0, 0, 0);
+}
+
+#define FGT_MASKS(__n, __m) \
+ struct fgt_masks __n = { .str = #__m, .res0 = __m, }
+
+FGT_MASKS(hfgrtr_masks, HFGRTR_EL2_RES0);
+FGT_MASKS(hfgwtr_masks, HFGWTR_EL2_RES0);
+FGT_MASKS(hfgitr_masks, HFGITR_EL2_RES0);
+FGT_MASKS(hdfgrtr_masks, HDFGRTR_EL2_RES0);
+FGT_MASKS(hdfgwtr_masks, HDFGWTR_EL2_RES0);
+FGT_MASKS(hafgrtr_masks, HAFGRTR_EL2_RES0);
+FGT_MASKS(hfgrtr2_masks, HFGRTR2_EL2_RES0);
+FGT_MASKS(hfgwtr2_masks, HFGWTR2_EL2_RES0);
+FGT_MASKS(hfgitr2_masks, HFGITR2_EL2_RES0);
+FGT_MASKS(hdfgrtr2_masks, HDFGRTR2_EL2_RES0);
+FGT_MASKS(hdfgwtr2_masks, HDFGWTR2_EL2_RES0);
+
+static __init bool aggregate_fgt(union trap_config tc)
+{
+ struct fgt_masks *rmasks, *wmasks;
+
+ switch (tc.fgt) {
+ case HFGRTR_GROUP:
+ rmasks = &hfgrtr_masks;
+ wmasks = &hfgwtr_masks;
+ break;
+ case HDFGRTR_GROUP:
+ rmasks = &hdfgrtr_masks;
+ wmasks = &hdfgwtr_masks;
+ break;
+ case HAFGRTR_GROUP:
+ rmasks = &hafgrtr_masks;
+ wmasks = NULL;
+ break;
+ case HFGITR_GROUP:
+ rmasks = &hfgitr_masks;
+ wmasks = NULL;
+ break;
+ case HFGRTR2_GROUP:
+ rmasks = &hfgrtr2_masks;
+ wmasks = &hfgwtr2_masks;
+ break;
+ case HDFGRTR2_GROUP:
+ rmasks = &hdfgrtr2_masks;
+ wmasks = &hdfgwtr2_masks;
+ break;
+ case HFGITR2_GROUP:
+ rmasks = &hfgitr2_masks;
+ wmasks = NULL;
+ break;
+ }
+
+ /*
+ * A bit can be reserved in either the R or W register, but
+ * not both.
+ */
+ if ((BIT(tc.bit) & rmasks->res0) &&
+ (!wmasks || (BIT(tc.bit) & wmasks->res0)))
+ return false;
+
+ if (tc.pol)
+ rmasks->mask |= BIT(tc.bit) & ~rmasks->res0;
+ else
+ rmasks->nmask |= BIT(tc.bit) & ~rmasks->res0;
+
+ if (wmasks) {
+ if (tc.pol)
+ wmasks->mask |= BIT(tc.bit) & ~wmasks->res0;
+ else
+ wmasks->nmask |= BIT(tc.bit) & ~wmasks->res0;
+ }
+
+ return true;
+}
+
+static __init int check_fgt_masks(struct fgt_masks *masks)
+{
+ unsigned long duplicate = masks->mask & masks->nmask;
+ u64 res0 = masks->res0;
+ int ret = 0;
+
+ if (duplicate) {
+ int i;
+
+ for_each_set_bit(i, &duplicate, 64) {
+ kvm_err("%s[%d] bit has both polarities\n",
+ masks->str, i);
+ }
+
+ ret = -EINVAL;
+ }
+
+ masks->res0 = ~(masks->mask | masks->nmask);
+ if (masks->res0 != res0)
+ kvm_info("Implicit %s = %016llx, expecting %016llx\n",
+ masks->str, masks->res0, res0);
+
+ return ret;
+}
+
+static __init int check_all_fgt_masks(int ret)
+{
+ static struct fgt_masks * const masks[] __initconst = {
+ &hfgrtr_masks,
+ &hfgwtr_masks,
+ &hfgitr_masks,
+ &hdfgrtr_masks,
+ &hdfgwtr_masks,
+ &hafgrtr_masks,
+ &hfgrtr2_masks,
+ &hfgwtr2_masks,
+ &hfgitr2_masks,
+ &hdfgrtr2_masks,
+ &hdfgwtr2_masks,
+ };
+ int err = 0;
+
+ for (int i = 0; i < ARRAY_SIZE(masks); i++)
+ err |= check_fgt_masks(masks[i]);
+
+ return ret ?: err;
+}
+
+#define for_each_encoding_in(__x, __s, __e) \
+ for (u32 __x = __s; __x <= __e; __x = encoding_next(__x))
+
+int __init populate_nv_trap_config(void)
+{
+ int ret = 0;
+
+ BUILD_BUG_ON(sizeof(union trap_config) != sizeof(void *));
+ BUILD_BUG_ON(__NR_CGT_GROUP_IDS__ > BIT(TC_CGT_BITS));
+ BUILD_BUG_ON(__NR_FGT_GROUP_IDS__ > BIT(TC_FGT_BITS));
+ BUILD_BUG_ON(__NR_FG_FILTER_IDS__ > BIT(TC_FGF_BITS));
+ BUILD_BUG_ON(__HCRX_EL2_MASK & __HCRX_EL2_nMASK);
+
+ for (int i = 0; i < ARRAY_SIZE(encoding_to_cgt); i++) {
+ const struct encoding_to_trap_config *cgt = &encoding_to_cgt[i];
+ void *prev;
+
+ if (cgt->tc.val & BIT(63)) {
+ kvm_err("CGT[%d] has MBZ bit set\n", i);
+ ret = -EINVAL;
+ }
+
+ for_each_encoding_in(enc, cgt->encoding, cgt->end) {
+ prev = xa_store(&sr_forward_xa, enc,
+ xa_mk_value(cgt->tc.val), GFP_KERNEL);
+ if (prev && !xa_is_err(prev)) {
+ ret = -EINVAL;
+ print_nv_trap_error(cgt, "Duplicate CGT", ret);
+ }
+
+ if (xa_is_err(prev)) {
+ ret = xa_err(prev);
+ print_nv_trap_error(cgt, "Failed CGT insertion", ret);
+ }
+ }
+ }
+
+ if (__HCRX_EL2_RES0 != HCRX_EL2_RES0)
+ kvm_info("Sanitised HCR_EL2_RES0 = %016llx, expecting %016llx\n",
+ __HCRX_EL2_RES0, HCRX_EL2_RES0);
+
+ kvm_info("nv: %ld coarse grained trap handlers\n",
+ ARRAY_SIZE(encoding_to_cgt));
+
+ if (!cpus_have_final_cap(ARM64_HAS_FGT))
+ goto check_mcb;
+
+ for (int i = 0; i < ARRAY_SIZE(encoding_to_fgt); i++) {
+ const struct encoding_to_trap_config *fgt = &encoding_to_fgt[i];
+ union trap_config tc;
+ void *prev;
+
+ if (fgt->tc.fgt >= __NR_FGT_GROUP_IDS__) {
+ ret = -EINVAL;
+ print_nv_trap_error(fgt, "Invalid FGT", ret);
+ }
+
+ for_each_encoding_in(enc, fgt->encoding, fgt->end) {
+ tc = get_trap_config(enc);
+
+ if (tc.fgt) {
+ ret = -EINVAL;
+ print_nv_trap_error(fgt, "Duplicate FGT", ret);
+ }
+
+ tc.val |= fgt->tc.val;
+ prev = xa_store(&sr_forward_xa, enc,
+ xa_mk_value(tc.val), GFP_KERNEL);
+
+ if (xa_is_err(prev)) {
+ ret = xa_err(prev);
+ print_nv_trap_error(fgt, "Failed FGT insertion", ret);
+ }
+
+ if (!aggregate_fgt(tc)) {
+ ret = -EINVAL;
+ print_nv_trap_error(fgt, "FGT bit is reserved", ret);
+ }
+ }
+ }
+
+ for (int i = 0; i < ARRAY_SIZE(non_0x18_fgt); i++) {
+ if (!aggregate_fgt(non_0x18_fgt[i])) {
+ ret = -EINVAL;
+ kvm_err("non_0x18_fgt[%d] is reserved\n", i);
+ }
+ }
+
+ ret = check_all_fgt_masks(ret);
+
+ kvm_info("nv: %ld fine grained trap handlers\n",
+ ARRAY_SIZE(encoding_to_fgt));
+
+check_mcb:
+ for (int id = __MULTIPLE_CONTROL_BITS__; id < __COMPLEX_CONDITIONS__; id++) {
+ const enum cgt_group_id *cgids;
+
+ cgids = coarse_control_combo[id - __MULTIPLE_CONTROL_BITS__];
+
+ for (int i = 0; cgids[i] != __RESERVED__; i++) {
+ if (cgids[i] >= __MULTIPLE_CONTROL_BITS__ &&
+ cgids[i] < __COMPLEX_CONDITIONS__) {
+ kvm_err("Recursive MCB %d/%d\n", id, cgids[i]);
+ ret = -EINVAL;
+ }
+ }
+ }
+
+ if (ret)
+ xa_destroy(&sr_forward_xa);
+
+ return ret;
+}
+
+int __init populate_sysreg_config(const struct sys_reg_desc *sr,
+ unsigned int idx)
+{
+ union trap_config tc;
+ u32 encoding;
+ void *ret;
+
+ /*
+ * 0 is a valid value for the index, but not for the storage.
+ * We'll store (idx+1), so check against an offset'd limit.
+ */
+ if (idx >= (BIT(TC_SRI_BITS) - 1)) {
+ kvm_err("sysreg %s (%d) out of range\n", sr->name, idx);
+ return -EINVAL;
+ }
+
+ encoding = sys_reg(sr->Op0, sr->Op1, sr->CRn, sr->CRm, sr->Op2);
+ tc = get_trap_config(encoding);
+
+ if (tc.sri) {
+ kvm_err("sysreg %s (%d) duplicate entry (%d)\n",
+ sr->name, idx - 1, tc.sri);
+ return -EINVAL;
+ }
+
+ tc.sri = idx + 1;
+ ret = xa_store(&sr_forward_xa, encoding,
+ xa_mk_value(tc.val), GFP_KERNEL);
+
+ return xa_err(ret);
+}
+
+static enum trap_behaviour get_behaviour(struct kvm_vcpu *vcpu,
+ const struct trap_bits *tb)
+{
+ enum trap_behaviour b = BEHAVE_HANDLE_LOCALLY;
+ u64 val;
+
+ val = __vcpu_sys_reg(vcpu, tb->index);
+ if ((val & tb->mask) == tb->value)
+ b |= tb->behaviour;
+
+ return b;
+}
+
+static enum trap_behaviour __compute_trap_behaviour(struct kvm_vcpu *vcpu,
+ const enum cgt_group_id id,
+ enum trap_behaviour b)
+{
+ switch (id) {
+ const enum cgt_group_id *cgids;
+
+ case __RESERVED__ ... __MULTIPLE_CONTROL_BITS__ - 1:
+ if (likely(id != __RESERVED__))
+ b |= get_behaviour(vcpu, &coarse_trap_bits[id]);
+ break;
+ case __MULTIPLE_CONTROL_BITS__ ... __COMPLEX_CONDITIONS__ - 1:
+ /* Yes, this is recursive. Don't do anything stupid. */
+ cgids = coarse_control_combo[id - __MULTIPLE_CONTROL_BITS__];
+ for (int i = 0; cgids[i] != __RESERVED__; i++)
+ b |= __compute_trap_behaviour(vcpu, cgids[i], b);
+ break;
+ default:
+ if (ARRAY_SIZE(ccc))
+ b |= ccc[id - __COMPLEX_CONDITIONS__](vcpu);
+ break;
+ }
+
+ return b;
+}
+
+static enum trap_behaviour compute_trap_behaviour(struct kvm_vcpu *vcpu,
+ const union trap_config tc)
+{
+ enum trap_behaviour b = BEHAVE_HANDLE_LOCALLY;
+
+ return __compute_trap_behaviour(vcpu, tc.cgt, b);
+}
+
+static u64 kvm_get_sysreg_res0(struct kvm *kvm, enum vcpu_sysreg sr)
+{
+ struct kvm_sysreg_masks *masks;
+
+ /* Only handle the VNCR-backed regs for now */
+ if (sr < __VNCR_START__)
+ return 0;
+
+ masks = kvm->arch.sysreg_masks;
+
+ return masks->mask[sr - __VNCR_START__].res0;
+}
+
+static bool check_fgt_bit(struct kvm_vcpu *vcpu, enum vcpu_sysreg sr,
+ const union trap_config tc)
+{
+ struct kvm *kvm = vcpu->kvm;
+ u64 val;
+
+ /*
+ * KVM doesn't know about any FGTs that apply to the host, and hopefully
+ * that'll remain the case.
+ */
+ if (is_hyp_ctxt(vcpu))
+ return false;
+
+ val = __vcpu_sys_reg(vcpu, sr);
+
+ if (tc.pol)
+ return (val & BIT(tc.bit));
+
+ /*
+ * FGTs with negative polarities are an absolute nightmare, as
+ * we need to evaluate the bit in the light of the feature
+ * that defines it. WTF were they thinking?
+ *
+ * So let's check if the bit has been earmarked as RES0, as
+ * this indicates an unimplemented feature.
+ */
+ if (val & BIT(tc.bit))
+ return false;
+
+ return !(kvm_get_sysreg_res0(kvm, sr) & BIT(tc.bit));
+}
+
+bool triage_sysreg_trap(struct kvm_vcpu *vcpu, int *sr_index)
+{
+ enum vcpu_sysreg fgtreg;
+ union trap_config tc;
+ enum trap_behaviour b;
+ bool is_read;
+ u32 sysreg;
+ u64 esr;
+
+ esr = kvm_vcpu_get_esr(vcpu);
+ sysreg = esr_sys64_to_sysreg(esr);
+ is_read = (esr & ESR_ELx_SYS64_ISS_DIR_MASK) == ESR_ELx_SYS64_ISS_DIR_READ;
+
+ tc = get_trap_config(sysreg);
+
+ /*
+ * A value of 0 for the whole entry means that we know nothing
+ * for this sysreg, and that it cannot be re-injected into the
+ * nested hypervisor. In this situation, let's cut it short.
+ */
+ if (!tc.val)
+ goto local;
+
+ /*
+ * If a sysreg can be trapped using a FGT, first check whether we
+ * trap for the purpose of forbidding the feature. In that case,
+ * inject an UNDEF.
+ */
+ if (tc.fgt != __NO_FGT_GROUP__ &&
+ (vcpu->kvm->arch.fgu[tc.fgt] & BIT(tc.bit))) {
+ kvm_inject_undefined(vcpu);
+ return true;
+ }
+
+ /*
+ * If we're not nesting, immediately return to the caller, with the
+ * sysreg index, should we have it.
+ */
+ if (!vcpu_has_nv(vcpu))
+ goto local;
+
+ /*
+ * There are a few traps that take effect InHost, but are constrained
+ * to EL0. Don't bother with computing the trap behaviour if the vCPU
+ * isn't in EL0.
+ */
+ if (is_hyp_ctxt(vcpu) && !vcpu_is_host_el0(vcpu))
+ goto local;
+
+ switch ((enum fgt_group_id)tc.fgt) {
+ case __NO_FGT_GROUP__:
+ break;
+
+ case HFGRTR_GROUP:
+ fgtreg = is_read ? HFGRTR_EL2 : HFGWTR_EL2;
+ break;
+
+ case HDFGRTR_GROUP:
+ fgtreg = is_read ? HDFGRTR_EL2 : HDFGWTR_EL2;
+ break;
+
+ case HAFGRTR_GROUP:
+ fgtreg = HAFGRTR_EL2;
+ break;
+
+ case HFGITR_GROUP:
+ fgtreg = HFGITR_EL2;
+ switch (tc.fgf) {
+ u64 tmp;
+
+ case __NO_FGF__:
+ break;
+
+ case HCRX_FGTnXS:
+ tmp = __vcpu_sys_reg(vcpu, HCRX_EL2);
+ if (tmp & HCRX_EL2_FGTnXS)
+ tc.fgt = __NO_FGT_GROUP__;
+ }
+ break;
+
+ case HFGRTR2_GROUP:
+ fgtreg = is_read ? HFGRTR2_EL2 : HFGWTR2_EL2;
+ break;
+
+ case HDFGRTR2_GROUP:
+ fgtreg = is_read ? HDFGRTR2_EL2 : HDFGWTR2_EL2;
+ break;
+
+ case HFGITR2_GROUP:
+ fgtreg = HFGITR2_EL2;
+ break;
+
+ default:
+ /* Something is really wrong, bail out */
+ WARN_ONCE(1, "Bad FGT group (encoding %08x, config %016llx)\n",
+ sysreg, tc.val);
+ goto local;
+ }
+
+ if (tc.fgt != __NO_FGT_GROUP__ && check_fgt_bit(vcpu, fgtreg, tc))
+ goto inject;
+
+ b = compute_trap_behaviour(vcpu, tc);
+
+ if (!(b & BEHAVE_FORWARD_IN_HOST_EL0) && vcpu_is_host_el0(vcpu))
+ goto local;
+
+ if (((b & BEHAVE_FORWARD_READ) && is_read) ||
+ ((b & BEHAVE_FORWARD_WRITE) && !is_read))
+ goto inject;
+
+local:
+ if (!tc.sri) {
+ struct sys_reg_params params;
+
+ params = esr_sys64_to_params(esr);
+
+ /*
+ * Check for the IMPDEF range, as per DDI0487 J.a,
+ * D18.3.2 Reserved encodings for IMPLEMENTATION
+ * DEFINED registers.
+ */
+ if (!(params.Op0 == 3 && (params.CRn & 0b1011) == 0b1011))
+ print_sys_reg_msg(&params,
+ "Unsupported guest access at: %lx\n",
+ *vcpu_pc(vcpu));
+ kvm_inject_undefined(vcpu);
+ return true;
+ }
+
+ *sr_index = tc.sri - 1;
+ return false;
+
+inject:
+ trace_kvm_forward_sysreg_trap(vcpu, sysreg, is_read);
+
+ kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu));
+ return true;
+}
+
+static bool __forward_traps(struct kvm_vcpu *vcpu, unsigned int reg, u64 control_bit)
+{
+ if (is_nested_ctxt(vcpu) &&
+ (__vcpu_sys_reg(vcpu, reg) & control_bit)) {
+ kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu));
+ return true;
+ }
+ return false;
+}
+
+static bool forward_hcr_traps(struct kvm_vcpu *vcpu, u64 control_bit)
+{
+ return __forward_traps(vcpu, HCR_EL2, control_bit);
+}
+
+bool forward_smc_trap(struct kvm_vcpu *vcpu)
+{
+ return forward_hcr_traps(vcpu, HCR_TSC);
+}
+
+static bool forward_mdcr_traps(struct kvm_vcpu *vcpu, u64 control_bit)
+{
+ return __forward_traps(vcpu, MDCR_EL2, control_bit);
+}
+
+bool forward_debug_exception(struct kvm_vcpu *vcpu)
+{
+ return forward_mdcr_traps(vcpu, MDCR_EL2_TDE);
+}
+
+static u64 kvm_check_illegal_exception_return(struct kvm_vcpu *vcpu, u64 spsr)
+{
+ u64 mode = spsr & PSR_MODE_MASK;
+
+ /*
+ * Possible causes for an Illegal Exception Return from EL2:
+ * - trying to return to EL3
+ * - trying to return to an illegal M value
+ * - trying to return to a 32bit EL
+ * - trying to return to EL1 with HCR_EL2.TGE set
+ */
+ if (mode == PSR_MODE_EL3t || mode == PSR_MODE_EL3h ||
+ mode == 0b00001 || (mode & BIT(1)) ||
+ (spsr & PSR_MODE32_BIT) ||
+ (vcpu_el2_tge_is_set(vcpu) && (mode == PSR_MODE_EL1t ||
+ mode == PSR_MODE_EL1h))) {
+ /*
+ * The guest is playing with our nerves. Preserve EL, SP,
+ * masks, flags from the existing PSTATE, and set IL.
+ * The HW will then generate an Illegal State Exception
+ * immediately after ERET.
+ */
+ spsr = *vcpu_cpsr(vcpu);
+
+ spsr &= (PSR_D_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT |
+ PSR_N_BIT | PSR_Z_BIT | PSR_C_BIT | PSR_V_BIT |
+ PSR_MODE_MASK | PSR_MODE32_BIT);
+ spsr |= PSR_IL_BIT;
+ }
+
+ return spsr;
+}
+
+void kvm_emulate_nested_eret(struct kvm_vcpu *vcpu)
+{
+ u64 spsr, elr, esr;
+
+ spsr = vcpu_read_sys_reg(vcpu, SPSR_EL2);
+ spsr = kvm_check_illegal_exception_return(vcpu, spsr);
+
+ /* Check for an ERETAx */
+ esr = kvm_vcpu_get_esr(vcpu);
+ if (esr_iss_is_eretax(esr) && !kvm_auth_eretax(vcpu, &elr)) {
+ /*
+ * Oh no, ERETAx failed to authenticate.
+ *
+ * If we have FPACCOMBINE and we don't have a pending
+ * Illegal Execution State exception (which has priority
+ * over FPAC), deliver an exception right away.
+ *
+ * Otherwise, let the mangled ELR value trickle down the
+ * ERET handling, and the guest will have a little surprise.
+ */
+ if (kvm_has_pauth(vcpu->kvm, FPACCOMBINE) && !(spsr & PSR_IL_BIT)) {
+ esr &= ESR_ELx_ERET_ISS_ERETA;
+ esr |= FIELD_PREP(ESR_ELx_EC_MASK, ESR_ELx_EC_FPAC);
+ kvm_inject_nested_sync(vcpu, esr);
+ return;
+ }
+ }
+
+ preempt_disable();
+ vcpu_set_flag(vcpu, IN_NESTED_ERET);
+ kvm_arch_vcpu_put(vcpu);
+
+ if (!esr_iss_is_eretax(esr))
+ elr = __vcpu_sys_reg(vcpu, ELR_EL2);
+
+ trace_kvm_nested_eret(vcpu, elr, spsr);
+
+ *vcpu_pc(vcpu) = elr;
+ *vcpu_cpsr(vcpu) = spsr;
+
+ kvm_arch_vcpu_load(vcpu, smp_processor_id());
+ vcpu_clear_flag(vcpu, IN_NESTED_ERET);
+ preempt_enable();
+
+ if (kvm_vcpu_has_pmu(vcpu))
+ kvm_pmu_nested_transition(vcpu);
+}
+
+static void kvm_inject_el2_exception(struct kvm_vcpu *vcpu, u64 esr_el2,
+ enum exception_type type)
+{
+ trace_kvm_inject_nested_exception(vcpu, esr_el2, type);
+
+ switch (type) {
+ case except_type_sync:
+ kvm_pend_exception(vcpu, EXCEPT_AA64_EL2_SYNC);
+ vcpu_write_sys_reg(vcpu, esr_el2, ESR_EL2);
+ break;
+ case except_type_irq:
+ kvm_pend_exception(vcpu, EXCEPT_AA64_EL2_IRQ);
+ break;
+ case except_type_serror:
+ kvm_pend_exception(vcpu, EXCEPT_AA64_EL2_SERR);
+ break;
+ default:
+ WARN_ONCE(1, "Unsupported EL2 exception injection %d\n", type);
+ }
+}
+
+/*
+ * Emulate taking an exception to EL2.
+ * See ARM ARM J8.1.2 AArch64.TakeException()
+ */
+static int kvm_inject_nested(struct kvm_vcpu *vcpu, u64 esr_el2,
+ enum exception_type type)
+{
+ u64 pstate, mode;
+ bool direct_inject;
+
+ if (!vcpu_has_nv(vcpu)) {
+ kvm_err("Unexpected call to %s for the non-nesting configuration\n",
+ __func__);
+ return -EINVAL;
+ }
+
+ /*
+ * As for ERET, we can avoid doing too much on the injection path by
+ * checking that we either took the exception from a VHE host
+ * userspace or from vEL2. In these cases, there is no change in
+ * translation regime (or anything else), so let's do as little as
+ * possible.
+ */
+ pstate = *vcpu_cpsr(vcpu);
+ mode = pstate & (PSR_MODE_MASK | PSR_MODE32_BIT);
+
+ direct_inject = (mode == PSR_MODE_EL0t &&
+ vcpu_el2_e2h_is_set(vcpu) &&
+ vcpu_el2_tge_is_set(vcpu));
+ direct_inject |= (mode == PSR_MODE_EL2h || mode == PSR_MODE_EL2t);
+
+ if (direct_inject) {
+ kvm_inject_el2_exception(vcpu, esr_el2, type);
+ return 1;
+ }
+
+ preempt_disable();
+
+ /*
+ * We may have an exception or PC update in the EL0/EL1 context.
+ * Commit it before entering EL2.
+ */
+ __kvm_adjust_pc(vcpu);
+
+ kvm_arch_vcpu_put(vcpu);
+
+ kvm_inject_el2_exception(vcpu, esr_el2, type);
+
+ /*
+ * A hard requirement is that a switch between EL1 and EL2
+ * contexts has to happen between a put/load, so that we can
+ * pick the correct timer and interrupt configuration, among
+ * other things.
+ *
+ * Make sure the exception actually took place before we load
+ * the new context.
+ */
+ __kvm_adjust_pc(vcpu);
+
+ kvm_arch_vcpu_load(vcpu, smp_processor_id());
+ preempt_enable();
+
+ if (kvm_vcpu_has_pmu(vcpu))
+ kvm_pmu_nested_transition(vcpu);
+
+ return 1;
+}
+
+int kvm_inject_nested_sync(struct kvm_vcpu *vcpu, u64 esr_el2)
+{
+ return kvm_inject_nested(vcpu, esr_el2, except_type_sync);
+}
+
+int kvm_inject_nested_irq(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Do not inject an irq if the:
+ * - Current exception level is EL2, and
+ * - virtual HCR_EL2.TGE == 0
+ * - virtual HCR_EL2.IMO == 0
+ *
+ * See Table D1-17 "Physical interrupt target and masking when EL3 is
+ * not implemented and EL2 is implemented" in ARM DDI 0487C.a.
+ */
+
+ if (vcpu_is_el2(vcpu) && !vcpu_el2_tge_is_set(vcpu) &&
+ !(__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_IMO))
+ return 1;
+
+ /* esr_el2 value doesn't matter for exits due to irqs. */
+ return kvm_inject_nested(vcpu, 0, except_type_irq);
+}
+
+int kvm_inject_nested_sea(struct kvm_vcpu *vcpu, bool iabt, u64 addr)
+{
+ u64 esr = FIELD_PREP(ESR_ELx_EC_MASK,
+ iabt ? ESR_ELx_EC_IABT_LOW : ESR_ELx_EC_DABT_LOW);
+ esr |= ESR_ELx_FSC_EXTABT | ESR_ELx_IL;
+
+ vcpu_write_sys_reg(vcpu, addr, FAR_EL2);
+
+ if (__vcpu_sys_reg(vcpu, SCTLR2_EL2) & SCTLR2_EL1_EASE)
+ return kvm_inject_nested(vcpu, esr, except_type_serror);
+
+ return kvm_inject_nested_sync(vcpu, esr);
+}
+
+int kvm_inject_nested_serror(struct kvm_vcpu *vcpu, u64 esr)
+{
+ /*
+ * Hardware sets up the EC field when propagating ESR as a result of
+ * vSError injection. Manually populate EC for an emulated SError
+ * exception.
+ */
+ esr |= FIELD_PREP(ESR_ELx_EC_MASK, ESR_ELx_EC_SERROR);
+ return kvm_inject_nested(vcpu, esr, except_type_serror);
+}
diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c
index 02dd7e9ebd39..15e17aca1dec 100644
--- a/arch/arm64/kvm/fpsimd.c
+++ b/arch/arm64/kvm/fpsimd.c
@@ -14,57 +14,6 @@
#include <asm/kvm_mmu.h>
#include <asm/sysreg.h>
-void kvm_vcpu_unshare_task_fp(struct kvm_vcpu *vcpu)
-{
- struct task_struct *p = vcpu->arch.parent_task;
- struct user_fpsimd_state *fpsimd;
-
- if (!is_protected_kvm_enabled() || !p)
- return;
-
- fpsimd = &p->thread.uw.fpsimd_state;
- kvm_unshare_hyp(fpsimd, fpsimd + 1);
- put_task_struct(p);
-}
-
-/*
- * Called on entry to KVM_RUN unless this vcpu previously ran at least
- * once and the most recent prior KVM_RUN for this vcpu was called from
- * the same task as current (highly likely).
- *
- * This is guaranteed to execute before kvm_arch_vcpu_load_fp(vcpu),
- * such that on entering hyp the relevant parts of current are already
- * mapped.
- */
-int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu)
-{
- int ret;
-
- struct user_fpsimd_state *fpsimd = &current->thread.uw.fpsimd_state;
-
- kvm_vcpu_unshare_task_fp(vcpu);
-
- /* Make sure the host task fpsimd state is visible to hyp: */
- ret = kvm_share_hyp(fpsimd, fpsimd + 1);
- if (ret)
- return ret;
-
- vcpu->arch.host_fpsimd_state = kern_hyp_va(fpsimd);
-
- /*
- * We need to keep current's task_struct pinned until its data has been
- * unshared with the hypervisor to make sure it is not re-used by the
- * kernel and donated to someone else while already shared -- see
- * kvm_vcpu_unshare_task_fp() for the matching put_task_struct().
- */
- if (is_protected_kvm_enabled()) {
- get_task_struct(current);
- vcpu->arch.parent_task = current;
- }
-
- return 0;
-}
-
/*
* Prepare vcpu for saving the host's FPSIMD state and loading the guest's.
* The actual loading is done by the FPSIMD access trap taken to hyp.
@@ -79,37 +28,22 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu)
if (!system_supports_fpsimd())
return;
- fpsimd_kvm_prepare();
-
- vcpu->arch.fp_state = FP_STATE_HOST_OWNED;
-
- vcpu_clear_flag(vcpu, HOST_SVE_ENABLED);
- if (read_sysreg(cpacr_el1) & CPACR_EL1_ZEN_EL0EN)
- vcpu_set_flag(vcpu, HOST_SVE_ENABLED);
-
/*
- * We don't currently support SME guests but if we leave
- * things in streaming mode then when the guest starts running
- * FPSIMD or SVE code it may generate SME traps so as a
- * special case if we are in streaming mode we force the host
- * state to be saved now and exit streaming mode so that we
- * don't have to handle any SME traps for valid guest
- * operations. Do this for ZA as well for now for simplicity.
+ * Ensure that any host FPSIMD/SVE/SME state is saved and unbound such
+ * that the host kernel is responsible for restoring this state upon
+ * return to userspace, and the hyp code doesn't need to save anything.
+ *
+ * When the host may use SME, fpsimd_save_and_flush_cpu_state() ensures
+ * that PSTATE.{SM,ZA} == {0,0}.
*/
- if (system_supports_sme()) {
- vcpu_clear_flag(vcpu, HOST_SME_ENABLED);
- if (read_sysreg(cpacr_el1) & CPACR_EL1_SMEN_EL0EN)
- vcpu_set_flag(vcpu, HOST_SME_ENABLED);
+ fpsimd_save_and_flush_cpu_state();
+ *host_data_ptr(fp_owner) = FP_STATE_FREE;
- if (read_sysreg_s(SYS_SVCR) & (SVCR_SM_MASK | SVCR_ZA_MASK)) {
- vcpu->arch.fp_state = FP_STATE_FREE;
- fpsimd_save_and_flush_cpu_state();
- }
- }
+ WARN_ON_ONCE(system_supports_sme() && read_sysreg_s(SYS_SVCR));
}
/*
- * Called just before entering the guest once we are no longer preemptable
+ * Called just before entering the guest once we are no longer preemptible
* and interrupts are disabled. If we have managed to run anything using
* FP while we were preemptible (such as off the back of an interrupt),
* then neither the host nor the guest own the FP hardware (and it was the
@@ -118,7 +52,7 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu)
void kvm_arch_vcpu_ctxflush_fp(struct kvm_vcpu *vcpu)
{
if (test_thread_flag(TIF_FOREIGN_FPSTATE))
- vcpu->arch.fp_state = FP_STATE_FREE;
+ *host_data_ptr(fp_owner) = FP_STATE_FREE;
}
/*
@@ -134,8 +68,7 @@ void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu)
WARN_ON_ONCE(!irqs_disabled());
- if (vcpu->arch.fp_state == FP_STATE_GUEST_OWNED) {
-
+ if (guest_owns_fp_regs()) {
/*
* Currently we do not support SME guests so SVCR is
* always 0 and we just need a variable to point to.
@@ -143,8 +76,9 @@ void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu)
fp_state.st = &vcpu->arch.ctxt.fp_regs;
fp_state.sve_state = vcpu->arch.sve_state;
fp_state.sve_vl = vcpu->arch.sve_max_vl;
- fp_state.za_state = NULL;
- fp_state.svcr = &vcpu->arch.svcr;
+ fp_state.sme_state = NULL;
+ fp_state.svcr = __ctxt_sys_reg(&vcpu->arch.ctxt, SVCR);
+ fp_state.fpmr = __ctxt_sys_reg(&vcpu->arch.ctxt, FPMR);
fp_state.fp_type = &vcpu->arch.fp_type;
if (vcpu_has_sve(vcpu))
@@ -170,45 +104,18 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu)
local_irq_save(flags);
- /*
- * If we have VHE then the Hyp code will reset CPACR_EL1 to
- * CPACR_EL1_DEFAULT and we need to reenable SME.
- */
- if (has_vhe() && system_supports_sme()) {
- /* Also restore EL0 state seen on entry */
- if (vcpu_get_flag(vcpu, HOST_SME_ENABLED))
- sysreg_clear_set(CPACR_EL1, 0,
- CPACR_EL1_SMEN_EL0EN |
- CPACR_EL1_SMEN_EL1EN);
- else
- sysreg_clear_set(CPACR_EL1,
- CPACR_EL1_SMEN_EL0EN,
- CPACR_EL1_SMEN_EL1EN);
- }
-
- if (vcpu->arch.fp_state == FP_STATE_GUEST_OWNED) {
- if (vcpu_has_sve(vcpu)) {
- __vcpu_sys_reg(vcpu, ZCR_EL1) = read_sysreg_el1(SYS_ZCR);
-
- /* Restore the VL that was saved when bound to the CPU */
- if (!has_vhe())
- sve_cond_update_zcr_vq(vcpu_sve_max_vq(vcpu) - 1,
- SYS_ZCR_EL1);
- }
-
- fpsimd_save_and_flush_cpu_state();
- } else if (has_vhe() && system_supports_sve()) {
+ if (guest_owns_fp_regs()) {
/*
- * The FPSIMD/SVE state in the CPU has not been touched, and we
- * have SVE (and VHE): CPACR_EL1 (alias CPTR_EL2) has been
- * reset to CPACR_EL1_DEFAULT by the Hyp code, disabling SVE
- * for EL0. To avoid spurious traps, restore the trap state
- * seen by kvm_arch_vcpu_load_fp():
+ * Flush (save and invalidate) the fpsimd/sve state so that if
+ * the host tries to use fpsimd/sve, it's not using stale data
+ * from the guest.
+ *
+ * Flushing the state sets the TIF_FOREIGN_FPSTATE bit for the
+ * context unconditionally, in both nVHE and VHE. This allows
+ * the kernel to restore the fpsimd/sve state, including ZCR_EL1
+ * when needed.
*/
- if (vcpu_get_flag(vcpu, HOST_SVE_ENABLED))
- sysreg_clear_set(CPACR_EL1, 0, CPACR_EL1_ZEN_EL0EN);
- else
- sysreg_clear_set(CPACR_EL1, CPACR_EL1_ZEN_EL0EN, 0);
+ fpsimd_save_and_flush_cpu_state();
}
local_irq_restore(flags);
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index cf4c495a4321..1c87699fd886 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -24,6 +24,7 @@
#include <asm/fpsimd.h>
#include <asm/kvm.h>
#include <asm/kvm_emulate.h>
+#include <asm/kvm_nested.h>
#include <asm/sigcontext.h>
#include "trace.h"
@@ -250,9 +251,15 @@ static int set_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
case PSR_AA32_MODE_SVC:
case PSR_AA32_MODE_ABT:
case PSR_AA32_MODE_UND:
+ case PSR_AA32_MODE_SYS:
if (!vcpu_el1_is_32bit(vcpu))
return -EINVAL;
break;
+ case PSR_MODE_EL2h:
+ case PSR_MODE_EL2t:
+ if (!vcpu_has_nv(vcpu))
+ return -EINVAL;
+ fallthrough;
case PSR_MODE_EL0t:
case PSR_MODE_EL1t:
case PSR_MODE_EL1h:
@@ -270,7 +277,7 @@ static int set_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
if (*vcpu_cpsr(vcpu) & PSR_MODE32_BIT) {
int i, nr_reg;
- switch (*vcpu_cpsr(vcpu)) {
+ switch (*vcpu_cpsr(vcpu) & PSR_AA32_MODE_MASK) {
/*
* Either we are dealing with user mode, and only the
* first 15 registers (+ PC) must be narrowed to 32bit.
@@ -584,59 +591,6 @@ static unsigned long num_core_regs(const struct kvm_vcpu *vcpu)
return copy_core_reg_indices(vcpu, NULL);
}
-/**
- * ARM64 versions of the TIMER registers, always available on arm64
- */
-
-#define NUM_TIMER_REGS 3
-
-static bool is_timer_reg(u64 index)
-{
- switch (index) {
- case KVM_REG_ARM_TIMER_CTL:
- case KVM_REG_ARM_TIMER_CNT:
- case KVM_REG_ARM_TIMER_CVAL:
- return true;
- }
- return false;
-}
-
-static int copy_timer_indices(struct kvm_vcpu *vcpu, u64 __user *uindices)
-{
- if (put_user(KVM_REG_ARM_TIMER_CTL, uindices))
- return -EFAULT;
- uindices++;
- if (put_user(KVM_REG_ARM_TIMER_CNT, uindices))
- return -EFAULT;
- uindices++;
- if (put_user(KVM_REG_ARM_TIMER_CVAL, uindices))
- return -EFAULT;
-
- return 0;
-}
-
-static int set_timer_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
-{
- void __user *uaddr = (void __user *)(long)reg->addr;
- u64 val;
- int ret;
-
- ret = copy_from_user(&val, uaddr, KVM_REG_SIZE(reg->id));
- if (ret != 0)
- return -EFAULT;
-
- return kvm_arm_timer_set_reg(vcpu, reg->id, val);
-}
-
-static int get_timer_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
-{
- void __user *uaddr = (void __user *)(long)reg->addr;
- u64 val;
-
- val = kvm_arm_timer_get_reg(vcpu, reg->id);
- return copy_to_user(uaddr, &val, KVM_REG_SIZE(reg->id)) ? -EFAULT : 0;
-}
-
static unsigned long num_sve_regs(const struct kvm_vcpu *vcpu)
{
const unsigned int slices = vcpu_sve_slices(vcpu);
@@ -700,6 +654,7 @@ static int copy_sve_reg_indices(const struct kvm_vcpu *vcpu,
/**
* kvm_arm_num_regs - how many registers do we present via KVM_GET_ONE_REG
+ * @vcpu: the vCPU pointer
*
* This is for all registers.
*/
@@ -711,13 +666,14 @@ unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu)
res += num_sve_regs(vcpu);
res += kvm_arm_num_sys_reg_descs(vcpu);
res += kvm_arm_get_fw_num_regs(vcpu);
- res += NUM_TIMER_REGS;
return res;
}
/**
* kvm_arm_copy_reg_indices - get indices of all registers.
+ * @vcpu: the vCPU pointer
+ * @uindices: register list to copy
*
* We do core registers right here, then we append system regs.
*/
@@ -740,11 +696,6 @@ int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices)
return ret;
uindices += kvm_arm_get_fw_num_regs(vcpu);
- ret = copy_timer_indices(vcpu, uindices);
- if (ret < 0)
- return ret;
- uindices += NUM_TIMER_REGS;
-
return kvm_arm_copy_sys_reg_indices(vcpu, uindices);
}
@@ -762,9 +713,6 @@ int kvm_arm_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
case KVM_REG_ARM64_SVE: return get_sve_reg(vcpu, reg);
}
- if (is_timer_reg(reg->id))
- return get_timer_reg(vcpu, reg);
-
return kvm_arm_sys_reg_get_reg(vcpu, reg);
}
@@ -782,9 +730,6 @@ int kvm_arm_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
case KVM_REG_ARM64_SVE: return set_sve_reg(vcpu, reg);
}
- if (is_timer_reg(reg->id))
- return set_timer_reg(vcpu, reg);
-
return kvm_arm_sys_reg_set_reg(vcpu, reg);
}
@@ -803,8 +748,9 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
int __kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu,
struct kvm_vcpu_events *events)
{
- events->exception.serror_pending = !!(vcpu->arch.hcr_el2 & HCR_VSE);
- events->exception.serror_has_esr = cpus_have_const_cap(ARM64_HAS_RAS_EXTN);
+ events->exception.serror_has_esr = cpus_have_final_cap(ARM64_HAS_RAS_EXTN);
+ events->exception.serror_pending = (vcpu->arch.hcr_el2 & HCR_VSE) ||
+ vcpu_get_flag(vcpu, NESTED_SERROR_PENDING);
if (events->exception.serror_pending && events->exception.serror_has_esr)
events->exception.serror_esr = vcpu_get_vsesr(vcpu);
@@ -818,29 +764,62 @@ int __kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu,
return 0;
}
+static void commit_pending_events(struct kvm_vcpu *vcpu)
+{
+ if (!vcpu_get_flag(vcpu, PENDING_EXCEPTION))
+ return;
+
+ /*
+ * Reset the MMIO emulation state to avoid stepping PC after emulating
+ * the exception entry.
+ */
+ vcpu->mmio_needed = false;
+ kvm_call_hyp(__kvm_adjust_pc, vcpu);
+}
+
int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
struct kvm_vcpu_events *events)
{
bool serror_pending = events->exception.serror_pending;
bool has_esr = events->exception.serror_has_esr;
bool ext_dabt_pending = events->exception.ext_dabt_pending;
+ u64 esr = events->exception.serror_esr;
+ int ret = 0;
- if (serror_pending && has_esr) {
- if (!cpus_have_const_cap(ARM64_HAS_RAS_EXTN))
- return -EINVAL;
-
- if (!((events->exception.serror_esr) & ~ESR_ELx_ISS_MASK))
- kvm_set_sei_esr(vcpu, events->exception.serror_esr);
- else
- return -EINVAL;
- } else if (serror_pending) {
- kvm_inject_vabt(vcpu);
+ /*
+ * Immediately commit the pending SEA to the vCPU's architectural
+ * state which is necessary since we do not return a pending SEA
+ * to userspace via KVM_GET_VCPU_EVENTS.
+ */
+ if (ext_dabt_pending) {
+ ret = kvm_inject_sea_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
+ commit_pending_events(vcpu);
}
- if (ext_dabt_pending)
- kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
+ if (ret < 0)
+ return ret;
+
+ if (!serror_pending)
+ return 0;
- return 0;
+ if (!cpus_have_final_cap(ARM64_HAS_RAS_EXTN) && has_esr)
+ return -EINVAL;
+
+ if (has_esr && (esr & ~ESR_ELx_ISS_MASK))
+ return -EINVAL;
+
+ if (has_esr)
+ ret = kvm_inject_serror_esr(vcpu, esr);
+ else
+ ret = kvm_inject_serror(vcpu);
+
+ /*
+ * We could've decided that the SError is due for immediate software
+ * injection; commit the exception in case userspace decides it wants
+ * to inject more exceptions for some strange reason.
+ */
+ commit_pending_events(vcpu);
+ return (ret < 0) ? ret : 0;
}
u32 __attribute_const__ kvm_target_cpu(void)
@@ -863,7 +842,7 @@ u32 __attribute_const__ kvm_target_cpu(void)
break;
case ARM_CPU_IMP_APM:
switch (part_number) {
- case APM_CPU_PART_POTENZA:
+ case APM_CPU_PART_XGENE:
return KVM_ARM_TARGET_XGENE_POTENZA;
}
break;
@@ -873,21 +852,6 @@ u32 __attribute_const__ kvm_target_cpu(void)
return KVM_ARM_TARGET_GENERIC_V8;
}
-void kvm_vcpu_preferred_target(struct kvm_vcpu_init *init)
-{
- u32 target = kvm_target_cpu();
-
- memset(init, 0, sizeof(*init));
-
- /*
- * For now, we don't return any features.
- * In future, we might use features to return target
- * specific features available for the preferred
- * target type.
- */
- init->target = (__u32)target;
-}
-
int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
return -EINVAL;
@@ -906,8 +870,8 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
/**
* kvm_arch_vcpu_ioctl_set_guest_debug - set up guest debugging
- * @kvm: pointer to the KVM struct
- * @kvm_guest_debug: the ioctl data buffer
+ * @vcpu: the vCPU pointer
+ * @dbg: the ioctl data buffer
*
* This sets up and enables the VM for guest debugging. Userspace
* passes in a control flag to enable different debug types and
@@ -917,31 +881,24 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
struct kvm_guest_debug *dbg)
{
- int ret = 0;
-
trace_kvm_set_guest_debug(vcpu, dbg->control);
- if (dbg->control & ~KVM_GUESTDBG_VALID_MASK) {
- ret = -EINVAL;
- goto out;
- }
-
- if (dbg->control & KVM_GUESTDBG_ENABLE) {
- vcpu->guest_debug = dbg->control;
-
- /* Hardware assisted Break and Watch points */
- if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) {
- vcpu->arch.external_debug_state = dbg->arch;
- }
+ if (dbg->control & ~KVM_GUESTDBG_VALID_MASK)
+ return -EINVAL;
- } else {
- /* If not enabled clear all flags */
+ if (!(dbg->control & KVM_GUESTDBG_ENABLE)) {
vcpu->guest_debug = 0;
- vcpu_clear_flag(vcpu, DBG_SS_ACTIVE_PENDING);
+ vcpu_clear_flag(vcpu, HOST_SS_ACTIVE_PENDING);
+ return 0;
}
-out:
- return ret;
+ vcpu->guest_debug = dbg->control;
+
+ /* Hardware assisted Break and Watch points */
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW)
+ vcpu->arch.external_debug_state = dbg->arch;
+
+ return 0;
}
int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu,
@@ -951,7 +908,9 @@ int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu,
switch (attr->group) {
case KVM_ARM_VCPU_PMU_V3_CTRL:
+ mutex_lock(&vcpu->kvm->arch.config_lock);
ret = kvm_arm_pmu_v3_set_attr(vcpu, attr);
+ mutex_unlock(&vcpu->kvm->arch.config_lock);
break;
case KVM_ARM_VCPU_TIMER_CTRL:
ret = kvm_arm_timer_set_attr(vcpu, attr);
@@ -1013,8 +972,8 @@ int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu,
return ret;
}
-long kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm,
- struct kvm_arm_copy_mte_tags *copy_tags)
+int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm,
+ struct kvm_arm_copy_mte_tags *copy_tags)
{
gpa_t guest_ipa = copy_tags->guest_ipa;
size_t length = copy_tags->length;
@@ -1035,54 +994,72 @@ long kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm,
if (length & ~PAGE_MASK || guest_ipa & ~PAGE_MASK)
return -EINVAL;
+ /* Lengths above INT_MAX cannot be represented in the return value */
+ if (length > INT_MAX)
+ return -EINVAL;
+
gfn = gpa_to_gfn(guest_ipa);
mutex_lock(&kvm->slots_lock);
+ if (write && atomic_read(&kvm->nr_memslots_dirty_logging)) {
+ ret = -EBUSY;
+ goto out;
+ }
+
while (length > 0) {
- kvm_pfn_t pfn = gfn_to_pfn_prot(kvm, gfn, write, NULL);
+ struct page *page = __gfn_to_page(kvm, gfn, write);
void *maddr;
unsigned long num_tags;
- struct page *page;
+ struct folio *folio;
- if (is_error_noslot_pfn(pfn)) {
+ if (!page) {
ret = -EFAULT;
goto out;
}
- page = pfn_to_online_page(pfn);
- if (!page) {
+ if (!pfn_to_online_page(page_to_pfn(page))) {
/* Reject ZONE_DEVICE memory */
+ kvm_release_page_unused(page);
ret = -EFAULT;
goto out;
}
+ folio = page_folio(page);
maddr = page_address(page);
if (!write) {
- if (page_mte_tagged(page))
+ if ((folio_test_hugetlb(folio) &&
+ folio_test_hugetlb_mte_tagged(folio)) ||
+ page_mte_tagged(page))
num_tags = mte_copy_tags_to_user(tags, maddr,
MTE_GRANULES_PER_PAGE);
else
/* No tags in memory, so write zeros */
num_tags = MTE_GRANULES_PER_PAGE -
clear_user(tags, MTE_GRANULES_PER_PAGE);
- kvm_release_pfn_clean(pfn);
+ kvm_release_page_clean(page);
} else {
/*
* Only locking to serialise with a concurrent
- * set_pte_at() in the VMM but still overriding the
+ * __set_ptes() in the VMM but still overriding the
* tags, hence ignoring the return value.
*/
- try_page_mte_tagging(page);
+ if (folio_test_hugetlb(folio))
+ folio_try_hugetlb_mte_tagging(folio);
+ else
+ try_page_mte_tagging(page);
num_tags = mte_copy_tags_from_user(maddr, tags,
MTE_GRANULES_PER_PAGE);
/* uaccess failed, don't leave stale tags */
if (num_tags != MTE_GRANULES_PER_PAGE)
mte_clear_page_tags(maddr);
- set_page_mte_tagged(page);
+ if (folio_test_hugetlb(folio))
+ folio_set_hugetlb_mte_tagged(folio);
+ else
+ set_page_mte_tagged(page);
- kvm_release_pfn_dirty(pfn);
+ kvm_release_page_dirty(page);
}
if (num_tags != MTE_GRANULES_PER_PAGE) {
diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c
index e778eefcf214..cc7d5d1709cb 100644
--- a/arch/arm64/kvm/handle_exit.c
+++ b/arch/arm64/kvm/handle_exit.c
@@ -10,12 +10,14 @@
#include <linux/kvm.h>
#include <linux/kvm_host.h>
+#include <linux/ubsan.h>
#include <asm/esr.h>
#include <asm/exception.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_mmu.h>
+#include <asm/kvm_nested.h>
#include <asm/debug-monitors.h>
#include <asm/stacktrace/nvhe.h>
#include <asm/traps.h>
@@ -30,47 +32,82 @@ typedef int (*exit_handle_fn)(struct kvm_vcpu *);
static void kvm_handle_guest_serror(struct kvm_vcpu *vcpu, u64 esr)
{
if (!arm64_is_ras_serror(esr) || arm64_is_fatal_ras_serror(NULL, esr))
- kvm_inject_vabt(vcpu);
+ kvm_inject_serror(vcpu);
}
static int handle_hvc(struct kvm_vcpu *vcpu)
{
- int ret;
-
trace_kvm_hvc_arm64(*vcpu_pc(vcpu), vcpu_get_reg(vcpu, 0),
kvm_vcpu_hvc_get_imm(vcpu));
vcpu->stat.hvc_exit_stat++;
- ret = kvm_hvc_call_handler(vcpu);
- if (ret < 0) {
- vcpu_set_reg(vcpu, 0, ~0UL);
+ /* Forward hvc instructions to the virtual EL2 if the guest has EL2. */
+ if (vcpu_has_nv(vcpu)) {
+ if (vcpu_read_sys_reg(vcpu, HCR_EL2) & HCR_HCD)
+ kvm_inject_undefined(vcpu);
+ else
+ kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu));
+
return 1;
}
- return ret;
+ return kvm_smccc_call_handler(vcpu);
}
static int handle_smc(struct kvm_vcpu *vcpu)
{
/*
+ * Forward this trapped smc instruction to the virtual EL2 if
+ * the guest has asked for it.
+ */
+ if (forward_smc_trap(vcpu))
+ return 1;
+
+ /*
* "If an SMC instruction executed at Non-secure EL1 is
* trapped to EL2 because HCR_EL2.TSC is 1, the exception is a
* Trap exception, not a Secure Monitor Call exception [...]"
*
* We need to advance the PC after the trap, as it would
- * otherwise return to the same address...
+ * otherwise return to the same address. Furthermore, pre-incrementing
+ * the PC before potentially exiting to userspace maintains the same
+ * abstraction for both SMCs and HVCs.
*/
- vcpu_set_reg(vcpu, 0, ~0UL);
kvm_incr_pc(vcpu);
- return 1;
+
+ /*
+ * SMCs with a nonzero immediate are reserved according to DEN0028E 2.9
+ * "SMC and HVC immediate value".
+ */
+ if (kvm_vcpu_hvc_get_imm(vcpu)) {
+ vcpu_set_reg(vcpu, 0, ~0UL);
+ return 1;
+ }
+
+ /*
+ * If imm is zero then it is likely an SMCCC call.
+ *
+ * Note that on ARMv8.3, even if EL3 is not implemented, SMC executed
+ * at Non-secure EL1 is trapped to EL2 if HCR_EL2.TSC==1, rather than
+ * being treated as UNDEFINED.
+ */
+ return kvm_smccc_call_handler(vcpu);
}
/*
- * Guest access to FP/ASIMD registers are routed to this handler only
- * when the system doesn't support FP/ASIMD.
+ * This handles the cases where the system does not support FP/ASIMD or when
+ * we are running nested virtualization and the guest hypervisor is trapping
+ * FP/ASIMD accesses by its guest guest.
+ *
+ * All other handling of guest vs. host FP/ASIMD register state is handled in
+ * fixup_guest_exit().
*/
-static int handle_no_fpsimd(struct kvm_vcpu *vcpu)
+static int kvm_handle_fpasimd(struct kvm_vcpu *vcpu)
{
+ if (guest_hyp_fpsimd_traps_enabled(vcpu))
+ return kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu));
+
+ /* This is the case when the system doesn't support FP/ASIMD. */
kvm_inject_undefined(vcpu);
return 1;
}
@@ -93,8 +130,12 @@ static int handle_no_fpsimd(struct kvm_vcpu *vcpu)
static int kvm_handle_wfx(struct kvm_vcpu *vcpu)
{
u64 esr = kvm_vcpu_get_esr(vcpu);
+ bool is_wfe = !!(esr & ESR_ELx_WFx_ISS_WFE);
- if (esr & ESR_ELx_WFx_ISS_WFE) {
+ if (guest_hyp_wfx_traps_enabled(vcpu))
+ return kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu));
+
+ if (is_wfe) {
trace_kvm_wfx_arm64(*vcpu_pc(vcpu), true);
vcpu->stat.wfe_exit_stat++;
} else {
@@ -106,7 +147,12 @@ static int kvm_handle_wfx(struct kvm_vcpu *vcpu)
if (esr & ESR_ELx_WFx_ISS_RV) {
u64 val, now;
- now = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_TIMER_CNT);
+ now = kvm_phys_timer_read();
+ if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu))
+ now -= timer_get_offset(vcpu_hvtimer(vcpu));
+ else
+ now -= timer_get_offset(vcpu_vtimer(vcpu));
+
val = vcpu_get_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu));
if (now >= val)
@@ -147,6 +193,9 @@ static int kvm_handle_guest_debug(struct kvm_vcpu *vcpu)
struct kvm_run *run = vcpu->run;
u64 esr = kvm_vcpu_get_esr(vcpu);
+ if (!vcpu->guest_debug && forward_debug_exception(vcpu))
+ return 1;
+
run->exit_reason = KVM_EXIT_DEBUG;
run->debug.arch.hsr = lower_32_bits(esr);
run->debug.arch.hsr_high = upper_32_bits(esr);
@@ -157,7 +206,7 @@ static int kvm_handle_guest_debug(struct kvm_vcpu *vcpu)
run->debug.arch.far = vcpu->arch.fault.far_el2;
break;
case ESR_ELx_EC_SOFTSTP_LOW:
- vcpu_clear_flag(vcpu, DBG_SS_ACTIVE_PENDING);
+ *vcpu_cpsr(vcpu) |= DBG_SPSR_SS;
break;
}
@@ -181,21 +230,149 @@ static int kvm_handle_unknown_ec(struct kvm_vcpu *vcpu)
*/
static int handle_sve(struct kvm_vcpu *vcpu)
{
+ if (guest_hyp_sve_traps_enabled(vcpu))
+ return kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu));
+
kvm_inject_undefined(vcpu);
return 1;
}
/*
- * Guest usage of a ptrauth instruction (which the guest EL1 did not turn into
- * a NOP). If we get here, it is that we didn't fixup ptrauth on exit, and all
- * that we can do is give the guest an UNDEF.
+ * Two possibilities to handle a trapping ptrauth instruction:
+ *
+ * - Guest usage of a ptrauth instruction (which the guest EL1 did not
+ * turn into a NOP). If we get here, it is because we didn't enable
+ * ptrauth for the guest. This results in an UNDEF, as it isn't
+ * supposed to use ptrauth without being told it could.
+ *
+ * - Running an L2 NV guest while L1 has left HCR_EL2.API==0, and for
+ * which we reinject the exception into L1.
+ *
+ * Anything else is an emulation bug (hence the WARN_ON + UNDEF).
*/
static int kvm_handle_ptrauth(struct kvm_vcpu *vcpu)
{
+ if (!vcpu_has_ptrauth(vcpu)) {
+ kvm_inject_undefined(vcpu);
+ return 1;
+ }
+
+ if (is_nested_ctxt(vcpu)) {
+ kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu));
+ return 1;
+ }
+
+ /* Really shouldn't be here! */
+ WARN_ON_ONCE(1);
+ kvm_inject_undefined(vcpu);
+ return 1;
+}
+
+static int kvm_handle_eret(struct kvm_vcpu *vcpu)
+{
+ if (esr_iss_is_eretax(kvm_vcpu_get_esr(vcpu)) &&
+ !vcpu_has_ptrauth(vcpu))
+ return kvm_handle_ptrauth(vcpu);
+
+ /*
+ * If we got here, two possibilities:
+ *
+ * - the guest is in EL2, and we need to fully emulate ERET
+ *
+ * - the guest is in EL1, and we need to reinject the
+ * exception into the L1 hypervisor.
+ *
+ * If KVM ever traps ERET for its own use, we'll have to
+ * revisit this.
+ */
+ if (is_hyp_ctxt(vcpu))
+ kvm_emulate_nested_eret(vcpu);
+ else
+ kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu));
+
+ return 1;
+}
+
+static int handle_svc(struct kvm_vcpu *vcpu)
+{
+ /*
+ * So far, SVC traps only for NV via HFGITR_EL2. A SVC from a
+ * 32bit guest would be caught by vpcu_mode_is_bad_32bit(), so
+ * we should only have to deal with a 64 bit exception.
+ */
+ kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu));
+ return 1;
+}
+
+static int kvm_handle_gcs(struct kvm_vcpu *vcpu)
+{
+ /* We don't expect GCS, so treat it with contempt */
+ if (kvm_has_feat(vcpu->kvm, ID_AA64PFR1_EL1, GCS, IMP))
+ WARN_ON_ONCE(1);
+
kvm_inject_undefined(vcpu);
return 1;
}
+static int handle_other(struct kvm_vcpu *vcpu)
+{
+ bool allowed, fwd = is_nested_ctxt(vcpu);
+ u64 hcrx = __vcpu_sys_reg(vcpu, HCRX_EL2);
+ u64 esr = kvm_vcpu_get_esr(vcpu);
+ u64 iss = ESR_ELx_ISS(esr);
+ struct kvm *kvm = vcpu->kvm;
+
+ /*
+ * We only trap for two reasons:
+ *
+ * - the feature is disabled, and the only outcome is to
+ * generate an UNDEF.
+ *
+ * - the feature is enabled, but a NV guest wants to trap the
+ * feature used by its L2 guest. We forward the exception in
+ * this case.
+ *
+ * What we don't expect is to end-up here if the guest is
+ * expected be be able to directly use the feature, hence the
+ * WARN_ON below.
+ */
+ switch (iss) {
+ case ESR_ELx_ISS_OTHER_ST64BV:
+ allowed = kvm_has_feat(kvm, ID_AA64ISAR1_EL1, LS64, LS64_V);
+ fwd &= !(hcrx & HCRX_EL2_EnASR);
+ break;
+ case ESR_ELx_ISS_OTHER_ST64BV0:
+ allowed = kvm_has_feat(kvm, ID_AA64ISAR1_EL1, LS64, LS64_ACCDATA);
+ fwd &= !(hcrx & HCRX_EL2_EnAS0);
+ break;
+ case ESR_ELx_ISS_OTHER_LDST64B:
+ allowed = kvm_has_feat(kvm, ID_AA64ISAR1_EL1, LS64, LS64);
+ fwd &= !(hcrx & HCRX_EL2_EnALS);
+ break;
+ case ESR_ELx_ISS_OTHER_TSBCSYNC:
+ allowed = kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceBuffer, TRBE_V1P1);
+ fwd &= (__vcpu_sys_reg(vcpu, HFGITR2_EL2) & HFGITR2_EL2_TSBCSYNC);
+ break;
+ case ESR_ELx_ISS_OTHER_PSBCSYNC:
+ allowed = kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMSVer, V1P5);
+ fwd &= (__vcpu_sys_reg(vcpu, HFGITR_EL2) & HFGITR_EL2_PSBCSYNC);
+ break;
+ default:
+ /* Clearly, we're missing something. */
+ WARN_ON_ONCE(1);
+ allowed = false;
+ }
+
+ WARN_ON_ONCE(allowed && !fwd);
+
+ if (allowed && fwd)
+ kvm_inject_nested_sync(vcpu, esr);
+ else
+ kvm_inject_undefined(vcpu);
+
+ return 1;
+}
+
static exit_handle_fn arm_exit_handlers[] = {
[0 ... ESR_ELx_EC_MAX] = kvm_handle_unknown_ec,
[ESR_ELx_EC_WFx] = kvm_handle_wfx,
@@ -205,21 +382,26 @@ static exit_handle_fn arm_exit_handlers[] = {
[ESR_ELx_EC_CP14_LS] = kvm_handle_cp14_load_store,
[ESR_ELx_EC_CP10_ID] = kvm_handle_cp10_id,
[ESR_ELx_EC_CP14_64] = kvm_handle_cp14_64,
+ [ESR_ELx_EC_OTHER] = handle_other,
[ESR_ELx_EC_HVC32] = handle_hvc,
[ESR_ELx_EC_SMC32] = handle_smc,
[ESR_ELx_EC_HVC64] = handle_hvc,
[ESR_ELx_EC_SMC64] = handle_smc,
+ [ESR_ELx_EC_SVC64] = handle_svc,
[ESR_ELx_EC_SYS64] = kvm_handle_sys_reg,
[ESR_ELx_EC_SVE] = handle_sve,
+ [ESR_ELx_EC_ERET] = kvm_handle_eret,
[ESR_ELx_EC_IABT_LOW] = kvm_handle_guest_abort,
[ESR_ELx_EC_DABT_LOW] = kvm_handle_guest_abort,
+ [ESR_ELx_EC_DABT_CUR] = kvm_handle_vncr_abort,
[ESR_ELx_EC_SOFTSTP_LOW]= kvm_handle_guest_debug,
[ESR_ELx_EC_WATCHPT_LOW]= kvm_handle_guest_debug,
[ESR_ELx_EC_BREAKPT_LOW]= kvm_handle_guest_debug,
[ESR_ELx_EC_BKPT32] = kvm_handle_guest_debug,
[ESR_ELx_EC_BRK64] = kvm_handle_guest_debug,
- [ESR_ELx_EC_FP_ASIMD] = handle_no_fpsimd,
+ [ESR_ELx_EC_FP_ASIMD] = kvm_handle_fpasimd,
[ESR_ELx_EC_PAC] = kvm_handle_ptrauth,
+ [ESR_ELx_EC_GCS] = kvm_handle_gcs,
};
static exit_handle_fn kvm_get_exit_handler(struct kvm_vcpu *vcpu)
@@ -313,7 +495,7 @@ void handle_exit_early(struct kvm_vcpu *vcpu, int exception_index)
kvm_handle_guest_serror(vcpu, disr_to_esr(disr));
} else {
- kvm_inject_vabt(vcpu);
+ kvm_inject_serror(vcpu);
}
return;
@@ -325,6 +507,20 @@ void handle_exit_early(struct kvm_vcpu *vcpu, int exception_index)
kvm_handle_guest_serror(vcpu, kvm_vcpu_get_esr(vcpu));
}
+static void print_nvhe_hyp_panic(const char *name, u64 panic_addr)
+{
+ kvm_err("nVHE hyp %s at: [<%016llx>] %pB!\n", name, panic_addr,
+ (void *)(panic_addr + kaslr_offset()));
+}
+
+static void kvm_nvhe_report_cfi_failure(u64 panic_addr)
+{
+ print_nvhe_hyp_panic("CFI failure", panic_addr);
+
+ if (IS_ENABLED(CONFIG_CFI_PERMISSIVE))
+ kvm_err(" (CONFIG_CFI_PERMISSIVE ignored for hyp failures)\n");
+}
+
void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr,
u64 elr_virt, u64 elr_phys,
u64 par, uintptr_t vcpu,
@@ -337,7 +533,7 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr,
if (mode != PSR_MODE_EL2t && mode != PSR_MODE_EL2h) {
kvm_err("Invalid host exception to nVHE hyp!\n");
} else if (ESR_ELx_EC(esr) == ESR_ELx_EC_BRK64 &&
- (esr & ESR_ELx_BRK64_ISS_COMMENT_MASK) == BUG_BRK_IMM) {
+ esr_brk_comment(esr) == BUG_BRK_IMM) {
const char *file = NULL;
unsigned int line = 0;
@@ -353,16 +549,24 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr,
if (file)
kvm_err("nVHE hyp BUG at: %s:%u!\n", file, line);
else
- kvm_err("nVHE hyp BUG at: [<%016llx>] %pB!\n", panic_addr,
- (void *)(panic_addr + kaslr_offset()));
+ print_nvhe_hyp_panic("BUG", panic_addr);
+ } else if (IS_ENABLED(CONFIG_CFI) && esr_is_cfi_brk(esr)) {
+ kvm_nvhe_report_cfi_failure(panic_addr);
+ } else if (IS_ENABLED(CONFIG_UBSAN_KVM_EL2) &&
+ ESR_ELx_EC(esr) == ESR_ELx_EC_BRK64 &&
+ esr_is_ubsan_brk(esr)) {
+ print_nvhe_hyp_panic(report_ubsan_failure(esr & UBSAN_BRK_MASK),
+ panic_addr);
} else {
- kvm_err("nVHE hyp panic at: [<%016llx>] %pB!\n", panic_addr,
- (void *)(panic_addr + kaslr_offset()));
+ print_nvhe_hyp_panic("panic", panic_addr);
}
/* Dump the nVHE hypervisor backtrace */
kvm_nvhe_dump_backtrace(hyp_offset);
+ /* Dump the faulting instruction */
+ dump_kernel_instr(panic_addr + kaslr_offset());
+
/*
* Hyp has panicked and we're going to handle that by panicking the
* kernel. The kernel offset will be revealed in the panic so we're
diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile
index a38dea6186c9..d61e44642f98 100644
--- a/arch/arm64/kvm/hyp/Makefile
+++ b/arch/arm64/kvm/hyp/Makefile
@@ -3,7 +3,7 @@
# Makefile for Kernel-based Virtual Machine module, HYP part
#
-incdir := $(srctree)/$(src)/include
+incdir := $(src)/include
subdir-asflags-y := -I$(incdir)
subdir-ccflags-y := -I$(incdir)
diff --git a/arch/arm64/kvm/hyp/aarch32.c b/arch/arm64/kvm/hyp/aarch32.c
index f98cbe2626a1..449fa58cf3b6 100644
--- a/arch/arm64/kvm/hyp/aarch32.c
+++ b/arch/arm64/kvm/hyp/aarch32.c
@@ -50,9 +50,23 @@ bool kvm_condition_valid32(const struct kvm_vcpu *vcpu)
u32 cpsr_cond;
int cond;
- /* Top two bits non-zero? Unconditional. */
- if (kvm_vcpu_get_esr(vcpu) >> 30)
+ /*
+ * These are the exception classes that could fire with a
+ * conditional instruction.
+ */
+ switch (kvm_vcpu_trap_get_class(vcpu)) {
+ case ESR_ELx_EC_CP15_32:
+ case ESR_ELx_EC_CP15_64:
+ case ESR_ELx_EC_CP14_MR:
+ case ESR_ELx_EC_CP14_LS:
+ case ESR_ELx_EC_FP_ASIMD:
+ case ESR_ELx_EC_CP10_ID:
+ case ESR_ELx_EC_CP14_64:
+ case ESR_ELx_EC_SVC32:
+ break;
+ default:
return true;
+ }
/* Is condition field valid? */
cond = kvm_vcpu_get_condition(vcpu);
@@ -84,7 +98,7 @@ bool kvm_condition_valid32(const struct kvm_vcpu *vcpu)
}
/**
- * adjust_itstate - adjust ITSTATE when emulating instructions in IT-block
+ * kvm_adjust_itstate - adjust ITSTATE when emulating instructions in IT-block
* @vcpu: The VCPU pointer
*
* When exceptions occur while instructions are executed in Thumb IF-THEN
@@ -120,7 +134,7 @@ static void kvm_adjust_itstate(struct kvm_vcpu *vcpu)
}
/**
- * kvm_skip_instr - skip a trapped instruction and proceed to the next
+ * kvm_skip_instr32 - skip a trapped instruction and proceed to the next
* @vcpu: The vcpu pointer
*/
void kvm_skip_instr32(struct kvm_vcpu *vcpu)
diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S
index 435346ea1504..9f4e8d68ab50 100644
--- a/arch/arm64/kvm/hyp/entry.S
+++ b/arch/arm64/kvm/hyp/entry.S
@@ -44,6 +44,11 @@ alternative_if ARM64_HAS_RAS_EXTN
alternative_else_nop_endif
mrs x1, isr_el1
cbz x1, 1f
+
+ // Ensure that __guest_enter() always provides a context
+ // synchronization event so that callers don't need ISBs for anything
+ // that would usually be synchonized by the ERET.
+ isb
mov x0, #ARM_EXCEPTION_IRQ
ret
@@ -83,6 +88,14 @@ alternative_else_nop_endif
eret
sb
+SYM_INNER_LABEL(__guest_exit_restore_elr_and_panic, SYM_L_GLOBAL)
+ // x2-x29,lr: vcpu regs
+ // vcpu x0-x1 on the stack
+
+ adr_this_cpu x0, kvm_hyp_ctxt, x1
+ ldr x0, [x0, #CPU_ELR_EL2]
+ msr elr_el2, x0
+
SYM_INNER_LABEL(__guest_exit_panic, SYM_L_GLOBAL)
// x2-x29,lr: vcpu regs
// vcpu x0-x1 on the stack
@@ -171,7 +184,7 @@ alternative_else
dsb sy // Synchronize against in-flight ld/st
isb // Prevent an early read of side-effect free ISR
mrs x2, isr_el1
- tbnz x2, #8, 2f // ISR_EL1.A
+ tbnz x2, #ISR_EL1_A_SHIFT, 2f
ret
nop
2:
diff --git a/arch/arm64/kvm/hyp/exception.c b/arch/arm64/kvm/hyp/exception.c
index 791d3de76771..bef40ddb16db 100644
--- a/arch/arm64/kvm/hyp/exception.c
+++ b/arch/arm64/kvm/hyp/exception.c
@@ -14,6 +14,7 @@
#include <linux/kvm_host.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_mmu.h>
+#include <asm/kvm_nested.h>
#if !defined (__KVM_NVHE_HYPERVISOR__) && !defined (__KVM_VHE_HYPERVISOR__)
#error Hypervisor code only!
@@ -21,33 +22,36 @@
static inline u64 __vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg)
{
- u64 val;
-
- if (__vcpu_read_sys_reg_from_cpu(reg, &val))
- return val;
+ if (has_vhe())
+ return vcpu_read_sys_reg(vcpu, reg);
return __vcpu_sys_reg(vcpu, reg);
}
static inline void __vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg)
{
- if (__vcpu_write_sys_reg_to_cpu(val, reg))
- return;
-
- __vcpu_sys_reg(vcpu, reg) = val;
+ if (has_vhe())
+ vcpu_write_sys_reg(vcpu, val, reg);
+ else
+ __vcpu_assign_sys_reg(vcpu, reg, val);
}
-static void __vcpu_write_spsr(struct kvm_vcpu *vcpu, u64 val)
+static void __vcpu_write_spsr(struct kvm_vcpu *vcpu, unsigned long target_mode,
+ u64 val)
{
- if (has_vhe())
- write_sysreg_el1(val, SYS_SPSR);
- else
- __vcpu_sys_reg(vcpu, SPSR_EL1) = val;
+ if (has_vhe()) {
+ if (target_mode == PSR_MODE_EL1h)
+ vcpu_write_sys_reg(vcpu, val, SPSR_EL1);
+ else
+ vcpu_write_sys_reg(vcpu, val, SPSR_EL2);
+ } else {
+ __vcpu_assign_sys_reg(vcpu, SPSR_EL1, val);
+ }
}
static void __vcpu_write_spsr_abt(struct kvm_vcpu *vcpu, u64 val)
{
- if (has_vhe())
+ if (has_vhe() && vcpu_get_flag(vcpu, SYSREGS_ON_CPU))
write_sysreg(val, spsr_abt);
else
vcpu->arch.ctxt.spsr_abt = val;
@@ -55,7 +59,7 @@ static void __vcpu_write_spsr_abt(struct kvm_vcpu *vcpu, u64 val)
static void __vcpu_write_spsr_und(struct kvm_vcpu *vcpu, u64 val)
{
- if (has_vhe())
+ if (has_vhe() && vcpu_get_flag(vcpu, SYSREGS_ON_CPU))
write_sysreg(val, spsr_und);
else
vcpu->arch.ctxt.spsr_und = val;
@@ -101,6 +105,11 @@ static void enter_exception64(struct kvm_vcpu *vcpu, unsigned long target_mode,
sctlr = __vcpu_read_sys_reg(vcpu, SCTLR_EL1);
__vcpu_write_sys_reg(vcpu, *vcpu_pc(vcpu), ELR_EL1);
break;
+ case PSR_MODE_EL2h:
+ vbar = __vcpu_read_sys_reg(vcpu, VBAR_EL2);
+ sctlr = __vcpu_read_sys_reg(vcpu, SCTLR_EL2);
+ __vcpu_write_sys_reg(vcpu, *vcpu_pc(vcpu), ELR_EL2);
+ break;
default:
/* Don't do that */
BUG();
@@ -153,7 +162,7 @@ static void enter_exception64(struct kvm_vcpu *vcpu, unsigned long target_mode,
new |= target_mode;
*vcpu_cpsr(vcpu) = new;
- __vcpu_write_spsr(vcpu, old);
+ __vcpu_write_spsr(vcpu, target_mode, old);
}
/*
@@ -323,11 +332,28 @@ static void kvm_inject_exception(struct kvm_vcpu *vcpu)
case unpack_vcpu_flag(EXCEPT_AA64_EL1_SYNC):
enter_exception64(vcpu, PSR_MODE_EL1h, except_type_sync);
break;
+
+ case unpack_vcpu_flag(EXCEPT_AA64_EL1_SERR):
+ enter_exception64(vcpu, PSR_MODE_EL1h, except_type_serror);
+ break;
+
+ case unpack_vcpu_flag(EXCEPT_AA64_EL2_SYNC):
+ enter_exception64(vcpu, PSR_MODE_EL2h, except_type_sync);
+ break;
+
+ case unpack_vcpu_flag(EXCEPT_AA64_EL2_IRQ):
+ enter_exception64(vcpu, PSR_MODE_EL2h, except_type_irq);
+ break;
+
+ case unpack_vcpu_flag(EXCEPT_AA64_EL2_SERR):
+ enter_exception64(vcpu, PSR_MODE_EL2h, except_type_serror);
+ break;
+
default:
/*
- * Only EL1_SYNC makes sense so far, EL2_{SYNC,IRQ}
- * will be implemented at some point. Everything
- * else gets silently ignored.
+ * Only EL1_{SYNC,SERR} and EL2_{SYNC,IRQ,SERR} makes
+ * sense so far. Everything else gets silently
+ * ignored.
*/
break;
}
diff --git a/arch/arm64/kvm/hyp/fpsimd.S b/arch/arm64/kvm/hyp/fpsimd.S
index 61e6f3ba7b7d..e950875e31ce 100644
--- a/arch/arm64/kvm/hyp/fpsimd.S
+++ b/arch/arm64/kvm/hyp/fpsimd.S
@@ -25,3 +25,9 @@ SYM_FUNC_START(__sve_restore_state)
sve_load 0, x1, x2, 3
ret
SYM_FUNC_END(__sve_restore_state)
+
+SYM_FUNC_START(__sve_save_state)
+ mov x2, #1
+ sve_save 0, x1, x2, 3
+ ret
+SYM_FUNC_END(__sve_save_state)
diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S
index 8f3f93fa119e..03f97d71984c 100644
--- a/arch/arm64/kvm/hyp/hyp-entry.S
+++ b/arch/arm64/kvm/hyp/hyp-entry.S
@@ -154,6 +154,12 @@ SYM_CODE_END(\label)
esb
stp x0, x1, [sp, #-16]!
662:
+ /*
+ * spectre vectors __bp_harden_hyp_vecs generate br instructions at runtime
+ * that jump at offset 8 at __kvm_hyp_vector.
+ * As hyp .text is guarded section, it needs bti j.
+ */
+ bti j
b \target
check_preamble_length 661b, 662b
@@ -165,6 +171,8 @@ check_preamble_length 661b, 662b
nop
stp x0, x1, [sp, #-16]!
662:
+ /* Check valid_vect */
+ bti j
b \target
check_preamble_length 661b, 662b
diff --git a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h
index 961bbef104a6..502a5b73ee70 100644
--- a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h
+++ b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h
@@ -88,15 +88,26 @@
default: write_debug(ptr[0], reg, 0); \
}
+static struct kvm_guest_debug_arch *__vcpu_debug_regs(struct kvm_vcpu *vcpu)
+{
+ switch (vcpu->arch.debug_owner) {
+ case VCPU_DEBUG_FREE:
+ WARN_ON_ONCE(1);
+ fallthrough;
+ case VCPU_DEBUG_GUEST_OWNED:
+ return &vcpu->arch.vcpu_debug_state;
+ case VCPU_DEBUG_HOST_OWNED:
+ return &vcpu->arch.external_debug_state;
+ }
+
+ return NULL;
+}
+
static void __debug_save_state(struct kvm_guest_debug_arch *dbg,
struct kvm_cpu_context *ctxt)
{
- u64 aa64dfr0;
- int brps, wrps;
-
- aa64dfr0 = read_sysreg(id_aa64dfr0_el1);
- brps = (aa64dfr0 >> 12) & 0xf;
- wrps = (aa64dfr0 >> 20) & 0xf;
+ int brps = *host_data_ptr(debug_brps);
+ int wrps = *host_data_ptr(debug_wrps);
save_debug(dbg->dbg_bcr, dbgbcr, brps);
save_debug(dbg->dbg_bvr, dbgbvr, brps);
@@ -109,13 +120,8 @@ static void __debug_save_state(struct kvm_guest_debug_arch *dbg,
static void __debug_restore_state(struct kvm_guest_debug_arch *dbg,
struct kvm_cpu_context *ctxt)
{
- u64 aa64dfr0;
- int brps, wrps;
-
- aa64dfr0 = read_sysreg(id_aa64dfr0_el1);
-
- brps = (aa64dfr0 >> 12) & 0xf;
- wrps = (aa64dfr0 >> 20) & 0xf;
+ int brps = *host_data_ptr(debug_brps);
+ int wrps = *host_data_ptr(debug_wrps);
restore_debug(dbg->dbg_bcr, dbgbcr, brps);
restore_debug(dbg->dbg_bvr, dbgbvr, brps);
@@ -132,13 +138,13 @@ static inline void __debug_switch_to_guest_common(struct kvm_vcpu *vcpu)
struct kvm_guest_debug_arch *host_dbg;
struct kvm_guest_debug_arch *guest_dbg;
- if (!vcpu_get_flag(vcpu, DEBUG_DIRTY))
+ if (!kvm_debug_regs_in_use(vcpu))
return;
- host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
+ host_ctxt = host_data_ptr(host_ctxt);
guest_ctxt = &vcpu->arch.ctxt;
- host_dbg = &vcpu->arch.host_debug_state.regs;
- guest_dbg = kern_hyp_va(vcpu->arch.debug_ptr);
+ host_dbg = host_data_ptr(host_debug_state.regs);
+ guest_dbg = __vcpu_debug_regs(vcpu);
__debug_save_state(host_dbg, host_ctxt);
__debug_restore_state(guest_dbg, guest_ctxt);
@@ -151,18 +157,16 @@ static inline void __debug_switch_to_host_common(struct kvm_vcpu *vcpu)
struct kvm_guest_debug_arch *host_dbg;
struct kvm_guest_debug_arch *guest_dbg;
- if (!vcpu_get_flag(vcpu, DEBUG_DIRTY))
+ if (!kvm_debug_regs_in_use(vcpu))
return;
- host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
+ host_ctxt = host_data_ptr(host_ctxt);
guest_ctxt = &vcpu->arch.ctxt;
- host_dbg = &vcpu->arch.host_debug_state.regs;
- guest_dbg = kern_hyp_va(vcpu->arch.debug_ptr);
+ host_dbg = host_data_ptr(host_debug_state.regs);
+ guest_dbg = __vcpu_debug_regs(vcpu);
__debug_save_state(guest_dbg, guest_ctxt);
__debug_restore_state(host_dbg, host_ctxt);
-
- vcpu_clear_flag(vcpu, DEBUG_DIRTY);
}
#endif /* __ARM64_KVM_HYP_DEBUG_SR_H__ */
diff --git a/arch/arm64/kvm/hyp/include/hyp/fault.h b/arch/arm64/kvm/hyp/include/hyp/fault.h
index 9ddcfe2c3e57..fc573fc767b0 100644
--- a/arch/arm64/kvm/hyp/include/hyp/fault.h
+++ b/arch/arm64/kvm/hyp/include/hyp/fault.h
@@ -12,8 +12,19 @@
#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>
+static inline bool __fault_safe_to_translate(u64 esr)
+{
+ u64 fsc = esr & ESR_ELx_FSC;
+
+ if (esr_fsc_is_sea_ttw(esr) || esr_fsc_is_secc_ttw(esr))
+ return false;
+
+ return !(fsc == ESR_ELx_FSC_EXTABT && (esr & ESR_ELx_FnV));
+}
+
static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar)
{
+ int ret;
u64 par, tmp;
/*
@@ -27,7 +38,9 @@ static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar)
* saved the guest context yet, and we may return early...
*/
par = read_sysreg_par();
- if (!__kvm_at("s1e1r", far))
+ ret = system_supports_poe() ? __kvm_at(OP_AT_S1E1A, far) :
+ __kvm_at(OP_AT_S1E1R, far);
+ if (!ret)
tmp = read_sysreg_par();
else
tmp = SYS_PAR_EL1_F; /* back to the guest */
@@ -41,34 +54,50 @@ static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar)
return true;
}
-static inline bool __get_fault_info(u64 esr, struct kvm_vcpu_fault_info *fault)
+/*
+ * Checks for the conditions when HPFAR_EL2 is written, per ARM ARM R_FKLWR.
+ */
+static inline bool __hpfar_valid(u64 esr)
{
- u64 hpfar, far;
-
- far = read_sysreg_el2(SYS_FAR);
-
/*
- * The HPFAR can be invalid if the stage 2 fault did not
- * happen during a stage 1 page table walk (the ESR_EL2.S1PTW
- * bit is clear) and one of the two following cases are true:
- * 1. The fault was due to a permission fault
- * 2. The processor carries errata 834220
+ * CPUs affected by ARM erratum #834220 may incorrectly report a
+ * stage-2 translation fault when a stage-1 permission fault occurs.
*
- * Therefore, for all non S1PTW faults where we either have a
- * permission fault or the errata workaround is enabled, we
- * resolve the IPA using the AT instruction.
+ * Re-walk the page tables to determine if a stage-1 fault actually
+ * occurred.
*/
- if (!(esr & ESR_ELx_S1PTW) &&
- (cpus_have_final_cap(ARM64_WORKAROUND_834220) ||
- (esr & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_PERM)) {
- if (!__translate_far_to_hpfar(far, &hpfar))
- return false;
- } else {
+ if (cpus_have_final_cap(ARM64_WORKAROUND_834220) &&
+ esr_fsc_is_translation_fault(esr))
+ return false;
+
+ if (esr_fsc_is_translation_fault(esr) || esr_fsc_is_access_flag_fault(esr))
+ return true;
+
+ if ((esr & ESR_ELx_S1PTW) && esr_fsc_is_permission_fault(esr))
+ return true;
+
+ return esr_fsc_is_addr_sz_fault(esr);
+}
+
+static inline bool __get_fault_info(u64 esr, struct kvm_vcpu_fault_info *fault)
+{
+ u64 hpfar;
+
+ fault->far_el2 = read_sysreg_el2(SYS_FAR);
+ fault->hpfar_el2 = 0;
+
+ if (__hpfar_valid(esr))
hpfar = read_sysreg(hpfar_el2);
- }
+ else if (unlikely(!__fault_safe_to_translate(esr)))
+ return true;
+ else if (!__translate_far_to_hpfar(fault->far_el2, &hpfar))
+ return false;
- fault->far_el2 = far;
- fault->hpfar_el2 = hpfar;
+ /*
+ * Hijack HPFAR_EL2.NS (RES0 in Non-secure) to indicate a valid
+ * HPFAR value.
+ */
+ fault->hpfar_el2 = hpfar | HPFAR_EL2_NS;
return true;
}
diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
index 07d37ff88a3f..c5d5e5b86eaf 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -26,9 +26,11 @@
#include <asm/kvm_emulate.h>
#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>
+#include <asm/kvm_nested.h>
#include <asm/fpsimd.h>
#include <asm/debug-monitors.h>
#include <asm/processor.h>
+#include <asm/traps.h>
struct kvm_exception_table_entry {
int insn, fixup;
@@ -37,19 +39,13 @@ struct kvm_exception_table_entry {
extern struct kvm_exception_table_entry __start___kvm_ex_table;
extern struct kvm_exception_table_entry __stop___kvm_ex_table;
-/* Check whether the FP regs are owned by the guest */
-static inline bool guest_owns_fp_regs(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.fp_state == FP_STATE_GUEST_OWNED;
-}
-
/* Save the 32-bit only FPSIMD system register state */
static inline void __fpsimd_save_fpexc32(struct kvm_vcpu *vcpu)
{
if (!vcpu_el1_is_32bit(vcpu))
return;
- __vcpu_sys_reg(vcpu, FPEXC32_EL2) = read_sysreg(fpexc32_el2);
+ __vcpu_assign_sys_reg(vcpu, FPEXC32_EL2, read_sysreg(fpexc32_el2));
}
static inline void __activate_traps_fpsimd32(struct kvm_vcpu *vcpu)
@@ -69,8 +65,241 @@ static inline void __activate_traps_fpsimd32(struct kvm_vcpu *vcpu)
}
}
+static inline void __activate_cptr_traps_nvhe(struct kvm_vcpu *vcpu)
+{
+ u64 val = CPTR_NVHE_EL2_RES1 | CPTR_EL2_TAM | CPTR_EL2_TTA;
+
+ /*
+ * Always trap SME since it's not supported in KVM.
+ * TSM is RES1 if SME isn't implemented.
+ */
+ val |= CPTR_EL2_TSM;
+
+ if (!vcpu_has_sve(vcpu) || !guest_owns_fp_regs())
+ val |= CPTR_EL2_TZ;
+
+ if (!guest_owns_fp_regs())
+ val |= CPTR_EL2_TFP;
+
+ write_sysreg(val, cptr_el2);
+}
+
+static inline void __activate_cptr_traps_vhe(struct kvm_vcpu *vcpu)
+{
+ /*
+ * With VHE (HCR.E2H == 1), accesses to CPACR_EL1 are routed to
+ * CPTR_EL2. In general, CPACR_EL1 has the same layout as CPTR_EL2,
+ * except for some missing controls, such as TAM.
+ * In this case, CPTR_EL2.TAM has the same position with or without
+ * VHE (HCR.E2H == 1) which allows us to use here the CPTR_EL2.TAM
+ * shift value for trapping the AMU accesses.
+ */
+ u64 val = CPTR_EL2_TAM | CPACR_EL1_TTA;
+ u64 cptr;
+
+ if (guest_owns_fp_regs()) {
+ val |= CPACR_EL1_FPEN;
+ if (vcpu_has_sve(vcpu))
+ val |= CPACR_EL1_ZEN;
+ }
+
+ if (!vcpu_has_nv(vcpu))
+ goto write;
+
+ /*
+ * The architecture is a bit crap (what a surprise): an EL2 guest
+ * writing to CPTR_EL2 via CPACR_EL1 can't set any of TCPAC or TTA,
+ * as they are RES0 in the guest's view. To work around it, trap the
+ * sucker using the very same bit it can't set...
+ */
+ if (vcpu_el2_e2h_is_set(vcpu) && is_hyp_ctxt(vcpu))
+ val |= CPTR_EL2_TCPAC;
+
+ /*
+ * Layer the guest hypervisor's trap configuration on top of our own if
+ * we're in a nested context.
+ */
+ if (is_hyp_ctxt(vcpu))
+ goto write;
+
+ cptr = vcpu_sanitised_cptr_el2(vcpu);
+
+ /*
+ * Pay attention, there's some interesting detail here.
+ *
+ * The CPTR_EL2.xEN fields are 2 bits wide, although there are only two
+ * meaningful trap states when HCR_EL2.TGE = 0 (running a nested guest):
+ *
+ * - CPTR_EL2.xEN = x0, traps are enabled
+ * - CPTR_EL2.xEN = x1, traps are disabled
+ *
+ * In other words, bit[0] determines if guest accesses trap or not. In
+ * the interest of simplicity, clear the entire field if the guest
+ * hypervisor has traps enabled to dispel any illusion of something more
+ * complicated taking place.
+ */
+ if (!(SYS_FIELD_GET(CPACR_EL1, FPEN, cptr) & BIT(0)))
+ val &= ~CPACR_EL1_FPEN;
+ if (!(SYS_FIELD_GET(CPACR_EL1, ZEN, cptr) & BIT(0)))
+ val &= ~CPACR_EL1_ZEN;
+
+ if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, S2POE, IMP))
+ val |= cptr & CPACR_EL1_E0POE;
+
+ val |= cptr & CPTR_EL2_TCPAC;
+
+write:
+ write_sysreg(val, cpacr_el1);
+}
+
+static inline void __activate_cptr_traps(struct kvm_vcpu *vcpu)
+{
+ if (!guest_owns_fp_regs())
+ __activate_traps_fpsimd32(vcpu);
+
+ if (has_vhe() || has_hvhe())
+ __activate_cptr_traps_vhe(vcpu);
+ else
+ __activate_cptr_traps_nvhe(vcpu);
+}
+
+static inline void __deactivate_cptr_traps_nvhe(struct kvm_vcpu *vcpu)
+{
+ u64 val = CPTR_NVHE_EL2_RES1;
+
+ if (!cpus_have_final_cap(ARM64_SVE))
+ val |= CPTR_EL2_TZ;
+ if (!cpus_have_final_cap(ARM64_SME))
+ val |= CPTR_EL2_TSM;
+
+ write_sysreg(val, cptr_el2);
+}
+
+static inline void __deactivate_cptr_traps_vhe(struct kvm_vcpu *vcpu)
+{
+ u64 val = CPACR_EL1_FPEN;
+
+ if (cpus_have_final_cap(ARM64_SVE))
+ val |= CPACR_EL1_ZEN;
+ if (cpus_have_final_cap(ARM64_SME))
+ val |= CPACR_EL1_SMEN;
+
+ write_sysreg(val, cpacr_el1);
+}
+
+static inline void __deactivate_cptr_traps(struct kvm_vcpu *vcpu)
+{
+ if (has_vhe() || has_hvhe())
+ __deactivate_cptr_traps_vhe(vcpu);
+ else
+ __deactivate_cptr_traps_nvhe(vcpu);
+}
+
+static inline bool cpu_has_amu(void)
+{
+ u64 pfr0 = read_sysreg_s(SYS_ID_AA64PFR0_EL1);
+
+ return cpuid_feature_extract_unsigned_field(pfr0,
+ ID_AA64PFR0_EL1_AMU_SHIFT);
+}
+
+#define __activate_fgt(hctxt, vcpu, reg) \
+ do { \
+ ctxt_sys_reg(hctxt, reg) = read_sysreg_s(SYS_ ## reg); \
+ write_sysreg_s(*vcpu_fgt(vcpu, reg), SYS_ ## reg); \
+ } while (0)
+
+static inline void __activate_traps_hfgxtr(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpu_context *hctxt = host_data_ptr(host_ctxt);
+
+ if (!cpus_have_final_cap(ARM64_HAS_FGT))
+ return;
+
+ __activate_fgt(hctxt, vcpu, HFGRTR_EL2);
+ __activate_fgt(hctxt, vcpu, HFGWTR_EL2);
+ __activate_fgt(hctxt, vcpu, HFGITR_EL2);
+ __activate_fgt(hctxt, vcpu, HDFGRTR_EL2);
+ __activate_fgt(hctxt, vcpu, HDFGWTR_EL2);
+
+ if (cpu_has_amu())
+ __activate_fgt(hctxt, vcpu, HAFGRTR_EL2);
+
+ if (!cpus_have_final_cap(ARM64_HAS_FGT2))
+ return;
+
+ __activate_fgt(hctxt, vcpu, HFGRTR2_EL2);
+ __activate_fgt(hctxt, vcpu, HFGWTR2_EL2);
+ __activate_fgt(hctxt, vcpu, HFGITR2_EL2);
+ __activate_fgt(hctxt, vcpu, HDFGRTR2_EL2);
+ __activate_fgt(hctxt, vcpu, HDFGWTR2_EL2);
+}
+
+#define __deactivate_fgt(htcxt, vcpu, reg) \
+ do { \
+ write_sysreg_s(ctxt_sys_reg(hctxt, reg), \
+ SYS_ ## reg); \
+ } while(0)
+
+static inline void __deactivate_traps_hfgxtr(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpu_context *hctxt = host_data_ptr(host_ctxt);
+
+ if (!cpus_have_final_cap(ARM64_HAS_FGT))
+ return;
+
+ __deactivate_fgt(hctxt, vcpu, HFGRTR_EL2);
+ __deactivate_fgt(hctxt, vcpu, HFGWTR_EL2);
+ __deactivate_fgt(hctxt, vcpu, HFGITR_EL2);
+ __deactivate_fgt(hctxt, vcpu, HDFGRTR_EL2);
+ __deactivate_fgt(hctxt, vcpu, HDFGWTR_EL2);
+
+ if (cpu_has_amu())
+ __deactivate_fgt(hctxt, vcpu, HAFGRTR_EL2);
+
+ if (!cpus_have_final_cap(ARM64_HAS_FGT2))
+ return;
+
+ __deactivate_fgt(hctxt, vcpu, HFGRTR2_EL2);
+ __deactivate_fgt(hctxt, vcpu, HFGWTR2_EL2);
+ __deactivate_fgt(hctxt, vcpu, HFGITR2_EL2);
+ __deactivate_fgt(hctxt, vcpu, HDFGRTR2_EL2);
+ __deactivate_fgt(hctxt, vcpu, HDFGWTR2_EL2);
+}
+
+static inline void __activate_traps_mpam(struct kvm_vcpu *vcpu)
+{
+ u64 r = MPAM2_EL2_TRAPMPAM0EL1 | MPAM2_EL2_TRAPMPAM1EL1;
+
+ if (!system_supports_mpam())
+ return;
+
+ /* trap guest access to MPAMIDR_EL1 */
+ if (system_supports_mpam_hcr()) {
+ write_sysreg_s(MPAMHCR_EL2_TRAP_MPAMIDR_EL1, SYS_MPAMHCR_EL2);
+ } else {
+ /* From v1.1 TIDR can trap MPAMIDR, set it unconditionally */
+ r |= MPAM2_EL2_TIDR;
+ }
+
+ write_sysreg_s(r, SYS_MPAM2_EL2);
+}
+
+static inline void __deactivate_traps_mpam(void)
+{
+ if (!system_supports_mpam())
+ return;
+
+ write_sysreg_s(0, SYS_MPAM2_EL2);
+
+ if (system_supports_mpam_hcr())
+ write_sysreg_s(MPAMHCR_HOST_FLAGS, SYS_MPAMHCR_EL2);
+}
+
static inline void __activate_traps_common(struct kvm_vcpu *vcpu)
{
+ struct kvm_cpu_context *hctxt = host_data_ptr(host_ctxt);
+
/* Trap on AArch32 cp15 c15 (impdef sysregs) accesses (EL1 or EL0) */
write_sysreg(1 << 15, hstr_el2);
@@ -80,68 +309,104 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu)
* counter, which could make a PMXEVCNTR_EL0 access UNDEF at
* EL1 instead of being trapped to EL2.
*/
- if (kvm_arm_support_pmu_v3()) {
+ if (system_supports_pmuv3()) {
write_sysreg(0, pmselr_el0);
+
+ ctxt_sys_reg(hctxt, PMUSERENR_EL0) = read_sysreg(pmuserenr_el0);
write_sysreg(ARMV8_PMU_USERENR_MASK, pmuserenr_el0);
+ vcpu_set_flag(vcpu, PMUSERENR_ON_CPU);
}
- vcpu->arch.mdcr_el2_host = read_sysreg(mdcr_el2);
- write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2);
-
- if (cpus_have_final_cap(ARM64_SME)) {
- sysreg_clear_set_s(SYS_HFGRTR_EL2,
- HFGxTR_EL2_nSMPRI_EL1_MASK |
- HFGxTR_EL2_nTPIDR2_EL0_MASK,
- 0);
- sysreg_clear_set_s(SYS_HFGWTR_EL2,
- HFGxTR_EL2_nSMPRI_EL1_MASK |
- HFGxTR_EL2_nTPIDR2_EL0_MASK,
- 0);
+ if (cpus_have_final_cap(ARM64_HAS_HCX)) {
+ u64 hcrx = vcpu->arch.hcrx_el2;
+ if (is_nested_ctxt(vcpu)) {
+ u64 val = __vcpu_sys_reg(vcpu, HCRX_EL2);
+ hcrx |= val & __HCRX_EL2_MASK;
+ hcrx &= ~(~val & __HCRX_EL2_nMASK);
+ }
+
+ ctxt_sys_reg(hctxt, HCRX_EL2) = read_sysreg_s(SYS_HCRX_EL2);
+ write_sysreg_s(hcrx, SYS_HCRX_EL2);
}
+
+ __activate_traps_hfgxtr(vcpu);
+ __activate_traps_mpam(vcpu);
}
static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu)
{
- write_sysreg(vcpu->arch.mdcr_el2_host, mdcr_el2);
+ struct kvm_cpu_context *hctxt = host_data_ptr(host_ctxt);
write_sysreg(0, hstr_el2);
- if (kvm_arm_support_pmu_v3())
- write_sysreg(0, pmuserenr_el0);
-
- if (cpus_have_final_cap(ARM64_SME)) {
- sysreg_clear_set_s(SYS_HFGRTR_EL2, 0,
- HFGxTR_EL2_nSMPRI_EL1_MASK |
- HFGxTR_EL2_nTPIDR2_EL0_MASK);
- sysreg_clear_set_s(SYS_HFGWTR_EL2, 0,
- HFGxTR_EL2_nSMPRI_EL1_MASK |
- HFGxTR_EL2_nTPIDR2_EL0_MASK);
+ if (system_supports_pmuv3()) {
+ write_sysreg(ctxt_sys_reg(hctxt, PMUSERENR_EL0), pmuserenr_el0);
+ vcpu_clear_flag(vcpu, PMUSERENR_ON_CPU);
}
+
+ if (cpus_have_final_cap(ARM64_HAS_HCX))
+ write_sysreg_s(ctxt_sys_reg(hctxt, HCRX_EL2), SYS_HCRX_EL2);
+
+ __deactivate_traps_hfgxtr(vcpu);
+ __deactivate_traps_mpam();
}
-static inline void ___activate_traps(struct kvm_vcpu *vcpu)
+static inline void ___activate_traps(struct kvm_vcpu *vcpu, u64 hcr)
{
- u64 hcr = vcpu->arch.hcr_el2;
-
if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_TX2_219_TVM))
hcr |= HCR_TVM;
- write_sysreg(hcr, hcr_el2);
+ write_sysreg_hcr(hcr);
- if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN) && (hcr & HCR_VSE))
- write_sysreg_s(vcpu->arch.vsesr_el2, SYS_VSESR_EL2);
+ if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN) && (hcr & HCR_VSE)) {
+ u64 vsesr;
+
+ /*
+ * When HCR_EL2.AMO is set, physical SErrors are taken to EL2
+ * and vSError injection is enabled for EL1. Conveniently, for
+ * NV this means that it is never the case where a 'physical'
+ * SError (injected by KVM or userspace) and vSError are
+ * deliverable to the same context.
+ *
+ * As such, we can trivially select between the host or guest's
+ * VSESR_EL2. Except for the case that FEAT_RAS hasn't been
+ * exposed to the guest, where ESR propagation in hardware
+ * occurs unconditionally.
+ *
+ * Paper over the architectural wart and use an IMPLEMENTATION
+ * DEFINED ESR value in case FEAT_RAS is hidden from the guest.
+ */
+ if (!vserror_state_is_nested(vcpu))
+ vsesr = vcpu->arch.vsesr_el2;
+ else if (kvm_has_ras(kern_hyp_va(vcpu->kvm)))
+ vsesr = __vcpu_sys_reg(vcpu, VSESR_EL2);
+ else
+ vsesr = ESR_ELx_ISV;
+
+ write_sysreg_s(vsesr, SYS_VSESR_EL2);
+ }
}
static inline void ___deactivate_traps(struct kvm_vcpu *vcpu)
{
+ u64 *hcr;
+
+ if (vserror_state_is_nested(vcpu))
+ hcr = __ctxt_sys_reg(&vcpu->arch.ctxt, HCR_EL2);
+ else
+ hcr = &vcpu->arch.hcr_el2;
+
/*
* If we pended a virtual abort, preserve it until it gets
* cleared. See D1.14.3 (Virtual Interrupts) for details, but
* the crucial bit is "On taking a vSError interrupt,
* HCR_EL2.VSE is cleared to 0."
+ *
+ * Additionally, when in a nested context we need to propagate the
+ * updated state to the guest hypervisor's HCR_EL2.
*/
- if (vcpu->arch.hcr_el2 & HCR_VSE) {
- vcpu->arch.hcr_el2 &= ~HCR_VSE;
- vcpu->arch.hcr_el2 |= read_sysreg(hcr_el2) & HCR_VSE;
+ if (*hcr & HCR_VSE) {
+ *hcr &= ~HCR_VSE;
+ *hcr |= read_sysreg(hcr_el2) & HCR_VSE;
}
}
@@ -150,25 +415,142 @@ static inline bool __populate_fault_info(struct kvm_vcpu *vcpu)
return __get_fault_info(vcpu->arch.fault.esr_el2, &vcpu->arch.fault);
}
+static inline bool kvm_hyp_handle_mops(struct kvm_vcpu *vcpu, u64 *exit_code)
+{
+ *vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR);
+ arm64_mops_reset_regs(vcpu_gp_regs(vcpu), vcpu->arch.fault.esr_el2);
+ write_sysreg_el2(*vcpu_pc(vcpu), SYS_ELR);
+
+ /*
+ * Finish potential single step before executing the prologue
+ * instruction.
+ */
+ *vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS;
+ write_sysreg_el2(*vcpu_cpsr(vcpu), SYS_SPSR);
+
+ return true;
+}
+
static inline void __hyp_sve_restore_guest(struct kvm_vcpu *vcpu)
{
+ /*
+ * The vCPU's saved SVE state layout always matches the max VL of the
+ * vCPU. Start off with the max VL so we can load the SVE state.
+ */
sve_cond_update_zcr_vq(vcpu_sve_max_vq(vcpu) - 1, SYS_ZCR_EL2);
__sve_restore_state(vcpu_sve_pffr(vcpu),
- &vcpu->arch.ctxt.fp_regs.fpsr);
- write_sysreg_el1(__vcpu_sys_reg(vcpu, ZCR_EL1), SYS_ZCR);
+ &vcpu->arch.ctxt.fp_regs.fpsr,
+ true);
+
+ /*
+ * The effective VL for a VM could differ from the max VL when running a
+ * nested guest, as the guest hypervisor could select a smaller VL. Slap
+ * that into hardware before wrapping up.
+ */
+ if (is_nested_ctxt(vcpu))
+ sve_cond_update_zcr_vq(__vcpu_sys_reg(vcpu, ZCR_EL2), SYS_ZCR_EL2);
+
+ write_sysreg_el1(__vcpu_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu)), SYS_ZCR);
}
+static inline void __hyp_sve_save_host(void)
+{
+ struct cpu_sve_state *sve_state = *host_data_ptr(sve_state);
+
+ sve_state->zcr_el1 = read_sysreg_el1(SYS_ZCR);
+ write_sysreg_s(sve_vq_from_vl(kvm_host_sve_max_vl) - 1, SYS_ZCR_EL2);
+ __sve_save_state(sve_state->sve_regs + sve_ffr_offset(kvm_host_sve_max_vl),
+ &sve_state->fpsr,
+ true);
+}
+
+static inline void fpsimd_lazy_switch_to_guest(struct kvm_vcpu *vcpu)
+{
+ u64 zcr_el1, zcr_el2;
+
+ if (!guest_owns_fp_regs())
+ return;
+
+ if (vcpu_has_sve(vcpu)) {
+ /* A guest hypervisor may restrict the effective max VL. */
+ if (is_nested_ctxt(vcpu))
+ zcr_el2 = __vcpu_sys_reg(vcpu, ZCR_EL2);
+ else
+ zcr_el2 = vcpu_sve_max_vq(vcpu) - 1;
+
+ write_sysreg_el2(zcr_el2, SYS_ZCR);
+
+ zcr_el1 = __vcpu_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu));
+ write_sysreg_el1(zcr_el1, SYS_ZCR);
+ }
+}
+
+static inline void fpsimd_lazy_switch_to_host(struct kvm_vcpu *vcpu)
+{
+ u64 zcr_el1, zcr_el2;
+
+ if (!guest_owns_fp_regs())
+ return;
+
+ /*
+ * When the guest owns the FP regs, we know that guest+hyp traps for
+ * any FPSIMD/SVE/SME features exposed to the guest have been disabled
+ * by either fpsimd_lazy_switch_to_guest() or kvm_hyp_handle_fpsimd()
+ * prior to __guest_entry(). As __guest_entry() guarantees a context
+ * synchronization event, we don't need an ISB here to avoid taking
+ * traps for anything that was exposed to the guest.
+ */
+ if (vcpu_has_sve(vcpu)) {
+ zcr_el1 = read_sysreg_el1(SYS_ZCR);
+ __vcpu_assign_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu), zcr_el1);
+
+ /*
+ * The guest's state is always saved using the guest's max VL.
+ * Ensure that the host has the guest's max VL active such that
+ * the host can save the guest's state lazily, but don't
+ * artificially restrict the host to the guest's max VL.
+ */
+ if (has_vhe()) {
+ zcr_el2 = vcpu_sve_max_vq(vcpu) - 1;
+ write_sysreg_el2(zcr_el2, SYS_ZCR);
+ } else {
+ zcr_el2 = sve_vq_from_vl(kvm_host_sve_max_vl) - 1;
+ write_sysreg_el2(zcr_el2, SYS_ZCR);
+
+ zcr_el1 = vcpu_sve_max_vq(vcpu) - 1;
+ write_sysreg_el1(zcr_el1, SYS_ZCR);
+ }
+ }
+}
+
+static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Non-protected kvm relies on the host restoring its sve state.
+ * Protected kvm restores the host's sve state as not to reveal that
+ * fpsimd was used by a guest nor leak upper sve bits.
+ */
+ if (system_supports_sve()) {
+ __hyp_sve_save_host();
+ } else {
+ __fpsimd_save_state(host_data_ptr(host_ctxt.fp_regs));
+ }
+
+ if (kvm_has_fpmr(kern_hyp_va(vcpu->kvm)))
+ *host_data_ptr(fpmr) = read_sysreg_s(SYS_FPMR);
+}
+
+
/*
* We trap the first access to the FP/SIMD to save the host context and
* restore the guest context lazily.
* If FP/SIMD is not implemented, handle the trap and inject an undefined
* instruction exception to the guest. Similarly for trapped SVE accesses.
*/
-static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code)
+static inline bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code)
{
bool sve_guest;
u8 esr_ec;
- u64 reg;
if (!system_supports_fpsimd())
return false;
@@ -176,31 +558,36 @@ static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code)
sve_guest = vcpu_has_sve(vcpu);
esr_ec = kvm_vcpu_trap_get_class(vcpu);
- /* Don't handle SVE traps for non-SVE vcpus here: */
- if (!sve_guest && esr_ec != ESR_ELx_EC_FP_ASIMD)
+ /* Only handle traps the vCPU can support here: */
+ switch (esr_ec) {
+ case ESR_ELx_EC_FP_ASIMD:
+ /* Forward traps to the guest hypervisor as required */
+ if (guest_hyp_fpsimd_traps_enabled(vcpu))
+ return false;
+ break;
+ case ESR_ELx_EC_SYS64:
+ if (WARN_ON_ONCE(!is_hyp_ctxt(vcpu)))
+ return false;
+ fallthrough;
+ case ESR_ELx_EC_SVE:
+ if (!sve_guest)
+ return false;
+ if (guest_hyp_sve_traps_enabled(vcpu))
+ return false;
+ break;
+ default:
return false;
+ }
/* Valid trap. Switch the context: */
/* First disable enough traps to allow us to update the registers */
- if (has_vhe()) {
- reg = CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN;
- if (sve_guest)
- reg |= CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN;
-
- sysreg_clear_set(cpacr_el1, 0, reg);
- } else {
- reg = CPTR_EL2_TFP;
- if (sve_guest)
- reg |= CPTR_EL2_TZ;
-
- sysreg_clear_set(cptr_el2, reg, 0);
- }
+ __deactivate_cptr_traps(vcpu);
isb();
/* Write out the host state if it's in the registers */
- if (vcpu->arch.fp_state == FP_STATE_HOST_OWNED)
- __fpsimd_save_state(vcpu->arch.host_fpsimd_state);
+ if (is_protected_kvm_enabled() && host_owns_fp_regs())
+ kvm_hyp_save_fpsimd_host(vcpu);
/* Restore the guest state */
if (sve_guest)
@@ -208,11 +595,21 @@ static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code)
else
__fpsimd_restore_state(&vcpu->arch.ctxt.fp_regs);
+ if (kvm_has_fpmr(kern_hyp_va(vcpu->kvm)))
+ write_sysreg_s(__vcpu_sys_reg(vcpu, FPMR), SYS_FPMR);
+
/* Skip restoring fpexc32 for AArch64 guests */
if (!(read_sysreg(hcr_el2) & HCR_RW))
write_sysreg(__vcpu_sys_reg(vcpu, FPEXC32_EL2), fpexc32_el2);
- vcpu->arch.fp_state = FP_STATE_GUEST_OWNED;
+ *host_data_ptr(fp_owner) = FP_STATE_GUEST_OWNED;
+
+ /*
+ * Re-enable traps necessary for the current state of the guest, e.g.
+ * those enabled by a guest hypervisor. The ERET to the guest will
+ * provide the necessary context synchronization.
+ */
+ __activate_cptr_traps(vcpu);
return true;
}
@@ -272,77 +669,126 @@ static inline bool handle_tx2_tvm(struct kvm_vcpu *vcpu)
return true;
}
-static inline bool esr_is_ptrauth_trap(u64 esr)
+/* Open-coded version of timer_get_offset() to allow for kern_hyp_va() */
+static inline u64 hyp_timer_get_offset(struct arch_timer_context *ctxt)
{
- switch (esr_sys64_to_sysreg(esr)) {
- case SYS_APIAKEYLO_EL1:
- case SYS_APIAKEYHI_EL1:
- case SYS_APIBKEYLO_EL1:
- case SYS_APIBKEYHI_EL1:
- case SYS_APDAKEYLO_EL1:
- case SYS_APDAKEYHI_EL1:
- case SYS_APDBKEYLO_EL1:
- case SYS_APDBKEYHI_EL1:
- case SYS_APGAKEYLO_EL1:
- case SYS_APGAKEYHI_EL1:
- return true;
- }
+ u64 offset = 0;
- return false;
-}
+ if (ctxt->offset.vm_offset)
+ offset += *kern_hyp_va(ctxt->offset.vm_offset);
+ if (ctxt->offset.vcpu_offset)
+ offset += *kern_hyp_va(ctxt->offset.vcpu_offset);
-#define __ptrauth_save_key(ctxt, key) \
- do { \
- u64 __val; \
- __val = read_sysreg_s(SYS_ ## key ## KEYLO_EL1); \
- ctxt_sys_reg(ctxt, key ## KEYLO_EL1) = __val; \
- __val = read_sysreg_s(SYS_ ## key ## KEYHI_EL1); \
- ctxt_sys_reg(ctxt, key ## KEYHI_EL1) = __val; \
-} while(0)
+ return offset;
+}
-DECLARE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
+static inline u64 compute_counter_value(struct arch_timer_context *ctxt)
+{
+ return arch_timer_read_cntpct_el0() - hyp_timer_get_offset(ctxt);
+}
-static bool kvm_hyp_handle_ptrauth(struct kvm_vcpu *vcpu, u64 *exit_code)
+static bool kvm_handle_cntxct(struct kvm_vcpu *vcpu)
{
- struct kvm_cpu_context *ctxt;
+ struct arch_timer_context *ctxt;
+ u32 sysreg;
u64 val;
- if (!vcpu_has_ptrauth(vcpu))
+ /*
+ * We only get here for 64bit guests, 32bit guests will hit
+ * the long and winding road all the way to the standard
+ * handling. Yes, it sucks to be irrelevant.
+ *
+ * Also, we only deal with non-hypervisor context here (either
+ * an EL1 guest, or a non-HYP context of an EL2 guest).
+ */
+ if (is_hyp_ctxt(vcpu))
+ return false;
+
+ sysreg = esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu));
+
+ switch (sysreg) {
+ case SYS_CNTPCT_EL0:
+ case SYS_CNTPCTSS_EL0:
+ if (vcpu_has_nv(vcpu)) {
+ /* Check for guest hypervisor trapping */
+ val = __vcpu_sys_reg(vcpu, CNTHCTL_EL2);
+ if (!vcpu_el2_e2h_is_set(vcpu))
+ val = (val & CNTHCTL_EL1PCTEN) << 10;
+
+ if (!(val & (CNTHCTL_EL1PCTEN << 10)))
+ return false;
+ }
+
+ ctxt = vcpu_ptimer(vcpu);
+ break;
+ case SYS_CNTVCT_EL0:
+ case SYS_CNTVCTSS_EL0:
+ if (vcpu_has_nv(vcpu)) {
+ /* Check for guest hypervisor trapping */
+ val = __vcpu_sys_reg(vcpu, CNTHCTL_EL2);
+
+ if (val & CNTHCTL_EL1TVCT)
+ return false;
+ }
+
+ ctxt = vcpu_vtimer(vcpu);
+ break;
+ default:
return false;
+ }
- ctxt = this_cpu_ptr(&kvm_hyp_ctxt);
- __ptrauth_save_key(ctxt, APIA);
- __ptrauth_save_key(ctxt, APIB);
- __ptrauth_save_key(ctxt, APDA);
- __ptrauth_save_key(ctxt, APDB);
- __ptrauth_save_key(ctxt, APGA);
+ val = compute_counter_value(ctxt);
- vcpu_ptrauth_enable(vcpu);
+ vcpu_set_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu), val);
+ __kvm_skip_instr(vcpu);
+ return true;
+}
+
+static bool handle_ampere1_tcr(struct kvm_vcpu *vcpu)
+{
+ u32 sysreg = esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu));
+ int rt = kvm_vcpu_sys_get_rt(vcpu);
+ u64 val = vcpu_get_reg(vcpu, rt);
- val = read_sysreg(hcr_el2);
- val |= (HCR_API | HCR_APK);
- write_sysreg(val, hcr_el2);
+ if (sysreg != SYS_TCR_EL1)
+ return false;
+ /*
+ * Affected parts do not advertise support for hardware Access Flag /
+ * Dirty state management in ID_AA64MMFR1_EL1.HAFDBS, but the underlying
+ * control bits are still functional. The architecture requires these be
+ * RES0 on systems that do not implement FEAT_HAFDBS.
+ *
+ * Uphold the requirements of the architecture by masking guest writes
+ * to TCR_EL1.{HA,HD} here.
+ */
+ val &= ~(TCR_HD | TCR_HA);
+ write_sysreg_el1(val, SYS_TCR);
+ __kvm_skip_instr(vcpu);
return true;
}
-static bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code)
+static inline bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code)
{
if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_TX2_219_TVM) &&
handle_tx2_tvm(vcpu))
return true;
+ if (cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38) &&
+ handle_ampere1_tcr(vcpu))
+ return true;
+
if (static_branch_unlikely(&vgic_v3_cpuif_trap) &&
__vgic_v3_perform_cpuif_access(vcpu) == 1)
return true;
- if (esr_is_ptrauth_trap(kvm_vcpu_get_esr(vcpu)))
- return kvm_hyp_handle_ptrauth(vcpu, exit_code);
+ if (kvm_handle_cntxct(vcpu))
+ return true;
return false;
}
-static bool kvm_hyp_handle_cp15_32(struct kvm_vcpu *vcpu, u64 *exit_code)
+static inline bool kvm_hyp_handle_cp15_32(struct kvm_vcpu *vcpu, u64 *exit_code)
{
if (static_branch_unlikely(&vgic_v3_cpuif_trap) &&
__vgic_v3_perform_cpuif_access(vcpu) == 1)
@@ -351,23 +797,26 @@ static bool kvm_hyp_handle_cp15_32(struct kvm_vcpu *vcpu, u64 *exit_code)
return false;
}
-static bool kvm_hyp_handle_iabt_low(struct kvm_vcpu *vcpu, u64 *exit_code)
+static inline bool kvm_hyp_handle_memory_fault(struct kvm_vcpu *vcpu,
+ u64 *exit_code)
{
if (!__populate_fault_info(vcpu))
return true;
return false;
}
+#define kvm_hyp_handle_iabt_low kvm_hyp_handle_memory_fault
+#define kvm_hyp_handle_watchpt_low kvm_hyp_handle_memory_fault
-static bool kvm_hyp_handle_dabt_low(struct kvm_vcpu *vcpu, u64 *exit_code)
+static inline bool kvm_hyp_handle_dabt_low(struct kvm_vcpu *vcpu, u64 *exit_code)
{
- if (!__populate_fault_info(vcpu))
+ if (kvm_hyp_handle_memory_fault(vcpu, exit_code))
return true;
if (static_branch_unlikely(&vgic_v2_cpuif_trap)) {
bool valid;
- valid = kvm_vcpu_trap_get_fault_type(vcpu) == ESR_ELx_FSC_FAULT &&
+ valid = kvm_vcpu_trap_is_translation_fault(vcpu) &&
kvm_vcpu_dabt_isvalid(vcpu) &&
!kvm_vcpu_abt_issea(vcpu) &&
!kvm_vcpu_abt_iss1tw(vcpu);
@@ -389,23 +838,16 @@ static bool kvm_hyp_handle_dabt_low(struct kvm_vcpu *vcpu, u64 *exit_code)
typedef bool (*exit_handler_fn)(struct kvm_vcpu *, u64 *);
-static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu);
-
-static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code);
-
/*
* Allow the hypervisor to handle the exit with an exit handler if it has one.
*
* Returns true if the hypervisor handled the exit, and control should go back
* to the guest, or false if it hasn't.
*/
-static inline bool kvm_hyp_handle_exit(struct kvm_vcpu *vcpu, u64 *exit_code)
+static inline bool kvm_hyp_handle_exit(struct kvm_vcpu *vcpu, u64 *exit_code,
+ const exit_handler_fn *handlers)
{
- const exit_handler_fn *handlers = kvm_get_exit_handler_array(vcpu);
- exit_handler_fn fn;
-
- fn = handlers[kvm_vcpu_trap_get_class(vcpu)];
-
+ exit_handler_fn fn = handlers[kvm_vcpu_trap_get_class(vcpu)];
if (fn)
return fn(vcpu, exit_code);
@@ -435,20 +877,9 @@ static inline void synchronize_vcpu_pstate(struct kvm_vcpu *vcpu, u64 *exit_code
* the guest, false when we should restore the host state and return to the
* main run loop.
*/
-static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code)
+static inline bool __fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code,
+ const exit_handler_fn *handlers)
{
- /*
- * Save PSTATE early so that we can evaluate the vcpu mode
- * early on.
- */
- synchronize_vcpu_pstate(vcpu, exit_code);
-
- /*
- * Check whether we want to repaint the state one way or
- * another.
- */
- early_exit_filter(vcpu, exit_code);
-
if (ARM_EXCEPTION_CODE(*exit_code) != ARM_EXCEPTION_IRQ)
vcpu->arch.fault.esr_el2 = read_sysreg_el2(SYS_ESR);
@@ -478,7 +909,7 @@ static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code)
goto exit;
/* Check if there's an exit handler and allow it to handle the exit. */
- if (kvm_hyp_handle_exit(vcpu, exit_code))
+ if (kvm_hyp_handle_exit(vcpu, exit_code, handlers))
goto guest;
exit:
/* Return to the host kernel and handle the exit */
@@ -492,7 +923,7 @@ guest:
static inline void __kvm_unexpected_el2_exception(void)
{
- extern char __guest_exit_panic[];
+ extern char __guest_exit_restore_elr_and_panic[];
unsigned long addr, fixup;
struct kvm_exception_table_entry *entry, *end;
unsigned long elr_el2 = read_sysreg(elr_el2);
@@ -514,7 +945,8 @@ static inline void __kvm_unexpected_el2_exception(void)
}
/* Trigger a panic after restoring the hyp context. */
- write_sysreg(__guest_exit_panic, elr_el2);
+ this_cpu_ptr(&kvm_hyp_ctxt)->sys_regs[ELR_EL2] = elr_el2;
+ write_sysreg(__guest_exit_restore_elr_and_panic, elr_el2);
}
#endif /* __ARM64_KVM_HYP_SWITCH_H__ */
diff --git a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
index baa5b9b3dde5..a17cbe7582de 100644
--- a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
+++ b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
@@ -16,9 +16,51 @@
#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>
+static inline bool ctxt_has_s1poe(struct kvm_cpu_context *ctxt);
+
+static inline struct kvm_vcpu *ctxt_to_vcpu(struct kvm_cpu_context *ctxt)
+{
+ struct kvm_vcpu *vcpu = ctxt->__hyp_running_vcpu;
+
+ if (!vcpu)
+ vcpu = container_of(ctxt, struct kvm_vcpu, arch.ctxt);
+
+ return vcpu;
+}
+
+static inline bool ctxt_is_guest(struct kvm_cpu_context *ctxt)
+{
+ return host_data_ptr(host_ctxt) != ctxt;
+}
+
+static inline u64 *ctxt_mdscr_el1(struct kvm_cpu_context *ctxt)
+{
+ struct kvm_vcpu *vcpu = ctxt_to_vcpu(ctxt);
+
+ if (ctxt_is_guest(ctxt) && kvm_host_owns_debug_regs(vcpu))
+ return &vcpu->arch.external_mdscr_el1;
+
+ return &ctxt_sys_reg(ctxt, MDSCR_EL1);
+}
+
+static inline u64 ctxt_midr_el1(struct kvm_cpu_context *ctxt)
+{
+ struct kvm *kvm = kern_hyp_va(ctxt_to_vcpu(ctxt)->kvm);
+
+ if (!(ctxt_is_guest(ctxt) &&
+ test_bit(KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS, &kvm->arch.flags)))
+ return read_cpuid_id();
+
+ return kvm_read_vm_id_reg(kvm, SYS_MIDR_EL1);
+}
+
static inline void __sysreg_save_common_state(struct kvm_cpu_context *ctxt)
{
- ctxt_sys_reg(ctxt, MDSCR_EL1) = read_sysreg(mdscr_el1);
+ *ctxt_mdscr_el1(ctxt) = read_sysreg(mdscr_el1);
+
+ // POR_EL0 can affect uaccess, so must be saved/restored early.
+ if (ctxt_has_s1poe(ctxt))
+ ctxt_sys_reg(ctxt, POR_EL0) = read_sysreg_s(SYS_POR_EL0);
}
static inline void __sysreg_save_user_state(struct kvm_cpu_context *ctxt)
@@ -29,22 +71,84 @@ static inline void __sysreg_save_user_state(struct kvm_cpu_context *ctxt)
static inline bool ctxt_has_mte(struct kvm_cpu_context *ctxt)
{
- struct kvm_vcpu *vcpu = ctxt->__hyp_running_vcpu;
-
- if (!vcpu)
- vcpu = container_of(ctxt, struct kvm_vcpu, arch.ctxt);
+ struct kvm_vcpu *vcpu = ctxt_to_vcpu(ctxt);
return kvm_has_mte(kern_hyp_va(vcpu->kvm));
}
+static inline bool ctxt_has_s1pie(struct kvm_cpu_context *ctxt)
+{
+ struct kvm_vcpu *vcpu;
+
+ if (!cpus_have_final_cap(ARM64_HAS_S1PIE))
+ return false;
+
+ vcpu = ctxt_to_vcpu(ctxt);
+ return kvm_has_s1pie(kern_hyp_va(vcpu->kvm));
+}
+
+static inline bool ctxt_has_tcrx(struct kvm_cpu_context *ctxt)
+{
+ struct kvm_vcpu *vcpu;
+
+ if (!cpus_have_final_cap(ARM64_HAS_TCR2))
+ return false;
+
+ vcpu = ctxt_to_vcpu(ctxt);
+ return kvm_has_tcr2(kern_hyp_va(vcpu->kvm));
+}
+
+static inline bool ctxt_has_s1poe(struct kvm_cpu_context *ctxt)
+{
+ struct kvm_vcpu *vcpu;
+
+ if (!system_supports_poe())
+ return false;
+
+ vcpu = ctxt_to_vcpu(ctxt);
+ return kvm_has_s1poe(kern_hyp_va(vcpu->kvm));
+}
+
+static inline bool ctxt_has_ras(struct kvm_cpu_context *ctxt)
+{
+ struct kvm_vcpu *vcpu;
+
+ if (!cpus_have_final_cap(ARM64_HAS_RAS_EXTN))
+ return false;
+
+ vcpu = ctxt_to_vcpu(ctxt);
+ return kvm_has_ras(kern_hyp_va(vcpu->kvm));
+}
+
+static inline bool ctxt_has_sctlr2(struct kvm_cpu_context *ctxt)
+{
+ struct kvm_vcpu *vcpu;
+
+ if (!cpus_have_final_cap(ARM64_HAS_SCTLR2))
+ return false;
+
+ vcpu = ctxt_to_vcpu(ctxt);
+ return kvm_has_sctlr2(kern_hyp_va(vcpu->kvm));
+}
+
static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt)
{
- ctxt_sys_reg(ctxt, CSSELR_EL1) = read_sysreg(csselr_el1);
ctxt_sys_reg(ctxt, SCTLR_EL1) = read_sysreg_el1(SYS_SCTLR);
ctxt_sys_reg(ctxt, CPACR_EL1) = read_sysreg_el1(SYS_CPACR);
ctxt_sys_reg(ctxt, TTBR0_EL1) = read_sysreg_el1(SYS_TTBR0);
ctxt_sys_reg(ctxt, TTBR1_EL1) = read_sysreg_el1(SYS_TTBR1);
ctxt_sys_reg(ctxt, TCR_EL1) = read_sysreg_el1(SYS_TCR);
+ if (ctxt_has_tcrx(ctxt)) {
+ ctxt_sys_reg(ctxt, TCR2_EL1) = read_sysreg_el1(SYS_TCR2);
+
+ if (ctxt_has_s1pie(ctxt)) {
+ ctxt_sys_reg(ctxt, PIR_EL1) = read_sysreg_el1(SYS_PIR);
+ ctxt_sys_reg(ctxt, PIRE0_EL1) = read_sysreg_el1(SYS_PIRE0);
+ }
+
+ if (ctxt_has_s1poe(ctxt))
+ ctxt_sys_reg(ctxt, POR_EL1) = read_sysreg_el1(SYS_POR);
+ }
ctxt_sys_reg(ctxt, ESR_EL1) = read_sysreg_el1(SYS_ESR);
ctxt_sys_reg(ctxt, AFSR0_EL1) = read_sysreg_el1(SYS_AFSR0);
ctxt_sys_reg(ctxt, AFSR1_EL1) = read_sysreg_el1(SYS_AFSR1);
@@ -65,6 +169,9 @@ static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt)
ctxt_sys_reg(ctxt, SP_EL1) = read_sysreg(sp_el1);
ctxt_sys_reg(ctxt, ELR_EL1) = read_sysreg_el1(SYS_ELR);
ctxt_sys_reg(ctxt, SPSR_EL1) = read_sysreg_el1(SYS_SPSR);
+
+ if (ctxt_has_sctlr2(ctxt))
+ ctxt_sys_reg(ctxt, SCTLR2_EL1) = read_sysreg_el1(SYS_SCTLR2);
}
static inline void __sysreg_save_el2_return_state(struct kvm_cpu_context *ctxt)
@@ -77,13 +184,22 @@ static inline void __sysreg_save_el2_return_state(struct kvm_cpu_context *ctxt)
if (!has_vhe() && ctxt->__hyp_running_vcpu)
ctxt->regs.pstate = read_sysreg_el2(SYS_SPSR);
- if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN))
+ if (!cpus_have_final_cap(ARM64_HAS_RAS_EXTN))
+ return;
+
+ if (!vserror_state_is_nested(ctxt_to_vcpu(ctxt)))
ctxt_sys_reg(ctxt, DISR_EL1) = read_sysreg_s(SYS_VDISR_EL2);
+ else if (ctxt_has_ras(ctxt))
+ ctxt_sys_reg(ctxt, VDISR_EL2) = read_sysreg_s(SYS_VDISR_EL2);
}
static inline void __sysreg_restore_common_state(struct kvm_cpu_context *ctxt)
{
- write_sysreg(ctxt_sys_reg(ctxt, MDSCR_EL1), mdscr_el1);
+ write_sysreg(*ctxt_mdscr_el1(ctxt), mdscr_el1);
+
+ // POR_EL0 can affect uaccess, so must be saved/restored early.
+ if (ctxt_has_s1poe(ctxt))
+ write_sysreg_s(ctxt_sys_reg(ctxt, POR_EL0), SYS_POR_EL0);
}
static inline void __sysreg_restore_user_state(struct kvm_cpu_context *ctxt)
@@ -92,10 +208,11 @@ static inline void __sysreg_restore_user_state(struct kvm_cpu_context *ctxt)
write_sysreg(ctxt_sys_reg(ctxt, TPIDRRO_EL0), tpidrro_el0);
}
-static inline void __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt)
+static inline void __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt,
+ u64 midr, u64 mpidr)
{
- write_sysreg(ctxt_sys_reg(ctxt, MPIDR_EL1), vmpidr_el2);
- write_sysreg(ctxt_sys_reg(ctxt, CSSELR_EL1), csselr_el1);
+ write_sysreg(midr, vpidr_el2);
+ write_sysreg(mpidr, vmpidr_el2);
if (has_vhe() ||
!cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) {
@@ -116,6 +233,17 @@ static inline void __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt)
write_sysreg_el1(ctxt_sys_reg(ctxt, CPACR_EL1), SYS_CPACR);
write_sysreg_el1(ctxt_sys_reg(ctxt, TTBR0_EL1), SYS_TTBR0);
write_sysreg_el1(ctxt_sys_reg(ctxt, TTBR1_EL1), SYS_TTBR1);
+ if (ctxt_has_tcrx(ctxt)) {
+ write_sysreg_el1(ctxt_sys_reg(ctxt, TCR2_EL1), SYS_TCR2);
+
+ if (ctxt_has_s1pie(ctxt)) {
+ write_sysreg_el1(ctxt_sys_reg(ctxt, PIR_EL1), SYS_PIR);
+ write_sysreg_el1(ctxt_sys_reg(ctxt, PIRE0_EL1), SYS_PIRE0);
+ }
+
+ if (ctxt_has_s1poe(ctxt))
+ write_sysreg_el1(ctxt_sys_reg(ctxt, POR_EL1), SYS_POR);
+ }
write_sysreg_el1(ctxt_sys_reg(ctxt, ESR_EL1), SYS_ESR);
write_sysreg_el1(ctxt_sys_reg(ctxt, AFSR0_EL1), SYS_AFSR0);
write_sysreg_el1(ctxt_sys_reg(ctxt, AFSR1_EL1), SYS_AFSR1);
@@ -154,12 +282,33 @@ static inline void __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt)
write_sysreg(ctxt_sys_reg(ctxt, SP_EL1), sp_el1);
write_sysreg_el1(ctxt_sys_reg(ctxt, ELR_EL1), SYS_ELR);
write_sysreg_el1(ctxt_sys_reg(ctxt, SPSR_EL1), SYS_SPSR);
+
+ if (ctxt_has_sctlr2(ctxt))
+ write_sysreg_el1(ctxt_sys_reg(ctxt, SCTLR2_EL1), SYS_SCTLR2);
+}
+
+/* Read the VCPU state's PSTATE, but translate (v)EL2 to EL1. */
+static inline u64 to_hw_pstate(const struct kvm_cpu_context *ctxt)
+{
+ u64 mode = ctxt->regs.pstate & (PSR_MODE_MASK | PSR_MODE32_BIT);
+
+ switch (mode) {
+ case PSR_MODE_EL2t:
+ mode = PSR_MODE_EL1t;
+ break;
+ case PSR_MODE_EL2h:
+ mode = PSR_MODE_EL1h;
+ break;
+ }
+
+ return (ctxt->regs.pstate & ~(PSR_MODE_MASK | PSR_MODE32_BIT)) | mode;
}
static inline void __sysreg_restore_el2_return_state(struct kvm_cpu_context *ctxt)
{
- u64 pstate = ctxt->regs.pstate;
+ u64 pstate = to_hw_pstate(ctxt);
u64 mode = pstate & PSR_AA32_MODE_MASK;
+ u64 vdisr;
/*
* Safety check to ensure we're setting the CPU up to enter the guest
@@ -178,8 +327,17 @@ static inline void __sysreg_restore_el2_return_state(struct kvm_cpu_context *ctx
write_sysreg_el2(ctxt->regs.pc, SYS_ELR);
write_sysreg_el2(pstate, SYS_SPSR);
- if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN))
- write_sysreg_s(ctxt_sys_reg(ctxt, DISR_EL1), SYS_VDISR_EL2);
+ if (!cpus_have_final_cap(ARM64_HAS_RAS_EXTN))
+ return;
+
+ if (!vserror_state_is_nested(ctxt_to_vcpu(ctxt)))
+ vdisr = ctxt_sys_reg(ctxt, DISR_EL1);
+ else if (ctxt_has_ras(ctxt))
+ vdisr = ctxt_sys_reg(ctxt, VDISR_EL2);
+ else
+ vdisr = 0;
+
+ write_sysreg_s(vdisr, SYS_VDISR_EL2);
}
static inline void __sysreg32_save_state(struct kvm_vcpu *vcpu)
@@ -192,11 +350,11 @@ static inline void __sysreg32_save_state(struct kvm_vcpu *vcpu)
vcpu->arch.ctxt.spsr_irq = read_sysreg(spsr_irq);
vcpu->arch.ctxt.spsr_fiq = read_sysreg(spsr_fiq);
- __vcpu_sys_reg(vcpu, DACR32_EL2) = read_sysreg(dacr32_el2);
- __vcpu_sys_reg(vcpu, IFSR32_EL2) = read_sysreg(ifsr32_el2);
+ __vcpu_assign_sys_reg(vcpu, DACR32_EL2, read_sysreg(dacr32_el2));
+ __vcpu_assign_sys_reg(vcpu, IFSR32_EL2, read_sysreg(ifsr32_el2));
- if (has_vhe() || vcpu_get_flag(vcpu, DEBUG_DIRTY))
- __vcpu_sys_reg(vcpu, DBGVCR32_EL2) = read_sysreg(dbgvcr32_el2);
+ if (has_vhe() || kvm_debug_regs_in_use(vcpu))
+ __vcpu_assign_sys_reg(vcpu, DBGVCR32_EL2, read_sysreg(dbgvcr32_el2));
}
static inline void __sysreg32_restore_state(struct kvm_vcpu *vcpu)
@@ -212,7 +370,7 @@ static inline void __sysreg32_restore_state(struct kvm_vcpu *vcpu)
write_sysreg(__vcpu_sys_reg(vcpu, DACR32_EL2), dacr32_el2);
write_sysreg(__vcpu_sys_reg(vcpu, IFSR32_EL2), ifsr32_el2);
- if (has_vhe() || vcpu_get_flag(vcpu, DEBUG_DIRTY))
+ if (has_vhe() || kvm_debug_regs_in_use(vcpu))
write_sysreg(__vcpu_sys_reg(vcpu, DBGVCR32_EL2), dbgvcr32_el2);
}
diff --git a/arch/arm64/kvm/hyp/include/nvhe/ffa.h b/arch/arm64/kvm/hyp/include/nvhe/ffa.h
new file mode 100644
index 000000000000..146e0aebfa1c
--- /dev/null
+++ b/arch/arm64/kvm/hyp/include/nvhe/ffa.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2022 - Google LLC
+ * Author: Andrew Walbran <qwandor@google.com>
+ */
+#ifndef __KVM_HYP_FFA_H
+#define __KVM_HYP_FFA_H
+
+#include <asm/kvm_host.h>
+
+#define FFA_MIN_FUNC_NUM 0x60
+#define FFA_MAX_FUNC_NUM 0xFF
+
+int hyp_ffa_init(void *pages);
+bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt, u32 func_id);
+
+#endif /* __KVM_HYP_FFA_H */
diff --git a/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h b/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h
deleted file mode 100644
index 07edfc7524c9..000000000000
--- a/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h
+++ /dev/null
@@ -1,205 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2021 Google LLC
- * Author: Fuad Tabba <tabba@google.com>
- */
-
-#ifndef __ARM64_KVM_FIXED_CONFIG_H__
-#define __ARM64_KVM_FIXED_CONFIG_H__
-
-#include <asm/sysreg.h>
-
-/*
- * This file contains definitions for features to be allowed or restricted for
- * guest virtual machines, depending on the mode KVM is running in and on the
- * type of guest that is running.
- *
- * The ALLOW masks represent a bitmask of feature fields that are allowed
- * without any restrictions as long as they are supported by the system.
- *
- * The RESTRICT_UNSIGNED masks, if present, represent unsigned fields for
- * features that are restricted to support at most the specified feature.
- *
- * If a feature field is not present in either, than it is not supported.
- *
- * The approach taken for protected VMs is to allow features that are:
- * - Needed by common Linux distributions (e.g., floating point)
- * - Trivial to support, e.g., supporting the feature does not introduce or
- * require tracking of additional state in KVM
- * - Cannot be trapped or prevent the guest from using anyway
- */
-
-/*
- * Allow for protected VMs:
- * - Floating-point and Advanced SIMD
- * - Data Independent Timing
- */
-#define PVM_ID_AA64PFR0_ALLOW (\
- ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_FP) | \
- ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AdvSIMD) | \
- ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_DIT) \
- )
-
-/*
- * Restrict to the following *unsigned* features for protected VMs:
- * - AArch64 guests only (no support for AArch32 guests):
- * AArch32 adds complexity in trap handling, emulation, condition codes,
- * etc...
- * - RAS (v1)
- * Supported by KVM
- */
-#define PVM_ID_AA64PFR0_RESTRICT_UNSIGNED (\
- FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL0), ID_AA64PFR0_EL1_ELx_64BIT_ONLY) | \
- FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL1), ID_AA64PFR0_EL1_ELx_64BIT_ONLY) | \
- FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL2), ID_AA64PFR0_EL1_ELx_64BIT_ONLY) | \
- FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL3), ID_AA64PFR0_EL1_ELx_64BIT_ONLY) | \
- FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_RAS), ID_AA64PFR0_EL1_RAS_IMP) \
- )
-
-/*
- * Allow for protected VMs:
- * - Branch Target Identification
- * - Speculative Store Bypassing
- */
-#define PVM_ID_AA64PFR1_ALLOW (\
- ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_BT) | \
- ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_SSBS) \
- )
-
-/*
- * Allow for protected VMs:
- * - Mixed-endian
- * - Distinction between Secure and Non-secure Memory
- * - Mixed-endian at EL0 only
- * - Non-context synchronizing exception entry and exit
- */
-#define PVM_ID_AA64MMFR0_ALLOW (\
- ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_BIGEND) | \
- ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_SNSMEM) | \
- ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_BIGENDEL0) | \
- ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_EXS) \
- )
-
-/*
- * Restrict to the following *unsigned* features for protected VMs:
- * - 40-bit IPA
- * - 16-bit ASID
- */
-#define PVM_ID_AA64MMFR0_RESTRICT_UNSIGNED (\
- FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_PARANGE), ID_AA64MMFR0_EL1_PARANGE_40) | \
- FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_ASIDBITS), ID_AA64MMFR0_EL1_ASIDBITS_16) \
- )
-
-/*
- * Allow for protected VMs:
- * - Hardware translation table updates to Access flag and Dirty state
- * - Number of VMID bits from CPU
- * - Hierarchical Permission Disables
- * - Privileged Access Never
- * - SError interrupt exceptions from speculative reads
- * - Enhanced Translation Synchronization
- */
-#define PVM_ID_AA64MMFR1_ALLOW (\
- ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_HAFDBS) | \
- ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_VMIDBits) | \
- ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_HPDS) | \
- ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_PAN) | \
- ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_SpecSEI) | \
- ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_ETS) \
- )
-
-/*
- * Allow for protected VMs:
- * - Common not Private translations
- * - User Access Override
- * - IESB bit in the SCTLR_ELx registers
- * - Unaligned single-copy atomicity and atomic functions
- * - ESR_ELx.EC value on an exception by read access to feature ID space
- * - TTL field in address operations.
- * - Break-before-make sequences when changing translation block size
- * - E0PDx mechanism
- */
-#define PVM_ID_AA64MMFR2_ALLOW (\
- ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_CnP) | \
- ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_UAO) | \
- ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_IESB) | \
- ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_AT) | \
- ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_IDS) | \
- ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_TTL) | \
- ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_BBM) | \
- ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_E0PD) \
- )
-
-/*
- * No support for Scalable Vectors for protected VMs:
- * Requires additional support from KVM, e.g., context-switching and
- * trapping at EL2
- */
-#define PVM_ID_AA64ZFR0_ALLOW (0ULL)
-
-/*
- * No support for debug, including breakpoints, and watchpoints for protected
- * VMs:
- * The Arm architecture mandates support for at least the Armv8 debug
- * architecture, which would include at least 2 hardware breakpoints and
- * watchpoints. Providing that support to protected guests adds
- * considerable state and complexity. Therefore, the reserved value of 0 is
- * used for debug-related fields.
- */
-#define PVM_ID_AA64DFR0_ALLOW (0ULL)
-#define PVM_ID_AA64DFR1_ALLOW (0ULL)
-
-/*
- * No support for implementation defined features.
- */
-#define PVM_ID_AA64AFR0_ALLOW (0ULL)
-#define PVM_ID_AA64AFR1_ALLOW (0ULL)
-
-/*
- * No restrictions on instructions implemented in AArch64.
- */
-#define PVM_ID_AA64ISAR0_ALLOW (\
- ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_AES) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SHA1) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SHA2) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_CRC32) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_ATOMIC) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_RDM) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SHA3) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SM3) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SM4) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_DP) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_FHM) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_TS) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_TLB) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_RNDR) \
- )
-
-#define PVM_ID_AA64ISAR1_ALLOW (\
- ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_DPB) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_JSCVT) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_FCMA) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_LRCPC) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPA) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_FRINTTS) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_SB) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_SPECRES) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_BF16) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_DGH) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_I8MM) \
- )
-
-#define PVM_ID_AA64ISAR2_ALLOW (\
- ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_GPA3) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_APA3) \
- )
-
-u64 pvm_read_id_reg(const struct kvm_vcpu *vcpu, u32 id);
-bool kvm_handle_pvm_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code);
-bool kvm_handle_pvm_restricted(struct kvm_vcpu *vcpu, u64 *exit_code);
-int kvm_check_pvm_sysreg_table(void);
-
-#endif /* __ARM64_KVM_FIXED_CONFIG_H__ */
diff --git a/arch/arm64/kvm/hyp/include/nvhe/gfp.h b/arch/arm64/kvm/hyp/include/nvhe/gfp.h
index 0a048dc06a7d..3766333bace9 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/gfp.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/gfp.h
@@ -7,7 +7,7 @@
#include <nvhe/memory.h>
#include <nvhe/spinlock.h>
-#define HYP_NO_ORDER USHRT_MAX
+#define HYP_NO_ORDER ((u8)(~0))
struct hyp_pool {
/*
@@ -16,14 +16,14 @@ struct hyp_pool {
* API at EL2.
*/
hyp_spinlock_t lock;
- struct list_head free_area[MAX_ORDER];
+ struct list_head free_area[NR_PAGE_ORDERS];
phys_addr_t range_start;
phys_addr_t range_end;
- unsigned short max_order;
+ u8 max_order;
};
/* Allocation */
-void *hyp_alloc_pages(struct hyp_pool *pool, unsigned short order);
+void *hyp_alloc_pages(struct hyp_pool *pool, u8 order);
void hyp_split_page(struct hyp_page *page);
void hyp_get_page(struct hyp_pool *pool, void *addr);
void hyp_put_page(struct hyp_pool *pool, void *addr);
diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
index b7bdbe63deed..5f9d56754e39 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
@@ -11,40 +11,10 @@
#include <asm/kvm_mmu.h>
#include <asm/kvm_pgtable.h>
#include <asm/virt.h>
+#include <nvhe/memory.h>
#include <nvhe/pkvm.h>
#include <nvhe/spinlock.h>
-/*
- * SW bits 0-1 are reserved to track the memory ownership state of each page:
- * 00: The page is owned exclusively by the page-table owner.
- * 01: The page is owned by the page-table owner, but is shared
- * with another entity.
- * 10: The page is shared with, but not owned by the page-table owner.
- * 11: Reserved for future use (lending).
- */
-enum pkvm_page_state {
- PKVM_PAGE_OWNED = 0ULL,
- PKVM_PAGE_SHARED_OWNED = KVM_PGTABLE_PROT_SW0,
- PKVM_PAGE_SHARED_BORROWED = KVM_PGTABLE_PROT_SW1,
- __PKVM_PAGE_RESERVED = KVM_PGTABLE_PROT_SW0 |
- KVM_PGTABLE_PROT_SW1,
-
- /* Meta-states which aren't encoded directly in the PTE's SW bits */
- PKVM_NOPAGE,
-};
-
-#define PKVM_PAGE_STATE_PROT_MASK (KVM_PGTABLE_PROT_SW0 | KVM_PGTABLE_PROT_SW1)
-static inline enum kvm_pgtable_prot pkvm_mkstate(enum kvm_pgtable_prot prot,
- enum pkvm_page_state state)
-{
- return (prot & ~PKVM_PAGE_STATE_PROT_MASK) | state;
-}
-
-static inline enum pkvm_page_state pkvm_getstate(enum kvm_pgtable_prot prot)
-{
- return prot & PKVM_PAGE_STATE_PROT_MASK;
-}
-
struct host_mmu {
struct kvm_arch arch;
struct kvm_pgtable pgt;
@@ -57,6 +27,7 @@ extern struct host_mmu host_mmu;
enum pkvm_component_id {
PKVM_ID_HOST,
PKVM_ID_HYP,
+ PKVM_ID_FFA,
};
extern unsigned long hyp_nr_cpus;
@@ -66,6 +37,15 @@ int __pkvm_host_share_hyp(u64 pfn);
int __pkvm_host_unshare_hyp(u64 pfn);
int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages);
int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages);
+int __pkvm_host_share_ffa(u64 pfn, u64 nr_pages);
+int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages);
+int __pkvm_host_share_guest(u64 pfn, u64 gfn, u64 nr_pages, struct pkvm_hyp_vcpu *vcpu,
+ enum kvm_pgtable_prot prot);
+int __pkvm_host_unshare_guest(u64 gfn, u64 nr_pages, struct pkvm_hyp_vm *hyp_vm);
+int __pkvm_host_relax_perms_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu, enum kvm_pgtable_prot prot);
+int __pkvm_host_wrprotect_guest(u64 gfn, u64 nr_pages, struct pkvm_hyp_vm *hyp_vm);
+int __pkvm_host_test_clear_young_guest(u64 gfn, u64 nr_pages, bool mkold, struct pkvm_hyp_vm *vm);
+int __pkvm_host_mkyoung_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu);
bool addr_is_memory(phys_addr_t phys);
int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot);
@@ -76,7 +56,7 @@ void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt);
int hyp_pin_shared_mem(void *from, void *to);
void hyp_unpin_shared_mem(void *from, void *to);
-void reclaim_guest_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc);
+void reclaim_pgtable_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc);
int refill_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages,
struct kvm_hyp_memcache *host_mc);
@@ -87,4 +67,10 @@ static __always_inline void __load_host_stage2(void)
else
write_sysreg(0, vttbr_el2);
}
+
+#ifdef CONFIG_NVHE_EL2_DEBUG
+void pkvm_ownership_selftest(void *base);
+#else
+static inline void pkvm_ownership_selftest(void *base) { }
+#endif
#endif /* __KVM_NVHE_MEM_PROTECT__ */
diff --git a/arch/arm64/kvm/hyp/include/nvhe/memory.h b/arch/arm64/kvm/hyp/include/nvhe/memory.h
index ab205c4d6774..dee1a406b0c2 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/memory.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/memory.h
@@ -7,9 +7,61 @@
#include <linux/types.h>
+/*
+ * Bits 0-1 are used to encode the memory ownership state of each page from the
+ * point of view of a pKVM "component" (host, hyp, guest, ... see enum
+ * pkvm_component_id):
+ * 00: The page is owned and exclusively accessible by the component;
+ * 01: The page is owned and accessible by the component, but is also
+ * accessible by another component;
+ * 10: The page is accessible but not owned by the component;
+ * The storage of this state depends on the component: either in the
+ * hyp_vmemmap for the host and hyp states or in PTE software bits for guests.
+ */
+enum pkvm_page_state {
+ PKVM_PAGE_OWNED = 0ULL,
+ PKVM_PAGE_SHARED_OWNED = BIT(0),
+ PKVM_PAGE_SHARED_BORROWED = BIT(1),
+
+ /*
+ * 'Meta-states' are not stored directly in PTE SW bits for guest
+ * states, but inferred from the context (e.g. invalid PTE entries).
+ * For the host and hyp, meta-states are stored directly in the
+ * struct hyp_page.
+ */
+ PKVM_NOPAGE = BIT(0) | BIT(1),
+};
+#define PKVM_PAGE_STATE_MASK (BIT(0) | BIT(1))
+
+#define PKVM_PAGE_STATE_PROT_MASK (KVM_PGTABLE_PROT_SW0 | KVM_PGTABLE_PROT_SW1)
+static inline enum kvm_pgtable_prot pkvm_mkstate(enum kvm_pgtable_prot prot,
+ enum pkvm_page_state state)
+{
+ prot &= ~PKVM_PAGE_STATE_PROT_MASK;
+ prot |= FIELD_PREP(PKVM_PAGE_STATE_PROT_MASK, state);
+ return prot;
+}
+
+static inline enum pkvm_page_state pkvm_getstate(enum kvm_pgtable_prot prot)
+{
+ return FIELD_GET(PKVM_PAGE_STATE_PROT_MASK, prot);
+}
+
struct hyp_page {
- unsigned short refcount;
- unsigned short order;
+ u16 refcount;
+ u8 order;
+
+ /* Host state. Guarded by the host stage-2 lock. */
+ unsigned __host_state : 4;
+
+ /*
+ * Complement of the hyp state. Guarded by the hyp stage-1 lock. We use
+ * the complement so that the initial 0 in __hyp_state_comp (due to the
+ * entire vmemmap starting off zeroed) encodes PKVM_NOPAGE.
+ */
+ unsigned __hyp_state_comp : 4;
+
+ u32 host_share_guest_count;
};
extern u64 __hyp_vmemmap;
@@ -29,7 +81,13 @@ static inline phys_addr_t hyp_virt_to_phys(void *addr)
#define hyp_phys_to_pfn(phys) ((phys) >> PAGE_SHIFT)
#define hyp_pfn_to_phys(pfn) ((phys_addr_t)((pfn) << PAGE_SHIFT))
-#define hyp_phys_to_page(phys) (&hyp_vmemmap[hyp_phys_to_pfn(phys)])
+
+static inline struct hyp_page *hyp_phys_to_page(phys_addr_t phys)
+{
+ BUILD_BUG_ON(sizeof(struct hyp_page) != sizeof(u64));
+ return &hyp_vmemmap[hyp_phys_to_pfn(phys)];
+}
+
#define hyp_virt_to_page(virt) hyp_phys_to_page(__hyp_pa(virt))
#define hyp_virt_to_pfn(virt) hyp_phys_to_pfn(__hyp_pa(virt))
@@ -38,6 +96,26 @@ static inline phys_addr_t hyp_virt_to_phys(void *addr)
#define hyp_page_to_virt(page) __hyp_va(hyp_page_to_phys(page))
#define hyp_page_to_pool(page) (((struct hyp_page *)page)->pool)
+static inline enum pkvm_page_state get_host_state(struct hyp_page *p)
+{
+ return p->__host_state;
+}
+
+static inline void set_host_state(struct hyp_page *p, enum pkvm_page_state state)
+{
+ p->__host_state = state;
+}
+
+static inline enum pkvm_page_state get_hyp_state(struct hyp_page *p)
+{
+ return p->__hyp_state_comp ^ PKVM_PAGE_STATE_MASK;
+}
+
+static inline void set_hyp_state(struct hyp_page *p, enum pkvm_page_state state)
+{
+ p->__hyp_state_comp = state ^ PKVM_PAGE_STATE_MASK;
+}
+
/*
* Refcounting for 'struct hyp_page'.
* hyp_pool::lock must be held if atomic access to the refcount is required.
diff --git a/arch/arm64/kvm/hyp/include/nvhe/mm.h b/arch/arm64/kvm/hyp/include/nvhe/mm.h
index d5ec972b5c1e..6e83ce35c2f2 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/mm.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/mm.h
@@ -13,9 +13,11 @@
extern struct kvm_pgtable pkvm_pgtable;
extern hyp_spinlock_t pkvm_pgd_lock;
-int hyp_create_pcpu_fixmap(void);
+int hyp_create_fixmap(void);
void *hyp_fixmap_map(phys_addr_t phys);
void hyp_fixmap_unmap(void);
+void *hyp_fixblock_map(phys_addr_t phys, size_t *size);
+void hyp_fixblock_unmap(void);
int hyp_create_idmap(u32 hyp_va_bits);
int hyp_map_vectors(void);
@@ -26,6 +28,7 @@ int pkvm_create_mappings_locked(void *from, void *to, enum kvm_pgtable_prot prot
int __pkvm_create_private_mapping(phys_addr_t phys, size_t size,
enum kvm_pgtable_prot prot,
unsigned long *haddr);
+int pkvm_create_stack(phys_addr_t phys, unsigned long *haddr);
int pkvm_alloc_private_va_range(size_t size, unsigned long *haddr);
#endif /* __KVM_HYP_MM_H */
diff --git a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h
index 82b3d62538a6..184ad7a39950 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h
@@ -20,10 +20,16 @@ struct pkvm_hyp_vcpu {
/* Backpointer to the host's (untrusted) vCPU instance. */
struct kvm_vcpu *host_vcpu;
+
+ /*
+ * If this hyp vCPU is loaded, then this is a backpointer to the
+ * per-cpu pointer tracking us. Otherwise, NULL if not loaded.
+ */
+ struct pkvm_hyp_vcpu **loaded_hyp_vcpu;
};
/*
- * Holds the relevant data for running a protected vm.
+ * Holds the relevant data for running a vm in protected mode.
*/
struct pkvm_hyp_vm {
struct kvm kvm;
@@ -37,24 +43,32 @@ struct pkvm_hyp_vm {
struct hyp_pool pool;
hyp_spinlock_t lock;
- /*
- * The number of vcpus initialized and ready to run.
- * Modifying this is protected by 'vm_table_lock'.
- */
- unsigned int nr_vcpus;
-
/* Array of the hyp vCPU structures for this VM. */
struct pkvm_hyp_vcpu *vcpus[];
};
+extern hyp_spinlock_t vm_table_lock;
+
static inline struct pkvm_hyp_vm *
pkvm_hyp_vcpu_to_hyp_vm(struct pkvm_hyp_vcpu *hyp_vcpu)
{
return container_of(hyp_vcpu->vcpu.kvm, struct pkvm_hyp_vm, kvm);
}
+static inline bool pkvm_hyp_vcpu_is_protected(struct pkvm_hyp_vcpu *hyp_vcpu)
+{
+ return vcpu_is_protected(&hyp_vcpu->vcpu);
+}
+
+static inline bool pkvm_hyp_vm_is_protected(struct pkvm_hyp_vm *hyp_vm)
+{
+ return kvm_vm_is_protected(&hyp_vm->kvm);
+}
+
void pkvm_hyp_vm_table_init(void *tbl);
+int __pkvm_reserve_vm(void);
+void __pkvm_unreserve_vm(pkvm_handle_t handle);
int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva,
unsigned long pgd_hva);
int __pkvm_init_vcpu(pkvm_handle_t handle, struct kvm_vcpu *host_vcpu,
@@ -64,5 +78,15 @@ int __pkvm_teardown_vm(pkvm_handle_t handle);
struct pkvm_hyp_vcpu *pkvm_load_hyp_vcpu(pkvm_handle_t handle,
unsigned int vcpu_idx);
void pkvm_put_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu);
+struct pkvm_hyp_vcpu *pkvm_get_loaded_hyp_vcpu(void);
+
+struct pkvm_hyp_vm *get_pkvm_hyp_vm(pkvm_handle_t handle);
+struct pkvm_hyp_vm *get_np_pkvm_hyp_vm(pkvm_handle_t handle);
+void put_pkvm_hyp_vm(struct pkvm_hyp_vm *hyp_vm);
+
+bool kvm_handle_pvm_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code);
+bool kvm_handle_pvm_restricted(struct kvm_vcpu *vcpu, u64 *exit_code);
+void kvm_init_pvm_id_regs(struct kvm_vcpu *vcpu);
+int kvm_check_pvm_sysreg_table(void);
#endif /* __ARM64_KVM_NVHE_PKVM_H__ */
diff --git a/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h b/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h
index 45a84f0ade04..ba5382c12787 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h
@@ -12,9 +12,8 @@
#include <asm/kvm_host.h>
#define cpu_reg(ctxt, r) (ctxt)->regs.regs[r]
-#define DECLARE_REG(type, name, ctxt, reg) \
+#define DECLARE_REG(type, name, ctxt, reg) \
+ __always_unused int ___check_reg_ ## reg; \
type name = (type)cpu_reg(ctxt, (reg))
-void __pkvm_vcpu_init_traps(struct kvm_vcpu *vcpu);
-
#endif /* __ARM64_KVM_NVHE_TRAP_HANDLER_H__ */
diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile
index 530347cdebe3..a244ec25f8c5 100644
--- a/arch/arm64/kvm/hyp/nvhe/Makefile
+++ b/arch/arm64/kvm/hyp/nvhe/Makefile
@@ -12,7 +12,7 @@ asflags-y := -D__KVM_NVHE_HYPERVISOR__ -D__DISABLE_EXPORTS
ccflags-y := -D__KVM_NVHE_HYPERVISOR__ -D__DISABLE_EXPORTS -D__DISABLE_TRACE_MMIO__
ccflags-y += -fno-stack-protector \
-DDISABLE_BRANCH_PROFILING \
- $(DISABLE_STACKLEAK_PLUGIN)
+ $(DISABLE_KSTACK_ERASE)
hostprogs := gen-hyprel
HOST_EXTRACFLAGS += -I$(objtree)/include
@@ -20,12 +20,15 @@ HOST_EXTRACFLAGS += -I$(objtree)/include
lib-objs := clear_page.o copy_page.o memcpy.o memset.o
lib-objs := $(addprefix ../../../lib/, $(lib-objs))
+CFLAGS_switch.nvhe.o += -Wno-override-init
+
hyp-obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o \
hyp-main.o hyp-smp.o psci-relay.o early_alloc.o page_alloc.o \
- cache.o setup.o mm.o mem_protect.o sys_regs.o pkvm.o stacktrace.o
+ cache.o setup.o mm.o mem_protect.o sys_regs.o pkvm.o stacktrace.o ffa.o
hyp-obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \
../fpsimd.o ../hyp-entry.o ../exception.o ../pgtable.o
-hyp-obj-$(CONFIG_DEBUG_LIST) += list_debug.o
+hyp-obj-y += ../../../kernel/smccc-call.o
+hyp-obj-$(CONFIG_LIST_HARDENED) += list_debug.o
hyp-obj-y += $(lib-objs)
##
@@ -89,24 +92,17 @@ quiet_cmd_hyprel = HYPREL $@
quiet_cmd_hypcopy = HYPCOPY $@
cmd_hypcopy = $(OBJCOPY) --prefix-symbols=__kvm_nvhe_ $< $@
-# Remove ftrace, Shadow Call Stack, and CFI CFLAGS.
-# This is equivalent to the 'notrace', '__noscs', and '__nocfi' annotations.
-KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS) $(CC_FLAGS_CFI), $(KBUILD_CFLAGS))
+# Remove ftrace and Shadow Call Stack CFLAGS.
+# This is equivalent to the 'notrace' and '__noscs' annotations.
+KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS), $(KBUILD_CFLAGS))
# Starting from 13.0.0 llvm emits SHT_REL section '.llvm.call-graph-profile'
# when profile optimization is applied. gen-hyprel does not support SHT_REL and
# causes a build failure. Remove profile optimization flags.
KBUILD_CFLAGS := $(filter-out -fprofile-sample-use=% -fprofile-use=%, $(KBUILD_CFLAGS))
KBUILD_CFLAGS += -fno-asynchronous-unwind-tables -fno-unwind-tables
-# KVM nVHE code is run at a different exception code with a different map, so
-# compiler instrumentation that inserts callbacks or checks into the code may
-# cause crashes. Just disable it.
-GCOV_PROFILE := n
-KASAN_SANITIZE := n
-KCSAN_SANITIZE := n
-UBSAN_SANITIZE := n
-KCOV_INSTRUMENT := n
-
-# Skip objtool checking for this directory because nVHE code is compiled with
-# non-standard build rules.
-OBJECT_FILES_NON_STANDARD := y
+ifeq ($(CONFIG_UBSAN_KVM_EL2),y)
+UBSAN_SANITIZE := y
+# Always use brk and not hooks
+ccflags-y += $(CFLAGS_UBSAN_TRAP)
+endif
diff --git a/arch/arm64/kvm/hyp/nvhe/debug-sr.c b/arch/arm64/kvm/hyp/nvhe/debug-sr.c
index e17455773b98..2a1c0f49792b 100644
--- a/arch/arm64/kvm/hyp/nvhe/debug-sr.c
+++ b/arch/arm64/kvm/hyp/nvhe/debug-sr.c
@@ -27,17 +27,16 @@ static void __debug_save_spe(u64 *pmscr_el1)
* Check if the host is actually using it ?
*/
reg = read_sysreg_s(SYS_PMBLIMITR_EL1);
- if (!(reg & BIT(SYS_PMBLIMITR_EL1_E_SHIFT)))
+ if (!(reg & BIT(PMBLIMITR_EL1_E_SHIFT)))
return;
/* Yes; save the control register and disable data generation */
- *pmscr_el1 = read_sysreg_s(SYS_PMSCR_EL1);
- write_sysreg_s(0, SYS_PMSCR_EL1);
+ *pmscr_el1 = read_sysreg_el1(SYS_PMSCR);
+ write_sysreg_el1(0, SYS_PMSCR);
isb();
/* Now drain all buffered data to memory */
psb_csync();
- dsb(nsh);
}
static void __debug_restore_spe(u64 pmscr_el1)
@@ -49,46 +48,88 @@ static void __debug_restore_spe(u64 pmscr_el1)
isb();
/* Re-enable data generation */
- write_sysreg_s(pmscr_el1, SYS_PMSCR_EL1);
+ write_sysreg_el1(pmscr_el1, SYS_PMSCR);
}
-static void __debug_save_trace(u64 *trfcr_el1)
+static void __trace_do_switch(u64 *saved_trfcr, u64 new_trfcr)
{
- *trfcr_el1 = 0;
+ *saved_trfcr = read_sysreg_el1(SYS_TRFCR);
+ write_sysreg_el1(new_trfcr, SYS_TRFCR);
+}
+
+static bool __trace_needs_drain(void)
+{
+ if (is_protected_kvm_enabled() && host_data_test_flag(HAS_TRBE))
+ return read_sysreg_s(SYS_TRBLIMITR_EL1) & TRBLIMITR_EL1_E;
+
+ return host_data_test_flag(TRBE_ENABLED);
+}
+
+static bool __trace_needs_switch(void)
+{
+ return host_data_test_flag(TRBE_ENABLED) ||
+ host_data_test_flag(EL1_TRACING_CONFIGURED);
+}
+
+static void __trace_switch_to_guest(void)
+{
+ /* Unsupported with TRBE so disable */
+ if (host_data_test_flag(TRBE_ENABLED))
+ *host_data_ptr(trfcr_while_in_guest) = 0;
+
+ __trace_do_switch(host_data_ptr(host_debug_state.trfcr_el1),
+ *host_data_ptr(trfcr_while_in_guest));
+
+ if (__trace_needs_drain()) {
+ isb();
+ tsb_csync();
+ }
+}
- /* Check if the TRBE is enabled */
- if (!(read_sysreg_s(SYS_TRBLIMITR_EL1) & TRBLIMITR_ENABLE))
+static void __trace_switch_to_host(void)
+{
+ __trace_do_switch(host_data_ptr(trfcr_while_in_guest),
+ *host_data_ptr(host_debug_state.trfcr_el1));
+}
+
+static void __debug_save_brbe(u64 *brbcr_el1)
+{
+ *brbcr_el1 = 0;
+
+ /* Check if the BRBE is enabled */
+ if (!(read_sysreg_el1(SYS_BRBCR) & (BRBCR_ELx_E0BRE | BRBCR_ELx_ExBRE)))
return;
+
/*
- * Prohibit trace generation while we are in guest.
- * Since access to TRFCR_EL1 is trapped, the guest can't
+ * Prohibit branch record generation while we are in guest.
+ * Since access to BRBCR_EL1 is trapped, the guest can't
* modify the filtering set by the host.
*/
- *trfcr_el1 = read_sysreg_s(SYS_TRFCR_EL1);
- write_sysreg_s(0, SYS_TRFCR_EL1);
- isb();
- /* Drain the trace buffer to memory */
- tsb_csync();
- dsb(nsh);
+ *brbcr_el1 = read_sysreg_el1(SYS_BRBCR);
+ write_sysreg_el1(0, SYS_BRBCR);
}
-static void __debug_restore_trace(u64 trfcr_el1)
+static void __debug_restore_brbe(u64 brbcr_el1)
{
- if (!trfcr_el1)
+ if (!brbcr_el1)
return;
- /* Restore trace filter controls */
- write_sysreg_s(trfcr_el1, SYS_TRFCR_EL1);
+ /* Restore BRBE controls */
+ write_sysreg_el1(brbcr_el1, SYS_BRBCR);
}
void __debug_save_host_buffers_nvhe(struct kvm_vcpu *vcpu)
{
/* Disable and flush SPE data generation */
- if (vcpu_get_flag(vcpu, DEBUG_STATE_SAVE_SPE))
- __debug_save_spe(&vcpu->arch.host_debug_state.pmscr_el1);
- /* Disable and flush Self-Hosted Trace generation */
- if (vcpu_get_flag(vcpu, DEBUG_STATE_SAVE_TRBE))
- __debug_save_trace(&vcpu->arch.host_debug_state.trfcr_el1);
+ if (host_data_test_flag(HAS_SPE))
+ __debug_save_spe(host_data_ptr(host_debug_state.pmscr_el1));
+
+ /* Disable BRBE branch records */
+ if (host_data_test_flag(HAS_BRBE))
+ __debug_save_brbe(host_data_ptr(host_debug_state.brbcr_el1));
+
+ if (__trace_needs_switch())
+ __trace_switch_to_guest();
}
void __debug_switch_to_guest(struct kvm_vcpu *vcpu)
@@ -98,18 +139,15 @@ void __debug_switch_to_guest(struct kvm_vcpu *vcpu)
void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu)
{
- if (vcpu_get_flag(vcpu, DEBUG_STATE_SAVE_SPE))
- __debug_restore_spe(vcpu->arch.host_debug_state.pmscr_el1);
- if (vcpu_get_flag(vcpu, DEBUG_STATE_SAVE_TRBE))
- __debug_restore_trace(vcpu->arch.host_debug_state.trfcr_el1);
+ if (host_data_test_flag(HAS_SPE))
+ __debug_restore_spe(*host_data_ptr(host_debug_state.pmscr_el1));
+ if (host_data_test_flag(HAS_BRBE))
+ __debug_restore_brbe(*host_data_ptr(host_debug_state.brbcr_el1));
+ if (__trace_needs_switch())
+ __trace_switch_to_host();
}
void __debug_switch_to_host(struct kvm_vcpu *vcpu)
{
__debug_switch_to_host_common(vcpu);
}
-
-u64 __kvm_get_mdcr_el2(void)
-{
- return read_sysreg(mdcr_el2);
-}
diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c
new file mode 100644
index 000000000000..f731cc4c3f28
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/ffa.c
@@ -0,0 +1,993 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * FF-A v1.0 proxy to filter out invalid memory-sharing SMC calls issued by
+ * the host. FF-A is a slightly more palatable abbreviation of "Arm Firmware
+ * Framework for Arm A-profile", which is specified by Arm in document
+ * number DEN0077.
+ *
+ * Copyright (C) 2022 - Google LLC
+ * Author: Andrew Walbran <qwandor@google.com>
+ *
+ * This driver hooks into the SMC trapping logic for the host and intercepts
+ * all calls falling within the FF-A range. Each call is either:
+ *
+ * - Forwarded on unmodified to the SPMD at EL3
+ * - Rejected as "unsupported"
+ * - Accompanied by a host stage-2 page-table check/update and reissued
+ *
+ * Consequently, any attempts by the host to make guest memory pages
+ * accessible to the secure world using FF-A will be detected either here
+ * (in the case that the memory is already owned by the guest) or during
+ * donation to the guest (in the case that the memory was previously shared
+ * with the secure world).
+ *
+ * To allow the rolling-back of page-table updates and FF-A calls in the
+ * event of failure, operations involving the RXTX buffers are locked for
+ * the duration and are therefore serialised.
+ */
+
+#include <linux/arm-smccc.h>
+#include <linux/arm_ffa.h>
+#include <asm/kvm_pkvm.h>
+
+#include <nvhe/ffa.h>
+#include <nvhe/mem_protect.h>
+#include <nvhe/memory.h>
+#include <nvhe/trap_handler.h>
+#include <nvhe/spinlock.h>
+
+/*
+ * "ID value 0 must be returned at the Non-secure physical FF-A instance"
+ * We share this ID with the host.
+ */
+#define HOST_FFA_ID 0
+
+/*
+ * A buffer to hold the maximum descriptor size we can see from the host,
+ * which is required when the SPMD returns a fragmented FFA_MEM_RETRIEVE_RESP
+ * when resolving the handle on the reclaim path.
+ */
+struct kvm_ffa_descriptor_buffer {
+ void *buf;
+ size_t len;
+};
+
+static struct kvm_ffa_descriptor_buffer ffa_desc_buf;
+
+struct kvm_ffa_buffers {
+ hyp_spinlock_t lock;
+ void *tx;
+ void *rx;
+};
+
+/*
+ * Note that we don't currently lock these buffers explicitly, instead
+ * relying on the locking of the host FFA buffers as we only have one
+ * client.
+ */
+static struct kvm_ffa_buffers hyp_buffers;
+static struct kvm_ffa_buffers host_buffers;
+static u32 hyp_ffa_version;
+static bool has_version_negotiated;
+static hyp_spinlock_t version_lock;
+
+static void ffa_to_smccc_error(struct arm_smccc_1_2_regs *res, u64 ffa_errno)
+{
+ *res = (struct arm_smccc_1_2_regs) {
+ .a0 = FFA_ERROR,
+ .a2 = ffa_errno,
+ };
+}
+
+static void ffa_to_smccc_res_prop(struct arm_smccc_1_2_regs *res, int ret, u64 prop)
+{
+ if (ret == FFA_RET_SUCCESS) {
+ *res = (struct arm_smccc_1_2_regs) { .a0 = FFA_SUCCESS,
+ .a2 = prop };
+ } else {
+ ffa_to_smccc_error(res, ret);
+ }
+}
+
+static void ffa_to_smccc_res(struct arm_smccc_1_2_regs *res, int ret)
+{
+ ffa_to_smccc_res_prop(res, ret, 0);
+}
+
+static void ffa_set_retval(struct kvm_cpu_context *ctxt,
+ struct arm_smccc_1_2_regs *res)
+{
+ cpu_reg(ctxt, 0) = res->a0;
+ cpu_reg(ctxt, 1) = res->a1;
+ cpu_reg(ctxt, 2) = res->a2;
+ cpu_reg(ctxt, 3) = res->a3;
+ cpu_reg(ctxt, 4) = res->a4;
+ cpu_reg(ctxt, 5) = res->a5;
+ cpu_reg(ctxt, 6) = res->a6;
+ cpu_reg(ctxt, 7) = res->a7;
+
+ /*
+ * DEN0028C 2.6: SMC32/HVC32 call from aarch64 must preserve x8-x30.
+ *
+ * In FF-A 1.2, we cannot rely on the function ID sent by the caller to
+ * detect 32-bit calls because the CPU cycle management interfaces (e.g.
+ * FFA_MSG_WAIT, FFA_RUN) are 32-bit only but can have 64-bit responses.
+ *
+ * FFA-1.3 introduces 64-bit variants of the CPU cycle management
+ * interfaces. Moreover, FF-A 1.3 clarifies that SMC32 direct requests
+ * complete with SMC32 direct responses which *should* allow us use the
+ * function ID sent by the caller to determine whether to return x8-x17.
+ *
+ * Note that we also cannot rely on function IDs in the response.
+ *
+ * Given the above, assume SMC64 and send back x0-x17 unconditionally
+ * as the passthrough code (__kvm_hyp_host_forward_smc) does the same.
+ */
+ cpu_reg(ctxt, 8) = res->a8;
+ cpu_reg(ctxt, 9) = res->a9;
+ cpu_reg(ctxt, 10) = res->a10;
+ cpu_reg(ctxt, 11) = res->a11;
+ cpu_reg(ctxt, 12) = res->a12;
+ cpu_reg(ctxt, 13) = res->a13;
+ cpu_reg(ctxt, 14) = res->a14;
+ cpu_reg(ctxt, 15) = res->a15;
+ cpu_reg(ctxt, 16) = res->a16;
+ cpu_reg(ctxt, 17) = res->a17;
+}
+
+static bool is_ffa_call(u64 func_id)
+{
+ return ARM_SMCCC_IS_FAST_CALL(func_id) &&
+ ARM_SMCCC_OWNER_NUM(func_id) == ARM_SMCCC_OWNER_STANDARD &&
+ ARM_SMCCC_FUNC_NUM(func_id) >= FFA_MIN_FUNC_NUM &&
+ ARM_SMCCC_FUNC_NUM(func_id) <= FFA_MAX_FUNC_NUM;
+}
+
+static int ffa_map_hyp_buffers(u64 ffa_page_count)
+{
+ struct arm_smccc_1_2_regs res;
+
+ arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
+ .a0 = FFA_FN64_RXTX_MAP,
+ .a1 = hyp_virt_to_phys(hyp_buffers.tx),
+ .a2 = hyp_virt_to_phys(hyp_buffers.rx),
+ .a3 = ffa_page_count,
+ }, &res);
+
+ return res.a0 == FFA_SUCCESS ? FFA_RET_SUCCESS : res.a2;
+}
+
+static int ffa_unmap_hyp_buffers(void)
+{
+ struct arm_smccc_1_2_regs res;
+
+ arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
+ .a0 = FFA_RXTX_UNMAP,
+ .a1 = HOST_FFA_ID,
+ }, &res);
+
+ return res.a0 == FFA_SUCCESS ? FFA_RET_SUCCESS : res.a2;
+}
+
+static void ffa_mem_frag_tx(struct arm_smccc_1_2_regs *res, u32 handle_lo,
+ u32 handle_hi, u32 fraglen, u32 endpoint_id)
+{
+ arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
+ .a0 = FFA_MEM_FRAG_TX,
+ .a1 = handle_lo,
+ .a2 = handle_hi,
+ .a3 = fraglen,
+ .a4 = endpoint_id,
+ }, res);
+}
+
+static void ffa_mem_frag_rx(struct arm_smccc_1_2_regs *res, u32 handle_lo,
+ u32 handle_hi, u32 fragoff)
+{
+ arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
+ .a0 = FFA_MEM_FRAG_RX,
+ .a1 = handle_lo,
+ .a2 = handle_hi,
+ .a3 = fragoff,
+ .a4 = HOST_FFA_ID,
+ }, res);
+}
+
+static void ffa_mem_xfer(struct arm_smccc_1_2_regs *res, u64 func_id, u32 len,
+ u32 fraglen)
+{
+ arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
+ .a0 = func_id,
+ .a1 = len,
+ .a2 = fraglen,
+ }, res);
+}
+
+static void ffa_mem_reclaim(struct arm_smccc_1_2_regs *res, u32 handle_lo,
+ u32 handle_hi, u32 flags)
+{
+ arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
+ .a0 = FFA_MEM_RECLAIM,
+ .a1 = handle_lo,
+ .a2 = handle_hi,
+ .a3 = flags,
+ }, res);
+}
+
+static void ffa_retrieve_req(struct arm_smccc_1_2_regs *res, u32 len)
+{
+ arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
+ .a0 = FFA_FN64_MEM_RETRIEVE_REQ,
+ .a1 = len,
+ .a2 = len,
+ }, res);
+}
+
+static void ffa_rx_release(struct arm_smccc_1_2_regs *res)
+{
+ arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
+ .a0 = FFA_RX_RELEASE,
+ }, res);
+}
+
+static void do_ffa_rxtx_map(struct arm_smccc_1_2_regs *res,
+ struct kvm_cpu_context *ctxt)
+{
+ DECLARE_REG(phys_addr_t, tx, ctxt, 1);
+ DECLARE_REG(phys_addr_t, rx, ctxt, 2);
+ DECLARE_REG(u32, npages, ctxt, 3);
+ int ret = 0;
+ void *rx_virt, *tx_virt;
+
+ if (npages != (KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE) / FFA_PAGE_SIZE) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto out;
+ }
+
+ if (!PAGE_ALIGNED(tx) || !PAGE_ALIGNED(rx)) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto out;
+ }
+
+ hyp_spin_lock(&host_buffers.lock);
+ if (host_buffers.tx) {
+ ret = FFA_RET_DENIED;
+ goto out_unlock;
+ }
+
+ /*
+ * Map our hypervisor buffers into the SPMD before mapping and
+ * pinning the host buffers in our own address space.
+ */
+ ret = ffa_map_hyp_buffers(npages);
+ if (ret)
+ goto out_unlock;
+
+ ret = __pkvm_host_share_hyp(hyp_phys_to_pfn(tx));
+ if (ret) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto err_unmap;
+ }
+
+ ret = __pkvm_host_share_hyp(hyp_phys_to_pfn(rx));
+ if (ret) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto err_unshare_tx;
+ }
+
+ tx_virt = hyp_phys_to_virt(tx);
+ ret = hyp_pin_shared_mem(tx_virt, tx_virt + 1);
+ if (ret) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto err_unshare_rx;
+ }
+
+ rx_virt = hyp_phys_to_virt(rx);
+ ret = hyp_pin_shared_mem(rx_virt, rx_virt + 1);
+ if (ret) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto err_unpin_tx;
+ }
+
+ host_buffers.tx = tx_virt;
+ host_buffers.rx = rx_virt;
+
+out_unlock:
+ hyp_spin_unlock(&host_buffers.lock);
+out:
+ ffa_to_smccc_res(res, ret);
+ return;
+
+err_unpin_tx:
+ hyp_unpin_shared_mem(tx_virt, tx_virt + 1);
+err_unshare_rx:
+ __pkvm_host_unshare_hyp(hyp_phys_to_pfn(rx));
+err_unshare_tx:
+ __pkvm_host_unshare_hyp(hyp_phys_to_pfn(tx));
+err_unmap:
+ ffa_unmap_hyp_buffers();
+ goto out_unlock;
+}
+
+static void do_ffa_rxtx_unmap(struct arm_smccc_1_2_regs *res,
+ struct kvm_cpu_context *ctxt)
+{
+ DECLARE_REG(u32, id, ctxt, 1);
+ int ret = 0;
+
+ if (id != HOST_FFA_ID) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto out;
+ }
+
+ hyp_spin_lock(&host_buffers.lock);
+ if (!host_buffers.tx) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto out_unlock;
+ }
+
+ hyp_unpin_shared_mem(host_buffers.tx, host_buffers.tx + 1);
+ WARN_ON(__pkvm_host_unshare_hyp(hyp_virt_to_pfn(host_buffers.tx)));
+ host_buffers.tx = NULL;
+
+ hyp_unpin_shared_mem(host_buffers.rx, host_buffers.rx + 1);
+ WARN_ON(__pkvm_host_unshare_hyp(hyp_virt_to_pfn(host_buffers.rx)));
+ host_buffers.rx = NULL;
+
+ ffa_unmap_hyp_buffers();
+
+out_unlock:
+ hyp_spin_unlock(&host_buffers.lock);
+out:
+ ffa_to_smccc_res(res, ret);
+}
+
+static u32 __ffa_host_share_ranges(struct ffa_mem_region_addr_range *ranges,
+ u32 nranges)
+{
+ u32 i;
+
+ for (i = 0; i < nranges; ++i) {
+ struct ffa_mem_region_addr_range *range = &ranges[i];
+ u64 sz = (u64)range->pg_cnt * FFA_PAGE_SIZE;
+ u64 pfn = hyp_phys_to_pfn(range->address);
+
+ if (!PAGE_ALIGNED(sz))
+ break;
+
+ if (__pkvm_host_share_ffa(pfn, sz / PAGE_SIZE))
+ break;
+ }
+
+ return i;
+}
+
+static u32 __ffa_host_unshare_ranges(struct ffa_mem_region_addr_range *ranges,
+ u32 nranges)
+{
+ u32 i;
+
+ for (i = 0; i < nranges; ++i) {
+ struct ffa_mem_region_addr_range *range = &ranges[i];
+ u64 sz = (u64)range->pg_cnt * FFA_PAGE_SIZE;
+ u64 pfn = hyp_phys_to_pfn(range->address);
+
+ if (!PAGE_ALIGNED(sz))
+ break;
+
+ if (__pkvm_host_unshare_ffa(pfn, sz / PAGE_SIZE))
+ break;
+ }
+
+ return i;
+}
+
+static int ffa_host_share_ranges(struct ffa_mem_region_addr_range *ranges,
+ u32 nranges)
+{
+ u32 nshared = __ffa_host_share_ranges(ranges, nranges);
+ int ret = 0;
+
+ if (nshared != nranges) {
+ WARN_ON(__ffa_host_unshare_ranges(ranges, nshared) != nshared);
+ ret = FFA_RET_DENIED;
+ }
+
+ return ret;
+}
+
+static int ffa_host_unshare_ranges(struct ffa_mem_region_addr_range *ranges,
+ u32 nranges)
+{
+ u32 nunshared = __ffa_host_unshare_ranges(ranges, nranges);
+ int ret = 0;
+
+ if (nunshared != nranges) {
+ WARN_ON(__ffa_host_share_ranges(ranges, nunshared) != nunshared);
+ ret = FFA_RET_DENIED;
+ }
+
+ return ret;
+}
+
+static void do_ffa_mem_frag_tx(struct arm_smccc_1_2_regs *res,
+ struct kvm_cpu_context *ctxt)
+{
+ DECLARE_REG(u32, handle_lo, ctxt, 1);
+ DECLARE_REG(u32, handle_hi, ctxt, 2);
+ DECLARE_REG(u32, fraglen, ctxt, 3);
+ DECLARE_REG(u32, endpoint_id, ctxt, 4);
+ struct ffa_mem_region_addr_range *buf;
+ int ret = FFA_RET_INVALID_PARAMETERS;
+ u32 nr_ranges;
+
+ if (fraglen > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE)
+ goto out;
+
+ if (fraglen % sizeof(*buf))
+ goto out;
+
+ hyp_spin_lock(&host_buffers.lock);
+ if (!host_buffers.tx)
+ goto out_unlock;
+
+ buf = hyp_buffers.tx;
+ memcpy(buf, host_buffers.tx, fraglen);
+ nr_ranges = fraglen / sizeof(*buf);
+
+ ret = ffa_host_share_ranges(buf, nr_ranges);
+ if (ret) {
+ /*
+ * We're effectively aborting the transaction, so we need
+ * to restore the global state back to what it was prior to
+ * transmission of the first fragment.
+ */
+ ffa_mem_reclaim(res, handle_lo, handle_hi, 0);
+ WARN_ON(res->a0 != FFA_SUCCESS);
+ goto out_unlock;
+ }
+
+ ffa_mem_frag_tx(res, handle_lo, handle_hi, fraglen, endpoint_id);
+ if (res->a0 != FFA_SUCCESS && res->a0 != FFA_MEM_FRAG_RX)
+ WARN_ON(ffa_host_unshare_ranges(buf, nr_ranges));
+
+out_unlock:
+ hyp_spin_unlock(&host_buffers.lock);
+out:
+ if (ret)
+ ffa_to_smccc_res(res, ret);
+
+ /*
+ * If for any reason this did not succeed, we're in trouble as we have
+ * now lost the content of the previous fragments and we can't rollback
+ * the host stage-2 changes. The pages previously marked as shared will
+ * remain stuck in that state forever, hence preventing the host from
+ * sharing/donating them again and may possibly lead to subsequent
+ * failures, but this will not compromise confidentiality.
+ */
+ return;
+}
+
+static void __do_ffa_mem_xfer(const u64 func_id,
+ struct arm_smccc_1_2_regs *res,
+ struct kvm_cpu_context *ctxt)
+{
+ DECLARE_REG(u32, len, ctxt, 1);
+ DECLARE_REG(u32, fraglen, ctxt, 2);
+ DECLARE_REG(u64, addr_mbz, ctxt, 3);
+ DECLARE_REG(u32, npages_mbz, ctxt, 4);
+ struct ffa_mem_region_attributes *ep_mem_access;
+ struct ffa_composite_mem_region *reg;
+ struct ffa_mem_region *buf;
+ u32 offset, nr_ranges, checked_offset;
+ int ret = 0;
+
+ if (addr_mbz || npages_mbz || fraglen > len ||
+ fraglen > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto out;
+ }
+
+ if (fraglen < sizeof(struct ffa_mem_region) +
+ sizeof(struct ffa_mem_region_attributes)) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto out;
+ }
+
+ hyp_spin_lock(&host_buffers.lock);
+ if (!host_buffers.tx) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto out_unlock;
+ }
+
+ if (len > ffa_desc_buf.len) {
+ ret = FFA_RET_NO_MEMORY;
+ goto out_unlock;
+ }
+
+ buf = hyp_buffers.tx;
+ memcpy(buf, host_buffers.tx, fraglen);
+
+ ep_mem_access = (void *)buf +
+ ffa_mem_desc_offset(buf, 0, hyp_ffa_version);
+ offset = ep_mem_access->composite_off;
+ if (!offset || buf->ep_count != 1 || buf->sender_id != HOST_FFA_ID) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto out_unlock;
+ }
+
+ if (check_add_overflow(offset, sizeof(struct ffa_composite_mem_region), &checked_offset)) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto out_unlock;
+ }
+
+ if (fraglen < checked_offset) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto out_unlock;
+ }
+
+ reg = (void *)buf + offset;
+ nr_ranges = ((void *)buf + fraglen) - (void *)reg->constituents;
+ if (nr_ranges % sizeof(reg->constituents[0])) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto out_unlock;
+ }
+
+ nr_ranges /= sizeof(reg->constituents[0]);
+ ret = ffa_host_share_ranges(reg->constituents, nr_ranges);
+ if (ret)
+ goto out_unlock;
+
+ ffa_mem_xfer(res, func_id, len, fraglen);
+ if (fraglen != len) {
+ if (res->a0 != FFA_MEM_FRAG_RX)
+ goto err_unshare;
+
+ if (res->a3 != fraglen)
+ goto err_unshare;
+ } else if (res->a0 != FFA_SUCCESS) {
+ goto err_unshare;
+ }
+
+out_unlock:
+ hyp_spin_unlock(&host_buffers.lock);
+out:
+ if (ret)
+ ffa_to_smccc_res(res, ret);
+ return;
+
+err_unshare:
+ WARN_ON(ffa_host_unshare_ranges(reg->constituents, nr_ranges));
+ goto out_unlock;
+}
+
+#define do_ffa_mem_xfer(fid, res, ctxt) \
+ do { \
+ BUILD_BUG_ON((fid) != FFA_FN64_MEM_SHARE && \
+ (fid) != FFA_FN64_MEM_LEND); \
+ __do_ffa_mem_xfer((fid), (res), (ctxt)); \
+ } while (0);
+
+static void do_ffa_mem_reclaim(struct arm_smccc_1_2_regs *res,
+ struct kvm_cpu_context *ctxt)
+{
+ DECLARE_REG(u32, handle_lo, ctxt, 1);
+ DECLARE_REG(u32, handle_hi, ctxt, 2);
+ DECLARE_REG(u32, flags, ctxt, 3);
+ struct ffa_mem_region_attributes *ep_mem_access;
+ struct ffa_composite_mem_region *reg;
+ u32 offset, len, fraglen, fragoff;
+ struct ffa_mem_region *buf;
+ int ret = 0;
+ u64 handle;
+
+ handle = PACK_HANDLE(handle_lo, handle_hi);
+
+ hyp_spin_lock(&host_buffers.lock);
+
+ buf = hyp_buffers.tx;
+ *buf = (struct ffa_mem_region) {
+ .sender_id = HOST_FFA_ID,
+ .handle = handle,
+ };
+
+ ffa_retrieve_req(res, sizeof(*buf));
+ buf = hyp_buffers.rx;
+ if (res->a0 != FFA_MEM_RETRIEVE_RESP)
+ goto out_unlock;
+
+ len = res->a1;
+ fraglen = res->a2;
+
+ ep_mem_access = (void *)buf +
+ ffa_mem_desc_offset(buf, 0, hyp_ffa_version);
+ offset = ep_mem_access->composite_off;
+ /*
+ * We can trust the SPMD to get this right, but let's at least
+ * check that we end up with something that doesn't look _completely_
+ * bogus.
+ */
+ if (WARN_ON(offset > len ||
+ fraglen > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE)) {
+ ret = FFA_RET_ABORTED;
+ ffa_rx_release(res);
+ goto out_unlock;
+ }
+
+ if (len > ffa_desc_buf.len) {
+ ret = FFA_RET_NO_MEMORY;
+ ffa_rx_release(res);
+ goto out_unlock;
+ }
+
+ buf = ffa_desc_buf.buf;
+ memcpy(buf, hyp_buffers.rx, fraglen);
+ ffa_rx_release(res);
+
+ for (fragoff = fraglen; fragoff < len; fragoff += fraglen) {
+ ffa_mem_frag_rx(res, handle_lo, handle_hi, fragoff);
+ if (res->a0 != FFA_MEM_FRAG_TX) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto out_unlock;
+ }
+
+ fraglen = res->a3;
+ memcpy((void *)buf + fragoff, hyp_buffers.rx, fraglen);
+ ffa_rx_release(res);
+ }
+
+ ffa_mem_reclaim(res, handle_lo, handle_hi, flags);
+ if (res->a0 != FFA_SUCCESS)
+ goto out_unlock;
+
+ reg = (void *)buf + offset;
+ /* If the SPMD was happy, then we should be too. */
+ WARN_ON(ffa_host_unshare_ranges(reg->constituents,
+ reg->addr_range_cnt));
+out_unlock:
+ hyp_spin_unlock(&host_buffers.lock);
+
+ if (ret)
+ ffa_to_smccc_res(res, ret);
+}
+
+/*
+ * Is a given FFA function supported, either by forwarding on directly
+ * or by handling at EL2?
+ */
+static bool ffa_call_supported(u64 func_id)
+{
+ switch (func_id) {
+ /* Unsupported memory management calls */
+ case FFA_FN64_MEM_RETRIEVE_REQ:
+ case FFA_MEM_RETRIEVE_RESP:
+ case FFA_MEM_RELINQUISH:
+ case FFA_MEM_OP_PAUSE:
+ case FFA_MEM_OP_RESUME:
+ case FFA_MEM_FRAG_RX:
+ case FFA_FN64_MEM_DONATE:
+ /* Indirect message passing via RX/TX buffers */
+ case FFA_MSG_SEND:
+ case FFA_MSG_POLL:
+ case FFA_MSG_WAIT:
+ /* 32-bit variants of 64-bit calls */
+ case FFA_MSG_SEND_DIRECT_RESP:
+ case FFA_RXTX_MAP:
+ case FFA_MEM_DONATE:
+ case FFA_MEM_RETRIEVE_REQ:
+ /* Optional notification interfaces added in FF-A 1.1 */
+ case FFA_NOTIFICATION_BITMAP_CREATE:
+ case FFA_NOTIFICATION_BITMAP_DESTROY:
+ case FFA_NOTIFICATION_BIND:
+ case FFA_NOTIFICATION_UNBIND:
+ case FFA_NOTIFICATION_SET:
+ case FFA_NOTIFICATION_GET:
+ case FFA_NOTIFICATION_INFO_GET:
+ /* Optional interfaces added in FF-A 1.2 */
+ case FFA_MSG_SEND_DIRECT_REQ2: /* Optional per 7.5.1 */
+ case FFA_MSG_SEND_DIRECT_RESP2: /* Optional per 7.5.1 */
+ case FFA_CONSOLE_LOG: /* Optional per 13.1: not in Table 13.1 */
+ case FFA_PARTITION_INFO_GET_REGS: /* Optional for virtual instances per 13.1 */
+ return false;
+ }
+
+ return true;
+}
+
+static bool do_ffa_features(struct arm_smccc_1_2_regs *res,
+ struct kvm_cpu_context *ctxt)
+{
+ DECLARE_REG(u32, id, ctxt, 1);
+ u64 prop = 0;
+ int ret = 0;
+
+ if (!ffa_call_supported(id)) {
+ ret = FFA_RET_NOT_SUPPORTED;
+ goto out_handled;
+ }
+
+ switch (id) {
+ case FFA_MEM_SHARE:
+ case FFA_FN64_MEM_SHARE:
+ case FFA_MEM_LEND:
+ case FFA_FN64_MEM_LEND:
+ ret = FFA_RET_SUCCESS;
+ prop = 0; /* No support for dynamic buffers */
+ goto out_handled;
+ default:
+ return false;
+ }
+
+out_handled:
+ ffa_to_smccc_res_prop(res, ret, prop);
+ return true;
+}
+
+static int hyp_ffa_post_init(void)
+{
+ size_t min_rxtx_sz;
+ struct arm_smccc_1_2_regs res;
+
+ arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs){
+ .a0 = FFA_ID_GET,
+ }, &res);
+ if (res.a0 != FFA_SUCCESS)
+ return -EOPNOTSUPP;
+
+ if (res.a2 != HOST_FFA_ID)
+ return -EINVAL;
+
+ arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs){
+ .a0 = FFA_FEATURES,
+ .a1 = FFA_FN64_RXTX_MAP,
+ }, &res);
+ if (res.a0 != FFA_SUCCESS)
+ return -EOPNOTSUPP;
+
+ switch (res.a2 & FFA_FEAT_RXTX_MIN_SZ_MASK) {
+ case FFA_FEAT_RXTX_MIN_SZ_4K:
+ min_rxtx_sz = SZ_4K;
+ break;
+ case FFA_FEAT_RXTX_MIN_SZ_16K:
+ min_rxtx_sz = SZ_16K;
+ break;
+ case FFA_FEAT_RXTX_MIN_SZ_64K:
+ min_rxtx_sz = SZ_64K;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (min_rxtx_sz > PAGE_SIZE)
+ return -EOPNOTSUPP;
+
+ return 0;
+}
+
+static void do_ffa_version(struct arm_smccc_1_2_regs *res,
+ struct kvm_cpu_context *ctxt)
+{
+ DECLARE_REG(u32, ffa_req_version, ctxt, 1);
+
+ if (FFA_MAJOR_VERSION(ffa_req_version) != 1) {
+ res->a0 = FFA_RET_NOT_SUPPORTED;
+ return;
+ }
+
+ hyp_spin_lock(&version_lock);
+ if (has_version_negotiated) {
+ if (FFA_MINOR_VERSION(ffa_req_version) < FFA_MINOR_VERSION(hyp_ffa_version))
+ res->a0 = FFA_RET_NOT_SUPPORTED;
+ else
+ res->a0 = hyp_ffa_version;
+ goto unlock;
+ }
+
+ /*
+ * If the client driver tries to downgrade the version, we need to ask
+ * first if TEE supports it.
+ */
+ if (FFA_MINOR_VERSION(ffa_req_version) < FFA_MINOR_VERSION(hyp_ffa_version)) {
+ arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
+ .a0 = FFA_VERSION,
+ .a1 = ffa_req_version,
+ }, res);
+ if (res->a0 == FFA_RET_NOT_SUPPORTED)
+ goto unlock;
+
+ hyp_ffa_version = ffa_req_version;
+ }
+
+ if (hyp_ffa_post_init()) {
+ res->a0 = FFA_RET_NOT_SUPPORTED;
+ } else {
+ smp_store_release(&has_version_negotiated, true);
+ res->a0 = hyp_ffa_version;
+ }
+unlock:
+ hyp_spin_unlock(&version_lock);
+}
+
+static void do_ffa_part_get(struct arm_smccc_1_2_regs *res,
+ struct kvm_cpu_context *ctxt)
+{
+ DECLARE_REG(u32, uuid0, ctxt, 1);
+ DECLARE_REG(u32, uuid1, ctxt, 2);
+ DECLARE_REG(u32, uuid2, ctxt, 3);
+ DECLARE_REG(u32, uuid3, ctxt, 4);
+ DECLARE_REG(u32, flags, ctxt, 5);
+ u32 count, partition_sz, copy_sz;
+
+ hyp_spin_lock(&host_buffers.lock);
+ if (!host_buffers.rx) {
+ ffa_to_smccc_res(res, FFA_RET_BUSY);
+ goto out_unlock;
+ }
+
+ arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
+ .a0 = FFA_PARTITION_INFO_GET,
+ .a1 = uuid0,
+ .a2 = uuid1,
+ .a3 = uuid2,
+ .a4 = uuid3,
+ .a5 = flags,
+ }, res);
+
+ if (res->a0 != FFA_SUCCESS)
+ goto out_unlock;
+
+ count = res->a2;
+ if (!count)
+ goto out_unlock;
+
+ if (hyp_ffa_version > FFA_VERSION_1_0) {
+ /* Get the number of partitions deployed in the system */
+ if (flags & 0x1)
+ goto out_unlock;
+
+ partition_sz = res->a3;
+ } else {
+ /* FFA_VERSION_1_0 lacks the size in the response */
+ partition_sz = FFA_1_0_PARTITON_INFO_SZ;
+ }
+
+ copy_sz = partition_sz * count;
+ if (copy_sz > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE) {
+ ffa_to_smccc_res(res, FFA_RET_ABORTED);
+ goto out_unlock;
+ }
+
+ memcpy(host_buffers.rx, hyp_buffers.rx, copy_sz);
+out_unlock:
+ hyp_spin_unlock(&host_buffers.lock);
+}
+
+bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt, u32 func_id)
+{
+ struct arm_smccc_1_2_regs res;
+
+ /*
+ * There's no way we can tell what a non-standard SMC call might
+ * be up to. Ideally, we would terminate these here and return
+ * an error to the host, but sadly devices make use of custom
+ * firmware calls for things like power management, debugging,
+ * RNG access and crash reporting.
+ *
+ * Given that the architecture requires us to trust EL3 anyway,
+ * we forward unrecognised calls on under the assumption that
+ * the firmware doesn't expose a mechanism to access arbitrary
+ * non-secure memory. Short of a per-device table of SMCs, this
+ * is the best we can do.
+ */
+ if (!is_ffa_call(func_id))
+ return false;
+
+ if (func_id != FFA_VERSION &&
+ !smp_load_acquire(&has_version_negotiated)) {
+ ffa_to_smccc_error(&res, FFA_RET_INVALID_PARAMETERS);
+ goto out_handled;
+ }
+
+ switch (func_id) {
+ case FFA_FEATURES:
+ if (!do_ffa_features(&res, host_ctxt))
+ return false;
+ goto out_handled;
+ /* Memory management */
+ case FFA_FN64_RXTX_MAP:
+ do_ffa_rxtx_map(&res, host_ctxt);
+ goto out_handled;
+ case FFA_RXTX_UNMAP:
+ do_ffa_rxtx_unmap(&res, host_ctxt);
+ goto out_handled;
+ case FFA_MEM_SHARE:
+ case FFA_FN64_MEM_SHARE:
+ do_ffa_mem_xfer(FFA_FN64_MEM_SHARE, &res, host_ctxt);
+ goto out_handled;
+ case FFA_MEM_RECLAIM:
+ do_ffa_mem_reclaim(&res, host_ctxt);
+ goto out_handled;
+ case FFA_MEM_LEND:
+ case FFA_FN64_MEM_LEND:
+ do_ffa_mem_xfer(FFA_FN64_MEM_LEND, &res, host_ctxt);
+ goto out_handled;
+ case FFA_MEM_FRAG_TX:
+ do_ffa_mem_frag_tx(&res, host_ctxt);
+ goto out_handled;
+ case FFA_VERSION:
+ do_ffa_version(&res, host_ctxt);
+ goto out_handled;
+ case FFA_PARTITION_INFO_GET:
+ do_ffa_part_get(&res, host_ctxt);
+ goto out_handled;
+ }
+
+ if (ffa_call_supported(func_id))
+ return false; /* Pass through */
+
+ ffa_to_smccc_error(&res, FFA_RET_NOT_SUPPORTED);
+out_handled:
+ ffa_set_retval(host_ctxt, &res);
+ return true;
+}
+
+int hyp_ffa_init(void *pages)
+{
+ struct arm_smccc_1_2_regs res;
+ void *tx, *rx;
+
+ if (kvm_host_psci_config.smccc_version < ARM_SMCCC_VERSION_1_2)
+ return 0;
+
+ arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
+ .a0 = FFA_VERSION,
+ .a1 = FFA_VERSION_1_2,
+ }, &res);
+ if (res.a0 == FFA_RET_NOT_SUPPORTED)
+ return 0;
+
+ /*
+ * Firmware returns the maximum supported version of the FF-A
+ * implementation. Check that the returned version is
+ * backwards-compatible with the hyp according to the rules in DEN0077A
+ * v1.1 REL0 13.2.1.
+ *
+ * Of course, things are never simple when dealing with firmware. v1.1
+ * broke ABI with v1.0 on several structures, which is itself
+ * incompatible with the aforementioned versioning scheme. The
+ * expectation is that v1.x implementations that do not support the v1.0
+ * ABI return NOT_SUPPORTED rather than a version number, according to
+ * DEN0077A v1.1 REL0 18.6.4.
+ */
+ if (FFA_MAJOR_VERSION(res.a0) != 1)
+ return -EOPNOTSUPP;
+
+ if (FFA_MINOR_VERSION(res.a0) < FFA_MINOR_VERSION(FFA_VERSION_1_2))
+ hyp_ffa_version = res.a0;
+ else
+ hyp_ffa_version = FFA_VERSION_1_2;
+
+ tx = pages;
+ pages += KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE;
+ rx = pages;
+ pages += KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE;
+
+ ffa_desc_buf = (struct kvm_ffa_descriptor_buffer) {
+ .buf = pages,
+ .len = PAGE_SIZE *
+ (hyp_ffa_proxy_pages() - (2 * KVM_FFA_MBOX_NR_PAGES)),
+ };
+
+ hyp_buffers = (struct kvm_ffa_buffers) {
+ .lock = __HYP_SPIN_LOCK_UNLOCKED,
+ .tx = tx,
+ .rx = rx,
+ };
+
+ host_buffers = (struct kvm_ffa_buffers) {
+ .lock = __HYP_SPIN_LOCK_UNLOCKED,
+ };
+
+ version_lock = __HYP_SPIN_LOCK_UNLOCKED;
+ return 0;
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/gen-hyprel.c b/arch/arm64/kvm/hyp/nvhe/gen-hyprel.c
index 6bc88a756cb7..b63f4e1c1033 100644
--- a/arch/arm64/kvm/hyp/nvhe/gen-hyprel.c
+++ b/arch/arm64/kvm/hyp/nvhe/gen-hyprel.c
@@ -50,6 +50,9 @@
#ifndef R_AARCH64_ABS64
#define R_AARCH64_ABS64 257
#endif
+#ifndef R_AARCH64_ABS32
+#define R_AARCH64_ABS32 258
+#endif
#ifndef R_AARCH64_PREL64
#define R_AARCH64_PREL64 260
#endif
@@ -383,6 +386,9 @@ static void emit_rela_section(Elf64_Shdr *sh_rela)
case R_AARCH64_ABS64:
emit_rela_abs64(rela, sh_orig_name);
break;
+ /* Allow 32-bit absolute relocation, for kCFI type hashes. */
+ case R_AARCH64_ABS32:
+ break;
/* Allow position-relative data relocations. */
case R_AARCH64_PREL64:
case R_AARCH64_PREL32:
diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S
index b6c0188c4b35..eef15b374abb 100644
--- a/arch/arm64/kvm/hyp/nvhe/host.S
+++ b/arch/arm64/kvm/hyp/nvhe/host.S
@@ -10,6 +10,7 @@
#include <asm/kvm_arm.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_mmu.h>
+#include <asm/kvm_ptrauth.h>
.text
@@ -37,10 +38,43 @@ SYM_FUNC_START(__host_exit)
/* Save the host context pointer in x29 across the function call */
mov x29, x0
+
+#ifdef CONFIG_ARM64_PTR_AUTH_KERNEL
+alternative_if_not ARM64_HAS_ADDRESS_AUTH
+b __skip_pauth_save
+alternative_else_nop_endif
+
+alternative_if ARM64_KVM_PROTECTED_MODE
+ /* Save kernel ptrauth keys. */
+ add x18, x29, #CPU_APIAKEYLO_EL1
+ ptrauth_save_state x18, x19, x20
+
+ /* Use hyp keys. */
+ adr_this_cpu x18, kvm_hyp_ctxt, x19
+ add x18, x18, #CPU_APIAKEYLO_EL1
+ ptrauth_restore_state x18, x19, x20
+ isb
+alternative_else_nop_endif
+__skip_pauth_save:
+#endif /* CONFIG_ARM64_PTR_AUTH_KERNEL */
+
bl handle_trap
- /* Restore host regs x0-x17 */
__host_enter_restore_full:
+ /* Restore kernel keys. */
+#ifdef CONFIG_ARM64_PTR_AUTH_KERNEL
+alternative_if_not ARM64_HAS_ADDRESS_AUTH
+b __skip_pauth_restore
+alternative_else_nop_endif
+
+alternative_if ARM64_KVM_PROTECTED_MODE
+ add x18, x29, #CPU_APIAKEYLO_EL1
+ ptrauth_restore_state x18, x19, x20
+alternative_else_nop_endif
+__skip_pauth_restore:
+#endif /* CONFIG_ARM64_PTR_AUTH_KERNEL */
+
+ /* Restore host regs x0-x17 */
ldp x0, x1, [x29, #CPU_XREG_OFFSET(0)]
ldp x2, x3, [x29, #CPU_XREG_OFFSET(2)]
ldp x4, x5, [x29, #CPU_XREG_OFFSET(4)]
@@ -76,7 +110,7 @@ SYM_FUNC_END(__host_enter)
* u64 elr, u64 par);
*/
SYM_FUNC_START(__hyp_do_panic)
- /* Prepare and exit to the host's panic funciton. */
+ /* Prepare and exit to the host's panic function. */
mov lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\
PSR_MODE_EL1h)
msr spsr_el2, lr
@@ -90,7 +124,7 @@ SYM_FUNC_START(__hyp_do_panic)
/* Ensure host stage-2 is disabled */
mrs x0, hcr_el2
bic x0, x0, #HCR_VM
- msr hcr_el2, x0
+ msr_hcr_el2 x0
isb
tlbi vmalls12e1
dsb nsh
@@ -154,21 +188,15 @@ SYM_FUNC_END(__host_hvc)
/*
* Test whether the SP has overflowed, without corrupting a GPR.
- * nVHE hypervisor stacks are aligned so that the PAGE_SHIFT bit
+ * nVHE hypervisor stacks are aligned so that the NVHE_STACK_SHIFT bit
* of SP should always be 1.
*/
add sp, sp, x0 // sp' = sp + x0
sub x0, sp, x0 // x0' = sp' - x0 = (sp + x0) - x0 = sp
- tbz x0, #PAGE_SHIFT, .L__hyp_sp_overflow\@
+ tbz x0, #NVHE_STACK_SHIFT, .L__hyp_sp_overflow\@
sub x0, sp, x0 // x0'' = sp' - x0' = (sp + x0) - sp = x0
sub sp, sp, x0 // sp'' = sp' - x0 = (sp + x0) - x0 = sp
- /* If a guest is loaded, panic out of it. */
- stp x0, x1, [sp, #-16]!
- get_loaded_vcpu x0, x1
- cbnz x0, __guest_exit_panic
- add sp, sp, #16
-
/*
* The panic may not be clean if the exception is taken before the host
* context has been saved by __host_exit or after the hyp context has
@@ -263,3 +291,13 @@ SYM_CODE_START(__kvm_hyp_host_forward_smc)
ret
SYM_CODE_END(__kvm_hyp_host_forward_smc)
+
+/*
+ * kvm_host_psci_cpu_entry is called through br instruction, which requires
+ * bti j instruction as compilers (gcc and llvm) doesn't insert bti j for external
+ * functions, but bti c instead.
+ */
+SYM_CODE_START(kvm_host_psci_cpu_entry)
+ bti j
+ b __kvm_host_psci_cpu_entry
+SYM_CODE_END(kvm_host_psci_cpu_entry)
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
index c953fb4b9a13..aada42522e7b 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
@@ -5,6 +5,7 @@
*/
#include <linux/arm-smccc.h>
+#include <linux/cfi_types.h>
#include <linux/linkage.h>
#include <asm/alternative.h>
@@ -23,28 +24,25 @@
.align 11
SYM_CODE_START(__kvm_hyp_init)
- ventry __invalid // Synchronous EL2t
- ventry __invalid // IRQ EL2t
- ventry __invalid // FIQ EL2t
- ventry __invalid // Error EL2t
+ ventry . // Synchronous EL2t
+ ventry . // IRQ EL2t
+ ventry . // FIQ EL2t
+ ventry . // Error EL2t
- ventry __invalid // Synchronous EL2h
- ventry __invalid // IRQ EL2h
- ventry __invalid // FIQ EL2h
- ventry __invalid // Error EL2h
+ ventry . // Synchronous EL2h
+ ventry . // IRQ EL2h
+ ventry . // FIQ EL2h
+ ventry . // Error EL2h
ventry __do_hyp_init // Synchronous 64-bit EL1
- ventry __invalid // IRQ 64-bit EL1
- ventry __invalid // FIQ 64-bit EL1
- ventry __invalid // Error 64-bit EL1
+ ventry . // IRQ 64-bit EL1
+ ventry . // FIQ 64-bit EL1
+ ventry . // Error 64-bit EL1
- ventry __invalid // Synchronous 32-bit EL1
- ventry __invalid // IRQ 32-bit EL1
- ventry __invalid // FIQ 32-bit EL1
- ventry __invalid // Error 32-bit EL1
-
-__invalid:
- b .
+ ventry . // Synchronous 32-bit EL1
+ ventry . // IRQ 32-bit EL1
+ ventry . // FIQ 32-bit EL1
+ ventry . // Error 32-bit EL1
/*
* Only uses x0..x3 so as to not clobber callee-saved SMCCC registers.
@@ -57,6 +55,7 @@ __do_hyp_init:
cmp x0, #HVC_STUB_HCALL_NR
b.lo __kvm_handle_stub_hvc
+ bic x0, x0, #ARM_SMCCC_CALL_HINTS
mov x3, #KVM_HOST_SMCCC_FUNC(__kvm_hyp_init)
cmp x0, x3
b.eq 1f
@@ -75,6 +74,17 @@ __do_hyp_init:
SYM_CODE_END(__kvm_hyp_init)
/*
+ * Initialize EL2 CPU state to sane values.
+ *
+ * HCR_EL2.E2H must have been initialized already.
+ */
+SYM_CODE_START_LOCAL(__kvm_init_el2_state)
+ init_el2_state // Clobbers x0..x2
+ finalise_el2_state
+ ret
+SYM_CODE_END(__kvm_init_el2_state)
+
+/*
* Initialize the hypervisor in EL2.
*
* Only uses x0..x2 so as to not clobber callee-saved SMCCC registers
@@ -83,9 +93,6 @@ SYM_CODE_END(__kvm_hyp_init)
* x0: struct kvm_nvhe_init_params PA
*/
SYM_CODE_START_LOCAL(___kvm_hyp_init)
- ldr x1, [x0, #NVHE_INIT_TPIDR_EL2]
- msr tpidr_el2, x1
-
ldr x1, [x0, #NVHE_INIT_STACK_HYP_VA]
mov sp, x1
@@ -93,7 +100,26 @@ SYM_CODE_START_LOCAL(___kvm_hyp_init)
msr mair_el2, x1
ldr x1, [x0, #NVHE_INIT_HCR_EL2]
- msr hcr_el2, x1
+ msr_hcr_el2 x1
+
+ mov x2, #HCR_E2H
+ and x2, x1, x2
+ cbz x2, 1f
+
+ // hVHE: Replay the EL2 setup to account for the E2H bit
+ // TPIDR_EL2 is used to preserve x0 across the macro maze...
+ isb
+ msr tpidr_el2, x0
+ str lr, [x0, #NVHE_INIT_TMP]
+
+ bl __kvm_init_el2_state
+
+ mrs x0, tpidr_el2
+ ldr lr, [x0, #NVHE_INIT_TMP]
+
+1:
+ ldr x1, [x0, #NVHE_INIT_TPIDR_EL2]
+ msr tpidr_el2, x1
ldr x1, [x0, #NVHE_INIT_VTTBR]
msr vttbr_el2, x1
@@ -108,18 +134,14 @@ alternative_if ARM64_HAS_CNP
alternative_else_nop_endif
msr ttbr0_el2, x2
- /*
- * Set the PS bits in TCR_EL2.
- */
ldr x0, [x0, #NVHE_INIT_TCR_EL2]
- tcr_compute_pa_size x0, #TCR_EL2_PS_SHIFT, x1, x2
msr tcr_el2, x0
isb
/* Invalidate the stale TLBs from Bootloader */
tlbi alle2
- tlbi vmalls12e1
+ tlbi alle1
dsb sy
mov_q x0, INIT_SCTLR_EL2_MMU_ON
@@ -128,6 +150,13 @@ alternative_if ARM64_HAS_ADDRESS_AUTH
SCTLR_ELx_ENDA | SCTLR_ELx_ENDB)
orr x0, x0, x1
alternative_else_nop_endif
+
+#ifdef CONFIG_ARM64_BTI_KERNEL
+alternative_if ARM64_BTI
+ orr x0, x0, #SCTLR_EL2_BT
+alternative_else_nop_endif
+#endif /* CONFIG_ARM64_BTI_KERNEL */
+
msr sctlr_el2, x0
isb
@@ -181,8 +210,9 @@ SYM_CODE_START_LOCAL(__kvm_hyp_init_cpu)
2: msr SPsel, #1 // We want to use SP_EL{1,2}
- /* Initialize EL2 CPU state to sane values. */
- init_el2_state // Clobbers x0..x2
+ init_el2_hcr 0
+
+ bl __kvm_init_el2_state
/* Enable MMU, set vectors and stack. */
mov x0, x28
@@ -195,6 +225,11 @@ SYM_CODE_START_LOCAL(__kvm_hyp_init_cpu)
SYM_CODE_END(__kvm_hyp_init_cpu)
SYM_CODE_START(__kvm_handle_stub_hvc)
+ /*
+ * __kvm_handle_stub_hvc called from __host_hvc through branch instruction(br) so
+ * we need bti j at beginning.
+ */
+ bti j
cmp x0, #HVC_SOFT_RESTART
b.ne 1f
@@ -227,7 +262,7 @@ reset:
alternative_if ARM64_KVM_PROTECTED_MODE
mov_q x5, HCR_HOST_NVHE_FLAGS
- msr hcr_el2, x5
+ msr_hcr_el2 x5
alternative_else_nop_endif
/* Install stub vectors */
@@ -241,31 +276,38 @@ alternative_else_nop_endif
SYM_CODE_END(__kvm_handle_stub_hvc)
-SYM_FUNC_START(__pkvm_init_switch_pgd)
+/*
+ * void __pkvm_init_switch_pgd(phys_addr_t pgd, unsigned long sp,
+ * void (*fn)(void));
+ *
+ * SYM_TYPED_FUNC_START() allows C to call this ID-mapped function indirectly
+ * using a physical pointer without triggering a kCFI failure.
+ */
+SYM_TYPED_FUNC_START(__pkvm_init_switch_pgd)
/* Turn the MMU off */
pre_disable_mmu_workaround
- mrs x2, sctlr_el2
- bic x3, x2, #SCTLR_ELx_M
- msr sctlr_el2, x3
+ mrs x3, sctlr_el2
+ bic x4, x3, #SCTLR_ELx_M
+ msr sctlr_el2, x4
isb
tlbi alle2
/* Install the new pgtables */
- ldr x3, [x0, #NVHE_INIT_PGD_PA]
- phys_to_ttbr x4, x3
+ phys_to_ttbr x5, x0
alternative_if ARM64_HAS_CNP
- orr x4, x4, #TTBR_CNP_BIT
+ orr x5, x5, #TTBR_CNP_BIT
alternative_else_nop_endif
- msr ttbr0_el2, x4
+ msr ttbr0_el2, x5
/* Set the new stack pointer */
- ldr x0, [x0, #NVHE_INIT_STACK_HYP_VA]
- mov sp, x0
+ mov sp, x1
/* And turn the MMU back on! */
- set_sctlr_el2 x2
- ret x1
+ dsb nsh
+ isb
+ set_sctlr_el2 x3
+ ret x2
SYM_FUNC_END(__pkvm_init_switch_pgd)
.popsection
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
index 728e01d4536b..a7c689152f68 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -5,6 +5,7 @@
*/
#include <hyp/adjust_pc.h>
+#include <hyp/switch.h>
#include <asm/pgtable-types.h>
#include <asm/kvm_asm.h>
@@ -13,6 +14,7 @@
#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>
+#include <nvhe/ffa.h>
#include <nvhe/mem_protect.h>
#include <nvhe/mm.h>
#include <nvhe/pkvm.h>
@@ -22,26 +24,114 @@ DEFINE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
void __kvm_hyp_host_forward_smc(struct kvm_cpu_context *host_ctxt);
-static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu)
+static void __hyp_sve_save_guest(struct kvm_vcpu *vcpu)
+{
+ __vcpu_assign_sys_reg(vcpu, ZCR_EL1, read_sysreg_el1(SYS_ZCR));
+ /*
+ * On saving/restoring guest sve state, always use the maximum VL for
+ * the guest. The layout of the data when saving the sve state depends
+ * on the VL, so use a consistent (i.e., the maximum) guest VL.
+ */
+ sve_cond_update_zcr_vq(vcpu_sve_max_vq(vcpu) - 1, SYS_ZCR_EL2);
+ __sve_save_state(vcpu_sve_pffr(vcpu), &vcpu->arch.ctxt.fp_regs.fpsr, true);
+ write_sysreg_s(sve_vq_from_vl(kvm_host_sve_max_vl) - 1, SYS_ZCR_EL2);
+}
+
+static void __hyp_sve_restore_host(void)
+{
+ struct cpu_sve_state *sve_state = *host_data_ptr(sve_state);
+
+ /*
+ * On saving/restoring host sve state, always use the maximum VL for
+ * the host. The layout of the data when saving the sve state depends
+ * on the VL, so use a consistent (i.e., the maximum) host VL.
+ *
+ * Note that this constrains the PE to the maximum shared VL
+ * that was discovered, if we wish to use larger VLs this will
+ * need to be revisited.
+ */
+ write_sysreg_s(sve_vq_from_vl(kvm_host_sve_max_vl) - 1, SYS_ZCR_EL2);
+ __sve_restore_state(sve_state->sve_regs + sve_ffr_offset(kvm_host_sve_max_vl),
+ &sve_state->fpsr,
+ true);
+ write_sysreg_el1(sve_state->zcr_el1, SYS_ZCR);
+}
+
+static void fpsimd_sve_flush(void)
+{
+ *host_data_ptr(fp_owner) = FP_STATE_HOST_OWNED;
+}
+
+static void fpsimd_sve_sync(struct kvm_vcpu *vcpu)
+{
+ bool has_fpmr;
+
+ if (!guest_owns_fp_regs())
+ return;
+
+ /*
+ * Traps have been disabled by __deactivate_cptr_traps(), but there
+ * hasn't necessarily been a context synchronization event yet.
+ */
+ isb();
+
+ if (vcpu_has_sve(vcpu))
+ __hyp_sve_save_guest(vcpu);
+ else
+ __fpsimd_save_state(&vcpu->arch.ctxt.fp_regs);
+
+ has_fpmr = kvm_has_fpmr(kern_hyp_va(vcpu->kvm));
+ if (has_fpmr)
+ __vcpu_assign_sys_reg(vcpu, FPMR, read_sysreg_s(SYS_FPMR));
+
+ if (system_supports_sve())
+ __hyp_sve_restore_host();
+ else
+ __fpsimd_restore_state(host_data_ptr(host_ctxt.fp_regs));
+
+ if (has_fpmr)
+ write_sysreg_s(*host_data_ptr(fpmr), SYS_FPMR);
+
+ *host_data_ptr(fp_owner) = FP_STATE_HOST_OWNED;
+}
+
+static void flush_debug_state(struct pkvm_hyp_vcpu *hyp_vcpu)
{
struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu;
- hyp_vcpu->vcpu.arch.ctxt = host_vcpu->arch.ctxt;
+ hyp_vcpu->vcpu.arch.debug_owner = host_vcpu->arch.debug_owner;
- hyp_vcpu->vcpu.arch.sve_state = kern_hyp_va(host_vcpu->arch.sve_state);
- hyp_vcpu->vcpu.arch.sve_max_vl = host_vcpu->arch.sve_max_vl;
+ if (kvm_guest_owns_debug_regs(&hyp_vcpu->vcpu))
+ hyp_vcpu->vcpu.arch.vcpu_debug_state = host_vcpu->arch.vcpu_debug_state;
+ else if (kvm_host_owns_debug_regs(&hyp_vcpu->vcpu))
+ hyp_vcpu->vcpu.arch.external_debug_state = host_vcpu->arch.external_debug_state;
+}
+
+static void sync_debug_state(struct pkvm_hyp_vcpu *hyp_vcpu)
+{
+ struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu;
+
+ if (kvm_guest_owns_debug_regs(&hyp_vcpu->vcpu))
+ host_vcpu->arch.vcpu_debug_state = hyp_vcpu->vcpu.arch.vcpu_debug_state;
+ else if (kvm_host_owns_debug_regs(&hyp_vcpu->vcpu))
+ host_vcpu->arch.external_debug_state = hyp_vcpu->vcpu.arch.external_debug_state;
+}
+
+static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu)
+{
+ struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu;
- hyp_vcpu->vcpu.arch.hw_mmu = host_vcpu->arch.hw_mmu;
+ fpsimd_sve_flush();
+ flush_debug_state(hyp_vcpu);
+
+ hyp_vcpu->vcpu.arch.ctxt = host_vcpu->arch.ctxt;
- hyp_vcpu->vcpu.arch.hcr_el2 = host_vcpu->arch.hcr_el2;
hyp_vcpu->vcpu.arch.mdcr_el2 = host_vcpu->arch.mdcr_el2;
- hyp_vcpu->vcpu.arch.cptr_el2 = host_vcpu->arch.cptr_el2;
+ hyp_vcpu->vcpu.arch.hcr_el2 &= ~(HCR_TWI | HCR_TWE);
+ hyp_vcpu->vcpu.arch.hcr_el2 |= READ_ONCE(host_vcpu->arch.hcr_el2) &
+ (HCR_TWI | HCR_TWE);
hyp_vcpu->vcpu.arch.iflags = host_vcpu->arch.iflags;
- hyp_vcpu->vcpu.arch.fp_state = host_vcpu->arch.fp_state;
-
- hyp_vcpu->vcpu.arch.debug_ptr = kern_hyp_va(host_vcpu->arch.debug_ptr);
- hyp_vcpu->vcpu.arch.host_fpsimd_state = host_vcpu->arch.host_fpsimd_state;
hyp_vcpu->vcpu.arch.vsesr_el2 = host_vcpu->arch.vsesr_el2;
@@ -55,35 +145,75 @@ static void sync_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu)
struct vgic_v3_cpu_if *host_cpu_if = &host_vcpu->arch.vgic_cpu.vgic_v3;
unsigned int i;
+ fpsimd_sve_sync(&hyp_vcpu->vcpu);
+ sync_debug_state(hyp_vcpu);
+
host_vcpu->arch.ctxt = hyp_vcpu->vcpu.arch.ctxt;
host_vcpu->arch.hcr_el2 = hyp_vcpu->vcpu.arch.hcr_el2;
- host_vcpu->arch.cptr_el2 = hyp_vcpu->vcpu.arch.cptr_el2;
host_vcpu->arch.fault = hyp_vcpu->vcpu.arch.fault;
host_vcpu->arch.iflags = hyp_vcpu->vcpu.arch.iflags;
- host_vcpu->arch.fp_state = hyp_vcpu->vcpu.arch.fp_state;
host_cpu_if->vgic_hcr = hyp_cpu_if->vgic_hcr;
+ host_cpu_if->vgic_vmcr = hyp_cpu_if->vgic_vmcr;
for (i = 0; i < hyp_cpu_if->used_lrs; ++i)
host_cpu_if->vgic_lr[i] = hyp_cpu_if->vgic_lr[i];
}
+static void handle___pkvm_vcpu_load(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1);
+ DECLARE_REG(unsigned int, vcpu_idx, host_ctxt, 2);
+ DECLARE_REG(u64, hcr_el2, host_ctxt, 3);
+ struct pkvm_hyp_vcpu *hyp_vcpu;
+
+ if (!is_protected_kvm_enabled())
+ return;
+
+ hyp_vcpu = pkvm_load_hyp_vcpu(handle, vcpu_idx);
+ if (!hyp_vcpu)
+ return;
+
+ if (pkvm_hyp_vcpu_is_protected(hyp_vcpu)) {
+ /* Propagate WFx trapping flags */
+ hyp_vcpu->vcpu.arch.hcr_el2 &= ~(HCR_TWE | HCR_TWI);
+ hyp_vcpu->vcpu.arch.hcr_el2 |= hcr_el2 & (HCR_TWE | HCR_TWI);
+ }
+}
+
+static void handle___pkvm_vcpu_put(struct kvm_cpu_context *host_ctxt)
+{
+ struct pkvm_hyp_vcpu *hyp_vcpu;
+
+ if (!is_protected_kvm_enabled())
+ return;
+
+ hyp_vcpu = pkvm_get_loaded_hyp_vcpu();
+ if (hyp_vcpu)
+ pkvm_put_hyp_vcpu(hyp_vcpu);
+}
+
static void handle___kvm_vcpu_run(struct kvm_cpu_context *host_ctxt)
{
DECLARE_REG(struct kvm_vcpu *, host_vcpu, host_ctxt, 1);
int ret;
- host_vcpu = kern_hyp_va(host_vcpu);
-
if (unlikely(is_protected_kvm_enabled())) {
- struct pkvm_hyp_vcpu *hyp_vcpu;
- struct kvm *host_kvm;
+ struct pkvm_hyp_vcpu *hyp_vcpu = pkvm_get_loaded_hyp_vcpu();
+
+ /*
+ * KVM (and pKVM) doesn't support SME guests for now, and
+ * ensures that SME features aren't enabled in pstate when
+ * loading a vcpu. Therefore, if SME features enabled the host
+ * is misbehaving.
+ */
+ if (unlikely(system_supports_sme() && read_sysreg_s(SYS_SVCR))) {
+ ret = -EINVAL;
+ goto out;
+ }
- host_kvm = kern_hyp_va(host_vcpu->kvm);
- hyp_vcpu = pkvm_load_hyp_vcpu(host_kvm->arch.pkvm.handle,
- host_vcpu->vcpu_idx);
if (!hyp_vcpu) {
ret = -EINVAL;
goto out;
@@ -94,12 +224,149 @@ static void handle___kvm_vcpu_run(struct kvm_cpu_context *host_ctxt)
ret = __kvm_vcpu_run(&hyp_vcpu->vcpu);
sync_hyp_vcpu(hyp_vcpu);
- pkvm_put_hyp_vcpu(hyp_vcpu);
} else {
+ struct kvm_vcpu *vcpu = kern_hyp_va(host_vcpu);
+
/* The host is fully trusted, run its vCPU directly. */
- ret = __kvm_vcpu_run(host_vcpu);
+ fpsimd_lazy_switch_to_guest(vcpu);
+ ret = __kvm_vcpu_run(vcpu);
+ fpsimd_lazy_switch_to_host(vcpu);
}
+out:
+ cpu_reg(host_ctxt, 1) = ret;
+}
+
+static int pkvm_refill_memcache(struct pkvm_hyp_vcpu *hyp_vcpu)
+{
+ struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu;
+
+ return refill_memcache(&hyp_vcpu->vcpu.arch.pkvm_memcache,
+ host_vcpu->arch.pkvm_memcache.nr_pages,
+ &host_vcpu->arch.pkvm_memcache);
+}
+
+static void handle___pkvm_host_share_guest(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(u64, pfn, host_ctxt, 1);
+ DECLARE_REG(u64, gfn, host_ctxt, 2);
+ DECLARE_REG(u64, nr_pages, host_ctxt, 3);
+ DECLARE_REG(enum kvm_pgtable_prot, prot, host_ctxt, 4);
+ struct pkvm_hyp_vcpu *hyp_vcpu;
+ int ret = -EINVAL;
+
+ if (!is_protected_kvm_enabled())
+ goto out;
+
+ hyp_vcpu = pkvm_get_loaded_hyp_vcpu();
+ if (!hyp_vcpu || pkvm_hyp_vcpu_is_protected(hyp_vcpu))
+ goto out;
+ ret = pkvm_refill_memcache(hyp_vcpu);
+ if (ret)
+ goto out;
+
+ ret = __pkvm_host_share_guest(pfn, gfn, nr_pages, hyp_vcpu, prot);
+out:
+ cpu_reg(host_ctxt, 1) = ret;
+}
+
+static void handle___pkvm_host_unshare_guest(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1);
+ DECLARE_REG(u64, gfn, host_ctxt, 2);
+ DECLARE_REG(u64, nr_pages, host_ctxt, 3);
+ struct pkvm_hyp_vm *hyp_vm;
+ int ret = -EINVAL;
+
+ if (!is_protected_kvm_enabled())
+ goto out;
+
+ hyp_vm = get_np_pkvm_hyp_vm(handle);
+ if (!hyp_vm)
+ goto out;
+
+ ret = __pkvm_host_unshare_guest(gfn, nr_pages, hyp_vm);
+ put_pkvm_hyp_vm(hyp_vm);
+out:
+ cpu_reg(host_ctxt, 1) = ret;
+}
+
+static void handle___pkvm_host_relax_perms_guest(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(u64, gfn, host_ctxt, 1);
+ DECLARE_REG(enum kvm_pgtable_prot, prot, host_ctxt, 2);
+ struct pkvm_hyp_vcpu *hyp_vcpu;
+ int ret = -EINVAL;
+
+ if (!is_protected_kvm_enabled())
+ goto out;
+
+ hyp_vcpu = pkvm_get_loaded_hyp_vcpu();
+ if (!hyp_vcpu || pkvm_hyp_vcpu_is_protected(hyp_vcpu))
+ goto out;
+
+ ret = __pkvm_host_relax_perms_guest(gfn, hyp_vcpu, prot);
+out:
+ cpu_reg(host_ctxt, 1) = ret;
+}
+
+static void handle___pkvm_host_wrprotect_guest(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1);
+ DECLARE_REG(u64, gfn, host_ctxt, 2);
+ DECLARE_REG(u64, nr_pages, host_ctxt, 3);
+ struct pkvm_hyp_vm *hyp_vm;
+ int ret = -EINVAL;
+
+ if (!is_protected_kvm_enabled())
+ goto out;
+
+ hyp_vm = get_np_pkvm_hyp_vm(handle);
+ if (!hyp_vm)
+ goto out;
+
+ ret = __pkvm_host_wrprotect_guest(gfn, nr_pages, hyp_vm);
+ put_pkvm_hyp_vm(hyp_vm);
+out:
+ cpu_reg(host_ctxt, 1) = ret;
+}
+
+static void handle___pkvm_host_test_clear_young_guest(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1);
+ DECLARE_REG(u64, gfn, host_ctxt, 2);
+ DECLARE_REG(u64, nr_pages, host_ctxt, 3);
+ DECLARE_REG(bool, mkold, host_ctxt, 4);
+ struct pkvm_hyp_vm *hyp_vm;
+ int ret = -EINVAL;
+
+ if (!is_protected_kvm_enabled())
+ goto out;
+
+ hyp_vm = get_np_pkvm_hyp_vm(handle);
+ if (!hyp_vm)
+ goto out;
+
+ ret = __pkvm_host_test_clear_young_guest(gfn, nr_pages, mkold, hyp_vm);
+ put_pkvm_hyp_vm(hyp_vm);
+out:
+ cpu_reg(host_ctxt, 1) = ret;
+}
+
+static void handle___pkvm_host_mkyoung_guest(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(u64, gfn, host_ctxt, 1);
+ struct pkvm_hyp_vcpu *hyp_vcpu;
+ int ret = -EINVAL;
+
+ if (!is_protected_kvm_enabled())
+ goto out;
+
+ hyp_vcpu = pkvm_get_loaded_hyp_vcpu();
+ if (!hyp_vcpu || pkvm_hyp_vcpu_is_protected(hyp_vcpu))
+ goto out;
+
+ ret = __pkvm_host_mkyoung_guest(gfn, hyp_vcpu);
out:
cpu_reg(host_ctxt, 1) = ret;
}
@@ -125,6 +392,25 @@ static void handle___kvm_tlb_flush_vmid_ipa(struct kvm_cpu_context *host_ctxt)
__kvm_tlb_flush_vmid_ipa(kern_hyp_va(mmu), ipa, level);
}
+static void handle___kvm_tlb_flush_vmid_ipa_nsh(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1);
+ DECLARE_REG(phys_addr_t, ipa, host_ctxt, 2);
+ DECLARE_REG(int, level, host_ctxt, 3);
+
+ __kvm_tlb_flush_vmid_ipa_nsh(kern_hyp_va(mmu), ipa, level);
+}
+
+static void
+handle___kvm_tlb_flush_vmid_range(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1);
+ DECLARE_REG(phys_addr_t, start, host_ctxt, 2);
+ DECLARE_REG(unsigned long, pages, host_ctxt, 3);
+
+ __kvm_tlb_flush_vmid_range(kern_hyp_va(mmu), start, pages);
+}
+
static void handle___kvm_tlb_flush_vmid(struct kvm_cpu_context *host_ctxt)
{
DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1);
@@ -132,6 +418,22 @@ static void handle___kvm_tlb_flush_vmid(struct kvm_cpu_context *host_ctxt)
__kvm_tlb_flush_vmid(kern_hyp_va(mmu));
}
+static void handle___pkvm_tlb_flush_vmid(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1);
+ struct pkvm_hyp_vm *hyp_vm;
+
+ if (!is_protected_kvm_enabled())
+ return;
+
+ hyp_vm = get_np_pkvm_hyp_vm(handle);
+ if (!hyp_vm)
+ return;
+
+ __kvm_tlb_flush_vmid(&hyp_vm->kvm.arch.mmu);
+ put_pkvm_hyp_vm(hyp_vm);
+}
+
static void handle___kvm_flush_cpu_context(struct kvm_cpu_context *host_ctxt)
{
DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1);
@@ -158,26 +460,11 @@ static void handle___vgic_v3_get_gic_config(struct kvm_cpu_context *host_ctxt)
cpu_reg(host_ctxt, 1) = __vgic_v3_get_gic_config();
}
-static void handle___vgic_v3_read_vmcr(struct kvm_cpu_context *host_ctxt)
-{
- cpu_reg(host_ctxt, 1) = __vgic_v3_read_vmcr();
-}
-
-static void handle___vgic_v3_write_vmcr(struct kvm_cpu_context *host_ctxt)
-{
- __vgic_v3_write_vmcr(cpu_reg(host_ctxt, 1));
-}
-
static void handle___vgic_v3_init_lrs(struct kvm_cpu_context *host_ctxt)
{
__vgic_v3_init_lrs();
}
-static void handle___kvm_get_mdcr_el2(struct kvm_cpu_context *host_ctxt)
-{
- cpu_reg(host_ctxt, 1) = __kvm_get_mdcr_el2();
-}
-
static void handle___vgic_v3_save_aprs(struct kvm_cpu_context *host_ctxt)
{
DECLARE_REG(struct vgic_v3_cpu_if *, cpu_if, host_ctxt, 1);
@@ -185,11 +472,11 @@ static void handle___vgic_v3_save_aprs(struct kvm_cpu_context *host_ctxt)
__vgic_v3_save_aprs(kern_hyp_va(cpu_if));
}
-static void handle___vgic_v3_restore_aprs(struct kvm_cpu_context *host_ctxt)
+static void handle___vgic_v3_restore_vmcr_aprs(struct kvm_cpu_context *host_ctxt)
{
DECLARE_REG(struct vgic_v3_cpu_if *, cpu_if, host_ctxt, 1);
- __vgic_v3_restore_aprs(kern_hyp_va(cpu_if));
+ __vgic_v3_restore_vmcr_aprs(kern_hyp_va(cpu_if));
}
static void handle___pkvm_init(struct kvm_cpu_context *host_ctxt)
@@ -260,11 +547,16 @@ static void handle___pkvm_prot_finalize(struct kvm_cpu_context *host_ctxt)
cpu_reg(host_ctxt, 1) = __pkvm_prot_finalize();
}
-static void handle___pkvm_vcpu_init_traps(struct kvm_cpu_context *host_ctxt)
+static void handle___pkvm_reserve_vm(struct kvm_cpu_context *host_ctxt)
{
- DECLARE_REG(struct kvm_vcpu *, vcpu, host_ctxt, 1);
+ cpu_reg(host_ctxt, 1) = __pkvm_reserve_vm();
+}
- __pkvm_vcpu_init_traps(kern_hyp_va(vcpu));
+static void handle___pkvm_unreserve_vm(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1);
+
+ __pkvm_unreserve_vm(handle);
}
static void handle___pkvm_init_vm(struct kvm_cpu_context *host_ctxt)
@@ -300,7 +592,6 @@ typedef void (*hcall_t)(struct kvm_cpu_context *);
static const hcall_t host_hcall[] = {
/* ___kvm_hyp_init */
- HANDLE_FUNC(__kvm_get_mdcr_el2),
HANDLE_FUNC(__pkvm_init),
HANDLE_FUNC(__pkvm_create_private_mapping),
HANDLE_FUNC(__pkvm_cpu_set_vector),
@@ -311,21 +602,31 @@ static const hcall_t host_hcall[] = {
HANDLE_FUNC(__pkvm_host_share_hyp),
HANDLE_FUNC(__pkvm_host_unshare_hyp),
+ HANDLE_FUNC(__pkvm_host_share_guest),
+ HANDLE_FUNC(__pkvm_host_unshare_guest),
+ HANDLE_FUNC(__pkvm_host_relax_perms_guest),
+ HANDLE_FUNC(__pkvm_host_wrprotect_guest),
+ HANDLE_FUNC(__pkvm_host_test_clear_young_guest),
+ HANDLE_FUNC(__pkvm_host_mkyoung_guest),
HANDLE_FUNC(__kvm_adjust_pc),
HANDLE_FUNC(__kvm_vcpu_run),
HANDLE_FUNC(__kvm_flush_vm_context),
HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa),
+ HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa_nsh),
HANDLE_FUNC(__kvm_tlb_flush_vmid),
+ HANDLE_FUNC(__kvm_tlb_flush_vmid_range),
HANDLE_FUNC(__kvm_flush_cpu_context),
HANDLE_FUNC(__kvm_timer_set_cntvoff),
- HANDLE_FUNC(__vgic_v3_read_vmcr),
- HANDLE_FUNC(__vgic_v3_write_vmcr),
HANDLE_FUNC(__vgic_v3_save_aprs),
- HANDLE_FUNC(__vgic_v3_restore_aprs),
- HANDLE_FUNC(__pkvm_vcpu_init_traps),
+ HANDLE_FUNC(__vgic_v3_restore_vmcr_aprs),
+ HANDLE_FUNC(__pkvm_reserve_vm),
+ HANDLE_FUNC(__pkvm_unreserve_vm),
HANDLE_FUNC(__pkvm_init_vm),
HANDLE_FUNC(__pkvm_init_vcpu),
HANDLE_FUNC(__pkvm_teardown_vm),
+ HANDLE_FUNC(__pkvm_vcpu_load),
+ HANDLE_FUNC(__pkvm_vcpu_put),
+ HANDLE_FUNC(__pkvm_tlb_flush_vmid),
};
static void handle_host_hcall(struct kvm_cpu_context *host_ctxt)
@@ -346,6 +647,7 @@ static void handle_host_hcall(struct kvm_cpu_context *host_ctxt)
if (static_branch_unlikely(&kvm_protected_mode_initialized))
hcall_min = __KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize;
+ id &= ~ARM_SMCCC_CALL_HINTS;
id -= KVM_HOST_SMCCC_ID(0);
if (unlikely(id < hcall_min || id >= ARRAY_SIZE(host_hcall)))
@@ -370,9 +672,14 @@ static void default_host_smc_handler(struct kvm_cpu_context *host_ctxt)
static void handle_host_smc(struct kvm_cpu_context *host_ctxt)
{
+ DECLARE_REG(u64, func_id, host_ctxt, 0);
bool handled;
- handled = kvm_host_psci_handler(host_ctxt);
+ func_id &= ~ARM_SMCCC_CALL_HINTS;
+
+ handled = kvm_host_psci_handler(host_ctxt, func_id);
+ if (!handled)
+ handled = kvm_host_ffa_handler(host_ctxt, func_id);
if (!handled)
default_host_smc_handler(host_ctxt);
@@ -391,11 +698,6 @@ void handle_trap(struct kvm_cpu_context *host_ctxt)
case ESR_ELx_EC_SMC64:
handle_host_smc(host_ctxt);
break;
- case ESR_ELx_EC_SVE:
- sysreg_clear_set(cptr_el2, CPTR_EL2_TZ, 0);
- isb();
- sve_cond_update_zcr_vq(ZCR_ELx_LEN_MASK, SYS_ZCR_EL2);
- break;
case ESR_ELx_EC_IABT_LOW:
case ESR_ELx_EC_DABT_LOW:
handle_host_mem_abort(host_ctxt);
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S
index f4562f417d3f..d724f6d69302 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S
+++ b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S
@@ -25,5 +25,7 @@ SECTIONS {
BEGIN_HYP_SECTION(.data..percpu)
PERCPU_INPUT(L1_CACHE_BYTES)
END_HYP_SECTION
+
HYP_SECTION(.bss)
+ HYP_SECTION(.data)
}
diff --git a/arch/arm64/kvm/hyp/nvhe/list_debug.c b/arch/arm64/kvm/hyp/nvhe/list_debug.c
index d68abd7ea124..baa6260f88dc 100644
--- a/arch/arm64/kvm/hyp/nvhe/list_debug.c
+++ b/arch/arm64/kvm/hyp/nvhe/list_debug.c
@@ -17,7 +17,7 @@ static inline __must_check bool nvhe_check_data_corruption(bool v)
bool corruption = unlikely(condition); \
if (corruption) { \
if (IS_ENABLED(CONFIG_BUG_ON_DATA_CORRUPTION)) { \
- BUG_ON(1); \
+ BUG(); \
} else \
WARN_ON(1); \
} \
@@ -26,8 +26,9 @@ static inline __must_check bool nvhe_check_data_corruption(bool v)
/* The predicates checked here are taken from lib/list_debug.c. */
-bool __list_add_valid(struct list_head *new, struct list_head *prev,
- struct list_head *next)
+__list_valid_slowpath
+bool __list_add_valid_or_report(struct list_head *new, struct list_head *prev,
+ struct list_head *next)
{
if (NVHE_CHECK_DATA_CORRUPTION(next->prev != prev) ||
NVHE_CHECK_DATA_CORRUPTION(prev->next != next) ||
@@ -37,7 +38,8 @@ bool __list_add_valid(struct list_head *new, struct list_head *prev,
return true;
}
-bool __list_del_entry_valid(struct list_head *entry)
+__list_valid_slowpath
+bool __list_del_entry_valid_or_report(struct list_head *entry)
{
struct list_head *prev, *next;
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
index 552653fa18be..49db32f3ddf7 100644
--- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -60,6 +60,11 @@ static void hyp_unlock_component(void)
hyp_spin_unlock(&pkvm_pgd_lock);
}
+#define for_each_hyp_page(__p, __st, __sz) \
+ for (struct hyp_page *__p = hyp_phys_to_page(__st), \
+ *__e = __p + ((__sz) >> PAGE_SHIFT); \
+ __p < __e; __p++)
+
static void *host_s2_zalloc_pages_exact(size_t size)
{
void *addr = hyp_alloc_pages(&host_s2_pool, get_order(size));
@@ -91,9 +96,9 @@ static void host_s2_put_page(void *addr)
hyp_put_page(&host_s2_pool, addr);
}
-static void host_s2_free_removed_table(void *addr, u32 level)
+static void host_s2_free_unlinked_table(void *addr, s8 level)
{
- kvm_pgtable_stage2_free_removed(&host_mmu.mm_ops, addr, level);
+ kvm_pgtable_stage2_free_unlinked(&host_mmu.mm_ops, addr, level);
}
static int prepare_s2_pool(void *pgt_pool_base)
@@ -110,7 +115,7 @@ static int prepare_s2_pool(void *pgt_pool_base)
host_mmu.mm_ops = (struct kvm_pgtable_mm_ops) {
.zalloc_pages_exact = host_s2_zalloc_pages_exact,
.zalloc_page = host_s2_zalloc_page,
- .free_removed_table = host_s2_free_removed_table,
+ .free_unlinked_table = host_s2_free_unlinked_table,
.phys_to_virt = hyp_phys_to_virt,
.virt_to_phys = hyp_virt_to_phys,
.page_count = hyp_page_count,
@@ -129,8 +134,8 @@ static void prepare_host_vtcr(void)
parange = kvm_get_parange(id_aa64mmfr0_el1_sys_val);
phys_shift = id_aa64mmfr0_parange_to_phys_shift(parange);
- host_mmu.arch.vtcr = kvm_get_vtcr(id_aa64mmfr0_el1_sys_val,
- id_aa64mmfr1_el1_sys_val, phys_shift);
+ host_mmu.arch.mmu.vtcr = kvm_get_vtcr(id_aa64mmfr0_el1_sys_val,
+ id_aa64mmfr1_el1_sys_val, phys_shift);
}
static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot prot);
@@ -161,12 +166,6 @@ int kvm_host_prepare_stage2(void *pgt_pool_base)
return 0;
}
-static bool guest_stage2_force_pte_cb(u64 addr, u64 end,
- enum kvm_pgtable_prot prot)
-{
- return true;
-}
-
static void *guest_s2_zalloc_pages_exact(size_t size)
{
void *addr = hyp_alloc_pages(&current_vm->pool, get_order(size));
@@ -201,8 +200,8 @@ static void *guest_s2_zalloc_page(void *mc)
memset(addr, 0, PAGE_SIZE);
p = hyp_virt_to_page(addr);
- memset(p, 0, sizeof(*p));
p->refcount = 1;
+ p->order = 0;
return addr;
}
@@ -217,16 +216,42 @@ static void guest_s2_put_page(void *addr)
hyp_put_page(&current_vm->pool, addr);
}
+static void __apply_guest_page(void *va, size_t size,
+ void (*func)(void *addr, size_t size))
+{
+ size += va - PTR_ALIGN_DOWN(va, PAGE_SIZE);
+ va = PTR_ALIGN_DOWN(va, PAGE_SIZE);
+ size = PAGE_ALIGN(size);
+
+ while (size) {
+ size_t map_size = PAGE_SIZE;
+ void *map;
+
+ if (IS_ALIGNED((unsigned long)va, PMD_SIZE) && size >= PMD_SIZE)
+ map = hyp_fixblock_map(__hyp_pa(va), &map_size);
+ else
+ map = hyp_fixmap_map(__hyp_pa(va));
+
+ func(map, map_size);
+
+ if (map_size == PMD_SIZE)
+ hyp_fixblock_unmap();
+ else
+ hyp_fixmap_unmap();
+
+ size -= map_size;
+ va += map_size;
+ }
+}
+
static void clean_dcache_guest_page(void *va, size_t size)
{
- __clean_dcache_guest_page(hyp_fixmap_map(__hyp_pa(va)), size);
- hyp_fixmap_unmap();
+ __apply_guest_page(va, size, __clean_dcache_guest_page);
}
static void invalidate_icache_guest_page(void *va, size_t size)
{
- __invalidate_icache_guest_page(hyp_fixmap_map(__hyp_pa(va)), size);
- hyp_fixmap_unmap();
+ __apply_guest_page(va, size, __invalidate_icache_guest_page);
}
int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd)
@@ -235,7 +260,7 @@ int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd)
unsigned long nr_pages;
int ret;
- nr_pages = kvm_pgtable_stage2_pgd_size(vm->kvm.arch.vtcr) >> PAGE_SHIFT;
+ nr_pages = kvm_pgtable_stage2_pgd_size(mmu->vtcr) >> PAGE_SHIFT;
ret = hyp_pool_init(&vm->pool, hyp_virt_to_pfn(pgd), nr_pages, 0);
if (ret)
return ret;
@@ -255,8 +280,7 @@ int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd)
};
guest_lock_component(vm);
- ret = __kvm_pgtable_stage2_init(mmu->pgt, mmu, &vm->mm_ops, 0,
- guest_stage2_force_pte_cb);
+ ret = __kvm_pgtable_stage2_init(mmu->pgt, mmu, &vm->mm_ops, 0, NULL);
guest_unlock_component(vm);
if (ret)
return ret;
@@ -266,8 +290,9 @@ int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd)
return 0;
}
-void reclaim_guest_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc)
+void reclaim_pgtable_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc)
{
+ struct hyp_page *page;
void *addr;
/* Dump all pgtable pages in the hyp_pool */
@@ -279,7 +304,9 @@ void reclaim_guest_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc)
/* Drain the hyp_pool into the memcache */
addr = hyp_alloc_pages(&vm->pool, 0);
while (addr) {
- memset(hyp_virt_to_page(addr), 0, sizeof(struct hyp_page));
+ page = hyp_virt_to_page(addr);
+ page->refcount = 0;
+ page->order = 0;
push_hyp_memcache(mc, addr, hyp_virt_to_phys);
WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(addr), 1));
addr = hyp_alloc_pages(&vm->pool, 0);
@@ -295,11 +322,18 @@ int __pkvm_prot_finalize(void)
return -EPERM;
params->vttbr = kvm_get_vttbr(mmu);
- params->vtcr = host_mmu.arch.vtcr;
+ params->vtcr = mmu->vtcr;
params->hcr_el2 |= HCR_VM;
+
+ /*
+ * The CMO below not only cleans the updated params to the
+ * PoC, but also provides the DSB that ensures ongoing
+ * page-table walks that have started before we trapped to EL2
+ * have completed.
+ */
kvm_flush_dcache_to_poc(params, sizeof(*params));
- write_sysreg(params->hcr_el2, hcr_el2);
+ write_sysreg_hcr(params->hcr_el2);
__load_stage2(&host_mmu.arch.mmu, &host_mmu.arch);
/*
@@ -333,6 +367,19 @@ static int host_stage2_unmap_dev_all(void)
return kvm_pgtable_stage2_unmap(pgt, addr, BIT(pgt->ia_bits) - addr);
}
+/*
+ * Ensure the PFN range is contained within PA-range.
+ *
+ * This check is also robust to overflows and is therefore a requirement before
+ * using a pfn/nr_pages pair from an untrusted source.
+ */
+static bool pfn_range_is_valid(u64 pfn, u64 nr_pages)
+{
+ u64 limit = BIT(kvm_phys_shift(&host_mmu.arch.mmu) - PAGE_SHIFT);
+
+ return pfn < limit && ((limit - pfn) >= nr_pages);
+}
+
struct kvm_mem_range {
u64 start;
u64 end;
@@ -375,19 +422,28 @@ bool addr_is_memory(phys_addr_t phys)
return !!find_mem_range(phys, &range);
}
-static bool addr_is_allowed_memory(phys_addr_t phys)
+static bool is_in_mem_range(u64 addr, struct kvm_mem_range *range)
+{
+ return range->start <= addr && addr < range->end;
+}
+
+static int check_range_allowed_memory(u64 start, u64 end)
{
struct memblock_region *reg;
struct kvm_mem_range range;
- reg = find_mem_range(phys, &range);
+ /*
+ * Callers can't check the state of a range that overlaps memory and
+ * MMIO regions, so ensure [start, end[ is in the same kvm_mem_range.
+ */
+ reg = find_mem_range(start, &range);
+ if (!is_in_mem_range(end - 1, &range))
+ return -EINVAL;
- return reg && !(reg->flags & MEMBLOCK_NOMAP);
-}
+ if (!reg || reg->flags & MEMBLOCK_NOMAP)
+ return -EPERM;
-static bool is_in_mem_range(u64 addr, struct kvm_mem_range *range)
-{
- return range->start <= addr && addr < range->end;
+ return 0;
}
static bool range_is_memory(u64 start, u64 end)
@@ -436,7 +492,8 @@ static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range)
{
struct kvm_mem_range cur;
kvm_pte_t pte;
- u32 level;
+ u64 granule;
+ s8 level;
int ret;
hyp_assert_lock_held(&host_mmu.lock);
@@ -447,21 +504,27 @@ static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range)
if (kvm_pte_valid(pte))
return -EAGAIN;
- if (pte)
+ if (pte) {
+ WARN_ON(addr_is_memory(addr) &&
+ get_host_state(hyp_phys_to_page(addr)) != PKVM_NOPAGE);
return -EPERM;
+ }
- do {
- u64 granule = kvm_granule_size(level);
+ for (; level <= KVM_PGTABLE_LAST_LEVEL; level++) {
+ if (!kvm_level_supports_block_mapping(level))
+ continue;
+ granule = kvm_granule_size(level);
cur.start = ALIGN_DOWN(addr, granule);
cur.end = cur.start + granule;
- level++;
- } while ((level < KVM_PGTABLE_MAX_LEVELS) &&
- !(kvm_level_supports_block_mapping(level) &&
- range_included(&cur, range)));
+ if (!range_included(&cur, range))
+ continue;
+ *range = cur;
+ return 0;
+ }
- *range = cur;
+ WARN_ON(1);
- return 0;
+ return -EINVAL;
}
int host_stage2_idmap_locked(phys_addr_t addr, u64 size,
@@ -470,10 +533,31 @@ int host_stage2_idmap_locked(phys_addr_t addr, u64 size,
return host_stage2_try(__host_stage2_idmap, addr, addr + size, prot);
}
+static void __host_update_page_state(phys_addr_t addr, u64 size, enum pkvm_page_state state)
+{
+ for_each_hyp_page(page, addr, size)
+ set_host_state(page, state);
+}
+
int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id)
{
- return host_stage2_try(kvm_pgtable_stage2_set_owner, &host_mmu.pgt,
- addr, size, &host_s2_pool, owner_id);
+ int ret;
+
+ if (!range_is_memory(addr, addr + size))
+ return -EPERM;
+
+ ret = host_stage2_try(kvm_pgtable_stage2_set_owner, &host_mmu.pgt,
+ addr, size, &host_s2_pool, owner_id);
+ if (ret)
+ return ret;
+
+ /* Don't forget to update the vmemmap tracking for the host */
+ if (owner_id == PKVM_ID_HOST)
+ __host_update_page_state(addr, size, PKVM_PAGE_OWNED);
+ else
+ __host_update_page_state(addr, size, PKVM_NOPAGE);
+
+ return 0;
}
static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot prot)
@@ -526,49 +610,29 @@ void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt)
int ret = 0;
esr = read_sysreg_el2(SYS_ESR);
- BUG_ON(!__get_fault_info(esr, &fault));
+ if (!__get_fault_info(esr, &fault)) {
+ /*
+ * We've presumably raced with a page-table change which caused
+ * AT to fail, try again.
+ */
+ return;
+ }
+
+
+ /*
+ * Yikes, we couldn't resolve the fault IPA. This should reinject an
+ * abort into the host when we figure out how to do that.
+ */
+ BUG_ON(!(fault.hpfar_el2 & HPFAR_EL2_NS));
+ addr = FIELD_GET(HPFAR_EL2_FIPA, fault.hpfar_el2) << 12;
- addr = (fault.hpfar_el2 & HPFAR_MASK) << 8;
ret = host_stage2_idmap(addr);
BUG_ON(ret && ret != -EAGAIN);
}
-struct pkvm_mem_transition {
- u64 nr_pages;
-
- struct {
- enum pkvm_component_id id;
- /* Address in the initiator's address space */
- u64 addr;
-
- union {
- struct {
- /* Address in the completer's address space */
- u64 completer_addr;
- } host;
- struct {
- u64 completer_addr;
- } hyp;
- };
- } initiator;
-
- struct {
- enum pkvm_component_id id;
- } completer;
-};
-
-struct pkvm_mem_share {
- const struct pkvm_mem_transition tx;
- const enum kvm_pgtable_prot completer_prot;
-};
-
-struct pkvm_mem_donation {
- const struct pkvm_mem_transition tx;
-};
-
struct check_walk_data {
enum pkvm_page_state desired;
- enum pkvm_page_state (*get_page_state)(kvm_pte_t pte);
+ enum pkvm_page_state (*get_page_state)(kvm_pte_t pte, u64 addr);
};
static int __check_page_state_visitor(const struct kvm_pgtable_visit_ctx *ctx,
@@ -576,10 +640,7 @@ static int __check_page_state_visitor(const struct kvm_pgtable_visit_ctx *ctx,
{
struct check_walk_data *d = ctx->arg;
- if (kvm_pte_valid(ctx->old) && !addr_is_allowed_memory(kvm_pte_to_phys(ctx->old)))
- return -EINVAL;
-
- return d->get_page_state(ctx->old) == d->desired ? 0 : -EPERM;
+ return d->get_page_state(ctx->old, ctx->addr) == d->desired ? 0 : -EPERM;
}
static int check_page_state_range(struct kvm_pgtable *pgt, u64 addr, u64 size,
@@ -594,637 +655,742 @@ static int check_page_state_range(struct kvm_pgtable *pgt, u64 addr, u64 size,
return kvm_pgtable_walk(pgt, addr, size, &walker);
}
-static enum pkvm_page_state host_get_page_state(kvm_pte_t pte)
-{
- if (!kvm_pte_valid(pte) && pte)
- return PKVM_NOPAGE;
-
- return pkvm_getstate(kvm_pgtable_stage2_pte_prot(pte));
-}
-
static int __host_check_page_state_range(u64 addr, u64 size,
enum pkvm_page_state state)
{
- struct check_walk_data d = {
- .desired = state,
- .get_page_state = host_get_page_state,
- };
+ int ret;
+
+ ret = check_range_allowed_memory(addr, addr + size);
+ if (ret)
+ return ret;
hyp_assert_lock_held(&host_mmu.lock);
- return check_page_state_range(&host_mmu.pgt, addr, size, &d);
+
+ for_each_hyp_page(page, addr, size) {
+ if (get_host_state(page) != state)
+ return -EPERM;
+ }
+
+ return 0;
}
static int __host_set_page_state_range(u64 addr, u64 size,
enum pkvm_page_state state)
{
- enum kvm_pgtable_prot prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, state);
+ if (get_host_state(hyp_phys_to_page(addr)) == PKVM_NOPAGE) {
+ int ret = host_stage2_idmap_locked(addr, size, PKVM_HOST_MEM_PROT);
- return host_stage2_idmap_locked(addr, size, prot);
-}
+ if (ret)
+ return ret;
+ }
-static int host_request_owned_transition(u64 *completer_addr,
- const struct pkvm_mem_transition *tx)
-{
- u64 size = tx->nr_pages * PAGE_SIZE;
- u64 addr = tx->initiator.addr;
+ __host_update_page_state(addr, size, state);
- *completer_addr = tx->initiator.host.completer_addr;
- return __host_check_page_state_range(addr, size, PKVM_PAGE_OWNED);
+ return 0;
}
-static int host_request_unshare(u64 *completer_addr,
- const struct pkvm_mem_transition *tx)
+static void __hyp_set_page_state_range(phys_addr_t phys, u64 size, enum pkvm_page_state state)
{
- u64 size = tx->nr_pages * PAGE_SIZE;
- u64 addr = tx->initiator.addr;
-
- *completer_addr = tx->initiator.host.completer_addr;
- return __host_check_page_state_range(addr, size, PKVM_PAGE_SHARED_OWNED);
+ for_each_hyp_page(page, phys, size)
+ set_hyp_state(page, state);
}
-static int host_initiate_share(u64 *completer_addr,
- const struct pkvm_mem_transition *tx)
+static int __hyp_check_page_state_range(phys_addr_t phys, u64 size, enum pkvm_page_state state)
{
- u64 size = tx->nr_pages * PAGE_SIZE;
- u64 addr = tx->initiator.addr;
+ for_each_hyp_page(page, phys, size) {
+ if (get_hyp_state(page) != state)
+ return -EPERM;
+ }
- *completer_addr = tx->initiator.host.completer_addr;
- return __host_set_page_state_range(addr, size, PKVM_PAGE_SHARED_OWNED);
+ return 0;
}
-static int host_initiate_unshare(u64 *completer_addr,
- const struct pkvm_mem_transition *tx)
+static enum pkvm_page_state guest_get_page_state(kvm_pte_t pte, u64 addr)
{
- u64 size = tx->nr_pages * PAGE_SIZE;
- u64 addr = tx->initiator.addr;
+ if (!kvm_pte_valid(pte))
+ return PKVM_NOPAGE;
- *completer_addr = tx->initiator.host.completer_addr;
- return __host_set_page_state_range(addr, size, PKVM_PAGE_OWNED);
+ return pkvm_getstate(kvm_pgtable_stage2_pte_prot(pte));
}
-static int host_initiate_donation(u64 *completer_addr,
- const struct pkvm_mem_transition *tx)
+static int __guest_check_page_state_range(struct pkvm_hyp_vm *vm, u64 addr,
+ u64 size, enum pkvm_page_state state)
{
- u8 owner_id = tx->completer.id;
- u64 size = tx->nr_pages * PAGE_SIZE;
-
- *completer_addr = tx->initiator.host.completer_addr;
- return host_stage2_set_owner_locked(tx->initiator.addr, size, owner_id);
-}
+ struct check_walk_data d = {
+ .desired = state,
+ .get_page_state = guest_get_page_state,
+ };
-static bool __host_ack_skip_pgtable_check(const struct pkvm_mem_transition *tx)
-{
- return !(IS_ENABLED(CONFIG_NVHE_EL2_DEBUG) ||
- tx->initiator.id != PKVM_ID_HYP);
+ hyp_assert_lock_held(&vm->lock);
+ return check_page_state_range(&vm->pgt, addr, size, &d);
}
-static int __host_ack_transition(u64 addr, const struct pkvm_mem_transition *tx,
- enum pkvm_page_state state)
+int __pkvm_host_share_hyp(u64 pfn)
{
- u64 size = tx->nr_pages * PAGE_SIZE;
+ u64 phys = hyp_pfn_to_phys(pfn);
+ u64 size = PAGE_SIZE;
+ int ret;
- if (__host_ack_skip_pgtable_check(tx))
- return 0;
+ host_lock_component();
+ hyp_lock_component();
- return __host_check_page_state_range(addr, size, state);
-}
+ ret = __host_check_page_state_range(phys, size, PKVM_PAGE_OWNED);
+ if (ret)
+ goto unlock;
+ ret = __hyp_check_page_state_range(phys, size, PKVM_NOPAGE);
+ if (ret)
+ goto unlock;
-static int host_ack_donation(u64 addr, const struct pkvm_mem_transition *tx)
-{
- return __host_ack_transition(addr, tx, PKVM_NOPAGE);
-}
+ __hyp_set_page_state_range(phys, size, PKVM_PAGE_SHARED_BORROWED);
+ WARN_ON(__host_set_page_state_range(phys, size, PKVM_PAGE_SHARED_OWNED));
-static int host_complete_donation(u64 addr, const struct pkvm_mem_transition *tx)
-{
- u64 size = tx->nr_pages * PAGE_SIZE;
- u8 host_id = tx->completer.id;
+unlock:
+ hyp_unlock_component();
+ host_unlock_component();
- return host_stage2_set_owner_locked(addr, size, host_id);
+ return ret;
}
-static enum pkvm_page_state hyp_get_page_state(kvm_pte_t pte)
+int __pkvm_host_unshare_hyp(u64 pfn)
{
- if (!kvm_pte_valid(pte))
- return PKVM_NOPAGE;
+ u64 phys = hyp_pfn_to_phys(pfn);
+ u64 virt = (u64)__hyp_va(phys);
+ u64 size = PAGE_SIZE;
+ int ret;
- return pkvm_getstate(kvm_pgtable_hyp_pte_prot(pte));
-}
+ host_lock_component();
+ hyp_lock_component();
-static int __hyp_check_page_state_range(u64 addr, u64 size,
- enum pkvm_page_state state)
-{
- struct check_walk_data d = {
- .desired = state,
- .get_page_state = hyp_get_page_state,
- };
+ ret = __host_check_page_state_range(phys, size, PKVM_PAGE_SHARED_OWNED);
+ if (ret)
+ goto unlock;
+ ret = __hyp_check_page_state_range(phys, size, PKVM_PAGE_SHARED_BORROWED);
+ if (ret)
+ goto unlock;
+ if (hyp_page_count((void *)virt)) {
+ ret = -EBUSY;
+ goto unlock;
+ }
- hyp_assert_lock_held(&pkvm_pgd_lock);
- return check_page_state_range(&pkvm_pgtable, addr, size, &d);
-}
+ __hyp_set_page_state_range(phys, size, PKVM_NOPAGE);
+ WARN_ON(__host_set_page_state_range(phys, size, PKVM_PAGE_OWNED));
-static int hyp_request_donation(u64 *completer_addr,
- const struct pkvm_mem_transition *tx)
-{
- u64 size = tx->nr_pages * PAGE_SIZE;
- u64 addr = tx->initiator.addr;
+unlock:
+ hyp_unlock_component();
+ host_unlock_component();
- *completer_addr = tx->initiator.hyp.completer_addr;
- return __hyp_check_page_state_range(addr, size, PKVM_PAGE_OWNED);
+ return ret;
}
-static int hyp_initiate_donation(u64 *completer_addr,
- const struct pkvm_mem_transition *tx)
+int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages)
{
- u64 size = tx->nr_pages * PAGE_SIZE;
+ u64 phys = hyp_pfn_to_phys(pfn);
+ u64 size = PAGE_SIZE * nr_pages;
+ void *virt = __hyp_va(phys);
int ret;
- *completer_addr = tx->initiator.hyp.completer_addr;
- ret = kvm_pgtable_hyp_unmap(&pkvm_pgtable, tx->initiator.addr, size);
- return (ret != size) ? -EFAULT : 0;
-}
+ if (!pfn_range_is_valid(pfn, nr_pages))
+ return -EINVAL;
-static bool __hyp_ack_skip_pgtable_check(const struct pkvm_mem_transition *tx)
-{
- return !(IS_ENABLED(CONFIG_NVHE_EL2_DEBUG) ||
- tx->initiator.id != PKVM_ID_HOST);
-}
+ host_lock_component();
+ hyp_lock_component();
-static int hyp_ack_share(u64 addr, const struct pkvm_mem_transition *tx,
- enum kvm_pgtable_prot perms)
-{
- u64 size = tx->nr_pages * PAGE_SIZE;
+ ret = __host_check_page_state_range(phys, size, PKVM_PAGE_OWNED);
+ if (ret)
+ goto unlock;
+ ret = __hyp_check_page_state_range(phys, size, PKVM_NOPAGE);
+ if (ret)
+ goto unlock;
- if (perms != PAGE_HYP)
- return -EPERM;
+ __hyp_set_page_state_range(phys, size, PKVM_PAGE_OWNED);
+ WARN_ON(pkvm_create_mappings_locked(virt, virt + size, PAGE_HYP));
+ WARN_ON(host_stage2_set_owner_locked(phys, size, PKVM_ID_HYP));
- if (__hyp_ack_skip_pgtable_check(tx))
- return 0;
+unlock:
+ hyp_unlock_component();
+ host_unlock_component();
- return __hyp_check_page_state_range(addr, size, PKVM_NOPAGE);
+ return ret;
}
-static int hyp_ack_unshare(u64 addr, const struct pkvm_mem_transition *tx)
+int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages)
{
- u64 size = tx->nr_pages * PAGE_SIZE;
+ u64 phys = hyp_pfn_to_phys(pfn);
+ u64 size = PAGE_SIZE * nr_pages;
+ u64 virt = (u64)__hyp_va(phys);
+ int ret;
- if (tx->initiator.id == PKVM_ID_HOST && hyp_page_count((void *)addr))
- return -EBUSY;
+ if (!pfn_range_is_valid(pfn, nr_pages))
+ return -EINVAL;
- if (__hyp_ack_skip_pgtable_check(tx))
- return 0;
+ host_lock_component();
+ hyp_lock_component();
- return __hyp_check_page_state_range(addr, size,
- PKVM_PAGE_SHARED_BORROWED);
-}
+ ret = __hyp_check_page_state_range(phys, size, PKVM_PAGE_OWNED);
+ if (ret)
+ goto unlock;
+ ret = __host_check_page_state_range(phys, size, PKVM_NOPAGE);
+ if (ret)
+ goto unlock;
-static int hyp_ack_donation(u64 addr, const struct pkvm_mem_transition *tx)
-{
- u64 size = tx->nr_pages * PAGE_SIZE;
+ __hyp_set_page_state_range(phys, size, PKVM_NOPAGE);
+ WARN_ON(kvm_pgtable_hyp_unmap(&pkvm_pgtable, virt, size) != size);
+ WARN_ON(host_stage2_set_owner_locked(phys, size, PKVM_ID_HOST));
- if (__hyp_ack_skip_pgtable_check(tx))
- return 0;
+unlock:
+ hyp_unlock_component();
+ host_unlock_component();
- return __hyp_check_page_state_range(addr, size, PKVM_NOPAGE);
+ return ret;
}
-static int hyp_complete_share(u64 addr, const struct pkvm_mem_transition *tx,
- enum kvm_pgtable_prot perms)
+int hyp_pin_shared_mem(void *from, void *to)
{
- void *start = (void *)addr, *end = start + (tx->nr_pages * PAGE_SIZE);
- enum kvm_pgtable_prot prot;
+ u64 cur, start = ALIGN_DOWN((u64)from, PAGE_SIZE);
+ u64 end = PAGE_ALIGN((u64)to);
+ u64 phys = __hyp_pa(start);
+ u64 size = end - start;
+ struct hyp_page *p;
+ int ret;
- prot = pkvm_mkstate(perms, PKVM_PAGE_SHARED_BORROWED);
- return pkvm_create_mappings_locked(start, end, prot);
-}
+ host_lock_component();
+ hyp_lock_component();
-static int hyp_complete_unshare(u64 addr, const struct pkvm_mem_transition *tx)
-{
- u64 size = tx->nr_pages * PAGE_SIZE;
- int ret = kvm_pgtable_hyp_unmap(&pkvm_pgtable, addr, size);
+ ret = __host_check_page_state_range(phys, size, PKVM_PAGE_SHARED_OWNED);
+ if (ret)
+ goto unlock;
- return (ret != size) ? -EFAULT : 0;
-}
+ ret = __hyp_check_page_state_range(phys, size, PKVM_PAGE_SHARED_BORROWED);
+ if (ret)
+ goto unlock;
-static int hyp_complete_donation(u64 addr,
- const struct pkvm_mem_transition *tx)
-{
- void *start = (void *)addr, *end = start + (tx->nr_pages * PAGE_SIZE);
- enum kvm_pgtable_prot prot = pkvm_mkstate(PAGE_HYP, PKVM_PAGE_OWNED);
+ for (cur = start; cur < end; cur += PAGE_SIZE) {
+ p = hyp_virt_to_page(cur);
+ hyp_page_ref_inc(p);
+ if (p->refcount == 1)
+ WARN_ON(pkvm_create_mappings_locked((void *)cur,
+ (void *)cur + PAGE_SIZE,
+ PAGE_HYP));
+ }
- return pkvm_create_mappings_locked(start, end, prot);
+unlock:
+ hyp_unlock_component();
+ host_unlock_component();
+
+ return ret;
}
-static int check_share(struct pkvm_mem_share *share)
+void hyp_unpin_shared_mem(void *from, void *to)
{
- const struct pkvm_mem_transition *tx = &share->tx;
- u64 completer_addr;
- int ret;
-
- switch (tx->initiator.id) {
- case PKVM_ID_HOST:
- ret = host_request_owned_transition(&completer_addr, tx);
- break;
- default:
- ret = -EINVAL;
- }
+ u64 cur, start = ALIGN_DOWN((u64)from, PAGE_SIZE);
+ u64 end = PAGE_ALIGN((u64)to);
+ struct hyp_page *p;
- if (ret)
- return ret;
+ host_lock_component();
+ hyp_lock_component();
- switch (tx->completer.id) {
- case PKVM_ID_HYP:
- ret = hyp_ack_share(completer_addr, tx, share->completer_prot);
- break;
- default:
- ret = -EINVAL;
+ for (cur = start; cur < end; cur += PAGE_SIZE) {
+ p = hyp_virt_to_page(cur);
+ if (p->refcount == 1)
+ WARN_ON(kvm_pgtable_hyp_unmap(&pkvm_pgtable, cur, PAGE_SIZE) != PAGE_SIZE);
+ hyp_page_ref_dec(p);
}
- return ret;
+ hyp_unlock_component();
+ host_unlock_component();
}
-static int __do_share(struct pkvm_mem_share *share)
+int __pkvm_host_share_ffa(u64 pfn, u64 nr_pages)
{
- const struct pkvm_mem_transition *tx = &share->tx;
- u64 completer_addr;
+ u64 phys = hyp_pfn_to_phys(pfn);
+ u64 size = PAGE_SIZE * nr_pages;
int ret;
- switch (tx->initiator.id) {
- case PKVM_ID_HOST:
- ret = host_initiate_share(&completer_addr, tx);
- break;
- default:
- ret = -EINVAL;
- }
-
- if (ret)
- return ret;
+ if (!pfn_range_is_valid(pfn, nr_pages))
+ return -EINVAL;
- switch (tx->completer.id) {
- case PKVM_ID_HYP:
- ret = hyp_complete_share(completer_addr, tx, share->completer_prot);
- break;
- default:
- ret = -EINVAL;
- }
+ host_lock_component();
+ ret = __host_check_page_state_range(phys, size, PKVM_PAGE_OWNED);
+ if (!ret)
+ ret = __host_set_page_state_range(phys, size, PKVM_PAGE_SHARED_OWNED);
+ host_unlock_component();
return ret;
}
-/*
- * do_share():
- *
- * The page owner grants access to another component with a given set
- * of permissions.
- *
- * Initiator: OWNED => SHARED_OWNED
- * Completer: NOPAGE => SHARED_BORROWED
- */
-static int do_share(struct pkvm_mem_share *share)
+int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages)
{
+ u64 phys = hyp_pfn_to_phys(pfn);
+ u64 size = PAGE_SIZE * nr_pages;
int ret;
- ret = check_share(share);
- if (ret)
- return ret;
+ if (!pfn_range_is_valid(pfn, nr_pages))
+ return -EINVAL;
- return WARN_ON(__do_share(share));
+ host_lock_component();
+ ret = __host_check_page_state_range(phys, size, PKVM_PAGE_SHARED_OWNED);
+ if (!ret)
+ ret = __host_set_page_state_range(phys, size, PKVM_PAGE_OWNED);
+ host_unlock_component();
+
+ return ret;
}
-static int check_unshare(struct pkvm_mem_share *share)
+static int __guest_check_transition_size(u64 phys, u64 ipa, u64 nr_pages, u64 *size)
{
- const struct pkvm_mem_transition *tx = &share->tx;
- u64 completer_addr;
- int ret;
+ size_t block_size;
- switch (tx->initiator.id) {
- case PKVM_ID_HOST:
- ret = host_request_unshare(&completer_addr, tx);
- break;
- default:
- ret = -EINVAL;
+ if (nr_pages == 1) {
+ *size = PAGE_SIZE;
+ return 0;
}
- if (ret)
- return ret;
+ /* We solely support second to last level huge mapping */
+ block_size = kvm_granule_size(KVM_PGTABLE_LAST_LEVEL - 1);
- switch (tx->completer.id) {
- case PKVM_ID_HYP:
- ret = hyp_ack_unshare(completer_addr, tx);
- break;
- default:
- ret = -EINVAL;
- }
+ if (nr_pages != block_size >> PAGE_SHIFT)
+ return -EINVAL;
- return ret;
+ if (!IS_ALIGNED(phys | ipa, block_size))
+ return -EINVAL;
+
+ *size = block_size;
+ return 0;
}
-static int __do_unshare(struct pkvm_mem_share *share)
+int __pkvm_host_share_guest(u64 pfn, u64 gfn, u64 nr_pages, struct pkvm_hyp_vcpu *vcpu,
+ enum kvm_pgtable_prot prot)
{
- const struct pkvm_mem_transition *tx = &share->tx;
- u64 completer_addr;
+ struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu);
+ u64 phys = hyp_pfn_to_phys(pfn);
+ u64 ipa = hyp_pfn_to_phys(gfn);
+ u64 size;
int ret;
- switch (tx->initiator.id) {
- case PKVM_ID_HOST:
- ret = host_initiate_unshare(&completer_addr, tx);
- break;
- default:
- ret = -EINVAL;
- }
+ if (prot & ~KVM_PGTABLE_PROT_RWX)
+ return -EINVAL;
+
+ if (!pfn_range_is_valid(pfn, nr_pages))
+ return -EINVAL;
+
+ ret = __guest_check_transition_size(phys, ipa, nr_pages, &size);
+ if (ret)
+ return ret;
+ ret = check_range_allowed_memory(phys, phys + size);
if (ret)
return ret;
- switch (tx->completer.id) {
- case PKVM_ID_HYP:
- ret = hyp_complete_unshare(completer_addr, tx);
- break;
- default:
- ret = -EINVAL;
+ host_lock_component();
+ guest_lock_component(vm);
+
+ ret = __guest_check_page_state_range(vm, ipa, size, PKVM_NOPAGE);
+ if (ret)
+ goto unlock;
+
+ for_each_hyp_page(page, phys, size) {
+ switch (get_host_state(page)) {
+ case PKVM_PAGE_OWNED:
+ continue;
+ case PKVM_PAGE_SHARED_OWNED:
+ if (page->host_share_guest_count == U32_MAX) {
+ ret = -EBUSY;
+ goto unlock;
+ }
+
+ /* Only host to np-guest multi-sharing is tolerated */
+ if (page->host_share_guest_count)
+ continue;
+
+ fallthrough;
+ default:
+ ret = -EPERM;
+ goto unlock;
+ }
+ }
+
+ for_each_hyp_page(page, phys, size) {
+ set_host_state(page, PKVM_PAGE_SHARED_OWNED);
+ page->host_share_guest_count++;
}
+ WARN_ON(kvm_pgtable_stage2_map(&vm->pgt, ipa, size, phys,
+ pkvm_mkstate(prot, PKVM_PAGE_SHARED_BORROWED),
+ &vcpu->vcpu.arch.pkvm_memcache, 0));
+
+unlock:
+ guest_unlock_component(vm);
+ host_unlock_component();
+
return ret;
}
-/*
- * do_unshare():
- *
- * The page owner revokes access from another component for a range of
- * pages which were previously shared using do_share().
- *
- * Initiator: SHARED_OWNED => OWNED
- * Completer: SHARED_BORROWED => NOPAGE
- */
-static int do_unshare(struct pkvm_mem_share *share)
+static int __check_host_shared_guest(struct pkvm_hyp_vm *vm, u64 *__phys, u64 ipa, u64 size)
{
+ enum pkvm_page_state state;
+ kvm_pte_t pte;
+ u64 phys;
+ s8 level;
int ret;
- ret = check_unshare(share);
+ ret = kvm_pgtable_get_leaf(&vm->pgt, ipa, &pte, &level);
if (ret)
return ret;
+ if (!kvm_pte_valid(pte))
+ return -ENOENT;
+ if (size && kvm_granule_size(level) != size)
+ return -E2BIG;
- return WARN_ON(__do_unshare(share));
-}
-
-static int check_donation(struct pkvm_mem_donation *donation)
-{
- const struct pkvm_mem_transition *tx = &donation->tx;
- u64 completer_addr;
- int ret;
+ if (!size)
+ size = kvm_granule_size(level);
- switch (tx->initiator.id) {
- case PKVM_ID_HOST:
- ret = host_request_owned_transition(&completer_addr, tx);
- break;
- case PKVM_ID_HYP:
- ret = hyp_request_donation(&completer_addr, tx);
- break;
- default:
- ret = -EINVAL;
- }
+ state = guest_get_page_state(pte, ipa);
+ if (state != PKVM_PAGE_SHARED_BORROWED)
+ return -EPERM;
- if (ret)
+ phys = kvm_pte_to_phys(pte);
+ ret = check_range_allowed_memory(phys, phys + size);
+ if (WARN_ON(ret))
return ret;
- switch (tx->completer.id) {
- case PKVM_ID_HOST:
- ret = host_ack_donation(completer_addr, tx);
- break;
- case PKVM_ID_HYP:
- ret = hyp_ack_donation(completer_addr, tx);
- break;
- default:
- ret = -EINVAL;
+ for_each_hyp_page(page, phys, size) {
+ if (get_host_state(page) != PKVM_PAGE_SHARED_OWNED)
+ return -EPERM;
+ if (WARN_ON(!page->host_share_guest_count))
+ return -EINVAL;
}
- return ret;
+ *__phys = phys;
+
+ return 0;
}
-static int __do_donate(struct pkvm_mem_donation *donation)
+int __pkvm_host_unshare_guest(u64 gfn, u64 nr_pages, struct pkvm_hyp_vm *vm)
{
- const struct pkvm_mem_transition *tx = &donation->tx;
- u64 completer_addr;
+ u64 ipa = hyp_pfn_to_phys(gfn);
+ u64 size, phys;
int ret;
- switch (tx->initiator.id) {
- case PKVM_ID_HOST:
- ret = host_initiate_donation(&completer_addr, tx);
- break;
- case PKVM_ID_HYP:
- ret = hyp_initiate_donation(&completer_addr, tx);
- break;
- default:
- ret = -EINVAL;
- }
-
+ ret = __guest_check_transition_size(0, ipa, nr_pages, &size);
if (ret)
return ret;
- switch (tx->completer.id) {
- case PKVM_ID_HOST:
- ret = host_complete_donation(completer_addr, tx);
- break;
- case PKVM_ID_HYP:
- ret = hyp_complete_donation(completer_addr, tx);
- break;
- default:
- ret = -EINVAL;
- }
-
- return ret;
-}
+ host_lock_component();
+ guest_lock_component(vm);
-/*
- * do_donate():
- *
- * The page owner transfers ownership to another component, losing access
- * as a consequence.
- *
- * Initiator: OWNED => NOPAGE
- * Completer: NOPAGE => OWNED
- */
-static int do_donate(struct pkvm_mem_donation *donation)
-{
- int ret;
+ ret = __check_host_shared_guest(vm, &phys, ipa, size);
+ if (ret)
+ goto unlock;
- ret = check_donation(donation);
+ ret = kvm_pgtable_stage2_unmap(&vm->pgt, ipa, size);
if (ret)
- return ret;
+ goto unlock;
+
+ for_each_hyp_page(page, phys, size) {
+ /* __check_host_shared_guest() protects against underflow */
+ page->host_share_guest_count--;
+ if (!page->host_share_guest_count)
+ set_host_state(page, PKVM_PAGE_OWNED);
+ }
+
+unlock:
+ guest_unlock_component(vm);
+ host_unlock_component();
- return WARN_ON(__do_donate(donation));
+ return ret;
}
-int __pkvm_host_share_hyp(u64 pfn)
+static void assert_host_shared_guest(struct pkvm_hyp_vm *vm, u64 ipa, u64 size)
{
+ u64 phys;
int ret;
- u64 host_addr = hyp_pfn_to_phys(pfn);
- u64 hyp_addr = (u64)__hyp_va(host_addr);
- struct pkvm_mem_share share = {
- .tx = {
- .nr_pages = 1,
- .initiator = {
- .id = PKVM_ID_HOST,
- .addr = host_addr,
- .host = {
- .completer_addr = hyp_addr,
- },
- },
- .completer = {
- .id = PKVM_ID_HYP,
- },
- },
- .completer_prot = PAGE_HYP,
- };
+
+ if (!IS_ENABLED(CONFIG_NVHE_EL2_DEBUG))
+ return;
host_lock_component();
- hyp_lock_component();
+ guest_lock_component(vm);
- ret = do_share(&share);
+ ret = __check_host_shared_guest(vm, &phys, ipa, size);
- hyp_unlock_component();
+ guest_unlock_component(vm);
host_unlock_component();
- return ret;
+ WARN_ON(ret && ret != -ENOENT);
}
-int __pkvm_host_unshare_hyp(u64 pfn)
+int __pkvm_host_relax_perms_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu, enum kvm_pgtable_prot prot)
{
+ struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu);
+ u64 ipa = hyp_pfn_to_phys(gfn);
int ret;
- u64 host_addr = hyp_pfn_to_phys(pfn);
- u64 hyp_addr = (u64)__hyp_va(host_addr);
- struct pkvm_mem_share share = {
- .tx = {
- .nr_pages = 1,
- .initiator = {
- .id = PKVM_ID_HOST,
- .addr = host_addr,
- .host = {
- .completer_addr = hyp_addr,
- },
- },
- .completer = {
- .id = PKVM_ID_HYP,
- },
- },
- .completer_prot = PAGE_HYP,
- };
- host_lock_component();
- hyp_lock_component();
+ if (pkvm_hyp_vm_is_protected(vm))
+ return -EPERM;
- ret = do_unshare(&share);
+ if (prot & ~KVM_PGTABLE_PROT_RWX)
+ return -EINVAL;
- hyp_unlock_component();
- host_unlock_component();
+ assert_host_shared_guest(vm, ipa, 0);
+ guest_lock_component(vm);
+ ret = kvm_pgtable_stage2_relax_perms(&vm->pgt, ipa, prot, 0);
+ guest_unlock_component(vm);
return ret;
}
-int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages)
+int __pkvm_host_wrprotect_guest(u64 gfn, u64 nr_pages, struct pkvm_hyp_vm *vm)
{
+ u64 size, ipa = hyp_pfn_to_phys(gfn);
int ret;
- u64 host_addr = hyp_pfn_to_phys(pfn);
- u64 hyp_addr = (u64)__hyp_va(host_addr);
- struct pkvm_mem_donation donation = {
- .tx = {
- .nr_pages = nr_pages,
- .initiator = {
- .id = PKVM_ID_HOST,
- .addr = host_addr,
- .host = {
- .completer_addr = hyp_addr,
- },
- },
- .completer = {
- .id = PKVM_ID_HYP,
- },
- },
- };
- host_lock_component();
- hyp_lock_component();
+ if (pkvm_hyp_vm_is_protected(vm))
+ return -EPERM;
- ret = do_donate(&donation);
+ ret = __guest_check_transition_size(0, ipa, nr_pages, &size);
+ if (ret)
+ return ret;
- hyp_unlock_component();
- host_unlock_component();
+ assert_host_shared_guest(vm, ipa, size);
+ guest_lock_component(vm);
+ ret = kvm_pgtable_stage2_wrprotect(&vm->pgt, ipa, size);
+ guest_unlock_component(vm);
return ret;
}
-int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages)
+int __pkvm_host_test_clear_young_guest(u64 gfn, u64 nr_pages, bool mkold, struct pkvm_hyp_vm *vm)
{
+ u64 size, ipa = hyp_pfn_to_phys(gfn);
int ret;
- u64 host_addr = hyp_pfn_to_phys(pfn);
- u64 hyp_addr = (u64)__hyp_va(host_addr);
- struct pkvm_mem_donation donation = {
- .tx = {
- .nr_pages = nr_pages,
- .initiator = {
- .id = PKVM_ID_HYP,
- .addr = hyp_addr,
- .hyp = {
- .completer_addr = host_addr,
- },
- },
- .completer = {
- .id = PKVM_ID_HOST,
- },
- },
- };
- host_lock_component();
- hyp_lock_component();
+ if (pkvm_hyp_vm_is_protected(vm))
+ return -EPERM;
- ret = do_donate(&donation);
+ ret = __guest_check_transition_size(0, ipa, nr_pages, &size);
+ if (ret)
+ return ret;
- hyp_unlock_component();
- host_unlock_component();
+ assert_host_shared_guest(vm, ipa, size);
+ guest_lock_component(vm);
+ ret = kvm_pgtable_stage2_test_clear_young(&vm->pgt, ipa, size, mkold);
+ guest_unlock_component(vm);
return ret;
}
-int hyp_pin_shared_mem(void *from, void *to)
+int __pkvm_host_mkyoung_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu)
{
- u64 cur, start = ALIGN_DOWN((u64)from, PAGE_SIZE);
- u64 end = PAGE_ALIGN((u64)to);
- u64 size = end - start;
- int ret;
+ struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu);
+ u64 ipa = hyp_pfn_to_phys(gfn);
- host_lock_component();
- hyp_lock_component();
+ if (pkvm_hyp_vm_is_protected(vm))
+ return -EPERM;
- ret = __host_check_page_state_range(__hyp_pa(start), size,
- PKVM_PAGE_SHARED_OWNED);
- if (ret)
- goto unlock;
+ assert_host_shared_guest(vm, ipa, 0);
+ guest_lock_component(vm);
+ kvm_pgtable_stage2_mkyoung(&vm->pgt, ipa, 0);
+ guest_unlock_component(vm);
- ret = __hyp_check_page_state_range(start, size,
- PKVM_PAGE_SHARED_BORROWED);
- if (ret)
- goto unlock;
+ return 0;
+}
- for (cur = start; cur < end; cur += PAGE_SIZE)
- hyp_page_ref_inc(hyp_virt_to_page(cur));
+#ifdef CONFIG_NVHE_EL2_DEBUG
+struct pkvm_expected_state {
+ enum pkvm_page_state host;
+ enum pkvm_page_state hyp;
+ enum pkvm_page_state guest[2]; /* [ gfn, gfn + 1 ] */
+};
-unlock:
- hyp_unlock_component();
- host_unlock_component();
+static struct pkvm_expected_state selftest_state;
+static struct hyp_page *selftest_page;
- return ret;
+static struct pkvm_hyp_vm selftest_vm = {
+ .kvm = {
+ .arch = {
+ .mmu = {
+ .arch = &selftest_vm.kvm.arch,
+ .pgt = &selftest_vm.pgt,
+ },
+ },
+ },
+};
+
+static struct pkvm_hyp_vcpu selftest_vcpu = {
+ .vcpu = {
+ .arch = {
+ .hw_mmu = &selftest_vm.kvm.arch.mmu,
+ },
+ .kvm = &selftest_vm.kvm,
+ },
+};
+
+static void init_selftest_vm(void *virt)
+{
+ struct hyp_page *p = hyp_virt_to_page(virt);
+ int i;
+
+ selftest_vm.kvm.arch.mmu.vtcr = host_mmu.arch.mmu.vtcr;
+ WARN_ON(kvm_guest_prepare_stage2(&selftest_vm, virt));
+
+ for (i = 0; i < pkvm_selftest_pages(); i++) {
+ if (p[i].refcount)
+ continue;
+ p[i].refcount = 1;
+ hyp_put_page(&selftest_vm.pool, hyp_page_to_virt(&p[i]));
+ }
}
-void hyp_unpin_shared_mem(void *from, void *to)
+static u64 selftest_ipa(void)
{
- u64 cur, start = ALIGN_DOWN((u64)from, PAGE_SIZE);
- u64 end = PAGE_ALIGN((u64)to);
+ return BIT(selftest_vm.pgt.ia_bits - 1);
+}
- host_lock_component();
- hyp_lock_component();
+static void assert_page_state(void)
+{
+ void *virt = hyp_page_to_virt(selftest_page);
+ u64 size = PAGE_SIZE << selftest_page->order;
+ struct pkvm_hyp_vcpu *vcpu = &selftest_vcpu;
+ u64 phys = hyp_virt_to_phys(virt);
+ u64 ipa[2] = { selftest_ipa(), selftest_ipa() + PAGE_SIZE };
+ struct pkvm_hyp_vm *vm;
- for (cur = start; cur < end; cur += PAGE_SIZE)
- hyp_page_ref_dec(hyp_virt_to_page(cur));
+ vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu);
- hyp_unlock_component();
+ host_lock_component();
+ WARN_ON(__host_check_page_state_range(phys, size, selftest_state.host));
host_unlock_component();
-}
+
+ hyp_lock_component();
+ WARN_ON(__hyp_check_page_state_range(phys, size, selftest_state.hyp));
+ hyp_unlock_component();
+
+ guest_lock_component(&selftest_vm);
+ WARN_ON(__guest_check_page_state_range(vm, ipa[0], size, selftest_state.guest[0]));
+ WARN_ON(__guest_check_page_state_range(vm, ipa[1], size, selftest_state.guest[1]));
+ guest_unlock_component(&selftest_vm);
+}
+
+#define assert_transition_res(res, fn, ...) \
+ do { \
+ WARN_ON(fn(__VA_ARGS__) != res); \
+ assert_page_state(); \
+ } while (0)
+
+void pkvm_ownership_selftest(void *base)
+{
+ enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_RWX;
+ void *virt = hyp_alloc_pages(&host_s2_pool, 0);
+ struct pkvm_hyp_vcpu *vcpu = &selftest_vcpu;
+ struct pkvm_hyp_vm *vm = &selftest_vm;
+ u64 phys, size, pfn, gfn;
+
+ WARN_ON(!virt);
+ selftest_page = hyp_virt_to_page(virt);
+ selftest_page->refcount = 0;
+ init_selftest_vm(base);
+
+ size = PAGE_SIZE << selftest_page->order;
+ phys = hyp_virt_to_phys(virt);
+ pfn = hyp_phys_to_pfn(phys);
+ gfn = hyp_phys_to_pfn(selftest_ipa());
+
+ selftest_state.host = PKVM_NOPAGE;
+ selftest_state.hyp = PKVM_PAGE_OWNED;
+ selftest_state.guest[0] = selftest_state.guest[1] = PKVM_NOPAGE;
+ assert_page_state();
+ assert_transition_res(-EPERM, __pkvm_host_donate_hyp, pfn, 1);
+ assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn);
+ assert_transition_res(-EPERM, __pkvm_host_unshare_hyp, pfn);
+ assert_transition_res(-EPERM, __pkvm_host_share_ffa, pfn, 1);
+ assert_transition_res(-EPERM, __pkvm_host_unshare_ffa, pfn, 1);
+ assert_transition_res(-EPERM, hyp_pin_shared_mem, virt, virt + size);
+ assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot);
+ assert_transition_res(-ENOENT, __pkvm_host_unshare_guest, gfn, 1, vm);
+
+ selftest_state.host = PKVM_PAGE_OWNED;
+ selftest_state.hyp = PKVM_NOPAGE;
+ assert_transition_res(0, __pkvm_hyp_donate_host, pfn, 1);
+ assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1);
+ assert_transition_res(-EPERM, __pkvm_host_unshare_hyp, pfn);
+ assert_transition_res(-EPERM, __pkvm_host_unshare_ffa, pfn, 1);
+ assert_transition_res(-ENOENT, __pkvm_host_unshare_guest, gfn, 1, vm);
+ assert_transition_res(-EPERM, hyp_pin_shared_mem, virt, virt + size);
+
+ selftest_state.host = PKVM_PAGE_SHARED_OWNED;
+ selftest_state.hyp = PKVM_PAGE_SHARED_BORROWED;
+ assert_transition_res(0, __pkvm_host_share_hyp, pfn);
+ assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn);
+ assert_transition_res(-EPERM, __pkvm_host_donate_hyp, pfn, 1);
+ assert_transition_res(-EPERM, __pkvm_host_share_ffa, pfn, 1);
+ assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1);
+ assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot);
+ assert_transition_res(-ENOENT, __pkvm_host_unshare_guest, gfn, 1, vm);
+
+ assert_transition_res(0, hyp_pin_shared_mem, virt, virt + size);
+ assert_transition_res(0, hyp_pin_shared_mem, virt, virt + size);
+ hyp_unpin_shared_mem(virt, virt + size);
+ WARN_ON(hyp_page_count(virt) != 1);
+ assert_transition_res(-EBUSY, __pkvm_host_unshare_hyp, pfn);
+ assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn);
+ assert_transition_res(-EPERM, __pkvm_host_donate_hyp, pfn, 1);
+ assert_transition_res(-EPERM, __pkvm_host_share_ffa, pfn, 1);
+ assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1);
+ assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot);
+ assert_transition_res(-ENOENT, __pkvm_host_unshare_guest, gfn, 1, vm);
+
+ hyp_unpin_shared_mem(virt, virt + size);
+ assert_page_state();
+ WARN_ON(hyp_page_count(virt));
+
+ selftest_state.host = PKVM_PAGE_OWNED;
+ selftest_state.hyp = PKVM_NOPAGE;
+ assert_transition_res(0, __pkvm_host_unshare_hyp, pfn);
+
+ selftest_state.host = PKVM_PAGE_SHARED_OWNED;
+ selftest_state.hyp = PKVM_NOPAGE;
+ assert_transition_res(0, __pkvm_host_share_ffa, pfn, 1);
+ assert_transition_res(-EPERM, __pkvm_host_share_ffa, pfn, 1);
+ assert_transition_res(-EPERM, __pkvm_host_donate_hyp, pfn, 1);
+ assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn);
+ assert_transition_res(-EPERM, __pkvm_host_unshare_hyp, pfn);
+ assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1);
+ assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot);
+ assert_transition_res(-ENOENT, __pkvm_host_unshare_guest, gfn, 1, vm);
+ assert_transition_res(-EPERM, hyp_pin_shared_mem, virt, virt + size);
+
+ selftest_state.host = PKVM_PAGE_OWNED;
+ selftest_state.hyp = PKVM_NOPAGE;
+ assert_transition_res(0, __pkvm_host_unshare_ffa, pfn, 1);
+ assert_transition_res(-EPERM, __pkvm_host_unshare_ffa, pfn, 1);
+
+ selftest_state.host = PKVM_PAGE_SHARED_OWNED;
+ selftest_state.guest[0] = PKVM_PAGE_SHARED_BORROWED;
+ assert_transition_res(0, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot);
+ assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot);
+ assert_transition_res(-EPERM, __pkvm_host_share_ffa, pfn, 1);
+ assert_transition_res(-EPERM, __pkvm_host_donate_hyp, pfn, 1);
+ assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn);
+ assert_transition_res(-EPERM, __pkvm_host_unshare_hyp, pfn);
+ assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1);
+ assert_transition_res(-EPERM, hyp_pin_shared_mem, virt, virt + size);
+
+ selftest_state.guest[1] = PKVM_PAGE_SHARED_BORROWED;
+ assert_transition_res(0, __pkvm_host_share_guest, pfn, gfn + 1, 1, vcpu, prot);
+ WARN_ON(hyp_virt_to_page(virt)->host_share_guest_count != 2);
+
+ selftest_state.guest[0] = PKVM_NOPAGE;
+ assert_transition_res(0, __pkvm_host_unshare_guest, gfn, 1, vm);
+
+ selftest_state.guest[1] = PKVM_NOPAGE;
+ selftest_state.host = PKVM_PAGE_OWNED;
+ assert_transition_res(0, __pkvm_host_unshare_guest, gfn + 1, 1, vm);
+
+ selftest_state.host = PKVM_NOPAGE;
+ selftest_state.hyp = PKVM_PAGE_OWNED;
+ assert_transition_res(0, __pkvm_host_donate_hyp, pfn, 1);
+
+ selftest_page->refcount = 1;
+ hyp_put_page(&host_s2_pool, virt);
+}
+#endif
diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c
index 318298eb3d6b..ae8391baebc3 100644
--- a/arch/arm64/kvm/hyp/nvhe/mm.c
+++ b/arch/arm64/kvm/hyp/nvhe/mm.c
@@ -44,6 +44,27 @@ static int __pkvm_create_mappings(unsigned long start, unsigned long size,
return err;
}
+static int __pkvm_alloc_private_va_range(unsigned long start, size_t size)
+{
+ unsigned long cur;
+
+ hyp_assert_lock_held(&pkvm_pgd_lock);
+
+ if (!start || start < __io_map_base)
+ return -EINVAL;
+
+ /* The allocated size is always a multiple of PAGE_SIZE */
+ cur = start + PAGE_ALIGN(size);
+
+ /* Are we overflowing on the vmemmap ? */
+ if (cur > __hyp_vmemmap)
+ return -ENOMEM;
+
+ __io_map_base = cur;
+
+ return 0;
+}
+
/**
* pkvm_alloc_private_va_range - Allocates a private VA range.
* @size: The size of the VA range to reserve.
@@ -56,27 +77,16 @@ static int __pkvm_create_mappings(unsigned long start, unsigned long size,
*/
int pkvm_alloc_private_va_range(size_t size, unsigned long *haddr)
{
- unsigned long base, addr;
- int ret = 0;
+ unsigned long addr;
+ int ret;
hyp_spin_lock(&pkvm_pgd_lock);
-
- /* Align the allocation based on the order of its size */
- addr = ALIGN(__io_map_base, PAGE_SIZE << get_order(size));
-
- /* The allocated size is always a multiple of PAGE_SIZE */
- base = addr + PAGE_ALIGN(size);
-
- /* Are we overflowing on the vmemmap ? */
- if (!addr || base > __hyp_vmemmap)
- ret = -ENOMEM;
- else {
- __io_map_base = base;
- *haddr = addr;
- }
-
+ addr = __io_map_base;
+ ret = __pkvm_alloc_private_va_range(addr, size);
hyp_spin_unlock(&pkvm_pgd_lock);
+ *haddr = addr;
+
return ret;
}
@@ -145,7 +155,7 @@ int hyp_back_vmemmap(phys_addr_t back)
start = hyp_memory[i].base;
start = ALIGN_DOWN((u64)hyp_phys_to_page(start), PAGE_SIZE);
/*
- * The begining of the hyp_vmemmap region for the current
+ * The beginning of the hyp_vmemmap region for the current
* memblock may already be backed by the page backing the end
* the previous region, so avoid mapping it twice.
*/
@@ -219,9 +229,8 @@ int hyp_map_vectors(void)
return 0;
}
-void *hyp_fixmap_map(phys_addr_t phys)
+static void *fixmap_map_slot(struct hyp_fixmap_slot *slot, phys_addr_t phys)
{
- struct hyp_fixmap_slot *slot = this_cpu_ptr(&fixmap_slots);
kvm_pte_t pte, *ptep = slot->ptep;
pte = *ptep;
@@ -233,10 +242,21 @@ void *hyp_fixmap_map(phys_addr_t phys)
return (void *)slot->addr;
}
+void *hyp_fixmap_map(phys_addr_t phys)
+{
+ return fixmap_map_slot(this_cpu_ptr(&fixmap_slots), phys);
+}
+
static void fixmap_clear_slot(struct hyp_fixmap_slot *slot)
{
kvm_pte_t *ptep = slot->ptep;
u64 addr = slot->addr;
+ u32 level;
+
+ if (FIELD_GET(KVM_PTE_TYPE, *ptep) == KVM_PTE_TYPE_PAGE)
+ level = KVM_PGTABLE_LAST_LEVEL;
+ else
+ level = KVM_PGTABLE_LAST_LEVEL - 1; /* create_fixblock() guarantees PMD level */
WRITE_ONCE(*ptep, *ptep & ~KVM_PTE_VALID);
@@ -250,7 +270,7 @@ static void fixmap_clear_slot(struct hyp_fixmap_slot *slot)
* https://lore.kernel.org/kvm/20221017115209.2099-1-will@kernel.org/T/#mf10dfbaf1eaef9274c581b81c53758918c1d0f03
*/
dsb(ishst);
- __tlbi_level(vale2is, __TLBI_VADDR(addr, 0), (KVM_PGTABLE_MAX_LEVELS - 1));
+ __tlbi_level(vale2is, __TLBI_VADDR(addr, 0), level);
dsb(ish);
isb();
}
@@ -263,9 +283,9 @@ void hyp_fixmap_unmap(void)
static int __create_fixmap_slot_cb(const struct kvm_pgtable_visit_ctx *ctx,
enum kvm_pgtable_walk_flags visit)
{
- struct hyp_fixmap_slot *slot = per_cpu_ptr(&fixmap_slots, (u64)ctx->arg);
+ struct hyp_fixmap_slot *slot = (struct hyp_fixmap_slot *)ctx->arg;
- if (!kvm_pte_valid(ctx->old) || ctx->level != KVM_PGTABLE_MAX_LEVELS - 1)
+ if (!kvm_pte_valid(ctx->old) || (ctx->end - ctx->start) != kvm_granule_size(ctx->level))
return -EINVAL;
slot->addr = ctx->addr;
@@ -286,13 +306,84 @@ static int create_fixmap_slot(u64 addr, u64 cpu)
struct kvm_pgtable_walker walker = {
.cb = __create_fixmap_slot_cb,
.flags = KVM_PGTABLE_WALK_LEAF,
- .arg = (void *)cpu,
+ .arg = per_cpu_ptr(&fixmap_slots, cpu),
};
return kvm_pgtable_walk(&pkvm_pgtable, addr, PAGE_SIZE, &walker);
}
-int hyp_create_pcpu_fixmap(void)
+#if PAGE_SHIFT < 16
+#define HAS_FIXBLOCK
+static struct hyp_fixmap_slot hyp_fixblock_slot;
+static DEFINE_HYP_SPINLOCK(hyp_fixblock_lock);
+#endif
+
+static int create_fixblock(void)
+{
+#ifdef HAS_FIXBLOCK
+ struct kvm_pgtable_walker walker = {
+ .cb = __create_fixmap_slot_cb,
+ .flags = KVM_PGTABLE_WALK_LEAF,
+ .arg = &hyp_fixblock_slot,
+ };
+ unsigned long addr;
+ phys_addr_t phys;
+ int ret, i;
+
+ /* Find a RAM phys address, PMD aligned */
+ for (i = 0; i < hyp_memblock_nr; i++) {
+ phys = ALIGN(hyp_memory[i].base, PMD_SIZE);
+ if (phys + PMD_SIZE < (hyp_memory[i].base + hyp_memory[i].size))
+ break;
+ }
+
+ if (i >= hyp_memblock_nr)
+ return -EINVAL;
+
+ hyp_spin_lock(&pkvm_pgd_lock);
+ addr = ALIGN(__io_map_base, PMD_SIZE);
+ ret = __pkvm_alloc_private_va_range(addr, PMD_SIZE);
+ if (ret)
+ goto unlock;
+
+ ret = kvm_pgtable_hyp_map(&pkvm_pgtable, addr, PMD_SIZE, phys, PAGE_HYP);
+ if (ret)
+ goto unlock;
+
+ ret = kvm_pgtable_walk(&pkvm_pgtable, addr, PMD_SIZE, &walker);
+
+unlock:
+ hyp_spin_unlock(&pkvm_pgd_lock);
+
+ return ret;
+#else
+ return 0;
+#endif
+}
+
+void *hyp_fixblock_map(phys_addr_t phys, size_t *size)
+{
+#ifdef HAS_FIXBLOCK
+ *size = PMD_SIZE;
+ hyp_spin_lock(&hyp_fixblock_lock);
+ return fixmap_map_slot(&hyp_fixblock_slot, phys);
+#else
+ *size = PAGE_SIZE;
+ return hyp_fixmap_map(phys);
+#endif
+}
+
+void hyp_fixblock_unmap(void)
+{
+#ifdef HAS_FIXBLOCK
+ fixmap_clear_slot(&hyp_fixblock_slot);
+ hyp_spin_unlock(&hyp_fixblock_lock);
+#else
+ hyp_fixmap_unmap();
+#endif
+}
+
+int hyp_create_fixmap(void)
{
unsigned long addr, i;
int ret;
@@ -312,7 +403,7 @@ int hyp_create_pcpu_fixmap(void)
return ret;
}
- return 0;
+ return create_fixblock();
}
int hyp_create_idmap(u32 hyp_va_bits)
@@ -340,6 +431,45 @@ int hyp_create_idmap(u32 hyp_va_bits)
return __pkvm_create_mappings(start, end - start, start, PAGE_HYP_EXEC);
}
+int pkvm_create_stack(phys_addr_t phys, unsigned long *haddr)
+{
+ unsigned long addr, prev_base;
+ size_t size;
+ int ret;
+
+ hyp_spin_lock(&pkvm_pgd_lock);
+
+ prev_base = __io_map_base;
+ /*
+ * Efficient stack verification using the NVHE_STACK_SHIFT bit implies
+ * an alignment of our allocation on the order of the size.
+ */
+ size = NVHE_STACK_SIZE * 2;
+ addr = ALIGN(__io_map_base, size);
+
+ ret = __pkvm_alloc_private_va_range(addr, size);
+ if (!ret) {
+ /*
+ * Since the stack grows downwards, map the stack to the page
+ * at the higher address and leave the lower guard page
+ * unbacked.
+ *
+ * Any valid stack address now has the NVHE_STACK_SHIFT bit as 1
+ * and addresses corresponding to the guard page have the
+ * NVHE_STACK_SHIFT bit as 0 - this is used for overflow detection.
+ */
+ ret = kvm_pgtable_hyp_map(&pkvm_pgtable, addr + NVHE_STACK_SIZE,
+ NVHE_STACK_SIZE, phys, PAGE_HYP);
+ if (ret)
+ __io_map_base = prev_base;
+ }
+ hyp_spin_unlock(&pkvm_pgd_lock);
+
+ *haddr = addr + size;
+
+ return ret;
+}
+
static void *admit_host_page(void *arg)
{
struct kvm_hyp_memcache *host_mc = arg;
@@ -359,7 +489,7 @@ static void *admit_host_page(void *arg)
return pop_hyp_memcache(host_mc, hyp_phys_to_virt);
}
-/* Refill our local memcache by poping pages from the one provided by the host. */
+/* Refill our local memcache by popping pages from the one provided by the host. */
int refill_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages,
struct kvm_hyp_memcache *host_mc)
{
diff --git a/arch/arm64/kvm/hyp/nvhe/page_alloc.c b/arch/arm64/kvm/hyp/nvhe/page_alloc.c
index 803ba3222e75..a1eb27a1a747 100644
--- a/arch/arm64/kvm/hyp/nvhe/page_alloc.c
+++ b/arch/arm64/kvm/hyp/nvhe/page_alloc.c
@@ -32,7 +32,7 @@ u64 __hyp_vmemmap;
*/
static struct hyp_page *__find_buddy_nocheck(struct hyp_pool *pool,
struct hyp_page *p,
- unsigned short order)
+ u8 order)
{
phys_addr_t addr = hyp_page_to_phys(p);
@@ -51,7 +51,7 @@ static struct hyp_page *__find_buddy_nocheck(struct hyp_pool *pool,
/* Find a buddy page currently available for allocation */
static struct hyp_page *__find_buddy_avail(struct hyp_pool *pool,
struct hyp_page *p,
- unsigned short order)
+ u8 order)
{
struct hyp_page *buddy = __find_buddy_nocheck(pool, p, order);
@@ -94,7 +94,7 @@ static void __hyp_attach_page(struct hyp_pool *pool,
struct hyp_page *p)
{
phys_addr_t phys = hyp_page_to_phys(p);
- unsigned short order = p->order;
+ u8 order = p->order;
struct hyp_page *buddy;
memset(hyp_page_to_virt(p), 0, PAGE_SIZE << p->order);
@@ -110,7 +110,7 @@ static void __hyp_attach_page(struct hyp_pool *pool,
* after coalescing, so make sure to mark it HYP_NO_ORDER proactively.
*/
p->order = HYP_NO_ORDER;
- for (; (order + 1) < pool->max_order; order++) {
+ for (; (order + 1) <= pool->max_order; order++) {
buddy = __find_buddy_avail(pool, p, order);
if (!buddy)
break;
@@ -129,7 +129,7 @@ insert:
static struct hyp_page *__hyp_extract_page(struct hyp_pool *pool,
struct hyp_page *p,
- unsigned short order)
+ u8 order)
{
struct hyp_page *buddy;
@@ -183,7 +183,7 @@ void hyp_get_page(struct hyp_pool *pool, void *addr)
void hyp_split_page(struct hyp_page *p)
{
- unsigned short order = p->order;
+ u8 order = p->order;
unsigned int i;
p->order = 0;
@@ -195,17 +195,17 @@ void hyp_split_page(struct hyp_page *p)
}
}
-void *hyp_alloc_pages(struct hyp_pool *pool, unsigned short order)
+void *hyp_alloc_pages(struct hyp_pool *pool, u8 order)
{
- unsigned short i = order;
struct hyp_page *p;
+ u8 i = order;
hyp_spin_lock(&pool->lock);
/* Look for a high-enough-order page */
- while (i < pool->max_order && list_empty(&pool->free_area[i]))
+ while (i <= pool->max_order && list_empty(&pool->free_area[i]))
i++;
- if (i >= pool->max_order) {
+ if (i > pool->max_order) {
hyp_spin_unlock(&pool->lock);
return NULL;
}
@@ -228,8 +228,9 @@ int hyp_pool_init(struct hyp_pool *pool, u64 pfn, unsigned int nr_pages,
int i;
hyp_spin_lock_init(&pool->lock);
- pool->max_order = min(MAX_ORDER, get_order((nr_pages + 1) << PAGE_SHIFT));
- for (i = 0; i < pool->max_order; i++)
+ pool->max_order = min(MAX_PAGE_ORDER,
+ get_order(nr_pages << PAGE_SHIFT));
+ for (i = 0; i <= pool->max_order; i++)
INIT_LIST_HEAD(&pool->free_area[i]);
pool->range_start = phys;
pool->range_end = phys + (nr_pages << PAGE_SHIFT);
diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c
index a06ece14a6d8..8911338961c5 100644
--- a/arch/arm64/kvm/hyp/nvhe/pkvm.c
+++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c
@@ -6,191 +6,185 @@
#include <linux/kvm_host.h>
#include <linux/mm.h>
-#include <nvhe/fixed_config.h>
+
+#include <asm/kvm_emulate.h>
+
#include <nvhe/mem_protect.h>
#include <nvhe/memory.h>
#include <nvhe/pkvm.h>
#include <nvhe/trap_handler.h>
-/* Used by icache_is_vpipt(). */
+/* Used by icache_is_aliasing(). */
unsigned long __icache_flags;
/* Used by kvm_get_vttbr(). */
unsigned int kvm_arm_vmid_bits;
+unsigned int kvm_host_sve_max_vl;
+
/*
- * Set trap register values based on features in ID_AA64PFR0.
+ * The currently loaded hyp vCPU for each physical CPU. Used in protected mode
+ * for both protected and non-protected VMs.
*/
-static void pvm_init_traps_aa64pfr0(struct kvm_vcpu *vcpu)
+static DEFINE_PER_CPU(struct pkvm_hyp_vcpu *, loaded_hyp_vcpu);
+
+static void pkvm_vcpu_reset_hcr(struct kvm_vcpu *vcpu)
{
- const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64PFR0_EL1);
- u64 hcr_set = HCR_RW;
- u64 hcr_clear = 0;
- u64 cptr_set = 0;
+ vcpu->arch.hcr_el2 = HCR_GUEST_FLAGS;
- /* Protected KVM does not support AArch32 guests. */
- BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL0),
- PVM_ID_AA64PFR0_RESTRICT_UNSIGNED) != ID_AA64PFR0_EL1_ELx_64BIT_ONLY);
- BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL1),
- PVM_ID_AA64PFR0_RESTRICT_UNSIGNED) != ID_AA64PFR0_EL1_ELx_64BIT_ONLY);
+ if (has_hvhe())
+ vcpu->arch.hcr_el2 |= HCR_E2H;
- /*
- * Linux guests assume support for floating-point and Advanced SIMD. Do
- * not change the trapping behavior for these from the KVM default.
- */
- BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_FP),
- PVM_ID_AA64PFR0_ALLOW));
- BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AdvSIMD),
- PVM_ID_AA64PFR0_ALLOW));
-
- /* Trap RAS unless all current versions are supported */
- if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_RAS), feature_ids) <
- ID_AA64PFR0_EL1_RAS_V1P1) {
- hcr_set |= HCR_TERR | HCR_TEA;
- hcr_clear |= HCR_FIEN;
+ if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) {
+ /* route synchronous external abort exceptions to EL2 */
+ vcpu->arch.hcr_el2 |= HCR_TEA;
+ /* trap error record accesses */
+ vcpu->arch.hcr_el2 |= HCR_TERR;
}
- /* Trap AMU */
- if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AMU), feature_ids)) {
- hcr_clear |= HCR_AMVOFFEN;
- cptr_set |= CPTR_EL2_TAM;
- }
+ if (cpus_have_final_cap(ARM64_HAS_STAGE2_FWB))
+ vcpu->arch.hcr_el2 |= HCR_FWB;
+
+ if (cpus_have_final_cap(ARM64_HAS_EVT) &&
+ !cpus_have_final_cap(ARM64_MISMATCHED_CACHE_TYPE) &&
+ kvm_read_vm_id_reg(vcpu->kvm, SYS_CTR_EL0) == read_cpuid(CTR_EL0))
+ vcpu->arch.hcr_el2 |= HCR_TID4;
+ else
+ vcpu->arch.hcr_el2 |= HCR_TID2;
- /* Trap SVE */
- if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE), feature_ids))
- cptr_set |= CPTR_EL2_TZ;
+ if (vcpu_has_ptrauth(vcpu))
+ vcpu->arch.hcr_el2 |= (HCR_API | HCR_APK);
- vcpu->arch.hcr_el2 |= hcr_set;
- vcpu->arch.hcr_el2 &= ~hcr_clear;
- vcpu->arch.cptr_el2 |= cptr_set;
+ if (kvm_has_mte(vcpu->kvm))
+ vcpu->arch.hcr_el2 |= HCR_ATA;
}
-/*
- * Set trap register values based on features in ID_AA64PFR1.
- */
-static void pvm_init_traps_aa64pfr1(struct kvm_vcpu *vcpu)
+static void pvm_init_traps_hcr(struct kvm_vcpu *vcpu)
{
- const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64PFR1_EL1);
- u64 hcr_set = 0;
- u64 hcr_clear = 0;
+ struct kvm *kvm = vcpu->kvm;
+ u64 val = vcpu->arch.hcr_el2;
- /* Memory Tagging: Trap and Treat as Untagged if not supported. */
- if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE), feature_ids)) {
- hcr_set |= HCR_TID5;
- hcr_clear |= HCR_DCT | HCR_ATA;
- }
+ /* No support for AArch32. */
+ val |= HCR_RW;
- vcpu->arch.hcr_el2 |= hcr_set;
- vcpu->arch.hcr_el2 &= ~hcr_clear;
-}
+ /*
+ * Always trap:
+ * - Feature id registers: to control features exposed to guests
+ * - Implementation-defined features
+ */
+ val |= HCR_TACR | HCR_TIDCP | HCR_TID3 | HCR_TID1;
-/*
- * Set trap register values based on features in ID_AA64DFR0.
- */
-static void pvm_init_traps_aa64dfr0(struct kvm_vcpu *vcpu)
-{
- const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64DFR0_EL1);
- u64 mdcr_set = 0;
- u64 mdcr_clear = 0;
- u64 cptr_set = 0;
-
- /* Trap/constrain PMU */
- if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), feature_ids)) {
- mdcr_set |= MDCR_EL2_TPM | MDCR_EL2_TPMCR;
- mdcr_clear |= MDCR_EL2_HPME | MDCR_EL2_MTPME |
- MDCR_EL2_HPMN_MASK;
+ if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, RAS, IMP)) {
+ val |= HCR_TERR | HCR_TEA;
+ val &= ~(HCR_FIEN);
}
- /* Trap Debug */
- if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DebugVer), feature_ids))
- mdcr_set |= MDCR_EL2_TDRA | MDCR_EL2_TDA | MDCR_EL2_TDE;
-
- /* Trap OS Double Lock */
- if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DoubleLock), feature_ids))
- mdcr_set |= MDCR_EL2_TDOSA;
+ if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, AMU, IMP))
+ val &= ~(HCR_AMVOFFEN);
- /* Trap SPE */
- if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMSVer), feature_ids)) {
- mdcr_set |= MDCR_EL2_TPMS;
- mdcr_clear |= MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT;
+ if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, MTE, IMP)) {
+ val |= HCR_TID5;
+ val &= ~(HCR_DCT | HCR_ATA);
}
- /* Trap Trace Filter */
- if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_TraceFilt), feature_ids))
- mdcr_set |= MDCR_EL2_TTRF;
+ if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, LO, IMP))
+ val |= HCR_TLOR;
- /* Trap Trace */
- if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_TraceVer), feature_ids))
- cptr_set |= CPTR_EL2_TTA;
-
- vcpu->arch.mdcr_el2 |= mdcr_set;
- vcpu->arch.mdcr_el2 &= ~mdcr_clear;
- vcpu->arch.cptr_el2 |= cptr_set;
+ vcpu->arch.hcr_el2 = val;
}
-/*
- * Set trap register values based on features in ID_AA64MMFR0.
- */
-static void pvm_init_traps_aa64mmfr0(struct kvm_vcpu *vcpu)
+static void pvm_init_traps_mdcr(struct kvm_vcpu *vcpu)
{
- const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64MMFR0_EL1);
- u64 mdcr_set = 0;
+ struct kvm *kvm = vcpu->kvm;
+ u64 val = vcpu->arch.mdcr_el2;
- /* Trap Debug Communications Channel registers */
- if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_FGT), feature_ids))
- mdcr_set |= MDCR_EL2_TDCC;
+ if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMUVer, IMP)) {
+ val |= MDCR_EL2_TPM | MDCR_EL2_TPMCR;
+ val &= ~(MDCR_EL2_HPME | MDCR_EL2_MTPME | MDCR_EL2_HPMN_MASK);
+ }
- vcpu->arch.mdcr_el2 |= mdcr_set;
-}
+ if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, DebugVer, IMP))
+ val |= MDCR_EL2_TDRA | MDCR_EL2_TDA;
-/*
- * Set trap register values based on features in ID_AA64MMFR1.
- */
-static void pvm_init_traps_aa64mmfr1(struct kvm_vcpu *vcpu)
-{
- const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64MMFR1_EL1);
- u64 hcr_set = 0;
+ if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, DoubleLock, IMP))
+ val |= MDCR_EL2_TDOSA;
+
+ if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMSVer, IMP)) {
+ val |= MDCR_EL2_TPMS;
+ val &= ~MDCR_EL2_E2PB_MASK;
+ }
+
+ if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceFilt, IMP))
+ val |= MDCR_EL2_TTRF;
- /* Trap LOR */
- if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_LO), feature_ids))
- hcr_set |= HCR_TLOR;
+ if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, ExtTrcBuff, IMP))
+ val |= MDCR_EL2_E2TB_MASK;
+
+ /* Trap Debug Communications Channel registers */
+ if (!kvm_has_feat(kvm, ID_AA64MMFR0_EL1, FGT, IMP))
+ val |= MDCR_EL2_TDCC;
- vcpu->arch.hcr_el2 |= hcr_set;
+ vcpu->arch.mdcr_el2 = val;
}
/*
- * Set baseline trap register values.
+ * Check that cpu features that are neither trapped nor supported are not
+ * enabled for protected VMs.
*/
-static void pvm_init_trap_regs(struct kvm_vcpu *vcpu)
+static int pkvm_check_pvm_cpu_features(struct kvm_vcpu *vcpu)
{
- const u64 hcr_trap_feat_regs = HCR_TID3;
- const u64 hcr_trap_impdef = HCR_TACR | HCR_TIDCP | HCR_TID1;
+ struct kvm *kvm = vcpu->kvm;
+
+ /* No AArch32 support for protected guests. */
+ if (kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL0, AARCH32) ||
+ kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL1, AARCH32))
+ return -EINVAL;
/*
- * Always trap:
- * - Feature id registers: to control features exposed to guests
- * - Implementation-defined features
+ * Linux guests assume support for floating-point and Advanced SIMD. Do
+ * not change the trapping behavior for these from the KVM default.
*/
- vcpu->arch.hcr_el2 |= hcr_trap_feat_regs | hcr_trap_impdef;
+ if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, FP, IMP) ||
+ !kvm_has_feat(kvm, ID_AA64PFR0_EL1, AdvSIMD, IMP))
+ return -EINVAL;
- /* Clear res0 and set res1 bits to trap potential new features. */
- vcpu->arch.hcr_el2 &= ~(HCR_RES0);
- vcpu->arch.mdcr_el2 &= ~(MDCR_EL2_RES0);
- vcpu->arch.cptr_el2 |= CPTR_NVHE_EL2_RES1;
- vcpu->arch.cptr_el2 &= ~(CPTR_NVHE_EL2_RES0);
+ /* No SME support in KVM right now. Check to catch if it changes. */
+ if (kvm_has_feat(kvm, ID_AA64PFR1_EL1, SME, IMP))
+ return -EINVAL;
+
+ return 0;
}
/*
- * Initialize trap register values for protected VMs.
+ * Initialize trap register values in protected mode.
*/
-void __pkvm_vcpu_init_traps(struct kvm_vcpu *vcpu)
+static int pkvm_vcpu_init_traps(struct pkvm_hyp_vcpu *hyp_vcpu)
{
- pvm_init_trap_regs(vcpu);
- pvm_init_traps_aa64pfr0(vcpu);
- pvm_init_traps_aa64pfr1(vcpu);
- pvm_init_traps_aa64dfr0(vcpu);
- pvm_init_traps_aa64mmfr0(vcpu);
- pvm_init_traps_aa64mmfr1(vcpu);
+ struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu;
+ int ret;
+
+ vcpu->arch.mdcr_el2 = 0;
+
+ pkvm_vcpu_reset_hcr(vcpu);
+
+ if ((!pkvm_hyp_vcpu_is_protected(hyp_vcpu))) {
+ struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu;
+
+ /* Trust the host for non-protected vcpu features. */
+ vcpu->arch.hcrx_el2 = host_vcpu->arch.hcrx_el2;
+ memcpy(vcpu->arch.fgt, host_vcpu->arch.fgt, sizeof(vcpu->arch.fgt));
+ return 0;
+ }
+
+ ret = pkvm_check_pvm_cpu_features(vcpu);
+ if (ret)
+ return ret;
+
+ pvm_init_traps_hcr(vcpu);
+ pvm_init_traps_mdcr(vcpu);
+ vcpu_set_hcrx(vcpu);
+
+ return 0;
}
/*
@@ -199,6 +193,11 @@ void __pkvm_vcpu_init_traps(struct kvm_vcpu *vcpu)
*/
#define HANDLE_OFFSET 0x1000
+/*
+ * Marks a reserved but not yet used entry in the VM table.
+ */
+#define RESERVED_ENTRY ((void *)0xa110ca7ed)
+
static unsigned int vm_handle_to_idx(pkvm_handle_t handle)
{
return handle - HANDLE_OFFSET;
@@ -211,14 +210,14 @@ static pkvm_handle_t idx_to_vm_handle(unsigned int idx)
/*
* Spinlock for protecting state related to the VM table. Protects writes
- * to 'vm_table' and 'nr_table_entries' as well as reads and writes to
- * 'last_hyp_vcpu_lookup'.
+ * to 'vm_table', 'nr_table_entries', and other per-vm state on initialization.
+ * Also protects reads and writes to 'last_hyp_vcpu_lookup'.
*/
-static DEFINE_HYP_SPINLOCK(vm_table_lock);
+DEFINE_HYP_SPINLOCK(vm_table_lock);
/*
- * The table of VM entries for protected VMs in hyp.
- * Allocated at hyp initialization and setup.
+ * A table that tracks all VMs in protected mode.
+ * Allocated during hyp initialization and setup.
*/
static struct pkvm_hyp_vm **vm_table;
@@ -238,6 +237,10 @@ static struct pkvm_hyp_vm *get_vm_by_handle(pkvm_handle_t handle)
if (unlikely(idx >= KVM_MAX_PVMS))
return NULL;
+ /* A reserved entry doesn't represent an initialized VM. */
+ if (unlikely(vm_table[idx] == RESERVED_ENTRY))
+ return NULL;
+
return vm_table[idx];
}
@@ -247,15 +250,32 @@ struct pkvm_hyp_vcpu *pkvm_load_hyp_vcpu(pkvm_handle_t handle,
struct pkvm_hyp_vcpu *hyp_vcpu = NULL;
struct pkvm_hyp_vm *hyp_vm;
+ /* Cannot load a new vcpu without putting the old one first. */
+ if (__this_cpu_read(loaded_hyp_vcpu))
+ return NULL;
+
hyp_spin_lock(&vm_table_lock);
hyp_vm = get_vm_by_handle(handle);
- if (!hyp_vm || hyp_vm->nr_vcpus <= vcpu_idx)
+ if (!hyp_vm || hyp_vm->kvm.created_vcpus <= vcpu_idx)
goto unlock;
hyp_vcpu = hyp_vm->vcpus[vcpu_idx];
+ if (!hyp_vcpu)
+ goto unlock;
+
+ /* Ensure vcpu isn't loaded on more than one cpu simultaneously. */
+ if (unlikely(hyp_vcpu->loaded_hyp_vcpu)) {
+ hyp_vcpu = NULL;
+ goto unlock;
+ }
+
+ hyp_vcpu->loaded_hyp_vcpu = this_cpu_ptr(&loaded_hyp_vcpu);
hyp_page_ref_inc(hyp_virt_to_page(hyp_vm));
unlock:
hyp_spin_unlock(&vm_table_lock);
+
+ if (hyp_vcpu)
+ __this_cpu_write(loaded_hyp_vcpu, hyp_vcpu);
return hyp_vcpu;
}
@@ -264,63 +284,228 @@ void pkvm_put_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu)
struct pkvm_hyp_vm *hyp_vm = pkvm_hyp_vcpu_to_hyp_vm(hyp_vcpu);
hyp_spin_lock(&vm_table_lock);
+ hyp_vcpu->loaded_hyp_vcpu = NULL;
+ __this_cpu_write(loaded_hyp_vcpu, NULL);
+ hyp_page_ref_dec(hyp_virt_to_page(hyp_vm));
+ hyp_spin_unlock(&vm_table_lock);
+}
+
+struct pkvm_hyp_vcpu *pkvm_get_loaded_hyp_vcpu(void)
+{
+ return __this_cpu_read(loaded_hyp_vcpu);
+
+}
+
+struct pkvm_hyp_vm *get_pkvm_hyp_vm(pkvm_handle_t handle)
+{
+ struct pkvm_hyp_vm *hyp_vm;
+
+ hyp_spin_lock(&vm_table_lock);
+ hyp_vm = get_vm_by_handle(handle);
+ if (hyp_vm)
+ hyp_page_ref_inc(hyp_virt_to_page(hyp_vm));
+ hyp_spin_unlock(&vm_table_lock);
+
+ return hyp_vm;
+}
+
+void put_pkvm_hyp_vm(struct pkvm_hyp_vm *hyp_vm)
+{
+ hyp_spin_lock(&vm_table_lock);
hyp_page_ref_dec(hyp_virt_to_page(hyp_vm));
hyp_spin_unlock(&vm_table_lock);
}
+struct pkvm_hyp_vm *get_np_pkvm_hyp_vm(pkvm_handle_t handle)
+{
+ struct pkvm_hyp_vm *hyp_vm = get_pkvm_hyp_vm(handle);
+
+ if (hyp_vm && pkvm_hyp_vm_is_protected(hyp_vm)) {
+ put_pkvm_hyp_vm(hyp_vm);
+ hyp_vm = NULL;
+ }
+
+ return hyp_vm;
+}
+
+static void pkvm_init_features_from_host(struct pkvm_hyp_vm *hyp_vm, const struct kvm *host_kvm)
+{
+ struct kvm *kvm = &hyp_vm->kvm;
+ unsigned long host_arch_flags = READ_ONCE(host_kvm->arch.flags);
+ DECLARE_BITMAP(allowed_features, KVM_VCPU_MAX_FEATURES);
+
+ /* CTR_EL0 is always under host control, even for protected VMs. */
+ hyp_vm->kvm.arch.ctr_el0 = host_kvm->arch.ctr_el0;
+
+ /* Preserve the vgic model so that GICv3 emulation works */
+ hyp_vm->kvm.arch.vgic.vgic_model = host_kvm->arch.vgic.vgic_model;
+
+ if (test_bit(KVM_ARCH_FLAG_MTE_ENABLED, &host_kvm->arch.flags))
+ set_bit(KVM_ARCH_FLAG_MTE_ENABLED, &kvm->arch.flags);
+
+ /* No restrictions for non-protected VMs. */
+ if (!kvm_vm_is_protected(kvm)) {
+ hyp_vm->kvm.arch.flags = host_arch_flags;
+
+ bitmap_copy(kvm->arch.vcpu_features,
+ host_kvm->arch.vcpu_features,
+ KVM_VCPU_MAX_FEATURES);
+
+ if (test_bit(KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS, &host_arch_flags))
+ hyp_vm->kvm.arch.midr_el1 = host_kvm->arch.midr_el1;
+
+ return;
+ }
+
+ bitmap_zero(allowed_features, KVM_VCPU_MAX_FEATURES);
+
+ set_bit(KVM_ARM_VCPU_PSCI_0_2, allowed_features);
+
+ if (kvm_pvm_ext_allowed(KVM_CAP_ARM_PMU_V3))
+ set_bit(KVM_ARM_VCPU_PMU_V3, allowed_features);
+
+ if (kvm_pvm_ext_allowed(KVM_CAP_ARM_PTRAUTH_ADDRESS))
+ set_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, allowed_features);
+
+ if (kvm_pvm_ext_allowed(KVM_CAP_ARM_PTRAUTH_GENERIC))
+ set_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, allowed_features);
+
+ if (kvm_pvm_ext_allowed(KVM_CAP_ARM_SVE)) {
+ set_bit(KVM_ARM_VCPU_SVE, allowed_features);
+ kvm->arch.flags |= host_arch_flags & BIT(KVM_ARCH_FLAG_GUEST_HAS_SVE);
+ }
+
+ bitmap_and(kvm->arch.vcpu_features, host_kvm->arch.vcpu_features,
+ allowed_features, KVM_VCPU_MAX_FEATURES);
+}
+
static void unpin_host_vcpu(struct kvm_vcpu *host_vcpu)
{
if (host_vcpu)
hyp_unpin_shared_mem(host_vcpu, host_vcpu + 1);
}
+static void unpin_host_sve_state(struct pkvm_hyp_vcpu *hyp_vcpu)
+{
+ void *sve_state;
+
+ if (!vcpu_has_feature(&hyp_vcpu->vcpu, KVM_ARM_VCPU_SVE))
+ return;
+
+ sve_state = kern_hyp_va(hyp_vcpu->vcpu.arch.sve_state);
+ hyp_unpin_shared_mem(sve_state,
+ sve_state + vcpu_sve_state_size(&hyp_vcpu->vcpu));
+}
+
static void unpin_host_vcpus(struct pkvm_hyp_vcpu *hyp_vcpus[],
unsigned int nr_vcpus)
{
int i;
- for (i = 0; i < nr_vcpus; i++)
- unpin_host_vcpu(hyp_vcpus[i]->host_vcpu);
+ for (i = 0; i < nr_vcpus; i++) {
+ struct pkvm_hyp_vcpu *hyp_vcpu = hyp_vcpus[i];
+
+ if (!hyp_vcpu)
+ continue;
+
+ unpin_host_vcpu(hyp_vcpu->host_vcpu);
+ unpin_host_sve_state(hyp_vcpu);
+ }
}
static void init_pkvm_hyp_vm(struct kvm *host_kvm, struct pkvm_hyp_vm *hyp_vm,
- unsigned int nr_vcpus)
+ unsigned int nr_vcpus, pkvm_handle_t handle)
{
+ struct kvm_s2_mmu *mmu = &hyp_vm->kvm.arch.mmu;
+ int idx = vm_handle_to_idx(handle);
+
+ hyp_vm->kvm.arch.pkvm.handle = handle;
+
hyp_vm->host_kvm = host_kvm;
hyp_vm->kvm.created_vcpus = nr_vcpus;
- hyp_vm->kvm.arch.vtcr = host_mmu.arch.vtcr;
+ hyp_vm->kvm.arch.pkvm.is_protected = READ_ONCE(host_kvm->arch.pkvm.is_protected);
+ hyp_vm->kvm.arch.pkvm.is_created = true;
+ hyp_vm->kvm.arch.flags = 0;
+ pkvm_init_features_from_host(hyp_vm, host_kvm);
+
+ /* VMID 0 is reserved for the host */
+ atomic64_set(&mmu->vmid.id, idx + 1);
+
+ mmu->vtcr = host_mmu.arch.mmu.vtcr;
+ mmu->arch = &hyp_vm->kvm.arch;
+ mmu->pgt = &hyp_vm->pgt;
+}
+
+static int pkvm_vcpu_init_sve(struct pkvm_hyp_vcpu *hyp_vcpu, struct kvm_vcpu *host_vcpu)
+{
+ struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu;
+ unsigned int sve_max_vl;
+ size_t sve_state_size;
+ void *sve_state;
+ int ret = 0;
+
+ if (!vcpu_has_feature(vcpu, KVM_ARM_VCPU_SVE)) {
+ vcpu_clear_flag(vcpu, VCPU_SVE_FINALIZED);
+ return 0;
+ }
+
+ /* Limit guest vector length to the maximum supported by the host. */
+ sve_max_vl = min(READ_ONCE(host_vcpu->arch.sve_max_vl), kvm_host_sve_max_vl);
+ sve_state_size = sve_state_size_from_vl(sve_max_vl);
+ sve_state = kern_hyp_va(READ_ONCE(host_vcpu->arch.sve_state));
+
+ if (!sve_state || !sve_state_size) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ ret = hyp_pin_shared_mem(sve_state, sve_state + sve_state_size);
+ if (ret)
+ goto err;
+
+ vcpu->arch.sve_state = sve_state;
+ vcpu->arch.sve_max_vl = sve_max_vl;
+
+ return 0;
+err:
+ clear_bit(KVM_ARM_VCPU_SVE, vcpu->kvm->arch.vcpu_features);
+ return ret;
}
static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu,
struct pkvm_hyp_vm *hyp_vm,
- struct kvm_vcpu *host_vcpu,
- unsigned int vcpu_idx)
+ struct kvm_vcpu *host_vcpu)
{
int ret = 0;
if (hyp_pin_shared_mem(host_vcpu, host_vcpu + 1))
return -EBUSY;
- if (host_vcpu->vcpu_idx != vcpu_idx) {
- ret = -EINVAL;
- goto done;
- }
-
hyp_vcpu->host_vcpu = host_vcpu;
hyp_vcpu->vcpu.kvm = &hyp_vm->kvm;
hyp_vcpu->vcpu.vcpu_id = READ_ONCE(host_vcpu->vcpu_id);
- hyp_vcpu->vcpu.vcpu_idx = vcpu_idx;
+ hyp_vcpu->vcpu.vcpu_idx = READ_ONCE(host_vcpu->vcpu_idx);
hyp_vcpu->vcpu.arch.hw_mmu = &hyp_vm->kvm.arch.mmu;
hyp_vcpu->vcpu.arch.cflags = READ_ONCE(host_vcpu->arch.cflags);
+ hyp_vcpu->vcpu.arch.mp_state.mp_state = KVM_MP_STATE_STOPPED;
+
+ if (pkvm_hyp_vcpu_is_protected(hyp_vcpu))
+ kvm_init_pvm_id_regs(&hyp_vcpu->vcpu);
+
+ ret = pkvm_vcpu_init_traps(hyp_vcpu);
+ if (ret)
+ goto done;
+
+ ret = pkvm_vcpu_init_sve(hyp_vcpu, host_vcpu);
done:
if (ret)
unpin_host_vcpu(host_vcpu);
return ret;
}
-static int find_free_vm_table_entry(struct kvm *host_kvm)
+static int find_free_vm_table_entry(void)
{
int i;
@@ -333,15 +518,13 @@ static int find_free_vm_table_entry(struct kvm *host_kvm)
}
/*
- * Allocate a VM table entry and insert a pointer to the new vm.
+ * Reserve a VM table entry.
*
- * Return a unique handle to the protected VM on success,
+ * Return a unique handle to the VM on success,
* negative error code on failure.
*/
-static pkvm_handle_t insert_vm_table_entry(struct kvm *host_kvm,
- struct pkvm_hyp_vm *hyp_vm)
+static int allocate_vm_table_entry(void)
{
- struct kvm_s2_mmu *mmu = &hyp_vm->kvm.arch.mmu;
int idx;
hyp_assert_lock_held(&vm_table_lock);
@@ -354,20 +537,57 @@ static pkvm_handle_t insert_vm_table_entry(struct kvm *host_kvm,
if (unlikely(!vm_table))
return -EINVAL;
- idx = find_free_vm_table_entry(host_kvm);
- if (idx < 0)
+ idx = find_free_vm_table_entry();
+ if (unlikely(idx < 0))
return idx;
- hyp_vm->kvm.arch.pkvm.handle = idx_to_vm_handle(idx);
+ vm_table[idx] = RESERVED_ENTRY;
- /* VMID 0 is reserved for the host */
- atomic64_set(&mmu->vmid.id, idx + 1);
+ return idx;
+}
- mmu->arch = &hyp_vm->kvm.arch;
- mmu->pgt = &hyp_vm->pgt;
+static int __insert_vm_table_entry(pkvm_handle_t handle,
+ struct pkvm_hyp_vm *hyp_vm)
+{
+ unsigned int idx;
+
+ hyp_assert_lock_held(&vm_table_lock);
+
+ /*
+ * Initializing protected state might have failed, yet a malicious
+ * host could trigger this function. Thus, ensure that 'vm_table'
+ * exists.
+ */
+ if (unlikely(!vm_table))
+ return -EINVAL;
+
+ idx = vm_handle_to_idx(handle);
+ if (unlikely(idx >= KVM_MAX_PVMS))
+ return -EINVAL;
+
+ if (unlikely(vm_table[idx] != RESERVED_ENTRY))
+ return -EINVAL;
vm_table[idx] = hyp_vm;
- return hyp_vm->kvm.arch.pkvm.handle;
+
+ return 0;
+}
+
+/*
+ * Insert a pointer to the initialized VM into the VM table.
+ *
+ * Return 0 on success, or negative error code on failure.
+ */
+static int insert_vm_table_entry(pkvm_handle_t handle,
+ struct pkvm_hyp_vm *hyp_vm)
+{
+ int ret;
+
+ hyp_spin_lock(&vm_table_lock);
+ ret = __insert_vm_table_entry(handle, hyp_vm);
+ hyp_spin_unlock(&vm_table_lock);
+
+ return ret;
}
/*
@@ -411,6 +631,7 @@ static void *map_donated_memory(unsigned long host_va, size_t size)
static void __unmap_donated_memory(void *va, size_t size)
{
+ kvm_flush_dcache_to_poc(va, size);
WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(va),
PAGE_ALIGN(size) >> PAGE_SHIFT));
}
@@ -433,10 +654,45 @@ static void unmap_donated_memory_noclear(void *va, size_t size)
}
/*
- * Initialize the hypervisor copy of the protected VM state using the
- * memory donated by the host.
+ * Reserves an entry in the hypervisor for a new VM in protected mode.
*
- * Unmaps the donated memory from the host at stage 2.
+ * Return a unique handle to the VM on success, negative error code on failure.
+ */
+int __pkvm_reserve_vm(void)
+{
+ int ret;
+
+ hyp_spin_lock(&vm_table_lock);
+ ret = allocate_vm_table_entry();
+ hyp_spin_unlock(&vm_table_lock);
+
+ if (ret < 0)
+ return ret;
+
+ return idx_to_vm_handle(ret);
+}
+
+/*
+ * Removes a reserved entry, but only if is hasn't been used yet.
+ * Otherwise, the VM needs to be destroyed.
+ */
+void __pkvm_unreserve_vm(pkvm_handle_t handle)
+{
+ unsigned int idx = vm_handle_to_idx(handle);
+
+ if (unlikely(!vm_table))
+ return;
+
+ hyp_spin_lock(&vm_table_lock);
+ if (likely(idx < KVM_MAX_PVMS && vm_table[idx] == RESERVED_ENTRY))
+ remove_vm_table_entry(handle);
+ hyp_spin_unlock(&vm_table_lock);
+}
+
+/*
+ * Initialize the hypervisor copy of the VM state using host-donated memory.
+ *
+ * Unmap the donated memory from the host at stage 2.
*
* host_kvm: A pointer to the host's struct kvm.
* vm_hva: The host va of the area being donated for the VM state.
@@ -445,8 +701,7 @@ static void unmap_donated_memory_noclear(void *va, size_t size)
* the VM. Must be page aligned. Its size is implied by the VM's
* VTCR.
*
- * Return a unique handle to the protected VM on success,
- * negative error code on failure.
+ * Return 0 success, negative error code on failure.
*/
int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva,
unsigned long pgd_hva)
@@ -454,6 +709,7 @@ int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva,
struct pkvm_hyp_vm *hyp_vm = NULL;
size_t vm_size, pgd_size;
unsigned int nr_vcpus;
+ pkvm_handle_t handle;
void *pgd = NULL;
int ret;
@@ -467,8 +723,14 @@ int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva,
goto err_unpin_kvm;
}
+ handle = READ_ONCE(host_kvm->arch.pkvm.handle);
+ if (unlikely(handle < HANDLE_OFFSET)) {
+ ret = -EINVAL;
+ goto err_unpin_kvm;
+ }
+
vm_size = pkvm_get_hyp_vm_size(nr_vcpus);
- pgd_size = kvm_pgtable_stage2_pgd_size(host_mmu.arch.vtcr);
+ pgd_size = kvm_pgtable_stage2_pgd_size(host_mmu.arch.mmu.vtcr);
ret = -ENOMEM;
@@ -480,24 +742,19 @@ int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva,
if (!pgd)
goto err_remove_mappings;
- init_pkvm_hyp_vm(host_kvm, hyp_vm, nr_vcpus);
-
- hyp_spin_lock(&vm_table_lock);
- ret = insert_vm_table_entry(host_kvm, hyp_vm);
- if (ret < 0)
- goto err_unlock;
+ init_pkvm_hyp_vm(host_kvm, hyp_vm, nr_vcpus, handle);
ret = kvm_guest_prepare_stage2(hyp_vm, pgd);
if (ret)
- goto err_remove_vm_table_entry;
- hyp_spin_unlock(&vm_table_lock);
+ goto err_remove_mappings;
- return hyp_vm->kvm.arch.pkvm.handle;
+ /* Must be called last since this publishes the VM. */
+ ret = insert_vm_table_entry(handle, hyp_vm);
+ if (ret)
+ goto err_remove_mappings;
+
+ return 0;
-err_remove_vm_table_entry:
- remove_vm_table_entry(hyp_vm->kvm.arch.pkvm.handle);
-err_unlock:
- hyp_spin_unlock(&vm_table_lock);
err_remove_mappings:
unmap_donated_memory(hyp_vm, vm_size);
unmap_donated_memory(pgd, pgd_size);
@@ -507,10 +764,9 @@ err_unpin_kvm:
}
/*
- * Initialize the hypervisor copy of the protected vCPU state using the
- * memory donated by the host.
+ * Initialize the hypervisor copy of the vCPU state using host-donated memory.
*
- * handle: The handle for the protected vm.
+ * handle: The hypervisor handle for the vm.
* host_vcpu: A pointer to the corresponding host vcpu.
* vcpu_hva: The host va of the area being donated for the vcpu state.
* Must be page aligned. The size of the area must be equal to
@@ -537,24 +793,27 @@ int __pkvm_init_vcpu(pkvm_handle_t handle, struct kvm_vcpu *host_vcpu,
goto unlock;
}
- idx = hyp_vm->nr_vcpus;
+ ret = init_pkvm_hyp_vcpu(hyp_vcpu, hyp_vm, host_vcpu);
+ if (ret)
+ goto unlock;
+
+ idx = hyp_vcpu->vcpu.vcpu_idx;
if (idx >= hyp_vm->kvm.created_vcpus) {
ret = -EINVAL;
goto unlock;
}
- ret = init_pkvm_hyp_vcpu(hyp_vcpu, hyp_vm, host_vcpu, idx);
- if (ret)
+ if (hyp_vm->vcpus[idx]) {
+ ret = -EINVAL;
goto unlock;
+ }
hyp_vm->vcpus[idx] = hyp_vcpu;
- hyp_vm->nr_vcpus++;
unlock:
hyp_spin_unlock(&vm_table_lock);
if (ret)
unmap_donated_memory(hyp_vcpu, sizeof(*hyp_vcpu));
-
return ret;
}
@@ -572,7 +831,7 @@ teardown_donated_memory(struct kvm_hyp_memcache *mc, void *addr, size_t size)
int __pkvm_teardown_vm(pkvm_handle_t handle)
{
- struct kvm_hyp_memcache *mc;
+ struct kvm_hyp_memcache *mc, *stage2_mc;
struct pkvm_hyp_vm *hyp_vm;
struct kvm *host_kvm;
unsigned int idx;
@@ -600,12 +859,26 @@ int __pkvm_teardown_vm(pkvm_handle_t handle)
/* Reclaim guest pages (including page-table pages) */
mc = &host_kvm->arch.pkvm.teardown_mc;
- reclaim_guest_pages(hyp_vm, mc);
- unpin_host_vcpus(hyp_vm->vcpus, hyp_vm->nr_vcpus);
+ stage2_mc = &host_kvm->arch.pkvm.stage2_teardown_mc;
+ reclaim_pgtable_pages(hyp_vm, stage2_mc);
+ unpin_host_vcpus(hyp_vm->vcpus, hyp_vm->kvm.created_vcpus);
/* Push the metadata pages to the teardown memcache */
- for (idx = 0; idx < hyp_vm->nr_vcpus; ++idx) {
+ for (idx = 0; idx < hyp_vm->kvm.created_vcpus; ++idx) {
struct pkvm_hyp_vcpu *hyp_vcpu = hyp_vm->vcpus[idx];
+ struct kvm_hyp_memcache *vcpu_mc;
+
+ if (!hyp_vcpu)
+ continue;
+
+ vcpu_mc = &hyp_vcpu->vcpu.arch.pkvm_memcache;
+
+ while (vcpu_mc->nr_pages) {
+ void *addr = pop_hyp_memcache(vcpu_mc, hyp_phys_to_virt);
+
+ push_hyp_memcache(stage2_mc, addr, hyp_virt_to_phys);
+ unmap_donated_memory_noclear(addr, PAGE_SIZE);
+ }
teardown_donated_memory(mc, hyp_vcpu, sizeof(*hyp_vcpu));
}
diff --git a/arch/arm64/kvm/hyp/nvhe/psci-relay.c b/arch/arm64/kvm/hyp/nvhe/psci-relay.c
index 08508783ec3d..c3e196fb8b18 100644
--- a/arch/arm64/kvm/hyp/nvhe/psci-relay.c
+++ b/arch/arm64/kvm/hyp/nvhe/psci-relay.c
@@ -200,12 +200,12 @@ static int psci_system_suspend(u64 func_id, struct kvm_cpu_context *host_ctxt)
__hyp_pa(init_params), 0);
}
-asmlinkage void __noreturn kvm_host_psci_cpu_entry(bool is_cpu_on)
+asmlinkage void __noreturn __kvm_host_psci_cpu_entry(bool is_cpu_on)
{
struct psci_boot_args *boot_args;
struct kvm_cpu_context *host_ctxt;
- host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
+ host_ctxt = host_data_ptr(host_ctxt);
if (is_cpu_on)
boot_args = this_cpu_ptr(&cpu_on_args);
@@ -218,6 +218,9 @@ asmlinkage void __noreturn kvm_host_psci_cpu_entry(bool is_cpu_on)
if (is_cpu_on)
release_boot_args(boot_args);
+ write_sysreg_el1(INIT_SCTLR_EL1_MMU_OFF, SYS_SCTLR);
+ write_sysreg(INIT_PSTATE_EL1, SPSR_EL2);
+
__host_enter(host_ctxt);
}
@@ -265,6 +268,8 @@ static unsigned long psci_1_0_handler(u64 func_id, struct kvm_cpu_context *host_
case PSCI_1_0_FN_PSCI_FEATURES:
case PSCI_1_0_FN_SET_SUSPEND_MODE:
case PSCI_1_1_FN64_SYSTEM_RESET2:
+ case PSCI_1_3_FN_SYSTEM_OFF2:
+ case PSCI_1_3_FN64_SYSTEM_OFF2:
return psci_forward(host_ctxt);
case PSCI_1_0_FN64_SYSTEM_SUSPEND:
return psci_system_suspend(func_id, host_ctxt);
@@ -273,9 +278,8 @@ static unsigned long psci_1_0_handler(u64 func_id, struct kvm_cpu_context *host_
}
}
-bool kvm_host_psci_handler(struct kvm_cpu_context *host_ctxt)
+bool kvm_host_psci_handler(struct kvm_cpu_context *host_ctxt, u32 func_id)
{
- DECLARE_REG(u64, func_id, host_ctxt, 0);
unsigned long ret;
switch (kvm_host_psci_config.version) {
diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c
index 110f04627785..90bd014e952f 100644
--- a/arch/arm64/kvm/hyp/nvhe/setup.c
+++ b/arch/arm64/kvm/hyp/nvhe/setup.c
@@ -11,7 +11,7 @@
#include <asm/kvm_pkvm.h>
#include <nvhe/early_alloc.h>
-#include <nvhe/fixed_config.h>
+#include <nvhe/ffa.h>
#include <nvhe/gfp.h>
#include <nvhe/memory.h>
#include <nvhe/mem_protect.h>
@@ -28,6 +28,8 @@ static void *vmemmap_base;
static void *vm_table_base;
static void *hyp_pgt_base;
static void *host_s2_pgt_base;
+static void *selftest_base;
+static void *ffa_proxy_pages;
static struct kvm_pgtable_mm_ops pkvm_pgtable_mm_ops;
static struct hyp_pool hpool;
@@ -37,6 +39,11 @@ static int divide_memory_pool(void *virt, unsigned long size)
hyp_early_alloc_init(virt, size);
+ nr_pages = pkvm_selftest_pages();
+ selftest_base = hyp_early_alloc_contig(nr_pages);
+ if (nr_pages && !selftest_base)
+ return -ENOMEM;
+
nr_pages = hyp_vmemmap_pages(sizeof(struct hyp_page));
vmemmap_base = hyp_early_alloc_contig(nr_pages);
if (!vmemmap_base)
@@ -57,6 +64,33 @@ static int divide_memory_pool(void *virt, unsigned long size)
if (!host_s2_pgt_base)
return -ENOMEM;
+ nr_pages = hyp_ffa_proxy_pages();
+ ffa_proxy_pages = hyp_early_alloc_contig(nr_pages);
+ if (!ffa_proxy_pages)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static int pkvm_create_host_sve_mappings(void)
+{
+ void *start, *end;
+ int ret, i;
+
+ if (!system_supports_sve())
+ return 0;
+
+ for (i = 0; i < hyp_nr_cpus; i++) {
+ struct kvm_host_data *host_data = per_cpu_ptr(&kvm_host_data, i);
+ struct cpu_sve_state *sve_state = host_data->sve_state;
+
+ start = kern_hyp_va(sve_state);
+ end = start + PAGE_ALIGN(pkvm_host_sve_state_size());
+ ret = pkvm_create_mappings(start, end, PAGE_HYP);
+ if (ret)
+ return ret;
+ }
+
return 0;
}
@@ -66,7 +100,6 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size,
{
void *start, *end, *virt = hyp_phys_to_virt(phys);
unsigned long pgt_size = hyp_s1_pgtable_pages() << PAGE_SHIFT;
- enum kvm_pgtable_prot prot;
int ret, i;
/* Recreate the hyp page-table using the early page allocator */
@@ -92,6 +125,10 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size,
if (ret)
return ret;
+ ret = pkvm_create_mappings(__hyp_data_start, __hyp_data_end, PAGE_HYP);
+ if (ret)
+ return ret;
+
ret = pkvm_create_mappings(__hyp_rodata_start, __hyp_rodata_end, PAGE_HYP_RO);
if (ret)
return ret;
@@ -106,7 +143,6 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size,
for (i = 0; i < hyp_nr_cpus; i++) {
struct kvm_nvhe_init_params *params = per_cpu_ptr(&kvm_init_params, i);
- unsigned long hyp_addr;
start = (void *)kern_hyp_va(per_cpu_base[i]);
end = start + PAGE_ALIGN(hyp_percpu_size);
@@ -114,51 +150,12 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size,
if (ret)
return ret;
- /*
- * Allocate a contiguous HYP private VA range for the stack
- * and guard page. The allocation is also aligned based on
- * the order of its size.
- */
- ret = pkvm_alloc_private_va_range(PAGE_SIZE * 2, &hyp_addr);
- if (ret)
- return ret;
-
- /*
- * Since the stack grows downwards, map the stack to the page
- * at the higher address and leave the lower guard page
- * unbacked.
- *
- * Any valid stack address now has the PAGE_SHIFT bit as 1
- * and addresses corresponding to the guard page have the
- * PAGE_SHIFT bit as 0 - this is used for overflow detection.
- */
- hyp_spin_lock(&pkvm_pgd_lock);
- ret = kvm_pgtable_hyp_map(&pkvm_pgtable, hyp_addr + PAGE_SIZE,
- PAGE_SIZE, params->stack_pa, PAGE_HYP);
- hyp_spin_unlock(&pkvm_pgd_lock);
+ ret = pkvm_create_stack(params->stack_pa, &params->stack_hyp_va);
if (ret)
return ret;
-
- /* Update stack_hyp_va to end of the stack's private VA range */
- params->stack_hyp_va = hyp_addr + (2 * PAGE_SIZE);
}
- /*
- * Map the host sections RO in the hypervisor, but transfer the
- * ownership from the host to the hypervisor itself to make sure they
- * can't be donated or shared with another entity.
- *
- * The ownership transition requires matching changes in the host
- * stage-2. This will be done later (see finalize_host_mappings()) once
- * the hyp_vmemmap is addressable.
- */
- prot = pkvm_mkstate(PAGE_HYP_RO, PKVM_PAGE_SHARED_OWNED);
- ret = pkvm_create_mappings(&kvm_vgic_global_state,
- &kvm_vgic_global_state + 1, prot);
- if (ret)
- return ret;
-
- return 0;
+ return pkvm_create_host_sve_mappings();
}
static void update_nvhe_init_params(void)
@@ -192,39 +189,53 @@ static void hpool_put_page(void *addr)
static int fix_host_ownership_walker(const struct kvm_pgtable_visit_ctx *ctx,
enum kvm_pgtable_walk_flags visit)
{
- enum kvm_pgtable_prot prot;
enum pkvm_page_state state;
+ struct hyp_page *page;
phys_addr_t phys;
+ enum kvm_pgtable_prot prot;
if (!kvm_pte_valid(ctx->old))
return 0;
- if (ctx->level != (KVM_PGTABLE_MAX_LEVELS - 1))
+ if (ctx->level != KVM_PGTABLE_LAST_LEVEL)
return -EINVAL;
phys = kvm_pte_to_phys(ctx->old);
if (!addr_is_memory(phys))
return -EINVAL;
+ page = hyp_phys_to_page(phys);
+
/*
* Adjust the host stage-2 mappings to match the ownership attributes
- * configured in the hypervisor stage-1.
+ * configured in the hypervisor stage-1, and make sure to propagate them
+ * to the hyp_vmemmap state.
*/
- state = pkvm_getstate(kvm_pgtable_hyp_pte_prot(ctx->old));
+ prot = kvm_pgtable_hyp_pte_prot(ctx->old);
+ state = pkvm_getstate(prot);
switch (state) {
case PKVM_PAGE_OWNED:
- return host_stage2_set_owner_locked(phys, PAGE_SIZE, PKVM_ID_HYP);
+ set_hyp_state(page, PKVM_PAGE_OWNED);
+ /* hyp text is RO in the host stage-2 to be inspected on panic. */
+ if (prot == PAGE_HYP_EXEC) {
+ set_host_state(page, PKVM_NOPAGE);
+ return host_stage2_idmap_locked(phys, PAGE_SIZE, KVM_PGTABLE_PROT_R);
+ } else {
+ return host_stage2_set_owner_locked(phys, PAGE_SIZE, PKVM_ID_HYP);
+ }
case PKVM_PAGE_SHARED_OWNED:
- prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, PKVM_PAGE_SHARED_BORROWED);
+ set_hyp_state(page, PKVM_PAGE_SHARED_OWNED);
+ set_host_state(page, PKVM_PAGE_SHARED_BORROWED);
break;
case PKVM_PAGE_SHARED_BORROWED:
- prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, PKVM_PAGE_SHARED_OWNED);
+ set_hyp_state(page, PKVM_PAGE_SHARED_BORROWED);
+ set_host_state(page, PKVM_PAGE_SHARED_OWNED);
break;
default:
return -EINVAL;
}
- return host_stage2_idmap_locked(phys, PAGE_SIZE, prot);
+ return 0;
}
static int fix_hyp_pgtable_refcnt_walker(const struct kvm_pgtable_visit_ctx *ctx,
@@ -275,8 +286,7 @@ static int fix_hyp_pgtable_refcnt(void)
void __noreturn __pkvm_init_finalise(void)
{
- struct kvm_host_data *host_data = this_cpu_ptr(&kvm_host_data);
- struct kvm_cpu_context *host_ctxt = &host_data->host_ctxt;
+ struct kvm_cpu_context *host_ctxt = host_data_ptr(host_ctxt);
unsigned long nr_pages, reserved_pages, pfn;
int ret;
@@ -310,11 +320,17 @@ void __noreturn __pkvm_init_finalise(void)
if (ret)
goto out;
- ret = hyp_create_pcpu_fixmap();
+ ret = hyp_create_fixmap();
+ if (ret)
+ goto out;
+
+ ret = hyp_ffa_init(ffa_proxy_pages);
if (ret)
goto out;
pkvm_hyp_vm_table_init(vm_table_base);
+
+ pkvm_ownership_selftest(selftest_base);
out:
/*
* We tail-called to here from handle___pkvm_init() and will not return,
@@ -330,7 +346,7 @@ int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long nr_cpus,
{
struct kvm_nvhe_init_params *params;
void *virt = hyp_phys_to_virt(phys);
- void (*fn)(phys_addr_t params_pa, void *finalize_fn_va);
+ typeof(__pkvm_init_switch_pgd) *fn;
int ret;
BUG_ON(kvm_check_pvm_sysreg_table());
@@ -354,7 +370,7 @@ int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long nr_cpus,
/* Jump in the idmap page to switch to the new page-tables */
params = this_cpu_ptr(&kvm_init_params);
fn = (typeof(fn))__hyp_pa(__pkvm_init_switch_pgd);
- fn(__hyp_pa(params), __pkvm_init_finalise);
+ fn(params->pgd_pa, params->stack_hyp_va, __pkvm_init_finalise);
unreachable();
}
diff --git a/arch/arm64/kvm/hyp/nvhe/stacktrace.c b/arch/arm64/kvm/hyp/nvhe/stacktrace.c
index ed6b58b19cfa..5b6eeab1a774 100644
--- a/arch/arm64/kvm/hyp/nvhe/stacktrace.c
+++ b/arch/arm64/kvm/hyp/nvhe/stacktrace.c
@@ -28,7 +28,7 @@ static void hyp_prepare_backtrace(unsigned long fp, unsigned long pc)
struct kvm_nvhe_stacktrace_info *stacktrace_info = this_cpu_ptr(&kvm_stacktrace_info);
struct kvm_nvhe_init_params *params = this_cpu_ptr(&kvm_init_params);
- stacktrace_info->stack_base = (unsigned long)(params->stack_hyp_va - PAGE_SIZE);
+ stacktrace_info->stack_base = (unsigned long)(params->stack_hyp_va - NVHE_STACK_SIZE);
stacktrace_info->overflow_stack_base = (unsigned long)this_cpu_ptr(overflow_stack);
stacktrace_info->fp = fp;
stacktrace_info->pc = pc;
@@ -54,7 +54,7 @@ static struct stack_info stackinfo_get_hyp(void)
{
struct kvm_nvhe_init_params *params = this_cpu_ptr(&kvm_init_params);
unsigned long high = params->stack_hyp_va;
- unsigned long low = high - PAGE_SIZE;
+ unsigned long low = high - NVHE_STACK_SIZE;
return (struct stack_info) {
.low = low,
diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c
index c2cb46ca4fb6..d3b9ec8a7c28 100644
--- a/arch/arm64/kvm/hyp/nvhe/switch.c
+++ b/arch/arm64/kvm/hyp/nvhe/switch.c
@@ -26,7 +26,6 @@
#include <asm/debug-monitors.h>
#include <asm/processor.h>
-#include <nvhe/fixed_config.h>
#include <nvhe/mem_protect.h>
/* Non-VHE specific context */
@@ -34,25 +33,30 @@ DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data);
DEFINE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
DEFINE_PER_CPU(unsigned long, kvm_hyp_vector);
+struct fgt_masks hfgrtr_masks;
+struct fgt_masks hfgwtr_masks;
+struct fgt_masks hfgitr_masks;
+struct fgt_masks hdfgrtr_masks;
+struct fgt_masks hdfgwtr_masks;
+struct fgt_masks hafgrtr_masks;
+struct fgt_masks hfgrtr2_masks;
+struct fgt_masks hfgwtr2_masks;
+struct fgt_masks hfgitr2_masks;
+struct fgt_masks hdfgrtr2_masks;
+struct fgt_masks hdfgwtr2_masks;
+
extern void kvm_nvhe_prepare_backtrace(unsigned long fp, unsigned long pc);
static void __activate_traps(struct kvm_vcpu *vcpu)
{
- u64 val;
+ ___activate_traps(vcpu, vcpu->arch.hcr_el2);
- ___activate_traps(vcpu);
- __activate_traps_common(vcpu);
+ *host_data_ptr(host_debug_state.mdcr_el2) = read_sysreg(mdcr_el2);
+ write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2);
- val = vcpu->arch.cptr_el2;
- val |= CPTR_EL2_TTA | CPTR_EL2_TAM;
- if (!guest_owns_fp_regs(vcpu)) {
- val |= CPTR_EL2_TFP | CPTR_EL2_TZ;
- __activate_traps_fpsimd32(vcpu);
- }
- if (cpus_have_final_cap(ARM64_SME))
- val |= CPTR_EL2_TSM;
+ __activate_traps_common(vcpu);
+ __activate_cptr_traps(vcpu);
- write_sysreg(val, cptr_el2);
write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el2);
if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) {
@@ -73,7 +77,6 @@ static void __activate_traps(struct kvm_vcpu *vcpu)
static void __deactivate_traps(struct kvm_vcpu *vcpu)
{
extern char __kvm_hyp_host_vector[];
- u64 cptr;
___deactivate_traps(vcpu);
@@ -94,17 +97,13 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu)
isb();
}
- __deactivate_traps_common(vcpu);
+ write_sysreg(*host_data_ptr(host_debug_state.mdcr_el2), mdcr_el2);
- write_sysreg(this_cpu_ptr(&kvm_init_params)->hcr_el2, hcr_el2);
+ __deactivate_traps_common(vcpu);
- cptr = CPTR_EL2_DEFAULT;
- if (vcpu_has_sve(vcpu) && (vcpu->arch.fp_state == FP_STATE_GUEST_OWNED))
- cptr |= CPTR_EL2_TZ;
- if (cpus_have_final_cap(ARM64_SME))
- cptr &= ~CPTR_EL2_TSM;
+ write_sysreg_hcr(this_cpu_ptr(&kvm_init_params)->hcr_el2);
- write_sysreg(cptr, cptr_el2);
+ __deactivate_cptr_traps(vcpu);
write_sysreg(__kvm_hyp_host_vector, vbar_el2);
}
@@ -170,9 +169,8 @@ static void __pmu_switch_to_host(struct kvm_vcpu *vcpu)
static bool kvm_handle_pvm_sys64(struct kvm_vcpu *vcpu, u64 *exit_code)
{
/*
- * Make sure we handle the exit for workarounds and ptrauth
- * before the pKVM handling, as the latter could decide to
- * UNDEF.
+ * Make sure we handle the exit for workarounds before the pKVM
+ * handling, as the latter could decide to UNDEF.
*/
return (kvm_hyp_handle_sysreg(vcpu, exit_code) ||
kvm_handle_pvm_sysreg(vcpu, exit_code));
@@ -186,7 +184,8 @@ static const exit_handler_fn hyp_exit_handlers[] = {
[ESR_ELx_EC_FP_ASIMD] = kvm_hyp_handle_fpsimd,
[ESR_ELx_EC_IABT_LOW] = kvm_hyp_handle_iabt_low,
[ESR_ELx_EC_DABT_LOW] = kvm_hyp_handle_dabt_low,
- [ESR_ELx_EC_PAC] = kvm_hyp_handle_ptrauth,
+ [ESR_ELx_EC_WATCHPT_LOW] = kvm_hyp_handle_watchpt_low,
+ [ESR_ELx_EC_MOPS] = kvm_hyp_handle_mops,
};
static const exit_handler_fn pvm_exit_handlers[] = {
@@ -196,33 +195,34 @@ static const exit_handler_fn pvm_exit_handlers[] = {
[ESR_ELx_EC_FP_ASIMD] = kvm_hyp_handle_fpsimd,
[ESR_ELx_EC_IABT_LOW] = kvm_hyp_handle_iabt_low,
[ESR_ELx_EC_DABT_LOW] = kvm_hyp_handle_dabt_low,
- [ESR_ELx_EC_PAC] = kvm_hyp_handle_ptrauth,
+ [ESR_ELx_EC_WATCHPT_LOW] = kvm_hyp_handle_watchpt_low,
+ [ESR_ELx_EC_MOPS] = kvm_hyp_handle_mops,
};
static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu)
{
- if (unlikely(kvm_vm_is_protected(kern_hyp_va(vcpu->kvm))))
+ if (unlikely(vcpu_is_protected(vcpu)))
return pvm_exit_handlers;
return hyp_exit_handlers;
}
-/*
- * Some guests (e.g., protected VMs) are not be allowed to run in AArch32.
- * The ARMv8 architecture does not give the hypervisor a mechanism to prevent a
- * guest from dropping to AArch32 EL0 if implemented by the CPU. If the
- * hypervisor spots a guest in such a state ensure it is handled, and don't
- * trust the host to spot or fix it. The check below is based on the one in
- * kvm_arch_vcpu_ioctl_run().
- *
- * Returns false if the guest ran in AArch32 when it shouldn't have, and
- * thus should exit to the host, or true if a the guest run loop can continue.
- */
-static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code)
+static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code)
{
- struct kvm *kvm = kern_hyp_va(vcpu->kvm);
+ const exit_handler_fn *handlers = kvm_get_exit_handler_array(vcpu);
+
+ synchronize_vcpu_pstate(vcpu, exit_code);
- if (kvm_vm_is_protected(kvm) && vcpu_mode_is_32bit(vcpu)) {
+ /*
+ * Some guests (e.g., protected VMs) are not be allowed to run in
+ * AArch32. The ARMv8 architecture does not give the hypervisor a
+ * mechanism to prevent a guest from dropping to AArch32 EL0 if
+ * implemented by the CPU. If the hypervisor spots a guest in such a
+ * state ensure it is handled, and don't trust the host to spot or fix
+ * it. The check below is based on the one in
+ * kvm_arch_vcpu_ioctl_run().
+ */
+ if (unlikely(vcpu_is_protected(vcpu) && vcpu_mode_is_32bit(vcpu))) {
/*
* As we have caught the guest red-handed, decide that it isn't
* fit for purpose anymore by making the vcpu invalid. The VMM
@@ -230,10 +230,12 @@ static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code)
* KVM_ARM_VCPU_INIT, however, this is likely not possible for
* protected VMs.
*/
- vcpu->arch.target = -1;
+ vcpu_clear_flag(vcpu, VCPU_INITIALIZED);
*exit_code &= BIT(ARM_EXIT_WITH_SERROR_BIT);
*exit_code |= ARM_EXCEPTION_IL;
}
+
+ return __fixup_guest_exit(vcpu, exit_code, handlers);
}
/* Switch to the guest for legacy non-VHE systems */
@@ -256,7 +258,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
pmr_sync();
}
- host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
+ host_ctxt = host_data_ptr(host_ctxt);
host_ctxt->__hyp_running_vcpu = vcpu;
guest_ctxt = &vcpu->arch.ctxt;
@@ -272,6 +274,17 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
*/
__debug_save_host_buffers_nvhe(vcpu);
+ /*
+ * We're about to restore some new MMU state. Make sure
+ * ongoing page-table walks that have started before we
+ * trapped to EL2 have completed. This also synchronises the
+ * above disabling of BRBE, SPE and TRBE.
+ *
+ * See DDI0487I.a D8.1.5 "Out-of-context translation regimes",
+ * rule R_LFHQG and subsequent information statements.
+ */
+ dsb(nsh);
+
__kvm_adjust_pc(vcpu);
/*
@@ -306,12 +319,19 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
__timer_disable_traps(vcpu);
__hyp_vgic_save_state(vcpu);
+ /*
+ * Same thing as before the guest run: we're about to switch
+ * the MMU context, so let's make sure we don't have any
+ * ongoing EL1&0 translations.
+ */
+ dsb(nsh);
+
__deactivate_traps(vcpu);
__load_host_stage2();
__sysreg_restore_state_nvhe(host_ctxt);
- if (vcpu->arch.fp_state == FP_STATE_GUEST_OWNED)
+ if (guest_owns_fp_regs())
__fpsimd_save_fpexc32(vcpu);
__debug_switch_to_host(vcpu);
@@ -341,7 +361,7 @@ asmlinkage void __noreturn hyp_panic(void)
struct kvm_cpu_context *host_ctxt;
struct kvm_vcpu *vcpu;
- host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
+ host_ctxt = host_data_ptr(host_ctxt);
vcpu = host_ctxt->__hyp_running_vcpu;
if (vcpu) {
diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c b/arch/arm64/kvm/hyp/nvhe/sys_regs.c
index 0f9ac25afdf4..3108b5185c20 100644
--- a/arch/arm64/kvm/hyp/nvhe/sys_regs.c
+++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c
@@ -11,7 +11,7 @@
#include <hyp/adjust_pc.h>
-#include <nvhe/fixed_config.h>
+#include <nvhe/pkvm.h>
#include "../../sys_regs.h"
@@ -26,230 +26,258 @@ u64 id_aa64isar2_el1_sys_val;
u64 id_aa64mmfr0_el1_sys_val;
u64 id_aa64mmfr1_el1_sys_val;
u64 id_aa64mmfr2_el1_sys_val;
+u64 id_aa64smfr0_el1_sys_val;
+
+struct pvm_ftr_bits {
+ bool sign;
+ u8 shift;
+ u8 width;
+ u8 max_val;
+ bool (*vm_supported)(const struct kvm *kvm);
+};
-/*
- * Inject an unknown/undefined exception to an AArch64 guest while most of its
- * sysregs are live.
- */
-static void inject_undef64(struct kvm_vcpu *vcpu)
-{
- u64 esr = (ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT);
-
- *vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR);
- *vcpu_cpsr(vcpu) = read_sysreg_el2(SYS_SPSR);
-
- kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC);
-
- __kvm_adjust_pc(vcpu);
-
- write_sysreg_el1(esr, SYS_ESR);
- write_sysreg_el1(read_sysreg_el2(SYS_ELR), SYS_ELR);
- write_sysreg_el2(*vcpu_pc(vcpu), SYS_ELR);
- write_sysreg_el2(*vcpu_cpsr(vcpu), SYS_SPSR);
-}
-
-/*
- * Returns the restricted features values of the feature register based on the
- * limitations in restrict_fields.
- * A feature id field value of 0b0000 does not impose any restrictions.
- * Note: Use only for unsigned feature field values.
- */
-static u64 get_restricted_features_unsigned(u64 sys_reg_val,
- u64 restrict_fields)
-{
- u64 value = 0UL;
- u64 mask = GENMASK_ULL(ARM64_FEATURE_FIELD_BITS - 1, 0);
-
- /*
- * According to the Arm Architecture Reference Manual, feature fields
- * use increasing values to indicate increases in functionality.
- * Iterate over the restricted feature fields and calculate the minimum
- * unsigned value between the one supported by the system, and what the
- * value is being restricted to.
- */
- while (sys_reg_val && restrict_fields) {
- value |= min(sys_reg_val & mask, restrict_fields & mask);
- sys_reg_val &= ~mask;
- restrict_fields &= ~mask;
- mask <<= ARM64_FEATURE_FIELD_BITS;
+#define __MAX_FEAT_FUNC(id, fld, max, func, sgn) \
+ { \
+ .sign = sgn, \
+ .shift = id##_##fld##_SHIFT, \
+ .width = id##_##fld##_WIDTH, \
+ .max_val = id##_##fld##_##max, \
+ .vm_supported = func, \
}
- return value;
-}
-
-/*
- * Functions that return the value of feature id registers for protected VMs
- * based on allowed features, system features, and KVM support.
- */
+#define MAX_FEAT_FUNC(id, fld, max, func) \
+ __MAX_FEAT_FUNC(id, fld, max, func, id##_##fld##_SIGNED)
-static u64 get_pvm_id_aa64pfr0(const struct kvm_vcpu *vcpu)
-{
- const struct kvm *kvm = (const struct kvm *)kern_hyp_va(vcpu->kvm);
- u64 set_mask = 0;
- u64 allow_mask = PVM_ID_AA64PFR0_ALLOW;
+#define MAX_FEAT(id, fld, max) \
+ MAX_FEAT_FUNC(id, fld, max, NULL)
- set_mask |= get_restricted_features_unsigned(id_aa64pfr0_el1_sys_val,
- PVM_ID_AA64PFR0_RESTRICT_UNSIGNED);
+#define MAX_FEAT_ENUM(id, fld, max) \
+ __MAX_FEAT_FUNC(id, fld, max, NULL, false)
- /* Spectre and Meltdown mitigation in KVM */
- set_mask |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2),
- (u64)kvm->arch.pfr0_csv2);
- set_mask |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3),
- (u64)kvm->arch.pfr0_csv3);
+#define FEAT_END { .width = 0, }
- return (id_aa64pfr0_el1_sys_val & allow_mask) | set_mask;
-}
-
-static u64 get_pvm_id_aa64pfr1(const struct kvm_vcpu *vcpu)
+static bool vm_has_ptrauth(const struct kvm *kvm)
{
- const struct kvm *kvm = (const struct kvm *)kern_hyp_va(vcpu->kvm);
- u64 allow_mask = PVM_ID_AA64PFR1_ALLOW;
-
- if (!kvm_has_mte(kvm))
- allow_mask &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE);
-
- return id_aa64pfr1_el1_sys_val & allow_mask;
-}
-
-static u64 get_pvm_id_aa64zfr0(const struct kvm_vcpu *vcpu)
-{
- /*
- * No support for Scalable Vectors, therefore, hyp has no sanitized
- * copy of the feature id register.
- */
- BUILD_BUG_ON(PVM_ID_AA64ZFR0_ALLOW != 0ULL);
- return 0;
-}
-
-static u64 get_pvm_id_aa64dfr0(const struct kvm_vcpu *vcpu)
-{
- /*
- * No support for debug, including breakpoints, and watchpoints,
- * therefore, pKVM has no sanitized copy of the feature id register.
- */
- BUILD_BUG_ON(PVM_ID_AA64DFR0_ALLOW != 0ULL);
- return 0;
-}
-
-static u64 get_pvm_id_aa64dfr1(const struct kvm_vcpu *vcpu)
-{
- /*
- * No support for debug, therefore, hyp has no sanitized copy of the
- * feature id register.
- */
- BUILD_BUG_ON(PVM_ID_AA64DFR1_ALLOW != 0ULL);
- return 0;
-}
+ if (!IS_ENABLED(CONFIG_ARM64_PTR_AUTH))
+ return false;
-static u64 get_pvm_id_aa64afr0(const struct kvm_vcpu *vcpu)
-{
- /*
- * No support for implementation defined features, therefore, hyp has no
- * sanitized copy of the feature id register.
- */
- BUILD_BUG_ON(PVM_ID_AA64AFR0_ALLOW != 0ULL);
- return 0;
+ return (cpus_have_final_cap(ARM64_HAS_ADDRESS_AUTH) ||
+ cpus_have_final_cap(ARM64_HAS_GENERIC_AUTH)) &&
+ kvm_vcpu_has_feature(kvm, KVM_ARM_VCPU_PTRAUTH_GENERIC);
}
-static u64 get_pvm_id_aa64afr1(const struct kvm_vcpu *vcpu)
+static bool vm_has_sve(const struct kvm *kvm)
{
- /*
- * No support for implementation defined features, therefore, hyp has no
- * sanitized copy of the feature id register.
- */
- BUILD_BUG_ON(PVM_ID_AA64AFR1_ALLOW != 0ULL);
- return 0;
+ return system_supports_sve() && kvm_vcpu_has_feature(kvm, KVM_ARM_VCPU_SVE);
}
-static u64 get_pvm_id_aa64isar0(const struct kvm_vcpu *vcpu)
-{
- return id_aa64isar0_el1_sys_val & PVM_ID_AA64ISAR0_ALLOW;
-}
+/*
+ * Definitions for features to be allowed or restricted for protected guests.
+ *
+ * Each field in the masks represents the highest supported value for the
+ * feature. If a feature field is not present, it is not supported. Moreover,
+ * these are used to generate the guest's view of the feature registers.
+ *
+ * The approach for protected VMs is to at least support features that are:
+ * - Needed by common Linux distributions (e.g., floating point)
+ * - Trivial to support, e.g., supporting the feature does not introduce or
+ * require tracking of additional state in KVM
+ * - Cannot be trapped or prevent the guest from using anyway
+ */
-static u64 get_pvm_id_aa64isar1(const struct kvm_vcpu *vcpu)
-{
- u64 allow_mask = PVM_ID_AA64ISAR1_ALLOW;
+static const struct pvm_ftr_bits pvmid_aa64pfr0[] = {
+ MAX_FEAT(ID_AA64PFR0_EL1, EL0, IMP),
+ MAX_FEAT(ID_AA64PFR0_EL1, EL1, IMP),
+ MAX_FEAT(ID_AA64PFR0_EL1, EL2, IMP),
+ MAX_FEAT(ID_AA64PFR0_EL1, EL3, IMP),
+ MAX_FEAT(ID_AA64PFR0_EL1, FP, FP16),
+ MAX_FEAT(ID_AA64PFR0_EL1, AdvSIMD, FP16),
+ MAX_FEAT(ID_AA64PFR0_EL1, GIC, IMP),
+ MAX_FEAT_FUNC(ID_AA64PFR0_EL1, SVE, IMP, vm_has_sve),
+ MAX_FEAT(ID_AA64PFR0_EL1, RAS, IMP),
+ MAX_FEAT(ID_AA64PFR0_EL1, DIT, IMP),
+ MAX_FEAT(ID_AA64PFR0_EL1, CSV2, IMP),
+ MAX_FEAT(ID_AA64PFR0_EL1, CSV3, IMP),
+ FEAT_END
+};
- if (!vcpu_has_ptrauth(vcpu))
- allow_mask &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA) |
- ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API) |
- ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPA) |
- ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI));
+static const struct pvm_ftr_bits pvmid_aa64pfr1[] = {
+ MAX_FEAT(ID_AA64PFR1_EL1, BT, IMP),
+ MAX_FEAT(ID_AA64PFR1_EL1, SSBS, SSBS2),
+ MAX_FEAT_ENUM(ID_AA64PFR1_EL1, MTE_frac, NI),
+ FEAT_END
+};
- return id_aa64isar1_el1_sys_val & allow_mask;
-}
+static const struct pvm_ftr_bits pvmid_aa64mmfr0[] = {
+ MAX_FEAT_ENUM(ID_AA64MMFR0_EL1, PARANGE, 40),
+ MAX_FEAT_ENUM(ID_AA64MMFR0_EL1, ASIDBITS, 16),
+ MAX_FEAT(ID_AA64MMFR0_EL1, BIGEND, IMP),
+ MAX_FEAT(ID_AA64MMFR0_EL1, SNSMEM, IMP),
+ MAX_FEAT(ID_AA64MMFR0_EL1, BIGENDEL0, IMP),
+ MAX_FEAT(ID_AA64MMFR0_EL1, EXS, IMP),
+ FEAT_END
+};
-static u64 get_pvm_id_aa64isar2(const struct kvm_vcpu *vcpu)
-{
- u64 allow_mask = PVM_ID_AA64ISAR2_ALLOW;
+static const struct pvm_ftr_bits pvmid_aa64mmfr1[] = {
+ MAX_FEAT(ID_AA64MMFR1_EL1, HAFDBS, DBM),
+ MAX_FEAT_ENUM(ID_AA64MMFR1_EL1, VMIDBits, 16),
+ MAX_FEAT(ID_AA64MMFR1_EL1, HPDS, HPDS2),
+ MAX_FEAT(ID_AA64MMFR1_EL1, PAN, PAN3),
+ MAX_FEAT(ID_AA64MMFR1_EL1, SpecSEI, IMP),
+ MAX_FEAT(ID_AA64MMFR1_EL1, ETS, IMP),
+ MAX_FEAT(ID_AA64MMFR1_EL1, CMOW, IMP),
+ FEAT_END
+};
- if (!vcpu_has_ptrauth(vcpu))
- allow_mask &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_APA3) |
- ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_GPA3));
+static const struct pvm_ftr_bits pvmid_aa64mmfr2[] = {
+ MAX_FEAT(ID_AA64MMFR2_EL1, CnP, IMP),
+ MAX_FEAT(ID_AA64MMFR2_EL1, UAO, IMP),
+ MAX_FEAT(ID_AA64MMFR2_EL1, IESB, IMP),
+ MAX_FEAT(ID_AA64MMFR2_EL1, AT, IMP),
+ MAX_FEAT_ENUM(ID_AA64MMFR2_EL1, IDS, 0x18),
+ MAX_FEAT(ID_AA64MMFR2_EL1, TTL, IMP),
+ MAX_FEAT(ID_AA64MMFR2_EL1, BBM, 2),
+ MAX_FEAT(ID_AA64MMFR2_EL1, E0PD, IMP),
+ FEAT_END
+};
- return id_aa64isar2_el1_sys_val & allow_mask;
-}
+static const struct pvm_ftr_bits pvmid_aa64isar1[] = {
+ MAX_FEAT(ID_AA64ISAR1_EL1, DPB, DPB2),
+ MAX_FEAT_FUNC(ID_AA64ISAR1_EL1, APA, PAuth, vm_has_ptrauth),
+ MAX_FEAT_FUNC(ID_AA64ISAR1_EL1, API, PAuth, vm_has_ptrauth),
+ MAX_FEAT(ID_AA64ISAR1_EL1, JSCVT, IMP),
+ MAX_FEAT(ID_AA64ISAR1_EL1, FCMA, IMP),
+ MAX_FEAT(ID_AA64ISAR1_EL1, LRCPC, LRCPC3),
+ MAX_FEAT(ID_AA64ISAR1_EL1, GPA, IMP),
+ MAX_FEAT(ID_AA64ISAR1_EL1, GPI, IMP),
+ MAX_FEAT(ID_AA64ISAR1_EL1, FRINTTS, IMP),
+ MAX_FEAT(ID_AA64ISAR1_EL1, SB, IMP),
+ MAX_FEAT(ID_AA64ISAR1_EL1, SPECRES, COSP_RCTX),
+ MAX_FEAT(ID_AA64ISAR1_EL1, BF16, EBF16),
+ MAX_FEAT(ID_AA64ISAR1_EL1, DGH, IMP),
+ MAX_FEAT(ID_AA64ISAR1_EL1, I8MM, IMP),
+ FEAT_END
+};
-static u64 get_pvm_id_aa64mmfr0(const struct kvm_vcpu *vcpu)
-{
- u64 set_mask;
+static const struct pvm_ftr_bits pvmid_aa64isar2[] = {
+ MAX_FEAT_FUNC(ID_AA64ISAR2_EL1, GPA3, IMP, vm_has_ptrauth),
+ MAX_FEAT_FUNC(ID_AA64ISAR2_EL1, APA3, PAuth, vm_has_ptrauth),
+ MAX_FEAT(ID_AA64ISAR2_EL1, ATS1A, IMP),
+ FEAT_END
+};
- set_mask = get_restricted_features_unsigned(id_aa64mmfr0_el1_sys_val,
- PVM_ID_AA64MMFR0_RESTRICT_UNSIGNED);
+/*
+ * None of the features in ID_AA64DFR0_EL1 nor ID_AA64MMFR4_EL1 are supported.
+ * However, both have Not-Implemented values that are non-zero. Define them
+ * so they can be used when getting the value of these registers.
+ */
+#define ID_AA64DFR0_EL1_NONZERO_NI \
+( \
+ SYS_FIELD_PREP_ENUM(ID_AA64DFR0_EL1, DoubleLock, NI) | \
+ SYS_FIELD_PREP_ENUM(ID_AA64DFR0_EL1, MTPMU, NI) \
+)
- return (id_aa64mmfr0_el1_sys_val & PVM_ID_AA64MMFR0_ALLOW) | set_mask;
-}
+#define ID_AA64MMFR4_EL1_NONZERO_NI \
+ SYS_FIELD_PREP_ENUM(ID_AA64MMFR4_EL1, E2H0, NI)
-static u64 get_pvm_id_aa64mmfr1(const struct kvm_vcpu *vcpu)
+/*
+ * Returns the value of the feature registers based on the system register
+ * value, the vcpu support for the revelant features, and the additional
+ * restrictions for protected VMs.
+ */
+static u64 get_restricted_features(const struct kvm_vcpu *vcpu,
+ u64 sys_reg_val,
+ const struct pvm_ftr_bits restrictions[])
{
- return id_aa64mmfr1_el1_sys_val & PVM_ID_AA64MMFR1_ALLOW;
-}
+ u64 val = 0UL;
+ int i;
+
+ for (i = 0; restrictions[i].width != 0; i++) {
+ bool (*vm_supported)(const struct kvm *) = restrictions[i].vm_supported;
+ bool sign = restrictions[i].sign;
+ int shift = restrictions[i].shift;
+ int width = restrictions[i].width;
+ u64 min_signed = (1UL << width) - 1UL;
+ u64 sign_bit = 1UL << (width - 1);
+ u64 mask = GENMASK_ULL(width + shift - 1, shift);
+ u64 sys_val = (sys_reg_val & mask) >> shift;
+ u64 pvm_max = restrictions[i].max_val;
+
+ if (vm_supported && !vm_supported(vcpu->kvm))
+ val |= (sign ? min_signed : 0) << shift;
+ else if (sign && (sys_val >= sign_bit || pvm_max >= sign_bit))
+ val |= max(sys_val, pvm_max) << shift;
+ else
+ val |= min(sys_val, pvm_max) << shift;
+ }
-static u64 get_pvm_id_aa64mmfr2(const struct kvm_vcpu *vcpu)
-{
- return id_aa64mmfr2_el1_sys_val & PVM_ID_AA64MMFR2_ALLOW;
+ return val;
}
-/* Read a sanitized cpufeature ID register by its encoding */
-u64 pvm_read_id_reg(const struct kvm_vcpu *vcpu, u32 id)
+static u64 pvm_calc_id_reg(const struct kvm_vcpu *vcpu, u32 id)
{
switch (id) {
case SYS_ID_AA64PFR0_EL1:
- return get_pvm_id_aa64pfr0(vcpu);
+ return get_restricted_features(vcpu, id_aa64pfr0_el1_sys_val, pvmid_aa64pfr0);
case SYS_ID_AA64PFR1_EL1:
- return get_pvm_id_aa64pfr1(vcpu);
- case SYS_ID_AA64ZFR0_EL1:
- return get_pvm_id_aa64zfr0(vcpu);
- case SYS_ID_AA64DFR0_EL1:
- return get_pvm_id_aa64dfr0(vcpu);
- case SYS_ID_AA64DFR1_EL1:
- return get_pvm_id_aa64dfr1(vcpu);
- case SYS_ID_AA64AFR0_EL1:
- return get_pvm_id_aa64afr0(vcpu);
- case SYS_ID_AA64AFR1_EL1:
- return get_pvm_id_aa64afr1(vcpu);
+ return get_restricted_features(vcpu, id_aa64pfr1_el1_sys_val, pvmid_aa64pfr1);
case SYS_ID_AA64ISAR0_EL1:
- return get_pvm_id_aa64isar0(vcpu);
+ return id_aa64isar0_el1_sys_val;
case SYS_ID_AA64ISAR1_EL1:
- return get_pvm_id_aa64isar1(vcpu);
+ return get_restricted_features(vcpu, id_aa64isar1_el1_sys_val, pvmid_aa64isar1);
case SYS_ID_AA64ISAR2_EL1:
- return get_pvm_id_aa64isar2(vcpu);
+ return get_restricted_features(vcpu, id_aa64isar2_el1_sys_val, pvmid_aa64isar2);
case SYS_ID_AA64MMFR0_EL1:
- return get_pvm_id_aa64mmfr0(vcpu);
+ return get_restricted_features(vcpu, id_aa64mmfr0_el1_sys_val, pvmid_aa64mmfr0);
case SYS_ID_AA64MMFR1_EL1:
- return get_pvm_id_aa64mmfr1(vcpu);
+ return get_restricted_features(vcpu, id_aa64mmfr1_el1_sys_val, pvmid_aa64mmfr1);
case SYS_ID_AA64MMFR2_EL1:
- return get_pvm_id_aa64mmfr2(vcpu);
+ return get_restricted_features(vcpu, id_aa64mmfr2_el1_sys_val, pvmid_aa64mmfr2);
+ case SYS_ID_AA64DFR0_EL1:
+ return ID_AA64DFR0_EL1_NONZERO_NI;
+ case SYS_ID_AA64MMFR4_EL1:
+ return ID_AA64MMFR4_EL1_NONZERO_NI;
default:
/* Unhandled ID register, RAZ */
return 0;
}
}
+/*
+ * Inject an unknown/undefined exception to an AArch64 guest while most of its
+ * sysregs are live.
+ */
+static void inject_undef64(struct kvm_vcpu *vcpu)
+{
+ u64 esr = (ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT);
+
+ *vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR);
+ *vcpu_cpsr(vcpu) = read_sysreg_el2(SYS_SPSR);
+ __vcpu_assign_sys_reg(vcpu, VBAR_EL1, read_sysreg_el1(SYS_VBAR));
+
+ kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC);
+
+ __kvm_adjust_pc(vcpu);
+
+ write_sysreg_el1(esr, SYS_ESR);
+ write_sysreg_el1(read_sysreg_el2(SYS_ELR), SYS_ELR);
+ write_sysreg_el2(*vcpu_pc(vcpu), SYS_ELR);
+ write_sysreg_el2(*vcpu_cpsr(vcpu), SYS_SPSR);
+}
+
static u64 read_id_reg(const struct kvm_vcpu *vcpu,
struct sys_reg_desc const *r)
{
- return pvm_read_id_reg(vcpu, reg_to_encoding(r));
+ struct kvm *kvm = vcpu->kvm;
+ u32 reg = reg_to_encoding(r);
+
+ if (WARN_ON_ONCE(!test_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags)))
+ return 0;
+
+ if (reg >= sys_reg(3, 0, 0, 1, 0) && reg <= sys_reg(3, 0, 0, 7, 7))
+ return kvm->arch.id_regs[IDREG_IDX(reg)];
+
+ return 0;
}
/* Handler to RAZ/WI sysregs */
@@ -277,13 +305,6 @@ static bool pvm_access_id_aarch32(struct kvm_vcpu *vcpu,
return false;
}
- /*
- * No support for AArch32 guests, therefore, pKVM has no sanitized copy
- * of AArch32 feature id registers.
- */
- BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL1),
- PVM_ID_AA64PFR0_RESTRICT_UNSIGNED) > ID_AA64PFR0_EL1_ELx_64BIT_ONLY);
-
return pvm_access_raz_wi(vcpu, p, r);
}
@@ -352,6 +373,9 @@ static const struct sys_reg_desc pvm_sys_reg_descs[] = {
/* Debug and Trace Registers are restricted. */
+ /* Group 1 ID registers */
+ HOST_HANDLED(SYS_REVIDR_EL1),
+
/* AArch64 mappings of the AArch32 ID registers */
/* CRm=1 */
AARCH32(SYS_ID_PFR0_EL1),
@@ -420,6 +444,8 @@ static const struct sys_reg_desc pvm_sys_reg_descs[] = {
/* Scalable Vector Registers are restricted. */
+ HOST_HANDLED(SYS_ICC_PMR_EL1),
+
RAZ_WI(SYS_ERRIDR_EL1),
RAZ_WI(SYS_ERRSELR_EL1),
RAZ_WI(SYS_ERXFR_EL1),
@@ -433,13 +459,17 @@ static const struct sys_reg_desc pvm_sys_reg_descs[] = {
/* Limited Ordering Regions Registers are restricted. */
+ HOST_HANDLED(SYS_ICC_DIR_EL1),
+ HOST_HANDLED(SYS_ICC_RPR_EL1),
HOST_HANDLED(SYS_ICC_SGI1R_EL1),
HOST_HANDLED(SYS_ICC_ASGI1R_EL1),
HOST_HANDLED(SYS_ICC_SGI0R_EL1),
+ HOST_HANDLED(SYS_ICC_CTLR_EL1),
{ SYS_DESC(SYS_ICC_SRE_EL1), .access = pvm_gic_read_sre, },
HOST_HANDLED(SYS_CCSIDR_EL1),
HOST_HANDLED(SYS_CLIDR_EL1),
+ HOST_HANDLED(SYS_AIDR_EL1),
HOST_HANDLED(SYS_CSSELR_EL1),
HOST_HANDLED(SYS_CTR_EL0),
@@ -455,6 +485,30 @@ static const struct sys_reg_desc pvm_sys_reg_descs[] = {
};
/*
+ * Initializes feature registers for protected vms.
+ */
+void kvm_init_pvm_id_regs(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_arch *ka = &kvm->arch;
+ u32 r;
+
+ hyp_assert_lock_held(&vm_table_lock);
+
+ if (test_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags))
+ return;
+
+ /*
+ * Initialize only AArch64 id registers since AArch32 isn't supported
+ * for protected VMs.
+ */
+ for (r = sys_reg(3, 0, 0, 4, 0); r <= sys_reg(3, 0, 0, 7, 7); r += sys_reg(0, 0, 0, 0, 1))
+ ka->id_regs[IDREG_IDX(r)] = pvm_calc_id_reg(vcpu, r);
+
+ set_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags);
+}
+
+/*
* Checks that the sysreg table is unique and in-order.
*
* Returns 0 if the table is consistent, or 1 otherwise.
diff --git a/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c b/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c
index 29305022bc04..3cc613cce5f5 100644
--- a/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c
+++ b/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c
@@ -28,7 +28,9 @@ void __sysreg_save_state_nvhe(struct kvm_cpu_context *ctxt)
void __sysreg_restore_state_nvhe(struct kvm_cpu_context *ctxt)
{
- __sysreg_restore_el1_state(ctxt);
+ u64 midr = ctxt_midr_el1(ctxt);
+
+ __sysreg_restore_el1_state(ctxt, midr, ctxt_sys_reg(ctxt, MPIDR_EL1));
__sysreg_restore_common_state(ctxt);
__sysreg_restore_user_state(ctxt);
__sysreg_restore_el2_return_state(ctxt);
diff --git a/arch/arm64/kvm/hyp/nvhe/timer-sr.c b/arch/arm64/kvm/hyp/nvhe/timer-sr.c
index 9072e71693ba..ff176f4ce7de 100644
--- a/arch/arm64/kvm/hyp/nvhe/timer-sr.c
+++ b/arch/arm64/kvm/hyp/nvhe/timer-sr.c
@@ -9,6 +9,7 @@
#include <linux/kvm_host.h>
#include <asm/kvm_hyp.h>
+#include <asm/kvm_mmu.h>
void __kvm_timer_set_cntvoff(u64 cntvoff)
{
@@ -16,33 +17,54 @@ void __kvm_timer_set_cntvoff(u64 cntvoff)
}
/*
- * Should only be called on non-VHE systems.
+ * Should only be called on non-VHE or hVHE setups.
* VHE systems use EL2 timers and configure EL1 timers in kvm_timer_init_vhe().
*/
void __timer_disable_traps(struct kvm_vcpu *vcpu)
{
- u64 val;
+ u64 set, clr, shift = 0;
+
+ if (has_hvhe())
+ shift = 10;
/* Allow physical timer/counter access for the host */
- val = read_sysreg(cnthctl_el2);
- val |= CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN;
- write_sysreg(val, cnthctl_el2);
+ set = (CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN) << shift;
+ clr = CNTHCTL_EL1TVT | CNTHCTL_EL1TVCT;
+
+ sysreg_clear_set(cnthctl_el2, clr, set);
}
/*
- * Should only be called on non-VHE systems.
+ * Should only be called on non-VHE or hVHE setups.
* VHE systems use EL2 timers and configure EL1 timers in kvm_timer_init_vhe().
*/
void __timer_enable_traps(struct kvm_vcpu *vcpu)
{
- u64 val;
+ u64 clr = 0, set = 0;
/*
* Disallow physical timer access for the guest
- * Physical counter access is allowed
+ * Physical counter access is allowed if no offset is enforced
+ * or running protected (we don't offset anything in this case).
*/
- val = read_sysreg(cnthctl_el2);
- val &= ~CNTHCTL_EL1PCEN;
- val |= CNTHCTL_EL1PCTEN;
- write_sysreg(val, cnthctl_el2);
+ clr = CNTHCTL_EL1PCEN;
+ if (is_protected_kvm_enabled() ||
+ !kern_hyp_va(vcpu->kvm)->arch.timer_data.poffset)
+ set |= CNTHCTL_EL1PCTEN;
+ else
+ clr |= CNTHCTL_EL1PCTEN;
+
+ if (has_hvhe()) {
+ clr <<= 10;
+ set <<= 10;
+ }
+
+ /*
+ * Trap the virtual counter/timer if we have a broken cntvoff
+ * implementation.
+ */
+ if (has_broken_cntvoff())
+ set |= CNTHCTL_EL1TVT | CNTHCTL_EL1TVCT;
+
+ sysreg_clear_set(cnthctl_el2, clr, set);
}
diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c
index d296d617f589..48da9ca9763f 100644
--- a/arch/arm64/kvm/hyp/nvhe/tlb.c
+++ b/arch/arm64/kvm/hyp/nvhe/tlb.c
@@ -11,26 +11,94 @@
#include <nvhe/mem_protect.h>
struct tlb_inv_context {
- u64 tcr;
+ struct kvm_s2_mmu *mmu;
+ u64 tcr;
+ u64 sctlr;
};
-static void __tlb_switch_to_guest(struct kvm_s2_mmu *mmu,
- struct tlb_inv_context *cxt)
+static void enter_vmid_context(struct kvm_s2_mmu *mmu,
+ struct tlb_inv_context *cxt,
+ bool nsh)
{
+ struct kvm_s2_mmu *host_s2_mmu = &host_mmu.arch.mmu;
+ struct kvm_cpu_context *host_ctxt;
+ struct kvm_vcpu *vcpu;
+
+ host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
+ vcpu = host_ctxt->__hyp_running_vcpu;
+ cxt->mmu = NULL;
+
+ /*
+ * We have two requirements:
+ *
+ * - ensure that the page table updates are visible to all
+ * CPUs, for which a dsb(DOMAIN-st) is what we need, DOMAIN
+ * being either ish or nsh, depending on the invalidation
+ * type.
+ *
+ * - complete any speculative page table walk started before
+ * we trapped to EL2 so that we can mess with the MM
+ * registers out of context, for which dsb(nsh) is enough
+ *
+ * The composition of these two barriers is a dsb(DOMAIN), and
+ * the 'nsh' parameter tracks the distinction between
+ * Inner-Shareable and Non-Shareable, as specified by the
+ * callers.
+ */
+ if (nsh)
+ dsb(nsh);
+ else
+ dsb(ish);
+
+ /*
+ * If we're already in the desired context, then there's nothing to do.
+ */
+ if (vcpu) {
+ /*
+ * We're in guest context. However, for this to work, this needs
+ * to be called from within __kvm_vcpu_run(), which ensures that
+ * __hyp_running_vcpu is set to the current guest vcpu.
+ */
+ if (mmu == vcpu->arch.hw_mmu || WARN_ON(mmu != host_s2_mmu))
+ return;
+
+ cxt->mmu = vcpu->arch.hw_mmu;
+ } else {
+ /* We're in host context. */
+ if (mmu == host_s2_mmu)
+ return;
+
+ cxt->mmu = host_s2_mmu;
+ }
+
if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) {
u64 val;
/*
* For CPUs that are affected by ARM 1319367, we need to
- * avoid a host Stage-1 walk while we have the guest's
- * VMID set in the VTTBR in order to invalidate TLBs.
- * We're guaranteed that the S1 MMU is enabled, so we can
- * simply set the EPD bits to avoid any further TLB fill.
+ * avoid a Stage-1 walk with the old VMID while we have
+ * the new VMID set in the VTTBR in order to invalidate TLBs.
+ * We're guaranteed that the host S1 MMU is enabled, so
+ * we can simply set the EPD bits to avoid any further
+ * TLB fill. For guests, we ensure that the S1 MMU is
+ * temporarily enabled in the next context.
*/
val = cxt->tcr = read_sysreg_el1(SYS_TCR);
val |= TCR_EPD1_MASK | TCR_EPD0_MASK;
write_sysreg_el1(val, SYS_TCR);
isb();
+
+ if (vcpu) {
+ val = cxt->sctlr = read_sysreg_el1(SYS_SCTLR);
+ if (!(val & SCTLR_ELx_M)) {
+ val |= SCTLR_ELx_M;
+ write_sysreg_el1(val, SYS_SCTLR);
+ isb();
+ }
+ } else {
+ /* The host S1 MMU is always enabled. */
+ cxt->sctlr = SCTLR_ELx_M;
+ }
}
/*
@@ -39,18 +107,40 @@ static void __tlb_switch_to_guest(struct kvm_s2_mmu *mmu,
* ensuring that we always have an ISB, but not two ISBs back
* to back.
*/
- __load_stage2(mmu, kern_hyp_va(mmu->arch));
+ if (vcpu)
+ __load_host_stage2();
+ else
+ __load_stage2(mmu, kern_hyp_va(mmu->arch));
+
asm(ALTERNATIVE("isb", "nop", ARM64_WORKAROUND_SPECULATIVE_AT));
}
-static void __tlb_switch_to_host(struct tlb_inv_context *cxt)
+static void exit_vmid_context(struct tlb_inv_context *cxt)
{
- __load_host_stage2();
+ struct kvm_s2_mmu *mmu = cxt->mmu;
+ struct kvm_cpu_context *host_ctxt;
+ struct kvm_vcpu *vcpu;
+
+ host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
+ vcpu = host_ctxt->__hyp_running_vcpu;
+
+ if (!mmu)
+ return;
+
+ if (vcpu)
+ __load_stage2(mmu, kern_hyp_va(mmu->arch));
+ else
+ __load_host_stage2();
+
+ /* Ensure write of the old VMID */
+ isb();
if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) {
- /* Ensure write of the host VMID */
- isb();
- /* Restore the host's TCR_EL1 */
+ if (!(cxt->sctlr & SCTLR_ELx_M)) {
+ write_sysreg_el1(cxt->sctlr, SYS_SCTLR);
+ isb();
+ }
+
write_sysreg_el1(cxt->tcr, SYS_TCR);
}
}
@@ -60,10 +150,8 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu,
{
struct tlb_inv_context cxt;
- dsb(ishst);
-
/* Switch to requested VMID */
- __tlb_switch_to_guest(mmu, &cxt);
+ enter_vmid_context(mmu, &cxt, false);
/*
* We could do so much better if we had the VA as well.
@@ -84,45 +172,78 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu,
dsb(ish);
isb();
+ exit_vmid_context(&cxt);
+}
+
+void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu,
+ phys_addr_t ipa, int level)
+{
+ struct tlb_inv_context cxt;
+
+ /* Switch to requested VMID */
+ enter_vmid_context(mmu, &cxt, true);
+
/*
- * If the host is running at EL1 and we have a VPIPT I-cache,
- * then we must perform I-cache maintenance at EL2 in order for
- * it to have an effect on the guest. Since the guest cannot hit
- * I-cache lines allocated with a different VMID, we don't need
- * to worry about junk out of guest reset (we nuke the I-cache on
- * VMID rollover), but we do need to be careful when remapping
- * executable pages for the same guest. This can happen when KSM
- * takes a CoW fault on an executable page, copies the page into
- * a page that was previously mapped in the guest and then needs
- * to invalidate the guest view of the I-cache for that page
- * from EL1. To solve this, we invalidate the entire I-cache when
- * unmapping a page from a guest if we have a VPIPT I-cache but
- * the host is running at EL1. As above, we could do better if
- * we had the VA.
- *
- * The moral of this story is: if you have a VPIPT I-cache, then
- * you should be running with VHE enabled.
+ * We could do so much better if we had the VA as well.
+ * Instead, we invalidate Stage-2 for this IPA, and the
+ * whole of Stage-1. Weep...
*/
- if (icache_is_vpipt())
- icache_inval_all_pou();
+ ipa >>= 12;
+ __tlbi_level(ipas2e1, ipa, level);
+
+ /*
+ * We have to ensure completion of the invalidation at Stage-2,
+ * since a table walk on another CPU could refill a TLB with a
+ * complete (S1 + S2) walk based on the old Stage-2 mapping if
+ * the Stage-1 invalidation happened first.
+ */
+ dsb(nsh);
+ __tlbi(vmalle1);
+ dsb(nsh);
+ isb();
- __tlb_switch_to_host(&cxt);
+ exit_vmid_context(&cxt);
}
-void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu)
+void __kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu,
+ phys_addr_t start, unsigned long pages)
{
struct tlb_inv_context cxt;
+ unsigned long stride;
+
+ /*
+ * Since the range of addresses may not be mapped at
+ * the same level, assume the worst case as PAGE_SIZE
+ */
+ stride = PAGE_SIZE;
+ start = round_down(start, stride);
+
+ /* Switch to requested VMID */
+ enter_vmid_context(mmu, &cxt, false);
- dsb(ishst);
+ __flush_s2_tlb_range_op(ipas2e1is, start, pages, stride,
+ TLBI_TTL_UNKNOWN);
+
+ dsb(ish);
+ __tlbi(vmalle1is);
+ dsb(ish);
+ isb();
+
+ exit_vmid_context(&cxt);
+}
+
+void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu)
+{
+ struct tlb_inv_context cxt;
/* Switch to requested VMID */
- __tlb_switch_to_guest(mmu, &cxt);
+ enter_vmid_context(mmu, &cxt, false);
__tlbi(vmalls12e1is);
dsb(ish);
isb();
- __tlb_switch_to_host(&cxt);
+ exit_vmid_context(&cxt);
}
void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu)
@@ -130,32 +251,20 @@ void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu)
struct tlb_inv_context cxt;
/* Switch to requested VMID */
- __tlb_switch_to_guest(mmu, &cxt);
+ enter_vmid_context(mmu, &cxt, false);
__tlbi(vmalle1);
asm volatile("ic iallu");
dsb(nsh);
isb();
- __tlb_switch_to_host(&cxt);
+ exit_vmid_context(&cxt);
}
void __kvm_flush_vm_context(void)
{
- dsb(ishst);
+ /* Same remark as in enter_vmid_context() */
+ dsb(ish);
__tlbi(alle1is);
-
- /*
- * VIPT and PIPT caches are not affected by VMID, so no maintenance
- * is necessary across a VMID rollover.
- *
- * VPIPT caches constrain lookup and maintenance to the active VMID,
- * so we need to invalidate lines with a stale VMID to avoid an ABA
- * race after multiple rollovers.
- *
- */
- if (icache_is_vpipt())
- asm volatile("ic ialluis");
-
dsb(ish);
}
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index b11cf2c618a6..947ac1a951a5 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -11,60 +11,22 @@
#include <asm/kvm_pgtable.h>
#include <asm/stage2_pgtable.h>
-
-#define KVM_PTE_TYPE BIT(1)
-#define KVM_PTE_TYPE_BLOCK 0
-#define KVM_PTE_TYPE_PAGE 1
-#define KVM_PTE_TYPE_TABLE 1
-
-#define KVM_PTE_LEAF_ATTR_LO GENMASK(11, 2)
-
-#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2)
-#define KVM_PTE_LEAF_ATTR_LO_S1_AP GENMASK(7, 6)
-#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO 3
-#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW 1
-#define KVM_PTE_LEAF_ATTR_LO_S1_SH GENMASK(9, 8)
-#define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS 3
-#define KVM_PTE_LEAF_ATTR_LO_S1_AF BIT(10)
-
-#define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR GENMASK(5, 2)
-#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R BIT(6)
-#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W BIT(7)
-#define KVM_PTE_LEAF_ATTR_LO_S2_SH GENMASK(9, 8)
-#define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS 3
-#define KVM_PTE_LEAF_ATTR_LO_S2_AF BIT(10)
-
-#define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 51)
-
-#define KVM_PTE_LEAF_ATTR_HI_SW GENMASK(58, 55)
-
-#define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54)
-
-#define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54)
-
-#define KVM_PTE_LEAF_ATTR_S2_PERMS (KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \
- KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \
- KVM_PTE_LEAF_ATTR_HI_S2_XN)
-
-#define KVM_INVALID_PTE_OWNER_MASK GENMASK(9, 2)
-#define KVM_MAX_OWNER_ID 1
-
-/*
- * Used to indicate a pte for which a 'break-before-make' sequence is in
- * progress.
- */
-#define KVM_INVALID_PTE_LOCKED BIT(10)
-
struct kvm_pgtable_walk_data {
struct kvm_pgtable_walker *walker;
+ const u64 start;
u64 addr;
- u64 end;
+ const u64 end;
};
-static bool kvm_phys_is_valid(u64 phys)
+static bool kvm_pgtable_walk_skip_bbm_tlbi(const struct kvm_pgtable_visit_ctx *ctx)
+{
+ return unlikely(ctx->flags & KVM_PGTABLE_WALK_SKIP_BBM_TLBI);
+}
+
+static bool kvm_pgtable_walk_skip_cmo(const struct kvm_pgtable_visit_ctx *ctx)
{
- return phys < BIT(id_aa64mmfr0_parange_to_phys_shift(ID_AA64MMFR0_EL1_PARANGE_MAX));
+ return unlikely(ctx->flags & KVM_PGTABLE_WALK_SKIP_CMO);
}
static bool kvm_block_mapping_supported(const struct kvm_pgtable_visit_ctx *ctx, u64 phys)
@@ -77,13 +39,13 @@ static bool kvm_block_mapping_supported(const struct kvm_pgtable_visit_ctx *ctx,
if (granule > (ctx->end - ctx->addr))
return false;
- if (kvm_phys_is_valid(phys) && !IS_ALIGNED(phys, granule))
+ if (!IS_ALIGNED(phys, granule))
return false;
return IS_ALIGNED(ctx->addr, granule);
}
-static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, u32 level)
+static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, s8 level)
{
u64 shift = kvm_granule_shift(level);
u64 mask = BIT(PAGE_SHIFT - 3) - 1;
@@ -99,7 +61,7 @@ static u32 kvm_pgd_page_idx(struct kvm_pgtable *pgt, u64 addr)
return (addr & mask) >> shift;
}
-static u32 kvm_pgd_pages(u32 ia_bits, u32 start_level)
+static u32 kvm_pgd_pages(u32 ia_bits, s8 start_level)
{
struct kvm_pgtable pgt = {
.ia_bits = ia_bits,
@@ -109,9 +71,9 @@ static u32 kvm_pgd_pages(u32 ia_bits, u32 start_level)
return kvm_pgd_page_idx(&pgt, -1ULL) + 1;
}
-static bool kvm_pte_table(kvm_pte_t pte, u32 level)
+static bool kvm_pte_table(kvm_pte_t pte, s8 level)
{
- if (level == KVM_PGTABLE_MAX_LEVELS - 1)
+ if (level == KVM_PGTABLE_LAST_LEVEL)
return false;
if (!kvm_pte_valid(pte))
@@ -139,11 +101,11 @@ static kvm_pte_t kvm_init_table_pte(kvm_pte_t *childp, struct kvm_pgtable_mm_ops
return pte;
}
-static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, u32 level)
+static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, s8 level)
{
kvm_pte_t pte = kvm_phys_to_pte(pa);
- u64 type = (level == KVM_PGTABLE_MAX_LEVELS - 1) ? KVM_PTE_TYPE_PAGE :
- KVM_PTE_TYPE_BLOCK;
+ u64 type = (level == KVM_PGTABLE_LAST_LEVEL) ? KVM_PTE_TYPE_PAGE :
+ KVM_PTE_TYPE_BLOCK;
pte |= attr & (KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI);
pte |= FIELD_PREP(KVM_PTE_TYPE, type);
@@ -168,12 +130,31 @@ static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data,
return walker->cb(ctx, visit);
}
+static bool kvm_pgtable_walk_continue(const struct kvm_pgtable_walker *walker,
+ int r)
+{
+ /*
+ * Visitor callbacks return EAGAIN when the conditions that led to a
+ * fault are no longer reflected in the page tables due to a race to
+ * update a PTE. In the context of a fault handler this is interpreted
+ * as a signal to retry guest execution.
+ *
+ * Ignore the return code altogether for walkers outside a fault handler
+ * (e.g. write protecting a range of memory) and chug along with the
+ * page table walk.
+ */
+ if (r == -EAGAIN)
+ return !(walker->flags & KVM_PGTABLE_WALK_HANDLE_FAULT);
+
+ return !r;
+}
+
static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
- struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, u32 level);
+ struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, s8 level);
static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data,
struct kvm_pgtable_mm_ops *mm_ops,
- kvm_pteref_t pteref, u32 level)
+ kvm_pteref_t pteref, s8 level)
{
enum kvm_pgtable_walk_flags flags = data->walker->flags;
kvm_pte_t *ptep = kvm_dereference_pteref(data->walker, pteref);
@@ -182,25 +163,38 @@ static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data,
.old = READ_ONCE(*ptep),
.arg = data->walker->arg,
.mm_ops = mm_ops,
+ .start = data->start,
.addr = data->addr,
.end = data->end,
.level = level,
.flags = flags,
};
int ret = 0;
+ bool reload = false;
kvm_pteref_t childp;
bool table = kvm_pte_table(ctx.old, level);
- if (table && (ctx.flags & KVM_PGTABLE_WALK_TABLE_PRE))
+ if (table && (ctx.flags & KVM_PGTABLE_WALK_TABLE_PRE)) {
ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_TABLE_PRE);
+ reload = true;
+ }
if (!table && (ctx.flags & KVM_PGTABLE_WALK_LEAF)) {
ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_LEAF);
+ reload = true;
+ }
+
+ /*
+ * Reload the page table after invoking the walker callback for leaf
+ * entries or after pre-order traversal, to allow the walker to descend
+ * into a newly installed or replaced table.
+ */
+ if (reload) {
ctx.old = READ_ONCE(*ptep);
table = kvm_pte_table(ctx.old, level);
}
- if (ret)
+ if (!kvm_pgtable_walk_continue(data->walker, ret))
goto out;
if (!table) {
@@ -211,23 +205,27 @@ static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data,
childp = (kvm_pteref_t)kvm_pte_follow(ctx.old, mm_ops);
ret = __kvm_pgtable_walk(data, mm_ops, childp, level + 1);
- if (ret)
+ if (!kvm_pgtable_walk_continue(data->walker, ret))
goto out;
if (ctx.flags & KVM_PGTABLE_WALK_TABLE_POST)
ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_TABLE_POST);
out:
+ if (kvm_pgtable_walk_continue(data->walker, ret))
+ return 0;
+
return ret;
}
static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
- struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, u32 level)
+ struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, s8 level)
{
u32 idx;
int ret = 0;
- if (WARN_ON_ONCE(level >= KVM_PGTABLE_MAX_LEVELS))
+ if (WARN_ON_ONCE(level < KVM_PGTABLE_FIRST_LEVEL ||
+ level > KVM_PGTABLE_LAST_LEVEL))
return -EINVAL;
for (idx = kvm_pgtable_idx(data, level); idx < PTRS_PER_PTE; ++idx) {
@@ -271,6 +269,7 @@ int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
struct kvm_pgtable_walker *walker)
{
struct kvm_pgtable_walk_data walk_data = {
+ .start = ALIGN_DOWN(addr, PAGE_SIZE),
.addr = ALIGN_DOWN(addr, PAGE_SIZE),
.end = PAGE_ALIGN(walk_data.addr + size),
.walker = walker,
@@ -289,7 +288,7 @@ int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
struct leaf_walk_data {
kvm_pte_t pte;
- u32 level;
+ s8 level;
};
static int leaf_walker(const struct kvm_pgtable_visit_ctx *ctx,
@@ -304,7 +303,7 @@ static int leaf_walker(const struct kvm_pgtable_visit_ctx *ctx,
}
int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr,
- kvm_pte_t *ptep, u32 *level)
+ kvm_pte_t *ptep, s8 *level)
{
struct leaf_walk_data data;
struct kvm_pgtable_walker walker = {
@@ -327,7 +326,7 @@ int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr,
}
struct hyp_map_data {
- u64 phys;
+ const u64 phys;
kvm_pte_t attr;
};
@@ -349,12 +348,16 @@ static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, kvm_pte_t *ptep)
if (device)
return -EINVAL;
+
+ if (system_supports_bti_kernel())
+ attr |= KVM_PTE_LEAF_ATTR_HI_S1_GP;
} else {
attr |= KVM_PTE_LEAF_ATTR_HI_S1_XN;
}
attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_AP, ap);
- attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh);
+ if (!kvm_lpa2_is_enabled())
+ attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh);
attr |= KVM_PTE_LEAF_ATTR_LO_S1_AF;
attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW;
*ptep = attr;
@@ -385,13 +388,12 @@ enum kvm_pgtable_prot kvm_pgtable_hyp_pte_prot(kvm_pte_t pte)
static bool hyp_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx,
struct hyp_map_data *data)
{
+ u64 phys = data->phys + (ctx->addr - ctx->start);
kvm_pte_t new;
- u64 granule = kvm_granule_size(ctx->level), phys = data->phys;
if (!kvm_block_mapping_supported(ctx, phys))
return false;
- data->phys += granule;
new = kvm_init_valid_leaf_pte(phys, data->attr, ctx->level);
if (ctx->old == new)
return true;
@@ -414,7 +416,7 @@ static int hyp_map_walker(const struct kvm_pgtable_visit_ctx *ctx,
if (hyp_map_walker_try_leaf(ctx, data))
return 0;
- if (WARN_ON(ctx->level == KVM_PGTABLE_MAX_LEVELS - 1))
+ if (WARN_ON(ctx->level == KVM_PGTABLE_LAST_LEVEL))
return -EINVAL;
childp = (kvm_pte_t *)mm_ops->zalloc_page(NULL);
@@ -470,7 +472,7 @@ static int hyp_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx,
kvm_clear_pte(ctx->ptep);
dsb(ishst);
- __tlbi_level(vae2is, __TLBI_VADDR(ctx->addr, 0), ctx->level);
+ __tlbi_level(vae2is, __TLBI_VADDR(ctx->addr, 0), TLBI_TTL_UNKNOWN);
} else {
if (ctx->end - ctx->addr < granule)
return -EINVAL;
@@ -510,14 +512,19 @@ u64 kvm_pgtable_hyp_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits,
struct kvm_pgtable_mm_ops *mm_ops)
{
- u64 levels = ARM64_HW_PGTABLE_LEVELS(va_bits);
+ s8 start_level = KVM_PGTABLE_LAST_LEVEL + 1 -
+ ARM64_HW_PGTABLE_LEVELS(va_bits);
+
+ if (start_level < KVM_PGTABLE_FIRST_LEVEL ||
+ start_level > KVM_PGTABLE_LAST_LEVEL)
+ return -EINVAL;
pgt->pgd = (kvm_pteref_t)mm_ops->zalloc_page(NULL);
if (!pgt->pgd)
return -ENOMEM;
pgt->ia_bits = va_bits;
- pgt->start_level = KVM_PGTABLE_MAX_LEVELS - levels;
+ pgt->start_level = start_level;
pgt->mm_ops = mm_ops;
pgt->mmu = NULL;
pgt->force_pte_cb = NULL;
@@ -554,7 +561,7 @@ void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt)
}
struct stage2_map_data {
- u64 phys;
+ const u64 phys;
kvm_pte_t attr;
u8 owner_id;
@@ -566,12 +573,15 @@ struct stage2_map_data {
/* Force mappings to page granularity */
bool force_pte;
+
+ /* Walk should update owner_id only */
+ bool annotation;
};
u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift)
{
u64 vtcr = VTCR_EL2_FLAGS;
- u8 lvls;
+ s8 lvls;
vtcr |= kvm_get_parange(mmfr0) << VTCR_EL2_PS_SHIFT;
vtcr |= VTCR_EL2_T0SZ(phys_shift);
@@ -582,14 +592,36 @@ u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift)
lvls = stage2_pgtable_levels(phys_shift);
if (lvls < 2)
lvls = 2;
+
+ /*
+ * When LPA2 is enabled, the HW supports an extra level of translation
+ * (for 5 in total) when using 4K pages. It also introduces VTCR_EL2.SL2
+ * to as an addition to SL0 to enable encoding this extra start level.
+ * However, since we always use concatenated pages for the first level
+ * lookup, we will never need this extra level and therefore do not need
+ * to touch SL2.
+ */
vtcr |= VTCR_EL2_LVLS_TO_SL0(lvls);
+#ifdef CONFIG_ARM64_HW_AFDBM
/*
* Enable the Hardware Access Flag management, unconditionally
- * on all CPUs. The features is RES0 on CPUs without the support
- * and must be ignored by the CPUs.
+ * on all CPUs. In systems that have asymmetric support for the feature
+ * this allows KVM to leverage hardware support on the subset of cores
+ * that implement the feature.
+ *
+ * The architecture requires VTCR_EL2.HA to be RES0 (thus ignored by
+ * hardware) on implementations that do not advertise support for the
+ * feature. As such, setting HA unconditionally is safe, unless you
+ * happen to be running on a design that has unadvertised support for
+ * HAFDBS. Here be dragons.
*/
- vtcr |= VTCR_EL2_HA;
+ if (!cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38))
+ vtcr |= VTCR_EL2_HA;
+#endif /* CONFIG_ARM64_HW_AFDBM */
+
+ if (kvm_lpa2_is_enabled())
+ vtcr |= VTCR_EL2_DS;
/* Set the vmid bits */
vtcr |= (get_vmid_bits(mmfr1) == 16) ?
@@ -601,26 +633,87 @@ u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift)
static bool stage2_has_fwb(struct kvm_pgtable *pgt)
{
- if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
+ if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB))
return false;
return !(pgt->flags & KVM_PGTABLE_S2_NOFWB);
}
+void kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu,
+ phys_addr_t addr, size_t size)
+{
+ unsigned long pages, inval_pages;
+
+ if (!system_supports_tlb_range()) {
+ kvm_call_hyp(__kvm_tlb_flush_vmid, mmu);
+ return;
+ }
+
+ pages = size >> PAGE_SHIFT;
+ while (pages > 0) {
+ inval_pages = min(pages, MAX_TLBI_RANGE_PAGES);
+ kvm_call_hyp(__kvm_tlb_flush_vmid_range, mmu, addr, inval_pages);
+
+ addr += inval_pages << PAGE_SHIFT;
+ pages -= inval_pages;
+ }
+}
+
#define KVM_S2_MEMATTR(pgt, attr) PAGE_S2_MEMATTR(attr, stage2_has_fwb(pgt))
+static int stage2_set_xn_attr(enum kvm_pgtable_prot prot, kvm_pte_t *attr)
+{
+ bool px, ux;
+ u8 xn;
+
+ px = prot & KVM_PGTABLE_PROT_PX;
+ ux = prot & KVM_PGTABLE_PROT_UX;
+
+ if (!cpus_have_final_cap(ARM64_HAS_XNX) && px != ux)
+ return -EINVAL;
+
+ if (px && ux)
+ xn = 0b00;
+ else if (!px && ux)
+ xn = 0b01;
+ else if (!px && !ux)
+ xn = 0b10;
+ else
+ xn = 0b11;
+
+ *attr &= ~KVM_PTE_LEAF_ATTR_HI_S2_XN;
+ *attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_HI_S2_XN, xn);
+ return 0;
+}
+
static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot prot,
kvm_pte_t *ptep)
{
- bool device = prot & KVM_PGTABLE_PROT_DEVICE;
- kvm_pte_t attr = device ? KVM_S2_MEMATTR(pgt, DEVICE_nGnRE) :
- KVM_S2_MEMATTR(pgt, NORMAL);
+ kvm_pte_t attr;
u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS;
+ int r;
- if (!(prot & KVM_PGTABLE_PROT_X))
- attr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;
- else if (device)
+ switch (prot & (KVM_PGTABLE_PROT_DEVICE |
+ KVM_PGTABLE_PROT_NORMAL_NC)) {
+ case KVM_PGTABLE_PROT_DEVICE | KVM_PGTABLE_PROT_NORMAL_NC:
return -EINVAL;
+ case KVM_PGTABLE_PROT_DEVICE:
+ if (prot & KVM_PGTABLE_PROT_X)
+ return -EINVAL;
+ attr = KVM_S2_MEMATTR(pgt, DEVICE_nGnRE);
+ break;
+ case KVM_PGTABLE_PROT_NORMAL_NC:
+ if (prot & KVM_PGTABLE_PROT_X)
+ return -EINVAL;
+ attr = KVM_S2_MEMATTR(pgt, NORMAL_NC);
+ break;
+ default:
+ attr = KVM_S2_MEMATTR(pgt, NORMAL);
+ }
+
+ r = stage2_set_xn_attr(prot, &attr);
+ if (r)
+ return r;
if (prot & KVM_PGTABLE_PROT_R)
attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;
@@ -628,7 +721,9 @@ static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot p
if (prot & KVM_PGTABLE_PROT_W)
attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;
- attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh);
+ if (!kvm_lpa2_is_enabled())
+ attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh);
+
attr |= KVM_PTE_LEAF_ATTR_LO_S2_AF;
attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW;
*ptep = attr;
@@ -647,8 +742,20 @@ enum kvm_pgtable_prot kvm_pgtable_stage2_pte_prot(kvm_pte_t pte)
prot |= KVM_PGTABLE_PROT_R;
if (pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W)
prot |= KVM_PGTABLE_PROT_W;
- if (!(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN))
- prot |= KVM_PGTABLE_PROT_X;
+
+ switch (FIELD_GET(KVM_PTE_LEAF_ATTR_HI_S2_XN, pte)) {
+ case 0b00:
+ prot |= KVM_PGTABLE_PROT_PX | KVM_PGTABLE_PROT_UX;
+ break;
+ case 0b01:
+ prot |= KVM_PGTABLE_PROT_UX;
+ break;
+ case 0b11:
+ prot |= KVM_PGTABLE_PROT_PX;
+ break;
+ default:
+ break;
+ }
return prot;
}
@@ -717,14 +824,21 @@ static bool stage2_try_break_pte(const struct kvm_pgtable_visit_ctx *ctx,
if (!stage2_try_set_pte(ctx, KVM_INVALID_PTE_LOCKED))
return false;
- /*
- * Perform the appropriate TLB invalidation based on the evicted pte
- * value (if any).
- */
- if (kvm_pte_table(ctx->old, ctx->level))
- kvm_call_hyp(__kvm_tlb_flush_vmid, mmu);
- else if (kvm_pte_valid(ctx->old))
- kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, ctx->level);
+ if (!kvm_pgtable_walk_skip_bbm_tlbi(ctx)) {
+ /*
+ * Perform the appropriate TLB invalidation based on the
+ * evicted pte value (if any).
+ */
+ if (kvm_pte_table(ctx->old, ctx->level)) {
+ u64 size = kvm_granule_size(ctx->level);
+ u64 addr = ALIGN_DOWN(ctx->addr, size);
+
+ kvm_tlb_flush_vmid_range(mmu, addr, size);
+ } else if (kvm_pte_valid(ctx->old)) {
+ kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu,
+ ctx->addr, ctx->level);
+ }
+ }
if (stage2_pte_is_counted(ctx->old))
mm_ops->put_page(ctx->ptep);
@@ -744,16 +858,40 @@ static void stage2_make_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t n
smp_store_release(ctx->ptep, new);
}
-static void stage2_put_pte(const struct kvm_pgtable_visit_ctx *ctx, struct kvm_s2_mmu *mmu,
- struct kvm_pgtable_mm_ops *mm_ops)
+static bool stage2_unmap_defer_tlb_flush(struct kvm_pgtable *pgt)
+{
+ /*
+ * If FEAT_TLBIRANGE is implemented, defer the individual
+ * TLB invalidations until the entire walk is finished, and
+ * then use the range-based TLBI instructions to do the
+ * invalidations. Condition deferred TLB invalidation on the
+ * system supporting FWB as the optimization is entirely
+ * pointless when the unmap walker needs to perform CMOs.
+ */
+ return system_supports_tlb_range() && stage2_has_fwb(pgt);
+}
+
+static void stage2_unmap_put_pte(const struct kvm_pgtable_visit_ctx *ctx,
+ struct kvm_s2_mmu *mmu,
+ struct kvm_pgtable_mm_ops *mm_ops)
{
+ struct kvm_pgtable *pgt = ctx->arg;
+
/*
- * Clear the existing PTE, and perform break-before-make with
- * TLB maintenance if it was valid.
+ * Clear the existing PTE, and perform break-before-make if it was
+ * valid. Depending on the system support, defer the TLB maintenance
+ * for the same until the entire unmap walk is completed.
*/
if (kvm_pte_valid(ctx->old)) {
kvm_clear_pte(ctx->ptep);
- kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, ctx->level);
+
+ if (kvm_pte_table(ctx->old, ctx->level)) {
+ kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr,
+ TLBI_TTL_UNKNOWN);
+ } else if (!stage2_unmap_defer_tlb_flush(pgt)) {
+ kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr,
+ ctx->level);
+ }
}
mm_ops->put_page(ctx->ptep);
@@ -762,35 +900,50 @@ static void stage2_put_pte(const struct kvm_pgtable_visit_ctx *ctx, struct kvm_s
static bool stage2_pte_cacheable(struct kvm_pgtable *pgt, kvm_pte_t pte)
{
u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR;
- return memattr == KVM_S2_MEMATTR(pgt, NORMAL);
+ return kvm_pte_valid(pte) && memattr == KVM_S2_MEMATTR(pgt, NORMAL);
}
static bool stage2_pte_executable(kvm_pte_t pte)
{
- return !(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN);
+ return kvm_pte_valid(pte) && !(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN);
+}
+
+static u64 stage2_map_walker_phys_addr(const struct kvm_pgtable_visit_ctx *ctx,
+ const struct stage2_map_data *data)
+{
+ u64 phys = data->phys;
+
+ /* Work out the correct PA based on how far the walk has gotten */
+ return phys + (ctx->addr - ctx->start);
}
static bool stage2_leaf_mapping_allowed(const struct kvm_pgtable_visit_ctx *ctx,
struct stage2_map_data *data)
{
- if (data->force_pte && (ctx->level < (KVM_PGTABLE_MAX_LEVELS - 1)))
+ u64 phys = stage2_map_walker_phys_addr(ctx, data);
+
+ if (data->force_pte && ctx->level < KVM_PGTABLE_LAST_LEVEL)
return false;
- return kvm_block_mapping_supported(ctx, data->phys);
+ if (data->annotation)
+ return true;
+
+ return kvm_block_mapping_supported(ctx, phys);
}
static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx,
struct stage2_map_data *data)
{
kvm_pte_t new;
- u64 granule = kvm_granule_size(ctx->level), phys = data->phys;
+ u64 phys = stage2_map_walker_phys_addr(ctx, data);
+ u64 granule = kvm_granule_size(ctx->level);
struct kvm_pgtable *pgt = data->mmu->pgt;
struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
if (!stage2_leaf_mapping_allowed(ctx, data))
return -E2BIG;
- if (kvm_phys_is_valid(phys))
+ if (!data->annotation)
new = kvm_init_valid_leaf_pte(phys, data->attr, ctx->level);
else
new = kvm_init_invalid_leaf_owner(data->owner_id);
@@ -804,21 +957,36 @@ static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx,
if (!stage2_pte_needs_update(ctx->old, new))
return -EAGAIN;
+ /* If we're only changing software bits, then store them and go! */
+ if (!kvm_pgtable_walk_shared(ctx) &&
+ !((ctx->old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW)) {
+ bool old_is_counted = stage2_pte_is_counted(ctx->old);
+
+ if (old_is_counted != stage2_pte_is_counted(new)) {
+ if (old_is_counted)
+ mm_ops->put_page(ctx->ptep);
+ else
+ mm_ops->get_page(ctx->ptep);
+ }
+ WARN_ON_ONCE(!stage2_try_set_pte(ctx, new));
+ return 0;
+ }
+
if (!stage2_try_break_pte(ctx, data->mmu))
return -EAGAIN;
/* Perform CMOs before installation of the guest stage-2 PTE */
- if (mm_ops->dcache_clean_inval_poc && stage2_pte_cacheable(pgt, new))
+ if (!kvm_pgtable_walk_skip_cmo(ctx) && mm_ops->dcache_clean_inval_poc &&
+ stage2_pte_cacheable(pgt, new))
mm_ops->dcache_clean_inval_poc(kvm_pte_follow(new, mm_ops),
- granule);
+ granule);
- if (mm_ops->icache_inval_pou && stage2_pte_executable(new))
+ if (!kvm_pgtable_walk_skip_cmo(ctx) && mm_ops->icache_inval_pou &&
+ stage2_pte_executable(new))
mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), granule);
stage2_make_pte(ctx, new);
- if (kvm_phys_is_valid(phys))
- data->phys += granule;
return 0;
}
@@ -836,7 +1004,7 @@ static int stage2_map_walk_table_pre(const struct kvm_pgtable_visit_ctx *ctx,
if (ret)
return ret;
- mm_ops->free_removed_table(childp, ctx->level);
+ mm_ops->free_unlinked_table(childp, ctx->level);
return 0;
}
@@ -851,7 +1019,7 @@ static int stage2_map_walk_leaf(const struct kvm_pgtable_visit_ctx *ctx,
if (ret != -E2BIG)
return ret;
- if (WARN_ON(ctx->level == KVM_PGTABLE_MAX_LEVELS - 1))
+ if (WARN_ON(ctx->level == KVM_PGTABLE_LAST_LEVEL))
return -EINVAL;
if (!data->memcache)
@@ -881,7 +1049,7 @@ static int stage2_map_walk_leaf(const struct kvm_pgtable_visit_ctx *ctx,
* The TABLE_PRE callback runs for table entries on the way down, looking
* for table entries which we could conceivably replace with a block entry
* for this mapping. If it finds one it replaces the entry and calls
- * kvm_pgtable_mm_ops::free_removed_table() to tear down the detached table.
+ * kvm_pgtable_mm_ops::free_unlinked_table() to tear down the detached table.
*
* Otherwise, the LEAF callback performs the mapping at the existing leaves
* instead.
@@ -937,11 +1105,11 @@ int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size,
{
int ret;
struct stage2_map_data map_data = {
- .phys = KVM_PHYS_INVALID,
.mmu = pgt->mmu,
.memcache = mc,
.owner_id = owner_id,
.force_pte = true,
+ .annotation = true,
};
struct kvm_pgtable_walker walker = {
.cb = stage2_map_walker,
@@ -988,7 +1156,7 @@ static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx,
* block entry and rely on the remaining portions being faulted
* back lazily.
*/
- stage2_put_pte(ctx, mmu, mm_ops);
+ stage2_unmap_put_pte(ctx, mmu, mm_ops);
if (need_flush && mm_ops->dcache_clean_inval_poc)
mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops),
@@ -1002,20 +1170,26 @@ static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx,
int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
{
+ int ret;
struct kvm_pgtable_walker walker = {
.cb = stage2_unmap_walker,
.arg = pgt,
.flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
};
- return kvm_pgtable_walk(pgt, addr, size, &walker);
+ ret = kvm_pgtable_walk(pgt, addr, size, &walker);
+ if (stage2_unmap_defer_tlb_flush(pgt))
+ /* Perform the deferred TLB invalidations */
+ kvm_tlb_flush_vmid_range(pgt->mmu, addr, size);
+
+ return ret;
}
struct stage2_attr_data {
kvm_pte_t attr_set;
kvm_pte_t attr_clr;
kvm_pte_t pte;
- u32 level;
+ s8 level;
};
static int stage2_attr_walker(const struct kvm_pgtable_visit_ctx *ctx,
@@ -1026,7 +1200,7 @@ static int stage2_attr_walker(const struct kvm_pgtable_visit_ctx *ctx,
struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
if (!kvm_pte_valid(ctx->old))
- return 0;
+ return -EAGAIN;
data->level = ctx->level;
data->pte = pte;
@@ -1058,7 +1232,7 @@ static int stage2_attr_walker(const struct kvm_pgtable_visit_ctx *ctx,
static int stage2_update_leaf_attrs(struct kvm_pgtable *pgt, u64 addr,
u64 size, kvm_pte_t attr_set,
kvm_pte_t attr_clr, kvm_pte_t *orig_pte,
- u32 *level, enum kvm_pgtable_walk_flags flags)
+ s8 *level, enum kvm_pgtable_walk_flags flags)
{
int ret;
kvm_pte_t attr_mask = KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI;
@@ -1091,42 +1265,73 @@ int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size)
NULL, NULL, 0);
}
-kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr)
+void kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr,
+ enum kvm_pgtable_walk_flags flags)
{
- kvm_pte_t pte = 0;
- stage2_update_leaf_attrs(pgt, addr, 1, KVM_PTE_LEAF_ATTR_LO_S2_AF, 0,
- &pte, NULL, 0);
- dsb(ishst);
- return pte;
+ int ret;
+
+ ret = stage2_update_leaf_attrs(pgt, addr, 1, KVM_PTE_LEAF_ATTR_LO_S2_AF, 0,
+ NULL, NULL, flags);
+ if (!ret)
+ dsb(ishst);
}
-kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr)
+struct stage2_age_data {
+ bool mkold;
+ bool young;
+};
+
+static int stage2_age_walker(const struct kvm_pgtable_visit_ctx *ctx,
+ enum kvm_pgtable_walk_flags visit)
{
- kvm_pte_t pte = 0;
- stage2_update_leaf_attrs(pgt, addr, 1, 0, KVM_PTE_LEAF_ATTR_LO_S2_AF,
- &pte, NULL, 0);
+ kvm_pte_t new = ctx->old & ~KVM_PTE_LEAF_ATTR_LO_S2_AF;
+ struct stage2_age_data *data = ctx->arg;
+
+ if (!kvm_pte_valid(ctx->old) || new == ctx->old)
+ return 0;
+
+ data->young = true;
+
+ /*
+ * stage2_age_walker() is always called while holding the MMU lock for
+ * write, so this will always succeed. Nonetheless, this deliberately
+ * follows the race detection pattern of the other stage-2 walkers in
+ * case the locking mechanics of the MMU notifiers is ever changed.
+ */
+ if (data->mkold && !stage2_try_set_pte(ctx, new))
+ return -EAGAIN;
+
/*
* "But where's the TLBI?!", you scream.
* "Over in the core code", I sigh.
*
* See the '->clear_flush_young()' callback on the KVM mmu notifier.
*/
- return pte;
+ return 0;
}
-bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr)
+bool kvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr,
+ u64 size, bool mkold)
{
- kvm_pte_t pte = 0;
- stage2_update_leaf_attrs(pgt, addr, 1, 0, 0, &pte, NULL, 0);
- return pte & KVM_PTE_LEAF_ATTR_LO_S2_AF;
+ struct stage2_age_data data = {
+ .mkold = mkold,
+ };
+ struct kvm_pgtable_walker walker = {
+ .cb = stage2_age_walker,
+ .arg = &data,
+ .flags = KVM_PGTABLE_WALK_LEAF,
+ };
+
+ WARN_ON(kvm_pgtable_walk(pgt, addr, size, &walker));
+ return data.young;
}
int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
- enum kvm_pgtable_prot prot)
+ enum kvm_pgtable_prot prot, enum kvm_pgtable_walk_flags flags)
{
+ kvm_pte_t xn = 0, set = 0, clr = 0;
+ s8 level;
int ret;
- u32 level;
- kvm_pte_t set = 0, clr = 0;
if (prot & KVM_PTE_LEAF_ATTR_HI_SW)
return -EINVAL;
@@ -1137,13 +1342,16 @@ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
if (prot & KVM_PGTABLE_PROT_W)
set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;
- if (prot & KVM_PGTABLE_PROT_X)
- clr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;
+ ret = stage2_set_xn_attr(prot, &xn);
+ if (ret)
+ return ret;
- ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level,
- KVM_PGTABLE_WALK_SHARED);
- if (!ret)
- kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, pgt->mmu, addr, level);
+ set |= xn & KVM_PTE_LEAF_ATTR_HI_S2_XN;
+ clr |= ~xn & KVM_PTE_LEAF_ATTR_HI_S2_XN;
+
+ ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level, flags);
+ if (!ret || ret == -EAGAIN)
+ kvm_call_hyp(__kvm_tlb_flush_vmid_ipa_nsh, pgt->mmu, addr, level);
return ret;
}
@@ -1153,7 +1361,7 @@ static int stage2_flush_walker(const struct kvm_pgtable_visit_ctx *ctx,
struct kvm_pgtable *pgt = ctx->arg;
struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops;
- if (!kvm_pte_valid(ctx->old) || !stage2_pte_cacheable(pgt, ctx->old))
+ if (!stage2_pte_cacheable(pgt, ctx->old))
return 0;
if (mm_ops->dcache_clean_inval_poc)
@@ -1176,6 +1384,162 @@ int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size)
return kvm_pgtable_walk(pgt, addr, size, &walker);
}
+kvm_pte_t *kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt,
+ u64 phys, s8 level,
+ enum kvm_pgtable_prot prot,
+ void *mc, bool force_pte)
+{
+ struct stage2_map_data map_data = {
+ .phys = phys,
+ .mmu = pgt->mmu,
+ .memcache = mc,
+ .force_pte = force_pte,
+ };
+ struct kvm_pgtable_walker walker = {
+ .cb = stage2_map_walker,
+ .flags = KVM_PGTABLE_WALK_LEAF |
+ KVM_PGTABLE_WALK_SKIP_BBM_TLBI |
+ KVM_PGTABLE_WALK_SKIP_CMO,
+ .arg = &map_data,
+ };
+ /*
+ * The input address (.addr) is irrelevant for walking an
+ * unlinked table. Construct an ambiguous IA range to map
+ * kvm_granule_size(level) worth of memory.
+ */
+ struct kvm_pgtable_walk_data data = {
+ .walker = &walker,
+ .addr = 0,
+ .end = kvm_granule_size(level),
+ };
+ struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops;
+ kvm_pte_t *pgtable;
+ int ret;
+
+ if (!IS_ALIGNED(phys, kvm_granule_size(level)))
+ return ERR_PTR(-EINVAL);
+
+ ret = stage2_set_prot_attr(pgt, prot, &map_data.attr);
+ if (ret)
+ return ERR_PTR(ret);
+
+ pgtable = mm_ops->zalloc_page(mc);
+ if (!pgtable)
+ return ERR_PTR(-ENOMEM);
+
+ ret = __kvm_pgtable_walk(&data, mm_ops, (kvm_pteref_t)pgtable,
+ level + 1);
+ if (ret) {
+ kvm_pgtable_stage2_free_unlinked(mm_ops, pgtable, level);
+ return ERR_PTR(ret);
+ }
+
+ return pgtable;
+}
+
+/*
+ * Get the number of page-tables needed to replace a block with a
+ * fully populated tree up to the PTE entries. Note that @level is
+ * interpreted as in "level @level entry".
+ */
+static int stage2_block_get_nr_page_tables(s8 level)
+{
+ switch (level) {
+ case 1:
+ return PTRS_PER_PTE + 1;
+ case 2:
+ return 1;
+ case 3:
+ return 0;
+ default:
+ WARN_ON_ONCE(level < KVM_PGTABLE_MIN_BLOCK_LEVEL ||
+ level > KVM_PGTABLE_LAST_LEVEL);
+ return -EINVAL;
+ };
+}
+
+static int stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx,
+ enum kvm_pgtable_walk_flags visit)
+{
+ struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
+ struct kvm_mmu_memory_cache *mc = ctx->arg;
+ struct kvm_s2_mmu *mmu;
+ kvm_pte_t pte = ctx->old, new, *childp;
+ enum kvm_pgtable_prot prot;
+ s8 level = ctx->level;
+ bool force_pte;
+ int nr_pages;
+ u64 phys;
+
+ /* No huge-pages exist at the last level */
+ if (level == KVM_PGTABLE_LAST_LEVEL)
+ return 0;
+
+ /* We only split valid block mappings */
+ if (!kvm_pte_valid(pte))
+ return 0;
+
+ nr_pages = stage2_block_get_nr_page_tables(level);
+ if (nr_pages < 0)
+ return nr_pages;
+
+ if (mc->nobjs >= nr_pages) {
+ /* Build a tree mapped down to the PTE granularity. */
+ force_pte = true;
+ } else {
+ /*
+ * Don't force PTEs, so create_unlinked() below does
+ * not populate the tree up to the PTE level. The
+ * consequence is that the call will require a single
+ * page of level 2 entries at level 1, or a single
+ * page of PTEs at level 2. If we are at level 1, the
+ * PTEs will be created recursively.
+ */
+ force_pte = false;
+ nr_pages = 1;
+ }
+
+ if (mc->nobjs < nr_pages)
+ return -ENOMEM;
+
+ mmu = container_of(mc, struct kvm_s2_mmu, split_page_cache);
+ phys = kvm_pte_to_phys(pte);
+ prot = kvm_pgtable_stage2_pte_prot(pte);
+
+ childp = kvm_pgtable_stage2_create_unlinked(mmu->pgt, phys,
+ level, prot, mc, force_pte);
+ if (IS_ERR(childp))
+ return PTR_ERR(childp);
+
+ if (!stage2_try_break_pte(ctx, mmu)) {
+ kvm_pgtable_stage2_free_unlinked(mm_ops, childp, level);
+ return -EAGAIN;
+ }
+
+ /*
+ * Note, the contents of the page table are guaranteed to be made
+ * visible before the new PTE is assigned because stage2_make_pte()
+ * writes the PTE using smp_store_release().
+ */
+ new = kvm_init_table_pte(childp, mm_ops);
+ stage2_make_pte(ctx, new);
+ return 0;
+}
+
+int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
+ struct kvm_mmu_memory_cache *mc)
+{
+ struct kvm_pgtable_walker walker = {
+ .cb = stage2_split_walker,
+ .flags = KVM_PGTABLE_WALK_LEAF,
+ .arg = mc,
+ };
+ int ret;
+
+ ret = kvm_pgtable_walk(pgt, addr, size, &walker);
+ dsb(ishst);
+ return ret;
+}
int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
struct kvm_pgtable_mm_ops *mm_ops,
@@ -1183,10 +1547,10 @@ int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
kvm_pgtable_force_pte_cb_t force_pte_cb)
{
size_t pgd_sz;
- u64 vtcr = mmu->arch->vtcr;
+ u64 vtcr = mmu->vtcr;
u32 ia_bits = VTCR_EL2_IPA(vtcr);
u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr);
- u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0;
+ s8 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0;
pgd_sz = kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE;
pgt->pgd = (kvm_pteref_t)mm_ops->zalloc_pages_exact(pgd_sz);
@@ -1209,43 +1573,86 @@ size_t kvm_pgtable_stage2_pgd_size(u64 vtcr)
{
u32 ia_bits = VTCR_EL2_IPA(vtcr);
u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr);
- u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0;
+ s8 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0;
return kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE;
}
-static int stage2_free_walker(const struct kvm_pgtable_visit_ctx *ctx,
- enum kvm_pgtable_walk_flags visit)
+static int stage2_free_leaf(const struct kvm_pgtable_visit_ctx *ctx)
{
struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
- if (!stage2_pte_is_counted(ctx->old))
+ mm_ops->put_page(ctx->ptep);
+ return 0;
+}
+
+static int stage2_free_table_post(const struct kvm_pgtable_visit_ctx *ctx)
+{
+ struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
+ kvm_pte_t *childp = kvm_pte_follow(ctx->old, mm_ops);
+
+ if (mm_ops->page_count(childp) != 1)
return 0;
+ /*
+ * Drop references and clear the now stale PTE to avoid rewalking the
+ * freed page table.
+ */
mm_ops->put_page(ctx->ptep);
+ mm_ops->put_page(childp);
+ kvm_clear_pte(ctx->ptep);
+ return 0;
+}
- if (kvm_pte_table(ctx->old, ctx->level))
- mm_ops->put_page(kvm_pte_follow(ctx->old, mm_ops));
+static int stage2_free_walker(const struct kvm_pgtable_visit_ctx *ctx,
+ enum kvm_pgtable_walk_flags visit)
+{
+ if (!stage2_pte_is_counted(ctx->old))
+ return 0;
- return 0;
+ switch (visit) {
+ case KVM_PGTABLE_WALK_LEAF:
+ return stage2_free_leaf(ctx);
+ case KVM_PGTABLE_WALK_TABLE_POST:
+ return stage2_free_table_post(ctx);
+ default:
+ return -EINVAL;
+ }
}
-void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
+void kvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt,
+ u64 addr, u64 size)
{
- size_t pgd_sz;
struct kvm_pgtable_walker walker = {
.cb = stage2_free_walker,
.flags = KVM_PGTABLE_WALK_LEAF |
KVM_PGTABLE_WALK_TABLE_POST,
};
- WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
+ WARN_ON(kvm_pgtable_walk(pgt, addr, size, &walker));
+}
+
+void kvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt)
+{
+ size_t pgd_sz;
+
pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE;
- pgt->mm_ops->free_pages_exact(kvm_dereference_pteref(&walker, pgt->pgd), pgd_sz);
+
+ /*
+ * Since the pgtable is unlinked at this point, and not shared with
+ * other walkers, safely deference pgd with kvm_dereference_pteref_raw()
+ */
+ pgt->mm_ops->free_pages_exact(kvm_dereference_pteref_raw(pgt->pgd), pgd_sz);
pgt->pgd = NULL;
}
-void kvm_pgtable_stage2_free_removed(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level)
+void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
+{
+ kvm_pgtable_stage2_destroy_range(pgt, 0, BIT(pgt->ia_bits));
+ kvm_pgtable_stage2_destroy_pgd(pgt);
+}
+
+void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, s8 level)
{
kvm_pteref_t ptep = (kvm_pteref_t)pgtable;
struct kvm_pgtable_walker walker = {
@@ -1266,4 +1673,7 @@ void kvm_pgtable_stage2_free_removed(struct kvm_pgtable_mm_ops *mm_ops, void *pg
};
WARN_ON(__kvm_pgtable_walk(&data, mm_ops, ptep, level + 1));
+
+ WARN_ON(mm_ops->page_count(pgtable) != 1);
+ mm_ops->put_page(pgtable);
}
diff --git a/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c b/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c
index 87a54375bd6e..5fd99763b54d 100644
--- a/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c
+++ b/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c
@@ -20,7 +20,7 @@ static bool __is_be(struct kvm_vcpu *vcpu)
if (vcpu_mode_is_32bit(vcpu))
return !!(read_sysreg_el2(SYS_SPSR) & PSR_AA32_E_BIT);
- return !!(read_sysreg(SCTLR_EL1) & SCTLR_ELx_EE);
+ return !!(read_sysreg_el1(SYS_SCTLR) & SCTLR_ELx_EE);
}
/*
@@ -63,6 +63,10 @@ int __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu)
return -1;
}
+ /* Handle deactivation as a normal exit */
+ if ((fault_ipa - vgic->vgic_cpu_base) >= GIC_CPU_DEACTIVATE)
+ return 0;
+
rd = kvm_vcpu_dabt_get_rd(vcpu);
addr = kvm_vgic_global_state.vcpu_hyp_va;
addr += fault_ipa - vgic->vgic_cpu_base;
diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c
index 6cb638b184b1..0b670a033fd8 100644
--- a/arch/arm64/kvm/hyp/vgic-v3-sr.c
+++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c
@@ -14,11 +14,13 @@
#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>
+#include "../../vgic/vgic.h"
+
#define vtr_to_max_lr_idx(v) ((v) & 0xf)
#define vtr_to_nr_pre_bits(v) ((((u32)(v) >> 26) & 7) + 1)
#define vtr_to_nr_apr_regs(v) (1 << (vtr_to_nr_pre_bits(v) - 5))
-static u64 __gic_v3_get_lr(unsigned int lr)
+u64 __gic_v3_get_lr(unsigned int lr)
{
switch (lr & 0xf) {
case 0:
@@ -58,7 +60,7 @@ static u64 __gic_v3_get_lr(unsigned int lr)
unreachable();
}
-static void __gic_v3_set_lr(u64 val, int lr)
+void __gic_v3_set_lr(u64 val, int lr)
{
switch (lr & 0xf) {
case 0:
@@ -196,6 +198,11 @@ static u32 __vgic_v3_read_ap1rn(int n)
return val;
}
+static u64 compute_ich_hcr(struct vgic_v3_cpu_if *cpu_if)
+{
+ return cpu_if->vgic_hcr | vgic_ich_hcr_trap_bits();
+}
+
void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if)
{
u64 used_lrs = cpu_if->used_lrs;
@@ -212,14 +219,12 @@ void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if)
}
}
- if (used_lrs || cpu_if->its_vpe.its_vm) {
+ if (used_lrs) {
int i;
u32 elrsr;
elrsr = read_gicreg(ICH_ELRSR_EL2);
- write_gicreg(cpu_if->vgic_hcr & ~ICH_HCR_EN, ICH_HCR_EL2);
-
for (i = 0; i < used_lrs; i++) {
if (elrsr & (1 << i))
cpu_if->vgic_lr[i] &= ~ICH_LR_STATE;
@@ -229,6 +234,23 @@ void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if)
__gic_v3_set_lr(0, i);
}
}
+
+ cpu_if->vgic_vmcr = read_gicreg(ICH_VMCR_EL2);
+
+ if (cpu_if->vgic_hcr & ICH_HCR_EL2_LRENPIE) {
+ u64 val = read_gicreg(ICH_HCR_EL2);
+ cpu_if->vgic_hcr &= ~ICH_HCR_EL2_EOIcount;
+ cpu_if->vgic_hcr |= val & ICH_HCR_EL2_EOIcount;
+ }
+
+ write_gicreg(0, ICH_HCR_EL2);
+
+ /*
+ * Hack alert: On NV, this results in a trap so that the above write
+ * actually takes effect... No synchronisation is necessary, as we
+ * only care about the effects when this traps.
+ */
+ read_gicreg(ICH_MISR_EL2);
}
void __vgic_v3_restore_state(struct vgic_v3_cpu_if *cpu_if)
@@ -236,12 +258,10 @@ void __vgic_v3_restore_state(struct vgic_v3_cpu_if *cpu_if)
u64 used_lrs = cpu_if->used_lrs;
int i;
- if (used_lrs || cpu_if->its_vpe.its_vm) {
- write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2);
+ write_gicreg(compute_ich_hcr(cpu_if), ICH_HCR_EL2);
- for (i = 0; i < used_lrs; i++)
- __gic_v3_set_lr(cpu_if->vgic_lr[i], i);
- }
+ for (i = 0; i < used_lrs; i++)
+ __gic_v3_set_lr(cpu_if->vgic_lr[i], i);
/*
* Ensure that writes to the LRs, and on non-VHE systems ensure that
@@ -268,8 +288,16 @@ void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if)
* starting to mess with the rest of the GIC, and VMCR_EL2 in
* particular. This logic must be called before
* __vgic_v3_restore_state().
+ *
+ * However, if the vgic is disabled (ICH_HCR_EL2.EN==0), no GIC is
+ * provisioned at all. In order to prevent illegal accesses to the
+ * system registers to trap to EL1 (duh), force ICC_SRE_EL1.SRE to 1
+ * so that the trap bits can take effect. Yes, we *loves* the GIC.
*/
- if (!cpu_if->vgic_sre) {
+ if (!(cpu_if->vgic_hcr & ICH_HCR_EL2_En)) {
+ write_gicreg(ICC_SRE_EL1_SRE, ICC_SRE_EL1);
+ isb();
+ } else if (!cpu_if->vgic_sre) {
write_gicreg(0, ICC_SRE_EL1);
isb();
write_gicreg(cpu_if->vgic_vmcr, ICH_VMCR_EL2);
@@ -287,38 +315,42 @@ void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if)
}
}
- /*
- * Prevent the guest from touching the GIC system registers if
- * SRE isn't enabled for GICv3 emulation.
- */
- write_gicreg(read_gicreg(ICC_SRE_EL2) & ~ICC_SRE_EL2_ENABLE,
- ICC_SRE_EL2);
+ /* Only disable SRE if the host implements the GICv2 interface */
+ if (static_branch_unlikely(&vgic_v3_has_v2_compat)) {
+ /*
+ * Prevent the guest from touching the ICC_SRE_EL1 system
+ * register. Note that this may not have any effect, as
+ * ICC_SRE_EL2.Enable being RAO/WI is a valid implementation.
+ */
+ write_gicreg(read_gicreg(ICC_SRE_EL2) & ~ICC_SRE_EL2_ENABLE,
+ ICC_SRE_EL2);
+ }
/*
- * If we need to trap system registers, we must write
- * ICH_HCR_EL2 anyway, even if no interrupts are being
- * injected,
+ * If we need to trap system registers, we must write ICH_HCR_EL2
+ * anyway, even if no interrupts are being injected. Note that this
+ * also applies if we don't expect any system register access (no
+ * vgic at all). In any case, no need to provide MI configuration.
*/
if (static_branch_unlikely(&vgic_v3_cpuif_trap) ||
- cpu_if->its_vpe.its_vm)
- write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2);
+ cpu_if->its_vpe.its_vm || !cpu_if->vgic_sre)
+ write_gicreg(vgic_ich_hcr_trap_bits() | ICH_HCR_EL2_En, ICH_HCR_EL2);
}
void __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if)
{
u64 val;
- if (!cpu_if->vgic_sre) {
- cpu_if->vgic_vmcr = read_gicreg(ICH_VMCR_EL2);
- }
-
- val = read_gicreg(ICC_SRE_EL2);
- write_gicreg(val | ICC_SRE_EL2_ENABLE, ICC_SRE_EL2);
+ /* Only restore SRE if the host implements the GICv2 interface */
+ if (static_branch_unlikely(&vgic_v3_has_v2_compat)) {
+ val = read_gicreg(ICC_SRE_EL2);
+ write_gicreg(val | ICC_SRE_EL2_ENABLE, ICC_SRE_EL2);
- if (!cpu_if->vgic_sre) {
- /* Make sure ENABLE is set at EL2 before setting SRE at EL1 */
- isb();
- write_gicreg(1, ICC_SRE_EL1);
+ if (!cpu_if->vgic_sre) {
+ /* Make sure ENABLE is set at EL2 before setting SRE at EL1 */
+ isb();
+ write_gicreg(1, ICC_SRE_EL1);
+ }
}
/*
@@ -326,7 +358,7 @@ void __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if)
* no interrupts were being injected, and we disable it again here.
*/
if (static_branch_unlikely(&vgic_v3_cpuif_trap) ||
- cpu_if->its_vpe.its_vm)
+ cpu_if->its_vpe.its_vm || !cpu_if->vgic_sre)
write_gicreg(0, ICH_HCR_EL2);
}
@@ -363,7 +395,7 @@ void __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if)
}
}
-void __vgic_v3_restore_aprs(struct vgic_v3_cpu_if *cpu_if)
+static void __vgic_v3_restore_aprs(struct vgic_v3_cpu_if *cpu_if)
{
u64 val;
u32 nr_pre_bits;
@@ -413,29 +445,43 @@ void __vgic_v3_init_lrs(void)
*/
u64 __vgic_v3_get_gic_config(void)
{
- u64 val, sre = read_gicreg(ICC_SRE_EL1);
+ u64 val, sre;
unsigned long flags = 0;
/*
- * To check whether we have a MMIO-based (GICv2 compatible)
- * CPU interface, we need to disable the system register
- * view. To do that safely, we have to prevent any interrupt
- * from firing (which would be deadly).
- *
- * Note that this only makes sense on VHE, as interrupts are
- * already masked for nVHE as part of the exception entry to
- * EL2.
+ * In compat mode, we cannot access ICC_SRE_EL1 at any EL
+ * other than EL1 itself; just return the
+ * ICH_VTR_EL2. ICC_IDR0_EL1 is only implemented on a GICv5
+ * system, so we first check if we have GICv5 support.
*/
- if (has_vhe())
- flags = local_daif_save();
+ if (cpus_have_final_cap(ARM64_HAS_GICV5_CPUIF))
+ return read_gicreg(ICH_VTR_EL2);
+ sre = read_gicreg(ICC_SRE_EL1);
/*
+ * To check whether we have a MMIO-based (GICv2 compatible)
+ * CPU interface, we need to disable the system register
+ * view.
+ *
* Table 11-2 "Permitted ICC_SRE_ELx.SRE settings" indicates
* that to be able to set ICC_SRE_EL1.SRE to 0, all the
* interrupt overrides must be set. You've got to love this.
+ *
+ * As we always run VHE with HCR_xMO set, no extra xMO
+ * manipulation is required in that case.
+ *
+ * To safely disable SRE, we have to prevent any interrupt
+ * from firing (which would be deadly). This only makes sense
+ * on VHE, as interrupts are already masked for nVHE as part
+ * of the exception entry to EL2.
*/
- sysreg_clear_set(hcr_el2, 0, HCR_AMO | HCR_FMO | HCR_IMO);
- isb();
+ if (has_vhe()) {
+ flags = local_daif_save();
+ } else {
+ sysreg_clear_set_hcr(0, HCR_AMO | HCR_FMO | HCR_IMO);
+ isb();
+ }
+
write_gicreg(0, ICC_SRE_EL1);
isb();
@@ -443,11 +489,13 @@ u64 __vgic_v3_get_gic_config(void)
write_gicreg(sre, ICC_SRE_EL1);
isb();
- sysreg_clear_set(hcr_el2, HCR_AMO | HCR_FMO | HCR_IMO, 0);
- isb();
- if (has_vhe())
+ if (has_vhe()) {
local_daif_restore(flags);
+ } else {
+ sysreg_clear_set_hcr(HCR_AMO | HCR_FMO | HCR_IMO, 0);
+ isb();
+ }
val = (val & ICC_SRE_EL1_SRE) ? 0 : (1ULL << 63);
val |= read_gicreg(ICH_VTR_EL2);
@@ -455,16 +503,40 @@ u64 __vgic_v3_get_gic_config(void)
return val;
}
-u64 __vgic_v3_read_vmcr(void)
+static void __vgic_v3_compat_mode_enable(void)
+{
+ if (!cpus_have_final_cap(ARM64_HAS_GICV5_CPUIF))
+ return;
+
+ sysreg_clear_set_s(SYS_ICH_VCTLR_EL2, 0, ICH_VCTLR_EL2_V3);
+ /* Wait for V3 to become enabled */
+ isb();
+}
+
+static u64 __vgic_v3_read_vmcr(void)
{
return read_gicreg(ICH_VMCR_EL2);
}
-void __vgic_v3_write_vmcr(u32 vmcr)
+static void __vgic_v3_write_vmcr(u32 vmcr)
{
write_gicreg(vmcr, ICH_VMCR_EL2);
}
+void __vgic_v3_restore_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if)
+{
+ __vgic_v3_compat_mode_enable();
+
+ /*
+ * If dealing with a GICv2 emulation on GICv3, VMCR_EL2.VFIQen
+ * is dependent on ICC_SRE_EL1.SRE, and we have to perform the
+ * VMCR_EL2 save/restore in the world switch.
+ */
+ if (cpu_if->vgic_sre)
+ __vgic_v3_write_vmcr(cpu_if->vgic_vmcr);
+ __vgic_v3_restore_aprs(cpu_if);
+}
+
static int __vgic_v3_bpr_min(void)
{
/* See Pseudocode for VPriorityGroup */
@@ -723,11 +795,11 @@ static void __vgic_v3_bump_eoicount(void)
u32 hcr;
hcr = read_gicreg(ICH_HCR_EL2);
- hcr += 1 << ICH_HCR_EOIcount_SHIFT;
+ hcr += 1 << ICH_HCR_EL2_EOIcount_SHIFT;
write_gicreg(hcr, ICH_HCR_EL2);
}
-static void __vgic_v3_write_dir(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
+static int ___vgic_v3_write_dir(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
{
u32 vid = vcpu_get_reg(vcpu, rt);
u64 lr_val;
@@ -735,19 +807,25 @@ static void __vgic_v3_write_dir(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
/* EOImode == 0, nothing to be done here */
if (!(vmcr & ICH_VMCR_EOIM_MASK))
- return;
+ return 1;
/* No deactivate to be performed on an LPI */
if (vid >= VGIC_MIN_LPI)
- return;
+ return 1;
lr = __vgic_v3_find_active_lr(vcpu, vid, &lr_val);
- if (lr == -1) {
- __vgic_v3_bump_eoicount();
- return;
+ if (lr != -1) {
+ __vgic_v3_clear_active_lr(lr, lr_val);
+ return 1;
}
- __vgic_v3_clear_active_lr(lr, lr_val);
+ return 0;
+}
+
+static void __vgic_v3_write_dir(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
+{
+ if (!___vgic_v3_write_dir(vcpu, vmcr, rt))
+ __vgic_v3_bump_eoicount();
}
static void __vgic_v3_write_eoir(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
@@ -983,9 +1061,6 @@ static void __vgic_v3_read_ctlr(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
val = ((vtr >> 29) & 7) << ICC_CTLR_EL1_PRI_BITS_SHIFT;
/* IDbits */
val |= ((vtr >> 23) & 7) << ICC_CTLR_EL1_ID_BITS_SHIFT;
- /* SEIS */
- if (kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_SEIS_MASK)
- val |= BIT(ICC_CTLR_EL1_SEIS_SHIFT);
/* A3V */
val |= ((vtr >> 21) & 1) << ICC_CTLR_EL1_A3V_SHIFT;
/* EOImode */
@@ -1013,6 +1088,75 @@ static void __vgic_v3_write_ctlr(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
write_gicreg(vmcr, ICH_VMCR_EL2);
}
+static bool __vgic_v3_check_trap_forwarding(struct kvm_vcpu *vcpu,
+ u32 sysreg, bool is_read)
+{
+ u64 ich_hcr;
+
+ if (!is_nested_ctxt(vcpu))
+ return false;
+
+ ich_hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2);
+
+ switch (sysreg) {
+ case SYS_ICC_IGRPEN0_EL1:
+ if (is_read &&
+ (__vcpu_sys_reg(vcpu, HFGRTR_EL2) & HFGRTR_EL2_ICC_IGRPENn_EL1))
+ return true;
+
+ if (!is_read &&
+ (__vcpu_sys_reg(vcpu, HFGWTR_EL2) & HFGWTR_EL2_ICC_IGRPENn_EL1))
+ return true;
+
+ fallthrough;
+
+ case SYS_ICC_AP0Rn_EL1(0):
+ case SYS_ICC_AP0Rn_EL1(1):
+ case SYS_ICC_AP0Rn_EL1(2):
+ case SYS_ICC_AP0Rn_EL1(3):
+ case SYS_ICC_BPR0_EL1:
+ case SYS_ICC_EOIR0_EL1:
+ case SYS_ICC_HPPIR0_EL1:
+ case SYS_ICC_IAR0_EL1:
+ return ich_hcr & ICH_HCR_EL2_TALL0;
+
+ case SYS_ICC_IGRPEN1_EL1:
+ if (is_read &&
+ (__vcpu_sys_reg(vcpu, HFGRTR_EL2) & HFGRTR_EL2_ICC_IGRPENn_EL1))
+ return true;
+
+ if (!is_read &&
+ (__vcpu_sys_reg(vcpu, HFGWTR_EL2) & HFGWTR_EL2_ICC_IGRPENn_EL1))
+ return true;
+
+ fallthrough;
+
+ case SYS_ICC_AP1Rn_EL1(0):
+ case SYS_ICC_AP1Rn_EL1(1):
+ case SYS_ICC_AP1Rn_EL1(2):
+ case SYS_ICC_AP1Rn_EL1(3):
+ case SYS_ICC_BPR1_EL1:
+ case SYS_ICC_EOIR1_EL1:
+ case SYS_ICC_HPPIR1_EL1:
+ case SYS_ICC_IAR1_EL1:
+ return ich_hcr & ICH_HCR_EL2_TALL1;
+
+ case SYS_ICC_DIR_EL1:
+ if (ich_hcr & ICH_HCR_EL2_TDIR)
+ return true;
+
+ fallthrough;
+
+ case SYS_ICC_RPR_EL1:
+ case SYS_ICC_CTLR_EL1:
+ case SYS_ICC_PMR_EL1:
+ return ich_hcr & ICH_HCR_EL2_TC;
+
+ default:
+ return false;
+ }
+}
+
int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu)
{
int rt;
@@ -1022,6 +1166,9 @@ int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu)
bool is_read;
u32 sysreg;
+ if (kern_hyp_va(vcpu->kvm)->arch.vgic.vgic_model != KVM_DEV_TYPE_ARM_VGIC_V3)
+ return 0;
+
esr = kvm_vcpu_get_esr(vcpu);
if (vcpu_mode_is_32bit(vcpu)) {
if (!kvm_condition_valid(vcpu)) {
@@ -1036,6 +1183,9 @@ int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu)
is_read = (esr & ESR_ELx_SYS64_ISS_DIR_MASK) == ESR_ELx_SYS64_ISS_DIR_READ;
+ if (__vgic_v3_check_trap_forwarding(vcpu, sysreg, is_read))
+ return 0;
+
switch (sysreg) {
case SYS_ICC_IAR0_EL1:
case SYS_ICC_IAR1_EL1:
@@ -1110,6 +1260,21 @@ int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu)
case SYS_ICC_DIR_EL1:
if (unlikely(is_read))
return 0;
+ /*
+ * Full exit if required to handle overflow deactivation,
+ * unless we can emulate it in the LRs (likely the majority
+ * of the cases).
+ */
+ if (vcpu->arch.vgic_cpu.vgic_v3.vgic_hcr & ICH_HCR_EL2_TDIR) {
+ int ret;
+
+ ret = ___vgic_v3_write_dir(vcpu, __vgic_v3_read_vmcr(),
+ kvm_vcpu_sys_get_rt(vcpu));
+ if (ret)
+ __kvm_skip_instr(vcpu);
+
+ return ret;
+ }
fn = __vgic_v3_write_dir;
break;
case SYS_ICC_RPR_EL1:
diff --git a/arch/arm64/kvm/hyp/vhe/Makefile b/arch/arm64/kvm/hyp/vhe/Makefile
index 3b9e5464b5b3..afc4aed9231a 100644
--- a/arch/arm64/kvm/hyp/vhe/Makefile
+++ b/arch/arm64/kvm/hyp/vhe/Makefile
@@ -6,6 +6,8 @@
asflags-y := -D__KVM_VHE_HYPERVISOR__
ccflags-y := -D__KVM_VHE_HYPERVISOR__
+CFLAGS_switch.o += -Wno-override-init
+
obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o
obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \
../fpsimd.o ../hyp-entry.o ../exception.o
diff --git a/arch/arm64/kvm/hyp/vhe/debug-sr.c b/arch/arm64/kvm/hyp/vhe/debug-sr.c
index 289689b2682d..0100339b09e0 100644
--- a/arch/arm64/kvm/hyp/vhe/debug-sr.c
+++ b/arch/arm64/kvm/hyp/vhe/debug-sr.c
@@ -19,8 +19,3 @@ void __debug_switch_to_host(struct kvm_vcpu *vcpu)
{
__debug_switch_to_host_common(vcpu);
}
-
-u64 __kvm_get_mdcr_el2(void)
-{
- return read_sysreg(mdcr_el2);
-}
diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c
index 1a97391fedd2..9984c492305a 100644
--- a/arch/arm64/kvm/hyp/vhe/switch.c
+++ b/arch/arm64/kvm/hyp/vhe/switch.c
@@ -33,37 +33,110 @@ DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data);
DEFINE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
DEFINE_PER_CPU(unsigned long, kvm_hyp_vector);
-static void __activate_traps(struct kvm_vcpu *vcpu)
-{
- u64 val;
+/*
+ * HCR_EL2 bits that the NV guest can freely change (no RES0/RES1
+ * semantics, irrespective of the configuration), but that cannot be
+ * applied to the actual HW as things would otherwise break badly.
+ *
+ * - TGE: we want the guest to use EL1, which is incompatible with
+ * this bit being set
+ *
+ * - API/APK: they are already accounted for by vcpu_load(), and can
+ * only take effect across a load/put cycle (such as ERET)
+ *
+ * - FIEN: no way we let a guest have access to the RAS "Common Fault
+ * Injection" thing, whatever that does
+ */
+#define NV_HCR_GUEST_EXCLUDE (HCR_TGE | HCR_API | HCR_APK | HCR_FIEN)
- ___activate_traps(vcpu);
+static u64 __compute_hcr(struct kvm_vcpu *vcpu)
+{
+ u64 guest_hcr, hcr = vcpu->arch.hcr_el2;
- val = read_sysreg(cpacr_el1);
- val |= CPACR_EL1_TTA;
- val &= ~(CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN |
- CPACR_EL1_SMEN_EL0EN | CPACR_EL1_SMEN_EL1EN);
+ if (!vcpu_has_nv(vcpu))
+ return hcr;
/*
- * With VHE (HCR.E2H == 1), accesses to CPACR_EL1 are routed to
- * CPTR_EL2. In general, CPACR_EL1 has the same layout as CPTR_EL2,
- * except for some missing controls, such as TAM.
- * In this case, CPTR_EL2.TAM has the same position with or without
- * VHE (HCR.E2H == 1) which allows us to use here the CPTR_EL2.TAM
- * shift value for trapping the AMU accesses.
+ * We rely on the invariant that a vcpu entered from HYP
+ * context must also exit in the same context, as only an ERET
+ * instruction can kick us out of it, and we obviously trap
+ * that sucker. PSTATE.M will get fixed-up on exit.
*/
+ if (is_hyp_ctxt(vcpu)) {
+ host_data_set_flag(VCPU_IN_HYP_CONTEXT);
+
+ hcr |= HCR_NV | HCR_NV2 | HCR_AT | HCR_TTLB;
- val |= CPTR_EL2_TAM;
+ if (!vcpu_el2_e2h_is_set(vcpu))
+ hcr |= HCR_NV1;
- if (guest_owns_fp_regs(vcpu)) {
- if (vcpu_has_sve(vcpu))
- val |= CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN;
+ /*
+ * Nothing in HCR_EL2 should impact running in hypervisor
+ * context, apart from bits we have defined as RESx (E2H,
+ * HCD and co), or that cannot be set directly (the EXCLUDE
+ * bits). Given that we OR the guest's view with the host's,
+ * we can use the 0 value as the starting point, and only
+ * use the config-driven RES1 bits.
+ */
+ guest_hcr = kvm_vcpu_apply_reg_masks(vcpu, HCR_EL2, 0);
+
+ write_sysreg_s(vcpu->arch.ctxt.vncr_array, SYS_VNCR_EL2);
} else {
- val &= ~(CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN);
- __activate_traps_fpsimd32(vcpu);
+ host_data_clear_flag(VCPU_IN_HYP_CONTEXT);
+
+ guest_hcr = __vcpu_sys_reg(vcpu, HCR_EL2);
+ if (guest_hcr & HCR_NV) {
+ u64 va = __fix_to_virt(vncr_fixmap(smp_processor_id()));
+
+ /* Inherit the low bits from the actual register */
+ va |= __vcpu_sys_reg(vcpu, VNCR_EL2) & GENMASK(PAGE_SHIFT - 1, 0);
+ write_sysreg_s(va, SYS_VNCR_EL2);
+
+ /* Force NV2 in case the guest is forgetful... */
+ guest_hcr |= HCR_NV2;
+ }
+
+ /*
+ * Exclude the guest's TWED configuration if it hasn't set TWE
+ * to avoid potentially delaying traps for the host.
+ */
+ if (!(guest_hcr & HCR_TWE))
+ guest_hcr &= ~(HCR_EL2_TWEDEn | HCR_EL2_TWEDEL);
+ }
+
+ BUG_ON(host_data_test_flag(VCPU_IN_HYP_CONTEXT) &&
+ host_data_test_flag(L1_VNCR_MAPPED));
+
+ return hcr | (guest_hcr & ~NV_HCR_GUEST_EXCLUDE);
+}
+
+static void __activate_traps(struct kvm_vcpu *vcpu)
+{
+ u64 val;
+
+ ___activate_traps(vcpu, __compute_hcr(vcpu));
+
+ if (has_cntpoff()) {
+ struct timer_map map;
+
+ get_timer_map(vcpu, &map);
+
+ /*
+ * We're entrering the guest. Reload the correct
+ * values from memory now that TGE is clear.
+ */
+ if (map.direct_ptimer == vcpu_ptimer(vcpu))
+ val = __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0);
+ if (map.direct_ptimer == vcpu_hptimer(vcpu))
+ val = __vcpu_sys_reg(vcpu, CNTHP_CVAL_EL2);
+
+ if (map.direct_ptimer) {
+ write_sysreg_el0(val, SYS_CNTP_CVAL);
+ isb();
+ }
}
- write_sysreg(val, cpacr_el1);
+ __activate_cptr_traps(vcpu);
write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el1);
}
@@ -75,7 +148,31 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu)
___deactivate_traps(vcpu);
- write_sysreg(HCR_HOST_VHE_FLAGS, hcr_el2);
+ write_sysreg_hcr(HCR_HOST_VHE_FLAGS);
+
+ if (has_cntpoff()) {
+ struct timer_map map;
+ u64 val, offset;
+
+ get_timer_map(vcpu, &map);
+
+ /*
+ * We're exiting the guest. Save the latest CVAL value
+ * to memory and apply the offset now that TGE is set.
+ */
+ val = read_sysreg_el0(SYS_CNTP_CVAL);
+ if (map.direct_ptimer == vcpu_ptimer(vcpu))
+ __vcpu_assign_sys_reg(vcpu, CNTP_CVAL_EL0, val);
+ if (map.direct_ptimer == vcpu_hptimer(vcpu))
+ __vcpu_assign_sys_reg(vcpu, CNTHP_CVAL_EL2, val);
+
+ offset = read_sysreg_s(SYS_CNTPOFF_EL2);
+
+ if (map.direct_ptimer && offset) {
+ write_sysreg_el0(val + offset, SYS_CNTP_CVAL);
+ isb();
+ }
+ }
/*
* ARM errata 1165522 and 1530923 require the actual execution of the
@@ -84,7 +181,7 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu)
*/
asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT));
- write_sysreg(CPACR_EL1_DEFAULT, cpacr_el1);
+ __deactivate_cptr_traps(vcpu);
if (!arm64_kernel_unmapped_at_el0())
host_vectors = __this_cpu_read(this_cpu_vector);
@@ -92,34 +189,383 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu)
}
NOKPROBE_SYMBOL(__deactivate_traps);
-void activate_traps_vhe_load(struct kvm_vcpu *vcpu)
+/*
+ * Disable IRQs in __vcpu_{load,put}_{activate,deactivate}_traps() to
+ * prevent a race condition between context switching of PMUSERENR_EL0
+ * in __{activate,deactivate}_traps_common() and IPIs that attempts to
+ * update PMUSERENR_EL0. See also kvm_set_pmuserenr().
+ */
+static void __vcpu_load_activate_traps(struct kvm_vcpu *vcpu)
{
+ unsigned long flags;
+
+ local_irq_save(flags);
__activate_traps_common(vcpu);
+ local_irq_restore(flags);
}
-void deactivate_traps_vhe_put(struct kvm_vcpu *vcpu)
+static void __vcpu_put_deactivate_traps(struct kvm_vcpu *vcpu)
{
+ unsigned long flags;
+
+ local_irq_save(flags);
__deactivate_traps_common(vcpu);
+ local_irq_restore(flags);
+}
+
+void kvm_vcpu_load_vhe(struct kvm_vcpu *vcpu)
+{
+ host_data_ptr(host_ctxt)->__hyp_running_vcpu = vcpu;
+
+ __vcpu_load_switch_sysregs(vcpu);
+ __vcpu_load_activate_traps(vcpu);
+ __load_stage2(vcpu->arch.hw_mmu, vcpu->arch.hw_mmu->arch);
+}
+
+void kvm_vcpu_put_vhe(struct kvm_vcpu *vcpu)
+{
+ __vcpu_put_deactivate_traps(vcpu);
+ __vcpu_put_switch_sysregs(vcpu);
+
+ host_data_ptr(host_ctxt)->__hyp_running_vcpu = NULL;
+}
+
+static u64 compute_emulated_cntx_ctl_el0(struct kvm_vcpu *vcpu,
+ enum vcpu_sysreg reg)
+{
+ unsigned long ctl;
+ u64 cval, cnt;
+ bool stat;
+
+ switch (reg) {
+ case CNTP_CTL_EL0:
+ cval = __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0);
+ ctl = __vcpu_sys_reg(vcpu, CNTP_CTL_EL0);
+ cnt = compute_counter_value(vcpu_ptimer(vcpu));
+ break;
+ case CNTV_CTL_EL0:
+ cval = __vcpu_sys_reg(vcpu, CNTV_CVAL_EL0);
+ ctl = __vcpu_sys_reg(vcpu, CNTV_CTL_EL0);
+ cnt = compute_counter_value(vcpu_vtimer(vcpu));
+ break;
+ default:
+ BUG();
+ }
+
+ stat = cval <= cnt;
+ __assign_bit(__ffs(ARCH_TIMER_CTRL_IT_STAT), &ctl, stat);
+
+ return ctl;
+}
+
+static bool kvm_hyp_handle_timer(struct kvm_vcpu *vcpu, u64 *exit_code)
+{
+ u64 esr, val;
+
+ /*
+ * Having FEAT_ECV allows for a better quality of timer emulation.
+ * However, this comes at a huge cost in terms of traps. Try and
+ * satisfy the reads from guest's hypervisor context without
+ * returning to the kernel if we can.
+ */
+ if (!is_hyp_ctxt(vcpu))
+ return false;
+
+ esr = kvm_vcpu_get_esr(vcpu);
+ if ((esr & ESR_ELx_SYS64_ISS_DIR_MASK) != ESR_ELx_SYS64_ISS_DIR_READ)
+ return false;
+
+ switch (esr_sys64_to_sysreg(esr)) {
+ case SYS_CNTP_CTL_EL02:
+ val = compute_emulated_cntx_ctl_el0(vcpu, CNTP_CTL_EL0);
+ break;
+ case SYS_CNTP_CTL_EL0:
+ if (vcpu_el2_e2h_is_set(vcpu))
+ val = read_sysreg_el0(SYS_CNTP_CTL);
+ else
+ val = compute_emulated_cntx_ctl_el0(vcpu, CNTP_CTL_EL0);
+ break;
+ case SYS_CNTP_CVAL_EL02:
+ val = __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0);
+ break;
+ case SYS_CNTP_CVAL_EL0:
+ if (vcpu_el2_e2h_is_set(vcpu)) {
+ val = read_sysreg_el0(SYS_CNTP_CVAL);
+
+ if (!has_cntpoff())
+ val -= timer_get_offset(vcpu_hptimer(vcpu));
+ } else {
+ val = __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0);
+ }
+ break;
+ case SYS_CNTPCT_EL0:
+ case SYS_CNTPCTSS_EL0:
+ val = compute_counter_value(vcpu_hptimer(vcpu));
+ break;
+ case SYS_CNTV_CTL_EL02:
+ val = compute_emulated_cntx_ctl_el0(vcpu, CNTV_CTL_EL0);
+ break;
+ case SYS_CNTV_CTL_EL0:
+ if (vcpu_el2_e2h_is_set(vcpu))
+ val = read_sysreg_el0(SYS_CNTV_CTL);
+ else
+ val = compute_emulated_cntx_ctl_el0(vcpu, CNTV_CTL_EL0);
+ break;
+ case SYS_CNTV_CVAL_EL02:
+ val = __vcpu_sys_reg(vcpu, CNTV_CVAL_EL0);
+ break;
+ case SYS_CNTV_CVAL_EL0:
+ if (vcpu_el2_e2h_is_set(vcpu))
+ val = read_sysreg_el0(SYS_CNTV_CVAL);
+ else
+ val = __vcpu_sys_reg(vcpu, CNTV_CVAL_EL0);
+ break;
+ case SYS_CNTVCT_EL0:
+ case SYS_CNTVCTSS_EL0:
+ val = compute_counter_value(vcpu_hvtimer(vcpu));
+ break;
+ default:
+ return false;
+ }
+
+ vcpu_set_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu), val);
+ __kvm_skip_instr(vcpu);
+
+ return true;
+}
+
+static bool kvm_hyp_handle_eret(struct kvm_vcpu *vcpu, u64 *exit_code)
+{
+ u64 esr = kvm_vcpu_get_esr(vcpu);
+ u64 spsr, elr, mode;
+
+ /*
+ * Going through the whole put/load motions is a waste of time
+ * if this is a VHE guest hypervisor returning to its own
+ * userspace, or the hypervisor performing a local exception
+ * return. No need to save/restore registers, no need to
+ * switch S2 MMU. Just do the canonical ERET.
+ *
+ * Unless the trap has to be forwarded further down the line,
+ * of course...
+ */
+ if ((__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_NV) ||
+ (__vcpu_sys_reg(vcpu, HFGITR_EL2) & HFGITR_EL2_ERET))
+ return false;
+
+ spsr = read_sysreg_el1(SYS_SPSR);
+ mode = spsr & (PSR_MODE_MASK | PSR_MODE32_BIT);
+
+ switch (mode) {
+ case PSR_MODE_EL0t:
+ if (!(vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)))
+ return false;
+ break;
+ case PSR_MODE_EL2t:
+ mode = PSR_MODE_EL1t;
+ break;
+ case PSR_MODE_EL2h:
+ mode = PSR_MODE_EL1h;
+ break;
+ default:
+ return false;
+ }
+
+ /* If ERETAx fails, take the slow path */
+ if (esr_iss_is_eretax(esr)) {
+ if (!(vcpu_has_ptrauth(vcpu) && kvm_auth_eretax(vcpu, &elr)))
+ return false;
+ } else {
+ elr = read_sysreg_el1(SYS_ELR);
+ }
+
+ spsr = (spsr & ~(PSR_MODE_MASK | PSR_MODE32_BIT)) | mode;
+
+ write_sysreg_el2(spsr, SYS_SPSR);
+ write_sysreg_el2(elr, SYS_ELR);
+
+ return true;
+}
+
+static bool kvm_hyp_handle_tlbi_el2(struct kvm_vcpu *vcpu, u64 *exit_code)
+{
+ int ret = -EINVAL;
+ u32 instr;
+ u64 val;
+
+ /*
+ * Ideally, we would never trap on EL2 S1 TLB invalidations using
+ * the EL1 instructions when the guest's HCR_EL2.{E2H,TGE}=={1,1}.
+ * But "thanks" to FEAT_NV2, we don't trap writes to HCR_EL2,
+ * meaning that we can't track changes to the virtual TGE bit. So we
+ * have to leave HCR_EL2.TTLB set on the host. Oopsie...
+ *
+ * Try and handle these invalidation as quickly as possible, without
+ * fully exiting. Note that we don't need to consider any forwarding
+ * here, as having E2H+TGE set is the very definition of being
+ * InHost.
+ *
+ * For the lesser hypervisors out there that have failed to get on
+ * with the VHE program, we can also handle the nVHE style of EL2
+ * invalidation.
+ */
+ if (!(is_hyp_ctxt(vcpu)))
+ return false;
+
+ instr = esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu));
+ val = vcpu_get_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu));
+
+ if ((kvm_supported_tlbi_s1e1_op(vcpu, instr) &&
+ vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)) ||
+ kvm_supported_tlbi_s1e2_op (vcpu, instr))
+ ret = __kvm_tlbi_s1e2(NULL, val, instr);
+
+ if (ret)
+ return false;
+
+ /*
+ * If we have to check for any VNCR mapping being invalidated,
+ * go back to the slow path for further processing.
+ */
+ if (vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu) &&
+ atomic_read(&vcpu->kvm->arch.vncr_map_count))
+ return false;
+
+ __kvm_skip_instr(vcpu);
+
+ return true;
+}
+
+static bool kvm_hyp_handle_cpacr_el1(struct kvm_vcpu *vcpu, u64 *exit_code)
+{
+ u64 esr = kvm_vcpu_get_esr(vcpu);
+ int rt;
+
+ if (!is_hyp_ctxt(vcpu) || esr_sys64_to_sysreg(esr) != SYS_CPACR_EL1)
+ return false;
+
+ rt = kvm_vcpu_sys_get_rt(vcpu);
+
+ if ((esr & ESR_ELx_SYS64_ISS_DIR_MASK) == ESR_ELx_SYS64_ISS_DIR_READ) {
+ vcpu_set_reg(vcpu, rt, __vcpu_sys_reg(vcpu, CPTR_EL2));
+ } else {
+ vcpu_write_sys_reg(vcpu, vcpu_get_reg(vcpu, rt), CPTR_EL2);
+ __activate_cptr_traps(vcpu);
+ }
+
+ __kvm_skip_instr(vcpu);
+
+ return true;
+}
+
+static bool kvm_hyp_handle_zcr_el2(struct kvm_vcpu *vcpu, u64 *exit_code)
+{
+ u32 sysreg = esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu));
+
+ if (!vcpu_has_nv(vcpu))
+ return false;
+
+ if (sysreg != SYS_ZCR_EL2)
+ return false;
+
+ if (guest_owns_fp_regs())
+ return false;
+
+ /*
+ * ZCR_EL2 traps are handled in the slow path, with the expectation
+ * that the guest's FP context has already been loaded onto the CPU.
+ *
+ * Load the guest's FP context and unconditionally forward to the
+ * slow path for handling (i.e. return false).
+ */
+ kvm_hyp_handle_fpsimd(vcpu, exit_code);
+ return false;
+}
+
+static bool kvm_hyp_handle_sysreg_vhe(struct kvm_vcpu *vcpu, u64 *exit_code)
+{
+ if (kvm_hyp_handle_tlbi_el2(vcpu, exit_code))
+ return true;
+
+ if (kvm_hyp_handle_timer(vcpu, exit_code))
+ return true;
+
+ if (kvm_hyp_handle_cpacr_el1(vcpu, exit_code))
+ return true;
+
+ if (kvm_hyp_handle_zcr_el2(vcpu, exit_code))
+ return true;
+
+ return kvm_hyp_handle_sysreg(vcpu, exit_code);
+}
+
+static bool kvm_hyp_handle_impdef(struct kvm_vcpu *vcpu, u64 *exit_code)
+{
+ u64 iss;
+
+ if (!cpus_have_final_cap(ARM64_WORKAROUND_PMUV3_IMPDEF_TRAPS))
+ return false;
+
+ /*
+ * Compute a synthetic ESR for a sysreg trap. Conveniently, AFSR1_EL2
+ * is populated with a correct ISS for a sysreg trap. These fruity
+ * parts are 64bit only, so unconditionally set IL.
+ */
+ iss = ESR_ELx_ISS(read_sysreg_s(SYS_AFSR1_EL2));
+ vcpu->arch.fault.esr_el2 = FIELD_PREP(ESR_ELx_EC_MASK, ESR_ELx_EC_SYS64) |
+ FIELD_PREP(ESR_ELx_ISS_MASK, iss) |
+ ESR_ELx_IL;
+ return false;
}
static const exit_handler_fn hyp_exit_handlers[] = {
[0 ... ESR_ELx_EC_MAX] = NULL,
[ESR_ELx_EC_CP15_32] = kvm_hyp_handle_cp15_32,
- [ESR_ELx_EC_SYS64] = kvm_hyp_handle_sysreg,
+ [ESR_ELx_EC_SYS64] = kvm_hyp_handle_sysreg_vhe,
[ESR_ELx_EC_SVE] = kvm_hyp_handle_fpsimd,
[ESR_ELx_EC_FP_ASIMD] = kvm_hyp_handle_fpsimd,
[ESR_ELx_EC_IABT_LOW] = kvm_hyp_handle_iabt_low,
[ESR_ELx_EC_DABT_LOW] = kvm_hyp_handle_dabt_low,
- [ESR_ELx_EC_PAC] = kvm_hyp_handle_ptrauth,
+ [ESR_ELx_EC_WATCHPT_LOW] = kvm_hyp_handle_watchpt_low,
+ [ESR_ELx_EC_ERET] = kvm_hyp_handle_eret,
+ [ESR_ELx_EC_MOPS] = kvm_hyp_handle_mops,
+
+ /* Apple shenanigans */
+ [0x3F] = kvm_hyp_handle_impdef,
};
-static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu)
+static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code)
{
- return hyp_exit_handlers;
-}
+ synchronize_vcpu_pstate(vcpu, exit_code);
-static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code)
-{
+ /*
+ * If we were in HYP context on entry, adjust the PSTATE view
+ * so that the usual helpers work correctly. This enforces our
+ * invariant that the guest's HYP context status is preserved
+ * across a run.
+ */
+ if (vcpu_has_nv(vcpu) &&
+ unlikely(host_data_test_flag(VCPU_IN_HYP_CONTEXT))) {
+ u64 mode = *vcpu_cpsr(vcpu) & (PSR_MODE_MASK | PSR_MODE32_BIT);
+
+ switch (mode) {
+ case PSR_MODE_EL1t:
+ mode = PSR_MODE_EL2t;
+ break;
+ case PSR_MODE_EL1h:
+ mode = PSR_MODE_EL2h;
+ break;
+ }
+
+ *vcpu_cpsr(vcpu) &= ~(PSR_MODE_MASK | PSR_MODE32_BIT);
+ *vcpu_cpsr(vcpu) |= mode;
+ }
+
+ /* Apply extreme paranoia! */
+ BUG_ON(vcpu_has_nv(vcpu) &&
+ !!host_data_test_flag(VCPU_IN_HYP_CONTEXT) != is_hyp_ctxt(vcpu));
+
+ return __fixup_guest_exit(vcpu, exit_code, hyp_exit_handlers);
}
/* Switch to the guest for VHE systems running in EL2 */
@@ -129,24 +575,19 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
struct kvm_cpu_context *guest_ctxt;
u64 exit_code;
- host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
- host_ctxt->__hyp_running_vcpu = vcpu;
+ host_ctxt = host_data_ptr(host_ctxt);
guest_ctxt = &vcpu->arch.ctxt;
+ fpsimd_lazy_switch_to_guest(vcpu);
+
sysreg_save_host_state_vhe(host_ctxt);
/*
- * ARM erratum 1165522 requires us to configure both stage 1 and
- * stage 2 translation for the guest context before we clear
- * HCR_EL2.TGE.
- *
- * We have already configured the guest's stage 1 translation in
- * kvm_vcpu_load_sysregs_vhe above. We must now call
- * __load_stage2 before __activate_traps, because
- * __load_stage2 configures stage 2 translation, and
- * __activate_traps clear HCR_EL2.TGE (among other things).
+ * Note that ARM erratum 1165522 requires us to configure both stage 1
+ * and stage 2 translation for the guest context before we clear
+ * HCR_EL2.TGE. The stage 1 and stage 2 guest context has already been
+ * loaded on the CPU in kvm_vcpu_load_vhe().
*/
- __load_stage2(vcpu->arch.hw_mmu, vcpu->arch.hw_mmu->arch);
__activate_traps(vcpu);
__kvm_adjust_pc(vcpu);
@@ -167,11 +608,21 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
sysreg_restore_host_state_vhe(host_ctxt);
- if (vcpu->arch.fp_state == FP_STATE_GUEST_OWNED)
- __fpsimd_save_fpexc32(vcpu);
-
__debug_switch_to_host(vcpu);
+ /*
+ * Ensure that all system register writes above have taken effect
+ * before returning to the host. In VHE mode, CPTR traps for
+ * FPSIMD/SVE/SME also apply to EL2, so FPSIMD/SVE/SME state must be
+ * manipulated after the ISB.
+ */
+ isb();
+
+ fpsimd_lazy_switch_to_host(vcpu);
+
+ if (guest_owns_fp_regs())
+ __fpsimd_save_fpexc32(vcpu);
+
return exit_code;
}
NOKPROBE_SYMBOL(__kvm_vcpu_run_vhe);
@@ -201,22 +652,15 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
*/
local_daif_restore(DAIF_PROCCTX_NOIRQ);
- /*
- * When we exit from the guest we change a number of CPU configuration
- * parameters, such as traps. Make sure these changes take effect
- * before running the host or additional guests.
- */
- isb();
-
return ret;
}
-static void __hyp_call_panic(u64 spsr, u64 elr, u64 par)
+static void __noreturn __hyp_call_panic(u64 spsr, u64 elr, u64 par)
{
struct kvm_cpu_context *host_ctxt;
struct kvm_vcpu *vcpu;
- host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
+ host_ctxt = host_data_ptr(host_ctxt);
vcpu = host_ctxt->__hyp_running_vcpu;
__deactivate_traps(vcpu);
@@ -236,7 +680,6 @@ void __noreturn hyp_panic(void)
u64 par = read_sysreg_par();
__hyp_call_panic(spsr, elr, par);
- unreachable();
}
asmlinkage void kvm_unexpected_el2_exception(void)
diff --git a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c
index 7b44f6b3b547..f28c6cf4fe1b 100644
--- a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c
+++ b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c
@@ -13,6 +13,139 @@
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_hyp.h>
+#include <asm/kvm_nested.h>
+
+static void __sysreg_save_vel2_state(struct kvm_vcpu *vcpu)
+{
+ /* These registers are common with EL1 */
+ __vcpu_assign_sys_reg(vcpu, PAR_EL1, read_sysreg(par_el1));
+ __vcpu_assign_sys_reg(vcpu, TPIDR_EL1, read_sysreg(tpidr_el1));
+
+ __vcpu_assign_sys_reg(vcpu, ESR_EL2, read_sysreg_el1(SYS_ESR));
+ __vcpu_assign_sys_reg(vcpu, AFSR0_EL2, read_sysreg_el1(SYS_AFSR0));
+ __vcpu_assign_sys_reg(vcpu, AFSR1_EL2, read_sysreg_el1(SYS_AFSR1));
+ __vcpu_assign_sys_reg(vcpu, FAR_EL2, read_sysreg_el1(SYS_FAR));
+ __vcpu_assign_sys_reg(vcpu, MAIR_EL2, read_sysreg_el1(SYS_MAIR));
+ __vcpu_assign_sys_reg(vcpu, VBAR_EL2, read_sysreg_el1(SYS_VBAR));
+ __vcpu_assign_sys_reg(vcpu, CONTEXTIDR_EL2, read_sysreg_el1(SYS_CONTEXTIDR));
+ __vcpu_assign_sys_reg(vcpu, AMAIR_EL2, read_sysreg_el1(SYS_AMAIR));
+
+ /*
+ * In VHE mode those registers are compatible between EL1 and EL2,
+ * and the guest uses the _EL1 versions on the CPU naturally.
+ * So we save them into their _EL2 versions here.
+ * For nVHE mode we trap accesses to those registers, so our
+ * _EL2 copy in sys_regs[] is always up-to-date and we don't need
+ * to save anything here.
+ */
+ if (vcpu_el2_e2h_is_set(vcpu)) {
+ u64 val;
+
+ /*
+ * We don't save CPTR_EL2, as accesses to CPACR_EL1
+ * are always trapped, ensuring that the in-memory
+ * copy is always up-to-date. A small blessing...
+ */
+ __vcpu_assign_sys_reg(vcpu, SCTLR_EL2, read_sysreg_el1(SYS_SCTLR));
+ __vcpu_assign_sys_reg(vcpu, TTBR0_EL2, read_sysreg_el1(SYS_TTBR0));
+ __vcpu_assign_sys_reg(vcpu, TTBR1_EL2, read_sysreg_el1(SYS_TTBR1));
+ __vcpu_assign_sys_reg(vcpu, TCR_EL2, read_sysreg_el1(SYS_TCR));
+
+ if (ctxt_has_tcrx(&vcpu->arch.ctxt)) {
+ __vcpu_assign_sys_reg(vcpu, TCR2_EL2, read_sysreg_el1(SYS_TCR2));
+
+ if (ctxt_has_s1pie(&vcpu->arch.ctxt)) {
+ __vcpu_assign_sys_reg(vcpu, PIRE0_EL2, read_sysreg_el1(SYS_PIRE0));
+ __vcpu_assign_sys_reg(vcpu, PIR_EL2, read_sysreg_el1(SYS_PIR));
+ }
+
+ if (ctxt_has_s1poe(&vcpu->arch.ctxt))
+ __vcpu_assign_sys_reg(vcpu, POR_EL2, read_sysreg_el1(SYS_POR));
+ }
+
+ /*
+ * The EL1 view of CNTKCTL_EL1 has a bunch of RES0 bits where
+ * the interesting CNTHCTL_EL2 bits live. So preserve these
+ * bits when reading back the guest-visible value.
+ */
+ val = read_sysreg_el1(SYS_CNTKCTL);
+ val &= CNTKCTL_VALID_BITS;
+ __vcpu_rmw_sys_reg(vcpu, CNTHCTL_EL2, &=, ~CNTKCTL_VALID_BITS);
+ __vcpu_rmw_sys_reg(vcpu, CNTHCTL_EL2, |=, val);
+ }
+
+ __vcpu_assign_sys_reg(vcpu, SP_EL2, read_sysreg(sp_el1));
+ __vcpu_assign_sys_reg(vcpu, ELR_EL2, read_sysreg_el1(SYS_ELR));
+ __vcpu_assign_sys_reg(vcpu, SPSR_EL2, read_sysreg_el1(SYS_SPSR));
+
+ if (ctxt_has_sctlr2(&vcpu->arch.ctxt))
+ __vcpu_assign_sys_reg(vcpu, SCTLR2_EL2, read_sysreg_el1(SYS_SCTLR2));
+}
+
+static void __sysreg_restore_vel2_state(struct kvm_vcpu *vcpu)
+{
+ u64 val;
+
+ /* These registers are common with EL1 */
+ write_sysreg(__vcpu_sys_reg(vcpu, PAR_EL1), par_el1);
+ write_sysreg(__vcpu_sys_reg(vcpu, TPIDR_EL1), tpidr_el1);
+
+ write_sysreg(ctxt_midr_el1(&vcpu->arch.ctxt), vpidr_el2);
+ write_sysreg(__vcpu_sys_reg(vcpu, MPIDR_EL1), vmpidr_el2);
+ write_sysreg_el1(__vcpu_sys_reg(vcpu, MAIR_EL2), SYS_MAIR);
+ write_sysreg_el1(__vcpu_sys_reg(vcpu, VBAR_EL2), SYS_VBAR);
+ write_sysreg_el1(__vcpu_sys_reg(vcpu, CONTEXTIDR_EL2), SYS_CONTEXTIDR);
+ write_sysreg_el1(__vcpu_sys_reg(vcpu, AMAIR_EL2), SYS_AMAIR);
+
+ if (vcpu_el2_e2h_is_set(vcpu)) {
+ /*
+ * In VHE mode those registers are compatible between
+ * EL1 and EL2.
+ */
+ write_sysreg_el1(__vcpu_sys_reg(vcpu, SCTLR_EL2), SYS_SCTLR);
+ write_sysreg_el1(__vcpu_sys_reg(vcpu, CPTR_EL2), SYS_CPACR);
+ write_sysreg_el1(__vcpu_sys_reg(vcpu, TTBR0_EL2), SYS_TTBR0);
+ write_sysreg_el1(__vcpu_sys_reg(vcpu, TTBR1_EL2), SYS_TTBR1);
+ write_sysreg_el1(__vcpu_sys_reg(vcpu, TCR_EL2), SYS_TCR);
+ write_sysreg_el1(__vcpu_sys_reg(vcpu, CNTHCTL_EL2), SYS_CNTKCTL);
+ } else {
+ /*
+ * CNTHCTL_EL2 only affects EL1 when running nVHE, so
+ * no need to restore it.
+ */
+ val = translate_sctlr_el2_to_sctlr_el1(__vcpu_sys_reg(vcpu, SCTLR_EL2));
+ write_sysreg_el1(val, SYS_SCTLR);
+ val = translate_cptr_el2_to_cpacr_el1(__vcpu_sys_reg(vcpu, CPTR_EL2));
+ write_sysreg_el1(val, SYS_CPACR);
+ val = translate_ttbr0_el2_to_ttbr0_el1(__vcpu_sys_reg(vcpu, TTBR0_EL2));
+ write_sysreg_el1(val, SYS_TTBR0);
+ val = translate_tcr_el2_to_tcr_el1(__vcpu_sys_reg(vcpu, TCR_EL2));
+ write_sysreg_el1(val, SYS_TCR);
+ }
+
+ if (ctxt_has_tcrx(&vcpu->arch.ctxt)) {
+ write_sysreg_el1(__vcpu_sys_reg(vcpu, TCR2_EL2), SYS_TCR2);
+
+ if (ctxt_has_s1pie(&vcpu->arch.ctxt)) {
+ write_sysreg_el1(__vcpu_sys_reg(vcpu, PIR_EL2), SYS_PIR);
+ write_sysreg_el1(__vcpu_sys_reg(vcpu, PIRE0_EL2), SYS_PIRE0);
+ }
+
+ if (ctxt_has_s1poe(&vcpu->arch.ctxt))
+ write_sysreg_el1(__vcpu_sys_reg(vcpu, POR_EL2), SYS_POR);
+ }
+
+ write_sysreg_el1(__vcpu_sys_reg(vcpu, ESR_EL2), SYS_ESR);
+ write_sysreg_el1(__vcpu_sys_reg(vcpu, AFSR0_EL2), SYS_AFSR0);
+ write_sysreg_el1(__vcpu_sys_reg(vcpu, AFSR1_EL2), SYS_AFSR1);
+ write_sysreg_el1(__vcpu_sys_reg(vcpu, FAR_EL2), SYS_FAR);
+ write_sysreg(__vcpu_sys_reg(vcpu, SP_EL2), sp_el1);
+ write_sysreg_el1(__vcpu_sys_reg(vcpu, ELR_EL2), SYS_ELR);
+ write_sysreg_el1(__vcpu_sys_reg(vcpu, SPSR_EL2), SYS_SPSR);
+
+ if (ctxt_has_sctlr2(&vcpu->arch.ctxt))
+ write_sysreg_el1(__vcpu_sys_reg(vcpu, SCTLR2_EL2), SYS_SCTLR2);
+}
/*
* VHE: Host and guest must save mdscr_el1 and sp_el0 (and the PC and
@@ -51,7 +184,7 @@ void sysreg_restore_guest_state_vhe(struct kvm_cpu_context *ctxt)
NOKPROBE_SYMBOL(sysreg_restore_guest_state_vhe);
/**
- * kvm_vcpu_load_sysregs_vhe - Load guest system registers to the physical CPU
+ * __vcpu_load_switch_sysregs - Load guest system registers to the physical CPU
*
* @vcpu: The VCPU pointer
*
@@ -61,15 +194,27 @@ NOKPROBE_SYMBOL(sysreg_restore_guest_state_vhe);
* and loading system register state early avoids having to load them on
* every entry to the VM.
*/
-void kvm_vcpu_load_sysregs_vhe(struct kvm_vcpu *vcpu)
+void __vcpu_load_switch_sysregs(struct kvm_vcpu *vcpu)
{
struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt;
struct kvm_cpu_context *host_ctxt;
+ u64 midr, mpidr;
- host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
+ host_ctxt = host_data_ptr(host_ctxt);
__sysreg_save_user_state(host_ctxt);
/*
+ * When running a normal EL1 guest, we only load a new vcpu
+ * after a context switch, which imvolves a DSB, so all
+ * speculative EL1&0 walks will have already completed.
+ * If running NV, the vcpu may transition between vEL1 and
+ * vEL2 without a context switch, so make sure we complete
+ * those walks before loading a new context.
+ */
+ if (vcpu_has_nv(vcpu))
+ dsb(nsh);
+
+ /*
* Load guest EL1 and user state
*
* We must restore the 32-bit state before the sysregs, thanks
@@ -77,15 +222,30 @@ void kvm_vcpu_load_sysregs_vhe(struct kvm_vcpu *vcpu)
*/
__sysreg32_restore_state(vcpu);
__sysreg_restore_user_state(guest_ctxt);
- __sysreg_restore_el1_state(guest_ctxt);
- vcpu_set_flag(vcpu, SYSREGS_ON_CPU);
+ if (unlikely(is_hyp_ctxt(vcpu))) {
+ __sysreg_restore_vel2_state(vcpu);
+ } else {
+ if (vcpu_has_nv(vcpu)) {
+ /*
+ * As we're restoring a nested guest, set the value
+ * provided by the guest hypervisor.
+ */
+ midr = ctxt_sys_reg(guest_ctxt, VPIDR_EL2);
+ mpidr = ctxt_sys_reg(guest_ctxt, VMPIDR_EL2);
+ } else {
+ midr = ctxt_midr_el1(guest_ctxt);
+ mpidr = ctxt_sys_reg(guest_ctxt, MPIDR_EL1);
+ }
+
+ __sysreg_restore_el1_state(guest_ctxt, midr, mpidr);
+ }
- activate_traps_vhe_load(vcpu);
+ vcpu_set_flag(vcpu, SYSREGS_ON_CPU);
}
/**
- * kvm_vcpu_put_sysregs_vhe - Restore host system registers to the physical CPU
+ * __vcpu_put_switch_sysregs - Restore host system registers to the physical CPU
*
* @vcpu: The VCPU pointer
*
@@ -95,15 +255,18 @@ void kvm_vcpu_load_sysregs_vhe(struct kvm_vcpu *vcpu)
* and deferring saving system register state until we're no longer running the
* VCPU avoids having to save them on every exit from the VM.
*/
-void kvm_vcpu_put_sysregs_vhe(struct kvm_vcpu *vcpu)
+void __vcpu_put_switch_sysregs(struct kvm_vcpu *vcpu)
{
struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt;
struct kvm_cpu_context *host_ctxt;
- host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
- deactivate_traps_vhe_put(vcpu);
+ host_ctxt = host_data_ptr(host_ctxt);
+
+ if (unlikely(is_hyp_ctxt(vcpu)))
+ __sysreg_save_vel2_state(vcpu);
+ else
+ __sysreg_save_el1_state(guest_ctxt);
- __sysreg_save_el1_state(guest_ctxt);
__sysreg_save_user_state(guest_ctxt);
__sysreg32_save_state(vcpu);
diff --git a/arch/arm64/kvm/hyp/vhe/tlb.c b/arch/arm64/kvm/hyp/vhe/tlb.c
index 24cef9b87f9e..ec2569818629 100644
--- a/arch/arm64/kvm/hyp/vhe/tlb.c
+++ b/arch/arm64/kvm/hyp/vhe/tlb.c
@@ -11,18 +11,25 @@
#include <asm/tlbflush.h>
struct tlb_inv_context {
- unsigned long flags;
- u64 tcr;
- u64 sctlr;
+ struct kvm_s2_mmu *mmu;
+ unsigned long flags;
+ u64 tcr;
+ u64 sctlr;
};
-static void __tlb_switch_to_guest(struct kvm_s2_mmu *mmu,
- struct tlb_inv_context *cxt)
+static void enter_vmid_context(struct kvm_s2_mmu *mmu,
+ struct tlb_inv_context *cxt)
{
+ struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
u64 val;
local_irq_save(cxt->flags);
+ if (vcpu && mmu != vcpu->arch.hw_mmu)
+ cxt->mmu = vcpu->arch.hw_mmu;
+ else
+ cxt->mmu = NULL;
+
if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) {
/*
* For CPUs that are affected by ARM errata 1165522 or 1530923,
@@ -56,20 +63,23 @@ static void __tlb_switch_to_guest(struct kvm_s2_mmu *mmu,
__load_stage2(mmu, mmu->arch);
val = read_sysreg(hcr_el2);
val &= ~HCR_TGE;
- write_sysreg(val, hcr_el2);
+ write_sysreg_hcr(val);
isb();
}
-static void __tlb_switch_to_host(struct tlb_inv_context *cxt)
+static void exit_vmid_context(struct tlb_inv_context *cxt)
{
/*
* We're done with the TLB operation, let's restore the host's
* view of HCR_EL2.
*/
- write_sysreg(0, vttbr_el2);
- write_sysreg(HCR_HOST_VHE_FLAGS, hcr_el2);
+ write_sysreg_hcr(HCR_HOST_VHE_FLAGS);
isb();
+ /* ... and the stage-2 MMU context that we switched away from */
+ if (cxt->mmu)
+ __load_stage2(cxt->mmu, cxt->mmu->arch);
+
if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) {
/* Restore the registers to what they were */
write_sysreg_el1(cxt->tcr, SYS_TCR);
@@ -87,7 +97,7 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu,
dsb(ishst);
/* Switch to requested VMID */
- __tlb_switch_to_guest(mmu, &cxt);
+ enter_vmid_context(mmu, &cxt);
/*
* We could do so much better if we had the VA as well.
@@ -108,7 +118,68 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu,
dsb(ish);
isb();
- __tlb_switch_to_host(&cxt);
+ exit_vmid_context(&cxt);
+}
+
+void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu,
+ phys_addr_t ipa, int level)
+{
+ struct tlb_inv_context cxt;
+
+ dsb(nshst);
+
+ /* Switch to requested VMID */
+ enter_vmid_context(mmu, &cxt);
+
+ /*
+ * We could do so much better if we had the VA as well.
+ * Instead, we invalidate Stage-2 for this IPA, and the
+ * whole of Stage-1. Weep...
+ */
+ ipa >>= 12;
+ __tlbi_level(ipas2e1, ipa, level);
+
+ /*
+ * We have to ensure completion of the invalidation at Stage-2,
+ * since a table walk on another CPU could refill a TLB with a
+ * complete (S1 + S2) walk based on the old Stage-2 mapping if
+ * the Stage-1 invalidation happened first.
+ */
+ dsb(nsh);
+ __tlbi(vmalle1);
+ dsb(nsh);
+ isb();
+
+ exit_vmid_context(&cxt);
+}
+
+void __kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu,
+ phys_addr_t start, unsigned long pages)
+{
+ struct tlb_inv_context cxt;
+ unsigned long stride;
+
+ /*
+ * Since the range of addresses may not be mapped at
+ * the same level, assume the worst case as PAGE_SIZE
+ */
+ stride = PAGE_SIZE;
+ start = round_down(start, stride);
+
+ dsb(ishst);
+
+ /* Switch to requested VMID */
+ enter_vmid_context(mmu, &cxt);
+
+ __flush_s2_tlb_range_op(ipas2e1is, start, pages, stride,
+ TLBI_TTL_UNKNOWN);
+
+ dsb(ish);
+ __tlbi(vmalle1is);
+ dsb(ish);
+ isb();
+
+ exit_vmid_context(&cxt);
}
void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu)
@@ -118,13 +189,13 @@ void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu)
dsb(ishst);
/* Switch to requested VMID */
- __tlb_switch_to_guest(mmu, &cxt);
+ enter_vmid_context(mmu, &cxt);
__tlbi(vmalls12e1is);
dsb(ish);
isb();
- __tlb_switch_to_host(&cxt);
+ exit_vmid_context(&cxt);
}
void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu)
@@ -132,32 +203,166 @@ void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu)
struct tlb_inv_context cxt;
/* Switch to requested VMID */
- __tlb_switch_to_guest(mmu, &cxt);
+ enter_vmid_context(mmu, &cxt);
__tlbi(vmalle1);
asm volatile("ic iallu");
dsb(nsh);
isb();
- __tlb_switch_to_host(&cxt);
+ exit_vmid_context(&cxt);
}
void __kvm_flush_vm_context(void)
{
dsb(ishst);
__tlbi(alle1is);
+ dsb(ish);
+}
+
+/*
+ * TLB invalidation emulation for NV. For any given instruction, we
+ * perform the following transformtions:
+ *
+ * - a TLBI targeting EL2 S1 is remapped to EL1 S1
+ * - a non-shareable TLBI is upgraded to being inner-shareable
+ * - an outer-shareable TLBI is also mapped to inner-shareable
+ * - an nXS TLBI is upgraded to XS
+ */
+int __kvm_tlbi_s1e2(struct kvm_s2_mmu *mmu, u64 va, u64 sys_encoding)
+{
+ struct tlb_inv_context cxt;
+ int ret = 0;
/*
- * VIPT and PIPT caches are not affected by VMID, so no maintenance
- * is necessary across a VMID rollover.
- *
- * VPIPT caches constrain lookup and maintenance to the active VMID,
- * so we need to invalidate lines with a stale VMID to avoid an ABA
- * race after multiple rollovers.
- *
+ * The guest will have provided its own DSB ISHST before trapping.
+ * If it hasn't, that's its own problem, and we won't paper over it
+ * (plus, there is plenty of extra synchronisation before we even
+ * get here...).
*/
- if (icache_is_vpipt())
- asm volatile("ic ialluis");
+ if (mmu)
+ enter_vmid_context(mmu, &cxt);
+
+ switch (sys_encoding) {
+ case OP_TLBI_ALLE2:
+ case OP_TLBI_ALLE2IS:
+ case OP_TLBI_ALLE2OS:
+ case OP_TLBI_VMALLE1:
+ case OP_TLBI_VMALLE1IS:
+ case OP_TLBI_VMALLE1OS:
+ case OP_TLBI_ALLE2NXS:
+ case OP_TLBI_ALLE2ISNXS:
+ case OP_TLBI_ALLE2OSNXS:
+ case OP_TLBI_VMALLE1NXS:
+ case OP_TLBI_VMALLE1ISNXS:
+ case OP_TLBI_VMALLE1OSNXS:
+ __tlbi(vmalle1is);
+ break;
+ case OP_TLBI_VAE2:
+ case OP_TLBI_VAE2IS:
+ case OP_TLBI_VAE2OS:
+ case OP_TLBI_VAE1:
+ case OP_TLBI_VAE1IS:
+ case OP_TLBI_VAE1OS:
+ case OP_TLBI_VAE2NXS:
+ case OP_TLBI_VAE2ISNXS:
+ case OP_TLBI_VAE2OSNXS:
+ case OP_TLBI_VAE1NXS:
+ case OP_TLBI_VAE1ISNXS:
+ case OP_TLBI_VAE1OSNXS:
+ __tlbi(vae1is, va);
+ break;
+ case OP_TLBI_VALE2:
+ case OP_TLBI_VALE2IS:
+ case OP_TLBI_VALE2OS:
+ case OP_TLBI_VALE1:
+ case OP_TLBI_VALE1IS:
+ case OP_TLBI_VALE1OS:
+ case OP_TLBI_VALE2NXS:
+ case OP_TLBI_VALE2ISNXS:
+ case OP_TLBI_VALE2OSNXS:
+ case OP_TLBI_VALE1NXS:
+ case OP_TLBI_VALE1ISNXS:
+ case OP_TLBI_VALE1OSNXS:
+ __tlbi(vale1is, va);
+ break;
+ case OP_TLBI_ASIDE1:
+ case OP_TLBI_ASIDE1IS:
+ case OP_TLBI_ASIDE1OS:
+ case OP_TLBI_ASIDE1NXS:
+ case OP_TLBI_ASIDE1ISNXS:
+ case OP_TLBI_ASIDE1OSNXS:
+ __tlbi(aside1is, va);
+ break;
+ case OP_TLBI_VAAE1:
+ case OP_TLBI_VAAE1IS:
+ case OP_TLBI_VAAE1OS:
+ case OP_TLBI_VAAE1NXS:
+ case OP_TLBI_VAAE1ISNXS:
+ case OP_TLBI_VAAE1OSNXS:
+ __tlbi(vaae1is, va);
+ break;
+ case OP_TLBI_VAALE1:
+ case OP_TLBI_VAALE1IS:
+ case OP_TLBI_VAALE1OS:
+ case OP_TLBI_VAALE1NXS:
+ case OP_TLBI_VAALE1ISNXS:
+ case OP_TLBI_VAALE1OSNXS:
+ __tlbi(vaale1is, va);
+ break;
+ case OP_TLBI_RVAE2:
+ case OP_TLBI_RVAE2IS:
+ case OP_TLBI_RVAE2OS:
+ case OP_TLBI_RVAE1:
+ case OP_TLBI_RVAE1IS:
+ case OP_TLBI_RVAE1OS:
+ case OP_TLBI_RVAE2NXS:
+ case OP_TLBI_RVAE2ISNXS:
+ case OP_TLBI_RVAE2OSNXS:
+ case OP_TLBI_RVAE1NXS:
+ case OP_TLBI_RVAE1ISNXS:
+ case OP_TLBI_RVAE1OSNXS:
+ __tlbi(rvae1is, va);
+ break;
+ case OP_TLBI_RVALE2:
+ case OP_TLBI_RVALE2IS:
+ case OP_TLBI_RVALE2OS:
+ case OP_TLBI_RVALE1:
+ case OP_TLBI_RVALE1IS:
+ case OP_TLBI_RVALE1OS:
+ case OP_TLBI_RVALE2NXS:
+ case OP_TLBI_RVALE2ISNXS:
+ case OP_TLBI_RVALE2OSNXS:
+ case OP_TLBI_RVALE1NXS:
+ case OP_TLBI_RVALE1ISNXS:
+ case OP_TLBI_RVALE1OSNXS:
+ __tlbi(rvale1is, va);
+ break;
+ case OP_TLBI_RVAAE1:
+ case OP_TLBI_RVAAE1IS:
+ case OP_TLBI_RVAAE1OS:
+ case OP_TLBI_RVAAE1NXS:
+ case OP_TLBI_RVAAE1ISNXS:
+ case OP_TLBI_RVAAE1OSNXS:
+ __tlbi(rvaae1is, va);
+ break;
+ case OP_TLBI_RVAALE1:
+ case OP_TLBI_RVAALE1IS:
+ case OP_TLBI_RVAALE1OS:
+ case OP_TLBI_RVAALE1NXS:
+ case OP_TLBI_RVAALE1ISNXS:
+ case OP_TLBI_RVAALE1OSNXS:
+ __tlbi(rvaale1is, va);
+ break;
+ default:
+ ret = -EINVAL;
+ }
dsb(ish);
+ isb();
+
+ if (mmu)
+ exit_vmid_context(&cxt);
+
+ return ret;
}
diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c
index c9f401fa01a9..58c5fe7d7572 100644
--- a/arch/arm64/kvm/hypercalls.c
+++ b/arch/arm64/kvm/hypercalls.c
@@ -15,6 +15,8 @@
GENMASK(KVM_REG_ARM_STD_HYP_BMAP_BIT_COUNT - 1, 0)
#define KVM_ARM_SMCCC_VENDOR_HYP_FEATURES \
GENMASK(KVM_REG_ARM_VENDOR_HYP_BMAP_BIT_COUNT - 1, 0)
+#define KVM_ARM_SMCCC_VENDOR_HYP_FEATURES_2 \
+ GENMASK(KVM_REG_ARM_VENDOR_HYP_BMAP_2_BIT_COUNT - 1, 0)
static void kvm_ptp_get_time(struct kvm_vcpu *vcpu, u64 *val)
{
@@ -44,10 +46,10 @@ static void kvm_ptp_get_time(struct kvm_vcpu *vcpu, u64 *val)
feature = smccc_get_arg1(vcpu);
switch (feature) {
case KVM_PTP_VIRT_COUNTER:
- cycles = systime_snapshot.cycles - vcpu_read_sys_reg(vcpu, CNTVOFF_EL2);
+ cycles = systime_snapshot.cycles - vcpu->kvm->arch.timer_data.voffset;
break;
case KVM_PTP_PHYS_COUNTER:
- cycles = systime_snapshot.cycles;
+ cycles = systime_snapshot.cycles - vcpu->kvm->arch.timer_data.poffset;
break;
default:
return;
@@ -65,7 +67,7 @@ static void kvm_ptp_get_time(struct kvm_vcpu *vcpu, u64 *val)
val[3] = lower_32_bits(cycles);
}
-static bool kvm_hvc_call_default_allowed(u32 func_id)
+static bool kvm_smccc_default_allowed(u32 func_id)
{
switch (func_id) {
/*
@@ -93,7 +95,7 @@ static bool kvm_hvc_call_default_allowed(u32 func_id)
}
}
-static bool kvm_hvc_call_allowed(struct kvm_vcpu *vcpu, u32 func_id)
+static bool kvm_smccc_test_fw_bmap(struct kvm_vcpu *vcpu, u32 func_id)
{
struct kvm_smccc_features *smccc_feat = &vcpu->kvm->arch.smccc_feat;
@@ -117,20 +119,172 @@ static bool kvm_hvc_call_allowed(struct kvm_vcpu *vcpu, u32 func_id)
return test_bit(KVM_REG_ARM_VENDOR_HYP_BIT_PTP,
&smccc_feat->vendor_hyp_bmap);
default:
- return kvm_hvc_call_default_allowed(func_id);
+ return false;
}
}
-int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
+#define SMC32_ARCH_RANGE_BEGIN ARM_SMCCC_VERSION_FUNC_ID
+#define SMC32_ARCH_RANGE_END ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \
+ ARM_SMCCC_SMC_32, \
+ 0, ARM_SMCCC_FUNC_MASK)
+
+#define SMC64_ARCH_RANGE_BEGIN ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \
+ ARM_SMCCC_SMC_64, \
+ 0, 0)
+#define SMC64_ARCH_RANGE_END ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \
+ ARM_SMCCC_SMC_64, \
+ 0, ARM_SMCCC_FUNC_MASK)
+
+static int kvm_smccc_filter_insert_reserved(struct kvm *kvm)
+{
+ int r;
+
+ /*
+ * Prevent userspace from handling any SMCCC calls in the architecture
+ * range, avoiding the risk of misrepresenting Spectre mitigation status
+ * to the guest.
+ */
+ r = mtree_insert_range(&kvm->arch.smccc_filter,
+ SMC32_ARCH_RANGE_BEGIN, SMC32_ARCH_RANGE_END,
+ xa_mk_value(KVM_SMCCC_FILTER_HANDLE),
+ GFP_KERNEL_ACCOUNT);
+ if (r)
+ goto out_destroy;
+
+ r = mtree_insert_range(&kvm->arch.smccc_filter,
+ SMC64_ARCH_RANGE_BEGIN, SMC64_ARCH_RANGE_END,
+ xa_mk_value(KVM_SMCCC_FILTER_HANDLE),
+ GFP_KERNEL_ACCOUNT);
+ if (r)
+ goto out_destroy;
+
+ return 0;
+out_destroy:
+ mtree_destroy(&kvm->arch.smccc_filter);
+ return r;
+}
+
+static bool kvm_smccc_filter_configured(struct kvm *kvm)
+{
+ return !mtree_empty(&kvm->arch.smccc_filter);
+}
+
+static int kvm_smccc_set_filter(struct kvm *kvm, struct kvm_smccc_filter __user *uaddr)
+{
+ const void *zero_page = page_to_virt(ZERO_PAGE(0));
+ struct kvm_smccc_filter filter;
+ u32 start, end;
+ int r;
+
+ if (copy_from_user(&filter, uaddr, sizeof(filter)))
+ return -EFAULT;
+
+ if (memcmp(filter.pad, zero_page, sizeof(filter.pad)))
+ return -EINVAL;
+
+ start = filter.base;
+ end = start + filter.nr_functions - 1;
+
+ if (end < start || filter.action >= NR_SMCCC_FILTER_ACTIONS)
+ return -EINVAL;
+
+ mutex_lock(&kvm->arch.config_lock);
+
+ if (kvm_vm_has_ran_once(kvm)) {
+ r = -EBUSY;
+ goto out_unlock;
+ }
+
+ if (!kvm_smccc_filter_configured(kvm)) {
+ r = kvm_smccc_filter_insert_reserved(kvm);
+ if (WARN_ON_ONCE(r))
+ goto out_unlock;
+ }
+
+ r = mtree_insert_range(&kvm->arch.smccc_filter, start, end,
+ xa_mk_value(filter.action), GFP_KERNEL_ACCOUNT);
+out_unlock:
+ mutex_unlock(&kvm->arch.config_lock);
+ return r;
+}
+
+static u8 kvm_smccc_filter_get_action(struct kvm *kvm, u32 func_id)
+{
+ unsigned long idx = func_id;
+ void *val;
+
+ if (!kvm_smccc_filter_configured(kvm))
+ return KVM_SMCCC_FILTER_HANDLE;
+
+ /*
+ * But where's the error handling, you say?
+ *
+ * mt_find() returns NULL if no entry was found, which just so happens
+ * to match KVM_SMCCC_FILTER_HANDLE.
+ */
+ val = mt_find(&kvm->arch.smccc_filter, &idx, idx);
+ return xa_to_value(val);
+}
+
+static u8 kvm_smccc_get_action(struct kvm_vcpu *vcpu, u32 func_id)
+{
+ /*
+ * Intervening actions in the SMCCC filter take precedence over the
+ * pseudo-firmware register bitmaps.
+ */
+ u8 action = kvm_smccc_filter_get_action(vcpu->kvm, func_id);
+ if (action != KVM_SMCCC_FILTER_HANDLE)
+ return action;
+
+ if (kvm_smccc_test_fw_bmap(vcpu, func_id) ||
+ kvm_smccc_default_allowed(func_id))
+ return KVM_SMCCC_FILTER_HANDLE;
+
+ return KVM_SMCCC_FILTER_DENY;
+}
+
+static void kvm_prepare_hypercall_exit(struct kvm_vcpu *vcpu, u32 func_id)
+{
+ u8 ec = ESR_ELx_EC(kvm_vcpu_get_esr(vcpu));
+ struct kvm_run *run = vcpu->run;
+ u64 flags = 0;
+
+ if (ec == ESR_ELx_EC_SMC32 || ec == ESR_ELx_EC_SMC64)
+ flags |= KVM_HYPERCALL_EXIT_SMC;
+
+ if (!kvm_vcpu_trap_il_is32bit(vcpu))
+ flags |= KVM_HYPERCALL_EXIT_16BIT;
+
+ run->exit_reason = KVM_EXIT_HYPERCALL;
+ run->hypercall = (typeof(run->hypercall)) {
+ .nr = func_id,
+ .flags = flags,
+ };
+}
+
+int kvm_smccc_call_handler(struct kvm_vcpu *vcpu)
{
struct kvm_smccc_features *smccc_feat = &vcpu->kvm->arch.smccc_feat;
u32 func_id = smccc_get_function(vcpu);
u64 val[4] = {SMCCC_RET_NOT_SUPPORTED};
u32 feature;
+ u8 action;
gpa_t gpa;
+ uuid_t uuid;
- if (!kvm_hvc_call_allowed(vcpu, func_id))
+ action = kvm_smccc_get_action(vcpu, func_id);
+ switch (action) {
+ case KVM_SMCCC_FILTER_HANDLE:
+ break;
+ case KVM_SMCCC_FILTER_DENY:
goto out;
+ case KVM_SMCCC_FILTER_FWD_TO_USER:
+ kvm_prepare_hypercall_exit(vcpu, func_id);
+ return 0;
+ default:
+ WARN_RATELIMIT(1, "Unhandled SMCCC filter action: %d\n", action);
+ goto out;
+ }
switch (func_id) {
case ARM_SMCCC_VERSION_FUNC_ID:
@@ -166,7 +320,7 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
* to the guest, and hide SSBS so that the
* guest stays protected.
*/
- if (cpus_have_final_cap(ARM64_SSBS))
+ if (kvm_has_feat(vcpu->kvm, ID_AA64PFR1_EL1, SSBS, IMP))
break;
fallthrough;
case SPECTRE_UNAFFECTED:
@@ -198,17 +352,20 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
break;
case ARM_SMCCC_HV_PV_TIME_ST:
gpa = kvm_init_stolen_time(vcpu);
- if (gpa != GPA_INVALID)
+ if (gpa != INVALID_GPA)
val[0] = gpa;
break;
case ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID:
- val[0] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_0;
- val[1] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_1;
- val[2] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_2;
- val[3] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_3;
+ uuid = ARM_SMCCC_VENDOR_HYP_UID_KVM;
+ val[0] = smccc_uuid_to_reg(&uuid, 0);
+ val[1] = smccc_uuid_to_reg(&uuid, 1);
+ val[2] = smccc_uuid_to_reg(&uuid, 2);
+ val[3] = smccc_uuid_to_reg(&uuid, 3);
break;
case ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID:
val[0] = smccc_feat->vendor_hyp_bmap;
+ /* Function numbers 2-63 are reserved for pKVM for now */
+ val[2] = smccc_feat->vendor_hyp_bmap_2;
break;
case ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID:
kvm_ptp_get_time(vcpu, val);
@@ -236,6 +393,7 @@ static const u64 kvm_arm_fw_reg_ids[] = {
KVM_REG_ARM_STD_BMAP,
KVM_REG_ARM_STD_HYP_BMAP,
KVM_REG_ARM_VENDOR_HYP_BMAP,
+ KVM_REG_ARM_VENDOR_HYP_BMAP_2,
};
void kvm_arm_init_hypercalls(struct kvm *kvm)
@@ -245,6 +403,13 @@ void kvm_arm_init_hypercalls(struct kvm *kvm)
smccc_feat->std_bmap = KVM_ARM_SMCCC_STD_FEATURES;
smccc_feat->std_hyp_bmap = KVM_ARM_SMCCC_STD_HYP_FEATURES;
smccc_feat->vendor_hyp_bmap = KVM_ARM_SMCCC_VENDOR_HYP_FEATURES;
+
+ mt_init(&kvm->arch.smccc_filter);
+}
+
+void kvm_arm_teardown_hypercalls(struct kvm *kvm)
+{
+ mtree_destroy(&kvm->arch.smccc_filter);
}
int kvm_arm_get_fw_num_regs(struct kvm_vcpu *vcpu)
@@ -270,7 +435,7 @@ int kvm_arm_copy_fw_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices)
* Convert the workaround level into an easy-to-compare number, where higher
* values mean better protection.
*/
-static int get_kernel_wa_level(u64 regid)
+static int get_kernel_wa_level(struct kvm_vcpu *vcpu, u64 regid)
{
switch (regid) {
case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1:
@@ -291,7 +456,7 @@ static int get_kernel_wa_level(u64 regid)
* don't have any FW mitigation if SSBS is there at
* all times.
*/
- if (cpus_have_final_cap(ARM64_SSBS))
+ if (kvm_has_feat(vcpu->kvm, ID_AA64PFR1_EL1, SSBS, IMP))
return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL;
fallthrough;
case SPECTRE_UNAFFECTED:
@@ -328,7 +493,7 @@ int kvm_arm_get_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1:
case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2:
case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3:
- val = get_kernel_wa_level(reg->id) & KVM_REG_FEATURE_LEVEL_MASK;
+ val = get_kernel_wa_level(vcpu, reg->id) & KVM_REG_FEATURE_LEVEL_MASK;
break;
case KVM_REG_ARM_STD_BMAP:
val = READ_ONCE(smccc_feat->std_bmap);
@@ -339,6 +504,9 @@ int kvm_arm_get_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
case KVM_REG_ARM_VENDOR_HYP_BMAP:
val = READ_ONCE(smccc_feat->vendor_hyp_bmap);
break;
+ case KVM_REG_ARM_VENDOR_HYP_BMAP_2:
+ val = READ_ONCE(smccc_feat->vendor_hyp_bmap_2);
+ break;
default:
return -ENOENT;
}
@@ -369,6 +537,10 @@ static int kvm_arm_set_fw_reg_bmap(struct kvm_vcpu *vcpu, u64 reg_id, u64 val)
fw_reg_bmap = &smccc_feat->vendor_hyp_bmap;
fw_reg_features = KVM_ARM_SMCCC_VENDOR_HYP_FEATURES;
break;
+ case KVM_REG_ARM_VENDOR_HYP_BMAP_2:
+ fw_reg_bmap = &smccc_feat->vendor_hyp_bmap_2;
+ fw_reg_features = KVM_ARM_SMCCC_VENDOR_HYP_FEATURES_2;
+ break;
default:
return -ENOENT;
}
@@ -377,17 +549,16 @@ static int kvm_arm_set_fw_reg_bmap(struct kvm_vcpu *vcpu, u64 reg_id, u64 val)
if (val & ~fw_reg_features)
return -EINVAL;
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.config_lock);
- if (test_bit(KVM_ARCH_FLAG_HAS_RAN_ONCE, &kvm->arch.flags) &&
- val != *fw_reg_bmap) {
+ if (kvm_vm_has_ran_once(kvm) && val != *fw_reg_bmap) {
ret = -EBUSY;
goto out;
}
WRITE_ONCE(*fw_reg_bmap, val);
out:
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.config_lock);
return ret;
}
@@ -397,6 +568,8 @@ int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
u64 val;
int wa_level;
+ if (KVM_REG_SIZE(reg->id) != sizeof(val))
+ return -ENOENT;
if (copy_from_user(&val, uaddr, KVM_REG_SIZE(reg->id)))
return -EFAULT;
@@ -405,7 +578,7 @@ int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
{
bool wants_02;
- wants_02 = test_bit(KVM_ARM_VCPU_PSCI_0_2, vcpu->arch.features);
+ wants_02 = vcpu_has_feature(vcpu, KVM_ARM_VCPU_PSCI_0_2);
switch (val) {
case KVM_ARM_PSCI_0_1:
@@ -416,6 +589,8 @@ int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
case KVM_ARM_PSCI_0_2:
case KVM_ARM_PSCI_1_0:
case KVM_ARM_PSCI_1_1:
+ case KVM_ARM_PSCI_1_2:
+ case KVM_ARM_PSCI_1_3:
if (!wants_02)
return -EINVAL;
vcpu->kvm->arch.psci_version = val;
@@ -429,7 +604,7 @@ int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
if (val & ~KVM_REG_FEATURE_LEVEL_MASK)
return -EINVAL;
- if (get_kernel_wa_level(reg->id) < val)
+ if (get_kernel_wa_level(vcpu, reg->id) < val)
return -EINVAL;
return 0;
@@ -465,13 +640,14 @@ int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
* We can deal with NOT_AVAIL on NOT_REQUIRED, but not the
* other way around.
*/
- if (get_kernel_wa_level(reg->id) < wa_level)
+ if (get_kernel_wa_level(vcpu, reg->id) < wa_level)
return -EINVAL;
return 0;
case KVM_REG_ARM_STD_BMAP:
case KVM_REG_ARM_STD_HYP_BMAP:
case KVM_REG_ARM_VENDOR_HYP_BMAP:
+ case KVM_REG_ARM_VENDOR_HYP_BMAP_2:
return kvm_arm_set_fw_reg_bmap(vcpu, reg->id, val);
default:
return -ENOENT;
@@ -479,3 +655,25 @@ int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
return -EINVAL;
}
+
+int kvm_vm_smccc_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
+{
+ switch (attr->attr) {
+ case KVM_ARM_VM_SMCCC_FILTER:
+ return 0;
+ default:
+ return -ENXIO;
+ }
+}
+
+int kvm_vm_smccc_set_attr(struct kvm *kvm, struct kvm_device_attr *attr)
+{
+ void __user *uaddr = (void __user *)attr->addr;
+
+ switch (attr->attr) {
+ case KVM_ARM_VM_SMCCC_FILTER:
+ return kvm_smccc_set_filter(kvm, uaddr);
+ default:
+ return -ENXIO;
+ }
+}
diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c
index f32f4a2a347f..dfcd66c65517 100644
--- a/arch/arm64/kvm/inject_fault.c
+++ b/arch/arm64/kvm/inject_fault.c
@@ -12,17 +12,130 @@
#include <linux/kvm_host.h>
#include <asm/kvm_emulate.h>
+#include <asm/kvm_nested.h>
#include <asm/esr.h>
+static unsigned int exception_target_el(struct kvm_vcpu *vcpu)
+{
+ /* If not nesting, EL1 is the only possible exception target */
+ if (likely(!vcpu_has_nv(vcpu)))
+ return PSR_MODE_EL1h;
+
+ /*
+ * With NV, we need to pick between EL1 and EL2. Note that we
+ * never deal with a nesting exception here, hence never
+ * changing context, and the exception itself can be delayed
+ * until the next entry.
+ */
+ switch(*vcpu_cpsr(vcpu) & PSR_MODE_MASK) {
+ case PSR_MODE_EL2h:
+ case PSR_MODE_EL2t:
+ return PSR_MODE_EL2h;
+ case PSR_MODE_EL1h:
+ case PSR_MODE_EL1t:
+ return PSR_MODE_EL1h;
+ case PSR_MODE_EL0t:
+ return vcpu_el2_tge_is_set(vcpu) ? PSR_MODE_EL2h : PSR_MODE_EL1h;
+ default:
+ BUG();
+ }
+}
+
+static enum vcpu_sysreg exception_esr_elx(struct kvm_vcpu *vcpu)
+{
+ if (exception_target_el(vcpu) == PSR_MODE_EL2h)
+ return ESR_EL2;
+
+ return ESR_EL1;
+}
+
+static enum vcpu_sysreg exception_far_elx(struct kvm_vcpu *vcpu)
+{
+ if (exception_target_el(vcpu) == PSR_MODE_EL2h)
+ return FAR_EL2;
+
+ return FAR_EL1;
+}
+
+static void pend_sync_exception(struct kvm_vcpu *vcpu)
+{
+ if (exception_target_el(vcpu) == PSR_MODE_EL1h)
+ kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC);
+ else
+ kvm_pend_exception(vcpu, EXCEPT_AA64_EL2_SYNC);
+}
+
+static void pend_serror_exception(struct kvm_vcpu *vcpu)
+{
+ if (exception_target_el(vcpu) == PSR_MODE_EL1h)
+ kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SERR);
+ else
+ kvm_pend_exception(vcpu, EXCEPT_AA64_EL2_SERR);
+}
+
+static bool __effective_sctlr2_bit(struct kvm_vcpu *vcpu, unsigned int idx)
+{
+ u64 sctlr2;
+
+ if (!kvm_has_sctlr2(vcpu->kvm))
+ return false;
+
+ if (is_nested_ctxt(vcpu) &&
+ !(__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_SCTLR2En))
+ return false;
+
+ if (exception_target_el(vcpu) == PSR_MODE_EL1h)
+ sctlr2 = vcpu_read_sys_reg(vcpu, SCTLR2_EL1);
+ else
+ sctlr2 = vcpu_read_sys_reg(vcpu, SCTLR2_EL2);
+
+ return sctlr2 & BIT(idx);
+}
+
+static bool effective_sctlr2_ease(struct kvm_vcpu *vcpu)
+{
+ return __effective_sctlr2_bit(vcpu, SCTLR2_EL1_EASE_SHIFT);
+}
+
+static bool effective_sctlr2_nmea(struct kvm_vcpu *vcpu)
+{
+ return __effective_sctlr2_bit(vcpu, SCTLR2_EL1_NMEA_SHIFT);
+}
+
static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr)
{
unsigned long cpsr = *vcpu_cpsr(vcpu);
bool is_aarch32 = vcpu_mode_is_32bit(vcpu);
- u64 esr = 0;
+ u64 esr = 0, fsc;
+ int level;
- kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC);
+ /*
+ * If injecting an abort from a failed S1PTW, rewalk the S1 PTs to
+ * find the failing level. If we can't find it, assume the error was
+ * transient and restart without changing the state.
+ */
+ if (kvm_vcpu_abt_iss1tw(vcpu)) {
+ u64 hpfar = kvm_vcpu_get_fault_ipa(vcpu);
+ int ret;
+
+ if (hpfar == INVALID_GPA)
+ return;
+
+ ret = __kvm_find_s1_desc_level(vcpu, addr, hpfar, &level);
+ if (ret)
+ return;
+
+ WARN_ON_ONCE(level < -1 || level > 3);
+ fsc = ESR_ELx_FSC_SEA_TTW(level);
+ } else {
+ fsc = ESR_ELx_FSC_EXTABT;
+ }
- vcpu_write_sys_reg(vcpu, addr, FAR_EL1);
+ /* This delight is brought to you by FEAT_DoubleFault2. */
+ if (effective_sctlr2_ease(vcpu))
+ pend_serror_exception(vcpu);
+ else
+ pend_sync_exception(vcpu);
/*
* Build an {i,d}abort, depending on the level and the
@@ -43,14 +156,17 @@ static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr
if (!is_iabt)
esr |= ESR_ELx_EC_DABT_LOW << ESR_ELx_EC_SHIFT;
- vcpu_write_sys_reg(vcpu, esr | ESR_ELx_FSC_EXTABT, ESR_EL1);
+ esr |= fsc;
+
+ vcpu_write_sys_reg(vcpu, addr, exception_far_elx(vcpu));
+ vcpu_write_sys_reg(vcpu, esr, exception_esr_elx(vcpu));
}
static void inject_undef64(struct kvm_vcpu *vcpu)
{
u64 esr = (ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT);
- kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC);
+ pend_sync_exception(vcpu);
/*
* Build an unknown exception, depending on the instruction
@@ -59,7 +175,7 @@ static void inject_undef64(struct kvm_vcpu *vcpu)
if (kvm_vcpu_trap_il_is32bit(vcpu))
esr |= ESR_ELx_IL;
- vcpu_write_sys_reg(vcpu, esr, ESR_EL1);
+ vcpu_write_sys_reg(vcpu, esr, exception_esr_elx(vcpu));
}
#define DFSR_FSC_EXTABT_LPAE 0x10
@@ -85,7 +201,7 @@ static void inject_abt32(struct kvm_vcpu *vcpu, bool is_pabt, u32 addr)
if (vcpu_read_sys_reg(vcpu, TCR_EL1) & TTBCR_EAE) {
fsr = DFSR_LPAE | DFSR_FSC_EXTABT_LPAE;
} else {
- /* no need to shuffle FS[4] into DFSR[10] as its 0 */
+ /* no need to shuffle FS[4] into DFSR[10] as it's 0 */
fsr = DFSR_FSC_EXTABT_nLPAE;
}
@@ -106,36 +222,35 @@ static void inject_abt32(struct kvm_vcpu *vcpu, bool is_pabt, u32 addr)
vcpu_write_sys_reg(vcpu, far, FAR_EL1);
}
-/**
- * kvm_inject_dabt - inject a data abort into the guest
- * @vcpu: The VCPU to receive the data abort
- * @addr: The address to report in the DFAR
- *
- * It is assumed that this code is called from the VCPU thread and that the
- * VCPU therefore is not currently executing guest code.
- */
-void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr)
+static void __kvm_inject_sea(struct kvm_vcpu *vcpu, bool iabt, u64 addr)
{
if (vcpu_el1_is_32bit(vcpu))
- inject_abt32(vcpu, false, addr);
+ inject_abt32(vcpu, iabt, addr);
else
- inject_abt64(vcpu, false, addr);
+ inject_abt64(vcpu, iabt, addr);
}
-/**
- * kvm_inject_pabt - inject a prefetch abort into the guest
- * @vcpu: The VCPU to receive the prefetch abort
- * @addr: The address to report in the DFAR
- *
- * It is assumed that this code is called from the VCPU thread and that the
- * VCPU therefore is not currently executing guest code.
- */
-void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr)
+static bool kvm_sea_target_is_el2(struct kvm_vcpu *vcpu)
{
- if (vcpu_el1_is_32bit(vcpu))
- inject_abt32(vcpu, true, addr);
- else
- inject_abt64(vcpu, true, addr);
+ if (__vcpu_sys_reg(vcpu, HCR_EL2) & (HCR_TGE | HCR_TEA))
+ return true;
+
+ if (!vcpu_mode_priv(vcpu))
+ return false;
+
+ return (*vcpu_cpsr(vcpu) & PSR_A_BIT) &&
+ (__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TMEA);
+}
+
+int kvm_inject_sea(struct kvm_vcpu *vcpu, bool iabt, u64 addr)
+{
+ lockdep_assert_held(&vcpu->mutex);
+
+ if (is_nested_ctxt(vcpu) && kvm_sea_target_is_el2(vcpu))
+ return kvm_inject_nested_sea(vcpu, iabt, addr);
+
+ __kvm_inject_sea(vcpu, iabt, addr);
+ return 1;
}
void kvm_inject_size_fault(struct kvm_vcpu *vcpu)
@@ -145,25 +260,22 @@ void kvm_inject_size_fault(struct kvm_vcpu *vcpu)
addr = kvm_vcpu_get_fault_ipa(vcpu);
addr |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);
- if (kvm_vcpu_trap_is_iabt(vcpu))
- kvm_inject_pabt(vcpu, addr);
- else
- kvm_inject_dabt(vcpu, addr);
+ __kvm_inject_sea(vcpu, kvm_vcpu_trap_is_iabt(vcpu), addr);
/*
* If AArch64 or LPAE, set FSC to 0 to indicate an Address
* Size Fault at level 0, as if exceeding PARange.
*
* Non-LPAE guests will only get the external abort, as there
- * is no way to to describe the ASF.
+ * is no way to describe the ASF.
*/
if (vcpu_el1_is_32bit(vcpu) &&
!(vcpu_read_sys_reg(vcpu, TCR_EL1) & TTBCR_EAE))
return;
- esr = vcpu_read_sys_reg(vcpu, ESR_EL1);
+ esr = vcpu_read_sys_reg(vcpu, exception_esr_elx(vcpu));
esr &= ~GENMASK_ULL(5, 0);
- vcpu_write_sys_reg(vcpu, esr, ESR_EL1);
+ vcpu_write_sys_reg(vcpu, esr, exception_esr_elx(vcpu));
}
/**
@@ -181,25 +293,70 @@ void kvm_inject_undefined(struct kvm_vcpu *vcpu)
inject_undef64(vcpu);
}
-void kvm_set_sei_esr(struct kvm_vcpu *vcpu, u64 esr)
+static bool serror_is_masked(struct kvm_vcpu *vcpu)
{
- vcpu_set_vsesr(vcpu, esr & ESR_ELx_ISS_MASK);
- *vcpu_hcr(vcpu) |= HCR_VSE;
+ return (*vcpu_cpsr(vcpu) & PSR_A_BIT) && !effective_sctlr2_nmea(vcpu);
}
-/**
- * kvm_inject_vabt - inject an async abort / SError into the guest
- * @vcpu: The VCPU to receive the exception
- *
- * It is assumed that this code is called from the VCPU thread and that the
- * VCPU therefore is not currently executing guest code.
- *
- * Systems with the RAS Extensions specify an imp-def ESR (ISV/IDS = 1) with
- * the remaining ISS all-zeros so that this error is not interpreted as an
- * uncategorized RAS error. Without the RAS Extensions we can't specify an ESR
- * value, so the CPU generates an imp-def value.
- */
-void kvm_inject_vabt(struct kvm_vcpu *vcpu)
+static bool kvm_serror_target_is_el2(struct kvm_vcpu *vcpu)
{
- kvm_set_sei_esr(vcpu, ESR_ELx_ISV);
+ if (is_hyp_ctxt(vcpu) || vcpu_el2_amo_is_set(vcpu))
+ return true;
+
+ if (!(__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TMEA))
+ return false;
+
+ /*
+ * In another example where FEAT_DoubleFault2 is entirely backwards,
+ * "masked" as it relates to the routing effects of HCRX_EL2.TMEA
+ * doesn't consider SCTLR2_EL1.NMEA. That is to say, even if EL1 asked
+ * for non-maskable SErrors, the EL2 bit takes priority if A is set.
+ */
+ if (vcpu_mode_priv(vcpu))
+ return *vcpu_cpsr(vcpu) & PSR_A_BIT;
+
+ /*
+ * Otherwise SErrors are considered unmasked when taken from EL0 and
+ * NMEA is set.
+ */
+ return serror_is_masked(vcpu);
+}
+
+static bool kvm_serror_undeliverable_at_el2(struct kvm_vcpu *vcpu)
+{
+ return !(vcpu_el2_tge_is_set(vcpu) || vcpu_el2_amo_is_set(vcpu));
+}
+
+int kvm_inject_serror_esr(struct kvm_vcpu *vcpu, u64 esr)
+{
+ lockdep_assert_held(&vcpu->mutex);
+
+ if (is_nested_ctxt(vcpu) && kvm_serror_target_is_el2(vcpu))
+ return kvm_inject_nested_serror(vcpu, esr);
+
+ if (vcpu_is_el2(vcpu) && kvm_serror_undeliverable_at_el2(vcpu)) {
+ vcpu_set_vsesr(vcpu, esr);
+ vcpu_set_flag(vcpu, NESTED_SERROR_PENDING);
+ return 1;
+ }
+
+ /*
+ * Emulate the exception entry if SErrors are unmasked. This is useful if
+ * the vCPU is in a nested context w/ vSErrors enabled then we've already
+ * delegated he hardware vSError context (i.e. HCR_EL2.VSE, VSESR_EL2,
+ * VDISR_EL2) to the guest hypervisor.
+ *
+ * As we're emulating the SError injection we need to explicitly populate
+ * ESR_ELx.EC because hardware will not do it on our behalf.
+ */
+ if (!serror_is_masked(vcpu)) {
+ pend_serror_exception(vcpu);
+ esr |= FIELD_PREP(ESR_ELx_EC_MASK, ESR_ELx_EC_SERROR);
+ vcpu_write_sys_reg(vcpu, esr, exception_esr_elx(vcpu));
+ return 1;
+ }
+
+ vcpu_set_vsesr(vcpu, esr & ESR_ELx_ISS_MASK);
+ *vcpu_hcr(vcpu) |= HCR_VSE;
+ return 1;
}
diff --git a/arch/arm64/kvm/mmio.c b/arch/arm64/kvm/mmio.c
index 3dd38a151d2a..54f9358c9e0e 100644
--- a/arch/arm64/kvm/mmio.c
+++ b/arch/arm64/kvm/mmio.c
@@ -72,6 +72,33 @@ unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len)
return data;
}
+static bool kvm_pending_external_abort(struct kvm_vcpu *vcpu)
+{
+ if (!vcpu_get_flag(vcpu, PENDING_EXCEPTION))
+ return false;
+
+ if (vcpu_el1_is_32bit(vcpu)) {
+ switch (vcpu_get_flag(vcpu, EXCEPT_MASK)) {
+ case unpack_vcpu_flag(EXCEPT_AA32_UND):
+ case unpack_vcpu_flag(EXCEPT_AA32_IABT):
+ case unpack_vcpu_flag(EXCEPT_AA32_DABT):
+ return true;
+ default:
+ return false;
+ }
+ } else {
+ switch (vcpu_get_flag(vcpu, EXCEPT_MASK)) {
+ case unpack_vcpu_flag(EXCEPT_AA64_EL1_SYNC):
+ case unpack_vcpu_flag(EXCEPT_AA64_EL2_SYNC):
+ case unpack_vcpu_flag(EXCEPT_AA64_EL1_SERR):
+ case unpack_vcpu_flag(EXCEPT_AA64_EL2_SERR):
+ return true;
+ default:
+ return false;
+ }
+ }
+}
+
/**
* kvm_handle_mmio_return -- Handle MMIO loads after user space emulation
* or in-kernel IO emulation
@@ -84,9 +111,12 @@ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu)
unsigned int len;
int mask;
- /* Detect an already handled MMIO return */
- if (unlikely(!vcpu->mmio_needed))
- return 0;
+ /*
+ * Detect if the MMIO return was already handled or if userspace aborted
+ * the MMIO access.
+ */
+ if (unlikely(!vcpu->mmio_needed || kvm_pending_external_abort(vcpu)))
+ return 1;
vcpu->mmio_needed = 0;
@@ -117,7 +147,7 @@ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu)
*/
kvm_incr_pc(vcpu);
- return 0;
+ return 1;
}
int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
@@ -133,8 +163,17 @@ int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
/*
* No valid syndrome? Ask userspace for help if it has
* volunteered to do so, and bail out otherwise.
+ *
+ * In the protected VM case, there isn't much userspace can do
+ * though, so directly deliver an exception to the guest.
*/
if (!kvm_vcpu_dabt_isvalid(vcpu)) {
+ trace_kvm_mmio_nisv(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu),
+ kvm_vcpu_get_hfar(vcpu), fault_ipa);
+
+ if (vcpu_is_protected(vcpu))
+ return kvm_inject_sea_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
+
if (test_bit(KVM_ARCH_FLAG_RETURN_NISV_IO_ABORT_TO_USER,
&vcpu->kvm->arch.flags)) {
run->exit_reason = KVM_EXIT_ARM_NISV;
@@ -143,7 +182,6 @@ int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
return 0;
}
- kvm_pr_unimpl("Data abort outside memslots with no valid syndrome info\n");
return -ENOSYS;
}
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index a3ee3b605c9b..48d7c372a4cd 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -4,18 +4,20 @@
* Author: Christoffer Dall <c.dall@virtualopensystems.com>
*/
+#include <linux/acpi.h>
#include <linux/mman.h>
#include <linux/kvm_host.h>
#include <linux/io.h>
#include <linux/hugetlb.h>
#include <linux/sched/signal.h>
#include <trace/events/kvm.h>
+#include <asm/acpi.h>
#include <asm/pgalloc.h>
#include <asm/cacheflush.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_pgtable.h>
-#include <asm/kvm_ras.h>
+#include <asm/kvm_pkvm.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
#include <asm/virt.h>
@@ -25,20 +27,31 @@
static struct kvm_pgtable *hyp_pgtable;
static DEFINE_MUTEX(kvm_hyp_pgd_mutex);
-static unsigned long hyp_idmap_start;
-static unsigned long hyp_idmap_end;
-static phys_addr_t hyp_idmap_vector;
+static unsigned long __ro_after_init hyp_idmap_start;
+static unsigned long __ro_after_init hyp_idmap_end;
+static phys_addr_t __ro_after_init hyp_idmap_vector;
-static unsigned long io_map_base;
+u32 __ro_after_init __hyp_va_bits;
-static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end)
+static unsigned long __ro_after_init io_map_base;
+
+#define KVM_PGT_FN(fn) (!is_protected_kvm_enabled() ? fn : p ## fn)
+
+static phys_addr_t __stage2_range_addr_end(phys_addr_t addr, phys_addr_t end,
+ phys_addr_t size)
{
- phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL);
phys_addr_t boundary = ALIGN_DOWN(addr + size, size);
return (boundary - 1 < end - 1) ? boundary : end;
}
+static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end)
+{
+ phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL);
+
+ return __stage2_range_addr_end(addr, end, size);
+}
+
/*
* Release kvm_mmu_lock periodically if the memory region is large. Otherwise,
* we may see kernel panics with CONFIG_DETECT_HUNG_TASK,
@@ -46,16 +59,17 @@ static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end)
* long will also starve other vCPUs. We have to also make sure that the page
* tables are not freed while we released the lock.
*/
-static int stage2_apply_range(struct kvm *kvm, phys_addr_t addr,
+static int stage2_apply_range(struct kvm_s2_mmu *mmu, phys_addr_t addr,
phys_addr_t end,
int (*fn)(struct kvm_pgtable *, u64, u64),
bool resched)
{
+ struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
int ret;
u64 next;
do {
- struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
+ struct kvm_pgtable *pgt = mmu->pgt;
if (!pgt)
return -EINVAL;
@@ -71,8 +85,81 @@ static int stage2_apply_range(struct kvm *kvm, phys_addr_t addr,
return ret;
}
-#define stage2_apply_range_resched(kvm, addr, end, fn) \
- stage2_apply_range(kvm, addr, end, fn, true)
+#define stage2_apply_range_resched(mmu, addr, end, fn) \
+ stage2_apply_range(mmu, addr, end, fn, true)
+
+/*
+ * Get the maximum number of page-tables pages needed to split a range
+ * of blocks into PAGE_SIZE PTEs. It assumes the range is already
+ * mapped at level 2, or at level 1 if allowed.
+ */
+static int kvm_mmu_split_nr_page_tables(u64 range)
+{
+ int n = 0;
+
+ if (KVM_PGTABLE_MIN_BLOCK_LEVEL < 2)
+ n += DIV_ROUND_UP(range, PUD_SIZE);
+ n += DIV_ROUND_UP(range, PMD_SIZE);
+ return n;
+}
+
+static bool need_split_memcache_topup_or_resched(struct kvm *kvm)
+{
+ struct kvm_mmu_memory_cache *cache;
+ u64 chunk_size, min;
+
+ if (need_resched() || rwlock_needbreak(&kvm->mmu_lock))
+ return true;
+
+ chunk_size = kvm->arch.mmu.split_page_chunk_size;
+ min = kvm_mmu_split_nr_page_tables(chunk_size);
+ cache = &kvm->arch.mmu.split_page_cache;
+ return kvm_mmu_memory_cache_nr_free_objects(cache) < min;
+}
+
+static int kvm_mmu_split_huge_pages(struct kvm *kvm, phys_addr_t addr,
+ phys_addr_t end)
+{
+ struct kvm_mmu_memory_cache *cache;
+ struct kvm_pgtable *pgt;
+ int ret, cache_capacity;
+ u64 next, chunk_size;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ chunk_size = kvm->arch.mmu.split_page_chunk_size;
+ cache_capacity = kvm_mmu_split_nr_page_tables(chunk_size);
+
+ if (chunk_size == 0)
+ return 0;
+
+ cache = &kvm->arch.mmu.split_page_cache;
+
+ do {
+ if (need_split_memcache_topup_or_resched(kvm)) {
+ write_unlock(&kvm->mmu_lock);
+ cond_resched();
+ /* Eager page splitting is best-effort. */
+ ret = __kvm_mmu_topup_memory_cache(cache,
+ cache_capacity,
+ cache_capacity);
+ write_lock(&kvm->mmu_lock);
+ if (ret)
+ break;
+ }
+
+ pgt = kvm->arch.mmu.pgt;
+ if (!pgt)
+ return -EINVAL;
+
+ next = __stage2_range_addr_end(addr, end, chunk_size);
+ ret = KVM_PGT_FN(kvm_pgtable_stage2_split)(pgt, addr, next - addr, cache);
+ if (ret)
+ break;
+ } while (addr = next, addr != end);
+
+ return ret;
+}
static bool memslot_is_logging(struct kvm_memory_slot *memslot)
{
@@ -80,20 +167,31 @@ static bool memslot_is_logging(struct kvm_memory_slot *memslot)
}
/**
- * kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8
+ * kvm_arch_flush_remote_tlbs() - flush all VM TLB entries for v7/8
* @kvm: pointer to kvm structure.
*
* Interface to HYP function to flush all VM TLB entries
*/
-void kvm_flush_remote_tlbs(struct kvm *kvm)
+int kvm_arch_flush_remote_tlbs(struct kvm *kvm)
{
- ++kvm->stat.generic.remote_tlb_flush_requests;
- kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu);
+ if (is_protected_kvm_enabled())
+ kvm_call_hyp_nvhe(__pkvm_tlb_flush_vmid, kvm->arch.pkvm.handle);
+ else
+ kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu);
+ return 0;
}
-static bool kvm_is_device_pfn(unsigned long pfn)
+int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm,
+ gfn_t gfn, u64 nr_pages)
{
- return !pfn_is_map_memory(pfn);
+ u64 size = nr_pages << PAGE_SHIFT;
+ u64 addr = gfn << PAGE_SHIFT;
+
+ if (is_protected_kvm_enabled())
+ kvm_call_hyp_nvhe(__pkvm_tlb_flush_vmid, kvm->arch.pkvm.handle);
+ else
+ kvm_tlb_flush_vmid_range(&kvm->arch.mmu, addr, size);
+ return 0;
}
static void *stage2_memcache_zalloc_page(void *arg)
@@ -130,21 +228,21 @@ static void kvm_s2_free_pages_exact(void *virt, size_t size)
static struct kvm_pgtable_mm_ops kvm_s2_mm_ops;
-static void stage2_free_removed_table_rcu_cb(struct rcu_head *head)
+static void stage2_free_unlinked_table_rcu_cb(struct rcu_head *head)
{
struct page *page = container_of(head, struct page, rcu_head);
void *pgtable = page_to_virt(page);
- u32 level = page_private(page);
+ s8 level = page_private(page);
- kvm_pgtable_stage2_free_removed(&kvm_s2_mm_ops, pgtable, level);
+ KVM_PGT_FN(kvm_pgtable_stage2_free_unlinked)(&kvm_s2_mm_ops, pgtable, level);
}
-static void stage2_free_removed_table(void *addr, u32 level)
+static void stage2_free_unlinked_table(void *addr, s8 level)
{
struct page *page = virt_to_page(addr);
set_page_private(page, (unsigned long)level);
- call_rcu(&page->rcu_head, stage2_free_removed_table_rcu_cb);
+ call_rcu(&page->rcu_head, stage2_free_unlinked_table_rcu_cb);
}
static void kvm_host_get_page(void *addr)
@@ -216,7 +314,7 @@ static void invalidate_icache_guest_page(void *va, size_t size)
* does.
*/
/**
- * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
+ * __unmap_stage2_range -- Clear stage2 page table entries to unmap a range
* @mmu: The KVM stage-2 MMU pointer
* @start: The intermediate physical base address of the range to unmap
* @size: The size of the area to unmap
@@ -235,13 +333,19 @@ static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64
lockdep_assert_held_write(&kvm->mmu_lock);
WARN_ON(size & ~PAGE_MASK);
- WARN_ON(stage2_apply_range(kvm, start, end, kvm_pgtable_stage2_unmap,
+ WARN_ON(stage2_apply_range(mmu, start, end, KVM_PGT_FN(kvm_pgtable_stage2_unmap),
may_block));
}
-static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size)
+void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start,
+ u64 size, bool may_block)
{
- __unmap_stage2_range(mmu, start, size, true);
+ __unmap_stage2_range(mmu, start, size, may_block);
+}
+
+void kvm_stage2_flush_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
+{
+ stage2_apply_range_resched(mmu, addr, end, KVM_PGT_FN(kvm_pgtable_stage2_flush));
}
static void stage2_flush_memslot(struct kvm *kvm,
@@ -250,7 +354,7 @@ static void stage2_flush_memslot(struct kvm *kvm,
phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
- stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_flush);
+ kvm_stage2_flush_range(&kvm->arch.mmu, addr, end);
}
/**
@@ -273,6 +377,8 @@ static void stage2_flush_vm(struct kvm *kvm)
kvm_for_each_memslot(memslot, bkt, slots)
stage2_flush_memslot(kvm, memslot);
+ kvm_nested_s2_flush(kvm);
+
write_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, idx);
}
@@ -280,7 +386,7 @@ static void stage2_flush_vm(struct kvm *kvm)
/**
* free_hyp_pgds - free Hyp-mode page tables
*/
-void free_hyp_pgds(void)
+void __init free_hyp_pgds(void)
{
mutex_lock(&kvm_hyp_pgd_mutex);
if (hyp_pgtable) {
@@ -511,6 +617,25 @@ int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
return 0;
}
+static int __hyp_alloc_private_va_range(unsigned long base)
+{
+ lockdep_assert_held(&kvm_hyp_pgd_mutex);
+
+ if (!PAGE_ALIGNED(base))
+ return -EINVAL;
+
+ /*
+ * Verify that BIT(VA_BITS - 1) hasn't been flipped by
+ * allocating the new area, as it would indicate we've
+ * overflowed the idmap/IO address range.
+ */
+ if ((base ^ io_map_base) & BIT(VA_BITS - 1))
+ return -ENOMEM;
+
+ io_map_base = base;
+
+ return 0;
+}
/**
* hyp_alloc_private_va_range - Allocates a private VA range.
@@ -531,29 +656,22 @@ int hyp_alloc_private_va_range(size_t size, unsigned long *haddr)
/*
* This assumes that we have enough space below the idmap
- * page to allocate our VAs. If not, the check below will
- * kick. A potential alternative would be to detect that
- * overflow and switch to an allocation above the idmap.
+ * page to allocate our VAs. If not, the check in
+ * __hyp_alloc_private_va_range() will kick. A potential
+ * alternative would be to detect that overflow and switch
+ * to an allocation above the idmap.
*
* The allocated size is always a multiple of PAGE_SIZE.
*/
- base = io_map_base - PAGE_ALIGN(size);
-
- /* Align the allocation based on the order of its size */
- base = ALIGN_DOWN(base, PAGE_SIZE << get_order(size));
-
- /*
- * Verify that BIT(VA_BITS - 1) hasn't been flipped by
- * allocating the new area, as it would indicate we've
- * overflowed the idmap/IO address range.
- */
- if ((base ^ io_map_base) & BIT(VA_BITS - 1))
- ret = -ENOMEM;
- else
- *haddr = io_map_base = base;
+ size = PAGE_ALIGN(size);
+ base = io_map_base - size;
+ ret = __hyp_alloc_private_va_range(base);
mutex_unlock(&kvm_hyp_pgd_mutex);
+ if (!ret)
+ *haddr = base;
+
return ret;
}
@@ -587,6 +705,48 @@ static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
return ret;
}
+int create_hyp_stack(phys_addr_t phys_addr, unsigned long *haddr)
+{
+ unsigned long base;
+ size_t size;
+ int ret;
+
+ mutex_lock(&kvm_hyp_pgd_mutex);
+ /*
+ * Efficient stack verification using the NVHE_STACK_SHIFT bit implies
+ * an alignment of our allocation on the order of the size.
+ */
+ size = NVHE_STACK_SIZE * 2;
+ base = ALIGN_DOWN(io_map_base - size, size);
+
+ ret = __hyp_alloc_private_va_range(base);
+
+ mutex_unlock(&kvm_hyp_pgd_mutex);
+
+ if (ret) {
+ kvm_err("Cannot allocate hyp stack guard page\n");
+ return ret;
+ }
+
+ /*
+ * Since the stack grows downwards, map the stack to the page
+ * at the higher address and leave the lower guard page
+ * unbacked.
+ *
+ * Any valid stack address now has the NVHE_STACK_SHIFT bit as 1
+ * and addresses corresponding to the guard page have the
+ * NVHE_STACK_SHIFT bit as 0 - this is used for overflow detection.
+ */
+ ret = __create_hyp_mappings(base + NVHE_STACK_SIZE, NVHE_STACK_SIZE,
+ phys_addr, PAGE_HYP);
+ if (ret)
+ kvm_err("Cannot map hyp stack\n");
+
+ *haddr = base + size;
+
+ return ret;
+}
+
/**
* create_hyp_io_mappings - Map IO into both kernel and HYP
* @phys_addr: The physical start address which gets mapped
@@ -661,18 +821,39 @@ static int get_user_mapping_size(struct kvm *kvm, u64 addr)
struct kvm_pgtable pgt = {
.pgd = (kvm_pteref_t)kvm->mm->pgd,
.ia_bits = vabits_actual,
- .start_level = (KVM_PGTABLE_MAX_LEVELS -
- CONFIG_PGTABLE_LEVELS),
+ .start_level = (KVM_PGTABLE_LAST_LEVEL -
+ ARM64_HW_PGTABLE_LEVELS(pgt.ia_bits) + 1),
.mm_ops = &kvm_user_mm_ops,
};
+ unsigned long flags;
kvm_pte_t pte = 0; /* Keep GCC quiet... */
- u32 level = ~0;
+ s8 level = S8_MAX;
int ret;
+ /*
+ * Disable IRQs so that we hazard against a concurrent
+ * teardown of the userspace page tables (which relies on
+ * IPI-ing threads).
+ */
+ local_irq_save(flags);
ret = kvm_pgtable_get_leaf(&pgt, addr, &pte, &level);
- VM_BUG_ON(ret);
- VM_BUG_ON(level >= KVM_PGTABLE_MAX_LEVELS);
- VM_BUG_ON(!(pte & PTE_VALID));
+ local_irq_restore(flags);
+
+ if (ret)
+ return ret;
+
+ /*
+ * Not seeing an error, but not updating level? Something went
+ * deeply wrong...
+ */
+ if (WARN_ON(level > KVM_PGTABLE_LAST_LEVEL))
+ return -EFAULT;
+ if (WARN_ON(level < KVM_PGTABLE_FIRST_LEVEL))
+ return -EFAULT;
+
+ /* Oops, the userspace PTs are gone... Replay the fault */
+ if (!kvm_pte_valid(pte))
+ return -EAGAIN;
return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(level));
}
@@ -681,7 +862,7 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
.zalloc_page = stage2_memcache_zalloc_page,
.zalloc_pages_exact = kvm_s2_zalloc_pages_exact,
.free_pages_exact = kvm_s2_free_pages_exact,
- .free_removed_table = stage2_free_removed_table,
+ .free_unlinked_table = stage2_free_unlinked_table,
.get_page = kvm_host_get_page,
.put_page = kvm_s2_put_page,
.page_count = kvm_host_page_count,
@@ -691,21 +872,9 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
.icache_inval_pou = invalidate_icache_guest_page,
};
-/**
- * kvm_init_stage2_mmu - Initialise a S2 MMU structure
- * @kvm: The pointer to the KVM structure
- * @mmu: The pointer to the s2 MMU structure
- * @type: The machine type of the virtual machine
- *
- * Allocates only the stage-2 HW PGD level table(s).
- * Note we don't need locking here as this is only called when the VM is
- * created, which can only be done once.
- */
-int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type)
+static int kvm_init_ipa_range(struct kvm_s2_mmu *mmu, unsigned long type)
{
u32 kvm_ipa_limit = get_kvm_ipa_limit();
- int cpu, err;
- struct kvm_pgtable *pgt;
u64 mmfr0, mmfr1;
u32 phys_shift;
@@ -730,22 +899,98 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t
mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
- kvm->arch.vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift);
+ mmu->vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift);
+
+ return 0;
+}
+
+/*
+ * Assume that @pgt is valid and unlinked from the KVM MMU to free the
+ * page-table without taking the kvm_mmu_lock and without performing any
+ * TLB invalidations.
+ *
+ * Also, the range of addresses can be large enough to cause need_resched
+ * warnings, for instance on CONFIG_PREEMPT_NONE kernels. Hence, invoke
+ * cond_resched() periodically to prevent hogging the CPU for a long time
+ * and schedule something else, if required.
+ */
+static void stage2_destroy_range(struct kvm_pgtable *pgt, phys_addr_t addr,
+ phys_addr_t end)
+{
+ u64 next;
+ do {
+ next = stage2_range_addr_end(addr, end);
+ KVM_PGT_FN(kvm_pgtable_stage2_destroy_range)(pgt, addr,
+ next - addr);
+ if (next != end)
+ cond_resched();
+ } while (addr = next, addr != end);
+}
+
+static void kvm_stage2_destroy(struct kvm_pgtable *pgt)
+{
+ unsigned int ia_bits = VTCR_EL2_IPA(pgt->mmu->vtcr);
+
+ stage2_destroy_range(pgt, 0, BIT(ia_bits));
+ KVM_PGT_FN(kvm_pgtable_stage2_destroy_pgd)(pgt);
+}
+
+/**
+ * kvm_init_stage2_mmu - Initialise a S2 MMU structure
+ * @kvm: The pointer to the KVM structure
+ * @mmu: The pointer to the s2 MMU structure
+ * @type: The machine type of the virtual machine
+ *
+ * Allocates only the stage-2 HW PGD level table(s).
+ * Note we don't need locking here as this is only called in two cases:
+ *
+ * - when the VM is created, which can't race against anything
+ *
+ * - when secondary kvm_s2_mmu structures are initialised for NV
+ * guests, and the caller must hold kvm->lock as this is called on a
+ * per-vcpu basis.
+ */
+int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type)
+{
+ int cpu, err;
+ struct kvm_pgtable *pgt;
+
+ /*
+ * If we already have our page tables in place, and that the
+ * MMU context is the canonical one, we have a bug somewhere,
+ * as this is only supposed to ever happen once per VM.
+ *
+ * Otherwise, we're building nested page tables, and that's
+ * probably because userspace called KVM_ARM_VCPU_INIT more
+ * than once on the same vcpu. Since that's actually legal,
+ * don't kick a fuss and leave gracefully.
+ */
if (mmu->pgt != NULL) {
+ if (kvm_is_nested_s2_mmu(kvm, mmu))
+ return 0;
+
kvm_err("kvm_arch already initialized?\n");
return -EINVAL;
}
+ err = kvm_init_ipa_range(mmu, type);
+ if (err)
+ return err;
+
pgt = kzalloc(sizeof(*pgt), GFP_KERNEL_ACCOUNT);
if (!pgt)
return -ENOMEM;
mmu->arch = &kvm->arch;
- err = kvm_pgtable_stage2_init(pgt, mmu, &kvm_s2_mm_ops);
+ err = KVM_PGT_FN(kvm_pgtable_stage2_init)(pgt, mmu, &kvm_s2_mm_ops);
if (err)
goto out_free_pgtable;
+ mmu->pgt = pgt;
+ if (is_protected_kvm_enabled())
+ return 0;
+
mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran));
if (!mmu->last_vcpu_ran) {
err = -ENOMEM;
@@ -755,17 +1000,30 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t
for_each_possible_cpu(cpu)
*per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1;
- mmu->pgt = pgt;
+ /* The eager page splitting is disabled by default */
+ mmu->split_page_chunk_size = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT;
+ mmu->split_page_cache.gfp_zero = __GFP_ZERO;
+
mmu->pgd_phys = __pa(pgt->pgd);
+
+ if (kvm_is_nested_s2_mmu(kvm, mmu))
+ kvm_init_nested_s2_mmu(mmu);
+
return 0;
out_destroy_pgtable:
- kvm_pgtable_stage2_destroy(pgt);
+ kvm_stage2_destroy(pgt);
out_free_pgtable:
kfree(pgt);
return err;
}
+void kvm_uninit_stage2_mmu(struct kvm *kvm)
+{
+ kvm_free_stage2_pgd(&kvm->arch.mmu);
+ kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache);
+}
+
static void stage2_unmap_memslot(struct kvm *kvm,
struct kvm_memory_slot *memslot)
{
@@ -802,7 +1060,7 @@ static void stage2_unmap_memslot(struct kvm *kvm,
if (!(vma->vm_flags & VM_PFNMAP)) {
gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
- unmap_stage2_range(&kvm->arch.mmu, gpa, vm_end - vm_start);
+ kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, vm_end - vm_start, true);
}
hva = vm_end;
} while (hva < reg_end);
@@ -829,6 +1087,8 @@ void stage2_unmap_vm(struct kvm *kvm)
kvm_for_each_memslot(memslot, bkt, slots)
stage2_unmap_memslot(kvm, memslot);
+ kvm_nested_s2_unmap(kvm, true);
+
write_unlock(&kvm->mmu_lock);
mmap_read_unlock(current->mm);
srcu_read_unlock(&kvm->srcu, idx);
@@ -846,29 +1106,47 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
mmu->pgt = NULL;
free_percpu(mmu->last_vcpu_ran);
}
+
+ if (kvm_is_nested_s2_mmu(kvm, mmu))
+ kvm_init_nested_s2_mmu(mmu);
+
write_unlock(&kvm->mmu_lock);
if (pgt) {
- kvm_pgtable_stage2_destroy(pgt);
+ kvm_stage2_destroy(pgt);
kfree(pgt);
}
}
-static void hyp_mc_free_fn(void *addr, void *unused)
+static void hyp_mc_free_fn(void *addr, void *mc)
{
+ struct kvm_hyp_memcache *memcache = mc;
+
+ if (memcache->flags & HYP_MEMCACHE_ACCOUNT_STAGE2)
+ kvm_account_pgtable_pages(addr, -1);
+
free_page((unsigned long)addr);
}
-static void *hyp_mc_alloc_fn(void *unused)
+static void *hyp_mc_alloc_fn(void *mc)
{
- return (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
+ struct kvm_hyp_memcache *memcache = mc;
+ void *addr;
+
+ addr = (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
+ if (addr && memcache->flags & HYP_MEMCACHE_ACCOUNT_STAGE2)
+ kvm_account_pgtable_pages(addr, 1);
+
+ return addr;
}
void free_hyp_memcache(struct kvm_hyp_memcache *mc)
{
- if (is_protected_kvm_enabled())
- __free_hyp_memcache(mc, hyp_mc_free_fn,
- kvm_host_va, NULL);
+ if (!is_protected_kvm_enabled())
+ return;
+
+ kfree(mc->mapping);
+ __free_hyp_memcache(mc, hyp_mc_free_fn, kvm_host_va, mc);
}
int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages)
@@ -876,8 +1154,14 @@ int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages)
if (!is_protected_kvm_enabled())
return 0;
+ if (!mc->mapping) {
+ mc->mapping = kzalloc(sizeof(struct pkvm_mapping), GFP_KERNEL_ACCOUNT);
+ if (!mc->mapping)
+ return -ENOMEM;
+ }
+
return __topup_hyp_memcache(mc, min_pages, hyp_mc_alloc_fn,
- kvm_host_pa, NULL);
+ kvm_host_pa, mc);
}
/**
@@ -895,7 +1179,8 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
phys_addr_t addr;
int ret = 0;
struct kvm_mmu_memory_cache cache = { .gfp_zero = __GFP_ZERO };
- struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
+ struct kvm_s2_mmu *mmu = &kvm->arch.mmu;
+ struct kvm_pgtable *pgt = mmu->pgt;
enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE |
KVM_PGTABLE_PROT_R |
(writable ? KVM_PGTABLE_PROT_W : 0);
@@ -908,13 +1193,13 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) {
ret = kvm_mmu_topup_memory_cache(&cache,
- kvm_mmu_cache_min_pages(kvm));
+ kvm_mmu_cache_min_pages(mmu));
if (ret)
break;
write_lock(&kvm->mmu_lock);
- ret = kvm_pgtable_stage2_map(pgt, addr, PAGE_SIZE, pa, prot,
- &cache, 0);
+ ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, addr, PAGE_SIZE,
+ pa, prot, &cache, 0);
write_unlock(&kvm->mmu_lock);
if (ret)
break;
@@ -927,15 +1212,14 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
}
/**
- * stage2_wp_range() - write protect stage2 memory region range
+ * kvm_stage2_wp_range() - write protect stage2 memory region range
* @mmu: The KVM stage-2 MMU pointer
* @addr: Start address of range
* @end: End address of range
*/
-static void stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
+void kvm_stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
{
- struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
- stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_wrprotect);
+ stage2_apply_range_resched(mmu, addr, end, KVM_PGT_FN(kvm_pgtable_stage2_wrprotect));
}
/**
@@ -964,45 +1248,75 @@ static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
write_lock(&kvm->mmu_lock);
- stage2_wp_range(&kvm->arch.mmu, start, end);
+ kvm_stage2_wp_range(&kvm->arch.mmu, start, end);
+ kvm_nested_s2_wp(kvm);
write_unlock(&kvm->mmu_lock);
- kvm_flush_remote_tlbs(kvm);
+ kvm_flush_remote_tlbs_memslot(kvm, memslot);
}
/**
- * kvm_mmu_write_protect_pt_masked() - write protect dirty pages
+ * kvm_mmu_split_memory_region() - split the stage 2 blocks into PAGE_SIZE
+ * pages for memory slot
* @kvm: The KVM pointer
- * @slot: The memory slot associated with mask
- * @gfn_offset: The gfn offset in memory slot
- * @mask: The mask of dirty pages at offset 'gfn_offset' in this memory
- * slot to be write protected
+ * @slot: The memory slot to split
*
- * Walks bits set in mask write protects the associated pte's. Caller must
- * acquire kvm_mmu_lock.
+ * Acquires kvm->mmu_lock. Called with kvm->slots_lock mutex acquired,
+ * serializing operations for VM memory regions.
*/
-static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
- struct kvm_memory_slot *slot,
- gfn_t gfn_offset, unsigned long mask)
+static void kvm_mmu_split_memory_region(struct kvm *kvm, int slot)
{
- phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
- phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT;
- phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *memslot;
+ phys_addr_t start, end;
+
+ lockdep_assert_held(&kvm->slots_lock);
- stage2_wp_range(&kvm->arch.mmu, start, end);
+ slots = kvm_memslots(kvm);
+ memslot = id_to_memslot(slots, slot);
+
+ start = memslot->base_gfn << PAGE_SHIFT;
+ end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
+
+ write_lock(&kvm->mmu_lock);
+ kvm_mmu_split_huge_pages(kvm, start, end);
+ write_unlock(&kvm->mmu_lock);
}
/*
- * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
- * dirty pages.
+ * kvm_arch_mmu_enable_log_dirty_pt_masked() - enable dirty logging for selected pages.
+ * @kvm: The KVM pointer
+ * @slot: The memory slot associated with mask
+ * @gfn_offset: The gfn offset in memory slot
+ * @mask: The mask of pages at offset 'gfn_offset' in this memory
+ * slot to enable dirty logging on
*
- * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
- * enable dirty logging for them.
+ * Writes protect selected pages to enable dirty logging, and then
+ * splits them to PAGE_SIZE. Caller must acquire kvm->mmu_lock.
*/
void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn_offset, unsigned long mask)
{
- kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
+ phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
+ phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT;
+ phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ kvm_stage2_wp_range(&kvm->arch.mmu, start, end);
+
+ /*
+ * Eager-splitting is done when manual-protect is set. We
+ * also check for initially-all-set because we can avoid
+ * eager-splitting if initially-all-set is false.
+ * Initially-all-set equal false implies that huge-pages were
+ * already split when enabling dirty logging: no need to do it
+ * again.
+ */
+ if (kvm_dirty_log_manual_protect_and_init_set(kvm))
+ kvm_mmu_split_huge_pages(kvm, start, end);
+
+ kvm_nested_s2_wp(kvm);
}
static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
@@ -1022,6 +1336,10 @@ static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
if (map_size == PAGE_SIZE)
return true;
+ /* pKVM only supports PMD_SIZE huge-mappings */
+ if (is_protected_kvm_enabled() && map_size != PMD_SIZE)
+ return false;
+
size = memslot->npages * PAGE_SIZE;
gpa_start = memslot->base_gfn << PAGE_SHIFT;
@@ -1079,7 +1397,7 @@ static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
*
* Returns the size of the mapping.
*/
-static unsigned long
+static long
transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot,
unsigned long hva, kvm_pfn_t *pfnp,
phys_addr_t *ipap)
@@ -1091,30 +1409,17 @@ transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot,
* sure that the HVA and IPA are sufficiently aligned and that the
* block map is contained within the memslot.
*/
- if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE) &&
- get_user_mapping_size(kvm, hva) >= PMD_SIZE) {
- /*
- * The address we faulted on is backed by a transparent huge
- * page. However, because we map the compound huge page and
- * not the individual tail page, we need to transfer the
- * refcount to the head page. We have to be careful that the
- * THP doesn't start to split while we are adjusting the
- * refcounts.
- *
- * We are sure this doesn't happen, because mmu_invalidate_retry
- * was successful and we are holding the mmu_lock, so if this
- * THP is trying to split, it will be blocked in the mmu
- * notifier before touching any of the pages, specifically
- * before being able to call __split_huge_page_refcount().
- *
- * We can therefore safely transfer the refcount from PG_tail
- * to PG_head and switch the pfn from a tail page to the head
- * page accordingly.
- */
+ if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) {
+ int sz = get_user_mapping_size(kvm, hva);
+
+ if (sz < 0)
+ return sz;
+
+ if (sz < PMD_SIZE)
+ return PAGE_SIZE;
+
*ipap &= PMD_MASK;
- kvm_release_pfn_clean(pfn);
pfn &= ~(PTRS_PER_PMD - 1);
- get_page(pfn_to_page(pfn));
*pfnp = pfn;
return PMD_SIZE;
@@ -1158,21 +1463,29 @@ static int get_vma_page_shift(struct vm_area_struct *vma, unsigned long hva)
* able to see the page's tags and therefore they must be initialised first. If
* PG_mte_tagged is set, tags have already been initialised.
*
- * The race in the test/set of the PG_mte_tagged flag is handled by:
- * - preventing VM_SHARED mappings in a memslot with MTE preventing two VMs
- * racing to santise the same page
- * - mmap_lock protects between a VM faulting a page in and the VMM performing
- * an mprotect() to add VM_MTE
+ * Must be called with kvm->mmu_lock held to ensure the memory remains mapped
+ * while the tags are zeroed.
*/
static void sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn,
unsigned long size)
{
unsigned long i, nr_pages = size >> PAGE_SHIFT;
struct page *page = pfn_to_page(pfn);
+ struct folio *folio = page_folio(page);
if (!kvm_has_mte(kvm))
return;
+ if (folio_test_hugetlb(folio)) {
+ /* Hugetlb has MTE flags set on head page only */
+ if (folio_try_hugetlb_mte_tagging(folio)) {
+ for (i = 0; i < nr_pages; i++, page++)
+ mte_clear_page_tags(page_address(page));
+ folio_set_hugetlb_mte_tagged(folio);
+ }
+ return;
+ }
+
for (i = 0; i < nr_pages; i++, page++) {
if (try_page_mte_tagging(page)) {
mte_clear_page_tags(page_address(page));
@@ -1186,36 +1499,190 @@ static bool kvm_vma_mte_allowed(struct vm_area_struct *vma)
return vma->vm_flags & VM_MTE_ALLOWED;
}
+static bool kvm_vma_is_cacheable(struct vm_area_struct *vma)
+{
+ switch (FIELD_GET(PTE_ATTRINDX_MASK, pgprot_val(vma->vm_page_prot))) {
+ case MT_NORMAL_NC:
+ case MT_DEVICE_nGnRnE:
+ case MT_DEVICE_nGnRE:
+ return false;
+ default:
+ return true;
+ }
+}
+
+static int prepare_mmu_memcache(struct kvm_vcpu *vcpu, bool topup_memcache,
+ void **memcache)
+{
+ int min_pages;
+
+ if (!is_protected_kvm_enabled())
+ *memcache = &vcpu->arch.mmu_page_cache;
+ else
+ *memcache = &vcpu->arch.pkvm_memcache;
+
+ if (!topup_memcache)
+ return 0;
+
+ min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu);
+
+ if (!is_protected_kvm_enabled())
+ return kvm_mmu_topup_memory_cache(*memcache, min_pages);
+
+ return topup_hyp_memcache(*memcache, min_pages);
+}
+
+/*
+ * Potentially reduce shadow S2 permissions to match the guest's own S2. For
+ * exec faults, we'd only reach this point if the guest actually allowed it (see
+ * kvm_s2_handle_perm_fault).
+ *
+ * Also encode the level of the original translation in the SW bits of the leaf
+ * entry as a proxy for the span of that translation. This will be retrieved on
+ * TLB invalidation from the guest and used to limit the invalidation scope if a
+ * TTL hint or a range isn't provided.
+ */
+static void adjust_nested_fault_perms(struct kvm_s2_trans *nested,
+ enum kvm_pgtable_prot *prot,
+ bool *writable)
+{
+ *writable &= kvm_s2_trans_writable(nested);
+ if (!kvm_s2_trans_readable(nested))
+ *prot &= ~KVM_PGTABLE_PROT_R;
+
+ *prot |= kvm_encode_nested_level(nested);
+}
+
+static void adjust_nested_exec_perms(struct kvm *kvm,
+ struct kvm_s2_trans *nested,
+ enum kvm_pgtable_prot *prot)
+{
+ if (!kvm_s2_trans_exec_el0(kvm, nested))
+ *prot &= ~KVM_PGTABLE_PROT_UX;
+ if (!kvm_s2_trans_exec_el1(kvm, nested))
+ *prot &= ~KVM_PGTABLE_PROT_PX;
+}
+
+#define KVM_PGTABLE_WALK_MEMABORT_FLAGS (KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED)
+
+static int gmem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
+ struct kvm_s2_trans *nested,
+ struct kvm_memory_slot *memslot, bool is_perm)
+{
+ bool write_fault, exec_fault, writable;
+ enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_MEMABORT_FLAGS;
+ enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
+ struct kvm_pgtable *pgt = vcpu->arch.hw_mmu->pgt;
+ unsigned long mmu_seq;
+ struct page *page;
+ struct kvm *kvm = vcpu->kvm;
+ void *memcache;
+ kvm_pfn_t pfn;
+ gfn_t gfn;
+ int ret;
+
+ ret = prepare_mmu_memcache(vcpu, true, &memcache);
+ if (ret)
+ return ret;
+
+ if (nested)
+ gfn = kvm_s2_trans_output(nested) >> PAGE_SHIFT;
+ else
+ gfn = fault_ipa >> PAGE_SHIFT;
+
+ write_fault = kvm_is_write_fault(vcpu);
+ exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
+
+ VM_WARN_ON_ONCE(write_fault && exec_fault);
+
+ mmu_seq = kvm->mmu_invalidate_seq;
+ /* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */
+ smp_rmb();
+
+ ret = kvm_gmem_get_pfn(kvm, memslot, gfn, &pfn, &page, NULL);
+ if (ret) {
+ kvm_prepare_memory_fault_exit(vcpu, fault_ipa, PAGE_SIZE,
+ write_fault, exec_fault, false);
+ return ret;
+ }
+
+ writable = !(memslot->flags & KVM_MEM_READONLY);
+
+ if (nested)
+ adjust_nested_fault_perms(nested, &prot, &writable);
+
+ if (writable)
+ prot |= KVM_PGTABLE_PROT_W;
+
+ if (exec_fault || cpus_have_final_cap(ARM64_HAS_CACHE_DIC))
+ prot |= KVM_PGTABLE_PROT_X;
+
+ if (nested)
+ adjust_nested_exec_perms(kvm, nested, &prot);
+
+ kvm_fault_lock(kvm);
+ if (mmu_invalidate_retry(kvm, mmu_seq)) {
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
+
+ ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, PAGE_SIZE,
+ __pfn_to_phys(pfn), prot,
+ memcache, flags);
+
+out_unlock:
+ kvm_release_faultin_page(kvm, page, !!ret, writable);
+ kvm_fault_unlock(kvm);
+
+ if (writable && !ret)
+ mark_page_dirty_in_slot(kvm, memslot, gfn);
+
+ return ret != -EAGAIN ? ret : 0;
+}
+
static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
+ struct kvm_s2_trans *nested,
struct kvm_memory_slot *memslot, unsigned long hva,
- unsigned long fault_status)
+ bool fault_is_perm)
{
int ret = 0;
- bool write_fault, writable, force_pte = false;
- bool exec_fault;
- bool device = false;
+ bool topup_memcache;
+ bool write_fault, writable;
+ bool exec_fault, mte_allowed, is_vma_cacheable;
+ bool s2_force_noncacheable = false, vfio_allow_any_uc = false;
unsigned long mmu_seq;
+ phys_addr_t ipa = fault_ipa;
struct kvm *kvm = vcpu->kvm;
- struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
struct vm_area_struct *vma;
short vma_shift;
+ void *memcache;
gfn_t gfn;
kvm_pfn_t pfn;
bool logging_active = memslot_is_logging(memslot);
- unsigned long fault_level = kvm_vcpu_trap_get_fault_level(vcpu);
- unsigned long vma_pagesize, fault_granule;
+ bool force_pte = logging_active;
+ long vma_pagesize, fault_granule;
enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
struct kvm_pgtable *pgt;
+ struct page *page;
+ vm_flags_t vm_flags;
+ enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_MEMABORT_FLAGS;
- fault_granule = 1UL << ARM64_HW_PGTABLE_LEVEL_SHIFT(fault_level);
+ if (fault_is_perm)
+ fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu);
write_fault = kvm_is_write_fault(vcpu);
exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
- VM_BUG_ON(write_fault && exec_fault);
+ VM_WARN_ON_ONCE(write_fault && exec_fault);
- if (fault_status == ESR_ELx_FSC_PERM && !write_fault && !exec_fault) {
- kvm_err("Unexpected L2 read permission error\n");
- return -EFAULT;
- }
+ /*
+ * Permission faults just need to update the existing leaf entry,
+ * and so normally don't require allocations from the memcache. The
+ * only exception to this is when dirty logging is enabled at runtime
+ * and a write fault needs to collapse a block entry into a table.
+ */
+ topup_memcache = !fault_is_perm || (logging_active && write_fault);
+ ret = prepare_mmu_memcache(vcpu, topup_memcache, &memcache);
+ if (ret)
+ return ret;
/*
* Let's check if we will get back a huge page backed by hugetlbfs, or
@@ -1229,16 +1696,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
return -EFAULT;
}
- /*
- * logging_active is guaranteed to never be true for VM_PFNMAP
- * memslots.
- */
- if (logging_active) {
- force_pte = true;
+ if (force_pte)
vma_shift = PAGE_SHIFT;
- } else {
+ else
vma_shift = get_vma_page_shift(vma, hva);
- }
switch (vma_shift) {
#ifndef __PAGETABLE_PMD_FOLDED
@@ -1265,44 +1726,69 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
}
vma_pagesize = 1UL << vma_shift;
- if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE)
- fault_ipa &= ~(vma_pagesize - 1);
- gfn = fault_ipa >> PAGE_SHIFT;
- mmap_read_unlock(current->mm);
+ if (nested) {
+ unsigned long max_map_size;
+
+ max_map_size = force_pte ? PAGE_SIZE : PUD_SIZE;
+
+ ipa = kvm_s2_trans_output(nested);
+
+ /*
+ * If we're about to create a shadow stage 2 entry, then we
+ * can only create a block mapping if the guest stage 2 page
+ * table uses at least as big a mapping.
+ */
+ max_map_size = min(kvm_s2_trans_size(nested), max_map_size);
+
+ /*
+ * Be careful that if the mapping size falls between
+ * two host sizes, take the smallest of the two.
+ */
+ if (max_map_size >= PMD_SIZE && max_map_size < PUD_SIZE)
+ max_map_size = PMD_SIZE;
+ else if (max_map_size >= PAGE_SIZE && max_map_size < PMD_SIZE)
+ max_map_size = PAGE_SIZE;
+
+ force_pte = (max_map_size == PAGE_SIZE);
+ vma_pagesize = min_t(long, vma_pagesize, max_map_size);
+ }
/*
- * Permission faults just need to update the existing leaf entry,
- * and so normally don't require allocations from the memcache. The
- * only exception to this is when dirty logging is enabled at runtime
- * and a write fault needs to collapse a block entry into a table.
+ * Both the canonical IPA and fault IPA must be hugepage-aligned to
+ * ensure we find the right PFN and lay down the mapping in the right
+ * place.
*/
- if (fault_status != ESR_ELx_FSC_PERM ||
- (logging_active && write_fault)) {
- ret = kvm_mmu_topup_memory_cache(memcache,
- kvm_mmu_cache_min_pages(kvm));
- if (ret)
- return ret;
+ if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) {
+ fault_ipa &= ~(vma_pagesize - 1);
+ ipa &= ~(vma_pagesize - 1);
}
- mmu_seq = vcpu->kvm->mmu_invalidate_seq;
+ gfn = ipa >> PAGE_SHIFT;
+ mte_allowed = kvm_vma_mte_allowed(vma);
+
+ vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
+
+ vm_flags = vma->vm_flags;
+
+ is_vma_cacheable = kvm_vma_is_cacheable(vma);
+
+ /* Don't use the VMA after the unlock -- it may have vanished */
+ vma = NULL;
+
/*
- * Ensure the read of mmu_invalidate_seq happens before we call
- * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
- * the page we just got a reference to gets unmapped before we have a
- * chance to grab the mmu_lock, which ensure that if the page gets
- * unmapped afterwards, the call to kvm_unmap_gfn will take it away
- * from us again properly. This smp_rmb() interacts with the smp_wmb()
- * in kvm_mmu_notifier_invalidate_<page|range_end>.
+ * Read mmu_invalidate_seq so that KVM can detect if the results of
+ * vma_lookup() or __kvm_faultin_pfn() become stale prior to
+ * acquiring kvm->mmu_lock.
*
- * Besides, __gfn_to_pfn_memslot() instead of gfn_to_pfn_prot() is
- * used to avoid unnecessary overhead introduced to locate the memory
- * slot because it's always fixed even @gfn is adjusted for huge pages.
+ * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
+ * with the smp_wmb() in kvm_mmu_invalidate_end().
*/
- smp_rmb();
+ mmu_seq = kvm->mmu_invalidate_seq;
+ mmap_read_unlock(current->mm);
- pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL,
- write_fault, &writable, NULL);
+ pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0,
+ &writable, &page);
if (pfn == KVM_PFN_ERR_HWPOISON) {
kvm_send_hwpoison_signal(hva, vma_shift);
return 0;
@@ -1310,18 +1796,39 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
if (is_error_noslot_pfn(pfn))
return -EFAULT;
- if (kvm_is_device_pfn(pfn)) {
- /*
- * If the page was identified as device early by looking at
- * the VMA flags, vma_pagesize is already representing the
- * largest quantity we can map. If instead it was mapped
- * via gfn_to_pfn_prot(), vma_pagesize is set to PAGE_SIZE
- * and must not be upgraded.
- *
- * In both cases, we don't let transparent_hugepage_adjust()
- * change things at the last minute.
- */
- device = true;
+ /*
+ * Check if this is non-struct page memory PFN, and cannot support
+ * CMOs. It could potentially be unsafe to access as cacheable.
+ */
+ if (vm_flags & (VM_PFNMAP | VM_MIXEDMAP) && !pfn_is_map_memory(pfn)) {
+ if (is_vma_cacheable) {
+ /*
+ * Whilst the VMA owner expects cacheable mapping to this
+ * PFN, hardware also has to support the FWB and CACHE DIC
+ * features.
+ *
+ * ARM64 KVM relies on kernel VA mapping to the PFN to
+ * perform cache maintenance as the CMO instructions work on
+ * virtual addresses. VM_PFNMAP region are not necessarily
+ * mapped to a KVA and hence the presence of hardware features
+ * S2FWB and CACHE DIC are mandatory to avoid the need for
+ * cache maintenance.
+ */
+ if (!kvm_supports_cacheable_pfnmap())
+ ret = -EFAULT;
+ } else {
+ /*
+ * If the page was identified as device early by looking at
+ * the VMA flags, vma_pagesize is already representing the
+ * largest quantity we can map. If instead it was mapped
+ * via __kvm_faultin_pfn(), vma_pagesize is set to PAGE_SIZE
+ * and must not be upgraded.
+ *
+ * In both cases, we don't let transparent_hugepage_adjust()
+ * change things at the last minute.
+ */
+ s2_force_noncacheable = true;
+ }
} else if (logging_active && !write_fault) {
/*
* Only actually map the page as writable if this was a write
@@ -1330,31 +1837,45 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
writable = false;
}
- if (exec_fault && device)
- return -ENOEXEC;
+ if (exec_fault && s2_force_noncacheable)
+ ret = -ENOEXEC;
- read_lock(&kvm->mmu_lock);
+ if (ret) {
+ kvm_release_page_unused(page);
+ return ret;
+ }
+
+ if (nested)
+ adjust_nested_fault_perms(nested, &prot, &writable);
+
+ kvm_fault_lock(kvm);
pgt = vcpu->arch.hw_mmu->pgt;
- if (mmu_invalidate_retry(kvm, mmu_seq))
+ if (mmu_invalidate_retry(kvm, mmu_seq)) {
+ ret = -EAGAIN;
goto out_unlock;
+ }
/*
* If we are not forced to use page mapping, check if we are
* backed by a THP and thus use block mapping if possible.
*/
- if (vma_pagesize == PAGE_SIZE && !(force_pte || device)) {
- if (fault_status == ESR_ELx_FSC_PERM &&
- fault_granule > PAGE_SIZE)
+ if (vma_pagesize == PAGE_SIZE && !(force_pte || s2_force_noncacheable)) {
+ if (fault_is_perm && fault_granule > PAGE_SIZE)
vma_pagesize = fault_granule;
else
vma_pagesize = transparent_hugepage_adjust(kvm, memslot,
hva, &pfn,
&fault_ipa);
+
+ if (vma_pagesize < 0) {
+ ret = vma_pagesize;
+ goto out_unlock;
+ }
}
- if (fault_status != ESR_ELx_FSC_PERM && !device && kvm_has_mte(kvm)) {
+ if (!fault_is_perm && !s2_force_noncacheable && kvm_has_mte(kvm)) {
/* Check the VMM hasn't introduced a new disallowed VMA */
- if (kvm_vma_mte_allowed(vma)) {
+ if (mte_allowed) {
sanitise_mte_tags(kvm, pfn, vma_pagesize);
} else {
ret = -EFAULT;
@@ -1368,53 +1889,138 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
if (exec_fault)
prot |= KVM_PGTABLE_PROT_X;
- if (device)
- prot |= KVM_PGTABLE_PROT_DEVICE;
- else if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC))
+ if (s2_force_noncacheable) {
+ if (vfio_allow_any_uc)
+ prot |= KVM_PGTABLE_PROT_NORMAL_NC;
+ else
+ prot |= KVM_PGTABLE_PROT_DEVICE;
+ } else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC)) {
prot |= KVM_PGTABLE_PROT_X;
+ }
+
+ if (nested)
+ adjust_nested_exec_perms(kvm, nested, &prot);
/*
* Under the premise of getting a FSC_PERM fault, we just need to relax
* permissions only if vma_pagesize equals fault_granule. Otherwise,
* kvm_pgtable_stage2_map() should be called to change block size.
*/
- if (fault_status == ESR_ELx_FSC_PERM && vma_pagesize == fault_granule)
- ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot);
- else
- ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize,
+ if (fault_is_perm && vma_pagesize == fault_granule) {
+ /*
+ * Drop the SW bits in favour of those stored in the
+ * PTE, which will be preserved.
+ */
+ prot &= ~KVM_NV_GUEST_MAP_SZ;
+ ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, fault_ipa, prot, flags);
+ } else {
+ ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, vma_pagesize,
__pfn_to_phys(pfn), prot,
- memcache, KVM_PGTABLE_WALK_SHARED);
+ memcache, flags);
+ }
+
+out_unlock:
+ kvm_release_faultin_page(kvm, page, !!ret, writable);
+ kvm_fault_unlock(kvm);
/* Mark the page dirty only if the fault is handled successfully */
- if (writable && !ret) {
- kvm_set_pfn_dirty(pfn);
+ if (writable && !ret)
mark_page_dirty_in_slot(kvm, memslot, gfn);
- }
-out_unlock:
- read_unlock(&kvm->mmu_lock);
- kvm_set_pfn_accessed(pfn);
- kvm_release_pfn_clean(pfn);
return ret != -EAGAIN ? ret : 0;
}
/* Resolve the access fault by making the page young again. */
static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
{
- pte_t pte;
- kvm_pte_t kpte;
+ enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED;
struct kvm_s2_mmu *mmu;
trace_kvm_access_fault(fault_ipa);
- write_lock(&vcpu->kvm->mmu_lock);
+ read_lock(&vcpu->kvm->mmu_lock);
mmu = vcpu->arch.hw_mmu;
- kpte = kvm_pgtable_stage2_mkyoung(mmu->pgt, fault_ipa);
- write_unlock(&vcpu->kvm->mmu_lock);
+ KVM_PGT_FN(kvm_pgtable_stage2_mkyoung)(mmu->pgt, fault_ipa, flags);
+ read_unlock(&vcpu->kvm->mmu_lock);
+}
- pte = __pte(kpte);
- if (pte_valid(pte))
- kvm_set_pfn_accessed(pte_pfn(pte));
+/*
+ * Returns true if the SEA should be handled locally within KVM if the abort
+ * is caused by a kernel memory allocation (e.g. stage-2 table memory).
+ */
+static bool host_owns_sea(struct kvm_vcpu *vcpu, u64 esr)
+{
+ /*
+ * Without FEAT_RAS HCR_EL2.TEA is RES0, meaning any external abort
+ * taken from a guest EL to EL2 is due to a host-imposed access (e.g.
+ * stage-2 PTW).
+ */
+ if (!cpus_have_final_cap(ARM64_HAS_RAS_EXTN))
+ return true;
+
+ /* KVM owns the VNCR when the vCPU isn't in a nested context. */
+ if (is_hyp_ctxt(vcpu) && !kvm_vcpu_trap_is_iabt(vcpu) && (esr & ESR_ELx_VNCR))
+ return true;
+
+ /*
+ * Determining if an external abort during a table walk happened at
+ * stage-2 is only possible with S1PTW is set. Otherwise, since KVM
+ * sets HCR_EL2.TEA, SEAs due to a stage-1 walk (i.e. accessing the
+ * PA of the stage-1 descriptor) can reach here and are reported
+ * with a TTW ESR value.
+ */
+ return (esr_fsc_is_sea_ttw(esr) && (esr & ESR_ELx_S1PTW));
+}
+
+int kvm_handle_guest_sea(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_run *run = vcpu->run;
+ u64 esr = kvm_vcpu_get_esr(vcpu);
+ u64 esr_mask = ESR_ELx_EC_MASK |
+ ESR_ELx_IL |
+ ESR_ELx_FnV |
+ ESR_ELx_EA |
+ ESR_ELx_CM |
+ ESR_ELx_WNR |
+ ESR_ELx_FSC;
+ u64 ipa;
+
+ /*
+ * Give APEI the opportunity to claim the abort before handling it
+ * within KVM. apei_claim_sea() expects to be called with IRQs enabled.
+ */
+ lockdep_assert_irqs_enabled();
+ if (apei_claim_sea(NULL) == 0)
+ return 1;
+
+ if (host_owns_sea(vcpu, esr) ||
+ !test_bit(KVM_ARCH_FLAG_EXIT_SEA, &vcpu->kvm->arch.flags))
+ return kvm_inject_serror(vcpu);
+
+ /* ESR_ELx.SET is RES0 when FEAT_RAS isn't implemented. */
+ if (kvm_has_ras(kvm))
+ esr_mask |= ESR_ELx_SET_MASK;
+
+ /*
+ * Exit to userspace, and provide faulting guest virtual and physical
+ * addresses in case userspace wants to emulate SEA to guest by
+ * writing to FAR_ELx and HPFAR_ELx registers.
+ */
+ memset(&run->arm_sea, 0, sizeof(run->arm_sea));
+ run->exit_reason = KVM_EXIT_ARM_SEA;
+ run->arm_sea.esr = esr & esr_mask;
+
+ if (!(esr & ESR_ELx_FnV))
+ run->arm_sea.gva = kvm_vcpu_get_hfar(vcpu);
+
+ ipa = kvm_vcpu_get_fault_ipa(vcpu);
+ if (ipa != INVALID_GPA) {
+ run->arm_sea.flags |= KVM_EXIT_ARM_SEA_FLAG_GPA_VALID;
+ run->arm_sea.gpa = ipa;
+ }
+
+ return 0;
}
/**
@@ -1430,20 +2036,32 @@ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
*/
int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
{
- unsigned long fault_status;
- phys_addr_t fault_ipa;
+ struct kvm_s2_trans nested_trans, *nested = NULL;
+ unsigned long esr;
+ phys_addr_t fault_ipa; /* The address we faulted on */
+ phys_addr_t ipa; /* Always the IPA in the L1 guest phys space */
struct kvm_memory_slot *memslot;
unsigned long hva;
bool is_iabt, write_fault, writable;
gfn_t gfn;
int ret, idx;
- fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
+ if (kvm_vcpu_abt_issea(vcpu))
+ return kvm_handle_guest_sea(vcpu);
+
+ esr = kvm_vcpu_get_esr(vcpu);
+
+ /*
+ * The fault IPA should be reliable at this point as we're not dealing
+ * with an SEA.
+ */
+ ipa = fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
+ if (KVM_BUG_ON(ipa == INVALID_GPA, vcpu->kvm))
+ return -EFAULT;
- fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
- if (fault_status == ESR_ELx_FSC_FAULT) {
+ if (esr_fsc_is_translation_fault(esr)) {
/* Beyond sanitised PARange (which is the IPA limit) */
if (fault_ipa >= BIT_ULL(get_kvm_ipa_limit())) {
kvm_inject_size_fault(vcpu);
@@ -1451,36 +2069,20 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
}
/* Falls between the IPA range and the PARange? */
- if (fault_ipa >= BIT_ULL(vcpu->arch.hw_mmu->pgt->ia_bits)) {
+ if (fault_ipa >= BIT_ULL(VTCR_EL2_IPA(vcpu->arch.hw_mmu->vtcr))) {
fault_ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);
- if (is_iabt)
- kvm_inject_pabt(vcpu, fault_ipa);
- else
- kvm_inject_dabt(vcpu, fault_ipa);
- return 1;
+ return kvm_inject_sea(vcpu, is_iabt, fault_ipa);
}
}
- /* Synchronous External Abort? */
- if (kvm_vcpu_abt_issea(vcpu)) {
- /*
- * For RAS the host kernel may handle this abort.
- * There is no need to pass the error into the guest.
- */
- if (kvm_handle_guest_sea(fault_ipa, kvm_vcpu_get_esr(vcpu)))
- kvm_inject_vabt(vcpu);
-
- return 1;
- }
-
trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu),
kvm_vcpu_get_hfar(vcpu), fault_ipa);
/* Check the stage-2 fault is trans. fault or write fault */
- if (fault_status != ESR_ELx_FSC_FAULT &&
- fault_status != ESR_ELx_FSC_PERM &&
- fault_status != ESR_ELx_FSC_ACCESS) {
+ if (!esr_fsc_is_translation_fault(esr) &&
+ !esr_fsc_is_permission_fault(esr) &&
+ !esr_fsc_is_access_flag_fault(esr)) {
kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
kvm_vcpu_trap_get_class(vcpu),
(unsigned long)kvm_vcpu_trap_get_fault(vcpu),
@@ -1490,7 +2092,47 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
idx = srcu_read_lock(&vcpu->kvm->srcu);
- gfn = fault_ipa >> PAGE_SHIFT;
+ /*
+ * We may have faulted on a shadow stage 2 page table if we are
+ * running a nested guest. In this case, we have to resolve the L2
+ * IPA to the L1 IPA first, before knowing what kind of memory should
+ * back the L1 IPA.
+ *
+ * If the shadow stage 2 page table walk faults, then we simply inject
+ * this to the guest and carry on.
+ *
+ * If there are no shadow S2 PTs because S2 is disabled, there is
+ * nothing to walk and we treat it as a 1:1 before going through the
+ * canonical translation.
+ */
+ if (kvm_is_nested_s2_mmu(vcpu->kvm,vcpu->arch.hw_mmu) &&
+ vcpu->arch.hw_mmu->nested_stage2_enabled) {
+ u32 esr;
+
+ ret = kvm_walk_nested_s2(vcpu, fault_ipa, &nested_trans);
+ if (ret == -EAGAIN) {
+ ret = 1;
+ goto out_unlock;
+ }
+
+ if (ret) {
+ esr = kvm_s2_trans_esr(&nested_trans);
+ kvm_inject_s2_fault(vcpu, esr);
+ goto out_unlock;
+ }
+
+ ret = kvm_s2_handle_perm_fault(vcpu, &nested_trans);
+ if (ret) {
+ esr = kvm_s2_trans_esr(&nested_trans);
+ kvm_inject_s2_fault(vcpu, esr);
+ goto out_unlock;
+ }
+
+ ipa = kvm_s2_trans_output(&nested_trans);
+ nested = &nested_trans;
+ }
+
+ gfn = ipa >> PAGE_SHIFT;
memslot = gfn_to_memslot(vcpu->kvm, gfn);
hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
write_fault = kvm_is_write_fault(vcpu);
@@ -1507,8 +2149,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
}
if (kvm_vcpu_abt_iss1tw(vcpu)) {
- kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
- ret = 1;
+ ret = kvm_inject_sea_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
goto out_unlock;
}
@@ -1534,28 +2175,34 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
* faulting VA. This is always 12 bits, irrespective
* of the page size.
*/
- fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
- ret = io_mem_abort(vcpu, fault_ipa);
+ ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);
+ ret = io_mem_abort(vcpu, ipa);
goto out_unlock;
}
/* Userspace should not be able to register out-of-bounds IPAs */
- VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->kvm));
+ VM_BUG_ON(ipa >= kvm_phys_size(vcpu->arch.hw_mmu));
- if (fault_status == ESR_ELx_FSC_ACCESS) {
+ if (esr_fsc_is_access_flag_fault(esr)) {
handle_access_fault(vcpu, fault_ipa);
ret = 1;
goto out_unlock;
}
- ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status);
+ VM_WARN_ON_ONCE(kvm_vcpu_trap_is_permission_fault(vcpu) &&
+ !write_fault && !kvm_vcpu_trap_is_exec_fault(vcpu));
+
+ if (kvm_slot_has_gmem(memslot))
+ ret = gmem_abort(vcpu, fault_ipa, nested, memslot,
+ esr_fsc_is_permission_fault(esr));
+ else
+ ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva,
+ esr_fsc_is_permission_fault(esr));
if (ret == 0)
ret = 1;
out:
- if (ret == -ENOEXEC) {
- kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
- ret = 1;
- }
+ if (ret == -ENOEXEC)
+ ret = kvm_inject_sea_iabt(vcpu, kvm_vcpu_get_hfar(vcpu));
out_unlock:
srcu_read_unlock(&vcpu->kvm->srcu, idx);
return ret;
@@ -1570,67 +2217,36 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
(range->end - range->start) << PAGE_SHIFT,
range->may_block);
+ kvm_nested_s2_unmap(kvm, range->may_block);
return false;
}
-bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
+bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- kvm_pfn_t pfn = pte_pfn(range->pte);
+ u64 size = (range->end - range->start) << PAGE_SHIFT;
if (!kvm->arch.mmu.pgt)
return false;
- WARN_ON(range->end - range->start != 1);
-
+ return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt,
+ range->start << PAGE_SHIFT,
+ size, true);
/*
- * If the page isn't tagged, defer to user_mem_abort() for sanitising
- * the MTE tags. The S2 pte should have been unmapped by
- * mmu_notifier_invalidate_range_end().
+ * TODO: Handle nested_mmu structures here using the reverse mapping in
+ * a later version of patch series.
*/
- if (kvm_has_mte(kvm) && !page_mte_tagged(pfn_to_page(pfn)))
- return false;
-
- /*
- * We've moved a page around, probably through CoW, so let's treat
- * it just like a translation fault and the map handler will clean
- * the cache to the PoC.
- *
- * The MMU notifiers will have unmapped a huge PMD before calling
- * ->change_pte() (which in turn calls kvm_set_spte_gfn()) and
- * therefore we never need to clear out a huge PMD through this
- * calling path and a memcache is not required.
- */
- kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, range->start << PAGE_SHIFT,
- PAGE_SIZE, __pfn_to_phys(pfn),
- KVM_PGTABLE_PROT_R, NULL, 0);
-
- return false;
}
-bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
+bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
u64 size = (range->end - range->start) << PAGE_SHIFT;
- kvm_pte_t kpte;
- pte_t pte;
if (!kvm->arch.mmu.pgt)
return false;
- WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
-
- kpte = kvm_pgtable_stage2_mkold(kvm->arch.mmu.pgt,
- range->start << PAGE_SHIFT);
- pte = __pte(kpte);
- return pte_valid(pte) && pte_young(pte);
-}
-
-bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
-{
- if (!kvm->arch.mmu.pgt)
- return false;
-
- return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt,
- range->start << PAGE_SHIFT);
+ return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt,
+ range->start << PAGE_SHIFT,
+ size, false);
}
phys_addr_t kvm_mmu_get_httbr(void)
@@ -1668,7 +2284,7 @@ static struct kvm_pgtable_mm_ops kvm_hyp_mm_ops = {
.virt_to_phys = kvm_host_pa,
};
-int kvm_mmu_init(u32 *hyp_va_bits)
+int __init kvm_mmu_init(u32 *hyp_va_bits)
{
int err;
u32 idmap_bits;
@@ -1687,16 +2303,9 @@ int kvm_mmu_init(u32 *hyp_va_bits)
BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
/*
- * The ID map may be configured to use an extended virtual address
- * range. This is only the case if system RAM is out of range for the
- * currently configured page size and VA_BITS_MIN, in which case we will
- * also need the extended virtual range for the HYP ID map, or we won't
- * be able to enable the EL2 MMU.
- *
- * However, in some cases the ID map may be configured for fewer than
- * the number of VA bits used by the regular kernel stage 1. This
- * happens when VA_BITS=52 and the kernel image is placed in PA space
- * below 48 bits.
+ * The ID map is always configured for 48 bits of translation, which
+ * may be fewer than the number of VA bits used by the regular kernel
+ * stage 1, when VA_BITS=52.
*
* At EL2, there is only one TTBR register, and we can't switch between
* translation tables *and* update TCR_EL2.T0SZ at the same time. Bottom
@@ -1707,7 +2316,7 @@ int kvm_mmu_init(u32 *hyp_va_bits)
* 1 VA bits to assure that the hypervisor can both ID map its code page
* and map any kernel memory.
*/
- idmap_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET);
+ idmap_bits = IDMAP_VA_BITS;
kernel_bits = vabits_actual;
*hyp_va_bits = max(idmap_bits, kernel_bits);
@@ -1745,6 +2354,7 @@ int kvm_mmu_init(u32 *hyp_va_bits)
goto out_destroy_pgtable;
io_map_base = hyp_idmap_start;
+ __hyp_va_bits = *hyp_va_bits;
return 0;
out_destroy_pgtable:
@@ -1761,20 +2371,42 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
const struct kvm_memory_slot *new,
enum kvm_mr_change change)
{
+ bool log_dirty_pages = new && new->flags & KVM_MEM_LOG_DIRTY_PAGES;
+
/*
* At this point memslot has been committed and there is an
* allocated dirty_bitmap[], dirty pages will be tracked while the
* memory slot is write protected.
*/
- if (change != KVM_MR_DELETE && new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
+ if (log_dirty_pages) {
+
+ if (change == KVM_MR_DELETE)
+ return;
+
/*
- * If we're with initial-all-set, we don't need to write
- * protect any pages because they're all reported as dirty.
- * Huge pages and normal pages will be write protect gradually.
+ * Huge and normal pages are write-protected and split
+ * on either of these two cases:
+ *
+ * 1. with initial-all-set: gradually with CLEAR ioctls,
*/
- if (!kvm_dirty_log_manual_protect_and_init_set(kvm)) {
- kvm_mmu_wp_memory_region(kvm, new->id);
- }
+ if (kvm_dirty_log_manual_protect_and_init_set(kvm))
+ return;
+ /*
+ * or
+ * 2. without initial-all-set: all in one shot when
+ * enabling dirty logging.
+ */
+ kvm_mmu_wp_memory_region(kvm, new->id);
+ kvm_mmu_split_memory_region(kvm, new->id);
+ } else {
+ /*
+ * Free any leftovers from the eager page splitting cache. Do
+ * this when deleting, moving, disabling dirty logging, or
+ * creating the memslot (a nop). Doing it for deletes makes
+ * sure we don't leak memory, and there's no need to keep the
+ * cache around for any of the other cases.
+ */
+ kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache);
}
}
@@ -1794,9 +2426,16 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
* Prevent userspace from creating a memory region outside of the IPA
* space addressable by the KVM guest IPA space.
*/
- if ((new->base_gfn + new->npages) > (kvm_phys_size(kvm) >> PAGE_SHIFT))
+ if ((new->base_gfn + new->npages) > (kvm_phys_size(&kvm->arch.mmu) >> PAGE_SHIFT))
return -EFAULT;
+ /*
+ * Only support guest_memfd backed memslots with mappable memory, since
+ * there aren't any CoCo VMs that support only private memory on arm64.
+ */
+ if (kvm_slot_has_gmem(new) && !kvm_memslot_is_gmem_only(new))
+ return -EINVAL;
+
hva = new->userspace_addr;
reg_end = hva + (new->npages << PAGE_SHIFT);
@@ -1830,6 +2469,15 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
ret = -EINVAL;
break;
}
+
+ /*
+ * Cacheable PFNMAP is allowed only if the hardware
+ * supports it.
+ */
+ if (kvm_vma_is_cacheable(vma) && !kvm_supports_cacheable_pfnmap()) {
+ ret = -EINVAL;
+ break;
+ }
}
hva = min(reg_end, vma->vm_end);
} while (hva < reg_end);
@@ -1846,11 +2494,6 @@ void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
{
}
-void kvm_arch_flush_shadow_all(struct kvm *kvm)
-{
- kvm_free_stage2_pgd(&kvm->arch.mmu);
-}
-
void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot)
{
@@ -1858,7 +2501,8 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
phys_addr_t size = slot->npages << PAGE_SHIFT;
write_lock(&kvm->mmu_lock);
- unmap_stage2_range(&kvm->arch.mmu, gpa, size);
+ kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, size, true);
+ kvm_nested_s2_unmap(kvm, true);
write_unlock(&kvm->mmu_lock);
}
diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
new file mode 100644
index 000000000000..cdeeb8f09e72
--- /dev/null
+++ b/arch/arm64/kvm/nested.c
@@ -0,0 +1,1919 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2017 - Columbia University and Linaro Ltd.
+ * Author: Jintack Lim <jintack.lim@linaro.org>
+ */
+
+#include <linux/bitfield.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+
+#include <asm/fixmap.h>
+#include <asm/kvm_arm.h>
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_mmu.h>
+#include <asm/kvm_nested.h>
+#include <asm/sysreg.h>
+
+#include "sys_regs.h"
+
+struct vncr_tlb {
+ /* The guest's VNCR_EL2 */
+ u64 gva;
+ struct s1_walk_info wi;
+ struct s1_walk_result wr;
+
+ u64 hpa;
+
+ /* -1 when not mapped on a CPU */
+ int cpu;
+
+ /*
+ * true if the TLB is valid. Can only be changed with the
+ * mmu_lock held.
+ */
+ bool valid;
+};
+
+/*
+ * Ratio of live shadow S2 MMU per vcpu. This is a trade-off between
+ * memory usage and potential number of different sets of S2 PTs in
+ * the guests. Running out of S2 MMUs only affects performance (we
+ * will invalidate them more often).
+ */
+#define S2_MMU_PER_VCPU 2
+
+void kvm_init_nested(struct kvm *kvm)
+{
+ kvm->arch.nested_mmus = NULL;
+ kvm->arch.nested_mmus_size = 0;
+ atomic_set(&kvm->arch.vncr_map_count, 0);
+}
+
+static int init_nested_s2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
+{
+ /*
+ * We only initialise the IPA range on the canonical MMU, which
+ * defines the contract between KVM and userspace on where the
+ * "hardware" is in the IPA space. This affects the validity of MMIO
+ * exits forwarded to userspace, for example.
+ *
+ * For nested S2s, we use the PARange as exposed to the guest, as it
+ * is allowed to use it at will to expose whatever memory map it
+ * wants to its own guests as it would be on real HW.
+ */
+ return kvm_init_stage2_mmu(kvm, mmu, kvm_get_pa_bits(kvm));
+}
+
+int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_s2_mmu *tmp;
+ int num_mmus, ret = 0;
+
+ if (test_bit(KVM_ARM_VCPU_HAS_EL2_E2H0, kvm->arch.vcpu_features) &&
+ !cpus_have_final_cap(ARM64_HAS_HCR_NV1))
+ return -EINVAL;
+
+ if (!vcpu->arch.ctxt.vncr_array)
+ vcpu->arch.ctxt.vncr_array = (u64 *)__get_free_page(GFP_KERNEL_ACCOUNT |
+ __GFP_ZERO);
+
+ if (!vcpu->arch.ctxt.vncr_array)
+ return -ENOMEM;
+
+ /*
+ * Let's treat memory allocation failures as benign: If we fail to
+ * allocate anything, return an error and keep the allocated array
+ * alive. Userspace may try to recover by initializing the vcpu
+ * again, and there is no reason to affect the whole VM for this.
+ */
+ num_mmus = atomic_read(&kvm->online_vcpus) * S2_MMU_PER_VCPU;
+ tmp = kvrealloc(kvm->arch.nested_mmus,
+ size_mul(sizeof(*kvm->arch.nested_mmus), num_mmus),
+ GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!tmp)
+ return -ENOMEM;
+
+ swap(kvm->arch.nested_mmus, tmp);
+
+ /*
+ * If we went through a realocation, adjust the MMU back-pointers in
+ * the previously initialised kvm_pgtable structures.
+ */
+ if (kvm->arch.nested_mmus != tmp)
+ for (int i = 0; i < kvm->arch.nested_mmus_size; i++)
+ kvm->arch.nested_mmus[i].pgt->mmu = &kvm->arch.nested_mmus[i];
+
+ for (int i = kvm->arch.nested_mmus_size; !ret && i < num_mmus; i++)
+ ret = init_nested_s2_mmu(kvm, &kvm->arch.nested_mmus[i]);
+
+ if (ret) {
+ for (int i = kvm->arch.nested_mmus_size; i < num_mmus; i++)
+ kvm_free_stage2_pgd(&kvm->arch.nested_mmus[i]);
+
+ free_page((unsigned long)vcpu->arch.ctxt.vncr_array);
+ vcpu->arch.ctxt.vncr_array = NULL;
+
+ return ret;
+ }
+
+ kvm->arch.nested_mmus_size = num_mmus;
+
+ return 0;
+}
+
+struct s2_walk_info {
+ u64 baddr;
+ unsigned int max_oa_bits;
+ unsigned int pgshift;
+ unsigned int sl;
+ unsigned int t0sz;
+ bool be;
+ bool ha;
+};
+
+static u32 compute_fsc(int level, u32 fsc)
+{
+ return fsc | (level & 0x3);
+}
+
+static int esr_s2_fault(struct kvm_vcpu *vcpu, int level, u32 fsc)
+{
+ u32 esr;
+
+ esr = kvm_vcpu_get_esr(vcpu) & ~ESR_ELx_FSC;
+ esr |= compute_fsc(level, fsc);
+ return esr;
+}
+
+static int get_ia_size(struct s2_walk_info *wi)
+{
+ return 64 - wi->t0sz;
+}
+
+static int check_base_s2_limits(struct s2_walk_info *wi,
+ int level, int input_size, int stride)
+{
+ int start_size, ia_size;
+
+ ia_size = get_ia_size(wi);
+
+ /* Check translation limits */
+ switch (BIT(wi->pgshift)) {
+ case SZ_64K:
+ if (level == 0 || (level == 1 && ia_size <= 42))
+ return -EFAULT;
+ break;
+ case SZ_16K:
+ if (level == 0 || (level == 1 && ia_size <= 40))
+ return -EFAULT;
+ break;
+ case SZ_4K:
+ if (level < 0 || (level == 0 && ia_size <= 42))
+ return -EFAULT;
+ break;
+ }
+
+ /* Check input size limits */
+ if (input_size > ia_size)
+ return -EFAULT;
+
+ /* Check number of entries in starting level table */
+ start_size = input_size - ((3 - level) * stride + wi->pgshift);
+ if (start_size < 1 || start_size > stride + 4)
+ return -EFAULT;
+
+ return 0;
+}
+
+/* Check if output is within boundaries */
+static int check_output_size(struct s2_walk_info *wi, phys_addr_t output)
+{
+ unsigned int output_size = wi->max_oa_bits;
+
+ if (output_size != 48 && (output & GENMASK_ULL(47, output_size)))
+ return -1;
+
+ return 0;
+}
+
+static int read_guest_s2_desc(struct kvm_vcpu *vcpu, phys_addr_t pa, u64 *desc,
+ struct s2_walk_info *wi)
+{
+ u64 val;
+ int r;
+
+ r = kvm_read_guest(vcpu->kvm, pa, &val, sizeof(val));
+ if (r)
+ return r;
+
+ /*
+ * Handle reversedescriptors if endianness differs between the
+ * host and the guest hypervisor.
+ */
+ if (wi->be)
+ *desc = be64_to_cpu((__force __be64)val);
+ else
+ *desc = le64_to_cpu((__force __le64)val);
+
+ return 0;
+}
+
+static int swap_guest_s2_desc(struct kvm_vcpu *vcpu, phys_addr_t pa, u64 old, u64 new,
+ struct s2_walk_info *wi)
+{
+ if (wi->be) {
+ old = (__force u64)cpu_to_be64(old);
+ new = (__force u64)cpu_to_be64(new);
+ } else {
+ old = (__force u64)cpu_to_le64(old);
+ new = (__force u64)cpu_to_le64(new);
+ }
+
+ return __kvm_at_swap_desc(vcpu->kvm, pa, old, new);
+}
+
+/*
+ * This is essentially a C-version of the pseudo code from the ARM ARM
+ * AArch64.TranslationTableWalk function. I strongly recommend looking at
+ * that pseudocode in trying to understand this.
+ *
+ * Must be called with the kvm->srcu read lock held
+ */
+static int walk_nested_s2_pgd(struct kvm_vcpu *vcpu, phys_addr_t ipa,
+ struct s2_walk_info *wi, struct kvm_s2_trans *out)
+{
+ int first_block_level, level, stride, input_size, base_lower_bound;
+ phys_addr_t base_addr;
+ unsigned int addr_top, addr_bottom;
+ u64 desc, new_desc; /* page table entry */
+ int ret;
+ phys_addr_t paddr;
+
+ switch (BIT(wi->pgshift)) {
+ default:
+ case SZ_64K:
+ case SZ_16K:
+ level = 3 - wi->sl;
+ first_block_level = 2;
+ break;
+ case SZ_4K:
+ level = 2 - wi->sl;
+ first_block_level = 1;
+ break;
+ }
+
+ stride = wi->pgshift - 3;
+ input_size = get_ia_size(wi);
+ if (input_size > 48 || input_size < 25)
+ return -EFAULT;
+
+ ret = check_base_s2_limits(wi, level, input_size, stride);
+ if (WARN_ON(ret))
+ return ret;
+
+ base_lower_bound = 3 + input_size - ((3 - level) * stride +
+ wi->pgshift);
+ base_addr = wi->baddr & GENMASK_ULL(47, base_lower_bound);
+
+ if (check_output_size(wi, base_addr)) {
+ out->esr = compute_fsc(level, ESR_ELx_FSC_ADDRSZ);
+ return 1;
+ }
+
+ addr_top = input_size - 1;
+
+ while (1) {
+ phys_addr_t index;
+
+ addr_bottom = (3 - level) * stride + wi->pgshift;
+ index = (ipa & GENMASK_ULL(addr_top, addr_bottom))
+ >> (addr_bottom - 3);
+
+ paddr = base_addr | index;
+ ret = read_guest_s2_desc(vcpu, paddr, &desc, wi);
+ if (ret < 0)
+ return ret;
+
+ new_desc = desc;
+
+ /* Check for valid descriptor at this point */
+ if (!(desc & KVM_PTE_VALID)) {
+ out->esr = compute_fsc(level, ESR_ELx_FSC_FAULT);
+ out->desc = desc;
+ return 1;
+ }
+
+ if (FIELD_GET(KVM_PTE_TYPE, desc) == KVM_PTE_TYPE_BLOCK) {
+ if (level < 3)
+ break;
+
+ out->esr = compute_fsc(level, ESR_ELx_FSC_FAULT);
+ out->desc = desc;
+ return 1;
+ }
+
+ /* We're at the final level */
+ if (level == 3)
+ break;
+
+ if (check_output_size(wi, desc)) {
+ out->esr = compute_fsc(level, ESR_ELx_FSC_ADDRSZ);
+ out->desc = desc;
+ return 1;
+ }
+
+ base_addr = desc & GENMASK_ULL(47, wi->pgshift);
+
+ level += 1;
+ addr_top = addr_bottom - 1;
+ }
+
+ if (level < first_block_level) {
+ out->esr = compute_fsc(level, ESR_ELx_FSC_FAULT);
+ out->desc = desc;
+ return 1;
+ }
+
+ if (check_output_size(wi, desc)) {
+ out->esr = compute_fsc(level, ESR_ELx_FSC_ADDRSZ);
+ out->desc = desc;
+ return 1;
+ }
+
+ if (wi->ha)
+ new_desc |= KVM_PTE_LEAF_ATTR_LO_S2_AF;
+
+ if (new_desc != desc) {
+ ret = swap_guest_s2_desc(vcpu, paddr, desc, new_desc, wi);
+ if (ret)
+ return ret;
+
+ desc = new_desc;
+ }
+
+ if (!(desc & KVM_PTE_LEAF_ATTR_LO_S2_AF)) {
+ out->esr = compute_fsc(level, ESR_ELx_FSC_ACCESS);
+ out->desc = desc;
+ return 1;
+ }
+
+ addr_bottom += contiguous_bit_shift(desc, wi, level);
+
+ /* Calculate and return the result */
+ paddr = (desc & GENMASK_ULL(47, addr_bottom)) |
+ (ipa & GENMASK_ULL(addr_bottom - 1, 0));
+ out->output = paddr;
+ out->block_size = 1UL << ((3 - level) * stride + wi->pgshift);
+ out->readable = desc & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;
+ out->writable = desc & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;
+ out->level = level;
+ out->desc = desc;
+ return 0;
+}
+
+static void vtcr_to_walk_info(u64 vtcr, struct s2_walk_info *wi)
+{
+ wi->t0sz = vtcr & TCR_EL2_T0SZ_MASK;
+
+ switch (vtcr & VTCR_EL2_TG0_MASK) {
+ case VTCR_EL2_TG0_4K:
+ wi->pgshift = 12; break;
+ case VTCR_EL2_TG0_16K:
+ wi->pgshift = 14; break;
+ case VTCR_EL2_TG0_64K:
+ default: /* IMPDEF: treat any other value as 64k */
+ wi->pgshift = 16; break;
+ }
+
+ wi->sl = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr);
+ /* Global limit for now, should eventually be per-VM */
+ wi->max_oa_bits = min(get_kvm_ipa_limit(),
+ ps_to_output_size(FIELD_GET(VTCR_EL2_PS_MASK, vtcr), false));
+
+ wi->ha = vtcr & VTCR_EL2_HA;
+}
+
+int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa,
+ struct kvm_s2_trans *result)
+{
+ u64 vtcr = vcpu_read_sys_reg(vcpu, VTCR_EL2);
+ struct s2_walk_info wi;
+ int ret;
+
+ result->esr = 0;
+
+ if (!vcpu_has_nv(vcpu))
+ return 0;
+
+ wi.baddr = vcpu_read_sys_reg(vcpu, VTTBR_EL2);
+
+ vtcr_to_walk_info(vtcr, &wi);
+
+ wi.be = vcpu_read_sys_reg(vcpu, SCTLR_EL2) & SCTLR_ELx_EE;
+
+ ret = walk_nested_s2_pgd(vcpu, gipa, &wi, result);
+ if (ret)
+ result->esr |= (kvm_vcpu_get_esr(vcpu) & ~ESR_ELx_FSC);
+
+ return ret;
+}
+
+static unsigned int ttl_to_size(u8 ttl)
+{
+ int level = ttl & 3;
+ int gran = (ttl >> 2) & 3;
+ unsigned int max_size = 0;
+
+ switch (gran) {
+ case TLBI_TTL_TG_4K:
+ switch (level) {
+ case 0:
+ break;
+ case 1:
+ max_size = SZ_1G;
+ break;
+ case 2:
+ max_size = SZ_2M;
+ break;
+ case 3:
+ max_size = SZ_4K;
+ break;
+ }
+ break;
+ case TLBI_TTL_TG_16K:
+ switch (level) {
+ case 0:
+ case 1:
+ break;
+ case 2:
+ max_size = SZ_32M;
+ break;
+ case 3:
+ max_size = SZ_16K;
+ break;
+ }
+ break;
+ case TLBI_TTL_TG_64K:
+ switch (level) {
+ case 0:
+ case 1:
+ /* No 52bit IPA support */
+ break;
+ case 2:
+ max_size = SZ_512M;
+ break;
+ case 3:
+ max_size = SZ_64K;
+ break;
+ }
+ break;
+ default: /* No size information */
+ break;
+ }
+
+ return max_size;
+}
+
+static u8 pgshift_level_to_ttl(u16 shift, u8 level)
+{
+ u8 ttl;
+
+ switch(shift) {
+ case 12:
+ ttl = TLBI_TTL_TG_4K;
+ break;
+ case 14:
+ ttl = TLBI_TTL_TG_16K;
+ break;
+ case 16:
+ ttl = TLBI_TTL_TG_64K;
+ break;
+ default:
+ BUG();
+ }
+
+ ttl <<= 2;
+ ttl |= level & 3;
+
+ return ttl;
+}
+
+/*
+ * Compute the equivalent of the TTL field by parsing the shadow PT. The
+ * granule size is extracted from the cached VTCR_EL2.TG0 while the level is
+ * retrieved from first entry carrying the level as a tag.
+ */
+static u8 get_guest_mapping_ttl(struct kvm_s2_mmu *mmu, u64 addr)
+{
+ u64 tmp, sz = 0, vtcr = mmu->tlb_vtcr;
+ kvm_pte_t pte;
+ u8 ttl, level;
+
+ lockdep_assert_held_write(&kvm_s2_mmu_to_kvm(mmu)->mmu_lock);
+
+ switch (vtcr & VTCR_EL2_TG0_MASK) {
+ case VTCR_EL2_TG0_4K:
+ ttl = (TLBI_TTL_TG_4K << 2);
+ break;
+ case VTCR_EL2_TG0_16K:
+ ttl = (TLBI_TTL_TG_16K << 2);
+ break;
+ case VTCR_EL2_TG0_64K:
+ default: /* IMPDEF: treat any other value as 64k */
+ ttl = (TLBI_TTL_TG_64K << 2);
+ break;
+ }
+
+ tmp = addr;
+
+again:
+ /* Iteratively compute the block sizes for a particular granule size */
+ switch (vtcr & VTCR_EL2_TG0_MASK) {
+ case VTCR_EL2_TG0_4K:
+ if (sz < SZ_4K) sz = SZ_4K;
+ else if (sz < SZ_2M) sz = SZ_2M;
+ else if (sz < SZ_1G) sz = SZ_1G;
+ else sz = 0;
+ break;
+ case VTCR_EL2_TG0_16K:
+ if (sz < SZ_16K) sz = SZ_16K;
+ else if (sz < SZ_32M) sz = SZ_32M;
+ else sz = 0;
+ break;
+ case VTCR_EL2_TG0_64K:
+ default: /* IMPDEF: treat any other value as 64k */
+ if (sz < SZ_64K) sz = SZ_64K;
+ else if (sz < SZ_512M) sz = SZ_512M;
+ else sz = 0;
+ break;
+ }
+
+ if (sz == 0)
+ return 0;
+
+ tmp &= ~(sz - 1);
+ if (kvm_pgtable_get_leaf(mmu->pgt, tmp, &pte, NULL))
+ goto again;
+ if (!(pte & PTE_VALID))
+ goto again;
+ level = FIELD_GET(KVM_NV_GUEST_MAP_SZ, pte);
+ if (!level)
+ goto again;
+
+ ttl |= level;
+
+ /*
+ * We now have found some level information in the shadow S2. Check
+ * that the resulting range is actually including the original IPA.
+ */
+ sz = ttl_to_size(ttl);
+ if (addr < (tmp + sz))
+ return ttl;
+
+ return 0;
+}
+
+unsigned long compute_tlb_inval_range(struct kvm_s2_mmu *mmu, u64 val)
+{
+ struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
+ unsigned long max_size;
+ u8 ttl;
+
+ ttl = FIELD_GET(TLBI_TTL_MASK, val);
+
+ if (!ttl || !kvm_has_feat(kvm, ID_AA64MMFR2_EL1, TTL, IMP)) {
+ /* No TTL, check the shadow S2 for a hint */
+ u64 addr = (val & GENMASK_ULL(35, 0)) << 12;
+ ttl = get_guest_mapping_ttl(mmu, addr);
+ }
+
+ max_size = ttl_to_size(ttl);
+
+ if (!max_size) {
+ /* Compute the maximum extent of the invalidation */
+ switch (mmu->tlb_vtcr & VTCR_EL2_TG0_MASK) {
+ case VTCR_EL2_TG0_4K:
+ max_size = SZ_1G;
+ break;
+ case VTCR_EL2_TG0_16K:
+ max_size = SZ_32M;
+ break;
+ case VTCR_EL2_TG0_64K:
+ default: /* IMPDEF: treat any other value as 64k */
+ /*
+ * No, we do not support 52bit IPA in nested yet. Once
+ * we do, this should be 4TB.
+ */
+ max_size = SZ_512M;
+ break;
+ }
+ }
+
+ WARN_ON(!max_size);
+ return max_size;
+}
+
+/*
+ * We can have multiple *different* MMU contexts with the same VMID:
+ *
+ * - S2 being enabled or not, hence differing by the HCR_EL2.VM bit
+ *
+ * - Multiple vcpus using private S2s (huh huh...), hence differing by the
+ * VBBTR_EL2.BADDR address
+ *
+ * - A combination of the above...
+ *
+ * We can always identify which MMU context to pick at run-time. However,
+ * TLB invalidation involving a VMID must take action on all the TLBs using
+ * this particular VMID. This translates into applying the same invalidation
+ * operation to all the contexts that are using this VMID. Moar phun!
+ */
+void kvm_s2_mmu_iterate_by_vmid(struct kvm *kvm, u16 vmid,
+ const union tlbi_info *info,
+ void (*tlbi_callback)(struct kvm_s2_mmu *,
+ const union tlbi_info *))
+{
+ write_lock(&kvm->mmu_lock);
+
+ for (int i = 0; i < kvm->arch.nested_mmus_size; i++) {
+ struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
+
+ if (!kvm_s2_mmu_valid(mmu))
+ continue;
+
+ if (vmid == get_vmid(mmu->tlb_vttbr))
+ tlbi_callback(mmu, info);
+ }
+
+ write_unlock(&kvm->mmu_lock);
+}
+
+struct kvm_s2_mmu *lookup_s2_mmu(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+ bool nested_stage2_enabled;
+ u64 vttbr, vtcr, hcr;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2);
+ vtcr = vcpu_read_sys_reg(vcpu, VTCR_EL2);
+ hcr = vcpu_read_sys_reg(vcpu, HCR_EL2);
+
+ nested_stage2_enabled = hcr & HCR_VM;
+
+ /* Don't consider the CnP bit for the vttbr match */
+ vttbr &= ~VTTBR_CNP_BIT;
+
+ /*
+ * Two possibilities when looking up a S2 MMU context:
+ *
+ * - either S2 is enabled in the guest, and we need a context that is
+ * S2-enabled and matches the full VTTBR (VMID+BADDR) and VTCR,
+ * which makes it safe from a TLB conflict perspective (a broken
+ * guest won't be able to generate them),
+ *
+ * - or S2 is disabled, and we need a context that is S2-disabled
+ * and matches the VMID only, as all TLBs are tagged by VMID even
+ * if S2 translation is disabled.
+ */
+ for (int i = 0; i < kvm->arch.nested_mmus_size; i++) {
+ struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
+
+ if (!kvm_s2_mmu_valid(mmu))
+ continue;
+
+ if (nested_stage2_enabled &&
+ mmu->nested_stage2_enabled &&
+ vttbr == mmu->tlb_vttbr &&
+ vtcr == mmu->tlb_vtcr)
+ return mmu;
+
+ if (!nested_stage2_enabled &&
+ !mmu->nested_stage2_enabled &&
+ get_vmid(vttbr) == get_vmid(mmu->tlb_vttbr))
+ return mmu;
+ }
+ return NULL;
+}
+
+static struct kvm_s2_mmu *get_s2_mmu_nested(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_s2_mmu *s2_mmu;
+ int i;
+
+ lockdep_assert_held_write(&vcpu->kvm->mmu_lock);
+
+ s2_mmu = lookup_s2_mmu(vcpu);
+ if (s2_mmu)
+ goto out;
+
+ /*
+ * Make sure we don't always search from the same point, or we
+ * will always reuse a potentially active context, leaving
+ * free contexts unused.
+ */
+ for (i = kvm->arch.nested_mmus_next;
+ i < (kvm->arch.nested_mmus_size + kvm->arch.nested_mmus_next);
+ i++) {
+ s2_mmu = &kvm->arch.nested_mmus[i % kvm->arch.nested_mmus_size];
+
+ if (atomic_read(&s2_mmu->refcnt) == 0)
+ break;
+ }
+ BUG_ON(atomic_read(&s2_mmu->refcnt)); /* We have struct MMUs to spare */
+
+ /* Set the scene for the next search */
+ kvm->arch.nested_mmus_next = (i + 1) % kvm->arch.nested_mmus_size;
+
+ /* Make sure we don't forget to do the laundry */
+ if (kvm_s2_mmu_valid(s2_mmu))
+ s2_mmu->pending_unmap = true;
+
+ /*
+ * The virtual VMID (modulo CnP) will be used as a key when matching
+ * an existing kvm_s2_mmu.
+ *
+ * We cache VTCR at allocation time, once and for all. It'd be great
+ * if the guest didn't screw that one up, as this is not very
+ * forgiving...
+ */
+ s2_mmu->tlb_vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2) & ~VTTBR_CNP_BIT;
+ s2_mmu->tlb_vtcr = vcpu_read_sys_reg(vcpu, VTCR_EL2);
+ s2_mmu->nested_stage2_enabled = vcpu_read_sys_reg(vcpu, HCR_EL2) & HCR_VM;
+
+out:
+ atomic_inc(&s2_mmu->refcnt);
+
+ /*
+ * Set the vCPU request to perform an unmap, even if the pending unmap
+ * originates from another vCPU. This guarantees that the MMU has been
+ * completely unmapped before any vCPU actually uses it, and allows
+ * multiple vCPUs to lend a hand with completing the unmap.
+ */
+ if (s2_mmu->pending_unmap)
+ kvm_make_request(KVM_REQ_NESTED_S2_UNMAP, vcpu);
+
+ return s2_mmu;
+}
+
+void kvm_init_nested_s2_mmu(struct kvm_s2_mmu *mmu)
+{
+ /* CnP being set denotes an invalid entry */
+ mmu->tlb_vttbr = VTTBR_CNP_BIT;
+ mmu->nested_stage2_enabled = false;
+ atomic_set(&mmu->refcnt, 0);
+}
+
+void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu)
+{
+ /*
+ * If the vCPU kept its reference on the MMU after the last put,
+ * keep rolling with it.
+ */
+ if (is_hyp_ctxt(vcpu)) {
+ if (!vcpu->arch.hw_mmu)
+ vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu;
+ } else {
+ if (!vcpu->arch.hw_mmu) {
+ scoped_guard(write_lock, &vcpu->kvm->mmu_lock)
+ vcpu->arch.hw_mmu = get_s2_mmu_nested(vcpu);
+ }
+
+ if (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_NV)
+ kvm_make_request(KVM_REQ_MAP_L1_VNCR_EL2, vcpu);
+ }
+}
+
+void kvm_vcpu_put_hw_mmu(struct kvm_vcpu *vcpu)
+{
+ /* Unconditionally drop the VNCR mapping if we have one */
+ if (host_data_test_flag(L1_VNCR_MAPPED)) {
+ BUG_ON(vcpu->arch.vncr_tlb->cpu != smp_processor_id());
+ BUG_ON(is_hyp_ctxt(vcpu));
+
+ clear_fixmap(vncr_fixmap(vcpu->arch.vncr_tlb->cpu));
+ vcpu->arch.vncr_tlb->cpu = -1;
+ host_data_clear_flag(L1_VNCR_MAPPED);
+ atomic_dec(&vcpu->kvm->arch.vncr_map_count);
+ }
+
+ /*
+ * Keep a reference on the associated stage-2 MMU if the vCPU is
+ * scheduling out and not in WFI emulation, suggesting it is likely to
+ * reuse the MMU sometime soon.
+ */
+ if (vcpu->scheduled_out && !vcpu_get_flag(vcpu, IN_WFI))
+ return;
+
+ if (kvm_is_nested_s2_mmu(vcpu->kvm, vcpu->arch.hw_mmu))
+ atomic_dec(&vcpu->arch.hw_mmu->refcnt);
+
+ vcpu->arch.hw_mmu = NULL;
+}
+
+/*
+ * Returns non-zero if permission fault is handled by injecting it to the next
+ * level hypervisor.
+ */
+int kvm_s2_handle_perm_fault(struct kvm_vcpu *vcpu, struct kvm_s2_trans *trans)
+{
+ bool forward_fault = false;
+
+ trans->esr = 0;
+
+ if (!kvm_vcpu_trap_is_permission_fault(vcpu))
+ return 0;
+
+ if (kvm_vcpu_trap_is_iabt(vcpu)) {
+ if (vcpu_mode_priv(vcpu))
+ forward_fault = !kvm_s2_trans_exec_el1(vcpu->kvm, trans);
+ else
+ forward_fault = !kvm_s2_trans_exec_el0(vcpu->kvm, trans);
+ } else {
+ bool write_fault = kvm_is_write_fault(vcpu);
+
+ forward_fault = ((write_fault && !trans->writable) ||
+ (!write_fault && !trans->readable));
+ }
+
+ if (forward_fault)
+ trans->esr = esr_s2_fault(vcpu, trans->level, ESR_ELx_FSC_PERM);
+
+ return forward_fault;
+}
+
+int kvm_inject_s2_fault(struct kvm_vcpu *vcpu, u64 esr_el2)
+{
+ vcpu_write_sys_reg(vcpu, vcpu->arch.fault.far_el2, FAR_EL2);
+ vcpu_write_sys_reg(vcpu, vcpu->arch.fault.hpfar_el2, HPFAR_EL2);
+
+ return kvm_inject_nested_sync(vcpu, esr_el2);
+}
+
+static void invalidate_vncr(struct vncr_tlb *vt)
+{
+ vt->valid = false;
+ if (vt->cpu != -1)
+ clear_fixmap(vncr_fixmap(vt->cpu));
+}
+
+static void kvm_invalidate_vncr_ipa(struct kvm *kvm, u64 start, u64 end)
+{
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ if (!kvm_has_feat(kvm, ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY))
+ return;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ struct vncr_tlb *vt = vcpu->arch.vncr_tlb;
+ u64 ipa_start, ipa_end, ipa_size;
+
+ /*
+ * Careful here: We end-up here from an MMU notifier,
+ * and this can race against a vcpu not being onlined
+ * yet, without the pseudo-TLB being allocated.
+ *
+ * Skip those, as they obviously don't participate in
+ * the invalidation at this stage.
+ */
+ if (!vt)
+ continue;
+
+ if (!vt->valid)
+ continue;
+
+ ipa_size = ttl_to_size(pgshift_level_to_ttl(vt->wi.pgshift,
+ vt->wr.level));
+ ipa_start = vt->wr.pa & ~(ipa_size - 1);
+ ipa_end = ipa_start + ipa_size;
+
+ if (ipa_end <= start || ipa_start >= end)
+ continue;
+
+ invalidate_vncr(vt);
+ }
+}
+
+struct s1e2_tlbi_scope {
+ enum {
+ TLBI_ALL,
+ TLBI_VA,
+ TLBI_VAA,
+ TLBI_ASID,
+ } type;
+
+ u16 asid;
+ u64 va;
+ u64 size;
+};
+
+static void invalidate_vncr_va(struct kvm *kvm,
+ struct s1e2_tlbi_scope *scope)
+{
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ struct vncr_tlb *vt = vcpu->arch.vncr_tlb;
+ u64 va_start, va_end, va_size;
+
+ if (!vt->valid)
+ continue;
+
+ va_size = ttl_to_size(pgshift_level_to_ttl(vt->wi.pgshift,
+ vt->wr.level));
+ va_start = vt->gva & ~(va_size - 1);
+ va_end = va_start + va_size;
+
+ switch (scope->type) {
+ case TLBI_ALL:
+ break;
+
+ case TLBI_VA:
+ if (va_end <= scope->va ||
+ va_start >= (scope->va + scope->size))
+ continue;
+ if (vt->wr.nG && vt->wr.asid != scope->asid)
+ continue;
+ break;
+
+ case TLBI_VAA:
+ if (va_end <= scope->va ||
+ va_start >= (scope->va + scope->size))
+ continue;
+ break;
+
+ case TLBI_ASID:
+ if (!vt->wr.nG || vt->wr.asid != scope->asid)
+ continue;
+ break;
+ }
+
+ invalidate_vncr(vt);
+ }
+}
+
+#define tlbi_va_s1_to_va(v) (u64)sign_extend64((v) << 12, 48)
+
+static void compute_s1_tlbi_range(struct kvm_vcpu *vcpu, u32 inst, u64 val,
+ struct s1e2_tlbi_scope *scope)
+{
+ switch (inst) {
+ case OP_TLBI_ALLE2:
+ case OP_TLBI_ALLE2IS:
+ case OP_TLBI_ALLE2OS:
+ case OP_TLBI_VMALLE1:
+ case OP_TLBI_VMALLE1IS:
+ case OP_TLBI_VMALLE1OS:
+ case OP_TLBI_ALLE2NXS:
+ case OP_TLBI_ALLE2ISNXS:
+ case OP_TLBI_ALLE2OSNXS:
+ case OP_TLBI_VMALLE1NXS:
+ case OP_TLBI_VMALLE1ISNXS:
+ case OP_TLBI_VMALLE1OSNXS:
+ scope->type = TLBI_ALL;
+ break;
+ case OP_TLBI_VAE2:
+ case OP_TLBI_VAE2IS:
+ case OP_TLBI_VAE2OS:
+ case OP_TLBI_VAE1:
+ case OP_TLBI_VAE1IS:
+ case OP_TLBI_VAE1OS:
+ case OP_TLBI_VAE2NXS:
+ case OP_TLBI_VAE2ISNXS:
+ case OP_TLBI_VAE2OSNXS:
+ case OP_TLBI_VAE1NXS:
+ case OP_TLBI_VAE1ISNXS:
+ case OP_TLBI_VAE1OSNXS:
+ case OP_TLBI_VALE2:
+ case OP_TLBI_VALE2IS:
+ case OP_TLBI_VALE2OS:
+ case OP_TLBI_VALE1:
+ case OP_TLBI_VALE1IS:
+ case OP_TLBI_VALE1OS:
+ case OP_TLBI_VALE2NXS:
+ case OP_TLBI_VALE2ISNXS:
+ case OP_TLBI_VALE2OSNXS:
+ case OP_TLBI_VALE1NXS:
+ case OP_TLBI_VALE1ISNXS:
+ case OP_TLBI_VALE1OSNXS:
+ scope->type = TLBI_VA;
+ scope->size = ttl_to_size(FIELD_GET(TLBI_TTL_MASK, val));
+ if (!scope->size)
+ scope->size = SZ_1G;
+ scope->va = tlbi_va_s1_to_va(val) & ~(scope->size - 1);
+ scope->asid = FIELD_GET(TLBIR_ASID_MASK, val);
+ break;
+ case OP_TLBI_ASIDE1:
+ case OP_TLBI_ASIDE1IS:
+ case OP_TLBI_ASIDE1OS:
+ case OP_TLBI_ASIDE1NXS:
+ case OP_TLBI_ASIDE1ISNXS:
+ case OP_TLBI_ASIDE1OSNXS:
+ scope->type = TLBI_ASID;
+ scope->asid = FIELD_GET(TLBIR_ASID_MASK, val);
+ break;
+ case OP_TLBI_VAAE1:
+ case OP_TLBI_VAAE1IS:
+ case OP_TLBI_VAAE1OS:
+ case OP_TLBI_VAAE1NXS:
+ case OP_TLBI_VAAE1ISNXS:
+ case OP_TLBI_VAAE1OSNXS:
+ case OP_TLBI_VAALE1:
+ case OP_TLBI_VAALE1IS:
+ case OP_TLBI_VAALE1OS:
+ case OP_TLBI_VAALE1NXS:
+ case OP_TLBI_VAALE1ISNXS:
+ case OP_TLBI_VAALE1OSNXS:
+ scope->type = TLBI_VAA;
+ scope->size = ttl_to_size(FIELD_GET(TLBI_TTL_MASK, val));
+ if (!scope->size)
+ scope->size = SZ_1G;
+ scope->va = tlbi_va_s1_to_va(val) & ~(scope->size - 1);
+ break;
+ case OP_TLBI_RVAE2:
+ case OP_TLBI_RVAE2IS:
+ case OP_TLBI_RVAE2OS:
+ case OP_TLBI_RVAE1:
+ case OP_TLBI_RVAE1IS:
+ case OP_TLBI_RVAE1OS:
+ case OP_TLBI_RVAE2NXS:
+ case OP_TLBI_RVAE2ISNXS:
+ case OP_TLBI_RVAE2OSNXS:
+ case OP_TLBI_RVAE1NXS:
+ case OP_TLBI_RVAE1ISNXS:
+ case OP_TLBI_RVAE1OSNXS:
+ case OP_TLBI_RVALE2:
+ case OP_TLBI_RVALE2IS:
+ case OP_TLBI_RVALE2OS:
+ case OP_TLBI_RVALE1:
+ case OP_TLBI_RVALE1IS:
+ case OP_TLBI_RVALE1OS:
+ case OP_TLBI_RVALE2NXS:
+ case OP_TLBI_RVALE2ISNXS:
+ case OP_TLBI_RVALE2OSNXS:
+ case OP_TLBI_RVALE1NXS:
+ case OP_TLBI_RVALE1ISNXS:
+ case OP_TLBI_RVALE1OSNXS:
+ scope->type = TLBI_VA;
+ scope->va = decode_range_tlbi(val, &scope->size, &scope->asid);
+ break;
+ case OP_TLBI_RVAAE1:
+ case OP_TLBI_RVAAE1IS:
+ case OP_TLBI_RVAAE1OS:
+ case OP_TLBI_RVAAE1NXS:
+ case OP_TLBI_RVAAE1ISNXS:
+ case OP_TLBI_RVAAE1OSNXS:
+ case OP_TLBI_RVAALE1:
+ case OP_TLBI_RVAALE1IS:
+ case OP_TLBI_RVAALE1OS:
+ case OP_TLBI_RVAALE1NXS:
+ case OP_TLBI_RVAALE1ISNXS:
+ case OP_TLBI_RVAALE1OSNXS:
+ scope->type = TLBI_VAA;
+ scope->va = decode_range_tlbi(val, &scope->size, NULL);
+ break;
+ }
+}
+
+void kvm_handle_s1e2_tlbi(struct kvm_vcpu *vcpu, u32 inst, u64 val)
+{
+ struct s1e2_tlbi_scope scope = {};
+
+ compute_s1_tlbi_range(vcpu, inst, val, &scope);
+
+ guard(write_lock)(&vcpu->kvm->mmu_lock);
+ invalidate_vncr_va(vcpu->kvm, &scope);
+}
+
+void kvm_nested_s2_wp(struct kvm *kvm)
+{
+ int i;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ for (i = 0; i < kvm->arch.nested_mmus_size; i++) {
+ struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
+
+ if (kvm_s2_mmu_valid(mmu))
+ kvm_stage2_wp_range(mmu, 0, kvm_phys_size(mmu));
+ }
+
+ kvm_invalidate_vncr_ipa(kvm, 0, BIT(kvm->arch.mmu.pgt->ia_bits));
+}
+
+void kvm_nested_s2_unmap(struct kvm *kvm, bool may_block)
+{
+ int i;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ for (i = 0; i < kvm->arch.nested_mmus_size; i++) {
+ struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
+
+ if (kvm_s2_mmu_valid(mmu))
+ kvm_stage2_unmap_range(mmu, 0, kvm_phys_size(mmu), may_block);
+ }
+
+ kvm_invalidate_vncr_ipa(kvm, 0, BIT(kvm->arch.mmu.pgt->ia_bits));
+}
+
+void kvm_nested_s2_flush(struct kvm *kvm)
+{
+ int i;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ for (i = 0; i < kvm->arch.nested_mmus_size; i++) {
+ struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
+
+ if (kvm_s2_mmu_valid(mmu))
+ kvm_stage2_flush_range(mmu, 0, kvm_phys_size(mmu));
+ }
+}
+
+void kvm_arch_flush_shadow_all(struct kvm *kvm)
+{
+ int i;
+
+ for (i = 0; i < kvm->arch.nested_mmus_size; i++) {
+ struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
+
+ if (!WARN_ON(atomic_read(&mmu->refcnt)))
+ kvm_free_stage2_pgd(mmu);
+ }
+ kvfree(kvm->arch.nested_mmus);
+ kvm->arch.nested_mmus = NULL;
+ kvm->arch.nested_mmus_size = 0;
+ kvm_uninit_stage2_mmu(kvm);
+}
+
+/*
+ * Dealing with VNCR_EL2 exposed by the *guest* is a complicated matter:
+ *
+ * - We introduce an internal representation of a vcpu-private TLB,
+ * representing the mapping between the guest VA contained in VNCR_EL2,
+ * the IPA the guest's EL2 PTs point to, and the actual PA this lives at.
+ *
+ * - On translation fault from a nested VNCR access, we create such a TLB.
+ * If there is no mapping to describe, the guest inherits the fault.
+ * Crucially, no actual mapping is done at this stage.
+ *
+ * - On vcpu_load() in a non-HYP context with HCR_EL2.NV==1, if the above
+ * TLB exists, we map it in the fixmap for this CPU, and run with it. We
+ * have to respect the permissions dictated by the guest, but not the
+ * memory type (FWB is a must).
+ *
+ * - Note that we usually don't do a vcpu_load() on the back of a fault
+ * (unless we are preempted), so the resolution of a translation fault
+ * must go via a request that will map the VNCR page in the fixmap.
+ * vcpu_load() might as well use the same mechanism.
+ *
+ * - On vcpu_put() in a non-HYP context with HCR_EL2.NV==1, if the TLB was
+ * mapped, we unmap it. Yes it is that simple. The TLB still exists
+ * though, and may be reused at a later load.
+ *
+ * - On permission fault, we simply forward the fault to the guest's EL2.
+ * Get out of my way.
+ *
+ * - On any TLBI for the EL2&0 translation regime, we must find any TLB that
+ * intersects with the TLBI request, invalidate it, and unmap the page
+ * from the fixmap. Because we need to look at all the vcpu-private TLBs,
+ * this requires some wide-ranging locking to ensure that nothing races
+ * against it. This may require some refcounting to avoid the search when
+ * no such TLB is present.
+ *
+ * - On MMU notifiers, we must invalidate our TLB in a similar way, but
+ * looking at the IPA instead. The funny part is that there may not be a
+ * stage-2 mapping for this page if L1 hasn't accessed it using LD/ST
+ * instructions.
+ */
+
+int kvm_vcpu_allocate_vncr_tlb(struct kvm_vcpu *vcpu)
+{
+ if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY))
+ return 0;
+
+ vcpu->arch.vncr_tlb = kzalloc(sizeof(*vcpu->arch.vncr_tlb),
+ GFP_KERNEL_ACCOUNT);
+ if (!vcpu->arch.vncr_tlb)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static u64 read_vncr_el2(struct kvm_vcpu *vcpu)
+{
+ return (u64)sign_extend64(__vcpu_sys_reg(vcpu, VNCR_EL2), 48);
+}
+
+static int kvm_translate_vncr(struct kvm_vcpu *vcpu, bool *is_gmem)
+{
+ struct kvm_memory_slot *memslot;
+ bool write_fault, writable;
+ unsigned long mmu_seq;
+ struct vncr_tlb *vt;
+ struct page *page;
+ u64 va, pfn, gfn;
+ int ret;
+
+ vt = vcpu->arch.vncr_tlb;
+
+ /*
+ * If we're about to walk the EL2 S1 PTs, we must invalidate the
+ * current TLB, as it could be sampled from another vcpu doing a
+ * TLBI *IS. A real CPU wouldn't do that, but we only keep a single
+ * translation, so not much of a choice.
+ *
+ * We also prepare the next walk wilst we're at it.
+ */
+ scoped_guard(write_lock, &vcpu->kvm->mmu_lock) {
+ invalidate_vncr(vt);
+
+ vt->wi = (struct s1_walk_info) {
+ .regime = TR_EL20,
+ .as_el0 = false,
+ .pan = false,
+ };
+ vt->wr = (struct s1_walk_result){};
+ }
+
+ guard(srcu)(&vcpu->kvm->srcu);
+
+ va = read_vncr_el2(vcpu);
+
+ ret = __kvm_translate_va(vcpu, &vt->wi, &vt->wr, va);
+ if (ret)
+ return ret;
+
+ write_fault = kvm_is_write_fault(vcpu);
+
+ mmu_seq = vcpu->kvm->mmu_invalidate_seq;
+ smp_rmb();
+
+ gfn = vt->wr.pa >> PAGE_SHIFT;
+ memslot = gfn_to_memslot(vcpu->kvm, gfn);
+ if (!memslot)
+ return -EFAULT;
+
+ *is_gmem = kvm_slot_has_gmem(memslot);
+ if (!*is_gmem) {
+ pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0,
+ &writable, &page);
+ if (is_error_noslot_pfn(pfn) || (write_fault && !writable))
+ return -EFAULT;
+ } else {
+ ret = kvm_gmem_get_pfn(vcpu->kvm, memslot, gfn, &pfn, &page, NULL);
+ if (ret) {
+ kvm_prepare_memory_fault_exit(vcpu, vt->wr.pa, PAGE_SIZE,
+ write_fault, false, false);
+ return ret;
+ }
+ }
+
+ scoped_guard(write_lock, &vcpu->kvm->mmu_lock) {
+ if (mmu_invalidate_retry(vcpu->kvm, mmu_seq))
+ return -EAGAIN;
+
+ vt->gva = va;
+ vt->hpa = pfn << PAGE_SHIFT;
+ vt->valid = true;
+ vt->cpu = -1;
+
+ kvm_make_request(KVM_REQ_MAP_L1_VNCR_EL2, vcpu);
+ kvm_release_faultin_page(vcpu->kvm, page, false, vt->wr.pw);
+ }
+
+ if (vt->wr.pw)
+ mark_page_dirty(vcpu->kvm, gfn);
+
+ return 0;
+}
+
+static void inject_vncr_perm(struct kvm_vcpu *vcpu)
+{
+ struct vncr_tlb *vt = vcpu->arch.vncr_tlb;
+ u64 esr = kvm_vcpu_get_esr(vcpu);
+
+ /* Adjust the fault level to reflect that of the guest's */
+ esr &= ~ESR_ELx_FSC;
+ esr |= FIELD_PREP(ESR_ELx_FSC,
+ ESR_ELx_FSC_PERM_L(vt->wr.level));
+
+ kvm_inject_nested_sync(vcpu, esr);
+}
+
+static bool kvm_vncr_tlb_lookup(struct kvm_vcpu *vcpu)
+{
+ struct vncr_tlb *vt = vcpu->arch.vncr_tlb;
+
+ lockdep_assert_held_read(&vcpu->kvm->mmu_lock);
+
+ if (!vt->valid)
+ return false;
+
+ if (read_vncr_el2(vcpu) != vt->gva)
+ return false;
+
+ if (vt->wr.nG) {
+ u64 tcr = vcpu_read_sys_reg(vcpu, TCR_EL2);
+ u64 ttbr = ((tcr & TCR_A1) ?
+ vcpu_read_sys_reg(vcpu, TTBR1_EL2) :
+ vcpu_read_sys_reg(vcpu, TTBR0_EL2));
+ u16 asid;
+
+ asid = FIELD_GET(TTBR_ASID_MASK, ttbr);
+ if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, ASIDBITS, 16) ||
+ !(tcr & TCR_ASID16))
+ asid &= GENMASK(7, 0);
+
+ return asid == vt->wr.asid;
+ }
+
+ return true;
+}
+
+int kvm_handle_vncr_abort(struct kvm_vcpu *vcpu)
+{
+ struct vncr_tlb *vt = vcpu->arch.vncr_tlb;
+ u64 esr = kvm_vcpu_get_esr(vcpu);
+
+ WARN_ON_ONCE(!(esr & ESR_ELx_VNCR));
+
+ if (kvm_vcpu_abt_issea(vcpu))
+ return kvm_handle_guest_sea(vcpu);
+
+ if (esr_fsc_is_permission_fault(esr)) {
+ inject_vncr_perm(vcpu);
+ } else if (esr_fsc_is_translation_fault(esr)) {
+ bool valid, is_gmem = false;
+ int ret;
+
+ scoped_guard(read_lock, &vcpu->kvm->mmu_lock)
+ valid = kvm_vncr_tlb_lookup(vcpu);
+
+ if (!valid)
+ ret = kvm_translate_vncr(vcpu, &is_gmem);
+ else
+ ret = -EPERM;
+
+ switch (ret) {
+ case -EAGAIN:
+ /* Let's try again... */
+ break;
+ case -ENOMEM:
+ /*
+ * For guest_memfd, this indicates that it failed to
+ * create a folio to back the memory. Inform userspace.
+ */
+ if (is_gmem)
+ return 0;
+ /* Otherwise, let's try again... */
+ break;
+ case -EFAULT:
+ case -EIO:
+ case -EHWPOISON:
+ if (is_gmem)
+ return 0;
+ fallthrough;
+ case -EINVAL:
+ case -ENOENT:
+ case -EACCES:
+ /*
+ * Translation failed, inject the corresponding
+ * exception back to EL2.
+ */
+ BUG_ON(!vt->wr.failed);
+
+ esr &= ~ESR_ELx_FSC;
+ esr |= FIELD_PREP(ESR_ELx_FSC, vt->wr.fst);
+
+ kvm_inject_nested_sync(vcpu, esr);
+ break;
+ case -EPERM:
+ /* Hack to deal with POE until we get kernel support */
+ inject_vncr_perm(vcpu);
+ break;
+ case 0:
+ break;
+ }
+ } else {
+ WARN_ONCE(1, "Unhandled VNCR abort, ESR=%llx\n", esr);
+ }
+
+ return 1;
+}
+
+static void kvm_map_l1_vncr(struct kvm_vcpu *vcpu)
+{
+ struct vncr_tlb *vt = vcpu->arch.vncr_tlb;
+ pgprot_t prot;
+
+ guard(preempt)();
+ guard(read_lock)(&vcpu->kvm->mmu_lock);
+
+ /*
+ * The request to map VNCR may have raced against some other
+ * event, such as an interrupt, and may not be valid anymore.
+ */
+ if (is_hyp_ctxt(vcpu))
+ return;
+
+ /*
+ * Check that the pseudo-TLB is valid and that VNCR_EL2 still
+ * contains the expected value. If it doesn't, we simply bail out
+ * without a mapping -- a transformed MSR/MRS will generate the
+ * fault and allows us to populate the pseudo-TLB.
+ */
+ if (!vt->valid)
+ return;
+
+ if (read_vncr_el2(vcpu) != vt->gva)
+ return;
+
+ if (vt->wr.nG) {
+ u64 tcr = vcpu_read_sys_reg(vcpu, TCR_EL2);
+ u64 ttbr = ((tcr & TCR_A1) ?
+ vcpu_read_sys_reg(vcpu, TTBR1_EL2) :
+ vcpu_read_sys_reg(vcpu, TTBR0_EL2));
+ u16 asid;
+
+ asid = FIELD_GET(TTBR_ASID_MASK, ttbr);
+ if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, ASIDBITS, 16) ||
+ !(tcr & TCR_ASID16))
+ asid &= GENMASK(7, 0);
+
+ if (asid != vt->wr.asid)
+ return;
+ }
+
+ vt->cpu = smp_processor_id();
+
+ if (vt->wr.pw && vt->wr.pr)
+ prot = PAGE_KERNEL;
+ else if (vt->wr.pr)
+ prot = PAGE_KERNEL_RO;
+ else
+ prot = PAGE_NONE;
+
+ /*
+ * We can't map write-only (or no permission at all) in the kernel,
+ * but the guest can do it if using POE, so we'll have to turn a
+ * translation fault into a permission fault at runtime.
+ * FIXME: WO doesn't work at all, need POE support in the kernel.
+ */
+ if (pgprot_val(prot) != pgprot_val(PAGE_NONE)) {
+ __set_fixmap(vncr_fixmap(vt->cpu), vt->hpa, prot);
+ host_data_set_flag(L1_VNCR_MAPPED);
+ atomic_inc(&vcpu->kvm->arch.vncr_map_count);
+ }
+}
+
+#define has_tgran_2(__r, __sz) \
+ ({ \
+ u64 _s1, _s2, _mmfr0 = __r; \
+ \
+ _s2 = SYS_FIELD_GET(ID_AA64MMFR0_EL1, \
+ TGRAN##__sz##_2, _mmfr0); \
+ \
+ _s1 = SYS_FIELD_GET(ID_AA64MMFR0_EL1, \
+ TGRAN##__sz, _mmfr0); \
+ \
+ ((_s2 != ID_AA64MMFR0_EL1_TGRAN##__sz##_2_NI && \
+ _s2 != ID_AA64MMFR0_EL1_TGRAN##__sz##_2_TGRAN##__sz) || \
+ (_s2 == ID_AA64MMFR0_EL1_TGRAN##__sz##_2_TGRAN##__sz && \
+ _s1 != ID_AA64MMFR0_EL1_TGRAN##__sz##_NI)); \
+ })
+/*
+ * Our emulated CPU doesn't support all the possible features. For the
+ * sake of simplicity (and probably mental sanity), wipe out a number
+ * of feature bits we don't intend to support for the time being.
+ * This list should get updated as new features get added to the NV
+ * support, and new extension to the architecture.
+ */
+u64 limit_nv_id_reg(struct kvm *kvm, u32 reg, u64 val)
+{
+ u64 orig_val = val;
+
+ switch (reg) {
+ case SYS_ID_AA64ISAR0_EL1:
+ /* Support everything but TME */
+ val &= ~ID_AA64ISAR0_EL1_TME;
+ break;
+
+ case SYS_ID_AA64ISAR1_EL1:
+ /* Support everything but LS64 and Spec Invalidation */
+ val &= ~(ID_AA64ISAR1_EL1_LS64 |
+ ID_AA64ISAR1_EL1_SPECRES);
+ break;
+
+ case SYS_ID_AA64PFR0_EL1:
+ /* No RME, AMU, MPAM, or S-EL2 */
+ val &= ~(ID_AA64PFR0_EL1_RME |
+ ID_AA64PFR0_EL1_AMU |
+ ID_AA64PFR0_EL1_MPAM |
+ ID_AA64PFR0_EL1_SEL2 |
+ ID_AA64PFR0_EL1_EL3 |
+ ID_AA64PFR0_EL1_EL2 |
+ ID_AA64PFR0_EL1_EL1 |
+ ID_AA64PFR0_EL1_EL0);
+ /* 64bit only at any EL */
+ val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL0, IMP);
+ val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL1, IMP);
+ val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL2, IMP);
+ val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL3, IMP);
+ break;
+
+ case SYS_ID_AA64PFR1_EL1:
+ /* Only support BTI, SSBS, CSV2_frac */
+ val &= ~(ID_AA64PFR1_EL1_PFAR |
+ ID_AA64PFR1_EL1_MTEX |
+ ID_AA64PFR1_EL1_THE |
+ ID_AA64PFR1_EL1_GCS |
+ ID_AA64PFR1_EL1_MTE_frac |
+ ID_AA64PFR1_EL1_NMI |
+ ID_AA64PFR1_EL1_SME |
+ ID_AA64PFR1_EL1_RES0 |
+ ID_AA64PFR1_EL1_MPAM_frac |
+ ID_AA64PFR1_EL1_MTE);
+ break;
+
+ case SYS_ID_AA64MMFR0_EL1:
+ /* Hide ExS, Secure Memory */
+ val &= ~(ID_AA64MMFR0_EL1_EXS |
+ ID_AA64MMFR0_EL1_TGRAN4_2 |
+ ID_AA64MMFR0_EL1_TGRAN16_2 |
+ ID_AA64MMFR0_EL1_TGRAN64_2 |
+ ID_AA64MMFR0_EL1_SNSMEM);
+
+ /* Hide CNTPOFF if present */
+ val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64MMFR0_EL1, ECV, IMP);
+
+ /* Disallow unsupported S2 page sizes */
+ switch (PAGE_SIZE) {
+ case SZ_64K:
+ val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN16_2, NI);
+ fallthrough;
+ case SZ_16K:
+ val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN4_2, NI);
+ fallthrough;
+ case SZ_4K:
+ /* Support everything */
+ break;
+ }
+
+ /*
+ * Since we can't support a guest S2 page size smaller
+ * than the host's own page size (due to KVM only
+ * populating its own S2 using the kernel's page
+ * size), advertise the limitation using FEAT_GTG.
+ */
+ switch (PAGE_SIZE) {
+ case SZ_4K:
+ if (has_tgran_2(orig_val, 4))
+ val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN4_2, IMP);
+ fallthrough;
+ case SZ_16K:
+ if (has_tgran_2(orig_val, 16))
+ val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN16_2, IMP);
+ fallthrough;
+ case SZ_64K:
+ if (has_tgran_2(orig_val, 64))
+ val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN64_2, IMP);
+ break;
+ }
+
+ /* Cap PARange to 48bits */
+ val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64MMFR0_EL1, PARANGE, 48);
+ break;
+
+ case SYS_ID_AA64MMFR1_EL1:
+ val &= ~(ID_AA64MMFR1_EL1_CMOW |
+ ID_AA64MMFR1_EL1_nTLBPA |
+ ID_AA64MMFR1_EL1_ETS);
+
+ /* FEAT_E2H0 implies no VHE */
+ if (test_bit(KVM_ARM_VCPU_HAS_EL2_E2H0, kvm->arch.vcpu_features))
+ val &= ~ID_AA64MMFR1_EL1_VH;
+
+ val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64MMFR1_EL1, HAFDBS, AF);
+ break;
+
+ case SYS_ID_AA64MMFR2_EL1:
+ val &= ~(ID_AA64MMFR2_EL1_BBM |
+ ID_AA64MMFR2_EL1_TTL |
+ GENMASK_ULL(47, 44) |
+ ID_AA64MMFR2_EL1_ST |
+ ID_AA64MMFR2_EL1_CCIDX |
+ ID_AA64MMFR2_EL1_VARange);
+
+ /* Force TTL support */
+ val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR2_EL1, TTL, IMP);
+ break;
+
+ case SYS_ID_AA64MMFR4_EL1:
+ /*
+ * You get EITHER
+ *
+ * - FEAT_VHE without FEAT_E2H0
+ * - FEAT_NV limited to FEAT_NV2
+ * - HCR_EL2.NV1 being RES0
+ *
+ * OR
+ *
+ * - FEAT_E2H0 without FEAT_VHE nor FEAT_NV
+ *
+ * Life is too short for anything else.
+ */
+ if (test_bit(KVM_ARM_VCPU_HAS_EL2_E2H0, kvm->arch.vcpu_features)) {
+ val = 0;
+ } else {
+ val = SYS_FIELD_PREP_ENUM(ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY);
+ val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR4_EL1, E2H0, NI_NV1);
+ }
+ break;
+
+ case SYS_ID_AA64DFR0_EL1:
+ /* Only limited support for PMU, Debug, BPs, WPs, and HPMN0 */
+ val &= ~(ID_AA64DFR0_EL1_ExtTrcBuff |
+ ID_AA64DFR0_EL1_BRBE |
+ ID_AA64DFR0_EL1_MTPMU |
+ ID_AA64DFR0_EL1_TraceBuffer |
+ ID_AA64DFR0_EL1_TraceFilt |
+ ID_AA64DFR0_EL1_PMSVer |
+ ID_AA64DFR0_EL1_CTX_CMPs |
+ ID_AA64DFR0_EL1_SEBEP |
+ ID_AA64DFR0_EL1_PMSS |
+ ID_AA64DFR0_EL1_TraceVer);
+
+ /*
+ * FEAT_Debugv8p9 requires support for extended breakpoints /
+ * watchpoints.
+ */
+ val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64DFR0_EL1, DebugVer, V8P8);
+ break;
+ }
+
+ return val;
+}
+
+u64 kvm_vcpu_apply_reg_masks(const struct kvm_vcpu *vcpu,
+ enum vcpu_sysreg sr, u64 v)
+{
+ struct kvm_sysreg_masks *masks;
+
+ masks = vcpu->kvm->arch.sysreg_masks;
+
+ if (masks) {
+ sr -= __SANITISED_REG_START__;
+
+ v &= ~masks->mask[sr].res0;
+ v |= masks->mask[sr].res1;
+ }
+
+ return v;
+}
+
+static __always_inline void set_sysreg_masks(struct kvm *kvm, int sr, u64 res0, u64 res1)
+{
+ int i = sr - __SANITISED_REG_START__;
+
+ BUILD_BUG_ON(!__builtin_constant_p(sr));
+ BUILD_BUG_ON(sr < __SANITISED_REG_START__);
+ BUILD_BUG_ON(sr >= NR_SYS_REGS);
+
+ kvm->arch.sysreg_masks->mask[i].res0 = res0;
+ kvm->arch.sysreg_masks->mask[i].res1 = res1;
+}
+
+int kvm_init_nv_sysregs(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+ u64 res0, res1;
+
+ lockdep_assert_held(&kvm->arch.config_lock);
+
+ if (kvm->arch.sysreg_masks)
+ goto out;
+
+ kvm->arch.sysreg_masks = kzalloc(sizeof(*(kvm->arch.sysreg_masks)),
+ GFP_KERNEL_ACCOUNT);
+ if (!kvm->arch.sysreg_masks)
+ return -ENOMEM;
+
+ /* VTTBR_EL2 */
+ res0 = res1 = 0;
+ if (!kvm_has_feat_enum(kvm, ID_AA64MMFR1_EL1, VMIDBits, 16))
+ res0 |= GENMASK(63, 56);
+ if (!kvm_has_feat(kvm, ID_AA64MMFR2_EL1, CnP, IMP))
+ res0 |= VTTBR_CNP_BIT;
+ set_sysreg_masks(kvm, VTTBR_EL2, res0, res1);
+
+ /* VTCR_EL2 */
+ res0 = GENMASK(63, 32) | GENMASK(30, 20);
+ res1 = BIT(31);
+ set_sysreg_masks(kvm, VTCR_EL2, res0, res1);
+
+ /* VMPIDR_EL2 */
+ res0 = GENMASK(63, 40) | GENMASK(30, 24);
+ res1 = BIT(31);
+ set_sysreg_masks(kvm, VMPIDR_EL2, res0, res1);
+
+ /* HCR_EL2 */
+ get_reg_fixed_bits(kvm, HCR_EL2, &res0, &res1);
+ set_sysreg_masks(kvm, HCR_EL2, res0, res1);
+
+ /* HCRX_EL2 */
+ get_reg_fixed_bits(kvm, HCRX_EL2, &res0, &res1);
+ set_sysreg_masks(kvm, HCRX_EL2, res0, res1);
+
+ /* HFG[RW]TR_EL2 */
+ get_reg_fixed_bits(kvm, HFGRTR_EL2, &res0, &res1);
+ set_sysreg_masks(kvm, HFGRTR_EL2, res0, res1);
+ get_reg_fixed_bits(kvm, HFGWTR_EL2, &res0, &res1);
+ set_sysreg_masks(kvm, HFGWTR_EL2, res0, res1);
+
+ /* HDFG[RW]TR_EL2 */
+ get_reg_fixed_bits(kvm, HDFGRTR_EL2, &res0, &res1);
+ set_sysreg_masks(kvm, HDFGRTR_EL2, res0, res1);
+ get_reg_fixed_bits(kvm, HDFGWTR_EL2, &res0, &res1);
+ set_sysreg_masks(kvm, HDFGWTR_EL2, res0, res1);
+
+ /* HFGITR_EL2 */
+ get_reg_fixed_bits(kvm, HFGITR_EL2, &res0, &res1);
+ set_sysreg_masks(kvm, HFGITR_EL2, res0, res1);
+
+ /* HAFGRTR_EL2 - not a lot to see here */
+ get_reg_fixed_bits(kvm, HAFGRTR_EL2, &res0, &res1);
+ set_sysreg_masks(kvm, HAFGRTR_EL2, res0, res1);
+
+ /* HFG[RW]TR2_EL2 */
+ get_reg_fixed_bits(kvm, HFGRTR2_EL2, &res0, &res1);
+ set_sysreg_masks(kvm, HFGRTR2_EL2, res0, res1);
+ get_reg_fixed_bits(kvm, HFGWTR2_EL2, &res0, &res1);
+ set_sysreg_masks(kvm, HFGWTR2_EL2, res0, res1);
+
+ /* HDFG[RW]TR2_EL2 */
+ get_reg_fixed_bits(kvm, HDFGRTR2_EL2, &res0, &res1);
+ set_sysreg_masks(kvm, HDFGRTR2_EL2, res0, res1);
+ get_reg_fixed_bits(kvm, HDFGWTR2_EL2, &res0, &res1);
+ set_sysreg_masks(kvm, HDFGWTR2_EL2, res0, res1);
+
+ /* HFGITR2_EL2 */
+ get_reg_fixed_bits(kvm, HFGITR2_EL2, &res0, &res1);
+ set_sysreg_masks(kvm, HFGITR2_EL2, res0, res1);
+
+ /* TCR2_EL2 */
+ get_reg_fixed_bits(kvm, TCR2_EL2, &res0, &res1);
+ set_sysreg_masks(kvm, TCR2_EL2, res0, res1);
+
+ /* SCTLR_EL1 */
+ get_reg_fixed_bits(kvm, SCTLR_EL1, &res0, &res1);
+ set_sysreg_masks(kvm, SCTLR_EL1, res0, res1);
+
+ /* SCTLR2_ELx */
+ get_reg_fixed_bits(kvm, SCTLR2_EL1, &res0, &res1);
+ set_sysreg_masks(kvm, SCTLR2_EL1, res0, res1);
+ get_reg_fixed_bits(kvm, SCTLR2_EL2, &res0, &res1);
+ set_sysreg_masks(kvm, SCTLR2_EL2, res0, res1);
+
+ /* MDCR_EL2 */
+ get_reg_fixed_bits(kvm, MDCR_EL2, &res0, &res1);
+ set_sysreg_masks(kvm, MDCR_EL2, res0, res1);
+
+ /* CNTHCTL_EL2 */
+ res0 = GENMASK(63, 20);
+ res1 = 0;
+ if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, RME, IMP))
+ res0 |= CNTHCTL_CNTPMASK | CNTHCTL_CNTVMASK;
+ if (!kvm_has_feat(kvm, ID_AA64MMFR0_EL1, ECV, CNTPOFF)) {
+ res0 |= CNTHCTL_ECV;
+ if (!kvm_has_feat(kvm, ID_AA64MMFR0_EL1, ECV, IMP))
+ res0 |= (CNTHCTL_EL1TVT | CNTHCTL_EL1TVCT |
+ CNTHCTL_EL1NVPCT | CNTHCTL_EL1NVVCT);
+ }
+ if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, VH, IMP))
+ res0 |= GENMASK(11, 8);
+ set_sysreg_masks(kvm, CNTHCTL_EL2, res0, res1);
+
+ /* ICH_HCR_EL2 */
+ res0 = ICH_HCR_EL2_RES0;
+ res1 = ICH_HCR_EL2_RES1;
+ if (!(kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_EL2_TDS))
+ res0 |= ICH_HCR_EL2_TDIR;
+ /* No GICv4 is presented to the guest */
+ res0 |= ICH_HCR_EL2_DVIM | ICH_HCR_EL2_vSGIEOICount;
+ set_sysreg_masks(kvm, ICH_HCR_EL2, res0, res1);
+
+ /* VNCR_EL2 */
+ set_sysreg_masks(kvm, VNCR_EL2, VNCR_EL2_RES0, VNCR_EL2_RES1);
+
+out:
+ for (enum vcpu_sysreg sr = __SANITISED_REG_START__; sr < NR_SYS_REGS; sr++)
+ __vcpu_rmw_sys_reg(vcpu, sr, |=, 0);
+
+ return 0;
+}
+
+void check_nested_vcpu_requests(struct kvm_vcpu *vcpu)
+{
+ if (kvm_check_request(KVM_REQ_NESTED_S2_UNMAP, vcpu)) {
+ struct kvm_s2_mmu *mmu = vcpu->arch.hw_mmu;
+
+ write_lock(&vcpu->kvm->mmu_lock);
+ if (mmu->pending_unmap) {
+ kvm_stage2_unmap_range(mmu, 0, kvm_phys_size(mmu), true);
+ mmu->pending_unmap = false;
+ }
+ write_unlock(&vcpu->kvm->mmu_lock);
+ }
+
+ if (kvm_check_request(KVM_REQ_MAP_L1_VNCR_EL2, vcpu))
+ kvm_map_l1_vncr(vcpu);
+
+ /* Must be last, as may switch context! */
+ if (kvm_check_request(KVM_REQ_GUEST_HYP_IRQ_PENDING, vcpu))
+ kvm_inject_nested_irq(vcpu);
+}
+
+/*
+ * One of the many architectural bugs in FEAT_NV2 is that the guest hypervisor
+ * can write to HCR_EL2 behind our back, potentially changing the exception
+ * routing / masking for even the host context.
+ *
+ * What follows is some slop to (1) react to exception routing / masking and (2)
+ * preserve the pending SError state across translation regimes.
+ */
+void kvm_nested_flush_hwstate(struct kvm_vcpu *vcpu)
+{
+ if (!vcpu_has_nv(vcpu))
+ return;
+
+ if (unlikely(vcpu_test_and_clear_flag(vcpu, NESTED_SERROR_PENDING)))
+ kvm_inject_serror_esr(vcpu, vcpu_get_vsesr(vcpu));
+}
+
+void kvm_nested_sync_hwstate(struct kvm_vcpu *vcpu)
+{
+ unsigned long *hcr = vcpu_hcr(vcpu);
+
+ if (!vcpu_has_nv(vcpu))
+ return;
+
+ /*
+ * We previously decided that an SError was deliverable to the guest.
+ * Reap the pending state from HCR_EL2 and...
+ */
+ if (unlikely(__test_and_clear_bit(__ffs(HCR_VSE), hcr)))
+ vcpu_set_flag(vcpu, NESTED_SERROR_PENDING);
+
+ /*
+ * Re-attempt SError injection in case the deliverability has changed,
+ * which is necessary to faithfully emulate WFI the case of a pending
+ * SError being a wakeup condition.
+ */
+ if (unlikely(vcpu_test_and_clear_flag(vcpu, NESTED_SERROR_PENDING)))
+ kvm_inject_serror_esr(vcpu, vcpu_get_vsesr(vcpu));
+}
+
+/*
+ * KVM unconditionally sets most of these traps anyway but use an allowlist
+ * to document the guest hypervisor traps that may take precedence and guard
+ * against future changes to the non-nested trap configuration.
+ */
+#define NV_MDCR_GUEST_INCLUDE (MDCR_EL2_TDE | \
+ MDCR_EL2_TDA | \
+ MDCR_EL2_TDRA | \
+ MDCR_EL2_TTRF | \
+ MDCR_EL2_TPMS | \
+ MDCR_EL2_TPM | \
+ MDCR_EL2_TPMCR | \
+ MDCR_EL2_TDCC | \
+ MDCR_EL2_TDOSA)
+
+void kvm_nested_setup_mdcr_el2(struct kvm_vcpu *vcpu)
+{
+ u64 guest_mdcr = __vcpu_sys_reg(vcpu, MDCR_EL2);
+
+ if (is_nested_ctxt(vcpu))
+ vcpu->arch.mdcr_el2 |= (guest_mdcr & NV_MDCR_GUEST_INCLUDE);
+ /*
+ * In yet another example where FEAT_NV2 is fscking broken, accesses
+ * to MDSCR_EL1 are redirected to the VNCR despite having an effect
+ * at EL2. Use a big hammer to apply sanity.
+ *
+ * Unless of course we have FEAT_FGT, in which case we can precisely
+ * trap MDSCR_EL1.
+ */
+ else if (!cpus_have_final_cap(ARM64_HAS_FGT))
+ vcpu->arch.mdcr_el2 |= MDCR_EL2_TDA;
+}
diff --git a/arch/arm64/kvm/pauth.c b/arch/arm64/kvm/pauth.c
new file mode 100644
index 000000000000..d5eb3ae876be
--- /dev/null
+++ b/arch/arm64/kvm/pauth.c
@@ -0,0 +1,206 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2024 - Google LLC
+ * Author: Marc Zyngier <maz@kernel.org>
+ *
+ * Primitive PAuth emulation for ERETAA/ERETAB.
+ *
+ * This code assumes that is is run from EL2, and that it is part of
+ * the emulation of ERETAx for a guest hypervisor. That's a lot of
+ * baked-in assumptions and shortcuts.
+ *
+ * Do no reuse for anything else!
+ */
+
+#include <linux/kvm_host.h>
+
+#include <asm/gpr-num.h>
+#include <asm/kvm_emulate.h>
+#include <asm/pointer_auth.h>
+
+/* PACGA Xd, Xn, Xm */
+#define PACGA(d,n,m) \
+ asm volatile(__DEFINE_ASM_GPR_NUMS \
+ ".inst 0x9AC03000 |" \
+ "(.L__gpr_num_%[Rd] << 0) |" \
+ "(.L__gpr_num_%[Rn] << 5) |" \
+ "(.L__gpr_num_%[Rm] << 16)\n" \
+ : [Rd] "=r" ((d)) \
+ : [Rn] "r" ((n)), [Rm] "r" ((m)))
+
+static u64 compute_pac(struct kvm_vcpu *vcpu, u64 ptr,
+ struct ptrauth_key ikey)
+{
+ struct ptrauth_key gkey;
+ u64 mod, pac = 0;
+
+ preempt_disable();
+
+ if (!vcpu_get_flag(vcpu, SYSREGS_ON_CPU))
+ mod = __vcpu_sys_reg(vcpu, SP_EL2);
+ else
+ mod = read_sysreg(sp_el1);
+
+ gkey.lo = read_sysreg_s(SYS_APGAKEYLO_EL1);
+ gkey.hi = read_sysreg_s(SYS_APGAKEYHI_EL1);
+
+ __ptrauth_key_install_nosync(APGA, ikey);
+ isb();
+
+ PACGA(pac, ptr, mod);
+ isb();
+
+ __ptrauth_key_install_nosync(APGA, gkey);
+
+ preempt_enable();
+
+ /* PAC in the top 32bits */
+ return pac;
+}
+
+static bool effective_tbi(struct kvm_vcpu *vcpu, bool bit55)
+{
+ u64 tcr = vcpu_read_sys_reg(vcpu, TCR_EL2);
+ bool tbi, tbid;
+
+ /*
+ * Since we are authenticating an instruction address, we have
+ * to take TBID into account. If E2H==0, ignore VA[55], as
+ * TCR_EL2 only has a single TBI/TBID. If VA[55] was set in
+ * this case, this is likely a guest bug...
+ */
+ if (!vcpu_el2_e2h_is_set(vcpu)) {
+ tbi = tcr & BIT(20);
+ tbid = tcr & BIT(29);
+ } else if (bit55) {
+ tbi = tcr & TCR_TBI1;
+ tbid = tcr & TCR_TBID1;
+ } else {
+ tbi = tcr & TCR_TBI0;
+ tbid = tcr & TCR_TBID0;
+ }
+
+ return tbi && !tbid;
+}
+
+static int compute_bottom_pac(struct kvm_vcpu *vcpu, bool bit55)
+{
+ static const int maxtxsz = 39; // Revisit these two values once
+ static const int mintxsz = 16; // (if) we support TTST/LVA/LVA2
+ u64 tcr = vcpu_read_sys_reg(vcpu, TCR_EL2);
+ int txsz;
+
+ if (!vcpu_el2_e2h_is_set(vcpu) || !bit55)
+ txsz = FIELD_GET(TCR_T0SZ_MASK, tcr);
+ else
+ txsz = FIELD_GET(TCR_T1SZ_MASK, tcr);
+
+ return 64 - clamp(txsz, mintxsz, maxtxsz);
+}
+
+static u64 compute_pac_mask(struct kvm_vcpu *vcpu, bool bit55)
+{
+ int bottom_pac;
+ u64 mask;
+
+ bottom_pac = compute_bottom_pac(vcpu, bit55);
+
+ mask = GENMASK(54, bottom_pac);
+ if (!effective_tbi(vcpu, bit55))
+ mask |= GENMASK(63, 56);
+
+ return mask;
+}
+
+static u64 to_canonical_addr(struct kvm_vcpu *vcpu, u64 ptr, u64 mask)
+{
+ bool bit55 = !!(ptr & BIT(55));
+
+ if (bit55)
+ return ptr | mask;
+
+ return ptr & ~mask;
+}
+
+static u64 corrupt_addr(struct kvm_vcpu *vcpu, u64 ptr)
+{
+ bool bit55 = !!(ptr & BIT(55));
+ u64 mask, error_code;
+ int shift;
+
+ if (effective_tbi(vcpu, bit55)) {
+ mask = GENMASK(54, 53);
+ shift = 53;
+ } else {
+ mask = GENMASK(62, 61);
+ shift = 61;
+ }
+
+ if (esr_iss_is_eretab(kvm_vcpu_get_esr(vcpu)))
+ error_code = 2 << shift;
+ else
+ error_code = 1 << shift;
+
+ ptr &= ~mask;
+ ptr |= error_code;
+
+ return ptr;
+}
+
+/*
+ * Authenticate an ERETAA/ERETAB instruction, returning true if the
+ * authentication succeeded and false otherwise. In all cases, *elr
+ * contains the VA to ERET to. Potential exception injection is left
+ * to the caller.
+ */
+bool kvm_auth_eretax(struct kvm_vcpu *vcpu, u64 *elr)
+{
+ u64 sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2);
+ u64 esr = kvm_vcpu_get_esr(vcpu);
+ u64 ptr, cptr, pac, mask;
+ struct ptrauth_key ikey;
+
+ *elr = ptr = vcpu_read_sys_reg(vcpu, ELR_EL2);
+
+ /* We assume we're already in the context of an ERETAx */
+ if (esr_iss_is_eretab(esr)) {
+ if (!(sctlr & SCTLR_EL1_EnIB))
+ return true;
+
+ ikey.lo = __vcpu_sys_reg(vcpu, APIBKEYLO_EL1);
+ ikey.hi = __vcpu_sys_reg(vcpu, APIBKEYHI_EL1);
+ } else {
+ if (!(sctlr & SCTLR_EL1_EnIA))
+ return true;
+
+ ikey.lo = __vcpu_sys_reg(vcpu, APIAKEYLO_EL1);
+ ikey.hi = __vcpu_sys_reg(vcpu, APIAKEYHI_EL1);
+ }
+
+ mask = compute_pac_mask(vcpu, !!(ptr & BIT(55)));
+ cptr = to_canonical_addr(vcpu, ptr, mask);
+
+ pac = compute_pac(vcpu, cptr, ikey);
+
+ /*
+ * Slightly deviate from the pseudocode: if we have a PAC
+ * match with the signed pointer, then it must be good.
+ * Anything after this point is pure error handling.
+ */
+ if ((pac & mask) == (ptr & mask)) {
+ *elr = cptr;
+ return true;
+ }
+
+ /*
+ * Authentication failed, corrupt the canonical address if
+ * PAuth2 isn't implemented, or some XORing if it is.
+ */
+ if (!kvm_has_pauth(vcpu->kvm, PAuth2))
+ cptr = corrupt_addr(vcpu, cptr);
+ else
+ cptr = ptr ^ (pac & mask);
+
+ *elr = cptr;
+ return false;
+}
diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c
index cf56958b1492..d7a0f69a9982 100644
--- a/arch/arm64/kvm/pkvm.c
+++ b/arch/arm64/kvm/pkvm.c
@@ -4,38 +4,26 @@
* Author: Quentin Perret <qperret@google.com>
*/
+#include <linux/init.h>
+#include <linux/interval_tree_generic.h>
+#include <linux/kmemleak.h>
#include <linux/kvm_host.h>
+#include <asm/kvm_mmu.h>
#include <linux/memblock.h>
#include <linux/mutex.h>
-#include <linux/sort.h>
#include <asm/kvm_pkvm.h>
#include "hyp_constants.h"
+DEFINE_STATIC_KEY_FALSE(kvm_protected_mode_initialized);
+
static struct memblock_region *hyp_memory = kvm_nvhe_sym(hyp_memory);
static unsigned int *hyp_memblock_nr_ptr = &kvm_nvhe_sym(hyp_memblock_nr);
phys_addr_t hyp_mem_base;
phys_addr_t hyp_mem_size;
-static int cmp_hyp_memblock(const void *p1, const void *p2)
-{
- const struct memblock_region *r1 = p1;
- const struct memblock_region *r2 = p2;
-
- return r1->base < r2->base ? -1 : (r1->base > r2->base);
-}
-
-static void __init sort_memblock_regions(void)
-{
- sort(hyp_memory,
- *hyp_memblock_nr_ptr,
- sizeof(struct memblock_region),
- cmp_hyp_memblock,
- NULL);
-}
-
static int __init register_memblock_regions(void)
{
struct memblock_region *reg;
@@ -47,7 +35,6 @@ static int __init register_memblock_regions(void)
hyp_memory[*hyp_memblock_nr_ptr] = *reg;
(*hyp_memblock_nr_ptr)++;
}
- sort_memblock_regions();
return 0;
}
@@ -74,6 +61,8 @@ void __init kvm_hyp_reserve(void)
hyp_mem_pages += host_s2_pgtable_pages();
hyp_mem_pages += hyp_vm_table_pages();
hyp_mem_pages += hyp_vmemmap_pages(STRUCT_HYP_PAGE_SIZE);
+ hyp_mem_pages += pkvm_selftest_pages();
+ hyp_mem_pages += hyp_ffa_proxy_pages();
/*
* Try to allocate a PMD-aligned region to reduce TLB pressure once
@@ -96,6 +85,47 @@ void __init kvm_hyp_reserve(void)
hyp_mem_base);
}
+static void __pkvm_destroy_hyp_vm(struct kvm *kvm)
+{
+ if (pkvm_hyp_vm_is_created(kvm)) {
+ WARN_ON(kvm_call_hyp_nvhe(__pkvm_teardown_vm,
+ kvm->arch.pkvm.handle));
+ } else if (kvm->arch.pkvm.handle) {
+ /*
+ * The VM could have been reserved but hyp initialization has
+ * failed. Make sure to unreserve it.
+ */
+ kvm_call_hyp_nvhe(__pkvm_unreserve_vm, kvm->arch.pkvm.handle);
+ }
+
+ kvm->arch.pkvm.handle = 0;
+ kvm->arch.pkvm.is_created = false;
+ free_hyp_memcache(&kvm->arch.pkvm.teardown_mc);
+ free_hyp_memcache(&kvm->arch.pkvm.stage2_teardown_mc);
+}
+
+static int __pkvm_create_hyp_vcpu(struct kvm_vcpu *vcpu)
+{
+ size_t hyp_vcpu_sz = PAGE_ALIGN(PKVM_HYP_VCPU_SIZE);
+ pkvm_handle_t handle = vcpu->kvm->arch.pkvm.handle;
+ void *hyp_vcpu;
+ int ret;
+
+ vcpu->arch.pkvm_memcache.flags |= HYP_MEMCACHE_ACCOUNT_STAGE2;
+
+ hyp_vcpu = alloc_pages_exact(hyp_vcpu_sz, GFP_KERNEL_ACCOUNT);
+ if (!hyp_vcpu)
+ return -ENOMEM;
+
+ ret = kvm_call_hyp_nvhe(__pkvm_init_vcpu, handle, vcpu, hyp_vcpu);
+ if (!ret)
+ vcpu_set_flag(vcpu, VCPU_PKVM_FINALIZED);
+ else
+ free_pages_exact(hyp_vcpu, hyp_vcpu_sz);
+
+ return ret;
+}
+
/*
* Allocates and donates memory for hypervisor VM structs at EL2.
*
@@ -106,19 +136,16 @@ void __init kvm_hyp_reserve(void)
*
* Return 0 on success, negative error code on failure.
*/
-static int __pkvm_create_hyp_vm(struct kvm *host_kvm)
+static int __pkvm_create_hyp_vm(struct kvm *kvm)
{
- size_t pgd_sz, hyp_vm_sz, hyp_vcpu_sz;
- struct kvm_vcpu *host_vcpu;
- pkvm_handle_t handle;
+ size_t pgd_sz, hyp_vm_sz;
void *pgd, *hyp_vm;
- unsigned long idx;
int ret;
- if (host_kvm->created_vcpus < 1)
+ if (kvm->created_vcpus < 1)
return -EINVAL;
- pgd_sz = kvm_pgtable_stage2_pgd_size(host_kvm->arch.vtcr);
+ pgd_sz = kvm_pgtable_stage2_pgd_size(kvm->arch.mmu.vtcr);
/*
* The PGD pages will be reclaimed using a hyp_memcache which implies
@@ -132,7 +159,7 @@ static int __pkvm_create_hyp_vm(struct kvm *host_kvm)
/* Allocate memory to donate to hyp for vm and vcpu pointers. */
hyp_vm_sz = PAGE_ALIGN(size_add(PKVM_HYP_VM_SIZE,
size_mul(sizeof(void *),
- host_kvm->created_vcpus)));
+ kvm->created_vcpus)));
hyp_vm = alloc_pages_exact(hyp_vm_sz, GFP_KERNEL_ACCOUNT);
if (!hyp_vm) {
ret = -ENOMEM;
@@ -140,44 +167,15 @@ static int __pkvm_create_hyp_vm(struct kvm *host_kvm)
}
/* Donate the VM memory to hyp and let hyp initialize it. */
- ret = kvm_call_hyp_nvhe(__pkvm_init_vm, host_kvm, hyp_vm, pgd);
- if (ret < 0)
+ ret = kvm_call_hyp_nvhe(__pkvm_init_vm, kvm, hyp_vm, pgd);
+ if (ret)
goto free_vm;
- handle = ret;
-
- host_kvm->arch.pkvm.handle = handle;
-
- /* Donate memory for the vcpus at hyp and initialize it. */
- hyp_vcpu_sz = PAGE_ALIGN(PKVM_HYP_VCPU_SIZE);
- kvm_for_each_vcpu(idx, host_vcpu, host_kvm) {
- void *hyp_vcpu;
-
- /* Indexing of the vcpus to be sequential starting at 0. */
- if (WARN_ON(host_vcpu->vcpu_idx != idx)) {
- ret = -EINVAL;
- goto destroy_vm;
- }
-
- hyp_vcpu = alloc_pages_exact(hyp_vcpu_sz, GFP_KERNEL_ACCOUNT);
- if (!hyp_vcpu) {
- ret = -ENOMEM;
- goto destroy_vm;
- }
-
- ret = kvm_call_hyp_nvhe(__pkvm_init_vcpu, handle, host_vcpu,
- hyp_vcpu);
- if (ret) {
- free_pages_exact(hyp_vcpu, hyp_vcpu_sz);
- goto destroy_vm;
- }
- }
+ kvm->arch.pkvm.is_created = true;
+ kvm->arch.pkvm.stage2_teardown_mc.flags |= HYP_MEMCACHE_ACCOUNT_STAGE2;
+ kvm_account_pgtable_pages(pgd, pgd_sz / PAGE_SIZE);
return 0;
-
-destroy_vm:
- pkvm_destroy_hyp_vm(host_kvm);
- return ret;
free_vm:
free_pages_exact(hyp_vm, hyp_vm_sz);
free_pgd:
@@ -185,31 +183,305 @@ free_pgd:
return ret;
}
-int pkvm_create_hyp_vm(struct kvm *host_kvm)
+bool pkvm_hyp_vm_is_created(struct kvm *kvm)
+{
+ return READ_ONCE(kvm->arch.pkvm.is_created);
+}
+
+int pkvm_create_hyp_vm(struct kvm *kvm)
{
int ret = 0;
- mutex_lock(&host_kvm->lock);
- if (!host_kvm->arch.pkvm.handle)
- ret = __pkvm_create_hyp_vm(host_kvm);
- mutex_unlock(&host_kvm->lock);
+ mutex_lock(&kvm->arch.config_lock);
+ if (!pkvm_hyp_vm_is_created(kvm))
+ ret = __pkvm_create_hyp_vm(kvm);
+ mutex_unlock(&kvm->arch.config_lock);
return ret;
}
-void pkvm_destroy_hyp_vm(struct kvm *host_kvm)
+int pkvm_create_hyp_vcpu(struct kvm_vcpu *vcpu)
{
- if (host_kvm->arch.pkvm.handle) {
- WARN_ON(kvm_call_hyp_nvhe(__pkvm_teardown_vm,
- host_kvm->arch.pkvm.handle));
+ int ret = 0;
+
+ mutex_lock(&vcpu->kvm->arch.config_lock);
+ if (!vcpu_get_flag(vcpu, VCPU_PKVM_FINALIZED))
+ ret = __pkvm_create_hyp_vcpu(vcpu);
+ mutex_unlock(&vcpu->kvm->arch.config_lock);
+
+ return ret;
+}
+
+void pkvm_destroy_hyp_vm(struct kvm *kvm)
+{
+ mutex_lock(&kvm->arch.config_lock);
+ __pkvm_destroy_hyp_vm(kvm);
+ mutex_unlock(&kvm->arch.config_lock);
+}
+
+int pkvm_init_host_vm(struct kvm *kvm)
+{
+ int ret;
+
+ if (pkvm_hyp_vm_is_created(kvm))
+ return -EINVAL;
+
+ /* VM is already reserved, no need to proceed. */
+ if (kvm->arch.pkvm.handle)
+ return 0;
+
+ /* Reserve the VM in hyp and obtain a hyp handle for the VM. */
+ ret = kvm_call_hyp_nvhe(__pkvm_reserve_vm);
+ if (ret < 0)
+ return ret;
+
+ kvm->arch.pkvm.handle = ret;
+
+ return 0;
+}
+
+static void __init _kvm_host_prot_finalize(void *arg)
+{
+ int *err = arg;
+
+ if (WARN_ON(kvm_call_hyp_nvhe(__pkvm_prot_finalize)))
+ WRITE_ONCE(*err, -EINVAL);
+}
+
+static int __init pkvm_drop_host_privileges(void)
+{
+ int ret = 0;
+
+ /*
+ * Flip the static key upfront as that may no longer be possible
+ * once the host stage 2 is installed.
+ */
+ static_branch_enable(&kvm_protected_mode_initialized);
+ on_each_cpu(_kvm_host_prot_finalize, &ret, 1);
+ return ret;
+}
+
+static int __init finalize_pkvm(void)
+{
+ int ret;
+
+ if (!is_protected_kvm_enabled() || !is_kvm_arm_initialised())
+ return 0;
+
+ /*
+ * Exclude HYP sections from kmemleak so that they don't get peeked
+ * at, which would end badly once inaccessible.
+ */
+ kmemleak_free_part(__hyp_bss_start, __hyp_bss_end - __hyp_bss_start);
+ kmemleak_free_part(__hyp_data_start, __hyp_data_end - __hyp_data_start);
+ kmemleak_free_part(__hyp_rodata_start, __hyp_rodata_end - __hyp_rodata_start);
+ kmemleak_free_part_phys(hyp_mem_base, hyp_mem_size);
+
+ ret = pkvm_drop_host_privileges();
+ if (ret)
+ pr_err("Failed to finalize Hyp protection: %d\n", ret);
+
+ return ret;
+}
+device_initcall_sync(finalize_pkvm);
+
+static u64 __pkvm_mapping_start(struct pkvm_mapping *m)
+{
+ return m->gfn * PAGE_SIZE;
+}
+
+static u64 __pkvm_mapping_end(struct pkvm_mapping *m)
+{
+ return (m->gfn + m->nr_pages) * PAGE_SIZE - 1;
+}
+
+INTERVAL_TREE_DEFINE(struct pkvm_mapping, node, u64, __subtree_last,
+ __pkvm_mapping_start, __pkvm_mapping_end, static,
+ pkvm_mapping);
+
+/*
+ * __tmp is updated to iter_first(pkvm_mappings) *before* entering the body of the loop to allow
+ * freeing of __map inline.
+ */
+#define for_each_mapping_in_range_safe(__pgt, __start, __end, __map) \
+ for (struct pkvm_mapping *__tmp = pkvm_mapping_iter_first(&(__pgt)->pkvm_mappings, \
+ __start, __end - 1); \
+ __tmp && ({ \
+ __map = __tmp; \
+ __tmp = pkvm_mapping_iter_next(__map, __start, __end - 1); \
+ true; \
+ }); \
+ )
+
+int pkvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
+ struct kvm_pgtable_mm_ops *mm_ops)
+{
+ pgt->pkvm_mappings = RB_ROOT_CACHED;
+ pgt->mmu = mmu;
+
+ return 0;
+}
+
+static int __pkvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 start, u64 end)
+{
+ struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu);
+ pkvm_handle_t handle = kvm->arch.pkvm.handle;
+ struct pkvm_mapping *mapping;
+ int ret;
+
+ if (!handle)
+ return 0;
+
+ for_each_mapping_in_range_safe(pgt, start, end, mapping) {
+ ret = kvm_call_hyp_nvhe(__pkvm_host_unshare_guest, handle, mapping->gfn,
+ mapping->nr_pages);
+ if (WARN_ON(ret))
+ return ret;
+ pkvm_mapping_remove(mapping, &pgt->pkvm_mappings);
+ kfree(mapping);
}
- host_kvm->arch.pkvm.handle = 0;
- free_hyp_memcache(&host_kvm->arch.pkvm.teardown_mc);
+ return 0;
+}
+
+void pkvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt,
+ u64 addr, u64 size)
+{
+ __pkvm_pgtable_stage2_unmap(pgt, addr, addr + size);
+}
+
+void pkvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt)
+{
+ /* Expected to be called after all pKVM mappings have been released. */
+ WARN_ON_ONCE(!RB_EMPTY_ROOT(&pgt->pkvm_mappings.rb_root));
}
-int pkvm_init_host_vm(struct kvm *host_kvm)
+int pkvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
+ u64 phys, enum kvm_pgtable_prot prot,
+ void *mc, enum kvm_pgtable_walk_flags flags)
{
- mutex_init(&host_kvm->lock);
+ struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu);
+ struct pkvm_mapping *mapping = NULL;
+ struct kvm_hyp_memcache *cache = mc;
+ u64 gfn = addr >> PAGE_SHIFT;
+ u64 pfn = phys >> PAGE_SHIFT;
+ int ret;
+
+ if (size != PAGE_SIZE && size != PMD_SIZE)
+ return -EINVAL;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ /*
+ * Calling stage2_map() on top of existing mappings is either happening because of a race
+ * with another vCPU, or because we're changing between page and block mappings. As per
+ * user_mem_abort(), same-size permission faults are handled in the relax_perms() path.
+ */
+ mapping = pkvm_mapping_iter_first(&pgt->pkvm_mappings, addr, addr + size - 1);
+ if (mapping) {
+ if (size == (mapping->nr_pages * PAGE_SIZE))
+ return -EAGAIN;
+
+ /* Remove _any_ pkvm_mapping overlapping with the range, bigger or smaller. */
+ ret = __pkvm_pgtable_stage2_unmap(pgt, addr, addr + size);
+ if (ret)
+ return ret;
+ mapping = NULL;
+ }
+
+ ret = kvm_call_hyp_nvhe(__pkvm_host_share_guest, pfn, gfn, size / PAGE_SIZE, prot);
+ if (WARN_ON(ret))
+ return ret;
+
+ swap(mapping, cache->mapping);
+ mapping->gfn = gfn;
+ mapping->pfn = pfn;
+ mapping->nr_pages = size / PAGE_SIZE;
+ pkvm_mapping_insert(mapping, &pgt->pkvm_mappings);
+
+ return ret;
+}
+
+int pkvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
+{
+ lockdep_assert_held_write(&kvm_s2_mmu_to_kvm(pgt->mmu)->mmu_lock);
+
+ return __pkvm_pgtable_stage2_unmap(pgt, addr, addr + size);
+}
+
+int pkvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size)
+{
+ struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu);
+ pkvm_handle_t handle = kvm->arch.pkvm.handle;
+ struct pkvm_mapping *mapping;
+ int ret = 0;
+
+ lockdep_assert_held(&kvm->mmu_lock);
+ for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping) {
+ ret = kvm_call_hyp_nvhe(__pkvm_host_wrprotect_guest, handle, mapping->gfn,
+ mapping->nr_pages);
+ if (WARN_ON(ret))
+ break;
+ }
+
+ return ret;
+}
+
+int pkvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size)
+{
+ struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu);
+ struct pkvm_mapping *mapping;
+
+ lockdep_assert_held(&kvm->mmu_lock);
+ for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping)
+ __clean_dcache_guest_page(pfn_to_kaddr(mapping->pfn),
+ PAGE_SIZE * mapping->nr_pages);
+
return 0;
}
+
+bool pkvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr, u64 size, bool mkold)
+{
+ struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu);
+ pkvm_handle_t handle = kvm->arch.pkvm.handle;
+ struct pkvm_mapping *mapping;
+ bool young = false;
+
+ lockdep_assert_held(&kvm->mmu_lock);
+ for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping)
+ young |= kvm_call_hyp_nvhe(__pkvm_host_test_clear_young_guest, handle, mapping->gfn,
+ mapping->nr_pages, mkold);
+
+ return young;
+}
+
+int pkvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, enum kvm_pgtable_prot prot,
+ enum kvm_pgtable_walk_flags flags)
+{
+ return kvm_call_hyp_nvhe(__pkvm_host_relax_perms_guest, addr >> PAGE_SHIFT, prot);
+}
+
+void pkvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr,
+ enum kvm_pgtable_walk_flags flags)
+{
+ WARN_ON(kvm_call_hyp_nvhe(__pkvm_host_mkyoung_guest, addr >> PAGE_SHIFT));
+}
+
+void pkvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, s8 level)
+{
+ WARN_ON_ONCE(1);
+}
+
+kvm_pte_t *pkvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt, u64 phys, s8 level,
+ enum kvm_pgtable_prot prot, void *mc, bool force_pte)
+{
+ WARN_ON_ONCE(1);
+ return NULL;
+}
+
+int pkvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
+ struct kvm_mmu_memory_cache *mc)
+{
+ WARN_ON_ONCE(1);
+ return -EINVAL;
+}
diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c
index 24908400e190..b03dbda7f1ab 100644
--- a/arch/arm64/kvm/pmu-emul.c
+++ b/arch/arm64/kvm/pmu-emul.c
@@ -17,13 +17,18 @@
#define PERF_ATTR_CFG1_COUNTER_64BIT BIT(0)
-DEFINE_STATIC_KEY_FALSE(kvm_arm_pmu_available);
-
static LIST_HEAD(arm_pmus);
static DEFINE_MUTEX(arm_pmus_lock);
static void kvm_pmu_create_perf_event(struct kvm_pmc *pmc);
static void kvm_pmu_release_perf_event(struct kvm_pmc *pmc);
+static bool kvm_pmu_counter_is_enabled(struct kvm_pmc *pmc);
+
+bool kvm_supports_guest_pmuv3(void)
+{
+ guard(mutex)(&arm_pmus_lock);
+ return !list_empty(&arm_pmus);
+}
static struct kvm_vcpu *kvm_pmc_to_vcpu(const struct kvm_pmc *pmc)
{
@@ -35,12 +40,8 @@ static struct kvm_pmc *kvm_vcpu_idx_to_pmc(struct kvm_vcpu *vcpu, int cnt_idx)
return &vcpu->arch.pmu.pmc[cnt_idx];
}
-static u32 kvm_pmu_event_mask(struct kvm *kvm)
+static u32 __kvm_pmu_event_mask(unsigned int pmuver)
{
- unsigned int pmuver;
-
- pmuver = kvm->arch.arm_pmu->pmuver;
-
switch (pmuver) {
case ID_AA64DFR0_EL1_PMUVer_IMP:
return GENMASK(9, 0);
@@ -55,19 +56,49 @@ static u32 kvm_pmu_event_mask(struct kvm *kvm)
}
}
+static u32 kvm_pmu_event_mask(struct kvm *kvm)
+{
+ u64 dfr0 = kvm_read_vm_id_reg(kvm, SYS_ID_AA64DFR0_EL1);
+ u8 pmuver = SYS_FIELD_GET(ID_AA64DFR0_EL1, PMUVer, dfr0);
+
+ return __kvm_pmu_event_mask(pmuver);
+}
+
+u64 kvm_pmu_evtyper_mask(struct kvm *kvm)
+{
+ u64 mask = ARMV8_PMU_EXCLUDE_EL1 | ARMV8_PMU_EXCLUDE_EL0 |
+ kvm_pmu_event_mask(kvm);
+
+ if (kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL2, IMP))
+ mask |= ARMV8_PMU_INCLUDE_EL2;
+
+ if (kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL3, IMP))
+ mask |= ARMV8_PMU_EXCLUDE_NS_EL0 |
+ ARMV8_PMU_EXCLUDE_NS_EL1 |
+ ARMV8_PMU_EXCLUDE_EL3;
+
+ return mask;
+}
+
/**
* kvm_pmc_is_64bit - determine if counter is 64bit
* @pmc: counter context
*/
static bool kvm_pmc_is_64bit(struct kvm_pmc *pmc)
{
+ struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc);
+
return (pmc->idx == ARMV8_PMU_CYCLE_IDX ||
- kvm_pmu_is_3p5(kvm_pmc_to_vcpu(pmc)));
+ kvm_has_feat(vcpu->kvm, ID_AA64DFR0_EL1, PMUVer, V3P5));
}
static bool kvm_pmc_has_64bit_overflow(struct kvm_pmc *pmc)
{
- u64 val = __vcpu_sys_reg(kvm_pmc_to_vcpu(pmc), PMCR_EL0);
+ struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc);
+ u64 val = kvm_vcpu_read_pmcr(vcpu);
+
+ if (kvm_pmu_counter_is_hyp(vcpu, pmc->idx))
+ return __vcpu_sys_reg(vcpu, MDCR_EL2) & MDCR_EL2_HLP;
return (pmc->idx < ARMV8_PMU_CYCLE_IDX && (val & ARMV8_PMU_PMCR_LP)) ||
(pmc->idx == ARMV8_PMU_CYCLE_IDX && (val & ARMV8_PMU_PMCR_LC));
@@ -89,6 +120,11 @@ static u32 counter_index_to_evtreg(u64 idx)
return (idx == ARMV8_PMU_CYCLE_IDX) ? PMCCFILTR_EL0 : PMEVTYPER0_EL0 + idx;
}
+static u64 kvm_pmc_read_evtreg(const struct kvm_pmc *pmc)
+{
+ return __vcpu_sys_reg(kvm_pmc_to_vcpu(pmc), counter_index_to_evtreg(pmc->idx));
+}
+
static u64 kvm_pmu_get_pmc_value(struct kvm_pmc *pmc)
{
struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc);
@@ -118,9 +154,6 @@ static u64 kvm_pmu_get_pmc_value(struct kvm_pmc *pmc)
*/
u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx)
{
- if (!kvm_vcpu_has_pmu(vcpu))
- return 0;
-
return kvm_pmu_get_pmc_value(kvm_vcpu_idx_to_pmc(vcpu, select_idx));
}
@@ -145,7 +178,7 @@ static void kvm_pmu_set_pmc_value(struct kvm_pmc *pmc, u64 val, bool force)
val |= lower_32_bits(val);
}
- __vcpu_sys_reg(vcpu, reg) = val;
+ __vcpu_assign_sys_reg(vcpu, reg, val);
/* Recreate the perf event to reflect the updated sample_period */
kvm_pmu_create_perf_event(pmc);
@@ -159,13 +192,23 @@ static void kvm_pmu_set_pmc_value(struct kvm_pmc *pmc, u64 val, bool force)
*/
void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val)
{
- if (!kvm_vcpu_has_pmu(vcpu))
- return;
-
kvm_pmu_set_pmc_value(kvm_vcpu_idx_to_pmc(vcpu, select_idx), val, false);
}
/**
+ * kvm_pmu_set_counter_value_user - set PMU counter value from user
+ * @vcpu: The vcpu pointer
+ * @select_idx: The counter index
+ * @val: The counter value
+ */
+void kvm_pmu_set_counter_value_user(struct kvm_vcpu *vcpu, u64 select_idx, u64 val)
+{
+ kvm_pmu_release_perf_event(kvm_vcpu_idx_to_pmc(vcpu, select_idx));
+ __vcpu_assign_sys_reg(vcpu, counter_index_to_reg(select_idx), val);
+ kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu);
+}
+
+/**
* kvm_pmu_release_perf_event - remove the perf event
* @pmc: The PMU counter pointer
*/
@@ -196,7 +239,7 @@ static void kvm_pmu_stop_counter(struct kvm_pmc *pmc)
reg = counter_index_to_reg(pmc->idx);
- __vcpu_sys_reg(vcpu, reg) = val;
+ __vcpu_assign_sys_reg(vcpu, reg, val);
kvm_pmu_release_perf_event(pmc);
}
@@ -211,25 +254,11 @@ void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu)
int i;
struct kvm_pmu *pmu = &vcpu->arch.pmu;
- for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++)
+ for (i = 0; i < KVM_ARMV8_PMU_MAX_COUNTERS; i++)
pmu->pmc[i].idx = i;
}
/**
- * kvm_pmu_vcpu_reset - reset pmu state for cpu
- * @vcpu: The vcpu pointer
- *
- */
-void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu)
-{
- unsigned long mask = kvm_pmu_valid_counter_mask(vcpu);
- int i;
-
- for_each_set_bit(i, &mask, 32)
- kvm_pmu_stop_counter(kvm_vcpu_idx_to_pmc(vcpu, i));
-}
-
-/**
* kvm_pmu_vcpu_destroy - free perf event of PMU for cpu
* @vcpu: The vcpu pointer
*
@@ -238,92 +267,128 @@ void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu)
{
int i;
- for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++)
+ for (i = 0; i < KVM_ARMV8_PMU_MAX_COUNTERS; i++)
kvm_pmu_release_perf_event(kvm_vcpu_idx_to_pmc(vcpu, i));
irq_work_sync(&vcpu->arch.pmu.overflow_work);
}
-u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu)
+static u64 kvm_pmu_hyp_counter_mask(struct kvm_vcpu *vcpu)
{
- u64 val = __vcpu_sys_reg(vcpu, PMCR_EL0) >> ARMV8_PMU_PMCR_N_SHIFT;
+ unsigned int hpmn, n;
+
+ if (!vcpu_has_nv(vcpu))
+ return 0;
+
+ hpmn = SYS_FIELD_GET(MDCR_EL2, HPMN, __vcpu_sys_reg(vcpu, MDCR_EL2));
+ n = vcpu->kvm->arch.nr_pmu_counters;
+
+ /*
+ * Programming HPMN to a value greater than PMCR_EL0.N is
+ * CONSTRAINED UNPREDICTABLE. Make the implementation choice that an
+ * UNKNOWN number of counters (in our case, zero) are reserved for EL2.
+ */
+ if (hpmn >= n)
+ return 0;
+
+ /*
+ * Programming HPMN=0 is CONSTRAINED UNPREDICTABLE if FEAT_HPMN0 isn't
+ * implemented. Since KVM's ability to emulate HPMN=0 does not directly
+ * depend on hardware (all PMU registers are trapped), make the
+ * implementation choice that all counters are included in the second
+ * range reserved for EL2/EL3.
+ */
+ return GENMASK(n - 1, hpmn);
+}
+
+bool kvm_pmu_counter_is_hyp(struct kvm_vcpu *vcpu, unsigned int idx)
+{
+ return kvm_pmu_hyp_counter_mask(vcpu) & BIT(idx);
+}
+
+u64 kvm_pmu_accessible_counter_mask(struct kvm_vcpu *vcpu)
+{
+ u64 mask = kvm_pmu_implemented_counter_mask(vcpu);
+
+ if (!vcpu_has_nv(vcpu) || vcpu_is_el2(vcpu))
+ return mask;
+
+ return mask & ~kvm_pmu_hyp_counter_mask(vcpu);
+}
+
+u64 kvm_pmu_implemented_counter_mask(struct kvm_vcpu *vcpu)
+{
+ u64 val = FIELD_GET(ARMV8_PMU_PMCR_N, kvm_vcpu_read_pmcr(vcpu));
- val &= ARMV8_PMU_PMCR_N_MASK;
if (val == 0)
return BIT(ARMV8_PMU_CYCLE_IDX);
else
return GENMASK(val - 1, 0) | BIT(ARMV8_PMU_CYCLE_IDX);
}
-/**
- * kvm_pmu_enable_counter_mask - enable selected PMU counters
- * @vcpu: The vcpu pointer
- * @val: the value guest writes to PMCNTENSET register
- *
- * Call perf_event_enable to start counting the perf event
- */
-void kvm_pmu_enable_counter_mask(struct kvm_vcpu *vcpu, u64 val)
+static void kvm_pmc_enable_perf_event(struct kvm_pmc *pmc)
{
- int i;
- if (!kvm_vcpu_has_pmu(vcpu))
+ if (!pmc->perf_event) {
+ kvm_pmu_create_perf_event(pmc);
return;
+ }
- if (!(__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E) || !val)
- return;
-
- for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) {
- struct kvm_pmc *pmc;
-
- if (!(val & BIT(i)))
- continue;
-
- pmc = kvm_vcpu_idx_to_pmc(vcpu, i);
+ perf_event_enable(pmc->perf_event);
+ if (pmc->perf_event->state != PERF_EVENT_STATE_ACTIVE)
+ kvm_debug("fail to enable perf event\n");
+}
- if (!pmc->perf_event) {
- kvm_pmu_create_perf_event(pmc);
- } else {
- perf_event_enable(pmc->perf_event);
- if (pmc->perf_event->state != PERF_EVENT_STATE_ACTIVE)
- kvm_debug("fail to enable perf event\n");
- }
- }
+static void kvm_pmc_disable_perf_event(struct kvm_pmc *pmc)
+{
+ if (pmc->perf_event)
+ perf_event_disable(pmc->perf_event);
}
-/**
- * kvm_pmu_disable_counter_mask - disable selected PMU counters
- * @vcpu: The vcpu pointer
- * @val: the value guest writes to PMCNTENCLR register
- *
- * Call perf_event_disable to stop counting the perf event
- */
-void kvm_pmu_disable_counter_mask(struct kvm_vcpu *vcpu, u64 val)
+void kvm_pmu_reprogram_counter_mask(struct kvm_vcpu *vcpu, u64 val)
{
int i;
- if (!kvm_vcpu_has_pmu(vcpu) || !val)
+ if (!val)
return;
- for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) {
- struct kvm_pmc *pmc;
+ for (i = 0; i < KVM_ARMV8_PMU_MAX_COUNTERS; i++) {
+ struct kvm_pmc *pmc = kvm_vcpu_idx_to_pmc(vcpu, i);
if (!(val & BIT(i)))
continue;
- pmc = kvm_vcpu_idx_to_pmc(vcpu, i);
-
- if (pmc->perf_event)
- perf_event_disable(pmc->perf_event);
+ if (kvm_pmu_counter_is_enabled(pmc))
+ kvm_pmc_enable_perf_event(pmc);
+ else
+ kvm_pmc_disable_perf_event(pmc);
}
+
+ kvm_vcpu_pmu_restore_guest(vcpu);
}
-static u64 kvm_pmu_overflow_status(struct kvm_vcpu *vcpu)
+/*
+ * Returns the PMU overflow state, which is true if there exists an event
+ * counter where the values of the global enable control, PMOVSSET_EL0[n], and
+ * PMINTENSET_EL1[n] are all 1.
+ */
+static bool kvm_pmu_overflow_status(struct kvm_vcpu *vcpu)
{
- u64 reg = 0;
+ u64 reg = __vcpu_sys_reg(vcpu, PMOVSSET_EL0);
- if ((__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E)) {
- reg = __vcpu_sys_reg(vcpu, PMOVSSET_EL0);
- reg &= __vcpu_sys_reg(vcpu, PMCNTENSET_EL0);
- reg &= __vcpu_sys_reg(vcpu, PMINTENSET_EL1);
- }
+ reg &= __vcpu_sys_reg(vcpu, PMINTENSET_EL1);
+
+ /*
+ * PMCR_EL0.E is the global enable control for event counters available
+ * to EL0 and EL1.
+ */
+ if (!(kvm_vcpu_read_pmcr(vcpu) & ARMV8_PMU_PMCR_E))
+ reg &= kvm_pmu_hyp_counter_mask(vcpu);
+
+ /*
+ * Otherwise, MDCR_EL2.HPME is the global enable control for event
+ * counters reserved for EL2.
+ */
+ if (!(vcpu_read_sys_reg(vcpu, MDCR_EL2) & MDCR_EL2_HPME))
+ reg &= ~kvm_pmu_hyp_counter_mask(vcpu);
return reg;
}
@@ -333,17 +398,14 @@ static void kvm_pmu_update_state(struct kvm_vcpu *vcpu)
struct kvm_pmu *pmu = &vcpu->arch.pmu;
bool overflow;
- if (!kvm_vcpu_has_pmu(vcpu))
- return;
-
- overflow = !!kvm_pmu_overflow_status(vcpu);
+ overflow = kvm_pmu_overflow_status(vcpu);
if (pmu->irq_level == overflow)
return;
pmu->irq_level = overflow;
if (likely(irqchip_in_kernel(vcpu->kvm))) {
- int ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id,
+ int ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu,
pmu->irq_num, overflow, pmu);
WARN_ON(ret);
}
@@ -398,7 +460,7 @@ void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu)
kvm_pmu_update_state(vcpu);
}
-/**
+/*
* When perf interrupt is an NMI, we cannot safely notify the vcpu corresponding
* to the event.
* This is why we need a callback to do it once outside of the NMI context.
@@ -421,7 +483,7 @@ static void kvm_pmu_counter_increment(struct kvm_vcpu *vcpu,
{
int i;
- if (!(__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E))
+ if (!(kvm_vcpu_read_pmcr(vcpu) & ARMV8_PMU_PMCR_E))
return;
/* Weed out disabled counters */
@@ -441,14 +503,14 @@ static void kvm_pmu_counter_increment(struct kvm_vcpu *vcpu,
reg = __vcpu_sys_reg(vcpu, counter_index_to_reg(i)) + 1;
if (!kvm_pmc_is_64bit(pmc))
reg = lower_32_bits(reg);
- __vcpu_sys_reg(vcpu, counter_index_to_reg(i)) = reg;
+ __vcpu_assign_sys_reg(vcpu, counter_index_to_reg(i), reg);
/* No overflow? move on */
if (kvm_pmc_has_64bit_overflow(pmc) ? reg : lower_32_bits(reg))
continue;
/* Mark overflow */
- __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(i);
+ __vcpu_rmw_sys_reg(vcpu, PMOVSSET_EL0, |=, BIT(i));
if (kvm_pmu_counter_can_chain(pmc))
kvm_pmu_counter_increment(vcpu, BIT(i + 1),
@@ -469,7 +531,7 @@ static u64 compute_period(struct kvm_pmc *pmc, u64 counter)
return val;
}
-/**
+/*
* When the perf event overflows, set the overflow status and inform the vcpu.
*/
static void kvm_pmu_perf_overflow(struct perf_event *perf_event,
@@ -494,7 +556,7 @@ static void kvm_pmu_perf_overflow(struct perf_event *perf_event,
perf_event->attr.sample_period = period;
perf_event->hw.sample_period = period;
- __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(idx);
+ __vcpu_rmw_sys_reg(vcpu, PMOVSSET_EL0, |=, BIT(idx));
if (kvm_pmu_counter_can_chain(pmc))
kvm_pmu_counter_increment(vcpu, BIT(idx + 1),
@@ -531,29 +593,27 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val)
{
int i;
- if (!kvm_vcpu_has_pmu(vcpu))
- return;
-
/* Fixup PMCR_EL0 to reconcile the PMU version and the LP bit */
- if (!kvm_pmu_is_3p5(vcpu))
+ if (!kvm_has_feat(vcpu->kvm, ID_AA64DFR0_EL1, PMUVer, V3P5))
val &= ~ARMV8_PMU_PMCR_LP;
- __vcpu_sys_reg(vcpu, PMCR_EL0) = val;
+ /* Request a reload of the PMU to enable/disable affected counters */
+ if ((__vcpu_sys_reg(vcpu, PMCR_EL0) ^ val) & ARMV8_PMU_PMCR_E)
+ kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu);
- if (val & ARMV8_PMU_PMCR_E) {
- kvm_pmu_enable_counter_mask(vcpu,
- __vcpu_sys_reg(vcpu, PMCNTENSET_EL0));
- } else {
- kvm_pmu_disable_counter_mask(vcpu,
- __vcpu_sys_reg(vcpu, PMCNTENSET_EL0));
- }
+ /* The reset bits don't indicate any state, and shouldn't be saved. */
+ __vcpu_assign_sys_reg(vcpu, PMCR_EL0, (val & ~(ARMV8_PMU_PMCR_C | ARMV8_PMU_PMCR_P)));
if (val & ARMV8_PMU_PMCR_C)
kvm_pmu_set_counter_value(vcpu, ARMV8_PMU_CYCLE_IDX, 0);
if (val & ARMV8_PMU_PMCR_P) {
- unsigned long mask = kvm_pmu_valid_counter_mask(vcpu);
- mask &= ~BIT(ARMV8_PMU_CYCLE_IDX);
+ unsigned long mask = kvm_pmu_implemented_counter_mask(vcpu) &
+ ~BIT(ARMV8_PMU_CYCLE_IDX);
+
+ if (!vcpu_is_el2(vcpu))
+ mask &= ~kvm_pmu_hyp_counter_mask(vcpu);
+
for_each_set_bit(i, &mask, 32)
kvm_pmu_set_pmc_value(kvm_vcpu_idx_to_pmc(vcpu, i), 0, true);
}
@@ -562,8 +622,58 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val)
static bool kvm_pmu_counter_is_enabled(struct kvm_pmc *pmc)
{
struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc);
- return (__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E) &&
- (__vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & BIT(pmc->idx));
+ unsigned int mdcr = __vcpu_sys_reg(vcpu, MDCR_EL2);
+
+ if (!(__vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & BIT(pmc->idx)))
+ return false;
+
+ if (kvm_pmu_counter_is_hyp(vcpu, pmc->idx))
+ return mdcr & MDCR_EL2_HPME;
+
+ return kvm_vcpu_read_pmcr(vcpu) & ARMV8_PMU_PMCR_E;
+}
+
+static bool kvm_pmc_counts_at_el0(struct kvm_pmc *pmc)
+{
+ u64 evtreg = kvm_pmc_read_evtreg(pmc);
+ bool nsu = evtreg & ARMV8_PMU_EXCLUDE_NS_EL0;
+ bool u = evtreg & ARMV8_PMU_EXCLUDE_EL0;
+
+ return u == nsu;
+}
+
+static bool kvm_pmc_counts_at_el1(struct kvm_pmc *pmc)
+{
+ u64 evtreg = kvm_pmc_read_evtreg(pmc);
+ bool nsk = evtreg & ARMV8_PMU_EXCLUDE_NS_EL1;
+ bool p = evtreg & ARMV8_PMU_EXCLUDE_EL1;
+
+ return p == nsk;
+}
+
+static bool kvm_pmc_counts_at_el2(struct kvm_pmc *pmc)
+{
+ struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc);
+ u64 mdcr = __vcpu_sys_reg(vcpu, MDCR_EL2);
+
+ if (!kvm_pmu_counter_is_hyp(vcpu, pmc->idx) && (mdcr & MDCR_EL2_HPMD))
+ return false;
+
+ return kvm_pmc_read_evtreg(pmc) & ARMV8_PMU_INCLUDE_EL2;
+}
+
+static int kvm_map_pmu_event(struct kvm *kvm, unsigned int eventsel)
+{
+ struct arm_pmu *pmu = kvm->arch.arm_pmu;
+
+ /*
+ * The CPU PMU likely isn't PMUv3; let the driver provide a mapping
+ * for the guest's PMUv3 event ID.
+ */
+ if (unlikely(pmu->map_pmuv3_event))
+ return pmu->map_pmuv3_event(eventsel);
+
+ return eventsel;
}
/**
@@ -576,16 +686,16 @@ static void kvm_pmu_create_perf_event(struct kvm_pmc *pmc)
struct arm_pmu *arm_pmu = vcpu->kvm->arch.arm_pmu;
struct perf_event *event;
struct perf_event_attr attr;
- u64 eventsel, reg, data;
+ int eventsel;
+ u64 evtreg;
- reg = counter_index_to_evtreg(pmc->idx);
- data = __vcpu_sys_reg(vcpu, reg);
+ evtreg = kvm_pmc_read_evtreg(pmc);
kvm_pmu_stop_counter(pmc);
if (pmc->idx == ARMV8_PMU_CYCLE_IDX)
eventsel = ARMV8_PMUV3_PERFCTR_CPU_CYCLES;
else
- eventsel = data & kvm_pmu_event_mask(vcpu->kvm);
+ eventsel = evtreg & kvm_pmu_event_mask(vcpu->kvm);
/*
* Neither SW increment nor chained events need to be backed
@@ -603,18 +713,34 @@ static void kvm_pmu_create_perf_event(struct kvm_pmc *pmc)
!test_bit(eventsel, vcpu->kvm->arch.pmu_filter))
return;
+ /*
+ * Don't create an event if we're running on hardware that requires
+ * PMUv3 event translation and we couldn't find a valid mapping.
+ */
+ eventsel = kvm_map_pmu_event(vcpu->kvm, eventsel);
+ if (eventsel < 0)
+ return;
+
memset(&attr, 0, sizeof(struct perf_event_attr));
attr.type = arm_pmu->pmu.type;
attr.size = sizeof(attr);
attr.pinned = 1;
attr.disabled = !kvm_pmu_counter_is_enabled(pmc);
- attr.exclude_user = data & ARMV8_PMU_EXCLUDE_EL0 ? 1 : 0;
- attr.exclude_kernel = data & ARMV8_PMU_EXCLUDE_EL1 ? 1 : 0;
+ attr.exclude_user = !kvm_pmc_counts_at_el0(pmc);
attr.exclude_hv = 1; /* Don't count EL2 events */
attr.exclude_host = 1; /* Don't count host events */
attr.config = eventsel;
/*
+ * Filter events at EL1 (i.e. vEL2) when in a hyp context based on the
+ * guest's EL2 filter.
+ */
+ if (unlikely(is_hyp_ctxt(vcpu)))
+ attr.exclude_kernel = !kvm_pmc_counts_at_el2(pmc);
+ else
+ attr.exclude_kernel = !kvm_pmc_counts_at_el1(pmc);
+
+ /*
* If counting with a 64bit counter, advertise it to the perf
* code, carefully dealing with the initial sample period
* which also depends on the overflow.
@@ -650,18 +776,10 @@ void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data,
u64 select_idx)
{
struct kvm_pmc *pmc = kvm_vcpu_idx_to_pmc(vcpu, select_idx);
- u64 reg, mask;
-
- if (!kvm_vcpu_has_pmu(vcpu))
- return;
-
- mask = ARMV8_PMU_EVTYPE_MASK;
- mask &= ~ARMV8_PMU_EVTYPE_EVENT;
- mask |= kvm_pmu_event_mask(vcpu->kvm);
+ u64 reg;
reg = counter_index_to_evtreg(pmc->idx);
-
- __vcpu_sys_reg(vcpu, reg) = data & mask;
+ __vcpu_assign_sys_reg(vcpu, reg, (data & kvm_pmu_evtyper_mask(vcpu->kvm)));
kvm_pmu_create_perf_event(pmc);
}
@@ -670,93 +788,106 @@ void kvm_host_pmu_init(struct arm_pmu *pmu)
{
struct arm_pmu_entry *entry;
- if (pmu->pmuver == ID_AA64DFR0_EL1_PMUVer_NI ||
- pmu->pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF)
+ /*
+ * Check the sanitised PMU version for the system, as KVM does not
+ * support implementations where PMUv3 exists on a subset of CPUs.
+ */
+ if (!pmuv3_implemented(kvm_arm_pmu_get_pmuver_limit()))
return;
- mutex_lock(&arm_pmus_lock);
+ guard(mutex)(&arm_pmus_lock);
entry = kmalloc(sizeof(*entry), GFP_KERNEL);
if (!entry)
- goto out_unlock;
+ return;
entry->arm_pmu = pmu;
list_add_tail(&entry->entry, &arm_pmus);
-
- if (list_is_singular(&arm_pmus))
- static_branch_enable(&kvm_arm_pmu_available);
-
-out_unlock:
- mutex_unlock(&arm_pmus_lock);
}
static struct arm_pmu *kvm_pmu_probe_armpmu(void)
{
- struct perf_event_attr attr = { };
- struct perf_event *event;
- struct arm_pmu *pmu = NULL;
+ struct arm_pmu_entry *entry;
+ struct arm_pmu *pmu;
+ int cpu;
+
+ guard(mutex)(&arm_pmus_lock);
/*
- * Create a dummy event that only counts user cycles. As we'll never
- * leave this function with the event being live, it will never
- * count anything. But it allows us to probe some of the PMU
- * details. Yes, this is terrible.
+ * It is safe to use a stale cpu to iterate the list of PMUs so long as
+ * the same value is used for the entirety of the loop. Given this, and
+ * the fact that no percpu data is used for the lookup there is no need
+ * to disable preemption.
+ *
+ * It is still necessary to get a valid cpu, though, to probe for the
+ * default PMU instance as userspace is not required to specify a PMU
+ * type. In order to uphold the preexisting behavior KVM selects the
+ * PMU instance for the core during vcpu init. A dependent use
+ * case would be a user with disdain of all things big.LITTLE that
+ * affines the VMM to a particular cluster of cores.
+ *
+ * In any case, userspace should just do the sane thing and use the UAPI
+ * to select a PMU type directly. But, be wary of the baggage being
+ * carried here.
*/
- attr.type = PERF_TYPE_RAW;
- attr.size = sizeof(attr);
- attr.pinned = 1;
- attr.disabled = 0;
- attr.exclude_user = 0;
- attr.exclude_kernel = 1;
- attr.exclude_hv = 1;
- attr.exclude_host = 1;
- attr.config = ARMV8_PMUV3_PERFCTR_CPU_CYCLES;
- attr.sample_period = GENMASK(63, 0);
-
- event = perf_event_create_kernel_counter(&attr, -1, current,
- kvm_pmu_perf_overflow, &attr);
+ cpu = raw_smp_processor_id();
+ list_for_each_entry(entry, &arm_pmus, entry) {
+ pmu = entry->arm_pmu;
- if (IS_ERR(event)) {
- pr_err_once("kvm: pmu event creation failed %ld\n",
- PTR_ERR(event));
- return NULL;
+ if (cpumask_test_cpu(cpu, &pmu->supported_cpus))
+ return pmu;
}
- if (event->pmu) {
- pmu = to_arm_pmu(event->pmu);
- if (pmu->pmuver == ID_AA64DFR0_EL1_PMUVer_NI ||
- pmu->pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF)
- pmu = NULL;
- }
+ return NULL;
+}
+
+static u64 __compute_pmceid(struct arm_pmu *pmu, bool pmceid1)
+{
+ u32 hi[2], lo[2];
+
+ bitmap_to_arr32(lo, pmu->pmceid_bitmap, ARMV8_PMUV3_MAX_COMMON_EVENTS);
+ bitmap_to_arr32(hi, pmu->pmceid_ext_bitmap, ARMV8_PMUV3_MAX_COMMON_EVENTS);
- perf_event_disable(event);
- perf_event_release_kernel(event);
+ return ((u64)hi[pmceid1] << 32) | lo[pmceid1];
+}
+
+static u64 compute_pmceid0(struct arm_pmu *pmu)
+{
+ u64 val = __compute_pmceid(pmu, 0);
- return pmu;
+ /* always support SW_INCR */
+ val |= BIT(ARMV8_PMUV3_PERFCTR_SW_INCR);
+ /* always support CHAIN */
+ val |= BIT(ARMV8_PMUV3_PERFCTR_CHAIN);
+ return val;
+}
+
+static u64 compute_pmceid1(struct arm_pmu *pmu)
+{
+ u64 val = __compute_pmceid(pmu, 1);
+
+ /*
+ * Don't advertise STALL_SLOT*, as PMMIR_EL0 is handled
+ * as RAZ
+ */
+ val &= ~(BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT - 32) |
+ BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT_FRONTEND - 32) |
+ BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT_BACKEND - 32));
+ return val;
}
u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1)
{
+ struct arm_pmu *cpu_pmu = vcpu->kvm->arch.arm_pmu;
unsigned long *bmap = vcpu->kvm->arch.pmu_filter;
u64 val, mask = 0;
int base, i, nr_events;
- if (!kvm_vcpu_has_pmu(vcpu))
- return 0;
-
if (!pmceid1) {
- val = read_sysreg(pmceid0_el0);
- /* always support CHAIN */
- val |= BIT(ARMV8_PMUV3_PERFCTR_CHAIN);
+ val = compute_pmceid0(cpu_pmu);
base = 0;
} else {
- val = read_sysreg(pmceid1_el0);
- /*
- * Don't advertise STALL_SLOT, as PMMIR_EL0 is handled
- * as RAZ
- */
- if (vcpu->kvm->arch.arm_pmu->pmuver >= ID_AA64DFR0_EL1_PMUVer_V3P4)
- val &= ~BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT - 32);
+ val = compute_pmceid1(cpu_pmu);
base = 32;
}
@@ -779,11 +910,19 @@ u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1)
return val & mask;
}
-int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu)
+void kvm_vcpu_reload_pmu(struct kvm_vcpu *vcpu)
{
- if (!kvm_vcpu_has_pmu(vcpu))
- return 0;
+ u64 mask = kvm_pmu_implemented_counter_mask(vcpu);
+ __vcpu_rmw_sys_reg(vcpu, PMOVSSET_EL0, &=, mask);
+ __vcpu_rmw_sys_reg(vcpu, PMINTENSET_EL1, &=, mask);
+ __vcpu_rmw_sys_reg(vcpu, PMCNTENSET_EL0, &=, mask);
+
+ kvm_pmu_reprogram_counter_mask(vcpu, mask);
+}
+
+int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu)
+{
if (!vcpu->arch.pmu.created)
return -EINVAL;
@@ -806,9 +945,6 @@ int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu)
return -EINVAL;
}
- /* One-off reload of the PMU on first run */
- kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu);
-
return 0;
}
@@ -867,6 +1003,77 @@ static bool pmu_irq_is_valid(struct kvm *kvm, int irq)
return true;
}
+/**
+ * kvm_arm_pmu_get_max_counters - Return the max number of PMU counters.
+ * @kvm: The kvm pointer
+ */
+u8 kvm_arm_pmu_get_max_counters(struct kvm *kvm)
+{
+ struct arm_pmu *arm_pmu = kvm->arch.arm_pmu;
+
+ /*
+ * PMUv3 requires that all event counters are capable of counting any
+ * event, though the same may not be true of non-PMUv3 hardware.
+ */
+ if (cpus_have_final_cap(ARM64_WORKAROUND_PMUV3_IMPDEF_TRAPS))
+ return 1;
+
+ /*
+ * The arm_pmu->cntr_mask considers the fixed counter(s) as well.
+ * Ignore those and return only the general-purpose counters.
+ */
+ return bitmap_weight(arm_pmu->cntr_mask, ARMV8_PMU_MAX_GENERAL_COUNTERS);
+}
+
+static void kvm_arm_set_nr_counters(struct kvm *kvm, unsigned int nr)
+{
+ kvm->arch.nr_pmu_counters = nr;
+
+ /* Reset MDCR_EL2.HPMN behind the vcpus' back... */
+ if (test_bit(KVM_ARM_VCPU_HAS_EL2, kvm->arch.vcpu_features)) {
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ u64 val = __vcpu_sys_reg(vcpu, MDCR_EL2);
+ val &= ~MDCR_EL2_HPMN;
+ val |= FIELD_PREP(MDCR_EL2_HPMN, kvm->arch.nr_pmu_counters);
+ __vcpu_assign_sys_reg(vcpu, MDCR_EL2, val);
+ }
+ }
+}
+
+static void kvm_arm_set_pmu(struct kvm *kvm, struct arm_pmu *arm_pmu)
+{
+ lockdep_assert_held(&kvm->arch.config_lock);
+
+ kvm->arch.arm_pmu = arm_pmu;
+ kvm_arm_set_nr_counters(kvm, kvm_arm_pmu_get_max_counters(kvm));
+}
+
+/**
+ * kvm_arm_set_default_pmu - No PMU set, get the default one.
+ * @kvm: The kvm pointer
+ *
+ * The observant among you will notice that the supported_cpus
+ * mask does not get updated for the default PMU even though it
+ * is quite possible the selected instance supports only a
+ * subset of cores in the system. This is intentional, and
+ * upholds the preexisting behavior on heterogeneous systems
+ * where vCPUs can be scheduled on any core but the guest
+ * counters could stop working.
+ */
+int kvm_arm_set_default_pmu(struct kvm *kvm)
+{
+ struct arm_pmu *arm_pmu = kvm_pmu_probe_armpmu();
+
+ if (!arm_pmu)
+ return -ENODEV;
+
+ kvm_arm_set_pmu(kvm, arm_pmu);
+ return 0;
+}
+
static int kvm_arm_pmu_v3_set_pmu(struct kvm_vcpu *vcpu, int pmu_id)
{
struct kvm *kvm = vcpu->kvm;
@@ -874,19 +1081,19 @@ static int kvm_arm_pmu_v3_set_pmu(struct kvm_vcpu *vcpu, int pmu_id)
struct arm_pmu *arm_pmu;
int ret = -ENXIO;
- mutex_lock(&kvm->lock);
+ lockdep_assert_held(&kvm->arch.config_lock);
mutex_lock(&arm_pmus_lock);
list_for_each_entry(entry, &arm_pmus, entry) {
arm_pmu = entry->arm_pmu;
if (arm_pmu->pmu.type == pmu_id) {
- if (test_bit(KVM_ARCH_FLAG_HAS_RAN_ONCE, &kvm->arch.flags) ||
+ if (kvm_vm_has_ran_once(kvm) ||
(kvm->arch.pmu_filter && kvm->arch.arm_pmu != arm_pmu)) {
ret = -EBUSY;
break;
}
- kvm->arch.arm_pmu = arm_pmu;
+ kvm_arm_set_pmu(kvm, arm_pmu);
cpumask_copy(kvm->arch.supported_cpus, &arm_pmu->supported_cpus);
ret = 0;
break;
@@ -894,31 +1101,35 @@ static int kvm_arm_pmu_v3_set_pmu(struct kvm_vcpu *vcpu, int pmu_id)
}
mutex_unlock(&arm_pmus_lock);
- mutex_unlock(&kvm->lock);
return ret;
}
+static int kvm_arm_pmu_v3_set_nr_counters(struct kvm_vcpu *vcpu, unsigned int n)
+{
+ struct kvm *kvm = vcpu->kvm;
+
+ if (!kvm->arch.arm_pmu)
+ return -EINVAL;
+
+ if (n > kvm_arm_pmu_get_max_counters(kvm))
+ return -EINVAL;
+
+ kvm_arm_set_nr_counters(kvm, n);
+ return 0;
+}
+
int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
{
struct kvm *kvm = vcpu->kvm;
+ lockdep_assert_held(&kvm->arch.config_lock);
+
if (!kvm_vcpu_has_pmu(vcpu))
return -ENODEV;
if (vcpu->arch.pmu.created)
return -EBUSY;
- mutex_lock(&kvm->lock);
- if (!kvm->arch.arm_pmu) {
- /* No PMU set, get the default one */
- kvm->arch.arm_pmu = kvm_pmu_probe_armpmu();
- if (!kvm->arch.arm_pmu) {
- mutex_unlock(&kvm->lock);
- return -ENODEV;
- }
- }
- mutex_unlock(&kvm->lock);
-
switch (attr->attr) {
case KVM_ARM_VCPU_PMU_V3_IRQ: {
int __user *uaddr = (int __user *)(long)attr->addr;
@@ -945,11 +1156,17 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
return 0;
}
case KVM_ARM_VCPU_PMU_V3_FILTER: {
+ u8 pmuver = kvm_arm_pmu_get_pmuver_limit();
struct kvm_pmu_event_filter __user *uaddr;
struct kvm_pmu_event_filter filter;
int nr_events;
- nr_events = kvm_pmu_event_mask(kvm) + 1;
+ /*
+ * Allow userspace to specify an event filter for the entire
+ * event range supported by PMUVer of the hardware, rather
+ * than the guest's PMUVer for KVM backward compatibility.
+ */
+ nr_events = __kvm_pmu_event_mask(pmuver) + 1;
uaddr = (struct kvm_pmu_event_filter __user *)(long)attr->addr;
@@ -961,19 +1178,13 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
filter.action != KVM_PMU_EVENT_DENY))
return -EINVAL;
- mutex_lock(&kvm->lock);
-
- if (test_bit(KVM_ARCH_FLAG_HAS_RAN_ONCE, &kvm->arch.flags)) {
- mutex_unlock(&kvm->lock);
+ if (kvm_vm_has_ran_once(kvm))
return -EBUSY;
- }
if (!kvm->arch.pmu_filter) {
kvm->arch.pmu_filter = bitmap_alloc(nr_events, GFP_KERNEL_ACCOUNT);
- if (!kvm->arch.pmu_filter) {
- mutex_unlock(&kvm->lock);
+ if (!kvm->arch.pmu_filter)
return -ENOMEM;
- }
/*
* The default depends on the first applied filter.
@@ -992,8 +1203,6 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
else
bitmap_clear(kvm->arch.pmu_filter, filter.base_event, filter.nevents);
- mutex_unlock(&kvm->lock);
-
return 0;
}
case KVM_ARM_VCPU_PMU_V3_SET_PMU: {
@@ -1005,6 +1214,15 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
return kvm_arm_pmu_v3_set_pmu(vcpu, pmu_id);
}
+ case KVM_ARM_VCPU_PMU_V3_SET_NR_COUNTERS: {
+ unsigned int __user *uaddr = (unsigned int __user *)(long)attr->addr;
+ unsigned int n;
+
+ if (get_user(n, uaddr))
+ return -EFAULT;
+
+ return kvm_arm_pmu_v3_set_nr_counters(vcpu, n);
+ }
case KVM_ARM_VCPU_PMU_V3_INIT:
return kvm_arm_pmu_v3_init(vcpu);
}
@@ -1043,6 +1261,7 @@ int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
case KVM_ARM_VCPU_PMU_V3_INIT:
case KVM_ARM_VCPU_PMU_V3_FILTER:
case KVM_ARM_VCPU_PMU_V3_SET_PMU:
+ case KVM_ARM_VCPU_PMU_V3_SET_NR_COUNTERS:
if (kvm_vcpu_has_pmu(vcpu))
return 0;
}
@@ -1052,11 +1271,65 @@ int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
u8 kvm_arm_pmu_get_pmuver_limit(void)
{
- u64 tmp;
+ unsigned int pmuver;
+
+ pmuver = SYS_FIELD_GET(ID_AA64DFR0_EL1, PMUVer,
+ read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1));
+
+ /*
+ * Spoof a barebones PMUv3 implementation if the system supports IMPDEF
+ * traps of the PMUv3 sysregs
+ */
+ if (cpus_have_final_cap(ARM64_WORKAROUND_PMUV3_IMPDEF_TRAPS))
+ return ID_AA64DFR0_EL1_PMUVer_IMP;
+
+ /*
+ * Otherwise, treat IMPLEMENTATION DEFINED functionality as
+ * unimplemented
+ */
+ if (pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF)
+ return 0;
+
+ return min(pmuver, ID_AA64DFR0_EL1_PMUVer_V3P5);
+}
+
+/**
+ * kvm_vcpu_read_pmcr - Read PMCR_EL0 register for the vCPU
+ * @vcpu: The vcpu pointer
+ */
+u64 kvm_vcpu_read_pmcr(struct kvm_vcpu *vcpu)
+{
+ u64 pmcr = __vcpu_sys_reg(vcpu, PMCR_EL0);
+ u64 n = vcpu->kvm->arch.nr_pmu_counters;
+
+ if (vcpu_has_nv(vcpu) && !vcpu_is_el2(vcpu))
+ n = FIELD_GET(MDCR_EL2_HPMN, __vcpu_sys_reg(vcpu, MDCR_EL2));
+
+ return u64_replace_bits(pmcr, n, ARMV8_PMU_PMCR_N);
+}
+
+void kvm_pmu_nested_transition(struct kvm_vcpu *vcpu)
+{
+ bool reprogrammed = false;
+ unsigned long mask;
+ int i;
+
+ mask = __vcpu_sys_reg(vcpu, PMCNTENSET_EL0);
+ for_each_set_bit(i, &mask, 32) {
+ struct kvm_pmc *pmc = kvm_vcpu_idx_to_pmc(vcpu, i);
+
+ /*
+ * We only need to reconfigure events where the filter is
+ * different at EL1 vs. EL2, as we're multiplexing the true EL1
+ * event filter bit for nested.
+ */
+ if (kvm_pmc_counts_at_el1(pmc) == kvm_pmc_counts_at_el2(pmc))
+ continue;
+
+ kvm_pmu_create_perf_event(pmc);
+ reprogrammed = true;
+ }
- tmp = read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1);
- tmp = cpuid_feature_cap_perfmon_field(tmp,
- ID_AA64DFR0_EL1_PMUVer_SHIFT,
- ID_AA64DFR0_EL1_PMUVer_V3P5);
- return FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), tmp);
+ if (reprogrammed)
+ kvm_vcpu_pmu_restore_guest(vcpu);
}
diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c
index 7887133d15f0..6b48a3d16d0d 100644
--- a/arch/arm64/kvm/pmu.c
+++ b/arch/arm64/kvm/pmu.c
@@ -5,6 +5,8 @@
*/
#include <linux/kvm_host.h>
#include <linux/perf_event.h>
+#include <linux/perf/arm_pmu.h>
+#include <linux/perf/arm_pmuv3.h>
static DEFINE_PER_CPU(struct kvm_pmu_events, kvm_pmu_events);
@@ -35,11 +37,11 @@ struct kvm_pmu_events *kvm_get_pmu_events(void)
* Add events to track that we may want to switch at guest entry/exit
* time.
*/
-void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr)
+void kvm_set_pmu_events(u64 set, struct perf_event_attr *attr)
{
struct kvm_pmu_events *pmu = kvm_get_pmu_events();
- if (!kvm_arm_support_pmu_v3() || !pmu || !kvm_pmu_switch_needed(attr))
+ if (!system_supports_pmuv3() || !kvm_pmu_switch_needed(attr))
return;
if (!attr->exclude_host)
@@ -51,90 +53,43 @@ void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr)
/*
* Stop tracking events
*/
-void kvm_clr_pmu_events(u32 clr)
+void kvm_clr_pmu_events(u64 clr)
{
struct kvm_pmu_events *pmu = kvm_get_pmu_events();
- if (!kvm_arm_support_pmu_v3() || !pmu)
+ if (!system_supports_pmuv3())
return;
pmu->events_host &= ~clr;
pmu->events_guest &= ~clr;
}
-#define PMEVTYPER_READ_CASE(idx) \
- case idx: \
- return read_sysreg(pmevtyper##idx##_el0)
-
-#define PMEVTYPER_WRITE_CASE(idx) \
- case idx: \
- write_sysreg(val, pmevtyper##idx##_el0); \
- break
-
-#define PMEVTYPER_CASES(readwrite) \
- PMEVTYPER_##readwrite##_CASE(0); \
- PMEVTYPER_##readwrite##_CASE(1); \
- PMEVTYPER_##readwrite##_CASE(2); \
- PMEVTYPER_##readwrite##_CASE(3); \
- PMEVTYPER_##readwrite##_CASE(4); \
- PMEVTYPER_##readwrite##_CASE(5); \
- PMEVTYPER_##readwrite##_CASE(6); \
- PMEVTYPER_##readwrite##_CASE(7); \
- PMEVTYPER_##readwrite##_CASE(8); \
- PMEVTYPER_##readwrite##_CASE(9); \
- PMEVTYPER_##readwrite##_CASE(10); \
- PMEVTYPER_##readwrite##_CASE(11); \
- PMEVTYPER_##readwrite##_CASE(12); \
- PMEVTYPER_##readwrite##_CASE(13); \
- PMEVTYPER_##readwrite##_CASE(14); \
- PMEVTYPER_##readwrite##_CASE(15); \
- PMEVTYPER_##readwrite##_CASE(16); \
- PMEVTYPER_##readwrite##_CASE(17); \
- PMEVTYPER_##readwrite##_CASE(18); \
- PMEVTYPER_##readwrite##_CASE(19); \
- PMEVTYPER_##readwrite##_CASE(20); \
- PMEVTYPER_##readwrite##_CASE(21); \
- PMEVTYPER_##readwrite##_CASE(22); \
- PMEVTYPER_##readwrite##_CASE(23); \
- PMEVTYPER_##readwrite##_CASE(24); \
- PMEVTYPER_##readwrite##_CASE(25); \
- PMEVTYPER_##readwrite##_CASE(26); \
- PMEVTYPER_##readwrite##_CASE(27); \
- PMEVTYPER_##readwrite##_CASE(28); \
- PMEVTYPER_##readwrite##_CASE(29); \
- PMEVTYPER_##readwrite##_CASE(30)
-
/*
* Read a value direct from PMEVTYPER<idx> where idx is 0-30
- * or PMCCFILTR_EL0 where idx is ARMV8_PMU_CYCLE_IDX (31).
+ * or PMxCFILTR_EL0 where idx is 31-32.
*/
static u64 kvm_vcpu_pmu_read_evtype_direct(int idx)
{
- switch (idx) {
- PMEVTYPER_CASES(READ);
- case ARMV8_PMU_CYCLE_IDX:
- return read_sysreg(pmccfiltr_el0);
- default:
- WARN_ON(1);
- }
+ if (idx == ARMV8_PMU_CYCLE_IDX)
+ return read_pmccfiltr();
+ else if (idx == ARMV8_PMU_INSTR_IDX)
+ return read_pmicfiltr();
- return 0;
+ return read_pmevtypern(idx);
}
/*
* Write a value direct to PMEVTYPER<idx> where idx is 0-30
- * or PMCCFILTR_EL0 where idx is ARMV8_PMU_CYCLE_IDX (31).
+ * or PMxCFILTR_EL0 where idx is 31-32.
*/
static void kvm_vcpu_pmu_write_evtype_direct(int idx, u32 val)
{
- switch (idx) {
- PMEVTYPER_CASES(WRITE);
- case ARMV8_PMU_CYCLE_IDX:
- write_sysreg(val, pmccfiltr_el0);
- break;
- default:
- WARN_ON(1);
- }
+ if (idx == ARMV8_PMU_CYCLE_IDX)
+ write_pmccfiltr(val);
+ else if (idx == ARMV8_PMU_INSTR_IDX)
+ write_pmicfiltr(val);
+ else
+ write_pmevtypern(idx, val);
}
/*
@@ -145,7 +100,7 @@ static void kvm_vcpu_pmu_enable_el0(unsigned long events)
u64 typer;
u32 counter;
- for_each_set_bit(counter, &events, 32) {
+ for_each_set_bit(counter, &events, ARMPMU_MAX_HWEVENTS) {
typer = kvm_vcpu_pmu_read_evtype_direct(counter);
typer &= ~ARMV8_PMU_EXCLUDE_EL0;
kvm_vcpu_pmu_write_evtype_direct(counter, typer);
@@ -160,7 +115,7 @@ static void kvm_vcpu_pmu_disable_el0(unsigned long events)
u64 typer;
u32 counter;
- for_each_set_bit(counter, &events, 32) {
+ for_each_set_bit(counter, &events, ARMPMU_MAX_HWEVENTS) {
typer = kvm_vcpu_pmu_read_evtype_direct(counter);
typer |= ARMV8_PMU_EXCLUDE_EL0;
kvm_vcpu_pmu_write_evtype_direct(counter, typer);
@@ -176,9 +131,9 @@ static void kvm_vcpu_pmu_disable_el0(unsigned long events)
void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu)
{
struct kvm_pmu_events *pmu;
- u32 events_guest, events_host;
+ u64 events_guest, events_host;
- if (!kvm_arm_support_pmu_v3() || !has_vhe())
+ if (!system_supports_pmuv3() || !has_vhe())
return;
preempt_disable();
@@ -197,9 +152,9 @@ void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu)
void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu)
{
struct kvm_pmu_events *pmu;
- u32 events_guest, events_host;
+ u64 events_guest, events_host;
- if (!kvm_arm_support_pmu_v3() || !has_vhe())
+ if (!system_supports_pmuv3() || !has_vhe())
return;
pmu = kvm_get_pmu_events();
@@ -209,3 +164,48 @@ void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu)
kvm_vcpu_pmu_enable_el0(events_host);
kvm_vcpu_pmu_disable_el0(events_guest);
}
+
+/*
+ * With VHE, keep track of the PMUSERENR_EL0 value for the host EL0 on the pCPU
+ * where PMUSERENR_EL0 for the guest is loaded, since PMUSERENR_EL0 is switched
+ * to the value for the guest on vcpu_load(). The value for the host EL0
+ * will be restored on vcpu_put(), before returning to userspace.
+ * This isn't necessary for nVHE, as the register is context switched for
+ * every guest enter/exit.
+ *
+ * Return true if KVM takes care of the register. Otherwise return false.
+ */
+bool kvm_set_pmuserenr(u64 val)
+{
+ struct kvm_cpu_context *hctxt;
+ struct kvm_vcpu *vcpu;
+
+ if (!system_supports_pmuv3() || !has_vhe())
+ return false;
+
+ vcpu = kvm_get_running_vcpu();
+ if (!vcpu || !vcpu_get_flag(vcpu, PMUSERENR_ON_CPU))
+ return false;
+
+ hctxt = host_data_ptr(host_ctxt);
+ ctxt_sys_reg(hctxt, PMUSERENR_EL0) = val;
+ return true;
+}
+
+/*
+ * If we interrupted the guest to update the host PMU context, make
+ * sure we re-apply the guest EL0 state.
+ */
+void kvm_vcpu_pmu_resync_el0(void)
+{
+ struct kvm_vcpu *vcpu;
+
+ if (!has_vhe() || !in_interrupt())
+ return;
+
+ vcpu = kvm_get_running_vcpu();
+ if (!vcpu)
+ return;
+
+ kvm_make_request(KVM_REQ_RESYNC_PMU_EL0, vcpu);
+}
diff --git a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c
index 7fbc4c1b9df0..3b5dbe9a0a0e 100644
--- a/arch/arm64/kvm/psci.c
+++ b/arch/arm64/kvm/psci.c
@@ -62,6 +62,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
struct vcpu_reset_state *reset_state;
struct kvm *kvm = source_vcpu->kvm;
struct kvm_vcpu *vcpu = NULL;
+ int ret = PSCI_RET_SUCCESS;
unsigned long cpu_id;
cpu_id = smccc_get_arg1(source_vcpu);
@@ -76,11 +77,15 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
*/
if (!vcpu)
return PSCI_RET_INVALID_PARAMS;
+
+ spin_lock(&vcpu->arch.mp_state_lock);
if (!kvm_arm_vcpu_stopped(vcpu)) {
if (kvm_psci_version(source_vcpu) != KVM_ARM_PSCI_0_1)
- return PSCI_RET_ALREADY_ON;
+ ret = PSCI_RET_ALREADY_ON;
else
- return PSCI_RET_INVALID_PARAMS;
+ ret = PSCI_RET_INVALID_PARAMS;
+
+ goto out_unlock;
}
reset_state = &vcpu->arch.reset_state;
@@ -96,7 +101,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
*/
reset_state->r0 = smccc_get_arg3(source_vcpu);
- WRITE_ONCE(reset_state->reset, true);
+ reset_state->reset = true;
kvm_make_request(KVM_REQ_VCPU_RESET, vcpu);
/*
@@ -105,10 +110,12 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
*/
smp_wmb();
- vcpu->arch.mp_state.mp_state = KVM_MP_STATE_RUNNABLE;
+ WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_RUNNABLE);
kvm_vcpu_wake_up(vcpu);
- return PSCI_RET_SUCCESS;
+out_unlock:
+ spin_unlock(&vcpu->arch.mp_state_lock);
+ return ret;
}
static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu)
@@ -168,8 +175,11 @@ static void kvm_prepare_system_event(struct kvm_vcpu *vcpu, u32 type, u64 flags)
* after this call is handled and before the VCPUs have been
* re-initialized.
*/
- kvm_for_each_vcpu(i, tmp, vcpu->kvm)
- tmp->arch.mp_state.mp_state = KVM_MP_STATE_STOPPED;
+ kvm_for_each_vcpu(i, tmp, vcpu->kvm) {
+ spin_lock(&tmp->arch.mp_state_lock);
+ WRITE_ONCE(tmp->arch.mp_state.mp_state, KVM_MP_STATE_STOPPED);
+ spin_unlock(&tmp->arch.mp_state_lock);
+ }
kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_SLEEP);
memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event));
@@ -184,6 +194,12 @@ static void kvm_psci_system_off(struct kvm_vcpu *vcpu)
kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_SHUTDOWN, 0);
}
+static void kvm_psci_system_off2(struct kvm_vcpu *vcpu)
+{
+ kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_SHUTDOWN,
+ KVM_SYSTEM_EVENT_SHUTDOWN_FLAG_PSCI_OFF2);
+}
+
static void kvm_psci_system_reset(struct kvm_vcpu *vcpu)
{
kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_RESET, 0);
@@ -229,7 +245,6 @@ static unsigned long kvm_psci_check_allowed_function(struct kvm_vcpu *vcpu, u32
static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu)
{
- struct kvm *kvm = vcpu->kvm;
u32 psci_fn = smccc_get_function(vcpu);
unsigned long val;
int ret = 1;
@@ -254,9 +269,7 @@ static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu)
kvm_psci_narrow_to_32bit(vcpu);
fallthrough;
case PSCI_0_2_FN64_CPU_ON:
- mutex_lock(&kvm->lock);
val = kvm_psci_vcpu_on(vcpu);
- mutex_unlock(&kvm->lock);
break;
case PSCI_0_2_FN_AFFINITY_INFO:
kvm_psci_narrow_to_32bit(vcpu);
@@ -315,7 +328,7 @@ static int kvm_psci_1_x_call(struct kvm_vcpu *vcpu, u32 minor)
switch(psci_fn) {
case PSCI_0_2_FN_PSCI_VERSION:
- val = minor == 0 ? KVM_ARM_PSCI_1_0 : KVM_ARM_PSCI_1_1;
+ val = PSCI_VERSION(1, minor);
break;
case PSCI_1_0_FN_PSCI_FEATURES:
arg = smccc_get_arg1(vcpu);
@@ -351,6 +364,11 @@ static int kvm_psci_1_x_call(struct kvm_vcpu *vcpu, u32 minor)
if (minor >= 1)
val = 0;
break;
+ case PSCI_1_3_FN_SYSTEM_OFF2:
+ case PSCI_1_3_FN64_SYSTEM_OFF2:
+ if (minor >= 3)
+ val = PSCI_1_3_OFF_TYPE_HIBERNATE_OFF;
+ break;
}
break;
case PSCI_1_0_FN_SYSTEM_SUSPEND:
@@ -385,6 +403,33 @@ static int kvm_psci_1_x_call(struct kvm_vcpu *vcpu, u32 minor)
break;
}
break;
+ case PSCI_1_3_FN_SYSTEM_OFF2:
+ kvm_psci_narrow_to_32bit(vcpu);
+ fallthrough;
+ case PSCI_1_3_FN64_SYSTEM_OFF2:
+ if (minor < 3)
+ break;
+
+ arg = smccc_get_arg1(vcpu);
+ /*
+ * SYSTEM_OFF2 defaults to HIBERNATE_OFF if arg1 is zero. arg2
+ * must be zero.
+ */
+ if ((arg && arg != PSCI_1_3_OFF_TYPE_HIBERNATE_OFF) ||
+ smccc_get_arg2(vcpu) != 0) {
+ val = PSCI_RET_INVALID_PARAMS;
+ break;
+ }
+ kvm_psci_system_off2(vcpu);
+ /*
+ * We shouldn't be going back to the guest after receiving a
+ * SYSTEM_OFF2 request. Preload a return value of
+ * INTERNAL_FAILURE should userspace ignore the exit and resume
+ * the vCPU.
+ */
+ val = PSCI_RET_INTERNAL_FAILURE;
+ ret = 0;
+ break;
default:
return kvm_psci_0_2_call(vcpu);
}
@@ -395,7 +440,6 @@ static int kvm_psci_1_x_call(struct kvm_vcpu *vcpu, u32 minor)
static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu)
{
- struct kvm *kvm = vcpu->kvm;
u32 psci_fn = smccc_get_function(vcpu);
unsigned long val;
@@ -405,9 +449,7 @@ static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu)
val = PSCI_RET_SUCCESS;
break;
case KVM_PSCI_FN_CPU_ON:
- mutex_lock(&kvm->lock);
val = kvm_psci_vcpu_on(vcpu);
- mutex_unlock(&kvm->lock);
break;
default:
val = PSCI_RET_NOT_SUPPORTED;
@@ -435,6 +477,7 @@ static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu)
int kvm_psci_call(struct kvm_vcpu *vcpu)
{
u32 psci_fn = smccc_get_function(vcpu);
+ int version = kvm_psci_version(vcpu);
unsigned long val;
val = kvm_psci_check_allowed_function(vcpu, psci_fn);
@@ -443,7 +486,11 @@ int kvm_psci_call(struct kvm_vcpu *vcpu)
return 1;
}
- switch (kvm_psci_version(vcpu)) {
+ switch (version) {
+ case KVM_ARM_PSCI_1_3:
+ return kvm_psci_1_x_call(vcpu, 3);
+ case KVM_ARM_PSCI_1_2:
+ return kvm_psci_1_x_call(vcpu, 2);
case KVM_ARM_PSCI_1_1:
return kvm_psci_1_x_call(vcpu, 1);
case KVM_ARM_PSCI_1_0:
@@ -453,6 +500,8 @@ int kvm_psci_call(struct kvm_vcpu *vcpu)
case KVM_ARM_PSCI_0_1:
return kvm_psci_0_1_call(vcpu);
default:
- return -EINVAL;
+ WARN_ONCE(1, "Unknown PSCI version %d", version);
+ smccc_set_retval(vcpu, SMCCC_RET_NOT_SUPPORTED, 0, 0, 0);
+ return 1;
}
}
diff --git a/arch/arm64/kvm/ptdump.c b/arch/arm64/kvm/ptdump.c
new file mode 100644
index 000000000000..6cbe018fd6fd
--- /dev/null
+++ b/arch/arm64/kvm/ptdump.c
@@ -0,0 +1,287 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Debug helper used to dump the stage-2 pagetables of the system and their
+ * associated permissions.
+ *
+ * Copyright (C) Google, 2024
+ * Author: Sebastian Ene <sebastianene@google.com>
+ */
+#include <linux/debugfs.h>
+#include <linux/kvm_host.h>
+#include <linux/seq_file.h>
+
+#include <asm/kvm_mmu.h>
+#include <asm/kvm_pgtable.h>
+#include <asm/ptdump.h>
+
+#define MARKERS_LEN 2
+#define KVM_PGTABLE_MAX_LEVELS (KVM_PGTABLE_LAST_LEVEL + 1)
+
+struct kvm_ptdump_guest_state {
+ struct kvm *kvm;
+ struct ptdump_pg_state parser_state;
+ struct addr_marker ipa_marker[MARKERS_LEN];
+ struct ptdump_pg_level level[KVM_PGTABLE_MAX_LEVELS];
+ struct ptdump_range range[MARKERS_LEN];
+};
+
+static const struct ptdump_prot_bits stage2_pte_bits[] = {
+ {
+ .mask = PTE_VALID,
+ .val = PTE_VALID,
+ .set = " ",
+ .clear = "F",
+ },
+ {
+ .mask = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R,
+ .val = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R,
+ .set = "R",
+ .clear = " ",
+ },
+ {
+ .mask = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W,
+ .val = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W,
+ .set = "W",
+ .clear = " ",
+ },
+ {
+ .mask = KVM_PTE_LEAF_ATTR_HI_S2_XN,
+ .val = 0b00UL << __bf_shf(KVM_PTE_LEAF_ATTR_HI_S2_XN),
+ .set = "px ux ",
+ },
+ {
+ .mask = KVM_PTE_LEAF_ATTR_HI_S2_XN,
+ .val = 0b01UL << __bf_shf(KVM_PTE_LEAF_ATTR_HI_S2_XN),
+ .set = "PXNux ",
+ },
+ {
+ .mask = KVM_PTE_LEAF_ATTR_HI_S2_XN,
+ .val = 0b10UL << __bf_shf(KVM_PTE_LEAF_ATTR_HI_S2_XN),
+ .set = "PXNUXN",
+ },
+ {
+ .mask = KVM_PTE_LEAF_ATTR_HI_S2_XN,
+ .val = 0b11UL << __bf_shf(KVM_PTE_LEAF_ATTR_HI_S2_XN),
+ .set = "px UXN",
+ },
+ {
+ .mask = KVM_PTE_LEAF_ATTR_LO_S2_AF,
+ .val = KVM_PTE_LEAF_ATTR_LO_S2_AF,
+ .set = "AF",
+ .clear = " ",
+ },
+ {
+ .mask = PMD_TYPE_MASK,
+ .val = PMD_TYPE_SECT,
+ .set = "BLK",
+ .clear = " ",
+ },
+};
+
+static int kvm_ptdump_visitor(const struct kvm_pgtable_visit_ctx *ctx,
+ enum kvm_pgtable_walk_flags visit)
+{
+ struct ptdump_pg_state *st = ctx->arg;
+ struct ptdump_state *pt_st = &st->ptdump;
+
+ note_page(pt_st, ctx->addr, ctx->level, ctx->old);
+
+ return 0;
+}
+
+static int kvm_ptdump_build_levels(struct ptdump_pg_level *level, u32 start_lvl)
+{
+ u32 i;
+ u64 mask;
+
+ if (WARN_ON_ONCE(start_lvl >= KVM_PGTABLE_LAST_LEVEL))
+ return -EINVAL;
+
+ mask = 0;
+ for (i = 0; i < ARRAY_SIZE(stage2_pte_bits); i++)
+ mask |= stage2_pte_bits[i].mask;
+
+ for (i = start_lvl; i < KVM_PGTABLE_MAX_LEVELS; i++) {
+ snprintf(level[i].name, sizeof(level[i].name), "%u", i);
+
+ level[i].num = ARRAY_SIZE(stage2_pte_bits);
+ level[i].bits = stage2_pte_bits;
+ level[i].mask = mask;
+ }
+
+ return 0;
+}
+
+static struct kvm_ptdump_guest_state *kvm_ptdump_parser_create(struct kvm *kvm)
+{
+ struct kvm_ptdump_guest_state *st;
+ struct kvm_s2_mmu *mmu = &kvm->arch.mmu;
+ struct kvm_pgtable *pgtable = mmu->pgt;
+ int ret;
+
+ st = kzalloc(sizeof(struct kvm_ptdump_guest_state), GFP_KERNEL_ACCOUNT);
+ if (!st)
+ return ERR_PTR(-ENOMEM);
+
+ ret = kvm_ptdump_build_levels(&st->level[0], pgtable->start_level);
+ if (ret) {
+ kfree(st);
+ return ERR_PTR(ret);
+ }
+
+ st->ipa_marker[0].name = "Guest IPA";
+ st->ipa_marker[1].start_address = BIT(pgtable->ia_bits);
+ st->range[0].end = BIT(pgtable->ia_bits);
+
+ st->kvm = kvm;
+ st->parser_state = (struct ptdump_pg_state) {
+ .marker = &st->ipa_marker[0],
+ .level = -1,
+ .pg_level = &st->level[0],
+ .ptdump.range = &st->range[0],
+ .start_address = 0,
+ };
+
+ return st;
+}
+
+static int kvm_ptdump_guest_show(struct seq_file *m, void *unused)
+{
+ int ret;
+ struct kvm_ptdump_guest_state *st = m->private;
+ struct kvm *kvm = st->kvm;
+ struct kvm_s2_mmu *mmu = &kvm->arch.mmu;
+ struct ptdump_pg_state *parser_state = &st->parser_state;
+ struct kvm_pgtable_walker walker = (struct kvm_pgtable_walker) {
+ .cb = kvm_ptdump_visitor,
+ .arg = parser_state,
+ .flags = KVM_PGTABLE_WALK_LEAF,
+ };
+
+ parser_state->seq = m;
+
+ write_lock(&kvm->mmu_lock);
+ ret = kvm_pgtable_walk(mmu->pgt, 0, BIT(mmu->pgt->ia_bits), &walker);
+ write_unlock(&kvm->mmu_lock);
+
+ return ret;
+}
+
+static int kvm_ptdump_guest_open(struct inode *m, struct file *file)
+{
+ struct kvm *kvm = m->i_private;
+ struct kvm_ptdump_guest_state *st;
+ int ret;
+
+ if (!kvm_get_kvm_safe(kvm))
+ return -ENOENT;
+
+ st = kvm_ptdump_parser_create(kvm);
+ if (IS_ERR(st)) {
+ ret = PTR_ERR(st);
+ goto err_with_kvm_ref;
+ }
+
+ ret = single_open(file, kvm_ptdump_guest_show, st);
+ if (!ret)
+ return 0;
+
+ kfree(st);
+err_with_kvm_ref:
+ kvm_put_kvm(kvm);
+ return ret;
+}
+
+static int kvm_ptdump_guest_close(struct inode *m, struct file *file)
+{
+ struct kvm *kvm = m->i_private;
+ void *st = ((struct seq_file *)file->private_data)->private;
+
+ kfree(st);
+ kvm_put_kvm(kvm);
+
+ return single_release(m, file);
+}
+
+static const struct file_operations kvm_ptdump_guest_fops = {
+ .open = kvm_ptdump_guest_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = kvm_ptdump_guest_close,
+};
+
+static int kvm_pgtable_range_show(struct seq_file *m, void *unused)
+{
+ struct kvm_pgtable *pgtable = m->private;
+
+ seq_printf(m, "%2u\n", pgtable->ia_bits);
+ return 0;
+}
+
+static int kvm_pgtable_levels_show(struct seq_file *m, void *unused)
+{
+ struct kvm_pgtable *pgtable = m->private;
+
+ seq_printf(m, "%1d\n", KVM_PGTABLE_MAX_LEVELS - pgtable->start_level);
+ return 0;
+}
+
+static int kvm_pgtable_debugfs_open(struct inode *m, struct file *file,
+ int (*show)(struct seq_file *, void *))
+{
+ struct kvm *kvm = m->i_private;
+ struct kvm_pgtable *pgtable;
+ int ret;
+
+ if (!kvm_get_kvm_safe(kvm))
+ return -ENOENT;
+
+ pgtable = kvm->arch.mmu.pgt;
+
+ ret = single_open(file, show, pgtable);
+ if (ret < 0)
+ kvm_put_kvm(kvm);
+ return ret;
+}
+
+static int kvm_pgtable_range_open(struct inode *m, struct file *file)
+{
+ return kvm_pgtable_debugfs_open(m, file, kvm_pgtable_range_show);
+}
+
+static int kvm_pgtable_levels_open(struct inode *m, struct file *file)
+{
+ return kvm_pgtable_debugfs_open(m, file, kvm_pgtable_levels_show);
+}
+
+static int kvm_pgtable_debugfs_close(struct inode *m, struct file *file)
+{
+ struct kvm *kvm = m->i_private;
+
+ kvm_put_kvm(kvm);
+ return single_release(m, file);
+}
+
+static const struct file_operations kvm_pgtable_range_fops = {
+ .open = kvm_pgtable_range_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = kvm_pgtable_debugfs_close,
+};
+
+static const struct file_operations kvm_pgtable_levels_fops = {
+ .open = kvm_pgtable_levels_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = kvm_pgtable_debugfs_close,
+};
+
+void kvm_s2_ptdump_create_debugfs(struct kvm *kvm)
+{
+ debugfs_create_file("stage2_page_tables", 0400, kvm->debugfs_dentry,
+ kvm, &kvm_ptdump_guest_fops);
+ debugfs_create_file("ipa_range", 0400, kvm->debugfs_dentry, kvm,
+ &kvm_pgtable_range_fops);
+ debugfs_create_file("stage2_levels", 0400, kvm->debugfs_dentry,
+ kvm, &kvm_pgtable_levels_fops);
+}
diff --git a/arch/arm64/kvm/pvtime.c b/arch/arm64/kvm/pvtime.c
index 78a09f7a6637..4ceabaa4c30b 100644
--- a/arch/arm64/kvm/pvtime.c
+++ b/arch/arm64/kvm/pvtime.c
@@ -19,7 +19,7 @@ void kvm_update_stolen_time(struct kvm_vcpu *vcpu)
u64 steal = 0;
int idx;
- if (base == GPA_INVALID)
+ if (base == INVALID_GPA)
return;
idx = srcu_read_lock(&kvm->srcu);
@@ -40,7 +40,7 @@ long kvm_hypercall_pv_features(struct kvm_vcpu *vcpu)
switch (feature) {
case ARM_SMCCC_HV_PV_TIME_FEATURES:
case ARM_SMCCC_HV_PV_TIME_ST:
- if (vcpu->arch.steal.base != GPA_INVALID)
+ if (vcpu->arch.steal.base != INVALID_GPA)
val = SMCCC_RET_SUCCESS;
break;
}
@@ -54,7 +54,7 @@ gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu)
struct kvm *kvm = vcpu->kvm;
u64 base = vcpu->arch.steal.base;
- if (base == GPA_INVALID)
+ if (base == INVALID_GPA)
return base;
/*
@@ -89,7 +89,7 @@ int kvm_arm_pvtime_set_attr(struct kvm_vcpu *vcpu,
return -EFAULT;
if (!IS_ALIGNED(ipa, 64))
return -EINVAL;
- if (vcpu->arch.steal.base != GPA_INVALID)
+ if (vcpu->arch.steal.base != INVALID_GPA)
return -EEXIST;
/* Check the address is in a valid memslot */
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index e0267f672b8a..959532422d3a 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -27,10 +27,12 @@
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_mmu.h>
+#include <asm/kvm_nested.h>
#include <asm/virt.h>
/* Maximum phys_shift supported for any VM on this host */
-static u32 kvm_ipa_limit;
+static u32 __ro_after_init kvm_ipa_limit;
+unsigned int __ro_after_init kvm_host_sve_max_vl;
/*
* ARMv8 Reset Values
@@ -38,15 +40,20 @@ static u32 kvm_ipa_limit;
#define VCPU_RESET_PSTATE_EL1 (PSR_MODE_EL1h | PSR_A_BIT | PSR_I_BIT | \
PSR_F_BIT | PSR_D_BIT)
+#define VCPU_RESET_PSTATE_EL2 (PSR_MODE_EL2h | PSR_A_BIT | PSR_I_BIT | \
+ PSR_F_BIT | PSR_D_BIT)
+
#define VCPU_RESET_PSTATE_SVC (PSR_AA32_MODE_SVC | PSR_AA32_A_BIT | \
PSR_AA32_I_BIT | PSR_AA32_F_BIT)
-unsigned int kvm_sve_max_vl;
+unsigned int __ro_after_init kvm_sve_max_vl;
-int kvm_arm_init_sve(void)
+int __init kvm_arm_init_sve(void)
{
if (system_supports_sve()) {
kvm_sve_max_vl = sve_max_virtualisable_vl();
+ kvm_host_sve_max_vl = sve_max_vl();
+ kvm_nvhe_sym(kvm_host_sve_max_vl) = kvm_host_sve_max_vl;
/*
* The get_sve_reg()/set_sve_reg() ioctl interface will need
@@ -69,11 +76,8 @@ int kvm_arm_init_sve(void)
return 0;
}
-static int kvm_vcpu_enable_sve(struct kvm_vcpu *vcpu)
+static void kvm_vcpu_enable_sve(struct kvm_vcpu *vcpu)
{
- if (!system_supports_sve())
- return -EINVAL;
-
vcpu->arch.sve_max_vl = kvm_sve_max_vl;
/*
@@ -81,9 +85,7 @@ static int kvm_vcpu_enable_sve(struct kvm_vcpu *vcpu)
* KVM_REG_ARM64_SVE_VLS. Allocation is deferred until
* kvm_arm_vcpu_finalize(), which freezes the configuration.
*/
- vcpu_set_flag(vcpu, GUEST_HAS_SVE);
-
- return 0;
+ set_bit(KVM_ARCH_FLAG_GUEST_HAS_SVE, &vcpu->kvm->arch.flags);
}
/*
@@ -152,11 +154,13 @@ void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu)
{
void *sve_state = vcpu->arch.sve_state;
- kvm_vcpu_unshare_task_fp(vcpu);
kvm_unshare_hyp(vcpu, vcpu + 1);
if (sve_state)
kvm_unshare_hyp(sve_state, sve_state + vcpu_sve_state_size(vcpu));
kfree(sve_state);
+ free_page((unsigned long)vcpu->arch.ctxt.vncr_array);
+ kfree(vcpu->arch.vncr_tlb);
+ kfree(vcpu->arch.ccsidr);
}
static void kvm_vcpu_reset_sve(struct kvm_vcpu *vcpu)
@@ -165,69 +169,6 @@ static void kvm_vcpu_reset_sve(struct kvm_vcpu *vcpu)
memset(vcpu->arch.sve_state, 0, vcpu_sve_state_size(vcpu));
}
-static int kvm_vcpu_enable_ptrauth(struct kvm_vcpu *vcpu)
-{
- /*
- * For now make sure that both address/generic pointer authentication
- * features are requested by the userspace together and the system
- * supports these capabilities.
- */
- if (!test_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, vcpu->arch.features) ||
- !test_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, vcpu->arch.features) ||
- !system_has_full_ptr_auth())
- return -EINVAL;
-
- vcpu_set_flag(vcpu, GUEST_HAS_PTRAUTH);
- return 0;
-}
-
-/**
- * kvm_set_vm_width() - set the register width for the guest
- * @vcpu: Pointer to the vcpu being configured
- *
- * Set both KVM_ARCH_FLAG_EL1_32BIT and KVM_ARCH_FLAG_REG_WIDTH_CONFIGURED
- * in the VM flags based on the vcpu's requested register width, the HW
- * capabilities and other options (such as MTE).
- * When REG_WIDTH_CONFIGURED is already set, the vcpu settings must be
- * consistent with the value of the FLAG_EL1_32BIT bit in the flags.
- *
- * Return: 0 on success, negative error code on failure.
- */
-static int kvm_set_vm_width(struct kvm_vcpu *vcpu)
-{
- struct kvm *kvm = vcpu->kvm;
- bool is32bit;
-
- is32bit = vcpu_has_feature(vcpu, KVM_ARM_VCPU_EL1_32BIT);
-
- lockdep_assert_held(&kvm->lock);
-
- if (test_bit(KVM_ARCH_FLAG_REG_WIDTH_CONFIGURED, &kvm->arch.flags)) {
- /*
- * The guest's register width is already configured.
- * Make sure that the vcpu is consistent with it.
- */
- if (is32bit == test_bit(KVM_ARCH_FLAG_EL1_32BIT, &kvm->arch.flags))
- return 0;
-
- return -EINVAL;
- }
-
- if (!cpus_have_const_cap(ARM64_HAS_32BIT_EL1) && is32bit)
- return -EINVAL;
-
- /* MTE is incompatible with AArch32 */
- if (kvm_has_mte(kvm) && is32bit)
- return -EINVAL;
-
- if (is32bit)
- set_bit(KVM_ARCH_FLAG_EL1_32BIT, &kvm->arch.flags);
-
- set_bit(KVM_ARCH_FLAG_REG_WIDTH_CONFIGURED, &kvm->arch.flags);
-
- return 0;
-}
-
/**
* kvm_reset_vcpu - sets core registers and sys_regs to reset value
* @vcpu: The VCPU pointer
@@ -246,26 +187,16 @@ static int kvm_set_vm_width(struct kvm_vcpu *vcpu)
* disable preemption around the vcpu reset as we would otherwise race with
* preempt notifiers which also call put/load.
*/
-int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
+void kvm_reset_vcpu(struct kvm_vcpu *vcpu)
{
struct vcpu_reset_state reset_state;
- int ret;
bool loaded;
u32 pstate;
- mutex_lock(&vcpu->kvm->lock);
- ret = kvm_set_vm_width(vcpu);
- if (!ret) {
- reset_state = vcpu->arch.reset_state;
- WRITE_ONCE(vcpu->arch.reset_state.reset, false);
- }
- mutex_unlock(&vcpu->kvm->lock);
-
- if (ret)
- return ret;
-
- /* Reset PMU outside of the non-preemptible section */
- kvm_pmu_vcpu_reset(vcpu);
+ spin_lock(&vcpu->arch.mp_state_lock);
+ reset_state = vcpu->arch.reset_state;
+ vcpu->arch.reset_state.reset = false;
+ spin_unlock(&vcpu->arch.mp_state_lock);
preempt_disable();
loaded = (vcpu->cpu != -1);
@@ -273,37 +204,18 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
kvm_arch_vcpu_put(vcpu);
if (!kvm_arm_vcpu_sve_finalized(vcpu)) {
- if (test_bit(KVM_ARM_VCPU_SVE, vcpu->arch.features)) {
- ret = kvm_vcpu_enable_sve(vcpu);
- if (ret)
- goto out;
- }
+ if (vcpu_has_feature(vcpu, KVM_ARM_VCPU_SVE))
+ kvm_vcpu_enable_sve(vcpu);
} else {
kvm_vcpu_reset_sve(vcpu);
}
- if (test_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, vcpu->arch.features) ||
- test_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, vcpu->arch.features)) {
- if (kvm_vcpu_enable_ptrauth(vcpu)) {
- ret = -EINVAL;
- goto out;
- }
- }
-
- switch (vcpu->arch.target) {
- default:
- if (vcpu_el1_is_32bit(vcpu)) {
- pstate = VCPU_RESET_PSTATE_SVC;
- } else {
- pstate = VCPU_RESET_PSTATE_EL1;
- }
-
- if (kvm_vcpu_has_pmu(vcpu) && !kvm_arm_support_pmu_v3()) {
- ret = -EINVAL;
- goto out;
- }
- break;
- }
+ if (vcpu_el1_is_32bit(vcpu))
+ pstate = VCPU_RESET_PSTATE_SVC;
+ else if (vcpu_has_nv(vcpu))
+ pstate = VCPU_RESET_PSTATE_EL2;
+ else
+ pstate = VCPU_RESET_PSTATE_EL1;
/* Reset core registers */
memset(vcpu_gp_regs(vcpu), 0, sizeof(*vcpu_gp_regs(vcpu)));
@@ -339,12 +251,17 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
}
/* Reset timer */
- ret = kvm_timer_vcpu_reset(vcpu);
-out:
+ kvm_timer_vcpu_reset(vcpu);
+
if (loaded)
kvm_arch_vcpu_load(vcpu, smp_processor_id());
preempt_enable();
- return ret;
+}
+
+u32 kvm_get_pa_bits(struct kvm *kvm)
+{
+ /* Fixed limit until we can configure ID_AA64MMFR0.PARange */
+ return kvm_ipa_limit;
}
u32 get_kvm_ipa_limit(void)
@@ -352,7 +269,7 @@ u32 get_kvm_ipa_limit(void)
return kvm_ipa_limit;
}
-int kvm_set_ipa_limit(void)
+int __init kvm_set_ipa_limit(void)
{
unsigned int parange;
u64 mmfr0;
@@ -361,12 +278,11 @@ int kvm_set_ipa_limit(void)
parange = cpuid_feature_extract_unsigned_field(mmfr0,
ID_AA64MMFR0_EL1_PARANGE_SHIFT);
/*
- * IPA size beyond 48 bits could not be supported
- * on either 4K or 16K page size. Hence let's cap
- * it to 48 bits, in case it's reported as larger
- * on the system.
+ * IPA size beyond 48 bits for 4K and 16K page size is only supported
+ * when LPA2 is available. So if we have LPA2, enable it, else cap to 48
+ * bits, in case it's reported as larger on the system.
*/
- if (PAGE_SIZE != SZ_64K)
+ if (!kvm_lpa2_is_enabled() && PAGE_SIZE != SZ_64K)
parange = min(parange, (unsigned int)ID_AA64MMFR0_EL1_PARANGE_48);
/*
diff --git a/arch/arm64/kvm/stacktrace.c b/arch/arm64/kvm/stacktrace.c
index 3ace5b75813b..af5eec681127 100644
--- a/arch/arm64/kvm/stacktrace.c
+++ b/arch/arm64/kvm/stacktrace.c
@@ -19,6 +19,7 @@
#include <linux/kvm.h>
#include <linux/kvm_host.h>
+#include <asm/kvm_mmu.h>
#include <asm/stacktrace/nvhe.h>
static struct stack_info stackinfo_get_overflow(void)
@@ -50,7 +51,7 @@ static struct stack_info stackinfo_get_hyp(void)
struct kvm_nvhe_stacktrace_info *stacktrace_info
= this_cpu_ptr_nvhe_sym(kvm_stacktrace_info);
unsigned long low = (unsigned long)stacktrace_info->stack_base;
- unsigned long high = low + PAGE_SIZE;
+ unsigned long high = low + NVHE_STACK_SIZE;
return (struct stack_info) {
.low = low,
@@ -60,8 +61,8 @@ static struct stack_info stackinfo_get_hyp(void)
static struct stack_info stackinfo_get_hyp_kern_va(void)
{
- unsigned long low = (unsigned long)*this_cpu_ptr(&kvm_arm_hyp_stack_page);
- unsigned long high = low + PAGE_SIZE;
+ unsigned long low = (unsigned long)*this_cpu_ptr(&kvm_arm_hyp_stack_base);
+ unsigned long high = low + NVHE_STACK_SIZE;
return (struct stack_info) {
.low = low,
@@ -145,7 +146,7 @@ static void unwind(struct unwind_state *state,
*/
static bool kvm_nvhe_dump_backtrace_entry(void *arg, unsigned long where)
{
- unsigned long va_mask = GENMASK_ULL(vabits_actual - 1, 0);
+ unsigned long va_mask = GENMASK_ULL(__hyp_va_bits - 1, 0);
unsigned long hyp_offset = (unsigned long)arg;
/* Mask tags and convert to kern addr */
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index c6cbfe6b854b..c8fd7c6a12a1 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -11,11 +11,15 @@
#include <linux/bitfield.h>
#include <linux/bsearch.h>
+#include <linux/cacheinfo.h>
+#include <linux/debugfs.h>
#include <linux/kvm_host.h>
#include <linux/mm.h>
#include <linux/printk.h>
#include <linux/uaccess.h>
+#include <linux/irqchip/arm-gic-v3.h>
+#include <asm/arm_pmuv3.h>
#include <asm/cacheflush.h>
#include <asm/cputype.h>
#include <asm/debug-monitors.h>
@@ -24,12 +28,14 @@
#include <asm/kvm_emulate.h>
#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>
+#include <asm/kvm_nested.h>
#include <asm/perf_event.h>
#include <asm/sysreg.h>
#include <trace/events/kvm.h>
#include "sys_regs.h"
+#include "vgic/vgic.h"
#include "trace.h"
@@ -40,66 +46,454 @@
*/
static u64 sys_reg_to_index(const struct sys_reg_desc *reg);
+static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+ u64 val);
+
+static bool undef_access(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ kvm_inject_undefined(vcpu);
+ return false;
+}
+
+static bool bad_trap(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *params,
+ const struct sys_reg_desc *r,
+ const char *msg)
+{
+ WARN_ONCE(1, "Unexpected %s\n", msg);
+ print_sys_reg_instr(params);
+ return undef_access(vcpu, params, r);
+}
static bool read_from_write_only(struct kvm_vcpu *vcpu,
struct sys_reg_params *params,
const struct sys_reg_desc *r)
{
- WARN_ONCE(1, "Unexpected sys_reg read to write-only register\n");
- print_sys_reg_instr(params);
- kvm_inject_undefined(vcpu);
- return false;
+ return bad_trap(vcpu, params, r,
+ "sys_reg read to write-only register");
}
static bool write_to_read_only(struct kvm_vcpu *vcpu,
struct sys_reg_params *params,
const struct sys_reg_desc *r)
{
- WARN_ONCE(1, "Unexpected sys_reg write to read-only register\n");
- print_sys_reg_instr(params);
- kvm_inject_undefined(vcpu);
- return false;
+ return bad_trap(vcpu, params, r,
+ "sys_reg write to read-only register");
}
-u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg)
+enum sr_loc_attr {
+ SR_LOC_MEMORY = 0, /* Register definitely in memory */
+ SR_LOC_LOADED = BIT(0), /* Register on CPU, unless it cannot */
+ SR_LOC_MAPPED = BIT(1), /* Register in a different CPU register */
+ SR_LOC_XLATED = BIT(2), /* Register translated to fit another reg */
+ SR_LOC_SPECIAL = BIT(3), /* Demanding register, implies loaded */
+};
+
+struct sr_loc {
+ enum sr_loc_attr loc;
+ enum vcpu_sysreg map_reg;
+ u64 (*xlate)(u64);
+};
+
+static enum sr_loc_attr locate_direct_register(const struct kvm_vcpu *vcpu,
+ enum vcpu_sysreg reg)
{
- u64 val = 0x8badf00d8badf00d;
+ switch (reg) {
+ case SCTLR_EL1:
+ case CPACR_EL1:
+ case TTBR0_EL1:
+ case TTBR1_EL1:
+ case TCR_EL1:
+ case TCR2_EL1:
+ case PIR_EL1:
+ case PIRE0_EL1:
+ case POR_EL1:
+ case ESR_EL1:
+ case AFSR0_EL1:
+ case AFSR1_EL1:
+ case FAR_EL1:
+ case MAIR_EL1:
+ case VBAR_EL1:
+ case CONTEXTIDR_EL1:
+ case AMAIR_EL1:
+ case CNTKCTL_EL1:
+ case ELR_EL1:
+ case SPSR_EL1:
+ case ZCR_EL1:
+ case SCTLR2_EL1:
+ /*
+ * EL1 registers which have an ELx2 mapping are loaded if
+ * we're not in hypervisor context.
+ */
+ return is_hyp_ctxt(vcpu) ? SR_LOC_MEMORY : SR_LOC_LOADED;
+
+ case TPIDR_EL0:
+ case TPIDRRO_EL0:
+ case TPIDR_EL1:
+ case PAR_EL1:
+ case DACR32_EL2:
+ case IFSR32_EL2:
+ case DBGVCR32_EL2:
+ /* These registers are always loaded, no matter what */
+ return SR_LOC_LOADED;
- if (vcpu_get_flag(vcpu, SYSREGS_ON_CPU) &&
- __vcpu_read_sys_reg_from_cpu(reg, &val))
- return val;
+ default:
+ /* Non-mapped EL2 registers are by definition in memory. */
+ return SR_LOC_MEMORY;
+ }
+}
- return __vcpu_sys_reg(vcpu, reg);
+static void locate_mapped_el2_register(const struct kvm_vcpu *vcpu,
+ enum vcpu_sysreg reg,
+ enum vcpu_sysreg map_reg,
+ u64 (*xlate)(u64),
+ struct sr_loc *loc)
+{
+ if (!is_hyp_ctxt(vcpu)) {
+ loc->loc = SR_LOC_MEMORY;
+ return;
+ }
+
+ loc->loc = SR_LOC_LOADED | SR_LOC_MAPPED;
+ loc->map_reg = map_reg;
+
+ WARN_ON(locate_direct_register(vcpu, map_reg) != SR_LOC_MEMORY);
+
+ if (xlate != NULL && !vcpu_el2_e2h_is_set(vcpu)) {
+ loc->loc |= SR_LOC_XLATED;
+ loc->xlate = xlate;
+ }
}
-void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg)
+#define MAPPED_EL2_SYSREG(r, m, t) \
+ case r: { \
+ locate_mapped_el2_register(vcpu, r, m, t, loc); \
+ break; \
+ }
+
+static void locate_register(const struct kvm_vcpu *vcpu, enum vcpu_sysreg reg,
+ struct sr_loc *loc)
{
- if (vcpu_get_flag(vcpu, SYSREGS_ON_CPU) &&
- __vcpu_write_sys_reg_to_cpu(val, reg))
+ if (!vcpu_get_flag(vcpu, SYSREGS_ON_CPU)) {
+ loc->loc = SR_LOC_MEMORY;
return;
+ }
+
+ switch (reg) {
+ MAPPED_EL2_SYSREG(SCTLR_EL2, SCTLR_EL1,
+ translate_sctlr_el2_to_sctlr_el1 );
+ MAPPED_EL2_SYSREG(CPTR_EL2, CPACR_EL1,
+ translate_cptr_el2_to_cpacr_el1 );
+ MAPPED_EL2_SYSREG(TTBR0_EL2, TTBR0_EL1,
+ translate_ttbr0_el2_to_ttbr0_el1 );
+ MAPPED_EL2_SYSREG(TTBR1_EL2, TTBR1_EL1, NULL );
+ MAPPED_EL2_SYSREG(TCR_EL2, TCR_EL1,
+ translate_tcr_el2_to_tcr_el1 );
+ MAPPED_EL2_SYSREG(VBAR_EL2, VBAR_EL1, NULL );
+ MAPPED_EL2_SYSREG(AFSR0_EL2, AFSR0_EL1, NULL );
+ MAPPED_EL2_SYSREG(AFSR1_EL2, AFSR1_EL1, NULL );
+ MAPPED_EL2_SYSREG(ESR_EL2, ESR_EL1, NULL );
+ MAPPED_EL2_SYSREG(FAR_EL2, FAR_EL1, NULL );
+ MAPPED_EL2_SYSREG(MAIR_EL2, MAIR_EL1, NULL );
+ MAPPED_EL2_SYSREG(TCR2_EL2, TCR2_EL1, NULL );
+ MAPPED_EL2_SYSREG(PIR_EL2, PIR_EL1, NULL );
+ MAPPED_EL2_SYSREG(PIRE0_EL2, PIRE0_EL1, NULL );
+ MAPPED_EL2_SYSREG(POR_EL2, POR_EL1, NULL );
+ MAPPED_EL2_SYSREG(AMAIR_EL2, AMAIR_EL1, NULL );
+ MAPPED_EL2_SYSREG(ELR_EL2, ELR_EL1, NULL );
+ MAPPED_EL2_SYSREG(SPSR_EL2, SPSR_EL1, NULL );
+ MAPPED_EL2_SYSREG(CONTEXTIDR_EL2, CONTEXTIDR_EL1, NULL );
+ MAPPED_EL2_SYSREG(SCTLR2_EL2, SCTLR2_EL1, NULL );
+ case CNTHCTL_EL2:
+ /* CNTHCTL_EL2 is super special, until we support NV2.1 */
+ loc->loc = ((is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) ?
+ SR_LOC_SPECIAL : SR_LOC_MEMORY);
+ break;
+ default:
+ loc->loc = locate_direct_register(vcpu, reg);
+ }
+}
+
+static u64 read_sr_from_cpu(enum vcpu_sysreg reg)
+{
+ u64 val = 0x8badf00d8badf00d;
+
+ switch (reg) {
+ case SCTLR_EL1: val = read_sysreg_s(SYS_SCTLR_EL12); break;
+ case CPACR_EL1: val = read_sysreg_s(SYS_CPACR_EL12); break;
+ case TTBR0_EL1: val = read_sysreg_s(SYS_TTBR0_EL12); break;
+ case TTBR1_EL1: val = read_sysreg_s(SYS_TTBR1_EL12); break;
+ case TCR_EL1: val = read_sysreg_s(SYS_TCR_EL12); break;
+ case TCR2_EL1: val = read_sysreg_s(SYS_TCR2_EL12); break;
+ case PIR_EL1: val = read_sysreg_s(SYS_PIR_EL12); break;
+ case PIRE0_EL1: val = read_sysreg_s(SYS_PIRE0_EL12); break;
+ case POR_EL1: val = read_sysreg_s(SYS_POR_EL12); break;
+ case ESR_EL1: val = read_sysreg_s(SYS_ESR_EL12); break;
+ case AFSR0_EL1: val = read_sysreg_s(SYS_AFSR0_EL12); break;
+ case AFSR1_EL1: val = read_sysreg_s(SYS_AFSR1_EL12); break;
+ case FAR_EL1: val = read_sysreg_s(SYS_FAR_EL12); break;
+ case MAIR_EL1: val = read_sysreg_s(SYS_MAIR_EL12); break;
+ case VBAR_EL1: val = read_sysreg_s(SYS_VBAR_EL12); break;
+ case CONTEXTIDR_EL1: val = read_sysreg_s(SYS_CONTEXTIDR_EL12);break;
+ case AMAIR_EL1: val = read_sysreg_s(SYS_AMAIR_EL12); break;
+ case CNTKCTL_EL1: val = read_sysreg_s(SYS_CNTKCTL_EL12); break;
+ case ELR_EL1: val = read_sysreg_s(SYS_ELR_EL12); break;
+ case SPSR_EL1: val = read_sysreg_s(SYS_SPSR_EL12); break;
+ case ZCR_EL1: val = read_sysreg_s(SYS_ZCR_EL12); break;
+ case SCTLR2_EL1: val = read_sysreg_s(SYS_SCTLR2_EL12); break;
+ case TPIDR_EL0: val = read_sysreg_s(SYS_TPIDR_EL0); break;
+ case TPIDRRO_EL0: val = read_sysreg_s(SYS_TPIDRRO_EL0); break;
+ case TPIDR_EL1: val = read_sysreg_s(SYS_TPIDR_EL1); break;
+ case PAR_EL1: val = read_sysreg_par(); break;
+ case DACR32_EL2: val = read_sysreg_s(SYS_DACR32_EL2); break;
+ case IFSR32_EL2: val = read_sysreg_s(SYS_IFSR32_EL2); break;
+ case DBGVCR32_EL2: val = read_sysreg_s(SYS_DBGVCR32_EL2); break;
+ default: WARN_ON_ONCE(1);
+ }
+
+ return val;
+}
- __vcpu_sys_reg(vcpu, reg) = val;
+static void write_sr_to_cpu(enum vcpu_sysreg reg, u64 val)
+{
+ switch (reg) {
+ case SCTLR_EL1: write_sysreg_s(val, SYS_SCTLR_EL12); break;
+ case CPACR_EL1: write_sysreg_s(val, SYS_CPACR_EL12); break;
+ case TTBR0_EL1: write_sysreg_s(val, SYS_TTBR0_EL12); break;
+ case TTBR1_EL1: write_sysreg_s(val, SYS_TTBR1_EL12); break;
+ case TCR_EL1: write_sysreg_s(val, SYS_TCR_EL12); break;
+ case TCR2_EL1: write_sysreg_s(val, SYS_TCR2_EL12); break;
+ case PIR_EL1: write_sysreg_s(val, SYS_PIR_EL12); break;
+ case PIRE0_EL1: write_sysreg_s(val, SYS_PIRE0_EL12); break;
+ case POR_EL1: write_sysreg_s(val, SYS_POR_EL12); break;
+ case ESR_EL1: write_sysreg_s(val, SYS_ESR_EL12); break;
+ case AFSR0_EL1: write_sysreg_s(val, SYS_AFSR0_EL12); break;
+ case AFSR1_EL1: write_sysreg_s(val, SYS_AFSR1_EL12); break;
+ case FAR_EL1: write_sysreg_s(val, SYS_FAR_EL12); break;
+ case MAIR_EL1: write_sysreg_s(val, SYS_MAIR_EL12); break;
+ case VBAR_EL1: write_sysreg_s(val, SYS_VBAR_EL12); break;
+ case CONTEXTIDR_EL1: write_sysreg_s(val, SYS_CONTEXTIDR_EL12);break;
+ case AMAIR_EL1: write_sysreg_s(val, SYS_AMAIR_EL12); break;
+ case CNTKCTL_EL1: write_sysreg_s(val, SYS_CNTKCTL_EL12); break;
+ case ELR_EL1: write_sysreg_s(val, SYS_ELR_EL12); break;
+ case SPSR_EL1: write_sysreg_s(val, SYS_SPSR_EL12); break;
+ case ZCR_EL1: write_sysreg_s(val, SYS_ZCR_EL12); break;
+ case SCTLR2_EL1: write_sysreg_s(val, SYS_SCTLR2_EL12); break;
+ case TPIDR_EL0: write_sysreg_s(val, SYS_TPIDR_EL0); break;
+ case TPIDRRO_EL0: write_sysreg_s(val, SYS_TPIDRRO_EL0); break;
+ case TPIDR_EL1: write_sysreg_s(val, SYS_TPIDR_EL1); break;
+ case PAR_EL1: write_sysreg_s(val, SYS_PAR_EL1); break;
+ case DACR32_EL2: write_sysreg_s(val, SYS_DACR32_EL2); break;
+ case IFSR32_EL2: write_sysreg_s(val, SYS_IFSR32_EL2); break;
+ case DBGVCR32_EL2: write_sysreg_s(val, SYS_DBGVCR32_EL2); break;
+ default: WARN_ON_ONCE(1);
+ }
}
-/* 3 bits per cache level, as per CLIDR, but non-existent caches always 0 */
-static u32 cache_levels;
+u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, enum vcpu_sysreg reg)
+{
+ struct sr_loc loc = {};
+
+ locate_register(vcpu, reg, &loc);
+
+ WARN_ON_ONCE(!has_vhe() && loc.loc != SR_LOC_MEMORY);
+
+ if (loc.loc & SR_LOC_SPECIAL) {
+ u64 val;
+
+ WARN_ON_ONCE(loc.loc & ~SR_LOC_SPECIAL);
+
+ /*
+ * CNTHCTL_EL2 requires some special treatment to account
+ * for the bits that can be set via CNTKCTL_EL1 when E2H==1.
+ */
+ switch (reg) {
+ case CNTHCTL_EL2:
+ val = read_sysreg_el1(SYS_CNTKCTL);
+ val &= CNTKCTL_VALID_BITS;
+ val |= __vcpu_sys_reg(vcpu, reg) & ~CNTKCTL_VALID_BITS;
+ return val;
+ default:
+ WARN_ON_ONCE(1);
+ }
+ }
+
+ if (loc.loc & SR_LOC_LOADED) {
+ enum vcpu_sysreg map_reg = reg;
+
+ if (loc.loc & SR_LOC_MAPPED)
+ map_reg = loc.map_reg;
+
+ if (!(loc.loc & SR_LOC_XLATED)) {
+ u64 val = read_sr_from_cpu(map_reg);
+
+ if (reg >= __SANITISED_REG_START__)
+ val = kvm_vcpu_apply_reg_masks(vcpu, reg, val);
+
+ return val;
+ }
+ }
+
+ return __vcpu_sys_reg(vcpu, reg);
+}
+
+void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, enum vcpu_sysreg reg)
+{
+ struct sr_loc loc = {};
+
+ locate_register(vcpu, reg, &loc);
+
+ WARN_ON_ONCE(!has_vhe() && loc.loc != SR_LOC_MEMORY);
+
+ if (loc.loc & SR_LOC_SPECIAL) {
+
+ WARN_ON_ONCE(loc.loc & ~SR_LOC_SPECIAL);
+
+ switch (reg) {
+ case CNTHCTL_EL2:
+ /*
+ * If E2H=1, some of the bits are backed by
+ * CNTKCTL_EL1, while the rest is kept in memory.
+ * Yes, this is fun stuff.
+ */
+ write_sysreg_el1(val, SYS_CNTKCTL);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ }
+ }
+
+ if (loc.loc & SR_LOC_LOADED) {
+ enum vcpu_sysreg map_reg = reg;
+ u64 xlated_val;
+
+ if (reg >= __SANITISED_REG_START__)
+ val = kvm_vcpu_apply_reg_masks(vcpu, reg, val);
+
+ if (loc.loc & SR_LOC_MAPPED)
+ map_reg = loc.map_reg;
+
+ if (loc.loc & SR_LOC_XLATED)
+ xlated_val = loc.xlate(val);
+ else
+ xlated_val = val;
+
+ write_sr_to_cpu(map_reg, xlated_val);
+
+ /*
+ * Fall through to write the backing store anyway, which
+ * allows translated registers to be directly read without a
+ * reverse translation.
+ */
+ }
+
+ __vcpu_assign_sys_reg(vcpu, reg, val);
+}
/* CSSELR values; used to index KVM_REG_ARM_DEMUX_ID_CCSIDR */
#define CSSELR_MAX 14
+/*
+ * Returns the minimum line size for the selected cache, expressed as
+ * Log2(bytes).
+ */
+static u8 get_min_cache_line_size(bool icache)
+{
+ u64 ctr = read_sanitised_ftr_reg(SYS_CTR_EL0);
+ u8 field;
+
+ if (icache)
+ field = SYS_FIELD_GET(CTR_EL0, IminLine, ctr);
+ else
+ field = SYS_FIELD_GET(CTR_EL0, DminLine, ctr);
+
+ /*
+ * Cache line size is represented as Log2(words) in CTR_EL0.
+ * Log2(bytes) can be derived with the following:
+ *
+ * Log2(words) + 2 = Log2(bytes / 4) + 2
+ * = Log2(bytes) - 2 + 2
+ * = Log2(bytes)
+ */
+ return field + 2;
+}
+
/* Which cache CCSIDR represents depends on CSSELR value. */
-static u32 get_ccsidr(u32 csselr)
+static u32 get_ccsidr(struct kvm_vcpu *vcpu, u32 csselr)
{
- u32 ccsidr;
+ u8 line_size;
+
+ if (vcpu->arch.ccsidr)
+ return vcpu->arch.ccsidr[csselr];
+
+ line_size = get_min_cache_line_size(csselr & CSSELR_EL1_InD);
+
+ /*
+ * Fabricate a CCSIDR value as the overriding value does not exist.
+ * The real CCSIDR value will not be used as it can vary by the
+ * physical CPU which the vcpu currently resides in.
+ *
+ * The line size is determined with get_min_cache_line_size(), which
+ * should be valid for all CPUs even if they have different cache
+ * configuration.
+ *
+ * The associativity bits are cleared, meaning the geometry of all data
+ * and unified caches (which are guaranteed to be PIPT and thus
+ * non-aliasing) are 1 set and 1 way.
+ * Guests should not be doing cache operations by set/way at all, and
+ * for this reason, we trap them and attempt to infer the intent, so
+ * that we can flush the entire guest's address space at the appropriate
+ * time. The exposed geometry minimizes the number of the traps.
+ * [If guests should attempt to infer aliasing properties from the
+ * geometry (which is not permitted by the architecture), they would
+ * only do so for virtually indexed caches.]
+ *
+ * We don't check if the cache level exists as it is allowed to return
+ * an UNKNOWN value if not.
+ */
+ return SYS_FIELD_PREP(CCSIDR_EL1, LineSize, line_size - 4);
+}
+
+static int set_ccsidr(struct kvm_vcpu *vcpu, u32 csselr, u32 val)
+{
+ u8 line_size = FIELD_GET(CCSIDR_EL1_LineSize, val) + 4;
+ u32 *ccsidr = vcpu->arch.ccsidr;
+ u32 i;
+
+ if ((val & CCSIDR_EL1_RES0) ||
+ line_size < get_min_cache_line_size(csselr & CSSELR_EL1_InD))
+ return -EINVAL;
+
+ if (!ccsidr) {
+ if (val == get_ccsidr(vcpu, csselr))
+ return 0;
+
+ ccsidr = kmalloc_array(CSSELR_MAX, sizeof(u32), GFP_KERNEL_ACCOUNT);
+ if (!ccsidr)
+ return -ENOMEM;
+
+ for (i = 0; i < CSSELR_MAX; i++)
+ ccsidr[i] = get_ccsidr(vcpu, i);
+
+ vcpu->arch.ccsidr = ccsidr;
+ }
- /* Make sure noone else changes CSSELR during this! */
- local_irq_disable();
- write_sysreg(csselr, csselr_el1);
- isb();
- ccsidr = read_sysreg(ccsidr_el1);
- local_irq_enable();
+ ccsidr[csselr] = val;
+
+ return 0;
+}
+
+static bool access_rw(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ if (p->is_write)
+ vcpu_write_sys_reg(vcpu, p->regval, r->reg);
+ else
+ p->regval = vcpu_read_sys_reg(vcpu, r->reg);
- return ccsidr;
+ return true;
}
/*
@@ -119,12 +513,23 @@ static bool access_dcsw(struct kvm_vcpu *vcpu,
* CPU left in the system, and certainly not from non-secure
* software).
*/
- if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
+ if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB))
kvm_set_way_flush(vcpu);
return true;
}
+static bool access_dcgsw(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ if (!kvm_has_mte(vcpu->kvm))
+ return undef_access(vcpu, p, r);
+
+ /* Treat MTE S/W ops as we treat the classic ones: with contempt */
+ return access_dcsw(vcpu, p, r);
+}
+
static void get_access_mask(const struct sys_reg_desc *r, u64 *mask, u64 *shift)
{
switch (r->aarch32_map) {
@@ -200,6 +605,9 @@ static bool access_gic_sgi(struct kvm_vcpu *vcpu,
{
bool g1;
+ if (!kvm_has_gicv3(vcpu->kvm))
+ return undef_access(vcpu, p, r);
+
if (!p->is_write)
return read_from_write_only(vcpu, p, r);
@@ -243,10 +651,33 @@ static bool access_gic_sre(struct kvm_vcpu *vcpu,
struct sys_reg_params *p,
const struct sys_reg_desc *r)
{
+ if (!kvm_has_gicv3(vcpu->kvm))
+ return undef_access(vcpu, p, r);
+
if (p->is_write)
return ignore_write(vcpu, p);
- p->regval = vcpu->arch.vgic_cpu.vgic_v3.vgic_sre;
+ if (p->Op1 == 4) { /* ICC_SRE_EL2 */
+ p->regval = KVM_ICC_SRE_EL2;
+ } else { /* ICC_SRE_EL1 */
+ p->regval = vcpu->arch.vgic_cpu.vgic_v3.vgic_sre;
+ }
+
+ return true;
+}
+
+static bool access_gic_dir(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ if (!kvm_has_gicv3(vcpu->kvm))
+ return undef_access(vcpu, p, r);
+
+ if (!p->is_write)
+ return undef_access(vcpu, p, r);
+
+ vgic_v3_deactivate(vcpu, p->regval);
+
return true;
}
@@ -270,13 +701,10 @@ static bool trap_loregion(struct kvm_vcpu *vcpu,
struct sys_reg_params *p,
const struct sys_reg_desc *r)
{
- u64 val = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
u32 sr = reg_to_encoding(r);
- if (!(val & (0xfUL << ID_AA64MMFR1_EL1_LO_SHIFT))) {
- kvm_inject_undefined(vcpu);
- return false;
- }
+ if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, LO, IMP))
+ return undef_access(vcpu, p, r);
if (p->is_write && sr == SYS_LORID_EL1)
return write_to_read_only(vcpu, p, r);
@@ -288,17 +716,10 @@ static bool trap_oslar_el1(struct kvm_vcpu *vcpu,
struct sys_reg_params *p,
const struct sys_reg_desc *r)
{
- u64 oslsr;
-
if (!p->is_write)
return read_from_write_only(vcpu, p, r);
- /* Forward the OSLK bit to OSLSR */
- oslsr = __vcpu_sys_reg(vcpu, OSLSR_EL1) & ~SYS_OSLSR_OSLK;
- if (p->regval & SYS_OSLAR_OSLK)
- oslsr |= SYS_OSLSR_OSLK;
-
- __vcpu_sys_reg(vcpu, OSLSR_EL1) = oslsr;
+ kvm_debug_handle_oslar(vcpu, p->regval);
return true;
}
@@ -320,10 +741,10 @@ static int set_oslsr_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
* The only modifiable bit is the OSLK bit. Refuse the write if
* userspace attempts to change any other bit in the register.
*/
- if ((val ^ rd->val) & ~SYS_OSLSR_OSLK)
+ if ((val ^ rd->val) & ~OSLSR_EL1_OSLK)
return -EINVAL;
- __vcpu_sys_reg(vcpu, rd->reg) = val;
+ __vcpu_assign_sys_reg(vcpu, rd->reg, val);
return 0;
}
@@ -339,46 +760,13 @@ static bool trap_dbgauthstatus_el1(struct kvm_vcpu *vcpu,
}
}
-/*
- * We want to avoid world-switching all the DBG registers all the
- * time:
- *
- * - If we've touched any debug register, it is likely that we're
- * going to touch more of them. It then makes sense to disable the
- * traps and start doing the save/restore dance
- * - If debug is active (DBG_MDSCR_KDE or DBG_MDSCR_MDE set), it is
- * then mandatory to save/restore the registers, as the guest
- * depends on them.
- *
- * For this, we use a DIRTY bit, indicating the guest has modified the
- * debug registers, used as follow:
- *
- * On guest entry:
- * - If the dirty bit is set (because we're coming back from trapping),
- * disable the traps, save host registers, restore guest registers.
- * - If debug is actively in use (DBG_MDSCR_KDE or DBG_MDSCR_MDE set),
- * set the dirty bit, disable the traps, save host registers,
- * restore guest registers.
- * - Otherwise, enable the traps
- *
- * On guest exit:
- * - If the dirty bit is set, save guest registers, restore host
- * registers and clear the dirty bit. This ensure that the host can
- * now use the debug registers.
- */
static bool trap_debug_regs(struct kvm_vcpu *vcpu,
struct sys_reg_params *p,
const struct sys_reg_desc *r)
{
- if (p->is_write) {
- vcpu_write_sys_reg(vcpu, p->regval, r->reg);
- vcpu_set_flag(vcpu, DEBUG_DIRTY);
- } else {
- p->regval = vcpu_read_sys_reg(vcpu, r->reg);
- }
-
- trace_trap_reg(__func__, r->reg, p->is_write, p->regval);
+ access_rw(vcpu, p, r);
+ kvm_debug_set_guest_ownership(vcpu);
return true;
}
@@ -387,9 +775,6 @@ static bool trap_debug_regs(struct kvm_vcpu *vcpu,
*
* A 32 bit write to a debug register leave top bits alone
* A 32 bit read from a debug register only returns the bottom bits
- *
- * All writes will set the DEBUG_DIRTY flag to ensure the hyp code
- * switches between host and guest values in future.
*/
static void reg_to_dbg(struct kvm_vcpu *vcpu,
struct sys_reg_params *p,
@@ -404,8 +789,6 @@ static void reg_to_dbg(struct kvm_vcpu *vcpu,
val &= ~mask;
val |= (p->regval & (mask >> shift)) << shift;
*dbg_reg = val;
-
- vcpu_set_flag(vcpu, DEBUG_DIRTY);
}
static void dbg_to_reg(struct kvm_vcpu *vcpu,
@@ -419,164 +802,97 @@ static void dbg_to_reg(struct kvm_vcpu *vcpu,
p->regval = (*dbg_reg & mask) >> shift;
}
-static bool trap_bvr(struct kvm_vcpu *vcpu,
- struct sys_reg_params *p,
- const struct sys_reg_desc *rd)
+static u64 *demux_wb_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd)
{
- u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm];
-
- if (p->is_write)
- reg_to_dbg(vcpu, p, rd, dbg_reg);
- else
- dbg_to_reg(vcpu, p, rd, dbg_reg);
-
- trace_trap_reg(__func__, rd->CRm, p->is_write, *dbg_reg);
-
- return true;
-}
-
-static int set_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
- u64 val)
-{
- vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm] = val;
- return 0;
-}
-
-static int get_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
- u64 *val)
-{
- *val = vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm];
- return 0;
+ struct kvm_guest_debug_arch *dbg = &vcpu->arch.vcpu_debug_state;
+
+ switch (rd->Op2) {
+ case 0b100:
+ return &dbg->dbg_bvr[rd->CRm];
+ case 0b101:
+ return &dbg->dbg_bcr[rd->CRm];
+ case 0b110:
+ return &dbg->dbg_wvr[rd->CRm];
+ case 0b111:
+ return &dbg->dbg_wcr[rd->CRm];
+ default:
+ KVM_BUG_ON(1, vcpu->kvm);
+ return NULL;
+ }
}
-static void reset_bvr(struct kvm_vcpu *vcpu,
- const struct sys_reg_desc *rd)
+static bool trap_dbg_wb_reg(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+ const struct sys_reg_desc *rd)
{
- vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm] = rd->val;
-}
+ u64 *reg = demux_wb_reg(vcpu, rd);
-static bool trap_bcr(struct kvm_vcpu *vcpu,
- struct sys_reg_params *p,
- const struct sys_reg_desc *rd)
-{
- u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm];
+ if (!reg)
+ return false;
if (p->is_write)
- reg_to_dbg(vcpu, p, rd, dbg_reg);
+ reg_to_dbg(vcpu, p, rd, reg);
else
- dbg_to_reg(vcpu, p, rd, dbg_reg);
-
- trace_trap_reg(__func__, rd->CRm, p->is_write, *dbg_reg);
+ dbg_to_reg(vcpu, p, rd, reg);
+ kvm_debug_set_guest_ownership(vcpu);
return true;
}
-static int set_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
- u64 val)
-{
- vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm] = val;
- return 0;
-}
-
-static int get_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
- u64 *val)
-{
- *val = vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm];
- return 0;
-}
-
-static void reset_bcr(struct kvm_vcpu *vcpu,
- const struct sys_reg_desc *rd)
-{
- vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm] = rd->val;
-}
-
-static bool trap_wvr(struct kvm_vcpu *vcpu,
- struct sys_reg_params *p,
- const struct sys_reg_desc *rd)
+static int set_dbg_wb_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+ u64 val)
{
- u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm];
-
- if (p->is_write)
- reg_to_dbg(vcpu, p, rd, dbg_reg);
- else
- dbg_to_reg(vcpu, p, rd, dbg_reg);
-
- trace_trap_reg(__func__, rd->CRm, p->is_write,
- vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm]);
+ u64 *reg = demux_wb_reg(vcpu, rd);
- return true;
-}
-
-static int set_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
- u64 val)
-{
- vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm] = val;
- return 0;
-}
+ if (!reg)
+ return -EINVAL;
-static int get_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
- u64 *val)
-{
- *val = vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm];
+ *reg = val;
return 0;
}
-static void reset_wvr(struct kvm_vcpu *vcpu,
- const struct sys_reg_desc *rd)
-{
- vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm] = rd->val;
-}
-
-static bool trap_wcr(struct kvm_vcpu *vcpu,
- struct sys_reg_params *p,
- const struct sys_reg_desc *rd)
+static int get_dbg_wb_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+ u64 *val)
{
- u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm];
+ u64 *reg = demux_wb_reg(vcpu, rd);
- if (p->is_write)
- reg_to_dbg(vcpu, p, rd, dbg_reg);
- else
- dbg_to_reg(vcpu, p, rd, dbg_reg);
-
- trace_trap_reg(__func__, rd->CRm, p->is_write, *dbg_reg);
-
- return true;
-}
+ if (!reg)
+ return -EINVAL;
-static int set_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
- u64 val)
-{
- vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm] = val;
+ *val = *reg;
return 0;
}
-static int get_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
- u64 *val)
+static u64 reset_dbg_wb_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd)
{
- *val = vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm];
- return 0;
-}
+ u64 *reg = demux_wb_reg(vcpu, rd);
-static void reset_wcr(struct kvm_vcpu *vcpu,
- const struct sys_reg_desc *rd)
-{
- vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm] = rd->val;
+ /*
+ * Bail early if we couldn't find storage for the register, the
+ * KVM_BUG_ON() in demux_wb_reg() will prevent this VM from ever
+ * being run.
+ */
+ if (!reg)
+ return 0;
+
+ *reg = rd->val;
+ return rd->val;
}
-static void reset_amair_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+static u64 reset_amair_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
u64 amair = read_sysreg(amair_el1);
vcpu_write_sys_reg(vcpu, amair, AMAIR_EL1);
+ return amair;
}
-static void reset_actlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+static u64 reset_actlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
u64 actlr = read_sysreg(actlr_el1);
vcpu_write_sys_reg(vcpu, actlr, ACTLR_EL1);
+ return actlr;
}
-static void reset_mpidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+static u64 reset_mpidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
u64 mpidr;
@@ -590,7 +906,16 @@ static void reset_mpidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
mpidr = (vcpu->vcpu_id & 0x0f) << MPIDR_LEVEL_SHIFT(0);
mpidr |= ((vcpu->vcpu_id >> 4) & 0xff) << MPIDR_LEVEL_SHIFT(1);
mpidr |= ((vcpu->vcpu_id >> 12) & 0xff) << MPIDR_LEVEL_SHIFT(2);
- vcpu_write_sys_reg(vcpu, (1ULL << 31) | mpidr, MPIDR_EL1);
+ mpidr |= (1ULL << 31);
+ vcpu_write_sys_reg(vcpu, mpidr, MPIDR_EL1);
+
+ return mpidr;
+}
+
+static unsigned int hidden_visibility(const struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *r)
+{
+ return REG_HIDDEN;
}
static unsigned int pmu_visibility(const struct kvm_vcpu *vcpu,
@@ -602,55 +927,62 @@ static unsigned int pmu_visibility(const struct kvm_vcpu *vcpu,
return REG_HIDDEN;
}
-static void reset_pmu_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+static u64 reset_pmu_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
- u64 n, mask = BIT(ARMV8_PMU_CYCLE_IDX);
-
- /* No PMU available, any PMU reg may UNDEF... */
- if (!kvm_arm_support_pmu_v3())
- return;
+ u64 mask = BIT(ARMV8_PMU_CYCLE_IDX);
+ u8 n = vcpu->kvm->arch.nr_pmu_counters;
- n = read_sysreg(pmcr_el0) >> ARMV8_PMU_PMCR_N_SHIFT;
- n &= ARMV8_PMU_PMCR_N_MASK;
if (n)
mask |= GENMASK(n - 1, 0);
reset_unknown(vcpu, r);
- __vcpu_sys_reg(vcpu, r->reg) &= mask;
+ __vcpu_rmw_sys_reg(vcpu, r->reg, &=, mask);
+
+ return __vcpu_sys_reg(vcpu, r->reg);
}
-static void reset_pmevcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+static u64 reset_pmevcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
reset_unknown(vcpu, r);
- __vcpu_sys_reg(vcpu, r->reg) &= GENMASK(31, 0);
+ __vcpu_rmw_sys_reg(vcpu, r->reg, &=, GENMASK(31, 0));
+
+ return __vcpu_sys_reg(vcpu, r->reg);
}
-static void reset_pmevtyper(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+static u64 reset_pmevtyper(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
+ /* This thing will UNDEF, who cares about the reset value? */
+ if (!kvm_vcpu_has_pmu(vcpu))
+ return 0;
+
reset_unknown(vcpu, r);
- __vcpu_sys_reg(vcpu, r->reg) &= ARMV8_PMU_EVTYPE_MASK;
+ __vcpu_rmw_sys_reg(vcpu, r->reg, &=, kvm_pmu_evtyper_mask(vcpu->kvm));
+
+ return __vcpu_sys_reg(vcpu, r->reg);
}
-static void reset_pmselr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+static u64 reset_pmselr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
reset_unknown(vcpu, r);
- __vcpu_sys_reg(vcpu, r->reg) &= ARMV8_PMU_COUNTER_MASK;
+ __vcpu_rmw_sys_reg(vcpu, r->reg, &=, PMSELR_EL0_SEL_MASK);
+
+ return __vcpu_sys_reg(vcpu, r->reg);
}
-static void reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+static u64 reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
- u64 pmcr;
-
- /* No PMU available, PMCR_EL0 may UNDEF... */
- if (!kvm_arm_support_pmu_v3())
- return;
+ u64 pmcr = 0;
- /* Only preserve PMCR_EL0.N, and reset the rest to 0 */
- pmcr = read_sysreg(pmcr_el0) & (ARMV8_PMU_PMCR_N_MASK << ARMV8_PMU_PMCR_N_SHIFT);
if (!kvm_supports_32bit_el0())
pmcr |= ARMV8_PMU_PMCR_LC;
- __vcpu_sys_reg(vcpu, r->reg) = pmcr;
+ /*
+ * The value of PMCR.N field is included when the
+ * vCPU register is read via kvm_vcpu_read_pmcr().
+ */
+ __vcpu_assign_sys_reg(vcpu, r->reg, pmcr);
+
+ return __vcpu_sys_reg(vcpu, r->reg);
}
static bool check_pmu_access_disabled(struct kvm_vcpu *vcpu, u64 flags)
@@ -697,16 +1029,15 @@ static bool access_pmcr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
* Only update writeable bits of PMCR (continuing into
* kvm_pmu_handle_pmcr() as well)
*/
- val = __vcpu_sys_reg(vcpu, PMCR_EL0);
+ val = kvm_vcpu_read_pmcr(vcpu);
val &= ~ARMV8_PMU_PMCR_MASK;
val |= p->regval & ARMV8_PMU_PMCR_MASK;
if (!kvm_supports_32bit_el0())
val |= ARMV8_PMU_PMCR_LC;
kvm_pmu_handle_pmcr(vcpu, val);
- kvm_vcpu_pmu_restore_guest(vcpu);
} else {
/* PMCR.P & PMCR.C are RAZ */
- val = __vcpu_sys_reg(vcpu, PMCR_EL0)
+ val = kvm_vcpu_read_pmcr(vcpu)
& ~(ARMV8_PMU_PMCR_P | ARMV8_PMU_PMCR_C);
p->regval = val;
}
@@ -721,11 +1052,11 @@ static bool access_pmselr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
return false;
if (p->is_write)
- __vcpu_sys_reg(vcpu, PMSELR_EL0) = p->regval;
+ __vcpu_assign_sys_reg(vcpu, PMSELR_EL0, p->regval);
else
/* return PMSELR.SEL field */
p->regval = __vcpu_sys_reg(vcpu, PMSELR_EL0)
- & ARMV8_PMU_COUNTER_MASK;
+ & PMSELR_EL0_SEL_MASK;
return true;
}
@@ -755,8 +1086,8 @@ static bool pmu_counter_idx_valid(struct kvm_vcpu *vcpu, u64 idx)
{
u64 pmcr, val;
- pmcr = __vcpu_sys_reg(vcpu, PMCR_EL0);
- val = (pmcr >> ARMV8_PMU_PMCR_N_SHIFT) & ARMV8_PMU_PMCR_N_MASK;
+ pmcr = kvm_vcpu_read_pmcr(vcpu);
+ val = FIELD_GET(ARMV8_PMU_PMCR_N, pmcr);
if (idx >= val && idx != ARMV8_PMU_CYCLE_IDX) {
kvm_inject_undefined(vcpu);
return false;
@@ -765,6 +1096,38 @@ static bool pmu_counter_idx_valid(struct kvm_vcpu *vcpu, u64 idx)
return true;
}
+static int get_pmu_evcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
+ u64 *val)
+{
+ u64 idx;
+
+ if (r->CRn == 9 && r->CRm == 13 && r->Op2 == 0)
+ /* PMCCNTR_EL0 */
+ idx = ARMV8_PMU_CYCLE_IDX;
+ else
+ /* PMEVCNTRn_EL0 */
+ idx = ((r->CRm & 3) << 3) | (r->Op2 & 7);
+
+ *val = kvm_pmu_get_counter_value(vcpu, idx);
+ return 0;
+}
+
+static int set_pmu_evcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
+ u64 val)
+{
+ u64 idx;
+
+ if (r->CRn == 9 && r->CRm == 13 && r->Op2 == 0)
+ /* PMCCNTR_EL0 */
+ idx = ARMV8_PMU_CYCLE_IDX;
+ else
+ /* PMEVCNTRn_EL0 */
+ idx = ((r->CRm & 3) << 3) | (r->Op2 & 7);
+
+ kvm_pmu_set_counter_value_user(vcpu, idx, val);
+ return 0;
+}
+
static bool access_pmu_evcntr(struct kvm_vcpu *vcpu,
struct sys_reg_params *p,
const struct sys_reg_desc *r)
@@ -777,8 +1140,8 @@ static bool access_pmu_evcntr(struct kvm_vcpu *vcpu,
if (pmu_access_event_counter_el0_disabled(vcpu))
return false;
- idx = __vcpu_sys_reg(vcpu, PMSELR_EL0)
- & ARMV8_PMU_COUNTER_MASK;
+ idx = SYS_FIELD_GET(PMSELR_EL0, SEL,
+ __vcpu_sys_reg(vcpu, PMSELR_EL0));
} else if (r->Op2 == 0) {
/* PMCCNTR_EL0 */
if (pmu_access_cycle_counter_el0_disabled(vcpu))
@@ -828,7 +1191,7 @@ static bool access_pmu_evtyper(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
if (r->CRn == 9 && r->CRm == 13 && r->Op2 == 1) {
/* PMXEVTYPER_EL0 */
- idx = __vcpu_sys_reg(vcpu, PMSELR_EL0) & ARMV8_PMU_COUNTER_MASK;
+ idx = SYS_FIELD_GET(PMSELR_EL0, SEL, __vcpu_sys_reg(vcpu, PMSELR_EL0));
reg = PMEVTYPER0_EL0 + idx;
} else if (r->CRn == 14 && (r->CRm & 12) == 12) {
idx = ((r->CRm & 3) << 3) | (r->Op2 & 7);
@@ -846,15 +1209,32 @@ static bool access_pmu_evtyper(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
if (p->is_write) {
kvm_pmu_set_counter_event_type(vcpu, p->regval, idx);
- __vcpu_sys_reg(vcpu, reg) = p->regval & ARMV8_PMU_EVTYPE_MASK;
kvm_vcpu_pmu_restore_guest(vcpu);
} else {
- p->regval = __vcpu_sys_reg(vcpu, reg) & ARMV8_PMU_EVTYPE_MASK;
+ p->regval = __vcpu_sys_reg(vcpu, reg);
}
return true;
}
+static int set_pmreg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 val)
+{
+ u64 mask = kvm_pmu_accessible_counter_mask(vcpu);
+
+ __vcpu_assign_sys_reg(vcpu, r->reg, val & mask);
+ kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu);
+
+ return 0;
+}
+
+static int get_pmreg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 *val)
+{
+ u64 mask = kvm_pmu_accessible_counter_mask(vcpu);
+
+ *val = __vcpu_sys_reg(vcpu, r->reg) & mask;
+ return 0;
+}
+
static bool access_pmcnten(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
const struct sys_reg_desc *r)
{
@@ -863,19 +1243,17 @@ static bool access_pmcnten(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
if (pmu_access_el0_disabled(vcpu))
return false;
- mask = kvm_pmu_valid_counter_mask(vcpu);
+ mask = kvm_pmu_accessible_counter_mask(vcpu);
if (p->is_write) {
val = p->regval & mask;
- if (r->Op2 & 0x1) {
+ if (r->Op2 & 0x1)
/* accessing PMCNTENSET_EL0 */
- __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) |= val;
- kvm_pmu_enable_counter_mask(vcpu, val);
- kvm_vcpu_pmu_restore_guest(vcpu);
- } else {
+ __vcpu_rmw_sys_reg(vcpu, PMCNTENSET_EL0, |=, val);
+ else
/* accessing PMCNTENCLR_EL0 */
- __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) &= ~val;
- kvm_pmu_disable_counter_mask(vcpu, val);
- }
+ __vcpu_rmw_sys_reg(vcpu, PMCNTENSET_EL0, &=, ~val);
+
+ kvm_pmu_reprogram_counter_mask(vcpu, val);
} else {
p->regval = __vcpu_sys_reg(vcpu, PMCNTENSET_EL0);
}
@@ -886,7 +1264,7 @@ static bool access_pmcnten(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
static bool access_pminten(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
const struct sys_reg_desc *r)
{
- u64 mask = kvm_pmu_valid_counter_mask(vcpu);
+ u64 mask = kvm_pmu_accessible_counter_mask(vcpu);
if (check_pmu_access_disabled(vcpu, 0))
return false;
@@ -896,10 +1274,10 @@ static bool access_pminten(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
if (r->Op2 & 0x1)
/* accessing PMINTENSET_EL1 */
- __vcpu_sys_reg(vcpu, PMINTENSET_EL1) |= val;
+ __vcpu_rmw_sys_reg(vcpu, PMINTENSET_EL1, |=, val);
else
/* accessing PMINTENCLR_EL1 */
- __vcpu_sys_reg(vcpu, PMINTENSET_EL1) &= ~val;
+ __vcpu_rmw_sys_reg(vcpu, PMINTENSET_EL1, &=, ~val);
} else {
p->regval = __vcpu_sys_reg(vcpu, PMINTENSET_EL1);
}
@@ -910,7 +1288,7 @@ static bool access_pminten(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
static bool access_pmovs(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
const struct sys_reg_desc *r)
{
- u64 mask = kvm_pmu_valid_counter_mask(vcpu);
+ u64 mask = kvm_pmu_accessible_counter_mask(vcpu);
if (pmu_access_el0_disabled(vcpu))
return false;
@@ -918,10 +1296,10 @@ static bool access_pmovs(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
if (p->is_write) {
if (r->CRm & 0x2)
/* accessing PMOVSSET_EL0 */
- __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= (p->regval & mask);
+ __vcpu_rmw_sys_reg(vcpu, PMOVSSET_EL0, |=, (p->regval & mask));
else
/* accessing PMOVSCLR_EL0 */
- __vcpu_sys_reg(vcpu, PMOVSSET_EL0) &= ~(p->regval & mask);
+ __vcpu_rmw_sys_reg(vcpu, PMOVSSET_EL0, &=, ~(p->regval & mask));
} else {
p->regval = __vcpu_sys_reg(vcpu, PMOVSSET_EL0);
}
@@ -940,7 +1318,7 @@ static bool access_pmswinc(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
if (pmu_write_swinc_el0_disabled(vcpu))
return false;
- mask = kvm_pmu_valid_counter_mask(vcpu);
+ mask = kvm_pmu_accessible_counter_mask(vcpu);
kvm_pmu_software_increment(vcpu, p->regval & mask);
return true;
}
@@ -949,13 +1327,11 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
const struct sys_reg_desc *r)
{
if (p->is_write) {
- if (!vcpu_mode_priv(vcpu)) {
- kvm_inject_undefined(vcpu);
- return false;
- }
+ if (!vcpu_mode_priv(vcpu))
+ return undef_access(vcpu, p, r);
- __vcpu_sys_reg(vcpu, PMUSERENR_EL0) =
- p->regval & ARMV8_PMU_USERENR_MASK;
+ __vcpu_assign_sys_reg(vcpu, PMUSERENR_EL0,
+ (p->regval & ARMV8_PMU_USERENR_MASK));
} else {
p->regval = __vcpu_sys_reg(vcpu, PMUSERENR_EL0)
& ARMV8_PMU_USERENR_MASK;
@@ -964,40 +1340,86 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
return true;
}
+static int get_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
+ u64 *val)
+{
+ *val = kvm_vcpu_read_pmcr(vcpu);
+ return 0;
+}
+
+static int set_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
+ u64 val)
+{
+ u8 new_n = FIELD_GET(ARMV8_PMU_PMCR_N, val);
+ struct kvm *kvm = vcpu->kvm;
+
+ mutex_lock(&kvm->arch.config_lock);
+
+ /*
+ * The vCPU can't have more counters than the PMU hardware
+ * implements. Ignore this error to maintain compatibility
+ * with the existing KVM behavior.
+ */
+ if (!kvm_vm_has_ran_once(kvm) &&
+ !vcpu_has_nv(vcpu) &&
+ new_n <= kvm_arm_pmu_get_max_counters(kvm))
+ kvm->arch.nr_pmu_counters = new_n;
+
+ mutex_unlock(&kvm->arch.config_lock);
+
+ /*
+ * Ignore writes to RES0 bits, read only bits that are cleared on
+ * vCPU reset, and writable bits that KVM doesn't support yet.
+ * (i.e. only PMCR.N and bits [7:0] are mutable from userspace)
+ * The LP bit is RES0 when FEAT_PMUv3p5 is not supported on the vCPU.
+ * But, we leave the bit as it is here, as the vCPU's PMUver might
+ * be changed later (NOTE: the bit will be cleared on first vCPU run
+ * if necessary).
+ */
+ val &= ARMV8_PMU_PMCR_MASK;
+
+ /* The LC bit is RES1 when AArch32 is not supported */
+ if (!kvm_supports_32bit_el0())
+ val |= ARMV8_PMU_PMCR_LC;
+
+ __vcpu_assign_sys_reg(vcpu, r->reg, val);
+ kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu);
+
+ return 0;
+}
+
/* Silly macro to expand the DBG{BCR,BVR,WVR,WCR}n_EL1 registers in one go */
#define DBG_BCR_BVR_WCR_WVR_EL1(n) \
{ SYS_DESC(SYS_DBGBVRn_EL1(n)), \
- trap_bvr, reset_bvr, 0, 0, get_bvr, set_bvr }, \
+ trap_dbg_wb_reg, reset_dbg_wb_reg, 0, 0, \
+ get_dbg_wb_reg, set_dbg_wb_reg }, \
{ SYS_DESC(SYS_DBGBCRn_EL1(n)), \
- trap_bcr, reset_bcr, 0, 0, get_bcr, set_bcr }, \
+ trap_dbg_wb_reg, reset_dbg_wb_reg, 0, 0, \
+ get_dbg_wb_reg, set_dbg_wb_reg }, \
{ SYS_DESC(SYS_DBGWVRn_EL1(n)), \
- trap_wvr, reset_wvr, 0, 0, get_wvr, set_wvr }, \
+ trap_dbg_wb_reg, reset_dbg_wb_reg, 0, 0, \
+ get_dbg_wb_reg, set_dbg_wb_reg }, \
{ SYS_DESC(SYS_DBGWCRn_EL1(n)), \
- trap_wcr, reset_wcr, 0, 0, get_wcr, set_wcr }
+ trap_dbg_wb_reg, reset_dbg_wb_reg, 0, 0, \
+ get_dbg_wb_reg, set_dbg_wb_reg }
-#define PMU_SYS_REG(r) \
- SYS_DESC(r), .reset = reset_pmu_reg, .visibility = pmu_visibility
+#define PMU_SYS_REG(name) \
+ SYS_DESC(SYS_##name), .reset = reset_pmu_reg, \
+ .visibility = pmu_visibility
/* Macro to expand the PMEVCNTRn_EL0 register */
#define PMU_PMEVCNTR_EL0(n) \
- { PMU_SYS_REG(SYS_PMEVCNTRn_EL0(n)), \
- .reset = reset_pmevcntr, \
+ { PMU_SYS_REG(PMEVCNTRn_EL0(n)), \
+ .reset = reset_pmevcntr, .get_user = get_pmu_evcntr, \
+ .set_user = set_pmu_evcntr, \
.access = access_pmu_evcntr, .reg = (PMEVCNTR0_EL0 + n), }
/* Macro to expand the PMEVTYPERn_EL0 register */
#define PMU_PMEVTYPER_EL0(n) \
- { PMU_SYS_REG(SYS_PMEVTYPERn_EL0(n)), \
+ { PMU_SYS_REG(PMEVTYPERn_EL0(n)), \
.reset = reset_pmevtyper, \
.access = access_pmu_evtyper, .reg = (PMEVTYPER0_EL0 + n), }
-static bool undef_access(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
- const struct sys_reg_desc *r)
-{
- kvm_inject_undefined(vcpu);
-
- return false;
-}
-
/* Macro to expand the AMU counter and type registers*/
#define AMU_AMEVCNTR0_EL0(n) { SYS_DESC(SYS_AMEVCNTR0_EL0(n)), undef_access }
#define AMU_AMEVTYPER0_EL0(n) { SYS_DESC(SYS_AMEVTYPER0_EL0(n)), undef_access }
@@ -1034,22 +1456,149 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu,
switch (reg) {
case SYS_CNTP_TVAL_EL0:
+ if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu))
+ tmr = TIMER_HPTIMER;
+ else
+ tmr = TIMER_PTIMER;
+ treg = TIMER_REG_TVAL;
+ break;
+
+ case SYS_CNTV_TVAL_EL0:
+ if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu))
+ tmr = TIMER_HVTIMER;
+ else
+ tmr = TIMER_VTIMER;
+ treg = TIMER_REG_TVAL;
+ break;
+
case SYS_AARCH32_CNTP_TVAL:
+ case SYS_CNTP_TVAL_EL02:
tmr = TIMER_PTIMER;
treg = TIMER_REG_TVAL;
break;
+
+ case SYS_CNTV_TVAL_EL02:
+ tmr = TIMER_VTIMER;
+ treg = TIMER_REG_TVAL;
+ break;
+
+ case SYS_CNTHP_TVAL_EL2:
+ tmr = TIMER_HPTIMER;
+ treg = TIMER_REG_TVAL;
+ break;
+
+ case SYS_CNTHV_TVAL_EL2:
+ tmr = TIMER_HVTIMER;
+ treg = TIMER_REG_TVAL;
+ break;
+
case SYS_CNTP_CTL_EL0:
+ if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu))
+ tmr = TIMER_HPTIMER;
+ else
+ tmr = TIMER_PTIMER;
+ treg = TIMER_REG_CTL;
+ break;
+
+ case SYS_CNTV_CTL_EL0:
+ if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu))
+ tmr = TIMER_HVTIMER;
+ else
+ tmr = TIMER_VTIMER;
+ treg = TIMER_REG_CTL;
+ break;
+
case SYS_AARCH32_CNTP_CTL:
+ case SYS_CNTP_CTL_EL02:
tmr = TIMER_PTIMER;
treg = TIMER_REG_CTL;
break;
+
+ case SYS_CNTV_CTL_EL02:
+ tmr = TIMER_VTIMER;
+ treg = TIMER_REG_CTL;
+ break;
+
+ case SYS_CNTHP_CTL_EL2:
+ tmr = TIMER_HPTIMER;
+ treg = TIMER_REG_CTL;
+ break;
+
+ case SYS_CNTHV_CTL_EL2:
+ tmr = TIMER_HVTIMER;
+ treg = TIMER_REG_CTL;
+ break;
+
case SYS_CNTP_CVAL_EL0:
+ if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu))
+ tmr = TIMER_HPTIMER;
+ else
+ tmr = TIMER_PTIMER;
+ treg = TIMER_REG_CVAL;
+ break;
+
+ case SYS_CNTV_CVAL_EL0:
+ if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu))
+ tmr = TIMER_HVTIMER;
+ else
+ tmr = TIMER_VTIMER;
+ treg = TIMER_REG_CVAL;
+ break;
+
case SYS_AARCH32_CNTP_CVAL:
+ case SYS_CNTP_CVAL_EL02:
tmr = TIMER_PTIMER;
treg = TIMER_REG_CVAL;
break;
+
+ case SYS_CNTV_CVAL_EL02:
+ tmr = TIMER_VTIMER;
+ treg = TIMER_REG_CVAL;
+ break;
+
+ case SYS_CNTHP_CVAL_EL2:
+ tmr = TIMER_HPTIMER;
+ treg = TIMER_REG_CVAL;
+ break;
+
+ case SYS_CNTHV_CVAL_EL2:
+ tmr = TIMER_HVTIMER;
+ treg = TIMER_REG_CVAL;
+ break;
+
+ case SYS_CNTPCT_EL0:
+ case SYS_CNTPCTSS_EL0:
+ if (is_hyp_ctxt(vcpu))
+ tmr = TIMER_HPTIMER;
+ else
+ tmr = TIMER_PTIMER;
+ treg = TIMER_REG_CNT;
+ break;
+
+ case SYS_AARCH32_CNTPCT:
+ case SYS_AARCH32_CNTPCTSS:
+ tmr = TIMER_PTIMER;
+ treg = TIMER_REG_CNT;
+ break;
+
+ case SYS_CNTVCT_EL0:
+ case SYS_CNTVCTSS_EL0:
+ if (is_hyp_ctxt(vcpu))
+ tmr = TIMER_HVTIMER;
+ else
+ tmr = TIMER_VTIMER;
+ treg = TIMER_REG_CNT;
+ break;
+
+ case SYS_AARCH32_CNTVCT:
+ case SYS_AARCH32_CNTVCTSS:
+ tmr = TIMER_VTIMER;
+ treg = TIMER_REG_CNT;
+ break;
+
default:
- BUG();
+ print_sys_reg_msg(p, "%s", "Unhandled trapped timer register");
+ return undef_access(vcpu, p, r);
}
if (p->is_write)
@@ -1060,25 +1609,138 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu,
return true;
}
-static u8 vcpu_pmuver(const struct kvm_vcpu *vcpu)
+static int arch_timer_set_user(struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd,
+ u64 val)
{
- if (kvm_vcpu_has_pmu(vcpu))
- return vcpu->kvm->arch.dfr0_pmuver.imp;
+ switch (reg_to_encoding(rd)) {
+ case SYS_CNTV_CTL_EL0:
+ case SYS_CNTP_CTL_EL0:
+ case SYS_CNTHV_CTL_EL2:
+ case SYS_CNTHP_CTL_EL2:
+ val &= ~ARCH_TIMER_CTRL_IT_STAT;
+ break;
+ case SYS_CNTVCT_EL0:
+ if (!test_bit(KVM_ARCH_FLAG_VM_COUNTER_OFFSET, &vcpu->kvm->arch.flags))
+ timer_set_offset(vcpu_vtimer(vcpu), kvm_phys_timer_read() - val);
+ return 0;
+ case SYS_CNTPCT_EL0:
+ if (!test_bit(KVM_ARCH_FLAG_VM_COUNTER_OFFSET, &vcpu->kvm->arch.flags))
+ timer_set_offset(vcpu_ptimer(vcpu), kvm_phys_timer_read() - val);
+ return 0;
+ }
- return vcpu->kvm->arch.dfr0_pmuver.unimp;
+ __vcpu_assign_sys_reg(vcpu, rd->reg, val);
+ return 0;
}
-static u8 perfmon_to_pmuver(u8 perfmon)
+static int arch_timer_get_user(struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd,
+ u64 *val)
{
- switch (perfmon) {
- case ID_DFR0_EL1_PerfMon_PMUv3:
- return ID_AA64DFR0_EL1_PMUVer_IMP;
- case ID_DFR0_EL1_PerfMon_IMPDEF:
- return ID_AA64DFR0_EL1_PMUVer_IMP_DEF;
+ switch (reg_to_encoding(rd)) {
+ case SYS_CNTVCT_EL0:
+ *val = kvm_phys_timer_read() - timer_get_offset(vcpu_vtimer(vcpu));
+ break;
+ case SYS_CNTPCT_EL0:
+ *val = kvm_phys_timer_read() - timer_get_offset(vcpu_ptimer(vcpu));
+ break;
default:
- /* Anything ARMv8.1+ and NI have the same value. For now. */
- return perfmon;
+ *val = __vcpu_sys_reg(vcpu, rd->reg);
+ }
+
+ return 0;
+}
+
+static s64 kvm_arm64_ftr_safe_value(u32 id, const struct arm64_ftr_bits *ftrp,
+ s64 new, s64 cur)
+{
+ struct arm64_ftr_bits kvm_ftr = *ftrp;
+
+ /* Some features have different safe value type in KVM than host features */
+ switch (id) {
+ case SYS_ID_AA64DFR0_EL1:
+ switch (kvm_ftr.shift) {
+ case ID_AA64DFR0_EL1_PMUVer_SHIFT:
+ kvm_ftr.type = FTR_LOWER_SAFE;
+ break;
+ case ID_AA64DFR0_EL1_DebugVer_SHIFT:
+ kvm_ftr.type = FTR_LOWER_SAFE;
+ break;
+ }
+ break;
+ case SYS_ID_DFR0_EL1:
+ if (kvm_ftr.shift == ID_DFR0_EL1_PerfMon_SHIFT)
+ kvm_ftr.type = FTR_LOWER_SAFE;
+ break;
+ }
+
+ return arm64_ftr_safe_value(&kvm_ftr, new, cur);
+}
+
+/*
+ * arm64_check_features() - Check if a feature register value constitutes
+ * a subset of features indicated by the idreg's KVM sanitised limit.
+ *
+ * This function will check if each feature field of @val is the "safe" value
+ * against idreg's KVM sanitised limit return from reset() callback.
+ * If a field value in @val is the same as the one in limit, it is always
+ * considered the safe value regardless For register fields that are not in
+ * writable, only the value in limit is considered the safe value.
+ *
+ * Return: 0 if all the fields are safe. Otherwise, return negative errno.
+ */
+static int arm64_check_features(struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd,
+ u64 val)
+{
+ const struct arm64_ftr_reg *ftr_reg;
+ const struct arm64_ftr_bits *ftrp = NULL;
+ u32 id = reg_to_encoding(rd);
+ u64 writable_mask = rd->val;
+ u64 limit = rd->reset(vcpu, rd);
+ u64 mask = 0;
+
+ /*
+ * Hidden and unallocated ID registers may not have a corresponding
+ * struct arm64_ftr_reg. Of course, if the register is RAZ we know the
+ * only safe value is 0.
+ */
+ if (sysreg_visible_as_raz(vcpu, rd))
+ return val ? -E2BIG : 0;
+
+ ftr_reg = get_arm64_ftr_reg(id);
+ if (!ftr_reg)
+ return -EINVAL;
+
+ ftrp = ftr_reg->ftr_bits;
+
+ for (; ftrp && ftrp->width; ftrp++) {
+ s64 f_val, f_lim, safe_val;
+ u64 ftr_mask;
+
+ ftr_mask = arm64_ftr_mask(ftrp);
+ if ((ftr_mask & writable_mask) != ftr_mask)
+ continue;
+
+ f_val = arm64_ftr_value(ftrp, val);
+ f_lim = arm64_ftr_value(ftrp, limit);
+ mask |= ftr_mask;
+
+ if (f_val == f_lim)
+ safe_val = f_val;
+ else
+ safe_val = kvm_arm64_ftr_safe_value(id, ftrp, f_val, f_lim);
+
+ if (safe_val != f_val)
+ return -E2BIG;
}
+
+ /* For fields that are not writable, values in limit are the safe values. */
+ if ((val & ~mask) != (limit & ~mask))
+ return -E2BIG;
+
+ return 0;
}
static u8 pmuver_to_perfmon(u8 pmuver)
@@ -1094,8 +1756,13 @@ static u8 pmuver_to_perfmon(u8 pmuver)
}
}
+static u64 sanitise_id_aa64pfr0_el1(const struct kvm_vcpu *vcpu, u64 val);
+static u64 sanitise_id_aa64pfr1_el1(const struct kvm_vcpu *vcpu, u64 val);
+static u64 sanitise_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val);
+
/* Read a sanitised cpufeature ID register by sys_reg_desc */
-static u64 read_id_reg(const struct kvm_vcpu *vcpu, struct sys_reg_desc const *r)
+static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *r)
{
u32 id = reg_to_encoding(r);
u64 val;
@@ -1106,60 +1773,116 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu, struct sys_reg_desc const *r
val = read_sanitised_ftr_reg(id);
switch (id) {
+ case SYS_ID_AA64DFR0_EL1:
+ val = sanitise_id_aa64dfr0_el1(vcpu, val);
+ break;
case SYS_ID_AA64PFR0_EL1:
- if (!vcpu_has_sve(vcpu))
- val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE);
- val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AMU);
- val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2);
- val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2), (u64)vcpu->kvm->arch.pfr0_csv2);
- val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3);
- val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3), (u64)vcpu->kvm->arch.pfr0_csv3);
- if (kvm_vgic_global_state.type == VGIC_V3) {
- val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_GIC);
- val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_GIC), 1);
- }
+ val = sanitise_id_aa64pfr0_el1(vcpu, val);
break;
case SYS_ID_AA64PFR1_EL1:
- if (!kvm_has_mte(vcpu->kvm))
- val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE);
-
- val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_SME);
+ val = sanitise_id_aa64pfr1_el1(vcpu, val);
+ break;
+ case SYS_ID_AA64PFR2_EL1:
+ val &= ID_AA64PFR2_EL1_FPMR |
+ (kvm_has_mte(vcpu->kvm) ?
+ ID_AA64PFR2_EL1_MTEFAR | ID_AA64PFR2_EL1_MTESTOREONLY :
+ 0);
break;
case SYS_ID_AA64ISAR1_EL1:
if (!vcpu_has_ptrauth(vcpu))
- val &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA) |
- ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API) |
- ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPA) |
- ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI));
+ val &= ~(ID_AA64ISAR1_EL1_APA |
+ ID_AA64ISAR1_EL1_API |
+ ID_AA64ISAR1_EL1_GPA |
+ ID_AA64ISAR1_EL1_GPI);
break;
case SYS_ID_AA64ISAR2_EL1:
if (!vcpu_has_ptrauth(vcpu))
- val &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_APA3) |
- ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_GPA3));
- if (!cpus_have_final_cap(ARM64_HAS_WFXT))
- val &= ~ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_WFxT);
+ val &= ~(ID_AA64ISAR2_EL1_APA3 |
+ ID_AA64ISAR2_EL1_GPA3);
+ if (!cpus_have_final_cap(ARM64_HAS_WFXT) ||
+ has_broken_cntvoff())
+ val &= ~ID_AA64ISAR2_EL1_WFxT;
break;
- case SYS_ID_AA64DFR0_EL1:
- /* Limit debug to ARMv8.0 */
- val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DebugVer);
- val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DebugVer), 6);
- /* Set PMUver to the required version */
- val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer);
- val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer),
- vcpu_pmuver(vcpu));
- /* Hide SPE from guests */
- val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMSVer);
+ case SYS_ID_AA64ISAR3_EL1:
+ val &= ID_AA64ISAR3_EL1_FPRCVT | ID_AA64ISAR3_EL1_LSFE |
+ ID_AA64ISAR3_EL1_FAMINMAX;
break;
- case SYS_ID_DFR0_EL1:
- val &= ~ARM64_FEATURE_MASK(ID_DFR0_EL1_PerfMon);
- val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_DFR0_EL1_PerfMon),
- pmuver_to_perfmon(vcpu_pmuver(vcpu)));
+ case SYS_ID_AA64MMFR2_EL1:
+ val &= ~ID_AA64MMFR2_EL1_CCIDX_MASK;
+ val &= ~ID_AA64MMFR2_EL1_NV;
+ break;
+ case SYS_ID_AA64MMFR3_EL1:
+ val &= ID_AA64MMFR3_EL1_TCRX |
+ ID_AA64MMFR3_EL1_SCTLRX |
+ ID_AA64MMFR3_EL1_S1POE |
+ ID_AA64MMFR3_EL1_S1PIE;
+ break;
+ case SYS_ID_MMFR4_EL1:
+ val &= ~ID_MMFR4_EL1_CCIDX;
break;
}
+ if (vcpu_has_nv(vcpu))
+ val = limit_nv_id_reg(vcpu->kvm, id, val);
+
return val;
}
+static u64 kvm_read_sanitised_id_reg(struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *r)
+{
+ return __kvm_read_sanitised_id_reg(vcpu, r);
+}
+
+static u64 read_id_reg(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+{
+ return kvm_read_vm_id_reg(vcpu->kvm, reg_to_encoding(r));
+}
+
+static bool is_feature_id_reg(u32 encoding)
+{
+ return (sys_reg_Op0(encoding) == 3 &&
+ (sys_reg_Op1(encoding) < 2 || sys_reg_Op1(encoding) == 3) &&
+ sys_reg_CRn(encoding) == 0 &&
+ sys_reg_CRm(encoding) <= 7);
+}
+
+/*
+ * Return true if the register's (Op0, Op1, CRn, CRm, Op2) is
+ * (3, 0, 0, crm, op2), where 1<=crm<8, 0<=op2<8, which is the range of ID
+ * registers KVM maintains on a per-VM basis.
+ *
+ * Additionally, the implementation ID registers and CTR_EL0 are handled as
+ * per-VM registers.
+ */
+static inline bool is_vm_ftr_id_reg(u32 id)
+{
+ switch (id) {
+ case SYS_CTR_EL0:
+ case SYS_MIDR_EL1:
+ case SYS_REVIDR_EL1:
+ case SYS_AIDR_EL1:
+ return true;
+ default:
+ return (sys_reg_Op0(id) == 3 && sys_reg_Op1(id) == 0 &&
+ sys_reg_CRn(id) == 0 && sys_reg_CRm(id) >= 1 &&
+ sys_reg_CRm(id) < 8);
+
+ }
+}
+
+static inline bool is_vcpu_ftr_id_reg(u32 id)
+{
+ return is_feature_id_reg(id) && !is_vm_ftr_id_reg(id);
+}
+
+static inline bool is_aa32_id_reg(u32 id)
+{
+ return (sys_reg_Op0(id) == 3 && sys_reg_Op1(id) == 0 &&
+ sys_reg_CRn(id) == 0 && sys_reg_CRm(id) >= 1 &&
+ sys_reg_CRm(id) <= 3);
+}
+
static unsigned int id_visibility(const struct kvm_vcpu *vcpu,
const struct sys_reg_desc *r)
{
@@ -1205,6 +1928,7 @@ static bool access_id_reg(struct kvm_vcpu *vcpu,
return write_to_read_only(vcpu, p, r);
p->regval = read_id_reg(vcpu, r);
+
return true;
}
@@ -1218,88 +1942,196 @@ static unsigned int sve_visibility(const struct kvm_vcpu *vcpu,
return REG_HIDDEN;
}
-static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu,
- const struct sys_reg_desc *rd,
- u64 val)
+static unsigned int sme_visibility(const struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd)
{
- u8 csv2, csv3;
+ if (kvm_has_feat(vcpu->kvm, ID_AA64PFR1_EL1, SME, IMP))
+ return 0;
+
+ return REG_HIDDEN;
+}
+
+static unsigned int fp8_visibility(const struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd)
+{
+ if (kvm_has_fpmr(vcpu->kvm))
+ return 0;
+
+ return REG_HIDDEN;
+}
+
+static u64 sanitise_id_aa64pfr0_el1(const struct kvm_vcpu *vcpu, u64 val)
+{
+ if (!vcpu_has_sve(vcpu))
+ val &= ~ID_AA64PFR0_EL1_SVE_MASK;
/*
- * Allow AA64PFR0_EL1.CSV2 to be set from userspace as long as
- * it doesn't promise more than what is actually provided (the
- * guest could otherwise be covered in ectoplasmic residue).
+ * The default is to expose CSV2 == 1 if the HW isn't affected.
+ * Although this is a per-CPU feature, we make it global because
+ * asymmetric systems are just a nuisance.
+ *
+ * Userspace can override this as long as it doesn't promise
+ * the impossible.
*/
- csv2 = cpuid_feature_extract_unsigned_field(val, ID_AA64PFR0_EL1_CSV2_SHIFT);
- if (csv2 > 1 ||
- (csv2 && arm64_get_spectre_v2_state() != SPECTRE_UNAFFECTED))
- return -EINVAL;
+ if (arm64_get_spectre_v2_state() == SPECTRE_UNAFFECTED) {
+ val &= ~ID_AA64PFR0_EL1_CSV2_MASK;
+ val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, CSV2, IMP);
+ }
+ if (arm64_get_meltdown_state() == SPECTRE_UNAFFECTED) {
+ val &= ~ID_AA64PFR0_EL1_CSV3_MASK;
+ val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, CSV3, IMP);
+ }
- /* Same thing for CSV3 */
- csv3 = cpuid_feature_extract_unsigned_field(val, ID_AA64PFR0_EL1_CSV3_SHIFT);
- if (csv3 > 1 ||
- (csv3 && arm64_get_meltdown_state() != SPECTRE_UNAFFECTED))
- return -EINVAL;
+ if (vgic_is_v3(vcpu->kvm)) {
+ val &= ~ID_AA64PFR0_EL1_GIC_MASK;
+ val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, GIC, IMP);
+ }
- /* We can only differ with CSV[23], and anything else is an error */
- val ^= read_id_reg(vcpu, rd);
- val &= ~(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2) |
- ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3));
- if (val)
- return -EINVAL;
+ val &= ~ID_AA64PFR0_EL1_AMU_MASK;
+
+ /*
+ * MPAM is disabled by default as KVM also needs a set of PARTID to
+ * program the MPAMVPMx_EL2 PARTID remapping registers with. But some
+ * older kernels let the guest see the ID bit.
+ */
+ val &= ~ID_AA64PFR0_EL1_MPAM_MASK;
- vcpu->kvm->arch.pfr0_csv2 = csv2;
- vcpu->kvm->arch.pfr0_csv3 = csv3;
+ return val;
+}
- return 0;
+static u64 sanitise_id_aa64pfr1_el1(const struct kvm_vcpu *vcpu, u64 val)
+{
+ u64 pfr0 = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1);
+
+ if (!kvm_has_mte(vcpu->kvm)) {
+ val &= ~ID_AA64PFR1_EL1_MTE;
+ val &= ~ID_AA64PFR1_EL1_MTE_frac;
+ }
+
+ if (!(cpus_have_final_cap(ARM64_HAS_RASV1P1_EXTN) &&
+ SYS_FIELD_GET(ID_AA64PFR0_EL1, RAS, pfr0) == ID_AA64PFR0_EL1_RAS_IMP))
+ val &= ~ID_AA64PFR1_EL1_RAS_frac;
+
+ val &= ~ID_AA64PFR1_EL1_SME;
+ val &= ~ID_AA64PFR1_EL1_RNDR_trap;
+ val &= ~ID_AA64PFR1_EL1_NMI;
+ val &= ~ID_AA64PFR1_EL1_GCS;
+ val &= ~ID_AA64PFR1_EL1_THE;
+ val &= ~ID_AA64PFR1_EL1_MTEX;
+ val &= ~ID_AA64PFR1_EL1_PFAR;
+ val &= ~ID_AA64PFR1_EL1_MPAM_frac;
+
+ return val;
+}
+
+static u64 sanitise_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val)
+{
+ val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64DFR0_EL1, DebugVer, V8P8);
+
+ /*
+ * Only initialize the PMU version if the vCPU was configured with one.
+ */
+ val &= ~ID_AA64DFR0_EL1_PMUVer_MASK;
+ if (kvm_vcpu_has_pmu(vcpu))
+ val |= SYS_FIELD_PREP(ID_AA64DFR0_EL1, PMUVer,
+ kvm_arm_pmu_get_pmuver_limit());
+
+ /* Hide SPE from guests */
+ val &= ~ID_AA64DFR0_EL1_PMSVer_MASK;
+
+ /* Hide BRBE from guests */
+ val &= ~ID_AA64DFR0_EL1_BRBE_MASK;
+
+ return val;
+}
+
+/*
+ * Older versions of KVM erroneously claim support for FEAT_DoubleLock with
+ * NV-enabled VMs on unsupporting hardware. Silently ignore the incorrect
+ * value if it is consistent with the bug.
+ */
+static bool ignore_feat_doublelock(struct kvm_vcpu *vcpu, u64 val)
+{
+ u8 host, user;
+
+ if (!vcpu_has_nv(vcpu))
+ return false;
+
+ host = SYS_FIELD_GET(ID_AA64DFR0_EL1, DoubleLock,
+ read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1));
+ user = SYS_FIELD_GET(ID_AA64DFR0_EL1, DoubleLock, val);
+
+ return host == ID_AA64DFR0_EL1_DoubleLock_NI &&
+ user == ID_AA64DFR0_EL1_DoubleLock_IMP;
}
static int set_id_aa64dfr0_el1(struct kvm_vcpu *vcpu,
const struct sys_reg_desc *rd,
u64 val)
{
- u8 pmuver, host_pmuver;
- bool valid_pmu;
+ u8 debugver = SYS_FIELD_GET(ID_AA64DFR0_EL1, DebugVer, val);
+ u8 pmuver = SYS_FIELD_GET(ID_AA64DFR0_EL1, PMUVer, val);
- host_pmuver = kvm_arm_pmu_get_pmuver_limit();
+ /*
+ * Prior to commit 3d0dba5764b9 ("KVM: arm64: PMU: Move the
+ * ID_AA64DFR0_EL1.PMUver limit to VM creation"), KVM erroneously
+ * exposed an IMP_DEF PMU to userspace and the guest on systems w/
+ * non-architectural PMUs. Of course, PMUv3 is the only game in town for
+ * PMU virtualization, so the IMP_DEF value was rather user-hostile.
+ *
+ * At minimum, we're on the hook to allow values that were given to
+ * userspace by KVM. Cover our tracks here and replace the IMP_DEF value
+ * with a more sensible NI. The value of an ID register changing under
+ * the nose of the guest is unfortunate, but is certainly no more
+ * surprising than an ill-guided PMU driver poking at impdef system
+ * registers that end in an UNDEF...
+ */
+ if (pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF)
+ val &= ~ID_AA64DFR0_EL1_PMUVer_MASK;
/*
- * Allow AA64DFR0_EL1.PMUver to be set from userspace as long
- * as it doesn't promise more than what the HW gives us. We
- * allow an IMPDEF PMU though, only if no PMU is supported
- * (KVM backward compatibility handling).
+ * ID_AA64DFR0_EL1.DebugVer is one of those awkward fields with a
+ * nonzero minimum safe value.
*/
- pmuver = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), val);
- if ((pmuver != ID_AA64DFR0_EL1_PMUVer_IMP_DEF && pmuver > host_pmuver))
+ if (debugver < ID_AA64DFR0_EL1_DebugVer_IMP)
return -EINVAL;
- valid_pmu = (pmuver != 0 && pmuver != ID_AA64DFR0_EL1_PMUVer_IMP_DEF);
+ if (ignore_feat_doublelock(vcpu, val)) {
+ val &= ~ID_AA64DFR0_EL1_DoubleLock;
+ val |= SYS_FIELD_PREP_ENUM(ID_AA64DFR0_EL1, DoubleLock, NI);
+ }
- /* Make sure view register and PMU support do match */
- if (kvm_vcpu_has_pmu(vcpu) != valid_pmu)
- return -EINVAL;
+ return set_id_reg(vcpu, rd, val);
+}
- /* We can only differ with PMUver, and anything else is an error */
- val ^= read_id_reg(vcpu, rd);
- val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer);
- if (val)
- return -EINVAL;
+static u64 read_sanitised_id_dfr0_el1(struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd)
+{
+ u8 perfmon;
+ u64 val = read_sanitised_ftr_reg(SYS_ID_DFR0_EL1);
- if (valid_pmu)
- vcpu->kvm->arch.dfr0_pmuver.imp = pmuver;
- else
- vcpu->kvm->arch.dfr0_pmuver.unimp = pmuver;
+ val &= ~ID_DFR0_EL1_PerfMon_MASK;
+ if (kvm_vcpu_has_pmu(vcpu)) {
+ perfmon = pmuver_to_perfmon(kvm_arm_pmu_get_pmuver_limit());
+ val |= SYS_FIELD_PREP(ID_DFR0_EL1, PerfMon, perfmon);
+ }
- return 0;
+ val = ID_REG_LIMIT_FIELD_ENUM(val, ID_DFR0_EL1, CopDbg, Debugv8p8);
+
+ return val;
}
static int set_id_dfr0_el1(struct kvm_vcpu *vcpu,
const struct sys_reg_desc *rd,
u64 val)
{
- u8 perfmon, host_perfmon;
- bool valid_pmu;
+ u8 perfmon = SYS_FIELD_GET(ID_DFR0_EL1, PerfMon, val);
+ u8 copdbg = SYS_FIELD_GET(ID_DFR0_EL1, CopDbg, val);
- host_perfmon = pmuver_to_perfmon(kvm_arm_pmu_get_pmuver_limit());
+ if (perfmon == ID_DFR0_EL1_PerfMon_IMPDEF) {
+ val &= ~ID_DFR0_EL1_PerfMon_MASK;
+ perfmon = 0;
+ }
/*
* Allow DFR0_EL1.PerfMon to be set from userspace as long as
@@ -1307,29 +2139,158 @@ static int set_id_dfr0_el1(struct kvm_vcpu *vcpu,
* AArch64 side (as everything is emulated with that), and
* that this is a PMUv3.
*/
- perfmon = FIELD_GET(ARM64_FEATURE_MASK(ID_DFR0_EL1_PerfMon), val);
- if ((perfmon != ID_DFR0_EL1_PerfMon_IMPDEF && perfmon > host_perfmon) ||
- (perfmon != 0 && perfmon < ID_DFR0_EL1_PerfMon_PMUv3))
+ if (perfmon != 0 && perfmon < ID_DFR0_EL1_PerfMon_PMUv3)
return -EINVAL;
- valid_pmu = (perfmon != 0 && perfmon != ID_DFR0_EL1_PerfMon_IMPDEF);
+ if (copdbg < ID_DFR0_EL1_CopDbg_Armv8)
+ return -EINVAL;
+
+ return set_id_reg(vcpu, rd, val);
+}
+
+static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd, u64 user_val)
+{
+ u64 hw_val = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1);
+ u64 mpam_mask = ID_AA64PFR0_EL1_MPAM_MASK;
+
+ /*
+ * Commit 011e5f5bf529f ("arm64/cpufeature: Add remaining feature bits
+ * in ID_AA64PFR0 register") exposed the MPAM field of AA64PFR0_EL1 to
+ * guests, but didn't add trap handling. KVM doesn't support MPAM and
+ * always returns an UNDEF for these registers. The guest must see 0
+ * for this field.
+ *
+ * But KVM must also accept values from user-space that were provided
+ * by KVM. On CPUs that support MPAM, permit user-space to write
+ * the sanitizied value to ID_AA64PFR0_EL1.MPAM, but ignore this field.
+ */
+ if ((hw_val & mpam_mask) == (user_val & mpam_mask))
+ user_val &= ~ID_AA64PFR0_EL1_MPAM_MASK;
- /* Make sure view register and PMU support do match */
- if (kvm_vcpu_has_pmu(vcpu) != valid_pmu)
+ /* Fail the guest's request to disable the AA64 ISA at EL{0,1,2} */
+ if (!FIELD_GET(ID_AA64PFR0_EL1_EL0, user_val) ||
+ !FIELD_GET(ID_AA64PFR0_EL1_EL1, user_val) ||
+ (vcpu_has_nv(vcpu) && !FIELD_GET(ID_AA64PFR0_EL1_EL2, user_val)))
return -EINVAL;
- /* We can only differ with PerfMon, and anything else is an error */
- val ^= read_id_reg(vcpu, rd);
- val &= ~ARM64_FEATURE_MASK(ID_DFR0_EL1_PerfMon);
- if (val)
+ /*
+ * If we are running on a GICv5 host and support FEAT_GCIE_LEGACY, then
+ * we support GICv3. Fail attempts to do anything but set that to IMP.
+ */
+ if (vgic_is_v3_compat(vcpu->kvm) &&
+ FIELD_GET(ID_AA64PFR0_EL1_GIC_MASK, user_val) != ID_AA64PFR0_EL1_GIC_IMP)
return -EINVAL;
- if (valid_pmu)
- vcpu->kvm->arch.dfr0_pmuver.imp = perfmon_to_pmuver(perfmon);
- else
- vcpu->kvm->arch.dfr0_pmuver.unimp = perfmon_to_pmuver(perfmon);
+ return set_id_reg(vcpu, rd, user_val);
+}
- return 0;
+static int set_id_aa64pfr1_el1(struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd, u64 user_val)
+{
+ u64 hw_val = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1);
+ u64 mpam_mask = ID_AA64PFR1_EL1_MPAM_frac_MASK;
+ u8 mte = SYS_FIELD_GET(ID_AA64PFR1_EL1, MTE, hw_val);
+ u8 user_mte_frac = SYS_FIELD_GET(ID_AA64PFR1_EL1, MTE_frac, user_val);
+ u8 hw_mte_frac = SYS_FIELD_GET(ID_AA64PFR1_EL1, MTE_frac, hw_val);
+
+ /* See set_id_aa64pfr0_el1 for comment about MPAM */
+ if ((hw_val & mpam_mask) == (user_val & mpam_mask))
+ user_val &= ~ID_AA64PFR1_EL1_MPAM_frac_MASK;
+
+ /*
+ * Previously MTE_frac was hidden from guest. However, if the
+ * hardware supports MTE2 but not MTE_ASYM_FAULT then a value
+ * of 0 for this field indicates that the hardware supports
+ * MTE_ASYNC. Whereas, 0xf indicates MTE_ASYNC is not supported.
+ *
+ * As KVM must accept values from KVM provided by user-space,
+ * when ID_AA64PFR1_EL1.MTE is 2 allow user-space to set
+ * ID_AA64PFR1_EL1.MTE_frac to 0. However, ignore it to avoid
+ * incorrectly claiming hardware support for MTE_ASYNC in the
+ * guest.
+ */
+
+ if (mte == ID_AA64PFR1_EL1_MTE_MTE2 &&
+ hw_mte_frac == ID_AA64PFR1_EL1_MTE_frac_NI &&
+ user_mte_frac == ID_AA64PFR1_EL1_MTE_frac_ASYNC) {
+ user_val &= ~ID_AA64PFR1_EL1_MTE_frac_MASK;
+ user_val |= hw_val & ID_AA64PFR1_EL1_MTE_frac_MASK;
+ }
+
+ return set_id_reg(vcpu, rd, user_val);
+}
+
+/*
+ * Allow userspace to de-feature a stage-2 translation granule but prevent it
+ * from claiming the impossible.
+ */
+#define tgran2_val_allowed(tg, safe, user) \
+({ \
+ u8 __s = SYS_FIELD_GET(ID_AA64MMFR0_EL1, tg, safe); \
+ u8 __u = SYS_FIELD_GET(ID_AA64MMFR0_EL1, tg, user); \
+ \
+ __s == __u || __u == ID_AA64MMFR0_EL1_##tg##_NI; \
+})
+
+static int set_id_aa64mmfr0_el1(struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd, u64 user_val)
+{
+ u64 sanitized_val = kvm_read_sanitised_id_reg(vcpu, rd);
+
+ if (!vcpu_has_nv(vcpu))
+ return set_id_reg(vcpu, rd, user_val);
+
+ if (!tgran2_val_allowed(TGRAN4_2, sanitized_val, user_val) ||
+ !tgran2_val_allowed(TGRAN16_2, sanitized_val, user_val) ||
+ !tgran2_val_allowed(TGRAN64_2, sanitized_val, user_val))
+ return -EINVAL;
+
+ return set_id_reg(vcpu, rd, user_val);
+}
+
+static int set_id_aa64mmfr2_el1(struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd, u64 user_val)
+{
+ u64 hw_val = read_sanitised_ftr_reg(SYS_ID_AA64MMFR2_EL1);
+ u64 nv_mask = ID_AA64MMFR2_EL1_NV_MASK;
+
+ /*
+ * We made the mistake to expose the now deprecated NV field,
+ * so allow userspace to write it, but silently ignore it.
+ */
+ if ((hw_val & nv_mask) == (user_val & nv_mask))
+ user_val &= ~nv_mask;
+
+ return set_id_reg(vcpu, rd, user_val);
+}
+
+static int set_ctr_el0(struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd, u64 user_val)
+{
+ u8 user_L1Ip = SYS_FIELD_GET(CTR_EL0, L1Ip, user_val);
+
+ /*
+ * Both AIVIVT (0b01) and VPIPT (0b00) are documented as reserved.
+ * Hence only allow to set VIPT(0b10) or PIPT(0b11) for L1Ip based
+ * on what hardware reports.
+ *
+ * Using a VIPT software model on PIPT will lead to over invalidation,
+ * but still correct. Hence, we can allow downgrading PIPT to VIPT,
+ * but not the other way around. This is handled via arm64_ftr_safe_value()
+ * as CTR_EL0 ftr_bits has L1Ip field with type FTR_EXACT and safe value
+ * set as VIPT.
+ */
+ switch (user_L1Ip) {
+ case CTR_EL0_L1Ip_RESERVED_VPIPT:
+ case CTR_EL0_L1Ip_RESERVED_AIVIVT:
+ return -EINVAL;
+ case CTR_EL0_L1Ip_VIPT:
+ case CTR_EL0_L1Ip_PIPT:
+ return set_id_reg(vcpu, rd, user_val);
+ default:
+ return -ENOENT;
+ }
}
/*
@@ -1342,18 +2303,72 @@ static int set_id_dfr0_el1(struct kvm_vcpu *vcpu,
static int get_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
u64 *val)
{
+ /*
+ * Avoid locking if the VM has already started, as the ID registers are
+ * guaranteed to be invariant at that point.
+ */
+ if (kvm_vm_has_ran_once(vcpu->kvm)) {
+ *val = read_id_reg(vcpu, rd);
+ return 0;
+ }
+
+ mutex_lock(&vcpu->kvm->arch.config_lock);
*val = read_id_reg(vcpu, rd);
+ mutex_unlock(&vcpu->kvm->arch.config_lock);
+
return 0;
}
static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
u64 val)
{
- /* This is what we mean by invariant: you can't change it. */
- if (val != read_id_reg(vcpu, rd))
- return -EINVAL;
+ u32 id = reg_to_encoding(rd);
+ int ret;
- return 0;
+ mutex_lock(&vcpu->kvm->arch.config_lock);
+
+ /*
+ * Once the VM has started the ID registers are immutable. Reject any
+ * write that does not match the final register value.
+ */
+ if (kvm_vm_has_ran_once(vcpu->kvm)) {
+ if (val != read_id_reg(vcpu, rd))
+ ret = -EBUSY;
+ else
+ ret = 0;
+
+ mutex_unlock(&vcpu->kvm->arch.config_lock);
+ return ret;
+ }
+
+ ret = arm64_check_features(vcpu, rd, val);
+ if (!ret)
+ kvm_set_vm_id_reg(vcpu->kvm, id, val);
+
+ mutex_unlock(&vcpu->kvm->arch.config_lock);
+
+ /*
+ * arm64_check_features() returns -E2BIG to indicate the register's
+ * feature set is a superset of the maximally-allowed register value.
+ * While it would be nice to precisely describe this to userspace, the
+ * existing UAPI for KVM_SET_ONE_REG has it that invalid register
+ * writes return -EINVAL.
+ */
+ if (ret == -E2BIG)
+ ret = -EINVAL;
+ return ret;
+}
+
+void kvm_set_vm_id_reg(struct kvm *kvm, u32 reg, u64 val)
+{
+ u64 *p = __vm_id_reg(&kvm->arch, reg);
+
+ lockdep_assert_held(&kvm->arch.config_lock);
+
+ if (KVM_BUG_ON(kvm_vm_has_ran_once(kvm) || !p, kvm))
+ return;
+
+ *p = val;
}
static int get_raz_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
@@ -1375,7 +2390,7 @@ static bool access_ctr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
if (p->is_write)
return write_to_read_only(vcpu, p, r);
- p->regval = read_sanitised_ftr_reg(SYS_CTR_EL0);
+ p->regval = kvm_read_vm_id_reg(vcpu->kvm, SYS_CTR_EL0);
return true;
}
@@ -1385,10 +2400,80 @@ static bool access_clidr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
if (p->is_write)
return write_to_read_only(vcpu, p, r);
- p->regval = read_sysreg(clidr_el1);
+ p->regval = __vcpu_sys_reg(vcpu, r->reg);
return true;
}
+/*
+ * Fabricate a CLIDR_EL1 value instead of using the real value, which can vary
+ * by the physical CPU which the vcpu currently resides in.
+ */
+static u64 reset_clidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+{
+ u64 ctr_el0 = read_sanitised_ftr_reg(SYS_CTR_EL0);
+ u64 clidr;
+ u8 loc;
+
+ if ((ctr_el0 & CTR_EL0_IDC)) {
+ /*
+ * Data cache clean to the PoU is not required so LoUU and LoUIS
+ * will not be set and a unified cache, which will be marked as
+ * LoC, will be added.
+ *
+ * If not DIC, let the unified cache L2 so that an instruction
+ * cache can be added as L1 later.
+ */
+ loc = (ctr_el0 & CTR_EL0_DIC) ? 1 : 2;
+ clidr = CACHE_TYPE_UNIFIED << CLIDR_CTYPE_SHIFT(loc);
+ } else {
+ /*
+ * Data cache clean to the PoU is required so let L1 have a data
+ * cache and mark it as LoUU and LoUIS. As L1 has a data cache,
+ * it can be marked as LoC too.
+ */
+ loc = 1;
+ clidr = 1 << CLIDR_LOUU_SHIFT;
+ clidr |= 1 << CLIDR_LOUIS_SHIFT;
+ clidr |= CACHE_TYPE_DATA << CLIDR_CTYPE_SHIFT(1);
+ }
+
+ /*
+ * Instruction cache invalidation to the PoU is required so let L1 have
+ * an instruction cache. If L1 already has a data cache, it will be
+ * CACHE_TYPE_SEPARATE.
+ */
+ if (!(ctr_el0 & CTR_EL0_DIC))
+ clidr |= CACHE_TYPE_INST << CLIDR_CTYPE_SHIFT(1);
+
+ clidr |= loc << CLIDR_LOC_SHIFT;
+
+ /*
+ * Add tag cache unified to data cache. Allocation tags and data are
+ * unified in a cache line so that it looks valid even if there is only
+ * one cache line.
+ */
+ if (kvm_has_mte(vcpu->kvm))
+ clidr |= 2ULL << CLIDR_TTYPE_SHIFT(loc);
+
+ __vcpu_assign_sys_reg(vcpu, r->reg, clidr);
+
+ return __vcpu_sys_reg(vcpu, r->reg);
+}
+
+static int set_clidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+ u64 val)
+{
+ u64 ctr_el0 = read_sanitised_ftr_reg(SYS_CTR_EL0);
+ u64 idc = !CLIDR_LOC(val) || (!CLIDR_LOUIS(val) && !CLIDR_LOUU(val));
+
+ if ((val & CLIDR_EL1_RES0) || (!(ctr_el0 & CTR_EL0_IDC) && idc))
+ return -EINVAL;
+
+ __vcpu_assign_sys_reg(vcpu, rd->reg, val);
+
+ return 0;
+}
+
static bool access_csselr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
const struct sys_reg_desc *r)
{
@@ -1410,22 +2495,10 @@ static bool access_ccsidr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
return write_to_read_only(vcpu, p, r);
csselr = vcpu_read_sys_reg(vcpu, CSSELR_EL1);
- p->regval = get_ccsidr(csselr);
+ csselr &= CSSELR_EL1_Level | CSSELR_EL1_InD;
+ if (csselr < CSSELR_MAX)
+ p->regval = get_ccsidr(vcpu, csselr);
- /*
- * Guests should not be doing cache operations by set/way at all, and
- * for this reason, we trap them and attempt to infer the intent, so
- * that we can flush the entire guest's address space at the appropriate
- * time.
- * To prevent this trapping from causing performance problems, let's
- * expose the geometry of all data and unified caches (which are
- * guaranteed to be PIPT and thus non-aliasing) as 1 set and 1 way.
- * [If guests should attempt to infer aliasing properties from the
- * geometry (which is not permitted by the architecture), they would
- * only do so for virtually indexed caches.]
- */
- if (!(csselr & 1)) // data or unified cache
- p->regval &= ~GENMASK(27, 3);
return true;
}
@@ -1446,22 +2519,119 @@ static unsigned int mte_visibility(const struct kvm_vcpu *vcpu,
.visibility = mte_visibility, \
}
-/* sys_reg_desc initialiser for known cpufeature ID registers */
-#define ID_SANITISED(name) { \
+static unsigned int el2_visibility(const struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd)
+{
+ if (vcpu_has_nv(vcpu))
+ return 0;
+
+ return REG_HIDDEN;
+}
+
+static bool bad_vncr_trap(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ /*
+ * We really shouldn't be here, and this is likely the result
+ * of a misconfigured trap, as this register should target the
+ * VNCR page, and nothing else.
+ */
+ return bad_trap(vcpu, p, r,
+ "trap of VNCR-backed register");
+}
+
+static bool bad_redir_trap(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ /*
+ * We really shouldn't be here, and this is likely the result
+ * of a misconfigured trap, as this register should target the
+ * corresponding EL1, and nothing else.
+ */
+ return bad_trap(vcpu, p, r,
+ "trap of EL2 register redirected to EL1");
+}
+
+#define SYS_REG_USER_FILTER(name, acc, rst, v, gu, su, filter) { \
SYS_DESC(SYS_##name), \
+ .access = acc, \
+ .reset = rst, \
+ .reg = name, \
+ .get_user = gu, \
+ .set_user = su, \
+ .visibility = filter, \
+ .val = v, \
+}
+
+#define EL2_REG_FILTERED(name, acc, rst, v, filter) \
+ SYS_REG_USER_FILTER(name, acc, rst, v, NULL, NULL, filter)
+
+#define EL2_REG(name, acc, rst, v) \
+ EL2_REG_FILTERED(name, acc, rst, v, el2_visibility)
+
+#define EL2_REG_VNCR(name, rst, v) EL2_REG(name, bad_vncr_trap, rst, v)
+#define EL2_REG_VNCR_FILT(name, vis) \
+ EL2_REG_FILTERED(name, bad_vncr_trap, reset_val, 0, vis)
+#define EL2_REG_VNCR_GICv3(name) \
+ EL2_REG_VNCR_FILT(name, hidden_visibility)
+#define EL2_REG_REDIR(name, rst, v) EL2_REG(name, bad_redir_trap, rst, v)
+
+#define TIMER_REG(name, vis) \
+ SYS_REG_USER_FILTER(name, access_arch_timer, reset_val, 0, \
+ arch_timer_get_user, arch_timer_set_user, vis)
+
+/*
+ * Since reset() callback and field val are not used for idregs, they will be
+ * used for specific purposes for idregs.
+ * The reset() would return KVM sanitised register value. The value would be the
+ * same as the host kernel sanitised value if there is no KVM sanitisation.
+ * The val would be used as a mask indicating writable fields for the idreg.
+ * Only bits with 1 are writable from userspace. This mask might not be
+ * necessary in the future whenever all ID registers are enabled as writable
+ * from userspace.
+ */
+
+#define ID_DESC_DEFAULT_CALLBACKS \
.access = access_id_reg, \
.get_user = get_id_reg, \
.set_user = set_id_reg, \
.visibility = id_visibility, \
-}
+ .reset = kvm_read_sanitised_id_reg
-/* sys_reg_desc initialiser for known cpufeature ID registers */
-#define AA32_ID_SANITISED(name) { \
+#define ID_DESC(name) \
SYS_DESC(SYS_##name), \
- .access = access_id_reg, \
- .get_user = get_id_reg, \
- .set_user = set_id_reg, \
+ ID_DESC_DEFAULT_CALLBACKS
+
+/* sys_reg_desc initialiser for known cpufeature ID registers */
+#define ID_SANITISED(name) { \
+ ID_DESC(name), \
+ .val = 0, \
+}
+
+/* sys_reg_desc initialiser for writable ID registers */
+#define ID_WRITABLE(name, mask) { \
+ ID_DESC(name), \
+ .val = mask, \
+}
+
+/*
+ * 32bit ID regs are fully writable when the guest is 32bit
+ * capable. Nothing in the KVM code should rely on 32bit features
+ * anyway, only 64bit, so let the VMM do its worse.
+ */
+#define AA32_ID_WRITABLE(name) { \
+ ID_DESC(name), \
.visibility = aa32_id_visibility, \
+ .val = GENMASK(31, 0), \
+}
+
+/* sys_reg_desc initialiser for cpufeature ID registers that need filtering */
+#define ID_FILTERED(sysreg, name, mask) { \
+ ID_DESC(sysreg), \
+ .set_user = set_##name, \
+ .val = (mask), \
}
/*
@@ -1470,11 +2640,11 @@ static unsigned int mte_visibility(const struct kvm_vcpu *vcpu,
* (1 <= crm < 8, 0 <= Op2 < 8).
*/
#define ID_UNALLOCATED(crm, op2) { \
+ .name = "S3_0_0_" #crm "_" #op2, \
Op0(3), Op1(0), CRn(0), CRm(crm), Op2(op2), \
- .access = access_id_reg, \
- .get_user = get_id_reg, \
- .set_user = set_id_reg, \
- .visibility = raz_visibility \
+ ID_DESC_DEFAULT_CALLBACKS, \
+ .visibility = raz_visibility, \
+ .val = 0, \
}
/*
@@ -1483,11 +2653,439 @@ static unsigned int mte_visibility(const struct kvm_vcpu *vcpu,
* RAZ for the guest.
*/
#define ID_HIDDEN(name) { \
- SYS_DESC(SYS_##name), \
- .access = access_id_reg, \
- .get_user = get_id_reg, \
- .set_user = set_id_reg, \
+ ID_DESC(name), \
.visibility = raz_visibility, \
+ .val = 0, \
+}
+
+static bool access_sp_el1(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ if (p->is_write)
+ __vcpu_assign_sys_reg(vcpu, SP_EL1, p->regval);
+ else
+ p->regval = __vcpu_sys_reg(vcpu, SP_EL1);
+
+ return true;
+}
+
+static bool access_elr(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ if (p->is_write)
+ vcpu_write_sys_reg(vcpu, p->regval, ELR_EL1);
+ else
+ p->regval = vcpu_read_sys_reg(vcpu, ELR_EL1);
+
+ return true;
+}
+
+static bool access_spsr(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ if (p->is_write)
+ __vcpu_assign_sys_reg(vcpu, SPSR_EL1, p->regval);
+ else
+ p->regval = __vcpu_sys_reg(vcpu, SPSR_EL1);
+
+ return true;
+}
+
+static bool access_cntkctl_el12(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ if (p->is_write)
+ __vcpu_assign_sys_reg(vcpu, CNTKCTL_EL1, p->regval);
+ else
+ p->regval = __vcpu_sys_reg(vcpu, CNTKCTL_EL1);
+
+ return true;
+}
+
+static u64 reset_hcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+{
+ u64 val = r->val;
+
+ if (!cpus_have_final_cap(ARM64_HAS_HCR_NV1))
+ val |= HCR_E2H;
+
+ __vcpu_assign_sys_reg(vcpu, r->reg, val);
+
+ return __vcpu_sys_reg(vcpu, r->reg);
+}
+
+static unsigned int __el2_visibility(const struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd,
+ unsigned int (*fn)(const struct kvm_vcpu *,
+ const struct sys_reg_desc *))
+{
+ return el2_visibility(vcpu, rd) ?: fn(vcpu, rd);
+}
+
+static unsigned int sve_el2_visibility(const struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd)
+{
+ return __el2_visibility(vcpu, rd, sve_visibility);
+}
+
+static unsigned int vncr_el2_visibility(const struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd)
+{
+ if (el2_visibility(vcpu, rd) == 0 &&
+ kvm_has_feat(vcpu->kvm, ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY))
+ return 0;
+
+ return REG_HIDDEN;
+}
+
+static unsigned int sctlr2_visibility(const struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd)
+{
+ if (kvm_has_sctlr2(vcpu->kvm))
+ return 0;
+
+ return REG_HIDDEN;
+}
+
+static unsigned int sctlr2_el2_visibility(const struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd)
+{
+ return __el2_visibility(vcpu, rd, sctlr2_visibility);
+}
+
+static bool access_zcr_el2(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ unsigned int vq;
+
+ if (guest_hyp_sve_traps_enabled(vcpu)) {
+ kvm_inject_nested_sve_trap(vcpu);
+ return false;
+ }
+
+ if (!p->is_write) {
+ p->regval = __vcpu_sys_reg(vcpu, ZCR_EL2);
+ return true;
+ }
+
+ vq = SYS_FIELD_GET(ZCR_ELx, LEN, p->regval) + 1;
+ vq = min(vq, vcpu_sve_max_vq(vcpu));
+ __vcpu_assign_sys_reg(vcpu, ZCR_EL2, vq - 1);
+ return true;
+}
+
+static bool access_gic_vtr(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ if (p->is_write)
+ return write_to_read_only(vcpu, p, r);
+
+ p->regval = kvm_get_guest_vtr_el2();
+
+ return true;
+}
+
+static bool access_gic_misr(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ if (p->is_write)
+ return write_to_read_only(vcpu, p, r);
+
+ p->regval = vgic_v3_get_misr(vcpu);
+
+ return true;
+}
+
+static bool access_gic_eisr(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ if (p->is_write)
+ return write_to_read_only(vcpu, p, r);
+
+ p->regval = vgic_v3_get_eisr(vcpu);
+
+ return true;
+}
+
+static bool access_gic_elrsr(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ if (p->is_write)
+ return write_to_read_only(vcpu, p, r);
+
+ p->regval = vgic_v3_get_elrsr(vcpu);
+
+ return true;
+}
+
+static unsigned int s1poe_visibility(const struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd)
+{
+ if (kvm_has_s1poe(vcpu->kvm))
+ return 0;
+
+ return REG_HIDDEN;
+}
+
+static unsigned int s1poe_el2_visibility(const struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd)
+{
+ return __el2_visibility(vcpu, rd, s1poe_visibility);
+}
+
+static unsigned int tcr2_visibility(const struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd)
+{
+ if (kvm_has_tcr2(vcpu->kvm))
+ return 0;
+
+ return REG_HIDDEN;
+}
+
+static unsigned int tcr2_el2_visibility(const struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd)
+{
+ return __el2_visibility(vcpu, rd, tcr2_visibility);
+}
+
+static unsigned int fgt2_visibility(const struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd)
+{
+ if (el2_visibility(vcpu, rd) == 0 &&
+ kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, FGT, FGT2))
+ return 0;
+
+ return REG_HIDDEN;
+}
+
+static unsigned int fgt_visibility(const struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd)
+{
+ if (el2_visibility(vcpu, rd) == 0 &&
+ kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, FGT, IMP))
+ return 0;
+
+ return REG_HIDDEN;
+}
+
+static unsigned int s1pie_visibility(const struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd)
+{
+ if (kvm_has_s1pie(vcpu->kvm))
+ return 0;
+
+ return REG_HIDDEN;
+}
+
+static unsigned int s1pie_el2_visibility(const struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd)
+{
+ return __el2_visibility(vcpu, rd, s1pie_visibility);
+}
+
+static unsigned int cnthv_visibility(const struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd)
+{
+ if (vcpu_has_nv(vcpu) &&
+ !vcpu_has_feature(vcpu, KVM_ARM_VCPU_HAS_EL2_E2H0))
+ return 0;
+
+ return REG_HIDDEN;
+}
+
+static bool access_mdcr(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ u64 hpmn, val, old = __vcpu_sys_reg(vcpu, MDCR_EL2);
+
+ if (!p->is_write) {
+ p->regval = old;
+ return true;
+ }
+
+ val = p->regval;
+ hpmn = FIELD_GET(MDCR_EL2_HPMN, val);
+
+ /*
+ * If HPMN is out of bounds, limit it to what we actually
+ * support. This matches the UNKNOWN definition of the field
+ * in that case, and keeps the emulation simple. Sort of.
+ */
+ if (hpmn > vcpu->kvm->arch.nr_pmu_counters) {
+ hpmn = vcpu->kvm->arch.nr_pmu_counters;
+ u64p_replace_bits(&val, hpmn, MDCR_EL2_HPMN);
+ }
+
+ __vcpu_assign_sys_reg(vcpu, MDCR_EL2, val);
+
+ /*
+ * Request a reload of the PMU to enable/disable the counters
+ * affected by HPME.
+ */
+ if ((old ^ val) & MDCR_EL2_HPME)
+ kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu);
+
+ return true;
+}
+
+static bool access_ras(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ struct kvm *kvm = vcpu->kvm;
+
+ switch(reg_to_encoding(r)) {
+ case SYS_ERXPFGCDN_EL1:
+ case SYS_ERXPFGCTL_EL1:
+ case SYS_ERXPFGF_EL1:
+ case SYS_ERXMISC2_EL1:
+ case SYS_ERXMISC3_EL1:
+ if (!(kvm_has_feat(kvm, ID_AA64PFR0_EL1, RAS, V1P1) ||
+ (kvm_has_feat_enum(kvm, ID_AA64PFR0_EL1, RAS, IMP) &&
+ kvm_has_feat(kvm, ID_AA64PFR1_EL1, RAS_frac, RASv1p1)))) {
+ kvm_inject_undefined(vcpu);
+ return false;
+ }
+ break;
+ default:
+ if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, RAS, IMP)) {
+ kvm_inject_undefined(vcpu);
+ return false;
+ }
+ }
+
+ return trap_raz_wi(vcpu, p, r);
+}
+
+/*
+ * For historical (ahem ABI) reasons, KVM treated MIDR_EL1, REVIDR_EL1, and
+ * AIDR_EL1 as "invariant" registers, meaning userspace cannot change them.
+ * The values made visible to userspace were the register values of the boot
+ * CPU.
+ *
+ * At the same time, reads from these registers at EL1 previously were not
+ * trapped, allowing the guest to read the actual hardware value. On big-little
+ * machines, this means the VM can see different values depending on where a
+ * given vCPU got scheduled.
+ *
+ * These registers are now trapped as collateral damage from SME, and what
+ * follows attempts to give a user / guest view consistent with the existing
+ * ABI.
+ */
+static bool access_imp_id_reg(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ if (p->is_write)
+ return write_to_read_only(vcpu, p, r);
+
+ /*
+ * Return the VM-scoped implementation ID register values if userspace
+ * has made them writable.
+ */
+ if (test_bit(KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS, &vcpu->kvm->arch.flags))
+ return access_id_reg(vcpu, p, r);
+
+ /*
+ * Otherwise, fall back to the old behavior of returning the value of
+ * the current CPU.
+ */
+ switch (reg_to_encoding(r)) {
+ case SYS_REVIDR_EL1:
+ p->regval = read_sysreg(revidr_el1);
+ break;
+ case SYS_AIDR_EL1:
+ p->regval = read_sysreg(aidr_el1);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ }
+
+ return true;
+}
+
+static u64 __ro_after_init boot_cpu_midr_val;
+static u64 __ro_after_init boot_cpu_revidr_val;
+static u64 __ro_after_init boot_cpu_aidr_val;
+
+static void init_imp_id_regs(void)
+{
+ boot_cpu_midr_val = read_sysreg(midr_el1);
+ boot_cpu_revidr_val = read_sysreg(revidr_el1);
+ boot_cpu_aidr_val = read_sysreg(aidr_el1);
+}
+
+static u64 reset_imp_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+{
+ switch (reg_to_encoding(r)) {
+ case SYS_MIDR_EL1:
+ return boot_cpu_midr_val;
+ case SYS_REVIDR_EL1:
+ return boot_cpu_revidr_val;
+ case SYS_AIDR_EL1:
+ return boot_cpu_aidr_val;
+ default:
+ KVM_BUG_ON(1, vcpu->kvm);
+ return 0;
+ }
+}
+
+static int set_imp_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
+ u64 val)
+{
+ struct kvm *kvm = vcpu->kvm;
+ u64 expected;
+
+ guard(mutex)(&kvm->arch.config_lock);
+
+ expected = read_id_reg(vcpu, r);
+ if (expected == val)
+ return 0;
+
+ if (!test_bit(KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS, &kvm->arch.flags))
+ return -EINVAL;
+
+ /*
+ * Once the VM has started the ID registers are immutable. Reject the
+ * write if userspace tries to change it.
+ */
+ if (kvm_vm_has_ran_once(kvm))
+ return -EBUSY;
+
+ /*
+ * Any value is allowed for the implementation ID registers so long as
+ * it is within the writable mask.
+ */
+ if ((val & r->val) != val)
+ return -EINVAL;
+
+ kvm_set_vm_id_reg(kvm, reg_to_encoding(r), val);
+ return 0;
+}
+
+#define IMPLEMENTATION_ID(reg, mask) { \
+ SYS_DESC(SYS_##reg), \
+ .access = access_imp_id_reg, \
+ .get_user = get_id_reg, \
+ .set_user = set_imp_id_reg, \
+ .reset = reset_imp_id_reg, \
+ .val = mask, \
+ }
+
+static u64 reset_mdcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+{
+ __vcpu_assign_sys_reg(vcpu, r->reg, vcpu->kvm->arch.nr_pmu_counters);
+ return vcpu->kvm->arch.nr_pmu_counters;
}
/*
@@ -1502,10 +3100,6 @@ static unsigned int mte_visibility(const struct kvm_vcpu *vcpu,
* guest...
*/
static const struct sys_reg_desc sys_reg_descs[] = {
- { SYS_DESC(SYS_DC_ISW), access_dcsw },
- { SYS_DESC(SYS_DC_CSW), access_dcsw },
- { SYS_DESC(SYS_DC_CISW), access_dcsw },
-
DBG_BCR_BVR_WCR_WVR_EL1(0),
DBG_BCR_BVR_WCR_WVR_EL1(1),
{ SYS_DESC(SYS_MDCCINT_EL1), trap_debug_regs, reset_val, MDCCINT_EL1, 0 },
@@ -1528,7 +3122,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
{ SYS_DESC(SYS_MDRAR_EL1), trap_raz_wi },
{ SYS_DESC(SYS_OSLAR_EL1), trap_oslar_el1 },
{ SYS_DESC(SYS_OSLSR_EL1), trap_oslsr_el1, reset_val, OSLSR_EL1,
- SYS_OSLSR_OSLM_IMPLEMENTED, .set_user = set_oslsr_el1, },
+ OSLSR_EL1_OSLM_IMPLEMENTED, .set_user = set_oslsr_el1, },
{ SYS_DESC(SYS_OSDLR_EL1), trap_raz_wi },
{ SYS_DESC(SYS_DBGPRCR_EL1), trap_raz_wi },
{ SYS_DESC(SYS_DBGCLAIMSET_EL1), trap_raz_wi },
@@ -1540,9 +3134,11 @@ static const struct sys_reg_desc sys_reg_descs[] = {
// DBGDTR[TR]X_EL0 share the same encoding
{ SYS_DESC(SYS_DBGDTRTX_EL0), trap_raz_wi },
- { SYS_DESC(SYS_DBGVCR32_EL2), NULL, reset_val, DBGVCR32_EL2, 0 },
+ { SYS_DESC(SYS_DBGVCR32_EL2), undef_access, reset_val, DBGVCR32_EL2, 0 },
+ IMPLEMENTATION_ID(MIDR_EL1, GENMASK_ULL(31, 0)),
{ SYS_DESC(SYS_MPIDR_EL1), NULL, reset_mpidr, MPIDR_EL1 },
+ IMPLEMENTATION_ID(REVIDR_EL1, GENMASK_ULL(63, 0)),
/*
* ID regs: all ID_SANITISED() entries here must have corresponding
@@ -1551,52 +3147,89 @@ static const struct sys_reg_desc sys_reg_descs[] = {
/* AArch64 mappings of the AArch32 ID registers */
/* CRm=1 */
- AA32_ID_SANITISED(ID_PFR0_EL1),
- AA32_ID_SANITISED(ID_PFR1_EL1),
- { SYS_DESC(SYS_ID_DFR0_EL1), .access = access_id_reg,
- .get_user = get_id_reg, .set_user = set_id_dfr0_el1,
- .visibility = aa32_id_visibility, },
+ AA32_ID_WRITABLE(ID_PFR0_EL1),
+ AA32_ID_WRITABLE(ID_PFR1_EL1),
+ { SYS_DESC(SYS_ID_DFR0_EL1),
+ .access = access_id_reg,
+ .get_user = get_id_reg,
+ .set_user = set_id_dfr0_el1,
+ .visibility = aa32_id_visibility,
+ .reset = read_sanitised_id_dfr0_el1,
+ .val = GENMASK(31, 0) },
ID_HIDDEN(ID_AFR0_EL1),
- AA32_ID_SANITISED(ID_MMFR0_EL1),
- AA32_ID_SANITISED(ID_MMFR1_EL1),
- AA32_ID_SANITISED(ID_MMFR2_EL1),
- AA32_ID_SANITISED(ID_MMFR3_EL1),
+ AA32_ID_WRITABLE(ID_MMFR0_EL1),
+ AA32_ID_WRITABLE(ID_MMFR1_EL1),
+ AA32_ID_WRITABLE(ID_MMFR2_EL1),
+ AA32_ID_WRITABLE(ID_MMFR3_EL1),
/* CRm=2 */
- AA32_ID_SANITISED(ID_ISAR0_EL1),
- AA32_ID_SANITISED(ID_ISAR1_EL1),
- AA32_ID_SANITISED(ID_ISAR2_EL1),
- AA32_ID_SANITISED(ID_ISAR3_EL1),
- AA32_ID_SANITISED(ID_ISAR4_EL1),
- AA32_ID_SANITISED(ID_ISAR5_EL1),
- AA32_ID_SANITISED(ID_MMFR4_EL1),
- AA32_ID_SANITISED(ID_ISAR6_EL1),
+ AA32_ID_WRITABLE(ID_ISAR0_EL1),
+ AA32_ID_WRITABLE(ID_ISAR1_EL1),
+ AA32_ID_WRITABLE(ID_ISAR2_EL1),
+ AA32_ID_WRITABLE(ID_ISAR3_EL1),
+ AA32_ID_WRITABLE(ID_ISAR4_EL1),
+ AA32_ID_WRITABLE(ID_ISAR5_EL1),
+ AA32_ID_WRITABLE(ID_MMFR4_EL1),
+ AA32_ID_WRITABLE(ID_ISAR6_EL1),
/* CRm=3 */
- AA32_ID_SANITISED(MVFR0_EL1),
- AA32_ID_SANITISED(MVFR1_EL1),
- AA32_ID_SANITISED(MVFR2_EL1),
+ AA32_ID_WRITABLE(MVFR0_EL1),
+ AA32_ID_WRITABLE(MVFR1_EL1),
+ AA32_ID_WRITABLE(MVFR2_EL1),
ID_UNALLOCATED(3,3),
- AA32_ID_SANITISED(ID_PFR2_EL1),
+ AA32_ID_WRITABLE(ID_PFR2_EL1),
ID_HIDDEN(ID_DFR1_EL1),
- AA32_ID_SANITISED(ID_MMFR5_EL1),
+ AA32_ID_WRITABLE(ID_MMFR5_EL1),
ID_UNALLOCATED(3,7),
/* AArch64 ID registers */
/* CRm=4 */
- { SYS_DESC(SYS_ID_AA64PFR0_EL1), .access = access_id_reg,
- .get_user = get_id_reg, .set_user = set_id_aa64pfr0_el1, },
- ID_SANITISED(ID_AA64PFR1_EL1),
- ID_UNALLOCATED(4,2),
+ ID_FILTERED(ID_AA64PFR0_EL1, id_aa64pfr0_el1,
+ ~(ID_AA64PFR0_EL1_AMU |
+ ID_AA64PFR0_EL1_MPAM |
+ ID_AA64PFR0_EL1_SVE |
+ ID_AA64PFR0_EL1_AdvSIMD |
+ ID_AA64PFR0_EL1_FP)),
+ ID_FILTERED(ID_AA64PFR1_EL1, id_aa64pfr1_el1,
+ ~(ID_AA64PFR1_EL1_PFAR |
+ ID_AA64PFR1_EL1_MTEX |
+ ID_AA64PFR1_EL1_THE |
+ ID_AA64PFR1_EL1_GCS |
+ ID_AA64PFR1_EL1_MTE_frac |
+ ID_AA64PFR1_EL1_NMI |
+ ID_AA64PFR1_EL1_RNDR_trap |
+ ID_AA64PFR1_EL1_SME |
+ ID_AA64PFR1_EL1_RES0 |
+ ID_AA64PFR1_EL1_MPAM_frac |
+ ID_AA64PFR1_EL1_MTE)),
+ ID_WRITABLE(ID_AA64PFR2_EL1,
+ ID_AA64PFR2_EL1_FPMR |
+ ID_AA64PFR2_EL1_MTEFAR |
+ ID_AA64PFR2_EL1_MTESTOREONLY),
ID_UNALLOCATED(4,3),
- ID_SANITISED(ID_AA64ZFR0_EL1),
+ ID_WRITABLE(ID_AA64ZFR0_EL1, ~ID_AA64ZFR0_EL1_RES0),
ID_HIDDEN(ID_AA64SMFR0_EL1),
ID_UNALLOCATED(4,6),
- ID_UNALLOCATED(4,7),
+ ID_WRITABLE(ID_AA64FPFR0_EL1, ~ID_AA64FPFR0_EL1_RES0),
/* CRm=5 */
- { SYS_DESC(SYS_ID_AA64DFR0_EL1), .access = access_id_reg,
- .get_user = get_id_reg, .set_user = set_id_aa64dfr0_el1, },
+ /*
+ * Prior to FEAT_Debugv8.9, the architecture defines context-aware
+ * breakpoints (CTX_CMPs) as the highest numbered breakpoints (BRPs).
+ * KVM does not trap + emulate the breakpoint registers, and as such
+ * cannot support a layout that misaligns with the underlying hardware.
+ * While it may be possible to describe a subset that aligns with
+ * hardware, just prevent changes to BRPs and CTX_CMPs altogether for
+ * simplicity.
+ *
+ * See DDI0487K.a, section D2.8.3 Breakpoint types and linking
+ * of breakpoints for more details.
+ */
+ ID_FILTERED(ID_AA64DFR0_EL1, id_aa64dfr0_el1,
+ ID_AA64DFR0_EL1_DoubleLock_MASK |
+ ID_AA64DFR0_EL1_WRPs_MASK |
+ ID_AA64DFR0_EL1_PMUVer_MASK |
+ ID_AA64DFR0_EL1_DebugVer_MASK),
ID_SANITISED(ID_AA64DFR1_EL1),
ID_UNALLOCATED(5,2),
ID_UNALLOCATED(5,3),
@@ -1606,21 +3239,42 @@ static const struct sys_reg_desc sys_reg_descs[] = {
ID_UNALLOCATED(5,7),
/* CRm=6 */
- ID_SANITISED(ID_AA64ISAR0_EL1),
- ID_SANITISED(ID_AA64ISAR1_EL1),
- ID_SANITISED(ID_AA64ISAR2_EL1),
- ID_UNALLOCATED(6,3),
+ ID_WRITABLE(ID_AA64ISAR0_EL1, ~ID_AA64ISAR0_EL1_RES0),
+ ID_WRITABLE(ID_AA64ISAR1_EL1, ~(ID_AA64ISAR1_EL1_GPI |
+ ID_AA64ISAR1_EL1_GPA |
+ ID_AA64ISAR1_EL1_API |
+ ID_AA64ISAR1_EL1_APA)),
+ ID_WRITABLE(ID_AA64ISAR2_EL1, ~(ID_AA64ISAR2_EL1_RES0 |
+ ID_AA64ISAR2_EL1_APA3 |
+ ID_AA64ISAR2_EL1_GPA3)),
+ ID_WRITABLE(ID_AA64ISAR3_EL1, (ID_AA64ISAR3_EL1_FPRCVT |
+ ID_AA64ISAR3_EL1_LSFE |
+ ID_AA64ISAR3_EL1_FAMINMAX)),
ID_UNALLOCATED(6,4),
ID_UNALLOCATED(6,5),
ID_UNALLOCATED(6,6),
ID_UNALLOCATED(6,7),
/* CRm=7 */
- ID_SANITISED(ID_AA64MMFR0_EL1),
- ID_SANITISED(ID_AA64MMFR1_EL1),
- ID_SANITISED(ID_AA64MMFR2_EL1),
- ID_UNALLOCATED(7,3),
- ID_UNALLOCATED(7,4),
+ ID_FILTERED(ID_AA64MMFR0_EL1, id_aa64mmfr0_el1,
+ ~(ID_AA64MMFR0_EL1_RES0 |
+ ID_AA64MMFR0_EL1_ASIDBITS)),
+ ID_WRITABLE(ID_AA64MMFR1_EL1, ~(ID_AA64MMFR1_EL1_RES0 |
+ ID_AA64MMFR1_EL1_XNX |
+ ID_AA64MMFR1_EL1_VH |
+ ID_AA64MMFR1_EL1_VMIDBits)),
+ ID_FILTERED(ID_AA64MMFR2_EL1,
+ id_aa64mmfr2_el1, ~(ID_AA64MMFR2_EL1_RES0 |
+ ID_AA64MMFR2_EL1_EVT |
+ ID_AA64MMFR2_EL1_FWB |
+ ID_AA64MMFR2_EL1_IDS |
+ ID_AA64MMFR2_EL1_NV |
+ ID_AA64MMFR2_EL1_CCIDX)),
+ ID_WRITABLE(ID_AA64MMFR3_EL1, (ID_AA64MMFR3_EL1_TCRX |
+ ID_AA64MMFR3_EL1_SCTLRX |
+ ID_AA64MMFR3_EL1_S1PIE |
+ ID_AA64MMFR3_EL1_S1POE)),
+ ID_WRITABLE(ID_AA64MMFR4_EL1, ID_AA64MMFR4_EL1_NV_frac),
ID_UNALLOCATED(7,5),
ID_UNALLOCATED(7,6),
ID_UNALLOCATED(7,7),
@@ -1628,6 +3282,8 @@ static const struct sys_reg_desc sys_reg_descs[] = {
{ SYS_DESC(SYS_SCTLR_EL1), access_vm_reg, reset_val, SCTLR_EL1, 0x00C50078 },
{ SYS_DESC(SYS_ACTLR_EL1), access_actlr, reset_actlr, ACTLR_EL1 },
{ SYS_DESC(SYS_CPACR_EL1), NULL, reset_val, CPACR_EL1, 0 },
+ { SYS_DESC(SYS_SCTLR2_EL1), access_vm_reg, reset_val, SCTLR2_EL1, 0,
+ .visibility = sctlr2_visibility },
MTE_REG(RGSR_EL1),
MTE_REG(GCR_EL1),
@@ -1639,6 +3295,8 @@ static const struct sys_reg_desc sys_reg_descs[] = {
{ SYS_DESC(SYS_TTBR0_EL1), access_vm_reg, reset_unknown, TTBR0_EL1 },
{ SYS_DESC(SYS_TTBR1_EL1), access_vm_reg, reset_unknown, TTBR1_EL1 },
{ SYS_DESC(SYS_TCR_EL1), access_vm_reg, reset_val, TCR_EL1, 0 },
+ { SYS_DESC(SYS_TCR2_EL1), access_vm_reg, reset_val, TCR2_EL1, 0,
+ .visibility = tcr2_visibility },
PTRAUTH_KEY(APIA),
PTRAUTH_KEY(APIB),
@@ -1646,18 +3304,28 @@ static const struct sys_reg_desc sys_reg_descs[] = {
PTRAUTH_KEY(APDB),
PTRAUTH_KEY(APGA),
+ { SYS_DESC(SYS_SPSR_EL1), access_spsr},
+ { SYS_DESC(SYS_ELR_EL1), access_elr},
+
+ { SYS_DESC(SYS_ICC_PMR_EL1), undef_access },
+
{ SYS_DESC(SYS_AFSR0_EL1), access_vm_reg, reset_unknown, AFSR0_EL1 },
{ SYS_DESC(SYS_AFSR1_EL1), access_vm_reg, reset_unknown, AFSR1_EL1 },
{ SYS_DESC(SYS_ESR_EL1), access_vm_reg, reset_unknown, ESR_EL1 },
- { SYS_DESC(SYS_ERRIDR_EL1), trap_raz_wi },
- { SYS_DESC(SYS_ERRSELR_EL1), trap_raz_wi },
- { SYS_DESC(SYS_ERXFR_EL1), trap_raz_wi },
- { SYS_DESC(SYS_ERXCTLR_EL1), trap_raz_wi },
- { SYS_DESC(SYS_ERXSTATUS_EL1), trap_raz_wi },
- { SYS_DESC(SYS_ERXADDR_EL1), trap_raz_wi },
- { SYS_DESC(SYS_ERXMISC0_EL1), trap_raz_wi },
- { SYS_DESC(SYS_ERXMISC1_EL1), trap_raz_wi },
+ { SYS_DESC(SYS_ERRIDR_EL1), access_ras },
+ { SYS_DESC(SYS_ERRSELR_EL1), access_ras },
+ { SYS_DESC(SYS_ERXFR_EL1), access_ras },
+ { SYS_DESC(SYS_ERXCTLR_EL1), access_ras },
+ { SYS_DESC(SYS_ERXSTATUS_EL1), access_ras },
+ { SYS_DESC(SYS_ERXADDR_EL1), access_ras },
+ { SYS_DESC(SYS_ERXPFGF_EL1), access_ras },
+ { SYS_DESC(SYS_ERXPFGCTL_EL1), access_ras },
+ { SYS_DESC(SYS_ERXPFGCDN_EL1), access_ras },
+ { SYS_DESC(SYS_ERXMISC0_EL1), access_ras },
+ { SYS_DESC(SYS_ERXMISC1_EL1), access_ras },
+ { SYS_DESC(SYS_ERXMISC2_EL1), access_ras },
+ { SYS_DESC(SYS_ERXMISC3_EL1), access_ras },
MTE_REG(TFSR_EL1),
MTE_REG(TFSRE0_EL1),
@@ -1676,89 +3344,133 @@ static const struct sys_reg_desc sys_reg_descs[] = {
{ SYS_DESC(SYS_PMBLIMITR_EL1), undef_access },
{ SYS_DESC(SYS_PMBPTR_EL1), undef_access },
{ SYS_DESC(SYS_PMBSR_EL1), undef_access },
+ { SYS_DESC(SYS_PMSDSFR_EL1), undef_access },
/* PMBIDR_EL1 is not trapped */
- { PMU_SYS_REG(SYS_PMINTENSET_EL1),
- .access = access_pminten, .reg = PMINTENSET_EL1 },
- { PMU_SYS_REG(SYS_PMINTENCLR_EL1),
- .access = access_pminten, .reg = PMINTENSET_EL1 },
+ { PMU_SYS_REG(PMINTENSET_EL1),
+ .access = access_pminten, .reg = PMINTENSET_EL1,
+ .get_user = get_pmreg, .set_user = set_pmreg },
+ { PMU_SYS_REG(PMINTENCLR_EL1),
+ .access = access_pminten, .reg = PMINTENSET_EL1,
+ .get_user = get_pmreg, .set_user = set_pmreg },
{ SYS_DESC(SYS_PMMIR_EL1), trap_raz_wi },
{ SYS_DESC(SYS_MAIR_EL1), access_vm_reg, reset_unknown, MAIR_EL1 },
+ { SYS_DESC(SYS_PIRE0_EL1), NULL, reset_unknown, PIRE0_EL1,
+ .visibility = s1pie_visibility },
+ { SYS_DESC(SYS_PIR_EL1), NULL, reset_unknown, PIR_EL1,
+ .visibility = s1pie_visibility },
+ { SYS_DESC(SYS_POR_EL1), NULL, reset_unknown, POR_EL1,
+ .visibility = s1poe_visibility },
{ SYS_DESC(SYS_AMAIR_EL1), access_vm_reg, reset_amair_el1, AMAIR_EL1 },
{ SYS_DESC(SYS_LORSA_EL1), trap_loregion },
{ SYS_DESC(SYS_LOREA_EL1), trap_loregion },
{ SYS_DESC(SYS_LORN_EL1), trap_loregion },
{ SYS_DESC(SYS_LORC_EL1), trap_loregion },
+ { SYS_DESC(SYS_MPAMIDR_EL1), undef_access },
{ SYS_DESC(SYS_LORID_EL1), trap_loregion },
- { SYS_DESC(SYS_VBAR_EL1), NULL, reset_val, VBAR_EL1, 0 },
+ { SYS_DESC(SYS_MPAM1_EL1), undef_access },
+ { SYS_DESC(SYS_MPAM0_EL1), undef_access },
+ { SYS_DESC(SYS_VBAR_EL1), access_rw, reset_val, VBAR_EL1, 0 },
{ SYS_DESC(SYS_DISR_EL1), NULL, reset_val, DISR_EL1, 0 },
- { SYS_DESC(SYS_ICC_IAR0_EL1), write_to_read_only },
- { SYS_DESC(SYS_ICC_EOIR0_EL1), read_from_write_only },
- { SYS_DESC(SYS_ICC_HPPIR0_EL1), write_to_read_only },
- { SYS_DESC(SYS_ICC_DIR_EL1), read_from_write_only },
- { SYS_DESC(SYS_ICC_RPR_EL1), write_to_read_only },
+ { SYS_DESC(SYS_ICC_IAR0_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_EOIR0_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_HPPIR0_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_BPR0_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_AP0R0_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_AP0R1_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_AP0R2_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_AP0R3_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_AP1R0_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_AP1R1_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_AP1R2_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_AP1R3_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_DIR_EL1), access_gic_dir },
+ { SYS_DESC(SYS_ICC_RPR_EL1), undef_access },
{ SYS_DESC(SYS_ICC_SGI1R_EL1), access_gic_sgi },
{ SYS_DESC(SYS_ICC_ASGI1R_EL1), access_gic_sgi },
{ SYS_DESC(SYS_ICC_SGI0R_EL1), access_gic_sgi },
- { SYS_DESC(SYS_ICC_IAR1_EL1), write_to_read_only },
- { SYS_DESC(SYS_ICC_EOIR1_EL1), read_from_write_only },
- { SYS_DESC(SYS_ICC_HPPIR1_EL1), write_to_read_only },
+ { SYS_DESC(SYS_ICC_IAR1_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_EOIR1_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_HPPIR1_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_BPR1_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_CTLR_EL1), undef_access },
{ SYS_DESC(SYS_ICC_SRE_EL1), access_gic_sre },
+ { SYS_DESC(SYS_ICC_IGRPEN0_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_IGRPEN1_EL1), undef_access },
{ SYS_DESC(SYS_CONTEXTIDR_EL1), access_vm_reg, reset_val, CONTEXTIDR_EL1, 0 },
{ SYS_DESC(SYS_TPIDR_EL1), NULL, reset_unknown, TPIDR_EL1 },
+ { SYS_DESC(SYS_ACCDATA_EL1), undef_access },
+
{ SYS_DESC(SYS_SCXTNUM_EL1), undef_access },
{ SYS_DESC(SYS_CNTKCTL_EL1), NULL, reset_val, CNTKCTL_EL1, 0},
{ SYS_DESC(SYS_CCSIDR_EL1), access_ccsidr },
- { SYS_DESC(SYS_CLIDR_EL1), access_clidr },
+ { SYS_DESC(SYS_CLIDR_EL1), access_clidr, reset_clidr, CLIDR_EL1,
+ .set_user = set_clidr, .val = ~CLIDR_EL1_RES0 },
+ { SYS_DESC(SYS_CCSIDR2_EL1), undef_access },
{ SYS_DESC(SYS_SMIDR_EL1), undef_access },
+ IMPLEMENTATION_ID(AIDR_EL1, GENMASK_ULL(63, 0)),
{ SYS_DESC(SYS_CSSELR_EL1), access_csselr, reset_unknown, CSSELR_EL1 },
- { SYS_DESC(SYS_CTR_EL0), access_ctr },
- { SYS_DESC(SYS_SVCR), undef_access },
-
- { PMU_SYS_REG(SYS_PMCR_EL0), .access = access_pmcr,
- .reset = reset_pmcr, .reg = PMCR_EL0 },
- { PMU_SYS_REG(SYS_PMCNTENSET_EL0),
- .access = access_pmcnten, .reg = PMCNTENSET_EL0 },
- { PMU_SYS_REG(SYS_PMCNTENCLR_EL0),
- .access = access_pmcnten, .reg = PMCNTENSET_EL0 },
- { PMU_SYS_REG(SYS_PMOVSCLR_EL0),
- .access = access_pmovs, .reg = PMOVSSET_EL0 },
+ ID_FILTERED(CTR_EL0, ctr_el0,
+ CTR_EL0_DIC_MASK |
+ CTR_EL0_IDC_MASK |
+ CTR_EL0_DminLine_MASK |
+ CTR_EL0_L1Ip_MASK |
+ CTR_EL0_IminLine_MASK),
+ { SYS_DESC(SYS_SVCR), undef_access, reset_val, SVCR, 0, .visibility = sme_visibility },
+ { SYS_DESC(SYS_FPMR), undef_access, reset_val, FPMR, 0, .visibility = fp8_visibility },
+
+ { PMU_SYS_REG(PMCR_EL0), .access = access_pmcr, .reset = reset_pmcr,
+ .reg = PMCR_EL0, .get_user = get_pmcr, .set_user = set_pmcr },
+ { PMU_SYS_REG(PMCNTENSET_EL0),
+ .access = access_pmcnten, .reg = PMCNTENSET_EL0,
+ .get_user = get_pmreg, .set_user = set_pmreg },
+ { PMU_SYS_REG(PMCNTENCLR_EL0),
+ .access = access_pmcnten, .reg = PMCNTENSET_EL0,
+ .get_user = get_pmreg, .set_user = set_pmreg },
+ { PMU_SYS_REG(PMOVSCLR_EL0),
+ .access = access_pmovs, .reg = PMOVSSET_EL0,
+ .get_user = get_pmreg, .set_user = set_pmreg },
/*
* PM_SWINC_EL0 is exposed to userspace as RAZ/WI, as it was
* previously (and pointlessly) advertised in the past...
*/
- { PMU_SYS_REG(SYS_PMSWINC_EL0),
+ { PMU_SYS_REG(PMSWINC_EL0),
.get_user = get_raz_reg, .set_user = set_wi_reg,
.access = access_pmswinc, .reset = NULL },
- { PMU_SYS_REG(SYS_PMSELR_EL0),
+ { PMU_SYS_REG(PMSELR_EL0),
.access = access_pmselr, .reset = reset_pmselr, .reg = PMSELR_EL0 },
- { PMU_SYS_REG(SYS_PMCEID0_EL0),
+ { PMU_SYS_REG(PMCEID0_EL0),
.access = access_pmceid, .reset = NULL },
- { PMU_SYS_REG(SYS_PMCEID1_EL0),
+ { PMU_SYS_REG(PMCEID1_EL0),
.access = access_pmceid, .reset = NULL },
- { PMU_SYS_REG(SYS_PMCCNTR_EL0),
- .access = access_pmu_evcntr, .reset = reset_unknown, .reg = PMCCNTR_EL0 },
- { PMU_SYS_REG(SYS_PMXEVTYPER_EL0),
+ { PMU_SYS_REG(PMCCNTR_EL0),
+ .access = access_pmu_evcntr, .reset = reset_unknown,
+ .reg = PMCCNTR_EL0, .get_user = get_pmu_evcntr,
+ .set_user = set_pmu_evcntr },
+ { PMU_SYS_REG(PMXEVTYPER_EL0),
.access = access_pmu_evtyper, .reset = NULL },
- { PMU_SYS_REG(SYS_PMXEVCNTR_EL0),
+ { PMU_SYS_REG(PMXEVCNTR_EL0),
.access = access_pmu_evcntr, .reset = NULL },
/*
* PMUSERENR_EL0 resets as unknown in 64bit mode while it resets as zero
* in 32bit mode. Here we choose to reset it as zero for consistency.
*/
- { PMU_SYS_REG(SYS_PMUSERENR_EL0), .access = access_pmuserenr,
+ { PMU_SYS_REG(PMUSERENR_EL0), .access = access_pmuserenr,
.reset = reset_val, .reg = PMUSERENR_EL0, .val = 0 },
- { PMU_SYS_REG(SYS_PMOVSSET_EL0),
- .access = access_pmovs, .reg = PMOVSSET_EL0 },
+ { PMU_SYS_REG(PMOVSSET_EL0),
+ .access = access_pmovs, .reg = PMOVSSET_EL0,
+ .get_user = get_pmreg, .set_user = set_pmreg },
+ { SYS_DESC(SYS_POR_EL0), NULL, reset_unknown, POR_EL0,
+ .visibility = s1poe_visibility },
{ SYS_DESC(SYS_TPIDR_EL0), NULL, reset_unknown, TPIDR_EL0 },
{ SYS_DESC(SYS_TPIDRRO_EL0), NULL, reset_unknown, TPIDRRO_EL0 },
{ SYS_DESC(SYS_TPIDR2_EL0), undef_access },
@@ -1838,9 +3550,19 @@ static const struct sys_reg_desc sys_reg_descs[] = {
AMU_AMEVTYPER1_EL0(14),
AMU_AMEVTYPER1_EL0(15),
+ { SYS_DESC(SYS_CNTPCT_EL0), .access = access_arch_timer,
+ .get_user = arch_timer_get_user, .set_user = arch_timer_set_user },
+ { SYS_DESC(SYS_CNTVCT_EL0), .access = access_arch_timer,
+ .get_user = arch_timer_get_user, .set_user = arch_timer_set_user },
+ { SYS_DESC(SYS_CNTPCTSS_EL0), access_arch_timer },
+ { SYS_DESC(SYS_CNTVCTSS_EL0), access_arch_timer },
{ SYS_DESC(SYS_CNTP_TVAL_EL0), access_arch_timer },
- { SYS_DESC(SYS_CNTP_CTL_EL0), access_arch_timer },
- { SYS_DESC(SYS_CNTP_CVAL_EL0), access_arch_timer },
+ TIMER_REG(CNTP_CTL_EL0, NULL),
+ TIMER_REG(CNTP_CVAL_EL0, NULL),
+
+ { SYS_DESC(SYS_CNTV_TVAL_EL0), access_arch_timer },
+ TIMER_REG(CNTV_CTL_EL0, NULL),
+ TIMER_REG(CNTV_CVAL_EL0, NULL),
/* PMEVCNTRn_EL0 */
PMU_PMEVCNTR_EL0(0),
@@ -1910,12 +3632,650 @@ static const struct sys_reg_desc sys_reg_descs[] = {
* PMCCFILTR_EL0 resets as unknown in 64bit mode while it resets as zero
* in 32bit mode. Here we choose to reset it as zero for consistency.
*/
- { PMU_SYS_REG(SYS_PMCCFILTR_EL0), .access = access_pmu_evtyper,
+ { PMU_SYS_REG(PMCCFILTR_EL0), .access = access_pmu_evtyper,
.reset = reset_val, .reg = PMCCFILTR_EL0, .val = 0 },
- { SYS_DESC(SYS_DACR32_EL2), NULL, reset_unknown, DACR32_EL2 },
- { SYS_DESC(SYS_IFSR32_EL2), NULL, reset_unknown, IFSR32_EL2 },
- { SYS_DESC(SYS_FPEXC32_EL2), NULL, reset_val, FPEXC32_EL2, 0x700 },
+ EL2_REG_VNCR(VPIDR_EL2, reset_unknown, 0),
+ EL2_REG_VNCR(VMPIDR_EL2, reset_unknown, 0),
+ EL2_REG(SCTLR_EL2, access_rw, reset_val, SCTLR_EL2_RES1),
+ EL2_REG(ACTLR_EL2, access_rw, reset_val, 0),
+ EL2_REG_FILTERED(SCTLR2_EL2, access_vm_reg, reset_val, 0,
+ sctlr2_el2_visibility),
+ EL2_REG_VNCR(HCR_EL2, reset_hcr, 0),
+ EL2_REG(MDCR_EL2, access_mdcr, reset_mdcr, 0),
+ EL2_REG(CPTR_EL2, access_rw, reset_val, CPTR_NVHE_EL2_RES1),
+ EL2_REG_VNCR(HSTR_EL2, reset_val, 0),
+ EL2_REG_VNCR_FILT(HFGRTR_EL2, fgt_visibility),
+ EL2_REG_VNCR_FILT(HFGWTR_EL2, fgt_visibility),
+ EL2_REG_VNCR(HFGITR_EL2, reset_val, 0),
+ EL2_REG_VNCR(HACR_EL2, reset_val, 0),
+
+ EL2_REG_FILTERED(ZCR_EL2, access_zcr_el2, reset_val, 0,
+ sve_el2_visibility),
+
+ EL2_REG_VNCR(HCRX_EL2, reset_val, 0),
+
+ EL2_REG(TTBR0_EL2, access_rw, reset_val, 0),
+ EL2_REG(TTBR1_EL2, access_rw, reset_val, 0),
+ EL2_REG(TCR_EL2, access_rw, reset_val, TCR_EL2_RES1),
+ EL2_REG_FILTERED(TCR2_EL2, access_rw, reset_val, TCR2_EL2_RES1,
+ tcr2_el2_visibility),
+ EL2_REG_VNCR(VTTBR_EL2, reset_val, 0),
+ EL2_REG_VNCR(VTCR_EL2, reset_val, 0),
+ EL2_REG_FILTERED(VNCR_EL2, bad_vncr_trap, reset_val, 0,
+ vncr_el2_visibility),
+
+ { SYS_DESC(SYS_DACR32_EL2), undef_access, reset_unknown, DACR32_EL2 },
+ EL2_REG_VNCR_FILT(HDFGRTR2_EL2, fgt2_visibility),
+ EL2_REG_VNCR_FILT(HDFGWTR2_EL2, fgt2_visibility),
+ EL2_REG_VNCR_FILT(HFGRTR2_EL2, fgt2_visibility),
+ EL2_REG_VNCR_FILT(HFGWTR2_EL2, fgt2_visibility),
+ EL2_REG_VNCR_FILT(HDFGRTR_EL2, fgt_visibility),
+ EL2_REG_VNCR_FILT(HDFGWTR_EL2, fgt_visibility),
+ EL2_REG_VNCR_FILT(HAFGRTR_EL2, fgt_visibility),
+ EL2_REG_VNCR_FILT(HFGITR2_EL2, fgt2_visibility),
+ EL2_REG_REDIR(SPSR_EL2, reset_val, 0),
+ EL2_REG_REDIR(ELR_EL2, reset_val, 0),
+ { SYS_DESC(SYS_SP_EL1), access_sp_el1},
+
+ /* AArch32 SPSR_* are RES0 if trapped from a NV guest */
+ { SYS_DESC(SYS_SPSR_irq), .access = trap_raz_wi },
+ { SYS_DESC(SYS_SPSR_abt), .access = trap_raz_wi },
+ { SYS_DESC(SYS_SPSR_und), .access = trap_raz_wi },
+ { SYS_DESC(SYS_SPSR_fiq), .access = trap_raz_wi },
+
+ { SYS_DESC(SYS_IFSR32_EL2), undef_access, reset_unknown, IFSR32_EL2 },
+ EL2_REG(AFSR0_EL2, access_rw, reset_val, 0),
+ EL2_REG(AFSR1_EL2, access_rw, reset_val, 0),
+ EL2_REG_REDIR(ESR_EL2, reset_val, 0),
+ EL2_REG_VNCR(VSESR_EL2, reset_unknown, 0),
+ { SYS_DESC(SYS_FPEXC32_EL2), undef_access, reset_val, FPEXC32_EL2, 0x700 },
+
+ EL2_REG_REDIR(FAR_EL2, reset_val, 0),
+ EL2_REG(HPFAR_EL2, access_rw, reset_val, 0),
+
+ EL2_REG(MAIR_EL2, access_rw, reset_val, 0),
+ EL2_REG_FILTERED(PIRE0_EL2, access_rw, reset_val, 0,
+ s1pie_el2_visibility),
+ EL2_REG_FILTERED(PIR_EL2, access_rw, reset_val, 0,
+ s1pie_el2_visibility),
+ EL2_REG_FILTERED(POR_EL2, access_rw, reset_val, 0,
+ s1poe_el2_visibility),
+ EL2_REG(AMAIR_EL2, access_rw, reset_val, 0),
+ { SYS_DESC(SYS_MPAMHCR_EL2), undef_access },
+ { SYS_DESC(SYS_MPAMVPMV_EL2), undef_access },
+ { SYS_DESC(SYS_MPAM2_EL2), undef_access },
+ { SYS_DESC(SYS_MPAMVPM0_EL2), undef_access },
+ { SYS_DESC(SYS_MPAMVPM1_EL2), undef_access },
+ { SYS_DESC(SYS_MPAMVPM2_EL2), undef_access },
+ { SYS_DESC(SYS_MPAMVPM3_EL2), undef_access },
+ { SYS_DESC(SYS_MPAMVPM4_EL2), undef_access },
+ { SYS_DESC(SYS_MPAMVPM5_EL2), undef_access },
+ { SYS_DESC(SYS_MPAMVPM6_EL2), undef_access },
+ { SYS_DESC(SYS_MPAMVPM7_EL2), undef_access },
+
+ EL2_REG(VBAR_EL2, access_rw, reset_val, 0),
+ { SYS_DESC(SYS_RVBAR_EL2), undef_access },
+ { SYS_DESC(SYS_RMR_EL2), undef_access },
+ EL2_REG_VNCR(VDISR_EL2, reset_unknown, 0),
+
+ EL2_REG_VNCR_GICv3(ICH_AP0R0_EL2),
+ EL2_REG_VNCR_GICv3(ICH_AP0R1_EL2),
+ EL2_REG_VNCR_GICv3(ICH_AP0R2_EL2),
+ EL2_REG_VNCR_GICv3(ICH_AP0R3_EL2),
+ EL2_REG_VNCR_GICv3(ICH_AP1R0_EL2),
+ EL2_REG_VNCR_GICv3(ICH_AP1R1_EL2),
+ EL2_REG_VNCR_GICv3(ICH_AP1R2_EL2),
+ EL2_REG_VNCR_GICv3(ICH_AP1R3_EL2),
+
+ { SYS_DESC(SYS_ICC_SRE_EL2), access_gic_sre },
+
+ EL2_REG_VNCR_GICv3(ICH_HCR_EL2),
+ { SYS_DESC(SYS_ICH_VTR_EL2), access_gic_vtr },
+ { SYS_DESC(SYS_ICH_MISR_EL2), access_gic_misr },
+ { SYS_DESC(SYS_ICH_EISR_EL2), access_gic_eisr },
+ { SYS_DESC(SYS_ICH_ELRSR_EL2), access_gic_elrsr },
+ EL2_REG_VNCR_GICv3(ICH_VMCR_EL2),
+
+ EL2_REG_VNCR_GICv3(ICH_LR0_EL2),
+ EL2_REG_VNCR_GICv3(ICH_LR1_EL2),
+ EL2_REG_VNCR_GICv3(ICH_LR2_EL2),
+ EL2_REG_VNCR_GICv3(ICH_LR3_EL2),
+ EL2_REG_VNCR_GICv3(ICH_LR4_EL2),
+ EL2_REG_VNCR_GICv3(ICH_LR5_EL2),
+ EL2_REG_VNCR_GICv3(ICH_LR6_EL2),
+ EL2_REG_VNCR_GICv3(ICH_LR7_EL2),
+ EL2_REG_VNCR_GICv3(ICH_LR8_EL2),
+ EL2_REG_VNCR_GICv3(ICH_LR9_EL2),
+ EL2_REG_VNCR_GICv3(ICH_LR10_EL2),
+ EL2_REG_VNCR_GICv3(ICH_LR11_EL2),
+ EL2_REG_VNCR_GICv3(ICH_LR12_EL2),
+ EL2_REG_VNCR_GICv3(ICH_LR13_EL2),
+ EL2_REG_VNCR_GICv3(ICH_LR14_EL2),
+ EL2_REG_VNCR_GICv3(ICH_LR15_EL2),
+
+ EL2_REG(CONTEXTIDR_EL2, access_rw, reset_val, 0),
+ EL2_REG(TPIDR_EL2, access_rw, reset_val, 0),
+
+ EL2_REG_VNCR(CNTVOFF_EL2, reset_val, 0),
+ EL2_REG(CNTHCTL_EL2, access_rw, reset_val, 0),
+ { SYS_DESC(SYS_CNTHP_TVAL_EL2), access_arch_timer },
+ TIMER_REG(CNTHP_CTL_EL2, el2_visibility),
+ TIMER_REG(CNTHP_CVAL_EL2, el2_visibility),
+
+ { SYS_DESC(SYS_CNTHV_TVAL_EL2), access_arch_timer, .visibility = cnthv_visibility },
+ TIMER_REG(CNTHV_CTL_EL2, cnthv_visibility),
+ TIMER_REG(CNTHV_CVAL_EL2, cnthv_visibility),
+
+ { SYS_DESC(SYS_CNTKCTL_EL12), access_cntkctl_el12 },
+
+ { SYS_DESC(SYS_CNTP_TVAL_EL02), access_arch_timer },
+ { SYS_DESC(SYS_CNTP_CTL_EL02), access_arch_timer },
+ { SYS_DESC(SYS_CNTP_CVAL_EL02), access_arch_timer },
+
+ { SYS_DESC(SYS_CNTV_TVAL_EL02), access_arch_timer },
+ { SYS_DESC(SYS_CNTV_CTL_EL02), access_arch_timer },
+ { SYS_DESC(SYS_CNTV_CVAL_EL02), access_arch_timer },
+
+ EL2_REG(SP_EL2, NULL, reset_unknown, 0),
+};
+
+static bool handle_at_s1e01(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ u32 op = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2);
+
+ if (__kvm_at_s1e01(vcpu, op, p->regval))
+ return false;
+
+ return true;
+}
+
+static bool handle_at_s1e2(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ u32 op = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2);
+
+ /* There is no FGT associated with AT S1E2A :-( */
+ if (op == OP_AT_S1E2A &&
+ !kvm_has_feat(vcpu->kvm, ID_AA64ISAR2_EL1, ATS1A, IMP)) {
+ kvm_inject_undefined(vcpu);
+ return false;
+ }
+
+ if (__kvm_at_s1e2(vcpu, op, p->regval))
+ return false;
+
+ return true;
+}
+
+static bool handle_at_s12(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ u32 op = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2);
+
+ if (__kvm_at_s12(vcpu, op, p->regval))
+ return false;
+
+ return true;
+}
+
+static bool kvm_supported_tlbi_s12_op(struct kvm_vcpu *vpcu, u32 instr)
+{
+ struct kvm *kvm = vpcu->kvm;
+ u8 CRm = sys_reg_CRm(instr);
+
+ if (sys_reg_CRn(instr) == TLBI_CRn_nXS &&
+ !kvm_has_feat(kvm, ID_AA64ISAR1_EL1, XS, IMP))
+ return false;
+
+ if (CRm == TLBI_CRm_nROS &&
+ !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, OS))
+ return false;
+
+ return true;
+}
+
+static bool handle_alle1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2);
+
+ if (!kvm_supported_tlbi_s12_op(vcpu, sys_encoding))
+ return undef_access(vcpu, p, r);
+
+ write_lock(&vcpu->kvm->mmu_lock);
+
+ /*
+ * Drop all shadow S2s, resulting in S1/S2 TLBIs for each of the
+ * corresponding VMIDs.
+ */
+ kvm_nested_s2_unmap(vcpu->kvm, true);
+
+ write_unlock(&vcpu->kvm->mmu_lock);
+
+ return true;
+}
+
+static bool kvm_supported_tlbi_ipas2_op(struct kvm_vcpu *vpcu, u32 instr)
+{
+ struct kvm *kvm = vpcu->kvm;
+ u8 CRm = sys_reg_CRm(instr);
+ u8 Op2 = sys_reg_Op2(instr);
+
+ if (sys_reg_CRn(instr) == TLBI_CRn_nXS &&
+ !kvm_has_feat(kvm, ID_AA64ISAR1_EL1, XS, IMP))
+ return false;
+
+ if (CRm == TLBI_CRm_IPAIS && (Op2 == 2 || Op2 == 6) &&
+ !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, RANGE))
+ return false;
+
+ if (CRm == TLBI_CRm_IPAONS && (Op2 == 0 || Op2 == 4) &&
+ !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, OS))
+ return false;
+
+ if (CRm == TLBI_CRm_IPAONS && (Op2 == 3 || Op2 == 7) &&
+ !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, RANGE))
+ return false;
+
+ return true;
+}
+
+/* Only defined here as this is an internal "abstraction" */
+union tlbi_info {
+ struct {
+ u64 start;
+ u64 size;
+ } range;
+
+ struct {
+ u64 addr;
+ } ipa;
+
+ struct {
+ u64 addr;
+ u32 encoding;
+ } va;
+};
+
+static void s2_mmu_unmap_range(struct kvm_s2_mmu *mmu,
+ const union tlbi_info *info)
+{
+ /*
+ * The unmap operation is allowed to drop the MMU lock and block, which
+ * means that @mmu could be used for a different context than the one
+ * currently being invalidated.
+ *
+ * This behavior is still safe, as:
+ *
+ * 1) The vCPU(s) that recycled the MMU are responsible for invalidating
+ * the entire MMU before reusing it, which still honors the intent
+ * of a TLBI.
+ *
+ * 2) Until the guest TLBI instruction is 'retired' (i.e. increment PC
+ * and ERET to the guest), other vCPUs are allowed to use stale
+ * translations.
+ *
+ * 3) Accidentally unmapping an unrelated MMU context is nonfatal, and
+ * at worst may cause more aborts for shadow stage-2 fills.
+ *
+ * Dropping the MMU lock also implies that shadow stage-2 fills could
+ * happen behind the back of the TLBI. This is still safe, though, as
+ * the L1 needs to put its stage-2 in a consistent state before doing
+ * the TLBI.
+ */
+ kvm_stage2_unmap_range(mmu, info->range.start, info->range.size, true);
+}
+
+static bool handle_vmalls12e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2);
+ u64 limit, vttbr;
+
+ if (!kvm_supported_tlbi_s12_op(vcpu, sys_encoding))
+ return undef_access(vcpu, p, r);
+
+ vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2);
+ limit = BIT_ULL(kvm_get_pa_bits(vcpu->kvm));
+
+ kvm_s2_mmu_iterate_by_vmid(vcpu->kvm, get_vmid(vttbr),
+ &(union tlbi_info) {
+ .range = {
+ .start = 0,
+ .size = limit,
+ },
+ },
+ s2_mmu_unmap_range);
+
+ return true;
+}
+
+static bool handle_ripas2e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2);
+ u64 vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2);
+ u64 base, range;
+
+ if (!kvm_supported_tlbi_ipas2_op(vcpu, sys_encoding))
+ return undef_access(vcpu, p, r);
+
+ /*
+ * Because the shadow S2 structure doesn't necessarily reflect that
+ * of the guest's S2 (different base granule size, for example), we
+ * decide to ignore TTL and only use the described range.
+ */
+ base = decode_range_tlbi(p->regval, &range, NULL);
+
+ kvm_s2_mmu_iterate_by_vmid(vcpu->kvm, get_vmid(vttbr),
+ &(union tlbi_info) {
+ .range = {
+ .start = base,
+ .size = range,
+ },
+ },
+ s2_mmu_unmap_range);
+
+ return true;
+}
+
+static void s2_mmu_unmap_ipa(struct kvm_s2_mmu *mmu,
+ const union tlbi_info *info)
+{
+ unsigned long max_size;
+ u64 base_addr;
+
+ /*
+ * We drop a number of things from the supplied value:
+ *
+ * - NS bit: we're non-secure only.
+ *
+ * - IPA[51:48]: We don't support 52bit IPA just yet...
+ *
+ * And of course, adjust the IPA to be on an actual address.
+ */
+ base_addr = (info->ipa.addr & GENMASK_ULL(35, 0)) << 12;
+ max_size = compute_tlb_inval_range(mmu, info->ipa.addr);
+ base_addr &= ~(max_size - 1);
+
+ /*
+ * See comment in s2_mmu_unmap_range() for why this is allowed to
+ * reschedule.
+ */
+ kvm_stage2_unmap_range(mmu, base_addr, max_size, true);
+}
+
+static bool handle_ipas2e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2);
+ u64 vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2);
+
+ if (!kvm_supported_tlbi_ipas2_op(vcpu, sys_encoding))
+ return undef_access(vcpu, p, r);
+
+ kvm_s2_mmu_iterate_by_vmid(vcpu->kvm, get_vmid(vttbr),
+ &(union tlbi_info) {
+ .ipa = {
+ .addr = p->regval,
+ },
+ },
+ s2_mmu_unmap_ipa);
+
+ return true;
+}
+
+static void s2_mmu_tlbi_s1e1(struct kvm_s2_mmu *mmu,
+ const union tlbi_info *info)
+{
+ WARN_ON(__kvm_tlbi_s1e2(mmu, info->va.addr, info->va.encoding));
+}
+
+static bool handle_tlbi_el2(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2);
+
+ if (!kvm_supported_tlbi_s1e2_op(vcpu, sys_encoding))
+ return undef_access(vcpu, p, r);
+
+ kvm_handle_s1e2_tlbi(vcpu, sys_encoding, p->regval);
+ return true;
+}
+
+static bool handle_tlbi_el1(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2);
+
+ /*
+ * If we're here, this is because we've trapped on a EL1 TLBI
+ * instruction that affects the EL1 translation regime while
+ * we're running in a context that doesn't allow us to let the
+ * HW do its thing (aka vEL2):
+ *
+ * - HCR_EL2.E2H == 0 : a non-VHE guest
+ * - HCR_EL2.{E2H,TGE} == { 1, 0 } : a VHE guest in guest mode
+ *
+ * Another possibility is that we are invalidating the EL2 context
+ * using EL1 instructions, but that we landed here because we need
+ * additional invalidation for structures that are not held in the
+ * CPU TLBs (such as the VNCR pseudo-TLB and its EL2 mapping). In
+ * that case, we are guaranteed that HCR_EL2.{E2H,TGE} == { 1, 1 }
+ * as we don't allow an NV-capable L1 in a nVHE configuration.
+ *
+ * We don't expect these helpers to ever be called when running
+ * in a vEL1 context.
+ */
+
+ WARN_ON(!vcpu_is_el2(vcpu));
+
+ if (!kvm_supported_tlbi_s1e1_op(vcpu, sys_encoding))
+ return undef_access(vcpu, p, r);
+
+ if (vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)) {
+ kvm_handle_s1e2_tlbi(vcpu, sys_encoding, p->regval);
+ return true;
+ }
+
+ kvm_s2_mmu_iterate_by_vmid(vcpu->kvm,
+ get_vmid(__vcpu_sys_reg(vcpu, VTTBR_EL2)),
+ &(union tlbi_info) {
+ .va = {
+ .addr = p->regval,
+ .encoding = sys_encoding,
+ },
+ },
+ s2_mmu_tlbi_s1e1);
+
+ return true;
+}
+
+#define SYS_INSN(insn, access_fn) \
+ { \
+ SYS_DESC(OP_##insn), \
+ .access = (access_fn), \
+ }
+
+static struct sys_reg_desc sys_insn_descs[] = {
+ { SYS_DESC(SYS_DC_ISW), access_dcsw },
+ { SYS_DESC(SYS_DC_IGSW), access_dcgsw },
+ { SYS_DESC(SYS_DC_IGDSW), access_dcgsw },
+
+ SYS_INSN(AT_S1E1R, handle_at_s1e01),
+ SYS_INSN(AT_S1E1W, handle_at_s1e01),
+ SYS_INSN(AT_S1E0R, handle_at_s1e01),
+ SYS_INSN(AT_S1E0W, handle_at_s1e01),
+ SYS_INSN(AT_S1E1RP, handle_at_s1e01),
+ SYS_INSN(AT_S1E1WP, handle_at_s1e01),
+
+ { SYS_DESC(SYS_DC_CSW), access_dcsw },
+ { SYS_DESC(SYS_DC_CGSW), access_dcgsw },
+ { SYS_DESC(SYS_DC_CGDSW), access_dcgsw },
+ { SYS_DESC(SYS_DC_CISW), access_dcsw },
+ { SYS_DESC(SYS_DC_CIGSW), access_dcgsw },
+ { SYS_DESC(SYS_DC_CIGDSW), access_dcgsw },
+
+ SYS_INSN(TLBI_VMALLE1OS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VAE1OS, handle_tlbi_el1),
+ SYS_INSN(TLBI_ASIDE1OS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VAAE1OS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VALE1OS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VAALE1OS, handle_tlbi_el1),
+
+ SYS_INSN(TLBI_RVAE1IS, handle_tlbi_el1),
+ SYS_INSN(TLBI_RVAAE1IS, handle_tlbi_el1),
+ SYS_INSN(TLBI_RVALE1IS, handle_tlbi_el1),
+ SYS_INSN(TLBI_RVAALE1IS, handle_tlbi_el1),
+
+ SYS_INSN(TLBI_VMALLE1IS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VAE1IS, handle_tlbi_el1),
+ SYS_INSN(TLBI_ASIDE1IS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VAAE1IS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VALE1IS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VAALE1IS, handle_tlbi_el1),
+
+ SYS_INSN(TLBI_RVAE1OS, handle_tlbi_el1),
+ SYS_INSN(TLBI_RVAAE1OS, handle_tlbi_el1),
+ SYS_INSN(TLBI_RVALE1OS, handle_tlbi_el1),
+ SYS_INSN(TLBI_RVAALE1OS, handle_tlbi_el1),
+
+ SYS_INSN(TLBI_RVAE1, handle_tlbi_el1),
+ SYS_INSN(TLBI_RVAAE1, handle_tlbi_el1),
+ SYS_INSN(TLBI_RVALE1, handle_tlbi_el1),
+ SYS_INSN(TLBI_RVAALE1, handle_tlbi_el1),
+
+ SYS_INSN(TLBI_VMALLE1, handle_tlbi_el1),
+ SYS_INSN(TLBI_VAE1, handle_tlbi_el1),
+ SYS_INSN(TLBI_ASIDE1, handle_tlbi_el1),
+ SYS_INSN(TLBI_VAAE1, handle_tlbi_el1),
+ SYS_INSN(TLBI_VALE1, handle_tlbi_el1),
+ SYS_INSN(TLBI_VAALE1, handle_tlbi_el1),
+
+ SYS_INSN(TLBI_VMALLE1OSNXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VAE1OSNXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_ASIDE1OSNXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VAAE1OSNXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VALE1OSNXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VAALE1OSNXS, handle_tlbi_el1),
+
+ SYS_INSN(TLBI_RVAE1ISNXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_RVAAE1ISNXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_RVALE1ISNXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_RVAALE1ISNXS, handle_tlbi_el1),
+
+ SYS_INSN(TLBI_VMALLE1ISNXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VAE1ISNXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_ASIDE1ISNXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VAAE1ISNXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VALE1ISNXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VAALE1ISNXS, handle_tlbi_el1),
+
+ SYS_INSN(TLBI_RVAE1OSNXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_RVAAE1OSNXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_RVALE1OSNXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_RVAALE1OSNXS, handle_tlbi_el1),
+
+ SYS_INSN(TLBI_RVAE1NXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_RVAAE1NXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_RVALE1NXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_RVAALE1NXS, handle_tlbi_el1),
+
+ SYS_INSN(TLBI_VMALLE1NXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VAE1NXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_ASIDE1NXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VAAE1NXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VALE1NXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VAALE1NXS, handle_tlbi_el1),
+
+ SYS_INSN(AT_S1E2R, handle_at_s1e2),
+ SYS_INSN(AT_S1E2W, handle_at_s1e2),
+ SYS_INSN(AT_S12E1R, handle_at_s12),
+ SYS_INSN(AT_S12E1W, handle_at_s12),
+ SYS_INSN(AT_S12E0R, handle_at_s12),
+ SYS_INSN(AT_S12E0W, handle_at_s12),
+ SYS_INSN(AT_S1E2A, handle_at_s1e2),
+
+ SYS_INSN(TLBI_IPAS2E1IS, handle_ipas2e1is),
+ SYS_INSN(TLBI_RIPAS2E1IS, handle_ripas2e1is),
+ SYS_INSN(TLBI_IPAS2LE1IS, handle_ipas2e1is),
+ SYS_INSN(TLBI_RIPAS2LE1IS, handle_ripas2e1is),
+
+ SYS_INSN(TLBI_ALLE2OS, handle_tlbi_el2),
+ SYS_INSN(TLBI_VAE2OS, handle_tlbi_el2),
+ SYS_INSN(TLBI_ALLE1OS, handle_alle1is),
+ SYS_INSN(TLBI_VALE2OS, handle_tlbi_el2),
+ SYS_INSN(TLBI_VMALLS12E1OS, handle_vmalls12e1is),
+
+ SYS_INSN(TLBI_RVAE2IS, handle_tlbi_el2),
+ SYS_INSN(TLBI_RVALE2IS, handle_tlbi_el2),
+ SYS_INSN(TLBI_ALLE2IS, handle_tlbi_el2),
+ SYS_INSN(TLBI_VAE2IS, handle_tlbi_el2),
+
+ SYS_INSN(TLBI_ALLE1IS, handle_alle1is),
+
+ SYS_INSN(TLBI_VALE2IS, handle_tlbi_el2),
+
+ SYS_INSN(TLBI_VMALLS12E1IS, handle_vmalls12e1is),
+ SYS_INSN(TLBI_IPAS2E1OS, handle_ipas2e1is),
+ SYS_INSN(TLBI_IPAS2E1, handle_ipas2e1is),
+ SYS_INSN(TLBI_RIPAS2E1, handle_ripas2e1is),
+ SYS_INSN(TLBI_RIPAS2E1OS, handle_ripas2e1is),
+ SYS_INSN(TLBI_IPAS2LE1OS, handle_ipas2e1is),
+ SYS_INSN(TLBI_IPAS2LE1, handle_ipas2e1is),
+ SYS_INSN(TLBI_RIPAS2LE1, handle_ripas2e1is),
+ SYS_INSN(TLBI_RIPAS2LE1OS, handle_ripas2e1is),
+ SYS_INSN(TLBI_RVAE2OS, handle_tlbi_el2),
+ SYS_INSN(TLBI_RVALE2OS, handle_tlbi_el2),
+ SYS_INSN(TLBI_RVAE2, handle_tlbi_el2),
+ SYS_INSN(TLBI_RVALE2, handle_tlbi_el2),
+ SYS_INSN(TLBI_ALLE2, handle_tlbi_el2),
+ SYS_INSN(TLBI_VAE2, handle_tlbi_el2),
+
+ SYS_INSN(TLBI_ALLE1, handle_alle1is),
+
+ SYS_INSN(TLBI_VALE2, handle_tlbi_el2),
+
+ SYS_INSN(TLBI_VMALLS12E1, handle_vmalls12e1is),
+
+ SYS_INSN(TLBI_IPAS2E1ISNXS, handle_ipas2e1is),
+ SYS_INSN(TLBI_RIPAS2E1ISNXS, handle_ripas2e1is),
+ SYS_INSN(TLBI_IPAS2LE1ISNXS, handle_ipas2e1is),
+ SYS_INSN(TLBI_RIPAS2LE1ISNXS, handle_ripas2e1is),
+
+ SYS_INSN(TLBI_ALLE2OSNXS, handle_tlbi_el2),
+ SYS_INSN(TLBI_VAE2OSNXS, handle_tlbi_el2),
+ SYS_INSN(TLBI_ALLE1OSNXS, handle_alle1is),
+ SYS_INSN(TLBI_VALE2OSNXS, handle_tlbi_el2),
+ SYS_INSN(TLBI_VMALLS12E1OSNXS, handle_vmalls12e1is),
+
+ SYS_INSN(TLBI_RVAE2ISNXS, handle_tlbi_el2),
+ SYS_INSN(TLBI_RVALE2ISNXS, handle_tlbi_el2),
+ SYS_INSN(TLBI_ALLE2ISNXS, handle_tlbi_el2),
+ SYS_INSN(TLBI_VAE2ISNXS, handle_tlbi_el2),
+
+ SYS_INSN(TLBI_ALLE1ISNXS, handle_alle1is),
+ SYS_INSN(TLBI_VALE2ISNXS, handle_tlbi_el2),
+ SYS_INSN(TLBI_VMALLS12E1ISNXS, handle_vmalls12e1is),
+ SYS_INSN(TLBI_IPAS2E1OSNXS, handle_ipas2e1is),
+ SYS_INSN(TLBI_IPAS2E1NXS, handle_ipas2e1is),
+ SYS_INSN(TLBI_RIPAS2E1NXS, handle_ripas2e1is),
+ SYS_INSN(TLBI_RIPAS2E1OSNXS, handle_ripas2e1is),
+ SYS_INSN(TLBI_IPAS2LE1OSNXS, handle_ipas2e1is),
+ SYS_INSN(TLBI_IPAS2LE1NXS, handle_ipas2e1is),
+ SYS_INSN(TLBI_RIPAS2LE1NXS, handle_ripas2e1is),
+ SYS_INSN(TLBI_RIPAS2LE1OSNXS, handle_ripas2e1is),
+ SYS_INSN(TLBI_RVAE2OSNXS, handle_tlbi_el2),
+ SYS_INSN(TLBI_RVALE2OSNXS, handle_tlbi_el2),
+ SYS_INSN(TLBI_RVAE2NXS, handle_tlbi_el2),
+ SYS_INSN(TLBI_RVALE2NXS, handle_tlbi_el2),
+ SYS_INSN(TLBI_ALLE2NXS, handle_tlbi_el2),
+ SYS_INSN(TLBI_VAE2NXS, handle_tlbi_el2),
+ SYS_INSN(TLBI_ALLE1NXS, handle_alle1is),
+ SYS_INSN(TLBI_VALE2NXS, handle_tlbi_el2),
+ SYS_INSN(TLBI_VMALLS12E1NXS, handle_vmalls12e1is),
};
static bool trap_dbgdidr(struct kvm_vcpu *vcpu,
@@ -1925,14 +4285,14 @@ static bool trap_dbgdidr(struct kvm_vcpu *vcpu,
if (p->is_write) {
return ignore_write(vcpu, p);
} else {
- u64 dfr = read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1);
- u64 pfr = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1);
- u32 el3 = !!cpuid_feature_extract_unsigned_field(pfr, ID_AA64PFR0_EL1_EL3_SHIFT);
-
- p->regval = ((((dfr >> ID_AA64DFR0_EL1_WRPs_SHIFT) & 0xf) << 28) |
- (((dfr >> ID_AA64DFR0_EL1_BRPs_SHIFT) & 0xf) << 24) |
- (((dfr >> ID_AA64DFR0_EL1_CTX_CMPs_SHIFT) & 0xf) << 20)
- | (6 << 16) | (1 << 15) | (el3 << 14) | (el3 << 12));
+ u64 dfr = kvm_read_vm_id_reg(vcpu->kvm, SYS_ID_AA64DFR0_EL1);
+ u32 el3 = kvm_has_feat(vcpu->kvm, ID_AA64PFR0_EL1, EL3, IMP);
+
+ p->regval = ((SYS_FIELD_GET(ID_AA64DFR0_EL1, WRPs, dfr) << 28) |
+ (SYS_FIELD_GET(ID_AA64DFR0_EL1, BRPs, dfr) << 24) |
+ (SYS_FIELD_GET(ID_AA64DFR0_EL1, CTX_CMPs, dfr) << 20) |
+ (SYS_FIELD_GET(ID_AA64DFR0_EL1, DebugVer, dfr) << 16) |
+ (1 << 15) | (el3 << 14) | (el3 << 12));
return true;
}
}
@@ -1946,18 +4306,20 @@ static bool trap_dbgdidr(struct kvm_vcpu *vcpu,
* None of the other registers share their location, so treat them as
* if they were 64bit.
*/
-#define DBG_BCR_BVR_WCR_WVR(n) \
- /* DBGBVRn */ \
- { AA32(LO), Op1( 0), CRn( 0), CRm((n)), Op2( 4), trap_bvr, NULL, n }, \
- /* DBGBCRn */ \
- { Op1( 0), CRn( 0), CRm((n)), Op2( 5), trap_bcr, NULL, n }, \
- /* DBGWVRn */ \
- { Op1( 0), CRn( 0), CRm((n)), Op2( 6), trap_wvr, NULL, n }, \
- /* DBGWCRn */ \
- { Op1( 0), CRn( 0), CRm((n)), Op2( 7), trap_wcr, NULL, n }
-
-#define DBGBXVR(n) \
- { AA32(HI), Op1( 0), CRn( 1), CRm((n)), Op2( 1), trap_bvr, NULL, n }
+#define DBG_BCR_BVR_WCR_WVR(n) \
+ /* DBGBVRn */ \
+ { AA32(LO), Op1( 0), CRn( 0), CRm((n)), Op2( 4), \
+ trap_dbg_wb_reg, NULL, n }, \
+ /* DBGBCRn */ \
+ { Op1( 0), CRn( 0), CRm((n)), Op2( 5), trap_dbg_wb_reg, NULL, n }, \
+ /* DBGWVRn */ \
+ { Op1( 0), CRn( 0), CRm((n)), Op2( 6), trap_dbg_wb_reg, NULL, n }, \
+ /* DBGWCRn */ \
+ { Op1( 0), CRn( 0), CRm((n)), Op2( 7), trap_dbg_wb_reg, NULL, n }
+
+#define DBGBXVR(n) \
+ { AA32(HI), Op1( 0), CRn( 1), CRm((n)), Op2( 1), \
+ trap_dbg_wb_reg, NULL, n }
/*
* Trapped cp14 registers. We generally ignore most of the external
@@ -2092,6 +4454,7 @@ static const struct sys_reg_desc cp15_regs[] = {
/* TTBCR2 */
{ AA32(HI), Op1( 0), CRn( 2), CRm( 0), Op2( 3), access_vm_reg, NULL, TCR_EL1 },
{ Op1( 0), CRn( 3), CRm( 0), Op2( 0), access_vm_reg, NULL, DACR32_EL2 },
+ { CP15_SYS_DESC(SYS_ICC_PMR_EL1), undef_access },
/* DFSR */
{ Op1( 0), CRn( 5), CRm( 0), Op2( 0), access_vm_reg, NULL, ESR_EL1 },
{ Op1( 0), CRn( 5), CRm( 0), Op2( 1), access_vm_reg, NULL, IFSR32_EL2 },
@@ -2141,8 +4504,28 @@ static const struct sys_reg_desc cp15_regs[] = {
/* AMAIR1 */
{ AA32(HI), Op1( 0), CRn(10), CRm( 3), Op2( 1), access_vm_reg, NULL, AMAIR_EL1 },
- /* ICC_SRE */
- { Op1( 0), CRn(12), CRm(12), Op2( 5), access_gic_sre },
+ { CP15_SYS_DESC(SYS_ICC_IAR0_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_EOIR0_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_HPPIR0_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_BPR0_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_AP0R0_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_AP0R1_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_AP0R2_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_AP0R3_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_AP1R0_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_AP1R1_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_AP1R2_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_AP1R3_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_DIR_EL1), access_gic_dir },
+ { CP15_SYS_DESC(SYS_ICC_RPR_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_IAR1_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_EOIR1_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_HPPIR1_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_BPR1_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_CTLR_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_SRE_EL1), access_gic_sre },
+ { CP15_SYS_DESC(SYS_ICC_IGRPEN0_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_IGRPEN1_EL1), undef_access },
{ Op1( 0), CRn(13), CRm( 0), Op2( 1), access_vm_reg, NULL, CONTEXTIDR_EL1 },
@@ -2219,6 +4602,10 @@ static const struct sys_reg_desc cp15_regs[] = {
{ Op1(1), CRn( 0), CRm( 0), Op2(0), access_ccsidr },
{ Op1(1), CRn( 0), CRm( 0), Op2(1), access_clidr },
+
+ /* CCSIDR2 */
+ { Op1(1), CRn( 0), CRm( 0), Op2(2), undef_access },
+
{ Op1(2), CRn( 0), CRm( 0), Op2(0), access_csselr, NULL, CSSELR_EL1 },
};
@@ -2226,25 +4613,31 @@ static const struct sys_reg_desc cp15_64_regs[] = {
{ Op1( 0), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, TTBR0_EL1 },
{ CP15_PMU_SYS_REG(DIRECT, 0, 0, 9, 0), .access = access_pmu_evcntr },
{ Op1( 0), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_SGI1R */
+ { SYS_DESC(SYS_AARCH32_CNTPCT), access_arch_timer },
{ Op1( 1), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, TTBR1_EL1 },
{ Op1( 1), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_ASGI1R */
+ { SYS_DESC(SYS_AARCH32_CNTVCT), access_arch_timer },
{ Op1( 2), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_SGI0R */
{ SYS_DESC(SYS_AARCH32_CNTP_CVAL), access_arch_timer },
+ { SYS_DESC(SYS_AARCH32_CNTPCTSS), access_arch_timer },
+ { SYS_DESC(SYS_AARCH32_CNTVCTSS), access_arch_timer },
};
static bool check_sysreg_table(const struct sys_reg_desc *table, unsigned int n,
- bool is_32)
+ bool reset_check)
{
unsigned int i;
for (i = 0; i < n; i++) {
- if (!is_32 && table[i].reg && !table[i].reset) {
- kvm_err("sys_reg table %pS entry %d lacks reset\n", &table[i], i);
+ if (reset_check && table[i].reg && !table[i].reset) {
+ kvm_err("sys_reg table %pS entry %d (%s) lacks reset\n",
+ &table[i], i, table[i].name);
return false;
}
if (i && cmp_sys_reg(&table[i-1], &table[i]) >= 0) {
- kvm_err("sys_reg table %pS entry %d out of order\n", &table[i - 1], i - 1);
+ kvm_err("sys_reg table %pS entry %d (%s -> %s) out of order\n",
+ &table[i], i, table[i - 1].name, table[i].name);
return false;
}
}
@@ -2341,7 +4734,8 @@ static void unhandled_cp_access(struct kvm_vcpu *vcpu,
/**
* kvm_handle_cp_64 -- handles a mrrc/mcrr trap on a guest CP14/CP15 access
* @vcpu: The VCPU pointer
- * @run: The kvm_run struct
+ * @global: &struct sys_reg_desc
+ * @nr_global: size of the @global array
*/
static int kvm_handle_cp_64(struct kvm_vcpu *vcpu,
const struct sys_reg_desc *global,
@@ -2430,7 +4824,7 @@ static bool kvm_esr_cp10_id_to_sys64(u64 esr, struct sys_reg_params *params)
return true;
kvm_pr_unimpl("Unhandled cp10 register %s: %u\n",
- params->is_write ? "write" : "read", reg_id);
+ str_write_read(params->is_write), reg_id);
return false;
}
@@ -2508,7 +4902,9 @@ static int kvm_emulate_cp15_id_reg(struct kvm_vcpu *vcpu,
/**
* kvm_handle_cp_32 -- handles a mrc/mcr trap on a guest CP14/CP15 access
* @vcpu: The VCPU pointer
- * @run: The kvm_run struct
+ * @params: &struct sys_reg_params
+ * @global: &struct sys_reg_desc
+ * @nr_global: size of the @global array
*/
static int kvm_handle_cp_32(struct kvm_vcpu *vcpu,
struct sys_reg_params *params,
@@ -2544,9 +4940,13 @@ int kvm_handle_cp15_32(struct kvm_vcpu *vcpu)
* Certain AArch32 ID registers are handled by rerouting to the AArch64
* system register table. Registers in the ID range where CRm=0 are
* excluded from this scheme as they do not trivially map into AArch64
- * system register encodings.
+ * system register encodings, except for AIDR/REVIDR.
*/
- if (params.Op1 == 0 && params.CRn == 0 && params.CRm)
+ if (params.Op1 == 0 && params.CRn == 0 &&
+ (params.CRm || params.Op2 == 6 /* REVIDR */))
+ return kvm_emulate_cp15_id_reg(vcpu, &params);
+ if (params.Op1 == 1 && params.CRn == 0 &&
+ params.CRm == 0 && params.Op2 == 7 /* AIDR */)
return kvm_emulate_cp15_id_reg(vcpu, &params);
return kvm_handle_cp_32(vcpu, &params, cp15_regs, ARRAY_SIZE(cp15_regs));
@@ -2566,12 +4966,6 @@ int kvm_handle_cp14_32(struct kvm_vcpu *vcpu)
return kvm_handle_cp_32(vcpu, &params, cp14_regs, ARRAY_SIZE(cp14_regs));
}
-static bool is_imp_def_sys_reg(struct sys_reg_params *params)
-{
- // See ARM DDI 0487E.a, section D12.3.2
- return params->Op0 == 3 && (params->CRn & 0b1011) == 0b1011;
-}
-
/**
* emulate_sys_reg - Emulate a guest access to an AArch64 system register
* @vcpu: The VCPU pointer
@@ -2580,26 +4974,145 @@ static bool is_imp_def_sys_reg(struct sys_reg_params *params)
* Return: true if the system register access was successful, false otherwise.
*/
static bool emulate_sys_reg(struct kvm_vcpu *vcpu,
- struct sys_reg_params *params)
+ struct sys_reg_params *params)
{
const struct sys_reg_desc *r;
r = find_reg(params, sys_reg_descs, ARRAY_SIZE(sys_reg_descs));
-
if (likely(r)) {
perform_access(vcpu, params, r);
return true;
}
- if (is_imp_def_sys_reg(params)) {
- kvm_inject_undefined(vcpu);
+ print_sys_reg_msg(params,
+ "Unsupported guest sys_reg access at: %lx [%08lx]\n",
+ *vcpu_pc(vcpu), *vcpu_cpsr(vcpu));
+ kvm_inject_undefined(vcpu);
+
+ return false;
+}
+
+static const struct sys_reg_desc *idregs_debug_find(struct kvm *kvm, u8 pos)
+{
+ unsigned long i, idreg_idx = 0;
+
+ for (i = 0; i < ARRAY_SIZE(sys_reg_descs); i++) {
+ const struct sys_reg_desc *r = &sys_reg_descs[i];
+
+ if (!is_vm_ftr_id_reg(reg_to_encoding(r)))
+ continue;
+
+ if (idreg_idx == pos)
+ return r;
+
+ idreg_idx++;
+ }
+
+ return NULL;
+}
+
+static void *idregs_debug_start(struct seq_file *s, loff_t *pos)
+{
+ struct kvm *kvm = s->private;
+ u8 *iter;
+
+ mutex_lock(&kvm->arch.config_lock);
+
+ iter = &kvm->arch.idreg_debugfs_iter;
+ if (test_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags) &&
+ *iter == (u8)~0) {
+ *iter = *pos;
+ if (!idregs_debug_find(kvm, *iter))
+ iter = NULL;
} else {
- print_sys_reg_msg(params,
- "Unsupported guest sys_reg access at: %lx [%08lx]\n",
- *vcpu_pc(vcpu), *vcpu_cpsr(vcpu));
- kvm_inject_undefined(vcpu);
+ iter = ERR_PTR(-EBUSY);
}
- return false;
+
+ mutex_unlock(&kvm->arch.config_lock);
+
+ return iter;
+}
+
+static void *idregs_debug_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ struct kvm *kvm = s->private;
+
+ (*pos)++;
+
+ if (idregs_debug_find(kvm, kvm->arch.idreg_debugfs_iter + 1)) {
+ kvm->arch.idreg_debugfs_iter++;
+
+ return &kvm->arch.idreg_debugfs_iter;
+ }
+
+ return NULL;
+}
+
+static void idregs_debug_stop(struct seq_file *s, void *v)
+{
+ struct kvm *kvm = s->private;
+
+ if (IS_ERR(v))
+ return;
+
+ mutex_lock(&kvm->arch.config_lock);
+
+ kvm->arch.idreg_debugfs_iter = ~0;
+
+ mutex_unlock(&kvm->arch.config_lock);
+}
+
+static int idregs_debug_show(struct seq_file *s, void *v)
+{
+ const struct sys_reg_desc *desc;
+ struct kvm *kvm = s->private;
+
+ desc = idregs_debug_find(kvm, kvm->arch.idreg_debugfs_iter);
+
+ if (!desc->name)
+ return 0;
+
+ seq_printf(s, "%20s:\t%016llx\n",
+ desc->name, kvm_read_vm_id_reg(kvm, reg_to_encoding(desc)));
+
+ return 0;
+}
+
+static const struct seq_operations idregs_debug_sops = {
+ .start = idregs_debug_start,
+ .next = idregs_debug_next,
+ .stop = idregs_debug_stop,
+ .show = idregs_debug_show,
+};
+
+DEFINE_SEQ_ATTRIBUTE(idregs_debug);
+
+void kvm_sys_regs_create_debugfs(struct kvm *kvm)
+{
+ kvm->arch.idreg_debugfs_iter = ~0;
+
+ debugfs_create_file("idregs", 0444, kvm->debugfs_dentry, kvm,
+ &idregs_debug_fops);
+}
+
+static void reset_vm_ftr_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *reg)
+{
+ u32 id = reg_to_encoding(reg);
+ struct kvm *kvm = vcpu->kvm;
+
+ if (test_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags))
+ return;
+
+ kvm_set_vm_id_reg(kvm, id, reg->reset(vcpu, reg));
+}
+
+static void reset_vcpu_ftr_id_reg(struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *reg)
+{
+ if (kvm_vcpu_initialized(vcpu))
+ return;
+
+ reg->reset(vcpu, reg);
}
/**
@@ -2611,33 +5124,66 @@ static bool emulate_sys_reg(struct kvm_vcpu *vcpu,
*/
void kvm_reset_sys_regs(struct kvm_vcpu *vcpu)
{
+ struct kvm *kvm = vcpu->kvm;
unsigned long i;
- for (i = 0; i < ARRAY_SIZE(sys_reg_descs); i++)
- if (sys_reg_descs[i].reset)
- sys_reg_descs[i].reset(vcpu, &sys_reg_descs[i]);
+ for (i = 0; i < ARRAY_SIZE(sys_reg_descs); i++) {
+ const struct sys_reg_desc *r = &sys_reg_descs[i];
+
+ if (!r->reset)
+ continue;
+
+ if (is_vm_ftr_id_reg(reg_to_encoding(r)))
+ reset_vm_ftr_id_reg(vcpu, r);
+ else if (is_vcpu_ftr_id_reg(reg_to_encoding(r)))
+ reset_vcpu_ftr_id_reg(vcpu, r);
+ else
+ r->reset(vcpu, r);
+
+ if (r->reg >= __SANITISED_REG_START__ && r->reg < NR_SYS_REGS)
+ __vcpu_rmw_sys_reg(vcpu, r->reg, |=, 0);
+ }
+
+ set_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags);
+
+ if (kvm_vcpu_has_pmu(vcpu))
+ kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu);
}
/**
- * kvm_handle_sys_reg -- handles a mrs/msr trap on a guest sys_reg access
+ * kvm_handle_sys_reg -- handles a system instruction or mrs/msr instruction
+ * trap on a guest execution
* @vcpu: The VCPU pointer
*/
int kvm_handle_sys_reg(struct kvm_vcpu *vcpu)
{
+ const struct sys_reg_desc *desc = NULL;
struct sys_reg_params params;
unsigned long esr = kvm_vcpu_get_esr(vcpu);
int Rt = kvm_vcpu_sys_get_rt(vcpu);
+ int sr_idx;
trace_kvm_handle_sys_reg(esr);
+ if (triage_sysreg_trap(vcpu, &sr_idx))
+ return 1;
+
params = esr_sys64_to_params(esr);
params.regval = vcpu_get_reg(vcpu, Rt);
- if (!emulate_sys_reg(vcpu, &params))
- return 1;
+ /* System registers have Op0=={2,3}, as per DDI487 J.a C5.1.2 */
+ if (params.Op0 == 2 || params.Op0 == 3)
+ desc = &sys_reg_descs[sr_idx];
+ else
+ desc = &sys_insn_descs[sr_idx];
+
+ perform_access(vcpu, &params, desc);
- if (!params.is_write)
+ /* Read from system register? */
+ if (!params.is_write &&
+ (params.Op0 == 2 || params.Op0 == 3))
vcpu_set_reg(vcpu, Rt, params.regval);
+
return 1;
}
@@ -2707,99 +5253,7 @@ id_to_sys_reg_desc(struct kvm_vcpu *vcpu, u64 id,
return r;
}
-/*
- * These are the invariant sys_reg registers: we let the guest see the
- * host versions of these, so they're part of the guest state.
- *
- * A future CPU may provide a mechanism to present different values to
- * the guest, or a future kvm may trap them.
- */
-
-#define FUNCTION_INVARIANT(reg) \
- static void get_##reg(struct kvm_vcpu *v, \
- const struct sys_reg_desc *r) \
- { \
- ((struct sys_reg_desc *)r)->val = read_sysreg(reg); \
- }
-
-FUNCTION_INVARIANT(midr_el1)
-FUNCTION_INVARIANT(revidr_el1)
-FUNCTION_INVARIANT(clidr_el1)
-FUNCTION_INVARIANT(aidr_el1)
-
-static void get_ctr_el0(struct kvm_vcpu *v, const struct sys_reg_desc *r)
-{
- ((struct sys_reg_desc *)r)->val = read_sanitised_ftr_reg(SYS_CTR_EL0);
-}
-
-/* ->val is filled in by kvm_sys_reg_table_init() */
-static struct sys_reg_desc invariant_sys_regs[] = {
- { SYS_DESC(SYS_MIDR_EL1), NULL, get_midr_el1 },
- { SYS_DESC(SYS_REVIDR_EL1), NULL, get_revidr_el1 },
- { SYS_DESC(SYS_CLIDR_EL1), NULL, get_clidr_el1 },
- { SYS_DESC(SYS_AIDR_EL1), NULL, get_aidr_el1 },
- { SYS_DESC(SYS_CTR_EL0), NULL, get_ctr_el0 },
-};
-
-static int get_invariant_sys_reg(u64 id, u64 __user *uaddr)
-{
- const struct sys_reg_desc *r;
-
- r = get_reg_by_id(id, invariant_sys_regs,
- ARRAY_SIZE(invariant_sys_regs));
- if (!r)
- return -ENOENT;
-
- return put_user(r->val, uaddr);
-}
-
-static int set_invariant_sys_reg(u64 id, u64 __user *uaddr)
-{
- const struct sys_reg_desc *r;
- u64 val;
-
- r = get_reg_by_id(id, invariant_sys_regs,
- ARRAY_SIZE(invariant_sys_regs));
- if (!r)
- return -ENOENT;
-
- if (get_user(val, uaddr))
- return -EFAULT;
-
- /* This is what we mean by invariant: you can't change it. */
- if (r->val != val)
- return -EINVAL;
-
- return 0;
-}
-
-static bool is_valid_cache(u32 val)
-{
- u32 level, ctype;
-
- if (val >= CSSELR_MAX)
- return false;
-
- /* Bottom bit is Instruction or Data bit. Next 3 bits are level. */
- level = (val >> 1);
- ctype = (cache_levels >> (level * 3)) & 7;
-
- switch (ctype) {
- case 0: /* No cache */
- return false;
- case 1: /* Instruction cache only */
- return (val & 1);
- case 2: /* Data cache only */
- case 4: /* Unified cache */
- return !(val & 1);
- case 3: /* Separate instruction and data caches */
- return true;
- default: /* Reserved: we can't know instruction or data. */
- return false;
- }
-}
-
-static int demux_c15_get(u64 id, void __user *uaddr)
+static int demux_c15_get(struct kvm_vcpu *vcpu, u64 id, void __user *uaddr)
{
u32 val;
u32 __user *uval = uaddr;
@@ -2815,16 +5269,16 @@ static int demux_c15_get(u64 id, void __user *uaddr)
return -ENOENT;
val = (id & KVM_REG_ARM_DEMUX_VAL_MASK)
>> KVM_REG_ARM_DEMUX_VAL_SHIFT;
- if (!is_valid_cache(val))
+ if (val >= CSSELR_MAX)
return -ENOENT;
- return put_user(get_ccsidr(val), uval);
+ return put_user(get_ccsidr(vcpu, val), uval);
default:
return -ENOENT;
}
}
-static int demux_c15_set(u64 id, void __user *uaddr)
+static int demux_c15_set(struct kvm_vcpu *vcpu, u64 id, void __user *uaddr)
{
u32 val, newval;
u32 __user *uval = uaddr;
@@ -2840,31 +5294,41 @@ static int demux_c15_set(u64 id, void __user *uaddr)
return -ENOENT;
val = (id & KVM_REG_ARM_DEMUX_VAL_MASK)
>> KVM_REG_ARM_DEMUX_VAL_SHIFT;
- if (!is_valid_cache(val))
+ if (val >= CSSELR_MAX)
return -ENOENT;
if (get_user(newval, uval))
return -EFAULT;
- /* This is also invariant: you can't change it. */
- if (newval != get_ccsidr(val))
- return -EINVAL;
- return 0;
+ return set_ccsidr(vcpu, val, newval);
default:
return -ENOENT;
}
}
+static u64 kvm_one_reg_to_id(const struct kvm_one_reg *reg)
+{
+ switch(reg->id) {
+ case KVM_REG_ARM_TIMER_CVAL:
+ return TO_ARM64_SYS_REG(CNTV_CVAL_EL0);
+ case KVM_REG_ARM_TIMER_CNT:
+ return TO_ARM64_SYS_REG(CNTVCT_EL0);
+ default:
+ return reg->id;
+ }
+}
+
int kvm_sys_reg_get_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg,
const struct sys_reg_desc table[], unsigned int num)
{
u64 __user *uaddr = (u64 __user *)(unsigned long)reg->addr;
const struct sys_reg_desc *r;
+ u64 id = kvm_one_reg_to_id(reg);
u64 val;
int ret;
- r = id_to_sys_reg_desc(vcpu, reg->id, table, num);
- if (!r)
+ r = id_to_sys_reg_desc(vcpu, id, table, num);
+ if (!r || sysreg_hidden(vcpu, r))
return -ENOENT;
if (r->get_user) {
@@ -2883,14 +5347,9 @@ int kvm_sys_reg_get_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg,
int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
{
void __user *uaddr = (void __user *)(unsigned long)reg->addr;
- int err;
if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_DEMUX)
- return demux_c15_get(reg->id, uaddr);
-
- err = get_invariant_sys_reg(reg->id, uaddr);
- if (err != -ENOENT)
- return err;
+ return demux_c15_get(vcpu, reg->id, uaddr);
return kvm_sys_reg_get_user(vcpu, reg,
sys_reg_descs, ARRAY_SIZE(sys_reg_descs));
@@ -2901,14 +5360,15 @@ int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg,
{
u64 __user *uaddr = (u64 __user *)(unsigned long)reg->addr;
const struct sys_reg_desc *r;
+ u64 id = kvm_one_reg_to_id(reg);
u64 val;
int ret;
if (get_user(val, uaddr))
return -EFAULT;
- r = id_to_sys_reg_desc(vcpu, reg->id, table, num);
- if (!r)
+ r = id_to_sys_reg_desc(vcpu, id, table, num);
+ if (!r || sysreg_hidden(vcpu, r))
return -ENOENT;
if (sysreg_user_write_ignore(vcpu, r))
@@ -2917,7 +5377,7 @@ int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg,
if (r->set_user) {
ret = (r->set_user)(vcpu, r, val);
} else {
- __vcpu_sys_reg(vcpu, r->reg) = val;
+ __vcpu_assign_sys_reg(vcpu, r->reg, val);
ret = 0;
}
@@ -2927,14 +5387,9 @@ int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg,
int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
{
void __user *uaddr = (void __user *)(unsigned long)reg->addr;
- int err;
if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_DEMUX)
- return demux_c15_set(reg->id, uaddr);
-
- err = set_invariant_sys_reg(reg->id, uaddr);
- if (err != -ENOENT)
- return err;
+ return demux_c15_set(vcpu, reg->id, uaddr);
return kvm_sys_reg_set_user(vcpu, reg,
sys_reg_descs, ARRAY_SIZE(sys_reg_descs));
@@ -2942,13 +5397,7 @@ int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg
static unsigned int num_demux_regs(void)
{
- unsigned int i, count = 0;
-
- for (i = 0; i < CSSELR_MAX; i++)
- if (is_valid_cache(i))
- count++;
-
- return count;
+ return CSSELR_MAX;
}
static int write_demux_regids(u64 __user *uindices)
@@ -2958,8 +5407,6 @@ static int write_demux_regids(u64 __user *uindices)
val |= KVM_REG_ARM_DEMUX_ID_CCSIDR;
for (i = 0; i < CSSELR_MAX; i++) {
- if (!is_valid_cache(i))
- continue;
if (put_user(val | i, uindices))
return -EFAULT;
uindices++;
@@ -2980,10 +5427,23 @@ static u64 sys_reg_to_index(const struct sys_reg_desc *reg)
static bool copy_reg_to_user(const struct sys_reg_desc *reg, u64 __user **uind)
{
+ u64 idx;
+
if (!*uind)
return true;
- if (put_user(sys_reg_to_index(reg), *uind))
+ switch (reg_to_encoding(reg)) {
+ case SYS_CNTV_CVAL_EL0:
+ idx = KVM_REG_ARM_TIMER_CVAL;
+ break;
+ case SYS_CNTVCT_EL0:
+ idx = KVM_REG_ARM_TIMER_CNT;
+ break;
+ default:
+ idx = sys_reg_to_index(reg);
+ }
+
+ if (put_user(idx, *uind))
return false;
(*uind)++;
@@ -3032,23 +5492,14 @@ static int walk_sys_regs(struct kvm_vcpu *vcpu, u64 __user *uind)
unsigned long kvm_arm_num_sys_reg_descs(struct kvm_vcpu *vcpu)
{
- return ARRAY_SIZE(invariant_sys_regs)
- + num_demux_regs()
+ return num_demux_regs()
+ walk_sys_regs(vcpu, (u64 __user *)NULL);
}
int kvm_arm_copy_sys_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices)
{
- unsigned int i;
int err;
- /* Then give them all the invariant registers' indices. */
- for (i = 0; i < ARRAY_SIZE(invariant_sys_regs); i++) {
- if (put_user(sys_reg_to_index(&invariant_sys_regs[i]), uindices))
- return -EFAULT;
- uindices++;
- }
-
err = walk_sys_regs(vcpu, uindices);
if (err < 0)
return err;
@@ -3057,44 +5508,179 @@ int kvm_arm_copy_sys_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices)
return write_demux_regids(uindices);
}
-int kvm_sys_reg_table_init(void)
+#define KVM_ARM_FEATURE_ID_RANGE_INDEX(r) \
+ KVM_ARM_FEATURE_ID_RANGE_IDX(sys_reg_Op0(r), \
+ sys_reg_Op1(r), \
+ sys_reg_CRn(r), \
+ sys_reg_CRm(r), \
+ sys_reg_Op2(r))
+
+int kvm_vm_ioctl_get_reg_writable_masks(struct kvm *kvm, struct reg_mask_range *range)
{
+ const void *zero_page = page_to_virt(ZERO_PAGE(0));
+ u64 __user *masks = (u64 __user *)range->addr;
+
+ /* Only feature id range is supported, reserved[13] must be zero. */
+ if (range->range ||
+ memcmp(range->reserved, zero_page, sizeof(range->reserved)))
+ return -EINVAL;
+
+ /* Wipe the whole thing first */
+ if (clear_user(masks, KVM_ARM_FEATURE_ID_RANGE_SIZE * sizeof(__u64)))
+ return -EFAULT;
+
+ for (int i = 0; i < ARRAY_SIZE(sys_reg_descs); i++) {
+ const struct sys_reg_desc *reg = &sys_reg_descs[i];
+ u32 encoding = reg_to_encoding(reg);
+ u64 val;
+
+ if (!is_feature_id_reg(encoding) || !reg->set_user)
+ continue;
+
+ if (!reg->val ||
+ (is_aa32_id_reg(encoding) && !kvm_supports_32bit_el0())) {
+ continue;
+ }
+ val = reg->val;
+
+ if (put_user(val, (masks + KVM_ARM_FEATURE_ID_RANGE_INDEX(encoding))))
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+static void vcpu_set_hcr(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+
+ if (has_vhe() || has_hvhe())
+ vcpu->arch.hcr_el2 |= HCR_E2H;
+ if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) {
+ /* route synchronous external abort exceptions to EL2 */
+ vcpu->arch.hcr_el2 |= HCR_TEA;
+ /* trap error record accesses */
+ vcpu->arch.hcr_el2 |= HCR_TERR;
+ }
+
+ if (cpus_have_final_cap(ARM64_HAS_STAGE2_FWB))
+ vcpu->arch.hcr_el2 |= HCR_FWB;
+
+ if (cpus_have_final_cap(ARM64_HAS_EVT) &&
+ !cpus_have_final_cap(ARM64_MISMATCHED_CACHE_TYPE) &&
+ kvm_read_vm_id_reg(kvm, SYS_CTR_EL0) == read_sanitised_ftr_reg(SYS_CTR_EL0))
+ vcpu->arch.hcr_el2 |= HCR_TID4;
+ else
+ vcpu->arch.hcr_el2 |= HCR_TID2;
+
+ if (vcpu_el1_is_32bit(vcpu))
+ vcpu->arch.hcr_el2 &= ~HCR_RW;
+
+ if (kvm_has_mte(vcpu->kvm))
+ vcpu->arch.hcr_el2 |= HCR_ATA;
+
+ /*
+ * In the absence of FGT, we cannot independently trap TLBI
+ * Range instructions. This isn't great, but trapping all
+ * TLBIs would be far worse. Live with it...
+ */
+ if (!kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, OS))
+ vcpu->arch.hcr_el2 |= HCR_TTLBOS;
+}
+
+void kvm_calculate_traps(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+
+ mutex_lock(&kvm->arch.config_lock);
+ vcpu_set_hcr(vcpu);
+ vcpu_set_ich_hcr(vcpu);
+ vcpu_set_hcrx(vcpu);
+
+ if (test_bit(KVM_ARCH_FLAG_FGU_INITIALIZED, &kvm->arch.flags))
+ goto out;
+
+ compute_fgu(kvm, HFGRTR_GROUP);
+ compute_fgu(kvm, HFGITR_GROUP);
+ compute_fgu(kvm, HDFGRTR_GROUP);
+ compute_fgu(kvm, HAFGRTR_GROUP);
+ compute_fgu(kvm, HFGRTR2_GROUP);
+ compute_fgu(kvm, HFGITR2_GROUP);
+ compute_fgu(kvm, HDFGRTR2_GROUP);
+
+ set_bit(KVM_ARCH_FLAG_FGU_INITIALIZED, &kvm->arch.flags);
+out:
+ mutex_unlock(&kvm->arch.config_lock);
+}
+
+/*
+ * Perform last adjustments to the ID registers that are implied by the
+ * configuration outside of the ID regs themselves, as well as any
+ * initialisation that directly depend on these ID registers (such as
+ * RES0/RES1 behaviours). This is not the place to configure traps though.
+ *
+ * Because this can be called once per CPU, changes must be idempotent.
+ */
+int kvm_finalize_sys_regs(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+
+ guard(mutex)(&kvm->arch.config_lock);
+
+ /*
+ * This hacks into the ID registers, so only perform it when the
+ * first vcpu runs, or the kvm_set_vm_id_reg() helper will scream.
+ */
+ if (!irqchip_in_kernel(kvm) && !kvm_vm_has_ran_once(kvm)) {
+ u64 val;
+
+ val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1) & ~ID_AA64PFR0_EL1_GIC;
+ kvm_set_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1, val);
+ val = kvm_read_vm_id_reg(kvm, SYS_ID_PFR1_EL1) & ~ID_PFR1_EL1_GIC;
+ kvm_set_vm_id_reg(kvm, SYS_ID_PFR1_EL1, val);
+ }
+
+ if (vcpu_has_nv(vcpu)) {
+ int ret = kvm_init_nv_sysregs(vcpu);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+int __init kvm_sys_reg_table_init(void)
+{
+ const struct sys_reg_desc *gicv3_regs;
bool valid = true;
- unsigned int i;
- struct sys_reg_desc clidr;
+ unsigned int i, sz;
+ int ret = 0;
/* Make sure tables are unique and in order. */
- valid &= check_sysreg_table(sys_reg_descs, ARRAY_SIZE(sys_reg_descs), false);
- valid &= check_sysreg_table(cp14_regs, ARRAY_SIZE(cp14_regs), true);
- valid &= check_sysreg_table(cp14_64_regs, ARRAY_SIZE(cp14_64_regs), true);
- valid &= check_sysreg_table(cp15_regs, ARRAY_SIZE(cp15_regs), true);
- valid &= check_sysreg_table(cp15_64_regs, ARRAY_SIZE(cp15_64_regs), true);
- valid &= check_sysreg_table(invariant_sys_regs, ARRAY_SIZE(invariant_sys_regs), false);
+ valid &= check_sysreg_table(sys_reg_descs, ARRAY_SIZE(sys_reg_descs), true);
+ valid &= check_sysreg_table(cp14_regs, ARRAY_SIZE(cp14_regs), false);
+ valid &= check_sysreg_table(cp14_64_regs, ARRAY_SIZE(cp14_64_regs), false);
+ valid &= check_sysreg_table(cp15_regs, ARRAY_SIZE(cp15_regs), false);
+ valid &= check_sysreg_table(cp15_64_regs, ARRAY_SIZE(cp15_64_regs), false);
+ valid &= check_sysreg_table(sys_insn_descs, ARRAY_SIZE(sys_insn_descs), false);
+
+ gicv3_regs = vgic_v3_get_sysreg_table(&sz);
+ valid &= check_sysreg_table(gicv3_regs, sz, false);
if (!valid)
return -EINVAL;
- /* We abuse the reset function to overwrite the table itself. */
- for (i = 0; i < ARRAY_SIZE(invariant_sys_regs); i++)
- invariant_sys_regs[i].reset(NULL, &invariant_sys_regs[i]);
+ init_imp_id_regs();
- /*
- * CLIDR format is awkward, so clean it up. See ARM B4.1.20:
- *
- * If software reads the Cache Type fields from Ctype1
- * upwards, once it has seen a value of 0b000, no caches
- * exist at further-out levels of the hierarchy. So, for
- * example, if Ctype3 is the first Cache Type field with a
- * value of 0b000, the values of Ctype4 to Ctype7 must be
- * ignored.
- */
- get_clidr_el1(NULL, &clidr); /* Ugly... */
- cache_levels = clidr.val;
- for (i = 0; i < 7; i++)
- if (((cache_levels >> (i*3)) & 7) == 0)
- break;
- /* Clear all higher bits. */
- cache_levels &= (1 << (i*3))-1;
+ ret = populate_nv_trap_config();
- return 0;
+ check_feature_map();
+
+ for (i = 0; !ret && i < ARRAY_SIZE(sys_reg_descs); i++)
+ ret = populate_sysreg_config(sys_reg_descs + i, i);
+
+ for (i = 0; !ret && i < ARRAY_SIZE(sys_insn_descs); i++)
+ ret = populate_sysreg_config(sys_insn_descs + i, i);
+
+ return ret;
}
diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h
index e4ebb3a379fd..b3f904472fac 100644
--- a/arch/arm64/kvm/sys_regs.h
+++ b/arch/arm64/kvm/sys_regs.h
@@ -27,6 +27,13 @@ struct sys_reg_params {
bool is_write;
};
+#define encoding_to_params(reg) \
+ ((struct sys_reg_params){ .Op0 = sys_reg_Op0(reg), \
+ .Op1 = sys_reg_Op1(reg), \
+ .CRn = sys_reg_CRn(reg), \
+ .CRm = sys_reg_CRm(reg), \
+ .Op2 = sys_reg_Op2(reg) })
+
#define esr_sys64_to_params(esr) \
((struct sys_reg_params){ .Op0 = ((esr) >> 20) & 3, \
.Op1 = ((esr) >> 14) & 0x7, \
@@ -64,13 +71,16 @@ struct sys_reg_desc {
struct sys_reg_params *,
const struct sys_reg_desc *);
- /* Initialization for vcpu. */
- void (*reset)(struct kvm_vcpu *, const struct sys_reg_desc *);
+ /*
+ * Initialization for vcpu. Return initialized value, or KVM
+ * sanitized value for ID registers.
+ */
+ u64 (*reset)(struct kvm_vcpu *, const struct sys_reg_desc *);
/* Index into sys_reg[], or 0 if we don't need to save it. */
int reg;
- /* Value (usually reset value) */
+ /* Value (usually reset value), or write mask for idregs */
u64 val;
/* Custom get/set_user functions, fallback to generic if NULL */
@@ -98,7 +108,7 @@ inline void print_sys_reg_msg(const struct sys_reg_params *p,
/* Look, we even formatted it for you to paste into the table! */
kvm_pr_unimpl("%pV { Op0(%2u), Op1(%2u), CRn(%2u), CRm(%2u), Op2(%2u), func_%s },\n",
&(struct va_format){ fmt, &va },
- p->Op0, p->Op1, p->CRn, p->CRm, p->Op2, p->is_write ? "write" : "read");
+ p->Op0, p->Op1, p->CRn, p->CRm, p->Op2, str_write_read(p->is_write));
va_end(va);
}
@@ -122,19 +132,21 @@ static inline bool read_zero(struct kvm_vcpu *vcpu,
}
/* Reset functions */
-static inline void reset_unknown(struct kvm_vcpu *vcpu,
+static inline u64 reset_unknown(struct kvm_vcpu *vcpu,
const struct sys_reg_desc *r)
{
BUG_ON(!r->reg);
BUG_ON(r->reg >= NR_SYS_REGS);
- __vcpu_sys_reg(vcpu, r->reg) = 0x1de7ec7edbadc0deULL;
+ __vcpu_assign_sys_reg(vcpu, r->reg, 0x1de7ec7edbadc0deULL);
+ return __vcpu_sys_reg(vcpu, r->reg);
}
-static inline void reset_val(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+static inline u64 reset_val(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
BUG_ON(!r->reg);
BUG_ON(r->reg >= NR_SYS_REGS);
- __vcpu_sys_reg(vcpu, r->reg) = r->val;
+ __vcpu_assign_sys_reg(vcpu, r->reg, r->val);
+ return __vcpu_sys_reg(vcpu, r->reg);
}
static inline unsigned int sysreg_visibility(const struct kvm_vcpu *vcpu,
@@ -211,6 +223,10 @@ int kvm_sys_reg_get_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg,
int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg,
const struct sys_reg_desc table[], unsigned int num);
+bool triage_sysreg_trap(struct kvm_vcpu *vcpu, int *sr_index);
+
+int kvm_finalize_sys_regs(struct kvm_vcpu *vcpu);
+
#define AA32(_x) .aarch32_map = AA32_##_x
#define Op0(_x) .Op0 = _x
#define Op1(_x) .Op1 = _x
@@ -224,4 +240,27 @@ int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg,
CRn(sys_reg_CRn(reg)), CRm(sys_reg_CRm(reg)), \
Op2(sys_reg_Op2(reg))
+#define CP15_SYS_DESC(reg) \
+ .name = #reg, \
+ .aarch32_map = AA32_DIRECT, \
+ Op0(0), Op1(sys_reg_Op1(reg)), \
+ CRn(sys_reg_CRn(reg)), CRm(sys_reg_CRm(reg)), \
+ Op2(sys_reg_Op2(reg))
+
+#define ID_REG_LIMIT_FIELD_ENUM(val, reg, field, limit) \
+({ \
+ u64 __f_val = FIELD_GET(reg##_##field##_MASK, val); \
+ (val) &= ~reg##_##field##_MASK; \
+ (val) |= FIELD_PREP(reg##_##field##_MASK, \
+ min(__f_val, \
+ (u64)SYS_FIELD_VALUE(reg, field, limit))); \
+ (val); \
+})
+
+#define TO_ARM64_SYS_REG(r) ARM64_SYS_REG(sys_reg_Op0(SYS_ ## r), \
+ sys_reg_Op1(SYS_ ## r), \
+ sys_reg_CRn(SYS_ ## r), \
+ sys_reg_CRm(SYS_ ## r), \
+ sys_reg_Op2(SYS_ ## r))
+
#endif /* __ARM64_KVM_SYS_REGS_LOCAL_H__ */
diff --git a/arch/arm64/kvm/trace_arm.h b/arch/arm64/kvm/trace_arm.h
index 33e4e7dd2719..9c60f6465c78 100644
--- a/arch/arm64/kvm/trace_arm.h
+++ b/arch/arm64/kvm/trace_arm.h
@@ -2,6 +2,7 @@
#if !defined(_TRACE_ARM_ARM64_KVM_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_ARM_ARM64_KVM_H
+#include <asm/kvm_emulate.h>
#include <kvm/arm_arch_timer.h>
#include <linux/tracepoint.h>
@@ -135,6 +136,31 @@ TRACE_EVENT(kvm_mmio_emulate,
__entry->vcpu_pc, __entry->instr, __entry->cpsr)
);
+TRACE_EVENT(kvm_mmio_nisv,
+ TP_PROTO(unsigned long vcpu_pc, unsigned long esr,
+ unsigned long far, unsigned long ipa),
+ TP_ARGS(vcpu_pc, esr, far, ipa),
+
+ TP_STRUCT__entry(
+ __field( unsigned long, vcpu_pc )
+ __field( unsigned long, esr )
+ __field( unsigned long, far )
+ __field( unsigned long, ipa )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_pc = vcpu_pc;
+ __entry->esr = esr;
+ __entry->far = far;
+ __entry->ipa = ipa;
+ ),
+
+ TP_printk("ipa %#016lx, esr %#016lx, far %#016lx, pc %#016lx",
+ __entry->ipa, __entry->esr,
+ __entry->far, __entry->vcpu_pc)
+);
+
+
TRACE_EVENT(kvm_set_way_flush,
TP_PROTO(unsigned long vcpu_pc, bool cache),
TP_ARGS(vcpu_pc, cache),
@@ -150,7 +176,7 @@ TRACE_EVENT(kvm_set_way_flush,
),
TP_printk("S/W flush at 0x%016lx (cache %s)",
- __entry->vcpu_pc, __entry->cache ? "on" : "off")
+ __entry->vcpu_pc, str_on_off(__entry->cache))
);
TRACE_EVENT(kvm_toggle_cache,
@@ -170,8 +196,8 @@ TRACE_EVENT(kvm_toggle_cache,
),
TP_printk("VM op at 0x%016lx (cache was %s, now %s)",
- __entry->vcpu_pc, __entry->was ? "on" : "off",
- __entry->now ? "on" : "off")
+ __entry->vcpu_pc, str_on_off(__entry->was),
+ str_on_off(__entry->now))
);
/*
@@ -205,6 +231,7 @@ TRACE_EVENT(kvm_get_timer_map,
__field( unsigned long, vcpu_id )
__field( int, direct_vtimer )
__field( int, direct_ptimer )
+ __field( int, emul_vtimer )
__field( int, emul_ptimer )
),
@@ -213,14 +240,17 @@ TRACE_EVENT(kvm_get_timer_map,
__entry->direct_vtimer = arch_timer_ctx_index(map->direct_vtimer);
__entry->direct_ptimer =
(map->direct_ptimer) ? arch_timer_ctx_index(map->direct_ptimer) : -1;
+ __entry->emul_vtimer =
+ (map->emul_vtimer) ? arch_timer_ctx_index(map->emul_vtimer) : -1;
__entry->emul_ptimer =
(map->emul_ptimer) ? arch_timer_ctx_index(map->emul_ptimer) : -1;
),
- TP_printk("VCPU: %ld, dv: %d, dp: %d, ep: %d",
+ TP_printk("VCPU: %ld, dv: %d, dp: %d, ev: %d, ep: %d",
__entry->vcpu_id,
__entry->direct_vtimer,
__entry->direct_ptimer,
+ __entry->emul_vtimer,
__entry->emul_ptimer)
);
@@ -301,6 +331,90 @@ TRACE_EVENT(kvm_timer_emulate,
__entry->timer_idx, __entry->should_fire)
);
+TRACE_EVENT(kvm_nested_eret,
+ TP_PROTO(struct kvm_vcpu *vcpu, unsigned long elr_el2,
+ unsigned long spsr_el2),
+ TP_ARGS(vcpu, elr_el2, spsr_el2),
+
+ TP_STRUCT__entry(
+ __field(struct kvm_vcpu *, vcpu)
+ __field(unsigned long, elr_el2)
+ __field(unsigned long, spsr_el2)
+ __field(unsigned long, target_mode)
+ __field(unsigned long, hcr_el2)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu = vcpu;
+ __entry->elr_el2 = elr_el2;
+ __entry->spsr_el2 = spsr_el2;
+ __entry->target_mode = spsr_el2 & (PSR_MODE_MASK | PSR_MODE32_BIT);
+ __entry->hcr_el2 = __vcpu_sys_reg(vcpu, HCR_EL2);
+ ),
+
+ TP_printk("elr_el2: 0x%lx spsr_el2: 0x%08lx (M: %s) hcr_el2: %lx",
+ __entry->elr_el2, __entry->spsr_el2,
+ __print_symbolic(__entry->target_mode, kvm_mode_names),
+ __entry->hcr_el2)
+);
+
+TRACE_EVENT(kvm_inject_nested_exception,
+ TP_PROTO(struct kvm_vcpu *vcpu, u64 esr_el2, int type),
+ TP_ARGS(vcpu, esr_el2, type),
+
+ TP_STRUCT__entry(
+ __field(struct kvm_vcpu *, vcpu)
+ __field(unsigned long, esr_el2)
+ __field(int, type)
+ __field(unsigned long, spsr_el2)
+ __field(unsigned long, pc)
+ __field(unsigned long, source_mode)
+ __field(unsigned long, hcr_el2)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu = vcpu;
+ __entry->esr_el2 = esr_el2;
+ __entry->type = type;
+ __entry->spsr_el2 = *vcpu_cpsr(vcpu);
+ __entry->pc = *vcpu_pc(vcpu);
+ __entry->source_mode = *vcpu_cpsr(vcpu) & (PSR_MODE_MASK | PSR_MODE32_BIT);
+ __entry->hcr_el2 = __vcpu_sys_reg(vcpu, HCR_EL2);
+ ),
+
+ TP_printk("%s: esr_el2 0x%lx elr_el2: 0x%lx spsr_el2: 0x%08lx (M: %s) hcr_el2: %lx",
+ __print_symbolic(__entry->type, kvm_exception_type_names),
+ __entry->esr_el2, __entry->pc, __entry->spsr_el2,
+ __print_symbolic(__entry->source_mode, kvm_mode_names),
+ __entry->hcr_el2)
+);
+
+TRACE_EVENT(kvm_forward_sysreg_trap,
+ TP_PROTO(struct kvm_vcpu *vcpu, u32 sysreg, bool is_read),
+ TP_ARGS(vcpu, sysreg, is_read),
+
+ TP_STRUCT__entry(
+ __field(u64, pc)
+ __field(u32, sysreg)
+ __field(bool, is_read)
+ ),
+
+ TP_fast_assign(
+ __entry->pc = *vcpu_pc(vcpu);
+ __entry->sysreg = sysreg;
+ __entry->is_read = is_read;
+ ),
+
+ TP_printk("%llx %c (%d,%d,%d,%d,%d)",
+ __entry->pc,
+ __entry->is_read ? 'R' : 'W',
+ sys_reg_Op0(__entry->sysreg),
+ sys_reg_Op1(__entry->sysreg),
+ sys_reg_CRn(__entry->sysreg),
+ sys_reg_CRm(__entry->sysreg),
+ sys_reg_Op2(__entry->sysreg))
+);
+
#endif /* _TRACE_ARM_ARM64_KVM_H */
#undef TRACE_INCLUDE_PATH
diff --git a/arch/arm64/kvm/trace_handle_exit.h b/arch/arm64/kvm/trace_handle_exit.h
index 064a58c19f48..a7ab9a3bbed0 100644
--- a/arch/arm64/kvm/trace_handle_exit.h
+++ b/arch/arm64/kvm/trace_handle_exit.h
@@ -46,38 +46,6 @@ TRACE_EVENT(kvm_hvc_arm64,
__entry->vcpu_pc, __entry->r0, __entry->imm)
);
-TRACE_EVENT(kvm_arm_setup_debug,
- TP_PROTO(struct kvm_vcpu *vcpu, __u32 guest_debug),
- TP_ARGS(vcpu, guest_debug),
-
- TP_STRUCT__entry(
- __field(struct kvm_vcpu *, vcpu)
- __field(__u32, guest_debug)
- ),
-
- TP_fast_assign(
- __entry->vcpu = vcpu;
- __entry->guest_debug = guest_debug;
- ),
-
- TP_printk("vcpu: %p, flags: 0x%08x", __entry->vcpu, __entry->guest_debug)
-);
-
-TRACE_EVENT(kvm_arm_clear_debug,
- TP_PROTO(__u32 guest_debug),
- TP_ARGS(guest_debug),
-
- TP_STRUCT__entry(
- __field(__u32, guest_debug)
- ),
-
- TP_fast_assign(
- __entry->guest_debug = guest_debug;
- ),
-
- TP_printk("flags: 0x%08x", __entry->guest_debug)
-);
-
/*
* The dreg32 name is a leftover from a distant past. This will really
* output a 64bit value...
@@ -99,49 +67,6 @@ TRACE_EVENT(kvm_arm_set_dreg32,
TP_printk("%s: 0x%llx", __entry->name, __entry->value)
);
-TRACE_DEFINE_SIZEOF(__u64);
-
-TRACE_EVENT(kvm_arm_set_regset,
- TP_PROTO(const char *type, int len, __u64 *control, __u64 *value),
- TP_ARGS(type, len, control, value),
- TP_STRUCT__entry(
- __field(const char *, name)
- __field(int, len)
- __array(u64, ctrls, 16)
- __array(u64, values, 16)
- ),
- TP_fast_assign(
- __entry->name = type;
- __entry->len = len;
- memcpy(__entry->ctrls, control, len << 3);
- memcpy(__entry->values, value, len << 3);
- ),
- TP_printk("%d %s CTRL:%s VALUE:%s", __entry->len, __entry->name,
- __print_array(__entry->ctrls, __entry->len, sizeof(__u64)),
- __print_array(__entry->values, __entry->len, sizeof(__u64)))
-);
-
-TRACE_EVENT(trap_reg,
- TP_PROTO(const char *fn, int reg, bool is_write, u64 write_value),
- TP_ARGS(fn, reg, is_write, write_value),
-
- TP_STRUCT__entry(
- __field(const char *, fn)
- __field(int, reg)
- __field(bool, is_write)
- __field(u64, write_value)
- ),
-
- TP_fast_assign(
- __entry->fn = fn;
- __entry->reg = reg;
- __entry->is_write = is_write;
- __entry->write_value = write_value;
- ),
-
- TP_printk("%s %s reg %d (0x%016llx)", __entry->fn, __entry->is_write?"write to":"read from", __entry->reg, __entry->write_value)
-);
-
TRACE_EVENT(kvm_handle_sys_reg,
TP_PROTO(unsigned long hsr),
TP_ARGS(hsr),
@@ -188,7 +113,7 @@ TRACE_EVENT(kvm_sys_access,
__entry->vcpu_pc, __entry->name ?: "UNKN",
__entry->Op0, __entry->Op1, __entry->CRn,
__entry->CRm, __entry->Op2,
- __entry->is_write ? "write" : "read")
+ str_write_read(__entry->is_write))
);
TRACE_EVENT(kvm_set_guest_debug,
diff --git a/arch/arm64/kvm/vgic-sys-reg-v3.c b/arch/arm64/kvm/vgic-sys-reg-v3.c
index 9e7c486b48c2..bdc2d57370b2 100644
--- a/arch/arm64/kvm/vgic-sys-reg-v3.c
+++ b/arch/arm64/kvm/vgic-sys-reg-v3.c
@@ -35,12 +35,12 @@ static int set_gic_ctlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
vgic_v3_cpu->num_id_bits = host_id_bits;
- host_seis = FIELD_GET(ICH_VTR_SEIS_MASK, kvm_vgic_global_state.ich_vtr_el2);
+ host_seis = FIELD_GET(ICH_VTR_EL2_SEIS, kvm_vgic_global_state.ich_vtr_el2);
seis = FIELD_GET(ICC_CTLR_EL1_SEIS_MASK, val);
if (host_seis != seis)
return -EINVAL;
- host_a3v = FIELD_GET(ICH_VTR_A3V_MASK, kvm_vgic_global_state.ich_vtr_el2);
+ host_a3v = FIELD_GET(ICH_VTR_EL2_A3V, kvm_vgic_global_state.ich_vtr_el2);
a3v = FIELD_GET(ICC_CTLR_EL1_A3V_MASK, val);
if (host_a3v != a3v)
return -EINVAL;
@@ -68,10 +68,10 @@ static int get_gic_ctlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
val |= FIELD_PREP(ICC_CTLR_EL1_PRI_BITS_MASK, vgic_v3_cpu->num_pri_bits - 1);
val |= FIELD_PREP(ICC_CTLR_EL1_ID_BITS_MASK, vgic_v3_cpu->num_id_bits);
val |= FIELD_PREP(ICC_CTLR_EL1_SEIS_MASK,
- FIELD_GET(ICH_VTR_SEIS_MASK,
+ FIELD_GET(ICH_VTR_EL2_SEIS,
kvm_vgic_global_state.ich_vtr_el2));
val |= FIELD_PREP(ICC_CTLR_EL1_A3V_MASK,
- FIELD_GET(ICH_VTR_A3V_MASK, kvm_vgic_global_state.ich_vtr_el2));
+ FIELD_GET(ICH_VTR_EL2_A3V, kvm_vgic_global_state.ich_vtr_el2));
/*
* The VMCR.CTLR value is in ICC_CTLR_EL1 layout.
* Extract it directly using ICC_CTLR_EL1 reg definitions.
@@ -297,6 +297,91 @@ static int get_gic_sre(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
return 0;
}
+static int set_gic_ich_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
+ u64 val)
+{
+ __vcpu_assign_sys_reg(vcpu, r->reg, val);
+ return 0;
+}
+
+static int get_gic_ich_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
+ u64 *val)
+{
+ *val = __vcpu_sys_reg(vcpu, r->reg);
+ return 0;
+}
+
+static int set_gic_ich_apr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
+ u64 val)
+{
+ u8 idx = r->Op2 & 3;
+
+ if (idx > vgic_v3_max_apr_idx(vcpu))
+ return -EINVAL;
+
+ return set_gic_ich_reg(vcpu, r, val);
+}
+
+static int get_gic_ich_apr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
+ u64 *val)
+{
+ u8 idx = r->Op2 & 3;
+
+ if (idx > vgic_v3_max_apr_idx(vcpu))
+ return -EINVAL;
+
+ return get_gic_ich_reg(vcpu, r, val);
+}
+
+static int set_gic_icc_sre(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
+ u64 val)
+{
+ if (val != KVM_ICC_SRE_EL2)
+ return -EINVAL;
+ return 0;
+}
+
+static int get_gic_icc_sre(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
+ u64 *val)
+{
+ *val = KVM_ICC_SRE_EL2;
+ return 0;
+}
+
+static int set_gic_ich_vtr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
+ u64 val)
+{
+ if (val != kvm_get_guest_vtr_el2())
+ return -EINVAL;
+ return 0;
+}
+
+static int get_gic_ich_vtr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
+ u64 *val)
+{
+ *val = kvm_get_guest_vtr_el2();
+ return 0;
+}
+
+static unsigned int el2_visibility(const struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd)
+{
+ return vcpu_has_nv(vcpu) ? 0 : REG_HIDDEN;
+}
+
+#define __EL2_REG(r, acc, i) \
+ { \
+ SYS_DESC(SYS_ ## r), \
+ .get_user = get_gic_ ## acc, \
+ .set_user = set_gic_ ## acc, \
+ .reg = i, \
+ .visibility = el2_visibility, \
+ }
+
+#define EL2_REG(r, acc) __EL2_REG(r, acc, r)
+
+#define EL2_REG_RO(r, acc) __EL2_REG(r, acc, 0)
+
static const struct sys_reg_desc gic_v3_icc_reg_descs[] = {
{ SYS_DESC(SYS_ICC_PMR_EL1),
.set_user = set_gic_pmr, .get_user = get_gic_pmr, },
@@ -328,8 +413,42 @@ static const struct sys_reg_desc gic_v3_icc_reg_descs[] = {
.set_user = set_gic_grpen0, .get_user = get_gic_grpen0, },
{ SYS_DESC(SYS_ICC_IGRPEN1_EL1),
.set_user = set_gic_grpen1, .get_user = get_gic_grpen1, },
+ EL2_REG(ICH_AP0R0_EL2, ich_apr),
+ EL2_REG(ICH_AP0R1_EL2, ich_apr),
+ EL2_REG(ICH_AP0R2_EL2, ich_apr),
+ EL2_REG(ICH_AP0R3_EL2, ich_apr),
+ EL2_REG(ICH_AP1R0_EL2, ich_apr),
+ EL2_REG(ICH_AP1R1_EL2, ich_apr),
+ EL2_REG(ICH_AP1R2_EL2, ich_apr),
+ EL2_REG(ICH_AP1R3_EL2, ich_apr),
+ EL2_REG_RO(ICC_SRE_EL2, icc_sre),
+ EL2_REG(ICH_HCR_EL2, ich_reg),
+ EL2_REG_RO(ICH_VTR_EL2, ich_vtr),
+ EL2_REG(ICH_VMCR_EL2, ich_reg),
+ EL2_REG(ICH_LR0_EL2, ich_reg),
+ EL2_REG(ICH_LR1_EL2, ich_reg),
+ EL2_REG(ICH_LR2_EL2, ich_reg),
+ EL2_REG(ICH_LR3_EL2, ich_reg),
+ EL2_REG(ICH_LR4_EL2, ich_reg),
+ EL2_REG(ICH_LR5_EL2, ich_reg),
+ EL2_REG(ICH_LR6_EL2, ich_reg),
+ EL2_REG(ICH_LR7_EL2, ich_reg),
+ EL2_REG(ICH_LR8_EL2, ich_reg),
+ EL2_REG(ICH_LR9_EL2, ich_reg),
+ EL2_REG(ICH_LR10_EL2, ich_reg),
+ EL2_REG(ICH_LR11_EL2, ich_reg),
+ EL2_REG(ICH_LR12_EL2, ich_reg),
+ EL2_REG(ICH_LR13_EL2, ich_reg),
+ EL2_REG(ICH_LR14_EL2, ich_reg),
+ EL2_REG(ICH_LR15_EL2, ich_reg),
};
+const struct sys_reg_desc *vgic_v3_get_sysreg_table(unsigned int *sz)
+{
+ *sz = ARRAY_SIZE(gic_v3_icc_reg_descs);
+ return gic_v3_icc_reg_descs;
+}
+
static u64 attr_to_id(u64 attr)
{
return ARM64_SYS_REG(FIELD_GET(KVM_REG_ARM_VGIC_SYSREG_OP0_MASK, attr),
@@ -341,8 +460,12 @@ static u64 attr_to_id(u64 attr)
int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
{
- if (get_reg_by_id(attr_to_id(attr->attr), gic_v3_icc_reg_descs,
- ARRAY_SIZE(gic_v3_icc_reg_descs)))
+ const struct sys_reg_desc *r;
+
+ r = get_reg_by_id(attr_to_id(attr->attr), gic_v3_icc_reg_descs,
+ ARRAY_SIZE(gic_v3_icc_reg_descs));
+
+ if (r && !sysreg_hidden(vcpu, r))
return 0;
return -ENXIO;
diff --git a/arch/arm64/kvm/vgic/vgic-debug.c b/arch/arm64/kvm/vgic/vgic-debug.c
index 78cde687383c..bb92853d1fd3 100644
--- a/arch/arm64/kvm/vgic/vgic-debug.c
+++ b/arch/arm64/kvm/vgic/vgic-debug.c
@@ -28,27 +28,74 @@ struct vgic_state_iter {
int nr_lpis;
int dist_id;
int vcpu_id;
- int intid;
+ unsigned long intid;
int lpi_idx;
- u32 *lpi_array;
};
-static void iter_next(struct vgic_state_iter *iter)
+static void iter_next(struct kvm *kvm, struct vgic_state_iter *iter)
{
+ struct vgic_dist *dist = &kvm->arch.vgic;
+
if (iter->dist_id == 0) {
iter->dist_id++;
return;
}
+ /*
+ * Let the xarray drive the iterator after the last SPI, as the iterator
+ * has exhausted the sequentially-allocated INTID space.
+ */
+ if (iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS - 1) &&
+ iter->nr_lpis) {
+ if (iter->lpi_idx < iter->nr_lpis)
+ xa_find_after(&dist->lpi_xa, &iter->intid,
+ VGIC_LPI_MAX_INTID,
+ LPI_XA_MARK_DEBUG_ITER);
+ iter->lpi_idx++;
+ return;
+ }
+
iter->intid++;
if (iter->intid == VGIC_NR_PRIVATE_IRQS &&
++iter->vcpu_id < iter->nr_cpus)
iter->intid = 0;
+}
- if (iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS)) {
- if (iter->lpi_idx < iter->nr_lpis)
- iter->intid = iter->lpi_array[iter->lpi_idx];
- iter->lpi_idx++;
+static int iter_mark_lpis(struct kvm *kvm)
+{
+ struct vgic_dist *dist = &kvm->arch.vgic;
+ unsigned long intid, flags;
+ struct vgic_irq *irq;
+ int nr_lpis = 0;
+
+ xa_lock_irqsave(&dist->lpi_xa, flags);
+
+ xa_for_each(&dist->lpi_xa, intid, irq) {
+ if (!vgic_try_get_irq_ref(irq))
+ continue;
+
+ __xa_set_mark(&dist->lpi_xa, intid, LPI_XA_MARK_DEBUG_ITER);
+ nr_lpis++;
+ }
+
+ xa_unlock_irqrestore(&dist->lpi_xa, flags);
+
+ return nr_lpis;
+}
+
+static void iter_unmark_lpis(struct kvm *kvm)
+{
+ struct vgic_dist *dist = &kvm->arch.vgic;
+ unsigned long intid, flags;
+ struct vgic_irq *irq;
+
+ xa_for_each_marked(&dist->lpi_xa, intid, irq, LPI_XA_MARK_DEBUG_ITER) {
+ xa_lock_irqsave(&dist->lpi_xa, flags);
+ __xa_clear_mark(&dist->lpi_xa, intid, LPI_XA_MARK_DEBUG_ITER);
+ xa_unlock_irqrestore(&dist->lpi_xa, flags);
+
+ /* vgic_put_irq() expects to be called outside of the xa_lock */
+ vgic_put_irq(kvm, irq);
}
}
@@ -61,15 +108,12 @@ static void iter_init(struct kvm *kvm, struct vgic_state_iter *iter,
iter->nr_cpus = nr_cpus;
iter->nr_spis = kvm->arch.vgic.nr_spis;
- if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
- iter->nr_lpis = vgic_copy_lpi_list(kvm, NULL, &iter->lpi_array);
- if (iter->nr_lpis < 0)
- iter->nr_lpis = 0;
- }
+ if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)
+ iter->nr_lpis = iter_mark_lpis(kvm);
/* Fast forward to the right position if needed */
while (pos--)
- iter_next(iter);
+ iter_next(kvm, iter);
}
static bool end_of_vgic(struct vgic_state_iter *iter)
@@ -77,7 +121,7 @@ static bool end_of_vgic(struct vgic_state_iter *iter)
return iter->dist_id > 0 &&
iter->vcpu_id == iter->nr_cpus &&
iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS) &&
- iter->lpi_idx > iter->nr_lpis;
+ (!iter->nr_lpis || iter->lpi_idx > iter->nr_lpis);
}
static void *vgic_debug_start(struct seq_file *s, loff_t *pos)
@@ -85,7 +129,7 @@ static void *vgic_debug_start(struct seq_file *s, loff_t *pos)
struct kvm *kvm = s->private;
struct vgic_state_iter *iter;
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.config_lock);
iter = kvm->arch.vgic.iter;
if (iter) {
iter = ERR_PTR(-EBUSY);
@@ -104,7 +148,7 @@ static void *vgic_debug_start(struct seq_file *s, loff_t *pos)
if (end_of_vgic(iter))
iter = NULL;
out:
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.config_lock);
return iter;
}
@@ -114,7 +158,7 @@ static void *vgic_debug_next(struct seq_file *s, void *v, loff_t *pos)
struct vgic_state_iter *iter = kvm->arch.vgic.iter;
++*pos;
- iter_next(iter);
+ iter_next(kvm, iter);
if (end_of_vgic(iter))
iter = NULL;
return iter;
@@ -132,15 +176,16 @@ static void vgic_debug_stop(struct seq_file *s, void *v)
if (IS_ERR(v))
return;
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.config_lock);
iter = kvm->arch.vgic.iter;
- kfree(iter->lpi_array);
+ iter_unmark_lpis(kvm);
kfree(iter);
kvm->arch.vgic.iter = NULL;
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.config_lock);
}
-static void print_dist_state(struct seq_file *s, struct vgic_dist *dist)
+static void print_dist_state(struct seq_file *s, struct vgic_dist *dist,
+ struct vgic_state_iter *iter)
{
bool v3 = dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3;
@@ -149,7 +194,7 @@ static void print_dist_state(struct seq_file *s, struct vgic_dist *dist)
seq_printf(s, "vgic_model:\t%s\n", v3 ? "GICv3" : "GICv2");
seq_printf(s, "nr_spis:\t%d\n", dist->nr_spis);
if (v3)
- seq_printf(s, "nr_lpis:\t%d\n", dist->lpi_list_count);
+ seq_printf(s, "nr_lpis:\t%d\n", iter->nr_lpis);
seq_printf(s, "enabled:\t%d\n", dist->enabled);
seq_printf(s, "\n");
@@ -166,7 +211,7 @@ static void print_header(struct seq_file *s, struct vgic_irq *irq,
if (vcpu) {
hdr = "VCPU";
- id = vcpu->vcpu_id;
+ id = vcpu->vcpu_idx;
}
seq_printf(s, "\n");
@@ -212,7 +257,7 @@ static void print_irq_state(struct seq_file *s, struct vgic_irq *irq,
" %2d "
"\n",
type, irq->intid,
- (irq->target_vcpu) ? irq->target_vcpu->vcpu_id : -1,
+ (irq->target_vcpu) ? irq->target_vcpu->vcpu_idx : -1,
pending,
irq->line_level,
irq->active,
@@ -224,7 +269,7 @@ static void print_irq_state(struct seq_file *s, struct vgic_irq *irq,
irq->mpidr,
irq->source,
irq->priority,
- (irq->vcpu) ? irq->vcpu->vcpu_id : -1);
+ (irq->vcpu) ? irq->vcpu->vcpu_idx : -1);
}
static int vgic_debug_show(struct seq_file *s, void *v)
@@ -236,7 +281,7 @@ static int vgic_debug_show(struct seq_file *s, void *v)
unsigned long flags;
if (iter->dist_id == 0) {
- print_dist_state(s, &kvm->arch.vgic);
+ print_dist_state(s, &kvm->arch.vgic, iter);
return 0;
}
@@ -246,11 +291,16 @@ static int vgic_debug_show(struct seq_file *s, void *v)
if (iter->vcpu_id < iter->nr_cpus)
vcpu = kvm_get_vcpu(kvm, iter->vcpu_id);
- irq = vgic_get_irq(kvm, vcpu, iter->intid);
- if (!irq) {
- seq_printf(s, " LPI %4d freed\n", iter->intid);
- return 0;
- }
+ /*
+ * Expect this to succeed, as iter_mark_lpis() takes a reference on
+ * every LPI to be visited.
+ */
+ if (iter->intid < VGIC_NR_PRIVATE_IRQS)
+ irq = vgic_get_vcpu_irq(vcpu, iter->intid);
+ else
+ irq = vgic_get_irq(kvm, iter->intid);
+ if (WARN_ON_ONCE(!irq))
+ return -EINVAL;
raw_spin_lock_irqsave(&irq->irq_lock, flags);
print_irq_state(s, irq, vcpu);
@@ -278,3 +328,230 @@ void vgic_debug_init(struct kvm *kvm)
void vgic_debug_destroy(struct kvm *kvm)
{
}
+
+/**
+ * struct vgic_its_iter - Iterator for traversing VGIC ITS device tables.
+ * @dev: Pointer to the current its_device being processed.
+ * @ite: Pointer to the current its_ite within the device being processed.
+ *
+ * This structure is used to maintain the current position during iteration
+ * over the ITS device tables. It holds pointers to both the current device
+ * and the current ITE within that device.
+ */
+struct vgic_its_iter {
+ struct its_device *dev;
+ struct its_ite *ite;
+};
+
+/**
+ * end_of_iter - Checks if the iterator has reached the end.
+ * @iter: The iterator to check.
+ *
+ * When the iterator completed processing the final ITE in the last device
+ * table, it was marked to indicate the end of iteration by setting its
+ * device and ITE pointers to NULL.
+ * This function checks whether the iterator was marked as end.
+ *
+ * Return: True if the iterator is marked as end, false otherwise.
+ */
+static inline bool end_of_iter(struct vgic_its_iter *iter)
+{
+ return !iter->dev && !iter->ite;
+}
+
+/**
+ * vgic_its_iter_next - Advances the iterator to the next entry in the ITS tables.
+ * @its: The VGIC ITS structure.
+ * @iter: The iterator to advance.
+ *
+ * This function moves the iterator to the next ITE within the current device,
+ * or to the first ITE of the next device if the current ITE is the last in
+ * the device. If the current device is the last device, the iterator is set
+ * to indicate the end of iteration.
+ */
+static void vgic_its_iter_next(struct vgic_its *its, struct vgic_its_iter *iter)
+{
+ struct its_device *dev = iter->dev;
+ struct its_ite *ite = iter->ite;
+
+ if (!ite || list_is_last(&ite->ite_list, &dev->itt_head)) {
+ if (list_is_last(&dev->dev_list, &its->device_list)) {
+ dev = NULL;
+ ite = NULL;
+ } else {
+ dev = list_next_entry(dev, dev_list);
+ ite = list_first_entry_or_null(&dev->itt_head,
+ struct its_ite,
+ ite_list);
+ }
+ } else {
+ ite = list_next_entry(ite, ite_list);
+ }
+
+ iter->dev = dev;
+ iter->ite = ite;
+}
+
+/**
+ * vgic_its_debug_start - Start function for the seq_file interface.
+ * @s: The seq_file structure.
+ * @pos: The starting position (offset).
+ *
+ * This function initializes the iterator to the beginning of the ITS tables
+ * and advances it to the specified position. It acquires the its_lock mutex
+ * to protect shared data.
+ *
+ * Return: An iterator pointer on success, NULL if no devices are found or
+ * the end of the list is reached, or ERR_PTR(-ENOMEM) on memory
+ * allocation failure.
+ */
+static void *vgic_its_debug_start(struct seq_file *s, loff_t *pos)
+{
+ struct vgic_its *its = s->private;
+ struct vgic_its_iter *iter;
+ struct its_device *dev;
+ loff_t offset = *pos;
+
+ mutex_lock(&its->its_lock);
+
+ dev = list_first_entry_or_null(&its->device_list,
+ struct its_device, dev_list);
+ if (!dev)
+ return NULL;
+
+ iter = kmalloc(sizeof(*iter), GFP_KERNEL);
+ if (!iter)
+ return ERR_PTR(-ENOMEM);
+
+ iter->dev = dev;
+ iter->ite = list_first_entry_or_null(&dev->itt_head,
+ struct its_ite, ite_list);
+
+ while (!end_of_iter(iter) && offset--)
+ vgic_its_iter_next(its, iter);
+
+ if (end_of_iter(iter)) {
+ kfree(iter);
+ return NULL;
+ }
+
+ return iter;
+}
+
+/**
+ * vgic_its_debug_next - Next function for the seq_file interface.
+ * @s: The seq_file structure.
+ * @v: The current iterator.
+ * @pos: The current position (offset).
+ *
+ * This function advances the iterator to the next entry and increments the
+ * position.
+ *
+ * Return: An iterator pointer on success, or NULL if the end of the list is
+ * reached.
+ */
+static void *vgic_its_debug_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ struct vgic_its *its = s->private;
+ struct vgic_its_iter *iter = v;
+
+ ++*pos;
+ vgic_its_iter_next(its, iter);
+
+ if (end_of_iter(iter)) {
+ kfree(iter);
+ return NULL;
+ }
+ return iter;
+}
+
+/**
+ * vgic_its_debug_stop - Stop function for the seq_file interface.
+ * @s: The seq_file structure.
+ * @v: The current iterator.
+ *
+ * This function frees the iterator and releases the its_lock mutex.
+ */
+static void vgic_its_debug_stop(struct seq_file *s, void *v)
+{
+ struct vgic_its *its = s->private;
+ struct vgic_its_iter *iter = v;
+
+ if (!IS_ERR_OR_NULL(iter))
+ kfree(iter);
+ mutex_unlock(&its->its_lock);
+}
+
+/**
+ * vgic_its_debug_show - Show function for the seq_file interface.
+ * @s: The seq_file structure.
+ * @v: The current iterator.
+ *
+ * This function formats and prints the ITS table entry information to the
+ * seq_file output.
+ *
+ * Return: 0 on success.
+ */
+static int vgic_its_debug_show(struct seq_file *s, void *v)
+{
+ struct vgic_its_iter *iter = v;
+ struct its_device *dev = iter->dev;
+ struct its_ite *ite = iter->ite;
+
+ if (!ite)
+ return 0;
+
+ if (list_is_first(&ite->ite_list, &dev->itt_head)) {
+ seq_printf(s, "\n");
+ seq_printf(s, "Device ID: 0x%x, Event ID Range: [0 - %llu]\n",
+ dev->device_id, BIT_ULL(dev->num_eventid_bits) - 1);
+ seq_printf(s, "EVENT_ID INTID HWINTID TARGET COL_ID HW\n");
+ seq_printf(s, "-----------------------------------------------\n");
+ }
+
+ if (ite->irq && ite->collection) {
+ seq_printf(s, "%8u %8u %8u %8u %8u %2d\n",
+ ite->event_id, ite->irq->intid, ite->irq->hwintid,
+ ite->collection->target_addr,
+ ite->collection->collection_id, ite->irq->hw);
+ }
+
+ return 0;
+}
+
+static const struct seq_operations vgic_its_debug_sops = {
+ .start = vgic_its_debug_start,
+ .next = vgic_its_debug_next,
+ .stop = vgic_its_debug_stop,
+ .show = vgic_its_debug_show
+};
+
+DEFINE_SEQ_ATTRIBUTE(vgic_its_debug);
+
+/**
+ * vgic_its_debug_init - Initializes the debugfs interface for VGIC ITS.
+ * @dev: The KVM device structure.
+ *
+ * This function creates a debugfs file named "vgic-its-state@%its_base"
+ * to expose the ITS table information.
+ *
+ * Return: 0 on success.
+ */
+int vgic_its_debug_init(struct kvm_device *dev)
+{
+ struct vgic_its *its = dev->private;
+ char *name;
+
+ name = kasprintf(GFP_KERNEL, "vgic-its-state@%llx", (u64)its->vgic_its_base);
+ if (!name)
+ return -ENOMEM;
+
+ debugfs_create_file(name, 0444, dev->kvm->debugfs_dentry, its, &vgic_its_debug_fops);
+
+ kfree(name);
+ return 0;
+}
+
+void vgic_its_debug_destroy(struct kvm_device *dev)
+{
+}
diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c
index f6d4f4052555..dc9f9db31026 100644
--- a/arch/arm64/kvm/vgic/vgic-init.c
+++ b/arch/arm64/kvm/vgic/vgic-init.c
@@ -34,9 +34,9 @@
*
* CPU Interface:
*
- * - kvm_vgic_vcpu_init(): initialization of static data that
- * doesn't depend on any sizing information or emulation type. No
- * allocation is allowed there.
+ * - kvm_vgic_vcpu_init(): initialization of static data that doesn't depend
+ * on any sizing information. Private interrupts are allocated if not
+ * already allocated at vgic-creation time.
*/
/* EARLY INIT */
@@ -53,13 +53,13 @@ void kvm_vgic_early_init(struct kvm *kvm)
{
struct vgic_dist *dist = &kvm->arch.vgic;
- INIT_LIST_HEAD(&dist->lpi_list_head);
- INIT_LIST_HEAD(&dist->lpi_translation_cache);
- raw_spin_lock_init(&dist->lpi_list_lock);
+ xa_init_flags(&dist->lpi_xa, XA_FLAGS_LOCK_IRQ);
}
/* CREATION */
+static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu, u32 type);
+
/**
* kvm_vgic_create: triggered by the instantiation of the VGIC device by
* user space, either through the legacy KVM_CREATE_IRQCHIP ioctl (v2 only)
@@ -71,12 +71,10 @@ void kvm_vgic_early_init(struct kvm *kvm)
int kvm_vgic_create(struct kvm *kvm, u32 type)
{
struct kvm_vcpu *vcpu;
+ u64 aa64pfr0, pfr1;
unsigned long i;
int ret;
- if (irqchip_in_kernel(kvm))
- return -EEXIST;
-
/*
* This function is also called by the KVM_CREATE_IRQCHIP handler,
* which had no chance yet to check the availability of the GICv2
@@ -87,10 +85,45 @@ int kvm_vgic_create(struct kvm *kvm, u32 type)
!kvm_vgic_global_state.can_emulate_gicv2)
return -ENODEV;
+ /*
+ * Ensure mutual exclusion with vCPU creation and any vCPU ioctls by:
+ *
+ * - Holding kvm->lock to prevent KVM_CREATE_VCPU from reaching
+ * kvm_arch_vcpu_precreate() and ensuring created_vcpus is stable.
+ * This alone is insufficient, as kvm_vm_ioctl_create_vcpu() drops
+ * the kvm->lock before completing the vCPU creation.
+ */
+ lockdep_assert_held(&kvm->lock);
+
+ /*
+ * - Acquiring the vCPU mutex for every *online* vCPU to prevent
+ * concurrent vCPU ioctls for vCPUs already visible to userspace.
+ */
ret = -EBUSY;
- if (!lock_all_vcpus(kvm))
+ if (kvm_trylock_all_vcpus(kvm))
return ret;
+ /*
+ * - Taking the config_lock which protects VGIC data structures such
+ * as the per-vCPU arrays of private IRQs (SGIs, PPIs).
+ */
+ mutex_lock(&kvm->arch.config_lock);
+
+ /*
+ * - Bailing on the entire thing if a vCPU is in the middle of creation,
+ * dropped the kvm->lock, but hasn't reached kvm_arch_vcpu_create().
+ *
+ * The whole combination of this guarantees that no vCPU can get into
+ * KVM with a VGIC configuration inconsistent with the VM's VGIC.
+ */
+ if (kvm->created_vcpus != atomic_read(&kvm->online_vcpus))
+ goto out_unlock;
+
+ if (irqchip_in_kernel(kvm)) {
+ ret = -EEXIST;
+ goto out_unlock;
+ }
+
kvm_for_each_vcpu(i, vcpu, kvm) {
if (vcpu_has_run_once(vcpu))
goto out_unlock;
@@ -107,18 +140,48 @@ int kvm_vgic_create(struct kvm *kvm, u32 type)
goto out_unlock;
}
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ ret = vgic_allocate_private_irqs_locked(vcpu, type);
+ if (ret)
+ break;
+ }
+
+ if (ret) {
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+ kfree(vgic_cpu->private_irqs);
+ vgic_cpu->private_irqs = NULL;
+ }
+
+ goto out_unlock;
+ }
+
kvm->arch.vgic.in_kernel = true;
kvm->arch.vgic.vgic_model = type;
+ kvm->arch.vgic.implementation_rev = KVM_VGIC_IMP_REV_LATEST;
kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF;
- if (type == KVM_DEV_TYPE_ARM_VGIC_V2)
+ aa64pfr0 = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1) & ~ID_AA64PFR0_EL1_GIC;
+ pfr1 = kvm_read_vm_id_reg(kvm, SYS_ID_PFR1_EL1) & ~ID_PFR1_EL1_GIC;
+
+ if (type == KVM_DEV_TYPE_ARM_VGIC_V2) {
kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF;
- else
+ } else {
INIT_LIST_HEAD(&kvm->arch.vgic.rd_regions);
+ aa64pfr0 |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, GIC, IMP);
+ pfr1 |= SYS_FIELD_PREP_ENUM(ID_PFR1_EL1, GIC, GICv3);
+ }
+
+ kvm_set_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1, aa64pfr0);
+ kvm_set_vm_id_reg(kvm, SYS_ID_PFR1_EL1, pfr1);
+
+ if (type == KVM_DEV_TYPE_ARM_VGIC_V3)
+ kvm->arch.vgic.nassgicap = system_supports_direct_sgis();
out_unlock:
- unlock_all_vcpus(kvm);
+ mutex_unlock(&kvm->arch.config_lock);
+ kvm_unlock_all_vcpus(kvm);
return ret;
}
@@ -135,6 +198,7 @@ static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis)
struct kvm_vcpu *vcpu0 = kvm_get_vcpu(kvm, 0);
int i;
+ dist->active_spis = (atomic_t)ATOMIC_INIT(0);
dist->spis = kcalloc(nr_spis, sizeof(struct vgic_irq), GFP_KERNEL_ACCOUNT);
if (!dist->spis)
return -ENOMEM;
@@ -155,7 +219,7 @@ static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis)
raw_spin_lock_init(&irq->irq_lock);
irq->vcpu = NULL;
irq->target_vcpu = vcpu0;
- kref_init(&irq->refcount);
+ refcount_set(&irq->refcount, 0);
switch (dist->vgic_model) {
case KVM_DEV_TYPE_ARM_VGIC_V2:
irq->targets = 0;
@@ -174,27 +238,43 @@ static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis)
return 0;
}
-/**
- * kvm_vgic_vcpu_init() - Initialize static VGIC VCPU data
- * structures and register VCPU-specific KVM iodevs
- *
- * @vcpu: pointer to the VCPU being created and initialized
- *
- * Only do initialization, but do not actually enable the
- * VGIC CPU interface
- */
-int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu)
+/* Default GICv3 Maintenance Interrupt INTID, as per SBSA */
+#define DEFAULT_MI_INTID 25
+
+int kvm_vgic_vcpu_nv_init(struct kvm_vcpu *vcpu)
+{
+ int ret;
+
+ guard(mutex)(&vcpu->kvm->arch.config_lock);
+
+ /*
+ * Matching the tradition established with the timers, provide
+ * a default PPI for the maintenance interrupt. It makes
+ * things easier to reason about.
+ */
+ if (vcpu->kvm->arch.vgic.mi_intid == 0)
+ vcpu->kvm->arch.vgic.mi_intid = DEFAULT_MI_INTID;
+ ret = kvm_vgic_set_owner(vcpu, vcpu->kvm->arch.vgic.mi_intid, vcpu);
+
+ return ret;
+}
+
+static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu, u32 type)
{
struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
- int ret = 0;
int i;
- vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF;
+ lockdep_assert_held(&vcpu->kvm->arch.config_lock);
- INIT_LIST_HEAD(&vgic_cpu->ap_list_head);
- raw_spin_lock_init(&vgic_cpu->ap_list_lock);
- atomic_set(&vgic_cpu->vgic_v3.its_vpe.vlpi_count, 0);
+ if (vgic_cpu->private_irqs)
+ return 0;
+
+ vgic_cpu->private_irqs = kcalloc(VGIC_NR_PRIVATE_IRQS,
+ sizeof(struct vgic_irq),
+ GFP_KERNEL_ACCOUNT);
+
+ if (!vgic_cpu->private_irqs)
+ return -ENOMEM;
/*
* Enable and configure all SGIs to be edge-triggered and
@@ -208,7 +288,7 @@ int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu)
irq->intid = i;
irq->vcpu = NULL;
irq->target_vcpu = vcpu;
- kref_init(&irq->refcount);
+ refcount_set(&irq->refcount, 0);
if (vgic_irq_is_sgi(i)) {
/* SGIs */
irq->enabled = 1;
@@ -217,29 +297,79 @@ int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu)
/* PPIs */
irq->config = VGIC_CONFIG_LEVEL;
}
+
+ switch (type) {
+ case KVM_DEV_TYPE_ARM_VGIC_V3:
+ irq->group = 1;
+ irq->mpidr = kvm_vcpu_get_mpidr_aff(vcpu);
+ break;
+ case KVM_DEV_TYPE_ARM_VGIC_V2:
+ irq->group = 0;
+ irq->targets = BIT(vcpu->vcpu_id);
+ break;
+ }
}
+ return 0;
+}
+
+static int vgic_allocate_private_irqs(struct kvm_vcpu *vcpu, u32 type)
+{
+ int ret;
+
+ mutex_lock(&vcpu->kvm->arch.config_lock);
+ ret = vgic_allocate_private_irqs_locked(vcpu, type);
+ mutex_unlock(&vcpu->kvm->arch.config_lock);
+
+ return ret;
+}
+
+/**
+ * kvm_vgic_vcpu_init() - Initialize static VGIC VCPU data
+ * structures and register VCPU-specific KVM iodevs
+ *
+ * @vcpu: pointer to the VCPU being created and initialized
+ *
+ * Only do initialization, but do not actually enable the
+ * VGIC CPU interface
+ */
+int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu)
+{
+ struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+ struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+ int ret = 0;
+
+ vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF;
+
+ INIT_LIST_HEAD(&vgic_cpu->ap_list_head);
+ raw_spin_lock_init(&vgic_cpu->ap_list_lock);
+ atomic_set(&vgic_cpu->vgic_v3.its_vpe.vlpi_count, 0);
+
if (!irqchip_in_kernel(vcpu->kvm))
return 0;
+ ret = vgic_allocate_private_irqs(vcpu, dist->vgic_model);
+ if (ret)
+ return ret;
+
/*
* If we are creating a VCPU with a GICv3 we must also register the
* KVM io device for the redistributor that belongs to this VCPU.
*/
if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
- mutex_lock(&vcpu->kvm->lock);
+ mutex_lock(&vcpu->kvm->slots_lock);
ret = vgic_register_redist_iodev(vcpu);
- mutex_unlock(&vcpu->kvm->lock);
+ mutex_unlock(&vcpu->kvm->slots_lock);
}
return ret;
}
-static void kvm_vgic_vcpu_enable(struct kvm_vcpu *vcpu)
+static void kvm_vgic_vcpu_reset(struct kvm_vcpu *vcpu)
{
if (kvm_vgic_global_state.type == VGIC_V2)
- vgic_v2_enable(vcpu);
+ vgic_v2_reset(vcpu);
else
- vgic_v3_enable(vcpu);
+ vgic_v3_reset(vcpu);
}
/*
@@ -250,15 +380,16 @@ static void kvm_vgic_vcpu_enable(struct kvm_vcpu *vcpu)
* The function is generally called when nr_spis has been explicitly set
* by the guest through the KVM DEVICE API. If not nr_spis is set to 256.
* vgic_initialized() returns true when this function has succeeded.
- * Must be called with kvm->lock held!
*/
int vgic_init(struct kvm *kvm)
{
struct vgic_dist *dist = &kvm->arch.vgic;
struct kvm_vcpu *vcpu;
- int ret = 0, i;
+ int ret = 0;
unsigned long idx;
+ lockdep_assert_held(&kvm->arch.config_lock);
+
if (vgic_initialized(kvm))
return 0;
@@ -274,59 +405,25 @@ int vgic_init(struct kvm *kvm)
if (ret)
goto out;
- /* Initialize groups on CPUs created before the VGIC type was known */
- kvm_for_each_vcpu(idx, vcpu, kvm) {
- struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-
- for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) {
- struct vgic_irq *irq = &vgic_cpu->private_irqs[i];
- switch (dist->vgic_model) {
- case KVM_DEV_TYPE_ARM_VGIC_V3:
- irq->group = 1;
- irq->mpidr = kvm_vcpu_get_mpidr_aff(vcpu);
- break;
- case KVM_DEV_TYPE_ARM_VGIC_V2:
- irq->group = 0;
- irq->targets = 1U << idx;
- break;
- default:
- ret = -EINVAL;
- goto out;
- }
- }
- }
-
- if (vgic_has_its(kvm))
- vgic_lpi_translation_cache_init(kvm);
-
/*
- * If we have GICv4.1 enabled, unconditionnaly request enable the
- * v4 support so that we get HW-accelerated vSGIs. Otherwise, only
- * enable it if we present a virtual ITS to the guest.
+ * Ensure vPEs are allocated if direct IRQ injection (e.g. vSGIs,
+ * vLPIs) is supported.
*/
- if (vgic_supports_direct_msis(kvm)) {
+ if (vgic_supports_direct_irqs(kvm)) {
ret = vgic_v4_init(kvm);
if (ret)
goto out;
}
kvm_for_each_vcpu(idx, vcpu, kvm)
- kvm_vgic_vcpu_enable(vcpu);
+ kvm_vgic_vcpu_reset(vcpu);
ret = kvm_vgic_setup_default_irq_routing(kvm);
if (ret)
goto out;
vgic_debug_init(kvm);
-
- /*
- * If userspace didn't set the GIC implementation revision,
- * default to the latest and greatest. You know want it.
- */
- if (!dist->implementation_rev)
- dist->implementation_rev = KVM_VGIC_IMP_REV_LATEST;
dist->initialized = true;
-
out:
return ret;
}
@@ -346,20 +443,19 @@ static void kvm_vgic_dist_destroy(struct kvm *kvm)
if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
list_for_each_entry_safe(rdreg, next, &dist->rd_regions, list)
- vgic_v3_free_redist_region(rdreg);
+ vgic_v3_free_redist_region(kvm, rdreg);
INIT_LIST_HEAD(&dist->rd_regions);
} else {
dist->vgic_cpu_base = VGIC_ADDR_UNDEF;
}
- if (vgic_has_its(kvm))
- vgic_lpi_translation_cache_destroy(kvm);
-
- if (vgic_supports_direct_msis(kvm))
+ if (vgic_supports_direct_irqs(kvm))
vgic_v4_teardown(kvm);
+
+ xa_destroy(&dist->lpi_xa);
}
-void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu)
+static void __kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu)
{
struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
@@ -370,33 +466,69 @@ void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu)
vgic_flush_pending_lpis(vcpu);
INIT_LIST_HEAD(&vgic_cpu->ap_list_head);
- vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF;
+ kfree(vgic_cpu->private_irqs);
+ vgic_cpu->private_irqs = NULL;
+
+ if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
+ /*
+ * If this vCPU is being destroyed because of a failed creation
+ * then unregister the redistributor to avoid leaving behind a
+ * dangling pointer to the vCPU struct.
+ *
+ * vCPUs that have been successfully created (i.e. added to
+ * kvm->vcpu_array) get unregistered in kvm_vgic_destroy(), as
+ * this function gets called while holding kvm->arch.config_lock
+ * in the VM teardown path and would otherwise introduce a lock
+ * inversion w.r.t. kvm->srcu.
+ *
+ * vCPUs that failed creation are torn down outside of the
+ * kvm->arch.config_lock and do not get unregistered in
+ * kvm_vgic_destroy(), meaning it is both safe and necessary to
+ * do so here.
+ */
+ if (kvm_get_vcpu_by_id(vcpu->kvm, vcpu->vcpu_id) != vcpu)
+ vgic_unregister_redist_iodev(vcpu);
+
+ vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF;
+ }
}
-/* To be called with kvm->lock held */
-static void __kvm_vgic_destroy(struct kvm *kvm)
+void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+
+ mutex_lock(&kvm->slots_lock);
+ __kvm_vgic_vcpu_destroy(vcpu);
+ mutex_unlock(&kvm->slots_lock);
+}
+
+void kvm_vgic_destroy(struct kvm *kvm)
{
struct kvm_vcpu *vcpu;
unsigned long i;
+ mutex_lock(&kvm->slots_lock);
+ mutex_lock(&kvm->arch.config_lock);
+
vgic_debug_destroy(kvm);
kvm_for_each_vcpu(i, vcpu, kvm)
- kvm_vgic_vcpu_destroy(vcpu);
+ __kvm_vgic_vcpu_destroy(vcpu);
kvm_vgic_dist_destroy(kvm);
-}
-void kvm_vgic_destroy(struct kvm *kvm)
-{
- mutex_lock(&kvm->lock);
- __kvm_vgic_destroy(kvm);
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.config_lock);
+
+ if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ vgic_unregister_redist_iodev(vcpu);
+
+ mutex_unlock(&kvm->slots_lock);
}
/**
* vgic_lazy_init: Lazy init is only allowed if the GIC exposed to the guest
- * is a GICv2. A GICv3 must be explicitly initialized by the guest using the
+ * is a GICv2. A GICv3 must be explicitly initialized by userspace using the
* KVM_DEV_ARM_VGIC_GRP_CTRL KVM_DEVICE group.
* @kvm: kvm struct pointer
*/
@@ -414,9 +546,9 @@ int vgic_lazy_init(struct kvm *kvm)
if (kvm->arch.vgic.vgic_model != KVM_DEV_TYPE_ARM_VGIC_V2)
return -EBUSY;
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.config_lock);
ret = vgic_init(kvm);
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.config_lock);
}
return ret;
@@ -425,67 +557,95 @@ int vgic_lazy_init(struct kvm *kvm)
/* RESOURCE MAPPING */
/**
+ * kvm_vgic_map_resources - map the MMIO regions
+ * @kvm: kvm struct pointer
+ *
* Map the MMIO regions depending on the VGIC model exposed to the guest
* called on the first VCPU run.
* Also map the virtual CPU interface into the VM.
* v2 calls vgic_init() if not already done.
* v3 and derivatives return an error if the VGIC is not initialized.
- * vgic_ready() returns true if this function has succeeded.
- * @kvm: kvm struct pointer
*/
int kvm_vgic_map_resources(struct kvm *kvm)
{
struct vgic_dist *dist = &kvm->arch.vgic;
+ enum vgic_type type;
+ gpa_t dist_base;
int ret = 0;
- if (likely(vgic_ready(kvm)))
+ if (likely(smp_load_acquire(&dist->ready)))
return 0;
- mutex_lock(&kvm->lock);
- if (vgic_ready(kvm))
+ mutex_lock(&kvm->slots_lock);
+ mutex_lock(&kvm->arch.config_lock);
+ if (dist->ready)
goto out;
if (!irqchip_in_kernel(kvm))
goto out;
- if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2)
+ if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2) {
ret = vgic_v2_map_resources(kvm);
- else
+ type = VGIC_V2;
+ } else {
ret = vgic_v3_map_resources(kvm);
+ type = VGIC_V3;
+ }
if (ret)
- __kvm_vgic_destroy(kvm);
- else
- dist->ready = true;
+ goto out;
+ dist_base = dist->vgic_dist_base;
+ mutex_unlock(&kvm->arch.config_lock);
+
+ ret = vgic_register_dist_iodev(kvm, dist_base, type);
+ if (ret) {
+ kvm_err("Unable to register VGIC dist MMIO regions\n");
+ goto out_slots;
+ }
+
+ smp_store_release(&dist->ready, true);
+ goto out_slots;
out:
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.config_lock);
+out_slots:
+ if (ret)
+ kvm_vm_dead(kvm);
+
+ mutex_unlock(&kvm->slots_lock);
+
return ret;
}
/* GENERIC PROBE */
-static int vgic_init_cpu_starting(unsigned int cpu)
+void kvm_vgic_cpu_up(void)
{
enable_percpu_irq(kvm_vgic_global_state.maint_irq, 0);
- return 0;
}
-static int vgic_init_cpu_dying(unsigned int cpu)
+void kvm_vgic_cpu_down(void)
{
disable_percpu_irq(kvm_vgic_global_state.maint_irq);
- return 0;
}
static irqreturn_t vgic_maintenance_handler(int irq, void *data)
{
+ struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)data;
+
/*
* We cannot rely on the vgic maintenance interrupt to be
* delivered synchronously. This means we can only use it to
* exit the VM, and we perform the handling of EOIed
* interrupts on the exit path (see vgic_fold_lr_state).
+ *
+ * Of course, NV throws a wrench in this plan, and needs
+ * something special.
*/
+ if (vcpu && vgic_state_is_nested(vcpu))
+ vgic_v3_handle_nested_maint_irq(vcpu);
+
return IRQ_HANDLED;
}
@@ -512,10 +672,12 @@ void kvm_vgic_init_cpu_hardware(void)
* We want to make sure the list registers start out clear so that we
* only have the program the used registers.
*/
- if (kvm_vgic_global_state.type == VGIC_V2)
+ if (kvm_vgic_global_state.type == VGIC_V2) {
vgic_v2_init_lrs();
- else
+ } else if (kvm_vgic_global_state.type == VGIC_V3 ||
+ kvm_vgic_global_state.has_gcie_v3_compat) {
kvm_call_hyp(__vgic_v3_init_lrs);
+ }
}
/**
@@ -560,6 +722,9 @@ int kvm_vgic_hyp_init(void)
kvm_info("GIC system register CPU interface enabled\n");
}
break;
+ case GIC_V5:
+ ret = vgic_v5_probe(gic_kvm_info);
+ break;
default:
ret = -ENODEV;
}
@@ -572,7 +737,7 @@ int kvm_vgic_hyp_init(void)
if (ret)
return ret;
- if (!has_mask)
+ if (!has_mask && !kvm_vgic_global_state.maint_irq)
return 0;
ret = request_percpu_irq(kvm_vgic_global_state.maint_irq,
@@ -584,19 +749,6 @@ int kvm_vgic_hyp_init(void)
return ret;
}
- ret = cpuhp_setup_state(CPUHP_AP_KVM_ARM_VGIC_INIT_STARTING,
- "kvm/arm/vgic:starting",
- vgic_init_cpu_starting, vgic_init_cpu_dying);
- if (ret) {
- kvm_err("Cannot register vgic CPU notifier\n");
- goto out_free_irq;
- }
-
kvm_info("vgic interrupt IRQ%d\n", kvm_vgic_global_state.maint_irq);
return 0;
-
-out_free_irq:
- free_percpu_irq(kvm_vgic_global_state.maint_irq,
- kvm_get_running_vcpus());
- return ret;
}
diff --git a/arch/arm64/kvm/vgic/vgic-irqfd.c b/arch/arm64/kvm/vgic/vgic-irqfd.c
index 475059bacedf..c314c016659a 100644
--- a/arch/arm64/kvm/vgic/vgic-irqfd.c
+++ b/arch/arm64/kvm/vgic/vgic-irqfd.c
@@ -9,7 +9,7 @@
#include <kvm/arm_vgic.h>
#include "vgic.h"
-/**
+/*
* vgic_irqfd_set_irq: inject the IRQ corresponding to the
* irqchip routing entry
*
@@ -23,7 +23,7 @@ static int vgic_irqfd_set_irq(struct kvm_kernel_irq_routing_entry *e,
if (!vgic_valid_spi(kvm, spi_id))
return -EINVAL;
- return kvm_vgic_inject_irq(kvm, 0, spi_id, level, NULL);
+ return kvm_vgic_inject_irq(kvm, NULL, spi_id, level, NULL);
}
/**
@@ -75,7 +75,8 @@ static void kvm_populate_msi(struct kvm_kernel_irq_routing_entry *e,
msi->flags = e->msi.flags;
msi->devid = e->msi.devid;
}
-/**
+
+/*
* kvm_set_msi: inject the MSI corresponding to the
* MSI routing entry
*
@@ -98,7 +99,7 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
return vgic_its_inject_msi(kvm, &msi);
}
-/**
+/*
* kvm_arch_set_irq_inatomic: fast-path for irqfd injection
*/
int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c
index 94a666dd1443..3f1c4b10fed9 100644
--- a/arch/arm64/kvm/vgic/vgic-its.c
+++ b/arch/arm64/kvm/vgic/vgic-its.c
@@ -23,12 +23,49 @@
#include "vgic.h"
#include "vgic-mmio.h"
+static struct kvm_device_ops kvm_arm_vgic_its_ops;
+
static int vgic_its_save_tables_v0(struct vgic_its *its);
static int vgic_its_restore_tables_v0(struct vgic_its *its);
static int vgic_its_commit_v0(struct vgic_its *its);
static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq,
struct kvm_vcpu *filter_vcpu, bool needs_inv);
+#define vgic_its_read_entry_lock(i, g, valp, t) \
+ ({ \
+ int __sz = vgic_its_get_abi(i)->t##_esz; \
+ struct kvm *__k = (i)->dev->kvm; \
+ int __ret; \
+ \
+ BUILD_BUG_ON(NR_ITS_ABIS == 1 && \
+ sizeof(*(valp)) != ABI_0_ESZ); \
+ if (NR_ITS_ABIS > 1 && \
+ KVM_BUG_ON(__sz != sizeof(*(valp)), __k)) \
+ __ret = -EINVAL; \
+ else \
+ __ret = kvm_read_guest_lock(__k, (g), \
+ valp, __sz); \
+ __ret; \
+ })
+
+#define vgic_its_write_entry_lock(i, g, val, t) \
+ ({ \
+ int __sz = vgic_its_get_abi(i)->t##_esz; \
+ struct kvm *__k = (i)->dev->kvm; \
+ typeof(val) __v = (val); \
+ int __ret; \
+ \
+ BUILD_BUG_ON(NR_ITS_ABIS == 1 && \
+ sizeof(__v) != ABI_0_ESZ); \
+ if (NR_ITS_ABIS > 1 && \
+ KVM_BUG_ON(__sz != sizeof(__v), __k)) \
+ __ret = -EINVAL; \
+ else \
+ __ret = vgic_write_guest_lock(__k, (g), \
+ &__v, __sz); \
+ __ret; \
+ })
+
/*
* Creates a new (reference to a) struct vgic_irq for a given LPI.
* If this LPI is already mapped on another ITS, we increase its refcount
@@ -40,7 +77,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid,
struct kvm_vcpu *vcpu)
{
struct vgic_dist *dist = &kvm->arch.vgic;
- struct vgic_irq *irq = vgic_get_irq(kvm, NULL, intid), *oldirq;
+ struct vgic_irq *irq = vgic_get_irq(kvm, intid), *oldirq;
unsigned long flags;
int ret;
@@ -52,45 +89,44 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid,
if (!irq)
return ERR_PTR(-ENOMEM);
- INIT_LIST_HEAD(&irq->lpi_list);
+ ret = xa_reserve_irq(&dist->lpi_xa, intid, GFP_KERNEL_ACCOUNT);
+ if (ret) {
+ kfree(irq);
+ return ERR_PTR(ret);
+ }
+
INIT_LIST_HEAD(&irq->ap_list);
raw_spin_lock_init(&irq->irq_lock);
irq->config = VGIC_CONFIG_EDGE;
- kref_init(&irq->refcount);
+ refcount_set(&irq->refcount, 1);
irq->intid = intid;
irq->target_vcpu = vcpu;
irq->group = 1;
- raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
+ xa_lock_irqsave(&dist->lpi_xa, flags);
/*
* There could be a race with another vgic_add_lpi(), so we need to
* check that we don't add a second list entry with the same LPI.
*/
- list_for_each_entry(oldirq, &dist->lpi_list_head, lpi_list) {
- if (oldirq->intid != intid)
- continue;
-
+ oldirq = xa_load(&dist->lpi_xa, intid);
+ if (vgic_try_get_irq_ref(oldirq)) {
/* Someone was faster with adding this LPI, lets use that. */
kfree(irq);
irq = oldirq;
-
- /*
- * This increases the refcount, the caller is expected to
- * call vgic_put_irq() on the returned pointer once it's
- * finished with the IRQ.
- */
- vgic_get_irq_kref(irq);
-
- goto out_unlock;
+ } else {
+ ret = xa_err(__xa_store(&dist->lpi_xa, intid, irq, 0));
}
- list_add_tail(&irq->lpi_list, &dist->lpi_list_head);
- dist->lpi_list_count++;
+ xa_unlock_irqrestore(&dist->lpi_xa, flags);
+
+ if (ret) {
+ xa_release(&dist->lpi_xa, intid);
+ kfree(irq);
-out_unlock:
- raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
+ return ERR_PTR(ret);
+ }
/*
* We "cache" the configuration table entries in our struct vgic_irq's.
@@ -115,50 +151,12 @@ out_unlock:
return irq;
}
-struct its_device {
- struct list_head dev_list;
-
- /* the head for the list of ITTEs */
- struct list_head itt_head;
- u32 num_eventid_bits;
- gpa_t itt_addr;
- u32 device_id;
-};
-
-#define COLLECTION_NOT_MAPPED ((u32)~0)
-
-struct its_collection {
- struct list_head coll_list;
-
- u32 collection_id;
- u32 target_addr;
-};
-
-#define its_is_collection_mapped(coll) ((coll) && \
- ((coll)->target_addr != COLLECTION_NOT_MAPPED))
-
-struct its_ite {
- struct list_head ite_list;
-
- struct vgic_irq *irq;
- struct its_collection *collection;
- u32 event_id;
-};
-
-struct vgic_translation_cache_entry {
- struct list_head entry;
- phys_addr_t db;
- u32 devid;
- u32 eventid;
- struct vgic_irq *irq;
-};
-
/**
* struct vgic_its_abi - ITS abi ops and settings
* @cte_esz: collection table entry size
* @dte_esz: device table entry size
* @ite_esz: interrupt translation table entry size
- * @save tables: save the ITS tables into guest RAM
+ * @save_tables: save the ITS tables into guest RAM
* @restore_tables: restore the ITS internal structs from tables
* stored in guest RAM
* @commit: initialize the registers which expose the ABI settings,
@@ -247,8 +245,10 @@ static struct its_ite *find_ite(struct vgic_its *its, u32 device_id,
#define GIC_LPI_OFFSET 8192
-#define VITS_TYPER_IDBITS 16
-#define VITS_TYPER_DEVBITS 16
+#define VITS_TYPER_IDBITS 16
+#define VITS_MAX_EVENTID (BIT(VITS_TYPER_IDBITS) - 1)
+#define VITS_TYPER_DEVBITS 16
+#define VITS_MAX_DEVID (BIT(VITS_TYPER_DEVBITS) - 1)
#define VITS_DTE_MAX_DEVID_OFFSET (BIT(14) - 1)
#define VITS_ITE_MAX_EVENTID_OFFSET (BIT(16) - 1)
@@ -303,79 +303,40 @@ static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq,
}
}
- raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-
if (irq->hw)
- return its_prop_update_vlpi(irq->host_irq, prop, needs_inv);
-
- return 0;
-}
+ ret = its_prop_update_vlpi(irq->host_irq, prop, needs_inv);
-/*
- * Create a snapshot of the current LPIs targeting @vcpu, so that we can
- * enumerate those LPIs without holding any lock.
- * Returns their number and puts the kmalloc'ed array into intid_ptr.
- */
-int vgic_copy_lpi_list(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 **intid_ptr)
-{
- struct vgic_dist *dist = &kvm->arch.vgic;
- struct vgic_irq *irq;
- unsigned long flags;
- u32 *intids;
- int irq_count, i = 0;
-
- /*
- * There is an obvious race between allocating the array and LPIs
- * being mapped/unmapped. If we ended up here as a result of a
- * command, we're safe (locks are held, preventing another
- * command). If coming from another path (such as enabling LPIs),
- * we must be careful not to overrun the array.
- */
- irq_count = READ_ONCE(dist->lpi_list_count);
- intids = kmalloc_array(irq_count, sizeof(intids[0]), GFP_KERNEL_ACCOUNT);
- if (!intids)
- return -ENOMEM;
-
- raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
- list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
- if (i == irq_count)
- break;
- /* We don't need to "get" the IRQ, as we hold the list lock. */
- if (vcpu && irq->target_vcpu != vcpu)
- continue;
- intids[i++] = irq->intid;
- }
- raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
-
- *intid_ptr = intids;
- return i;
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+ return ret;
}
static int update_affinity(struct vgic_irq *irq, struct kvm_vcpu *vcpu)
{
- int ret = 0;
- unsigned long flags;
+ struct its_vlpi_map map;
+ int ret;
- raw_spin_lock_irqsave(&irq->irq_lock, flags);
+ guard(raw_spinlock_irqsave)(&irq->irq_lock);
irq->target_vcpu = vcpu;
- raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
- if (irq->hw) {
- struct its_vlpi_map map;
+ if (!irq->hw)
+ return 0;
- ret = its_get_vlpi(irq->host_irq, &map);
- if (ret)
- return ret;
+ ret = its_get_vlpi(irq->host_irq, &map);
+ if (ret)
+ return ret;
- if (map.vpe)
- atomic_dec(&map.vpe->vlpi_count);
- map.vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
- atomic_inc(&map.vpe->vlpi_count);
+ if (map.vpe)
+ atomic_dec(&map.vpe->vlpi_count);
- ret = its_map_vlpi(irq->host_irq, &map);
- }
+ map.vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
+ atomic_inc(&map.vpe->vlpi_count);
+ return its_map_vlpi(irq->host_irq, &map);
+}
- return ret;
+static struct kvm_vcpu *collection_to_vcpu(struct kvm *kvm,
+ struct its_collection *col)
+{
+ return kvm_get_vcpu_by_id(kvm, col->target_addr);
}
/*
@@ -391,7 +352,7 @@ static void update_affinity_ite(struct kvm *kvm, struct its_ite *ite)
if (!its_is_collection_mapped(ite->collection))
return;
- vcpu = kvm_get_vcpu(kvm, ite->collection->target_addr);
+ vcpu = collection_to_vcpu(kvm, ite->collection);
update_affinity(ite->irq, vcpu);
}
@@ -428,23 +389,18 @@ static u32 max_lpis_propbaser(u64 propbaser)
static int its_sync_lpi_pending_table(struct kvm_vcpu *vcpu)
{
gpa_t pendbase = GICR_PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser);
+ struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+ unsigned long intid, flags;
struct vgic_irq *irq;
int last_byte_offset = -1;
int ret = 0;
- u32 *intids;
- int nr_irqs, i;
- unsigned long flags;
u8 pendmask;
- nr_irqs = vgic_copy_lpi_list(vcpu->kvm, vcpu, &intids);
- if (nr_irqs < 0)
- return nr_irqs;
-
- for (i = 0; i < nr_irqs; i++) {
+ xa_for_each(&dist->lpi_xa, intid, irq) {
int byte_offset, bit_nr;
- byte_offset = intids[i] / BITS_PER_BYTE;
- bit_nr = intids[i] % BITS_PER_BYTE;
+ byte_offset = intid / BITS_PER_BYTE;
+ bit_nr = intid % BITS_PER_BYTE;
/*
* For contiguously allocated LPIs chances are we just read
@@ -454,22 +410,23 @@ static int its_sync_lpi_pending_table(struct kvm_vcpu *vcpu)
ret = kvm_read_guest_lock(vcpu->kvm,
pendbase + byte_offset,
&pendmask, 1);
- if (ret) {
- kfree(intids);
+ if (ret)
return ret;
- }
+
last_byte_offset = byte_offset;
}
- irq = vgic_get_irq(vcpu->kvm, NULL, intids[i]);
+ irq = vgic_get_irq(vcpu->kvm, intid);
+ if (!irq)
+ continue;
+
raw_spin_lock_irqsave(&irq->irq_lock, flags);
- irq->pending_latch = pendmask & (1U << bit_nr);
+ if (irq->target_vcpu == vcpu)
+ irq->pending_latch = pendmask & (1U << bit_nr);
vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
vgic_put_irq(vcpu->kvm, irq);
}
- kfree(intids);
-
return ret;
}
@@ -545,47 +502,52 @@ static unsigned long vgic_mmio_read_its_idregs(struct kvm *kvm,
return 0;
}
-static struct vgic_irq *__vgic_its_check_cache(struct vgic_dist *dist,
- phys_addr_t db,
- u32 devid, u32 eventid)
+static struct vgic_its *__vgic_doorbell_to_its(struct kvm *kvm, gpa_t db)
{
- struct vgic_translation_cache_entry *cte;
+ struct kvm_io_device *kvm_io_dev;
+ struct vgic_io_device *iodev;
- list_for_each_entry(cte, &dist->lpi_translation_cache, entry) {
- /*
- * If we hit a NULL entry, there is nothing after this
- * point.
- */
- if (!cte->irq)
- break;
+ kvm_io_dev = kvm_io_bus_get_dev(kvm, KVM_MMIO_BUS, db);
+ if (!kvm_io_dev)
+ return ERR_PTR(-EINVAL);
- if (cte->db != db || cte->devid != devid ||
- cte->eventid != eventid)
- continue;
+ if (kvm_io_dev->ops != &kvm_io_gic_ops)
+ return ERR_PTR(-EINVAL);
- /*
- * Move this entry to the head, as it is the most
- * recently used.
- */
- if (!list_is_first(&cte->entry, &dist->lpi_translation_cache))
- list_move(&cte->entry, &dist->lpi_translation_cache);
+ iodev = container_of(kvm_io_dev, struct vgic_io_device, dev);
+ if (iodev->iodev_type != IODEV_ITS)
+ return ERR_PTR(-EINVAL);
- return cte->irq;
- }
+ return iodev->its;
+}
+
+static unsigned long vgic_its_cache_key(u32 devid, u32 eventid)
+{
+ return (((unsigned long)devid) << VITS_TYPER_IDBITS) | eventid;
- return NULL;
}
static struct vgic_irq *vgic_its_check_cache(struct kvm *kvm, phys_addr_t db,
u32 devid, u32 eventid)
{
- struct vgic_dist *dist = &kvm->arch.vgic;
+ unsigned long cache_key = vgic_its_cache_key(devid, eventid);
+ struct vgic_its *its;
struct vgic_irq *irq;
- unsigned long flags;
- raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
- irq = __vgic_its_check_cache(dist, db, devid, eventid);
- raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
+ if (devid > VITS_MAX_DEVID || eventid > VITS_MAX_EVENTID)
+ return NULL;
+
+ its = __vgic_doorbell_to_its(kvm, db);
+ if (IS_ERR(its))
+ return NULL;
+
+ rcu_read_lock();
+
+ irq = xa_load(&its->translation_cache, cache_key);
+ if (!vgic_try_get_irq_ref(irq))
+ irq = NULL;
+
+ rcu_read_unlock();
return irq;
}
@@ -594,76 +556,68 @@ static void vgic_its_cache_translation(struct kvm *kvm, struct vgic_its *its,
u32 devid, u32 eventid,
struct vgic_irq *irq)
{
- struct vgic_dist *dist = &kvm->arch.vgic;
- struct vgic_translation_cache_entry *cte;
- unsigned long flags;
- phys_addr_t db;
+ unsigned long cache_key = vgic_its_cache_key(devid, eventid);
+ struct vgic_irq *old;
/* Do not cache a directly injected interrupt */
if (irq->hw)
return;
- raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
-
- if (unlikely(list_empty(&dist->lpi_translation_cache)))
- goto out;
-
/*
- * We could have raced with another CPU caching the same
- * translation behind our back, so let's check it is not in
- * already
+ * The irq refcount is guaranteed to be nonzero while holding the
+ * its_lock, as the ITE (and the reference it holds) cannot be freed.
*/
- db = its->vgic_its_base + GITS_TRANSLATER;
- if (__vgic_its_check_cache(dist, db, devid, eventid))
- goto out;
+ lockdep_assert_held(&its->its_lock);
+ vgic_get_irq_ref(irq);
- /* Always reuse the last entry (LRU policy) */
- cte = list_last_entry(&dist->lpi_translation_cache,
- typeof(*cte), entry);
+ old = xa_store(&its->translation_cache, cache_key, irq, GFP_KERNEL_ACCOUNT);
/*
- * Caching the translation implies having an extra reference
- * to the interrupt, so drop the potential reference on what
- * was in the cache, and increment it on the new interrupt.
+ * Put the reference taken on @irq if the store fails. Intentionally do
+ * not return the error as the translation cache is best effort.
*/
- if (cte->irq)
- __vgic_put_lpi_locked(kvm, cte->irq);
-
- vgic_get_irq_kref(irq);
+ if (xa_is_err(old)) {
+ vgic_put_irq(kvm, irq);
+ return;
+ }
- cte->db = db;
- cte->devid = devid;
- cte->eventid = eventid;
- cte->irq = irq;
+ /*
+ * We could have raced with another CPU caching the same
+ * translation behind our back, ensure we don't leak a
+ * reference if that is the case.
+ */
+ if (old)
+ vgic_put_irq(kvm, old);
+}
- /* Move the new translation to the head of the list */
- list_move(&cte->entry, &dist->lpi_translation_cache);
+static void vgic_its_invalidate_cache(struct vgic_its *its)
+{
+ struct kvm *kvm = its->dev->kvm;
+ struct vgic_irq *irq;
+ unsigned long idx;
-out:
- raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
+ xa_for_each(&its->translation_cache, idx, irq) {
+ xa_erase(&its->translation_cache, idx);
+ vgic_put_irq(kvm, irq);
+ }
}
-void vgic_its_invalidate_cache(struct kvm *kvm)
+void vgic_its_invalidate_all_caches(struct kvm *kvm)
{
- struct vgic_dist *dist = &kvm->arch.vgic;
- struct vgic_translation_cache_entry *cte;
- unsigned long flags;
+ struct kvm_device *dev;
+ struct vgic_its *its;
- raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
+ rcu_read_lock();
- list_for_each_entry(cte, &dist->lpi_translation_cache, entry) {
- /*
- * If we hit a NULL entry, there is nothing after this
- * point.
- */
- if (!cte->irq)
- break;
+ list_for_each_entry_rcu(dev, &kvm->devices, vm_node) {
+ if (dev->ops != &kvm_arm_vgic_its_ops)
+ continue;
- __vgic_put_lpi_locked(kvm, cte->irq);
- cte->irq = NULL;
+ its = dev->private;
+ vgic_its_invalidate_cache(its);
}
- raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
+ rcu_read_unlock();
}
int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its,
@@ -679,7 +633,7 @@ int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its,
if (!ite || !its_is_collection_mapped(ite->collection))
return E_ITS_INT_UNMAPPED_INTERRUPT;
- vcpu = kvm_get_vcpu(kvm, ite->collection->target_addr);
+ vcpu = collection_to_vcpu(kvm, ite->collection);
if (!vcpu)
return E_ITS_INT_UNMAPPED_INTERRUPT;
@@ -695,8 +649,6 @@ int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its,
struct vgic_its *vgic_msi_to_its(struct kvm *kvm, struct kvm_msi *msi)
{
u64 address;
- struct kvm_io_device *kvm_io_dev;
- struct vgic_io_device *iodev;
if (!vgic_has_its(kvm))
return ERR_PTR(-ENODEV);
@@ -706,18 +658,7 @@ struct vgic_its *vgic_msi_to_its(struct kvm *kvm, struct kvm_msi *msi)
address = (u64)msi->address_hi << 32 | msi->address_lo;
- kvm_io_dev = kvm_io_bus_get_dev(kvm, KVM_MMIO_BUS, address);
- if (!kvm_io_dev)
- return ERR_PTR(-EINVAL);
-
- if (kvm_io_dev->ops != &kvm_io_gic_ops)
- return ERR_PTR(-EINVAL);
-
- iodev = container_of(kvm_io_dev, struct vgic_io_device, dev);
- if (iodev->iodev_type != IODEV_ITS)
- return ERR_PTR(-EINVAL);
-
- return iodev->its;
+ return __vgic_doorbell_to_its(kvm, address);
}
/*
@@ -763,6 +704,7 @@ int vgic_its_inject_cached_translation(struct kvm *kvm, struct kvm_msi *msi)
raw_spin_lock_irqsave(&irq->irq_lock, flags);
irq->pending_latch = true;
vgic_queue_irq_unlock(kvm, irq, flags);
+ vgic_put_irq(kvm, irq);
return 0;
}
@@ -806,12 +748,17 @@ int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi)
/* Requires the its_lock to be held. */
static void its_free_ite(struct kvm *kvm, struct its_ite *ite)
{
+ struct vgic_irq *irq = ite->irq;
list_del(&ite->ite_list);
/* This put matches the get in vgic_add_lpi. */
- if (ite->irq) {
- if (ite->irq->hw)
- WARN_ON(its_unmap_vlpi(ite->irq->host_irq));
+ if (irq) {
+ scoped_guard(raw_spinlock_irqsave, &irq->irq_lock) {
+ if (irq->hw)
+ its_unmap_vlpi(ite->irq->host_irq);
+
+ irq->hw = false;
+ }
vgic_put_irq(kvm, ite->irq);
}
@@ -847,15 +794,19 @@ static int vgic_its_cmd_handle_discard(struct kvm *kvm, struct vgic_its *its,
ite = find_ite(its, device_id, event_id);
if (ite && its_is_collection_mapped(ite->collection)) {
+ struct its_device *device = find_its_device(its, device_id);
+ int ite_esz = vgic_its_get_abi(its)->ite_esz;
+ gpa_t gpa = device->itt_addr + ite->event_id * ite_esz;
/*
* Though the spec talks about removing the pending state, we
* don't bother here since we clear the ITTE anyway and the
* pending state is a property of the ITTE struct.
*/
- vgic_its_invalidate_cache(kvm);
+ vgic_its_invalidate_cache(its);
its_free_ite(kvm, ite);
- return 0;
+
+ return vgic_its_write_entry_lock(its, gpa, 0ULL, ite);
}
return E_ITS_DISCARD_UNMAPPED_INTERRUPT;
@@ -887,9 +838,9 @@ static int vgic_its_cmd_handle_movi(struct kvm *kvm, struct vgic_its *its,
return E_ITS_MOVI_UNMAPPED_COLLECTION;
ite->collection = collection;
- vcpu = kvm_get_vcpu(kvm, collection->target_addr);
+ vcpu = collection_to_vcpu(kvm, collection);
- vgic_its_invalidate_cache(kvm);
+ vgic_its_invalidate_cache(its);
return update_affinity(ite->irq, vcpu);
}
@@ -924,7 +875,7 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id,
switch (type) {
case GITS_BASER_TYPE_DEVICE:
- if (id >= BIT_ULL(VITS_TYPER_DEVBITS))
+ if (id > VITS_MAX_DEVID)
return false;
break;
case GITS_BASER_TYPE_COLLECTION:
@@ -1121,7 +1072,7 @@ static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its,
}
if (its_is_collection_mapped(collection))
- vcpu = kvm_get_vcpu(kvm, collection->target_addr);
+ vcpu = collection_to_vcpu(kvm, collection);
irq = vgic_add_lpi(kvm, lpi_nr, vcpu);
if (IS_ERR(irq)) {
@@ -1136,7 +1087,8 @@ static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its,
}
/* Requires the its_lock to be held. */
-static void vgic_its_free_device(struct kvm *kvm, struct its_device *device)
+static void vgic_its_free_device(struct kvm *kvm, struct vgic_its *its,
+ struct its_device *device)
{
struct its_ite *ite, *temp;
@@ -1148,7 +1100,7 @@ static void vgic_its_free_device(struct kvm *kvm, struct its_device *device)
list_for_each_entry_safe(ite, temp, &device->itt_head, ite_list)
its_free_ite(kvm, ite);
- vgic_its_invalidate_cache(kvm);
+ vgic_its_invalidate_cache(its);
list_del(&device->dev_list);
kfree(device);
@@ -1160,7 +1112,7 @@ static void vgic_its_free_device_list(struct kvm *kvm, struct vgic_its *its)
struct its_device *cur, *temp;
list_for_each_entry_safe(cur, temp, &its->device_list, dev_list)
- vgic_its_free_device(kvm, cur);
+ vgic_its_free_device(kvm, its, cur);
}
/* its lock must be held */
@@ -1204,8 +1156,9 @@ static int vgic_its_cmd_handle_mapd(struct kvm *kvm, struct vgic_its *its,
u8 num_eventid_bits = its_cmd_get_size(its_cmd);
gpa_t itt_addr = its_cmd_get_ittaddr(its_cmd);
struct its_device *device;
+ gpa_t gpa;
- if (!vgic_its_check_id(its, its->baser_device_table, device_id, NULL))
+ if (!vgic_its_check_id(its, its->baser_device_table, device_id, &gpa))
return E_ITS_MAPD_DEVICE_OOR;
if (valid && num_eventid_bits > VITS_TYPER_IDBITS)
@@ -1219,14 +1172,14 @@ static int vgic_its_cmd_handle_mapd(struct kvm *kvm, struct vgic_its *its,
* by removing the mapping and re-establishing it.
*/
if (device)
- vgic_its_free_device(kvm, device);
+ vgic_its_free_device(kvm, its, device);
/*
* The spec does not say whether unmapping a not-mapped device
* is an error, so we are done in any case.
*/
if (!valid)
- return 0;
+ return vgic_its_write_entry_lock(its, gpa, 0ULL, dte);
device = vgic_its_alloc_device(its, device_id, itt_addr,
num_eventid_bits);
@@ -1242,21 +1195,22 @@ static int vgic_its_cmd_handle_mapc(struct kvm *kvm, struct vgic_its *its,
u64 *its_cmd)
{
u16 coll_id;
- u32 target_addr;
struct its_collection *collection;
bool valid;
valid = its_cmd_get_validbit(its_cmd);
coll_id = its_cmd_get_collection(its_cmd);
- target_addr = its_cmd_get_target_addr(its_cmd);
-
- if (target_addr >= atomic_read(&kvm->online_vcpus))
- return E_ITS_MAPC_PROCNUM_OOR;
if (!valid) {
vgic_its_free_collection(its, coll_id);
- vgic_its_invalidate_cache(kvm);
+ vgic_its_invalidate_cache(its);
} else {
+ struct kvm_vcpu *vcpu;
+
+ vcpu = kvm_get_vcpu_by_id(kvm, its_cmd_get_target_addr(its_cmd));
+ if (!vcpu)
+ return E_ITS_MAPC_PROCNUM_OOR;
+
collection = find_collection(its, coll_id);
if (!collection) {
@@ -1270,9 +1224,9 @@ static int vgic_its_cmd_handle_mapc(struct kvm *kvm, struct vgic_its *its,
coll_id);
if (ret)
return ret;
- collection->target_addr = target_addr;
+ collection->target_addr = vcpu->vcpu_id;
} else {
- collection->target_addr = target_addr;
+ collection->target_addr = vcpu->vcpu_id;
update_affinity_collection(kvm, its, collection);
}
}
@@ -1330,8 +1284,8 @@ static int vgic_its_cmd_handle_inv(struct kvm *kvm, struct vgic_its *its,
}
/**
- * vgic_its_invall - invalidate all LPIs targetting a given vcpu
- * @vcpu: the vcpu for which the RD is targetted by an invalidation
+ * vgic_its_invall - invalidate all LPIs targeting a given vcpu
+ * @vcpu: the vcpu for which the RD is targeted by an invalidation
*
* Contrary to the INVALL command, this targets a RD instead of a
* collection, and we don't need to hold the its_lock, since no ITS is
@@ -1340,23 +1294,19 @@ static int vgic_its_cmd_handle_inv(struct kvm *kvm, struct vgic_its *its,
int vgic_its_invall(struct kvm_vcpu *vcpu)
{
struct kvm *kvm = vcpu->kvm;
- int irq_count, i = 0;
- u32 *intids;
-
- irq_count = vgic_copy_lpi_list(kvm, vcpu, &intids);
- if (irq_count < 0)
- return irq_count;
+ struct vgic_dist *dist = &kvm->arch.vgic;
+ struct vgic_irq *irq;
+ unsigned long intid;
- for (i = 0; i < irq_count; i++) {
- struct vgic_irq *irq = vgic_get_irq(kvm, NULL, intids[i]);
+ xa_for_each(&dist->lpi_xa, intid, irq) {
+ irq = vgic_get_irq(kvm, intid);
if (!irq)
continue;
+
update_lpi_config(kvm, irq, vcpu, false);
vgic_put_irq(kvm, irq);
}
- kfree(intids);
-
if (vcpu->arch.vgic_cpu.vgic_v3.its_vpe.its_vm)
its_invall_vpe(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe);
@@ -1382,7 +1332,7 @@ static int vgic_its_cmd_handle_invall(struct kvm *kvm, struct vgic_its *its,
if (!its_is_collection_mapped(collection))
return E_ITS_INVALL_UNMAPPED_COLLECTION;
- vcpu = kvm_get_vcpu(kvm, collection->target_addr);
+ vcpu = collection_to_vcpu(kvm, collection);
vgic_its_invall(vcpu);
return 0;
@@ -1399,38 +1349,33 @@ static int vgic_its_cmd_handle_invall(struct kvm *kvm, struct vgic_its *its,
static int vgic_its_cmd_handle_movall(struct kvm *kvm, struct vgic_its *its,
u64 *its_cmd)
{
- u32 target1_addr = its_cmd_get_target_addr(its_cmd);
- u32 target2_addr = its_cmd_mask_field(its_cmd, 3, 16, 32);
+ struct vgic_dist *dist = &kvm->arch.vgic;
struct kvm_vcpu *vcpu1, *vcpu2;
struct vgic_irq *irq;
- u32 *intids;
- int irq_count, i;
+ unsigned long intid;
+
+ /* We advertise GITS_TYPER.PTA==0, making the address the vcpu ID */
+ vcpu1 = kvm_get_vcpu_by_id(kvm, its_cmd_get_target_addr(its_cmd));
+ vcpu2 = kvm_get_vcpu_by_id(kvm, its_cmd_mask_field(its_cmd, 3, 16, 32));
- if (target1_addr >= atomic_read(&kvm->online_vcpus) ||
- target2_addr >= atomic_read(&kvm->online_vcpus))
+ if (!vcpu1 || !vcpu2)
return E_ITS_MOVALL_PROCNUM_OOR;
- if (target1_addr == target2_addr)
+ if (vcpu1 == vcpu2)
return 0;
- vcpu1 = kvm_get_vcpu(kvm, target1_addr);
- vcpu2 = kvm_get_vcpu(kvm, target2_addr);
-
- irq_count = vgic_copy_lpi_list(kvm, vcpu1, &intids);
- if (irq_count < 0)
- return irq_count;
-
- for (i = 0; i < irq_count; i++) {
- irq = vgic_get_irq(kvm, NULL, intids[i]);
+ xa_for_each(&dist->lpi_xa, intid, irq) {
+ irq = vgic_get_irq(kvm, intid);
+ if (!irq)
+ continue;
update_affinity(irq, vcpu2);
vgic_put_irq(kvm, irq);
}
- vgic_its_invalidate_cache(kvm);
+ vgic_its_invalidate_cache(its);
- kfree(intids);
return 0;
}
@@ -1781,7 +1726,7 @@ static void vgic_mmio_write_its_ctlr(struct kvm *kvm, struct vgic_its *its,
its->enabled = !!(val & GITS_CTLR_ENABLE);
if (!its->enabled)
- vgic_its_invalidate_cache(kvm);
+ vgic_its_invalidate_cache(its);
/*
* Try to process any pending commands. This function bails out early
@@ -1882,47 +1827,6 @@ out:
return ret;
}
-/* Default is 16 cached LPIs per vcpu */
-#define LPI_DEFAULT_PCPU_CACHE_SIZE 16
-
-void vgic_lpi_translation_cache_init(struct kvm *kvm)
-{
- struct vgic_dist *dist = &kvm->arch.vgic;
- unsigned int sz;
- int i;
-
- if (!list_empty(&dist->lpi_translation_cache))
- return;
-
- sz = atomic_read(&kvm->online_vcpus) * LPI_DEFAULT_PCPU_CACHE_SIZE;
-
- for (i = 0; i < sz; i++) {
- struct vgic_translation_cache_entry *cte;
-
- /* An allocation failure is not fatal */
- cte = kzalloc(sizeof(*cte), GFP_KERNEL_ACCOUNT);
- if (WARN_ON(!cte))
- break;
-
- INIT_LIST_HEAD(&cte->entry);
- list_add(&cte->entry, &dist->lpi_translation_cache);
- }
-}
-
-void vgic_lpi_translation_cache_destroy(struct kvm *kvm)
-{
- struct vgic_dist *dist = &kvm->arch.vgic;
- struct vgic_translation_cache_entry *cte, *tmp;
-
- vgic_its_invalidate_cache(kvm);
-
- list_for_each_entry_safe(cte, tmp,
- &dist->lpi_translation_cache, entry) {
- list_del(&cte->entry);
- kfree(cte);
- }
-}
-
#define INITIAL_BASER_VALUE \
(GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWb) | \
GIC_BASER_CACHEABILITY(GITS_BASER, OUTER, SameAsInner) | \
@@ -1936,6 +1840,7 @@ void vgic_lpi_translation_cache_destroy(struct kvm *kvm)
static int vgic_its_create(struct kvm_device *dev, u32 type)
{
+ int ret;
struct vgic_its *its;
if (type != KVM_DEV_TYPE_ARM_VGIC_ITS)
@@ -1945,23 +1850,33 @@ static int vgic_its_create(struct kvm_device *dev, u32 type)
if (!its)
return -ENOMEM;
+ mutex_lock(&dev->kvm->arch.config_lock);
+
if (vgic_initialized(dev->kvm)) {
- int ret = vgic_v4_init(dev->kvm);
+ ret = vgic_v4_init(dev->kvm);
if (ret < 0) {
+ mutex_unlock(&dev->kvm->arch.config_lock);
kfree(its);
return ret;
}
-
- vgic_lpi_translation_cache_init(dev->kvm);
}
mutex_init(&its->its_lock);
mutex_init(&its->cmd_lock);
+ /* Yep, even more trickery for lock ordering... */
+#ifdef CONFIG_LOCKDEP
+ mutex_lock(&its->cmd_lock);
+ mutex_lock(&its->its_lock);
+ mutex_unlock(&its->its_lock);
+ mutex_unlock(&its->cmd_lock);
+#endif
+
its->vgic_its_base = VGIC_ADDR_UNDEF;
INIT_LIST_HEAD(&its->device_list);
INIT_LIST_HEAD(&its->collection_list);
+ xa_init(&its->translation_cache);
dev->kvm->arch.vgic.msis_require_devid = true;
dev->kvm->arch.vgic.has_its = true;
@@ -1976,7 +1891,11 @@ static int vgic_its_create(struct kvm_device *dev, u32 type)
dev->private = its;
- return vgic_its_set_abi(its, NR_ITS_ABIS - 1);
+ ret = vgic_its_set_abi(its, NR_ITS_ABIS - 1);
+
+ mutex_unlock(&dev->kvm->arch.config_lock);
+
+ return ret;
}
static void vgic_its_destroy(struct kvm_device *kvm_dev)
@@ -1986,8 +1905,12 @@ static void vgic_its_destroy(struct kvm_device *kvm_dev)
mutex_lock(&its->its_lock);
+ vgic_its_debug_destroy(kvm_dev);
+
vgic_its_free_device_list(kvm, its);
vgic_its_free_collection_list(kvm, its);
+ vgic_its_invalidate_cache(its);
+ xa_destroy(&its->translation_cache);
mutex_unlock(&its->its_lock);
kfree(its);
@@ -2045,6 +1968,13 @@ static int vgic_its_attr_regs_access(struct kvm_device *dev,
mutex_lock(&dev->kvm->lock);
+ if (kvm_trylock_all_vcpus(dev->kvm)) {
+ mutex_unlock(&dev->kvm->lock);
+ return -EBUSY;
+ }
+
+ mutex_lock(&dev->kvm->arch.config_lock);
+
if (IS_VGIC_ADDR_UNDEF(its->vgic_its_base)) {
ret = -ENXIO;
goto out;
@@ -2058,11 +1988,6 @@ static int vgic_its_attr_regs_access(struct kvm_device *dev,
goto out;
}
- if (!lock_all_vcpus(dev->kvm)) {
- ret = -EBUSY;
- goto out;
- }
-
addr = its->vgic_its_base + offset;
len = region->access_flags & VGIC_ACCESS_64bit ? 8 : 4;
@@ -2076,8 +2001,9 @@ static int vgic_its_attr_regs_access(struct kvm_device *dev,
} else {
*reg = region->its_read(dev->kvm, its, addr, len);
}
- unlock_all_vcpus(dev->kvm);
out:
+ mutex_unlock(&dev->kvm->arch.config_lock);
+ kvm_unlock_all_vcpus(dev->kvm);
mutex_unlock(&dev->kvm->lock);
return ret;
}
@@ -2110,7 +2036,7 @@ static u32 compute_next_eventid_offset(struct list_head *h, struct its_ite *ite)
}
/**
- * entry_fn_t - Callback called on a table entry restore path
+ * typedef entry_fn_t - Callback called on a table entry restore path
* @its: its handle
* @id: id of the entry
* @entry: pointer to the entry
@@ -2133,6 +2059,7 @@ typedef int (*entry_fn_t)(struct vgic_its *its, u32 id, void *entry,
* @start_id: the ID of the first entry in the table
* (non zero for 2d level tables)
* @fn: function to apply on each entry
+ * @opaque: pointer to opaque data
*
* Return: < 0 on error, 0 if last element was identified, 1 otherwise
* (the last element may not be found on second level tables)
@@ -2172,13 +2099,12 @@ static int scan_its_table(struct vgic_its *its, gpa_t base, int size, u32 esz,
return 1;
}
-/**
+/*
* vgic_its_save_ite - Save an interrupt translation entry at @gpa
*/
static int vgic_its_save_ite(struct vgic_its *its, struct its_device *dev,
- struct its_ite *ite, gpa_t gpa, int ite_esz)
+ struct its_ite *ite, gpa_t gpa)
{
- struct kvm *kvm = its->dev->kvm;
u32 next_offset;
u64 val;
@@ -2187,11 +2113,14 @@ static int vgic_its_save_ite(struct vgic_its *its, struct its_device *dev,
((u64)ite->irq->intid << KVM_ITS_ITE_PINTID_SHIFT) |
ite->collection->collection_id;
val = cpu_to_le64(val);
- return kvm_write_guest_lock(kvm, gpa, &val, ite_esz);
+
+ return vgic_its_write_entry_lock(its, gpa, val, ite);
}
/**
* vgic_its_restore_ite - restore an interrupt translation entry
+ *
+ * @its: its handle
* @event_id: id used for indexing
* @ptr: pointer to the ITE entry
* @opaque: pointer to the its_device
@@ -2239,7 +2168,7 @@ static int vgic_its_restore_ite(struct vgic_its *its, u32 event_id,
return PTR_ERR(ite);
if (its_is_collection_mapped(collection))
- vcpu = kvm_get_vcpu(kvm, collection->target_addr);
+ vcpu = kvm_get_vcpu_by_id(kvm, collection->target_addr);
irq = vgic_add_lpi(kvm, lpi_id, vcpu);
if (IS_ERR(irq)) {
@@ -2285,7 +2214,7 @@ static int vgic_its_save_itt(struct vgic_its *its, struct its_device *device)
if (ite->irq->hw && !kvm_vgic_global_state.has_gicv4_1)
return -EACCES;
- ret = vgic_its_save_ite(its, device, ite, gpa, ite_esz);
+ ret = vgic_its_save_ite(its, device, ite, gpa);
if (ret)
return ret;
}
@@ -2326,9 +2255,8 @@ static int vgic_its_restore_itt(struct vgic_its *its, struct its_device *dev)
* @ptr: GPA
*/
static int vgic_its_save_dte(struct vgic_its *its, struct its_device *dev,
- gpa_t ptr, int dte_esz)
+ gpa_t ptr)
{
- struct kvm *kvm = its->dev->kvm;
u64 val, itt_addr_field;
u32 next_offset;
@@ -2339,7 +2267,8 @@ static int vgic_its_save_dte(struct vgic_its *its, struct its_device *dev,
(itt_addr_field << KVM_ITS_DTE_ITTADDR_SHIFT) |
(dev->num_eventid_bits - 1));
val = cpu_to_le64(val);
- return kvm_write_guest_lock(kvm, ptr, &val, dte_esz);
+
+ return vgic_its_write_entry_lock(its, ptr, val, dte);
}
/**
@@ -2387,7 +2316,7 @@ static int vgic_its_restore_dte(struct vgic_its *its, u32 id,
ret = vgic_its_restore_itt(its, dev);
if (ret) {
- vgic_its_free_device(its->dev->kvm, dev);
+ vgic_its_free_device(its->dev->kvm, its, dev);
return ret;
}
@@ -2406,7 +2335,7 @@ static int vgic_its_device_cmp(void *priv, const struct list_head *a,
return 1;
}
-/**
+/*
* vgic_its_save_device_tables - Save the device table and all ITT
* into guest RAM
*
@@ -2415,10 +2344,8 @@ static int vgic_its_device_cmp(void *priv, const struct list_head *a,
*/
static int vgic_its_save_device_tables(struct vgic_its *its)
{
- const struct vgic_its_abi *abi = vgic_its_get_abi(its);
u64 baser = its->baser_device_table;
struct its_device *dev;
- int dte_esz = abi->dte_esz;
if (!(baser & GITS_BASER_VALID))
return 0;
@@ -2437,7 +2364,7 @@ static int vgic_its_save_device_tables(struct vgic_its *its)
if (ret)
return ret;
- ret = vgic_its_save_dte(its, dev, eaddr, dte_esz);
+ ret = vgic_its_save_dte(its, dev, eaddr);
if (ret)
return ret;
}
@@ -2479,7 +2406,7 @@ static int handle_l1_dte(struct vgic_its *its, u32 id, void *addr,
return ret;
}
-/**
+/*
* vgic_its_restore_device_tables - Restore the device table and all ITT
* from guest RAM to internal data structs
*/
@@ -2518,7 +2445,7 @@ static int vgic_its_restore_device_tables(struct vgic_its *its)
static int vgic_its_save_cte(struct vgic_its *its,
struct its_collection *collection,
- gpa_t gpa, int esz)
+ gpa_t gpa)
{
u64 val;
@@ -2526,7 +2453,8 @@ static int vgic_its_save_cte(struct vgic_its *its,
((u64)collection->target_addr << KVM_ITS_CTE_RDBASE_SHIFT) |
collection->collection_id);
val = cpu_to_le64(val);
- return kvm_write_guest_lock(its->dev->kvm, gpa, &val, esz);
+
+ return vgic_its_write_entry_lock(its, gpa, val, cte);
}
/*
@@ -2534,7 +2462,7 @@ static int vgic_its_save_cte(struct vgic_its *its,
* Return +1 on success, 0 if the entry was invalid (which should be
* interpreted as end-of-table), and a negative error value for generic errors.
*/
-static int vgic_its_restore_cte(struct vgic_its *its, gpa_t gpa, int esz)
+static int vgic_its_restore_cte(struct vgic_its *its, gpa_t gpa)
{
struct its_collection *collection;
struct kvm *kvm = its->dev->kvm;
@@ -2542,8 +2470,7 @@ static int vgic_its_restore_cte(struct vgic_its *its, gpa_t gpa, int esz)
u64 val;
int ret;
- BUG_ON(esz > sizeof(val));
- ret = kvm_read_guest_lock(kvm, gpa, &val, esz);
+ ret = vgic_its_read_entry_lock(its, gpa, &val, cte);
if (ret)
return ret;
val = le64_to_cpu(val);
@@ -2554,7 +2481,7 @@ static int vgic_its_restore_cte(struct vgic_its *its, gpa_t gpa, int esz)
coll_id = val & KVM_ITS_CTE_ICID_MASK;
if (target_addr != COLLECTION_NOT_MAPPED &&
- target_addr >= atomic_read(&kvm->online_vcpus))
+ !kvm_get_vcpu_by_id(kvm, target_addr))
return -EINVAL;
collection = find_collection(its, coll_id);
@@ -2571,7 +2498,7 @@ static int vgic_its_restore_cte(struct vgic_its *its, gpa_t gpa, int esz)
return 1;
}
-/**
+/*
* vgic_its_save_collection_table - Save the collection table into
* guest RAM
*/
@@ -2581,7 +2508,6 @@ static int vgic_its_save_collection_table(struct vgic_its *its)
u64 baser = its->baser_coll_table;
gpa_t gpa = GITS_BASER_ADDR_48_to_52(baser);
struct its_collection *collection;
- u64 val;
size_t max_size, filled = 0;
int ret, cte_esz = abi->cte_esz;
@@ -2591,7 +2517,7 @@ static int vgic_its_save_collection_table(struct vgic_its *its)
max_size = GITS_BASER_NR_PAGES(baser) * SZ_64K;
list_for_each_entry(collection, &its->collection_list, coll_list) {
- ret = vgic_its_save_cte(its, collection, gpa, cte_esz);
+ ret = vgic_its_save_cte(its, collection, gpa);
if (ret)
return ret;
gpa += cte_esz;
@@ -2605,13 +2531,10 @@ static int vgic_its_save_collection_table(struct vgic_its *its)
* table is not fully filled, add a last dummy element
* with valid bit unset
*/
- val = 0;
- BUG_ON(cte_esz > sizeof(val));
- ret = kvm_write_guest_lock(its->dev->kvm, gpa, &val, cte_esz);
- return ret;
+ return vgic_its_write_entry_lock(its, gpa, 0ULL, cte);
}
-/**
+/*
* vgic_its_restore_collection_table - reads the collection table
* in guest memory and restores the ITS internal state. Requires the
* BASER registers to be restored before.
@@ -2633,7 +2556,7 @@ static int vgic_its_restore_collection_table(struct vgic_its *its)
max_size = GITS_BASER_NR_PAGES(baser) * SZ_64K;
while (read < max_size) {
- ret = vgic_its_restore_cte(its, gpa, cte_esz);
+ ret = vgic_its_restore_cte(its, gpa);
if (ret <= 0)
break;
gpa += cte_esz;
@@ -2649,7 +2572,7 @@ static int vgic_its_restore_collection_table(struct vgic_its *its)
return ret;
}
-/**
+/*
* vgic_its_save_tables_v0 - Save the ITS tables into guest ARM
* according to v0 ABI
*/
@@ -2664,7 +2587,7 @@ static int vgic_its_save_tables_v0(struct vgic_its *its)
return vgic_its_save_collection_table(its);
}
-/**
+/*
* vgic_its_restore_tables_v0 - Restore the ITS tables from guest RAM
* to internal data structs according to V0 ABI
*
@@ -2743,37 +2666,39 @@ static int vgic_its_has_attr(struct kvm_device *dev,
static int vgic_its_ctrl(struct kvm *kvm, struct vgic_its *its, u64 attr)
{
const struct vgic_its_abi *abi = vgic_its_get_abi(its);
- struct vgic_dist *dist = &kvm->arch.vgic;
int ret = 0;
if (attr == KVM_DEV_ARM_VGIC_CTRL_INIT) /* Nothing to do */
return 0;
mutex_lock(&kvm->lock);
- mutex_lock(&its->its_lock);
- if (!lock_all_vcpus(kvm)) {
- mutex_unlock(&its->its_lock);
+ if (kvm_trylock_all_vcpus(kvm)) {
mutex_unlock(&kvm->lock);
return -EBUSY;
}
+ mutex_lock(&kvm->arch.config_lock);
+ mutex_lock(&its->its_lock);
+
switch (attr) {
case KVM_DEV_ARM_ITS_CTRL_RESET:
vgic_its_reset(kvm, its);
break;
case KVM_DEV_ARM_ITS_SAVE_TABLES:
- dist->save_its_tables_in_progress = true;
ret = abi->save_tables(its);
- dist->save_its_tables_in_progress = false;
break;
case KVM_DEV_ARM_ITS_RESTORE_TABLES:
ret = abi->restore_tables(its);
break;
+ default:
+ ret = -ENXIO;
+ break;
}
- unlock_all_vcpus(kvm);
mutex_unlock(&its->its_lock);
+ mutex_unlock(&kvm->arch.config_lock);
+ kvm_unlock_all_vcpus(kvm);
mutex_unlock(&kvm->lock);
return ret;
}
@@ -2792,7 +2717,7 @@ bool kvm_arch_allow_write_without_running_vcpu(struct kvm *kvm)
{
struct vgic_dist *dist = &kvm->arch.vgic;
- return dist->save_its_tables_in_progress;
+ return dist->table_write_in_progress;
}
static int vgic_its_set_attr(struct kvm_device *dev,
@@ -2818,7 +2743,12 @@ static int vgic_its_set_attr(struct kvm_device *dev,
if (ret)
return ret;
- return vgic_register_its_iodev(dev->kvm, its, addr);
+ ret = vgic_register_its_iodev(dev->kvm, its, addr);
+ if (ret)
+ return ret;
+
+ return vgic_its_debug_init(dev);
+
}
case KVM_DEV_ARM_VGIC_GRP_CTRL:
return vgic_its_ctrl(dev->kvm, its, attr->attr);
diff --git a/arch/arm64/kvm/vgic/vgic-kvm-device.c b/arch/arm64/kvm/vgic/vgic-kvm-device.c
index edeac2380591..3d1a776b716d 100644
--- a/arch/arm64/kvm/vgic/vgic-kvm-device.c
+++ b/arch/arm64/kvm/vgic/vgic-kvm-device.c
@@ -5,6 +5,7 @@
* Copyright (C) 2015 ARM Ltd.
* Author: Marc Zyngier <marc.zyngier@arm.com>
*/
+#include <linux/irqchip/arm-gic-v3.h>
#include <linux/kvm_host.h>
#include <kvm/arm_vgic.h>
#include <linux/uaccess.h>
@@ -27,7 +28,8 @@ int vgic_check_iorange(struct kvm *kvm, phys_addr_t ioaddr,
if (addr + size < addr)
return -EINVAL;
- if (addr & ~kvm_phys_mask(kvm) || addr + size > kvm_phys_size(kvm))
+ if (addr & ~kvm_phys_mask(&kvm->arch.mmu) ||
+ (addr + size) > kvm_phys_size(&kvm->arch.mmu))
return -E2BIG;
return 0;
@@ -46,7 +48,7 @@ int kvm_set_legacy_vgic_v2_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev
struct vgic_dist *vgic = &kvm->arch.vgic;
int r;
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.config_lock);
switch (FIELD_GET(KVM_ARM_DEVICE_TYPE_MASK, dev_addr->id)) {
case KVM_VGIC_V2_ADDR_TYPE_DIST:
r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2);
@@ -68,7 +70,7 @@ int kvm_set_legacy_vgic_v2_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev
r = -ENODEV;
}
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.config_lock);
return r;
}
@@ -102,7 +104,11 @@ static int kvm_vgic_addr(struct kvm *kvm, struct kvm_device_attr *attr, bool wri
if (get_user(addr, uaddr))
return -EFAULT;
- mutex_lock(&kvm->lock);
+ /*
+ * Since we can't hold config_lock while registering the redistributor
+ * iodevs, take the slots_lock immediately.
+ */
+ mutex_lock(&kvm->slots_lock);
switch (attr->attr) {
case KVM_VGIC_V2_ADDR_TYPE_DIST:
r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2);
@@ -182,6 +188,7 @@ static int kvm_vgic_addr(struct kvm *kvm, struct kvm_device_attr *attr, bool wri
if (r)
goto out;
+ mutex_lock(&kvm->arch.config_lock);
if (write) {
r = vgic_check_iorange(kvm, *addr_ptr, addr, alignment, size);
if (!r)
@@ -189,9 +196,10 @@ static int kvm_vgic_addr(struct kvm *kvm, struct kvm_device_attr *attr, bool wri
} else {
addr = *addr_ptr;
}
+ mutex_unlock(&kvm->arch.config_lock);
out:
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->slots_lock);
if (!r && !write)
r = put_user(addr, uaddr);
@@ -227,24 +235,29 @@ static int vgic_set_common_attr(struct kvm_device *dev,
(val & 31))
return -EINVAL;
- mutex_lock(&dev->kvm->lock);
+ mutex_lock(&dev->kvm->arch.config_lock);
- if (vgic_ready(dev->kvm) || dev->kvm->arch.vgic.nr_spis)
+ /*
+ * Either userspace has already configured NR_IRQS or
+ * the vgic has already been initialized and vgic_init()
+ * supplied a default amount of SPIs.
+ */
+ if (dev->kvm->arch.vgic.nr_spis)
ret = -EBUSY;
else
dev->kvm->arch.vgic.nr_spis =
val - VGIC_NR_PRIVATE_IRQS;
- mutex_unlock(&dev->kvm->lock);
+ mutex_unlock(&dev->kvm->arch.config_lock);
return ret;
}
case KVM_DEV_ARM_VGIC_GRP_CTRL: {
switch (attr->attr) {
case KVM_DEV_ARM_VGIC_CTRL_INIT:
- mutex_lock(&dev->kvm->lock);
+ mutex_lock(&dev->kvm->arch.config_lock);
r = vgic_init(dev->kvm);
- mutex_unlock(&dev->kvm->lock);
+ mutex_unlock(&dev->kvm->arch.config_lock);
return r;
case KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES:
/*
@@ -256,12 +269,15 @@ static int vgic_set_common_attr(struct kvm_device *dev,
return -ENXIO;
mutex_lock(&dev->kvm->lock);
- if (!lock_all_vcpus(dev->kvm)) {
+ if (kvm_trylock_all_vcpus(dev->kvm)) {
mutex_unlock(&dev->kvm->lock);
return -EBUSY;
}
+
+ mutex_lock(&dev->kvm->arch.config_lock);
r = vgic_v3_save_pending_tables(dev->kvm);
- unlock_all_vcpus(dev->kvm);
+ mutex_unlock(&dev->kvm->arch.config_lock);
+ kvm_unlock_all_vcpus(dev->kvm);
mutex_unlock(&dev->kvm->lock);
return r;
}
@@ -328,58 +344,16 @@ int kvm_register_vgic_device(unsigned long type)
int vgic_v2_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr,
struct vgic_reg_attr *reg_attr)
{
- int cpuid;
+ int cpuid = FIELD_GET(KVM_DEV_ARM_VGIC_CPUID_MASK, attr->attr);
- cpuid = (attr->attr & KVM_DEV_ARM_VGIC_CPUID_MASK) >>
- KVM_DEV_ARM_VGIC_CPUID_SHIFT;
-
- if (cpuid >= atomic_read(&dev->kvm->online_vcpus))
- return -EINVAL;
-
- reg_attr->vcpu = kvm_get_vcpu(dev->kvm, cpuid);
reg_attr->addr = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
+ reg_attr->vcpu = kvm_get_vcpu_by_id(dev->kvm, cpuid);
+ if (!reg_attr->vcpu)
+ return -EINVAL;
return 0;
}
-/* unlocks vcpus from @vcpu_lock_idx and smaller */
-static void unlock_vcpus(struct kvm *kvm, int vcpu_lock_idx)
-{
- struct kvm_vcpu *tmp_vcpu;
-
- for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) {
- tmp_vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx);
- mutex_unlock(&tmp_vcpu->mutex);
- }
-}
-
-void unlock_all_vcpus(struct kvm *kvm)
-{
- unlock_vcpus(kvm, atomic_read(&kvm->online_vcpus) - 1);
-}
-
-/* Returns true if all vcpus were locked, false otherwise */
-bool lock_all_vcpus(struct kvm *kvm)
-{
- struct kvm_vcpu *tmp_vcpu;
- unsigned long c;
-
- /*
- * Any time a vcpu is run, vcpu_load is called which tries to grab the
- * vcpu->mutex. By grabbing the vcpu->mutex of all VCPUs we ensure
- * that no other VCPUs are run and fiddle with the vgic state while we
- * access it.
- */
- kvm_for_each_vcpu(c, tmp_vcpu, kvm) {
- if (!mutex_trylock(&tmp_vcpu->mutex)) {
- unlock_vcpus(kvm, c - 1);
- return false;
- }
- }
-
- return true;
-}
-
/**
* vgic_v2_attr_regs_access - allows user space to access VGIC v2 state
*
@@ -411,15 +385,17 @@ static int vgic_v2_attr_regs_access(struct kvm_device *dev,
mutex_lock(&dev->kvm->lock);
+ if (kvm_trylock_all_vcpus(dev->kvm)) {
+ mutex_unlock(&dev->kvm->lock);
+ return -EBUSY;
+ }
+
+ mutex_lock(&dev->kvm->arch.config_lock);
+
ret = vgic_init(dev->kvm);
if (ret)
goto out;
- if (!lock_all_vcpus(dev->kvm)) {
- ret = -EBUSY;
- goto out;
- }
-
switch (attr->group) {
case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
ret = vgic_v2_cpuif_uaccess(vcpu, is_write, addr, &val);
@@ -432,8 +408,9 @@ static int vgic_v2_attr_regs_access(struct kvm_device *dev,
break;
}
- unlock_all_vcpus(dev->kvm);
out:
+ mutex_unlock(&dev->kvm->arch.config_lock);
+ kvm_unlock_all_vcpus(dev->kvm);
mutex_unlock(&dev->kvm->lock);
if (!ret && !is_write)
@@ -528,6 +505,24 @@ int vgic_v3_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr,
}
/*
+ * Allow access to certain ID-like registers prior to VGIC initialization,
+ * thereby allowing the VMM to provision the features / sizing of the VGIC.
+ */
+static bool reg_allowed_pre_init(struct kvm_device_attr *attr)
+{
+ if (attr->group != KVM_DEV_ARM_VGIC_GRP_DIST_REGS)
+ return false;
+
+ switch (attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK) {
+ case GICD_IIDR:
+ case GICD_TYPER2:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/*
* vgic_v3_attr_regs_access - allows user space to access VGIC v3 state
*
* @dev: kvm device handle
@@ -569,12 +564,14 @@ static int vgic_v3_attr_regs_access(struct kvm_device *dev,
mutex_lock(&dev->kvm->lock);
- if (unlikely(!vgic_initialized(dev->kvm))) {
- ret = -EBUSY;
- goto out;
+ if (kvm_trylock_all_vcpus(dev->kvm)) {
+ mutex_unlock(&dev->kvm->lock);
+ return -EBUSY;
}
- if (!lock_all_vcpus(dev->kvm)) {
+ mutex_lock(&dev->kvm->arch.config_lock);
+
+ if (!(vgic_initialized(dev->kvm) || reg_allowed_pre_init(attr))) {
ret = -EBUSY;
goto out;
}
@@ -609,8 +606,9 @@ static int vgic_v3_attr_regs_access(struct kvm_device *dev,
break;
}
- unlock_all_vcpus(dev->kvm);
out:
+ mutex_unlock(&dev->kvm->arch.config_lock);
+ kvm_unlock_all_vcpus(dev->kvm);
mutex_unlock(&dev->kvm->lock);
if (!ret && uaccess && !is_write) {
@@ -630,6 +628,23 @@ static int vgic_v3_set_attr(struct kvm_device *dev,
case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS:
case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO:
return vgic_v3_attr_regs_access(dev, attr, true);
+ case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ: {
+ u32 __user *uaddr = (u32 __user *)attr->addr;
+ u32 val;
+
+ if (get_user(val, uaddr))
+ return -EFAULT;
+
+ guard(mutex)(&dev->kvm->arch.config_lock);
+ if (vgic_initialized(dev->kvm))
+ return -EBUSY;
+
+ if (!irq_is_ppi(val))
+ return -EINVAL;
+
+ dev->kvm->arch.vgic.mi_intid = val;
+ return 0;
+ }
default:
return vgic_set_common_attr(dev, attr);
}
@@ -644,6 +659,12 @@ static int vgic_v3_get_attr(struct kvm_device *dev,
case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS:
case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO:
return vgic_v3_attr_regs_access(dev, attr, false);
+ case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ: {
+ u32 __user *uaddr = (u32 __user *)(long)attr->addr;
+
+ guard(mutex)(&dev->kvm->arch.config_lock);
+ return put_user(dev->kvm->arch.vgic.mi_intid, uaddr);
+ }
default:
return vgic_get_common_attr(dev, attr);
}
@@ -666,6 +687,7 @@ static int vgic_v3_has_attr(struct kvm_device *dev,
case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS:
return vgic_v3_has_attr_regs(dev, attr);
case KVM_DEV_ARM_VGIC_GRP_NR_IRQS:
+ case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ:
return 0;
case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: {
if (((attr->attr & KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK) >>
diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v2.c b/arch/arm64/kvm/vgic/vgic-mmio-v2.c
index e070cda86e12..406845b3117c 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio-v2.c
+++ b/arch/arm64/kvm/vgic/vgic-mmio-v2.c
@@ -148,7 +148,7 @@ static void vgic_mmio_write_sgir(struct kvm_vcpu *source_vcpu,
if (!(targets & (1U << c)))
continue;
- irq = vgic_get_irq(source_vcpu->kvm, vcpu, intid);
+ irq = vgic_get_vcpu_irq(vcpu, intid);
raw_spin_lock_irqsave(&irq->irq_lock, flags);
irq->pending_latch = true;
@@ -167,7 +167,7 @@ static unsigned long vgic_mmio_read_target(struct kvm_vcpu *vcpu,
u64 val = 0;
for (i = 0; i < len; i++) {
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i);
val |= (u64)irq->targets << (i * 8);
@@ -191,7 +191,7 @@ static void vgic_mmio_write_target(struct kvm_vcpu *vcpu,
return;
for (i = 0; i < len; i++) {
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid + i);
+ struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, intid + i);
int target;
raw_spin_lock_irqsave(&irq->irq_lock, flags);
@@ -213,7 +213,7 @@ static unsigned long vgic_mmio_read_sgipend(struct kvm_vcpu *vcpu,
u64 val = 0;
for (i = 0; i < len; i++) {
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i);
val |= (u64)irq->source << (i * 8);
@@ -231,7 +231,7 @@ static void vgic_mmio_write_sgipendc(struct kvm_vcpu *vcpu,
unsigned long flags;
for (i = 0; i < len; i++) {
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i);
raw_spin_lock_irqsave(&irq->irq_lock, flags);
@@ -253,7 +253,7 @@ static void vgic_mmio_write_sgipends(struct kvm_vcpu *vcpu,
unsigned long flags;
for (i = 0; i < len; i++) {
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i);
raw_spin_lock_irqsave(&irq->irq_lock, flags);
@@ -359,6 +359,16 @@ static void vgic_mmio_write_vcpuif(struct kvm_vcpu *vcpu,
vgic_set_vmcr(vcpu, &vmcr);
}
+static void vgic_mmio_write_dir(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ if (kvm_vgic_global_state.type == VGIC_V2)
+ vgic_v2_deactivate(vcpu, val);
+ else
+ vgic_v3_deactivate(vcpu, val);
+}
+
static unsigned long vgic_mmio_read_apr(struct kvm_vcpu *vcpu,
gpa_t addr, unsigned int len)
{
@@ -482,6 +492,10 @@ static const struct vgic_register_region vgic_v2_cpu_registers[] = {
REGISTER_DESC_WITH_LENGTH(GIC_CPU_IDENT,
vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4,
VGIC_ACCESS_32bit),
+ REGISTER_DESC_WITH_LENGTH_UACCESS(GIC_CPU_DEACTIVATE,
+ vgic_mmio_read_raz, vgic_mmio_write_dir,
+ vgic_mmio_read_raz, vgic_mmio_uaccess_write_wi,
+ 4, VGIC_ACCESS_32bit),
};
unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev)
@@ -494,6 +508,16 @@ unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev)
return SZ_4K;
}
+unsigned int vgic_v2_init_cpuif_iodev(struct vgic_io_device *dev)
+{
+ dev->regions = vgic_v2_cpu_registers;
+ dev->nr_regions = ARRAY_SIZE(vgic_v2_cpu_registers);
+
+ kvm_iodevice_init(&dev->dev, &kvm_io_gic_ops);
+
+ return KVM_VGIC_V2_CPU_SIZE;
+}
+
int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr)
{
const struct vgic_register_region *region;
diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
index 91201f743033..70d50c77e5dc 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
@@ -50,8 +50,25 @@ bool vgic_has_its(struct kvm *kvm)
bool vgic_supports_direct_msis(struct kvm *kvm)
{
- return (kvm_vgic_global_state.has_gicv4_1 ||
- (kvm_vgic_global_state.has_gicv4 && vgic_has_its(kvm)));
+ /*
+ * Deliberately conflate vLPI and vSGI support on GICv4.1 hardware,
+ * indirectly allowing userspace to control whether or not vPEs are
+ * allocated for the VM.
+ */
+ if (system_supports_direct_sgis() && !vgic_supports_direct_sgis(kvm))
+ return false;
+
+ return kvm_vgic_global_state.has_gicv4 && vgic_has_its(kvm);
+}
+
+bool system_supports_direct_sgis(void)
+{
+ return kvm_vgic_global_state.has_gicv4_1 && gic_cpuif_has_vsgi();
+}
+
+bool vgic_supports_direct_sgis(struct kvm *kvm)
+{
+ return kvm->arch.vgic.nassgicap;
}
/*
@@ -86,7 +103,7 @@ static unsigned long vgic_mmio_read_v3_misc(struct kvm_vcpu *vcpu,
}
break;
case GICD_TYPER2:
- if (kvm_vgic_global_state.has_gicv4_1 && gic_cpuif_has_vsgi())
+ if (vgic_supports_direct_sgis(vcpu->kvm))
value = GICD_TYPER2_nASSGIcap;
break;
case GICD_IIDR:
@@ -111,7 +128,7 @@ static void vgic_mmio_write_v3_misc(struct kvm_vcpu *vcpu,
case GICD_CTLR: {
bool was_enabled, is_hwsgi;
- mutex_lock(&vcpu->kvm->lock);
+ mutex_lock(&vcpu->kvm->arch.config_lock);
was_enabled = dist->enabled;
is_hwsgi = dist->nassgireq;
@@ -119,7 +136,7 @@ static void vgic_mmio_write_v3_misc(struct kvm_vcpu *vcpu,
dist->enabled = val & GICD_CTLR_ENABLE_SS_G1;
/* Not a GICv4.1? No HW SGIs */
- if (!kvm_vgic_global_state.has_gicv4_1 || !gic_cpuif_has_vsgi())
+ if (!vgic_supports_direct_sgis(vcpu->kvm))
val &= ~GICD_CTLR_nASSGIreq;
/* Dist stays enabled? nASSGIreq is RO */
@@ -133,13 +150,13 @@ static void vgic_mmio_write_v3_misc(struct kvm_vcpu *vcpu,
if (is_hwsgi != dist->nassgireq)
vgic_v4_configure_vsgis(vcpu->kvm);
- if (kvm_vgic_global_state.has_gicv4_1 &&
+ if (vgic_supports_direct_sgis(vcpu->kvm) &&
was_enabled != dist->enabled)
kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_RELOAD_GICv4);
else if (!was_enabled && dist->enabled)
vgic_kick_vcpus(vcpu->kvm);
- mutex_unlock(&vcpu->kvm->lock);
+ mutex_unlock(&vcpu->kvm->arch.config_lock);
break;
}
case GICD_TYPER:
@@ -159,8 +176,18 @@ static int vgic_mmio_uaccess_write_v3_misc(struct kvm_vcpu *vcpu,
switch (addr & 0x0c) {
case GICD_TYPER2:
- if (val != vgic_mmio_read_v3_misc(vcpu, addr, len))
+ reg = vgic_mmio_read_v3_misc(vcpu, addr, len);
+
+ if (reg == val)
+ return 0;
+ if (vgic_initialized(vcpu->kvm))
+ return -EBUSY;
+ if ((reg ^ val) & ~GICD_TYPER2_nASSGIcap)
return -EINVAL;
+ if (!system_supports_direct_sgis() && val)
+ return -EINVAL;
+
+ dist->nassgicap = val & GICD_TYPER2_nASSGIcap;
return 0;
case GICD_IIDR:
reg = vgic_mmio_read_v3_misc(vcpu, addr, len);
@@ -178,7 +205,7 @@ static int vgic_mmio_uaccess_write_v3_misc(struct kvm_vcpu *vcpu,
}
case GICD_CTLR:
/* Not a GICv4.1? No HW SGIs */
- if (!kvm_vgic_global_state.has_gicv4_1)
+ if (!vgic_supports_direct_sgis(vcpu->kvm))
val &= ~GICD_CTLR_nASSGIreq;
dist->enabled = val & GICD_CTLR_ENABLE_SS_G1;
@@ -194,7 +221,7 @@ static unsigned long vgic_mmio_read_irouter(struct kvm_vcpu *vcpu,
gpa_t addr, unsigned int len)
{
int intid = VGIC_ADDR_TO_INTID(addr, 64);
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid);
+ struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, intid);
unsigned long ret = 0;
if (!irq)
@@ -220,7 +247,7 @@ static void vgic_mmio_write_irouter(struct kvm_vcpu *vcpu,
if (addr & 4)
return;
- irq = vgic_get_irq(vcpu->kvm, NULL, intid);
+ irq = vgic_get_irq(vcpu->kvm, intid);
if (!irq)
return;
@@ -277,7 +304,7 @@ static void vgic_mmio_write_v3r_ctlr(struct kvm_vcpu *vcpu,
return;
vgic_flush_pending_lpis(vcpu);
- vgic_its_invalidate_cache(vcpu->kvm);
+ vgic_its_invalidate_all_caches(vcpu->kvm);
atomic_set_release(&vgic_cpu->ctlr, 0);
} else {
ctlr = atomic_cmpxchg_acquire(&vgic_cpu->ctlr, 0,
@@ -357,31 +384,13 @@ static int vgic_v3_uaccess_write_pending(struct kvm_vcpu *vcpu,
gpa_t addr, unsigned int len,
unsigned long val)
{
- u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
- int i;
- unsigned long flags;
-
- for (i = 0; i < len * 8; i++) {
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
- raw_spin_lock_irqsave(&irq->irq_lock, flags);
- if (test_bit(i, &val)) {
- /*
- * pending_latch is set irrespective of irq type
- * (level or edge) to avoid dependency that VM should
- * restore irq config before pending info.
- */
- irq->pending_latch = true;
- vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
- } else {
- irq->pending_latch = false;
- raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
- }
+ int ret;
- vgic_put_irq(vcpu->kvm, irq);
- }
+ ret = vgic_uaccess_write_spending(vcpu, addr, len, val);
+ if (ret)
+ return ret;
- return 0;
+ return vgic_uaccess_write_cpending(vcpu, addr, len, ~val);
}
/* We want to avoid outer shareable. */
@@ -548,6 +557,7 @@ static void vgic_mmio_write_invlpi(struct kvm_vcpu *vcpu,
unsigned long val)
{
struct vgic_irq *irq;
+ u32 intid;
/*
* If the guest wrote only to the upper 32bit part of the
@@ -559,9 +569,13 @@ static void vgic_mmio_write_invlpi(struct kvm_vcpu *vcpu,
if ((addr & 4) || !vgic_lpis_enabled(vcpu))
return;
+ intid = lower_32_bits(val);
+ if (intid < VGIC_MIN_LPI)
+ return;
+
vgic_set_rdist_busy(vcpu, true);
- irq = vgic_get_irq(vcpu->kvm, NULL, lower_32_bits(val));
+ irq = vgic_get_irq(vcpu->kvm, intid);
if (irq) {
vgic_its_inv_lpi(vcpu->kvm, irq);
vgic_put_irq(vcpu->kvm, irq);
@@ -769,10 +783,13 @@ int vgic_register_redist_iodev(struct kvm_vcpu *vcpu)
struct vgic_io_device *rd_dev = &vcpu->arch.vgic_cpu.rd_iodev;
struct vgic_redist_region *rdreg;
gpa_t rd_base;
- int ret;
+ int ret = 0;
+
+ lockdep_assert_held(&kvm->slots_lock);
+ mutex_lock(&kvm->arch.config_lock);
if (!IS_VGIC_ADDR_UNDEF(vgic_cpu->rd_iodev.base_addr))
- return 0;
+ goto out_unlock;
/*
* We may be creating VCPUs before having set the base address for the
@@ -782,10 +799,12 @@ int vgic_register_redist_iodev(struct kvm_vcpu *vcpu)
*/
rdreg = vgic_v3_rdist_free_slot(&vgic->rd_regions);
if (!rdreg)
- return 0;
+ goto out_unlock;
- if (!vgic_v3_check_base(kvm))
- return -EINVAL;
+ if (!vgic_v3_check_base(kvm)) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
vgic_cpu->rdreg = rdreg;
vgic_cpu->rdreg_index = rdreg->free_index;
@@ -799,19 +818,23 @@ int vgic_register_redist_iodev(struct kvm_vcpu *vcpu)
rd_dev->nr_regions = ARRAY_SIZE(vgic_v3_rd_registers);
rd_dev->redist_vcpu = vcpu;
- mutex_lock(&kvm->slots_lock);
+ mutex_unlock(&kvm->arch.config_lock);
+
ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, rd_base,
2 * SZ_64K, &rd_dev->dev);
- mutex_unlock(&kvm->slots_lock);
-
if (ret)
return ret;
+ /* Protected by slots_lock */
rdreg->free_index++;
return 0;
+
+out_unlock:
+ mutex_unlock(&kvm->arch.config_lock);
+ return ret;
}
-static void vgic_unregister_redist_iodev(struct kvm_vcpu *vcpu)
+void vgic_unregister_redist_iodev(struct kvm_vcpu *vcpu)
{
struct vgic_io_device *rd_dev = &vcpu->arch.vgic_cpu.rd_iodev;
@@ -824,6 +847,8 @@ static int vgic_register_all_redist_iodevs(struct kvm *kvm)
unsigned long c;
int ret = 0;
+ lockdep_assert_held(&kvm->slots_lock);
+
kvm_for_each_vcpu(c, vcpu, kvm) {
ret = vgic_register_redist_iodev(vcpu);
if (ret)
@@ -834,12 +859,10 @@ static int vgic_register_all_redist_iodevs(struct kvm *kvm)
/* The current c failed, so iterate over the previous ones. */
int i;
- mutex_lock(&kvm->slots_lock);
for (i = 0; i < c; i++) {
vcpu = kvm_get_vcpu(kvm, i);
vgic_unregister_redist_iodev(vcpu);
}
- mutex_unlock(&kvm->slots_lock);
}
return ret;
@@ -928,8 +951,19 @@ free:
return ret;
}
-void vgic_v3_free_redist_region(struct vgic_redist_region *rdreg)
+void vgic_v3_free_redist_region(struct kvm *kvm, struct vgic_redist_region *rdreg)
{
+ struct kvm_vcpu *vcpu;
+ unsigned long c;
+
+ lockdep_assert_held(&kvm->arch.config_lock);
+
+ /* Garbage collect the region */
+ kvm_for_each_vcpu(c, vcpu, kvm) {
+ if (vcpu->arch.vgic_cpu.rdreg == rdreg)
+ vcpu->arch.vgic_cpu.rdreg = NULL;
+ }
+
list_del(&rdreg->list);
kfree(rdreg);
}
@@ -938,7 +972,9 @@ int vgic_v3_set_redist_base(struct kvm *kvm, u32 index, u64 addr, u32 count)
{
int ret;
+ mutex_lock(&kvm->arch.config_lock);
ret = vgic_v3_alloc_redist_region(kvm, index, addr, count);
+ mutex_unlock(&kvm->arch.config_lock);
if (ret)
return ret;
@@ -950,8 +986,10 @@ int vgic_v3_set_redist_base(struct kvm *kvm, u32 index, u64 addr, u32 count)
if (ret) {
struct vgic_redist_region *rdreg;
+ mutex_lock(&kvm->arch.config_lock);
rdreg = vgic_v3_rdist_region_from_index(kvm, index);
- vgic_v3_free_redist_region(rdreg);
+ vgic_v3_free_redist_region(kvm, rdreg);
+ mutex_unlock(&kvm->arch.config_lock);
return ret;
}
@@ -1002,35 +1040,6 @@ int vgic_v3_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr)
return 0;
}
-/*
- * Compare a given affinity (level 1-3 and a level 0 mask, from the SGI
- * generation register ICC_SGI1R_EL1) with a given VCPU.
- * If the VCPU's MPIDR matches, return the level0 affinity, otherwise
- * return -1.
- */
-static int match_mpidr(u64 sgi_aff, u16 sgi_cpu_mask, struct kvm_vcpu *vcpu)
-{
- unsigned long affinity;
- int level0;
-
- /*
- * Split the current VCPU's MPIDR into affinity level 0 and the
- * rest as this is what we have to compare against.
- */
- affinity = kvm_vcpu_get_mpidr_aff(vcpu);
- level0 = MPIDR_AFFINITY_LEVEL(affinity, 0);
- affinity &= ~MPIDR_LEVEL_MASK;
-
- /* bail out if the upper three levels don't match */
- if (sgi_aff != affinity)
- return -1;
-
- /* Is this VCPU's bit set in the mask ? */
- if (!(sgi_cpu_mask & BIT(level0)))
- return -1;
-
- return level0;
-}
/*
* The ICC_SGI* registers encode the affinity differently from the MPIDR,
@@ -1041,6 +1050,38 @@ static int match_mpidr(u64 sgi_aff, u16 sgi_cpu_mask, struct kvm_vcpu *vcpu)
((((reg) & ICC_SGI1R_AFFINITY_## level ##_MASK) \
>> ICC_SGI1R_AFFINITY_## level ##_SHIFT) << MPIDR_LEVEL_SHIFT(level))
+static void vgic_v3_queue_sgi(struct kvm_vcpu *vcpu, u32 sgi, bool allow_group1)
+{
+ struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, sgi);
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
+
+ /*
+ * An access targeting Group0 SGIs can only generate
+ * those, while an access targeting Group1 SGIs can
+ * generate interrupts of either group.
+ */
+ if (!irq->group || allow_group1) {
+ if (!irq->hw) {
+ irq->pending_latch = true;
+ vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
+ } else {
+ /* HW SGI? Ask the GIC to inject it */
+ int err;
+ err = irq_set_irqchip_state(irq->host_irq,
+ IRQCHIP_STATE_PENDING,
+ true);
+ WARN_RATELIMIT(err, "IRQ %d", irq->host_irq);
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+ }
+ } else {
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+ }
+
+ vgic_put_irq(vcpu->kvm, irq);
+}
+
/**
* vgic_v3_dispatch_sgi - handle SGI requests from VCPUs
* @vcpu: The VCPU requesting a SGI
@@ -1051,83 +1092,46 @@ static int match_mpidr(u64 sgi_aff, u16 sgi_cpu_mask, struct kvm_vcpu *vcpu)
* This will trap in sys_regs.c and call this function.
* This ICC_SGI1R_EL1 register contains the upper three affinity levels of the
* target processors as well as a bitmask of 16 Aff0 CPUs.
- * If the interrupt routing mode bit is not set, we iterate over all VCPUs to
- * check for matching ones. If this bit is set, we signal all, but not the
- * calling VCPU.
+ *
+ * If the interrupt routing mode bit is not set, we iterate over the Aff0
+ * bits and signal the VCPUs matching the provided Aff{3,2,1}.
+ *
+ * If this bit is set, we signal all, but not the calling VCPU.
*/
void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg, bool allow_group1)
{
struct kvm *kvm = vcpu->kvm;
struct kvm_vcpu *c_vcpu;
- u16 target_cpus;
+ unsigned long target_cpus;
u64 mpidr;
- int sgi;
- int vcpu_id = vcpu->vcpu_id;
- bool broadcast;
- unsigned long c, flags;
-
- sgi = (reg & ICC_SGI1R_SGI_ID_MASK) >> ICC_SGI1R_SGI_ID_SHIFT;
- broadcast = reg & BIT_ULL(ICC_SGI1R_IRQ_ROUTING_MODE_BIT);
- target_cpus = (reg & ICC_SGI1R_TARGET_LIST_MASK) >> ICC_SGI1R_TARGET_LIST_SHIFT;
- mpidr = SGI_AFFINITY_LEVEL(reg, 3);
- mpidr |= SGI_AFFINITY_LEVEL(reg, 2);
- mpidr |= SGI_AFFINITY_LEVEL(reg, 1);
-
- /*
- * We iterate over all VCPUs to find the MPIDRs matching the request.
- * If we have handled one CPU, we clear its bit to detect early
- * if we are already finished. This avoids iterating through all
- * VCPUs when most of the times we just signal a single VCPU.
- */
- kvm_for_each_vcpu(c, c_vcpu, kvm) {
- struct vgic_irq *irq;
-
- /* Exit early if we have dealt with all requested CPUs */
- if (!broadcast && target_cpus == 0)
- break;
-
- /* Don't signal the calling VCPU */
- if (broadcast && c == vcpu_id)
- continue;
+ u32 sgi, aff0;
+ unsigned long c;
- if (!broadcast) {
- int level0;
+ sgi = FIELD_GET(ICC_SGI1R_SGI_ID_MASK, reg);
- level0 = match_mpidr(mpidr, target_cpus, c_vcpu);
- if (level0 == -1)
+ /* Broadcast */
+ if (unlikely(reg & BIT_ULL(ICC_SGI1R_IRQ_ROUTING_MODE_BIT))) {
+ kvm_for_each_vcpu(c, c_vcpu, kvm) {
+ /* Don't signal the calling VCPU */
+ if (c_vcpu == vcpu)
continue;
- /* remove this matching VCPU from the mask */
- target_cpus &= ~BIT(level0);
+ vgic_v3_queue_sgi(c_vcpu, sgi, allow_group1);
}
- irq = vgic_get_irq(vcpu->kvm, c_vcpu, sgi);
-
- raw_spin_lock_irqsave(&irq->irq_lock, flags);
+ return;
+ }
- /*
- * An access targeting Group0 SGIs can only generate
- * those, while an access targeting Group1 SGIs can
- * generate interrupts of either group.
- */
- if (!irq->group || allow_group1) {
- if (!irq->hw) {
- irq->pending_latch = true;
- vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
- } else {
- /* HW SGI? Ask the GIC to inject it */
- int err;
- err = irq_set_irqchip_state(irq->host_irq,
- IRQCHIP_STATE_PENDING,
- true);
- WARN_RATELIMIT(err, "IRQ %d", irq->host_irq);
- raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
- }
- } else {
- raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
- }
+ /* We iterate over affinities to find the corresponding vcpus */
+ mpidr = SGI_AFFINITY_LEVEL(reg, 3);
+ mpidr |= SGI_AFFINITY_LEVEL(reg, 2);
+ mpidr |= SGI_AFFINITY_LEVEL(reg, 1);
+ target_cpus = FIELD_GET(ICC_SGI1R_TARGET_LIST_MASK, reg);
- vgic_put_irq(vcpu->kvm, irq);
+ for_each_set_bit(aff0, &target_cpus, hweight_long(ICC_SGI1R_TARGET_LIST_MASK)) {
+ c_vcpu = kvm_mpidr_to_vcpu(kvm, mpidr | aff0);
+ if (c_vcpu)
+ vgic_v3_queue_sgi(c_vcpu, sgi, allow_group1);
}
}
diff --git a/arch/arm64/kvm/vgic/vgic-mmio.c b/arch/arm64/kvm/vgic/vgic-mmio.c
index b32d434c1d4a..a573b1f0c6cb 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio.c
+++ b/arch/arm64/kvm/vgic/vgic-mmio.c
@@ -50,7 +50,7 @@ unsigned long vgic_mmio_read_group(struct kvm_vcpu *vcpu,
/* Loop over all IRQs affected by this read */
for (i = 0; i < len * 8; i++) {
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i);
if (irq->group)
value |= BIT(i);
@@ -74,7 +74,7 @@ void vgic_mmio_write_group(struct kvm_vcpu *vcpu, gpa_t addr,
unsigned long flags;
for (i = 0; i < len * 8; i++) {
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i);
raw_spin_lock_irqsave(&irq->irq_lock, flags);
irq->group = !!(val & BIT(i));
@@ -102,7 +102,7 @@ unsigned long vgic_mmio_read_enable(struct kvm_vcpu *vcpu,
/* Loop over all IRQs affected by this read */
for (i = 0; i < len * 8; i++) {
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i);
if (irq->enabled)
value |= (1U << i);
@@ -122,7 +122,7 @@ void vgic_mmio_write_senable(struct kvm_vcpu *vcpu,
unsigned long flags;
for_each_set_bit(i, &val, len * 8) {
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i);
raw_spin_lock_irqsave(&irq->irq_lock, flags);
if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
@@ -171,7 +171,7 @@ void vgic_mmio_write_cenable(struct kvm_vcpu *vcpu,
unsigned long flags;
for_each_set_bit(i, &val, len * 8) {
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i);
raw_spin_lock_irqsave(&irq->irq_lock, flags);
if (irq->hw && vgic_irq_is_sgi(irq->intid) && irq->enabled)
@@ -193,7 +193,7 @@ int vgic_uaccess_write_senable(struct kvm_vcpu *vcpu,
unsigned long flags;
for_each_set_bit(i, &val, len * 8) {
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i);
raw_spin_lock_irqsave(&irq->irq_lock, flags);
irq->enabled = true;
@@ -214,7 +214,7 @@ int vgic_uaccess_write_cenable(struct kvm_vcpu *vcpu,
unsigned long flags;
for_each_set_bit(i, &val, len * 8) {
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i);
raw_spin_lock_irqsave(&irq->irq_lock, flags);
irq->enabled = false;
@@ -236,7 +236,7 @@ static unsigned long __read_pending(struct kvm_vcpu *vcpu,
/* Loop over all IRQs affected by this read */
for (i = 0; i < len * 8; i++) {
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i);
unsigned long flags;
bool val;
@@ -301,25 +301,32 @@ static bool is_vgic_v2_sgi(struct kvm_vcpu *vcpu, struct vgic_irq *irq)
vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2);
}
-void vgic_mmio_write_spending(struct kvm_vcpu *vcpu,
- gpa_t addr, unsigned int len,
- unsigned long val)
+static void __set_pending(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len,
+ unsigned long val, bool is_user)
{
u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
int i;
unsigned long flags;
for_each_set_bit(i, &val, len * 8) {
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i);
- /* GICD_ISPENDR0 SGI bits are WI */
- if (is_vgic_v2_sgi(vcpu, irq)) {
+ /* GICD_ISPENDR0 SGI bits are WI when written from the guest. */
+ if (is_vgic_v2_sgi(vcpu, irq) && !is_user) {
vgic_put_irq(vcpu->kvm, irq);
continue;
}
raw_spin_lock_irqsave(&irq->irq_lock, flags);
+ /*
+ * GICv2 SGIs are terribly broken. We can't restore
+ * the source of the interrupt, so just pick the vcpu
+ * itself as the source...
+ */
+ if (is_vgic_v2_sgi(vcpu, irq))
+ irq->source |= BIT(vcpu->vcpu_id);
+
if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
/* HW SGI? Ask the GIC to inject it */
int err;
@@ -335,7 +342,7 @@ void vgic_mmio_write_spending(struct kvm_vcpu *vcpu,
}
irq->pending_latch = true;
- if (irq->hw)
+ if (irq->hw && !is_user)
vgic_irq_set_phys_active(irq, true);
vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
@@ -343,33 +350,18 @@ void vgic_mmio_write_spending(struct kvm_vcpu *vcpu,
}
}
+void vgic_mmio_write_spending(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ __set_pending(vcpu, addr, len, val, false);
+}
+
int vgic_uaccess_write_spending(struct kvm_vcpu *vcpu,
gpa_t addr, unsigned int len,
unsigned long val)
{
- u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
- int i;
- unsigned long flags;
-
- for_each_set_bit(i, &val, len * 8) {
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
- raw_spin_lock_irqsave(&irq->irq_lock, flags);
- irq->pending_latch = true;
-
- /*
- * GICv2 SGIs are terribly broken. We can't restore
- * the source of the interrupt, so just pick the vcpu
- * itself as the source...
- */
- if (is_vgic_v2_sgi(vcpu, irq))
- irq->source |= BIT(vcpu->vcpu_id);
-
- vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
-
- vgic_put_irq(vcpu->kvm, irq);
- }
-
+ __set_pending(vcpu, addr, len, val, true);
return 0;
}
@@ -394,25 +386,33 @@ static void vgic_hw_irq_cpending(struct kvm_vcpu *vcpu, struct vgic_irq *irq)
vgic_irq_set_phys_active(irq, false);
}
-void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu,
- gpa_t addr, unsigned int len,
- unsigned long val)
+static void __clear_pending(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val, bool is_user)
{
u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
int i;
unsigned long flags;
for_each_set_bit(i, &val, len * 8) {
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i);
- /* GICD_ICPENDR0 SGI bits are WI */
- if (is_vgic_v2_sgi(vcpu, irq)) {
+ /* GICD_ICPENDR0 SGI bits are WI when written from the guest. */
+ if (is_vgic_v2_sgi(vcpu, irq) && !is_user) {
vgic_put_irq(vcpu->kvm, irq);
continue;
}
raw_spin_lock_irqsave(&irq->irq_lock, flags);
+ /*
+ * More fun with GICv2 SGIs! If we're clearing one of them
+ * from userspace, which source vcpu to clear? Let's not
+ * even think of it, and blow the whole set.
+ */
+ if (is_vgic_v2_sgi(vcpu, irq))
+ irq->source = 0;
+
if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
/* HW SGI? Ask the GIC to clear its pending bit */
int err;
@@ -427,7 +427,7 @@ void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu,
continue;
}
- if (irq->hw)
+ if (irq->hw && !is_user)
vgic_hw_irq_cpending(vcpu, irq);
else
irq->pending_latch = false;
@@ -437,33 +437,18 @@ void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu,
}
}
+void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ __clear_pending(vcpu, addr, len, val, false);
+}
+
int vgic_uaccess_write_cpending(struct kvm_vcpu *vcpu,
gpa_t addr, unsigned int len,
unsigned long val)
{
- u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
- int i;
- unsigned long flags;
-
- for_each_set_bit(i, &val, len * 8) {
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
- raw_spin_lock_irqsave(&irq->irq_lock, flags);
- /*
- * More fun with GICv2 SGIs! If we're clearing one of them
- * from userspace, which source vcpu to clear? Let's not
- * even think of it, and blow the whole set.
- */
- if (is_vgic_v2_sgi(vcpu, irq))
- irq->source = 0;
-
- irq->pending_latch = false;
-
- raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-
- vgic_put_irq(vcpu->kvm, irq);
- }
-
+ __clear_pending(vcpu, addr, len, val, true);
return 0;
}
@@ -473,9 +458,10 @@ int vgic_uaccess_write_cpending(struct kvm_vcpu *vcpu,
* active state can be overwritten when the VCPU's state is synced coming back
* from the guest.
*
- * For shared interrupts as well as GICv3 private interrupts, we have to
- * stop all the VCPUs because interrupts can be migrated while we don't hold
- * the IRQ locks and we don't want to be chasing moving targets.
+ * For shared interrupts as well as GICv3 private interrupts accessed from the
+ * non-owning CPU, we have to stop all the VCPUs because interrupts can be
+ * migrated while we don't hold the IRQ locks and we don't want to be chasing
+ * moving targets.
*
* For GICv2 private interrupts we don't have to do anything because
* userspace accesses to the VGIC state already require all VCPUs to be
@@ -484,7 +470,8 @@ int vgic_uaccess_write_cpending(struct kvm_vcpu *vcpu,
*/
static void vgic_access_active_prepare(struct kvm_vcpu *vcpu, u32 intid)
{
- if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 ||
+ if ((vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 &&
+ vcpu != kvm_get_running_vcpu()) ||
intid >= VGIC_NR_PRIVATE_IRQS)
kvm_arm_halt_guest(vcpu->kvm);
}
@@ -492,7 +479,8 @@ static void vgic_access_active_prepare(struct kvm_vcpu *vcpu, u32 intid)
/* See vgic_access_active_prepare */
static void vgic_access_active_finish(struct kvm_vcpu *vcpu, u32 intid)
{
- if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 ||
+ if ((vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 &&
+ vcpu != kvm_get_running_vcpu()) ||
intid >= VGIC_NR_PRIVATE_IRQS)
kvm_arm_resume_guest(vcpu->kvm);
}
@@ -506,7 +494,7 @@ static unsigned long __vgic_mmio_read_active(struct kvm_vcpu *vcpu,
/* Loop over all IRQs affected by this read */
for (i = 0; i < len * 8; i++) {
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i);
/*
* Even for HW interrupts, don't evaluate the HW state as
@@ -527,13 +515,13 @@ unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu,
u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
u32 val;
- mutex_lock(&vcpu->kvm->lock);
+ mutex_lock(&vcpu->kvm->arch.config_lock);
vgic_access_active_prepare(vcpu, intid);
val = __vgic_mmio_read_active(vcpu, addr, len);
vgic_access_active_finish(vcpu, intid);
- mutex_unlock(&vcpu->kvm->lock);
+ mutex_unlock(&vcpu->kvm->arch.config_lock);
return val;
}
@@ -610,7 +598,7 @@ static void __vgic_mmio_write_cactive(struct kvm_vcpu *vcpu,
int i;
for_each_set_bit(i, &val, len * 8) {
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i);
vgic_mmio_change_active(vcpu, irq, false);
vgic_put_irq(vcpu->kvm, irq);
}
@@ -622,13 +610,13 @@ void vgic_mmio_write_cactive(struct kvm_vcpu *vcpu,
{
u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
- mutex_lock(&vcpu->kvm->lock);
+ mutex_lock(&vcpu->kvm->arch.config_lock);
vgic_access_active_prepare(vcpu, intid);
__vgic_mmio_write_cactive(vcpu, addr, len, val);
vgic_access_active_finish(vcpu, intid);
- mutex_unlock(&vcpu->kvm->lock);
+ mutex_unlock(&vcpu->kvm->arch.config_lock);
}
int vgic_mmio_uaccess_write_cactive(struct kvm_vcpu *vcpu,
@@ -647,7 +635,7 @@ static void __vgic_mmio_write_sactive(struct kvm_vcpu *vcpu,
int i;
for_each_set_bit(i, &val, len * 8) {
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i);
vgic_mmio_change_active(vcpu, irq, true);
vgic_put_irq(vcpu->kvm, irq);
}
@@ -659,13 +647,13 @@ void vgic_mmio_write_sactive(struct kvm_vcpu *vcpu,
{
u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
- mutex_lock(&vcpu->kvm->lock);
+ mutex_lock(&vcpu->kvm->arch.config_lock);
vgic_access_active_prepare(vcpu, intid);
__vgic_mmio_write_sactive(vcpu, addr, len, val);
vgic_access_active_finish(vcpu, intid);
- mutex_unlock(&vcpu->kvm->lock);
+ mutex_unlock(&vcpu->kvm->arch.config_lock);
}
int vgic_mmio_uaccess_write_sactive(struct kvm_vcpu *vcpu,
@@ -684,7 +672,7 @@ unsigned long vgic_mmio_read_priority(struct kvm_vcpu *vcpu,
u64 val = 0;
for (i = 0; i < len; i++) {
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i);
val |= (u64)irq->priority << (i * 8);
@@ -710,7 +698,7 @@ void vgic_mmio_write_priority(struct kvm_vcpu *vcpu,
unsigned long flags;
for (i = 0; i < len; i++) {
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i);
raw_spin_lock_irqsave(&irq->irq_lock, flags);
/* Narrow the priority range to what we actually support */
@@ -731,7 +719,7 @@ unsigned long vgic_mmio_read_config(struct kvm_vcpu *vcpu,
int i;
for (i = 0; i < len * 4; i++) {
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i);
if (irq->config == VGIC_CONFIG_EDGE)
value |= (2U << (i * 2));
@@ -762,7 +750,7 @@ void vgic_mmio_write_config(struct kvm_vcpu *vcpu,
if (intid + i < VGIC_NR_PRIVATE_IRQS)
continue;
- irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ irq = vgic_get_irq(vcpu->kvm, intid + i);
raw_spin_lock_irqsave(&irq->irq_lock, flags);
if (test_bit(i * 2 + 1, &val))
@@ -787,7 +775,7 @@ u32 vgic_read_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid)
if ((intid + i) < VGIC_NR_SGIS || (intid + i) >= nr_irqs)
continue;
- irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ irq = vgic_get_vcpu_irq(vcpu, intid + i);
if (irq->config == VGIC_CONFIG_LEVEL && irq->line_level)
val |= (1U << i);
@@ -811,7 +799,7 @@ void vgic_write_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid,
if ((intid + i) < VGIC_NR_SGIS || (intid + i) >= nr_irqs)
continue;
- irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ irq = vgic_get_vcpu_irq(vcpu, intid + i);
/*
* Line level is set irrespective of irq type
@@ -1093,7 +1081,6 @@ int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address,
enum vgic_type type)
{
struct vgic_io_device *io_device = &kvm->arch.vgic.dist_iodev;
- int ret = 0;
unsigned int len;
switch (type) {
@@ -1104,17 +1091,13 @@ int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address,
len = vgic_v3_init_dist_iodev(io_device);
break;
default:
- BUG_ON(1);
+ BUG();
}
io_device->base_addr = dist_base_address;
io_device->iodev_type = IODEV_DIST;
io_device->redist_vcpu = NULL;
- mutex_lock(&kvm->slots_lock);
- ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, dist_base_address,
- len, &io_device->dev);
- mutex_unlock(&kvm->slots_lock);
-
- return ret;
+ return kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, dist_base_address,
+ len, &io_device->dev);
}
diff --git a/arch/arm64/kvm/vgic/vgic-mmio.h b/arch/arm64/kvm/vgic/vgic-mmio.h
index 5b490a4dfa5e..50dc80220b0f 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio.h
+++ b/arch/arm64/kvm/vgic/vgic-mmio.h
@@ -213,6 +213,7 @@ void vgic_write_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid,
const u32 val);
unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev);
+unsigned int vgic_v2_init_cpuif_iodev(struct vgic_io_device *dev);
unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev);
diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c
index 645648349c99..585491fbda80 100644
--- a/arch/arm64/kvm/vgic/vgic-v2.c
+++ b/arch/arm64/kvm/vgic/vgic-v2.c
@@ -9,6 +9,7 @@
#include <kvm/arm_vgic.h>
#include <asm/kvm_mmu.h>
+#include "vgic-mmio.h"
#include "vgic.h"
static inline void vgic_v2_write_lr(int lr, u32 val)
@@ -26,11 +27,24 @@ void vgic_v2_init_lrs(void)
vgic_v2_write_lr(i, 0);
}
-void vgic_v2_set_underflow(struct kvm_vcpu *vcpu)
+void vgic_v2_configure_hcr(struct kvm_vcpu *vcpu,
+ struct ap_list_summary *als)
{
struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2;
- cpuif->vgic_hcr |= GICH_HCR_UIE;
+ cpuif->vgic_hcr = GICH_HCR_EN;
+
+ if (irqs_pending_outside_lrs(als))
+ cpuif->vgic_hcr |= GICH_HCR_NPIE;
+ if (irqs_active_outside_lrs(als))
+ cpuif->vgic_hcr |= GICH_HCR_LRENPIE;
+ if (irqs_outside_lrs(als))
+ cpuif->vgic_hcr |= GICH_HCR_UIE;
+
+ cpuif->vgic_hcr |= (cpuif->vgic_vmcr & GICH_VMCR_ENABLE_GRP0_MASK) ?
+ GICH_HCR_VGrp0DIE : GICH_HCR_VGrp0EIE;
+ cpuif->vgic_hcr |= (cpuif->vgic_vmcr & GICH_VMCR_ENABLE_GRP1_MASK) ?
+ GICH_HCR_VGrp1DIE : GICH_HCR_VGrp1EIE;
}
static bool lr_signals_eoi_mi(u32 lr_val)
@@ -39,43 +53,23 @@ static bool lr_signals_eoi_mi(u32 lr_val)
!(lr_val & GICH_LR_HW);
}
-/*
- * transfer the content of the LRs back into the corresponding ap_list:
- * - active bit is transferred as is
- * - pending bit is
- * - transferred as is in case of edge sensitive IRQs
- * - set to the line-level (resample time) for level sensitive IRQs
- */
-void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
+static void vgic_v2_fold_lr(struct kvm_vcpu *vcpu, u32 val)
{
- struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
- struct vgic_v2_cpu_if *cpuif = &vgic_cpu->vgic_v2;
- int lr;
-
- DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
-
- cpuif->vgic_hcr &= ~GICH_HCR_UIE;
-
- for (lr = 0; lr < vgic_cpu->vgic_v2.used_lrs; lr++) {
- u32 val = cpuif->vgic_lr[lr];
- u32 cpuid, intid = val & GICH_LR_VIRTUALID;
- struct vgic_irq *irq;
- bool deactivated;
-
- /* Extract the source vCPU id from the LR */
- cpuid = val & GICH_LR_PHYSID_CPUID;
- cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT;
- cpuid &= 7;
+ u32 cpuid, intid = val & GICH_LR_VIRTUALID;
+ struct vgic_irq *irq;
+ bool deactivated;
- /* Notify fds when the guest EOI'ed a level-triggered SPI */
- if (lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid))
- kvm_notify_acked_irq(vcpu->kvm, 0,
- intid - VGIC_NR_PRIVATE_IRQS);
+ /* Extract the source vCPU id from the LR */
+ cpuid = FIELD_GET(GICH_LR_PHYSID_CPUID, val) & 7;
- irq = vgic_get_irq(vcpu->kvm, vcpu, intid);
+ /* Notify fds when the guest EOI'ed a level-triggered SPI */
+ if (lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid))
+ kvm_notify_acked_irq(vcpu->kvm, 0,
+ intid - VGIC_NR_PRIVATE_IRQS);
- raw_spin_lock(&irq->irq_lock);
+ irq = vgic_get_vcpu_irq(vcpu, intid);
+ scoped_guard(raw_spinlock, &irq->irq_lock) {
/* Always preserve the active bit, note deactivation */
deactivated = irq->active && !(val & GICH_LR_ACTIVE_BIT);
irq->active = !!(val & GICH_LR_ACTIVE_BIT);
@@ -101,29 +95,139 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
/* Handle resampling for mapped interrupts if required */
vgic_irq_handle_resampling(irq, deactivated, val & GICH_LR_PENDING_BIT);
- raw_spin_unlock(&irq->irq_lock);
- vgic_put_irq(vcpu->kvm, irq);
+ irq->on_lr = false;
}
- cpuif->used_lrs = 0;
+ vgic_put_irq(vcpu->kvm, irq);
}
+static u32 vgic_v2_compute_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq);
+
/*
- * Populates the particular LR with the state of a given IRQ:
- * - for an edge sensitive IRQ the pending state is cleared in struct vgic_irq
- * - for a level sensitive IRQ the pending state value is unchanged;
- * it is dictated directly by the input level
- *
- * If @irq describes an SGI with multiple sources, we choose the
- * lowest-numbered source VCPU and clear that bit in the source bitmap.
- *
- * The irq_lock must be held by the caller.
+ * transfer the content of the LRs back into the corresponding ap_list:
+ * - active bit is transferred as is
+ * - pending bit is
+ * - transferred as is in case of edge sensitive IRQs
+ * - set to the line-level (resample time) for level sensitive IRQs
*/
-void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
+void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
+{
+ struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+ struct vgic_v2_cpu_if *cpuif = &vgic_cpu->vgic_v2;
+ u32 eoicount = FIELD_GET(GICH_HCR_EOICOUNT, cpuif->vgic_hcr);
+ struct vgic_irq *irq;
+
+ DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
+
+ for (int lr = 0; lr < vgic_cpu->vgic_v2.used_lrs; lr++)
+ vgic_v2_fold_lr(vcpu, cpuif->vgic_lr[lr]);
+
+ /* See the GICv3 equivalent for the EOIcount handling rationale */
+ list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
+ u32 lr;
+
+ if (!eoicount) {
+ break;
+ } else {
+ guard(raw_spinlock)(&irq->irq_lock);
+
+ if (!(likely(vgic_target_oracle(irq) == vcpu) &&
+ irq->active))
+ continue;
+
+ lr = vgic_v2_compute_lr(vcpu, irq) & ~GICH_LR_ACTIVE_BIT;
+ }
+
+ if (lr & GICH_LR_HW)
+ writel_relaxed(FIELD_GET(GICH_LR_PHYSID_CPUID, lr),
+ kvm_vgic_global_state.gicc_base + GIC_CPU_DEACTIVATE);
+ vgic_v2_fold_lr(vcpu, lr);
+ eoicount--;
+ }
+
+ cpuif->used_lrs = 0;
+}
+
+void vgic_v2_deactivate(struct kvm_vcpu *vcpu, u32 val)
+{
+ struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+ struct vgic_v2_cpu_if *cpuif = &vgic_cpu->vgic_v2;
+ struct kvm_vcpu *target_vcpu = NULL;
+ bool mmio = false;
+ struct vgic_irq *irq;
+ unsigned long flags;
+ u64 lr = 0;
+ u8 cpuid;
+
+ /* Snapshot CPUID, and remove it from the INTID */
+ cpuid = FIELD_GET(GENMASK_ULL(12, 10), val);
+ val &= ~GENMASK_ULL(12, 10);
+
+ /* We only deal with DIR when EOIMode==1 */
+ if (!(cpuif->vgic_vmcr & GICH_VMCR_EOI_MODE_MASK))
+ return;
+
+ /* Make sure we're in the same context as LR handling */
+ local_irq_save(flags);
+
+ irq = vgic_get_vcpu_irq(vcpu, val);
+ if (WARN_ON_ONCE(!irq))
+ goto out;
+
+ /* See the corresponding v3 code for the rationale */
+ scoped_guard(raw_spinlock, &irq->irq_lock) {
+ target_vcpu = irq->vcpu;
+
+ /* Not on any ap_list? */
+ if (!target_vcpu)
+ goto put;
+
+ /*
+ * Urgh. We're deactivating something that we cannot
+ * observe yet... Big hammer time.
+ */
+ if (irq->on_lr) {
+ mmio = true;
+ goto put;
+ }
+
+ /* SGI: check that the cpuid matches */
+ if (val < VGIC_NR_SGIS && irq->active_source != cpuid) {
+ target_vcpu = NULL;
+ goto put;
+ }
+
+ /* (with a Dalek voice) DEACTIVATE!!!! */
+ lr = vgic_v2_compute_lr(vcpu, irq) & ~GICH_LR_ACTIVE_BIT;
+ }
+
+ if (lr & GICH_LR_HW)
+ writel_relaxed(FIELD_GET(GICH_LR_PHYSID_CPUID, lr),
+ kvm_vgic_global_state.gicc_base + GIC_CPU_DEACTIVATE);
+
+ vgic_v2_fold_lr(vcpu, lr);
+
+put:
+ vgic_put_irq(vcpu->kvm, irq);
+
+out:
+ local_irq_restore(flags);
+
+ if (mmio)
+ vgic_mmio_write_cactive(vcpu, (val / 32) * 4, 4, BIT(val % 32));
+
+ /* Force the ap_list to be pruned */
+ if (target_vcpu)
+ kvm_make_request(KVM_REQ_VGIC_PROCESS_UPDATE, target_vcpu);
+}
+
+static u32 vgic_v2_compute_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq)
{
u32 val = irq->intid;
bool allow_pending = true;
+ WARN_ON(irq->on_lr);
+
if (irq->active) {
val |= GICH_LR_ACTIVE_BIT;
if (vgic_irq_is_sgi(irq->intid))
@@ -163,22 +267,52 @@ void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
if (allow_pending && irq_is_pending(irq)) {
val |= GICH_LR_PENDING_BIT;
- if (irq->config == VGIC_CONFIG_EDGE)
- irq->pending_latch = false;
-
if (vgic_irq_is_sgi(irq->intid)) {
u32 src = ffs(irq->source);
if (WARN_RATELIMIT(!src, "No SGI source for INTID %d\n",
irq->intid))
- return;
+ return 0;
val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT;
- irq->source &= ~(1 << (src - 1));
- if (irq->source) {
- irq->pending_latch = true;
+ if (irq->source & ~BIT(src - 1))
val |= GICH_LR_EOI;
- }
+ }
+ }
+
+ /* The GICv2 LR only holds five bits of priority. */
+ val |= (irq->priority >> 3) << GICH_LR_PRIORITY_SHIFT;
+
+ return val;
+}
+
+/*
+ * Populates the particular LR with the state of a given IRQ:
+ * - for an edge sensitive IRQ the pending state is cleared in struct vgic_irq
+ * - for a level sensitive IRQ the pending state value is unchanged;
+ * it is dictated directly by the input level
+ *
+ * If @irq describes an SGI with multiple sources, we choose the
+ * lowest-numbered source VCPU and clear that bit in the source bitmap.
+ *
+ * The irq_lock must be held by the caller.
+ */
+void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
+{
+ u32 val = vgic_v2_compute_lr(vcpu, irq);
+
+ vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = val;
+
+ if (val & GICH_LR_PENDING_BIT) {
+ if (irq->config == VGIC_CONFIG_EDGE)
+ irq->pending_latch = false;
+
+ if (vgic_irq_is_sgi(irq->intid)) {
+ u32 src = ffs(irq->source);
+
+ irq->source &= ~BIT(src - 1);
+ if (irq->source)
+ irq->pending_latch = true;
}
}
@@ -194,7 +328,7 @@ void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
/* The GICv2 LR only holds five bits of priority. */
val |= (irq->priority >> 3) << GICH_LR_PRIORITY_SHIFT;
- vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = val;
+ irq->on_lr = true;
}
void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr)
@@ -257,7 +391,7 @@ void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
GICH_VMCR_PRIMASK_SHIFT) << GICV_PMR_PRIORITY_SHIFT;
}
-void vgic_v2_enable(struct kvm_vcpu *vcpu)
+void vgic_v2_reset(struct kvm_vcpu *vcpu)
{
/*
* By forcing VMCR to zero, the GIC will restore the binary
@@ -265,9 +399,6 @@ void vgic_v2_enable(struct kvm_vcpu *vcpu)
* anyway.
*/
vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = 0;
-
- /* Get the show on the road... */
- vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr = GICH_HCR_EN;
}
/* check for overlapping regions and for regions crossing the end of memory */
@@ -289,6 +420,7 @@ static bool vgic_v2_check_base(gpa_t dist_base, gpa_t cpu_base)
int vgic_v2_map_resources(struct kvm *kvm)
{
struct vgic_dist *dist = &kvm->arch.vgic;
+ unsigned int len;
int ret = 0;
if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) ||
@@ -312,16 +444,20 @@ int vgic_v2_map_resources(struct kvm *kvm)
return ret;
}
- ret = vgic_register_dist_iodev(kvm, dist->vgic_dist_base, VGIC_V2);
- if (ret) {
- kvm_err("Unable to register VGIC MMIO regions\n");
+ len = vgic_v2_init_cpuif_iodev(&dist->cpuif_iodev);
+ dist->cpuif_iodev.base_addr = dist->vgic_cpu_base;
+ dist->cpuif_iodev.iodev_type = IODEV_CPUIF;
+ dist->cpuif_iodev.redist_vcpu = NULL;
+
+ ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, dist->vgic_cpu_base,
+ len, &dist->cpuif_iodev.dev);
+ if (ret)
return ret;
- }
if (!static_branch_unlikely(&vgic_v2_cpuif_trap)) {
ret = kvm_phys_addr_ioremap(kvm, dist->vgic_cpu_base,
kvm_vgic_global_state.vcpu_base,
- KVM_VGIC_V2_CPU_SIZE, true);
+ KVM_VGIC_V2_CPU_SIZE - SZ_4K, true);
if (ret) {
kvm_err("Unable to remap VGIC CPU to VCPU\n");
return ret;
@@ -391,6 +527,7 @@ int vgic_v2_probe(const struct gic_kvm_info *info)
kvm_vgic_global_state.can_emulate_gicv2 = true;
kvm_vgic_global_state.vcpu_base = info->vcpu.start;
+ kvm_vgic_global_state.gicc_base = info->gicc_base;
kvm_vgic_global_state.type = VGIC_V2;
kvm_vgic_global_state.max_gic_vcpus = VGIC_V2_MAX_CPUS;
@@ -429,16 +566,26 @@ static void save_lrs(struct kvm_vcpu *vcpu, void __iomem *base)
void vgic_v2_save_state(struct kvm_vcpu *vcpu)
{
+ struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
void __iomem *base = kvm_vgic_global_state.vctrl_base;
u64 used_lrs = vcpu->arch.vgic_cpu.vgic_v2.used_lrs;
if (!base)
return;
- if (used_lrs) {
+ cpu_if->vgic_vmcr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VMCR);
+
+ if (used_lrs)
save_lrs(vcpu, base);
- writel_relaxed(0, base + GICH_HCR);
+
+ if (cpu_if->vgic_hcr & GICH_HCR_LRENPIE) {
+ u32 val = readl_relaxed(base + GICH_HCR);
+
+ cpu_if->vgic_hcr &= ~GICH_HCR_EOICOUNT;
+ cpu_if->vgic_hcr |= val & GICH_HCR_EOICOUNT;
}
+
+ writel_relaxed(0, base + GICH_HCR);
}
void vgic_v2_restore_state(struct kvm_vcpu *vcpu)
@@ -451,13 +598,10 @@ void vgic_v2_restore_state(struct kvm_vcpu *vcpu)
if (!base)
return;
- if (used_lrs) {
- writel_relaxed(cpu_if->vgic_hcr, base + GICH_HCR);
- for (i = 0; i < used_lrs; i++) {
- writel_relaxed(cpu_if->vgic_lr[i],
- base + GICH_LR0 + (i * 4));
- }
- }
+ writel_relaxed(cpu_if->vgic_hcr, base + GICH_HCR);
+
+ for (i = 0; i < used_lrs; i++)
+ writel_relaxed(cpu_if->vgic_lr[i], base + GICH_LR0 + (i * 4));
}
void vgic_v2_load(struct kvm_vcpu *vcpu)
@@ -470,17 +614,9 @@ void vgic_v2_load(struct kvm_vcpu *vcpu)
kvm_vgic_global_state.vctrl_base + GICH_APR);
}
-void vgic_v2_vmcr_sync(struct kvm_vcpu *vcpu)
-{
- struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
-
- cpu_if->vgic_vmcr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VMCR);
-}
-
void vgic_v2_put(struct kvm_vcpu *vcpu)
{
struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
- vgic_v2_vmcr_sync(vcpu);
cpu_if->vgic_apr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_APR);
}
diff --git a/arch/arm64/kvm/vgic/vgic-v3-nested.c b/arch/arm64/kvm/vgic/vgic-v3-nested.c
new file mode 100644
index 000000000000..61b44f3f2bf1
--- /dev/null
+++ b/arch/arm64/kvm/vgic/vgic-v3-nested.c
@@ -0,0 +1,407 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/cpu.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/uaccess.h>
+
+#include <kvm/arm_vgic.h>
+
+#include <asm/kvm_arm.h>
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_nested.h>
+
+#include "vgic.h"
+
+#define ICH_LRN(n) (ICH_LR0_EL2 + (n))
+#define ICH_AP0RN(n) (ICH_AP0R0_EL2 + (n))
+#define ICH_AP1RN(n) (ICH_AP1R0_EL2 + (n))
+
+struct mi_state {
+ u16 eisr;
+ u16 elrsr;
+ bool pend;
+};
+
+/*
+ * The shadow registers loaded to the hardware when running a L2 guest
+ * with the virtual IMO/FMO bits set.
+ */
+struct shadow_if {
+ struct vgic_v3_cpu_if cpuif;
+ unsigned long lr_map;
+};
+
+static DEFINE_PER_CPU(struct shadow_if, shadow_if);
+
+static int lr_map_idx_to_shadow_idx(struct shadow_if *shadow_if, int idx)
+{
+ return hweight16(shadow_if->lr_map & (BIT(idx) - 1));
+}
+
+/*
+ * Nesting GICv3 support
+ *
+ * On a non-nesting VM (only running at EL0/EL1), the host hypervisor
+ * completely controls the interrupts injected via the list registers.
+ * Consequently, most of the state that is modified by the guest (by ACK-ing
+ * and EOI-ing interrupts) is synced by KVM on each entry/exit, so that we
+ * keep a semi-consistent view of the interrupts.
+ *
+ * This still applies for a NV guest, but only while "InHost" (either
+ * running at EL2, or at EL0 with HCR_EL2.{E2H.TGE}=={1,1}.
+ *
+ * When running a L2 guest ("not InHost"), things are radically different,
+ * as the L1 guest is in charge of provisioning the interrupts via its own
+ * view of the ICH_LR*_EL2 registers, which conveniently live in the VNCR
+ * page. This means that the flow described above does work (there is no
+ * state to rebuild in the L0 hypervisor), and that most things happed on L2
+ * load/put:
+ *
+ * - on L2 load: move the in-memory L1 vGIC configuration into a shadow,
+ * per-CPU data structure that is used to populate the actual LRs. This is
+ * an extra copy that we could avoid, but life is short. In the process,
+ * we remap any interrupt that has the HW bit set to the mapped interrupt
+ * on the host, should the host consider it a HW one. This allows the HW
+ * deactivation to take its course, such as for the timer.
+ *
+ * - on L2 put: perform the inverse transformation, so that the result of L2
+ * running becomes visible to L1 in the VNCR-accessible registers.
+ *
+ * - there is nothing to do on L2 entry apart from enabling the vgic, as
+ * everything will have happened on load. However, this is the point where
+ * we detect that an interrupt targeting L1 and prepare the grand
+ * switcheroo.
+ *
+ * - on L2 exit: resync the LRs and VMCR, emulate the HW bit, and deactivate
+ * corresponding the L1 interrupt. The L0 active state will be cleared by
+ * the HW if the L1 interrupt was itself backed by a HW interrupt.
+ *
+ * Maintenance Interrupt (MI) management:
+ *
+ * Since the L2 guest runs the vgic in its full glory, MIs get delivered and
+ * used as a handover point between L2 and L1.
+ *
+ * - on delivery of a MI to L0 while L2 is running: make the L1 MI pending,
+ * and let it rip. This will initiate a vcpu_put() on L2, and allow L1 to
+ * run and process the MI.
+ *
+ * - L1 MI is a fully virtual interrupt, not linked to the host's MI. Its
+ * state must be computed at each entry/exit of the guest, much like we do
+ * it for the PMU interrupt.
+ *
+ * - because most of the ICH_*_EL2 registers live in the VNCR page, the
+ * quality of emulation is poor: L1 can setup the vgic so that an MI would
+ * immediately fire, and not observe anything until the next exit.
+ * Similarly, a pending MI is not immediately disabled by clearing
+ * ICH_HCR_EL2.En. Trying to read ICH_MISR_EL2 would do the trick, for
+ * example.
+ *
+ * System register emulation:
+ *
+ * We get two classes of registers:
+ *
+ * - those backed by memory (LRs, APRs, HCR, VMCR): L1 can freely access
+ * them, and L0 doesn't see a thing.
+ *
+ * - those that always trap (ELRSR, EISR, MISR): these are status registers
+ * that are built on the fly based on the in-memory state.
+ *
+ * Only L1 can access the ICH_*_EL2 registers. A non-NV L2 obviously cannot,
+ * and a NV L2 would either access the VNCR page provided by L1 (memory
+ * based registers), or see the access redirected to L1 (registers that
+ * trap) thanks to NV being set by L1.
+ */
+
+bool vgic_state_is_nested(struct kvm_vcpu *vcpu)
+{
+ u64 xmo;
+
+ if (is_nested_ctxt(vcpu)) {
+ xmo = __vcpu_sys_reg(vcpu, HCR_EL2) & (HCR_IMO | HCR_FMO);
+ WARN_ONCE(xmo && xmo != (HCR_IMO | HCR_FMO),
+ "Separate virtual IRQ/FIQ settings not supported\n");
+
+ return !!xmo;
+ }
+
+ return false;
+}
+
+static struct shadow_if *get_shadow_if(void)
+{
+ return this_cpu_ptr(&shadow_if);
+}
+
+static bool lr_triggers_eoi(u64 lr)
+{
+ return !(lr & (ICH_LR_STATE | ICH_LR_HW)) && (lr & ICH_LR_EOI);
+}
+
+static void vgic_compute_mi_state(struct kvm_vcpu *vcpu, struct mi_state *mi_state)
+{
+ u16 eisr = 0, elrsr = 0;
+ bool pend = false;
+
+ for (int i = 0; i < kvm_vgic_global_state.nr_lr; i++) {
+ u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i));
+
+ if (lr_triggers_eoi(lr))
+ eisr |= BIT(i);
+ if (!(lr & ICH_LR_STATE))
+ elrsr |= BIT(i);
+ pend |= (lr & ICH_LR_PENDING_BIT);
+ }
+
+ mi_state->eisr = eisr;
+ mi_state->elrsr = elrsr;
+ mi_state->pend = pend;
+}
+
+u16 vgic_v3_get_eisr(struct kvm_vcpu *vcpu)
+{
+ struct mi_state mi_state;
+
+ vgic_compute_mi_state(vcpu, &mi_state);
+ return mi_state.eisr;
+}
+
+u16 vgic_v3_get_elrsr(struct kvm_vcpu *vcpu)
+{
+ struct mi_state mi_state;
+
+ vgic_compute_mi_state(vcpu, &mi_state);
+ return mi_state.elrsr;
+}
+
+u64 vgic_v3_get_misr(struct kvm_vcpu *vcpu)
+{
+ struct mi_state mi_state;
+ u64 reg = 0, hcr, vmcr;
+
+ hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2);
+ vmcr = __vcpu_sys_reg(vcpu, ICH_VMCR_EL2);
+
+ vgic_compute_mi_state(vcpu, &mi_state);
+
+ if (mi_state.eisr)
+ reg |= ICH_MISR_EL2_EOI;
+
+ if (__vcpu_sys_reg(vcpu, ICH_HCR_EL2) & ICH_HCR_EL2_UIE) {
+ int used_lrs = kvm_vgic_global_state.nr_lr;
+
+ used_lrs -= hweight16(mi_state.elrsr);
+ reg |= (used_lrs <= 1) ? ICH_MISR_EL2_U : 0;
+ }
+
+ if ((hcr & ICH_HCR_EL2_LRENPIE) && FIELD_GET(ICH_HCR_EL2_EOIcount_MASK, hcr))
+ reg |= ICH_MISR_EL2_LRENP;
+
+ if ((hcr & ICH_HCR_EL2_NPIE) && !mi_state.pend)
+ reg |= ICH_MISR_EL2_NP;
+
+ if ((hcr & ICH_HCR_EL2_VGrp0EIE) && (vmcr & ICH_VMCR_ENG0_MASK))
+ reg |= ICH_MISR_EL2_VGrp0E;
+
+ if ((hcr & ICH_HCR_EL2_VGrp0DIE) && !(vmcr & ICH_VMCR_ENG0_MASK))
+ reg |= ICH_MISR_EL2_VGrp0D;
+
+ if ((hcr & ICH_HCR_EL2_VGrp1EIE) && (vmcr & ICH_VMCR_ENG1_MASK))
+ reg |= ICH_MISR_EL2_VGrp1E;
+
+ if ((hcr & ICH_HCR_EL2_VGrp1DIE) && !(vmcr & ICH_VMCR_ENG1_MASK))
+ reg |= ICH_MISR_EL2_VGrp1D;
+
+ return reg;
+}
+
+static u64 translate_lr_pintid(struct kvm_vcpu *vcpu, u64 lr)
+{
+ struct vgic_irq *irq;
+
+ if (!(lr & ICH_LR_HW))
+ return lr;
+
+ /* We have the HW bit set, check for validity of pINTID */
+ irq = vgic_get_vcpu_irq(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr));
+ /* If there was no real mapping, nuke the HW bit */
+ if (!irq || !irq->hw || irq->intid > VGIC_MAX_SPI)
+ lr &= ~ICH_LR_HW;
+
+ /* Translate the virtual mapping to the real one, even if invalid */
+ if (irq) {
+ lr &= ~ICH_LR_PHYS_ID_MASK;
+ lr |= FIELD_PREP(ICH_LR_PHYS_ID_MASK, (u64)irq->hwintid);
+ vgic_put_irq(vcpu->kvm, irq);
+ }
+
+ return lr;
+}
+
+/*
+ * For LRs which have HW bit set such as timer interrupts, we modify them to
+ * have the host hardware interrupt number instead of the virtual one programmed
+ * by the guest hypervisor.
+ */
+static void vgic_v3_create_shadow_lr(struct kvm_vcpu *vcpu,
+ struct vgic_v3_cpu_if *s_cpu_if)
+{
+ struct shadow_if *shadow_if;
+
+ shadow_if = container_of(s_cpu_if, struct shadow_if, cpuif);
+ shadow_if->lr_map = 0;
+
+ for (int i = 0; i < kvm_vgic_global_state.nr_lr; i++) {
+ u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i));
+
+ if (!(lr & ICH_LR_STATE))
+ continue;
+
+ lr = translate_lr_pintid(vcpu, lr);
+
+ s_cpu_if->vgic_lr[hweight16(shadow_if->lr_map)] = lr;
+ shadow_if->lr_map |= BIT(i);
+ }
+
+ s_cpu_if->used_lrs = hweight16(shadow_if->lr_map);
+}
+
+void vgic_v3_flush_nested(struct kvm_vcpu *vcpu)
+{
+ u64 val = __vcpu_sys_reg(vcpu, ICH_HCR_EL2);
+
+ write_sysreg_s(val | vgic_ich_hcr_trap_bits(), SYS_ICH_HCR_EL2);
+}
+
+void vgic_v3_sync_nested(struct kvm_vcpu *vcpu)
+{
+ struct shadow_if *shadow_if = get_shadow_if();
+ int i;
+
+ for_each_set_bit(i, &shadow_if->lr_map, kvm_vgic_global_state.nr_lr) {
+ u64 val, host_lr, lr;
+
+ host_lr = __gic_v3_get_lr(lr_map_idx_to_shadow_idx(shadow_if, i));
+
+ /* Propagate the new LR state */
+ lr = __vcpu_sys_reg(vcpu, ICH_LRN(i));
+ val = lr & ~ICH_LR_STATE;
+ val |= host_lr & ICH_LR_STATE;
+ __vcpu_assign_sys_reg(vcpu, ICH_LRN(i), val);
+
+ /*
+ * Deactivation of a HW interrupt: the LR must have the HW
+ * bit set, have been in a non-invalid state before the run,
+ * and now be in an invalid state. If any of that doesn't
+ * hold, we're done with this LR.
+ */
+ if (!((lr & ICH_LR_HW) && (lr & ICH_LR_STATE) &&
+ !(host_lr & ICH_LR_STATE)))
+ continue;
+
+ /*
+ * If we had a HW lr programmed by the guest hypervisor, we
+ * need to emulate the HW effect between the guest hypervisor
+ * and the nested guest.
+ */
+ vgic_v3_deactivate(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr));
+ }
+
+ /* We need these to be synchronised to generate the MI */
+ __vcpu_assign_sys_reg(vcpu, ICH_VMCR_EL2, read_sysreg_s(SYS_ICH_VMCR_EL2));
+ __vcpu_rmw_sys_reg(vcpu, ICH_HCR_EL2, &=, ~ICH_HCR_EL2_EOIcount);
+ __vcpu_rmw_sys_reg(vcpu, ICH_HCR_EL2, |=, read_sysreg_s(SYS_ICH_HCR_EL2) & ICH_HCR_EL2_EOIcount);
+
+ write_sysreg_s(0, SYS_ICH_HCR_EL2);
+ isb();
+
+ vgic_v3_nested_update_mi(vcpu);
+}
+
+static void vgic_v3_create_shadow_state(struct kvm_vcpu *vcpu,
+ struct vgic_v3_cpu_if *s_cpu_if)
+{
+ struct vgic_v3_cpu_if *host_if = &vcpu->arch.vgic_cpu.vgic_v3;
+ int i;
+
+ s_cpu_if->vgic_hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2);
+ s_cpu_if->vgic_vmcr = __vcpu_sys_reg(vcpu, ICH_VMCR_EL2);
+ s_cpu_if->vgic_sre = host_if->vgic_sre;
+
+ for (i = 0; i < 4; i++) {
+ s_cpu_if->vgic_ap0r[i] = __vcpu_sys_reg(vcpu, ICH_AP0RN(i));
+ s_cpu_if->vgic_ap1r[i] = __vcpu_sys_reg(vcpu, ICH_AP1RN(i));
+ }
+
+ vgic_v3_create_shadow_lr(vcpu, s_cpu_if);
+}
+
+void vgic_v3_load_nested(struct kvm_vcpu *vcpu)
+{
+ struct shadow_if *shadow_if = get_shadow_if();
+ struct vgic_v3_cpu_if *cpu_if = &shadow_if->cpuif;
+
+ BUG_ON(!vgic_state_is_nested(vcpu));
+
+ vgic_v3_create_shadow_state(vcpu, cpu_if);
+
+ __vgic_v3_restore_vmcr_aprs(cpu_if);
+ __vgic_v3_activate_traps(cpu_if);
+
+ for (int i = 0; i < cpu_if->used_lrs; i++)
+ __gic_v3_set_lr(cpu_if->vgic_lr[i], i);
+
+ /*
+ * Propagate the number of used LRs for the benefit of the HYP
+ * GICv3 emulation code. Yes, this is a pretty sorry hack.
+ */
+ vcpu->arch.vgic_cpu.vgic_v3.used_lrs = cpu_if->used_lrs;
+}
+
+void vgic_v3_put_nested(struct kvm_vcpu *vcpu)
+{
+ struct shadow_if *shadow_if = get_shadow_if();
+ struct vgic_v3_cpu_if *s_cpu_if = &shadow_if->cpuif;
+ int i;
+
+ __vgic_v3_save_aprs(s_cpu_if);
+
+ for (i = 0; i < 4; i++) {
+ __vcpu_assign_sys_reg(vcpu, ICH_AP0RN(i), s_cpu_if->vgic_ap0r[i]);
+ __vcpu_assign_sys_reg(vcpu, ICH_AP1RN(i), s_cpu_if->vgic_ap1r[i]);
+ }
+
+ for (i = 0; i < s_cpu_if->used_lrs; i++)
+ __gic_v3_set_lr(0, i);
+
+ __vgic_v3_deactivate_traps(s_cpu_if);
+
+ vcpu->arch.vgic_cpu.vgic_v3.used_lrs = 0;
+}
+
+/*
+ * If we exit a L2 VM with a pending maintenance interrupt from the GIC,
+ * then we need to forward this to L1 so that it can re-sync the appropriate
+ * LRs and sample level triggered interrupts again.
+ */
+void vgic_v3_handle_nested_maint_irq(struct kvm_vcpu *vcpu)
+{
+ bool state = read_sysreg_s(SYS_ICH_MISR_EL2);
+
+ /* This will force a switch back to L1 if the level is high */
+ kvm_vgic_inject_irq(vcpu->kvm, vcpu,
+ vcpu->kvm->arch.vgic.mi_intid, state, vcpu);
+
+ sysreg_clear_set_s(SYS_ICH_HCR_EL2, ICH_HCR_EL2_En, 0);
+}
+
+void vgic_v3_nested_update_mi(struct kvm_vcpu *vcpu)
+{
+ bool level;
+
+ level = (__vcpu_sys_reg(vcpu, ICH_HCR_EL2) & ICH_HCR_EL2_En) && vgic_v3_get_misr(vcpu);
+ kvm_vgic_inject_irq(vcpu->kvm, vcpu,
+ vcpu->kvm->arch.vgic.mi_intid, level, vcpu);
+}
diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
index 2624963cb95b..1d6dd1b545bd 100644
--- a/arch/arm64/kvm/vgic/vgic-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-v3.c
@@ -3,13 +3,16 @@
#include <linux/irqchip/arm-gic-v3.h>
#include <linux/irq.h>
#include <linux/irqdomain.h>
+#include <linux/kstrtox.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
+#include <linux/string_choices.h>
#include <kvm/arm_vgic.h>
#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_asm.h>
+#include "vgic-mmio.h"
#include "vgic.h"
static bool group0_trap;
@@ -18,11 +21,48 @@ static bool common_trap;
static bool dir_trap;
static bool gicv4_enable;
-void vgic_v3_set_underflow(struct kvm_vcpu *vcpu)
+void vgic_v3_configure_hcr(struct kvm_vcpu *vcpu,
+ struct ap_list_summary *als)
{
struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3;
- cpuif->vgic_hcr |= ICH_HCR_UIE;
+ if (!irqchip_in_kernel(vcpu->kvm))
+ return;
+
+ cpuif->vgic_hcr = ICH_HCR_EL2_En;
+
+ if (irqs_pending_outside_lrs(als))
+ cpuif->vgic_hcr |= ICH_HCR_EL2_NPIE;
+ if (irqs_active_outside_lrs(als))
+ cpuif->vgic_hcr |= ICH_HCR_EL2_LRENPIE;
+ if (irqs_outside_lrs(als))
+ cpuif->vgic_hcr |= ICH_HCR_EL2_UIE;
+
+ if (!als->nr_sgi)
+ cpuif->vgic_hcr |= ICH_HCR_EL2_vSGIEOICount;
+
+ cpuif->vgic_hcr |= (cpuif->vgic_vmcr & ICH_VMCR_ENG0_MASK) ?
+ ICH_HCR_EL2_VGrp0DIE : ICH_HCR_EL2_VGrp0EIE;
+ cpuif->vgic_hcr |= (cpuif->vgic_vmcr & ICH_VMCR_ENG1_MASK) ?
+ ICH_HCR_EL2_VGrp1DIE : ICH_HCR_EL2_VGrp1EIE;
+
+ /*
+ * Dealing with EOImode=1 is a massive source of headache. Not
+ * only do we need to track that we have active interrupts
+ * outside of the LRs and force DIR to be trapped, we also
+ * need to deal with SPIs that can be deactivated on another
+ * CPU.
+ *
+ * On systems that do not implement TDIR, force the bit in the
+ * shadow state anyway to avoid IPI-ing on these poor sods.
+ *
+ * Note that we set the trap irrespective of EOIMode, as that
+ * can change behind our back without any warning...
+ */
+ if (!cpus_have_final_cap(ARM64_HAS_ICH_HCR_EL2_TDIR) ||
+ irqs_active_outside_lrs(als) ||
+ atomic_read(&vcpu->kvm->arch.vgic.active_spis))
+ cpuif->vgic_hcr |= ICH_HCR_EL2_TDIR;
}
static bool lr_signals_eoi_mi(u64 lr_val)
@@ -31,84 +71,238 @@ static bool lr_signals_eoi_mi(u64 lr_val)
!(lr_val & ICH_LR_HW);
}
+static void vgic_v3_fold_lr(struct kvm_vcpu *vcpu, u64 val)
+{
+ struct vgic_irq *irq;
+ bool is_v2_sgi = false;
+ bool deactivated;
+ u32 intid;
+
+ if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
+ intid = val & ICH_LR_VIRTUAL_ID_MASK;
+ } else {
+ intid = val & GICH_LR_VIRTUALID;
+ is_v2_sgi = vgic_irq_is_sgi(intid);
+ }
+
+ irq = vgic_get_vcpu_irq(vcpu, intid);
+ if (!irq) /* An LPI could have been unmapped. */
+ return;
+
+ scoped_guard(raw_spinlock, &irq->irq_lock) {
+ /* Always preserve the active bit for !LPIs, note deactivation */
+ if (irq->intid >= VGIC_MIN_LPI)
+ val &= ~ICH_LR_ACTIVE_BIT;
+ deactivated = irq->active && !(val & ICH_LR_ACTIVE_BIT);
+ irq->active = !!(val & ICH_LR_ACTIVE_BIT);
+
+ /* Edge is the only case where we preserve the pending bit */
+ if (irq->config == VGIC_CONFIG_EDGE &&
+ (val & ICH_LR_PENDING_BIT))
+ irq->pending_latch = true;
+
+ /*
+ * Clear soft pending state when level irqs have been acked.
+ */
+ if (irq->config == VGIC_CONFIG_LEVEL && !(val & ICH_LR_STATE))
+ irq->pending_latch = false;
+
+ if (is_v2_sgi) {
+ u8 cpuid = FIELD_GET(GICH_LR_PHYSID_CPUID, val);
+
+ if (irq->active)
+ irq->active_source = cpuid;
+
+ if (val & ICH_LR_PENDING_BIT)
+ irq->source |= BIT(cpuid);
+ }
+
+ /* Handle resampling for mapped interrupts if required */
+ vgic_irq_handle_resampling(irq, deactivated, val & ICH_LR_PENDING_BIT);
+
+ irq->on_lr = false;
+ }
+
+ /* Notify fds when the guest EOI'ed a level-triggered SPI, and drop the refcount */
+ if (deactivated && lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid)) {
+ kvm_notify_acked_irq(vcpu->kvm, 0,
+ intid - VGIC_NR_PRIVATE_IRQS);
+ atomic_dec_if_positive(&vcpu->kvm->arch.vgic.active_spis);
+ }
+
+ vgic_put_irq(vcpu->kvm, irq);
+}
+
+static u64 vgic_v3_compute_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq);
+
+static void vgic_v3_deactivate_phys(u32 intid)
+{
+ if (cpus_have_final_cap(ARM64_HAS_GICV5_LEGACY))
+ gic_insn(intid | FIELD_PREP(GICV5_GIC_CDDI_TYPE_MASK, 1), CDDI);
+ else
+ gic_write_dir(intid);
+}
+
void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
{
struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
struct vgic_v3_cpu_if *cpuif = &vgic_cpu->vgic_v3;
- u32 model = vcpu->kvm->arch.vgic.vgic_model;
- int lr;
+ u32 eoicount = FIELD_GET(ICH_HCR_EL2_EOIcount, cpuif->vgic_hcr);
+ struct vgic_irq *irq;
DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
- cpuif->vgic_hcr &= ~ICH_HCR_UIE;
-
- for (lr = 0; lr < cpuif->used_lrs; lr++) {
- u64 val = cpuif->vgic_lr[lr];
- u32 intid, cpuid;
- struct vgic_irq *irq;
- bool is_v2_sgi = false;
- bool deactivated;
+ for (int lr = 0; lr < cpuif->used_lrs; lr++)
+ vgic_v3_fold_lr(vcpu, cpuif->vgic_lr[lr]);
- cpuid = val & GICH_LR_PHYSID_CPUID;
- cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT;
+ /*
+ * EOIMode=0: use EOIcount to emulate deactivation. We are
+ * guaranteed to deactivate in reverse order of the activation, so
+ * just pick one active interrupt after the other in the ap_list,
+ * and replay the deactivation as if the CPU was doing it. We also
+ * rely on priority drop to have taken place, and the list to be
+ * sorted by priority.
+ */
+ list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
+ u64 lr;
- if (model == KVM_DEV_TYPE_ARM_VGIC_V3) {
- intid = val & ICH_LR_VIRTUAL_ID_MASK;
+ /*
+ * I would have loved to write this using a scoped_guard(),
+ * but using 'continue' here is a total train wreck.
+ */
+ if (!eoicount) {
+ break;
} else {
- intid = val & GICH_LR_VIRTUALID;
- is_v2_sgi = vgic_irq_is_sgi(intid);
+ guard(raw_spinlock)(&irq->irq_lock);
+
+ if (!(likely(vgic_target_oracle(irq) == vcpu) &&
+ irq->active))
+ continue;
+
+ lr = vgic_v3_compute_lr(vcpu, irq) & ~ICH_LR_ACTIVE_BIT;
}
- /* Notify fds when the guest EOI'ed a level-triggered IRQ */
- if (lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid))
- kvm_notify_acked_irq(vcpu->kvm, 0,
- intid - VGIC_NR_PRIVATE_IRQS);
+ if (lr & ICH_LR_HW)
+ vgic_v3_deactivate_phys(FIELD_GET(ICH_LR_PHYS_ID_MASK, lr));
- irq = vgic_get_irq(vcpu->kvm, vcpu, intid);
- if (!irq) /* An LPI could have been unmapped. */
- continue;
+ vgic_v3_fold_lr(vcpu, lr);
+ eoicount--;
+ }
- raw_spin_lock(&irq->irq_lock);
+ cpuif->used_lrs = 0;
+}
- /* Always preserve the active bit, note deactivation */
- deactivated = irq->active && !(val & ICH_LR_ACTIVE_BIT);
- irq->active = !!(val & ICH_LR_ACTIVE_BIT);
+void vgic_v3_deactivate(struct kvm_vcpu *vcpu, u64 val)
+{
+ struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+ struct vgic_v3_cpu_if *cpuif = &vgic_cpu->vgic_v3;
+ u32 model = vcpu->kvm->arch.vgic.vgic_model;
+ struct kvm_vcpu *target_vcpu = NULL;
+ bool mmio = false, is_v2_sgi;
+ struct vgic_irq *irq;
+ unsigned long flags;
+ u64 lr = 0;
+ u8 cpuid;
- if (irq->active && is_v2_sgi)
- irq->active_source = cpuid;
+ /* Snapshot CPUID, and remove it from the INTID */
+ cpuid = FIELD_GET(GENMASK_ULL(12, 10), val);
+ val &= ~GENMASK_ULL(12, 10);
- /* Edge is the only case where we preserve the pending bit */
- if (irq->config == VGIC_CONFIG_EDGE &&
- (val & ICH_LR_PENDING_BIT)) {
- irq->pending_latch = true;
+ is_v2_sgi = (model == KVM_DEV_TYPE_ARM_VGIC_V2 &&
+ val < VGIC_NR_SGIS);
- if (is_v2_sgi)
- irq->source |= (1 << cpuid);
- }
+ /*
+ * We only deal with DIR when EOIMode==1, and only for SGI,
+ * PPI or SPI.
+ */
+ if (!(cpuif->vgic_vmcr & ICH_VMCR_EOIM_MASK) ||
+ val >= vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS)
+ return;
+
+ /* Make sure we're in the same context as LR handling */
+ local_irq_save(flags);
+
+ irq = vgic_get_vcpu_irq(vcpu, val);
+ if (WARN_ON_ONCE(!irq))
+ goto out;
+
+ /*
+ * EOIMode=1: we must rely on traps to handle deactivate of
+ * overflowing interrupts, as there is no ordering guarantee and
+ * EOIcount isn't being incremented. Priority drop will have taken
+ * place, as ICV_EOIxR_EL1 only affects the APRs and not the LRs.
+ *
+ * Three possibities:
+ *
+ * - The irq is not queued on any CPU, and there is nothing to
+ * do,
+ *
+ * - Or the irq is in an LR, meaning that its state is not
+ * directly observable. Treat it bluntly by making it as if
+ * this was a write to GICD_ICACTIVER, which will force an
+ * exit on all vcpus. If it hurts, don't do that.
+ *
+ * - Or the irq is active, but not in an LR, and we can
+ * directly deactivate it by building a pseudo-LR, fold it,
+ * and queue a request to prune the resulting ap_list,
+ *
+ * Special care must be taken to match the source CPUID when
+ * deactivating a GICv2 SGI.
+ */
+ scoped_guard(raw_spinlock, &irq->irq_lock) {
+ target_vcpu = irq->vcpu;
+
+ /* Not on any ap_list? */
+ if (!target_vcpu)
+ goto put;
/*
- * Clear soft pending state when level irqs have been acked.
+ * Urgh. We're deactivating something that we cannot
+ * observe yet... Big hammer time.
*/
- if (irq->config == VGIC_CONFIG_LEVEL && !(val & ICH_LR_STATE))
- irq->pending_latch = false;
+ if (irq->on_lr) {
+ mmio = true;
+ goto put;
+ }
- /* Handle resampling for mapped interrupts if required */
- vgic_irq_handle_resampling(irq, deactivated, val & ICH_LR_PENDING_BIT);
+ /* GICv2 SGI: check that the cpuid matches */
+ if (is_v2_sgi && irq->active_source != cpuid) {
+ target_vcpu = NULL;
+ goto put;
+ }
- raw_spin_unlock(&irq->irq_lock);
- vgic_put_irq(vcpu->kvm, irq);
+ /* (with a Dalek voice) DEACTIVATE!!!! */
+ lr = vgic_v3_compute_lr(vcpu, irq) & ~ICH_LR_ACTIVE_BIT;
}
- cpuif->used_lrs = 0;
+ if (lr & ICH_LR_HW)
+ vgic_v3_deactivate_phys(FIELD_GET(ICH_LR_PHYS_ID_MASK, lr));
+
+ vgic_v3_fold_lr(vcpu, lr);
+
+put:
+ vgic_put_irq(vcpu->kvm, irq);
+
+out:
+ local_irq_restore(flags);
+
+ if (mmio)
+ vgic_mmio_write_cactive(vcpu, (val / 32) * 4, 4, BIT(val % 32));
+
+ /* Force the ap_list to be pruned */
+ if (target_vcpu)
+ kvm_make_request(KVM_REQ_VGIC_PROCESS_UPDATE, target_vcpu);
}
/* Requires the irq to be locked already */
-void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
+static u64 vgic_v3_compute_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq)
{
u32 model = vcpu->kvm->arch.vgic.vgic_model;
u64 val = irq->intid;
bool allow_pending = true, is_v2_sgi;
+ WARN_ON(irq->on_lr);
+
is_v2_sgi = (vgic_irq_is_sgi(irq->intid) &&
model == KVM_DEV_TYPE_ARM_VGIC_V2);
@@ -148,6 +342,35 @@ void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
if (allow_pending && irq_is_pending(irq)) {
val |= ICH_LR_PENDING_BIT;
+ if (is_v2_sgi) {
+ u32 src = ffs(irq->source);
+
+ if (WARN_RATELIMIT(!src, "No SGI source for INTID %d\n",
+ irq->intid))
+ return 0;
+
+ val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT;
+ if (irq->source & ~BIT(src - 1))
+ val |= ICH_LR_EOI;
+ }
+ }
+
+ if (irq->group)
+ val |= ICH_LR_GROUP;
+
+ val |= (u64)irq->priority << ICH_LR_PRIORITY_SHIFT;
+
+ return val;
+}
+
+void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
+{
+ u32 model = vcpu->kvm->arch.vgic.vgic_model;
+ u64 val = vgic_v3_compute_lr(vcpu, irq);
+
+ vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = val;
+
+ if (val & ICH_LR_PENDING_BIT) {
if (irq->config == VGIC_CONFIG_EDGE)
irq->pending_latch = false;
@@ -155,16 +378,9 @@ void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
model == KVM_DEV_TYPE_ARM_VGIC_V2) {
u32 src = ffs(irq->source);
- if (WARN_RATELIMIT(!src, "No SGI source for INTID %d\n",
- irq->intid))
- return;
-
- val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT;
- irq->source &= ~(1 << (src - 1));
- if (irq->source) {
+ irq->source &= ~BIT(src - 1);
+ if (irq->source)
irq->pending_latch = true;
- val |= ICH_LR_EOI;
- }
}
}
@@ -177,12 +393,7 @@ void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
if (vgic_irq_is_mapped_level(irq) && (val & ICH_LR_PENDING_BIT))
irq->line_level = false;
- if (irq->group)
- val |= ICH_LR_GROUP;
-
- val |= (u64)irq->priority << ICH_LR_PRIORITY_SHIFT;
-
- vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = val;
+ irq->on_lr = true;
}
void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr)
@@ -256,7 +467,7 @@ void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
GIC_BASER_CACHEABILITY(GICR_PENDBASER, OUTER, SameAsInner) | \
GIC_BASER_SHAREABILITY(GICR_PENDBASER, InnerShareable))
-void vgic_v3_enable(struct kvm_vcpu *vcpu)
+void vgic_v3_reset(struct kvm_vcpu *vcpu)
{
struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3;
@@ -282,23 +493,24 @@ void vgic_v3_enable(struct kvm_vcpu *vcpu)
vgic_v3->vgic_sre = 0;
}
- vcpu->arch.vgic_cpu.num_id_bits = (kvm_vgic_global_state.ich_vtr_el2 &
- ICH_VTR_ID_BITS_MASK) >>
- ICH_VTR_ID_BITS_SHIFT;
- vcpu->arch.vgic_cpu.num_pri_bits = ((kvm_vgic_global_state.ich_vtr_el2 &
- ICH_VTR_PRI_BITS_MASK) >>
- ICH_VTR_PRI_BITS_SHIFT) + 1;
+ vcpu->arch.vgic_cpu.num_id_bits = FIELD_GET(ICH_VTR_EL2_IDbits,
+ kvm_vgic_global_state.ich_vtr_el2);
+ vcpu->arch.vgic_cpu.num_pri_bits = FIELD_GET(ICH_VTR_EL2_PRIbits,
+ kvm_vgic_global_state.ich_vtr_el2) + 1;
+}
- /* Get the show on the road... */
- vgic_v3->vgic_hcr = ICH_HCR_EN;
- if (group0_trap)
- vgic_v3->vgic_hcr |= ICH_HCR_TALL0;
- if (group1_trap)
- vgic_v3->vgic_hcr |= ICH_HCR_TALL1;
- if (common_trap)
- vgic_v3->vgic_hcr |= ICH_HCR_TC;
- if (dir_trap)
- vgic_v3->vgic_hcr |= ICH_HCR_TDIR;
+void vcpu_set_ich_hcr(struct kvm_vcpu *vcpu)
+{
+ struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3;
+
+ if (!vgic_is_v3(vcpu->kvm))
+ return;
+
+ /* Hide GICv3 sysreg if necessary */
+ if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2 ||
+ !irqchip_in_kernel(vcpu->kvm))
+ vgic_v3->vgic_hcr |= (ICH_HCR_EL2_TALL0 | ICH_HCR_EL2_TALL1 |
+ ICH_HCR_EL2_TC);
}
int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq)
@@ -339,7 +551,7 @@ retry:
if (status) {
/* clear consumed data */
val &= ~(1 << bit_nr);
- ret = kvm_write_guest_lock(kvm, ptr, &val, 1);
+ ret = vgic_write_guest_lock(kvm, ptr, &val, 1);
if (ret)
return ret;
}
@@ -369,7 +581,7 @@ static void map_all_vpes(struct kvm *kvm)
dist->its_vm.vpes[i]->irq));
}
-/**
+/*
* vgic_v3_save_pending_tables - Save the pending tables into guest RAM
* kvm lock and all vcpu lock must be held
*/
@@ -379,6 +591,7 @@ int vgic_v3_save_pending_tables(struct kvm *kvm)
struct vgic_irq *irq;
gpa_t last_ptr = ~(gpa_t)0;
bool vlpi_avail = false;
+ unsigned long index;
int ret = 0;
u8 val;
@@ -395,7 +608,7 @@ int vgic_v3_save_pending_tables(struct kvm *kvm)
vlpi_avail = true;
}
- list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
+ xa_for_each(&dist->lpi_xa, index, irq) {
int byte_offset, bit_nr;
struct kvm_vcpu *vcpu;
gpa_t pendbase, ptr;
@@ -434,7 +647,7 @@ int vgic_v3_save_pending_tables(struct kvm *kvm)
else
val &= ~(1 << bit_nr);
- ret = kvm_write_guest_lock(kvm, ptr, &val, 1);
+ ret = vgic_write_guest_lock(kvm, ptr, &val, 1);
if (ret)
goto out;
}
@@ -538,7 +751,6 @@ int vgic_v3_map_resources(struct kvm *kvm)
{
struct vgic_dist *dist = &kvm->arch.vgic;
struct kvm_vcpu *vcpu;
- int ret = 0;
unsigned long c;
kvm_for_each_vcpu(c, vcpu, kvm) {
@@ -568,12 +780,6 @@ int vgic_v3_map_resources(struct kvm *kvm)
return -EBUSY;
}
- ret = vgic_register_dist_iodev(kvm, dist->vgic_dist_base, VGIC_V3);
- if (ret) {
- kvm_err("Unable to register VGICv3 dist MMIO regions\n");
- return ret;
- }
-
if (kvm_vgic_global_state.has_gicv4_1)
vgic_v4_configure_vsgis(kvm);
@@ -581,28 +787,29 @@ int vgic_v3_map_resources(struct kvm *kvm)
}
DEFINE_STATIC_KEY_FALSE(vgic_v3_cpuif_trap);
+DEFINE_STATIC_KEY_FALSE(vgic_v3_has_v2_compat);
static int __init early_group0_trap_cfg(char *buf)
{
- return strtobool(buf, &group0_trap);
+ return kstrtobool(buf, &group0_trap);
}
early_param("kvm-arm.vgic_v3_group0_trap", early_group0_trap_cfg);
static int __init early_group1_trap_cfg(char *buf)
{
- return strtobool(buf, &group1_trap);
+ return kstrtobool(buf, &group1_trap);
}
early_param("kvm-arm.vgic_v3_group1_trap", early_group1_trap_cfg);
static int __init early_common_trap_cfg(char *buf)
{
- return strtobool(buf, &common_trap);
+ return kstrtobool(buf, &common_trap);
}
early_param("kvm-arm.vgic_v3_common_trap", early_common_trap_cfg);
static int __init early_gicv4_enable(char *buf)
{
- return strtobool(buf, &gicv4_enable);
+ return kstrtobool(buf, &gicv4_enable);
}
early_param("kvm-arm.vgic_v4_enable", early_gicv4_enable);
@@ -615,13 +822,62 @@ static const struct midr_range broken_seis[] = {
MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM_MAX),
MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD),
MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE),
+ MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_PRO),
+ MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_PRO),
+ MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_MAX),
+ MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_MAX),
{},
};
static bool vgic_v3_broken_seis(void)
{
- return ((kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_SEIS_MASK) &&
- is_midr_in_range_list(read_cpuid_id(), broken_seis));
+ return (is_kernel_in_hyp_mode() &&
+ is_midr_in_range_list(broken_seis) &&
+ (read_sysreg_s(SYS_ICH_VTR_EL2) & ICH_VTR_EL2_SEIS));
+}
+
+void noinstr kvm_compute_ich_hcr_trap_bits(struct alt_instr *alt,
+ __le32 *origptr, __le32 *updptr,
+ int nr_inst)
+{
+ u32 insn, oinsn, rd;
+ u64 hcr = 0;
+
+ if (cpus_have_cap(ARM64_WORKAROUND_CAVIUM_30115)) {
+ group0_trap = true;
+ group1_trap = true;
+ }
+
+ if (vgic_v3_broken_seis()) {
+ /* We know that these machines have ICH_HCR_EL2.TDIR */
+ group0_trap = true;
+ group1_trap = true;
+ dir_trap = true;
+ }
+
+ if (!cpus_have_cap(ARM64_HAS_ICH_HCR_EL2_TDIR))
+ common_trap = true;
+
+ if (group0_trap)
+ hcr |= ICH_HCR_EL2_TALL0;
+ if (group1_trap)
+ hcr |= ICH_HCR_EL2_TALL1;
+ if (common_trap)
+ hcr |= ICH_HCR_EL2_TC;
+ if (dir_trap)
+ hcr |= ICH_HCR_EL2_TDIR;
+
+ /* Compute target register */
+ oinsn = le32_to_cpu(*origptr);
+ rd = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RD, oinsn);
+
+ /* movz rd, #(val & 0xffff) */
+ insn = aarch64_insn_gen_movewide(rd,
+ (u16)hcr,
+ 0,
+ AARCH64_INSN_VARIANT_64BIT,
+ AARCH64_INSN_MOVEWIDE_ZERO);
+ *updptr = cpu_to_le32(insn);
}
/**
@@ -635,6 +891,7 @@ int vgic_v3_probe(const struct gic_kvm_info *info)
{
u64 ich_vtr_el2 = kvm_call_hyp_ret(__vgic_v3_get_gic_config);
bool has_v2;
+ u64 traps;
int ret;
has_v2 = ich_vtr_el2 >> 63;
@@ -652,9 +909,9 @@ int vgic_v3_probe(const struct gic_kvm_info *info)
if (info->has_v4) {
kvm_vgic_global_state.has_gicv4 = gicv4_enable;
kvm_vgic_global_state.has_gicv4_1 = info->has_v4_1 && gicv4_enable;
- kvm_info("GICv4%s support %sabled\n",
+ kvm_info("GICv4%s support %s\n",
kvm_vgic_global_state.has_gicv4_1 ? ".1" : "",
- gicv4_enable ? "en" : "dis");
+ str_enabled_disabled(gicv4_enable));
}
kvm_vgic_global_state.vcpu_base = 0;
@@ -686,29 +943,25 @@ int vgic_v3_probe(const struct gic_kvm_info *info)
if (kvm_vgic_global_state.vcpu_base == 0)
kvm_info("disabling GICv2 emulation\n");
- if (cpus_have_const_cap(ARM64_WORKAROUND_CAVIUM_30115)) {
- group0_trap = true;
- group1_trap = true;
- }
+ /*
+ * Flip the static branch if the HW supports v2, even if we're
+ * not using it (such as in protected mode).
+ */
+ if (has_v2)
+ static_branch_enable(&vgic_v3_has_v2_compat);
if (vgic_v3_broken_seis()) {
kvm_info("GICv3 with broken locally generated SEI\n");
-
- kvm_vgic_global_state.ich_vtr_el2 &= ~ICH_VTR_SEIS_MASK;
- group0_trap = true;
- group1_trap = true;
- if (ich_vtr_el2 & ICH_VTR_TDS_MASK)
- dir_trap = true;
- else
- common_trap = true;
+ kvm_vgic_global_state.ich_vtr_el2 &= ~ICH_VTR_EL2_SEIS;
}
- if (group0_trap || group1_trap || common_trap | dir_trap) {
+ traps = vgic_ich_hcr_trap_bits();
+ if (traps) {
kvm_info("GICv3 sysreg trapping enabled ([%s%s%s%s], reduced performance)\n",
- group0_trap ? "G0" : "",
- group1_trap ? "G1" : "",
- common_trap ? "C" : "",
- dir_trap ? "D" : "");
+ (traps & ICH_HCR_EL2_TALL0) ? "G0" : "",
+ (traps & ICH_HCR_EL2_TALL1) ? "G1" : "",
+ (traps & ICH_HCR_EL2_TC) ? "C" : "",
+ (traps & ICH_HCR_EL2_TDIR) ? "D" : "");
static_branch_enable(&vgic_v3_cpuif_trap);
}
@@ -723,15 +976,14 @@ void vgic_v3_load(struct kvm_vcpu *vcpu)
{
struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
- /*
- * If dealing with a GICv2 emulation on GICv3, VMCR_EL2.VFIQen
- * is dependent on ICC_SRE_EL1.SRE, and we have to perform the
- * VMCR_EL2 save/restore in the world switch.
- */
- if (likely(cpu_if->vgic_sre))
- kvm_call_hyp(__vgic_v3_write_vmcr, cpu_if->vgic_vmcr);
+ /* If the vgic is nested, perform the full state loading */
+ if (vgic_state_is_nested(vcpu)) {
+ vgic_v3_load_nested(vcpu);
+ return;
+ }
- kvm_call_hyp(__vgic_v3_restore_aprs, cpu_if);
+ if (likely(!is_protected_kvm_enabled()))
+ kvm_call_hyp(__vgic_v3_restore_vmcr_aprs, cpu_if);
if (has_vhe())
__vgic_v3_activate_traps(cpu_if);
@@ -739,23 +991,18 @@ void vgic_v3_load(struct kvm_vcpu *vcpu)
WARN_ON(vgic_v4_load(vcpu));
}
-void vgic_v3_vmcr_sync(struct kvm_vcpu *vcpu)
-{
- struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
-
- if (likely(cpu_if->vgic_sre))
- cpu_if->vgic_vmcr = kvm_call_hyp_ret(__vgic_v3_read_vmcr);
-}
-
void vgic_v3_put(struct kvm_vcpu *vcpu)
{
struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
- WARN_ON(vgic_v4_put(vcpu, false));
-
- vgic_v3_vmcr_sync(vcpu);
+ if (vgic_state_is_nested(vcpu)) {
+ vgic_v3_put_nested(vcpu);
+ return;
+ }
- kvm_call_hyp(__vgic_v3_save_aprs, cpu_if);
+ if (likely(!is_protected_kvm_enabled()))
+ kvm_call_hyp(__vgic_v3_save_aprs, cpu_if);
+ WARN_ON(vgic_v4_put(vcpu));
if (has_vhe())
__vgic_v3_deactivate_traps(cpu_if);
diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c
index a413718be92b..09c3e9eb23f8 100644
--- a/arch/arm64/kvm/vgic/vgic-v4.c
+++ b/arch/arm64/kvm/vgic/vgic-v4.c
@@ -123,7 +123,7 @@ static void vgic_v4_enable_vsgis(struct kvm_vcpu *vcpu)
* IRQ. The SGI code will do its magic.
*/
for (i = 0; i < VGIC_NR_SGIS; i++) {
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, i);
+ struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, i);
struct irq_desc *desc;
unsigned long flags;
int ret;
@@ -160,9 +160,10 @@ static void vgic_v4_disable_vsgis(struct kvm_vcpu *vcpu)
int i;
for (i = 0; i < VGIC_NR_SGIS; i++) {
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, i);
+ struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, i);
struct irq_desc *desc;
unsigned long flags;
+ bool pending;
int ret;
raw_spin_lock_irqsave(&irq->irq_lock, flags);
@@ -173,9 +174,11 @@ static void vgic_v4_disable_vsgis(struct kvm_vcpu *vcpu)
irq->hw = false;
ret = irq_get_irqchip_state(irq->host_irq,
IRQCHIP_STATE_PENDING,
- &irq->pending_latch);
+ &pending);
WARN_ON(ret);
+ irq->pending_latch = pending;
+
desc = irq_to_desc(irq->host_irq);
irq_domain_deactivate_irq(irq_desc_get_irq_data(desc));
unlock:
@@ -184,13 +187,14 @@ static void vgic_v4_disable_vsgis(struct kvm_vcpu *vcpu)
}
}
-/* Must be called with the kvm lock held */
void vgic_v4_configure_vsgis(struct kvm *kvm)
{
struct vgic_dist *dist = &kvm->arch.vgic;
struct kvm_vcpu *vcpu;
unsigned long i;
+ lockdep_assert_held(&kvm->arch.config_lock);
+
kvm_arm_halt_guest(kvm);
kvm_for_each_vcpu(i, vcpu, kvm) {
@@ -232,9 +236,8 @@ int vgic_v4_request_vpe_irq(struct kvm_vcpu *vcpu, int irq)
* @kvm: Pointer to the VM being initialized
*
* We may be called each time a vITS is created, or when the
- * vgic is initialized. This relies on kvm->lock to be
- * held. In both cases, the number of vcpus should now be
- * fixed.
+ * vgic is initialized. In both cases, the number of vcpus
+ * should now be fixed.
*/
int vgic_v4_init(struct kvm *kvm)
{
@@ -243,6 +246,8 @@ int vgic_v4_init(struct kvm *kvm)
int nr_vcpus, ret;
unsigned long i;
+ lockdep_assert_held(&kvm->arch.config_lock);
+
if (!kvm_vgic_global_state.has_gicv4)
return 0; /* Nothing to see here... move along. */
@@ -309,14 +314,14 @@ int vgic_v4_init(struct kvm *kvm)
/**
* vgic_v4_teardown - Free the GICv4 data structures
* @kvm: Pointer to the VM being destroyed
- *
- * Relies on kvm->lock to be held.
*/
void vgic_v4_teardown(struct kvm *kvm)
{
struct its_vm *its_vm = &kvm->arch.vgic.its_vm;
int i;
+ lockdep_assert_held(&kvm->arch.config_lock);
+
if (!its_vm->vpes)
return;
@@ -334,14 +339,30 @@ void vgic_v4_teardown(struct kvm *kvm)
its_vm->vpes = NULL;
}
-int vgic_v4_put(struct kvm_vcpu *vcpu, bool need_db)
+static inline bool vgic_v4_want_doorbell(struct kvm_vcpu *vcpu)
+{
+ if (vcpu_get_flag(vcpu, IN_WFI))
+ return true;
+
+ if (likely(!vcpu_has_nv(vcpu)))
+ return false;
+
+ /*
+ * GICv4 hardware is only ever used for the L1. Mark the vPE (i.e. the
+ * L1 context) nonresident and request a doorbell to kick us out of the
+ * L2 when an IRQ becomes pending.
+ */
+ return vcpu_get_flag(vcpu, IN_NESTED_ERET);
+}
+
+int vgic_v4_put(struct kvm_vcpu *vcpu)
{
struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
- if (!vgic_supports_direct_msis(vcpu->kvm) || !vpe->resident)
+ if (!vgic_supports_direct_irqs(vcpu->kvm) || !vpe->resident)
return 0;
- return its_make_vpe_non_resident(vpe, need_db);
+ return its_make_vpe_non_resident(vpe, vgic_v4_want_doorbell(vcpu));
}
int vgic_v4_load(struct kvm_vcpu *vcpu)
@@ -349,7 +370,10 @@ int vgic_v4_load(struct kvm_vcpu *vcpu)
struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
int err;
- if (!vgic_supports_direct_msis(vcpu->kvm) || vpe->resident)
+ if (!vgic_supports_direct_irqs(vcpu->kvm) || vpe->resident)
+ return 0;
+
+ if (vcpu_get_flag(vcpu, IN_WFI))
return 0;
/*
@@ -410,7 +434,7 @@ int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int virq,
struct vgic_irq *irq;
struct its_vlpi_map map;
unsigned long flags;
- int ret;
+ int ret = 0;
if (!vgic_supports_direct_msis(kvm))
return 0;
@@ -423,13 +447,24 @@ int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int virq,
if (IS_ERR(its))
return 0;
- mutex_lock(&its->its_lock);
+ guard(mutex)(&its->its_lock);
- /* Perform the actual DevID/EventID -> LPI translation. */
- ret = vgic_its_resolve_lpi(kvm, its, irq_entry->msi.devid,
- irq_entry->msi.data, &irq);
- if (ret)
- goto out;
+ /*
+ * Perform the actual DevID/EventID -> LPI translation.
+ *
+ * Silently exit if translation fails as the guest (or userspace!) has
+ * managed to do something stupid. Emulated LPI injection will still
+ * work if the guest figures itself out at a later time.
+ */
+ if (vgic_its_resolve_lpi(kvm, its, irq_entry->msi.devid,
+ irq_entry->msi.data, &irq))
+ return 0;
+
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
+
+ /* Silently exit if the vLPI is already mapped */
+ if (irq->hw)
+ goto out_unlock_irq;
/*
* Emit the mapping request. If it fails, the ITS probably
@@ -449,68 +484,72 @@ int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int virq,
ret = its_map_vlpi(virq, &map);
if (ret)
- goto out;
+ goto out_unlock_irq;
irq->hw = true;
irq->host_irq = virq;
atomic_inc(&map.vpe->vlpi_count);
/* Transfer pending state */
- raw_spin_lock_irqsave(&irq->irq_lock, flags);
- if (irq->pending_latch) {
- ret = irq_set_irqchip_state(irq->host_irq,
- IRQCHIP_STATE_PENDING,
- irq->pending_latch);
- WARN_RATELIMIT(ret, "IRQ %d", irq->host_irq);
+ if (!irq->pending_latch)
+ goto out_unlock_irq;
- /*
- * Clear pending_latch and communicate this state
- * change via vgic_queue_irq_unlock.
- */
- irq->pending_latch = false;
- vgic_queue_irq_unlock(kvm, irq, flags);
- } else {
- raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
- }
+ ret = irq_set_irqchip_state(irq->host_irq, IRQCHIP_STATE_PENDING,
+ irq->pending_latch);
+ WARN_RATELIMIT(ret, "IRQ %d", irq->host_irq);
-out:
- mutex_unlock(&its->its_lock);
+ /*
+ * Clear pending_latch and communicate this state
+ * change via vgic_queue_irq_unlock.
+ */
+ irq->pending_latch = false;
+ vgic_queue_irq_unlock(kvm, irq, flags);
+ return ret;
+
+out_unlock_irq:
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
return ret;
}
-int kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int virq,
- struct kvm_kernel_irq_routing_entry *irq_entry)
+static struct vgic_irq *__vgic_host_irq_get_vlpi(struct kvm *kvm, int host_irq)
{
- struct vgic_its *its;
struct vgic_irq *irq;
- int ret;
+ unsigned long idx;
- if (!vgic_supports_direct_msis(kvm))
- return 0;
+ guard(rcu)();
+ xa_for_each(&kvm->arch.vgic.lpi_xa, idx, irq) {
+ if (!irq->hw || irq->host_irq != host_irq)
+ continue;
- /*
- * Get the ITS, and escape early on error (not a valid
- * doorbell for any of our vITSs).
- */
- its = vgic_get_its(kvm, irq_entry);
- if (IS_ERR(its))
- return 0;
+ if (!vgic_try_get_irq_ref(irq))
+ return NULL;
- mutex_lock(&its->its_lock);
+ return irq;
+ }
- ret = vgic_its_resolve_lpi(kvm, its, irq_entry->msi.devid,
- irq_entry->msi.data, &irq);
- if (ret)
- goto out;
+ return NULL;
+}
- WARN_ON(!(irq->hw && irq->host_irq == virq));
+void kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int host_irq)
+{
+ struct vgic_irq *irq;
+ unsigned long flags;
+
+ if (!vgic_supports_direct_msis(kvm))
+ return;
+
+ irq = __vgic_host_irq_get_vlpi(kvm, host_irq);
+ if (!irq)
+ return;
+
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
+ WARN_ON(irq->hw && irq->host_irq != host_irq);
if (irq->hw) {
atomic_dec(&irq->target_vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vlpi_count);
irq->hw = false;
- ret = its_unmap_vlpi(virq);
+ its_unmap_vlpi(host_irq);
}
-out:
- mutex_unlock(&its->its_lock);
- return ret;
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+ vgic_put_irq(kvm, irq);
}
diff --git a/arch/arm64/kvm/vgic/vgic-v5.c b/arch/arm64/kvm/vgic/vgic-v5.c
new file mode 100644
index 000000000000..2d3811f4e117
--- /dev/null
+++ b/arch/arm64/kvm/vgic/vgic-v5.c
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <kvm/arm_vgic.h>
+#include <linux/irqchip/arm-vgic-info.h>
+
+#include "vgic.h"
+
+/*
+ * Probe for a vGICv5 compatible interrupt controller, returning 0 on success.
+ * Currently only supports GICv3-based VMs on a GICv5 host, and hence only
+ * registers a VGIC_V3 device.
+ */
+int vgic_v5_probe(const struct gic_kvm_info *info)
+{
+ u64 ich_vtr_el2;
+ int ret;
+
+ if (!cpus_have_final_cap(ARM64_HAS_GICV5_LEGACY))
+ return -ENODEV;
+
+ kvm_vgic_global_state.type = VGIC_V5;
+ kvm_vgic_global_state.has_gcie_v3_compat = true;
+
+ /* We only support v3 compat mode - use vGICv3 limits */
+ kvm_vgic_global_state.max_gic_vcpus = VGIC_V3_MAX_CPUS;
+
+ kvm_vgic_global_state.vcpu_base = 0;
+ kvm_vgic_global_state.vctrl_base = NULL;
+ kvm_vgic_global_state.can_emulate_gicv2 = false;
+ kvm_vgic_global_state.has_gicv4 = false;
+ kvm_vgic_global_state.has_gicv4_1 = false;
+
+ ich_vtr_el2 = kvm_call_hyp_ret(__vgic_v3_get_gic_config);
+ kvm_vgic_global_state.ich_vtr_el2 = (u32)ich_vtr_el2;
+
+ /*
+ * The ListRegs field is 5 bits, but there is an architectural
+ * maximum of 16 list registers. Just ignore bit 4...
+ */
+ kvm_vgic_global_state.nr_lr = (ich_vtr_el2 & 0xf) + 1;
+
+ ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V3);
+ if (ret) {
+ kvm_err("Cannot register GICv3-legacy KVM device.\n");
+ return ret;
+ }
+
+ static_branch_enable(&kvm_vgic_global_state.gicv3_cpuif);
+ kvm_info("GCIE legacy system register CPU interface\n");
+
+ return 0;
+}
diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c
index d97e6080b421..430aa98888fd 100644
--- a/arch/arm64/kvm/vgic/vgic.c
+++ b/arch/arm64/kvm/vgic/vgic.c
@@ -24,16 +24,23 @@ struct vgic_global kvm_vgic_global_state __ro_after_init = {
/*
* Locking order is always:
* kvm->lock (mutex)
- * its->cmd_lock (mutex)
- * its->its_lock (mutex)
- * vgic_cpu->ap_list_lock must be taken with IRQs disabled
- * kvm->lpi_list_lock must be taken with IRQs disabled
- * vgic_irq->irq_lock must be taken with IRQs disabled
+ * vcpu->mutex (mutex)
+ * kvm->arch.config_lock (mutex)
+ * its->cmd_lock (mutex)
+ * its->its_lock (mutex)
+ * vgic_dist->lpi_xa.xa_lock must be taken with IRQs disabled
+ * vgic_cpu->ap_list_lock must be taken with IRQs disabled
+ * vgic_irq->irq_lock must be taken with IRQs disabled
*
* As the ap_list_lock might be taken from the timer interrupt handler,
* we have to disable IRQs before taking this lock and everything lower
* than it.
*
+ * The config_lock has additional ordering requirements:
+ * kvm->slots_lock
+ * kvm->srcu
+ * kvm->arch.config_lock
+ *
* If you need to take multiple locks, always take the upper lock first,
* then the lower ones, e.g. first take the its_lock, then the irq_lock.
* If you are already holding a lock and need to take a higher one, you
@@ -52,32 +59,22 @@ struct vgic_global kvm_vgic_global_state __ro_after_init = {
*/
/*
- * Iterate over the VM's list of mapped LPIs to find the one with a
- * matching interrupt ID and return a reference to the IRQ structure.
+ * Index the VM's xarray of mapped LPIs and return a reference to the IRQ
+ * structure. The caller is expected to call vgic_put_irq() later once it's
+ * finished with the IRQ.
*/
static struct vgic_irq *vgic_get_lpi(struct kvm *kvm, u32 intid)
{
struct vgic_dist *dist = &kvm->arch.vgic;
struct vgic_irq *irq = NULL;
- unsigned long flags;
- raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
+ rcu_read_lock();
- list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
- if (irq->intid != intid)
- continue;
+ irq = xa_load(&dist->lpi_xa, intid);
+ if (!vgic_try_get_irq_ref(irq))
+ irq = NULL;
- /*
- * This increases the refcount, the caller is expected to
- * call vgic_put_irq() later once it's finished with the IRQ.
- */
- vgic_get_irq_kref(irq);
- goto out_unlock;
- }
- irq = NULL;
-
-out_unlock:
- raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
+ rcu_read_unlock();
return irq;
}
@@ -87,17 +84,11 @@ out_unlock:
* struct vgic_irq. It also increases the refcount, so any caller is expected
* to call vgic_put_irq() once it's finished with this IRQ.
*/
-struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
- u32 intid)
+struct vgic_irq *vgic_get_irq(struct kvm *kvm, u32 intid)
{
- /* SGIs and PPIs */
- if (intid <= VGIC_MAX_PRIVATE) {
- intid = array_index_nospec(intid, VGIC_MAX_PRIVATE + 1);
- return &vcpu->arch.vgic_cpu.private_irqs[intid];
- }
-
/* SPIs */
- if (intid < (kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS)) {
+ if (intid >= VGIC_NR_PRIVATE_IRQS &&
+ intid < (kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS)) {
intid = array_index_nospec(intid, kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS);
return &kvm->arch.vgic.spis[intid - VGIC_NR_PRIVATE_IRQS];
}
@@ -109,29 +100,42 @@ struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
return NULL;
}
-/*
- * We can't do anything in here, because we lack the kvm pointer to
- * lock and remove the item from the lpi_list. So we keep this function
- * empty and use the return value of kref_put() to trigger the freeing.
- */
-static void vgic_irq_release(struct kref *ref)
+struct vgic_irq *vgic_get_vcpu_irq(struct kvm_vcpu *vcpu, u32 intid)
{
+ if (WARN_ON(!vcpu))
+ return NULL;
+
+ /* SGIs and PPIs */
+ if (intid < VGIC_NR_PRIVATE_IRQS) {
+ intid = array_index_nospec(intid, VGIC_NR_PRIVATE_IRQS);
+ return &vcpu->arch.vgic_cpu.private_irqs[intid];
+ }
+
+ return vgic_get_irq(vcpu->kvm, intid);
}
-/*
- * Drop the refcount on the LPI. Must be called with lpi_list_lock held.
- */
-void __vgic_put_lpi_locked(struct kvm *kvm, struct vgic_irq *irq)
+static void vgic_release_lpi_locked(struct vgic_dist *dist, struct vgic_irq *irq)
{
- struct vgic_dist *dist = &kvm->arch.vgic;
+ lockdep_assert_held(&dist->lpi_xa.xa_lock);
+ __xa_erase(&dist->lpi_xa, irq->intid);
+ kfree_rcu(irq, rcu);
+}
- if (!kref_put(&irq->refcount, vgic_irq_release))
- return;
+static __must_check bool __vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq)
+{
+ if (irq->intid < VGIC_MIN_LPI)
+ return false;
- list_del(&irq->lpi_list);
- dist->lpi_list_count--;
+ return refcount_dec_and_test(&irq->refcount);
+}
- kfree(irq);
+static __must_check bool vgic_put_irq_norelease(struct kvm *kvm, struct vgic_irq *irq)
+{
+ if (!__vgic_put_irq(kvm, irq))
+ return false;
+
+ irq->pending_release = true;
+ return true;
}
void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq)
@@ -139,18 +143,44 @@ void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq)
struct vgic_dist *dist = &kvm->arch.vgic;
unsigned long flags;
- if (irq->intid < VGIC_MIN_LPI)
+ /*
+ * Normally the lock is only taken when the refcount drops to 0.
+ * Acquire/release it early on lockdep kernels to make locking issues
+ * in rare release paths a bit more obvious.
+ */
+ if (IS_ENABLED(CONFIG_LOCKDEP) && irq->intid >= VGIC_MIN_LPI) {
+ guard(spinlock_irqsave)(&dist->lpi_xa.xa_lock);
+ }
+
+ if (!__vgic_put_irq(kvm, irq))
return;
- raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
- __vgic_put_lpi_locked(kvm, irq);
- raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
+ xa_lock_irqsave(&dist->lpi_xa, flags);
+ vgic_release_lpi_locked(dist, irq);
+ xa_unlock_irqrestore(&dist->lpi_xa, flags);
+}
+
+static void vgic_release_deleted_lpis(struct kvm *kvm)
+{
+ struct vgic_dist *dist = &kvm->arch.vgic;
+ unsigned long flags, intid;
+ struct vgic_irq *irq;
+
+ xa_lock_irqsave(&dist->lpi_xa, flags);
+
+ xa_for_each(&dist->lpi_xa, intid, irq) {
+ if (irq->pending_release)
+ vgic_release_lpi_locked(dist, irq);
+ }
+
+ xa_unlock_irqrestore(&dist->lpi_xa, flags);
}
void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu)
{
struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
struct vgic_irq *irq, *tmp;
+ bool deleted = false;
unsigned long flags;
raw_spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags);
@@ -161,11 +191,14 @@ void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu)
list_del(&irq->ap_list);
irq->vcpu = NULL;
raw_spin_unlock(&irq->irq_lock);
- vgic_put_irq(vcpu->kvm, irq);
+ deleted |= vgic_put_irq_norelease(vcpu->kvm, irq);
}
}
raw_spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags);
+
+ if (deleted)
+ vgic_release_deleted_lpis(vcpu->kvm);
}
void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending)
@@ -201,7 +234,7 @@ void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active)
}
/**
- * kvm_vgic_target_oracle - compute the target vcpu for an irq
+ * vgic_target_oracle - compute the target vcpu for an irq
*
* @irq: The irq to route. Must be already locked.
*
@@ -211,7 +244,7 @@ void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active)
*
* Requires the IRQ lock to be held.
*/
-static struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq)
+struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq)
{
lockdep_assert_held(&irq->irq_lock);
@@ -239,17 +272,20 @@ static struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq)
return NULL;
}
+struct vgic_sort_info {
+ struct kvm_vcpu *vcpu;
+ struct vgic_vmcr vmcr;
+};
+
/*
* The order of items in the ap_lists defines how we'll pack things in LRs as
* well, the first items in the list being the first things populated in the
* LRs.
*
- * A hard rule is that active interrupts can never be pushed out of the LRs
- * (and therefore take priority) since we cannot reliably trap on deactivation
- * of IRQs and therefore they have to be present in the LRs.
- *
+ * Pending, non-active interrupts must be placed at the head of the list.
* Otherwise things should be sorted by the priority field and the GIC
* hardware support will take care of preemption of priority groups etc.
+ * Interrupts that are not deliverable should be at the end of the list.
*
* Return negative if "a" sorts before "b", 0 to preserve order, and positive
* to sort "b" before "a".
@@ -259,6 +295,8 @@ static int vgic_irq_cmp(void *priv, const struct list_head *a,
{
struct vgic_irq *irqa = container_of(a, struct vgic_irq, ap_list);
struct vgic_irq *irqb = container_of(b, struct vgic_irq, ap_list);
+ struct vgic_sort_info *info = priv;
+ struct kvm_vcpu *vcpu = info->vcpu;
bool penda, pendb;
int ret;
@@ -272,21 +310,32 @@ static int vgic_irq_cmp(void *priv, const struct list_head *a,
raw_spin_lock(&irqa->irq_lock);
raw_spin_lock_nested(&irqb->irq_lock, SINGLE_DEPTH_NESTING);
- if (irqa->active || irqb->active) {
- ret = (int)irqb->active - (int)irqa->active;
+ /* Undeliverable interrupts should be last */
+ ret = (int)(vgic_target_oracle(irqb) == vcpu) - (int)(vgic_target_oracle(irqa) == vcpu);
+ if (ret)
goto out;
- }
- penda = irqa->enabled && irq_is_pending(irqa);
- pendb = irqb->enabled && irq_is_pending(irqb);
+ /* Same thing for interrupts targeting a disabled group */
+ ret = (int)(irqb->group ? info->vmcr.grpen1 : info->vmcr.grpen0);
+ ret -= (int)(irqa->group ? info->vmcr.grpen1 : info->vmcr.grpen0);
+ if (ret)
+ goto out;
+
+ penda = irqa->enabled && irq_is_pending(irqa) && !irqa->active;
+ pendb = irqb->enabled && irq_is_pending(irqb) && !irqb->active;
+
+ ret = (int)pendb - (int)penda;
+ if (ret)
+ goto out;
- if (!penda || !pendb) {
- ret = (int)pendb - (int)penda;
+ /* Both pending and enabled, sort by priority (lower number first) */
+ ret = (int)irqa->priority - (int)irqb->priority;
+ if (ret)
goto out;
- }
- /* Both pending and enabled, sort by priority */
- ret = irqa->priority - irqb->priority;
+ /* Finally, HW bit active interrupts have priority over non-HW ones */
+ ret = (int)irqb->hw - (int)irqa->hw;
+
out:
raw_spin_unlock(&irqb->irq_lock);
raw_spin_unlock(&irqa->irq_lock);
@@ -297,10 +346,12 @@ out:
static void vgic_sort_ap_list(struct kvm_vcpu *vcpu)
{
struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+ struct vgic_sort_info info = { .vcpu = vcpu, };
lockdep_assert_held(&vgic_cpu->ap_list_lock);
- list_sort(NULL, &vgic_cpu->ap_list_head, vgic_irq_cmp);
+ vgic_get_vmcr(vcpu, &info.vmcr);
+ list_sort(&info, &vgic_cpu->ap_list_head, vgic_irq_cmp);
}
/*
@@ -323,6 +374,20 @@ static bool vgic_validate_injection(struct vgic_irq *irq, bool level, void *owne
return false;
}
+static bool vgic_model_needs_bcst_kick(struct kvm *kvm)
+{
+ /*
+ * A GICv3 (or GICv3-like) system exposing a GICv3 to the guest
+ * needs a broadcast kick to set TDIR globally.
+ *
+ * For systems that do not have TDIR (ARM's own v8.0 CPUs), the
+ * shadow TDIR bit is always set, and so is the register's TC bit,
+ * so no need to kick the CPUs.
+ */
+ return (cpus_have_final_cap(ARM64_HAS_ICH_HCR_EL2_TDIR) &&
+ kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3);
+}
+
/*
* Check whether an IRQ needs to (and can) be queued to a VCPU's ap list.
* Do the queuing if necessary, taking the right locks in the right order.
@@ -332,9 +397,10 @@ static bool vgic_validate_injection(struct vgic_irq *irq, bool level, void *owne
* with all locks dropped.
*/
bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq,
- unsigned long flags)
+ unsigned long flags) __releases(&irq->irq_lock)
{
struct kvm_vcpu *vcpu;
+ bool bcast;
lockdep_assert_held(&irq->irq_lock);
@@ -402,17 +468,27 @@ retry:
/*
* Grab a reference to the irq to reflect the fact that it is
- * now in the ap_list.
+ * now in the ap_list. This is safe as the caller must already hold a
+ * reference on the irq.
*/
- vgic_get_irq_kref(irq);
+ vgic_get_irq_ref(irq);
list_add_tail(&irq->ap_list, &vcpu->arch.vgic_cpu.ap_list_head);
irq->vcpu = vcpu;
+ /* A new SPI may result in deactivation trapping on all vcpus */
+ bcast = (vgic_model_needs_bcst_kick(vcpu->kvm) &&
+ vgic_valid_spi(vcpu->kvm, irq->intid) &&
+ atomic_fetch_inc(&vcpu->kvm->arch.vgic.active_spis) == 0);
+
raw_spin_unlock(&irq->irq_lock);
raw_spin_unlock_irqrestore(&vcpu->arch.vgic_cpu.ap_list_lock, flags);
- kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
- kvm_vcpu_kick(vcpu);
+ if (!bcast) {
+ kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
+ kvm_vcpu_kick(vcpu);
+ } else {
+ kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_IRQ_PENDING);
+ }
return true;
}
@@ -420,7 +496,7 @@ retry:
/**
* kvm_vgic_inject_irq - Inject an IRQ from a device to the vgic
* @kvm: The VM structure pointer
- * @cpuid: The CPU for PPIs
+ * @vcpu: The CPU for PPIs or NULL for global interrupts
* @intid: The INTID to inject a new state to.
* @level: Edge-triggered: true: to trigger the interrupt
* false: to ignore the call
@@ -434,25 +510,26 @@ retry:
* level-sensitive interrupts. You can think of the level parameter as 1
* being HIGH and 0 being LOW and all devices being active-HIGH.
*/
-int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid,
- bool level, void *owner)
+int kvm_vgic_inject_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
+ unsigned int intid, bool level, void *owner)
{
- struct kvm_vcpu *vcpu;
struct vgic_irq *irq;
unsigned long flags;
int ret;
- trace_vgic_update_irq_pending(cpuid, intid, level);
-
ret = vgic_lazy_init(kvm);
if (ret)
return ret;
- vcpu = kvm_get_vcpu(kvm, cpuid);
if (!vcpu && intid < VGIC_NR_PRIVATE_IRQS)
return -EINVAL;
- irq = vgic_get_irq(kvm, vcpu, intid);
+ trace_vgic_update_irq_pending(vcpu ? vcpu->vcpu_idx : 0, intid, level);
+
+ if (intid < VGIC_NR_PRIVATE_IRQS)
+ irq = vgic_get_vcpu_irq(vcpu, intid);
+ else
+ irq = vgic_get_irq(kvm, intid);
if (!irq)
return -EINVAL;
@@ -514,7 +591,7 @@ static inline void kvm_vgic_unmap_irq(struct vgic_irq *irq)
int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, unsigned int host_irq,
u32 vintid, struct irq_ops *ops)
{
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, vintid);
+ struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, vintid);
unsigned long flags;
int ret;
@@ -539,7 +616,7 @@ int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, unsigned int host_irq,
*/
void kvm_vgic_reset_mapped_irq(struct kvm_vcpu *vcpu, u32 vintid)
{
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, vintid);
+ struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, vintid);
unsigned long flags;
if (!irq->hw)
@@ -562,7 +639,7 @@ int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int vintid)
if (!vgic_initialized(vcpu->kvm))
return -EAGAIN;
- irq = vgic_get_irq(vcpu->kvm, vcpu, vintid);
+ irq = vgic_get_vcpu_irq(vcpu, vintid);
BUG_ON(!irq);
raw_spin_lock_irqsave(&irq->irq_lock, flags);
@@ -573,6 +650,21 @@ int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int vintid)
return 0;
}
+int kvm_vgic_get_map(struct kvm_vcpu *vcpu, unsigned int vintid)
+{
+ struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, vintid);
+ unsigned long flags;
+ int ret = -1;
+
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
+ if (irq->hw)
+ ret = irq->hwintid;
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+
+ vgic_put_irq(vcpu->kvm, irq);
+ return ret;
+}
+
/**
* kvm_vgic_set_owner - Set the owner of an interrupt for a VM
*
@@ -596,7 +688,7 @@ int kvm_vgic_set_owner(struct kvm_vcpu *vcpu, unsigned int intid, void *owner)
if (!irq_is_ppi(intid) && !vgic_valid_spi(vcpu->kvm, intid))
return -EINVAL;
- irq = vgic_get_irq(vcpu->kvm, vcpu, intid);
+ irq = vgic_get_vcpu_irq(vcpu, intid);
raw_spin_lock_irqsave(&irq->irq_lock, flags);
if (irq->owner && irq->owner != owner)
ret = -EEXIST;
@@ -619,6 +711,7 @@ static void vgic_prune_ap_list(struct kvm_vcpu *vcpu)
{
struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
struct vgic_irq *irq, *tmp;
+ bool deleted_lpis = false;
DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
@@ -646,12 +739,12 @@ retry:
/*
* This vgic_put_irq call matches the
- * vgic_get_irq_kref in vgic_queue_irq_unlock,
+ * vgic_get_irq_ref in vgic_queue_irq_unlock,
* where we added the LPI to the ap_list. As
* we remove the irq from the list, we drop
* also drop the refcount.
*/
- vgic_put_irq(vcpu->kvm, irq);
+ deleted_lpis |= vgic_put_irq_norelease(vcpu->kvm, irq);
continue;
}
@@ -714,6 +807,9 @@ retry:
}
raw_spin_unlock(&vgic_cpu->ap_list_lock);
+
+ if (unlikely(deleted_lpis))
+ vgic_release_deleted_lpis(vcpu->kvm);
}
static inline void vgic_fold_lr_state(struct kvm_vcpu *vcpu)
@@ -744,98 +840,148 @@ static inline void vgic_clear_lr(struct kvm_vcpu *vcpu, int lr)
vgic_v3_clear_lr(vcpu, lr);
}
-static inline void vgic_set_underflow(struct kvm_vcpu *vcpu)
-{
- if (kvm_vgic_global_state.type == VGIC_V2)
- vgic_v2_set_underflow(vcpu);
- else
- vgic_v3_set_underflow(vcpu);
-}
-
-/* Requires the ap_list_lock to be held. */
-static int compute_ap_list_depth(struct kvm_vcpu *vcpu,
- bool *multi_sgi)
+static void summarize_ap_list(struct kvm_vcpu *vcpu,
+ struct ap_list_summary *als)
{
struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
struct vgic_irq *irq;
- int count = 0;
-
- *multi_sgi = false;
lockdep_assert_held(&vgic_cpu->ap_list_lock);
+ *als = (typeof(*als)){};
+
list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
- int w;
+ guard(raw_spinlock)(&irq->irq_lock);
- raw_spin_lock(&irq->irq_lock);
- /* GICv2 SGIs can count for more than one... */
- w = vgic_irq_get_lr_count(irq);
- raw_spin_unlock(&irq->irq_lock);
+ if (unlikely(vgic_target_oracle(irq) != vcpu))
+ continue;
- count += w;
- *multi_sgi |= (w > 1);
+ if (!irq->active)
+ als->nr_pend++;
+ else
+ als->nr_act++;
+
+ if (irq->intid < VGIC_NR_SGIS)
+ als->nr_sgi++;
}
- return count;
}
-/* Requires the VCPU's ap_list_lock to be held. */
+/*
+ * Dealing with LR overflow is close to black magic -- dress accordingly.
+ *
+ * We have to present an almost infinite number of interrupts through a very
+ * limited number of registers. Therefore crucial decisions must be made to
+ * ensure we feed the most relevant interrupts into the LRs, and yet have
+ * some facilities to let the guest interact with those that are not there.
+ *
+ * All considerations below are in the context of interrupts targeting a
+ * single vcpu with non-idle state (either pending, active, or both),
+ * colloquially called the ap_list:
+ *
+ * - Pending interrupts must have priority over active interrupts. This also
+ * excludes pending+active interrupts. This ensures that a guest can
+ * perform priority drops on any number of interrupts, and yet be
+ * presented the next pending one.
+ *
+ * - Deactivation of interrupts outside of the LRs must be tracked by using
+ * either the EOIcount-driven maintenance interrupt, and sometimes by
+ * trapping the DIR register.
+ *
+ * - For EOImode=0, a non-zero EOIcount means walking the ap_list past the
+ * point that made it into the LRs, and deactivate interrupts that would
+ * have made it onto the LRs if we had the space.
+ *
+ * - The MI-generation bits must be used to try and force an exit when the
+ * guest has done enough changes to the LRs that we want to reevaluate the
+ * situation:
+ *
+ * - if the total number of pending interrupts exceeds the number of
+ * LR, NPIE must be set in order to exit once no pending interrupts
+ * are present in the LRs, allowing us to populate the next batch.
+ *
+ * - if there are active interrupts outside of the LRs, then LRENPIE
+ * must be set so that we exit on deactivation of one of these, and
+ * work out which one is to be deactivated. Note that this is not
+ * enough to deal with EOImode=1, see below.
+ *
+ * - if the overall number of interrupts exceeds the number of LRs,
+ * then UIE must be set to allow refilling of the LRs once the
+ * majority of them has been processed.
+ *
+ * - as usual, MI triggers are only an optimisation, since we cannot
+ * rely on the MI being delivered in timely manner...
+ *
+ * - EOImode=1 creates some additional problems:
+ *
+ * - deactivation can happen in any order, and we cannot rely on
+ * EOImode=0's coupling of priority-drop and deactivation which
+ * imposes strict reverse Ack order. This means that DIR must
+ * trap if we have active interrupts outside of the LRs.
+ *
+ * - deactivation of SPIs can occur on any CPU, while the SPI is only
+ * present in the ap_list of the CPU that actually ack-ed it. In that
+ * case, EOIcount doesn't provide enough information, and we must
+ * resort to trapping DIR even if we don't overflow the LRs. Bonus
+ * point for not trapping DIR when no SPIs are pending or active in
+ * the whole VM.
+ *
+ * - LPIs do not suffer the same problem as SPIs on deactivation, as we
+ * have to essentially discard the active state, see below.
+ *
+ * - Virtual LPIs have an active state (surprise!), which gets removed on
+ * priority drop (EOI). However, EOIcount doesn't get bumped when the LPI
+ * is not present in the LR (surprise again!). Special care must therefore
+ * be taken to remove the active state from any activated LPI when exiting
+ * from the guest. This is in a way no different from what happens on the
+ * physical side. We still rely on the running priority to have been
+ * removed from the APRs, irrespective of the LPI being present in the LRs
+ * or not.
+ *
+ * - Virtual SGIs directly injected via GICv4.1 must not affect EOIcount, as
+ * they are not managed in SW and don't have a true active state. So only
+ * set vSGIEOICount when no SGIs are in the ap_list.
+ *
+ * - GICv2 SGIs with multiple sources are injected one source at a time, as
+ * if they were made pending sequentially. This may mean that we don't
+ * always present the HPPI if other interrupts with lower priority are
+ * pending in the LRs. Big deal.
+ */
static void vgic_flush_lr_state(struct kvm_vcpu *vcpu)
{
struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+ struct ap_list_summary als;
struct vgic_irq *irq;
- int count;
- bool multi_sgi;
- u8 prio = 0xff;
- int i = 0;
+ int count = 0;
lockdep_assert_held(&vgic_cpu->ap_list_lock);
- count = compute_ap_list_depth(vcpu, &multi_sgi);
- if (count > kvm_vgic_global_state.nr_lr || multi_sgi)
- vgic_sort_ap_list(vcpu);
+ summarize_ap_list(vcpu, &als);
- count = 0;
+ if (irqs_outside_lrs(&als))
+ vgic_sort_ap_list(vcpu);
list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
- raw_spin_lock(&irq->irq_lock);
-
- /*
- * If we have multi-SGIs in the pipeline, we need to
- * guarantee that they are all seen before any IRQ of
- * lower priority. In that case, we need to filter out
- * these interrupts by exiting early. This is easy as
- * the AP list has been sorted already.
- */
- if (multi_sgi && irq->priority > prio) {
- _raw_spin_unlock(&irq->irq_lock);
- break;
+ scoped_guard(raw_spinlock, &irq->irq_lock) {
+ if (likely(vgic_target_oracle(irq) == vcpu)) {
+ vgic_populate_lr(vcpu, irq, count++);
+ }
}
- if (likely(vgic_target_oracle(irq) == vcpu)) {
- vgic_populate_lr(vcpu, irq, count++);
-
- if (irq->source)
- prio = irq->priority;
- }
-
- raw_spin_unlock(&irq->irq_lock);
-
- if (count == kvm_vgic_global_state.nr_lr) {
- if (!list_is_last(&irq->ap_list,
- &vgic_cpu->ap_list_head))
- vgic_set_underflow(vcpu);
+ if (count == kvm_vgic_global_state.nr_lr)
break;
- }
}
/* Nuke remaining LRs */
- for (i = count ; i < kvm_vgic_global_state.nr_lr; i++)
+ for (int i = count ; i < kvm_vgic_global_state.nr_lr; i++)
vgic_clear_lr(vcpu, i);
- if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
+ if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) {
vcpu->arch.vgic_cpu.vgic_v2.used_lrs = count;
- else
+ vgic_v2_configure_hcr(vcpu, &als);
+ } else {
vcpu->arch.vgic_cpu.vgic_v3.used_lrs = count;
+ vgic_v3_configure_hcr(vcpu, &als);
+ }
}
static inline bool can_access_vgic_from_kernel(void)
@@ -859,23 +1005,31 @@ static inline void vgic_save_state(struct kvm_vcpu *vcpu)
/* Sync back the hardware VGIC state into our emulation after a guest's run. */
void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
{
- int used_lrs;
-
- /* An empty ap_list_head implies used_lrs == 0 */
- if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head))
+ /* If nesting, emulate the HW effect from L0 to L1 */
+ if (vgic_state_is_nested(vcpu)) {
+ vgic_v3_sync_nested(vcpu);
return;
+ }
+
+ if (vcpu_has_nv(vcpu))
+ vgic_v3_nested_update_mi(vcpu);
if (can_access_vgic_from_kernel())
vgic_save_state(vcpu);
- if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
- used_lrs = vcpu->arch.vgic_cpu.vgic_v2.used_lrs;
- else
- used_lrs = vcpu->arch.vgic_cpu.vgic_v3.used_lrs;
+ vgic_fold_lr_state(vcpu);
+ vgic_prune_ap_list(vcpu);
+}
+
+/* Sync interrupts that were deactivated through a DIR trap */
+void kvm_vgic_process_async_update(struct kvm_vcpu *vcpu)
+{
+ unsigned long flags;
- if (used_lrs)
- vgic_fold_lr_state(vcpu);
+ /* Make sure we're in the same context as LR handling */
+ local_irq_save(flags);
vgic_prune_ap_list(vcpu);
+ local_irq_restore(flags);
}
static inline void vgic_restore_state(struct kvm_vcpu *vcpu)
@@ -890,42 +1044,57 @@ static inline void vgic_restore_state(struct kvm_vcpu *vcpu)
void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
{
/*
- * If there are no virtual interrupts active or pending for this
- * VCPU, then there is no work to do and we can bail out without
- * taking any lock. There is a potential race with someone injecting
- * interrupts to the VCPU, but it is a benign race as the VCPU will
- * either observe the new interrupt before or after doing this check,
- * and introducing additional synchronization mechanism doesn't change
- * this.
+ * If in a nested state, we must return early. Two possibilities:
*
- * Note that we still need to go through the whole thing if anything
- * can be directly injected (GICv4).
+ * - If we have any pending IRQ for the guest and the guest
+ * expects IRQs to be handled in its virtual EL2 mode (the
+ * virtual IMO bit is set) and it is not already running in
+ * virtual EL2 mode, then we have to emulate an IRQ
+ * exception to virtual EL2.
+ *
+ * We do that by placing a request to ourselves which will
+ * abort the entry procedure and inject the exception at the
+ * beginning of the run loop.
+ *
+ * - Otherwise, do exactly *NOTHING* apart from enabling the virtual
+ * CPU interface. The guest state is already loaded, and we can
+ * carry on with running it.
+ *
+ * If we have NV, but are not in a nested state, compute the
+ * maintenance interrupt state, as it may fire.
*/
- if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head) &&
- !vgic_supports_direct_msis(vcpu->kvm))
+ if (vgic_state_is_nested(vcpu)) {
+ if (kvm_vgic_vcpu_pending_irq(vcpu))
+ kvm_make_request(KVM_REQ_GUEST_HYP_IRQ_PENDING, vcpu);
+
+ vgic_v3_flush_nested(vcpu);
return;
+ }
+
+ if (vcpu_has_nv(vcpu))
+ vgic_v3_nested_update_mi(vcpu);
DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
- if (!list_empty(&vcpu->arch.vgic_cpu.ap_list_head)) {
- raw_spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock);
+ scoped_guard(raw_spinlock, &vcpu->arch.vgic_cpu.ap_list_lock)
vgic_flush_lr_state(vcpu);
- raw_spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock);
- }
if (can_access_vgic_from_kernel())
vgic_restore_state(vcpu);
- if (vgic_supports_direct_msis(vcpu->kvm))
+ if (vgic_supports_direct_irqs(vcpu->kvm))
vgic_v4_commit(vcpu);
}
void kvm_vgic_load(struct kvm_vcpu *vcpu)
{
- if (unlikely(!vgic_initialized(vcpu->kvm)))
+ if (unlikely(!irqchip_in_kernel(vcpu->kvm) || !vgic_initialized(vcpu->kvm))) {
+ if (has_vhe() && static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
+ __vgic_v3_activate_traps(&vcpu->arch.vgic_cpu.vgic_v3);
return;
+ }
- if (kvm_vgic_global_state.type == VGIC_V2)
+ if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
vgic_v2_load(vcpu);
else
vgic_v3_load(vcpu);
@@ -933,26 +1102,18 @@ void kvm_vgic_load(struct kvm_vcpu *vcpu)
void kvm_vgic_put(struct kvm_vcpu *vcpu)
{
- if (unlikely(!vgic_initialized(vcpu->kvm)))
+ if (unlikely(!irqchip_in_kernel(vcpu->kvm) || !vgic_initialized(vcpu->kvm))) {
+ if (has_vhe() && static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
+ __vgic_v3_deactivate_traps(&vcpu->arch.vgic_cpu.vgic_v3);
return;
+ }
- if (kvm_vgic_global_state.type == VGIC_V2)
+ if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
vgic_v2_put(vcpu);
else
vgic_v3_put(vcpu);
}
-void kvm_vgic_vmcr_sync(struct kvm_vcpu *vcpu)
-{
- if (unlikely(!irqchip_in_kernel(vcpu->kvm)))
- return;
-
- if (kvm_vgic_global_state.type == VGIC_V2)
- vgic_v2_vmcr_sync(vcpu);
- else
- vgic_v3_vmcr_sync(vcpu);
-}
-
int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu)
{
struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
@@ -1013,7 +1174,7 @@ bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int vintid)
if (!vgic_initialized(vcpu->kvm))
return false;
- irq = vgic_get_irq(vcpu->kvm, vcpu, vintid);
+ irq = vgic_get_vcpu_irq(vcpu, vintid);
raw_spin_lock_irqsave(&irq->irq_lock, flags);
map_is_active = irq->hw && irq->active;
raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h
index 23e280fa0a16..5f0fc96b4dc2 100644
--- a/arch/arm64/kvm/vgic/vgic.h
+++ b/arch/arm64/kvm/vgic/vgic.h
@@ -6,6 +6,7 @@
#define __KVM_ARM_VGIC_NEW_H__
#include <linux/irqchip/arm-gic-common.h>
+#include <asm/kvm_mmu.h>
#define PRODUCT_ID_KVM 0x4b /* ASCII code K */
#define IMPLEMENTER_ARM 0x43b
@@ -15,6 +16,7 @@
#define INTERRUPT_ID_BITS_SPIS 10
#define INTERRUPT_ID_BITS_ITS 16
+#define VGIC_LPI_MAX_INTID ((1 << INTERRUPT_ID_BITS_ITS) - 1)
#define VGIC_PRI_BITS 5
#define vgic_irq_is_sgi(intid) ((intid) < VGIC_NR_SGIS)
@@ -62,6 +64,24 @@
KVM_REG_ARM_VGIC_SYSREG_CRM_MASK | \
KVM_REG_ARM_VGIC_SYSREG_OP2_MASK)
+#define KVM_ICC_SRE_EL2 (ICC_SRE_EL2_ENABLE | ICC_SRE_EL2_SRE | \
+ ICC_SRE_EL1_DIB | ICC_SRE_EL1_DFB)
+#define KVM_ICH_VTR_EL2_RES0 (ICH_VTR_EL2_DVIM | \
+ ICH_VTR_EL2_A3V | \
+ ICH_VTR_EL2_IDbits)
+#define KVM_ICH_VTR_EL2_RES1 ICH_VTR_EL2_nV4
+
+static inline u64 kvm_get_guest_vtr_el2(void)
+{
+ u64 vtr;
+
+ vtr = kvm_vgic_global_state.ich_vtr_el2;
+ vtr &= ~KVM_ICH_VTR_EL2_RES0;
+ vtr |= KVM_ICH_VTR_EL2_RES1;
+
+ return vtr;
+}
+
/*
* As per Documentation/virt/kvm/devices/arm-vgic-its.rst,
* below macros are defined for ITS table entry encoding.
@@ -131,6 +151,35 @@ static inline bool vgic_irq_is_multi_sgi(struct vgic_irq *irq)
return vgic_irq_get_lr_count(irq) > 1;
}
+static inline int vgic_write_guest_lock(struct kvm *kvm, gpa_t gpa,
+ const void *data, unsigned long len)
+{
+ struct vgic_dist *dist = &kvm->arch.vgic;
+ int ret;
+
+ dist->table_write_in_progress = true;
+ ret = kvm_write_guest_lock(kvm, gpa, data, len);
+ dist->table_write_in_progress = false;
+
+ return ret;
+}
+
+void kvm_compute_ich_hcr_trap_bits(struct alt_instr *alt,
+ __le32 *origptr, __le32 *updptr, int nr_inst);
+
+static inline u64 vgic_ich_hcr_trap_bits(void)
+{
+ u64 hcr;
+
+ /* All the traps are in the bottom 16bits */
+ asm volatile(ALTERNATIVE_CB("movz %0, #0\n",
+ ARM64_ALWAYS_SYSTEM,
+ kvm_compute_ich_hcr_trap_bits)
+ : "=r" (hcr));
+
+ return hcr;
+}
+
/*
* This struct provides an intermediate representation of the fields contained
* in the GICH_VMCR and ICH_VMCR registers, such that code exporting the GIC
@@ -157,6 +206,51 @@ struct vgic_reg_attr {
gpa_t addr;
};
+struct its_device {
+ struct list_head dev_list;
+
+ /* the head for the list of ITTEs */
+ struct list_head itt_head;
+ u32 num_eventid_bits;
+ gpa_t itt_addr;
+ u32 device_id;
+};
+
+#define COLLECTION_NOT_MAPPED ((u32)~0)
+
+struct its_collection {
+ struct list_head coll_list;
+
+ u32 collection_id;
+ u32 target_addr;
+};
+
+#define its_is_collection_mapped(coll) ((coll) && \
+ ((coll)->target_addr != COLLECTION_NOT_MAPPED))
+
+struct its_ite {
+ struct list_head ite_list;
+
+ struct vgic_irq *irq;
+ struct its_collection *collection;
+ u32 event_id;
+};
+
+struct ap_list_summary {
+ unsigned int nr_pend; /* purely pending, not active */
+ unsigned int nr_act; /* active, or active+pending */
+ unsigned int nr_sgi; /* any SGI */
+};
+
+#define irqs_outside_lrs(s) \
+ (((s)->nr_pend + (s)->nr_act) > kvm_vgic_global_state.nr_lr)
+
+#define irqs_pending_outside_lrs(s) \
+ ((s)->nr_pend > kvm_vgic_global_state.nr_lr)
+
+#define irqs_active_outside_lrs(s) \
+ ((s)->nr_act && irqs_outside_lrs(s))
+
int vgic_v3_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr,
struct vgic_reg_attr *reg_attr);
int vgic_v2_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr,
@@ -164,15 +258,15 @@ int vgic_v2_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr,
const struct vgic_register_region *
vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev,
gpa_t addr, int len);
-struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
- u32 intid);
-void __vgic_put_lpi_locked(struct kvm *kvm, struct vgic_irq *irq);
+struct vgic_irq *vgic_get_irq(struct kvm *kvm, u32 intid);
+struct vgic_irq *vgic_get_vcpu_irq(struct kvm_vcpu *vcpu, u32 intid);
void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq);
+struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq);
bool vgic_get_phys_line_level(struct vgic_irq *irq);
void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending);
void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active);
bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq,
- unsigned long flags);
+ unsigned long flags) __releases(&irq->irq_lock);
void vgic_kick_vcpus(struct kvm *kvm);
void vgic_irq_handle_resampling(struct vgic_irq *irq,
bool lr_deactivated, bool lr_pending);
@@ -183,9 +277,9 @@ int vgic_check_iorange(struct kvm *kvm, phys_addr_t ioaddr,
void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu);
void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr);
+void vgic_v2_deactivate(struct kvm_vcpu *vcpu, u32 val);
void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr);
-void vgic_v2_set_underflow(struct kvm_vcpu *vcpu);
-void vgic_v2_set_npie(struct kvm_vcpu *vcpu);
+void vgic_v2_configure_hcr(struct kvm_vcpu *vcpu, struct ap_list_summary *als);
int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr);
int vgic_v2_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
int offset, u32 *val);
@@ -193,7 +287,7 @@ int vgic_v2_cpuif_uaccess(struct kvm_vcpu *vcpu, bool is_write,
int offset, u32 *val);
void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
-void vgic_v2_enable(struct kvm_vcpu *vcpu);
+void vgic_v2_reset(struct kvm_vcpu *vcpu);
int vgic_v2_probe(const struct gic_kvm_info *info);
int vgic_v2_map_resources(struct kvm *kvm);
int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address,
@@ -202,38 +296,45 @@ int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address,
void vgic_v2_init_lrs(void);
void vgic_v2_load(struct kvm_vcpu *vcpu);
void vgic_v2_put(struct kvm_vcpu *vcpu);
-void vgic_v2_vmcr_sync(struct kvm_vcpu *vcpu);
void vgic_v2_save_state(struct kvm_vcpu *vcpu);
void vgic_v2_restore_state(struct kvm_vcpu *vcpu);
-static inline void vgic_get_irq_kref(struct vgic_irq *irq)
+static inline bool vgic_try_get_irq_ref(struct vgic_irq *irq)
{
+ if (!irq)
+ return false;
+
if (irq->intid < VGIC_MIN_LPI)
- return;
+ return true;
+
+ return refcount_inc_not_zero(&irq->refcount);
+}
- kref_get(&irq->refcount);
+static inline void vgic_get_irq_ref(struct vgic_irq *irq)
+{
+ WARN_ON_ONCE(!vgic_try_get_irq_ref(irq));
}
void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu);
void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr);
void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr);
-void vgic_v3_set_underflow(struct kvm_vcpu *vcpu);
-void vgic_v3_set_npie(struct kvm_vcpu *vcpu);
+void vgic_v3_deactivate(struct kvm_vcpu *vcpu, u64 val);
+void vgic_v3_configure_hcr(struct kvm_vcpu *vcpu, struct ap_list_summary *als);
void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
-void vgic_v3_enable(struct kvm_vcpu *vcpu);
+void vgic_v3_reset(struct kvm_vcpu *vcpu);
int vgic_v3_probe(const struct gic_kvm_info *info);
int vgic_v3_map_resources(struct kvm *kvm);
int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq);
int vgic_v3_save_pending_tables(struct kvm *kvm);
int vgic_v3_set_redist_base(struct kvm *kvm, u32 index, u64 addr, u32 count);
int vgic_register_redist_iodev(struct kvm_vcpu *vcpu);
+void vgic_unregister_redist_iodev(struct kvm_vcpu *vcpu);
bool vgic_v3_check_base(struct kvm *kvm);
void vgic_v3_load(struct kvm_vcpu *vcpu);
void vgic_v3_put(struct kvm_vcpu *vcpu);
-void vgic_v3_vmcr_sync(struct kvm_vcpu *vcpu);
bool vgic_has_its(struct kvm *kvm);
int kvm_vgic_register_its_device(void);
@@ -248,6 +349,7 @@ int vgic_v3_redist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
int vgic_v3_cpu_sysregs_uaccess(struct kvm_vcpu *vcpu,
struct kvm_device_attr *attr, bool is_write);
int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr);
+const struct sys_reg_desc *vgic_v3_get_sysreg_table(unsigned int *sz);
int vgic_v3_line_level_info_uaccess(struct kvm_vcpu *vcpu, bool is_write,
u32 intid, u32 *val);
int kvm_register_vgic_device(unsigned long type);
@@ -259,8 +361,7 @@ int vgic_init(struct kvm *kvm);
void vgic_debug_init(struct kvm *kvm);
void vgic_debug_destroy(struct kvm *kvm);
-bool lock_all_vcpus(struct kvm *kvm);
-void unlock_all_vcpus(struct kvm *kvm);
+int vgic_v5_probe(const struct gic_kvm_info *info);
static inline int vgic_v3_max_apr_idx(struct kvm_vcpu *vcpu)
{
@@ -300,7 +401,7 @@ vgic_v3_rd_region_size(struct kvm *kvm, struct vgic_redist_region *rdreg)
struct vgic_redist_region *vgic_v3_rdist_region_from_index(struct kvm *kvm,
u32 index);
-void vgic_v3_free_redist_region(struct vgic_redist_region *rdreg);
+void vgic_v3_free_redist_region(struct kvm *kvm, struct vgic_redist_region *rdreg);
bool vgic_v3_rdist_overlap(struct kvm *kvm, gpa_t base, size_t size);
@@ -313,24 +414,57 @@ static inline bool vgic_dist_overlap(struct kvm *kvm, gpa_t base, size_t size)
}
bool vgic_lpis_enabled(struct kvm_vcpu *vcpu);
-int vgic_copy_lpi_list(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 **intid_ptr);
int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its,
u32 devid, u32 eventid, struct vgic_irq **irq);
struct vgic_its *vgic_msi_to_its(struct kvm *kvm, struct kvm_msi *msi);
int vgic_its_inject_cached_translation(struct kvm *kvm, struct kvm_msi *msi);
-void vgic_lpi_translation_cache_init(struct kvm *kvm);
-void vgic_lpi_translation_cache_destroy(struct kvm *kvm);
-void vgic_its_invalidate_cache(struct kvm *kvm);
+void vgic_its_invalidate_all_caches(struct kvm *kvm);
/* GICv4.1 MMIO interface */
int vgic_its_inv_lpi(struct kvm *kvm, struct vgic_irq *irq);
int vgic_its_invall(struct kvm_vcpu *vcpu);
+bool system_supports_direct_sgis(void);
bool vgic_supports_direct_msis(struct kvm *kvm);
+bool vgic_supports_direct_sgis(struct kvm *kvm);
+
+static inline bool vgic_supports_direct_irqs(struct kvm *kvm)
+{
+ return vgic_supports_direct_msis(kvm) || vgic_supports_direct_sgis(kvm);
+}
+
int vgic_v4_init(struct kvm *kvm);
void vgic_v4_teardown(struct kvm *kvm);
void vgic_v4_configure_vsgis(struct kvm *kvm);
void vgic_v4_get_vlpi_state(struct vgic_irq *irq, bool *val);
int vgic_v4_request_vpe_irq(struct kvm_vcpu *vcpu, int irq);
+void vcpu_set_ich_hcr(struct kvm_vcpu *vcpu);
+
+static inline bool kvm_has_gicv3(struct kvm *kvm)
+{
+ return kvm_has_feat(kvm, ID_AA64PFR0_EL1, GIC, IMP);
+}
+
+void vgic_v3_flush_nested(struct kvm_vcpu *vcpu);
+void vgic_v3_sync_nested(struct kvm_vcpu *vcpu);
+void vgic_v3_load_nested(struct kvm_vcpu *vcpu);
+void vgic_v3_put_nested(struct kvm_vcpu *vcpu);
+void vgic_v3_handle_nested_maint_irq(struct kvm_vcpu *vcpu);
+void vgic_v3_nested_update_mi(struct kvm_vcpu *vcpu);
+
+static inline bool vgic_is_v3_compat(struct kvm *kvm)
+{
+ return cpus_have_final_cap(ARM64_HAS_GICV5_CPUIF) &&
+ kvm_vgic_global_state.has_gcie_v3_compat;
+}
+
+static inline bool vgic_is_v3(struct kvm *kvm)
+{
+ return kvm_vgic_global_state.type == VGIC_V3 || vgic_is_v3_compat(kvm);
+}
+
+int vgic_its_debug_init(struct kvm_device *dev);
+void vgic_its_debug_destroy(struct kvm_device *dev);
+
#endif
diff --git a/arch/arm64/kvm/vmid.c b/arch/arm64/kvm/vmid.c
index d78ae63d7c15..7fe8ba1a2851 100644
--- a/arch/arm64/kvm/vmid.c
+++ b/arch/arm64/kvm/vmid.c
@@ -16,7 +16,7 @@
#include <asm/kvm_asm.h>
#include <asm/kvm_mmu.h>
-unsigned int kvm_arm_vmid_bits;
+unsigned int __ro_after_init kvm_arm_vmid_bits;
static DEFINE_RAW_SPINLOCK(cpu_vmid_lock);
static atomic64_t vmid_generation;
@@ -47,7 +47,7 @@ static void flush_context(void)
int cpu;
u64 vmid;
- bitmap_clear(vmid_map, 0, NUM_USER_VMIDS);
+ bitmap_zero(vmid_map, NUM_USER_VMIDS);
for_each_possible_cpu(cpu) {
vmid = atomic64_xchg_relaxed(&per_cpu(active_vmids, cpu), 0);
@@ -172,7 +172,7 @@ void kvm_arm_vmid_update(struct kvm_vmid *kvm_vmid)
/*
* Initialize the VMID allocator
*/
-int kvm_arm_vmid_alloc_init(void)
+int __init kvm_arm_vmid_alloc_init(void)
{
kvm_arm_vmid_bits = kvm_get_vmid_bits();
@@ -182,15 +182,14 @@ int kvm_arm_vmid_alloc_init(void)
*/
WARN_ON(NUM_USER_VMIDS - 1 <= num_possible_cpus());
atomic64_set(&vmid_generation, VMID_FIRST_VERSION);
- vmid_map = kcalloc(BITS_TO_LONGS(NUM_USER_VMIDS),
- sizeof(*vmid_map), GFP_KERNEL);
+ vmid_map = bitmap_zalloc(NUM_USER_VMIDS, GFP_KERNEL);
if (!vmid_map)
return -ENOMEM;
return 0;
}
-void kvm_arm_vmid_alloc_free(void)
+void __init kvm_arm_vmid_alloc_free(void)
{
- kfree(vmid_map);
+ bitmap_free(vmid_map);
}