summaryrefslogtreecommitdiff
path: root/arch/arm64/kvm/hyp/nvhe
diff options
context:
space:
mode:
Diffstat (limited to 'arch/arm64/kvm/hyp/nvhe')
-rw-r--r--arch/arm64/kvm/hyp/nvhe/.gitignore4
-rw-r--r--arch/arm64/kvm/hyp/nvhe/Makefile112
-rw-r--r--arch/arm64/kvm/hyp/nvhe/cache.S25
-rw-r--r--arch/arm64/kvm/hyp/nvhe/debug-sr.c113
-rw-r--r--arch/arm64/kvm/hyp/nvhe/early_alloc.c59
-rw-r--r--arch/arm64/kvm/hyp/nvhe/ffa.c780
-rw-r--r--arch/arm64/kvm/hyp/nvhe/gen-hyprel.c456
-rw-r--r--arch/arm64/kvm/hyp/nvhe/host.S309
-rw-r--r--arch/arm64/kvm/hyp/nvhe/hyp-init.S297
-rw-r--r--arch/arm64/kvm/hyp/nvhe/hyp-main.c438
-rw-r--r--arch/arm64/kvm/hyp/nvhe/hyp-smp.c40
-rw-r--r--arch/arm64/kvm/hyp/nvhe/hyp.lds.S29
-rw-r--r--arch/arm64/kvm/hyp/nvhe/list_debug.c56
-rw-r--r--arch/arm64/kvm/hyp/nvhe/mem_protect.c1305
-rw-r--r--arch/arm64/kvm/hyp/nvhe/mm.c423
-rw-r--r--arch/arm64/kvm/hyp/nvhe/page_alloc.c248
-rw-r--r--arch/arm64/kvm/hyp/nvhe/pkvm.c640
-rw-r--r--arch/arm64/kvm/hyp/nvhe/psci-relay.c303
-rw-r--r--arch/arm64/kvm/hyp/nvhe/setup.c346
-rw-r--r--arch/arm64/kvm/hyp/nvhe/stacktrace.c158
-rw-r--r--arch/arm64/kvm/hyp/nvhe/switch.c396
-rw-r--r--arch/arm64/kvm/hyp/nvhe/sys_regs.c516
-rw-r--r--arch/arm64/kvm/hyp/nvhe/sysreg-sr.c35
-rw-r--r--arch/arm64/kvm/hyp/nvhe/timer-sr.c62
-rw-r--r--arch/arm64/kvm/hyp/nvhe/tlb.c202
25 files changed, 7352 insertions, 0 deletions
diff --git a/arch/arm64/kvm/hyp/nvhe/.gitignore b/arch/arm64/kvm/hyp/nvhe/.gitignore
new file mode 100644
index 000000000000..5b6c43cc96f8
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/.gitignore
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+gen-hyprel
+hyp.lds
+hyp-reloc.S
diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile
new file mode 100644
index 000000000000..2250253a6429
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/Makefile
@@ -0,0 +1,112 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for Kernel-based Virtual Machine module, HYP/nVHE part
+#
+
+asflags-y := -D__KVM_NVHE_HYPERVISOR__ -D__DISABLE_EXPORTS
+
+# Tracepoint and MMIO logging symbols should not be visible at nVHE KVM as
+# there is no way to execute them and any such MMIO access from nVHE KVM
+# will explode instantly (Words of Marc Zyngier). So introduce a generic flag
+# __DISABLE_TRACE_MMIO__ to disable MMIO tracing for nVHE KVM.
+ccflags-y := -D__KVM_NVHE_HYPERVISOR__ -D__DISABLE_EXPORTS -D__DISABLE_TRACE_MMIO__
+ccflags-y += -fno-stack-protector \
+ -DDISABLE_BRANCH_PROFILING \
+ $(DISABLE_STACKLEAK_PLUGIN)
+
+hostprogs := gen-hyprel
+HOST_EXTRACFLAGS += -I$(objtree)/include
+
+lib-objs := clear_page.o copy_page.o memcpy.o memset.o
+lib-objs := $(addprefix ../../../lib/, $(lib-objs))
+
+hyp-obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o \
+ hyp-main.o hyp-smp.o psci-relay.o early_alloc.o page_alloc.o \
+ cache.o setup.o mm.o mem_protect.o sys_regs.o pkvm.o stacktrace.o ffa.o
+hyp-obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \
+ ../fpsimd.o ../hyp-entry.o ../exception.o ../pgtable.o
+hyp-obj-$(CONFIG_LIST_HARDENED) += list_debug.o
+hyp-obj-y += $(lib-objs)
+
+##
+## Build rules for compiling nVHE hyp code
+## Output of this folder is `kvm_nvhe.o`, a partially linked object
+## file containing all nVHE hyp code and data.
+##
+
+hyp-obj := $(patsubst %.o,%.nvhe.o,$(hyp-obj-y))
+obj-y := kvm_nvhe.o
+targets += $(hyp-obj) kvm_nvhe.tmp.o kvm_nvhe.rel.o hyp.lds hyp-reloc.S hyp-reloc.o
+
+# 1) Compile all source files to `.nvhe.o` object files. The file extension
+# avoids file name clashes for files shared with VHE.
+$(obj)/%.nvhe.o: $(src)/%.c FORCE
+ $(call if_changed_rule,cc_o_c)
+$(obj)/%.nvhe.o: $(src)/%.S FORCE
+ $(call if_changed_rule,as_o_S)
+
+# 2) Compile linker script.
+$(obj)/hyp.lds: $(src)/hyp.lds.S FORCE
+ $(call if_changed_dep,cpp_lds_S)
+
+# 3) Partially link all '.nvhe.o' files and apply the linker script.
+# Prefixes names of ELF sections with '.hyp', eg. '.hyp.text'.
+# Note: The following rule assumes that the 'ld' rule puts LDFLAGS before
+# the list of dependencies to form '-T $(obj)/hyp.lds'. This is to
+# keep the dependency on the target while avoiding an error from
+# GNU ld if the linker script is passed to it twice.
+LDFLAGS_kvm_nvhe.tmp.o := -r -T
+$(obj)/kvm_nvhe.tmp.o: $(obj)/hyp.lds $(addprefix $(obj)/,$(hyp-obj)) FORCE
+ $(call if_changed,ld)
+
+# 4) Generate list of hyp code/data positions that need to be relocated at
+# runtime. Because the hypervisor is part of the kernel binary, relocations
+# produce a kernel VA. We enumerate relocations targeting hyp at build time
+# and convert the kernel VAs at those positions to hyp VAs.
+$(obj)/hyp-reloc.S: $(obj)/kvm_nvhe.tmp.o $(obj)/gen-hyprel FORCE
+ $(call if_changed,hyprel)
+
+# 5) Compile hyp-reloc.S and link it into the existing partially linked object.
+# The object file now contains a section with pointers to hyp positions that
+# will contain kernel VAs at runtime. These pointers have relocations on them
+# so that they get updated as the hyp object is linked into `vmlinux`.
+LDFLAGS_kvm_nvhe.rel.o := -r
+$(obj)/kvm_nvhe.rel.o: $(obj)/kvm_nvhe.tmp.o $(obj)/hyp-reloc.o FORCE
+ $(call if_changed,ld)
+
+# 6) Produce the final 'kvm_nvhe.o', ready to be linked into 'vmlinux'.
+# Prefixes names of ELF symbols with '__kvm_nvhe_'.
+$(obj)/kvm_nvhe.o: $(obj)/kvm_nvhe.rel.o FORCE
+ $(call if_changed,hypcopy)
+
+# The HYPREL command calls `gen-hyprel` to generate an assembly file with
+# a list of relocations targeting hyp code/data.
+quiet_cmd_hyprel = HYPREL $@
+ cmd_hyprel = $(obj)/gen-hyprel $< > $@
+
+# The HYPCOPY command uses `objcopy` to prefix all ELF symbol names
+# to avoid clashes with VHE code/data.
+quiet_cmd_hypcopy = HYPCOPY $@
+ cmd_hypcopy = $(OBJCOPY) --prefix-symbols=__kvm_nvhe_ $< $@
+
+# Remove ftrace, Shadow Call Stack, and CFI CFLAGS.
+# This is equivalent to the 'notrace', '__noscs', and '__nocfi' annotations.
+KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS) $(CC_FLAGS_CFI), $(KBUILD_CFLAGS))
+# Starting from 13.0.0 llvm emits SHT_REL section '.llvm.call-graph-profile'
+# when profile optimization is applied. gen-hyprel does not support SHT_REL and
+# causes a build failure. Remove profile optimization flags.
+KBUILD_CFLAGS := $(filter-out -fprofile-sample-use=% -fprofile-use=%, $(KBUILD_CFLAGS))
+KBUILD_CFLAGS += -fno-asynchronous-unwind-tables -fno-unwind-tables
+
+# KVM nVHE code is run at a different exception code with a different map, so
+# compiler instrumentation that inserts callbacks or checks into the code may
+# cause crashes. Just disable it.
+GCOV_PROFILE := n
+KASAN_SANITIZE := n
+KCSAN_SANITIZE := n
+UBSAN_SANITIZE := n
+KCOV_INSTRUMENT := n
+
+# Skip objtool checking for this directory because nVHE code is compiled with
+# non-standard build rules.
+OBJECT_FILES_NON_STANDARD := y
diff --git a/arch/arm64/kvm/hyp/nvhe/cache.S b/arch/arm64/kvm/hyp/nvhe/cache.S
new file mode 100644
index 000000000000..85936c17ae40
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/cache.S
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Code copied from arch/arm64/mm/cache.S.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+#include <asm/alternative.h>
+
+SYM_FUNC_START(__pi_dcache_clean_inval_poc)
+ dcache_by_line_op civac, sy, x0, x1, x2, x3
+ ret
+SYM_FUNC_END(__pi_dcache_clean_inval_poc)
+SYM_FUNC_ALIAS(dcache_clean_inval_poc, __pi_dcache_clean_inval_poc)
+
+SYM_FUNC_START(__pi_icache_inval_pou)
+alternative_if ARM64_HAS_CACHE_DIC
+ isb
+ ret
+alternative_else_nop_endif
+
+ invalidate_icache_by_line x0, x1, x2, x3
+ ret
+SYM_FUNC_END(__pi_icache_inval_pou)
+SYM_FUNC_ALIAS(icache_inval_pou, __pi_icache_inval_pou)
diff --git a/arch/arm64/kvm/hyp/nvhe/debug-sr.c b/arch/arm64/kvm/hyp/nvhe/debug-sr.c
new file mode 100644
index 000000000000..7746ea507b6f
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/debug-sr.c
@@ -0,0 +1,113 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2015 - ARM Ltd
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ */
+
+#include <hyp/debug-sr.h>
+
+#include <linux/compiler.h>
+#include <linux/kvm_host.h>
+
+#include <asm/debug-monitors.h>
+#include <asm/kvm_asm.h>
+#include <asm/kvm_hyp.h>
+#include <asm/kvm_mmu.h>
+
+static void __debug_save_spe(u64 *pmscr_el1)
+{
+ u64 reg;
+
+ /* Clear pmscr in case of early return */
+ *pmscr_el1 = 0;
+
+ /*
+ * At this point, we know that this CPU implements
+ * SPE and is available to the host.
+ * Check if the host is actually using it ?
+ */
+ reg = read_sysreg_s(SYS_PMBLIMITR_EL1);
+ if (!(reg & BIT(PMBLIMITR_EL1_E_SHIFT)))
+ return;
+
+ /* Yes; save the control register and disable data generation */
+ *pmscr_el1 = read_sysreg_el1(SYS_PMSCR);
+ write_sysreg_el1(0, SYS_PMSCR);
+ isb();
+
+ /* Now drain all buffered data to memory */
+ psb_csync();
+}
+
+static void __debug_restore_spe(u64 pmscr_el1)
+{
+ if (!pmscr_el1)
+ return;
+
+ /* The host page table is installed, but not yet synchronised */
+ isb();
+
+ /* Re-enable data generation */
+ write_sysreg_el1(pmscr_el1, SYS_PMSCR);
+}
+
+static void __debug_save_trace(u64 *trfcr_el1)
+{
+ *trfcr_el1 = 0;
+
+ /* Check if the TRBE is enabled */
+ if (!(read_sysreg_s(SYS_TRBLIMITR_EL1) & TRBLIMITR_EL1_E))
+ return;
+ /*
+ * Prohibit trace generation while we are in guest.
+ * Since access to TRFCR_EL1 is trapped, the guest can't
+ * modify the filtering set by the host.
+ */
+ *trfcr_el1 = read_sysreg_el1(SYS_TRFCR);
+ write_sysreg_el1(0, SYS_TRFCR);
+ isb();
+ /* Drain the trace buffer to memory */
+ tsb_csync();
+}
+
+static void __debug_restore_trace(u64 trfcr_el1)
+{
+ if (!trfcr_el1)
+ return;
+
+ /* Restore trace filter controls */
+ write_sysreg_el1(trfcr_el1, SYS_TRFCR);
+}
+
+void __debug_save_host_buffers_nvhe(struct kvm_vcpu *vcpu)
+{
+ /* Disable and flush SPE data generation */
+ if (vcpu_get_flag(vcpu, DEBUG_STATE_SAVE_SPE))
+ __debug_save_spe(&vcpu->arch.host_debug_state.pmscr_el1);
+ /* Disable and flush Self-Hosted Trace generation */
+ if (vcpu_get_flag(vcpu, DEBUG_STATE_SAVE_TRBE))
+ __debug_save_trace(&vcpu->arch.host_debug_state.trfcr_el1);
+}
+
+void __debug_switch_to_guest(struct kvm_vcpu *vcpu)
+{
+ __debug_switch_to_guest_common(vcpu);
+}
+
+void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu)
+{
+ if (vcpu_get_flag(vcpu, DEBUG_STATE_SAVE_SPE))
+ __debug_restore_spe(vcpu->arch.host_debug_state.pmscr_el1);
+ if (vcpu_get_flag(vcpu, DEBUG_STATE_SAVE_TRBE))
+ __debug_restore_trace(vcpu->arch.host_debug_state.trfcr_el1);
+}
+
+void __debug_switch_to_host(struct kvm_vcpu *vcpu)
+{
+ __debug_switch_to_host_common(vcpu);
+}
+
+u64 __kvm_get_mdcr_el2(void)
+{
+ return read_sysreg(mdcr_el2);
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/early_alloc.c b/arch/arm64/kvm/hyp/nvhe/early_alloc.c
new file mode 100644
index 000000000000..00de04153cc6
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/early_alloc.c
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 Google LLC
+ * Author: Quentin Perret <qperret@google.com>
+ */
+
+#include <asm/kvm_pgtable.h>
+
+#include <nvhe/early_alloc.h>
+#include <nvhe/memory.h>
+
+struct kvm_pgtable_mm_ops hyp_early_alloc_mm_ops;
+s64 __ro_after_init hyp_physvirt_offset;
+
+static unsigned long base;
+static unsigned long end;
+static unsigned long cur;
+
+unsigned long hyp_early_alloc_nr_used_pages(void)
+{
+ return (cur - base) >> PAGE_SHIFT;
+}
+
+void *hyp_early_alloc_contig(unsigned int nr_pages)
+{
+ unsigned long size = (nr_pages << PAGE_SHIFT);
+ void *ret = (void *)cur;
+
+ if (!nr_pages)
+ return NULL;
+
+ if (end - cur < size)
+ return NULL;
+
+ cur += size;
+ memset(ret, 0, size);
+
+ return ret;
+}
+
+void *hyp_early_alloc_page(void *arg)
+{
+ return hyp_early_alloc_contig(1);
+}
+
+static void hyp_early_alloc_get_page(void *addr) { }
+static void hyp_early_alloc_put_page(void *addr) { }
+
+void hyp_early_alloc_init(void *virt, unsigned long size)
+{
+ base = cur = (unsigned long)virt;
+ end = base + size;
+
+ hyp_early_alloc_mm_ops.zalloc_page = hyp_early_alloc_page;
+ hyp_early_alloc_mm_ops.phys_to_virt = hyp_phys_to_virt;
+ hyp_early_alloc_mm_ops.virt_to_phys = hyp_virt_to_phys;
+ hyp_early_alloc_mm_ops.get_page = hyp_early_alloc_get_page;
+ hyp_early_alloc_mm_ops.put_page = hyp_early_alloc_put_page;
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c
new file mode 100644
index 000000000000..320f2eaa14a9
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/ffa.c
@@ -0,0 +1,780 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * FF-A v1.0 proxy to filter out invalid memory-sharing SMC calls issued by
+ * the host. FF-A is a slightly more palatable abbreviation of "Arm Firmware
+ * Framework for Arm A-profile", which is specified by Arm in document
+ * number DEN0077.
+ *
+ * Copyright (C) 2022 - Google LLC
+ * Author: Andrew Walbran <qwandor@google.com>
+ *
+ * This driver hooks into the SMC trapping logic for the host and intercepts
+ * all calls falling within the FF-A range. Each call is either:
+ *
+ * - Forwarded on unmodified to the SPMD at EL3
+ * - Rejected as "unsupported"
+ * - Accompanied by a host stage-2 page-table check/update and reissued
+ *
+ * Consequently, any attempts by the host to make guest memory pages
+ * accessible to the secure world using FF-A will be detected either here
+ * (in the case that the memory is already owned by the guest) or during
+ * donation to the guest (in the case that the memory was previously shared
+ * with the secure world).
+ *
+ * To allow the rolling-back of page-table updates and FF-A calls in the
+ * event of failure, operations involving the RXTX buffers are locked for
+ * the duration and are therefore serialised.
+ */
+
+#include <linux/arm-smccc.h>
+#include <linux/arm_ffa.h>
+#include <asm/kvm_pkvm.h>
+
+#include <nvhe/ffa.h>
+#include <nvhe/mem_protect.h>
+#include <nvhe/memory.h>
+#include <nvhe/trap_handler.h>
+#include <nvhe/spinlock.h>
+
+/*
+ * "ID value 0 must be returned at the Non-secure physical FF-A instance"
+ * We share this ID with the host.
+ */
+#define HOST_FFA_ID 0
+
+/*
+ * A buffer to hold the maximum descriptor size we can see from the host,
+ * which is required when the SPMD returns a fragmented FFA_MEM_RETRIEVE_RESP
+ * when resolving the handle on the reclaim path.
+ */
+struct kvm_ffa_descriptor_buffer {
+ void *buf;
+ size_t len;
+};
+
+static struct kvm_ffa_descriptor_buffer ffa_desc_buf;
+
+struct kvm_ffa_buffers {
+ hyp_spinlock_t lock;
+ void *tx;
+ void *rx;
+};
+
+/*
+ * Note that we don't currently lock these buffers explicitly, instead
+ * relying on the locking of the host FFA buffers as we only have one
+ * client.
+ */
+static struct kvm_ffa_buffers hyp_buffers;
+static struct kvm_ffa_buffers host_buffers;
+
+static void ffa_to_smccc_error(struct arm_smccc_res *res, u64 ffa_errno)
+{
+ *res = (struct arm_smccc_res) {
+ .a0 = FFA_ERROR,
+ .a2 = ffa_errno,
+ };
+}
+
+static void ffa_to_smccc_res_prop(struct arm_smccc_res *res, int ret, u64 prop)
+{
+ if (ret == FFA_RET_SUCCESS) {
+ *res = (struct arm_smccc_res) { .a0 = FFA_SUCCESS,
+ .a2 = prop };
+ } else {
+ ffa_to_smccc_error(res, ret);
+ }
+}
+
+static void ffa_to_smccc_res(struct arm_smccc_res *res, int ret)
+{
+ ffa_to_smccc_res_prop(res, ret, 0);
+}
+
+static void ffa_set_retval(struct kvm_cpu_context *ctxt,
+ struct arm_smccc_res *res)
+{
+ cpu_reg(ctxt, 0) = res->a0;
+ cpu_reg(ctxt, 1) = res->a1;
+ cpu_reg(ctxt, 2) = res->a2;
+ cpu_reg(ctxt, 3) = res->a3;
+}
+
+static bool is_ffa_call(u64 func_id)
+{
+ return ARM_SMCCC_IS_FAST_CALL(func_id) &&
+ ARM_SMCCC_OWNER_NUM(func_id) == ARM_SMCCC_OWNER_STANDARD &&
+ ARM_SMCCC_FUNC_NUM(func_id) >= FFA_MIN_FUNC_NUM &&
+ ARM_SMCCC_FUNC_NUM(func_id) <= FFA_MAX_FUNC_NUM;
+}
+
+static int ffa_map_hyp_buffers(u64 ffa_page_count)
+{
+ struct arm_smccc_res res;
+
+ arm_smccc_1_1_smc(FFA_FN64_RXTX_MAP,
+ hyp_virt_to_phys(hyp_buffers.tx),
+ hyp_virt_to_phys(hyp_buffers.rx),
+ ffa_page_count,
+ 0, 0, 0, 0,
+ &res);
+
+ return res.a0 == FFA_SUCCESS ? FFA_RET_SUCCESS : res.a2;
+}
+
+static int ffa_unmap_hyp_buffers(void)
+{
+ struct arm_smccc_res res;
+
+ arm_smccc_1_1_smc(FFA_RXTX_UNMAP,
+ HOST_FFA_ID,
+ 0, 0, 0, 0, 0, 0,
+ &res);
+
+ return res.a0 == FFA_SUCCESS ? FFA_RET_SUCCESS : res.a2;
+}
+
+static void ffa_mem_frag_tx(struct arm_smccc_res *res, u32 handle_lo,
+ u32 handle_hi, u32 fraglen, u32 endpoint_id)
+{
+ arm_smccc_1_1_smc(FFA_MEM_FRAG_TX,
+ handle_lo, handle_hi, fraglen, endpoint_id,
+ 0, 0, 0,
+ res);
+}
+
+static void ffa_mem_frag_rx(struct arm_smccc_res *res, u32 handle_lo,
+ u32 handle_hi, u32 fragoff)
+{
+ arm_smccc_1_1_smc(FFA_MEM_FRAG_RX,
+ handle_lo, handle_hi, fragoff, HOST_FFA_ID,
+ 0, 0, 0,
+ res);
+}
+
+static void ffa_mem_xfer(struct arm_smccc_res *res, u64 func_id, u32 len,
+ u32 fraglen)
+{
+ arm_smccc_1_1_smc(func_id, len, fraglen,
+ 0, 0, 0, 0, 0,
+ res);
+}
+
+static void ffa_mem_reclaim(struct arm_smccc_res *res, u32 handle_lo,
+ u32 handle_hi, u32 flags)
+{
+ arm_smccc_1_1_smc(FFA_MEM_RECLAIM,
+ handle_lo, handle_hi, flags,
+ 0, 0, 0, 0,
+ res);
+}
+
+static void ffa_retrieve_req(struct arm_smccc_res *res, u32 len)
+{
+ arm_smccc_1_1_smc(FFA_FN64_MEM_RETRIEVE_REQ,
+ len, len,
+ 0, 0, 0, 0, 0,
+ res);
+}
+
+static void do_ffa_rxtx_map(struct arm_smccc_res *res,
+ struct kvm_cpu_context *ctxt)
+{
+ DECLARE_REG(phys_addr_t, tx, ctxt, 1);
+ DECLARE_REG(phys_addr_t, rx, ctxt, 2);
+ DECLARE_REG(u32, npages, ctxt, 3);
+ int ret = 0;
+ void *rx_virt, *tx_virt;
+
+ if (npages != (KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE) / FFA_PAGE_SIZE) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto out;
+ }
+
+ if (!PAGE_ALIGNED(tx) || !PAGE_ALIGNED(rx)) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto out;
+ }
+
+ hyp_spin_lock(&host_buffers.lock);
+ if (host_buffers.tx) {
+ ret = FFA_RET_DENIED;
+ goto out_unlock;
+ }
+
+ /*
+ * Map our hypervisor buffers into the SPMD before mapping and
+ * pinning the host buffers in our own address space.
+ */
+ ret = ffa_map_hyp_buffers(npages);
+ if (ret)
+ goto out_unlock;
+
+ ret = __pkvm_host_share_hyp(hyp_phys_to_pfn(tx));
+ if (ret) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto err_unmap;
+ }
+
+ ret = __pkvm_host_share_hyp(hyp_phys_to_pfn(rx));
+ if (ret) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto err_unshare_tx;
+ }
+
+ tx_virt = hyp_phys_to_virt(tx);
+ ret = hyp_pin_shared_mem(tx_virt, tx_virt + 1);
+ if (ret) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto err_unshare_rx;
+ }
+
+ rx_virt = hyp_phys_to_virt(rx);
+ ret = hyp_pin_shared_mem(rx_virt, rx_virt + 1);
+ if (ret) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto err_unpin_tx;
+ }
+
+ host_buffers.tx = tx_virt;
+ host_buffers.rx = rx_virt;
+
+out_unlock:
+ hyp_spin_unlock(&host_buffers.lock);
+out:
+ ffa_to_smccc_res(res, ret);
+ return;
+
+err_unpin_tx:
+ hyp_unpin_shared_mem(tx_virt, tx_virt + 1);
+err_unshare_rx:
+ __pkvm_host_unshare_hyp(hyp_phys_to_pfn(rx));
+err_unshare_tx:
+ __pkvm_host_unshare_hyp(hyp_phys_to_pfn(tx));
+err_unmap:
+ ffa_unmap_hyp_buffers();
+ goto out_unlock;
+}
+
+static void do_ffa_rxtx_unmap(struct arm_smccc_res *res,
+ struct kvm_cpu_context *ctxt)
+{
+ DECLARE_REG(u32, id, ctxt, 1);
+ int ret = 0;
+
+ if (id != HOST_FFA_ID) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto out;
+ }
+
+ hyp_spin_lock(&host_buffers.lock);
+ if (!host_buffers.tx) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto out_unlock;
+ }
+
+ hyp_unpin_shared_mem(host_buffers.tx, host_buffers.tx + 1);
+ WARN_ON(__pkvm_host_unshare_hyp(hyp_virt_to_pfn(host_buffers.tx)));
+ host_buffers.tx = NULL;
+
+ hyp_unpin_shared_mem(host_buffers.rx, host_buffers.rx + 1);
+ WARN_ON(__pkvm_host_unshare_hyp(hyp_virt_to_pfn(host_buffers.rx)));
+ host_buffers.rx = NULL;
+
+ ffa_unmap_hyp_buffers();
+
+out_unlock:
+ hyp_spin_unlock(&host_buffers.lock);
+out:
+ ffa_to_smccc_res(res, ret);
+}
+
+static u32 __ffa_host_share_ranges(struct ffa_mem_region_addr_range *ranges,
+ u32 nranges)
+{
+ u32 i;
+
+ for (i = 0; i < nranges; ++i) {
+ struct ffa_mem_region_addr_range *range = &ranges[i];
+ u64 sz = (u64)range->pg_cnt * FFA_PAGE_SIZE;
+ u64 pfn = hyp_phys_to_pfn(range->address);
+
+ if (!PAGE_ALIGNED(sz))
+ break;
+
+ if (__pkvm_host_share_ffa(pfn, sz / PAGE_SIZE))
+ break;
+ }
+
+ return i;
+}
+
+static u32 __ffa_host_unshare_ranges(struct ffa_mem_region_addr_range *ranges,
+ u32 nranges)
+{
+ u32 i;
+
+ for (i = 0; i < nranges; ++i) {
+ struct ffa_mem_region_addr_range *range = &ranges[i];
+ u64 sz = (u64)range->pg_cnt * FFA_PAGE_SIZE;
+ u64 pfn = hyp_phys_to_pfn(range->address);
+
+ if (!PAGE_ALIGNED(sz))
+ break;
+
+ if (__pkvm_host_unshare_ffa(pfn, sz / PAGE_SIZE))
+ break;
+ }
+
+ return i;
+}
+
+static int ffa_host_share_ranges(struct ffa_mem_region_addr_range *ranges,
+ u32 nranges)
+{
+ u32 nshared = __ffa_host_share_ranges(ranges, nranges);
+ int ret = 0;
+
+ if (nshared != nranges) {
+ WARN_ON(__ffa_host_unshare_ranges(ranges, nshared) != nshared);
+ ret = FFA_RET_DENIED;
+ }
+
+ return ret;
+}
+
+static int ffa_host_unshare_ranges(struct ffa_mem_region_addr_range *ranges,
+ u32 nranges)
+{
+ u32 nunshared = __ffa_host_unshare_ranges(ranges, nranges);
+ int ret = 0;
+
+ if (nunshared != nranges) {
+ WARN_ON(__ffa_host_share_ranges(ranges, nunshared) != nunshared);
+ ret = FFA_RET_DENIED;
+ }
+
+ return ret;
+}
+
+static void do_ffa_mem_frag_tx(struct arm_smccc_res *res,
+ struct kvm_cpu_context *ctxt)
+{
+ DECLARE_REG(u32, handle_lo, ctxt, 1);
+ DECLARE_REG(u32, handle_hi, ctxt, 2);
+ DECLARE_REG(u32, fraglen, ctxt, 3);
+ DECLARE_REG(u32, endpoint_id, ctxt, 4);
+ struct ffa_mem_region_addr_range *buf;
+ int ret = FFA_RET_INVALID_PARAMETERS;
+ u32 nr_ranges;
+
+ if (fraglen > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE)
+ goto out;
+
+ if (fraglen % sizeof(*buf))
+ goto out;
+
+ hyp_spin_lock(&host_buffers.lock);
+ if (!host_buffers.tx)
+ goto out_unlock;
+
+ buf = hyp_buffers.tx;
+ memcpy(buf, host_buffers.tx, fraglen);
+ nr_ranges = fraglen / sizeof(*buf);
+
+ ret = ffa_host_share_ranges(buf, nr_ranges);
+ if (ret) {
+ /*
+ * We're effectively aborting the transaction, so we need
+ * to restore the global state back to what it was prior to
+ * transmission of the first fragment.
+ */
+ ffa_mem_reclaim(res, handle_lo, handle_hi, 0);
+ WARN_ON(res->a0 != FFA_SUCCESS);
+ goto out_unlock;
+ }
+
+ ffa_mem_frag_tx(res, handle_lo, handle_hi, fraglen, endpoint_id);
+ if (res->a0 != FFA_SUCCESS && res->a0 != FFA_MEM_FRAG_RX)
+ WARN_ON(ffa_host_unshare_ranges(buf, nr_ranges));
+
+out_unlock:
+ hyp_spin_unlock(&host_buffers.lock);
+out:
+ if (ret)
+ ffa_to_smccc_res(res, ret);
+
+ /*
+ * If for any reason this did not succeed, we're in trouble as we have
+ * now lost the content of the previous fragments and we can't rollback
+ * the host stage-2 changes. The pages previously marked as shared will
+ * remain stuck in that state forever, hence preventing the host from
+ * sharing/donating them again and may possibly lead to subsequent
+ * failures, but this will not compromise confidentiality.
+ */
+ return;
+}
+
+static __always_inline void do_ffa_mem_xfer(const u64 func_id,
+ struct arm_smccc_res *res,
+ struct kvm_cpu_context *ctxt)
+{
+ DECLARE_REG(u32, len, ctxt, 1);
+ DECLARE_REG(u32, fraglen, ctxt, 2);
+ DECLARE_REG(u64, addr_mbz, ctxt, 3);
+ DECLARE_REG(u32, npages_mbz, ctxt, 4);
+ struct ffa_mem_region_attributes *ep_mem_access;
+ struct ffa_composite_mem_region *reg;
+ struct ffa_mem_region *buf;
+ u32 offset, nr_ranges;
+ int ret = 0;
+
+ BUILD_BUG_ON(func_id != FFA_FN64_MEM_SHARE &&
+ func_id != FFA_FN64_MEM_LEND);
+
+ if (addr_mbz || npages_mbz || fraglen > len ||
+ fraglen > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto out;
+ }
+
+ if (fraglen < sizeof(struct ffa_mem_region) +
+ sizeof(struct ffa_mem_region_attributes)) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto out;
+ }
+
+ hyp_spin_lock(&host_buffers.lock);
+ if (!host_buffers.tx) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto out_unlock;
+ }
+
+ buf = hyp_buffers.tx;
+ memcpy(buf, host_buffers.tx, fraglen);
+
+ ep_mem_access = (void *)buf +
+ ffa_mem_desc_offset(buf, 0, FFA_VERSION_1_0);
+ offset = ep_mem_access->composite_off;
+ if (!offset || buf->ep_count != 1 || buf->sender_id != HOST_FFA_ID) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto out_unlock;
+ }
+
+ if (fraglen < offset + sizeof(struct ffa_composite_mem_region)) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto out_unlock;
+ }
+
+ reg = (void *)buf + offset;
+ nr_ranges = ((void *)buf + fraglen) - (void *)reg->constituents;
+ if (nr_ranges % sizeof(reg->constituents[0])) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto out_unlock;
+ }
+
+ nr_ranges /= sizeof(reg->constituents[0]);
+ ret = ffa_host_share_ranges(reg->constituents, nr_ranges);
+ if (ret)
+ goto out_unlock;
+
+ ffa_mem_xfer(res, func_id, len, fraglen);
+ if (fraglen != len) {
+ if (res->a0 != FFA_MEM_FRAG_RX)
+ goto err_unshare;
+
+ if (res->a3 != fraglen)
+ goto err_unshare;
+ } else if (res->a0 != FFA_SUCCESS) {
+ goto err_unshare;
+ }
+
+out_unlock:
+ hyp_spin_unlock(&host_buffers.lock);
+out:
+ if (ret)
+ ffa_to_smccc_res(res, ret);
+ return;
+
+err_unshare:
+ WARN_ON(ffa_host_unshare_ranges(reg->constituents, nr_ranges));
+ goto out_unlock;
+}
+
+static void do_ffa_mem_reclaim(struct arm_smccc_res *res,
+ struct kvm_cpu_context *ctxt)
+{
+ DECLARE_REG(u32, handle_lo, ctxt, 1);
+ DECLARE_REG(u32, handle_hi, ctxt, 2);
+ DECLARE_REG(u32, flags, ctxt, 3);
+ struct ffa_mem_region_attributes *ep_mem_access;
+ struct ffa_composite_mem_region *reg;
+ u32 offset, len, fraglen, fragoff;
+ struct ffa_mem_region *buf;
+ int ret = 0;
+ u64 handle;
+
+ handle = PACK_HANDLE(handle_lo, handle_hi);
+
+ hyp_spin_lock(&host_buffers.lock);
+
+ buf = hyp_buffers.tx;
+ *buf = (struct ffa_mem_region) {
+ .sender_id = HOST_FFA_ID,
+ .handle = handle,
+ };
+
+ ffa_retrieve_req(res, sizeof(*buf));
+ buf = hyp_buffers.rx;
+ if (res->a0 != FFA_MEM_RETRIEVE_RESP)
+ goto out_unlock;
+
+ len = res->a1;
+ fraglen = res->a2;
+
+ ep_mem_access = (void *)buf +
+ ffa_mem_desc_offset(buf, 0, FFA_VERSION_1_0);
+ offset = ep_mem_access->composite_off;
+ /*
+ * We can trust the SPMD to get this right, but let's at least
+ * check that we end up with something that doesn't look _completely_
+ * bogus.
+ */
+ if (WARN_ON(offset > len ||
+ fraglen > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE)) {
+ ret = FFA_RET_ABORTED;
+ goto out_unlock;
+ }
+
+ if (len > ffa_desc_buf.len) {
+ ret = FFA_RET_NO_MEMORY;
+ goto out_unlock;
+ }
+
+ buf = ffa_desc_buf.buf;
+ memcpy(buf, hyp_buffers.rx, fraglen);
+
+ for (fragoff = fraglen; fragoff < len; fragoff += fraglen) {
+ ffa_mem_frag_rx(res, handle_lo, handle_hi, fragoff);
+ if (res->a0 != FFA_MEM_FRAG_TX) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto out_unlock;
+ }
+
+ fraglen = res->a3;
+ memcpy((void *)buf + fragoff, hyp_buffers.rx, fraglen);
+ }
+
+ ffa_mem_reclaim(res, handle_lo, handle_hi, flags);
+ if (res->a0 != FFA_SUCCESS)
+ goto out_unlock;
+
+ reg = (void *)buf + offset;
+ /* If the SPMD was happy, then we should be too. */
+ WARN_ON(ffa_host_unshare_ranges(reg->constituents,
+ reg->addr_range_cnt));
+out_unlock:
+ hyp_spin_unlock(&host_buffers.lock);
+
+ if (ret)
+ ffa_to_smccc_res(res, ret);
+}
+
+/*
+ * Is a given FFA function supported, either by forwarding on directly
+ * or by handling at EL2?
+ */
+static bool ffa_call_supported(u64 func_id)
+{
+ switch (func_id) {
+ /* Unsupported memory management calls */
+ case FFA_FN64_MEM_RETRIEVE_REQ:
+ case FFA_MEM_RETRIEVE_RESP:
+ case FFA_MEM_RELINQUISH:
+ case FFA_MEM_OP_PAUSE:
+ case FFA_MEM_OP_RESUME:
+ case FFA_MEM_FRAG_RX:
+ case FFA_FN64_MEM_DONATE:
+ /* Indirect message passing via RX/TX buffers */
+ case FFA_MSG_SEND:
+ case FFA_MSG_POLL:
+ case FFA_MSG_WAIT:
+ /* 32-bit variants of 64-bit calls */
+ case FFA_MSG_SEND_DIRECT_REQ:
+ case FFA_MSG_SEND_DIRECT_RESP:
+ case FFA_RXTX_MAP:
+ case FFA_MEM_DONATE:
+ case FFA_MEM_RETRIEVE_REQ:
+ return false;
+ }
+
+ return true;
+}
+
+static bool do_ffa_features(struct arm_smccc_res *res,
+ struct kvm_cpu_context *ctxt)
+{
+ DECLARE_REG(u32, id, ctxt, 1);
+ u64 prop = 0;
+ int ret = 0;
+
+ if (!ffa_call_supported(id)) {
+ ret = FFA_RET_NOT_SUPPORTED;
+ goto out_handled;
+ }
+
+ switch (id) {
+ case FFA_MEM_SHARE:
+ case FFA_FN64_MEM_SHARE:
+ case FFA_MEM_LEND:
+ case FFA_FN64_MEM_LEND:
+ ret = FFA_RET_SUCCESS;
+ prop = 0; /* No support for dynamic buffers */
+ goto out_handled;
+ default:
+ return false;
+ }
+
+out_handled:
+ ffa_to_smccc_res_prop(res, ret, prop);
+ return true;
+}
+
+bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt, u32 func_id)
+{
+ struct arm_smccc_res res;
+
+ /*
+ * There's no way we can tell what a non-standard SMC call might
+ * be up to. Ideally, we would terminate these here and return
+ * an error to the host, but sadly devices make use of custom
+ * firmware calls for things like power management, debugging,
+ * RNG access and crash reporting.
+ *
+ * Given that the architecture requires us to trust EL3 anyway,
+ * we forward unrecognised calls on under the assumption that
+ * the firmware doesn't expose a mechanism to access arbitrary
+ * non-secure memory. Short of a per-device table of SMCs, this
+ * is the best we can do.
+ */
+ if (!is_ffa_call(func_id))
+ return false;
+
+ switch (func_id) {
+ case FFA_FEATURES:
+ if (!do_ffa_features(&res, host_ctxt))
+ return false;
+ goto out_handled;
+ /* Memory management */
+ case FFA_FN64_RXTX_MAP:
+ do_ffa_rxtx_map(&res, host_ctxt);
+ goto out_handled;
+ case FFA_RXTX_UNMAP:
+ do_ffa_rxtx_unmap(&res, host_ctxt);
+ goto out_handled;
+ case FFA_MEM_SHARE:
+ case FFA_FN64_MEM_SHARE:
+ do_ffa_mem_xfer(FFA_FN64_MEM_SHARE, &res, host_ctxt);
+ goto out_handled;
+ case FFA_MEM_RECLAIM:
+ do_ffa_mem_reclaim(&res, host_ctxt);
+ goto out_handled;
+ case FFA_MEM_LEND:
+ case FFA_FN64_MEM_LEND:
+ do_ffa_mem_xfer(FFA_FN64_MEM_LEND, &res, host_ctxt);
+ goto out_handled;
+ case FFA_MEM_FRAG_TX:
+ do_ffa_mem_frag_tx(&res, host_ctxt);
+ goto out_handled;
+ }
+
+ if (ffa_call_supported(func_id))
+ return false; /* Pass through */
+
+ ffa_to_smccc_error(&res, FFA_RET_NOT_SUPPORTED);
+out_handled:
+ ffa_set_retval(host_ctxt, &res);
+ return true;
+}
+
+int hyp_ffa_init(void *pages)
+{
+ struct arm_smccc_res res;
+ size_t min_rxtx_sz;
+ void *tx, *rx;
+
+ if (kvm_host_psci_config.smccc_version < ARM_SMCCC_VERSION_1_2)
+ return 0;
+
+ arm_smccc_1_1_smc(FFA_VERSION, FFA_VERSION_1_0, 0, 0, 0, 0, 0, 0, &res);
+ if (res.a0 == FFA_RET_NOT_SUPPORTED)
+ return 0;
+
+ /*
+ * Firmware returns the maximum supported version of the FF-A
+ * implementation. Check that the returned version is
+ * backwards-compatible with the hyp according to the rules in DEN0077A
+ * v1.1 REL0 13.2.1.
+ *
+ * Of course, things are never simple when dealing with firmware. v1.1
+ * broke ABI with v1.0 on several structures, which is itself
+ * incompatible with the aforementioned versioning scheme. The
+ * expectation is that v1.x implementations that do not support the v1.0
+ * ABI return NOT_SUPPORTED rather than a version number, according to
+ * DEN0077A v1.1 REL0 18.6.4.
+ */
+ if (FFA_MAJOR_VERSION(res.a0) != 1)
+ return -EOPNOTSUPP;
+
+ arm_smccc_1_1_smc(FFA_ID_GET, 0, 0, 0, 0, 0, 0, 0, &res);
+ if (res.a0 != FFA_SUCCESS)
+ return -EOPNOTSUPP;
+
+ if (res.a2 != HOST_FFA_ID)
+ return -EINVAL;
+
+ arm_smccc_1_1_smc(FFA_FEATURES, FFA_FN64_RXTX_MAP,
+ 0, 0, 0, 0, 0, 0, &res);
+ if (res.a0 != FFA_SUCCESS)
+ return -EOPNOTSUPP;
+
+ switch (res.a2) {
+ case FFA_FEAT_RXTX_MIN_SZ_4K:
+ min_rxtx_sz = SZ_4K;
+ break;
+ case FFA_FEAT_RXTX_MIN_SZ_16K:
+ min_rxtx_sz = SZ_16K;
+ break;
+ case FFA_FEAT_RXTX_MIN_SZ_64K:
+ min_rxtx_sz = SZ_64K;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (min_rxtx_sz > PAGE_SIZE)
+ return -EOPNOTSUPP;
+
+ tx = pages;
+ pages += KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE;
+ rx = pages;
+ pages += KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE;
+
+ ffa_desc_buf = (struct kvm_ffa_descriptor_buffer) {
+ .buf = pages,
+ .len = PAGE_SIZE *
+ (hyp_ffa_proxy_pages() - (2 * KVM_FFA_MBOX_NR_PAGES)),
+ };
+
+ hyp_buffers = (struct kvm_ffa_buffers) {
+ .lock = __HYP_SPIN_LOCK_UNLOCKED,
+ .tx = tx,
+ .rx = rx,
+ };
+
+ host_buffers = (struct kvm_ffa_buffers) {
+ .lock = __HYP_SPIN_LOCK_UNLOCKED,
+ };
+
+ return 0;
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/gen-hyprel.c b/arch/arm64/kvm/hyp/nvhe/gen-hyprel.c
new file mode 100644
index 000000000000..6bc88a756cb7
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/gen-hyprel.c
@@ -0,0 +1,456 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 - Google LLC
+ * Author: David Brazdil <dbrazdil@google.com>
+ *
+ * Generates relocation information used by the kernel to convert
+ * absolute addresses in hyp data from kernel VAs to hyp VAs.
+ *
+ * This is necessary because hyp code is linked into the same binary
+ * as the kernel but executes under different memory mappings.
+ * If the compiler used absolute addressing, those addresses need to
+ * be converted before they are used by hyp code.
+ *
+ * The input of this program is the relocatable ELF object containing
+ * all hyp code/data, not yet linked into vmlinux. Hyp section names
+ * should have been prefixed with `.hyp` at this point.
+ *
+ * The output (printed to stdout) is an assembly file containing
+ * an array of 32-bit integers and static relocations that instruct
+ * the linker of `vmlinux` to populate the array entries with offsets
+ * to positions in the kernel binary containing VAs used by hyp code.
+ *
+ * Note that dynamic relocations could be used for the same purpose.
+ * However, those are only generated if CONFIG_RELOCATABLE=y.
+ */
+
+#include <elf.h>
+#include <endian.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <generated/autoconf.h>
+
+#define HYP_SECTION_PREFIX ".hyp"
+#define HYP_RELOC_SECTION ".hyp.reloc"
+#define HYP_SECTION_SYMBOL_PREFIX "__hyp_section_"
+
+/*
+ * AArch64 relocation type constants.
+ * Included in case these are not defined in the host toolchain.
+ */
+#ifndef R_AARCH64_ABS64
+#define R_AARCH64_ABS64 257
+#endif
+#ifndef R_AARCH64_PREL64
+#define R_AARCH64_PREL64 260
+#endif
+#ifndef R_AARCH64_PREL32
+#define R_AARCH64_PREL32 261
+#endif
+#ifndef R_AARCH64_PREL16
+#define R_AARCH64_PREL16 262
+#endif
+#ifndef R_AARCH64_PLT32
+#define R_AARCH64_PLT32 314
+#endif
+#ifndef R_AARCH64_LD_PREL_LO19
+#define R_AARCH64_LD_PREL_LO19 273
+#endif
+#ifndef R_AARCH64_ADR_PREL_LO21
+#define R_AARCH64_ADR_PREL_LO21 274
+#endif
+#ifndef R_AARCH64_ADR_PREL_PG_HI21
+#define R_AARCH64_ADR_PREL_PG_HI21 275
+#endif
+#ifndef R_AARCH64_ADR_PREL_PG_HI21_NC
+#define R_AARCH64_ADR_PREL_PG_HI21_NC 276
+#endif
+#ifndef R_AARCH64_ADD_ABS_LO12_NC
+#define R_AARCH64_ADD_ABS_LO12_NC 277
+#endif
+#ifndef R_AARCH64_LDST8_ABS_LO12_NC
+#define R_AARCH64_LDST8_ABS_LO12_NC 278
+#endif
+#ifndef R_AARCH64_TSTBR14
+#define R_AARCH64_TSTBR14 279
+#endif
+#ifndef R_AARCH64_CONDBR19
+#define R_AARCH64_CONDBR19 280
+#endif
+#ifndef R_AARCH64_JUMP26
+#define R_AARCH64_JUMP26 282
+#endif
+#ifndef R_AARCH64_CALL26
+#define R_AARCH64_CALL26 283
+#endif
+#ifndef R_AARCH64_LDST16_ABS_LO12_NC
+#define R_AARCH64_LDST16_ABS_LO12_NC 284
+#endif
+#ifndef R_AARCH64_LDST32_ABS_LO12_NC
+#define R_AARCH64_LDST32_ABS_LO12_NC 285
+#endif
+#ifndef R_AARCH64_LDST64_ABS_LO12_NC
+#define R_AARCH64_LDST64_ABS_LO12_NC 286
+#endif
+#ifndef R_AARCH64_MOVW_PREL_G0
+#define R_AARCH64_MOVW_PREL_G0 287
+#endif
+#ifndef R_AARCH64_MOVW_PREL_G0_NC
+#define R_AARCH64_MOVW_PREL_G0_NC 288
+#endif
+#ifndef R_AARCH64_MOVW_PREL_G1
+#define R_AARCH64_MOVW_PREL_G1 289
+#endif
+#ifndef R_AARCH64_MOVW_PREL_G1_NC
+#define R_AARCH64_MOVW_PREL_G1_NC 290
+#endif
+#ifndef R_AARCH64_MOVW_PREL_G2
+#define R_AARCH64_MOVW_PREL_G2 291
+#endif
+#ifndef R_AARCH64_MOVW_PREL_G2_NC
+#define R_AARCH64_MOVW_PREL_G2_NC 292
+#endif
+#ifndef R_AARCH64_MOVW_PREL_G3
+#define R_AARCH64_MOVW_PREL_G3 293
+#endif
+#ifndef R_AARCH64_LDST128_ABS_LO12_NC
+#define R_AARCH64_LDST128_ABS_LO12_NC 299
+#endif
+
+/* Global state of the processed ELF. */
+static struct {
+ const char *path;
+ char *begin;
+ size_t size;
+ Elf64_Ehdr *ehdr;
+ Elf64_Shdr *sh_table;
+ const char *sh_string;
+} elf;
+
+#if defined(CONFIG_CPU_LITTLE_ENDIAN)
+
+#define elf16toh(x) le16toh(x)
+#define elf32toh(x) le32toh(x)
+#define elf64toh(x) le64toh(x)
+
+#define ELFENDIAN ELFDATA2LSB
+
+#elif defined(CONFIG_CPU_BIG_ENDIAN)
+
+#define elf16toh(x) be16toh(x)
+#define elf32toh(x) be32toh(x)
+#define elf64toh(x) be64toh(x)
+
+#define ELFENDIAN ELFDATA2MSB
+
+#else
+
+#error PDP-endian sadly unsupported...
+
+#endif
+
+#define fatal_error(fmt, ...) \
+ ({ \
+ fprintf(stderr, "error: %s: " fmt "\n", \
+ elf.path, ## __VA_ARGS__); \
+ exit(EXIT_FAILURE); \
+ __builtin_unreachable(); \
+ })
+
+#define fatal_perror(msg) \
+ ({ \
+ fprintf(stderr, "error: %s: " msg ": %s\n", \
+ elf.path, strerror(errno)); \
+ exit(EXIT_FAILURE); \
+ __builtin_unreachable(); \
+ })
+
+#define assert_op(lhs, rhs, fmt, op) \
+ ({ \
+ typeof(lhs) _lhs = (lhs); \
+ typeof(rhs) _rhs = (rhs); \
+ \
+ if (!(_lhs op _rhs)) { \
+ fatal_error("assertion " #lhs " " #op " " #rhs \
+ " failed (lhs=" fmt ", rhs=" fmt \
+ ", line=%d)", _lhs, _rhs, __LINE__); \
+ } \
+ })
+
+#define assert_eq(lhs, rhs, fmt) assert_op(lhs, rhs, fmt, ==)
+#define assert_ne(lhs, rhs, fmt) assert_op(lhs, rhs, fmt, !=)
+#define assert_lt(lhs, rhs, fmt) assert_op(lhs, rhs, fmt, <)
+#define assert_ge(lhs, rhs, fmt) assert_op(lhs, rhs, fmt, >=)
+
+/*
+ * Return a pointer of a given type at a given offset from
+ * the beginning of the ELF file.
+ */
+#define elf_ptr(type, off) ((type *)(elf.begin + (off)))
+
+/* Iterate over all sections in the ELF. */
+#define for_each_section(var) \
+ for (var = elf.sh_table; var < elf.sh_table + elf16toh(elf.ehdr->e_shnum); ++var)
+
+/* Iterate over all Elf64_Rela relocations in a given section. */
+#define for_each_rela(shdr, var) \
+ for (var = elf_ptr(Elf64_Rela, elf64toh(shdr->sh_offset)); \
+ var < elf_ptr(Elf64_Rela, elf64toh(shdr->sh_offset) + elf64toh(shdr->sh_size)); var++)
+
+/* True if a string starts with a given prefix. */
+static inline bool starts_with(const char *str, const char *prefix)
+{
+ return memcmp(str, prefix, strlen(prefix)) == 0;
+}
+
+/* Returns a string containing the name of a given section. */
+static inline const char *section_name(Elf64_Shdr *shdr)
+{
+ return elf.sh_string + elf32toh(shdr->sh_name);
+}
+
+/* Returns a pointer to the first byte of section data. */
+static inline const char *section_begin(Elf64_Shdr *shdr)
+{
+ return elf_ptr(char, elf64toh(shdr->sh_offset));
+}
+
+/* Find a section by its offset from the beginning of the file. */
+static inline Elf64_Shdr *section_by_off(Elf64_Off off)
+{
+ assert_ne(off, 0UL, "%lu");
+ return elf_ptr(Elf64_Shdr, off);
+}
+
+/* Find a section by its index. */
+static inline Elf64_Shdr *section_by_idx(uint16_t idx)
+{
+ assert_ne(idx, SHN_UNDEF, "%u");
+ return &elf.sh_table[idx];
+}
+
+/*
+ * Memory-map the given ELF file, perform sanity checks, and
+ * populate global state.
+ */
+static void init_elf(const char *path)
+{
+ int fd, ret;
+ struct stat stat;
+
+ /* Store path in the global struct for error printing. */
+ elf.path = path;
+
+ /* Open the ELF file. */
+ fd = open(path, O_RDONLY);
+ if (fd < 0)
+ fatal_perror("Could not open ELF file");
+
+ /* Get status of ELF file to obtain its size. */
+ ret = fstat(fd, &stat);
+ if (ret < 0) {
+ close(fd);
+ fatal_perror("Could not get status of ELF file");
+ }
+
+ /* mmap() the entire ELF file read-only at an arbitrary address. */
+ elf.begin = mmap(0, stat.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
+ if (elf.begin == MAP_FAILED) {
+ close(fd);
+ fatal_perror("Could not mmap ELF file");
+ }
+
+ /* mmap() was successful, close the FD. */
+ close(fd);
+
+ /* Get pointer to the ELF header. */
+ assert_ge(stat.st_size, sizeof(*elf.ehdr), "%lu");
+ elf.ehdr = elf_ptr(Elf64_Ehdr, 0);
+
+ /* Check the ELF magic. */
+ assert_eq(elf.ehdr->e_ident[EI_MAG0], ELFMAG0, "0x%x");
+ assert_eq(elf.ehdr->e_ident[EI_MAG1], ELFMAG1, "0x%x");
+ assert_eq(elf.ehdr->e_ident[EI_MAG2], ELFMAG2, "0x%x");
+ assert_eq(elf.ehdr->e_ident[EI_MAG3], ELFMAG3, "0x%x");
+
+ /* Sanity check that this is an ELF64 relocatable object for AArch64. */
+ assert_eq(elf.ehdr->e_ident[EI_CLASS], ELFCLASS64, "%u");
+ assert_eq(elf.ehdr->e_ident[EI_DATA], ELFENDIAN, "%u");
+ assert_eq(elf16toh(elf.ehdr->e_type), ET_REL, "%u");
+ assert_eq(elf16toh(elf.ehdr->e_machine), EM_AARCH64, "%u");
+
+ /* Populate fields of the global struct. */
+ elf.sh_table = section_by_off(elf64toh(elf.ehdr->e_shoff));
+ elf.sh_string = section_begin(section_by_idx(elf16toh(elf.ehdr->e_shstrndx)));
+}
+
+/* Print the prologue of the output ASM file. */
+static void emit_prologue(void)
+{
+ printf(".data\n"
+ ".pushsection " HYP_RELOC_SECTION ", \"a\"\n");
+}
+
+/* Print ASM statements needed as a prologue to a processed hyp section. */
+static void emit_section_prologue(const char *sh_orig_name)
+{
+ /* Declare the hyp section symbol. */
+ printf(".global %s%s\n", HYP_SECTION_SYMBOL_PREFIX, sh_orig_name);
+}
+
+/*
+ * Print ASM statements to create a hyp relocation entry for a given
+ * R_AARCH64_ABS64 relocation.
+ *
+ * The linker of vmlinux will populate the position given by `rela` with
+ * an absolute 64-bit kernel VA. If the kernel is relocatable, it will
+ * also generate a dynamic relocation entry so that the kernel can shift
+ * the address at runtime for KASLR.
+ *
+ * Emit a 32-bit offset from the current address to the position given
+ * by `rela`. This way the kernel can iterate over all kernel VAs used
+ * by hyp at runtime and convert them to hyp VAs. However, that offset
+ * will not be known until linking of `vmlinux`, so emit a PREL32
+ * relocation referencing a symbol that the hyp linker script put at
+ * the beginning of the relocated section + the offset from `rela`.
+ */
+static void emit_rela_abs64(Elf64_Rela *rela, const char *sh_orig_name)
+{
+ /* Offset of this reloc from the beginning of HYP_RELOC_SECTION. */
+ static size_t reloc_offset;
+
+ /* Create storage for the 32-bit offset. */
+ printf(".word 0\n");
+
+ /*
+ * Create a PREL32 relocation which instructs the linker of `vmlinux`
+ * to insert offset to position <base> + <offset>, where <base> is
+ * a symbol at the beginning of the relocated section, and <offset>
+ * is `rela->r_offset`.
+ */
+ printf(".reloc %lu, R_AARCH64_PREL32, %s%s + 0x%lx\n",
+ reloc_offset, HYP_SECTION_SYMBOL_PREFIX, sh_orig_name,
+ elf64toh(rela->r_offset));
+
+ reloc_offset += 4;
+}
+
+/* Print the epilogue of the output ASM file. */
+static void emit_epilogue(void)
+{
+ printf(".popsection\n");
+}
+
+/*
+ * Iterate over all RELA relocations in a given section and emit
+ * hyp relocation data for all absolute addresses in hyp code/data.
+ *
+ * Static relocations that generate PC-relative-addressing are ignored.
+ * Failure is reported for unexpected relocation types.
+ */
+static void emit_rela_section(Elf64_Shdr *sh_rela)
+{
+ Elf64_Shdr *sh_orig = &elf.sh_table[elf32toh(sh_rela->sh_info)];
+ const char *sh_orig_name = section_name(sh_orig);
+ Elf64_Rela *rela;
+
+ /* Skip all non-hyp sections. */
+ if (!starts_with(sh_orig_name, HYP_SECTION_PREFIX))
+ return;
+
+ emit_section_prologue(sh_orig_name);
+
+ for_each_rela(sh_rela, rela) {
+ uint32_t type = (uint32_t)elf64toh(rela->r_info);
+
+ /* Check that rela points inside the relocated section. */
+ assert_lt(elf64toh(rela->r_offset), elf64toh(sh_orig->sh_size), "0x%lx");
+
+ switch (type) {
+ /*
+ * Data relocations to generate absolute addressing.
+ * Emit a hyp relocation.
+ */
+ case R_AARCH64_ABS64:
+ emit_rela_abs64(rela, sh_orig_name);
+ break;
+ /* Allow position-relative data relocations. */
+ case R_AARCH64_PREL64:
+ case R_AARCH64_PREL32:
+ case R_AARCH64_PREL16:
+ case R_AARCH64_PLT32:
+ break;
+ /* Allow relocations to generate PC-relative addressing. */
+ case R_AARCH64_LD_PREL_LO19:
+ case R_AARCH64_ADR_PREL_LO21:
+ case R_AARCH64_ADR_PREL_PG_HI21:
+ case R_AARCH64_ADR_PREL_PG_HI21_NC:
+ case R_AARCH64_ADD_ABS_LO12_NC:
+ case R_AARCH64_LDST8_ABS_LO12_NC:
+ case R_AARCH64_LDST16_ABS_LO12_NC:
+ case R_AARCH64_LDST32_ABS_LO12_NC:
+ case R_AARCH64_LDST64_ABS_LO12_NC:
+ case R_AARCH64_LDST128_ABS_LO12_NC:
+ break;
+ /* Allow relative relocations for control-flow instructions. */
+ case R_AARCH64_TSTBR14:
+ case R_AARCH64_CONDBR19:
+ case R_AARCH64_JUMP26:
+ case R_AARCH64_CALL26:
+ break;
+ /* Allow group relocations to create PC-relative offset inline. */
+ case R_AARCH64_MOVW_PREL_G0:
+ case R_AARCH64_MOVW_PREL_G0_NC:
+ case R_AARCH64_MOVW_PREL_G1:
+ case R_AARCH64_MOVW_PREL_G1_NC:
+ case R_AARCH64_MOVW_PREL_G2:
+ case R_AARCH64_MOVW_PREL_G2_NC:
+ case R_AARCH64_MOVW_PREL_G3:
+ break;
+ default:
+ fatal_error("Unexpected RELA type %u", type);
+ }
+ }
+}
+
+/* Iterate over all sections and emit hyp relocation data for RELA sections. */
+static void emit_all_relocs(void)
+{
+ Elf64_Shdr *shdr;
+
+ for_each_section(shdr) {
+ switch (elf32toh(shdr->sh_type)) {
+ case SHT_REL:
+ fatal_error("Unexpected SHT_REL section \"%s\"",
+ section_name(shdr));
+ case SHT_RELA:
+ emit_rela_section(shdr);
+ break;
+ }
+ }
+}
+
+int main(int argc, const char **argv)
+{
+ if (argc != 2) {
+ fprintf(stderr, "Usage: %s <elf_input>\n", argv[0]);
+ return EXIT_FAILURE;
+ }
+
+ init_elf(argv[1]);
+
+ emit_prologue();
+ emit_all_relocs();
+ emit_epilogue();
+
+ return EXIT_SUCCESS;
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S
new file mode 100644
index 000000000000..135cfb294ee5
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/host.S
@@ -0,0 +1,309 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2020 - Google Inc
+ * Author: Andrew Scull <ascull@google.com>
+ */
+
+#include <linux/linkage.h>
+
+#include <asm/assembler.h>
+#include <asm/kvm_arm.h>
+#include <asm/kvm_asm.h>
+#include <asm/kvm_mmu.h>
+#include <asm/kvm_ptrauth.h>
+
+ .text
+
+SYM_FUNC_START(__host_exit)
+ get_host_ctxt x0, x1
+
+ /* Store the host regs x2 and x3 */
+ stp x2, x3, [x0, #CPU_XREG_OFFSET(2)]
+
+ /* Retrieve the host regs x0-x1 from the stack */
+ ldp x2, x3, [sp], #16 // x0, x1
+
+ /* Store the host regs x0-x1 and x4-x17 */
+ stp x2, x3, [x0, #CPU_XREG_OFFSET(0)]
+ stp x4, x5, [x0, #CPU_XREG_OFFSET(4)]
+ stp x6, x7, [x0, #CPU_XREG_OFFSET(6)]
+ stp x8, x9, [x0, #CPU_XREG_OFFSET(8)]
+ stp x10, x11, [x0, #CPU_XREG_OFFSET(10)]
+ stp x12, x13, [x0, #CPU_XREG_OFFSET(12)]
+ stp x14, x15, [x0, #CPU_XREG_OFFSET(14)]
+ stp x16, x17, [x0, #CPU_XREG_OFFSET(16)]
+
+ /* Store the host regs x18-x29, lr */
+ save_callee_saved_regs x0
+
+ /* Save the host context pointer in x29 across the function call */
+ mov x29, x0
+
+#ifdef CONFIG_ARM64_PTR_AUTH_KERNEL
+alternative_if_not ARM64_HAS_ADDRESS_AUTH
+b __skip_pauth_save
+alternative_else_nop_endif
+
+alternative_if ARM64_KVM_PROTECTED_MODE
+ /* Save kernel ptrauth keys. */
+ add x18, x29, #CPU_APIAKEYLO_EL1
+ ptrauth_save_state x18, x19, x20
+
+ /* Use hyp keys. */
+ adr_this_cpu x18, kvm_hyp_ctxt, x19
+ add x18, x18, #CPU_APIAKEYLO_EL1
+ ptrauth_restore_state x18, x19, x20
+ isb
+alternative_else_nop_endif
+__skip_pauth_save:
+#endif /* CONFIG_ARM64_PTR_AUTH_KERNEL */
+
+ bl handle_trap
+
+__host_enter_restore_full:
+ /* Restore kernel keys. */
+#ifdef CONFIG_ARM64_PTR_AUTH_KERNEL
+alternative_if_not ARM64_HAS_ADDRESS_AUTH
+b __skip_pauth_restore
+alternative_else_nop_endif
+
+alternative_if ARM64_KVM_PROTECTED_MODE
+ add x18, x29, #CPU_APIAKEYLO_EL1
+ ptrauth_restore_state x18, x19, x20
+alternative_else_nop_endif
+__skip_pauth_restore:
+#endif /* CONFIG_ARM64_PTR_AUTH_KERNEL */
+
+ /* Restore host regs x0-x17 */
+ ldp x0, x1, [x29, #CPU_XREG_OFFSET(0)]
+ ldp x2, x3, [x29, #CPU_XREG_OFFSET(2)]
+ ldp x4, x5, [x29, #CPU_XREG_OFFSET(4)]
+ ldp x6, x7, [x29, #CPU_XREG_OFFSET(6)]
+
+ /* x0-7 are use for panic arguments */
+__host_enter_for_panic:
+ ldp x8, x9, [x29, #CPU_XREG_OFFSET(8)]
+ ldp x10, x11, [x29, #CPU_XREG_OFFSET(10)]
+ ldp x12, x13, [x29, #CPU_XREG_OFFSET(12)]
+ ldp x14, x15, [x29, #CPU_XREG_OFFSET(14)]
+ ldp x16, x17, [x29, #CPU_XREG_OFFSET(16)]
+
+ /* Restore host regs x18-x29, lr */
+ restore_callee_saved_regs x29
+
+ /* Do not touch any register after this! */
+__host_enter_without_restoring:
+ eret
+ sb
+SYM_FUNC_END(__host_exit)
+
+/*
+ * void __noreturn __host_enter(struct kvm_cpu_context *host_ctxt);
+ */
+SYM_FUNC_START(__host_enter)
+ mov x29, x0
+ b __host_enter_restore_full
+SYM_FUNC_END(__host_enter)
+
+/*
+ * void __noreturn __hyp_do_panic(struct kvm_cpu_context *host_ctxt, u64 spsr,
+ * u64 elr, u64 par);
+ */
+SYM_FUNC_START(__hyp_do_panic)
+ /* Prepare and exit to the host's panic function. */
+ mov lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\
+ PSR_MODE_EL1h)
+ msr spsr_el2, lr
+ adr_l lr, nvhe_hyp_panic_handler
+ hyp_kimg_va lr, x6
+ msr elr_el2, lr
+
+ mov x29, x0
+
+#ifdef CONFIG_NVHE_EL2_DEBUG
+ /* Ensure host stage-2 is disabled */
+ mrs x0, hcr_el2
+ bic x0, x0, #HCR_VM
+ msr hcr_el2, x0
+ isb
+ tlbi vmalls12e1
+ dsb nsh
+#endif
+
+ /* Load the panic arguments into x0-7 */
+ mrs x0, esr_el2
+ mov x4, x3
+ mov x3, x2
+ hyp_pa x3, x6
+ get_vcpu_ptr x5, x6
+ mrs x6, far_el2
+ mrs x7, hpfar_el2
+
+ /* Enter the host, conditionally restoring the host context. */
+ cbz x29, __host_enter_without_restoring
+ b __host_enter_for_panic
+SYM_FUNC_END(__hyp_do_panic)
+
+SYM_FUNC_START(__host_hvc)
+ ldp x0, x1, [sp] // Don't fixup the stack yet
+
+ /* No stub for you, sonny Jim */
+alternative_if ARM64_KVM_PROTECTED_MODE
+ b __host_exit
+alternative_else_nop_endif
+
+ /* Check for a stub HVC call */
+ cmp x0, #HVC_STUB_HCALL_NR
+ b.hs __host_exit
+
+ add sp, sp, #16
+ /*
+ * Compute the idmap address of __kvm_handle_stub_hvc and
+ * jump there.
+ *
+ * Preserve x0-x4, which may contain stub parameters.
+ */
+ adr_l x5, __kvm_handle_stub_hvc
+ hyp_pa x5, x6
+ br x5
+SYM_FUNC_END(__host_hvc)
+
+.macro host_el1_sync_vect
+ .align 7
+.L__vect_start\@:
+ stp x0, x1, [sp, #-16]!
+ mrs x0, esr_el2
+ ubfx x0, x0, #ESR_ELx_EC_SHIFT, #ESR_ELx_EC_WIDTH
+ cmp x0, #ESR_ELx_EC_HVC64
+ b.eq __host_hvc
+ b __host_exit
+.L__vect_end\@:
+.if ((.L__vect_end\@ - .L__vect_start\@) > 0x80)
+ .error "host_el1_sync_vect larger than vector entry"
+.endif
+.endm
+
+.macro invalid_host_el2_vect
+ .align 7
+
+ /*
+ * Test whether the SP has overflowed, without corrupting a GPR.
+ * nVHE hypervisor stacks are aligned so that the PAGE_SHIFT bit
+ * of SP should always be 1.
+ */
+ add sp, sp, x0 // sp' = sp + x0
+ sub x0, sp, x0 // x0' = sp' - x0 = (sp + x0) - x0 = sp
+ tbz x0, #PAGE_SHIFT, .L__hyp_sp_overflow\@
+ sub x0, sp, x0 // x0'' = sp' - x0' = (sp + x0) - sp = x0
+ sub sp, sp, x0 // sp'' = sp' - x0 = (sp + x0) - x0 = sp
+
+ /* If a guest is loaded, panic out of it. */
+ stp x0, x1, [sp, #-16]!
+ get_loaded_vcpu x0, x1
+ cbnz x0, __guest_exit_panic
+ add sp, sp, #16
+
+ /*
+ * The panic may not be clean if the exception is taken before the host
+ * context has been saved by __host_exit or after the hyp context has
+ * been partially clobbered by __host_enter.
+ */
+ b hyp_panic
+
+.L__hyp_sp_overflow\@:
+ /* Switch to the overflow stack */
+ adr_this_cpu sp, overflow_stack + OVERFLOW_STACK_SIZE, x0
+
+ b hyp_panic_bad_stack
+ ASM_BUG()
+.endm
+
+.macro invalid_host_el1_vect
+ .align 7
+ mov x0, xzr /* restore_host = false */
+ mrs x1, spsr_el2
+ mrs x2, elr_el2
+ mrs x3, par_el1
+ b __hyp_do_panic
+.endm
+
+/*
+ * The host vector does not use an ESB instruction in order to avoid consuming
+ * SErrors that should only be consumed by the host. Guest entry is deferred by
+ * __guest_enter if there are any pending asynchronous exceptions so hyp will
+ * always return to the host without having consumerd host SErrors.
+ *
+ * CONFIG_KVM_INDIRECT_VECTORS is not applied to the host vectors because the
+ * host knows about the EL2 vectors already, and there is no point in hiding
+ * them.
+ */
+ .align 11
+SYM_CODE_START(__kvm_hyp_host_vector)
+ invalid_host_el2_vect // Synchronous EL2t
+ invalid_host_el2_vect // IRQ EL2t
+ invalid_host_el2_vect // FIQ EL2t
+ invalid_host_el2_vect // Error EL2t
+
+ invalid_host_el2_vect // Synchronous EL2h
+ invalid_host_el2_vect // IRQ EL2h
+ invalid_host_el2_vect // FIQ EL2h
+ invalid_host_el2_vect // Error EL2h
+
+ host_el1_sync_vect // Synchronous 64-bit EL1/EL0
+ invalid_host_el1_vect // IRQ 64-bit EL1/EL0
+ invalid_host_el1_vect // FIQ 64-bit EL1/EL0
+ invalid_host_el1_vect // Error 64-bit EL1/EL0
+
+ host_el1_sync_vect // Synchronous 32-bit EL1/EL0
+ invalid_host_el1_vect // IRQ 32-bit EL1/EL0
+ invalid_host_el1_vect // FIQ 32-bit EL1/EL0
+ invalid_host_el1_vect // Error 32-bit EL1/EL0
+SYM_CODE_END(__kvm_hyp_host_vector)
+
+/*
+ * Forward SMC with arguments in struct kvm_cpu_context, and
+ * store the result into the same struct. Assumes SMCCC 1.2 or older.
+ *
+ * x0: struct kvm_cpu_context*
+ */
+SYM_CODE_START(__kvm_hyp_host_forward_smc)
+ /*
+ * Use x18 to keep the pointer to the host context because
+ * x18 is callee-saved in SMCCC but not in AAPCS64.
+ */
+ mov x18, x0
+
+ ldp x0, x1, [x18, #CPU_XREG_OFFSET(0)]
+ ldp x2, x3, [x18, #CPU_XREG_OFFSET(2)]
+ ldp x4, x5, [x18, #CPU_XREG_OFFSET(4)]
+ ldp x6, x7, [x18, #CPU_XREG_OFFSET(6)]
+ ldp x8, x9, [x18, #CPU_XREG_OFFSET(8)]
+ ldp x10, x11, [x18, #CPU_XREG_OFFSET(10)]
+ ldp x12, x13, [x18, #CPU_XREG_OFFSET(12)]
+ ldp x14, x15, [x18, #CPU_XREG_OFFSET(14)]
+ ldp x16, x17, [x18, #CPU_XREG_OFFSET(16)]
+
+ smc #0
+
+ stp x0, x1, [x18, #CPU_XREG_OFFSET(0)]
+ stp x2, x3, [x18, #CPU_XREG_OFFSET(2)]
+ stp x4, x5, [x18, #CPU_XREG_OFFSET(4)]
+ stp x6, x7, [x18, #CPU_XREG_OFFSET(6)]
+ stp x8, x9, [x18, #CPU_XREG_OFFSET(8)]
+ stp x10, x11, [x18, #CPU_XREG_OFFSET(10)]
+ stp x12, x13, [x18, #CPU_XREG_OFFSET(12)]
+ stp x14, x15, [x18, #CPU_XREG_OFFSET(14)]
+ stp x16, x17, [x18, #CPU_XREG_OFFSET(16)]
+
+ ret
+SYM_CODE_END(__kvm_hyp_host_forward_smc)
+
+/*
+ * kvm_host_psci_cpu_entry is called through br instruction, which requires
+ * bti j instruction as compilers (gcc and llvm) doesn't insert bti j for external
+ * functions, but bti c instead.
+ */
+SYM_CODE_START(kvm_host_psci_cpu_entry)
+ bti j
+ b __kvm_host_psci_cpu_entry
+SYM_CODE_END(kvm_host_psci_cpu_entry)
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
new file mode 100644
index 000000000000..2994878d68ea
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
@@ -0,0 +1,297 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2012,2013 - ARM Ltd
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ */
+
+#include <linux/arm-smccc.h>
+#include <linux/linkage.h>
+
+#include <asm/alternative.h>
+#include <asm/assembler.h>
+#include <asm/el2_setup.h>
+#include <asm/kvm_arm.h>
+#include <asm/kvm_asm.h>
+#include <asm/kvm_mmu.h>
+#include <asm/pgtable-hwdef.h>
+#include <asm/sysreg.h>
+#include <asm/virt.h>
+
+ .text
+ .pushsection .idmap.text, "ax"
+
+ .align 11
+
+SYM_CODE_START(__kvm_hyp_init)
+ ventry __invalid // Synchronous EL2t
+ ventry __invalid // IRQ EL2t
+ ventry __invalid // FIQ EL2t
+ ventry __invalid // Error EL2t
+
+ ventry __invalid // Synchronous EL2h
+ ventry __invalid // IRQ EL2h
+ ventry __invalid // FIQ EL2h
+ ventry __invalid // Error EL2h
+
+ ventry __do_hyp_init // Synchronous 64-bit EL1
+ ventry __invalid // IRQ 64-bit EL1
+ ventry __invalid // FIQ 64-bit EL1
+ ventry __invalid // Error 64-bit EL1
+
+ ventry __invalid // Synchronous 32-bit EL1
+ ventry __invalid // IRQ 32-bit EL1
+ ventry __invalid // FIQ 32-bit EL1
+ ventry __invalid // Error 32-bit EL1
+
+__invalid:
+ b .
+
+ /*
+ * Only uses x0..x3 so as to not clobber callee-saved SMCCC registers.
+ *
+ * x0: SMCCC function ID
+ * x1: struct kvm_nvhe_init_params PA
+ */
+__do_hyp_init:
+ /* Check for a stub HVC call */
+ cmp x0, #HVC_STUB_HCALL_NR
+ b.lo __kvm_handle_stub_hvc
+
+ bic x0, x0, #ARM_SMCCC_CALL_HINTS
+ mov x3, #KVM_HOST_SMCCC_FUNC(__kvm_hyp_init)
+ cmp x0, x3
+ b.eq 1f
+
+ mov x0, #SMCCC_RET_NOT_SUPPORTED
+ eret
+
+1: mov x0, x1
+ mov x3, lr
+ bl ___kvm_hyp_init // Clobbers x0..x2
+ mov lr, x3
+
+ /* Hello, World! */
+ mov x0, #SMCCC_RET_SUCCESS
+ eret
+SYM_CODE_END(__kvm_hyp_init)
+
+/*
+ * Initialize the hypervisor in EL2.
+ *
+ * Only uses x0..x2 so as to not clobber callee-saved SMCCC registers
+ * and leave x3 for the caller.
+ *
+ * x0: struct kvm_nvhe_init_params PA
+ */
+SYM_CODE_START_LOCAL(___kvm_hyp_init)
+ ldr x1, [x0, #NVHE_INIT_STACK_HYP_VA]
+ mov sp, x1
+
+ ldr x1, [x0, #NVHE_INIT_MAIR_EL2]
+ msr mair_el2, x1
+
+ ldr x1, [x0, #NVHE_INIT_HCR_EL2]
+ msr hcr_el2, x1
+
+ mov x2, #HCR_E2H
+ and x2, x1, x2
+ cbz x2, 1f
+
+ // hVHE: Replay the EL2 setup to account for the E2H bit
+ // TPIDR_EL2 is used to preserve x0 across the macro maze...
+ isb
+ msr tpidr_el2, x0
+ init_el2_state
+ finalise_el2_state
+ mrs x0, tpidr_el2
+
+1:
+ ldr x1, [x0, #NVHE_INIT_TPIDR_EL2]
+ msr tpidr_el2, x1
+
+ ldr x1, [x0, #NVHE_INIT_VTTBR]
+ msr vttbr_el2, x1
+
+ ldr x1, [x0, #NVHE_INIT_VTCR]
+ msr vtcr_el2, x1
+
+ ldr x1, [x0, #NVHE_INIT_PGD_PA]
+ phys_to_ttbr x2, x1
+alternative_if ARM64_HAS_CNP
+ orr x2, x2, #TTBR_CNP_BIT
+alternative_else_nop_endif
+ msr ttbr0_el2, x2
+
+ ldr x0, [x0, #NVHE_INIT_TCR_EL2]
+ msr tcr_el2, x0
+
+ isb
+
+ /* Invalidate the stale TLBs from Bootloader */
+ tlbi alle2
+ tlbi vmalls12e1
+ dsb sy
+
+ mov_q x0, INIT_SCTLR_EL2_MMU_ON
+alternative_if ARM64_HAS_ADDRESS_AUTH
+ mov_q x1, (SCTLR_ELx_ENIA | SCTLR_ELx_ENIB | \
+ SCTLR_ELx_ENDA | SCTLR_ELx_ENDB)
+ orr x0, x0, x1
+alternative_else_nop_endif
+
+#ifdef CONFIG_ARM64_BTI_KERNEL
+alternative_if ARM64_BTI
+ orr x0, x0, #SCTLR_EL2_BT
+alternative_else_nop_endif
+#endif /* CONFIG_ARM64_BTI_KERNEL */
+
+ msr sctlr_el2, x0
+ isb
+
+ /* Set the host vector */
+ ldr x0, =__kvm_hyp_host_vector
+ msr vbar_el2, x0
+
+ ret
+SYM_CODE_END(___kvm_hyp_init)
+
+/*
+ * PSCI CPU_ON entry point
+ *
+ * x0: struct kvm_nvhe_init_params PA
+ */
+SYM_CODE_START(kvm_hyp_cpu_entry)
+ mov x1, #1 // is_cpu_on = true
+ b __kvm_hyp_init_cpu
+SYM_CODE_END(kvm_hyp_cpu_entry)
+
+/*
+ * PSCI CPU_SUSPEND / SYSTEM_SUSPEND entry point
+ *
+ * x0: struct kvm_nvhe_init_params PA
+ */
+SYM_CODE_START(kvm_hyp_cpu_resume)
+ mov x1, #0 // is_cpu_on = false
+ b __kvm_hyp_init_cpu
+SYM_CODE_END(kvm_hyp_cpu_resume)
+
+/*
+ * Common code for CPU entry points. Initializes EL2 state and
+ * installs the hypervisor before handing over to a C handler.
+ *
+ * x0: struct kvm_nvhe_init_params PA
+ * x1: bool is_cpu_on
+ */
+SYM_CODE_START_LOCAL(__kvm_hyp_init_cpu)
+ mov x28, x0 // Stash arguments
+ mov x29, x1
+
+ /* Check that the core was booted in EL2. */
+ mrs x0, CurrentEL
+ cmp x0, #CurrentEL_EL2
+ b.eq 2f
+
+ /* The core booted in EL1. KVM cannot be initialized on it. */
+1: wfe
+ wfi
+ b 1b
+
+2: msr SPsel, #1 // We want to use SP_EL{1,2}
+
+ /* Initialize EL2 CPU state to sane values. */
+ init_el2_state // Clobbers x0..x2
+ finalise_el2_state
+ __init_el2_nvhe_prepare_eret
+
+ /* Enable MMU, set vectors and stack. */
+ mov x0, x28
+ bl ___kvm_hyp_init // Clobbers x0..x2
+
+ /* Leave idmap. */
+ mov x0, x29
+ ldr x1, =kvm_host_psci_cpu_entry
+ br x1
+SYM_CODE_END(__kvm_hyp_init_cpu)
+
+SYM_CODE_START(__kvm_handle_stub_hvc)
+ /*
+ * __kvm_handle_stub_hvc called from __host_hvc through branch instruction(br) so
+ * we need bti j at beginning.
+ */
+ bti j
+ cmp x0, #HVC_SOFT_RESTART
+ b.ne 1f
+
+ /* This is where we're about to jump, staying at EL2 */
+ msr elr_el2, x1
+ mov x0, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT | PSR_MODE_EL2h)
+ msr spsr_el2, x0
+
+ /* Shuffle the arguments, and don't come back */
+ mov x0, x2
+ mov x1, x3
+ mov x2, x4
+ b reset
+
+1: cmp x0, #HVC_RESET_VECTORS
+ b.ne 1f
+
+ /*
+ * Set the HVC_RESET_VECTORS return code before entering the common
+ * path so that we do not clobber x0-x2 in case we are coming via
+ * HVC_SOFT_RESTART.
+ */
+ mov x0, xzr
+reset:
+ /* Reset kvm back to the hyp stub. */
+ mov_q x5, INIT_SCTLR_EL2_MMU_OFF
+ pre_disable_mmu_workaround
+ msr sctlr_el2, x5
+ isb
+
+alternative_if ARM64_KVM_PROTECTED_MODE
+ mov_q x5, HCR_HOST_NVHE_FLAGS
+ msr hcr_el2, x5
+alternative_else_nop_endif
+
+ /* Install stub vectors */
+ adr_l x5, __hyp_stub_vectors
+ msr vbar_el2, x5
+ eret
+
+1: /* Bad stub call */
+ mov_q x0, HVC_STUB_ERR
+ eret
+
+SYM_CODE_END(__kvm_handle_stub_hvc)
+
+SYM_FUNC_START(__pkvm_init_switch_pgd)
+ /* Turn the MMU off */
+ pre_disable_mmu_workaround
+ mrs x2, sctlr_el2
+ bic x3, x2, #SCTLR_ELx_M
+ msr sctlr_el2, x3
+ isb
+
+ tlbi alle2
+
+ /* Install the new pgtables */
+ ldr x3, [x0, #NVHE_INIT_PGD_PA]
+ phys_to_ttbr x4, x3
+alternative_if ARM64_HAS_CNP
+ orr x4, x4, #TTBR_CNP_BIT
+alternative_else_nop_endif
+ msr ttbr0_el2, x4
+
+ /* Set the new stack pointer */
+ ldr x0, [x0, #NVHE_INIT_STACK_HYP_VA]
+ mov sp, x0
+
+ /* And turn the MMU back on! */
+ dsb nsh
+ isb
+ set_sctlr_el2 x2
+ ret x1
+SYM_FUNC_END(__pkvm_init_switch_pgd)
+
+ .popsection
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
new file mode 100644
index 000000000000..2385fd03ed87
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -0,0 +1,438 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 - Google Inc
+ * Author: Andrew Scull <ascull@google.com>
+ */
+
+#include <hyp/adjust_pc.h>
+
+#include <asm/pgtable-types.h>
+#include <asm/kvm_asm.h>
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_host.h>
+#include <asm/kvm_hyp.h>
+#include <asm/kvm_mmu.h>
+
+#include <nvhe/ffa.h>
+#include <nvhe/mem_protect.h>
+#include <nvhe/mm.h>
+#include <nvhe/pkvm.h>
+#include <nvhe/trap_handler.h>
+
+DEFINE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
+
+void __kvm_hyp_host_forward_smc(struct kvm_cpu_context *host_ctxt);
+
+static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu)
+{
+ struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu;
+
+ hyp_vcpu->vcpu.arch.ctxt = host_vcpu->arch.ctxt;
+
+ hyp_vcpu->vcpu.arch.sve_state = kern_hyp_va(host_vcpu->arch.sve_state);
+ hyp_vcpu->vcpu.arch.sve_max_vl = host_vcpu->arch.sve_max_vl;
+
+ hyp_vcpu->vcpu.arch.hw_mmu = host_vcpu->arch.hw_mmu;
+
+ hyp_vcpu->vcpu.arch.hcr_el2 = host_vcpu->arch.hcr_el2;
+ hyp_vcpu->vcpu.arch.mdcr_el2 = host_vcpu->arch.mdcr_el2;
+ hyp_vcpu->vcpu.arch.cptr_el2 = host_vcpu->arch.cptr_el2;
+
+ hyp_vcpu->vcpu.arch.iflags = host_vcpu->arch.iflags;
+ hyp_vcpu->vcpu.arch.fp_state = host_vcpu->arch.fp_state;
+
+ hyp_vcpu->vcpu.arch.debug_ptr = kern_hyp_va(host_vcpu->arch.debug_ptr);
+ hyp_vcpu->vcpu.arch.host_fpsimd_state = host_vcpu->arch.host_fpsimd_state;
+
+ hyp_vcpu->vcpu.arch.vsesr_el2 = host_vcpu->arch.vsesr_el2;
+
+ hyp_vcpu->vcpu.arch.vgic_cpu.vgic_v3 = host_vcpu->arch.vgic_cpu.vgic_v3;
+}
+
+static void sync_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu)
+{
+ struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu;
+ struct vgic_v3_cpu_if *hyp_cpu_if = &hyp_vcpu->vcpu.arch.vgic_cpu.vgic_v3;
+ struct vgic_v3_cpu_if *host_cpu_if = &host_vcpu->arch.vgic_cpu.vgic_v3;
+ unsigned int i;
+
+ host_vcpu->arch.ctxt = hyp_vcpu->vcpu.arch.ctxt;
+
+ host_vcpu->arch.hcr_el2 = hyp_vcpu->vcpu.arch.hcr_el2;
+ host_vcpu->arch.cptr_el2 = hyp_vcpu->vcpu.arch.cptr_el2;
+
+ host_vcpu->arch.fault = hyp_vcpu->vcpu.arch.fault;
+
+ host_vcpu->arch.iflags = hyp_vcpu->vcpu.arch.iflags;
+ host_vcpu->arch.fp_state = hyp_vcpu->vcpu.arch.fp_state;
+
+ host_cpu_if->vgic_hcr = hyp_cpu_if->vgic_hcr;
+ for (i = 0; i < hyp_cpu_if->used_lrs; ++i)
+ host_cpu_if->vgic_lr[i] = hyp_cpu_if->vgic_lr[i];
+}
+
+static void handle___kvm_vcpu_run(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(struct kvm_vcpu *, host_vcpu, host_ctxt, 1);
+ int ret;
+
+ host_vcpu = kern_hyp_va(host_vcpu);
+
+ if (unlikely(is_protected_kvm_enabled())) {
+ struct pkvm_hyp_vcpu *hyp_vcpu;
+ struct kvm *host_kvm;
+
+ host_kvm = kern_hyp_va(host_vcpu->kvm);
+ hyp_vcpu = pkvm_load_hyp_vcpu(host_kvm->arch.pkvm.handle,
+ host_vcpu->vcpu_idx);
+ if (!hyp_vcpu) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ flush_hyp_vcpu(hyp_vcpu);
+
+ ret = __kvm_vcpu_run(&hyp_vcpu->vcpu);
+
+ sync_hyp_vcpu(hyp_vcpu);
+ pkvm_put_hyp_vcpu(hyp_vcpu);
+ } else {
+ /* The host is fully trusted, run its vCPU directly. */
+ ret = __kvm_vcpu_run(host_vcpu);
+ }
+
+out:
+ cpu_reg(host_ctxt, 1) = ret;
+}
+
+static void handle___kvm_adjust_pc(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(struct kvm_vcpu *, vcpu, host_ctxt, 1);
+
+ __kvm_adjust_pc(kern_hyp_va(vcpu));
+}
+
+static void handle___kvm_flush_vm_context(struct kvm_cpu_context *host_ctxt)
+{
+ __kvm_flush_vm_context();
+}
+
+static void handle___kvm_tlb_flush_vmid_ipa(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1);
+ DECLARE_REG(phys_addr_t, ipa, host_ctxt, 2);
+ DECLARE_REG(int, level, host_ctxt, 3);
+
+ __kvm_tlb_flush_vmid_ipa(kern_hyp_va(mmu), ipa, level);
+}
+
+static void handle___kvm_tlb_flush_vmid_ipa_nsh(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1);
+ DECLARE_REG(phys_addr_t, ipa, host_ctxt, 2);
+ DECLARE_REG(int, level, host_ctxt, 3);
+
+ __kvm_tlb_flush_vmid_ipa_nsh(kern_hyp_va(mmu), ipa, level);
+}
+
+static void
+handle___kvm_tlb_flush_vmid_range(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1);
+ DECLARE_REG(phys_addr_t, start, host_ctxt, 2);
+ DECLARE_REG(unsigned long, pages, host_ctxt, 3);
+
+ __kvm_tlb_flush_vmid_range(kern_hyp_va(mmu), start, pages);
+}
+
+static void handle___kvm_tlb_flush_vmid(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1);
+
+ __kvm_tlb_flush_vmid(kern_hyp_va(mmu));
+}
+
+static void handle___kvm_flush_cpu_context(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1);
+
+ __kvm_flush_cpu_context(kern_hyp_va(mmu));
+}
+
+static void handle___kvm_timer_set_cntvoff(struct kvm_cpu_context *host_ctxt)
+{
+ __kvm_timer_set_cntvoff(cpu_reg(host_ctxt, 1));
+}
+
+static void handle___kvm_enable_ssbs(struct kvm_cpu_context *host_ctxt)
+{
+ u64 tmp;
+
+ tmp = read_sysreg_el2(SYS_SCTLR);
+ tmp |= SCTLR_ELx_DSSBS;
+ write_sysreg_el2(tmp, SYS_SCTLR);
+}
+
+static void handle___vgic_v3_get_gic_config(struct kvm_cpu_context *host_ctxt)
+{
+ cpu_reg(host_ctxt, 1) = __vgic_v3_get_gic_config();
+}
+
+static void handle___vgic_v3_read_vmcr(struct kvm_cpu_context *host_ctxt)
+{
+ cpu_reg(host_ctxt, 1) = __vgic_v3_read_vmcr();
+}
+
+static void handle___vgic_v3_write_vmcr(struct kvm_cpu_context *host_ctxt)
+{
+ __vgic_v3_write_vmcr(cpu_reg(host_ctxt, 1));
+}
+
+static void handle___vgic_v3_init_lrs(struct kvm_cpu_context *host_ctxt)
+{
+ __vgic_v3_init_lrs();
+}
+
+static void handle___kvm_get_mdcr_el2(struct kvm_cpu_context *host_ctxt)
+{
+ cpu_reg(host_ctxt, 1) = __kvm_get_mdcr_el2();
+}
+
+static void handle___vgic_v3_save_aprs(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(struct vgic_v3_cpu_if *, cpu_if, host_ctxt, 1);
+
+ __vgic_v3_save_aprs(kern_hyp_va(cpu_if));
+}
+
+static void handle___vgic_v3_restore_aprs(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(struct vgic_v3_cpu_if *, cpu_if, host_ctxt, 1);
+
+ __vgic_v3_restore_aprs(kern_hyp_va(cpu_if));
+}
+
+static void handle___pkvm_init(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(phys_addr_t, phys, host_ctxt, 1);
+ DECLARE_REG(unsigned long, size, host_ctxt, 2);
+ DECLARE_REG(unsigned long, nr_cpus, host_ctxt, 3);
+ DECLARE_REG(unsigned long *, per_cpu_base, host_ctxt, 4);
+ DECLARE_REG(u32, hyp_va_bits, host_ctxt, 5);
+
+ /*
+ * __pkvm_init() will return only if an error occurred, otherwise it
+ * will tail-call in __pkvm_init_finalise() which will have to deal
+ * with the host context directly.
+ */
+ cpu_reg(host_ctxt, 1) = __pkvm_init(phys, size, nr_cpus, per_cpu_base,
+ hyp_va_bits);
+}
+
+static void handle___pkvm_cpu_set_vector(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(enum arm64_hyp_spectre_vector, slot, host_ctxt, 1);
+
+ cpu_reg(host_ctxt, 1) = pkvm_cpu_set_vector(slot);
+}
+
+static void handle___pkvm_host_share_hyp(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(u64, pfn, host_ctxt, 1);
+
+ cpu_reg(host_ctxt, 1) = __pkvm_host_share_hyp(pfn);
+}
+
+static void handle___pkvm_host_unshare_hyp(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(u64, pfn, host_ctxt, 1);
+
+ cpu_reg(host_ctxt, 1) = __pkvm_host_unshare_hyp(pfn);
+}
+
+static void handle___pkvm_create_private_mapping(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(phys_addr_t, phys, host_ctxt, 1);
+ DECLARE_REG(size_t, size, host_ctxt, 2);
+ DECLARE_REG(enum kvm_pgtable_prot, prot, host_ctxt, 3);
+
+ /*
+ * __pkvm_create_private_mapping() populates a pointer with the
+ * hypervisor start address of the allocation.
+ *
+ * However, handle___pkvm_create_private_mapping() hypercall crosses the
+ * EL1/EL2 boundary so the pointer would not be valid in this context.
+ *
+ * Instead pass the allocation address as the return value (or return
+ * ERR_PTR() on failure).
+ */
+ unsigned long haddr;
+ int err = __pkvm_create_private_mapping(phys, size, prot, &haddr);
+
+ if (err)
+ haddr = (unsigned long)ERR_PTR(err);
+
+ cpu_reg(host_ctxt, 1) = haddr;
+}
+
+static void handle___pkvm_prot_finalize(struct kvm_cpu_context *host_ctxt)
+{
+ cpu_reg(host_ctxt, 1) = __pkvm_prot_finalize();
+}
+
+static void handle___pkvm_vcpu_init_traps(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(struct kvm_vcpu *, vcpu, host_ctxt, 1);
+
+ __pkvm_vcpu_init_traps(kern_hyp_va(vcpu));
+}
+
+static void handle___pkvm_init_vm(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(struct kvm *, host_kvm, host_ctxt, 1);
+ DECLARE_REG(unsigned long, vm_hva, host_ctxt, 2);
+ DECLARE_REG(unsigned long, pgd_hva, host_ctxt, 3);
+
+ host_kvm = kern_hyp_va(host_kvm);
+ cpu_reg(host_ctxt, 1) = __pkvm_init_vm(host_kvm, vm_hva, pgd_hva);
+}
+
+static void handle___pkvm_init_vcpu(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1);
+ DECLARE_REG(struct kvm_vcpu *, host_vcpu, host_ctxt, 2);
+ DECLARE_REG(unsigned long, vcpu_hva, host_ctxt, 3);
+
+ host_vcpu = kern_hyp_va(host_vcpu);
+ cpu_reg(host_ctxt, 1) = __pkvm_init_vcpu(handle, host_vcpu, vcpu_hva);
+}
+
+static void handle___pkvm_teardown_vm(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1);
+
+ cpu_reg(host_ctxt, 1) = __pkvm_teardown_vm(handle);
+}
+
+typedef void (*hcall_t)(struct kvm_cpu_context *);
+
+#define HANDLE_FUNC(x) [__KVM_HOST_SMCCC_FUNC_##x] = (hcall_t)handle_##x
+
+static const hcall_t host_hcall[] = {
+ /* ___kvm_hyp_init */
+ HANDLE_FUNC(__kvm_get_mdcr_el2),
+ HANDLE_FUNC(__pkvm_init),
+ HANDLE_FUNC(__pkvm_create_private_mapping),
+ HANDLE_FUNC(__pkvm_cpu_set_vector),
+ HANDLE_FUNC(__kvm_enable_ssbs),
+ HANDLE_FUNC(__vgic_v3_init_lrs),
+ HANDLE_FUNC(__vgic_v3_get_gic_config),
+ HANDLE_FUNC(__pkvm_prot_finalize),
+
+ HANDLE_FUNC(__pkvm_host_share_hyp),
+ HANDLE_FUNC(__pkvm_host_unshare_hyp),
+ HANDLE_FUNC(__kvm_adjust_pc),
+ HANDLE_FUNC(__kvm_vcpu_run),
+ HANDLE_FUNC(__kvm_flush_vm_context),
+ HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa),
+ HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa_nsh),
+ HANDLE_FUNC(__kvm_tlb_flush_vmid),
+ HANDLE_FUNC(__kvm_tlb_flush_vmid_range),
+ HANDLE_FUNC(__kvm_flush_cpu_context),
+ HANDLE_FUNC(__kvm_timer_set_cntvoff),
+ HANDLE_FUNC(__vgic_v3_read_vmcr),
+ HANDLE_FUNC(__vgic_v3_write_vmcr),
+ HANDLE_FUNC(__vgic_v3_save_aprs),
+ HANDLE_FUNC(__vgic_v3_restore_aprs),
+ HANDLE_FUNC(__pkvm_vcpu_init_traps),
+ HANDLE_FUNC(__pkvm_init_vm),
+ HANDLE_FUNC(__pkvm_init_vcpu),
+ HANDLE_FUNC(__pkvm_teardown_vm),
+};
+
+static void handle_host_hcall(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(unsigned long, id, host_ctxt, 0);
+ unsigned long hcall_min = 0;
+ hcall_t hfn;
+
+ /*
+ * If pKVM has been initialised then reject any calls to the
+ * early "privileged" hypercalls. Note that we cannot reject
+ * calls to __pkvm_prot_finalize for two reasons: (1) The static
+ * key used to determine initialisation must be toggled prior to
+ * finalisation and (2) finalisation is performed on a per-CPU
+ * basis. This is all fine, however, since __pkvm_prot_finalize
+ * returns -EPERM after the first call for a given CPU.
+ */
+ if (static_branch_unlikely(&kvm_protected_mode_initialized))
+ hcall_min = __KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize;
+
+ id &= ~ARM_SMCCC_CALL_HINTS;
+ id -= KVM_HOST_SMCCC_ID(0);
+
+ if (unlikely(id < hcall_min || id >= ARRAY_SIZE(host_hcall)))
+ goto inval;
+
+ hfn = host_hcall[id];
+ if (unlikely(!hfn))
+ goto inval;
+
+ cpu_reg(host_ctxt, 0) = SMCCC_RET_SUCCESS;
+ hfn(host_ctxt);
+
+ return;
+inval:
+ cpu_reg(host_ctxt, 0) = SMCCC_RET_NOT_SUPPORTED;
+}
+
+static void default_host_smc_handler(struct kvm_cpu_context *host_ctxt)
+{
+ __kvm_hyp_host_forward_smc(host_ctxt);
+}
+
+static void handle_host_smc(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(u64, func_id, host_ctxt, 0);
+ bool handled;
+
+ func_id &= ~ARM_SMCCC_CALL_HINTS;
+
+ handled = kvm_host_psci_handler(host_ctxt, func_id);
+ if (!handled)
+ handled = kvm_host_ffa_handler(host_ctxt, func_id);
+ if (!handled)
+ default_host_smc_handler(host_ctxt);
+
+ /* SMC was trapped, move ELR past the current PC. */
+ kvm_skip_host_instr();
+}
+
+void handle_trap(struct kvm_cpu_context *host_ctxt)
+{
+ u64 esr = read_sysreg_el2(SYS_ESR);
+
+ switch (ESR_ELx_EC(esr)) {
+ case ESR_ELx_EC_HVC64:
+ handle_host_hcall(host_ctxt);
+ break;
+ case ESR_ELx_EC_SMC64:
+ handle_host_smc(host_ctxt);
+ break;
+ case ESR_ELx_EC_SVE:
+ if (has_hvhe())
+ sysreg_clear_set(cpacr_el1, 0, (CPACR_EL1_ZEN_EL1EN |
+ CPACR_EL1_ZEN_EL0EN));
+ else
+ sysreg_clear_set(cptr_el2, CPTR_EL2_TZ, 0);
+ isb();
+ sve_cond_update_zcr_vq(ZCR_ELx_LEN_MASK, SYS_ZCR_EL2);
+ break;
+ case ESR_ELx_EC_IABT_LOW:
+ case ESR_ELx_EC_DABT_LOW:
+ handle_host_mem_abort(host_ctxt);
+ break;
+ default:
+ BUG();
+ }
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-smp.c b/arch/arm64/kvm/hyp/nvhe/hyp-smp.c
new file mode 100644
index 000000000000..04d194583f1e
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-smp.c
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 - Google LLC
+ * Author: David Brazdil <dbrazdil@google.com>
+ */
+
+#include <asm/kvm_asm.h>
+#include <asm/kvm_hyp.h>
+#include <asm/kvm_mmu.h>
+
+/*
+ * nVHE copy of data structures tracking available CPU cores.
+ * Only entries for CPUs that were online at KVM init are populated.
+ * Other CPUs should not be allowed to boot because their features were
+ * not checked against the finalized system capabilities.
+ */
+u64 __ro_after_init hyp_cpu_logical_map[NR_CPUS] = { [0 ... NR_CPUS-1] = INVALID_HWID };
+
+u64 cpu_logical_map(unsigned int cpu)
+{
+ BUG_ON(cpu >= ARRAY_SIZE(hyp_cpu_logical_map));
+
+ return hyp_cpu_logical_map[cpu];
+}
+
+unsigned long __ro_after_init kvm_arm_hyp_percpu_base[NR_CPUS];
+
+unsigned long __hyp_per_cpu_offset(unsigned int cpu)
+{
+ unsigned long *cpu_base_array;
+ unsigned long this_cpu_base;
+ unsigned long elf_base;
+
+ BUG_ON(cpu >= ARRAY_SIZE(kvm_arm_hyp_percpu_base));
+
+ cpu_base_array = (unsigned long *)&kvm_arm_hyp_percpu_base;
+ this_cpu_base = kern_hyp_va(cpu_base_array[cpu]);
+ elf_base = (unsigned long)&__per_cpu_start;
+ return this_cpu_base - elf_base;
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S
new file mode 100644
index 000000000000..f4562f417d3f
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2020 Google LLC.
+ * Written by David Brazdil <dbrazdil@google.com>
+ *
+ * Linker script used for partial linking of nVHE EL2 object files.
+ */
+
+#include <asm/hyp_image.h>
+#include <asm-generic/vmlinux.lds.h>
+#include <asm/cache.h>
+#include <asm/memory.h>
+
+SECTIONS {
+ HYP_SECTION(.idmap.text)
+ HYP_SECTION(.text)
+ HYP_SECTION(.data..ro_after_init)
+ HYP_SECTION(.rodata)
+
+ /*
+ * .hyp..data..percpu needs to be page aligned to maintain the same
+ * alignment for when linking into vmlinux.
+ */
+ . = ALIGN(PAGE_SIZE);
+ BEGIN_HYP_SECTION(.data..percpu)
+ PERCPU_INPUT(L1_CACHE_BYTES)
+ END_HYP_SECTION
+ HYP_SECTION(.bss)
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/list_debug.c b/arch/arm64/kvm/hyp/nvhe/list_debug.c
new file mode 100644
index 000000000000..46a2d4f2b3c6
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/list_debug.c
@@ -0,0 +1,56 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2022 - Google LLC
+ * Author: Keir Fraser <keirf@google.com>
+ */
+
+#include <linux/list.h>
+#include <linux/bug.h>
+
+static inline __must_check bool nvhe_check_data_corruption(bool v)
+{
+ return v;
+}
+
+#define NVHE_CHECK_DATA_CORRUPTION(condition) \
+ nvhe_check_data_corruption(({ \
+ bool corruption = unlikely(condition); \
+ if (corruption) { \
+ if (IS_ENABLED(CONFIG_BUG_ON_DATA_CORRUPTION)) { \
+ BUG_ON(1); \
+ } else \
+ WARN_ON(1); \
+ } \
+ corruption; \
+ }))
+
+/* The predicates checked here are taken from lib/list_debug.c. */
+
+__list_valid_slowpath
+bool __list_add_valid_or_report(struct list_head *new, struct list_head *prev,
+ struct list_head *next)
+{
+ if (NVHE_CHECK_DATA_CORRUPTION(next->prev != prev) ||
+ NVHE_CHECK_DATA_CORRUPTION(prev->next != next) ||
+ NVHE_CHECK_DATA_CORRUPTION(new == prev || new == next))
+ return false;
+
+ return true;
+}
+
+__list_valid_slowpath
+bool __list_del_entry_valid_or_report(struct list_head *entry)
+{
+ struct list_head *prev, *next;
+
+ prev = entry->prev;
+ next = entry->next;
+
+ if (NVHE_CHECK_DATA_CORRUPTION(next == LIST_POISON1) ||
+ NVHE_CHECK_DATA_CORRUPTION(prev == LIST_POISON2) ||
+ NVHE_CHECK_DATA_CORRUPTION(prev->next != entry) ||
+ NVHE_CHECK_DATA_CORRUPTION(next->prev != entry))
+ return false;
+
+ return true;
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
new file mode 100644
index 000000000000..861c76021a25
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -0,0 +1,1305 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 Google LLC
+ * Author: Quentin Perret <qperret@google.com>
+ */
+
+#include <linux/kvm_host.h>
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_hyp.h>
+#include <asm/kvm_mmu.h>
+#include <asm/kvm_pgtable.h>
+#include <asm/kvm_pkvm.h>
+#include <asm/stage2_pgtable.h>
+
+#include <hyp/fault.h>
+
+#include <nvhe/gfp.h>
+#include <nvhe/memory.h>
+#include <nvhe/mem_protect.h>
+#include <nvhe/mm.h>
+
+#define KVM_HOST_S2_FLAGS (KVM_PGTABLE_S2_NOFWB | KVM_PGTABLE_S2_IDMAP)
+
+struct host_mmu host_mmu;
+
+static struct hyp_pool host_s2_pool;
+
+static DEFINE_PER_CPU(struct pkvm_hyp_vm *, __current_vm);
+#define current_vm (*this_cpu_ptr(&__current_vm))
+
+static void guest_lock_component(struct pkvm_hyp_vm *vm)
+{
+ hyp_spin_lock(&vm->lock);
+ current_vm = vm;
+}
+
+static void guest_unlock_component(struct pkvm_hyp_vm *vm)
+{
+ current_vm = NULL;
+ hyp_spin_unlock(&vm->lock);
+}
+
+static void host_lock_component(void)
+{
+ hyp_spin_lock(&host_mmu.lock);
+}
+
+static void host_unlock_component(void)
+{
+ hyp_spin_unlock(&host_mmu.lock);
+}
+
+static void hyp_lock_component(void)
+{
+ hyp_spin_lock(&pkvm_pgd_lock);
+}
+
+static void hyp_unlock_component(void)
+{
+ hyp_spin_unlock(&pkvm_pgd_lock);
+}
+
+static void *host_s2_zalloc_pages_exact(size_t size)
+{
+ void *addr = hyp_alloc_pages(&host_s2_pool, get_order(size));
+
+ hyp_split_page(hyp_virt_to_page(addr));
+
+ /*
+ * The size of concatenated PGDs is always a power of two of PAGE_SIZE,
+ * so there should be no need to free any of the tail pages to make the
+ * allocation exact.
+ */
+ WARN_ON(size != (PAGE_SIZE << get_order(size)));
+
+ return addr;
+}
+
+static void *host_s2_zalloc_page(void *pool)
+{
+ return hyp_alloc_pages(pool, 0);
+}
+
+static void host_s2_get_page(void *addr)
+{
+ hyp_get_page(&host_s2_pool, addr);
+}
+
+static void host_s2_put_page(void *addr)
+{
+ hyp_put_page(&host_s2_pool, addr);
+}
+
+static void host_s2_free_unlinked_table(void *addr, s8 level)
+{
+ kvm_pgtable_stage2_free_unlinked(&host_mmu.mm_ops, addr, level);
+}
+
+static int prepare_s2_pool(void *pgt_pool_base)
+{
+ unsigned long nr_pages, pfn;
+ int ret;
+
+ pfn = hyp_virt_to_pfn(pgt_pool_base);
+ nr_pages = host_s2_pgtable_pages();
+ ret = hyp_pool_init(&host_s2_pool, pfn, nr_pages, 0);
+ if (ret)
+ return ret;
+
+ host_mmu.mm_ops = (struct kvm_pgtable_mm_ops) {
+ .zalloc_pages_exact = host_s2_zalloc_pages_exact,
+ .zalloc_page = host_s2_zalloc_page,
+ .free_unlinked_table = host_s2_free_unlinked_table,
+ .phys_to_virt = hyp_phys_to_virt,
+ .virt_to_phys = hyp_virt_to_phys,
+ .page_count = hyp_page_count,
+ .get_page = host_s2_get_page,
+ .put_page = host_s2_put_page,
+ };
+
+ return 0;
+}
+
+static void prepare_host_vtcr(void)
+{
+ u32 parange, phys_shift;
+
+ /* The host stage 2 is id-mapped, so use parange for T0SZ */
+ parange = kvm_get_parange(id_aa64mmfr0_el1_sys_val);
+ phys_shift = id_aa64mmfr0_parange_to_phys_shift(parange);
+
+ host_mmu.arch.mmu.vtcr = kvm_get_vtcr(id_aa64mmfr0_el1_sys_val,
+ id_aa64mmfr1_el1_sys_val, phys_shift);
+}
+
+static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot prot);
+
+int kvm_host_prepare_stage2(void *pgt_pool_base)
+{
+ struct kvm_s2_mmu *mmu = &host_mmu.arch.mmu;
+ int ret;
+
+ prepare_host_vtcr();
+ hyp_spin_lock_init(&host_mmu.lock);
+ mmu->arch = &host_mmu.arch;
+
+ ret = prepare_s2_pool(pgt_pool_base);
+ if (ret)
+ return ret;
+
+ ret = __kvm_pgtable_stage2_init(&host_mmu.pgt, mmu,
+ &host_mmu.mm_ops, KVM_HOST_S2_FLAGS,
+ host_stage2_force_pte_cb);
+ if (ret)
+ return ret;
+
+ mmu->pgd_phys = __hyp_pa(host_mmu.pgt.pgd);
+ mmu->pgt = &host_mmu.pgt;
+ atomic64_set(&mmu->vmid.id, 0);
+
+ return 0;
+}
+
+static bool guest_stage2_force_pte_cb(u64 addr, u64 end,
+ enum kvm_pgtable_prot prot)
+{
+ return true;
+}
+
+static void *guest_s2_zalloc_pages_exact(size_t size)
+{
+ void *addr = hyp_alloc_pages(&current_vm->pool, get_order(size));
+
+ WARN_ON(size != (PAGE_SIZE << get_order(size)));
+ hyp_split_page(hyp_virt_to_page(addr));
+
+ return addr;
+}
+
+static void guest_s2_free_pages_exact(void *addr, unsigned long size)
+{
+ u8 order = get_order(size);
+ unsigned int i;
+
+ for (i = 0; i < (1 << order); i++)
+ hyp_put_page(&current_vm->pool, addr + (i * PAGE_SIZE));
+}
+
+static void *guest_s2_zalloc_page(void *mc)
+{
+ struct hyp_page *p;
+ void *addr;
+
+ addr = hyp_alloc_pages(&current_vm->pool, 0);
+ if (addr)
+ return addr;
+
+ addr = pop_hyp_memcache(mc, hyp_phys_to_virt);
+ if (!addr)
+ return addr;
+
+ memset(addr, 0, PAGE_SIZE);
+ p = hyp_virt_to_page(addr);
+ memset(p, 0, sizeof(*p));
+ p->refcount = 1;
+
+ return addr;
+}
+
+static void guest_s2_get_page(void *addr)
+{
+ hyp_get_page(&current_vm->pool, addr);
+}
+
+static void guest_s2_put_page(void *addr)
+{
+ hyp_put_page(&current_vm->pool, addr);
+}
+
+static void clean_dcache_guest_page(void *va, size_t size)
+{
+ __clean_dcache_guest_page(hyp_fixmap_map(__hyp_pa(va)), size);
+ hyp_fixmap_unmap();
+}
+
+static void invalidate_icache_guest_page(void *va, size_t size)
+{
+ __invalidate_icache_guest_page(hyp_fixmap_map(__hyp_pa(va)), size);
+ hyp_fixmap_unmap();
+}
+
+int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd)
+{
+ struct kvm_s2_mmu *mmu = &vm->kvm.arch.mmu;
+ unsigned long nr_pages;
+ int ret;
+
+ nr_pages = kvm_pgtable_stage2_pgd_size(mmu->vtcr) >> PAGE_SHIFT;
+ ret = hyp_pool_init(&vm->pool, hyp_virt_to_pfn(pgd), nr_pages, 0);
+ if (ret)
+ return ret;
+
+ hyp_spin_lock_init(&vm->lock);
+ vm->mm_ops = (struct kvm_pgtable_mm_ops) {
+ .zalloc_pages_exact = guest_s2_zalloc_pages_exact,
+ .free_pages_exact = guest_s2_free_pages_exact,
+ .zalloc_page = guest_s2_zalloc_page,
+ .phys_to_virt = hyp_phys_to_virt,
+ .virt_to_phys = hyp_virt_to_phys,
+ .page_count = hyp_page_count,
+ .get_page = guest_s2_get_page,
+ .put_page = guest_s2_put_page,
+ .dcache_clean_inval_poc = clean_dcache_guest_page,
+ .icache_inval_pou = invalidate_icache_guest_page,
+ };
+
+ guest_lock_component(vm);
+ ret = __kvm_pgtable_stage2_init(mmu->pgt, mmu, &vm->mm_ops, 0,
+ guest_stage2_force_pte_cb);
+ guest_unlock_component(vm);
+ if (ret)
+ return ret;
+
+ vm->kvm.arch.mmu.pgd_phys = __hyp_pa(vm->pgt.pgd);
+
+ return 0;
+}
+
+void reclaim_guest_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc)
+{
+ void *addr;
+
+ /* Dump all pgtable pages in the hyp_pool */
+ guest_lock_component(vm);
+ kvm_pgtable_stage2_destroy(&vm->pgt);
+ vm->kvm.arch.mmu.pgd_phys = 0ULL;
+ guest_unlock_component(vm);
+
+ /* Drain the hyp_pool into the memcache */
+ addr = hyp_alloc_pages(&vm->pool, 0);
+ while (addr) {
+ memset(hyp_virt_to_page(addr), 0, sizeof(struct hyp_page));
+ push_hyp_memcache(mc, addr, hyp_virt_to_phys);
+ WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(addr), 1));
+ addr = hyp_alloc_pages(&vm->pool, 0);
+ }
+}
+
+int __pkvm_prot_finalize(void)
+{
+ struct kvm_s2_mmu *mmu = &host_mmu.arch.mmu;
+ struct kvm_nvhe_init_params *params = this_cpu_ptr(&kvm_init_params);
+
+ if (params->hcr_el2 & HCR_VM)
+ return -EPERM;
+
+ params->vttbr = kvm_get_vttbr(mmu);
+ params->vtcr = mmu->vtcr;
+ params->hcr_el2 |= HCR_VM;
+
+ /*
+ * The CMO below not only cleans the updated params to the
+ * PoC, but also provides the DSB that ensures ongoing
+ * page-table walks that have started before we trapped to EL2
+ * have completed.
+ */
+ kvm_flush_dcache_to_poc(params, sizeof(*params));
+
+ write_sysreg(params->hcr_el2, hcr_el2);
+ __load_stage2(&host_mmu.arch.mmu, &host_mmu.arch);
+
+ /*
+ * Make sure to have an ISB before the TLB maintenance below but only
+ * when __load_stage2() doesn't include one already.
+ */
+ asm(ALTERNATIVE("isb", "nop", ARM64_WORKAROUND_SPECULATIVE_AT));
+
+ /* Invalidate stale HCR bits that may be cached in TLBs */
+ __tlbi(vmalls12e1);
+ dsb(nsh);
+ isb();
+
+ return 0;
+}
+
+static int host_stage2_unmap_dev_all(void)
+{
+ struct kvm_pgtable *pgt = &host_mmu.pgt;
+ struct memblock_region *reg;
+ u64 addr = 0;
+ int i, ret;
+
+ /* Unmap all non-memory regions to recycle the pages */
+ for (i = 0; i < hyp_memblock_nr; i++, addr = reg->base + reg->size) {
+ reg = &hyp_memory[i];
+ ret = kvm_pgtable_stage2_unmap(pgt, addr, reg->base - addr);
+ if (ret)
+ return ret;
+ }
+ return kvm_pgtable_stage2_unmap(pgt, addr, BIT(pgt->ia_bits) - addr);
+}
+
+struct kvm_mem_range {
+ u64 start;
+ u64 end;
+};
+
+static struct memblock_region *find_mem_range(phys_addr_t addr, struct kvm_mem_range *range)
+{
+ int cur, left = 0, right = hyp_memblock_nr;
+ struct memblock_region *reg;
+ phys_addr_t end;
+
+ range->start = 0;
+ range->end = ULONG_MAX;
+
+ /* The list of memblock regions is sorted, binary search it */
+ while (left < right) {
+ cur = (left + right) >> 1;
+ reg = &hyp_memory[cur];
+ end = reg->base + reg->size;
+ if (addr < reg->base) {
+ right = cur;
+ range->end = reg->base;
+ } else if (addr >= end) {
+ left = cur + 1;
+ range->start = end;
+ } else {
+ range->start = reg->base;
+ range->end = end;
+ return reg;
+ }
+ }
+
+ return NULL;
+}
+
+bool addr_is_memory(phys_addr_t phys)
+{
+ struct kvm_mem_range range;
+
+ return !!find_mem_range(phys, &range);
+}
+
+static bool addr_is_allowed_memory(phys_addr_t phys)
+{
+ struct memblock_region *reg;
+ struct kvm_mem_range range;
+
+ reg = find_mem_range(phys, &range);
+
+ return reg && !(reg->flags & MEMBLOCK_NOMAP);
+}
+
+static bool is_in_mem_range(u64 addr, struct kvm_mem_range *range)
+{
+ return range->start <= addr && addr < range->end;
+}
+
+static bool range_is_memory(u64 start, u64 end)
+{
+ struct kvm_mem_range r;
+
+ if (!find_mem_range(start, &r))
+ return false;
+
+ return is_in_mem_range(end - 1, &r);
+}
+
+static inline int __host_stage2_idmap(u64 start, u64 end,
+ enum kvm_pgtable_prot prot)
+{
+ return kvm_pgtable_stage2_map(&host_mmu.pgt, start, end - start, start,
+ prot, &host_s2_pool, 0);
+}
+
+/*
+ * The pool has been provided with enough pages to cover all of memory with
+ * page granularity, but it is difficult to know how much of the MMIO range
+ * we will need to cover upfront, so we may need to 'recycle' the pages if we
+ * run out.
+ */
+#define host_stage2_try(fn, ...) \
+ ({ \
+ int __ret; \
+ hyp_assert_lock_held(&host_mmu.lock); \
+ __ret = fn(__VA_ARGS__); \
+ if (__ret == -ENOMEM) { \
+ __ret = host_stage2_unmap_dev_all(); \
+ if (!__ret) \
+ __ret = fn(__VA_ARGS__); \
+ } \
+ __ret; \
+ })
+
+static inline bool range_included(struct kvm_mem_range *child,
+ struct kvm_mem_range *parent)
+{
+ return parent->start <= child->start && child->end <= parent->end;
+}
+
+static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range)
+{
+ struct kvm_mem_range cur;
+ kvm_pte_t pte;
+ s8 level;
+ int ret;
+
+ hyp_assert_lock_held(&host_mmu.lock);
+ ret = kvm_pgtable_get_leaf(&host_mmu.pgt, addr, &pte, &level);
+ if (ret)
+ return ret;
+
+ if (kvm_pte_valid(pte))
+ return -EAGAIN;
+
+ if (pte)
+ return -EPERM;
+
+ do {
+ u64 granule = kvm_granule_size(level);
+ cur.start = ALIGN_DOWN(addr, granule);
+ cur.end = cur.start + granule;
+ level++;
+ } while ((level <= KVM_PGTABLE_LAST_LEVEL) &&
+ !(kvm_level_supports_block_mapping(level) &&
+ range_included(&cur, range)));
+
+ *range = cur;
+
+ return 0;
+}
+
+int host_stage2_idmap_locked(phys_addr_t addr, u64 size,
+ enum kvm_pgtable_prot prot)
+{
+ return host_stage2_try(__host_stage2_idmap, addr, addr + size, prot);
+}
+
+int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id)
+{
+ return host_stage2_try(kvm_pgtable_stage2_set_owner, &host_mmu.pgt,
+ addr, size, &host_s2_pool, owner_id);
+}
+
+static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot prot)
+{
+ /*
+ * Block mappings must be used with care in the host stage-2 as a
+ * kvm_pgtable_stage2_map() operation targeting a page in the range of
+ * an existing block will delete the block under the assumption that
+ * mappings in the rest of the block range can always be rebuilt lazily.
+ * That assumption is correct for the host stage-2 with RWX mappings
+ * targeting memory or RW mappings targeting MMIO ranges (see
+ * host_stage2_idmap() below which implements some of the host memory
+ * abort logic). However, this is not safe for any other mappings where
+ * the host stage-2 page-table is in fact the only place where this
+ * state is stored. In all those cases, it is safer to use page-level
+ * mappings, hence avoiding to lose the state because of side-effects in
+ * kvm_pgtable_stage2_map().
+ */
+ if (range_is_memory(addr, end))
+ return prot != PKVM_HOST_MEM_PROT;
+ else
+ return prot != PKVM_HOST_MMIO_PROT;
+}
+
+static int host_stage2_idmap(u64 addr)
+{
+ struct kvm_mem_range range;
+ bool is_memory = !!find_mem_range(addr, &range);
+ enum kvm_pgtable_prot prot;
+ int ret;
+
+ prot = is_memory ? PKVM_HOST_MEM_PROT : PKVM_HOST_MMIO_PROT;
+
+ host_lock_component();
+ ret = host_stage2_adjust_range(addr, &range);
+ if (ret)
+ goto unlock;
+
+ ret = host_stage2_idmap_locked(range.start, range.end - range.start, prot);
+unlock:
+ host_unlock_component();
+
+ return ret;
+}
+
+void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt)
+{
+ struct kvm_vcpu_fault_info fault;
+ u64 esr, addr;
+ int ret = 0;
+
+ esr = read_sysreg_el2(SYS_ESR);
+ BUG_ON(!__get_fault_info(esr, &fault));
+
+ addr = (fault.hpfar_el2 & HPFAR_MASK) << 8;
+ ret = host_stage2_idmap(addr);
+ BUG_ON(ret && ret != -EAGAIN);
+}
+
+struct pkvm_mem_transition {
+ u64 nr_pages;
+
+ struct {
+ enum pkvm_component_id id;
+ /* Address in the initiator's address space */
+ u64 addr;
+
+ union {
+ struct {
+ /* Address in the completer's address space */
+ u64 completer_addr;
+ } host;
+ struct {
+ u64 completer_addr;
+ } hyp;
+ };
+ } initiator;
+
+ struct {
+ enum pkvm_component_id id;
+ } completer;
+};
+
+struct pkvm_mem_share {
+ const struct pkvm_mem_transition tx;
+ const enum kvm_pgtable_prot completer_prot;
+};
+
+struct pkvm_mem_donation {
+ const struct pkvm_mem_transition tx;
+};
+
+struct check_walk_data {
+ enum pkvm_page_state desired;
+ enum pkvm_page_state (*get_page_state)(kvm_pte_t pte, u64 addr);
+};
+
+static int __check_page_state_visitor(const struct kvm_pgtable_visit_ctx *ctx,
+ enum kvm_pgtable_walk_flags visit)
+{
+ struct check_walk_data *d = ctx->arg;
+
+ return d->get_page_state(ctx->old, ctx->addr) == d->desired ? 0 : -EPERM;
+}
+
+static int check_page_state_range(struct kvm_pgtable *pgt, u64 addr, u64 size,
+ struct check_walk_data *data)
+{
+ struct kvm_pgtable_walker walker = {
+ .cb = __check_page_state_visitor,
+ .arg = data,
+ .flags = KVM_PGTABLE_WALK_LEAF,
+ };
+
+ return kvm_pgtable_walk(pgt, addr, size, &walker);
+}
+
+static enum pkvm_page_state host_get_page_state(kvm_pte_t pte, u64 addr)
+{
+ if (!addr_is_allowed_memory(addr))
+ return PKVM_NOPAGE;
+
+ if (!kvm_pte_valid(pte) && pte)
+ return PKVM_NOPAGE;
+
+ return pkvm_getstate(kvm_pgtable_stage2_pte_prot(pte));
+}
+
+static int __host_check_page_state_range(u64 addr, u64 size,
+ enum pkvm_page_state state)
+{
+ struct check_walk_data d = {
+ .desired = state,
+ .get_page_state = host_get_page_state,
+ };
+
+ hyp_assert_lock_held(&host_mmu.lock);
+ return check_page_state_range(&host_mmu.pgt, addr, size, &d);
+}
+
+static int __host_set_page_state_range(u64 addr, u64 size,
+ enum pkvm_page_state state)
+{
+ enum kvm_pgtable_prot prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, state);
+
+ return host_stage2_idmap_locked(addr, size, prot);
+}
+
+static int host_request_owned_transition(u64 *completer_addr,
+ const struct pkvm_mem_transition *tx)
+{
+ u64 size = tx->nr_pages * PAGE_SIZE;
+ u64 addr = tx->initiator.addr;
+
+ *completer_addr = tx->initiator.host.completer_addr;
+ return __host_check_page_state_range(addr, size, PKVM_PAGE_OWNED);
+}
+
+static int host_request_unshare(u64 *completer_addr,
+ const struct pkvm_mem_transition *tx)
+{
+ u64 size = tx->nr_pages * PAGE_SIZE;
+ u64 addr = tx->initiator.addr;
+
+ *completer_addr = tx->initiator.host.completer_addr;
+ return __host_check_page_state_range(addr, size, PKVM_PAGE_SHARED_OWNED);
+}
+
+static int host_initiate_share(u64 *completer_addr,
+ const struct pkvm_mem_transition *tx)
+{
+ u64 size = tx->nr_pages * PAGE_SIZE;
+ u64 addr = tx->initiator.addr;
+
+ *completer_addr = tx->initiator.host.completer_addr;
+ return __host_set_page_state_range(addr, size, PKVM_PAGE_SHARED_OWNED);
+}
+
+static int host_initiate_unshare(u64 *completer_addr,
+ const struct pkvm_mem_transition *tx)
+{
+ u64 size = tx->nr_pages * PAGE_SIZE;
+ u64 addr = tx->initiator.addr;
+
+ *completer_addr = tx->initiator.host.completer_addr;
+ return __host_set_page_state_range(addr, size, PKVM_PAGE_OWNED);
+}
+
+static int host_initiate_donation(u64 *completer_addr,
+ const struct pkvm_mem_transition *tx)
+{
+ u8 owner_id = tx->completer.id;
+ u64 size = tx->nr_pages * PAGE_SIZE;
+
+ *completer_addr = tx->initiator.host.completer_addr;
+ return host_stage2_set_owner_locked(tx->initiator.addr, size, owner_id);
+}
+
+static bool __host_ack_skip_pgtable_check(const struct pkvm_mem_transition *tx)
+{
+ return !(IS_ENABLED(CONFIG_NVHE_EL2_DEBUG) ||
+ tx->initiator.id != PKVM_ID_HYP);
+}
+
+static int __host_ack_transition(u64 addr, const struct pkvm_mem_transition *tx,
+ enum pkvm_page_state state)
+{
+ u64 size = tx->nr_pages * PAGE_SIZE;
+
+ if (__host_ack_skip_pgtable_check(tx))
+ return 0;
+
+ return __host_check_page_state_range(addr, size, state);
+}
+
+static int host_ack_donation(u64 addr, const struct pkvm_mem_transition *tx)
+{
+ return __host_ack_transition(addr, tx, PKVM_NOPAGE);
+}
+
+static int host_complete_donation(u64 addr, const struct pkvm_mem_transition *tx)
+{
+ u64 size = tx->nr_pages * PAGE_SIZE;
+ u8 host_id = tx->completer.id;
+
+ return host_stage2_set_owner_locked(addr, size, host_id);
+}
+
+static enum pkvm_page_state hyp_get_page_state(kvm_pte_t pte, u64 addr)
+{
+ if (!kvm_pte_valid(pte))
+ return PKVM_NOPAGE;
+
+ return pkvm_getstate(kvm_pgtable_hyp_pte_prot(pte));
+}
+
+static int __hyp_check_page_state_range(u64 addr, u64 size,
+ enum pkvm_page_state state)
+{
+ struct check_walk_data d = {
+ .desired = state,
+ .get_page_state = hyp_get_page_state,
+ };
+
+ hyp_assert_lock_held(&pkvm_pgd_lock);
+ return check_page_state_range(&pkvm_pgtable, addr, size, &d);
+}
+
+static int hyp_request_donation(u64 *completer_addr,
+ const struct pkvm_mem_transition *tx)
+{
+ u64 size = tx->nr_pages * PAGE_SIZE;
+ u64 addr = tx->initiator.addr;
+
+ *completer_addr = tx->initiator.hyp.completer_addr;
+ return __hyp_check_page_state_range(addr, size, PKVM_PAGE_OWNED);
+}
+
+static int hyp_initiate_donation(u64 *completer_addr,
+ const struct pkvm_mem_transition *tx)
+{
+ u64 size = tx->nr_pages * PAGE_SIZE;
+ int ret;
+
+ *completer_addr = tx->initiator.hyp.completer_addr;
+ ret = kvm_pgtable_hyp_unmap(&pkvm_pgtable, tx->initiator.addr, size);
+ return (ret != size) ? -EFAULT : 0;
+}
+
+static bool __hyp_ack_skip_pgtable_check(const struct pkvm_mem_transition *tx)
+{
+ return !(IS_ENABLED(CONFIG_NVHE_EL2_DEBUG) ||
+ tx->initiator.id != PKVM_ID_HOST);
+}
+
+static int hyp_ack_share(u64 addr, const struct pkvm_mem_transition *tx,
+ enum kvm_pgtable_prot perms)
+{
+ u64 size = tx->nr_pages * PAGE_SIZE;
+
+ if (perms != PAGE_HYP)
+ return -EPERM;
+
+ if (__hyp_ack_skip_pgtable_check(tx))
+ return 0;
+
+ return __hyp_check_page_state_range(addr, size, PKVM_NOPAGE);
+}
+
+static int hyp_ack_unshare(u64 addr, const struct pkvm_mem_transition *tx)
+{
+ u64 size = tx->nr_pages * PAGE_SIZE;
+
+ if (tx->initiator.id == PKVM_ID_HOST && hyp_page_count((void *)addr))
+ return -EBUSY;
+
+ if (__hyp_ack_skip_pgtable_check(tx))
+ return 0;
+
+ return __hyp_check_page_state_range(addr, size,
+ PKVM_PAGE_SHARED_BORROWED);
+}
+
+static int hyp_ack_donation(u64 addr, const struct pkvm_mem_transition *tx)
+{
+ u64 size = tx->nr_pages * PAGE_SIZE;
+
+ if (__hyp_ack_skip_pgtable_check(tx))
+ return 0;
+
+ return __hyp_check_page_state_range(addr, size, PKVM_NOPAGE);
+}
+
+static int hyp_complete_share(u64 addr, const struct pkvm_mem_transition *tx,
+ enum kvm_pgtable_prot perms)
+{
+ void *start = (void *)addr, *end = start + (tx->nr_pages * PAGE_SIZE);
+ enum kvm_pgtable_prot prot;
+
+ prot = pkvm_mkstate(perms, PKVM_PAGE_SHARED_BORROWED);
+ return pkvm_create_mappings_locked(start, end, prot);
+}
+
+static int hyp_complete_unshare(u64 addr, const struct pkvm_mem_transition *tx)
+{
+ u64 size = tx->nr_pages * PAGE_SIZE;
+ int ret = kvm_pgtable_hyp_unmap(&pkvm_pgtable, addr, size);
+
+ return (ret != size) ? -EFAULT : 0;
+}
+
+static int hyp_complete_donation(u64 addr,
+ const struct pkvm_mem_transition *tx)
+{
+ void *start = (void *)addr, *end = start + (tx->nr_pages * PAGE_SIZE);
+ enum kvm_pgtable_prot prot = pkvm_mkstate(PAGE_HYP, PKVM_PAGE_OWNED);
+
+ return pkvm_create_mappings_locked(start, end, prot);
+}
+
+static int check_share(struct pkvm_mem_share *share)
+{
+ const struct pkvm_mem_transition *tx = &share->tx;
+ u64 completer_addr;
+ int ret;
+
+ switch (tx->initiator.id) {
+ case PKVM_ID_HOST:
+ ret = host_request_owned_transition(&completer_addr, tx);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+
+ if (ret)
+ return ret;
+
+ switch (tx->completer.id) {
+ case PKVM_ID_HYP:
+ ret = hyp_ack_share(completer_addr, tx, share->completer_prot);
+ break;
+ case PKVM_ID_FFA:
+ /*
+ * We only check the host; the secure side will check the other
+ * end when we forward the FFA call.
+ */
+ ret = 0;
+ break;
+ default:
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+static int __do_share(struct pkvm_mem_share *share)
+{
+ const struct pkvm_mem_transition *tx = &share->tx;
+ u64 completer_addr;
+ int ret;
+
+ switch (tx->initiator.id) {
+ case PKVM_ID_HOST:
+ ret = host_initiate_share(&completer_addr, tx);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+
+ if (ret)
+ return ret;
+
+ switch (tx->completer.id) {
+ case PKVM_ID_HYP:
+ ret = hyp_complete_share(completer_addr, tx, share->completer_prot);
+ break;
+ case PKVM_ID_FFA:
+ /*
+ * We're not responsible for any secure page-tables, so there's
+ * nothing to do here.
+ */
+ ret = 0;
+ break;
+ default:
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+/*
+ * do_share():
+ *
+ * The page owner grants access to another component with a given set
+ * of permissions.
+ *
+ * Initiator: OWNED => SHARED_OWNED
+ * Completer: NOPAGE => SHARED_BORROWED
+ */
+static int do_share(struct pkvm_mem_share *share)
+{
+ int ret;
+
+ ret = check_share(share);
+ if (ret)
+ return ret;
+
+ return WARN_ON(__do_share(share));
+}
+
+static int check_unshare(struct pkvm_mem_share *share)
+{
+ const struct pkvm_mem_transition *tx = &share->tx;
+ u64 completer_addr;
+ int ret;
+
+ switch (tx->initiator.id) {
+ case PKVM_ID_HOST:
+ ret = host_request_unshare(&completer_addr, tx);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+
+ if (ret)
+ return ret;
+
+ switch (tx->completer.id) {
+ case PKVM_ID_HYP:
+ ret = hyp_ack_unshare(completer_addr, tx);
+ break;
+ case PKVM_ID_FFA:
+ /* See check_share() */
+ ret = 0;
+ break;
+ default:
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+static int __do_unshare(struct pkvm_mem_share *share)
+{
+ const struct pkvm_mem_transition *tx = &share->tx;
+ u64 completer_addr;
+ int ret;
+
+ switch (tx->initiator.id) {
+ case PKVM_ID_HOST:
+ ret = host_initiate_unshare(&completer_addr, tx);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+
+ if (ret)
+ return ret;
+
+ switch (tx->completer.id) {
+ case PKVM_ID_HYP:
+ ret = hyp_complete_unshare(completer_addr, tx);
+ break;
+ case PKVM_ID_FFA:
+ /* See __do_share() */
+ ret = 0;
+ break;
+ default:
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+/*
+ * do_unshare():
+ *
+ * The page owner revokes access from another component for a range of
+ * pages which were previously shared using do_share().
+ *
+ * Initiator: SHARED_OWNED => OWNED
+ * Completer: SHARED_BORROWED => NOPAGE
+ */
+static int do_unshare(struct pkvm_mem_share *share)
+{
+ int ret;
+
+ ret = check_unshare(share);
+ if (ret)
+ return ret;
+
+ return WARN_ON(__do_unshare(share));
+}
+
+static int check_donation(struct pkvm_mem_donation *donation)
+{
+ const struct pkvm_mem_transition *tx = &donation->tx;
+ u64 completer_addr;
+ int ret;
+
+ switch (tx->initiator.id) {
+ case PKVM_ID_HOST:
+ ret = host_request_owned_transition(&completer_addr, tx);
+ break;
+ case PKVM_ID_HYP:
+ ret = hyp_request_donation(&completer_addr, tx);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+
+ if (ret)
+ return ret;
+
+ switch (tx->completer.id) {
+ case PKVM_ID_HOST:
+ ret = host_ack_donation(completer_addr, tx);
+ break;
+ case PKVM_ID_HYP:
+ ret = hyp_ack_donation(completer_addr, tx);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+static int __do_donate(struct pkvm_mem_donation *donation)
+{
+ const struct pkvm_mem_transition *tx = &donation->tx;
+ u64 completer_addr;
+ int ret;
+
+ switch (tx->initiator.id) {
+ case PKVM_ID_HOST:
+ ret = host_initiate_donation(&completer_addr, tx);
+ break;
+ case PKVM_ID_HYP:
+ ret = hyp_initiate_donation(&completer_addr, tx);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+
+ if (ret)
+ return ret;
+
+ switch (tx->completer.id) {
+ case PKVM_ID_HOST:
+ ret = host_complete_donation(completer_addr, tx);
+ break;
+ case PKVM_ID_HYP:
+ ret = hyp_complete_donation(completer_addr, tx);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+/*
+ * do_donate():
+ *
+ * The page owner transfers ownership to another component, losing access
+ * as a consequence.
+ *
+ * Initiator: OWNED => NOPAGE
+ * Completer: NOPAGE => OWNED
+ */
+static int do_donate(struct pkvm_mem_donation *donation)
+{
+ int ret;
+
+ ret = check_donation(donation);
+ if (ret)
+ return ret;
+
+ return WARN_ON(__do_donate(donation));
+}
+
+int __pkvm_host_share_hyp(u64 pfn)
+{
+ int ret;
+ u64 host_addr = hyp_pfn_to_phys(pfn);
+ u64 hyp_addr = (u64)__hyp_va(host_addr);
+ struct pkvm_mem_share share = {
+ .tx = {
+ .nr_pages = 1,
+ .initiator = {
+ .id = PKVM_ID_HOST,
+ .addr = host_addr,
+ .host = {
+ .completer_addr = hyp_addr,
+ },
+ },
+ .completer = {
+ .id = PKVM_ID_HYP,
+ },
+ },
+ .completer_prot = PAGE_HYP,
+ };
+
+ host_lock_component();
+ hyp_lock_component();
+
+ ret = do_share(&share);
+
+ hyp_unlock_component();
+ host_unlock_component();
+
+ return ret;
+}
+
+int __pkvm_host_unshare_hyp(u64 pfn)
+{
+ int ret;
+ u64 host_addr = hyp_pfn_to_phys(pfn);
+ u64 hyp_addr = (u64)__hyp_va(host_addr);
+ struct pkvm_mem_share share = {
+ .tx = {
+ .nr_pages = 1,
+ .initiator = {
+ .id = PKVM_ID_HOST,
+ .addr = host_addr,
+ .host = {
+ .completer_addr = hyp_addr,
+ },
+ },
+ .completer = {
+ .id = PKVM_ID_HYP,
+ },
+ },
+ .completer_prot = PAGE_HYP,
+ };
+
+ host_lock_component();
+ hyp_lock_component();
+
+ ret = do_unshare(&share);
+
+ hyp_unlock_component();
+ host_unlock_component();
+
+ return ret;
+}
+
+int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages)
+{
+ int ret;
+ u64 host_addr = hyp_pfn_to_phys(pfn);
+ u64 hyp_addr = (u64)__hyp_va(host_addr);
+ struct pkvm_mem_donation donation = {
+ .tx = {
+ .nr_pages = nr_pages,
+ .initiator = {
+ .id = PKVM_ID_HOST,
+ .addr = host_addr,
+ .host = {
+ .completer_addr = hyp_addr,
+ },
+ },
+ .completer = {
+ .id = PKVM_ID_HYP,
+ },
+ },
+ };
+
+ host_lock_component();
+ hyp_lock_component();
+
+ ret = do_donate(&donation);
+
+ hyp_unlock_component();
+ host_unlock_component();
+
+ return ret;
+}
+
+int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages)
+{
+ int ret;
+ u64 host_addr = hyp_pfn_to_phys(pfn);
+ u64 hyp_addr = (u64)__hyp_va(host_addr);
+ struct pkvm_mem_donation donation = {
+ .tx = {
+ .nr_pages = nr_pages,
+ .initiator = {
+ .id = PKVM_ID_HYP,
+ .addr = hyp_addr,
+ .hyp = {
+ .completer_addr = host_addr,
+ },
+ },
+ .completer = {
+ .id = PKVM_ID_HOST,
+ },
+ },
+ };
+
+ host_lock_component();
+ hyp_lock_component();
+
+ ret = do_donate(&donation);
+
+ hyp_unlock_component();
+ host_unlock_component();
+
+ return ret;
+}
+
+int hyp_pin_shared_mem(void *from, void *to)
+{
+ u64 cur, start = ALIGN_DOWN((u64)from, PAGE_SIZE);
+ u64 end = PAGE_ALIGN((u64)to);
+ u64 size = end - start;
+ int ret;
+
+ host_lock_component();
+ hyp_lock_component();
+
+ ret = __host_check_page_state_range(__hyp_pa(start), size,
+ PKVM_PAGE_SHARED_OWNED);
+ if (ret)
+ goto unlock;
+
+ ret = __hyp_check_page_state_range(start, size,
+ PKVM_PAGE_SHARED_BORROWED);
+ if (ret)
+ goto unlock;
+
+ for (cur = start; cur < end; cur += PAGE_SIZE)
+ hyp_page_ref_inc(hyp_virt_to_page(cur));
+
+unlock:
+ hyp_unlock_component();
+ host_unlock_component();
+
+ return ret;
+}
+
+void hyp_unpin_shared_mem(void *from, void *to)
+{
+ u64 cur, start = ALIGN_DOWN((u64)from, PAGE_SIZE);
+ u64 end = PAGE_ALIGN((u64)to);
+
+ host_lock_component();
+ hyp_lock_component();
+
+ for (cur = start; cur < end; cur += PAGE_SIZE)
+ hyp_page_ref_dec(hyp_virt_to_page(cur));
+
+ hyp_unlock_component();
+ host_unlock_component();
+}
+
+int __pkvm_host_share_ffa(u64 pfn, u64 nr_pages)
+{
+ int ret;
+ struct pkvm_mem_share share = {
+ .tx = {
+ .nr_pages = nr_pages,
+ .initiator = {
+ .id = PKVM_ID_HOST,
+ .addr = hyp_pfn_to_phys(pfn),
+ },
+ .completer = {
+ .id = PKVM_ID_FFA,
+ },
+ },
+ };
+
+ host_lock_component();
+ ret = do_share(&share);
+ host_unlock_component();
+
+ return ret;
+}
+
+int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages)
+{
+ int ret;
+ struct pkvm_mem_share share = {
+ .tx = {
+ .nr_pages = nr_pages,
+ .initiator = {
+ .id = PKVM_ID_HOST,
+ .addr = hyp_pfn_to_phys(pfn),
+ },
+ .completer = {
+ .id = PKVM_ID_FFA,
+ },
+ },
+ };
+
+ host_lock_component();
+ ret = do_unshare(&share);
+ host_unlock_component();
+
+ return ret;
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c
new file mode 100644
index 000000000000..8850b591d775
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/mm.c
@@ -0,0 +1,423 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 Google LLC
+ * Author: Quentin Perret <qperret@google.com>
+ */
+
+#include <linux/kvm_host.h>
+#include <asm/kvm_hyp.h>
+#include <asm/kvm_mmu.h>
+#include <asm/kvm_pgtable.h>
+#include <asm/kvm_pkvm.h>
+#include <asm/spectre.h>
+
+#include <nvhe/early_alloc.h>
+#include <nvhe/gfp.h>
+#include <nvhe/memory.h>
+#include <nvhe/mem_protect.h>
+#include <nvhe/mm.h>
+#include <nvhe/spinlock.h>
+
+struct kvm_pgtable pkvm_pgtable;
+hyp_spinlock_t pkvm_pgd_lock;
+
+struct memblock_region hyp_memory[HYP_MEMBLOCK_REGIONS];
+unsigned int hyp_memblock_nr;
+
+static u64 __io_map_base;
+
+struct hyp_fixmap_slot {
+ u64 addr;
+ kvm_pte_t *ptep;
+};
+static DEFINE_PER_CPU(struct hyp_fixmap_slot, fixmap_slots);
+
+static int __pkvm_create_mappings(unsigned long start, unsigned long size,
+ unsigned long phys, enum kvm_pgtable_prot prot)
+{
+ int err;
+
+ hyp_spin_lock(&pkvm_pgd_lock);
+ err = kvm_pgtable_hyp_map(&pkvm_pgtable, start, size, phys, prot);
+ hyp_spin_unlock(&pkvm_pgd_lock);
+
+ return err;
+}
+
+static int __pkvm_alloc_private_va_range(unsigned long start, size_t size)
+{
+ unsigned long cur;
+
+ hyp_assert_lock_held(&pkvm_pgd_lock);
+
+ if (!start || start < __io_map_base)
+ return -EINVAL;
+
+ /* The allocated size is always a multiple of PAGE_SIZE */
+ cur = start + PAGE_ALIGN(size);
+
+ /* Are we overflowing on the vmemmap ? */
+ if (cur > __hyp_vmemmap)
+ return -ENOMEM;
+
+ __io_map_base = cur;
+
+ return 0;
+}
+
+/**
+ * pkvm_alloc_private_va_range - Allocates a private VA range.
+ * @size: The size of the VA range to reserve.
+ * @haddr: The hypervisor virtual start address of the allocation.
+ *
+ * The private virtual address (VA) range is allocated above __io_map_base
+ * and aligned based on the order of @size.
+ *
+ * Return: 0 on success or negative error code on failure.
+ */
+int pkvm_alloc_private_va_range(size_t size, unsigned long *haddr)
+{
+ unsigned long addr;
+ int ret;
+
+ hyp_spin_lock(&pkvm_pgd_lock);
+ addr = __io_map_base;
+ ret = __pkvm_alloc_private_va_range(addr, size);
+ hyp_spin_unlock(&pkvm_pgd_lock);
+
+ *haddr = addr;
+
+ return ret;
+}
+
+int __pkvm_create_private_mapping(phys_addr_t phys, size_t size,
+ enum kvm_pgtable_prot prot,
+ unsigned long *haddr)
+{
+ unsigned long addr;
+ int err;
+
+ size = PAGE_ALIGN(size + offset_in_page(phys));
+ err = pkvm_alloc_private_va_range(size, &addr);
+ if (err)
+ return err;
+
+ err = __pkvm_create_mappings(addr, size, phys, prot);
+ if (err)
+ return err;
+
+ *haddr = addr + offset_in_page(phys);
+ return err;
+}
+
+int pkvm_create_mappings_locked(void *from, void *to, enum kvm_pgtable_prot prot)
+{
+ unsigned long start = (unsigned long)from;
+ unsigned long end = (unsigned long)to;
+ unsigned long virt_addr;
+ phys_addr_t phys;
+
+ hyp_assert_lock_held(&pkvm_pgd_lock);
+
+ start = start & PAGE_MASK;
+ end = PAGE_ALIGN(end);
+
+ for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
+ int err;
+
+ phys = hyp_virt_to_phys((void *)virt_addr);
+ err = kvm_pgtable_hyp_map(&pkvm_pgtable, virt_addr, PAGE_SIZE,
+ phys, prot);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
+{
+ int ret;
+
+ hyp_spin_lock(&pkvm_pgd_lock);
+ ret = pkvm_create_mappings_locked(from, to, prot);
+ hyp_spin_unlock(&pkvm_pgd_lock);
+
+ return ret;
+}
+
+int hyp_back_vmemmap(phys_addr_t back)
+{
+ unsigned long i, start, size, end = 0;
+ int ret;
+
+ for (i = 0; i < hyp_memblock_nr; i++) {
+ start = hyp_memory[i].base;
+ start = ALIGN_DOWN((u64)hyp_phys_to_page(start), PAGE_SIZE);
+ /*
+ * The beginning of the hyp_vmemmap region for the current
+ * memblock may already be backed by the page backing the end
+ * the previous region, so avoid mapping it twice.
+ */
+ start = max(start, end);
+
+ end = hyp_memory[i].base + hyp_memory[i].size;
+ end = PAGE_ALIGN((u64)hyp_phys_to_page(end));
+ if (start >= end)
+ continue;
+
+ size = end - start;
+ ret = __pkvm_create_mappings(start, size, back, PAGE_HYP);
+ if (ret)
+ return ret;
+
+ memset(hyp_phys_to_virt(back), 0, size);
+ back += size;
+ }
+
+ return 0;
+}
+
+static void *__hyp_bp_vect_base;
+int pkvm_cpu_set_vector(enum arm64_hyp_spectre_vector slot)
+{
+ void *vector;
+
+ switch (slot) {
+ case HYP_VECTOR_DIRECT: {
+ vector = __kvm_hyp_vector;
+ break;
+ }
+ case HYP_VECTOR_SPECTRE_DIRECT: {
+ vector = __bp_harden_hyp_vecs;
+ break;
+ }
+ case HYP_VECTOR_INDIRECT:
+ case HYP_VECTOR_SPECTRE_INDIRECT: {
+ vector = (void *)__hyp_bp_vect_base;
+ break;
+ }
+ default:
+ return -EINVAL;
+ }
+
+ vector = __kvm_vector_slot2addr(vector, slot);
+ *this_cpu_ptr(&kvm_hyp_vector) = (unsigned long)vector;
+
+ return 0;
+}
+
+int hyp_map_vectors(void)
+{
+ phys_addr_t phys;
+ unsigned long bp_base;
+ int ret;
+
+ if (!kvm_system_needs_idmapped_vectors()) {
+ __hyp_bp_vect_base = __bp_harden_hyp_vecs;
+ return 0;
+ }
+
+ phys = __hyp_pa(__bp_harden_hyp_vecs);
+ ret = __pkvm_create_private_mapping(phys, __BP_HARDEN_HYP_VECS_SZ,
+ PAGE_HYP_EXEC, &bp_base);
+ if (ret)
+ return ret;
+
+ __hyp_bp_vect_base = (void *)bp_base;
+
+ return 0;
+}
+
+void *hyp_fixmap_map(phys_addr_t phys)
+{
+ struct hyp_fixmap_slot *slot = this_cpu_ptr(&fixmap_slots);
+ kvm_pte_t pte, *ptep = slot->ptep;
+
+ pte = *ptep;
+ pte &= ~kvm_phys_to_pte(KVM_PHYS_INVALID);
+ pte |= kvm_phys_to_pte(phys) | KVM_PTE_VALID;
+ WRITE_ONCE(*ptep, pte);
+ dsb(ishst);
+
+ return (void *)slot->addr;
+}
+
+static void fixmap_clear_slot(struct hyp_fixmap_slot *slot)
+{
+ kvm_pte_t *ptep = slot->ptep;
+ u64 addr = slot->addr;
+
+ WRITE_ONCE(*ptep, *ptep & ~KVM_PTE_VALID);
+
+ /*
+ * Irritatingly, the architecture requires that we use inner-shareable
+ * broadcast TLB invalidation here in case another CPU speculates
+ * through our fixmap and decides to create an "amalagamation of the
+ * values held in the TLB" due to the apparent lack of a
+ * break-before-make sequence.
+ *
+ * https://lore.kernel.org/kvm/20221017115209.2099-1-will@kernel.org/T/#mf10dfbaf1eaef9274c581b81c53758918c1d0f03
+ */
+ dsb(ishst);
+ __tlbi_level(vale2is, __TLBI_VADDR(addr, 0), KVM_PGTABLE_LAST_LEVEL);
+ dsb(ish);
+ isb();
+}
+
+void hyp_fixmap_unmap(void)
+{
+ fixmap_clear_slot(this_cpu_ptr(&fixmap_slots));
+}
+
+static int __create_fixmap_slot_cb(const struct kvm_pgtable_visit_ctx *ctx,
+ enum kvm_pgtable_walk_flags visit)
+{
+ struct hyp_fixmap_slot *slot = per_cpu_ptr(&fixmap_slots, (u64)ctx->arg);
+
+ if (!kvm_pte_valid(ctx->old) || ctx->level != KVM_PGTABLE_LAST_LEVEL)
+ return -EINVAL;
+
+ slot->addr = ctx->addr;
+ slot->ptep = ctx->ptep;
+
+ /*
+ * Clear the PTE, but keep the page-table page refcount elevated to
+ * prevent it from ever being freed. This lets us manipulate the PTEs
+ * by hand safely without ever needing to allocate memory.
+ */
+ fixmap_clear_slot(slot);
+
+ return 0;
+}
+
+static int create_fixmap_slot(u64 addr, u64 cpu)
+{
+ struct kvm_pgtable_walker walker = {
+ .cb = __create_fixmap_slot_cb,
+ .flags = KVM_PGTABLE_WALK_LEAF,
+ .arg = (void *)cpu,
+ };
+
+ return kvm_pgtable_walk(&pkvm_pgtable, addr, PAGE_SIZE, &walker);
+}
+
+int hyp_create_pcpu_fixmap(void)
+{
+ unsigned long addr, i;
+ int ret;
+
+ for (i = 0; i < hyp_nr_cpus; i++) {
+ ret = pkvm_alloc_private_va_range(PAGE_SIZE, &addr);
+ if (ret)
+ return ret;
+
+ ret = kvm_pgtable_hyp_map(&pkvm_pgtable, addr, PAGE_SIZE,
+ __hyp_pa(__hyp_bss_start), PAGE_HYP);
+ if (ret)
+ return ret;
+
+ ret = create_fixmap_slot(addr, i);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+int hyp_create_idmap(u32 hyp_va_bits)
+{
+ unsigned long start, end;
+
+ start = hyp_virt_to_phys((void *)__hyp_idmap_text_start);
+ start = ALIGN_DOWN(start, PAGE_SIZE);
+
+ end = hyp_virt_to_phys((void *)__hyp_idmap_text_end);
+ end = ALIGN(end, PAGE_SIZE);
+
+ /*
+ * One half of the VA space is reserved to linearly map portions of
+ * memory -- see va_layout.c for more details. The other half of the VA
+ * space contains the trampoline page, and needs some care. Split that
+ * second half in two and find the quarter of VA space not conflicting
+ * with the idmap to place the IOs and the vmemmap. IOs use the lower
+ * half of the quarter and the vmemmap the upper half.
+ */
+ __io_map_base = start & BIT(hyp_va_bits - 2);
+ __io_map_base ^= BIT(hyp_va_bits - 2);
+ __hyp_vmemmap = __io_map_base | BIT(hyp_va_bits - 3);
+
+ return __pkvm_create_mappings(start, end - start, start, PAGE_HYP_EXEC);
+}
+
+int pkvm_create_stack(phys_addr_t phys, unsigned long *haddr)
+{
+ unsigned long addr, prev_base;
+ size_t size;
+ int ret;
+
+ hyp_spin_lock(&pkvm_pgd_lock);
+
+ prev_base = __io_map_base;
+ /*
+ * Efficient stack verification using the PAGE_SHIFT bit implies
+ * an alignment of our allocation on the order of the size.
+ */
+ size = PAGE_SIZE * 2;
+ addr = ALIGN(__io_map_base, size);
+
+ ret = __pkvm_alloc_private_va_range(addr, size);
+ if (!ret) {
+ /*
+ * Since the stack grows downwards, map the stack to the page
+ * at the higher address and leave the lower guard page
+ * unbacked.
+ *
+ * Any valid stack address now has the PAGE_SHIFT bit as 1
+ * and addresses corresponding to the guard page have the
+ * PAGE_SHIFT bit as 0 - this is used for overflow detection.
+ */
+ ret = kvm_pgtable_hyp_map(&pkvm_pgtable, addr + PAGE_SIZE,
+ PAGE_SIZE, phys, PAGE_HYP);
+ if (ret)
+ __io_map_base = prev_base;
+ }
+ hyp_spin_unlock(&pkvm_pgd_lock);
+
+ *haddr = addr + size;
+
+ return ret;
+}
+
+static void *admit_host_page(void *arg)
+{
+ struct kvm_hyp_memcache *host_mc = arg;
+
+ if (!host_mc->nr_pages)
+ return NULL;
+
+ /*
+ * The host still owns the pages in its memcache, so we need to go
+ * through a full host-to-hyp donation cycle to change it. Fortunately,
+ * __pkvm_host_donate_hyp() takes care of races for us, so if it
+ * succeeds we're good to go.
+ */
+ if (__pkvm_host_donate_hyp(hyp_phys_to_pfn(host_mc->head), 1))
+ return NULL;
+
+ return pop_hyp_memcache(host_mc, hyp_phys_to_virt);
+}
+
+/* Refill our local memcache by popping pages from the one provided by the host. */
+int refill_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages,
+ struct kvm_hyp_memcache *host_mc)
+{
+ struct kvm_hyp_memcache tmp = *host_mc;
+ int ret;
+
+ ret = __topup_hyp_memcache(mc, min_pages, admit_host_page,
+ hyp_virt_to_phys, &tmp);
+ *host_mc = tmp;
+
+ return ret;
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/page_alloc.c b/arch/arm64/kvm/hyp/nvhe/page_alloc.c
new file mode 100644
index 000000000000..e691290d3765
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/page_alloc.c
@@ -0,0 +1,248 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 Google LLC
+ * Author: Quentin Perret <qperret@google.com>
+ */
+
+#include <asm/kvm_hyp.h>
+#include <nvhe/gfp.h>
+
+u64 __hyp_vmemmap;
+
+/*
+ * Index the hyp_vmemmap to find a potential buddy page, but make no assumption
+ * about its current state.
+ *
+ * Example buddy-tree for a 4-pages physically contiguous pool:
+ *
+ * o : Page 3
+ * /
+ * o-o : Page 2
+ * /
+ * / o : Page 1
+ * / /
+ * o---o-o : Page 0
+ * Order 2 1 0
+ *
+ * Example of requests on this pool:
+ * __find_buddy_nocheck(pool, page 0, order 0) => page 1
+ * __find_buddy_nocheck(pool, page 0, order 1) => page 2
+ * __find_buddy_nocheck(pool, page 1, order 0) => page 0
+ * __find_buddy_nocheck(pool, page 2, order 0) => page 3
+ */
+static struct hyp_page *__find_buddy_nocheck(struct hyp_pool *pool,
+ struct hyp_page *p,
+ unsigned short order)
+{
+ phys_addr_t addr = hyp_page_to_phys(p);
+
+ addr ^= (PAGE_SIZE << order);
+
+ /*
+ * Don't return a page outside the pool range -- it belongs to
+ * something else and may not be mapped in hyp_vmemmap.
+ */
+ if (addr < pool->range_start || addr >= pool->range_end)
+ return NULL;
+
+ return hyp_phys_to_page(addr);
+}
+
+/* Find a buddy page currently available for allocation */
+static struct hyp_page *__find_buddy_avail(struct hyp_pool *pool,
+ struct hyp_page *p,
+ unsigned short order)
+{
+ struct hyp_page *buddy = __find_buddy_nocheck(pool, p, order);
+
+ if (!buddy || buddy->order != order || buddy->refcount)
+ return NULL;
+
+ return buddy;
+
+}
+
+/*
+ * Pages that are available for allocation are tracked in free-lists, so we use
+ * the pages themselves to store the list nodes to avoid wasting space. As the
+ * allocator always returns zeroed pages (which are zeroed on the hyp_put_page()
+ * path to optimize allocation speed), we also need to clean-up the list node in
+ * each page when we take it out of the list.
+ */
+static inline void page_remove_from_list(struct hyp_page *p)
+{
+ struct list_head *node = hyp_page_to_virt(p);
+
+ __list_del_entry(node);
+ memset(node, 0, sizeof(*node));
+}
+
+static inline void page_add_to_list(struct hyp_page *p, struct list_head *head)
+{
+ struct list_head *node = hyp_page_to_virt(p);
+
+ INIT_LIST_HEAD(node);
+ list_add_tail(node, head);
+}
+
+static inline struct hyp_page *node_to_page(struct list_head *node)
+{
+ return hyp_virt_to_page(node);
+}
+
+static void __hyp_attach_page(struct hyp_pool *pool,
+ struct hyp_page *p)
+{
+ phys_addr_t phys = hyp_page_to_phys(p);
+ unsigned short order = p->order;
+ struct hyp_page *buddy;
+
+ memset(hyp_page_to_virt(p), 0, PAGE_SIZE << p->order);
+
+ /* Skip coalescing for 'external' pages being freed into the pool. */
+ if (phys < pool->range_start || phys >= pool->range_end)
+ goto insert;
+
+ /*
+ * Only the first struct hyp_page of a high-order page (otherwise known
+ * as the 'head') should have p->order set. The non-head pages should
+ * have p->order = HYP_NO_ORDER. Here @p may no longer be the head
+ * after coalescing, so make sure to mark it HYP_NO_ORDER proactively.
+ */
+ p->order = HYP_NO_ORDER;
+ for (; (order + 1) <= pool->max_order; order++) {
+ buddy = __find_buddy_avail(pool, p, order);
+ if (!buddy)
+ break;
+
+ /* Take the buddy out of its list, and coalesce with @p */
+ page_remove_from_list(buddy);
+ buddy->order = HYP_NO_ORDER;
+ p = min(p, buddy);
+ }
+
+insert:
+ /* Mark the new head, and insert it */
+ p->order = order;
+ page_add_to_list(p, &pool->free_area[order]);
+}
+
+static struct hyp_page *__hyp_extract_page(struct hyp_pool *pool,
+ struct hyp_page *p,
+ unsigned short order)
+{
+ struct hyp_page *buddy;
+
+ page_remove_from_list(p);
+ while (p->order > order) {
+ /*
+ * The buddy of order n - 1 currently has HYP_NO_ORDER as it
+ * is covered by a higher-level page (whose head is @p). Use
+ * __find_buddy_nocheck() to find it and inject it in the
+ * free_list[n - 1], effectively splitting @p in half.
+ */
+ p->order--;
+ buddy = __find_buddy_nocheck(pool, p, p->order);
+ buddy->order = p->order;
+ page_add_to_list(buddy, &pool->free_area[buddy->order]);
+ }
+
+ return p;
+}
+
+static void __hyp_put_page(struct hyp_pool *pool, struct hyp_page *p)
+{
+ if (hyp_page_ref_dec_and_test(p))
+ __hyp_attach_page(pool, p);
+}
+
+/*
+ * Changes to the buddy tree and page refcounts must be done with the hyp_pool
+ * lock held. If a refcount change requires an update to the buddy tree (e.g.
+ * hyp_put_page()), both operations must be done within the same critical
+ * section to guarantee transient states (e.g. a page with null refcount but
+ * not yet attached to a free list) can't be observed by well-behaved readers.
+ */
+void hyp_put_page(struct hyp_pool *pool, void *addr)
+{
+ struct hyp_page *p = hyp_virt_to_page(addr);
+
+ hyp_spin_lock(&pool->lock);
+ __hyp_put_page(pool, p);
+ hyp_spin_unlock(&pool->lock);
+}
+
+void hyp_get_page(struct hyp_pool *pool, void *addr)
+{
+ struct hyp_page *p = hyp_virt_to_page(addr);
+
+ hyp_spin_lock(&pool->lock);
+ hyp_page_ref_inc(p);
+ hyp_spin_unlock(&pool->lock);
+}
+
+void hyp_split_page(struct hyp_page *p)
+{
+ unsigned short order = p->order;
+ unsigned int i;
+
+ p->order = 0;
+ for (i = 1; i < (1 << order); i++) {
+ struct hyp_page *tail = p + i;
+
+ tail->order = 0;
+ hyp_set_page_refcounted(tail);
+ }
+}
+
+void *hyp_alloc_pages(struct hyp_pool *pool, unsigned short order)
+{
+ unsigned short i = order;
+ struct hyp_page *p;
+
+ hyp_spin_lock(&pool->lock);
+
+ /* Look for a high-enough-order page */
+ while (i <= pool->max_order && list_empty(&pool->free_area[i]))
+ i++;
+ if (i > pool->max_order) {
+ hyp_spin_unlock(&pool->lock);
+ return NULL;
+ }
+
+ /* Extract it from the tree at the right order */
+ p = node_to_page(pool->free_area[i].next);
+ p = __hyp_extract_page(pool, p, order);
+
+ hyp_set_page_refcounted(p);
+ hyp_spin_unlock(&pool->lock);
+
+ return hyp_page_to_virt(p);
+}
+
+int hyp_pool_init(struct hyp_pool *pool, u64 pfn, unsigned int nr_pages,
+ unsigned int reserved_pages)
+{
+ phys_addr_t phys = hyp_pfn_to_phys(pfn);
+ struct hyp_page *p;
+ int i;
+
+ hyp_spin_lock_init(&pool->lock);
+ pool->max_order = min(MAX_PAGE_ORDER,
+ get_order(nr_pages << PAGE_SHIFT));
+ for (i = 0; i <= pool->max_order; i++)
+ INIT_LIST_HEAD(&pool->free_area[i]);
+ pool->range_start = phys;
+ pool->range_end = phys + (nr_pages << PAGE_SHIFT);
+
+ /* Init the vmemmap portion */
+ p = hyp_phys_to_page(phys);
+ for (i = 0; i < nr_pages; i++)
+ hyp_set_page_refcounted(&p[i]);
+
+ /* Attach the unused pages to the buddy tree */
+ for (i = reserved_pages; i < nr_pages; i++)
+ __hyp_put_page(pool, &p[i]);
+
+ return 0;
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c
new file mode 100644
index 000000000000..26dd9a20ad6e
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c
@@ -0,0 +1,640 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2021 Google LLC
+ * Author: Fuad Tabba <tabba@google.com>
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/mm.h>
+#include <nvhe/fixed_config.h>
+#include <nvhe/mem_protect.h>
+#include <nvhe/memory.h>
+#include <nvhe/pkvm.h>
+#include <nvhe/trap_handler.h>
+
+/* Used by icache_is_aliasing(). */
+unsigned long __icache_flags;
+
+/* Used by kvm_get_vttbr(). */
+unsigned int kvm_arm_vmid_bits;
+
+/*
+ * Set trap register values based on features in ID_AA64PFR0.
+ */
+static void pvm_init_traps_aa64pfr0(struct kvm_vcpu *vcpu)
+{
+ const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64PFR0_EL1);
+ u64 hcr_set = HCR_RW;
+ u64 hcr_clear = 0;
+ u64 cptr_set = 0;
+ u64 cptr_clear = 0;
+
+ /* Protected KVM does not support AArch32 guests. */
+ BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL0),
+ PVM_ID_AA64PFR0_RESTRICT_UNSIGNED) != ID_AA64PFR0_EL1_ELx_64BIT_ONLY);
+ BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL1),
+ PVM_ID_AA64PFR0_RESTRICT_UNSIGNED) != ID_AA64PFR0_EL1_ELx_64BIT_ONLY);
+
+ /*
+ * Linux guests assume support for floating-point and Advanced SIMD. Do
+ * not change the trapping behavior for these from the KVM default.
+ */
+ BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_FP),
+ PVM_ID_AA64PFR0_ALLOW));
+ BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AdvSIMD),
+ PVM_ID_AA64PFR0_ALLOW));
+
+ if (has_hvhe())
+ hcr_set |= HCR_E2H;
+
+ /* Trap RAS unless all current versions are supported */
+ if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_RAS), feature_ids) <
+ ID_AA64PFR0_EL1_RAS_V1P1) {
+ hcr_set |= HCR_TERR | HCR_TEA;
+ hcr_clear |= HCR_FIEN;
+ }
+
+ /* Trap AMU */
+ if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AMU), feature_ids)) {
+ hcr_clear |= HCR_AMVOFFEN;
+ cptr_set |= CPTR_EL2_TAM;
+ }
+
+ /* Trap SVE */
+ if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE), feature_ids)) {
+ if (has_hvhe())
+ cptr_clear |= CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN;
+ else
+ cptr_set |= CPTR_EL2_TZ;
+ }
+
+ vcpu->arch.hcr_el2 |= hcr_set;
+ vcpu->arch.hcr_el2 &= ~hcr_clear;
+ vcpu->arch.cptr_el2 |= cptr_set;
+ vcpu->arch.cptr_el2 &= ~cptr_clear;
+}
+
+/*
+ * Set trap register values based on features in ID_AA64PFR1.
+ */
+static void pvm_init_traps_aa64pfr1(struct kvm_vcpu *vcpu)
+{
+ const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64PFR1_EL1);
+ u64 hcr_set = 0;
+ u64 hcr_clear = 0;
+
+ /* Memory Tagging: Trap and Treat as Untagged if not supported. */
+ if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE), feature_ids)) {
+ hcr_set |= HCR_TID5;
+ hcr_clear |= HCR_DCT | HCR_ATA;
+ }
+
+ vcpu->arch.hcr_el2 |= hcr_set;
+ vcpu->arch.hcr_el2 &= ~hcr_clear;
+}
+
+/*
+ * Set trap register values based on features in ID_AA64DFR0.
+ */
+static void pvm_init_traps_aa64dfr0(struct kvm_vcpu *vcpu)
+{
+ const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64DFR0_EL1);
+ u64 mdcr_set = 0;
+ u64 mdcr_clear = 0;
+ u64 cptr_set = 0;
+
+ /* Trap/constrain PMU */
+ if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), feature_ids)) {
+ mdcr_set |= MDCR_EL2_TPM | MDCR_EL2_TPMCR;
+ mdcr_clear |= MDCR_EL2_HPME | MDCR_EL2_MTPME |
+ MDCR_EL2_HPMN_MASK;
+ }
+
+ /* Trap Debug */
+ if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DebugVer), feature_ids))
+ mdcr_set |= MDCR_EL2_TDRA | MDCR_EL2_TDA | MDCR_EL2_TDE;
+
+ /* Trap OS Double Lock */
+ if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DoubleLock), feature_ids))
+ mdcr_set |= MDCR_EL2_TDOSA;
+
+ /* Trap SPE */
+ if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMSVer), feature_ids)) {
+ mdcr_set |= MDCR_EL2_TPMS;
+ mdcr_clear |= MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT;
+ }
+
+ /* Trap Trace Filter */
+ if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_TraceFilt), feature_ids))
+ mdcr_set |= MDCR_EL2_TTRF;
+
+ /* Trap Trace */
+ if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_TraceVer), feature_ids)) {
+ if (has_hvhe())
+ cptr_set |= CPACR_EL1_TTA;
+ else
+ cptr_set |= CPTR_EL2_TTA;
+ }
+
+ /* Trap External Trace */
+ if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_ExtTrcBuff), feature_ids))
+ mdcr_clear |= MDCR_EL2_E2TB_MASK << MDCR_EL2_E2TB_SHIFT;
+
+ vcpu->arch.mdcr_el2 |= mdcr_set;
+ vcpu->arch.mdcr_el2 &= ~mdcr_clear;
+ vcpu->arch.cptr_el2 |= cptr_set;
+}
+
+/*
+ * Set trap register values based on features in ID_AA64MMFR0.
+ */
+static void pvm_init_traps_aa64mmfr0(struct kvm_vcpu *vcpu)
+{
+ const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64MMFR0_EL1);
+ u64 mdcr_set = 0;
+
+ /* Trap Debug Communications Channel registers */
+ if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_FGT), feature_ids))
+ mdcr_set |= MDCR_EL2_TDCC;
+
+ vcpu->arch.mdcr_el2 |= mdcr_set;
+}
+
+/*
+ * Set trap register values based on features in ID_AA64MMFR1.
+ */
+static void pvm_init_traps_aa64mmfr1(struct kvm_vcpu *vcpu)
+{
+ const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64MMFR1_EL1);
+ u64 hcr_set = 0;
+
+ /* Trap LOR */
+ if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_LO), feature_ids))
+ hcr_set |= HCR_TLOR;
+
+ vcpu->arch.hcr_el2 |= hcr_set;
+}
+
+/*
+ * Set baseline trap register values.
+ */
+static void pvm_init_trap_regs(struct kvm_vcpu *vcpu)
+{
+ const u64 hcr_trap_feat_regs = HCR_TID3;
+ const u64 hcr_trap_impdef = HCR_TACR | HCR_TIDCP | HCR_TID1;
+
+ /*
+ * Always trap:
+ * - Feature id registers: to control features exposed to guests
+ * - Implementation-defined features
+ */
+ vcpu->arch.hcr_el2 |= hcr_trap_feat_regs | hcr_trap_impdef;
+
+ /* Clear res0 and set res1 bits to trap potential new features. */
+ vcpu->arch.hcr_el2 &= ~(HCR_RES0);
+ vcpu->arch.mdcr_el2 &= ~(MDCR_EL2_RES0);
+ if (!has_hvhe()) {
+ vcpu->arch.cptr_el2 |= CPTR_NVHE_EL2_RES1;
+ vcpu->arch.cptr_el2 &= ~(CPTR_NVHE_EL2_RES0);
+ }
+}
+
+/*
+ * Initialize trap register values for protected VMs.
+ */
+void __pkvm_vcpu_init_traps(struct kvm_vcpu *vcpu)
+{
+ pvm_init_trap_regs(vcpu);
+ pvm_init_traps_aa64pfr0(vcpu);
+ pvm_init_traps_aa64pfr1(vcpu);
+ pvm_init_traps_aa64dfr0(vcpu);
+ pvm_init_traps_aa64mmfr0(vcpu);
+ pvm_init_traps_aa64mmfr1(vcpu);
+}
+
+/*
+ * Start the VM table handle at the offset defined instead of at 0.
+ * Mainly for sanity checking and debugging.
+ */
+#define HANDLE_OFFSET 0x1000
+
+static unsigned int vm_handle_to_idx(pkvm_handle_t handle)
+{
+ return handle - HANDLE_OFFSET;
+}
+
+static pkvm_handle_t idx_to_vm_handle(unsigned int idx)
+{
+ return idx + HANDLE_OFFSET;
+}
+
+/*
+ * Spinlock for protecting state related to the VM table. Protects writes
+ * to 'vm_table' and 'nr_table_entries' as well as reads and writes to
+ * 'last_hyp_vcpu_lookup'.
+ */
+static DEFINE_HYP_SPINLOCK(vm_table_lock);
+
+/*
+ * The table of VM entries for protected VMs in hyp.
+ * Allocated at hyp initialization and setup.
+ */
+static struct pkvm_hyp_vm **vm_table;
+
+void pkvm_hyp_vm_table_init(void *tbl)
+{
+ WARN_ON(vm_table);
+ vm_table = tbl;
+}
+
+/*
+ * Return the hyp vm structure corresponding to the handle.
+ */
+static struct pkvm_hyp_vm *get_vm_by_handle(pkvm_handle_t handle)
+{
+ unsigned int idx = vm_handle_to_idx(handle);
+
+ if (unlikely(idx >= KVM_MAX_PVMS))
+ return NULL;
+
+ return vm_table[idx];
+}
+
+struct pkvm_hyp_vcpu *pkvm_load_hyp_vcpu(pkvm_handle_t handle,
+ unsigned int vcpu_idx)
+{
+ struct pkvm_hyp_vcpu *hyp_vcpu = NULL;
+ struct pkvm_hyp_vm *hyp_vm;
+
+ hyp_spin_lock(&vm_table_lock);
+ hyp_vm = get_vm_by_handle(handle);
+ if (!hyp_vm || hyp_vm->nr_vcpus <= vcpu_idx)
+ goto unlock;
+
+ hyp_vcpu = hyp_vm->vcpus[vcpu_idx];
+ hyp_page_ref_inc(hyp_virt_to_page(hyp_vm));
+unlock:
+ hyp_spin_unlock(&vm_table_lock);
+ return hyp_vcpu;
+}
+
+void pkvm_put_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu)
+{
+ struct pkvm_hyp_vm *hyp_vm = pkvm_hyp_vcpu_to_hyp_vm(hyp_vcpu);
+
+ hyp_spin_lock(&vm_table_lock);
+ hyp_page_ref_dec(hyp_virt_to_page(hyp_vm));
+ hyp_spin_unlock(&vm_table_lock);
+}
+
+static void unpin_host_vcpu(struct kvm_vcpu *host_vcpu)
+{
+ if (host_vcpu)
+ hyp_unpin_shared_mem(host_vcpu, host_vcpu + 1);
+}
+
+static void unpin_host_vcpus(struct pkvm_hyp_vcpu *hyp_vcpus[],
+ unsigned int nr_vcpus)
+{
+ int i;
+
+ for (i = 0; i < nr_vcpus; i++)
+ unpin_host_vcpu(hyp_vcpus[i]->host_vcpu);
+}
+
+static void init_pkvm_hyp_vm(struct kvm *host_kvm, struct pkvm_hyp_vm *hyp_vm,
+ unsigned int nr_vcpus)
+{
+ hyp_vm->host_kvm = host_kvm;
+ hyp_vm->kvm.created_vcpus = nr_vcpus;
+ hyp_vm->kvm.arch.mmu.vtcr = host_mmu.arch.mmu.vtcr;
+}
+
+static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu,
+ struct pkvm_hyp_vm *hyp_vm,
+ struct kvm_vcpu *host_vcpu,
+ unsigned int vcpu_idx)
+{
+ int ret = 0;
+
+ if (hyp_pin_shared_mem(host_vcpu, host_vcpu + 1))
+ return -EBUSY;
+
+ if (host_vcpu->vcpu_idx != vcpu_idx) {
+ ret = -EINVAL;
+ goto done;
+ }
+
+ hyp_vcpu->host_vcpu = host_vcpu;
+
+ hyp_vcpu->vcpu.kvm = &hyp_vm->kvm;
+ hyp_vcpu->vcpu.vcpu_id = READ_ONCE(host_vcpu->vcpu_id);
+ hyp_vcpu->vcpu.vcpu_idx = vcpu_idx;
+
+ hyp_vcpu->vcpu.arch.hw_mmu = &hyp_vm->kvm.arch.mmu;
+ hyp_vcpu->vcpu.arch.cflags = READ_ONCE(host_vcpu->arch.cflags);
+done:
+ if (ret)
+ unpin_host_vcpu(host_vcpu);
+ return ret;
+}
+
+static int find_free_vm_table_entry(struct kvm *host_kvm)
+{
+ int i;
+
+ for (i = 0; i < KVM_MAX_PVMS; ++i) {
+ if (!vm_table[i])
+ return i;
+ }
+
+ return -ENOMEM;
+}
+
+/*
+ * Allocate a VM table entry and insert a pointer to the new vm.
+ *
+ * Return a unique handle to the protected VM on success,
+ * negative error code on failure.
+ */
+static pkvm_handle_t insert_vm_table_entry(struct kvm *host_kvm,
+ struct pkvm_hyp_vm *hyp_vm)
+{
+ struct kvm_s2_mmu *mmu = &hyp_vm->kvm.arch.mmu;
+ int idx;
+
+ hyp_assert_lock_held(&vm_table_lock);
+
+ /*
+ * Initializing protected state might have failed, yet a malicious
+ * host could trigger this function. Thus, ensure that 'vm_table'
+ * exists.
+ */
+ if (unlikely(!vm_table))
+ return -EINVAL;
+
+ idx = find_free_vm_table_entry(host_kvm);
+ if (idx < 0)
+ return idx;
+
+ hyp_vm->kvm.arch.pkvm.handle = idx_to_vm_handle(idx);
+
+ /* VMID 0 is reserved for the host */
+ atomic64_set(&mmu->vmid.id, idx + 1);
+
+ mmu->arch = &hyp_vm->kvm.arch;
+ mmu->pgt = &hyp_vm->pgt;
+
+ vm_table[idx] = hyp_vm;
+ return hyp_vm->kvm.arch.pkvm.handle;
+}
+
+/*
+ * Deallocate and remove the VM table entry corresponding to the handle.
+ */
+static void remove_vm_table_entry(pkvm_handle_t handle)
+{
+ hyp_assert_lock_held(&vm_table_lock);
+ vm_table[vm_handle_to_idx(handle)] = NULL;
+}
+
+static size_t pkvm_get_hyp_vm_size(unsigned int nr_vcpus)
+{
+ return size_add(sizeof(struct pkvm_hyp_vm),
+ size_mul(sizeof(struct pkvm_hyp_vcpu *), nr_vcpus));
+}
+
+static void *map_donated_memory_noclear(unsigned long host_va, size_t size)
+{
+ void *va = (void *)kern_hyp_va(host_va);
+
+ if (!PAGE_ALIGNED(va))
+ return NULL;
+
+ if (__pkvm_host_donate_hyp(hyp_virt_to_pfn(va),
+ PAGE_ALIGN(size) >> PAGE_SHIFT))
+ return NULL;
+
+ return va;
+}
+
+static void *map_donated_memory(unsigned long host_va, size_t size)
+{
+ void *va = map_donated_memory_noclear(host_va, size);
+
+ if (va)
+ memset(va, 0, size);
+
+ return va;
+}
+
+static void __unmap_donated_memory(void *va, size_t size)
+{
+ WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(va),
+ PAGE_ALIGN(size) >> PAGE_SHIFT));
+}
+
+static void unmap_donated_memory(void *va, size_t size)
+{
+ if (!va)
+ return;
+
+ memset(va, 0, size);
+ __unmap_donated_memory(va, size);
+}
+
+static void unmap_donated_memory_noclear(void *va, size_t size)
+{
+ if (!va)
+ return;
+
+ __unmap_donated_memory(va, size);
+}
+
+/*
+ * Initialize the hypervisor copy of the protected VM state using the
+ * memory donated by the host.
+ *
+ * Unmaps the donated memory from the host at stage 2.
+ *
+ * host_kvm: A pointer to the host's struct kvm.
+ * vm_hva: The host va of the area being donated for the VM state.
+ * Must be page aligned.
+ * pgd_hva: The host va of the area being donated for the stage-2 PGD for
+ * the VM. Must be page aligned. Its size is implied by the VM's
+ * VTCR.
+ *
+ * Return a unique handle to the protected VM on success,
+ * negative error code on failure.
+ */
+int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva,
+ unsigned long pgd_hva)
+{
+ struct pkvm_hyp_vm *hyp_vm = NULL;
+ size_t vm_size, pgd_size;
+ unsigned int nr_vcpus;
+ void *pgd = NULL;
+ int ret;
+
+ ret = hyp_pin_shared_mem(host_kvm, host_kvm + 1);
+ if (ret)
+ return ret;
+
+ nr_vcpus = READ_ONCE(host_kvm->created_vcpus);
+ if (nr_vcpus < 1) {
+ ret = -EINVAL;
+ goto err_unpin_kvm;
+ }
+
+ vm_size = pkvm_get_hyp_vm_size(nr_vcpus);
+ pgd_size = kvm_pgtable_stage2_pgd_size(host_mmu.arch.mmu.vtcr);
+
+ ret = -ENOMEM;
+
+ hyp_vm = map_donated_memory(vm_hva, vm_size);
+ if (!hyp_vm)
+ goto err_remove_mappings;
+
+ pgd = map_donated_memory_noclear(pgd_hva, pgd_size);
+ if (!pgd)
+ goto err_remove_mappings;
+
+ init_pkvm_hyp_vm(host_kvm, hyp_vm, nr_vcpus);
+
+ hyp_spin_lock(&vm_table_lock);
+ ret = insert_vm_table_entry(host_kvm, hyp_vm);
+ if (ret < 0)
+ goto err_unlock;
+
+ ret = kvm_guest_prepare_stage2(hyp_vm, pgd);
+ if (ret)
+ goto err_remove_vm_table_entry;
+ hyp_spin_unlock(&vm_table_lock);
+
+ return hyp_vm->kvm.arch.pkvm.handle;
+
+err_remove_vm_table_entry:
+ remove_vm_table_entry(hyp_vm->kvm.arch.pkvm.handle);
+err_unlock:
+ hyp_spin_unlock(&vm_table_lock);
+err_remove_mappings:
+ unmap_donated_memory(hyp_vm, vm_size);
+ unmap_donated_memory(pgd, pgd_size);
+err_unpin_kvm:
+ hyp_unpin_shared_mem(host_kvm, host_kvm + 1);
+ return ret;
+}
+
+/*
+ * Initialize the hypervisor copy of the protected vCPU state using the
+ * memory donated by the host.
+ *
+ * handle: The handle for the protected vm.
+ * host_vcpu: A pointer to the corresponding host vcpu.
+ * vcpu_hva: The host va of the area being donated for the vcpu state.
+ * Must be page aligned. The size of the area must be equal to
+ * the page-aligned size of 'struct pkvm_hyp_vcpu'.
+ * Return 0 on success, negative error code on failure.
+ */
+int __pkvm_init_vcpu(pkvm_handle_t handle, struct kvm_vcpu *host_vcpu,
+ unsigned long vcpu_hva)
+{
+ struct pkvm_hyp_vcpu *hyp_vcpu;
+ struct pkvm_hyp_vm *hyp_vm;
+ unsigned int idx;
+ int ret;
+
+ hyp_vcpu = map_donated_memory(vcpu_hva, sizeof(*hyp_vcpu));
+ if (!hyp_vcpu)
+ return -ENOMEM;
+
+ hyp_spin_lock(&vm_table_lock);
+
+ hyp_vm = get_vm_by_handle(handle);
+ if (!hyp_vm) {
+ ret = -ENOENT;
+ goto unlock;
+ }
+
+ idx = hyp_vm->nr_vcpus;
+ if (idx >= hyp_vm->kvm.created_vcpus) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+
+ ret = init_pkvm_hyp_vcpu(hyp_vcpu, hyp_vm, host_vcpu, idx);
+ if (ret)
+ goto unlock;
+
+ hyp_vm->vcpus[idx] = hyp_vcpu;
+ hyp_vm->nr_vcpus++;
+unlock:
+ hyp_spin_unlock(&vm_table_lock);
+
+ if (ret)
+ unmap_donated_memory(hyp_vcpu, sizeof(*hyp_vcpu));
+
+ return ret;
+}
+
+static void
+teardown_donated_memory(struct kvm_hyp_memcache *mc, void *addr, size_t size)
+{
+ size = PAGE_ALIGN(size);
+ memset(addr, 0, size);
+
+ for (void *start = addr; start < addr + size; start += PAGE_SIZE)
+ push_hyp_memcache(mc, start, hyp_virt_to_phys);
+
+ unmap_donated_memory_noclear(addr, size);
+}
+
+int __pkvm_teardown_vm(pkvm_handle_t handle)
+{
+ struct kvm_hyp_memcache *mc;
+ struct pkvm_hyp_vm *hyp_vm;
+ struct kvm *host_kvm;
+ unsigned int idx;
+ size_t vm_size;
+ int err;
+
+ hyp_spin_lock(&vm_table_lock);
+ hyp_vm = get_vm_by_handle(handle);
+ if (!hyp_vm) {
+ err = -ENOENT;
+ goto err_unlock;
+ }
+
+ if (WARN_ON(hyp_page_count(hyp_vm))) {
+ err = -EBUSY;
+ goto err_unlock;
+ }
+
+ host_kvm = hyp_vm->host_kvm;
+
+ /* Ensure the VMID is clean before it can be reallocated */
+ __kvm_tlb_flush_vmid(&hyp_vm->kvm.arch.mmu);
+ remove_vm_table_entry(handle);
+ hyp_spin_unlock(&vm_table_lock);
+
+ /* Reclaim guest pages (including page-table pages) */
+ mc = &host_kvm->arch.pkvm.teardown_mc;
+ reclaim_guest_pages(hyp_vm, mc);
+ unpin_host_vcpus(hyp_vm->vcpus, hyp_vm->nr_vcpus);
+
+ /* Push the metadata pages to the teardown memcache */
+ for (idx = 0; idx < hyp_vm->nr_vcpus; ++idx) {
+ struct pkvm_hyp_vcpu *hyp_vcpu = hyp_vm->vcpus[idx];
+
+ teardown_donated_memory(mc, hyp_vcpu, sizeof(*hyp_vcpu));
+ }
+
+ vm_size = pkvm_get_hyp_vm_size(hyp_vm->kvm.created_vcpus);
+ teardown_donated_memory(mc, hyp_vm, vm_size);
+ hyp_unpin_shared_mem(host_kvm, host_kvm + 1);
+ return 0;
+
+err_unlock:
+ hyp_spin_unlock(&vm_table_lock);
+ return err;
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/psci-relay.c b/arch/arm64/kvm/hyp/nvhe/psci-relay.c
new file mode 100644
index 000000000000..d57bcb6ab94d
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/psci-relay.c
@@ -0,0 +1,303 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 - Google LLC
+ * Author: David Brazdil <dbrazdil@google.com>
+ */
+
+#include <asm/kvm_asm.h>
+#include <asm/kvm_hyp.h>
+#include <asm/kvm_mmu.h>
+#include <linux/arm-smccc.h>
+#include <linux/kvm_host.h>
+#include <uapi/linux/psci.h>
+
+#include <nvhe/memory.h>
+#include <nvhe/trap_handler.h>
+
+void kvm_hyp_cpu_entry(unsigned long r0);
+void kvm_hyp_cpu_resume(unsigned long r0);
+
+void __noreturn __host_enter(struct kvm_cpu_context *host_ctxt);
+
+/* Config options set by the host. */
+struct kvm_host_psci_config __ro_after_init kvm_host_psci_config;
+
+#define INVALID_CPU_ID UINT_MAX
+
+struct psci_boot_args {
+ atomic_t lock;
+ unsigned long pc;
+ unsigned long r0;
+};
+
+#define PSCI_BOOT_ARGS_UNLOCKED 0
+#define PSCI_BOOT_ARGS_LOCKED 1
+
+#define PSCI_BOOT_ARGS_INIT \
+ ((struct psci_boot_args){ \
+ .lock = ATOMIC_INIT(PSCI_BOOT_ARGS_UNLOCKED), \
+ })
+
+static DEFINE_PER_CPU(struct psci_boot_args, cpu_on_args) = PSCI_BOOT_ARGS_INIT;
+static DEFINE_PER_CPU(struct psci_boot_args, suspend_args) = PSCI_BOOT_ARGS_INIT;
+
+#define is_psci_0_1(what, func_id) \
+ (kvm_host_psci_config.psci_0_1_ ## what ## _implemented && \
+ (func_id) == kvm_host_psci_config.function_ids_0_1.what)
+
+static bool is_psci_0_1_call(u64 func_id)
+{
+ return (is_psci_0_1(cpu_suspend, func_id) ||
+ is_psci_0_1(cpu_on, func_id) ||
+ is_psci_0_1(cpu_off, func_id) ||
+ is_psci_0_1(migrate, func_id));
+}
+
+static bool is_psci_0_2_call(u64 func_id)
+{
+ /* SMCCC reserves IDs 0x00-1F with the given 32/64-bit base for PSCI. */
+ return (PSCI_0_2_FN(0) <= func_id && func_id <= PSCI_0_2_FN(31)) ||
+ (PSCI_0_2_FN64(0) <= func_id && func_id <= PSCI_0_2_FN64(31));
+}
+
+static unsigned long psci_call(unsigned long fn, unsigned long arg0,
+ unsigned long arg1, unsigned long arg2)
+{
+ struct arm_smccc_res res;
+
+ arm_smccc_1_1_smc(fn, arg0, arg1, arg2, &res);
+ return res.a0;
+}
+
+static unsigned long psci_forward(struct kvm_cpu_context *host_ctxt)
+{
+ return psci_call(cpu_reg(host_ctxt, 0), cpu_reg(host_ctxt, 1),
+ cpu_reg(host_ctxt, 2), cpu_reg(host_ctxt, 3));
+}
+
+static unsigned int find_cpu_id(u64 mpidr)
+{
+ unsigned int i;
+
+ /* Reject invalid MPIDRs */
+ if (mpidr & ~MPIDR_HWID_BITMASK)
+ return INVALID_CPU_ID;
+
+ for (i = 0; i < NR_CPUS; i++) {
+ if (cpu_logical_map(i) == mpidr)
+ return i;
+ }
+
+ return INVALID_CPU_ID;
+}
+
+static __always_inline bool try_acquire_boot_args(struct psci_boot_args *args)
+{
+ return atomic_cmpxchg_acquire(&args->lock,
+ PSCI_BOOT_ARGS_UNLOCKED,
+ PSCI_BOOT_ARGS_LOCKED) ==
+ PSCI_BOOT_ARGS_UNLOCKED;
+}
+
+static __always_inline void release_boot_args(struct psci_boot_args *args)
+{
+ atomic_set_release(&args->lock, PSCI_BOOT_ARGS_UNLOCKED);
+}
+
+static int psci_cpu_on(u64 func_id, struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(u64, mpidr, host_ctxt, 1);
+ DECLARE_REG(unsigned long, pc, host_ctxt, 2);
+ DECLARE_REG(unsigned long, r0, host_ctxt, 3);
+
+ unsigned int cpu_id;
+ struct psci_boot_args *boot_args;
+ struct kvm_nvhe_init_params *init_params;
+ int ret;
+
+ /*
+ * Find the logical CPU ID for the given MPIDR. The search set is
+ * the set of CPUs that were online at the point of KVM initialization.
+ * Booting other CPUs is rejected because their cpufeatures were not
+ * checked against the finalized capabilities. This could be relaxed
+ * by doing the feature checks in hyp.
+ */
+ cpu_id = find_cpu_id(mpidr);
+ if (cpu_id == INVALID_CPU_ID)
+ return PSCI_RET_INVALID_PARAMS;
+
+ boot_args = per_cpu_ptr(&cpu_on_args, cpu_id);
+ init_params = per_cpu_ptr(&kvm_init_params, cpu_id);
+
+ /* Check if the target CPU is already being booted. */
+ if (!try_acquire_boot_args(boot_args))
+ return PSCI_RET_ALREADY_ON;
+
+ boot_args->pc = pc;
+ boot_args->r0 = r0;
+ wmb();
+
+ ret = psci_call(func_id, mpidr,
+ __hyp_pa(&kvm_hyp_cpu_entry),
+ __hyp_pa(init_params));
+
+ /* If successful, the lock will be released by the target CPU. */
+ if (ret != PSCI_RET_SUCCESS)
+ release_boot_args(boot_args);
+
+ return ret;
+}
+
+static int psci_cpu_suspend(u64 func_id, struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(u64, power_state, host_ctxt, 1);
+ DECLARE_REG(unsigned long, pc, host_ctxt, 2);
+ DECLARE_REG(unsigned long, r0, host_ctxt, 3);
+
+ struct psci_boot_args *boot_args;
+ struct kvm_nvhe_init_params *init_params;
+
+ boot_args = this_cpu_ptr(&suspend_args);
+ init_params = this_cpu_ptr(&kvm_init_params);
+
+ /*
+ * No need to acquire a lock before writing to boot_args because a core
+ * can only suspend itself. Racy CPU_ON calls use a separate struct.
+ */
+ boot_args->pc = pc;
+ boot_args->r0 = r0;
+
+ /*
+ * Will either return if shallow sleep state, or wake up into the entry
+ * point if it is a deep sleep state.
+ */
+ return psci_call(func_id, power_state,
+ __hyp_pa(&kvm_hyp_cpu_resume),
+ __hyp_pa(init_params));
+}
+
+static int psci_system_suspend(u64 func_id, struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(unsigned long, pc, host_ctxt, 1);
+ DECLARE_REG(unsigned long, r0, host_ctxt, 2);
+
+ struct psci_boot_args *boot_args;
+ struct kvm_nvhe_init_params *init_params;
+
+ boot_args = this_cpu_ptr(&suspend_args);
+ init_params = this_cpu_ptr(&kvm_init_params);
+
+ /*
+ * No need to acquire a lock before writing to boot_args because a core
+ * can only suspend itself. Racy CPU_ON calls use a separate struct.
+ */
+ boot_args->pc = pc;
+ boot_args->r0 = r0;
+
+ /* Will only return on error. */
+ return psci_call(func_id,
+ __hyp_pa(&kvm_hyp_cpu_resume),
+ __hyp_pa(init_params), 0);
+}
+
+asmlinkage void __noreturn __kvm_host_psci_cpu_entry(bool is_cpu_on)
+{
+ struct psci_boot_args *boot_args;
+ struct kvm_cpu_context *host_ctxt;
+
+ host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
+
+ if (is_cpu_on)
+ boot_args = this_cpu_ptr(&cpu_on_args);
+ else
+ boot_args = this_cpu_ptr(&suspend_args);
+
+ cpu_reg(host_ctxt, 0) = boot_args->r0;
+ write_sysreg_el2(boot_args->pc, SYS_ELR);
+
+ if (is_cpu_on)
+ release_boot_args(boot_args);
+
+ __host_enter(host_ctxt);
+}
+
+static unsigned long psci_0_1_handler(u64 func_id, struct kvm_cpu_context *host_ctxt)
+{
+ if (is_psci_0_1(cpu_off, func_id) || is_psci_0_1(migrate, func_id))
+ return psci_forward(host_ctxt);
+ if (is_psci_0_1(cpu_on, func_id))
+ return psci_cpu_on(func_id, host_ctxt);
+ if (is_psci_0_1(cpu_suspend, func_id))
+ return psci_cpu_suspend(func_id, host_ctxt);
+
+ return PSCI_RET_NOT_SUPPORTED;
+}
+
+static unsigned long psci_0_2_handler(u64 func_id, struct kvm_cpu_context *host_ctxt)
+{
+ switch (func_id) {
+ case PSCI_0_2_FN_PSCI_VERSION:
+ case PSCI_0_2_FN_CPU_OFF:
+ case PSCI_0_2_FN64_AFFINITY_INFO:
+ case PSCI_0_2_FN64_MIGRATE:
+ case PSCI_0_2_FN_MIGRATE_INFO_TYPE:
+ case PSCI_0_2_FN64_MIGRATE_INFO_UP_CPU:
+ return psci_forward(host_ctxt);
+ /*
+ * SYSTEM_OFF/RESET should not return according to the spec.
+ * Allow it so as to stay robust to broken firmware.
+ */
+ case PSCI_0_2_FN_SYSTEM_OFF:
+ case PSCI_0_2_FN_SYSTEM_RESET:
+ return psci_forward(host_ctxt);
+ case PSCI_0_2_FN64_CPU_SUSPEND:
+ return psci_cpu_suspend(func_id, host_ctxt);
+ case PSCI_0_2_FN64_CPU_ON:
+ return psci_cpu_on(func_id, host_ctxt);
+ default:
+ return PSCI_RET_NOT_SUPPORTED;
+ }
+}
+
+static unsigned long psci_1_0_handler(u64 func_id, struct kvm_cpu_context *host_ctxt)
+{
+ switch (func_id) {
+ case PSCI_1_0_FN_PSCI_FEATURES:
+ case PSCI_1_0_FN_SET_SUSPEND_MODE:
+ case PSCI_1_1_FN64_SYSTEM_RESET2:
+ return psci_forward(host_ctxt);
+ case PSCI_1_0_FN64_SYSTEM_SUSPEND:
+ return psci_system_suspend(func_id, host_ctxt);
+ default:
+ return psci_0_2_handler(func_id, host_ctxt);
+ }
+}
+
+bool kvm_host_psci_handler(struct kvm_cpu_context *host_ctxt, u32 func_id)
+{
+ unsigned long ret;
+
+ switch (kvm_host_psci_config.version) {
+ case PSCI_VERSION(0, 1):
+ if (!is_psci_0_1_call(func_id))
+ return false;
+ ret = psci_0_1_handler(func_id, host_ctxt);
+ break;
+ case PSCI_VERSION(0, 2):
+ if (!is_psci_0_2_call(func_id))
+ return false;
+ ret = psci_0_2_handler(func_id, host_ctxt);
+ break;
+ default:
+ if (!is_psci_0_2_call(func_id))
+ return false;
+ ret = psci_1_0_handler(func_id, host_ctxt);
+ break;
+ }
+
+ cpu_reg(host_ctxt, 0) = ret;
+ cpu_reg(host_ctxt, 1) = 0;
+ cpu_reg(host_ctxt, 2) = 0;
+ cpu_reg(host_ctxt, 3) = 0;
+ return true;
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c
new file mode 100644
index 000000000000..bc58d1b515af
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/setup.c
@@ -0,0 +1,346 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 Google LLC
+ * Author: Quentin Perret <qperret@google.com>
+ */
+
+#include <linux/kvm_host.h>
+#include <asm/kvm_hyp.h>
+#include <asm/kvm_mmu.h>
+#include <asm/kvm_pgtable.h>
+#include <asm/kvm_pkvm.h>
+
+#include <nvhe/early_alloc.h>
+#include <nvhe/ffa.h>
+#include <nvhe/fixed_config.h>
+#include <nvhe/gfp.h>
+#include <nvhe/memory.h>
+#include <nvhe/mem_protect.h>
+#include <nvhe/mm.h>
+#include <nvhe/pkvm.h>
+#include <nvhe/trap_handler.h>
+
+unsigned long hyp_nr_cpus;
+
+#define hyp_percpu_size ((unsigned long)__per_cpu_end - \
+ (unsigned long)__per_cpu_start)
+
+static void *vmemmap_base;
+static void *vm_table_base;
+static void *hyp_pgt_base;
+static void *host_s2_pgt_base;
+static void *ffa_proxy_pages;
+static struct kvm_pgtable_mm_ops pkvm_pgtable_mm_ops;
+static struct hyp_pool hpool;
+
+static int divide_memory_pool(void *virt, unsigned long size)
+{
+ unsigned long nr_pages;
+
+ hyp_early_alloc_init(virt, size);
+
+ nr_pages = hyp_vmemmap_pages(sizeof(struct hyp_page));
+ vmemmap_base = hyp_early_alloc_contig(nr_pages);
+ if (!vmemmap_base)
+ return -ENOMEM;
+
+ nr_pages = hyp_vm_table_pages();
+ vm_table_base = hyp_early_alloc_contig(nr_pages);
+ if (!vm_table_base)
+ return -ENOMEM;
+
+ nr_pages = hyp_s1_pgtable_pages();
+ hyp_pgt_base = hyp_early_alloc_contig(nr_pages);
+ if (!hyp_pgt_base)
+ return -ENOMEM;
+
+ nr_pages = host_s2_pgtable_pages();
+ host_s2_pgt_base = hyp_early_alloc_contig(nr_pages);
+ if (!host_s2_pgt_base)
+ return -ENOMEM;
+
+ nr_pages = hyp_ffa_proxy_pages();
+ ffa_proxy_pages = hyp_early_alloc_contig(nr_pages);
+ if (!ffa_proxy_pages)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size,
+ unsigned long *per_cpu_base,
+ u32 hyp_va_bits)
+{
+ void *start, *end, *virt = hyp_phys_to_virt(phys);
+ unsigned long pgt_size = hyp_s1_pgtable_pages() << PAGE_SHIFT;
+ enum kvm_pgtable_prot prot;
+ int ret, i;
+
+ /* Recreate the hyp page-table using the early page allocator */
+ hyp_early_alloc_init(hyp_pgt_base, pgt_size);
+ ret = kvm_pgtable_hyp_init(&pkvm_pgtable, hyp_va_bits,
+ &hyp_early_alloc_mm_ops);
+ if (ret)
+ return ret;
+
+ ret = hyp_create_idmap(hyp_va_bits);
+ if (ret)
+ return ret;
+
+ ret = hyp_map_vectors();
+ if (ret)
+ return ret;
+
+ ret = hyp_back_vmemmap(hyp_virt_to_phys(vmemmap_base));
+ if (ret)
+ return ret;
+
+ ret = pkvm_create_mappings(__hyp_text_start, __hyp_text_end, PAGE_HYP_EXEC);
+ if (ret)
+ return ret;
+
+ ret = pkvm_create_mappings(__hyp_rodata_start, __hyp_rodata_end, PAGE_HYP_RO);
+ if (ret)
+ return ret;
+
+ ret = pkvm_create_mappings(__hyp_bss_start, __hyp_bss_end, PAGE_HYP);
+ if (ret)
+ return ret;
+
+ ret = pkvm_create_mappings(virt, virt + size, PAGE_HYP);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < hyp_nr_cpus; i++) {
+ struct kvm_nvhe_init_params *params = per_cpu_ptr(&kvm_init_params, i);
+
+ start = (void *)kern_hyp_va(per_cpu_base[i]);
+ end = start + PAGE_ALIGN(hyp_percpu_size);
+ ret = pkvm_create_mappings(start, end, PAGE_HYP);
+ if (ret)
+ return ret;
+
+ ret = pkvm_create_stack(params->stack_pa, &params->stack_hyp_va);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * Map the host sections RO in the hypervisor, but transfer the
+ * ownership from the host to the hypervisor itself to make sure they
+ * can't be donated or shared with another entity.
+ *
+ * The ownership transition requires matching changes in the host
+ * stage-2. This will be done later (see finalize_host_mappings()) once
+ * the hyp_vmemmap is addressable.
+ */
+ prot = pkvm_mkstate(PAGE_HYP_RO, PKVM_PAGE_SHARED_OWNED);
+ ret = pkvm_create_mappings(&kvm_vgic_global_state,
+ &kvm_vgic_global_state + 1, prot);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
+static void update_nvhe_init_params(void)
+{
+ struct kvm_nvhe_init_params *params;
+ unsigned long i;
+
+ for (i = 0; i < hyp_nr_cpus; i++) {
+ params = per_cpu_ptr(&kvm_init_params, i);
+ params->pgd_pa = __hyp_pa(pkvm_pgtable.pgd);
+ dcache_clean_inval_poc((unsigned long)params,
+ (unsigned long)params + sizeof(*params));
+ }
+}
+
+static void *hyp_zalloc_hyp_page(void *arg)
+{
+ return hyp_alloc_pages(&hpool, 0);
+}
+
+static void hpool_get_page(void *addr)
+{
+ hyp_get_page(&hpool, addr);
+}
+
+static void hpool_put_page(void *addr)
+{
+ hyp_put_page(&hpool, addr);
+}
+
+static int fix_host_ownership_walker(const struct kvm_pgtable_visit_ctx *ctx,
+ enum kvm_pgtable_walk_flags visit)
+{
+ enum kvm_pgtable_prot prot;
+ enum pkvm_page_state state;
+ phys_addr_t phys;
+
+ if (!kvm_pte_valid(ctx->old))
+ return 0;
+
+ if (ctx->level != KVM_PGTABLE_LAST_LEVEL)
+ return -EINVAL;
+
+ phys = kvm_pte_to_phys(ctx->old);
+ if (!addr_is_memory(phys))
+ return -EINVAL;
+
+ /*
+ * Adjust the host stage-2 mappings to match the ownership attributes
+ * configured in the hypervisor stage-1.
+ */
+ state = pkvm_getstate(kvm_pgtable_hyp_pte_prot(ctx->old));
+ switch (state) {
+ case PKVM_PAGE_OWNED:
+ return host_stage2_set_owner_locked(phys, PAGE_SIZE, PKVM_ID_HYP);
+ case PKVM_PAGE_SHARED_OWNED:
+ prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, PKVM_PAGE_SHARED_BORROWED);
+ break;
+ case PKVM_PAGE_SHARED_BORROWED:
+ prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, PKVM_PAGE_SHARED_OWNED);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return host_stage2_idmap_locked(phys, PAGE_SIZE, prot);
+}
+
+static int fix_hyp_pgtable_refcnt_walker(const struct kvm_pgtable_visit_ctx *ctx,
+ enum kvm_pgtable_walk_flags visit)
+{
+ /*
+ * Fix-up the refcount for the page-table pages as the early allocator
+ * was unable to access the hyp_vmemmap and so the buddy allocator has
+ * initialised the refcount to '1'.
+ */
+ if (kvm_pte_valid(ctx->old))
+ ctx->mm_ops->get_page(ctx->ptep);
+
+ return 0;
+}
+
+static int fix_host_ownership(void)
+{
+ struct kvm_pgtable_walker walker = {
+ .cb = fix_host_ownership_walker,
+ .flags = KVM_PGTABLE_WALK_LEAF,
+ };
+ int i, ret;
+
+ for (i = 0; i < hyp_memblock_nr; i++) {
+ struct memblock_region *reg = &hyp_memory[i];
+ u64 start = (u64)hyp_phys_to_virt(reg->base);
+
+ ret = kvm_pgtable_walk(&pkvm_pgtable, start, reg->size, &walker);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int fix_hyp_pgtable_refcnt(void)
+{
+ struct kvm_pgtable_walker walker = {
+ .cb = fix_hyp_pgtable_refcnt_walker,
+ .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
+ .arg = pkvm_pgtable.mm_ops,
+ };
+
+ return kvm_pgtable_walk(&pkvm_pgtable, 0, BIT(pkvm_pgtable.ia_bits),
+ &walker);
+}
+
+void __noreturn __pkvm_init_finalise(void)
+{
+ struct kvm_host_data *host_data = this_cpu_ptr(&kvm_host_data);
+ struct kvm_cpu_context *host_ctxt = &host_data->host_ctxt;
+ unsigned long nr_pages, reserved_pages, pfn;
+ int ret;
+
+ /* Now that the vmemmap is backed, install the full-fledged allocator */
+ pfn = hyp_virt_to_pfn(hyp_pgt_base);
+ nr_pages = hyp_s1_pgtable_pages();
+ reserved_pages = hyp_early_alloc_nr_used_pages();
+ ret = hyp_pool_init(&hpool, pfn, nr_pages, reserved_pages);
+ if (ret)
+ goto out;
+
+ ret = kvm_host_prepare_stage2(host_s2_pgt_base);
+ if (ret)
+ goto out;
+
+ pkvm_pgtable_mm_ops = (struct kvm_pgtable_mm_ops) {
+ .zalloc_page = hyp_zalloc_hyp_page,
+ .phys_to_virt = hyp_phys_to_virt,
+ .virt_to_phys = hyp_virt_to_phys,
+ .get_page = hpool_get_page,
+ .put_page = hpool_put_page,
+ .page_count = hyp_page_count,
+ };
+ pkvm_pgtable.mm_ops = &pkvm_pgtable_mm_ops;
+
+ ret = fix_host_ownership();
+ if (ret)
+ goto out;
+
+ ret = fix_hyp_pgtable_refcnt();
+ if (ret)
+ goto out;
+
+ ret = hyp_create_pcpu_fixmap();
+ if (ret)
+ goto out;
+
+ ret = hyp_ffa_init(ffa_proxy_pages);
+ if (ret)
+ goto out;
+
+ pkvm_hyp_vm_table_init(vm_table_base);
+out:
+ /*
+ * We tail-called to here from handle___pkvm_init() and will not return,
+ * so make sure to propagate the return value to the host.
+ */
+ cpu_reg(host_ctxt, 1) = ret;
+
+ __host_enter(host_ctxt);
+}
+
+int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long nr_cpus,
+ unsigned long *per_cpu_base, u32 hyp_va_bits)
+{
+ struct kvm_nvhe_init_params *params;
+ void *virt = hyp_phys_to_virt(phys);
+ void (*fn)(phys_addr_t params_pa, void *finalize_fn_va);
+ int ret;
+
+ BUG_ON(kvm_check_pvm_sysreg_table());
+
+ if (!PAGE_ALIGNED(phys) || !PAGE_ALIGNED(size))
+ return -EINVAL;
+
+ hyp_spin_lock_init(&pkvm_pgd_lock);
+ hyp_nr_cpus = nr_cpus;
+
+ ret = divide_memory_pool(virt, size);
+ if (ret)
+ return ret;
+
+ ret = recreate_hyp_mappings(phys, size, per_cpu_base, hyp_va_bits);
+ if (ret)
+ return ret;
+
+ update_nvhe_init_params();
+
+ /* Jump in the idmap page to switch to the new page-tables */
+ params = this_cpu_ptr(&kvm_init_params);
+ fn = (typeof(fn))__hyp_pa(__pkvm_init_switch_pgd);
+ fn(__hyp_pa(params), __pkvm_init_finalise);
+
+ unreachable();
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/stacktrace.c b/arch/arm64/kvm/hyp/nvhe/stacktrace.c
new file mode 100644
index 000000000000..ed6b58b19cfa
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/stacktrace.c
@@ -0,0 +1,158 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * KVM nVHE hypervisor stack tracing support.
+ *
+ * Copyright (C) 2022 Google LLC
+ */
+#include <asm/kvm_asm.h>
+#include <asm/kvm_hyp.h>
+#include <asm/memory.h>
+#include <asm/percpu.h>
+
+DEFINE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack)
+ __aligned(16);
+
+DEFINE_PER_CPU(struct kvm_nvhe_stacktrace_info, kvm_stacktrace_info);
+
+/*
+ * hyp_prepare_backtrace - Prepare non-protected nVHE backtrace.
+ *
+ * @fp : frame pointer at which to start the unwinding.
+ * @pc : program counter at which to start the unwinding.
+ *
+ * Save the information needed by the host to unwind the non-protected
+ * nVHE hypervisor stack in EL1.
+ */
+static void hyp_prepare_backtrace(unsigned long fp, unsigned long pc)
+{
+ struct kvm_nvhe_stacktrace_info *stacktrace_info = this_cpu_ptr(&kvm_stacktrace_info);
+ struct kvm_nvhe_init_params *params = this_cpu_ptr(&kvm_init_params);
+
+ stacktrace_info->stack_base = (unsigned long)(params->stack_hyp_va - PAGE_SIZE);
+ stacktrace_info->overflow_stack_base = (unsigned long)this_cpu_ptr(overflow_stack);
+ stacktrace_info->fp = fp;
+ stacktrace_info->pc = pc;
+}
+
+#ifdef CONFIG_PROTECTED_NVHE_STACKTRACE
+#include <asm/stacktrace/nvhe.h>
+
+DEFINE_PER_CPU(unsigned long [NVHE_STACKTRACE_SIZE/sizeof(long)], pkvm_stacktrace);
+
+static struct stack_info stackinfo_get_overflow(void)
+{
+ unsigned long low = (unsigned long)this_cpu_ptr(overflow_stack);
+ unsigned long high = low + OVERFLOW_STACK_SIZE;
+
+ return (struct stack_info) {
+ .low = low,
+ .high = high,
+ };
+}
+
+static struct stack_info stackinfo_get_hyp(void)
+{
+ struct kvm_nvhe_init_params *params = this_cpu_ptr(&kvm_init_params);
+ unsigned long high = params->stack_hyp_va;
+ unsigned long low = high - PAGE_SIZE;
+
+ return (struct stack_info) {
+ .low = low,
+ .high = high,
+ };
+}
+
+static int unwind_next(struct unwind_state *state)
+{
+ return unwind_next_frame_record(state);
+}
+
+static void notrace unwind(struct unwind_state *state,
+ stack_trace_consume_fn consume_entry,
+ void *cookie)
+{
+ while (1) {
+ int ret;
+
+ if (!consume_entry(cookie, state->pc))
+ break;
+ ret = unwind_next(state);
+ if (ret < 0)
+ break;
+ }
+}
+
+/*
+ * pkvm_save_backtrace_entry - Saves a protected nVHE HYP stacktrace entry
+ *
+ * @arg : index of the entry in the stacktrace buffer
+ * @where : the program counter corresponding to the stack frame
+ *
+ * Save the return address of a stack frame to the shared stacktrace buffer.
+ * The host can access this shared buffer from EL1 to dump the backtrace.
+ */
+static bool pkvm_save_backtrace_entry(void *arg, unsigned long where)
+{
+ unsigned long *stacktrace = this_cpu_ptr(pkvm_stacktrace);
+ int *idx = (int *)arg;
+
+ /*
+ * Need 2 free slots: 1 for current entry and 1 for the
+ * delimiter.
+ */
+ if (*idx > ARRAY_SIZE(pkvm_stacktrace) - 2)
+ return false;
+
+ stacktrace[*idx] = where;
+ stacktrace[++*idx] = 0UL;
+
+ return true;
+}
+
+/*
+ * pkvm_save_backtrace - Saves the protected nVHE HYP stacktrace
+ *
+ * @fp : frame pointer at which to start the unwinding.
+ * @pc : program counter at which to start the unwinding.
+ *
+ * Save the unwinded stack addresses to the shared stacktrace buffer.
+ * The host can access this shared buffer from EL1 to dump the backtrace.
+ */
+static void pkvm_save_backtrace(unsigned long fp, unsigned long pc)
+{
+ struct stack_info stacks[] = {
+ stackinfo_get_overflow(),
+ stackinfo_get_hyp(),
+ };
+ struct unwind_state state = {
+ .stacks = stacks,
+ .nr_stacks = ARRAY_SIZE(stacks),
+ };
+ int idx = 0;
+
+ kvm_nvhe_unwind_init(&state, fp, pc);
+
+ unwind(&state, pkvm_save_backtrace_entry, &idx);
+}
+#else /* !CONFIG_PROTECTED_NVHE_STACKTRACE */
+static void pkvm_save_backtrace(unsigned long fp, unsigned long pc)
+{
+}
+#endif /* CONFIG_PROTECTED_NVHE_STACKTRACE */
+
+/*
+ * kvm_nvhe_prepare_backtrace - prepare to dump the nVHE backtrace
+ *
+ * @fp : frame pointer at which to start the unwinding.
+ * @pc : program counter at which to start the unwinding.
+ *
+ * Saves the information needed by the host to dump the nVHE hypervisor
+ * backtrace.
+ */
+void kvm_nvhe_prepare_backtrace(unsigned long fp, unsigned long pc)
+{
+ if (is_protected_kvm_enabled())
+ pkvm_save_backtrace(fp, pc);
+ else
+ hyp_prepare_backtrace(fp, pc);
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c
new file mode 100644
index 000000000000..c50f8459e4fc
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/switch.c
@@ -0,0 +1,396 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2015 - ARM Ltd
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ */
+
+#include <hyp/switch.h>
+#include <hyp/sysreg-sr.h>
+
+#include <linux/arm-smccc.h>
+#include <linux/kvm_host.h>
+#include <linux/types.h>
+#include <linux/jump_label.h>
+#include <uapi/linux/psci.h>
+
+#include <kvm/arm_psci.h>
+
+#include <asm/barrier.h>
+#include <asm/cpufeature.h>
+#include <asm/kprobes.h>
+#include <asm/kvm_asm.h>
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_hyp.h>
+#include <asm/kvm_mmu.h>
+#include <asm/fpsimd.h>
+#include <asm/debug-monitors.h>
+#include <asm/processor.h>
+
+#include <nvhe/fixed_config.h>
+#include <nvhe/mem_protect.h>
+
+/* Non-VHE specific context */
+DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data);
+DEFINE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
+DEFINE_PER_CPU(unsigned long, kvm_hyp_vector);
+
+extern void kvm_nvhe_prepare_backtrace(unsigned long fp, unsigned long pc);
+
+static void __activate_traps(struct kvm_vcpu *vcpu)
+{
+ u64 val;
+
+ ___activate_traps(vcpu);
+ __activate_traps_common(vcpu);
+
+ val = vcpu->arch.cptr_el2;
+ val |= CPTR_EL2_TAM; /* Same bit irrespective of E2H */
+ val |= has_hvhe() ? CPACR_EL1_TTA : CPTR_EL2_TTA;
+ if (cpus_have_final_cap(ARM64_SME)) {
+ if (has_hvhe())
+ val &= ~(CPACR_EL1_SMEN_EL1EN | CPACR_EL1_SMEN_EL0EN);
+ else
+ val |= CPTR_EL2_TSM;
+ }
+
+ if (!guest_owns_fp_regs(vcpu)) {
+ if (has_hvhe())
+ val &= ~(CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN |
+ CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN);
+ else
+ val |= CPTR_EL2_TFP | CPTR_EL2_TZ;
+
+ __activate_traps_fpsimd32(vcpu);
+ }
+
+ kvm_write_cptr_el2(val);
+ write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el2);
+
+ if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) {
+ struct kvm_cpu_context *ctxt = &vcpu->arch.ctxt;
+
+ isb();
+ /*
+ * At this stage, and thanks to the above isb(), S2 is
+ * configured and enabled. We can now restore the guest's S1
+ * configuration: SCTLR, and only then TCR.
+ */
+ write_sysreg_el1(ctxt_sys_reg(ctxt, SCTLR_EL1), SYS_SCTLR);
+ isb();
+ write_sysreg_el1(ctxt_sys_reg(ctxt, TCR_EL1), SYS_TCR);
+ }
+}
+
+static void __deactivate_traps(struct kvm_vcpu *vcpu)
+{
+ extern char __kvm_hyp_host_vector[];
+
+ ___deactivate_traps(vcpu);
+
+ if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) {
+ u64 val;
+
+ /*
+ * Set the TCR and SCTLR registers in the exact opposite
+ * sequence as __activate_traps (first prevent walks,
+ * then force the MMU on). A generous sprinkling of isb()
+ * ensure that things happen in this exact order.
+ */
+ val = read_sysreg_el1(SYS_TCR);
+ write_sysreg_el1(val | TCR_EPD1_MASK | TCR_EPD0_MASK, SYS_TCR);
+ isb();
+ val = read_sysreg_el1(SYS_SCTLR);
+ write_sysreg_el1(val | SCTLR_ELx_M, SYS_SCTLR);
+ isb();
+ }
+
+ __deactivate_traps_common(vcpu);
+
+ write_sysreg(this_cpu_ptr(&kvm_init_params)->hcr_el2, hcr_el2);
+
+ kvm_reset_cptr_el2(vcpu);
+ write_sysreg(__kvm_hyp_host_vector, vbar_el2);
+}
+
+/* Save VGICv3 state on non-VHE systems */
+static void __hyp_vgic_save_state(struct kvm_vcpu *vcpu)
+{
+ if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) {
+ __vgic_v3_save_state(&vcpu->arch.vgic_cpu.vgic_v3);
+ __vgic_v3_deactivate_traps(&vcpu->arch.vgic_cpu.vgic_v3);
+ }
+}
+
+/* Restore VGICv3 state on non-VHE systems */
+static void __hyp_vgic_restore_state(struct kvm_vcpu *vcpu)
+{
+ if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) {
+ __vgic_v3_activate_traps(&vcpu->arch.vgic_cpu.vgic_v3);
+ __vgic_v3_restore_state(&vcpu->arch.vgic_cpu.vgic_v3);
+ }
+}
+
+/*
+ * Disable host events, enable guest events
+ */
+#ifdef CONFIG_HW_PERF_EVENTS
+static bool __pmu_switch_to_guest(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu_events *pmu = &vcpu->arch.pmu.events;
+
+ if (pmu->events_host)
+ write_sysreg(pmu->events_host, pmcntenclr_el0);
+
+ if (pmu->events_guest)
+ write_sysreg(pmu->events_guest, pmcntenset_el0);
+
+ return (pmu->events_host || pmu->events_guest);
+}
+
+/*
+ * Disable guest events, enable host events
+ */
+static void __pmu_switch_to_host(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu_events *pmu = &vcpu->arch.pmu.events;
+
+ if (pmu->events_guest)
+ write_sysreg(pmu->events_guest, pmcntenclr_el0);
+
+ if (pmu->events_host)
+ write_sysreg(pmu->events_host, pmcntenset_el0);
+}
+#else
+#define __pmu_switch_to_guest(v) ({ false; })
+#define __pmu_switch_to_host(v) do {} while (0)
+#endif
+
+/*
+ * Handler for protected VM MSR, MRS or System instruction execution in AArch64.
+ *
+ * Returns true if the hypervisor has handled the exit, and control should go
+ * back to the guest, or false if it hasn't.
+ */
+static bool kvm_handle_pvm_sys64(struct kvm_vcpu *vcpu, u64 *exit_code)
+{
+ /*
+ * Make sure we handle the exit for workarounds and ptrauth
+ * before the pKVM handling, as the latter could decide to
+ * UNDEF.
+ */
+ return (kvm_hyp_handle_sysreg(vcpu, exit_code) ||
+ kvm_handle_pvm_sysreg(vcpu, exit_code));
+}
+
+static const exit_handler_fn hyp_exit_handlers[] = {
+ [0 ... ESR_ELx_EC_MAX] = NULL,
+ [ESR_ELx_EC_CP15_32] = kvm_hyp_handle_cp15_32,
+ [ESR_ELx_EC_SYS64] = kvm_hyp_handle_sysreg,
+ [ESR_ELx_EC_SVE] = kvm_hyp_handle_fpsimd,
+ [ESR_ELx_EC_FP_ASIMD] = kvm_hyp_handle_fpsimd,
+ [ESR_ELx_EC_IABT_LOW] = kvm_hyp_handle_iabt_low,
+ [ESR_ELx_EC_DABT_LOW] = kvm_hyp_handle_dabt_low,
+ [ESR_ELx_EC_WATCHPT_LOW] = kvm_hyp_handle_watchpt_low,
+ [ESR_ELx_EC_PAC] = kvm_hyp_handle_ptrauth,
+ [ESR_ELx_EC_MOPS] = kvm_hyp_handle_mops,
+};
+
+static const exit_handler_fn pvm_exit_handlers[] = {
+ [0 ... ESR_ELx_EC_MAX] = NULL,
+ [ESR_ELx_EC_SYS64] = kvm_handle_pvm_sys64,
+ [ESR_ELx_EC_SVE] = kvm_handle_pvm_restricted,
+ [ESR_ELx_EC_FP_ASIMD] = kvm_hyp_handle_fpsimd,
+ [ESR_ELx_EC_IABT_LOW] = kvm_hyp_handle_iabt_low,
+ [ESR_ELx_EC_DABT_LOW] = kvm_hyp_handle_dabt_low,
+ [ESR_ELx_EC_WATCHPT_LOW] = kvm_hyp_handle_watchpt_low,
+ [ESR_ELx_EC_PAC] = kvm_hyp_handle_ptrauth,
+ [ESR_ELx_EC_MOPS] = kvm_hyp_handle_mops,
+};
+
+static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu)
+{
+ if (unlikely(kvm_vm_is_protected(kern_hyp_va(vcpu->kvm))))
+ return pvm_exit_handlers;
+
+ return hyp_exit_handlers;
+}
+
+/*
+ * Some guests (e.g., protected VMs) are not be allowed to run in AArch32.
+ * The ARMv8 architecture does not give the hypervisor a mechanism to prevent a
+ * guest from dropping to AArch32 EL0 if implemented by the CPU. If the
+ * hypervisor spots a guest in such a state ensure it is handled, and don't
+ * trust the host to spot or fix it. The check below is based on the one in
+ * kvm_arch_vcpu_ioctl_run().
+ *
+ * Returns false if the guest ran in AArch32 when it shouldn't have, and
+ * thus should exit to the host, or true if a the guest run loop can continue.
+ */
+static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code)
+{
+ struct kvm *kvm = kern_hyp_va(vcpu->kvm);
+
+ if (kvm_vm_is_protected(kvm) && vcpu_mode_is_32bit(vcpu)) {
+ /*
+ * As we have caught the guest red-handed, decide that it isn't
+ * fit for purpose anymore by making the vcpu invalid. The VMM
+ * can try and fix it by re-initializing the vcpu with
+ * KVM_ARM_VCPU_INIT, however, this is likely not possible for
+ * protected VMs.
+ */
+ vcpu_clear_flag(vcpu, VCPU_INITIALIZED);
+ *exit_code &= BIT(ARM_EXIT_WITH_SERROR_BIT);
+ *exit_code |= ARM_EXCEPTION_IL;
+ }
+}
+
+/* Switch to the guest for legacy non-VHE systems */
+int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpu_context *host_ctxt;
+ struct kvm_cpu_context *guest_ctxt;
+ struct kvm_s2_mmu *mmu;
+ bool pmu_switch_needed;
+ u64 exit_code;
+
+ /*
+ * Having IRQs masked via PMR when entering the guest means the GIC
+ * will not signal the CPU of interrupts of lower priority, and the
+ * only way to get out will be via guest exceptions.
+ * Naturally, we want to avoid this.
+ */
+ if (system_uses_irq_prio_masking()) {
+ gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET);
+ pmr_sync();
+ }
+
+ host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
+ host_ctxt->__hyp_running_vcpu = vcpu;
+ guest_ctxt = &vcpu->arch.ctxt;
+
+ pmu_switch_needed = __pmu_switch_to_guest(vcpu);
+
+ __sysreg_save_state_nvhe(host_ctxt);
+ /*
+ * We must flush and disable the SPE buffer for nVHE, as
+ * the translation regime(EL1&0) is going to be loaded with
+ * that of the guest. And we must do this before we change the
+ * translation regime to EL2 (via MDCR_EL2_E2PB == 0) and
+ * before we load guest Stage1.
+ */
+ __debug_save_host_buffers_nvhe(vcpu);
+
+ /*
+ * We're about to restore some new MMU state. Make sure
+ * ongoing page-table walks that have started before we
+ * trapped to EL2 have completed. This also synchronises the
+ * above disabling of SPE and TRBE.
+ *
+ * See DDI0487I.a D8.1.5 "Out-of-context translation regimes",
+ * rule R_LFHQG and subsequent information statements.
+ */
+ dsb(nsh);
+
+ __kvm_adjust_pc(vcpu);
+
+ /*
+ * We must restore the 32-bit state before the sysregs, thanks
+ * to erratum #852523 (Cortex-A57) or #853709 (Cortex-A72).
+ *
+ * Also, and in order to be able to deal with erratum #1319537 (A57)
+ * and #1319367 (A72), we must ensure that all VM-related sysreg are
+ * restored before we enable S2 translation.
+ */
+ __sysreg32_restore_state(vcpu);
+ __sysreg_restore_state_nvhe(guest_ctxt);
+
+ mmu = kern_hyp_va(vcpu->arch.hw_mmu);
+ __load_stage2(mmu, kern_hyp_va(mmu->arch));
+ __activate_traps(vcpu);
+
+ __hyp_vgic_restore_state(vcpu);
+ __timer_enable_traps(vcpu);
+
+ __debug_switch_to_guest(vcpu);
+
+ do {
+ /* Jump in the fire! */
+ exit_code = __guest_enter(vcpu);
+
+ /* And we're baaack! */
+ } while (fixup_guest_exit(vcpu, &exit_code));
+
+ __sysreg_save_state_nvhe(guest_ctxt);
+ __sysreg32_save_state(vcpu);
+ __timer_disable_traps(vcpu);
+ __hyp_vgic_save_state(vcpu);
+
+ /*
+ * Same thing as before the guest run: we're about to switch
+ * the MMU context, so let's make sure we don't have any
+ * ongoing EL1&0 translations.
+ */
+ dsb(nsh);
+
+ __deactivate_traps(vcpu);
+ __load_host_stage2();
+
+ __sysreg_restore_state_nvhe(host_ctxt);
+
+ if (vcpu->arch.fp_state == FP_STATE_GUEST_OWNED)
+ __fpsimd_save_fpexc32(vcpu);
+
+ __debug_switch_to_host(vcpu);
+ /*
+ * This must come after restoring the host sysregs, since a non-VHE
+ * system may enable SPE here and make use of the TTBRs.
+ */
+ __debug_restore_host_buffers_nvhe(vcpu);
+
+ if (pmu_switch_needed)
+ __pmu_switch_to_host(vcpu);
+
+ /* Returning to host will clear PSR.I, remask PMR if needed */
+ if (system_uses_irq_prio_masking())
+ gic_write_pmr(GIC_PRIO_IRQOFF);
+
+ host_ctxt->__hyp_running_vcpu = NULL;
+
+ return exit_code;
+}
+
+asmlinkage void __noreturn hyp_panic(void)
+{
+ u64 spsr = read_sysreg_el2(SYS_SPSR);
+ u64 elr = read_sysreg_el2(SYS_ELR);
+ u64 par = read_sysreg_par();
+ struct kvm_cpu_context *host_ctxt;
+ struct kvm_vcpu *vcpu;
+
+ host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
+ vcpu = host_ctxt->__hyp_running_vcpu;
+
+ if (vcpu) {
+ __timer_disable_traps(vcpu);
+ __deactivate_traps(vcpu);
+ __load_host_stage2();
+ __sysreg_restore_state_nvhe(host_ctxt);
+ }
+
+ /* Prepare to dump kvm nvhe hyp stacktrace */
+ kvm_nvhe_prepare_backtrace((unsigned long)__builtin_frame_address(0),
+ _THIS_IP_);
+
+ __hyp_do_panic(host_ctxt, spsr, elr, par);
+ unreachable();
+}
+
+asmlinkage void __noreturn hyp_panic_bad_stack(void)
+{
+ hyp_panic();
+}
+
+asmlinkage void kvm_unexpected_el2_exception(void)
+{
+ __kvm_unexpected_el2_exception();
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c b/arch/arm64/kvm/hyp/nvhe/sys_regs.c
new file mode 100644
index 000000000000..edd969a1f36b
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c
@@ -0,0 +1,516 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2021 Google LLC
+ * Author: Fuad Tabba <tabba@google.com>
+ */
+
+#include <linux/irqchip/arm-gic-v3.h>
+
+#include <asm/kvm_asm.h>
+#include <asm/kvm_mmu.h>
+
+#include <hyp/adjust_pc.h>
+
+#include <nvhe/fixed_config.h>
+
+#include "../../sys_regs.h"
+
+/*
+ * Copies of the host's CPU features registers holding sanitized values at hyp.
+ */
+u64 id_aa64pfr0_el1_sys_val;
+u64 id_aa64pfr1_el1_sys_val;
+u64 id_aa64isar0_el1_sys_val;
+u64 id_aa64isar1_el1_sys_val;
+u64 id_aa64isar2_el1_sys_val;
+u64 id_aa64mmfr0_el1_sys_val;
+u64 id_aa64mmfr1_el1_sys_val;
+u64 id_aa64mmfr2_el1_sys_val;
+u64 id_aa64smfr0_el1_sys_val;
+
+/*
+ * Inject an unknown/undefined exception to an AArch64 guest while most of its
+ * sysregs are live.
+ */
+static void inject_undef64(struct kvm_vcpu *vcpu)
+{
+ u64 esr = (ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT);
+
+ *vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR);
+ *vcpu_cpsr(vcpu) = read_sysreg_el2(SYS_SPSR);
+
+ kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC);
+
+ __kvm_adjust_pc(vcpu);
+
+ write_sysreg_el1(esr, SYS_ESR);
+ write_sysreg_el1(read_sysreg_el2(SYS_ELR), SYS_ELR);
+ write_sysreg_el2(*vcpu_pc(vcpu), SYS_ELR);
+ write_sysreg_el2(*vcpu_cpsr(vcpu), SYS_SPSR);
+}
+
+/*
+ * Returns the restricted features values of the feature register based on the
+ * limitations in restrict_fields.
+ * A feature id field value of 0b0000 does not impose any restrictions.
+ * Note: Use only for unsigned feature field values.
+ */
+static u64 get_restricted_features_unsigned(u64 sys_reg_val,
+ u64 restrict_fields)
+{
+ u64 value = 0UL;
+ u64 mask = GENMASK_ULL(ARM64_FEATURE_FIELD_BITS - 1, 0);
+
+ /*
+ * According to the Arm Architecture Reference Manual, feature fields
+ * use increasing values to indicate increases in functionality.
+ * Iterate over the restricted feature fields and calculate the minimum
+ * unsigned value between the one supported by the system, and what the
+ * value is being restricted to.
+ */
+ while (sys_reg_val && restrict_fields) {
+ value |= min(sys_reg_val & mask, restrict_fields & mask);
+ sys_reg_val &= ~mask;
+ restrict_fields &= ~mask;
+ mask <<= ARM64_FEATURE_FIELD_BITS;
+ }
+
+ return value;
+}
+
+/*
+ * Functions that return the value of feature id registers for protected VMs
+ * based on allowed features, system features, and KVM support.
+ */
+
+static u64 get_pvm_id_aa64pfr0(const struct kvm_vcpu *vcpu)
+{
+ u64 set_mask = 0;
+ u64 allow_mask = PVM_ID_AA64PFR0_ALLOW;
+
+ set_mask |= get_restricted_features_unsigned(id_aa64pfr0_el1_sys_val,
+ PVM_ID_AA64PFR0_RESTRICT_UNSIGNED);
+
+ return (id_aa64pfr0_el1_sys_val & allow_mask) | set_mask;
+}
+
+static u64 get_pvm_id_aa64pfr1(const struct kvm_vcpu *vcpu)
+{
+ const struct kvm *kvm = (const struct kvm *)kern_hyp_va(vcpu->kvm);
+ u64 allow_mask = PVM_ID_AA64PFR1_ALLOW;
+
+ if (!kvm_has_mte(kvm))
+ allow_mask &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE);
+
+ return id_aa64pfr1_el1_sys_val & allow_mask;
+}
+
+static u64 get_pvm_id_aa64zfr0(const struct kvm_vcpu *vcpu)
+{
+ /*
+ * No support for Scalable Vectors, therefore, hyp has no sanitized
+ * copy of the feature id register.
+ */
+ BUILD_BUG_ON(PVM_ID_AA64ZFR0_ALLOW != 0ULL);
+ return 0;
+}
+
+static u64 get_pvm_id_aa64dfr0(const struct kvm_vcpu *vcpu)
+{
+ /*
+ * No support for debug, including breakpoints, and watchpoints,
+ * therefore, pKVM has no sanitized copy of the feature id register.
+ */
+ BUILD_BUG_ON(PVM_ID_AA64DFR0_ALLOW != 0ULL);
+ return 0;
+}
+
+static u64 get_pvm_id_aa64dfr1(const struct kvm_vcpu *vcpu)
+{
+ /*
+ * No support for debug, therefore, hyp has no sanitized copy of the
+ * feature id register.
+ */
+ BUILD_BUG_ON(PVM_ID_AA64DFR1_ALLOW != 0ULL);
+ return 0;
+}
+
+static u64 get_pvm_id_aa64afr0(const struct kvm_vcpu *vcpu)
+{
+ /*
+ * No support for implementation defined features, therefore, hyp has no
+ * sanitized copy of the feature id register.
+ */
+ BUILD_BUG_ON(PVM_ID_AA64AFR0_ALLOW != 0ULL);
+ return 0;
+}
+
+static u64 get_pvm_id_aa64afr1(const struct kvm_vcpu *vcpu)
+{
+ /*
+ * No support for implementation defined features, therefore, hyp has no
+ * sanitized copy of the feature id register.
+ */
+ BUILD_BUG_ON(PVM_ID_AA64AFR1_ALLOW != 0ULL);
+ return 0;
+}
+
+static u64 get_pvm_id_aa64isar0(const struct kvm_vcpu *vcpu)
+{
+ return id_aa64isar0_el1_sys_val & PVM_ID_AA64ISAR0_ALLOW;
+}
+
+static u64 get_pvm_id_aa64isar1(const struct kvm_vcpu *vcpu)
+{
+ u64 allow_mask = PVM_ID_AA64ISAR1_ALLOW;
+
+ if (!vcpu_has_ptrauth(vcpu))
+ allow_mask &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA) |
+ ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API) |
+ ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPA) |
+ ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI));
+
+ return id_aa64isar1_el1_sys_val & allow_mask;
+}
+
+static u64 get_pvm_id_aa64isar2(const struct kvm_vcpu *vcpu)
+{
+ u64 allow_mask = PVM_ID_AA64ISAR2_ALLOW;
+
+ if (!vcpu_has_ptrauth(vcpu))
+ allow_mask &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_APA3) |
+ ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_GPA3));
+
+ return id_aa64isar2_el1_sys_val & allow_mask;
+}
+
+static u64 get_pvm_id_aa64mmfr0(const struct kvm_vcpu *vcpu)
+{
+ u64 set_mask;
+
+ set_mask = get_restricted_features_unsigned(id_aa64mmfr0_el1_sys_val,
+ PVM_ID_AA64MMFR0_RESTRICT_UNSIGNED);
+
+ return (id_aa64mmfr0_el1_sys_val & PVM_ID_AA64MMFR0_ALLOW) | set_mask;
+}
+
+static u64 get_pvm_id_aa64mmfr1(const struct kvm_vcpu *vcpu)
+{
+ return id_aa64mmfr1_el1_sys_val & PVM_ID_AA64MMFR1_ALLOW;
+}
+
+static u64 get_pvm_id_aa64mmfr2(const struct kvm_vcpu *vcpu)
+{
+ return id_aa64mmfr2_el1_sys_val & PVM_ID_AA64MMFR2_ALLOW;
+}
+
+/* Read a sanitized cpufeature ID register by its encoding */
+u64 pvm_read_id_reg(const struct kvm_vcpu *vcpu, u32 id)
+{
+ switch (id) {
+ case SYS_ID_AA64PFR0_EL1:
+ return get_pvm_id_aa64pfr0(vcpu);
+ case SYS_ID_AA64PFR1_EL1:
+ return get_pvm_id_aa64pfr1(vcpu);
+ case SYS_ID_AA64ZFR0_EL1:
+ return get_pvm_id_aa64zfr0(vcpu);
+ case SYS_ID_AA64DFR0_EL1:
+ return get_pvm_id_aa64dfr0(vcpu);
+ case SYS_ID_AA64DFR1_EL1:
+ return get_pvm_id_aa64dfr1(vcpu);
+ case SYS_ID_AA64AFR0_EL1:
+ return get_pvm_id_aa64afr0(vcpu);
+ case SYS_ID_AA64AFR1_EL1:
+ return get_pvm_id_aa64afr1(vcpu);
+ case SYS_ID_AA64ISAR0_EL1:
+ return get_pvm_id_aa64isar0(vcpu);
+ case SYS_ID_AA64ISAR1_EL1:
+ return get_pvm_id_aa64isar1(vcpu);
+ case SYS_ID_AA64ISAR2_EL1:
+ return get_pvm_id_aa64isar2(vcpu);
+ case SYS_ID_AA64MMFR0_EL1:
+ return get_pvm_id_aa64mmfr0(vcpu);
+ case SYS_ID_AA64MMFR1_EL1:
+ return get_pvm_id_aa64mmfr1(vcpu);
+ case SYS_ID_AA64MMFR2_EL1:
+ return get_pvm_id_aa64mmfr2(vcpu);
+ default:
+ /* Unhandled ID register, RAZ */
+ return 0;
+ }
+}
+
+static u64 read_id_reg(const struct kvm_vcpu *vcpu,
+ struct sys_reg_desc const *r)
+{
+ return pvm_read_id_reg(vcpu, reg_to_encoding(r));
+}
+
+/* Handler to RAZ/WI sysregs */
+static bool pvm_access_raz_wi(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ if (!p->is_write)
+ p->regval = 0;
+
+ return true;
+}
+
+/*
+ * Accessor for AArch32 feature id registers.
+ *
+ * The value of these registers is "unknown" according to the spec if AArch32
+ * isn't supported.
+ */
+static bool pvm_access_id_aarch32(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ if (p->is_write) {
+ inject_undef64(vcpu);
+ return false;
+ }
+
+ /*
+ * No support for AArch32 guests, therefore, pKVM has no sanitized copy
+ * of AArch32 feature id registers.
+ */
+ BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL1),
+ PVM_ID_AA64PFR0_RESTRICT_UNSIGNED) > ID_AA64PFR0_EL1_ELx_64BIT_ONLY);
+
+ return pvm_access_raz_wi(vcpu, p, r);
+}
+
+/*
+ * Accessor for AArch64 feature id registers.
+ *
+ * If access is allowed, set the regval to the protected VM's view of the
+ * register and return true.
+ * Otherwise, inject an undefined exception and return false.
+ */
+static bool pvm_access_id_aarch64(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ if (p->is_write) {
+ inject_undef64(vcpu);
+ return false;
+ }
+
+ p->regval = read_id_reg(vcpu, r);
+ return true;
+}
+
+static bool pvm_gic_read_sre(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ /* pVMs only support GICv3. 'nuf said. */
+ if (!p->is_write)
+ p->regval = ICC_SRE_EL1_DIB | ICC_SRE_EL1_DFB | ICC_SRE_EL1_SRE;
+
+ return true;
+}
+
+/* Mark the specified system register as an AArch32 feature id register. */
+#define AARCH32(REG) { SYS_DESC(REG), .access = pvm_access_id_aarch32 }
+
+/* Mark the specified system register as an AArch64 feature id register. */
+#define AARCH64(REG) { SYS_DESC(REG), .access = pvm_access_id_aarch64 }
+
+/*
+ * sys_reg_desc initialiser for architecturally unallocated cpufeature ID
+ * register with encoding Op0=3, Op1=0, CRn=0, CRm=crm, Op2=op2
+ * (1 <= crm < 8, 0 <= Op2 < 8).
+ */
+#define ID_UNALLOCATED(crm, op2) { \
+ Op0(3), Op1(0), CRn(0), CRm(crm), Op2(op2), \
+ .access = pvm_access_id_aarch64, \
+}
+
+/* Mark the specified system register as Read-As-Zero/Write-Ignored */
+#define RAZ_WI(REG) { SYS_DESC(REG), .access = pvm_access_raz_wi }
+
+/* Mark the specified system register as not being handled in hyp. */
+#define HOST_HANDLED(REG) { SYS_DESC(REG), .access = NULL }
+
+/*
+ * Architected system registers.
+ * Important: Must be sorted ascending by Op0, Op1, CRn, CRm, Op2
+ *
+ * NOTE: Anything not explicitly listed here is *restricted by default*, i.e.,
+ * it will lead to injecting an exception into the guest.
+ */
+static const struct sys_reg_desc pvm_sys_reg_descs[] = {
+ /* Cache maintenance by set/way operations are restricted. */
+
+ /* Debug and Trace Registers are restricted. */
+
+ /* AArch64 mappings of the AArch32 ID registers */
+ /* CRm=1 */
+ AARCH32(SYS_ID_PFR0_EL1),
+ AARCH32(SYS_ID_PFR1_EL1),
+ AARCH32(SYS_ID_DFR0_EL1),
+ AARCH32(SYS_ID_AFR0_EL1),
+ AARCH32(SYS_ID_MMFR0_EL1),
+ AARCH32(SYS_ID_MMFR1_EL1),
+ AARCH32(SYS_ID_MMFR2_EL1),
+ AARCH32(SYS_ID_MMFR3_EL1),
+
+ /* CRm=2 */
+ AARCH32(SYS_ID_ISAR0_EL1),
+ AARCH32(SYS_ID_ISAR1_EL1),
+ AARCH32(SYS_ID_ISAR2_EL1),
+ AARCH32(SYS_ID_ISAR3_EL1),
+ AARCH32(SYS_ID_ISAR4_EL1),
+ AARCH32(SYS_ID_ISAR5_EL1),
+ AARCH32(SYS_ID_MMFR4_EL1),
+ AARCH32(SYS_ID_ISAR6_EL1),
+
+ /* CRm=3 */
+ AARCH32(SYS_MVFR0_EL1),
+ AARCH32(SYS_MVFR1_EL1),
+ AARCH32(SYS_MVFR2_EL1),
+ ID_UNALLOCATED(3,3),
+ AARCH32(SYS_ID_PFR2_EL1),
+ AARCH32(SYS_ID_DFR1_EL1),
+ AARCH32(SYS_ID_MMFR5_EL1),
+ ID_UNALLOCATED(3,7),
+
+ /* AArch64 ID registers */
+ /* CRm=4 */
+ AARCH64(SYS_ID_AA64PFR0_EL1),
+ AARCH64(SYS_ID_AA64PFR1_EL1),
+ ID_UNALLOCATED(4,2),
+ ID_UNALLOCATED(4,3),
+ AARCH64(SYS_ID_AA64ZFR0_EL1),
+ ID_UNALLOCATED(4,5),
+ ID_UNALLOCATED(4,6),
+ ID_UNALLOCATED(4,7),
+ AARCH64(SYS_ID_AA64DFR0_EL1),
+ AARCH64(SYS_ID_AA64DFR1_EL1),
+ ID_UNALLOCATED(5,2),
+ ID_UNALLOCATED(5,3),
+ AARCH64(SYS_ID_AA64AFR0_EL1),
+ AARCH64(SYS_ID_AA64AFR1_EL1),
+ ID_UNALLOCATED(5,6),
+ ID_UNALLOCATED(5,7),
+ AARCH64(SYS_ID_AA64ISAR0_EL1),
+ AARCH64(SYS_ID_AA64ISAR1_EL1),
+ AARCH64(SYS_ID_AA64ISAR2_EL1),
+ ID_UNALLOCATED(6,3),
+ ID_UNALLOCATED(6,4),
+ ID_UNALLOCATED(6,5),
+ ID_UNALLOCATED(6,6),
+ ID_UNALLOCATED(6,7),
+ AARCH64(SYS_ID_AA64MMFR0_EL1),
+ AARCH64(SYS_ID_AA64MMFR1_EL1),
+ AARCH64(SYS_ID_AA64MMFR2_EL1),
+ ID_UNALLOCATED(7,3),
+ ID_UNALLOCATED(7,4),
+ ID_UNALLOCATED(7,5),
+ ID_UNALLOCATED(7,6),
+ ID_UNALLOCATED(7,7),
+
+ /* Scalable Vector Registers are restricted. */
+
+ RAZ_WI(SYS_ERRIDR_EL1),
+ RAZ_WI(SYS_ERRSELR_EL1),
+ RAZ_WI(SYS_ERXFR_EL1),
+ RAZ_WI(SYS_ERXCTLR_EL1),
+ RAZ_WI(SYS_ERXSTATUS_EL1),
+ RAZ_WI(SYS_ERXADDR_EL1),
+ RAZ_WI(SYS_ERXMISC0_EL1),
+ RAZ_WI(SYS_ERXMISC1_EL1),
+
+ /* Performance Monitoring Registers are restricted. */
+
+ /* Limited Ordering Regions Registers are restricted. */
+
+ HOST_HANDLED(SYS_ICC_SGI1R_EL1),
+ HOST_HANDLED(SYS_ICC_ASGI1R_EL1),
+ HOST_HANDLED(SYS_ICC_SGI0R_EL1),
+ { SYS_DESC(SYS_ICC_SRE_EL1), .access = pvm_gic_read_sre, },
+
+ HOST_HANDLED(SYS_CCSIDR_EL1),
+ HOST_HANDLED(SYS_CLIDR_EL1),
+ HOST_HANDLED(SYS_CSSELR_EL1),
+ HOST_HANDLED(SYS_CTR_EL0),
+
+ /* Performance Monitoring Registers are restricted. */
+
+ /* Activity Monitoring Registers are restricted. */
+
+ HOST_HANDLED(SYS_CNTP_TVAL_EL0),
+ HOST_HANDLED(SYS_CNTP_CTL_EL0),
+ HOST_HANDLED(SYS_CNTP_CVAL_EL0),
+
+ /* Performance Monitoring Registers are restricted. */
+};
+
+/*
+ * Checks that the sysreg table is unique and in-order.
+ *
+ * Returns 0 if the table is consistent, or 1 otherwise.
+ */
+int kvm_check_pvm_sysreg_table(void)
+{
+ unsigned int i;
+
+ for (i = 1; i < ARRAY_SIZE(pvm_sys_reg_descs); i++) {
+ if (cmp_sys_reg(&pvm_sys_reg_descs[i-1], &pvm_sys_reg_descs[i]) >= 0)
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * Handler for protected VM MSR, MRS or System instruction execution.
+ *
+ * Returns true if the hypervisor has handled the exit, and control should go
+ * back to the guest, or false if it hasn't, to be handled by the host.
+ */
+bool kvm_handle_pvm_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code)
+{
+ const struct sys_reg_desc *r;
+ struct sys_reg_params params;
+ unsigned long esr = kvm_vcpu_get_esr(vcpu);
+ int Rt = kvm_vcpu_sys_get_rt(vcpu);
+
+ params = esr_sys64_to_params(esr);
+ params.regval = vcpu_get_reg(vcpu, Rt);
+
+ r = find_reg(&params, pvm_sys_reg_descs, ARRAY_SIZE(pvm_sys_reg_descs));
+
+ /* Undefined (RESTRICTED). */
+ if (r == NULL) {
+ inject_undef64(vcpu);
+ return true;
+ }
+
+ /* Handled by the host (HOST_HANDLED) */
+ if (r->access == NULL)
+ return false;
+
+ /* Handled by hyp: skip instruction if instructed to do so. */
+ if (r->access(vcpu, &params, r))
+ __kvm_skip_instr(vcpu);
+
+ if (!params.is_write)
+ vcpu_set_reg(vcpu, Rt, params.regval);
+
+ return true;
+}
+
+/*
+ * Handler for protected VM restricted exceptions.
+ *
+ * Inject an undefined exception into the guest and return true to indicate that
+ * the hypervisor has handled the exit, and control should go back to the guest.
+ */
+bool kvm_handle_pvm_restricted(struct kvm_vcpu *vcpu, u64 *exit_code)
+{
+ inject_undef64(vcpu);
+ return true;
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c b/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c
new file mode 100644
index 000000000000..29305022bc04
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2012-2015 - ARM Ltd
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ */
+
+#include <hyp/sysreg-sr.h>
+
+#include <linux/compiler.h>
+#include <linux/kvm_host.h>
+
+#include <asm/kprobes.h>
+#include <asm/kvm_asm.h>
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_hyp.h>
+
+/*
+ * Non-VHE: Both host and guest must save everything.
+ */
+
+void __sysreg_save_state_nvhe(struct kvm_cpu_context *ctxt)
+{
+ __sysreg_save_el1_state(ctxt);
+ __sysreg_save_common_state(ctxt);
+ __sysreg_save_user_state(ctxt);
+ __sysreg_save_el2_return_state(ctxt);
+}
+
+void __sysreg_restore_state_nvhe(struct kvm_cpu_context *ctxt)
+{
+ __sysreg_restore_el1_state(ctxt);
+ __sysreg_restore_common_state(ctxt);
+ __sysreg_restore_user_state(ctxt);
+ __sysreg_restore_el2_return_state(ctxt);
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/timer-sr.c b/arch/arm64/kvm/hyp/nvhe/timer-sr.c
new file mode 100644
index 000000000000..3aaab20ae5b4
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/timer-sr.c
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2012-2015 - ARM Ltd
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ */
+
+#include <clocksource/arm_arch_timer.h>
+#include <linux/compiler.h>
+#include <linux/kvm_host.h>
+
+#include <asm/kvm_hyp.h>
+#include <asm/kvm_mmu.h>
+
+void __kvm_timer_set_cntvoff(u64 cntvoff)
+{
+ write_sysreg(cntvoff, cntvoff_el2);
+}
+
+/*
+ * Should only be called on non-VHE or hVHE setups.
+ * VHE systems use EL2 timers and configure EL1 timers in kvm_timer_init_vhe().
+ */
+void __timer_disable_traps(struct kvm_vcpu *vcpu)
+{
+ u64 val, shift = 0;
+
+ if (has_hvhe())
+ shift = 10;
+
+ /* Allow physical timer/counter access for the host */
+ val = read_sysreg(cnthctl_el2);
+ val |= (CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN) << shift;
+ write_sysreg(val, cnthctl_el2);
+}
+
+/*
+ * Should only be called on non-VHE or hVHE setups.
+ * VHE systems use EL2 timers and configure EL1 timers in kvm_timer_init_vhe().
+ */
+void __timer_enable_traps(struct kvm_vcpu *vcpu)
+{
+ u64 clr = 0, set = 0;
+
+ /*
+ * Disallow physical timer access for the guest
+ * Physical counter access is allowed if no offset is enforced
+ * or running protected (we don't offset anything in this case).
+ */
+ clr = CNTHCTL_EL1PCEN;
+ if (is_protected_kvm_enabled() ||
+ !kern_hyp_va(vcpu->kvm)->arch.timer_data.poffset)
+ set |= CNTHCTL_EL1PCTEN;
+ else
+ clr |= CNTHCTL_EL1PCTEN;
+
+ if (has_hvhe()) {
+ clr <<= 10;
+ set <<= 10;
+ }
+
+ sysreg_clear_set(cnthctl_el2, clr, set);
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c
new file mode 100644
index 000000000000..a60fb13e2192
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/tlb.c
@@ -0,0 +1,202 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2015 - ARM Ltd
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ */
+
+#include <asm/kvm_hyp.h>
+#include <asm/kvm_mmu.h>
+#include <asm/tlbflush.h>
+
+#include <nvhe/mem_protect.h>
+
+struct tlb_inv_context {
+ u64 tcr;
+};
+
+static void __tlb_switch_to_guest(struct kvm_s2_mmu *mmu,
+ struct tlb_inv_context *cxt,
+ bool nsh)
+{
+ /*
+ * We have two requirements:
+ *
+ * - ensure that the page table updates are visible to all
+ * CPUs, for which a dsb(DOMAIN-st) is what we need, DOMAIN
+ * being either ish or nsh, depending on the invalidation
+ * type.
+ *
+ * - complete any speculative page table walk started before
+ * we trapped to EL2 so that we can mess with the MM
+ * registers out of context, for which dsb(nsh) is enough
+ *
+ * The composition of these two barriers is a dsb(DOMAIN), and
+ * the 'nsh' parameter tracks the distinction between
+ * Inner-Shareable and Non-Shareable, as specified by the
+ * callers.
+ */
+ if (nsh)
+ dsb(nsh);
+ else
+ dsb(ish);
+
+ if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) {
+ u64 val;
+
+ /*
+ * For CPUs that are affected by ARM 1319367, we need to
+ * avoid a host Stage-1 walk while we have the guest's
+ * VMID set in the VTTBR in order to invalidate TLBs.
+ * We're guaranteed that the S1 MMU is enabled, so we can
+ * simply set the EPD bits to avoid any further TLB fill.
+ */
+ val = cxt->tcr = read_sysreg_el1(SYS_TCR);
+ val |= TCR_EPD1_MASK | TCR_EPD0_MASK;
+ write_sysreg_el1(val, SYS_TCR);
+ isb();
+ }
+
+ /*
+ * __load_stage2() includes an ISB only when the AT
+ * workaround is applied. Take care of the opposite condition,
+ * ensuring that we always have an ISB, but not two ISBs back
+ * to back.
+ */
+ __load_stage2(mmu, kern_hyp_va(mmu->arch));
+ asm(ALTERNATIVE("isb", "nop", ARM64_WORKAROUND_SPECULATIVE_AT));
+}
+
+static void __tlb_switch_to_host(struct tlb_inv_context *cxt)
+{
+ __load_host_stage2();
+
+ if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) {
+ /* Ensure write of the host VMID */
+ isb();
+ /* Restore the host's TCR_EL1 */
+ write_sysreg_el1(cxt->tcr, SYS_TCR);
+ }
+}
+
+void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu,
+ phys_addr_t ipa, int level)
+{
+ struct tlb_inv_context cxt;
+
+ /* Switch to requested VMID */
+ __tlb_switch_to_guest(mmu, &cxt, false);
+
+ /*
+ * We could do so much better if we had the VA as well.
+ * Instead, we invalidate Stage-2 for this IPA, and the
+ * whole of Stage-1. Weep...
+ */
+ ipa >>= 12;
+ __tlbi_level(ipas2e1is, ipa, level);
+
+ /*
+ * We have to ensure completion of the invalidation at Stage-2,
+ * since a table walk on another CPU could refill a TLB with a
+ * complete (S1 + S2) walk based on the old Stage-2 mapping if
+ * the Stage-1 invalidation happened first.
+ */
+ dsb(ish);
+ __tlbi(vmalle1is);
+ dsb(ish);
+ isb();
+
+ __tlb_switch_to_host(&cxt);
+}
+
+void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu,
+ phys_addr_t ipa, int level)
+{
+ struct tlb_inv_context cxt;
+
+ /* Switch to requested VMID */
+ __tlb_switch_to_guest(mmu, &cxt, true);
+
+ /*
+ * We could do so much better if we had the VA as well.
+ * Instead, we invalidate Stage-2 for this IPA, and the
+ * whole of Stage-1. Weep...
+ */
+ ipa >>= 12;
+ __tlbi_level(ipas2e1, ipa, level);
+
+ /*
+ * We have to ensure completion of the invalidation at Stage-2,
+ * since a table walk on another CPU could refill a TLB with a
+ * complete (S1 + S2) walk based on the old Stage-2 mapping if
+ * the Stage-1 invalidation happened first.
+ */
+ dsb(nsh);
+ __tlbi(vmalle1);
+ dsb(nsh);
+ isb();
+
+ __tlb_switch_to_host(&cxt);
+}
+
+void __kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu,
+ phys_addr_t start, unsigned long pages)
+{
+ struct tlb_inv_context cxt;
+ unsigned long stride;
+
+ /*
+ * Since the range of addresses may not be mapped at
+ * the same level, assume the worst case as PAGE_SIZE
+ */
+ stride = PAGE_SIZE;
+ start = round_down(start, stride);
+
+ /* Switch to requested VMID */
+ __tlb_switch_to_guest(mmu, &cxt, false);
+
+ __flush_s2_tlb_range_op(ipas2e1is, start, pages, stride, 0);
+
+ dsb(ish);
+ __tlbi(vmalle1is);
+ dsb(ish);
+ isb();
+
+ __tlb_switch_to_host(&cxt);
+}
+
+void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu)
+{
+ struct tlb_inv_context cxt;
+
+ /* Switch to requested VMID */
+ __tlb_switch_to_guest(mmu, &cxt, false);
+
+ __tlbi(vmalls12e1is);
+ dsb(ish);
+ isb();
+
+ __tlb_switch_to_host(&cxt);
+}
+
+void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu)
+{
+ struct tlb_inv_context cxt;
+
+ /* Switch to requested VMID */
+ __tlb_switch_to_guest(mmu, &cxt, false);
+
+ __tlbi(vmalle1);
+ asm volatile("ic iallu");
+ dsb(nsh);
+ isb();
+
+ __tlb_switch_to_host(&cxt);
+}
+
+void __kvm_flush_vm_context(void)
+{
+ /* Same remark as in __tlb_switch_to_guest() */
+ dsb(ish);
+ __tlbi(alle1is);
+ dsb(ish);
+}