summaryrefslogtreecommitdiff
path: root/arch/arm64
diff options
context:
space:
mode:
authorMarc Zyngier <maz@kernel.org>2024-09-12 08:37:47 +0100
committerMarc Zyngier <maz@kernel.org>2024-09-12 08:37:47 +0100
commit2e0f239457c1076e09b36350cbbdb2ed25997a1f (patch)
treeb978201255a6e53af4d90b9d885a02376d3d9663 /arch/arm64
parentf77e63e2748be6768e6e15158a3aacb2e21e4259 (diff)
parentff987ffc0c18c98f05ddc7696d56bb493b994450 (diff)
Merge branch kvm-arm64/nv-at-pan into kvmarm-master/next
* kvm-arm64/nv-at-pan: : . : Add NV support for the AT family of instructions, which mostly results : in adding a page table walker that deals with most of the complexity : of the architecture. : : From the cover letter: : : "Another task that a hypervisor supporting NV on arm64 has to deal with : is to emulate the AT instruction, because we multiplex all the S1 : translations on a single set of registers, and the guest S2 is never : truly resident on the CPU. : : So given that we lie about page tables, we also have to lie about : translation instructions, hence the emulation. Things are made : complicated by the fact that guest S1 page tables can be swapped out, : and that our shadow S2 is likely to be incomplete. So while using AT : to emulate AT is tempting (and useful), it is not going to always : work, and we thus need a fallback in the shape of a SW S1 walker." : . KVM: arm64: nv: Add support for FEAT_ATS1A KVM: arm64: nv: Plumb handling of AT S1* traps from EL2 KVM: arm64: nv: Make AT+PAN instructions aware of FEAT_PAN3 KVM: arm64: nv: Sanitise SCTLR_EL1.EPAN according to VM configuration KVM: arm64: nv: Add SW walker for AT S1 emulation KVM: arm64: nv: Make ps_to_output_size() generally available KVM: arm64: nv: Add emulation of AT S12E{0,1}{R,W} KVM: arm64: nv: Add basic emulation of AT S1E2{R,W} KVM: arm64: nv: Add basic emulation of AT S1E1{R,W}P KVM: arm64: nv: Add basic emulation of AT S1E{0,1}{R,W} KVM: arm64: nv: Honor absence of FEAT_PAN2 KVM: arm64: nv: Turn upper_attr for S2 walk into the full descriptor KVM: arm64: nv: Enforce S2 alignment when contiguous bit is set arm64: Add ESR_ELx_FSC_ADDRSZ_L() helper arm64: Add system register encoding for PSTATE.PAN arm64: Add PAR_EL1 field description arm64: Add missing APTable and TCR_ELx.HPD masks KVM: arm64: Make kvm_at() take an OP_AT_* Signed-off-by: Marc Zyngier <maz@kernel.org> # Conflicts: # arch/arm64/kvm/nested.c
Diffstat (limited to 'arch/arm64')
-rw-r--r--arch/arm64/include/asm/esr.h5
-rw-r--r--arch/arm64/include/asm/kvm_arm.h1
-rw-r--r--arch/arm64/include/asm/kvm_asm.h6
-rw-r--r--arch/arm64/include/asm/kvm_nested.h40
-rw-r--r--arch/arm64/include/asm/pgtable-hwdef.h9
-rw-r--r--arch/arm64/include/asm/sysreg.h22
-rw-r--r--arch/arm64/kvm/Makefile2
-rw-r--r--arch/arm64/kvm/at.c1101
-rw-r--r--arch/arm64/kvm/emulate-nested.c2
-rw-r--r--arch/arm64/kvm/hyp/include/hyp/fault.h2
-rw-r--r--arch/arm64/kvm/nested.c40
-rw-r--r--arch/arm64/kvm/sys_regs.c60
12 files changed, 1258 insertions, 32 deletions
diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h
index 56c148890daf..d79308c23ddb 100644
--- a/arch/arm64/include/asm/esr.h
+++ b/arch/arm64/include/asm/esr.h
@@ -122,8 +122,8 @@
#define ESR_ELx_FSC_SECC_TTW(n) (0x1c + (n))
/* Status codes for individual page table levels */
-#define ESR_ELx_FSC_ACCESS_L(n) (ESR_ELx_FSC_ACCESS + n)
-#define ESR_ELx_FSC_PERM_L(n) (ESR_ELx_FSC_PERM + n)
+#define ESR_ELx_FSC_ACCESS_L(n) (ESR_ELx_FSC_ACCESS + (n))
+#define ESR_ELx_FSC_PERM_L(n) (ESR_ELx_FSC_PERM + (n))
#define ESR_ELx_FSC_FAULT_nL (0x2C)
#define ESR_ELx_FSC_FAULT_L(n) (((n) < 0 ? ESR_ELx_FSC_FAULT_nL : \
@@ -161,6 +161,7 @@
/* ISS field definitions for exceptions taken in to Hyp */
#define ESR_ELx_FSC_ADDRSZ (0x00)
+#define ESR_ELx_FSC_ADDRSZ_L(n) (ESR_ELx_FSC_ADDRSZ + (n))
#define ESR_ELx_CV (UL(1) << 24)
#define ESR_ELx_COND_SHIFT (20)
#define ESR_ELx_COND_MASK (UL(0xF) << ESR_ELx_COND_SHIFT)
diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
index d81cc746e0eb..109a85ee6910 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -107,6 +107,7 @@
/* TCR_EL2 Registers bits */
#define TCR_EL2_DS (1UL << 32)
#define TCR_EL2_RES1 ((1U << 31) | (1 << 23))
+#define TCR_EL2_HPD (1 << 24)
#define TCR_EL2_TBI (1 << 20)
#define TCR_EL2_PS_SHIFT 16
#define TCR_EL2_PS_MASK (7 << TCR_EL2_PS_SHIFT)
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 2181a11b9d92..b36a3b6cc011 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -10,6 +10,7 @@
#include <asm/hyp_image.h>
#include <asm/insn.h>
#include <asm/virt.h>
+#include <asm/sysreg.h>
#define ARM_EXIT_WITH_SERROR_BIT 31
#define ARM_EXCEPTION_CODE(x) ((x) & ~(1U << ARM_EXIT_WITH_SERROR_BIT))
@@ -235,6 +236,9 @@ extern void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu);
extern int __kvm_tlbi_s1e2(struct kvm_s2_mmu *mmu, u64 va, u64 sys_encoding);
extern void __kvm_timer_set_cntvoff(u64 cntvoff);
+extern void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr);
+extern void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr);
+extern void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr);
extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
@@ -259,7 +263,7 @@ extern u64 __kvm_get_mdcr_el2(void);
asm volatile( \
" mrs %1, spsr_el2\n" \
" mrs %2, elr_el2\n" \
- "1: at "at_op", %3\n" \
+ "1: " __msr_s(at_op, "%3") "\n" \
" isb\n" \
" b 9f\n" \
"2: msr spsr_el2, %1\n" \
diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h
index 5b06c31035a2..e8bc6d67aba2 100644
--- a/arch/arm64/include/asm/kvm_nested.h
+++ b/arch/arm64/include/asm/kvm_nested.h
@@ -85,7 +85,7 @@ struct kvm_s2_trans {
bool readable;
int level;
u32 esr;
- u64 upper_attr;
+ u64 desc;
};
static inline phys_addr_t kvm_s2_trans_output(struct kvm_s2_trans *trans)
@@ -115,7 +115,7 @@ static inline bool kvm_s2_trans_writable(struct kvm_s2_trans *trans)
static inline bool kvm_s2_trans_executable(struct kvm_s2_trans *trans)
{
- return !(trans->upper_attr & BIT(54));
+ return !(trans->desc & BIT(54));
}
extern int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa,
@@ -205,4 +205,40 @@ static inline u64 kvm_encode_nested_level(struct kvm_s2_trans *trans)
return FIELD_PREP(KVM_NV_GUEST_MAP_SZ, trans->level);
}
+/* Adjust alignment for the contiguous bit as per StageOA() */
+#define contiguous_bit_shift(d, wi, l) \
+ ({ \
+ u8 shift = 0; \
+ \
+ if ((d) & PTE_CONT) { \
+ switch (BIT((wi)->pgshift)) { \
+ case SZ_4K: \
+ shift = 4; \
+ break; \
+ case SZ_16K: \
+ shift = (l) == 2 ? 5 : 7; \
+ break; \
+ case SZ_64K: \
+ shift = 5; \
+ break; \
+ } \
+ } \
+ \
+ shift; \
+ })
+
+static inline unsigned int ps_to_output_size(unsigned int ps)
+{
+ switch (ps) {
+ case 0: return 32;
+ case 1: return 36;
+ case 2: return 40;
+ case 3: return 42;
+ case 4: return 44;
+ case 5:
+ default:
+ return 48;
+ }
+}
+
#endif /* __ARM64_KVM_NESTED_H */
diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
index 1f60aa1bc750..07dfbdb14bab 100644
--- a/arch/arm64/include/asm/pgtable-hwdef.h
+++ b/arch/arm64/include/asm/pgtable-hwdef.h
@@ -205,6 +205,11 @@
#define PTE_S2_MEMATTR(t) (_AT(pteval_t, (t)) << 2)
/*
+ * Hierarchical permission for Stage-1 tables
+ */
+#define S1_TABLE_AP (_AT(pmdval_t, 3) << 61)
+
+/*
* Highest possible physical address supported.
*/
#define PHYS_MASK_SHIFT (CONFIG_ARM64_PA_BITS)
@@ -298,6 +303,10 @@
#define TCR_TBI1 (UL(1) << 38)
#define TCR_HA (UL(1) << 39)
#define TCR_HD (UL(1) << 40)
+#define TCR_HPD0_SHIFT 41
+#define TCR_HPD0 (UL(1) << TCR_HPD0_SHIFT)
+#define TCR_HPD1_SHIFT 42
+#define TCR_HPD1 (UL(1) << TCR_HPD1_SHIFT)
#define TCR_TBID0 (UL(1) << 51)
#define TCR_TBID1 (UL(1) << 52)
#define TCR_NFD0 (UL(1) << 53)
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 4a9ea103817e..bc161f160854 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -109,6 +109,9 @@
#define set_pstate_ssbs(x) asm volatile(SET_PSTATE_SSBS(x))
#define set_pstate_dit(x) asm volatile(SET_PSTATE_DIT(x))
+/* Register-based PAN access, for save/restore purposes */
+#define SYS_PSTATE_PAN sys_reg(3, 0, 4, 2, 3)
+
#define __SYS_BARRIER_INSN(CRm, op2, Rt) \
__emit_inst(0xd5000000 | sys_insn(0, 3, 3, (CRm), (op2)) | ((Rt) & 0x1f))
@@ -325,7 +328,25 @@
#define SYS_PAR_EL1 sys_reg(3, 0, 7, 4, 0)
#define SYS_PAR_EL1_F BIT(0)
+/* When PAR_EL1.F == 1 */
#define SYS_PAR_EL1_FST GENMASK(6, 1)
+#define SYS_PAR_EL1_PTW BIT(8)
+#define SYS_PAR_EL1_S BIT(9)
+#define SYS_PAR_EL1_AssuredOnly BIT(12)
+#define SYS_PAR_EL1_TopLevel BIT(13)
+#define SYS_PAR_EL1_Overlay BIT(14)
+#define SYS_PAR_EL1_DirtyBit BIT(15)
+#define SYS_PAR_EL1_F1_IMPDEF GENMASK_ULL(63, 48)
+#define SYS_PAR_EL1_F1_RES0 (BIT(7) | BIT(10) | GENMASK_ULL(47, 16))
+#define SYS_PAR_EL1_RES1 BIT(11)
+/* When PAR_EL1.F == 0 */
+#define SYS_PAR_EL1_SH GENMASK_ULL(8, 7)
+#define SYS_PAR_EL1_NS BIT(9)
+#define SYS_PAR_EL1_F0_IMPDEF BIT(10)
+#define SYS_PAR_EL1_NSE BIT(11)
+#define SYS_PAR_EL1_PA GENMASK_ULL(51, 12)
+#define SYS_PAR_EL1_ATTR GENMASK_ULL(63, 56)
+#define SYS_PAR_EL1_F0_RES0 (GENMASK_ULL(6, 1) | GENMASK_ULL(55, 52))
/*** Statistical Profiling Extension ***/
#define PMSEVFR_EL1_RES0_IMP \
@@ -652,6 +673,7 @@
#define OP_AT_S12E1W sys_insn(AT_Op0, 4, AT_CRn, 8, 5)
#define OP_AT_S12E0R sys_insn(AT_Op0, 4, AT_CRn, 8, 6)
#define OP_AT_S12E0W sys_insn(AT_Op0, 4, AT_CRn, 8, 7)
+#define OP_AT_S1E2A sys_insn(AT_Op0, 4, AT_CRn, 9, 2)
/* TLBI instructions */
#define TLBI_Op0 1
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index 86a629aaf0a1..971258b025f9 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -17,7 +17,7 @@ kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \
inject_fault.o va_layout.o handle_exit.o \
guest.o debug.o reset.o sys_regs.o stacktrace.o \
vgic-sys-reg-v3.o fpsimd.o pkvm.o \
- arch_timer.o trng.o vmid.o emulate-nested.o nested.o \
+ arch_timer.o trng.o vmid.o emulate-nested.o nested.o at.o \
vgic/vgic.o vgic/vgic-init.o \
vgic/vgic-irqfd.o vgic/vgic-v2.o \
vgic/vgic-v3.o vgic/vgic-v4.o \
diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
new file mode 100644
index 000000000000..39f0e87a340e
--- /dev/null
+++ b/arch/arm64/kvm/at.c
@@ -0,0 +1,1101 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2017 - Linaro Ltd
+ * Author: Jintack Lim <jintack.lim@linaro.org>
+ */
+
+#include <linux/kvm_host.h>
+
+#include <asm/esr.h>
+#include <asm/kvm_hyp.h>
+#include <asm/kvm_mmu.h>
+
+enum trans_regime {
+ TR_EL10,
+ TR_EL20,
+ TR_EL2,
+};
+
+struct s1_walk_info {
+ u64 baddr;
+ enum trans_regime regime;
+ unsigned int max_oa_bits;
+ unsigned int pgshift;
+ unsigned int txsz;
+ int sl;
+ bool hpd;
+ bool be;
+ bool s2;
+};
+
+struct s1_walk_result {
+ union {
+ struct {
+ u64 desc;
+ u64 pa;
+ s8 level;
+ u8 APTable;
+ bool UXNTable;
+ bool PXNTable;
+ };
+ struct {
+ u8 fst;
+ bool ptw;
+ bool s2;
+ };
+ };
+ bool failed;
+};
+
+static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool ptw, bool s2)
+{
+ wr->fst = fst;
+ wr->ptw = ptw;
+ wr->s2 = s2;
+ wr->failed = true;
+}
+
+#define S1_MMU_DISABLED (-127)
+
+static int get_ia_size(struct s1_walk_info *wi)
+{
+ return 64 - wi->txsz;
+}
+
+/* Return true if the IPA is out of the OA range */
+static bool check_output_size(u64 ipa, struct s1_walk_info *wi)
+{
+ return wi->max_oa_bits < 48 && (ipa & GENMASK_ULL(47, wi->max_oa_bits));
+}
+
+/* Return the translation regime that applies to an AT instruction */
+static enum trans_regime compute_translation_regime(struct kvm_vcpu *vcpu, u32 op)
+{
+ /*
+ * We only get here from guest EL2, so the translation
+ * regime AT applies to is solely defined by {E2H,TGE}.
+ */
+ switch (op) {
+ case OP_AT_S1E2R:
+ case OP_AT_S1E2W:
+ case OP_AT_S1E2A:
+ return vcpu_el2_e2h_is_set(vcpu) ? TR_EL20 : TR_EL2;
+ break;
+ default:
+ return (vcpu_el2_e2h_is_set(vcpu) &&
+ vcpu_el2_tge_is_set(vcpu)) ? TR_EL20 : TR_EL10;
+ }
+}
+
+static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi,
+ struct s1_walk_result *wr, u64 va)
+{
+ u64 hcr, sctlr, tcr, tg, ps, ia_bits, ttbr;
+ unsigned int stride, x;
+ bool va55, tbi, lva, as_el0;
+
+ hcr = __vcpu_sys_reg(vcpu, HCR_EL2);
+
+ wi->regime = compute_translation_regime(vcpu, op);
+ as_el0 = (op == OP_AT_S1E0R || op == OP_AT_S1E0W);
+
+ va55 = va & BIT(55);
+
+ if (wi->regime == TR_EL2 && va55)
+ goto addrsz;
+
+ wi->s2 = wi->regime == TR_EL10 && (hcr & (HCR_VM | HCR_DC));
+
+ switch (wi->regime) {
+ case TR_EL10:
+ sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1);
+ tcr = vcpu_read_sys_reg(vcpu, TCR_EL1);
+ ttbr = (va55 ?
+ vcpu_read_sys_reg(vcpu, TTBR1_EL1) :
+ vcpu_read_sys_reg(vcpu, TTBR0_EL1));
+ break;
+ case TR_EL2:
+ case TR_EL20:
+ sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2);
+ tcr = vcpu_read_sys_reg(vcpu, TCR_EL2);
+ ttbr = (va55 ?
+ vcpu_read_sys_reg(vcpu, TTBR1_EL2) :
+ vcpu_read_sys_reg(vcpu, TTBR0_EL2));
+ break;
+ default:
+ BUG();
+ }
+
+ tbi = (wi->regime == TR_EL2 ?
+ FIELD_GET(TCR_EL2_TBI, tcr) :
+ (va55 ?
+ FIELD_GET(TCR_TBI1, tcr) :
+ FIELD_GET(TCR_TBI0, tcr)));
+
+ if (!tbi && (u64)sign_extend64(va, 55) != va)
+ goto addrsz;
+
+ va = (u64)sign_extend64(va, 55);
+
+ /* Let's put the MMU disabled case aside immediately */
+ switch (wi->regime) {
+ case TR_EL10:
+ /*
+ * If dealing with the EL1&0 translation regime, 3 things
+ * can disable the S1 translation:
+ *
+ * - HCR_EL2.DC = 1
+ * - HCR_EL2.{E2H,TGE} = {0,1}
+ * - SCTLR_EL1.M = 0
+ *
+ * The TGE part is interesting. If we have decided that this
+ * is EL1&0, then it means that either {E2H,TGE} == {1,0} or
+ * {0,x}, and we only need to test for TGE == 1.
+ */
+ if (hcr & (HCR_DC | HCR_TGE)) {
+ wr->level = S1_MMU_DISABLED;
+ break;
+ }
+ fallthrough;
+ case TR_EL2:
+ case TR_EL20:
+ if (!(sctlr & SCTLR_ELx_M))
+ wr->level = S1_MMU_DISABLED;
+ break;
+ }
+
+ if (wr->level == S1_MMU_DISABLED) {
+ if (va >= BIT(kvm_get_pa_bits(vcpu->kvm)))
+ goto addrsz;
+
+ wr->pa = va;
+ return 0;
+ }
+
+ wi->be = sctlr & SCTLR_ELx_EE;
+
+ wi->hpd = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HPDS, IMP);
+ wi->hpd &= (wi->regime == TR_EL2 ?
+ FIELD_GET(TCR_EL2_HPD, tcr) :
+ (va55 ?
+ FIELD_GET(TCR_HPD1, tcr) :
+ FIELD_GET(TCR_HPD0, tcr)));
+
+ /* Someone was silly enough to encode TG0/TG1 differently */
+ if (va55) {
+ wi->txsz = FIELD_GET(TCR_T1SZ_MASK, tcr);
+ tg = FIELD_GET(TCR_TG1_MASK, tcr);
+
+ switch (tg << TCR_TG1_SHIFT) {
+ case TCR_TG1_4K:
+ wi->pgshift = 12; break;
+ case TCR_TG1_16K:
+ wi->pgshift = 14; break;
+ case TCR_TG1_64K:
+ default: /* IMPDEF: treat any other value as 64k */
+ wi->pgshift = 16; break;
+ }
+ } else {
+ wi->txsz = FIELD_GET(TCR_T0SZ_MASK, tcr);
+ tg = FIELD_GET(TCR_TG0_MASK, tcr);
+
+ switch (tg << TCR_TG0_SHIFT) {
+ case TCR_TG0_4K:
+ wi->pgshift = 12; break;
+ case TCR_TG0_16K:
+ wi->pgshift = 14; break;
+ case TCR_TG0_64K:
+ default: /* IMPDEF: treat any other value as 64k */
+ wi->pgshift = 16; break;
+ }
+ }
+
+ /* R_PLCGL, R_YXNYW */
+ if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR2_EL1, ST, 48_47)) {
+ if (wi->txsz > 39)
+ goto transfault_l0;
+ } else {
+ if (wi->txsz > 48 || (BIT(wi->pgshift) == SZ_64K && wi->txsz > 47))
+ goto transfault_l0;
+ }
+
+ /* R_GTJBY, R_SXWGM */
+ switch (BIT(wi->pgshift)) {
+ case SZ_4K:
+ lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN4, 52_BIT);
+ lva &= tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS);
+ break;
+ case SZ_16K:
+ lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN16, 52_BIT);
+ lva &= tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS);
+ break;
+ case SZ_64K:
+ lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, VARange, 52);
+ break;
+ }
+
+ if ((lva && wi->txsz < 12) || (!lva && wi->txsz < 16))
+ goto transfault_l0;
+
+ ia_bits = get_ia_size(wi);
+
+ /* R_YYVYV, I_THCZK */
+ if ((!va55 && va > GENMASK(ia_bits - 1, 0)) ||
+ (va55 && va < GENMASK(63, ia_bits)))
+ goto transfault_l0;
+
+ /* I_ZFSYQ */
+ if (wi->regime != TR_EL2 &&
+ (tcr & (va55 ? TCR_EPD1_MASK : TCR_EPD0_MASK)))
+ goto transfault_l0;
+
+ /* R_BNDVG and following statements */
+ if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, E0PD, IMP) &&
+ as_el0 && (tcr & (va55 ? TCR_E0PD1 : TCR_E0PD0)))
+ goto transfault_l0;
+
+ /* AArch64.S1StartLevel() */
+ stride = wi->pgshift - 3;
+ wi->sl = 3 - (((ia_bits - 1) - wi->pgshift) / stride);
+
+ ps = (wi->regime == TR_EL2 ?
+ FIELD_GET(TCR_EL2_PS_MASK, tcr) : FIELD_GET(TCR_IPS_MASK, tcr));
+
+ wi->max_oa_bits = min(get_kvm_ipa_limit(), ps_to_output_size(ps));
+
+ /* Compute minimal alignment */
+ x = 3 + ia_bits - ((3 - wi->sl) * stride + wi->pgshift);
+
+ wi->baddr = ttbr & TTBRx_EL1_BADDR;
+
+ /* R_VPBBF */
+ if (check_output_size(wi->baddr, wi))
+ goto addrsz;
+
+ wi->baddr &= GENMASK_ULL(wi->max_oa_bits - 1, x);
+
+ return 0;
+
+addrsz: /* Address Size Fault level 0 */
+ fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(0), false, false);
+ return -EFAULT;
+
+transfault_l0: /* Translation Fault level 0 */
+ fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(0), false, false);
+ return -EFAULT;
+}
+
+static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
+ struct s1_walk_result *wr, u64 va)
+{
+ u64 va_top, va_bottom, baddr, desc;
+ int level, stride, ret;
+
+ level = wi->sl;
+ stride = wi->pgshift - 3;
+ baddr = wi->baddr;
+
+ va_top = get_ia_size(wi) - 1;
+
+ while (1) {
+ u64 index, ipa;
+
+ va_bottom = (3 - level) * stride + wi->pgshift;
+ index = (va & GENMASK_ULL(va_top, va_bottom)) >> (va_bottom - 3);
+
+ ipa = baddr | index;
+
+ if (wi->s2) {
+ struct kvm_s2_trans s2_trans = {};
+
+ ret = kvm_walk_nested_s2(vcpu, ipa, &s2_trans);
+ if (ret) {
+ fail_s1_walk(wr,
+ (s2_trans.esr & ~ESR_ELx_FSC_LEVEL) | level,
+ true, true);
+ return ret;
+ }
+
+ if (!kvm_s2_trans_readable(&s2_trans)) {
+ fail_s1_walk(wr, ESR_ELx_FSC_PERM_L(level),
+ true, true);
+
+ return -EPERM;
+ }
+
+ ipa = kvm_s2_trans_output(&s2_trans);
+ }
+
+ ret = kvm_read_guest(vcpu->kvm, ipa, &desc, sizeof(desc));
+ if (ret) {
+ fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level),
+ true, false);
+ return ret;
+ }
+
+ if (wi->be)
+ desc = be64_to_cpu((__force __be64)desc);
+ else
+ desc = le64_to_cpu((__force __le64)desc);
+
+ /* Invalid descriptor */
+ if (!(desc & BIT(0)))
+ goto transfault;
+
+ /* Block mapping, check validity down the line */
+ if (!(desc & BIT(1)))
+ break;
+
+ /* Page mapping */
+ if (level == 3)
+ break;
+
+ /* Table handling */
+ if (!wi->hpd) {
+ wr->APTable |= FIELD_GET(S1_TABLE_AP, desc);
+ wr->UXNTable |= FIELD_GET(PMD_TABLE_UXN, desc);
+ wr->PXNTable |= FIELD_GET(PMD_TABLE_PXN, desc);
+ }
+
+ baddr = desc & GENMASK_ULL(47, wi->pgshift);
+
+ /* Check for out-of-range OA */
+ if (check_output_size(baddr, wi))
+ goto addrsz;
+
+ /* Prepare for next round */
+ va_top = va_bottom - 1;
+ level++;
+ }
+
+ /* Block mapping, check the validity of the level */
+ if (!(desc & BIT(1))) {
+ bool valid_block = false;
+
+ switch (BIT(wi->pgshift)) {
+ case SZ_4K:
+ valid_block = level == 1 || level == 2;
+ break;
+ case SZ_16K:
+ case SZ_64K:
+ valid_block = level == 2;
+ break;
+ }
+
+ if (!valid_block)
+ goto transfault;
+ }
+
+ if (check_output_size(desc & GENMASK(47, va_bottom), wi))
+ goto addrsz;
+
+ va_bottom += contiguous_bit_shift(desc, wi, level);
+
+ wr->failed = false;
+ wr->level = level;
+ wr->desc = desc;
+ wr->pa = desc & GENMASK(47, va_bottom);
+ wr->pa |= va & GENMASK_ULL(va_bottom - 1, 0);
+
+ return 0;
+
+addrsz:
+ fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(level), true, false);
+ return -EINVAL;
+transfault:
+ fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(level), true, false);
+ return -ENOENT;
+}
+
+struct mmu_config {
+ u64 ttbr0;
+ u64 ttbr1;
+ u64 tcr;
+ u64 mair;
+ u64 sctlr;
+ u64 vttbr;
+ u64 vtcr;
+ u64 hcr;
+};
+
+static void __mmu_config_save(struct mmu_config *config)
+{
+ config->ttbr0 = read_sysreg_el1(SYS_TTBR0);
+ config->ttbr1 = read_sysreg_el1(SYS_TTBR1);
+ config->tcr = read_sysreg_el1(SYS_TCR);
+ config->mair = read_sysreg_el1(SYS_MAIR);
+ config->sctlr = read_sysreg_el1(SYS_SCTLR);
+ config->vttbr = read_sysreg(vttbr_el2);
+ config->vtcr = read_sysreg(vtcr_el2);
+ config->hcr = read_sysreg(hcr_el2);
+}
+
+static void __mmu_config_restore(struct mmu_config *config)
+{
+ write_sysreg(config->hcr, hcr_el2);
+
+ /*
+ * ARM errata 1165522 and 1530923 require TGE to be 1 before
+ * we update the guest state.
+ */
+ asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT));
+
+ write_sysreg_el1(config->ttbr0, SYS_TTBR0);
+ write_sysreg_el1(config->ttbr1, SYS_TTBR1);
+ write_sysreg_el1(config->tcr, SYS_TCR);
+ write_sysreg_el1(config->mair, SYS_MAIR);
+ write_sysreg_el1(config->sctlr, SYS_SCTLR);
+ write_sysreg(config->vttbr, vttbr_el2);
+ write_sysreg(config->vtcr, vtcr_el2);
+}
+
+static bool at_s1e1p_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
+{
+ u64 host_pan;
+ bool fail;
+
+ host_pan = read_sysreg_s(SYS_PSTATE_PAN);
+ write_sysreg_s(*vcpu_cpsr(vcpu) & PSTATE_PAN, SYS_PSTATE_PAN);
+
+ switch (op) {
+ case OP_AT_S1E1RP:
+ fail = __kvm_at(OP_AT_S1E1RP, vaddr);
+ break;
+ case OP_AT_S1E1WP:
+ fail = __kvm_at(OP_AT_S1E1WP, vaddr);
+ break;
+ }
+
+ write_sysreg_s(host_pan, SYS_PSTATE_PAN);
+
+ return fail;
+}
+
+#define MEMATTR(ic, oc) (MEMATTR_##oc << 4 | MEMATTR_##ic)
+#define MEMATTR_NC 0b0100
+#define MEMATTR_Wt 0b1000
+#define MEMATTR_Wb 0b1100
+#define MEMATTR_WbRaWa 0b1111
+
+#define MEMATTR_IS_DEVICE(m) (((m) & GENMASK(7, 4)) == 0)
+
+static u8 s2_memattr_to_attr(u8 memattr)
+{
+ memattr &= 0b1111;
+
+ switch (memattr) {
+ case 0b0000:
+ case 0b0001:
+ case 0b0010:
+ case 0b0011:
+ return memattr << 2;
+ case 0b0100:
+ return MEMATTR(Wb, Wb);
+ case 0b0101:
+ return MEMATTR(NC, NC);
+ case 0b0110:
+ return MEMATTR(Wt, NC);
+ case 0b0111:
+ return MEMATTR(Wb, NC);
+ case 0b1000:
+ /* Reserved, assume NC */
+ return MEMATTR(NC, NC);
+ case 0b1001:
+ return MEMATTR(NC, Wt);
+ case 0b1010:
+ return MEMATTR(Wt, Wt);
+ case 0b1011:
+ return MEMATTR(Wb, Wt);
+ case 0b1100:
+ /* Reserved, assume NC */
+ return MEMATTR(NC, NC);
+ case 0b1101:
+ return MEMATTR(NC, Wb);
+ case 0b1110:
+ return MEMATTR(Wt, Wb);
+ case 0b1111:
+ return MEMATTR(Wb, Wb);
+ default:
+ unreachable();
+ }
+}
+
+static u8 combine_s1_s2_attr(u8 s1, u8 s2)
+{
+ bool transient;
+ u8 final = 0;
+
+ /* Upgrade transient s1 to non-transient to simplify things */
+ switch (s1) {
+ case 0b0001 ... 0b0011: /* Normal, Write-Through Transient */
+ transient = true;
+ s1 = MEMATTR_Wt | (s1 & GENMASK(1,0));
+ break;
+ case 0b0101 ... 0b0111: /* Normal, Write-Back Transient */
+ transient = true;
+ s1 = MEMATTR_Wb | (s1 & GENMASK(1,0));
+ break;
+ default:
+ transient = false;
+ }
+
+ /* S2CombineS1AttrHints() */
+ if ((s1 & GENMASK(3, 2)) == MEMATTR_NC ||
+ (s2 & GENMASK(3, 2)) == MEMATTR_NC)
+ final = MEMATTR_NC;
+ else if ((s1 & GENMASK(3, 2)) == MEMATTR_Wt ||
+ (s2 & GENMASK(3, 2)) == MEMATTR_Wt)
+ final = MEMATTR_Wt;
+ else
+ final = MEMATTR_Wb;
+
+ if (final != MEMATTR_NC) {
+ /* Inherit RaWa hints form S1 */
+ if (transient) {
+ switch (s1 & GENMASK(3, 2)) {
+ case MEMATTR_Wt:
+ final = 0;
+ break;
+ case MEMATTR_Wb:
+ final = MEMATTR_NC;
+ break;
+ }
+ }
+
+ final |= s1 & GENMASK(1, 0);
+ }
+
+ return final;
+}
+
+#define ATTR_NSH 0b00
+#define ATTR_RSV 0b01
+#define ATTR_OSH 0b10
+#define ATTR_ISH 0b11
+
+static u8 compute_sh(u8 attr, u64 desc)
+{
+ u8 sh;
+
+ /* Any form of device, as well as NC has SH[1:0]=0b10 */
+ if (MEMATTR_IS_DEVICE(attr) || attr == MEMATTR(NC, NC))
+ return ATTR_OSH;
+
+ sh = FIELD_GET(PTE_SHARED, desc);
+ if (sh == ATTR_RSV) /* Reserved, mapped to NSH */
+ sh = ATTR_NSH;
+
+ return sh;
+}
+
+static u8 combine_sh(u8 s1_sh, u8 s2_sh)
+{
+ if (s1_sh == ATTR_OSH || s2_sh == ATTR_OSH)
+ return ATTR_OSH;
+ if (s1_sh == ATTR_ISH || s2_sh == ATTR_ISH)
+ return ATTR_ISH;
+
+ return ATTR_NSH;
+}
+
+static u64 compute_par_s12(struct kvm_vcpu *vcpu, u64 s1_par,
+ struct kvm_s2_trans *tr)
+{
+ u8 s1_parattr, s2_memattr, final_attr;
+ u64 par;
+
+ /* If S2 has failed to translate, report the damage */
+ if (tr->esr) {
+ par = SYS_PAR_EL1_RES1;
+ par |= SYS_PAR_EL1_F;
+ par |= SYS_PAR_EL1_S;
+ par |= FIELD_PREP(SYS_PAR_EL1_FST, tr->esr);
+ return par;
+ }
+
+ s1_parattr = FIELD_GET(SYS_PAR_EL1_ATTR, s1_par);
+ s2_memattr = FIELD_GET(GENMASK(5, 2), tr->desc);
+
+ if (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_FWB) {
+ if (!kvm_has_feat(vcpu->kvm, ID_AA64PFR2_EL1, MTEPERM, IMP))
+ s2_memattr &= ~BIT(3);
+
+ /* Combination of R_VRJSW and R_RHWZM */
+ switch (s2_memattr) {
+ case 0b0101:
+ if (MEMATTR_IS_DEVICE(s1_parattr))
+ final_attr = s1_parattr;
+ else
+ final_attr = MEMATTR(NC, NC);
+ break;
+ case 0b0110:
+ case 0b1110:
+ final_attr = MEMATTR(WbRaWa, WbRaWa);
+ break;
+ case 0b0111:
+ case 0b1111:
+ /* Preserve S1 attribute */
+ final_attr = s1_parattr;
+ break;
+ case 0b0100:
+ case 0b1100:
+ case 0b1101:
+ /* Reserved, do something non-silly */
+ final_attr = s1_parattr;
+ break;
+ default:
+ /* MemAttr[2]=0, Device from S2 */
+ final_attr = s2_memattr & GENMASK(1,0) << 2;
+ }
+ } else {
+ /* Combination of R_HMNDG, R_TNHFM and R_GQFSF */
+ u8 s2_parattr = s2_memattr_to_attr(s2_memattr);
+
+ if (MEMATTR_IS_DEVICE(s1_parattr) ||
+ MEMATTR_IS_DEVICE(s2_parattr)) {
+ final_attr = min(s1_parattr, s2_parattr);
+ } else {
+ /* At this stage, this is memory vs memory */
+ final_attr = combine_s1_s2_attr(s1_parattr & 0xf,
+ s2_parattr & 0xf);
+ final_attr |= combine_s1_s2_attr(s1_parattr >> 4,
+ s2_parattr >> 4) << 4;
+ }
+ }
+
+ if ((__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_CD) &&
+ !MEMATTR_IS_DEVICE(final_attr))
+ final_attr = MEMATTR(NC, NC);
+
+ par = FIELD_PREP(SYS_PAR_EL1_ATTR, final_attr);
+ par |= tr->output & GENMASK(47, 12);
+ par |= FIELD_PREP(SYS_PAR_EL1_SH,
+ combine_sh(FIELD_GET(SYS_PAR_EL1_SH, s1_par),
+ compute_sh(final_attr, tr->desc)));
+
+ return par;
+}
+
+static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_result *wr,
+ enum trans_regime regime)
+{
+ u64 par;
+
+ if (wr->failed) {
+ par = SYS_PAR_EL1_RES1;
+ par |= SYS_PAR_EL1_F;
+ par |= FIELD_PREP(SYS_PAR_EL1_FST, wr->fst);
+ par |= wr->ptw ? SYS_PAR_EL1_PTW : 0;
+ par |= wr->s2 ? SYS_PAR_EL1_S : 0;
+ } else if (wr->level == S1_MMU_DISABLED) {
+ /* MMU off or HCR_EL2.DC == 1 */
+ par = SYS_PAR_EL1_NSE;
+ par |= wr->pa & GENMASK_ULL(47, 12);
+
+ if (regime == TR_EL10 &&
+ (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) {
+ par |= FIELD_PREP(SYS_PAR_EL1_ATTR,
+ MEMATTR(WbRaWa, WbRaWa));
+ par |= FIELD_PREP(SYS_PAR_EL1_SH, ATTR_NSH);
+ } else {
+ par |= FIELD_PREP(SYS_PAR_EL1_ATTR, 0); /* nGnRnE */
+ par |= FIELD_PREP(SYS_PAR_EL1_SH, ATTR_OSH);
+ }
+ } else {
+ u64 mair, sctlr;
+ u8 sh;
+
+ par = SYS_PAR_EL1_NSE;
+
+ mair = (regime == TR_EL10 ?
+ vcpu_read_sys_reg(vcpu, MAIR_EL1) :
+ vcpu_read_sys_reg(vcpu, MAIR_EL2));
+
+ mair >>= FIELD_GET(PTE_ATTRINDX_MASK, wr->desc) * 8;
+ mair &= 0xff;
+
+ sctlr = (regime == TR_EL10 ?
+ vcpu_read_sys_reg(vcpu, SCTLR_EL1) :
+ vcpu_read_sys_reg(vcpu, SCTLR_EL2));
+
+ /* Force NC for memory if SCTLR_ELx.C is clear */
+ if (!(sctlr & SCTLR_EL1_C) && !MEMATTR_IS_DEVICE(mair))
+ mair = MEMATTR(NC, NC);
+
+ par |= FIELD_PREP(SYS_PAR_EL1_ATTR, mair);
+ par |= wr->pa & GENMASK_ULL(47, 12);
+
+ sh = compute_sh(mair, wr->desc);
+ par |= FIELD_PREP(SYS_PAR_EL1_SH, sh);
+ }
+
+ return par;
+}
+
+static bool pan3_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime)
+{
+ u64 sctlr;
+
+ if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, PAN, PAN3))
+ return false;
+
+ if (regime == TR_EL10)
+ sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1);
+ else
+ sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2);
+
+ return sctlr & SCTLR_EL1_EPAN;
+}
+
+static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
+{
+ bool perm_fail, ur, uw, ux, pr, pw, px;
+ struct s1_walk_result wr = {};
+ struct s1_walk_info wi = {};
+ int ret, idx;
+
+ ret = setup_s1_walk(vcpu, op, &wi, &wr, vaddr);
+ if (ret)
+ goto compute_par;
+
+ if (wr.level == S1_MMU_DISABLED)
+ goto compute_par;
+
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+ ret = walk_s1(vcpu, &wi, &wr, vaddr);
+
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+ if (ret)
+ goto compute_par;
+
+ /* FIXME: revisit when adding indirect permission support */
+ /* AArch64.S1DirectBasePermissions() */
+ if (wi.regime != TR_EL2) {
+ switch (FIELD_GET(PTE_USER | PTE_RDONLY, wr.desc)) {
+ case 0b00:
+ pr = pw = true;
+ ur = uw = false;
+ break;
+ case 0b01:
+ pr = pw = ur = uw = true;
+ break;
+ case 0b10:
+ pr = true;
+ pw = ur = uw = false;
+ break;
+ case 0b11:
+ pr = ur = true;
+ pw = uw = false;
+ break;
+ }
+
+ switch (wr.APTable) {
+ case 0b00:
+ break;
+ case 0b01:
+ ur = uw = false;
+ break;
+ case 0b10:
+ pw = uw = false;
+ break;
+ case 0b11:
+ pw = ur = uw = false;
+ break;
+ }
+
+ /* We don't use px for anything yet, but hey... */
+ px = !((wr.desc & PTE_PXN) || wr.PXNTable || uw);
+ ux = !((wr.desc & PTE_UXN) || wr.UXNTable);
+
+ if (op == OP_AT_S1E1RP || op == OP_AT_S1E1WP) {
+ bool pan;
+
+ pan = *vcpu_cpsr(vcpu) & PSR_PAN_BIT;
+ pan &= ur || uw || (pan3_enabled(vcpu, wi.regime) && ux);
+ pw &= !pan;
+ pr &= !pan;
+ }
+ } else {
+ ur = uw = ux = false;
+
+ if (!(wr.desc & PTE_RDONLY)) {
+ pr = pw = true;
+ } else {
+ pr = true;
+ pw = false;
+ }
+
+ if (wr.APTable & BIT(1))
+ pw = false;
+
+ /* XN maps to UXN */
+ px = !((wr.desc & PTE_UXN) || wr.UXNTable);
+ }
+
+ perm_fail = false;
+
+ switch (op) {
+ case OP_AT_S1E1RP:
+ case OP_AT_S1E1R:
+ case OP_AT_S1E2R:
+ perm_fail = !pr;
+ break;
+ case OP_AT_S1E1WP:
+ case OP_AT_S1E1W:
+ case OP_AT_S1E2W:
+ perm_fail = !pw;
+ break;
+ case OP_AT_S1E0R:
+ perm_fail = !ur;
+ break;
+ case OP_AT_S1E0W:
+ perm_fail = !uw;
+ break;
+ case OP_AT_S1E1A:
+ case OP_AT_S1E2A:
+ break;
+ default:
+ BUG();
+ }
+
+ if (perm_fail)
+ fail_s1_walk(&wr, ESR_ELx_FSC_PERM_L(wr.level), false, false);
+
+compute_par:
+ return compute_par_s1(vcpu, &wr, wi.regime);
+}
+
+/*
+ * Return the PAR_EL1 value as the result of a valid translation.
+ *
+ * If the translation is unsuccessful, the value may only contain
+ * PAR_EL1.F, and cannot be taken at face value. It isn't an
+ * indication of the translation having failed, only that the fast
+ * path did not succeed, *unless* it indicates a S1 permission fault.
+ */
+static u64 __kvm_at_s1e01_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
+{
+ struct mmu_config config;
+ struct kvm_s2_mmu *mmu;
+ bool fail;
+ u64 par;
+
+ par = SYS_PAR_EL1_F;
+
+ /*
+ * We've trapped, so everything is live on the CPU. As we will
+ * be switching contexts behind everybody's back, disable
+ * interrupts while holding the mmu lock.
+ */
+ guard(write_lock_irqsave)(&vcpu->kvm->mmu_lock);
+
+ /*
+ * If HCR_EL2.{E2H,TGE} == {1,1}, the MMU context is already
+ * the right one (as we trapped from vEL2). If not, save the
+ * full MMU context.
+ */
+ if (vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu))
+ goto skip_mmu_switch;
+
+ /*
+ * Obtaining the S2 MMU for a L2 is horribly racy, and we may not
+ * find it (recycled by another vcpu, for example). When this
+ * happens, admit defeat immediately and use the SW (slow) path.
+ */
+ mmu = lookup_s2_mmu(vcpu);
+ if (!mmu)
+ return par;
+
+ __mmu_config_save(&config);
+
+ write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR0_EL1), SYS_TTBR0);
+ write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR1_EL1), SYS_TTBR1);
+ write_sysreg_el1(vcpu_read_sys_reg(vcpu, TCR_EL1), SYS_TCR);
+ write_sysreg_el1(vcpu_read_sys_reg(vcpu, MAIR_EL1), SYS_MAIR);
+ write_sysreg_el1(vcpu_read_sys_reg(vcpu, SCTLR_EL1), SYS_SCTLR);
+ __load_stage2(mmu, mmu->arch);
+
+skip_mmu_switch:
+ /* Clear TGE, enable S2 translation, we're rolling */
+ write_sysreg((config.hcr & ~HCR_TGE) | HCR_VM, hcr_el2);
+ isb();
+
+ switch (op) {
+ case OP_AT_S1E1RP:
+ case OP_AT_S1E1WP:
+ fail = at_s1e1p_fast(vcpu, op, vaddr);
+ break;
+ case OP_AT_S1E1R:
+ fail = __kvm_at(OP_AT_S1E1R, vaddr);
+ break;
+ case OP_AT_S1E1W:
+ fail = __kvm_at(OP_AT_S1E1W, vaddr);
+ break;
+ case OP_AT_S1E0R:
+ fail = __kvm_at(OP_AT_S1E0R, vaddr);
+ break;
+ case OP_AT_S1E0W:
+ fail = __kvm_at(OP_AT_S1E0W, vaddr);
+ break;
+ case OP_AT_S1E1A:
+ fail = __kvm_at(OP_AT_S1E1A, vaddr);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ fail = true;
+ break;
+ }
+
+ if (!fail)
+ par = read_sysreg_par();
+
+ if (!(vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)))
+ __mmu_config_restore(&config);
+
+ return par;
+}
+
+static bool par_check_s1_perm_fault(u64 par)
+{
+ u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par);
+
+ return ((fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_PERM &&
+ !(par & SYS_PAR_EL1_S));
+}
+
+void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
+{
+ u64 par = __kvm_at_s1e01_fast(vcpu, op, vaddr);
+
+ /*
+ * If PAR_EL1 reports that AT failed on a S1 permission fault, we
+ * know for sure that the PTW was able to walk the S1 tables and
+ * there's nothing else to do.
+ *
+ * If AT failed for any other reason, then we must walk the guest S1
+ * to emulate the instruction.
+ */
+ if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par))
+ par = handle_at_slow(vcpu, op, vaddr);
+
+ vcpu_write_sys_reg(vcpu, par, PAR_EL1);
+}
+
+void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
+{
+ u64 par;
+
+ /*
+ * We've trapped, so everything is live on the CPU. As we will be
+ * switching context behind everybody's back, disable interrupts...
+ */
+ scoped_guard(write_lock_irqsave, &vcpu->kvm->mmu_lock) {
+ struct kvm_s2_mmu *mmu;
+ u64 val, hcr;
+ bool fail;
+
+ mmu = &vcpu->kvm->arch.mmu;
+
+ val = hcr = read_sysreg(hcr_el2);
+ val &= ~HCR_TGE;
+ val |= HCR_VM;
+
+ if (!vcpu_el2_e2h_is_set(vcpu))
+ val |= HCR_NV | HCR_NV1;
+
+ write_sysreg(val, hcr_el2);
+ isb();
+
+ par = SYS_PAR_EL1_F;
+
+ switch (op) {
+ case OP_AT_S1E2R:
+ fail = __kvm_at(OP_AT_S1E1R, vaddr);
+ break;
+ case OP_AT_S1E2W:
+ fail = __kvm_at(OP_AT_S1E1W, vaddr);
+ break;
+ case OP_AT_S1E2A:
+ fail = __kvm_at(OP_AT_S1E1A, vaddr);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ fail = true;
+ }
+
+ isb();
+
+ if (!fail)
+ par = read_sysreg_par();
+
+ write_sysreg(hcr, hcr_el2);
+ isb();
+ }
+
+ /* We failed the translation, let's replay it in slow motion */
+ if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par))
+ par = handle_at_slow(vcpu, op, vaddr);
+
+ vcpu_write_sys_reg(vcpu, par, PAR_EL1);
+}
+
+void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
+{
+ struct kvm_s2_trans out = {};
+ u64 ipa, par;
+ bool write;
+ int ret;
+
+ /* Do the stage-1 translation */
+ switch (op) {
+ case OP_AT_S12E1R:
+ op = OP_AT_S1E1R;
+ write = false;
+ break;
+ case OP_AT_S12E1W:
+ op = OP_AT_S1E1W;
+ write = true;
+ break;
+ case OP_AT_S12E0R:
+ op = OP_AT_S1E0R;
+ write = false;
+ break;
+ case OP_AT_S12E0W:
+ op = OP_AT_S1E0W;
+ write = true;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return;
+ }
+
+ __kvm_at_s1e01(vcpu, op, vaddr);
+ par = vcpu_read_sys_reg(vcpu, PAR_EL1);
+ if (par & SYS_PAR_EL1_F)
+ return;
+
+ /*
+ * If we only have a single stage of translation (E2H=0 or
+ * TGE=1), exit early. Same thing if {VM,DC}=={0,0}.
+ */
+ if (!vcpu_el2_e2h_is_set(vcpu) || vcpu_el2_tge_is_set(vcpu) ||
+ !(vcpu_read_sys_reg(vcpu, HCR_EL2) & (HCR_VM | HCR_DC)))
+ return;
+
+ /* Do the stage-2 translation */
+ ipa = (par & GENMASK_ULL(47, 12)) | (vaddr & GENMASK_ULL(11, 0));
+ out.esr = 0;
+ ret = kvm_walk_nested_s2(vcpu, ipa, &out);
+ if (ret < 0)
+ return;
+
+ /* Check the access permission */
+ if (!out.esr &&
+ ((!write && !out.readable) || (write && !out.writable)))
+ out.esr = ESR_ELx_FSC_PERM_L(out.level & 0x3);
+
+ par = compute_par_s12(vcpu, par, &out);
+ vcpu_write_sys_reg(vcpu, par, PAR_EL1);
+}
diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c
index 7bbeb8c8e5a1..05b6435d02a9 100644
--- a/arch/arm64/kvm/emulate-nested.c
+++ b/arch/arm64/kvm/emulate-nested.c
@@ -826,6 +826,7 @@ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = {
SR_TRAP(OP_AT_S12E1W, CGT_HCR_NV),
SR_TRAP(OP_AT_S12E0R, CGT_HCR_NV),
SR_TRAP(OP_AT_S12E0W, CGT_HCR_NV),
+ SR_TRAP(OP_AT_S1E2A, CGT_HCR_NV),
SR_TRAP(OP_TLBI_IPAS2E1, CGT_HCR_NV),
SR_TRAP(OP_TLBI_RIPAS2E1, CGT_HCR_NV),
SR_TRAP(OP_TLBI_IPAS2LE1, CGT_HCR_NV),
@@ -907,6 +908,7 @@ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = {
SR_TRAP(OP_AT_S1E0W, CGT_HCR_AT),
SR_TRAP(OP_AT_S1E1RP, CGT_HCR_AT),
SR_TRAP(OP_AT_S1E1WP, CGT_HCR_AT),
+ SR_TRAP(OP_AT_S1E1A, CGT_HCR_AT),
SR_TRAP(SYS_ERXPFGF_EL1, CGT_HCR_nFIEN),
SR_TRAP(SYS_ERXPFGCTL_EL1, CGT_HCR_nFIEN),
SR_TRAP(SYS_ERXPFGCDN_EL1, CGT_HCR_nFIEN),
diff --git a/arch/arm64/kvm/hyp/include/hyp/fault.h b/arch/arm64/kvm/hyp/include/hyp/fault.h
index 9e13c1bc2ad5..487c06099d6f 100644
--- a/arch/arm64/kvm/hyp/include/hyp/fault.h
+++ b/arch/arm64/kvm/hyp/include/hyp/fault.h
@@ -27,7 +27,7 @@ static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar)
* saved the guest context yet, and we may return early...
*/
par = read_sysreg_par();
- if (!__kvm_at("s1e1r", far))
+ if (!__kvm_at(OP_AT_S1E1R, far))
tmp = read_sysreg_par();
else
tmp = SYS_PAR_EL1_F; /* back to the guest */
diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
index e2067c594e4a..638b6205526f 100644
--- a/arch/arm64/kvm/nested.c
+++ b/arch/arm64/kvm/nested.c
@@ -103,20 +103,6 @@ struct s2_walk_info {
bool be;
};
-static unsigned int ps_to_output_size(unsigned int ps)
-{
- switch (ps) {
- case 0: return 32;
- case 1: return 36;
- case 2: return 40;
- case 3: return 42;
- case 4: return 44;
- case 5:
- default:
- return 48;
- }
-}
-
static u32 compute_fsc(int level, u32 fsc)
{
return fsc | (level & 0x3);
@@ -256,7 +242,7 @@ static int walk_nested_s2_pgd(phys_addr_t ipa,
/* Check for valid descriptor at this point */
if (!(desc & 1) || ((desc & 3) == 1 && level == 3)) {
out->esr = compute_fsc(level, ESR_ELx_FSC_FAULT);
- out->upper_attr = desc;
+ out->desc = desc;
return 1;
}
@@ -266,7 +252,7 @@ static int walk_nested_s2_pgd(phys_addr_t ipa,
if (check_output_size(wi, desc)) {
out->esr = compute_fsc(level, ESR_ELx_FSC_ADDRSZ);
- out->upper_attr = desc;
+ out->desc = desc;
return 1;
}
@@ -278,27 +264,24 @@ static int walk_nested_s2_pgd(phys_addr_t ipa,
if (level < first_block_level) {
out->esr = compute_fsc(level, ESR_ELx_FSC_FAULT);
- out->upper_attr = desc;
+ out->desc = desc;
return 1;
}
- /*
- * We don't use the contiguous bit in the stage-2 ptes, so skip check
- * for misprogramming of the contiguous bit.
- */
-
if (check_output_size(wi, desc)) {
out->esr = compute_fsc(level, ESR_ELx_FSC_ADDRSZ);
- out->upper_attr = desc;
+ out->desc = desc;
return 1;
}
if (!(desc & BIT(10))) {
out->esr = compute_fsc(level, ESR_ELx_FSC_ACCESS);
- out->upper_attr = desc;
+ out->desc = desc;
return 1;
}
+ addr_bottom += contiguous_bit_shift(desc, wi, level);
+
/* Calculate and return the result */
paddr = (desc & GENMASK_ULL(47, addr_bottom)) |
(ipa & GENMASK_ULL(addr_bottom - 1, 0));
@@ -307,7 +290,7 @@ static int walk_nested_s2_pgd(phys_addr_t ipa,
out->readable = desc & (0b01 << 6);
out->writable = desc & (0b10 << 6);
out->level = level;
- out->upper_attr = desc & GENMASK_ULL(63, 52);
+ out->desc = desc;
return 0;
}
@@ -1193,5 +1176,12 @@ int kvm_init_nv_sysregs(struct kvm *kvm)
res0 |= ~(res0 | res1);
set_sysreg_masks(kvm, HAFGRTR_EL2, res0, res1);
+ /* SCTLR_EL1 */
+ res0 = SCTLR_EL1_RES0;
+ res1 = SCTLR_EL1_RES1;
+ if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, PAN, PAN3))
+ res0 |= SCTLR_EL1_EPAN;
+ set_sysreg_masks(kvm, SCTLR_EL1, res0, res1);
+
return 0;
}
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 2b0d550547f4..45c9e3b2acd4 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -2830,6 +2830,43 @@ static const struct sys_reg_desc sys_reg_descs[] = {
EL2_REG(SP_EL2, NULL, reset_unknown, 0),
};
+static bool handle_at_s1e01(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ u32 op = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2);
+
+ __kvm_at_s1e01(vcpu, op, p->regval);
+
+ return true;
+}
+
+static bool handle_at_s1e2(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ u32 op = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2);
+
+ /* There is no FGT associated with AT S1E2A :-( */
+ if (op == OP_AT_S1E2A &&
+ !kvm_has_feat(vcpu->kvm, ID_AA64ISAR2_EL1, ATS1A, IMP)) {
+ kvm_inject_undefined(vcpu);
+ return false;
+ }
+
+ __kvm_at_s1e2(vcpu, op, p->regval);
+
+ return true;
+}
+
+static bool handle_at_s12(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ u32 op = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2);
+
+ __kvm_at_s12(vcpu, op, p->regval);
+
+ return true;
+}
+
static bool kvm_supported_tlbi_s12_op(struct kvm_vcpu *vpcu, u32 instr)
{
struct kvm *kvm = vpcu->kvm;
@@ -3082,6 +3119,14 @@ static struct sys_reg_desc sys_insn_descs[] = {
{ SYS_DESC(SYS_DC_ISW), access_dcsw },
{ SYS_DESC(SYS_DC_IGSW), access_dcgsw },
{ SYS_DESC(SYS_DC_IGDSW), access_dcgsw },
+
+ SYS_INSN(AT_S1E1R, handle_at_s1e01),
+ SYS_INSN(AT_S1E1W, handle_at_s1e01),
+ SYS_INSN(AT_S1E0R, handle_at_s1e01),
+ SYS_INSN(AT_S1E0W, handle_at_s1e01),
+ SYS_INSN(AT_S1E1RP, handle_at_s1e01),
+ SYS_INSN(AT_S1E1WP, handle_at_s1e01),
+
{ SYS_DESC(SYS_DC_CSW), access_dcsw },
{ SYS_DESC(SYS_DC_CGSW), access_dcgsw },
{ SYS_DESC(SYS_DC_CGDSW), access_dcgsw },
@@ -3161,6 +3206,14 @@ static struct sys_reg_desc sys_insn_descs[] = {
SYS_INSN(TLBI_VALE1NXS, handle_tlbi_el1),
SYS_INSN(TLBI_VAALE1NXS, handle_tlbi_el1),
+ SYS_INSN(AT_S1E2R, handle_at_s1e2),
+ SYS_INSN(AT_S1E2W, handle_at_s1e2),
+ SYS_INSN(AT_S12E1R, handle_at_s12),
+ SYS_INSN(AT_S12E1W, handle_at_s12),
+ SYS_INSN(AT_S12E0R, handle_at_s12),
+ SYS_INSN(AT_S12E0W, handle_at_s12),
+ SYS_INSN(AT_S1E2A, handle_at_s1e2),
+
SYS_INSN(TLBI_IPAS2E1IS, handle_ipas2e1is),
SYS_INSN(TLBI_RIPAS2E1IS, handle_ripas2e1is),
SYS_INSN(TLBI_IPAS2LE1IS, handle_ipas2e1is),
@@ -4642,6 +4695,13 @@ void kvm_calculate_traps(struct kvm_vcpu *vcpu)
HFGITR_EL2_TLBIRVAAE1OS |
HFGITR_EL2_TLBIRVAE1OS);
+ if (!kvm_has_feat(kvm, ID_AA64ISAR2_EL1, ATS1A, IMP))
+ kvm->arch.fgu[HFGITR_GROUP] |= HFGITR_EL2_ATS1E1A;
+
+ if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, PAN, PAN2))
+ kvm->arch.fgu[HFGITR_GROUP] |= (HFGITR_EL2_ATS1E1RP |
+ HFGITR_EL2_ATS1E1WP);
+
if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, S1PIE, IMP))
kvm->arch.fgu[HFGxTR_GROUP] |= (HFGxTR_EL2_nPIRE0_EL1 |
HFGxTR_EL2_nPIR_EL1);