Diffstat (limited to 'arch/arm64/kvm/at.c')
| -rw-r--r-- | arch/arm64/kvm/at.c | 858 |
1 file changed, 607 insertions(+), 251 deletions(-)
diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index 8c5d7990e5b3..53bf70126f81 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -10,61 +10,11 @@ #include <asm/kvm_hyp.h> #include <asm/kvm_mmu.h> -enum trans_regime { - TR_EL10, - TR_EL20, - TR_EL2, -}; - -struct s1_walk_info { - u64 baddr; - enum trans_regime regime; - unsigned int max_oa_bits; - unsigned int pgshift; - unsigned int txsz; - int sl; - bool hpd; - bool e0poe; - bool poe; - bool pan; - bool be; - bool s2; -}; - -struct s1_walk_result { - union { - struct { - u64 desc; - u64 pa; - s8 level; - u8 APTable; - bool UXNTable; - bool PXNTable; - bool uwxn; - bool uov; - bool ur; - bool uw; - bool ux; - bool pwxn; - bool pov; - bool pr; - bool pw; - bool px; - }; - struct { - u8 fst; - bool ptw; - bool s2; - }; - }; - bool failed; -}; - -static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool ptw, bool s2) +static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool s1ptw) { wr->fst = fst; - wr->ptw = ptw; - wr->s2 = s2; + wr->ptw = s1ptw; + wr->s2 = s1ptw; wr->failed = true; } @@ -78,9 +28,57 @@ static int get_ia_size(struct s1_walk_info *wi) /* Return true if the IPA is out of the OA range */ static bool check_output_size(u64 ipa, struct s1_walk_info *wi) { + if (wi->pa52bit) + return wi->max_oa_bits < 52 && (ipa & GENMASK_ULL(51, wi->max_oa_bits)); return wi->max_oa_bits < 48 && (ipa & GENMASK_ULL(47, wi->max_oa_bits)); } +static bool has_52bit_pa(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, u64 tcr) +{ + switch (BIT(wi->pgshift)) { + case SZ_64K: + default: /* IMPDEF: treat any other value as 64k */ + if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, PARANGE, 52)) + return false; + return ((wi->regime == TR_EL2 ? + FIELD_GET(TCR_EL2_PS_MASK, tcr) : + FIELD_GET(TCR_IPS_MASK, tcr)) == 0b0110); + case SZ_16K: + if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN16, 52_BIT)) + return false; + break; + case SZ_4K: + if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN4, 52_BIT)) + return false; + break; + } + + return (tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS)); +} + +static u64 desc_to_oa(struct s1_walk_info *wi, u64 desc) +{ + u64 addr; + + if (!wi->pa52bit) + return desc & GENMASK_ULL(47, wi->pgshift); + + switch (BIT(wi->pgshift)) { + case SZ_4K: + case SZ_16K: + addr = desc & GENMASK_ULL(49, wi->pgshift); + addr |= FIELD_GET(KVM_PTE_ADDR_51_50_LPA2, desc) << 50; + break; + case SZ_64K: + default: /* IMPDEF: treat any other value as 64k */ + addr = desc & GENMASK_ULL(47, wi->pgshift); + addr |= FIELD_GET(KVM_PTE_ADDR_51_48, desc) << 48; + break; + } + + return addr; +} + /* Return the translation regime that applies to an AT instruction */ static enum trans_regime compute_translation_regime(struct kvm_vcpu *vcpu, u32 op) { @@ -93,28 +91,32 @@ static enum trans_regime compute_translation_regime(struct kvm_vcpu *vcpu, u32 o case OP_AT_S1E2W: case OP_AT_S1E2A: return vcpu_el2_e2h_is_set(vcpu) ? TR_EL20 : TR_EL2; - break; default: return (vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)) ? 
TR_EL20 : TR_EL10; } } +static u64 effective_tcr2(struct kvm_vcpu *vcpu, enum trans_regime regime) +{ + if (regime == TR_EL10) { + if (vcpu_has_nv(vcpu) && + !(__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TCR2En)) + return 0; + + return vcpu_read_sys_reg(vcpu, TCR2_EL1); + } + + return vcpu_read_sys_reg(vcpu, TCR2_EL2); +} + static bool s1pie_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime) { if (!kvm_has_s1pie(vcpu->kvm)) return false; - switch (regime) { - case TR_EL2: - case TR_EL20: - return vcpu_read_sys_reg(vcpu, TCR2_EL2) & TCR2_EL2_PIE; - case TR_EL10: - return (__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TCR2En) && - (__vcpu_sys_reg(vcpu, TCR2_EL1) & TCR2_EL1x_PIE); - default: - BUG(); - } + /* Abuse TCR2_EL1_PIE and use it for EL2 as well */ + return effective_tcr2(vcpu, regime) & TCR2_EL1_PIE; } static void compute_s1poe(struct kvm_vcpu *vcpu, struct s1_walk_info *wi) @@ -126,45 +128,30 @@ static void compute_s1poe(struct kvm_vcpu *vcpu, struct s1_walk_info *wi) return; } - switch (wi->regime) { - case TR_EL2: - case TR_EL20: - val = vcpu_read_sys_reg(vcpu, TCR2_EL2); - wi->poe = val & TCR2_EL2_POE; - wi->e0poe = (wi->regime == TR_EL20) && (val & TCR2_EL2_E0POE); - break; - case TR_EL10: - if (__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TCR2En) { - wi->poe = wi->e0poe = false; - return; - } + val = effective_tcr2(vcpu, wi->regime); - val = __vcpu_sys_reg(vcpu, TCR2_EL1); - wi->poe = val & TCR2_EL1x_POE; - wi->e0poe = val & TCR2_EL1x_E0POE; - } + /* Abuse TCR2_EL1_* for EL2 */ + wi->poe = val & TCR2_EL1_POE; + wi->e0poe = (wi->regime != TR_EL2) && (val & TCR2_EL1_E0POE); } -static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi, +static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, struct s1_walk_result *wr, u64 va) { u64 hcr, sctlr, tcr, tg, ps, ia_bits, ttbr; unsigned int stride, x; - bool va55, tbi, lva, as_el0; - - hcr = __vcpu_sys_reg(vcpu, HCR_EL2); - - wi->regime = compute_translation_regime(vcpu, op); - as_el0 = (op == OP_AT_S1E0R || op == OP_AT_S1E0W); - wi->pan = (op == OP_AT_S1E1RP || op == OP_AT_S1E1WP) && - (*vcpu_cpsr(vcpu) & PSR_PAN_BIT); + bool va55, tbi, lva; va55 = va & BIT(55); - if (wi->regime == TR_EL2 && va55) - goto addrsz; - - wi->s2 = wi->regime == TR_EL10 && (hcr & (HCR_VM | HCR_DC)); + if (vcpu_has_nv(vcpu)) { + hcr = __vcpu_sys_reg(vcpu, HCR_EL2); + wi->s2 = wi->regime == TR_EL10 && (hcr & (HCR_VM | HCR_DC)); + } else { + WARN_ON_ONCE(wi->regime != TR_EL10); + wi->s2 = false; + hcr = 0; + } switch (wi->regime) { case TR_EL10: @@ -186,6 +173,46 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi, BUG(); } + /* Someone was silly enough to encode TG0/TG1 differently */ + if (va55 && wi->regime != TR_EL2) { + wi->txsz = FIELD_GET(TCR_T1SZ_MASK, tcr); + tg = FIELD_GET(TCR_TG1_MASK, tcr); + + switch (tg << TCR_TG1_SHIFT) { + case TCR_TG1_4K: + wi->pgshift = 12; break; + case TCR_TG1_16K: + wi->pgshift = 14; break; + case TCR_TG1_64K: + default: /* IMPDEF: treat any other value as 64k */ + wi->pgshift = 16; break; + } + } else { + wi->txsz = FIELD_GET(TCR_T0SZ_MASK, tcr); + tg = FIELD_GET(TCR_TG0_MASK, tcr); + + switch (tg << TCR_TG0_SHIFT) { + case TCR_TG0_4K: + wi->pgshift = 12; break; + case TCR_TG0_16K: + wi->pgshift = 14; break; + case TCR_TG0_64K: + default: /* IMPDEF: treat any other value as 64k */ + wi->pgshift = 16; break; + } + } + + wi->pa52bit = has_52bit_pa(vcpu, wi, tcr); + + ia_bits = get_ia_size(wi); + + /* AArch64.S1StartLevel() */ + stride = wi->pgshift - 3; + 
wi->sl = 3 - (((ia_bits - 1) - wi->pgshift) / stride); + + if (wi->regime == TR_EL2 && va55) + goto addrsz; + tbi = (wi->regime == TR_EL2 ? FIELD_GET(TCR_EL2_TBI, tcr) : (va55 ? @@ -195,6 +222,12 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi, if (!tbi && (u64)sign_extend64(va, 55) != va) goto addrsz; + wi->sh = (wi->regime == TR_EL2 ? + FIELD_GET(TCR_EL2_SH0_MASK, tcr) : + (va55 ? + FIELD_GET(TCR_SH1_MASK, tcr) : + FIELD_GET(TCR_SH0_MASK, tcr))); + va = (u64)sign_extend64(va, 55); /* Let's put the MMU disabled case aside immediately */ @@ -249,53 +282,20 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi, /* R_BVXDG */ wi->hpd |= (wi->poe || wi->e0poe); - /* Someone was silly enough to encode TG0/TG1 differently */ - if (va55) { - wi->txsz = FIELD_GET(TCR_T1SZ_MASK, tcr); - tg = FIELD_GET(TCR_TG1_MASK, tcr); - - switch (tg << TCR_TG1_SHIFT) { - case TCR_TG1_4K: - wi->pgshift = 12; break; - case TCR_TG1_16K: - wi->pgshift = 14; break; - case TCR_TG1_64K: - default: /* IMPDEF: treat any other value as 64k */ - wi->pgshift = 16; break; - } - } else { - wi->txsz = FIELD_GET(TCR_T0SZ_MASK, tcr); - tg = FIELD_GET(TCR_TG0_MASK, tcr); - - switch (tg << TCR_TG0_SHIFT) { - case TCR_TG0_4K: - wi->pgshift = 12; break; - case TCR_TG0_16K: - wi->pgshift = 14; break; - case TCR_TG0_64K: - default: /* IMPDEF: treat any other value as 64k */ - wi->pgshift = 16; break; - } - } - /* R_PLCGL, R_YXNYW */ if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR2_EL1, ST, 48_47)) { if (wi->txsz > 39) - goto transfault_l0; + goto transfault; } else { if (wi->txsz > 48 || (BIT(wi->pgshift) == SZ_64K && wi->txsz > 47)) - goto transfault_l0; + goto transfault; } /* R_GTJBY, R_SXWGM */ switch (BIT(wi->pgshift)) { case SZ_4K: - lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN4, 52_BIT); - lva &= tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS); - break; case SZ_16K: - lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN16, 52_BIT); - lva &= tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS); + lva = wi->pa52bit; break; case SZ_64K: lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, VARange, 52); @@ -303,38 +303,42 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi, } if ((lva && wi->txsz < 12) || (!lva && wi->txsz < 16)) - goto transfault_l0; - - ia_bits = get_ia_size(wi); + goto transfault; /* R_YYVYV, I_THCZK */ if ((!va55 && va > GENMASK(ia_bits - 1, 0)) || (va55 && va < GENMASK(63, ia_bits))) - goto transfault_l0; + goto transfault; /* I_ZFSYQ */ if (wi->regime != TR_EL2 && (tcr & (va55 ? TCR_EPD1_MASK : TCR_EPD0_MASK))) - goto transfault_l0; + goto transfault; /* R_BNDVG and following statements */ if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, E0PD, IMP) && - as_el0 && (tcr & (va55 ? TCR_E0PD1 : TCR_E0PD0))) - goto transfault_l0; - - /* AArch64.S1StartLevel() */ - stride = wi->pgshift - 3; - wi->sl = 3 - (((ia_bits - 1) - wi->pgshift) / stride); + wi->as_el0 && (tcr & (va55 ? TCR_E0PD1 : TCR_E0PD0))) + goto transfault; ps = (wi->regime == TR_EL2 ? 
FIELD_GET(TCR_EL2_PS_MASK, tcr) : FIELD_GET(TCR_IPS_MASK, tcr)); - wi->max_oa_bits = min(get_kvm_ipa_limit(), ps_to_output_size(ps)); + wi->max_oa_bits = min(get_kvm_ipa_limit(), ps_to_output_size(ps, wi->pa52bit)); /* Compute minimal alignment */ x = 3 + ia_bits - ((3 - wi->sl) * stride + wi->pgshift); wi->baddr = ttbr & TTBRx_EL1_BADDR; + if (wi->pa52bit) { + /* + * Force the alignment on 64 bytes for top-level tables + * smaller than 8 entries, since TTBR.BADDR[5:2] are used to + * store bits [51:48] of the first level of lookup. + */ + x = max(x, 6); + + wi->baddr |= FIELD_GET(GENMASK_ULL(5, 2), ttbr) << 48; + } /* R_VPBBF */ if (check_output_size(wi->baddr, wi)) @@ -342,21 +346,63 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi, wi->baddr &= GENMASK_ULL(wi->max_oa_bits - 1, x); + wi->ha = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HAFDBS, AF); + wi->ha &= (wi->regime == TR_EL2 ? + FIELD_GET(TCR_EL2_HA, tcr) : + FIELD_GET(TCR_HA, tcr)); + return 0; -addrsz: /* Address Size Fault level 0 */ - fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(0), false, false); +addrsz: + /* + * Address Size Fault level 0 to indicate it comes from TTBR. + * yes, this is an oddity. + */ + fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(0), false); return -EFAULT; -transfault_l0: /* Translation Fault level 0 */ - fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(0), false, false); +transfault: + /* Translation Fault on start level */ + fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(wi->sl), false); return -EFAULT; } +static int kvm_read_s1_desc(struct kvm_vcpu *vcpu, u64 pa, u64 *desc, + struct s1_walk_info *wi) +{ + u64 val; + int r; + + r = kvm_read_guest(vcpu->kvm, pa, &val, sizeof(val)); + if (r) + return r; + + if (wi->be) + *desc = be64_to_cpu((__force __be64)val); + else + *desc = le64_to_cpu((__force __le64)val); + + return 0; +} + +static int kvm_swap_s1_desc(struct kvm_vcpu *vcpu, u64 pa, u64 old, u64 new, + struct s1_walk_info *wi) +{ + if (wi->be) { + old = (__force u64)cpu_to_be64(old); + new = (__force u64)cpu_to_be64(new); + } else { + old = (__force u64)cpu_to_le64(old); + new = (__force u64)cpu_to_le64(new); + } + + return __kvm_at_swap_desc(vcpu->kvm, pa, old, new); +} + static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, struct s1_walk_result *wr, u64 va) { - u64 va_top, va_bottom, baddr, desc; + u64 va_top, va_bottom, baddr, desc, new_desc, ipa; int level, stride, ret; level = wi->sl; @@ -366,7 +412,7 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, va_top = get_ia_size(wi) - 1; while (1) { - u64 index, ipa; + u64 index; va_bottom = (3 - level) * stride + wi->pgshift; index = (va & GENMASK_ULL(va_top, va_bottom)) >> (va_bottom - 3); @@ -380,13 +426,13 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, if (ret) { fail_s1_walk(wr, (s2_trans.esr & ~ESR_ELx_FSC_LEVEL) | level, - true, true); + true); return ret; } if (!kvm_s2_trans_readable(&s2_trans)) { fail_s1_walk(wr, ESR_ELx_FSC_PERM_L(level), - true, true); + true); return -EPERM; } @@ -394,17 +440,24 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, ipa = kvm_s2_trans_output(&s2_trans); } - ret = kvm_read_guest(vcpu->kvm, ipa, &desc, sizeof(desc)); + if (wi->filter) { + ret = wi->filter->fn(&(struct s1_walk_context) + { + .wi = wi, + .table_ipa = baddr, + .level = level, + }, wi->filter->priv); + if (ret) + return ret; + } + + ret = kvm_read_s1_desc(vcpu, ipa, &desc, wi); if (ret) { - fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level), - true, false); + fail_s1_walk(wr, 
ESR_ELx_FSC_SEA_TTW(level), false); return ret; } - if (wi->be) - desc = be64_to_cpu((__force __be64)desc); - else - desc = le64_to_cpu((__force __le64)desc); + new_desc = desc; /* Invalid descriptor */ if (!(desc & BIT(0))) @@ -425,7 +478,7 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, wr->PXNTable |= FIELD_GET(PMD_TABLE_PXN, desc); } - baddr = desc & GENMASK_ULL(47, wi->pgshift); + baddr = desc_to_oa(wi, desc); /* Check for out-of-range OA */ if (check_output_size(baddr, wi)) @@ -442,11 +495,11 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, switch (BIT(wi->pgshift)) { case SZ_4K: - valid_block = level == 1 || level == 2; + valid_block = level == 1 || level == 2 || (wi->pa52bit && level == 0); break; case SZ_16K: case SZ_64K: - valid_block = level == 2; + valid_block = level == 2 || (wi->pa52bit && level == 1); break; } @@ -454,24 +507,68 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, goto transfault; } - if (check_output_size(desc & GENMASK(47, va_bottom), wi)) + baddr = desc_to_oa(wi, desc); + if (check_output_size(baddr & GENMASK(52, va_bottom), wi)) goto addrsz; + if (wi->ha) + new_desc |= PTE_AF; + + if (new_desc != desc) { + ret = kvm_swap_s1_desc(vcpu, ipa, desc, new_desc, wi); + if (ret) + return ret; + + desc = new_desc; + } + + if (!(desc & PTE_AF)) { + fail_s1_walk(wr, ESR_ELx_FSC_ACCESS_L(level), false); + return -EACCES; + } + va_bottom += contiguous_bit_shift(desc, wi, level); wr->failed = false; wr->level = level; wr->desc = desc; - wr->pa = desc & GENMASK(47, va_bottom); + wr->pa = baddr & GENMASK(52, va_bottom); wr->pa |= va & GENMASK_ULL(va_bottom - 1, 0); + wr->nG = (wi->regime != TR_EL2) && (desc & PTE_NG); + if (wr->nG) { + u64 asid_ttbr, tcr; + + switch (wi->regime) { + case TR_EL10: + tcr = vcpu_read_sys_reg(vcpu, TCR_EL1); + asid_ttbr = ((tcr & TCR_A1) ? + vcpu_read_sys_reg(vcpu, TTBR1_EL1) : + vcpu_read_sys_reg(vcpu, TTBR0_EL1)); + break; + case TR_EL20: + tcr = vcpu_read_sys_reg(vcpu, TCR_EL2); + asid_ttbr = ((tcr & TCR_A1) ? + vcpu_read_sys_reg(vcpu, TTBR1_EL2) : + vcpu_read_sys_reg(vcpu, TTBR0_EL2)); + break; + default: + BUG(); + } + + wr->asid = FIELD_GET(TTBR_ASID_MASK, asid_ttbr); + if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, ASIDBITS, 16) || + !(tcr & TCR_ASID16)) + wr->asid &= GENMASK(7, 0); + } + return 0; addrsz: - fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(level), true, false); + fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(level), false); return -EINVAL; transfault: - fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(level), true, false); + fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(level), false); return -ENOENT; } @@ -488,7 +585,6 @@ struct mmu_config { u64 sctlr; u64 vttbr; u64 vtcr; - u64 hcr; }; static void __mmu_config_save(struct mmu_config *config) @@ -511,13 +607,10 @@ static void __mmu_config_save(struct mmu_config *config) config->sctlr = read_sysreg_el1(SYS_SCTLR); config->vttbr = read_sysreg(vttbr_el2); config->vtcr = read_sysreg(vtcr_el2); - config->hcr = read_sysreg(hcr_el2); } static void __mmu_config_restore(struct mmu_config *config) { - write_sysreg(config->hcr, hcr_el2); - /* * ARM errata 1165522 and 1530923 require TGE to be 1 before * we update the guest state. 
@@ -668,21 +761,36 @@ static u8 combine_s1_s2_attr(u8 s1, u8 s2) #define ATTR_OSH 0b10 #define ATTR_ISH 0b11 -static u8 compute_sh(u8 attr, u64 desc) +static u8 compute_final_sh(u8 attr, u8 sh) { - u8 sh; - /* Any form of device, as well as NC has SH[1:0]=0b10 */ if (MEMATTR_IS_DEVICE(attr) || attr == MEMATTR(NC, NC)) return ATTR_OSH; - sh = FIELD_GET(PTE_SHARED, desc); if (sh == ATTR_RSV) /* Reserved, mapped to NSH */ sh = ATTR_NSH; return sh; } +static u8 compute_s1_sh(struct s1_walk_info *wi, struct s1_walk_result *wr, + u8 attr) +{ + u8 sh; + + /* + * non-52bit and LPA have their basic shareability described in the + * descriptor. LPA2 gets it from the corresponding field in TCR, + * conveniently recorded in the walk info. + */ + if (!wi->pa52bit || BIT(wi->pgshift) == SZ_64K) + sh = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S1_SH, wr->desc); + else + sh = wi->sh; + + return compute_final_sh(attr, sh); +} + static u8 combine_sh(u8 s1_sh, u8 s2_sh) { if (s1_sh == ATTR_OSH || s2_sh == ATTR_OSH) @@ -696,7 +804,7 @@ static u8 combine_sh(u8 s1_sh, u8 s2_sh) static u64 compute_par_s12(struct kvm_vcpu *vcpu, u64 s1_par, struct kvm_s2_trans *tr) { - u8 s1_parattr, s2_memattr, final_attr; + u8 s1_parattr, s2_memattr, final_attr, s2_sh; u64 par; /* If S2 has failed to translate, report the damage */ @@ -739,8 +847,15 @@ static u64 compute_par_s12(struct kvm_vcpu *vcpu, u64 s1_par, final_attr = s1_parattr; break; default: - /* MemAttr[2]=0, Device from S2 */ - final_attr = s2_memattr & GENMASK(1,0) << 2; + /* + * MemAttr[2]=0, Device from S2. + * + * FWB does not influence the way that stage 1 + * memory types and attributes are combined + * with stage 2 Device type and attributes. + */ + final_attr = min(s2_memattr_to_attr(s2_memattr), + s1_parattr); } } else { /* Combination of R_HMNDG, R_TNHFM and R_GQFSF */ @@ -762,17 +877,19 @@ static u64 compute_par_s12(struct kvm_vcpu *vcpu, u64 s1_par, !MEMATTR_IS_DEVICE(final_attr)) final_attr = MEMATTR(NC, NC); + s2_sh = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S2_SH, tr->desc); + par = FIELD_PREP(SYS_PAR_EL1_ATTR, final_attr); par |= tr->output & GENMASK(47, 12); par |= FIELD_PREP(SYS_PAR_EL1_SH, combine_sh(FIELD_GET(SYS_PAR_EL1_SH, s1_par), - compute_sh(final_attr, tr->desc))); + compute_final_sh(final_attr, s2_sh))); return par; } -static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_result *wr, - enum trans_regime regime) +static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, + struct s1_walk_result *wr) { u64 par; @@ -785,9 +902,9 @@ static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_result *wr, } else if (wr->level == S1_MMU_DISABLED) { /* MMU off or HCR_EL2.DC == 1 */ par = SYS_PAR_EL1_NSE; - par |= wr->pa & GENMASK_ULL(47, 12); + par |= wr->pa & SYS_PAR_EL1_PA; - if (regime == TR_EL10 && + if (wi->regime == TR_EL10 && vcpu_has_nv(vcpu) && (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) { par |= FIELD_PREP(SYS_PAR_EL1_ATTR, MEMATTR(WbRaWa, WbRaWa)); @@ -802,14 +919,14 @@ static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_result *wr, par = SYS_PAR_EL1_NSE; - mair = (regime == TR_EL10 ? + mair = (wi->regime == TR_EL10 ? vcpu_read_sys_reg(vcpu, MAIR_EL1) : vcpu_read_sys_reg(vcpu, MAIR_EL2)); mair >>= FIELD_GET(PTE_ATTRINDX_MASK, wr->desc) * 8; mair &= 0xff; - sctlr = (regime == TR_EL10 ? + sctlr = (wi->regime == TR_EL10 ? 
vcpu_read_sys_reg(vcpu, SCTLR_EL1) : vcpu_read_sys_reg(vcpu, SCTLR_EL2)); @@ -818,9 +935,9 @@ static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_result *wr, mair = MEMATTR(NC, NC); par |= FIELD_PREP(SYS_PAR_EL1_ATTR, mair); - par |= wr->pa & GENMASK_ULL(47, 12); + par |= wr->pa & SYS_PAR_EL1_PA; - sh = compute_sh(mair, wr->desc); + sh = compute_s1_sh(wi, wr, mair); par |= FIELD_PREP(SYS_PAR_EL1_SH, sh); } @@ -894,7 +1011,7 @@ static void compute_s1_direct_permissions(struct kvm_vcpu *vcpu, wxn = (vcpu_read_sys_reg(vcpu, SCTLR_EL2) & SCTLR_ELx_WXN); break; case TR_EL10: - wxn = (__vcpu_sys_reg(vcpu, SCTLR_EL1) & SCTLR_ELx_WXN); + wxn = (vcpu_read_sys_reg(vcpu, SCTLR_EL1) & SCTLR_ELx_WXN); break; } @@ -1068,37 +1185,54 @@ static void compute_s1_overlay_permissions(struct kvm_vcpu *vcpu, idx = FIELD_GET(PTE_PO_IDX_MASK, wr->desc); - switch (wi->regime) { - case TR_EL10: - pov_perms = perm_idx(vcpu, POR_EL1, idx); - uov_perms = perm_idx(vcpu, POR_EL0, idx); - break; - case TR_EL20: - pov_perms = perm_idx(vcpu, POR_EL2, idx); - uov_perms = perm_idx(vcpu, POR_EL0, idx); - break; - case TR_EL2: - pov_perms = perm_idx(vcpu, POR_EL2, idx); - uov_perms = 0; - break; - } + if (wr->pov) { + switch (wi->regime) { + case TR_EL10: + pov_perms = perm_idx(vcpu, POR_EL1, idx); + break; + case TR_EL20: + pov_perms = perm_idx(vcpu, POR_EL2, idx); + break; + case TR_EL2: + pov_perms = perm_idx(vcpu, POR_EL2, idx); + break; + } + + if (pov_perms & ~POE_RWX) + pov_perms = POE_NONE; - if (pov_perms & ~POE_RXW) - pov_perms = POE_NONE; + /* R_QXXPC, S1PrivOverflow enabled */ + if (wr->pwxn && (pov_perms & POE_X)) + pov_perms &= ~POE_W; - if (wi->poe && wr->pov) { wr->pr &= pov_perms & POE_R; - wr->px &= pov_perms & POE_X; wr->pw &= pov_perms & POE_W; + wr->px &= pov_perms & POE_X; } - if (uov_perms & ~POE_RXW) - uov_perms = POE_NONE; + if (wr->uov) { + switch (wi->regime) { + case TR_EL10: + uov_perms = perm_idx(vcpu, POR_EL0, idx); + break; + case TR_EL20: + uov_perms = perm_idx(vcpu, POR_EL0, idx); + break; + case TR_EL2: + uov_perms = 0; + break; + } + + if (uov_perms & ~POE_RWX) + uov_perms = POE_NONE; + + /* R_NPBXC, S1UnprivOverlay enabled */ + if (wr->uwxn && (uov_perms & POE_X)) + uov_perms &= ~POE_W; - if (wi->e0poe && wr->uov) { wr->ur &= uov_perms & POE_R; - wr->ux &= uov_perms & POE_X; wr->uw &= uov_perms & POE_W; + wr->ux &= uov_perms & POE_X; } } @@ -1116,24 +1250,15 @@ static void compute_s1_permissions(struct kvm_vcpu *vcpu, if (!wi->hpd) compute_s1_hierarchical_permissions(vcpu, wi, wr); - if (wi->poe || wi->e0poe) - compute_s1_overlay_permissions(vcpu, wi, wr); + compute_s1_overlay_permissions(vcpu, wi, wr); - /* R_QXXPC */ - if (wr->pwxn) { - if (!wr->pov && wr->pw) - wr->px = false; - if (wr->pov && wr->px) - wr->pw = false; - } + /* R_QXXPC, S1PrivOverlay disabled */ + if (!wr->pov) + wr->px &= !(wr->pwxn && wr->pw); - /* R_NPBXC */ - if (wr->uwxn) { - if (!wr->uov && wr->uw) - wr->ux = false; - if (wr->uov && wr->ux) - wr->uw = false; - } + /* R_NPBXC, S1UnprivOverlay disabled */ + if (!wr->uov) + wr->ux &= !(wr->uwxn && wr->uw); pan = wi->pan && (wr->ur || wr->uw || (pan3_enabled(vcpu, wi->regime) && wr->ux)); @@ -1141,14 +1266,19 @@ static void compute_s1_permissions(struct kvm_vcpu *vcpu, wr->pr &= !pan; } -static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +static int handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr, u64 *par) { struct s1_walk_result wr = {}; struct s1_walk_info wi = {}; bool perm_fail = false; int ret, idx; - ret = setup_s1_walk(vcpu, 
op, &wi, &wr, vaddr); + wi.regime = compute_translation_regime(vcpu, op); + wi.as_el0 = (op == OP_AT_S1E0R || op == OP_AT_S1E0W); + wi.pan = (op == OP_AT_S1E1RP || op == OP_AT_S1E1WP) && + (*vcpu_cpsr(vcpu) & PSR_PAN_BIT); + + ret = setup_s1_walk(vcpu, &wi, &wr, vaddr); if (ret) goto compute_par; @@ -1161,6 +1291,11 @@ static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) srcu_read_unlock(&vcpu->kvm->srcu, idx); + /* + * Race to update a descriptor -- restart the walk. + */ + if (ret == -EAGAIN) + return ret; if (ret) goto compute_par; @@ -1191,10 +1326,11 @@ static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) } if (perm_fail) - fail_s1_walk(&wr, ESR_ELx_FSC_PERM_L(wr.level), false, false); + fail_s1_walk(&wr, ESR_ELx_FSC_PERM_L(wr.level), false); compute_par: - return compute_par_s1(vcpu, &wr, wi.regime); + *par = compute_par_s1(vcpu, &wi, &wr); + return 0; } /* @@ -1203,13 +1339,14 @@ compute_par: * If the translation is unsuccessful, the value may only contain * PAR_EL1.F, and cannot be taken at face value. It isn't an * indication of the translation having failed, only that the fast - * path did not succeed, *unless* it indicates a S1 permission fault. + * path did not succeed, *unless* it indicates a S1 permission or + * access fault. */ static u64 __kvm_at_s1e01_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) { struct mmu_config config; struct kvm_s2_mmu *mmu; - bool fail; + bool fail, mmu_cs; u64 par; par = SYS_PAR_EL1_F; @@ -1225,8 +1362,13 @@ static u64 __kvm_at_s1e01_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) * If HCR_EL2.{E2H,TGE} == {1,1}, the MMU context is already * the right one (as we trapped from vEL2). If not, save the * full MMU context. + * + * We are also guaranteed to be in the correct context if + * we're not in a nested VM. */ - if (vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)) + mmu_cs = (vcpu_has_nv(vcpu) && + !(vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu))); + if (!mmu_cs) goto skip_mmu_switch; /* @@ -1259,8 +1401,8 @@ static u64 __kvm_at_s1e01_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) __load_stage2(mmu, mmu->arch); skip_mmu_switch: - /* Clear TGE, enable S2 translation, we're rolling */ - write_sysreg((config.hcr & ~HCR_TGE) | HCR_VM, hcr_el2); + /* Temporarily switch back to guest context */ + write_sysreg_hcr(vcpu->arch.hcr_el2); isb(); switch (op) { @@ -1292,7 +1434,9 @@ skip_mmu_switch: if (!fail) par = read_sysreg_par(); - if (!(vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu))) + write_sysreg_hcr(HCR_HOST_VHE_FLAGS); + + if (mmu_cs) __mmu_config_restore(&config); return par; @@ -1306,27 +1450,43 @@ static bool par_check_s1_perm_fault(u64 par) !(par & SYS_PAR_EL1_S)); } -void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +static bool par_check_s1_access_fault(u64 par) +{ + u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par); + + return ((fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_ACCESS && + !(par & SYS_PAR_EL1_S)); +} + +int __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) { u64 par = __kvm_at_s1e01_fast(vcpu, op, vaddr); + int ret; /* - * If PAR_EL1 reports that AT failed on a S1 permission fault, we - * know for sure that the PTW was able to walk the S1 tables and - * there's nothing else to do. + * If PAR_EL1 reports that AT failed on a S1 permission or access + * fault, we know for sure that the PTW was able to walk the S1 + * tables and there's nothing else to do. * * If AT failed for any other reason, then we must walk the guest S1 * to emulate the instruction. 
*/ - if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par)) - par = handle_at_slow(vcpu, op, vaddr); + if ((par & SYS_PAR_EL1_F) && + !par_check_s1_perm_fault(par) && + !par_check_s1_access_fault(par)) { + ret = handle_at_slow(vcpu, op, vaddr, &par); + if (ret) + return ret; + } vcpu_write_sys_reg(vcpu, par, PAR_EL1); + return 0; } -void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +int __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) { u64 par; + int ret; /* * We've trapped, so everything is live on the CPU. As we will be @@ -1343,7 +1503,7 @@ void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) if (!vcpu_el2_e2h_is_set(vcpu)) val |= HCR_NV | HCR_NV1; - write_sysreg(val, hcr_el2); + write_sysreg_hcr(val); isb(); par = SYS_PAR_EL1_F; @@ -1368,18 +1528,22 @@ void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) if (!fail) par = read_sysreg_par(); - write_sysreg(hcr, hcr_el2); + write_sysreg_hcr(hcr); isb(); } /* We failed the translation, let's replay it in slow motion */ - if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par)) - par = handle_at_slow(vcpu, op, vaddr); + if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par)) { + ret = handle_at_slow(vcpu, op, vaddr, &par); + if (ret) + return ret; + } vcpu_write_sys_reg(vcpu, par, PAR_EL1); + return 0; } -void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +int __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) { struct kvm_s2_trans out = {}; u64 ipa, par; @@ -1406,28 +1570,28 @@ void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) break; default: WARN_ON_ONCE(1); - return; + return 0; } __kvm_at_s1e01(vcpu, op, vaddr); par = vcpu_read_sys_reg(vcpu, PAR_EL1); if (par & SYS_PAR_EL1_F) - return; + return 0; /* - * If we only have a single stage of translation (E2H=0 or - * TGE=1), exit early. Same thing if {VM,DC}=={0,0}. + * If we only have a single stage of translation (EL2&0), exit + * early. Same thing if {VM,DC}=={0,0}. */ - if (!vcpu_el2_e2h_is_set(vcpu) || vcpu_el2_tge_is_set(vcpu) || + if (compute_translation_regime(vcpu, op) == TR_EL20 || !(vcpu_read_sys_reg(vcpu, HCR_EL2) & (HCR_VM | HCR_DC))) - return; + return 0; /* Do the stage-2 translation */ ipa = (par & GENMASK_ULL(47, 12)) | (vaddr & GENMASK_ULL(11, 0)); out.esr = 0; ret = kvm_walk_nested_s2(vcpu, ipa, &out); if (ret < 0) - return; + return ret; /* Check the access permission */ if (!out.esr && @@ -1436,4 +1600,196 @@ void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) par = compute_par_s12(vcpu, par, &out); vcpu_write_sys_reg(vcpu, par, PAR_EL1); + return 0; +} + +/* + * Translate a VA for a given EL in a given translation regime, with + * or without PAN. This requires wi->{regime, as_el0, pan} to be + * set. The rest of the wi and wr should be 0-initialised. + */ +int __kvm_translate_va(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, + struct s1_walk_result *wr, u64 va) +{ + int ret; + + ret = setup_s1_walk(vcpu, wi, wr, va); + if (ret) + return ret; + + if (wr->level == S1_MMU_DISABLED) { + wr->ur = wr->uw = wr->ux = true; + wr->pr = wr->pw = wr->px = true; + } else { + ret = walk_s1(vcpu, wi, wr, va); + if (ret) + return ret; + + compute_s1_permissions(vcpu, wi, wr); + } + + return 0; +} + +struct desc_match { + u64 ipa; + int level; +}; + +static int match_s1_desc(struct s1_walk_context *ctxt, void *priv) +{ + struct desc_match *dm = priv; + u64 ipa = dm->ipa; + + /* Use S1 granule alignment */ + ipa &= GENMASK(51, ctxt->wi->pgshift); + + /* Not the IPA we're looking for? Continue. 
*/ + if (ipa != ctxt->table_ipa) + return 0; + + /* Note the level and interrupt the walk */ + dm->level = ctxt->level; + return -EINTR; +} + +int __kvm_find_s1_desc_level(struct kvm_vcpu *vcpu, u64 va, u64 ipa, int *level) +{ + struct desc_match dm = { + .ipa = ipa, + }; + struct s1_walk_info wi = { + .filter = &(struct s1_walk_filter){ + .fn = match_s1_desc, + .priv = &dm, + }, + .as_el0 = false, + .pan = false, + }; + struct s1_walk_result wr = {}; + int ret; + + if (is_hyp_ctxt(vcpu)) + wi.regime = vcpu_el2_e2h_is_set(vcpu) ? TR_EL20 : TR_EL2; + else + wi.regime = TR_EL10; + + ret = setup_s1_walk(vcpu, &wi, &wr, va); + if (ret) + return ret; + + /* We really expect the S1 MMU to be on here... */ + if (WARN_ON_ONCE(wr.level == S1_MMU_DISABLED)) { + *level = 0; + return 0; + } + + /* Walk the guest's PT, looking for a match along the way */ + ret = walk_s1(vcpu, &wi, &wr, va); + switch (ret) { + case -EINTR: + /* We interrupted the walk on a match, return the level */ + *level = dm.level; + return 0; + case 0: + /* The walk completed, we failed to find the entry */ + return -ENOENT; + default: + /* Any other error... */ + return ret; + } +} + +#ifdef CONFIG_ARM64_LSE_ATOMICS +static int __lse_swap_desc(u64 __user *ptep, u64 old, u64 new) +{ + u64 tmp = old; + int ret = 0; + + uaccess_enable_privileged(); + + asm volatile(__LSE_PREAMBLE + "1: cas %[old], %[new], %[addr]\n" + "2:\n" + _ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w[ret]) + : [old] "+r" (old), [addr] "+Q" (*ptep), [ret] "+r" (ret) + : [new] "r" (new) + : "memory"); + + uaccess_disable_privileged(); + + if (ret) + return ret; + if (tmp != old) + return -EAGAIN; + + return ret; +} +#else +static int __lse_swap_desc(u64 __user *ptep, u64 old, u64 new) +{ + return -EINVAL; +} +#endif + +static int __llsc_swap_desc(u64 __user *ptep, u64 old, u64 new) +{ + int ret = 1; + u64 tmp; + + uaccess_enable_privileged(); + + asm volatile("prfm pstl1strm, %[addr]\n" + "1: ldxr %[tmp], %[addr]\n" + "sub %[tmp], %[tmp], %[old]\n" + "cbnz %[tmp], 3f\n" + "2: stlxr %w[ret], %[new], %[addr]\n" + "3:\n" + _ASM_EXTABLE_UACCESS_ERR(1b, 3b, %w[ret]) + _ASM_EXTABLE_UACCESS_ERR(2b, 3b, %w[ret]) + : [ret] "+r" (ret), [addr] "+Q" (*ptep), [tmp] "=&r" (tmp) + : [old] "r" (old), [new] "r" (new) + : "memory"); + + uaccess_disable_privileged(); + + /* STLXR didn't update the descriptor, or the compare failed */ + if (ret == 1) + return -EAGAIN; + + return ret; +} + +int __kvm_at_swap_desc(struct kvm *kvm, gpa_t ipa, u64 old, u64 new) +{ + struct kvm_memory_slot *slot; + unsigned long hva; + u64 __user *ptep; + bool writable; + int offset; + gfn_t gfn; + int r; + + lockdep_assert(srcu_read_lock_held(&kvm->srcu)); + + gfn = ipa >> PAGE_SHIFT; + offset = offset_in_page(ipa); + slot = gfn_to_memslot(kvm, gfn); + hva = gfn_to_hva_memslot_prot(slot, gfn, &writable); + if (kvm_is_error_hva(hva)) + return -EINVAL; + if (!writable) + return -EPERM; + + ptep = (u64 __user *)hva + offset; + if (cpus_have_final_cap(ARM64_HAS_LSE_ATOMICS)) + r = __lse_swap_desc(ptep, old, new); + else + r = __llsc_swap_desc(ptep, old, new); + + if (r < 0) + return r; + + mark_page_dirty_in_slot(kvm, slot, gfn); + return 0; } |
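A minimal standalone sketch of the output-address reconstruction performed by desc_to_oa() in the diff above, assuming the same field layout the patch uses (descriptor bits [15:12] carry OA[51:48] in the 64K/LPA case, bits [9:8] carry OA[51:50] in the 4K/16K LPA2 case, per KVM_PTE_ADDR_51_48 and KVM_PTE_ADDR_51_50_LPA2). The helper name and the local GENMASK_ULL copy are illustrative only, not kernel code:

```c
#include <stdint.h>

/* Local stand-ins for the kernel macros used by desc_to_oa() above. */
#define GENMASK_ULL(h, l)	((~0ULL << (l)) & (~0ULL >> (63 - (h))))
#define ADDR_51_48		GENMASK_ULL(15, 12)	/* 64K granule, LPA      */
#define ADDR_51_50_LPA2		GENMASK_ULL(9, 8)	/* 4K/16K granule, LPA2  */

/*
 * Rebuild the output address from a stage-1 descriptor. 'pgshift' is
 * 12, 14 or 16; 'pa52bit' reflects what has_52bit_pa() decided during
 * walk setup.
 */
static uint64_t oa_from_desc(uint64_t desc, unsigned int pgshift, int pa52bit)
{
	uint64_t addr;

	if (!pa52bit)
		return desc & GENMASK_ULL(47, pgshift);

	if (pgshift == 12 || pgshift == 14) {
		/* LPA2: OA[49:pgshift] in place, OA[51:50] held in desc[9:8] */
		addr  = desc & GENMASK_ULL(49, pgshift);
		addr |= ((desc & ADDR_51_50_LPA2) >> 8) << 50;
	} else {
		/* LPA (64K and IMPDEF granules): OA[51:48] held in desc[15:12] */
		addr  = desc & GENMASK_ULL(47, pgshift);
		addr |= ((desc & ADDR_51_48) >> 12) << 48;
	}

	return addr;
}
```

The same split is why, for 52-bit PAs, setup_s1_walk() above repurposes TTBR.BADDR[5:2] to hold bits [51:48] of the first-level table and forces a 64-byte alignment on small top-level tables.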

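The hardware Access Flag handling added above follows a read / modify / compare-and-swap discipline: the walker ORs PTE_AF into a copy of the descriptor, __kvm_at_swap_desc() installs it only if the old value is still in place, and a lost race surfaces as -EAGAIN so the walk can be restarted (the "Race to update a descriptor" path in handle_at_slow()). Below is a small userspace sketch of that retry pattern, using the GCC/Clang __atomic builtins rather than the kernel's LSE/LL-SC uaccess helpers; the function names here are hypothetical, only the PTE_AF bit position and the -EAGAIN convention come from the diff:

```c
#include <stdint.h>
#include <errno.h>
#include <stdbool.h>

#define PTE_AF	(1ULL << 10)	/* Access Flag bit of a stage-1 descriptor */

/*
 * Set the Access Flag the way the walker above does: build the new
 * descriptor, then install it only if the old value is still there.
 * A lost race is reported as -EAGAIN instead of being papered over.
 */
static int set_access_flag(uint64_t *ptep)
{
	uint64_t old = __atomic_load_n(ptep, __ATOMIC_RELAXED);
	uint64_t new = old | PTE_AF;

	if (new == old)
		return 0;		/* AF already set, nothing to do */

	if (!__atomic_compare_exchange_n(ptep, &old, new, false,
					 __ATOMIC_RELEASE, __ATOMIC_RELAXED))
		return -EAGAIN;		/* descriptor changed under us */

	return 0;
}

/* Caller side: restart on -EAGAIN, as the AT emulation restarts its walk. */
static int update_af_with_retry(uint64_t *ptep)
{
	int ret;

	do {
		ret = set_access_flag(ptep);
	} while (ret == -EAGAIN);

	return ret;
}
```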