summaryrefslogtreecommitdiff
path: root/arch/arm64/kvm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/arm64/kvm')
-rw-r--r--arch/arm64/kvm/arch_timer.c14
-rw-r--r--arch/arm64/kvm/arm.c201
-rw-r--r--arch/arm64/kvm/fpsimd.c4
-rw-r--r--arch/arm64/kvm/hyp/include/hyp/switch.h101
-rw-r--r--arch/arm64/kvm/hyp/include/nvhe/ffa.h17
-rw-r--r--arch/arm64/kvm/hyp/include/nvhe/mem_protect.h3
-rw-r--r--arch/arm64/kvm/hyp/nvhe/Makefile2
-rw-r--r--arch/arm64/kvm/hyp/nvhe/ffa.c762
-rw-r--r--arch/arm64/kvm/hyp/nvhe/host.S36
-rw-r--r--arch/arm64/kvm/hyp/nvhe/hyp-init.S32
-rw-r--r--arch/arm64/kvm/hyp/nvhe/hyp-main.c19
-rw-r--r--arch/arm64/kvm/hyp/nvhe/mem_protect.c74
-rw-r--r--arch/arm64/kvm/hyp/nvhe/pkvm.c27
-rw-r--r--arch/arm64/kvm/hyp/nvhe/setup.c11
-rw-r--r--arch/arm64/kvm/hyp/nvhe/switch.c28
-rw-r--r--arch/arm64/kvm/hyp/nvhe/timer-sr.c16
-rw-r--r--arch/arm64/kvm/hyp/nvhe/tlb.c52
-rw-r--r--arch/arm64/kvm/hyp/pgtable.c228
-rw-r--r--arch/arm64/kvm/hyp/vhe/switch.c2
-rw-r--r--arch/arm64/kvm/hyp/vhe/tlb.c32
-rw-r--r--arch/arm64/kvm/mmu.c207
-rw-r--r--arch/arm64/kvm/pkvm.c1
-rw-r--r--arch/arm64/kvm/reset.c58
-rw-r--r--arch/arm64/kvm/sys_regs.c505
-rw-r--r--arch/arm64/kvm/sys_regs.h22
25 files changed, 2075 insertions, 379 deletions
diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c
index 05b022be885b..0696732fa38c 100644
--- a/arch/arm64/kvm/arch_timer.c
+++ b/arch/arm64/kvm/arch_timer.c
@@ -1406,7 +1406,7 @@ int __init kvm_timer_hyp_init(bool has_gic)
kvm_get_running_vcpus());
if (err) {
kvm_err("kvm_arch_timer: error setting vcpu affinity\n");
- goto out_free_irq;
+ goto out_free_vtimer_irq;
}
static_branch_enable(&has_gic_active_state);
@@ -1422,7 +1422,7 @@ int __init kvm_timer_hyp_init(bool has_gic)
if (err) {
kvm_err("kvm_arch_timer: can't request ptimer interrupt %d (%d)\n",
host_ptimer_irq, err);
- return err;
+ goto out_free_vtimer_irq;
}
if (has_gic) {
@@ -1430,7 +1430,7 @@ int __init kvm_timer_hyp_init(bool has_gic)
kvm_get_running_vcpus());
if (err) {
kvm_err("kvm_arch_timer: error setting vcpu affinity\n");
- goto out_free_irq;
+ goto out_free_ptimer_irq;
}
}
@@ -1439,11 +1439,15 @@ int __init kvm_timer_hyp_init(bool has_gic)
kvm_err("kvm_arch_timer: invalid physical timer IRQ: %d\n",
info->physical_irq);
err = -ENODEV;
- goto out_free_irq;
+ goto out_free_vtimer_irq;
}
return 0;
-out_free_irq:
+
+out_free_ptimer_irq:
+ if (info->physical_irq > 0)
+ free_percpu_irq(host_ptimer_irq, kvm_get_running_vcpus());
+out_free_vtimer_irq:
free_percpu_irq(host_vtimer_irq, kvm_get_running_vcpus());
return err;
}
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 14391826241c..c2c14059f6a8 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -51,6 +51,8 @@ DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector);
DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
DECLARE_KVM_NVHE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
+DECLARE_KVM_NVHE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
+
static bool vgic_present;
static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled);
@@ -65,6 +67,7 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
struct kvm_enable_cap *cap)
{
int r;
+ u64 new_cap;
if (cap->flags)
return -EINVAL;
@@ -89,6 +92,24 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
r = 0;
set_bit(KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED, &kvm->arch.flags);
break;
+ case KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE:
+ new_cap = cap->args[0];
+
+ mutex_lock(&kvm->slots_lock);
+ /*
+ * To keep things simple, allow changing the chunk
+ * size only when no memory slots have been created.
+ */
+ if (!kvm_are_all_memslots_empty(kvm)) {
+ r = -EINVAL;
+ } else if (new_cap && !kvm_is_block_size_supported(new_cap)) {
+ r = -EINVAL;
+ } else {
+ r = 0;
+ kvm->arch.mmu.split_page_chunk_size = new_cap;
+ }
+ mutex_unlock(&kvm->slots_lock);
+ break;
default:
r = -EINVAL;
break;
@@ -102,22 +123,6 @@ static int kvm_arm_default_max_vcpus(void)
return vgic_present ? kvm_vgic_get_max_vcpus() : KVM_MAX_VCPUS;
}
-static void set_default_spectre(struct kvm *kvm)
-{
- /*
- * The default is to expose CSV2 == 1 if the HW isn't affected.
- * Although this is a per-CPU feature, we make it global because
- * asymmetric systems are just a nuisance.
- *
- * Userspace can override this as long as it doesn't promise
- * the impossible.
- */
- if (arm64_get_spectre_v2_state() == SPECTRE_UNAFFECTED)
- kvm->arch.pfr0_csv2 = 1;
- if (arm64_get_meltdown_state() == SPECTRE_UNAFFECTED)
- kvm->arch.pfr0_csv3 = 1;
-}
-
/**
* kvm_arch_init_vm - initializes a VM data structure
* @kvm: pointer to the KVM struct
@@ -161,14 +166,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
/* The maximum number of VCPUs is limited by the host's GIC model */
kvm->max_vcpus = kvm_arm_default_max_vcpus();
- set_default_spectre(kvm);
kvm_arm_init_hypercalls(kvm);
- /*
- * Initialise the default PMUver before there is a chance to
- * create an actual PMU.
- */
- kvm->arch.dfr0_pmuver.imp = kvm_arm_pmu_get_pmuver_limit();
+ bitmap_zero(kvm->arch.vcpu_features, KVM_VCPU_MAX_FEATURES);
return 0;
@@ -302,6 +302,15 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_ARM_PTRAUTH_GENERIC:
r = system_has_full_ptr_auth();
break;
+ case KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE:
+ if (kvm)
+ r = kvm->arch.mmu.split_page_chunk_size;
+ else
+ r = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT;
+ break;
+ case KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES:
+ r = kvm_supported_block_sizes();
+ break;
default:
r = 0;
}
@@ -1167,58 +1176,115 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
return -EINVAL;
}
-static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
- const struct kvm_vcpu_init *init)
+static int kvm_vcpu_init_check_features(struct kvm_vcpu *vcpu,
+ const struct kvm_vcpu_init *init)
{
- unsigned int i, ret;
- u32 phys_target = kvm_target_cpu();
+ unsigned long features = init->features[0];
+ int i;
+
+ if (features & ~KVM_VCPU_VALID_FEATURES)
+ return -ENOENT;
+
+ for (i = 1; i < ARRAY_SIZE(init->features); i++) {
+ if (init->features[i])
+ return -ENOENT;
+ }
+
+ if (!test_bit(KVM_ARM_VCPU_EL1_32BIT, &features))
+ return 0;
- if (init->target != phys_target)
+ if (!cpus_have_const_cap(ARM64_HAS_32BIT_EL1))
return -EINVAL;
- /*
- * Secondary and subsequent calls to KVM_ARM_VCPU_INIT must
- * use the same target.
- */
- if (vcpu->arch.target != -1 && vcpu->arch.target != init->target)
+ /* MTE is incompatible with AArch32 */
+ if (kvm_has_mte(vcpu->kvm))
return -EINVAL;
- /* -ENOENT for unknown features, -EINVAL for invalid combinations. */
- for (i = 0; i < sizeof(init->features) * 8; i++) {
- bool set = (init->features[i / 32] & (1 << (i % 32)));
+ /* NV is incompatible with AArch32 */
+ if (test_bit(KVM_ARM_VCPU_HAS_EL2, &features))
+ return -EINVAL;
- if (set && i >= KVM_VCPU_MAX_FEATURES)
- return -ENOENT;
+ return 0;
+}
- /*
- * Secondary and subsequent calls to KVM_ARM_VCPU_INIT must
- * use the same feature set.
- */
- if (vcpu->arch.target != -1 && i < KVM_VCPU_MAX_FEATURES &&
- test_bit(i, vcpu->arch.features) != set)
- return -EINVAL;
+static bool kvm_vcpu_init_changed(struct kvm_vcpu *vcpu,
+ const struct kvm_vcpu_init *init)
+{
+ unsigned long features = init->features[0];
- if (set)
- set_bit(i, vcpu->arch.features);
- }
+ return !bitmap_equal(vcpu->arch.features, &features, KVM_VCPU_MAX_FEATURES) ||
+ vcpu->arch.target != init->target;
+}
+
+static int __kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
+ const struct kvm_vcpu_init *init)
+{
+ unsigned long features = init->features[0];
+ struct kvm *kvm = vcpu->kvm;
+ int ret = -EINVAL;
+
+ mutex_lock(&kvm->arch.config_lock);
- vcpu->arch.target = phys_target;
+ if (test_bit(KVM_ARCH_FLAG_VCPU_FEATURES_CONFIGURED, &kvm->arch.flags) &&
+ !bitmap_equal(kvm->arch.vcpu_features, &features, KVM_VCPU_MAX_FEATURES))
+ goto out_unlock;
+
+ vcpu->arch.target = init->target;
+ bitmap_copy(vcpu->arch.features, &features, KVM_VCPU_MAX_FEATURES);
/* Now we know what it is, we can reset it. */
ret = kvm_reset_vcpu(vcpu);
if (ret) {
vcpu->arch.target = -1;
bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES);
+ goto out_unlock;
}
+ bitmap_copy(kvm->arch.vcpu_features, &features, KVM_VCPU_MAX_FEATURES);
+ set_bit(KVM_ARCH_FLAG_VCPU_FEATURES_CONFIGURED, &kvm->arch.flags);
+
+out_unlock:
+ mutex_unlock(&kvm->arch.config_lock);
return ret;
}
+static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
+ const struct kvm_vcpu_init *init)
+{
+ int ret;
+
+ if (init->target != kvm_target_cpu())
+ return -EINVAL;
+
+ ret = kvm_vcpu_init_check_features(vcpu, init);
+ if (ret)
+ return ret;
+
+ if (vcpu->arch.target == -1)
+ return __kvm_vcpu_set_target(vcpu, init);
+
+ if (kvm_vcpu_init_changed(vcpu, init))
+ return -EINVAL;
+
+ return kvm_reset_vcpu(vcpu);
+}
+
static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu,
struct kvm_vcpu_init *init)
{
+ bool power_off = false;
int ret;
+ /*
+ * Treat the power-off vCPU feature as ephemeral. Clear the bit to avoid
+ * reflecting it in the finalized feature set, thus limiting its scope
+ * to a single KVM_ARM_VCPU_INIT call.
+ */
+ if (init->features[0] & BIT(KVM_ARM_VCPU_POWER_OFF)) {
+ init->features[0] &= ~BIT(KVM_ARM_VCPU_POWER_OFF);
+ power_off = true;
+ }
+
ret = kvm_vcpu_set_target(vcpu, init);
if (ret)
return ret;
@@ -1240,14 +1306,14 @@ static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu,
}
vcpu_reset_hcr(vcpu);
- vcpu->arch.cptr_el2 = CPTR_EL2_DEFAULT;
+ vcpu->arch.cptr_el2 = kvm_get_reset_cptr_el2(vcpu);
/*
* Handle the "start in power-off" case.
*/
spin_lock(&vcpu->arch.mp_state_lock);
- if (test_bit(KVM_ARM_VCPU_POWER_OFF, vcpu->arch.features))
+ if (power_off)
__kvm_arm_vcpu_power_off(vcpu);
else
WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_RUNNABLE);
@@ -1666,7 +1732,13 @@ static void __init cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits)
params->mair_el2 = read_sysreg(mair_el1);
- tcr = (read_sysreg(tcr_el1) & TCR_EL2_MASK) | TCR_EL2_RES1;
+ tcr = read_sysreg(tcr_el1);
+ if (cpus_have_final_cap(ARM64_KVM_HVHE)) {
+ tcr |= TCR_EPD1_MASK;
+ } else {
+ tcr &= TCR_EL2_MASK;
+ tcr |= TCR_EL2_RES1;
+ }
tcr &= ~TCR_T0SZ_MASK;
tcr |= TCR_T0SZ(hyp_va_bits);
params->tcr_el2 = tcr;
@@ -1676,6 +1748,8 @@ static void __init cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits)
params->hcr_el2 = HCR_HOST_NVHE_PROTECTED_FLAGS;
else
params->hcr_el2 = HCR_HOST_NVHE_FLAGS;
+ if (cpus_have_final_cap(ARM64_KVM_HVHE))
+ params->hcr_el2 |= HCR_E2H;
params->vttbr = params->vtcr = 0;
/*
@@ -1910,6 +1984,7 @@ static bool __init init_psci_relay(void)
}
kvm_host_psci_config.version = psci_ops.get_version();
+ kvm_host_psci_config.smccc_version = arm_smccc_get_version();
if (kvm_host_psci_config.version == PSCI_VERSION(0, 1)) {
kvm_host_psci_config.function_ids_0_1 = get_psci_0_1_function_ids();
@@ -2067,6 +2142,26 @@ static int __init kvm_hyp_init_protection(u32 hyp_va_bits)
return 0;
}
+static void pkvm_hyp_init_ptrauth(void)
+{
+ struct kvm_cpu_context *hyp_ctxt;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ hyp_ctxt = per_cpu_ptr_nvhe_sym(kvm_hyp_ctxt, cpu);
+ hyp_ctxt->sys_regs[APIAKEYLO_EL1] = get_random_long();
+ hyp_ctxt->sys_regs[APIAKEYHI_EL1] = get_random_long();
+ hyp_ctxt->sys_regs[APIBKEYLO_EL1] = get_random_long();
+ hyp_ctxt->sys_regs[APIBKEYHI_EL1] = get_random_long();
+ hyp_ctxt->sys_regs[APDAKEYLO_EL1] = get_random_long();
+ hyp_ctxt->sys_regs[APDAKEYHI_EL1] = get_random_long();
+ hyp_ctxt->sys_regs[APDBKEYLO_EL1] = get_random_long();
+ hyp_ctxt->sys_regs[APDBKEYHI_EL1] = get_random_long();
+ hyp_ctxt->sys_regs[APGAKEYLO_EL1] = get_random_long();
+ hyp_ctxt->sys_regs[APGAKEYHI_EL1] = get_random_long();
+ }
+}
+
/* Inits Hyp-mode on all online CPUs */
static int __init init_hyp_mode(void)
{
@@ -2228,6 +2323,10 @@ static int __init init_hyp_mode(void)
kvm_hyp_init_symbols();
if (is_protected_kvm_enabled()) {
+ if (IS_ENABLED(CONFIG_ARM64_PTR_AUTH_KERNEL) &&
+ cpus_have_const_cap(ARM64_HAS_ADDRESS_AUTH))
+ pkvm_hyp_init_ptrauth();
+
init_cpu_logical_map();
if (!init_psci_relay()) {
diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c
index 4c9dcd8fc939..8c1d0d4853df 100644
--- a/arch/arm64/kvm/fpsimd.c
+++ b/arch/arm64/kvm/fpsimd.c
@@ -180,7 +180,7 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu)
/*
* If we have VHE then the Hyp code will reset CPACR_EL1 to
- * CPACR_EL1_DEFAULT and we need to reenable SME.
+ * the default value and we need to reenable SME.
*/
if (has_vhe() && system_supports_sme()) {
/* Also restore EL0 state seen on entry */
@@ -210,7 +210,7 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu)
/*
* The FPSIMD/SVE state in the CPU has not been touched, and we
* have SVE (and VHE): CPACR_EL1 (alias CPTR_EL2) has been
- * reset to CPACR_EL1_DEFAULT by the Hyp code, disabling SVE
+ * reset by kvm_reset_cptr_el2() in the Hyp code, disabling SVE
* for EL0. To avoid spurious traps, restore the trap state
* seen by kvm_arch_vcpu_load_fp():
*/
diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
index 4fe217efa218..f35d5abedf9c 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -70,6 +70,56 @@ static inline void __activate_traps_fpsimd32(struct kvm_vcpu *vcpu)
}
}
+static inline bool __hfgxtr_traps_required(void)
+{
+ if (cpus_have_final_cap(ARM64_SME))
+ return true;
+
+ if (cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38))
+ return true;
+
+ return false;
+}
+
+static inline void __activate_traps_hfgxtr(void)
+{
+ u64 r_clr = 0, w_clr = 0, r_set = 0, w_set = 0, tmp;
+
+ if (cpus_have_final_cap(ARM64_SME)) {
+ tmp = HFGxTR_EL2_nSMPRI_EL1_MASK | HFGxTR_EL2_nTPIDR2_EL0_MASK;
+
+ r_clr |= tmp;
+ w_clr |= tmp;
+ }
+
+ /*
+ * Trap guest writes to TCR_EL1 to prevent it from enabling HA or HD.
+ */
+ if (cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38))
+ w_set |= HFGxTR_EL2_TCR_EL1_MASK;
+
+ sysreg_clear_set_s(SYS_HFGRTR_EL2, r_clr, r_set);
+ sysreg_clear_set_s(SYS_HFGWTR_EL2, w_clr, w_set);
+}
+
+static inline void __deactivate_traps_hfgxtr(void)
+{
+ u64 r_clr = 0, w_clr = 0, r_set = 0, w_set = 0, tmp;
+
+ if (cpus_have_final_cap(ARM64_SME)) {
+ tmp = HFGxTR_EL2_nSMPRI_EL1_MASK | HFGxTR_EL2_nTPIDR2_EL0_MASK;
+
+ r_set |= tmp;
+ w_set |= tmp;
+ }
+
+ if (cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38))
+ w_clr |= HFGxTR_EL2_TCR_EL1_MASK;
+
+ sysreg_clear_set_s(SYS_HFGRTR_EL2, r_clr, r_set);
+ sysreg_clear_set_s(SYS_HFGWTR_EL2, w_clr, w_set);
+}
+
static inline void __activate_traps_common(struct kvm_vcpu *vcpu)
{
/* Trap on AArch32 cp15 c15 (impdef sysregs) accesses (EL1 or EL0) */
@@ -95,16 +145,8 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu)
vcpu->arch.mdcr_el2_host = read_sysreg(mdcr_el2);
write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2);
- if (cpus_have_final_cap(ARM64_SME)) {
- sysreg_clear_set_s(SYS_HFGRTR_EL2,
- HFGxTR_EL2_nSMPRI_EL1_MASK |
- HFGxTR_EL2_nTPIDR2_EL0_MASK,
- 0);
- sysreg_clear_set_s(SYS_HFGWTR_EL2,
- HFGxTR_EL2_nSMPRI_EL1_MASK |
- HFGxTR_EL2_nTPIDR2_EL0_MASK,
- 0);
- }
+ if (__hfgxtr_traps_required())
+ __activate_traps_hfgxtr();
}
static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu)
@@ -120,14 +162,8 @@ static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu)
vcpu_clear_flag(vcpu, PMUSERENR_ON_CPU);
}
- if (cpus_have_final_cap(ARM64_SME)) {
- sysreg_clear_set_s(SYS_HFGRTR_EL2, 0,
- HFGxTR_EL2_nSMPRI_EL1_MASK |
- HFGxTR_EL2_nTPIDR2_EL0_MASK);
- sysreg_clear_set_s(SYS_HFGWTR_EL2, 0,
- HFGxTR_EL2_nSMPRI_EL1_MASK |
- HFGxTR_EL2_nTPIDR2_EL0_MASK);
- }
+ if (__hfgxtr_traps_required())
+ __deactivate_traps_hfgxtr();
}
static inline void ___activate_traps(struct kvm_vcpu *vcpu)
@@ -203,7 +239,7 @@ static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code)
/* Valid trap. Switch the context: */
/* First disable enough traps to allow us to update the registers */
- if (has_vhe()) {
+ if (has_vhe() || has_hvhe()) {
reg = CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN;
if (sve_guest)
reg |= CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN;
@@ -395,12 +431,39 @@ static bool kvm_hyp_handle_cntpct(struct kvm_vcpu *vcpu)
return true;
}
+static bool handle_ampere1_tcr(struct kvm_vcpu *vcpu)
+{
+ u32 sysreg = esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu));
+ int rt = kvm_vcpu_sys_get_rt(vcpu);
+ u64 val = vcpu_get_reg(vcpu, rt);
+
+ if (sysreg != SYS_TCR_EL1)
+ return false;
+
+ /*
+ * Affected parts do not advertise support for hardware Access Flag /
+ * Dirty state management in ID_AA64MMFR1_EL1.HAFDBS, but the underlying
+ * control bits are still functional. The architecture requires these be
+ * RES0 on systems that do not implement FEAT_HAFDBS.
+ *
+ * Uphold the requirements of the architecture by masking guest writes
+ * to TCR_EL1.{HA,HD} here.
+ */
+ val &= ~(TCR_HD | TCR_HA);
+ write_sysreg_el1(val, SYS_TCR);
+ return true;
+}
+
static bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code)
{
if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_TX2_219_TVM) &&
handle_tx2_tvm(vcpu))
return true;
+ if (cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38) &&
+ handle_ampere1_tcr(vcpu))
+ return true;
+
if (static_branch_unlikely(&vgic_v3_cpuif_trap) &&
__vgic_v3_perform_cpuif_access(vcpu) == 1)
return true;
diff --git a/arch/arm64/kvm/hyp/include/nvhe/ffa.h b/arch/arm64/kvm/hyp/include/nvhe/ffa.h
new file mode 100644
index 000000000000..1becb10ecd80
--- /dev/null
+++ b/arch/arm64/kvm/hyp/include/nvhe/ffa.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2022 - Google LLC
+ * Author: Andrew Walbran <qwandor@google.com>
+ */
+#ifndef __KVM_HYP_FFA_H
+#define __KVM_HYP_FFA_H
+
+#include <asm/kvm_host.h>
+
+#define FFA_MIN_FUNC_NUM 0x60
+#define FFA_MAX_FUNC_NUM 0x7F
+
+int hyp_ffa_init(void *pages);
+bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt);
+
+#endif /* __KVM_HYP_FFA_H */
diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
index b7bdbe63deed..0972faccc2af 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
@@ -57,6 +57,7 @@ extern struct host_mmu host_mmu;
enum pkvm_component_id {
PKVM_ID_HOST,
PKVM_ID_HYP,
+ PKVM_ID_FFA,
};
extern unsigned long hyp_nr_cpus;
@@ -66,6 +67,8 @@ int __pkvm_host_share_hyp(u64 pfn);
int __pkvm_host_unshare_hyp(u64 pfn);
int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages);
int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages);
+int __pkvm_host_share_ffa(u64 pfn, u64 nr_pages);
+int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages);
bool addr_is_memory(phys_addr_t phys);
int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot);
diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile
index 530347cdebe3..9ddc025e4b86 100644
--- a/arch/arm64/kvm/hyp/nvhe/Makefile
+++ b/arch/arm64/kvm/hyp/nvhe/Makefile
@@ -22,7 +22,7 @@ lib-objs := $(addprefix ../../../lib/, $(lib-objs))
hyp-obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o \
hyp-main.o hyp-smp.o psci-relay.o early_alloc.o page_alloc.o \
- cache.o setup.o mm.o mem_protect.o sys_regs.o pkvm.o stacktrace.o
+ cache.o setup.o mm.o mem_protect.o sys_regs.o pkvm.o stacktrace.o ffa.o
hyp-obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \
../fpsimd.o ../hyp-entry.o ../exception.o ../pgtable.o
hyp-obj-$(CONFIG_DEBUG_LIST) += list_debug.o
diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c
new file mode 100644
index 000000000000..58dcd92bf346
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/ffa.c
@@ -0,0 +1,762 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * FF-A v1.0 proxy to filter out invalid memory-sharing SMC calls issued by
+ * the host. FF-A is a slightly more palatable abbreviation of "Arm Firmware
+ * Framework for Arm A-profile", which is specified by Arm in document
+ * number DEN0077.
+ *
+ * Copyright (C) 2022 - Google LLC
+ * Author: Andrew Walbran <qwandor@google.com>
+ *
+ * This driver hooks into the SMC trapping logic for the host and intercepts
+ * all calls falling within the FF-A range. Each call is either:
+ *
+ * - Forwarded on unmodified to the SPMD at EL3
+ * - Rejected as "unsupported"
+ * - Accompanied by a host stage-2 page-table check/update and reissued
+ *
+ * Consequently, any attempts by the host to make guest memory pages
+ * accessible to the secure world using FF-A will be detected either here
+ * (in the case that the memory is already owned by the guest) or during
+ * donation to the guest (in the case that the memory was previously shared
+ * with the secure world).
+ *
+ * To allow the rolling-back of page-table updates and FF-A calls in the
+ * event of failure, operations involving the RXTX buffers are locked for
+ * the duration and are therefore serialised.
+ */
+
+#include <linux/arm-smccc.h>
+#include <linux/arm_ffa.h>
+#include <asm/kvm_pkvm.h>
+
+#include <nvhe/ffa.h>
+#include <nvhe/mem_protect.h>
+#include <nvhe/memory.h>
+#include <nvhe/trap_handler.h>
+#include <nvhe/spinlock.h>
+
+/*
+ * "ID value 0 must be returned at the Non-secure physical FF-A instance"
+ * We share this ID with the host.
+ */
+#define HOST_FFA_ID 0
+
+/*
+ * A buffer to hold the maximum descriptor size we can see from the host,
+ * which is required when the SPMD returns a fragmented FFA_MEM_RETRIEVE_RESP
+ * when resolving the handle on the reclaim path.
+ */
+struct kvm_ffa_descriptor_buffer {
+ void *buf;
+ size_t len;
+};
+
+static struct kvm_ffa_descriptor_buffer ffa_desc_buf;
+
+struct kvm_ffa_buffers {
+ hyp_spinlock_t lock;
+ void *tx;
+ void *rx;
+};
+
+/*
+ * Note that we don't currently lock these buffers explicitly, instead
+ * relying on the locking of the host FFA buffers as we only have one
+ * client.
+ */
+static struct kvm_ffa_buffers hyp_buffers;
+static struct kvm_ffa_buffers host_buffers;
+
+static void ffa_to_smccc_error(struct arm_smccc_res *res, u64 ffa_errno)
+{
+ *res = (struct arm_smccc_res) {
+ .a0 = FFA_ERROR,
+ .a2 = ffa_errno,
+ };
+}
+
+static void ffa_to_smccc_res_prop(struct arm_smccc_res *res, int ret, u64 prop)
+{
+ if (ret == FFA_RET_SUCCESS) {
+ *res = (struct arm_smccc_res) { .a0 = FFA_SUCCESS,
+ .a2 = prop };
+ } else {
+ ffa_to_smccc_error(res, ret);
+ }
+}
+
+static void ffa_to_smccc_res(struct arm_smccc_res *res, int ret)
+{
+ ffa_to_smccc_res_prop(res, ret, 0);
+}
+
+static void ffa_set_retval(struct kvm_cpu_context *ctxt,
+ struct arm_smccc_res *res)
+{
+ cpu_reg(ctxt, 0) = res->a0;
+ cpu_reg(ctxt, 1) = res->a1;
+ cpu_reg(ctxt, 2) = res->a2;
+ cpu_reg(ctxt, 3) = res->a3;
+}
+
+static bool is_ffa_call(u64 func_id)
+{
+ return ARM_SMCCC_IS_FAST_CALL(func_id) &&
+ ARM_SMCCC_OWNER_NUM(func_id) == ARM_SMCCC_OWNER_STANDARD &&
+ ARM_SMCCC_FUNC_NUM(func_id) >= FFA_MIN_FUNC_NUM &&
+ ARM_SMCCC_FUNC_NUM(func_id) <= FFA_MAX_FUNC_NUM;
+}
+
+static int ffa_map_hyp_buffers(u64 ffa_page_count)
+{
+ struct arm_smccc_res res;
+
+ arm_smccc_1_1_smc(FFA_FN64_RXTX_MAP,
+ hyp_virt_to_phys(hyp_buffers.tx),
+ hyp_virt_to_phys(hyp_buffers.rx),
+ ffa_page_count,
+ 0, 0, 0, 0,
+ &res);
+
+ return res.a0 == FFA_SUCCESS ? FFA_RET_SUCCESS : res.a2;
+}
+
+static int ffa_unmap_hyp_buffers(void)
+{
+ struct arm_smccc_res res;
+
+ arm_smccc_1_1_smc(FFA_RXTX_UNMAP,
+ HOST_FFA_ID,
+ 0, 0, 0, 0, 0, 0,
+ &res);
+
+ return res.a0 == FFA_SUCCESS ? FFA_RET_SUCCESS : res.a2;
+}
+
+static void ffa_mem_frag_tx(struct arm_smccc_res *res, u32 handle_lo,
+ u32 handle_hi, u32 fraglen, u32 endpoint_id)
+{
+ arm_smccc_1_1_smc(FFA_MEM_FRAG_TX,
+ handle_lo, handle_hi, fraglen, endpoint_id,
+ 0, 0, 0,
+ res);
+}
+
+static void ffa_mem_frag_rx(struct arm_smccc_res *res, u32 handle_lo,
+ u32 handle_hi, u32 fragoff)
+{
+ arm_smccc_1_1_smc(FFA_MEM_FRAG_RX,
+ handle_lo, handle_hi, fragoff, HOST_FFA_ID,
+ 0, 0, 0,
+ res);
+}
+
+static void ffa_mem_xfer(struct arm_smccc_res *res, u64 func_id, u32 len,
+ u32 fraglen)
+{
+ arm_smccc_1_1_smc(func_id, len, fraglen,
+ 0, 0, 0, 0, 0,
+ res);
+}
+
+static void ffa_mem_reclaim(struct arm_smccc_res *res, u32 handle_lo,
+ u32 handle_hi, u32 flags)
+{
+ arm_smccc_1_1_smc(FFA_MEM_RECLAIM,
+ handle_lo, handle_hi, flags,
+ 0, 0, 0, 0,
+ res);
+}
+
+static void ffa_retrieve_req(struct arm_smccc_res *res, u32 len)
+{
+ arm_smccc_1_1_smc(FFA_FN64_MEM_RETRIEVE_REQ,
+ len, len,
+ 0, 0, 0, 0, 0,
+ res);
+}
+
+static void do_ffa_rxtx_map(struct arm_smccc_res *res,
+ struct kvm_cpu_context *ctxt)
+{
+ DECLARE_REG(phys_addr_t, tx, ctxt, 1);
+ DECLARE_REG(phys_addr_t, rx, ctxt, 2);
+ DECLARE_REG(u32, npages, ctxt, 3);
+ int ret = 0;
+ void *rx_virt, *tx_virt;
+
+ if (npages != (KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE) / FFA_PAGE_SIZE) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto out;
+ }
+
+ if (!PAGE_ALIGNED(tx) || !PAGE_ALIGNED(rx)) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto out;
+ }
+
+ hyp_spin_lock(&host_buffers.lock);
+ if (host_buffers.tx) {
+ ret = FFA_RET_DENIED;
+ goto out_unlock;
+ }
+
+ /*
+ * Map our hypervisor buffers into the SPMD before mapping and
+ * pinning the host buffers in our own address space.
+ */
+ ret = ffa_map_hyp_buffers(npages);
+ if (ret)
+ goto out_unlock;
+
+ ret = __pkvm_host_share_hyp(hyp_phys_to_pfn(tx));
+ if (ret) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto err_unmap;
+ }
+
+ ret = __pkvm_host_share_hyp(hyp_phys_to_pfn(rx));
+ if (ret) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto err_unshare_tx;
+ }
+
+ tx_virt = hyp_phys_to_virt(tx);
+ ret = hyp_pin_shared_mem(tx_virt, tx_virt + 1);
+ if (ret) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto err_unshare_rx;
+ }
+
+ rx_virt = hyp_phys_to_virt(rx);
+ ret = hyp_pin_shared_mem(rx_virt, rx_virt + 1);
+ if (ret) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto err_unpin_tx;
+ }
+
+ host_buffers.tx = tx_virt;
+ host_buffers.rx = rx_virt;
+
+out_unlock:
+ hyp_spin_unlock(&host_buffers.lock);
+out:
+ ffa_to_smccc_res(res, ret);
+ return;
+
+err_unpin_tx:
+ hyp_unpin_shared_mem(tx_virt, tx_virt + 1);
+err_unshare_rx:
+ __pkvm_host_unshare_hyp(hyp_phys_to_pfn(rx));
+err_unshare_tx:
+ __pkvm_host_unshare_hyp(hyp_phys_to_pfn(tx));
+err_unmap:
+ ffa_unmap_hyp_buffers();
+ goto out_unlock;
+}
+
+static void do_ffa_rxtx_unmap(struct arm_smccc_res *res,
+ struct kvm_cpu_context *ctxt)
+{
+ DECLARE_REG(u32, id, ctxt, 1);
+ int ret = 0;
+
+ if (id != HOST_FFA_ID) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto out;
+ }
+
+ hyp_spin_lock(&host_buffers.lock);
+ if (!host_buffers.tx) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto out_unlock;
+ }
+
+ hyp_unpin_shared_mem(host_buffers.tx, host_buffers.tx + 1);
+ WARN_ON(__pkvm_host_unshare_hyp(hyp_virt_to_pfn(host_buffers.tx)));
+ host_buffers.tx = NULL;
+
+ hyp_unpin_shared_mem(host_buffers.rx, host_buffers.rx + 1);
+ WARN_ON(__pkvm_host_unshare_hyp(hyp_virt_to_pfn(host_buffers.rx)));
+ host_buffers.rx = NULL;
+
+ ffa_unmap_hyp_buffers();
+
+out_unlock:
+ hyp_spin_unlock(&host_buffers.lock);
+out:
+ ffa_to_smccc_res(res, ret);
+}
+
+static u32 __ffa_host_share_ranges(struct ffa_mem_region_addr_range *ranges,
+ u32 nranges)
+{
+ u32 i;
+
+ for (i = 0; i < nranges; ++i) {
+ struct ffa_mem_region_addr_range *range = &ranges[i];
+ u64 sz = (u64)range->pg_cnt * FFA_PAGE_SIZE;
+ u64 pfn = hyp_phys_to_pfn(range->address);
+
+ if (!PAGE_ALIGNED(sz))
+ break;
+
+ if (__pkvm_host_share_ffa(pfn, sz / PAGE_SIZE))
+ break;
+ }
+
+ return i;
+}
+
+static u32 __ffa_host_unshare_ranges(struct ffa_mem_region_addr_range *ranges,
+ u32 nranges)
+{
+ u32 i;
+
+ for (i = 0; i < nranges; ++i) {
+ struct ffa_mem_region_addr_range *range = &ranges[i];
+ u64 sz = (u64)range->pg_cnt * FFA_PAGE_SIZE;
+ u64 pfn = hyp_phys_to_pfn(range->address);
+
+ if (!PAGE_ALIGNED(sz))
+ break;
+
+ if (__pkvm_host_unshare_ffa(pfn, sz / PAGE_SIZE))
+ break;
+ }
+
+ return i;
+}
+
+static int ffa_host_share_ranges(struct ffa_mem_region_addr_range *ranges,
+ u32 nranges)
+{
+ u32 nshared = __ffa_host_share_ranges(ranges, nranges);
+ int ret = 0;
+
+ if (nshared != nranges) {
+ WARN_ON(__ffa_host_unshare_ranges(ranges, nshared) != nshared);
+ ret = FFA_RET_DENIED;
+ }
+
+ return ret;
+}
+
+static int ffa_host_unshare_ranges(struct ffa_mem_region_addr_range *ranges,
+ u32 nranges)
+{
+ u32 nunshared = __ffa_host_unshare_ranges(ranges, nranges);
+ int ret = 0;
+
+ if (nunshared != nranges) {
+ WARN_ON(__ffa_host_share_ranges(ranges, nunshared) != nunshared);
+ ret = FFA_RET_DENIED;
+ }
+
+ return ret;
+}
+
+static void do_ffa_mem_frag_tx(struct arm_smccc_res *res,
+ struct kvm_cpu_context *ctxt)
+{
+ DECLARE_REG(u32, handle_lo, ctxt, 1);
+ DECLARE_REG(u32, handle_hi, ctxt, 2);
+ DECLARE_REG(u32, fraglen, ctxt, 3);
+ DECLARE_REG(u32, endpoint_id, ctxt, 4);
+ struct ffa_mem_region_addr_range *buf;
+ int ret = FFA_RET_INVALID_PARAMETERS;
+ u32 nr_ranges;
+
+ if (fraglen > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE)
+ goto out;
+
+ if (fraglen % sizeof(*buf))
+ goto out;
+
+ hyp_spin_lock(&host_buffers.lock);
+ if (!host_buffers.tx)
+ goto out_unlock;
+
+ buf = hyp_buffers.tx;
+ memcpy(buf, host_buffers.tx, fraglen);
+ nr_ranges = fraglen / sizeof(*buf);
+
+ ret = ffa_host_share_ranges(buf, nr_ranges);
+ if (ret) {
+ /*
+ * We're effectively aborting the transaction, so we need
+ * to restore the global state back to what it was prior to
+ * transmission of the first fragment.
+ */
+ ffa_mem_reclaim(res, handle_lo, handle_hi, 0);
+ WARN_ON(res->a0 != FFA_SUCCESS);
+ goto out_unlock;
+ }
+
+ ffa_mem_frag_tx(res, handle_lo, handle_hi, fraglen, endpoint_id);
+ if (res->a0 != FFA_SUCCESS && res->a0 != FFA_MEM_FRAG_RX)
+ WARN_ON(ffa_host_unshare_ranges(buf, nr_ranges));
+
+out_unlock:
+ hyp_spin_unlock(&host_buffers.lock);
+out:
+ if (ret)
+ ffa_to_smccc_res(res, ret);
+
+ /*
+ * If for any reason this did not succeed, we're in trouble as we have
+ * now lost the content of the previous fragments and we can't rollback
+ * the host stage-2 changes. The pages previously marked as shared will
+ * remain stuck in that state forever, hence preventing the host from
+ * sharing/donating them again and may possibly lead to subsequent
+ * failures, but this will not compromise confidentiality.
+ */
+ return;
+}
+
+static __always_inline void do_ffa_mem_xfer(const u64 func_id,
+ struct arm_smccc_res *res,
+ struct kvm_cpu_context *ctxt)
+{
+ DECLARE_REG(u32, len, ctxt, 1);
+ DECLARE_REG(u32, fraglen, ctxt, 2);
+ DECLARE_REG(u64, addr_mbz, ctxt, 3);
+ DECLARE_REG(u32, npages_mbz, ctxt, 4);
+ struct ffa_composite_mem_region *reg;
+ struct ffa_mem_region *buf;
+ u32 offset, nr_ranges;
+ int ret = 0;
+
+ BUILD_BUG_ON(func_id != FFA_FN64_MEM_SHARE &&
+ func_id != FFA_FN64_MEM_LEND);
+
+ if (addr_mbz || npages_mbz || fraglen > len ||
+ fraglen > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto out;
+ }
+
+ if (fraglen < sizeof(struct ffa_mem_region) +
+ sizeof(struct ffa_mem_region_attributes)) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto out;
+ }
+
+ hyp_spin_lock(&host_buffers.lock);
+ if (!host_buffers.tx) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto out_unlock;
+ }
+
+ buf = hyp_buffers.tx;
+ memcpy(buf, host_buffers.tx, fraglen);
+
+ offset = buf->ep_mem_access[0].composite_off;
+ if (!offset || buf->ep_count != 1 || buf->sender_id != HOST_FFA_ID) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto out_unlock;
+ }
+
+ if (fraglen < offset + sizeof(struct ffa_composite_mem_region)) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto out_unlock;
+ }
+
+ reg = (void *)buf + offset;
+ nr_ranges = ((void *)buf + fraglen) - (void *)reg->constituents;
+ if (nr_ranges % sizeof(reg->constituents[0])) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto out_unlock;
+ }
+
+ nr_ranges /= sizeof(reg->constituents[0]);
+ ret = ffa_host_share_ranges(reg->constituents, nr_ranges);
+ if (ret)
+ goto out_unlock;
+
+ ffa_mem_xfer(res, func_id, len, fraglen);
+ if (fraglen != len) {
+ if (res->a0 != FFA_MEM_FRAG_RX)
+ goto err_unshare;
+
+ if (res->a3 != fraglen)
+ goto err_unshare;
+ } else if (res->a0 != FFA_SUCCESS) {
+ goto err_unshare;
+ }
+
+out_unlock:
+ hyp_spin_unlock(&host_buffers.lock);
+out:
+ if (ret)
+ ffa_to_smccc_res(res, ret);
+ return;
+
+err_unshare:
+ WARN_ON(ffa_host_unshare_ranges(reg->constituents, nr_ranges));
+ goto out_unlock;
+}
+
+static void do_ffa_mem_reclaim(struct arm_smccc_res *res,
+ struct kvm_cpu_context *ctxt)
+{
+ DECLARE_REG(u32, handle_lo, ctxt, 1);
+ DECLARE_REG(u32, handle_hi, ctxt, 2);
+ DECLARE_REG(u32, flags, ctxt, 3);
+ struct ffa_composite_mem_region *reg;
+ u32 offset, len, fraglen, fragoff;
+ struct ffa_mem_region *buf;
+ int ret = 0;
+ u64 handle;
+
+ handle = PACK_HANDLE(handle_lo, handle_hi);
+
+ hyp_spin_lock(&host_buffers.lock);
+
+ buf = hyp_buffers.tx;
+ *buf = (struct ffa_mem_region) {
+ .sender_id = HOST_FFA_ID,
+ .handle = handle,
+ };
+
+ ffa_retrieve_req(res, sizeof(*buf));
+ buf = hyp_buffers.rx;
+ if (res->a0 != FFA_MEM_RETRIEVE_RESP)
+ goto out_unlock;
+
+ len = res->a1;
+ fraglen = res->a2;
+
+ offset = buf->ep_mem_access[0].composite_off;
+ /*
+ * We can trust the SPMD to get this right, but let's at least
+ * check that we end up with something that doesn't look _completely_
+ * bogus.
+ */
+ if (WARN_ON(offset > len ||
+ fraglen > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE)) {
+ ret = FFA_RET_ABORTED;
+ goto out_unlock;
+ }
+
+ if (len > ffa_desc_buf.len) {
+ ret = FFA_RET_NO_MEMORY;
+ goto out_unlock;
+ }
+
+ buf = ffa_desc_buf.buf;
+ memcpy(buf, hyp_buffers.rx, fraglen);
+
+ for (fragoff = fraglen; fragoff < len; fragoff += fraglen) {
+ ffa_mem_frag_rx(res, handle_lo, handle_hi, fragoff);
+ if (res->a0 != FFA_MEM_FRAG_TX) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto out_unlock;
+ }
+
+ fraglen = res->a3;
+ memcpy((void *)buf + fragoff, hyp_buffers.rx, fraglen);
+ }
+
+ ffa_mem_reclaim(res, handle_lo, handle_hi, flags);
+ if (res->a0 != FFA_SUCCESS)
+ goto out_unlock;
+
+ reg = (void *)buf + offset;
+ /* If the SPMD was happy, then we should be too. */
+ WARN_ON(ffa_host_unshare_ranges(reg->constituents,
+ reg->addr_range_cnt));
+out_unlock:
+ hyp_spin_unlock(&host_buffers.lock);
+
+ if (ret)
+ ffa_to_smccc_res(res, ret);
+}
+
+/*
+ * Is a given FFA function supported, either by forwarding on directly
+ * or by handling at EL2?
+ */
+static bool ffa_call_supported(u64 func_id)
+{
+ switch (func_id) {
+ /* Unsupported memory management calls */
+ case FFA_FN64_MEM_RETRIEVE_REQ:
+ case FFA_MEM_RETRIEVE_RESP:
+ case FFA_MEM_RELINQUISH:
+ case FFA_MEM_OP_PAUSE:
+ case FFA_MEM_OP_RESUME:
+ case FFA_MEM_FRAG_RX:
+ case FFA_FN64_MEM_DONATE:
+ /* Indirect message passing via RX/TX buffers */
+ case FFA_MSG_SEND:
+ case FFA_MSG_POLL:
+ case FFA_MSG_WAIT:
+ /* 32-bit variants of 64-bit calls */
+ case FFA_MSG_SEND_DIRECT_REQ:
+ case FFA_MSG_SEND_DIRECT_RESP:
+ case FFA_RXTX_MAP:
+ case FFA_MEM_DONATE:
+ case FFA_MEM_RETRIEVE_REQ:
+ return false;
+ }
+
+ return true;
+}
+
+static bool do_ffa_features(struct arm_smccc_res *res,
+ struct kvm_cpu_context *ctxt)
+{
+ DECLARE_REG(u32, id, ctxt, 1);
+ u64 prop = 0;
+ int ret = 0;
+
+ if (!ffa_call_supported(id)) {
+ ret = FFA_RET_NOT_SUPPORTED;
+ goto out_handled;
+ }
+
+ switch (id) {
+ case FFA_MEM_SHARE:
+ case FFA_FN64_MEM_SHARE:
+ case FFA_MEM_LEND:
+ case FFA_FN64_MEM_LEND:
+ ret = FFA_RET_SUCCESS;
+ prop = 0; /* No support for dynamic buffers */
+ goto out_handled;
+ default:
+ return false;
+ }
+
+out_handled:
+ ffa_to_smccc_res_prop(res, ret, prop);
+ return true;
+}
+
+bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(u64, func_id, host_ctxt, 0);
+ struct arm_smccc_res res;
+
+ /*
+ * There's no way we can tell what a non-standard SMC call might
+ * be up to. Ideally, we would terminate these here and return
+ * an error to the host, but sadly devices make use of custom
+ * firmware calls for things like power management, debugging,
+ * RNG access and crash reporting.
+ *
+ * Given that the architecture requires us to trust EL3 anyway,
+ * we forward unrecognised calls on under the assumption that
+ * the firmware doesn't expose a mechanism to access arbitrary
+ * non-secure memory. Short of a per-device table of SMCs, this
+ * is the best we can do.
+ */
+ if (!is_ffa_call(func_id))
+ return false;
+
+ switch (func_id) {
+ case FFA_FEATURES:
+ if (!do_ffa_features(&res, host_ctxt))
+ return false;
+ goto out_handled;
+ /* Memory management */
+ case FFA_FN64_RXTX_MAP:
+ do_ffa_rxtx_map(&res, host_ctxt);
+ goto out_handled;
+ case FFA_RXTX_UNMAP:
+ do_ffa_rxtx_unmap(&res, host_ctxt);
+ goto out_handled;
+ case FFA_MEM_SHARE:
+ case FFA_FN64_MEM_SHARE:
+ do_ffa_mem_xfer(FFA_FN64_MEM_SHARE, &res, host_ctxt);
+ goto out_handled;
+ case FFA_MEM_RECLAIM:
+ do_ffa_mem_reclaim(&res, host_ctxt);
+ goto out_handled;
+ case FFA_MEM_LEND:
+ case FFA_FN64_MEM_LEND:
+ do_ffa_mem_xfer(FFA_FN64_MEM_LEND, &res, host_ctxt);
+ goto out_handled;
+ case FFA_MEM_FRAG_TX:
+ do_ffa_mem_frag_tx(&res, host_ctxt);
+ goto out_handled;
+ }
+
+ if (ffa_call_supported(func_id))
+ return false; /* Pass through */
+
+ ffa_to_smccc_error(&res, FFA_RET_NOT_SUPPORTED);
+out_handled:
+ ffa_set_retval(host_ctxt, &res);
+ return true;
+}
+
+int hyp_ffa_init(void *pages)
+{
+ struct arm_smccc_res res;
+ size_t min_rxtx_sz;
+ void *tx, *rx;
+
+ if (kvm_host_psci_config.smccc_version < ARM_SMCCC_VERSION_1_2)
+ return 0;
+
+ arm_smccc_1_1_smc(FFA_VERSION, FFA_VERSION_1_0, 0, 0, 0, 0, 0, 0, &res);
+ if (res.a0 == FFA_RET_NOT_SUPPORTED)
+ return 0;
+
+ if (res.a0 != FFA_VERSION_1_0)
+ return -EOPNOTSUPP;
+
+ arm_smccc_1_1_smc(FFA_ID_GET, 0, 0, 0, 0, 0, 0, 0, &res);
+ if (res.a0 != FFA_SUCCESS)
+ return -EOPNOTSUPP;
+
+ if (res.a2 != HOST_FFA_ID)
+ return -EINVAL;
+
+ arm_smccc_1_1_smc(FFA_FEATURES, FFA_FN64_RXTX_MAP,
+ 0, 0, 0, 0, 0, 0, &res);
+ if (res.a0 != FFA_SUCCESS)
+ return -EOPNOTSUPP;
+
+ switch (res.a2) {
+ case FFA_FEAT_RXTX_MIN_SZ_4K:
+ min_rxtx_sz = SZ_4K;
+ break;
+ case FFA_FEAT_RXTX_MIN_SZ_16K:
+ min_rxtx_sz = SZ_16K;
+ break;
+ case FFA_FEAT_RXTX_MIN_SZ_64K:
+ min_rxtx_sz = SZ_64K;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (min_rxtx_sz > PAGE_SIZE)
+ return -EOPNOTSUPP;
+
+ tx = pages;
+ pages += KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE;
+ rx = pages;
+ pages += KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE;
+
+ ffa_desc_buf = (struct kvm_ffa_descriptor_buffer) {
+ .buf = pages,
+ .len = PAGE_SIZE *
+ (hyp_ffa_proxy_pages() - (2 * KVM_FFA_MBOX_NR_PAGES)),
+ };
+
+ hyp_buffers = (struct kvm_ffa_buffers) {
+ .lock = __HYP_SPIN_LOCK_UNLOCKED,
+ .tx = tx,
+ .rx = rx,
+ };
+
+ host_buffers = (struct kvm_ffa_buffers) {
+ .lock = __HYP_SPIN_LOCK_UNLOCKED,
+ };
+
+ return 0;
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S
index b6c0188c4b35..c87c63133e10 100644
--- a/arch/arm64/kvm/hyp/nvhe/host.S
+++ b/arch/arm64/kvm/hyp/nvhe/host.S
@@ -10,6 +10,7 @@
#include <asm/kvm_arm.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_mmu.h>
+#include <asm/kvm_ptrauth.h>
.text
@@ -37,10 +38,43 @@ SYM_FUNC_START(__host_exit)
/* Save the host context pointer in x29 across the function call */
mov x29, x0
+
+#ifdef CONFIG_ARM64_PTR_AUTH_KERNEL
+alternative_if_not ARM64_HAS_ADDRESS_AUTH
+b __skip_pauth_save
+alternative_else_nop_endif
+
+alternative_if ARM64_KVM_PROTECTED_MODE
+ /* Save kernel ptrauth keys. */
+ add x18, x29, #CPU_APIAKEYLO_EL1
+ ptrauth_save_state x18, x19, x20
+
+ /* Use hyp keys. */
+ adr_this_cpu x18, kvm_hyp_ctxt, x19
+ add x18, x18, #CPU_APIAKEYLO_EL1
+ ptrauth_restore_state x18, x19, x20
+ isb
+alternative_else_nop_endif
+__skip_pauth_save:
+#endif /* CONFIG_ARM64_PTR_AUTH_KERNEL */
+
bl handle_trap
- /* Restore host regs x0-x17 */
__host_enter_restore_full:
+ /* Restore kernel keys. */
+#ifdef CONFIG_ARM64_PTR_AUTH_KERNEL
+alternative_if_not ARM64_HAS_ADDRESS_AUTH
+b __skip_pauth_restore
+alternative_else_nop_endif
+
+alternative_if ARM64_KVM_PROTECTED_MODE
+ add x18, x29, #CPU_APIAKEYLO_EL1
+ ptrauth_restore_state x18, x19, x20
+alternative_else_nop_endif
+__skip_pauth_restore:
+#endif /* CONFIG_ARM64_PTR_AUTH_KERNEL */
+
+ /* Restore host regs x0-x17 */
ldp x0, x1, [x29, #CPU_XREG_OFFSET(0)]
ldp x2, x3, [x29, #CPU_XREG_OFFSET(2)]
ldp x4, x5, [x29, #CPU_XREG_OFFSET(4)]
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
index a6d67c2bb5ae..90fade1b032e 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
@@ -83,9 +83,6 @@ SYM_CODE_END(__kvm_hyp_init)
* x0: struct kvm_nvhe_init_params PA
*/
SYM_CODE_START_LOCAL(___kvm_hyp_init)
- ldr x1, [x0, #NVHE_INIT_TPIDR_EL2]
- msr tpidr_el2, x1
-
ldr x1, [x0, #NVHE_INIT_STACK_HYP_VA]
mov sp, x1
@@ -95,6 +92,22 @@ SYM_CODE_START_LOCAL(___kvm_hyp_init)
ldr x1, [x0, #NVHE_INIT_HCR_EL2]
msr hcr_el2, x1
+ mov x2, #HCR_E2H
+ and x2, x1, x2
+ cbz x2, 1f
+
+ // hVHE: Replay the EL2 setup to account for the E2H bit
+ // TPIDR_EL2 is used to preserve x0 across the macro maze...
+ isb
+ msr tpidr_el2, x0
+ init_el2_state
+ finalise_el2_state
+ mrs x0, tpidr_el2
+
+1:
+ ldr x1, [x0, #NVHE_INIT_TPIDR_EL2]
+ msr tpidr_el2, x1
+
ldr x1, [x0, #NVHE_INIT_VTTBR]
msr vttbr_el2, x1
@@ -128,6 +141,13 @@ alternative_if ARM64_HAS_ADDRESS_AUTH
SCTLR_ELx_ENDA | SCTLR_ELx_ENDB)
orr x0, x0, x1
alternative_else_nop_endif
+
+#ifdef CONFIG_ARM64_BTI_KERNEL
+alternative_if ARM64_BTI
+ orr x0, x0, #SCTLR_EL2_BT
+alternative_else_nop_endif
+#endif /* CONFIG_ARM64_BTI_KERNEL */
+
msr sctlr_el2, x0
isb
@@ -184,6 +204,7 @@ SYM_CODE_START_LOCAL(__kvm_hyp_init_cpu)
/* Initialize EL2 CPU state to sane values. */
init_el2_state // Clobbers x0..x2
finalise_el2_state
+ __init_el2_nvhe_prepare_eret
/* Enable MMU, set vectors and stack. */
mov x0, x28
@@ -196,6 +217,11 @@ SYM_CODE_START_LOCAL(__kvm_hyp_init_cpu)
SYM_CODE_END(__kvm_hyp_init_cpu)
SYM_CODE_START(__kvm_handle_stub_hvc)
+ /*
+ * __kvm_handle_stub_hvc called from __host_hvc through branch instruction(br) so
+ * we need bti j at beginning.
+ */
+ bti j
cmp x0, #HVC_SOFT_RESTART
b.ne 1f
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
index 728e01d4536b..a169c619db60 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -13,6 +13,7 @@
#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>
+#include <nvhe/ffa.h>
#include <nvhe/mem_protect.h>
#include <nvhe/mm.h>
#include <nvhe/pkvm.h>
@@ -125,6 +126,15 @@ static void handle___kvm_tlb_flush_vmid_ipa(struct kvm_cpu_context *host_ctxt)
__kvm_tlb_flush_vmid_ipa(kern_hyp_va(mmu), ipa, level);
}
+static void handle___kvm_tlb_flush_vmid_ipa_nsh(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1);
+ DECLARE_REG(phys_addr_t, ipa, host_ctxt, 2);
+ DECLARE_REG(int, level, host_ctxt, 3);
+
+ __kvm_tlb_flush_vmid_ipa_nsh(kern_hyp_va(mmu), ipa, level);
+}
+
static void handle___kvm_tlb_flush_vmid(struct kvm_cpu_context *host_ctxt)
{
DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1);
@@ -315,6 +325,7 @@ static const hcall_t host_hcall[] = {
HANDLE_FUNC(__kvm_vcpu_run),
HANDLE_FUNC(__kvm_flush_vm_context),
HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa),
+ HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa_nsh),
HANDLE_FUNC(__kvm_tlb_flush_vmid),
HANDLE_FUNC(__kvm_flush_cpu_context),
HANDLE_FUNC(__kvm_timer_set_cntvoff),
@@ -374,6 +385,8 @@ static void handle_host_smc(struct kvm_cpu_context *host_ctxt)
handled = kvm_host_psci_handler(host_ctxt);
if (!handled)
+ handled = kvm_host_ffa_handler(host_ctxt);
+ if (!handled)
default_host_smc_handler(host_ctxt);
/* SMC was trapped, move ELR past the current PC. */
@@ -392,7 +405,11 @@ void handle_trap(struct kvm_cpu_context *host_ctxt)
handle_host_smc(host_ctxt);
break;
case ESR_ELx_EC_SVE:
- sysreg_clear_set(cptr_el2, CPTR_EL2_TZ, 0);
+ if (has_hvhe())
+ sysreg_clear_set(cpacr_el1, 0, (CPACR_EL1_ZEN_EL1EN |
+ CPACR_EL1_ZEN_EL0EN));
+ else
+ sysreg_clear_set(cptr_el2, CPTR_EL2_TZ, 0);
isb();
sve_cond_update_zcr_vq(ZCR_ELx_LEN_MASK, SYS_ZCR_EL2);
break;
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
index a8813b212996..9d703441278b 100644
--- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -91,9 +91,9 @@ static void host_s2_put_page(void *addr)
hyp_put_page(&host_s2_pool, addr);
}
-static void host_s2_free_removed_table(void *addr, u32 level)
+static void host_s2_free_unlinked_table(void *addr, u32 level)
{
- kvm_pgtable_stage2_free_removed(&host_mmu.mm_ops, addr, level);
+ kvm_pgtable_stage2_free_unlinked(&host_mmu.mm_ops, addr, level);
}
static int prepare_s2_pool(void *pgt_pool_base)
@@ -110,7 +110,7 @@ static int prepare_s2_pool(void *pgt_pool_base)
host_mmu.mm_ops = (struct kvm_pgtable_mm_ops) {
.zalloc_pages_exact = host_s2_zalloc_pages_exact,
.zalloc_page = host_s2_zalloc_page,
- .free_removed_table = host_s2_free_removed_table,
+ .free_unlinked_table = host_s2_free_unlinked_table,
.phys_to_virt = hyp_phys_to_virt,
.virt_to_phys = hyp_virt_to_phys,
.page_count = hyp_page_count,
@@ -842,6 +842,13 @@ static int check_share(struct pkvm_mem_share *share)
case PKVM_ID_HYP:
ret = hyp_ack_share(completer_addr, tx, share->completer_prot);
break;
+ case PKVM_ID_FFA:
+ /*
+ * We only check the host; the secure side will check the other
+ * end when we forward the FFA call.
+ */
+ ret = 0;
+ break;
default:
ret = -EINVAL;
}
@@ -870,6 +877,13 @@ static int __do_share(struct pkvm_mem_share *share)
case PKVM_ID_HYP:
ret = hyp_complete_share(completer_addr, tx, share->completer_prot);
break;
+ case PKVM_ID_FFA:
+ /*
+ * We're not responsible for any secure page-tables, so there's
+ * nothing to do here.
+ */
+ ret = 0;
+ break;
default:
ret = -EINVAL;
}
@@ -918,6 +932,10 @@ static int check_unshare(struct pkvm_mem_share *share)
case PKVM_ID_HYP:
ret = hyp_ack_unshare(completer_addr, tx);
break;
+ case PKVM_ID_FFA:
+ /* See check_share() */
+ ret = 0;
+ break;
default:
ret = -EINVAL;
}
@@ -946,6 +964,10 @@ static int __do_unshare(struct pkvm_mem_share *share)
case PKVM_ID_HYP:
ret = hyp_complete_unshare(completer_addr, tx);
break;
+ case PKVM_ID_FFA:
+ /* See __do_share() */
+ ret = 0;
+ break;
default:
ret = -EINVAL;
}
@@ -1235,3 +1257,49 @@ void hyp_unpin_shared_mem(void *from, void *to)
hyp_unlock_component();
host_unlock_component();
}
+
+int __pkvm_host_share_ffa(u64 pfn, u64 nr_pages)
+{
+ int ret;
+ struct pkvm_mem_share share = {
+ .tx = {
+ .nr_pages = nr_pages,
+ .initiator = {
+ .id = PKVM_ID_HOST,
+ .addr = hyp_pfn_to_phys(pfn),
+ },
+ .completer = {
+ .id = PKVM_ID_FFA,
+ },
+ },
+ };
+
+ host_lock_component();
+ ret = do_share(&share);
+ host_unlock_component();
+
+ return ret;
+}
+
+int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages)
+{
+ int ret;
+ struct pkvm_mem_share share = {
+ .tx = {
+ .nr_pages = nr_pages,
+ .initiator = {
+ .id = PKVM_ID_HOST,
+ .addr = hyp_pfn_to_phys(pfn),
+ },
+ .completer = {
+ .id = PKVM_ID_FFA,
+ },
+ },
+ };
+
+ host_lock_component();
+ ret = do_unshare(&share);
+ host_unlock_component();
+
+ return ret;
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c
index a06ece14a6d8..8033ef353a5d 100644
--- a/arch/arm64/kvm/hyp/nvhe/pkvm.c
+++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c
@@ -27,6 +27,7 @@ static void pvm_init_traps_aa64pfr0(struct kvm_vcpu *vcpu)
u64 hcr_set = HCR_RW;
u64 hcr_clear = 0;
u64 cptr_set = 0;
+ u64 cptr_clear = 0;
/* Protected KVM does not support AArch32 guests. */
BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL0),
@@ -43,6 +44,9 @@ static void pvm_init_traps_aa64pfr0(struct kvm_vcpu *vcpu)
BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AdvSIMD),
PVM_ID_AA64PFR0_ALLOW));
+ if (has_hvhe())
+ hcr_set |= HCR_E2H;
+
/* Trap RAS unless all current versions are supported */
if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_RAS), feature_ids) <
ID_AA64PFR0_EL1_RAS_V1P1) {
@@ -57,12 +61,17 @@ static void pvm_init_traps_aa64pfr0(struct kvm_vcpu *vcpu)
}
/* Trap SVE */
- if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE), feature_ids))
- cptr_set |= CPTR_EL2_TZ;
+ if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE), feature_ids)) {
+ if (has_hvhe())
+ cptr_clear |= CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN;
+ else
+ cptr_set |= CPTR_EL2_TZ;
+ }
vcpu->arch.hcr_el2 |= hcr_set;
vcpu->arch.hcr_el2 &= ~hcr_clear;
vcpu->arch.cptr_el2 |= cptr_set;
+ vcpu->arch.cptr_el2 &= ~cptr_clear;
}
/*
@@ -120,8 +129,12 @@ static void pvm_init_traps_aa64dfr0(struct kvm_vcpu *vcpu)
mdcr_set |= MDCR_EL2_TTRF;
/* Trap Trace */
- if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_TraceVer), feature_ids))
- cptr_set |= CPTR_EL2_TTA;
+ if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_TraceVer), feature_ids)) {
+ if (has_hvhe())
+ cptr_set |= CPACR_EL1_TTA;
+ else
+ cptr_set |= CPTR_EL2_TTA;
+ }
vcpu->arch.mdcr_el2 |= mdcr_set;
vcpu->arch.mdcr_el2 &= ~mdcr_clear;
@@ -176,8 +189,10 @@ static void pvm_init_trap_regs(struct kvm_vcpu *vcpu)
/* Clear res0 and set res1 bits to trap potential new features. */
vcpu->arch.hcr_el2 &= ~(HCR_RES0);
vcpu->arch.mdcr_el2 &= ~(MDCR_EL2_RES0);
- vcpu->arch.cptr_el2 |= CPTR_NVHE_EL2_RES1;
- vcpu->arch.cptr_el2 &= ~(CPTR_NVHE_EL2_RES0);
+ if (!has_hvhe()) {
+ vcpu->arch.cptr_el2 |= CPTR_NVHE_EL2_RES1;
+ vcpu->arch.cptr_el2 &= ~(CPTR_NVHE_EL2_RES0);
+ }
}
/*
diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c
index 110f04627785..bb98630dfeaf 100644
--- a/arch/arm64/kvm/hyp/nvhe/setup.c
+++ b/arch/arm64/kvm/hyp/nvhe/setup.c
@@ -11,6 +11,7 @@
#include <asm/kvm_pkvm.h>
#include <nvhe/early_alloc.h>
+#include <nvhe/ffa.h>
#include <nvhe/fixed_config.h>
#include <nvhe/gfp.h>
#include <nvhe/memory.h>
@@ -28,6 +29,7 @@ static void *vmemmap_base;
static void *vm_table_base;
static void *hyp_pgt_base;
static void *host_s2_pgt_base;
+static void *ffa_proxy_pages;
static struct kvm_pgtable_mm_ops pkvm_pgtable_mm_ops;
static struct hyp_pool hpool;
@@ -57,6 +59,11 @@ static int divide_memory_pool(void *virt, unsigned long size)
if (!host_s2_pgt_base)
return -ENOMEM;
+ nr_pages = hyp_ffa_proxy_pages();
+ ffa_proxy_pages = hyp_early_alloc_contig(nr_pages);
+ if (!ffa_proxy_pages)
+ return -ENOMEM;
+
return 0;
}
@@ -314,6 +321,10 @@ void __noreturn __pkvm_init_finalise(void)
if (ret)
goto out;
+ ret = hyp_ffa_init(ffa_proxy_pages);
+ if (ret)
+ goto out;
+
pkvm_hyp_vm_table_init(vm_table_base);
out:
/*
diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c
index 77791495c995..0a6271052def 100644
--- a/arch/arm64/kvm/hyp/nvhe/switch.c
+++ b/arch/arm64/kvm/hyp/nvhe/switch.c
@@ -44,13 +44,24 @@ static void __activate_traps(struct kvm_vcpu *vcpu)
__activate_traps_common(vcpu);
val = vcpu->arch.cptr_el2;
- val |= CPTR_EL2_TTA | CPTR_EL2_TAM;
+ val |= CPTR_EL2_TAM; /* Same bit irrespective of E2H */
+ val |= has_hvhe() ? CPACR_EL1_TTA : CPTR_EL2_TTA;
+ if (cpus_have_final_cap(ARM64_SME)) {
+ if (has_hvhe())
+ val &= ~(CPACR_EL1_SMEN_EL1EN | CPACR_EL1_SMEN_EL0EN);
+ else
+ val |= CPTR_EL2_TSM;
+ }
+
if (!guest_owns_fp_regs(vcpu)) {
- val |= CPTR_EL2_TFP | CPTR_EL2_TZ;
+ if (has_hvhe())
+ val &= ~(CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN |
+ CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN);
+ else
+ val |= CPTR_EL2_TFP | CPTR_EL2_TZ;
+
__activate_traps_fpsimd32(vcpu);
}
- if (cpus_have_final_cap(ARM64_SME))
- val |= CPTR_EL2_TSM;
write_sysreg(val, cptr_el2);
write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el2);
@@ -73,7 +84,6 @@ static void __activate_traps(struct kvm_vcpu *vcpu)
static void __deactivate_traps(struct kvm_vcpu *vcpu)
{
extern char __kvm_hyp_host_vector[];
- u64 cptr;
___deactivate_traps(vcpu);
@@ -98,13 +108,7 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu)
write_sysreg(this_cpu_ptr(&kvm_init_params)->hcr_el2, hcr_el2);
- cptr = CPTR_EL2_DEFAULT;
- if (vcpu_has_sve(vcpu) && (vcpu->arch.fp_state == FP_STATE_GUEST_OWNED))
- cptr |= CPTR_EL2_TZ;
- if (cpus_have_final_cap(ARM64_SME))
- cptr &= ~CPTR_EL2_TSM;
-
- write_sysreg(cptr, cptr_el2);
+ kvm_reset_cptr_el2(vcpu);
write_sysreg(__kvm_hyp_host_vector, vbar_el2);
}
diff --git a/arch/arm64/kvm/hyp/nvhe/timer-sr.c b/arch/arm64/kvm/hyp/nvhe/timer-sr.c
index b185ac0dbd47..3aaab20ae5b4 100644
--- a/arch/arm64/kvm/hyp/nvhe/timer-sr.c
+++ b/arch/arm64/kvm/hyp/nvhe/timer-sr.c
@@ -17,21 +17,24 @@ void __kvm_timer_set_cntvoff(u64 cntvoff)
}
/*
- * Should only be called on non-VHE systems.
+ * Should only be called on non-VHE or hVHE setups.
* VHE systems use EL2 timers and configure EL1 timers in kvm_timer_init_vhe().
*/
void __timer_disable_traps(struct kvm_vcpu *vcpu)
{
- u64 val;
+ u64 val, shift = 0;
+
+ if (has_hvhe())
+ shift = 10;
/* Allow physical timer/counter access for the host */
val = read_sysreg(cnthctl_el2);
- val |= CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN;
+ val |= (CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN) << shift;
write_sysreg(val, cnthctl_el2);
}
/*
- * Should only be called on non-VHE systems.
+ * Should only be called on non-VHE or hVHE setups.
* VHE systems use EL2 timers and configure EL1 timers in kvm_timer_init_vhe().
*/
void __timer_enable_traps(struct kvm_vcpu *vcpu)
@@ -50,5 +53,10 @@ void __timer_enable_traps(struct kvm_vcpu *vcpu)
else
clr |= CNTHCTL_EL1PCTEN;
+ if (has_hvhe()) {
+ clr <<= 10;
+ set <<= 10;
+ }
+
sysreg_clear_set(cnthctl_el2, clr, set);
}
diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c
index 978179133f4b..b9991bbd8e3f 100644
--- a/arch/arm64/kvm/hyp/nvhe/tlb.c
+++ b/arch/arm64/kvm/hyp/nvhe/tlb.c
@@ -130,6 +130,58 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu,
__tlb_switch_to_host(&cxt);
}
+void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu,
+ phys_addr_t ipa, int level)
+{
+ struct tlb_inv_context cxt;
+
+ /* Switch to requested VMID */
+ __tlb_switch_to_guest(mmu, &cxt, true);
+
+ /*
+ * We could do so much better if we had the VA as well.
+ * Instead, we invalidate Stage-2 for this IPA, and the
+ * whole of Stage-1. Weep...
+ */
+ ipa >>= 12;
+ __tlbi_level(ipas2e1, ipa, level);
+
+ /*
+ * We have to ensure completion of the invalidation at Stage-2,
+ * since a table walk on another CPU could refill a TLB with a
+ * complete (S1 + S2) walk based on the old Stage-2 mapping if
+ * the Stage-1 invalidation happened first.
+ */
+ dsb(nsh);
+ __tlbi(vmalle1);
+ dsb(nsh);
+ isb();
+
+ /*
+ * If the host is running at EL1 and we have a VPIPT I-cache,
+ * then we must perform I-cache maintenance at EL2 in order for
+ * it to have an effect on the guest. Since the guest cannot hit
+ * I-cache lines allocated with a different VMID, we don't need
+ * to worry about junk out of guest reset (we nuke the I-cache on
+ * VMID rollover), but we do need to be careful when remapping
+ * executable pages for the same guest. This can happen when KSM
+ * takes a CoW fault on an executable page, copies the page into
+ * a page that was previously mapped in the guest and then needs
+ * to invalidate the guest view of the I-cache for that page
+ * from EL1. To solve this, we invalidate the entire I-cache when
+ * unmapping a page from a guest if we have a VPIPT I-cache but
+ * the host is running at EL1. As above, we could do better if
+ * we had the VA.
+ *
+ * The moral of this story is: if you have a VPIPT I-cache, then
+ * you should be running with VHE enabled.
+ */
+ if (icache_is_vpipt())
+ icache_inval_all_pou();
+
+ __tlb_switch_to_host(&cxt);
+}
+
void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu)
{
struct tlb_inv_context cxt;
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 95dae02ccc2e..aa740a974e02 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -21,8 +21,10 @@
#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2)
#define KVM_PTE_LEAF_ATTR_LO_S1_AP GENMASK(7, 6)
-#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO 3
-#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW 1
+#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO \
+ ({ cpus_have_final_cap(ARM64_KVM_HVHE) ? 2 : 3; })
+#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW \
+ ({ cpus_have_final_cap(ARM64_KVM_HVHE) ? 0 : 1; })
#define KVM_PTE_LEAF_ATTR_LO_S1_SH GENMASK(9, 8)
#define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS 3
#define KVM_PTE_LEAF_ATTR_LO_S1_AF BIT(10)
@@ -34,7 +36,7 @@
#define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS 3
#define KVM_PTE_LEAF_ATTR_LO_S2_AF BIT(10)
-#define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 51)
+#define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 50)
#define KVM_PTE_LEAF_ATTR_HI_SW GENMASK(58, 55)
@@ -42,6 +44,8 @@
#define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54)
+#define KVM_PTE_LEAF_ATTR_HI_S1_GP BIT(50)
+
#define KVM_PTE_LEAF_ATTR_S2_PERMS (KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \
KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \
KVM_PTE_LEAF_ATTR_HI_S2_XN)
@@ -63,6 +67,16 @@ struct kvm_pgtable_walk_data {
const u64 end;
};
+static bool kvm_pgtable_walk_skip_bbm_tlbi(const struct kvm_pgtable_visit_ctx *ctx)
+{
+ return unlikely(ctx->flags & KVM_PGTABLE_WALK_SKIP_BBM_TLBI);
+}
+
+static bool kvm_pgtable_walk_skip_cmo(const struct kvm_pgtable_visit_ctx *ctx)
+{
+ return unlikely(ctx->flags & KVM_PGTABLE_WALK_SKIP_CMO);
+}
+
static bool kvm_phys_is_valid(u64 phys)
{
return phys < BIT(id_aa64mmfr0_parange_to_phys_shift(ID_AA64MMFR0_EL1_PARANGE_MAX));
@@ -386,6 +400,9 @@ static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, kvm_pte_t *ptep)
if (device)
return -EINVAL;
+
+ if (IS_ENABLED(CONFIG_ARM64_BTI_KERNEL) && system_supports_bti())
+ attr |= KVM_PTE_LEAF_ATTR_HI_S1_GP;
} else {
attr |= KVM_PTE_LEAF_ATTR_HI_S1_XN;
}
@@ -623,10 +640,18 @@ u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift)
#ifdef CONFIG_ARM64_HW_AFDBM
/*
* Enable the Hardware Access Flag management, unconditionally
- * on all CPUs. The features is RES0 on CPUs without the support
- * and must be ignored by the CPUs.
+ * on all CPUs. In systems that have asymmetric support for the feature
+ * this allows KVM to leverage hardware support on the subset of cores
+ * that implement the feature.
+ *
+ * The architecture requires VTCR_EL2.HA to be RES0 (thus ignored by
+ * hardware) on implementations that do not advertise support for the
+ * feature. As such, setting HA unconditionally is safe, unless you
+ * happen to be running on a design that has unadvertised support for
+ * HAFDBS. Here be dragons.
*/
- vtcr |= VTCR_EL2_HA;
+ if (!cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38))
+ vtcr |= VTCR_EL2_HA;
#endif /* CONFIG_ARM64_HW_AFDBM */
/* Set the vmid bits */
@@ -755,14 +780,17 @@ static bool stage2_try_break_pte(const struct kvm_pgtable_visit_ctx *ctx,
if (!stage2_try_set_pte(ctx, KVM_INVALID_PTE_LOCKED))
return false;
- /*
- * Perform the appropriate TLB invalidation based on the evicted pte
- * value (if any).
- */
- if (kvm_pte_table(ctx->old, ctx->level))
- kvm_call_hyp(__kvm_tlb_flush_vmid, mmu);
- else if (kvm_pte_valid(ctx->old))
- kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, ctx->level);
+ if (!kvm_pgtable_walk_skip_bbm_tlbi(ctx)) {
+ /*
+ * Perform the appropriate TLB invalidation based on the
+ * evicted pte value (if any).
+ */
+ if (kvm_pte_table(ctx->old, ctx->level))
+ kvm_call_hyp(__kvm_tlb_flush_vmid, mmu);
+ else if (kvm_pte_valid(ctx->old))
+ kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu,
+ ctx->addr, ctx->level);
+ }
if (stage2_pte_is_counted(ctx->old))
mm_ops->put_page(ctx->ptep);
@@ -869,11 +897,13 @@ static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx,
return -EAGAIN;
/* Perform CMOs before installation of the guest stage-2 PTE */
- if (mm_ops->dcache_clean_inval_poc && stage2_pte_cacheable(pgt, new))
+ if (!kvm_pgtable_walk_skip_cmo(ctx) && mm_ops->dcache_clean_inval_poc &&
+ stage2_pte_cacheable(pgt, new))
mm_ops->dcache_clean_inval_poc(kvm_pte_follow(new, mm_ops),
- granule);
+ granule);
- if (mm_ops->icache_inval_pou && stage2_pte_executable(new))
+ if (!kvm_pgtable_walk_skip_cmo(ctx) && mm_ops->icache_inval_pou &&
+ stage2_pte_executable(new))
mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), granule);
stage2_make_pte(ctx, new);
@@ -895,7 +925,7 @@ static int stage2_map_walk_table_pre(const struct kvm_pgtable_visit_ctx *ctx,
if (ret)
return ret;
- mm_ops->free_removed_table(childp, ctx->level);
+ mm_ops->free_unlinked_table(childp, ctx->level);
return 0;
}
@@ -940,7 +970,7 @@ static int stage2_map_walk_leaf(const struct kvm_pgtable_visit_ctx *ctx,
* The TABLE_PRE callback runs for table entries on the way down, looking
* for table entries which we could conceivably replace with a block entry
* for this mapping. If it finds one it replaces the entry and calls
- * kvm_pgtable_mm_ops::free_removed_table() to tear down the detached table.
+ * kvm_pgtable_mm_ops::free_unlinked_table() to tear down the detached table.
*
* Otherwise, the LEAF callback performs the mapping at the existing leaves
* instead.
@@ -1209,7 +1239,7 @@ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
KVM_PGTABLE_WALK_HANDLE_FAULT |
KVM_PGTABLE_WALK_SHARED);
if (!ret)
- kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, pgt->mmu, addr, level);
+ kvm_call_hyp(__kvm_tlb_flush_vmid_ipa_nsh, pgt->mmu, addr, level);
return ret;
}
@@ -1242,6 +1272,162 @@ int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size)
return kvm_pgtable_walk(pgt, addr, size, &walker);
}
+kvm_pte_t *kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt,
+ u64 phys, u32 level,
+ enum kvm_pgtable_prot prot,
+ void *mc, bool force_pte)
+{
+ struct stage2_map_data map_data = {
+ .phys = phys,
+ .mmu = pgt->mmu,
+ .memcache = mc,
+ .force_pte = force_pte,
+ };
+ struct kvm_pgtable_walker walker = {
+ .cb = stage2_map_walker,
+ .flags = KVM_PGTABLE_WALK_LEAF |
+ KVM_PGTABLE_WALK_SKIP_BBM_TLBI |
+ KVM_PGTABLE_WALK_SKIP_CMO,
+ .arg = &map_data,
+ };
+ /*
+ * The input address (.addr) is irrelevant for walking an
+ * unlinked table. Construct an ambiguous IA range to map
+ * kvm_granule_size(level) worth of memory.
+ */
+ struct kvm_pgtable_walk_data data = {
+ .walker = &walker,
+ .addr = 0,
+ .end = kvm_granule_size(level),
+ };
+ struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops;
+ kvm_pte_t *pgtable;
+ int ret;
+
+ if (!IS_ALIGNED(phys, kvm_granule_size(level)))
+ return ERR_PTR(-EINVAL);
+
+ ret = stage2_set_prot_attr(pgt, prot, &map_data.attr);
+ if (ret)
+ return ERR_PTR(ret);
+
+ pgtable = mm_ops->zalloc_page(mc);
+ if (!pgtable)
+ return ERR_PTR(-ENOMEM);
+
+ ret = __kvm_pgtable_walk(&data, mm_ops, (kvm_pteref_t)pgtable,
+ level + 1);
+ if (ret) {
+ kvm_pgtable_stage2_free_unlinked(mm_ops, pgtable, level);
+ mm_ops->put_page(pgtable);
+ return ERR_PTR(ret);
+ }
+
+ return pgtable;
+}
+
+/*
+ * Get the number of page-tables needed to replace a block with a
+ * fully populated tree up to the PTE entries. Note that @level is
+ * interpreted as in "level @level entry".
+ */
+static int stage2_block_get_nr_page_tables(u32 level)
+{
+ switch (level) {
+ case 1:
+ return PTRS_PER_PTE + 1;
+ case 2:
+ return 1;
+ case 3:
+ return 0;
+ default:
+ WARN_ON_ONCE(level < KVM_PGTABLE_MIN_BLOCK_LEVEL ||
+ level >= KVM_PGTABLE_MAX_LEVELS);
+ return -EINVAL;
+ };
+}
+
+static int stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx,
+ enum kvm_pgtable_walk_flags visit)
+{
+ struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
+ struct kvm_mmu_memory_cache *mc = ctx->arg;
+ struct kvm_s2_mmu *mmu;
+ kvm_pte_t pte = ctx->old, new, *childp;
+ enum kvm_pgtable_prot prot;
+ u32 level = ctx->level;
+ bool force_pte;
+ int nr_pages;
+ u64 phys;
+
+ /* No huge-pages exist at the last level */
+ if (level == KVM_PGTABLE_MAX_LEVELS - 1)
+ return 0;
+
+ /* We only split valid block mappings */
+ if (!kvm_pte_valid(pte))
+ return 0;
+
+ nr_pages = stage2_block_get_nr_page_tables(level);
+ if (nr_pages < 0)
+ return nr_pages;
+
+ if (mc->nobjs >= nr_pages) {
+ /* Build a tree mapped down to the PTE granularity. */
+ force_pte = true;
+ } else {
+ /*
+ * Don't force PTEs, so create_unlinked() below does
+ * not populate the tree up to the PTE level. The
+ * consequence is that the call will require a single
+ * page of level 2 entries at level 1, or a single
+ * page of PTEs at level 2. If we are at level 1, the
+ * PTEs will be created recursively.
+ */
+ force_pte = false;
+ nr_pages = 1;
+ }
+
+ if (mc->nobjs < nr_pages)
+ return -ENOMEM;
+
+ mmu = container_of(mc, struct kvm_s2_mmu, split_page_cache);
+ phys = kvm_pte_to_phys(pte);
+ prot = kvm_pgtable_stage2_pte_prot(pte);
+
+ childp = kvm_pgtable_stage2_create_unlinked(mmu->pgt, phys,
+ level, prot, mc, force_pte);
+ if (IS_ERR(childp))
+ return PTR_ERR(childp);
+
+ if (!stage2_try_break_pte(ctx, mmu)) {
+ kvm_pgtable_stage2_free_unlinked(mm_ops, childp, level);
+ mm_ops->put_page(childp);
+ return -EAGAIN;
+ }
+
+ /*
+ * Note, the contents of the page table are guaranteed to be made
+ * visible before the new PTE is assigned because stage2_make_pte()
+ * writes the PTE using smp_store_release().
+ */
+ new = kvm_init_table_pte(childp, mm_ops);
+ stage2_make_pte(ctx, new);
+ dsb(ishst);
+ return 0;
+}
+
+int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
+ struct kvm_mmu_memory_cache *mc)
+{
+ struct kvm_pgtable_walker walker = {
+ .cb = stage2_split_walker,
+ .flags = KVM_PGTABLE_WALK_LEAF,
+ .arg = mc,
+ };
+
+ return kvm_pgtable_walk(pgt, addr, size, &walker);
+}
int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
struct kvm_pgtable_mm_ops *mm_ops,
@@ -1311,7 +1497,7 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
pgt->pgd = NULL;
}
-void kvm_pgtable_stage2_free_removed(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level)
+void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level)
{
kvm_pteref_t ptep = (kvm_pteref_t)pgtable;
struct kvm_pgtable_walker walker = {
diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c
index b37e7c96efea..6537f58b1a8c 100644
--- a/arch/arm64/kvm/hyp/vhe/switch.c
+++ b/arch/arm64/kvm/hyp/vhe/switch.c
@@ -84,7 +84,7 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu)
*/
asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT));
- write_sysreg(CPACR_EL1_DEFAULT, cpacr_el1);
+ kvm_reset_cptr_el2(vcpu);
if (!arm64_kernel_unmapped_at_el0())
host_vectors = __this_cpu_read(this_cpu_vector);
diff --git a/arch/arm64/kvm/hyp/vhe/tlb.c b/arch/arm64/kvm/hyp/vhe/tlb.c
index 24cef9b87f9e..e69da550cdc5 100644
--- a/arch/arm64/kvm/hyp/vhe/tlb.c
+++ b/arch/arm64/kvm/hyp/vhe/tlb.c
@@ -111,6 +111,38 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu,
__tlb_switch_to_host(&cxt);
}
+void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu,
+ phys_addr_t ipa, int level)
+{
+ struct tlb_inv_context cxt;
+
+ dsb(nshst);
+
+ /* Switch to requested VMID */
+ __tlb_switch_to_guest(mmu, &cxt);
+
+ /*
+ * We could do so much better if we had the VA as well.
+ * Instead, we invalidate Stage-2 for this IPA, and the
+ * whole of Stage-1. Weep...
+ */
+ ipa >>= 12;
+ __tlbi_level(ipas2e1, ipa, level);
+
+ /*
+ * We have to ensure completion of the invalidation at Stage-2,
+ * since a table walk on another CPU could refill a TLB with a
+ * complete (S1 + S2) walk based on the old Stage-2 mapping if
+ * the Stage-1 invalidation happened first.
+ */
+ dsb(nsh);
+ __tlbi(vmalle1);
+ dsb(nsh);
+ isb();
+
+ __tlb_switch_to_host(&cxt);
+}
+
void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu)
{
struct tlb_inv_context cxt;
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 3b9d4d24c361..6db9ef288ec3 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -31,14 +31,21 @@ static phys_addr_t __ro_after_init hyp_idmap_vector;
static unsigned long __ro_after_init io_map_base;
-static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end)
+static phys_addr_t __stage2_range_addr_end(phys_addr_t addr, phys_addr_t end,
+ phys_addr_t size)
{
- phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL);
phys_addr_t boundary = ALIGN_DOWN(addr + size, size);
return (boundary - 1 < end - 1) ? boundary : end;
}
+static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end)
+{
+ phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL);
+
+ return __stage2_range_addr_end(addr, end, size);
+}
+
/*
* Release kvm_mmu_lock periodically if the memory region is large. Otherwise,
* we may see kernel panics with CONFIG_DETECT_HUNG_TASK,
@@ -75,6 +82,79 @@ static int stage2_apply_range(struct kvm_s2_mmu *mmu, phys_addr_t addr,
#define stage2_apply_range_resched(mmu, addr, end, fn) \
stage2_apply_range(mmu, addr, end, fn, true)
+/*
+ * Get the maximum number of page-tables pages needed to split a range
+ * of blocks into PAGE_SIZE PTEs. It assumes the range is already
+ * mapped at level 2, or at level 1 if allowed.
+ */
+static int kvm_mmu_split_nr_page_tables(u64 range)
+{
+ int n = 0;
+
+ if (KVM_PGTABLE_MIN_BLOCK_LEVEL < 2)
+ n += DIV_ROUND_UP(range, PUD_SIZE);
+ n += DIV_ROUND_UP(range, PMD_SIZE);
+ return n;
+}
+
+static bool need_split_memcache_topup_or_resched(struct kvm *kvm)
+{
+ struct kvm_mmu_memory_cache *cache;
+ u64 chunk_size, min;
+
+ if (need_resched() || rwlock_needbreak(&kvm->mmu_lock))
+ return true;
+
+ chunk_size = kvm->arch.mmu.split_page_chunk_size;
+ min = kvm_mmu_split_nr_page_tables(chunk_size);
+ cache = &kvm->arch.mmu.split_page_cache;
+ return kvm_mmu_memory_cache_nr_free_objects(cache) < min;
+}
+
+static int kvm_mmu_split_huge_pages(struct kvm *kvm, phys_addr_t addr,
+ phys_addr_t end)
+{
+ struct kvm_mmu_memory_cache *cache;
+ struct kvm_pgtable *pgt;
+ int ret, cache_capacity;
+ u64 next, chunk_size;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ chunk_size = kvm->arch.mmu.split_page_chunk_size;
+ cache_capacity = kvm_mmu_split_nr_page_tables(chunk_size);
+
+ if (chunk_size == 0)
+ return 0;
+
+ cache = &kvm->arch.mmu.split_page_cache;
+
+ do {
+ if (need_split_memcache_topup_or_resched(kvm)) {
+ write_unlock(&kvm->mmu_lock);
+ cond_resched();
+ /* Eager page splitting is best-effort. */
+ ret = __kvm_mmu_topup_memory_cache(cache,
+ cache_capacity,
+ cache_capacity);
+ write_lock(&kvm->mmu_lock);
+ if (ret)
+ break;
+ }
+
+ pgt = kvm->arch.mmu.pgt;
+ if (!pgt)
+ return -EINVAL;
+
+ next = __stage2_range_addr_end(addr, end, chunk_size);
+ ret = kvm_pgtable_stage2_split(pgt, addr, next - addr, cache);
+ if (ret)
+ break;
+ } while (addr = next, addr != end);
+
+ return ret;
+}
+
static bool memslot_is_logging(struct kvm_memory_slot *memslot)
{
return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
@@ -131,21 +211,21 @@ static void kvm_s2_free_pages_exact(void *virt, size_t size)
static struct kvm_pgtable_mm_ops kvm_s2_mm_ops;
-static void stage2_free_removed_table_rcu_cb(struct rcu_head *head)
+static void stage2_free_unlinked_table_rcu_cb(struct rcu_head *head)
{
struct page *page = container_of(head, struct page, rcu_head);
void *pgtable = page_to_virt(page);
u32 level = page_private(page);
- kvm_pgtable_stage2_free_removed(&kvm_s2_mm_ops, pgtable, level);
+ kvm_pgtable_stage2_free_unlinked(&kvm_s2_mm_ops, pgtable, level);
}
-static void stage2_free_removed_table(void *addr, u32 level)
+static void stage2_free_unlinked_table(void *addr, u32 level)
{
struct page *page = virt_to_page(addr);
set_page_private(page, (unsigned long)level);
- call_rcu(&page->rcu_head, stage2_free_removed_table_rcu_cb);
+ call_rcu(&page->rcu_head, stage2_free_unlinked_table_rcu_cb);
}
static void kvm_host_get_page(void *addr)
@@ -701,7 +781,7 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
.zalloc_page = stage2_memcache_zalloc_page,
.zalloc_pages_exact = kvm_s2_zalloc_pages_exact,
.free_pages_exact = kvm_s2_free_pages_exact,
- .free_removed_table = stage2_free_removed_table,
+ .free_unlinked_table = stage2_free_unlinked_table,
.get_page = kvm_host_get_page,
.put_page = kvm_s2_put_page,
.page_count = kvm_host_page_count,
@@ -775,6 +855,10 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t
for_each_possible_cpu(cpu)
*per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1;
+ /* The eager page splitting is disabled by default */
+ mmu->split_page_chunk_size = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT;
+ mmu->split_page_cache.gfp_zero = __GFP_ZERO;
+
mmu->pgt = pgt;
mmu->pgd_phys = __pa(pgt->pgd);
return 0;
@@ -786,6 +870,12 @@ out_free_pgtable:
return err;
}
+void kvm_uninit_stage2_mmu(struct kvm *kvm)
+{
+ kvm_free_stage2_pgd(&kvm->arch.mmu);
+ kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache);
+}
+
static void stage2_unmap_memslot(struct kvm *kvm,
struct kvm_memory_slot *memslot)
{
@@ -989,39 +1079,66 @@ static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
}
/**
- * kvm_mmu_write_protect_pt_masked() - write protect dirty pages
+ * kvm_mmu_split_memory_region() - split the stage 2 blocks into PAGE_SIZE
+ * pages for memory slot
* @kvm: The KVM pointer
- * @slot: The memory slot associated with mask
- * @gfn_offset: The gfn offset in memory slot
- * @mask: The mask of dirty pages at offset 'gfn_offset' in this memory
- * slot to be write protected
+ * @slot: The memory slot to split
*
- * Walks bits set in mask write protects the associated pte's. Caller must
- * acquire kvm_mmu_lock.
+ * Acquires kvm->mmu_lock. Called with kvm->slots_lock mutex acquired,
+ * serializing operations for VM memory regions.
*/
-static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
- struct kvm_memory_slot *slot,
- gfn_t gfn_offset, unsigned long mask)
+static void kvm_mmu_split_memory_region(struct kvm *kvm, int slot)
{
- phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
- phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT;
- phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *memslot;
+ phys_addr_t start, end;
- stage2_wp_range(&kvm->arch.mmu, start, end);
+ lockdep_assert_held(&kvm->slots_lock);
+
+ slots = kvm_memslots(kvm);
+ memslot = id_to_memslot(slots, slot);
+
+ start = memslot->base_gfn << PAGE_SHIFT;
+ end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
+
+ write_lock(&kvm->mmu_lock);
+ kvm_mmu_split_huge_pages(kvm, start, end);
+ write_unlock(&kvm->mmu_lock);
}
/*
- * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
- * dirty pages.
+ * kvm_arch_mmu_enable_log_dirty_pt_masked() - enable dirty logging for selected pages.
+ * @kvm: The KVM pointer
+ * @slot: The memory slot associated with mask
+ * @gfn_offset: The gfn offset in memory slot
+ * @mask: The mask of pages at offset 'gfn_offset' in this memory
+ * slot to enable dirty logging on
*
- * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
- * enable dirty logging for them.
+ * Writes protect selected pages to enable dirty logging, and then
+ * splits them to PAGE_SIZE. Caller must acquire kvm->mmu_lock.
*/
void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn_offset, unsigned long mask)
{
- kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
+ phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
+ phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT;
+ phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ stage2_wp_range(&kvm->arch.mmu, start, end);
+
+ /*
+ * Eager-splitting is done when manual-protect is set. We
+ * also check for initially-all-set because we can avoid
+ * eager-splitting if initially-all-set is false.
+ * Initially-all-set equal false implies that huge-pages were
+ * already split when enabling dirty logging: no need to do it
+ * again.
+ */
+ if (kvm_dirty_log_manual_protect_and_init_set(kvm))
+ kvm_mmu_split_huge_pages(kvm, start, end);
}
static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
@@ -1790,20 +1907,42 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
const struct kvm_memory_slot *new,
enum kvm_mr_change change)
{
+ bool log_dirty_pages = new && new->flags & KVM_MEM_LOG_DIRTY_PAGES;
+
/*
* At this point memslot has been committed and there is an
* allocated dirty_bitmap[], dirty pages will be tracked while the
* memory slot is write protected.
*/
- if (change != KVM_MR_DELETE && new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
+ if (log_dirty_pages) {
+
+ if (change == KVM_MR_DELETE)
+ return;
+
/*
- * If we're with initial-all-set, we don't need to write
- * protect any pages because they're all reported as dirty.
- * Huge pages and normal pages will be write protect gradually.
+ * Huge and normal pages are write-protected and split
+ * on either of these two cases:
+ *
+ * 1. with initial-all-set: gradually with CLEAR ioctls,
*/
- if (!kvm_dirty_log_manual_protect_and_init_set(kvm)) {
- kvm_mmu_wp_memory_region(kvm, new->id);
- }
+ if (kvm_dirty_log_manual_protect_and_init_set(kvm))
+ return;
+ /*
+ * or
+ * 2. without initial-all-set: all in one shot when
+ * enabling dirty logging.
+ */
+ kvm_mmu_wp_memory_region(kvm, new->id);
+ kvm_mmu_split_memory_region(kvm, new->id);
+ } else {
+ /*
+ * Free any leftovers from the eager page splitting cache. Do
+ * this when deleting, moving, disabling dirty logging, or
+ * creating the memslot (a nop). Doing it for deletes makes
+ * sure we don't leak memory, and there's no need to keep the
+ * cache around for any of the other cases.
+ */
+ kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache);
}
}
@@ -1877,7 +2016,7 @@ void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
void kvm_arch_flush_shadow_all(struct kvm *kvm)
{
- kvm_free_stage2_pgd(&kvm->arch.mmu);
+ kvm_uninit_stage2_mmu(kvm);
}
void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c
index 6e9ece1ebbe7..994a494703c3 100644
--- a/arch/arm64/kvm/pkvm.c
+++ b/arch/arm64/kvm/pkvm.c
@@ -78,6 +78,7 @@ void __init kvm_hyp_reserve(void)
hyp_mem_pages += host_s2_pgtable_pages();
hyp_mem_pages += hyp_vm_table_pages();
hyp_mem_pages += hyp_vmemmap_pages(STRUCT_HYP_PAGE_SIZE);
+ hyp_mem_pages += hyp_ffa_proxy_pages();
/*
* Try to allocate a PMD-aligned region to reduce TLB pressure once
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index b5dee8e57e77..bc8556b6f459 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -187,57 +187,6 @@ static int kvm_vcpu_enable_ptrauth(struct kvm_vcpu *vcpu)
}
/**
- * kvm_set_vm_width() - set the register width for the guest
- * @vcpu: Pointer to the vcpu being configured
- *
- * Set both KVM_ARCH_FLAG_EL1_32BIT and KVM_ARCH_FLAG_REG_WIDTH_CONFIGURED
- * in the VM flags based on the vcpu's requested register width, the HW
- * capabilities and other options (such as MTE).
- * When REG_WIDTH_CONFIGURED is already set, the vcpu settings must be
- * consistent with the value of the FLAG_EL1_32BIT bit in the flags.
- *
- * Return: 0 on success, negative error code on failure.
- */
-static int kvm_set_vm_width(struct kvm_vcpu *vcpu)
-{
- struct kvm *kvm = vcpu->kvm;
- bool is32bit;
-
- is32bit = vcpu_has_feature(vcpu, KVM_ARM_VCPU_EL1_32BIT);
-
- lockdep_assert_held(&kvm->arch.config_lock);
-
- if (test_bit(KVM_ARCH_FLAG_REG_WIDTH_CONFIGURED, &kvm->arch.flags)) {
- /*
- * The guest's register width is already configured.
- * Make sure that the vcpu is consistent with it.
- */
- if (is32bit == test_bit(KVM_ARCH_FLAG_EL1_32BIT, &kvm->arch.flags))
- return 0;
-
- return -EINVAL;
- }
-
- if (!cpus_have_const_cap(ARM64_HAS_32BIT_EL1) && is32bit)
- return -EINVAL;
-
- /* MTE is incompatible with AArch32 */
- if (kvm_has_mte(kvm) && is32bit)
- return -EINVAL;
-
- /* NV is incompatible with AArch32 */
- if (vcpu_has_nv(vcpu) && is32bit)
- return -EINVAL;
-
- if (is32bit)
- set_bit(KVM_ARCH_FLAG_EL1_32BIT, &kvm->arch.flags);
-
- set_bit(KVM_ARCH_FLAG_REG_WIDTH_CONFIGURED, &kvm->arch.flags);
-
- return 0;
-}
-
-/**
* kvm_reset_vcpu - sets core registers and sys_regs to reset value
* @vcpu: The VCPU pointer
*
@@ -262,13 +211,6 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
bool loaded;
u32 pstate;
- mutex_lock(&vcpu->kvm->arch.config_lock);
- ret = kvm_set_vm_width(vcpu);
- mutex_unlock(&vcpu->kvm->arch.config_lock);
-
- if (ret)
- return ret;
-
spin_lock(&vcpu->arch.mp_state_lock);
reset_state = vcpu->arch.reset_state;
vcpu->arch.reset_state.reset = false;
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 753aa7418149..6ce28afde022 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -42,6 +42,8 @@
*/
static u64 sys_reg_to_index(const struct sys_reg_desc *reg);
+static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+ u64 val);
static bool read_from_write_only(struct kvm_vcpu *vcpu,
struct sys_reg_params *params,
@@ -553,10 +555,11 @@ static int get_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
return 0;
}
-static void reset_bvr(struct kvm_vcpu *vcpu,
+static u64 reset_bvr(struct kvm_vcpu *vcpu,
const struct sys_reg_desc *rd)
{
vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm] = rd->val;
+ return rd->val;
}
static bool trap_bcr(struct kvm_vcpu *vcpu,
@@ -589,10 +592,11 @@ static int get_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
return 0;
}
-static void reset_bcr(struct kvm_vcpu *vcpu,
+static u64 reset_bcr(struct kvm_vcpu *vcpu,
const struct sys_reg_desc *rd)
{
vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm] = rd->val;
+ return rd->val;
}
static bool trap_wvr(struct kvm_vcpu *vcpu,
@@ -626,10 +630,11 @@ static int get_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
return 0;
}
-static void reset_wvr(struct kvm_vcpu *vcpu,
+static u64 reset_wvr(struct kvm_vcpu *vcpu,
const struct sys_reg_desc *rd)
{
vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm] = rd->val;
+ return rd->val;
}
static bool trap_wcr(struct kvm_vcpu *vcpu,
@@ -662,25 +667,28 @@ static int get_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
return 0;
}
-static void reset_wcr(struct kvm_vcpu *vcpu,
+static u64 reset_wcr(struct kvm_vcpu *vcpu,
const struct sys_reg_desc *rd)
{
vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm] = rd->val;
+ return rd->val;
}
-static void reset_amair_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+static u64 reset_amair_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
u64 amair = read_sysreg(amair_el1);
vcpu_write_sys_reg(vcpu, amair, AMAIR_EL1);
+ return amair;
}
-static void reset_actlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+static u64 reset_actlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
u64 actlr = read_sysreg(actlr_el1);
vcpu_write_sys_reg(vcpu, actlr, ACTLR_EL1);
+ return actlr;
}
-static void reset_mpidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+static u64 reset_mpidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
u64 mpidr;
@@ -694,7 +702,10 @@ static void reset_mpidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
mpidr = (vcpu->vcpu_id & 0x0f) << MPIDR_LEVEL_SHIFT(0);
mpidr |= ((vcpu->vcpu_id >> 4) & 0xff) << MPIDR_LEVEL_SHIFT(1);
mpidr |= ((vcpu->vcpu_id >> 12) & 0xff) << MPIDR_LEVEL_SHIFT(2);
- vcpu_write_sys_reg(vcpu, (1ULL << 31) | mpidr, MPIDR_EL1);
+ mpidr |= (1ULL << 31);
+ vcpu_write_sys_reg(vcpu, mpidr, MPIDR_EL1);
+
+ return mpidr;
}
static unsigned int pmu_visibility(const struct kvm_vcpu *vcpu,
@@ -706,13 +717,13 @@ static unsigned int pmu_visibility(const struct kvm_vcpu *vcpu,
return REG_HIDDEN;
}
-static void reset_pmu_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+static u64 reset_pmu_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
u64 n, mask = BIT(ARMV8_PMU_CYCLE_IDX);
/* No PMU available, any PMU reg may UNDEF... */
if (!kvm_arm_support_pmu_v3())
- return;
+ return 0;
n = read_sysreg(pmcr_el0) >> ARMV8_PMU_PMCR_N_SHIFT;
n &= ARMV8_PMU_PMCR_N_MASK;
@@ -721,33 +732,41 @@ static void reset_pmu_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
reset_unknown(vcpu, r);
__vcpu_sys_reg(vcpu, r->reg) &= mask;
+
+ return __vcpu_sys_reg(vcpu, r->reg);
}
-static void reset_pmevcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+static u64 reset_pmevcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
reset_unknown(vcpu, r);
__vcpu_sys_reg(vcpu, r->reg) &= GENMASK(31, 0);
+
+ return __vcpu_sys_reg(vcpu, r->reg);
}
-static void reset_pmevtyper(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+static u64 reset_pmevtyper(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
reset_unknown(vcpu, r);
__vcpu_sys_reg(vcpu, r->reg) &= ARMV8_PMU_EVTYPE_MASK;
+
+ return __vcpu_sys_reg(vcpu, r->reg);
}
-static void reset_pmselr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+static u64 reset_pmselr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
reset_unknown(vcpu, r);
__vcpu_sys_reg(vcpu, r->reg) &= ARMV8_PMU_COUNTER_MASK;
+
+ return __vcpu_sys_reg(vcpu, r->reg);
}
-static void reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+static u64 reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
u64 pmcr;
/* No PMU available, PMCR_EL0 may UNDEF... */
if (!kvm_arm_support_pmu_v3())
- return;
+ return 0;
/* Only preserve PMCR_EL0.N, and reset the rest to 0 */
pmcr = read_sysreg(pmcr_el0) & (ARMV8_PMU_PMCR_N_MASK << ARMV8_PMU_PMCR_N_SHIFT);
@@ -755,6 +774,8 @@ static void reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
pmcr |= ARMV8_PMU_PMCR_LC;
__vcpu_sys_reg(vcpu, r->reg) = pmcr;
+
+ return __vcpu_sys_reg(vcpu, r->reg);
}
static bool check_pmu_access_disabled(struct kvm_vcpu *vcpu, u64 flags)
@@ -1187,25 +1208,89 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu,
return true;
}
-static u8 vcpu_pmuver(const struct kvm_vcpu *vcpu)
+static s64 kvm_arm64_ftr_safe_value(u32 id, const struct arm64_ftr_bits *ftrp,
+ s64 new, s64 cur)
{
- if (kvm_vcpu_has_pmu(vcpu))
- return vcpu->kvm->arch.dfr0_pmuver.imp;
+ struct arm64_ftr_bits kvm_ftr = *ftrp;
+
+ /* Some features have different safe value type in KVM than host features */
+ switch (id) {
+ case SYS_ID_AA64DFR0_EL1:
+ if (kvm_ftr.shift == ID_AA64DFR0_EL1_PMUVer_SHIFT)
+ kvm_ftr.type = FTR_LOWER_SAFE;
+ break;
+ case SYS_ID_DFR0_EL1:
+ if (kvm_ftr.shift == ID_DFR0_EL1_PerfMon_SHIFT)
+ kvm_ftr.type = FTR_LOWER_SAFE;
+ break;
+ }
- return vcpu->kvm->arch.dfr0_pmuver.unimp;
+ return arm64_ftr_safe_value(&kvm_ftr, new, cur);
}
-static u8 perfmon_to_pmuver(u8 perfmon)
+/**
+ * arm64_check_features() - Check if a feature register value constitutes
+ * a subset of features indicated by the idreg's KVM sanitised limit.
+ *
+ * This function will check if each feature field of @val is the "safe" value
+ * against idreg's KVM sanitised limit return from reset() callback.
+ * If a field value in @val is the same as the one in limit, it is always
+ * considered the safe value regardless For register fields that are not in
+ * writable, only the value in limit is considered the safe value.
+ *
+ * Return: 0 if all the fields are safe. Otherwise, return negative errno.
+ */
+static int arm64_check_features(struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd,
+ u64 val)
{
- switch (perfmon) {
- case ID_DFR0_EL1_PerfMon_PMUv3:
- return ID_AA64DFR0_EL1_PMUVer_IMP;
- case ID_DFR0_EL1_PerfMon_IMPDEF:
- return ID_AA64DFR0_EL1_PMUVer_IMP_DEF;
- default:
- /* Anything ARMv8.1+ and NI have the same value. For now. */
- return perfmon;
+ const struct arm64_ftr_reg *ftr_reg;
+ const struct arm64_ftr_bits *ftrp = NULL;
+ u32 id = reg_to_encoding(rd);
+ u64 writable_mask = rd->val;
+ u64 limit = rd->reset(vcpu, rd);
+ u64 mask = 0;
+
+ /*
+ * Hidden and unallocated ID registers may not have a corresponding
+ * struct arm64_ftr_reg. Of course, if the register is RAZ we know the
+ * only safe value is 0.
+ */
+ if (sysreg_visible_as_raz(vcpu, rd))
+ return val ? -E2BIG : 0;
+
+ ftr_reg = get_arm64_ftr_reg(id);
+ if (!ftr_reg)
+ return -EINVAL;
+
+ ftrp = ftr_reg->ftr_bits;
+
+ for (; ftrp && ftrp->width; ftrp++) {
+ s64 f_val, f_lim, safe_val;
+ u64 ftr_mask;
+
+ ftr_mask = arm64_ftr_mask(ftrp);
+ if ((ftr_mask & writable_mask) != ftr_mask)
+ continue;
+
+ f_val = arm64_ftr_value(ftrp, val);
+ f_lim = arm64_ftr_value(ftrp, limit);
+ mask |= ftr_mask;
+
+ if (f_val == f_lim)
+ safe_val = f_val;
+ else
+ safe_val = kvm_arm64_ftr_safe_value(id, ftrp, f_val, f_lim);
+
+ if (safe_val != f_val)
+ return -E2BIG;
}
+
+ /* For fields that are not writable, values in limit are the safe values. */
+ if ((val & ~mask) != (limit & ~mask))
+ return -E2BIG;
+
+ return 0;
}
static u8 pmuver_to_perfmon(u8 pmuver)
@@ -1222,7 +1307,8 @@ static u8 pmuver_to_perfmon(u8 pmuver)
}
/* Read a sanitised cpufeature ID register by sys_reg_desc */
-static u64 read_id_reg(const struct kvm_vcpu *vcpu, struct sys_reg_desc const *r)
+static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *r)
{
u32 id = reg_to_encoding(r);
u64 val;
@@ -1233,19 +1319,6 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu, struct sys_reg_desc const *r
val = read_sanitised_ftr_reg(id);
switch (id) {
- case SYS_ID_AA64PFR0_EL1:
- if (!vcpu_has_sve(vcpu))
- val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE);
- val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AMU);
- val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2);
- val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2), (u64)vcpu->kvm->arch.pfr0_csv2);
- val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3);
- val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3), (u64)vcpu->kvm->arch.pfr0_csv3);
- if (kvm_vgic_global_state.type == VGIC_V3) {
- val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_GIC);
- val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_GIC), 1);
- }
- break;
case SYS_ID_AA64PFR1_EL1:
if (!kvm_has_mte(vcpu->kvm))
val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE);
@@ -1266,22 +1339,6 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu, struct sys_reg_desc const *r
if (!cpus_have_final_cap(ARM64_HAS_WFXT))
val &= ~ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_WFxT);
break;
- case SYS_ID_AA64DFR0_EL1:
- /* Limit debug to ARMv8.0 */
- val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DebugVer);
- val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DebugVer), 6);
- /* Set PMUver to the required version */
- val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer);
- val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer),
- vcpu_pmuver(vcpu));
- /* Hide SPE from guests */
- val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMSVer);
- break;
- case SYS_ID_DFR0_EL1:
- val &= ~ARM64_FEATURE_MASK(ID_DFR0_EL1_PerfMon);
- val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_DFR0_EL1_PerfMon),
- pmuver_to_perfmon(vcpu_pmuver(vcpu)));
- break;
case SYS_ID_AA64MMFR2_EL1:
val &= ~ID_AA64MMFR2_EL1_CCIDX_MASK;
break;
@@ -1293,6 +1350,28 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu, struct sys_reg_desc const *r
return val;
}
+static u64 kvm_read_sanitised_id_reg(struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *r)
+{
+ return __kvm_read_sanitised_id_reg(vcpu, r);
+}
+
+static u64 read_id_reg(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+{
+ return IDREG(vcpu->kvm, reg_to_encoding(r));
+}
+
+/*
+ * Return true if the register's (Op0, Op1, CRn, CRm, Op2) is
+ * (3, 0, 0, crm, op2), where 1<=crm<8, 0<=op2<8.
+ */
+static inline bool is_id_reg(u32 id)
+{
+ return (sys_reg_Op0(id) == 3 && sys_reg_Op1(id) == 0 &&
+ sys_reg_CRn(id) == 0 && sys_reg_CRm(id) >= 1 &&
+ sys_reg_CRm(id) < 8);
+}
+
static unsigned int id_visibility(const struct kvm_vcpu *vcpu,
const struct sys_reg_desc *r)
{
@@ -1354,88 +1433,113 @@ static unsigned int sve_visibility(const struct kvm_vcpu *vcpu,
return REG_HIDDEN;
}
-static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu,
- const struct sys_reg_desc *rd,
- u64 val)
+static u64 read_sanitised_id_aa64pfr0_el1(struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd)
{
- u8 csv2, csv3;
+ u64 val = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1);
+
+ if (!vcpu_has_sve(vcpu))
+ val &= ~ID_AA64PFR0_EL1_SVE_MASK;
/*
- * Allow AA64PFR0_EL1.CSV2 to be set from userspace as long as
- * it doesn't promise more than what is actually provided (the
- * guest could otherwise be covered in ectoplasmic residue).
+ * The default is to expose CSV2 == 1 if the HW isn't affected.
+ * Although this is a per-CPU feature, we make it global because
+ * asymmetric systems are just a nuisance.
+ *
+ * Userspace can override this as long as it doesn't promise
+ * the impossible.
*/
- csv2 = cpuid_feature_extract_unsigned_field(val, ID_AA64PFR0_EL1_CSV2_SHIFT);
- if (csv2 > 1 ||
- (csv2 && arm64_get_spectre_v2_state() != SPECTRE_UNAFFECTED))
- return -EINVAL;
+ if (arm64_get_spectre_v2_state() == SPECTRE_UNAFFECTED) {
+ val &= ~ID_AA64PFR0_EL1_CSV2_MASK;
+ val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, CSV2, IMP);
+ }
+ if (arm64_get_meltdown_state() == SPECTRE_UNAFFECTED) {
+ val &= ~ID_AA64PFR0_EL1_CSV3_MASK;
+ val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, CSV3, IMP);
+ }
- /* Same thing for CSV3 */
- csv3 = cpuid_feature_extract_unsigned_field(val, ID_AA64PFR0_EL1_CSV3_SHIFT);
- if (csv3 > 1 ||
- (csv3 && arm64_get_meltdown_state() != SPECTRE_UNAFFECTED))
- return -EINVAL;
+ if (kvm_vgic_global_state.type == VGIC_V3) {
+ val &= ~ID_AA64PFR0_EL1_GIC_MASK;
+ val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, GIC, IMP);
+ }
- /* We can only differ with CSV[23], and anything else is an error */
- val ^= read_id_reg(vcpu, rd);
- val &= ~(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2) |
- ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3));
- if (val)
- return -EINVAL;
+ val &= ~ID_AA64PFR0_EL1_AMU_MASK;
- vcpu->kvm->arch.pfr0_csv2 = csv2;
- vcpu->kvm->arch.pfr0_csv3 = csv3;
+ return val;
+}
- return 0;
+static u64 read_sanitised_id_aa64dfr0_el1(struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd)
+{
+ u64 val = read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1);
+
+ /* Limit debug to ARMv8.0 */
+ val &= ~ID_AA64DFR0_EL1_DebugVer_MASK;
+ val |= SYS_FIELD_PREP_ENUM(ID_AA64DFR0_EL1, DebugVer, IMP);
+
+ /*
+ * Only initialize the PMU version if the vCPU was configured with one.
+ */
+ val &= ~ID_AA64DFR0_EL1_PMUVer_MASK;
+ if (kvm_vcpu_has_pmu(vcpu))
+ val |= SYS_FIELD_PREP(ID_AA64DFR0_EL1, PMUVer,
+ kvm_arm_pmu_get_pmuver_limit());
+
+ /* Hide SPE from guests */
+ val &= ~ID_AA64DFR0_EL1_PMSVer_MASK;
+
+ return val;
}
static int set_id_aa64dfr0_el1(struct kvm_vcpu *vcpu,
const struct sys_reg_desc *rd,
u64 val)
{
- u8 pmuver, host_pmuver;
- bool valid_pmu;
-
- host_pmuver = kvm_arm_pmu_get_pmuver_limit();
+ u8 pmuver = SYS_FIELD_GET(ID_AA64DFR0_EL1, PMUVer, val);
/*
- * Allow AA64DFR0_EL1.PMUver to be set from userspace as long
- * as it doesn't promise more than what the HW gives us. We
- * allow an IMPDEF PMU though, only if no PMU is supported
- * (KVM backward compatibility handling).
+ * Prior to commit 3d0dba5764b9 ("KVM: arm64: PMU: Move the
+ * ID_AA64DFR0_EL1.PMUver limit to VM creation"), KVM erroneously
+ * exposed an IMP_DEF PMU to userspace and the guest on systems w/
+ * non-architectural PMUs. Of course, PMUv3 is the only game in town for
+ * PMU virtualization, so the IMP_DEF value was rather user-hostile.
+ *
+ * At minimum, we're on the hook to allow values that were given to
+ * userspace by KVM. Cover our tracks here and replace the IMP_DEF value
+ * with a more sensible NI. The value of an ID register changing under
+ * the nose of the guest is unfortunate, but is certainly no more
+ * surprising than an ill-guided PMU driver poking at impdef system
+ * registers that end in an UNDEF...
*/
- pmuver = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), val);
- if ((pmuver != ID_AA64DFR0_EL1_PMUVer_IMP_DEF && pmuver > host_pmuver))
- return -EINVAL;
+ if (pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF)
+ val &= ~ID_AA64DFR0_EL1_PMUVer_MASK;
- valid_pmu = (pmuver != 0 && pmuver != ID_AA64DFR0_EL1_PMUVer_IMP_DEF);
-
- /* Make sure view register and PMU support do match */
- if (kvm_vcpu_has_pmu(vcpu) != valid_pmu)
- return -EINVAL;
+ return set_id_reg(vcpu, rd, val);
+}
- /* We can only differ with PMUver, and anything else is an error */
- val ^= read_id_reg(vcpu, rd);
- val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer);
- if (val)
- return -EINVAL;
+static u64 read_sanitised_id_dfr0_el1(struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd)
+{
+ u8 perfmon = pmuver_to_perfmon(kvm_arm_pmu_get_pmuver_limit());
+ u64 val = read_sanitised_ftr_reg(SYS_ID_DFR0_EL1);
- if (valid_pmu)
- vcpu->kvm->arch.dfr0_pmuver.imp = pmuver;
- else
- vcpu->kvm->arch.dfr0_pmuver.unimp = pmuver;
+ val &= ~ID_DFR0_EL1_PerfMon_MASK;
+ if (kvm_vcpu_has_pmu(vcpu))
+ val |= SYS_FIELD_PREP(ID_DFR0_EL1, PerfMon, perfmon);
- return 0;
+ return val;
}
static int set_id_dfr0_el1(struct kvm_vcpu *vcpu,
const struct sys_reg_desc *rd,
u64 val)
{
- u8 perfmon, host_perfmon;
- bool valid_pmu;
+ u8 perfmon = SYS_FIELD_GET(ID_DFR0_EL1, PerfMon, val);
- host_perfmon = pmuver_to_perfmon(kvm_arm_pmu_get_pmuver_limit());
+ if (perfmon == ID_DFR0_EL1_PerfMon_IMPDEF) {
+ val &= ~ID_DFR0_EL1_PerfMon_MASK;
+ perfmon = 0;
+ }
/*
* Allow DFR0_EL1.PerfMon to be set from userspace as long as
@@ -1443,29 +1547,10 @@ static int set_id_dfr0_el1(struct kvm_vcpu *vcpu,
* AArch64 side (as everything is emulated with that), and
* that this is a PMUv3.
*/
- perfmon = FIELD_GET(ARM64_FEATURE_MASK(ID_DFR0_EL1_PerfMon), val);
- if ((perfmon != ID_DFR0_EL1_PerfMon_IMPDEF && perfmon > host_perfmon) ||
- (perfmon != 0 && perfmon < ID_DFR0_EL1_PerfMon_PMUv3))
+ if (perfmon != 0 && perfmon < ID_DFR0_EL1_PerfMon_PMUv3)
return -EINVAL;
- valid_pmu = (perfmon != 0 && perfmon != ID_DFR0_EL1_PerfMon_IMPDEF);
-
- /* Make sure view register and PMU support do match */
- if (kvm_vcpu_has_pmu(vcpu) != valid_pmu)
- return -EINVAL;
-
- /* We can only differ with PerfMon, and anything else is an error */
- val ^= read_id_reg(vcpu, rd);
- val &= ~ARM64_FEATURE_MASK(ID_DFR0_EL1_PerfMon);
- if (val)
- return -EINVAL;
-
- if (valid_pmu)
- vcpu->kvm->arch.dfr0_pmuver.imp = perfmon_to_pmuver(perfmon);
- else
- vcpu->kvm->arch.dfr0_pmuver.unimp = perfmon_to_pmuver(perfmon);
-
- return 0;
+ return set_id_reg(vcpu, rd, val);
}
/*
@@ -1478,18 +1563,60 @@ static int set_id_dfr0_el1(struct kvm_vcpu *vcpu,
static int get_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
u64 *val)
{
+ /*
+ * Avoid locking if the VM has already started, as the ID registers are
+ * guaranteed to be invariant at that point.
+ */
+ if (kvm_vm_has_ran_once(vcpu->kvm)) {
+ *val = read_id_reg(vcpu, rd);
+ return 0;
+ }
+
+ mutex_lock(&vcpu->kvm->arch.config_lock);
*val = read_id_reg(vcpu, rd);
+ mutex_unlock(&vcpu->kvm->arch.config_lock);
+
return 0;
}
static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
u64 val)
{
- /* This is what we mean by invariant: you can't change it. */
- if (val != read_id_reg(vcpu, rd))
- return -EINVAL;
+ u32 id = reg_to_encoding(rd);
+ int ret;
- return 0;
+ mutex_lock(&vcpu->kvm->arch.config_lock);
+
+ /*
+ * Once the VM has started the ID registers are immutable. Reject any
+ * write that does not match the final register value.
+ */
+ if (kvm_vm_has_ran_once(vcpu->kvm)) {
+ if (val != read_id_reg(vcpu, rd))
+ ret = -EBUSY;
+ else
+ ret = 0;
+
+ mutex_unlock(&vcpu->kvm->arch.config_lock);
+ return ret;
+ }
+
+ ret = arm64_check_features(vcpu, rd, val);
+ if (!ret)
+ IDREG(vcpu->kvm, id) = val;
+
+ mutex_unlock(&vcpu->kvm->arch.config_lock);
+
+ /*
+ * arm64_check_features() returns -E2BIG to indicate the register's
+ * feature set is a superset of the maximally-allowed register value.
+ * While it would be nice to precisely describe this to userspace, the
+ * existing UAPI for KVM_SET_ONE_REG has it that invalid register
+ * writes return -EINVAL.
+ */
+ if (ret == -E2BIG)
+ ret = -EINVAL;
+ return ret;
}
static int get_raz_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
@@ -1529,7 +1656,7 @@ static bool access_clidr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
* Fabricate a CLIDR_EL1 value instead of using the real value, which can vary
* by the physical CPU which the vcpu currently resides in.
*/
-static void reset_clidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+static u64 reset_clidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
u64 ctr_el0 = read_sanitised_ftr_reg(SYS_CTR_EL0);
u64 clidr;
@@ -1577,6 +1704,8 @@ static void reset_clidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
clidr |= 2 << CLIDR_TTYPE_SHIFT(loc);
__vcpu_sys_reg(vcpu, r->reg) = clidr;
+
+ return __vcpu_sys_reg(vcpu, r->reg);
}
static int set_clidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
@@ -1676,6 +1805,17 @@ static unsigned int elx2_visibility(const struct kvm_vcpu *vcpu,
.visibility = elx2_visibility, \
}
+/*
+ * Since reset() callback and field val are not used for idregs, they will be
+ * used for specific purposes for idregs.
+ * The reset() would return KVM sanitised register value. The value would be the
+ * same as the host kernel sanitised value if there is no KVM sanitisation.
+ * The val would be used as a mask indicating writable fields for the idreg.
+ * Only bits with 1 are writable from userspace. This mask might not be
+ * necessary in the future whenever all ID registers are enabled as writable
+ * from userspace.
+ */
+
/* sys_reg_desc initialiser for known cpufeature ID registers */
#define ID_SANITISED(name) { \
SYS_DESC(SYS_##name), \
@@ -1683,6 +1823,8 @@ static unsigned int elx2_visibility(const struct kvm_vcpu *vcpu,
.get_user = get_id_reg, \
.set_user = set_id_reg, \
.visibility = id_visibility, \
+ .reset = kvm_read_sanitised_id_reg, \
+ .val = 0, \
}
/* sys_reg_desc initialiser for known cpufeature ID registers */
@@ -1692,6 +1834,8 @@ static unsigned int elx2_visibility(const struct kvm_vcpu *vcpu,
.get_user = get_id_reg, \
.set_user = set_id_reg, \
.visibility = aa32_id_visibility, \
+ .reset = kvm_read_sanitised_id_reg, \
+ .val = 0, \
}
/*
@@ -1704,7 +1848,9 @@ static unsigned int elx2_visibility(const struct kvm_vcpu *vcpu,
.access = access_id_reg, \
.get_user = get_id_reg, \
.set_user = set_id_reg, \
- .visibility = raz_visibility \
+ .visibility = raz_visibility, \
+ .reset = kvm_read_sanitised_id_reg, \
+ .val = 0, \
}
/*
@@ -1718,6 +1864,8 @@ static unsigned int elx2_visibility(const struct kvm_vcpu *vcpu,
.get_user = get_id_reg, \
.set_user = set_id_reg, \
.visibility = raz_visibility, \
+ .reset = kvm_read_sanitised_id_reg, \
+ .val = 0, \
}
static bool access_sp_el1(struct kvm_vcpu *vcpu,
@@ -1825,9 +1973,13 @@ static const struct sys_reg_desc sys_reg_descs[] = {
/* CRm=1 */
AA32_ID_SANITISED(ID_PFR0_EL1),
AA32_ID_SANITISED(ID_PFR1_EL1),
- { SYS_DESC(SYS_ID_DFR0_EL1), .access = access_id_reg,
- .get_user = get_id_reg, .set_user = set_id_dfr0_el1,
- .visibility = aa32_id_visibility, },
+ { SYS_DESC(SYS_ID_DFR0_EL1),
+ .access = access_id_reg,
+ .get_user = get_id_reg,
+ .set_user = set_id_dfr0_el1,
+ .visibility = aa32_id_visibility,
+ .reset = read_sanitised_id_dfr0_el1,
+ .val = ID_DFR0_EL1_PerfMon_MASK, },
ID_HIDDEN(ID_AFR0_EL1),
AA32_ID_SANITISED(ID_MMFR0_EL1),
AA32_ID_SANITISED(ID_MMFR1_EL1),
@@ -1856,8 +2008,12 @@ static const struct sys_reg_desc sys_reg_descs[] = {
/* AArch64 ID registers */
/* CRm=4 */
- { SYS_DESC(SYS_ID_AA64PFR0_EL1), .access = access_id_reg,
- .get_user = get_id_reg, .set_user = set_id_aa64pfr0_el1, },
+ { SYS_DESC(SYS_ID_AA64PFR0_EL1),
+ .access = access_id_reg,
+ .get_user = get_id_reg,
+ .set_user = set_id_reg,
+ .reset = read_sanitised_id_aa64pfr0_el1,
+ .val = ID_AA64PFR0_EL1_CSV2_MASK | ID_AA64PFR0_EL1_CSV3_MASK, },
ID_SANITISED(ID_AA64PFR1_EL1),
ID_UNALLOCATED(4,2),
ID_UNALLOCATED(4,3),
@@ -1867,8 +2023,12 @@ static const struct sys_reg_desc sys_reg_descs[] = {
ID_UNALLOCATED(4,7),
/* CRm=5 */
- { SYS_DESC(SYS_ID_AA64DFR0_EL1), .access = access_id_reg,
- .get_user = get_id_reg, .set_user = set_id_aa64dfr0_el1, },
+ { SYS_DESC(SYS_ID_AA64DFR0_EL1),
+ .access = access_id_reg,
+ .get_user = get_id_reg,
+ .set_user = set_id_aa64dfr0_el1,
+ .reset = read_sanitised_id_aa64dfr0_el1,
+ .val = ID_AA64DFR0_EL1_PMUVer_MASK, },
ID_SANITISED(ID_AA64DFR1_EL1),
ID_UNALLOCATED(5,2),
ID_UNALLOCATED(5,3),
@@ -2199,7 +2359,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
EL2_REG(ACTLR_EL2, access_rw, reset_val, 0),
EL2_REG(HCR_EL2, access_rw, reset_val, 0),
EL2_REG(MDCR_EL2, access_rw, reset_val, 0),
- EL2_REG(CPTR_EL2, access_rw, reset_val, CPTR_EL2_DEFAULT ),
+ EL2_REG(CPTR_EL2, access_rw, reset_val, CPTR_NVHE_EL2_RES1),
EL2_REG(HSTR_EL2, access_rw, reset_val, 0),
EL2_REG(HACR_EL2, access_rw, reset_val, 0),
@@ -2256,6 +2416,8 @@ static const struct sys_reg_desc sys_reg_descs[] = {
EL2_REG(SP_EL2, NULL, reset_unknown, 0),
};
+static const struct sys_reg_desc *first_idreg;
+
static bool trap_dbgdidr(struct kvm_vcpu *vcpu,
struct sys_reg_params *p,
const struct sys_reg_desc *r)
@@ -2946,6 +3108,28 @@ static bool emulate_sys_reg(struct kvm_vcpu *vcpu,
return false;
}
+static void kvm_reset_id_regs(struct kvm_vcpu *vcpu)
+{
+ const struct sys_reg_desc *idreg = first_idreg;
+ u32 id = reg_to_encoding(idreg);
+ struct kvm *kvm = vcpu->kvm;
+
+ if (test_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags))
+ return;
+
+ lockdep_assert_held(&kvm->arch.config_lock);
+
+ /* Initialize all idregs */
+ while (is_id_reg(id)) {
+ IDREG(kvm, id) = idreg->reset(vcpu, idreg);
+
+ idreg++;
+ id = reg_to_encoding(idreg);
+ }
+
+ set_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags);
+}
+
/**
* kvm_reset_sys_regs - sets system registers to reset value
* @vcpu: The VCPU pointer
@@ -2957,9 +3141,17 @@ void kvm_reset_sys_regs(struct kvm_vcpu *vcpu)
{
unsigned long i;
- for (i = 0; i < ARRAY_SIZE(sys_reg_descs); i++)
- if (sys_reg_descs[i].reset)
- sys_reg_descs[i].reset(vcpu, &sys_reg_descs[i]);
+ kvm_reset_id_regs(vcpu);
+
+ for (i = 0; i < ARRAY_SIZE(sys_reg_descs); i++) {
+ const struct sys_reg_desc *r = &sys_reg_descs[i];
+
+ if (is_id_reg(reg_to_encoding(r)))
+ continue;
+
+ if (r->reset)
+ r->reset(vcpu, r);
+ }
}
/**
@@ -3060,19 +3252,21 @@ id_to_sys_reg_desc(struct kvm_vcpu *vcpu, u64 id,
*/
#define FUNCTION_INVARIANT(reg) \
- static void get_##reg(struct kvm_vcpu *v, \
+ static u64 get_##reg(struct kvm_vcpu *v, \
const struct sys_reg_desc *r) \
{ \
((struct sys_reg_desc *)r)->val = read_sysreg(reg); \
+ return ((struct sys_reg_desc *)r)->val; \
}
FUNCTION_INVARIANT(midr_el1)
FUNCTION_INVARIANT(revidr_el1)
FUNCTION_INVARIANT(aidr_el1)
-static void get_ctr_el0(struct kvm_vcpu *v, const struct sys_reg_desc *r)
+static u64 get_ctr_el0(struct kvm_vcpu *v, const struct sys_reg_desc *r)
{
((struct sys_reg_desc *)r)->val = read_sanitised_ftr_reg(SYS_CTR_EL0);
+ return ((struct sys_reg_desc *)r)->val;
}
/* ->val is filled in by kvm_sys_reg_table_init() */
@@ -3364,6 +3558,7 @@ int kvm_arm_copy_sys_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices)
int __init kvm_sys_reg_table_init(void)
{
+ struct sys_reg_params params;
bool valid = true;
unsigned int i;
@@ -3382,5 +3577,11 @@ int __init kvm_sys_reg_table_init(void)
for (i = 0; i < ARRAY_SIZE(invariant_sys_regs); i++)
invariant_sys_regs[i].reset(NULL, &invariant_sys_regs[i]);
+ /* Find the first idreg (SYS_ID_PFR0_EL1) in sys_reg_descs. */
+ params = encoding_to_params(SYS_ID_PFR0_EL1);
+ first_idreg = find_reg(&params, sys_reg_descs, ARRAY_SIZE(sys_reg_descs));
+ if (!first_idreg)
+ return -EINVAL;
+
return 0;
}
diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h
index 6b11f2cc7146..c65c129b3500 100644
--- a/arch/arm64/kvm/sys_regs.h
+++ b/arch/arm64/kvm/sys_regs.h
@@ -27,6 +27,13 @@ struct sys_reg_params {
bool is_write;
};
+#define encoding_to_params(reg) \
+ ((struct sys_reg_params){ .Op0 = sys_reg_Op0(reg), \
+ .Op1 = sys_reg_Op1(reg), \
+ .CRn = sys_reg_CRn(reg), \
+ .CRm = sys_reg_CRm(reg), \
+ .Op2 = sys_reg_Op2(reg) })
+
#define esr_sys64_to_params(esr) \
((struct sys_reg_params){ .Op0 = ((esr) >> 20) & 3, \
.Op1 = ((esr) >> 14) & 0x7, \
@@ -64,13 +71,16 @@ struct sys_reg_desc {
struct sys_reg_params *,
const struct sys_reg_desc *);
- /* Initialization for vcpu. */
- void (*reset)(struct kvm_vcpu *, const struct sys_reg_desc *);
+ /*
+ * Initialization for vcpu. Return initialized value, or KVM
+ * sanitized value for ID registers.
+ */
+ u64 (*reset)(struct kvm_vcpu *, const struct sys_reg_desc *);
/* Index into sys_reg[], or 0 if we don't need to save it. */
int reg;
- /* Value (usually reset value) */
+ /* Value (usually reset value), or write mask for idregs */
u64 val;
/* Custom get/set_user functions, fallback to generic if NULL */
@@ -123,19 +133,21 @@ static inline bool read_zero(struct kvm_vcpu *vcpu,
}
/* Reset functions */
-static inline void reset_unknown(struct kvm_vcpu *vcpu,
+static inline u64 reset_unknown(struct kvm_vcpu *vcpu,
const struct sys_reg_desc *r)
{
BUG_ON(!r->reg);
BUG_ON(r->reg >= NR_SYS_REGS);
__vcpu_sys_reg(vcpu, r->reg) = 0x1de7ec7edbadc0deULL;
+ return __vcpu_sys_reg(vcpu, r->reg);
}
-static inline void reset_val(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+static inline u64 reset_val(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
BUG_ON(!r->reg);
BUG_ON(r->reg >= NR_SYS_REGS);
__vcpu_sys_reg(vcpu, r->reg) = r->val;
+ return __vcpu_sys_reg(vcpu, r->reg);
}
static inline unsigned int sysreg_visibility(const struct kvm_vcpu *vcpu,