diff options
Diffstat (limited to 'arch/arm64/kvm')
-rw-r--r-- | arch/arm64/kvm/Kconfig | 1 | ||||
-rw-r--r-- | arch/arm64/kvm/arch_timer.c | 45 | ||||
-rw-r--r-- | arch/arm64/kvm/arm.c | 72 | ||||
-rw-r--r-- | arch/arm64/kvm/hyp/include/nvhe/fixed_config.h | 5 | ||||
-rw-r--r-- | arch/arm64/kvm/hyp/include/nvhe/gfp.h | 2 | ||||
-rw-r--r-- | arch/arm64/kvm/hyp/nvhe/page_alloc.c | 10 | ||||
-rw-r--r-- | arch/arm64/kvm/hyp/nvhe/sys_regs.c | 7 | ||||
-rw-r--r-- | arch/arm64/kvm/hypercalls.c | 4 | ||||
-rw-r--r-- | arch/arm64/kvm/mmu.c | 99 | ||||
-rw-r--r-- | arch/arm64/kvm/pkvm.c | 47 | ||||
-rw-r--r-- | arch/arm64/kvm/pmu-emul.c | 4 | ||||
-rw-r--r-- | arch/arm64/kvm/sys_regs.c | 22 |
12 files changed, 182 insertions, 136 deletions
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index ca6eadeb7d1a..f531da6b362e 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -29,7 +29,6 @@ menuconfig KVM select KVM_MMIO select KVM_GENERIC_DIRTYLOG_READ_PROTECT select KVM_XFER_TO_GUEST_WORK - select SRCU select KVM_VFIO select HAVE_KVM_EVENTFD select HAVE_KVM_IRQFD diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index 00610477ec7b..e1af4301b913 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -84,14 +84,10 @@ u64 timer_get_cval(struct arch_timer_context *ctxt) static u64 timer_get_offset(struct arch_timer_context *ctxt) { - struct kvm_vcpu *vcpu = ctxt->vcpu; + if (ctxt->offset.vm_offset) + return *ctxt->offset.vm_offset; - switch(arch_timer_ctx_index(ctxt)) { - case TIMER_VTIMER: - return __vcpu_sys_reg(vcpu, CNTVOFF_EL2); - default: - return 0; - } + return 0; } static void timer_set_ctl(struct arch_timer_context *ctxt, u32 ctl) @@ -128,15 +124,12 @@ static void timer_set_cval(struct arch_timer_context *ctxt, u64 cval) static void timer_set_offset(struct arch_timer_context *ctxt, u64 offset) { - struct kvm_vcpu *vcpu = ctxt->vcpu; - - switch(arch_timer_ctx_index(ctxt)) { - case TIMER_VTIMER: - __vcpu_sys_reg(vcpu, CNTVOFF_EL2) = offset; - break; - default: + if (!ctxt->offset.vm_offset) { WARN(offset, "timer %ld\n", arch_timer_ctx_index(ctxt)); + return; } + + WRITE_ONCE(*ctxt->offset.vm_offset, offset); } u64 kvm_phys_timer_read(void) @@ -765,25 +758,6 @@ int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu) return 0; } -/* Make the updates of cntvoff for all vtimer contexts atomic */ -static void update_vtimer_cntvoff(struct kvm_vcpu *vcpu, u64 cntvoff) -{ - unsigned long i; - struct kvm *kvm = vcpu->kvm; - struct kvm_vcpu *tmp; - - mutex_lock(&kvm->lock); - kvm_for_each_vcpu(i, tmp, kvm) - timer_set_offset(vcpu_vtimer(tmp), cntvoff); - - /* - * When called from the vcpu create path, the CPU being created is not - * included in the loop above, so we just set it here as well. - */ - timer_set_offset(vcpu_vtimer(vcpu), cntvoff); - mutex_unlock(&kvm->lock); -} - void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = vcpu_timer(vcpu); @@ -791,10 +765,11 @@ void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); vtimer->vcpu = vcpu; + vtimer->offset.vm_offset = &vcpu->kvm->arch.timer_data.voffset; ptimer->vcpu = vcpu; /* Synchronize cntvoff across all vtimers of a VM. */ - update_vtimer_cntvoff(vcpu, kvm_phys_timer_read()); + timer_set_offset(vtimer, kvm_phys_timer_read()); timer_set_offset(ptimer, 0); hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); @@ -840,7 +815,7 @@ int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value) break; case KVM_REG_ARM_TIMER_CNT: timer = vcpu_vtimer(vcpu); - update_vtimer_cntvoff(vcpu, kvm_phys_timer_read() - value); + timer_set_offset(timer, kvm_phys_timer_read() - value); break; case KVM_REG_ARM_TIMER_CVAL: timer = vcpu_vtimer(vcpu); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 3bd732eaf087..6673c7b4f1a8 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -16,7 +16,6 @@ #include <linux/fs.h> #include <linux/mman.h> #include <linux/sched.h> -#include <linux/kmemleak.h> #include <linux/kvm.h> #include <linux/kvm_irqfd.h> #include <linux/irqbypass.h> @@ -46,7 +45,6 @@ #include <kvm/arm_psci.h> static enum kvm_mode kvm_mode = KVM_MODE_DEFAULT; -DEFINE_STATIC_KEY_FALSE(kvm_protected_mode_initialized); DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector); @@ -220,6 +218,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_VCPU_ATTRIBUTES: case KVM_CAP_PTP_KVM: case KVM_CAP_ARM_SYSTEM_SUSPEND: + case KVM_CAP_IRQFD_RESAMPLE: r = 1; break; case KVM_CAP_SET_GUEST_DEBUG2: @@ -1889,9 +1888,33 @@ static int __init do_pkvm_init(u32 hyp_va_bits) return ret; } +static u64 get_hyp_id_aa64pfr0_el1(void) +{ + /* + * Track whether the system isn't affected by spectre/meltdown in the + * hypervisor's view of id_aa64pfr0_el1, used for protected VMs. + * Although this is per-CPU, we make it global for simplicity, e.g., not + * to have to worry about vcpu migration. + * + * Unlike for non-protected VMs, userspace cannot override this for + * protected VMs. + */ + u64 val = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); + + val &= ~(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2) | + ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3)); + + val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2), + arm64_get_spectre_v2_state() == SPECTRE_UNAFFECTED); + val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3), + arm64_get_meltdown_state() == SPECTRE_UNAFFECTED); + + return val; +} + static void kvm_hyp_init_symbols(void) { - kvm_nvhe_sym(id_aa64pfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); + kvm_nvhe_sym(id_aa64pfr0_el1_sys_val) = get_hyp_id_aa64pfr0_el1(); kvm_nvhe_sym(id_aa64pfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1); kvm_nvhe_sym(id_aa64isar0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR0_EL1); kvm_nvhe_sym(id_aa64isar1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR1_EL1); @@ -2105,41 +2128,6 @@ out_err: return err; } -static void __init _kvm_host_prot_finalize(void *arg) -{ - int *err = arg; - - if (WARN_ON(kvm_call_hyp_nvhe(__pkvm_prot_finalize))) - WRITE_ONCE(*err, -EINVAL); -} - -static int __init pkvm_drop_host_privileges(void) -{ - int ret = 0; - - /* - * Flip the static key upfront as that may no longer be possible - * once the host stage 2 is installed. - */ - static_branch_enable(&kvm_protected_mode_initialized); - on_each_cpu(_kvm_host_prot_finalize, &ret, 1); - return ret; -} - -static int __init finalize_hyp_mode(void) -{ - if (!is_protected_kvm_enabled()) - return 0; - - /* - * Exclude HYP sections from kmemleak so that they don't get peeked - * at, which would end badly once inaccessible. - */ - kmemleak_free_part(__hyp_bss_start, __hyp_bss_end - __hyp_bss_start); - kmemleak_free_part_phys(hyp_mem_base, hyp_mem_size); - return pkvm_drop_host_privileges(); -} - struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr) { struct kvm_vcpu *vcpu; @@ -2257,14 +2245,6 @@ static __init int kvm_arm_init(void) if (err) goto out_hyp; - if (!in_hyp_mode) { - err = finalize_hyp_mode(); - if (err) { - kvm_err("Failed to finalize Hyp protection\n"); - goto out_subs; - } - } - if (is_protected_kvm_enabled()) { kvm_info("Protected nVHE mode initialized successfully\n"); } else if (in_hyp_mode) { diff --git a/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h b/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h index 07edfc7524c9..37440e1dda93 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h +++ b/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h @@ -33,11 +33,14 @@ * Allow for protected VMs: * - Floating-point and Advanced SIMD * - Data Independent Timing + * - Spectre/Meltdown Mitigation */ #define PVM_ID_AA64PFR0_ALLOW (\ ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_FP) | \ ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AdvSIMD) | \ - ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_DIT) \ + ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_DIT) | \ + ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2) | \ + ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3) \ ) /* diff --git a/arch/arm64/kvm/hyp/include/nvhe/gfp.h b/arch/arm64/kvm/hyp/include/nvhe/gfp.h index 0a048dc06a7d..fe5472a184a3 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/gfp.h +++ b/arch/arm64/kvm/hyp/include/nvhe/gfp.h @@ -16,7 +16,7 @@ struct hyp_pool { * API at EL2. */ hyp_spinlock_t lock; - struct list_head free_area[MAX_ORDER]; + struct list_head free_area[MAX_ORDER + 1]; phys_addr_t range_start; phys_addr_t range_end; unsigned short max_order; diff --git a/arch/arm64/kvm/hyp/nvhe/page_alloc.c b/arch/arm64/kvm/hyp/nvhe/page_alloc.c index 803ba3222e75..b1e392186a0f 100644 --- a/arch/arm64/kvm/hyp/nvhe/page_alloc.c +++ b/arch/arm64/kvm/hyp/nvhe/page_alloc.c @@ -110,7 +110,7 @@ static void __hyp_attach_page(struct hyp_pool *pool, * after coalescing, so make sure to mark it HYP_NO_ORDER proactively. */ p->order = HYP_NO_ORDER; - for (; (order + 1) < pool->max_order; order++) { + for (; (order + 1) <= pool->max_order; order++) { buddy = __find_buddy_avail(pool, p, order); if (!buddy) break; @@ -203,9 +203,9 @@ void *hyp_alloc_pages(struct hyp_pool *pool, unsigned short order) hyp_spin_lock(&pool->lock); /* Look for a high-enough-order page */ - while (i < pool->max_order && list_empty(&pool->free_area[i])) + while (i <= pool->max_order && list_empty(&pool->free_area[i])) i++; - if (i >= pool->max_order) { + if (i > pool->max_order) { hyp_spin_unlock(&pool->lock); return NULL; } @@ -228,8 +228,8 @@ int hyp_pool_init(struct hyp_pool *pool, u64 pfn, unsigned int nr_pages, int i; hyp_spin_lock_init(&pool->lock); - pool->max_order = min(MAX_ORDER, get_order((nr_pages + 1) << PAGE_SHIFT)); - for (i = 0; i < pool->max_order; i++) + pool->max_order = min(MAX_ORDER, get_order(nr_pages << PAGE_SHIFT)); + for (i = 0; i <= pool->max_order; i++) INIT_LIST_HEAD(&pool->free_area[i]); pool->range_start = phys; pool->range_end = phys + (nr_pages << PAGE_SHIFT); diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c b/arch/arm64/kvm/hyp/nvhe/sys_regs.c index 08d2b004f4b7..edd969a1f36b 100644 --- a/arch/arm64/kvm/hyp/nvhe/sys_regs.c +++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c @@ -85,19 +85,12 @@ static u64 get_restricted_features_unsigned(u64 sys_reg_val, static u64 get_pvm_id_aa64pfr0(const struct kvm_vcpu *vcpu) { - const struct kvm *kvm = (const struct kvm *)kern_hyp_va(vcpu->kvm); u64 set_mask = 0; u64 allow_mask = PVM_ID_AA64PFR0_ALLOW; set_mask |= get_restricted_features_unsigned(id_aa64pfr0_el1_sys_val, PVM_ID_AA64PFR0_RESTRICT_UNSIGNED); - /* Spectre and Meltdown mitigation in KVM */ - set_mask |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2), - (u64)kvm->arch.pfr0_csv2); - set_mask |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3), - (u64)kvm->arch.pfr0_csv3); - return (id_aa64pfr0_el1_sys_val & allow_mask) | set_mask; } diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c index 64c086c02c60..c4b4678bc4a4 100644 --- a/arch/arm64/kvm/hypercalls.c +++ b/arch/arm64/kvm/hypercalls.c @@ -44,7 +44,7 @@ static void kvm_ptp_get_time(struct kvm_vcpu *vcpu, u64 *val) feature = smccc_get_arg1(vcpu); switch (feature) { case KVM_PTP_VIRT_COUNTER: - cycles = systime_snapshot.cycles - vcpu_read_sys_reg(vcpu, CNTVOFF_EL2); + cycles = systime_snapshot.cycles - vcpu->kvm->arch.timer_data.voffset; break; case KVM_PTP_PHYS_COUNTER: cycles = systime_snapshot.cycles; @@ -397,6 +397,8 @@ int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) u64 val; int wa_level; + if (KVM_REG_SIZE(reg->id) != sizeof(val)) + return -ENOENT; if (copy_from_user(&val, uaddr, KVM_REG_SIZE(reg->id))) return -EFAULT; diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 7113587222ff..3b9d4d24c361 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -666,14 +666,33 @@ static int get_user_mapping_size(struct kvm *kvm, u64 addr) CONFIG_PGTABLE_LEVELS), .mm_ops = &kvm_user_mm_ops, }; + unsigned long flags; kvm_pte_t pte = 0; /* Keep GCC quiet... */ u32 level = ~0; int ret; + /* + * Disable IRQs so that we hazard against a concurrent + * teardown of the userspace page tables (which relies on + * IPI-ing threads). + */ + local_irq_save(flags); ret = kvm_pgtable_get_leaf(&pgt, addr, &pte, &level); - VM_BUG_ON(ret); - VM_BUG_ON(level >= KVM_PGTABLE_MAX_LEVELS); - VM_BUG_ON(!(pte & PTE_VALID)); + local_irq_restore(flags); + + if (ret) + return ret; + + /* + * Not seeing an error, but not updating level? Something went + * deeply wrong... + */ + if (WARN_ON(level >= KVM_PGTABLE_MAX_LEVELS)) + return -EFAULT; + + /* Oops, the userspace PTs are gone... Replay the fault */ + if (!kvm_pte_valid(pte)) + return -EAGAIN; return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(level)); } @@ -1079,7 +1098,7 @@ static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot, * * Returns the size of the mapping. */ -static unsigned long +static long transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long hva, kvm_pfn_t *pfnp, phys_addr_t *ipap) @@ -1091,8 +1110,15 @@ transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot, * sure that the HVA and IPA are sufficiently aligned and that the * block map is contained within the memslot. */ - if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE) && - get_user_mapping_size(kvm, hva) >= PMD_SIZE) { + if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) { + int sz = get_user_mapping_size(kvm, hva); + + if (sz < 0) + return sz; + + if (sz < PMD_SIZE) + return PAGE_SIZE; + /* * The address we faulted on is backed by a transparent huge * page. However, because we map the compound huge page and @@ -1192,7 +1218,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, { int ret = 0; bool write_fault, writable, force_pte = false; - bool exec_fault; + bool exec_fault, mte_allowed; bool device = false; unsigned long mmu_seq; struct kvm *kvm = vcpu->kvm; @@ -1203,7 +1229,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, kvm_pfn_t pfn; bool logging_active = memslot_is_logging(memslot); unsigned long fault_level = kvm_vcpu_trap_get_fault_level(vcpu); - unsigned long vma_pagesize, fault_granule; + long vma_pagesize, fault_granule; enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; struct kvm_pgtable *pgt; @@ -1218,6 +1244,20 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, } /* + * Permission faults just need to update the existing leaf entry, + * and so normally don't require allocations from the memcache. The + * only exception to this is when dirty logging is enabled at runtime + * and a write fault needs to collapse a block entry into a table. + */ + if (fault_status != ESR_ELx_FSC_PERM || + (logging_active && write_fault)) { + ret = kvm_mmu_topup_memory_cache(memcache, + kvm_mmu_cache_min_pages(kvm)); + if (ret) + return ret; + } + + /* * Let's check if we will get back a huge page backed by hugetlbfs, or * get block mapping for device MMIO region. */ @@ -1269,37 +1309,21 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, fault_ipa &= ~(vma_pagesize - 1); gfn = fault_ipa >> PAGE_SHIFT; - mmap_read_unlock(current->mm); + mte_allowed = kvm_vma_mte_allowed(vma); - /* - * Permission faults just need to update the existing leaf entry, - * and so normally don't require allocations from the memcache. The - * only exception to this is when dirty logging is enabled at runtime - * and a write fault needs to collapse a block entry into a table. - */ - if (fault_status != ESR_ELx_FSC_PERM || - (logging_active && write_fault)) { - ret = kvm_mmu_topup_memory_cache(memcache, - kvm_mmu_cache_min_pages(kvm)); - if (ret) - return ret; - } + /* Don't use the VMA after the unlock -- it may have vanished */ + vma = NULL; - mmu_seq = vcpu->kvm->mmu_invalidate_seq; /* - * Ensure the read of mmu_invalidate_seq happens before we call - * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk - * the page we just got a reference to gets unmapped before we have a - * chance to grab the mmu_lock, which ensure that if the page gets - * unmapped afterwards, the call to kvm_unmap_gfn will take it away - * from us again properly. This smp_rmb() interacts with the smp_wmb() - * in kvm_mmu_notifier_invalidate_<page|range_end>. + * Read mmu_invalidate_seq so that KVM can detect if the results of + * vma_lookup() or __gfn_to_pfn_memslot() become stale prior to + * acquiring kvm->mmu_lock. * - * Besides, __gfn_to_pfn_memslot() instead of gfn_to_pfn_prot() is - * used to avoid unnecessary overhead introduced to locate the memory - * slot because it's always fixed even @gfn is adjusted for huge pages. + * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs + * with the smp_wmb() in kvm_mmu_invalidate_end(). */ - smp_rmb(); + mmu_seq = vcpu->kvm->mmu_invalidate_seq; + mmap_read_unlock(current->mm); pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL, write_fault, &writable, NULL); @@ -1350,11 +1374,16 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, vma_pagesize = transparent_hugepage_adjust(kvm, memslot, hva, &pfn, &fault_ipa); + + if (vma_pagesize < 0) { + ret = vma_pagesize; + goto out_unlock; + } } if (fault_status != ESR_ELx_FSC_PERM && !device && kvm_has_mte(kvm)) { /* Check the VMM hasn't introduced a new disallowed VMA */ - if (kvm_vma_mte_allowed(vma)) { + if (mte_allowed) { sanitise_mte_tags(kvm, pfn, vma_pagesize); } else { ret = -EFAULT; diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c index cf56958b1492..6e9ece1ebbe7 100644 --- a/arch/arm64/kvm/pkvm.c +++ b/arch/arm64/kvm/pkvm.c @@ -4,6 +4,8 @@ * Author: Quentin Perret <qperret@google.com> */ +#include <linux/init.h> +#include <linux/kmemleak.h> #include <linux/kvm_host.h> #include <linux/memblock.h> #include <linux/mutex.h> @@ -13,6 +15,8 @@ #include "hyp_constants.h" +DEFINE_STATIC_KEY_FALSE(kvm_protected_mode_initialized); + static struct memblock_region *hyp_memory = kvm_nvhe_sym(hyp_memory); static unsigned int *hyp_memblock_nr_ptr = &kvm_nvhe_sym(hyp_memblock_nr); @@ -213,3 +217,46 @@ int pkvm_init_host_vm(struct kvm *host_kvm) mutex_init(&host_kvm->lock); return 0; } + +static void __init _kvm_host_prot_finalize(void *arg) +{ + int *err = arg; + + if (WARN_ON(kvm_call_hyp_nvhe(__pkvm_prot_finalize))) + WRITE_ONCE(*err, -EINVAL); +} + +static int __init pkvm_drop_host_privileges(void) +{ + int ret = 0; + + /* + * Flip the static key upfront as that may no longer be possible + * once the host stage 2 is installed. + */ + static_branch_enable(&kvm_protected_mode_initialized); + on_each_cpu(_kvm_host_prot_finalize, &ret, 1); + return ret; +} + +static int __init finalize_pkvm(void) +{ + int ret; + + if (!is_protected_kvm_enabled()) + return 0; + + /* + * Exclude HYP sections from kmemleak so that they don't get peeked + * at, which would end badly once inaccessible. + */ + kmemleak_free_part(__hyp_bss_start, __hyp_bss_end - __hyp_bss_start); + kmemleak_free_part_phys(hyp_mem_base, hyp_mem_size); + + ret = pkvm_drop_host_privileges(); + if (ret) + pr_err("Failed to finalize Hyp protection: %d\n", ret); + + return ret; +} +device_initcall_sync(finalize_pkvm); diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 24908400e190..5eca0cdd961d 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -538,7 +538,8 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) if (!kvm_pmu_is_3p5(vcpu)) val &= ~ARMV8_PMU_PMCR_LP; - __vcpu_sys_reg(vcpu, PMCR_EL0) = val; + /* The reset bits don't indicate any state, and shouldn't be saved. */ + __vcpu_sys_reg(vcpu, PMCR_EL0) = val & ~(ARMV8_PMU_PMCR_C | ARMV8_PMU_PMCR_P); if (val & ARMV8_PMU_PMCR_E) { kvm_pmu_enable_counter_mask(vcpu, @@ -557,6 +558,7 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) for_each_set_bit(i, &mask, 32) kvm_pmu_set_pmc_value(kvm_vcpu_idx_to_pmc(vcpu, i), 0, true); } + kvm_vcpu_pmu_restore_guest(vcpu); } static bool kvm_pmu_counter_is_enabled(struct kvm_pmc *pmc) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 53749d3a0996..34688918c811 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -794,7 +794,6 @@ static bool access_pmcr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, if (!kvm_supports_32bit_el0()) val |= ARMV8_PMU_PMCR_LC; kvm_pmu_handle_pmcr(vcpu, val); - kvm_vcpu_pmu_restore_guest(vcpu); } else { /* PMCR.P & PMCR.C are RAZ */ val = __vcpu_sys_reg(vcpu, PMCR_EL0) @@ -856,6 +855,22 @@ static bool pmu_counter_idx_valid(struct kvm_vcpu *vcpu, u64 idx) return true; } +static int get_pmu_evcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 *val) +{ + u64 idx; + + if (r->CRn == 9 && r->CRm == 13 && r->Op2 == 0) + /* PMCCNTR_EL0 */ + idx = ARMV8_PMU_CYCLE_IDX; + else + /* PMEVCNTRn_EL0 */ + idx = ((r->CRm & 3) << 3) | (r->Op2 & 7); + + *val = kvm_pmu_get_counter_value(vcpu, idx); + return 0; +} + static bool access_pmu_evcntr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) @@ -1072,7 +1087,7 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, /* Macro to expand the PMEVCNTRn_EL0 register */ #define PMU_PMEVCNTR_EL0(n) \ { PMU_SYS_REG(SYS_PMEVCNTRn_EL0(n)), \ - .reset = reset_pmevcntr, \ + .reset = reset_pmevcntr, .get_user = get_pmu_evcntr, \ .access = access_pmu_evcntr, .reg = (PMEVCNTR0_EL0 + n), } /* Macro to expand the PMEVTYPERn_EL0 register */ @@ -1982,7 +1997,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { { PMU_SYS_REG(SYS_PMCEID1_EL0), .access = access_pmceid, .reset = NULL }, { PMU_SYS_REG(SYS_PMCCNTR_EL0), - .access = access_pmu_evcntr, .reset = reset_unknown, .reg = PMCCNTR_EL0 }, + .access = access_pmu_evcntr, .reset = reset_unknown, + .reg = PMCCNTR_EL0, .get_user = get_pmu_evcntr}, { PMU_SYS_REG(SYS_PMXEVTYPER_EL0), .access = access_pmu_evtyper, .reset = NULL }, { PMU_SYS_REG(SYS_PMXEVCNTR_EL0), |