summaryrefslogtreecommitdiff
path: root/arch/arm64/kvm/mmu.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/arm64/kvm/mmu.c')
-rw-r--r--arch/arm64/kvm/mmu.c1314
1 files changed, 979 insertions, 335 deletions
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index a3ee3b605c9b..48d7c372a4cd 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -4,18 +4,20 @@
* Author: Christoffer Dall <c.dall@virtualopensystems.com>
*/
+#include <linux/acpi.h>
#include <linux/mman.h>
#include <linux/kvm_host.h>
#include <linux/io.h>
#include <linux/hugetlb.h>
#include <linux/sched/signal.h>
#include <trace/events/kvm.h>
+#include <asm/acpi.h>
#include <asm/pgalloc.h>
#include <asm/cacheflush.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_pgtable.h>
-#include <asm/kvm_ras.h>
+#include <asm/kvm_pkvm.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
#include <asm/virt.h>
@@ -25,20 +27,31 @@
static struct kvm_pgtable *hyp_pgtable;
static DEFINE_MUTEX(kvm_hyp_pgd_mutex);
-static unsigned long hyp_idmap_start;
-static unsigned long hyp_idmap_end;
-static phys_addr_t hyp_idmap_vector;
+static unsigned long __ro_after_init hyp_idmap_start;
+static unsigned long __ro_after_init hyp_idmap_end;
+static phys_addr_t __ro_after_init hyp_idmap_vector;
-static unsigned long io_map_base;
+u32 __ro_after_init __hyp_va_bits;
-static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end)
+static unsigned long __ro_after_init io_map_base;
+
+#define KVM_PGT_FN(fn) (!is_protected_kvm_enabled() ? fn : p ## fn)
+
+static phys_addr_t __stage2_range_addr_end(phys_addr_t addr, phys_addr_t end,
+ phys_addr_t size)
{
- phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL);
phys_addr_t boundary = ALIGN_DOWN(addr + size, size);
return (boundary - 1 < end - 1) ? boundary : end;
}
+static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end)
+{
+ phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL);
+
+ return __stage2_range_addr_end(addr, end, size);
+}
+
/*
* Release kvm_mmu_lock periodically if the memory region is large. Otherwise,
* we may see kernel panics with CONFIG_DETECT_HUNG_TASK,
@@ -46,16 +59,17 @@ static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end)
* long will also starve other vCPUs. We have to also make sure that the page
* tables are not freed while we released the lock.
*/
-static int stage2_apply_range(struct kvm *kvm, phys_addr_t addr,
+static int stage2_apply_range(struct kvm_s2_mmu *mmu, phys_addr_t addr,
phys_addr_t end,
int (*fn)(struct kvm_pgtable *, u64, u64),
bool resched)
{
+ struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
int ret;
u64 next;
do {
- struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
+ struct kvm_pgtable *pgt = mmu->pgt;
if (!pgt)
return -EINVAL;
@@ -71,8 +85,81 @@ static int stage2_apply_range(struct kvm *kvm, phys_addr_t addr,
return ret;
}
-#define stage2_apply_range_resched(kvm, addr, end, fn) \
- stage2_apply_range(kvm, addr, end, fn, true)
+#define stage2_apply_range_resched(mmu, addr, end, fn) \
+ stage2_apply_range(mmu, addr, end, fn, true)
+
+/*
+ * Get the maximum number of page-tables pages needed to split a range
+ * of blocks into PAGE_SIZE PTEs. It assumes the range is already
+ * mapped at level 2, or at level 1 if allowed.
+ */
+static int kvm_mmu_split_nr_page_tables(u64 range)
+{
+ int n = 0;
+
+ if (KVM_PGTABLE_MIN_BLOCK_LEVEL < 2)
+ n += DIV_ROUND_UP(range, PUD_SIZE);
+ n += DIV_ROUND_UP(range, PMD_SIZE);
+ return n;
+}
+
+static bool need_split_memcache_topup_or_resched(struct kvm *kvm)
+{
+ struct kvm_mmu_memory_cache *cache;
+ u64 chunk_size, min;
+
+ if (need_resched() || rwlock_needbreak(&kvm->mmu_lock))
+ return true;
+
+ chunk_size = kvm->arch.mmu.split_page_chunk_size;
+ min = kvm_mmu_split_nr_page_tables(chunk_size);
+ cache = &kvm->arch.mmu.split_page_cache;
+ return kvm_mmu_memory_cache_nr_free_objects(cache) < min;
+}
+
+static int kvm_mmu_split_huge_pages(struct kvm *kvm, phys_addr_t addr,
+ phys_addr_t end)
+{
+ struct kvm_mmu_memory_cache *cache;
+ struct kvm_pgtable *pgt;
+ int ret, cache_capacity;
+ u64 next, chunk_size;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ chunk_size = kvm->arch.mmu.split_page_chunk_size;
+ cache_capacity = kvm_mmu_split_nr_page_tables(chunk_size);
+
+ if (chunk_size == 0)
+ return 0;
+
+ cache = &kvm->arch.mmu.split_page_cache;
+
+ do {
+ if (need_split_memcache_topup_or_resched(kvm)) {
+ write_unlock(&kvm->mmu_lock);
+ cond_resched();
+ /* Eager page splitting is best-effort. */
+ ret = __kvm_mmu_topup_memory_cache(cache,
+ cache_capacity,
+ cache_capacity);
+ write_lock(&kvm->mmu_lock);
+ if (ret)
+ break;
+ }
+
+ pgt = kvm->arch.mmu.pgt;
+ if (!pgt)
+ return -EINVAL;
+
+ next = __stage2_range_addr_end(addr, end, chunk_size);
+ ret = KVM_PGT_FN(kvm_pgtable_stage2_split)(pgt, addr, next - addr, cache);
+ if (ret)
+ break;
+ } while (addr = next, addr != end);
+
+ return ret;
+}
static bool memslot_is_logging(struct kvm_memory_slot *memslot)
{
@@ -80,20 +167,31 @@ static bool memslot_is_logging(struct kvm_memory_slot *memslot)
}
/**
- * kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8
+ * kvm_arch_flush_remote_tlbs() - flush all VM TLB entries for v7/8
* @kvm: pointer to kvm structure.
*
* Interface to HYP function to flush all VM TLB entries
*/
-void kvm_flush_remote_tlbs(struct kvm *kvm)
+int kvm_arch_flush_remote_tlbs(struct kvm *kvm)
{
- ++kvm->stat.generic.remote_tlb_flush_requests;
- kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu);
+ if (is_protected_kvm_enabled())
+ kvm_call_hyp_nvhe(__pkvm_tlb_flush_vmid, kvm->arch.pkvm.handle);
+ else
+ kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu);
+ return 0;
}
-static bool kvm_is_device_pfn(unsigned long pfn)
+int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm,
+ gfn_t gfn, u64 nr_pages)
{
- return !pfn_is_map_memory(pfn);
+ u64 size = nr_pages << PAGE_SHIFT;
+ u64 addr = gfn << PAGE_SHIFT;
+
+ if (is_protected_kvm_enabled())
+ kvm_call_hyp_nvhe(__pkvm_tlb_flush_vmid, kvm->arch.pkvm.handle);
+ else
+ kvm_tlb_flush_vmid_range(&kvm->arch.mmu, addr, size);
+ return 0;
}
static void *stage2_memcache_zalloc_page(void *arg)
@@ -130,21 +228,21 @@ static void kvm_s2_free_pages_exact(void *virt, size_t size)
static struct kvm_pgtable_mm_ops kvm_s2_mm_ops;
-static void stage2_free_removed_table_rcu_cb(struct rcu_head *head)
+static void stage2_free_unlinked_table_rcu_cb(struct rcu_head *head)
{
struct page *page = container_of(head, struct page, rcu_head);
void *pgtable = page_to_virt(page);
- u32 level = page_private(page);
+ s8 level = page_private(page);
- kvm_pgtable_stage2_free_removed(&kvm_s2_mm_ops, pgtable, level);
+ KVM_PGT_FN(kvm_pgtable_stage2_free_unlinked)(&kvm_s2_mm_ops, pgtable, level);
}
-static void stage2_free_removed_table(void *addr, u32 level)
+static void stage2_free_unlinked_table(void *addr, s8 level)
{
struct page *page = virt_to_page(addr);
set_page_private(page, (unsigned long)level);
- call_rcu(&page->rcu_head, stage2_free_removed_table_rcu_cb);
+ call_rcu(&page->rcu_head, stage2_free_unlinked_table_rcu_cb);
}
static void kvm_host_get_page(void *addr)
@@ -216,7 +314,7 @@ static void invalidate_icache_guest_page(void *va, size_t size)
* does.
*/
/**
- * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
+ * __unmap_stage2_range -- Clear stage2 page table entries to unmap a range
* @mmu: The KVM stage-2 MMU pointer
* @start: The intermediate physical base address of the range to unmap
* @size: The size of the area to unmap
@@ -235,13 +333,19 @@ static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64
lockdep_assert_held_write(&kvm->mmu_lock);
WARN_ON(size & ~PAGE_MASK);
- WARN_ON(stage2_apply_range(kvm, start, end, kvm_pgtable_stage2_unmap,
+ WARN_ON(stage2_apply_range(mmu, start, end, KVM_PGT_FN(kvm_pgtable_stage2_unmap),
may_block));
}
-static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size)
+void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start,
+ u64 size, bool may_block)
{
- __unmap_stage2_range(mmu, start, size, true);
+ __unmap_stage2_range(mmu, start, size, may_block);
+}
+
+void kvm_stage2_flush_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
+{
+ stage2_apply_range_resched(mmu, addr, end, KVM_PGT_FN(kvm_pgtable_stage2_flush));
}
static void stage2_flush_memslot(struct kvm *kvm,
@@ -250,7 +354,7 @@ static void stage2_flush_memslot(struct kvm *kvm,
phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
- stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_flush);
+ kvm_stage2_flush_range(&kvm->arch.mmu, addr, end);
}
/**
@@ -273,6 +377,8 @@ static void stage2_flush_vm(struct kvm *kvm)
kvm_for_each_memslot(memslot, bkt, slots)
stage2_flush_memslot(kvm, memslot);
+ kvm_nested_s2_flush(kvm);
+
write_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, idx);
}
@@ -280,7 +386,7 @@ static void stage2_flush_vm(struct kvm *kvm)
/**
* free_hyp_pgds - free Hyp-mode page tables
*/
-void free_hyp_pgds(void)
+void __init free_hyp_pgds(void)
{
mutex_lock(&kvm_hyp_pgd_mutex);
if (hyp_pgtable) {
@@ -511,6 +617,25 @@ int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
return 0;
}
+static int __hyp_alloc_private_va_range(unsigned long base)
+{
+ lockdep_assert_held(&kvm_hyp_pgd_mutex);
+
+ if (!PAGE_ALIGNED(base))
+ return -EINVAL;
+
+ /*
+ * Verify that BIT(VA_BITS - 1) hasn't been flipped by
+ * allocating the new area, as it would indicate we've
+ * overflowed the idmap/IO address range.
+ */
+ if ((base ^ io_map_base) & BIT(VA_BITS - 1))
+ return -ENOMEM;
+
+ io_map_base = base;
+
+ return 0;
+}
/**
* hyp_alloc_private_va_range - Allocates a private VA range.
@@ -531,29 +656,22 @@ int hyp_alloc_private_va_range(size_t size, unsigned long *haddr)
/*
* This assumes that we have enough space below the idmap
- * page to allocate our VAs. If not, the check below will
- * kick. A potential alternative would be to detect that
- * overflow and switch to an allocation above the idmap.
+ * page to allocate our VAs. If not, the check in
+ * __hyp_alloc_private_va_range() will kick. A potential
+ * alternative would be to detect that overflow and switch
+ * to an allocation above the idmap.
*
* The allocated size is always a multiple of PAGE_SIZE.
*/
- base = io_map_base - PAGE_ALIGN(size);
-
- /* Align the allocation based on the order of its size */
- base = ALIGN_DOWN(base, PAGE_SIZE << get_order(size));
-
- /*
- * Verify that BIT(VA_BITS - 1) hasn't been flipped by
- * allocating the new area, as it would indicate we've
- * overflowed the idmap/IO address range.
- */
- if ((base ^ io_map_base) & BIT(VA_BITS - 1))
- ret = -ENOMEM;
- else
- *haddr = io_map_base = base;
+ size = PAGE_ALIGN(size);
+ base = io_map_base - size;
+ ret = __hyp_alloc_private_va_range(base);
mutex_unlock(&kvm_hyp_pgd_mutex);
+ if (!ret)
+ *haddr = base;
+
return ret;
}
@@ -587,6 +705,48 @@ static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
return ret;
}
+int create_hyp_stack(phys_addr_t phys_addr, unsigned long *haddr)
+{
+ unsigned long base;
+ size_t size;
+ int ret;
+
+ mutex_lock(&kvm_hyp_pgd_mutex);
+ /*
+ * Efficient stack verification using the NVHE_STACK_SHIFT bit implies
+ * an alignment of our allocation on the order of the size.
+ */
+ size = NVHE_STACK_SIZE * 2;
+ base = ALIGN_DOWN(io_map_base - size, size);
+
+ ret = __hyp_alloc_private_va_range(base);
+
+ mutex_unlock(&kvm_hyp_pgd_mutex);
+
+ if (ret) {
+ kvm_err("Cannot allocate hyp stack guard page\n");
+ return ret;
+ }
+
+ /*
+ * Since the stack grows downwards, map the stack to the page
+ * at the higher address and leave the lower guard page
+ * unbacked.
+ *
+ * Any valid stack address now has the NVHE_STACK_SHIFT bit as 1
+ * and addresses corresponding to the guard page have the
+ * NVHE_STACK_SHIFT bit as 0 - this is used for overflow detection.
+ */
+ ret = __create_hyp_mappings(base + NVHE_STACK_SIZE, NVHE_STACK_SIZE,
+ phys_addr, PAGE_HYP);
+ if (ret)
+ kvm_err("Cannot map hyp stack\n");
+
+ *haddr = base + size;
+
+ return ret;
+}
+
/**
* create_hyp_io_mappings - Map IO into both kernel and HYP
* @phys_addr: The physical start address which gets mapped
@@ -661,18 +821,39 @@ static int get_user_mapping_size(struct kvm *kvm, u64 addr)
struct kvm_pgtable pgt = {
.pgd = (kvm_pteref_t)kvm->mm->pgd,
.ia_bits = vabits_actual,
- .start_level = (KVM_PGTABLE_MAX_LEVELS -
- CONFIG_PGTABLE_LEVELS),
+ .start_level = (KVM_PGTABLE_LAST_LEVEL -
+ ARM64_HW_PGTABLE_LEVELS(pgt.ia_bits) + 1),
.mm_ops = &kvm_user_mm_ops,
};
+ unsigned long flags;
kvm_pte_t pte = 0; /* Keep GCC quiet... */
- u32 level = ~0;
+ s8 level = S8_MAX;
int ret;
+ /*
+ * Disable IRQs so that we hazard against a concurrent
+ * teardown of the userspace page tables (which relies on
+ * IPI-ing threads).
+ */
+ local_irq_save(flags);
ret = kvm_pgtable_get_leaf(&pgt, addr, &pte, &level);
- VM_BUG_ON(ret);
- VM_BUG_ON(level >= KVM_PGTABLE_MAX_LEVELS);
- VM_BUG_ON(!(pte & PTE_VALID));
+ local_irq_restore(flags);
+
+ if (ret)
+ return ret;
+
+ /*
+ * Not seeing an error, but not updating level? Something went
+ * deeply wrong...
+ */
+ if (WARN_ON(level > KVM_PGTABLE_LAST_LEVEL))
+ return -EFAULT;
+ if (WARN_ON(level < KVM_PGTABLE_FIRST_LEVEL))
+ return -EFAULT;
+
+ /* Oops, the userspace PTs are gone... Replay the fault */
+ if (!kvm_pte_valid(pte))
+ return -EAGAIN;
return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(level));
}
@@ -681,7 +862,7 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
.zalloc_page = stage2_memcache_zalloc_page,
.zalloc_pages_exact = kvm_s2_zalloc_pages_exact,
.free_pages_exact = kvm_s2_free_pages_exact,
- .free_removed_table = stage2_free_removed_table,
+ .free_unlinked_table = stage2_free_unlinked_table,
.get_page = kvm_host_get_page,
.put_page = kvm_s2_put_page,
.page_count = kvm_host_page_count,
@@ -691,21 +872,9 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
.icache_inval_pou = invalidate_icache_guest_page,
};
-/**
- * kvm_init_stage2_mmu - Initialise a S2 MMU structure
- * @kvm: The pointer to the KVM structure
- * @mmu: The pointer to the s2 MMU structure
- * @type: The machine type of the virtual machine
- *
- * Allocates only the stage-2 HW PGD level table(s).
- * Note we don't need locking here as this is only called when the VM is
- * created, which can only be done once.
- */
-int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type)
+static int kvm_init_ipa_range(struct kvm_s2_mmu *mmu, unsigned long type)
{
u32 kvm_ipa_limit = get_kvm_ipa_limit();
- int cpu, err;
- struct kvm_pgtable *pgt;
u64 mmfr0, mmfr1;
u32 phys_shift;
@@ -730,22 +899,98 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t
mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
- kvm->arch.vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift);
+ mmu->vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift);
+
+ return 0;
+}
+
+/*
+ * Assume that @pgt is valid and unlinked from the KVM MMU to free the
+ * page-table without taking the kvm_mmu_lock and without performing any
+ * TLB invalidations.
+ *
+ * Also, the range of addresses can be large enough to cause need_resched
+ * warnings, for instance on CONFIG_PREEMPT_NONE kernels. Hence, invoke
+ * cond_resched() periodically to prevent hogging the CPU for a long time
+ * and schedule something else, if required.
+ */
+static void stage2_destroy_range(struct kvm_pgtable *pgt, phys_addr_t addr,
+ phys_addr_t end)
+{
+ u64 next;
+ do {
+ next = stage2_range_addr_end(addr, end);
+ KVM_PGT_FN(kvm_pgtable_stage2_destroy_range)(pgt, addr,
+ next - addr);
+ if (next != end)
+ cond_resched();
+ } while (addr = next, addr != end);
+}
+
+static void kvm_stage2_destroy(struct kvm_pgtable *pgt)
+{
+ unsigned int ia_bits = VTCR_EL2_IPA(pgt->mmu->vtcr);
+
+ stage2_destroy_range(pgt, 0, BIT(ia_bits));
+ KVM_PGT_FN(kvm_pgtable_stage2_destroy_pgd)(pgt);
+}
+
+/**
+ * kvm_init_stage2_mmu - Initialise a S2 MMU structure
+ * @kvm: The pointer to the KVM structure
+ * @mmu: The pointer to the s2 MMU structure
+ * @type: The machine type of the virtual machine
+ *
+ * Allocates only the stage-2 HW PGD level table(s).
+ * Note we don't need locking here as this is only called in two cases:
+ *
+ * - when the VM is created, which can't race against anything
+ *
+ * - when secondary kvm_s2_mmu structures are initialised for NV
+ * guests, and the caller must hold kvm->lock as this is called on a
+ * per-vcpu basis.
+ */
+int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type)
+{
+ int cpu, err;
+ struct kvm_pgtable *pgt;
+
+ /*
+ * If we already have our page tables in place, and that the
+ * MMU context is the canonical one, we have a bug somewhere,
+ * as this is only supposed to ever happen once per VM.
+ *
+ * Otherwise, we're building nested page tables, and that's
+ * probably because userspace called KVM_ARM_VCPU_INIT more
+ * than once on the same vcpu. Since that's actually legal,
+ * don't kick a fuss and leave gracefully.
+ */
if (mmu->pgt != NULL) {
+ if (kvm_is_nested_s2_mmu(kvm, mmu))
+ return 0;
+
kvm_err("kvm_arch already initialized?\n");
return -EINVAL;
}
+ err = kvm_init_ipa_range(mmu, type);
+ if (err)
+ return err;
+
pgt = kzalloc(sizeof(*pgt), GFP_KERNEL_ACCOUNT);
if (!pgt)
return -ENOMEM;
mmu->arch = &kvm->arch;
- err = kvm_pgtable_stage2_init(pgt, mmu, &kvm_s2_mm_ops);
+ err = KVM_PGT_FN(kvm_pgtable_stage2_init)(pgt, mmu, &kvm_s2_mm_ops);
if (err)
goto out_free_pgtable;
+ mmu->pgt = pgt;
+ if (is_protected_kvm_enabled())
+ return 0;
+
mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran));
if (!mmu->last_vcpu_ran) {
err = -ENOMEM;
@@ -755,17 +1000,30 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t
for_each_possible_cpu(cpu)
*per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1;
- mmu->pgt = pgt;
+ /* The eager page splitting is disabled by default */
+ mmu->split_page_chunk_size = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT;
+ mmu->split_page_cache.gfp_zero = __GFP_ZERO;
+
mmu->pgd_phys = __pa(pgt->pgd);
+
+ if (kvm_is_nested_s2_mmu(kvm, mmu))
+ kvm_init_nested_s2_mmu(mmu);
+
return 0;
out_destroy_pgtable:
- kvm_pgtable_stage2_destroy(pgt);
+ kvm_stage2_destroy(pgt);
out_free_pgtable:
kfree(pgt);
return err;
}
+void kvm_uninit_stage2_mmu(struct kvm *kvm)
+{
+ kvm_free_stage2_pgd(&kvm->arch.mmu);
+ kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache);
+}
+
static void stage2_unmap_memslot(struct kvm *kvm,
struct kvm_memory_slot *memslot)
{
@@ -802,7 +1060,7 @@ static void stage2_unmap_memslot(struct kvm *kvm,
if (!(vma->vm_flags & VM_PFNMAP)) {
gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
- unmap_stage2_range(&kvm->arch.mmu, gpa, vm_end - vm_start);
+ kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, vm_end - vm_start, true);
}
hva = vm_end;
} while (hva < reg_end);
@@ -829,6 +1087,8 @@ void stage2_unmap_vm(struct kvm *kvm)
kvm_for_each_memslot(memslot, bkt, slots)
stage2_unmap_memslot(kvm, memslot);
+ kvm_nested_s2_unmap(kvm, true);
+
write_unlock(&kvm->mmu_lock);
mmap_read_unlock(current->mm);
srcu_read_unlock(&kvm->srcu, idx);
@@ -846,29 +1106,47 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
mmu->pgt = NULL;
free_percpu(mmu->last_vcpu_ran);
}
+
+ if (kvm_is_nested_s2_mmu(kvm, mmu))
+ kvm_init_nested_s2_mmu(mmu);
+
write_unlock(&kvm->mmu_lock);
if (pgt) {
- kvm_pgtable_stage2_destroy(pgt);
+ kvm_stage2_destroy(pgt);
kfree(pgt);
}
}
-static void hyp_mc_free_fn(void *addr, void *unused)
+static void hyp_mc_free_fn(void *addr, void *mc)
{
+ struct kvm_hyp_memcache *memcache = mc;
+
+ if (memcache->flags & HYP_MEMCACHE_ACCOUNT_STAGE2)
+ kvm_account_pgtable_pages(addr, -1);
+
free_page((unsigned long)addr);
}
-static void *hyp_mc_alloc_fn(void *unused)
+static void *hyp_mc_alloc_fn(void *mc)
{
- return (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
+ struct kvm_hyp_memcache *memcache = mc;
+ void *addr;
+
+ addr = (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
+ if (addr && memcache->flags & HYP_MEMCACHE_ACCOUNT_STAGE2)
+ kvm_account_pgtable_pages(addr, 1);
+
+ return addr;
}
void free_hyp_memcache(struct kvm_hyp_memcache *mc)
{
- if (is_protected_kvm_enabled())
- __free_hyp_memcache(mc, hyp_mc_free_fn,
- kvm_host_va, NULL);
+ if (!is_protected_kvm_enabled())
+ return;
+
+ kfree(mc->mapping);
+ __free_hyp_memcache(mc, hyp_mc_free_fn, kvm_host_va, mc);
}
int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages)
@@ -876,8 +1154,14 @@ int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages)
if (!is_protected_kvm_enabled())
return 0;
+ if (!mc->mapping) {
+ mc->mapping = kzalloc(sizeof(struct pkvm_mapping), GFP_KERNEL_ACCOUNT);
+ if (!mc->mapping)
+ return -ENOMEM;
+ }
+
return __topup_hyp_memcache(mc, min_pages, hyp_mc_alloc_fn,
- kvm_host_pa, NULL);
+ kvm_host_pa, mc);
}
/**
@@ -895,7 +1179,8 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
phys_addr_t addr;
int ret = 0;
struct kvm_mmu_memory_cache cache = { .gfp_zero = __GFP_ZERO };
- struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
+ struct kvm_s2_mmu *mmu = &kvm->arch.mmu;
+ struct kvm_pgtable *pgt = mmu->pgt;
enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE |
KVM_PGTABLE_PROT_R |
(writable ? KVM_PGTABLE_PROT_W : 0);
@@ -908,13 +1193,13 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) {
ret = kvm_mmu_topup_memory_cache(&cache,
- kvm_mmu_cache_min_pages(kvm));
+ kvm_mmu_cache_min_pages(mmu));
if (ret)
break;
write_lock(&kvm->mmu_lock);
- ret = kvm_pgtable_stage2_map(pgt, addr, PAGE_SIZE, pa, prot,
- &cache, 0);
+ ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, addr, PAGE_SIZE,
+ pa, prot, &cache, 0);
write_unlock(&kvm->mmu_lock);
if (ret)
break;
@@ -927,15 +1212,14 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
}
/**
- * stage2_wp_range() - write protect stage2 memory region range
+ * kvm_stage2_wp_range() - write protect stage2 memory region range
* @mmu: The KVM stage-2 MMU pointer
* @addr: Start address of range
* @end: End address of range
*/
-static void stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
+void kvm_stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
{
- struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
- stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_wrprotect);
+ stage2_apply_range_resched(mmu, addr, end, KVM_PGT_FN(kvm_pgtable_stage2_wrprotect));
}
/**
@@ -964,45 +1248,75 @@ static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
write_lock(&kvm->mmu_lock);
- stage2_wp_range(&kvm->arch.mmu, start, end);
+ kvm_stage2_wp_range(&kvm->arch.mmu, start, end);
+ kvm_nested_s2_wp(kvm);
write_unlock(&kvm->mmu_lock);
- kvm_flush_remote_tlbs(kvm);
+ kvm_flush_remote_tlbs_memslot(kvm, memslot);
}
/**
- * kvm_mmu_write_protect_pt_masked() - write protect dirty pages
+ * kvm_mmu_split_memory_region() - split the stage 2 blocks into PAGE_SIZE
+ * pages for memory slot
* @kvm: The KVM pointer
- * @slot: The memory slot associated with mask
- * @gfn_offset: The gfn offset in memory slot
- * @mask: The mask of dirty pages at offset 'gfn_offset' in this memory
- * slot to be write protected
+ * @slot: The memory slot to split
*
- * Walks bits set in mask write protects the associated pte's. Caller must
- * acquire kvm_mmu_lock.
+ * Acquires kvm->mmu_lock. Called with kvm->slots_lock mutex acquired,
+ * serializing operations for VM memory regions.
*/
-static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
- struct kvm_memory_slot *slot,
- gfn_t gfn_offset, unsigned long mask)
+static void kvm_mmu_split_memory_region(struct kvm *kvm, int slot)
{
- phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
- phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT;
- phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *memslot;
+ phys_addr_t start, end;
+
+ lockdep_assert_held(&kvm->slots_lock);
- stage2_wp_range(&kvm->arch.mmu, start, end);
+ slots = kvm_memslots(kvm);
+ memslot = id_to_memslot(slots, slot);
+
+ start = memslot->base_gfn << PAGE_SHIFT;
+ end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
+
+ write_lock(&kvm->mmu_lock);
+ kvm_mmu_split_huge_pages(kvm, start, end);
+ write_unlock(&kvm->mmu_lock);
}
/*
- * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
- * dirty pages.
+ * kvm_arch_mmu_enable_log_dirty_pt_masked() - enable dirty logging for selected pages.
+ * @kvm: The KVM pointer
+ * @slot: The memory slot associated with mask
+ * @gfn_offset: The gfn offset in memory slot
+ * @mask: The mask of pages at offset 'gfn_offset' in this memory
+ * slot to enable dirty logging on
*
- * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
- * enable dirty logging for them.
+ * Writes protect selected pages to enable dirty logging, and then
+ * splits them to PAGE_SIZE. Caller must acquire kvm->mmu_lock.
*/
void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn_offset, unsigned long mask)
{
- kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
+ phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
+ phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT;
+ phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ kvm_stage2_wp_range(&kvm->arch.mmu, start, end);
+
+ /*
+ * Eager-splitting is done when manual-protect is set. We
+ * also check for initially-all-set because we can avoid
+ * eager-splitting if initially-all-set is false.
+ * Initially-all-set equal false implies that huge-pages were
+ * already split when enabling dirty logging: no need to do it
+ * again.
+ */
+ if (kvm_dirty_log_manual_protect_and_init_set(kvm))
+ kvm_mmu_split_huge_pages(kvm, start, end);
+
+ kvm_nested_s2_wp(kvm);
}
static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
@@ -1022,6 +1336,10 @@ static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
if (map_size == PAGE_SIZE)
return true;
+ /* pKVM only supports PMD_SIZE huge-mappings */
+ if (is_protected_kvm_enabled() && map_size != PMD_SIZE)
+ return false;
+
size = memslot->npages * PAGE_SIZE;
gpa_start = memslot->base_gfn << PAGE_SHIFT;
@@ -1079,7 +1397,7 @@ static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
*
* Returns the size of the mapping.
*/
-static unsigned long
+static long
transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot,
unsigned long hva, kvm_pfn_t *pfnp,
phys_addr_t *ipap)
@@ -1091,30 +1409,17 @@ transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot,
* sure that the HVA and IPA are sufficiently aligned and that the
* block map is contained within the memslot.
*/
- if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE) &&
- get_user_mapping_size(kvm, hva) >= PMD_SIZE) {
- /*
- * The address we faulted on is backed by a transparent huge
- * page. However, because we map the compound huge page and
- * not the individual tail page, we need to transfer the
- * refcount to the head page. We have to be careful that the
- * THP doesn't start to split while we are adjusting the
- * refcounts.
- *
- * We are sure this doesn't happen, because mmu_invalidate_retry
- * was successful and we are holding the mmu_lock, so if this
- * THP is trying to split, it will be blocked in the mmu
- * notifier before touching any of the pages, specifically
- * before being able to call __split_huge_page_refcount().
- *
- * We can therefore safely transfer the refcount from PG_tail
- * to PG_head and switch the pfn from a tail page to the head
- * page accordingly.
- */
+ if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) {
+ int sz = get_user_mapping_size(kvm, hva);
+
+ if (sz < 0)
+ return sz;
+
+ if (sz < PMD_SIZE)
+ return PAGE_SIZE;
+
*ipap &= PMD_MASK;
- kvm_release_pfn_clean(pfn);
pfn &= ~(PTRS_PER_PMD - 1);
- get_page(pfn_to_page(pfn));
*pfnp = pfn;
return PMD_SIZE;
@@ -1158,21 +1463,29 @@ static int get_vma_page_shift(struct vm_area_struct *vma, unsigned long hva)
* able to see the page's tags and therefore they must be initialised first. If
* PG_mte_tagged is set, tags have already been initialised.
*
- * The race in the test/set of the PG_mte_tagged flag is handled by:
- * - preventing VM_SHARED mappings in a memslot with MTE preventing two VMs
- * racing to santise the same page
- * - mmap_lock protects between a VM faulting a page in and the VMM performing
- * an mprotect() to add VM_MTE
+ * Must be called with kvm->mmu_lock held to ensure the memory remains mapped
+ * while the tags are zeroed.
*/
static void sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn,
unsigned long size)
{
unsigned long i, nr_pages = size >> PAGE_SHIFT;
struct page *page = pfn_to_page(pfn);
+ struct folio *folio = page_folio(page);
if (!kvm_has_mte(kvm))
return;
+ if (folio_test_hugetlb(folio)) {
+ /* Hugetlb has MTE flags set on head page only */
+ if (folio_try_hugetlb_mte_tagging(folio)) {
+ for (i = 0; i < nr_pages; i++, page++)
+ mte_clear_page_tags(page_address(page));
+ folio_set_hugetlb_mte_tagged(folio);
+ }
+ return;
+ }
+
for (i = 0; i < nr_pages; i++, page++) {
if (try_page_mte_tagging(page)) {
mte_clear_page_tags(page_address(page));
@@ -1186,36 +1499,190 @@ static bool kvm_vma_mte_allowed(struct vm_area_struct *vma)
return vma->vm_flags & VM_MTE_ALLOWED;
}
+static bool kvm_vma_is_cacheable(struct vm_area_struct *vma)
+{
+ switch (FIELD_GET(PTE_ATTRINDX_MASK, pgprot_val(vma->vm_page_prot))) {
+ case MT_NORMAL_NC:
+ case MT_DEVICE_nGnRnE:
+ case MT_DEVICE_nGnRE:
+ return false;
+ default:
+ return true;
+ }
+}
+
+static int prepare_mmu_memcache(struct kvm_vcpu *vcpu, bool topup_memcache,
+ void **memcache)
+{
+ int min_pages;
+
+ if (!is_protected_kvm_enabled())
+ *memcache = &vcpu->arch.mmu_page_cache;
+ else
+ *memcache = &vcpu->arch.pkvm_memcache;
+
+ if (!topup_memcache)
+ return 0;
+
+ min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu);
+
+ if (!is_protected_kvm_enabled())
+ return kvm_mmu_topup_memory_cache(*memcache, min_pages);
+
+ return topup_hyp_memcache(*memcache, min_pages);
+}
+
+/*
+ * Potentially reduce shadow S2 permissions to match the guest's own S2. For
+ * exec faults, we'd only reach this point if the guest actually allowed it (see
+ * kvm_s2_handle_perm_fault).
+ *
+ * Also encode the level of the original translation in the SW bits of the leaf
+ * entry as a proxy for the span of that translation. This will be retrieved on
+ * TLB invalidation from the guest and used to limit the invalidation scope if a
+ * TTL hint or a range isn't provided.
+ */
+static void adjust_nested_fault_perms(struct kvm_s2_trans *nested,
+ enum kvm_pgtable_prot *prot,
+ bool *writable)
+{
+ *writable &= kvm_s2_trans_writable(nested);
+ if (!kvm_s2_trans_readable(nested))
+ *prot &= ~KVM_PGTABLE_PROT_R;
+
+ *prot |= kvm_encode_nested_level(nested);
+}
+
+static void adjust_nested_exec_perms(struct kvm *kvm,
+ struct kvm_s2_trans *nested,
+ enum kvm_pgtable_prot *prot)
+{
+ if (!kvm_s2_trans_exec_el0(kvm, nested))
+ *prot &= ~KVM_PGTABLE_PROT_UX;
+ if (!kvm_s2_trans_exec_el1(kvm, nested))
+ *prot &= ~KVM_PGTABLE_PROT_PX;
+}
+
+#define KVM_PGTABLE_WALK_MEMABORT_FLAGS (KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED)
+
+static int gmem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
+ struct kvm_s2_trans *nested,
+ struct kvm_memory_slot *memslot, bool is_perm)
+{
+ bool write_fault, exec_fault, writable;
+ enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_MEMABORT_FLAGS;
+ enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
+ struct kvm_pgtable *pgt = vcpu->arch.hw_mmu->pgt;
+ unsigned long mmu_seq;
+ struct page *page;
+ struct kvm *kvm = vcpu->kvm;
+ void *memcache;
+ kvm_pfn_t pfn;
+ gfn_t gfn;
+ int ret;
+
+ ret = prepare_mmu_memcache(vcpu, true, &memcache);
+ if (ret)
+ return ret;
+
+ if (nested)
+ gfn = kvm_s2_trans_output(nested) >> PAGE_SHIFT;
+ else
+ gfn = fault_ipa >> PAGE_SHIFT;
+
+ write_fault = kvm_is_write_fault(vcpu);
+ exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
+
+ VM_WARN_ON_ONCE(write_fault && exec_fault);
+
+ mmu_seq = kvm->mmu_invalidate_seq;
+ /* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */
+ smp_rmb();
+
+ ret = kvm_gmem_get_pfn(kvm, memslot, gfn, &pfn, &page, NULL);
+ if (ret) {
+ kvm_prepare_memory_fault_exit(vcpu, fault_ipa, PAGE_SIZE,
+ write_fault, exec_fault, false);
+ return ret;
+ }
+
+ writable = !(memslot->flags & KVM_MEM_READONLY);
+
+ if (nested)
+ adjust_nested_fault_perms(nested, &prot, &writable);
+
+ if (writable)
+ prot |= KVM_PGTABLE_PROT_W;
+
+ if (exec_fault || cpus_have_final_cap(ARM64_HAS_CACHE_DIC))
+ prot |= KVM_PGTABLE_PROT_X;
+
+ if (nested)
+ adjust_nested_exec_perms(kvm, nested, &prot);
+
+ kvm_fault_lock(kvm);
+ if (mmu_invalidate_retry(kvm, mmu_seq)) {
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
+
+ ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, PAGE_SIZE,
+ __pfn_to_phys(pfn), prot,
+ memcache, flags);
+
+out_unlock:
+ kvm_release_faultin_page(kvm, page, !!ret, writable);
+ kvm_fault_unlock(kvm);
+
+ if (writable && !ret)
+ mark_page_dirty_in_slot(kvm, memslot, gfn);
+
+ return ret != -EAGAIN ? ret : 0;
+}
+
static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
+ struct kvm_s2_trans *nested,
struct kvm_memory_slot *memslot, unsigned long hva,
- unsigned long fault_status)
+ bool fault_is_perm)
{
int ret = 0;
- bool write_fault, writable, force_pte = false;
- bool exec_fault;
- bool device = false;
+ bool topup_memcache;
+ bool write_fault, writable;
+ bool exec_fault, mte_allowed, is_vma_cacheable;
+ bool s2_force_noncacheable = false, vfio_allow_any_uc = false;
unsigned long mmu_seq;
+ phys_addr_t ipa = fault_ipa;
struct kvm *kvm = vcpu->kvm;
- struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
struct vm_area_struct *vma;
short vma_shift;
+ void *memcache;
gfn_t gfn;
kvm_pfn_t pfn;
bool logging_active = memslot_is_logging(memslot);
- unsigned long fault_level = kvm_vcpu_trap_get_fault_level(vcpu);
- unsigned long vma_pagesize, fault_granule;
+ bool force_pte = logging_active;
+ long vma_pagesize, fault_granule;
enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
struct kvm_pgtable *pgt;
+ struct page *page;
+ vm_flags_t vm_flags;
+ enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_MEMABORT_FLAGS;
- fault_granule = 1UL << ARM64_HW_PGTABLE_LEVEL_SHIFT(fault_level);
+ if (fault_is_perm)
+ fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu);
write_fault = kvm_is_write_fault(vcpu);
exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
- VM_BUG_ON(write_fault && exec_fault);
+ VM_WARN_ON_ONCE(write_fault && exec_fault);
- if (fault_status == ESR_ELx_FSC_PERM && !write_fault && !exec_fault) {
- kvm_err("Unexpected L2 read permission error\n");
- return -EFAULT;
- }
+ /*
+ * Permission faults just need to update the existing leaf entry,
+ * and so normally don't require allocations from the memcache. The
+ * only exception to this is when dirty logging is enabled at runtime
+ * and a write fault needs to collapse a block entry into a table.
+ */
+ topup_memcache = !fault_is_perm || (logging_active && write_fault);
+ ret = prepare_mmu_memcache(vcpu, topup_memcache, &memcache);
+ if (ret)
+ return ret;
/*
* Let's check if we will get back a huge page backed by hugetlbfs, or
@@ -1229,16 +1696,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
return -EFAULT;
}
- /*
- * logging_active is guaranteed to never be true for VM_PFNMAP
- * memslots.
- */
- if (logging_active) {
- force_pte = true;
+ if (force_pte)
vma_shift = PAGE_SHIFT;
- } else {
+ else
vma_shift = get_vma_page_shift(vma, hva);
- }
switch (vma_shift) {
#ifndef __PAGETABLE_PMD_FOLDED
@@ -1265,44 +1726,69 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
}
vma_pagesize = 1UL << vma_shift;
- if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE)
- fault_ipa &= ~(vma_pagesize - 1);
- gfn = fault_ipa >> PAGE_SHIFT;
- mmap_read_unlock(current->mm);
+ if (nested) {
+ unsigned long max_map_size;
+
+ max_map_size = force_pte ? PAGE_SIZE : PUD_SIZE;
+
+ ipa = kvm_s2_trans_output(nested);
+
+ /*
+ * If we're about to create a shadow stage 2 entry, then we
+ * can only create a block mapping if the guest stage 2 page
+ * table uses at least as big a mapping.
+ */
+ max_map_size = min(kvm_s2_trans_size(nested), max_map_size);
+
+ /*
+ * Be careful that if the mapping size falls between
+ * two host sizes, take the smallest of the two.
+ */
+ if (max_map_size >= PMD_SIZE && max_map_size < PUD_SIZE)
+ max_map_size = PMD_SIZE;
+ else if (max_map_size >= PAGE_SIZE && max_map_size < PMD_SIZE)
+ max_map_size = PAGE_SIZE;
+
+ force_pte = (max_map_size == PAGE_SIZE);
+ vma_pagesize = min_t(long, vma_pagesize, max_map_size);
+ }
/*
- * Permission faults just need to update the existing leaf entry,
- * and so normally don't require allocations from the memcache. The
- * only exception to this is when dirty logging is enabled at runtime
- * and a write fault needs to collapse a block entry into a table.
+ * Both the canonical IPA and fault IPA must be hugepage-aligned to
+ * ensure we find the right PFN and lay down the mapping in the right
+ * place.
*/
- if (fault_status != ESR_ELx_FSC_PERM ||
- (logging_active && write_fault)) {
- ret = kvm_mmu_topup_memory_cache(memcache,
- kvm_mmu_cache_min_pages(kvm));
- if (ret)
- return ret;
+ if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) {
+ fault_ipa &= ~(vma_pagesize - 1);
+ ipa &= ~(vma_pagesize - 1);
}
- mmu_seq = vcpu->kvm->mmu_invalidate_seq;
+ gfn = ipa >> PAGE_SHIFT;
+ mte_allowed = kvm_vma_mte_allowed(vma);
+
+ vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
+
+ vm_flags = vma->vm_flags;
+
+ is_vma_cacheable = kvm_vma_is_cacheable(vma);
+
+ /* Don't use the VMA after the unlock -- it may have vanished */
+ vma = NULL;
+
/*
- * Ensure the read of mmu_invalidate_seq happens before we call
- * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
- * the page we just got a reference to gets unmapped before we have a
- * chance to grab the mmu_lock, which ensure that if the page gets
- * unmapped afterwards, the call to kvm_unmap_gfn will take it away
- * from us again properly. This smp_rmb() interacts with the smp_wmb()
- * in kvm_mmu_notifier_invalidate_<page|range_end>.
+ * Read mmu_invalidate_seq so that KVM can detect if the results of
+ * vma_lookup() or __kvm_faultin_pfn() become stale prior to
+ * acquiring kvm->mmu_lock.
*
- * Besides, __gfn_to_pfn_memslot() instead of gfn_to_pfn_prot() is
- * used to avoid unnecessary overhead introduced to locate the memory
- * slot because it's always fixed even @gfn is adjusted for huge pages.
+ * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
+ * with the smp_wmb() in kvm_mmu_invalidate_end().
*/
- smp_rmb();
+ mmu_seq = kvm->mmu_invalidate_seq;
+ mmap_read_unlock(current->mm);
- pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL,
- write_fault, &writable, NULL);
+ pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0,
+ &writable, &page);
if (pfn == KVM_PFN_ERR_HWPOISON) {
kvm_send_hwpoison_signal(hva, vma_shift);
return 0;
@@ -1310,18 +1796,39 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
if (is_error_noslot_pfn(pfn))
return -EFAULT;
- if (kvm_is_device_pfn(pfn)) {
- /*
- * If the page was identified as device early by looking at
- * the VMA flags, vma_pagesize is already representing the
- * largest quantity we can map. If instead it was mapped
- * via gfn_to_pfn_prot(), vma_pagesize is set to PAGE_SIZE
- * and must not be upgraded.
- *
- * In both cases, we don't let transparent_hugepage_adjust()
- * change things at the last minute.
- */
- device = true;
+ /*
+ * Check if this is non-struct page memory PFN, and cannot support
+ * CMOs. It could potentially be unsafe to access as cacheable.
+ */
+ if (vm_flags & (VM_PFNMAP | VM_MIXEDMAP) && !pfn_is_map_memory(pfn)) {
+ if (is_vma_cacheable) {
+ /*
+ * Whilst the VMA owner expects cacheable mapping to this
+ * PFN, hardware also has to support the FWB and CACHE DIC
+ * features.
+ *
+ * ARM64 KVM relies on kernel VA mapping to the PFN to
+ * perform cache maintenance as the CMO instructions work on
+ * virtual addresses. VM_PFNMAP region are not necessarily
+ * mapped to a KVA and hence the presence of hardware features
+ * S2FWB and CACHE DIC are mandatory to avoid the need for
+ * cache maintenance.
+ */
+ if (!kvm_supports_cacheable_pfnmap())
+ ret = -EFAULT;
+ } else {
+ /*
+ * If the page was identified as device early by looking at
+ * the VMA flags, vma_pagesize is already representing the
+ * largest quantity we can map. If instead it was mapped
+ * via __kvm_faultin_pfn(), vma_pagesize is set to PAGE_SIZE
+ * and must not be upgraded.
+ *
+ * In both cases, we don't let transparent_hugepage_adjust()
+ * change things at the last minute.
+ */
+ s2_force_noncacheable = true;
+ }
} else if (logging_active && !write_fault) {
/*
* Only actually map the page as writable if this was a write
@@ -1330,31 +1837,45 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
writable = false;
}
- if (exec_fault && device)
- return -ENOEXEC;
+ if (exec_fault && s2_force_noncacheable)
+ ret = -ENOEXEC;
- read_lock(&kvm->mmu_lock);
+ if (ret) {
+ kvm_release_page_unused(page);
+ return ret;
+ }
+
+ if (nested)
+ adjust_nested_fault_perms(nested, &prot, &writable);
+
+ kvm_fault_lock(kvm);
pgt = vcpu->arch.hw_mmu->pgt;
- if (mmu_invalidate_retry(kvm, mmu_seq))
+ if (mmu_invalidate_retry(kvm, mmu_seq)) {
+ ret = -EAGAIN;
goto out_unlock;
+ }
/*
* If we are not forced to use page mapping, check if we are
* backed by a THP and thus use block mapping if possible.
*/
- if (vma_pagesize == PAGE_SIZE && !(force_pte || device)) {
- if (fault_status == ESR_ELx_FSC_PERM &&
- fault_granule > PAGE_SIZE)
+ if (vma_pagesize == PAGE_SIZE && !(force_pte || s2_force_noncacheable)) {
+ if (fault_is_perm && fault_granule > PAGE_SIZE)
vma_pagesize = fault_granule;
else
vma_pagesize = transparent_hugepage_adjust(kvm, memslot,
hva, &pfn,
&fault_ipa);
+
+ if (vma_pagesize < 0) {
+ ret = vma_pagesize;
+ goto out_unlock;
+ }
}
- if (fault_status != ESR_ELx_FSC_PERM && !device && kvm_has_mte(kvm)) {
+ if (!fault_is_perm && !s2_force_noncacheable && kvm_has_mte(kvm)) {
/* Check the VMM hasn't introduced a new disallowed VMA */
- if (kvm_vma_mte_allowed(vma)) {
+ if (mte_allowed) {
sanitise_mte_tags(kvm, pfn, vma_pagesize);
} else {
ret = -EFAULT;
@@ -1368,53 +1889,138 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
if (exec_fault)
prot |= KVM_PGTABLE_PROT_X;
- if (device)
- prot |= KVM_PGTABLE_PROT_DEVICE;
- else if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC))
+ if (s2_force_noncacheable) {
+ if (vfio_allow_any_uc)
+ prot |= KVM_PGTABLE_PROT_NORMAL_NC;
+ else
+ prot |= KVM_PGTABLE_PROT_DEVICE;
+ } else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC)) {
prot |= KVM_PGTABLE_PROT_X;
+ }
+
+ if (nested)
+ adjust_nested_exec_perms(kvm, nested, &prot);
/*
* Under the premise of getting a FSC_PERM fault, we just need to relax
* permissions only if vma_pagesize equals fault_granule. Otherwise,
* kvm_pgtable_stage2_map() should be called to change block size.
*/
- if (fault_status == ESR_ELx_FSC_PERM && vma_pagesize == fault_granule)
- ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot);
- else
- ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize,
+ if (fault_is_perm && vma_pagesize == fault_granule) {
+ /*
+ * Drop the SW bits in favour of those stored in the
+ * PTE, which will be preserved.
+ */
+ prot &= ~KVM_NV_GUEST_MAP_SZ;
+ ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, fault_ipa, prot, flags);
+ } else {
+ ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, vma_pagesize,
__pfn_to_phys(pfn), prot,
- memcache, KVM_PGTABLE_WALK_SHARED);
+ memcache, flags);
+ }
+
+out_unlock:
+ kvm_release_faultin_page(kvm, page, !!ret, writable);
+ kvm_fault_unlock(kvm);
/* Mark the page dirty only if the fault is handled successfully */
- if (writable && !ret) {
- kvm_set_pfn_dirty(pfn);
+ if (writable && !ret)
mark_page_dirty_in_slot(kvm, memslot, gfn);
- }
-out_unlock:
- read_unlock(&kvm->mmu_lock);
- kvm_set_pfn_accessed(pfn);
- kvm_release_pfn_clean(pfn);
return ret != -EAGAIN ? ret : 0;
}
/* Resolve the access fault by making the page young again. */
static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
{
- pte_t pte;
- kvm_pte_t kpte;
+ enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED;
struct kvm_s2_mmu *mmu;
trace_kvm_access_fault(fault_ipa);
- write_lock(&vcpu->kvm->mmu_lock);
+ read_lock(&vcpu->kvm->mmu_lock);
mmu = vcpu->arch.hw_mmu;
- kpte = kvm_pgtable_stage2_mkyoung(mmu->pgt, fault_ipa);
- write_unlock(&vcpu->kvm->mmu_lock);
+ KVM_PGT_FN(kvm_pgtable_stage2_mkyoung)(mmu->pgt, fault_ipa, flags);
+ read_unlock(&vcpu->kvm->mmu_lock);
+}
- pte = __pte(kpte);
- if (pte_valid(pte))
- kvm_set_pfn_accessed(pte_pfn(pte));
+/*
+ * Returns true if the SEA should be handled locally within KVM if the abort
+ * is caused by a kernel memory allocation (e.g. stage-2 table memory).
+ */
+static bool host_owns_sea(struct kvm_vcpu *vcpu, u64 esr)
+{
+ /*
+ * Without FEAT_RAS HCR_EL2.TEA is RES0, meaning any external abort
+ * taken from a guest EL to EL2 is due to a host-imposed access (e.g.
+ * stage-2 PTW).
+ */
+ if (!cpus_have_final_cap(ARM64_HAS_RAS_EXTN))
+ return true;
+
+ /* KVM owns the VNCR when the vCPU isn't in a nested context. */
+ if (is_hyp_ctxt(vcpu) && !kvm_vcpu_trap_is_iabt(vcpu) && (esr & ESR_ELx_VNCR))
+ return true;
+
+ /*
+ * Determining if an external abort during a table walk happened at
+ * stage-2 is only possible with S1PTW is set. Otherwise, since KVM
+ * sets HCR_EL2.TEA, SEAs due to a stage-1 walk (i.e. accessing the
+ * PA of the stage-1 descriptor) can reach here and are reported
+ * with a TTW ESR value.
+ */
+ return (esr_fsc_is_sea_ttw(esr) && (esr & ESR_ELx_S1PTW));
+}
+
+int kvm_handle_guest_sea(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_run *run = vcpu->run;
+ u64 esr = kvm_vcpu_get_esr(vcpu);
+ u64 esr_mask = ESR_ELx_EC_MASK |
+ ESR_ELx_IL |
+ ESR_ELx_FnV |
+ ESR_ELx_EA |
+ ESR_ELx_CM |
+ ESR_ELx_WNR |
+ ESR_ELx_FSC;
+ u64 ipa;
+
+ /*
+ * Give APEI the opportunity to claim the abort before handling it
+ * within KVM. apei_claim_sea() expects to be called with IRQs enabled.
+ */
+ lockdep_assert_irqs_enabled();
+ if (apei_claim_sea(NULL) == 0)
+ return 1;
+
+ if (host_owns_sea(vcpu, esr) ||
+ !test_bit(KVM_ARCH_FLAG_EXIT_SEA, &vcpu->kvm->arch.flags))
+ return kvm_inject_serror(vcpu);
+
+ /* ESR_ELx.SET is RES0 when FEAT_RAS isn't implemented. */
+ if (kvm_has_ras(kvm))
+ esr_mask |= ESR_ELx_SET_MASK;
+
+ /*
+ * Exit to userspace, and provide faulting guest virtual and physical
+ * addresses in case userspace wants to emulate SEA to guest by
+ * writing to FAR_ELx and HPFAR_ELx registers.
+ */
+ memset(&run->arm_sea, 0, sizeof(run->arm_sea));
+ run->exit_reason = KVM_EXIT_ARM_SEA;
+ run->arm_sea.esr = esr & esr_mask;
+
+ if (!(esr & ESR_ELx_FnV))
+ run->arm_sea.gva = kvm_vcpu_get_hfar(vcpu);
+
+ ipa = kvm_vcpu_get_fault_ipa(vcpu);
+ if (ipa != INVALID_GPA) {
+ run->arm_sea.flags |= KVM_EXIT_ARM_SEA_FLAG_GPA_VALID;
+ run->arm_sea.gpa = ipa;
+ }
+
+ return 0;
}
/**
@@ -1430,20 +2036,32 @@ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
*/
int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
{
- unsigned long fault_status;
- phys_addr_t fault_ipa;
+ struct kvm_s2_trans nested_trans, *nested = NULL;
+ unsigned long esr;
+ phys_addr_t fault_ipa; /* The address we faulted on */
+ phys_addr_t ipa; /* Always the IPA in the L1 guest phys space */
struct kvm_memory_slot *memslot;
unsigned long hva;
bool is_iabt, write_fault, writable;
gfn_t gfn;
int ret, idx;
- fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
+ if (kvm_vcpu_abt_issea(vcpu))
+ return kvm_handle_guest_sea(vcpu);
+
+ esr = kvm_vcpu_get_esr(vcpu);
+
+ /*
+ * The fault IPA should be reliable at this point as we're not dealing
+ * with an SEA.
+ */
+ ipa = fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
+ if (KVM_BUG_ON(ipa == INVALID_GPA, vcpu->kvm))
+ return -EFAULT;
- fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
- if (fault_status == ESR_ELx_FSC_FAULT) {
+ if (esr_fsc_is_translation_fault(esr)) {
/* Beyond sanitised PARange (which is the IPA limit) */
if (fault_ipa >= BIT_ULL(get_kvm_ipa_limit())) {
kvm_inject_size_fault(vcpu);
@@ -1451,36 +2069,20 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
}
/* Falls between the IPA range and the PARange? */
- if (fault_ipa >= BIT_ULL(vcpu->arch.hw_mmu->pgt->ia_bits)) {
+ if (fault_ipa >= BIT_ULL(VTCR_EL2_IPA(vcpu->arch.hw_mmu->vtcr))) {
fault_ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);
- if (is_iabt)
- kvm_inject_pabt(vcpu, fault_ipa);
- else
- kvm_inject_dabt(vcpu, fault_ipa);
- return 1;
+ return kvm_inject_sea(vcpu, is_iabt, fault_ipa);
}
}
- /* Synchronous External Abort? */
- if (kvm_vcpu_abt_issea(vcpu)) {
- /*
- * For RAS the host kernel may handle this abort.
- * There is no need to pass the error into the guest.
- */
- if (kvm_handle_guest_sea(fault_ipa, kvm_vcpu_get_esr(vcpu)))
- kvm_inject_vabt(vcpu);
-
- return 1;
- }
-
trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu),
kvm_vcpu_get_hfar(vcpu), fault_ipa);
/* Check the stage-2 fault is trans. fault or write fault */
- if (fault_status != ESR_ELx_FSC_FAULT &&
- fault_status != ESR_ELx_FSC_PERM &&
- fault_status != ESR_ELx_FSC_ACCESS) {
+ if (!esr_fsc_is_translation_fault(esr) &&
+ !esr_fsc_is_permission_fault(esr) &&
+ !esr_fsc_is_access_flag_fault(esr)) {
kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
kvm_vcpu_trap_get_class(vcpu),
(unsigned long)kvm_vcpu_trap_get_fault(vcpu),
@@ -1490,7 +2092,47 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
idx = srcu_read_lock(&vcpu->kvm->srcu);
- gfn = fault_ipa >> PAGE_SHIFT;
+ /*
+ * We may have faulted on a shadow stage 2 page table if we are
+ * running a nested guest. In this case, we have to resolve the L2
+ * IPA to the L1 IPA first, before knowing what kind of memory should
+ * back the L1 IPA.
+ *
+ * If the shadow stage 2 page table walk faults, then we simply inject
+ * this to the guest and carry on.
+ *
+ * If there are no shadow S2 PTs because S2 is disabled, there is
+ * nothing to walk and we treat it as a 1:1 before going through the
+ * canonical translation.
+ */
+ if (kvm_is_nested_s2_mmu(vcpu->kvm,vcpu->arch.hw_mmu) &&
+ vcpu->arch.hw_mmu->nested_stage2_enabled) {
+ u32 esr;
+
+ ret = kvm_walk_nested_s2(vcpu, fault_ipa, &nested_trans);
+ if (ret == -EAGAIN) {
+ ret = 1;
+ goto out_unlock;
+ }
+
+ if (ret) {
+ esr = kvm_s2_trans_esr(&nested_trans);
+ kvm_inject_s2_fault(vcpu, esr);
+ goto out_unlock;
+ }
+
+ ret = kvm_s2_handle_perm_fault(vcpu, &nested_trans);
+ if (ret) {
+ esr = kvm_s2_trans_esr(&nested_trans);
+ kvm_inject_s2_fault(vcpu, esr);
+ goto out_unlock;
+ }
+
+ ipa = kvm_s2_trans_output(&nested_trans);
+ nested = &nested_trans;
+ }
+
+ gfn = ipa >> PAGE_SHIFT;
memslot = gfn_to_memslot(vcpu->kvm, gfn);
hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
write_fault = kvm_is_write_fault(vcpu);
@@ -1507,8 +2149,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
}
if (kvm_vcpu_abt_iss1tw(vcpu)) {
- kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
- ret = 1;
+ ret = kvm_inject_sea_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
goto out_unlock;
}
@@ -1534,28 +2175,34 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
* faulting VA. This is always 12 bits, irrespective
* of the page size.
*/
- fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
- ret = io_mem_abort(vcpu, fault_ipa);
+ ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);
+ ret = io_mem_abort(vcpu, ipa);
goto out_unlock;
}
/* Userspace should not be able to register out-of-bounds IPAs */
- VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->kvm));
+ VM_BUG_ON(ipa >= kvm_phys_size(vcpu->arch.hw_mmu));
- if (fault_status == ESR_ELx_FSC_ACCESS) {
+ if (esr_fsc_is_access_flag_fault(esr)) {
handle_access_fault(vcpu, fault_ipa);
ret = 1;
goto out_unlock;
}
- ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status);
+ VM_WARN_ON_ONCE(kvm_vcpu_trap_is_permission_fault(vcpu) &&
+ !write_fault && !kvm_vcpu_trap_is_exec_fault(vcpu));
+
+ if (kvm_slot_has_gmem(memslot))
+ ret = gmem_abort(vcpu, fault_ipa, nested, memslot,
+ esr_fsc_is_permission_fault(esr));
+ else
+ ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva,
+ esr_fsc_is_permission_fault(esr));
if (ret == 0)
ret = 1;
out:
- if (ret == -ENOEXEC) {
- kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
- ret = 1;
- }
+ if (ret == -ENOEXEC)
+ ret = kvm_inject_sea_iabt(vcpu, kvm_vcpu_get_hfar(vcpu));
out_unlock:
srcu_read_unlock(&vcpu->kvm->srcu, idx);
return ret;
@@ -1570,67 +2217,36 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
(range->end - range->start) << PAGE_SHIFT,
range->may_block);
+ kvm_nested_s2_unmap(kvm, range->may_block);
return false;
}
-bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
+bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- kvm_pfn_t pfn = pte_pfn(range->pte);
+ u64 size = (range->end - range->start) << PAGE_SHIFT;
if (!kvm->arch.mmu.pgt)
return false;
- WARN_ON(range->end - range->start != 1);
-
+ return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt,
+ range->start << PAGE_SHIFT,
+ size, true);
/*
- * If the page isn't tagged, defer to user_mem_abort() for sanitising
- * the MTE tags. The S2 pte should have been unmapped by
- * mmu_notifier_invalidate_range_end().
+ * TODO: Handle nested_mmu structures here using the reverse mapping in
+ * a later version of patch series.
*/
- if (kvm_has_mte(kvm) && !page_mte_tagged(pfn_to_page(pfn)))
- return false;
-
- /*
- * We've moved a page around, probably through CoW, so let's treat
- * it just like a translation fault and the map handler will clean
- * the cache to the PoC.
- *
- * The MMU notifiers will have unmapped a huge PMD before calling
- * ->change_pte() (which in turn calls kvm_set_spte_gfn()) and
- * therefore we never need to clear out a huge PMD through this
- * calling path and a memcache is not required.
- */
- kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, range->start << PAGE_SHIFT,
- PAGE_SIZE, __pfn_to_phys(pfn),
- KVM_PGTABLE_PROT_R, NULL, 0);
-
- return false;
}
-bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
+bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
u64 size = (range->end - range->start) << PAGE_SHIFT;
- kvm_pte_t kpte;
- pte_t pte;
if (!kvm->arch.mmu.pgt)
return false;
- WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
-
- kpte = kvm_pgtable_stage2_mkold(kvm->arch.mmu.pgt,
- range->start << PAGE_SHIFT);
- pte = __pte(kpte);
- return pte_valid(pte) && pte_young(pte);
-}
-
-bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
-{
- if (!kvm->arch.mmu.pgt)
- return false;
-
- return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt,
- range->start << PAGE_SHIFT);
+ return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt,
+ range->start << PAGE_SHIFT,
+ size, false);
}
phys_addr_t kvm_mmu_get_httbr(void)
@@ -1668,7 +2284,7 @@ static struct kvm_pgtable_mm_ops kvm_hyp_mm_ops = {
.virt_to_phys = kvm_host_pa,
};
-int kvm_mmu_init(u32 *hyp_va_bits)
+int __init kvm_mmu_init(u32 *hyp_va_bits)
{
int err;
u32 idmap_bits;
@@ -1687,16 +2303,9 @@ int kvm_mmu_init(u32 *hyp_va_bits)
BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
/*
- * The ID map may be configured to use an extended virtual address
- * range. This is only the case if system RAM is out of range for the
- * currently configured page size and VA_BITS_MIN, in which case we will
- * also need the extended virtual range for the HYP ID map, or we won't
- * be able to enable the EL2 MMU.
- *
- * However, in some cases the ID map may be configured for fewer than
- * the number of VA bits used by the regular kernel stage 1. This
- * happens when VA_BITS=52 and the kernel image is placed in PA space
- * below 48 bits.
+ * The ID map is always configured for 48 bits of translation, which
+ * may be fewer than the number of VA bits used by the regular kernel
+ * stage 1, when VA_BITS=52.
*
* At EL2, there is only one TTBR register, and we can't switch between
* translation tables *and* update TCR_EL2.T0SZ at the same time. Bottom
@@ -1707,7 +2316,7 @@ int kvm_mmu_init(u32 *hyp_va_bits)
* 1 VA bits to assure that the hypervisor can both ID map its code page
* and map any kernel memory.
*/
- idmap_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET);
+ idmap_bits = IDMAP_VA_BITS;
kernel_bits = vabits_actual;
*hyp_va_bits = max(idmap_bits, kernel_bits);
@@ -1745,6 +2354,7 @@ int kvm_mmu_init(u32 *hyp_va_bits)
goto out_destroy_pgtable;
io_map_base = hyp_idmap_start;
+ __hyp_va_bits = *hyp_va_bits;
return 0;
out_destroy_pgtable:
@@ -1761,20 +2371,42 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
const struct kvm_memory_slot *new,
enum kvm_mr_change change)
{
+ bool log_dirty_pages = new && new->flags & KVM_MEM_LOG_DIRTY_PAGES;
+
/*
* At this point memslot has been committed and there is an
* allocated dirty_bitmap[], dirty pages will be tracked while the
* memory slot is write protected.
*/
- if (change != KVM_MR_DELETE && new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
+ if (log_dirty_pages) {
+
+ if (change == KVM_MR_DELETE)
+ return;
+
/*
- * If we're with initial-all-set, we don't need to write
- * protect any pages because they're all reported as dirty.
- * Huge pages and normal pages will be write protect gradually.
+ * Huge and normal pages are write-protected and split
+ * on either of these two cases:
+ *
+ * 1. with initial-all-set: gradually with CLEAR ioctls,
*/
- if (!kvm_dirty_log_manual_protect_and_init_set(kvm)) {
- kvm_mmu_wp_memory_region(kvm, new->id);
- }
+ if (kvm_dirty_log_manual_protect_and_init_set(kvm))
+ return;
+ /*
+ * or
+ * 2. without initial-all-set: all in one shot when
+ * enabling dirty logging.
+ */
+ kvm_mmu_wp_memory_region(kvm, new->id);
+ kvm_mmu_split_memory_region(kvm, new->id);
+ } else {
+ /*
+ * Free any leftovers from the eager page splitting cache. Do
+ * this when deleting, moving, disabling dirty logging, or
+ * creating the memslot (a nop). Doing it for deletes makes
+ * sure we don't leak memory, and there's no need to keep the
+ * cache around for any of the other cases.
+ */
+ kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache);
}
}
@@ -1794,9 +2426,16 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
* Prevent userspace from creating a memory region outside of the IPA
* space addressable by the KVM guest IPA space.
*/
- if ((new->base_gfn + new->npages) > (kvm_phys_size(kvm) >> PAGE_SHIFT))
+ if ((new->base_gfn + new->npages) > (kvm_phys_size(&kvm->arch.mmu) >> PAGE_SHIFT))
return -EFAULT;
+ /*
+ * Only support guest_memfd backed memslots with mappable memory, since
+ * there aren't any CoCo VMs that support only private memory on arm64.
+ */
+ if (kvm_slot_has_gmem(new) && !kvm_memslot_is_gmem_only(new))
+ return -EINVAL;
+
hva = new->userspace_addr;
reg_end = hva + (new->npages << PAGE_SHIFT);
@@ -1830,6 +2469,15 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
ret = -EINVAL;
break;
}
+
+ /*
+ * Cacheable PFNMAP is allowed only if the hardware
+ * supports it.
+ */
+ if (kvm_vma_is_cacheable(vma) && !kvm_supports_cacheable_pfnmap()) {
+ ret = -EINVAL;
+ break;
+ }
}
hva = min(reg_end, vma->vm_end);
} while (hva < reg_end);
@@ -1846,11 +2494,6 @@ void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
{
}
-void kvm_arch_flush_shadow_all(struct kvm *kvm)
-{
- kvm_free_stage2_pgd(&kvm->arch.mmu);
-}
-
void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot)
{
@@ -1858,7 +2501,8 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
phys_addr_t size = slot->npages << PAGE_SHIFT;
write_lock(&kvm->mmu_lock);
- unmap_stage2_range(&kvm->arch.mmu, gpa, size);
+ kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, size, true);
+ kvm_nested_s2_unmap(kvm, true);
write_unlock(&kvm->mmu_lock);
}