diff options
Diffstat (limited to 'arch/s390/mm/gmap.c')
| -rw-r--r-- | arch/s390/mm/gmap.c | 1620 |
1 files changed, 921 insertions, 699 deletions
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 4fb3d3cdb370..dd85bcca817d 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -1,12 +1,17 @@ +// SPDX-License-Identifier: GPL-2.0 /* * KVM guest address space mapping code * - * Copyright IBM Corp. 2007, 2016 + * Copyright IBM Corp. 2007, 2020 * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> + * David Hildenbrand <david@redhat.com> + * Janosch Frank <frankja@linux.vnet.ibm.com> */ +#include <linux/cpufeature.h> +#include <linux/export.h> #include <linux/kernel.h> -#include <linux/mm.h> +#include <linux/pagewalk.h> #include <linux/swap.h> #include <linux/smp.h> #include <linux/spinlock.h> @@ -14,38 +19,60 @@ #include <linux/swapops.h> #include <linux/ksm.h> #include <linux/mman.h> - -#include <asm/pgtable.h> +#include <linux/pgtable.h> +#include <asm/page-states.h> #include <asm/pgalloc.h> +#include <asm/machine.h> +#include <asm/gmap_helpers.h> #include <asm/gmap.h> -#include <asm/tlb.h> +#include <asm/page.h> + +/* + * The address is saved in a radix tree directly; NULL would be ambiguous, + * since 0 is a valid address, and NULL is returned when nothing was found. + * The lower bits are ignored by all users of the macro, so it can be used + * to distinguish a valid address 0 from a NULL. + */ +#define VALID_GADDR_FLAG 1 +#define IS_GADDR_VALID(gaddr) ((gaddr) & VALID_GADDR_FLAG) +#define MAKE_VALID_GADDR(gaddr) (((gaddr) & HPAGE_MASK) | VALID_GADDR_FLAG) #define GMAP_SHADOW_FAKE_TABLE 1ULL +static struct page *gmap_alloc_crst(void) +{ + struct page *page; + + page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); + if (!page) + return NULL; + __arch_set_page_dat(page_to_virt(page), 1UL << CRST_ALLOC_ORDER); + return page; +} + /** * gmap_alloc - allocate and initialize a guest address space - * @mm: pointer to the parent mm_struct * @limit: maximum address of the gmap address space * * Returns a guest address space structure. */ -static struct gmap *gmap_alloc(unsigned long limit) +struct gmap *gmap_alloc(unsigned long limit) { struct gmap *gmap; struct page *page; unsigned long *table; unsigned long etype, atype; - if (limit < (1UL << 31)) { - limit = (1UL << 31) - 1; + if (limit < _REGION3_SIZE) { + limit = _REGION3_SIZE - 1; atype = _ASCE_TYPE_SEGMENT; etype = _SEGMENT_ENTRY_EMPTY; - } else if (limit < (1UL << 42)) { - limit = (1UL << 42) - 1; + } else if (limit < _REGION2_SIZE) { + limit = _REGION2_SIZE - 1; atype = _ASCE_TYPE_REGION3; etype = _REGION3_ENTRY_EMPTY; - } else if (limit < (1UL << 53)) { - limit = (1UL << 53) - 1; + } else if (limit < _REGION1_SIZE) { + limit = _REGION1_SIZE - 1; atype = _ASCE_TYPE_REGION2; etype = _REGION2_ENTRY_EMPTY; } else { @@ -53,24 +80,20 @@ static struct gmap *gmap_alloc(unsigned long limit) atype = _ASCE_TYPE_REGION1; etype = _REGION1_ENTRY_EMPTY; } - gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL); + gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL_ACCOUNT); if (!gmap) goto out; - INIT_LIST_HEAD(&gmap->crst_list); INIT_LIST_HEAD(&gmap->children); - INIT_LIST_HEAD(&gmap->pt_list); - INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL); - INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC); - INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC); + INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL_ACCOUNT); + INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC | __GFP_ACCOUNT); + INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC | __GFP_ACCOUNT); spin_lock_init(&gmap->guest_table_lock); spin_lock_init(&gmap->shadow_lock); - atomic_set(&gmap->ref_count, 1); - page = alloc_pages(GFP_KERNEL, 2); + refcount_set(&gmap->ref_count, 1); + page = gmap_alloc_crst(); if (!page) goto out_free; - page->index = 0; - list_add(&page->lru, &gmap->crst_list); - table = (unsigned long *) page_to_phys(page); + table = page_to_virt(page); crst_table_init(table, etype); gmap->table = table; gmap->asce = atype | _ASCE_TABLE_LENGTH | @@ -83,6 +106,7 @@ out_free: out: return NULL; } +EXPORT_SYMBOL_GPL(gmap_alloc); /** * gmap_create - create a guest address space @@ -100,24 +124,21 @@ struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit) if (!gmap) return NULL; gmap->mm = mm; - spin_lock(&mm->context.gmap_lock); + spin_lock(&mm->context.lock); list_add_rcu(&gmap->list, &mm->context.gmap_list); if (list_is_singular(&mm->context.gmap_list)) gmap_asce = gmap->asce; else gmap_asce = -1UL; WRITE_ONCE(mm->context.gmap_asce, gmap_asce); - spin_unlock(&mm->context.gmap_lock); + spin_unlock(&mm->context.lock); return gmap; } EXPORT_SYMBOL_GPL(gmap_create); static void gmap_flush_tlb(struct gmap *gmap) { - if (MACHINE_HAS_IDTE) - __tlb_flush_idte(gmap->asce); - else - __tlb_flush_global(); + __tlb_flush_idte(gmap->asce); } static void gmap_radix_tree_free(struct radix_tree_root *root) @@ -171,30 +192,46 @@ static void gmap_rmap_radix_tree_free(struct radix_tree_root *root) } while (nr > 0); } +static void gmap_free_crst(unsigned long *table, bool free_ptes) +{ + bool is_segment = (table[0] & _SEGMENT_ENTRY_TYPE_MASK) == 0; + int i; + + if (is_segment) { + if (!free_ptes) + goto out; + for (i = 0; i < _CRST_ENTRIES; i++) + if (!(table[i] & _SEGMENT_ENTRY_INVALID)) + page_table_free_pgste(page_ptdesc(phys_to_page(table[i]))); + } else { + for (i = 0; i < _CRST_ENTRIES; i++) + if (!(table[i] & _REGION_ENTRY_INVALID)) + gmap_free_crst(__va(table[i] & PAGE_MASK), free_ptes); + } + +out: + free_pages((unsigned long)table, CRST_ALLOC_ORDER); +} + /** * gmap_free - free a guest address space * @gmap: pointer to the guest address space structure * * No locks required. There are no references to this gmap anymore. */ -static void gmap_free(struct gmap *gmap) +void gmap_free(struct gmap *gmap) { - struct page *page, *next; - /* Flush tlb of all gmaps (if not already done for shadows) */ if (!(gmap_is_shadow(gmap) && gmap->removed)) gmap_flush_tlb(gmap); /* Free all segment & region tables. */ - list_for_each_entry_safe(page, next, &gmap->crst_list, lru) - __free_pages(page, 2); + gmap_free_crst(gmap->table, gmap_is_shadow(gmap)); + gmap_radix_tree_free(&gmap->guest_to_host); gmap_radix_tree_free(&gmap->host_to_guest); /* Free additional data for a shadow gmap */ if (gmap_is_shadow(gmap)) { - /* Free all page tables. */ - list_for_each_entry_safe(page, next, &gmap->pt_list, lru) - page_table_free_pgste(page); gmap_rmap_radix_tree_free(&gmap->host_to_rmap); /* Release reference to the parent */ gmap_put(gmap->parent); @@ -202,6 +239,7 @@ static void gmap_free(struct gmap *gmap) kfree(gmap); } +EXPORT_SYMBOL_GPL(gmap_free); /** * gmap_get - increase reference counter for guest address space @@ -211,7 +249,7 @@ static void gmap_free(struct gmap *gmap) */ struct gmap *gmap_get(struct gmap *gmap) { - atomic_inc(&gmap->ref_count); + refcount_inc(&gmap->ref_count); return gmap; } EXPORT_SYMBOL_GPL(gmap_get); @@ -224,7 +262,7 @@ EXPORT_SYMBOL_GPL(gmap_get); */ void gmap_put(struct gmap *gmap) { - if (atomic_dec_return(&gmap->ref_count) == 0) + if (refcount_dec_and_test(&gmap->ref_count)) gmap_free(gmap); } EXPORT_SYMBOL_GPL(gmap_put); @@ -248,7 +286,7 @@ void gmap_remove(struct gmap *gmap) spin_unlock(&gmap->shadow_lock); } /* Remove gmap from the pre-mm list */ - spin_lock(&gmap->mm->context.gmap_lock); + spin_lock(&gmap->mm->context.lock); list_del_rcu(&gmap->list); if (list_empty(&gmap->mm->context.gmap_list)) gmap_asce = 0; @@ -258,46 +296,15 @@ void gmap_remove(struct gmap *gmap) else gmap_asce = -1UL; WRITE_ONCE(gmap->mm->context.gmap_asce, gmap_asce); - spin_unlock(&gmap->mm->context.gmap_lock); + spin_unlock(&gmap->mm->context.lock); synchronize_rcu(); /* Put reference */ gmap_put(gmap); } EXPORT_SYMBOL_GPL(gmap_remove); -/** - * gmap_enable - switch primary space to the guest address space - * @gmap: pointer to the guest address space structure - */ -void gmap_enable(struct gmap *gmap) -{ - S390_lowcore.gmap = (unsigned long) gmap; -} -EXPORT_SYMBOL_GPL(gmap_enable); - -/** - * gmap_disable - switch back to the standard primary address space - * @gmap: pointer to the guest address space structure - */ -void gmap_disable(struct gmap *gmap) -{ - S390_lowcore.gmap = 0UL; -} -EXPORT_SYMBOL_GPL(gmap_disable); - -/** - * gmap_get_enabled - get a pointer to the currently enabled gmap - * - * Returns a pointer to the currently enabled gmap. 0 if none is enabled. - */ -struct gmap *gmap_get_enabled(void) -{ - return (struct gmap *) S390_lowcore.gmap; -} -EXPORT_SYMBOL_GPL(gmap_get_enabled); - /* - * gmap_alloc_table is assumed to be called with mmap_sem held + * gmap_alloc_table is assumed to be called with mmap_lock held */ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, unsigned long init, unsigned long gaddr) @@ -306,41 +313,40 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, unsigned long *new; /* since we dont free the gmap table until gmap_free we can unlock */ - page = alloc_pages(GFP_KERNEL, 2); + page = gmap_alloc_crst(); if (!page) return -ENOMEM; - new = (unsigned long *) page_to_phys(page); + new = page_to_virt(page); crst_table_init(new, init); spin_lock(&gmap->guest_table_lock); if (*table & _REGION_ENTRY_INVALID) { - list_add(&page->lru, &gmap->crst_list); - *table = (unsigned long) new | _REGION_ENTRY_LENGTH | + *table = __pa(new) | _REGION_ENTRY_LENGTH | (*table & _REGION_ENTRY_TYPE_MASK); - page->index = gaddr; page = NULL; } spin_unlock(&gmap->guest_table_lock); if (page) - __free_pages(page, 2); + __free_pages(page, CRST_ALLOC_ORDER); return 0; } -/** - * __gmap_segment_gaddr - find virtual address from segment pointer - * @entry: pointer to a segment table entry in the guest address space - * - * Returns the virtual address in the guest address space for the segment - */ -static unsigned long __gmap_segment_gaddr(unsigned long *entry) +static unsigned long host_to_guest_lookup(struct gmap *gmap, unsigned long vmaddr) { - struct page *page; - unsigned long offset, mask; + return (unsigned long)radix_tree_lookup(&gmap->host_to_guest, vmaddr >> PMD_SHIFT); +} + +static unsigned long host_to_guest_delete(struct gmap *gmap, unsigned long vmaddr) +{ + return (unsigned long)radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT); +} - offset = (unsigned long) entry / sizeof(unsigned long); - offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE; - mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1); - page = virt_to_page((void *)((unsigned long) entry & mask)); - return page->index + offset; +static pmd_t *host_to_guest_pmd_delete(struct gmap *gmap, unsigned long vmaddr, + unsigned long *gaddr) +{ + *gaddr = host_to_guest_delete(gmap, vmaddr); + if (IS_GADDR_VALID(*gaddr)) + return (pmd_t *)gmap_table_walk(gmap, *gaddr, 1); + return NULL; } /** @@ -352,16 +358,19 @@ static unsigned long __gmap_segment_gaddr(unsigned long *entry) */ static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr) { - unsigned long *entry; + unsigned long gaddr; int flush = 0; + pmd_t *pmdp; BUG_ON(gmap_is_shadow(gmap)); spin_lock(&gmap->guest_table_lock); - entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT); - if (entry) { - flush = (*entry != _SEGMENT_ENTRY_EMPTY); - *entry = _SEGMENT_ENTRY_EMPTY; + + pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr); + if (pmdp) { + flush = (pmd_val(*pmdp) != _SEGMENT_ENTRY_EMPTY); + *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY); } + spin_unlock(&gmap->guest_table_lock); return flush; } @@ -402,10 +411,10 @@ int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len) return -EINVAL; flush = 0; - down_write(&gmap->mm->mmap_sem); + mmap_write_lock(gmap->mm); for (off = 0; off < len; off += PMD_SIZE) flush |= __gmap_unmap_by_gaddr(gmap, to + off); - up_write(&gmap->mm->mmap_sem); + mmap_write_unlock(gmap->mm); if (flush) gmap_flush_tlb(gmap); return 0; @@ -435,7 +444,7 @@ int gmap_map_segment(struct gmap *gmap, unsigned long from, return -EINVAL; flush = 0; - down_write(&gmap->mm->mmap_sem); + mmap_write_lock(gmap->mm); for (off = 0; off < len; off += PMD_SIZE) { /* Remove old translation */ flush |= __gmap_unmap_by_gaddr(gmap, to + off); @@ -445,7 +454,7 @@ int gmap_map_segment(struct gmap *gmap, unsigned long from, (void *) from + off)) break; } - up_write(&gmap->mm->mmap_sem); + mmap_write_unlock(gmap->mm); if (flush) gmap_flush_tlb(gmap); if (off >= len) @@ -463,7 +472,7 @@ EXPORT_SYMBOL_GPL(gmap_map_segment); * Returns user space address which corresponds to the guest address or * -EFAULT if no such mapping exists. * This function does not establish potentially missing page table entries. - * The mmap_sem of the mm that belongs to the address space must be held + * The mmap_lock of the mm that belongs to the address space must be held * when this function gets called. * * Note: Can also be called for shadow gmaps. @@ -480,28 +489,8 @@ unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr) EXPORT_SYMBOL_GPL(__gmap_translate); /** - * gmap_translate - translate a guest address to a user space address - * @gmap: pointer to guest mapping meta data structure - * @gaddr: guest address - * - * Returns user space address which corresponds to the guest address or - * -EFAULT if no such mapping exists. - * This function does not establish potentially missing page table entries. - */ -unsigned long gmap_translate(struct gmap *gmap, unsigned long gaddr) -{ - unsigned long rc; - - down_read(&gmap->mm->mmap_sem); - rc = __gmap_translate(gmap, gaddr); - up_read(&gmap->mm->mmap_sem); - return rc; -} -EXPORT_SYMBOL_GPL(gmap_translate); - -/** * gmap_unlink - disconnect a page table from the gmap shadow tables - * @gmap: pointer to guest mapping meta data structure + * @mm: pointer to the parent mm_struct * @table: pointer to the host page table * @vmaddr: vm address associated with the host page table */ @@ -520,15 +509,18 @@ void gmap_unlink(struct mm_struct *mm, unsigned long *table, rcu_read_unlock(); } +static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *old, pmd_t new, + unsigned long gaddr); + /** - * gmap_link - set up shadow page tables to connect a host to a guest address + * __gmap_link - set up shadow page tables to connect a host to a guest address * @gmap: pointer to guest mapping meta data structure * @gaddr: guest address * @vmaddr: vm address * * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT * if the vm address is already mapped to a different guest segment. - * The mmap_sem of the mm that belongs to the address space must be held + * The mmap_lock of the mm that belongs to the address space must be held * when this function gets called. */ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) @@ -540,36 +532,37 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) p4d_t *p4d; pud_t *pud; pmd_t *pmd; + u64 unprot; int rc; BUG_ON(gmap_is_shadow(gmap)); /* Create higher level tables in the gmap page table */ table = gmap->table; if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) { - table += (gaddr >> 53) & 0x7ff; + table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT; if ((*table & _REGION_ENTRY_INVALID) && gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY, - gaddr & 0xffe0000000000000UL)) + gaddr & _REGION1_MASK)) return -ENOMEM; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = __va(*table & _REGION_ENTRY_ORIGIN); } if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) { - table += (gaddr >> 42) & 0x7ff; + table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT; if ((*table & _REGION_ENTRY_INVALID) && gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY, - gaddr & 0xfffffc0000000000UL)) + gaddr & _REGION2_MASK)) return -ENOMEM; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = __va(*table & _REGION_ENTRY_ORIGIN); } if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) { - table += (gaddr >> 31) & 0x7ff; + table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT; if ((*table & _REGION_ENTRY_INVALID) && gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY, - gaddr & 0xffffffff80000000UL)) + gaddr & _REGION3_MASK)) return -ENOMEM; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = __va(*table & _REGION_ENTRY_ORIGIN); } - table += (gaddr >> 20) & 0x7ff; + table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; /* Walk the parent mm page table */ mm = gmap->mm; pgd = pgd_offset(mm, vmaddr); @@ -579,123 +572,67 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) pud = pud_offset(p4d, vmaddr); VM_BUG_ON(pud_none(*pud)); /* large puds cannot yet be handled */ - if (pud_large(*pud)) + if (pud_leaf(*pud)) return -EFAULT; pmd = pmd_offset(pud, vmaddr); VM_BUG_ON(pmd_none(*pmd)); - /* large pmds cannot yet be handled */ - if (pmd_large(*pmd)) + /* Are we allowed to use huge pages? */ + if (pmd_leaf(*pmd) && !gmap->mm->context.allow_gmap_hpage_1m) return -EFAULT; /* Link gmap segment table entry location to page table. */ - rc = radix_tree_preload(GFP_KERNEL); + rc = radix_tree_preload(GFP_KERNEL_ACCOUNT); if (rc) return rc; ptl = pmd_lock(mm, pmd); spin_lock(&gmap->guest_table_lock); if (*table == _SEGMENT_ENTRY_EMPTY) { rc = radix_tree_insert(&gmap->host_to_guest, - vmaddr >> PMD_SHIFT, table); - if (!rc) - *table = pmd_val(*pmd); - } else - rc = 0; + vmaddr >> PMD_SHIFT, + (void *)MAKE_VALID_GADDR(gaddr)); + if (!rc) { + if (pmd_leaf(*pmd)) { + *table = (pmd_val(*pmd) & + _SEGMENT_ENTRY_HARDWARE_BITS_LARGE) + | _SEGMENT_ENTRY_GMAP_UC + | _SEGMENT_ENTRY; + } else + *table = (pmd_val(*pmd) & + _SEGMENT_ENTRY_HARDWARE_BITS) + | _SEGMENT_ENTRY; + } + } else if (*table & _SEGMENT_ENTRY_PROTECT && + !(pmd_val(*pmd) & _SEGMENT_ENTRY_PROTECT)) { + unprot = (u64)*table; + unprot &= ~_SEGMENT_ENTRY_PROTECT; + unprot |= _SEGMENT_ENTRY_GMAP_UC; + gmap_pmdp_xchg(gmap, (pmd_t *)table, __pmd(unprot), gaddr); + } spin_unlock(&gmap->guest_table_lock); spin_unlock(ptl); radix_tree_preload_end(); return rc; } - -/** - * gmap_fault - resolve a fault on a guest address - * @gmap: pointer to guest mapping meta data structure - * @gaddr: guest address - * @fault_flags: flags to pass down to handle_mm_fault() - * - * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT - * if the vm address is already mapped to a different guest segment. - */ -int gmap_fault(struct gmap *gmap, unsigned long gaddr, - unsigned int fault_flags) -{ - unsigned long vmaddr; - int rc; - bool unlocked; - - down_read(&gmap->mm->mmap_sem); - -retry: - unlocked = false; - vmaddr = __gmap_translate(gmap, gaddr); - if (IS_ERR_VALUE(vmaddr)) { - rc = vmaddr; - goto out_up; - } - if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags, - &unlocked)) { - rc = -EFAULT; - goto out_up; - } - /* - * In the case that fixup_user_fault unlocked the mmap_sem during - * faultin redo __gmap_translate to not race with a map/unmap_segment. - */ - if (unlocked) - goto retry; - - rc = __gmap_link(gmap, gaddr, vmaddr); -out_up: - up_read(&gmap->mm->mmap_sem); - return rc; -} -EXPORT_SYMBOL_GPL(gmap_fault); +EXPORT_SYMBOL(__gmap_link); /* - * this function is assumed to be called with mmap_sem held + * this function is assumed to be called with mmap_lock held */ void __gmap_zap(struct gmap *gmap, unsigned long gaddr) { unsigned long vmaddr; - spinlock_t *ptl; - pte_t *ptep; + + mmap_assert_locked(gmap->mm); /* Find the vm address for the guest address */ vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT); if (vmaddr) { vmaddr |= gaddr & ~PMD_MASK; - /* Get pointer to the page table entry */ - ptep = get_locked_pte(gmap->mm, vmaddr, &ptl); - if (likely(ptep)) - ptep_zap_unused(gmap->mm, vmaddr, ptep, 0); - pte_unmap_unlock(ptep, ptl); + gmap_helper_zap_one_page(gmap->mm, vmaddr); } } EXPORT_SYMBOL_GPL(__gmap_zap); -void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to) -{ - unsigned long gaddr, vmaddr, size; - struct vm_area_struct *vma; - - down_read(&gmap->mm->mmap_sem); - for (gaddr = from; gaddr < to; - gaddr = (gaddr + PMD_SIZE) & PMD_MASK) { - /* Find the vm address for the guest address */ - vmaddr = (unsigned long) - radix_tree_lookup(&gmap->guest_to_host, - gaddr >> PMD_SHIFT); - if (!vmaddr) - continue; - vmaddr |= gaddr & ~PMD_MASK; - /* Find vma in the parent mm */ - vma = find_vma(gmap->mm, vmaddr); - size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK)); - zap_page_range(vma, vmaddr, size); - } - up_read(&gmap->mm->mmap_sem); -} -EXPORT_SYMBOL_GPL(gmap_discard); - static LIST_HEAD(gmap_notifier_list); static DEFINE_SPINLOCK(gmap_notifier_lock); @@ -757,54 +694,58 @@ static void gmap_call_notifier(struct gmap *gmap, unsigned long start, * * Note: Can also be called for shadow gmaps. */ -static inline unsigned long *gmap_table_walk(struct gmap *gmap, - unsigned long gaddr, int level) +unsigned long *gmap_table_walk(struct gmap *gmap, unsigned long gaddr, int level) { - unsigned long *table; + const int asce_type = gmap->asce & _ASCE_TYPE_MASK; + unsigned long *table = gmap->table; - if ((gmap->asce & _ASCE_TYPE_MASK) + 4 < (level * 4)) - return NULL; if (gmap_is_shadow(gmap) && gmap->removed) return NULL; - if (gaddr & (-1UL << (31 + ((gmap->asce & _ASCE_TYPE_MASK) >> 2)*11))) + + if (WARN_ON_ONCE(level > (asce_type >> 2) + 1)) return NULL; - table = gmap->table; - switch (gmap->asce & _ASCE_TYPE_MASK) { + + if (asce_type != _ASCE_TYPE_REGION1 && + gaddr & (-1UL << (31 + (asce_type >> 2) * 11))) + return NULL; + + switch (asce_type) { case _ASCE_TYPE_REGION1: - table += (gaddr >> 53) & 0x7ff; + table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT; if (level == 4) break; if (*table & _REGION_ENTRY_INVALID) return NULL; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - /* Fallthrough */ + table = __va(*table & _REGION_ENTRY_ORIGIN); + fallthrough; case _ASCE_TYPE_REGION2: - table += (gaddr >> 42) & 0x7ff; + table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT; if (level == 3) break; if (*table & _REGION_ENTRY_INVALID) return NULL; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - /* Fallthrough */ + table = __va(*table & _REGION_ENTRY_ORIGIN); + fallthrough; case _ASCE_TYPE_REGION3: - table += (gaddr >> 31) & 0x7ff; + table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT; if (level == 2) break; if (*table & _REGION_ENTRY_INVALID) return NULL; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - /* Fallthrough */ + table = __va(*table & _REGION_ENTRY_ORIGIN); + fallthrough; case _ASCE_TYPE_SEGMENT: - table += (gaddr >> 20) & 0x7ff; + table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; if (level == 1) break; if (*table & _REGION_ENTRY_INVALID) return NULL; - table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN); - table += (gaddr >> 12) & 0xff; + table = __va(*table & _SEGMENT_ENTRY_ORIGIN); + table += (gaddr & _PAGE_INDEX) >> PAGE_SHIFT; } return table; } +EXPORT_SYMBOL(gmap_table_walk); /** * gmap_pte_op_walk - walk the gmap page table, get the page table lock @@ -814,27 +755,17 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap, * @ptl: pointer to the spinlock pointer * * Returns a pointer to the locked pte for a guest address, or NULL - * - * Note: Can also be called for shadow gmaps. */ static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr, spinlock_t **ptl) { unsigned long *table; - if (gmap_is_shadow(gmap)) - spin_lock(&gmap->guest_table_lock); + BUG_ON(gmap_is_shadow(gmap)); /* Walk the gmap page table, lock and get pte pointer */ table = gmap_table_walk(gmap, gaddr, 1); /* get segment pointer */ - if (!table || *table & _SEGMENT_ENTRY_INVALID) { - if (gmap_is_shadow(gmap)) - spin_unlock(&gmap->guest_table_lock); + if (!table || *table & _SEGMENT_ENTRY_INVALID) return NULL; - } - if (gmap_is_shadow(gmap)) { - *ptl = &gmap->guest_table_lock; - return pte_offset_map((pmd_t *) table, gaddr); - } return pte_alloc_map_lock(gmap->mm, (pmd_t *) table, gaddr, ptl); } @@ -858,10 +789,10 @@ static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr, BUG_ON(gmap_is_shadow(gmap)); fault_flags = (prot == PROT_WRITE) ? FAULT_FLAG_WRITE : 0; - if (fixup_user_fault(current, mm, vmaddr, fault_flags, &unlocked)) + if (fixup_user_fault(mm, vmaddr, fault_flags, &unlocked)) return -EFAULT; if (unlocked) - /* lost mmap_sem, caller has to retry __gmap_translate */ + /* lost mmap_lock, caller has to retry __gmap_translate */ return 0; /* Connect the page tables */ return __gmap_link(gmap, gaddr, vmaddr); @@ -869,87 +800,182 @@ static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr, /** * gmap_pte_op_end - release the page table lock - * @ptl: pointer to the spinlock pointer + * @ptep: pointer to the locked pte + * @ptl: pointer to the page table spinlock */ -static void gmap_pte_op_end(spinlock_t *ptl) +static void gmap_pte_op_end(pte_t *ptep, spinlock_t *ptl) { - spin_unlock(ptl); + pte_unmap_unlock(ptep, ptl); +} + +/** + * gmap_pmd_op_walk - walk the gmap tables, get the guest table lock + * and return the pmd pointer + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * + * Returns a pointer to the pmd for a guest address, or NULL + */ +static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr) +{ + pmd_t *pmdp; + + BUG_ON(gmap_is_shadow(gmap)); + pmdp = (pmd_t *) gmap_table_walk(gmap, gaddr, 1); + if (!pmdp) + return NULL; + + /* without huge pages, there is no need to take the table lock */ + if (!gmap->mm->context.allow_gmap_hpage_1m) + return pmd_none(*pmdp) ? NULL : pmdp; + + spin_lock(&gmap->guest_table_lock); + if (pmd_none(*pmdp)) { + spin_unlock(&gmap->guest_table_lock); + return NULL; + } + + /* 4k page table entries are locked via the pte (pte_alloc_map_lock). */ + if (!pmd_leaf(*pmdp)) + spin_unlock(&gmap->guest_table_lock); + return pmdp; +} + +/** + * gmap_pmd_op_end - release the guest_table_lock if needed + * @gmap: pointer to the guest mapping meta data structure + * @pmdp: pointer to the pmd + */ +static inline void gmap_pmd_op_end(struct gmap *gmap, pmd_t *pmdp) +{ + if (pmd_leaf(*pmdp)) + spin_unlock(&gmap->guest_table_lock); } /* - * gmap_protect_range - remove access rights to memory and set pgste bits + * gmap_protect_pmd - remove access rights to memory and set pmd notification bits + * @pmdp: pointer to the pmd to be protected + * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE + * @bits: notification bits to set + * + * Returns: + * 0 if successfully protected + * -EAGAIN if a fixup is needed + * -EINVAL if unsupported notifier bits have been specified + * + * Expected to be called with sg->mm->mmap_lock in read and + * guest_table_lock held. + */ +static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr, + pmd_t *pmdp, int prot, unsigned long bits) +{ + int pmd_i = pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID; + int pmd_p = pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT; + pmd_t new = *pmdp; + + /* Fixup needed */ + if ((pmd_i && (prot != PROT_NONE)) || (pmd_p && (prot == PROT_WRITE))) + return -EAGAIN; + + if (prot == PROT_NONE && !pmd_i) { + new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID)); + gmap_pmdp_xchg(gmap, pmdp, new, gaddr); + } + + if (prot == PROT_READ && !pmd_p) { + new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID)); + new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_PROTECT)); + gmap_pmdp_xchg(gmap, pmdp, new, gaddr); + } + + if (bits & GMAP_NOTIFY_MPROT) + set_pmd(pmdp, set_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN))); + + /* Shadow GMAP protection needs split PMDs */ + if (bits & GMAP_NOTIFY_SHADOW) + return -EINVAL; + + return 0; +} + +/* + * gmap_protect_pte - remove access rights to memory and set pgste bits * @gmap: pointer to guest mapping meta data structure * @gaddr: virtual address in the guest address space - * @len: size of area + * @pmdp: pointer to the pmd associated with the pte * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE - * @bits: pgste notification bits to set + * @bits: notification bits to set * * Returns 0 if successfully protected, -ENOMEM if out of memory and - * -EFAULT if gaddr is invalid (or mapping for shadows is missing). - * - * Called with sg->mm->mmap_sem in read. + * -EAGAIN if a fixup is needed. * - * Note: Can also be called for shadow gmaps. + * Expected to be called with sg->mm->mmap_lock in read */ -static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr, - unsigned long len, int prot, unsigned long bits) +static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr, + pmd_t *pmdp, int prot, unsigned long bits) { - unsigned long vmaddr; - spinlock_t *ptl; - pte_t *ptep; int rc; + pte_t *ptep; + spinlock_t *ptl; + unsigned long pbits = 0; - while (len) { - rc = -EAGAIN; - ptep = gmap_pte_op_walk(gmap, gaddr, &ptl); - if (ptep) { - rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, bits); - gmap_pte_op_end(ptl); - } - if (rc) { - vmaddr = __gmap_translate(gmap, gaddr); - if (IS_ERR_VALUE(vmaddr)) - return vmaddr; - rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, prot); - if (rc) - return rc; - continue; - } - gaddr += PAGE_SIZE; - len -= PAGE_SIZE; - } - return 0; + if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID) + return -EAGAIN; + + ptep = pte_alloc_map_lock(gmap->mm, pmdp, gaddr, &ptl); + if (!ptep) + return -ENOMEM; + + pbits |= (bits & GMAP_NOTIFY_MPROT) ? PGSTE_IN_BIT : 0; + pbits |= (bits & GMAP_NOTIFY_SHADOW) ? PGSTE_VSIE_BIT : 0; + /* Protect and unlock. */ + rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, pbits); + gmap_pte_op_end(ptep, ptl); + return rc; } -/** - * gmap_mprotect_notify - change access rights for a range of ptes and - * call the notifier if any pte changes again +/* + * gmap_protect_range - remove access rights to memory and set pgste bits * @gmap: pointer to guest mapping meta data structure * @gaddr: virtual address in the guest address space * @len: size of area * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE + * @bits: pgste notification bits to set + * + * Returns: + * PAGE_SIZE if a small page was successfully protected; + * HPAGE_SIZE if a large page was successfully protected; + * -ENOMEM if out of memory; + * -EFAULT if gaddr is invalid (or mapping for shadows is missing); + * -EAGAIN if the guest mapping is missing and should be fixed by the caller. * - * Returns 0 if for each page in the given range a gmap mapping exists, - * the new access rights could be set and the notifier could be armed. - * If the gmap mapping is missing for one or more pages -EFAULT is - * returned. If no memory could be allocated -ENOMEM is returned. - * This function establishes missing page table entries. + * Context: Called with sg->mm->mmap_lock in read. */ -int gmap_mprotect_notify(struct gmap *gmap, unsigned long gaddr, - unsigned long len, int prot) +int gmap_protect_one(struct gmap *gmap, unsigned long gaddr, int prot, unsigned long bits) { - int rc; + pmd_t *pmdp; + int rc = 0; + + BUG_ON(gmap_is_shadow(gmap)); + + pmdp = gmap_pmd_op_walk(gmap, gaddr); + if (!pmdp) + return -EAGAIN; + + if (!pmd_leaf(*pmdp)) { + rc = gmap_protect_pte(gmap, gaddr, pmdp, prot, bits); + if (!rc) + rc = PAGE_SIZE; + } else { + rc = gmap_protect_pmd(gmap, gaddr, pmdp, prot, bits); + if (!rc) + rc = HPAGE_SIZE; + } + gmap_pmd_op_end(gmap, pmdp); - if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK) || gmap_is_shadow(gmap)) - return -EINVAL; - if (!MACHINE_HAS_ESOP && prot == PROT_READ) - return -EINVAL; - down_read(&gmap->mm->mmap_sem); - rc = gmap_protect_range(gmap, gaddr, len, prot, PGSTE_IN_BIT); - up_read(&gmap->mm->mmap_sem); return rc; } -EXPORT_SYMBOL_GPL(gmap_mprotect_notify); +EXPORT_SYMBOL_GPL(gmap_protect_one); /** * gmap_read_table - get an unsigned long value from a guest page table using @@ -959,9 +985,10 @@ EXPORT_SYMBOL_GPL(gmap_mprotect_notify); * @val: pointer to the unsigned long value to return * * Returns 0 if the value was read, -ENOMEM if out of memory and -EFAULT - * if reading using the virtual address failed. + * if reading using the virtual address failed. -EINVAL if called on a gmap + * shadow. * - * Called with gmap->mm->mmap_sem in read. + * Called with gmap->mm->mmap_lock in read. */ int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val) { @@ -970,6 +997,9 @@ int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val) pte_t *ptep, pte; int rc; + if (gmap_is_shadow(gmap)) + return -EINVAL; + while (1) { rc = -EAGAIN; ptep = gmap_pte_op_walk(gmap, gaddr, &ptl); @@ -978,12 +1008,12 @@ int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val) if (pte_present(pte) && (pte_val(pte) & _PAGE_READ)) { address = pte_val(pte) & PAGE_MASK; address += gaddr & ~PAGE_MASK; - *val = *(unsigned long *) address; - pte_val(*ptep) |= _PAGE_YOUNG; + *val = *(unsigned long *)__va(address); + set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_YOUNG))); /* Do *NOT* clear the _PAGE_INVALID bit! */ rc = 0; } - gmap_pte_op_end(ptl); + gmap_pte_op_end(ptep, ptl); } if (!rc) break; @@ -1011,6 +1041,7 @@ EXPORT_SYMBOL_GPL(gmap_read_table); static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr, struct gmap_rmap *rmap) { + struct gmap_rmap *temp; void __rcu **slot; BUG_ON(!gmap_is_shadow(sg)); @@ -1018,6 +1049,12 @@ static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr, if (slot) { rmap->next = radix_tree_deref_slot_protected(slot, &sg->guest_table_lock); + for (temp = rmap->next; temp; temp = temp->next) { + if (temp->raddr == rmap->raddr) { + kfree(rmap); + return; + } + } radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap); } else { rmap->next = NULL; @@ -1027,18 +1064,17 @@ static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr, } /** - * gmap_protect_rmap - modify access rights to memory and create an rmap + * gmap_protect_rmap - restrict access rights to memory (RO) and create an rmap * @sg: pointer to the shadow guest address space structure * @raddr: rmap address in the shadow gmap * @paddr: address in the parent guest address space * @len: length of the memory area to protect - * @prot: indicates access rights: none, read-only or read-write * * Returns 0 if successfully protected and the rmap was created, -ENOMEM * if out of memory and -EFAULT if paddr is invalid. */ static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr, - unsigned long paddr, unsigned long len, int prot) + unsigned long paddr, unsigned long len) { struct gmap *parent; struct gmap_rmap *rmap; @@ -1053,11 +1089,11 @@ static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr, vmaddr = __gmap_translate(parent, paddr); if (IS_ERR_VALUE(vmaddr)) return vmaddr; - rmap = kzalloc(sizeof(*rmap), GFP_KERNEL); + rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT); if (!rmap) return -ENOMEM; rmap->raddr = raddr; - rc = radix_tree_preload(GFP_KERNEL); + rc = radix_tree_preload(GFP_KERNEL_ACCOUNT); if (rc) { kfree(rmap); return rc; @@ -1066,17 +1102,17 @@ static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr, ptep = gmap_pte_op_walk(parent, paddr, &ptl); if (ptep) { spin_lock(&sg->guest_table_lock); - rc = ptep_force_prot(parent->mm, paddr, ptep, prot, + rc = ptep_force_prot(parent->mm, paddr, ptep, PROT_READ, PGSTE_VSIE_BIT); if (!rc) gmap_insert_rmap(sg, vmaddr, rmap); spin_unlock(&sg->guest_table_lock); - gmap_pte_op_end(ptl); + gmap_pte_op_end(ptep, ptl); } radix_tree_preload_end(); if (rc) { kfree(rmap); - rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot); + rc = gmap_pte_op_fixup(parent, paddr, vmaddr, PROT_READ); if (rc) return rc; continue; @@ -1107,7 +1143,7 @@ static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr, static inline void gmap_idte_one(unsigned long asce, unsigned long vaddr) { asm volatile( - " .insn rrf,0xb98e0000,%0,%1,0,0" + " idte %0,0,%1" : : "a" (asce), "a" (vaddr) : "cc", "memory"); } @@ -1126,7 +1162,7 @@ static void gmap_unshadow_page(struct gmap *sg, unsigned long raddr) table = gmap_table_walk(sg, raddr, 0); /* get page table pointer */ if (!table || *table & _PAGE_INVALID) return; - gmap_call_notifier(sg, raddr, raddr + (1UL << 12) - 1); + gmap_call_notifier(sg, raddr, raddr + PAGE_SIZE - 1); ptep_unshadow_pte(sg->mm, raddr, (pte_t *) table); } @@ -1144,7 +1180,7 @@ static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr, int i; BUG_ON(!gmap_is_shadow(sg)); - for (i = 0; i < 256; i++, raddr += 1UL << 12) + for (i = 0; i < _PAGE_ENTRIES; i++, raddr += PAGE_SIZE) pgt[i] = _PAGE_INVALID; } @@ -1157,23 +1193,23 @@ static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr, */ static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr) { - unsigned long sto, *ste, *pgt; - struct page *page; + unsigned long *ste; + phys_addr_t sto, pgt; + struct ptdesc *ptdesc; BUG_ON(!gmap_is_shadow(sg)); ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */ if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN)) return; - gmap_call_notifier(sg, raddr, raddr + (1UL << 20) - 1); - sto = (unsigned long) (ste - ((raddr >> 20) & 0x7ff)); + gmap_call_notifier(sg, raddr, raddr + _SEGMENT_SIZE - 1); + sto = __pa(ste - ((raddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT)); gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr); - pgt = (unsigned long *)(*ste & _SEGMENT_ENTRY_ORIGIN); + pgt = *ste & _SEGMENT_ENTRY_ORIGIN; *ste = _SEGMENT_ENTRY_EMPTY; - __gmap_unshadow_pgt(sg, raddr, pgt); + __gmap_unshadow_pgt(sg, raddr, __va(pgt)); /* Free page table */ - page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT); - list_del(&page->lru); - page_table_free_pgste(page); + ptdesc = page_ptdesc(phys_to_page(pgt)); + page_table_free_pgste(ptdesc); } /** @@ -1187,22 +1223,20 @@ static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr) static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr, unsigned long *sgt) { - unsigned long asce, *pgt; - struct page *page; + struct ptdesc *ptdesc; + phys_addr_t pgt; int i; BUG_ON(!gmap_is_shadow(sg)); - asce = (unsigned long) sgt | _ASCE_TYPE_SEGMENT; - for (i = 0; i < 2048; i++, raddr += 1UL << 20) { + for (i = 0; i < _CRST_ENTRIES; i++, raddr += _SEGMENT_SIZE) { if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN)) continue; - pgt = (unsigned long *)(sgt[i] & _REGION_ENTRY_ORIGIN); + pgt = sgt[i] & _REGION_ENTRY_ORIGIN; sgt[i] = _SEGMENT_ENTRY_EMPTY; - __gmap_unshadow_pgt(sg, raddr, pgt); + __gmap_unshadow_pgt(sg, raddr, __va(pgt)); /* Free page table */ - page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT); - list_del(&page->lru); - page_table_free_pgste(page); + ptdesc = page_ptdesc(phys_to_page(pgt)); + page_table_free_pgste(ptdesc); } } @@ -1215,23 +1249,23 @@ static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr, */ static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr) { - unsigned long r3o, *r3e, *sgt; + unsigned long r3o, *r3e; + phys_addr_t sgt; struct page *page; BUG_ON(!gmap_is_shadow(sg)); r3e = gmap_table_walk(sg, raddr, 2); /* get region-3 pointer */ if (!r3e || !(*r3e & _REGION_ENTRY_ORIGIN)) return; - gmap_call_notifier(sg, raddr, raddr + (1UL << 31) - 1); - r3o = (unsigned long) (r3e - ((raddr >> 31) & 0x7ff)); - gmap_idte_one(r3o | _ASCE_TYPE_REGION3, raddr); - sgt = (unsigned long *)(*r3e & _REGION_ENTRY_ORIGIN); + gmap_call_notifier(sg, raddr, raddr + _REGION3_SIZE - 1); + r3o = (unsigned long) (r3e - ((raddr & _REGION3_INDEX) >> _REGION3_SHIFT)); + gmap_idte_one(__pa(r3o) | _ASCE_TYPE_REGION3, raddr); + sgt = *r3e & _REGION_ENTRY_ORIGIN; *r3e = _REGION3_ENTRY_EMPTY; - __gmap_unshadow_sgt(sg, raddr, sgt); + __gmap_unshadow_sgt(sg, raddr, __va(sgt)); /* Free segment table */ - page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT); - list_del(&page->lru); - __free_pages(page, 2); + page = phys_to_page(sgt); + __free_pages(page, CRST_ALLOC_ORDER); } /** @@ -1245,22 +1279,20 @@ static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr) static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr, unsigned long *r3t) { - unsigned long asce, *sgt; struct page *page; + phys_addr_t sgt; int i; BUG_ON(!gmap_is_shadow(sg)); - asce = (unsigned long) r3t | _ASCE_TYPE_REGION3; - for (i = 0; i < 2048; i++, raddr += 1UL << 31) { + for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION3_SIZE) { if (!(r3t[i] & _REGION_ENTRY_ORIGIN)) continue; - sgt = (unsigned long *)(r3t[i] & _REGION_ENTRY_ORIGIN); + sgt = r3t[i] & _REGION_ENTRY_ORIGIN; r3t[i] = _REGION3_ENTRY_EMPTY; - __gmap_unshadow_sgt(sg, raddr, sgt); + __gmap_unshadow_sgt(sg, raddr, __va(sgt)); /* Free segment table */ - page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT); - list_del(&page->lru); - __free_pages(page, 2); + page = phys_to_page(sgt); + __free_pages(page, CRST_ALLOC_ORDER); } } @@ -1273,23 +1305,23 @@ static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr, */ static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr) { - unsigned long r2o, *r2e, *r3t; + unsigned long r2o, *r2e; + phys_addr_t r3t; struct page *page; BUG_ON(!gmap_is_shadow(sg)); r2e = gmap_table_walk(sg, raddr, 3); /* get region-2 pointer */ if (!r2e || !(*r2e & _REGION_ENTRY_ORIGIN)) return; - gmap_call_notifier(sg, raddr, raddr + (1UL << 42) - 1); - r2o = (unsigned long) (r2e - ((raddr >> 42) & 0x7ff)); - gmap_idte_one(r2o | _ASCE_TYPE_REGION2, raddr); - r3t = (unsigned long *)(*r2e & _REGION_ENTRY_ORIGIN); + gmap_call_notifier(sg, raddr, raddr + _REGION2_SIZE - 1); + r2o = (unsigned long) (r2e - ((raddr & _REGION2_INDEX) >> _REGION2_SHIFT)); + gmap_idte_one(__pa(r2o) | _ASCE_TYPE_REGION2, raddr); + r3t = *r2e & _REGION_ENTRY_ORIGIN; *r2e = _REGION2_ENTRY_EMPTY; - __gmap_unshadow_r3t(sg, raddr, r3t); + __gmap_unshadow_r3t(sg, raddr, __va(r3t)); /* Free region 3 table */ - page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT); - list_del(&page->lru); - __free_pages(page, 2); + page = phys_to_page(r3t); + __free_pages(page, CRST_ALLOC_ORDER); } /** @@ -1303,22 +1335,20 @@ static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr) static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr, unsigned long *r2t) { - unsigned long asce, *r3t; + phys_addr_t r3t; struct page *page; int i; BUG_ON(!gmap_is_shadow(sg)); - asce = (unsigned long) r2t | _ASCE_TYPE_REGION2; - for (i = 0; i < 2048; i++, raddr += 1UL << 42) { + for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION2_SIZE) { if (!(r2t[i] & _REGION_ENTRY_ORIGIN)) continue; - r3t = (unsigned long *)(r2t[i] & _REGION_ENTRY_ORIGIN); + r3t = r2t[i] & _REGION_ENTRY_ORIGIN; r2t[i] = _REGION2_ENTRY_EMPTY; - __gmap_unshadow_r3t(sg, raddr, r3t); + __gmap_unshadow_r3t(sg, raddr, __va(r3t)); /* Free region 3 table */ - page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT); - list_del(&page->lru); - __free_pages(page, 2); + page = phys_to_page(r3t); + __free_pages(page, CRST_ALLOC_ORDER); } } @@ -1331,23 +1361,23 @@ static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr, */ static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr) { - unsigned long r1o, *r1e, *r2t; + unsigned long r1o, *r1e; struct page *page; + phys_addr_t r2t; BUG_ON(!gmap_is_shadow(sg)); r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */ if (!r1e || !(*r1e & _REGION_ENTRY_ORIGIN)) return; - gmap_call_notifier(sg, raddr, raddr + (1UL << 53) - 1); - r1o = (unsigned long) (r1e - ((raddr >> 53) & 0x7ff)); - gmap_idte_one(r1o | _ASCE_TYPE_REGION1, raddr); - r2t = (unsigned long *)(*r1e & _REGION_ENTRY_ORIGIN); + gmap_call_notifier(sg, raddr, raddr + _REGION1_SIZE - 1); + r1o = (unsigned long) (r1e - ((raddr & _REGION1_INDEX) >> _REGION1_SHIFT)); + gmap_idte_one(__pa(r1o) | _ASCE_TYPE_REGION1, raddr); + r2t = *r1e & _REGION_ENTRY_ORIGIN; *r1e = _REGION1_ENTRY_EMPTY; - __gmap_unshadow_r2t(sg, raddr, r2t); + __gmap_unshadow_r2t(sg, raddr, __va(r2t)); /* Free region 2 table */ - page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT); - list_del(&page->lru); - __free_pages(page, 2); + page = phys_to_page(r2t); + __free_pages(page, CRST_ALLOC_ORDER); } /** @@ -1361,24 +1391,24 @@ static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr) static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr, unsigned long *r1t) { - unsigned long asce, *r2t; + unsigned long asce; struct page *page; + phys_addr_t r2t; int i; BUG_ON(!gmap_is_shadow(sg)); - asce = (unsigned long) r1t | _ASCE_TYPE_REGION1; - for (i = 0; i < 2048; i++, raddr += 1UL << 53) { + asce = __pa(r1t) | _ASCE_TYPE_REGION1; + for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION1_SIZE) { if (!(r1t[i] & _REGION_ENTRY_ORIGIN)) continue; - r2t = (unsigned long *)(r1t[i] & _REGION_ENTRY_ORIGIN); - __gmap_unshadow_r2t(sg, raddr, r2t); + r2t = r1t[i] & _REGION_ENTRY_ORIGIN; + __gmap_unshadow_r2t(sg, raddr, __va(r2t)); /* Clear entry and flush translation r1t -> r2t */ gmap_idte_one(asce, raddr); r1t[i] = _REGION1_ENTRY_EMPTY; /* Free region 2 table */ - page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT); - list_del(&page->lru); - __free_pages(page, 2); + page = phys_to_page(r2t); + __free_pages(page, CRST_ALLOC_ORDER); } } @@ -1388,7 +1418,7 @@ static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr, * * Called with sg->guest_table_lock */ -static void gmap_unshadow(struct gmap *sg) +void gmap_unshadow(struct gmap *sg) { unsigned long *table; @@ -1398,7 +1428,7 @@ static void gmap_unshadow(struct gmap *sg) sg->removed = 1; gmap_call_notifier(sg, 0, -1UL); gmap_flush_tlb(sg); - table = (unsigned long *)(sg->asce & _ASCE_ORIGIN); + table = __va(sg->asce & _ASCE_ORIGIN); switch (sg->asce & _ASCE_TYPE_MASK) { case _ASCE_TYPE_REGION1: __gmap_unshadow_r1t(sg, 0, table); @@ -1414,141 +1444,7 @@ static void gmap_unshadow(struct gmap *sg) break; } } - -/** - * gmap_find_shadow - find a specific asce in the list of shadow tables - * @parent: pointer to the parent gmap - * @asce: ASCE for which the shadow table is created - * @edat_level: edat level to be used for the shadow translation - * - * Returns the pointer to a gmap if a shadow table with the given asce is - * already available, ERR_PTR(-EAGAIN) if another one is just being created, - * otherwise NULL - */ -static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce, - int edat_level) -{ - struct gmap *sg; - - list_for_each_entry(sg, &parent->children, list) { - if (sg->orig_asce != asce || sg->edat_level != edat_level || - sg->removed) - continue; - if (!sg->initialized) - return ERR_PTR(-EAGAIN); - atomic_inc(&sg->ref_count); - return sg; - } - return NULL; -} - -/** - * gmap_shadow_valid - check if a shadow guest address space matches the - * given properties and is still valid - * @sg: pointer to the shadow guest address space structure - * @asce: ASCE for which the shadow table is requested - * @edat_level: edat level to be used for the shadow translation - * - * Returns 1 if the gmap shadow is still valid and matches the given - * properties, the caller can continue using it. Returns 0 otherwise, the - * caller has to request a new shadow gmap in this case. - * - */ -int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level) -{ - if (sg->removed) - return 0; - return sg->orig_asce == asce && sg->edat_level == edat_level; -} -EXPORT_SYMBOL_GPL(gmap_shadow_valid); - -/** - * gmap_shadow - create/find a shadow guest address space - * @parent: pointer to the parent gmap - * @asce: ASCE for which the shadow table is created - * @edat_level: edat level to be used for the shadow translation - * - * The pages of the top level page table referred by the asce parameter - * will be set to read-only and marked in the PGSTEs of the kvm process. - * The shadow table will be removed automatically on any change to the - * PTE mapping for the source table. - * - * Returns a guest address space structure, ERR_PTR(-ENOMEM) if out of memory, - * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the - * parent gmap table could not be protected. - */ -struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, - int edat_level) -{ - struct gmap *sg, *new; - unsigned long limit; - int rc; - - BUG_ON(gmap_is_shadow(parent)); - spin_lock(&parent->shadow_lock); - sg = gmap_find_shadow(parent, asce, edat_level); - spin_unlock(&parent->shadow_lock); - if (sg) - return sg; - /* Create a new shadow gmap */ - limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11)); - if (asce & _ASCE_REAL_SPACE) - limit = -1UL; - new = gmap_alloc(limit); - if (!new) - return ERR_PTR(-ENOMEM); - new->mm = parent->mm; - new->parent = gmap_get(parent); - new->orig_asce = asce; - new->edat_level = edat_level; - new->initialized = false; - spin_lock(&parent->shadow_lock); - /* Recheck if another CPU created the same shadow */ - sg = gmap_find_shadow(parent, asce, edat_level); - if (sg) { - spin_unlock(&parent->shadow_lock); - gmap_free(new); - return sg; - } - if (asce & _ASCE_REAL_SPACE) { - /* only allow one real-space gmap shadow */ - list_for_each_entry(sg, &parent->children, list) { - if (sg->orig_asce & _ASCE_REAL_SPACE) { - spin_lock(&sg->guest_table_lock); - gmap_unshadow(sg); - spin_unlock(&sg->guest_table_lock); - list_del(&sg->list); - gmap_put(sg); - break; - } - } - } - atomic_set(&new->ref_count, 2); - list_add(&new->list, &parent->children); - if (asce & _ASCE_REAL_SPACE) { - /* nothing to protect, return right away */ - new->initialized = true; - spin_unlock(&parent->shadow_lock); - return new; - } - spin_unlock(&parent->shadow_lock); - /* protect after insertion, so it will get properly invalidated */ - down_read(&parent->mm->mmap_sem); - rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN, - ((asce & _ASCE_TABLE_LENGTH) + 1) * 4096, - PROT_READ, PGSTE_VSIE_BIT); - up_read(&parent->mm->mmap_sem); - spin_lock(&parent->shadow_lock); - new->initialized = true; - if (rc) { - list_del(&new->list); - gmap_free(new); - new = ERR_PTR(rc); - } - spin_unlock(&parent->shadow_lock); - return new; -} -EXPORT_SYMBOL_GPL(gmap_shadow); +EXPORT_SYMBOL(gmap_unshadow); /** * gmap_shadow_r2t - create an empty shadow region 2 table @@ -1560,31 +1456,29 @@ EXPORT_SYMBOL_GPL(gmap_shadow); * The r2t parameter specifies the address of the source table. The * four pages of the source table are made read-only in the parent gmap * address space. A write to the source table area @r2t will automatically - * remove the shadow r2 table and all of its decendents. + * remove the shadow r2 table and all of its descendants. * * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the * shadow table structure is incomplete, -ENOMEM if out of memory and * -EFAULT if an address in the parent gmap could not be resolved. * - * Called with sg->mm->mmap_sem in read. + * Called with sg->mm->mmap_lock in read. */ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, int fake) { unsigned long raddr, origin, offset, len; - unsigned long *s_r2t, *table; + unsigned long *table; + phys_addr_t s_r2t; struct page *page; int rc; BUG_ON(!gmap_is_shadow(sg)); /* Allocate a shadow region second table */ - page = alloc_pages(GFP_KERNEL, 2); + page = gmap_alloc_crst(); if (!page) return -ENOMEM; - page->index = r2t & _REGION_ENTRY_ORIGIN; - if (fake) - page->index |= GMAP_SHADOW_FAKE_TABLE; - s_r2t = (unsigned long *) page_to_phys(page); + s_r2t = page_to_phys(page); /* Install shadow region second table */ spin_lock(&sg->guest_table_lock); table = gmap_table_walk(sg, saddr, 4); /* get region-1 pointer */ @@ -1599,13 +1493,12 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, rc = -EAGAIN; /* Race with shadow */ goto out_free; } - crst_table_init(s_r2t, _REGION2_ENTRY_EMPTY); + crst_table_init(__va(s_r2t), _REGION2_ENTRY_EMPTY); /* mark as invalid as long as the parent table is not protected */ - *table = (unsigned long) s_r2t | _REGION_ENTRY_LENGTH | + *table = s_r2t | _REGION_ENTRY_LENGTH | _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID; if (sg->edat_level >= 1) *table |= (r2t & _REGION_ENTRY_PROTECT); - list_add(&page->lru, &sg->crst_list); if (fake) { /* nothing to protect for fake tables */ *table &= ~_REGION_ENTRY_INVALID; @@ -1614,16 +1507,15 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, } spin_unlock(&sg->guest_table_lock); /* Make r2t read-only in parent gmap page table */ - raddr = (saddr & 0xffe0000000000000UL) | _SHADOW_RMAP_REGION1; + raddr = (saddr & _REGION1_MASK) | _SHADOW_RMAP_REGION1; origin = r2t & _REGION_ENTRY_ORIGIN; - offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * 4096; - len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset; - rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ); + offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE; + len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset; + rc = gmap_protect_rmap(sg, raddr, origin + offset, len); spin_lock(&sg->guest_table_lock); if (!rc) { table = gmap_table_walk(sg, saddr, 4); - if (!table || (*table & _REGION_ENTRY_ORIGIN) != - (unsigned long) s_r2t) + if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r2t) rc = -EAGAIN; /* Race with unshadow */ else *table &= ~_REGION_ENTRY_INVALID; @@ -1634,7 +1526,7 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, return rc; out_free: spin_unlock(&sg->guest_table_lock); - __free_pages(page, 2); + __free_pages(page, CRST_ALLOC_ORDER); return rc; } EXPORT_SYMBOL_GPL(gmap_shadow_r2t); @@ -1650,25 +1542,23 @@ EXPORT_SYMBOL_GPL(gmap_shadow_r2t); * shadow table structure is incomplete, -ENOMEM if out of memory and * -EFAULT if an address in the parent gmap could not be resolved. * - * Called with sg->mm->mmap_sem in read. + * Called with sg->mm->mmap_lock in read. */ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, int fake) { unsigned long raddr, origin, offset, len; - unsigned long *s_r3t, *table; + unsigned long *table; + phys_addr_t s_r3t; struct page *page; int rc; BUG_ON(!gmap_is_shadow(sg)); /* Allocate a shadow region second table */ - page = alloc_pages(GFP_KERNEL, 2); + page = gmap_alloc_crst(); if (!page) return -ENOMEM; - page->index = r3t & _REGION_ENTRY_ORIGIN; - if (fake) - page->index |= GMAP_SHADOW_FAKE_TABLE; - s_r3t = (unsigned long *) page_to_phys(page); + s_r3t = page_to_phys(page); /* Install shadow region second table */ spin_lock(&sg->guest_table_lock); table = gmap_table_walk(sg, saddr, 3); /* get region-2 pointer */ @@ -1681,14 +1571,14 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, goto out_free; } else if (*table & _REGION_ENTRY_ORIGIN) { rc = -EAGAIN; /* Race with shadow */ + goto out_free; } - crst_table_init(s_r3t, _REGION3_ENTRY_EMPTY); + crst_table_init(__va(s_r3t), _REGION3_ENTRY_EMPTY); /* mark as invalid as long as the parent table is not protected */ - *table = (unsigned long) s_r3t | _REGION_ENTRY_LENGTH | + *table = s_r3t | _REGION_ENTRY_LENGTH | _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID; if (sg->edat_level >= 1) *table |= (r3t & _REGION_ENTRY_PROTECT); - list_add(&page->lru, &sg->crst_list); if (fake) { /* nothing to protect for fake tables */ *table &= ~_REGION_ENTRY_INVALID; @@ -1697,16 +1587,15 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, } spin_unlock(&sg->guest_table_lock); /* Make r3t read-only in parent gmap page table */ - raddr = (saddr & 0xfffffc0000000000UL) | _SHADOW_RMAP_REGION2; + raddr = (saddr & _REGION2_MASK) | _SHADOW_RMAP_REGION2; origin = r3t & _REGION_ENTRY_ORIGIN; - offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * 4096; - len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset; - rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ); + offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE; + len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset; + rc = gmap_protect_rmap(sg, raddr, origin + offset, len); spin_lock(&sg->guest_table_lock); if (!rc) { table = gmap_table_walk(sg, saddr, 3); - if (!table || (*table & _REGION_ENTRY_ORIGIN) != - (unsigned long) s_r3t) + if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r3t) rc = -EAGAIN; /* Race with unshadow */ else *table &= ~_REGION_ENTRY_INVALID; @@ -1717,7 +1606,7 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, return rc; out_free: spin_unlock(&sg->guest_table_lock); - __free_pages(page, 2); + __free_pages(page, CRST_ALLOC_ORDER); return rc; } EXPORT_SYMBOL_GPL(gmap_shadow_r3t); @@ -1733,25 +1622,23 @@ EXPORT_SYMBOL_GPL(gmap_shadow_r3t); * shadow table structure is incomplete, -ENOMEM if out of memory and * -EFAULT if an address in the parent gmap could not be resolved. * - * Called with sg->mm->mmap_sem in read. + * Called with sg->mm->mmap_lock in read. */ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, int fake) { unsigned long raddr, origin, offset, len; - unsigned long *s_sgt, *table; + unsigned long *table; + phys_addr_t s_sgt; struct page *page; int rc; BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE)); /* Allocate a shadow segment table */ - page = alloc_pages(GFP_KERNEL, 2); + page = gmap_alloc_crst(); if (!page) return -ENOMEM; - page->index = sgt & _REGION_ENTRY_ORIGIN; - if (fake) - page->index |= GMAP_SHADOW_FAKE_TABLE; - s_sgt = (unsigned long *) page_to_phys(page); + s_sgt = page_to_phys(page); /* Install shadow region second table */ spin_lock(&sg->guest_table_lock); table = gmap_table_walk(sg, saddr, 2); /* get region-3 pointer */ @@ -1766,13 +1653,12 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, rc = -EAGAIN; /* Race with shadow */ goto out_free; } - crst_table_init(s_sgt, _SEGMENT_ENTRY_EMPTY); + crst_table_init(__va(s_sgt), _SEGMENT_ENTRY_EMPTY); /* mark as invalid as long as the parent table is not protected */ - *table = (unsigned long) s_sgt | _REGION_ENTRY_LENGTH | + *table = s_sgt | _REGION_ENTRY_LENGTH | _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID; if (sg->edat_level >= 1) *table |= sgt & _REGION_ENTRY_PROTECT; - list_add(&page->lru, &sg->crst_list); if (fake) { /* nothing to protect for fake tables */ *table &= ~_REGION_ENTRY_INVALID; @@ -1781,16 +1667,15 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, } spin_unlock(&sg->guest_table_lock); /* Make sgt read-only in parent gmap page table */ - raddr = (saddr & 0xffffffff80000000UL) | _SHADOW_RMAP_REGION3; + raddr = (saddr & _REGION3_MASK) | _SHADOW_RMAP_REGION3; origin = sgt & _REGION_ENTRY_ORIGIN; - offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * 4096; - len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset; - rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ); + offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE; + len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset; + rc = gmap_protect_rmap(sg, raddr, origin + offset, len); spin_lock(&sg->guest_table_lock); if (!rc) { table = gmap_table_walk(sg, saddr, 2); - if (!table || (*table & _REGION_ENTRY_ORIGIN) != - (unsigned long) s_sgt) + if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_sgt) rc = -EAGAIN; /* Race with unshadow */ else *table &= ~_REGION_ENTRY_INVALID; @@ -1801,50 +1686,27 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, return rc; out_free: spin_unlock(&sg->guest_table_lock); - __free_pages(page, 2); + __free_pages(page, CRST_ALLOC_ORDER); return rc; } EXPORT_SYMBOL_GPL(gmap_shadow_sgt); -/** - * gmap_shadow_lookup_pgtable - find a shadow page table - * @sg: pointer to the shadow guest address space structure - * @saddr: the address in the shadow aguest address space - * @pgt: parent gmap address of the page table to get shadowed - * @dat_protection: if the pgtable is marked as protected by dat - * @fake: pgt references contiguous guest memory block, not a pgtable - * - * Returns 0 if the shadow page table was found and -EAGAIN if the page - * table was not found. - * - * Called with sg->mm->mmap_sem in read. - */ -int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr, - unsigned long *pgt, int *dat_protection, - int *fake) +static void gmap_pgste_set_pgt_addr(struct ptdesc *ptdesc, unsigned long pgt_addr) { - unsigned long *table; - struct page *page; - int rc; + unsigned long *pgstes = page_to_virt(ptdesc_page(ptdesc)); - BUG_ON(!gmap_is_shadow(sg)); - spin_lock(&sg->guest_table_lock); - table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */ - if (table && !(*table & _SEGMENT_ENTRY_INVALID)) { - /* Shadow page tables are full pages (pte+pgste) */ - page = pfn_to_page(*table >> PAGE_SHIFT); - *pgt = page->index & ~GMAP_SHADOW_FAKE_TABLE; - *dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT); - *fake = !!(page->index & GMAP_SHADOW_FAKE_TABLE); - rc = 0; - } else { - rc = -EAGAIN; - } - spin_unlock(&sg->guest_table_lock); - return rc; + pgstes += _PAGE_ENTRIES; + + pgstes[0] &= ~PGSTE_ST2_MASK; + pgstes[1] &= ~PGSTE_ST2_MASK; + pgstes[2] &= ~PGSTE_ST2_MASK; + pgstes[3] &= ~PGSTE_ST2_MASK; + pgstes[0] |= (pgt_addr >> 16) & PGSTE_ST2_MASK; + pgstes[1] |= pgt_addr & PGSTE_ST2_MASK; + pgstes[2] |= (pgt_addr << 16) & PGSTE_ST2_MASK; + pgstes[3] |= (pgt_addr << 32) & PGSTE_ST2_MASK; } -EXPORT_SYMBOL_GPL(gmap_shadow_pgt_lookup); /** * gmap_shadow_pgt - instantiate a shadow page table @@ -1857,25 +1719,27 @@ EXPORT_SYMBOL_GPL(gmap_shadow_pgt_lookup); * shadow table structure is incomplete, -ENOMEM if out of memory, * -EFAULT if an address in the parent gmap could not be resolved and * - * Called with gmap->mm->mmap_sem in read + * Called with gmap->mm->mmap_lock in read */ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, int fake) { unsigned long raddr, origin; - unsigned long *s_pgt, *table; - struct page *page; + unsigned long *table; + struct ptdesc *ptdesc; + phys_addr_t s_pgt; int rc; BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE)); /* Allocate a shadow page table */ - page = page_table_alloc_pgste(sg->mm); - if (!page) + ptdesc = page_table_alloc_pgste(sg->mm); + if (!ptdesc) return -ENOMEM; - page->index = pgt & _SEGMENT_ENTRY_ORIGIN; + origin = pgt & _SEGMENT_ENTRY_ORIGIN; if (fake) - page->index |= GMAP_SHADOW_FAKE_TABLE; - s_pgt = (unsigned long *) page_to_phys(page); + origin |= GMAP_SHADOW_FAKE_TABLE; + gmap_pgste_set_pgt_addr(ptdesc, origin); + s_pgt = page_to_phys(ptdesc_page(ptdesc)); /* Install shadow page table */ spin_lock(&sg->guest_table_lock); table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */ @@ -1893,7 +1757,6 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, /* mark as invalid as long as the parent table is not protected */ *table = (unsigned long) s_pgt | _SEGMENT_ENTRY | (pgt & _SEGMENT_ENTRY_PROTECT) | _SEGMENT_ENTRY_INVALID; - list_add(&page->lru, &sg->pt_list); if (fake) { /* nothing to protect for fake tables */ *table &= ~_SEGMENT_ENTRY_INVALID; @@ -1902,14 +1765,13 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, } spin_unlock(&sg->guest_table_lock); /* Make pgt read-only in parent gmap page table (not the pgste) */ - raddr = (saddr & 0xfffffffffff00000UL) | _SHADOW_RMAP_SEGMENT; + raddr = (saddr & _SEGMENT_MASK) | _SHADOW_RMAP_SEGMENT; origin = pgt & _SEGMENT_ENTRY_ORIGIN & PAGE_MASK; - rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE, PROT_READ); + rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE); spin_lock(&sg->guest_table_lock); if (!rc) { table = gmap_table_walk(sg, saddr, 1); - if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) != - (unsigned long) s_pgt) + if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) != s_pgt) rc = -EAGAIN; /* Race with unshadow */ else *table &= ~_SEGMENT_ENTRY_INVALID; @@ -1920,7 +1782,7 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, return rc; out_free: spin_unlock(&sg->guest_table_lock); - page_table_free_pgste(page); + page_table_free_pgste(ptdesc); return rc; } @@ -1936,7 +1798,7 @@ EXPORT_SYMBOL_GPL(gmap_shadow_pgt); * shadow table structure is incomplete, -ENOMEM if out of memory and * -EFAULT if an address in the parent gmap could not be resolved. * - * Called with sg->mm->mmap_sem in read. + * Called with sg->mm->mmap_lock in read. */ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte) { @@ -1952,7 +1814,7 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte) parent = sg->parent; prot = (pte_val(pte) & _PAGE_PROTECT) ? PROT_READ : PROT_WRITE; - rmap = kzalloc(sizeof(*rmap), GFP_KERNEL); + rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT); if (!rmap) return -ENOMEM; rmap->raddr = (saddr & PAGE_MASK) | _SHADOW_RMAP_PGTABLE; @@ -1964,7 +1826,7 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte) rc = vmaddr; break; } - rc = radix_tree_preload(GFP_KERNEL); + rc = radix_tree_preload(GFP_KERNEL_ACCOUNT); if (rc) break; rc = -EAGAIN; @@ -1975,7 +1837,7 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte) tptep = (pte_t *) gmap_table_walk(sg, saddr, 0); if (!tptep) { spin_unlock(&sg->guest_table_lock); - gmap_pte_op_end(ptl); + gmap_pte_op_end(sptep, ptl); radix_tree_preload_end(); break; } @@ -1986,7 +1848,7 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte) rmap = NULL; rc = 0; } - gmap_pte_op_end(ptl); + gmap_pte_op_end(sptep, ptl); spin_unlock(&sg->guest_table_lock); } radix_tree_preload_end(); @@ -2001,13 +1863,13 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte) } EXPORT_SYMBOL_GPL(gmap_shadow_page); -/** +/* * gmap_shadow_notify - handle notifications for shadow gmap * * Called with sg->parent->shadow_lock. */ static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr, - unsigned long gaddr, pte_t *pte) + unsigned long gaddr) { struct gmap_rmap *rmap, *rnext, *head; unsigned long start, end, bits, raddr; @@ -2021,7 +1883,7 @@ static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr, } /* Check for top level table */ start = sg->orig_asce & _ASCE_ORIGIN; - end = start + ((sg->orig_asce & _ASCE_TABLE_LENGTH) + 1) * 4096; + end = start + ((sg->orig_asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE; if (!(sg->orig_asce & _ASCE_REAL_SPACE) && gaddr >= start && gaddr < end) { /* The complete shadow table has to go */ @@ -2032,7 +1894,7 @@ static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr, return; } /* Remove the page table tree from on specific entry */ - head = radix_tree_delete(&sg->host_to_rmap, vmaddr >> 12); + head = radix_tree_delete(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT); gmap_for_each_rmap_safe(rmap, rnext, head) { bits = rmap->raddr & _SHADOW_RMAP_MASK; raddr = rmap->raddr ^ bits; @@ -2061,7 +1923,7 @@ static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr, /** * ptep_notify - call all invalidation callbacks for a specific pte. * @mm: pointer to the process mm_struct - * @addr: virtual address in the process address space + * @vmaddr: virtual address in the process address space * @pte: pointer to the page table entry * @bits: bits from the pgste that caused the notify call * @@ -2072,27 +1934,23 @@ void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, pte_t *pte, unsigned long bits) { unsigned long offset, gaddr = 0; - unsigned long *table; struct gmap *gmap, *sg, *next; offset = ((unsigned long) pte) & (255 * sizeof(pte_t)); - offset = offset * (4096 / sizeof(pte_t)); + offset = offset * (PAGE_SIZE / sizeof(pte_t)); rcu_read_lock(); list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { spin_lock(&gmap->guest_table_lock); - table = radix_tree_lookup(&gmap->host_to_guest, - vmaddr >> PMD_SHIFT); - if (table) - gaddr = __gmap_segment_gaddr(table) + offset; + gaddr = host_to_guest_lookup(gmap, vmaddr) + offset; spin_unlock(&gmap->guest_table_lock); - if (!table) + if (!IS_GADDR_VALID(gaddr)) continue; if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) { spin_lock(&gmap->shadow_lock); list_for_each_entry_safe(sg, next, &gmap->children, list) - gmap_shadow_notify(sg, vmaddr, gaddr, pte); + gmap_shadow_notify(sg, vmaddr, gaddr); spin_unlock(&gmap->shadow_lock); } if (bits & PGSTE_IN_BIT) @@ -2102,23 +1960,236 @@ void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, } EXPORT_SYMBOL_GPL(ptep_notify); -static inline void thp_split_mm(struct mm_struct *mm) +static void pmdp_notify_gmap(struct gmap *gmap, pmd_t *pmdp, + unsigned long gaddr) +{ + set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN))); + gmap_call_notifier(gmap, gaddr, gaddr + HPAGE_SIZE - 1); +} + +/** + * gmap_pmdp_xchg - exchange a gmap pmd with another + * @gmap: pointer to the guest address space structure + * @pmdp: pointer to the pmd entry + * @new: replacement entry + * @gaddr: the affected guest address + * + * This function is assumed to be called with the guest_table_lock + * held. + */ +static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new, + unsigned long gaddr) +{ + gaddr &= HPAGE_MASK; + pmdp_notify_gmap(gmap, pmdp, gaddr); + new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_GMAP_IN)); + if (machine_has_tlb_guest()) + __pmdp_idte(gaddr, (pmd_t *)pmdp, IDTE_GUEST_ASCE, gmap->asce, + IDTE_GLOBAL); + else + __pmdp_idte(gaddr, (pmd_t *)pmdp, 0, 0, IDTE_GLOBAL); + set_pmd(pmdp, new); +} + +static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr, + int purge) +{ + pmd_t *pmdp; + struct gmap *gmap; + unsigned long gaddr; + + rcu_read_lock(); + list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { + spin_lock(&gmap->guest_table_lock); + pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr); + if (pmdp) { + pmdp_notify_gmap(gmap, pmdp, gaddr); + WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | + _SEGMENT_ENTRY_GMAP_UC | + _SEGMENT_ENTRY)); + if (purge) + __pmdp_cspg(pmdp); + set_pmd(pmdp, __pmd(_SEGMENT_ENTRY_EMPTY)); + } + spin_unlock(&gmap->guest_table_lock); + } + rcu_read_unlock(); +} + +/** + * gmap_pmdp_invalidate - invalidate all affected guest pmd entries without + * flushing + * @mm: pointer to the process mm_struct + * @vmaddr: virtual address in the process address space + */ +void gmap_pmdp_invalidate(struct mm_struct *mm, unsigned long vmaddr) +{ + gmap_pmdp_clear(mm, vmaddr, 0); +} +EXPORT_SYMBOL_GPL(gmap_pmdp_invalidate); + +/** + * gmap_pmdp_idte_local - invalidate and clear a guest pmd entry + * @mm: pointer to the process mm_struct + * @vmaddr: virtual address in the process address space + */ +void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr) +{ + unsigned long gaddr; + struct gmap *gmap; + pmd_t *pmdp; + + rcu_read_lock(); + list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { + spin_lock(&gmap->guest_table_lock); + pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr); + if (pmdp) { + pmdp_notify_gmap(gmap, pmdp, gaddr); + WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | + _SEGMENT_ENTRY_GMAP_UC | + _SEGMENT_ENTRY)); + if (machine_has_tlb_guest()) + __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE, + gmap->asce, IDTE_LOCAL); + else + __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_LOCAL); + *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY); + } + spin_unlock(&gmap->guest_table_lock); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(gmap_pmdp_idte_local); + +/** + * gmap_pmdp_idte_global - invalidate and clear a guest pmd entry + * @mm: pointer to the process mm_struct + * @vmaddr: virtual address in the process address space + */ +void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr) +{ + unsigned long gaddr; + struct gmap *gmap; + pmd_t *pmdp; + + rcu_read_lock(); + list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { + spin_lock(&gmap->guest_table_lock); + pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr); + if (pmdp) { + pmdp_notify_gmap(gmap, pmdp, gaddr); + WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | + _SEGMENT_ENTRY_GMAP_UC | + _SEGMENT_ENTRY)); + if (machine_has_tlb_guest()) + __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE, + gmap->asce, IDTE_GLOBAL); + else + __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_GLOBAL); + *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY); + } + spin_unlock(&gmap->guest_table_lock); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(gmap_pmdp_idte_global); + +/** + * gmap_test_and_clear_dirty_pmd - test and reset segment dirty status + * @gmap: pointer to guest address space + * @pmdp: pointer to the pmd to be tested + * @gaddr: virtual address in the guest address space + * + * This function is assumed to be called with the guest_table_lock + * held. + */ +static bool gmap_test_and_clear_dirty_pmd(struct gmap *gmap, pmd_t *pmdp, + unsigned long gaddr) +{ + if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID) + return false; + + /* Already protected memory, which did not change is clean */ + if (pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT && + !(pmd_val(*pmdp) & _SEGMENT_ENTRY_GMAP_UC)) + return false; + + /* Clear UC indication and reset protection */ + set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_UC))); + gmap_protect_pmd(gmap, gaddr, pmdp, PROT_READ, 0); + return true; +} + +/** + * gmap_sync_dirty_log_pmd - set bitmap based on dirty status of segment + * @gmap: pointer to guest address space + * @bitmap: dirty bitmap for this pmd + * @gaddr: virtual address in the guest address space + * @vmaddr: virtual address in the host address space + * + * This function is assumed to be called with the guest_table_lock + * held. + */ +void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4], + unsigned long gaddr, unsigned long vmaddr) { + int i; + pmd_t *pmdp; + pte_t *ptep; + spinlock_t *ptl; + + pmdp = gmap_pmd_op_walk(gmap, gaddr); + if (!pmdp) + return; + + if (pmd_leaf(*pmdp)) { + if (gmap_test_and_clear_dirty_pmd(gmap, pmdp, gaddr)) + bitmap_fill(bitmap, _PAGE_ENTRIES); + } else { + for (i = 0; i < _PAGE_ENTRIES; i++, vmaddr += PAGE_SIZE) { + ptep = pte_alloc_map_lock(gmap->mm, pmdp, vmaddr, &ptl); + if (!ptep) + continue; + if (ptep_test_and_clear_uc(gmap->mm, vmaddr, ptep)) + set_bit(i, bitmap); + pte_unmap_unlock(ptep, ptl); + } + } + gmap_pmd_op_end(gmap, pmdp); +} +EXPORT_SYMBOL_GPL(gmap_sync_dirty_log_pmd); + #ifdef CONFIG_TRANSPARENT_HUGEPAGE +static int thp_split_walk_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + struct vm_area_struct *vma = walk->vma; + + split_huge_pmd(vma, pmd, addr); + return 0; +} + +static const struct mm_walk_ops thp_split_walk_ops = { + .pmd_entry = thp_split_walk_pmd_entry, + .walk_lock = PGWALK_WRLOCK_VERIFY, +}; + +static inline void thp_split_mm(struct mm_struct *mm) +{ struct vm_area_struct *vma; - unsigned long addr; - - for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) { - for (addr = vma->vm_start; - addr < vma->vm_end; - addr += PAGE_SIZE) - follow_page(vma, addr, FOLL_SPLIT); - vma->vm_flags &= ~VM_HUGEPAGE; - vma->vm_flags |= VM_NOHUGEPAGE; + VMA_ITERATOR(vmi, mm, 0); + + for_each_vma(vmi, vma) { + vm_flags_mod(vma, VM_NOHUGEPAGE, VM_HUGEPAGE); + walk_page_vma(vma, &thp_split_walk_ops, NULL); } mm->def_flags |= VM_NOHUGEPAGE; -#endif } +#else +static inline void thp_split_mm(struct mm_struct *mm) +{ +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* * switch on pgstes for its userspace process (for kvm) @@ -2130,14 +2201,11 @@ int s390_enable_sie(void) /* Do we have pgstes? if yes, we are done */ if (mm_has_pgste(mm)) return 0; - /* Fail if the page tables are 2K */ - if (!mm_alloc_pgste(mm)) - return -EINVAL; - down_write(&mm->mmap_sem); + mmap_write_lock(mm); mm->context.has_pgste = 1; /* split thp mappings and disable thp for future mappings */ thp_split_mm(mm); - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); return 0; } EXPORT_SYMBOL_GPL(s390_enable_sie); @@ -2146,48 +2214,78 @@ EXPORT_SYMBOL_GPL(s390_enable_sie); * Enable storage key handling from now on and initialize the storage * keys with the default key. */ -static int __s390_enable_skey(pte_t *pte, unsigned long addr, - unsigned long next, struct mm_walk *walk) +static int __s390_enable_skey_pte(pte_t *pte, unsigned long addr, + unsigned long next, struct mm_walk *walk) { - /* - * Remove all zero page mappings, - * after establishing a policy to forbid zero page mappings - * following faults for that page will get fresh anonymous pages - */ - if (is_zero_pfn(pte_pfn(*pte))) - ptep_xchg_direct(walk->mm, addr, pte, __pte(_PAGE_INVALID)); /* Clear storage key */ ptep_zap_key(walk->mm, addr, pte); return 0; } +/* + * Give a chance to schedule after setting a key to 256 pages. + * We only hold the mm lock, which is a rwsem and the kvm srcu. + * Both can sleep. + */ +static int __s390_enable_skey_pmd(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + cond_resched(); + return 0; +} + +static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr, + unsigned long hmask, unsigned long next, + struct mm_walk *walk) +{ + pmd_t *pmd = (pmd_t *)pte; + unsigned long start, end; + struct folio *folio = page_folio(pmd_page(*pmd)); + + /* + * The write check makes sure we do not set a key on shared + * memory. This is needed as the walker does not differentiate + * between actual guest memory and the process executable or + * shared libraries. + */ + if (pmd_val(*pmd) & _SEGMENT_ENTRY_INVALID || + !(pmd_val(*pmd) & _SEGMENT_ENTRY_WRITE)) + return 0; + + start = pmd_val(*pmd) & HPAGE_MASK; + end = start + HPAGE_SIZE; + __storage_key_init_range(start, end); + set_bit(PG_arch_1, &folio->flags.f); + cond_resched(); + return 0; +} + +static const struct mm_walk_ops enable_skey_walk_ops = { + .hugetlb_entry = __s390_enable_skey_hugetlb, + .pte_entry = __s390_enable_skey_pte, + .pmd_entry = __s390_enable_skey_pmd, + .walk_lock = PGWALK_WRLOCK, +}; + int s390_enable_skey(void) { - struct mm_walk walk = { .pte_entry = __s390_enable_skey }; struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; int rc = 0; - down_write(&mm->mmap_sem); - if (mm_use_skey(mm)) + mmap_write_lock(mm); + if (mm_uses_skeys(mm)) goto out_up; - mm->context.use_skey = 1; - for (vma = mm->mmap; vma; vma = vma->vm_next) { - if (ksm_madvise(vma, vma->vm_start, vma->vm_end, - MADV_UNMERGEABLE, &vma->vm_flags)) { - mm->context.use_skey = 0; - rc = -ENOMEM; - goto out_up; - } + mm->context.uses_skeys = 1; + rc = gmap_helper_disable_cow_sharing(); + if (rc) { + mm->context.uses_skeys = 0; + goto out_up; } - mm->def_flags &= ~VM_MERGEABLE; - - walk.mm = mm; - walk_page_range(0, TASK_SIZE, &walk); + walk_page_range(mm, 0, TASK_SIZE, &enable_skey_walk_ops, NULL); out_up: - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); return rc; } EXPORT_SYMBOL_GPL(s390_enable_skey); @@ -2202,13 +2300,137 @@ static int __s390_reset_cmma(pte_t *pte, unsigned long addr, return 0; } +static const struct mm_walk_ops reset_cmma_walk_ops = { + .pte_entry = __s390_reset_cmma, + .walk_lock = PGWALK_WRLOCK, +}; + void s390_reset_cmma(struct mm_struct *mm) { - struct mm_walk walk = { .pte_entry = __s390_reset_cmma }; - - down_write(&mm->mmap_sem); - walk.mm = mm; - walk_page_range(0, TASK_SIZE, &walk); - up_write(&mm->mmap_sem); + mmap_write_lock(mm); + walk_page_range(mm, 0, TASK_SIZE, &reset_cmma_walk_ops, NULL); + mmap_write_unlock(mm); } EXPORT_SYMBOL_GPL(s390_reset_cmma); + +#define GATHER_GET_PAGES 32 + +struct reset_walk_state { + unsigned long next; + unsigned long count; + unsigned long pfns[GATHER_GET_PAGES]; +}; + +static int s390_gather_pages(pte_t *ptep, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct reset_walk_state *p = walk->private; + pte_t pte = READ_ONCE(*ptep); + + if (pte_present(pte)) { + /* we have a reference from the mapping, take an extra one */ + get_page(phys_to_page(pte_val(pte))); + p->pfns[p->count] = phys_to_pfn(pte_val(pte)); + p->next = next; + p->count++; + } + return p->count >= GATHER_GET_PAGES; +} + +static const struct mm_walk_ops gather_pages_ops = { + .pte_entry = s390_gather_pages, + .walk_lock = PGWALK_RDLOCK, +}; + +/* + * Call the Destroy secure page UVC on each page in the given array of PFNs. + * Each page needs to have an extra reference, which will be released here. + */ +void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns) +{ + struct folio *folio; + unsigned long i; + + for (i = 0; i < count; i++) { + folio = pfn_folio(pfns[i]); + /* we always have an extra reference */ + uv_destroy_folio(folio); + /* get rid of the extra reference */ + folio_put(folio); + cond_resched(); + } +} +EXPORT_SYMBOL_GPL(s390_uv_destroy_pfns); + +/** + * __s390_uv_destroy_range - Call the destroy secure page UVC on each page + * in the given range of the given address space. + * @mm: the mm to operate on + * @start: the start of the range + * @end: the end of the range + * @interruptible: if not 0, stop when a fatal signal is received + * + * Walk the given range of the given address space and call the destroy + * secure page UVC on each page. Optionally exit early if a fatal signal is + * pending. + * + * Return: 0 on success, -EINTR if the function stopped before completing + */ +int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start, + unsigned long end, bool interruptible) +{ + struct reset_walk_state state = { .next = start }; + int r = 1; + + while (r > 0) { + state.count = 0; + mmap_read_lock(mm); + r = walk_page_range(mm, state.next, end, &gather_pages_ops, &state); + mmap_read_unlock(mm); + cond_resched(); + s390_uv_destroy_pfns(state.count, state.pfns); + if (interruptible && fatal_signal_pending(current)) + return -EINTR; + } + return 0; +} +EXPORT_SYMBOL_GPL(__s390_uv_destroy_range); + +/** + * s390_replace_asce - Try to replace the current ASCE of a gmap with a copy + * @gmap: the gmap whose ASCE needs to be replaced + * + * If the ASCE is a SEGMENT type then this function will return -EINVAL, + * otherwise the pointers in the host_to_guest radix tree will keep pointing + * to the wrong pages, causing use-after-free and memory corruption. + * If the allocation of the new top level page table fails, the ASCE is not + * replaced. + * In any case, the old ASCE is always removed from the gmap CRST list. + * Therefore the caller has to make sure to save a pointer to it + * beforehand, unless a leak is actually intended. + */ +int s390_replace_asce(struct gmap *gmap) +{ + unsigned long asce; + struct page *page; + void *table; + + /* Replacing segment type ASCEs would cause serious issues */ + if ((gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT) + return -EINVAL; + + page = gmap_alloc_crst(); + if (!page) + return -ENOMEM; + table = page_to_virt(page); + memcpy(table, gmap->table, 1UL << (CRST_ALLOC_ORDER + PAGE_SHIFT)); + + /* Set new table origin while preserving existing ASCE control bits */ + asce = (gmap->asce & ~_ASCE_ORIGIN) | __pa(table); + WRITE_ONCE(gmap->asce, asce); + WRITE_ONCE(gmap->mm->context.gmap_asce, asce); + WRITE_ONCE(gmap->table, table); + + return 0; +} +EXPORT_SYMBOL_GPL(s390_replace_asce); |
