summaryrefslogtreecommitdiff
path: root/arch/s390/mm/gmap.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/s390/mm/gmap.c')
-rw-r--r--arch/s390/mm/gmap.c862
1 files changed, 214 insertions, 648 deletions
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 74e1d873dce0..dd85bcca817d 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -8,6 +8,8 @@
* Janosch Frank <frankja@linux.vnet.ibm.com>
*/
+#include <linux/cpufeature.h>
+#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/pagewalk.h>
#include <linux/swap.h>
@@ -18,20 +20,43 @@
#include <linux/ksm.h>
#include <linux/mman.h>
#include <linux/pgtable.h>
-
+#include <asm/page-states.h>
#include <asm/pgalloc.h>
+#include <asm/machine.h>
+#include <asm/gmap_helpers.h>
#include <asm/gmap.h>
-#include <asm/tlb.h>
+#include <asm/page.h>
+
+/*
+ * The address is saved in a radix tree directly; NULL would be ambiguous,
+ * since 0 is a valid address, and NULL is returned when nothing was found.
+ * The lower bits are ignored by all users of the macro, so it can be used
+ * to distinguish a valid address 0 from a NULL.
+ */
+#define VALID_GADDR_FLAG 1
+#define IS_GADDR_VALID(gaddr) ((gaddr) & VALID_GADDR_FLAG)
+#define MAKE_VALID_GADDR(gaddr) (((gaddr) & HPAGE_MASK) | VALID_GADDR_FLAG)
#define GMAP_SHADOW_FAKE_TABLE 1ULL
+static struct page *gmap_alloc_crst(void)
+{
+ struct page *page;
+
+ page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
+ if (!page)
+ return NULL;
+ __arch_set_page_dat(page_to_virt(page), 1UL << CRST_ALLOC_ORDER);
+ return page;
+}
+
/**
* gmap_alloc - allocate and initialize a guest address space
* @limit: maximum address of the gmap address space
*
* Returns a guest address space structure.
*/
-static struct gmap *gmap_alloc(unsigned long limit)
+struct gmap *gmap_alloc(unsigned long limit)
{
struct gmap *gmap;
struct page *page;
@@ -58,20 +83,16 @@ static struct gmap *gmap_alloc(unsigned long limit)
gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL_ACCOUNT);
if (!gmap)
goto out;
- INIT_LIST_HEAD(&gmap->crst_list);
INIT_LIST_HEAD(&gmap->children);
- INIT_LIST_HEAD(&gmap->pt_list);
INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL_ACCOUNT);
INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC | __GFP_ACCOUNT);
INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC | __GFP_ACCOUNT);
spin_lock_init(&gmap->guest_table_lock);
spin_lock_init(&gmap->shadow_lock);
refcount_set(&gmap->ref_count, 1);
- page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
+ page = gmap_alloc_crst();
if (!page)
goto out_free;
- page->index = 0;
- list_add(&page->lru, &gmap->crst_list);
table = page_to_virt(page);
crst_table_init(table, etype);
gmap->table = table;
@@ -85,6 +106,7 @@ out_free:
out:
return NULL;
}
+EXPORT_SYMBOL_GPL(gmap_alloc);
/**
* gmap_create - create a guest address space
@@ -116,10 +138,7 @@ EXPORT_SYMBOL_GPL(gmap_create);
static void gmap_flush_tlb(struct gmap *gmap)
{
- if (MACHINE_HAS_IDTE)
- __tlb_flush_idte(gmap->asce);
- else
- __tlb_flush_global();
+ __tlb_flush_idte(gmap->asce);
}
static void gmap_radix_tree_free(struct radix_tree_root *root)
@@ -173,30 +192,46 @@ static void gmap_rmap_radix_tree_free(struct radix_tree_root *root)
} while (nr > 0);
}
+static void gmap_free_crst(unsigned long *table, bool free_ptes)
+{
+ bool is_segment = (table[0] & _SEGMENT_ENTRY_TYPE_MASK) == 0;
+ int i;
+
+ if (is_segment) {
+ if (!free_ptes)
+ goto out;
+ for (i = 0; i < _CRST_ENTRIES; i++)
+ if (!(table[i] & _SEGMENT_ENTRY_INVALID))
+ page_table_free_pgste(page_ptdesc(phys_to_page(table[i])));
+ } else {
+ for (i = 0; i < _CRST_ENTRIES; i++)
+ if (!(table[i] & _REGION_ENTRY_INVALID))
+ gmap_free_crst(__va(table[i] & PAGE_MASK), free_ptes);
+ }
+
+out:
+ free_pages((unsigned long)table, CRST_ALLOC_ORDER);
+}
+
/**
* gmap_free - free a guest address space
* @gmap: pointer to the guest address space structure
*
* No locks required. There are no references to this gmap anymore.
*/
-static void gmap_free(struct gmap *gmap)
+void gmap_free(struct gmap *gmap)
{
- struct page *page, *next;
-
/* Flush tlb of all gmaps (if not already done for shadows) */
if (!(gmap_is_shadow(gmap) && gmap->removed))
gmap_flush_tlb(gmap);
/* Free all segment & region tables. */
- list_for_each_entry_safe(page, next, &gmap->crst_list, lru)
- __free_pages(page, CRST_ALLOC_ORDER);
+ gmap_free_crst(gmap->table, gmap_is_shadow(gmap));
+
gmap_radix_tree_free(&gmap->guest_to_host);
gmap_radix_tree_free(&gmap->host_to_guest);
/* Free additional data for a shadow gmap */
if (gmap_is_shadow(gmap)) {
- /* Free all page tables. */
- list_for_each_entry_safe(page, next, &gmap->pt_list, lru)
- page_table_free_pgste(page);
gmap_rmap_radix_tree_free(&gmap->host_to_rmap);
/* Release reference to the parent */
gmap_put(gmap->parent);
@@ -204,6 +239,7 @@ static void gmap_free(struct gmap *gmap)
kfree(gmap);
}
+EXPORT_SYMBOL_GPL(gmap_free);
/**
* gmap_get - increase reference counter for guest address space
@@ -267,37 +303,6 @@ void gmap_remove(struct gmap *gmap)
}
EXPORT_SYMBOL_GPL(gmap_remove);
-/**
- * gmap_enable - switch primary space to the guest address space
- * @gmap: pointer to the guest address space structure
- */
-void gmap_enable(struct gmap *gmap)
-{
- S390_lowcore.gmap = (unsigned long) gmap;
-}
-EXPORT_SYMBOL_GPL(gmap_enable);
-
-/**
- * gmap_disable - switch back to the standard primary address space
- * @gmap: pointer to the guest address space structure
- */
-void gmap_disable(struct gmap *gmap)
-{
- S390_lowcore.gmap = 0UL;
-}
-EXPORT_SYMBOL_GPL(gmap_disable);
-
-/**
- * gmap_get_enabled - get a pointer to the currently enabled gmap
- *
- * Returns a pointer to the currently enabled gmap. 0 if none is enabled.
- */
-struct gmap *gmap_get_enabled(void)
-{
- return (struct gmap *) S390_lowcore.gmap;
-}
-EXPORT_SYMBOL_GPL(gmap_get_enabled);
-
/*
* gmap_alloc_table is assumed to be called with mmap_lock held
*/
@@ -308,17 +313,15 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
unsigned long *new;
/* since we dont free the gmap table until gmap_free we can unlock */
- page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
+ page = gmap_alloc_crst();
if (!page)
return -ENOMEM;
new = page_to_virt(page);
crst_table_init(new, init);
spin_lock(&gmap->guest_table_lock);
if (*table & _REGION_ENTRY_INVALID) {
- list_add(&page->lru, &gmap->crst_list);
*table = __pa(new) | _REGION_ENTRY_LENGTH |
(*table & _REGION_ENTRY_TYPE_MASK);
- page->index = gaddr;
page = NULL;
}
spin_unlock(&gmap->guest_table_lock);
@@ -327,21 +330,23 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
return 0;
}
-/**
- * __gmap_segment_gaddr - find virtual address from segment pointer
- * @entry: pointer to a segment table entry in the guest address space
- *
- * Returns the virtual address in the guest address space for the segment
- */
-static unsigned long __gmap_segment_gaddr(unsigned long *entry)
+static unsigned long host_to_guest_lookup(struct gmap *gmap, unsigned long vmaddr)
{
- struct page *page;
- unsigned long offset;
+ return (unsigned long)radix_tree_lookup(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
+}
- offset = (unsigned long) entry / sizeof(unsigned long);
- offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE;
- page = pmd_pgtable_page((pmd_t *) entry);
- return page->index + offset;
+static unsigned long host_to_guest_delete(struct gmap *gmap, unsigned long vmaddr)
+{
+ return (unsigned long)radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
+}
+
+static pmd_t *host_to_guest_pmd_delete(struct gmap *gmap, unsigned long vmaddr,
+ unsigned long *gaddr)
+{
+ *gaddr = host_to_guest_delete(gmap, vmaddr);
+ if (IS_GADDR_VALID(*gaddr))
+ return (pmd_t *)gmap_table_walk(gmap, *gaddr, 1);
+ return NULL;
}
/**
@@ -353,16 +358,19 @@ static unsigned long __gmap_segment_gaddr(unsigned long *entry)
*/
static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr)
{
- unsigned long *entry;
+ unsigned long gaddr;
int flush = 0;
+ pmd_t *pmdp;
BUG_ON(gmap_is_shadow(gmap));
spin_lock(&gmap->guest_table_lock);
- entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
- if (entry) {
- flush = (*entry != _SEGMENT_ENTRY_EMPTY);
- *entry = _SEGMENT_ENTRY_EMPTY;
+
+ pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr);
+ if (pmdp) {
+ flush = (pmd_val(*pmdp) != _SEGMENT_ENTRY_EMPTY);
+ *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY);
}
+
spin_unlock(&gmap->guest_table_lock);
return flush;
}
@@ -481,26 +489,6 @@ unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
EXPORT_SYMBOL_GPL(__gmap_translate);
/**
- * gmap_translate - translate a guest address to a user space address
- * @gmap: pointer to guest mapping meta data structure
- * @gaddr: guest address
- *
- * Returns user space address which corresponds to the guest address or
- * -EFAULT if no such mapping exists.
- * This function does not establish potentially missing page table entries.
- */
-unsigned long gmap_translate(struct gmap *gmap, unsigned long gaddr)
-{
- unsigned long rc;
-
- mmap_read_lock(gmap->mm);
- rc = __gmap_translate(gmap, gaddr);
- mmap_read_unlock(gmap->mm);
- return rc;
-}
-EXPORT_SYMBOL_GPL(gmap_translate);
-
-/**
* gmap_unlink - disconnect a page table from the gmap shadow tables
* @mm: pointer to the parent mm_struct
* @table: pointer to the host page table
@@ -584,12 +572,12 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
pud = pud_offset(p4d, vmaddr);
VM_BUG_ON(pud_none(*pud));
/* large puds cannot yet be handled */
- if (pud_large(*pud))
+ if (pud_leaf(*pud))
return -EFAULT;
pmd = pmd_offset(pud, vmaddr);
VM_BUG_ON(pmd_none(*pmd));
/* Are we allowed to use huge pages? */
- if (pmd_large(*pmd) && !gmap->mm->context.allow_gmap_hpage_1m)
+ if (pmd_leaf(*pmd) && !gmap->mm->context.allow_gmap_hpage_1m)
return -EFAULT;
/* Link gmap segment table entry location to page table. */
rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
@@ -599,15 +587,18 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
spin_lock(&gmap->guest_table_lock);
if (*table == _SEGMENT_ENTRY_EMPTY) {
rc = radix_tree_insert(&gmap->host_to_guest,
- vmaddr >> PMD_SHIFT, table);
+ vmaddr >> PMD_SHIFT,
+ (void *)MAKE_VALID_GADDR(gaddr));
if (!rc) {
- if (pmd_large(*pmd)) {
+ if (pmd_leaf(*pmd)) {
*table = (pmd_val(*pmd) &
_SEGMENT_ENTRY_HARDWARE_BITS_LARGE)
- | _SEGMENT_ENTRY_GMAP_UC;
+ | _SEGMENT_ENTRY_GMAP_UC
+ | _SEGMENT_ENTRY;
} else
- *table = pmd_val(*pmd) &
- _SEGMENT_ENTRY_HARDWARE_BITS;
+ *table = (pmd_val(*pmd) &
+ _SEGMENT_ENTRY_HARDWARE_BITS)
+ | _SEGMENT_ENTRY;
}
} else if (*table & _SEGMENT_ENTRY_PROTECT &&
!(pmd_val(*pmd) & _SEGMENT_ENTRY_PROTECT)) {
@@ -621,113 +612,27 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
radix_tree_preload_end();
return rc;
}
-
-/**
- * gmap_fault - resolve a fault on a guest address
- * @gmap: pointer to guest mapping meta data structure
- * @gaddr: guest address
- * @fault_flags: flags to pass down to handle_mm_fault()
- *
- * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
- * if the vm address is already mapped to a different guest segment.
- */
-int gmap_fault(struct gmap *gmap, unsigned long gaddr,
- unsigned int fault_flags)
-{
- unsigned long vmaddr;
- int rc;
- bool unlocked;
-
- mmap_read_lock(gmap->mm);
-
-retry:
- unlocked = false;
- vmaddr = __gmap_translate(gmap, gaddr);
- if (IS_ERR_VALUE(vmaddr)) {
- rc = vmaddr;
- goto out_up;
- }
- if (fixup_user_fault(gmap->mm, vmaddr, fault_flags,
- &unlocked)) {
- rc = -EFAULT;
- goto out_up;
- }
- /*
- * In the case that fixup_user_fault unlocked the mmap_lock during
- * faultin redo __gmap_translate to not race with a map/unmap_segment.
- */
- if (unlocked)
- goto retry;
-
- rc = __gmap_link(gmap, gaddr, vmaddr);
-out_up:
- mmap_read_unlock(gmap->mm);
- return rc;
-}
-EXPORT_SYMBOL_GPL(gmap_fault);
+EXPORT_SYMBOL(__gmap_link);
/*
* this function is assumed to be called with mmap_lock held
*/
void __gmap_zap(struct gmap *gmap, unsigned long gaddr)
{
- struct vm_area_struct *vma;
unsigned long vmaddr;
- spinlock_t *ptl;
- pte_t *ptep;
+
+ mmap_assert_locked(gmap->mm);
/* Find the vm address for the guest address */
vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host,
gaddr >> PMD_SHIFT);
if (vmaddr) {
vmaddr |= gaddr & ~PMD_MASK;
-
- vma = vma_lookup(gmap->mm, vmaddr);
- if (!vma || is_vm_hugetlb_page(vma))
- return;
-
- /* Get pointer to the page table entry */
- ptep = get_locked_pte(gmap->mm, vmaddr, &ptl);
- if (likely(ptep)) {
- ptep_zap_unused(gmap->mm, vmaddr, ptep, 0);
- pte_unmap_unlock(ptep, ptl);
- }
+ gmap_helper_zap_one_page(gmap->mm, vmaddr);
}
}
EXPORT_SYMBOL_GPL(__gmap_zap);
-void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to)
-{
- unsigned long gaddr, vmaddr, size;
- struct vm_area_struct *vma;
-
- mmap_read_lock(gmap->mm);
- for (gaddr = from; gaddr < to;
- gaddr = (gaddr + PMD_SIZE) & PMD_MASK) {
- /* Find the vm address for the guest address */
- vmaddr = (unsigned long)
- radix_tree_lookup(&gmap->guest_to_host,
- gaddr >> PMD_SHIFT);
- if (!vmaddr)
- continue;
- vmaddr |= gaddr & ~PMD_MASK;
- /* Find vma in the parent mm */
- vma = find_vma(gmap->mm, vmaddr);
- if (!vma)
- continue;
- /*
- * We do not discard pages that are backed by
- * hugetlbfs, so we don't have to refault them.
- */
- if (is_vm_hugetlb_page(vma))
- continue;
- size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK));
- zap_page_range(vma, vmaddr, size);
- }
- mmap_read_unlock(gmap->mm);
-}
-EXPORT_SYMBOL_GPL(gmap_discard);
-
static LIST_HEAD(gmap_notifier_list);
static DEFINE_SPINLOCK(gmap_notifier_lock);
@@ -789,8 +694,7 @@ static void gmap_call_notifier(struct gmap *gmap, unsigned long start,
*
* Note: Can also be called for shadow gmaps.
*/
-static inline unsigned long *gmap_table_walk(struct gmap *gmap,
- unsigned long gaddr, int level)
+unsigned long *gmap_table_walk(struct gmap *gmap, unsigned long gaddr, int level)
{
const int asce_type = gmap->asce & _ASCE_TYPE_MASK;
unsigned long *table = gmap->table;
@@ -837,10 +741,11 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap,
if (*table & _REGION_ENTRY_INVALID)
return NULL;
table = __va(*table & _SEGMENT_ENTRY_ORIGIN);
- table += (gaddr & _PAGE_INDEX) >> _PAGE_SHIFT;
+ table += (gaddr & _PAGE_INDEX) >> PAGE_SHIFT;
}
return table;
}
+EXPORT_SYMBOL(gmap_table_walk);
/**
* gmap_pte_op_walk - walk the gmap page table, get the page table lock
@@ -895,12 +800,12 @@ static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr,
/**
* gmap_pte_op_end - release the page table lock
- * @ptl: pointer to the spinlock pointer
+ * @ptep: pointer to the locked pte
+ * @ptl: pointer to the page table spinlock
*/
-static void gmap_pte_op_end(spinlock_t *ptl)
+static void gmap_pte_op_end(pte_t *ptep, spinlock_t *ptl)
{
- if (ptl)
- spin_unlock(ptl);
+ pte_unmap_unlock(ptep, ptl);
}
/**
@@ -931,7 +836,7 @@ static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr)
}
/* 4k page table entries are locked via the pte (pte_alloc_map_lock). */
- if (!pmd_large(*pmdp))
+ if (!pmd_leaf(*pmdp))
spin_unlock(&gmap->guest_table_lock);
return pmdp;
}
@@ -943,7 +848,7 @@ static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr)
*/
static inline void gmap_pmd_op_end(struct gmap *gmap, pmd_t *pmdp)
{
- if (pmd_large(*pmdp))
+ if (pmd_leaf(*pmdp))
spin_unlock(&gmap->guest_table_lock);
}
@@ -1011,7 +916,7 @@ static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr,
{
int rc;
pte_t *ptep;
- spinlock_t *ptl = NULL;
+ spinlock_t *ptl;
unsigned long pbits = 0;
if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID)
@@ -1025,7 +930,7 @@ static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr,
pbits |= (bits & GMAP_NOTIFY_SHADOW) ? PGSTE_VSIE_BIT : 0;
/* Protect and unlock. */
rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, pbits);
- gmap_pte_op_end(ptl);
+ gmap_pte_op_end(ptep, ptl);
return rc;
}
@@ -1037,86 +942,40 @@ static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr,
* @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
* @bits: pgste notification bits to set
*
- * Returns 0 if successfully protected, -ENOMEM if out of memory and
- * -EFAULT if gaddr is invalid (or mapping for shadows is missing).
+ * Returns:
+ * PAGE_SIZE if a small page was successfully protected;
+ * HPAGE_SIZE if a large page was successfully protected;
+ * -ENOMEM if out of memory;
+ * -EFAULT if gaddr is invalid (or mapping for shadows is missing);
+ * -EAGAIN if the guest mapping is missing and should be fixed by the caller.
*
- * Called with sg->mm->mmap_lock in read.
+ * Context: Called with sg->mm->mmap_lock in read.
*/
-static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr,
- unsigned long len, int prot, unsigned long bits)
+int gmap_protect_one(struct gmap *gmap, unsigned long gaddr, int prot, unsigned long bits)
{
- unsigned long vmaddr, dist;
pmd_t *pmdp;
- int rc;
+ int rc = 0;
BUG_ON(gmap_is_shadow(gmap));
- while (len) {
- rc = -EAGAIN;
- pmdp = gmap_pmd_op_walk(gmap, gaddr);
- if (pmdp) {
- if (!pmd_large(*pmdp)) {
- rc = gmap_protect_pte(gmap, gaddr, pmdp, prot,
- bits);
- if (!rc) {
- len -= PAGE_SIZE;
- gaddr += PAGE_SIZE;
- }
- } else {
- rc = gmap_protect_pmd(gmap, gaddr, pmdp, prot,
- bits);
- if (!rc) {
- dist = HPAGE_SIZE - (gaddr & ~HPAGE_MASK);
- len = len < dist ? 0 : len - dist;
- gaddr = (gaddr & HPAGE_MASK) + HPAGE_SIZE;
- }
- }
- gmap_pmd_op_end(gmap, pmdp);
- }
- if (rc) {
- if (rc == -EINVAL)
- return rc;
- /* -EAGAIN, fixup of userspace mm and gmap */
- vmaddr = __gmap_translate(gmap, gaddr);
- if (IS_ERR_VALUE(vmaddr))
- return vmaddr;
- rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, prot);
- if (rc)
- return rc;
- }
- }
- return 0;
-}
+ pmdp = gmap_pmd_op_walk(gmap, gaddr);
+ if (!pmdp)
+ return -EAGAIN;
-/**
- * gmap_mprotect_notify - change access rights for a range of ptes and
- * call the notifier if any pte changes again
- * @gmap: pointer to guest mapping meta data structure
- * @gaddr: virtual address in the guest address space
- * @len: size of area
- * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
- *
- * Returns 0 if for each page in the given range a gmap mapping exists,
- * the new access rights could be set and the notifier could be armed.
- * If the gmap mapping is missing for one or more pages -EFAULT is
- * returned. If no memory could be allocated -ENOMEM is returned.
- * This function establishes missing page table entries.
- */
-int gmap_mprotect_notify(struct gmap *gmap, unsigned long gaddr,
- unsigned long len, int prot)
-{
- int rc;
+ if (!pmd_leaf(*pmdp)) {
+ rc = gmap_protect_pte(gmap, gaddr, pmdp, prot, bits);
+ if (!rc)
+ rc = PAGE_SIZE;
+ } else {
+ rc = gmap_protect_pmd(gmap, gaddr, pmdp, prot, bits);
+ if (!rc)
+ rc = HPAGE_SIZE;
+ }
+ gmap_pmd_op_end(gmap, pmdp);
- if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK) || gmap_is_shadow(gmap))
- return -EINVAL;
- if (!MACHINE_HAS_ESOP && prot == PROT_READ)
- return -EINVAL;
- mmap_read_lock(gmap->mm);
- rc = gmap_protect_range(gmap, gaddr, len, prot, GMAP_NOTIFY_MPROT);
- mmap_read_unlock(gmap->mm);
return rc;
}
-EXPORT_SYMBOL_GPL(gmap_mprotect_notify);
+EXPORT_SYMBOL_GPL(gmap_protect_one);
/**
* gmap_read_table - get an unsigned long value from a guest page table using
@@ -1154,7 +1013,7 @@ int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val)
/* Do *NOT* clear the _PAGE_INVALID bit! */
rc = 0;
}
- gmap_pte_op_end(ptl);
+ gmap_pte_op_end(ptep, ptl);
}
if (!rc)
break;
@@ -1248,7 +1107,7 @@ static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr,
if (!rc)
gmap_insert_rmap(sg, vmaddr, rmap);
spin_unlock(&sg->guest_table_lock);
- gmap_pte_op_end(ptl);
+ gmap_pte_op_end(ptep, ptl);
}
radix_tree_preload_end();
if (rc) {
@@ -1303,7 +1162,7 @@ static void gmap_unshadow_page(struct gmap *sg, unsigned long raddr)
table = gmap_table_walk(sg, raddr, 0); /* get page table pointer */
if (!table || *table & _PAGE_INVALID)
return;
- gmap_call_notifier(sg, raddr, raddr + _PAGE_SIZE - 1);
+ gmap_call_notifier(sg, raddr, raddr + PAGE_SIZE - 1);
ptep_unshadow_pte(sg->mm, raddr, (pte_t *) table);
}
@@ -1321,7 +1180,7 @@ static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr,
int i;
BUG_ON(!gmap_is_shadow(sg));
- for (i = 0; i < _PAGE_ENTRIES; i++, raddr += _PAGE_SIZE)
+ for (i = 0; i < _PAGE_ENTRIES; i++, raddr += PAGE_SIZE)
pgt[i] = _PAGE_INVALID;
}
@@ -1336,7 +1195,7 @@ static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr)
{
unsigned long *ste;
phys_addr_t sto, pgt;
- struct page *page;
+ struct ptdesc *ptdesc;
BUG_ON(!gmap_is_shadow(sg));
ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */
@@ -1349,9 +1208,8 @@ static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr)
*ste = _SEGMENT_ENTRY_EMPTY;
__gmap_unshadow_pgt(sg, raddr, __va(pgt));
/* Free page table */
- page = phys_to_page(pgt);
- list_del(&page->lru);
- page_table_free_pgste(page);
+ ptdesc = page_ptdesc(phys_to_page(pgt));
+ page_table_free_pgste(ptdesc);
}
/**
@@ -1365,7 +1223,7 @@ static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr)
static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr,
unsigned long *sgt)
{
- struct page *page;
+ struct ptdesc *ptdesc;
phys_addr_t pgt;
int i;
@@ -1377,9 +1235,8 @@ static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr,
sgt[i] = _SEGMENT_ENTRY_EMPTY;
__gmap_unshadow_pgt(sg, raddr, __va(pgt));
/* Free page table */
- page = phys_to_page(pgt);
- list_del(&page->lru);
- page_table_free_pgste(page);
+ ptdesc = page_ptdesc(phys_to_page(pgt));
+ page_table_free_pgste(ptdesc);
}
}
@@ -1408,7 +1265,6 @@ static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr)
__gmap_unshadow_sgt(sg, raddr, __va(sgt));
/* Free segment table */
page = phys_to_page(sgt);
- list_del(&page->lru);
__free_pages(page, CRST_ALLOC_ORDER);
}
@@ -1436,7 +1292,6 @@ static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr,
__gmap_unshadow_sgt(sg, raddr, __va(sgt));
/* Free segment table */
page = phys_to_page(sgt);
- list_del(&page->lru);
__free_pages(page, CRST_ALLOC_ORDER);
}
}
@@ -1466,7 +1321,6 @@ static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr)
__gmap_unshadow_r3t(sg, raddr, __va(r3t));
/* Free region 3 table */
page = phys_to_page(r3t);
- list_del(&page->lru);
__free_pages(page, CRST_ALLOC_ORDER);
}
@@ -1494,7 +1348,6 @@ static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr,
__gmap_unshadow_r3t(sg, raddr, __va(r3t));
/* Free region 3 table */
page = phys_to_page(r3t);
- list_del(&page->lru);
__free_pages(page, CRST_ALLOC_ORDER);
}
}
@@ -1524,7 +1377,6 @@ static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr)
__gmap_unshadow_r2t(sg, raddr, __va(r2t));
/* Free region 2 table */
page = phys_to_page(r2t);
- list_del(&page->lru);
__free_pages(page, CRST_ALLOC_ORDER);
}
@@ -1556,7 +1408,6 @@ static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr,
r1t[i] = _REGION1_ENTRY_EMPTY;
/* Free region 2 table */
page = phys_to_page(r2t);
- list_del(&page->lru);
__free_pages(page, CRST_ALLOC_ORDER);
}
}
@@ -1567,7 +1418,7 @@ static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr,
*
* Called with sg->guest_table_lock
*/
-static void gmap_unshadow(struct gmap *sg)
+void gmap_unshadow(struct gmap *sg)
{
unsigned long *table;
@@ -1593,142 +1444,7 @@ static void gmap_unshadow(struct gmap *sg)
break;
}
}
-
-/**
- * gmap_find_shadow - find a specific asce in the list of shadow tables
- * @parent: pointer to the parent gmap
- * @asce: ASCE for which the shadow table is created
- * @edat_level: edat level to be used for the shadow translation
- *
- * Returns the pointer to a gmap if a shadow table with the given asce is
- * already available, ERR_PTR(-EAGAIN) if another one is just being created,
- * otherwise NULL
- */
-static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce,
- int edat_level)
-{
- struct gmap *sg;
-
- list_for_each_entry(sg, &parent->children, list) {
- if (sg->orig_asce != asce || sg->edat_level != edat_level ||
- sg->removed)
- continue;
- if (!sg->initialized)
- return ERR_PTR(-EAGAIN);
- refcount_inc(&sg->ref_count);
- return sg;
- }
- return NULL;
-}
-
-/**
- * gmap_shadow_valid - check if a shadow guest address space matches the
- * given properties and is still valid
- * @sg: pointer to the shadow guest address space structure
- * @asce: ASCE for which the shadow table is requested
- * @edat_level: edat level to be used for the shadow translation
- *
- * Returns 1 if the gmap shadow is still valid and matches the given
- * properties, the caller can continue using it. Returns 0 otherwise, the
- * caller has to request a new shadow gmap in this case.
- *
- */
-int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level)
-{
- if (sg->removed)
- return 0;
- return sg->orig_asce == asce && sg->edat_level == edat_level;
-}
-EXPORT_SYMBOL_GPL(gmap_shadow_valid);
-
-/**
- * gmap_shadow - create/find a shadow guest address space
- * @parent: pointer to the parent gmap
- * @asce: ASCE for which the shadow table is created
- * @edat_level: edat level to be used for the shadow translation
- *
- * The pages of the top level page table referred by the asce parameter
- * will be set to read-only and marked in the PGSTEs of the kvm process.
- * The shadow table will be removed automatically on any change to the
- * PTE mapping for the source table.
- *
- * Returns a guest address space structure, ERR_PTR(-ENOMEM) if out of memory,
- * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the
- * parent gmap table could not be protected.
- */
-struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce,
- int edat_level)
-{
- struct gmap *sg, *new;
- unsigned long limit;
- int rc;
-
- BUG_ON(parent->mm->context.allow_gmap_hpage_1m);
- BUG_ON(gmap_is_shadow(parent));
- spin_lock(&parent->shadow_lock);
- sg = gmap_find_shadow(parent, asce, edat_level);
- spin_unlock(&parent->shadow_lock);
- if (sg)
- return sg;
- /* Create a new shadow gmap */
- limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11));
- if (asce & _ASCE_REAL_SPACE)
- limit = -1UL;
- new = gmap_alloc(limit);
- if (!new)
- return ERR_PTR(-ENOMEM);
- new->mm = parent->mm;
- new->parent = gmap_get(parent);
- new->orig_asce = asce;
- new->edat_level = edat_level;
- new->initialized = false;
- spin_lock(&parent->shadow_lock);
- /* Recheck if another CPU created the same shadow */
- sg = gmap_find_shadow(parent, asce, edat_level);
- if (sg) {
- spin_unlock(&parent->shadow_lock);
- gmap_free(new);
- return sg;
- }
- if (asce & _ASCE_REAL_SPACE) {
- /* only allow one real-space gmap shadow */
- list_for_each_entry(sg, &parent->children, list) {
- if (sg->orig_asce & _ASCE_REAL_SPACE) {
- spin_lock(&sg->guest_table_lock);
- gmap_unshadow(sg);
- spin_unlock(&sg->guest_table_lock);
- list_del(&sg->list);
- gmap_put(sg);
- break;
- }
- }
- }
- refcount_set(&new->ref_count, 2);
- list_add(&new->list, &parent->children);
- if (asce & _ASCE_REAL_SPACE) {
- /* nothing to protect, return right away */
- new->initialized = true;
- spin_unlock(&parent->shadow_lock);
- return new;
- }
- spin_unlock(&parent->shadow_lock);
- /* protect after insertion, so it will get properly invalidated */
- mmap_read_lock(parent->mm);
- rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN,
- ((asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE,
- PROT_READ, GMAP_NOTIFY_SHADOW);
- mmap_read_unlock(parent->mm);
- spin_lock(&parent->shadow_lock);
- new->initialized = true;
- if (rc) {
- list_del(&new->list);
- gmap_free(new);
- new = ERR_PTR(rc);
- }
- spin_unlock(&parent->shadow_lock);
- return new;
-}
-EXPORT_SYMBOL_GPL(gmap_shadow);
+EXPORT_SYMBOL(gmap_unshadow);
/**
* gmap_shadow_r2t - create an empty shadow region 2 table
@@ -1740,7 +1456,7 @@ EXPORT_SYMBOL_GPL(gmap_shadow);
* The r2t parameter specifies the address of the source table. The
* four pages of the source table are made read-only in the parent gmap
* address space. A write to the source table area @r2t will automatically
- * remove the shadow r2 table and all of its decendents.
+ * remove the shadow r2 table and all of its descendants.
*
* Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
* shadow table structure is incomplete, -ENOMEM if out of memory and
@@ -1759,12 +1475,9 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
BUG_ON(!gmap_is_shadow(sg));
/* Allocate a shadow region second table */
- page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
+ page = gmap_alloc_crst();
if (!page)
return -ENOMEM;
- page->index = r2t & _REGION_ENTRY_ORIGIN;
- if (fake)
- page->index |= GMAP_SHADOW_FAKE_TABLE;
s_r2t = page_to_phys(page);
/* Install shadow region second table */
spin_lock(&sg->guest_table_lock);
@@ -1786,7 +1499,6 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
_REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID;
if (sg->edat_level >= 1)
*table |= (r2t & _REGION_ENTRY_PROTECT);
- list_add(&page->lru, &sg->crst_list);
if (fake) {
/* nothing to protect for fake tables */
*table &= ~_REGION_ENTRY_INVALID;
@@ -1843,12 +1555,9 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
BUG_ON(!gmap_is_shadow(sg));
/* Allocate a shadow region second table */
- page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
+ page = gmap_alloc_crst();
if (!page)
return -ENOMEM;
- page->index = r3t & _REGION_ENTRY_ORIGIN;
- if (fake)
- page->index |= GMAP_SHADOW_FAKE_TABLE;
s_r3t = page_to_phys(page);
/* Install shadow region second table */
spin_lock(&sg->guest_table_lock);
@@ -1870,7 +1579,6 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
_REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID;
if (sg->edat_level >= 1)
*table |= (r3t & _REGION_ENTRY_PROTECT);
- list_add(&page->lru, &sg->crst_list);
if (fake) {
/* nothing to protect for fake tables */
*table &= ~_REGION_ENTRY_INVALID;
@@ -1927,12 +1635,9 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE));
/* Allocate a shadow segment table */
- page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
+ page = gmap_alloc_crst();
if (!page)
return -ENOMEM;
- page->index = sgt & _REGION_ENTRY_ORIGIN;
- if (fake)
- page->index |= GMAP_SHADOW_FAKE_TABLE;
s_sgt = page_to_phys(page);
/* Install shadow region second table */
spin_lock(&sg->guest_table_lock);
@@ -1954,7 +1659,6 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
_REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID;
if (sg->edat_level >= 1)
*table |= sgt & _REGION_ENTRY_PROTECT;
- list_add(&page->lru, &sg->crst_list);
if (fake) {
/* nothing to protect for fake tables */
*table &= ~_REGION_ENTRY_INVALID;
@@ -1987,45 +1691,22 @@ out_free:
}
EXPORT_SYMBOL_GPL(gmap_shadow_sgt);
-/**
- * gmap_shadow_pgt_lookup - find a shadow page table
- * @sg: pointer to the shadow guest address space structure
- * @saddr: the address in the shadow aguest address space
- * @pgt: parent gmap address of the page table to get shadowed
- * @dat_protection: if the pgtable is marked as protected by dat
- * @fake: pgt references contiguous guest memory block, not a pgtable
- *
- * Returns 0 if the shadow page table was found and -EAGAIN if the page
- * table was not found.
- *
- * Called with sg->mm->mmap_lock in read.
- */
-int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr,
- unsigned long *pgt, int *dat_protection,
- int *fake)
+static void gmap_pgste_set_pgt_addr(struct ptdesc *ptdesc, unsigned long pgt_addr)
{
- unsigned long *table;
- struct page *page;
- int rc;
+ unsigned long *pgstes = page_to_virt(ptdesc_page(ptdesc));
- BUG_ON(!gmap_is_shadow(sg));
- spin_lock(&sg->guest_table_lock);
- table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
- if (table && !(*table & _SEGMENT_ENTRY_INVALID)) {
- /* Shadow page tables are full pages (pte+pgste) */
- page = pfn_to_page(*table >> PAGE_SHIFT);
- *pgt = page->index & ~GMAP_SHADOW_FAKE_TABLE;
- *dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT);
- *fake = !!(page->index & GMAP_SHADOW_FAKE_TABLE);
- rc = 0;
- } else {
- rc = -EAGAIN;
- }
- spin_unlock(&sg->guest_table_lock);
- return rc;
+ pgstes += _PAGE_ENTRIES;
+
+ pgstes[0] &= ~PGSTE_ST2_MASK;
+ pgstes[1] &= ~PGSTE_ST2_MASK;
+ pgstes[2] &= ~PGSTE_ST2_MASK;
+ pgstes[3] &= ~PGSTE_ST2_MASK;
+ pgstes[0] |= (pgt_addr >> 16) & PGSTE_ST2_MASK;
+ pgstes[1] |= pgt_addr & PGSTE_ST2_MASK;
+ pgstes[2] |= (pgt_addr << 16) & PGSTE_ST2_MASK;
+ pgstes[3] |= (pgt_addr << 32) & PGSTE_ST2_MASK;
}
-EXPORT_SYMBOL_GPL(gmap_shadow_pgt_lookup);
/**
* gmap_shadow_pgt - instantiate a shadow page table
@@ -2045,19 +1726,20 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
{
unsigned long raddr, origin;
unsigned long *table;
- struct page *page;
+ struct ptdesc *ptdesc;
phys_addr_t s_pgt;
int rc;
BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE));
/* Allocate a shadow page table */
- page = page_table_alloc_pgste(sg->mm);
- if (!page)
+ ptdesc = page_table_alloc_pgste(sg->mm);
+ if (!ptdesc)
return -ENOMEM;
- page->index = pgt & _SEGMENT_ENTRY_ORIGIN;
+ origin = pgt & _SEGMENT_ENTRY_ORIGIN;
if (fake)
- page->index |= GMAP_SHADOW_FAKE_TABLE;
- s_pgt = page_to_phys(page);
+ origin |= GMAP_SHADOW_FAKE_TABLE;
+ gmap_pgste_set_pgt_addr(ptdesc, origin);
+ s_pgt = page_to_phys(ptdesc_page(ptdesc));
/* Install shadow page table */
spin_lock(&sg->guest_table_lock);
table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
@@ -2075,7 +1757,6 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
/* mark as invalid as long as the parent table is not protected */
*table = (unsigned long) s_pgt | _SEGMENT_ENTRY |
(pgt & _SEGMENT_ENTRY_PROTECT) | _SEGMENT_ENTRY_INVALID;
- list_add(&page->lru, &sg->pt_list);
if (fake) {
/* nothing to protect for fake tables */
*table &= ~_SEGMENT_ENTRY_INVALID;
@@ -2101,7 +1782,7 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
return rc;
out_free:
spin_unlock(&sg->guest_table_lock);
- page_table_free_pgste(page);
+ page_table_free_pgste(ptdesc);
return rc;
}
@@ -2156,7 +1837,7 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte)
tptep = (pte_t *) gmap_table_walk(sg, saddr, 0);
if (!tptep) {
spin_unlock(&sg->guest_table_lock);
- gmap_pte_op_end(ptl);
+ gmap_pte_op_end(sptep, ptl);
radix_tree_preload_end();
break;
}
@@ -2167,7 +1848,7 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte)
rmap = NULL;
rc = 0;
}
- gmap_pte_op_end(ptl);
+ gmap_pte_op_end(sptep, ptl);
spin_unlock(&sg->guest_table_lock);
}
radix_tree_preload_end();
@@ -2253,7 +1934,6 @@ void ptep_notify(struct mm_struct *mm, unsigned long vmaddr,
pte_t *pte, unsigned long bits)
{
unsigned long offset, gaddr = 0;
- unsigned long *table;
struct gmap *gmap, *sg, *next;
offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
@@ -2261,12 +1941,9 @@ void ptep_notify(struct mm_struct *mm, unsigned long vmaddr,
rcu_read_lock();
list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
spin_lock(&gmap->guest_table_lock);
- table = radix_tree_lookup(&gmap->host_to_guest,
- vmaddr >> PMD_SHIFT);
- if (table)
- gaddr = __gmap_segment_gaddr(table) + offset;
+ gaddr = host_to_guest_lookup(gmap, vmaddr) + offset;
spin_unlock(&gmap->guest_table_lock);
- if (!table)
+ if (!IS_GADDR_VALID(gaddr))
continue;
if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) {
@@ -2306,13 +1983,11 @@ static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new,
gaddr &= HPAGE_MASK;
pmdp_notify_gmap(gmap, pmdp, gaddr);
new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_GMAP_IN));
- if (MACHINE_HAS_TLB_GUEST)
+ if (machine_has_tlb_guest())
__pmdp_idte(gaddr, (pmd_t *)pmdp, IDTE_GUEST_ASCE, gmap->asce,
IDTE_GLOBAL);
- else if (MACHINE_HAS_IDTE)
- __pmdp_idte(gaddr, (pmd_t *)pmdp, 0, 0, IDTE_GLOBAL);
else
- __pmdp_csp(pmdp);
+ __pmdp_idte(gaddr, (pmd_t *)pmdp, 0, 0, IDTE_GLOBAL);
set_pmd(pmdp, new);
}
@@ -2326,15 +2001,14 @@ static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr,
rcu_read_lock();
list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
spin_lock(&gmap->guest_table_lock);
- pmdp = (pmd_t *)radix_tree_delete(&gmap->host_to_guest,
- vmaddr >> PMD_SHIFT);
+ pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr);
if (pmdp) {
- gaddr = __gmap_segment_gaddr((unsigned long *)pmdp);
pmdp_notify_gmap(gmap, pmdp, gaddr);
WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
- _SEGMENT_ENTRY_GMAP_UC));
+ _SEGMENT_ENTRY_GMAP_UC |
+ _SEGMENT_ENTRY));
if (purge)
- __pmdp_csp(pmdp);
+ __pmdp_cspg(pmdp);
set_pmd(pmdp, __pmd(_SEGMENT_ENTRY_EMPTY));
}
spin_unlock(&gmap->guest_table_lock);
@@ -2355,44 +2029,31 @@ void gmap_pmdp_invalidate(struct mm_struct *mm, unsigned long vmaddr)
EXPORT_SYMBOL_GPL(gmap_pmdp_invalidate);
/**
- * gmap_pmdp_csp - csp all affected guest pmd entries
- * @mm: pointer to the process mm_struct
- * @vmaddr: virtual address in the process address space
- */
-void gmap_pmdp_csp(struct mm_struct *mm, unsigned long vmaddr)
-{
- gmap_pmdp_clear(mm, vmaddr, 1);
-}
-EXPORT_SYMBOL_GPL(gmap_pmdp_csp);
-
-/**
* gmap_pmdp_idte_local - invalidate and clear a guest pmd entry
* @mm: pointer to the process mm_struct
* @vmaddr: virtual address in the process address space
*/
void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr)
{
- unsigned long *entry, gaddr;
+ unsigned long gaddr;
struct gmap *gmap;
pmd_t *pmdp;
rcu_read_lock();
list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
spin_lock(&gmap->guest_table_lock);
- entry = radix_tree_delete(&gmap->host_to_guest,
- vmaddr >> PMD_SHIFT);
- if (entry) {
- pmdp = (pmd_t *)entry;
- gaddr = __gmap_segment_gaddr(entry);
+ pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr);
+ if (pmdp) {
pmdp_notify_gmap(gmap, pmdp, gaddr);
- WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
- _SEGMENT_ENTRY_GMAP_UC));
- if (MACHINE_HAS_TLB_GUEST)
+ WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
+ _SEGMENT_ENTRY_GMAP_UC |
+ _SEGMENT_ENTRY));
+ if (machine_has_tlb_guest())
__pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE,
gmap->asce, IDTE_LOCAL);
- else if (MACHINE_HAS_IDTE)
+ else
__pmdp_idte(gaddr, pmdp, 0, 0, IDTE_LOCAL);
- *entry = _SEGMENT_ENTRY_EMPTY;
+ *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY);
}
spin_unlock(&gmap->guest_table_lock);
}
@@ -2407,29 +2068,25 @@ EXPORT_SYMBOL_GPL(gmap_pmdp_idte_local);
*/
void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr)
{
- unsigned long *entry, gaddr;
+ unsigned long gaddr;
struct gmap *gmap;
pmd_t *pmdp;
rcu_read_lock();
list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
spin_lock(&gmap->guest_table_lock);
- entry = radix_tree_delete(&gmap->host_to_guest,
- vmaddr >> PMD_SHIFT);
- if (entry) {
- pmdp = (pmd_t *)entry;
- gaddr = __gmap_segment_gaddr(entry);
+ pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr);
+ if (pmdp) {
pmdp_notify_gmap(gmap, pmdp, gaddr);
- WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
- _SEGMENT_ENTRY_GMAP_UC));
- if (MACHINE_HAS_TLB_GUEST)
+ WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
+ _SEGMENT_ENTRY_GMAP_UC |
+ _SEGMENT_ENTRY));
+ if (machine_has_tlb_guest())
__pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE,
gmap->asce, IDTE_GLOBAL);
- else if (MACHINE_HAS_IDTE)
- __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_GLOBAL);
else
- __pmdp_csp(pmdp);
- *entry = _SEGMENT_ENTRY_EMPTY;
+ __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_GLOBAL);
+ *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY);
}
spin_unlock(&gmap->guest_table_lock);
}
@@ -2485,7 +2142,7 @@ void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4],
if (!pmdp)
return;
- if (pmd_large(*pmdp)) {
+ if (pmd_leaf(*pmdp)) {
if (gmap_test_and_clear_dirty_pmd(gmap, pmdp, gaddr))
bitmap_fill(bitmap, _PAGE_ENTRIES);
} else {
@@ -2495,7 +2152,7 @@ void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4],
continue;
if (ptep_test_and_clear_uc(gmap->mm, vmaddr, ptep))
set_bit(i, bitmap);
- spin_unlock(ptl);
+ pte_unmap_unlock(ptep, ptl);
}
}
gmap_pmd_op_end(gmap, pmdp);
@@ -2514,6 +2171,7 @@ static int thp_split_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
static const struct mm_walk_ops thp_split_walk_ops = {
.pmd_entry = thp_split_walk_pmd_entry,
+ .walk_lock = PGWALK_WRLOCK_VERIFY,
};
static inline void thp_split_mm(struct mm_struct *mm)
@@ -2522,8 +2180,7 @@ static inline void thp_split_mm(struct mm_struct *mm)
VMA_ITERATOR(vmi, mm, 0);
for_each_vma(vmi, vma) {
- vma->vm_flags &= ~VM_HUGEPAGE;
- vma->vm_flags |= VM_NOHUGEPAGE;
+ vm_flags_mod(vma, VM_NOHUGEPAGE, VM_HUGEPAGE);
walk_page_vma(vma, &thp_split_walk_ops, NULL);
}
mm->def_flags |= VM_NOHUGEPAGE;
@@ -2535,33 +2192,6 @@ static inline void thp_split_mm(struct mm_struct *mm)
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
/*
- * Remove all empty zero pages from the mapping for lazy refaulting
- * - This must be called after mm->context.has_pgste is set, to avoid
- * future creation of zero pages
- * - This must be called after THP was enabled
- */
-static int __zap_zero_pages(pmd_t *pmd, unsigned long start,
- unsigned long end, struct mm_walk *walk)
-{
- unsigned long addr;
-
- for (addr = start; addr != end; addr += PAGE_SIZE) {
- pte_t *ptep;
- spinlock_t *ptl;
-
- ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
- if (is_zero_pfn(pte_pfn(*ptep)))
- ptep_xchg_direct(walk->mm, addr, ptep, __pte(_PAGE_INVALID));
- pte_unmap_unlock(ptep, ptl);
- }
- return 0;
-}
-
-static const struct mm_walk_ops zap_zero_walk_ops = {
- .pmd_entry = __zap_zero_pages,
-};
-
-/*
* switch on pgstes for its userspace process (for kvm)
*/
int s390_enable_sie(void)
@@ -2571,37 +2201,15 @@ int s390_enable_sie(void)
/* Do we have pgstes? if yes, we are done */
if (mm_has_pgste(mm))
return 0;
- /* Fail if the page tables are 2K */
- if (!mm_alloc_pgste(mm))
- return -EINVAL;
mmap_write_lock(mm);
mm->context.has_pgste = 1;
/* split thp mappings and disable thp for future mappings */
thp_split_mm(mm);
- walk_page_range(mm, 0, TASK_SIZE, &zap_zero_walk_ops, NULL);
mmap_write_unlock(mm);
return 0;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);
-int gmap_mark_unmergeable(void)
-{
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
- int ret;
- VMA_ITERATOR(vmi, mm, 0);
-
- for_each_vma(vmi, vma) {
- ret = ksm_madvise(vma, vma->vm_start, vma->vm_end,
- MADV_UNMERGEABLE, &vma->vm_flags);
- if (ret)
- return ret;
- }
- mm->def_flags &= ~VM_MERGEABLE;
- return 0;
-}
-EXPORT_SYMBOL_GPL(gmap_mark_unmergeable);
-
/*
* Enable storage key handling from now on and initialize the storage
* keys with the default key.
@@ -2632,7 +2240,7 @@ static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr,
{
pmd_t *pmd = (pmd_t *)pte;
unsigned long start, end;
- struct page *page = pmd_page(*pmd);
+ struct folio *folio = page_folio(pmd_page(*pmd));
/*
* The write check makes sure we do not set a key on shared
@@ -2645,9 +2253,9 @@ static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr,
return 0;
start = pmd_val(*pmd) & HPAGE_MASK;
- end = start + HPAGE_SIZE - 1;
+ end = start + HPAGE_SIZE;
__storage_key_init_range(start, end);
- set_bit(PG_arch_1, &page->flags);
+ set_bit(PG_arch_1, &folio->flags.f);
cond_resched();
return 0;
}
@@ -2656,6 +2264,7 @@ static const struct mm_walk_ops enable_skey_walk_ops = {
.hugetlb_entry = __s390_enable_skey_hugetlb,
.pte_entry = __s390_enable_skey_pte,
.pmd_entry = __s390_enable_skey_pmd,
+ .walk_lock = PGWALK_WRLOCK,
};
int s390_enable_skey(void)
@@ -2668,7 +2277,7 @@ int s390_enable_skey(void)
goto out_up;
mm->context.uses_skeys = 1;
- rc = gmap_mark_unmergeable();
+ rc = gmap_helper_disable_cow_sharing();
if (rc) {
mm->context.uses_skeys = 0;
goto out_up;
@@ -2693,6 +2302,7 @@ static int __s390_reset_cmma(pte_t *pte, unsigned long addr,
static const struct mm_walk_ops reset_cmma_walk_ops = {
.pte_entry = __s390_reset_cmma,
+ .walk_lock = PGWALK_WRLOCK,
};
void s390_reset_cmma(struct mm_struct *mm)
@@ -2729,6 +2339,7 @@ static int s390_gather_pages(pte_t *ptep, unsigned long addr,
static const struct mm_walk_ops gather_pages_ops = {
.pte_entry = s390_gather_pages,
+ .walk_lock = PGWALK_RDLOCK,
};
/*
@@ -2737,13 +2348,15 @@ static const struct mm_walk_ops gather_pages_ops = {
*/
void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns)
{
+ struct folio *folio;
unsigned long i;
for (i = 0; i < count; i++) {
+ folio = pfn_folio(pfns[i]);
/* we always have an extra reference */
- uv_destroy_owned_page(pfn_to_phys(pfns[i]));
+ uv_destroy_folio(folio);
/* get rid of the extra reference */
- put_page(pfn_to_page(pfns[i]));
+ folio_put(folio);
cond_resched();
}
}
@@ -2784,52 +2397,12 @@ int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start,
EXPORT_SYMBOL_GPL(__s390_uv_destroy_range);
/**
- * s390_unlist_old_asce - Remove the topmost level of page tables from the
- * list of page tables of the gmap.
- * @gmap: the gmap whose table is to be removed
- *
- * On s390x, KVM keeps a list of all pages containing the page tables of the
- * gmap (the CRST list). This list is used at tear down time to free all
- * pages that are now not needed anymore.
- *
- * This function removes the topmost page of the tree (the one pointed to by
- * the ASCE) from the CRST list.
- *
- * This means that it will not be freed when the VM is torn down, and needs
- * to be handled separately by the caller, unless a leak is actually
- * intended. Notice that this function will only remove the page from the
- * list, the page will still be used as a top level page table (and ASCE).
- */
-void s390_unlist_old_asce(struct gmap *gmap)
-{
- struct page *old;
-
- old = virt_to_page(gmap->table);
- spin_lock(&gmap->guest_table_lock);
- list_del(&old->lru);
- /*
- * Sometimes the topmost page might need to be "removed" multiple
- * times, for example if the VM is rebooted into secure mode several
- * times concurrently, or if s390_replace_asce fails after calling
- * s390_remove_old_asce and is attempted again later. In that case
- * the old asce has been removed from the list, and therefore it
- * will not be freed when the VM terminates, but the ASCE is still
- * in use and still pointed to.
- * A subsequent call to replace_asce will follow the pointer and try
- * to remove the same page from the list again.
- * Therefore it's necessary that the page of the ASCE has valid
- * pointers, so list_del can work (and do nothing) without
- * dereferencing stale or invalid pointers.
- */
- INIT_LIST_HEAD(&old->lru);
- spin_unlock(&gmap->guest_table_lock);
-}
-EXPORT_SYMBOL_GPL(s390_unlist_old_asce);
-
-/**
* s390_replace_asce - Try to replace the current ASCE of a gmap with a copy
* @gmap: the gmap whose ASCE needs to be replaced
*
+ * If the ASCE is a SEGMENT type then this function will return -EINVAL,
+ * otherwise the pointers in the host_to_guest radix tree will keep pointing
+ * to the wrong pages, causing use-after-free and memory corruption.
* If the allocation of the new top level page table fails, the ASCE is not
* replaced.
* In any case, the old ASCE is always removed from the gmap CRST list.
@@ -2842,23 +2415,16 @@ int s390_replace_asce(struct gmap *gmap)
struct page *page;
void *table;
- s390_unlist_old_asce(gmap);
+ /* Replacing segment type ASCEs would cause serious issues */
+ if ((gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT)
+ return -EINVAL;
- page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
+ page = gmap_alloc_crst();
if (!page)
return -ENOMEM;
table = page_to_virt(page);
memcpy(table, gmap->table, 1UL << (CRST_ALLOC_ORDER + PAGE_SHIFT));
- /*
- * The caller has to deal with the old ASCE, but here we make sure
- * the new one is properly added to the CRST list, so that
- * it will be freed when the VM is torn down.
- */
- spin_lock(&gmap->guest_table_lock);
- list_add(&page->lru, &gmap->crst_list);
- spin_unlock(&gmap->guest_table_lock);
-
/* Set new table origin while preserving existing ASCE control bits */
asce = (gmap->asce & ~_ASCE_ORIGIN) | __pa(table);
WRITE_ONCE(gmap->asce, asce);