Diffstat (limited to 'arch/s390/mm/pgalloc.c')
-rw-r--r--  arch/s390/mm/pgalloc.c | 347
1 file changed, 111 insertions(+), 236 deletions(-)
diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
index 781965f7210e..7df23528c01b 100644
--- a/arch/s390/mm/pgalloc.c
+++ b/arch/s390/mm/pgalloc.c
@@ -10,70 +10,45 @@
#include <linux/slab.h>
#include <linux/mm.h>
#include <asm/mmu_context.h>
+#include <asm/page-states.h>
#include <asm/pgalloc.h>
-#include <asm/gmap.h>
-#include <asm/tlb.h>
#include <asm/tlbflush.h>
-#ifdef CONFIG_PGSTE
-
-int page_table_allocate_pgste = 0;
-EXPORT_SYMBOL(page_table_allocate_pgste);
-
-static struct ctl_table page_table_sysctl[] = {
- {
- .procname = "allocate_pgste",
- .data = &page_table_allocate_pgste,
- .maxlen = sizeof(int),
- .mode = S_IRUGO | S_IWUSR,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
- { }
-};
-
-static struct ctl_table page_table_sysctl_dir[] = {
- {
- .procname = "vm",
- .maxlen = 0,
- .mode = 0555,
- .child = page_table_sysctl,
- },
- { }
-};
-
-static int __init page_table_register_sysctl(void)
+unsigned long *crst_table_alloc_noprof(struct mm_struct *mm)
{
- return register_sysctl_table(page_table_sysctl_dir) ? 0 : -ENOMEM;
-}
-__initcall(page_table_register_sysctl);
-
-#endif /* CONFIG_PGSTE */
-
-unsigned long *crst_table_alloc(struct mm_struct *mm)
-{
- struct page *page = alloc_pages(GFP_KERNEL, 2);
+ gfp_t gfp = GFP_KERNEL_ACCOUNT;
+ struct ptdesc *ptdesc;
+ unsigned long *table;
- if (!page)
+ if (mm == &init_mm)
+ gfp &= ~__GFP_ACCOUNT;
+ ptdesc = pagetable_alloc_noprof(gfp, CRST_ALLOC_ORDER);
+ if (!ptdesc)
return NULL;
- arch_set_page_dat(page, 2);
- return (unsigned long *) page_to_virt(page);
+ table = ptdesc_address(ptdesc);
+ __arch_set_page_dat(table, 1UL << CRST_ALLOC_ORDER);
+ return table;
}
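
The _noprof suffix follows the kernel's memory allocation profiling convention: the tagged entry point is a thin wrapper that records the call site and then invokes the _noprof function. A minimal sketch of the wrapper side, assuming the generic alloc_hooks() macro from <linux/alloc_tag.h> (the exact header wiring on s390 may differ):

        /* sketch: profiled entry point wrapping the _noprof allocator */
        #define crst_table_alloc(mm)	alloc_hooks(crst_table_alloc_noprof(mm))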
void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
- free_pages((unsigned long) table, 2);
+ if (!table)
+ return;
+ pagetable_free(virt_to_ptdesc(table));
}
static void __crst_table_upgrade(void *arg)
{
struct mm_struct *mm = arg;
+ struct ctlreg asce;
/* change all active ASCEs to avoid the creation of new TLBs */
if (current->active_mm == mm) {
- S390_lowcore.user_asce = mm->context.asce;
- __ctl_load(S390_lowcore.user_asce, 7, 7);
+ asce.val = mm->context.asce;
+ get_lowcore()->user_asce = asce;
+ local_ctl_load(7, &asce);
+ if (!test_thread_flag(TIF_ASCE_PRIMARY))
+ local_ctl_load(1, &asce);
}
__tlb_flush_local();
}
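
Note that __crst_table_upgrade() only switches the ASCE of the CPU it runs on, so the upgrade path has to broadcast it to all CPUs before the old top-level table can go away. A sketch of the expected driver, assuming the generic on_each_cpu() helper:

        /* sketch: switch every CPU to the new ASCE and flush its local TLB */
        on_each_cpu(__crst_table_upgrade, mm, 0);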
@@ -83,6 +58,8 @@ int crst_table_upgrade(struct mm_struct *mm, unsigned long end)
unsigned long *pgd = NULL, *p4d = NULL, *__pgd;
unsigned long asce_limit = mm->context.asce_limit;
+ mmap_assert_write_locked(mm);
+
/* upgrade should only happen from 3 to 4, 3 to 5, or 4 to 5 levels */
VM_BUG_ON(asce_limit < _REGION2_SIZE);
@@ -94,23 +71,18 @@ int crst_table_upgrade(struct mm_struct *mm, unsigned long end)
if (unlikely(!p4d))
goto err_p4d;
crst_table_init(p4d, _REGION2_ENTRY_EMPTY);
+ pagetable_p4d_ctor(virt_to_ptdesc(p4d));
}
if (end > _REGION1_SIZE) {
pgd = crst_table_alloc(mm);
if (unlikely(!pgd))
goto err_pgd;
crst_table_init(pgd, _REGION1_ENTRY_EMPTY);
+ pagetable_pgd_ctor(virt_to_ptdesc(pgd));
}
spin_lock_bh(&mm->page_table_lock);
- /*
- * This routine gets called with mmap_lock lock held and there is
- * no reason to optimize for the case of otherwise. However, if
- * that would ever change, the below check will let us know.
- */
- VM_BUG_ON(asce_limit != mm->context.asce_limit);
-
if (p4d) {
__pgd = (unsigned long *) mm->pgd;
p4d_populate(mm, (p4d_t *) p4d, (pud_t *) __pgd);
@@ -136,184 +108,82 @@ int crst_table_upgrade(struct mm_struct *mm, unsigned long end)
return 0;
err_pgd:
+ pagetable_dtor(virt_to_ptdesc(p4d));
crst_table_free(mm, p4d);
err_p4d:
return -ENOMEM;
}
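
For context, the part of crst_table_upgrade() elided by the hunk above publishes the new top-level table and recomputes the ASCE from its physical origin plus type and length bits. A sketch of the 4-to-5-level case, with constants as in <asm/pgtable.h> (illustrative, not part of this diff):

        /* sketch: publish the new region-1 table as the user ASCE */
        mm->pgd = (pgd_t *) pgd;
        mm->context.asce_limit = TASK_SIZE_MAX;
        mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
                           _ASCE_USER_BITS | _ASCE_TYPE_REGION1;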
-static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
-{
- unsigned int old, new;
-
- do {
- old = atomic_read(v);
- new = old ^ bits;
- } while (atomic_cmpxchg(v, old, new) != old);
- return new;
-}
-
#ifdef CONFIG_PGSTE
-struct page *page_table_alloc_pgste(struct mm_struct *mm)
+struct ptdesc *page_table_alloc_pgste_noprof(struct mm_struct *mm)
{
- struct page *page;
+ struct ptdesc *ptdesc;
u64 *table;
- page = alloc_page(GFP_KERNEL);
- if (page) {
- table = (u64 *)page_to_virt(page);
+ ptdesc = pagetable_alloc_noprof(GFP_KERNEL_ACCOUNT, 0);
+ if (ptdesc) {
+ table = (u64 *)ptdesc_address(ptdesc);
+ __arch_set_page_dat(table, 1);
memset64(table, _PAGE_INVALID, PTRS_PER_PTE);
memset64(table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
}
- return page;
+ return ptdesc;
}
-void page_table_free_pgste(struct page *page)
+void page_table_free_pgste(struct ptdesc *ptdesc)
{
- __free_page(page);
+ pagetable_free(ptdesc);
}
#endif /* CONFIG_PGSTE */
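
A PGSTE page table occupies a full 4K page: the lower 2K holds the 256 PTEs and the upper 2K the matching page status table entries used by KVM. The two memset64() calls above establish exactly that layout; as a sketch:

        /*
         * PGSTE page layout (PTRS_PER_PTE == 256, 8 bytes per entry):
         *
         *   0x000 - 0x7ff: 256 PTEs,   preset to _PAGE_INVALID
         *   0x800 - 0xfff: 256 PGSTEs, preset to 0
         */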
-/*
- * page table entry allocation/free routines.
- */
-unsigned long *page_table_alloc(struct mm_struct *mm)
+unsigned long *page_table_alloc_noprof(struct mm_struct *mm)
{
+ gfp_t gfp = GFP_KERNEL_ACCOUNT;
+ struct ptdesc *ptdesc;
unsigned long *table;
- struct page *page;
- unsigned int mask, bit;
-
- /* Try to get a fragment of a 4K page as a 2K page table */
- if (!mm_alloc_pgste(mm)) {
- table = NULL;
- spin_lock_bh(&mm->context.lock);
- if (!list_empty(&mm->context.pgtable_list)) {
- page = list_first_entry(&mm->context.pgtable_list,
- struct page, lru);
- mask = atomic_read(&page->_refcount) >> 24;
- mask = (mask | (mask >> 4)) & 3;
- if (mask != 3) {
- table = (unsigned long *) page_to_virt(page);
- bit = mask & 1; /* =1 -> second 2K */
- if (bit)
- table += PTRS_PER_PTE;
- atomic_xor_bits(&page->_refcount,
- 1U << (bit + 24));
- list_del(&page->lru);
- }
- }
- spin_unlock_bh(&mm->context.lock);
- if (table)
- return table;
- }
- /* Allocate a fresh page */
- page = alloc_page(GFP_KERNEL);
- if (!page)
+
+ if (mm == &init_mm)
+ gfp &= ~__GFP_ACCOUNT;
+ ptdesc = pagetable_alloc_noprof(gfp, 0);
+ if (!ptdesc)
return NULL;
- if (!pgtable_pte_page_ctor(page)) {
- __free_page(page);
+ if (!pagetable_pte_ctor(mm, ptdesc)) {
+ pagetable_free(ptdesc);
return NULL;
}
- arch_set_page_dat(page, 0);
- /* Initialize page table */
- table = (unsigned long *) page_to_virt(page);
- if (mm_alloc_pgste(mm)) {
- /* Return 4K page table with PGSTEs */
- atomic_xor_bits(&page->_refcount, 3 << 24);
- memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
- memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
- } else {
- /* Return the first 2K fragment of the page */
- atomic_xor_bits(&page->_refcount, 1 << 24);
- memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE);
- spin_lock_bh(&mm->context.lock);
- list_add(&page->lru, &mm->context.pgtable_list);
- spin_unlock_bh(&mm->context.lock);
- }
+ table = ptdesc_address(ptdesc);
+ __arch_set_page_dat(table, 1);
+ memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
+ memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
return table;
}
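
With the 2K-fragment machinery gone, every process page table is a full 4K page, allocated and initialized the same way whether or not PGSTEs are in use. The generic pte-level hooks can then map straight onto these helpers; a sketch of the expected glue in asm/pgalloc.h (an assumption, not shown in this diff):

        /* sketch: pte-level hooks backed by page_table_alloc()/free() */
        #define pte_alloc_one(mm)	((pte_t *)page_table_alloc(mm))
        #define pte_free(mm, pte)	page_table_free(mm, (unsigned long *)(pte))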
void page_table_free(struct mm_struct *mm, unsigned long *table)
{
- struct page *page;
- unsigned int bit, mask;
-
- page = virt_to_page(table);
- if (!mm_alloc_pgste(mm)) {
- /* Free 2K page table fragment of a 4K page */
- bit = ((unsigned long) table & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
- spin_lock_bh(&mm->context.lock);
- mask = atomic_xor_bits(&page->_refcount, 1U << (bit + 24));
- mask >>= 24;
- if (mask & 3)
- list_add(&page->lru, &mm->context.pgtable_list);
- else
- list_del(&page->lru);
- spin_unlock_bh(&mm->context.lock);
- if (mask != 0)
- return;
- } else {
- atomic_xor_bits(&page->_refcount, 3U << 24);
- }
+ struct ptdesc *ptdesc = virt_to_ptdesc(table);
- pgtable_pte_page_dtor(page);
- __free_page(page);
+ if (pagetable_is_reserved(ptdesc))
+ return free_reserved_ptdesc(ptdesc);
+ pagetable_dtor_free(ptdesc);
}
-void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
- unsigned long vmaddr)
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static void pte_free_now(struct rcu_head *head)
{
- struct mm_struct *mm;
- struct page *page;
- unsigned int bit, mask;
-
- mm = tlb->mm;
- page = virt_to_page(table);
- if (mm_alloc_pgste(mm)) {
- gmap_unlink(mm, table, vmaddr);
- table = (unsigned long *) ((unsigned long)table | 3);
- tlb_remove_table(tlb, table);
- return;
- }
- bit = ((unsigned long) table & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t));
- spin_lock_bh(&mm->context.lock);
- mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
- mask >>= 24;
- if (mask & 3)
- list_add_tail(&page->lru, &mm->context.pgtable_list);
- else
- list_del(&page->lru);
- spin_unlock_bh(&mm->context.lock);
- table = (unsigned long *) ((unsigned long) table | (1U << bit));
- tlb_remove_table(tlb, table);
+ struct ptdesc *ptdesc = container_of(head, struct ptdesc, pt_rcu_head);
+
+ pagetable_dtor_free(ptdesc);
}
-void __tlb_remove_table(void *_table)
+void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable)
{
- unsigned int mask = (unsigned long) _table & 3;
- void *table = (void *)((unsigned long) _table ^ mask);
- struct page *page = virt_to_page(table);
+ struct ptdesc *ptdesc = virt_to_ptdesc(pgtable);
- switch (mask) {
- case 0: /* pmd, pud, or p4d */
- free_pages((unsigned long) table, 2);
- break;
- case 1: /* lower 2K of a 4K page table */
- case 2: /* higher 2K of a 4K page table */
- mask = atomic_xor_bits(&page->_refcount, mask << (4 + 24));
- mask >>= 24;
- if (mask != 0)
- break;
- fallthrough;
- case 3: /* 4K page table with pgstes */
- if (mask & 3)
- atomic_xor_bits(&page->_refcount, 3 << 24);
- pgtable_pte_page_dtor(page);
- __free_page(page);
- break;
- }
+ call_rcu(&ptdesc->pt_rcu_head, pte_free_now);
}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
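
pte_free_defer() exists for khugepaged-style collapse, where a page table is detached while lockless walkers (e.g. GUP-fast) may still be traversing it; freeing is therefore postponed by one RCU grace period via the ptdesc's pt_rcu_head. A caller-side sketch with illustrative names:

        /* sketch: detach the pmd-mapped table, free it after a grace period */
        pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd);
        /* ... flush TLB, drop page table locks ... */
        pte_free_defer(mm, pmd_pgtable(pgt_pmd));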
/*
* Base infrastructure required to generate basic asces, region, segment,
@@ -322,34 +192,39 @@ void __tlb_remove_table(void *_table)
static struct kmem_cache *base_pgt_cache;
-static unsigned long base_pgt_alloc(void)
+static unsigned long *base_pgt_alloc(void)
{
- u64 *table;
+ unsigned long *table;
table = kmem_cache_alloc(base_pgt_cache, GFP_KERNEL);
if (table)
- memset64(table, _PAGE_INVALID, PTRS_PER_PTE);
- return (unsigned long) table;
+ memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
+ return table;
}
-static void base_pgt_free(unsigned long table)
+static void base_pgt_free(unsigned long *table)
{
- kmem_cache_free(base_pgt_cache, (void *) table);
+ kmem_cache_free(base_pgt_cache, table);
}
-static unsigned long base_crst_alloc(unsigned long val)
+static unsigned long *base_crst_alloc(unsigned long val)
{
- unsigned long table;
+ unsigned long *table;
+ struct ptdesc *ptdesc;
- table = __get_free_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
- if (table)
- crst_table_init((unsigned long *)table, val);
+ ptdesc = pagetable_alloc(GFP_KERNEL, CRST_ALLOC_ORDER);
+ if (!ptdesc)
+ return NULL;
+ table = ptdesc_address(ptdesc);
+ crst_table_init(table, val);
return table;
}
-static void base_crst_free(unsigned long table)
+static void base_crst_free(unsigned long *table)
{
- free_pages(table, CRST_ALLOC_ORDER);
+ if (!table)
+ return;
+ pagetable_free(virt_to_ptdesc(table));
}
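
base_pgt_cache supplies the 2K page tables for these base ASCEs, while region and segment tables still come from full pages via pagetable_alloc(). The cache is created lazily elsewhere in this file; a sketch of that setup, assuming _PAGE_TABLE_SIZE-sized and -aligned slab objects:

        /* sketch: 2K slab objects for base page tables */
        base_pgt_cache = kmem_cache_create("base_pgt", _PAGE_TABLE_SIZE,
                                           _PAGE_TABLE_SIZE, 0, NULL);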
#define BASE_ADDR_END_FUNC(NAME, SIZE) \
@@ -361,7 +236,7 @@ static inline unsigned long base_##NAME##_addr_end(unsigned long addr, \
return (next - 1) < (end - 1) ? next : end; \
}
-BASE_ADDR_END_FUNC(page, _PAGE_SIZE)
+BASE_ADDR_END_FUNC(page, PAGE_SIZE)
BASE_ADDR_END_FUNC(segment, _SEGMENT_SIZE)
BASE_ADDR_END_FUNC(region3, _REGION3_SIZE)
BASE_ADDR_END_FUNC(region2, _REGION2_SIZE)
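
Each BASE_ADDR_END_FUNC() instantiation yields a helper that rounds addr up to the next SIZE boundary and clamps the result to end. For illustration, the segment variant expands roughly to:

        static inline unsigned long base_segment_addr_end(unsigned long addr,
                                                          unsigned long end)
        {
                unsigned long next = (addr + _SEGMENT_SIZE) & ~(_SEGMENT_SIZE - 1);

                return (next - 1) < (end - 1) ? next : end;
        }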
@@ -372,20 +247,20 @@ static inline unsigned long base_lra(unsigned long address)
unsigned long real;
asm volatile(
- " lra %0,0(%1)\n"
+ " lra %0,0(%1)"
: "=d" (real) : "a" (address) : "cc");
return real;
}
-static int base_page_walk(unsigned long origin, unsigned long addr,
+static int base_page_walk(unsigned long *origin, unsigned long addr,
unsigned long end, int alloc)
{
unsigned long *pte, next;
if (!alloc)
return 0;
- pte = (unsigned long *) origin;
- pte += (addr & _PAGE_INDEX) >> _PAGE_SHIFT;
+ pte = origin;
+ pte += (addr & _PAGE_INDEX) >> PAGE_SHIFT;
do {
next = base_page_addr_end(addr, end);
*pte = base_lra(addr);
@@ -393,13 +268,13 @@ static int base_page_walk(unsigned long origin, unsigned long addr,
return 0;
}
-static int base_segment_walk(unsigned long origin, unsigned long addr,
+static int base_segment_walk(unsigned long *origin, unsigned long addr,
unsigned long end, int alloc)
{
- unsigned long *ste, next, table;
+ unsigned long *ste, next, *table;
int rc;
- ste = (unsigned long *) origin;
+ ste = origin;
ste += (addr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
do {
next = base_segment_addr_end(addr, end);
@@ -409,9 +284,9 @@ static int base_segment_walk(unsigned long origin, unsigned long addr,
table = base_pgt_alloc();
if (!table)
return -ENOMEM;
- *ste = table | _SEGMENT_ENTRY;
+ *ste = __pa(table) | _SEGMENT_ENTRY;
}
- table = *ste & _SEGMENT_ENTRY_ORIGIN;
+ table = __va(*ste & _SEGMENT_ENTRY_ORIGIN);
rc = base_page_walk(table, addr, next, alloc);
if (rc)
return rc;
@@ -422,13 +297,13 @@ static int base_segment_walk(unsigned long origin, unsigned long addr,
return 0;
}
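
The pattern this series applies to all of the walkers is the same: a table entry stores the physical origin of the next-level table, while the walker itself works with virtual pointers, hence the paired conversions. As a sketch:

        /* sketch: entries hold physical origins, code uses virtual pointers */
        *ste = __pa(table) | _SEGMENT_ENTRY;          /* virt -> phys on store */
        table = __va(*ste & _SEGMENT_ENTRY_ORIGIN);   /* phys -> virt on load  */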
-static int base_region3_walk(unsigned long origin, unsigned long addr,
+static int base_region3_walk(unsigned long *origin, unsigned long addr,
unsigned long end, int alloc)
{
- unsigned long *rtte, next, table;
+ unsigned long *rtte, next, *table;
int rc;
- rtte = (unsigned long *) origin;
+ rtte = origin;
rtte += (addr & _REGION3_INDEX) >> _REGION3_SHIFT;
do {
next = base_region3_addr_end(addr, end);
@@ -438,9 +313,9 @@ static int base_region3_walk(unsigned long origin, unsigned long addr,
table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY);
if (!table)
return -ENOMEM;
- *rtte = table | _REGION3_ENTRY;
+ *rtte = __pa(table) | _REGION3_ENTRY;
}
- table = *rtte & _REGION_ENTRY_ORIGIN;
+ table = __va(*rtte & _REGION_ENTRY_ORIGIN);
rc = base_segment_walk(table, addr, next, alloc);
if (rc)
return rc;
@@ -450,13 +325,13 @@ static int base_region3_walk(unsigned long origin, unsigned long addr,
return 0;
}
-static int base_region2_walk(unsigned long origin, unsigned long addr,
+static int base_region2_walk(unsigned long *origin, unsigned long addr,
unsigned long end, int alloc)
{
- unsigned long *rste, next, table;
+ unsigned long *rste, next, *table;
int rc;
- rste = (unsigned long *) origin;
+ rste = origin;
rste += (addr & _REGION2_INDEX) >> _REGION2_SHIFT;
do {
next = base_region2_addr_end(addr, end);
@@ -466,9 +341,9 @@ static int base_region2_walk(unsigned long origin, unsigned long addr,
table = base_crst_alloc(_REGION3_ENTRY_EMPTY);
if (!table)
return -ENOMEM;
- *rste = table | _REGION2_ENTRY;
+ *rste = __pa(table) | _REGION2_ENTRY;
}
- table = *rste & _REGION_ENTRY_ORIGIN;
+ table = __va(*rste & _REGION_ENTRY_ORIGIN);
rc = base_region3_walk(table, addr, next, alloc);
if (rc)
return rc;
@@ -478,13 +353,13 @@ static int base_region2_walk(unsigned long origin, unsigned long addr,
return 0;
}
-static int base_region1_walk(unsigned long origin, unsigned long addr,
+static int base_region1_walk(unsigned long *origin, unsigned long addr,
unsigned long end, int alloc)
{
- unsigned long *rfte, next, table;
+ unsigned long *rfte, next, *table;
int rc;
- rfte = (unsigned long *) origin;
+ rfte = origin;
rfte += (addr & _REGION1_INDEX) >> _REGION1_SHIFT;
do {
next = base_region1_addr_end(addr, end);
@@ -494,9 +369,9 @@ static int base_region1_walk(unsigned long origin, unsigned long addr,
table = base_crst_alloc(_REGION2_ENTRY_EMPTY);
if (!table)
return -ENOMEM;
- *rfte = table | _REGION1_ENTRY;
+ *rfte = __pa(table) | _REGION1_ENTRY;
}
- table = *rfte & _REGION_ENTRY_ORIGIN;
+ table = __va(*rfte & _REGION_ENTRY_ORIGIN);
rc = base_region2_walk(table, addr, next, alloc);
if (rc)
return rc;
@@ -515,7 +390,7 @@ static int base_region1_walk(unsigned long origin, unsigned long addr,
*/
void base_asce_free(unsigned long asce)
{
- unsigned long table = asce & _ASCE_ORIGIN;
+ unsigned long *table = __va(asce & _ASCE_ORIGIN);
if (!asce)
return;
@@ -567,7 +442,7 @@ static int base_pgt_cache_init(void)
*/
unsigned long base_asce_alloc(unsigned long addr, unsigned long num_pages)
{
- unsigned long asce, table, end;
+ unsigned long asce, *table, end;
int rc;
if (base_pgt_cache_init())
@@ -578,25 +453,25 @@ unsigned long base_asce_alloc(unsigned long addr, unsigned long num_pages)
if (!table)
return 0;
rc = base_segment_walk(table, addr, end, 1);
- asce = table | _ASCE_TYPE_SEGMENT | _ASCE_TABLE_LENGTH;
+ asce = __pa(table) | _ASCE_TYPE_SEGMENT | _ASCE_TABLE_LENGTH;
} else if (end <= _REGION2_SIZE) {
table = base_crst_alloc(_REGION3_ENTRY_EMPTY);
if (!table)
return 0;
rc = base_region3_walk(table, addr, end, 1);
- asce = table | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH;
+ asce = __pa(table) | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH;
} else if (end <= _REGION1_SIZE) {
table = base_crst_alloc(_REGION2_ENTRY_EMPTY);
if (!table)
return 0;
rc = base_region2_walk(table, addr, end, 1);
- asce = table | _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH;
+ asce = __pa(table) | _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH;
} else {
table = base_crst_alloc(_REGION1_ENTRY_EMPTY);
if (!table)
return 0;
rc = base_region1_walk(table, addr, end, 1);
- asce = table | _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH;
+ asce = __pa(table) | _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH;
}
if (rc) {
base_asce_free(asce);
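
base_asce_alloc() builds a self-contained set of DAT tables mapping the given range 1:1 to real addresses and returns the ASCE describing it; base_asce_free() tears the whole hierarchy down again. A hypothetical caller sketch (buffer name and use are illustrative):

        unsigned long asce;

        asce = base_asce_alloc((unsigned long)buf, num_pages);
        if (!asce)
                return -ENOMEM;
        /* ... use the asce, e.g. load it for temporary DAT access ... */
        base_asce_free(asce);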