summaryrefslogtreecommitdiff
path: root/arch/x86/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/mm')
-rw-r--r--arch/x86/mm/Makefile2
-rw-r--r--arch/x86/mm/dump_pagetables.c59
-rw-r--r--arch/x86/mm/fault.c66
-rw-r--r--arch/x86/mm/gup.c470
-rw-r--r--arch/x86/mm/hugetlbpage.c9
-rw-r--r--arch/x86/mm/ident_map.c51
-rw-r--r--arch/x86/mm/init_32.c71
-rw-r--r--arch/x86/mm/init_64.c183
-rw-r--r--arch/x86/mm/ioremap.c3
-rw-r--r--arch/x86/mm/kasan_init_64.c34
-rw-r--r--arch/x86/mm/kaslr.c4
-rw-r--r--arch/x86/mm/mmap.c125
-rw-r--r--arch/x86/mm/mpx.c2
-rw-r--r--arch/x86/mm/numa.c4
-rw-r--r--arch/x86/mm/pageattr.c54
-rw-r--r--arch/x86/mm/pgtable.c36
-rw-r--r--arch/x86/mm/pgtable_32.c8
17 files changed, 531 insertions, 650 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 96d2b847e09e..0fbdcb64f9f8 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -2,7 +2,7 @@
KCOV_INSTRUMENT_tlb.o := n
obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
- pat.o pgtable.o physaddr.o gup.o setup_nx.o tlb.o
+ pat.o pgtable.o physaddr.o setup_nx.o tlb.o
# Make sure __phys_addr has no stackprotector
nostackp := $(call cc-option, -fno-stack-protector)
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 58b5bee7ea27..9f305be71a72 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -110,7 +110,8 @@ static struct addr_marker address_markers[] = {
#define PTE_LEVEL_MULT (PAGE_SIZE)
#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT)
#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT)
-#define PGD_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT)
+#define P4D_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT)
+#define PGD_LEVEL_MULT (PTRS_PER_PUD * P4D_LEVEL_MULT)
#define pt_dump_seq_printf(m, to_dmesg, fmt, args...) \
({ \
@@ -286,14 +287,13 @@ static void note_page(struct seq_file *m, struct pg_state *st,
}
}
-static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr,
- unsigned long P)
+static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, unsigned long P)
{
int i;
pte_t *start;
pgprotval_t prot;
- start = (pte_t *) pmd_page_vaddr(addr);
+ start = (pte_t *)pmd_page_vaddr(addr);
for (i = 0; i < PTRS_PER_PTE; i++) {
prot = pte_flags(*start);
st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT);
@@ -304,14 +304,13 @@ static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr,
#if PTRS_PER_PMD > 1
-static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr,
- unsigned long P)
+static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, unsigned long P)
{
int i;
pmd_t *start;
pgprotval_t prot;
- start = (pmd_t *) pud_page_vaddr(addr);
+ start = (pmd_t *)pud_page_vaddr(addr);
for (i = 0; i < PTRS_PER_PMD; i++) {
st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT);
if (!pmd_none(*start)) {
@@ -347,15 +346,14 @@ static bool pud_already_checked(pud_t *prev_pud, pud_t *pud, bool checkwx)
return checkwx && prev_pud && (pud_val(*prev_pud) == pud_val(*pud));
}
-static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
- unsigned long P)
+static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, unsigned long P)
{
int i;
pud_t *start;
pgprotval_t prot;
pud_t *prev_pud = NULL;
- start = (pud_t *) pgd_page_vaddr(addr);
+ start = (pud_t *)p4d_page_vaddr(addr);
for (i = 0; i < PTRS_PER_PUD; i++) {
st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT);
@@ -377,9 +375,42 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
}
#else
-#define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud(pgd_val(a)),p)
-#define pgd_large(a) pud_large(__pud(pgd_val(a)))
-#define pgd_none(a) pud_none(__pud(pgd_val(a)))
+#define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud(p4d_val(a)),p)
+#define p4d_large(a) pud_large(__pud(p4d_val(a)))
+#define p4d_none(a) pud_none(__pud(p4d_val(a)))
+#endif
+
+#if PTRS_PER_P4D > 1
+
+static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, unsigned long P)
+{
+ int i;
+ p4d_t *start;
+ pgprotval_t prot;
+
+ start = (p4d_t *)pgd_page_vaddr(addr);
+
+ for (i = 0; i < PTRS_PER_P4D; i++) {
+ st->current_address = normalize_addr(P + i * P4D_LEVEL_MULT);
+ if (!p4d_none(*start)) {
+ if (p4d_large(*start) || !p4d_present(*start)) {
+ prot = p4d_flags(*start);
+ note_page(m, st, __pgprot(prot), 2);
+ } else {
+ walk_pud_level(m, st, *start,
+ P + i * P4D_LEVEL_MULT);
+ }
+ } else
+ note_page(m, st, __pgprot(0), 2);
+
+ start++;
+ }
+}
+
+#else
+#define walk_p4d_level(m,s,a,p) walk_pud_level(m,s,__p4d(pgd_val(a)),p)
+#define pgd_large(a) p4d_large(__p4d(pgd_val(a)))
+#define pgd_none(a) p4d_none(__p4d(pgd_val(a)))
#endif
static inline bool is_hypervisor_range(int idx)
@@ -424,7 +455,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
prot = pgd_flags(*start);
note_page(m, &st, __pgprot(prot), 1);
} else {
- walk_pud_level(m, &st, *start,
+ walk_p4d_level(m, &st, *start,
i * PGD_LEVEL_MULT);
}
} else
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 428e31763cb9..8ad91a01cbc8 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -253,6 +253,7 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
{
unsigned index = pgd_index(address);
pgd_t *pgd_k;
+ p4d_t *p4d, *p4d_k;
pud_t *pud, *pud_k;
pmd_t *pmd, *pmd_k;
@@ -265,10 +266,15 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
/*
* set_pgd(pgd, *pgd_k); here would be useless on PAE
* and redundant with the set_pmd() on non-PAE. As would
- * set_pud.
+ * set_p4d/set_pud.
*/
- pud = pud_offset(pgd, address);
- pud_k = pud_offset(pgd_k, address);
+ p4d = p4d_offset(pgd, address);
+ p4d_k = p4d_offset(pgd_k, address);
+ if (!p4d_present(*p4d_k))
+ return NULL;
+
+ pud = pud_offset(p4d, address);
+ pud_k = pud_offset(p4d_k, address);
if (!pud_present(*pud_k))
return NULL;
@@ -384,6 +390,8 @@ static void dump_pagetable(unsigned long address)
{
pgd_t *base = __va(read_cr3());
pgd_t *pgd = &base[pgd_index(address)];
+ p4d_t *p4d;
+ pud_t *pud;
pmd_t *pmd;
pte_t *pte;
@@ -392,7 +400,9 @@ static void dump_pagetable(unsigned long address)
if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd))
goto out;
#endif
- pmd = pmd_offset(pud_offset(pgd, address), address);
+ p4d = p4d_offset(pgd, address);
+ pud = pud_offset(p4d, address);
+ pmd = pmd_offset(pud, address);
printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd));
/*
@@ -425,6 +435,7 @@ void vmalloc_sync_all(void)
static noinline int vmalloc_fault(unsigned long address)
{
pgd_t *pgd, *pgd_ref;
+ p4d_t *p4d, *p4d_ref;
pud_t *pud, *pud_ref;
pmd_t *pmd, *pmd_ref;
pte_t *pte, *pte_ref;
@@ -448,17 +459,37 @@ static noinline int vmalloc_fault(unsigned long address)
if (pgd_none(*pgd)) {
set_pgd(pgd, *pgd_ref);
arch_flush_lazy_mmu_mode();
- } else {
+ } else if (CONFIG_PGTABLE_LEVELS > 4) {
+ /*
+ * With folded p4d, pgd_none() is always false, so the pgd may
+ * point to an empty page table entry and pgd_page_vaddr()
+ * will return garbage.
+ *
+ * We will do the correct sanity check on the p4d level.
+ */
BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
}
+ /* With 4-level paging, copying happens on the p4d level. */
+ p4d = p4d_offset(pgd, address);
+ p4d_ref = p4d_offset(pgd_ref, address);
+ if (p4d_none(*p4d_ref))
+ return -1;
+
+ if (p4d_none(*p4d)) {
+ set_p4d(p4d, *p4d_ref);
+ arch_flush_lazy_mmu_mode();
+ } else {
+ BUG_ON(p4d_pfn(*p4d) != p4d_pfn(*p4d_ref));
+ }
+
/*
* Below here mismatches are bugs because these lower tables
* are shared:
*/
- pud = pud_offset(pgd, address);
- pud_ref = pud_offset(pgd_ref, address);
+ pud = pud_offset(p4d, address);
+ pud_ref = pud_offset(p4d_ref, address);
if (pud_none(*pud_ref))
return -1;
@@ -526,6 +557,7 @@ static void dump_pagetable(unsigned long address)
{
pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK);
pgd_t *pgd = base + pgd_index(address);
+ p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
@@ -538,7 +570,15 @@ static void dump_pagetable(unsigned long address)
if (!pgd_present(*pgd))
goto out;
- pud = pud_offset(pgd, address);
+ p4d = p4d_offset(pgd, address);
+ if (bad_address(p4d))
+ goto bad;
+
+ printk("P4D %lx ", p4d_val(*p4d));
+ if (!p4d_present(*p4d) || p4d_large(*p4d))
+ goto out;
+
+ pud = pud_offset(p4d, address);
if (bad_address(pud))
goto bad;
@@ -1082,6 +1122,7 @@ static noinline int
spurious_fault(unsigned long error_code, unsigned long address)
{
pgd_t *pgd;
+ p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
@@ -1104,7 +1145,14 @@ spurious_fault(unsigned long error_code, unsigned long address)
if (!pgd_present(*pgd))
return 0;
- pud = pud_offset(pgd, address);
+ p4d = p4d_offset(pgd, address);
+ if (!p4d_present(*p4d))
+ return 0;
+
+ if (p4d_large(*p4d))
+ return spurious_fault_check(error_code, (pte_t *) p4d);
+
+ pud = pud_offset(p4d, address);
if (!pud_present(*pud))
return 0;
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
deleted file mode 100644
index 99c7805a9693..000000000000
--- a/arch/x86/mm/gup.c
+++ /dev/null
@@ -1,470 +0,0 @@
-/*
- * Lockless get_user_pages_fast for x86
- *
- * Copyright (C) 2008 Nick Piggin
- * Copyright (C) 2008 Novell Inc.
- */
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/vmstat.h>
-#include <linux/highmem.h>
-#include <linux/swap.h>
-#include <linux/memremap.h>
-
-#include <asm/mmu_context.h>
-#include <asm/pgtable.h>
-
-static inline pte_t gup_get_pte(pte_t *ptep)
-{
-#ifndef CONFIG_X86_PAE
- return READ_ONCE(*ptep);
-#else
- /*
- * With get_user_pages_fast, we walk down the pagetables without taking
- * any locks. For this we would like to load the pointers atomically,
- * but that is not possible (without expensive cmpxchg8b) on PAE. What
- * we do have is the guarantee that a pte will only either go from not
- * present to present, or present to not present or both -- it will not
- * switch to a completely different present page without a TLB flush in
- * between; something that we are blocking by holding interrupts off.
- *
- * Setting ptes from not present to present goes:
- * ptep->pte_high = h;
- * smp_wmb();
- * ptep->pte_low = l;
- *
- * And present to not present goes:
- * ptep->pte_low = 0;
- * smp_wmb();
- * ptep->pte_high = 0;
- *
- * We must ensure here that the load of pte_low sees l iff pte_high
- * sees h. We load pte_high *after* loading pte_low, which ensures we
- * don't see an older value of pte_high. *Then* we recheck pte_low,
- * which ensures that we haven't picked up a changed pte high. We might
- * have got rubbish values from pte_low and pte_high, but we are
- * guaranteed that pte_low will not have the present bit set *unless*
- * it is 'l'. And get_user_pages_fast only operates on present ptes, so
- * we're safe.
- *
- * gup_get_pte should not be used or copied outside gup.c without being
- * very careful -- it does not atomically load the pte or anything that
- * is likely to be useful for you.
- */
- pte_t pte;
-
-retry:
- pte.pte_low = ptep->pte_low;
- smp_rmb();
- pte.pte_high = ptep->pte_high;
- smp_rmb();
- if (unlikely(pte.pte_low != ptep->pte_low))
- goto retry;
-
- return pte;
-#endif
-}
-
-static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
-{
- while ((*nr) - nr_start) {
- struct page *page = pages[--(*nr)];
-
- ClearPageReferenced(page);
- put_page(page);
- }
-}
-
-/*
- * 'pteval' can come from a pte, pmd or pud. We only check
- * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the
- * same value on all 3 types.
- */
-static inline int pte_allows_gup(unsigned long pteval, int write)
-{
- unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER;
-
- if (write)
- need_pte_bits |= _PAGE_RW;
-
- if ((pteval & need_pte_bits) != need_pte_bits)
- return 0;
-
- /* Check memory protection keys permissions. */
- if (!__pkru_allows_pkey(pte_flags_pkey(pteval), write))
- return 0;
-
- return 1;
-}
-
-/*
- * The performance critical leaf functions are made noinline otherwise gcc
- * inlines everything into a single function which results in too much
- * register pressure.
- */
-static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
- unsigned long end, int write, struct page **pages, int *nr)
-{
- struct dev_pagemap *pgmap = NULL;
- int nr_start = *nr;
- pte_t *ptep;
-
- ptep = pte_offset_map(&pmd, addr);
- do {
- pte_t pte = gup_get_pte(ptep);
- struct page *page;
-
- /* Similar to the PMD case, NUMA hinting must take slow path */
- if (pte_protnone(pte)) {
- pte_unmap(ptep);
- return 0;
- }
-
- if (pte_devmap(pte)) {
- pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
- if (unlikely(!pgmap)) {
- undo_dev_pagemap(nr, nr_start, pages);
- pte_unmap(ptep);
- return 0;
- }
- } else if (!pte_allows_gup(pte_val(pte), write) ||
- pte_special(pte)) {
- pte_unmap(ptep);
- return 0;
- }
- VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
- page = pte_page(pte);
- get_page(page);
- put_dev_pagemap(pgmap);
- SetPageReferenced(page);
- pages[*nr] = page;
- (*nr)++;
-
- } while (ptep++, addr += PAGE_SIZE, addr != end);
- pte_unmap(ptep - 1);
-
- return 1;
-}
-
-static inline void get_head_page_multiple(struct page *page, int nr)
-{
- VM_BUG_ON_PAGE(page != compound_head(page), page);
- VM_BUG_ON_PAGE(page_count(page) == 0, page);
- page_ref_add(page, nr);
- SetPageReferenced(page);
-}
-
-static int __gup_device_huge(unsigned long pfn, unsigned long addr,
- unsigned long end, struct page **pages, int *nr)
-{
- int nr_start = *nr;
- struct dev_pagemap *pgmap = NULL;
-
- do {
- struct page *page = pfn_to_page(pfn);
-
- pgmap = get_dev_pagemap(pfn, pgmap);
- if (unlikely(!pgmap)) {
- undo_dev_pagemap(nr, nr_start, pages);
- return 0;
- }
- SetPageReferenced(page);
- pages[*nr] = page;
- get_page(page);
- put_dev_pagemap(pgmap);
- (*nr)++;
- pfn++;
- } while (addr += PAGE_SIZE, addr != end);
- return 1;
-}
-
-static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr,
- unsigned long end, struct page **pages, int *nr)
-{
- unsigned long fault_pfn;
-
- fault_pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
- return __gup_device_huge(fault_pfn, addr, end, pages, nr);
-}
-
-static int __gup_device_huge_pud(pud_t pud, unsigned long addr,
- unsigned long end, struct page **pages, int *nr)
-{
- unsigned long fault_pfn;
-
- fault_pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
- return __gup_device_huge(fault_pfn, addr, end, pages, nr);
-}
-
-static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
- unsigned long end, int write, struct page **pages, int *nr)
-{
- struct page *head, *page;
- int refs;
-
- if (!pte_allows_gup(pmd_val(pmd), write))
- return 0;
-
- VM_BUG_ON(!pfn_valid(pmd_pfn(pmd)));
- if (pmd_devmap(pmd))
- return __gup_device_huge_pmd(pmd, addr, end, pages, nr);
-
- /* hugepages are never "special" */
- VM_BUG_ON(pmd_flags(pmd) & _PAGE_SPECIAL);
-
- refs = 0;
- head = pmd_page(pmd);
- page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
- do {
- VM_BUG_ON_PAGE(compound_head(page) != head, page);
- pages[*nr] = page;
- (*nr)++;
- page++;
- refs++;
- } while (addr += PAGE_SIZE, addr != end);
- get_head_page_multiple(head, refs);
-
- return 1;
-}
-
-static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
- int write, struct page **pages, int *nr)
-{
- unsigned long next;
- pmd_t *pmdp;
-
- pmdp = pmd_offset(&pud, addr);
- do {
- pmd_t pmd = *pmdp;
-
- next = pmd_addr_end(addr, end);
- if (pmd_none(pmd))
- return 0;
- if (unlikely(pmd_large(pmd) || !pmd_present(pmd))) {
- /*
- * NUMA hinting faults need to be handled in the GUP
- * slowpath for accounting purposes and so that they
- * can be serialised against THP migration.
- */
- if (pmd_protnone(pmd))
- return 0;
- if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
- return 0;
- } else {
- if (!gup_pte_range(pmd, addr, next, write, pages, nr))
- return 0;
- }
- } while (pmdp++, addr = next, addr != end);
-
- return 1;
-}
-
-static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
- unsigned long end, int write, struct page **pages, int *nr)
-{
- struct page *head, *page;
- int refs;
-
- if (!pte_allows_gup(pud_val(pud), write))
- return 0;
-
- VM_BUG_ON(!pfn_valid(pud_pfn(pud)));
- if (pud_devmap(pud))
- return __gup_device_huge_pud(pud, addr, end, pages, nr);
-
- /* hugepages are never "special" */
- VM_BUG_ON(pud_flags(pud) & _PAGE_SPECIAL);
-
- refs = 0;
- head = pud_page(pud);
- page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
- do {
- VM_BUG_ON_PAGE(compound_head(page) != head, page);
- pages[*nr] = page;
- (*nr)++;
- page++;
- refs++;
- } while (addr += PAGE_SIZE, addr != end);
- get_head_page_multiple(head, refs);
-
- return 1;
-}
-
-static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
- int write, struct page **pages, int *nr)
-{
- unsigned long next;
- pud_t *pudp;
-
- pudp = pud_offset(&pgd, addr);
- do {
- pud_t pud = *pudp;
-
- next = pud_addr_end(addr, end);
- if (pud_none(pud))
- return 0;
- if (unlikely(pud_large(pud))) {
- if (!gup_huge_pud(pud, addr, next, write, pages, nr))
- return 0;
- } else {
- if (!gup_pmd_range(pud, addr, next, write, pages, nr))
- return 0;
- }
- } while (pudp++, addr = next, addr != end);
-
- return 1;
-}
-
-/*
- * Like get_user_pages_fast() except its IRQ-safe in that it won't fall
- * back to the regular GUP.
- */
-int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
- struct page **pages)
-{
- struct mm_struct *mm = current->mm;
- unsigned long addr, len, end;
- unsigned long next;
- unsigned long flags;
- pgd_t *pgdp;
- int nr = 0;
-
- start &= PAGE_MASK;
- addr = start;
- len = (unsigned long) nr_pages << PAGE_SHIFT;
- end = start + len;
- if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
- (void __user *)start, len)))
- return 0;
-
- /*
- * XXX: batch / limit 'nr', to avoid large irq off latency
- * needs some instrumenting to determine the common sizes used by
- * important workloads (eg. DB2), and whether limiting the batch size
- * will decrease performance.
- *
- * It seems like we're in the clear for the moment. Direct-IO is
- * the main guy that batches up lots of get_user_pages, and even
- * they are limited to 64-at-a-time which is not so many.
- */
- /*
- * This doesn't prevent pagetable teardown, but does prevent
- * the pagetables and pages from being freed on x86.
- *
- * So long as we atomically load page table pointers versus teardown
- * (which we do on x86, with the above PAE exception), we can follow the
- * address down to the the page and take a ref on it.
- */
- local_irq_save(flags);
- pgdp = pgd_offset(mm, addr);
- do {
- pgd_t pgd = *pgdp;
-
- next = pgd_addr_end(addr, end);
- if (pgd_none(pgd))
- break;
- if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
- break;
- } while (pgdp++, addr = next, addr != end);
- local_irq_restore(flags);
-
- return nr;
-}
-
-/**
- * get_user_pages_fast() - pin user pages in memory
- * @start: starting user address
- * @nr_pages: number of pages from start to pin
- * @write: whether pages will be written to
- * @pages: array that receives pointers to the pages pinned.
- * Should be at least nr_pages long.
- *
- * Attempt to pin user pages in memory without taking mm->mmap_sem.
- * If not successful, it will fall back to taking the lock and
- * calling get_user_pages().
- *
- * Returns number of pages pinned. This may be fewer than the number
- * requested. If nr_pages is 0 or negative, returns 0. If no pages
- * were pinned, returns -errno.
- */
-int get_user_pages_fast(unsigned long start, int nr_pages, int write,
- struct page **pages)
-{
- struct mm_struct *mm = current->mm;
- unsigned long addr, len, end;
- unsigned long next;
- pgd_t *pgdp;
- int nr = 0;
-
- start &= PAGE_MASK;
- addr = start;
- len = (unsigned long) nr_pages << PAGE_SHIFT;
-
- end = start + len;
- if (end < start)
- goto slow_irqon;
-
-#ifdef CONFIG_X86_64
- if (end >> __VIRTUAL_MASK_SHIFT)
- goto slow_irqon;
-#endif
-
- /*
- * XXX: batch / limit 'nr', to avoid large irq off latency
- * needs some instrumenting to determine the common sizes used by
- * important workloads (eg. DB2), and whether limiting the batch size
- * will decrease performance.
- *
- * It seems like we're in the clear for the moment. Direct-IO is
- * the main guy that batches up lots of get_user_pages, and even
- * they are limited to 64-at-a-time which is not so many.
- */
- /*
- * This doesn't prevent pagetable teardown, but does prevent
- * the pagetables and pages from being freed on x86.
- *
- * So long as we atomically load page table pointers versus teardown
- * (which we do on x86, with the above PAE exception), we can follow the
- * address down to the the page and take a ref on it.
- */
- local_irq_disable();
- pgdp = pgd_offset(mm, addr);
- do {
- pgd_t pgd = *pgdp;
-
- next = pgd_addr_end(addr, end);
- if (pgd_none(pgd))
- goto slow;
- if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
- goto slow;
- } while (pgdp++, addr = next, addr != end);
- local_irq_enable();
-
- VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
- return nr;
-
- {
- int ret;
-
-slow:
- local_irq_enable();
-slow_irqon:
- /* Try to get the remaining pages with get_user_pages */
- start += nr << PAGE_SHIFT;
- pages += nr;
-
- ret = get_user_pages_unlocked(start,
- (end - start) >> PAGE_SHIFT,
- pages, write ? FOLL_WRITE : 0);
-
- /* Have to be a bit careful with return values */
- if (nr > 0) {
- if (ret < 0)
- ret = nr;
- else
- ret += nr;
- }
-
- return ret;
- }
-}
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index c5066a260803..302f43fd9c28 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -12,10 +12,12 @@
#include <linux/pagemap.h>
#include <linux/err.h>
#include <linux/sysctl.h>
+#include <linux/compat.h>
#include <asm/mman.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/pgalloc.h>
+#include <asm/elf.h>
#if 0 /* This is just for testing */
struct page *
@@ -82,8 +84,9 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
info.flags = 0;
info.length = len;
- info.low_limit = current->mm->mmap_legacy_base;
- info.high_limit = TASK_SIZE;
+ info.low_limit = get_mmap_base(1);
+ info.high_limit = in_compat_syscall() ?
+ tasksize_32bit() : tasksize_64bit();
info.align_mask = PAGE_MASK & ~huge_page_mask(h);
info.align_offset = 0;
return vm_unmapped_area(&info);
@@ -100,7 +103,7 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
info.flags = VM_UNMAPPED_AREA_TOPDOWN;
info.length = len;
info.low_limit = PAGE_SIZE;
- info.high_limit = current->mm->mmap_base;
+ info.high_limit = get_mmap_base(0);
info.align_mask = PAGE_MASK & ~huge_page_mask(h);
info.align_offset = 0;
addr = vm_unmapped_area(&info);
diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c
index 4473cb4f8b90..04210a29dd60 100644
--- a/arch/x86/mm/ident_map.c
+++ b/arch/x86/mm/ident_map.c
@@ -45,6 +45,34 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
return 0;
}
+static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page,
+ unsigned long addr, unsigned long end)
+{
+ unsigned long next;
+
+ for (; addr < end; addr = next) {
+ p4d_t *p4d = p4d_page + p4d_index(addr);
+ pud_t *pud;
+
+ next = (addr & P4D_MASK) + P4D_SIZE;
+ if (next > end)
+ next = end;
+
+ if (p4d_present(*p4d)) {
+ pud = pud_offset(p4d, 0);
+ ident_pud_init(info, pud, addr, next);
+ continue;
+ }
+ pud = (pud_t *)info->alloc_pgt_page(info->context);
+ if (!pud)
+ return -ENOMEM;
+ ident_pud_init(info, pud, addr, next);
+ set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
+ }
+
+ return 0;
+}
+
int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
unsigned long pstart, unsigned long pend)
{
@@ -55,27 +83,36 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
for (; addr < end; addr = next) {
pgd_t *pgd = pgd_page + pgd_index(addr);
- pud_t *pud;
+ p4d_t *p4d;
next = (addr & PGDIR_MASK) + PGDIR_SIZE;
if (next > end)
next = end;
if (pgd_present(*pgd)) {
- pud = pud_offset(pgd, 0);
- result = ident_pud_init(info, pud, addr, next);
+ p4d = p4d_offset(pgd, 0);
+ result = ident_p4d_init(info, p4d, addr, next);
if (result)
return result;
continue;
}
- pud = (pud_t *)info->alloc_pgt_page(info->context);
- if (!pud)
+ p4d = (p4d_t *)info->alloc_pgt_page(info->context);
+ if (!p4d)
return -ENOMEM;
- result = ident_pud_init(info, pud, addr, next);
+ result = ident_p4d_init(info, p4d, addr, next);
if (result)
return result;
- set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
+ if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+ set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE));
+ } else {
+ /*
+ * With p4d folded, pgd is equal to p4d.
+ * The pgd entry has to point to the pud page table in this case.
+ */
+ pud_t *pud = pud_offset(p4d, 0);
+ set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
+ }
}
return 0;
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 1fa97c941abe..f34d275ee201 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -56,8 +56,6 @@
unsigned long highstart_pfn, highend_pfn;
-static noinline int do_test_wp_bit(void);
-
bool __read_mostly __vmalloc_start_set = false;
/*
@@ -67,6 +65,7 @@ bool __read_mostly __vmalloc_start_set = false;
*/
static pmd_t * __init one_md_table_init(pgd_t *pgd)
{
+ p4d_t *p4d;
pud_t *pud;
pmd_t *pmd_table;
@@ -75,13 +74,15 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
pmd_table = (pmd_t *)alloc_low_page();
paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
- pud = pud_offset(pgd, 0);
+ p4d = p4d_offset(pgd, 0);
+ pud = pud_offset(p4d, 0);
BUG_ON(pmd_table != pmd_offset(pud, 0));
return pmd_table;
}
#endif
- pud = pud_offset(pgd, 0);
+ p4d = p4d_offset(pgd, 0);
+ pud = pud_offset(p4d, 0);
pmd_table = pmd_offset(pud, 0);
return pmd_table;
@@ -390,8 +391,11 @@ pte_t *kmap_pte;
static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr)
{
- return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr),
- vaddr), vaddr), vaddr);
+ pgd_t *pgd = pgd_offset_k(vaddr);
+ p4d_t *p4d = p4d_offset(pgd, vaddr);
+ pud_t *pud = pud_offset(p4d, vaddr);
+ pmd_t *pmd = pmd_offset(pud, vaddr);
+ return pte_offset_kernel(pmd, vaddr);
}
static void __init kmap_init(void)
@@ -410,6 +414,7 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
{
unsigned long vaddr;
pgd_t *pgd;
+ p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
@@ -418,7 +423,8 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);
pgd = swapper_pg_dir + pgd_index(vaddr);
- pud = pud_offset(pgd, vaddr);
+ p4d = p4d_offset(pgd, vaddr);
+ pud = pud_offset(p4d, vaddr);
pmd = pmd_offset(pud, vaddr);
pte = pte_offset_kernel(pmd, vaddr);
pkmap_page_table = pte;
@@ -450,6 +456,7 @@ void __init native_pagetable_init(void)
{
unsigned long pfn, va;
pgd_t *pgd, *base = swapper_pg_dir;
+ p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
@@ -469,7 +476,8 @@ void __init native_pagetable_init(void)
if (!pgd_present(*pgd))
break;
- pud = pud_offset(pgd, va);
+ p4d = p4d_offset(pgd, va);
+ pud = pud_offset(p4d, va);
pmd = pmd_offset(pud, va);
if (!pmd_present(*pmd))
break;
@@ -716,20 +724,20 @@ void __init paging_init(void)
*/
static void __init test_wp_bit(void)
{
- printk(KERN_INFO
- "Checking if this processor honours the WP bit even in supervisor mode...");
+ char z = 0;
+
+ printk(KERN_INFO "Checking if this processor honours the WP bit even in supervisor mode...");
- /* Any page-aligned address will do, the test is non-destructive */
- __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_KERNEL_RO);
- boot_cpu_data.wp_works_ok = do_test_wp_bit();
- clear_fixmap(FIX_WP_TEST);
+ __set_fixmap(FIX_WP_TEST, __pa_symbol(empty_zero_page), PAGE_KERNEL_RO);
- if (!boot_cpu_data.wp_works_ok) {
- printk(KERN_CONT "No.\n");
- panic("Linux doesn't support CPUs with broken WP.");
- } else {
+ if (probe_kernel_write((char *)fix_to_virt(FIX_WP_TEST), &z, 1)) {
+ clear_fixmap(FIX_WP_TEST);
printk(KERN_CONT "Ok.\n");
+ return;
}
+
+ printk(KERN_CONT "No.\n");
+ panic("Linux doesn't support CPUs with broken WP.");
}
void __init mem_init(void)
@@ -811,8 +819,7 @@ void __init mem_init(void)
BUG_ON(VMALLOC_START >= VMALLOC_END);
BUG_ON((unsigned long)high_memory > VMALLOC_START);
- if (boot_cpu_data.wp_works_ok < 0)
- test_wp_bit();
+ test_wp_bit();
}
#ifdef CONFIG_MEMORY_HOTPLUG
@@ -840,30 +847,6 @@ int arch_remove_memory(u64 start, u64 size)
#endif
#endif
-/*
- * This function cannot be __init, since exceptions don't work in that
- * section. Put this after the callers, so that it cannot be inlined.
- */
-static noinline int do_test_wp_bit(void)
-{
- char tmp_reg;
- int flag;
-
- __asm__ __volatile__(
- " movb %0, %1 \n"
- "1: movb %1, %0 \n"
- " xorl %2, %2 \n"
- "2: \n"
- _ASM_EXTABLE(1b,2b)
- :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)),
- "=q" (tmp_reg),
- "=r" (flag)
- :"2" (1)
- :"memory");
-
- return flag;
-}
-
int kernel_set_to_readonly __read_mostly;
void set_kernel_text_rw(void)
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index f6da869810a8..a242139df8fe 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -97,28 +97,38 @@ void sync_global_pgds(unsigned long start, unsigned long end)
unsigned long address;
for (address = start; address <= end; address += PGDIR_SIZE) {
- const pgd_t *pgd_ref = pgd_offset_k(address);
+ pgd_t *pgd_ref = pgd_offset_k(address);
+ const p4d_t *p4d_ref;
struct page *page;
- if (pgd_none(*pgd_ref))
+ /*
+ * With folded p4d, pgd_none() is always false, we need to
+ * handle synchonization on p4d level.
+ */
+ BUILD_BUG_ON(pgd_none(*pgd_ref));
+ p4d_ref = p4d_offset(pgd_ref, address);
+
+ if (p4d_none(*p4d_ref))
continue;
spin_lock(&pgd_lock);
list_for_each_entry(page, &pgd_list, lru) {
pgd_t *pgd;
+ p4d_t *p4d;
spinlock_t *pgt_lock;
pgd = (pgd_t *)page_address(page) + pgd_index(address);
+ p4d = p4d_offset(pgd, address);
/* the pgt_lock only for Xen */
pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
spin_lock(pgt_lock);
- if (!pgd_none(*pgd_ref) && !pgd_none(*pgd))
- BUG_ON(pgd_page_vaddr(*pgd)
- != pgd_page_vaddr(*pgd_ref));
+ if (!p4d_none(*p4d_ref) && !p4d_none(*p4d))
+ BUG_ON(p4d_page_vaddr(*p4d)
+ != p4d_page_vaddr(*p4d_ref));
- if (pgd_none(*pgd))
- set_pgd(pgd, *pgd_ref);
+ if (p4d_none(*p4d))
+ set_p4d(p4d, *p4d_ref);
spin_unlock(pgt_lock);
}
@@ -149,16 +159,28 @@ static __ref void *spp_getpage(void)
return ptr;
}
-static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr)
+static p4d_t *fill_p4d(pgd_t *pgd, unsigned long vaddr)
{
if (pgd_none(*pgd)) {
- pud_t *pud = (pud_t *)spp_getpage();
- pgd_populate(&init_mm, pgd, pud);
- if (pud != pud_offset(pgd, 0))
+ p4d_t *p4d = (p4d_t *)spp_getpage();
+ pgd_populate(&init_mm, pgd, p4d);
+ if (p4d != p4d_offset(pgd, 0))
printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
- pud, pud_offset(pgd, 0));
+ p4d, p4d_offset(pgd, 0));
+ }
+ return p4d_offset(pgd, vaddr);
+}
+
+static pud_t *fill_pud(p4d_t *p4d, unsigned long vaddr)
+{
+ if (p4d_none(*p4d)) {
+ pud_t *pud = (pud_t *)spp_getpage();
+ p4d_populate(&init_mm, p4d, pud);
+ if (pud != pud_offset(p4d, 0))
+ printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
+ pud, pud_offset(p4d, 0));
}
- return pud_offset(pgd, vaddr);
+ return pud_offset(p4d, vaddr);
}
static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr)
@@ -167,7 +189,7 @@ static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr)
pmd_t *pmd = (pmd_t *) spp_getpage();
pud_populate(&init_mm, pud, pmd);
if (pmd != pmd_offset(pud, 0))
- printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
+ printk(KERN_ERR "PAGETABLE BUG #02! %p <-> %p\n",
pmd, pmd_offset(pud, 0));
}
return pmd_offset(pud, vaddr);
@@ -179,20 +201,15 @@ static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr)
pte_t *pte = (pte_t *) spp_getpage();
pmd_populate_kernel(&init_mm, pmd, pte);
if (pte != pte_offset_kernel(pmd, 0))
- printk(KERN_ERR "PAGETABLE BUG #02!\n");
+ printk(KERN_ERR "PAGETABLE BUG #03!\n");
}
return pte_offset_kernel(pmd, vaddr);
}
-void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
+static void __set_pte_vaddr(pud_t *pud, unsigned long vaddr, pte_t new_pte)
{
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
-
- pud = pud_page + pud_index(vaddr);
- pmd = fill_pmd(pud, vaddr);
- pte = fill_pte(pmd, vaddr);
+ pmd_t *pmd = fill_pmd(pud, vaddr);
+ pte_t *pte = fill_pte(pmd, vaddr);
set_pte(pte, new_pte);
@@ -203,10 +220,25 @@ void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
__flush_tlb_one(vaddr);
}
+void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte)
+{
+ p4d_t *p4d = p4d_page + p4d_index(vaddr);
+ pud_t *pud = fill_pud(p4d, vaddr);
+
+ __set_pte_vaddr(pud, vaddr, new_pte);
+}
+
+void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
+{
+ pud_t *pud = pud_page + pud_index(vaddr);
+
+ __set_pte_vaddr(pud, vaddr, new_pte);
+}
+
void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
{
pgd_t *pgd;
- pud_t *pud_page;
+ p4d_t *p4d_page;
pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval));
@@ -216,17 +248,20 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
"PGD FIXMAP MISSING, it should be setup in head.S!\n");
return;
}
- pud_page = (pud_t*)pgd_page_vaddr(*pgd);
- set_pte_vaddr_pud(pud_page, vaddr, pteval);
+
+ p4d_page = p4d_offset(pgd, 0);
+ set_pte_vaddr_p4d(p4d_page, vaddr, pteval);
}
pmd_t * __init populate_extra_pmd(unsigned long vaddr)
{
pgd_t *pgd;
+ p4d_t *p4d;
pud_t *pud;
pgd = pgd_offset_k(vaddr);
- pud = fill_pud(pgd, vaddr);
+ p4d = fill_p4d(pgd, vaddr);
+ pud = fill_pud(p4d, vaddr);
return fill_pmd(pud, vaddr);
}
@@ -245,6 +280,7 @@ static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
enum page_cache_mode cache)
{
pgd_t *pgd;
+ p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pgprot_t prot;
@@ -255,11 +291,17 @@ static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
pgd = pgd_offset_k((unsigned long)__va(phys));
if (pgd_none(*pgd)) {
+ p4d = (p4d_t *) spp_getpage();
+ set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE |
+ _PAGE_USER));
+ }
+ p4d = p4d_offset(pgd, (unsigned long)__va(phys));
+ if (p4d_none(*p4d)) {
pud = (pud_t *) spp_getpage();
- set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE |
+ set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE |
_PAGE_USER));
}
- pud = pud_offset(pgd, (unsigned long)__va(phys));
+ pud = pud_offset(p4d, (unsigned long)__va(phys));
if (pud_none(*pud)) {
pmd = (pmd_t *) spp_getpage();
set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
@@ -563,12 +605,15 @@ kernel_physical_mapping_init(unsigned long paddr_start,
for (; vaddr < vaddr_end; vaddr = vaddr_next) {
pgd_t *pgd = pgd_offset_k(vaddr);
+ p4d_t *p4d;
pud_t *pud;
vaddr_next = (vaddr & PGDIR_MASK) + PGDIR_SIZE;
- if (pgd_val(*pgd)) {
- pud = (pud_t *)pgd_page_vaddr(*pgd);
+ BUILD_BUG_ON(pgd_none(*pgd));
+ p4d = p4d_offset(pgd, vaddr);
+ if (p4d_val(*p4d)) {
+ pud = (pud_t *)p4d_page_vaddr(*p4d);
paddr_last = phys_pud_init(pud, __pa(vaddr),
__pa(vaddr_end),
page_size_mask);
@@ -580,7 +625,7 @@ kernel_physical_mapping_init(unsigned long paddr_start,
page_size_mask);
spin_lock(&init_mm.page_table_lock);
- pgd_populate(&init_mm, pgd, pud);
+ p4d_populate(&init_mm, p4d, pud);
spin_unlock(&init_mm.page_table_lock);
pgd_changed = true;
}
@@ -726,6 +771,24 @@ static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
spin_unlock(&init_mm.page_table_lock);
}
+static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d)
+{
+ pud_t *pud;
+ int i;
+
+ for (i = 0; i < PTRS_PER_PUD; i++) {
+ pud = pud_start + i;
+ if (!pud_none(*pud))
+ return;
+ }
+
+ /* free a pud talbe */
+ free_pagetable(p4d_page(*p4d), 0);
+ spin_lock(&init_mm.page_table_lock);
+ p4d_clear(p4d);
+ spin_unlock(&init_mm.page_table_lock);
+}
+
static void __meminit
remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
bool direct)
@@ -908,6 +971,32 @@ remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end,
update_page_count(PG_LEVEL_1G, -pages);
}
+static void __meminit
+remove_p4d_table(p4d_t *p4d_start, unsigned long addr, unsigned long end,
+ bool direct)
+{
+ unsigned long next, pages = 0;
+ pud_t *pud_base;
+ p4d_t *p4d;
+
+ p4d = p4d_start + p4d_index(addr);
+ for (; addr < end; addr = next, p4d++) {
+ next = p4d_addr_end(addr, end);
+
+ if (!p4d_present(*p4d))
+ continue;
+
+ BUILD_BUG_ON(p4d_large(*p4d));
+
+ pud_base = (pud_t *)p4d_page_vaddr(*p4d);
+ remove_pud_table(pud_base, addr, next, direct);
+ free_pud_table(pud_base, p4d);
+ }
+
+ if (direct)
+ update_page_count(PG_LEVEL_512G, -pages);
+}
+
/* start and end are both virtual address. */
static void __meminit
remove_pagetable(unsigned long start, unsigned long end, bool direct)
@@ -915,7 +1004,7 @@ remove_pagetable(unsigned long start, unsigned long end, bool direct)
unsigned long next;
unsigned long addr;
pgd_t *pgd;
- pud_t *pud;
+ p4d_t *p4d;
for (addr = start; addr < end; addr = next) {
next = pgd_addr_end(addr, end);
@@ -924,8 +1013,8 @@ remove_pagetable(unsigned long start, unsigned long end, bool direct)
if (!pgd_present(*pgd))
continue;
- pud = (pud_t *)pgd_page_vaddr(*pgd);
- remove_pud_table(pud, addr, next, direct);
+ p4d = (p4d_t *)pgd_page_vaddr(*pgd);
+ remove_p4d_table(p4d, addr, next, direct);
}
flush_tlb_all();
@@ -1090,6 +1179,7 @@ int kern_addr_valid(unsigned long addr)
{
unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
pgd_t *pgd;
+ p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
@@ -1101,7 +1191,11 @@ int kern_addr_valid(unsigned long addr)
if (pgd_none(*pgd))
return 0;
- pud = pud_offset(pgd, addr);
+ p4d = p4d_offset(pgd, addr);
+ if (p4d_none(*p4d))
+ return 0;
+
+ pud = pud_offset(p4d, addr);
if (pud_none(*pud))
return 0;
@@ -1158,6 +1252,7 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start,
unsigned long addr;
unsigned long next;
pgd_t *pgd;
+ p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
@@ -1168,7 +1263,11 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start,
if (!pgd)
return -ENOMEM;
- pud = vmemmap_pud_populate(pgd, addr, node);
+ p4d = vmemmap_p4d_populate(pgd, addr, node);
+ if (!p4d)
+ return -ENOMEM;
+
+ pud = vmemmap_pud_populate(p4d, addr, node);
if (!pud)
return -ENOMEM;
@@ -1236,6 +1335,7 @@ void register_page_bootmem_memmap(unsigned long section_nr,
unsigned long end = (unsigned long)(start_page + size);
unsigned long next;
pgd_t *pgd;
+ p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
unsigned int nr_pages;
@@ -1251,7 +1351,14 @@ void register_page_bootmem_memmap(unsigned long section_nr,
}
get_page_bootmem(section_nr, pgd_page(*pgd), MIX_SECTION_INFO);
- pud = pud_offset(pgd, addr);
+ p4d = p4d_offset(pgd, addr);
+ if (p4d_none(*p4d)) {
+ next = (addr + PAGE_SIZE) & PAGE_MASK;
+ continue;
+ }
+ get_page_bootmem(section_nr, p4d_page(*p4d), MIX_SECTION_INFO);
+
+ pud = pud_offset(p4d, addr);
if (pud_none(*pud)) {
next = (addr + PAGE_SIZE) & PAGE_MASK;
continue;
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index c43b6b33463a..e4f7b25df18e 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -426,7 +426,8 @@ static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
/* Don't assume we're using swapper_pg_dir at this point */
pgd_t *base = __va(read_cr3());
pgd_t *pgd = &base[pgd_index(addr)];
- pud_t *pud = pud_offset(pgd, addr);
+ p4d_t *p4d = p4d_offset(pgd, addr);
+ pud_t *pud = pud_offset(p4d, addr);
pmd_t *pmd = pmd_offset(pud, addr);
return pmd;
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 66d20174e35f..0c7d8129bed6 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -1,3 +1,4 @@
+#define DISABLE_BRANCH_PROFILING
#define pr_fmt(fmt) "kasan: " fmt
#include <linux/bootmem.h>
#include <linux/kasan.h>
@@ -33,8 +34,19 @@ static int __init map_range(struct range *range)
static void __init clear_pgds(unsigned long start,
unsigned long end)
{
- for (; start < end; start += PGDIR_SIZE)
- pgd_clear(pgd_offset_k(start));
+ pgd_t *pgd;
+
+ for (; start < end; start += PGDIR_SIZE) {
+ pgd = pgd_offset_k(start);
+ /*
+ * With folded p4d, pgd_clear() is nop, use p4d_clear()
+ * instead.
+ */
+ if (CONFIG_PGTABLE_LEVELS < 5)
+ p4d_clear(p4d_offset(pgd, start));
+ else
+ pgd_clear(pgd);
+ }
}
static void __init kasan_map_early_shadow(pgd_t *pgd)
@@ -44,8 +56,18 @@ static void __init kasan_map_early_shadow(pgd_t *pgd)
unsigned long end = KASAN_SHADOW_END;
for (i = pgd_index(start); start < end; i++) {
- pgd[i] = __pgd(__pa_nodebug(kasan_zero_pud)
- | _KERNPG_TABLE);
+ switch (CONFIG_PGTABLE_LEVELS) {
+ case 4:
+ pgd[i] = __pgd(__pa_nodebug(kasan_zero_pud) |
+ _KERNPG_TABLE);
+ break;
+ case 5:
+ pgd[i] = __pgd(__pa_nodebug(kasan_zero_p4d) |
+ _KERNPG_TABLE);
+ break;
+ default:
+ BUILD_BUG();
+ }
start += PGDIR_SIZE;
}
}
@@ -73,6 +95,7 @@ void __init kasan_early_init(void)
pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL;
pmdval_t pmd_val = __pa_nodebug(kasan_zero_pte) | _KERNPG_TABLE;
pudval_t pud_val = __pa_nodebug(kasan_zero_pmd) | _KERNPG_TABLE;
+ p4dval_t p4d_val = __pa_nodebug(kasan_zero_pud) | _KERNPG_TABLE;
for (i = 0; i < PTRS_PER_PTE; i++)
kasan_zero_pte[i] = __pte(pte_val);
@@ -83,6 +106,9 @@ void __init kasan_early_init(void)
for (i = 0; i < PTRS_PER_PUD; i++)
kasan_zero_pud[i] = __pud(pud_val);
+ for (i = 0; CONFIG_PGTABLE_LEVELS >= 5 && i < PTRS_PER_P4D; i++)
+ kasan_zero_p4d[i] = __p4d(p4d_val);
+
kasan_map_early_shadow(early_level4_pgt);
kasan_map_early_shadow(init_level4_pgt);
}
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index 887e57182716..aed206475aa7 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -48,7 +48,7 @@ static const unsigned long vaddr_start = __PAGE_OFFSET_BASE;
#if defined(CONFIG_X86_ESPFIX64)
static const unsigned long vaddr_end = ESPFIX_BASE_ADDR;
#elif defined(CONFIG_EFI)
-static const unsigned long vaddr_end = EFI_VA_START;
+static const unsigned long vaddr_end = EFI_VA_END;
#else
static const unsigned long vaddr_end = __START_KERNEL_map;
#endif
@@ -105,7 +105,7 @@ void __init kernel_randomize_memory(void)
*/
BUILD_BUG_ON(vaddr_start >= vaddr_end);
BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_ESPFIX64) &&
- vaddr_end >= EFI_VA_START);
+ vaddr_end >= EFI_VA_END);
BUILD_BUG_ON((IS_ENABLED(CONFIG_X86_ESPFIX64) ||
IS_ENABLED(CONFIG_EFI)) &&
vaddr_end >= __START_KERNEL_map);
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 7940166c799b..19ad095b41df 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -30,30 +30,44 @@
#include <linux/limits.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
+#include <linux/compat.h>
#include <asm/elf.h>
struct va_alignment __read_mostly va_align = {
.flags = -1,
};
-static unsigned long stack_maxrandom_size(void)
+unsigned long tasksize_32bit(void)
+{
+ return IA32_PAGE_OFFSET;
+}
+
+unsigned long tasksize_64bit(void)
+{
+ return TASK_SIZE_MAX;
+}
+
+static unsigned long stack_maxrandom_size(unsigned long task_size)
{
unsigned long max = 0;
if ((current->flags & PF_RANDOMIZE) &&
!(current->personality & ADDR_NO_RANDOMIZE)) {
- max = ((-1UL) & STACK_RND_MASK) << PAGE_SHIFT;
+ max = (-1UL) & __STACK_RND_MASK(task_size == tasksize_32bit());
+ max <<= PAGE_SHIFT;
}
return max;
}
-/*
- * Top of mmap area (just below the process stack).
- *
- * Leave an at least ~128 MB hole with possible stack randomization.
- */
-#define MIN_GAP (128*1024*1024UL + stack_maxrandom_size())
-#define MAX_GAP (TASK_SIZE/6*5)
+#ifdef CONFIG_COMPAT
+# define mmap32_rnd_bits mmap_rnd_compat_bits
+# define mmap64_rnd_bits mmap_rnd_bits
+#else
+# define mmap32_rnd_bits mmap_rnd_bits
+# define mmap64_rnd_bits mmap_rnd_bits
+#endif
+
+#define SIZE_128M (128 * 1024 * 1024UL)
static int mmap_is_legacy(void)
{
@@ -66,54 +80,91 @@ static int mmap_is_legacy(void)
return sysctl_legacy_va_layout;
}
-unsigned long arch_mmap_rnd(void)
+static unsigned long arch_rnd(unsigned int rndbits)
{
- unsigned long rnd;
-
- if (mmap_is_ia32())
-#ifdef CONFIG_COMPAT
- rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
-#else
- rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
-#endif
- else
- rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
+ return (get_random_long() & ((1UL << rndbits) - 1)) << PAGE_SHIFT;
+}
- return rnd << PAGE_SHIFT;
+unsigned long arch_mmap_rnd(void)
+{
+ if (!(current->flags & PF_RANDOMIZE))
+ return 0;
+ return arch_rnd(mmap_is_ia32() ? mmap32_rnd_bits : mmap64_rnd_bits);
}
-static unsigned long mmap_base(unsigned long rnd)
+static unsigned long mmap_base(unsigned long rnd, unsigned long task_size)
{
unsigned long gap = rlimit(RLIMIT_STACK);
+ unsigned long gap_min, gap_max;
+
+ /*
+ * Top of mmap area (just below the process stack).
+ * Leave an at least ~128 MB hole with possible stack randomization.
+ */
+ gap_min = SIZE_128M + stack_maxrandom_size(task_size);
+ gap_max = (task_size / 6) * 5;
- if (gap < MIN_GAP)
- gap = MIN_GAP;
- else if (gap > MAX_GAP)
- gap = MAX_GAP;
+ if (gap < gap_min)
+ gap = gap_min;
+ else if (gap > gap_max)
+ gap = gap_max;
+
+ return PAGE_ALIGN(task_size - gap - rnd);
+}
- return PAGE_ALIGN(TASK_SIZE - gap - rnd);
+static unsigned long mmap_legacy_base(unsigned long rnd,
+ unsigned long task_size)
+{
+ return __TASK_UNMAPPED_BASE(task_size) + rnd;
}
/*
* This function, called very early during the creation of a new
* process VM image, sets up which VM layout function to use:
*/
+static void arch_pick_mmap_base(unsigned long *base, unsigned long *legacy_base,
+ unsigned long random_factor, unsigned long task_size)
+{
+ *legacy_base = mmap_legacy_base(random_factor, task_size);
+ if (mmap_is_legacy())
+ *base = *legacy_base;
+ else
+ *base = mmap_base(random_factor, task_size);
+}
+
void arch_pick_mmap_layout(struct mm_struct *mm)
{
- unsigned long random_factor = 0UL;
+ if (mmap_is_legacy())
+ mm->get_unmapped_area = arch_get_unmapped_area;
+ else
+ mm->get_unmapped_area = arch_get_unmapped_area_topdown;
- if (current->flags & PF_RANDOMIZE)
- random_factor = arch_mmap_rnd();
+ arch_pick_mmap_base(&mm->mmap_base, &mm->mmap_legacy_base,
+ arch_rnd(mmap64_rnd_bits), tasksize_64bit());
+
+#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
+ /*
+ * The mmap syscall mapping base decision depends solely on the
+ * syscall type (64-bit or compat). This applies for 64bit
+ * applications and 32bit applications. The 64bit syscall uses
+ * mmap_base, the compat syscall uses mmap_compat_base.
+ */
+ arch_pick_mmap_base(&mm->mmap_compat_base, &mm->mmap_compat_legacy_base,
+ arch_rnd(mmap32_rnd_bits), tasksize_32bit());
+#endif
+}
- mm->mmap_legacy_base = TASK_UNMAPPED_BASE + random_factor;
+unsigned long get_mmap_base(int is_legacy)
+{
+ struct mm_struct *mm = current->mm;
- if (mmap_is_legacy()) {
- mm->mmap_base = mm->mmap_legacy_base;
- mm->get_unmapped_area = arch_get_unmapped_area;
- } else {
- mm->mmap_base = mmap_base(random_factor);
- mm->get_unmapped_area = arch_get_unmapped_area_topdown;
+#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
+ if (in_compat_syscall()) {
+ return is_legacy ? mm->mmap_compat_legacy_base
+ : mm->mmap_compat_base;
}
+#endif
+ return is_legacy ? mm->mmap_legacy_base : mm->mmap_base;
}
const char *arch_vma_name(struct vm_area_struct *vma)
diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
index 5126dfd52b18..cd44ae727df7 100644
--- a/arch/x86/mm/mpx.c
+++ b/arch/x86/mm/mpx.c
@@ -590,7 +590,7 @@ static unsigned long mpx_bd_entry_to_bt_addr(struct mm_struct *mm,
* we might run off the end of the bounds table if we are on
* a 64-bit kernel and try to get 8 bytes.
*/
-int get_user_bd_entry(struct mm_struct *mm, unsigned long *bd_entry_ret,
+static int get_user_bd_entry(struct mm_struct *mm, unsigned long *bd_entry_ret,
long __user *bd_entry_ptr)
{
u32 bd_entry_32;
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index f9d99535f233..25504d5aa816 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -201,7 +201,7 @@ static void __init alloc_node_data(int nid)
nd_pa = __memblock_alloc_base(nd_size, SMP_CACHE_BYTES,
MEMBLOCK_ALLOC_ACCESSIBLE);
if (!nd_pa) {
- pr_err("Cannot find %zu bytes in node %d\n",
+ pr_err("Cannot find %zu bytes in any node (initial node: %d)\n",
nd_size, nid);
return;
}
@@ -225,7 +225,7 @@ static void __init alloc_node_data(int nid)
* numa_cleanup_meminfo - Cleanup a numa_meminfo
* @mi: numa_meminfo to clean up
*
- * Sanitize @mi by merging and removing unncessary memblks. Also check for
+ * Sanitize @mi by merging and removing unnecessary memblks. Also check for
* conflicts and clear unused memblks.
*
* RETURNS:
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index a57e8e02f457..56b22fa504df 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -346,6 +346,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
unsigned int *level)
{
+ p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
@@ -354,7 +355,15 @@ pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
if (pgd_none(*pgd))
return NULL;
- pud = pud_offset(pgd, address);
+ p4d = p4d_offset(pgd, address);
+ if (p4d_none(*p4d))
+ return NULL;
+
+ *level = PG_LEVEL_512G;
+ if (p4d_large(*p4d) || !p4d_present(*p4d))
+ return (pte_t *)p4d;
+
+ pud = pud_offset(p4d, address);
if (pud_none(*pud))
return NULL;
@@ -406,13 +415,18 @@ static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
pmd_t *lookup_pmd_address(unsigned long address)
{
pgd_t *pgd;
+ p4d_t *p4d;
pud_t *pud;
pgd = pgd_offset_k(address);
if (pgd_none(*pgd))
return NULL;
- pud = pud_offset(pgd, address);
+ p4d = p4d_offset(pgd, address);
+ if (p4d_none(*p4d) || p4d_large(*p4d) || !p4d_present(*p4d))
+ return NULL;
+
+ pud = pud_offset(p4d, address);
if (pud_none(*pud) || pud_large(*pud) || !pud_present(*pud))
return NULL;
@@ -477,11 +491,13 @@ static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
list_for_each_entry(page, &pgd_list, lru) {
pgd_t *pgd;
+ p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pgd = (pgd_t *)page_address(page) + pgd_index(address);
- pud = pud_offset(pgd, address);
+ p4d = p4d_offset(pgd, address);
+ pud = pud_offset(p4d, address);
pmd = pmd_offset(pud, address);
set_pte_atomic((pte_t *)pmd, pte);
}
@@ -836,9 +852,9 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
pud_clear(pud);
}
-static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
+static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end)
{
- pud_t *pud = pud_offset(pgd, start);
+ pud_t *pud = pud_offset(p4d, start);
/*
* Not on a GB page boundary?
@@ -1004,8 +1020,8 @@ static long populate_pmd(struct cpa_data *cpa,
return num_pages;
}
-static long populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd,
- pgprot_t pgprot)
+static int populate_pud(struct cpa_data *cpa, unsigned long start, p4d_t *p4d,
+ pgprot_t pgprot)
{
pud_t *pud;
unsigned long end;
@@ -1026,7 +1042,7 @@ static long populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd,
cur_pages = (pre_end - start) >> PAGE_SHIFT;
cur_pages = min_t(int, (int)cpa->numpages, cur_pages);
- pud = pud_offset(pgd, start);
+ pud = pud_offset(p4d, start);
/*
* Need a PMD page?
@@ -1047,7 +1063,7 @@ static long populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd,
if (cpa->numpages == cur_pages)
return cur_pages;
- pud = pud_offset(pgd, start);
+ pud = pud_offset(p4d, start);
pud_pgprot = pgprot_4k_2_large(pgprot);
/*
@@ -1067,7 +1083,7 @@ static long populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd,
if (start < end) {
long tmp;
- pud = pud_offset(pgd, start);
+ pud = pud_offset(p4d, start);
if (pud_none(*pud))
if (alloc_pmd_page(pud))
return -1;
@@ -1090,33 +1106,43 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
{
pgprot_t pgprot = __pgprot(_KERNPG_TABLE);
pud_t *pud = NULL; /* shut up gcc */
+ p4d_t *p4d;
pgd_t *pgd_entry;
long ret;
pgd_entry = cpa->pgd + pgd_index(addr);
+ if (pgd_none(*pgd_entry)) {
+ p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+ if (!p4d)
+ return -1;
+
+ set_pgd(pgd_entry, __pgd(__pa(p4d) | _KERNPG_TABLE));
+ }
+
/*
* Allocate a PUD page and hand it down for mapping.
*/
- if (pgd_none(*pgd_entry)) {
+ p4d = p4d_offset(pgd_entry, addr);
+ if (p4d_none(*p4d)) {
pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
if (!pud)
return -1;
- set_pgd(pgd_entry, __pgd(__pa(pud) | _KERNPG_TABLE));
+ set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
}
pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr);
pgprot_val(pgprot) |= pgprot_val(cpa->mask_set);
- ret = populate_pud(cpa, addr, pgd_entry, pgprot);
+ ret = populate_pud(cpa, addr, p4d, pgprot);
if (ret < 0) {
/*
* Leave the PUD page in place in case some other CPU or thread
* already found it, but remove any useless entries we just
* added to it.
*/
- unmap_pud_range(pgd_entry, addr,
+ unmap_pud_range(p4d, addr,
addr + (cpa->numpages << PAGE_SHIFT));
return ret;
}
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 6cbdff26bb96..508a708eb9a6 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -81,6 +81,14 @@ void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
tlb_remove_page(tlb, virt_to_page(pud));
}
+
+#if CONFIG_PGTABLE_LEVELS > 4
+void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
+{
+ paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
+ tlb_remove_page(tlb, virt_to_page(p4d));
+}
+#endif /* CONFIG_PGTABLE_LEVELS > 4 */
#endif /* CONFIG_PGTABLE_LEVELS > 3 */
#endif /* CONFIG_PGTABLE_LEVELS > 2 */
@@ -120,7 +128,7 @@ static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
references from swapper_pg_dir. */
if (CONFIG_PGTABLE_LEVELS == 2 ||
(CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
- CONFIG_PGTABLE_LEVELS == 4) {
+ CONFIG_PGTABLE_LEVELS >= 4) {
clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
swapper_pg_dir + KERNEL_PGD_BOUNDARY,
KERNEL_PGD_PTRS);
@@ -261,13 +269,15 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
{
+ p4d_t *p4d;
pud_t *pud;
int i;
if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */
return;
- pud = pud_offset(pgd, 0);
+ p4d = p4d_offset(pgd, 0);
+ pud = pud_offset(p4d, 0);
for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) {
pmd_t *pmd = pmds[i];
@@ -580,6 +590,28 @@ void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys,
}
#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+#ifdef CONFIG_X86_5LEVEL
+/**
+ * p4d_set_huge - setup kernel P4D mapping
+ *
+ * No 512GB pages yet -- always return 0
+ */
+int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
+{
+ return 0;
+}
+
+/**
+ * p4d_clear_huge - clear kernel P4D mapping when it is set
+ *
+ * No 512GB pages yet -- always return 0
+ */
+int p4d_clear_huge(p4d_t *p4d)
+{
+ return 0;
+}
+#endif
+
/**
* pud_set_huge - setup kernel PUD mapping
*
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index de53c52551a5..b9bd5b8b14fa 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -26,6 +26,7 @@ unsigned int __VMALLOC_RESERVE = 128 << 20;
void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
{
pgd_t *pgd;
+ p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
@@ -35,7 +36,12 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
BUG();
return;
}
- pud = pud_offset(pgd, vaddr);
+ p4d = p4d_offset(pgd, vaddr);
+ if (p4d_none(*p4d)) {
+ BUG();
+ return;
+ }
+ pud = pud_offset(p4d, vaddr);
if (pud_none(*pud)) {
BUG();
return;