diff options
Diffstat (limited to 'arch/x86/mm/pgtable.c')
| -rw-r--r-- | arch/x86/mm/pgtable.c | 671 |
1 files changed, 527 insertions, 144 deletions
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index dfa537a03be1..2e5ecfdce73c 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -1,60 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/mm.h> #include <linux/gfp.h> +#include <linux/hugetlb.h> #include <asm/pgalloc.h> -#include <asm/pgtable.h> #include <asm/tlb.h> #include <asm/fixmap.h> +#include <asm/mtrr.h> -#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO - -#ifdef CONFIG_HIGHPTE -#define PGALLOC_USER_GFP __GFP_HIGHMEM -#else -#define PGALLOC_USER_GFP 0 +#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK +phys_addr_t physical_mask __ro_after_init = (1ULL << __PHYSICAL_MASK_SHIFT) - 1; +EXPORT_SYMBOL(physical_mask); +SYM_PIC_ALIAS(physical_mask); #endif -gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP; - -pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) -{ - return (pte_t *)__get_free_page(PGALLOC_GFP); -} - -pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) +pgtable_t pte_alloc_one(struct mm_struct *mm) { - struct page *pte; - - pte = alloc_pages(__userpte_alloc_gfp, 0); - if (pte) - pgtable_page_ctor(pte); - return pte; + return __pte_alloc_one(mm, GFP_PGTABLE_USER); } -static int __init setup_userpte(char *arg) -{ - if (!arg) - return -EINVAL; - - /* - * "userpte=nohigh" disables allocation of user pagetables in - * high memory. 
- */ - if (strcmp(arg, "nohigh") == 0) - __userpte_alloc_gfp &= ~__GFP_HIGHMEM; - else - return -EINVAL; - return 0; -} -early_param("userpte", setup_userpte); - void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) { - pgtable_page_dtor(pte); paravirt_release_pte(page_to_pfn(pte)); - tlb_remove_page(tlb, pte); + tlb_remove_ptdesc(tlb, page_ptdesc(pte)); } -#if PAGETABLE_LEVELS > 2 +#if CONFIG_PGTABLE_LEVELS > 2 void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) { paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT); @@ -65,72 +35,65 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) #ifdef CONFIG_X86_PAE tlb->need_flush_all = 1; #endif - tlb_remove_page(tlb, virt_to_page(pmd)); + tlb_remove_ptdesc(tlb, virt_to_ptdesc(pmd)); } -#if PAGETABLE_LEVELS > 3 +#if CONFIG_PGTABLE_LEVELS > 3 void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) { paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); - tlb_remove_page(tlb, virt_to_page(pud)); + tlb_remove_ptdesc(tlb, virt_to_ptdesc(pud)); +} + +#if CONFIG_PGTABLE_LEVELS > 4 +void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d) +{ + paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT); + tlb_remove_ptdesc(tlb, virt_to_ptdesc(p4d)); } -#endif /* PAGETABLE_LEVELS > 3 */ -#endif /* PAGETABLE_LEVELS > 2 */ +#endif /* CONFIG_PGTABLE_LEVELS > 4 */ +#endif /* CONFIG_PGTABLE_LEVELS > 3 */ +#endif /* CONFIG_PGTABLE_LEVELS > 2 */ static inline void pgd_list_add(pgd_t *pgd) { - struct page *page = virt_to_page(pgd); + struct ptdesc *ptdesc = virt_to_ptdesc(pgd); - list_add(&page->lru, &pgd_list); + list_add(&ptdesc->pt_list, &pgd_list); } static inline void pgd_list_del(pgd_t *pgd) { - struct page *page = virt_to_page(pgd); + struct ptdesc *ptdesc = virt_to_ptdesc(pgd); - list_del(&page->lru); + list_del(&ptdesc->pt_list); } -#define UNSHARED_PTRS_PER_PGD \ - (SHARED_KERNEL_PMD ? 
KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) - - static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm) { - BUILD_BUG_ON(sizeof(virt_to_page(pgd)->index) < sizeof(mm)); - virt_to_page(pgd)->index = (pgoff_t)mm; + virt_to_ptdesc(pgd)->pt_mm = mm; } struct mm_struct *pgd_page_get_mm(struct page *page) { - return (struct mm_struct *)page->index; + return page_ptdesc(page)->pt_mm; } static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd) { - /* If the pgd points to a shared pagetable level (either the - ptes in non-PAE, or shared PMD in PAE), then just copy the - references from swapper_pg_dir. */ - if (PAGETABLE_LEVELS == 2 || - (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD) || - PAGETABLE_LEVELS == 4) { + /* PAE preallocates all its PMDs. No cloning needed. */ + if (!IS_ENABLED(CONFIG_X86_PAE)) clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, swapper_pg_dir + KERNEL_PGD_BOUNDARY, KERNEL_PGD_PTRS); - } - /* list required to sync kernel mapping updates */ - if (!SHARED_KERNEL_PMD) { - pgd_set_mm(pgd, mm); - pgd_list_add(pgd); - } + /* List used to sync kernel mapping updates */ + pgd_set_mm(pgd, mm); + pgd_list_add(pgd); } static void pgd_dtor(pgd_t *pgd) { - if (SHARED_KERNEL_PMD) - return; - spin_lock(&pgd_lock); pgd_list_del(pgd); spin_unlock(&pgd_lock); @@ -154,12 +117,22 @@ static void pgd_dtor(pgd_t *pgd) * processor notices the update. Since this is expensive, and * all 4 top-level entries are used almost immediately in a * new process's life, we just pre-populate them here. + */ +#define PREALLOCATED_PMDS PTRS_PER_PGD + +/* + * "USER_PMDS" are the PMDs for the user copy of the page tables when + * PTI is enabled. They do not exist when PTI is disabled. Note that + * this is distinct from the user _portion_ of the kernel page tables + * which always exists. * - * Also, if we're in a paravirt environment where the kernel pmd is - * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate - * and initialize the kernel pmds here. 
+ * We allocate separate PMDs for the kernel part of the user page-table + * when PTI is enabled. We need them to map the per-process LDT into the + * user-space page-table. */ -#define PREALLOCATED_PMDS UNSHARED_PTRS_PER_PGD +#define PREALLOCATED_USER_PMDS (boot_cpu_has(X86_FEATURE_PTI) ? \ + KERNEL_PGD_PTRS : 0) +#define MAX_PREALLOCATED_USER_PMDS KERNEL_PGD_PTRS void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) { @@ -181,32 +154,56 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) /* No need to prepopulate any pagetable entries in non-PAE modes. */ #define PREALLOCATED_PMDS 0 - +#define PREALLOCATED_USER_PMDS 0 +#define MAX_PREALLOCATED_USER_PMDS 0 #endif /* CONFIG_X86_PAE */ -static void free_pmds(pmd_t *pmds[]) +static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count) { int i; + struct ptdesc *ptdesc; - for(i = 0; i < PREALLOCATED_PMDS; i++) - if (pmds[i]) - free_page((unsigned long)pmds[i]); + for (i = 0; i < count; i++) + if (pmds[i]) { + ptdesc = virt_to_ptdesc(pmds[i]); + + pagetable_dtor(ptdesc); + pagetable_free(ptdesc); + mm_dec_nr_pmds(mm); + } } -static int preallocate_pmds(pmd_t *pmds[]) +static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count) { int i; bool failed = false; + gfp_t gfp = GFP_PGTABLE_USER; + + if (mm == &init_mm) + gfp &= ~__GFP_ACCOUNT; + gfp &= ~__GFP_HIGHMEM; - for(i = 0; i < PREALLOCATED_PMDS; i++) { - pmd_t *pmd = (pmd_t *)__get_free_page(PGALLOC_GFP); - if (pmd == NULL) + for (i = 0; i < count; i++) { + pmd_t *pmd = NULL; + struct ptdesc *ptdesc = pagetable_alloc(gfp, 0); + + if (!ptdesc) + failed = true; + if (ptdesc && !pagetable_pmd_ctor(mm, ptdesc)) { + pagetable_free(ptdesc); + ptdesc = NULL; failed = true; + } + if (ptdesc) { + mm_inc_nr_pmds(mm); + pmd = ptdesc_address(ptdesc); + } + pmds[i] = pmd; } if (failed) { - free_pmds(pmds); + free_pmds(mm, pmds, count); return -ENOMEM; } @@ -219,33 +216,48 @@ static int preallocate_pmds(pmd_t *pmds[]) * preallocate 
which never got a corresponding vma will need to be * freed manually. */ +static void mop_up_one_pmd(struct mm_struct *mm, pgd_t *pgdp) +{ + pgd_t pgd = *pgdp; + + if (pgd_val(pgd) != 0) { + pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd); + + pgd_clear(pgdp); + + paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT); + pmd_free(mm, pmd); + mm_dec_nr_pmds(mm); + } +} + static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) { int i; - for(i = 0; i < PREALLOCATED_PMDS; i++) { - pgd_t pgd = pgdp[i]; + for (i = 0; i < PREALLOCATED_PMDS; i++) + mop_up_one_pmd(mm, &pgdp[i]); - if (pgd_val(pgd) != 0) { - pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd); +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION - pgdp[i] = native_make_pgd(0); + if (!boot_cpu_has(X86_FEATURE_PTI)) + return; - paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT); - pmd_free(mm, pmd); - } - } + pgdp = kernel_to_user_pgdp(pgdp); + + for (i = 0; i < PREALLOCATED_USER_PMDS; i++) + mop_up_one_pmd(mm, &pgdp[i + KERNEL_PGD_BOUNDARY]); +#endif } static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) { + p4d_t *p4d; pud_t *pud; int i; - if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */ - return; - - pud = pud_offset(pgd, 0); + p4d = p4d_offset(pgd, 0); + pud = pud_offset(p4d, 0); for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) { pmd_t *pmd = pmds[i]; @@ -258,24 +270,79 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) } } +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION +static void pgd_prepopulate_user_pmd(struct mm_struct *mm, + pgd_t *k_pgd, pmd_t *pmds[]) +{ + pgd_t *s_pgd = kernel_to_user_pgdp(swapper_pg_dir); + pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd); + p4d_t *u_p4d; + pud_t *u_pud; + int i; + + u_p4d = p4d_offset(u_pgd, 0); + u_pud = pud_offset(u_p4d, 0); + + s_pgd += KERNEL_PGD_BOUNDARY; + u_pud += KERNEL_PGD_BOUNDARY; + + for (i = 0; i < PREALLOCATED_USER_PMDS; i++, u_pud++, s_pgd++) { + pmd_t *pmd = pmds[i]; + + memcpy(pmd, (pmd_t 
*)pgd_page_vaddr(*s_pgd), + sizeof(pmd_t) * PTRS_PER_PMD); + + pud_populate(mm, u_pud, pmd); + } + +} +#else +static void pgd_prepopulate_user_pmd(struct mm_struct *mm, + pgd_t *k_pgd, pmd_t *pmds[]) +{ +} +#endif + +static inline pgd_t *_pgd_alloc(struct mm_struct *mm) +{ + /* + * PTI and Xen need a whole page for the PAE PGD + * even though the hardware only needs 32 bytes. + * + * For simplicity, allocate a page for all users. + */ + return __pgd_alloc(mm, pgd_allocation_order()); +} + +static inline void _pgd_free(struct mm_struct *mm, pgd_t *pgd) +{ + __pgd_free(mm, pgd); +} + pgd_t *pgd_alloc(struct mm_struct *mm) { pgd_t *pgd; + pmd_t *u_pmds[MAX_PREALLOCATED_USER_PMDS]; pmd_t *pmds[PREALLOCATED_PMDS]; - pgd = (pgd_t *)__get_free_page(PGALLOC_GFP); + pgd = _pgd_alloc(mm); if (pgd == NULL) goto out; mm->pgd = pgd; - if (preallocate_pmds(pmds) != 0) + if (sizeof(pmds) != 0 && + preallocate_pmds(mm, pmds, PREALLOCATED_PMDS) != 0) goto out_free_pgd; - if (paravirt_pgd_alloc(mm) != 0) + if (sizeof(u_pmds) != 0 && + preallocate_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS) != 0) goto out_free_pmds; + if (paravirt_pgd_alloc(mm) != 0) + goto out_free_user_pmds; + /* * Make sure that pre-populating the pmds is atomic with * respect to anything walking the pgd_list, so that they @@ -284,16 +351,24 @@ pgd_t *pgd_alloc(struct mm_struct *mm) spin_lock(&pgd_lock); pgd_ctor(mm, pgd); - pgd_prepopulate_pmd(mm, pgd, pmds); + if (sizeof(pmds) != 0) + pgd_prepopulate_pmd(mm, pgd, pmds); + + if (sizeof(u_pmds) != 0) + pgd_prepopulate_user_pmd(mm, pgd, u_pmds); spin_unlock(&pgd_lock); return pgd; +out_free_user_pmds: + if (sizeof(u_pmds) != 0) + free_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS); out_free_pmds: - free_pmds(pmds); + if (sizeof(pmds) != 0) + free_pmds(mm, pmds, PREALLOCATED_PMDS); out_free_pgd: - free_page((unsigned long)pgd); + _pgd_free(mm, pgd); out: return NULL; } @@ -303,7 +378,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd) pgd_mop_up_pmds(mm, pgd); 
pgd_dtor(pgd); paravirt_pgd_free(mm, pgd); - free_page((unsigned long)pgd); + _pgd_free(mm, pgd); } /* @@ -319,10 +394,8 @@ int ptep_set_access_flags(struct vm_area_struct *vma, { int changed = !pte_same(*ptep, entry); - if (changed && dirty) { - *ptep = entry; - pte_update_defer(vma->vm_mm, address, ptep); - } + if (changed && dirty) + set_pte(ptep, entry); return changed; } @@ -337,8 +410,7 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, VM_BUG_ON(address & ~HPAGE_PMD_MASK); if (changed && dirty) { - *pmdp = entry; - pmd_update_defer(vma->vm_mm, address, pmdp); + set_pmd(pmdp, entry); /* * We had a write-protection fault here and changed the pmd * to more permissive. No need to flush the TLB for that, @@ -349,6 +421,26 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, return changed; } + +int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, + pud_t *pudp, pud_t entry, int dirty) +{ + int changed = !pud_same(*pudp, entry); + + VM_BUG_ON(address & ~HPAGE_PUD_MASK); + + if (changed && dirty) { + set_pud(pudp, entry); + /* + * We had a write-protection fault here and changed the pud + * to more permissive. No need to flush the TLB for that, + * #PF is architecturally guaranteed to do that and in the + * worst-case we'll generate a spurious fault.
+ */ + } + + return changed; +} #endif int ptep_test_and_clear_young(struct vm_area_struct *vma, @@ -360,13 +452,10 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma, ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, (unsigned long *) &ptep->pte); - if (ret) - pte_update(vma->vm_mm, addr, ptep); - return ret; } -#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { @@ -376,8 +465,19 @@ int pmdp_test_and_clear_young(struct vm_area_struct *vma, ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, (unsigned long *)pmdp); - if (ret) - pmd_update(vma->vm_mm, addr, pmdp); + return ret; +} +#endif + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +int pudp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pud_t *pudp) +{ + int ret = 0; + + if (pud_young(*pudp)) + ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, + (unsigned long *)pudp); return ret; } @@ -386,13 +486,20 @@ int pmdp_test_and_clear_young(struct vm_area_struct *vma, int ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { - int young; - - young = ptep_test_and_clear_young(vma, address, ptep); - if (young) - flush_tlb_page(vma, address); - - return young; + /* + * On x86 CPUs, clearing the accessed bit without a TLB flush + * doesn't cause data corruption. [ It could cause incorrect + * page aging and the (mistaken) reclaim of hot pages, but the + * chance of that should be relatively low. ] + * + * So as a performance optimization don't flush the TLB when + * clearing the accessed bit, it will eventually be flushed by + * a context switch or a VM operation anyway. [ In the rare + * event of it not getting flushed for a long time the delay + * shouldn't really matter because there's no real memory + * pressure for swapout to react to. 
] + */ + return ptep_test_and_clear_young(vma, address, ptep); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -410,35 +517,45 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma, return young; } -void pmdp_splitting_flush(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp) +pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) { - int set; - VM_BUG_ON(address & ~HPAGE_PMD_MASK); - set = !test_and_set_bit(_PAGE_BIT_SPLITTING, - (unsigned long *)pmdp); - if (set) { - pmd_update(vma->vm_mm, address, pmdp); - /* need tlb flush only to serialize against gup-fast */ - flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); - } + VM_WARN_ON_ONCE(!pmd_present(*pmdp)); + + /* + * No flush is necessary. Once an invalid PTE is established, the PTE's + * access and dirty bits cannot be updated. + */ + return pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp)); +} +#endif + +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ + defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) +pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address, + pud_t *pudp) +{ + VM_WARN_ON_ONCE(!pud_present(*pudp)); + pud_t old = pudp_establish(vma, address, pudp, pud_mkinvalid(*pudp)); + flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE); + return old; } #endif /** - * reserve_top_address - reserves a hole in the top of kernel address space - * @reserve - size of hole to reserve + * reserve_top_address - Reserve a hole in the top of the kernel address space + * @reserve: Size of hole to reserve * * Can be used to relocate the fixmap area and poke a hole in the top - * of kernel address space to make room for a hypervisor. + * of the kernel address space to make room for a hypervisor. 
*/ void __init reserve_top_address(unsigned long reserve) { #ifdef CONFIG_X86_32 BUG_ON(fixmaps_set > 0); - printk(KERN_INFO "Reserving virtual address space above 0x%08x\n", - (int)-reserve); - __FIXADDR_TOP = -reserve - PAGE_SIZE; + __FIXADDR_TOP = round_down(-reserve, 1 << PMD_SHIFT) - PAGE_SIZE; + printk(KERN_INFO "Reserving virtual address space above 0x%08lx (rounded to 0x%08lx)\n", + -reserve, __FIXADDR_TOP + PAGE_SIZE); #endif } @@ -448,6 +565,15 @@ void __native_set_fixmap(enum fixed_addresses idx, pte_t pte) { unsigned long address = __fix_to_virt(idx); +#ifdef CONFIG_X86_64 + /* + * Ensure that the static initial page tables are covering the + * fixmap completely. + */ + BUILD_BUG_ON(__end_of_permanent_fixed_addresses > + (FIXMAP_PMD_NUM * PTRS_PER_PTE)); +#endif + if (idx >= __end_of_fixed_addresses) { BUG(); return; @@ -456,8 +582,265 @@ void __native_set_fixmap(enum fixed_addresses idx, pte_t pte) fixmaps_set++; } -void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys, - pgprot_t flags) +void native_set_fixmap(unsigned /* enum fixed_addresses */ idx, + phys_addr_t phys, pgprot_t flags) { + /* Sanitize 'prot' against any unsupported bits: */ + pgprot_val(flags) &= __default_kernel_pte_mask; + __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags)); } + +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP +#if CONFIG_PGTABLE_LEVELS > 4 +/** + * p4d_set_huge - Set up kernel P4D mapping + * @p4d: Pointer to the P4D entry + * @addr: Virtual address associated with the P4D entry + * @prot: Protection bits to use + * + * No 512GB pages yet -- always return 0 + */ +int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot) +{ + return 0; +} + +/** + * p4d_clear_huge - Clear kernel P4D mapping when it is set + * @p4d: Pointer to the P4D entry to clear + * + * No 512GB pages yet -- do nothing + */ +void p4d_clear_huge(p4d_t *p4d) +{ +} +#endif + +/** + * pud_set_huge - Set up kernel PUD mapping + * @pud: Pointer to the PUD entry + * @addr: Virtual address 
associated with the PUD entry + * @prot: Protection bits to use + * + * MTRRs can override PAT memory types with 4KiB granularity. Therefore, this + * function sets up a huge page only if the complete range has the same MTRR + * caching mode. + * + * Callers should try to decrease page size (1GB -> 2MB -> 4K) if the bigger + * page mapping attempt fails. + * + * Returns 1 on success and 0 on failure. + */ +int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) +{ + u8 uniform; + + mtrr_type_lookup(addr, addr + PUD_SIZE, &uniform); + if (!uniform) + return 0; + + /* Bail out if we are on a populated non-leaf entry: */ + if (pud_present(*pud) && !pud_leaf(*pud)) + return 0; + + set_pte((pte_t *)pud, pfn_pte( + (u64)addr >> PAGE_SHIFT, + __pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE))); + + return 1; +} + +/** + * pmd_set_huge - Set up kernel PMD mapping + * @pmd: Pointer to the PMD entry + * @addr: Virtual address associated with the PMD entry + * @prot: Protection bits to use + * + * See text over pud_set_huge() above. + * + * Returns 1 on success and 0 on failure. + */ +int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) +{ + u8 uniform; + + mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform); + if (!uniform) { + pr_warn_once("%s: Cannot satisfy [mem %#010llx-%#010llx] with a huge-page mapping due to MTRR override.\n", + __func__, addr, addr + PMD_SIZE); + return 0; + } + + /* Bail out if we are on a populated non-leaf entry: */ + if (pmd_present(*pmd) && !pmd_leaf(*pmd)) + return 0; + + set_pte((pte_t *)pmd, pfn_pte( + (u64)addr >> PAGE_SHIFT, + __pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE))); + + return 1; +} + +/** + * pud_clear_huge - Clear kernel PUD mapping when it is set + * @pud: Pointer to the PUD entry to clear. + * + * Returns 1 on success and 0 on failure (no PUD map is found).
+ */ +int pud_clear_huge(pud_t *pud) +{ + if (pud_leaf(*pud)) { + pud_clear(pud); + return 1; + } + + return 0; +} + +/** + * pmd_clear_huge - Clear kernel PMD mapping when it is set + * @pmd: Pointer to the PMD entry to clear. + * + * Returns 1 on success and 0 on failure (no PMD map is found). + */ +int pmd_clear_huge(pmd_t *pmd) +{ + if (pmd_leaf(*pmd)) { + pmd_clear(pmd); + return 1; + } + + return 0; +} + +#ifdef CONFIG_X86_64 +/** + * pud_free_pmd_page - Clear PUD entry and free PMD page + * @pud: Pointer to a PUD + * @addr: Virtual address associated with PUD + * + * Context: The PUD range has been unmapped and TLB purged. + * Return: 1 if clearing the entry succeeded. 0 otherwise. + * + * NOTE: Callers must allow a single page allocation. + */ +int pud_free_pmd_page(pud_t *pud, unsigned long addr) +{ + pmd_t *pmd, *pmd_sv; + struct ptdesc *pt; + int i; + + pmd = pud_pgtable(*pud); + pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL); + if (!pmd_sv) + return 0; + + for (i = 0; i < PTRS_PER_PMD; i++) { + pmd_sv[i] = pmd[i]; + if (!pmd_none(pmd[i])) + pmd_clear(&pmd[i]); + } + + pud_clear(pud); + + /* INVLPG to clear all paging-structure caches */ + flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1); + + for (i = 0; i < PTRS_PER_PMD; i++) { + if (!pmd_none(pmd_sv[i])) { + pt = page_ptdesc(pmd_page(pmd_sv[i])); + pagetable_dtor_free(pt); + } + } + + free_page((unsigned long)pmd_sv); + + pmd_free(&init_mm, pmd); + + return 1; +} + +/** + * pmd_free_pte_page - Clear PMD entry and free PTE page. + * @pmd: Pointer to the PMD + * @addr: Virtual address associated with PMD + * + * Context: The PMD range has been unmapped and TLB purged. + * Return: 1 if clearing the entry succeeded. 0 otherwise. 
+ */ +int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) +{ + struct ptdesc *pt; + + pt = page_ptdesc(pmd_page(*pmd)); + pmd_clear(pmd); + + /* INVLPG to clear all paging-structure caches */ + flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1); + + pagetable_dtor_free(pt); + + return 1; +} + +#else /* !CONFIG_X86_64 */ + +/* + * Disable free page handling on x86-PAE. This assures that ioremap() + * does not update sync'd PMD entries. See vmalloc_sync_one(). + */ +int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) +{ + return pmd_none(*pmd); +} + +#endif /* CONFIG_X86_64 */ +#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ + +pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma) +{ + if (vma->vm_flags & VM_SHADOW_STACK) + return pte_mkwrite_shstk(pte); + + pte = pte_mkwrite_novma(pte); + + return pte_clear_saveddirty(pte); +} + +pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) +{ + if (vma->vm_flags & VM_SHADOW_STACK) + return pmd_mkwrite_shstk(pmd); + + pmd = pmd_mkwrite_novma(pmd); + + return pmd_clear_saveddirty(pmd); +} + +void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte) +{ + /* + * Hardware before shadow stack can (rarely) set Dirty=1 + * on a Write=0 PTE. So the below condition + * only indicates a software bug when shadow stack is + * supported by the HW. This checking is covered in + * pte_shstk(). + */ + VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && + pte_shstk(pte)); +} + +void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd) +{ + /* See note in arch_check_zapped_pte() */ + VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && + pmd_shstk(pmd)); +} + +void arch_check_zapped_pud(struct vm_area_struct *vma, pud_t pud) +{ + /* See note in arch_check_zapped_pte() */ + VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && pud_shstk(pud)); +} |
