path: root/arch/powerpc/mm/pgtable.c
Diffstat (limited to 'arch/powerpc/mm/pgtable.c')
-rw-r--r--  arch/powerpc/mm/pgtable.c  |  232
1 file changed, 165 insertions(+), 67 deletions(-)
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index e3759b69f81b..9e7ba9c3851f 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -23,10 +23,18 @@
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/hugetlb.h>
-#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <asm/hugetlb.h>
+#include <asm/pte-walk.h>
+
+#ifdef CONFIG_PPC64
+#define PGD_ALIGN (sizeof(pgd_t) * MAX_PTRS_PER_PGD)
+#else
+#define PGD_ALIGN PAGE_SIZE
+#endif
+
+pgd_t swapper_pg_dir[MAX_PTRS_PER_PGD] __section(".bss..page_aligned") __aligned(PGD_ALIGN);
static inline int is_exec_fault(void)
{
@@ -38,19 +46,19 @@ static inline int is_exec_fault(void)
* and we avoid _PAGE_SPECIAL and cache inhibited pte. We also only do that
* on userspace PTEs
*/
-static inline int pte_looks_normal(pte_t pte)
+static inline int pte_looks_normal(pte_t pte, unsigned long addr)
{
if (pte_present(pte) && !pte_special(pte)) {
if (pte_ci(pte))
return 0;
- if (pte_user(pte))
+ if (!is_kernel_addr(addr))
return 1;
}
return 0;
}
-static struct page *maybe_pte_to_page(pte_t pte)
+static struct folio *maybe_pte_to_folio(pte_t pte)
{
unsigned long pfn = pte_pfn(pte);
struct page *page;
@@ -60,7 +68,7 @@ static struct page *maybe_pte_to_page(pte_t pte)
page = pfn_to_page(pfn);
if (PageReserved(page))
return NULL;
- return page;
+ return page_folio(page);
}
#ifdef CONFIG_PPC_BOOK3S
@@ -71,20 +79,17 @@ static struct page *maybe_pte_to_page(pte_t pte)
* support falls into the same category.
*/
-static pte_t set_pte_filter_hash(pte_t pte)
+static pte_t set_pte_filter_hash(pte_t pte, unsigned long addr)
{
- if (radix_enabled())
- return pte;
-
pte = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
- if (pte_looks_normal(pte) && !(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) ||
- cpu_has_feature(CPU_FTR_NOEXECUTE))) {
- struct page *pg = maybe_pte_to_page(pte);
- if (!pg)
+ if (pte_looks_normal(pte, addr) && !(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) ||
+ cpu_has_feature(CPU_FTR_NOEXECUTE))) {
+ struct folio *folio = maybe_pte_to_folio(pte);
+ if (!folio)
return pte;
- if (!test_bit(PG_arch_1, &pg->flags)) {
- flush_dcache_icache_page(pg);
- set_bit(PG_arch_1, &pg->flags);
+ if (!test_bit(PG_dcache_clean, &folio->flags)) {
+ flush_dcache_icache_folio(folio);
+ set_bit(PG_dcache_clean, &folio->flags);
}
}
return pte;
@@ -92,38 +97,43 @@ static pte_t set_pte_filter_hash(pte_t pte)
#else /* CONFIG_PPC_BOOK3S */
-static pte_t set_pte_filter_hash(pte_t pte) { return pte; }
+static pte_t set_pte_filter_hash(pte_t pte, unsigned long addr) { return pte; }
#endif /* CONFIG_PPC_BOOK3S */
/* Embedded type MMU with HW exec support. This is a bit more complicated
* as we don't have two bits to spare for _PAGE_EXEC and _PAGE_HWEXEC so
* instead we "filter out" the exec permission for non clean pages.
+ *
+ * This is also called only once per folio, so only work with folio->flags here.
*/
-static pte_t set_pte_filter(pte_t pte)
+static inline pte_t set_pte_filter(pte_t pte, unsigned long addr)
{
- struct page *pg;
+ struct folio *folio;
+
+ if (radix_enabled())
+ return pte;
if (mmu_has_feature(MMU_FTR_HPTE_TABLE))
- return set_pte_filter_hash(pte);
+ return set_pte_filter_hash(pte, addr);
/* No exec permission in the first place, move on */
- if (!pte_exec(pte) || !pte_looks_normal(pte))
+ if (!pte_exec(pte) || !pte_looks_normal(pte, addr))
return pte;
/* If you set _PAGE_EXEC on weird pages you're on your own */
- pg = maybe_pte_to_page(pte);
- if (unlikely(!pg))
+ folio = maybe_pte_to_folio(pte);
+ if (unlikely(!folio))
return pte;
/* If the page clean, we move on */
- if (test_bit(PG_arch_1, &pg->flags))
+ if (test_bit(PG_dcache_clean, &folio->flags))
return pte;
/* If it's an exec fault, we flush the cache and make it clean */
if (is_exec_fault()) {
- flush_dcache_icache_page(pg);
- set_bit(PG_arch_1, &pg->flags);
+ flush_dcache_icache_folio(folio);
+ set_bit(PG_dcache_clean, &folio->flags);
return pte;
}
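
For readers following the PG_dcache_clean logic above, here is a minimal
sketch of the lazy icache-sync pattern it implements, factored into a
hypothetical helper (illustrative only, not part of this patch):

static void sync_icache_if_needed(struct folio *folio)
{
	/* PG_dcache_clean records that the icache already saw this folio */
	if (test_bit(PG_dcache_clean, &folio->flags))
		return;
	flush_dcache_icache_folio(folio);
	set_bit(PG_dcache_clean, &folio->flags);
}
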
@@ -134,7 +144,10 @@ static pte_t set_pte_filter(pte_t pte)
static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma,
int dirty)
{
- struct page *pg;
+ struct folio *folio;
+
+ if (IS_ENABLED(CONFIG_PPC_BOOK3S_64))
+ return pte;
if (mmu_has_feature(MMU_FTR_HPTE_TABLE))
return pte;
@@ -157,17 +170,17 @@ static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma,
#endif /* CONFIG_DEBUG_VM */
/* If you set _PAGE_EXEC on weird pages you're on your own */
- pg = maybe_pte_to_page(pte);
- if (unlikely(!pg))
+ folio = maybe_pte_to_folio(pte);
+ if (unlikely(!folio))
goto bail;
/* If the page is already clean, we move on */
- if (test_bit(PG_arch_1, &pg->flags))
+ if (test_bit(PG_dcache_clean, &folio->flags))
goto bail;
- /* Clean the page and set PG_arch_1 */
- flush_dcache_icache_page(pg);
- set_bit(PG_arch_1, &pg->flags);
+ /* Clean the page and set PG_dcache_clean */
+ flush_dcache_icache_folio(folio);
+ set_bit(PG_dcache_clean, &folio->flags);
bail:
return pte_mkexec(pte);
@@ -176,26 +189,48 @@ static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma,
/*
* set_pte stores a linux PTE into the linux page table.
*/
-void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
- pte_t pte)
+void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
+ pte_t pte, unsigned int nr)
{
- /*
- * Make sure hardware valid bit is not set. We don't do
- * tlb flush for this update.
- */
- VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep));
-
- /* Add the pte bit when trying to set a pte */
- pte = pte_mkpte(pte);
/* Note: mm->context.id might not yet have been assigned as
* this context might not have been activated yet when this
- * is called.
+ * is called. Filter the pte value and use the filtered value
+ * to set up all the ptes in the range.
+ */
+ pte = set_pte_filter(pte, addr);
+
+ /*
+ * We don't need to call arch_enter/leave_lazy_mmu_mode()
+ * because we expect set_ptes to only be used on ptes that are not
+ * present and not hw_valid. Hence there is no translation cache flush
+ * involved that needs to be batched.
*/
- pte = set_pte_filter(pte);
+ for (;;) {
+
+ /*
+ * Make sure hardware valid bit is not set. We don't do
+ * tlb flush for this update.
+ */
+ VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep));
+
+ /* Perform the setting of the PTE */
+ __set_pte_at(mm, addr, ptep, pte, 0);
+ if (--nr == 0)
+ break;
+ ptep++;
+ addr += PAGE_SIZE;
+ pte = pte_next_pfn(pte);
+ }
+}
- /* Perform the setting of the PTE */
- __set_pte_at(mm, addr, ptep, pte, 0);
+void unmap_kernel_page(unsigned long va)
+{
+ pmd_t *pmdp = pmd_off_k(va);
+ pte_t *ptep = pte_offset_kernel(pmdp, va);
+
+ pte_clear(&init_mm, va, ptep);
+ flush_tlb_kernel_range(va, va + PAGE_SIZE);
}
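
For context, a hedged sketch of how a caller might use the new set_ptes()
interface (the helper name and surrounding code are hypothetical): build
the pte for the first page of a folio and let set_ptes() advance the pfn
and address for each subsequent entry.

static void map_folio_pages(struct vm_area_struct *vma, struct folio *folio,
			    unsigned long addr, pte_t *ptep, unsigned int nr)
{
	/* pte for the first page; set_ptes() steps the pfn for the rest */
	pte_t pte = mk_pte(folio_page(folio, 0), vma->vm_page_prot);

	set_ptes(vma->vm_mm, addr, ptep, pte, nr);
}
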
/*
@@ -249,42 +284,76 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
#else
/*
- * Not used on non book3s64 platforms. But 8xx
- * can possibly use tsize derived from hstate.
+ * Not used on non book3s64 platforms.
+ * 8xx compares it with mmu_virtual_psize to
+ * know if it is a huge page or not.
*/
- psize = 0;
+ psize = MMU_PAGE_COUNT;
#endif
__ptep_set_access_flags(vma, ptep, pte, addr, psize);
}
return changed;
#endif
}
+
+#if defined(CONFIG_PPC_8xx)
+void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
+ pte_t pte, unsigned long sz)
+{
+ pmd_t *pmd = pmd_off(mm, addr);
+ pte_basic_t val;
+ pte_basic_t *entry = (pte_basic_t *)ptep;
+ int num, i;
+
+ /*
+ * Make sure hardware valid bit is not set. We don't do
+ * tlb flush for this update.
+ */
+ VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep));
+
+ pte = set_pte_filter(pte, addr);
+
+ val = pte_val(pte);
+
+ num = number_of_cells_per_pte(pmd, val, 1);
+
+ for (i = 0; i < num; i++, entry++, val += SZ_4K)
+ *entry = val;
+}
+#endif
#endif /* CONFIG_HUGETLB_PAGE */
#ifdef CONFIG_DEBUG_VM
void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
{
pgd_t *pgd;
+ p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
+ pte_t *pte;
+ spinlock_t *ptl;
if (mm == &init_mm)
return;
pgd = mm->pgd + pgd_index(addr);
BUG_ON(pgd_none(*pgd));
- pud = pud_offset(pgd, addr);
+ p4d = p4d_offset(pgd, addr);
+ BUG_ON(p4d_none(*p4d));
+ pud = pud_offset(p4d, addr);
BUG_ON(pud_none(*pud));
pmd = pmd_offset(pud, addr);
/*
* khugepaged to collapse normal pages to hugepage, first set
- * pmd to none to force page fault/gup to take mmap_sem. After
+ * pmd to none to force page fault/gup to take mmap_lock. After
* pmd is set to none, we do a pte_clear which does this assertion
* so if we find pmd none, return.
*/
if (pmd_none(*pmd))
return;
- BUG_ON(!pmd_present(*pmd));
- assert_spin_locked(pte_lockptr(mm, pmd));
+ pte = pte_offset_map_nolock(mm, pmd, addr, &ptl);
+ BUG_ON(!pte);
+ assert_spin_locked(ptl);
+ pte_unmap(pte);
}
#endif /* CONFIG_DEBUG_VM */
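
A hedged sketch of the locking convention that assert_pte_locked() now
checks via pte_offset_map_nolock(): the page-table lock returned by
pte_offset_map_lock() for that address must be held. The caller below is
hypothetical:

static void with_pte_locked(struct mm_struct *mm, pmd_t *pmd, unsigned long addr)
{
	spinlock_t *ptl;
	pte_t *ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);

	if (!ptep)
		return;
	assert_pte_locked(mm, addr);	/* ptl is held here, so this passes */
	pte_unmap_unlock(ptep, ptl);
}
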
@@ -305,19 +374,20 @@ EXPORT_SYMBOL_GPL(vmalloc_to_phys);
* (4) hugepd pointer, _PAGE_PTE = 0 and bits [2..6] indicate size of table
*
* So long as we atomically load page table pointers we are safe against teardown,
- * we can follow the address down to the the page and take a ref on it.
+ * we can follow the address down to the page and take a ref on it.
* This function need to be called with interrupts disabled. We use this variant
* when we have MSR[EE] = 0 but the paca->irq_soft_mask = IRQS_ENABLED
*/
pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
bool *is_thp, unsigned *hpage_shift)
{
- pgd_t pgd, *pgdp;
+ pgd_t *pgdp;
+ p4d_t p4d, *p4dp;
pud_t pud, *pudp;
pmd_t pmd, *pmdp;
pte_t *ret_pte;
hugepd_t *hpdp = NULL;
- unsigned pdshift = PGDIR_SHIFT;
+ unsigned pdshift;
if (hpage_shift)
*hpage_shift = 0;
@@ -325,24 +395,28 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
if (is_thp)
*is_thp = false;
- pgdp = pgdir + pgd_index(ea);
- pgd = READ_ONCE(*pgdp);
/*
* Always operate on the local stack value. This make sure the
* value don't get updated by a parallel THP split/collapse,
* page fault or a page unmap. The return pte_t * is still not
* stable. So should be checked there for above conditions.
+ * Top level is an exception because it is folded into p4d.
*/
- if (pgd_none(pgd))
+ pgdp = pgdir + pgd_index(ea);
+ p4dp = p4d_offset(pgdp, ea);
+ p4d = READ_ONCE(*p4dp);
+ pdshift = P4D_SHIFT;
+
+ if (p4d_none(p4d))
return NULL;
- if (pgd_is_leaf(pgd)) {
- ret_pte = (pte_t *)pgdp;
+ if (p4d_leaf(p4d)) {
+ ret_pte = (pte_t *)p4dp;
goto out;
}
- if (is_hugepd(__hugepd(pgd_val(pgd)))) {
- hpdp = (hugepd_t *)&pgd;
+ if (is_hugepd(__hugepd(p4d_val(p4d)))) {
+ hpdp = (hugepd_t *)&p4d;
goto out_huge;
}
@@ -352,13 +426,13 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
* irq disabled
*/
pdshift = PUD_SHIFT;
- pudp = pud_offset(&pgd, ea);
+ pudp = pud_offset(&p4d, ea);
pud = READ_ONCE(*pudp);
if (pud_none(pud))
return NULL;
- if (pud_is_leaf(pud)) {
+ if (pud_leaf(pud)) {
ret_pte = (pte_t *)pudp;
goto out;
}
@@ -397,7 +471,7 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
goto out;
}
- if (pmd_is_leaf(pmd)) {
+ if (pmd_leaf(pmd)) {
ret_pte = (pte_t *)pmdp;
goto out;
}
@@ -421,3 +495,27 @@ out:
return ret_pte;
}
EXPORT_SYMBOL_GPL(__find_linux_pte);
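
As the comment above notes, __find_linux_pte() must run with interrupts
disabled so a parallel unmap or THP collapse cannot free the tables under
us. A hedged caller sketch (the helper name is hypothetical):

static unsigned long ea_to_pfn(struct mm_struct *mm, unsigned long ea)
{
	unsigned long flags, pfn = 0;
	unsigned int hshift;
	bool is_thp;
	pte_t *ptep;

	/* IRQs off: page tables cannot be freed while we walk them */
	local_irq_save(flags);
	ptep = __find_linux_pte(mm->pgd, ea, &is_thp, &hshift);
	if (ptep && pte_present(*ptep))
		pfn = pte_pfn(*ptep);
	local_irq_restore(flags);

	return pfn;
}
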
+
+/* Note due to the way vm flags are laid out, the bits are XWR */
+const pgprot_t protection_map[16] = {
+ [VM_NONE] = PAGE_NONE,
+ [VM_READ] = PAGE_READONLY,
+ [VM_WRITE] = PAGE_COPY,
+ [VM_WRITE | VM_READ] = PAGE_COPY,
+ [VM_EXEC] = PAGE_EXECONLY_X,
+ [VM_EXEC | VM_READ] = PAGE_READONLY_X,
+ [VM_EXEC | VM_WRITE] = PAGE_COPY_X,
+ [VM_EXEC | VM_WRITE | VM_READ] = PAGE_COPY_X,
+ [VM_SHARED] = PAGE_NONE,
+ [VM_SHARED | VM_READ] = PAGE_READONLY,
+ [VM_SHARED | VM_WRITE] = PAGE_SHARED,
+ [VM_SHARED | VM_WRITE | VM_READ] = PAGE_SHARED,
+ [VM_SHARED | VM_EXEC] = PAGE_EXECONLY_X,
+ [VM_SHARED | VM_EXEC | VM_READ] = PAGE_READONLY_X,
+ [VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_SHARED_X,
+ [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_SHARED_X
+};
+
+#ifndef CONFIG_PPC_BOOK3S_64
+DECLARE_VM_GET_PAGE_PROT
+#endif
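
For reference, DECLARE_VM_GET_PAGE_PROT supplies the generic
vm_get_page_prot() built on the protection_map table above (Book3S64
keeps its own implementation, hence the #ifndef). It expands to roughly:

pgprot_t vm_get_page_prot(unsigned long vm_flags)
{
	return protection_map[vm_flags &
			      (VM_READ | VM_WRITE | VM_EXEC | VM_SHARED)];
}
EXPORT_SYMBOL(vm_get_page_prot);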