Diffstat (limited to 'arch/arm64/mm')
-rw-r--r--  arch/arm64/mm/contpte.c          213
-rw-r--r--  arch/arm64/mm/fault.c            115
-rw-r--r--  arch/arm64/mm/gcs.c                6
-rw-r--r--  arch/arm64/mm/hugetlbpage.c        2
-rw-r--r--  arch/arm64/mm/init.c               2
-rw-r--r--  arch/arm64/mm/mmap.c               4
-rw-r--r--  arch/arm64/mm/mmu.c              124
-rw-r--r--  arch/arm64/mm/proc.S               3
-rw-r--r--  arch/arm64/mm/ptdump.c            50
-rw-r--r--  arch/arm64/mm/ptdump_debugfs.c     3
10 files changed, 362 insertions(+), 160 deletions(-)
diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c
index bcac4f55f9c1..c0557945939c 100644
--- a/arch/arm64/mm/contpte.c
+++ b/arch/arm64/mm/contpte.c
@@ -68,7 +68,144 @@ static void contpte_convert(struct mm_struct *mm, unsigned long addr,
pte = pte_mkyoung(pte);
}
- __flush_tlb_range(&vma, start_addr, addr, PAGE_SIZE, true, 3);
+ /*
+ * On eliding the __flush_tlb_range() under BBML2+noabort:
+ *
+ * NOTE: Instead of using N=16 as the contiguous block length, we use
+ * N=4 for clarity.
+ *
+ * NOTE: 'n' and 'c' are used to denote the "contiguous bit" being
+ * unset and set, respectively.
+ *
+ * We worry about two cases where the contiguous bit is used:
+ * - When folding N smaller non-contiguous ptes as 1 contiguous block.
+ * - When unfolding a contiguous block into N smaller non-contiguous ptes.
+ *
+ * Currently, the BBML0 folding case looks as follows:
+ *
+ * 0) Initial page-table layout:
+ *
+ * +----+----+----+----+
+ * |RO,n|RO,n|RO,n|RW,n| <--- last page being set as RO
+ * +----+----+----+----+
+ *
+ * 1) Aggregate AF + dirty flags using __ptep_get_and_clear():
+ *
+ * +----+----+----+----+
+ * | 0 | 0 | 0 | 0 |
+ * +----+----+----+----+
+ *
+ * 2) __flush_tlb_range():
+ *
+ * |____ tlbi + dsb ____|
+ *
+ * 3) __set_ptes() to repaint contiguous block:
+ *
+ * +----+----+----+----+
+ * |RO,c|RO,c|RO,c|RO,c|
+ * +----+----+----+----+
+ *
+ * 4) The kernel will eventually __flush_tlb() for the changed page:
+ *
+ * |____| <--- tlbi + dsb
+ *
+ * As expected, the intermediate tlbi+dsb ensures that other PEs
+ * only ever see an invalid (0) entry, or the new contiguous TLB entry.
+ * The final tlbi+dsb will always throw away the newly installed
+ * contiguous TLB entry, which is a micro-optimisation opportunity,
+ * but does not affect correctness.
+ *
+ * In the BBML2 case, the change is to avoid the intermediate tlbi+dsb.
+ * This means a few things, but notably other PEs will still "see" any
+ * stale cached TLB entries. This could lead to a "contiguous bit
+ * misprogramming" issue until the final tlbi+dsb of the changed page,
+ * which would clear out both the stale (RW,n) entry and the new (RO,c)
+ * contiguous entry installed in its place.
+ *
+ * What this means is the following:
+ *
+ * +----+----+----+----+
+ * |RO,n|RO,n|RO,n|RW,n| <--- old page tables, all non-contiguous
+ * +----+----+----+----+
+ *
+ * +----+----+----+----+
+ * |RO,c|RO,c|RO,c|RO,c| <--- new page tables, all contiguous
+ * +----+----+----+----+
+ * /\
+ * ||
+ *
+ * If both the old single (RW,n) and new contiguous (RO,c) TLB entries
+ * are present, and a write is made to this address, do we fault or
+ * is the write permitted (via amalgamation)?
+ *
+ * The relevant Arm ARM DDI 0487L.a requirements are RNGLXZ and RJQQTC,
+ * which together state that when BBML1 or BBML2 is implemented, the
+ * hardware either raises a TLB conflict abort (which we expressly
+ * forbid), or will "produce an OA, access permissions, and memory
+ * attributes that are consistent with any of the programmed translation
+ * table values".
+ *
+ * That is to say, it will either raise a TLB conflict or produce one of
+ * the cached TLB entries, but will never amalgamate.
+ *
+ * Thus, as the page tables are only considered "consistent" after
+ * the final tlbi+dsb (which evicts both the single stale (RW,n) TLB
+ * entry as well as the new contiguous (RO,c) TLB entry), omitting the
+ * initial tlbi+dsb is correct.
+ *
+ * It is also important to note that at the end of the BBML2 folding
+ * case, we are still left with potentially all N TLB entries still
+ * cached (the N-1 non-contiguous ptes, and the single contiguous
+ * block). However, over time, natural TLB pressure will cause the
+ * non-contiguous pte TLB entries to be flushed, leaving only the
+ * contiguous block TLB entry. This means that omitting the tlbi+dsb is
+ * not only correct, but also keeps our eventual performance benefits.
+ *
+ * For the unfolding case, BBML0 looks as follows:
+ *
+ * 0) Initial page-table layout:
+ *
+ * +----+----+----+----+
+ * |RW,c|RW,c|RW,c|RW,c| <--- last page being set as RO
+ * +----+----+----+----+
+ *
+ * 1) Aggregate AF + dirty flags using __ptep_get_and_clear():
+ *
+ * +----+----+----+----+
+ * | 0 | 0 | 0 | 0 |
+ * +----+----+----+----+
+ *
+ * 2) __flush_tlb_range():
+ *
+ * |____ tlbi + dsb ____|
+ *
+ * 3) __set_ptes() to repaint as non-contiguous:
+ *
+ * +----+----+----+----+
+ * |RW,n|RW,n|RW,n|RW,n|
+ * +----+----+----+----+
+ *
+ * 4) Update changed page permissions:
+ *
+ * +----+----+----+----+
+ * |RW,n|RW,n|RW,n|RO,n| <--- last page permissions set
+ * +----+----+----+----+
+ *
+ * 5) The kernel will eventually __flush_tlb() for the changed page:
+ *
+ * |____| <--- tlbi + dsb
+ *
+ * For BBML2, we again remove the intermediate tlbi+dsb. Here, there
+ * are no issues, as the final tlbi+dsb covering the changed page is
+ * guaranteed to remove the original large contiguous (RW,c) TLB entry,
+ * as well as the intermediate (RW,n) TLB entry; the next access will
+ * install the new (RO,n) TLB entry and the page tables are only
+ * considered "consistent" after the final tlbi+dsb, so software must
+ * be prepared for this inconsistency prior to finishing the mm dance
+ * regardless.
+ */
+
+ if (!system_supports_bbml2_noabort())
+ __flush_tlb_range(&vma, start_addr, addr, PAGE_SIZE, true, 3);
__set_ptes(mm, start_addr, start_ptep, pte, CONT_PTES);
}
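
To make the fold flow described in the comment above concrete, here is a condensed user-space model of it. This is only a sketch under the comment's N=4 simplification: model_pte, tlbi_dsb() and has_bbml2_noabort are hypothetical stand-ins for the real kernel helpers, and the point is solely to show where the intermediate invalidation is elided.

#include <stdbool.h>
#include <stdio.h>

#define CONT_PTES 4	/* N=4, matching the simplified example in the comment */

struct model_pte { bool valid, cont, young, dirty; };

/* Stand-in for the intermediate/final __flush_tlb_range() + dsb. */
static void tlbi_dsb(const char *why)
{
	printf("tlbi+dsb: %s\n", why);
}

/* Fold CONT_PTES non-contiguous entries into one contiguous block. */
static void fold(struct model_pte pte[CONT_PTES], bool has_bbml2_noabort)
{
	struct model_pte tmpl = pte[0];
	int i;

	/* 1) Clear the entries, aggregating AF and dirty into the template. */
	for (i = 0; i < CONT_PTES; i++) {
		tmpl.young |= pte[i].young;
		tmpl.dirty |= pte[i].dirty;
		pte[i].valid = false;
	}

	/*
	 * 2) Without BBML2+noabort the intermediate invalidation is required
	 *    so other PEs only ever see an invalid entry or the new
	 *    contiguous one.
	 */
	if (!has_bbml2_noabort)
		tlbi_dsb("intermediate flush of the old non-contiguous entries");

	/* 3) Repaint the block as contiguous. */
	for (i = 0; i < CONT_PTES; i++) {
		pte[i] = tmpl;
		pte[i].valid = true;
		pte[i].cont = true;
	}

	/*
	 * 4) The later flush of the changed page, issued by the caller's
	 *    normal mm path, evicts any stale entry plus the new contiguous
	 *    one; that final flush is what makes the elision in step 2 safe.
	 */
}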
@@ -169,17 +306,46 @@ pte_t contpte_ptep_get(pte_t *ptep, pte_t orig_pte)
for (i = 0; i < CONT_PTES; i++, ptep++) {
pte = __ptep_get(ptep);
- if (pte_dirty(pte))
+ if (pte_dirty(pte)) {
orig_pte = pte_mkdirty(orig_pte);
-
- if (pte_young(pte))
+ for (; i < CONT_PTES; i++, ptep++) {
+ pte = __ptep_get(ptep);
+ if (pte_young(pte)) {
+ orig_pte = pte_mkyoung(orig_pte);
+ break;
+ }
+ }
+ break;
+ }
+
+ if (pte_young(pte)) {
orig_pte = pte_mkyoung(orig_pte);
+ i++;
+ ptep++;
+ for (; i < CONT_PTES; i++, ptep++) {
+ pte = __ptep_get(ptep);
+ if (pte_dirty(pte)) {
+ orig_pte = pte_mkdirty(orig_pte);
+ break;
+ }
+ }
+ break;
+ }
}
return orig_pte;
}
EXPORT_SYMBOL_GPL(contpte_ptep_get);
+static inline bool contpte_is_consistent(pte_t pte, unsigned long pfn,
+ pgprot_t orig_prot)
+{
+ pgprot_t prot = pte_pgprot(pte_mkold(pte_mkclean(pte)));
+
+ return pte_valid_cont(pte) && pte_pfn(pte) == pfn &&
+ pgprot_val(prot) == pgprot_val(orig_prot);
+}
+
pte_t contpte_ptep_get_lockless(pte_t *orig_ptep)
{
/*
@@ -202,7 +368,6 @@ pte_t contpte_ptep_get_lockless(pte_t *orig_ptep)
pgprot_t orig_prot;
unsigned long pfn;
pte_t orig_pte;
- pgprot_t prot;
pte_t *ptep;
pte_t pte;
int i;
@@ -219,18 +384,44 @@ retry:
for (i = 0; i < CONT_PTES; i++, ptep++, pfn++) {
pte = __ptep_get(ptep);
- prot = pte_pgprot(pte_mkold(pte_mkclean(pte)));
- if (!pte_valid_cont(pte) ||
- pte_pfn(pte) != pfn ||
- pgprot_val(prot) != pgprot_val(orig_prot))
+ if (!contpte_is_consistent(pte, pfn, orig_prot))
goto retry;
- if (pte_dirty(pte))
+ if (pte_dirty(pte)) {
orig_pte = pte_mkdirty(orig_pte);
+ for (; i < CONT_PTES; i++, ptep++, pfn++) {
+ pte = __ptep_get(ptep);
+
+ if (!contpte_is_consistent(pte, pfn, orig_prot))
+ goto retry;
+
+ if (pte_young(pte)) {
+ orig_pte = pte_mkyoung(orig_pte);
+ break;
+ }
+ }
+ break;
+ }
- if (pte_young(pte))
+ if (pte_young(pte)) {
orig_pte = pte_mkyoung(orig_pte);
+ i++;
+ ptep++;
+ pfn++;
+ for (; i < CONT_PTES; i++, ptep++, pfn++) {
+ pte = __ptep_get(ptep);
+
+ if (!contpte_is_consistent(pte, pfn, orig_prot))
+ goto retry;
+
+ if (pte_dirty(pte)) {
+ orig_pte = pte_mkdirty(orig_pte);
+ break;
+ }
+ }
+ break;
+ }
}
return orig_pte;
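
The reworked contpte_ptep_get() and contpte_ptep_get_lockless() loops above stop scanning the block as soon as both the dirty and young bits have been observed, since no remaining pte can change the result (the lockless variant additionally re-checks every pte with contpte_is_consistent() and retries on a mismatch). A minimal user-space model of that early-exit accumulation, collapsing the two nested loops into one for brevity:

#include <stdbool.h>

#define CONT_PTES 4

struct model_pte { bool dirty, young; };

/*
 * Accumulate dirty/young across the contiguous block, stopping as soon as
 * both have been seen; equivalent in effect to the nested loops above.
 */
static struct model_pte collect_flags(const struct model_pte pte[CONT_PTES])
{
	struct model_pte acc = { false, false };
	int i;

	for (i = 0; i < CONT_PTES; i++) {
		acc.dirty |= pte[i].dirty;
		acc.young |= pte[i].young;
		if (acc.dirty && acc.young)
			break;	/* nothing left in the block can change the result */
	}
	return acc;
}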
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index ec0a337891dd..d816ff44faff 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -53,18 +53,12 @@ struct fault_info {
};
static const struct fault_info fault_info[];
-static struct fault_info debug_fault_info[];
static inline const struct fault_info *esr_to_fault_info(unsigned long esr)
{
return fault_info + (esr & ESR_ELx_FSC);
}
-static inline const struct fault_info *esr_to_debug_fault_info(unsigned long esr)
-{
- return debug_fault_info + DBG_ESR_EVT(esr);
-}
-
static void data_abort_decode(unsigned long esr)
{
unsigned long iss2 = ESR_ELx_ISS2(esr);
@@ -487,17 +481,29 @@ static void do_bad_area(unsigned long far, unsigned long esr,
}
}
-static bool fault_from_pkey(unsigned long esr, struct vm_area_struct *vma,
- unsigned int mm_flags)
+static bool fault_from_pkey(struct vm_area_struct *vma, unsigned int mm_flags)
{
- unsigned long iss2 = ESR_ELx_ISS2(esr);
-
if (!system_supports_poe())
return false;
- if (esr_fsc_is_permission_fault(esr) && (iss2 & ESR_ELx_Overlay))
- return true;
-
+ /*
+ * We do not check whether an Overlay fault has occurred because we
+ * cannot make a decision based solely on its value:
+ *
+ * - If Overlay is set, a fault did occur due to POE, but it may be
+ * spurious in those cases where we update POR_EL0 without ISB (e.g.
+ * on context-switch). We would then need to manually check POR_EL0
+ * against vma_pkey(vma), which is exactly what
+ * arch_vma_access_permitted() does.
+ *
+ * - If Overlay is not set, we may still need to report a pkey fault.
+ * This is the case if an access was made within a mapping but with no
+ * page mapped, and POR_EL0 forbids the access (according to
+ * vma_pkey()). Such access will result in a SIGSEGV regardless
+ * because core code checks arch_vma_access_permitted(), but in order
+ * to report the correct error code - SEGV_PKUERR - we must handle
+ * that case here.
+ */
return !arch_vma_access_permitted(vma,
mm_flags & FAULT_FLAG_WRITE,
mm_flags & FAULT_FLAG_INSTRUCTION,
@@ -549,7 +555,7 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
const struct fault_info *inf;
struct mm_struct *mm = current->mm;
vm_fault_t fault;
- unsigned long vm_flags;
+ vm_flags_t vm_flags;
unsigned int mm_flags = FAULT_FLAG_DEFAULT;
unsigned long addr = untagged_addr(far);
struct vm_area_struct *vma;
@@ -635,7 +641,7 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
goto bad_area;
}
- if (fault_from_pkey(esr, vma, mm_flags)) {
+ if (fault_from_pkey(vma, mm_flags)) {
pkey = vma_pkey(vma);
vma_end_read(vma);
fault = 0;
@@ -679,7 +685,7 @@ retry:
goto bad_area;
}
- if (fault_from_pkey(esr, vma, mm_flags)) {
+ if (fault_from_pkey(vma, mm_flags)) {
pkey = vma_pkey(vma);
mmap_read_unlock(mm);
fault = 0;
@@ -826,6 +832,7 @@ static int do_sea(unsigned long far, unsigned long esr, struct pt_regs *regs)
*/
siaddr = untagged_addr(far);
}
+ add_taint(TAINT_MACHINE_CHECK, LOCKDEP_STILL_OK);
arm64_notify_die(inf->name, regs, inf->sig, inf->code, siaddr, esr);
return 0;
@@ -837,9 +844,12 @@ static int do_tag_check_fault(unsigned long far, unsigned long esr,
/*
* The architecture specifies that bits 63:60 of FAR_EL1 are UNKNOWN
* for tag check faults. Set them to corresponding bits in the untagged
- * address.
+ * address if ARM64_MTE_FAR isn't supported; otherwise, bits 63:60 of
+ * FAR_EL1 are not UNKNOWN and the reported address can be used as-is.
*/
- far = (__untagged_addr(far) & ~MTE_TAG_MASK) | (far & MTE_TAG_MASK);
+ if (!cpus_have_cap(ARM64_MTE_FAR))
+ far = (__untagged_addr(far) & ~MTE_TAG_MASK) | (far & MTE_TAG_MASK);
+
do_bad_area(far, esr, regs);
return 0;
}
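
A small stand-alone sketch of the address fix-up above may help. It assumes the MTE tag lives in bits 59:56 (MTE_TAG_MASK) and that __untagged_addr() sign-extends bit 55 into bits 63:56, as the arm64 headers define them; the helper names here are local to the sketch.

#include <stdbool.h>
#include <stdint.h>

#define MTE_TAG_SHIFT	56
#define MTE_TAG_MASK	(0xfULL << MTE_TAG_SHIFT)	/* bits 59:56 (assumed) */

/* Replicate bit 55 into bits 63:56, modelling __untagged_addr(). */
static uint64_t untag(uint64_t addr)
{
	uint64_t top = ((addr >> 55) & 1) ? 0xffULL : 0;

	return (addr & ~(0xffULL << 56)) | (top << 56);
}

/*
 * Without the ARM64_MTE_FAR capability, bits 63:60 of FAR_EL1 are UNKNOWN
 * on a tag check fault: rebuild them from the untagged address while
 * keeping the reported tag in bits 59:56. With the capability, FAR_EL1
 * already holds a usable address.
 */
static uint64_t fixup_far(uint64_t far, bool have_mte_far)
{
	if (have_mte_far)
		return far;

	return (untag(far) & ~MTE_TAG_MASK) | (far & MTE_TAG_MASK);
}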
@@ -939,75 +949,6 @@ void do_sp_pc_abort(unsigned long addr, unsigned long esr, struct pt_regs *regs)
NOKPROBE_SYMBOL(do_sp_pc_abort);
/*
- * __refdata because early_brk64 is __init, but the reference to it is
- * clobbered at arch_initcall time.
- * See traps.c and debug-monitors.c:debug_traps_init().
- */
-static struct fault_info __refdata debug_fault_info[] = {
- { do_bad, SIGTRAP, TRAP_HWBKPT, "hardware breakpoint" },
- { do_bad, SIGTRAP, TRAP_HWBKPT, "hardware single-step" },
- { do_bad, SIGTRAP, TRAP_HWBKPT, "hardware watchpoint" },
- { do_bad, SIGKILL, SI_KERNEL, "unknown 3" },
- { do_bad, SIGTRAP, TRAP_BRKPT, "aarch32 BKPT" },
- { do_bad, SIGKILL, SI_KERNEL, "aarch32 vector catch" },
- { early_brk64, SIGTRAP, TRAP_BRKPT, "aarch64 BRK" },
- { do_bad, SIGKILL, SI_KERNEL, "unknown 7" },
-};
-
-void __init hook_debug_fault_code(int nr,
- int (*fn)(unsigned long, unsigned long, struct pt_regs *),
- int sig, int code, const char *name)
-{
- BUG_ON(nr < 0 || nr >= ARRAY_SIZE(debug_fault_info));
-
- debug_fault_info[nr].fn = fn;
- debug_fault_info[nr].sig = sig;
- debug_fault_info[nr].code = code;
- debug_fault_info[nr].name = name;
-}
-
-/*
- * In debug exception context, we explicitly disable preemption despite
- * having interrupts disabled.
- * This serves two purposes: it makes it much less likely that we would
- * accidentally schedule in exception context and it will force a warning
- * if we somehow manage to schedule by accident.
- */
-static void debug_exception_enter(struct pt_regs *regs)
-{
- preempt_disable();
-
- /* This code is a bit fragile. Test it. */
- RCU_LOCKDEP_WARN(!rcu_is_watching(), "exception_enter didn't work");
-}
-NOKPROBE_SYMBOL(debug_exception_enter);
-
-static void debug_exception_exit(struct pt_regs *regs)
-{
- preempt_enable_no_resched();
-}
-NOKPROBE_SYMBOL(debug_exception_exit);
-
-void do_debug_exception(unsigned long addr_if_watchpoint, unsigned long esr,
- struct pt_regs *regs)
-{
- const struct fault_info *inf = esr_to_debug_fault_info(esr);
- unsigned long pc = instruction_pointer(regs);
-
- debug_exception_enter(regs);
-
- if (user_mode(regs) && !is_ttbr0_addr(pc))
- arm64_apply_bp_hardening();
-
- if (inf->fn(addr_if_watchpoint, esr, regs)) {
- arm64_notify_die(inf->name, regs, inf->sig, inf->code, pc, esr);
- }
-
- debug_exception_exit(regs);
-}
-NOKPROBE_SYMBOL(do_debug_exception);
-
-/*
* Used during anonymous page fault handling.
*/
struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma,
diff --git a/arch/arm64/mm/gcs.c b/arch/arm64/mm/gcs.c
index 5c46ec527b1c..6e93f78de79b 100644
--- a/arch/arm64/mm/gcs.c
+++ b/arch/arm64/mm/gcs.c
@@ -157,12 +157,6 @@ void gcs_free(struct task_struct *task)
if (!system_supports_gcs())
return;
- /*
- * When fork() with CLONE_VM fails, the child (tsk) already
- * has a GCS allocated, and exit_thread() calls this function
- * to free it. In this case the parent (current) and the
- * child share the same mm struct.
- */
if (!task->mm || task->mm != current->mm)
return;
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 0c8737f4f2ce..1d90a7e75333 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -225,7 +225,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
ncontig = num_contig_ptes(sz, &pgsize);
if (!pte_present(pte)) {
- for (i = 0; i < ncontig; i++, ptep++, addr += pgsize)
+ for (i = 0; i < ncontig; i++, ptep++)
__set_ptes_anysz(mm, ptep, pte, 1, pgsize);
return;
}
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 0c8c35dd645e..ea84a61ed508 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -106,7 +106,7 @@ static void __init arch_reserve_crashkernel(void)
ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
&crash_size, &crash_base,
- &low_size, &high);
+ &low_size, NULL, &high);
if (ret)
return;
diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c
index 07aeab8a7606..08ee177432c2 100644
--- a/arch/arm64/mm/mmap.c
+++ b/arch/arm64/mm/mmap.c
@@ -81,9 +81,9 @@ static int __init adjust_protection_map(void)
}
arch_initcall(adjust_protection_map);
-pgprot_t vm_get_page_prot(unsigned long vm_flags)
+pgprot_t vm_get_page_prot(vm_flags_t vm_flags)
{
- pteval_t prot;
+ ptdesc_t prot;
/* Short circuit GCS to avoid bloating the table. */
if (system_supports_gcs() && (vm_flags & VM_SHADOW_STACK)) {
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index ea6695d53fb9..34e5d78af076 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -26,6 +26,7 @@
#include <linux/set_memory.h>
#include <linux/kfence.h>
#include <linux/pkeys.h>
+#include <linux/mm_inline.h>
#include <asm/barrier.h>
#include <asm/cputype.h>
@@ -46,6 +47,13 @@
#define NO_CONT_MAPPINGS BIT(1)
#define NO_EXEC_MAPPINGS BIT(2) /* assumes FEAT_HPDS is not used */
+enum pgtable_type {
+ TABLE_PTE,
+ TABLE_PMD,
+ TABLE_PUD,
+ TABLE_P4D,
+};
+
u64 kimage_voffset __ro_after_init;
EXPORT_SYMBOL(kimage_voffset);
@@ -107,7 +115,7 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
}
EXPORT_SYMBOL(phys_mem_access_prot);
-static phys_addr_t __init early_pgtable_alloc(int shift)
+static phys_addr_t __init early_pgtable_alloc(enum pgtable_type pgtable_type)
{
phys_addr_t phys;
@@ -192,7 +200,7 @@ static void init_pte(pte_t *ptep, unsigned long addr, unsigned long end,
static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
unsigned long end, phys_addr_t phys,
pgprot_t prot,
- phys_addr_t (*pgtable_alloc)(int),
+ phys_addr_t (*pgtable_alloc)(enum pgtable_type),
int flags)
{
unsigned long next;
@@ -207,7 +215,7 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
if (flags & NO_EXEC_MAPPINGS)
pmdval |= PMD_TABLE_PXN;
BUG_ON(!pgtable_alloc);
- pte_phys = pgtable_alloc(PAGE_SHIFT);
+ pte_phys = pgtable_alloc(TABLE_PTE);
ptep = pte_set_fixmap(pte_phys);
init_clear_pgtable(ptep);
ptep += pte_index(addr);
@@ -243,7 +251,7 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
static void init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
phys_addr_t phys, pgprot_t prot,
- phys_addr_t (*pgtable_alloc)(int), int flags)
+ phys_addr_t (*pgtable_alloc)(enum pgtable_type), int flags)
{
unsigned long next;
@@ -277,7 +285,8 @@ static void init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
unsigned long end, phys_addr_t phys,
pgprot_t prot,
- phys_addr_t (*pgtable_alloc)(int), int flags)
+ phys_addr_t (*pgtable_alloc)(enum pgtable_type),
+ int flags)
{
unsigned long next;
pud_t pud = READ_ONCE(*pudp);
@@ -294,7 +303,7 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
if (flags & NO_EXEC_MAPPINGS)
pudval |= PUD_TABLE_PXN;
BUG_ON(!pgtable_alloc);
- pmd_phys = pgtable_alloc(PMD_SHIFT);
+ pmd_phys = pgtable_alloc(TABLE_PMD);
pmdp = pmd_set_fixmap(pmd_phys);
init_clear_pgtable(pmdp);
pmdp += pmd_index(addr);
@@ -325,7 +334,7 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
phys_addr_t phys, pgprot_t prot,
- phys_addr_t (*pgtable_alloc)(int),
+ phys_addr_t (*pgtable_alloc)(enum pgtable_type),
int flags)
{
unsigned long next;
@@ -339,7 +348,7 @@ static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
if (flags & NO_EXEC_MAPPINGS)
p4dval |= P4D_TABLE_PXN;
BUG_ON(!pgtable_alloc);
- pud_phys = pgtable_alloc(PUD_SHIFT);
+ pud_phys = pgtable_alloc(TABLE_PUD);
pudp = pud_set_fixmap(pud_phys);
init_clear_pgtable(pudp);
pudp += pud_index(addr);
@@ -383,7 +392,7 @@ static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
phys_addr_t phys, pgprot_t prot,
- phys_addr_t (*pgtable_alloc)(int),
+ phys_addr_t (*pgtable_alloc)(enum pgtable_type),
int flags)
{
unsigned long next;
@@ -397,7 +406,7 @@ static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
if (flags & NO_EXEC_MAPPINGS)
pgdval |= PGD_TABLE_PXN;
BUG_ON(!pgtable_alloc);
- p4d_phys = pgtable_alloc(P4D_SHIFT);
+ p4d_phys = pgtable_alloc(TABLE_P4D);
p4dp = p4d_set_fixmap(p4d_phys);
init_clear_pgtable(p4dp);
p4dp += p4d_index(addr);
@@ -427,7 +436,7 @@ static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
static void __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys,
unsigned long virt, phys_addr_t size,
pgprot_t prot,
- phys_addr_t (*pgtable_alloc)(int),
+ phys_addr_t (*pgtable_alloc)(enum pgtable_type),
int flags)
{
unsigned long addr, end, next;
@@ -455,7 +464,7 @@ static void __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys,
static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
unsigned long virt, phys_addr_t size,
pgprot_t prot,
- phys_addr_t (*pgtable_alloc)(int),
+ phys_addr_t (*pgtable_alloc)(enum pgtable_type),
int flags)
{
mutex_lock(&fixmap_lock);
@@ -468,37 +477,48 @@ static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
extern __alias(__create_pgd_mapping_locked)
void create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt,
phys_addr_t size, pgprot_t prot,
- phys_addr_t (*pgtable_alloc)(int), int flags);
+ phys_addr_t (*pgtable_alloc)(enum pgtable_type),
+ int flags);
#endif
-static phys_addr_t __pgd_pgtable_alloc(int shift)
+static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm,
+ enum pgtable_type pgtable_type)
{
/* Page is zeroed by init_clear_pgtable() so don't duplicate effort. */
- void *ptr = (void *)__get_free_page(GFP_PGTABLE_KERNEL & ~__GFP_ZERO);
+ struct ptdesc *ptdesc = pagetable_alloc(GFP_PGTABLE_KERNEL & ~__GFP_ZERO, 0);
+ phys_addr_t pa;
+
+ BUG_ON(!ptdesc);
+ pa = page_to_phys(ptdesc_page(ptdesc));
+
+ switch (pgtable_type) {
+ case TABLE_PTE:
+ BUG_ON(!pagetable_pte_ctor(mm, ptdesc));
+ break;
+ case TABLE_PMD:
+ BUG_ON(!pagetable_pmd_ctor(mm, ptdesc));
+ break;
+ case TABLE_PUD:
+ pagetable_pud_ctor(ptdesc);
+ break;
+ case TABLE_P4D:
+ pagetable_p4d_ctor(ptdesc);
+ break;
+ }
- BUG_ON(!ptr);
- return __pa(ptr);
+ return pa;
}
-static phys_addr_t pgd_pgtable_alloc(int shift)
+static phys_addr_t __maybe_unused
+pgd_pgtable_alloc_init_mm(enum pgtable_type pgtable_type)
{
- phys_addr_t pa = __pgd_pgtable_alloc(shift);
- struct ptdesc *ptdesc = page_ptdesc(phys_to_page(pa));
-
- /*
- * Call proper page table ctor in case later we need to
- * call core mm functions like apply_to_page_range() on
- * this pre-allocated page table.
- *
- * We don't select ARCH_ENABLE_SPLIT_PMD_PTLOCK if pmd is
- * folded, and if so pagetable_pte_ctor() becomes nop.
- */
- if (shift == PAGE_SHIFT)
- BUG_ON(!pagetable_pte_ctor(ptdesc));
- else if (shift == PMD_SHIFT)
- BUG_ON(!pagetable_pmd_ctor(ptdesc));
+ return __pgd_pgtable_alloc(&init_mm, pgtable_type);
+}
- return pa;
+static phys_addr_t
+pgd_pgtable_alloc_special_mm(enum pgtable_type pgtable_type)
+{
+ return __pgd_pgtable_alloc(NULL, pgtable_type);
}
/*
@@ -530,7 +550,7 @@ void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
__create_pgd_mapping(mm->pgd, phys, virt, size, prot,
- pgd_pgtable_alloc, flags);
+ pgd_pgtable_alloc_special_mm, flags);
}
static void update_mapping_prot(phys_addr_t phys, unsigned long virt,
@@ -744,7 +764,7 @@ static int __init map_entry_trampoline(void)
memset(tramp_pg_dir, 0, PGD_SIZE);
__create_pgd_mapping(tramp_pg_dir, pa_start, TRAMP_VALIAS,
entry_tramp_text_size(), prot,
- __pgd_pgtable_alloc, NO_BLOCK_MAPPINGS);
+ pgd_pgtable_alloc_init_mm, NO_BLOCK_MAPPINGS);
/* Map both the text and data into the kernel page table */
for (i = 0; i < DIV_ROUND_UP(entry_tramp_text_size(), PAGE_SIZE); i++)
@@ -1286,7 +1306,8 @@ int pud_free_pmd_page(pud_t *pudp, unsigned long addr)
next = addr;
end = addr + PUD_SIZE;
do {
- pmd_free_pte_page(pmdp, next);
+ if (pmd_present(pmdp_get(pmdp)))
+ pmd_free_pte_page(pmdp, next);
} while (pmdp++, next += PMD_SIZE, next != end);
pud_clear(pudp);
@@ -1350,7 +1371,7 @@ int arch_add_memory(int nid, u64 start, u64 size,
flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
__create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
- size, params->pgprot, __pgd_pgtable_alloc,
+ size, params->pgprot, pgd_pgtable_alloc_init_mm,
flags);
memblock_clear_nomap(start, size);
@@ -1504,24 +1525,41 @@ static int __init prevent_bootmem_remove_init(void)
early_initcall(prevent_bootmem_remove_init);
#endif
-pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
+pte_t modify_prot_start_ptes(struct vm_area_struct *vma, unsigned long addr,
+ pte_t *ptep, unsigned int nr)
{
+ pte_t pte = get_and_clear_ptes(vma->vm_mm, addr, ptep, nr);
+
if (alternative_has_cap_unlikely(ARM64_WORKAROUND_2645198)) {
/*
* Break-before-make (BBM) is required for all user space mappings
* when the permission changes from executable to non-executable
* in cases where cpu is affected with errata #2645198.
*/
- if (pte_user_exec(ptep_get(ptep)))
- return ptep_clear_flush(vma, addr, ptep);
+ if (pte_accessible(vma->vm_mm, pte) && pte_user_exec(pte))
+ __flush_tlb_range(vma, addr, nr * PAGE_SIZE,
+ PAGE_SIZE, true, 3);
}
- return ptep_get_and_clear(vma->vm_mm, addr, ptep);
+
+ return pte;
+}
+
+pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
+{
+ return modify_prot_start_ptes(vma, addr, ptep, 1);
+}
+
+void modify_prot_commit_ptes(struct vm_area_struct *vma, unsigned long addr,
+ pte_t *ptep, pte_t old_pte, pte_t pte,
+ unsigned int nr)
+{
+ set_ptes(vma->vm_mm, addr, ptep, pte, nr);
}
void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep,
pte_t old_pte, pte_t pte)
{
- set_pte_at(vma->vm_mm, addr, ptep, pte);
+ modify_prot_commit_ptes(vma, addr, ptep, old_pte, pte, 1);
}
/*
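
The new modify_prot_start_ptes()/modify_prot_commit_ptes() pair above batches the prot-change transaction over nr ptes, with ptep_modify_prot_start()/ptep_modify_prot_commit() reduced to nr == 1 wrappers. Roughly, a caller would drive the pair as sketched below; the caller itself is hypothetical and omits the locking and pte fix-ups a real user needs.

static void change_prot_batch(struct vm_area_struct *vma, unsigned long addr,
			      pte_t *ptep, pgprot_t newprot, unsigned int nr)
{
	pte_t old_pte, pte;

	/* Clear the ptes; flushes stale entries when errata #2645198 requires BBM. */
	old_pte = modify_prot_start_ptes(vma, addr, ptep, nr);

	/* Apply the new permissions on top of the accumulated old pte. */
	pte = pte_modify(old_pte, newprot);

	/* Repaint all nr entries with the new protections. */
	modify_prot_commit_ptes(vma, addr, ptep, old_pte, pte, nr);
}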
diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
index 80d470aa469d..8c75965afc9e 100644
--- a/arch/arm64/mm/proc.S
+++ b/arch/arm64/mm/proc.S
@@ -454,7 +454,7 @@ SYM_FUNC_START(__cpu_setup)
dsb nsh
msr cpacr_el1, xzr // Reset cpacr_el1
- mov x1, #1 << 12 // Reset mdscr_el1 and disable
+ mov x1, MDSCR_EL1_TDCC // Reset mdscr_el1 and disable
msr mdscr_el1, x1 // access to the DCC from EL0
reset_pmuserenr_el0 x1 // Disable PMU access from EL0
reset_amuserenr_el0 x1 // Disable AMU access from EL0
@@ -518,7 +518,6 @@ alternative_else_nop_endif
msr REG_PIR_EL1, x0
orr tcr2, tcr2, TCR2_EL1_PIE
- msr REG_TCR2_EL1, x0
.Lskip_indirection:
diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c
index 8cec0da4cff2..421a5de806c6 100644
--- a/arch/arm64/mm/ptdump.c
+++ b/arch/arm64/mm/ptdump.c
@@ -189,12 +189,12 @@ static void note_prot_wx(struct ptdump_pg_state *st, unsigned long addr)
}
void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
- u64 val)
+ pteval_t val)
{
struct ptdump_pg_state *st = container_of(pt_st, struct ptdump_pg_state, ptdump);
struct ptdump_pg_level *pg_level = st->pg_level;
static const char units[] = "KMGTPE";
- u64 prot = 0;
+ ptdesc_t prot = 0;
/* check if the current level has been folded dynamically */
if (st->mm && ((level == 1 && mm_p4d_folded(st->mm)) ||
@@ -251,6 +251,38 @@ void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
}
+void note_page_pte(struct ptdump_state *pt_st, unsigned long addr, pte_t pte)
+{
+ note_page(pt_st, addr, 4, pte_val(pte));
+}
+
+void note_page_pmd(struct ptdump_state *pt_st, unsigned long addr, pmd_t pmd)
+{
+ note_page(pt_st, addr, 3, pmd_val(pmd));
+}
+
+void note_page_pud(struct ptdump_state *pt_st, unsigned long addr, pud_t pud)
+{
+ note_page(pt_st, addr, 2, pud_val(pud));
+}
+
+void note_page_p4d(struct ptdump_state *pt_st, unsigned long addr, p4d_t p4d)
+{
+ note_page(pt_st, addr, 1, p4d_val(p4d));
+}
+
+void note_page_pgd(struct ptdump_state *pt_st, unsigned long addr, pgd_t pgd)
+{
+ note_page(pt_st, addr, 0, pgd_val(pgd));
+}
+
+void note_page_flush(struct ptdump_state *pt_st)
+{
+ pte_t pte_zero = {0};
+
+ note_page(pt_st, 0, -1, pte_val(pte_zero));
+}
+
void ptdump_walk(struct seq_file *s, struct ptdump_info *info)
{
unsigned long end = ~0UL;
@@ -266,7 +298,12 @@ void ptdump_walk(struct seq_file *s, struct ptdump_info *info)
.pg_level = &kernel_pg_levels[0],
.level = -1,
.ptdump = {
- .note_page = note_page,
+ .note_page_pte = note_page_pte,
+ .note_page_pmd = note_page_pmd,
+ .note_page_pud = note_page_pud,
+ .note_page_p4d = note_page_p4d,
+ .note_page_pgd = note_page_pgd,
+ .note_page_flush = note_page_flush,
.range = (struct ptdump_range[]){
{info->base_addr, end},
{0, 0}
@@ -303,7 +340,12 @@ bool ptdump_check_wx(void)
.level = -1,
.check_wx = true,
.ptdump = {
- .note_page = note_page,
+ .note_page_pte = note_page_pte,
+ .note_page_pmd = note_page_pmd,
+ .note_page_pud = note_page_pud,
+ .note_page_p4d = note_page_p4d,
+ .note_page_pgd = note_page_pgd,
+ .note_page_flush = note_page_flush,
.range = (struct ptdump_range[]) {
{_PAGE_OFFSET(vabits_actual), ~0UL},
{0, 0}
diff --git a/arch/arm64/mm/ptdump_debugfs.c b/arch/arm64/mm/ptdump_debugfs.c
index 68bf1a125502..1e308328c079 100644
--- a/arch/arm64/mm/ptdump_debugfs.c
+++ b/arch/arm64/mm/ptdump_debugfs.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/debugfs.h>
-#include <linux/memory_hotplug.h>
#include <linux/seq_file.h>
#include <asm/ptdump.h>
@@ -9,9 +8,7 @@ static int ptdump_show(struct seq_file *m, void *v)
{
struct ptdump_info *info = m->private;
- get_online_mems();
ptdump_walk(m, info);
- put_online_mems();
return 0;
}
DEFINE_SHOW_ATTRIBUTE(ptdump);