Diffstat (limited to 'arch/arm64/mm')
-rw-r--r--   arch/arm64/mm/contpte.c          213
-rw-r--r--   arch/arm64/mm/fault.c            115
-rw-r--r--   arch/arm64/mm/gcs.c                6
-rw-r--r--   arch/arm64/mm/hugetlbpage.c        2
-rw-r--r--   arch/arm64/mm/init.c               2
-rw-r--r--   arch/arm64/mm/mmap.c               4
-rw-r--r--   arch/arm64/mm/mmu.c              124
-rw-r--r--   arch/arm64/mm/proc.S               3
-rw-r--r--   arch/arm64/mm/ptdump.c            50
-rw-r--r--   arch/arm64/mm/ptdump_debugfs.c     3
10 files changed, 362 insertions, 160 deletions
diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c
index bcac4f55f9c1..c0557945939c 100644
--- a/arch/arm64/mm/contpte.c
+++ b/arch/arm64/mm/contpte.c
@@ -68,7 +68,144 @@ static void contpte_convert(struct mm_struct *mm, unsigned long addr,
 			pte = pte_mkyoung(pte);
 	}
 
-	__flush_tlb_range(&vma, start_addr, addr, PAGE_SIZE, true, 3);
+	/*
+	 * On eliding the __flush_tlb_range() under BBML2+noabort:
+	 *
+	 * NOTE: Instead of using N=16 as the contiguous block length, we use
+	 *	 N=4 for clarity.
+	 *
+	 * NOTE: 'n' and 'c' are used to denote the "contiguous bit" being
+	 *	 unset and set, respectively.
+	 *
+	 * We worry about two cases where contiguous bit is used:
+	 *  - When folding N smaller non-contiguous ptes as 1 contiguous block.
+	 *  - When unfolding a contiguous block into N smaller non-contiguous ptes.
+	 *
+	 * Currently, the BBML0 folding case looks as follows:
+	 *
+	 *  0) Initial page-table layout:
+	 *
+	 *   +----+----+----+----+
+	 *   |RO,n|RO,n|RO,n|RW,n| <--- last page being set as RO
+	 *   +----+----+----+----+
+	 *
+	 *  1) Aggregate AF + dirty flags using __ptep_get_and_clear():
+	 *
+	 *   +----+----+----+----+
+	 *   |  0 |  0 |  0 |  0 |
+	 *   +----+----+----+----+
+	 *
+	 *  2) __flush_tlb_range():
+	 *
+	 *   |____ tlbi + dsb ____|
+	 *
+	 *  3) __set_ptes() to repaint contiguous block:
+	 *
+	 *   +----+----+----+----+
+	 *   |RO,c|RO,c|RO,c|RO,c|
+	 *   +----+----+----+----+
+	 *
+	 *  4) The kernel will eventually __flush_tlb() for changed page:
+	 *
+	 *                  |____| <--- tlbi + dsb
+	 *
+	 * As expected, the intermediate tlbi+dsb ensures that other PEs
+	 * only ever see an invalid (0) entry, or the new contiguous TLB entry.
+	 * The final tlbi+dsb will always throw away the newly installed
+	 * contiguous TLB entry, which is a micro-optimisation opportunity,
+	 * but does not affect correctness.
+	 *
+	 * In the BBML2 case, the change is avoiding the intermediate tlbi+dsb.
+	 * This means a few things, but notably other PEs will still "see" any
+	 * stale cached TLB entries. This could lead to a "contiguous bit
+	 * misprogramming" issue until the final tlbi+dsb of the changed page,
+	 * which would clear out both the stale (RW,n) entry and the new (RO,c)
+	 * contiguous entry installed in its place.
+	 *
+	 * What this is saying, is the following:
+	 *
+	 *   +----+----+----+----+
+	 *   |RO,n|RO,n|RO,n|RW,n| <--- old page tables, all non-contiguous
+	 *   +----+----+----+----+
+	 *
+	 *   +----+----+----+----+
+	 *   |RO,c|RO,c|RO,c|RO,c| <--- new page tables, all contiguous
+	 *   +----+----+----+----+
+	 *             /\
+	 *             ||
+	 *
+	 * If both the old single (RW,n) and new contiguous (RO,c) TLB entries
+	 * are present, and a write is made to this address, do we fault or
+	 * is the write permitted (via amalgamation)?
+	 *
+	 * The relevant Arm ARM DDI 0487L.a requirements are RNGLXZ and RJQQTC,
+	 * and together state that when BBML1 or BBML2 are implemented, either
+	 * a TLB conflict abort is raised (which we expressly forbid), or will
+	 * "produce an OA, access permissions, and memory attributes that are
+	 * consistent with any of the programmed translation table values".
+	 *
+	 * That is to say, will either raise a TLB conflict, or produce one of
+	 * the cached TLB entries, but never amalgamate.
+	 *
+	 * Thus, as the page tables are only considered "consistent" after
+	 * the final tlbi+dsb (which evicts both the single stale (RW,n) TLB
+	 * entry as well as the new contiguous (RO,c) TLB entry), omitting the
+	 * initial tlbi+dsb is correct.
+	 *
+	 * It is also important to note that at the end of the BBML2 folding
+	 * case, we are still left with potentially all N TLB entries still
+	 * cached (the N-1 non-contiguous ptes, and the single contiguous
+	 * block). However, over time, natural TLB pressure will cause the
+	 * non-contiguous pte TLB entries to be flushed, leaving only the
+	 * contiguous block TLB entry. This means that omitting the tlbi+dsb is
+	 * not only correct, but also keeps our eventual performance benefits.
+	 *
+	 * For the unfolding case, BBML0 looks as follows:
+	 *
+	 *  0) Initial page-table layout:
+	 *
+	 *   +----+----+----+----+
+	 *   |RW,c|RW,c|RW,c|RW,c| <--- last page being set as RO
+	 *   +----+----+----+----+
+	 *
+	 *  1) Aggregate AF + dirty flags using __ptep_get_and_clear():
+	 *
+	 *   +----+----+----+----+
+	 *   |  0 |  0 |  0 |  0 |
+	 *   +----+----+----+----+
+	 *
+	 *  2) __flush_tlb_range():
+	 *
+	 *   |____ tlbi + dsb ____|
+	 *
+	 *  3) __set_ptes() to repaint as non-contiguous:
+	 *
+	 *   +----+----+----+----+
+	 *   |RW,n|RW,n|RW,n|RW,n|
+	 *   +----+----+----+----+
+	 *
+	 *  4) Update changed page permissions:
+	 *
+	 *   +----+----+----+----+
+	 *   |RW,n|RW,n|RW,n|RO,n| <--- last page permissions set
+	 *   +----+----+----+----+
+	 *
+	 *  5) The kernel will eventually __flush_tlb() for changed page:
+	 *
+	 *                  |____| <--- tlbi + dsb
+	 *
+	 * For BBML2, we again remove the intermediate tlbi+dsb. Here, there
+	 * are no issues, as the final tlbi+dsb covering the changed page is
+	 * guaranteed to remove the original large contiguous (RW,c) TLB entry,
+	 * as well as the intermediate (RW,n) TLB entry; the next access will
+	 * install the new (RO,n) TLB entry and the page tables are only
+	 * considered "consistent" after the final tlbi+dsb, so software must
+	 * be prepared for this inconsistency prior to finishing the mm dance
+	 * regardless.
+	 */
+
+	if (!system_supports_bbml2_noabort())
+		__flush_tlb_range(&vma, start_addr, addr, PAGE_SIZE, true, 3);
 
 	__set_ptes(mm, start_addr, start_ptep, pte, CONT_PTES);
 }
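The elision argument above hinges on the Arm ARM's no-amalgamation guarantee. As a concrete illustration, here is a stand-alone sketch (plain user-space C; the cached entries and write policy are invented for illustration, this is not kernel code) of what a remote PE can observe between __set_ptes() and the final tlbi+dsb in the folding case:

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

/* One TLB entry as another PE may have it cached. */
struct tlb_entry {
	bool writeable;		/* RW vs RO */
	bool contig;		/* contiguous-bit set */
};

int main(void)
{
	/*
	 * With the intermediate flush elided, both translations may be
	 * cached at once: the stale pre-fold entry for the last page, and
	 * the new contiguous entry installed by __set_ptes().
	 */
	const struct tlb_entry cached[] = {
		{ .writeable = true,  .contig = false },	/* stale (RW,n) */
		{ .writeable = false, .contig = true  },	/* new (RO,c) */
	};

	/*
	 * Per RNGLXZ/RJQQTC, a lookup yields one of the cached entries (or
	 * a TLB conflict abort, forbidden under BBML2+noabort), never a
	 * blend: a write either succeeds via the stale entry (undone by the
	 * final tlbi+dsb) or faults via the new one.
	 */
	for (unsigned int i = 0; i < sizeof(cached) / sizeof(cached[0]); i++) {
		const struct tlb_entry *e = &cached[i];

		assert(!(e->writeable && e->contig));	/* no RW+contig blend */
		printf("lookup %u: write %s\n", i,
		       e->writeable ? "permitted (stale entry)"
				    : "faults (new entry)");
	}
	return 0;
}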
@@ -169,17 +306,46 @@ pte_t contpte_ptep_get(pte_t *ptep, pte_t orig_pte)
 	for (i = 0; i < CONT_PTES; i++, ptep++) {
 		pte = __ptep_get(ptep);
 
-		if (pte_dirty(pte))
+		if (pte_dirty(pte)) {
 			orig_pte = pte_mkdirty(orig_pte);
-
-		if (pte_young(pte))
+			for (; i < CONT_PTES; i++, ptep++) {
+				pte = __ptep_get(ptep);
+				if (pte_young(pte)) {
+					orig_pte = pte_mkyoung(orig_pte);
+					break;
+				}
+			}
+			break;
+		}
+
+		if (pte_young(pte)) {
 			orig_pte = pte_mkyoung(orig_pte);
+			i++;
+			ptep++;
+			for (; i < CONT_PTES; i++, ptep++) {
+				pte = __ptep_get(ptep);
+				if (pte_dirty(pte)) {
+					orig_pte = pte_mkdirty(orig_pte);
+					break;
+				}
+			}
+			break;
+		}
 	}
 
 	return orig_pte;
 }
 EXPORT_SYMBOL_GPL(contpte_ptep_get);
 
+static inline bool contpte_is_consistent(pte_t pte, unsigned long pfn,
+					 pgprot_t orig_prot)
+{
+	pgprot_t prot = pte_pgprot(pte_mkold(pte_mkclean(pte)));
+
+	return pte_valid_cont(pte) && pte_pfn(pte) == pfn &&
+		pgprot_val(prot) == pgprot_val(orig_prot);
+}
+
 pte_t contpte_ptep_get_lockless(pte_t *orig_ptep)
 {
 	/*
@@ -202,7 +368,6 @@ pte_t contpte_ptep_get_lockless(pte_t *orig_ptep)
 	pgprot_t orig_prot;
 	unsigned long pfn;
 	pte_t orig_pte;
-	pgprot_t prot;
 	pte_t *ptep;
 	pte_t pte;
 	int i;
@@ -219,18 +384,44 @@ retry:
 	for (i = 0; i < CONT_PTES; i++, ptep++, pfn++) {
 		pte = __ptep_get(ptep);
-		prot = pte_pgprot(pte_mkold(pte_mkclean(pte)));
 
-		if (!pte_valid_cont(pte) ||
-		    pte_pfn(pte) != pfn ||
-		    pgprot_val(prot) != pgprot_val(orig_prot))
+		if (!contpte_is_consistent(pte, pfn, orig_prot))
 			goto retry;
 
-		if (pte_dirty(pte))
+		if (pte_dirty(pte)) {
 			orig_pte = pte_mkdirty(orig_pte);
+			for (; i < CONT_PTES; i++, ptep++, pfn++) {
+				pte = __ptep_get(ptep);
+
+				if (!contpte_is_consistent(pte, pfn, orig_prot))
+					goto retry;
+
+				if (pte_young(pte)) {
+					orig_pte = pte_mkyoung(orig_pte);
+					break;
+				}
+			}
+			break;
+		}
 
-		if (pte_young(pte))
+		if (pte_young(pte)) {
 			orig_pte = pte_mkyoung(orig_pte);
+			i++;
+			ptep++;
+			pfn++;
+			for (; i < CONT_PTES; i++, ptep++, pfn++) {
+				pte = __ptep_get(ptep);
+
+				if (!contpte_is_consistent(pte, pfn, orig_prot))
+					goto retry;
+
+				if (pte_dirty(pte)) {
+					orig_pte = pte_mkdirty(orig_pte);
+					break;
+				}
+			}
+			break;
+		}
 	}
 
 	return orig_pte;
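The rewritten contpte_ptep_get() loops above short-circuit the scan as soon as both flags are known: once one flag has been found, only the other can still change the result. A minimal stand-alone illustration of that scan pattern (user-space C with a toy flags type, not the kernel's pte accessors):

#include <stdbool.h>
#include <stdio.h>

#define CONT_PTES 16

struct flags {
	bool dirty;
	bool young;
};

/* Same shape as the kernel loop: stop as soon as both bits are known. */
static struct flags fold_flags(const struct flags pte[CONT_PTES])
{
	struct flags out = { false, false };
	int i;

	for (i = 0; i < CONT_PTES; i++) {
		if (pte[i].dirty) {
			out.dirty = true;
			/* only "young" can still change the result */
			for (; i < CONT_PTES; i++) {
				if (pte[i].young) {
					out.young = true;
					break;
				}
			}
			break;
		}
		if (pte[i].young) {
			out.young = true;
			/* this entry is known clean; only "dirty" remains */
			for (i++; i < CONT_PTES; i++) {
				if (pte[i].dirty) {
					out.dirty = true;
					break;
				}
			}
			break;
		}
	}
	return out;
}

int main(void)
{
	static struct flags ptes[CONT_PTES];	/* zero-initialised */
	struct flags folded;

	ptes[1].young = true;
	ptes[3].dirty = true;	/* the walk stops here, at entry 3 of 16 */

	folded = fold_flags(ptes);
	printf("dirty=%d young=%d\n", folded.dirty, folded.young);
	return 0;
}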
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index ec0a337891dd..d816ff44faff 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -53,18 +53,12 @@ struct fault_info {
 };
 
 static const struct fault_info fault_info[];
-static struct fault_info debug_fault_info[];
 
 static inline const struct fault_info *esr_to_fault_info(unsigned long esr)
 {
 	return fault_info + (esr & ESR_ELx_FSC);
 }
 
-static inline const struct fault_info *esr_to_debug_fault_info(unsigned long esr)
-{
-	return debug_fault_info + DBG_ESR_EVT(esr);
-}
-
 static void data_abort_decode(unsigned long esr)
 {
 	unsigned long iss2 = ESR_ELx_ISS2(esr);
@@ -487,17 +481,29 @@ static void do_bad_area(unsigned long far, unsigned long esr,
 	}
 }
 
-static bool fault_from_pkey(unsigned long esr, struct vm_area_struct *vma,
-			unsigned int mm_flags)
+static bool fault_from_pkey(struct vm_area_struct *vma, unsigned int mm_flags)
 {
-	unsigned long iss2 = ESR_ELx_ISS2(esr);
-
 	if (!system_supports_poe())
 		return false;
 
-	if (esr_fsc_is_permission_fault(esr) && (iss2 & ESR_ELx_Overlay))
-		return true;
-
+	/*
+	 * We do not check whether an Overlay fault has occurred because we
+	 * cannot make a decision based solely on its value:
+	 *
+	 * - If Overlay is set, a fault did occur due to POE, but it may be
+	 *   spurious in those cases where we update POR_EL0 without ISB (e.g.
+	 *   on context-switch). We would then need to manually check POR_EL0
+	 *   against vma_pkey(vma), which is exactly what
+	 *   arch_vma_access_permitted() does.
+	 *
+	 * - If Overlay is not set, we may still need to report a pkey fault.
+	 *   This is the case if an access was made within a mapping but with no
+	 *   page mapped, and POR_EL0 forbids the access (according to
+	 *   vma_pkey()). Such access will result in a SIGSEGV regardless
+	 *   because core code checks arch_vma_access_permitted(), but in order
+	 *   to report the correct error code - SEGV_PKUERR - we must handle
+	 *   that case here.
+	 */
 	return !arch_vma_access_permitted(vma,
 			mm_flags & FAULT_FLAG_WRITE,
 			mm_flags & FAULT_FLAG_INSTRUCTION,
@@ -549,7 +555,7 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
 	const struct fault_info *inf;
 	struct mm_struct *mm = current->mm;
 	vm_fault_t fault;
-	unsigned long vm_flags;
+	vm_flags_t vm_flags;
 	unsigned int mm_flags = FAULT_FLAG_DEFAULT;
 	unsigned long addr = untagged_addr(far);
 	struct vm_area_struct *vma;
@@ -635,7 +641,7 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
 		goto bad_area;
 	}
 
-	if (fault_from_pkey(esr, vma, mm_flags)) {
+	if (fault_from_pkey(vma, mm_flags)) {
 		pkey = vma_pkey(vma);
 		vma_end_read(vma);
 		fault = 0;
@@ -679,7 +685,7 @@ retry:
 		goto bad_area;
 	}
 
-	if (fault_from_pkey(esr, vma, mm_flags)) {
+	if (fault_from_pkey(vma, mm_flags)) {
 		pkey = vma_pkey(vma);
 		mmap_read_unlock(mm);
 		fault = 0;
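The comment in fault_from_pkey() above encodes a small decision table. The sketch below restates it in stand-alone C (por_el0_allows() and the key policy are invented stand-ins, not the kernel's helpers): whether or not the hardware reported the Overlay bit, the verdict comes from the current POR_EL0 permissions for the VMA's key.

#include <stdbool.h>
#include <stdio.h>

struct vma {
	int pkey;
};

/* Invented stand-in for checking POR_EL0 against a protection key. */
static bool por_el0_allows(int pkey, bool write)
{
	(void)write;
	return pkey != 1;	/* toy policy: key 1 is access-disabled */
}

/* The Overlay bit is deliberately unused, as in the comment above. */
static bool fault_from_pkey(const struct vma *vma, bool write)
{
	return !por_el0_allows(vma->pkey, write);
}

int main(void)
{
	const struct vma ok = { .pkey = 0 };
	const struct vma denied = { .pkey = 1 };

	printf("pkey 0: %s\n", fault_from_pkey(&ok, true) ?
	       "SEGV_PKUERR" : "not a pkey fault");
	printf("pkey 1: %s\n", fault_from_pkey(&denied, true) ?
	       "SEGV_PKUERR" : "not a pkey fault");
	return 0;
}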
@@ -826,6 +832,7 @@ static int do_sea(unsigned long far, unsigned long esr, struct pt_regs *regs)
 		 */
 		siaddr = untagged_addr(far);
 	}
+	add_taint(TAINT_MACHINE_CHECK, LOCKDEP_STILL_OK);
 	arm64_notify_die(inf->name, regs, inf->sig, inf->code, siaddr, esr);
 
 	return 0;
@@ -837,9 +844,12 @@ static int do_tag_check_fault(unsigned long far, unsigned long esr,
 	/*
 	 * The architecture specifies that bits 63:60 of FAR_EL1 are UNKNOWN
 	 * for tag check faults. Set them to corresponding bits in the untagged
-	 * address.
+	 * address if ARM64_MTE_FAR isn't supported.
+	 * Otherwise, bits 63:60 of FAR_EL1 are not UNKNOWN.
 	 */
-	far = (__untagged_addr(far) & ~MTE_TAG_MASK) | (far & MTE_TAG_MASK);
+	if (!cpus_have_cap(ARM64_MTE_FAR))
+		far = (__untagged_addr(far) & ~MTE_TAG_MASK) | (far & MTE_TAG_MASK);
+
 	do_bad_area(far, esr, regs);
 	return 0;
 }
@@ -939,75 +949,6 @@ void do_sp_pc_abort(unsigned long addr, unsigned long esr, struct pt_regs *regs)
 NOKPROBE_SYMBOL(do_sp_pc_abort);
 
 /*
- * __refdata because early_brk64 is __init, but the reference to it is
- * clobbered at arch_initcall time.
- * See traps.c and debug-monitors.c:debug_traps_init().
- */
-static struct fault_info __refdata debug_fault_info[] = {
-	{ do_bad,	SIGTRAP,	TRAP_HWBKPT,	"hardware breakpoint"	},
-	{ do_bad,	SIGTRAP,	TRAP_HWBKPT,	"hardware single-step"	},
-	{ do_bad,	SIGTRAP,	TRAP_HWBKPT,	"hardware watchpoint"	},
-	{ do_bad,	SIGKILL,	SI_KERNEL,	"unknown 3"		},
-	{ do_bad,	SIGTRAP,	TRAP_BRKPT,	"aarch32 BKPT"		},
-	{ do_bad,	SIGKILL,	SI_KERNEL,	"aarch32 vector catch"	},
-	{ early_brk64,	SIGTRAP,	TRAP_BRKPT,	"aarch64 BRK"		},
-	{ do_bad,	SIGKILL,	SI_KERNEL,	"unknown 7"		},
-};
-
-void __init hook_debug_fault_code(int nr,
-				  int (*fn)(unsigned long, unsigned long, struct pt_regs *),
-				  int sig, int code, const char *name)
-{
-	BUG_ON(nr < 0 || nr >= ARRAY_SIZE(debug_fault_info));
-
-	debug_fault_info[nr].fn		= fn;
-	debug_fault_info[nr].sig	= sig;
-	debug_fault_info[nr].code	= code;
-	debug_fault_info[nr].name	= name;
-}
-
-/*
- * In debug exception context, we explicitly disable preemption despite
- * having interrupts disabled.
- * This serves two purposes: it makes it much less likely that we would
- * accidentally schedule in exception context and it will force a warning
- * if we somehow manage to schedule by accident.
- */
-static void debug_exception_enter(struct pt_regs *regs)
-{
-	preempt_disable();
-
-	/* This code is a bit fragile.  Test it. */
-	RCU_LOCKDEP_WARN(!rcu_is_watching(), "exception_enter didn't work");
-}
-NOKPROBE_SYMBOL(debug_exception_enter);
-
-static void debug_exception_exit(struct pt_regs *regs)
-{
-	preempt_enable_no_resched();
-}
-NOKPROBE_SYMBOL(debug_exception_exit);
-
-void do_debug_exception(unsigned long addr_if_watchpoint, unsigned long esr,
-			struct pt_regs *regs)
-{
-	const struct fault_info *inf = esr_to_debug_fault_info(esr);
-	unsigned long pc = instruction_pointer(regs);
-
-	debug_exception_enter(regs);
-
-	if (user_mode(regs) && !is_ttbr0_addr(pc))
-		arm64_apply_bp_hardening();
-
-	if (inf->fn(addr_if_watchpoint, esr, regs)) {
-		arm64_notify_die(inf->name, regs, inf->sig, inf->code, pc, esr);
-	}
-
-	debug_exception_exit(regs);
-}
-NOKPROBE_SYMBOL(do_debug_exception);
-
-/*
  * Used during anonymous page fault handling.
  */
 struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma,
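For the do_tag_check_fault() change above, a worked example of the FAR fix-up that remains on the !ARM64_MTE_FAR path (stand-alone C; untagged_addr() here is a simplified stand-in that only clears the top byte, unlike the kernel's __untagged_addr()):

#include <inttypes.h>
#include <stdio.h>

#define MTE_TAG_MASK	(UINT64_C(0xf) << 56)	/* logical tag, bits 59:56 */

/* Simplified stand-in: clear the whole top byte (bits 63:56). */
static uint64_t untagged_addr(uint64_t addr)
{
	return addr & ~(UINT64_C(0xff) << 56);
}

int main(void)
{
	/* bits 63:60 UNKNOWN (here 0xf), logical tag 0x3 */
	uint64_t far = UINT64_C(0xf3000000deadbeef);
	uint64_t fixed;

	/* keep the tag, rebuild everything else from the untagged address */
	fixed = (untagged_addr(far) & ~MTE_TAG_MASK) | (far & MTE_TAG_MASK);

	printf("far   = 0x%016" PRIx64 "\n", far);	/* f3000000deadbeef */
	printf("fixed = 0x%016" PRIx64 "\n", fixed);	/* 03000000deadbeef */
	return 0;
}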
diff --git a/arch/arm64/mm/gcs.c b/arch/arm64/mm/gcs.c
index 5c46ec527b1c..6e93f78de79b 100644
--- a/arch/arm64/mm/gcs.c
+++ b/arch/arm64/mm/gcs.c
@@ -157,12 +157,6 @@ void gcs_free(struct task_struct *task)
 	if (!system_supports_gcs())
 		return;
 
-	/*
-	 * When fork() with CLONE_VM fails, the child (tsk) already
-	 * has a GCS allocated, and exit_thread() calls this function
-	 * to free it. In this case the parent (current) and the
-	 * child share the same mm struct.
-	 */
 	if (!task->mm || task->mm != current->mm)
 		return;
 
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 0c8737f4f2ce..1d90a7e75333 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -225,7 +225,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 	ncontig = num_contig_ptes(sz, &pgsize);
 
 	if (!pte_present(pte)) {
-		for (i = 0; i < ncontig; i++, ptep++, addr += pgsize)
+		for (i = 0; i < ncontig; i++, ptep++)
 			__set_ptes_anysz(mm, ptep, pte, 1, pgsize);
 		return;
 	}
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 0c8c35dd645e..ea84a61ed508 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -106,7 +106,7 @@ static void __init arch_reserve_crashkernel(void)
 
 	ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
 				&crash_size, &crash_base,
-				&low_size, &high);
+				&low_size, NULL, &high);
 	if (ret)
 		return;
 
diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c
index 07aeab8a7606..08ee177432c2 100644
--- a/arch/arm64/mm/mmap.c
+++ b/arch/arm64/mm/mmap.c
@@ -81,9 +81,9 @@ static int __init adjust_protection_map(void)
 }
 arch_initcall(adjust_protection_map);
 
-pgprot_t vm_get_page_prot(unsigned long vm_flags)
+pgprot_t vm_get_page_prot(vm_flags_t vm_flags)
 {
-	pteval_t prot;
+	ptdesc_t prot;
 
 	/* Short circuit GCS to avoid bloating the table. */
 	if (system_supports_gcs() && (vm_flags & VM_SHADOW_STACK)) {
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index ea6695d53fb9..34e5d78af076 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -26,6 +26,7 @@
 #include <linux/set_memory.h>
 #include <linux/kfence.h>
 #include <linux/pkeys.h>
+#include <linux/mm_inline.h>
 
 #include <asm/barrier.h>
 #include <asm/cputype.h>
@@ -46,6 +47,13 @@
 #define NO_CONT_MAPPINGS	BIT(1)
 #define NO_EXEC_MAPPINGS	BIT(2)	/* assumes FEAT_HPDS is not used */
 
+enum pgtable_type {
+	TABLE_PTE,
+	TABLE_PMD,
+	TABLE_PUD,
+	TABLE_P4D,
+};
+
 u64 kimage_voffset __ro_after_init;
 EXPORT_SYMBOL(kimage_voffset);
@@ -107,7 +115,7 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
 }
 EXPORT_SYMBOL(phys_mem_access_prot);
 
-static phys_addr_t __init early_pgtable_alloc(int shift)
+static phys_addr_t __init early_pgtable_alloc(enum pgtable_type pgtable_type)
 {
 	phys_addr_t phys;
 
@@ -192,7 +200,7 @@ static void init_pte(pte_t *ptep, unsigned long addr, unsigned long end,
 static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
 				unsigned long end, phys_addr_t phys,
 				pgprot_t prot,
-				phys_addr_t (*pgtable_alloc)(int),
+				phys_addr_t (*pgtable_alloc)(enum pgtable_type),
 				int flags)
 {
 	unsigned long next;
@@ -207,7 +215,7 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
 		if (flags & NO_EXEC_MAPPINGS)
 			pmdval |= PMD_TABLE_PXN;
 		BUG_ON(!pgtable_alloc);
-		pte_phys = pgtable_alloc(PAGE_SHIFT);
+		pte_phys = pgtable_alloc(TABLE_PTE);
 		ptep = pte_set_fixmap(pte_phys);
 		init_clear_pgtable(ptep);
 		ptep += pte_index(addr);
@@ -243,7 +251,7 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
 
 static void init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
 		     phys_addr_t phys, pgprot_t prot,
-		     phys_addr_t (*pgtable_alloc)(int), int flags)
+		     phys_addr_t (*pgtable_alloc)(enum pgtable_type), int flags)
 {
 	unsigned long next;
 
@@ -277,7 +285,8 @@ static void init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
 static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
 				unsigned long end, phys_addr_t phys,
 				pgprot_t prot,
-				phys_addr_t (*pgtable_alloc)(int), int flags)
+				phys_addr_t (*pgtable_alloc)(enum pgtable_type),
+				int flags)
 {
 	unsigned long next;
 	pud_t pud = READ_ONCE(*pudp);
@@ -294,7 +303,7 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
 		if (flags & NO_EXEC_MAPPINGS)
 			pudval |= PUD_TABLE_PXN;
 		BUG_ON(!pgtable_alloc);
-		pmd_phys = pgtable_alloc(PMD_SHIFT);
+		pmd_phys = pgtable_alloc(TABLE_PMD);
 		pmdp = pmd_set_fixmap(pmd_phys);
 		init_clear_pgtable(pmdp);
 		pmdp += pmd_index(addr);
@@ -325,7 +334,7 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
 
 static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
 			   phys_addr_t phys, pgprot_t prot,
-			   phys_addr_t (*pgtable_alloc)(int),
+			   phys_addr_t (*pgtable_alloc)(enum pgtable_type),
 			   int flags)
 {
 	unsigned long next;
@@ -339,7 +348,7 @@ static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
 		if (flags & NO_EXEC_MAPPINGS)
 			p4dval |= P4D_TABLE_PXN;
 		BUG_ON(!pgtable_alloc);
-		pud_phys = pgtable_alloc(PUD_SHIFT);
+		pud_phys = pgtable_alloc(TABLE_PUD);
 		pudp = pud_set_fixmap(pud_phys);
 		init_clear_pgtable(pudp);
 		pudp += pud_index(addr);
@@ -383,7 +392,7 @@ static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
 
 static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
 			   phys_addr_t phys, pgprot_t prot,
-			   phys_addr_t (*pgtable_alloc)(int),
+			   phys_addr_t (*pgtable_alloc)(enum pgtable_type),
 			   int flags)
 {
 	unsigned long next;
@@ -397,7 +406,7 @@ static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
 		if (flags & NO_EXEC_MAPPINGS)
 			pgdval |= PGD_TABLE_PXN;
 		BUG_ON(!pgtable_alloc);
-		p4d_phys = pgtable_alloc(P4D_SHIFT);
+		p4d_phys = pgtable_alloc(TABLE_P4D);
 		p4dp = p4d_set_fixmap(p4d_phys);
 		init_clear_pgtable(p4dp);
 		p4dp += p4d_index(addr);
@@ -427,7 +436,7 @@ static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
 static void __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys,
 					unsigned long virt, phys_addr_t size,
 					pgprot_t prot,
-					phys_addr_t (*pgtable_alloc)(int),
+					phys_addr_t (*pgtable_alloc)(enum pgtable_type),
 					int flags)
 {
 	unsigned long addr, end, next;
@@ -455,7 +464,7 @@ static void __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys,
 static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
 				 unsigned long virt, phys_addr_t size,
 				 pgprot_t prot,
-				 phys_addr_t (*pgtable_alloc)(int),
+				 phys_addr_t (*pgtable_alloc)(enum pgtable_type),
 				 int flags)
 {
 	mutex_lock(&fixmap_lock);
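The enum pgtable_type threaded through the allocators above replaces the old *_SHIFT parameter. A small stand-alone sketch of why an enum is the safer key (the ctor names are only labels standing in for the pagetable_*_ctor() family): with folded translation levels, two levels can share one shift value, so a shift cannot always identify the level being allocated, whereas the enum names it explicitly.

#include <stdio.h>

enum pgtable_type {
	TABLE_PTE,
	TABLE_PMD,
	TABLE_PUD,
	TABLE_P4D,
};

/* Illustrative dispatch: each level gets exactly its own constructor. */
static const char *ctor_for(enum pgtable_type type)
{
	switch (type) {
	case TABLE_PTE: return "pagetable_pte_ctor";
	case TABLE_PMD: return "pagetable_pmd_ctor";
	case TABLE_PUD: return "pagetable_pud_ctor";
	case TABLE_P4D: return "pagetable_p4d_ctor";
	}
	return "unreachable";
}

int main(void)
{
	/*
	 * A shift-keyed dispatch would compare against PAGE_SHIFT,
	 * PMD_SHIFT, ..., which may compare equal to one another on folded
	 * configurations; an enum value cannot be ambiguous.
	 */
	printf("%s\n", ctor_for(TABLE_PMD));
	return 0;
}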
@@ -468,37 +477,48 @@ static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
 extern __alias(__create_pgd_mapping_locked)
 void create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt,
 			     phys_addr_t size, pgprot_t prot,
-			     phys_addr_t (*pgtable_alloc)(int), int flags);
+			     phys_addr_t (*pgtable_alloc)(enum pgtable_type),
+			     int flags);
 #endif
 
-static phys_addr_t __pgd_pgtable_alloc(int shift)
+static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm,
+				       enum pgtable_type pgtable_type)
 {
 	/* Page is zeroed by init_clear_pgtable() so don't duplicate effort. */
-	void *ptr = (void *)__get_free_page(GFP_PGTABLE_KERNEL & ~__GFP_ZERO);
+	struct ptdesc *ptdesc = pagetable_alloc(GFP_PGTABLE_KERNEL & ~__GFP_ZERO, 0);
+	phys_addr_t pa;
+
+	BUG_ON(!ptdesc);
+	pa = page_to_phys(ptdesc_page(ptdesc));
+
+	switch (pgtable_type) {
+	case TABLE_PTE:
+		BUG_ON(!pagetable_pte_ctor(mm, ptdesc));
+		break;
+	case TABLE_PMD:
+		BUG_ON(!pagetable_pmd_ctor(mm, ptdesc));
+		break;
+	case TABLE_PUD:
+		pagetable_pud_ctor(ptdesc);
+		break;
+	case TABLE_P4D:
+		pagetable_p4d_ctor(ptdesc);
+		break;
+	}
 
-	BUG_ON(!ptr);
-	return __pa(ptr);
+	return pa;
 }
 
-static phys_addr_t pgd_pgtable_alloc(int shift)
+static phys_addr_t __maybe_unused
+pgd_pgtable_alloc_init_mm(enum pgtable_type pgtable_type)
 {
-	phys_addr_t pa = __pgd_pgtable_alloc(shift);
-	struct ptdesc *ptdesc = page_ptdesc(phys_to_page(pa));
-
-	/*
-	 * Call proper page table ctor in case later we need to
-	 * call core mm functions like apply_to_page_range() on
-	 * this pre-allocated page table.
-	 *
-	 * We don't select ARCH_ENABLE_SPLIT_PMD_PTLOCK if pmd is
-	 * folded, and if so pagetable_pte_ctor() becomes nop.
-	 */
-	if (shift == PAGE_SHIFT)
-		BUG_ON(!pagetable_pte_ctor(ptdesc));
-	else if (shift == PMD_SHIFT)
-		BUG_ON(!pagetable_pmd_ctor(ptdesc));
+	return __pgd_pgtable_alloc(&init_mm, pgtable_type);
+}
 
-	return pa;
+static phys_addr_t
+pgd_pgtable_alloc_special_mm(enum pgtable_type pgtable_type)
+{
+	return __pgd_pgtable_alloc(NULL, pgtable_type);
 }
 
 /*
@@ -530,7 +550,7 @@ void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
 		flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
 
 	__create_pgd_mapping(mm->pgd, phys, virt, size, prot,
-			     pgd_pgtable_alloc, flags);
+			     pgd_pgtable_alloc_special_mm, flags);
 }
 
 static void update_mapping_prot(phys_addr_t phys, unsigned long virt,
@@ -744,7 +764,7 @@ static int __init map_entry_trampoline(void)
 	memset(tramp_pg_dir, 0, PGD_SIZE);
 	__create_pgd_mapping(tramp_pg_dir, pa_start, TRAMP_VALIAS,
 			     entry_tramp_text_size(), prot,
-			     __pgd_pgtable_alloc, NO_BLOCK_MAPPINGS);
+			     pgd_pgtable_alloc_init_mm, NO_BLOCK_MAPPINGS);
 
 	/* Map both the text and data into the kernel page table */
 	for (i = 0; i < DIV_ROUND_UP(entry_tramp_text_size(), PAGE_SIZE); i++)
@@ -1286,7 +1306,8 @@ int pud_free_pmd_page(pud_t *pudp, unsigned long addr)
 	next = addr;
 	end = addr + PUD_SIZE;
 	do {
-		pmd_free_pte_page(pmdp, next);
+		if (pmd_present(pmdp_get(pmdp)))
+			pmd_free_pte_page(pmdp, next);
 	} while (pmdp++, next += PMD_SIZE, next != end);
 
 	pud_clear(pudp);
@@ -1350,7 +1371,7 @@ int arch_add_memory(int nid, u64 start, u64 size,
 		flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
 
 	__create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
-			     size, params->pgprot, __pgd_pgtable_alloc,
+			     size, params->pgprot, pgd_pgtable_alloc_init_mm,
 			     flags);
 
 	memblock_clear_nomap(start, size);
@@ -1504,24 +1525,41 @@ static int __init prevent_bootmem_remove_init(void)
 early_initcall(prevent_bootmem_remove_init);
 #endif
 
-pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
+pte_t modify_prot_start_ptes(struct vm_area_struct *vma, unsigned long addr,
+			     pte_t *ptep, unsigned int nr)
 {
+	pte_t pte = get_and_clear_ptes(vma->vm_mm, addr, ptep, nr);
+
 	if (alternative_has_cap_unlikely(ARM64_WORKAROUND_2645198)) {
 		/*
 		 * Break-before-make (BBM) is required for all user space mappings
 		 * when the permission changes from executable to non-executable
 		 * in cases where cpu is affected with errata #2645198.
 		 */
-		if (pte_user_exec(ptep_get(ptep)))
-			return ptep_clear_flush(vma, addr, ptep);
+		if (pte_accessible(vma->vm_mm, pte) && pte_user_exec(pte))
+			__flush_tlb_range(vma, addr, addr + nr * PAGE_SIZE,
+					  PAGE_SIZE, true, 3);
 	}
-	return ptep_get_and_clear(vma->vm_mm, addr, ptep);
+
+	return pte;
+}
+
+pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
+{
+	return modify_prot_start_ptes(vma, addr, ptep, 1);
+}
+
+void modify_prot_commit_ptes(struct vm_area_struct *vma, unsigned long addr,
+			     pte_t *ptep, pte_t old_pte, pte_t pte,
+			     unsigned int nr)
+{
+	set_ptes(vma->vm_mm, addr, ptep, pte, nr);
 }
 
 void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
 			     pte_t *ptep, pte_t old_pte, pte_t pte)
 {
-	set_pte_at(vma->vm_mm, addr, ptep, pte);
+	modify_prot_commit_ptes(vma, addr, ptep, old_pte, pte, 1);
 }
 
 /*
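The pair of batched helpers added above follows the usual start/commit protocol, with the single-pte variants reduced to nr == 1 wrappers. The following shape sketch of a caller is illustrative only (it is not a real kernel call site; the surrounding locking and loop structure of the generic mprotect path are omitted, and pte_modify() is used on the assumption that it recomputes permissions as on other paths):

/* illustrative only: reprotect a batch of nr ptes in one start/commit */
static void reprotect_batch(struct vm_area_struct *vma, unsigned long addr,
			    pte_t *ptep, unsigned int nr, pgprot_t newprot)
{
	/* clear nr entries, accumulating AF/dirty across the whole batch */
	pte_t old_pte = modify_prot_start_ptes(vma, addr, ptep, nr);

	/* compute the new permissions once for the batch */
	pte_t pte = pte_modify(old_pte, newprot);

	/* write back nr entries; contpte may repaint them as one block */
	modify_prot_commit_ptes(vma, addr, ptep, old_pte, pte, nr);
}

Compared with calling ptep_modify_prot_start() nr times, the batched start clears the whole range with at most one TLB range operation on the errata path.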
diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
index 80d470aa469d..8c75965afc9e 100644
--- a/arch/arm64/mm/proc.S
+++ b/arch/arm64/mm/proc.S
@@ -454,7 +454,7 @@ SYM_FUNC_START(__cpu_setup)
 	dsb	nsh
 
 	msr	cpacr_el1, xzr			// Reset cpacr_el1
-	mov	x1, #1 << 12			// Reset mdscr_el1 and disable
+	mov	x1, MDSCR_EL1_TDCC		// Reset mdscr_el1 and disable
 	msr	mdscr_el1, x1			// access to the DCC from EL0
 	reset_pmuserenr_el0 x1			// Disable PMU access from EL0
 	reset_amuserenr_el0 x1			// Disable AMU access from EL0
@@ -518,7 +518,6 @@ alternative_else_nop_endif
 	msr	REG_PIR_EL1, x0
 
 	orr	tcr2, tcr2, TCR2_EL1_PIE
-	msr	REG_TCR2_EL1, x0
 
 .Lskip_indirection:
diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c
index 8cec0da4cff2..421a5de806c6 100644
--- a/arch/arm64/mm/ptdump.c
+++ b/arch/arm64/mm/ptdump.c
@@ -189,12 +189,12 @@ static void note_prot_wx(struct ptdump_pg_state *st, unsigned long addr)
 }
 
 void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
-	       u64 val)
+	       pteval_t val)
 {
 	struct ptdump_pg_state *st = container_of(pt_st, struct ptdump_pg_state, ptdump);
 	struct ptdump_pg_level *pg_level = st->pg_level;
 	static const char units[] = "KMGTPE";
-	u64 prot = 0;
+	ptdesc_t prot = 0;
 
 	/* check if the current level has been folded dynamically */
 	if (st->mm && ((level == 1 && mm_p4d_folded(st->mm)) ||
@@ -251,6 +251,38 @@ void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
 	}
 }
 
+void note_page_pte(struct ptdump_state *pt_st, unsigned long addr, pte_t pte)
+{
+	note_page(pt_st, addr, 4, pte_val(pte));
+}
+
+void note_page_pmd(struct ptdump_state *pt_st, unsigned long addr, pmd_t pmd)
+{
+	note_page(pt_st, addr, 3, pmd_val(pmd));
+}
+
+void note_page_pud(struct ptdump_state *pt_st, unsigned long addr, pud_t pud)
+{
+	note_page(pt_st, addr, 2, pud_val(pud));
+}
+
+void note_page_p4d(struct ptdump_state *pt_st, unsigned long addr, p4d_t p4d)
+{
+	note_page(pt_st, addr, 1, p4d_val(p4d));
+}
+
+void note_page_pgd(struct ptdump_state *pt_st, unsigned long addr, pgd_t pgd)
+{
+	note_page(pt_st, addr, 0, pgd_val(pgd));
+}
+
+void note_page_flush(struct ptdump_state *pt_st)
+{
+	pte_t pte_zero = {0};
+
+	note_page(pt_st, 0, -1, pte_val(pte_zero));
+}
+
 void ptdump_walk(struct seq_file *s, struct ptdump_info *info)
 {
 	unsigned long end = ~0UL;
@@ -266,7 +298,12 @@ void ptdump_walk(struct seq_file *s, struct ptdump_info *info)
 		.pg_level = &kernel_pg_levels[0],
 		.level = -1,
 		.ptdump = {
-			.note_page = note_page,
+			.note_page_pte = note_page_pte,
+			.note_page_pmd = note_page_pmd,
+			.note_page_pud = note_page_pud,
+			.note_page_p4d = note_page_p4d,
+			.note_page_pgd = note_page_pgd,
+			.note_page_flush = note_page_flush,
 			.range = (struct ptdump_range[]){
 				{info->base_addr, end},
 				{0, 0}
@@ -303,7 +340,12 @@ bool ptdump_check_wx(void)
 		.level = -1,
 		.check_wx = true,
 		.ptdump = {
-			.note_page = note_page,
+			.note_page_pte = note_page_pte,
+			.note_page_pmd = note_page_pmd,
+			.note_page_pud = note_page_pud,
+			.note_page_p4d = note_page_p4d,
+			.note_page_pgd = note_page_pgd,
+			.note_page_flush = note_page_flush,
 			.range = (struct ptdump_range[]) {
 				{_PAGE_OFFSET(vabits_actual), ~0UL},
 				{0, 0}
diff --git a/arch/arm64/mm/ptdump_debugfs.c b/arch/arm64/mm/ptdump_debugfs.c
index 68bf1a125502..1e308328c079 100644
--- a/arch/arm64/mm/ptdump_debugfs.c
+++ b/arch/arm64/mm/ptdump_debugfs.c
@@ -1,6 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <linux/debugfs.h>
-#include <linux/memory_hotplug.h>
 #include <linux/seq_file.h>
 
 #include <asm/ptdump.h>
@@ -9,9 +8,7 @@ static int ptdump_show(struct seq_file *m, void *v)
 {
 	struct ptdump_info *info = m->private;
 
-	get_online_mems();
 	ptdump_walk(m, info);
-	put_online_mems();
 	return 0;
 }
 DEFINE_SHOW_ATTRIBUTE(ptdump);
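The ptdump rework above replaces the single note_page(level, val) hook with typed per-level callbacks, each wrapper funnelling back into note_page() with its level number (pgd=0 through pte=4, -1 for the final flush). A condensed stand-alone model of that dispatch (user-space C; the state struct is trimmed to the two hooks needed for the demo, not the real struct ptdump_state):

#include <stdio.h>

typedef unsigned long long pteval_t;

struct ptdump_state {
	void (*note_page_pte)(struct ptdump_state *st, unsigned long addr,
			      pteval_t pte);
	void (*note_page_flush)(struct ptdump_state *st);
};

/* The single sink all levels converge on, as in the arm64 code above. */
static void note_page(struct ptdump_state *st, unsigned long addr,
		      int level, pteval_t val)
{
	(void)st;
	printf("level %d addr %#lx val %#llx\n", level, addr, val);
}

/* Typed wrappers adapt the per-level hooks onto note_page(). */
static void note_page_pte(struct ptdump_state *st, unsigned long addr,
			  pteval_t pte)
{
	note_page(st, addr, 4, pte);
}

static void note_page_flush(struct ptdump_state *st)
{
	note_page(st, 0, -1, 0);	/* flush marker: level -1, zero value */
}

int main(void)
{
	struct ptdump_state st = {
		.note_page_pte = note_page_pte,
		.note_page_flush = note_page_flush,
	};

	/* a generic walker would invoke these while visiting the tables */
	st.note_page_pte(&st, 0xffff000000000000UL, 0x7ffULL);
	st.note_page_flush(&st);
	return 0;
}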