diff options
Diffstat (limited to 'arch/powerpc/mm')
28 files changed, 973 insertions, 1070 deletions
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index 0f499db315d6..5e147986400d 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -7,7 +7,7 @@ ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) obj-y := fault.o mem.o pgtable.o mmap.o \ init_$(BITS).o pgtable_$(BITS).o \ - pgtable-frag.o \ + pgtable-frag.o ioremap.o ioremap_$(BITS).o \ init-common.o mmu_context.o drmem.o obj-$(CONFIG_PPC_MMU_NOHASH) += nohash/ obj-$(CONFIG_PPC_BOOK3S_32) += book3s32/ diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c index e249fbf6b9c3..84d5fab94f8f 100644 --- a/arch/powerpc/mm/book3s32/mmu.c +++ b/arch/powerpc/mm/book3s32/mmu.c @@ -74,7 +74,7 @@ static int find_free_bat(void) { int b; - if (cpu_has_feature(CPU_FTR_601)) { + if (IS_ENABLED(CONFIG_PPC_BOOK3S_601)) { for (b = 0; b < 4; b++) { struct ppc_bat *bat = BATS[b]; @@ -106,7 +106,7 @@ static int find_free_bat(void) */ static unsigned int block_size(unsigned long base, unsigned long top) { - unsigned int max_size = (cpu_has_feature(CPU_FTR_601) ? 8 : 256) << 20; + unsigned int max_size = IS_ENABLED(CONFIG_PPC_BOOK3S_601) ? SZ_8M : SZ_256M; unsigned int base_shift = (ffs(base) - 1) & 31; unsigned int block_shift = (fls(top - base) - 1) & 31; @@ -189,7 +189,7 @@ void mmu_mark_initmem_nx(void) unsigned long top = (unsigned long)_etext - PAGE_OFFSET; unsigned long size; - if (cpu_has_feature(CPU_FTR_601)) + if (IS_ENABLED(CONFIG_PPC_BOOK3S_601)) return; for (i = 0; i < nb - 1 && base < top && top - base > (128 << 10);) { @@ -227,7 +227,7 @@ void mmu_mark_rodata_ro(void) int nb = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4; int i; - if (cpu_has_feature(CPU_FTR_601)) + if (IS_ENABLED(CONFIG_PPC_BOOK3S_601)) return; for (i = 0; i < nb; i++) { @@ -259,7 +259,7 @@ void __init setbat(int index, unsigned long virt, phys_addr_t phys, flags &= ~_PAGE_COHERENT; bl = (size >> 17) - 1; - if (PVR_VER(mfspr(SPRN_PVR)) != 1) { + if (!IS_ENABLED(CONFIG_PPC_BOOK3S_601)) { /* 603, 604, etc. */ /* Do DBAT first */ wimgxpp = flags & (_PAGE_WRITETHRU | _PAGE_NO_CACHE @@ -297,8 +297,7 @@ void __init setbat(int index, unsigned long virt, phys_addr_t phys, /* * Preload a translation in the hash table */ -void hash_preload(struct mm_struct *mm, unsigned long ea, - bool is_exec, unsigned long trap) +void hash_preload(struct mm_struct *mm, unsigned long ea) { pmd_t *pmd; @@ -310,6 +309,39 @@ void hash_preload(struct mm_struct *mm, unsigned long ea, } /* + * This is called at the end of handling a user page fault, when the + * fault has been handled by updating a PTE in the linux page tables. + * We use it to preload an HPTE into the hash table corresponding to + * the updated linux PTE. + * + * This must always be called with the pte lock held. + */ +void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, + pte_t *ptep) +{ + if (!mmu_has_feature(MMU_FTR_HPTE_TABLE)) + return; + /* + * We don't need to worry about _PAGE_PRESENT here because we are + * called with either mm->page_table_lock held or ptl lock held + */ + + /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */ + if (!pte_young(*ptep) || address >= TASK_SIZE) + return; + + /* We have to test for regs NULL since init will get here first thing at boot */ + if (!current->thread.regs) + return; + + /* We also avoid filling the hash if not coming from a fault */ + if (TRAP(current->thread.regs) != 0x300 && TRAP(current->thread.regs) != 0x400) + return; + + hash_preload(vma->vm_mm, address); +} + +/* * Initialize the hash table and patch the instructions in hashtable.S. */ void __init MMU_init_hw(void) @@ -358,6 +390,15 @@ void __init MMU_init_hw(void) hash_mb2 = hash_mb = 32 - LG_HPTEG_SIZE - lg_n_hpteg; if (lg_n_hpteg > 16) hash_mb2 = 16 - LG_HPTEG_SIZE; + + /* + * When KASAN is selected, there is already an early temporary hash + * table and the switch to the final hash table is done later. + */ + if (IS_ENABLED(CONFIG_KASAN)) + return; + + MMU_init_hw_patch(); } void __init MMU_init_hw_patch(void) @@ -400,7 +441,7 @@ void setup_initial_memory_limit(phys_addr_t first_memblock_base, BUG_ON(first_memblock_base != 0); /* 601 can only access 16MB at the moment */ - if (PVR_VER(mfspr(SPRN_PVR)) == 1) + if (IS_ENABLED(CONFIG_PPC_BOOK3S_601)) memblock_set_current_limit(min_t(u64, first_memblock_size, 0x01000000)); else /* Anything else has 256M mapped */ memblock_set_current_limit(min_t(u64, first_memblock_size, 0x10000000)); @@ -418,9 +459,6 @@ void __init setup_kuep(bool disabled) { pr_info("Activating Kernel Userspace Execution Prevention\n"); - if (cpu_has_feature(CPU_FTR_601)) - pr_warn("KUEP is not working on powerpc 601 (No NX bit in Seg Regs)\n"); - if (disabled) pr_warn("KUEP cannot be disabled yet on 6xx when compiled in\n"); } diff --git a/arch/powerpc/mm/book3s64/hash_native.c b/arch/powerpc/mm/book3s64/hash_native.c index 90ab4f31e2b3..523e42eb11da 100644 --- a/arch/powerpc/mm/book3s64/hash_native.c +++ b/arch/powerpc/mm/book3s64/hash_native.c @@ -197,9 +197,32 @@ static inline unsigned long ___tlbie(unsigned long vpn, int psize, return va; } -static inline void fixup_tlbie(unsigned long vpn, int psize, int apsize, int ssize) +static inline void fixup_tlbie_vpn(unsigned long vpn, int psize, + int apsize, int ssize) { - if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) { + if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) { + /* Radix flush for a hash guest */ + + unsigned long rb,rs,prs,r,ric; + + rb = PPC_BIT(52); /* IS = 2 */ + rs = 0; /* lpid = 0 */ + prs = 0; /* partition scoped */ + r = 1; /* radix format */ + ric = 0; /* RIC_FLSUH_TLB */ + + /* + * Need the extra ptesync to make sure we don't + * re-order the tlbie + */ + asm volatile("ptesync": : :"memory"); + asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) + : : "r"(rb), "i"(r), "i"(prs), + "i"(ric), "r"(rs) : "memory"); + } + + + if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) { /* Need the extra ptesync to ensure we don't reorder tlbie*/ asm volatile("ptesync": : :"memory"); ___tlbie(vpn, psize, apsize, ssize); @@ -283,7 +306,7 @@ static inline void tlbie(unsigned long vpn, int psize, int apsize, asm volatile("ptesync": : :"memory"); } else { __tlbie(vpn, psize, apsize, ssize); - fixup_tlbie(vpn, psize, apsize, ssize); + fixup_tlbie_vpn(vpn, psize, apsize, ssize); asm volatile("eieio; tlbsync; ptesync": : :"memory"); } if (lock_tlbie && !use_local) @@ -856,7 +879,7 @@ static void native_flush_hash_range(unsigned long number, int local) /* * Just do one more with the last used values. */ - fixup_tlbie(vpn, psize, psize, ssize); + fixup_tlbie_vpn(vpn, psize, psize, ssize); asm volatile("eieio; tlbsync; ptesync":::"memory"); if (lock_tlbie) diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c b/arch/powerpc/mm/book3s64/hash_pgtable.c index d1f390ac9cdb..64733b9cb20a 100644 --- a/arch/powerpc/mm/book3s64/hash_pgtable.c +++ b/arch/powerpc/mm/book3s64/hash_pgtable.c @@ -406,6 +406,8 @@ int hash__has_transparent_hugepage(void) return 1; } +EXPORT_SYMBOL_GPL(hash__has_transparent_hugepage); + #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifdef CONFIG_STRICT_KERNEL_RWX diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index 9a5963e07a82..6c123760164e 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -34,6 +34,7 @@ #include <linux/libfdt.h> #include <linux/pkeys.h> #include <linux/hugetlb.h> +#include <linux/cpu.h> #include <asm/debugfs.h> #include <asm/processor.h> @@ -61,6 +62,7 @@ #include <asm/ps3.h> #include <asm/pte-walk.h> #include <asm/asm-prototypes.h> +#include <asm/ultravisor.h> #include <mm/mmu_decl.h> @@ -271,10 +273,6 @@ int htab_bolt_mapping(unsigned long vstart, unsigned long vend, if (overlaps_kernel_text(vaddr, vaddr + step)) tprot &= ~HPTE_R_N; - /* Make kvm guest trampolines executable */ - if (overlaps_kvm_tmp(vaddr, vaddr + step)) - tprot &= ~HPTE_R_N; - /* * If relocatable, check if it overlaps interrupt vectors that * are copied down to real 0. For relocatable kernel @@ -823,7 +821,7 @@ static void __init hash_init_partition_table(phys_addr_t hash_table, * For now, UPRT is 0 and we have no segment table. */ htab_size = __ilog2(htab_size) - 18; - mmu_partition_table_set_entry(0, hash_table | htab_size, 0); + mmu_partition_table_set_entry(0, hash_table | htab_size, 0, false); pr_info("Partition table %p\n", partition_tb); } @@ -857,12 +855,6 @@ static void __init htab_initialize(void) /* Using a hypervisor which owns the htab */ htab_address = NULL; _SDR1 = 0; - /* - * On POWER9, we need to do a H_REGISTER_PROC_TBL hcall - * to inform the hypervisor that we wish to use the HPT. - */ - if (cpu_has_feature(CPU_FTR_ARCH_300)) - register_process_table(0, 0, 0); #ifdef CONFIG_FA_DUMP /* * If firmware assisted dump is active firmware preserves @@ -1075,8 +1067,8 @@ void hash__early_init_mmu_secondary(void) if (!cpu_has_feature(CPU_FTR_ARCH_300)) mtspr(SPRN_SDR1, _SDR1); else - mtspr(SPRN_PTCR, - __pa(partition_tb) | (PATB_SIZE_SHIFT - 12)); + set_ptcr_when_no_uv(__pa(partition_tb) | + (PATB_SIZE_SHIFT - 12)); } /* Initialize SLB */ slb_initialize(); @@ -1460,8 +1452,8 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap, } EXPORT_SYMBOL_GPL(hash_page); -int __hash_page(unsigned long ea, unsigned long msr, unsigned long trap, - unsigned long dsisr) +int __hash_page(unsigned long trap, unsigned long ea, unsigned long dsisr, + unsigned long msr) { unsigned long access = _PAGE_PRESENT | _PAGE_READ; unsigned long flags = 0; @@ -1518,8 +1510,8 @@ static bool should_hash_preload(struct mm_struct *mm, unsigned long ea) } #endif -void hash_preload(struct mm_struct *mm, unsigned long ea, - bool is_exec, unsigned long trap) +static void hash_preload(struct mm_struct *mm, unsigned long ea, + bool is_exec, unsigned long trap) { int hugepage_shift; unsigned long vsid; @@ -1599,6 +1591,57 @@ out_exit: local_irq_restore(flags); } +/* + * This is called at the end of handling a user page fault, when the + * fault has been handled by updating a PTE in the linux page tables. + * We use it to preload an HPTE into the hash table corresponding to + * the updated linux PTE. + * + * This must always be called with the pte lock held. + */ +void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, + pte_t *ptep) +{ + /* + * We don't need to worry about _PAGE_PRESENT here because we are + * called with either mm->page_table_lock held or ptl lock held + */ + unsigned long trap; + bool is_exec; + + if (radix_enabled()) { + prefetch((void *)address); + return; + } + + /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */ + if (!pte_young(*ptep) || address >= TASK_SIZE) + return; + + /* + * We try to figure out if we are coming from an instruction + * access fault and pass that down to __hash_page so we avoid + * double-faulting on execution of fresh text. We have to test + * for regs NULL since init will get here first thing at boot. + * + * We also avoid filling the hash if not coming from a fault. + */ + + trap = current->thread.regs ? TRAP(current->thread.regs) : 0UL; + switch (trap) { + case 0x300: + is_exec = false; + break; + case 0x400: + is_exec = true; + break; + default: + return; + } + + hash_preload(vma->vm_mm, address, is_exec, trap); +} + #ifdef CONFIG_PPC_MEM_KEYS /* * Return the protection key associated with the given address and the @@ -1705,7 +1748,7 @@ void flush_hash_hugepage(unsigned long vsid, unsigned long addr, /* * IF we try to do a HUGE PTE update after a withdraw is done. * we will find the below NULL. This happens when we do - * split_huge_page_pmd + * split_huge_pmd */ if (!hpte_slot_array) return; @@ -1899,11 +1942,20 @@ void hash__setup_initial_memory_limit(phys_addr_t first_memblock_base, * * For guests on platforms before POWER9, we clamp the it limit to 1G * to avoid some funky things such as RTAS bugs etc... + * + * On POWER9 we limit to 1TB in case the host erroneously told us that + * the RMA was >1TB. Effective address bits 0:23 are treated as zero + * (meaning the access is aliased to zero i.e. addr = addr % 1TB) + * for virtual real mode addressing and so it doesn't make sense to + * have an area larger than 1TB as it can't be addressed. */ if (!early_cpu_has_feature(CPU_FTR_HVMODE)) { ppc64_rma_size = first_memblock_size; if (!early_cpu_has_feature(CPU_FTR_ARCH_300)) ppc64_rma_size = min_t(u64, ppc64_rma_size, 0x40000000); + else + ppc64_rma_size = min_t(u64, ppc64_rma_size, + 1UL << SID_SHIFT_1T); /* Finally limit subsequent allocations */ memblock_set_current_limit(ppc64_rma_size); @@ -1922,10 +1974,16 @@ static int hpt_order_get(void *data, u64 *val) static int hpt_order_set(void *data, u64 val) { + int ret; + if (!mmu_hash_ops.resize_hpt) return -ENODEV; - return mmu_hash_ops.resize_hpt(val); + cpus_read_lock(); + ret = mmu_hash_ops.resize_hpt(val); + cpus_read_unlock(); + + return ret; } DEFINE_DEBUGFS_ATTRIBUTE(fops_hpt_order, hpt_order_get, hpt_order_set, "%llu\n"); @@ -1948,7 +2006,4 @@ void __init print_system_hash_info(void) if (htab_hash_mask) pr_info("htab_hash_mask = 0x%lx\n", htab_hash_mask); - pr_info("kernel vmalloc start = 0x%lx\n", KERN_VIRT_START); - pr_info("kernel IO start = 0x%lx\n", KERN_IO_START); - pr_info("kernel vmemmap start = 0x%lx\n", (unsigned long)vmemmap); } diff --git a/arch/powerpc/mm/book3s64/iommu_api.c b/arch/powerpc/mm/book3s64/iommu_api.c index b056cae3388b..56cc84520577 100644 --- a/arch/powerpc/mm/book3s64/iommu_api.c +++ b/arch/powerpc/mm/book3s64/iommu_api.c @@ -129,11 +129,8 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, * Allow to use larger than 64k IOMMU pages. Only do that * if we are backed by hugetlb. */ - if ((mem->pageshift > PAGE_SHIFT) && PageHuge(page)) { - struct page *head = compound_head(page); - - pageshift = compound_order(head) + PAGE_SHIFT; - } + if ((mem->pageshift > PAGE_SHIFT) && PageHuge(page)) + pageshift = page_shift(compound_head(page)); mem->pageshift = min(mem->pageshift, pageshift); /* * We don't need struct page reference any more, switch diff --git a/arch/powerpc/mm/book3s64/mmu_context.c b/arch/powerpc/mm/book3s64/mmu_context.c index 2d0cb5ba9a47..0ba30b8b935b 100644 --- a/arch/powerpc/mm/book3s64/mmu_context.c +++ b/arch/powerpc/mm/book3s64/mmu_context.c @@ -256,8 +256,21 @@ void destroy_context(struct mm_struct *mm) #ifdef CONFIG_SPAPR_TCE_IOMMU WARN_ON_ONCE(!list_empty(&mm->context.iommu_group_mem_list)); #endif + /* + * For tasks which were successfully initialized we end up calling + * arch_exit_mmap() which clears the process table entry. And + * arch_exit_mmap() is called before the required fullmm TLB flush + * which does a RIC=2 flush. Hence for an initialized task, we do clear + * any cached process table entries. + * + * The condition below handles the error case during task init. We have + * set the process table entry early and if we fail a task + * initialization, we need to ensure the process table entry is zeroed. + * We need not worry about process table entry caches because the task + * never ran with the PID value. + */ if (radix_enabled()) - WARN_ON(process_tb[mm->context.id].prtb0 != 0); + process_tb[mm->context.id].prtb0 = 0; else subpage_prot_free(mm); destroy_contexts(&mm->context); diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index 7d0e0d0d22c4..75483b40fcb1 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -8,10 +8,13 @@ #include <linux/memblock.h> #include <misc/cxl-base.h> +#include <asm/debugfs.h> #include <asm/pgalloc.h> #include <asm/tlb.h> #include <asm/trace.h> #include <asm/powernv.h> +#include <asm/firmware.h> +#include <asm/ultravisor.h> #include <mm/mmu_decl.h> #include <trace/events/thp.h> @@ -21,9 +24,6 @@ EXPORT_SYMBOL(__pmd_frag_nr); unsigned long __pmd_frag_size_shift; EXPORT_SYMBOL(__pmd_frag_size_shift); -int (*register_process_table)(unsigned long base, unsigned long page_size, - unsigned long tbl_size); - #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * This is called when relaxing access to a hugepage. It's also called in the page @@ -205,37 +205,61 @@ void __init mmu_partition_table_init(void) * 64 K size. */ ptcr = __pa(partition_tb) | (PATB_SIZE_SHIFT - 12); - mtspr(SPRN_PTCR, ptcr); + set_ptcr_when_no_uv(ptcr); powernv_set_nmmu_ptcr(ptcr); } +static void flush_partition(unsigned int lpid, bool radix) +{ + if (radix) { + radix__flush_all_lpid(lpid); + radix__flush_all_lpid_guest(lpid); + } else { + asm volatile("ptesync" : : : "memory"); + asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : : + "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); + /* do we need fixup here ?*/ + asm volatile("eieio; tlbsync; ptesync" : : : "memory"); + trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 0); + } +} + void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0, - unsigned long dw1) + unsigned long dw1, bool flush) { unsigned long old = be64_to_cpu(partition_tb[lpid].patb0); + /* + * When ultravisor is enabled, the partition table is stored in secure + * memory and can only be accessed doing an ultravisor call. However, we + * maintain a copy of the partition table in normal memory to allow Nest + * MMU translations to occur (for normal VMs). + * + * Therefore, here we always update partition_tb, regardless of whether + * we are running under an ultravisor or not. + */ partition_tb[lpid].patb0 = cpu_to_be64(dw0); partition_tb[lpid].patb1 = cpu_to_be64(dw1); /* - * Global flush of TLBs and partition table caches for this lpid. - * The type of flush (hash or radix) depends on what the previous - * use of this partition ID was, not the new use. + * If ultravisor is enabled, we do an ultravisor call to register the + * partition table entry (PATE), which also do a global flush of TLBs + * and partition table caches for the lpid. Otherwise, just do the + * flush. The type of flush (hash or radix) depends on what the previous + * use of the partition ID was, not the new use. */ - asm volatile("ptesync" : : : "memory"); - if (old & PATB_HR) { - asm volatile(PPC_TLBIE_5(%0,%1,2,0,1) : : - "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); - asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : : - "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); - trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 1); - } else { - asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : : - "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); - trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 0); + if (firmware_has_feature(FW_FEATURE_ULTRAVISOR)) { + uv_register_pate(lpid, dw0, dw1); + pr_info("PATE registered by ultravisor: dw0 = 0x%lx, dw1 = 0x%lx\n", + dw0, dw1); + } else if (flush) { + /* + * Boot does not need to flush, because MMU is off and each + * CPU does a tlbiel_all() before switching them on, which + * flushes everything. + */ + flush_partition(lpid, (old & PATB_HR)); } - /* do we need fixup here ?*/ - asm volatile("eieio; tlbsync; ptesync" : : : "memory"); } EXPORT_SYMBOL_GPL(mmu_partition_table_set_entry); @@ -447,23 +471,48 @@ int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl, return true; } -int ioremap_range(unsigned long ea, phys_addr_t pa, unsigned long size, pgprot_t prot, int nid) -{ - unsigned long i; +/* + * Does the CPU support tlbie? + */ +bool tlbie_capable __read_mostly = true; +EXPORT_SYMBOL(tlbie_capable); - if (radix_enabled()) - return radix__ioremap_range(ea, pa, size, prot, nid); - - for (i = 0; i < size; i += PAGE_SIZE) { - int err = map_kernel_page(ea + i, pa + i, prot); - if (err) { - if (slab_is_available()) - unmap_kernel_range(ea, size); - else - WARN_ON_ONCE(1); /* Should clean up */ - return err; - } +/* + * Should tlbie be used for management of CPU TLBs, for kernel and process + * address spaces? tlbie may still be used for nMMU accelerators, and for KVM + * guest address spaces. + */ +bool tlbie_enabled __read_mostly = true; + +static int __init setup_disable_tlbie(char *str) +{ + if (!radix_enabled()) { + pr_err("disable_tlbie: Unable to disable TLBIE with Hash MMU.\n"); + return 1; } + tlbie_capable = false; + tlbie_enabled = false; + + return 1; +} +__setup("disable_tlbie", setup_disable_tlbie); + +static int __init pgtable_debugfs_setup(void) +{ + if (!tlbie_capable) + return 0; + + /* + * There is no locking vs tlb flushing when changing this value. + * The tlb flushers will see one value or another, and use either + * tlbie or tlbiel with IPIs. In both cases the TLBs will be + * invalidated as expected. + */ + debugfs_create_bool("tlbie_enabled", 0600, + powerpc_debugfs_root, + &tlbie_enabled); + return 0; } +arch_initcall(pgtable_debugfs_setup); diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index b4ca9e95e678..6ee17d09649c 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -27,25 +27,13 @@ #include <asm/sections.h> #include <asm/trace.h> #include <asm/uaccess.h> +#include <asm/ultravisor.h> #include <trace/events/thp.h> unsigned int mmu_pid_bits; unsigned int mmu_base_pid; -static int native_register_process_table(unsigned long base, unsigned long pg_sz, - unsigned long table_size) -{ - unsigned long patb0, patb1; - - patb0 = be64_to_cpu(partition_tb[0].patb0); - patb1 = base | table_size | PATB_GR; - - mmu_partition_table_set_entry(0, patb0, patb1); - - return 0; -} - static __ref void *early_alloc_pgtable(unsigned long size, int nid, unsigned long region_start, unsigned long region_end) { @@ -380,18 +368,6 @@ static void __init radix_init_pgtable(void) */ rts_field = radix__get_tree_size(); process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE); - /* - * Fill in the partition table. We are suppose to use effective address - * of process table here. But our linear mapping also enable us to use - * physical address here. - */ - register_process_table(__pa(process_tb), 0, PRTB_SIZE_SHIFT - 12); - pr_info("Process table %p and radix root for kernel: %p\n", process_tb, init_mm.pgd); - asm volatile("ptesync" : : : "memory"); - asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : : - "r" (TLBIEL_INVAL_SET_LPID), "r" (0)); - asm volatile("eieio; tlbsync; ptesync" : : : "memory"); - trace_tlbie(0, 0, TLBIEL_INVAL_SET_LPID, 0, 2, 1, 1); /* * The init_mm context is given the first available (non-zero) PID, @@ -412,20 +388,15 @@ static void __init radix_init_pgtable(void) static void __init radix_init_partition_table(void) { - unsigned long rts_field, dw0; + unsigned long rts_field, dw0, dw1; mmu_partition_table_init(); rts_field = radix__get_tree_size(); dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR; - mmu_partition_table_set_entry(0, dw0, 0); + dw1 = __pa(process_tb) | (PRTB_SIZE_SHIFT - 12) | PATB_GR; + mmu_partition_table_set_entry(0, dw0, dw1, false); pr_info("Initializing Radix MMU\n"); - pr_info("Partition table %p\n", partition_tb); -} - -void __init radix_init_native(void) -{ - register_process_table = native_register_process_table; } static int __init get_idx_from_shift(unsigned int shift) @@ -621,8 +592,9 @@ void __init radix__early_init_mmu(void) __pmd_frag_nr = RADIX_PMD_FRAG_NR; __pmd_frag_size_shift = RADIX_PMD_FRAG_SIZE_SHIFT; + radix_init_pgtable(); + if (!firmware_has_feature(FW_FEATURE_LPAR)) { - radix_init_native(); lpcr = mfspr(SPRN_LPCR); mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR); radix_init_partition_table(); @@ -633,11 +605,9 @@ void __init radix__early_init_mmu(void) memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); - radix_init_pgtable(); /* Switch to the guard PID before turning on MMU */ radix__switch_mmu_context(NULL, &init_mm); - if (cpu_has_feature(CPU_FTR_HVMODE)) - tlbiel_all(); + tlbiel_all(); } void radix__early_init_mmu_secondary(void) @@ -650,14 +620,14 @@ void radix__early_init_mmu_secondary(void) lpcr = mfspr(SPRN_LPCR); mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR); - mtspr(SPRN_PTCR, - __pa(partition_tb) | (PATB_SIZE_SHIFT - 12)); + set_ptcr_when_no_uv(__pa(partition_tb) | + (PATB_SIZE_SHIFT - 12)); + radix_init_amor(); } radix__switch_mmu_context(NULL, &init_mm); - if (cpu_has_feature(CPU_FTR_HVMODE)) - tlbiel_all(); + tlbiel_all(); } void radix__mmu_cleanup_all(void) @@ -667,7 +637,7 @@ void radix__mmu_cleanup_all(void) if (!firmware_has_feature(FW_FEATURE_LPAR)) { lpcr = mfspr(SPRN_LPCR); mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT); - mtspr(SPRN_PTCR, 0); + set_ptcr_when_no_uv(0); powernv_set_nmmu_ptcr(0); radix__flush_tlb_all(); } @@ -737,8 +707,8 @@ static int __meminit stop_machine_change_mapping(void *data) spin_unlock(&init_mm.page_table_lock); pte_clear(&init_mm, params->aligned_start, params->pte); - create_physical_mapping(params->aligned_start, params->start, -1); - create_physical_mapping(params->end, params->aligned_end, -1); + create_physical_mapping(__pa(params->aligned_start), __pa(params->start), -1); + create_physical_mapping(__pa(params->end), __pa(params->aligned_end), -1); spin_lock(&init_mm.page_table_lock); return 0; } @@ -902,7 +872,7 @@ int __meminit radix__create_section_mapping(unsigned long start, unsigned long e return -1; } - return create_physical_mapping(start, end, nid); + return create_physical_mapping(__pa(start), __pa(end), nid); } int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end) @@ -1057,13 +1027,6 @@ pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm, return old_pmd; } -int radix__has_transparent_hugepage(void) -{ - /* For radix 2M at PMD level means thp */ - if (mmu_psize_defs[MMU_PAGE_2M].shift == PMD_SHIFT) - return 1; - return 0; -} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep, @@ -1218,26 +1181,6 @@ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) return 1; } -int radix__ioremap_range(unsigned long ea, phys_addr_t pa, unsigned long size, - pgprot_t prot, int nid) -{ - if (likely(slab_is_available())) { - int err = ioremap_page_range(ea, ea + size, pa, prot); - if (err) - unmap_kernel_range(ea, size); - return err; - } else { - unsigned long i; - - for (i = 0; i < size; i += PAGE_SIZE) { - int err = map_kernel_page(ea + i, pa + i, prot); - if (WARN_ON_ONCE(err)) /* Should clean up */ - return err; - } - return 0; - } -} - int __init arch_ioremap_p4d_supported(void) { return 0; diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c index 71f7fede2fa4..67af871190c6 100644 --- a/arch/powerpc/mm/book3s64/radix_tlb.c +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -51,11 +51,15 @@ static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is) * and partition table entries. Then flush the remaining sets of the * TLB. */ - tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 0); - for (set = 1; set < num_sets; set++) - tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 0); - /* Do the same for process scoped entries. */ + if (early_cpu_has_feature(CPU_FTR_HVMODE)) { + /* MSR[HV] should flush partition scope translations first. */ + tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 0); + for (set = 1; set < num_sets; set++) + tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 0); + } + + /* Flush process scoped entries. */ tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 1); for (set = 1; set < num_sets; set++) tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 1); @@ -116,22 +120,6 @@ static __always_inline void __tlbie_pid(unsigned long pid, unsigned long ric) trace_tlbie(0, 0, rb, rs, ric, prs, r); } -static __always_inline void __tlbiel_lpid(unsigned long lpid, int set, - unsigned long ric) -{ - unsigned long rb,rs,prs,r; - - rb = PPC_BIT(52); /* IS = 2 */ - rb |= set << PPC_BITLSHIFT(51); - rs = 0; /* LPID comes from LPIDR */ - prs = 0; /* partition scoped */ - r = 1; /* radix format */ - - asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) - : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); - trace_tlbie(lpid, 1, rb, rs, ric, prs, r); -} - static __always_inline void __tlbie_lpid(unsigned long lpid, unsigned long ric) { unsigned long rb,rs,prs,r; @@ -146,23 +134,20 @@ static __always_inline void __tlbie_lpid(unsigned long lpid, unsigned long ric) trace_tlbie(lpid, 0, rb, rs, ric, prs, r); } -static __always_inline void __tlbiel_lpid_guest(unsigned long lpid, int set, - unsigned long ric) +static __always_inline void __tlbie_lpid_guest(unsigned long lpid, unsigned long ric) { unsigned long rb,rs,prs,r; rb = PPC_BIT(52); /* IS = 2 */ - rb |= set << PPC_BITLSHIFT(51); - rs = 0; /* LPID comes from LPIDR */ + rs = lpid; prs = 1; /* process scoped */ r = 1; /* radix format */ - asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) + asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); - trace_tlbie(lpid, 1, rb, rs, ric, prs, r); + trace_tlbie(lpid, 0, rb, rs, ric, prs, r); } - static __always_inline void __tlbiel_va(unsigned long va, unsigned long pid, unsigned long ap, unsigned long ric) { @@ -211,22 +196,83 @@ static __always_inline void __tlbie_lpid_va(unsigned long va, unsigned long lpid trace_tlbie(lpid, 0, rb, rs, ric, prs, r); } -static inline void fixup_tlbie(void) + +static inline void fixup_tlbie_va(unsigned long va, unsigned long pid, + unsigned long ap) +{ + if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) { + asm volatile("ptesync": : :"memory"); + __tlbie_va(va, 0, ap, RIC_FLUSH_TLB); + } + + if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) { + asm volatile("ptesync": : :"memory"); + __tlbie_va(va, pid, ap, RIC_FLUSH_TLB); + } +} + +static inline void fixup_tlbie_va_range(unsigned long va, unsigned long pid, + unsigned long ap) { - unsigned long pid = 0; + if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) { + asm volatile("ptesync": : :"memory"); + __tlbie_pid(0, RIC_FLUSH_TLB); + } + + if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) { + asm volatile("ptesync": : :"memory"); + __tlbie_va(va, pid, ap, RIC_FLUSH_TLB); + } +} + +static inline void fixup_tlbie_pid(unsigned long pid) +{ + /* + * We can use any address for the invalidation, pick one which is + * probably unused as an optimisation. + */ unsigned long va = ((1UL << 52) - 1); - if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) { + if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) { + asm volatile("ptesync": : :"memory"); + __tlbie_pid(0, RIC_FLUSH_TLB); + } + + if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) { asm volatile("ptesync": : :"memory"); __tlbie_va(va, pid, mmu_get_ap(MMU_PAGE_64K), RIC_FLUSH_TLB); } } + +static inline void fixup_tlbie_lpid_va(unsigned long va, unsigned long lpid, + unsigned long ap) +{ + if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) { + asm volatile("ptesync": : :"memory"); + __tlbie_lpid_va(va, 0, ap, RIC_FLUSH_TLB); + } + + if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) { + asm volatile("ptesync": : :"memory"); + __tlbie_lpid_va(va, lpid, ap, RIC_FLUSH_TLB); + } +} + static inline void fixup_tlbie_lpid(unsigned long lpid) { + /* + * We can use any address for the invalidation, pick one which is + * probably unused as an optimisation. + */ unsigned long va = ((1UL << 52) - 1); - if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) { + if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) { + asm volatile("ptesync": : :"memory"); + __tlbie_lpid(0, RIC_FLUSH_TLB); + } + + if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) { asm volatile("ptesync": : :"memory"); __tlbie_lpid_va(va, lpid, mmu_get_ap(MMU_PAGE_64K), RIC_FLUSH_TLB); } @@ -273,6 +319,7 @@ static inline void _tlbie_pid(unsigned long pid, unsigned long ric) switch (ric) { case RIC_FLUSH_TLB: __tlbie_pid(pid, RIC_FLUSH_TLB); + fixup_tlbie_pid(pid); break; case RIC_FLUSH_PWC: __tlbie_pid(pid, RIC_FLUSH_PWC); @@ -280,37 +327,42 @@ static inline void _tlbie_pid(unsigned long pid, unsigned long ric) case RIC_FLUSH_ALL: default: __tlbie_pid(pid, RIC_FLUSH_ALL); + fixup_tlbie_pid(pid); } - fixup_tlbie(); asm volatile("eieio; tlbsync; ptesync": : :"memory"); } -static inline void _tlbiel_lpid(unsigned long lpid, unsigned long ric) +struct tlbiel_pid { + unsigned long pid; + unsigned long ric; +}; + +static void do_tlbiel_pid(void *info) { - int set; + struct tlbiel_pid *t = info; - VM_BUG_ON(mfspr(SPRN_LPID) != lpid); + if (t->ric == RIC_FLUSH_TLB) + _tlbiel_pid(t->pid, RIC_FLUSH_TLB); + else if (t->ric == RIC_FLUSH_PWC) + _tlbiel_pid(t->pid, RIC_FLUSH_PWC); + else + _tlbiel_pid(t->pid, RIC_FLUSH_ALL); +} - asm volatile("ptesync": : :"memory"); +static inline void _tlbiel_pid_multicast(struct mm_struct *mm, + unsigned long pid, unsigned long ric) +{ + struct cpumask *cpus = mm_cpumask(mm); + struct tlbiel_pid t = { .pid = pid, .ric = ric }; + on_each_cpu_mask(cpus, do_tlbiel_pid, &t, 1); /* - * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL, - * also flush the entire Page Walk Cache. + * Always want the CPU translations to be invalidated with tlbiel in + * these paths, so while coprocessors must use tlbie, we can not + * optimise away the tlbiel component. */ - __tlbiel_lpid(lpid, 0, ric); - - /* For PWC, only one flush is needed */ - if (ric == RIC_FLUSH_PWC) { - asm volatile("ptesync": : :"memory"); - return; - } - - /* For the remaining sets, just flush the TLB */ - for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++) - __tlbiel_lpid(lpid, set, RIC_FLUSH_TLB); - - asm volatile("ptesync": : :"memory"); - asm volatile(PPC_RADIX_INVALIDATE_ERAT_GUEST "; isync" : : :"memory"); + if (atomic_read(&mm->context.copros) > 0) + _tlbie_pid(pid, RIC_FLUSH_ALL); } static inline void _tlbie_lpid(unsigned long lpid, unsigned long ric) @@ -325,6 +377,7 @@ static inline void _tlbie_lpid(unsigned long lpid, unsigned long ric) switch (ric) { case RIC_FLUSH_TLB: __tlbie_lpid(lpid, RIC_FLUSH_TLB); + fixup_tlbie_lpid(lpid); break; case RIC_FLUSH_PWC: __tlbie_lpid(lpid, RIC_FLUSH_PWC); @@ -332,40 +385,33 @@ static inline void _tlbie_lpid(unsigned long lpid, unsigned long ric) case RIC_FLUSH_ALL: default: __tlbie_lpid(lpid, RIC_FLUSH_ALL); + fixup_tlbie_lpid(lpid); } - fixup_tlbie_lpid(lpid); asm volatile("eieio; tlbsync; ptesync": : :"memory"); } -static __always_inline void _tlbiel_lpid_guest(unsigned long lpid, unsigned long ric) +static __always_inline void _tlbie_lpid_guest(unsigned long lpid, unsigned long ric) { - int set; - - VM_BUG_ON(mfspr(SPRN_LPID) != lpid); - - asm volatile("ptesync": : :"memory"); - /* - * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL, - * also flush the entire Page Walk Cache. + * Workaround the fact that the "ric" argument to __tlbie_pid + * must be a compile-time contraint to match the "i" constraint + * in the asm statement. */ - __tlbiel_lpid_guest(lpid, 0, ric); - - /* For PWC, only one flush is needed */ - if (ric == RIC_FLUSH_PWC) { - asm volatile("ptesync": : :"memory"); - return; + switch (ric) { + case RIC_FLUSH_TLB: + __tlbie_lpid_guest(lpid, RIC_FLUSH_TLB); + break; + case RIC_FLUSH_PWC: + __tlbie_lpid_guest(lpid, RIC_FLUSH_PWC); + break; + case RIC_FLUSH_ALL: + default: + __tlbie_lpid_guest(lpid, RIC_FLUSH_ALL); } - - /* For the remaining sets, just flush the TLB */ - for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++) - __tlbiel_lpid_guest(lpid, set, RIC_FLUSH_TLB); - - asm volatile("ptesync": : :"memory"); - asm volatile(PPC_RADIX_INVALIDATE_ERAT_GUEST : : :"memory"); + fixup_tlbie_lpid(lpid); + asm volatile("eieio; tlbsync; ptesync": : :"memory"); } - static inline void __tlbiel_va_range(unsigned long start, unsigned long end, unsigned long pid, unsigned long page_size, unsigned long psize) @@ -407,6 +453,8 @@ static inline void __tlbie_va_range(unsigned long start, unsigned long end, for (addr = start; addr < end; addr += page_size) __tlbie_va(addr, pid, ap, RIC_FLUSH_TLB); + + fixup_tlbie_va_range(addr - page_size, pid, ap); } static __always_inline void _tlbie_va(unsigned long va, unsigned long pid, @@ -416,10 +464,57 @@ static __always_inline void _tlbie_va(unsigned long va, unsigned long pid, asm volatile("ptesync": : :"memory"); __tlbie_va(va, pid, ap, ric); - fixup_tlbie(); + fixup_tlbie_va(va, pid, ap); asm volatile("eieio; tlbsync; ptesync": : :"memory"); } +struct tlbiel_va { + unsigned long pid; + unsigned long va; + unsigned long psize; + unsigned long ric; +}; + +static void do_tlbiel_va(void *info) +{ + struct tlbiel_va *t = info; + + if (t->ric == RIC_FLUSH_TLB) + _tlbiel_va(t->va, t->pid, t->psize, RIC_FLUSH_TLB); + else if (t->ric == RIC_FLUSH_PWC) + _tlbiel_va(t->va, t->pid, t->psize, RIC_FLUSH_PWC); + else + _tlbiel_va(t->va, t->pid, t->psize, RIC_FLUSH_ALL); +} + +static inline void _tlbiel_va_multicast(struct mm_struct *mm, + unsigned long va, unsigned long pid, + unsigned long psize, unsigned long ric) +{ + struct cpumask *cpus = mm_cpumask(mm); + struct tlbiel_va t = { .va = va, .pid = pid, .psize = psize, .ric = ric }; + on_each_cpu_mask(cpus, do_tlbiel_va, &t, 1); + if (atomic_read(&mm->context.copros) > 0) + _tlbie_va(va, pid, psize, RIC_FLUSH_TLB); +} + +struct tlbiel_va_range { + unsigned long pid; + unsigned long start; + unsigned long end; + unsigned long page_size; + unsigned long psize; + bool also_pwc; +}; + +static void do_tlbiel_va_range(void *info) +{ + struct tlbiel_va_range *t = info; + + _tlbiel_va_range(t->start, t->end, t->pid, t->page_size, + t->psize, t->also_pwc); +} + static __always_inline void _tlbie_lpid_va(unsigned long va, unsigned long lpid, unsigned long psize, unsigned long ric) { @@ -427,7 +522,7 @@ static __always_inline void _tlbie_lpid_va(unsigned long va, unsigned long lpid, asm volatile("ptesync": : :"memory"); __tlbie_lpid_va(va, lpid, ap, ric); - fixup_tlbie_lpid(lpid); + fixup_tlbie_lpid_va(va, lpid, ap); asm volatile("eieio; tlbsync; ptesync": : :"memory"); } @@ -439,10 +534,24 @@ static inline void _tlbie_va_range(unsigned long start, unsigned long end, if (also_pwc) __tlbie_pid(pid, RIC_FLUSH_PWC); __tlbie_va_range(start, end, pid, page_size, psize); - fixup_tlbie(); asm volatile("eieio; tlbsync; ptesync": : :"memory"); } +static inline void _tlbiel_va_range_multicast(struct mm_struct *mm, + unsigned long start, unsigned long end, + unsigned long pid, unsigned long page_size, + unsigned long psize, bool also_pwc) +{ + struct cpumask *cpus = mm_cpumask(mm); + struct tlbiel_va_range t = { .start = start, .end = end, + .pid = pid, .page_size = page_size, + .psize = psize, .also_pwc = also_pwc }; + + on_each_cpu_mask(cpus, do_tlbiel_va_range, &t, 1); + if (atomic_read(&mm->context.copros) > 0) + _tlbie_va_range(start, end, pid, page_size, psize, also_pwc); +} + /* * Base TLB flushing operations: * @@ -580,10 +689,14 @@ void radix__flush_tlb_mm(struct mm_struct *mm) goto local; } - if (mm_needs_flush_escalation(mm)) - _tlbie_pid(pid, RIC_FLUSH_ALL); - else - _tlbie_pid(pid, RIC_FLUSH_TLB); + if (cputlb_use_tlbie()) { + if (mm_needs_flush_escalation(mm)) + _tlbie_pid(pid, RIC_FLUSH_ALL); + else + _tlbie_pid(pid, RIC_FLUSH_TLB); + } else { + _tlbiel_pid_multicast(mm, pid, RIC_FLUSH_TLB); + } } else { local: _tlbiel_pid(pid, RIC_FLUSH_TLB); @@ -609,7 +722,10 @@ static void __flush_all_mm(struct mm_struct *mm, bool fullmm) goto local; } } - _tlbie_pid(pid, RIC_FLUSH_ALL); + if (cputlb_use_tlbie()) + _tlbie_pid(pid, RIC_FLUSH_ALL); + else + _tlbiel_pid_multicast(mm, pid, RIC_FLUSH_ALL); } else { local: _tlbiel_pid(pid, RIC_FLUSH_ALL); @@ -644,7 +760,10 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, exit_flush_lazy_tlbs(mm); goto local; } - _tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB); + if (cputlb_use_tlbie()) + _tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB); + else + _tlbiel_va_multicast(mm, vmaddr, pid, psize, RIC_FLUSH_TLB); } else { local: _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB); @@ -666,6 +785,24 @@ EXPORT_SYMBOL(radix__flush_tlb_page); #define radix__flush_all_mm radix__local_flush_all_mm #endif /* CONFIG_SMP */ +static void do_tlbiel_kernel(void *info) +{ + _tlbiel_pid(0, RIC_FLUSH_ALL); +} + +static inline void _tlbiel_kernel_broadcast(void) +{ + on_each_cpu(do_tlbiel_kernel, NULL, 1); + if (tlbie_capable) { + /* + * Coherent accelerators don't refcount kernel memory mappings, + * so have to always issue a tlbie for them. This is quite a + * slow path anyway. + */ + _tlbie_pid(0, RIC_FLUSH_ALL); + } +} + /* * If kernel TLBIs ever become local rather than global, then * drivers/misc/ocxl/link.c:ocxl_link_add_pe will need some work, as it @@ -673,7 +810,10 @@ EXPORT_SYMBOL(radix__flush_tlb_page); */ void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end) { - _tlbie_pid(0, RIC_FLUSH_ALL); + if (cputlb_use_tlbie()) + _tlbie_pid(0, RIC_FLUSH_ALL); + else + _tlbiel_kernel_broadcast(); } EXPORT_SYMBOL(radix__flush_tlb_kernel_range); @@ -729,10 +869,14 @@ is_local: if (local) { _tlbiel_pid(pid, RIC_FLUSH_TLB); } else { - if (mm_needs_flush_escalation(mm)) - _tlbie_pid(pid, RIC_FLUSH_ALL); - else - _tlbie_pid(pid, RIC_FLUSH_TLB); + if (cputlb_use_tlbie()) { + if (mm_needs_flush_escalation(mm)) + _tlbie_pid(pid, RIC_FLUSH_ALL); + else + _tlbie_pid(pid, RIC_FLUSH_TLB); + } else { + _tlbiel_pid_multicast(mm, pid, RIC_FLUSH_TLB); + } } } else { bool hflush = flush_all_sizes; @@ -757,8 +901,8 @@ is_local: gflush = false; } - asm volatile("ptesync": : :"memory"); if (local) { + asm volatile("ptesync": : :"memory"); __tlbiel_va_range(start, end, pid, page_size, mmu_virtual_psize); if (hflush) __tlbiel_va_range(hstart, hend, pid, @@ -767,7 +911,8 @@ is_local: __tlbiel_va_range(gstart, gend, pid, PUD_SIZE, MMU_PAGE_1G); asm volatile("ptesync": : :"memory"); - } else { + } else if (cputlb_use_tlbie()) { + asm volatile("ptesync": : :"memory"); __tlbie_va_range(start, end, pid, page_size, mmu_virtual_psize); if (hflush) __tlbie_va_range(hstart, hend, pid, @@ -775,8 +920,17 @@ is_local: if (gflush) __tlbie_va_range(gstart, gend, pid, PUD_SIZE, MMU_PAGE_1G); - fixup_tlbie(); + asm volatile("eieio; tlbsync; ptesync": : :"memory"); + } else { + _tlbiel_va_range_multicast(mm, + start, end, pid, page_size, mmu_virtual_psize, false); + if (hflush) + _tlbiel_va_range_multicast(mm, + hstart, hend, pid, PMD_SIZE, MMU_PAGE_2M, false); + if (gflush) + _tlbiel_va_range_multicast(mm, + gstart, gend, pid, PUD_SIZE, MMU_PAGE_1G, false); } } preempt_enable(); @@ -835,32 +989,19 @@ EXPORT_SYMBOL_GPL(radix__flush_pwc_lpid); /* * Flush partition scoped translations from LPID (=LPIDR) */ -void radix__flush_tlb_lpid(unsigned int lpid) +void radix__flush_all_lpid(unsigned int lpid) { _tlbie_lpid(lpid, RIC_FLUSH_ALL); } -EXPORT_SYMBOL_GPL(radix__flush_tlb_lpid); - -/* - * Flush partition scoped translations from LPID (=LPIDR) - */ -void radix__local_flush_tlb_lpid(unsigned int lpid) -{ - _tlbiel_lpid(lpid, RIC_FLUSH_ALL); -} -EXPORT_SYMBOL_GPL(radix__local_flush_tlb_lpid); +EXPORT_SYMBOL_GPL(radix__flush_all_lpid); /* - * Flush process scoped translations from LPID (=LPIDR). - * Important difference, the guest normally manages its own translations, - * but some cases e.g., vCPU CPU migration require KVM to flush. + * Flush process scoped translations from LPID (=LPIDR) */ -void radix__local_flush_tlb_lpid_guest(unsigned int lpid) +void radix__flush_all_lpid_guest(unsigned int lpid) { - _tlbiel_lpid_guest(lpid, RIC_FLUSH_ALL); + _tlbie_lpid_guest(lpid, RIC_FLUSH_ALL); } -EXPORT_SYMBOL_GPL(radix__local_flush_tlb_lpid_guest); - static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start, unsigned long end, int psize); @@ -966,16 +1107,26 @@ is_local: if (local) { _tlbiel_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB); } else { - if (mm_needs_flush_escalation(mm)) - also_pwc = true; + if (cputlb_use_tlbie()) { + if (mm_needs_flush_escalation(mm)) + also_pwc = true; + + _tlbie_pid(pid, + also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB); + } else { + _tlbiel_pid_multicast(mm, pid, + also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB); + } - _tlbie_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB); } } else { if (local) _tlbiel_va_range(start, end, pid, page_size, psize, also_pwc); - else + else if (cputlb_use_tlbie()) _tlbie_va_range(start, end, pid, page_size, psize, also_pwc); + else + _tlbiel_va_range_multicast(mm, + start, end, pid, page_size, psize, also_pwc); } preempt_enable(); } @@ -1017,7 +1168,11 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr) exit_flush_lazy_tlbs(mm); goto local; } - _tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true); + if (cputlb_use_tlbie()) + _tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true); + else + _tlbiel_va_range_multicast(mm, + addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true); } else { local: _tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true); diff --git a/arch/powerpc/mm/book3s64/subpage_prot.c b/arch/powerpc/mm/book3s64/subpage_prot.c index 9ba07e55c489..2ef24a53f4c9 100644 --- a/arch/powerpc/mm/book3s64/subpage_prot.c +++ b/arch/powerpc/mm/book3s64/subpage_prot.c @@ -7,7 +7,7 @@ #include <linux/kernel.h> #include <linux/gfp.h> #include <linux/types.h> -#include <linux/mm.h> +#include <linux/pagewalk.h> #include <linux/hugetlb.h> #include <linux/syscalls.h> @@ -139,14 +139,14 @@ static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr, return 0; } +static const struct mm_walk_ops subpage_walk_ops = { + .pmd_entry = subpage_walk_pmd_entry, +}; + static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr, unsigned long len) { struct vm_area_struct *vma; - struct mm_walk subpage_proto_walk = { - .mm = mm, - .pmd_entry = subpage_walk_pmd_entry, - }; /* * We don't try too hard, we just mark all the vma in that range @@ -163,7 +163,7 @@ static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr, if (vma->vm_start >= (addr + len)) break; vma->vm_flags |= VM_NOHUGEPAGE; - walk_page_vma(vma, &subpage_proto_walk); + walk_page_vma(vma, &subpage_walk_ops, NULL); vma = vma->vm_next; } } diff --git a/arch/powerpc/mm/dma-noncoherent.c b/arch/powerpc/mm/dma-noncoherent.c index c617282d5b2a..2a82984356f8 100644 --- a/arch/powerpc/mm/dma-noncoherent.c +++ b/arch/powerpc/mm/dma-noncoherent.c @@ -4,310 +4,18 @@ * Copyright (C) 2001 Dan Malek (dmalek@jlc.net) * * Copyright (C) 2000 Russell King - * - * Consistent memory allocators. Used for DMA devices that want to - * share uncached memory with the processor core. The function return - * is the virtual address and 'dma_handle' is the physical address. - * Mostly stolen from the ARM port, with some changes for PowerPC. - * -- Dan - * - * Reorganized to get rid of the arch-specific consistent_* functions - * and provide non-coherent implementations for the DMA API. -Matt - * - * Added in_interrupt() safe dma_alloc_coherent()/dma_free_coherent() - * implementation. This is pulled straight from ARM and barely - * modified. -Matt */ -#include <linux/sched.h> -#include <linux/slab.h> #include <linux/kernel.h> #include <linux/errno.h> -#include <linux/string.h> #include <linux/types.h> #include <linux/highmem.h> #include <linux/dma-direct.h> #include <linux/dma-noncoherent.h> -#include <linux/export.h> #include <asm/tlbflush.h> #include <asm/dma.h> -#include <mm/mmu_decl.h> - -/* - * This address range defaults to a value that is safe for all - * platforms which currently set CONFIG_NOT_COHERENT_CACHE. It - * can be further configured for specific applications under - * the "Advanced Setup" menu. -Matt - */ -#define CONSISTENT_BASE (IOREMAP_TOP) -#define CONSISTENT_END (CONSISTENT_BASE + CONFIG_CONSISTENT_SIZE) -#define CONSISTENT_OFFSET(x) (((unsigned long)(x) - CONSISTENT_BASE) >> PAGE_SHIFT) - -/* - * This is the page table (2MB) covering uncached, DMA consistent allocations - */ -static DEFINE_SPINLOCK(consistent_lock); - -/* - * VM region handling support. - * - * This should become something generic, handling VM region allocations for - * vmalloc and similar (ioremap, module space, etc). - * - * I envisage vmalloc()'s supporting vm_struct becoming: - * - * struct vm_struct { - * struct vm_region region; - * unsigned long flags; - * struct page **pages; - * unsigned int nr_pages; - * unsigned long phys_addr; - * }; - * - * get_vm_area() would then call vm_region_alloc with an appropriate - * struct vm_region head (eg): - * - * struct vm_region vmalloc_head = { - * .vm_list = LIST_HEAD_INIT(vmalloc_head.vm_list), - * .vm_start = VMALLOC_START, - * .vm_end = VMALLOC_END, - * }; - * - * However, vmalloc_head.vm_start is variable (typically, it is dependent on - * the amount of RAM found at boot time.) I would imagine that get_vm_area() - * would have to initialise this each time prior to calling vm_region_alloc(). - */ -struct ppc_vm_region { - struct list_head vm_list; - unsigned long vm_start; - unsigned long vm_end; -}; - -static struct ppc_vm_region consistent_head = { - .vm_list = LIST_HEAD_INIT(consistent_head.vm_list), - .vm_start = CONSISTENT_BASE, - .vm_end = CONSISTENT_END, -}; - -static struct ppc_vm_region * -ppc_vm_region_alloc(struct ppc_vm_region *head, size_t size, gfp_t gfp) -{ - unsigned long addr = head->vm_start, end = head->vm_end - size; - unsigned long flags; - struct ppc_vm_region *c, *new; - - new = kmalloc(sizeof(struct ppc_vm_region), gfp); - if (!new) - goto out; - - spin_lock_irqsave(&consistent_lock, flags); - - list_for_each_entry(c, &head->vm_list, vm_list) { - if ((addr + size) < addr) - goto nospc; - if ((addr + size) <= c->vm_start) - goto found; - addr = c->vm_end; - if (addr > end) - goto nospc; - } - - found: - /* - * Insert this entry _before_ the one we found. - */ - list_add_tail(&new->vm_list, &c->vm_list); - new->vm_start = addr; - new->vm_end = addr + size; - - spin_unlock_irqrestore(&consistent_lock, flags); - return new; - - nospc: - spin_unlock_irqrestore(&consistent_lock, flags); - kfree(new); - out: - return NULL; -} - -static struct ppc_vm_region *ppc_vm_region_find(struct ppc_vm_region *head, unsigned long addr) -{ - struct ppc_vm_region *c; - - list_for_each_entry(c, &head->vm_list, vm_list) { - if (c->vm_start == addr) - goto out; - } - c = NULL; - out: - return c; -} - -/* - * Allocate DMA-coherent memory space and return both the kernel remapped - * virtual and bus address for that space. - */ -void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, - gfp_t gfp, unsigned long attrs) -{ - struct page *page; - struct ppc_vm_region *c; - unsigned long order; - u64 mask = ISA_DMA_THRESHOLD, limit; - - if (dev) { - mask = dev->coherent_dma_mask; - - /* - * Sanity check the DMA mask - it must be non-zero, and - * must be able to be satisfied by a DMA allocation. - */ - if (mask == 0) { - dev_warn(dev, "coherent DMA mask is unset\n"); - goto no_page; - } - - if ((~mask) & ISA_DMA_THRESHOLD) { - dev_warn(dev, "coherent DMA mask %#llx is smaller " - "than system GFP_DMA mask %#llx\n", - mask, (unsigned long long)ISA_DMA_THRESHOLD); - goto no_page; - } - } - - - size = PAGE_ALIGN(size); - limit = (mask + 1) & ~mask; - if ((limit && size >= limit) || - size >= (CONSISTENT_END - CONSISTENT_BASE)) { - printk(KERN_WARNING "coherent allocation too big (requested %#x mask %#Lx)\n", - size, mask); - return NULL; - } - - order = get_order(size); - - /* Might be useful if we ever have a real legacy DMA zone... */ - if (mask != 0xffffffff) - gfp |= GFP_DMA; - - page = alloc_pages(gfp, order); - if (!page) - goto no_page; - - /* - * Invalidate any data that might be lurking in the - * kernel direct-mapped region for device DMA. - */ - { - unsigned long kaddr = (unsigned long)page_address(page); - memset(page_address(page), 0, size); - flush_dcache_range(kaddr, kaddr + size); - } - - /* - * Allocate a virtual address in the consistent mapping region. - */ - c = ppc_vm_region_alloc(&consistent_head, size, - gfp & ~(__GFP_DMA | __GFP_HIGHMEM)); - if (c) { - unsigned long vaddr = c->vm_start; - struct page *end = page + (1 << order); - - split_page(page, order); - - /* - * Set the "dma handle" - */ - *dma_handle = phys_to_dma(dev, page_to_phys(page)); - - do { - SetPageReserved(page); - map_kernel_page(vaddr, page_to_phys(page), - pgprot_noncached(PAGE_KERNEL)); - page++; - vaddr += PAGE_SIZE; - } while (size -= PAGE_SIZE); - - /* - * Free the otherwise unused pages. - */ - while (page < end) { - __free_page(page); - page++; - } - - return (void *)c->vm_start; - } - - if (page) - __free_pages(page, order); - no_page: - return NULL; -} - -/* - * free a page as defined by the above mapping. - */ -void arch_dma_free(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_handle, unsigned long attrs) -{ - struct ppc_vm_region *c; - unsigned long flags, addr; - - size = PAGE_ALIGN(size); - - spin_lock_irqsave(&consistent_lock, flags); - - c = ppc_vm_region_find(&consistent_head, (unsigned long)vaddr); - if (!c) - goto no_area; - - if ((c->vm_end - c->vm_start) != size) { - printk(KERN_ERR "%s: freeing wrong coherent size (%ld != %d)\n", - __func__, c->vm_end - c->vm_start, size); - dump_stack(); - size = c->vm_end - c->vm_start; - } - - addr = c->vm_start; - do { - pte_t *ptep; - unsigned long pfn; - - ptep = pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(addr), - addr), - addr), - addr); - if (!pte_none(*ptep) && pte_present(*ptep)) { - pfn = pte_pfn(*ptep); - pte_clear(&init_mm, addr, ptep); - if (pfn_valid(pfn)) { - struct page *page = pfn_to_page(pfn); - __free_reserved_page(page); - } - } - addr += PAGE_SIZE; - } while (size -= PAGE_SIZE); - - flush_tlb_kernel_range(c->vm_start, c->vm_end); - - list_del(&c->vm_list); - - spin_unlock_irqrestore(&consistent_lock, flags); - - kfree(c); - return; - - no_area: - spin_unlock_irqrestore(&consistent_lock, flags); - printk(KERN_ERR "%s: trying to free invalid coherent area: %p\n", - __func__, vaddr); - dump_stack(); -} - /* * make an area consistent. */ @@ -408,23 +116,9 @@ void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, __dma_sync_page(paddr, size, dir); } -/* - * Return the PFN for a given cpu virtual address returned by arch_dma_alloc. - */ -long arch_dma_coherent_to_pfn(struct device *dev, void *vaddr, - dma_addr_t dma_addr) +void arch_dma_prep_coherent(struct page *page, size_t size) { - /* This should always be populated, so we don't test every - * level. If that fails, we'll have a nice crash which - * will be as good as a BUG_ON() - */ - unsigned long cpu_addr = (unsigned long)vaddr; - pgd_t *pgd = pgd_offset_k(cpu_addr); - pud_t *pud = pud_offset(pgd, cpu_addr); - pmd_t *pmd = pmd_offset(pud, cpu_addr); - pte_t *ptep = pte_offset_kernel(pmd, cpu_addr); + unsigned long kaddr = (unsigned long)page_address(page); - if (pte_none(*ptep) || !pte_present(*ptep)) - return 0; - return pte_pfn(*ptep); + flush_dcache_range(kaddr, kaddr + size); } diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index a8953f108808..73d4873fc7f8 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -667,7 +667,7 @@ void flush_dcache_icache_hugepage(struct page *page) BUG_ON(!PageCompound(page)); - for (i = 0; i < (1UL << compound_order(page)); i++) { + for (i = 0; i < compound_nr(page); i++) { if (!PageHighMem(page)) { __flush_dcache_icache(page_address(page+i)); } else { diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index a44f6281ca3a..4e08246acd79 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -172,6 +172,21 @@ static __meminit void vmemmap_list_populate(unsigned long phys, vmemmap_list = vmem_back; } +static bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start, + unsigned long page_size) +{ + unsigned long nr_pfn = page_size / sizeof(struct page); + unsigned long start_pfn = page_to_pfn((struct page *)start); + + if ((start_pfn + nr_pfn) > altmap->end_pfn) + return true; + + if (start_pfn < altmap->base_pfn) + return true; + + return false; +} + int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap) { @@ -194,7 +209,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, * fail due to alignment issues when using 16MB hugepages, so * fall back to system memory if the altmap allocation fail. */ - if (altmap) { + if (altmap && !altmap_cross_boundary(altmap, start, page_size)) { p = altmap_alloc_block_buf(page_size, altmap); if (!p) pr_debug("altmap block allocation failed, falling back to system memory"); diff --git a/arch/powerpc/mm/ioremap.c b/arch/powerpc/mm/ioremap.c new file mode 100644 index 000000000000..fc669643ce6a --- /dev/null +++ b/arch/powerpc/mm/ioremap.c @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/io.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <asm/io-workarounds.h> + +unsigned long ioremap_bot; +EXPORT_SYMBOL(ioremap_bot); + +void __iomem *ioremap(phys_addr_t addr, unsigned long size) +{ + pgprot_t prot = pgprot_noncached(PAGE_KERNEL); + void *caller = __builtin_return_address(0); + + if (iowa_is_active()) + return iowa_ioremap(addr, size, prot, caller); + return __ioremap_caller(addr, size, prot, caller); +} +EXPORT_SYMBOL(ioremap); + +void __iomem *ioremap_wc(phys_addr_t addr, unsigned long size) +{ + pgprot_t prot = pgprot_noncached_wc(PAGE_KERNEL); + void *caller = __builtin_return_address(0); + + if (iowa_is_active()) + return iowa_ioremap(addr, size, prot, caller); + return __ioremap_caller(addr, size, prot, caller); +} +EXPORT_SYMBOL(ioremap_wc); + +void __iomem *ioremap_coherent(phys_addr_t addr, unsigned long size) +{ + pgprot_t prot = pgprot_cached(PAGE_KERNEL); + void *caller = __builtin_return_address(0); + + if (iowa_is_active()) + return iowa_ioremap(addr, size, prot, caller); + return __ioremap_caller(addr, size, prot, caller); +} + +void __iomem *ioremap_prot(phys_addr_t addr, unsigned long size, unsigned long flags) +{ + pte_t pte = __pte(flags); + void *caller = __builtin_return_address(0); + + /* writeable implies dirty for kernel addresses */ + if (pte_write(pte)) + pte = pte_mkdirty(pte); + + /* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */ + pte = pte_exprotect(pte); + pte = pte_mkprivileged(pte); + + if (iowa_is_active()) + return iowa_ioremap(addr, size, pte_pgprot(pte), caller); + return __ioremap_caller(addr, size, pte_pgprot(pte), caller); +} +EXPORT_SYMBOL(ioremap_prot); + +int early_ioremap_range(unsigned long ea, phys_addr_t pa, + unsigned long size, pgprot_t prot) +{ + unsigned long i; + + for (i = 0; i < size; i += PAGE_SIZE) { + int err = map_kernel_page(ea + i, pa + i, prot); + + if (WARN_ON_ONCE(err)) /* Should clean up */ + return err; + } + + return 0; +} + +void __iomem *do_ioremap(phys_addr_t pa, phys_addr_t offset, unsigned long size, + pgprot_t prot, void *caller) +{ + struct vm_struct *area; + int ret; + unsigned long va; + + area = __get_vm_area_caller(size, VM_IOREMAP, IOREMAP_START, IOREMAP_END, caller); + if (area == NULL) + return NULL; + + area->phys_addr = pa; + va = (unsigned long)area->addr; + + ret = ioremap_page_range(va, va + size, pa, prot); + if (!ret) + return (void __iomem *)area->addr + offset; + + unmap_kernel_range(va, size); + free_vm_area(area); + + return NULL; +} diff --git a/arch/powerpc/mm/ioremap_32.c b/arch/powerpc/mm/ioremap_32.c new file mode 100644 index 000000000000..f36121f25243 --- /dev/null +++ b/arch/powerpc/mm/ioremap_32.c @@ -0,0 +1,92 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/io.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> + +#include <mm/mmu_decl.h> + +void __iomem *ioremap_wt(phys_addr_t addr, unsigned long size) +{ + pgprot_t prot = pgprot_cached_wthru(PAGE_KERNEL); + + return __ioremap_caller(addr, size, prot, __builtin_return_address(0)); +} +EXPORT_SYMBOL(ioremap_wt); + +void __iomem * +__ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *caller) +{ + unsigned long v; + phys_addr_t p, offset; + int err; + + /* + * Choose an address to map it to. + * Once the vmalloc system is running, we use it. + * Before then, we use space going down from IOREMAP_TOP + * (ioremap_bot records where we're up to). + */ + p = addr & PAGE_MASK; + offset = addr & ~PAGE_MASK; + size = PAGE_ALIGN(addr + size) - p; + + /* + * If the address lies within the first 16 MB, assume it's in ISA + * memory space + */ + if (p < 16 * 1024 * 1024) + p += _ISA_MEM_BASE; + +#ifndef CONFIG_CRASH_DUMP + /* + * Don't allow anybody to remap normal RAM that we're using. + * mem_init() sets high_memory so only do the check after that. + */ + if (slab_is_available() && p <= virt_to_phys(high_memory - 1) && + page_is_ram(__phys_to_pfn(p))) { + pr_warn("%s(): phys addr 0x%llx is RAM lr %ps\n", __func__, + (unsigned long long)p, __builtin_return_address(0)); + return NULL; + } +#endif + + if (size == 0) + return NULL; + + /* + * Is it already mapped? Perhaps overlapped by a previous + * mapping. + */ + v = p_block_mapped(p); + if (v) + return (void __iomem *)v + offset; + + if (slab_is_available()) + return do_ioremap(p, offset, size, prot, caller); + + /* + * Should check if it is a candidate for a BAT mapping + */ + + err = early_ioremap_range(ioremap_bot - size, p, size, prot); + if (err) + return NULL; + ioremap_bot -= size; + + return (void __iomem *)ioremap_bot + offset; +} + +void iounmap(volatile void __iomem *addr) +{ + /* + * If mapped by BATs then there is nothing to do. + * Calling vfree() generates a benign warning. + */ + if (v_block_mapped((unsigned long)addr)) + return; + + if (addr > high_memory && (unsigned long)addr < ioremap_bot) + vunmap((void *)(PAGE_MASK & (unsigned long)addr)); +} +EXPORT_SYMBOL(iounmap); diff --git a/arch/powerpc/mm/ioremap_64.c b/arch/powerpc/mm/ioremap_64.c new file mode 100644 index 000000000000..fd29e51700cd --- /dev/null +++ b/arch/powerpc/mm/ioremap_64.c @@ -0,0 +1,113 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/io.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> + +/** + * Low level function to establish the page tables for an IO mapping + */ +void __iomem *__ioremap_at(phys_addr_t pa, void *ea, unsigned long size, pgprot_t prot) +{ + int ret; + unsigned long va = (unsigned long)ea; + + /* We don't support the 4K PFN hack with ioremap */ + if (pgprot_val(prot) & H_PAGE_4K_PFN) + return NULL; + + if ((ea + size) >= (void *)IOREMAP_END) { + pr_warn("Outside the supported range\n"); + return NULL; + } + + WARN_ON(pa & ~PAGE_MASK); + WARN_ON(((unsigned long)ea) & ~PAGE_MASK); + WARN_ON(size & ~PAGE_MASK); + + if (slab_is_available()) { + ret = ioremap_page_range(va, va + size, pa, prot); + if (ret) + unmap_kernel_range(va, size); + } else { + ret = early_ioremap_range(va, pa, size, prot); + } + + if (ret) + return NULL; + + return (void __iomem *)ea; +} +EXPORT_SYMBOL(__ioremap_at); + +/** + * Low level function to tear down the page tables for an IO mapping. This is + * used for mappings that are manipulated manually, like partial unmapping of + * PCI IOs or ISA space. + */ +void __iounmap_at(void *ea, unsigned long size) +{ + WARN_ON(((unsigned long)ea) & ~PAGE_MASK); + WARN_ON(size & ~PAGE_MASK); + + unmap_kernel_range((unsigned long)ea, size); +} +EXPORT_SYMBOL(__iounmap_at); + +void __iomem *__ioremap_caller(phys_addr_t addr, unsigned long size, + pgprot_t prot, void *caller) +{ + phys_addr_t paligned, offset; + void __iomem *ret; + int err; + + /* We don't support the 4K PFN hack with ioremap */ + if (pgprot_val(prot) & H_PAGE_4K_PFN) + return NULL; + + /* + * Choose an address to map it to. Once the vmalloc system is running, + * we use it. Before that, we map using addresses going up from + * ioremap_bot. vmalloc will use the addresses from IOREMAP_BASE + * through ioremap_bot. + */ + paligned = addr & PAGE_MASK; + offset = addr & ~PAGE_MASK; + size = PAGE_ALIGN(addr + size) - paligned; + + if (size == 0 || paligned == 0) + return NULL; + + if (slab_is_available()) + return do_ioremap(paligned, offset, size, prot, caller); + + err = early_ioremap_range(ioremap_bot, paligned, size, prot); + if (err) + return NULL; + + ret = (void __iomem *)ioremap_bot + offset; + ioremap_bot += size; + + return ret; +} + +/* + * Unmap an IO region and remove it from vmalloc'd list. + * Access to IO memory should be serialized by driver. + */ +void iounmap(volatile void __iomem *token) +{ + void *addr; + + if (!slab_is_available()) + return; + + addr = (void *)((unsigned long __force)PCI_FIX_ADDR(token) & PAGE_MASK); + + if ((unsigned long)addr < ioremap_bot) { + pr_warn("Attempt to iounmap early bolted mapping at 0x%p\n", addr); + return; + } + vunmap(addr); +} +EXPORT_SYMBOL(iounmap); diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/kasan_init_32.c index 0d62be3cba47..0e6ed4413eea 100644 --- a/arch/powerpc/mm/kasan/kasan_init_32.c +++ b/arch/powerpc/mm/kasan/kasan_init_32.c @@ -5,12 +5,21 @@ #include <linux/kasan.h> #include <linux/printk.h> #include <linux/memblock.h> +#include <linux/moduleloader.h> #include <linux/sched/task.h> #include <linux/vmalloc.h> #include <asm/pgalloc.h> #include <asm/code-patching.h> #include <mm/mmu_decl.h> +static pgprot_t kasan_prot_ro(void) +{ + if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE)) + return PAGE_READONLY; + + return PAGE_KERNEL_RO; +} + static void kasan_populate_pte(pte_t *ptep, pgprot_t prot) { unsigned long va = (unsigned long)kasan_early_shadow_page; @@ -21,10 +30,11 @@ static void kasan_populate_pte(pte_t *ptep, pgprot_t prot) __set_pte_at(&init_mm, va, ptep, pfn_pte(PHYS_PFN(pa), prot), 0); } -static int kasan_init_shadow_page_tables(unsigned long k_start, unsigned long k_end) +static int __ref kasan_init_shadow_page_tables(unsigned long k_start, unsigned long k_end) { pmd_t *pmd; unsigned long k_cur, k_next; + pgprot_t prot = slab_is_available() ? kasan_prot_ro() : PAGE_KERNEL; pmd = pmd_offset(pud_offset(pgd_offset_k(k_start), k_start), k_start); @@ -35,15 +45,27 @@ static int kasan_init_shadow_page_tables(unsigned long k_start, unsigned long k_ if ((void *)pmd_page_vaddr(*pmd) != kasan_early_shadow_pte) continue; - new = pte_alloc_one_kernel(&init_mm); + if (slab_is_available()) + new = pte_alloc_one_kernel(&init_mm); + else + new = memblock_alloc(PTE_FRAG_SIZE, PTE_FRAG_SIZE); if (!new) return -ENOMEM; - if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE)) - kasan_populate_pte(new, PAGE_READONLY); - else - kasan_populate_pte(new, PAGE_KERNEL_RO); - pmd_populate_kernel(&init_mm, pmd, new); + kasan_populate_pte(new, prot); + + smp_wmb(); /* See comment in __pte_alloc */ + + spin_lock(&init_mm.page_table_lock); + /* Has another populated it ? */ + if (likely((void *)pmd_page_vaddr(*pmd) == kasan_early_shadow_pte)) { + pmd_populate_kernel(&init_mm, pmd, new); + new = NULL; + } + spin_unlock(&init_mm.page_table_lock); + + if (new && slab_is_available()) + pte_free_kernel(&init_mm, new); } return 0; } @@ -71,7 +93,7 @@ static int __ref kasan_init_region(void *start, size_t size) if (!slab_is_available()) block = memblock_alloc(k_end - k_start, PAGE_SIZE); - for (k_cur = k_start; k_cur < k_end; k_cur += PAGE_SIZE) { + for (k_cur = k_start & PAGE_MASK; k_cur < k_end; k_cur += PAGE_SIZE) { pmd_t *pmd = pmd_offset(pud_offset(pgd_offset_k(k_cur), k_cur), k_cur); void *va = block ? block + k_cur - k_start : kasan_get_one_page(); pte_t pte = pfn_pte(PHYS_PFN(__pa(va)), PAGE_KERNEL); @@ -87,11 +109,23 @@ static int __ref kasan_init_region(void *start, size_t size) static void __init kasan_remap_early_shadow_ro(void) { - if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE)) - kasan_populate_pte(kasan_early_shadow_pte, PAGE_READONLY); - else - kasan_populate_pte(kasan_early_shadow_pte, PAGE_KERNEL_RO); + pgprot_t prot = kasan_prot_ro(); + unsigned long k_start = KASAN_SHADOW_START; + unsigned long k_end = KASAN_SHADOW_END; + unsigned long k_cur; + phys_addr_t pa = __pa(kasan_early_shadow_page); + + kasan_populate_pte(kasan_early_shadow_pte, prot); + + for (k_cur = k_start & PAGE_MASK; k_cur < k_end; k_cur += PAGE_SIZE) { + pmd_t *pmd = pmd_offset(pud_offset(pgd_offset_k(k_cur), k_cur), k_cur); + pte_t *ptep = pte_offset_kernel(pmd, k_cur); + + if ((pte_val(*ptep) & PTE_RPN_MASK) != pa) + continue; + __set_pte_at(&init_mm, k_cur, ptep, pfn_pte(PHYS_PFN(pa), prot), 0); + } flush_tlb_kernel_range(KASAN_SHADOW_START, KASAN_SHADOW_END); } @@ -134,7 +168,11 @@ void __init kasan_init(void) #ifdef CONFIG_MODULES void *module_alloc(unsigned long size) { - void *base = vmalloc_exec(size); + void *base; + + base = __vmalloc_node_range(size, MODULE_ALIGN, VMALLOC_START, VMALLOC_END, + GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, + NUMA_NO_NODE, __builtin_return_address(0)); if (!base) return NULL; diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 9259337d7374..be941d382c8d 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -239,7 +239,7 @@ void __init paging_init(void) #ifdef CONFIG_ZONE_DMA max_zone_pfns[ZONE_DMA] = min(max_low_pfn, - ((1UL << ARCH_ZONE_DMA_BITS) - 1) >> PAGE_SHIFT); + 1UL << (ARCH_ZONE_DMA_BITS - PAGE_SHIFT)); #endif max_zone_pfns[ZONE_NORMAL] = max_low_pfn; #ifdef CONFIG_HIGHMEM @@ -302,12 +302,9 @@ void __init mem_init(void) pr_info(" * 0x%08lx..0x%08lx : highmem PTEs\n", PKMAP_BASE, PKMAP_ADDR(LAST_PKMAP)); #endif /* CONFIG_HIGHMEM */ -#ifdef CONFIG_NOT_COHERENT_CACHE - pr_info(" * 0x%08lx..0x%08lx : consistent mem\n", - IOREMAP_TOP, IOREMAP_TOP + CONFIG_CONSISTENT_SIZE); -#endif /* CONFIG_NOT_COHERENT_CACHE */ - pr_info(" * 0x%08lx..0x%08lx : early ioremap\n", - ioremap_bot, IOREMAP_TOP); + if (ioremap_bot != IOREMAP_TOP) + pr_info(" * 0x%08lx..0x%08lx : early ioremap\n", + ioremap_bot, IOREMAP_TOP); pr_info(" * 0x%08lx..0x%08lx : vmalloc & ioremap\n", VMALLOC_START, VMALLOC_END); #endif /* CONFIG_PPC32 */ @@ -407,63 +404,6 @@ void flush_icache_user_range(struct vm_area_struct *vma, struct page *page, EXPORT_SYMBOL(flush_icache_user_range); /* - * This is called at the end of handling a user page fault, when the - * fault has been handled by updating a PTE in the linux page tables. - * We use it to preload an HPTE into the hash table corresponding to - * the updated linux PTE. - * - * This must always be called with the pte lock held. - */ -void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, - pte_t *ptep) -{ -#ifdef CONFIG_PPC_BOOK3S - /* - * We don't need to worry about _PAGE_PRESENT here because we are - * called with either mm->page_table_lock held or ptl lock held - */ - unsigned long trap; - bool is_exec; - - if (radix_enabled()) { - prefetch((void *)address); - return; - } - - /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */ - if (!pte_young(*ptep) || address >= TASK_SIZE) - return; - - /* We try to figure out if we are coming from an instruction - * access fault and pass that down to __hash_page so we avoid - * double-faulting on execution of fresh text. We have to test - * for regs NULL since init will get here first thing at boot - * - * We also avoid filling the hash if not coming from a fault - */ - - trap = current->thread.regs ? TRAP(current->thread.regs) : 0UL; - switch (trap) { - case 0x300: - is_exec = false; - break; - case 0x400: - is_exec = true; - break; - default: - return; - } - - hash_preload(vma->vm_mm, address, is_exec, trap); -#endif /* CONFIG_PPC_BOOK3S */ -#if (defined(CONFIG_PPC_BOOK3E_64) || defined(CONFIG_PPC_FSL_BOOK3E)) \ - && defined(CONFIG_HUGETLB_PAGE) - if (is_vm_hugetlb_page(vma)) - book3e_hugetlb_preload(vma, address, *ptep); -#endif -} - -/* * System memory should not be in /proc/iomem but various tools expect it * (eg kdump). */ diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index 32c1a191c28a..c750ac9ec713 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -82,10 +82,6 @@ static inline void print_system_hash_info(void) {} #else /* CONFIG_PPC_MMU_NOHASH */ -extern void hash_preload(struct mm_struct *mm, unsigned long ea, - bool is_exec, unsigned long trap); - - extern void _tlbie(unsigned long address); extern void _tlbia(void); @@ -95,6 +91,8 @@ void print_system_hash_info(void); #ifdef CONFIG_PPC32 +void hash_preload(struct mm_struct *mm, unsigned long ea); + extern void mapin_ram(void); extern void setbat(int index, unsigned long virt, phys_addr_t phys, unsigned int size, pgprot_t prot); @@ -108,7 +106,6 @@ extern u8 early_hash[]; #endif /* CONFIG_PPC32 */ -extern unsigned long ioremap_bot; extern unsigned long __max_low_memory; extern phys_addr_t __initial_memory_limit_addr; extern phys_addr_t total_memory; diff --git a/arch/powerpc/mm/nohash/book3e_hugetlbpage.c b/arch/powerpc/mm/nohash/book3e_hugetlbpage.c index 61915f4d3c7f..8b88be91b622 100644 --- a/arch/powerpc/mm/nohash/book3e_hugetlbpage.c +++ b/arch/powerpc/mm/nohash/book3e_hugetlbpage.c @@ -122,8 +122,8 @@ static inline int book3e_tlb_exists(unsigned long ea, unsigned long pid) return found; } -void book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea, - pte_t pte) +static void +book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea, pte_t pte) { unsigned long mas1, mas2; u64 mas7_3; @@ -183,6 +183,18 @@ void book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea, local_irq_restore(flags); } +/* + * This is called at the end of handling a user page fault, when the + * fault has been handled by updating a PTE in the linux page tables. + * + * This must always be called with the pte lock held. + */ +void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) +{ + if (is_vm_hugetlb_page(vma)) + book3e_hugetlb_preload(vma, address, *ptep); +} + void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr) { struct hstate *hstate = hstate_file(vma->vm_file); diff --git a/arch/powerpc/mm/nohash/tlb.c b/arch/powerpc/mm/nohash/tlb.c index d4acf6fa0596..696f568253a0 100644 --- a/arch/powerpc/mm/nohash/tlb.c +++ b/arch/powerpc/mm/nohash/tlb.c @@ -630,7 +630,6 @@ static void early_init_this_mmu(void) #ifdef CONFIG_PPC_FSL_BOOK3E if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { unsigned int num_cams; - int __maybe_unused cpu = smp_processor_id(); bool map = true; /* use a quarter of the TLBCAM for bolted linear map */ @@ -704,6 +703,8 @@ static void __init early_init_mmu_global(void) * for use by the TLB miss code */ linear_map_top = memblock_end_of_DRAM(); + + ioremap_bot = IOREMAP_BASE; } static void __init early_mmu_set_memory_limit(void) diff --git a/arch/powerpc/mm/pgtable-frag.c b/arch/powerpc/mm/pgtable-frag.c index a7b05214760c..ee4bd6d38602 100644 --- a/arch/powerpc/mm/pgtable-frag.c +++ b/arch/powerpc/mm/pgtable-frag.c @@ -25,7 +25,7 @@ void pte_frag_destroy(void *pte_frag) count = ((unsigned long)pte_frag & ~PAGE_MASK) >> PTE_FRAG_SIZE_SHIFT; /* We allow PTE_FRAG_NR fragments from a PTE page */ if (atomic_sub_and_test(PTE_FRAG_NR - count, &page->pt_frag_refcount)) { - pgtable_page_dtor(page); + pgtable_pte_page_dtor(page); __free_page(page); } } @@ -61,7 +61,7 @@ static pte_t *__alloc_for_ptecache(struct mm_struct *mm, int kernel) page = alloc_page(PGALLOC_GFP | __GFP_ACCOUNT); if (!page) return NULL; - if (!pgtable_page_ctor(page)) { + if (!pgtable_pte_page_ctor(page)) { __free_page(page); return NULL; } @@ -113,7 +113,7 @@ void pte_fragment_free(unsigned long *table, int kernel) BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0); if (atomic_dec_and_test(&page->pt_frag_refcount)) { if (!kernel) - pgtable_page_dtor(page); + pgtable_pte_page_dtor(page); __free_page(page); } } diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index 35cb96cfc258..8ec5dfb65b2e 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -27,166 +27,13 @@ #include <asm/pgtable.h> #include <asm/pgalloc.h> #include <asm/fixmap.h> -#include <asm/io.h> #include <asm/setup.h> #include <asm/sections.h> #include <mm/mmu_decl.h> -unsigned long ioremap_bot; -EXPORT_SYMBOL(ioremap_bot); /* aka VMALLOC_END */ - extern char etext[], _stext[], _sinittext[], _einittext[]; -void __iomem * -ioremap(phys_addr_t addr, unsigned long size) -{ - pgprot_t prot = pgprot_noncached(PAGE_KERNEL); - - return __ioremap_caller(addr, size, prot, __builtin_return_address(0)); -} -EXPORT_SYMBOL(ioremap); - -void __iomem * -ioremap_wc(phys_addr_t addr, unsigned long size) -{ - pgprot_t prot = pgprot_noncached_wc(PAGE_KERNEL); - - return __ioremap_caller(addr, size, prot, __builtin_return_address(0)); -} -EXPORT_SYMBOL(ioremap_wc); - -void __iomem * -ioremap_wt(phys_addr_t addr, unsigned long size) -{ - pgprot_t prot = pgprot_cached_wthru(PAGE_KERNEL); - - return __ioremap_caller(addr, size, prot, __builtin_return_address(0)); -} -EXPORT_SYMBOL(ioremap_wt); - -void __iomem * -ioremap_coherent(phys_addr_t addr, unsigned long size) -{ - pgprot_t prot = pgprot_cached(PAGE_KERNEL); - - return __ioremap_caller(addr, size, prot, __builtin_return_address(0)); -} -EXPORT_SYMBOL(ioremap_coherent); - -void __iomem * -ioremap_prot(phys_addr_t addr, unsigned long size, unsigned long flags) -{ - pte_t pte = __pte(flags); - - /* writeable implies dirty for kernel addresses */ - if (pte_write(pte)) - pte = pte_mkdirty(pte); - - /* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */ - pte = pte_exprotect(pte); - pte = pte_mkprivileged(pte); - - return __ioremap_caller(addr, size, pte_pgprot(pte), __builtin_return_address(0)); -} -EXPORT_SYMBOL(ioremap_prot); - -void __iomem * -__ioremap(phys_addr_t addr, unsigned long size, unsigned long flags) -{ - return __ioremap_caller(addr, size, __pgprot(flags), __builtin_return_address(0)); -} - -void __iomem * -__ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *caller) -{ - unsigned long v, i; - phys_addr_t p; - int err; - - /* - * Choose an address to map it to. - * Once the vmalloc system is running, we use it. - * Before then, we use space going down from IOREMAP_TOP - * (ioremap_bot records where we're up to). - */ - p = addr & PAGE_MASK; - size = PAGE_ALIGN(addr + size) - p; - - /* - * If the address lies within the first 16 MB, assume it's in ISA - * memory space - */ - if (p < 16*1024*1024) - p += _ISA_MEM_BASE; - -#ifndef CONFIG_CRASH_DUMP - /* - * Don't allow anybody to remap normal RAM that we're using. - * mem_init() sets high_memory so only do the check after that. - */ - if (slab_is_available() && p <= virt_to_phys(high_memory - 1) && - page_is_ram(__phys_to_pfn(p))) { - printk("__ioremap(): phys addr 0x%llx is RAM lr %ps\n", - (unsigned long long)p, __builtin_return_address(0)); - return NULL; - } -#endif - - if (size == 0) - return NULL; - - /* - * Is it already mapped? Perhaps overlapped by a previous - * mapping. - */ - v = p_block_mapped(p); - if (v) - goto out; - - if (slab_is_available()) { - struct vm_struct *area; - area = get_vm_area_caller(size, VM_IOREMAP, caller); - if (area == 0) - return NULL; - area->phys_addr = p; - v = (unsigned long) area->addr; - } else { - v = (ioremap_bot -= size); - } - - /* - * Should check if it is a candidate for a BAT mapping - */ - - err = 0; - for (i = 0; i < size && err == 0; i += PAGE_SIZE) - err = map_kernel_page(v + i, p + i, prot); - if (err) { - if (slab_is_available()) - vunmap((void *)v); - return NULL; - } - -out: - return (void __iomem *) (v + ((unsigned long)addr & ~PAGE_MASK)); -} -EXPORT_SYMBOL(__ioremap); - -void iounmap(volatile void __iomem *addr) -{ - /* - * If mapped by BATs then there is nothing to do. - * Calling vfree() generates a benign warning. - */ - if (v_block_mapped((unsigned long)addr)) - return; - - if (addr > high_memory && (unsigned long) addr < ioremap_bot) - vunmap((void *) (PAGE_MASK & (unsigned long)addr)); -} -EXPORT_SYMBOL(iounmap); - static void __init *early_alloc_pgtable(unsigned long size) { void *ptr = memblock_alloc(size, size); @@ -252,7 +99,7 @@ static void __init __mapin_ram_chunk(unsigned long offset, unsigned long top) map_kernel_page(v, p, ktext ? PAGE_KERNEL_TEXT : PAGE_KERNEL); #ifdef CONFIG_PPC_BOOK3S_32 if (ktext) - hash_preload(&init_mm, v, false, 0x300); + hash_preload(&init_mm, v); #endif v += PAGE_SIZE; p += PAGE_SIZE; diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 9ad59b733984..e78832dce7bb 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* - * This file contains ioremap and related functions for 64-bit machines. + * This file contains pgtable related functions for 64-bit machines. * * Derived from arch/ppc64/mm/init.c * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) @@ -34,7 +34,6 @@ #include <asm/pgalloc.h> #include <asm/page.h> #include <asm/prom.h> -#include <asm/io.h> #include <asm/mmu_context.h> #include <asm/pgtable.h> #include <asm/mmu.h> @@ -98,208 +97,8 @@ unsigned long __pte_frag_nr; EXPORT_SYMBOL(__pte_frag_nr); unsigned long __pte_frag_size_shift; EXPORT_SYMBOL(__pte_frag_size_shift); -unsigned long ioremap_bot; -#else /* !CONFIG_PPC_BOOK3S_64 */ -unsigned long ioremap_bot = IOREMAP_BASE; #endif -int __weak ioremap_range(unsigned long ea, phys_addr_t pa, unsigned long size, pgprot_t prot, int nid) -{ - unsigned long i; - - for (i = 0; i < size; i += PAGE_SIZE) { - int err = map_kernel_page(ea + i, pa + i, prot); - if (err) { - if (slab_is_available()) - unmap_kernel_range(ea, size); - else - WARN_ON_ONCE(1); /* Should clean up */ - return err; - } - } - - return 0; -} - -/** - * __ioremap_at - Low level function to establish the page tables - * for an IO mapping - */ -void __iomem *__ioremap_at(phys_addr_t pa, void *ea, unsigned long size, pgprot_t prot) -{ - /* We don't support the 4K PFN hack with ioremap */ - if (pgprot_val(prot) & H_PAGE_4K_PFN) - return NULL; - - if ((ea + size) >= (void *)IOREMAP_END) { - pr_warn("Outside the supported range\n"); - return NULL; - } - - WARN_ON(pa & ~PAGE_MASK); - WARN_ON(((unsigned long)ea) & ~PAGE_MASK); - WARN_ON(size & ~PAGE_MASK); - - if (ioremap_range((unsigned long)ea, pa, size, prot, NUMA_NO_NODE)) - return NULL; - - return (void __iomem *)ea; -} - -/** - * __iounmap_from - Low level function to tear down the page tables - * for an IO mapping. This is used for mappings that - * are manipulated manually, like partial unmapping of - * PCI IOs or ISA space. - */ -void __iounmap_at(void *ea, unsigned long size) -{ - WARN_ON(((unsigned long)ea) & ~PAGE_MASK); - WARN_ON(size & ~PAGE_MASK); - - unmap_kernel_range((unsigned long)ea, size); -} - -void __iomem * __ioremap_caller(phys_addr_t addr, unsigned long size, - pgprot_t prot, void *caller) -{ - phys_addr_t paligned; - void __iomem *ret; - - /* - * Choose an address to map it to. - * Once the imalloc system is running, we use it. - * Before that, we map using addresses going - * up from ioremap_bot. imalloc will use - * the addresses from ioremap_bot through - * IMALLOC_END - * - */ - paligned = addr & PAGE_MASK; - size = PAGE_ALIGN(addr + size) - paligned; - - if ((size == 0) || (paligned == 0)) - return NULL; - - if (slab_is_available()) { - struct vm_struct *area; - - area = __get_vm_area_caller(size, VM_IOREMAP, - ioremap_bot, IOREMAP_END, - caller); - if (area == NULL) - return NULL; - - area->phys_addr = paligned; - ret = __ioremap_at(paligned, area->addr, size, prot); - } else { - ret = __ioremap_at(paligned, (void *)ioremap_bot, size, prot); - if (ret) - ioremap_bot += size; - } - - if (ret) - ret += addr & ~PAGE_MASK; - return ret; -} - -void __iomem * __ioremap(phys_addr_t addr, unsigned long size, - unsigned long flags) -{ - return __ioremap_caller(addr, size, __pgprot(flags), __builtin_return_address(0)); -} - -void __iomem * ioremap(phys_addr_t addr, unsigned long size) -{ - pgprot_t prot = pgprot_noncached(PAGE_KERNEL); - void *caller = __builtin_return_address(0); - - if (ppc_md.ioremap) - return ppc_md.ioremap(addr, size, prot, caller); - return __ioremap_caller(addr, size, prot, caller); -} - -void __iomem * ioremap_wc(phys_addr_t addr, unsigned long size) -{ - pgprot_t prot = pgprot_noncached_wc(PAGE_KERNEL); - void *caller = __builtin_return_address(0); - - if (ppc_md.ioremap) - return ppc_md.ioremap(addr, size, prot, caller); - return __ioremap_caller(addr, size, prot, caller); -} - -void __iomem *ioremap_coherent(phys_addr_t addr, unsigned long size) -{ - pgprot_t prot = pgprot_cached(PAGE_KERNEL); - void *caller = __builtin_return_address(0); - - if (ppc_md.ioremap) - return ppc_md.ioremap(addr, size, prot, caller); - return __ioremap_caller(addr, size, prot, caller); -} - -void __iomem * ioremap_prot(phys_addr_t addr, unsigned long size, - unsigned long flags) -{ - pte_t pte = __pte(flags); - void *caller = __builtin_return_address(0); - - /* writeable implies dirty for kernel addresses */ - if (pte_write(pte)) - pte = pte_mkdirty(pte); - - /* we don't want to let _PAGE_EXEC leak out */ - pte = pte_exprotect(pte); - /* - * Force kernel mapping. - */ - pte = pte_mkprivileged(pte); - - if (ppc_md.ioremap) - return ppc_md.ioremap(addr, size, pte_pgprot(pte), caller); - return __ioremap_caller(addr, size, pte_pgprot(pte), caller); -} - - -/* - * Unmap an IO region and remove it from imalloc'd list. - * Access to IO memory should be serialized by driver. - */ -void __iounmap(volatile void __iomem *token) -{ - void *addr; - - if (!slab_is_available()) - return; - - addr = (void *) ((unsigned long __force) - PCI_FIX_ADDR(token) & PAGE_MASK); - if ((unsigned long)addr < ioremap_bot) { - printk(KERN_WARNING "Attempt to iounmap early bolted mapping" - " at 0x%p\n", addr); - return; - } - vunmap(addr); -} - -void iounmap(volatile void __iomem *token) -{ - if (ppc_md.iounmap) - ppc_md.iounmap(token); - else - __iounmap(token); -} - -EXPORT_SYMBOL(ioremap); -EXPORT_SYMBOL(ioremap_wc); -EXPORT_SYMBOL(ioremap_prot); -EXPORT_SYMBOL(__ioremap); -EXPORT_SYMBOL(__ioremap_at); -EXPORT_SYMBOL(iounmap); -EXPORT_SYMBOL(__iounmap); -EXPORT_SYMBOL(__iounmap_at); - #ifndef __PAGETABLE_PUD_FOLDED /* 4 level page table */ struct page *pgd_page(pgd_t pgd) diff --git a/arch/powerpc/mm/ptdump/bats.c b/arch/powerpc/mm/ptdump/bats.c index a0d23e96e841..4154feac1da3 100644 --- a/arch/powerpc/mm/ptdump/bats.c +++ b/arch/powerpc/mm/ptdump/bats.c @@ -149,7 +149,7 @@ static int bats_show_603(struct seq_file *m, void *v) static int bats_open(struct inode *inode, struct file *file) { - if (cpu_has_feature(CPU_FTR_601)) + if (IS_ENABLED(CONFIG_PPC_BOOK3S_601)) return single_open(file, bats_show_601, NULL); return single_open(file, bats_show_603, NULL); diff --git a/arch/powerpc/mm/ptdump/hashpagetable.c b/arch/powerpc/mm/ptdump/hashpagetable.c index 72f0e4a3d839..a07278027c6f 100644 --- a/arch/powerpc/mm/ptdump/hashpagetable.c +++ b/arch/powerpc/mm/ptdump/hashpagetable.c @@ -237,7 +237,6 @@ static int native_find(unsigned long ea, int psize, bool primary, u64 *v, u64 return -1; } -#ifdef CONFIG_PPC_PSERIES static int pseries_find(unsigned long ea, int psize, bool primary, u64 *v, u64 *r) { struct hash_pte ptes[4]; @@ -274,7 +273,6 @@ static int pseries_find(unsigned long ea, int psize, bool primary, u64 *v, u64 * } return -1; } -#endif static void decode_r(int bps, unsigned long r, unsigned long *rpn, int *aps, unsigned long *lp_bits) @@ -316,10 +314,9 @@ static void decode_r(int bps, unsigned long r, unsigned long *rpn, int *aps, static int base_hpte_find(unsigned long ea, int psize, bool primary, u64 *v, u64 *r) { -#ifdef CONFIG_PPC_PSERIES - if (firmware_has_feature(FW_FEATURE_LPAR)) + if (IS_ENABLED(CONFIG_PPC_PSERIES) && firmware_has_feature(FW_FEATURE_LPAR)) return pseries_find(ea, psize, primary, v, r); -#endif + return native_find(ea, psize, primary, v, r); } @@ -386,12 +383,13 @@ static void walk_pte(struct pg_state *st, pmd_t *pmd, unsigned long start) psize = mmu_vmalloc_psize; else psize = mmu_io_psize; -#ifdef CONFIG_PPC_64K_PAGES + /* check for secret 4K mappings */ - if (((pteval & H_PAGE_COMBO) == H_PAGE_COMBO) || - ((pteval & H_PAGE_4K_PFN) == H_PAGE_4K_PFN)) + if (IS_ENABLED(CONFIG_PPC_64K_PAGES) && + ((pteval & H_PAGE_COMBO) == H_PAGE_COMBO || + (pteval & H_PAGE_4K_PFN) == H_PAGE_4K_PFN)) psize = mmu_io_psize; -#endif + /* check for hashpte */ status = hpte_find(st, addr, psize); @@ -469,9 +467,10 @@ static void walk_linearmapping(struct pg_state *st) static void walk_vmemmap(struct pg_state *st) { -#ifdef CONFIG_SPARSEMEM_VMEMMAP struct vmemmap_backing *ptr = vmemmap_list; + if (!IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)) + return; /* * Traverse the vmemmaped memory and dump pages that are in the hash * pagetable. @@ -481,7 +480,6 @@ static void walk_vmemmap(struct pg_state *st) ptr = ptr->list; } seq_puts(st->seq, "---[ vmemmap end ]---\n"); -#endif } static void populate_markers(void) @@ -495,11 +493,7 @@ static void populate_markers(void) address_markers[6].start_address = PHB_IO_END; address_markers[7].start_address = IOREMAP_BASE; address_markers[8].start_address = IOREMAP_END; -#ifdef CONFIG_PPC_BOOK3S_64 address_markers[9].start_address = H_VMEMMAP_START; -#else - address_markers[9].start_address = VMEMMAP_BASE; -#endif } static int ptdump_show(struct seq_file *m, void *v) diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index 6a88a9f585d4..2f9ddc29c535 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -26,10 +26,6 @@ #include "ptdump.h" -#ifdef CONFIG_PPC32 -#define KERN_VIRT_START PAGE_OFFSET -#endif - /* * To visualise what is happening, * @@ -88,10 +84,6 @@ static struct addr_marker address_markers[] = { #else { 0, "Early I/O remap start" }, { 0, "Early I/O remap end" }, -#ifdef CONFIG_NOT_COHERENT_CACHE - { 0, "Consistent mem start" }, - { 0, "Consistent mem end" }, -#endif #ifdef CONFIG_HIGHMEM { 0, "Highmem PTEs start" }, { 0, "Highmem PTEs end" }, @@ -181,7 +173,7 @@ static void dump_addr(struct pg_state *st, unsigned long addr) static void note_prot_wx(struct pg_state *st, unsigned long addr) { - if (!st->check_wx) + if (!IS_ENABLED(CONFIG_PPC_DEBUG_WX) || !st->check_wx) return; if (!((st->current_flags & pgprot_val(PAGE_KERNEL_X)) == pgprot_val(PAGE_KERNEL_X))) @@ -299,17 +291,15 @@ static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start) static void walk_pagetables(struct pg_state *st) { - pgd_t *pgd = pgd_offset_k(0UL); unsigned int i; - unsigned long addr; - - addr = st->start_address; + unsigned long addr = st->start_address & PGDIR_MASK; + pgd_t *pgd = pgd_offset_k(addr); /* * Traverse the linux pagetable structure and dump pages that are in * the hash pagetable. */ - for (i = 0; i < PTRS_PER_PGD; i++, pgd++, addr += PGDIR_SIZE) { + for (i = pgd_index(addr); i < PTRS_PER_PGD; i++, pgd++, addr += PGDIR_SIZE) { if (!pgd_none(*pgd) && !pgd_is_leaf(*pgd)) /* pgd exists */ walk_pud(st, pgd, addr); @@ -341,11 +331,6 @@ static void populate_markers(void) #else /* !CONFIG_PPC64 */ address_markers[i++].start_address = ioremap_bot; address_markers[i++].start_address = IOREMAP_TOP; -#ifdef CONFIG_NOT_COHERENT_CACHE - address_markers[i++].start_address = IOREMAP_TOP; - address_markers[i++].start_address = IOREMAP_TOP + - CONFIG_CONSISTENT_SIZE; -#endif #ifdef CONFIG_HIGHMEM address_markers[i++].start_address = PKMAP_BASE; address_markers[i++].start_address = PKMAP_ADDR(LAST_PKMAP); @@ -364,12 +349,13 @@ static int ptdump_show(struct seq_file *m, void *v) struct pg_state st = { .seq = m, .marker = address_markers, + .start_address = PAGE_OFFSET, }; - if (radix_enabled()) - st.start_address = PAGE_OFFSET; - else +#ifdef CONFIG_PPC64 + if (!radix_enabled()) st.start_address = KERN_VIRT_START; +#endif /* Traverse kernel page tables */ walk_pagetables(&st); @@ -407,12 +393,13 @@ void ptdump_check_wx(void) .seq = NULL, .marker = address_markers, .check_wx = true, + .start_address = PAGE_OFFSET, }; - if (radix_enabled()) - st.start_address = PAGE_OFFSET; - else +#ifdef CONFIG_PPC64 + if (!radix_enabled()) st.start_address = KERN_VIRT_START; +#endif walk_pagetables(&st); |