From 47d99948eee48a84a4b242c17915a4ff59a29b5d Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 29 Mar 2019 10:00:00 +0000 Subject: powerpc/mm: Move book3s64 specifics in subdirectory mm/book3s64 Many files in arch/powerpc/mm are only for book3S64. This patch creates a subdirectory for them. Signed-off-by: Christophe Leroy [mpe: Update the selftest sym links, shorten new filenames, cleanup some whitespace and formatting in the new files.] Signed-off-by: Michael Ellerman --- arch/powerpc/mm/Makefile | 25 +- arch/powerpc/mm/book3s64/Makefile | 24 + arch/powerpc/mm/book3s64/hash_4k.c | 124 ++ arch/powerpc/mm/book3s64/hash_64k.c | 333 +++++ arch/powerpc/mm/book3s64/hash_hugepage.c | 191 +++ arch/powerpc/mm/book3s64/hash_hugetlbpage.c | 152 ++ arch/powerpc/mm/book3s64/hash_native.c | 884 ++++++++++++ arch/powerpc/mm/book3s64/hash_pgtable.c | 463 ++++++ arch/powerpc/mm/book3s64/hash_tlb.c | 265 ++++ arch/powerpc/mm/book3s64/hash_utils.c | 1946 ++++++++++++++++++++++++++ arch/powerpc/mm/book3s64/iommu_api.c | 482 +++++++ arch/powerpc/mm/book3s64/mmu_context.c | 263 ++++ arch/powerpc/mm/book3s64/pgtable.c | 449 ++++++ arch/powerpc/mm/book3s64/pkeys.c | 428 ++++++ arch/powerpc/mm/book3s64/radix_hugetlbpage.c | 110 ++ arch/powerpc/mm/book3s64/radix_pgtable.c | 1124 +++++++++++++++ arch/powerpc/mm/book3s64/radix_tlb.c | 1101 +++++++++++++++ arch/powerpc/mm/book3s64/slb.c | 833 +++++++++++ arch/powerpc/mm/book3s64/subpage_prot.c | 289 ++++ arch/powerpc/mm/book3s64/vphn.c | 73 + arch/powerpc/mm/book3s64/vphn.h | 16 + arch/powerpc/mm/hash64_4k.c | 124 -- arch/powerpc/mm/hash64_64k.c | 333 ----- arch/powerpc/mm/hash_native_64.c | 884 ------------ arch/powerpc/mm/hash_utils_64.c | 1930 ------------------------- arch/powerpc/mm/hugepage-hash64.c | 191 --- arch/powerpc/mm/hugetlbpage-hash64.c | 147 -- arch/powerpc/mm/hugetlbpage-radix.c | 110 -- arch/powerpc/mm/mmu_context_book3s64.c | 263 ---- arch/powerpc/mm/mmu_context_iommu.c | 482 ------- arch/powerpc/mm/numa.c | 2 +- arch/powerpc/mm/pgtable-book3s64.c | 449 ------ arch/powerpc/mm/pgtable-hash64.c | 463 ------ arch/powerpc/mm/pgtable-radix.c | 1124 --------------- arch/powerpc/mm/pkeys.c | 428 ------ arch/powerpc/mm/slb.c | 832 ----------- arch/powerpc/mm/subpage-prot.c | 289 ---- arch/powerpc/mm/tlb-radix.c | 1101 --------------- arch/powerpc/mm/tlb_hash64.c | 259 ---- arch/powerpc/mm/vphn.c | 71 - arch/powerpc/mm/vphn.h | 17 - 41 files changed, 9554 insertions(+), 9520 deletions(-) create mode 100644 arch/powerpc/mm/book3s64/Makefile create mode 100644 arch/powerpc/mm/book3s64/hash_4k.c create mode 100644 arch/powerpc/mm/book3s64/hash_64k.c create mode 100644 arch/powerpc/mm/book3s64/hash_hugepage.c create mode 100644 arch/powerpc/mm/book3s64/hash_hugetlbpage.c create mode 100644 arch/powerpc/mm/book3s64/hash_native.c create mode 100644 arch/powerpc/mm/book3s64/hash_pgtable.c create mode 100644 arch/powerpc/mm/book3s64/hash_tlb.c create mode 100644 arch/powerpc/mm/book3s64/hash_utils.c create mode 100644 arch/powerpc/mm/book3s64/iommu_api.c create mode 100644 arch/powerpc/mm/book3s64/mmu_context.c create mode 100644 arch/powerpc/mm/book3s64/pgtable.c create mode 100644 arch/powerpc/mm/book3s64/pkeys.c create mode 100644 arch/powerpc/mm/book3s64/radix_hugetlbpage.c create mode 100644 arch/powerpc/mm/book3s64/radix_pgtable.c create mode 100644 arch/powerpc/mm/book3s64/radix_tlb.c create mode 100644 arch/powerpc/mm/book3s64/slb.c create mode 100644 arch/powerpc/mm/book3s64/subpage_prot.c create mode 100644 arch/powerpc/mm/book3s64/vphn.c create 
mode 100644 arch/powerpc/mm/book3s64/vphn.h delete mode 100644 arch/powerpc/mm/hash64_4k.c delete mode 100644 arch/powerpc/mm/hash64_64k.c delete mode 100644 arch/powerpc/mm/hash_native_64.c delete mode 100644 arch/powerpc/mm/hash_utils_64.c delete mode 100644 arch/powerpc/mm/hugepage-hash64.c delete mode 100644 arch/powerpc/mm/hugetlbpage-hash64.c delete mode 100644 arch/powerpc/mm/hugetlbpage-radix.c delete mode 100644 arch/powerpc/mm/mmu_context_book3s64.c delete mode 100644 arch/powerpc/mm/mmu_context_iommu.c delete mode 100644 arch/powerpc/mm/pgtable-book3s64.c delete mode 100644 arch/powerpc/mm/pgtable-hash64.c delete mode 100644 arch/powerpc/mm/pgtable-radix.c delete mode 100644 arch/powerpc/mm/pkeys.c delete mode 100644 arch/powerpc/mm/slb.c delete mode 100644 arch/powerpc/mm/subpage-prot.c delete mode 100644 arch/powerpc/mm/tlb-radix.c delete mode 100644 arch/powerpc/mm/tlb_hash64.c delete mode 100644 arch/powerpc/mm/vphn.c delete mode 100644 arch/powerpc/mm/vphn.h (limited to 'arch/powerpc') diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index 3c1bd9fa23cd..a137fdf775e2 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -5,53 +5,34 @@ ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) -CFLAGS_REMOVE_slb.o = $(CC_FLAGS_FTRACE) - obj-y := fault.o mem.o pgtable.o mmap.o \ init_$(BITS).o pgtable_$(BITS).o \ init-common.o mmu_context.o drmem.o obj-$(CONFIG_PPC_MMU_NOHASH) += mmu_context_nohash.o tlb_nohash.o \ tlb_nohash_low.o obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(BITS)e.o -hash64-$(CONFIG_PPC_NATIVE) := hash_native_64.o obj-$(CONFIG_PPC_BOOK3E_64) += pgtable-book3e.o -obj-$(CONFIG_PPC_BOOK3S_64) += pgtable-hash64.o hash_utils_64.o slb.o \ - $(hash64-y) mmu_context_book3s64.o \ - pgtable-book3s64.o pgtable-frag.o +obj-$(CONFIG_PPC_BOOK3S_64) += book3s64/ +obj-$(CONFIG_PPC_BOOK3S_64) += pgtable-frag.o obj-$(CONFIG_PPC32) += pgtable-frag.o -obj-$(CONFIG_PPC_RADIX_MMU) += pgtable-radix.o tlb-radix.o obj-$(CONFIG_PPC_BOOK3S_32) += ppc_mmu_32.o hash_low_32.o mmu_context_hash32.o -obj-$(CONFIG_PPC_BOOK3S) += tlb_hash$(BITS).o -ifdef CONFIG_PPC_BOOK3S_64 -obj-$(CONFIG_PPC_4K_PAGES) += hash64_4k.o -obj-$(CONFIG_PPC_64K_PAGES) += hash64_64k.o -endif +obj-$(CONFIG_PPC_BOOK3S_32) += tlb_hash32.o obj-$(CONFIG_40x) += 40x_mmu.o obj-$(CONFIG_44x) += 44x_mmu.o obj-$(CONFIG_PPC_8xx) += 8xx_mmu.o obj-$(CONFIG_PPC_FSL_BOOK3E) += fsl_booke_mmu.o obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o -obj-$(CONFIG_PPC_SPLPAR) += vphn.o obj-$(CONFIG_PPC_MM_SLICES) += slice.o obj-y += hugetlbpage.o ifdef CONFIG_HUGETLB_PAGE -obj-$(CONFIG_PPC_BOOK3S_64) += hugetlbpage-hash64.o -obj-$(CONFIG_PPC_RADIX_MMU) += hugetlbpage-radix.o obj-$(CONFIG_PPC_BOOK3E_MMU) += hugetlbpage-book3e.o endif -obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hugepage-hash64.o -obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage-prot.o obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o obj-$(CONFIG_HIGHMEM) += highmem.o obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o -obj-$(CONFIG_SPAPR_TCE_IOMMU) += mmu_context_iommu.o obj-$(CONFIG_PPC_PTDUMP) += ptdump/ -obj-$(CONFIG_PPC_MEM_KEYS) += pkeys.o # Disable kcov instrumentation on sensitive code # This is necessary for booting with kcov enabled on book3e machines KCOV_INSTRUMENT_tlb_nohash.o := n KCOV_INSTRUMENT_fsl_booke_mmu.o := n - -# Instrumenting the SLB fault path can lead to duplicate SLB entries -KCOV_INSTRUMENT_slb.o := n diff --git a/arch/powerpc/mm/book3s64/Makefile b/arch/powerpc/mm/book3s64/Makefile new file mode 100644 index 000000000000..974b4fc19f4f --- /dev/null +++ 
b/arch/powerpc/mm/book3s64/Makefile @@ -0,0 +1,24 @@ +# SPDX-License-Identifier: GPL-2.0 + +ccflags-y := $(NO_MINIMAL_TOC) + +CFLAGS_REMOVE_slb.o = $(CC_FLAGS_FTRACE) + +obj-y += hash_pgtable.o hash_utils.o slb.o \ + mmu_context.o pgtable.o hash_tlb.o +obj-$(CONFIG_PPC_NATIVE) += hash_native.o +obj-$(CONFIG_PPC_RADIX_MMU) += radix_pgtable.o radix_tlb.o +obj-$(CONFIG_PPC_4K_PAGES) += hash_4k.o +obj-$(CONFIG_PPC_64K_PAGES) += hash_64k.o +obj-$(CONFIG_PPC_SPLPAR) += vphn.o +obj-$(CONFIG_HUGETLB_PAGE) += hash_hugetlbpage.o +ifdef CONFIG_HUGETLB_PAGE +obj-$(CONFIG_PPC_RADIX_MMU) += radix_hugetlbpage.o +endif +obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hash_hugepage.o +obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage_prot.o +obj-$(CONFIG_SPAPR_TCE_IOMMU) += iommu_api.o +obj-$(CONFIG_PPC_MEM_KEYS) += pkeys.o + +# Instrumenting the SLB fault path can lead to duplicate SLB entries +KCOV_INSTRUMENT_slb.o := n diff --git a/arch/powerpc/mm/book3s64/hash_4k.c b/arch/powerpc/mm/book3s64/hash_4k.c new file mode 100644 index 000000000000..22e787123cdf --- /dev/null +++ b/arch/powerpc/mm/book3s64/hash_4k.c @@ -0,0 +1,124 @@ +/* + * Copyright IBM Corporation, 2015 + * Author Aneesh Kumar K.V + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + */ + +#include +#include +#include + +int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, + pte_t *ptep, unsigned long trap, unsigned long flags, + int ssize, int subpg_prot) +{ + real_pte_t rpte; + unsigned long hpte_group; + unsigned long rflags, pa; + unsigned long old_pte, new_pte; + unsigned long vpn, hash, slot; + unsigned long shift = mmu_psize_defs[MMU_PAGE_4K].shift; + + /* + * atomically mark the linux large page PTE busy and dirty + */ + do { + pte_t pte = READ_ONCE(*ptep); + + old_pte = pte_val(pte); + /* If PTE busy, retry the access */ + if (unlikely(old_pte & H_PAGE_BUSY)) + return 0; + /* If PTE permissions don't match, take page fault */ + if (unlikely(!check_pte_access(access, old_pte))) + return 1; + /* + * Try to lock the PTE, add ACCESSED and DIRTY if it was + * a write access. Since this is 4K insert of 64K page size + * also add H_PAGE_COMBO + */ + new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED; + if (access & _PAGE_WRITE) + new_pte |= _PAGE_DIRTY; + } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); + + /* + * PP bits. 
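/*
 * Illustrative sketch, not part of this patch: __hash_page_4K() above takes a
 * lightweight "lock" on the Linux PTE by atomically setting H_PAGE_BUSY (plus
 * ACCESSED/DIRTY) in a compare-and-exchange retry loop (pte_xchg).  Below is a
 * minimal stand-alone version of that pattern using C11 atomics; the entry
 * layout and the BUSY/ACCESSED/DIRTY bit values are hypothetical stand-ins.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define SKETCH_BUSY     0x1ULL
#define SKETCH_ACCESSED 0x2ULL
#define SKETCH_DIRTY    0x4ULL

/* Returns false if the entry was already busy (caller retries the access). */
static bool sketch_lock_entry(_Atomic uint64_t *entry, bool write)
{
	uint64_t old, new;

	do {
		old = atomic_load(entry);
		if (old & SKETCH_BUSY)
			return false;		/* someone else owns it */
		new = old | SKETCH_BUSY | SKETCH_ACCESSED;
		if (write)
			new |= SKETCH_DIRTY;
		/* CAS fails and we retry if the entry changed under us */
	} while (!atomic_compare_exchange_weak(entry, &old, new));

	return true;
}

/* Publishing the final value with the busy bit cleared releases the "lock". */
static void sketch_unlock_entry(_Atomic uint64_t *entry, uint64_t final)
{
	atomic_store(entry, final & ~SKETCH_BUSY);
}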
_PAGE_USER is already PP bit 0x2, so we only + * need to add in 0x1 if it's a read-only user page + */ + rflags = htab_convert_pte_flags(new_pte); + rpte = __real_pte(__pte(old_pte), ptep, PTRS_PER_PTE); + + if (cpu_has_feature(CPU_FTR_NOEXECUTE) && + !cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) + rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); + + vpn = hpt_vpn(ea, vsid, ssize); + if (unlikely(old_pte & H_PAGE_HASHPTE)) { + /* + * There MIGHT be an HPTE for this pte + */ + unsigned long gslot = pte_get_hash_gslot(vpn, shift, ssize, + rpte, 0); + + if (mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, MMU_PAGE_4K, + MMU_PAGE_4K, ssize, flags) == -1) + old_pte &= ~_PAGE_HPTEFLAGS; + } + + if (likely(!(old_pte & H_PAGE_HASHPTE))) { + + pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; + hash = hpt_hash(vpn, shift, ssize); + +repeat: + hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; + + /* Insert into the hash table, primary slot */ + slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0, + MMU_PAGE_4K, MMU_PAGE_4K, ssize); + /* + * Primary is full, try the secondary + */ + if (unlikely(slot == -1)) { + hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP; + slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, + rflags, + HPTE_V_SECONDARY, + MMU_PAGE_4K, + MMU_PAGE_4K, ssize); + if (slot == -1) { + if (mftb() & 0x1) + hpte_group = (hash & htab_hash_mask) * + HPTES_PER_GROUP; + mmu_hash_ops.hpte_remove(hpte_group); + /* + * FIXME!! Should be try the group from which we removed ? + */ + goto repeat; + } + } + /* + * Hypervisor failure. Restore old pte and return -1 + * similar to __hash_page_* + */ + if (unlikely(slot == -2)) { + *ptep = __pte(old_pte); + hash_failure_debug(ea, access, vsid, trap, ssize, + MMU_PAGE_4K, MMU_PAGE_4K, old_pte); + return -1; + } + new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE; + new_pte |= pte_set_hidx(ptep, rpte, 0, slot, PTRS_PER_PTE); + } + *ptep = __pte(new_pte & ~H_PAGE_BUSY); + return 0; +} diff --git a/arch/powerpc/mm/book3s64/hash_64k.c b/arch/powerpc/mm/book3s64/hash_64k.c new file mode 100644 index 000000000000..7084ce2951e6 --- /dev/null +++ b/arch/powerpc/mm/book3s64/hash_64k.c @@ -0,0 +1,333 @@ +/* + * Copyright IBM Corporation, 2015 + * Author Aneesh Kumar K.V + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + */ + +#include +#include +#include + +/* + * Return true, if the entry has a slot value which + * the software considers as invalid. 
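/*
 * Illustrative sketch, not part of this patch: the repeat/evict logic above
 * tries the primary PTE group, falls back to the secondary group (~hash), and
 * when both are full evicts a pseudo-randomly chosen entry and retries.  A
 * simplified stand-alone model of that policy follows; the group geometry,
 * the rand()-based eviction and the flat table are hypothetical stand-ins
 * for HPTES_PER_GROUP, mftb() and the real hashed page table.
 */
#include <stdint.h>
#include <stdlib.h>

#define SK_GROUP_SIZE 8
#define SK_NR_GROUPS  1024			/* must be a power of two */
#define SK_HASH_MASK  (SK_NR_GROUPS - 1)

static uint64_t sk_table[SK_NR_GROUPS * SK_GROUP_SIZE];

static long sk_try_group(unsigned long group, uint64_t entry)
{
	for (int i = 0; i < SK_GROUP_SIZE; i++) {
		if (!sk_table[group + i]) {
			sk_table[group + i] = entry;
			return i;
		}
	}
	return -1;				/* group full */
}

static long sk_insert(unsigned long hash, uint64_t entry)
{
	unsigned long group;
	long slot;

	for (;;) {
		/* primary group first */
		group = (hash & SK_HASH_MASK) * SK_GROUP_SIZE;
		slot = sk_try_group(group, entry);
		if (slot >= 0)
			return slot;

		/* primary full: try the secondary group */
		group = (~hash & SK_HASH_MASK) * SK_GROUP_SIZE;
		slot = sk_try_group(group, entry);
		if (slot >= 0)
			return slot | 0x8;	/* remember it went secondary */

		/* both full: evict a random slot from one of the groups, retry */
		if (rand() & 0x1)
			group = (hash & SK_HASH_MASK) * SK_GROUP_SIZE;
		sk_table[group + (rand() & (SK_GROUP_SIZE - 1))] = 0;
	}
}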
+ */ +static inline bool hpte_soft_invalid(unsigned long hidx) +{ + return ((hidx & 0xfUL) == 0xfUL); +} + +/* + * index from 0 - 15 + */ +bool __rpte_sub_valid(real_pte_t rpte, unsigned long index) +{ + return !(hpte_soft_invalid(__rpte_to_hidx(rpte, index))); +} + +int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, + pte_t *ptep, unsigned long trap, unsigned long flags, + int ssize, int subpg_prot) +{ + real_pte_t rpte; + unsigned long hpte_group; + unsigned int subpg_index; + unsigned long rflags, pa; + unsigned long old_pte, new_pte, subpg_pte; + unsigned long vpn, hash, slot, gslot; + unsigned long shift = mmu_psize_defs[MMU_PAGE_4K].shift; + + /* + * atomically mark the linux large page PTE busy and dirty + */ + do { + pte_t pte = READ_ONCE(*ptep); + + old_pte = pte_val(pte); + /* If PTE busy, retry the access */ + if (unlikely(old_pte & H_PAGE_BUSY)) + return 0; + /* If PTE permissions don't match, take page fault */ + if (unlikely(!check_pte_access(access, old_pte))) + return 1; + /* + * Try to lock the PTE, add ACCESSED and DIRTY if it was + * a write access. Since this is 4K insert of 64K page size + * also add H_PAGE_COMBO + */ + new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED | H_PAGE_COMBO; + if (access & _PAGE_WRITE) + new_pte |= _PAGE_DIRTY; + } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); + + /* + * Handle the subpage protection bits + */ + subpg_pte = new_pte & ~subpg_prot; + rflags = htab_convert_pte_flags(subpg_pte); + + if (cpu_has_feature(CPU_FTR_NOEXECUTE) && + !cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) { + + /* + * No CPU has hugepages but lacks no execute, so we + * don't need to worry about that case + */ + rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); + } + + subpg_index = (ea & (PAGE_SIZE - 1)) >> shift; + vpn = hpt_vpn(ea, vsid, ssize); + rpte = __real_pte(__pte(old_pte), ptep, PTRS_PER_PTE); + /* + *None of the sub 4k page is hashed + */ + if (!(old_pte & H_PAGE_HASHPTE)) + goto htab_insert_hpte; + /* + * Check if the pte was already inserted into the hash table + * as a 64k HW page, and invalidate the 64k HPTE if so. + */ + if (!(old_pte & H_PAGE_COMBO)) { + flush_hash_page(vpn, rpte, MMU_PAGE_64K, ssize, flags); + /* + * clear the old slot details from the old and new pte. + * On hash insert failure we use old pte value and we don't + * want slot information there if we have a insert failure. + */ + old_pte &= ~H_PAGE_HASHPTE; + new_pte &= ~H_PAGE_HASHPTE; + goto htab_insert_hpte; + } + /* + * Check for sub page valid and update + */ + if (__rpte_sub_valid(rpte, subpg_index)) { + int ret; + + gslot = pte_get_hash_gslot(vpn, shift, ssize, rpte, + subpg_index); + ret = mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, + MMU_PAGE_4K, MMU_PAGE_4K, + ssize, flags); + + /* + * If we failed because typically the HPTE wasn't really here + * we try an insertion. + */ + if (ret == -1) + goto htab_insert_hpte; + + *ptep = __pte(new_pte & ~H_PAGE_BUSY); + return 0; + } + +htab_insert_hpte: + + /* + * Initialize all hidx entries to invalid value, the first time + * the PTE is about to allocate a 4K HPTE. + */ + if (!(old_pte & H_PAGE_COMBO)) + rpte.hidx = INVALID_RPTE_HIDX; + + /* + * handle H_PAGE_4K_PFN case + */ + if (old_pte & H_PAGE_4K_PFN) { + /* + * All the sub 4k page have the same + * physical address. 
+ */ + pa = pte_pfn(__pte(old_pte)) << HW_PAGE_SHIFT; + } else { + pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; + pa += (subpg_index << shift); + } + hash = hpt_hash(vpn, shift, ssize); +repeat: + hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; + + /* Insert into the hash table, primary slot */ + slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0, + MMU_PAGE_4K, MMU_PAGE_4K, ssize); + /* + * Primary is full, try the secondary + */ + if (unlikely(slot == -1)) { + bool soft_invalid; + + hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP; + slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, + rflags, HPTE_V_SECONDARY, + MMU_PAGE_4K, MMU_PAGE_4K, + ssize); + + soft_invalid = hpte_soft_invalid(slot); + if (unlikely(soft_invalid)) { + /* + * We got a valid slot from a hardware point of view. + * but we cannot use it, because we use this special + * value; as defined by hpte_soft_invalid(), to track + * invalid slots. We cannot use it. So invalidate it. + */ + gslot = slot & _PTEIDX_GROUP_IX; + mmu_hash_ops.hpte_invalidate(hpte_group + gslot, vpn, + MMU_PAGE_4K, MMU_PAGE_4K, + ssize, 0); + } + + if (unlikely(slot == -1 || soft_invalid)) { + /* + * For soft invalid slot, let's ensure that we release a + * slot from the primary, with the hope that we will + * acquire that slot next time we try. This will ensure + * that we do not get the same soft-invalid slot. + */ + if (soft_invalid || (mftb() & 0x1)) + hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; + + mmu_hash_ops.hpte_remove(hpte_group); + /* + * FIXME!! Should be try the group from which we removed ? + */ + goto repeat; + } + } + /* + * Hypervisor failure. Restore old pte and return -1 + * similar to __hash_page_* + */ + if (unlikely(slot == -2)) { + *ptep = __pte(old_pte); + hash_failure_debug(ea, access, vsid, trap, ssize, + MMU_PAGE_4K, MMU_PAGE_4K, old_pte); + return -1; + } + + new_pte |= pte_set_hidx(ptep, rpte, subpg_index, slot, PTRS_PER_PTE); + new_pte |= H_PAGE_HASHPTE; + + *ptep = __pte(new_pte & ~H_PAGE_BUSY); + return 0; +} + +int __hash_page_64K(unsigned long ea, unsigned long access, + unsigned long vsid, pte_t *ptep, unsigned long trap, + unsigned long flags, int ssize) +{ + real_pte_t rpte; + unsigned long hpte_group; + unsigned long rflags, pa; + unsigned long old_pte, new_pte; + unsigned long vpn, hash, slot; + unsigned long shift = mmu_psize_defs[MMU_PAGE_64K].shift; + + /* + * atomically mark the linux large page PTE busy and dirty + */ + do { + pte_t pte = READ_ONCE(*ptep); + + old_pte = pte_val(pte); + /* If PTE busy, retry the access */ + if (unlikely(old_pte & H_PAGE_BUSY)) + return 0; + /* If PTE permissions don't match, take page fault */ + if (unlikely(!check_pte_access(access, old_pte))) + return 1; + /* + * Check if PTE has the cache-inhibit bit set + * If so, bail out and refault as a 4k page + */ + if (!mmu_has_feature(MMU_FTR_CI_LARGE_PAGE) && + unlikely(pte_ci(pte))) + return 0; + /* + * Try to lock the PTE, add ACCESSED and DIRTY if it was + * a write access. 
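/*
 * Illustrative sketch, not part of this patch: in the 64K-page code above a
 * 64K Linux page is backed by sixteen 4K hardware subpages.  The faulting
 * subpage is (ea & (PAGE_SIZE - 1)) >> 12, and each subpage's HPTE slot is
 * remembered as a 4-bit "hidx", with 0xf reserved as the software-invalid
 * marker (hpte_soft_invalid).  A stand-alone model of that nibble packing;
 * the single packed word used here is hypothetical, the real kernel spreads
 * the hidx bits across the PTE fragment.
 */
#include <stdbool.h>
#include <stdint.h>

#define SK_SUBPAGES     16
#define SK_HIDX_INVALID 0xfULL

static unsigned int sk_subpg_index(unsigned long ea)
{
	return (ea & 0xffffUL) >> 12;	/* offset within the 64K page */
}

static uint64_t sk_set_hidx(uint64_t packed, unsigned int index, uint64_t hidx)
{
	packed &= ~(0xfULL << (index * 4));
	return packed | ((hidx & 0xfULL) << (index * 4));
}

static uint64_t sk_get_hidx(uint64_t packed, unsigned int index)
{
	return (packed >> (index * 4)) & 0xfULL;
}

static bool sk_subpg_valid(uint64_t packed, unsigned int index)
{
	return sk_get_hidx(packed, index) != SK_HIDX_INVALID;
}

/* All subpages start out soft-invalid, i.e. every nibble is 0xf. */
static const uint64_t SK_ALL_INVALID = ~0ULL;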
+ */ + new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED; + if (access & _PAGE_WRITE) + new_pte |= _PAGE_DIRTY; + } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); + + rflags = htab_convert_pte_flags(new_pte); + rpte = __real_pte(__pte(old_pte), ptep, PTRS_PER_PTE); + + if (cpu_has_feature(CPU_FTR_NOEXECUTE) && + !cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) + rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); + + vpn = hpt_vpn(ea, vsid, ssize); + if (unlikely(old_pte & H_PAGE_HASHPTE)) { + unsigned long gslot; + + /* + * There MIGHT be an HPTE for this pte + */ + gslot = pte_get_hash_gslot(vpn, shift, ssize, rpte, 0); + if (mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, MMU_PAGE_64K, + MMU_PAGE_64K, ssize, + flags) == -1) + old_pte &= ~_PAGE_HPTEFLAGS; + } + + if (likely(!(old_pte & H_PAGE_HASHPTE))) { + + pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; + hash = hpt_hash(vpn, shift, ssize); + +repeat: + hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; + + /* Insert into the hash table, primary slot */ + slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0, + MMU_PAGE_64K, MMU_PAGE_64K, + ssize); + /* + * Primary is full, try the secondary + */ + if (unlikely(slot == -1)) { + hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP; + slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, + rflags, + HPTE_V_SECONDARY, + MMU_PAGE_64K, + MMU_PAGE_64K, ssize); + if (slot == -1) { + if (mftb() & 0x1) + hpte_group = (hash & htab_hash_mask) * + HPTES_PER_GROUP; + mmu_hash_ops.hpte_remove(hpte_group); + /* + * FIXME!! Should be try the group from which we removed ? + */ + goto repeat; + } + } + /* + * Hypervisor failure. Restore old pte and return -1 + * similar to __hash_page_* + */ + if (unlikely(slot == -2)) { + *ptep = __pte(old_pte); + hash_failure_debug(ea, access, vsid, trap, ssize, + MMU_PAGE_64K, MMU_PAGE_64K, old_pte); + return -1; + } + + new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE; + new_pte |= pte_set_hidx(ptep, rpte, 0, slot, PTRS_PER_PTE); + } + *ptep = __pte(new_pte & ~H_PAGE_BUSY); + return 0; +} diff --git a/arch/powerpc/mm/book3s64/hash_hugepage.c b/arch/powerpc/mm/book3s64/hash_hugepage.c new file mode 100644 index 000000000000..440823797de7 --- /dev/null +++ b/arch/powerpc/mm/book3s64/hash_hugepage.c @@ -0,0 +1,191 @@ +/* + * Copyright IBM Corporation, 2013 + * Author Aneesh Kumar K.V + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ * + */ + +/* + * PPC64 THP Support for hash based MMUs + */ +#include +#include + +int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid, + pmd_t *pmdp, unsigned long trap, unsigned long flags, + int ssize, unsigned int psize) +{ + unsigned int index, valid; + unsigned char *hpte_slot_array; + unsigned long rflags, pa, hidx; + unsigned long old_pmd, new_pmd; + int ret, lpsize = MMU_PAGE_16M; + unsigned long vpn, hash, shift, slot; + + /* + * atomically mark the linux large page PMD busy and dirty + */ + do { + pmd_t pmd = READ_ONCE(*pmdp); + + old_pmd = pmd_val(pmd); + /* If PMD busy, retry the access */ + if (unlikely(old_pmd & H_PAGE_BUSY)) + return 0; + /* If PMD permissions don't match, take page fault */ + if (unlikely(!check_pte_access(access, old_pmd))) + return 1; + /* + * Try to lock the PTE, add ACCESSED and DIRTY if it was + * a write access + */ + new_pmd = old_pmd | H_PAGE_BUSY | _PAGE_ACCESSED; + if (access & _PAGE_WRITE) + new_pmd |= _PAGE_DIRTY; + } while (!pmd_xchg(pmdp, __pmd(old_pmd), __pmd(new_pmd))); + + /* + * Make sure this is thp or devmap entry + */ + if (!(old_pmd & (H_PAGE_THP_HUGE | _PAGE_DEVMAP))) + return 0; + + rflags = htab_convert_pte_flags(new_pmd); + +#if 0 + if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) { + + /* + * No CPU has hugepages but lacks no execute, so we + * don't need to worry about that case + */ + rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); + } +#endif + /* + * Find the slot index details for this ea, using base page size. + */ + shift = mmu_psize_defs[psize].shift; + index = (ea & ~HPAGE_PMD_MASK) >> shift; + BUG_ON(index >= PTE_FRAG_SIZE); + + vpn = hpt_vpn(ea, vsid, ssize); + hpte_slot_array = get_hpte_slot_array(pmdp); + if (psize == MMU_PAGE_4K) { + /* + * invalidate the old hpte entry if we have that mapped via 64K + * base page size. This is because demote_segment won't flush + * hash page table entries. + */ + if ((old_pmd & H_PAGE_HASHPTE) && !(old_pmd & H_PAGE_COMBO)) { + flush_hash_hugepage(vsid, ea, pmdp, MMU_PAGE_64K, + ssize, flags); + /* + * With THP, we also clear the slot information with + * respect to all the 64K hash pte mapping the 16MB + * page. They are all invalid now. This make sure we + * don't find the slot valid when we fault with 4k + * base page size. + * + */ + memset(hpte_slot_array, 0, PTE_FRAG_SIZE); + } + } + + valid = hpte_valid(hpte_slot_array, index); + if (valid) { + /* update the hpte bits */ + hash = hpt_hash(vpn, shift, ssize); + hidx = hpte_hash_index(hpte_slot_array, index); + if (hidx & _PTEIDX_SECONDARY) + hash = ~hash; + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + slot += hidx & _PTEIDX_GROUP_IX; + + ret = mmu_hash_ops.hpte_updatepp(slot, rflags, vpn, + psize, lpsize, ssize, flags); + /* + * We failed to update, try to insert a new entry. + */ + if (ret == -1) { + /* + * large pte is marked busy, so we can be sure + * nobody is looking at hpte_slot_array. hence we can + * safely update this here. 
+ */ + valid = 0; + hpte_slot_array[index] = 0; + } + } + + if (!valid) { + unsigned long hpte_group; + + hash = hpt_hash(vpn, shift, ssize); + /* insert new entry */ + pa = pmd_pfn(__pmd(old_pmd)) << PAGE_SHIFT; + new_pmd |= H_PAGE_HASHPTE; + +repeat: + hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; + + /* Insert into the hash table, primary slot */ + slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0, + psize, lpsize, ssize); + /* + * Primary is full, try the secondary + */ + if (unlikely(slot == -1)) { + hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP; + slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, + rflags, + HPTE_V_SECONDARY, + psize, lpsize, ssize); + if (slot == -1) { + if (mftb() & 0x1) + hpte_group = (hash & htab_hash_mask) * + HPTES_PER_GROUP; + + mmu_hash_ops.hpte_remove(hpte_group); + goto repeat; + } + } + /* + * Hypervisor failure. Restore old pmd and return -1 + * similar to __hash_page_* + */ + if (unlikely(slot == -2)) { + *pmdp = __pmd(old_pmd); + hash_failure_debug(ea, access, vsid, trap, ssize, + psize, lpsize, old_pmd); + return -1; + } + /* + * large pte is marked busy, so we can be sure + * nobody is looking at hpte_slot_array. hence we can + * safely update this here. + */ + mark_hpte_slot_valid(hpte_slot_array, index, slot); + } + /* + * Mark the pte with H_PAGE_COMBO, if we are trying to hash it with + * base page size 4k. + */ + if (psize == MMU_PAGE_4K) + new_pmd |= H_PAGE_COMBO; + /* + * The hpte valid is stored in the pgtable whose address is in the + * second half of the PMD. Order this against clearing of the busy bit in + * huge pmd. + */ + smp_wmb(); + *pmdp = __pmd(new_pmd & ~H_PAGE_BUSY); + return 0; +} diff --git a/arch/powerpc/mm/book3s64/hash_hugetlbpage.c b/arch/powerpc/mm/book3s64/hash_hugetlbpage.c new file mode 100644 index 000000000000..2d4e02aa15a3 --- /dev/null +++ b/arch/powerpc/mm/book3s64/hash_hugetlbpage.c @@ -0,0 +1,152 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * PPC64 Huge TLB Page Support for hash based MMUs (POWER4 and later) + * + * Copyright (C) 2003 David Gibson, IBM Corporation. + * + * Based on the IA-32 version: + * Copyright (C) 2002, Rohit Seth + */ + +#include +#include +#include +#include +#include +#include + +extern long hpte_insert_repeating(unsigned long hash, unsigned long vpn, + unsigned long pa, unsigned long rlags, + unsigned long vflags, int psize, int ssize); + +int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, + pte_t *ptep, unsigned long trap, unsigned long flags, + int ssize, unsigned int shift, unsigned int mmu_psize) +{ + real_pte_t rpte; + unsigned long vpn; + unsigned long old_pte, new_pte; + unsigned long rflags, pa; + long slot, offset; + + BUG_ON(shift != mmu_psize_defs[mmu_psize].shift); + + /* Search the Linux page table for a match with va */ + vpn = hpt_vpn(ea, vsid, ssize); + + /* + * At this point, we have a pte (old_pte) which can be used to build + * or update an HPTE. There are 2 cases: + * + * 1. There is a valid (present) pte with no associated HPTE (this is + * the most common case) + * 2. There is a valid (present) pte with an associated HPTE. The + * current values of the pp bits in the HPTE prevent access + * because we are doing software DIRTY bit management and the + * page is currently not DIRTY. 
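/*
 * Illustrative sketch, not part of this patch: __hash_page_thp() above keeps
 * one byte of state per base page of the huge page in hpte_slot_array (a
 * valid bit plus the hash-slot index), so the individual 64K/4K HPTEs backing
 * the 16M mapping can be found and invalidated later.  A stand-alone model of
 * that bookkeeping; this byte layout is hypothetical and is not the encoding
 * used by the real hpte_valid()/hpte_hash_index()/mark_hpte_slot_valid().
 */
#include <stdbool.h>
#include <stdint.h>

#define SK_THP_SUBPAGES 256		/* e.g. 16M huge page / 64K base pages */
#define SK_VALID_BIT    0x80U

static uint8_t sk_slot_array[SK_THP_SUBPAGES];

static bool sk_slot_valid(unsigned int index)
{
	return sk_slot_array[index] & SK_VALID_BIT;
}

static unsigned int sk_slot_hidx(unsigned int index)
{
	return sk_slot_array[index] & 0x7f;
}

static void sk_mark_slot_valid(unsigned int index, unsigned int hidx)
{
	sk_slot_array[index] = SK_VALID_BIT | (hidx & 0x7f);
}

/* Demoting the segment simply wipes the array, like the memset() above. */
static void sk_invalidate_all(void)
{
	for (unsigned int i = 0; i < SK_THP_SUBPAGES; i++)
		sk_slot_array[i] = 0;
}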
+ */ + + + do { + old_pte = pte_val(*ptep); + /* If PTE busy, retry the access */ + if (unlikely(old_pte & H_PAGE_BUSY)) + return 0; + /* If PTE permissions don't match, take page fault */ + if (unlikely(!check_pte_access(access, old_pte))) + return 1; + + /* + * Try to lock the PTE, add ACCESSED and DIRTY if it was + * a write access + */ + new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED; + if (access & _PAGE_WRITE) + new_pte |= _PAGE_DIRTY; + } while(!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); + + /* Make sure this is a hugetlb entry */ + if (old_pte & (H_PAGE_THP_HUGE | _PAGE_DEVMAP)) + return 0; + + rflags = htab_convert_pte_flags(new_pte); + if (unlikely(mmu_psize == MMU_PAGE_16G)) + offset = PTRS_PER_PUD; + else + offset = PTRS_PER_PMD; + rpte = __real_pte(__pte(old_pte), ptep, offset); + + if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) + /* + * No CPU has hugepages but lacks no execute, so we + * don't need to worry about that case + */ + rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); + + /* Check if pte already has an hpte (case 2) */ + if (unlikely(old_pte & H_PAGE_HASHPTE)) { + /* There MIGHT be an HPTE for this pte */ + unsigned long gslot; + + gslot = pte_get_hash_gslot(vpn, shift, ssize, rpte, 0); + if (mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, mmu_psize, + mmu_psize, ssize, flags) == -1) + old_pte &= ~_PAGE_HPTEFLAGS; + } + + if (likely(!(old_pte & H_PAGE_HASHPTE))) { + unsigned long hash = hpt_hash(vpn, shift, ssize); + + pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; + + /* clear HPTE slot informations in new PTE */ + new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE; + + slot = hpte_insert_repeating(hash, vpn, pa, rflags, 0, + mmu_psize, ssize); + + /* + * Hypervisor failure. Restore old pte and return -1 + * similar to __hash_page_* + */ + if (unlikely(slot == -2)) { + *ptep = __pte(old_pte); + hash_failure_debug(ea, access, vsid, trap, ssize, + mmu_psize, mmu_psize, old_pte); + return -1; + } + + new_pte |= pte_set_hidx(ptep, rpte, 0, slot, offset); + } + + /* + * No need to use ldarx/stdcx here + */ + *ptep = __pte(new_pte & ~H_PAGE_BUSY); + return 0; +} + +pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + unsigned long pte_val; + /* + * Clear the _PAGE_PRESENT so that no hardware parallel update is + * possible. Also keep the pte_present true so that we don't take + * wrong fault. + */ + pte_val = pte_update(vma->vm_mm, addr, ptep, + _PAGE_PRESENT, _PAGE_INVALID, 1); + + return __pte(pte_val); +} + +void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, + pte_t *ptep, pte_t old_pte, pte_t pte) +{ + + if (radix_enabled()) + return radix__huge_ptep_modify_prot_commit(vma, addr, ptep, + old_pte, pte); + set_huge_pte_at(vma->vm_mm, addr, ptep, pte); +} diff --git a/arch/powerpc/mm/book3s64/hash_native.c b/arch/powerpc/mm/book3s64/hash_native.c new file mode 100644 index 000000000000..aaa28fd918fe --- /dev/null +++ b/arch/powerpc/mm/book3s64/hash_native.c @@ -0,0 +1,884 @@ +/* + * native hashtable management. + * + * SMP scalability work: + * Copyright (C) 2001 Anton Blanchard , IBM + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#undef DEBUG_LOW + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifdef DEBUG_LOW +#define DBG_LOW(fmt...) udbg_printf(fmt) +#else +#define DBG_LOW(fmt...) +#endif + +#ifdef __BIG_ENDIAN__ +#define HPTE_LOCK_BIT 3 +#else +#define HPTE_LOCK_BIT (56+3) +#endif + +DEFINE_RAW_SPINLOCK(native_tlbie_lock); + +static inline void tlbiel_hash_set_isa206(unsigned int set, unsigned int is) +{ + unsigned long rb; + + rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53)); + + asm volatile("tlbiel %0" : : "r" (rb)); +} + +/* + * tlbiel instruction for hash, set invalidation + * i.e., r=1 and is=01 or is=10 or is=11 + */ +static inline void tlbiel_hash_set_isa300(unsigned int set, unsigned int is, + unsigned int pid, + unsigned int ric, unsigned int prs) +{ + unsigned long rb; + unsigned long rs; + unsigned int r = 0; /* hash format */ + + rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53)); + rs = ((unsigned long)pid << PPC_BITLSHIFT(31)); + + asm volatile(PPC_TLBIEL(%0, %1, %2, %3, %4) + : : "r"(rb), "r"(rs), "i"(ric), "i"(prs), "r"(r) + : "memory"); +} + + +static void tlbiel_all_isa206(unsigned int num_sets, unsigned int is) +{ + unsigned int set; + + asm volatile("ptesync": : :"memory"); + + for (set = 0; set < num_sets; set++) + tlbiel_hash_set_isa206(set, is); + + asm volatile("ptesync": : :"memory"); +} + +static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is) +{ + unsigned int set; + + asm volatile("ptesync": : :"memory"); + + /* + * Flush the first set of the TLB, and any caching of partition table + * entries. Then flush the remaining sets of the TLB. Hash mode uses + * partition scoped TLB translations. + */ + tlbiel_hash_set_isa300(0, is, 0, 2, 0); + for (set = 1; set < num_sets; set++) + tlbiel_hash_set_isa300(set, is, 0, 0, 0); + + /* + * Now invalidate the process table cache. + * + * From ISA v3.0B p. 1078: + * The following forms are invalid. + * * PRS=1, R=0, and RIC!=2 (The only process-scoped + * HPT caching is of the Process Table.) + */ + tlbiel_hash_set_isa300(0, is, 0, 2, 1); + + asm volatile("ptesync": : :"memory"); + + asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); +} + +void hash__tlbiel_all(unsigned int action) +{ + unsigned int is; + + switch (action) { + case TLB_INVAL_SCOPE_GLOBAL: + is = 3; + break; + case TLB_INVAL_SCOPE_LPID: + is = 2; + break; + default: + BUG(); + } + + if (early_cpu_has_feature(CPU_FTR_ARCH_300)) + tlbiel_all_isa300(POWER9_TLB_SETS_HASH, is); + else if (early_cpu_has_feature(CPU_FTR_ARCH_207S)) + tlbiel_all_isa206(POWER8_TLB_SETS, is); + else if (early_cpu_has_feature(CPU_FTR_ARCH_206)) + tlbiel_all_isa206(POWER7_TLB_SETS, is); + else + WARN(1, "%s called on pre-POWER7 CPU\n", __func__); +} + +static inline unsigned long ___tlbie(unsigned long vpn, int psize, + int apsize, int ssize) +{ + unsigned long va; + unsigned int penc; + unsigned long sllp; + + /* + * We need 14 to 65 bits of va for a tlibe of 4K page + * With vpn we ignore the lower VPN_SHIFT bits already. + * And top two bits are already ignored because we can + * only accomodate 76 bits in a 64 bit vpn with a VPN_SHIFT + * of 12. + */ + va = vpn << VPN_SHIFT; + /* + * clear top 16 bits of 64bit va, non SLS segment + * Older versions of the architecture (2.02 and earler) require the + * masking of the top 16 bits. 
+ */ + if (mmu_has_feature(MMU_FTR_TLBIE_CROP_VA)) + va &= ~(0xffffULL << 48); + + switch (psize) { + case MMU_PAGE_4K: + /* clear out bits after (52) [0....52.....63] */ + va &= ~((1ul << (64 - 52)) - 1); + va |= ssize << 8; + sllp = get_sllp_encoding(apsize); + va |= sllp << 5; + asm volatile(ASM_FTR_IFCLR("tlbie %0,0", PPC_TLBIE(%1,%0), %2) + : : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206) + : "memory"); + break; + default: + /* We need 14 to 14 + i bits of va */ + penc = mmu_psize_defs[psize].penc[apsize]; + va &= ~((1ul << mmu_psize_defs[apsize].shift) - 1); + va |= penc << 12; + va |= ssize << 8; + /* + * AVAL bits: + * We don't need all the bits, but rest of the bits + * must be ignored by the processor. + * vpn cover upto 65 bits of va. (0...65) and we need + * 58..64 bits of va. + */ + va |= (vpn & 0xfe); /* AVAL */ + va |= 1; /* L */ + asm volatile(ASM_FTR_IFCLR("tlbie %0,1", PPC_TLBIE(%1,%0), %2) + : : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206) + : "memory"); + break; + } + return va; +} + +static inline void fixup_tlbie(unsigned long vpn, int psize, int apsize, int ssize) +{ + if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) { + /* Need the extra ptesync to ensure we don't reorder tlbie*/ + asm volatile("ptesync": : :"memory"); + ___tlbie(vpn, psize, apsize, ssize); + } +} + +static inline void __tlbie(unsigned long vpn, int psize, int apsize, int ssize) +{ + unsigned long rb; + + rb = ___tlbie(vpn, psize, apsize, ssize); + trace_tlbie(0, 0, rb, 0, 0, 0, 0); +} + +static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize) +{ + unsigned long va; + unsigned int penc; + unsigned long sllp; + + /* VPN_SHIFT can be atmost 12 */ + va = vpn << VPN_SHIFT; + /* + * clear top 16 bits of 64 bit va, non SLS segment + * Older versions of the architecture (2.02 and earler) require the + * masking of the top 16 bits. + */ + if (mmu_has_feature(MMU_FTR_TLBIE_CROP_VA)) + va &= ~(0xffffULL << 48); + + switch (psize) { + case MMU_PAGE_4K: + /* clear out bits after(52) [0....52.....63] */ + va &= ~((1ul << (64 - 52)) - 1); + va |= ssize << 8; + sllp = get_sllp_encoding(apsize); + va |= sllp << 5; + asm volatile(ASM_FTR_IFSET("tlbiel %0", "tlbiel %0,0", %1) + : : "r" (va), "i" (CPU_FTR_ARCH_206) + : "memory"); + break; + default: + /* We need 14 to 14 + i bits of va */ + penc = mmu_psize_defs[psize].penc[apsize]; + va &= ~((1ul << mmu_psize_defs[apsize].shift) - 1); + va |= penc << 12; + va |= ssize << 8; + /* + * AVAL bits: + * We don't need all the bits, but rest of the bits + * must be ignored by the processor. + * vpn cover upto 65 bits of va. (0...65) and we need + * 58..64 bits of va. 
+ */ + va |= (vpn & 0xfe); + va |= 1; /* L */ + asm volatile(ASM_FTR_IFSET("tlbiel %0", "tlbiel %0,1", %1) + : : "r" (va), "i" (CPU_FTR_ARCH_206) + : "memory"); + break; + } + trace_tlbie(0, 1, va, 0, 0, 0, 0); + +} + +static inline void tlbie(unsigned long vpn, int psize, int apsize, + int ssize, int local) +{ + unsigned int use_local; + int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); + + use_local = local && mmu_has_feature(MMU_FTR_TLBIEL) && !cxl_ctx_in_use(); + + if (use_local) + use_local = mmu_psize_defs[psize].tlbiel; + if (lock_tlbie && !use_local) + raw_spin_lock(&native_tlbie_lock); + asm volatile("ptesync": : :"memory"); + if (use_local) { + __tlbiel(vpn, psize, apsize, ssize); + asm volatile("ptesync": : :"memory"); + } else { + __tlbie(vpn, psize, apsize, ssize); + fixup_tlbie(vpn, psize, apsize, ssize); + asm volatile("eieio; tlbsync; ptesync": : :"memory"); + } + if (lock_tlbie && !use_local) + raw_spin_unlock(&native_tlbie_lock); +} + +static inline void native_lock_hpte(struct hash_pte *hptep) +{ + unsigned long *word = (unsigned long *)&hptep->v; + + while (1) { + if (!test_and_set_bit_lock(HPTE_LOCK_BIT, word)) + break; + spin_begin(); + while(test_bit(HPTE_LOCK_BIT, word)) + spin_cpu_relax(); + spin_end(); + } +} + +static inline void native_unlock_hpte(struct hash_pte *hptep) +{ + unsigned long *word = (unsigned long *)&hptep->v; + + clear_bit_unlock(HPTE_LOCK_BIT, word); +} + +static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn, + unsigned long pa, unsigned long rflags, + unsigned long vflags, int psize, int apsize, int ssize) +{ + struct hash_pte *hptep = htab_address + hpte_group; + unsigned long hpte_v, hpte_r; + int i; + + if (!(vflags & HPTE_V_BOLTED)) { + DBG_LOW(" insert(group=%lx, vpn=%016lx, pa=%016lx," + " rflags=%lx, vflags=%lx, psize=%d)\n", + hpte_group, vpn, pa, rflags, vflags, psize); + } + + for (i = 0; i < HPTES_PER_GROUP; i++) { + if (! (be64_to_cpu(hptep->v) & HPTE_V_VALID)) { + /* retry with lock held */ + native_lock_hpte(hptep); + if (! 
(be64_to_cpu(hptep->v) & HPTE_V_VALID)) + break; + native_unlock_hpte(hptep); + } + + hptep++; + } + + if (i == HPTES_PER_GROUP) + return -1; + + hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID; + hpte_r = hpte_encode_r(pa, psize, apsize) | rflags; + + if (!(vflags & HPTE_V_BOLTED)) { + DBG_LOW(" i=%x hpte_v=%016lx, hpte_r=%016lx\n", + i, hpte_v, hpte_r); + } + + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + hpte_r = hpte_old_to_new_r(hpte_v, hpte_r); + hpte_v = hpte_old_to_new_v(hpte_v); + } + + hptep->r = cpu_to_be64(hpte_r); + /* Guarantee the second dword is visible before the valid bit */ + eieio(); + /* + * Now set the first dword including the valid bit + * NOTE: this also unlocks the hpte + */ + hptep->v = cpu_to_be64(hpte_v); + + __asm__ __volatile__ ("ptesync" : : : "memory"); + + return i | (!!(vflags & HPTE_V_SECONDARY) << 3); +} + +static long native_hpte_remove(unsigned long hpte_group) +{ + struct hash_pte *hptep; + int i; + int slot_offset; + unsigned long hpte_v; + + DBG_LOW(" remove(group=%lx)\n", hpte_group); + + /* pick a random entry to start at */ + slot_offset = mftb() & 0x7; + + for (i = 0; i < HPTES_PER_GROUP; i++) { + hptep = htab_address + hpte_group + slot_offset; + hpte_v = be64_to_cpu(hptep->v); + + if ((hpte_v & HPTE_V_VALID) && !(hpte_v & HPTE_V_BOLTED)) { + /* retry with lock held */ + native_lock_hpte(hptep); + hpte_v = be64_to_cpu(hptep->v); + if ((hpte_v & HPTE_V_VALID) + && !(hpte_v & HPTE_V_BOLTED)) + break; + native_unlock_hpte(hptep); + } + + slot_offset++; + slot_offset &= 0x7; + } + + if (i == HPTES_PER_GROUP) + return -1; + + /* Invalidate the hpte. NOTE: this also unlocks it */ + hptep->v = 0; + + return i; +} + +static long native_hpte_updatepp(unsigned long slot, unsigned long newpp, + unsigned long vpn, int bpsize, + int apsize, int ssize, unsigned long flags) +{ + struct hash_pte *hptep = htab_address + slot; + unsigned long hpte_v, want_v; + int ret = 0, local = 0; + + want_v = hpte_encode_avpn(vpn, bpsize, ssize); + + DBG_LOW(" update(vpn=%016lx, avpnv=%016lx, group=%lx, newpp=%lx)", + vpn, want_v & HPTE_V_AVPN, slot, newpp); + + hpte_v = hpte_get_old_v(hptep); + /* + * We need to invalidate the TLB always because hpte_remove doesn't do + * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less + * random entry from it. When we do that we don't invalidate the TLB + * (hpte_remove) because we assume the old translation is still + * technically "valid". 
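/*
 * Illustrative sketch, not part of this patch: native_lock_hpte() above is a
 * one-bit spinlock stored inside the HPTE itself - try to set the lock bit
 * with an atomic read-modify-write, and while that fails spin on plain loads
 * until the bit looks clear again.  The C11-atomics version below shows the
 * same shape; the HPTE layout, lock-bit position and relax hint are
 * simplified stand-ins for HPTE_LOCK_BIT and spin_cpu_relax().
 */
#include <stdatomic.h>
#include <stdint.h>

#define SK_LOCK_BIT (1ULL << 3)		/* stand-in for HPTE_LOCK_BIT */

static void sk_lock_hpte(_Atomic uint64_t *hpte_v)
{
	for (;;) {
		/* try to take the lock: set the bit with acquire ordering */
		if (!(atomic_fetch_or_explicit(hpte_v, SK_LOCK_BIT,
					       memory_order_acquire) & SK_LOCK_BIT))
			return;
		/* contended: spin on cheap loads until it looks free */
		while (atomic_load_explicit(hpte_v, memory_order_relaxed) & SK_LOCK_BIT)
			;	/* the kernel inserts a cpu-relax hint here */
	}
}

static void sk_unlock_hpte(_Atomic uint64_t *hpte_v)
{
	/* release ordering pairs with the acquire in sk_lock_hpte() */
	atomic_fetch_and_explicit(hpte_v, ~SK_LOCK_BIT, memory_order_release);
}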
+ */ + if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) { + DBG_LOW(" -> miss\n"); + ret = -1; + } else { + native_lock_hpte(hptep); + /* recheck with locks held */ + hpte_v = hpte_get_old_v(hptep); + if (unlikely(!HPTE_V_COMPARE(hpte_v, want_v) || + !(hpte_v & HPTE_V_VALID))) { + ret = -1; + } else { + DBG_LOW(" -> hit\n"); + /* Update the HPTE */ + hptep->r = cpu_to_be64((be64_to_cpu(hptep->r) & + ~(HPTE_R_PPP | HPTE_R_N)) | + (newpp & (HPTE_R_PPP | HPTE_R_N | + HPTE_R_C))); + } + native_unlock_hpte(hptep); + } + + if (flags & HPTE_LOCAL_UPDATE) + local = 1; + /* + * Ensure it is out of the tlb too if it is not a nohpte fault + */ + if (!(flags & HPTE_NOHPTE_UPDATE)) + tlbie(vpn, bpsize, apsize, ssize, local); + + return ret; +} + +static long native_hpte_find(unsigned long vpn, int psize, int ssize) +{ + struct hash_pte *hptep; + unsigned long hash; + unsigned long i; + long slot; + unsigned long want_v, hpte_v; + + hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize); + want_v = hpte_encode_avpn(vpn, psize, ssize); + + /* Bolted mappings are only ever in the primary group */ + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + for (i = 0; i < HPTES_PER_GROUP; i++) { + + hptep = htab_address + slot; + hpte_v = hpte_get_old_v(hptep); + if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) + /* HPTE matches */ + return slot; + ++slot; + } + + return -1; +} + +/* + * Update the page protection bits. Intended to be used to create + * guard pages for kernel data structures on pages which are bolted + * in the HPT. Assumes pages being operated on will not be stolen. + * + * No need to lock here because we should be the only user. + */ +static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea, + int psize, int ssize) +{ + unsigned long vpn; + unsigned long vsid; + long slot; + struct hash_pte *hptep; + + vsid = get_kernel_vsid(ea, ssize); + vpn = hpt_vpn(ea, vsid, ssize); + + slot = native_hpte_find(vpn, psize, ssize); + if (slot == -1) + panic("could not find page to bolt\n"); + hptep = htab_address + slot; + + /* Update the HPTE */ + hptep->r = cpu_to_be64((be64_to_cpu(hptep->r) & + ~(HPTE_R_PPP | HPTE_R_N)) | + (newpp & (HPTE_R_PPP | HPTE_R_N))); + /* + * Ensure it is out of the tlb too. Bolted entries base and + * actual page size will be same. + */ + tlbie(vpn, psize, psize, ssize, 0); +} + +/* + * Remove a bolted kernel entry. Memory hotplug uses this. + * + * No need to lock here because we should be the only user. 
+ */ +static int native_hpte_removebolted(unsigned long ea, int psize, int ssize) +{ + unsigned long vpn; + unsigned long vsid; + long slot; + struct hash_pte *hptep; + + vsid = get_kernel_vsid(ea, ssize); + vpn = hpt_vpn(ea, vsid, ssize); + + slot = native_hpte_find(vpn, psize, ssize); + if (slot == -1) + return -ENOENT; + + hptep = htab_address + slot; + + VM_WARN_ON(!(be64_to_cpu(hptep->v) & HPTE_V_BOLTED)); + + /* Invalidate the hpte */ + hptep->v = 0; + + /* Invalidate the TLB */ + tlbie(vpn, psize, psize, ssize, 0); + return 0; +} + + +static void native_hpte_invalidate(unsigned long slot, unsigned long vpn, + int bpsize, int apsize, int ssize, int local) +{ + struct hash_pte *hptep = htab_address + slot; + unsigned long hpte_v; + unsigned long want_v; + unsigned long flags; + + local_irq_save(flags); + + DBG_LOW(" invalidate(vpn=%016lx, hash: %lx)\n", vpn, slot); + + want_v = hpte_encode_avpn(vpn, bpsize, ssize); + hpte_v = hpte_get_old_v(hptep); + + if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) { + native_lock_hpte(hptep); + /* recheck with locks held */ + hpte_v = hpte_get_old_v(hptep); + + if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) + /* Invalidate the hpte. NOTE: this also unlocks it */ + hptep->v = 0; + else + native_unlock_hpte(hptep); + } + /* + * We need to invalidate the TLB always because hpte_remove doesn't do + * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less + * random entry from it. When we do that we don't invalidate the TLB + * (hpte_remove) because we assume the old translation is still + * technically "valid". + */ + tlbie(vpn, bpsize, apsize, ssize, local); + + local_irq_restore(flags); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static void native_hugepage_invalidate(unsigned long vsid, + unsigned long addr, + unsigned char *hpte_slot_array, + int psize, int ssize, int local) +{ + int i; + struct hash_pte *hptep; + int actual_psize = MMU_PAGE_16M; + unsigned int max_hpte_count, valid; + unsigned long flags, s_addr = addr; + unsigned long hpte_v, want_v, shift; + unsigned long hidx, vpn = 0, hash, slot; + + shift = mmu_psize_defs[psize].shift; + max_hpte_count = 1U << (PMD_SHIFT - shift); + + local_irq_save(flags); + for (i = 0; i < max_hpte_count; i++) { + valid = hpte_valid(hpte_slot_array, i); + if (!valid) + continue; + hidx = hpte_hash_index(hpte_slot_array, i); + + /* get the vpn */ + addr = s_addr + (i * (1ul << shift)); + vpn = hpt_vpn(addr, vsid, ssize); + hash = hpt_hash(vpn, shift, ssize); + if (hidx & _PTEIDX_SECONDARY) + hash = ~hash; + + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + slot += hidx & _PTEIDX_GROUP_IX; + + hptep = htab_address + slot; + want_v = hpte_encode_avpn(vpn, psize, ssize); + hpte_v = hpte_get_old_v(hptep); + + /* Even if we miss, we need to invalidate the TLB */ + if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) { + /* recheck with locks held */ + native_lock_hpte(hptep); + hpte_v = hpte_get_old_v(hptep); + + if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) { + /* + * Invalidate the hpte. 
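/*
 * Illustrative sketch, not part of this patch: the invalidate paths above all
 * recompute a global HPT slot from (hash, hidx) the same way - flip to the
 * secondary bucket when the SECONDARY bit is set, then add the within-group
 * index.  Stand-alone version with hypothetical constants in place of
 * htab_hash_mask, HPTES_PER_GROUP and the _PTEIDX_* masks:
 */
#include <stdint.h>

#define SK_HPTES_PER_GROUP 8
#define SK_HASH_MASK       ((1UL << 20) - 1)	/* example HPT size */
#define SK_IDX_SECONDARY   0x8
#define SK_IDX_GROUP_IX    0x7

static unsigned long sk_hidx_to_slot(unsigned long hash, unsigned long hidx)
{
	if (hidx & SK_IDX_SECONDARY)
		hash = ~hash;

	return (hash & SK_HASH_MASK) * SK_HPTES_PER_GROUP +
	       (hidx & SK_IDX_GROUP_IX);
}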
NOTE: this also unlocks it + */ + + hptep->v = 0; + } else + native_unlock_hpte(hptep); + } + /* + * We need to do tlb invalidate for all the address, tlbie + * instruction compares entry_VA in tlb with the VA specified + * here + */ + tlbie(vpn, psize, actual_psize, ssize, local); + } + local_irq_restore(flags); +} +#else +static void native_hugepage_invalidate(unsigned long vsid, + unsigned long addr, + unsigned char *hpte_slot_array, + int psize, int ssize, int local) +{ + WARN(1, "%s called without THP support\n", __func__); +} +#endif + +static void hpte_decode(struct hash_pte *hpte, unsigned long slot, + int *psize, int *apsize, int *ssize, unsigned long *vpn) +{ + unsigned long avpn, pteg, vpi; + unsigned long hpte_v = be64_to_cpu(hpte->v); + unsigned long hpte_r = be64_to_cpu(hpte->r); + unsigned long vsid, seg_off; + int size, a_size, shift; + /* Look at the 8 bit LP value */ + unsigned int lp = (hpte_r >> LP_SHIFT) & ((1 << LP_BITS) - 1); + + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + hpte_v = hpte_new_to_old_v(hpte_v, hpte_r); + hpte_r = hpte_new_to_old_r(hpte_r); + } + if (!(hpte_v & HPTE_V_LARGE)) { + size = MMU_PAGE_4K; + a_size = MMU_PAGE_4K; + } else { + size = hpte_page_sizes[lp] & 0xf; + a_size = hpte_page_sizes[lp] >> 4; + } + /* This works for all page sizes, and for 256M and 1T segments */ + *ssize = hpte_v >> HPTE_V_SSIZE_SHIFT; + shift = mmu_psize_defs[size].shift; + + avpn = (HPTE_V_AVPN_VAL(hpte_v) & ~mmu_psize_defs[size].avpnm); + pteg = slot / HPTES_PER_GROUP; + if (hpte_v & HPTE_V_SECONDARY) + pteg = ~pteg; + + switch (*ssize) { + case MMU_SEGSIZE_256M: + /* We only have 28 - 23 bits of seg_off in avpn */ + seg_off = (avpn & 0x1f) << 23; + vsid = avpn >> 5; + /* We can find more bits from the pteg value */ + if (shift < 23) { + vpi = (vsid ^ pteg) & htab_hash_mask; + seg_off |= vpi << shift; + } + *vpn = vsid << (SID_SHIFT - VPN_SHIFT) | seg_off >> VPN_SHIFT; + break; + case MMU_SEGSIZE_1T: + /* We only have 40 - 23 bits of seg_off in avpn */ + seg_off = (avpn & 0x1ffff) << 23; + vsid = avpn >> 17; + if (shift < 23) { + vpi = (vsid ^ (vsid << 25) ^ pteg) & htab_hash_mask; + seg_off |= vpi << shift; + } + *vpn = vsid << (SID_SHIFT_1T - VPN_SHIFT) | seg_off >> VPN_SHIFT; + break; + default: + *vpn = size = 0; + } + *psize = size; + *apsize = a_size; +} + +/* + * clear all mappings on kexec. All cpus are in real mode (or they will + * be when they isi), and we are the only one left. We rely on our kernel + * mapping being 0xC0's and the hardware ignoring those two real bits. + * + * This must be called with interrupts disabled. + * + * Taking the native_tlbie_lock is unsafe here due to the possibility of + * lockdep being on. On pre POWER5 hardware, not taking the lock could + * cause deadlock. POWER5 and newer not taking the lock is fine. This only + * gets called during boot before secondary CPUs have come up and during + * crashdump and all bets are off anyway. + * + * TODO: add batching support when enabled. remember, no dynamic memory here, + * although there is the control page available... + */ +static void native_hpte_clear(void) +{ + unsigned long vpn = 0; + unsigned long slot, slots; + struct hash_pte *hptep = htab_address; + unsigned long hpte_v; + unsigned long pteg_count; + int psize, apsize, ssize; + + pteg_count = htab_hash_mask + 1; + + slots = pteg_count * HPTES_PER_GROUP; + + for (slot = 0; slot < slots; slot++, hptep++) { + /* + * we could lock the pte here, but we are the only cpu + * running, right? 
and for crash dump, we probably + * don't want to wait for a maybe bad cpu. + */ + hpte_v = be64_to_cpu(hptep->v); + + /* + * Call __tlbie() here rather than tlbie() since we can't take the + * native_tlbie_lock. + */ + if (hpte_v & HPTE_V_VALID) { + hpte_decode(hptep, slot, &psize, &apsize, &ssize, &vpn); + hptep->v = 0; + ___tlbie(vpn, psize, apsize, ssize); + } + } + + asm volatile("eieio; tlbsync; ptesync":::"memory"); +} + +/* + * Batched hash table flush, we batch the tlbie's to avoid taking/releasing + * the lock all the time + */ +static void native_flush_hash_range(unsigned long number, int local) +{ + unsigned long vpn = 0; + unsigned long hash, index, hidx, shift, slot; + struct hash_pte *hptep; + unsigned long hpte_v; + unsigned long want_v; + unsigned long flags; + real_pte_t pte; + struct ppc64_tlb_batch *batch = this_cpu_ptr(&ppc64_tlb_batch); + unsigned long psize = batch->psize; + int ssize = batch->ssize; + int i; + unsigned int use_local; + + use_local = local && mmu_has_feature(MMU_FTR_TLBIEL) && + mmu_psize_defs[psize].tlbiel && !cxl_ctx_in_use(); + + local_irq_save(flags); + + for (i = 0; i < number; i++) { + vpn = batch->vpn[i]; + pte = batch->pte[i]; + + pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) { + hash = hpt_hash(vpn, shift, ssize); + hidx = __rpte_to_hidx(pte, index); + if (hidx & _PTEIDX_SECONDARY) + hash = ~hash; + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + slot += hidx & _PTEIDX_GROUP_IX; + hptep = htab_address + slot; + want_v = hpte_encode_avpn(vpn, psize, ssize); + hpte_v = hpte_get_old_v(hptep); + + if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) + continue; + /* lock and try again */ + native_lock_hpte(hptep); + hpte_v = hpte_get_old_v(hptep); + + if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) + native_unlock_hpte(hptep); + else + hptep->v = 0; + + } pte_iterate_hashed_end(); + } + + if (use_local) { + asm volatile("ptesync":::"memory"); + for (i = 0; i < number; i++) { + vpn = batch->vpn[i]; + pte = batch->pte[i]; + + pte_iterate_hashed_subpages(pte, psize, + vpn, index, shift) { + __tlbiel(vpn, psize, psize, ssize); + } pte_iterate_hashed_end(); + } + asm volatile("ptesync":::"memory"); + } else { + int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); + + if (lock_tlbie) + raw_spin_lock(&native_tlbie_lock); + + asm volatile("ptesync":::"memory"); + for (i = 0; i < number; i++) { + vpn = batch->vpn[i]; + pte = batch->pte[i]; + + pte_iterate_hashed_subpages(pte, psize, + vpn, index, shift) { + __tlbie(vpn, psize, psize, ssize); + } pte_iterate_hashed_end(); + } + /* + * Just do one more with the last used values. 
+ */ + fixup_tlbie(vpn, psize, psize, ssize); + asm volatile("eieio; tlbsync; ptesync":::"memory"); + + if (lock_tlbie) + raw_spin_unlock(&native_tlbie_lock); + } + + local_irq_restore(flags); +} + +void __init hpte_init_native(void) +{ + mmu_hash_ops.hpte_invalidate = native_hpte_invalidate; + mmu_hash_ops.hpte_updatepp = native_hpte_updatepp; + mmu_hash_ops.hpte_updateboltedpp = native_hpte_updateboltedpp; + mmu_hash_ops.hpte_removebolted = native_hpte_removebolted; + mmu_hash_ops.hpte_insert = native_hpte_insert; + mmu_hash_ops.hpte_remove = native_hpte_remove; + mmu_hash_ops.hpte_clear_all = native_hpte_clear; + mmu_hash_ops.flush_hash_range = native_flush_hash_range; + mmu_hash_ops.hugepage_invalidate = native_hugepage_invalidate; +} diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c b/arch/powerpc/mm/book3s64/hash_pgtable.c new file mode 100644 index 000000000000..1fd025dba4a3 --- /dev/null +++ b/arch/powerpc/mm/book3s64/hash_pgtable.c @@ -0,0 +1,463 @@ +/* + * Copyright 2005, Paul Mackerras, IBM Corporation. + * Copyright 2009, Benjamin Herrenschmidt, IBM Corporation. + * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#define CREATE_TRACE_POINTS +#include + +#if H_PGTABLE_RANGE > (USER_VSID_RANGE * (TASK_SIZE_USER64 / TASK_CONTEXT_SIZE)) +#warning Limited user VSID range means pagetable space is wasted +#endif + +#ifdef CONFIG_SPARSEMEM_VMEMMAP +/* + * vmemmap is the starting address of the virtual address space where + * struct pages are allocated for all possible PFNs present on the system + * including holes and bad memory (hence sparse). These virtual struct + * pages are stored in sequence in this virtual address space irrespective + * of the fact whether the corresponding PFN is valid or not. This achieves + * constant relationship between address of struct page and its PFN. + * + * During boot or memory hotplug operation when a new memory section is + * added, physical memory allocation (including hash table bolting) will + * be performed for the set of struct pages which are part of the memory + * section. This saves memory by not allocating struct pages for PFNs + * which are not valid. 
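/*
 * Illustrative sketch, not part of this patch: hpte_init_native() above wires
 * the bare-metal implementations into a table of function pointers
 * (mmu_hash_ops), so the generic hash MM code never cares whether HPTEs are
 * managed natively or through a hypervisor.  A minimal stand-alone version of
 * that registration pattern; the struct and function names here are made up.
 */
struct sk_hash_ops {
	long (*insert)(unsigned long group, unsigned long vpn, unsigned long pa);
	void (*invalidate)(unsigned long slot, unsigned long vpn);
};

static long sk_native_insert(unsigned long group, unsigned long vpn,
			     unsigned long pa)
{
	(void)group; (void)vpn; (void)pa;
	return 0;	/* the real backend would touch the hardware hash table */
}

static void sk_native_invalidate(unsigned long slot, unsigned long vpn)
{
	(void)slot; (void)vpn;
}

static struct sk_hash_ops sk_hash_ops;

static void sk_hpte_init_native(void)
{
	sk_hash_ops.insert     = sk_native_insert;
	sk_hash_ops.invalidate = sk_native_invalidate;
}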
+ * + * ---------------------------------------------- + * | PHYSICAL ALLOCATION OF VIRTUAL STRUCT PAGES| + * ---------------------------------------------- + * + * f000000000000000 c000000000000000 + * vmemmap +--------------+ +--------------+ + * + | page struct | +--------------> | page struct | + * | +--------------+ +--------------+ + * | | page struct | +--------------> | page struct | + * | +--------------+ | +--------------+ + * | | page struct | + +------> | page struct | + * | +--------------+ | +--------------+ + * | | page struct | | +--> | page struct | + * | +--------------+ | | +--------------+ + * | | page struct | | | + * | +--------------+ | | + * | | page struct | | | + * | +--------------+ | | + * | | page struct | | | + * | +--------------+ | | + * | | page struct | | | + * | +--------------+ | | + * | | page struct | +-------+ | + * | +--------------+ | + * | | page struct | +-----------+ + * | +--------------+ + * | | page struct | No mapping + * | +--------------+ + * | | page struct | No mapping + * v +--------------+ + * + * ----------------------------------------- + * | RELATION BETWEEN STRUCT PAGES AND PFNS| + * ----------------------------------------- + * + * vmemmap +--------------+ +---------------+ + * + | page struct | +-------------> | PFN | + * | +--------------+ +---------------+ + * | | page struct | +-------------> | PFN | + * | +--------------+ +---------------+ + * | | page struct | +-------------> | PFN | + * | +--------------+ +---------------+ + * | | page struct | +-------------> | PFN | + * | +--------------+ +---------------+ + * | | | + * | +--------------+ + * | | | + * | +--------------+ + * | | | + * | +--------------+ +---------------+ + * | | page struct | +-------------> | PFN | + * | +--------------+ +---------------+ + * | | | + * | +--------------+ + * | | | + * | +--------------+ +---------------+ + * | | page struct | +-------------> | PFN | + * | +--------------+ +---------------+ + * | | page struct | +-------------> | PFN | + * v +--------------+ +---------------+ + */ +/* + * On hash-based CPUs, the vmemmap is bolted in the hash table. 
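/*
 * Illustrative sketch, not part of this patch: the diagrams above boil down
 * to one invariant - struct pages live at a fixed virtual base (vmemmap),
 * indexed by PFN, whether or not the backing physical memory exists - so both
 * conversions are plain pointer arithmetic.  The base address and the struct
 * layout below are placeholders.
 */
#include <stdint.h>

struct sk_page { uint64_t flags; void *private; };

#define SK_VMEMMAP_BASE ((struct sk_page *)0xf000000000000000UL)
#define sk_vmemmap      SK_VMEMMAP_BASE

static inline struct sk_page *sk_pfn_to_page(unsigned long pfn)
{
	return sk_vmemmap + pfn;		/* constant time, no table walk */
}

static inline unsigned long sk_page_to_pfn(const struct sk_page *page)
{
	return page - sk_vmemmap;
}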
+ * + */ +int __meminit hash__vmemmap_create_mapping(unsigned long start, + unsigned long page_size, + unsigned long phys) +{ + int rc; + + if ((start + page_size) >= H_VMEMMAP_END) { + pr_warn("Outside the supported range\n"); + return -1; + } + + rc = htab_bolt_mapping(start, start + page_size, phys, + pgprot_val(PAGE_KERNEL), + mmu_vmemmap_psize, mmu_kernel_ssize); + if (rc < 0) { + int rc2 = htab_remove_mapping(start, start + page_size, + mmu_vmemmap_psize, + mmu_kernel_ssize); + BUG_ON(rc2 && (rc2 != -ENOENT)); + } + return rc; +} + +#ifdef CONFIG_MEMORY_HOTPLUG +void hash__vmemmap_remove_mapping(unsigned long start, + unsigned long page_size) +{ + int rc = htab_remove_mapping(start, start + page_size, + mmu_vmemmap_psize, + mmu_kernel_ssize); + BUG_ON((rc < 0) && (rc != -ENOENT)); + WARN_ON(rc == -ENOENT); +} +#endif +#endif /* CONFIG_SPARSEMEM_VMEMMAP */ + +/* + * map_kernel_page currently only called by __ioremap + * map_kernel_page adds an entry to the ioremap page table + * and adds an entry to the HPT, possibly bolting it + */ +int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) +{ + pgd_t *pgdp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + BUILD_BUG_ON(TASK_SIZE_USER64 > H_PGTABLE_RANGE); + if (slab_is_available()) { + pgdp = pgd_offset_k(ea); + pudp = pud_alloc(&init_mm, pgdp, ea); + if (!pudp) + return -ENOMEM; + pmdp = pmd_alloc(&init_mm, pudp, ea); + if (!pmdp) + return -ENOMEM; + ptep = pte_alloc_kernel(pmdp, ea); + if (!ptep) + return -ENOMEM; + set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, prot)); + } else { + /* + * If the mm subsystem is not fully up, we cannot create a + * linux page table entry for this mapping. Simply bolt an + * entry in the hardware page table. + * + */ + if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, pgprot_val(prot), + mmu_io_psize, mmu_kernel_ssize)) { + printk(KERN_ERR "Failed to do bolted mapping IO " + "memory at %016lx !\n", pa); + return -ENOMEM; + } + } + + smp_wmb(); + return 0; +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + +unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, unsigned long clr, + unsigned long set) +{ + __be64 old_be, tmp; + unsigned long old; + +#ifdef CONFIG_DEBUG_VM + WARN_ON(!hash__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp)); + assert_spin_locked(pmd_lockptr(mm, pmdp)); +#endif + + __asm__ __volatile__( + "1: ldarx %0,0,%3\n\ + and. %1,%0,%6\n\ + bne- 1b \n\ + andc %1,%0,%4 \n\ + or %1,%1,%7\n\ + stdcx. %1,0,%3 \n\ + bne- 1b" + : "=&r" (old_be), "=&r" (tmp), "=m" (*pmdp) + : "r" (pmdp), "r" (cpu_to_be64(clr)), "m" (*pmdp), + "r" (cpu_to_be64(H_PAGE_BUSY)), "r" (cpu_to_be64(set)) + : "cc" ); + + old = be64_to_cpu(old_be); + + trace_hugepage_update(addr, old, clr, set); + if (old & H_PAGE_HASHPTE) + hpte_do_hugepage_flush(mm, addr, pmdp, old); + return old; +} + +pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ + pmd_t pmd; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + VM_BUG_ON(pmd_trans_huge(*pmdp)); + VM_BUG_ON(pmd_devmap(*pmdp)); + + pmd = *pmdp; + pmd_clear(pmdp); + /* + * Wait for all pending hash_page to finish. This is needed + * in case of subpage collapse. When we collapse normal pages + * to hugepage, we first clear the pmd, then invalidate all + * the PTE entries. The assumption here is that any low level + * page fault will see a none pmd and take the slow path that + * will wait on mmap_sem. But we could very well be in a + * hash_page with local ptep pointer value. 
Such a hash page + * can result in adding new HPTE entries for normal subpages. + * That means we could be modifying the page content as we + * copy them to a huge page. So wait for parallel hash_page + * to finish before invalidating HPTE entries. We can do this + * by sending an IPI to all the cpus and executing a dummy + * function there. + */ + serialize_against_pte_lookup(vma->vm_mm); + /* + * Now invalidate the hpte entries in the range + * covered by pmd. This make sure we take a + * fault and will find the pmd as none, which will + * result in a major fault which takes mmap_sem and + * hence wait for collapse to complete. Without this + * the __collapse_huge_page_copy can result in copying + * the old content. + */ + flush_tlb_pmd_range(vma->vm_mm, &pmd, address); + return pmd; +} + +/* + * We want to put the pgtable in pmd and use pgtable for tracking + * the base page size hptes + */ +void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable) +{ + pgtable_t *pgtable_slot; + + assert_spin_locked(pmd_lockptr(mm, pmdp)); + /* + * we store the pgtable in the second half of PMD + */ + pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; + *pgtable_slot = pgtable; + /* + * expose the deposited pgtable to other cpus. + * before we set the hugepage PTE at pmd level + * hash fault code looks at the deposted pgtable + * to store hash index values. + */ + smp_wmb(); +} + +pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) +{ + pgtable_t pgtable; + pgtable_t *pgtable_slot; + + assert_spin_locked(pmd_lockptr(mm, pmdp)); + + pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; + pgtable = *pgtable_slot; + /* + * Once we withdraw, mark the entry NULL. + */ + *pgtable_slot = NULL; + /* + * We store HPTE information in the deposited PTE fragment. + * zero out the content on withdraw. + */ + memset(pgtable, 0, PTE_FRAG_SIZE); + return pgtable; +} + +/* + * A linux hugepage PMD was changed and the corresponding hash table entries + * neesd to be flushed. + */ +void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, unsigned long old_pmd) +{ + int ssize; + unsigned int psize; + unsigned long vsid; + unsigned long flags = 0; + + /* get the base page size,vsid and segment size */ +#ifdef CONFIG_DEBUG_VM + psize = get_slice_psize(mm, addr); + BUG_ON(psize == MMU_PAGE_16M); +#endif + if (old_pmd & H_PAGE_COMBO) + psize = MMU_PAGE_4K; + else + psize = MMU_PAGE_64K; + + if (!is_kernel_addr(addr)) { + ssize = user_segment_size(addr); + vsid = get_user_vsid(&mm->context, addr, ssize); + WARN_ON(vsid == 0); + } else { + vsid = get_kernel_vsid(addr, mmu_kernel_ssize); + ssize = mmu_kernel_ssize; + } + + if (mm_is_thread_local(mm)) + flags |= HPTE_LOCAL_UPDATE; + + return flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags); +} + +pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + pmd_t old_pmd; + pgtable_t pgtable; + unsigned long old; + pgtable_t *pgtable_slot; + + old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0); + old_pmd = __pmd(old); + /* + * We have pmd == none and we are holding page_table_lock. + * So we can safely go and clear the pgtable hash + * index info. + */ + pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; + pgtable = *pgtable_slot; + /* + * Let's zero out old valid and hash index details + * hash fault look at them. 
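+	 * (The deposited fragment holds the hash slot numbers for the old
+	 * mapping and the hash fault path reads them, so they must not be
+	 * left stale once the pmd has been cleared.)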
+ */ + memset(pgtable, 0, PTE_FRAG_SIZE); + /* + * Serialize against find_current_mm_pte variants which does lock-less + * lookup in page tables with local interrupts disabled. For huge pages + * it casts pmd_t to pte_t. Since format of pte_t is different from + * pmd_t we want to prevent transit from pmd pointing to page table + * to pmd pointing to huge page (and back) while interrupts are disabled. + * We clear pmd to possibly replace it with page table pointer in + * different code paths. So make sure we wait for the parallel + * find_curren_mm_pte to finish. + */ + serialize_against_pte_lookup(mm); + return old_pmd; +} + +int hash__has_transparent_hugepage(void) +{ + + if (!mmu_has_feature(MMU_FTR_16M_PAGE)) + return 0; + /* + * We support THP only if PMD_SIZE is 16MB. + */ + if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT) + return 0; + /* + * We need to make sure that we support 16MB hugepage in a segement + * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE + * of 64K. + */ + /* + * If we have 64K HPTE, we will be using that by default + */ + if (mmu_psize_defs[MMU_PAGE_64K].shift && + (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1)) + return 0; + /* + * Ok we only have 4K HPTE + */ + if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1) + return 0; + + return 1; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +#ifdef CONFIG_STRICT_KERNEL_RWX +static bool hash__change_memory_range(unsigned long start, unsigned long end, + unsigned long newpp) +{ + unsigned long idx; + unsigned int step, shift; + + shift = mmu_psize_defs[mmu_linear_psize].shift; + step = 1 << shift; + + start = ALIGN_DOWN(start, step); + end = ALIGN(end, step); // aligns up + + if (start >= end) + return false; + + pr_debug("Changing page protection on range 0x%lx-0x%lx, to 0x%lx, step 0x%x\n", + start, end, newpp, step); + + for (idx = start; idx < end; idx += step) + /* Not sure if we can do much with the return value */ + mmu_hash_ops.hpte_updateboltedpp(newpp, idx, mmu_linear_psize, + mmu_kernel_ssize); + + return true; +} + +void hash__mark_rodata_ro(void) +{ + unsigned long start, end; + + start = (unsigned long)_stext; + end = (unsigned long)__init_begin; + + WARN_ON(!hash__change_memory_range(start, end, PP_RXXX)); +} + +void hash__mark_initmem_nx(void) +{ + unsigned long start, end, pp; + + start = (unsigned long)__init_begin; + end = (unsigned long)__init_end; + + pp = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL)); + + WARN_ON(!hash__change_memory_range(start, end, pp)); +} +#endif diff --git a/arch/powerpc/mm/book3s64/hash_tlb.c b/arch/powerpc/mm/book3s64/hash_tlb.c new file mode 100644 index 000000000000..d4f0101447b1 --- /dev/null +++ b/arch/powerpc/mm/book3s64/hash_tlb.c @@ -0,0 +1,265 @@ +/* + * This file contains the routines for flushing entries from the + * TLB and MMU hash table. + * + * Derived from arch/ppc64/mm/init.c: + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) + * and Cort Dougan (PReP) (cort@cs.nmt.edu) + * Copyright (C) 1996 Paul Mackerras + * + * Derived from "arch/i386/mm/init.c" + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * Dave Engebretsen + * Rework for PPC64 port. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include + +DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch); + +/* + * A linux PTE was changed and the corresponding hash table entry + * neesd to be flushed. This function will either perform the flush + * immediately or will batch it up if the current CPU has an active + * batch on it. + */ +void hpte_need_flush(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned long pte, int huge) +{ + unsigned long vpn; + struct ppc64_tlb_batch *batch = &get_cpu_var(ppc64_tlb_batch); + unsigned long vsid; + unsigned int psize; + int ssize; + real_pte_t rpte; + int i, offset; + + i = batch->index; + + /* + * Get page size (maybe move back to caller). + * + * NOTE: when using special 64K mappings in 4K environment like + * for SPEs, we obtain the page size from the slice, which thus + * must still exist (and thus the VMA not reused) at the time + * of this call + */ + if (huge) { +#ifdef CONFIG_HUGETLB_PAGE + psize = get_slice_psize(mm, addr); + /* Mask the address for the correct page size */ + addr &= ~((1UL << mmu_psize_defs[psize].shift) - 1); + if (unlikely(psize == MMU_PAGE_16G)) + offset = PTRS_PER_PUD; + else + offset = PTRS_PER_PMD; +#else + BUG(); + psize = pte_pagesize_index(mm, addr, pte); /* shutup gcc */ +#endif + } else { + psize = pte_pagesize_index(mm, addr, pte); + /* + * Mask the address for the standard page size. If we + * have a 64k page kernel, but the hardware does not + * support 64k pages, this might be different from the + * hardware page size encoded in the slice table. + */ + addr &= PAGE_MASK; + offset = PTRS_PER_PTE; + } + + + /* Build full vaddr */ + if (!is_kernel_addr(addr)) { + ssize = user_segment_size(addr); + vsid = get_user_vsid(&mm->context, addr, ssize); + } else { + vsid = get_kernel_vsid(addr, mmu_kernel_ssize); + ssize = mmu_kernel_ssize; + } + WARN_ON(vsid == 0); + vpn = hpt_vpn(addr, vsid, ssize); + rpte = __real_pte(__pte(pte), ptep, offset); + + /* + * Check if we have an active batch on this CPU. If not, just + * flush now and return. + */ + if (!batch->active) { + flush_hash_page(vpn, rpte, psize, ssize, mm_is_thread_local(mm)); + put_cpu_var(ppc64_tlb_batch); + return; + } + + /* + * This can happen when we are in the middle of a TLB batch and + * we encounter memory pressure (eg copy_page_range when it tries + * to allocate a new pte). If we have to reclaim memory and end + * up scanning and resetting referenced bits then our batch context + * will change mid stream. + * + * We also need to ensure only one page size is present in a given + * batch + */ + if (i != 0 && (mm != batch->mm || batch->psize != psize || + batch->ssize != ssize)) { + __flush_tlb_pending(batch); + i = 0; + } + if (i == 0) { + batch->mm = mm; + batch->psize = psize; + batch->ssize = ssize; + } + batch->pte[i] = rpte; + batch->vpn[i] = vpn; + batch->index = ++i; + if (i >= PPC64_TLB_BATCH_NR) + __flush_tlb_pending(batch); + put_cpu_var(ppc64_tlb_batch); +} + +/* + * This function is called when terminating an mmu batch or when a batch + * is full. It will perform the flush of all the entries currently stored + * in a batch. + * + * Must be called from within some kind of spinlock/non-preempt region... 
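+ *
+ * A typical caller (see hash__tlb_flush() below) keeps preemption
+ * disabled by holding the per-cpu batch while flushing, roughly:
+ *
+ *	batch = &get_cpu_var(ppc64_tlb_batch);
+ *	if (batch->index)
+ *		__flush_tlb_pending(batch);
+ *	put_cpu_var(ppc64_tlb_batch);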
+ */ +void __flush_tlb_pending(struct ppc64_tlb_batch *batch) +{ + int i, local; + + i = batch->index; + local = mm_is_thread_local(batch->mm); + if (i == 1) + flush_hash_page(batch->vpn[0], batch->pte[0], + batch->psize, batch->ssize, local); + else + flush_hash_range(i, local); + batch->index = 0; +} + +void hash__tlb_flush(struct mmu_gather *tlb) +{ + struct ppc64_tlb_batch *tlbbatch = &get_cpu_var(ppc64_tlb_batch); + + /* + * If there's a TLB batch pending, then we must flush it because the + * pages are going to be freed and we really don't want to have a CPU + * access a freed page because it has a stale TLB + */ + if (tlbbatch->index) + __flush_tlb_pending(tlbbatch); + + put_cpu_var(ppc64_tlb_batch); +} + +/** + * __flush_hash_table_range - Flush all HPTEs for a given address range + * from the hash table (and the TLB). But keeps + * the linux PTEs intact. + * + * @mm : mm_struct of the target address space (generally init_mm) + * @start : starting address + * @end : ending address (not included in the flush) + * + * This function is mostly to be used by some IO hotplug code in order + * to remove all hash entries from a given address range used to map IO + * space on a removed PCI-PCI bidge without tearing down the full mapping + * since 64K pages may overlap with other bridges when using 64K pages + * with 4K HW pages on IO space. + * + * Because of that usage pattern, it is implemented for small size rather + * than speed. + */ +void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, + unsigned long end) +{ + bool is_thp; + int hugepage_shift; + unsigned long flags; + + start = _ALIGN_DOWN(start, PAGE_SIZE); + end = _ALIGN_UP(end, PAGE_SIZE); + + BUG_ON(!mm->pgd); + + /* + * Note: Normally, we should only ever use a batch within a + * PTE locked section. This violates the rule, but will work + * since we don't actually modify the PTEs, we just flush the + * hash while leaving the PTEs intact (including their reference + * to being hashed). This is not the most performance oriented + * way to do things but is fine for our needs here. + */ + local_irq_save(flags); + arch_enter_lazy_mmu_mode(); + for (; start < end; start += PAGE_SIZE) { + pte_t *ptep = find_current_mm_pte(mm->pgd, start, &is_thp, + &hugepage_shift); + unsigned long pte; + + if (ptep == NULL) + continue; + pte = pte_val(*ptep); + if (is_thp) + trace_hugepage_invalidate(start, pte); + if (!(pte & H_PAGE_HASHPTE)) + continue; + if (unlikely(is_thp)) + hpte_do_hugepage_flush(mm, start, (pmd_t *)ptep, pte); + else + hpte_need_flush(mm, start, ptep, pte, hugepage_shift); + } + arch_leave_lazy_mmu_mode(); + local_irq_restore(flags); +} + +void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr) +{ + pte_t *pte; + pte_t *start_pte; + unsigned long flags; + + addr = _ALIGN_DOWN(addr, PMD_SIZE); + /* + * Note: Normally, we should only ever use a batch within a + * PTE locked section. This violates the rule, but will work + * since we don't actually modify the PTEs, we just flush the + * hash while leaving the PTEs intact (including their reference + * to being hashed). This is not the most performance oriented + * way to do things but is fine for our needs here. 
+ */ + local_irq_save(flags); + arch_enter_lazy_mmu_mode(); + start_pte = pte_offset_map(pmd, addr); + for (pte = start_pte; pte < start_pte + PTRS_PER_PTE; pte++) { + unsigned long pteval = pte_val(*pte); + if (pteval & H_PAGE_HASHPTE) + hpte_need_flush(mm, addr, pte, pteval, 0); + addr += PAGE_SIZE; + } + arch_leave_lazy_mmu_mode(); + local_irq_restore(flags); +} diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c new file mode 100644 index 000000000000..b21a81d42f15 --- /dev/null +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -0,0 +1,1946 @@ +/* + * PowerPC64 port by Mike Corrigan and Dave Engebretsen + * {mikejc|engebret}@us.ibm.com + * + * Copyright (c) 2000 Mike Corrigan + * + * SMP scalability work: + * Copyright (C) 2001 Anton Blanchard , IBM + * + * Module name: htab.c + * + * Description: + * PowerPC Hashed Page Table functions + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#undef DEBUG +#undef DEBUG_LOW + +#define pr_fmt(fmt) "hash-mmu: " fmt +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef DEBUG +#define DBG(fmt...) udbg_printf(fmt) +#else +#define DBG(fmt...) +#endif + +#ifdef DEBUG_LOW +#define DBG_LOW(fmt...) udbg_printf(fmt) +#else +#define DBG_LOW(fmt...) +#endif + +#define KB (1024) +#define MB (1024*KB) +#define GB (1024L*MB) + +/* + * Note: pte --> Linux PTE + * HPTE --> PowerPC Hashed Page Table Entry + * + * Execution context: + * htab_initialize is called with the MMU off (of course), but + * the kernel has been copied down to zero so it can directly + * reference global data. At this point it is very difficult + * to print debug info. + * + */ + +static unsigned long _SDR1; +struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; +EXPORT_SYMBOL_GPL(mmu_psize_defs); + +u8 hpte_page_sizes[1 << LP_BITS]; +EXPORT_SYMBOL_GPL(hpte_page_sizes); + +struct hash_pte *htab_address; +unsigned long htab_size_bytes; +unsigned long htab_hash_mask; +EXPORT_SYMBOL_GPL(htab_hash_mask); +int mmu_linear_psize = MMU_PAGE_4K; +EXPORT_SYMBOL_GPL(mmu_linear_psize); +int mmu_virtual_psize = MMU_PAGE_4K; +int mmu_vmalloc_psize = MMU_PAGE_4K; +#ifdef CONFIG_SPARSEMEM_VMEMMAP +int mmu_vmemmap_psize = MMU_PAGE_4K; +#endif +int mmu_io_psize = MMU_PAGE_4K; +int mmu_kernel_ssize = MMU_SEGSIZE_256M; +EXPORT_SYMBOL_GPL(mmu_kernel_ssize); +int mmu_highuser_ssize = MMU_SEGSIZE_256M; +u16 mmu_slb_size = 64; +EXPORT_SYMBOL_GPL(mmu_slb_size); +#ifdef CONFIG_PPC_64K_PAGES +int mmu_ci_restrictions; +#endif +#ifdef CONFIG_DEBUG_PAGEALLOC +static u8 *linear_map_hash_slots; +static unsigned long linear_map_hash_count; +static DEFINE_SPINLOCK(linear_map_hash_lock); +#endif /* CONFIG_DEBUG_PAGEALLOC */ +struct mmu_hash_ops mmu_hash_ops; +EXPORT_SYMBOL(mmu_hash_ops); + +/* + * These are definitions of page sizes arrays to be used when none + * is provided by the firmware. 
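+ *
+ * A penc value of -1 in these tables marks a base/actual page size
+ * combination that is not supported; init_hpte_page_sizes() below
+ * skips such entries.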
+ */ + +/* + * Fallback (4k pages only) + */ +static struct mmu_psize_def mmu_psize_defaults[] = { + [MMU_PAGE_4K] = { + .shift = 12, + .sllp = 0, + .penc = {[MMU_PAGE_4K] = 0, [1 ... MMU_PAGE_COUNT - 1] = -1}, + .avpnm = 0, + .tlbiel = 0, + }, +}; + +/* + * POWER4, GPUL, POWER5 + * + * Support for 16Mb large pages + */ +static struct mmu_psize_def mmu_psize_defaults_gp[] = { + [MMU_PAGE_4K] = { + .shift = 12, + .sllp = 0, + .penc = {[MMU_PAGE_4K] = 0, [1 ... MMU_PAGE_COUNT - 1] = -1}, + .avpnm = 0, + .tlbiel = 1, + }, + [MMU_PAGE_16M] = { + .shift = 24, + .sllp = SLB_VSID_L, + .penc = {[0 ... MMU_PAGE_16M - 1] = -1, [MMU_PAGE_16M] = 0, + [MMU_PAGE_16M + 1 ... MMU_PAGE_COUNT - 1] = -1 }, + .avpnm = 0x1UL, + .tlbiel = 0, + }, +}; + +/* + * 'R' and 'C' update notes: + * - Under pHyp or KVM, the updatepp path will not set C, thus it *will* + * create writeable HPTEs without C set, because the hcall H_PROTECT + * that we use in that case will not update C + * - The above is however not a problem, because we also don't do that + * fancy "no flush" variant of eviction and we use H_REMOVE which will + * do the right thing and thus we don't have the race I described earlier + * + * - Under bare metal, we do have the race, so we need R and C set + * - We make sure R is always set and never lost + * - C is _PAGE_DIRTY, and *should* always be set for a writeable mapping + */ +unsigned long htab_convert_pte_flags(unsigned long pteflags) +{ + unsigned long rflags = 0; + + /* _PAGE_EXEC -> NOEXEC */ + if ((pteflags & _PAGE_EXEC) == 0) + rflags |= HPTE_R_N; + /* + * PPP bits: + * Linux uses slb key 0 for kernel and 1 for user. + * kernel RW areas are mapped with PPP=0b000 + * User area is mapped with PPP=0b010 for read/write + * or PPP=0b011 for read-only (including writeable but clean pages). + */ + if (pteflags & _PAGE_PRIVILEGED) { + /* + * Kernel read only mapped with ppp bits 0b110 + */ + if (!(pteflags & _PAGE_WRITE)) { + if (mmu_has_feature(MMU_FTR_KERNEL_RO)) + rflags |= (HPTE_R_PP0 | 0x2); + else + rflags |= 0x3; + } + } else { + if (pteflags & _PAGE_RWX) + rflags |= 0x2; + if (!((pteflags & _PAGE_WRITE) && (pteflags & _PAGE_DIRTY))) + rflags |= 0x1; + } + /* + * We can't allow hardware to update hpte bits. 
Hence always + * set 'R' bit and set 'C' if it is a write fault + */ + rflags |= HPTE_R_R; + + if (pteflags & _PAGE_DIRTY) + rflags |= HPTE_R_C; + /* + * Add in WIG bits + */ + + if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_TOLERANT) + rflags |= HPTE_R_I; + else if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_NON_IDEMPOTENT) + rflags |= (HPTE_R_I | HPTE_R_G); + else if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_SAO) + rflags |= (HPTE_R_W | HPTE_R_I | HPTE_R_M); + else + /* + * Add memory coherence if cache inhibited is not set + */ + rflags |= HPTE_R_M; + + rflags |= pte_to_hpte_pkey_bits(pteflags); + return rflags; +} + +int htab_bolt_mapping(unsigned long vstart, unsigned long vend, + unsigned long pstart, unsigned long prot, + int psize, int ssize) +{ + unsigned long vaddr, paddr; + unsigned int step, shift; + int ret = 0; + + shift = mmu_psize_defs[psize].shift; + step = 1 << shift; + + prot = htab_convert_pte_flags(prot); + + DBG("htab_bolt_mapping(%lx..%lx -> %lx (%lx,%d,%d)\n", + vstart, vend, pstart, prot, psize, ssize); + + for (vaddr = vstart, paddr = pstart; vaddr < vend; + vaddr += step, paddr += step) { + unsigned long hash, hpteg; + unsigned long vsid = get_kernel_vsid(vaddr, ssize); + unsigned long vpn = hpt_vpn(vaddr, vsid, ssize); + unsigned long tprot = prot; + + /* + * If we hit a bad address return error. + */ + if (!vsid) + return -1; + /* Make kernel text executable */ + if (overlaps_kernel_text(vaddr, vaddr + step)) + tprot &= ~HPTE_R_N; + + /* Make kvm guest trampolines executable */ + if (overlaps_kvm_tmp(vaddr, vaddr + step)) + tprot &= ~HPTE_R_N; + + /* + * If relocatable, check if it overlaps interrupt vectors that + * are copied down to real 0. For relocatable kernel + * (e.g. kdump case) we copy interrupt vectors down to real + * address 0. Mark that region as executable. This is + * because on p8 system with relocation on exception feature + * enabled, exceptions are raised with MMU (IR=DR=1) ON. Hence + * in order to execute the interrupt handlers in virtual + * mode the vector region need to be marked as executable. + */ + if ((PHYSICAL_START > MEMORY_START) && + overlaps_interrupt_vector_text(vaddr, vaddr + step)) + tprot &= ~HPTE_R_N; + + hash = hpt_hash(vpn, shift, ssize); + hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP); + + BUG_ON(!mmu_hash_ops.hpte_insert); + ret = mmu_hash_ops.hpte_insert(hpteg, vpn, paddr, tprot, + HPTE_V_BOLTED, psize, psize, + ssize); + + if (ret < 0) + break; + +#ifdef CONFIG_DEBUG_PAGEALLOC + if (debug_pagealloc_enabled() && + (paddr >> PAGE_SHIFT) < linear_map_hash_count) + linear_map_hash_slots[paddr >> PAGE_SHIFT] = ret | 0x80; +#endif /* CONFIG_DEBUG_PAGEALLOC */ + } + return ret < 0 ? 
ret : 0; +} + +int htab_remove_mapping(unsigned long vstart, unsigned long vend, + int psize, int ssize) +{ + unsigned long vaddr; + unsigned int step, shift; + int rc; + int ret = 0; + + shift = mmu_psize_defs[psize].shift; + step = 1 << shift; + + if (!mmu_hash_ops.hpte_removebolted) + return -ENODEV; + + for (vaddr = vstart; vaddr < vend; vaddr += step) { + rc = mmu_hash_ops.hpte_removebolted(vaddr, psize, ssize); + if (rc == -ENOENT) { + ret = -ENOENT; + continue; + } + if (rc < 0) + return rc; + } + + return ret; +} + +static bool disable_1tb_segments = false; + +static int __init parse_disable_1tb_segments(char *p) +{ + disable_1tb_segments = true; + return 0; +} +early_param("disable_1tb_segments", parse_disable_1tb_segments); + +static int __init htab_dt_scan_seg_sizes(unsigned long node, + const char *uname, int depth, + void *data) +{ + const char *type = of_get_flat_dt_prop(node, "device_type", NULL); + const __be32 *prop; + int size = 0; + + /* We are scanning "cpu" nodes only */ + if (type == NULL || strcmp(type, "cpu") != 0) + return 0; + + prop = of_get_flat_dt_prop(node, "ibm,processor-segment-sizes", &size); + if (prop == NULL) + return 0; + for (; size >= 4; size -= 4, ++prop) { + if (be32_to_cpu(prop[0]) == 40) { + DBG("1T segment support detected\n"); + + if (disable_1tb_segments) { + DBG("1T segments disabled by command line\n"); + break; + } + + cur_cpu_spec->mmu_features |= MMU_FTR_1T_SEGMENT; + return 1; + } + } + cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B; + return 0; +} + +static int __init get_idx_from_shift(unsigned int shift) +{ + int idx = -1; + + switch (shift) { + case 0xc: + idx = MMU_PAGE_4K; + break; + case 0x10: + idx = MMU_PAGE_64K; + break; + case 0x14: + idx = MMU_PAGE_1M; + break; + case 0x18: + idx = MMU_PAGE_16M; + break; + case 0x22: + idx = MMU_PAGE_16G; + break; + } + return idx; +} + +static int __init htab_dt_scan_page_sizes(unsigned long node, + const char *uname, int depth, + void *data) +{ + const char *type = of_get_flat_dt_prop(node, "device_type", NULL); + const __be32 *prop; + int size = 0; + + /* We are scanning "cpu" nodes only */ + if (type == NULL || strcmp(type, "cpu") != 0) + return 0; + + prop = of_get_flat_dt_prop(node, "ibm,segment-page-sizes", &size); + if (!prop) + return 0; + + pr_info("Page sizes from device-tree:\n"); + size /= 4; + cur_cpu_spec->mmu_features &= ~(MMU_FTR_16M_PAGE); + while(size > 0) { + unsigned int base_shift = be32_to_cpu(prop[0]); + unsigned int slbenc = be32_to_cpu(prop[1]); + unsigned int lpnum = be32_to_cpu(prop[2]); + struct mmu_psize_def *def; + int idx, base_idx; + + size -= 3; prop += 3; + base_idx = get_idx_from_shift(base_shift); + if (base_idx < 0) { + /* skip the pte encoding also */ + prop += lpnum * 2; size -= lpnum * 2; + continue; + } + def = &mmu_psize_defs[base_idx]; + if (base_idx == MMU_PAGE_16M) + cur_cpu_spec->mmu_features |= MMU_FTR_16M_PAGE; + + def->shift = base_shift; + if (base_shift <= 23) + def->avpnm = 0; + else + def->avpnm = (1 << (base_shift - 23)) - 1; + def->sllp = slbenc; + /* + * We don't know for sure what's up with tlbiel, so + * for now we only set it for 4K and 64K pages + */ + if (base_idx == MMU_PAGE_4K || base_idx == MMU_PAGE_64K) + def->tlbiel = 1; + else + def->tlbiel = 0; + + while (size > 0 && lpnum) { + unsigned int shift = be32_to_cpu(prop[0]); + int penc = be32_to_cpu(prop[1]); + + prop += 2; size -= 2; + lpnum--; + + idx = get_idx_from_shift(shift); + if (idx < 0) + continue; + + if (penc == -1) + pr_err("Invalid penc for base_shift=%d " + 
"shift=%d\n", base_shift, shift); + + def->penc[idx] = penc; + pr_info("base_shift=%d: shift=%d, sllp=0x%04lx," + " avpnm=0x%08lx, tlbiel=%d, penc=%d\n", + base_shift, shift, def->sllp, + def->avpnm, def->tlbiel, def->penc[idx]); + } + } + + return 1; +} + +#ifdef CONFIG_HUGETLB_PAGE +/* + * Scan for 16G memory blocks that have been set aside for huge pages + * and reserve those blocks for 16G huge pages. + */ +static int __init htab_dt_scan_hugepage_blocks(unsigned long node, + const char *uname, int depth, + void *data) { + const char *type = of_get_flat_dt_prop(node, "device_type", NULL); + const __be64 *addr_prop; + const __be32 *page_count_prop; + unsigned int expected_pages; + long unsigned int phys_addr; + long unsigned int block_size; + + /* We are scanning "memory" nodes only */ + if (type == NULL || strcmp(type, "memory") != 0) + return 0; + + /* + * This property is the log base 2 of the number of virtual pages that + * will represent this memory block. + */ + page_count_prop = of_get_flat_dt_prop(node, "ibm,expected#pages", NULL); + if (page_count_prop == NULL) + return 0; + expected_pages = (1 << be32_to_cpu(page_count_prop[0])); + addr_prop = of_get_flat_dt_prop(node, "reg", NULL); + if (addr_prop == NULL) + return 0; + phys_addr = be64_to_cpu(addr_prop[0]); + block_size = be64_to_cpu(addr_prop[1]); + if (block_size != (16 * GB)) + return 0; + printk(KERN_INFO "Huge page(16GB) memory: " + "addr = 0x%lX size = 0x%lX pages = %d\n", + phys_addr, block_size, expected_pages); + if (phys_addr + block_size * expected_pages <= memblock_end_of_DRAM()) { + memblock_reserve(phys_addr, block_size * expected_pages); + pseries_add_gpage(phys_addr, block_size, expected_pages); + } + return 0; +} +#endif /* CONFIG_HUGETLB_PAGE */ + +static void mmu_psize_set_default_penc(void) +{ + int bpsize, apsize; + for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++) + for (apsize = 0; apsize < MMU_PAGE_COUNT; apsize++) + mmu_psize_defs[bpsize].penc[apsize] = -1; +} + +#ifdef CONFIG_PPC_64K_PAGES + +static bool might_have_hea(void) +{ + /* + * The HEA ethernet adapter requires awareness of the + * GX bus. Without that awareness we can easily assume + * we will never see an HEA ethernet device. + */ +#ifdef CONFIG_IBMEBUS + return !cpu_has_feature(CPU_FTR_ARCH_207S) && + firmware_has_feature(FW_FEATURE_SPLPAR); +#else + return false; +#endif +} + +#endif /* #ifdef CONFIG_PPC_64K_PAGES */ + +static void __init htab_scan_page_sizes(void) +{ + int rc; + + /* se the invalid penc to -1 */ + mmu_psize_set_default_penc(); + + /* Default to 4K pages only */ + memcpy(mmu_psize_defs, mmu_psize_defaults, + sizeof(mmu_psize_defaults)); + + /* + * Try to find the available page sizes in the device-tree + */ + rc = of_scan_flat_dt(htab_dt_scan_page_sizes, NULL); + if (rc == 0 && early_mmu_has_feature(MMU_FTR_16M_PAGE)) { + /* + * Nothing in the device-tree, but the CPU supports 16M pages, + * so let's fallback on a known size list for 16M capable CPUs. + */ + memcpy(mmu_psize_defs, mmu_psize_defaults_gp, + sizeof(mmu_psize_defaults_gp)); + } + +#ifdef CONFIG_HUGETLB_PAGE + if (!hugetlb_disabled) { + /* Reserve 16G huge page memory sections for huge pages */ + of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL); + } +#endif /* CONFIG_HUGETLB_PAGE */ +} + +/* + * Fill in the hpte_page_sizes[] array. + * We go through the mmu_psize_defs[] array looking for all the + * supported base/actual page size combinations. 
Each combination + * has a unique pagesize encoding (penc) value in the low bits of + * the LP field of the HPTE. For actual page sizes less than 1MB, + * some of the upper LP bits are used for RPN bits, meaning that + * we need to fill in several entries in hpte_page_sizes[]. + * + * In diagrammatic form, with r = RPN bits and z = page size bits: + * PTE LP actual page size + * rrrr rrrz >=8KB + * rrrr rrzz >=16KB + * rrrr rzzz >=32KB + * rrrr zzzz >=64KB + * ... + * + * The zzzz bits are implementation-specific but are chosen so that + * no encoding for a larger page size uses the same value in its + * low-order N bits as the encoding for the 2^(12+N) byte page size + * (if it exists). + */ +static void init_hpte_page_sizes(void) +{ + long int ap, bp; + long int shift, penc; + + for (bp = 0; bp < MMU_PAGE_COUNT; ++bp) { + if (!mmu_psize_defs[bp].shift) + continue; /* not a supported page size */ + for (ap = bp; ap < MMU_PAGE_COUNT; ++ap) { + penc = mmu_psize_defs[bp].penc[ap]; + if (penc == -1 || !mmu_psize_defs[ap].shift) + continue; + shift = mmu_psize_defs[ap].shift - LP_SHIFT; + if (shift <= 0) + continue; /* should never happen */ + /* + * For page sizes less than 1MB, this loop + * replicates the entry for all possible values + * of the rrrr bits. + */ + while (penc < (1 << LP_BITS)) { + hpte_page_sizes[penc] = (ap << 4) | bp; + penc += 1 << shift; + } + } + } +} + +static void __init htab_init_page_sizes(void) +{ + init_hpte_page_sizes(); + + if (!debug_pagealloc_enabled()) { + /* + * Pick a size for the linear mapping. Currently, we only + * support 16M, 1M and 4K which is the default + */ + if (mmu_psize_defs[MMU_PAGE_16M].shift) + mmu_linear_psize = MMU_PAGE_16M; + else if (mmu_psize_defs[MMU_PAGE_1M].shift) + mmu_linear_psize = MMU_PAGE_1M; + } + +#ifdef CONFIG_PPC_64K_PAGES + /* + * Pick a size for the ordinary pages. Default is 4K, we support + * 64K for user mappings and vmalloc if supported by the processor. + * We only use 64k for ioremap if the processor + * (and firmware) support cache-inhibited large pages. + * If not, we use 4k and set mmu_ci_restrictions so that + * hash_page knows to switch processes that use cache-inhibited + * mappings to 4k pages. + */ + if (mmu_psize_defs[MMU_PAGE_64K].shift) { + mmu_virtual_psize = MMU_PAGE_64K; + mmu_vmalloc_psize = MMU_PAGE_64K; + if (mmu_linear_psize == MMU_PAGE_4K) + mmu_linear_psize = MMU_PAGE_64K; + if (mmu_has_feature(MMU_FTR_CI_LARGE_PAGE)) { + /* + * When running on pSeries using 64k pages for ioremap + * would stop us accessing the HEA ethernet. So if we + * have the chance of ever seeing one, stay at 4k. 
+ */ + if (!might_have_hea()) + mmu_io_psize = MMU_PAGE_64K; + } else + mmu_ci_restrictions = 1; + } +#endif /* CONFIG_PPC_64K_PAGES */ + +#ifdef CONFIG_SPARSEMEM_VMEMMAP + /* + * We try to use 16M pages for vmemmap if that is supported + * and we have at least 1G of RAM at boot + */ + if (mmu_psize_defs[MMU_PAGE_16M].shift && + memblock_phys_mem_size() >= 0x40000000) + mmu_vmemmap_psize = MMU_PAGE_16M; + else if (mmu_psize_defs[MMU_PAGE_64K].shift) + mmu_vmemmap_psize = MMU_PAGE_64K; + else + mmu_vmemmap_psize = MMU_PAGE_4K; +#endif /* CONFIG_SPARSEMEM_VMEMMAP */ + + printk(KERN_DEBUG "Page orders: linear mapping = %d, " + "virtual = %d, io = %d" +#ifdef CONFIG_SPARSEMEM_VMEMMAP + ", vmemmap = %d" +#endif + "\n", + mmu_psize_defs[mmu_linear_psize].shift, + mmu_psize_defs[mmu_virtual_psize].shift, + mmu_psize_defs[mmu_io_psize].shift +#ifdef CONFIG_SPARSEMEM_VMEMMAP + ,mmu_psize_defs[mmu_vmemmap_psize].shift +#endif + ); +} + +static int __init htab_dt_scan_pftsize(unsigned long node, + const char *uname, int depth, + void *data) +{ + const char *type = of_get_flat_dt_prop(node, "device_type", NULL); + const __be32 *prop; + + /* We are scanning "cpu" nodes only */ + if (type == NULL || strcmp(type, "cpu") != 0) + return 0; + + prop = of_get_flat_dt_prop(node, "ibm,pft-size", NULL); + if (prop != NULL) { + /* pft_size[0] is the NUMA CEC cookie */ + ppc64_pft_size = be32_to_cpu(prop[1]); + return 1; + } + return 0; +} + +unsigned htab_shift_for_mem_size(unsigned long mem_size) +{ + unsigned memshift = __ilog2(mem_size); + unsigned pshift = mmu_psize_defs[mmu_virtual_psize].shift; + unsigned pteg_shift; + + /* round mem_size up to next power of 2 */ + if ((1UL << memshift) < mem_size) + memshift += 1; + + /* aim for 2 pages / pteg */ + pteg_shift = memshift - (pshift + 1); + + /* + * 2^11 PTEGS of 128 bytes each, ie. 2^18 bytes is the minimum htab + * size permitted by the architecture. + */ + return max(pteg_shift + 7, 18U); +} + +static unsigned long __init htab_get_table_size(void) +{ + /* + * If hash size isn't already provided by the platform, we try to + * retrieve it from the device-tree. 
If it's not there neither, we + * calculate it now based on the total RAM size + */ + if (ppc64_pft_size == 0) + of_scan_flat_dt(htab_dt_scan_pftsize, NULL); + if (ppc64_pft_size) + return 1UL << ppc64_pft_size; + + return 1UL << htab_shift_for_mem_size(memblock_phys_mem_size()); +} + +#ifdef CONFIG_MEMORY_HOTPLUG +int resize_hpt_for_hotplug(unsigned long new_mem_size) +{ + unsigned target_hpt_shift; + + if (!mmu_hash_ops.resize_hpt) + return 0; + + target_hpt_shift = htab_shift_for_mem_size(new_mem_size); + + /* + * To avoid lots of HPT resizes if memory size is fluctuating + * across a boundary, we deliberately have some hysterisis + * here: we immediately increase the HPT size if the target + * shift exceeds the current shift, but we won't attempt to + * reduce unless the target shift is at least 2 below the + * current shift + */ + if (target_hpt_shift > ppc64_pft_size || + target_hpt_shift < ppc64_pft_size - 1) + return mmu_hash_ops.resize_hpt(target_hpt_shift); + + return 0; +} + +int hash__create_section_mapping(unsigned long start, unsigned long end, int nid) +{ + int rc; + + if (end >= H_VMALLOC_START) { + pr_warn("Outside the supported range\n"); + return -1; + } + + rc = htab_bolt_mapping(start, end, __pa(start), + pgprot_val(PAGE_KERNEL), mmu_linear_psize, + mmu_kernel_ssize); + + if (rc < 0) { + int rc2 = htab_remove_mapping(start, end, mmu_linear_psize, + mmu_kernel_ssize); + BUG_ON(rc2 && (rc2 != -ENOENT)); + } + return rc; +} + +int hash__remove_section_mapping(unsigned long start, unsigned long end) +{ + int rc = htab_remove_mapping(start, end, mmu_linear_psize, + mmu_kernel_ssize); + WARN_ON(rc < 0); + return rc; +} +#endif /* CONFIG_MEMORY_HOTPLUG */ + +static void __init hash_init_partition_table(phys_addr_t hash_table, + unsigned long htab_size) +{ + mmu_partition_table_init(); + + /* + * PS field (VRMA page size) is not used for LPID 0, hence set to 0. + * For now, UPRT is 0 and we have no segment table. + */ + htab_size = __ilog2(htab_size) - 18; + mmu_partition_table_set_entry(0, hash_table | htab_size, 0); + pr_info("Partition table %p\n", partition_tb); +} + +static void __init htab_initialize(void) +{ + unsigned long table; + unsigned long pteg_count; + unsigned long prot; + unsigned long base = 0, size = 0; + struct memblock_region *reg; + + DBG(" -> htab_initialize()\n"); + + if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) { + mmu_kernel_ssize = MMU_SEGSIZE_1T; + mmu_highuser_ssize = MMU_SEGSIZE_1T; + printk(KERN_INFO "Using 1TB segments\n"); + } + + /* + * Calculate the required size of the htab. We want the number of + * PTEGs to equal one half the number of real pages. + */ + htab_size_bytes = htab_get_table_size(); + pteg_count = htab_size_bytes >> 7; + + htab_hash_mask = pteg_count - 1; + + if (firmware_has_feature(FW_FEATURE_LPAR) || + firmware_has_feature(FW_FEATURE_PS3_LV1)) { + /* Using a hypervisor which owns the htab */ + htab_address = NULL; + _SDR1 = 0; + /* + * On POWER9, we need to do a H_REGISTER_PROC_TBL hcall + * to inform the hypervisor that we wish to use the HPT. + */ + if (cpu_has_feature(CPU_FTR_ARCH_300)) + register_process_table(0, 0, 0); +#ifdef CONFIG_FA_DUMP + /* + * If firmware assisted dump is active firmware preserves + * the contents of htab along with entire partition memory. + * Clear the htab if firmware assisted dump is active so + * that we dont end up using old mappings. 
+ */ + if (is_fadump_active() && mmu_hash_ops.hpte_clear_all) + mmu_hash_ops.hpte_clear_all(); +#endif + } else { + unsigned long limit = MEMBLOCK_ALLOC_ANYWHERE; + +#ifdef CONFIG_PPC_CELL + /* + * Cell may require the hash table down low when using the + * Axon IOMMU in order to fit the dynamic region over it, see + * comments in cell/iommu.c + */ + if (fdt_subnode_offset(initial_boot_params, 0, "axon") > 0) { + limit = 0x80000000; + pr_info("Hash table forced below 2G for Axon IOMMU\n"); + } +#endif /* CONFIG_PPC_CELL */ + + table = memblock_phys_alloc_range(htab_size_bytes, + htab_size_bytes, + 0, limit); + if (!table) + panic("ERROR: Failed to allocate %pa bytes below %pa\n", + &htab_size_bytes, &limit); + + DBG("Hash table allocated at %lx, size: %lx\n", table, + htab_size_bytes); + + htab_address = __va(table); + + /* htab absolute addr + encoded htabsize */ + _SDR1 = table + __ilog2(htab_size_bytes) - 18; + + /* Initialize the HPT with no entries */ + memset((void *)table, 0, htab_size_bytes); + + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + /* Set SDR1 */ + mtspr(SPRN_SDR1, _SDR1); + else + hash_init_partition_table(table, htab_size_bytes); + } + + prot = pgprot_val(PAGE_KERNEL); + +#ifdef CONFIG_DEBUG_PAGEALLOC + if (debug_pagealloc_enabled()) { + linear_map_hash_count = memblock_end_of_DRAM() >> PAGE_SHIFT; + linear_map_hash_slots = memblock_alloc_try_nid( + linear_map_hash_count, 1, MEMBLOCK_LOW_LIMIT, + ppc64_rma_size, NUMA_NO_NODE); + if (!linear_map_hash_slots) + panic("%s: Failed to allocate %lu bytes max_addr=%pa\n", + __func__, linear_map_hash_count, &ppc64_rma_size); + } +#endif /* CONFIG_DEBUG_PAGEALLOC */ + + /* create bolted the linear mapping in the hash table */ + for_each_memblock(memory, reg) { + base = (unsigned long)__va(reg->base); + size = reg->size; + + DBG("creating mapping for region: %lx..%lx (prot: %lx)\n", + base, size, prot); + + if ((base + size) >= H_VMALLOC_START) { + pr_warn("Outside the supported range\n"); + continue; + } + + BUG_ON(htab_bolt_mapping(base, base + size, __pa(base), + prot, mmu_linear_psize, mmu_kernel_ssize)); + } + memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); + + /* + * If we have a memory_limit and we've allocated TCEs then we need to + * explicitly map the TCE area at the top of RAM. We also cope with the + * case that the TCEs start below memory_limit. + * tce_alloc_start/end are 16MB aligned so the mapping should work + * for either 4K or 16MB pages. + */ + if (tce_alloc_start) { + tce_alloc_start = (unsigned long)__va(tce_alloc_start); + tce_alloc_end = (unsigned long)__va(tce_alloc_end); + + if (base + size >= tce_alloc_start) + tce_alloc_start = base + size + 1; + + BUG_ON(htab_bolt_mapping(tce_alloc_start, tce_alloc_end, + __pa(tce_alloc_start), prot, + mmu_linear_psize, mmu_kernel_ssize)); + } + + + DBG(" <- htab_initialize()\n"); +} +#undef KB +#undef MB + +void __init hash__early_init_devtree(void) +{ + /* Initialize segment sizes */ + of_scan_flat_dt(htab_dt_scan_seg_sizes, NULL); + + /* Initialize page sizes */ + htab_scan_page_sizes(); +} + +struct hash_mm_context init_hash_mm_context; +void __init hash__early_init_mmu(void) +{ +#ifndef CONFIG_PPC_64K_PAGES + /* + * We have code in __hash_page_4K() and elsewhere, which assumes it can + * do the following: + * new_pte |= (slot << H_PAGE_F_GIX_SHIFT) & (H_PAGE_F_SECOND | H_PAGE_F_GIX); + * + * Where the slot number is between 0-15, and values of 8-15 indicate + * the secondary bucket. 
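+	 * (Bit 3 of that slot number selects the secondary hash bucket and
+	 * bits 0-2 index the HPTE within its group of 8, matching the
+	 * _PTEIDX_SECONDARY and _PTEIDX_GROUP_IX handling in
+	 * pte_get_hash_gslot() below.)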
For that code to work H_PAGE_F_SECOND and + * H_PAGE_F_GIX must occupy four contiguous bits in the PTE, and + * H_PAGE_F_SECOND must be placed above H_PAGE_F_GIX. Assert that here + * with a BUILD_BUG_ON(). + */ + BUILD_BUG_ON(H_PAGE_F_SECOND != (1ul << (H_PAGE_F_GIX_SHIFT + 3))); +#endif /* CONFIG_PPC_64K_PAGES */ + + htab_init_page_sizes(); + + /* + * initialize page table size + */ + __pte_frag_nr = H_PTE_FRAG_NR; + __pte_frag_size_shift = H_PTE_FRAG_SIZE_SHIFT; + __pmd_frag_nr = H_PMD_FRAG_NR; + __pmd_frag_size_shift = H_PMD_FRAG_SIZE_SHIFT; + + __pte_index_size = H_PTE_INDEX_SIZE; + __pmd_index_size = H_PMD_INDEX_SIZE; + __pud_index_size = H_PUD_INDEX_SIZE; + __pgd_index_size = H_PGD_INDEX_SIZE; + __pud_cache_index = H_PUD_CACHE_INDEX; + __pte_table_size = H_PTE_TABLE_SIZE; + __pmd_table_size = H_PMD_TABLE_SIZE; + __pud_table_size = H_PUD_TABLE_SIZE; + __pgd_table_size = H_PGD_TABLE_SIZE; + /* + * 4k use hugepd format, so for hash set then to + * zero + */ + __pmd_val_bits = HASH_PMD_VAL_BITS; + __pud_val_bits = HASH_PUD_VAL_BITS; + __pgd_val_bits = HASH_PGD_VAL_BITS; + + __kernel_virt_start = H_KERN_VIRT_START; + __vmalloc_start = H_VMALLOC_START; + __vmalloc_end = H_VMALLOC_END; + __kernel_io_start = H_KERN_IO_START; + __kernel_io_end = H_KERN_IO_END; + vmemmap = (struct page *)H_VMEMMAP_START; + ioremap_bot = IOREMAP_BASE; + +#ifdef CONFIG_PCI + pci_io_base = ISA_IO_BASE; +#endif + + /* Select appropriate backend */ + if (firmware_has_feature(FW_FEATURE_PS3_LV1)) + ps3_early_mm_init(); + else if (firmware_has_feature(FW_FEATURE_LPAR)) + hpte_init_pseries(); + else if (IS_ENABLED(CONFIG_PPC_NATIVE)) + hpte_init_native(); + + if (!mmu_hash_ops.hpte_insert) + panic("hash__early_init_mmu: No MMU hash ops defined!\n"); + + /* + * Initialize the MMU Hash table and create the linear mapping + * of memory. Has to be done before SLB initialization as this is + * currently where the page size encoding is obtained. 
+ */ + htab_initialize(); + + init_mm.context.hash_context = &init_hash_mm_context; + init_mm.context.hash_context->slb_addr_limit = DEFAULT_MAP_WINDOW_USER64; + + pr_info("Initializing hash mmu with SLB\n"); + /* Initialize SLB management */ + slb_initialize(); + + if (cpu_has_feature(CPU_FTR_ARCH_206) + && cpu_has_feature(CPU_FTR_HVMODE)) + tlbiel_all(); +} + +#ifdef CONFIG_SMP +void hash__early_init_mmu_secondary(void) +{ + /* Initialize hash table for that CPU */ + if (!firmware_has_feature(FW_FEATURE_LPAR)) { + + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + mtspr(SPRN_SDR1, _SDR1); + else + mtspr(SPRN_PTCR, + __pa(partition_tb) | (PATB_SIZE_SHIFT - 12)); + } + /* Initialize SLB */ + slb_initialize(); + + if (cpu_has_feature(CPU_FTR_ARCH_206) + && cpu_has_feature(CPU_FTR_HVMODE)) + tlbiel_all(); +} +#endif /* CONFIG_SMP */ + +/* + * Called by asm hashtable.S for doing lazy icache flush + */ +unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap) +{ + struct page *page; + + if (!pfn_valid(pte_pfn(pte))) + return pp; + + page = pte_page(pte); + + /* page is dirty */ + if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) { + if (trap == 0x400) { + flush_dcache_icache_page(page); + set_bit(PG_arch_1, &page->flags); + } else + pp |= HPTE_R_N; + } + return pp; +} + +#ifdef CONFIG_PPC_MM_SLICES +static unsigned int get_paca_psize(unsigned long addr) +{ + unsigned char *psizes; + unsigned long index, mask_index; + + if (addr < SLICE_LOW_TOP) { + psizes = get_paca()->mm_ctx_low_slices_psize; + index = GET_LOW_SLICE_INDEX(addr); + } else { + psizes = get_paca()->mm_ctx_high_slices_psize; + index = GET_HIGH_SLICE_INDEX(addr); + } + mask_index = index & 0x1; + return (psizes[index >> 1] >> (mask_index * 4)) & 0xF; +} + +#else +unsigned int get_paca_psize(unsigned long addr) +{ + return get_paca()->mm_ctx_user_psize; +} +#endif + +/* + * Demote a segment to using 4k pages. + * For now this makes the whole process use 4k pages. + */ +#ifdef CONFIG_PPC_64K_PAGES +void demote_segment_4k(struct mm_struct *mm, unsigned long addr) +{ + if (get_slice_psize(mm, addr) == MMU_PAGE_4K) + return; + slice_set_range_psize(mm, addr, 1, MMU_PAGE_4K); + copro_flush_all_slbs(mm); + if ((get_paca_psize(addr) != MMU_PAGE_4K) && (current->mm == mm)) { + + copy_mm_to_paca(mm); + slb_flush_and_restore_bolted(); + } +} +#endif /* CONFIG_PPC_64K_PAGES */ + +#ifdef CONFIG_PPC_SUBPAGE_PROT +/* + * This looks up a 2-bit protection code for a 4k subpage of a 64k page. + * Userspace sets the subpage permissions using the subpage_prot system call. + * + * Result is 0: full permissions, _PAGE_RW: read-only, + * _PAGE_RWX: no access. + */ +static int subpage_protection(struct mm_struct *mm, unsigned long ea) +{ + struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context); + u32 spp = 0; + u32 **sbpm, *sbpp; + + if (!spt) + return 0; + + if (ea >= spt->maxaddr) + return 0; + if (ea < 0x100000000UL) { + /* addresses below 4GB use spt->low_prot */ + sbpm = spt->low_prot; + } else { + sbpm = spt->protptrs[ea >> SBP_L3_SHIFT]; + if (!sbpm) + return 0; + } + sbpp = sbpm[(ea >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1)]; + if (!sbpp) + return 0; + spp = sbpp[(ea >> PAGE_SHIFT) & (SBP_L1_COUNT - 1)]; + + /* extract 2-bit bitfield for this 4k subpage */ + spp >>= 30 - 2 * ((ea >> 12) & 0xf); + + /* + * 0 -> full premission + * 1 -> Read only + * 2 -> no access. + * We return the flag that need to be cleared. + */ + spp = ((spp & 2) ? _PAGE_RWX : 0) | ((spp & 1) ? 
_PAGE_WRITE : 0); + return spp; +} + +#else /* CONFIG_PPC_SUBPAGE_PROT */ +static inline int subpage_protection(struct mm_struct *mm, unsigned long ea) +{ + return 0; +} +#endif + +void hash_failure_debug(unsigned long ea, unsigned long access, + unsigned long vsid, unsigned long trap, + int ssize, int psize, int lpsize, unsigned long pte) +{ + if (!printk_ratelimit()) + return; + pr_info("mm: Hashing failure ! EA=0x%lx access=0x%lx current=%s\n", + ea, access, current->comm); + pr_info(" trap=0x%lx vsid=0x%lx ssize=%d base psize=%d psize %d pte=0x%lx\n", + trap, vsid, ssize, psize, lpsize, pte); +} + +static void check_paca_psize(unsigned long ea, struct mm_struct *mm, + int psize, bool user_region) +{ + if (user_region) { + if (psize != get_paca_psize(ea)) { + copy_mm_to_paca(mm); + slb_flush_and_restore_bolted(); + } + } else if (get_paca()->vmalloc_sllp != + mmu_psize_defs[mmu_vmalloc_psize].sllp) { + get_paca()->vmalloc_sllp = + mmu_psize_defs[mmu_vmalloc_psize].sllp; + slb_vmalloc_update(); + } +} + +/* + * Result code is: + * 0 - handled + * 1 - normal page fault + * -1 - critical hash insertion error + * -2 - access not permitted by subpage protection mechanism + */ +int hash_page_mm(struct mm_struct *mm, unsigned long ea, + unsigned long access, unsigned long trap, + unsigned long flags) +{ + bool is_thp; + enum ctx_state prev_state = exception_enter(); + pgd_t *pgdir; + unsigned long vsid; + pte_t *ptep; + unsigned hugeshift; + int rc, user_region = 0; + int psize, ssize; + + DBG_LOW("hash_page(ea=%016lx, access=%lx, trap=%lx\n", + ea, access, trap); + trace_hash_fault(ea, access, trap); + + /* Get region & vsid */ + switch (get_region_id(ea)) { + case USER_REGION_ID: + user_region = 1; + if (! mm) { + DBG_LOW(" user region with no mm !\n"); + rc = 1; + goto bail; + } + psize = get_slice_psize(mm, ea); + ssize = user_segment_size(ea); + vsid = get_user_vsid(&mm->context, ea, ssize); + break; + case VMALLOC_REGION_ID: + vsid = get_kernel_vsid(ea, mmu_kernel_ssize); + psize = mmu_vmalloc_psize; + ssize = mmu_kernel_ssize; + break; + + case IO_REGION_ID: + vsid = get_kernel_vsid(ea, mmu_kernel_ssize); + psize = mmu_io_psize; + ssize = mmu_kernel_ssize; + break; + default: + /* + * Not a valid range + * Send the problem up to do_page_fault() + */ + rc = 1; + goto bail; + } + DBG_LOW(" mm=%p, mm->pgdir=%p, vsid=%016lx\n", mm, mm->pgd, vsid); + + /* Bad address. */ + if (!vsid) { + DBG_LOW("Bad address!\n"); + rc = 1; + goto bail; + } + /* Get pgdir */ + pgdir = mm->pgd; + if (pgdir == NULL) { + rc = 1; + goto bail; + } + + /* Check CPU locality */ + if (user_region && mm_is_thread_local(mm)) + flags |= HPTE_LOCAL_UPDATE; + +#ifndef CONFIG_PPC_64K_PAGES + /* + * If we use 4K pages and our psize is not 4K, then we might + * be hitting a special driver mapping, and need to align the + * address before we fetch the PTE. + * + * It could also be a hugepage mapping, in which case this is + * not necessary, but it's not harmful, either. 
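+	 *
+	 * The masking below simply rounds ea down to the start of the
+	 * large page for that psize.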
+ */ + if (psize != MMU_PAGE_4K) + ea &= ~((1ul << mmu_psize_defs[psize].shift) - 1); +#endif /* CONFIG_PPC_64K_PAGES */ + + /* Get PTE and page size from page tables */ + ptep = find_linux_pte(pgdir, ea, &is_thp, &hugeshift); + if (ptep == NULL || !pte_present(*ptep)) { + DBG_LOW(" no PTE !\n"); + rc = 1; + goto bail; + } + + /* Add _PAGE_PRESENT to the required access perm */ + access |= _PAGE_PRESENT; + + /* + * Pre-check access permissions (will be re-checked atomically + * in __hash_page_XX but this pre-check is a fast path + */ + if (!check_pte_access(access, pte_val(*ptep))) { + DBG_LOW(" no access !\n"); + rc = 1; + goto bail; + } + + if (hugeshift) { + if (is_thp) + rc = __hash_page_thp(ea, access, vsid, (pmd_t *)ptep, + trap, flags, ssize, psize); +#ifdef CONFIG_HUGETLB_PAGE + else + rc = __hash_page_huge(ea, access, vsid, ptep, trap, + flags, ssize, hugeshift, psize); +#else + else { + /* + * if we have hugeshift, and is not transhuge with + * hugetlb disabled, something is really wrong. + */ + rc = 1; + WARN_ON(1); + } +#endif + if (current->mm == mm) + check_paca_psize(ea, mm, psize, user_region); + + goto bail; + } + +#ifndef CONFIG_PPC_64K_PAGES + DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep)); +#else + DBG_LOW(" i-pte: %016lx %016lx\n", pte_val(*ptep), + pte_val(*(ptep + PTRS_PER_PTE))); +#endif + /* Do actual hashing */ +#ifdef CONFIG_PPC_64K_PAGES + /* If H_PAGE_4K_PFN is set, make sure this is a 4k segment */ + if ((pte_val(*ptep) & H_PAGE_4K_PFN) && psize == MMU_PAGE_64K) { + demote_segment_4k(mm, ea); + psize = MMU_PAGE_4K; + } + + /* + * If this PTE is non-cacheable and we have restrictions on + * using non cacheable large pages, then we switch to 4k + */ + if (mmu_ci_restrictions && psize == MMU_PAGE_64K && pte_ci(*ptep)) { + if (user_region) { + demote_segment_4k(mm, ea); + psize = MMU_PAGE_4K; + } else if (ea < VMALLOC_END) { + /* + * some driver did a non-cacheable mapping + * in vmalloc space, so switch vmalloc + * to 4k pages + */ + printk(KERN_ALERT "Reducing vmalloc segment " + "to 4kB pages because of " + "non-cacheable mapping\n"); + psize = mmu_vmalloc_psize = MMU_PAGE_4K; + copro_flush_all_slbs(mm); + } + } + +#endif /* CONFIG_PPC_64K_PAGES */ + + if (current->mm == mm) + check_paca_psize(ea, mm, psize, user_region); + +#ifdef CONFIG_PPC_64K_PAGES + if (psize == MMU_PAGE_64K) + rc = __hash_page_64K(ea, access, vsid, ptep, trap, + flags, ssize); + else +#endif /* CONFIG_PPC_64K_PAGES */ + { + int spp = subpage_protection(mm, ea); + if (access & spp) + rc = -2; + else + rc = __hash_page_4K(ea, access, vsid, ptep, trap, + flags, ssize, spp); + } + + /* + * Dump some info in case of hash insertion failure, they should + * never happen so it is really useful to know if/when they do + */ + if (rc == -1) + hash_failure_debug(ea, access, vsid, trap, ssize, psize, + psize, pte_val(*ptep)); +#ifndef CONFIG_PPC_64K_PAGES + DBG_LOW(" o-pte: %016lx\n", pte_val(*ptep)); +#else + DBG_LOW(" o-pte: %016lx %016lx\n", pte_val(*ptep), + pte_val(*(ptep + PTRS_PER_PTE))); +#endif + DBG_LOW(" -> rc=%d\n", rc); + +bail: + exception_exit(prev_state); + return rc; +} +EXPORT_SYMBOL_GPL(hash_page_mm); + +int hash_page(unsigned long ea, unsigned long access, unsigned long trap, + unsigned long dsisr) +{ + unsigned long flags = 0; + struct mm_struct *mm = current->mm; + + if ((get_region_id(ea) == VMALLOC_REGION_ID) || + (get_region_id(ea) == IO_REGION_ID)) + mm = &init_mm; + + if (dsisr & DSISR_NOHPTE) + flags |= HPTE_NOHPTE_UPDATE; + + return hash_page_mm(mm, ea, access, trap, flags); 
+} +EXPORT_SYMBOL_GPL(hash_page); + +int __hash_page(unsigned long ea, unsigned long msr, unsigned long trap, + unsigned long dsisr) +{ + unsigned long access = _PAGE_PRESENT | _PAGE_READ; + unsigned long flags = 0; + struct mm_struct *mm = current->mm; + unsigned int region_id = get_region_id(ea); + + if ((region_id == VMALLOC_REGION_ID) || (region_id == IO_REGION_ID)) + mm = &init_mm; + + if (dsisr & DSISR_NOHPTE) + flags |= HPTE_NOHPTE_UPDATE; + + if (dsisr & DSISR_ISSTORE) + access |= _PAGE_WRITE; + /* + * We set _PAGE_PRIVILEGED only when + * kernel mode access kernel space. + * + * _PAGE_PRIVILEGED is NOT set + * 1) when kernel mode access user space + * 2) user space access kernel space. + */ + access |= _PAGE_PRIVILEGED; + if ((msr & MSR_PR) || (region_id == USER_REGION_ID)) + access &= ~_PAGE_PRIVILEGED; + + if (trap == 0x400) + access |= _PAGE_EXEC; + + return hash_page_mm(mm, ea, access, trap, flags); +} + +#ifdef CONFIG_PPC_MM_SLICES +static bool should_hash_preload(struct mm_struct *mm, unsigned long ea) +{ + int psize = get_slice_psize(mm, ea); + + /* We only prefault standard pages for now */ + if (unlikely(psize != mm_ctx_user_psize(&mm->context))) + return false; + + /* + * Don't prefault if subpage protection is enabled for the EA. + */ + if (unlikely((psize == MMU_PAGE_4K) && subpage_protection(mm, ea))) + return false; + + return true; +} +#else +static bool should_hash_preload(struct mm_struct *mm, unsigned long ea) +{ + return true; +} +#endif + +void hash_preload(struct mm_struct *mm, unsigned long ea, + bool is_exec, unsigned long trap) +{ + int hugepage_shift; + unsigned long vsid; + pgd_t *pgdir; + pte_t *ptep; + unsigned long flags; + int rc, ssize, update_flags = 0; + unsigned long access = _PAGE_PRESENT | _PAGE_READ | (is_exec ? _PAGE_EXEC : 0); + + BUG_ON(get_region_id(ea) != USER_REGION_ID); + + if (!should_hash_preload(mm, ea)) + return; + + DBG_LOW("hash_preload(mm=%p, mm->pgdir=%p, ea=%016lx, access=%lx," + " trap=%lx\n", mm, mm->pgd, ea, access, trap); + + /* Get Linux PTE if available */ + pgdir = mm->pgd; + if (pgdir == NULL) + return; + + /* Get VSID */ + ssize = user_segment_size(ea); + vsid = get_user_vsid(&mm->context, ea, ssize); + if (!vsid) + return; + /* + * Hash doesn't like irqs. Walking linux page table with irq disabled + * saves us from holding multiple locks. + */ + local_irq_save(flags); + + /* + * THP pages use update_mmu_cache_pmd. We don't do + * hash preload there. Hence can ignore THP here + */ + ptep = find_current_mm_pte(pgdir, ea, NULL, &hugepage_shift); + if (!ptep) + goto out_exit; + + WARN_ON(hugepage_shift); +#ifdef CONFIG_PPC_64K_PAGES + /* If either H_PAGE_4K_PFN or cache inhibited is set (and we are on + * a 64K kernel), then we don't preload, hash_page() will take + * care of it once we actually try to access the page. + * That way we don't have to duplicate all of the logic for segment + * page size demotion here + */ + if ((pte_val(*ptep) & H_PAGE_4K_PFN) || pte_ci(*ptep)) + goto out_exit; +#endif /* CONFIG_PPC_64K_PAGES */ + + /* Is that local to this CPU ? 
*/ + if (mm_is_thread_local(mm)) + update_flags |= HPTE_LOCAL_UPDATE; + + /* Hash it in */ +#ifdef CONFIG_PPC_64K_PAGES + if (mm_ctx_user_psize(&mm->context) == MMU_PAGE_64K) + rc = __hash_page_64K(ea, access, vsid, ptep, trap, + update_flags, ssize); + else +#endif /* CONFIG_PPC_64K_PAGES */ + rc = __hash_page_4K(ea, access, vsid, ptep, trap, update_flags, + ssize, subpage_protection(mm, ea)); + + /* Dump some info in case of hash insertion failure, they should + * never happen so it is really useful to know if/when they do + */ + if (rc == -1) + hash_failure_debug(ea, access, vsid, trap, ssize, + mm_ctx_user_psize(&mm->context), + mm_ctx_user_psize(&mm->context), + pte_val(*ptep)); +out_exit: + local_irq_restore(flags); +} + +#ifdef CONFIG_PPC_MEM_KEYS +/* + * Return the protection key associated with the given address and the + * mm_struct. + */ +u16 get_mm_addr_key(struct mm_struct *mm, unsigned long address) +{ + pte_t *ptep; + u16 pkey = 0; + unsigned long flags; + + if (!mm || !mm->pgd) + return 0; + + local_irq_save(flags); + ptep = find_linux_pte(mm->pgd, address, NULL, NULL); + if (ptep) + pkey = pte_to_pkey_bits(pte_val(READ_ONCE(*ptep))); + local_irq_restore(flags); + + return pkey; +} +#endif /* CONFIG_PPC_MEM_KEYS */ + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +static inline void tm_flush_hash_page(int local) +{ + /* + * Transactions are not aborted by tlbiel, only tlbie. Without, syncing a + * page back to a block device w/PIO could pick up transactional data + * (bad!) so we force an abort here. Before the sync the page will be + * made read-only, which will flush_hash_page. BIG ISSUE here: if the + * kernel uses a page from userspace without unmapping it first, it may + * see the speculated version. + */ + if (local && cpu_has_feature(CPU_FTR_TM) && current->thread.regs && + MSR_TM_ACTIVE(current->thread.regs->msr)) { + tm_enable(); + tm_abort(TM_CAUSE_TLBI); + } +} +#else +static inline void tm_flush_hash_page(int local) +{ +} +#endif + +/* + * Return the global hash slot, corresponding to the given PTE, which contains + * the HPTE. + */ +unsigned long pte_get_hash_gslot(unsigned long vpn, unsigned long shift, + int ssize, real_pte_t rpte, unsigned int subpg_index) +{ + unsigned long hash, gslot, hidx; + + hash = hpt_hash(vpn, shift, ssize); + hidx = __rpte_to_hidx(rpte, subpg_index); + if (hidx & _PTEIDX_SECONDARY) + hash = ~hash; + gslot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + gslot += hidx & _PTEIDX_GROUP_IX; + return gslot; +} + +/* + * WARNING: This is called from hash_low_64.S, if you change this prototype, + * do not forget to update the assembly call site ! 
+ */ +void flush_hash_page(unsigned long vpn, real_pte_t pte, int psize, int ssize, + unsigned long flags) +{ + unsigned long index, shift, gslot; + int local = flags & HPTE_LOCAL_UPDATE; + + DBG_LOW("flush_hash_page(vpn=%016lx)\n", vpn); + pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) { + gslot = pte_get_hash_gslot(vpn, shift, ssize, pte, index); + DBG_LOW(" sub %ld: gslot=%lx\n", index, gslot); + /* + * We use same base page size and actual psize, because we don't + * use these functions for hugepage + */ + mmu_hash_ops.hpte_invalidate(gslot, vpn, psize, psize, + ssize, local); + } pte_iterate_hashed_end(); + + tm_flush_hash_page(local); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +void flush_hash_hugepage(unsigned long vsid, unsigned long addr, + pmd_t *pmdp, unsigned int psize, int ssize, + unsigned long flags) +{ + int i, max_hpte_count, valid; + unsigned long s_addr; + unsigned char *hpte_slot_array; + unsigned long hidx, shift, vpn, hash, slot; + int local = flags & HPTE_LOCAL_UPDATE; + + s_addr = addr & HPAGE_PMD_MASK; + hpte_slot_array = get_hpte_slot_array(pmdp); + /* + * IF we try to do a HUGE PTE update after a withdraw is done. + * we will find the below NULL. This happens when we do + * split_huge_page_pmd + */ + if (!hpte_slot_array) + return; + + if (mmu_hash_ops.hugepage_invalidate) { + mmu_hash_ops.hugepage_invalidate(vsid, s_addr, hpte_slot_array, + psize, ssize, local); + goto tm_abort; + } + /* + * No bluk hpte removal support, invalidate each entry + */ + shift = mmu_psize_defs[psize].shift; + max_hpte_count = HPAGE_PMD_SIZE >> shift; + for (i = 0; i < max_hpte_count; i++) { + /* + * 8 bits per each hpte entries + * 000| [ secondary group (one bit) | hidx (3 bits) | valid bit] + */ + valid = hpte_valid(hpte_slot_array, i); + if (!valid) + continue; + hidx = hpte_hash_index(hpte_slot_array, i); + + /* get the vpn */ + addr = s_addr + (i * (1ul << shift)); + vpn = hpt_vpn(addr, vsid, ssize); + hash = hpt_hash(vpn, shift, ssize); + if (hidx & _PTEIDX_SECONDARY) + hash = ~hash; + + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + slot += hidx & _PTEIDX_GROUP_IX; + mmu_hash_ops.hpte_invalidate(slot, vpn, psize, + MMU_PAGE_16M, ssize, local); + } +tm_abort: + tm_flush_hash_page(local); +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +void flush_hash_range(unsigned long number, int local) +{ + if (mmu_hash_ops.flush_hash_range) + mmu_hash_ops.flush_hash_range(number, local); + else { + int i; + struct ppc64_tlb_batch *batch = + this_cpu_ptr(&ppc64_tlb_batch); + + for (i = 0; i < number; i++) + flush_hash_page(batch->vpn[i], batch->pte[i], + batch->psize, batch->ssize, local); + } +} + +/* + * low_hash_fault is called when we the low level hash code failed + * to instert a PTE due to an hypervisor error + */ +void low_hash_fault(struct pt_regs *regs, unsigned long address, int rc) +{ + enum ctx_state prev_state = exception_enter(); + + if (user_mode(regs)) { +#ifdef CONFIG_PPC_SUBPAGE_PROT + if (rc == -2) + _exception(SIGSEGV, regs, SEGV_ACCERR, address); + else +#endif + _exception(SIGBUS, regs, BUS_ADRERR, address); + } else + bad_page_fault(regs, address, SIGBUS); + + exception_exit(prev_state); +} + +long hpte_insert_repeating(unsigned long hash, unsigned long vpn, + unsigned long pa, unsigned long rflags, + unsigned long vflags, int psize, int ssize) +{ + unsigned long hpte_group; + long slot; + +repeat: + hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; + + /* Insert into the hash table, primary slot */ + slot = mmu_hash_ops.hpte_insert(hpte_group, 
vpn, pa, rflags, vflags, + psize, psize, ssize); + + /* Primary is full, try the secondary */ + if (unlikely(slot == -1)) { + hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP; + slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, + vflags | HPTE_V_SECONDARY, + psize, psize, ssize); + if (slot == -1) { + if (mftb() & 0x1) + hpte_group = (hash & htab_hash_mask) * + HPTES_PER_GROUP; + + mmu_hash_ops.hpte_remove(hpte_group); + goto repeat; + } + } + + return slot; +} + +#ifdef CONFIG_DEBUG_PAGEALLOC +static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi) +{ + unsigned long hash; + unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize); + unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize); + unsigned long mode = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL)); + long ret; + + hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize); + + /* Don't create HPTE entries for bad address */ + if (!vsid) + return; + + ret = hpte_insert_repeating(hash, vpn, __pa(vaddr), mode, + HPTE_V_BOLTED, + mmu_linear_psize, mmu_kernel_ssize); + + BUG_ON (ret < 0); + spin_lock(&linear_map_hash_lock); + BUG_ON(linear_map_hash_slots[lmi] & 0x80); + linear_map_hash_slots[lmi] = ret | 0x80; + spin_unlock(&linear_map_hash_lock); +} + +static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi) +{ + unsigned long hash, hidx, slot; + unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize); + unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize); + + hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize); + spin_lock(&linear_map_hash_lock); + BUG_ON(!(linear_map_hash_slots[lmi] & 0x80)); + hidx = linear_map_hash_slots[lmi] & 0x7f; + linear_map_hash_slots[lmi] = 0; + spin_unlock(&linear_map_hash_lock); + if (hidx & _PTEIDX_SECONDARY) + hash = ~hash; + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + slot += hidx & _PTEIDX_GROUP_IX; + mmu_hash_ops.hpte_invalidate(slot, vpn, mmu_linear_psize, + mmu_linear_psize, + mmu_kernel_ssize, 0); +} + +void __kernel_map_pages(struct page *page, int numpages, int enable) +{ + unsigned long flags, vaddr, lmi; + int i; + + local_irq_save(flags); + for (i = 0; i < numpages; i++, page++) { + vaddr = (unsigned long)page_address(page); + lmi = __pa(vaddr) >> PAGE_SHIFT; + if (lmi >= linear_map_hash_count) + continue; + if (enable) + kernel_map_linear_page(vaddr, lmi); + else + kernel_unmap_linear_page(vaddr, lmi); + } + local_irq_restore(flags); +} +#endif /* CONFIG_DEBUG_PAGEALLOC */ + +void hash__setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size) +{ + /* + * We don't currently support the first MEMBLOCK not mapping 0 + * physical on those processors + */ + BUG_ON(first_memblock_base != 0); + + /* + * On virtualized systems the first entry is our RMA region aka VRMA, + * non-virtualized 64-bit hash MMU systems don't have a limitation + * on real mode access. + * + * For guests on platforms before POWER9, we clamp the it limit to 1G + * to avoid some funky things such as RTAS bugs etc... 
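+	 * (that is the min_t() clamp against 0x40000000 below)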
+ */ + if (!early_cpu_has_feature(CPU_FTR_HVMODE)) { + ppc64_rma_size = first_memblock_size; + if (!early_cpu_has_feature(CPU_FTR_ARCH_300)) + ppc64_rma_size = min_t(u64, ppc64_rma_size, 0x40000000); + + /* Finally limit subsequent allocations */ + memblock_set_current_limit(ppc64_rma_size); + } else { + ppc64_rma_size = ULONG_MAX; + } +} + +#ifdef CONFIG_DEBUG_FS + +static int hpt_order_get(void *data, u64 *val) +{ + *val = ppc64_pft_size; + return 0; +} + +static int hpt_order_set(void *data, u64 val) +{ + if (!mmu_hash_ops.resize_hpt) + return -ENODEV; + + return mmu_hash_ops.resize_hpt(val); +} + +DEFINE_DEBUGFS_ATTRIBUTE(fops_hpt_order, hpt_order_get, hpt_order_set, "%llu\n"); + +static int __init hash64_debugfs(void) +{ + if (!debugfs_create_file_unsafe("hpt_order", 0600, powerpc_debugfs_root, + NULL, &fops_hpt_order)) { + pr_err("lpar: unable to create hpt_order debugsfs file\n"); + } + + return 0; +} +machine_device_initcall(pseries, hash64_debugfs); +#endif /* CONFIG_DEBUG_FS */ diff --git a/arch/powerpc/mm/book3s64/iommu_api.c b/arch/powerpc/mm/book3s64/iommu_api.c new file mode 100644 index 000000000000..e7a9c4f6bfca --- /dev/null +++ b/arch/powerpc/mm/book3s64/iommu_api.c @@ -0,0 +1,482 @@ +/* + * IOMMU helpers in MMU context. + * + * Copyright (C) 2015 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static DEFINE_MUTEX(mem_list_mutex); + +#define MM_IOMMU_TABLE_GROUP_PAGE_DIRTY 0x1 +#define MM_IOMMU_TABLE_GROUP_PAGE_MASK ~(SZ_4K - 1) + +struct mm_iommu_table_group_mem_t { + struct list_head next; + struct rcu_head rcu; + unsigned long used; + atomic64_t mapped; + unsigned int pageshift; + u64 ua; /* userspace address */ + u64 entries; /* number of entries in hpas/hpages[] */ + /* + * in mm_iommu_get we temporarily use this to store + * struct page address. + * + * We need to convert ua to hpa in real mode. Make it + * simpler by storing physical address. + */ + union { + struct page **hpages; /* vmalloc'ed */ + phys_addr_t *hpas; + }; +#define MM_IOMMU_TABLE_INVALID_HPA ((uint64_t)-1) + u64 dev_hpa; /* Device memory base address */ +}; + +static long mm_iommu_adjust_locked_vm(struct mm_struct *mm, + unsigned long npages, bool incr) +{ + long ret = 0, locked, lock_limit; + + if (!npages) + return 0; + + down_write(&mm->mmap_sem); + + if (incr) { + locked = mm->locked_vm + npages; + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + if (locked > lock_limit && !capable(CAP_IPC_LOCK)) + ret = -ENOMEM; + else + mm->locked_vm += npages; + } else { + if (WARN_ON_ONCE(npages > mm->locked_vm)) + npages = mm->locked_vm; + mm->locked_vm -= npages; + } + + pr_debug("[%d] RLIMIT_MEMLOCK HASH64 %c%ld %ld/%ld\n", + current ? current->pid : 0, + incr ? 
'+' : '-', + npages << PAGE_SHIFT, + mm->locked_vm << PAGE_SHIFT, + rlimit(RLIMIT_MEMLOCK)); + up_write(&mm->mmap_sem); + + return ret; +} + +bool mm_iommu_preregistered(struct mm_struct *mm) +{ + return !list_empty(&mm->context.iommu_group_mem_list); +} +EXPORT_SYMBOL_GPL(mm_iommu_preregistered); + +static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, + unsigned long entries, unsigned long dev_hpa, + struct mm_iommu_table_group_mem_t **pmem) +{ + struct mm_iommu_table_group_mem_t *mem; + long i, ret, locked_entries = 0; + unsigned int pageshift; + + mutex_lock(&mem_list_mutex); + + list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, + next) { + /* Overlap? */ + if ((mem->ua < (ua + (entries << PAGE_SHIFT))) && + (ua < (mem->ua + + (mem->entries << PAGE_SHIFT)))) { + ret = -EINVAL; + goto unlock_exit; + } + + } + + if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) { + ret = mm_iommu_adjust_locked_vm(mm, entries, true); + if (ret) + goto unlock_exit; + + locked_entries = entries; + } + + mem = kzalloc(sizeof(*mem), GFP_KERNEL); + if (!mem) { + ret = -ENOMEM; + goto unlock_exit; + } + + if (dev_hpa != MM_IOMMU_TABLE_INVALID_HPA) { + mem->pageshift = __ffs(dev_hpa | (entries << PAGE_SHIFT)); + mem->dev_hpa = dev_hpa; + goto good_exit; + } + mem->dev_hpa = MM_IOMMU_TABLE_INVALID_HPA; + + /* + * For a starting point for a maximum page size calculation + * we use @ua and @entries natural alignment to allow IOMMU pages + * smaller than huge pages but still bigger than PAGE_SIZE. + */ + mem->pageshift = __ffs(ua | (entries << PAGE_SHIFT)); + mem->hpas = vzalloc(array_size(entries, sizeof(mem->hpas[0]))); + if (!mem->hpas) { + kfree(mem); + ret = -ENOMEM; + goto unlock_exit; + } + + down_read(&mm->mmap_sem); + ret = get_user_pages_longterm(ua, entries, FOLL_WRITE, mem->hpages, NULL); + up_read(&mm->mmap_sem); + if (ret != entries) { + /* free the reference taken */ + for (i = 0; i < ret; i++) + put_page(mem->hpages[i]); + + vfree(mem->hpas); + kfree(mem); + ret = -EFAULT; + goto unlock_exit; + } + + pageshift = PAGE_SHIFT; + for (i = 0; i < entries; ++i) { + struct page *page = mem->hpages[i]; + + /* + * Allow to use larger than 64k IOMMU pages. Only do that + * if we are backed by hugetlb. + */ + if ((mem->pageshift > PAGE_SHIFT) && PageHuge(page)) { + struct page *head = compound_head(page); + + pageshift = compound_order(head) + PAGE_SHIFT; + } + mem->pageshift = min(mem->pageshift, pageshift); + /* + * We don't need struct page reference any more, switch + * to physical address. 
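+		 * The low-order bits of each hpas[] entry stay clear so they
+		 * can later carry MM_IOMMU_TABLE_GROUP_PAGE_DIRTY for dirty
+		 * tracking.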
+ */ + mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT; + } + +good_exit: + ret = 0; + atomic64_set(&mem->mapped, 1); + mem->used = 1; + mem->ua = ua; + mem->entries = entries; + *pmem = mem; + + list_add_rcu(&mem->next, &mm->context.iommu_group_mem_list); + +unlock_exit: + if (locked_entries && ret) + mm_iommu_adjust_locked_vm(mm, locked_entries, false); + + mutex_unlock(&mem_list_mutex); + + return ret; +} + +long mm_iommu_new(struct mm_struct *mm, unsigned long ua, unsigned long entries, + struct mm_iommu_table_group_mem_t **pmem) +{ + return mm_iommu_do_alloc(mm, ua, entries, MM_IOMMU_TABLE_INVALID_HPA, + pmem); +} +EXPORT_SYMBOL_GPL(mm_iommu_new); + +long mm_iommu_newdev(struct mm_struct *mm, unsigned long ua, + unsigned long entries, unsigned long dev_hpa, + struct mm_iommu_table_group_mem_t **pmem) +{ + return mm_iommu_do_alloc(mm, ua, entries, dev_hpa, pmem); +} +EXPORT_SYMBOL_GPL(mm_iommu_newdev); + +static void mm_iommu_unpin(struct mm_iommu_table_group_mem_t *mem) +{ + long i; + struct page *page = NULL; + + if (!mem->hpas) + return; + + for (i = 0; i < mem->entries; ++i) { + if (!mem->hpas[i]) + continue; + + page = pfn_to_page(mem->hpas[i] >> PAGE_SHIFT); + if (!page) + continue; + + if (mem->hpas[i] & MM_IOMMU_TABLE_GROUP_PAGE_DIRTY) + SetPageDirty(page); + + put_page(page); + mem->hpas[i] = 0; + } +} + +static void mm_iommu_do_free(struct mm_iommu_table_group_mem_t *mem) +{ + + mm_iommu_unpin(mem); + vfree(mem->hpas); + kfree(mem); +} + +static void mm_iommu_free(struct rcu_head *head) +{ + struct mm_iommu_table_group_mem_t *mem = container_of(head, + struct mm_iommu_table_group_mem_t, rcu); + + mm_iommu_do_free(mem); +} + +static void mm_iommu_release(struct mm_iommu_table_group_mem_t *mem) +{ + list_del_rcu(&mem->next); + call_rcu(&mem->rcu, mm_iommu_free); +} + +long mm_iommu_put(struct mm_struct *mm, struct mm_iommu_table_group_mem_t *mem) +{ + long ret = 0; + unsigned long entries, dev_hpa; + + mutex_lock(&mem_list_mutex); + + if (mem->used == 0) { + ret = -ENOENT; + goto unlock_exit; + } + + --mem->used; + /* There are still users, exit */ + if (mem->used) + goto unlock_exit; + + /* Are there still mappings? 
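+	 * @mapped starts out at 1; the region can only be released once it
+	 * can be brought from 1 back to 0 here, i.e. no mapping is active.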
*/ + if (atomic_cmpxchg(&mem->mapped, 1, 0) != 1) { + ++mem->used; + ret = -EBUSY; + goto unlock_exit; + } + + /* @mapped became 0 so now mappings are disabled, release the region */ + entries = mem->entries; + dev_hpa = mem->dev_hpa; + mm_iommu_release(mem); + + if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) + mm_iommu_adjust_locked_vm(mm, entries, false); + +unlock_exit: + mutex_unlock(&mem_list_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(mm_iommu_put); + +struct mm_iommu_table_group_mem_t *mm_iommu_lookup(struct mm_struct *mm, + unsigned long ua, unsigned long size) +{ + struct mm_iommu_table_group_mem_t *mem, *ret = NULL; + + list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next) { + if ((mem->ua <= ua) && + (ua + size <= mem->ua + + (mem->entries << PAGE_SHIFT))) { + ret = mem; + break; + } + } + + return ret; +} +EXPORT_SYMBOL_GPL(mm_iommu_lookup); + +struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm(struct mm_struct *mm, + unsigned long ua, unsigned long size) +{ + struct mm_iommu_table_group_mem_t *mem, *ret = NULL; + + list_for_each_entry_lockless(mem, &mm->context.iommu_group_mem_list, + next) { + if ((mem->ua <= ua) && + (ua + size <= mem->ua + + (mem->entries << PAGE_SHIFT))) { + ret = mem; + break; + } + } + + return ret; +} + +struct mm_iommu_table_group_mem_t *mm_iommu_get(struct mm_struct *mm, + unsigned long ua, unsigned long entries) +{ + struct mm_iommu_table_group_mem_t *mem, *ret = NULL; + + mutex_lock(&mem_list_mutex); + + list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next) { + if ((mem->ua == ua) && (mem->entries == entries)) { + ret = mem; + ++mem->used; + break; + } + } + + mutex_unlock(&mem_list_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(mm_iommu_get); + +long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem, + unsigned long ua, unsigned int pageshift, unsigned long *hpa) +{ + const long entry = (ua - mem->ua) >> PAGE_SHIFT; + u64 *va; + + if (entry >= mem->entries) + return -EFAULT; + + if (pageshift > mem->pageshift) + return -EFAULT; + + if (!mem->hpas) { + *hpa = mem->dev_hpa + (ua - mem->ua); + return 0; + } + + va = &mem->hpas[entry]; + *hpa = (*va & MM_IOMMU_TABLE_GROUP_PAGE_MASK) | (ua & ~PAGE_MASK); + + return 0; +} +EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa); + +long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem, + unsigned long ua, unsigned int pageshift, unsigned long *hpa) +{ + const long entry = (ua - mem->ua) >> PAGE_SHIFT; + unsigned long *pa; + + if (entry >= mem->entries) + return -EFAULT; + + if (pageshift > mem->pageshift) + return -EFAULT; + + if (!mem->hpas) { + *hpa = mem->dev_hpa + (ua - mem->ua); + return 0; + } + + pa = (void *) vmalloc_to_phys(&mem->hpas[entry]); + if (!pa) + return -EFAULT; + + *hpa = (*pa & MM_IOMMU_TABLE_GROUP_PAGE_MASK) | (ua & ~PAGE_MASK); + + return 0; +} + +extern void mm_iommu_ua_mark_dirty_rm(struct mm_struct *mm, unsigned long ua) +{ + struct mm_iommu_table_group_mem_t *mem; + long entry; + void *va; + unsigned long *pa; + + mem = mm_iommu_lookup_rm(mm, ua, PAGE_SIZE); + if (!mem) + return; + + if (mem->dev_hpa != MM_IOMMU_TABLE_INVALID_HPA) + return; + + entry = (ua - mem->ua) >> PAGE_SHIFT; + va = &mem->hpas[entry]; + + pa = (void *) vmalloc_to_phys(va); + if (!pa) + return; + + *pa |= MM_IOMMU_TABLE_GROUP_PAGE_DIRTY; +} + +bool mm_iommu_is_devmem(struct mm_struct *mm, unsigned long hpa, + unsigned int pageshift, unsigned long *size) +{ + struct mm_iommu_table_group_mem_t *mem; + unsigned long end; + + list_for_each_entry_rcu(mem, 
&mm->context.iommu_group_mem_list, next) { + if (mem->dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) + continue; + + end = mem->dev_hpa + (mem->entries << PAGE_SHIFT); + if ((mem->dev_hpa <= hpa) && (hpa < end)) { + /* + * Since the IOMMU page size might be bigger than + * PAGE_SIZE, the amount of preregistered memory + * starting from @hpa might be smaller than 1<mapped)) + return 0; + + /* Last mm_iommu_put() has been called, no more mappings allowed() */ + return -ENXIO; +} +EXPORT_SYMBOL_GPL(mm_iommu_mapped_inc); + +void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem) +{ + atomic64_add_unless(&mem->mapped, -1, 1); +} +EXPORT_SYMBOL_GPL(mm_iommu_mapped_dec); + +void mm_iommu_init(struct mm_struct *mm) +{ + INIT_LIST_HEAD_RCU(&mm->context.iommu_group_mem_list); +} diff --git a/arch/powerpc/mm/book3s64/mmu_context.c b/arch/powerpc/mm/book3s64/mmu_context.c new file mode 100644 index 000000000000..cb2b08635508 --- /dev/null +++ b/arch/powerpc/mm/book3s64/mmu_context.c @@ -0,0 +1,263 @@ +/* + * MMU context allocation for 64-bit kernels. + * + * Copyright (C) 2004 Anton Blanchard, IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +static DEFINE_IDA(mmu_context_ida); + +static int alloc_context_id(int min_id, int max_id) +{ + return ida_alloc_range(&mmu_context_ida, min_id, max_id, GFP_KERNEL); +} + +void hash__reserve_context_id(int id) +{ + int result = ida_alloc_range(&mmu_context_ida, id, id, GFP_KERNEL); + + WARN(result != id, "mmu: Failed to reserve context id %d (rc %d)\n", id, result); +} + +int hash__alloc_context_id(void) +{ + unsigned long max; + + if (mmu_has_feature(MMU_FTR_68_BIT_VA)) + max = MAX_USER_CONTEXT; + else + max = MAX_USER_CONTEXT_65BIT_VA; + + return alloc_context_id(MIN_USER_CONTEXT, max); +} +EXPORT_SYMBOL_GPL(hash__alloc_context_id); + +void slb_setup_new_exec(void); + +static int hash__init_new_context(struct mm_struct *mm) +{ + int index; + + index = hash__alloc_context_id(); + if (index < 0) + return index; + + mm->context.hash_context = kmalloc(sizeof(struct hash_mm_context), + GFP_KERNEL); + if (!mm->context.hash_context) { + ida_free(&mmu_context_ida, index); + return -ENOMEM; + } + + /* + * The old code would re-promote on fork, we don't do that when using + * slices as it could cause problem promoting slices that have been + * forced down to 4K. + * + * For book3s we have MMU_NO_CONTEXT set to be ~0. Hence check + * explicitly against context.id == 0. This ensures that we properly + * initialize context slice details for newly allocated mm's (which will + * have id == 0) and don't alter context slice inherited via fork (which + * will have id != 0). + * + * We should not be calling init_new_context() on init_mm. Hence a + * check against 0 is OK. + */ + if (mm->context.id == 0) { + memset(mm->context.hash_context, 0, sizeof(struct hash_mm_context)); + slice_init_new_context_exec(mm); + } else { + /* This is fork. Copy hash_context details from current->mm */ + memcpy(mm->context.hash_context, current->mm->context.hash_context, sizeof(struct hash_mm_context)); +#ifdef CONFIG_PPC_SUBPAGE_PROT + /* inherit subpage prot detalis if we have one. 
*/ + if (current->mm->context.hash_context->spt) { + mm->context.hash_context->spt = kmalloc(sizeof(struct subpage_prot_table), + GFP_KERNEL); + if (!mm->context.hash_context->spt) { + ida_free(&mmu_context_ida, index); + kfree(mm->context.hash_context); + return -ENOMEM; + } + } +#endif + + } + + pkey_mm_init(mm); + return index; +} + +void hash__setup_new_exec(void) +{ + slice_setup_new_exec(); + + slb_setup_new_exec(); +} + +static int radix__init_new_context(struct mm_struct *mm) +{ + unsigned long rts_field; + int index, max_id; + + max_id = (1 << mmu_pid_bits) - 1; + index = alloc_context_id(mmu_base_pid, max_id); + if (index < 0) + return index; + + /* + * set the process table entry, + */ + rts_field = radix__get_tree_size(); + process_tb[index].prtb0 = cpu_to_be64(rts_field | __pa(mm->pgd) | RADIX_PGD_INDEX_SIZE); + + /* + * Order the above store with subsequent update of the PID + * register (at which point HW can start loading/caching + * the entry) and the corresponding load by the MMU from + * the L2 cache. + */ + asm volatile("ptesync;isync" : : : "memory"); + + mm->context.npu_context = NULL; + mm->context.hash_context = NULL; + + return index; +} + +int init_new_context(struct task_struct *tsk, struct mm_struct *mm) +{ + int index; + + if (radix_enabled()) + index = radix__init_new_context(mm); + else + index = hash__init_new_context(mm); + + if (index < 0) + return index; + + mm->context.id = index; + + mm->context.pte_frag = NULL; + mm->context.pmd_frag = NULL; +#ifdef CONFIG_SPAPR_TCE_IOMMU + mm_iommu_init(mm); +#endif + atomic_set(&mm->context.active_cpus, 0); + atomic_set(&mm->context.copros, 0); + + return 0; +} + +void __destroy_context(int context_id) +{ + ida_free(&mmu_context_ida, context_id); +} +EXPORT_SYMBOL_GPL(__destroy_context); + +static void destroy_contexts(mm_context_t *ctx) +{ + int index, context_id; + + for (index = 0; index < ARRAY_SIZE(ctx->extended_id); index++) { + context_id = ctx->extended_id[index]; + if (context_id) + ida_free(&mmu_context_ida, context_id); + } + kfree(ctx->hash_context); +} + +static void pmd_frag_destroy(void *pmd_frag) +{ + int count; + struct page *page; + + page = virt_to_page(pmd_frag); + /* drop all the pending references */ + count = ((unsigned long)pmd_frag & ~PAGE_MASK) >> PMD_FRAG_SIZE_SHIFT; + /* We allow PTE_FRAG_NR fragments from a PTE page */ + if (atomic_sub_and_test(PMD_FRAG_NR - count, &page->pt_frag_refcount)) { + pgtable_pmd_page_dtor(page); + __free_page(page); + } +} + +static void destroy_pagetable_cache(struct mm_struct *mm) +{ + void *frag; + + frag = mm->context.pte_frag; + if (frag) + pte_frag_destroy(frag); + + frag = mm->context.pmd_frag; + if (frag) + pmd_frag_destroy(frag); + return; +} + +void destroy_context(struct mm_struct *mm) +{ +#ifdef CONFIG_SPAPR_TCE_IOMMU + WARN_ON_ONCE(!list_empty(&mm->context.iommu_group_mem_list)); +#endif + if (radix_enabled()) + WARN_ON(process_tb[mm->context.id].prtb0 != 0); + else + subpage_prot_free(mm); + destroy_contexts(&mm->context); + mm->context.id = MMU_NO_CONTEXT; +} + +void arch_exit_mmap(struct mm_struct *mm) +{ + destroy_pagetable_cache(mm); + + if (radix_enabled()) { + /* + * Radix doesn't have a valid bit in the process table + * entries. However we know that at least P9 implementation + * will avoid caching an entry with an invalid RTS field, + * and 0 is invalid. So this will do. + * + * This runs before the "fullmm" tlb flush in exit_mmap, + * which does a RIC=2 tlbie to clear the process table + * entry. See the "fullmm" comments in tlb-radix.c. 
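+		 * (now book3s64/radix_tlb.c)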
+ * + * No barrier required here after the store because + * this process will do the invalidate, which starts with + * ptesync. + */ + process_tb[mm->context.id].prtb0 = 0; + } +} + +#ifdef CONFIG_PPC_RADIX_MMU +void radix__switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) +{ + mtspr(SPRN_PID, next->context.id); + isync(); +} +#endif diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c new file mode 100644 index 000000000000..16bda049187a --- /dev/null +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -0,0 +1,449 @@ +/* + * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +unsigned long __pmd_frag_nr; +EXPORT_SYMBOL(__pmd_frag_nr); +unsigned long __pmd_frag_size_shift; +EXPORT_SYMBOL(__pmd_frag_size_shift); + +int (*register_process_table)(unsigned long base, unsigned long page_size, + unsigned long tbl_size); + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +/* + * This is called when relaxing access to a hugepage. It's also called in the page + * fault path when we don't hit any of the major fault cases, ie, a minor + * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have + * handled those two for us, we additionally deal with missing execute + * permission here on some processors + */ +int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp, pmd_t entry, int dirty) +{ + int changed; +#ifdef CONFIG_DEBUG_VM + WARN_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp)); + assert_spin_locked(pmd_lockptr(vma->vm_mm, pmdp)); +#endif + changed = !pmd_same(*(pmdp), entry); + if (changed) { + /* + * We can use MMU_PAGE_2M here, because only radix + * path look at the psize. + */ + __ptep_set_access_flags(vma, pmdp_ptep(pmdp), + pmd_pte(entry), address, MMU_PAGE_2M); + } + return changed; +} + +int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp); +} +/* + * set a new huge pmd. We should not be called for updating + * an existing pmd entry. That should go via pmd_hugepage_update. + */ +void set_pmd_at(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmd) +{ +#ifdef CONFIG_DEBUG_VM + /* + * Make sure hardware valid bit is not set. We don't do + * tlb flush for this update. + */ + + WARN_ON(pte_hw_valid(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp))); + assert_spin_locked(pmd_lockptr(mm, pmdp)); + WARN_ON(!(pmd_large(pmd) || pmd_devmap(pmd))); +#endif + trace_hugepage_set_pmd(addr, pmd_val(pmd)); + return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd)); +} + +static void do_nothing(void *unused) +{ + +} +/* + * Serialize against find_current_mm_pte which does lock-less + * lookup in page tables with local interrupts disabled. For huge pages + * it casts pmd_t to pte_t. Since format of pte_t is different from + * pmd_t we want to prevent transit from pmd pointing to page table + * to pmd pointing to huge page (and back) while interrupts are disabled. + * We clear pmd to possibly replace it with page table pointer in + * different code paths. So make sure we wait for the parallel + * find_current_mm_pte to finish. 
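+ * We do that by sending a no-op IPI (do_nothing) to the other CPUs in
+ * the mm's cpumask and waiting for it: the lockless walkers run with
+ * interrupts disabled, so once the IPI is handled they are done.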
+ */ +void serialize_against_pte_lookup(struct mm_struct *mm) +{ + smp_mb(); + smp_call_function_many(mm_cpumask(mm), do_nothing, NULL, 1); +} + +/* + * We use this to invalidate a pmdp entry before switching from a + * hugepte to regular pmd entry. + */ +pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ + unsigned long old_pmd; + + old_pmd = pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, _PAGE_INVALID); + flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + /* + * This ensures that generic code that rely on IRQ disabling + * to prevent a parallel THP split work as expected. + */ + serialize_against_pte_lookup(vma->vm_mm); + return __pmd(old_pmd); +} + +static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot) +{ + return __pmd(pmd_val(pmd) | pgprot_val(pgprot)); +} + +pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot) +{ + unsigned long pmdv; + + pmdv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK; + return pmd_set_protbits(__pmd(pmdv), pgprot); +} + +pmd_t mk_pmd(struct page *page, pgprot_t pgprot) +{ + return pfn_pmd(page_to_pfn(page), pgprot); +} + +pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) +{ + unsigned long pmdv; + + pmdv = pmd_val(pmd); + pmdv &= _HPAGE_CHG_MASK; + return pmd_set_protbits(__pmd(pmdv), newprot); +} + +/* + * This is called at the end of handling a user page fault, when the + * fault has been handled by updating a HUGE PMD entry in the linux page tables. + * We use it to preload an HPTE into the hash table corresponding to + * the updated linux HUGE PMD entry. + */ +void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd) +{ + if (radix_enabled()) + prefetch((void *)addr); +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +/* For use by kexec */ +void mmu_cleanup_all(void) +{ + if (radix_enabled()) + radix__mmu_cleanup_all(); + else if (mmu_hash_ops.hpte_clear_all) + mmu_hash_ops.hpte_clear_all(); +} + +#ifdef CONFIG_MEMORY_HOTPLUG +int __meminit create_section_mapping(unsigned long start, unsigned long end, int nid) +{ + if (radix_enabled()) + return radix__create_section_mapping(start, end, nid); + + return hash__create_section_mapping(start, end, nid); +} + +int __meminit remove_section_mapping(unsigned long start, unsigned long end) +{ + if (radix_enabled()) + return radix__remove_section_mapping(start, end); + + return hash__remove_section_mapping(start, end); +} +#endif /* CONFIG_MEMORY_HOTPLUG */ + +void __init mmu_partition_table_init(void) +{ + unsigned long patb_size = 1UL << PATB_SIZE_SHIFT; + unsigned long ptcr; + + BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 36), "Partition table size too large."); + /* Initialize the Partition Table with no entries */ + partition_tb = memblock_alloc(patb_size, patb_size); + if (!partition_tb) + panic("%s: Failed to allocate %lu bytes align=0x%lx\n", + __func__, patb_size, patb_size); + + /* + * update partition table control register, + * 64 K size. + */ + ptcr = __pa(partition_tb) | (PATB_SIZE_SHIFT - 12); + mtspr(SPRN_PTCR, ptcr); + powernv_set_nmmu_ptcr(ptcr); +} + +void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0, + unsigned long dw1) +{ + unsigned long old = be64_to_cpu(partition_tb[lpid].patb0); + + partition_tb[lpid].patb0 = cpu_to_be64(dw0); + partition_tb[lpid].patb1 = cpu_to_be64(dw1); + + /* + * Global flush of TLBs and partition table caches for this lpid. + * The type of flush (hash or radix) depends on what the previous + * use of this partition ID was, not the new use. 
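+	 * That is why the code below keys off the old patb0 value (PATB_HR)
+	 * saved above rather than off the new dw0.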
+ */ + asm volatile("ptesync" : : : "memory"); + if (old & PATB_HR) { + asm volatile(PPC_TLBIE_5(%0,%1,2,0,1) : : + "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); + asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : : + "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); + trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 1); + } else { + asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : : + "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); + trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 0); + } + /* do we need fixup here ?*/ + asm volatile("eieio; tlbsync; ptesync" : : : "memory"); +} +EXPORT_SYMBOL_GPL(mmu_partition_table_set_entry); + +static pmd_t *get_pmd_from_cache(struct mm_struct *mm) +{ + void *pmd_frag, *ret; + + if (PMD_FRAG_NR == 1) + return NULL; + + spin_lock(&mm->page_table_lock); + ret = mm->context.pmd_frag; + if (ret) { + pmd_frag = ret + PMD_FRAG_SIZE; + /* + * If we have taken up all the fragments mark PTE page NULL + */ + if (((unsigned long)pmd_frag & ~PAGE_MASK) == 0) + pmd_frag = NULL; + mm->context.pmd_frag = pmd_frag; + } + spin_unlock(&mm->page_table_lock); + return (pmd_t *)ret; +} + +static pmd_t *__alloc_for_pmdcache(struct mm_struct *mm) +{ + void *ret = NULL; + struct page *page; + gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO; + + if (mm == &init_mm) + gfp &= ~__GFP_ACCOUNT; + page = alloc_page(gfp); + if (!page) + return NULL; + if (!pgtable_pmd_page_ctor(page)) { + __free_pages(page, 0); + return NULL; + } + + atomic_set(&page->pt_frag_refcount, 1); + + ret = page_address(page); + /* + * if we support only one fragment just return the + * allocated page. + */ + if (PMD_FRAG_NR == 1) + return ret; + + spin_lock(&mm->page_table_lock); + /* + * If we find pgtable_page set, we return + * the allocated page with single fragement + * count. + */ + if (likely(!mm->context.pmd_frag)) { + atomic_set(&page->pt_frag_refcount, PMD_FRAG_NR); + mm->context.pmd_frag = ret + PMD_FRAG_SIZE; + } + spin_unlock(&mm->page_table_lock); + + return (pmd_t *)ret; +} + +pmd_t *pmd_fragment_alloc(struct mm_struct *mm, unsigned long vmaddr) +{ + pmd_t *pmd; + + pmd = get_pmd_from_cache(mm); + if (pmd) + return pmd; + + return __alloc_for_pmdcache(mm); +} + +void pmd_fragment_free(unsigned long *pmd) +{ + struct page *page = virt_to_page(pmd); + + BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0); + if (atomic_dec_and_test(&page->pt_frag_refcount)) { + pgtable_pmd_page_dtor(page); + __free_page(page); + } +} + +static inline void pgtable_free(void *table, int index) +{ + switch (index) { + case PTE_INDEX: + pte_fragment_free(table, 0); + break; + case PMD_INDEX: + pmd_fragment_free(table); + break; + case PUD_INDEX: + kmem_cache_free(PGT_CACHE(PUD_CACHE_INDEX), table); + break; +#if defined(CONFIG_PPC_4K_PAGES) && defined(CONFIG_HUGETLB_PAGE) + /* 16M hugepd directory at pud level */ + case HTLB_16M_INDEX: + BUILD_BUG_ON(H_16M_CACHE_INDEX <= 0); + kmem_cache_free(PGT_CACHE(H_16M_CACHE_INDEX), table); + break; + /* 16G hugepd directory at the pgd level */ + case HTLB_16G_INDEX: + BUILD_BUG_ON(H_16G_CACHE_INDEX <= 0); + kmem_cache_free(PGT_CACHE(H_16G_CACHE_INDEX), table); + break; +#endif + /* We don't free pgd table via RCU callback */ + default: + BUG(); + } +} + +#ifdef CONFIG_SMP +void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int index) +{ + unsigned long pgf = (unsigned long)table; + + BUG_ON(index > MAX_PGTABLE_INDEX_SIZE); + pgf |= index; + tlb_remove_table(tlb, (void *)pgf); +} + +void __tlb_remove_table(void *_table) +{ + void *table = (void *)((unsigned long)_table & 
~MAX_PGTABLE_INDEX_SIZE); + unsigned int index = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE; + + return pgtable_free(table, index); +} +#else +void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int index) +{ + return pgtable_free(table, index); +} +#endif + +#ifdef CONFIG_PROC_FS +atomic_long_t direct_pages_count[MMU_PAGE_COUNT]; + +void arch_report_meminfo(struct seq_file *m) +{ + /* + * Hash maps the memory with one size mmu_linear_psize. + * So don't bother to print these on hash + */ + if (!radix_enabled()) + return; + seq_printf(m, "DirectMap4k: %8lu kB\n", + atomic_long_read(&direct_pages_count[MMU_PAGE_4K]) << 2); + seq_printf(m, "DirectMap64k: %8lu kB\n", + atomic_long_read(&direct_pages_count[MMU_PAGE_64K]) << 6); + seq_printf(m, "DirectMap2M: %8lu kB\n", + atomic_long_read(&direct_pages_count[MMU_PAGE_2M]) << 11); + seq_printf(m, "DirectMap1G: %8lu kB\n", + atomic_long_read(&direct_pages_count[MMU_PAGE_1G]) << 20); +} +#endif /* CONFIG_PROC_FS */ + +pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, + pte_t *ptep) +{ + unsigned long pte_val; + + /* + * Clear the _PAGE_PRESENT so that no hardware parallel update is + * possible. Also keep the pte_present true so that we don't take + * wrong fault. + */ + pte_val = pte_update(vma->vm_mm, addr, ptep, _PAGE_PRESENT, _PAGE_INVALID, 0); + + return __pte(pte_val); + +} + +void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, + pte_t *ptep, pte_t old_pte, pte_t pte) +{ + if (radix_enabled()) + return radix__ptep_modify_prot_commit(vma, addr, + ptep, old_pte, pte); + set_pte_at(vma->vm_mm, addr, ptep, pte); +} + +/* + * For hash translation mode, we use the deposited table to store hash slot + * information and they are stored at PTRS_PER_PMD offset from related pmd + * location. Hence a pmd move requires deposit and withdraw. + * + * For radix translation with split pmd ptl, we store the deposited table in the + * pmd page. Hence if we have different pmd page we need to withdraw during pmd + * move. + * + * With hash we use deposited table always irrespective of anon or not. + * With radix we use deposited table only for anonymous mapping. + */ +int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl, + struct spinlock *old_pmd_ptl, + struct vm_area_struct *vma) +{ + if (radix_enabled()) + return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma); + + return true; +} diff --git a/arch/powerpc/mm/book3s64/pkeys.c b/arch/powerpc/mm/book3s64/pkeys.c new file mode 100644 index 000000000000..ae7fca40e5b3 --- /dev/null +++ b/arch/powerpc/mm/book3s64/pkeys.c @@ -0,0 +1,428 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * PowerPC Memory Protection Keys management + * + * Copyright 2017, Ram Pai, IBM Corporation. 
+ */ + +#include +#include +#include +#include +#include +#include + +DEFINE_STATIC_KEY_TRUE(pkey_disabled); +int pkeys_total; /* Total pkeys as per device tree */ +u32 initial_allocation_mask; /* Bits set for the initially allocated keys */ +u32 reserved_allocation_mask; /* Bits set for reserved keys */ +static bool pkey_execute_disable_supported; +static bool pkeys_devtree_defined; /* property exported by device tree */ +static u64 pkey_amr_mask; /* Bits in AMR not to be touched */ +static u64 pkey_iamr_mask; /* Bits in AMR not to be touched */ +static u64 pkey_uamor_mask; /* Bits in UMOR not to be touched */ +static int execute_only_key = 2; + +#define AMR_BITS_PER_PKEY 2 +#define AMR_RD_BIT 0x1UL +#define AMR_WR_BIT 0x2UL +#define IAMR_EX_BIT 0x1UL +#define PKEY_REG_BITS (sizeof(u64)*8) +#define pkeyshift(pkey) (PKEY_REG_BITS - ((pkey+1) * AMR_BITS_PER_PKEY)) + +static void scan_pkey_feature(void) +{ + u32 vals[2]; + struct device_node *cpu; + + cpu = of_find_node_by_type(NULL, "cpu"); + if (!cpu) + return; + + if (of_property_read_u32_array(cpu, + "ibm,processor-storage-keys", vals, 2)) + return; + + /* + * Since any pkey can be used for data or execute, we will just treat + * all keys as equal and track them as one entity. + */ + pkeys_total = vals[0]; + pkeys_devtree_defined = true; +} + +static inline bool pkey_mmu_enabled(void) +{ + if (firmware_has_feature(FW_FEATURE_LPAR)) + return pkeys_total; + else + return cpu_has_feature(CPU_FTR_PKEY); +} + +static int pkey_initialize(void) +{ + int os_reserved, i; + + /* + * We define PKEY_DISABLE_EXECUTE in addition to the arch-neutral + * generic defines for PKEY_DISABLE_ACCESS and PKEY_DISABLE_WRITE. + * Ensure that the bits a distinct. + */ + BUILD_BUG_ON(PKEY_DISABLE_EXECUTE & + (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); + + /* + * pkey_to_vmflag_bits() assumes that the pkey bits are contiguous + * in the vmaflag. Make sure that is really the case. + */ + BUILD_BUG_ON(__builtin_clzl(ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT) + + __builtin_popcountl(ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT) + != (sizeof(u64) * BITS_PER_BYTE)); + + /* scan the device tree for pkey feature */ + scan_pkey_feature(); + + /* + * Let's assume 32 pkeys on P8 bare metal, if its not defined by device + * tree. We make this exception since skiboot forgot to expose this + * property on power8. + */ + if (!pkeys_devtree_defined && !firmware_has_feature(FW_FEATURE_LPAR) && + cpu_has_feature(CPU_FTRS_POWER8)) + pkeys_total = 32; + + /* + * Adjust the upper limit, based on the number of bits supported by + * arch-neutral code. + */ + pkeys_total = min_t(int, pkeys_total, + ((ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT)+1)); + + if (!pkey_mmu_enabled() || radix_enabled() || !pkeys_total) + static_branch_enable(&pkey_disabled); + else + static_branch_disable(&pkey_disabled); + + if (static_branch_likely(&pkey_disabled)) + return 0; + + /* + * The device tree cannot be relied to indicate support for + * execute_disable support. Instead we use a PVR check. + */ + if (pvr_version_is(PVR_POWER7) || pvr_version_is(PVR_POWER7p)) + pkey_execute_disable_supported = false; + else + pkey_execute_disable_supported = true; + +#ifdef CONFIG_PPC_4K_PAGES + /* + * The OS can manage only 8 pkeys due to its inability to represent them + * in the Linux 4K PTE. + */ + os_reserved = pkeys_total - 8; +#else + os_reserved = 0; +#endif + /* Bits are in LE format. 
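+	 * pkey 0 is bit 0 of the allocation masks, pkey 1 is bit 1 and so
+	 * on, unlike the AMR/IAMR/UAMOR masks below which are built with
+	 * pkeyshift().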
*/ + reserved_allocation_mask = (0x1 << 1) | (0x1 << execute_only_key); + + /* register mask is in BE format */ + pkey_amr_mask = ~0x0ul; + pkey_amr_mask &= ~(0x3ul << pkeyshift(0)); + + pkey_iamr_mask = ~0x0ul; + pkey_iamr_mask &= ~(0x3ul << pkeyshift(0)); + pkey_iamr_mask &= ~(0x3ul << pkeyshift(execute_only_key)); + + pkey_uamor_mask = ~0x0ul; + pkey_uamor_mask &= ~(0x3ul << pkeyshift(0)); + pkey_uamor_mask &= ~(0x3ul << pkeyshift(execute_only_key)); + + /* mark the rest of the keys as reserved and hence unavailable */ + for (i = (pkeys_total - os_reserved); i < pkeys_total; i++) { + reserved_allocation_mask |= (0x1 << i); + pkey_uamor_mask &= ~(0x3ul << pkeyshift(i)); + } + initial_allocation_mask = reserved_allocation_mask | (0x1 << 0); + + if (unlikely((pkeys_total - os_reserved) <= execute_only_key)) { + /* + * Insufficient number of keys to support + * execute only key. Mark it unavailable. + * Any AMR, UAMOR, IAMR bit set for + * this key is irrelevant since this key + * can never be allocated. + */ + execute_only_key = -1; + } + + return 0; +} + +arch_initcall(pkey_initialize); + +void pkey_mm_init(struct mm_struct *mm) +{ + if (static_branch_likely(&pkey_disabled)) + return; + mm_pkey_allocation_map(mm) = initial_allocation_mask; + mm->context.execute_only_pkey = execute_only_key; +} + +static inline u64 read_amr(void) +{ + return mfspr(SPRN_AMR); +} + +static inline void write_amr(u64 value) +{ + mtspr(SPRN_AMR, value); +} + +static inline u64 read_iamr(void) +{ + if (!likely(pkey_execute_disable_supported)) + return 0x0UL; + + return mfspr(SPRN_IAMR); +} + +static inline void write_iamr(u64 value) +{ + if (!likely(pkey_execute_disable_supported)) + return; + + mtspr(SPRN_IAMR, value); +} + +static inline u64 read_uamor(void) +{ + return mfspr(SPRN_UAMOR); +} + +static inline void write_uamor(u64 value) +{ + mtspr(SPRN_UAMOR, value); +} + +static bool is_pkey_enabled(int pkey) +{ + u64 uamor = read_uamor(); + u64 pkey_bits = 0x3ul << pkeyshift(pkey); + u64 uamor_pkey_bits = (uamor & pkey_bits); + + /* + * Both the bits in UAMOR corresponding to the key should be set or + * reset. + */ + WARN_ON(uamor_pkey_bits && (uamor_pkey_bits != pkey_bits)); + return !!(uamor_pkey_bits); +} + +static inline void init_amr(int pkey, u8 init_bits) +{ + u64 new_amr_bits = (((u64)init_bits & 0x3UL) << pkeyshift(pkey)); + u64 old_amr = read_amr() & ~((u64)(0x3ul) << pkeyshift(pkey)); + + write_amr(old_amr | new_amr_bits); +} + +static inline void init_iamr(int pkey, u8 init_bits) +{ + u64 new_iamr_bits = (((u64)init_bits & 0x1UL) << pkeyshift(pkey)); + u64 old_iamr = read_iamr() & ~((u64)(0x1ul) << pkeyshift(pkey)); + + write_iamr(old_iamr | new_iamr_bits); +} + +/* + * Set the access rights in AMR IAMR and UAMOR registers for @pkey to that + * specified in @init_val. 
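+ * Returns -EINVAL if the key is not enabled in UAMOR, or if
+ * PKEY_DISABLE_EXECUTE is requested but execute-disable is not
+ * supported on this CPU.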
+ */ +int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey, + unsigned long init_val) +{ + u64 new_amr_bits = 0x0ul; + u64 new_iamr_bits = 0x0ul; + + if (!is_pkey_enabled(pkey)) + return -EINVAL; + + if (init_val & PKEY_DISABLE_EXECUTE) { + if (!pkey_execute_disable_supported) + return -EINVAL; + new_iamr_bits |= IAMR_EX_BIT; + } + init_iamr(pkey, new_iamr_bits); + + /* Set the bits we need in AMR: */ + if (init_val & PKEY_DISABLE_ACCESS) + new_amr_bits |= AMR_RD_BIT | AMR_WR_BIT; + else if (init_val & PKEY_DISABLE_WRITE) + new_amr_bits |= AMR_WR_BIT; + + init_amr(pkey, new_amr_bits); + return 0; +} + +void thread_pkey_regs_save(struct thread_struct *thread) +{ + if (static_branch_likely(&pkey_disabled)) + return; + + /* + * TODO: Skip saving registers if @thread hasn't used any keys yet. + */ + thread->amr = read_amr(); + thread->iamr = read_iamr(); + thread->uamor = read_uamor(); +} + +void thread_pkey_regs_restore(struct thread_struct *new_thread, + struct thread_struct *old_thread) +{ + if (static_branch_likely(&pkey_disabled)) + return; + + if (old_thread->amr != new_thread->amr) + write_amr(new_thread->amr); + if (old_thread->iamr != new_thread->iamr) + write_iamr(new_thread->iamr); + if (old_thread->uamor != new_thread->uamor) + write_uamor(new_thread->uamor); +} + +void thread_pkey_regs_init(struct thread_struct *thread) +{ + if (static_branch_likely(&pkey_disabled)) + return; + + thread->amr = pkey_amr_mask; + thread->iamr = pkey_iamr_mask; + thread->uamor = pkey_uamor_mask; + + write_uamor(pkey_uamor_mask); + write_amr(pkey_amr_mask); + write_iamr(pkey_iamr_mask); +} + +static inline bool pkey_allows_readwrite(int pkey) +{ + int pkey_shift = pkeyshift(pkey); + + if (!is_pkey_enabled(pkey)) + return true; + + return !(read_amr() & ((AMR_RD_BIT|AMR_WR_BIT) << pkey_shift)); +} + +int __execute_only_pkey(struct mm_struct *mm) +{ + return mm->context.execute_only_pkey; +} + +static inline bool vma_is_pkey_exec_only(struct vm_area_struct *vma) +{ + /* Do this check first since the vm_flags should be hot */ + if ((vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) != VM_EXEC) + return false; + + return (vma_pkey(vma) == vma->vm_mm->context.execute_only_pkey); +} + +/* + * This should only be called for *plain* mprotect calls. + */ +int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, + int pkey) +{ + /* + * If the currently associated pkey is execute-only, but the requested + * protection is not execute-only, move it back to the default pkey. + */ + if (vma_is_pkey_exec_only(vma) && (prot != PROT_EXEC)) + return 0; + + /* + * The requested protection is execute-only. Hence let's use an + * execute-only pkey. + */ + if (prot == PROT_EXEC) { + pkey = execute_only_pkey(vma->vm_mm); + if (pkey > 0) + return pkey; + } + + /* Nothing to override. 
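+	 * Keep whatever key is already associated with the VMA.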
*/ + return vma_pkey(vma); +} + +static bool pkey_access_permitted(int pkey, bool write, bool execute) +{ + int pkey_shift; + u64 amr; + + if (!is_pkey_enabled(pkey)) + return true; + + pkey_shift = pkeyshift(pkey); + if (execute && !(read_iamr() & (IAMR_EX_BIT << pkey_shift))) + return true; + + amr = read_amr(); /* Delay reading amr until absolutely needed */ + return ((!write && !(amr & (AMR_RD_BIT << pkey_shift))) || + (write && !(amr & (AMR_WR_BIT << pkey_shift)))); +} + +bool arch_pte_access_permitted(u64 pte, bool write, bool execute) +{ + if (static_branch_likely(&pkey_disabled)) + return true; + + return pkey_access_permitted(pte_to_pkey_bits(pte), write, execute); +} + +/* + * We only want to enforce protection keys on the current thread because we + * effectively have no access to AMR/IAMR for other threads or any way to tell + * which AMR/IAMR in a threaded process we could use. + * + * So do not enforce things if the VMA is not from the current mm, or if we are + * in a kernel thread. + */ +static inline bool vma_is_foreign(struct vm_area_struct *vma) +{ + if (!current->mm) + return true; + + /* if it is not our ->mm, it has to be foreign */ + if (current->mm != vma->vm_mm) + return true; + + return false; +} + +bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write, + bool execute, bool foreign) +{ + if (static_branch_likely(&pkey_disabled)) + return true; + /* + * Do not enforce our key-permissions on a foreign vma. + */ + if (foreign || vma_is_foreign(vma)) + return true; + + return pkey_access_permitted(vma_pkey(vma), write, execute); +} + +void arch_dup_pkeys(struct mm_struct *oldmm, struct mm_struct *mm) +{ + if (static_branch_likely(&pkey_disabled)) + return; + + /* Duplicate the oldmm pkey state in mm: */ + mm_pkey_allocation_map(mm) = mm_pkey_allocation_map(oldmm); + mm->context.execute_only_pkey = oldmm->context.execute_only_pkey; +} diff --git a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c new file mode 100644 index 000000000000..cab06331c0c0 --- /dev/null +++ b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c @@ -0,0 +1,110 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include + +void radix__flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +{ + int psize; + struct hstate *hstate = hstate_file(vma->vm_file); + + psize = hstate_get_psize(hstate); + radix__flush_tlb_page_psize(vma->vm_mm, vmaddr, psize); +} + +void radix__local_flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +{ + int psize; + struct hstate *hstate = hstate_file(vma->vm_file); + + psize = hstate_get_psize(hstate); + radix__local_flush_tlb_page_psize(vma->vm_mm, vmaddr, psize); +} + +void radix__flush_hugetlb_tlb_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end) +{ + int psize; + struct hstate *hstate = hstate_file(vma->vm_file); + + psize = hstate_get_psize(hstate); + radix__flush_tlb_range_psize(vma->vm_mm, start, end, psize); +} + +/* + * A vairant of hugetlb_get_unmapped_area doing topdown search + * FIXME!! should we do as x86 does or non hugetlb area does ? + * ie, use topdown or not based on mmap_is_legacy check ? 
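+ * For now the search is always topdown and is bounded by
+ * DEFAULT_MAP_WINDOW unless the hint address or MAP_FIXED range lies
+ * above it.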
+ */ +unsigned long +radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + struct hstate *h = hstate_file(file); + int fixed = (flags & MAP_FIXED); + unsigned long high_limit; + struct vm_unmapped_area_info info; + + high_limit = DEFAULT_MAP_WINDOW; + if (addr >= high_limit || (fixed && (addr + len > high_limit))) + high_limit = TASK_SIZE; + + if (len & ~huge_page_mask(h)) + return -EINVAL; + if (len > high_limit) + return -ENOMEM; + + if (fixed) { + if (addr > high_limit - len) + return -ENOMEM; + if (prepare_hugepage_range(file, addr, len)) + return -EINVAL; + return addr; + } + + if (addr) { + addr = ALIGN(addr, huge_page_size(h)); + vma = find_vma(mm, addr); + if (high_limit - len >= addr && addr >= mmap_min_addr && + (!vma || addr + len <= vm_start_gap(vma))) + return addr; + } + /* + * We are always doing an topdown search here. Slice code + * does that too. + */ + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + info.length = len; + info.low_limit = max(PAGE_SIZE, mmap_min_addr); + info.high_limit = mm->mmap_base + (high_limit - DEFAULT_MAP_WINDOW); + info.align_mask = PAGE_MASK & ~huge_page_mask(h); + info.align_offset = 0; + + return vm_unmapped_area(&info); +} + +void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t old_pte, pte_t pte) +{ + struct mm_struct *mm = vma->vm_mm; + + /* + * To avoid NMMU hang while relaxing access we need to flush the tlb before + * we set the new value. + */ + if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) && + (atomic_read(&mm->context.copros) > 0)) + radix__flush_hugetlb_page(vma, addr); + + set_huge_pte_at(vma->vm_mm, addr, ptep, pte); +} diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c new file mode 100644 index 000000000000..c9bcf428dd2b --- /dev/null +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -0,0 +1,1124 @@ +/* + * Page table handling routines for radix page table. + * + * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#define pr_fmt(fmt) "radix-mmu: " fmt + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +unsigned int mmu_pid_bits; +unsigned int mmu_base_pid; + +static int native_register_process_table(unsigned long base, unsigned long pg_sz, + unsigned long table_size) +{ + unsigned long patb0, patb1; + + patb0 = be64_to_cpu(partition_tb[0].patb0); + patb1 = base | table_size | PATB_GR; + + mmu_partition_table_set_entry(0, patb0, patb1); + + return 0; +} + +static __ref void *early_alloc_pgtable(unsigned long size, int nid, + unsigned long region_start, unsigned long region_end) +{ + phys_addr_t min_addr = MEMBLOCK_LOW_LIMIT; + phys_addr_t max_addr = MEMBLOCK_ALLOC_ANYWHERE; + void *ptr; + + if (region_start) + min_addr = region_start; + if (region_end) + max_addr = region_end; + + ptr = memblock_alloc_try_nid(size, size, min_addr, max_addr, nid); + + if (!ptr) + panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa max_addr=%pa\n", + __func__, size, size, nid, &min_addr, &max_addr); + + return ptr; +} + +static int early_map_kernel_page(unsigned long ea, unsigned long pa, + pgprot_t flags, + unsigned int map_page_size, + int nid, + unsigned long region_start, unsigned long region_end) +{ + unsigned long pfn = pa >> PAGE_SHIFT; + pgd_t *pgdp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + pgdp = pgd_offset_k(ea); + if (pgd_none(*pgdp)) { + pudp = early_alloc_pgtable(PUD_TABLE_SIZE, nid, + region_start, region_end); + pgd_populate(&init_mm, pgdp, pudp); + } + pudp = pud_offset(pgdp, ea); + if (map_page_size == PUD_SIZE) { + ptep = (pte_t *)pudp; + goto set_the_pte; + } + if (pud_none(*pudp)) { + pmdp = early_alloc_pgtable(PMD_TABLE_SIZE, nid, + region_start, region_end); + pud_populate(&init_mm, pudp, pmdp); + } + pmdp = pmd_offset(pudp, ea); + if (map_page_size == PMD_SIZE) { + ptep = pmdp_ptep(pmdp); + goto set_the_pte; + } + if (!pmd_present(*pmdp)) { + ptep = early_alloc_pgtable(PAGE_SIZE, nid, + region_start, region_end); + pmd_populate_kernel(&init_mm, pmdp, ptep); + } + ptep = pte_offset_kernel(pmdp, ea); + +set_the_pte: + set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags)); + smp_wmb(); + return 0; +} + +/* + * nid, region_start, and region_end are hints to try to place the page + * table memory in the same node or region. + */ +static int __map_kernel_page(unsigned long ea, unsigned long pa, + pgprot_t flags, + unsigned int map_page_size, + int nid, + unsigned long region_start, unsigned long region_end) +{ + unsigned long pfn = pa >> PAGE_SHIFT; + pgd_t *pgdp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + /* + * Make sure task size is correct as per the max adddr + */ + BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE); + +#ifdef CONFIG_PPC_64K_PAGES + BUILD_BUG_ON(RADIX_KERN_MAP_SIZE != (1UL << MAX_EA_BITS_PER_CONTEXT)); +#endif + + if (unlikely(!slab_is_available())) + return early_map_kernel_page(ea, pa, flags, map_page_size, + nid, region_start, region_end); + + /* + * Should make page table allocation functions be able to take a + * node, so we can place kernel page tables on the right nodes after + * boot. 
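+	 * Until then the mappings below go through pud_alloc()/pmd_alloc()/
+	 * pte_alloc_kernel(), none of which take a nid.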
+ */ + pgdp = pgd_offset_k(ea); + pudp = pud_alloc(&init_mm, pgdp, ea); + if (!pudp) + return -ENOMEM; + if (map_page_size == PUD_SIZE) { + ptep = (pte_t *)pudp; + goto set_the_pte; + } + pmdp = pmd_alloc(&init_mm, pudp, ea); + if (!pmdp) + return -ENOMEM; + if (map_page_size == PMD_SIZE) { + ptep = pmdp_ptep(pmdp); + goto set_the_pte; + } + ptep = pte_alloc_kernel(pmdp, ea); + if (!ptep) + return -ENOMEM; + +set_the_pte: + set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags)); + smp_wmb(); + return 0; +} + +int radix__map_kernel_page(unsigned long ea, unsigned long pa, + pgprot_t flags, + unsigned int map_page_size) +{ + return __map_kernel_page(ea, pa, flags, map_page_size, -1, 0, 0); +} + +#ifdef CONFIG_STRICT_KERNEL_RWX +void radix__change_memory_range(unsigned long start, unsigned long end, + unsigned long clear) +{ + unsigned long idx; + pgd_t *pgdp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + start = ALIGN_DOWN(start, PAGE_SIZE); + end = PAGE_ALIGN(end); // aligns up + + pr_debug("Changing flags on range %lx-%lx removing 0x%lx\n", + start, end, clear); + + for (idx = start; idx < end; idx += PAGE_SIZE) { + pgdp = pgd_offset_k(idx); + pudp = pud_alloc(&init_mm, pgdp, idx); + if (!pudp) + continue; + if (pud_huge(*pudp)) { + ptep = (pte_t *)pudp; + goto update_the_pte; + } + pmdp = pmd_alloc(&init_mm, pudp, idx); + if (!pmdp) + continue; + if (pmd_huge(*pmdp)) { + ptep = pmdp_ptep(pmdp); + goto update_the_pte; + } + ptep = pte_alloc_kernel(pmdp, idx); + if (!ptep) + continue; +update_the_pte: + radix__pte_update(&init_mm, idx, ptep, clear, 0, 0); + } + + radix__flush_tlb_kernel_range(start, end); +} + +void radix__mark_rodata_ro(void) +{ + unsigned long start, end; + + start = (unsigned long)_stext; + end = (unsigned long)__init_begin; + + radix__change_memory_range(start, end, _PAGE_WRITE); +} + +void radix__mark_initmem_nx(void) +{ + unsigned long start = (unsigned long)__init_begin; + unsigned long end = (unsigned long)__init_end; + + radix__change_memory_range(start, end, _PAGE_EXEC); +} +#endif /* CONFIG_STRICT_KERNEL_RWX */ + +static inline void __meminit +print_mapping(unsigned long start, unsigned long end, unsigned long size, bool exec) +{ + char buf[10]; + + if (end <= start) + return; + + string_get_size(size, 1, STRING_UNITS_2, buf, sizeof(buf)); + + pr_info("Mapped 0x%016lx-0x%016lx with %s pages%s\n", start, end, buf, + exec ? 
" (exec)" : ""); +} + +static unsigned long next_boundary(unsigned long addr, unsigned long end) +{ +#ifdef CONFIG_STRICT_KERNEL_RWX + if (addr < __pa_symbol(__init_begin)) + return __pa_symbol(__init_begin); +#endif + return end; +} + +static int __meminit create_physical_mapping(unsigned long start, + unsigned long end, + int nid) +{ + unsigned long vaddr, addr, mapping_size = 0; + bool prev_exec, exec = false; + pgprot_t prot; + int psize; + + start = _ALIGN_UP(start, PAGE_SIZE); + for (addr = start; addr < end; addr += mapping_size) { + unsigned long gap, previous_size; + int rc; + + gap = next_boundary(addr, end) - addr; + previous_size = mapping_size; + prev_exec = exec; + + if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE && + mmu_psize_defs[MMU_PAGE_1G].shift) { + mapping_size = PUD_SIZE; + psize = MMU_PAGE_1G; + } else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE && + mmu_psize_defs[MMU_PAGE_2M].shift) { + mapping_size = PMD_SIZE; + psize = MMU_PAGE_2M; + } else { + mapping_size = PAGE_SIZE; + psize = mmu_virtual_psize; + } + + vaddr = (unsigned long)__va(addr); + + if (overlaps_kernel_text(vaddr, vaddr + mapping_size) || + overlaps_interrupt_vector_text(vaddr, vaddr + mapping_size)) { + prot = PAGE_KERNEL_X; + exec = true; + } else { + prot = PAGE_KERNEL; + exec = false; + } + + if (mapping_size != previous_size || exec != prev_exec) { + print_mapping(start, addr, previous_size, prev_exec); + start = addr; + } + + rc = __map_kernel_page(vaddr, addr, prot, mapping_size, nid, start, end); + if (rc) + return rc; + + update_page_count(psize, 1); + } + + print_mapping(start, addr, mapping_size, exec); + return 0; +} + +void __init radix_init_pgtable(void) +{ + unsigned long rts_field; + struct memblock_region *reg; + + /* We don't support slb for radix */ + mmu_slb_size = 0; + /* + * Create the linear mapping, using standard page size for now + */ + for_each_memblock(memory, reg) { + /* + * The memblock allocator is up at this point, so the + * page tables will be allocated within the range. No + * need or a node (which we don't have yet). + */ + + if ((reg->base + reg->size) >= RADIX_VMALLOC_START) { + pr_warn("Outside the supported range\n"); + continue; + } + + WARN_ON(create_physical_mapping(reg->base, + reg->base + reg->size, + -1)); + } + + /* Find out how many PID bits are supported */ + if (cpu_has_feature(CPU_FTR_HVMODE)) { + if (!mmu_pid_bits) + mmu_pid_bits = 20; +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + /* + * When KVM is possible, we only use the top half of the + * PID space to avoid collisions between host and guest PIDs + * which can cause problems due to prefetch when exiting the + * guest with AIL=3 + */ + mmu_base_pid = 1 << (mmu_pid_bits - 1); +#else + mmu_base_pid = 1; +#endif + } else { + /* The guest uses the bottom half of the PID space */ + if (!mmu_pid_bits) + mmu_pid_bits = 19; + mmu_base_pid = 1; + } + + /* + * Allocate Partition table and process table for the + * host. + */ + BUG_ON(PRTB_SIZE_SHIFT > 36); + process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT, -1, 0, 0); + /* + * Fill in the process table. + */ + rts_field = radix__get_tree_size(); + process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE); + /* + * Fill in the partition table. We are suppose to use effective address + * of process table here. But our linear mapping also enable us to use + * physical address here. 
+ */ + register_process_table(__pa(process_tb), 0, PRTB_SIZE_SHIFT - 12); + pr_info("Process table %p and radix root for kernel: %p\n", process_tb, init_mm.pgd); + asm volatile("ptesync" : : : "memory"); + asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : : + "r" (TLBIEL_INVAL_SET_LPID), "r" (0)); + asm volatile("eieio; tlbsync; ptesync" : : : "memory"); + trace_tlbie(0, 0, TLBIEL_INVAL_SET_LPID, 0, 2, 1, 1); + + /* + * The init_mm context is given the first available (non-zero) PID, + * which is the "guard PID" and contains no page table. PIDR should + * never be set to zero because that duplicates the kernel address + * space at the 0x0... offset (quadrant 0)! + * + * An arbitrary PID that may later be allocated by the PID allocator + * for userspace processes must not be used either, because that + * would cause stale user mappings for that PID on CPUs outside of + * the TLB invalidation scheme (because it won't be in mm_cpumask). + * + * So permanently carve out one PID for the purpose of a guard PID. + */ + init_mm.context.id = mmu_base_pid; + mmu_base_pid++; +} + +static void __init radix_init_partition_table(void) +{ + unsigned long rts_field, dw0; + + mmu_partition_table_init(); + rts_field = radix__get_tree_size(); + dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR; + mmu_partition_table_set_entry(0, dw0, 0); + + pr_info("Initializing Radix MMU\n"); + pr_info("Partition table %p\n", partition_tb); +} + +void __init radix_init_native(void) +{ + register_process_table = native_register_process_table; +} + +static int __init get_idx_from_shift(unsigned int shift) +{ + int idx = -1; + + switch (shift) { + case 0xc: + idx = MMU_PAGE_4K; + break; + case 0x10: + idx = MMU_PAGE_64K; + break; + case 0x15: + idx = MMU_PAGE_2M; + break; + case 0x1e: + idx = MMU_PAGE_1G; + break; + } + return idx; +} + +static int __init radix_dt_scan_page_sizes(unsigned long node, + const char *uname, int depth, + void *data) +{ + int size = 0; + int shift, idx; + unsigned int ap; + const __be32 *prop; + const char *type = of_get_flat_dt_prop(node, "device_type", NULL); + + /* We are scanning "cpu" nodes only */ + if (type == NULL || strcmp(type, "cpu") != 0) + return 0; + + /* Find MMU PID size */ + prop = of_get_flat_dt_prop(node, "ibm,mmu-pid-bits", &size); + if (prop && size == 4) + mmu_pid_bits = be32_to_cpup(prop); + + /* Grab page size encodings */ + prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size); + if (!prop) + return 0; + + pr_info("Page sizes from device-tree:\n"); + for (; size >= 4; size -= 4, ++prop) { + + struct mmu_psize_def *def; + + /* top 3 bit is AP encoding */ + shift = be32_to_cpu(prop[0]) & ~(0xe << 28); + ap = be32_to_cpu(prop[0]) >> 29; + pr_info("Page size shift = %d AP=0x%x\n", shift, ap); + + idx = get_idx_from_shift(shift); + if (idx < 0) + continue; + + def = &mmu_psize_defs[idx]; + def->shift = shift; + def->ap = ap; + } + + /* needed ? 
*/ + cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B; + return 1; +} + +void __init radix__early_init_devtree(void) +{ + int rc; + + /* + * Try to find the available page sizes in the device-tree + */ + rc = of_scan_flat_dt(radix_dt_scan_page_sizes, NULL); + if (rc != 0) /* Found */ + goto found; + /* + * let's assume we have page 4k and 64k support + */ + mmu_psize_defs[MMU_PAGE_4K].shift = 12; + mmu_psize_defs[MMU_PAGE_4K].ap = 0x0; + + mmu_psize_defs[MMU_PAGE_64K].shift = 16; + mmu_psize_defs[MMU_PAGE_64K].ap = 0x5; +found: +#ifdef CONFIG_SPARSEMEM_VMEMMAP + if (mmu_psize_defs[MMU_PAGE_2M].shift) { + /* + * map vmemmap using 2M if available + */ + mmu_vmemmap_psize = MMU_PAGE_2M; + } +#endif /* CONFIG_SPARSEMEM_VMEMMAP */ + return; +} + +static void radix_init_amor(void) +{ + /* + * In HV mode, we init AMOR (Authority Mask Override Register) so that + * the hypervisor and guest can setup IAMR (Instruction Authority Mask + * Register), enable key 0 and set it to 1. + * + * AMOR = 0b1100 .... 0000 (Mask for key 0 is 11) + */ + mtspr(SPRN_AMOR, (3ul << 62)); +} + +#ifdef CONFIG_PPC_KUEP +void setup_kuep(bool disabled) +{ + if (disabled || !early_radix_enabled()) + return; + + if (smp_processor_id() == boot_cpuid) + pr_info("Activating Kernel Userspace Execution Prevention\n"); + + /* + * Radix always uses key0 of the IAMR to determine if an access is + * allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction + * fetch. + */ + mtspr(SPRN_IAMR, (1ul << 62)); +} +#endif + +#ifdef CONFIG_PPC_KUAP +void setup_kuap(bool disabled) +{ + if (disabled || !early_radix_enabled()) + return; + + if (smp_processor_id() == boot_cpuid) { + pr_info("Activating Kernel Userspace Access Prevention\n"); + cur_cpu_spec->mmu_features |= MMU_FTR_RADIX_KUAP; + } + + /* Make sure userspace can't change the AMR */ + mtspr(SPRN_UAMOR, 0); + mtspr(SPRN_AMR, AMR_KUAP_BLOCKED); + isync(); +} +#endif + +void __init radix__early_init_mmu(void) +{ + unsigned long lpcr; + +#ifdef CONFIG_PPC_64K_PAGES + /* PAGE_SIZE mappings */ + mmu_virtual_psize = MMU_PAGE_64K; +#else + mmu_virtual_psize = MMU_PAGE_4K; +#endif + +#ifdef CONFIG_SPARSEMEM_VMEMMAP + /* vmemmap mapping */ + mmu_vmemmap_psize = mmu_virtual_psize; +#endif + /* + * initialize page table size + */ + __pte_index_size = RADIX_PTE_INDEX_SIZE; + __pmd_index_size = RADIX_PMD_INDEX_SIZE; + __pud_index_size = RADIX_PUD_INDEX_SIZE; + __pgd_index_size = RADIX_PGD_INDEX_SIZE; + __pud_cache_index = RADIX_PUD_INDEX_SIZE; + __pte_table_size = RADIX_PTE_TABLE_SIZE; + __pmd_table_size = RADIX_PMD_TABLE_SIZE; + __pud_table_size = RADIX_PUD_TABLE_SIZE; + __pgd_table_size = RADIX_PGD_TABLE_SIZE; + + __pmd_val_bits = RADIX_PMD_VAL_BITS; + __pud_val_bits = RADIX_PUD_VAL_BITS; + __pgd_val_bits = RADIX_PGD_VAL_BITS; + + __kernel_virt_start = RADIX_KERN_VIRT_START; + __vmalloc_start = RADIX_VMALLOC_START; + __vmalloc_end = RADIX_VMALLOC_END; + __kernel_io_start = RADIX_KERN_IO_START; + __kernel_io_end = RADIX_KERN_IO_END; + vmemmap = (struct page *)RADIX_VMEMMAP_START; + ioremap_bot = IOREMAP_BASE; + +#ifdef CONFIG_PCI + pci_io_base = ISA_IO_BASE; +#endif + __pte_frag_nr = RADIX_PTE_FRAG_NR; + __pte_frag_size_shift = RADIX_PTE_FRAG_SIZE_SHIFT; + __pmd_frag_nr = RADIX_PMD_FRAG_NR; + __pmd_frag_size_shift = RADIX_PMD_FRAG_SIZE_SHIFT; + + if (!firmware_has_feature(FW_FEATURE_LPAR)) { + radix_init_native(); + lpcr = mfspr(SPRN_LPCR); + mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR); + radix_init_partition_table(); + radix_init_amor(); + } else { + radix_init_pseries(); + } + + 
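The constants written by radix_init_amor() and setup_kuep() above (3UL << 62 and 1UL << 62) follow IBM big-endian bit numbering, where bit 0 is the most significant bit of the 64-bit register, so IBM bit n corresponds to 1UL << (63 - n); key 0's two-bit AMOR field at IBM bits 0-1 is therefore 3UL << 62, and IBM bit 1 of the IAMR is 1UL << 62. A minimal standalone sketch of that numbering (plain userspace C, not part of the patch):

#include <stdint.h>
#include <stdio.h>

/* IBM (big-endian) bit numbering: bit 0 is the most significant bit. */
static uint64_t ibm_bit(unsigned int n)
{
	return 1ULL << (63 - n);
}

int main(void)
{
	/* AMOR: key 0 occupies IBM bits 0-1, so enabling key 0 sets "11" there. */
	uint64_t amor = ibm_bit(0) | ibm_bit(1);	/* == 3ULL << 62 */
	/* IAMR: IBM bit 1 of key 0 set means instruction fetch is prevented. */
	uint64_t iamr = ibm_bit(1);			/* == 1ULL << 62 */

	printf("AMOR=%#llx IAMR=%#llx\n",
	       (unsigned long long)amor, (unsigned long long)iamr);
	return 0;
}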
memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); + + radix_init_pgtable(); + /* Switch to the guard PID before turning on MMU */ + radix__switch_mmu_context(NULL, &init_mm); + if (cpu_has_feature(CPU_FTR_HVMODE)) + tlbiel_all(); +} + +void radix__early_init_mmu_secondary(void) +{ + unsigned long lpcr; + /* + * update partition table control register and UPRT + */ + if (!firmware_has_feature(FW_FEATURE_LPAR)) { + lpcr = mfspr(SPRN_LPCR); + mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR); + + mtspr(SPRN_PTCR, + __pa(partition_tb) | (PATB_SIZE_SHIFT - 12)); + radix_init_amor(); + } + + radix__switch_mmu_context(NULL, &init_mm); + if (cpu_has_feature(CPU_FTR_HVMODE)) + tlbiel_all(); +} + +void radix__mmu_cleanup_all(void) +{ + unsigned long lpcr; + + if (!firmware_has_feature(FW_FEATURE_LPAR)) { + lpcr = mfspr(SPRN_LPCR); + mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT); + mtspr(SPRN_PTCR, 0); + powernv_set_nmmu_ptcr(0); + radix__flush_tlb_all(); + } +} + +void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size) +{ + /* + * We don't currently support the first MEMBLOCK not mapping 0 + * physical on those processors + */ + BUG_ON(first_memblock_base != 0); + + /* + * Radix mode is not limited by RMA / VRMA addressing. + */ + ppc64_rma_size = ULONG_MAX; +} + +#ifdef CONFIG_MEMORY_HOTPLUG +static void free_pte_table(pte_t *pte_start, pmd_t *pmd) +{ + pte_t *pte; + int i; + + for (i = 0; i < PTRS_PER_PTE; i++) { + pte = pte_start + i; + if (!pte_none(*pte)) + return; + } + + pte_free_kernel(&init_mm, pte_start); + pmd_clear(pmd); +} + +static void free_pmd_table(pmd_t *pmd_start, pud_t *pud) +{ + pmd_t *pmd; + int i; + + for (i = 0; i < PTRS_PER_PMD; i++) { + pmd = pmd_start + i; + if (!pmd_none(*pmd)) + return; + } + + pmd_free(&init_mm, pmd_start); + pud_clear(pud); +} + +struct change_mapping_params { + pte_t *pte; + unsigned long start; + unsigned long end; + unsigned long aligned_start; + unsigned long aligned_end; +}; + +static int __meminit stop_machine_change_mapping(void *data) +{ + struct change_mapping_params *params = + (struct change_mapping_params *)data; + + if (!data) + return -1; + + spin_unlock(&init_mm.page_table_lock); + pte_clear(&init_mm, params->aligned_start, params->pte); + create_physical_mapping(params->aligned_start, params->start, -1); + create_physical_mapping(params->end, params->aligned_end, -1); + spin_lock(&init_mm.page_table_lock); + return 0; +} + +static void remove_pte_table(pte_t *pte_start, unsigned long addr, + unsigned long end) +{ + unsigned long next; + pte_t *pte; + + pte = pte_start + pte_index(addr); + for (; addr < end; addr = next, pte++) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + if (next > end) + next = end; + + if (!pte_present(*pte)) + continue; + + if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(next)) { + /* + * The vmemmap_free() and remove_section_mapping() + * codepaths call us with aligned addresses. 
+ */ + WARN_ONCE(1, "%s: unaligned range\n", __func__); + continue; + } + + pte_clear(&init_mm, addr, pte); + } +} + +/* + * clear the pte and potentially split the mapping helper + */ +static void __meminit split_kernel_mapping(unsigned long addr, unsigned long end, + unsigned long size, pte_t *pte) +{ + unsigned long mask = ~(size - 1); + unsigned long aligned_start = addr & mask; + unsigned long aligned_end = addr + size; + struct change_mapping_params params; + bool split_region = false; + + if ((end - addr) < size) { + /* + * We're going to clear the PTE, but not flushed + * the mapping, time to remap and flush. The + * effects if visible outside the processor or + * if we are running in code close to the + * mapping we cleared, we are in trouble. + */ + if (overlaps_kernel_text(aligned_start, addr) || + overlaps_kernel_text(end, aligned_end)) { + /* + * Hack, just return, don't pte_clear + */ + WARN_ONCE(1, "Linear mapping %lx->%lx overlaps kernel " + "text, not splitting\n", addr, end); + return; + } + split_region = true; + } + + if (split_region) { + params.pte = pte; + params.start = addr; + params.end = end; + params.aligned_start = addr & ~(size - 1); + params.aligned_end = min_t(unsigned long, aligned_end, + (unsigned long)__va(memblock_end_of_DRAM())); + stop_machine(stop_machine_change_mapping, ¶ms, NULL); + return; + } + + pte_clear(&init_mm, addr, pte); +} + +static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr, + unsigned long end) +{ + unsigned long next; + pte_t *pte_base; + pmd_t *pmd; + + pmd = pmd_start + pmd_index(addr); + for (; addr < end; addr = next, pmd++) { + next = pmd_addr_end(addr, end); + + if (!pmd_present(*pmd)) + continue; + + if (pmd_huge(*pmd)) { + split_kernel_mapping(addr, end, PMD_SIZE, (pte_t *)pmd); + continue; + } + + pte_base = (pte_t *)pmd_page_vaddr(*pmd); + remove_pte_table(pte_base, addr, next); + free_pte_table(pte_base, pmd); + } +} + +static void remove_pud_table(pud_t *pud_start, unsigned long addr, + unsigned long end) +{ + unsigned long next; + pmd_t *pmd_base; + pud_t *pud; + + pud = pud_start + pud_index(addr); + for (; addr < end; addr = next, pud++) { + next = pud_addr_end(addr, end); + + if (!pud_present(*pud)) + continue; + + if (pud_huge(*pud)) { + split_kernel_mapping(addr, end, PUD_SIZE, (pte_t *)pud); + continue; + } + + pmd_base = (pmd_t *)pud_page_vaddr(*pud); + remove_pmd_table(pmd_base, addr, next); + free_pmd_table(pmd_base, pud); + } +} + +static void __meminit remove_pagetable(unsigned long start, unsigned long end) +{ + unsigned long addr, next; + pud_t *pud_base; + pgd_t *pgd; + + spin_lock(&init_mm.page_table_lock); + + for (addr = start; addr < end; addr = next) { + next = pgd_addr_end(addr, end); + + pgd = pgd_offset_k(addr); + if (!pgd_present(*pgd)) + continue; + + if (pgd_huge(*pgd)) { + split_kernel_mapping(addr, end, PGDIR_SIZE, (pte_t *)pgd); + continue; + } + + pud_base = (pud_t *)pgd_page_vaddr(*pgd); + remove_pud_table(pud_base, addr, next); + } + + spin_unlock(&init_mm.page_table_lock); + radix__flush_tlb_kernel_range(start, end); +} + +int __meminit radix__create_section_mapping(unsigned long start, unsigned long end, int nid) +{ + if (end >= RADIX_VMALLOC_START) { + pr_warn("Outside the supported range\n"); + return -1; + } + + return create_physical_mapping(start, end, nid); +} + +int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end) +{ + remove_pagetable(start, end); + return 0; +} +#endif /* CONFIG_MEMORY_HOTPLUG */ + +#ifdef CONFIG_SPARSEMEM_VMEMMAP 
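The free_pte_table()/free_pmd_table() and remove_*_table() helpers above all follow one pattern per level: clear the entries covered by the range, then free a lower-level table only once every slot in it is empty, and finally clear the pointer in the level above. A toy userspace illustration of that pattern with a two-level table (structures and names invented for the sketch, not kernel API):

#include <stdbool.h>
#include <stdlib.h>

#define SLOTS 16

struct leaf_table { void *entry[SLOTS]; };
struct top_table  { struct leaf_table *leaf[SLOTS]; };

static bool leaf_is_empty(const struct leaf_table *lt)
{
	for (int i = 0; i < SLOTS; i++)
		if (lt->entry[i])
			return false;
	return true;
}

/* Clear a range of leaf slots, then free the leaf table if it became empty. */
static void remove_range(struct top_table *top, int ti, int first, int last)
{
	struct leaf_table *lt = top->leaf[ti];

	if (!lt)
		return;

	for (int i = first; i <= last; i++)
		lt->entry[i] = NULL;

	if (leaf_is_empty(lt)) {
		free(lt);
		top->leaf[ti] = NULL;
	}
}

int main(void)
{
	struct top_table top = { 0 };
	static int payload;

	top.leaf[0] = calloc(1, sizeof(*top.leaf[0]));
	top.leaf[0]->entry[3] = &payload;

	remove_range(&top, 0, 0, SLOTS - 1);	/* clears slot 3, frees the leaf */
	return top.leaf[0] != NULL;		/* exits 0: the empty leaf was freed */
}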
+static int __map_kernel_page_nid(unsigned long ea, unsigned long pa, + pgprot_t flags, unsigned int map_page_size, + int nid) +{ + return __map_kernel_page(ea, pa, flags, map_page_size, nid, 0, 0); +} + +int __meminit radix__vmemmap_create_mapping(unsigned long start, + unsigned long page_size, + unsigned long phys) +{ + /* Create a PTE encoding */ + unsigned long flags = _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_KERNEL_RW; + int nid = early_pfn_to_nid(phys >> PAGE_SHIFT); + int ret; + + if ((start + page_size) >= RADIX_VMEMMAP_END) { + pr_warn("Outside the supported range\n"); + return -1; + } + + ret = __map_kernel_page_nid(start, phys, __pgprot(flags), page_size, nid); + BUG_ON(ret); + + return 0; +} + +#ifdef CONFIG_MEMORY_HOTPLUG +void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size) +{ + remove_pagetable(start, start + page_size); +} +#endif +#endif + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + +unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, unsigned long clr, + unsigned long set) +{ + unsigned long old; + +#ifdef CONFIG_DEBUG_VM + WARN_ON(!radix__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp)); + assert_spin_locked(pmd_lockptr(mm, pmdp)); +#endif + + old = radix__pte_update(mm, addr, (pte_t *)pmdp, clr, set, 1); + trace_hugepage_update(addr, old, clr, set); + + return old; +} + +pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) + +{ + pmd_t pmd; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + VM_BUG_ON(radix__pmd_trans_huge(*pmdp)); + VM_BUG_ON(pmd_devmap(*pmdp)); + /* + * khugepaged calls this for normal pmd + */ + pmd = *pmdp; + pmd_clear(pmdp); + + /*FIXME!! Verify whether we need this kick below */ + serialize_against_pte_lookup(vma->vm_mm); + + radix__flush_tlb_collapsed_pmd(vma->vm_mm, address); + + return pmd; +} + +/* + * For us pgtable_t is pte_t *. Inorder to save the deposisted + * page table, we consider the allocated page table as a list + * head. On withdraw we need to make sure we zero out the used + * list_head memory area. + */ +void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable) +{ + struct list_head *lh = (struct list_head *) pgtable; + + assert_spin_locked(pmd_lockptr(mm, pmdp)); + + /* FIFO */ + if (!pmd_huge_pte(mm, pmdp)) + INIT_LIST_HEAD(lh); + else + list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp)); + pmd_huge_pte(mm, pmdp) = pgtable; +} + +pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) +{ + pte_t *ptep; + pgtable_t pgtable; + struct list_head *lh; + + assert_spin_locked(pmd_lockptr(mm, pmdp)); + + /* FIFO */ + pgtable = pmd_huge_pte(mm, pmdp); + lh = (struct list_head *) pgtable; + if (list_empty(lh)) + pmd_huge_pte(mm, pmdp) = NULL; + else { + pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next; + list_del(lh); + } + ptep = (pte_t *) pgtable; + *ptep = __pte(0); + ptep++; + *ptep = __pte(0); + return pgtable; +} + +pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + pmd_t old_pmd; + unsigned long old; + + old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0); + old_pmd = __pmd(old); + /* + * Serialize against find_current_mm_pte which does lock-less + * lookup in page tables with local interrupts disabled. For huge pages + * it casts pmd_t to pte_t. 
Since format of pte_t is different from + * pmd_t we want to prevent transit from pmd pointing to page table + * to pmd pointing to huge page (and back) while interrupts are disabled. + * We clear pmd to possibly replace it with page table pointer in + * different code paths. So make sure we wait for the parallel + * find_current_mm_pte to finish. + */ + serialize_against_pte_lookup(mm); + return old_pmd; +} + +int radix__has_transparent_hugepage(void) +{ + /* For radix 2M at PMD level means thp */ + if (mmu_psize_defs[MMU_PAGE_2M].shift == PMD_SHIFT) + return 1; + return 0; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep, + pte_t entry, unsigned long address, int psize) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED | + _PAGE_RW | _PAGE_EXEC); + + unsigned long change = pte_val(entry) ^ pte_val(*ptep); + /* + * To avoid NMMU hang while relaxing access, we need mark + * the pte invalid in between. + */ + if ((change & _PAGE_RW) && atomic_read(&mm->context.copros) > 0) { + unsigned long old_pte, new_pte; + + old_pte = __radix_pte_update(ptep, _PAGE_PRESENT, _PAGE_INVALID); + /* + * new value of pte + */ + new_pte = old_pte | set; + radix__flush_tlb_page_psize(mm, address, psize); + __radix_pte_update(ptep, _PAGE_INVALID, new_pte); + } else { + __radix_pte_update(ptep, 0, set); + /* + * Book3S does not require a TLB flush when relaxing access + * restrictions when the address space is not attached to a + * NMMU, because the core MMU will reload the pte after taking + * an access fault, which is defined by the architectue. + */ + } + /* See ptesync comment in radix__set_pte_at */ +} + +void radix__ptep_modify_prot_commit(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t old_pte, pte_t pte) +{ + struct mm_struct *mm = vma->vm_mm; + + /* + * To avoid NMMU hang while relaxing access we need to flush the tlb before + * we set the new value. We need to do this only for radix, because hash + * translation does flush when updating the linux pte. + */ + if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) && + (atomic_read(&mm->context.copros) > 0)) + radix__flush_tlb_page(vma, addr); + + set_pte_at(mm, addr, ptep, pte); +} diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c new file mode 100644 index 000000000000..6a23b9ebd2a1 --- /dev/null +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -0,0 +1,1101 @@ +/* + * TLB flush routines for radix kernels. + * + * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#define RIC_FLUSH_TLB 0 +#define RIC_FLUSH_PWC 1 +#define RIC_FLUSH_ALL 2 + +/* + * tlbiel instruction for radix, set invalidation + * i.e., r=1 and is=01 or is=10 or is=11 + */ +static inline void tlbiel_radix_set_isa300(unsigned int set, unsigned int is, + unsigned int pid, + unsigned int ric, unsigned int prs) +{ + unsigned long rb; + unsigned long rs; + + rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53)); + rs = ((unsigned long)pid << PPC_BITLSHIFT(31)); + + asm volatile(PPC_TLBIEL(%0, %1, %2, %3, 1) + : : "r"(rb), "r"(rs), "i"(ric), "i"(prs) + : "memory"); +} + +static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is) +{ + unsigned int set; + + asm volatile("ptesync": : :"memory"); + + /* + * Flush the first set of the TLB, and the entire Page Walk Cache + * and partition table entries. Then flush the remaining sets of the + * TLB. + */ + tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 0); + for (set = 1; set < num_sets; set++) + tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 0); + + /* Do the same for process scoped entries. */ + tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 1); + for (set = 1; set < num_sets; set++) + tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 1); + + asm volatile("ptesync": : :"memory"); +} + +void radix__tlbiel_all(unsigned int action) +{ + unsigned int is; + + switch (action) { + case TLB_INVAL_SCOPE_GLOBAL: + is = 3; + break; + case TLB_INVAL_SCOPE_LPID: + is = 2; + break; + default: + BUG(); + } + + if (early_cpu_has_feature(CPU_FTR_ARCH_300)) + tlbiel_all_isa300(POWER9_TLB_SETS_RADIX, is); + else + WARN(1, "%s called on pre-POWER9 CPU\n", __func__); + + asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); +} + +static inline void __tlbiel_pid(unsigned long pid, int set, + unsigned long ric) +{ + unsigned long rb,rs,prs,r; + + rb = PPC_BIT(53); /* IS = 1 */ + rb |= set << PPC_BITLSHIFT(51); + rs = ((unsigned long)pid) << PPC_BITLSHIFT(31); + prs = 1; /* process scoped */ + r = 1; /* radix format */ + + asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + trace_tlbie(0, 1, rb, rs, ric, prs, r); +} + +static inline void __tlbie_pid(unsigned long pid, unsigned long ric) +{ + unsigned long rb,rs,prs,r; + + rb = PPC_BIT(53); /* IS = 1 */ + rs = pid << PPC_BITLSHIFT(31); + prs = 1; /* process scoped */ + r = 1; /* radix format */ + + asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + trace_tlbie(0, 0, rb, rs, ric, prs, r); +} + +static inline void __tlbiel_lpid(unsigned long lpid, int set, + unsigned long ric) +{ + unsigned long rb,rs,prs,r; + + rb = PPC_BIT(52); /* IS = 2 */ + rb |= set << PPC_BITLSHIFT(51); + rs = 0; /* LPID comes from LPIDR */ + prs = 0; /* partition scoped */ + r = 1; /* radix format */ + + asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + trace_tlbie(lpid, 1, rb, rs, ric, prs, r); +} + +static inline void __tlbie_lpid(unsigned long lpid, unsigned long ric) +{ + unsigned long rb,rs,prs,r; + + rb = PPC_BIT(52); /* IS = 2 */ + rs = lpid; + prs = 0; /* partition scoped */ + r = 1; /* radix format */ + + asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + trace_tlbie(lpid, 0, rb, rs, ric, prs, r); +} + +static inline void __tlbiel_lpid_guest(unsigned long lpid, int 
set, + unsigned long ric) +{ + unsigned long rb,rs,prs,r; + + rb = PPC_BIT(52); /* IS = 2 */ + rb |= set << PPC_BITLSHIFT(51); + rs = 0; /* LPID comes from LPIDR */ + prs = 1; /* process scoped */ + r = 1; /* radix format */ + + asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + trace_tlbie(lpid, 1, rb, rs, ric, prs, r); +} + + +static inline void __tlbiel_va(unsigned long va, unsigned long pid, + unsigned long ap, unsigned long ric) +{ + unsigned long rb,rs,prs,r; + + rb = va & ~(PPC_BITMASK(52, 63)); + rb |= ap << PPC_BITLSHIFT(58); + rs = pid << PPC_BITLSHIFT(31); + prs = 1; /* process scoped */ + r = 1; /* radix format */ + + asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + trace_tlbie(0, 1, rb, rs, ric, prs, r); +} + +static inline void __tlbie_va(unsigned long va, unsigned long pid, + unsigned long ap, unsigned long ric) +{ + unsigned long rb,rs,prs,r; + + rb = va & ~(PPC_BITMASK(52, 63)); + rb |= ap << PPC_BITLSHIFT(58); + rs = pid << PPC_BITLSHIFT(31); + prs = 1; /* process scoped */ + r = 1; /* radix format */ + + asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + trace_tlbie(0, 0, rb, rs, ric, prs, r); +} + +static inline void __tlbie_lpid_va(unsigned long va, unsigned long lpid, + unsigned long ap, unsigned long ric) +{ + unsigned long rb,rs,prs,r; + + rb = va & ~(PPC_BITMASK(52, 63)); + rb |= ap << PPC_BITLSHIFT(58); + rs = lpid; + prs = 0; /* partition scoped */ + r = 1; /* radix format */ + + asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + trace_tlbie(lpid, 0, rb, rs, ric, prs, r); +} + +static inline void fixup_tlbie(void) +{ + unsigned long pid = 0; + unsigned long va = ((1UL << 52) - 1); + + if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) { + asm volatile("ptesync": : :"memory"); + __tlbie_va(va, pid, mmu_get_ap(MMU_PAGE_64K), RIC_FLUSH_TLB); + } +} + +static inline void fixup_tlbie_lpid(unsigned long lpid) +{ + unsigned long va = ((1UL << 52) - 1); + + if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) { + asm volatile("ptesync": : :"memory"); + __tlbie_lpid_va(va, lpid, mmu_get_ap(MMU_PAGE_64K), RIC_FLUSH_TLB); + } +} + +/* + * We use 128 set in radix mode and 256 set in hpt mode. + */ +static inline void _tlbiel_pid(unsigned long pid, unsigned long ric) +{ + int set; + + asm volatile("ptesync": : :"memory"); + + /* + * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL, + * also flush the entire Page Walk Cache. + */ + __tlbiel_pid(pid, 0, ric); + + /* For PWC, only one flush is needed */ + if (ric == RIC_FLUSH_PWC) { + asm volatile("ptesync": : :"memory"); + return; + } + + /* For the remaining sets, just flush the TLB */ + for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++) + __tlbiel_pid(pid, set, RIC_FLUSH_TLB); + + asm volatile("ptesync": : :"memory"); + asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); +} + +static inline void _tlbie_pid(unsigned long pid, unsigned long ric) +{ + asm volatile("ptesync": : :"memory"); + + /* + * Workaround the fact that the "ric" argument to __tlbie_pid + * must be a compile-time contraint to match the "i" constraint + * in the asm statement. 
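The RB/RS values assembled by the __tlbie*() and __tlbiel*() helpers above all use the same IBM-numbered bit placement: the IS field sits at IBM bits 52-53, the PID is shifted so its low bit lands at IBM bit 31, and for VA-based forms the low 12 bits (IBM bits 52-63) are masked off before the AP encoding is shifted to IBM bit 58. A userspace sketch of that layout, with local stand-ins for the kernel's PPC_BIT()/PPC_BITLSHIFT()/PPC_BITMASK() macros (the 64K AP value 0x5 matches the device-tree fallback earlier in radix_pgtable.c; everything else here is illustrative only):

#include <stdint.h>
#include <stdio.h>

/* Local stand-ins for the kernel's IBM-numbered bit helpers. */
#define BITLSHIFT(be)	(63 - (be))
#define BIT_IBM(b)	(1ULL << BITLSHIFT(b))
#define BITMASK(bs, be)	((BIT_IBM(bs) - BIT_IBM(be)) | BIT_IBM(bs))

int main(void)
{
	uint64_t pid = 0x42, va = 0x7fff12345000ULL, ap = 0x5;

	/* PID-scoped invalidate: IS=1 at IBM bit 53, PID's low bit at IBM bit 31. */
	uint64_t rb_pid = BIT_IBM(53);
	uint64_t rs_pid = pid << BITLSHIFT(31);

	/* VA-based invalidate: EA with IBM bits 52-63 cleared, AP shifted to IBM bit 58. */
	uint64_t rb_va = (va & ~BITMASK(52, 63)) | (ap << BITLSHIFT(58));

	printf("rb_pid=%#llx rs_pid=%#llx rb_va=%#llx\n",
	       (unsigned long long)rb_pid, (unsigned long long)rs_pid,
	       (unsigned long long)rb_va);
	return 0;
}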
+ */ + switch (ric) { + case RIC_FLUSH_TLB: + __tlbie_pid(pid, RIC_FLUSH_TLB); + break; + case RIC_FLUSH_PWC: + __tlbie_pid(pid, RIC_FLUSH_PWC); + break; + case RIC_FLUSH_ALL: + default: + __tlbie_pid(pid, RIC_FLUSH_ALL); + } + fixup_tlbie(); + asm volatile("eieio; tlbsync; ptesync": : :"memory"); +} + +static inline void _tlbiel_lpid(unsigned long lpid, unsigned long ric) +{ + int set; + + VM_BUG_ON(mfspr(SPRN_LPID) != lpid); + + asm volatile("ptesync": : :"memory"); + + /* + * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL, + * also flush the entire Page Walk Cache. + */ + __tlbiel_lpid(lpid, 0, ric); + + /* For PWC, only one flush is needed */ + if (ric == RIC_FLUSH_PWC) { + asm volatile("ptesync": : :"memory"); + return; + } + + /* For the remaining sets, just flush the TLB */ + for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++) + __tlbiel_lpid(lpid, set, RIC_FLUSH_TLB); + + asm volatile("ptesync": : :"memory"); + asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); +} + +static inline void _tlbie_lpid(unsigned long lpid, unsigned long ric) +{ + asm volatile("ptesync": : :"memory"); + + /* + * Workaround the fact that the "ric" argument to __tlbie_pid + * must be a compile-time contraint to match the "i" constraint + * in the asm statement. + */ + switch (ric) { + case RIC_FLUSH_TLB: + __tlbie_lpid(lpid, RIC_FLUSH_TLB); + break; + case RIC_FLUSH_PWC: + __tlbie_lpid(lpid, RIC_FLUSH_PWC); + break; + case RIC_FLUSH_ALL: + default: + __tlbie_lpid(lpid, RIC_FLUSH_ALL); + } + fixup_tlbie_lpid(lpid); + asm volatile("eieio; tlbsync; ptesync": : :"memory"); +} + +static inline void _tlbiel_lpid_guest(unsigned long lpid, unsigned long ric) +{ + int set; + + VM_BUG_ON(mfspr(SPRN_LPID) != lpid); + + asm volatile("ptesync": : :"memory"); + + /* + * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL, + * also flush the entire Page Walk Cache. 
+ */ + __tlbiel_lpid_guest(lpid, 0, ric); + + /* For PWC, only one flush is needed */ + if (ric == RIC_FLUSH_PWC) { + asm volatile("ptesync": : :"memory"); + return; + } + + /* For the remaining sets, just flush the TLB */ + for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++) + __tlbiel_lpid_guest(lpid, set, RIC_FLUSH_TLB); + + asm volatile("ptesync": : :"memory"); + asm volatile(PPC_INVALIDATE_ERAT : : :"memory"); +} + + +static inline void __tlbiel_va_range(unsigned long start, unsigned long end, + unsigned long pid, unsigned long page_size, + unsigned long psize) +{ + unsigned long addr; + unsigned long ap = mmu_get_ap(psize); + + for (addr = start; addr < end; addr += page_size) + __tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB); +} + +static inline void _tlbiel_va(unsigned long va, unsigned long pid, + unsigned long psize, unsigned long ric) +{ + unsigned long ap = mmu_get_ap(psize); + + asm volatile("ptesync": : :"memory"); + __tlbiel_va(va, pid, ap, ric); + asm volatile("ptesync": : :"memory"); +} + +static inline void _tlbiel_va_range(unsigned long start, unsigned long end, + unsigned long pid, unsigned long page_size, + unsigned long psize, bool also_pwc) +{ + asm volatile("ptesync": : :"memory"); + if (also_pwc) + __tlbiel_pid(pid, 0, RIC_FLUSH_PWC); + __tlbiel_va_range(start, end, pid, page_size, psize); + asm volatile("ptesync": : :"memory"); +} + +static inline void __tlbie_va_range(unsigned long start, unsigned long end, + unsigned long pid, unsigned long page_size, + unsigned long psize) +{ + unsigned long addr; + unsigned long ap = mmu_get_ap(psize); + + for (addr = start; addr < end; addr += page_size) + __tlbie_va(addr, pid, ap, RIC_FLUSH_TLB); +} + +static inline void _tlbie_va(unsigned long va, unsigned long pid, + unsigned long psize, unsigned long ric) +{ + unsigned long ap = mmu_get_ap(psize); + + asm volatile("ptesync": : :"memory"); + __tlbie_va(va, pid, ap, ric); + fixup_tlbie(); + asm volatile("eieio; tlbsync; ptesync": : :"memory"); +} + +static inline void _tlbie_lpid_va(unsigned long va, unsigned long lpid, + unsigned long psize, unsigned long ric) +{ + unsigned long ap = mmu_get_ap(psize); + + asm volatile("ptesync": : :"memory"); + __tlbie_lpid_va(va, lpid, ap, ric); + fixup_tlbie_lpid(lpid); + asm volatile("eieio; tlbsync; ptesync": : :"memory"); +} + +static inline void _tlbie_va_range(unsigned long start, unsigned long end, + unsigned long pid, unsigned long page_size, + unsigned long psize, bool also_pwc) +{ + asm volatile("ptesync": : :"memory"); + if (also_pwc) + __tlbie_pid(pid, RIC_FLUSH_PWC); + __tlbie_va_range(start, end, pid, page_size, psize); + fixup_tlbie(); + asm volatile("eieio; tlbsync; ptesync": : :"memory"); +} + +/* + * Base TLB flushing operations: + * + * - flush_tlb_mm(mm) flushes the specified mm context TLB's + * - flush_tlb_page(vma, vmaddr) flushes one page + * - flush_tlb_range(vma, start, end) flushes a range of pages + * - flush_tlb_kernel_range(start, end) flushes kernel pages + * + * - local_* variants of page and mm only apply to the current + * processor + */ +void radix__local_flush_tlb_mm(struct mm_struct *mm) +{ + unsigned long pid; + + preempt_disable(); + pid = mm->context.id; + if (pid != MMU_NO_CONTEXT) + _tlbiel_pid(pid, RIC_FLUSH_TLB); + preempt_enable(); +} +EXPORT_SYMBOL(radix__local_flush_tlb_mm); + +#ifndef CONFIG_SMP +void radix__local_flush_all_mm(struct mm_struct *mm) +{ + unsigned long pid; + + preempt_disable(); + pid = mm->context.id; + if (pid != MMU_NO_CONTEXT) + _tlbiel_pid(pid, RIC_FLUSH_ALL); + 
preempt_enable(); +} +EXPORT_SYMBOL(radix__local_flush_all_mm); +#endif /* CONFIG_SMP */ + +void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, + int psize) +{ + unsigned long pid; + + preempt_disable(); + pid = mm->context.id; + if (pid != MMU_NO_CONTEXT) + _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB); + preempt_enable(); +} + +void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +{ +#ifdef CONFIG_HUGETLB_PAGE + /* need the return fix for nohash.c */ + if (is_vm_hugetlb_page(vma)) + return radix__local_flush_hugetlb_page(vma, vmaddr); +#endif + radix__local_flush_tlb_page_psize(vma->vm_mm, vmaddr, mmu_virtual_psize); +} +EXPORT_SYMBOL(radix__local_flush_tlb_page); + +static bool mm_is_singlethreaded(struct mm_struct *mm) +{ + if (atomic_read(&mm->context.copros) > 0) + return false; + if (atomic_read(&mm->mm_users) <= 1 && current->mm == mm) + return true; + return false; +} + +static bool mm_needs_flush_escalation(struct mm_struct *mm) +{ + /* + * P9 nest MMU has issues with the page walk cache + * caching PTEs and not flushing them properly when + * RIC = 0 for a PID/LPID invalidate + */ + if (atomic_read(&mm->context.copros) > 0) + return true; + return false; +} + +#ifdef CONFIG_SMP +static void do_exit_flush_lazy_tlb(void *arg) +{ + struct mm_struct *mm = arg; + unsigned long pid = mm->context.id; + + if (current->mm == mm) + return; /* Local CPU */ + + if (current->active_mm == mm) { + /* + * Must be a kernel thread because sender is single-threaded. + */ + BUG_ON(current->mm); + mmgrab(&init_mm); + switch_mm(mm, &init_mm, current); + current->active_mm = &init_mm; + mmdrop(mm); + } + _tlbiel_pid(pid, RIC_FLUSH_ALL); +} + +static void exit_flush_lazy_tlbs(struct mm_struct *mm) +{ + /* + * Would be nice if this was async so it could be run in + * parallel with our local flush, but generic code does not + * give a good API for it. Could extend the generic code or + * make a special powerpc IPI for flushing TLBs. + * For now it's not too performance critical. + */ + smp_call_function_many(mm_cpumask(mm), do_exit_flush_lazy_tlb, + (void *)mm, 1); + mm_reset_thread_local(mm); +} + +void radix__flush_tlb_mm(struct mm_struct *mm) +{ + unsigned long pid; + + pid = mm->context.id; + if (unlikely(pid == MMU_NO_CONTEXT)) + return; + + preempt_disable(); + /* + * Order loads of mm_cpumask vs previous stores to clear ptes before + * the invalidate. 
See barrier in switch_mm_irqs_off + */ + smp_mb(); + if (!mm_is_thread_local(mm)) { + if (unlikely(mm_is_singlethreaded(mm))) { + exit_flush_lazy_tlbs(mm); + goto local; + } + + if (mm_needs_flush_escalation(mm)) + _tlbie_pid(pid, RIC_FLUSH_ALL); + else + _tlbie_pid(pid, RIC_FLUSH_TLB); + } else { +local: + _tlbiel_pid(pid, RIC_FLUSH_TLB); + } + preempt_enable(); +} +EXPORT_SYMBOL(radix__flush_tlb_mm); + +static void __flush_all_mm(struct mm_struct *mm, bool fullmm) +{ + unsigned long pid; + + pid = mm->context.id; + if (unlikely(pid == MMU_NO_CONTEXT)) + return; + + preempt_disable(); + smp_mb(); /* see radix__flush_tlb_mm */ + if (!mm_is_thread_local(mm)) { + if (unlikely(mm_is_singlethreaded(mm))) { + if (!fullmm) { + exit_flush_lazy_tlbs(mm); + goto local; + } + } + _tlbie_pid(pid, RIC_FLUSH_ALL); + } else { +local: + _tlbiel_pid(pid, RIC_FLUSH_ALL); + } + preempt_enable(); +} +void radix__flush_all_mm(struct mm_struct *mm) +{ + __flush_all_mm(mm, false); +} +EXPORT_SYMBOL(radix__flush_all_mm); + +void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr) +{ + tlb->need_flush_all = 1; +} +EXPORT_SYMBOL(radix__flush_tlb_pwc); + +void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, + int psize) +{ + unsigned long pid; + + pid = mm->context.id; + if (unlikely(pid == MMU_NO_CONTEXT)) + return; + + preempt_disable(); + smp_mb(); /* see radix__flush_tlb_mm */ + if (!mm_is_thread_local(mm)) { + if (unlikely(mm_is_singlethreaded(mm))) { + exit_flush_lazy_tlbs(mm); + goto local; + } + _tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB); + } else { +local: + _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB); + } + preempt_enable(); +} + +void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +{ +#ifdef CONFIG_HUGETLB_PAGE + if (is_vm_hugetlb_page(vma)) + return radix__flush_hugetlb_page(vma, vmaddr); +#endif + radix__flush_tlb_page_psize(vma->vm_mm, vmaddr, mmu_virtual_psize); +} +EXPORT_SYMBOL(radix__flush_tlb_page); + +#else /* CONFIG_SMP */ +#define radix__flush_all_mm radix__local_flush_all_mm +#endif /* CONFIG_SMP */ + +void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end) +{ + _tlbie_pid(0, RIC_FLUSH_ALL); +} +EXPORT_SYMBOL(radix__flush_tlb_kernel_range); + +#define TLB_FLUSH_ALL -1UL + +/* + * Number of pages above which we invalidate the entire PID rather than + * flush individual pages, for local and global flushes respectively. + * + * tlbie goes out to the interconnect and individual ops are more costly. + * It also does not iterate over sets like the local tlbiel variant when + * invalidating a full PID, so it has a far lower threshold to change from + * individual page flushes to full-pid flushes. 
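The two ceilings defined just below encode that trade-off: past a certain number of pages it is cheaper to invalidate the whole PID than to issue per-page invalidations, and the broadcast tlbie path switches over much sooner than the local tlbiel path. A sketch of the decision in isolation, mirroring (but not reproducing) the defaults declared below, namely 33 for the broadcast path and twice the 128 radix TLB sets noted earlier in this file for the local path:

#include <stdbool.h>
#include <stdio.h>

enum flush_kind { FLUSH_PER_PAGE, FLUSH_FULL_PID };

/* Illustrative ceilings: broadcast flushes give up on per-page much sooner. */
static const unsigned long global_ceiling = 33;
static const unsigned long local_ceiling  = 128 * 2;

static enum flush_kind pick_flush(unsigned long nr_pages, bool local)
{
	unsigned long ceiling = local ? local_ceiling : global_ceiling;

	return nr_pages > ceiling ? FLUSH_FULL_PID : FLUSH_PER_PAGE;
}

int main(void)
{
	printf("%d\n", pick_flush(40, false));	/* 1: full-PID broadcast flush */
	printf("%d\n", pick_flush(40, true));	/* 0: per-page local flush */
	return 0;
}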
+ */ +static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; +static unsigned long tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2; + +static inline void __radix__flush_tlb_range(struct mm_struct *mm, + unsigned long start, unsigned long end, + bool flush_all_sizes) + +{ + unsigned long pid; + unsigned int page_shift = mmu_psize_defs[mmu_virtual_psize].shift; + unsigned long page_size = 1UL << page_shift; + unsigned long nr_pages = (end - start) >> page_shift; + bool local, full; + + pid = mm->context.id; + if (unlikely(pid == MMU_NO_CONTEXT)) + return; + + preempt_disable(); + smp_mb(); /* see radix__flush_tlb_mm */ + if (!mm_is_thread_local(mm)) { + if (unlikely(mm_is_singlethreaded(mm))) { + if (end != TLB_FLUSH_ALL) { + exit_flush_lazy_tlbs(mm); + goto is_local; + } + } + local = false; + full = (end == TLB_FLUSH_ALL || + nr_pages > tlb_single_page_flush_ceiling); + } else { +is_local: + local = true; + full = (end == TLB_FLUSH_ALL || + nr_pages > tlb_local_single_page_flush_ceiling); + } + + if (full) { + if (local) { + _tlbiel_pid(pid, RIC_FLUSH_TLB); + } else { + if (mm_needs_flush_escalation(mm)) + _tlbie_pid(pid, RIC_FLUSH_ALL); + else + _tlbie_pid(pid, RIC_FLUSH_TLB); + } + } else { + bool hflush = flush_all_sizes; + bool gflush = flush_all_sizes; + unsigned long hstart, hend; + unsigned long gstart, gend; + + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) + hflush = true; + + if (hflush) { + hstart = (start + PMD_SIZE - 1) & PMD_MASK; + hend = end & PMD_MASK; + if (hstart == hend) + hflush = false; + } + + if (gflush) { + gstart = (start + PUD_SIZE - 1) & PUD_MASK; + gend = end & PUD_MASK; + if (gstart == gend) + gflush = false; + } + + asm volatile("ptesync": : :"memory"); + if (local) { + __tlbiel_va_range(start, end, pid, page_size, mmu_virtual_psize); + if (hflush) + __tlbiel_va_range(hstart, hend, pid, + PMD_SIZE, MMU_PAGE_2M); + if (gflush) + __tlbiel_va_range(gstart, gend, pid, + PUD_SIZE, MMU_PAGE_1G); + asm volatile("ptesync": : :"memory"); + } else { + __tlbie_va_range(start, end, pid, page_size, mmu_virtual_psize); + if (hflush) + __tlbie_va_range(hstart, hend, pid, + PMD_SIZE, MMU_PAGE_2M); + if (gflush) + __tlbie_va_range(gstart, gend, pid, + PUD_SIZE, MMU_PAGE_1G); + fixup_tlbie(); + asm volatile("eieio; tlbsync; ptesync": : :"memory"); + } + } + preempt_enable(); +} + +void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end) + +{ +#ifdef CONFIG_HUGETLB_PAGE + if (is_vm_hugetlb_page(vma)) + return radix__flush_hugetlb_tlb_range(vma, start, end); +#endif + + __radix__flush_tlb_range(vma->vm_mm, start, end, false); +} +EXPORT_SYMBOL(radix__flush_tlb_range); + +static int radix_get_mmu_psize(int page_size) +{ + int psize; + + if (page_size == (1UL << mmu_psize_defs[mmu_virtual_psize].shift)) + psize = mmu_virtual_psize; + else if (page_size == (1UL << mmu_psize_defs[MMU_PAGE_2M].shift)) + psize = MMU_PAGE_2M; + else if (page_size == (1UL << mmu_psize_defs[MMU_PAGE_1G].shift)) + psize = MMU_PAGE_1G; + else + return -1; + return psize; +} + +/* + * Flush partition scoped LPID address translation for all CPUs. + */ +void radix__flush_tlb_lpid_page(unsigned int lpid, + unsigned long addr, + unsigned long page_size) +{ + int psize = radix_get_mmu_psize(page_size); + + _tlbie_lpid_va(addr, lpid, psize, RIC_FLUSH_TLB); +} +EXPORT_SYMBOL_GPL(radix__flush_tlb_lpid_page); + +/* + * Flush partition scoped PWC from LPID for all CPUs. 
+ */ +void radix__flush_pwc_lpid(unsigned int lpid) +{ + _tlbie_lpid(lpid, RIC_FLUSH_PWC); +} +EXPORT_SYMBOL_GPL(radix__flush_pwc_lpid); + +/* + * Flush partition scoped translations from LPID (=LPIDR) + */ +void radix__flush_tlb_lpid(unsigned int lpid) +{ + _tlbie_lpid(lpid, RIC_FLUSH_ALL); +} +EXPORT_SYMBOL_GPL(radix__flush_tlb_lpid); + +/* + * Flush partition scoped translations from LPID (=LPIDR) + */ +void radix__local_flush_tlb_lpid(unsigned int lpid) +{ + _tlbiel_lpid(lpid, RIC_FLUSH_ALL); +} +EXPORT_SYMBOL_GPL(radix__local_flush_tlb_lpid); + +/* + * Flush process scoped translations from LPID (=LPIDR). + * Important difference, the guest normally manages its own translations, + * but some cases e.g., vCPU CPU migration require KVM to flush. + */ +void radix__local_flush_tlb_lpid_guest(unsigned int lpid) +{ + _tlbiel_lpid_guest(lpid, RIC_FLUSH_ALL); +} +EXPORT_SYMBOL_GPL(radix__local_flush_tlb_lpid_guest); + + +static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start, + unsigned long end, int psize); + +void radix__tlb_flush(struct mmu_gather *tlb) +{ + int psize = 0; + struct mm_struct *mm = tlb->mm; + int page_size = tlb->page_size; + unsigned long start = tlb->start; + unsigned long end = tlb->end; + + /* + * if page size is not something we understand, do a full mm flush + * + * A "fullmm" flush must always do a flush_all_mm (RIC=2) flush + * that flushes the process table entry cache upon process teardown. + * See the comment for radix in arch_exit_mmap(). + */ + if (tlb->fullmm) { + __flush_all_mm(mm, true); +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE) + } else if (mm_tlb_flush_nested(mm)) { + /* + * If there is a concurrent invalidation that is clearing ptes, + * then it's possible this invalidation will miss one of those + * cleared ptes and miss flushing the TLB. If this invalidate + * returns before the other one flushes TLBs, that can result + * in it returning while there are still valid TLBs inside the + * range to be invalidated. + * + * See mm/memory.c:tlb_finish_mmu() for more details. + * + * The solution to this is ensure the entire range is always + * flushed here. The problem for powerpc is that the flushes + * are page size specific, so this "forced flush" would not + * do the right thing if there are a mix of page sizes in + * the range to be invalidated. So use __flush_tlb_range + * which invalidates all possible page sizes in the range. + * + * PWC flush probably is not be required because the core code + * shouldn't free page tables in this path, but accounting + * for the possibility makes us a bit more robust. + * + * need_flush_all is an uncommon case because page table + * teardown should be done with exclusive locks held (but + * after locks are dropped another invalidate could come + * in), it could be optimized further if necessary. 
+ */ + if (!tlb->need_flush_all) + __radix__flush_tlb_range(mm, start, end, true); + else + radix__flush_all_mm(mm); +#endif + } else if ( (psize = radix_get_mmu_psize(page_size)) == -1) { + if (!tlb->need_flush_all) + radix__flush_tlb_mm(mm); + else + radix__flush_all_mm(mm); + } else { + if (!tlb->need_flush_all) + radix__flush_tlb_range_psize(mm, start, end, psize); + else + radix__flush_tlb_pwc_range_psize(mm, start, end, psize); + } + tlb->need_flush_all = 0; +} + +static inline void __radix__flush_tlb_range_psize(struct mm_struct *mm, + unsigned long start, unsigned long end, + int psize, bool also_pwc) +{ + unsigned long pid; + unsigned int page_shift = mmu_psize_defs[psize].shift; + unsigned long page_size = 1UL << page_shift; + unsigned long nr_pages = (end - start) >> page_shift; + bool local, full; + + pid = mm->context.id; + if (unlikely(pid == MMU_NO_CONTEXT)) + return; + + preempt_disable(); + smp_mb(); /* see radix__flush_tlb_mm */ + if (!mm_is_thread_local(mm)) { + if (unlikely(mm_is_singlethreaded(mm))) { + if (end != TLB_FLUSH_ALL) { + exit_flush_lazy_tlbs(mm); + goto is_local; + } + } + local = false; + full = (end == TLB_FLUSH_ALL || + nr_pages > tlb_single_page_flush_ceiling); + } else { +is_local: + local = true; + full = (end == TLB_FLUSH_ALL || + nr_pages > tlb_local_single_page_flush_ceiling); + } + + if (full) { + if (local) { + _tlbiel_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB); + } else { + if (mm_needs_flush_escalation(mm)) + also_pwc = true; + + _tlbie_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB); + } + } else { + if (local) + _tlbiel_va_range(start, end, pid, page_size, psize, also_pwc); + else + _tlbie_va_range(start, end, pid, page_size, psize, also_pwc); + } + preempt_enable(); +} + +void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start, + unsigned long end, int psize) +{ + return __radix__flush_tlb_range_psize(mm, start, end, psize, false); +} + +static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start, + unsigned long end, int psize) +{ + __radix__flush_tlb_range_psize(mm, start, end, psize, true); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr) +{ + unsigned long pid, end; + + pid = mm->context.id; + if (unlikely(pid == MMU_NO_CONTEXT)) + return; + + /* 4k page size, just blow the world */ + if (PAGE_SIZE == 0x1000) { + radix__flush_all_mm(mm); + return; + } + + end = addr + HPAGE_PMD_SIZE; + + /* Otherwise first do the PWC, then iterate the pages. 
*/ + preempt_disable(); + smp_mb(); /* see radix__flush_tlb_mm */ + if (!mm_is_thread_local(mm)) { + if (unlikely(mm_is_singlethreaded(mm))) { + exit_flush_lazy_tlbs(mm); + goto local; + } + _tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true); + } else { +local: + _tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true); + } + + preempt_enable(); +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +void radix__flush_pmd_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + radix__flush_tlb_range_psize(vma->vm_mm, start, end, MMU_PAGE_2M); +} +EXPORT_SYMBOL(radix__flush_pmd_tlb_range); + +void radix__flush_tlb_all(void) +{ + unsigned long rb,prs,r,rs; + unsigned long ric = RIC_FLUSH_ALL; + + rb = 0x3 << PPC_BITLSHIFT(53); /* IS = 3 */ + prs = 0; /* partition scoped */ + r = 1; /* radix format */ + rs = 1 & ((1UL << 32) - 1); /* any LPID value to flush guest mappings */ + + asm volatile("ptesync": : :"memory"); + /* + * now flush guest entries by passing PRS = 1 and LPID != 0 + */ + asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) + : : "r"(rb), "i"(r), "i"(1), "i"(ric), "r"(rs) : "memory"); + /* + * now flush host entires by passing PRS = 0 and LPID == 0 + */ + asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(0) : "memory"); + asm volatile("eieio; tlbsync; ptesync": : :"memory"); +} + +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE +extern void radix_kvm_prefetch_workaround(struct mm_struct *mm) +{ + unsigned long pid = mm->context.id; + + if (unlikely(pid == MMU_NO_CONTEXT)) + return; + + /* + * If this context hasn't run on that CPU before and KVM is + * around, there's a slim chance that the guest on another + * CPU just brought in obsolete translation into the TLB of + * this CPU due to a bad prefetch using the guest PID on + * the way into the hypervisor. + * + * We work around this here. If KVM is possible, we check if + * any sibling thread is in KVM. If it is, the window may exist + * and thus we flush that PID from the core. + * + * A potential future improvement would be to mark which PIDs + * have never been used on the system and avoid it if the PID + * is new and the process has no other cpumask bit set. + */ + if (cpu_has_feature(CPU_FTR_HVMODE) && radix_enabled()) { + int cpu = smp_processor_id(); + int sib = cpu_first_thread_sibling(cpu); + bool flush = false; + + for (; sib <= cpu_last_thread_sibling(cpu) && !flush; sib++) { + if (sib == cpu) + continue; + if (!cpu_possible(sib)) + continue; + if (paca_ptrs[sib]->kvm_hstate.kvm_vcpu) + flush = true; + } + if (flush) + _tlbiel_pid(pid, RIC_FLUSH_ALL); + } +} +EXPORT_SYMBOL_GPL(radix_kvm_prefetch_workaround); +#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ diff --git a/arch/powerpc/mm/book3s64/slb.c b/arch/powerpc/mm/book3s64/slb.c new file mode 100644 index 000000000000..c22742218bd3 --- /dev/null +++ b/arch/powerpc/mm/book3s64/slb.c @@ -0,0 +1,833 @@ +/* + * PowerPC64 SLB support. + * + * Copyright (C) 2004 David Gibson , IBM + * Based on earlier code written by: + * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com + * Copyright (c) 2001 Dave Engebretsen + * Copyright (C) 2002 Anton Blanchard , IBM + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +enum slb_index { + LINEAR_INDEX = 0, /* Kernel linear map (0xc000000000000000) */ + KSTACK_INDEX = 1, /* Kernel stack map */ +}; + +static long slb_allocate_user(struct mm_struct *mm, unsigned long ea); + +#define slb_esid_mask(ssize) \ + (((ssize) == MMU_SEGSIZE_256M)? ESID_MASK: ESID_MASK_1T) + +static inline unsigned long mk_esid_data(unsigned long ea, int ssize, + enum slb_index index) +{ + return (ea & slb_esid_mask(ssize)) | SLB_ESID_V | index; +} + +static inline unsigned long __mk_vsid_data(unsigned long vsid, int ssize, + unsigned long flags) +{ + return (vsid << slb_vsid_shift(ssize)) | flags | + ((unsigned long) ssize << SLB_VSID_SSIZE_SHIFT); +} + +static inline unsigned long mk_vsid_data(unsigned long ea, int ssize, + unsigned long flags) +{ + return __mk_vsid_data(get_kernel_vsid(ea, ssize), ssize, flags); +} + +static void assert_slb_presence(bool present, unsigned long ea) +{ +#ifdef CONFIG_DEBUG_VM + unsigned long tmp; + + WARN_ON_ONCE(mfmsr() & MSR_EE); + + if (!cpu_has_feature(CPU_FTR_ARCH_206)) + return; + + /* + * slbfee. requires bit 24 (PPC bit 39) be clear in RB. Hardware + * ignores all other bits from 0-27, so just clear them all. + */ + ea &= ~((1UL << 28) - 1); + asm volatile(__PPC_SLBFEE_DOT(%0, %1) : "=r"(tmp) : "r"(ea) : "cr0"); + + WARN_ON(present == (tmp == 0)); +#endif +} + +static inline void slb_shadow_update(unsigned long ea, int ssize, + unsigned long flags, + enum slb_index index) +{ + struct slb_shadow *p = get_slb_shadow(); + + /* + * Clear the ESID first so the entry is not valid while we are + * updating it. No write barriers are needed here, provided + * we only update the current CPU's SLB shadow buffer. + */ + WRITE_ONCE(p->save_area[index].esid, 0); + WRITE_ONCE(p->save_area[index].vsid, cpu_to_be64(mk_vsid_data(ea, ssize, flags))); + WRITE_ONCE(p->save_area[index].esid, cpu_to_be64(mk_esid_data(ea, ssize, index))); +} + +static inline void slb_shadow_clear(enum slb_index index) +{ + WRITE_ONCE(get_slb_shadow()->save_area[index].esid, cpu_to_be64(index)); +} + +static inline void create_shadowed_slbe(unsigned long ea, int ssize, + unsigned long flags, + enum slb_index index) +{ + /* + * Updating the shadow buffer before writing the SLB ensures + * we don't get a stale entry here if we get preempted by PHYP + * between these two statements. + */ + slb_shadow_update(ea, ssize, flags, index); + + assert_slb_presence(false, ea); + asm volatile("slbmte %0,%1" : + : "r" (mk_vsid_data(ea, ssize, flags)), + "r" (mk_esid_data(ea, ssize, index)) + : "memory" ); +} + +/* + * Insert bolted entries into SLB (which may not be empty, so don't clear + * slb_cache_ptr). + */ +void __slb_restore_bolted_realmode(void) +{ + struct slb_shadow *p = get_slb_shadow(); + enum slb_index index; + + /* No isync needed because realmode. */ + for (index = 0; index < SLB_NUM_BOLTED; index++) { + asm volatile("slbmte %0,%1" : + : "r" (be64_to_cpu(p->save_area[index].vsid)), + "r" (be64_to_cpu(p->save_area[index].esid))); + } + + assert_slb_presence(true, local_paca->kstack); +} + +/* + * Insert the bolted entries into an empty SLB. 
+ */ +void slb_restore_bolted_realmode(void) +{ + __slb_restore_bolted_realmode(); + get_paca()->slb_cache_ptr = 0; + + get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1; + get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap; +} + +/* + * This flushes all SLB entries including 0, so it must be realmode. + */ +void slb_flush_all_realmode(void) +{ + asm volatile("slbmte %0,%0; slbia" : : "r" (0)); +} + +/* + * This flushes non-bolted entries, it can be run in virtual mode. Must + * be called with interrupts disabled. + */ +void slb_flush_and_restore_bolted(void) +{ + struct slb_shadow *p = get_slb_shadow(); + + BUILD_BUG_ON(SLB_NUM_BOLTED != 2); + + WARN_ON(!irqs_disabled()); + + /* + * We can't take a PMU exception in the following code, so hard + * disable interrupts. + */ + hard_irq_disable(); + + asm volatile("isync\n" + "slbia\n" + "slbmte %0, %1\n" + "isync\n" + :: "r" (be64_to_cpu(p->save_area[KSTACK_INDEX].vsid)), + "r" (be64_to_cpu(p->save_area[KSTACK_INDEX].esid)) + : "memory"); + assert_slb_presence(true, get_paca()->kstack); + + get_paca()->slb_cache_ptr = 0; + + get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1; + get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap; +} + +void slb_save_contents(struct slb_entry *slb_ptr) +{ + int i; + unsigned long e, v; + + /* Save slb_cache_ptr value. */ + get_paca()->slb_save_cache_ptr = get_paca()->slb_cache_ptr; + + if (!slb_ptr) + return; + + for (i = 0; i < mmu_slb_size; i++) { + asm volatile("slbmfee %0,%1" : "=r" (e) : "r" (i)); + asm volatile("slbmfev %0,%1" : "=r" (v) : "r" (i)); + slb_ptr->esid = e; + slb_ptr->vsid = v; + slb_ptr++; + } +} + +void slb_dump_contents(struct slb_entry *slb_ptr) +{ + int i, n; + unsigned long e, v; + unsigned long llp; + + if (!slb_ptr) + return; + + pr_err("SLB contents of cpu 0x%x\n", smp_processor_id()); + pr_err("Last SLB entry inserted at slot %d\n", get_paca()->stab_rr); + + for (i = 0; i < mmu_slb_size; i++) { + e = slb_ptr->esid; + v = slb_ptr->vsid; + slb_ptr++; + + if (!e && !v) + continue; + + pr_err("%02d %016lx %016lx\n", i, e, v); + + if (!(e & SLB_ESID_V)) { + pr_err("\n"); + continue; + } + llp = v & SLB_VSID_LLP; + if (v & SLB_VSID_B_1T) { + pr_err(" 1T ESID=%9lx VSID=%13lx LLP:%3lx\n", + GET_ESID_1T(e), + (v & ~SLB_VSID_B) >> SLB_VSID_SHIFT_1T, llp); + } else { + pr_err(" 256M ESID=%9lx VSID=%13lx LLP:%3lx\n", + GET_ESID(e), + (v & ~SLB_VSID_B) >> SLB_VSID_SHIFT, llp); + } + } + pr_err("----------------------------------\n"); + + /* Dump slb cache entires as well. */ + pr_err("SLB cache ptr value = %d\n", get_paca()->slb_save_cache_ptr); + pr_err("Valid SLB cache entries:\n"); + n = min_t(int, get_paca()->slb_save_cache_ptr, SLB_CACHE_ENTRIES); + for (i = 0; i < n; i++) + pr_err("%02d EA[0-35]=%9x\n", i, get_paca()->slb_cache[i]); + pr_err("Rest of SLB cache entries:\n"); + for (i = n; i < SLB_CACHE_ENTRIES; i++) + pr_err("%02d EA[0-35]=%9x\n", i, get_paca()->slb_cache[i]); +} + +void slb_vmalloc_update(void) +{ + /* + * vmalloc is not bolted, so just have to flush non-bolted. 
+ */ + slb_flush_and_restore_bolted(); +} + +static bool preload_hit(struct thread_info *ti, unsigned long esid) +{ + unsigned char i; + + for (i = 0; i < ti->slb_preload_nr; i++) { + unsigned char idx; + + idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR; + if (esid == ti->slb_preload_esid[idx]) + return true; + } + return false; +} + +static bool preload_add(struct thread_info *ti, unsigned long ea) +{ + unsigned char idx; + unsigned long esid; + + if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) { + /* EAs are stored >> 28 so 256MB segments don't need clearing */ + if (ea & ESID_MASK_1T) + ea &= ESID_MASK_1T; + } + + esid = ea >> SID_SHIFT; + + if (preload_hit(ti, esid)) + return false; + + idx = (ti->slb_preload_tail + ti->slb_preload_nr) % SLB_PRELOAD_NR; + ti->slb_preload_esid[idx] = esid; + if (ti->slb_preload_nr == SLB_PRELOAD_NR) + ti->slb_preload_tail = (ti->slb_preload_tail + 1) % SLB_PRELOAD_NR; + else + ti->slb_preload_nr++; + + return true; +} + +static void preload_age(struct thread_info *ti) +{ + if (!ti->slb_preload_nr) + return; + ti->slb_preload_nr--; + ti->slb_preload_tail = (ti->slb_preload_tail + 1) % SLB_PRELOAD_NR; +} + +void slb_setup_new_exec(void) +{ + struct thread_info *ti = current_thread_info(); + struct mm_struct *mm = current->mm; + unsigned long exec = 0x10000000; + + WARN_ON(irqs_disabled()); + + /* + * preload cache can only be used to determine whether a SLB + * entry exists if it does not start to overflow. + */ + if (ti->slb_preload_nr + 2 > SLB_PRELOAD_NR) + return; + + hard_irq_disable(); + + /* + * We have no good place to clear the slb preload cache on exec, + * flush_thread is about the earliest arch hook but that happens + * after we switch to the mm and have aleady preloaded the SLBEs. + * + * For the most part that's probably okay to use entries from the + * previous exec, they will age out if unused. It may turn out to + * be an advantage to clear the cache before switching to it, + * however. + */ + + /* + * preload some userspace segments into the SLB. + * Almost all 32 and 64bit PowerPC executables are linked at + * 0x10000000 so it makes sense to preload this segment. + */ + if (!is_kernel_addr(exec)) { + if (preload_add(ti, exec)) + slb_allocate_user(mm, exec); + } + + /* Libraries and mmaps. */ + if (!is_kernel_addr(mm->mmap_base)) { + if (preload_add(ti, mm->mmap_base)) + slb_allocate_user(mm, mm->mmap_base); + } + + /* see switch_slb */ + asm volatile("isync" : : : "memory"); + + local_irq_enable(); +} + +void preload_new_slb_context(unsigned long start, unsigned long sp) +{ + struct thread_info *ti = current_thread_info(); + struct mm_struct *mm = current->mm; + unsigned long heap = mm->start_brk; + + WARN_ON(irqs_disabled()); + + /* see above */ + if (ti->slb_preload_nr + 3 > SLB_PRELOAD_NR) + return; + + hard_irq_disable(); + + /* Userspace entry address. */ + if (!is_kernel_addr(start)) { + if (preload_add(ti, start)) + slb_allocate_user(mm, start); + } + + /* Top of stack, grows down. */ + if (!is_kernel_addr(sp)) { + if (preload_add(ti, sp)) + slb_allocate_user(mm, sp); + } + + /* Bottom of heap, grows up. */ + if (heap && !is_kernel_addr(heap)) { + if (preload_add(ti, heap)) + slb_allocate_user(mm, heap); + } + + /* see switch_slb */ + asm volatile("isync" : : : "memory"); + + local_irq_enable(); +} + + +/* Flush all user entries from the segment table of the current processor. 
*/ +void switch_slb(struct task_struct *tsk, struct mm_struct *mm) +{ + struct thread_info *ti = task_thread_info(tsk); + unsigned char i; + + /* + * We need interrupts hard-disabled here, not just soft-disabled, + * so that a PMU interrupt can't occur, which might try to access + * user memory (to get a stack trace) and possible cause an SLB miss + * which would update the slb_cache/slb_cache_ptr fields in the PACA. + */ + hard_irq_disable(); + asm volatile("isync" : : : "memory"); + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + /* + * SLBIA IH=3 invalidates all Class=1 SLBEs and their + * associated lookaside structures, which matches what + * switch_slb wants. So ARCH_300 does not use the slb + * cache. + */ + asm volatile(PPC_SLBIA(3)); + } else { + unsigned long offset = get_paca()->slb_cache_ptr; + + if (!mmu_has_feature(MMU_FTR_NO_SLBIE_B) && + offset <= SLB_CACHE_ENTRIES) { + unsigned long slbie_data = 0; + + for (i = 0; i < offset; i++) { + unsigned long ea; + + ea = (unsigned long) + get_paca()->slb_cache[i] << SID_SHIFT; + /* + * Could assert_slb_presence(true) here, but + * hypervisor or machine check could have come + * in and removed the entry at this point. + */ + + slbie_data = ea; + slbie_data |= user_segment_size(slbie_data) + << SLBIE_SSIZE_SHIFT; + slbie_data |= SLBIE_C; /* user slbs have C=1 */ + asm volatile("slbie %0" : : "r" (slbie_data)); + } + + /* Workaround POWER5 < DD2.1 issue */ + if (!cpu_has_feature(CPU_FTR_ARCH_207S) && offset == 1) + asm volatile("slbie %0" : : "r" (slbie_data)); + + } else { + struct slb_shadow *p = get_slb_shadow(); + unsigned long ksp_esid_data = + be64_to_cpu(p->save_area[KSTACK_INDEX].esid); + unsigned long ksp_vsid_data = + be64_to_cpu(p->save_area[KSTACK_INDEX].vsid); + + asm volatile(PPC_SLBIA(1) "\n" + "slbmte %0,%1\n" + "isync" + :: "r"(ksp_vsid_data), + "r"(ksp_esid_data)); + + get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1; + } + + get_paca()->slb_cache_ptr = 0; + } + get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap; + + copy_mm_to_paca(mm); + + /* + * We gradually age out SLBs after a number of context switches to + * reduce reload overhead of unused entries (like we do with FP/VEC + * reload). Each time we wrap 256 switches, take an entry out of the + * SLB preload cache. + */ + tsk->thread.load_slb++; + if (!tsk->thread.load_slb) { + unsigned long pc = KSTK_EIP(tsk); + + preload_age(ti); + preload_add(ti, pc); + } + + for (i = 0; i < ti->slb_preload_nr; i++) { + unsigned char idx; + unsigned long ea; + + idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR; + ea = (unsigned long)ti->slb_preload_esid[idx] << SID_SHIFT; + + slb_allocate_user(mm, ea); + } + + /* + * Synchronize slbmte preloads with possible subsequent user memory + * address accesses by the kernel (user mode won't happen until + * rfid, which is safe). 
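The loop above that repopulates the SLB walks thread_info's slb_preload_esid[] as a small FIFO indexed modulo SLB_PRELOAD_NR, the same arithmetic preload_add(), preload_hit() and preload_age() use to fill and age it. A user-space sketch of just that ring arithmetic, with an assumed 16-entry size and simplified types rather than the kernel's thread_info fields:

/*
 * Sketch of the preload-cache indexing: a fixed-size FIFO of ESIDs
 * addressed as (tail + i) % N. The size and names are assumptions
 * local to this example.
 */
#include <stdbool.h>
#include <stdio.h>

#define PRELOAD_NR 16			/* assumed SLB_PRELOAD_NR */

struct preload {
	unsigned char tail;
	unsigned char nr;
	unsigned long esid[PRELOAD_NR];
};

static bool preload_contains(struct preload *p, unsigned long esid)
{
	for (unsigned char i = 0; i < p->nr; i++)
		if (p->esid[(p->tail + i) % PRELOAD_NR] == esid)
			return true;
	return false;
}

static void preload_push(struct preload *p, unsigned long esid)
{
	if (preload_contains(p, esid))
		return;
	p->esid[(p->tail + p->nr) % PRELOAD_NR] = esid;
	if (p->nr == PRELOAD_NR)	/* full: overwrite the oldest entry */
		p->tail = (p->tail + 1) % PRELOAD_NR;
	else
		p->nr++;
}

int main(void)
{
	struct preload p = { 0 };

	for (unsigned long e = 0; e < 20; e++)
		preload_push(&p, e);
	/* prints: entries=16 oldest=4 */
	printf("entries=%u oldest=%lu\n", p.nr, p.esid[p.tail]);
	return 0;
}

The aging in switch_slb() corresponds to preload_age() advancing the tail once every 256 context switches, so stale segments eventually drop out without any explicit invalidation.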
+ */ + asm volatile("isync" : : : "memory"); +} + +void slb_set_size(u16 size) +{ + mmu_slb_size = size; +} + +void slb_initialize(void) +{ + unsigned long linear_llp, vmalloc_llp, io_llp; + unsigned long lflags; + static int slb_encoding_inited; +#ifdef CONFIG_SPARSEMEM_VMEMMAP + unsigned long vmemmap_llp; +#endif + + /* Prepare our SLB miss handler based on our page size */ + linear_llp = mmu_psize_defs[mmu_linear_psize].sllp; + io_llp = mmu_psize_defs[mmu_io_psize].sllp; + vmalloc_llp = mmu_psize_defs[mmu_vmalloc_psize].sllp; + get_paca()->vmalloc_sllp = SLB_VSID_KERNEL | vmalloc_llp; +#ifdef CONFIG_SPARSEMEM_VMEMMAP + vmemmap_llp = mmu_psize_defs[mmu_vmemmap_psize].sllp; +#endif + if (!slb_encoding_inited) { + slb_encoding_inited = 1; + pr_devel("SLB: linear LLP = %04lx\n", linear_llp); + pr_devel("SLB: io LLP = %04lx\n", io_llp); +#ifdef CONFIG_SPARSEMEM_VMEMMAP + pr_devel("SLB: vmemmap LLP = %04lx\n", vmemmap_llp); +#endif + } + + get_paca()->stab_rr = SLB_NUM_BOLTED - 1; + get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1; + get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap; + + lflags = SLB_VSID_KERNEL | linear_llp; + + /* Invalidate the entire SLB (even entry 0) & all the ERATS */ + asm volatile("isync":::"memory"); + asm volatile("slbmte %0,%0"::"r" (0) : "memory"); + asm volatile("isync; slbia; isync":::"memory"); + create_shadowed_slbe(PAGE_OFFSET, mmu_kernel_ssize, lflags, LINEAR_INDEX); + + /* + * For the boot cpu, we're running on the stack in init_thread_union, + * which is in the first segment of the linear mapping, and also + * get_paca()->kstack hasn't been initialized yet. + * For secondary cpus, we need to bolt the kernel stack entry now. + */ + slb_shadow_clear(KSTACK_INDEX); + if (raw_smp_processor_id() != boot_cpuid && + (get_paca()->kstack & slb_esid_mask(mmu_kernel_ssize)) > PAGE_OFFSET) + create_shadowed_slbe(get_paca()->kstack, + mmu_kernel_ssize, lflags, KSTACK_INDEX); + + asm volatile("isync":::"memory"); +} + +static void slb_cache_update(unsigned long esid_data) +{ + int slb_cache_index; + + if (cpu_has_feature(CPU_FTR_ARCH_300)) + return; /* ISAv3.0B and later does not use slb_cache */ + + /* + * Now update slb cache entries + */ + slb_cache_index = local_paca->slb_cache_ptr; + if (slb_cache_index < SLB_CACHE_ENTRIES) { + /* + * We have space in slb cache for optimized switch_slb(). + * Top 36 bits from esid_data as per ISA + */ + local_paca->slb_cache[slb_cache_index++] = esid_data >> 28; + local_paca->slb_cache_ptr++; + } else { + /* + * Our cache is full and the current cache content strictly + * doesn't indicate the active SLB conents. Bump the ptr + * so that switch_slb() will ignore the cache. + */ + local_paca->slb_cache_ptr = SLB_CACHE_ENTRIES + 1; + } +} + +static enum slb_index alloc_slb_index(bool kernel) +{ + enum slb_index index; + + /* + * The allocation bitmaps can become out of synch with the SLB + * when the _switch code does slbie when bolting a new stack + * segment and it must not be anywhere else in the SLB. This leaves + * a kernel allocated entry that is unused in the SLB. With very + * large systems or small segment sizes, the bitmaps could slowly + * fill with these entries. They will eventually be cleared out + * by the round robin allocator in that case, so it's probably not + * worth accounting for. + */ + + /* + * SLBs beyond 32 entries are allocated with stab_rr only + * POWER7/8/9 have 32 SLB entries, this could be expanded if a + * future CPU has more. 
+ */ + if (local_paca->slb_used_bitmap != U32_MAX) { + index = ffz(local_paca->slb_used_bitmap); + local_paca->slb_used_bitmap |= 1U << index; + if (kernel) + local_paca->slb_kern_bitmap |= 1U << index; + } else { + /* round-robin replacement of slb starting at SLB_NUM_BOLTED. */ + index = local_paca->stab_rr; + if (index < (mmu_slb_size - 1)) + index++; + else + index = SLB_NUM_BOLTED; + local_paca->stab_rr = index; + if (index < 32) { + if (kernel) + local_paca->slb_kern_bitmap |= 1U << index; + else + local_paca->slb_kern_bitmap &= ~(1U << index); + } + } + BUG_ON(index < SLB_NUM_BOLTED); + + return index; +} + +static long slb_insert_entry(unsigned long ea, unsigned long context, + unsigned long flags, int ssize, bool kernel) +{ + unsigned long vsid; + unsigned long vsid_data, esid_data; + enum slb_index index; + + vsid = get_vsid(context, ea, ssize); + if (!vsid) + return -EFAULT; + + /* + * There must not be a kernel SLB fault in alloc_slb_index or before + * slbmte here or the allocation bitmaps could get out of whack with + * the SLB. + * + * User SLB faults or preloads take this path which might get inlined + * into the caller, so add compiler barriers here to ensure unsafe + * memory accesses do not come between. + */ + barrier(); + + index = alloc_slb_index(kernel); + + vsid_data = __mk_vsid_data(vsid, ssize, flags); + esid_data = mk_esid_data(ea, ssize, index); + + /* + * No need for an isync before or after this slbmte. The exception + * we enter with and the rfid we exit with are context synchronizing. + * User preloads should add isync afterwards in case the kernel + * accesses user memory before it returns to userspace with rfid. + */ + assert_slb_presence(false, ea); + asm volatile("slbmte %0, %1" : : "r" (vsid_data), "r" (esid_data)); + + barrier(); + + if (!kernel) + slb_cache_update(esid_data); + + return 0; +} + +static long slb_allocate_kernel(unsigned long ea, unsigned long id) +{ + unsigned long context; + unsigned long flags; + int ssize; + + if (id == LINEAR_MAP_REGION_ID) { + + /* We only support upto MAX_PHYSMEM_BITS */ + if ((ea & EA_MASK) > (1UL << MAX_PHYSMEM_BITS)) + return -EFAULT; + + flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_linear_psize].sllp; + +#ifdef CONFIG_SPARSEMEM_VMEMMAP + } else if (id == VMEMMAP_REGION_ID) { + + if (ea >= H_VMEMMAP_END) + return -EFAULT; + + flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_vmemmap_psize].sllp; +#endif + } else if (id == VMALLOC_REGION_ID) { + + if (ea >= H_VMALLOC_END) + return -EFAULT; + + flags = local_paca->vmalloc_sllp; + + } else if (id == IO_REGION_ID) { + + if (ea >= H_KERN_IO_END) + return -EFAULT; + + flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_io_psize].sllp; + + } else { + return -EFAULT; + } + + ssize = MMU_SEGSIZE_1T; + if (!mmu_has_feature(MMU_FTR_1T_SEGMENT)) + ssize = MMU_SEGSIZE_256M; + + context = get_kernel_context(ea); + + return slb_insert_entry(ea, context, flags, ssize, true); +} + +static long slb_allocate_user(struct mm_struct *mm, unsigned long ea) +{ + unsigned long context; + unsigned long flags; + int bpsize; + int ssize; + + /* + * consider this as bad access if we take a SLB miss + * on an address above addr limit. 
+ */ + if (ea >= mm_ctx_slb_addr_limit(&mm->context)) + return -EFAULT; + + context = get_user_context(&mm->context, ea); + if (!context) + return -EFAULT; + + if (unlikely(ea >= H_PGTABLE_RANGE)) { + WARN_ON(1); + return -EFAULT; + } + + ssize = user_segment_size(ea); + + bpsize = get_slice_psize(mm, ea); + flags = SLB_VSID_USER | mmu_psize_defs[bpsize].sllp; + + return slb_insert_entry(ea, context, flags, ssize, false); +} + +long do_slb_fault(struct pt_regs *regs, unsigned long ea) +{ + unsigned long id = get_region_id(ea); + + /* IRQs are not reconciled here, so can't check irqs_disabled */ + VM_WARN_ON(mfmsr() & MSR_EE); + + if (unlikely(!(regs->msr & MSR_RI))) + return -EINVAL; + + /* + * SLB kernel faults must be very careful not to touch anything + * that is not bolted. E.g., PACA and global variables are okay, + * mm->context stuff is not. + * + * SLB user faults can access all of kernel memory, but must be + * careful not to touch things like IRQ state because it is not + * "reconciled" here. The difficulty is that we must use + * fast_exception_return to return from kernel SLB faults without + * looking at possible non-bolted memory. We could test user vs + * kernel faults in the interrupt handler asm and do a full fault, + * reconcile, ret_from_except for user faults which would make them + * first class kernel code. But for performance it's probably nicer + * if they go via fast_exception_return too. + */ + if (id >= LINEAR_MAP_REGION_ID) { + long err; +#ifdef CONFIG_DEBUG_VM + /* Catch recursive kernel SLB faults. */ + BUG_ON(local_paca->in_kernel_slb_handler); + local_paca->in_kernel_slb_handler = 1; +#endif + err = slb_allocate_kernel(ea, id); +#ifdef CONFIG_DEBUG_VM + local_paca->in_kernel_slb_handler = 0; +#endif + return err; + } else { + struct mm_struct *mm = current->mm; + long err; + + if (unlikely(!mm)) + return -EFAULT; + + err = slb_allocate_user(mm, ea); + if (!err) + preload_add(current_thread_info(), ea); + + return err; + } +} + +void do_bad_slb_fault(struct pt_regs *regs, unsigned long ea, long err) +{ + if (err == -EFAULT) { + if (user_mode(regs)) + _exception(SIGSEGV, regs, SEGV_BNDERR, ea); + else + bad_page_fault(regs, ea, SIGSEGV); + } else if (err == -EINVAL) { + unrecoverable_exception(regs); + } else { + BUG(); + } +} diff --git a/arch/powerpc/mm/book3s64/subpage_prot.c b/arch/powerpc/mm/book3s64/subpage_prot.c new file mode 100644 index 000000000000..473dd430e306 --- /dev/null +++ b/arch/powerpc/mm/book3s64/subpage_prot.c @@ -0,0 +1,289 @@ +/* + * Copyright 2007-2008 Paul Mackerras, IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/* + * Free all pages allocated for subpage protection maps and pointers. + * Also makes sure that the subpage_prot_table structure is + * reinitialized for the next user. 
+ */ +void subpage_prot_free(struct mm_struct *mm) +{ + struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context); + unsigned long i, j, addr; + u32 **p; + + if (!spt) + return; + + for (i = 0; i < 4; ++i) { + if (spt->low_prot[i]) { + free_page((unsigned long)spt->low_prot[i]); + spt->low_prot[i] = NULL; + } + } + addr = 0; + for (i = 0; i < (TASK_SIZE_USER64 >> 43); ++i) { + p = spt->protptrs[i]; + if (!p) + continue; + spt->protptrs[i] = NULL; + for (j = 0; j < SBP_L2_COUNT && addr < spt->maxaddr; + ++j, addr += PAGE_SIZE) + if (p[j]) + free_page((unsigned long)p[j]); + free_page((unsigned long)p); + } + spt->maxaddr = 0; + kfree(spt); +} + +static void hpte_flush_range(struct mm_struct *mm, unsigned long addr, + int npages) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + spinlock_t *ptl; + + pgd = pgd_offset(mm, addr); + if (pgd_none(*pgd)) + return; + pud = pud_offset(pgd, addr); + if (pud_none(*pud)) + return; + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) + return; + pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + arch_enter_lazy_mmu_mode(); + for (; npages > 0; --npages) { + pte_update(mm, addr, pte, 0, 0, 0); + addr += PAGE_SIZE; + ++pte; + } + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(pte - 1, ptl); +} + +/* + * Clear the subpage protection map for an address range, allowing + * all accesses that are allowed by the pte permissions. + */ +static void subpage_prot_clear(unsigned long addr, unsigned long len) +{ + struct mm_struct *mm = current->mm; + struct subpage_prot_table *spt; + u32 **spm, *spp; + unsigned long i; + size_t nw; + unsigned long next, limit; + + down_write(&mm->mmap_sem); + + spt = mm_ctx_subpage_prot(&mm->context); + if (!spt) + goto err_out; + + limit = addr + len; + if (limit > spt->maxaddr) + limit = spt->maxaddr; + for (; addr < limit; addr = next) { + next = pmd_addr_end(addr, limit); + if (addr < 0x100000000UL) { + spm = spt->low_prot; + } else { + spm = spt->protptrs[addr >> SBP_L3_SHIFT]; + if (!spm) + continue; + } + spp = spm[(addr >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1)]; + if (!spp) + continue; + spp += (addr >> PAGE_SHIFT) & (SBP_L1_COUNT - 1); + + i = (addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); + nw = PTRS_PER_PTE - i; + if (addr + (nw << PAGE_SHIFT) > next) + nw = (next - addr) >> PAGE_SHIFT; + + memset(spp, 0, nw * sizeof(u32)); + + /* now flush any existing HPTEs for the range */ + hpte_flush_range(mm, addr, nw); + } + +err_out: + up_write(&mm->mmap_sem); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + struct vm_area_struct *vma = walk->vma; + split_huge_pmd(vma, pmd, addr); + return 0; +} + +static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr, + unsigned long len) +{ + struct vm_area_struct *vma; + struct mm_walk subpage_proto_walk = { + .mm = mm, + .pmd_entry = subpage_walk_pmd_entry, + }; + + /* + * We don't try too hard, we just mark all the vma in that range + * VM_NOHUGEPAGE and split them. 
+ */ + vma = find_vma(mm, addr); + /* + * If the range is in unmapped range, just return + */ + if (vma && ((addr + len) <= vma->vm_start)) + return; + + while (vma) { + if (vma->vm_start >= (addr + len)) + break; + vma->vm_flags |= VM_NOHUGEPAGE; + walk_page_vma(vma, &subpage_proto_walk); + vma = vma->vm_next; + } +} +#else +static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr, + unsigned long len) +{ + return; +} +#endif + +/* + * Copy in a subpage protection map for an address range. + * The map has 2 bits per 4k subpage, so 32 bits per 64k page. + * Each 2-bit field is 0 to allow any access, 1 to prevent writes, + * 2 or 3 to prevent all accesses. + * Note that the normal page protections also apply; the subpage + * protection mechanism is an additional constraint, so putting 0 + * in a 2-bit field won't allow writes to a page that is otherwise + * write-protected. + */ +SYSCALL_DEFINE3(subpage_prot, unsigned long, addr, + unsigned long, len, u32 __user *, map) +{ + struct mm_struct *mm = current->mm; + struct subpage_prot_table *spt; + u32 **spm, *spp; + unsigned long i; + size_t nw; + unsigned long next, limit; + int err; + + if (radix_enabled()) + return -ENOENT; + + /* Check parameters */ + if ((addr & ~PAGE_MASK) || (len & ~PAGE_MASK) || + addr >= mm->task_size || len >= mm->task_size || + addr + len > mm->task_size) + return -EINVAL; + + if (is_hugepage_only_range(mm, addr, len)) + return -EINVAL; + + if (!map) { + /* Clear out the protection map for the address range */ + subpage_prot_clear(addr, len); + return 0; + } + + if (!access_ok(map, (len >> PAGE_SHIFT) * sizeof(u32))) + return -EFAULT; + + down_write(&mm->mmap_sem); + + spt = mm_ctx_subpage_prot(&mm->context); + if (!spt) { + /* + * Allocate subpage prot table if not already done. 
+ * Do this with mmap_sem held + */ + spt = kzalloc(sizeof(struct subpage_prot_table), GFP_KERNEL); + if (!spt) { + err = -ENOMEM; + goto out; + } + mm->context.hash_context->spt = spt; + } + + subpage_mark_vma_nohuge(mm, addr, len); + for (limit = addr + len; addr < limit; addr = next) { + next = pmd_addr_end(addr, limit); + err = -ENOMEM; + if (addr < 0x100000000UL) { + spm = spt->low_prot; + } else { + spm = spt->protptrs[addr >> SBP_L3_SHIFT]; + if (!spm) { + spm = (u32 **)get_zeroed_page(GFP_KERNEL); + if (!spm) + goto out; + spt->protptrs[addr >> SBP_L3_SHIFT] = spm; + } + } + spm += (addr >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1); + spp = *spm; + if (!spp) { + spp = (u32 *)get_zeroed_page(GFP_KERNEL); + if (!spp) + goto out; + *spm = spp; + } + spp += (addr >> PAGE_SHIFT) & (SBP_L1_COUNT - 1); + + local_irq_disable(); + demote_segment_4k(mm, addr); + local_irq_enable(); + + i = (addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); + nw = PTRS_PER_PTE - i; + if (addr + (nw << PAGE_SHIFT) > next) + nw = (next - addr) >> PAGE_SHIFT; + + up_write(&mm->mmap_sem); + if (__copy_from_user(spp, map, nw * sizeof(u32))) + return -EFAULT; + map += nw; + down_write(&mm->mmap_sem); + + /* now flush any existing HPTEs for the range */ + hpte_flush_range(mm, addr, nw); + } + if (limit > spt->maxaddr) + spt->maxaddr = limit; + err = 0; + out: + up_write(&mm->mmap_sem); + return err; +} diff --git a/arch/powerpc/mm/book3s64/vphn.c b/arch/powerpc/mm/book3s64/vphn.c new file mode 100644 index 000000000000..0ee7734afb50 --- /dev/null +++ b/arch/powerpc/mm/book3s64/vphn.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include "vphn.h" + +/* + * The associativity domain numbers are returned from the hypervisor as a + * stream of mixed 16-bit and 32-bit fields. The stream is terminated by the + * special value of "all ones" (aka. 0xffff) and its size may not exceed 48 + * bytes. + * + * --- 16-bit fields --> + * _________________________ + * | 0 | 1 | 2 | 3 | be_packed[0] + * ------+-----+-----+------ + * _________________________ + * | 4 | 5 | 6 | 7 | be_packed[1] + * ------------------------- + * ... + * _________________________ + * | 20 | 21 | 22 | 23 | be_packed[5] + * ------------------------- + * + * Convert to the sequence they would appear in the ibm,associativity property. 
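A user-space sketch of the field-stream decoding that vphn_unpack_associativity() below implements: a 16-bit field with its MSB set carries a 15-bit domain number, a field with the MSB clear is the high half of a 32-bit value completed by the next field, and 0xffff terminates the stream. The endian fix-up, the h_call register layout and the length cell are left out, and all names here are local to the sketch:

/*
 * Sketch only: decode a flat array of 16-bit fields into domain
 * numbers, stopping at the 0xffff terminator.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define FIELD_UNUSED	0xffff
#define FIELD_MSB	0x8000
#define FIELD_MASK	0x7fff

static int unpack(const uint16_t *field, int nfields, uint32_t *out)
{
	int ndoms = 0;
	uint16_t last = 0;
	bool pending_high = false;

	for (int i = 0; i < nfields; i++) {
		uint16_t val = field[i];

		if (pending_high) {
			/* low half completes the previous field */
			out[ndoms++] = (uint32_t)last << 16 | val;
			pending_high = false;
		} else if (val == FIELD_UNUSED) {
			break;			/* list terminator */
		} else if (val & FIELD_MSB) {
			out[ndoms++] = val & FIELD_MASK;
		} else {
			last = val;		/* high half of a 32-bit value */
			pending_high = true;
		}
	}
	return ndoms;
}

int main(void)
{
	const uint16_t stream[] = { 0x8002, 0x0001, 0x2345, 0x8007, 0xffff };
	uint32_t doms[8];
	int n = unpack(stream, 5, doms);

	for (int i = 0; i < n; i++)
		printf("domain %d = 0x%x\n", i, doms[i]);  /* 0x2, 0x12345, 0x7 */
	return 0;
}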
+ */ +int vphn_unpack_associativity(const long *packed, __be32 *unpacked) +{ + __be64 be_packed[VPHN_REGISTER_COUNT]; + int i, nr_assoc_doms = 0; + const __be16 *field = (const __be16 *) be_packed; + u16 last = 0; + bool is_32bit = false; + +#define VPHN_FIELD_UNUSED (0xffff) +#define VPHN_FIELD_MSB (0x8000) +#define VPHN_FIELD_MASK (~VPHN_FIELD_MSB) + + /* Let's fix the values returned by plpar_hcall9() */ + for (i = 0; i < VPHN_REGISTER_COUNT; i++) + be_packed[i] = cpu_to_be64(packed[i]); + + for (i = 1; i < VPHN_ASSOC_BUFSIZE; i++) { + u16 new = be16_to_cpup(field++); + + if (is_32bit) { + /* + * Let's concatenate the 16 bits of this field to the + * 15 lower bits of the previous field + */ + unpacked[++nr_assoc_doms] = + cpu_to_be32(last << 16 | new); + is_32bit = false; + } else if (new == VPHN_FIELD_UNUSED) + /* This is the list terminator */ + break; + else if (new & VPHN_FIELD_MSB) { + /* Data is in the lower 15 bits of this field */ + unpacked[++nr_assoc_doms] = + cpu_to_be32(new & VPHN_FIELD_MASK); + } else { + /* + * Data is in the lower 15 bits of this field + * concatenated with the next 16 bit field + */ + last = new; + is_32bit = true; + } + } + + /* The first cell contains the length of the property */ + unpacked[0] = cpu_to_be32(nr_assoc_doms); + + return nr_assoc_doms; +} diff --git a/arch/powerpc/mm/book3s64/vphn.h b/arch/powerpc/mm/book3s64/vphn.h new file mode 100644 index 000000000000..f0b93c2dd578 --- /dev/null +++ b/arch/powerpc/mm/book3s64/vphn.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ARCH_POWERPC_MM_VPHN_H_ +#define _ARCH_POWERPC_MM_VPHN_H_ + +/* The H_HOME_NODE_ASSOCIATIVITY h_call returns 6 64-bit registers. */ +#define VPHN_REGISTER_COUNT 6 + +/* + * 6 64-bit registers unpacked into up to 24 be32 associativity values. To + * form the complete property we have to add the length in the first cell. + */ +#define VPHN_ASSOC_BUFSIZE (VPHN_REGISTER_COUNT*sizeof(u64)/sizeof(u16) + 1) + +extern int vphn_unpack_associativity(const long *packed, __be32 *unpacked); + +#endif diff --git a/arch/powerpc/mm/hash64_4k.c b/arch/powerpc/mm/hash64_4k.c deleted file mode 100644 index 6fa6765a10eb..000000000000 --- a/arch/powerpc/mm/hash64_4k.c +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright IBM Corporation, 2015 - * Author Aneesh Kumar K.V - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2 of the GNU Lesser General Public License - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
- * - */ - -#include -#include -#include - -int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, - pte_t *ptep, unsigned long trap, unsigned long flags, - int ssize, int subpg_prot) -{ - real_pte_t rpte; - unsigned long hpte_group; - unsigned long rflags, pa; - unsigned long old_pte, new_pte; - unsigned long vpn, hash, slot; - unsigned long shift = mmu_psize_defs[MMU_PAGE_4K].shift; - - /* - * atomically mark the linux large page PTE busy and dirty - */ - do { - pte_t pte = READ_ONCE(*ptep); - - old_pte = pte_val(pte); - /* If PTE busy, retry the access */ - if (unlikely(old_pte & H_PAGE_BUSY)) - return 0; - /* If PTE permissions don't match, take page fault */ - if (unlikely(!check_pte_access(access, old_pte))) - return 1; - /* - * Try to lock the PTE, add ACCESSED and DIRTY if it was - * a write access. Since this is 4K insert of 64K page size - * also add H_PAGE_COMBO - */ - new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED; - if (access & _PAGE_WRITE) - new_pte |= _PAGE_DIRTY; - } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); - - /* - * PP bits. _PAGE_USER is already PP bit 0x2, so we only - * need to add in 0x1 if it's a read-only user page - */ - rflags = htab_convert_pte_flags(new_pte); - rpte = __real_pte(__pte(old_pte), ptep, PTRS_PER_PTE); - - if (cpu_has_feature(CPU_FTR_NOEXECUTE) && - !cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) - rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); - - vpn = hpt_vpn(ea, vsid, ssize); - if (unlikely(old_pte & H_PAGE_HASHPTE)) { - /* - * There MIGHT be an HPTE for this pte - */ - unsigned long gslot = pte_get_hash_gslot(vpn, shift, ssize, - rpte, 0); - - if (mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, MMU_PAGE_4K, - MMU_PAGE_4K, ssize, flags) == -1) - old_pte &= ~_PAGE_HPTEFLAGS; - } - - if (likely(!(old_pte & H_PAGE_HASHPTE))) { - - pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; - hash = hpt_hash(vpn, shift, ssize); - -repeat: - hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; - - /* Insert into the hash table, primary slot */ - slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0, - MMU_PAGE_4K, MMU_PAGE_4K, ssize); - /* - * Primary is full, try the secondary - */ - if (unlikely(slot == -1)) { - hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP; - slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, - rflags, - HPTE_V_SECONDARY, - MMU_PAGE_4K, - MMU_PAGE_4K, ssize); - if (slot == -1) { - if (mftb() & 0x1) - hpte_group = (hash & htab_hash_mask) * - HPTES_PER_GROUP; - mmu_hash_ops.hpte_remove(hpte_group); - /* - * FIXME!! Should be try the group from which we removed ? - */ - goto repeat; - } - } - /* - * Hypervisor failure. 
Restore old pte and return -1 - * similar to __hash_page_* - */ - if (unlikely(slot == -2)) { - *ptep = __pte(old_pte); - hash_failure_debug(ea, access, vsid, trap, ssize, - MMU_PAGE_4K, MMU_PAGE_4K, old_pte); - return -1; - } - new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE; - new_pte |= pte_set_hidx(ptep, rpte, 0, slot, PTRS_PER_PTE); - } - *ptep = __pte(new_pte & ~H_PAGE_BUSY); - return 0; -} diff --git a/arch/powerpc/mm/hash64_64k.c b/arch/powerpc/mm/hash64_64k.c deleted file mode 100644 index 3afa253d7f52..000000000000 --- a/arch/powerpc/mm/hash64_64k.c +++ /dev/null @@ -1,333 +0,0 @@ -/* - * Copyright IBM Corporation, 2015 - * Author Aneesh Kumar K.V - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2 of the GNU Lesser General Public License - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - * - */ - -#include -#include -#include - -/* - * Return true, if the entry has a slot value which - * the software considers as invalid. - */ -static inline bool hpte_soft_invalid(unsigned long hidx) -{ - return ((hidx & 0xfUL) == 0xfUL); -} - -/* - * index from 0 - 15 - */ -bool __rpte_sub_valid(real_pte_t rpte, unsigned long index) -{ - return !(hpte_soft_invalid(__rpte_to_hidx(rpte, index))); -} - -int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, - pte_t *ptep, unsigned long trap, unsigned long flags, - int ssize, int subpg_prot) -{ - real_pte_t rpte; - unsigned long hpte_group; - unsigned int subpg_index; - unsigned long rflags, pa; - unsigned long old_pte, new_pte, subpg_pte; - unsigned long vpn, hash, slot, gslot; - unsigned long shift = mmu_psize_defs[MMU_PAGE_4K].shift; - - /* - * atomically mark the linux large page PTE busy and dirty - */ - do { - pte_t pte = READ_ONCE(*ptep); - - old_pte = pte_val(pte); - /* If PTE busy, retry the access */ - if (unlikely(old_pte & H_PAGE_BUSY)) - return 0; - /* If PTE permissions don't match, take page fault */ - if (unlikely(!check_pte_access(access, old_pte))) - return 1; - /* - * Try to lock the PTE, add ACCESSED and DIRTY if it was - * a write access. Since this is 4K insert of 64K page size - * also add H_PAGE_COMBO - */ - new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED | H_PAGE_COMBO; - if (access & _PAGE_WRITE) - new_pte |= _PAGE_DIRTY; - } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); - - /* - * Handle the subpage protection bits - */ - subpg_pte = new_pte & ~subpg_prot; - rflags = htab_convert_pte_flags(subpg_pte); - - if (cpu_has_feature(CPU_FTR_NOEXECUTE) && - !cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) { - - /* - * No CPU has hugepages but lacks no execute, so we - * don't need to worry about that case - */ - rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); - } - - subpg_index = (ea & (PAGE_SIZE - 1)) >> shift; - vpn = hpt_vpn(ea, vsid, ssize); - rpte = __real_pte(__pte(old_pte), ptep, PTRS_PER_PTE); - /* - *None of the sub 4k page is hashed - */ - if (!(old_pte & H_PAGE_HASHPTE)) - goto htab_insert_hpte; - /* - * Check if the pte was already inserted into the hash table - * as a 64k HW page, and invalidate the 64k HPTE if so. - */ - if (!(old_pte & H_PAGE_COMBO)) { - flush_hash_page(vpn, rpte, MMU_PAGE_64K, ssize, flags); - /* - * clear the old slot details from the old and new pte. 
- * On hash insert failure we use old pte value and we don't - * want slot information there if we have a insert failure. - */ - old_pte &= ~H_PAGE_HASHPTE; - new_pte &= ~H_PAGE_HASHPTE; - goto htab_insert_hpte; - } - /* - * Check for sub page valid and update - */ - if (__rpte_sub_valid(rpte, subpg_index)) { - int ret; - - gslot = pte_get_hash_gslot(vpn, shift, ssize, rpte, - subpg_index); - ret = mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, - MMU_PAGE_4K, MMU_PAGE_4K, - ssize, flags); - - /* - * If we failed because typically the HPTE wasn't really here - * we try an insertion. - */ - if (ret == -1) - goto htab_insert_hpte; - - *ptep = __pte(new_pte & ~H_PAGE_BUSY); - return 0; - } - -htab_insert_hpte: - - /* - * Initialize all hidx entries to invalid value, the first time - * the PTE is about to allocate a 4K HPTE. - */ - if (!(old_pte & H_PAGE_COMBO)) - rpte.hidx = INVALID_RPTE_HIDX; - - /* - * handle H_PAGE_4K_PFN case - */ - if (old_pte & H_PAGE_4K_PFN) { - /* - * All the sub 4k page have the same - * physical address. - */ - pa = pte_pfn(__pte(old_pte)) << HW_PAGE_SHIFT; - } else { - pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; - pa += (subpg_index << shift); - } - hash = hpt_hash(vpn, shift, ssize); -repeat: - hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; - - /* Insert into the hash table, primary slot */ - slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0, - MMU_PAGE_4K, MMU_PAGE_4K, ssize); - /* - * Primary is full, try the secondary - */ - if (unlikely(slot == -1)) { - bool soft_invalid; - - hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP; - slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, - rflags, HPTE_V_SECONDARY, - MMU_PAGE_4K, MMU_PAGE_4K, - ssize); - - soft_invalid = hpte_soft_invalid(slot); - if (unlikely(soft_invalid)) { - /* - * We got a valid slot from a hardware point of view. - * but we cannot use it, because we use this special - * value; as defined by hpte_soft_invalid(), to track - * invalid slots. We cannot use it. So invalidate it. - */ - gslot = slot & _PTEIDX_GROUP_IX; - mmu_hash_ops.hpte_invalidate(hpte_group + gslot, vpn, - MMU_PAGE_4K, MMU_PAGE_4K, - ssize, 0); - } - - if (unlikely(slot == -1 || soft_invalid)) { - /* - * For soft invalid slot, let's ensure that we release a - * slot from the primary, with the hope that we will - * acquire that slot next time we try. This will ensure - * that we do not get the same soft-invalid slot. - */ - if (soft_invalid || (mftb() & 0x1)) - hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; - - mmu_hash_ops.hpte_remove(hpte_group); - /* - * FIXME!! Should be try the group from which we removed ? - */ - goto repeat; - } - } - /* - * Hypervisor failure. 
Restore old pte and return -1 - * similar to __hash_page_* - */ - if (unlikely(slot == -2)) { - *ptep = __pte(old_pte); - hash_failure_debug(ea, access, vsid, trap, ssize, - MMU_PAGE_4K, MMU_PAGE_4K, old_pte); - return -1; - } - - new_pte |= pte_set_hidx(ptep, rpte, subpg_index, slot, PTRS_PER_PTE); - new_pte |= H_PAGE_HASHPTE; - - *ptep = __pte(new_pte & ~H_PAGE_BUSY); - return 0; -} - -int __hash_page_64K(unsigned long ea, unsigned long access, - unsigned long vsid, pte_t *ptep, unsigned long trap, - unsigned long flags, int ssize) -{ - real_pte_t rpte; - unsigned long hpte_group; - unsigned long rflags, pa; - unsigned long old_pte, new_pte; - unsigned long vpn, hash, slot; - unsigned long shift = mmu_psize_defs[MMU_PAGE_64K].shift; - - /* - * atomically mark the linux large page PTE busy and dirty - */ - do { - pte_t pte = READ_ONCE(*ptep); - - old_pte = pte_val(pte); - /* If PTE busy, retry the access */ - if (unlikely(old_pte & H_PAGE_BUSY)) - return 0; - /* If PTE permissions don't match, take page fault */ - if (unlikely(!check_pte_access(access, old_pte))) - return 1; - /* - * Check if PTE has the cache-inhibit bit set - * If so, bail out and refault as a 4k page - */ - if (!mmu_has_feature(MMU_FTR_CI_LARGE_PAGE) && - unlikely(pte_ci(pte))) - return 0; - /* - * Try to lock the PTE, add ACCESSED and DIRTY if it was - * a write access. - */ - new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED; - if (access & _PAGE_WRITE) - new_pte |= _PAGE_DIRTY; - } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); - - rflags = htab_convert_pte_flags(new_pte); - rpte = __real_pte(__pte(old_pte), ptep, PTRS_PER_PTE); - - if (cpu_has_feature(CPU_FTR_NOEXECUTE) && - !cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) - rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); - - vpn = hpt_vpn(ea, vsid, ssize); - if (unlikely(old_pte & H_PAGE_HASHPTE)) { - unsigned long gslot; - - /* - * There MIGHT be an HPTE for this pte - */ - gslot = pte_get_hash_gslot(vpn, shift, ssize, rpte, 0); - if (mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, MMU_PAGE_64K, - MMU_PAGE_64K, ssize, - flags) == -1) - old_pte &= ~_PAGE_HPTEFLAGS; - } - - if (likely(!(old_pte & H_PAGE_HASHPTE))) { - - pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; - hash = hpt_hash(vpn, shift, ssize); - -repeat: - hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; - - /* Insert into the hash table, primary slot */ - slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0, - MMU_PAGE_64K, MMU_PAGE_64K, - ssize); - /* - * Primary is full, try the secondary - */ - if (unlikely(slot == -1)) { - hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP; - slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, - rflags, - HPTE_V_SECONDARY, - MMU_PAGE_64K, - MMU_PAGE_64K, ssize); - if (slot == -1) { - if (mftb() & 0x1) - hpte_group = (hash & htab_hash_mask) * - HPTES_PER_GROUP; - mmu_hash_ops.hpte_remove(hpte_group); - /* - * FIXME!! Should be try the group from which we removed ? - */ - goto repeat; - } - } - /* - * Hypervisor failure. 
Restore old pte and return -1 - * similar to __hash_page_* - */ - if (unlikely(slot == -2)) { - *ptep = __pte(old_pte); - hash_failure_debug(ea, access, vsid, trap, ssize, - MMU_PAGE_64K, MMU_PAGE_64K, old_pte); - return -1; - } - - new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE; - new_pte |= pte_set_hidx(ptep, rpte, 0, slot, PTRS_PER_PTE); - } - *ptep = __pte(new_pte & ~H_PAGE_BUSY); - return 0; -} diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c deleted file mode 100644 index aaa28fd918fe..000000000000 --- a/arch/powerpc/mm/hash_native_64.c +++ /dev/null @@ -1,884 +0,0 @@ -/* - * native hashtable management. - * - * SMP scalability work: - * Copyright (C) 2001 Anton Blanchard , IBM - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#undef DEBUG_LOW - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#ifdef DEBUG_LOW -#define DBG_LOW(fmt...) udbg_printf(fmt) -#else -#define DBG_LOW(fmt...) -#endif - -#ifdef __BIG_ENDIAN__ -#define HPTE_LOCK_BIT 3 -#else -#define HPTE_LOCK_BIT (56+3) -#endif - -DEFINE_RAW_SPINLOCK(native_tlbie_lock); - -static inline void tlbiel_hash_set_isa206(unsigned int set, unsigned int is) -{ - unsigned long rb; - - rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53)); - - asm volatile("tlbiel %0" : : "r" (rb)); -} - -/* - * tlbiel instruction for hash, set invalidation - * i.e., r=1 and is=01 or is=10 or is=11 - */ -static inline void tlbiel_hash_set_isa300(unsigned int set, unsigned int is, - unsigned int pid, - unsigned int ric, unsigned int prs) -{ - unsigned long rb; - unsigned long rs; - unsigned int r = 0; /* hash format */ - - rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53)); - rs = ((unsigned long)pid << PPC_BITLSHIFT(31)); - - asm volatile(PPC_TLBIEL(%0, %1, %2, %3, %4) - : : "r"(rb), "r"(rs), "i"(ric), "i"(prs), "r"(r) - : "memory"); -} - - -static void tlbiel_all_isa206(unsigned int num_sets, unsigned int is) -{ - unsigned int set; - - asm volatile("ptesync": : :"memory"); - - for (set = 0; set < num_sets; set++) - tlbiel_hash_set_isa206(set, is); - - asm volatile("ptesync": : :"memory"); -} - -static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is) -{ - unsigned int set; - - asm volatile("ptesync": : :"memory"); - - /* - * Flush the first set of the TLB, and any caching of partition table - * entries. Then flush the remaining sets of the TLB. Hash mode uses - * partition scoped TLB translations. - */ - tlbiel_hash_set_isa300(0, is, 0, 2, 0); - for (set = 1; set < num_sets; set++) - tlbiel_hash_set_isa300(set, is, 0, 0, 0); - - /* - * Now invalidate the process table cache. - * - * From ISA v3.0B p. 1078: - * The following forms are invalid. - * * PRS=1, R=0, and RIC!=2 (The only process-scoped - * HPT caching is of the Process Table.) 
- */ - tlbiel_hash_set_isa300(0, is, 0, 2, 1); - - asm volatile("ptesync": : :"memory"); - - asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); -} - -void hash__tlbiel_all(unsigned int action) -{ - unsigned int is; - - switch (action) { - case TLB_INVAL_SCOPE_GLOBAL: - is = 3; - break; - case TLB_INVAL_SCOPE_LPID: - is = 2; - break; - default: - BUG(); - } - - if (early_cpu_has_feature(CPU_FTR_ARCH_300)) - tlbiel_all_isa300(POWER9_TLB_SETS_HASH, is); - else if (early_cpu_has_feature(CPU_FTR_ARCH_207S)) - tlbiel_all_isa206(POWER8_TLB_SETS, is); - else if (early_cpu_has_feature(CPU_FTR_ARCH_206)) - tlbiel_all_isa206(POWER7_TLB_SETS, is); - else - WARN(1, "%s called on pre-POWER7 CPU\n", __func__); -} - -static inline unsigned long ___tlbie(unsigned long vpn, int psize, - int apsize, int ssize) -{ - unsigned long va; - unsigned int penc; - unsigned long sllp; - - /* - * We need 14 to 65 bits of va for a tlibe of 4K page - * With vpn we ignore the lower VPN_SHIFT bits already. - * And top two bits are already ignored because we can - * only accomodate 76 bits in a 64 bit vpn with a VPN_SHIFT - * of 12. - */ - va = vpn << VPN_SHIFT; - /* - * clear top 16 bits of 64bit va, non SLS segment - * Older versions of the architecture (2.02 and earler) require the - * masking of the top 16 bits. - */ - if (mmu_has_feature(MMU_FTR_TLBIE_CROP_VA)) - va &= ~(0xffffULL << 48); - - switch (psize) { - case MMU_PAGE_4K: - /* clear out bits after (52) [0....52.....63] */ - va &= ~((1ul << (64 - 52)) - 1); - va |= ssize << 8; - sllp = get_sllp_encoding(apsize); - va |= sllp << 5; - asm volatile(ASM_FTR_IFCLR("tlbie %0,0", PPC_TLBIE(%1,%0), %2) - : : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206) - : "memory"); - break; - default: - /* We need 14 to 14 + i bits of va */ - penc = mmu_psize_defs[psize].penc[apsize]; - va &= ~((1ul << mmu_psize_defs[apsize].shift) - 1); - va |= penc << 12; - va |= ssize << 8; - /* - * AVAL bits: - * We don't need all the bits, but rest of the bits - * must be ignored by the processor. - * vpn cover upto 65 bits of va. (0...65) and we need - * 58..64 bits of va. - */ - va |= (vpn & 0xfe); /* AVAL */ - va |= 1; /* L */ - asm volatile(ASM_FTR_IFCLR("tlbie %0,1", PPC_TLBIE(%1,%0), %2) - : : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206) - : "memory"); - break; - } - return va; -} - -static inline void fixup_tlbie(unsigned long vpn, int psize, int apsize, int ssize) -{ - if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) { - /* Need the extra ptesync to ensure we don't reorder tlbie*/ - asm volatile("ptesync": : :"memory"); - ___tlbie(vpn, psize, apsize, ssize); - } -} - -static inline void __tlbie(unsigned long vpn, int psize, int apsize, int ssize) -{ - unsigned long rb; - - rb = ___tlbie(vpn, psize, apsize, ssize); - trace_tlbie(0, 0, rb, 0, 0, 0, 0); -} - -static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize) -{ - unsigned long va; - unsigned int penc; - unsigned long sllp; - - /* VPN_SHIFT can be atmost 12 */ - va = vpn << VPN_SHIFT; - /* - * clear top 16 bits of 64 bit va, non SLS segment - * Older versions of the architecture (2.02 and earler) require the - * masking of the top 16 bits. 
- */ - if (mmu_has_feature(MMU_FTR_TLBIE_CROP_VA)) - va &= ~(0xffffULL << 48); - - switch (psize) { - case MMU_PAGE_4K: - /* clear out bits after(52) [0....52.....63] */ - va &= ~((1ul << (64 - 52)) - 1); - va |= ssize << 8; - sllp = get_sllp_encoding(apsize); - va |= sllp << 5; - asm volatile(ASM_FTR_IFSET("tlbiel %0", "tlbiel %0,0", %1) - : : "r" (va), "i" (CPU_FTR_ARCH_206) - : "memory"); - break; - default: - /* We need 14 to 14 + i bits of va */ - penc = mmu_psize_defs[psize].penc[apsize]; - va &= ~((1ul << mmu_psize_defs[apsize].shift) - 1); - va |= penc << 12; - va |= ssize << 8; - /* - * AVAL bits: - * We don't need all the bits, but rest of the bits - * must be ignored by the processor. - * vpn cover upto 65 bits of va. (0...65) and we need - * 58..64 bits of va. - */ - va |= (vpn & 0xfe); - va |= 1; /* L */ - asm volatile(ASM_FTR_IFSET("tlbiel %0", "tlbiel %0,1", %1) - : : "r" (va), "i" (CPU_FTR_ARCH_206) - : "memory"); - break; - } - trace_tlbie(0, 1, va, 0, 0, 0, 0); - -} - -static inline void tlbie(unsigned long vpn, int psize, int apsize, - int ssize, int local) -{ - unsigned int use_local; - int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); - - use_local = local && mmu_has_feature(MMU_FTR_TLBIEL) && !cxl_ctx_in_use(); - - if (use_local) - use_local = mmu_psize_defs[psize].tlbiel; - if (lock_tlbie && !use_local) - raw_spin_lock(&native_tlbie_lock); - asm volatile("ptesync": : :"memory"); - if (use_local) { - __tlbiel(vpn, psize, apsize, ssize); - asm volatile("ptesync": : :"memory"); - } else { - __tlbie(vpn, psize, apsize, ssize); - fixup_tlbie(vpn, psize, apsize, ssize); - asm volatile("eieio; tlbsync; ptesync": : :"memory"); - } - if (lock_tlbie && !use_local) - raw_spin_unlock(&native_tlbie_lock); -} - -static inline void native_lock_hpte(struct hash_pte *hptep) -{ - unsigned long *word = (unsigned long *)&hptep->v; - - while (1) { - if (!test_and_set_bit_lock(HPTE_LOCK_BIT, word)) - break; - spin_begin(); - while(test_bit(HPTE_LOCK_BIT, word)) - spin_cpu_relax(); - spin_end(); - } -} - -static inline void native_unlock_hpte(struct hash_pte *hptep) -{ - unsigned long *word = (unsigned long *)&hptep->v; - - clear_bit_unlock(HPTE_LOCK_BIT, word); -} - -static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn, - unsigned long pa, unsigned long rflags, - unsigned long vflags, int psize, int apsize, int ssize) -{ - struct hash_pte *hptep = htab_address + hpte_group; - unsigned long hpte_v, hpte_r; - int i; - - if (!(vflags & HPTE_V_BOLTED)) { - DBG_LOW(" insert(group=%lx, vpn=%016lx, pa=%016lx," - " rflags=%lx, vflags=%lx, psize=%d)\n", - hpte_group, vpn, pa, rflags, vflags, psize); - } - - for (i = 0; i < HPTES_PER_GROUP; i++) { - if (! (be64_to_cpu(hptep->v) & HPTE_V_VALID)) { - /* retry with lock held */ - native_lock_hpte(hptep); - if (! 
(be64_to_cpu(hptep->v) & HPTE_V_VALID)) - break; - native_unlock_hpte(hptep); - } - - hptep++; - } - - if (i == HPTES_PER_GROUP) - return -1; - - hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID; - hpte_r = hpte_encode_r(pa, psize, apsize) | rflags; - - if (!(vflags & HPTE_V_BOLTED)) { - DBG_LOW(" i=%x hpte_v=%016lx, hpte_r=%016lx\n", - i, hpte_v, hpte_r); - } - - if (cpu_has_feature(CPU_FTR_ARCH_300)) { - hpte_r = hpte_old_to_new_r(hpte_v, hpte_r); - hpte_v = hpte_old_to_new_v(hpte_v); - } - - hptep->r = cpu_to_be64(hpte_r); - /* Guarantee the second dword is visible before the valid bit */ - eieio(); - /* - * Now set the first dword including the valid bit - * NOTE: this also unlocks the hpte - */ - hptep->v = cpu_to_be64(hpte_v); - - __asm__ __volatile__ ("ptesync" : : : "memory"); - - return i | (!!(vflags & HPTE_V_SECONDARY) << 3); -} - -static long native_hpte_remove(unsigned long hpte_group) -{ - struct hash_pte *hptep; - int i; - int slot_offset; - unsigned long hpte_v; - - DBG_LOW(" remove(group=%lx)\n", hpte_group); - - /* pick a random entry to start at */ - slot_offset = mftb() & 0x7; - - for (i = 0; i < HPTES_PER_GROUP; i++) { - hptep = htab_address + hpte_group + slot_offset; - hpte_v = be64_to_cpu(hptep->v); - - if ((hpte_v & HPTE_V_VALID) && !(hpte_v & HPTE_V_BOLTED)) { - /* retry with lock held */ - native_lock_hpte(hptep); - hpte_v = be64_to_cpu(hptep->v); - if ((hpte_v & HPTE_V_VALID) - && !(hpte_v & HPTE_V_BOLTED)) - break; - native_unlock_hpte(hptep); - } - - slot_offset++; - slot_offset &= 0x7; - } - - if (i == HPTES_PER_GROUP) - return -1; - - /* Invalidate the hpte. NOTE: this also unlocks it */ - hptep->v = 0; - - return i; -} - -static long native_hpte_updatepp(unsigned long slot, unsigned long newpp, - unsigned long vpn, int bpsize, - int apsize, int ssize, unsigned long flags) -{ - struct hash_pte *hptep = htab_address + slot; - unsigned long hpte_v, want_v; - int ret = 0, local = 0; - - want_v = hpte_encode_avpn(vpn, bpsize, ssize); - - DBG_LOW(" update(vpn=%016lx, avpnv=%016lx, group=%lx, newpp=%lx)", - vpn, want_v & HPTE_V_AVPN, slot, newpp); - - hpte_v = hpte_get_old_v(hptep); - /* - * We need to invalidate the TLB always because hpte_remove doesn't do - * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less - * random entry from it. When we do that we don't invalidate the TLB - * (hpte_remove) because we assume the old translation is still - * technically "valid". 
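A stand-alone sketch of the PTEG (hash bucket) arithmetic used throughout the native hash code above: the primary group index comes from the hash, the secondary from its complement, and the stored hidx picks the group and the slot within it. HPTES_PER_GROUP is taken as 8, consistent with the 0x7 slot masks in the code above; the hash table mask and hidx values are made up for the example:

/*
 * Sketch only: compute a global HPTE slot from a hash value and a
 * stored hidx. The *_IX/*_SECONDARY constants are assumed stand-ins.
 */
#include <stdio.h>

#define HPTES_PER_GROUP		8
#define PTEIDX_GROUP_IX		0x7
#define PTEIDX_SECONDARY	0x8

int main(void)
{
	unsigned long htab_hash_mask = 0xfffff;		/* illustrative */
	unsigned long hash = 0x12345;
	unsigned long hidx = PTEIDX_SECONDARY | 0x3;	/* secondary, slot 3 */
	unsigned long slot;

	if (hidx & PTEIDX_SECONDARY)
		hash = ~hash;				/* secondary bucket */
	slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
	slot += hidx & PTEIDX_GROUP_IX;

	printf("global slot = 0x%lx\n", slot);
	return 0;
}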
- */ - if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) { - DBG_LOW(" -> miss\n"); - ret = -1; - } else { - native_lock_hpte(hptep); - /* recheck with locks held */ - hpte_v = hpte_get_old_v(hptep); - if (unlikely(!HPTE_V_COMPARE(hpte_v, want_v) || - !(hpte_v & HPTE_V_VALID))) { - ret = -1; - } else { - DBG_LOW(" -> hit\n"); - /* Update the HPTE */ - hptep->r = cpu_to_be64((be64_to_cpu(hptep->r) & - ~(HPTE_R_PPP | HPTE_R_N)) | - (newpp & (HPTE_R_PPP | HPTE_R_N | - HPTE_R_C))); - } - native_unlock_hpte(hptep); - } - - if (flags & HPTE_LOCAL_UPDATE) - local = 1; - /* - * Ensure it is out of the tlb too if it is not a nohpte fault - */ - if (!(flags & HPTE_NOHPTE_UPDATE)) - tlbie(vpn, bpsize, apsize, ssize, local); - - return ret; -} - -static long native_hpte_find(unsigned long vpn, int psize, int ssize) -{ - struct hash_pte *hptep; - unsigned long hash; - unsigned long i; - long slot; - unsigned long want_v, hpte_v; - - hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize); - want_v = hpte_encode_avpn(vpn, psize, ssize); - - /* Bolted mappings are only ever in the primary group */ - slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - for (i = 0; i < HPTES_PER_GROUP; i++) { - - hptep = htab_address + slot; - hpte_v = hpte_get_old_v(hptep); - if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) - /* HPTE matches */ - return slot; - ++slot; - } - - return -1; -} - -/* - * Update the page protection bits. Intended to be used to create - * guard pages for kernel data structures on pages which are bolted - * in the HPT. Assumes pages being operated on will not be stolen. - * - * No need to lock here because we should be the only user. - */ -static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea, - int psize, int ssize) -{ - unsigned long vpn; - unsigned long vsid; - long slot; - struct hash_pte *hptep; - - vsid = get_kernel_vsid(ea, ssize); - vpn = hpt_vpn(ea, vsid, ssize); - - slot = native_hpte_find(vpn, psize, ssize); - if (slot == -1) - panic("could not find page to bolt\n"); - hptep = htab_address + slot; - - /* Update the HPTE */ - hptep->r = cpu_to_be64((be64_to_cpu(hptep->r) & - ~(HPTE_R_PPP | HPTE_R_N)) | - (newpp & (HPTE_R_PPP | HPTE_R_N))); - /* - * Ensure it is out of the tlb too. Bolted entries base and - * actual page size will be same. - */ - tlbie(vpn, psize, psize, ssize, 0); -} - -/* - * Remove a bolted kernel entry. Memory hotplug uses this. - * - * No need to lock here because we should be the only user. 
- */ -static int native_hpte_removebolted(unsigned long ea, int psize, int ssize) -{ - unsigned long vpn; - unsigned long vsid; - long slot; - struct hash_pte *hptep; - - vsid = get_kernel_vsid(ea, ssize); - vpn = hpt_vpn(ea, vsid, ssize); - - slot = native_hpte_find(vpn, psize, ssize); - if (slot == -1) - return -ENOENT; - - hptep = htab_address + slot; - - VM_WARN_ON(!(be64_to_cpu(hptep->v) & HPTE_V_BOLTED)); - - /* Invalidate the hpte */ - hptep->v = 0; - - /* Invalidate the TLB */ - tlbie(vpn, psize, psize, ssize, 0); - return 0; -} - - -static void native_hpte_invalidate(unsigned long slot, unsigned long vpn, - int bpsize, int apsize, int ssize, int local) -{ - struct hash_pte *hptep = htab_address + slot; - unsigned long hpte_v; - unsigned long want_v; - unsigned long flags; - - local_irq_save(flags); - - DBG_LOW(" invalidate(vpn=%016lx, hash: %lx)\n", vpn, slot); - - want_v = hpte_encode_avpn(vpn, bpsize, ssize); - hpte_v = hpte_get_old_v(hptep); - - if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) { - native_lock_hpte(hptep); - /* recheck with locks held */ - hpte_v = hpte_get_old_v(hptep); - - if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) - /* Invalidate the hpte. NOTE: this also unlocks it */ - hptep->v = 0; - else - native_unlock_hpte(hptep); - } - /* - * We need to invalidate the TLB always because hpte_remove doesn't do - * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less - * random entry from it. When we do that we don't invalidate the TLB - * (hpte_remove) because we assume the old translation is still - * technically "valid". - */ - tlbie(vpn, bpsize, apsize, ssize, local); - - local_irq_restore(flags); -} - -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -static void native_hugepage_invalidate(unsigned long vsid, - unsigned long addr, - unsigned char *hpte_slot_array, - int psize, int ssize, int local) -{ - int i; - struct hash_pte *hptep; - int actual_psize = MMU_PAGE_16M; - unsigned int max_hpte_count, valid; - unsigned long flags, s_addr = addr; - unsigned long hpte_v, want_v, shift; - unsigned long hidx, vpn = 0, hash, slot; - - shift = mmu_psize_defs[psize].shift; - max_hpte_count = 1U << (PMD_SHIFT - shift); - - local_irq_save(flags); - for (i = 0; i < max_hpte_count; i++) { - valid = hpte_valid(hpte_slot_array, i); - if (!valid) - continue; - hidx = hpte_hash_index(hpte_slot_array, i); - - /* get the vpn */ - addr = s_addr + (i * (1ul << shift)); - vpn = hpt_vpn(addr, vsid, ssize); - hash = hpt_hash(vpn, shift, ssize); - if (hidx & _PTEIDX_SECONDARY) - hash = ~hash; - - slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - slot += hidx & _PTEIDX_GROUP_IX; - - hptep = htab_address + slot; - want_v = hpte_encode_avpn(vpn, psize, ssize); - hpte_v = hpte_get_old_v(hptep); - - /* Even if we miss, we need to invalidate the TLB */ - if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) { - /* recheck with locks held */ - native_lock_hpte(hptep); - hpte_v = hpte_get_old_v(hptep); - - if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) { - /* - * Invalidate the hpte. 
NOTE: this also unlocks it - */ - - hptep->v = 0; - } else - native_unlock_hpte(hptep); - } - /* - * We need to do tlb invalidate for all the address, tlbie - * instruction compares entry_VA in tlb with the VA specified - * here - */ - tlbie(vpn, psize, actual_psize, ssize, local); - } - local_irq_restore(flags); -} -#else -static void native_hugepage_invalidate(unsigned long vsid, - unsigned long addr, - unsigned char *hpte_slot_array, - int psize, int ssize, int local) -{ - WARN(1, "%s called without THP support\n", __func__); -} -#endif - -static void hpte_decode(struct hash_pte *hpte, unsigned long slot, - int *psize, int *apsize, int *ssize, unsigned long *vpn) -{ - unsigned long avpn, pteg, vpi; - unsigned long hpte_v = be64_to_cpu(hpte->v); - unsigned long hpte_r = be64_to_cpu(hpte->r); - unsigned long vsid, seg_off; - int size, a_size, shift; - /* Look at the 8 bit LP value */ - unsigned int lp = (hpte_r >> LP_SHIFT) & ((1 << LP_BITS) - 1); - - if (cpu_has_feature(CPU_FTR_ARCH_300)) { - hpte_v = hpte_new_to_old_v(hpte_v, hpte_r); - hpte_r = hpte_new_to_old_r(hpte_r); - } - if (!(hpte_v & HPTE_V_LARGE)) { - size = MMU_PAGE_4K; - a_size = MMU_PAGE_4K; - } else { - size = hpte_page_sizes[lp] & 0xf; - a_size = hpte_page_sizes[lp] >> 4; - } - /* This works for all page sizes, and for 256M and 1T segments */ - *ssize = hpte_v >> HPTE_V_SSIZE_SHIFT; - shift = mmu_psize_defs[size].shift; - - avpn = (HPTE_V_AVPN_VAL(hpte_v) & ~mmu_psize_defs[size].avpnm); - pteg = slot / HPTES_PER_GROUP; - if (hpte_v & HPTE_V_SECONDARY) - pteg = ~pteg; - - switch (*ssize) { - case MMU_SEGSIZE_256M: - /* We only have 28 - 23 bits of seg_off in avpn */ - seg_off = (avpn & 0x1f) << 23; - vsid = avpn >> 5; - /* We can find more bits from the pteg value */ - if (shift < 23) { - vpi = (vsid ^ pteg) & htab_hash_mask; - seg_off |= vpi << shift; - } - *vpn = vsid << (SID_SHIFT - VPN_SHIFT) | seg_off >> VPN_SHIFT; - break; - case MMU_SEGSIZE_1T: - /* We only have 40 - 23 bits of seg_off in avpn */ - seg_off = (avpn & 0x1ffff) << 23; - vsid = avpn >> 17; - if (shift < 23) { - vpi = (vsid ^ (vsid << 25) ^ pteg) & htab_hash_mask; - seg_off |= vpi << shift; - } - *vpn = vsid << (SID_SHIFT_1T - VPN_SHIFT) | seg_off >> VPN_SHIFT; - break; - default: - *vpn = size = 0; - } - *psize = size; - *apsize = a_size; -} - -/* - * clear all mappings on kexec. All cpus are in real mode (or they will - * be when they isi), and we are the only one left. We rely on our kernel - * mapping being 0xC0's and the hardware ignoring those two real bits. - * - * This must be called with interrupts disabled. - * - * Taking the native_tlbie_lock is unsafe here due to the possibility of - * lockdep being on. On pre POWER5 hardware, not taking the lock could - * cause deadlock. POWER5 and newer not taking the lock is fine. This only - * gets called during boot before secondary CPUs have come up and during - * crashdump and all bets are off anyway. - * - * TODO: add batching support when enabled. remember, no dynamic memory here, - * although there is the control page available... - */ -static void native_hpte_clear(void) -{ - unsigned long vpn = 0; - unsigned long slot, slots; - struct hash_pte *hptep = htab_address; - unsigned long hpte_v; - unsigned long pteg_count; - int psize, apsize, ssize; - - pteg_count = htab_hash_mask + 1; - - slots = pteg_count * HPTES_PER_GROUP; - - for (slot = 0; slot < slots; slot++, hptep++) { - /* - * we could lock the pte here, but we are the only cpu - * running, right? 
and for crash dump, we probably - * don't want to wait for a maybe bad cpu. - */ - hpte_v = be64_to_cpu(hptep->v); - - /* - * Call __tlbie() here rather than tlbie() since we can't take the - * native_tlbie_lock. - */ - if (hpte_v & HPTE_V_VALID) { - hpte_decode(hptep, slot, &psize, &apsize, &ssize, &vpn); - hptep->v = 0; - ___tlbie(vpn, psize, apsize, ssize); - } - } - - asm volatile("eieio; tlbsync; ptesync":::"memory"); -} - -/* - * Batched hash table flush, we batch the tlbie's to avoid taking/releasing - * the lock all the time - */ -static void native_flush_hash_range(unsigned long number, int local) -{ - unsigned long vpn = 0; - unsigned long hash, index, hidx, shift, slot; - struct hash_pte *hptep; - unsigned long hpte_v; - unsigned long want_v; - unsigned long flags; - real_pte_t pte; - struct ppc64_tlb_batch *batch = this_cpu_ptr(&ppc64_tlb_batch); - unsigned long psize = batch->psize; - int ssize = batch->ssize; - int i; - unsigned int use_local; - - use_local = local && mmu_has_feature(MMU_FTR_TLBIEL) && - mmu_psize_defs[psize].tlbiel && !cxl_ctx_in_use(); - - local_irq_save(flags); - - for (i = 0; i < number; i++) { - vpn = batch->vpn[i]; - pte = batch->pte[i]; - - pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) { - hash = hpt_hash(vpn, shift, ssize); - hidx = __rpte_to_hidx(pte, index); - if (hidx & _PTEIDX_SECONDARY) - hash = ~hash; - slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - slot += hidx & _PTEIDX_GROUP_IX; - hptep = htab_address + slot; - want_v = hpte_encode_avpn(vpn, psize, ssize); - hpte_v = hpte_get_old_v(hptep); - - if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) - continue; - /* lock and try again */ - native_lock_hpte(hptep); - hpte_v = hpte_get_old_v(hptep); - - if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) - native_unlock_hpte(hptep); - else - hptep->v = 0; - - } pte_iterate_hashed_end(); - } - - if (use_local) { - asm volatile("ptesync":::"memory"); - for (i = 0; i < number; i++) { - vpn = batch->vpn[i]; - pte = batch->pte[i]; - - pte_iterate_hashed_subpages(pte, psize, - vpn, index, shift) { - __tlbiel(vpn, psize, psize, ssize); - } pte_iterate_hashed_end(); - } - asm volatile("ptesync":::"memory"); - } else { - int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); - - if (lock_tlbie) - raw_spin_lock(&native_tlbie_lock); - - asm volatile("ptesync":::"memory"); - for (i = 0; i < number; i++) { - vpn = batch->vpn[i]; - pte = batch->pte[i]; - - pte_iterate_hashed_subpages(pte, psize, - vpn, index, shift) { - __tlbie(vpn, psize, psize, ssize); - } pte_iterate_hashed_end(); - } - /* - * Just do one more with the last used values. 
- */ - fixup_tlbie(vpn, psize, psize, ssize); - asm volatile("eieio; tlbsync; ptesync":::"memory"); - - if (lock_tlbie) - raw_spin_unlock(&native_tlbie_lock); - } - - local_irq_restore(flags); -} - -void __init hpte_init_native(void) -{ - mmu_hash_ops.hpte_invalidate = native_hpte_invalidate; - mmu_hash_ops.hpte_updatepp = native_hpte_updatepp; - mmu_hash_ops.hpte_updateboltedpp = native_hpte_updateboltedpp; - mmu_hash_ops.hpte_removebolted = native_hpte_removebolted; - mmu_hash_ops.hpte_insert = native_hpte_insert; - mmu_hash_ops.hpte_remove = native_hpte_remove; - mmu_hash_ops.hpte_clear_all = native_hpte_clear; - mmu_hash_ops.flush_hash_range = native_flush_hash_range; - mmu_hash_ops.hugepage_invalidate = native_hugepage_invalidate; -} diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c deleted file mode 100644 index 6eb89643ce58..000000000000 --- a/arch/powerpc/mm/hash_utils_64.c +++ /dev/null @@ -1,1930 +0,0 @@ -/* - * PowerPC64 port by Mike Corrigan and Dave Engebretsen - * {mikejc|engebret}@us.ibm.com - * - * Copyright (c) 2000 Mike Corrigan - * - * SMP scalability work: - * Copyright (C) 2001 Anton Blanchard , IBM - * - * Module name: htab.c - * - * Description: - * PowerPC Hashed Page Table functions - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#undef DEBUG -#undef DEBUG_LOW - -#define pr_fmt(fmt) "hash-mmu: " fmt -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef DEBUG -#define DBG(fmt...) udbg_printf(fmt) -#else -#define DBG(fmt...) -#endif - -#ifdef DEBUG_LOW -#define DBG_LOW(fmt...) udbg_printf(fmt) -#else -#define DBG_LOW(fmt...) -#endif - -#define KB (1024) -#define MB (1024*KB) -#define GB (1024L*MB) - -/* - * Note: pte --> Linux PTE - * HPTE --> PowerPC Hashed Page Table Entry - * - * Execution context: - * htab_initialize is called with the MMU off (of course), but - * the kernel has been copied down to zero so it can directly - * reference global data. At this point it is very difficult - * to print debug info. 
- * - */ - -static unsigned long _SDR1; -struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; -EXPORT_SYMBOL_GPL(mmu_psize_defs); - -u8 hpte_page_sizes[1 << LP_BITS]; -EXPORT_SYMBOL_GPL(hpte_page_sizes); - -struct hash_pte *htab_address; -unsigned long htab_size_bytes; -unsigned long htab_hash_mask; -EXPORT_SYMBOL_GPL(htab_hash_mask); -int mmu_linear_psize = MMU_PAGE_4K; -EXPORT_SYMBOL_GPL(mmu_linear_psize); -int mmu_virtual_psize = MMU_PAGE_4K; -int mmu_vmalloc_psize = MMU_PAGE_4K; -#ifdef CONFIG_SPARSEMEM_VMEMMAP -int mmu_vmemmap_psize = MMU_PAGE_4K; -#endif -int mmu_io_psize = MMU_PAGE_4K; -int mmu_kernel_ssize = MMU_SEGSIZE_256M; -EXPORT_SYMBOL_GPL(mmu_kernel_ssize); -int mmu_highuser_ssize = MMU_SEGSIZE_256M; -u16 mmu_slb_size = 64; -EXPORT_SYMBOL_GPL(mmu_slb_size); -#ifdef CONFIG_PPC_64K_PAGES -int mmu_ci_restrictions; -#endif -#ifdef CONFIG_DEBUG_PAGEALLOC -static u8 *linear_map_hash_slots; -static unsigned long linear_map_hash_count; -static DEFINE_SPINLOCK(linear_map_hash_lock); -#endif /* CONFIG_DEBUG_PAGEALLOC */ -struct mmu_hash_ops mmu_hash_ops; -EXPORT_SYMBOL(mmu_hash_ops); - -/* There are definitions of page sizes arrays to be used when none - * is provided by the firmware. - */ - -/* - * Fallback (4k pages only) - */ -static struct mmu_psize_def mmu_psize_defaults[] = { - [MMU_PAGE_4K] = { - .shift = 12, - .sllp = 0, - .penc = {[MMU_PAGE_4K] = 0, [1 ... MMU_PAGE_COUNT - 1] = -1}, - .avpnm = 0, - .tlbiel = 0, - }, -}; - -/* POWER4, GPUL, POWER5 - * - * Support for 16Mb large pages - */ -static struct mmu_psize_def mmu_psize_defaults_gp[] = { - [MMU_PAGE_4K] = { - .shift = 12, - .sllp = 0, - .penc = {[MMU_PAGE_4K] = 0, [1 ... MMU_PAGE_COUNT - 1] = -1}, - .avpnm = 0, - .tlbiel = 1, - }, - [MMU_PAGE_16M] = { - .shift = 24, - .sllp = SLB_VSID_L, - .penc = {[0 ... MMU_PAGE_16M - 1] = -1, [MMU_PAGE_16M] = 0, - [MMU_PAGE_16M + 1 ... MMU_PAGE_COUNT - 1] = -1 }, - .avpnm = 0x1UL, - .tlbiel = 0, - }, -}; - -/* - * 'R' and 'C' update notes: - * - Under pHyp or KVM, the updatepp path will not set C, thus it *will* - * create writeable HPTEs without C set, because the hcall H_PROTECT - * that we use in that case will not update C - * - The above is however not a problem, because we also don't do that - * fancy "no flush" variant of eviction and we use H_REMOVE which will - * do the right thing and thus we don't have the race I described earlier - * - * - Under bare metal, we do have the race, so we need R and C set - * - We make sure R is always set and never lost - * - C is _PAGE_DIRTY, and *should* always be set for a writeable mapping - */ -unsigned long htab_convert_pte_flags(unsigned long pteflags) -{ - unsigned long rflags = 0; - - /* _PAGE_EXEC -> NOEXEC */ - if ((pteflags & _PAGE_EXEC) == 0) - rflags |= HPTE_R_N; - /* - * PPP bits: - * Linux uses slb key 0 for kernel and 1 for user. - * kernel RW areas are mapped with PPP=0b000 - * User area is mapped with PPP=0b010 for read/write - * or PPP=0b011 for read-only (including writeable but clean pages). - */ - if (pteflags & _PAGE_PRIVILEGED) { - /* - * Kernel read only mapped with ppp bits 0b110 - */ - if (!(pteflags & _PAGE_WRITE)) { - if (mmu_has_feature(MMU_FTR_KERNEL_RO)) - rflags |= (HPTE_R_PP0 | 0x2); - else - rflags |= 0x3; - } - } else { - if (pteflags & _PAGE_RWX) - rflags |= 0x2; - if (!((pteflags & _PAGE_WRITE) && (pteflags & _PAGE_DIRTY))) - rflags |= 0x1; - } - /* - * We can't allow hardware to update hpte bits. 
Hence always - * set 'R' bit and set 'C' if it is a write fault - */ - rflags |= HPTE_R_R; - - if (pteflags & _PAGE_DIRTY) - rflags |= HPTE_R_C; - /* - * Add in WIG bits - */ - - if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_TOLERANT) - rflags |= HPTE_R_I; - else if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_NON_IDEMPOTENT) - rflags |= (HPTE_R_I | HPTE_R_G); - else if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_SAO) - rflags |= (HPTE_R_W | HPTE_R_I | HPTE_R_M); - else - /* - * Add memory coherence if cache inhibited is not set - */ - rflags |= HPTE_R_M; - - rflags |= pte_to_hpte_pkey_bits(pteflags); - return rflags; -} - -int htab_bolt_mapping(unsigned long vstart, unsigned long vend, - unsigned long pstart, unsigned long prot, - int psize, int ssize) -{ - unsigned long vaddr, paddr; - unsigned int step, shift; - int ret = 0; - - shift = mmu_psize_defs[psize].shift; - step = 1 << shift; - - prot = htab_convert_pte_flags(prot); - - DBG("htab_bolt_mapping(%lx..%lx -> %lx (%lx,%d,%d)\n", - vstart, vend, pstart, prot, psize, ssize); - - for (vaddr = vstart, paddr = pstart; vaddr < vend; - vaddr += step, paddr += step) { - unsigned long hash, hpteg; - unsigned long vsid = get_kernel_vsid(vaddr, ssize); - unsigned long vpn = hpt_vpn(vaddr, vsid, ssize); - unsigned long tprot = prot; - - /* - * If we hit a bad address return error. - */ - if (!vsid) - return -1; - /* Make kernel text executable */ - if (overlaps_kernel_text(vaddr, vaddr + step)) - tprot &= ~HPTE_R_N; - - /* Make kvm guest trampolines executable */ - if (overlaps_kvm_tmp(vaddr, vaddr + step)) - tprot &= ~HPTE_R_N; - - /* - * If relocatable, check if it overlaps interrupt vectors that - * are copied down to real 0. For relocatable kernel - * (e.g. kdump case) we copy interrupt vectors down to real - * address 0. Mark that region as executable. This is - * because on p8 system with relocation on exception feature - * enabled, exceptions are raised with MMU (IR=DR=1) ON. Hence - * in order to execute the interrupt handlers in virtual - * mode the vector region need to be marked as executable. - */ - if ((PHYSICAL_START > MEMORY_START) && - overlaps_interrupt_vector_text(vaddr, vaddr + step)) - tprot &= ~HPTE_R_N; - - hash = hpt_hash(vpn, shift, ssize); - hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP); - - BUG_ON(!mmu_hash_ops.hpte_insert); - ret = mmu_hash_ops.hpte_insert(hpteg, vpn, paddr, tprot, - HPTE_V_BOLTED, psize, psize, - ssize); - - if (ret < 0) - break; - -#ifdef CONFIG_DEBUG_PAGEALLOC - if (debug_pagealloc_enabled() && - (paddr >> PAGE_SHIFT) < linear_map_hash_count) - linear_map_hash_slots[paddr >> PAGE_SHIFT] = ret | 0x80; -#endif /* CONFIG_DEBUG_PAGEALLOC */ - } - return ret < 0 ? 
ret : 0; -} - -int htab_remove_mapping(unsigned long vstart, unsigned long vend, - int psize, int ssize) -{ - unsigned long vaddr; - unsigned int step, shift; - int rc; - int ret = 0; - - shift = mmu_psize_defs[psize].shift; - step = 1 << shift; - - if (!mmu_hash_ops.hpte_removebolted) - return -ENODEV; - - for (vaddr = vstart; vaddr < vend; vaddr += step) { - rc = mmu_hash_ops.hpte_removebolted(vaddr, psize, ssize); - if (rc == -ENOENT) { - ret = -ENOENT; - continue; - } - if (rc < 0) - return rc; - } - - return ret; -} - -static bool disable_1tb_segments = false; - -static int __init parse_disable_1tb_segments(char *p) -{ - disable_1tb_segments = true; - return 0; -} -early_param("disable_1tb_segments", parse_disable_1tb_segments); - -static int __init htab_dt_scan_seg_sizes(unsigned long node, - const char *uname, int depth, - void *data) -{ - const char *type = of_get_flat_dt_prop(node, "device_type", NULL); - const __be32 *prop; - int size = 0; - - /* We are scanning "cpu" nodes only */ - if (type == NULL || strcmp(type, "cpu") != 0) - return 0; - - prop = of_get_flat_dt_prop(node, "ibm,processor-segment-sizes", &size); - if (prop == NULL) - return 0; - for (; size >= 4; size -= 4, ++prop) { - if (be32_to_cpu(prop[0]) == 40) { - DBG("1T segment support detected\n"); - - if (disable_1tb_segments) { - DBG("1T segments disabled by command line\n"); - break; - } - - cur_cpu_spec->mmu_features |= MMU_FTR_1T_SEGMENT; - return 1; - } - } - cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B; - return 0; -} - -static int __init get_idx_from_shift(unsigned int shift) -{ - int idx = -1; - - switch (shift) { - case 0xc: - idx = MMU_PAGE_4K; - break; - case 0x10: - idx = MMU_PAGE_64K; - break; - case 0x14: - idx = MMU_PAGE_1M; - break; - case 0x18: - idx = MMU_PAGE_16M; - break; - case 0x22: - idx = MMU_PAGE_16G; - break; - } - return idx; -} - -static int __init htab_dt_scan_page_sizes(unsigned long node, - const char *uname, int depth, - void *data) -{ - const char *type = of_get_flat_dt_prop(node, "device_type", NULL); - const __be32 *prop; - int size = 0; - - /* We are scanning "cpu" nodes only */ - if (type == NULL || strcmp(type, "cpu") != 0) - return 0; - - prop = of_get_flat_dt_prop(node, "ibm,segment-page-sizes", &size); - if (!prop) - return 0; - - pr_info("Page sizes from device-tree:\n"); - size /= 4; - cur_cpu_spec->mmu_features &= ~(MMU_FTR_16M_PAGE); - while(size > 0) { - unsigned int base_shift = be32_to_cpu(prop[0]); - unsigned int slbenc = be32_to_cpu(prop[1]); - unsigned int lpnum = be32_to_cpu(prop[2]); - struct mmu_psize_def *def; - int idx, base_idx; - - size -= 3; prop += 3; - base_idx = get_idx_from_shift(base_shift); - if (base_idx < 0) { - /* skip the pte encoding also */ - prop += lpnum * 2; size -= lpnum * 2; - continue; - } - def = &mmu_psize_defs[base_idx]; - if (base_idx == MMU_PAGE_16M) - cur_cpu_spec->mmu_features |= MMU_FTR_16M_PAGE; - - def->shift = base_shift; - if (base_shift <= 23) - def->avpnm = 0; - else - def->avpnm = (1 << (base_shift - 23)) - 1; - def->sllp = slbenc; - /* - * We don't know for sure what's up with tlbiel, so - * for now we only set it for 4K and 64K pages - */ - if (base_idx == MMU_PAGE_4K || base_idx == MMU_PAGE_64K) - def->tlbiel = 1; - else - def->tlbiel = 0; - - while (size > 0 && lpnum) { - unsigned int shift = be32_to_cpu(prop[0]); - int penc = be32_to_cpu(prop[1]); - - prop += 2; size -= 2; - lpnum--; - - idx = get_idx_from_shift(shift); - if (idx < 0) - continue; - - if (penc == -1) - pr_err("Invalid penc for base_shift=%d " - 
"shift=%d\n", base_shift, shift); - - def->penc[idx] = penc; - pr_info("base_shift=%d: shift=%d, sllp=0x%04lx," - " avpnm=0x%08lx, tlbiel=%d, penc=%d\n", - base_shift, shift, def->sllp, - def->avpnm, def->tlbiel, def->penc[idx]); - } - } - - return 1; -} - -#ifdef CONFIG_HUGETLB_PAGE -/* Scan for 16G memory blocks that have been set aside for huge pages - * and reserve those blocks for 16G huge pages. - */ -static int __init htab_dt_scan_hugepage_blocks(unsigned long node, - const char *uname, int depth, - void *data) { - const char *type = of_get_flat_dt_prop(node, "device_type", NULL); - const __be64 *addr_prop; - const __be32 *page_count_prop; - unsigned int expected_pages; - long unsigned int phys_addr; - long unsigned int block_size; - - /* We are scanning "memory" nodes only */ - if (type == NULL || strcmp(type, "memory") != 0) - return 0; - - /* This property is the log base 2 of the number of virtual pages that - * will represent this memory block. */ - page_count_prop = of_get_flat_dt_prop(node, "ibm,expected#pages", NULL); - if (page_count_prop == NULL) - return 0; - expected_pages = (1 << be32_to_cpu(page_count_prop[0])); - addr_prop = of_get_flat_dt_prop(node, "reg", NULL); - if (addr_prop == NULL) - return 0; - phys_addr = be64_to_cpu(addr_prop[0]); - block_size = be64_to_cpu(addr_prop[1]); - if (block_size != (16 * GB)) - return 0; - printk(KERN_INFO "Huge page(16GB) memory: " - "addr = 0x%lX size = 0x%lX pages = %d\n", - phys_addr, block_size, expected_pages); - if (phys_addr + block_size * expected_pages <= memblock_end_of_DRAM()) { - memblock_reserve(phys_addr, block_size * expected_pages); - pseries_add_gpage(phys_addr, block_size, expected_pages); - } - return 0; -} -#endif /* CONFIG_HUGETLB_PAGE */ - -static void mmu_psize_set_default_penc(void) -{ - int bpsize, apsize; - for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++) - for (apsize = 0; apsize < MMU_PAGE_COUNT; apsize++) - mmu_psize_defs[bpsize].penc[apsize] = -1; -} - -#ifdef CONFIG_PPC_64K_PAGES - -static bool might_have_hea(void) -{ - /* - * The HEA ethernet adapter requires awareness of the - * GX bus. Without that awareness we can easily assume - * we will never see an HEA ethernet device. - */ -#ifdef CONFIG_IBMEBUS - return !cpu_has_feature(CPU_FTR_ARCH_207S) && - firmware_has_feature(FW_FEATURE_SPLPAR); -#else - return false; -#endif -} - -#endif /* #ifdef CONFIG_PPC_64K_PAGES */ - -static void __init htab_scan_page_sizes(void) -{ - int rc; - - /* se the invalid penc to -1 */ - mmu_psize_set_default_penc(); - - /* Default to 4K pages only */ - memcpy(mmu_psize_defs, mmu_psize_defaults, - sizeof(mmu_psize_defaults)); - - /* - * Try to find the available page sizes in the device-tree - */ - rc = of_scan_flat_dt(htab_dt_scan_page_sizes, NULL); - if (rc == 0 && early_mmu_has_feature(MMU_FTR_16M_PAGE)) { - /* - * Nothing in the device-tree, but the CPU supports 16M pages, - * so let's fallback on a known size list for 16M capable CPUs. - */ - memcpy(mmu_psize_defs, mmu_psize_defaults_gp, - sizeof(mmu_psize_defaults_gp)); - } - -#ifdef CONFIG_HUGETLB_PAGE - if (!hugetlb_disabled) { - /* Reserve 16G huge page memory sections for huge pages */ - of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL); - } -#endif /* CONFIG_HUGETLB_PAGE */ -} - -/* - * Fill in the hpte_page_sizes[] array. - * We go through the mmu_psize_defs[] array looking for all the - * supported base/actual page size combinations. Each combination - * has a unique pagesize encoding (penc) value in the low bits of - * the LP field of the HPTE. 
For actual page sizes less than 1MB, - * some of the upper LP bits are used for RPN bits, meaning that - * we need to fill in several entries in hpte_page_sizes[]. - * - * In diagrammatic form, with r = RPN bits and z = page size bits: - * PTE LP actual page size - * rrrr rrrz >=8KB - * rrrr rrzz >=16KB - * rrrr rzzz >=32KB - * rrrr zzzz >=64KB - * ... - * - * The zzzz bits are implementation-specific but are chosen so that - * no encoding for a larger page size uses the same value in its - * low-order N bits as the encoding for the 2^(12+N) byte page size - * (if it exists). - */ -static void init_hpte_page_sizes(void) -{ - long int ap, bp; - long int shift, penc; - - for (bp = 0; bp < MMU_PAGE_COUNT; ++bp) { - if (!mmu_psize_defs[bp].shift) - continue; /* not a supported page size */ - for (ap = bp; ap < MMU_PAGE_COUNT; ++ap) { - penc = mmu_psize_defs[bp].penc[ap]; - if (penc == -1 || !mmu_psize_defs[ap].shift) - continue; - shift = mmu_psize_defs[ap].shift - LP_SHIFT; - if (shift <= 0) - continue; /* should never happen */ - /* - * For page sizes less than 1MB, this loop - * replicates the entry for all possible values - * of the rrrr bits. - */ - while (penc < (1 << LP_BITS)) { - hpte_page_sizes[penc] = (ap << 4) | bp; - penc += 1 << shift; - } - } - } -} - -static void __init htab_init_page_sizes(void) -{ - init_hpte_page_sizes(); - - if (!debug_pagealloc_enabled()) { - /* - * Pick a size for the linear mapping. Currently, we only - * support 16M, 1M and 4K which is the default - */ - if (mmu_psize_defs[MMU_PAGE_16M].shift) - mmu_linear_psize = MMU_PAGE_16M; - else if (mmu_psize_defs[MMU_PAGE_1M].shift) - mmu_linear_psize = MMU_PAGE_1M; - } - -#ifdef CONFIG_PPC_64K_PAGES - /* - * Pick a size for the ordinary pages. Default is 4K, we support - * 64K for user mappings and vmalloc if supported by the processor. - * We only use 64k for ioremap if the processor - * (and firmware) support cache-inhibited large pages. - * If not, we use 4k and set mmu_ci_restrictions so that - * hash_page knows to switch processes that use cache-inhibited - * mappings to 4k pages. - */ - if (mmu_psize_defs[MMU_PAGE_64K].shift) { - mmu_virtual_psize = MMU_PAGE_64K; - mmu_vmalloc_psize = MMU_PAGE_64K; - if (mmu_linear_psize == MMU_PAGE_4K) - mmu_linear_psize = MMU_PAGE_64K; - if (mmu_has_feature(MMU_FTR_CI_LARGE_PAGE)) { - /* - * When running on pSeries using 64k pages for ioremap - * would stop us accessing the HEA ethernet. So if we - * have the chance of ever seeing one, stay at 4k. 
- */ - if (!might_have_hea()) - mmu_io_psize = MMU_PAGE_64K; - } else - mmu_ci_restrictions = 1; - } -#endif /* CONFIG_PPC_64K_PAGES */ - -#ifdef CONFIG_SPARSEMEM_VMEMMAP - /* We try to use 16M pages for vmemmap if that is supported - * and we have at least 1G of RAM at boot - */ - if (mmu_psize_defs[MMU_PAGE_16M].shift && - memblock_phys_mem_size() >= 0x40000000) - mmu_vmemmap_psize = MMU_PAGE_16M; - else if (mmu_psize_defs[MMU_PAGE_64K].shift) - mmu_vmemmap_psize = MMU_PAGE_64K; - else - mmu_vmemmap_psize = MMU_PAGE_4K; -#endif /* CONFIG_SPARSEMEM_VMEMMAP */ - - printk(KERN_DEBUG "Page orders: linear mapping = %d, " - "virtual = %d, io = %d" -#ifdef CONFIG_SPARSEMEM_VMEMMAP - ", vmemmap = %d" -#endif - "\n", - mmu_psize_defs[mmu_linear_psize].shift, - mmu_psize_defs[mmu_virtual_psize].shift, - mmu_psize_defs[mmu_io_psize].shift -#ifdef CONFIG_SPARSEMEM_VMEMMAP - ,mmu_psize_defs[mmu_vmemmap_psize].shift -#endif - ); -} - -static int __init htab_dt_scan_pftsize(unsigned long node, - const char *uname, int depth, - void *data) -{ - const char *type = of_get_flat_dt_prop(node, "device_type", NULL); - const __be32 *prop; - - /* We are scanning "cpu" nodes only */ - if (type == NULL || strcmp(type, "cpu") != 0) - return 0; - - prop = of_get_flat_dt_prop(node, "ibm,pft-size", NULL); - if (prop != NULL) { - /* pft_size[0] is the NUMA CEC cookie */ - ppc64_pft_size = be32_to_cpu(prop[1]); - return 1; - } - return 0; -} - -unsigned htab_shift_for_mem_size(unsigned long mem_size) -{ - unsigned memshift = __ilog2(mem_size); - unsigned pshift = mmu_psize_defs[mmu_virtual_psize].shift; - unsigned pteg_shift; - - /* round mem_size up to next power of 2 */ - if ((1UL << memshift) < mem_size) - memshift += 1; - - /* aim for 2 pages / pteg */ - pteg_shift = memshift - (pshift + 1); - - /* - * 2^11 PTEGS of 128 bytes each, ie. 2^18 bytes is the minimum htab - * size permitted by the architecture. - */ - return max(pteg_shift + 7, 18U); -} - -static unsigned long __init htab_get_table_size(void) -{ - /* If hash size isn't already provided by the platform, we try to - * retrieve it from the device-tree. 
If it's not there neither, we - * calculate it now based on the total RAM size - */ - if (ppc64_pft_size == 0) - of_scan_flat_dt(htab_dt_scan_pftsize, NULL); - if (ppc64_pft_size) - return 1UL << ppc64_pft_size; - - return 1UL << htab_shift_for_mem_size(memblock_phys_mem_size()); -} - -#ifdef CONFIG_MEMORY_HOTPLUG -int resize_hpt_for_hotplug(unsigned long new_mem_size) -{ - unsigned target_hpt_shift; - - if (!mmu_hash_ops.resize_hpt) - return 0; - - target_hpt_shift = htab_shift_for_mem_size(new_mem_size); - - /* - * To avoid lots of HPT resizes if memory size is fluctuating - * across a boundary, we deliberately have some hysterisis - * here: we immediately increase the HPT size if the target - * shift exceeds the current shift, but we won't attempt to - * reduce unless the target shift is at least 2 below the - * current shift - */ - if (target_hpt_shift > ppc64_pft_size || - target_hpt_shift < ppc64_pft_size - 1) - return mmu_hash_ops.resize_hpt(target_hpt_shift); - - return 0; -} - -int hash__create_section_mapping(unsigned long start, unsigned long end, int nid) -{ - int rc; - - if (end >= H_VMALLOC_START) { - pr_warn("Outside the supported range\n"); - return -1; - } - - rc = htab_bolt_mapping(start, end, __pa(start), - pgprot_val(PAGE_KERNEL), mmu_linear_psize, - mmu_kernel_ssize); - - if (rc < 0) { - int rc2 = htab_remove_mapping(start, end, mmu_linear_psize, - mmu_kernel_ssize); - BUG_ON(rc2 && (rc2 != -ENOENT)); - } - return rc; -} - -int hash__remove_section_mapping(unsigned long start, unsigned long end) -{ - int rc = htab_remove_mapping(start, end, mmu_linear_psize, - mmu_kernel_ssize); - WARN_ON(rc < 0); - return rc; -} -#endif /* CONFIG_MEMORY_HOTPLUG */ - -static void __init hash_init_partition_table(phys_addr_t hash_table, - unsigned long htab_size) -{ - mmu_partition_table_init(); - - /* - * PS field (VRMA page size) is not used for LPID 0, hence set to 0. - * For now, UPRT is 0 and we have no segment table. - */ - htab_size = __ilog2(htab_size) - 18; - mmu_partition_table_set_entry(0, hash_table | htab_size, 0); - pr_info("Partition table %p\n", partition_tb); -} - -static void __init htab_initialize(void) -{ - unsigned long table; - unsigned long pteg_count; - unsigned long prot; - unsigned long base = 0, size = 0; - struct memblock_region *reg; - - DBG(" -> htab_initialize()\n"); - - if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) { - mmu_kernel_ssize = MMU_SEGSIZE_1T; - mmu_highuser_ssize = MMU_SEGSIZE_1T; - printk(KERN_INFO "Using 1TB segments\n"); - } - - /* - * Calculate the required size of the htab. We want the number of - * PTEGs to equal one half the number of real pages. - */ - htab_size_bytes = htab_get_table_size(); - pteg_count = htab_size_bytes >> 7; - - htab_hash_mask = pteg_count - 1; - - if (firmware_has_feature(FW_FEATURE_LPAR) || - firmware_has_feature(FW_FEATURE_PS3_LV1)) { - /* Using a hypervisor which owns the htab */ - htab_address = NULL; - _SDR1 = 0; - /* - * On POWER9, we need to do a H_REGISTER_PROC_TBL hcall - * to inform the hypervisor that we wish to use the HPT. - */ - if (cpu_has_feature(CPU_FTR_ARCH_300)) - register_process_table(0, 0, 0); -#ifdef CONFIG_FA_DUMP - /* - * If firmware assisted dump is active firmware preserves - * the contents of htab along with entire partition memory. - * Clear the htab if firmware assisted dump is active so - * that we dont end up using old mappings. 
- */ - if (is_fadump_active() && mmu_hash_ops.hpte_clear_all) - mmu_hash_ops.hpte_clear_all(); -#endif - } else { - unsigned long limit = MEMBLOCK_ALLOC_ANYWHERE; - -#ifdef CONFIG_PPC_CELL - /* - * Cell may require the hash table down low when using the - * Axon IOMMU in order to fit the dynamic region over it, see - * comments in cell/iommu.c - */ - if (fdt_subnode_offset(initial_boot_params, 0, "axon") > 0) { - limit = 0x80000000; - pr_info("Hash table forced below 2G for Axon IOMMU\n"); - } -#endif /* CONFIG_PPC_CELL */ - - table = memblock_phys_alloc_range(htab_size_bytes, - htab_size_bytes, - 0, limit); - if (!table) - panic("ERROR: Failed to allocate %pa bytes below %pa\n", - &htab_size_bytes, &limit); - - DBG("Hash table allocated at %lx, size: %lx\n", table, - htab_size_bytes); - - htab_address = __va(table); - - /* htab absolute addr + encoded htabsize */ - _SDR1 = table + __ilog2(htab_size_bytes) - 18; - - /* Initialize the HPT with no entries */ - memset((void *)table, 0, htab_size_bytes); - - if (!cpu_has_feature(CPU_FTR_ARCH_300)) - /* Set SDR1 */ - mtspr(SPRN_SDR1, _SDR1); - else - hash_init_partition_table(table, htab_size_bytes); - } - - prot = pgprot_val(PAGE_KERNEL); - -#ifdef CONFIG_DEBUG_PAGEALLOC - if (debug_pagealloc_enabled()) { - linear_map_hash_count = memblock_end_of_DRAM() >> PAGE_SHIFT; - linear_map_hash_slots = memblock_alloc_try_nid( - linear_map_hash_count, 1, MEMBLOCK_LOW_LIMIT, - ppc64_rma_size, NUMA_NO_NODE); - if (!linear_map_hash_slots) - panic("%s: Failed to allocate %lu bytes max_addr=%pa\n", - __func__, linear_map_hash_count, &ppc64_rma_size); - } -#endif /* CONFIG_DEBUG_PAGEALLOC */ - - /* create bolted the linear mapping in the hash table */ - for_each_memblock(memory, reg) { - base = (unsigned long)__va(reg->base); - size = reg->size; - - DBG("creating mapping for region: %lx..%lx (prot: %lx)\n", - base, size, prot); - - if ((base + size) >= H_VMALLOC_START) { - pr_warn("Outside the supported range\n"); - continue; - } - - BUG_ON(htab_bolt_mapping(base, base + size, __pa(base), - prot, mmu_linear_psize, mmu_kernel_ssize)); - } - memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); - - /* - * If we have a memory_limit and we've allocated TCEs then we need to - * explicitly map the TCE area at the top of RAM. We also cope with the - * case that the TCEs start below memory_limit. - * tce_alloc_start/end are 16MB aligned so the mapping should work - * for either 4K or 16MB pages. - */ - if (tce_alloc_start) { - tce_alloc_start = (unsigned long)__va(tce_alloc_start); - tce_alloc_end = (unsigned long)__va(tce_alloc_end); - - if (base + size >= tce_alloc_start) - tce_alloc_start = base + size + 1; - - BUG_ON(htab_bolt_mapping(tce_alloc_start, tce_alloc_end, - __pa(tce_alloc_start), prot, - mmu_linear_psize, mmu_kernel_ssize)); - } - - - DBG(" <- htab_initialize()\n"); -} -#undef KB -#undef MB - -void __init hash__early_init_devtree(void) -{ - /* Initialize segment sizes */ - of_scan_flat_dt(htab_dt_scan_seg_sizes, NULL); - - /* Initialize page sizes */ - htab_scan_page_sizes(); -} - -struct hash_mm_context init_hash_mm_context; -void __init hash__early_init_mmu(void) -{ -#ifndef CONFIG_PPC_64K_PAGES - /* - * We have code in __hash_page_4K() and elsewhere, which assumes it can - * do the following: - * new_pte |= (slot << H_PAGE_F_GIX_SHIFT) & (H_PAGE_F_SECOND | H_PAGE_F_GIX); - * - * Where the slot number is between 0-15, and values of 8-15 indicate - * the secondary bucket. 
For that code to work H_PAGE_F_SECOND and - * H_PAGE_F_GIX must occupy four contiguous bits in the PTE, and - * H_PAGE_F_SECOND must be placed above H_PAGE_F_GIX. Assert that here - * with a BUILD_BUG_ON(). - */ - BUILD_BUG_ON(H_PAGE_F_SECOND != (1ul << (H_PAGE_F_GIX_SHIFT + 3))); -#endif /* CONFIG_PPC_64K_PAGES */ - - htab_init_page_sizes(); - - /* - * initialize page table size - */ - __pte_frag_nr = H_PTE_FRAG_NR; - __pte_frag_size_shift = H_PTE_FRAG_SIZE_SHIFT; - __pmd_frag_nr = H_PMD_FRAG_NR; - __pmd_frag_size_shift = H_PMD_FRAG_SIZE_SHIFT; - - __pte_index_size = H_PTE_INDEX_SIZE; - __pmd_index_size = H_PMD_INDEX_SIZE; - __pud_index_size = H_PUD_INDEX_SIZE; - __pgd_index_size = H_PGD_INDEX_SIZE; - __pud_cache_index = H_PUD_CACHE_INDEX; - __pte_table_size = H_PTE_TABLE_SIZE; - __pmd_table_size = H_PMD_TABLE_SIZE; - __pud_table_size = H_PUD_TABLE_SIZE; - __pgd_table_size = H_PGD_TABLE_SIZE; - /* - * 4k use hugepd format, so for hash set then to - * zero - */ - __pmd_val_bits = HASH_PMD_VAL_BITS; - __pud_val_bits = HASH_PUD_VAL_BITS; - __pgd_val_bits = HASH_PGD_VAL_BITS; - - __kernel_virt_start = H_KERN_VIRT_START; - __vmalloc_start = H_VMALLOC_START; - __vmalloc_end = H_VMALLOC_END; - __kernel_io_start = H_KERN_IO_START; - __kernel_io_end = H_KERN_IO_END; - vmemmap = (struct page *)H_VMEMMAP_START; - ioremap_bot = IOREMAP_BASE; - -#ifdef CONFIG_PCI - pci_io_base = ISA_IO_BASE; -#endif - - /* Select appropriate backend */ - if (firmware_has_feature(FW_FEATURE_PS3_LV1)) - ps3_early_mm_init(); - else if (firmware_has_feature(FW_FEATURE_LPAR)) - hpte_init_pseries(); - else if (IS_ENABLED(CONFIG_PPC_NATIVE)) - hpte_init_native(); - - if (!mmu_hash_ops.hpte_insert) - panic("hash__early_init_mmu: No MMU hash ops defined!\n"); - - /* Initialize the MMU Hash table and create the linear mapping - * of memory. Has to be done before SLB initialization as this is - * currently where the page size encoding is obtained. 
- */ - htab_initialize(); - - init_mm.context.hash_context = &init_hash_mm_context; - init_mm.context.hash_context->slb_addr_limit = DEFAULT_MAP_WINDOW_USER64; - - pr_info("Initializing hash mmu with SLB\n"); - /* Initialize SLB management */ - slb_initialize(); - - if (cpu_has_feature(CPU_FTR_ARCH_206) - && cpu_has_feature(CPU_FTR_HVMODE)) - tlbiel_all(); -} - -#ifdef CONFIG_SMP -void hash__early_init_mmu_secondary(void) -{ - /* Initialize hash table for that CPU */ - if (!firmware_has_feature(FW_FEATURE_LPAR)) { - - if (!cpu_has_feature(CPU_FTR_ARCH_300)) - mtspr(SPRN_SDR1, _SDR1); - else - mtspr(SPRN_PTCR, - __pa(partition_tb) | (PATB_SIZE_SHIFT - 12)); - } - /* Initialize SLB */ - slb_initialize(); - - if (cpu_has_feature(CPU_FTR_ARCH_206) - && cpu_has_feature(CPU_FTR_HVMODE)) - tlbiel_all(); -} -#endif /* CONFIG_SMP */ - -/* - * Called by asm hashtable.S for doing lazy icache flush - */ -unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap) -{ - struct page *page; - - if (!pfn_valid(pte_pfn(pte))) - return pp; - - page = pte_page(pte); - - /* page is dirty */ - if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) { - if (trap == 0x400) { - flush_dcache_icache_page(page); - set_bit(PG_arch_1, &page->flags); - } else - pp |= HPTE_R_N; - } - return pp; -} - -#ifdef CONFIG_PPC_MM_SLICES -static unsigned int get_paca_psize(unsigned long addr) -{ - unsigned char *psizes; - unsigned long index, mask_index; - - if (addr < SLICE_LOW_TOP) { - psizes = get_paca()->mm_ctx_low_slices_psize; - index = GET_LOW_SLICE_INDEX(addr); - } else { - psizes = get_paca()->mm_ctx_high_slices_psize; - index = GET_HIGH_SLICE_INDEX(addr); - } - mask_index = index & 0x1; - return (psizes[index >> 1] >> (mask_index * 4)) & 0xF; -} - -#else -unsigned int get_paca_psize(unsigned long addr) -{ - return get_paca()->mm_ctx_user_psize; -} -#endif - -/* - * Demote a segment to using 4k pages. - * For now this makes the whole process use 4k pages. - */ -#ifdef CONFIG_PPC_64K_PAGES -void demote_segment_4k(struct mm_struct *mm, unsigned long addr) -{ - if (get_slice_psize(mm, addr) == MMU_PAGE_4K) - return; - slice_set_range_psize(mm, addr, 1, MMU_PAGE_4K); - copro_flush_all_slbs(mm); - if ((get_paca_psize(addr) != MMU_PAGE_4K) && (current->mm == mm)) { - - copy_mm_to_paca(mm); - slb_flush_and_restore_bolted(); - } -} -#endif /* CONFIG_PPC_64K_PAGES */ - -#ifdef CONFIG_PPC_SUBPAGE_PROT -/* - * This looks up a 2-bit protection code for a 4k subpage of a 64k page. - * Userspace sets the subpage permissions using the subpage_prot system call. - * - * Result is 0: full permissions, _PAGE_RW: read-only, - * _PAGE_RWX: no access. - */ -static int subpage_protection(struct mm_struct *mm, unsigned long ea) -{ - struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context); - u32 spp = 0; - u32 **sbpm, *sbpp; - - if (!spt) - return 0; - - if (ea >= spt->maxaddr) - return 0; - if (ea < 0x100000000UL) { - /* addresses below 4GB use spt->low_prot */ - sbpm = spt->low_prot; - } else { - sbpm = spt->protptrs[ea >> SBP_L3_SHIFT]; - if (!sbpm) - return 0; - } - sbpp = sbpm[(ea >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1)]; - if (!sbpp) - return 0; - spp = sbpp[(ea >> PAGE_SHIFT) & (SBP_L1_COUNT - 1)]; - - /* extract 2-bit bitfield for this 4k subpage */ - spp >>= 30 - 2 * ((ea >> 12) & 0xf); - - /* - * 0 -> full premission - * 1 -> Read only - * 2 -> no access. - * We return the flag that need to be cleared. - */ - spp = ((spp & 2) ? _PAGE_RWX : 0) | ((spp & 1) ? 
_PAGE_WRITE : 0); - return spp; -} - -#else /* CONFIG_PPC_SUBPAGE_PROT */ -static inline int subpage_protection(struct mm_struct *mm, unsigned long ea) -{ - return 0; -} -#endif - -void hash_failure_debug(unsigned long ea, unsigned long access, - unsigned long vsid, unsigned long trap, - int ssize, int psize, int lpsize, unsigned long pte) -{ - if (!printk_ratelimit()) - return; - pr_info("mm: Hashing failure ! EA=0x%lx access=0x%lx current=%s\n", - ea, access, current->comm); - pr_info(" trap=0x%lx vsid=0x%lx ssize=%d base psize=%d psize %d pte=0x%lx\n", - trap, vsid, ssize, psize, lpsize, pte); -} - -static void check_paca_psize(unsigned long ea, struct mm_struct *mm, - int psize, bool user_region) -{ - if (user_region) { - if (psize != get_paca_psize(ea)) { - copy_mm_to_paca(mm); - slb_flush_and_restore_bolted(); - } - } else if (get_paca()->vmalloc_sllp != - mmu_psize_defs[mmu_vmalloc_psize].sllp) { - get_paca()->vmalloc_sllp = - mmu_psize_defs[mmu_vmalloc_psize].sllp; - slb_vmalloc_update(); - } -} - -/* Result code is: - * 0 - handled - * 1 - normal page fault - * -1 - critical hash insertion error - * -2 - access not permitted by subpage protection mechanism - */ -int hash_page_mm(struct mm_struct *mm, unsigned long ea, - unsigned long access, unsigned long trap, - unsigned long flags) -{ - bool is_thp; - enum ctx_state prev_state = exception_enter(); - pgd_t *pgdir; - unsigned long vsid; - pte_t *ptep; - unsigned hugeshift; - int rc, user_region = 0; - int psize, ssize; - - DBG_LOW("hash_page(ea=%016lx, access=%lx, trap=%lx\n", - ea, access, trap); - trace_hash_fault(ea, access, trap); - - /* Get region & vsid */ - switch (get_region_id(ea)) { - case USER_REGION_ID: - user_region = 1; - if (! mm) { - DBG_LOW(" user region with no mm !\n"); - rc = 1; - goto bail; - } - psize = get_slice_psize(mm, ea); - ssize = user_segment_size(ea); - vsid = get_user_vsid(&mm->context, ea, ssize); - break; - case VMALLOC_REGION_ID: - vsid = get_kernel_vsid(ea, mmu_kernel_ssize); - psize = mmu_vmalloc_psize; - ssize = mmu_kernel_ssize; - break; - - case IO_REGION_ID: - vsid = get_kernel_vsid(ea, mmu_kernel_ssize); - psize = mmu_io_psize; - ssize = mmu_kernel_ssize; - break; - default: - /* Not a valid range - * Send the problem up to do_page_fault - */ - rc = 1; - goto bail; - } - DBG_LOW(" mm=%p, mm->pgdir=%p, vsid=%016lx\n", mm, mm->pgd, vsid); - - /* Bad address. */ - if (!vsid) { - DBG_LOW("Bad address!\n"); - rc = 1; - goto bail; - } - /* Get pgdir */ - pgdir = mm->pgd; - if (pgdir == NULL) { - rc = 1; - goto bail; - } - - /* Check CPU locality */ - if (user_region && mm_is_thread_local(mm)) - flags |= HPTE_LOCAL_UPDATE; - -#ifndef CONFIG_PPC_64K_PAGES - /* If we use 4K pages and our psize is not 4K, then we might - * be hitting a special driver mapping, and need to align the - * address before we fetch the PTE. - * - * It could also be a hugepage mapping, in which case this is - * not necessary, but it's not harmful, either. 
- */ - if (psize != MMU_PAGE_4K) - ea &= ~((1ul << mmu_psize_defs[psize].shift) - 1); -#endif /* CONFIG_PPC_64K_PAGES */ - - /* Get PTE and page size from page tables */ - ptep = find_linux_pte(pgdir, ea, &is_thp, &hugeshift); - if (ptep == NULL || !pte_present(*ptep)) { - DBG_LOW(" no PTE !\n"); - rc = 1; - goto bail; - } - - /* Add _PAGE_PRESENT to the required access perm */ - access |= _PAGE_PRESENT; - - /* Pre-check access permissions (will be re-checked atomically - * in __hash_page_XX but this pre-check is a fast path - */ - if (!check_pte_access(access, pte_val(*ptep))) { - DBG_LOW(" no access !\n"); - rc = 1; - goto bail; - } - - if (hugeshift) { - if (is_thp) - rc = __hash_page_thp(ea, access, vsid, (pmd_t *)ptep, - trap, flags, ssize, psize); -#ifdef CONFIG_HUGETLB_PAGE - else - rc = __hash_page_huge(ea, access, vsid, ptep, trap, - flags, ssize, hugeshift, psize); -#else - else { - /* - * if we have hugeshift, and is not transhuge with - * hugetlb disabled, something is really wrong. - */ - rc = 1; - WARN_ON(1); - } -#endif - if (current->mm == mm) - check_paca_psize(ea, mm, psize, user_region); - - goto bail; - } - -#ifndef CONFIG_PPC_64K_PAGES - DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep)); -#else - DBG_LOW(" i-pte: %016lx %016lx\n", pte_val(*ptep), - pte_val(*(ptep + PTRS_PER_PTE))); -#endif - /* Do actual hashing */ -#ifdef CONFIG_PPC_64K_PAGES - /* If H_PAGE_4K_PFN is set, make sure this is a 4k segment */ - if ((pte_val(*ptep) & H_PAGE_4K_PFN) && psize == MMU_PAGE_64K) { - demote_segment_4k(mm, ea); - psize = MMU_PAGE_4K; - } - - /* If this PTE is non-cacheable and we have restrictions on - * using non cacheable large pages, then we switch to 4k - */ - if (mmu_ci_restrictions && psize == MMU_PAGE_64K && pte_ci(*ptep)) { - if (user_region) { - demote_segment_4k(mm, ea); - psize = MMU_PAGE_4K; - } else if (ea < VMALLOC_END) { - /* - * some driver did a non-cacheable mapping - * in vmalloc space, so switch vmalloc - * to 4k pages - */ - printk(KERN_ALERT "Reducing vmalloc segment " - "to 4kB pages because of " - "non-cacheable mapping\n"); - psize = mmu_vmalloc_psize = MMU_PAGE_4K; - copro_flush_all_slbs(mm); - } - } - -#endif /* CONFIG_PPC_64K_PAGES */ - - if (current->mm == mm) - check_paca_psize(ea, mm, psize, user_region); - -#ifdef CONFIG_PPC_64K_PAGES - if (psize == MMU_PAGE_64K) - rc = __hash_page_64K(ea, access, vsid, ptep, trap, - flags, ssize); - else -#endif /* CONFIG_PPC_64K_PAGES */ - { - int spp = subpage_protection(mm, ea); - if (access & spp) - rc = -2; - else - rc = __hash_page_4K(ea, access, vsid, ptep, trap, - flags, ssize, spp); - } - - /* Dump some info in case of hash insertion failure, they should - * never happen so it is really useful to know if/when they do - */ - if (rc == -1) - hash_failure_debug(ea, access, vsid, trap, ssize, psize, - psize, pte_val(*ptep)); -#ifndef CONFIG_PPC_64K_PAGES - DBG_LOW(" o-pte: %016lx\n", pte_val(*ptep)); -#else - DBG_LOW(" o-pte: %016lx %016lx\n", pte_val(*ptep), - pte_val(*(ptep + PTRS_PER_PTE))); -#endif - DBG_LOW(" -> rc=%d\n", rc); - -bail: - exception_exit(prev_state); - return rc; -} -EXPORT_SYMBOL_GPL(hash_page_mm); - -int hash_page(unsigned long ea, unsigned long access, unsigned long trap, - unsigned long dsisr) -{ - unsigned long flags = 0; - struct mm_struct *mm = current->mm; - - if ((get_region_id(ea) == VMALLOC_REGION_ID) || - (get_region_id(ea) == IO_REGION_ID)) - mm = &init_mm; - - if (dsisr & DSISR_NOHPTE) - flags |= HPTE_NOHPTE_UPDATE; - - return hash_page_mm(mm, ea, access, trap, flags); -} 
-EXPORT_SYMBOL_GPL(hash_page); - -int __hash_page(unsigned long ea, unsigned long msr, unsigned long trap, - unsigned long dsisr) -{ - unsigned long access = _PAGE_PRESENT | _PAGE_READ; - unsigned long flags = 0; - struct mm_struct *mm = current->mm; - unsigned int region_id = get_region_id(ea); - - if ((region_id == VMALLOC_REGION_ID) || (region_id == IO_REGION_ID)) - mm = &init_mm; - - if (dsisr & DSISR_NOHPTE) - flags |= HPTE_NOHPTE_UPDATE; - - if (dsisr & DSISR_ISSTORE) - access |= _PAGE_WRITE; - /* - * We set _PAGE_PRIVILEGED only when - * kernel mode access kernel space. - * - * _PAGE_PRIVILEGED is NOT set - * 1) when kernel mode access user space - * 2) user space access kernel space. - */ - access |= _PAGE_PRIVILEGED; - if ((msr & MSR_PR) || (region_id == USER_REGION_ID)) - access &= ~_PAGE_PRIVILEGED; - - if (trap == 0x400) - access |= _PAGE_EXEC; - - return hash_page_mm(mm, ea, access, trap, flags); -} - -#ifdef CONFIG_PPC_MM_SLICES -static bool should_hash_preload(struct mm_struct *mm, unsigned long ea) -{ - int psize = get_slice_psize(mm, ea); - - /* We only prefault standard pages for now */ - if (unlikely(psize != mm_ctx_user_psize(&mm->context))) - return false; - - /* - * Don't prefault if subpage protection is enabled for the EA. - */ - if (unlikely((psize == MMU_PAGE_4K) && subpage_protection(mm, ea))) - return false; - - return true; -} -#else -static bool should_hash_preload(struct mm_struct *mm, unsigned long ea) -{ - return true; -} -#endif - -void hash_preload(struct mm_struct *mm, unsigned long ea, - bool is_exec, unsigned long trap) -{ - int hugepage_shift; - unsigned long vsid; - pgd_t *pgdir; - pte_t *ptep; - unsigned long flags; - int rc, ssize, update_flags = 0; - unsigned long access = _PAGE_PRESENT | _PAGE_READ | (is_exec ? _PAGE_EXEC : 0); - - BUG_ON(get_region_id(ea) != USER_REGION_ID); - - if (!should_hash_preload(mm, ea)) - return; - - DBG_LOW("hash_preload(mm=%p, mm->pgdir=%p, ea=%016lx, access=%lx," - " trap=%lx\n", mm, mm->pgd, ea, access, trap); - - /* Get Linux PTE if available */ - pgdir = mm->pgd; - if (pgdir == NULL) - return; - - /* Get VSID */ - ssize = user_segment_size(ea); - vsid = get_user_vsid(&mm->context, ea, ssize); - if (!vsid) - return; - /* - * Hash doesn't like irqs. Walking linux page table with irq disabled - * saves us from holding multiple locks. - */ - local_irq_save(flags); - - /* - * THP pages use update_mmu_cache_pmd. We don't do - * hash preload there. Hence can ignore THP here - */ - ptep = find_current_mm_pte(pgdir, ea, NULL, &hugepage_shift); - if (!ptep) - goto out_exit; - - WARN_ON(hugepage_shift); -#ifdef CONFIG_PPC_64K_PAGES - /* If either H_PAGE_4K_PFN or cache inhibited is set (and we are on - * a 64K kernel), then we don't preload, hash_page() will take - * care of it once we actually try to access the page. - * That way we don't have to duplicate all of the logic for segment - * page size demotion here - */ - if ((pte_val(*ptep) & H_PAGE_4K_PFN) || pte_ci(*ptep)) - goto out_exit; -#endif /* CONFIG_PPC_64K_PAGES */ - - /* Is that local to this CPU ? 
*/ - if (mm_is_thread_local(mm)) - update_flags |= HPTE_LOCAL_UPDATE; - - /* Hash it in */ -#ifdef CONFIG_PPC_64K_PAGES - if (mm_ctx_user_psize(&mm->context) == MMU_PAGE_64K) - rc = __hash_page_64K(ea, access, vsid, ptep, trap, - update_flags, ssize); - else -#endif /* CONFIG_PPC_64K_PAGES */ - rc = __hash_page_4K(ea, access, vsid, ptep, trap, update_flags, - ssize, subpage_protection(mm, ea)); - - /* Dump some info in case of hash insertion failure, they should - * never happen so it is really useful to know if/when they do - */ - if (rc == -1) - hash_failure_debug(ea, access, vsid, trap, ssize, - mm_ctx_user_psize(&mm->context), - mm_ctx_user_psize(&mm->context), - pte_val(*ptep)); -out_exit: - local_irq_restore(flags); -} - -#ifdef CONFIG_PPC_MEM_KEYS -/* - * Return the protection key associated with the given address and the - * mm_struct. - */ -u16 get_mm_addr_key(struct mm_struct *mm, unsigned long address) -{ - pte_t *ptep; - u16 pkey = 0; - unsigned long flags; - - if (!mm || !mm->pgd) - return 0; - - local_irq_save(flags); - ptep = find_linux_pte(mm->pgd, address, NULL, NULL); - if (ptep) - pkey = pte_to_pkey_bits(pte_val(READ_ONCE(*ptep))); - local_irq_restore(flags); - - return pkey; -} -#endif /* CONFIG_PPC_MEM_KEYS */ - -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM -static inline void tm_flush_hash_page(int local) -{ - /* - * Transactions are not aborted by tlbiel, only tlbie. Without, syncing a - * page back to a block device w/PIO could pick up transactional data - * (bad!) so we force an abort here. Before the sync the page will be - * made read-only, which will flush_hash_page. BIG ISSUE here: if the - * kernel uses a page from userspace without unmapping it first, it may - * see the speculated version. - */ - if (local && cpu_has_feature(CPU_FTR_TM) && current->thread.regs && - MSR_TM_ACTIVE(current->thread.regs->msr)) { - tm_enable(); - tm_abort(TM_CAUSE_TLBI); - } -} -#else -static inline void tm_flush_hash_page(int local) -{ -} -#endif - -/* - * Return the global hash slot, corresponding to the given PTE, which contains - * the HPTE. - */ -unsigned long pte_get_hash_gslot(unsigned long vpn, unsigned long shift, - int ssize, real_pte_t rpte, unsigned int subpg_index) -{ - unsigned long hash, gslot, hidx; - - hash = hpt_hash(vpn, shift, ssize); - hidx = __rpte_to_hidx(rpte, subpg_index); - if (hidx & _PTEIDX_SECONDARY) - hash = ~hash; - gslot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - gslot += hidx & _PTEIDX_GROUP_IX; - return gslot; -} - -/* WARNING: This is called from hash_low_64.S, if you change this prototype, - * do not forget to update the assembly call site ! 
- */ -void flush_hash_page(unsigned long vpn, real_pte_t pte, int psize, int ssize, - unsigned long flags) -{ - unsigned long index, shift, gslot; - int local = flags & HPTE_LOCAL_UPDATE; - - DBG_LOW("flush_hash_page(vpn=%016lx)\n", vpn); - pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) { - gslot = pte_get_hash_gslot(vpn, shift, ssize, pte, index); - DBG_LOW(" sub %ld: gslot=%lx\n", index, gslot); - /* - * We use same base page size and actual psize, because we don't - * use these functions for hugepage - */ - mmu_hash_ops.hpte_invalidate(gslot, vpn, psize, psize, - ssize, local); - } pte_iterate_hashed_end(); - - tm_flush_hash_page(local); -} - -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -void flush_hash_hugepage(unsigned long vsid, unsigned long addr, - pmd_t *pmdp, unsigned int psize, int ssize, - unsigned long flags) -{ - int i, max_hpte_count, valid; - unsigned long s_addr; - unsigned char *hpte_slot_array; - unsigned long hidx, shift, vpn, hash, slot; - int local = flags & HPTE_LOCAL_UPDATE; - - s_addr = addr & HPAGE_PMD_MASK; - hpte_slot_array = get_hpte_slot_array(pmdp); - /* - * IF we try to do a HUGE PTE update after a withdraw is done. - * we will find the below NULL. This happens when we do - * split_huge_page_pmd - */ - if (!hpte_slot_array) - return; - - if (mmu_hash_ops.hugepage_invalidate) { - mmu_hash_ops.hugepage_invalidate(vsid, s_addr, hpte_slot_array, - psize, ssize, local); - goto tm_abort; - } - /* - * No bluk hpte removal support, invalidate each entry - */ - shift = mmu_psize_defs[psize].shift; - max_hpte_count = HPAGE_PMD_SIZE >> shift; - for (i = 0; i < max_hpte_count; i++) { - /* - * 8 bits per each hpte entries - * 000| [ secondary group (one bit) | hidx (3 bits) | valid bit] - */ - valid = hpte_valid(hpte_slot_array, i); - if (!valid) - continue; - hidx = hpte_hash_index(hpte_slot_array, i); - - /* get the vpn */ - addr = s_addr + (i * (1ul << shift)); - vpn = hpt_vpn(addr, vsid, ssize); - hash = hpt_hash(vpn, shift, ssize); - if (hidx & _PTEIDX_SECONDARY) - hash = ~hash; - - slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - slot += hidx & _PTEIDX_GROUP_IX; - mmu_hash_ops.hpte_invalidate(slot, vpn, psize, - MMU_PAGE_16M, ssize, local); - } -tm_abort: - tm_flush_hash_page(local); -} -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - -void flush_hash_range(unsigned long number, int local) -{ - if (mmu_hash_ops.flush_hash_range) - mmu_hash_ops.flush_hash_range(number, local); - else { - int i; - struct ppc64_tlb_batch *batch = - this_cpu_ptr(&ppc64_tlb_batch); - - for (i = 0; i < number; i++) - flush_hash_page(batch->vpn[i], batch->pte[i], - batch->psize, batch->ssize, local); - } -} - -/* - * low_hash_fault is called when we the low level hash code failed - * to instert a PTE due to an hypervisor error - */ -void low_hash_fault(struct pt_regs *regs, unsigned long address, int rc) -{ - enum ctx_state prev_state = exception_enter(); - - if (user_mode(regs)) { -#ifdef CONFIG_PPC_SUBPAGE_PROT - if (rc == -2) - _exception(SIGSEGV, regs, SEGV_ACCERR, address); - else -#endif - _exception(SIGBUS, regs, BUS_ADRERR, address); - } else - bad_page_fault(regs, address, SIGBUS); - - exception_exit(prev_state); -} - -long hpte_insert_repeating(unsigned long hash, unsigned long vpn, - unsigned long pa, unsigned long rflags, - unsigned long vflags, int psize, int ssize) -{ - unsigned long hpte_group; - long slot; - -repeat: - hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; - - /* Insert into the hash table, primary slot */ - slot = mmu_hash_ops.hpte_insert(hpte_group, 
vpn, pa, rflags, vflags, - psize, psize, ssize); - - /* Primary is full, try the secondary */ - if (unlikely(slot == -1)) { - hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP; - slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, - vflags | HPTE_V_SECONDARY, - psize, psize, ssize); - if (slot == -1) { - if (mftb() & 0x1) - hpte_group = (hash & htab_hash_mask) * - HPTES_PER_GROUP; - - mmu_hash_ops.hpte_remove(hpte_group); - goto repeat; - } - } - - return slot; -} - -#ifdef CONFIG_DEBUG_PAGEALLOC -static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi) -{ - unsigned long hash; - unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize); - unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize); - unsigned long mode = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL)); - long ret; - - hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize); - - /* Don't create HPTE entries for bad address */ - if (!vsid) - return; - - ret = hpte_insert_repeating(hash, vpn, __pa(vaddr), mode, - HPTE_V_BOLTED, - mmu_linear_psize, mmu_kernel_ssize); - - BUG_ON (ret < 0); - spin_lock(&linear_map_hash_lock); - BUG_ON(linear_map_hash_slots[lmi] & 0x80); - linear_map_hash_slots[lmi] = ret | 0x80; - spin_unlock(&linear_map_hash_lock); -} - -static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi) -{ - unsigned long hash, hidx, slot; - unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize); - unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize); - - hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize); - spin_lock(&linear_map_hash_lock); - BUG_ON(!(linear_map_hash_slots[lmi] & 0x80)); - hidx = linear_map_hash_slots[lmi] & 0x7f; - linear_map_hash_slots[lmi] = 0; - spin_unlock(&linear_map_hash_lock); - if (hidx & _PTEIDX_SECONDARY) - hash = ~hash; - slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - slot += hidx & _PTEIDX_GROUP_IX; - mmu_hash_ops.hpte_invalidate(slot, vpn, mmu_linear_psize, - mmu_linear_psize, - mmu_kernel_ssize, 0); -} - -void __kernel_map_pages(struct page *page, int numpages, int enable) -{ - unsigned long flags, vaddr, lmi; - int i; - - local_irq_save(flags); - for (i = 0; i < numpages; i++, page++) { - vaddr = (unsigned long)page_address(page); - lmi = __pa(vaddr) >> PAGE_SHIFT; - if (lmi >= linear_map_hash_count) - continue; - if (enable) - kernel_map_linear_page(vaddr, lmi); - else - kernel_unmap_linear_page(vaddr, lmi); - } - local_irq_restore(flags); -} -#endif /* CONFIG_DEBUG_PAGEALLOC */ - -void hash__setup_initial_memory_limit(phys_addr_t first_memblock_base, - phys_addr_t first_memblock_size) -{ - /* We don't currently support the first MEMBLOCK not mapping 0 - * physical on those processors - */ - BUG_ON(first_memblock_base != 0); - - /* - * On virtualized systems the first entry is our RMA region aka VRMA, - * non-virtualized 64-bit hash MMU systems don't have a limitation - * on real mode access. - * - * For guests on platforms before POWER9, we clamp the it limit to 1G - * to avoid some funky things such as RTAS bugs etc... 
- */ - if (!early_cpu_has_feature(CPU_FTR_HVMODE)) { - ppc64_rma_size = first_memblock_size; - if (!early_cpu_has_feature(CPU_FTR_ARCH_300)) - ppc64_rma_size = min_t(u64, ppc64_rma_size, 0x40000000); - - /* Finally limit subsequent allocations */ - memblock_set_current_limit(ppc64_rma_size); - } else { - ppc64_rma_size = ULONG_MAX; - } -} - -#ifdef CONFIG_DEBUG_FS - -static int hpt_order_get(void *data, u64 *val) -{ - *val = ppc64_pft_size; - return 0; -} - -static int hpt_order_set(void *data, u64 val) -{ - if (!mmu_hash_ops.resize_hpt) - return -ENODEV; - - return mmu_hash_ops.resize_hpt(val); -} - -DEFINE_DEBUGFS_ATTRIBUTE(fops_hpt_order, hpt_order_get, hpt_order_set, "%llu\n"); - -static int __init hash64_debugfs(void) -{ - if (!debugfs_create_file_unsafe("hpt_order", 0600, powerpc_debugfs_root, - NULL, &fops_hpt_order)) { - pr_err("lpar: unable to create hpt_order debugsfs file\n"); - } - - return 0; -} -machine_device_initcall(pseries, hash64_debugfs); -#endif /* CONFIG_DEBUG_FS */ diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c deleted file mode 100644 index dfbc3b32f09b..000000000000 --- a/arch/powerpc/mm/hugepage-hash64.c +++ /dev/null @@ -1,191 +0,0 @@ -/* - * Copyright IBM Corporation, 2013 - * Author Aneesh Kumar K.V - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2.1 of the GNU Lesser General Public License - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - * - */ - -/* - * PPC64 THP Support for hash based MMUs - */ -#include -#include - -int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid, - pmd_t *pmdp, unsigned long trap, unsigned long flags, - int ssize, unsigned int psize) -{ - unsigned int index, valid; - unsigned char *hpte_slot_array; - unsigned long rflags, pa, hidx; - unsigned long old_pmd, new_pmd; - int ret, lpsize = MMU_PAGE_16M; - unsigned long vpn, hash, shift, slot; - - /* - * atomically mark the linux large page PMD busy and dirty - */ - do { - pmd_t pmd = READ_ONCE(*pmdp); - - old_pmd = pmd_val(pmd); - /* If PMD busy, retry the access */ - if (unlikely(old_pmd & H_PAGE_BUSY)) - return 0; - /* If PMD permissions don't match, take page fault */ - if (unlikely(!check_pte_access(access, old_pmd))) - return 1; - /* - * Try to lock the PTE, add ACCESSED and DIRTY if it was - * a write access - */ - new_pmd = old_pmd | H_PAGE_BUSY | _PAGE_ACCESSED; - if (access & _PAGE_WRITE) - new_pmd |= _PAGE_DIRTY; - } while (!pmd_xchg(pmdp, __pmd(old_pmd), __pmd(new_pmd))); - - /* - * Make sure this is thp or devmap entry - */ - if (!(old_pmd & (H_PAGE_THP_HUGE | _PAGE_DEVMAP))) - return 0; - - rflags = htab_convert_pte_flags(new_pmd); - -#if 0 - if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) { - - /* - * No CPU has hugepages but lacks no execute, so we - * don't need to worry about that case - */ - rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); - } -#endif - /* - * Find the slot index details for this ea, using base page size. 
- */ - shift = mmu_psize_defs[psize].shift; - index = (ea & ~HPAGE_PMD_MASK) >> shift; - BUG_ON(index >= PTE_FRAG_SIZE); - - vpn = hpt_vpn(ea, vsid, ssize); - hpte_slot_array = get_hpte_slot_array(pmdp); - if (psize == MMU_PAGE_4K) { - /* - * invalidate the old hpte entry if we have that mapped via 64K - * base page size. This is because demote_segment won't flush - * hash page table entries. - */ - if ((old_pmd & H_PAGE_HASHPTE) && !(old_pmd & H_PAGE_COMBO)) { - flush_hash_hugepage(vsid, ea, pmdp, MMU_PAGE_64K, - ssize, flags); - /* - * With THP, we also clear the slot information with - * respect to all the 64K hash pte mapping the 16MB - * page. They are all invalid now. This make sure we - * don't find the slot valid when we fault with 4k - * base page size. - * - */ - memset(hpte_slot_array, 0, PTE_FRAG_SIZE); - } - } - - valid = hpte_valid(hpte_slot_array, index); - if (valid) { - /* update the hpte bits */ - hash = hpt_hash(vpn, shift, ssize); - hidx = hpte_hash_index(hpte_slot_array, index); - if (hidx & _PTEIDX_SECONDARY) - hash = ~hash; - slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - slot += hidx & _PTEIDX_GROUP_IX; - - ret = mmu_hash_ops.hpte_updatepp(slot, rflags, vpn, - psize, lpsize, ssize, flags); - /* - * We failed to update, try to insert a new entry. - */ - if (ret == -1) { - /* - * large pte is marked busy, so we can be sure - * nobody is looking at hpte_slot_array. hence we can - * safely update this here. - */ - valid = 0; - hpte_slot_array[index] = 0; - } - } - - if (!valid) { - unsigned long hpte_group; - - hash = hpt_hash(vpn, shift, ssize); - /* insert new entry */ - pa = pmd_pfn(__pmd(old_pmd)) << PAGE_SHIFT; - new_pmd |= H_PAGE_HASHPTE; - -repeat: - hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; - - /* Insert into the hash table, primary slot */ - slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0, - psize, lpsize, ssize); - /* - * Primary is full, try the secondary - */ - if (unlikely(slot == -1)) { - hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP; - slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, - rflags, - HPTE_V_SECONDARY, - psize, lpsize, ssize); - if (slot == -1) { - if (mftb() & 0x1) - hpte_group = (hash & htab_hash_mask) * - HPTES_PER_GROUP; - - mmu_hash_ops.hpte_remove(hpte_group); - goto repeat; - } - } - /* - * Hypervisor failure. Restore old pmd and return -1 - * similar to __hash_page_* - */ - if (unlikely(slot == -2)) { - *pmdp = __pmd(old_pmd); - hash_failure_debug(ea, access, vsid, trap, ssize, - psize, lpsize, old_pmd); - return -1; - } - /* - * large pte is marked busy, so we can be sure - * nobody is looking at hpte_slot_array. hence we can - * safely update this here. - */ - mark_hpte_slot_valid(hpte_slot_array, index, slot); - } - /* - * Mark the pte with H_PAGE_COMBO, if we are trying to hash it with - * base page size 4k. - */ - if (psize == MMU_PAGE_4K) - new_pmd |= H_PAGE_COMBO; - /* - * The hpte valid is stored in the pgtable whose address is in the - * second half of the PMD. Order this against clearing of the busy bit in - * huge pmd. - */ - smp_wmb(); - *pmdp = __pmd(new_pmd & ~H_PAGE_BUSY); - return 0; -} diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c deleted file mode 100644 index b0d9209d9a86..000000000000 --- a/arch/powerpc/mm/hugetlbpage-hash64.c +++ /dev/null @@ -1,147 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * PPC64 Huge TLB Page Support for hash based MMUs (POWER4 and later) - * - * Copyright (C) 2003 David Gibson, IBM Corporation. 
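[Illustrative note] __hash_page_thp() above, like flush_hash_hugepage() earlier, keeps one byte per subpage in hpte_slot_array recording whether that subpage has a valid HPTE and, if so, its slot index within the group and whether it landed in the secondary group. A rough userspace sketch of such a per-subpage byte, following the layout described in the comments; the real helpers are hpte_valid(), hpte_hash_index() and mark_hpte_slot_valid(), and the exact encoding here is illustrative only.

/* Sketch of the per-subpage slot byte: bit 0 is the valid bit, the next
 * four bits carry the group slot index and the secondary-group flag
 * (_PTEIDX_GROUP_IX / _PTEIDX_SECONDARY in the real headers). */
#include <stdio.h>

#define SLOT_VALID    0x1
#define GROUP_IX_MASK 0x7
#define SECONDARY     0x8

static unsigned char encode_slot(unsigned int hidx)
{
	return (unsigned char)((hidx << 1) | SLOT_VALID);
}

static int slot_valid(unsigned char b)         { return b & SLOT_VALID; }
static unsigned int slot_hidx(unsigned char b) { return b >> 1; }

int main(void)
{
	unsigned char b = encode_slot(SECONDARY | 5);	/* slot 5, secondary group */

	printf("valid=%d group_ix=%u secondary=%u\n",
	       slot_valid(b),
	       slot_hidx(b) & GROUP_IX_MASK,
	       (slot_hidx(b) & SECONDARY) ? 1u : 0u);
	return 0;
}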
- * - * Based on the IA-32 version: - * Copyright (C) 2002, Rohit Seth - */ - -#include -#include -#include -#include -#include -#include - -extern long hpte_insert_repeating(unsigned long hash, unsigned long vpn, - unsigned long pa, unsigned long rlags, - unsigned long vflags, int psize, int ssize); - -int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, - pte_t *ptep, unsigned long trap, unsigned long flags, - int ssize, unsigned int shift, unsigned int mmu_psize) -{ - real_pte_t rpte; - unsigned long vpn; - unsigned long old_pte, new_pte; - unsigned long rflags, pa; - long slot, offset; - - BUG_ON(shift != mmu_psize_defs[mmu_psize].shift); - - /* Search the Linux page table for a match with va */ - vpn = hpt_vpn(ea, vsid, ssize); - - /* At this point, we have a pte (old_pte) which can be used to build - * or update an HPTE. There are 2 cases: - * - * 1. There is a valid (present) pte with no associated HPTE (this is - * the most common case) - * 2. There is a valid (present) pte with an associated HPTE. The - * current values of the pp bits in the HPTE prevent access - * because we are doing software DIRTY bit management and the - * page is currently not DIRTY. - */ - - - do { - old_pte = pte_val(*ptep); - /* If PTE busy, retry the access */ - if (unlikely(old_pte & H_PAGE_BUSY)) - return 0; - /* If PTE permissions don't match, take page fault */ - if (unlikely(!check_pte_access(access, old_pte))) - return 1; - - /* Try to lock the PTE, add ACCESSED and DIRTY if it was - * a write access */ - new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED; - if (access & _PAGE_WRITE) - new_pte |= _PAGE_DIRTY; - } while(!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); - - /* Make sure this is a hugetlb entry */ - if (old_pte & (H_PAGE_THP_HUGE | _PAGE_DEVMAP)) - return 0; - - rflags = htab_convert_pte_flags(new_pte); - if (unlikely(mmu_psize == MMU_PAGE_16G)) - offset = PTRS_PER_PUD; - else - offset = PTRS_PER_PMD; - rpte = __real_pte(__pte(old_pte), ptep, offset); - - if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) - /* No CPU has hugepages but lacks no execute, so we - * don't need to worry about that case */ - rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); - - /* Check if pte already has an hpte (case 2) */ - if (unlikely(old_pte & H_PAGE_HASHPTE)) { - /* There MIGHT be an HPTE for this pte */ - unsigned long gslot; - - gslot = pte_get_hash_gslot(vpn, shift, ssize, rpte, 0); - if (mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, mmu_psize, - mmu_psize, ssize, flags) == -1) - old_pte &= ~_PAGE_HPTEFLAGS; - } - - if (likely(!(old_pte & H_PAGE_HASHPTE))) { - unsigned long hash = hpt_hash(vpn, shift, ssize); - - pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; - - /* clear HPTE slot informations in new PTE */ - new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE; - - slot = hpte_insert_repeating(hash, vpn, pa, rflags, 0, - mmu_psize, ssize); - - /* - * Hypervisor failure. 
Restore old pte and return -1 - * similar to __hash_page_* - */ - if (unlikely(slot == -2)) { - *ptep = __pte(old_pte); - hash_failure_debug(ea, access, vsid, trap, ssize, - mmu_psize, mmu_psize, old_pte); - return -1; - } - - new_pte |= pte_set_hidx(ptep, rpte, 0, slot, offset); - } - - /* - * No need to use ldarx/stdcx here - */ - *ptep = __pte(new_pte & ~H_PAGE_BUSY); - return 0; -} - -pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) -{ - unsigned long pte_val; - /* - * Clear the _PAGE_PRESENT so that no hardware parallel update is - * possible. Also keep the pte_present true so that we don't take - * wrong fault. - */ - pte_val = pte_update(vma->vm_mm, addr, ptep, - _PAGE_PRESENT, _PAGE_INVALID, 1); - - return __pte(pte_val); -} - -void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, - pte_t *ptep, pte_t old_pte, pte_t pte) -{ - - if (radix_enabled()) - return radix__huge_ptep_modify_prot_commit(vma, addr, ptep, - old_pte, pte); - set_huge_pte_at(vma->vm_mm, addr, ptep, pte); -} diff --git a/arch/powerpc/mm/hugetlbpage-radix.c b/arch/powerpc/mm/hugetlbpage-radix.c deleted file mode 100644 index cab06331c0c0..000000000000 --- a/arch/powerpc/mm/hugetlbpage-radix.c +++ /dev/null @@ -1,110 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include -#include -#include -#include -#include - -void radix__flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr) -{ - int psize; - struct hstate *hstate = hstate_file(vma->vm_file); - - psize = hstate_get_psize(hstate); - radix__flush_tlb_page_psize(vma->vm_mm, vmaddr, psize); -} - -void radix__local_flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr) -{ - int psize; - struct hstate *hstate = hstate_file(vma->vm_file); - - psize = hstate_get_psize(hstate); - radix__local_flush_tlb_page_psize(vma->vm_mm, vmaddr, psize); -} - -void radix__flush_hugetlb_tlb_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end) -{ - int psize; - struct hstate *hstate = hstate_file(vma->vm_file); - - psize = hstate_get_psize(hstate); - radix__flush_tlb_range_psize(vma->vm_mm, start, end, psize); -} - -/* - * A vairant of hugetlb_get_unmapped_area doing topdown search - * FIXME!! should we do as x86 does or non hugetlb area does ? - * ie, use topdown or not based on mmap_is_legacy check ? - */ -unsigned long -radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, - unsigned long flags) -{ - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - struct hstate *h = hstate_file(file); - int fixed = (flags & MAP_FIXED); - unsigned long high_limit; - struct vm_unmapped_area_info info; - - high_limit = DEFAULT_MAP_WINDOW; - if (addr >= high_limit || (fixed && (addr + len > high_limit))) - high_limit = TASK_SIZE; - - if (len & ~huge_page_mask(h)) - return -EINVAL; - if (len > high_limit) - return -ENOMEM; - - if (fixed) { - if (addr > high_limit - len) - return -ENOMEM; - if (prepare_hugepage_range(file, addr, len)) - return -EINVAL; - return addr; - } - - if (addr) { - addr = ALIGN(addr, huge_page_size(h)); - vma = find_vma(mm, addr); - if (high_limit - len >= addr && addr >= mmap_min_addr && - (!vma || addr + len <= vm_start_gap(vma))) - return addr; - } - /* - * We are always doing an topdown search here. Slice code - * does that too. 
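[Illustrative note] __hash_page_huge() above, like __hash_page_thp() before it, takes a lightweight lock on the Linux PTE by atomically setting H_PAGE_BUSY together with _PAGE_ACCESSED (plus _PAGE_DIRTY for a write access), retrying the exchange until it sticks and bailing out if the entry is already busy. A compact userspace model of that loop using C11 atomics; the flag values are invented for the demo.

/* Userspace model of the PTE "busy lock" cmpxchg loop. */
#include <stdatomic.h>
#include <stdio.h>

#define PAGE_BUSY     0x1UL
#define PAGE_ACCESSED 0x2UL
#define PAGE_DIRTY    0x4UL

static _Atomic unsigned long pte = 0x100UL;	/* some pre-existing PTE bits */

static int lock_pte(int write_access, unsigned long *old_out)
{
	unsigned long old, new;

	do {
		old = atomic_load(&pte);
		if (old & PAGE_BUSY)
			return 0;	/* busy: caller retries the access */
		new = old | PAGE_BUSY | PAGE_ACCESSED;
		if (write_access)
			new |= PAGE_DIRTY;
	} while (!atomic_compare_exchange_weak(&pte, &old, new));

	*old_out = old;
	return 1;
}

int main(void)
{
	unsigned long old;

	if (lock_pte(1, &old))
		printf("locked, old=%#lx new=%#lx\n", old, atomic_load(&pte));
	return 0;
}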
- */ - info.flags = VM_UNMAPPED_AREA_TOPDOWN; - info.length = len; - info.low_limit = max(PAGE_SIZE, mmap_min_addr); - info.high_limit = mm->mmap_base + (high_limit - DEFAULT_MAP_WINDOW); - info.align_mask = PAGE_MASK & ~huge_page_mask(h); - info.align_offset = 0; - - return vm_unmapped_area(&info); -} - -void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep, - pte_t old_pte, pte_t pte) -{ - struct mm_struct *mm = vma->vm_mm; - - /* - * To avoid NMMU hang while relaxing access we need to flush the tlb before - * we set the new value. - */ - if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) && - (atomic_read(&mm->context.copros) > 0)) - radix__flush_hugetlb_page(vma, addr); - - set_huge_pte_at(vma->vm_mm, addr, ptep, pte); -} diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c deleted file mode 100644 index cb2b08635508..000000000000 --- a/arch/powerpc/mm/mmu_context_book3s64.c +++ /dev/null @@ -1,263 +0,0 @@ -/* - * MMU context allocation for 64-bit kernels. - * - * Copyright (C) 2004 Anton Blanchard, IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -static DEFINE_IDA(mmu_context_ida); - -static int alloc_context_id(int min_id, int max_id) -{ - return ida_alloc_range(&mmu_context_ida, min_id, max_id, GFP_KERNEL); -} - -void hash__reserve_context_id(int id) -{ - int result = ida_alloc_range(&mmu_context_ida, id, id, GFP_KERNEL); - - WARN(result != id, "mmu: Failed to reserve context id %d (rc %d)\n", id, result); -} - -int hash__alloc_context_id(void) -{ - unsigned long max; - - if (mmu_has_feature(MMU_FTR_68_BIT_VA)) - max = MAX_USER_CONTEXT; - else - max = MAX_USER_CONTEXT_65BIT_VA; - - return alloc_context_id(MIN_USER_CONTEXT, max); -} -EXPORT_SYMBOL_GPL(hash__alloc_context_id); - -void slb_setup_new_exec(void); - -static int hash__init_new_context(struct mm_struct *mm) -{ - int index; - - index = hash__alloc_context_id(); - if (index < 0) - return index; - - mm->context.hash_context = kmalloc(sizeof(struct hash_mm_context), - GFP_KERNEL); - if (!mm->context.hash_context) { - ida_free(&mmu_context_ida, index); - return -ENOMEM; - } - - /* - * The old code would re-promote on fork, we don't do that when using - * slices as it could cause problem promoting slices that have been - * forced down to 4K. - * - * For book3s we have MMU_NO_CONTEXT set to be ~0. Hence check - * explicitly against context.id == 0. This ensures that we properly - * initialize context slice details for newly allocated mm's (which will - * have id == 0) and don't alter context slice inherited via fork (which - * will have id != 0). - * - * We should not be calling init_new_context() on init_mm. Hence a - * check against 0 is OK. - */ - if (mm->context.id == 0) { - memset(mm->context.hash_context, 0, sizeof(struct hash_mm_context)); - slice_init_new_context_exec(mm); - } else { - /* This is fork. Copy hash_context details from current->mm */ - memcpy(mm->context.hash_context, current->mm->context.hash_context, sizeof(struct hash_mm_context)); -#ifdef CONFIG_PPC_SUBPAGE_PROT - /* inherit subpage prot detalis if we have one. 
*/ - if (current->mm->context.hash_context->spt) { - mm->context.hash_context->spt = kmalloc(sizeof(struct subpage_prot_table), - GFP_KERNEL); - if (!mm->context.hash_context->spt) { - ida_free(&mmu_context_ida, index); - kfree(mm->context.hash_context); - return -ENOMEM; - } - } -#endif - - } - - pkey_mm_init(mm); - return index; -} - -void hash__setup_new_exec(void) -{ - slice_setup_new_exec(); - - slb_setup_new_exec(); -} - -static int radix__init_new_context(struct mm_struct *mm) -{ - unsigned long rts_field; - int index, max_id; - - max_id = (1 << mmu_pid_bits) - 1; - index = alloc_context_id(mmu_base_pid, max_id); - if (index < 0) - return index; - - /* - * set the process table entry, - */ - rts_field = radix__get_tree_size(); - process_tb[index].prtb0 = cpu_to_be64(rts_field | __pa(mm->pgd) | RADIX_PGD_INDEX_SIZE); - - /* - * Order the above store with subsequent update of the PID - * register (at which point HW can start loading/caching - * the entry) and the corresponding load by the MMU from - * the L2 cache. - */ - asm volatile("ptesync;isync" : : : "memory"); - - mm->context.npu_context = NULL; - mm->context.hash_context = NULL; - - return index; -} - -int init_new_context(struct task_struct *tsk, struct mm_struct *mm) -{ - int index; - - if (radix_enabled()) - index = radix__init_new_context(mm); - else - index = hash__init_new_context(mm); - - if (index < 0) - return index; - - mm->context.id = index; - - mm->context.pte_frag = NULL; - mm->context.pmd_frag = NULL; -#ifdef CONFIG_SPAPR_TCE_IOMMU - mm_iommu_init(mm); -#endif - atomic_set(&mm->context.active_cpus, 0); - atomic_set(&mm->context.copros, 0); - - return 0; -} - -void __destroy_context(int context_id) -{ - ida_free(&mmu_context_ida, context_id); -} -EXPORT_SYMBOL_GPL(__destroy_context); - -static void destroy_contexts(mm_context_t *ctx) -{ - int index, context_id; - - for (index = 0; index < ARRAY_SIZE(ctx->extended_id); index++) { - context_id = ctx->extended_id[index]; - if (context_id) - ida_free(&mmu_context_ida, context_id); - } - kfree(ctx->hash_context); -} - -static void pmd_frag_destroy(void *pmd_frag) -{ - int count; - struct page *page; - - page = virt_to_page(pmd_frag); - /* drop all the pending references */ - count = ((unsigned long)pmd_frag & ~PAGE_MASK) >> PMD_FRAG_SIZE_SHIFT; - /* We allow PTE_FRAG_NR fragments from a PTE page */ - if (atomic_sub_and_test(PMD_FRAG_NR - count, &page->pt_frag_refcount)) { - pgtable_pmd_page_dtor(page); - __free_page(page); - } -} - -static void destroy_pagetable_cache(struct mm_struct *mm) -{ - void *frag; - - frag = mm->context.pte_frag; - if (frag) - pte_frag_destroy(frag); - - frag = mm->context.pmd_frag; - if (frag) - pmd_frag_destroy(frag); - return; -} - -void destroy_context(struct mm_struct *mm) -{ -#ifdef CONFIG_SPAPR_TCE_IOMMU - WARN_ON_ONCE(!list_empty(&mm->context.iommu_group_mem_list)); -#endif - if (radix_enabled()) - WARN_ON(process_tb[mm->context.id].prtb0 != 0); - else - subpage_prot_free(mm); - destroy_contexts(&mm->context); - mm->context.id = MMU_NO_CONTEXT; -} - -void arch_exit_mmap(struct mm_struct *mm) -{ - destroy_pagetable_cache(mm); - - if (radix_enabled()) { - /* - * Radix doesn't have a valid bit in the process table - * entries. However we know that at least P9 implementation - * will avoid caching an entry with an invalid RTS field, - * and 0 is invalid. So this will do. - * - * This runs before the "fullmm" tlb flush in exit_mmap, - * which does a RIC=2 tlbie to clear the process table - * entry. See the "fullmm" comments in tlb-radix.c. 
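[Illustrative note] Earlier in this file, radix__init_new_context() builds the first doubleword of the process-table entry by OR-ing the tree-size field, the physical address of the mm's PGD and the root index size, then storing it big-endian. A hedged userspace sketch of that packing; every field value below is a placeholder, not the architected encoding, and htobe64() stands in for cpu_to_be64().

/* Sketch of the prtb0 packing done by radix__init_new_context(). */
#include <stdio.h>
#include <endian.h>
#include <inttypes.h>

int main(void)
{
	uint64_t rts_field  = (0x3UL << 61) | (0x5UL << 5);	/* placeholder RTS split */
	uint64_t pgd_phys   = 0x0000000012340000UL;		/* __pa(mm->pgd), assumed */
	uint64_t index_size = 0x0d;				/* RADIX_PGD_INDEX_SIZE-like */

	uint64_t prtb0 = htobe64(rts_field | pgd_phys | index_size);

	printf("prtb0 (big-endian in-memory form) = 0x%016" PRIx64 "\n", prtb0);
	return 0;
}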
- * - * No barrier required here after the store because - * this process will do the invalidate, which starts with - * ptesync. - */ - process_tb[mm->context.id].prtb0 = 0; - } -} - -#ifdef CONFIG_PPC_RADIX_MMU -void radix__switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) -{ - mtspr(SPRN_PID, next->context.id); - isync(); -} -#endif diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c deleted file mode 100644 index e7a9c4f6bfca..000000000000 --- a/arch/powerpc/mm/mmu_context_iommu.c +++ /dev/null @@ -1,482 +0,0 @@ -/* - * IOMMU helpers in MMU context. - * - * Copyright (C) 2015 IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static DEFINE_MUTEX(mem_list_mutex); - -#define MM_IOMMU_TABLE_GROUP_PAGE_DIRTY 0x1 -#define MM_IOMMU_TABLE_GROUP_PAGE_MASK ~(SZ_4K - 1) - -struct mm_iommu_table_group_mem_t { - struct list_head next; - struct rcu_head rcu; - unsigned long used; - atomic64_t mapped; - unsigned int pageshift; - u64 ua; /* userspace address */ - u64 entries; /* number of entries in hpas/hpages[] */ - /* - * in mm_iommu_get we temporarily use this to store - * struct page address. - * - * We need to convert ua to hpa in real mode. Make it - * simpler by storing physical address. - */ - union { - struct page **hpages; /* vmalloc'ed */ - phys_addr_t *hpas; - }; -#define MM_IOMMU_TABLE_INVALID_HPA ((uint64_t)-1) - u64 dev_hpa; /* Device memory base address */ -}; - -static long mm_iommu_adjust_locked_vm(struct mm_struct *mm, - unsigned long npages, bool incr) -{ - long ret = 0, locked, lock_limit; - - if (!npages) - return 0; - - down_write(&mm->mmap_sem); - - if (incr) { - locked = mm->locked_vm + npages; - lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - if (locked > lock_limit && !capable(CAP_IPC_LOCK)) - ret = -ENOMEM; - else - mm->locked_vm += npages; - } else { - if (WARN_ON_ONCE(npages > mm->locked_vm)) - npages = mm->locked_vm; - mm->locked_vm -= npages; - } - - pr_debug("[%d] RLIMIT_MEMLOCK HASH64 %c%ld %ld/%ld\n", - current ? current->pid : 0, - incr ? '+' : '-', - npages << PAGE_SHIFT, - mm->locked_vm << PAGE_SHIFT, - rlimit(RLIMIT_MEMLOCK)); - up_write(&mm->mmap_sem); - - return ret; -} - -bool mm_iommu_preregistered(struct mm_struct *mm) -{ - return !list_empty(&mm->context.iommu_group_mem_list); -} -EXPORT_SYMBOL_GPL(mm_iommu_preregistered); - -static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, - unsigned long entries, unsigned long dev_hpa, - struct mm_iommu_table_group_mem_t **pmem) -{ - struct mm_iommu_table_group_mem_t *mem; - long i, ret, locked_entries = 0; - unsigned int pageshift; - - mutex_lock(&mem_list_mutex); - - list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, - next) { - /* Overlap? 
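[Illustrative note] mm_iommu_do_alloc() just below starts by rejecting a window that overlaps an already registered one; the comparison is the usual half-open interval overlap test on the userspace addresses. A userspace sketch of that check, using byte lengths rather than entry counts.

/* Half-open interval overlap test, as used by the registration path below. */
#include <stdio.h>
#include <stdbool.h>

static bool regions_overlap(unsigned long ua1, unsigned long len1,
			    unsigned long ua2, unsigned long len2)
{
	return ua1 < ua2 + len2 && ua2 < ua1 + len1;
}

int main(void)
{
	/* 64K region at 0x10000 vs. 64K region at 0x18000: overlap */
	printf("%d\n", regions_overlap(0x10000, 0x10000, 0x18000, 0x10000));
	/* adjacent regions do not overlap */
	printf("%d\n", regions_overlap(0x10000, 0x10000, 0x20000, 0x10000));
	return 0;
}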
*/ - if ((mem->ua < (ua + (entries << PAGE_SHIFT))) && - (ua < (mem->ua + - (mem->entries << PAGE_SHIFT)))) { - ret = -EINVAL; - goto unlock_exit; - } - - } - - if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) { - ret = mm_iommu_adjust_locked_vm(mm, entries, true); - if (ret) - goto unlock_exit; - - locked_entries = entries; - } - - mem = kzalloc(sizeof(*mem), GFP_KERNEL); - if (!mem) { - ret = -ENOMEM; - goto unlock_exit; - } - - if (dev_hpa != MM_IOMMU_TABLE_INVALID_HPA) { - mem->pageshift = __ffs(dev_hpa | (entries << PAGE_SHIFT)); - mem->dev_hpa = dev_hpa; - goto good_exit; - } - mem->dev_hpa = MM_IOMMU_TABLE_INVALID_HPA; - - /* - * For a starting point for a maximum page size calculation - * we use @ua and @entries natural alignment to allow IOMMU pages - * smaller than huge pages but still bigger than PAGE_SIZE. - */ - mem->pageshift = __ffs(ua | (entries << PAGE_SHIFT)); - mem->hpas = vzalloc(array_size(entries, sizeof(mem->hpas[0]))); - if (!mem->hpas) { - kfree(mem); - ret = -ENOMEM; - goto unlock_exit; - } - - down_read(&mm->mmap_sem); - ret = get_user_pages_longterm(ua, entries, FOLL_WRITE, mem->hpages, NULL); - up_read(&mm->mmap_sem); - if (ret != entries) { - /* free the reference taken */ - for (i = 0; i < ret; i++) - put_page(mem->hpages[i]); - - vfree(mem->hpas); - kfree(mem); - ret = -EFAULT; - goto unlock_exit; - } - - pageshift = PAGE_SHIFT; - for (i = 0; i < entries; ++i) { - struct page *page = mem->hpages[i]; - - /* - * Allow to use larger than 64k IOMMU pages. Only do that - * if we are backed by hugetlb. - */ - if ((mem->pageshift > PAGE_SHIFT) && PageHuge(page)) { - struct page *head = compound_head(page); - - pageshift = compound_order(head) + PAGE_SHIFT; - } - mem->pageshift = min(mem->pageshift, pageshift); - /* - * We don't need struct page reference any more, switch - * to physical address. 
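[Illustrative note] When no device memory is involved, mm_iommu_do_alloc() above seeds the maximum IOMMU page shift from the natural alignment of the window: the lowest set bit of (ua | entries << PAGE_SHIFT) bounds the largest page size that could back it, and the value is then clamped further per pinned page for hugetlb backing. A sketch of that alignment trick, with __builtin_ctzl() standing in for __ffs().

/* Largest page shift permitted by the natural alignment of a window. */
#include <stdio.h>

static unsigned int max_pageshift(unsigned long ua, unsigned long size)
{
	return (unsigned int)__builtin_ctzl(ua | size);
}

int main(void)
{
	/* 32M window at a 16M-aligned address: at most 2^24 (16M) pages */
	printf("%u\n", max_pageshift(0x7000000UL, 0x2000000UL));
	/* same window at a 64K-aligned address: at most 2^16 (64K) pages */
	printf("%u\n", max_pageshift(0x7010000UL, 0x2000000UL));
	return 0;
}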
- */ - mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT; - } - -good_exit: - ret = 0; - atomic64_set(&mem->mapped, 1); - mem->used = 1; - mem->ua = ua; - mem->entries = entries; - *pmem = mem; - - list_add_rcu(&mem->next, &mm->context.iommu_group_mem_list); - -unlock_exit: - if (locked_entries && ret) - mm_iommu_adjust_locked_vm(mm, locked_entries, false); - - mutex_unlock(&mem_list_mutex); - - return ret; -} - -long mm_iommu_new(struct mm_struct *mm, unsigned long ua, unsigned long entries, - struct mm_iommu_table_group_mem_t **pmem) -{ - return mm_iommu_do_alloc(mm, ua, entries, MM_IOMMU_TABLE_INVALID_HPA, - pmem); -} -EXPORT_SYMBOL_GPL(mm_iommu_new); - -long mm_iommu_newdev(struct mm_struct *mm, unsigned long ua, - unsigned long entries, unsigned long dev_hpa, - struct mm_iommu_table_group_mem_t **pmem) -{ - return mm_iommu_do_alloc(mm, ua, entries, dev_hpa, pmem); -} -EXPORT_SYMBOL_GPL(mm_iommu_newdev); - -static void mm_iommu_unpin(struct mm_iommu_table_group_mem_t *mem) -{ - long i; - struct page *page = NULL; - - if (!mem->hpas) - return; - - for (i = 0; i < mem->entries; ++i) { - if (!mem->hpas[i]) - continue; - - page = pfn_to_page(mem->hpas[i] >> PAGE_SHIFT); - if (!page) - continue; - - if (mem->hpas[i] & MM_IOMMU_TABLE_GROUP_PAGE_DIRTY) - SetPageDirty(page); - - put_page(page); - mem->hpas[i] = 0; - } -} - -static void mm_iommu_do_free(struct mm_iommu_table_group_mem_t *mem) -{ - - mm_iommu_unpin(mem); - vfree(mem->hpas); - kfree(mem); -} - -static void mm_iommu_free(struct rcu_head *head) -{ - struct mm_iommu_table_group_mem_t *mem = container_of(head, - struct mm_iommu_table_group_mem_t, rcu); - - mm_iommu_do_free(mem); -} - -static void mm_iommu_release(struct mm_iommu_table_group_mem_t *mem) -{ - list_del_rcu(&mem->next); - call_rcu(&mem->rcu, mm_iommu_free); -} - -long mm_iommu_put(struct mm_struct *mm, struct mm_iommu_table_group_mem_t *mem) -{ - long ret = 0; - unsigned long entries, dev_hpa; - - mutex_lock(&mem_list_mutex); - - if (mem->used == 0) { - ret = -ENOENT; - goto unlock_exit; - } - - --mem->used; - /* There are still users, exit */ - if (mem->used) - goto unlock_exit; - - /* Are there still mappings? 
*/ - if (atomic_cmpxchg(&mem->mapped, 1, 0) != 1) { - ++mem->used; - ret = -EBUSY; - goto unlock_exit; - } - - /* @mapped became 0 so now mappings are disabled, release the region */ - entries = mem->entries; - dev_hpa = mem->dev_hpa; - mm_iommu_release(mem); - - if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) - mm_iommu_adjust_locked_vm(mm, entries, false); - -unlock_exit: - mutex_unlock(&mem_list_mutex); - - return ret; -} -EXPORT_SYMBOL_GPL(mm_iommu_put); - -struct mm_iommu_table_group_mem_t *mm_iommu_lookup(struct mm_struct *mm, - unsigned long ua, unsigned long size) -{ - struct mm_iommu_table_group_mem_t *mem, *ret = NULL; - - list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next) { - if ((mem->ua <= ua) && - (ua + size <= mem->ua + - (mem->entries << PAGE_SHIFT))) { - ret = mem; - break; - } - } - - return ret; -} -EXPORT_SYMBOL_GPL(mm_iommu_lookup); - -struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm(struct mm_struct *mm, - unsigned long ua, unsigned long size) -{ - struct mm_iommu_table_group_mem_t *mem, *ret = NULL; - - list_for_each_entry_lockless(mem, &mm->context.iommu_group_mem_list, - next) { - if ((mem->ua <= ua) && - (ua + size <= mem->ua + - (mem->entries << PAGE_SHIFT))) { - ret = mem; - break; - } - } - - return ret; -} - -struct mm_iommu_table_group_mem_t *mm_iommu_get(struct mm_struct *mm, - unsigned long ua, unsigned long entries) -{ - struct mm_iommu_table_group_mem_t *mem, *ret = NULL; - - mutex_lock(&mem_list_mutex); - - list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next) { - if ((mem->ua == ua) && (mem->entries == entries)) { - ret = mem; - ++mem->used; - break; - } - } - - mutex_unlock(&mem_list_mutex); - - return ret; -} -EXPORT_SYMBOL_GPL(mm_iommu_get); - -long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem, - unsigned long ua, unsigned int pageshift, unsigned long *hpa) -{ - const long entry = (ua - mem->ua) >> PAGE_SHIFT; - u64 *va; - - if (entry >= mem->entries) - return -EFAULT; - - if (pageshift > mem->pageshift) - return -EFAULT; - - if (!mem->hpas) { - *hpa = mem->dev_hpa + (ua - mem->ua); - return 0; - } - - va = &mem->hpas[entry]; - *hpa = (*va & MM_IOMMU_TABLE_GROUP_PAGE_MASK) | (ua & ~PAGE_MASK); - - return 0; -} -EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa); - -long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem, - unsigned long ua, unsigned int pageshift, unsigned long *hpa) -{ - const long entry = (ua - mem->ua) >> PAGE_SHIFT; - unsigned long *pa; - - if (entry >= mem->entries) - return -EFAULT; - - if (pageshift > mem->pageshift) - return -EFAULT; - - if (!mem->hpas) { - *hpa = mem->dev_hpa + (ua - mem->ua); - return 0; - } - - pa = (void *) vmalloc_to_phys(&mem->hpas[entry]); - if (!pa) - return -EFAULT; - - *hpa = (*pa & MM_IOMMU_TABLE_GROUP_PAGE_MASK) | (ua & ~PAGE_MASK); - - return 0; -} - -extern void mm_iommu_ua_mark_dirty_rm(struct mm_struct *mm, unsigned long ua) -{ - struct mm_iommu_table_group_mem_t *mem; - long entry; - void *va; - unsigned long *pa; - - mem = mm_iommu_lookup_rm(mm, ua, PAGE_SIZE); - if (!mem) - return; - - if (mem->dev_hpa != MM_IOMMU_TABLE_INVALID_HPA) - return; - - entry = (ua - mem->ua) >> PAGE_SHIFT; - va = &mem->hpas[entry]; - - pa = (void *) vmalloc_to_phys(va); - if (!pa) - return; - - *pa |= MM_IOMMU_TABLE_GROUP_PAGE_DIRTY; -} - -bool mm_iommu_is_devmem(struct mm_struct *mm, unsigned long hpa, - unsigned int pageshift, unsigned long *size) -{ - struct mm_iommu_table_group_mem_t *mem; - unsigned long end; - - list_for_each_entry_rcu(mem, 
&mm->context.iommu_group_mem_list, next) { - if (mem->dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) - continue; - - end = mem->dev_hpa + (mem->entries << PAGE_SHIFT); - if ((mem->dev_hpa <= hpa) && (hpa < end)) { - /* - * Since the IOMMU page size might be bigger than - * PAGE_SIZE, the amount of preregistered memory - * starting from @hpa might be smaller than 1<mapped)) - return 0; - - /* Last mm_iommu_put() has been called, no more mappings allowed() */ - return -ENXIO; -} -EXPORT_SYMBOL_GPL(mm_iommu_mapped_inc); - -void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem) -{ - atomic64_add_unless(&mem->mapped, -1, 1); -} -EXPORT_SYMBOL_GPL(mm_iommu_mapped_dec); - -void mm_iommu_init(struct mm_struct *mm) -{ - INIT_LIST_HEAD_RCU(&mm->context.iommu_group_mem_list); -} diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 6ef36d553cde..57e64273cb33 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -1068,7 +1068,7 @@ u64 memory_hotplug_max(void) /* Virtual Processor Home Node (VPHN) support */ #ifdef CONFIG_PPC_SPLPAR -#include "vphn.h" +#include "book3s64/vphn.h" struct topology_update_data { struct topology_update_data *next; diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c deleted file mode 100644 index 16bda049187a..000000000000 --- a/arch/powerpc/mm/pgtable-book3s64.c +++ /dev/null @@ -1,449 +0,0 @@ -/* - * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include -#include -#include -#include - -#include -#include -#include -#include - -#include -#include - -unsigned long __pmd_frag_nr; -EXPORT_SYMBOL(__pmd_frag_nr); -unsigned long __pmd_frag_size_shift; -EXPORT_SYMBOL(__pmd_frag_size_shift); - -int (*register_process_table)(unsigned long base, unsigned long page_size, - unsigned long tbl_size); - -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -/* - * This is called when relaxing access to a hugepage. It's also called in the page - * fault path when we don't hit any of the major fault cases, ie, a minor - * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have - * handled those two for us, we additionally deal with missing execute - * permission here on some processors - */ -int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmdp, pmd_t entry, int dirty) -{ - int changed; -#ifdef CONFIG_DEBUG_VM - WARN_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp)); - assert_spin_locked(pmd_lockptr(vma->vm_mm, pmdp)); -#endif - changed = !pmd_same(*(pmdp), entry); - if (changed) { - /* - * We can use MMU_PAGE_2M here, because only radix - * path look at the psize. - */ - __ptep_set_access_flags(vma, pmdp_ptep(pmdp), - pmd_pte(entry), address, MMU_PAGE_2M); - } - return changed; -} - -int pmdp_test_and_clear_young(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp) -{ - return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp); -} -/* - * set a new huge pmd. We should not be called for updating - * an existing pmd entry. That should go via pmd_hugepage_update. - */ -void set_pmd_at(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp, pmd_t pmd) -{ -#ifdef CONFIG_DEBUG_VM - /* - * Make sure hardware valid bit is not set. We don't do - * tlb flush for this update. 
- */ - - WARN_ON(pte_hw_valid(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp))); - assert_spin_locked(pmd_lockptr(mm, pmdp)); - WARN_ON(!(pmd_large(pmd) || pmd_devmap(pmd))); -#endif - trace_hugepage_set_pmd(addr, pmd_val(pmd)); - return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd)); -} - -static void do_nothing(void *unused) -{ - -} -/* - * Serialize against find_current_mm_pte which does lock-less - * lookup in page tables with local interrupts disabled. For huge pages - * it casts pmd_t to pte_t. Since format of pte_t is different from - * pmd_t we want to prevent transit from pmd pointing to page table - * to pmd pointing to huge page (and back) while interrupts are disabled. - * We clear pmd to possibly replace it with page table pointer in - * different code paths. So make sure we wait for the parallel - * find_current_mm_pte to finish. - */ -void serialize_against_pte_lookup(struct mm_struct *mm) -{ - smp_mb(); - smp_call_function_many(mm_cpumask(mm), do_nothing, NULL, 1); -} - -/* - * We use this to invalidate a pmdp entry before switching from a - * hugepte to regular pmd entry. - */ -pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmdp) -{ - unsigned long old_pmd; - - old_pmd = pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, _PAGE_INVALID); - flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); - /* - * This ensures that generic code that rely on IRQ disabling - * to prevent a parallel THP split work as expected. - */ - serialize_against_pte_lookup(vma->vm_mm); - return __pmd(old_pmd); -} - -static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot) -{ - return __pmd(pmd_val(pmd) | pgprot_val(pgprot)); -} - -pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot) -{ - unsigned long pmdv; - - pmdv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK; - return pmd_set_protbits(__pmd(pmdv), pgprot); -} - -pmd_t mk_pmd(struct page *page, pgprot_t pgprot) -{ - return pfn_pmd(page_to_pfn(page), pgprot); -} - -pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) -{ - unsigned long pmdv; - - pmdv = pmd_val(pmd); - pmdv &= _HPAGE_CHG_MASK; - return pmd_set_protbits(__pmd(pmdv), newprot); -} - -/* - * This is called at the end of handling a user page fault, when the - * fault has been handled by updating a HUGE PMD entry in the linux page tables. - * We use it to preload an HPTE into the hash table corresponding to - * the updated linux HUGE PMD entry. 
- */ -void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd) -{ - if (radix_enabled()) - prefetch((void *)addr); -} -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - -/* For use by kexec */ -void mmu_cleanup_all(void) -{ - if (radix_enabled()) - radix__mmu_cleanup_all(); - else if (mmu_hash_ops.hpte_clear_all) - mmu_hash_ops.hpte_clear_all(); -} - -#ifdef CONFIG_MEMORY_HOTPLUG -int __meminit create_section_mapping(unsigned long start, unsigned long end, int nid) -{ - if (radix_enabled()) - return radix__create_section_mapping(start, end, nid); - - return hash__create_section_mapping(start, end, nid); -} - -int __meminit remove_section_mapping(unsigned long start, unsigned long end) -{ - if (radix_enabled()) - return radix__remove_section_mapping(start, end); - - return hash__remove_section_mapping(start, end); -} -#endif /* CONFIG_MEMORY_HOTPLUG */ - -void __init mmu_partition_table_init(void) -{ - unsigned long patb_size = 1UL << PATB_SIZE_SHIFT; - unsigned long ptcr; - - BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 36), "Partition table size too large."); - /* Initialize the Partition Table with no entries */ - partition_tb = memblock_alloc(patb_size, patb_size); - if (!partition_tb) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, patb_size, patb_size); - - /* - * update partition table control register, - * 64 K size. - */ - ptcr = __pa(partition_tb) | (PATB_SIZE_SHIFT - 12); - mtspr(SPRN_PTCR, ptcr); - powernv_set_nmmu_ptcr(ptcr); -} - -void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0, - unsigned long dw1) -{ - unsigned long old = be64_to_cpu(partition_tb[lpid].patb0); - - partition_tb[lpid].patb0 = cpu_to_be64(dw0); - partition_tb[lpid].patb1 = cpu_to_be64(dw1); - - /* - * Global flush of TLBs and partition table caches for this lpid. - * The type of flush (hash or radix) depends on what the previous - * use of this partition ID was, not the new use. - */ - asm volatile("ptesync" : : : "memory"); - if (old & PATB_HR) { - asm volatile(PPC_TLBIE_5(%0,%1,2,0,1) : : - "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); - asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : : - "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); - trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 1); - } else { - asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : : - "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); - trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 0); - } - /* do we need fixup here ?*/ - asm volatile("eieio; tlbsync; ptesync" : : : "memory"); -} -EXPORT_SYMBOL_GPL(mmu_partition_table_set_entry); - -static pmd_t *get_pmd_from_cache(struct mm_struct *mm) -{ - void *pmd_frag, *ret; - - if (PMD_FRAG_NR == 1) - return NULL; - - spin_lock(&mm->page_table_lock); - ret = mm->context.pmd_frag; - if (ret) { - pmd_frag = ret + PMD_FRAG_SIZE; - /* - * If we have taken up all the fragments mark PTE page NULL - */ - if (((unsigned long)pmd_frag & ~PAGE_MASK) == 0) - pmd_frag = NULL; - mm->context.pmd_frag = pmd_frag; - } - spin_unlock(&mm->page_table_lock); - return (pmd_t *)ret; -} - -static pmd_t *__alloc_for_pmdcache(struct mm_struct *mm) -{ - void *ret = NULL; - struct page *page; - gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO; - - if (mm == &init_mm) - gfp &= ~__GFP_ACCOUNT; - page = alloc_page(gfp); - if (!page) - return NULL; - if (!pgtable_pmd_page_ctor(page)) { - __free_pages(page, 0); - return NULL; - } - - atomic_set(&page->pt_frag_refcount, 1); - - ret = page_address(page); - /* - * if we support only one fragment just return the - * allocated page. 
- */ - if (PMD_FRAG_NR == 1) - return ret; - - spin_lock(&mm->page_table_lock); - /* - * If we find pgtable_page set, we return - * the allocated page with single fragement - * count. - */ - if (likely(!mm->context.pmd_frag)) { - atomic_set(&page->pt_frag_refcount, PMD_FRAG_NR); - mm->context.pmd_frag = ret + PMD_FRAG_SIZE; - } - spin_unlock(&mm->page_table_lock); - - return (pmd_t *)ret; -} - -pmd_t *pmd_fragment_alloc(struct mm_struct *mm, unsigned long vmaddr) -{ - pmd_t *pmd; - - pmd = get_pmd_from_cache(mm); - if (pmd) - return pmd; - - return __alloc_for_pmdcache(mm); -} - -void pmd_fragment_free(unsigned long *pmd) -{ - struct page *page = virt_to_page(pmd); - - BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0); - if (atomic_dec_and_test(&page->pt_frag_refcount)) { - pgtable_pmd_page_dtor(page); - __free_page(page); - } -} - -static inline void pgtable_free(void *table, int index) -{ - switch (index) { - case PTE_INDEX: - pte_fragment_free(table, 0); - break; - case PMD_INDEX: - pmd_fragment_free(table); - break; - case PUD_INDEX: - kmem_cache_free(PGT_CACHE(PUD_CACHE_INDEX), table); - break; -#if defined(CONFIG_PPC_4K_PAGES) && defined(CONFIG_HUGETLB_PAGE) - /* 16M hugepd directory at pud level */ - case HTLB_16M_INDEX: - BUILD_BUG_ON(H_16M_CACHE_INDEX <= 0); - kmem_cache_free(PGT_CACHE(H_16M_CACHE_INDEX), table); - break; - /* 16G hugepd directory at the pgd level */ - case HTLB_16G_INDEX: - BUILD_BUG_ON(H_16G_CACHE_INDEX <= 0); - kmem_cache_free(PGT_CACHE(H_16G_CACHE_INDEX), table); - break; -#endif - /* We don't free pgd table via RCU callback */ - default: - BUG(); - } -} - -#ifdef CONFIG_SMP -void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int index) -{ - unsigned long pgf = (unsigned long)table; - - BUG_ON(index > MAX_PGTABLE_INDEX_SIZE); - pgf |= index; - tlb_remove_table(tlb, (void *)pgf); -} - -void __tlb_remove_table(void *_table) -{ - void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE); - unsigned int index = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE; - - return pgtable_free(table, index); -} -#else -void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int index) -{ - return pgtable_free(table, index); -} -#endif - -#ifdef CONFIG_PROC_FS -atomic_long_t direct_pages_count[MMU_PAGE_COUNT]; - -void arch_report_meminfo(struct seq_file *m) -{ - /* - * Hash maps the memory with one size mmu_linear_psize. - * So don't bother to print these on hash - */ - if (!radix_enabled()) - return; - seq_printf(m, "DirectMap4k: %8lu kB\n", - atomic_long_read(&direct_pages_count[MMU_PAGE_4K]) << 2); - seq_printf(m, "DirectMap64k: %8lu kB\n", - atomic_long_read(&direct_pages_count[MMU_PAGE_64K]) << 6); - seq_printf(m, "DirectMap2M: %8lu kB\n", - atomic_long_read(&direct_pages_count[MMU_PAGE_2M]) << 11); - seq_printf(m, "DirectMap1G: %8lu kB\n", - atomic_long_read(&direct_pages_count[MMU_PAGE_1G]) << 20); -} -#endif /* CONFIG_PROC_FS */ - -pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, - pte_t *ptep) -{ - unsigned long pte_val; - - /* - * Clear the _PAGE_PRESENT so that no hardware parallel update is - * possible. Also keep the pte_present true so that we don't take - * wrong fault. 
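[Illustrative note] The pmd fragment allocator above hands out PMD_FRAG_NR pieces of a single page and uses the low bits of a fragment pointer to tell which piece it is; get_pmd_from_cache() advances the cursor by PMD_FRAG_SIZE and pmd_frag_destroy() recovers the index from the pointer. A userspace sketch of that index arithmetic; the page and fragment sizes below are assumed for the demo.

/* Fragment-index arithmetic, as used by pmd_frag_destroy(). */
#include <stdio.h>

#define PAGE_SHIFT          16UL		/* 64K pages, assumed */
#define PAGE_SIZE           (1UL << PAGE_SHIFT)
#define PAGE_MASK           (~(PAGE_SIZE - 1))
#define PMD_FRAG_SIZE_SHIFT 12UL		/* 4K fragments, assumed */
#define PMD_FRAG_SIZE       (1UL << PMD_FRAG_SIZE_SHIFT)
#define PMD_FRAG_NR         (PAGE_SIZE >> PMD_FRAG_SIZE_SHIFT)

int main(void)
{
	unsigned long page = 0x7f0000000000UL;		/* page_address(), assumed */
	unsigned long frag = page + 3 * PMD_FRAG_SIZE;	/* the fourth fragment */

	/* same low-bits trick as pmd_frag_destroy(): which fragment is this? */
	unsigned long index = (frag & ~PAGE_MASK) >> PMD_FRAG_SIZE_SHIFT;

	printf("fragment %lu of %lu in this page\n", index, PMD_FRAG_NR);
	printf("references dropped at teardown: %lu\n", PMD_FRAG_NR - index);
	return 0;
}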
- */ - pte_val = pte_update(vma->vm_mm, addr, ptep, _PAGE_PRESENT, _PAGE_INVALID, 0); - - return __pte(pte_val); - -} - -void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, - pte_t *ptep, pte_t old_pte, pte_t pte) -{ - if (radix_enabled()) - return radix__ptep_modify_prot_commit(vma, addr, - ptep, old_pte, pte); - set_pte_at(vma->vm_mm, addr, ptep, pte); -} - -/* - * For hash translation mode, we use the deposited table to store hash slot - * information and they are stored at PTRS_PER_PMD offset from related pmd - * location. Hence a pmd move requires deposit and withdraw. - * - * For radix translation with split pmd ptl, we store the deposited table in the - * pmd page. Hence if we have different pmd page we need to withdraw during pmd - * move. - * - * With hash we use deposited table always irrespective of anon or not. - * With radix we use deposited table only for anonymous mapping. - */ -int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl, - struct spinlock *old_pmd_ptl, - struct vm_area_struct *vma) -{ - if (radix_enabled()) - return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma); - - return true; -} diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c deleted file mode 100644 index 1fd025dba4a3..000000000000 --- a/arch/powerpc/mm/pgtable-hash64.c +++ /dev/null @@ -1,463 +0,0 @@ -/* - * Copyright 2005, Paul Mackerras, IBM Corporation. - * Copyright 2009, Benjamin Herrenschmidt, IBM Corporation. - * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include -#include -#include - -#include -#include -#include -#include -#include - -#include - -#define CREATE_TRACE_POINTS -#include - -#if H_PGTABLE_RANGE > (USER_VSID_RANGE * (TASK_SIZE_USER64 / TASK_CONTEXT_SIZE)) -#warning Limited user VSID range means pagetable space is wasted -#endif - -#ifdef CONFIG_SPARSEMEM_VMEMMAP -/* - * vmemmap is the starting address of the virtual address space where - * struct pages are allocated for all possible PFNs present on the system - * including holes and bad memory (hence sparse). These virtual struct - * pages are stored in sequence in this virtual address space irrespective - * of the fact whether the corresponding PFN is valid or not. This achieves - * constant relationship between address of struct page and its PFN. - * - * During boot or memory hotplug operation when a new memory section is - * added, physical memory allocation (including hash table bolting) will - * be performed for the set of struct pages which are part of the memory - * section. This saves memory by not allocating struct pages for PFNs - * which are not valid. 
- * - * ---------------------------------------------- - * | PHYSICAL ALLOCATION OF VIRTUAL STRUCT PAGES| - * ---------------------------------------------- - * - * f000000000000000 c000000000000000 - * vmemmap +--------------+ +--------------+ - * + | page struct | +--------------> | page struct | - * | +--------------+ +--------------+ - * | | page struct | +--------------> | page struct | - * | +--------------+ | +--------------+ - * | | page struct | + +------> | page struct | - * | +--------------+ | +--------------+ - * | | page struct | | +--> | page struct | - * | +--------------+ | | +--------------+ - * | | page struct | | | - * | +--------------+ | | - * | | page struct | | | - * | +--------------+ | | - * | | page struct | | | - * | +--------------+ | | - * | | page struct | | | - * | +--------------+ | | - * | | page struct | +-------+ | - * | +--------------+ | - * | | page struct | +-----------+ - * | +--------------+ - * | | page struct | No mapping - * | +--------------+ - * | | page struct | No mapping - * v +--------------+ - * - * ----------------------------------------- - * | RELATION BETWEEN STRUCT PAGES AND PFNS| - * ----------------------------------------- - * - * vmemmap +--------------+ +---------------+ - * + | page struct | +-------------> | PFN | - * | +--------------+ +---------------+ - * | | page struct | +-------------> | PFN | - * | +--------------+ +---------------+ - * | | page struct | +-------------> | PFN | - * | +--------------+ +---------------+ - * | | page struct | +-------------> | PFN | - * | +--------------+ +---------------+ - * | | | - * | +--------------+ - * | | | - * | +--------------+ - * | | | - * | +--------------+ +---------------+ - * | | page struct | +-------------> | PFN | - * | +--------------+ +---------------+ - * | | | - * | +--------------+ - * | | | - * | +--------------+ +---------------+ - * | | page struct | +-------------> | PFN | - * | +--------------+ +---------------+ - * | | page struct | +-------------> | PFN | - * v +--------------+ +---------------+ - */ -/* - * On hash-based CPUs, the vmemmap is bolted in the hash table. 
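[Illustrative note] The layout pictured above keeps a constant relation between a PFN and the virtual address of its struct page, which is what makes pfn_to_page()/page_to_pfn() plain arithmetic on vmemmap configurations. A userspace sketch of that relation; the base address follows the diagram and the struct size is a 64-byte stand-in, not the real struct page.

/* Constant pfn <-> struct page address relation on a vmemmap layout. */
#include <stdio.h>

struct page_stub { unsigned long flags, padding[7]; };	/* 64-byte stand-in */

#define VMEMMAP_BASE 0xf000000000000000UL		/* per the diagram above */

static unsigned long pfn_to_page_addr(unsigned long pfn)
{
	return VMEMMAP_BASE + pfn * sizeof(struct page_stub);
}

static unsigned long page_addr_to_pfn(unsigned long addr)
{
	return (addr - VMEMMAP_BASE) / sizeof(struct page_stub);
}

int main(void)
{
	unsigned long pfn = 0x123456;
	unsigned long addr = pfn_to_page_addr(pfn);

	printf("struct page for pfn %#lx lives at %#lx\n", pfn, addr);
	printf("and maps back to pfn %#lx\n", page_addr_to_pfn(addr));
	return 0;
}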
- * - */ -int __meminit hash__vmemmap_create_mapping(unsigned long start, - unsigned long page_size, - unsigned long phys) -{ - int rc; - - if ((start + page_size) >= H_VMEMMAP_END) { - pr_warn("Outside the supported range\n"); - return -1; - } - - rc = htab_bolt_mapping(start, start + page_size, phys, - pgprot_val(PAGE_KERNEL), - mmu_vmemmap_psize, mmu_kernel_ssize); - if (rc < 0) { - int rc2 = htab_remove_mapping(start, start + page_size, - mmu_vmemmap_psize, - mmu_kernel_ssize); - BUG_ON(rc2 && (rc2 != -ENOENT)); - } - return rc; -} - -#ifdef CONFIG_MEMORY_HOTPLUG -void hash__vmemmap_remove_mapping(unsigned long start, - unsigned long page_size) -{ - int rc = htab_remove_mapping(start, start + page_size, - mmu_vmemmap_psize, - mmu_kernel_ssize); - BUG_ON((rc < 0) && (rc != -ENOENT)); - WARN_ON(rc == -ENOENT); -} -#endif -#endif /* CONFIG_SPARSEMEM_VMEMMAP */ - -/* - * map_kernel_page currently only called by __ioremap - * map_kernel_page adds an entry to the ioremap page table - * and adds an entry to the HPT, possibly bolting it - */ -int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) -{ - pgd_t *pgdp; - pud_t *pudp; - pmd_t *pmdp; - pte_t *ptep; - - BUILD_BUG_ON(TASK_SIZE_USER64 > H_PGTABLE_RANGE); - if (slab_is_available()) { - pgdp = pgd_offset_k(ea); - pudp = pud_alloc(&init_mm, pgdp, ea); - if (!pudp) - return -ENOMEM; - pmdp = pmd_alloc(&init_mm, pudp, ea); - if (!pmdp) - return -ENOMEM; - ptep = pte_alloc_kernel(pmdp, ea); - if (!ptep) - return -ENOMEM; - set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, prot)); - } else { - /* - * If the mm subsystem is not fully up, we cannot create a - * linux page table entry for this mapping. Simply bolt an - * entry in the hardware page table. - * - */ - if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, pgprot_val(prot), - mmu_io_psize, mmu_kernel_ssize)) { - printk(KERN_ERR "Failed to do bolted mapping IO " - "memory at %016lx !\n", pa); - return -ENOMEM; - } - } - - smp_wmb(); - return 0; -} - -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - -unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp, unsigned long clr, - unsigned long set) -{ - __be64 old_be, tmp; - unsigned long old; - -#ifdef CONFIG_DEBUG_VM - WARN_ON(!hash__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp)); - assert_spin_locked(pmd_lockptr(mm, pmdp)); -#endif - - __asm__ __volatile__( - "1: ldarx %0,0,%3\n\ - and. %1,%0,%6\n\ - bne- 1b \n\ - andc %1,%0,%4 \n\ - or %1,%1,%7\n\ - stdcx. %1,0,%3 \n\ - bne- 1b" - : "=&r" (old_be), "=&r" (tmp), "=m" (*pmdp) - : "r" (pmdp), "r" (cpu_to_be64(clr)), "m" (*pmdp), - "r" (cpu_to_be64(H_PAGE_BUSY)), "r" (cpu_to_be64(set)) - : "cc" ); - - old = be64_to_cpu(old_be); - - trace_hugepage_update(addr, old, clr, set); - if (old & H_PAGE_HASHPTE) - hpte_do_hugepage_flush(mm, addr, pmdp, old); - return old; -} - -pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmdp) -{ - pmd_t pmd; - - VM_BUG_ON(address & ~HPAGE_PMD_MASK); - VM_BUG_ON(pmd_trans_huge(*pmdp)); - VM_BUG_ON(pmd_devmap(*pmdp)); - - pmd = *pmdp; - pmd_clear(pmdp); - /* - * Wait for all pending hash_page to finish. This is needed - * in case of subpage collapse. When we collapse normal pages - * to hugepage, we first clear the pmd, then invalidate all - * the PTE entries. The assumption here is that any low level - * page fault will see a none pmd and take the slow path that - * will wait on mmap_sem. But we could very well be in a - * hash_page with local ptep pointer value. 
Such a hash page - * can result in adding new HPTE entries for normal subpages. - * That means we could be modifying the page content as we - * copy them to a huge page. So wait for parallel hash_page - * to finish before invalidating HPTE entries. We can do this - * by sending an IPI to all the cpus and executing a dummy - * function there. - */ - serialize_against_pte_lookup(vma->vm_mm); - /* - * Now invalidate the hpte entries in the range - * covered by pmd. This make sure we take a - * fault and will find the pmd as none, which will - * result in a major fault which takes mmap_sem and - * hence wait for collapse to complete. Without this - * the __collapse_huge_page_copy can result in copying - * the old content. - */ - flush_tlb_pmd_range(vma->vm_mm, &pmd, address); - return pmd; -} - -/* - * We want to put the pgtable in pmd and use pgtable for tracking - * the base page size hptes - */ -void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, - pgtable_t pgtable) -{ - pgtable_t *pgtable_slot; - - assert_spin_locked(pmd_lockptr(mm, pmdp)); - /* - * we store the pgtable in the second half of PMD - */ - pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; - *pgtable_slot = pgtable; - /* - * expose the deposited pgtable to other cpus. - * before we set the hugepage PTE at pmd level - * hash fault code looks at the deposted pgtable - * to store hash index values. - */ - smp_wmb(); -} - -pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) -{ - pgtable_t pgtable; - pgtable_t *pgtable_slot; - - assert_spin_locked(pmd_lockptr(mm, pmdp)); - - pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; - pgtable = *pgtable_slot; - /* - * Once we withdraw, mark the entry NULL. - */ - *pgtable_slot = NULL; - /* - * We store HPTE information in the deposited PTE fragment. - * zero out the content on withdraw. - */ - memset(pgtable, 0, PTE_FRAG_SIZE); - return pgtable; -} - -/* - * A linux hugepage PMD was changed and the corresponding hash table entries - * neesd to be flushed. - */ -void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp, unsigned long old_pmd) -{ - int ssize; - unsigned int psize; - unsigned long vsid; - unsigned long flags = 0; - - /* get the base page size,vsid and segment size */ -#ifdef CONFIG_DEBUG_VM - psize = get_slice_psize(mm, addr); - BUG_ON(psize == MMU_PAGE_16M); -#endif - if (old_pmd & H_PAGE_COMBO) - psize = MMU_PAGE_4K; - else - psize = MMU_PAGE_64K; - - if (!is_kernel_addr(addr)) { - ssize = user_segment_size(addr); - vsid = get_user_vsid(&mm->context, addr, ssize); - WARN_ON(vsid == 0); - } else { - vsid = get_kernel_vsid(addr, mmu_kernel_ssize); - ssize = mmu_kernel_ssize; - } - - if (mm_is_thread_local(mm)) - flags |= HPTE_LOCAL_UPDATE; - - return flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags); -} - -pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm, - unsigned long addr, pmd_t *pmdp) -{ - pmd_t old_pmd; - pgtable_t pgtable; - unsigned long old; - pgtable_t *pgtable_slot; - - old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0); - old_pmd = __pmd(old); - /* - * We have pmd == none and we are holding page_table_lock. - * So we can safely go and clear the pgtable hash - * index info. - */ - pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; - pgtable = *pgtable_slot; - /* - * Let's zero out old valid and hash index details - * hash fault look at them. 
- */ - memset(pgtable, 0, PTE_FRAG_SIZE); - /* - * Serialize against find_current_mm_pte variants which does lock-less - * lookup in page tables with local interrupts disabled. For huge pages - * it casts pmd_t to pte_t. Since format of pte_t is different from - * pmd_t we want to prevent transit from pmd pointing to page table - * to pmd pointing to huge page (and back) while interrupts are disabled. - * We clear pmd to possibly replace it with page table pointer in - * different code paths. So make sure we wait for the parallel - * find_curren_mm_pte to finish. - */ - serialize_against_pte_lookup(mm); - return old_pmd; -} - -int hash__has_transparent_hugepage(void) -{ - - if (!mmu_has_feature(MMU_FTR_16M_PAGE)) - return 0; - /* - * We support THP only if PMD_SIZE is 16MB. - */ - if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT) - return 0; - /* - * We need to make sure that we support 16MB hugepage in a segement - * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE - * of 64K. - */ - /* - * If we have 64K HPTE, we will be using that by default - */ - if (mmu_psize_defs[MMU_PAGE_64K].shift && - (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1)) - return 0; - /* - * Ok we only have 4K HPTE - */ - if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1) - return 0; - - return 1; -} -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - -#ifdef CONFIG_STRICT_KERNEL_RWX -static bool hash__change_memory_range(unsigned long start, unsigned long end, - unsigned long newpp) -{ - unsigned long idx; - unsigned int step, shift; - - shift = mmu_psize_defs[mmu_linear_psize].shift; - step = 1 << shift; - - start = ALIGN_DOWN(start, step); - end = ALIGN(end, step); // aligns up - - if (start >= end) - return false; - - pr_debug("Changing page protection on range 0x%lx-0x%lx, to 0x%lx, step 0x%x\n", - start, end, newpp, step); - - for (idx = start; idx < end; idx += step) - /* Not sure if we can do much with the return value */ - mmu_hash_ops.hpte_updateboltedpp(newpp, idx, mmu_linear_psize, - mmu_kernel_ssize); - - return true; -} - -void hash__mark_rodata_ro(void) -{ - unsigned long start, end; - - start = (unsigned long)_stext; - end = (unsigned long)__init_begin; - - WARN_ON(!hash__change_memory_range(start, end, PP_RXXX)); -} - -void hash__mark_initmem_nx(void) -{ - unsigned long start, end, pp; - - start = (unsigned long)__init_begin; - end = (unsigned long)__init_end; - - pp = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL)); - - WARN_ON(!hash__change_memory_range(start, end, pp)); -} -#endif diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c deleted file mode 100644 index fcb0169e2d32..000000000000 --- a/arch/powerpc/mm/pgtable-radix.c +++ /dev/null @@ -1,1124 +0,0 @@ -/* - * Page table handling routines for radix page table. - * - * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
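[Illustrative note] hash__change_memory_range() above first widens the requested range to whole linear-mapping pages: the start is rounded down to the step and the end rounded up before each bolted HPTE is updated via hpte_updateboltedpp(). A small sketch of that rounding, assuming a 16M linear page size; the macros are local stand-ins for the kernel's ALIGN helpers.

/* Rounding a range to the linear-map step before walking it. */
#include <stdio.h>

#define ALIGN_DOWN(x, a) ((x) & ~((a) - 1))
#define ALIGN_UP(x, a)   (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned long step  = 1UL << 24;	/* 16M linear mapping size, assumed */
	unsigned long start = 0xc000000001200000UL;
	unsigned long end   = 0xc000000002345678UL;

	unsigned long s = ALIGN_DOWN(start, step);
	unsigned long e = ALIGN_UP(end, step);

	printf("walk %#lx-%#lx in %#lx steps (%lu updates)\n",
	       s, e, step, (e - s) / step);
	return 0;
}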
- */ - -#define pr_fmt(fmt) "radix-mmu: " fmt - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -unsigned int mmu_pid_bits; -unsigned int mmu_base_pid; - -static int native_register_process_table(unsigned long base, unsigned long pg_sz, - unsigned long table_size) -{ - unsigned long patb0, patb1; - - patb0 = be64_to_cpu(partition_tb[0].patb0); - patb1 = base | table_size | PATB_GR; - - mmu_partition_table_set_entry(0, patb0, patb1); - - return 0; -} - -static __ref void *early_alloc_pgtable(unsigned long size, int nid, - unsigned long region_start, unsigned long region_end) -{ - phys_addr_t min_addr = MEMBLOCK_LOW_LIMIT; - phys_addr_t max_addr = MEMBLOCK_ALLOC_ANYWHERE; - void *ptr; - - if (region_start) - min_addr = region_start; - if (region_end) - max_addr = region_end; - - ptr = memblock_alloc_try_nid(size, size, min_addr, max_addr, nid); - - if (!ptr) - panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa max_addr=%pa\n", - __func__, size, size, nid, &min_addr, &max_addr); - - return ptr; -} - -static int early_map_kernel_page(unsigned long ea, unsigned long pa, - pgprot_t flags, - unsigned int map_page_size, - int nid, - unsigned long region_start, unsigned long region_end) -{ - unsigned long pfn = pa >> PAGE_SHIFT; - pgd_t *pgdp; - pud_t *pudp; - pmd_t *pmdp; - pte_t *ptep; - - pgdp = pgd_offset_k(ea); - if (pgd_none(*pgdp)) { - pudp = early_alloc_pgtable(PUD_TABLE_SIZE, nid, - region_start, region_end); - pgd_populate(&init_mm, pgdp, pudp); - } - pudp = pud_offset(pgdp, ea); - if (map_page_size == PUD_SIZE) { - ptep = (pte_t *)pudp; - goto set_the_pte; - } - if (pud_none(*pudp)) { - pmdp = early_alloc_pgtable(PMD_TABLE_SIZE, nid, - region_start, region_end); - pud_populate(&init_mm, pudp, pmdp); - } - pmdp = pmd_offset(pudp, ea); - if (map_page_size == PMD_SIZE) { - ptep = pmdp_ptep(pmdp); - goto set_the_pte; - } - if (!pmd_present(*pmdp)) { - ptep = early_alloc_pgtable(PAGE_SIZE, nid, - region_start, region_end); - pmd_populate_kernel(&init_mm, pmdp, ptep); - } - ptep = pte_offset_kernel(pmdp, ea); - -set_the_pte: - set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags)); - smp_wmb(); - return 0; -} - -/* - * nid, region_start, and region_end are hints to try to place the page - * table memory in the same node or region. - */ -static int __map_kernel_page(unsigned long ea, unsigned long pa, - pgprot_t flags, - unsigned int map_page_size, - int nid, - unsigned long region_start, unsigned long region_end) -{ - unsigned long pfn = pa >> PAGE_SHIFT; - pgd_t *pgdp; - pud_t *pudp; - pmd_t *pmdp; - pte_t *ptep; - /* - * Make sure task size is correct as per the max adddr - */ - BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE); - -#ifdef CONFIG_PPC_64K_PAGES - BUILD_BUG_ON(RADIX_KERN_MAP_SIZE != (1UL << MAX_EA_BITS_PER_CONTEXT)); -#endif - - if (unlikely(!slab_is_available())) - return early_map_kernel_page(ea, pa, flags, map_page_size, - nid, region_start, region_end); - - /* - * Should make page table allocation functions be able to take a - * node, so we can place kernel page tables on the right nodes after - * boot. 
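A rough standalone model of the allocate-missing-levels walk in early_map_kernel_page()/__map_kernel_page() above, collapsed to two levels; the 21/12 bit split and the 512-entry tables are assumptions for the sketch, not the radix geometry.

/*
 * Two-level stand-in for the pgd/pud/pmd/pte walk: allocate a missing
 * lower table, then write the leaf entry ("set_the_pte").
 */
#include <assert.h>
#include <stdlib.h>

#define SHIFT_TOP  21                   /* assumed 9+9 bit split */
#define SHIFT_LEAF 12
#define ENTRIES    512

typedef struct { void *table[ENTRIES]; } dir_t;          /* like a PGD */
typedef struct { unsigned long pte[ENTRIES]; } leaf_t;   /* like a PTE page */

static unsigned long *walk_alloc(dir_t *top, unsigned long ea)
{
    unsigned long i = (ea >> SHIFT_TOP) % ENTRIES;
    unsigned long j = (ea >> SHIFT_LEAF) % ENTRIES;

    if (!top->table[i])                 /* like pgd_none()/pud_none() */
        top->table[i] = calloc(1, sizeof(leaf_t));
    return &((leaf_t *)top->table[i])->pte[j];
}

int main(void)
{
    dir_t top = { { NULL } };
    unsigned long *ptep = walk_alloc(&top, 0x40201000UL);

    *ptep = 0x1234;                     /* "set_the_pte" */
    assert(*walk_alloc(&top, 0x40201000UL) == 0x1234);
    free(top.table[(0x40201000UL >> SHIFT_TOP) % ENTRIES]);
    return 0;
}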
- */ - pgdp = pgd_offset_k(ea); - pudp = pud_alloc(&init_mm, pgdp, ea); - if (!pudp) - return -ENOMEM; - if (map_page_size == PUD_SIZE) { - ptep = (pte_t *)pudp; - goto set_the_pte; - } - pmdp = pmd_alloc(&init_mm, pudp, ea); - if (!pmdp) - return -ENOMEM; - if (map_page_size == PMD_SIZE) { - ptep = pmdp_ptep(pmdp); - goto set_the_pte; - } - ptep = pte_alloc_kernel(pmdp, ea); - if (!ptep) - return -ENOMEM; - -set_the_pte: - set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags)); - smp_wmb(); - return 0; -} - -int radix__map_kernel_page(unsigned long ea, unsigned long pa, - pgprot_t flags, - unsigned int map_page_size) -{ - return __map_kernel_page(ea, pa, flags, map_page_size, -1, 0, 0); -} - -#ifdef CONFIG_STRICT_KERNEL_RWX -void radix__change_memory_range(unsigned long start, unsigned long end, - unsigned long clear) -{ - unsigned long idx; - pgd_t *pgdp; - pud_t *pudp; - pmd_t *pmdp; - pte_t *ptep; - - start = ALIGN_DOWN(start, PAGE_SIZE); - end = PAGE_ALIGN(end); // aligns up - - pr_debug("Changing flags on range %lx-%lx removing 0x%lx\n", - start, end, clear); - - for (idx = start; idx < end; idx += PAGE_SIZE) { - pgdp = pgd_offset_k(idx); - pudp = pud_alloc(&init_mm, pgdp, idx); - if (!pudp) - continue; - if (pud_huge(*pudp)) { - ptep = (pte_t *)pudp; - goto update_the_pte; - } - pmdp = pmd_alloc(&init_mm, pudp, idx); - if (!pmdp) - continue; - if (pmd_huge(*pmdp)) { - ptep = pmdp_ptep(pmdp); - goto update_the_pte; - } - ptep = pte_alloc_kernel(pmdp, idx); - if (!ptep) - continue; -update_the_pte: - radix__pte_update(&init_mm, idx, ptep, clear, 0, 0); - } - - radix__flush_tlb_kernel_range(start, end); -} - -void radix__mark_rodata_ro(void) -{ - unsigned long start, end; - - start = (unsigned long)_stext; - end = (unsigned long)__init_begin; - - radix__change_memory_range(start, end, _PAGE_WRITE); -} - -void radix__mark_initmem_nx(void) -{ - unsigned long start = (unsigned long)__init_begin; - unsigned long end = (unsigned long)__init_end; - - radix__change_memory_range(start, end, _PAGE_EXEC); -} -#endif /* CONFIG_STRICT_KERNEL_RWX */ - -static inline void __meminit -print_mapping(unsigned long start, unsigned long end, unsigned long size, bool exec) -{ - char buf[10]; - - if (end <= start) - return; - - string_get_size(size, 1, STRING_UNITS_2, buf, sizeof(buf)); - - pr_info("Mapped 0x%016lx-0x%016lx with %s pages%s\n", start, end, buf, - exec ? 
" (exec)" : ""); -} - -static unsigned long next_boundary(unsigned long addr, unsigned long end) -{ -#ifdef CONFIG_STRICT_KERNEL_RWX - if (addr < __pa_symbol(__init_begin)) - return __pa_symbol(__init_begin); -#endif - return end; -} - -static int __meminit create_physical_mapping(unsigned long start, - unsigned long end, - int nid) -{ - unsigned long vaddr, addr, mapping_size = 0; - bool prev_exec, exec = false; - pgprot_t prot; - int psize; - - start = _ALIGN_UP(start, PAGE_SIZE); - for (addr = start; addr < end; addr += mapping_size) { - unsigned long gap, previous_size; - int rc; - - gap = next_boundary(addr, end) - addr; - previous_size = mapping_size; - prev_exec = exec; - - if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE && - mmu_psize_defs[MMU_PAGE_1G].shift) { - mapping_size = PUD_SIZE; - psize = MMU_PAGE_1G; - } else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE && - mmu_psize_defs[MMU_PAGE_2M].shift) { - mapping_size = PMD_SIZE; - psize = MMU_PAGE_2M; - } else { - mapping_size = PAGE_SIZE; - psize = mmu_virtual_psize; - } - - vaddr = (unsigned long)__va(addr); - - if (overlaps_kernel_text(vaddr, vaddr + mapping_size) || - overlaps_interrupt_vector_text(vaddr, vaddr + mapping_size)) { - prot = PAGE_KERNEL_X; - exec = true; - } else { - prot = PAGE_KERNEL; - exec = false; - } - - if (mapping_size != previous_size || exec != prev_exec) { - print_mapping(start, addr, previous_size, prev_exec); - start = addr; - } - - rc = __map_kernel_page(vaddr, addr, prot, mapping_size, nid, start, end); - if (rc) - return rc; - - update_page_count(psize, 1); - } - - print_mapping(start, addr, mapping_size, exec); - return 0; -} - -void __init radix_init_pgtable(void) -{ - unsigned long rts_field; - struct memblock_region *reg; - - /* We don't support slb for radix */ - mmu_slb_size = 0; - /* - * Create the linear mapping, using standard page size for now - */ - for_each_memblock(memory, reg) { - /* - * The memblock allocator is up at this point, so the - * page tables will be allocated within the range. No - * need or a node (which we don't have yet). - */ - - if ((reg->base + reg->size) >= RADIX_VMALLOC_START) { - pr_warn("Outside the supported range\n"); - continue; - } - - WARN_ON(create_physical_mapping(reg->base, - reg->base + reg->size, - -1)); - } - - /* Find out how many PID bits are supported */ - if (cpu_has_feature(CPU_FTR_HVMODE)) { - if (!mmu_pid_bits) - mmu_pid_bits = 20; -#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE - /* - * When KVM is possible, we only use the top half of the - * PID space to avoid collisions between host and guest PIDs - * which can cause problems due to prefetch when exiting the - * guest with AIL=3 - */ - mmu_base_pid = 1 << (mmu_pid_bits - 1); -#else - mmu_base_pid = 1; -#endif - } else { - /* The guest uses the bottom half of the PID space */ - if (!mmu_pid_bits) - mmu_pid_bits = 19; - mmu_base_pid = 1; - } - - /* - * Allocate Partition table and process table for the - * host. - */ - BUG_ON(PRTB_SIZE_SHIFT > 36); - process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT, -1, 0, 0); - /* - * Fill in the process table. - */ - rts_field = radix__get_tree_size(); - process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE); - /* - * Fill in the partition table. We are suppose to use effective address - * of process table here. But our linear mapping also enable us to use - * physical address here. 
- */ - register_process_table(__pa(process_tb), 0, PRTB_SIZE_SHIFT - 12); - pr_info("Process table %p and radix root for kernel: %p\n", process_tb, init_mm.pgd); - asm volatile("ptesync" : : : "memory"); - asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : : - "r" (TLBIEL_INVAL_SET_LPID), "r" (0)); - asm volatile("eieio; tlbsync; ptesync" : : : "memory"); - trace_tlbie(0, 0, TLBIEL_INVAL_SET_LPID, 0, 2, 1, 1); - - /* - * The init_mm context is given the first available (non-zero) PID, - * which is the "guard PID" and contains no page table. PIDR should - * never be set to zero because that duplicates the kernel address - * space at the 0x0... offset (quadrant 0)! - * - * An arbitrary PID that may later be allocated by the PID allocator - * for userspace processes must not be used either, because that - * would cause stale user mappings for that PID on CPUs outside of - * the TLB invalidation scheme (because it won't be in mm_cpumask). - * - * So permanently carve out one PID for the purpose of a guard PID. - */ - init_mm.context.id = mmu_base_pid; - mmu_base_pid++; -} - -static void __init radix_init_partition_table(void) -{ - unsigned long rts_field, dw0; - - mmu_partition_table_init(); - rts_field = radix__get_tree_size(); - dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR; - mmu_partition_table_set_entry(0, dw0, 0); - - pr_info("Initializing Radix MMU\n"); - pr_info("Partition table %p\n", partition_tb); -} - -void __init radix_init_native(void) -{ - register_process_table = native_register_process_table; -} - -static int __init get_idx_from_shift(unsigned int shift) -{ - int idx = -1; - - switch (shift) { - case 0xc: - idx = MMU_PAGE_4K; - break; - case 0x10: - idx = MMU_PAGE_64K; - break; - case 0x15: - idx = MMU_PAGE_2M; - break; - case 0x1e: - idx = MMU_PAGE_1G; - break; - } - return idx; -} - -static int __init radix_dt_scan_page_sizes(unsigned long node, - const char *uname, int depth, - void *data) -{ - int size = 0; - int shift, idx; - unsigned int ap; - const __be32 *prop; - const char *type = of_get_flat_dt_prop(node, "device_type", NULL); - - /* We are scanning "cpu" nodes only */ - if (type == NULL || strcmp(type, "cpu") != 0) - return 0; - - /* Find MMU PID size */ - prop = of_get_flat_dt_prop(node, "ibm,mmu-pid-bits", &size); - if (prop && size == 4) - mmu_pid_bits = be32_to_cpup(prop); - - /* Grab page size encodings */ - prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size); - if (!prop) - return 0; - - pr_info("Page sizes from device-tree:\n"); - for (; size >= 4; size -= 4, ++prop) { - - struct mmu_psize_def *def; - - /* top 3 bit is AP encoding */ - shift = be32_to_cpu(prop[0]) & ~(0xe << 28); - ap = be32_to_cpu(prop[0]) >> 29; - pr_info("Page size shift = %d AP=0x%x\n", shift, ap); - - idx = get_idx_from_shift(shift); - if (idx < 0) - continue; - - def = &mmu_psize_defs[idx]; - def->shift = shift; - def->ap = ap; - } - - /* needed ? 
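The parsing in radix_dt_scan_page_sizes() above packs two fields into each ibm,processor-radix-AP-encodings cell: the AP value in the top three bits and the page-size shift in the rest. A small sketch of that split, with a made-up sample value:

/*
 * Decode one AP-encodings cell; the sample value is illustrative.
 */
#include <assert.h>
#include <stdint.h>

static void decode_ap_cell(uint32_t cell, unsigned int *shift, unsigned int *ap)
{
    *shift = cell & ~(0xeU << 28);      /* clear bits 31..29 */
    *ap = cell >> 29;                   /* keep bits 31..29 */
}

int main(void)
{
    unsigned int shift, ap;

    decode_ap_cell((1U << 29) | 21, &shift, &ap);
    assert(shift == 21 && ap == 1);     /* i.e. a 2^21 = 2M page, AP 1 */
    return 0;
}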
*/ - cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B; - return 1; -} - -void __init radix__early_init_devtree(void) -{ - int rc; - - /* - * Try to find the available page sizes in the device-tree - */ - rc = of_scan_flat_dt(radix_dt_scan_page_sizes, NULL); - if (rc != 0) /* Found */ - goto found; - /* - * let's assume we have page 4k and 64k support - */ - mmu_psize_defs[MMU_PAGE_4K].shift = 12; - mmu_psize_defs[MMU_PAGE_4K].ap = 0x0; - - mmu_psize_defs[MMU_PAGE_64K].shift = 16; - mmu_psize_defs[MMU_PAGE_64K].ap = 0x5; -found: -#ifdef CONFIG_SPARSEMEM_VMEMMAP - if (mmu_psize_defs[MMU_PAGE_2M].shift) { - /* - * map vmemmap using 2M if available - */ - mmu_vmemmap_psize = MMU_PAGE_2M; - } -#endif /* CONFIG_SPARSEMEM_VMEMMAP */ - return; -} - -static void radix_init_amor(void) -{ - /* - * In HV mode, we init AMOR (Authority Mask Override Register) so that - * the hypervisor and guest can setup IAMR (Instruction Authority Mask - * Register), enable key 0 and set it to 1. - * - * AMOR = 0b1100 .... 0000 (Mask for key 0 is 11) - */ - mtspr(SPRN_AMOR, (3ul << 62)); -} - -#ifdef CONFIG_PPC_KUEP -void setup_kuep(bool disabled) -{ - if (disabled || !early_radix_enabled()) - return; - - if (smp_processor_id() == boot_cpuid) - pr_info("Activating Kernel Userspace Execution Prevention\n"); - - /* - * Radix always uses key0 of the IAMR to determine if an access is - * allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction - * fetch. - */ - mtspr(SPRN_IAMR, (1ul << 62)); -} -#endif - -#ifdef CONFIG_PPC_KUAP -void setup_kuap(bool disabled) -{ - if (disabled || !early_radix_enabled()) - return; - - if (smp_processor_id() == boot_cpuid) { - pr_info("Activating Kernel Userspace Access Prevention\n"); - cur_cpu_spec->mmu_features |= MMU_FTR_RADIX_KUAP; - } - - /* Make sure userspace can't change the AMR */ - mtspr(SPRN_UAMOR, 0); - mtspr(SPRN_AMR, AMR_KUAP_BLOCKED); - isync(); -} -#endif - -void __init radix__early_init_mmu(void) -{ - unsigned long lpcr; - -#ifdef CONFIG_PPC_64K_PAGES - /* PAGE_SIZE mappings */ - mmu_virtual_psize = MMU_PAGE_64K; -#else - mmu_virtual_psize = MMU_PAGE_4K; -#endif - -#ifdef CONFIG_SPARSEMEM_VMEMMAP - /* vmemmap mapping */ - mmu_vmemmap_psize = mmu_virtual_psize; -#endif - /* - * initialize page table size - */ - __pte_index_size = RADIX_PTE_INDEX_SIZE; - __pmd_index_size = RADIX_PMD_INDEX_SIZE; - __pud_index_size = RADIX_PUD_INDEX_SIZE; - __pgd_index_size = RADIX_PGD_INDEX_SIZE; - __pud_cache_index = RADIX_PUD_INDEX_SIZE; - __pte_table_size = RADIX_PTE_TABLE_SIZE; - __pmd_table_size = RADIX_PMD_TABLE_SIZE; - __pud_table_size = RADIX_PUD_TABLE_SIZE; - __pgd_table_size = RADIX_PGD_TABLE_SIZE; - - __pmd_val_bits = RADIX_PMD_VAL_BITS; - __pud_val_bits = RADIX_PUD_VAL_BITS; - __pgd_val_bits = RADIX_PGD_VAL_BITS; - - __kernel_virt_start = RADIX_KERN_VIRT_START; - __vmalloc_start = RADIX_VMALLOC_START; - __vmalloc_end = RADIX_VMALLOC_END; - __kernel_io_start = RADIX_KERN_IO_START; - __kernel_io_end = RADIX_KERN_IO_END; - vmemmap = (struct page *)RADIX_VMEMMAP_START; - ioremap_bot = IOREMAP_BASE; - -#ifdef CONFIG_PCI - pci_io_base = ISA_IO_BASE; -#endif - __pte_frag_nr = RADIX_PTE_FRAG_NR; - __pte_frag_size_shift = RADIX_PTE_FRAG_SIZE_SHIFT; - __pmd_frag_nr = RADIX_PMD_FRAG_NR; - __pmd_frag_size_shift = RADIX_PMD_FRAG_SIZE_SHIFT; - - if (!firmware_has_feature(FW_FEATURE_LPAR)) { - radix_init_native(); - lpcr = mfspr(SPRN_LPCR); - mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR); - radix_init_partition_table(); - radix_init_amor(); - } else { - radix_init_pseries(); - } - - 
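For the linear mapping built by create_physical_mapping() earlier in this file, the page size is chosen per chunk from the address alignment and the gap left before the next boundary. A hedged sketch of just that selection, with the mmu_psize_defs availability checks reduced to booleans and the 1G/2M/4K sizes assumed:

/*
 * Page-size choice for one chunk of the linear map; sizes and the
 * psize availability flags are illustrative.
 */
#include <assert.h>
#include <stdbool.h>

#define SZ_4K (1UL << 12)
#define SZ_2M (1UL << 21)
#define SZ_1G (1UL << 30)
#define IS_ALIGNED(x, a) (((x) & ((a) - 1)) == 0)

static unsigned long pick_mapping_size(unsigned long addr, unsigned long gap,
                                       bool have_1g, bool have_2m)
{
    if (IS_ALIGNED(addr, SZ_1G) && gap >= SZ_1G && have_1g)
        return SZ_1G;
    if (IS_ALIGNED(addr, SZ_2M) && gap >= SZ_2M && have_2m)
        return SZ_2M;
    return SZ_4K;                       /* base page size */
}

int main(void)
{
    /* 1G-aligned with room to spare: use a 1G mapping */
    assert(pick_mapping_size(0x40000000UL, 1UL << 31, true, true) == SZ_1G);
    /* 2M-aligned but less than 2M left before the boundary: base pages */
    assert(pick_mapping_size(0x00200000UL, 1UL << 20, true, true) == SZ_4K);
    return 0;
}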
memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); - - radix_init_pgtable(); - /* Switch to the guard PID before turning on MMU */ - radix__switch_mmu_context(NULL, &init_mm); - if (cpu_has_feature(CPU_FTR_HVMODE)) - tlbiel_all(); -} - -void radix__early_init_mmu_secondary(void) -{ - unsigned long lpcr; - /* - * update partition table control register and UPRT - */ - if (!firmware_has_feature(FW_FEATURE_LPAR)) { - lpcr = mfspr(SPRN_LPCR); - mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR); - - mtspr(SPRN_PTCR, - __pa(partition_tb) | (PATB_SIZE_SHIFT - 12)); - radix_init_amor(); - } - - radix__switch_mmu_context(NULL, &init_mm); - if (cpu_has_feature(CPU_FTR_HVMODE)) - tlbiel_all(); -} - -void radix__mmu_cleanup_all(void) -{ - unsigned long lpcr; - - if (!firmware_has_feature(FW_FEATURE_LPAR)) { - lpcr = mfspr(SPRN_LPCR); - mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT); - mtspr(SPRN_PTCR, 0); - powernv_set_nmmu_ptcr(0); - radix__flush_tlb_all(); - } -} - -void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base, - phys_addr_t first_memblock_size) -{ - /* We don't currently support the first MEMBLOCK not mapping 0 - * physical on those processors - */ - BUG_ON(first_memblock_base != 0); - - /* - * Radix mode is not limited by RMA / VRMA addressing. - */ - ppc64_rma_size = ULONG_MAX; -} - -#ifdef CONFIG_MEMORY_HOTPLUG -static void free_pte_table(pte_t *pte_start, pmd_t *pmd) -{ - pte_t *pte; - int i; - - for (i = 0; i < PTRS_PER_PTE; i++) { - pte = pte_start + i; - if (!pte_none(*pte)) - return; - } - - pte_free_kernel(&init_mm, pte_start); - pmd_clear(pmd); -} - -static void free_pmd_table(pmd_t *pmd_start, pud_t *pud) -{ - pmd_t *pmd; - int i; - - for (i = 0; i < PTRS_PER_PMD; i++) { - pmd = pmd_start + i; - if (!pmd_none(*pmd)) - return; - } - - pmd_free(&init_mm, pmd_start); - pud_clear(pud); -} - -struct change_mapping_params { - pte_t *pte; - unsigned long start; - unsigned long end; - unsigned long aligned_start; - unsigned long aligned_end; -}; - -static int __meminit stop_machine_change_mapping(void *data) -{ - struct change_mapping_params *params = - (struct change_mapping_params *)data; - - if (!data) - return -1; - - spin_unlock(&init_mm.page_table_lock); - pte_clear(&init_mm, params->aligned_start, params->pte); - create_physical_mapping(params->aligned_start, params->start, -1); - create_physical_mapping(params->end, params->aligned_end, -1); - spin_lock(&init_mm.page_table_lock); - return 0; -} - -static void remove_pte_table(pte_t *pte_start, unsigned long addr, - unsigned long end) -{ - unsigned long next; - pte_t *pte; - - pte = pte_start + pte_index(addr); - for (; addr < end; addr = next, pte++) { - next = (addr + PAGE_SIZE) & PAGE_MASK; - if (next > end) - next = end; - - if (!pte_present(*pte)) - continue; - - if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(next)) { - /* - * The vmemmap_free() and remove_section_mapping() - * codepaths call us with aligned addresses. 
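free_pte_table() and free_pmd_table() above only tear down a lower-level table once every slot in it is empty, and only then clear the parent entry. A compact userspace model of that check, with an assumed table size:

/*
 * A table is freed, and its parent entry cleared, only when every slot
 * is empty; the table size is illustrative.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdlib.h>

#define PTRS_PER_TABLE 512

struct table { unsigned long entry[PTRS_PER_TABLE]; };

static bool free_if_empty(struct table *t, struct table **parent_entry)
{
    for (int i = 0; i < PTRS_PER_TABLE; i++)
        if (t->entry[i])
            return false;               /* still populated: keep it */
    free(t);
    *parent_entry = NULL;               /* like pmd_clear()/pud_clear() */
    return true;
}

int main(void)
{
    struct table *child = calloc(1, sizeof(*child));
    struct table *pmd_entry = child;    /* parent's pointer to the child */

    child->entry[3] = 1;                /* one PTE still present */
    assert(!free_if_empty(child, &pmd_entry) && pmd_entry == child);
    child->entry[3] = 0;
    assert(free_if_empty(child, &pmd_entry) && pmd_entry == NULL);
    return 0;
}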
- */ - WARN_ONCE(1, "%s: unaligned range\n", __func__); - continue; - } - - pte_clear(&init_mm, addr, pte); - } -} - -/* - * clear the pte and potentially split the mapping helper - */ -static void __meminit split_kernel_mapping(unsigned long addr, unsigned long end, - unsigned long size, pte_t *pte) -{ - unsigned long mask = ~(size - 1); - unsigned long aligned_start = addr & mask; - unsigned long aligned_end = addr + size; - struct change_mapping_params params; - bool split_region = false; - - if ((end - addr) < size) { - /* - * We're going to clear the PTE, but not flushed - * the mapping, time to remap and flush. The - * effects if visible outside the processor or - * if we are running in code close to the - * mapping we cleared, we are in trouble. - */ - if (overlaps_kernel_text(aligned_start, addr) || - overlaps_kernel_text(end, aligned_end)) { - /* - * Hack, just return, don't pte_clear - */ - WARN_ONCE(1, "Linear mapping %lx->%lx overlaps kernel " - "text, not splitting\n", addr, end); - return; - } - split_region = true; - } - - if (split_region) { - params.pte = pte; - params.start = addr; - params.end = end; - params.aligned_start = addr & ~(size - 1); - params.aligned_end = min_t(unsigned long, aligned_end, - (unsigned long)__va(memblock_end_of_DRAM())); - stop_machine(stop_machine_change_mapping, ¶ms, NULL); - return; - } - - pte_clear(&init_mm, addr, pte); -} - -static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr, - unsigned long end) -{ - unsigned long next; - pte_t *pte_base; - pmd_t *pmd; - - pmd = pmd_start + pmd_index(addr); - for (; addr < end; addr = next, pmd++) { - next = pmd_addr_end(addr, end); - - if (!pmd_present(*pmd)) - continue; - - if (pmd_huge(*pmd)) { - split_kernel_mapping(addr, end, PMD_SIZE, (pte_t *)pmd); - continue; - } - - pte_base = (pte_t *)pmd_page_vaddr(*pmd); - remove_pte_table(pte_base, addr, next); - free_pte_table(pte_base, pmd); - } -} - -static void remove_pud_table(pud_t *pud_start, unsigned long addr, - unsigned long end) -{ - unsigned long next; - pmd_t *pmd_base; - pud_t *pud; - - pud = pud_start + pud_index(addr); - for (; addr < end; addr = next, pud++) { - next = pud_addr_end(addr, end); - - if (!pud_present(*pud)) - continue; - - if (pud_huge(*pud)) { - split_kernel_mapping(addr, end, PUD_SIZE, (pte_t *)pud); - continue; - } - - pmd_base = (pmd_t *)pud_page_vaddr(*pud); - remove_pmd_table(pmd_base, addr, next); - free_pmd_table(pmd_base, pud); - } -} - -static void __meminit remove_pagetable(unsigned long start, unsigned long end) -{ - unsigned long addr, next; - pud_t *pud_base; - pgd_t *pgd; - - spin_lock(&init_mm.page_table_lock); - - for (addr = start; addr < end; addr = next) { - next = pgd_addr_end(addr, end); - - pgd = pgd_offset_k(addr); - if (!pgd_present(*pgd)) - continue; - - if (pgd_huge(*pgd)) { - split_kernel_mapping(addr, end, PGDIR_SIZE, (pte_t *)pgd); - continue; - } - - pud_base = (pud_t *)pgd_page_vaddr(*pgd); - remove_pud_table(pud_base, addr, next); - } - - spin_unlock(&init_mm.page_table_lock); - radix__flush_tlb_kernel_range(start, end); -} - -int __meminit radix__create_section_mapping(unsigned long start, unsigned long end, int nid) -{ - if (end >= RADIX_VMALLOC_START) { - pr_warn("Outside the supported range\n"); - return -1; - } - - return create_physical_mapping(start, end, nid); -} - -int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end) -{ - remove_pagetable(start, end); - return 0; -} -#endif /* CONFIG_MEMORY_HOTPLUG */ - -#ifdef CONFIG_SPARSEMEM_VMEMMAP 
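split_kernel_mapping() above has to decide whether a removal covers the whole huge mapping or only part of it, in which case the clear happens under stop_machine() and the remainder is remapped. A sketch of just that decision and the alignment math, using an assumed 2M mapping size:

/*
 * Decide whether removing [addr, end) needs to split a huge mapping of
 * the given size; 2M is an assumed mapping size.
 */
#include <assert.h>
#include <stdbool.h>

#define HUGE_SIZE (1UL << 21)

static bool needs_split(unsigned long addr, unsigned long end,
                        unsigned long size,
                        unsigned long *aligned_start,
                        unsigned long *aligned_end)
{
    *aligned_start = addr & ~(size - 1);
    *aligned_end = addr + size;
    return (end - addr) < size;         /* removal smaller than the mapping */
}

int main(void)
{
    unsigned long s, e;

    /* removing 64K out of a 2M mapping: split and remap the remainder */
    assert(needs_split(0x200000UL, 0x210000UL, HUGE_SIZE, &s, &e));
    assert(s == 0x200000UL && e == 0x400000UL);
    /* removing the whole 2M mapping: a plain pte_clear() is enough */
    assert(!needs_split(0x200000UL, 0x400000UL, HUGE_SIZE, &s, &e));
    return 0;
}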
-static int __map_kernel_page_nid(unsigned long ea, unsigned long pa, - pgprot_t flags, unsigned int map_page_size, - int nid) -{ - return __map_kernel_page(ea, pa, flags, map_page_size, nid, 0, 0); -} - -int __meminit radix__vmemmap_create_mapping(unsigned long start, - unsigned long page_size, - unsigned long phys) -{ - /* Create a PTE encoding */ - unsigned long flags = _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_KERNEL_RW; - int nid = early_pfn_to_nid(phys >> PAGE_SHIFT); - int ret; - - if ((start + page_size) >= RADIX_VMEMMAP_END) { - pr_warn("Outside the supported range\n"); - return -1; - } - - ret = __map_kernel_page_nid(start, phys, __pgprot(flags), page_size, nid); - BUG_ON(ret); - - return 0; -} - -#ifdef CONFIG_MEMORY_HOTPLUG -void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size) -{ - remove_pagetable(start, start + page_size); -} -#endif -#endif - -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - -unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp, unsigned long clr, - unsigned long set) -{ - unsigned long old; - -#ifdef CONFIG_DEBUG_VM - WARN_ON(!radix__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp)); - assert_spin_locked(pmd_lockptr(mm, pmdp)); -#endif - - old = radix__pte_update(mm, addr, (pte_t *)pmdp, clr, set, 1); - trace_hugepage_update(addr, old, clr, set); - - return old; -} - -pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmdp) - -{ - pmd_t pmd; - - VM_BUG_ON(address & ~HPAGE_PMD_MASK); - VM_BUG_ON(radix__pmd_trans_huge(*pmdp)); - VM_BUG_ON(pmd_devmap(*pmdp)); - /* - * khugepaged calls this for normal pmd - */ - pmd = *pmdp; - pmd_clear(pmdp); - - /*FIXME!! Verify whether we need this kick below */ - serialize_against_pte_lookup(vma->vm_mm); - - radix__flush_tlb_collapsed_pmd(vma->vm_mm, address); - - return pmd; -} - -/* - * For us pgtable_t is pte_t *. Inorder to save the deposisted - * page table, we consider the allocated page table as a list - * head. On withdraw we need to make sure we zero out the used - * list_head memory area. - */ -void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, - pgtable_t pgtable) -{ - struct list_head *lh = (struct list_head *) pgtable; - - assert_spin_locked(pmd_lockptr(mm, pmdp)); - - /* FIFO */ - if (!pmd_huge_pte(mm, pmdp)) - INIT_LIST_HEAD(lh); - else - list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp)); - pmd_huge_pte(mm, pmdp) = pgtable; -} - -pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) -{ - pte_t *ptep; - pgtable_t pgtable; - struct list_head *lh; - - assert_spin_locked(pmd_lockptr(mm, pmdp)); - - /* FIFO */ - pgtable = pmd_huge_pte(mm, pmdp); - lh = (struct list_head *) pgtable; - if (list_empty(lh)) - pmd_huge_pte(mm, pmdp) = NULL; - else { - pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next; - list_del(lh); - } - ptep = (pte_t *) pgtable; - *ptep = __pte(0); - ptep++; - *ptep = __pte(0); - return pgtable; -} - - -pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm, - unsigned long addr, pmd_t *pmdp) -{ - pmd_t old_pmd; - unsigned long old; - - old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0); - old_pmd = __pmd(old); - /* - * Serialize against find_current_mm_pte which does lock-less - * lookup in page tables with local interrupts disabled. For huge pages - * it casts pmd_t to pte_t. 
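The radix deposit/withdraw pair above reuses the deposited page table itself as a list node and scrubs the first two slots again on withdraw. A userspace model with a minimal circular list standing in for the kernel's list_head; the 64-entry "page table" is illustrative.

/*
 * Model of the list-in-the-pgtable deposit/withdraw; not the kernel's
 * list_head implementation, just enough of it to show the idea.
 */
#include <assert.h>
#include <stddef.h>
#include <string.h>

struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }
static int list_empty(const struct list_head *h) { return h->next == h; }
static void list_add(struct list_head *n, struct list_head *h)
{
    n->next = h->next; n->prev = h;
    h->next->prev = n; h->next = n;
}
static void list_del(struct list_head *n)
{
    n->prev->next = n->next; n->next->prev = n->prev;
}

/* A "page table" page whose first bytes double as the list node. */
typedef unsigned long pgtable_page[64];
static void *huge_pte;                  /* models pmd_huge_pte() */

static void deposit(void *pgtable)
{
    struct list_head *lh = pgtable;

    if (!huge_pte)
        INIT_LIST_HEAD(lh);
    else
        list_add(lh, huge_pte);
    huge_pte = pgtable;
}

static void *withdraw(void)
{
    void *pgtable = huge_pte;
    struct list_head *lh = pgtable;

    if (list_empty(lh))
        huge_pte = NULL;
    else {
        huge_pte = lh->next;
        list_del(lh);
    }
    memset(pgtable, 0, 2 * sizeof(unsigned long));  /* scrub the list node */
    return pgtable;
}

int main(void)
{
    pgtable_page a = { 0 }, b = { 0 };

    deposit(a);
    deposit(b);
    assert(withdraw() == b && withdraw() == a && huge_pte == NULL);
    return 0;
}

Reusing the page itself as the node is what forces the memset on withdraw: the first two slots hold stale list pointers at that point, not PTEs.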
Since format of pte_t is different from - * pmd_t we want to prevent transit from pmd pointing to page table - * to pmd pointing to huge page (and back) while interrupts are disabled. - * We clear pmd to possibly replace it with page table pointer in - * different code paths. So make sure we wait for the parallel - * find_current_mm_pte to finish. - */ - serialize_against_pte_lookup(mm); - return old_pmd; -} - -int radix__has_transparent_hugepage(void) -{ - /* For radix 2M at PMD level means thp */ - if (mmu_psize_defs[MMU_PAGE_2M].shift == PMD_SHIFT) - return 1; - return 0; -} -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - -void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep, - pte_t entry, unsigned long address, int psize) -{ - struct mm_struct *mm = vma->vm_mm; - unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED | - _PAGE_RW | _PAGE_EXEC); - - unsigned long change = pte_val(entry) ^ pte_val(*ptep); - /* - * To avoid NMMU hang while relaxing access, we need mark - * the pte invalid in between. - */ - if ((change & _PAGE_RW) && atomic_read(&mm->context.copros) > 0) { - unsigned long old_pte, new_pte; - - old_pte = __radix_pte_update(ptep, _PAGE_PRESENT, _PAGE_INVALID); - /* - * new value of pte - */ - new_pte = old_pte | set; - radix__flush_tlb_page_psize(mm, address, psize); - __radix_pte_update(ptep, _PAGE_INVALID, new_pte); - } else { - __radix_pte_update(ptep, 0, set); - /* - * Book3S does not require a TLB flush when relaxing access - * restrictions when the address space is not attached to a - * NMMU, because the core MMU will reload the pte after taking - * an access fault, which is defined by the architectue. - */ - } - /* See ptesync comment in radix__set_pte_at */ -} - -void radix__ptep_modify_prot_commit(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep, - pte_t old_pte, pte_t pte) -{ - struct mm_struct *mm = vma->vm_mm; - - /* - * To avoid NMMU hang while relaxing access we need to flush the tlb before - * we set the new value. We need to do this only for radix, because hash - * translation does flush when updating the linux pte. - */ - if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) && - (atomic_read(&mm->context.copros) > 0)) - radix__flush_tlb_page(vma, addr); - - set_pte_at(mm, addr, ptep, pte); -} diff --git a/arch/powerpc/mm/pkeys.c b/arch/powerpc/mm/pkeys.c deleted file mode 100644 index ae7fca40e5b3..000000000000 --- a/arch/powerpc/mm/pkeys.c +++ /dev/null @@ -1,428 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0+ -/* - * PowerPC Memory Protection Keys management - * - * Copyright 2017, Ram Pai, IBM Corporation. 
- */ - -#include -#include -#include -#include -#include -#include - -DEFINE_STATIC_KEY_TRUE(pkey_disabled); -int pkeys_total; /* Total pkeys as per device tree */ -u32 initial_allocation_mask; /* Bits set for the initially allocated keys */ -u32 reserved_allocation_mask; /* Bits set for reserved keys */ -static bool pkey_execute_disable_supported; -static bool pkeys_devtree_defined; /* property exported by device tree */ -static u64 pkey_amr_mask; /* Bits in AMR not to be touched */ -static u64 pkey_iamr_mask; /* Bits in AMR not to be touched */ -static u64 pkey_uamor_mask; /* Bits in UMOR not to be touched */ -static int execute_only_key = 2; - -#define AMR_BITS_PER_PKEY 2 -#define AMR_RD_BIT 0x1UL -#define AMR_WR_BIT 0x2UL -#define IAMR_EX_BIT 0x1UL -#define PKEY_REG_BITS (sizeof(u64)*8) -#define pkeyshift(pkey) (PKEY_REG_BITS - ((pkey+1) * AMR_BITS_PER_PKEY)) - -static void scan_pkey_feature(void) -{ - u32 vals[2]; - struct device_node *cpu; - - cpu = of_find_node_by_type(NULL, "cpu"); - if (!cpu) - return; - - if (of_property_read_u32_array(cpu, - "ibm,processor-storage-keys", vals, 2)) - return; - - /* - * Since any pkey can be used for data or execute, we will just treat - * all keys as equal and track them as one entity. - */ - pkeys_total = vals[0]; - pkeys_devtree_defined = true; -} - -static inline bool pkey_mmu_enabled(void) -{ - if (firmware_has_feature(FW_FEATURE_LPAR)) - return pkeys_total; - else - return cpu_has_feature(CPU_FTR_PKEY); -} - -static int pkey_initialize(void) -{ - int os_reserved, i; - - /* - * We define PKEY_DISABLE_EXECUTE in addition to the arch-neutral - * generic defines for PKEY_DISABLE_ACCESS and PKEY_DISABLE_WRITE. - * Ensure that the bits a distinct. - */ - BUILD_BUG_ON(PKEY_DISABLE_EXECUTE & - (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); - - /* - * pkey_to_vmflag_bits() assumes that the pkey bits are contiguous - * in the vmaflag. Make sure that is really the case. - */ - BUILD_BUG_ON(__builtin_clzl(ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT) + - __builtin_popcountl(ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT) - != (sizeof(u64) * BITS_PER_BYTE)); - - /* scan the device tree for pkey feature */ - scan_pkey_feature(); - - /* - * Let's assume 32 pkeys on P8 bare metal, if its not defined by device - * tree. We make this exception since skiboot forgot to expose this - * property on power8. - */ - if (!pkeys_devtree_defined && !firmware_has_feature(FW_FEATURE_LPAR) && - cpu_has_feature(CPU_FTRS_POWER8)) - pkeys_total = 32; - - /* - * Adjust the upper limit, based on the number of bits supported by - * arch-neutral code. - */ - pkeys_total = min_t(int, pkeys_total, - ((ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT)+1)); - - if (!pkey_mmu_enabled() || radix_enabled() || !pkeys_total) - static_branch_enable(&pkey_disabled); - else - static_branch_disable(&pkey_disabled); - - if (static_branch_likely(&pkey_disabled)) - return 0; - - /* - * The device tree cannot be relied to indicate support for - * execute_disable support. Instead we use a PVR check. - */ - if (pvr_version_is(PVR_POWER7) || pvr_version_is(PVR_POWER7p)) - pkey_execute_disable_supported = false; - else - pkey_execute_disable_supported = true; - -#ifdef CONFIG_PPC_4K_PAGES - /* - * The OS can manage only 8 pkeys due to its inability to represent them - * in the Linux 4K PTE. - */ - os_reserved = pkeys_total - 8; -#else - os_reserved = 0; -#endif - /* Bits are in LE format. 
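The pkeyshift()/AMR_*_BIT definitions above place each key's two permission bits counting down from the top of the 64-bit register. A small self-check of that layout, with the constants restated locally (and as 64-bit literals) so the sketch stands alone:

/*
 * Key-to-bit-position layout behind pkeyshift().
 */
#include <assert.h>
#include <stdint.h>

#define AMR_BITS_PER_PKEY 2
#define AMR_RD_BIT 0x1ULL
#define AMR_WR_BIT 0x2ULL
#define PKEY_REG_BITS (sizeof(uint64_t) * 8)
#define pkeyshift(pkey) (PKEY_REG_BITS - ((pkey) + 1) * AMR_BITS_PER_PKEY)

int main(void)
{
    /* key 0 sits at bits 63..62, key 2 at bits 59..58 */
    assert(pkeyshift(0) == 62 && pkeyshift(2) == 58);

    /* deny writes, but not reads, through key 2 */
    uint64_t amr = AMR_WR_BIT << pkeyshift(2);

    assert(amr & (AMR_WR_BIT << pkeyshift(2)));
    assert(!(amr & (AMR_RD_BIT << pkeyshift(2))));
    return 0;
}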
*/ - reserved_allocation_mask = (0x1 << 1) | (0x1 << execute_only_key); - - /* register mask is in BE format */ - pkey_amr_mask = ~0x0ul; - pkey_amr_mask &= ~(0x3ul << pkeyshift(0)); - - pkey_iamr_mask = ~0x0ul; - pkey_iamr_mask &= ~(0x3ul << pkeyshift(0)); - pkey_iamr_mask &= ~(0x3ul << pkeyshift(execute_only_key)); - - pkey_uamor_mask = ~0x0ul; - pkey_uamor_mask &= ~(0x3ul << pkeyshift(0)); - pkey_uamor_mask &= ~(0x3ul << pkeyshift(execute_only_key)); - - /* mark the rest of the keys as reserved and hence unavailable */ - for (i = (pkeys_total - os_reserved); i < pkeys_total; i++) { - reserved_allocation_mask |= (0x1 << i); - pkey_uamor_mask &= ~(0x3ul << pkeyshift(i)); - } - initial_allocation_mask = reserved_allocation_mask | (0x1 << 0); - - if (unlikely((pkeys_total - os_reserved) <= execute_only_key)) { - /* - * Insufficient number of keys to support - * execute only key. Mark it unavailable. - * Any AMR, UAMOR, IAMR bit set for - * this key is irrelevant since this key - * can never be allocated. - */ - execute_only_key = -1; - } - - return 0; -} - -arch_initcall(pkey_initialize); - -void pkey_mm_init(struct mm_struct *mm) -{ - if (static_branch_likely(&pkey_disabled)) - return; - mm_pkey_allocation_map(mm) = initial_allocation_mask; - mm->context.execute_only_pkey = execute_only_key; -} - -static inline u64 read_amr(void) -{ - return mfspr(SPRN_AMR); -} - -static inline void write_amr(u64 value) -{ - mtspr(SPRN_AMR, value); -} - -static inline u64 read_iamr(void) -{ - if (!likely(pkey_execute_disable_supported)) - return 0x0UL; - - return mfspr(SPRN_IAMR); -} - -static inline void write_iamr(u64 value) -{ - if (!likely(pkey_execute_disable_supported)) - return; - - mtspr(SPRN_IAMR, value); -} - -static inline u64 read_uamor(void) -{ - return mfspr(SPRN_UAMOR); -} - -static inline void write_uamor(u64 value) -{ - mtspr(SPRN_UAMOR, value); -} - -static bool is_pkey_enabled(int pkey) -{ - u64 uamor = read_uamor(); - u64 pkey_bits = 0x3ul << pkeyshift(pkey); - u64 uamor_pkey_bits = (uamor & pkey_bits); - - /* - * Both the bits in UAMOR corresponding to the key should be set or - * reset. - */ - WARN_ON(uamor_pkey_bits && (uamor_pkey_bits != pkey_bits)); - return !!(uamor_pkey_bits); -} - -static inline void init_amr(int pkey, u8 init_bits) -{ - u64 new_amr_bits = (((u64)init_bits & 0x3UL) << pkeyshift(pkey)); - u64 old_amr = read_amr() & ~((u64)(0x3ul) << pkeyshift(pkey)); - - write_amr(old_amr | new_amr_bits); -} - -static inline void init_iamr(int pkey, u8 init_bits) -{ - u64 new_iamr_bits = (((u64)init_bits & 0x1UL) << pkeyshift(pkey)); - u64 old_iamr = read_iamr() & ~((u64)(0x1ul) << pkeyshift(pkey)); - - write_iamr(old_iamr | new_iamr_bits); -} - -/* - * Set the access rights in AMR IAMR and UAMOR registers for @pkey to that - * specified in @init_val. 
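init_amr() and init_iamr() above are plain read-modify-write updates of a single key's field. A sketch of the AMR case with an ordinary variable standing in for the mfspr/mtspr access:

/*
 * Read-modify-write of one key's AMR field.
 */
#include <assert.h>
#include <stdint.h>

#define AMR_BITS_PER_PKEY 2
#define pkeyshift(pkey) (64 - ((pkey) + 1) * AMR_BITS_PER_PKEY)

static uint64_t amr;                    /* stands in for the AMR SPR */

static void init_amr(int pkey, uint8_t init_bits)
{
    uint64_t new_bits = ((uint64_t)init_bits & 0x3) << pkeyshift(pkey);
    uint64_t old = amr & ~(0x3ULL << pkeyshift(pkey));

    amr = old | new_bits;
}

int main(void)
{
    init_amr(1, 0x3);                   /* deny read and write through key 1 */
    assert(amr == (0x3ULL << 60));
    init_amr(1, 0x2);                   /* relax to write-deny only */
    assert(amr == (0x2ULL << 60));
    return 0;
}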
- */ -int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey, - unsigned long init_val) -{ - u64 new_amr_bits = 0x0ul; - u64 new_iamr_bits = 0x0ul; - - if (!is_pkey_enabled(pkey)) - return -EINVAL; - - if (init_val & PKEY_DISABLE_EXECUTE) { - if (!pkey_execute_disable_supported) - return -EINVAL; - new_iamr_bits |= IAMR_EX_BIT; - } - init_iamr(pkey, new_iamr_bits); - - /* Set the bits we need in AMR: */ - if (init_val & PKEY_DISABLE_ACCESS) - new_amr_bits |= AMR_RD_BIT | AMR_WR_BIT; - else if (init_val & PKEY_DISABLE_WRITE) - new_amr_bits |= AMR_WR_BIT; - - init_amr(pkey, new_amr_bits); - return 0; -} - -void thread_pkey_regs_save(struct thread_struct *thread) -{ - if (static_branch_likely(&pkey_disabled)) - return; - - /* - * TODO: Skip saving registers if @thread hasn't used any keys yet. - */ - thread->amr = read_amr(); - thread->iamr = read_iamr(); - thread->uamor = read_uamor(); -} - -void thread_pkey_regs_restore(struct thread_struct *new_thread, - struct thread_struct *old_thread) -{ - if (static_branch_likely(&pkey_disabled)) - return; - - if (old_thread->amr != new_thread->amr) - write_amr(new_thread->amr); - if (old_thread->iamr != new_thread->iamr) - write_iamr(new_thread->iamr); - if (old_thread->uamor != new_thread->uamor) - write_uamor(new_thread->uamor); -} - -void thread_pkey_regs_init(struct thread_struct *thread) -{ - if (static_branch_likely(&pkey_disabled)) - return; - - thread->amr = pkey_amr_mask; - thread->iamr = pkey_iamr_mask; - thread->uamor = pkey_uamor_mask; - - write_uamor(pkey_uamor_mask); - write_amr(pkey_amr_mask); - write_iamr(pkey_iamr_mask); -} - -static inline bool pkey_allows_readwrite(int pkey) -{ - int pkey_shift = pkeyshift(pkey); - - if (!is_pkey_enabled(pkey)) - return true; - - return !(read_amr() & ((AMR_RD_BIT|AMR_WR_BIT) << pkey_shift)); -} - -int __execute_only_pkey(struct mm_struct *mm) -{ - return mm->context.execute_only_pkey; -} - -static inline bool vma_is_pkey_exec_only(struct vm_area_struct *vma) -{ - /* Do this check first since the vm_flags should be hot */ - if ((vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) != VM_EXEC) - return false; - - return (vma_pkey(vma) == vma->vm_mm->context.execute_only_pkey); -} - -/* - * This should only be called for *plain* mprotect calls. - */ -int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, - int pkey) -{ - /* - * If the currently associated pkey is execute-only, but the requested - * protection is not execute-only, move it back to the default pkey. - */ - if (vma_is_pkey_exec_only(vma) && (prot != PROT_EXEC)) - return 0; - - /* - * The requested protection is execute-only. Hence let's use an - * execute-only pkey. - */ - if (prot == PROT_EXEC) { - pkey = execute_only_pkey(vma->vm_mm); - if (pkey > 0) - return pkey; - } - - /* Nothing to override. 
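__arch_set_user_pkey_access() above translates the generic PKEY_DISABLE_* flags into register bits: access-disable implies both read and write denial, write-disable only the write bit, and execute-disable the IAMR bit. A sketch of that mapping; the flag values are assumptions rather than copies of the uapi headers.

/*
 * PKEY_DISABLE_* flags to AMR/IAMR bits; flag values are assumed.
 */
#include <assert.h>
#include <stdint.h>

#define PKEY_DISABLE_ACCESS  0x1        /* assumed values */
#define PKEY_DISABLE_WRITE   0x2
#define PKEY_DISABLE_EXECUTE 0x4

#define AMR_RD_BIT  0x1ULL
#define AMR_WR_BIT  0x2ULL
#define IAMR_EX_BIT 0x1ULL

static void rights_to_bits(unsigned long init_val,
                           uint64_t *amr_bits, uint64_t *iamr_bits)
{
    *amr_bits = 0;
    *iamr_bits = 0;

    if (init_val & PKEY_DISABLE_EXECUTE)
        *iamr_bits |= IAMR_EX_BIT;

    if (init_val & PKEY_DISABLE_ACCESS)
        *amr_bits |= AMR_RD_BIT | AMR_WR_BIT;
    else if (init_val & PKEY_DISABLE_WRITE)
        *amr_bits |= AMR_WR_BIT;
}

int main(void)
{
    uint64_t amr, iamr;

    rights_to_bits(PKEY_DISABLE_WRITE | PKEY_DISABLE_EXECUTE, &amr, &iamr);
    assert(amr == AMR_WR_BIT && iamr == IAMR_EX_BIT);

    rights_to_bits(PKEY_DISABLE_ACCESS, &amr, &iamr);
    assert(amr == (AMR_RD_BIT | AMR_WR_BIT) && iamr == 0);
    return 0;
}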
*/ - return vma_pkey(vma); -} - -static bool pkey_access_permitted(int pkey, bool write, bool execute) -{ - int pkey_shift; - u64 amr; - - if (!is_pkey_enabled(pkey)) - return true; - - pkey_shift = pkeyshift(pkey); - if (execute && !(read_iamr() & (IAMR_EX_BIT << pkey_shift))) - return true; - - amr = read_amr(); /* Delay reading amr until absolutely needed */ - return ((!write && !(amr & (AMR_RD_BIT << pkey_shift))) || - (write && !(amr & (AMR_WR_BIT << pkey_shift)))); -} - -bool arch_pte_access_permitted(u64 pte, bool write, bool execute) -{ - if (static_branch_likely(&pkey_disabled)) - return true; - - return pkey_access_permitted(pte_to_pkey_bits(pte), write, execute); -} - -/* - * We only want to enforce protection keys on the current thread because we - * effectively have no access to AMR/IAMR for other threads or any way to tell - * which AMR/IAMR in a threaded process we could use. - * - * So do not enforce things if the VMA is not from the current mm, or if we are - * in a kernel thread. - */ -static inline bool vma_is_foreign(struct vm_area_struct *vma) -{ - if (!current->mm) - return true; - - /* if it is not our ->mm, it has to be foreign */ - if (current->mm != vma->vm_mm) - return true; - - return false; -} - -bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write, - bool execute, bool foreign) -{ - if (static_branch_likely(&pkey_disabled)) - return true; - /* - * Do not enforce our key-permissions on a foreign vma. - */ - if (foreign || vma_is_foreign(vma)) - return true; - - return pkey_access_permitted(vma_pkey(vma), write, execute); -} - -void arch_dup_pkeys(struct mm_struct *oldmm, struct mm_struct *mm) -{ - if (static_branch_likely(&pkey_disabled)) - return; - - /* Duplicate the oldmm pkey state in mm: */ - mm_pkey_allocation_map(mm) = mm_pkey_allocation_map(oldmm); - mm->context.execute_only_pkey = oldmm->context.execute_only_pkey; -} diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c deleted file mode 100644 index 89e4531de64b..000000000000 --- a/arch/powerpc/mm/slb.c +++ /dev/null @@ -1,832 +0,0 @@ -/* - * PowerPC64 SLB support. - * - * Copyright (C) 2004 David Gibson , IBM - * Based on earlier code written by: - * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com - * Copyright (c) 2001 Dave Engebretsen - * Copyright (C) 2002 Anton Blanchard , IBM - * - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -enum slb_index { - LINEAR_INDEX = 0, /* Kernel linear map (0xc000000000000000) */ - KSTACK_INDEX = 1, /* Kernel stack map */ -}; - -static long slb_allocate_user(struct mm_struct *mm, unsigned long ea); - -#define slb_esid_mask(ssize) \ - (((ssize) == MMU_SEGSIZE_256M)? 
ESID_MASK: ESID_MASK_1T) - -static inline unsigned long mk_esid_data(unsigned long ea, int ssize, - enum slb_index index) -{ - return (ea & slb_esid_mask(ssize)) | SLB_ESID_V | index; -} - -static inline unsigned long __mk_vsid_data(unsigned long vsid, int ssize, - unsigned long flags) -{ - return (vsid << slb_vsid_shift(ssize)) | flags | - ((unsigned long) ssize << SLB_VSID_SSIZE_SHIFT); -} - -static inline unsigned long mk_vsid_data(unsigned long ea, int ssize, - unsigned long flags) -{ - return __mk_vsid_data(get_kernel_vsid(ea, ssize), ssize, flags); -} - -static void assert_slb_presence(bool present, unsigned long ea) -{ -#ifdef CONFIG_DEBUG_VM - unsigned long tmp; - - WARN_ON_ONCE(mfmsr() & MSR_EE); - - if (!cpu_has_feature(CPU_FTR_ARCH_206)) - return; - - /* - * slbfee. requires bit 24 (PPC bit 39) be clear in RB. Hardware - * ignores all other bits from 0-27, so just clear them all. - */ - ea &= ~((1UL << 28) - 1); - asm volatile(__PPC_SLBFEE_DOT(%0, %1) : "=r"(tmp) : "r"(ea) : "cr0"); - - WARN_ON(present == (tmp == 0)); -#endif -} - -static inline void slb_shadow_update(unsigned long ea, int ssize, - unsigned long flags, - enum slb_index index) -{ - struct slb_shadow *p = get_slb_shadow(); - - /* - * Clear the ESID first so the entry is not valid while we are - * updating it. No write barriers are needed here, provided - * we only update the current CPU's SLB shadow buffer. - */ - WRITE_ONCE(p->save_area[index].esid, 0); - WRITE_ONCE(p->save_area[index].vsid, cpu_to_be64(mk_vsid_data(ea, ssize, flags))); - WRITE_ONCE(p->save_area[index].esid, cpu_to_be64(mk_esid_data(ea, ssize, index))); -} - -static inline void slb_shadow_clear(enum slb_index index) -{ - WRITE_ONCE(get_slb_shadow()->save_area[index].esid, cpu_to_be64(index)); -} - -static inline void create_shadowed_slbe(unsigned long ea, int ssize, - unsigned long flags, - enum slb_index index) -{ - /* - * Updating the shadow buffer before writing the SLB ensures - * we don't get a stale entry here if we get preempted by PHYP - * between these two statements. - */ - slb_shadow_update(ea, ssize, flags, index); - - assert_slb_presence(false, ea); - asm volatile("slbmte %0,%1" : - : "r" (mk_vsid_data(ea, ssize, flags)), - "r" (mk_esid_data(ea, ssize, index)) - : "memory" ); -} - -/* - * Insert bolted entries into SLB (which may not be empty, so don't clear - * slb_cache_ptr). - */ -void __slb_restore_bolted_realmode(void) -{ - struct slb_shadow *p = get_slb_shadow(); - enum slb_index index; - - /* No isync needed because realmode. */ - for (index = 0; index < SLB_NUM_BOLTED; index++) { - asm volatile("slbmte %0,%1" : - : "r" (be64_to_cpu(p->save_area[index].vsid)), - "r" (be64_to_cpu(p->save_area[index].esid))); - } - - assert_slb_presence(true, local_paca->kstack); -} - -/* - * Insert the bolted entries into an empty SLB. - */ -void slb_restore_bolted_realmode(void) -{ - __slb_restore_bolted_realmode(); - get_paca()->slb_cache_ptr = 0; - - get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1; - get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap; -} - -/* - * This flushes all SLB entries including 0, so it must be realmode. - */ -void slb_flush_all_realmode(void) -{ - asm volatile("slbmte %0,%0; slbia" : : "r" (0)); -} - -/* - * This flushes non-bolted entries, it can be run in virtual mode. Must - * be called with interrupts disabled. 
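slb_shadow_update() above relies purely on store ordering: the ESID is zeroed before the VSID is replaced, so the hypervisor never sees a valid slot paired with a stale VSID. A bare sketch of that sequence; the valid-bit value is illustrative and plain stores stand in for WRITE_ONCE().

/*
 * Shadow slot update order: invalidate, install the VSID, re-validate.
 */
#include <assert.h>
#include <stdint.h>

#define SLB_ESID_V (1UL << 27)          /* assumed "valid" bit */

struct shadow_entry { uint64_t esid, vsid; };

static void shadow_update(struct shadow_entry *e,
                          uint64_t esid_data, uint64_t vsid_data)
{
    e->esid = 0;                        /* 1: slot no longer valid */
    e->vsid = vsid_data;                /* 2: install the new VSID */
    e->esid = esid_data;                /* 3: re-validate with the new ESID */
}

int main(void)
{
    struct shadow_entry e = { 0, 0 };

    shadow_update(&e, 0x2UL | SLB_ESID_V, 0xBBBB);
    assert((e.esid & SLB_ESID_V) && e.vsid == 0xBBBB);
    return 0;
}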
- */ -void slb_flush_and_restore_bolted(void) -{ - struct slb_shadow *p = get_slb_shadow(); - - BUILD_BUG_ON(SLB_NUM_BOLTED != 2); - - WARN_ON(!irqs_disabled()); - - /* - * We can't take a PMU exception in the following code, so hard - * disable interrupts. - */ - hard_irq_disable(); - - asm volatile("isync\n" - "slbia\n" - "slbmte %0, %1\n" - "isync\n" - :: "r" (be64_to_cpu(p->save_area[KSTACK_INDEX].vsid)), - "r" (be64_to_cpu(p->save_area[KSTACK_INDEX].esid)) - : "memory"); - assert_slb_presence(true, get_paca()->kstack); - - get_paca()->slb_cache_ptr = 0; - - get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1; - get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap; -} - -void slb_save_contents(struct slb_entry *slb_ptr) -{ - int i; - unsigned long e, v; - - /* Save slb_cache_ptr value. */ - get_paca()->slb_save_cache_ptr = get_paca()->slb_cache_ptr; - - if (!slb_ptr) - return; - - for (i = 0; i < mmu_slb_size; i++) { - asm volatile("slbmfee %0,%1" : "=r" (e) : "r" (i)); - asm volatile("slbmfev %0,%1" : "=r" (v) : "r" (i)); - slb_ptr->esid = e; - slb_ptr->vsid = v; - slb_ptr++; - } -} - -void slb_dump_contents(struct slb_entry *slb_ptr) -{ - int i, n; - unsigned long e, v; - unsigned long llp; - - if (!slb_ptr) - return; - - pr_err("SLB contents of cpu 0x%x\n", smp_processor_id()); - pr_err("Last SLB entry inserted at slot %d\n", get_paca()->stab_rr); - - for (i = 0; i < mmu_slb_size; i++) { - e = slb_ptr->esid; - v = slb_ptr->vsid; - slb_ptr++; - - if (!e && !v) - continue; - - pr_err("%02d %016lx %016lx\n", i, e, v); - - if (!(e & SLB_ESID_V)) { - pr_err("\n"); - continue; - } - llp = v & SLB_VSID_LLP; - if (v & SLB_VSID_B_1T) { - pr_err(" 1T ESID=%9lx VSID=%13lx LLP:%3lx\n", - GET_ESID_1T(e), - (v & ~SLB_VSID_B) >> SLB_VSID_SHIFT_1T, llp); - } else { - pr_err(" 256M ESID=%9lx VSID=%13lx LLP:%3lx\n", - GET_ESID(e), - (v & ~SLB_VSID_B) >> SLB_VSID_SHIFT, llp); - } - } - pr_err("----------------------------------\n"); - - /* Dump slb cache entires as well. */ - pr_err("SLB cache ptr value = %d\n", get_paca()->slb_save_cache_ptr); - pr_err("Valid SLB cache entries:\n"); - n = min_t(int, get_paca()->slb_save_cache_ptr, SLB_CACHE_ENTRIES); - for (i = 0; i < n; i++) - pr_err("%02d EA[0-35]=%9x\n", i, get_paca()->slb_cache[i]); - pr_err("Rest of SLB cache entries:\n"); - for (i = n; i < SLB_CACHE_ENTRIES; i++) - pr_err("%02d EA[0-35]=%9x\n", i, get_paca()->slb_cache[i]); -} - -void slb_vmalloc_update(void) -{ - /* - * vmalloc is not bolted, so just have to flush non-bolted. 
- */ - slb_flush_and_restore_bolted(); -} - -static bool preload_hit(struct thread_info *ti, unsigned long esid) -{ - unsigned char i; - - for (i = 0; i < ti->slb_preload_nr; i++) { - unsigned char idx; - - idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR; - if (esid == ti->slb_preload_esid[idx]) - return true; - } - return false; -} - -static bool preload_add(struct thread_info *ti, unsigned long ea) -{ - unsigned char idx; - unsigned long esid; - - if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) { - /* EAs are stored >> 28 so 256MB segments don't need clearing */ - if (ea & ESID_MASK_1T) - ea &= ESID_MASK_1T; - } - - esid = ea >> SID_SHIFT; - - if (preload_hit(ti, esid)) - return false; - - idx = (ti->slb_preload_tail + ti->slb_preload_nr) % SLB_PRELOAD_NR; - ti->slb_preload_esid[idx] = esid; - if (ti->slb_preload_nr == SLB_PRELOAD_NR) - ti->slb_preload_tail = (ti->slb_preload_tail + 1) % SLB_PRELOAD_NR; - else - ti->slb_preload_nr++; - - return true; -} - -static void preload_age(struct thread_info *ti) -{ - if (!ti->slb_preload_nr) - return; - ti->slb_preload_nr--; - ti->slb_preload_tail = (ti->slb_preload_tail + 1) % SLB_PRELOAD_NR; -} - -void slb_setup_new_exec(void) -{ - struct thread_info *ti = current_thread_info(); - struct mm_struct *mm = current->mm; - unsigned long exec = 0x10000000; - - WARN_ON(irqs_disabled()); - - /* - * preload cache can only be used to determine whether a SLB - * entry exists if it does not start to overflow. - */ - if (ti->slb_preload_nr + 2 > SLB_PRELOAD_NR) - return; - - hard_irq_disable(); - - /* - * We have no good place to clear the slb preload cache on exec, - * flush_thread is about the earliest arch hook but that happens - * after we switch to the mm and have aleady preloaded the SLBEs. - * - * For the most part that's probably okay to use entries from the - * previous exec, they will age out if unused. It may turn out to - * be an advantage to clear the cache before switching to it, - * however. - */ - - /* - * preload some userspace segments into the SLB. - * Almost all 32 and 64bit PowerPC executables are linked at - * 0x10000000 so it makes sense to preload this segment. - */ - if (!is_kernel_addr(exec)) { - if (preload_add(ti, exec)) - slb_allocate_user(mm, exec); - } - - /* Libraries and mmaps. */ - if (!is_kernel_addr(mm->mmap_base)) { - if (preload_add(ti, mm->mmap_base)) - slb_allocate_user(mm, mm->mmap_base); - } - - /* see switch_slb */ - asm volatile("isync" : : : "memory"); - - local_irq_enable(); -} - -void preload_new_slb_context(unsigned long start, unsigned long sp) -{ - struct thread_info *ti = current_thread_info(); - struct mm_struct *mm = current->mm; - unsigned long heap = mm->start_brk; - - WARN_ON(irqs_disabled()); - - /* see above */ - if (ti->slb_preload_nr + 3 > SLB_PRELOAD_NR) - return; - - hard_irq_disable(); - - /* Userspace entry address. */ - if (!is_kernel_addr(start)) { - if (preload_add(ti, start)) - slb_allocate_user(mm, start); - } - - /* Top of stack, grows down. */ - if (!is_kernel_addr(sp)) { - if (preload_add(ti, sp)) - slb_allocate_user(mm, sp); - } - - /* Bottom of heap, grows up. */ - if (heap && !is_kernel_addr(heap)) { - if (preload_add(ti, heap)) - slb_allocate_user(mm, heap); - } - - /* see switch_slb */ - asm volatile("isync" : : : "memory"); - - local_irq_enable(); -} - - -/* Flush all user entries from the segment table of the current processor. 
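The preload cache managed by preload_hit()/preload_add()/preload_age() above is a small ring of ESIDs addressed as (tail + i) mod SLB_PRELOAD_NR, with duplicates skipped and the oldest entry overwritten once the ring is full. A userspace model with a deliberately tiny ring size so the wrap-around is easy to see:

/*
 * Ring of preloaded ESIDs; SLB_PRELOAD_NR is smaller than the kernel's.
 */
#include <assert.h>
#include <stdbool.h>

#define SLB_PRELOAD_NR 4
#define SID_SHIFT 28

static unsigned long preload_esid[SLB_PRELOAD_NR];
static unsigned char preload_tail, preload_nr;

static bool preload_hit(unsigned long esid)
{
    for (unsigned char i = 0; i < preload_nr; i++)
        if (preload_esid[(preload_tail + i) % SLB_PRELOAD_NR] == esid)
            return true;
    return false;
}

static bool preload_add(unsigned long ea)
{
    unsigned long esid = ea >> SID_SHIFT;

    if (preload_hit(esid))
        return false;                   /* already cached */

    preload_esid[(preload_tail + preload_nr) % SLB_PRELOAD_NR] = esid;
    if (preload_nr == SLB_PRELOAD_NR)
        preload_tail = (preload_tail + 1) % SLB_PRELOAD_NR;  /* drop oldest */
    else
        preload_nr++;
    return true;
}

int main(void)
{
    assert(preload_add(0x10000000UL));  /* typical executable base segment */
    assert(!preload_add(0x10000000UL)); /* same ESID: not added twice */

    for (unsigned long i = 2; i <= SLB_PRELOAD_NR + 1; i++)
        preload_add(i << SID_SHIFT);    /* fill past capacity */

    /* ring stayed bounded and the oldest ESID has been evicted */
    assert(preload_nr == SLB_PRELOAD_NR);
    assert(!preload_hit(0x10000000UL >> SID_SHIFT));
    return 0;
}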
*/ -void switch_slb(struct task_struct *tsk, struct mm_struct *mm) -{ - struct thread_info *ti = task_thread_info(tsk); - unsigned char i; - - /* - * We need interrupts hard-disabled here, not just soft-disabled, - * so that a PMU interrupt can't occur, which might try to access - * user memory (to get a stack trace) and possible cause an SLB miss - * which would update the slb_cache/slb_cache_ptr fields in the PACA. - */ - hard_irq_disable(); - asm volatile("isync" : : : "memory"); - if (cpu_has_feature(CPU_FTR_ARCH_300)) { - /* - * SLBIA IH=3 invalidates all Class=1 SLBEs and their - * associated lookaside structures, which matches what - * switch_slb wants. So ARCH_300 does not use the slb - * cache. - */ - asm volatile(PPC_SLBIA(3)); - } else { - unsigned long offset = get_paca()->slb_cache_ptr; - - if (!mmu_has_feature(MMU_FTR_NO_SLBIE_B) && - offset <= SLB_CACHE_ENTRIES) { - unsigned long slbie_data = 0; - - for (i = 0; i < offset; i++) { - unsigned long ea; - - ea = (unsigned long) - get_paca()->slb_cache[i] << SID_SHIFT; - /* - * Could assert_slb_presence(true) here, but - * hypervisor or machine check could have come - * in and removed the entry at this point. - */ - - slbie_data = ea; - slbie_data |= user_segment_size(slbie_data) - << SLBIE_SSIZE_SHIFT; - slbie_data |= SLBIE_C; /* user slbs have C=1 */ - asm volatile("slbie %0" : : "r" (slbie_data)); - } - - /* Workaround POWER5 < DD2.1 issue */ - if (!cpu_has_feature(CPU_FTR_ARCH_207S) && offset == 1) - asm volatile("slbie %0" : : "r" (slbie_data)); - - } else { - struct slb_shadow *p = get_slb_shadow(); - unsigned long ksp_esid_data = - be64_to_cpu(p->save_area[KSTACK_INDEX].esid); - unsigned long ksp_vsid_data = - be64_to_cpu(p->save_area[KSTACK_INDEX].vsid); - - asm volatile(PPC_SLBIA(1) "\n" - "slbmte %0,%1\n" - "isync" - :: "r"(ksp_vsid_data), - "r"(ksp_esid_data)); - - get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1; - } - - get_paca()->slb_cache_ptr = 0; - } - get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap; - - copy_mm_to_paca(mm); - - /* - * We gradually age out SLBs after a number of context switches to - * reduce reload overhead of unused entries (like we do with FP/VEC - * reload). Each time we wrap 256 switches, take an entry out of the - * SLB preload cache. - */ - tsk->thread.load_slb++; - if (!tsk->thread.load_slb) { - unsigned long pc = KSTK_EIP(tsk); - - preload_age(ti); - preload_add(ti, pc); - } - - for (i = 0; i < ti->slb_preload_nr; i++) { - unsigned char idx; - unsigned long ea; - - idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR; - ea = (unsigned long)ti->slb_preload_esid[idx] << SID_SHIFT; - - slb_allocate_user(mm, ea); - } - - /* - * Synchronize slbmte preloads with possible subsequent user memory - * address accesses by the kernel (user mode won't happen until - * rfid, which is safe). 
- */ - asm volatile("isync" : : : "memory"); -} - -void slb_set_size(u16 size) -{ - mmu_slb_size = size; -} - -void slb_initialize(void) -{ - unsigned long linear_llp, vmalloc_llp, io_llp; - unsigned long lflags; - static int slb_encoding_inited; -#ifdef CONFIG_SPARSEMEM_VMEMMAP - unsigned long vmemmap_llp; -#endif - - /* Prepare our SLB miss handler based on our page size */ - linear_llp = mmu_psize_defs[mmu_linear_psize].sllp; - io_llp = mmu_psize_defs[mmu_io_psize].sllp; - vmalloc_llp = mmu_psize_defs[mmu_vmalloc_psize].sllp; - get_paca()->vmalloc_sllp = SLB_VSID_KERNEL | vmalloc_llp; -#ifdef CONFIG_SPARSEMEM_VMEMMAP - vmemmap_llp = mmu_psize_defs[mmu_vmemmap_psize].sllp; -#endif - if (!slb_encoding_inited) { - slb_encoding_inited = 1; - pr_devel("SLB: linear LLP = %04lx\n", linear_llp); - pr_devel("SLB: io LLP = %04lx\n", io_llp); -#ifdef CONFIG_SPARSEMEM_VMEMMAP - pr_devel("SLB: vmemmap LLP = %04lx\n", vmemmap_llp); -#endif - } - - get_paca()->stab_rr = SLB_NUM_BOLTED - 1; - get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1; - get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap; - - lflags = SLB_VSID_KERNEL | linear_llp; - - /* Invalidate the entire SLB (even entry 0) & all the ERATS */ - asm volatile("isync":::"memory"); - asm volatile("slbmte %0,%0"::"r" (0) : "memory"); - asm volatile("isync; slbia; isync":::"memory"); - create_shadowed_slbe(PAGE_OFFSET, mmu_kernel_ssize, lflags, LINEAR_INDEX); - - /* For the boot cpu, we're running on the stack in init_thread_union, - * which is in the first segment of the linear mapping, and also - * get_paca()->kstack hasn't been initialized yet. - * For secondary cpus, we need to bolt the kernel stack entry now. - */ - slb_shadow_clear(KSTACK_INDEX); - if (raw_smp_processor_id() != boot_cpuid && - (get_paca()->kstack & slb_esid_mask(mmu_kernel_ssize)) > PAGE_OFFSET) - create_shadowed_slbe(get_paca()->kstack, - mmu_kernel_ssize, lflags, KSTACK_INDEX); - - asm volatile("isync":::"memory"); -} - -static void slb_cache_update(unsigned long esid_data) -{ - int slb_cache_index; - - if (cpu_has_feature(CPU_FTR_ARCH_300)) - return; /* ISAv3.0B and later does not use slb_cache */ - - /* - * Now update slb cache entries - */ - slb_cache_index = local_paca->slb_cache_ptr; - if (slb_cache_index < SLB_CACHE_ENTRIES) { - /* - * We have space in slb cache for optimized switch_slb(). - * Top 36 bits from esid_data as per ISA - */ - local_paca->slb_cache[slb_cache_index++] = esid_data >> 28; - local_paca->slb_cache_ptr++; - } else { - /* - * Our cache is full and the current cache content strictly - * doesn't indicate the active SLB conents. Bump the ptr - * so that switch_slb() will ignore the cache. - */ - local_paca->slb_cache_ptr = SLB_CACHE_ENTRIES + 1; - } -} - -static enum slb_index alloc_slb_index(bool kernel) -{ - enum slb_index index; - - /* - * The allocation bitmaps can become out of synch with the SLB - * when the _switch code does slbie when bolting a new stack - * segment and it must not be anywhere else in the SLB. This leaves - * a kernel allocated entry that is unused in the SLB. With very - * large systems or small segment sizes, the bitmaps could slowly - * fill with these entries. They will eventually be cleared out - * by the round robin allocator in that case, so it's probably not - * worth accounting for. - */ - - /* - * SLBs beyond 32 entries are allocated with stab_rr only - * POWER7/8/9 have 32 SLB entries, this could be expanded if a - * future CPU has more. 
- */ - if (local_paca->slb_used_bitmap != U32_MAX) { - index = ffz(local_paca->slb_used_bitmap); - local_paca->slb_used_bitmap |= 1U << index; - if (kernel) - local_paca->slb_kern_bitmap |= 1U << index; - } else { - /* round-robin replacement of slb starting at SLB_NUM_BOLTED. */ - index = local_paca->stab_rr; - if (index < (mmu_slb_size - 1)) - index++; - else - index = SLB_NUM_BOLTED; - local_paca->stab_rr = index; - if (index < 32) { - if (kernel) - local_paca->slb_kern_bitmap |= 1U << index; - else - local_paca->slb_kern_bitmap &= ~(1U << index); - } - } - BUG_ON(index < SLB_NUM_BOLTED); - - return index; -} - -static long slb_insert_entry(unsigned long ea, unsigned long context, - unsigned long flags, int ssize, bool kernel) -{ - unsigned long vsid; - unsigned long vsid_data, esid_data; - enum slb_index index; - - vsid = get_vsid(context, ea, ssize); - if (!vsid) - return -EFAULT; - - /* - * There must not be a kernel SLB fault in alloc_slb_index or before - * slbmte here or the allocation bitmaps could get out of whack with - * the SLB. - * - * User SLB faults or preloads take this path which might get inlined - * into the caller, so add compiler barriers here to ensure unsafe - * memory accesses do not come between. - */ - barrier(); - - index = alloc_slb_index(kernel); - - vsid_data = __mk_vsid_data(vsid, ssize, flags); - esid_data = mk_esid_data(ea, ssize, index); - - /* - * No need for an isync before or after this slbmte. The exception - * we enter with and the rfid we exit with are context synchronizing. - * User preloads should add isync afterwards in case the kernel - * accesses user memory before it returns to userspace with rfid. - */ - assert_slb_presence(false, ea); - asm volatile("slbmte %0, %1" : : "r" (vsid_data), "r" (esid_data)); - - barrier(); - - if (!kernel) - slb_cache_update(esid_data); - - return 0; -} - -static long slb_allocate_kernel(unsigned long ea, unsigned long id) -{ - unsigned long context; - unsigned long flags; - int ssize; - - if (id == LINEAR_MAP_REGION_ID) { - - /* We only support upto MAX_PHYSMEM_BITS */ - if ((ea & EA_MASK) > (1UL << MAX_PHYSMEM_BITS)) - return -EFAULT; - - flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_linear_psize].sllp; - -#ifdef CONFIG_SPARSEMEM_VMEMMAP - } else if (id == VMEMMAP_REGION_ID) { - - if (ea >= H_VMEMMAP_END) - return -EFAULT; - - flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_vmemmap_psize].sllp; -#endif - } else if (id == VMALLOC_REGION_ID) { - - if (ea >= H_VMALLOC_END) - return -EFAULT; - - flags = local_paca->vmalloc_sllp; - - } else if (id == IO_REGION_ID) { - - if (ea >= H_KERN_IO_END) - return -EFAULT; - - flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_io_psize].sllp; - - } else { - return -EFAULT; - } - - ssize = MMU_SEGSIZE_1T; - if (!mmu_has_feature(MMU_FTR_1T_SEGMENT)) - ssize = MMU_SEGSIZE_256M; - - context = get_kernel_context(ea); - - return slb_insert_entry(ea, context, flags, ssize, true); -} - -static long slb_allocate_user(struct mm_struct *mm, unsigned long ea) -{ - unsigned long context; - unsigned long flags; - int bpsize; - int ssize; - - /* - * consider this as bad access if we take a SLB miss - * on an address above addr limit. 
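alloc_slb_index() above prefers the first free bit of a 32-entry used bitmap and only then falls back to round-robin replacement, never handing out a bolted slot. A standalone model of that policy; ffz() is replaced by a compiler builtin and the kernel/user bitmap split is left out.

/*
 * Bitmap-first, round-robin-second SLB slot allocation.
 */
#include <assert.h>
#include <stdint.h>

#define SLB_NUM_BOLTED 2
#define SLB_SIZE 32                     /* assumed SLB size */

static uint32_t used_bitmap = (1U << SLB_NUM_BOLTED) - 1;   /* bolted slots */
static unsigned int rr = SLB_NUM_BOLTED - 1;                /* round-robin */

static unsigned int alloc_slb_index(void)
{
    unsigned int index;

    if (used_bitmap != UINT32_MAX) {
        index = __builtin_ctz(~used_bitmap);    /* first zero bit, like ffz() */
        used_bitmap |= 1U << index;
    } else {
        /* bitmap exhausted: rotate over the replaceable entries */
        index = (rr < SLB_SIZE - 1) ? rr + 1 : SLB_NUM_BOLTED;
        rr = index;
    }
    return index;
}

int main(void)
{
    /* the first allocation lands just above the bolted entries... */
    assert(alloc_slb_index() == SLB_NUM_BOLTED);
    /* ...and no allocation ever hands out a bolted slot */
    for (int i = 0; i < 100; i++)
        assert(alloc_slb_index() >= SLB_NUM_BOLTED);
    return 0;
}

Keeping the bolted slots out of both paths is what lets the kernel stack and linear-map entries survive every replacement cycle.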
- */ - if (ea >= mm_ctx_slb_addr_limit(&mm->context)) - return -EFAULT; - - context = get_user_context(&mm->context, ea); - if (!context) - return -EFAULT; - - if (unlikely(ea >= H_PGTABLE_RANGE)) { - WARN_ON(1); - return -EFAULT; - } - - ssize = user_segment_size(ea); - - bpsize = get_slice_psize(mm, ea); - flags = SLB_VSID_USER | mmu_psize_defs[bpsize].sllp; - - return slb_insert_entry(ea, context, flags, ssize, false); -} - -long do_slb_fault(struct pt_regs *regs, unsigned long ea) -{ - unsigned long id = get_region_id(ea); - - /* IRQs are not reconciled here, so can't check irqs_disabled */ - VM_WARN_ON(mfmsr() & MSR_EE); - - if (unlikely(!(regs->msr & MSR_RI))) - return -EINVAL; - - /* - * SLB kernel faults must be very careful not to touch anything - * that is not bolted. E.g., PACA and global variables are okay, - * mm->context stuff is not. - * - * SLB user faults can access all of kernel memory, but must be - * careful not to touch things like IRQ state because it is not - * "reconciled" here. The difficulty is that we must use - * fast_exception_return to return from kernel SLB faults without - * looking at possible non-bolted memory. We could test user vs - * kernel faults in the interrupt handler asm and do a full fault, - * reconcile, ret_from_except for user faults which would make them - * first class kernel code. But for performance it's probably nicer - * if they go via fast_exception_return too. - */ - if (id >= LINEAR_MAP_REGION_ID) { - long err; -#ifdef CONFIG_DEBUG_VM - /* Catch recursive kernel SLB faults. */ - BUG_ON(local_paca->in_kernel_slb_handler); - local_paca->in_kernel_slb_handler = 1; -#endif - err = slb_allocate_kernel(ea, id); -#ifdef CONFIG_DEBUG_VM - local_paca->in_kernel_slb_handler = 0; -#endif - return err; - } else { - struct mm_struct *mm = current->mm; - long err; - - if (unlikely(!mm)) - return -EFAULT; - - err = slb_allocate_user(mm, ea); - if (!err) - preload_add(current_thread_info(), ea); - - return err; - } -} - -void do_bad_slb_fault(struct pt_regs *regs, unsigned long ea, long err) -{ - if (err == -EFAULT) { - if (user_mode(regs)) - _exception(SIGSEGV, regs, SEGV_BNDERR, ea); - else - bad_page_fault(regs, ea, SIGSEGV); - } else if (err == -EINVAL) { - unrecoverable_exception(regs); - } else { - BUG(); - } -} diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c deleted file mode 100644 index 473dd430e306..000000000000 --- a/arch/powerpc/mm/subpage-prot.c +++ /dev/null @@ -1,289 +0,0 @@ -/* - * Copyright 2007-2008 Paul Mackerras, IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -/* - * Free all pages allocated for subpage protection maps and pointers. - * Also makes sure that the subpage_prot_table structure is - * reinitialized for the next user. 
- */ -void subpage_prot_free(struct mm_struct *mm) -{ - struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context); - unsigned long i, j, addr; - u32 **p; - - if (!spt) - return; - - for (i = 0; i < 4; ++i) { - if (spt->low_prot[i]) { - free_page((unsigned long)spt->low_prot[i]); - spt->low_prot[i] = NULL; - } - } - addr = 0; - for (i = 0; i < (TASK_SIZE_USER64 >> 43); ++i) { - p = spt->protptrs[i]; - if (!p) - continue; - spt->protptrs[i] = NULL; - for (j = 0; j < SBP_L2_COUNT && addr < spt->maxaddr; - ++j, addr += PAGE_SIZE) - if (p[j]) - free_page((unsigned long)p[j]); - free_page((unsigned long)p); - } - spt->maxaddr = 0; - kfree(spt); -} - -static void hpte_flush_range(struct mm_struct *mm, unsigned long addr, - int npages) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - spinlock_t *ptl; - - pgd = pgd_offset(mm, addr); - if (pgd_none(*pgd)) - return; - pud = pud_offset(pgd, addr); - if (pud_none(*pud)) - return; - pmd = pmd_offset(pud, addr); - if (pmd_none(*pmd)) - return; - pte = pte_offset_map_lock(mm, pmd, addr, &ptl); - arch_enter_lazy_mmu_mode(); - for (; npages > 0; --npages) { - pte_update(mm, addr, pte, 0, 0, 0); - addr += PAGE_SIZE; - ++pte; - } - arch_leave_lazy_mmu_mode(); - pte_unmap_unlock(pte - 1, ptl); -} - -/* - * Clear the subpage protection map for an address range, allowing - * all accesses that are allowed by the pte permissions. - */ -static void subpage_prot_clear(unsigned long addr, unsigned long len) -{ - struct mm_struct *mm = current->mm; - struct subpage_prot_table *spt; - u32 **spm, *spp; - unsigned long i; - size_t nw; - unsigned long next, limit; - - down_write(&mm->mmap_sem); - - spt = mm_ctx_subpage_prot(&mm->context); - if (!spt) - goto err_out; - - limit = addr + len; - if (limit > spt->maxaddr) - limit = spt->maxaddr; - for (; addr < limit; addr = next) { - next = pmd_addr_end(addr, limit); - if (addr < 0x100000000UL) { - spm = spt->low_prot; - } else { - spm = spt->protptrs[addr >> SBP_L3_SHIFT]; - if (!spm) - continue; - } - spp = spm[(addr >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1)]; - if (!spp) - continue; - spp += (addr >> PAGE_SHIFT) & (SBP_L1_COUNT - 1); - - i = (addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); - nw = PTRS_PER_PTE - i; - if (addr + (nw << PAGE_SHIFT) > next) - nw = (next - addr) >> PAGE_SHIFT; - - memset(spp, 0, nw * sizeof(u32)); - - /* now flush any existing HPTEs for the range */ - hpte_flush_range(mm, addr, nw); - } - -err_out: - up_write(&mm->mmap_sem); -} - -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr, - unsigned long end, struct mm_walk *walk) -{ - struct vm_area_struct *vma = walk->vma; - split_huge_pmd(vma, pmd, addr); - return 0; -} - -static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr, - unsigned long len) -{ - struct vm_area_struct *vma; - struct mm_walk subpage_proto_walk = { - .mm = mm, - .pmd_entry = subpage_walk_pmd_entry, - }; - - /* - * We don't try too hard, we just mark all the vma in that range - * VM_NOHUGEPAGE and split them. 
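
The lookups in subpage_prot_clear() above walk a three-level table: low_prot[] covers the first 4GB, protptrs[] covers the rest, and each leaf holds one u32 of protection bits per 64K page. The sketch below only reproduces the index arithmetic; the SBP_* values are assumptions chosen to be self-consistent for 64K pages, not the kernel's header definitions.

#include <stdint.h>
#include <stdio.h>

#define SBP_PAGE_SHIFT 16                        /* assumed 64K base pages */
#define SBP_L1_COUNT   (1u << 14)                /* u32 slots per leaf page (assumed) */
#define SBP_L2_SHIFT   (SBP_PAGE_SHIFT + 14)     /* address bits covered by one leaf */
#define SBP_L2_COUNT   (1u << 13)                /* leaf pointers per directory (assumed) */
#define SBP_L3_SHIFT   (SBP_L2_SHIFT + 13)       /* address bits covered by one directory */

/* Indices a virtual address would use at each level of the table. */
static void spt_indices(uint64_t addr, unsigned *l3, unsigned *l2, unsigned *l1)
{
        *l3 = (unsigned)(addr >> SBP_L3_SHIFT);                          /* which directory (protptrs[]) */
        *l2 = (unsigned)((addr >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1));   /* which leaf page of u32s */
        *l1 = (unsigned)((addr >> SBP_PAGE_SHIFT) & (SBP_L1_COUNT - 1)); /* which u32 inside the leaf */
}

int main(void)
{
        unsigned l3, l2, l1;

        spt_indices(0x7fff12340000ULL, &l3, &l2, &l1);
        printf("l3=%u l2=%u l1=%u\n", l3, l2, l1);
        return 0;
}
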
- */ - vma = find_vma(mm, addr); - /* - * If the range is in unmapped range, just return - */ - if (vma && ((addr + len) <= vma->vm_start)) - return; - - while (vma) { - if (vma->vm_start >= (addr + len)) - break; - vma->vm_flags |= VM_NOHUGEPAGE; - walk_page_vma(vma, &subpage_proto_walk); - vma = vma->vm_next; - } -} -#else -static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr, - unsigned long len) -{ - return; -} -#endif - -/* - * Copy in a subpage protection map for an address range. - * The map has 2 bits per 4k subpage, so 32 bits per 64k page. - * Each 2-bit field is 0 to allow any access, 1 to prevent writes, - * 2 or 3 to prevent all accesses. - * Note that the normal page protections also apply; the subpage - * protection mechanism is an additional constraint, so putting 0 - * in a 2-bit field won't allow writes to a page that is otherwise - * write-protected. - */ -SYSCALL_DEFINE3(subpage_prot, unsigned long, addr, - unsigned long, len, u32 __user *, map) -{ - struct mm_struct *mm = current->mm; - struct subpage_prot_table *spt; - u32 **spm, *spp; - unsigned long i; - size_t nw; - unsigned long next, limit; - int err; - - if (radix_enabled()) - return -ENOENT; - - /* Check parameters */ - if ((addr & ~PAGE_MASK) || (len & ~PAGE_MASK) || - addr >= mm->task_size || len >= mm->task_size || - addr + len > mm->task_size) - return -EINVAL; - - if (is_hugepage_only_range(mm, addr, len)) - return -EINVAL; - - if (!map) { - /* Clear out the protection map for the address range */ - subpage_prot_clear(addr, len); - return 0; - } - - if (!access_ok(map, (len >> PAGE_SHIFT) * sizeof(u32))) - return -EFAULT; - - down_write(&mm->mmap_sem); - - spt = mm_ctx_subpage_prot(&mm->context); - if (!spt) { - /* - * Allocate subpage prot table if not already done. - * Do this with mmap_sem held - */ - spt = kzalloc(sizeof(struct subpage_prot_table), GFP_KERNEL); - if (!spt) { - err = -ENOMEM; - goto out; - } - mm->context.hash_context->spt = spt; - } - - subpage_mark_vma_nohuge(mm, addr, len); - for (limit = addr + len; addr < limit; addr = next) { - next = pmd_addr_end(addr, limit); - err = -ENOMEM; - if (addr < 0x100000000UL) { - spm = spt->low_prot; - } else { - spm = spt->protptrs[addr >> SBP_L3_SHIFT]; - if (!spm) { - spm = (u32 **)get_zeroed_page(GFP_KERNEL); - if (!spm) - goto out; - spt->protptrs[addr >> SBP_L3_SHIFT] = spm; - } - } - spm += (addr >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1); - spp = *spm; - if (!spp) { - spp = (u32 *)get_zeroed_page(GFP_KERNEL); - if (!spp) - goto out; - *spm = spp; - } - spp += (addr >> PAGE_SHIFT) & (SBP_L1_COUNT - 1); - - local_irq_disable(); - demote_segment_4k(mm, addr); - local_irq_enable(); - - i = (addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); - nw = PTRS_PER_PTE - i; - if (addr + (nw << PAGE_SHIFT) > next) - nw = (next - addr) >> PAGE_SHIFT; - - up_write(&mm->mmap_sem); - if (__copy_from_user(spp, map, nw * sizeof(u32))) - return -EFAULT; - map += nw; - down_write(&mm->mmap_sem); - - /* now flush any existing HPTEs for the range */ - hpte_flush_range(mm, addr, nw); - } - if (limit > spt->maxaddr) - spt->maxaddr = limit; - err = 0; - out: - up_write(&mm->mmap_sem); - return err; -} diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c deleted file mode 100644 index 6a23b9ebd2a1..000000000000 --- a/arch/powerpc/mm/tlb-radix.c +++ /dev/null @@ -1,1101 +0,0 @@ -/* - * TLB flush routines for radix kernels. - * - * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. 
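
The subpage_prot syscall above consumes one u32 per 64K page, holding a 2-bit field per 4K subpage (0 allows everything, 1 prevents writes, 2 or 3 prevent all access). A hedged sketch of packing such a word follows; treating subpage 0 as the lowest-order bit pair is an assumption of the sketch, not something the excerpt specifies.

#include <stdint.h>
#include <stdio.h>

#define SUBPAGES_PER_PAGE 16         /* 64K page / 4K subpage */

static uint32_t pack_subpage_prot(const unsigned prot[SUBPAGES_PER_PAGE])
{
        uint32_t word = 0;
        int i;

        for (i = 0; i < SUBPAGES_PER_PAGE; i++)
                word |= (uint32_t)(prot[i] & 0x3) << (2 * i);
        return word;
}

int main(void)
{
        unsigned prot[SUBPAGES_PER_PAGE] = { 0 };

        prot[1]  = 1;                /* second 4K subpage: read-only */
        prot[15] = 2;                /* last 4K subpage: no access   */
        printf("map word = 0x%08x\n", (unsigned)pack_subpage_prot(prot));
        return 0;
}
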
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#define RIC_FLUSH_TLB 0 -#define RIC_FLUSH_PWC 1 -#define RIC_FLUSH_ALL 2 - -/* - * tlbiel instruction for radix, set invalidation - * i.e., r=1 and is=01 or is=10 or is=11 - */ -static inline void tlbiel_radix_set_isa300(unsigned int set, unsigned int is, - unsigned int pid, - unsigned int ric, unsigned int prs) -{ - unsigned long rb; - unsigned long rs; - - rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53)); - rs = ((unsigned long)pid << PPC_BITLSHIFT(31)); - - asm volatile(PPC_TLBIEL(%0, %1, %2, %3, 1) - : : "r"(rb), "r"(rs), "i"(ric), "i"(prs) - : "memory"); -} - -static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is) -{ - unsigned int set; - - asm volatile("ptesync": : :"memory"); - - /* - * Flush the first set of the TLB, and the entire Page Walk Cache - * and partition table entries. Then flush the remaining sets of the - * TLB. - */ - tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 0); - for (set = 1; set < num_sets; set++) - tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 0); - - /* Do the same for process scoped entries. */ - tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 1); - for (set = 1; set < num_sets; set++) - tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 1); - - asm volatile("ptesync": : :"memory"); -} - -void radix__tlbiel_all(unsigned int action) -{ - unsigned int is; - - switch (action) { - case TLB_INVAL_SCOPE_GLOBAL: - is = 3; - break; - case TLB_INVAL_SCOPE_LPID: - is = 2; - break; - default: - BUG(); - } - - if (early_cpu_has_feature(CPU_FTR_ARCH_300)) - tlbiel_all_isa300(POWER9_TLB_SETS_RADIX, is); - else - WARN(1, "%s called on pre-POWER9 CPU\n", __func__); - - asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); -} - -static inline void __tlbiel_pid(unsigned long pid, int set, - unsigned long ric) -{ - unsigned long rb,rs,prs,r; - - rb = PPC_BIT(53); /* IS = 1 */ - rb |= set << PPC_BITLSHIFT(51); - rs = ((unsigned long)pid) << PPC_BITLSHIFT(31); - prs = 1; /* process scoped */ - r = 1; /* radix format */ - - asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) - : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); - trace_tlbie(0, 1, rb, rs, ric, prs, r); -} - -static inline void __tlbie_pid(unsigned long pid, unsigned long ric) -{ - unsigned long rb,rs,prs,r; - - rb = PPC_BIT(53); /* IS = 1 */ - rs = pid << PPC_BITLSHIFT(31); - prs = 1; /* process scoped */ - r = 1; /* radix format */ - - asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) - : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); - trace_tlbie(0, 0, rb, rs, ric, prs, r); -} - -static inline void __tlbiel_lpid(unsigned long lpid, int set, - unsigned long ric) -{ - unsigned long rb,rs,prs,r; - - rb = PPC_BIT(52); /* IS = 2 */ - rb |= set << PPC_BITLSHIFT(51); - rs = 0; /* LPID comes from LPIDR */ - prs = 0; /* partition scoped */ - r = 1; /* radix format */ - - asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) - : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); - trace_tlbie(lpid, 1, rb, rs, ric, prs, r); -} - -static inline void __tlbie_lpid(unsigned long lpid, unsigned long ric) -{ - unsigned long rb,rs,prs,r; - - rb = PPC_BIT(52); /* IS = 2 */ - rs = lpid; - prs = 0; /* partition 
scoped */ - r = 1; /* radix format */ - - asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) - : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); - trace_tlbie(lpid, 0, rb, rs, ric, prs, r); -} - -static inline void __tlbiel_lpid_guest(unsigned long lpid, int set, - unsigned long ric) -{ - unsigned long rb,rs,prs,r; - - rb = PPC_BIT(52); /* IS = 2 */ - rb |= set << PPC_BITLSHIFT(51); - rs = 0; /* LPID comes from LPIDR */ - prs = 1; /* process scoped */ - r = 1; /* radix format */ - - asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) - : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); - trace_tlbie(lpid, 1, rb, rs, ric, prs, r); -} - - -static inline void __tlbiel_va(unsigned long va, unsigned long pid, - unsigned long ap, unsigned long ric) -{ - unsigned long rb,rs,prs,r; - - rb = va & ~(PPC_BITMASK(52, 63)); - rb |= ap << PPC_BITLSHIFT(58); - rs = pid << PPC_BITLSHIFT(31); - prs = 1; /* process scoped */ - r = 1; /* radix format */ - - asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) - : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); - trace_tlbie(0, 1, rb, rs, ric, prs, r); -} - -static inline void __tlbie_va(unsigned long va, unsigned long pid, - unsigned long ap, unsigned long ric) -{ - unsigned long rb,rs,prs,r; - - rb = va & ~(PPC_BITMASK(52, 63)); - rb |= ap << PPC_BITLSHIFT(58); - rs = pid << PPC_BITLSHIFT(31); - prs = 1; /* process scoped */ - r = 1; /* radix format */ - - asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) - : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); - trace_tlbie(0, 0, rb, rs, ric, prs, r); -} - -static inline void __tlbie_lpid_va(unsigned long va, unsigned long lpid, - unsigned long ap, unsigned long ric) -{ - unsigned long rb,rs,prs,r; - - rb = va & ~(PPC_BITMASK(52, 63)); - rb |= ap << PPC_BITLSHIFT(58); - rs = lpid; - prs = 0; /* partition scoped */ - r = 1; /* radix format */ - - asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) - : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); - trace_tlbie(lpid, 0, rb, rs, ric, prs, r); -} - -static inline void fixup_tlbie(void) -{ - unsigned long pid = 0; - unsigned long va = ((1UL << 52) - 1); - - if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) { - asm volatile("ptesync": : :"memory"); - __tlbie_va(va, pid, mmu_get_ap(MMU_PAGE_64K), RIC_FLUSH_TLB); - } -} - -static inline void fixup_tlbie_lpid(unsigned long lpid) -{ - unsigned long va = ((1UL << 52) - 1); - - if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) { - asm volatile("ptesync": : :"memory"); - __tlbie_lpid_va(va, lpid, mmu_get_ap(MMU_PAGE_64K), RIC_FLUSH_TLB); - } -} - -/* - * We use 128 set in radix mode and 256 set in hpt mode. - */ -static inline void _tlbiel_pid(unsigned long pid, unsigned long ric) -{ - int set; - - asm volatile("ptesync": : :"memory"); - - /* - * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL, - * also flush the entire Page Walk Cache. 
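
The tlbiel/tlbie helpers above all build their operands the same way, using IBM (MSB = 0) bit numbering via PPC_BITLSHIFT(). The standalone sketch below only shows that field packing for a process-scoped tlbiel; the instruction issue itself is omitted.

#include <stdint.h>
#include <stdio.h>

#define PPC_BITLSHIFT(be) (63 - (be))        /* IBM bit number -> shift amount */

static void tlbiel_operands(unsigned set, unsigned is, unsigned pid,
                            uint64_t *rb, uint64_t *rs)
{
        *rb  = (uint64_t)set << PPC_BITLSHIFT(51);   /* congruence-class set */
        *rb |= (uint64_t)is  << PPC_BITLSHIFT(53);   /* invalidation scope (IS) */
        *rs  = (uint64_t)pid << PPC_BITLSHIFT(31);   /* target process ID */
}

int main(void)
{
        uint64_t rb, rs;

        tlbiel_operands(0, 3, 42, &rb, &rs);
        printf("rb = 0x%016llx  rs = 0x%016llx\n",
               (unsigned long long)rb, (unsigned long long)rs);
        return 0;
}
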
- */ - __tlbiel_pid(pid, 0, ric); - - /* For PWC, only one flush is needed */ - if (ric == RIC_FLUSH_PWC) { - asm volatile("ptesync": : :"memory"); - return; - } - - /* For the remaining sets, just flush the TLB */ - for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++) - __tlbiel_pid(pid, set, RIC_FLUSH_TLB); - - asm volatile("ptesync": : :"memory"); - asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); -} - -static inline void _tlbie_pid(unsigned long pid, unsigned long ric) -{ - asm volatile("ptesync": : :"memory"); - - /* - * Workaround the fact that the "ric" argument to __tlbie_pid - * must be a compile-time contraint to match the "i" constraint - * in the asm statement. - */ - switch (ric) { - case RIC_FLUSH_TLB: - __tlbie_pid(pid, RIC_FLUSH_TLB); - break; - case RIC_FLUSH_PWC: - __tlbie_pid(pid, RIC_FLUSH_PWC); - break; - case RIC_FLUSH_ALL: - default: - __tlbie_pid(pid, RIC_FLUSH_ALL); - } - fixup_tlbie(); - asm volatile("eieio; tlbsync; ptesync": : :"memory"); -} - -static inline void _tlbiel_lpid(unsigned long lpid, unsigned long ric) -{ - int set; - - VM_BUG_ON(mfspr(SPRN_LPID) != lpid); - - asm volatile("ptesync": : :"memory"); - - /* - * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL, - * also flush the entire Page Walk Cache. - */ - __tlbiel_lpid(lpid, 0, ric); - - /* For PWC, only one flush is needed */ - if (ric == RIC_FLUSH_PWC) { - asm volatile("ptesync": : :"memory"); - return; - } - - /* For the remaining sets, just flush the TLB */ - for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++) - __tlbiel_lpid(lpid, set, RIC_FLUSH_TLB); - - asm volatile("ptesync": : :"memory"); - asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); -} - -static inline void _tlbie_lpid(unsigned long lpid, unsigned long ric) -{ - asm volatile("ptesync": : :"memory"); - - /* - * Workaround the fact that the "ric" argument to __tlbie_pid - * must be a compile-time contraint to match the "i" constraint - * in the asm statement. - */ - switch (ric) { - case RIC_FLUSH_TLB: - __tlbie_lpid(lpid, RIC_FLUSH_TLB); - break; - case RIC_FLUSH_PWC: - __tlbie_lpid(lpid, RIC_FLUSH_PWC); - break; - case RIC_FLUSH_ALL: - default: - __tlbie_lpid(lpid, RIC_FLUSH_ALL); - } - fixup_tlbie_lpid(lpid); - asm volatile("eieio; tlbsync; ptesync": : :"memory"); -} - -static inline void _tlbiel_lpid_guest(unsigned long lpid, unsigned long ric) -{ - int set; - - VM_BUG_ON(mfspr(SPRN_LPID) != lpid); - - asm volatile("ptesync": : :"memory"); - - /* - * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL, - * also flush the entire Page Walk Cache. 
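
A sketch of the per-set loop shared by _tlbiel_pid() and _tlbiel_lpid(): set 0 is flushed with the caller's RIC (so a PWC flush happens exactly once), a PWC-only request stops there, and the remaining sets get a plain TLB flush. The flush itself is replaced by a printf; the set count is the 128-set radix figure quoted above.

#include <stdio.h>

enum { RIC_FLUSH_TLB = 0, RIC_FLUSH_PWC = 1, RIC_FLUSH_ALL = 2 };
#define TLB_SETS_RADIX 128           /* the radix set count quoted above */

static void flush_one_set(unsigned pid, int set, int ric)
{
        printf("tlbiel pid=%u set=%d ric=%d\n", pid, set, ric);
}

static void tlbiel_pid_sketch(unsigned pid, int ric)
{
        int set;

        flush_one_set(pid, 0, ric);          /* set 0: TLB, PWC or both */
        if (ric == RIC_FLUSH_PWC)
                return;                      /* the PWC is not per-set  */
        for (set = 1; set < TLB_SETS_RADIX; set++)
                flush_one_set(pid, set, RIC_FLUSH_TLB);
}

int main(void)
{
        tlbiel_pid_sketch(7, RIC_FLUSH_PWC);
        return 0;
}
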
- */ - __tlbiel_lpid_guest(lpid, 0, ric); - - /* For PWC, only one flush is needed */ - if (ric == RIC_FLUSH_PWC) { - asm volatile("ptesync": : :"memory"); - return; - } - - /* For the remaining sets, just flush the TLB */ - for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++) - __tlbiel_lpid_guest(lpid, set, RIC_FLUSH_TLB); - - asm volatile("ptesync": : :"memory"); - asm volatile(PPC_INVALIDATE_ERAT : : :"memory"); -} - - -static inline void __tlbiel_va_range(unsigned long start, unsigned long end, - unsigned long pid, unsigned long page_size, - unsigned long psize) -{ - unsigned long addr; - unsigned long ap = mmu_get_ap(psize); - - for (addr = start; addr < end; addr += page_size) - __tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB); -} - -static inline void _tlbiel_va(unsigned long va, unsigned long pid, - unsigned long psize, unsigned long ric) -{ - unsigned long ap = mmu_get_ap(psize); - - asm volatile("ptesync": : :"memory"); - __tlbiel_va(va, pid, ap, ric); - asm volatile("ptesync": : :"memory"); -} - -static inline void _tlbiel_va_range(unsigned long start, unsigned long end, - unsigned long pid, unsigned long page_size, - unsigned long psize, bool also_pwc) -{ - asm volatile("ptesync": : :"memory"); - if (also_pwc) - __tlbiel_pid(pid, 0, RIC_FLUSH_PWC); - __tlbiel_va_range(start, end, pid, page_size, psize); - asm volatile("ptesync": : :"memory"); -} - -static inline void __tlbie_va_range(unsigned long start, unsigned long end, - unsigned long pid, unsigned long page_size, - unsigned long psize) -{ - unsigned long addr; - unsigned long ap = mmu_get_ap(psize); - - for (addr = start; addr < end; addr += page_size) - __tlbie_va(addr, pid, ap, RIC_FLUSH_TLB); -} - -static inline void _tlbie_va(unsigned long va, unsigned long pid, - unsigned long psize, unsigned long ric) -{ - unsigned long ap = mmu_get_ap(psize); - - asm volatile("ptesync": : :"memory"); - __tlbie_va(va, pid, ap, ric); - fixup_tlbie(); - asm volatile("eieio; tlbsync; ptesync": : :"memory"); -} - -static inline void _tlbie_lpid_va(unsigned long va, unsigned long lpid, - unsigned long psize, unsigned long ric) -{ - unsigned long ap = mmu_get_ap(psize); - - asm volatile("ptesync": : :"memory"); - __tlbie_lpid_va(va, lpid, ap, ric); - fixup_tlbie_lpid(lpid); - asm volatile("eieio; tlbsync; ptesync": : :"memory"); -} - -static inline void _tlbie_va_range(unsigned long start, unsigned long end, - unsigned long pid, unsigned long page_size, - unsigned long psize, bool also_pwc) -{ - asm volatile("ptesync": : :"memory"); - if (also_pwc) - __tlbie_pid(pid, RIC_FLUSH_PWC); - __tlbie_va_range(start, end, pid, page_size, psize); - fixup_tlbie(); - asm volatile("eieio; tlbsync; ptesync": : :"memory"); -} - -/* - * Base TLB flushing operations: - * - * - flush_tlb_mm(mm) flushes the specified mm context TLB's - * - flush_tlb_page(vma, vmaddr) flushes one page - * - flush_tlb_range(vma, start, end) flushes a range of pages - * - flush_tlb_kernel_range(start, end) flushes kernel pages - * - * - local_* variants of page and mm only apply to the current - * processor - */ -void radix__local_flush_tlb_mm(struct mm_struct *mm) -{ - unsigned long pid; - - preempt_disable(); - pid = mm->context.id; - if (pid != MMU_NO_CONTEXT) - _tlbiel_pid(pid, RIC_FLUSH_TLB); - preempt_enable(); -} -EXPORT_SYMBOL(radix__local_flush_tlb_mm); - -#ifndef CONFIG_SMP -void radix__local_flush_all_mm(struct mm_struct *mm) -{ - unsigned long pid; - - preempt_disable(); - pid = mm->context.id; - if (pid != MMU_NO_CONTEXT) - _tlbiel_pid(pid, RIC_FLUSH_ALL); - 
preempt_enable(); -} -EXPORT_SYMBOL(radix__local_flush_all_mm); -#endif /* CONFIG_SMP */ - -void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, - int psize) -{ - unsigned long pid; - - preempt_disable(); - pid = mm->context.id; - if (pid != MMU_NO_CONTEXT) - _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB); - preempt_enable(); -} - -void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) -{ -#ifdef CONFIG_HUGETLB_PAGE - /* need the return fix for nohash.c */ - if (is_vm_hugetlb_page(vma)) - return radix__local_flush_hugetlb_page(vma, vmaddr); -#endif - radix__local_flush_tlb_page_psize(vma->vm_mm, vmaddr, mmu_virtual_psize); -} -EXPORT_SYMBOL(radix__local_flush_tlb_page); - -static bool mm_is_singlethreaded(struct mm_struct *mm) -{ - if (atomic_read(&mm->context.copros) > 0) - return false; - if (atomic_read(&mm->mm_users) <= 1 && current->mm == mm) - return true; - return false; -} - -static bool mm_needs_flush_escalation(struct mm_struct *mm) -{ - /* - * P9 nest MMU has issues with the page walk cache - * caching PTEs and not flushing them properly when - * RIC = 0 for a PID/LPID invalidate - */ - if (atomic_read(&mm->context.copros) > 0) - return true; - return false; -} - -#ifdef CONFIG_SMP -static void do_exit_flush_lazy_tlb(void *arg) -{ - struct mm_struct *mm = arg; - unsigned long pid = mm->context.id; - - if (current->mm == mm) - return; /* Local CPU */ - - if (current->active_mm == mm) { - /* - * Must be a kernel thread because sender is single-threaded. - */ - BUG_ON(current->mm); - mmgrab(&init_mm); - switch_mm(mm, &init_mm, current); - current->active_mm = &init_mm; - mmdrop(mm); - } - _tlbiel_pid(pid, RIC_FLUSH_ALL); -} - -static void exit_flush_lazy_tlbs(struct mm_struct *mm) -{ - /* - * Would be nice if this was async so it could be run in - * parallel with our local flush, but generic code does not - * give a good API for it. Could extend the generic code or - * make a special powerpc IPI for flushing TLBs. - * For now it's not too performance critical. - */ - smp_call_function_many(mm_cpumask(mm), do_exit_flush_lazy_tlb, - (void *)mm, 1); - mm_reset_thread_local(mm); -} - -void radix__flush_tlb_mm(struct mm_struct *mm) -{ - unsigned long pid; - - pid = mm->context.id; - if (unlikely(pid == MMU_NO_CONTEXT)) - return; - - preempt_disable(); - /* - * Order loads of mm_cpumask vs previous stores to clear ptes before - * the invalidate. 
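
The choice between tlbiel and tlbie made in radix__flush_tlb_mm() can be summarised as a small decision function, sketched below with stand-in flags instead of the real mm/cpumask state: thread-local mms flush locally, a single-threaded mm first evicts lazy-TLB users by IPI and then flushes locally, and everything else broadcasts, escalating to RIC_FLUSH_ALL when a nest-MMU coprocessor is attached.

#include <stdbool.h>
#include <stdio.h>

struct mm_sketch {
        bool thread_local;       /* has only ever run on this CPU */
        bool single_threaded;    /* one user, it is current, no coprocessors */
        int  copros;             /* coprocessor (nest MMU) contexts attached */
};

static const char *pick_flush(const struct mm_sketch *mm)
{
        if (mm->thread_local)
                return "tlbiel (local, RIC_FLUSH_TLB)";
        if (mm->single_threaded)
                return "IPI lazy-TLB users off the mm, then tlbiel locally";
        if (mm->copros > 0)
                return "tlbie (global, RIC_FLUSH_ALL: nest MMU escalation)";
        return "tlbie (global, RIC_FLUSH_TLB)";
}

int main(void)
{
        struct mm_sketch mm = { .thread_local = false, .single_threaded = false, .copros = 1 };

        printf("%s\n", pick_flush(&mm));
        return 0;
}
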
See barrier in switch_mm_irqs_off - */ - smp_mb(); - if (!mm_is_thread_local(mm)) { - if (unlikely(mm_is_singlethreaded(mm))) { - exit_flush_lazy_tlbs(mm); - goto local; - } - - if (mm_needs_flush_escalation(mm)) - _tlbie_pid(pid, RIC_FLUSH_ALL); - else - _tlbie_pid(pid, RIC_FLUSH_TLB); - } else { -local: - _tlbiel_pid(pid, RIC_FLUSH_TLB); - } - preempt_enable(); -} -EXPORT_SYMBOL(radix__flush_tlb_mm); - -static void __flush_all_mm(struct mm_struct *mm, bool fullmm) -{ - unsigned long pid; - - pid = mm->context.id; - if (unlikely(pid == MMU_NO_CONTEXT)) - return; - - preempt_disable(); - smp_mb(); /* see radix__flush_tlb_mm */ - if (!mm_is_thread_local(mm)) { - if (unlikely(mm_is_singlethreaded(mm))) { - if (!fullmm) { - exit_flush_lazy_tlbs(mm); - goto local; - } - } - _tlbie_pid(pid, RIC_FLUSH_ALL); - } else { -local: - _tlbiel_pid(pid, RIC_FLUSH_ALL); - } - preempt_enable(); -} -void radix__flush_all_mm(struct mm_struct *mm) -{ - __flush_all_mm(mm, false); -} -EXPORT_SYMBOL(radix__flush_all_mm); - -void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr) -{ - tlb->need_flush_all = 1; -} -EXPORT_SYMBOL(radix__flush_tlb_pwc); - -void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, - int psize) -{ - unsigned long pid; - - pid = mm->context.id; - if (unlikely(pid == MMU_NO_CONTEXT)) - return; - - preempt_disable(); - smp_mb(); /* see radix__flush_tlb_mm */ - if (!mm_is_thread_local(mm)) { - if (unlikely(mm_is_singlethreaded(mm))) { - exit_flush_lazy_tlbs(mm); - goto local; - } - _tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB); - } else { -local: - _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB); - } - preempt_enable(); -} - -void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) -{ -#ifdef CONFIG_HUGETLB_PAGE - if (is_vm_hugetlb_page(vma)) - return radix__flush_hugetlb_page(vma, vmaddr); -#endif - radix__flush_tlb_page_psize(vma->vm_mm, vmaddr, mmu_virtual_psize); -} -EXPORT_SYMBOL(radix__flush_tlb_page); - -#else /* CONFIG_SMP */ -#define radix__flush_all_mm radix__local_flush_all_mm -#endif /* CONFIG_SMP */ - -void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end) -{ - _tlbie_pid(0, RIC_FLUSH_ALL); -} -EXPORT_SYMBOL(radix__flush_tlb_kernel_range); - -#define TLB_FLUSH_ALL -1UL - -/* - * Number of pages above which we invalidate the entire PID rather than - * flush individual pages, for local and global flushes respectively. - * - * tlbie goes out to the interconnect and individual ops are more costly. - * It also does not iterate over sets like the local tlbiel variant when - * invalidating a full PID, so it has a far lower threshold to change from - * individual page flushes to full-pid flushes. 
- */ -static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; -static unsigned long tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2; - -static inline void __radix__flush_tlb_range(struct mm_struct *mm, - unsigned long start, unsigned long end, - bool flush_all_sizes) - -{ - unsigned long pid; - unsigned int page_shift = mmu_psize_defs[mmu_virtual_psize].shift; - unsigned long page_size = 1UL << page_shift; - unsigned long nr_pages = (end - start) >> page_shift; - bool local, full; - - pid = mm->context.id; - if (unlikely(pid == MMU_NO_CONTEXT)) - return; - - preempt_disable(); - smp_mb(); /* see radix__flush_tlb_mm */ - if (!mm_is_thread_local(mm)) { - if (unlikely(mm_is_singlethreaded(mm))) { - if (end != TLB_FLUSH_ALL) { - exit_flush_lazy_tlbs(mm); - goto is_local; - } - } - local = false; - full = (end == TLB_FLUSH_ALL || - nr_pages > tlb_single_page_flush_ceiling); - } else { -is_local: - local = true; - full = (end == TLB_FLUSH_ALL || - nr_pages > tlb_local_single_page_flush_ceiling); - } - - if (full) { - if (local) { - _tlbiel_pid(pid, RIC_FLUSH_TLB); - } else { - if (mm_needs_flush_escalation(mm)) - _tlbie_pid(pid, RIC_FLUSH_ALL); - else - _tlbie_pid(pid, RIC_FLUSH_TLB); - } - } else { - bool hflush = flush_all_sizes; - bool gflush = flush_all_sizes; - unsigned long hstart, hend; - unsigned long gstart, gend; - - if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) - hflush = true; - - if (hflush) { - hstart = (start + PMD_SIZE - 1) & PMD_MASK; - hend = end & PMD_MASK; - if (hstart == hend) - hflush = false; - } - - if (gflush) { - gstart = (start + PUD_SIZE - 1) & PUD_MASK; - gend = end & PUD_MASK; - if (gstart == gend) - gflush = false; - } - - asm volatile("ptesync": : :"memory"); - if (local) { - __tlbiel_va_range(start, end, pid, page_size, mmu_virtual_psize); - if (hflush) - __tlbiel_va_range(hstart, hend, pid, - PMD_SIZE, MMU_PAGE_2M); - if (gflush) - __tlbiel_va_range(gstart, gend, pid, - PUD_SIZE, MMU_PAGE_1G); - asm volatile("ptesync": : :"memory"); - } else { - __tlbie_va_range(start, end, pid, page_size, mmu_virtual_psize); - if (hflush) - __tlbie_va_range(hstart, hend, pid, - PMD_SIZE, MMU_PAGE_2M); - if (gflush) - __tlbie_va_range(gstart, gend, pid, - PUD_SIZE, MMU_PAGE_1G); - fixup_tlbie(); - asm volatile("eieio; tlbsync; ptesync": : :"memory"); - } - } - preempt_enable(); -} - -void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end) - -{ -#ifdef CONFIG_HUGETLB_PAGE - if (is_vm_hugetlb_page(vma)) - return radix__flush_hugetlb_tlb_range(vma, start, end); -#endif - - __radix__flush_tlb_range(vma->vm_mm, start, end, false); -} -EXPORT_SYMBOL(radix__flush_tlb_range); - -static int radix_get_mmu_psize(int page_size) -{ - int psize; - - if (page_size == (1UL << mmu_psize_defs[mmu_virtual_psize].shift)) - psize = mmu_virtual_psize; - else if (page_size == (1UL << mmu_psize_defs[MMU_PAGE_2M].shift)) - psize = MMU_PAGE_2M; - else if (page_size == (1UL << mmu_psize_defs[MMU_PAGE_1G].shift)) - psize = MMU_PAGE_1G; - else - return -1; - return psize; -} - -/* - * Flush partition scoped LPID address translation for all CPUs. - */ -void radix__flush_tlb_lpid_page(unsigned int lpid, - unsigned long addr, - unsigned long page_size) -{ - int psize = radix_get_mmu_psize(page_size); - - _tlbie_lpid_va(addr, lpid, psize, RIC_FLUSH_TLB); -} -EXPORT_SYMBOL_GPL(radix__flush_tlb_lpid_page); - -/* - * Flush partition scoped PWC from LPID for all CPUs. 
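
The range-flush heuristics above boil down to a page count compared against a ceiling, plus PMD-aligned sub-ranges for 2M entries. The sketch below replays that arithmetic with the ceilings quoted above; the 64K base page and the example addresses are assumptions, and the TLB_FLUSH_ALL special case is left out.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 16                        /* assumed 64K base pages */
#define PMD_SIZE   (1ULL << 21)              /* assumed 2M PMD */
#define PMD_MASK   (~(PMD_SIZE - 1))

static const unsigned long global_ceiling = 33;        /* tlbie (broadcast) */
static const unsigned long local_ceiling  = 128 * 2;   /* tlbiel, 2x the set count */

int main(void)
{
        uint64_t start = 0x10011000ULL, end = 0x10a00000ULL;  /* example range */
        unsigned long nr_pages = (unsigned long)((end - start) >> PAGE_SHIFT);
        bool local = true;
        bool full = nr_pages > (local ? local_ceiling : global_ceiling);

        /* 2M (PMD) entries only need flushing where whole PMDs are covered. */
        uint64_t hstart = (start + PMD_SIZE - 1) & PMD_MASK;
        uint64_t hend   = end & PMD_MASK;

        printf("nr_pages=%lu -> %s flush\n", nr_pages, full ? "full-PID" : "per-page");
        if (!full && hstart != hend)
                printf("also flush 2M entries for [0x%llx, 0x%llx)\n",
                       (unsigned long long)hstart, (unsigned long long)hend);
        return 0;
}
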
- */ -void radix__flush_pwc_lpid(unsigned int lpid) -{ - _tlbie_lpid(lpid, RIC_FLUSH_PWC); -} -EXPORT_SYMBOL_GPL(radix__flush_pwc_lpid); - -/* - * Flush partition scoped translations from LPID (=LPIDR) - */ -void radix__flush_tlb_lpid(unsigned int lpid) -{ - _tlbie_lpid(lpid, RIC_FLUSH_ALL); -} -EXPORT_SYMBOL_GPL(radix__flush_tlb_lpid); - -/* - * Flush partition scoped translations from LPID (=LPIDR) - */ -void radix__local_flush_tlb_lpid(unsigned int lpid) -{ - _tlbiel_lpid(lpid, RIC_FLUSH_ALL); -} -EXPORT_SYMBOL_GPL(radix__local_flush_tlb_lpid); - -/* - * Flush process scoped translations from LPID (=LPIDR). - * Important difference, the guest normally manages its own translations, - * but some cases e.g., vCPU CPU migration require KVM to flush. - */ -void radix__local_flush_tlb_lpid_guest(unsigned int lpid) -{ - _tlbiel_lpid_guest(lpid, RIC_FLUSH_ALL); -} -EXPORT_SYMBOL_GPL(radix__local_flush_tlb_lpid_guest); - - -static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start, - unsigned long end, int psize); - -void radix__tlb_flush(struct mmu_gather *tlb) -{ - int psize = 0; - struct mm_struct *mm = tlb->mm; - int page_size = tlb->page_size; - unsigned long start = tlb->start; - unsigned long end = tlb->end; - - /* - * if page size is not something we understand, do a full mm flush - * - * A "fullmm" flush must always do a flush_all_mm (RIC=2) flush - * that flushes the process table entry cache upon process teardown. - * See the comment for radix in arch_exit_mmap(). - */ - if (tlb->fullmm) { - __flush_all_mm(mm, true); -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE) - } else if (mm_tlb_flush_nested(mm)) { - /* - * If there is a concurrent invalidation that is clearing ptes, - * then it's possible this invalidation will miss one of those - * cleared ptes and miss flushing the TLB. If this invalidate - * returns before the other one flushes TLBs, that can result - * in it returning while there are still valid TLBs inside the - * range to be invalidated. - * - * See mm/memory.c:tlb_finish_mmu() for more details. - * - * The solution to this is ensure the entire range is always - * flushed here. The problem for powerpc is that the flushes - * are page size specific, so this "forced flush" would not - * do the right thing if there are a mix of page sizes in - * the range to be invalidated. So use __flush_tlb_range - * which invalidates all possible page sizes in the range. - * - * PWC flush probably is not be required because the core code - * shouldn't free page tables in this path, but accounting - * for the possibility makes us a bit more robust. - * - * need_flush_all is an uncommon case because page table - * teardown should be done with exclusive locks held (but - * after locks are dropped another invalidate could come - * in), it could be optimized further if necessary. 
- */ - if (!tlb->need_flush_all) - __radix__flush_tlb_range(mm, start, end, true); - else - radix__flush_all_mm(mm); -#endif - } else if ( (psize = radix_get_mmu_psize(page_size)) == -1) { - if (!tlb->need_flush_all) - radix__flush_tlb_mm(mm); - else - radix__flush_all_mm(mm); - } else { - if (!tlb->need_flush_all) - radix__flush_tlb_range_psize(mm, start, end, psize); - else - radix__flush_tlb_pwc_range_psize(mm, start, end, psize); - } - tlb->need_flush_all = 0; -} - -static inline void __radix__flush_tlb_range_psize(struct mm_struct *mm, - unsigned long start, unsigned long end, - int psize, bool also_pwc) -{ - unsigned long pid; - unsigned int page_shift = mmu_psize_defs[psize].shift; - unsigned long page_size = 1UL << page_shift; - unsigned long nr_pages = (end - start) >> page_shift; - bool local, full; - - pid = mm->context.id; - if (unlikely(pid == MMU_NO_CONTEXT)) - return; - - preempt_disable(); - smp_mb(); /* see radix__flush_tlb_mm */ - if (!mm_is_thread_local(mm)) { - if (unlikely(mm_is_singlethreaded(mm))) { - if (end != TLB_FLUSH_ALL) { - exit_flush_lazy_tlbs(mm); - goto is_local; - } - } - local = false; - full = (end == TLB_FLUSH_ALL || - nr_pages > tlb_single_page_flush_ceiling); - } else { -is_local: - local = true; - full = (end == TLB_FLUSH_ALL || - nr_pages > tlb_local_single_page_flush_ceiling); - } - - if (full) { - if (local) { - _tlbiel_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB); - } else { - if (mm_needs_flush_escalation(mm)) - also_pwc = true; - - _tlbie_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB); - } - } else { - if (local) - _tlbiel_va_range(start, end, pid, page_size, psize, also_pwc); - else - _tlbie_va_range(start, end, pid, page_size, psize, also_pwc); - } - preempt_enable(); -} - -void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start, - unsigned long end, int psize) -{ - return __radix__flush_tlb_range_psize(mm, start, end, psize, false); -} - -static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start, - unsigned long end, int psize) -{ - __radix__flush_tlb_range_psize(mm, start, end, psize, true); -} - -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr) -{ - unsigned long pid, end; - - pid = mm->context.id; - if (unlikely(pid == MMU_NO_CONTEXT)) - return; - - /* 4k page size, just blow the world */ - if (PAGE_SIZE == 0x1000) { - radix__flush_all_mm(mm); - return; - } - - end = addr + HPAGE_PMD_SIZE; - - /* Otherwise first do the PWC, then iterate the pages. 
*/ - preempt_disable(); - smp_mb(); /* see radix__flush_tlb_mm */ - if (!mm_is_thread_local(mm)) { - if (unlikely(mm_is_singlethreaded(mm))) { - exit_flush_lazy_tlbs(mm); - goto local; - } - _tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true); - } else { -local: - _tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true); - } - - preempt_enable(); -} -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - -void radix__flush_pmd_tlb_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) -{ - radix__flush_tlb_range_psize(vma->vm_mm, start, end, MMU_PAGE_2M); -} -EXPORT_SYMBOL(radix__flush_pmd_tlb_range); - -void radix__flush_tlb_all(void) -{ - unsigned long rb,prs,r,rs; - unsigned long ric = RIC_FLUSH_ALL; - - rb = 0x3 << PPC_BITLSHIFT(53); /* IS = 3 */ - prs = 0; /* partition scoped */ - r = 1; /* radix format */ - rs = 1 & ((1UL << 32) - 1); /* any LPID value to flush guest mappings */ - - asm volatile("ptesync": : :"memory"); - /* - * now flush guest entries by passing PRS = 1 and LPID != 0 - */ - asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) - : : "r"(rb), "i"(r), "i"(1), "i"(ric), "r"(rs) : "memory"); - /* - * now flush host entires by passing PRS = 0 and LPID == 0 - */ - asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) - : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(0) : "memory"); - asm volatile("eieio; tlbsync; ptesync": : :"memory"); -} - -#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE -extern void radix_kvm_prefetch_workaround(struct mm_struct *mm) -{ - unsigned long pid = mm->context.id; - - if (unlikely(pid == MMU_NO_CONTEXT)) - return; - - /* - * If this context hasn't run on that CPU before and KVM is - * around, there's a slim chance that the guest on another - * CPU just brought in obsolete translation into the TLB of - * this CPU due to a bad prefetch using the guest PID on - * the way into the hypervisor. - * - * We work around this here. If KVM is possible, we check if - * any sibling thread is in KVM. If it is, the window may exist - * and thus we flush that PID from the core. - * - * A potential future improvement would be to mark which PIDs - * have never been used on the system and avoid it if the PID - * is new and the process has no other cpumask bit set. - */ - if (cpu_has_feature(CPU_FTR_HVMODE) && radix_enabled()) { - int cpu = smp_processor_id(); - int sib = cpu_first_thread_sibling(cpu); - bool flush = false; - - for (; sib <= cpu_last_thread_sibling(cpu) && !flush; sib++) { - if (sib == cpu) - continue; - if (!cpu_possible(sib)) - continue; - if (paca_ptrs[sib]->kvm_hstate.kvm_vcpu) - flush = true; - } - if (flush) - _tlbiel_pid(pid, RIC_FLUSH_ALL); - } -} -EXPORT_SYMBOL_GPL(radix_kvm_prefetch_workaround); -#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c deleted file mode 100644 index 87d71dd25441..000000000000 --- a/arch/powerpc/mm/tlb_hash64.c +++ /dev/null @@ -1,259 +0,0 @@ -/* - * This file contains the routines for flushing entries from the - * TLB and MMU hash table. - * - * Derived from arch/ppc64/mm/init.c: - * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) - * - * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) - * and Cort Dougan (PReP) (cort@cs.nmt.edu) - * Copyright (C) 1996 Paul Mackerras - * - * Derived from "arch/i386/mm/init.c" - * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds - * - * Dave Engebretsen - * Rework for PPC64 port. 
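
The prefetch workaround above only needs to know whether any sibling hardware thread of this core is currently running a KVM vCPU. The sketch below reproduces that scan with a plain array standing in for the per-CPU paca state and an assumed core geometry.

#include <stdbool.h>
#include <stdio.h>

#define THREADS_PER_CORE 4           /* assumed core geometry */
#define NR_CPUS          16

static bool cpu_runs_kvm_vcpu[NR_CPUS];      /* stand-in for the paca kvm_hstate field */

static bool core_has_kvm_vcpu(int cpu)
{
        int first = cpu - (cpu % THREADS_PER_CORE);
        int sib;

        for (sib = first; sib < first + THREADS_PER_CORE; sib++) {
                if (sib == cpu)
                        continue;            /* the caller is not in KVM */
                if (cpu_runs_kvm_vcpu[sib])
                        return true;         /* a stale guest-PID prefetch is possible */
        }
        return false;
}

int main(void)
{
        cpu_runs_kvm_vcpu[6] = true;
        printf("flush needed on cpu 5: %s\n", core_has_kvm_vcpu(5) ? "yes" : "no");
        return 0;
}
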
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include - -DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch); - -/* - * A linux PTE was changed and the corresponding hash table entry - * neesd to be flushed. This function will either perform the flush - * immediately or will batch it up if the current CPU has an active - * batch on it. - */ -void hpte_need_flush(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, unsigned long pte, int huge) -{ - unsigned long vpn; - struct ppc64_tlb_batch *batch = &get_cpu_var(ppc64_tlb_batch); - unsigned long vsid; - unsigned int psize; - int ssize; - real_pte_t rpte; - int i, offset; - - i = batch->index; - - /* Get page size (maybe move back to caller). - * - * NOTE: when using special 64K mappings in 4K environment like - * for SPEs, we obtain the page size from the slice, which thus - * must still exist (and thus the VMA not reused) at the time - * of this call - */ - if (huge) { -#ifdef CONFIG_HUGETLB_PAGE - psize = get_slice_psize(mm, addr); - /* Mask the address for the correct page size */ - addr &= ~((1UL << mmu_psize_defs[psize].shift) - 1); - if (unlikely(psize == MMU_PAGE_16G)) - offset = PTRS_PER_PUD; - else - offset = PTRS_PER_PMD; -#else - BUG(); - psize = pte_pagesize_index(mm, addr, pte); /* shutup gcc */ -#endif - } else { - psize = pte_pagesize_index(mm, addr, pte); - /* Mask the address for the standard page size. If we - * have a 64k page kernel, but the hardware does not - * support 64k pages, this might be different from the - * hardware page size encoded in the slice table. */ - addr &= PAGE_MASK; - offset = PTRS_PER_PTE; - } - - - /* Build full vaddr */ - if (!is_kernel_addr(addr)) { - ssize = user_segment_size(addr); - vsid = get_user_vsid(&mm->context, addr, ssize); - } else { - vsid = get_kernel_vsid(addr, mmu_kernel_ssize); - ssize = mmu_kernel_ssize; - } - WARN_ON(vsid == 0); - vpn = hpt_vpn(addr, vsid, ssize); - rpte = __real_pte(__pte(pte), ptep, offset); - - /* - * Check if we have an active batch on this CPU. If not, just - * flush now and return. - */ - if (!batch->active) { - flush_hash_page(vpn, rpte, psize, ssize, mm_is_thread_local(mm)); - put_cpu_var(ppc64_tlb_batch); - return; - } - - /* - * This can happen when we are in the middle of a TLB batch and - * we encounter memory pressure (eg copy_page_range when it tries - * to allocate a new pte). If we have to reclaim memory and end - * up scanning and resetting referenced bits then our batch context - * will change mid stream. - * - * We also need to ensure only one page size is present in a given - * batch - */ - if (i != 0 && (mm != batch->mm || batch->psize != psize || - batch->ssize != ssize)) { - __flush_tlb_pending(batch); - i = 0; - } - if (i == 0) { - batch->mm = mm; - batch->psize = psize; - batch->ssize = ssize; - } - batch->pte[i] = rpte; - batch->vpn[i] = vpn; - batch->index = ++i; - if (i >= PPC64_TLB_BATCH_NR) - __flush_tlb_pending(batch); - put_cpu_var(ppc64_tlb_batch); -} - -/* - * This function is called when terminating an mmu batch or when a batch - * is full. It will perform the flush of all the entries currently stored - * in a batch. - * - * Must be called from within some kind of spinlock/non-preempt region... 
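
The batching rules in hpte_need_flush() reduce to: accumulate entries per CPU, and drain the batch when it fills or when the mm, page size or segment size changes mid-stream. A minimal sketch of that policy follows; the batch capacity and the integer mm identifier are assumptions of the sketch.

#include <stdio.h>

#define BATCH_NR 192                 /* assumed batch capacity */

struct batch_sketch {
        int index;
        int mm_id, psize, ssize;
};

static void flush_pending(struct batch_sketch *b)
{
        if (b->index)
                printf("flush %d entries for mm %d\n", b->index, b->mm_id);
        b->index = 0;
}

static void queue_flush(struct batch_sketch *b, int mm_id, int psize, int ssize)
{
        /* A mixed batch cannot be flushed in one go: drain it first. */
        if (b->index && (b->mm_id != mm_id || b->psize != psize || b->ssize != ssize))
                flush_pending(b);
        if (b->index == 0) {
                b->mm_id = mm_id;
                b->psize = psize;
                b->ssize = ssize;
        }
        if (++b->index >= BATCH_NR)
                flush_pending(b);
}

int main(void)
{
        struct batch_sketch b = { 0 };

        queue_flush(&b, 1, 0, 0);
        queue_flush(&b, 1, 0, 0);
        queue_flush(&b, 2, 0, 0);    /* a different mm forces the first two out */
        flush_pending(&b);           /* drain whatever is left */
        return 0;
}
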
- */ -void __flush_tlb_pending(struct ppc64_tlb_batch *batch) -{ - int i, local; - - i = batch->index; - local = mm_is_thread_local(batch->mm); - if (i == 1) - flush_hash_page(batch->vpn[0], batch->pte[0], - batch->psize, batch->ssize, local); - else - flush_hash_range(i, local); - batch->index = 0; -} - -void hash__tlb_flush(struct mmu_gather *tlb) -{ - struct ppc64_tlb_batch *tlbbatch = &get_cpu_var(ppc64_tlb_batch); - - /* If there's a TLB batch pending, then we must flush it because the - * pages are going to be freed and we really don't want to have a CPU - * access a freed page because it has a stale TLB - */ - if (tlbbatch->index) - __flush_tlb_pending(tlbbatch); - - put_cpu_var(ppc64_tlb_batch); -} - -/** - * __flush_hash_table_range - Flush all HPTEs for a given address range - * from the hash table (and the TLB). But keeps - * the linux PTEs intact. - * - * @mm : mm_struct of the target address space (generally init_mm) - * @start : starting address - * @end : ending address (not included in the flush) - * - * This function is mostly to be used by some IO hotplug code in order - * to remove all hash entries from a given address range used to map IO - * space on a removed PCI-PCI bidge without tearing down the full mapping - * since 64K pages may overlap with other bridges when using 64K pages - * with 4K HW pages on IO space. - * - * Because of that usage pattern, it is implemented for small size rather - * than speed. - */ -void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, - unsigned long end) -{ - bool is_thp; - int hugepage_shift; - unsigned long flags; - - start = _ALIGN_DOWN(start, PAGE_SIZE); - end = _ALIGN_UP(end, PAGE_SIZE); - - BUG_ON(!mm->pgd); - - /* Note: Normally, we should only ever use a batch within a - * PTE locked section. This violates the rule, but will work - * since we don't actually modify the PTEs, we just flush the - * hash while leaving the PTEs intact (including their reference - * to being hashed). This is not the most performance oriented - * way to do things but is fine for our needs here. - */ - local_irq_save(flags); - arch_enter_lazy_mmu_mode(); - for (; start < end; start += PAGE_SIZE) { - pte_t *ptep = find_current_mm_pte(mm->pgd, start, &is_thp, - &hugepage_shift); - unsigned long pte; - - if (ptep == NULL) - continue; - pte = pte_val(*ptep); - if (is_thp) - trace_hugepage_invalidate(start, pte); - if (!(pte & H_PAGE_HASHPTE)) - continue; - if (unlikely(is_thp)) - hpte_do_hugepage_flush(mm, start, (pmd_t *)ptep, pte); - else - hpte_need_flush(mm, start, ptep, pte, hugepage_shift); - } - arch_leave_lazy_mmu_mode(); - local_irq_restore(flags); -} - -void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr) -{ - pte_t *pte; - pte_t *start_pte; - unsigned long flags; - - addr = _ALIGN_DOWN(addr, PMD_SIZE); - /* Note: Normally, we should only ever use a batch within a - * PTE locked section. This violates the rule, but will work - * since we don't actually modify the PTEs, we just flush the - * hash while leaving the PTEs intact (including their reference - * to being hashed). This is not the most performance oriented - * way to do things but is fine for our needs here. 
- */ - local_irq_save(flags); - arch_enter_lazy_mmu_mode(); - start_pte = pte_offset_map(pmd, addr); - for (pte = start_pte; pte < start_pte + PTRS_PER_PTE; pte++) { - unsigned long pteval = pte_val(*pte); - if (pteval & H_PAGE_HASHPTE) - hpte_need_flush(mm, addr, pte, pteval, 0); - addr += PAGE_SIZE; - } - arch_leave_lazy_mmu_mode(); - local_irq_restore(flags); -} diff --git a/arch/powerpc/mm/vphn.c b/arch/powerpc/mm/vphn.c deleted file mode 100644 index f83044faac23..000000000000 --- a/arch/powerpc/mm/vphn.c +++ /dev/null @@ -1,71 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include "vphn.h" - -/* - * The associativity domain numbers are returned from the hypervisor as a - * stream of mixed 16-bit and 32-bit fields. The stream is terminated by the - * special value of "all ones" (aka. 0xffff) and its size may not exceed 48 - * bytes. - * - * --- 16-bit fields --> - * _________________________ - * | 0 | 1 | 2 | 3 | be_packed[0] - * ------+-----+-----+------ - * _________________________ - * | 4 | 5 | 6 | 7 | be_packed[1] - * ------------------------- - * ... - * _________________________ - * | 20 | 21 | 22 | 23 | be_packed[5] - * ------------------------- - * - * Convert to the sequence they would appear in the ibm,associativity property. - */ -int vphn_unpack_associativity(const long *packed, __be32 *unpacked) -{ - __be64 be_packed[VPHN_REGISTER_COUNT]; - int i, nr_assoc_doms = 0; - const __be16 *field = (const __be16 *) be_packed; - u16 last = 0; - bool is_32bit = false; - -#define VPHN_FIELD_UNUSED (0xffff) -#define VPHN_FIELD_MSB (0x8000) -#define VPHN_FIELD_MASK (~VPHN_FIELD_MSB) - - /* Let's fix the values returned by plpar_hcall9() */ - for (i = 0; i < VPHN_REGISTER_COUNT; i++) - be_packed[i] = cpu_to_be64(packed[i]); - - for (i = 1; i < VPHN_ASSOC_BUFSIZE; i++) { - u16 new = be16_to_cpup(field++); - - if (is_32bit) { - /* Let's concatenate the 16 bits of this field to the - * 15 lower bits of the previous field - */ - unpacked[++nr_assoc_doms] = - cpu_to_be32(last << 16 | new); - is_32bit = false; - } else if (new == VPHN_FIELD_UNUSED) - /* This is the list terminator */ - break; - else if (new & VPHN_FIELD_MSB) { - /* Data is in the lower 15 bits of this field */ - unpacked[++nr_assoc_doms] = - cpu_to_be32(new & VPHN_FIELD_MASK); - } else { - /* Data is in the lower 15 bits of this field - * concatenated with the next 16 bit field - */ - last = new; - is_32bit = true; - } - } - - /* The first cell contains the length of the property */ - unpacked[0] = cpu_to_be32(nr_assoc_doms); - - return nr_assoc_doms; -} diff --git a/arch/powerpc/mm/vphn.h b/arch/powerpc/mm/vphn.h deleted file mode 100644 index f9ffdb3942fc..000000000000 --- a/arch/powerpc/mm/vphn.h +++ /dev/null @@ -1,17 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ARCH_POWERPC_MM_VPHN_H_ -#define _ARCH_POWERPC_MM_VPHN_H_ - -/* The H_HOME_NODE_ASSOCIATIVITY h_call returns 6 64-bit registers. - */ -#define VPHN_REGISTER_COUNT 6 - -/* - * 6 64-bit registers unpacked into up to 24 be32 associativity values. To - * form the complete property we have to add the length in the first cell. - */ -#define VPHN_ASSOC_BUFSIZE (VPHN_REGISTER_COUNT*sizeof(u64)/sizeof(u16) + 1) - -extern int vphn_unpack_associativity(const long *packed, __be32 *unpacked); - -#endif -- cgit
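
The VPHN unpacking rules described in the comment above lend themselves to a small standalone reimplementation, shown below. It operates on a plain array of 16-bit fields and returns the domains zero-based; the kernel version additionally byte-swaps the hcall registers and stores the domain count in the first output cell.

#include <stdint.h>
#include <stdio.h>

#define VPHN_FIELD_UNUSED 0xffff
#define VPHN_FIELD_MSB    0x8000
#define VPHN_FIELD_MASK   0x7fff

static int vphn_unpack_sketch(const uint16_t *field, int nfields, uint32_t *out)
{
        uint16_t last = 0;
        int is_32bit = 0;
        int i, n = 0;

        for (i = 0; i < nfields; i++) {
                uint16_t new = field[i];

                if (is_32bit) {
                        /* Second half: append 16 bits to the saved high half. */
                        out[n++] = ((uint32_t)last << 16) | new;
                        is_32bit = 0;
                } else if (new == VPHN_FIELD_UNUSED) {
                        break;                             /* list terminator */
                } else if (new & VPHN_FIELD_MSB) {
                        out[n++] = new & VPHN_FIELD_MASK;  /* complete 15-bit value */
                } else {
                        last = new;                        /* high half of a 32-bit value */
                        is_32bit = 1;
                }
        }
        return n;                                          /* number of domains found */
}

int main(void)
{
        const uint16_t stream[] = { 0x8002, 0x0001, 0x2345, 0x8007, 0xffff };
        uint32_t domains[8];
        int i, n = vphn_unpack_sketch(stream, 5, domains);

        for (i = 0; i < n; i++)
                printf("domain[%d] = 0x%x\n", i, (unsigned)domains[i]);
        return 0;
}
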