summaryrefslogtreecommitdiff
path: root/arch/powerpc/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/powerpc/mm')
-rw-r--r--arch/powerpc/mm/copro_fault.c10
-rw-r--r--arch/powerpc/mm/fault.c64
-rw-r--r--arch/powerpc/mm/hash_utils_64.c66
-rw-r--r--arch/powerpc/mm/hugetlbpage-hash64.c18
-rw-r--r--arch/powerpc/mm/hugetlbpage.c31
-rw-r--r--arch/powerpc/mm/init-common.c16
-rw-r--r--arch/powerpc/mm/init_64.c38
-rw-r--r--arch/powerpc/mm/mem.c4
-rw-r--r--arch/powerpc/mm/mmu_context_iommu.c2
-rw-r--r--arch/powerpc/mm/numa.c15
-rw-r--r--arch/powerpc/mm/pgtable-book3s64.c18
-rw-r--r--arch/powerpc/mm/pgtable-radix.c265
-rw-r--r--arch/powerpc/mm/pgtable_64.c22
-rw-r--r--arch/powerpc/mm/subpage-prot.c4
-rw-r--r--arch/powerpc/mm/tlb-radix.c6
15 files changed, 436 insertions, 143 deletions
diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c
index aaa7ec6788b9..697b70ad1195 100644
--- a/arch/powerpc/mm/copro_fault.c
+++ b/arch/powerpc/mm/copro_fault.c
@@ -67,11 +67,13 @@ int copro_handle_mm_fault(struct mm_struct *mm, unsigned long ea,
if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
goto out_unlock;
/*
- * protfault should only happen due to us
- * mapping a region readonly temporarily. PROT_NONE
- * is also covered by the VMA check above.
+ * PROT_NONE is covered by the VMA check above.
+ * and hash should get a NOHPTE fault instead of
+ * a PROTFAULT in case fixup is needed for things
+ * like autonuma.
*/
- WARN_ON_ONCE(dsisr & DSISR_PROTFAULT);
+ if (!radix_enabled())
+ WARN_ON_ONCE(dsisr & DSISR_PROTFAULT);
}
ret = 0;
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 6fd30ac7d14a..8dc758658972 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -253,8 +253,11 @@ int do_page_fault(struct pt_regs *regs, unsigned long address,
if (unlikely(debugger_fault_handler(regs)))
goto bail;
- /* On a kernel SLB miss we can only check for a valid exception entry */
- if (!user_mode(regs) && (address >= TASK_SIZE)) {
+ /*
+ * The kernel should never take an execute fault nor should it
+ * take a page fault to a kernel address.
+ */
+ if (!user_mode(regs) && (is_exec || (address >= TASK_SIZE))) {
rc = SIGSEGV;
goto bail;
}
@@ -391,20 +394,6 @@ good_area:
if (is_exec) {
/*
- * An execution fault + no execute ?
- *
- * On CPUs that don't have CPU_FTR_COHERENT_ICACHE we
- * deliberately create NX mappings, and use the fault to do the
- * cache flush. This is usually handled in hash_page_do_lazy_icache()
- * but we could end up here if that races with a concurrent PTE
- * update. In that case we need to fall through here to the VMA
- * check below.
- */
- if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE) &&
- (regs->msr & SRR1_ISI_N_OR_G))
- goto bad_area;
-
- /*
* Allow execution from readable areas if the MMU does not
* provide separate controls over reading and executing.
*
@@ -418,15 +407,6 @@ good_area:
(cpu_has_feature(CPU_FTR_NOEXECUTE) ||
!(vma->vm_flags & (VM_READ | VM_WRITE))))
goto bad_area;
-
-#ifdef CONFIG_PPC_STD_MMU
- /*
- * protfault should only happen due to us
- * mapping a region readonly temporarily. PROT_NONE
- * is also covered by the VMA check above.
- */
- WARN_ON_ONCE(error_code & DSISR_PROTFAULT);
-#endif /* CONFIG_PPC_STD_MMU */
/* a write */
} else if (is_write) {
if (!(vma->vm_flags & VM_WRITE))
@@ -436,8 +416,40 @@ good_area:
} else {
if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
goto bad_area;
- WARN_ON_ONCE(error_code & DSISR_PROTFAULT);
}
+#ifdef CONFIG_PPC_STD_MMU
+ /*
+ * For hash translation mode, we should never get a
+ * PROTFAULT. Any update to pte to reduce access will result in us
+ * removing the hash page table entry, thus resulting in a DSISR_NOHPTE
+ * fault instead of DSISR_PROTFAULT.
+ *
+ * A pte update to relax the access will not result in a hash page table
+ * entry invalidate and hence can result in DSISR_PROTFAULT.
+ * ptep_set_access_flags() doesn't do a hpte flush. This is why we have
+ * the special !is_write in the below conditional.
+ *
+ * For platforms that doesn't supports coherent icache and do support
+ * per page noexec bit, we do setup things such that we do the
+ * sync between D/I cache via fault. But that is handled via low level
+ * hash fault code (hash_page_do_lazy_icache()) and we should not reach
+ * here in such case.
+ *
+ * For wrong access that can result in PROTFAULT, the above vma->vm_flags
+ * check should handle those and hence we should fall to the bad_area
+ * handling correctly.
+ *
+ * For embedded with per page exec support that doesn't support coherent
+ * icache we do get PROTFAULT and we handle that D/I cache sync in
+ * set_pte_at while taking the noexec/prot fault. Hence this is WARN_ON
+ * is conditional for server MMU.
+ *
+ * For radix, we can get prot fault for autonuma case, because radix
+ * page table will have them marked noaccess for user.
+ */
+ if (!radix_enabled() && !is_write)
+ WARN_ON_ONCE(error_code & DSISR_PROTFAULT);
+#endif /* CONFIG_PPC_STD_MMU */
/*
* If for any reason at all we couldn't handle the fault,
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 80334937e14f..12d679df50bd 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -35,7 +35,9 @@
#include <linux/memblock.h>
#include <linux/context_tracking.h>
#include <linux/libfdt.h>
+#include <linux/debugfs.h>
+#include <asm/debug.h>
#include <asm/processor.h>
#include <asm/pgtable.h>
#include <asm/mmu.h>
@@ -747,7 +749,36 @@ static unsigned long __init htab_get_table_size(void)
}
#ifdef CONFIG_MEMORY_HOTPLUG
-int create_section_mapping(unsigned long start, unsigned long end)
+void resize_hpt_for_hotplug(unsigned long new_mem_size)
+{
+ unsigned target_hpt_shift;
+
+ if (!mmu_hash_ops.resize_hpt)
+ return;
+
+ target_hpt_shift = htab_shift_for_mem_size(new_mem_size);
+
+ /*
+ * To avoid lots of HPT resizes if memory size is fluctuating
+ * across a boundary, we deliberately have some hysterisis
+ * here: we immediately increase the HPT size if the target
+ * shift exceeds the current shift, but we won't attempt to
+ * reduce unless the target shift is at least 2 below the
+ * current shift
+ */
+ if ((target_hpt_shift > ppc64_pft_size)
+ || (target_hpt_shift < (ppc64_pft_size - 1))) {
+ int rc;
+
+ rc = mmu_hash_ops.resize_hpt(target_hpt_shift);
+ if (rc)
+ printk(KERN_WARNING
+ "Unable to resize hash page table to target order %d: %d\n",
+ target_hpt_shift, rc);
+ }
+}
+
+int hash__create_section_mapping(unsigned long start, unsigned long end)
{
int rc = htab_bolt_mapping(start, end, __pa(start),
pgprot_val(PAGE_KERNEL), mmu_linear_psize,
@@ -761,7 +792,7 @@ int create_section_mapping(unsigned long start, unsigned long end)
return rc;
}
-int remove_section_mapping(unsigned long start, unsigned long end)
+int hash__remove_section_mapping(unsigned long start, unsigned long end)
{
int rc = htab_remove_mapping(start, end, mmu_linear_psize,
mmu_kernel_ssize);
@@ -1795,3 +1826,34 @@ void hash__setup_initial_memory_limit(phys_addr_t first_memblock_base,
/* Finally limit subsequent allocations */
memblock_set_current_limit(ppc64_rma_size);
}
+
+#ifdef CONFIG_DEBUG_FS
+
+static int hpt_order_get(void *data, u64 *val)
+{
+ *val = ppc64_pft_size;
+ return 0;
+}
+
+static int hpt_order_set(void *data, u64 val)
+{
+ if (!mmu_hash_ops.resize_hpt)
+ return -ENODEV;
+
+ return mmu_hash_ops.resize_hpt(val);
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(fops_hpt_order, hpt_order_get, hpt_order_set, "%llu\n");
+
+static int __init hash64_debugfs(void)
+{
+ if (!debugfs_create_file("hpt_order", 0600, powerpc_debugfs_root,
+ NULL, &fops_hpt_order)) {
+ pr_err("lpar: unable to create hpt_order debugsfs file\n");
+ }
+
+ return 0;
+}
+machine_device_initcall(pseries, hash64_debugfs);
+
+#endif /* CONFIG_DEBUG_FS */
diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c
index d5026f3800b6..a84bb44497f9 100644
--- a/arch/powerpc/mm/hugetlbpage-hash64.c
+++ b/arch/powerpc/mm/hugetlbpage-hash64.c
@@ -116,21 +116,3 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
*ptep = __pte(new_pte & ~H_PAGE_BUSY);
return 0;
}
-
-#if defined(CONFIG_PPC_64K_PAGES) && defined(CONFIG_DEBUG_VM)
-/*
- * This enables us to catch the wrong page directory format
- * Moved here so that we can use WARN() in the call.
- */
-int hugepd_ok(hugepd_t hpd)
-{
- bool is_hugepd;
-
- /*
- * We should not find this format in page directory, warn otherwise.
- */
- is_hugepd = (((hpd.pd & 0x3) == 0x0) && ((hpd.pd & HUGEPD_SHIFT_MASK) != 0));
- WARN(is_hugepd, "Found wrong page directory format\n");
- return 0;
-}
-#endif
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 289df38fb7e0..8c3389cbcd12 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -53,7 +53,7 @@ static u64 gpage_freearray[MAX_NUMBER_GPAGES];
static unsigned nr_gpages;
#endif
-#define hugepd_none(hpd) ((hpd).pd == 0)
+#define hugepd_none(hpd) (hpd_val(hpd) == 0)
pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{
@@ -103,24 +103,24 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
for (i = 0; i < num_hugepd; i++, hpdp++) {
if (unlikely(!hugepd_none(*hpdp)))
break;
- else
+ else {
#ifdef CONFIG_PPC_BOOK3S_64
- hpdp->pd = __pa(new) |
- (shift_to_mmu_psize(pshift) << 2);
+ *hpdp = __hugepd(__pa(new) |
+ (shift_to_mmu_psize(pshift) << 2));
#elif defined(CONFIG_PPC_8xx)
- hpdp->pd = __pa(new) |
- (pshift == PAGE_SHIFT_8M ? _PMD_PAGE_8M :
- _PMD_PAGE_512K) |
- _PMD_PRESENT;
+ *hpdp = __hugepd(__pa(new) |
+ (pshift == PAGE_SHIFT_8M ? _PMD_PAGE_8M :
+ _PMD_PAGE_512K) | _PMD_PRESENT);
#else
/* We use the old format for PPC_FSL_BOOK3E */
- hpdp->pd = ((unsigned long)new & ~PD_HUGE) | pshift;
+ *hpdp = __hugepd(((unsigned long)new & ~PD_HUGE) | pshift);
#endif
+ }
}
/* If we bailed from the for loop early, an error occurred, clean up */
if (i < num_hugepd) {
for (i = i - 1 ; i >= 0; i--, hpdp--)
- hpdp->pd = 0;
+ *hpdp = __hugepd(0);
kmem_cache_free(cachep, new);
}
spin_unlock(&mm->page_table_lock);
@@ -454,7 +454,7 @@ static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshif
return;
for (i = 0; i < num_hugepd; i++, hpdp++)
- hpdp->pd = 0;
+ *hpdp = __hugepd(0);
if (shift >= pdshift)
hugepd_free(tlb, hugepte);
@@ -810,12 +810,8 @@ static int __init hugetlbpage_init(void)
* if we have pdshift and shift value same, we don't
* use pgt cache for hugepd.
*/
- if (pdshift > shift) {
+ if (pdshift > shift)
pgtable_cache_add(pdshift - shift, NULL);
- if (!PGT_CACHE(pdshift - shift))
- panic("hugetlbpage_init(): could not create "
- "pgtable cache for %d bit pagesize\n", shift);
- }
#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
else if (!hugepte_cache) {
/*
@@ -852,9 +848,6 @@ static int __init hugetlbpage_init(void)
else if (mmu_psize_defs[MMU_PAGE_2M].shift)
HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_2M].shift;
#endif
- else
- panic("%s: Unable to set default huge page size\n", __func__);
-
return 0;
}
diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c
index a175cd82ae8c..eb8c6c8c4851 100644
--- a/arch/powerpc/mm/init-common.c
+++ b/arch/powerpc/mm/init-common.c
@@ -41,6 +41,7 @@ static void pmd_ctor(void *addr)
}
struct kmem_cache *pgtable_cache[MAX_PGTABLE_INDEX_SIZE];
+EXPORT_SYMBOL_GPL(pgtable_cache); /* used by kvm_hv module */
/*
* Create a kmem_cache() for pagetables. This is not used for PTE
@@ -78,17 +79,21 @@ void pgtable_cache_add(unsigned shift, void (*ctor)(void *))
align = max_t(unsigned long, align, minalign);
name = kasprintf(GFP_KERNEL, "pgtable-2^%d", shift);
new = kmem_cache_create(name, table_size, align, 0, ctor);
+ if (!new)
+ panic("Could not allocate pgtable cache for order %d", shift);
+
kfree(name);
pgtable_cache[shift - 1] = new;
+
pr_debug("Allocated pgtable cache for order %d\n", shift);
}
-
+EXPORT_SYMBOL_GPL(pgtable_cache_add); /* used by kvm_hv module */
void pgtable_cache_init(void)
{
pgtable_cache_add(PGD_INDEX_SIZE, pgd_ctor);
- if (PMD_INDEX_SIZE && !PGT_CACHE(PMD_INDEX_SIZE))
+ if (PMD_CACHE_INDEX && !PGT_CACHE(PMD_CACHE_INDEX))
pgtable_cache_add(PMD_CACHE_INDEX, pmd_ctor);
/*
* In all current configs, when the PUD index exists it's the
@@ -97,11 +102,4 @@ void pgtable_cache_init(void)
*/
if (PUD_INDEX_SIZE && !PGT_CACHE(PUD_INDEX_SIZE))
pgtable_cache_add(PUD_INDEX_SIZE, pud_ctor);
-
- if (!PGT_CACHE(PGD_INDEX_SIZE))
- panic("Couldn't allocate pgd cache");
- if (PMD_INDEX_SIZE && !PGT_CACHE(PMD_INDEX_SIZE))
- panic("Couldn't allocate pmd pgtable caches");
- if (PUD_INDEX_SIZE && !PGT_CACHE(PUD_INDEX_SIZE))
- panic("Couldn't allocate pud pgtable caches");
}
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 93abf8a9813d..6aa3b76aa0d6 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -42,6 +42,8 @@
#include <linux/memblock.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>
+#include <linux/of_fdt.h>
+#include <linux/libfdt.h>
#include <asm/pgalloc.h>
#include <asm/page.h>
@@ -344,12 +346,46 @@ static int __init parse_disable_radix(char *p)
}
early_param("disable_radix", parse_disable_radix);
+/*
+ * If we're running under a hypervisor, we need to check the contents of
+ * /chosen/ibm,architecture-vec-5 to see if the hypervisor is willing to do
+ * radix. If not, we clear the radix feature bit so we fall back to hash.
+ */
+static void early_check_vec5(void)
+{
+ unsigned long root, chosen;
+ int size;
+ const u8 *vec5;
+
+ root = of_get_flat_dt_root();
+ chosen = of_get_flat_dt_subnode_by_name(root, "chosen");
+ if (chosen == -FDT_ERR_NOTFOUND)
+ return;
+ vec5 = of_get_flat_dt_prop(chosen, "ibm,architecture-vec-5", &size);
+ if (!vec5)
+ return;
+ if (size <= OV5_INDX(OV5_MMU_RADIX_300) ||
+ !(vec5[OV5_INDX(OV5_MMU_RADIX_300)] & OV5_FEAT(OV5_MMU_RADIX_300)))
+ /* Hypervisor doesn't support radix */
+ cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX;
+}
+
void __init mmu_early_init_devtree(void)
{
/* Disable radix mode based on kernel command line. */
- if (disable_radix)
+ /* We don't yet have the machinery to do radix as a guest. */
+ if (disable_radix || !(mfmsr() & MSR_HV))
cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX;
+ /*
+ * Check /chosen/ibm,architecture-vec-5 if running as a guest.
+ * When running bare-metal, we can use radix if we like
+ * even though the ibm,architecture-vec-5 property created by
+ * skiboot doesn't have the necessary bits set.
+ */
+ if (early_radix_enabled() && !(mfmsr() & MSR_HV))
+ early_check_vec5();
+
if (early_radix_enabled())
radix__early_init_devtree();
else
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 5f844337de21..9ee536ec0739 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -134,6 +134,8 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
unsigned long nr_pages = size >> PAGE_SHIFT;
int rc;
+ resize_hpt_for_hotplug(memblock_phys_mem_size());
+
pgdata = NODE_DATA(nid);
start = (unsigned long)__va(start);
@@ -174,6 +176,8 @@ int arch_remove_memory(u64 start, u64 size)
*/
vm_unmap_aliases();
+ resize_hpt_for_hotplug(memblock_phys_mem_size());
+
return ret;
}
#endif
diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c
index 104bad029ce9..7de7124ac91b 100644
--- a/arch/powerpc/mm/mmu_context_iommu.c
+++ b/arch/powerpc/mm/mmu_context_iommu.c
@@ -184,7 +184,7 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
* of the CMA zone if possible. NOTE: faulting in + migration
* can be expensive. Batching can be considered later
*/
- if (get_pageblock_migratetype(page) == MIGRATE_CMA) {
+ if (is_migrate_cma_page(page)) {
if (mm_iommu_move_page_from_cma(page))
goto populate;
if (1 != get_user_pages_fast(ua + (i << PAGE_SHIFT),
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index b1099cb2f393..9befaee237d6 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -290,7 +290,7 @@ int of_node_to_nid(struct device_node *device)
return nid;
}
-EXPORT_SYMBOL_GPL(of_node_to_nid);
+EXPORT_SYMBOL(of_node_to_nid);
static int __init find_min_common_depth(void)
{
@@ -786,14 +786,9 @@ new_range:
fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid);
node_set_online(nid);
- if (!(size = numa_enforce_memory_limit(start, size))) {
- if (--ranges)
- goto new_range;
- else
- continue;
- }
-
- memblock_set_node(start, size, &memblock.memory, nid);
+ size = numa_enforce_memory_limit(start, size);
+ if (size)
+ memblock_set_node(start, size, &memblock.memory, nid);
if (--ranges)
goto new_range;
@@ -1098,7 +1093,7 @@ int hot_add_scn_to_nid(unsigned long scn_addr)
nid = hot_add_node_scn_to_nid(scn_addr);
}
- if (nid < 0 || !node_online(nid))
+ if (nid < 0 || !node_possible(nid))
nid = first_online_node;
return nid;
diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c
index ebf9782bacf9..b798ff674fab 100644
--- a/arch/powerpc/mm/pgtable-book3s64.c
+++ b/arch/powerpc/mm/pgtable-book3s64.c
@@ -126,3 +126,21 @@ void mmu_cleanup_all(void)
else if (mmu_hash_ops.hpte_clear_all)
mmu_hash_ops.hpte_clear_all();
}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+int create_section_mapping(unsigned long start, unsigned long end)
+{
+ if (radix_enabled())
+ return radix__create_section_mapping(start, end);
+
+ return hash__create_section_mapping(start, end);
+}
+
+int remove_section_mapping(unsigned long start, unsigned long end)
+{
+ if (radix_enabled())
+ return radix__remove_section_mapping(start, end);
+
+ return hash__remove_section_mapping(start, end);
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
index cfa53ccc8baf..feeda90cd06d 100644
--- a/arch/powerpc/mm/pgtable-radix.c
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -18,6 +18,7 @@
#include <asm/machdep.h>
#include <asm/mmu.h>
#include <asm/firmware.h>
+#include <asm/powernv.h>
#include <trace/events/thp.h>
@@ -65,7 +66,7 @@ int radix__map_kernel_page(unsigned long ea, unsigned long pa,
if (!pmdp)
return -ENOMEM;
if (map_page_size == PMD_SIZE) {
- ptep = (pte_t *)pudp;
+ ptep = pmdp_ptep(pmdp);
goto set_the_pte;
}
ptep = pte_alloc_kernel(pmdp, ea);
@@ -90,7 +91,7 @@ int radix__map_kernel_page(unsigned long ea, unsigned long pa,
}
pmdp = pmd_offset(pudp, ea);
if (map_page_size == PMD_SIZE) {
- ptep = (pte_t *)pudp;
+ ptep = pmdp_ptep(pmdp);
goto set_the_pte;
}
if (!pmd_present(*pmdp)) {
@@ -107,54 +108,66 @@ set_the_pte:
return 0;
}
+static inline void __meminit print_mapping(unsigned long start,
+ unsigned long end,
+ unsigned long size)
+{
+ if (end <= start)
+ return;
+
+ pr_info("Mapped range 0x%lx - 0x%lx with 0x%lx\n", start, end, size);
+}
+
+static int __meminit create_physical_mapping(unsigned long start,
+ unsigned long end)
+{
+ unsigned long addr, mapping_size = 0;
+
+ start = _ALIGN_UP(start, PAGE_SIZE);
+ for (addr = start; addr < end; addr += mapping_size) {
+ unsigned long gap, previous_size;
+ int rc;
+
+ gap = end - addr;
+ previous_size = mapping_size;
+
+ if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE &&
+ mmu_psize_defs[MMU_PAGE_1G].shift)
+ mapping_size = PUD_SIZE;
+ else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE &&
+ mmu_psize_defs[MMU_PAGE_2M].shift)
+ mapping_size = PMD_SIZE;
+ else
+ mapping_size = PAGE_SIZE;
+
+ if (mapping_size != previous_size) {
+ print_mapping(start, addr, previous_size);
+ start = addr;
+ }
+
+ rc = radix__map_kernel_page((unsigned long)__va(addr), addr,
+ PAGE_KERNEL_X, mapping_size);
+ if (rc)
+ return rc;
+ }
+
+ print_mapping(start, addr, mapping_size);
+ return 0;
+}
+
static void __init radix_init_pgtable(void)
{
- int loop_count;
- u64 base, end, start_addr;
unsigned long rts_field;
struct memblock_region *reg;
- unsigned long linear_page_size;
/* We don't support slb for radix */
mmu_slb_size = 0;
/*
* Create the linear mapping, using standard page size for now
*/
- loop_count = 0;
- for_each_memblock(memory, reg) {
-
- start_addr = reg->base;
-
-redo:
- if (loop_count < 1 && mmu_psize_defs[MMU_PAGE_1G].shift)
- linear_page_size = PUD_SIZE;
- else if (loop_count < 2 && mmu_psize_defs[MMU_PAGE_2M].shift)
- linear_page_size = PMD_SIZE;
- else
- linear_page_size = PAGE_SIZE;
-
- base = _ALIGN_UP(start_addr, linear_page_size);
- end = _ALIGN_DOWN(reg->base + reg->size, linear_page_size);
-
- pr_info("Mapping range 0x%lx - 0x%lx with 0x%lx\n",
- (unsigned long)base, (unsigned long)end,
- linear_page_size);
-
- while (base < end) {
- radix__map_kernel_page((unsigned long)__va(base),
- base, PAGE_KERNEL_X,
- linear_page_size);
- base += linear_page_size;
- }
- /*
- * map the rest using lower page size
- */
- if (end < reg->base + reg->size) {
- start_addr = end;
- loop_count++;
- goto redo;
- }
- }
+ for_each_memblock(memory, reg)
+ WARN_ON(create_physical_mapping(reg->base,
+ reg->base + reg->size));
/*
* Allocate Partition table and process table for the
* host.
@@ -401,6 +414,8 @@ void __init radix__early_init_mmu(void)
mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
radix_init_partition_table();
radix_init_amor();
+ } else {
+ radix_init_pseries();
}
memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);
@@ -438,6 +453,7 @@ void radix__mmu_cleanup_all(void)
lpcr = mfspr(SPRN_LPCR);
mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT);
mtspr(SPRN_PTCR, 0);
+ powernv_set_nmmu_ptcr(0);
radix__flush_tlb_all();
}
}
@@ -467,6 +483,173 @@ void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base,
memblock_set_current_limit(first_memblock_base + first_memblock_size);
}
+#ifdef CONFIG_MEMORY_HOTPLUG
+static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
+{
+ pte_t *pte;
+ int i;
+
+ for (i = 0; i < PTRS_PER_PTE; i++) {
+ pte = pte_start + i;
+ if (!pte_none(*pte))
+ return;
+ }
+
+ pte_free_kernel(&init_mm, pte_start);
+ pmd_clear(pmd);
+}
+
+static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
+{
+ pmd_t *pmd;
+ int i;
+
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ pmd = pmd_start + i;
+ if (!pmd_none(*pmd))
+ return;
+ }
+
+ pmd_free(&init_mm, pmd_start);
+ pud_clear(pud);
+}
+
+static void remove_pte_table(pte_t *pte_start, unsigned long addr,
+ unsigned long end)
+{
+ unsigned long next;
+ pte_t *pte;
+
+ pte = pte_start + pte_index(addr);
+ for (; addr < end; addr = next, pte++) {
+ next = (addr + PAGE_SIZE) & PAGE_MASK;
+ if (next > end)
+ next = end;
+
+ if (!pte_present(*pte))
+ continue;
+
+ if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(next)) {
+ /*
+ * The vmemmap_free() and remove_section_mapping()
+ * codepaths call us with aligned addresses.
+ */
+ WARN_ONCE(1, "%s: unaligned range\n", __func__);
+ continue;
+ }
+
+ pte_clear(&init_mm, addr, pte);
+ }
+}
+
+static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
+ unsigned long end)
+{
+ unsigned long next;
+ pte_t *pte_base;
+ pmd_t *pmd;
+
+ pmd = pmd_start + pmd_index(addr);
+ for (; addr < end; addr = next, pmd++) {
+ next = pmd_addr_end(addr, end);
+
+ if (!pmd_present(*pmd))
+ continue;
+
+ if (pmd_huge(*pmd)) {
+ if (!IS_ALIGNED(addr, PMD_SIZE) ||
+ !IS_ALIGNED(next, PMD_SIZE)) {
+ WARN_ONCE(1, "%s: unaligned range\n", __func__);
+ continue;
+ }
+
+ pte_clear(&init_mm, addr, (pte_t *)pmd);
+ continue;
+ }
+
+ pte_base = (pte_t *)pmd_page_vaddr(*pmd);
+ remove_pte_table(pte_base, addr, next);
+ free_pte_table(pte_base, pmd);
+ }
+}
+
+static void remove_pud_table(pud_t *pud_start, unsigned long addr,
+ unsigned long end)
+{
+ unsigned long next;
+ pmd_t *pmd_base;
+ pud_t *pud;
+
+ pud = pud_start + pud_index(addr);
+ for (; addr < end; addr = next, pud++) {
+ next = pud_addr_end(addr, end);
+
+ if (!pud_present(*pud))
+ continue;
+
+ if (pud_huge(*pud)) {
+ if (!IS_ALIGNED(addr, PUD_SIZE) ||
+ !IS_ALIGNED(next, PUD_SIZE)) {
+ WARN_ONCE(1, "%s: unaligned range\n", __func__);
+ continue;
+ }
+
+ pte_clear(&init_mm, addr, (pte_t *)pud);
+ continue;
+ }
+
+ pmd_base = (pmd_t *)pud_page_vaddr(*pud);
+ remove_pmd_table(pmd_base, addr, next);
+ free_pmd_table(pmd_base, pud);
+ }
+}
+
+static void remove_pagetable(unsigned long start, unsigned long end)
+{
+ unsigned long addr, next;
+ pud_t *pud_base;
+ pgd_t *pgd;
+
+ spin_lock(&init_mm.page_table_lock);
+
+ for (addr = start; addr < end; addr = next) {
+ next = pgd_addr_end(addr, end);
+
+ pgd = pgd_offset_k(addr);
+ if (!pgd_present(*pgd))
+ continue;
+
+ if (pgd_huge(*pgd)) {
+ if (!IS_ALIGNED(addr, PGDIR_SIZE) ||
+ !IS_ALIGNED(next, PGDIR_SIZE)) {
+ WARN_ONCE(1, "%s: unaligned range\n", __func__);
+ continue;
+ }
+
+ pte_clear(&init_mm, addr, (pte_t *)pgd);
+ continue;
+ }
+
+ pud_base = (pud_t *)pgd_page_vaddr(*pgd);
+ remove_pud_table(pud_base, addr, next);
+ }
+
+ spin_unlock(&init_mm.page_table_lock);
+ radix__flush_tlb_kernel_range(start, end);
+}
+
+int __ref radix__create_section_mapping(unsigned long start, unsigned long end)
+{
+ return create_physical_mapping(start, end);
+}
+
+int radix__remove_section_mapping(unsigned long start, unsigned long end)
+{
+ remove_pagetable(start, end);
+ return 0;
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
#ifdef CONFIG_SPARSEMEM_VMEMMAP
int __meminit radix__vmemmap_create_mapping(unsigned long start,
unsigned long page_size,
@@ -482,7 +665,7 @@ int __meminit radix__vmemmap_create_mapping(unsigned long start,
#ifdef CONFIG_MEMORY_HOTPLUG
void radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size)
{
- /* FIXME!! intel does more. We should free page tables mapping vmemmap ? */
+ remove_pagetable(start, start + page_size);
}
#endif
#endif
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 8bca7f58afc4..db93cf747a03 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -52,6 +52,7 @@
#include <asm/sections.h>
#include <asm/firmware.h>
#include <asm/dma.h>
+#include <asm/powernv.h>
#include "mmu_decl.h"
@@ -436,6 +437,7 @@ void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
void __init mmu_partition_table_init(void)
{
unsigned long patb_size = 1UL << PATB_SIZE_SHIFT;
+ unsigned long ptcr;
BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 36), "Partition table size too large.");
partition_tb = __va(memblock_alloc_base(patb_size, patb_size,
@@ -448,19 +450,31 @@ void __init mmu_partition_table_init(void)
* update partition table control register,
* 64 K size.
*/
- mtspr(SPRN_PTCR, __pa(partition_tb) | (PATB_SIZE_SHIFT - 12));
+ ptcr = __pa(partition_tb) | (PATB_SIZE_SHIFT - 12);
+ mtspr(SPRN_PTCR, ptcr);
+ powernv_set_nmmu_ptcr(ptcr);
}
void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0,
unsigned long dw1)
{
+ unsigned long old = be64_to_cpu(partition_tb[lpid].patb0);
+
partition_tb[lpid].patb0 = cpu_to_be64(dw0);
partition_tb[lpid].patb1 = cpu_to_be64(dw1);
- /* Global flush of TLBs and partition table caches for this lpid */
+ /*
+ * Global flush of TLBs and partition table caches for this lpid.
+ * The type of flush (hash or radix) depends on what the previous
+ * use of this partition ID was, not the new use.
+ */
asm volatile("ptesync" : : : "memory");
- asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : :
- "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
+ if (old & PATB_HR)
+ asm volatile(PPC_TLBIE_5(%0,%1,2,0,1) : :
+ "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
+ else
+ asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : :
+ "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
asm volatile("eieio; tlbsync; ptesync" : : : "memory");
}
EXPORT_SYMBOL_GPL(mmu_partition_table_set_entry);
diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c
index 5c096c01e8bd..94210940112f 100644
--- a/arch/powerpc/mm/subpage-prot.c
+++ b/arch/powerpc/mm/subpage-prot.c
@@ -248,9 +248,8 @@ long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map)
nw = (next - addr) >> PAGE_SHIFT;
up_write(&mm->mmap_sem);
- err = -EFAULT;
if (__copy_from_user(spp, map, nw * sizeof(u32)))
- goto out2;
+ return -EFAULT;
map += nw;
down_write(&mm->mmap_sem);
@@ -262,6 +261,5 @@ long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map)
err = 0;
out:
up_write(&mm->mmap_sem);
- out2:
return err;
}
diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index 61b79119065f..952713d6cf04 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -50,9 +50,7 @@ static inline void _tlbiel_pid(unsigned long pid, unsigned long ric)
for (set = 0; set < POWER9_TLB_SETS_RADIX ; set++) {
__tlbiel_pid(pid, set, ric);
}
- if (cpu_has_feature(CPU_FTR_POWER9_DD1))
- asm volatile(PPC_INVALIDATE_ERAT : : :"memory");
- return;
+ asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
}
static inline void _tlbie_pid(unsigned long pid, unsigned long ric)
@@ -85,8 +83,6 @@ static inline void _tlbiel_va(unsigned long va, unsigned long pid,
asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
: : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
asm volatile("ptesync": : :"memory");
- if (cpu_has_feature(CPU_FTR_POWER9_DD1))
- asm volatile(PPC_INVALIDATE_ERAT : : :"memory");
}
static inline void _tlbie_va(unsigned long va, unsigned long pid,