1 files changed, 1073 insertions, 607 deletions
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index a9c42a7ffb6a..df9f7c444c39 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  *  arch/sparc64/mm/init.c
  *
@@ -5,12 +6,12 @@
  *  Copyright (C) 1997-1999 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
  */
  
-#include <linux/module.h>
+#include <linux/extable.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
 #include <linux/string.h>
 #include <linux/init.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
 #include <linux/mm.h>
 #include <linux/hugetlb.h>
 #include <linux/initrd.h>
@@ -22,19 +23,19 @@
 #include <linux/kprobes.h>
 #include <linux/cache.h>
 #include <linux/sort.h>
+#include <linux/ioport.h>
 #include <linux/percpu.h>
-#include <linux/memblock.h>
 #include <linux/mmzone.h>
 #include <linux/gfp.h>
+#include <linux/bootmem_info.h>
 
 #include <asm/head.h>
 #include <asm/page.h>
 #include <asm/pgalloc.h>
-#include <asm/pgtable.h>
 #include <asm/oplib.h>
 #include <asm/iommu.h>
 #include <asm/io.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 #include <asm/mmu_context.h>
 #include <asm/tlbflush.h>
 #include <asm/dma.h>
@@ -47,11 +48,13 @@
 #include <asm/prom.h>
 #include <asm/mdesc.h>
 #include <asm/cpudata.h>
+#include <asm/setup.h>
 #include <asm/irq.h>
 
 #include "init_64.h"
 
 unsigned long kern_linear_pte_xor[4] __read_mostly;
+static unsigned long page_cache4v_flag;
 
 /* A bitmap, two bits for every 256MB of physical memory.  These two
  * bits determine what page size we use for kernel linear
@@ -73,7 +76,6 @@ unsigned long kern_linear_pte_xor[4] __read_mostly;
  * 'cpu' properties, but we need to have this table setup before the
  * MDESC is initialized.
  */
-unsigned long kpte_linear_bitmap[KPTE_BITMAP_BYTES / sizeof(unsigned long)];
 
 #ifndef CONFIG_DEBUG_PAGEALLOC
 /* A special kernel TSB for 4MB, 256MB, 2GB and 16GB linear mappings.
@@ -82,14 +84,17 @@ unsigned long kpte_linear_bitmap[KPTE_BITMAP_BYTES / sizeof(unsigned long)];
  */
 extern struct tsb swapper_4m_tsb[KERNEL_TSB4M_NENTRIES];
 #endif
+extern struct tsb swapper_tsb[KERNEL_TSB_NENTRIES];
 
 static unsigned long cpu_pgsz_mask;
 
-#define MAX_BANKS	32
+#define MAX_BANKS	1024
 
 static struct linux_prom64_registers pavail[MAX_BANKS];
 static int pavail_ents;
 
+u64 numa_latency[MAX_NUMNODES][MAX_NUMNODES];
+
 static int cmp_p64(const void *a, const void *b)
 {
 	const struct linux_prom64_registers *x = a, *y = b;
@@ -163,10 +168,6 @@ static void __init read_obp_memory(const char *property,
 	     cmp_p64, NULL);
 }
 
-unsigned long sparc64_valid_addr_bitmap[VALID_ADDR_BITMAP_BYTES /
-					sizeof(unsigned long)];
-EXPORT_SYMBOL(sparc64_valid_addr_bitmap);
-
 /* Kernel physical address base and size in bytes.  */
 unsigned long kern_base __read_mostly;
 unsigned long kern_size __read_mostly;
@@ -194,21 +195,26 @@ atomic_t dcpage_flushes_xcall = ATOMIC_INIT(0);
 #endif
 #endif
 
-inline void flush_dcache_page_impl(struct page *page)
+inline void flush_dcache_folio_impl(struct folio *folio)
 {
+	unsigned int i, nr = folio_nr_pages(folio);
+
 	BUG_ON(tlb_type == hypervisor);
 #ifdef CONFIG_DEBUG_DCFLUSH
 	atomic_inc(&dcpage_flushes);
 #endif
 
 #ifdef DCACHE_ALIASING_POSSIBLE
-	__flush_dcache_page(page_address(page),
-			    ((tlb_type == spitfire) &&
-			     page_mapping(page) != NULL));
+	for (i = 0; i < nr; i++)
+		__flush_dcache_page(folio_address(folio) + i * PAGE_SIZE,
+				    ((tlb_type == spitfire) &&
+				     folio_flush_mapping(folio) != NULL));
 #else
-	if (page_mapping(page) != NULL &&
-	    tlb_type == spitfire)
-		__flush_icache_page(__pa(page_address(page)));
+	if (folio_flush_mapping(folio) != NULL &&
+	    tlb_type == spitfire) {
+		for (i = 0; i < nr; i++)
+			__flush_icache_page((pfn + i) * PAGE_SIZE);
+	}
 #endif
 }
 
@@ -217,10 +223,10 @@ inline void flush_dcache_page_impl(struct page *page)
 #define PG_dcache_cpu_mask	\
 	((1UL<<ilog2(roundup_pow_of_two(NR_CPUS)))-1UL)
 
-#define dcache_dirty_cpu(page) \
-	(((page)->flags >> PG_dcache_cpu_shift) & PG_dcache_cpu_mask)
+#define dcache_dirty_cpu(folio) \
+	(((folio)->flags.f >> PG_dcache_cpu_shift) & PG_dcache_cpu_mask)
 
-static inline void set_dcache_dirty(struct page *page, int this_cpu)
+static inline void set_dcache_dirty(struct folio *folio, int this_cpu)
 {
 	unsigned long mask = this_cpu;
 	unsigned long non_cpu_bits;
@@ -237,11 +243,11 @@ static inline void set_dcache_dirty(struct page *page, int this_cpu)
 			     "bne,pn	%%xcc, 1b\n\t"
 			     " nop"
 			     : /* no outputs */
-			     : "r" (mask), "r" (non_cpu_bits), "r" (&page->flags)
+			     : "r" (mask), "r" (non_cpu_bits), "r" (&folio->flags.f)
 			     : "g1", "g7");
 }
 
-static inline void clear_dcache_dirty_cpu(struct page *page, unsigned long cpu)
+static inline void clear_dcache_dirty_cpu(struct folio *folio, unsigned long cpu)
 {
 	unsigned long mask = (1UL << PG_dcache_dirty);
 
@@ -259,7 +265,7 @@ static inline void clear_dcache_dirty_cpu(struct page *page, unsigned long cpu)
 			     " nop\n"
 			     "2:"
 			     : /* no outputs */
-			     : "r" (cpu), "r" (mask), "r" (&page->flags),
+			     : "r" (cpu), "r" (mask), "r" (&folio->flags.f),
 			       "i" (PG_dcache_cpu_mask),
 			       "i" (PG_dcache_cpu_shift)
 			     : "g1", "g7");
@@ -283,9 +289,10 @@ static void flush_dcache(unsigned long pfn)
 
 	page = pfn_to_page(pfn);
 	if (page) {
+		struct folio *folio = page_folio(page);
 		unsigned long pg_flags;
 
-		pg_flags = page->flags;
+		pg_flags = folio->flags.f;
 		if (pg_flags & (1UL << PG_dcache_dirty)) {
 			int cpu = ((pg_flags >> PG_dcache_cpu_shift) &
 				   PG_dcache_cpu_mask);
@@ -295,11 +302,11 @@ static void flush_dcache(unsigned long pfn)
 			 * in the SMP case.
 			 */
 			if (cpu == this_cpu)
-				flush_dcache_page_impl(page);
+				flush_dcache_folio_impl(folio);
 			else
-				smp_flush_dcache_page_impl(page, cpu);
+				smp_flush_dcache_folio_impl(folio, cpu);
 
-			clear_dcache_dirty_cpu(page, cpu);
+			clear_dcache_dirty_cpu(folio, cpu);
 
 			put_cpu();
 		}
@@ -323,23 +330,78 @@ static void __update_mmu_tsb_insert(struct mm_struct *mm, unsigned long tsb_inde
 	tsb_insert(tsb, tag, tte);
 }
 
-#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
-static inline bool is_hugetlb_pte(pte_t pte)
+#ifdef CONFIG_HUGETLB_PAGE
+static int __init hugetlbpage_init(void)
 {
-	if ((tlb_type == hypervisor &&
-	     (pte_val(pte) & _PAGE_SZALL_4V) == _PAGE_SZHUGE_4V) ||
-	    (tlb_type != hypervisor &&
-	     (pte_val(pte) & _PAGE_SZALL_4U) == _PAGE_SZHUGE_4U))
-		return true;
-	return false;
+	hugetlb_add_hstate(HPAGE_64K_SHIFT - PAGE_SHIFT);
+	hugetlb_add_hstate(HPAGE_SHIFT - PAGE_SHIFT);
+	hugetlb_add_hstate(HPAGE_256MB_SHIFT - PAGE_SHIFT);
+	hugetlb_add_hstate(HPAGE_2GB_SHIFT - PAGE_SHIFT);
+
+	return 0;
 }
-#endif
 
-void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep)
+arch_initcall(hugetlbpage_init);
+
+static void __init pud_huge_patch(void)
+{
+	struct pud_huge_patch_entry *p;
+	unsigned long addr;
+
+	p = &__pud_huge_patch;
+	addr = p->addr;
+	*(unsigned int *)addr = p->insn;
+
+	__asm__ __volatile__("flush %0" : : "r" (addr));
+}
+
+bool __init arch_hugetlb_valid_size(unsigned long size)
+{
+	unsigned int hugepage_shift = ilog2(size);
+	unsigned short hv_pgsz_idx;
+	unsigned int hv_pgsz_mask;
+
+	switch (hugepage_shift) {
+	case HPAGE_16GB_SHIFT:
+		hv_pgsz_mask = HV_PGSZ_MASK_16GB;
+		hv_pgsz_idx = HV_PGSZ_IDX_16GB;
+		pud_huge_patch();
+		break;
+	case HPAGE_2GB_SHIFT:
+		hv_pgsz_mask = HV_PGSZ_MASK_2GB;
+		hv_pgsz_idx = HV_PGSZ_IDX_2GB;
+		break;
+	case HPAGE_256MB_SHIFT:
+		hv_pgsz_mask = HV_PGSZ_MASK_256MB;
+		hv_pgsz_idx = HV_PGSZ_IDX_256MB;
+		break;
+	case HPAGE_SHIFT:
+		hv_pgsz_mask = HV_PGSZ_MASK_4MB;
+		hv_pgsz_idx = HV_PGSZ_IDX_4MB;
+		break;
+	case HPAGE_64K_SHIFT:
+		hv_pgsz_mask = HV_PGSZ_MASK_64K;
+		hv_pgsz_idx = HV_PGSZ_IDX_64K;
+		break;
+	default:
+		hv_pgsz_mask = 0;
+	}
+
+	if ((hv_pgsz_mask & cpu_pgsz_mask) == 0U)
+		return false;
+
+	return true;
+}
+#endif	/* CONFIG_HUGETLB_PAGE */
+
+void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma,
+		unsigned long address, pte_t *ptep, unsigned int nr)
 {
 	struct mm_struct *mm;
 	unsigned long flags;
+	bool is_huge_tsb;
 	pte_t pte = *ptep;
+	unsigned int i;
 
 	if (tlb_type != hypervisor) {
 		unsigned long pfn = pte_pfn(pte);
@@ -350,22 +412,57 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *
 
 	mm = vma->vm_mm;
 
+	/* Don't insert a non-valid PTE into the TSB, we'll deadlock.  */
+	if (!pte_accessible(mm, pte))
+		return;
+
 	spin_lock_irqsave(&mm->context.lock, flags);
 
+	is_huge_tsb = false;
 #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
-	if (mm->context.huge_pte_count && is_hugetlb_pte(pte))
-		__update_mmu_tsb_insert(mm, MM_TSB_HUGE, HPAGE_SHIFT,
-					address, pte_val(pte));
-	else
+	if (mm->context.hugetlb_pte_count || mm->context.thp_pte_count) {
+		unsigned long hugepage_size = PAGE_SIZE;
+
+		if (is_vm_hugetlb_page(vma))
+			hugepage_size = huge_page_size(hstate_vma(vma));
+
+		if (hugepage_size >= PUD_SIZE) {
+			unsigned long mask = 0x1ffc00000UL;
+
+			/* Transfer bits [32:22] from address to resolve
+			 * at 4M granularity.
+			 */
+			pte_val(pte) &= ~mask;
+			pte_val(pte) |= (address & mask);
+		} else if (hugepage_size >= PMD_SIZE) {
+			/* We are fabricating 8MB pages using 4MB
+			 * real hw pages.
+			 */
+			pte_val(pte) |= (address & (1UL << REAL_HPAGE_SHIFT));
+		}
+
+		if (hugepage_size >= PMD_SIZE) {
+			__update_mmu_tsb_insert(mm, MM_TSB_HUGE,
+				REAL_HPAGE_SHIFT, address, pte_val(pte));
+			is_huge_tsb = true;
+		}
+	}
 #endif
-		__update_mmu_tsb_insert(mm, MM_TSB_BASE, PAGE_SHIFT,
-					address, pte_val(pte));
+	if (!is_huge_tsb) {
+		for (i = 0; i < nr; i++) {
+			__update_mmu_tsb_insert(mm, MM_TSB_BASE, PAGE_SHIFT,
+						address, pte_val(pte));
+			address += PAGE_SIZE;
+			pte_val(pte) += PAGE_SIZE;
+		}
+	}
 
 	spin_unlock_irqrestore(&mm->context.lock, flags);
 }
 
-void flush_dcache_page(struct page *page)
+void flush_dcache_folio(struct folio *folio)
 {
+	unsigned long pfn = folio_pfn(folio);
 	struct address_space *mapping;
 	int this_cpu;
 
@@ -376,35 +473,35 @@ void flush_dcache_page(struct page *page)
 	 * is merely the zero page.  The 'bigcore' testcase in GDB
 	 * causes this case to run millions of times.
 	 */
-	if (page == ZERO_PAGE(0))
+	if (is_zero_pfn(pfn))
 		return;
 
 	this_cpu = get_cpu();
 
-	mapping = page_mapping(page);
+	mapping = folio_flush_mapping(folio);
 	if (mapping && !mapping_mapped(mapping)) {
-		int dirty = test_bit(PG_dcache_dirty, &page->flags);
+		bool dirty = test_bit(PG_dcache_dirty, &folio->flags.f);
 		if (dirty) {
-			int dirty_cpu = dcache_dirty_cpu(page);
+			int dirty_cpu = dcache_dirty_cpu(folio);
 
 			if (dirty_cpu == this_cpu)
 				goto out;
-			smp_flush_dcache_page_impl(page, dirty_cpu);
+			smp_flush_dcache_folio_impl(folio, dirty_cpu);
 		}
-		set_dcache_dirty(page, this_cpu);
+		set_dcache_dirty(folio, this_cpu);
 	} else {
-		/* We could delay the flush for the !page_mapping
+		/* We could delay the flush for the !folio_mapping
 		 * case too.  But that case is for exec env/arg
 		 * pages and those are %99 certainly going to get
 		 * faulted into the tlb (and thus flushed) anyways.
 		 */
-		flush_dcache_page_impl(page);
+		flush_dcache_folio_impl(folio);
 	}
 
 out:
 	put_cpu();
 }
-EXPORT_SYMBOL(flush_dcache_page);
+EXPORT_SYMBOL(flush_dcache_folio);
 
 void __kprobes flush_icache_range(unsigned long start, unsigned long end)
 {
@@ -421,10 +518,7 @@ void __kprobes flush_icache_range(unsigned long start, unsigned long end)
 			if (kaddr >= PAGE_OFFSET)
 				paddr = kaddr & mask;
 			else {
-				pgd_t *pgdp = pgd_offset_k(kaddr);
-				pud_t *pudp = pud_offset(pgdp, kaddr);
-				pmd_t *pmdp = pmd_offset(pudp, kaddr);
-				pte_t *ptep = pte_offset_kernel(pmdp, kaddr);
+				pte_t *ptep = virt_to_kpte(kaddr);
 
 				paddr = pte_val(*ptep) & mask;
 			}
@@ -588,7 +682,7 @@ static void __init remap_kernel(void)
 	int i, tlb_ent = sparc64_highest_locked_tlbent();
 
 	tte_vaddr = (unsigned long) KERNBASE;
-	phys_page = (prom_boot_mapping_phys_low >> 22UL) << 22UL;
+	phys_page = (prom_boot_mapping_phys_low >> ILOG2_4MB) << ILOG2_4MB;
 	tte_data = kern_large_tte(phys_page);
 
 	kern_locked_tte_data = tte_data;
@@ -629,9 +723,10 @@ static void __init inherit_prom_mappings(void)
 
 void prom_world(int enter)
 {
-	if (!enter)
-		set_fs(get_fs());
-
+	/*
+	 * No need to change the address space any more, just flush
+	 * the register windows
+	 */
 	__asm__ __volatile__("flushw");
 }
 
@@ -662,10 +757,58 @@ EXPORT_SYMBOL(__flush_dcache_range);
 
 /* get_new_mmu_context() uses "cache + 1".  */
 DEFINE_SPINLOCK(ctx_alloc_lock);
-unsigned long tlb_context_cache = CTX_FIRST_VERSION - 1;
+unsigned long tlb_context_cache = CTX_FIRST_VERSION;
 #define MAX_CTX_NR	(1UL << CTX_NR_BITS)
 #define CTX_BMAP_SLOTS	BITS_TO_LONGS(MAX_CTX_NR)
 DECLARE_BITMAP(mmu_context_bmap, MAX_CTX_NR);
+DEFINE_PER_CPU(struct mm_struct *, per_cpu_secondary_mm) = {0};
+
+static void mmu_context_wrap(void)
+{
+	unsigned long old_ver = tlb_context_cache & CTX_VERSION_MASK;
+	unsigned long new_ver, new_ctx, old_ctx;
+	struct mm_struct *mm;
+	int cpu;
+
+	bitmap_zero(mmu_context_bmap, 1 << CTX_NR_BITS);
+
+	/* Reserve kernel context */
+	set_bit(0, mmu_context_bmap);
+
+	new_ver = (tlb_context_cache & CTX_VERSION_MASK) + CTX_FIRST_VERSION;
+	if (unlikely(new_ver == 0))
+		new_ver = CTX_FIRST_VERSION;
+	tlb_context_cache = new_ver;
+
+	/*
+	 * Make sure that any new mm that are added into per_cpu_secondary_mm,
+	 * are going to go through get_new_mmu_context() path.
+	 */
+	mb();
+
+	/*
+	 * Updated versions to current on those CPUs that had valid secondary
+	 * contexts
+	 */
+	for_each_online_cpu(cpu) {
+		/*
+		 * If a new mm is stored after we took this mm from the array,
+		 * it will go into get_new_mmu_context() path, because we
+		 * already bumped the version in tlb_context_cache.
+		 */
+		mm = per_cpu(per_cpu_secondary_mm, cpu);
+
+		if (unlikely(!mm || mm == &init_mm))
+			continue;
+
+		old_ctx = mm->context.sparc64_ctx_val;
+		if (likely((old_ctx & CTX_VERSION_MASK) == old_ver)) {
+			new_ctx = (old_ctx & ~CTX_VERSION_MASK) | new_ver;
+			set_bit(new_ctx & CTX_NR_MASK, mmu_context_bmap);
+			mm->context.sparc64_ctx_val = new_ctx;
+		}
+	}
+}
 
 /* Caller does TLB context flushing on local CPU if necessary.
  * The caller also ensures that CTX_VALID(mm->context) is false.
@@ -681,48 +824,30 @@ void get_new_mmu_context(struct mm_struct *mm)
 {
 	unsigned long ctx, new_ctx;
 	unsigned long orig_pgsz_bits;
-	int new_version;
 
 	spin_lock(&ctx_alloc_lock);
+retry:
+	/* wrap might have happened, test again if our context became valid */
+	if (unlikely(CTX_VALID(mm->context)))
+		goto out;
 	orig_pgsz_bits = (mm->context.sparc64_ctx_val & CTX_PGSZ_MASK);
 	ctx = (tlb_context_cache + 1) & CTX_NR_MASK;
 	new_ctx = find_next_zero_bit(mmu_context_bmap, 1 << CTX_NR_BITS, ctx);
-	new_version = 0;
 	if (new_ctx >= (1 << CTX_NR_BITS)) {
 		new_ctx = find_next_zero_bit(mmu_context_bmap, ctx, 1);
 		if (new_ctx >= ctx) {
-			int i;
-			new_ctx = (tlb_context_cache & CTX_VERSION_MASK) +
-				CTX_FIRST_VERSION;
-			if (new_ctx == 1)
-				new_ctx = CTX_FIRST_VERSION;
-
-			/* Don't call memset, for 16 entries that's just
-			 * plain silly...
-			 */
-			mmu_context_bmap[0] = 3;
-			mmu_context_bmap[1] = 0;
-			mmu_context_bmap[2] = 0;
-			mmu_context_bmap[3] = 0;
-			for (i = 4; i < CTX_BMAP_SLOTS; i += 4) {
-				mmu_context_bmap[i + 0] = 0;
-				mmu_context_bmap[i + 1] = 0;
-				mmu_context_bmap[i + 2] = 0;
-				mmu_context_bmap[i + 3] = 0;
-			}
-			new_version = 1;
-			goto out;
+			mmu_context_wrap();
+			goto retry;
 		}
 	}
+	if (mm->context.sparc64_ctx_val)
+		cpumask_clear(mm_cpumask(mm));
 	mmu_context_bmap[new_ctx>>6] |= (1UL << (new_ctx & 63));
 	new_ctx |= (tlb_context_cache & CTX_VERSION_MASK);
-out:
 	tlb_context_cache = new_ctx;
 	mm->context.sparc64_ctx_val = new_ctx | orig_pgsz_bits;
+out:
 	spin_unlock(&ctx_alloc_lock);
-
-	if (unlikely(new_version))
-		smp_new_mmu_context_version();
 }
 
 static int numa_enabled = 1;
@@ -789,16 +914,26 @@ static void __init find_ramdisk(unsigned long phys_base)
 
 struct node_mem_mask {
 	unsigned long mask;
-	unsigned long val;
+	unsigned long match;
 };
 static struct node_mem_mask node_masks[MAX_NUMNODES];
 static int num_node_masks;
 
+#ifdef CONFIG_NUMA
+
+struct mdesc_mlgroup {
+	u64	node;
+	u64	latency;
+	u64	match;
+	u64	mask;
+};
+
+static struct mdesc_mlgroup *mlgroups;
+static int num_mlgroups;
+
 int numa_cpu_lookup_table[NR_CPUS];
 cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES];
 
-#ifdef CONFIG_NEED_MULTIPLE_NODES
-
 struct mdesc_mblock {
 	u64	base;
 	u64	size;
@@ -807,52 +942,128 @@ struct mdesc_mblock {
 static struct mdesc_mblock *mblocks;
 static int num_mblocks;
 
-static unsigned long ra_to_pa(unsigned long addr)
+static struct mdesc_mblock * __init addr_to_mblock(unsigned long addr)
 {
+	struct mdesc_mblock *m = NULL;
 	int i;
 
 	for (i = 0; i < num_mblocks; i++) {
-		struct mdesc_mblock *m = &mblocks[i];
+		m = &mblocks[i];
 
 		if (addr >= m->base &&
 		    addr < (m->base + m->size)) {
-			addr += m->offset;
 			break;
 		}
 	}
-	return addr;
+
+	return m;
 }
 
-static int find_node(unsigned long addr)
+static u64 __init memblock_nid_range_sun4u(u64 start, u64 end, int *nid)
 {
-	int i;
+	int prev_nid, new_nid;
 
-	addr = ra_to_pa(addr);
-	for (i = 0; i < num_node_masks; i++) {
-		struct node_mem_mask *p = &node_masks[i];
+	prev_nid = NUMA_NO_NODE;
+	for ( ; start < end; start += PAGE_SIZE) {
+		for (new_nid = 0; new_nid < num_node_masks; new_nid++) {
+			struct node_mem_mask *p = &node_masks[new_nid];
+
+			if ((start & p->mask) == p->match) {
+				if (prev_nid == NUMA_NO_NODE)
+					prev_nid = new_nid;
+				break;
+			}
+		}
+
+		if (new_nid == num_node_masks) {
+			prev_nid = 0;
+			WARN_ONCE(1, "addr[%Lx] doesn't match a NUMA node rule. Some memory will be owned by node 0.",
+				  start);
+			break;
+		}
 
-		if ((addr & p->mask) == p->val)
-			return i;
+		if (prev_nid != new_nid)
+			break;
 	}
-	return -1;
+	*nid = prev_nid;
+
+	return start > end ? end : start;
 }
 
-static u64 memblock_nid_range(u64 start, u64 end, int *nid)
+static u64 __init memblock_nid_range(u64 start, u64 end, int *nid)
 {
-	*nid = find_node(start);
-	start += PAGE_SIZE;
-	while (start < end) {
-		int n = find_node(start);
+	u64 ret_end, pa_start, m_mask, m_match, m_end;
+	struct mdesc_mblock *mblock;
+	int _nid, i;
+
+	if (tlb_type != hypervisor)
+		return memblock_nid_range_sun4u(start, end, nid);
+
+	mblock = addr_to_mblock(start);
+	if (!mblock) {
+		WARN_ONCE(1, "memblock_nid_range: Can't find mblock addr[%Lx]",
+			  start);
+
+		_nid = 0;
+		ret_end = end;
+		goto done;
+	}
 
-		if (n != *nid)
+	pa_start = start + mblock->offset;
+	m_match = 0;
+	m_mask = 0;
+
+	for (_nid = 0; _nid < num_node_masks; _nid++) {
+		struct node_mem_mask *const m = &node_masks[_nid];
+
+		if ((pa_start & m->mask) == m->match) {
+			m_match = m->match;
+			m_mask = m->mask;
 			break;
-		start += PAGE_SIZE;
+		}
 	}
 
-	if (start > end)
-		start = end;
+	if (num_node_masks == _nid) {
+		/* We could not find NUMA group, so default to 0, but lets
+		 * search for latency group, so we could calculate the correct
+		 * end address that we return
+		 */
+		_nid = 0;
+
+		for (i = 0; i < num_mlgroups; i++) {
+			struct mdesc_mlgroup *const m = &mlgroups[i];
 
-	return start;
+			if ((pa_start & m->mask) == m->match) {
+				m_match = m->match;
+				m_mask = m->mask;
+				break;
+			}
+		}
+
+		if (i == num_mlgroups) {
+			WARN_ONCE(1, "memblock_nid_range: Can't find latency group addr[%Lx]",
+				  start);
+
+			ret_end = end;
+			goto done;
+		}
+	}
+
+	/*
+	 * Each latency group has match and mask, and each memory block has an
+	 * offset.  An address belongs to a latency group if its address matches
+	 * the following formula: ((addr + offset) & mask) == match
+	 * It is, however, slow to check every single page if it matches a
+	 * particular latency group. As optimization we calculate end value by
+	 * using bit arithmetics.
+	 */
+	m_end = m_match + (1ul << __ffs(m_mask)) - mblock->offset;
+	m_end += pa_start & ~((1ul << fls64(m_mask)) - 1);
+	ret_end = m_end > end ? end : m_end;
+
+done:
+	*nid = _nid;
+	return ret_end;
 }
 #endif
 
@@ -864,16 +1075,9 @@ static void __init allocate_node_data(int nid)
 {
 	struct pglist_data *p;
 	unsigned long start_pfn, end_pfn;
-#ifdef CONFIG_NEED_MULTIPLE_NODES
-	unsigned long paddr;
 
-	paddr = memblock_alloc_try_nid(sizeof(struct pglist_data), SMP_CACHE_BYTES, nid);
-	if (!paddr) {
-		prom_printf("Cannot allocate pglist_data for nid[%d]\n", nid);
-		prom_halt();
-	}
-	NODE_DATA(nid) = __va(paddr);
-	memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
+#ifdef CONFIG_NUMA
+	alloc_node_data(nid);
 
 	NODE_DATA(nid)->node_id = nid;
 #endif
@@ -887,34 +1091,28 @@ static void __init allocate_node_data(int nid)
 
 static void init_node_masks_nonnuma(void)
 {
+#ifdef CONFIG_NUMA
 	int i;
+#endif
 
 	numadbg("Initializing tables for non-numa.\n");
 
-	node_masks[0].mask = node_masks[0].val = 0;
+	node_masks[0].mask = 0;
+	node_masks[0].match = 0;
 	num_node_masks = 1;
 
+#ifdef CONFIG_NUMA
 	for (i = 0; i < NR_CPUS; i++)
 		numa_cpu_lookup_table[i] = 0;
 
 	cpumask_setall(&numa_cpumask_lookup_table[0]);
+#endif
 }
 
-#ifdef CONFIG_NEED_MULTIPLE_NODES
-struct pglist_data *node_data[MAX_NUMNODES];
+#ifdef CONFIG_NUMA
 
 EXPORT_SYMBOL(numa_cpu_lookup_table);
 EXPORT_SYMBOL(numa_cpumask_lookup_table);
-EXPORT_SYMBOL(node_data);
-
-struct mdesc_mlgroup {
-	u64	node;
-	u64	latency;
-	u64	match;
-	u64	mask;
-};
-static struct mdesc_mlgroup *mlgroups;
-static int num_mlgroups;
 
 static int scan_pio_for_cfg_handle(struct mdesc_handle *md, u64 pio,
 				   u32 cfg_handle)
@@ -987,7 +1185,7 @@ int of_node_to_nid(struct device_node *dp)
 	md = mdesc_grab();
 
 	count = 0;
-	nid = -1;
+	nid = NUMA_NO_NODE;
 	mdesc_for_each_node_by_name(md, grp, "group") {
 		if (!scan_arcs_for_cfg_handle(md, grp, cfg_handle)) {
 			nid = count;
@@ -1003,14 +1201,14 @@ int of_node_to_nid(struct device_node *dp)
 
 static void __init add_node_ranges(void)
 {
-	struct memblock_region *reg;
+	phys_addr_t start, end;
+	unsigned long prev_max;
+	u64 i;
 
-	for_each_memblock(memory, reg) {
-		unsigned long size = reg->size;
-		unsigned long start, end;
+memblock_resized:
+	prev_max = memblock.memory.max;
 
-		start = reg->base;
-		end = start + size;
+	for_each_mem_range(i, &start, &end) {
 		while (start < end) {
 			unsigned long this_end;
 			int nid;
@@ -1018,10 +1216,13 @@ static void __init add_node_ranges(void)
 			this_end = memblock_nid_range(start, end, &nid);
 
 			numadbg("Setting memblock NUMA node nid[%d] "
-				"start[%lx] end[%lx]\n",
+				"start[%llx] end[%lx]\n",
 				nid, start, this_end);
 
-			memblock_set_node(start, this_end - start, nid);
+			memblock_set_node(start, this_end - start,
+					  &memblock.memory, nid);
+			if (memblock.memory.max != prev_max)
+				goto memblock_resized;
 			start = this_end;
 		}
 	}
@@ -1038,8 +1239,8 @@ static int __init grab_mlgroups(struct mdesc_handle *md)
 	if (!count)
 		return -ENOENT;
 
-	paddr = memblock_alloc(count * sizeof(struct mdesc_mlgroup),
-			  SMP_CACHE_BYTES);
+	paddr = memblock_phys_alloc(count * sizeof(struct mdesc_mlgroup),
+				    SMP_CACHE_BYTES);
 	if (!paddr)
 		return -ENOMEM;
 
@@ -1079,8 +1280,8 @@ static int __init grab_mblocks(struct mdesc_handle *md)
 	if (!count)
 		return -ENOENT;
 
-	paddr = memblock_alloc(count * sizeof(struct mdesc_mblock),
-			  SMP_CACHE_BYTES);
+	paddr = memblock_phys_alloc(count * sizeof(struct mdesc_mblock),
+				    SMP_CACHE_BYTES);
 	if (!paddr)
 		return -ENOMEM;
 
@@ -1146,6 +1347,49 @@ static struct mdesc_mlgroup * __init find_mlgroup(u64 node)
 	return NULL;
 }
 
+int __node_distance(int from, int to)
+{
+	if ((from >= MAX_NUMNODES) || (to >= MAX_NUMNODES)) {
+		pr_warn("Returning default NUMA distance value for %d->%d\n",
+			from, to);
+		return (from == to) ? LOCAL_DISTANCE : REMOTE_DISTANCE;
+	}
+	return numa_latency[from][to];
+}
+EXPORT_SYMBOL(__node_distance);
+
+static int __init find_best_numa_node_for_mlgroup(struct mdesc_mlgroup *grp)
+{
+	int i;
+
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		struct node_mem_mask *n = &node_masks[i];
+
+		if ((grp->mask == n->mask) && (grp->match == n->match))
+			break;
+	}
+	return i;
+}
+
+static void __init find_numa_latencies_for_group(struct mdesc_handle *md,
+						 u64 grp, int index)
+{
+	u64 arc;
+
+	mdesc_for_each_arc(arc, md, grp, MDESC_ARC_TYPE_FWD) {
+		int tnode;
+		u64 target = mdesc_arc_target(md, arc);
+		struct mdesc_mlgroup *m = find_mlgroup(target);
+
+		if (!m)
+			continue;
+		tnode = find_best_numa_node_for_mlgroup(m);
+		if (tnode == MAX_NUMNODES)
+			continue;
+		numa_latency[index][tnode] = m->latency;
+	}
+}
+
 static int __init numa_attach_mlgroup(struct mdesc_handle *md, u64 grp,
 				      int index)
 {
@@ -1176,10 +1420,10 @@ static int __init numa_attach_mlgroup(struct mdesc_handle *md, u64 grp,
 	n = &node_masks[num_node_masks++];
 
 	n->mask = candidate->mask;
-	n->val = candidate->match;
+	n->match = candidate->match;
 
-	numadbg("NUMA NODE[%d]: mask[%lx] val[%lx] (latency[%llx])\n",
-		index, n->mask, n->val, candidate->latency);
+	numadbg("NUMA NODE[%d]: mask[%lx] match[%lx] (latency[%llx])\n",
+		index, n->mask, n->match, candidate->latency);
 
 	return 0;
 }
@@ -1209,7 +1453,7 @@ static int __init numa_parse_mdesc_group(struct mdesc_handle *md, u64 grp,
 static int __init numa_parse_mdesc(void)
 {
 	struct mdesc_handle *md = mdesc_grab();
-	int i, err, count;
+	int i, j, err, count;
 	u64 node;
 
 	node = mdesc_node_by_name(md, MDESC_NODE_NULL, "latency-groups");
@@ -1234,6 +1478,23 @@ static int __init numa_parse_mdesc(void)
 		count++;
 	}
 
+	count = 0;
+	mdesc_for_each_node_by_name(md, node, "group") {
+		find_numa_latencies_for_group(md, node, count);
+		count++;
+	}
+
+	/* Normalize numa latency matrix according to ACPI SLIT spec. */
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		u64 self_latency = numa_latency[i][i];
+
+		for (j = 0; j < MAX_NUMNODES; j++) {
+			numa_latency[i][j] =
+				(numa_latency[i][j] * LOCAL_DISTANCE) /
+				self_latency;
+		}
+	}
+
 	add_node_ranges();
 
 	for (i = 0; i < num_node_masks; i++) {
@@ -1259,7 +1520,7 @@ static int __init numa_parse_jbus(void)
 		numa_cpu_lookup_table[cpu] = index;
 		cpumask_copy(&numa_cpumask_lookup_table[index], cpumask_of(cpu));
 		node_masks[index].mask = ~((1UL << 36UL) - 1UL);
-		node_masks[index].val = cpu << 36UL;
+		node_masks[index].match = cpu << 36UL;
 
 		index++;
 	}
@@ -1290,10 +1551,18 @@ static int __init numa_parse_sun4u(void)
 
 static int __init bootmem_init_numa(void)
 {
+	int i, j;
 	int err = -1;
 
 	numadbg("bootmem_init_numa()\n");
 
+	/* Some sane defaults for numa latency values */
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		for (j = 0; j < MAX_NUMNODES; j++)
+			numa_latency[i][j] = (i == j) ?
+				LOCAL_DISTANCE : REMOTE_DISTANCE;
+	}
+
 	if (numa_enabled) {
 		if (tlb_type == hypervisor)
 			err = numa_parse_mdesc();
@@ -1325,7 +1594,7 @@ static void __init bootmem_init_nonnuma(void)
 	       (top_of_ram - total_ram) >> 20);
 
 	init_node_masks_nonnuma();
-	memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0);
+	memblock_set_node(0, PHYS_ADDR_MAX, &memblock.memory, 0);
 	allocate_node_data(0);
 	node_set_online(0);
 }
@@ -1346,7 +1615,6 @@ static unsigned long __init bootmem_init(unsigned long phys_base)
 
 	/* XXX cpu notifier XXX */
 
-	sparse_memory_present_with_active_regions(MAX_NUMNODES);
 	sparse_init();
 
 	return end_pfn;
@@ -1355,9 +1623,148 @@ static unsigned long __init bootmem_init(unsigned long phys_base)
 static struct linux_prom64_registers pall[MAX_BANKS] __initdata;
 static int pall_ents __initdata;
 
-#ifdef CONFIG_DEBUG_PAGEALLOC
+static unsigned long max_phys_bits = 40;
+
+bool kern_addr_valid(unsigned long addr)
+{
+	pgd_t *pgd;
+	p4d_t *p4d;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+
+	if ((long)addr < 0L) {
+		unsigned long pa = __pa(addr);
+
+		if ((pa >> max_phys_bits) != 0UL)
+			return false;
+
+		return pfn_valid(pa >> PAGE_SHIFT);
+	}
+
+	if (addr >= (unsigned long) KERNBASE &&
+	    addr < (unsigned long)&_end)
+		return true;
+
+	pgd = pgd_offset_k(addr);
+	if (pgd_none(*pgd))
+		return false;
+
+	p4d = p4d_offset(pgd, addr);
+	if (p4d_none(*p4d))
+		return false;
+
+	pud = pud_offset(p4d, addr);
+	if (pud_none(*pud))
+		return false;
+
+	if (pud_leaf(*pud))
+		return pfn_valid(pud_pfn(*pud));
+
+	pmd = pmd_offset(pud, addr);
+	if (pmd_none(*pmd))
+		return false;
+
+	if (pmd_leaf(*pmd))
+		return pfn_valid(pmd_pfn(*pmd));
+
+	pte = pte_offset_kernel(pmd, addr);
+	if (pte_none(*pte))
+		return false;
+
+	return pfn_valid(pte_pfn(*pte));
+}
+
+static unsigned long __ref kernel_map_hugepud(unsigned long vstart,
+					      unsigned long vend,
+					      pud_t *pud)
+{
+	const unsigned long mask16gb = (1UL << 34) - 1UL;
+	u64 pte_val = vstart;
+
+	/* Each PUD is 8GB */
+	if ((vstart & mask16gb) ||
+	    (vend - vstart <= mask16gb)) {
+		pte_val ^= kern_linear_pte_xor[2];
+		pud_val(*pud) = pte_val | _PAGE_PUD_HUGE;
+
+		return vstart + PUD_SIZE;
+	}
+
+	pte_val ^= kern_linear_pte_xor[3];
+	pte_val |= _PAGE_PUD_HUGE;
+
+	vend = vstart + mask16gb + 1UL;
+	while (vstart < vend) {
+		pud_val(*pud) = pte_val;
+
+		pte_val += PUD_SIZE;
+		vstart += PUD_SIZE;
+		pud++;
+	}
+	return vstart;
+}
+
+static bool kernel_can_map_hugepud(unsigned long vstart, unsigned long vend,
+				   bool guard)
+{
+	if (guard && !(vstart & ~PUD_MASK) && (vend - vstart) >= PUD_SIZE)
+		return true;
+
+	return false;
+}
+
+static unsigned long __ref kernel_map_hugepmd(unsigned long vstart,
+					      unsigned long vend,
+					      pmd_t *pmd)
+{
+	const unsigned long mask256mb = (1UL << 28) - 1UL;
+	const unsigned long mask2gb = (1UL << 31) - 1UL;
+	u64 pte_val = vstart;
+
+	/* Each PMD is 8MB */
+	if ((vstart & mask256mb) ||
+	    (vend - vstart <= mask256mb)) {
+		pte_val ^= kern_linear_pte_xor[0];
+		pmd_val(*pmd) = pte_val | _PAGE_PMD_HUGE;
+
+		return vstart + PMD_SIZE;
+	}
+
+	if ((vstart & mask2gb) ||
+	    (vend - vstart <= mask2gb)) {
+		pte_val ^= kern_linear_pte_xor[1];
+		pte_val |= _PAGE_PMD_HUGE;
+		vend = vstart + mask256mb + 1UL;
+	} else {
+		pte_val ^= kern_linear_pte_xor[2];
+		pte_val |= _PAGE_PMD_HUGE;
+		vend = vstart + mask2gb + 1UL;
+	}
+
+	while (vstart < vend) {
+		pmd_val(*pmd) = pte_val;
+
+		pte_val += PMD_SIZE;
+		vstart += PMD_SIZE;
+		pmd++;
+	}
+
+	return vstart;
+}
+
+static bool kernel_can_map_hugepmd(unsigned long vstart, unsigned long vend,
+				   bool guard)
+{
+	if (guard && !(vstart & ~PMD_MASK) && (vend - vstart) >= PMD_SIZE)
+		return true;
+
+	return false;
+}
+
 static unsigned long __ref kernel_map_range(unsigned long pstart,
-					    unsigned long pend, pgprot_t prot)
+					    unsigned long pend, pgprot_t prot,
+					    bool use_huge)
 {
 	unsigned long vstart = PAGE_OFFSET + pstart;
 	unsigned long vend = PAGE_OFFSET + pend;
@@ -1372,24 +1779,62 @@ static unsigned long __ref kernel_map_range(unsigned long pstart,
 	while (vstart < vend) {
 		unsigned long this_end, paddr = __pa(vstart);
 		pgd_t *pgd = pgd_offset_k(vstart);
+		p4d_t *p4d;
 		pud_t *pud;
 		pmd_t *pmd;
 		pte_t *pte;
 
-		pud = pud_offset(pgd, vstart);
+		if (pgd_none(*pgd)) {
+			pud_t *new;
+
+			new = memblock_alloc_from(PAGE_SIZE, PAGE_SIZE,
+						  PAGE_SIZE);
+			if (!new)
+				goto err_alloc;
+			alloc_bytes += PAGE_SIZE;
+			pgd_populate(&init_mm, pgd, new);
+		}
+
+		p4d = p4d_offset(pgd, vstart);
+		if (p4d_none(*p4d)) {
+			pud_t *new;
+
+			new = memblock_alloc_from(PAGE_SIZE, PAGE_SIZE,
+						  PAGE_SIZE);
+			if (!new)
+				goto err_alloc;
+			alloc_bytes += PAGE_SIZE;
+			p4d_populate(&init_mm, p4d, new);
+		}
+
+		pud = pud_offset(p4d, vstart);
 		if (pud_none(*pud)) {
 			pmd_t *new;
 
-			new = __alloc_bootmem(PAGE_SIZE, PAGE_SIZE, PAGE_SIZE);
+			if (kernel_can_map_hugepud(vstart, vend, use_huge)) {
+				vstart = kernel_map_hugepud(vstart, vend, pud);
+				continue;
+			}
+			new = memblock_alloc_from(PAGE_SIZE, PAGE_SIZE,
+						  PAGE_SIZE);
+			if (!new)
+				goto err_alloc;
 			alloc_bytes += PAGE_SIZE;
 			pud_populate(&init_mm, pud, new);
 		}
 
 		pmd = pmd_offset(pud, vstart);
-		if (!pmd_present(*pmd)) {
+		if (pmd_none(*pmd)) {
 			pte_t *new;
 
-			new = __alloc_bootmem(PAGE_SIZE, PAGE_SIZE, PAGE_SIZE);
+			if (kernel_can_map_hugepmd(vstart, vend, use_huge)) {
+				vstart = kernel_map_hugepmd(vstart, vend, pmd);
+				continue;
+			}
+			new = memblock_alloc_from(PAGE_SIZE, PAGE_SIZE,
+						  PAGE_SIZE);
+			if (!new)
+				goto err_alloc;
 			alloc_bytes += PAGE_SIZE;
 			pmd_populate_kernel(&init_mm, pmd, new);
 		}
@@ -1409,102 +1854,41 @@ static unsigned long __ref kernel_map_range(unsigned long pstart,
 	}
 
 	return alloc_bytes;
-}
-
-extern unsigned int kvmap_linear_patch[1];
-#endif /* CONFIG_DEBUG_PAGEALLOC */
-
-static void __init kpte_set_val(unsigned long index, unsigned long val)
-{
-	unsigned long *ptr = kpte_linear_bitmap;
 
-	val <<= ((index % (BITS_PER_LONG / 2)) * 2);
-	ptr += (index / (BITS_PER_LONG / 2));
-
-	*ptr |= val;
+err_alloc:
+	panic("%s: Failed to allocate %lu bytes align=%lx from=%lx\n",
+	      __func__, PAGE_SIZE, PAGE_SIZE, PAGE_SIZE);
+	return -ENOMEM;
 }
 
-static const unsigned long kpte_shift_min = 28; /* 256MB */
-static const unsigned long kpte_shift_max = 34; /* 16GB */
-static const unsigned long kpte_shift_incr = 3;
-
-static unsigned long kpte_mark_using_shift(unsigned long start, unsigned long end,
-					   unsigned long shift)
+static void __init flush_all_kernel_tsbs(void)
 {
-	unsigned long size = (1UL << shift);
-	unsigned long mask = (size - 1UL);
-	unsigned long remains = end - start;
-	unsigned long val;
-
-	if (remains < size || (start & mask))
-		return start;
-
-	/* VAL maps:
-	 *
-	 *	shift 28 --> kern_linear_pte_xor index 1
-	 *	shift 31 --> kern_linear_pte_xor index 2
-	 *	shift 34 --> kern_linear_pte_xor index 3
-	 */
-	val = ((shift - kpte_shift_min) / kpte_shift_incr) + 1;
-
-	remains &= ~mask;
-	if (shift != kpte_shift_max)
-		remains = size;
-
-	while (remains) {
-		unsigned long index = start >> kpte_shift_min;
+	int i;
 
-		kpte_set_val(index, val);
+	for (i = 0; i < KERNEL_TSB_NENTRIES; i++) {
+		struct tsb *ent = &swapper_tsb[i];
 
-		start += 1UL << kpte_shift_min;
-		remains -= 1UL << kpte_shift_min;
+		ent->tag = (1UL << TSB_TAG_INVALID_BIT);
 	}
+#ifndef CONFIG_DEBUG_PAGEALLOC
+	for (i = 0; i < KERNEL_TSB4M_NENTRIES; i++) {
+		struct tsb *ent = &swapper_4m_tsb[i];
 
-	return start;
-}
-
-static void __init mark_kpte_bitmap(unsigned long start, unsigned long end)
-{
-	unsigned long smallest_size, smallest_mask;
-	unsigned long s;
-
-	smallest_size = (1UL << kpte_shift_min);
-	smallest_mask = (smallest_size - 1UL);
-
-	while (start < end) {
-		unsigned long orig_start = start;
-
-		for (s = kpte_shift_max; s >= kpte_shift_min; s -= kpte_shift_incr) {
-			start = kpte_mark_using_shift(start, end, s);
-
-			if (start != orig_start)
-				break;
-		}
-
-		if (start == orig_start)
-			start = (start + smallest_size) & ~smallest_mask;
+		ent->tag = (1UL << TSB_TAG_INVALID_BIT);
 	}
+#endif
 }
 
-static void __init init_kpte_bitmap(void)
-{
-	unsigned long i;
-
-	for (i = 0; i < pall_ents; i++) {
-		unsigned long phys_start, phys_end;
-
-		phys_start = pall[i].phys_addr;
-		phys_end = phys_start + pall[i].reg_size;
-
-		mark_kpte_bitmap(phys_start, phys_end);
-	}
-}
+extern unsigned int kvmap_linear_patch[1];
 
 static void __init kernel_physical_mapping_init(void)
 {
-#ifdef CONFIG_DEBUG_PAGEALLOC
 	unsigned long i, mem_alloced = 0UL;
+	bool use_huge = true;
 
+#ifdef CONFIG_DEBUG_PAGEALLOC
+	use_huge = false;
+#endif
 	for (i = 0; i < pall_ents; i++) {
 		unsigned long phys_start, phys_end;
 
@@ -1512,7 +1896,7 @@ static void __init kernel_physical_mapping_init(void)
 		phys_end = phys_start + pall[i].reg_size;
 
 		mem_alloced += kernel_map_range(phys_start, phys_end,
-						PAGE_KERNEL);
+						PAGE_KERNEL, use_huge);
 	}
 
 	printk("Allocated %ld bytes for kernel page tables.\n",
@@ -1521,18 +1905,19 @@ static void __init kernel_physical_mapping_init(void)
 	kvmap_linear_patch[0] = 0x01000000; /* nop */
 	flushi(&kvmap_linear_patch[0]);
 
+	flush_all_kernel_tsbs();
+
 	__flush_tlb_all();
-#endif
 }
 
 #ifdef CONFIG_DEBUG_PAGEALLOC
-void kernel_map_pages(struct page *page, int numpages, int enable)
+void __kernel_map_pages(struct page *page, int numpages, int enable)
 {
 	unsigned long phys_start = page_to_pfn(page) << PAGE_SHIFT;
 	unsigned long phys_end = phys_start + (numpages * PAGE_SIZE);
 
 	kernel_map_range(phys_start, phys_end,
-			 (enable ? PAGE_KERNEL : __pgprot(0)));
+			 (enable ? PAGE_KERNEL : __pgprot(0)), false);
 
 	flush_tsb_kernel_range(PAGE_OFFSET + phys_start,
 			       PAGE_OFFSET + phys_end);
@@ -1557,6 +1942,91 @@ unsigned long __init find_ecache_flush_span(unsigned long size)
 	return ~0UL;
 }
 
+unsigned long PAGE_OFFSET;
+EXPORT_SYMBOL(PAGE_OFFSET);
+
+unsigned long VMALLOC_END   = 0x0000010000000000UL;
+EXPORT_SYMBOL(VMALLOC_END);
+
+unsigned long sparc64_va_hole_top =    0xfffff80000000000UL;
+unsigned long sparc64_va_hole_bottom = 0x0000080000000000UL;
+
+static void __init setup_page_offset(void)
+{
+	if (tlb_type == cheetah || tlb_type == cheetah_plus) {
+		/* Cheetah/Panther support a full 64-bit virtual
+		 * address, so we can use all that our page tables
+		 * support.
+		 */
+		sparc64_va_hole_top =    0xfff0000000000000UL;
+		sparc64_va_hole_bottom = 0x0010000000000000UL;
+
+		max_phys_bits = 42;
+	} else if (tlb_type == hypervisor) {
+		switch (sun4v_chip_type) {
+		case SUN4V_CHIP_NIAGARA1:
+		case SUN4V_CHIP_NIAGARA2:
+			/* T1 and T2 support 48-bit virtual addresses.  */
+			sparc64_va_hole_top =    0xffff800000000000UL;
+			sparc64_va_hole_bottom = 0x0000800000000000UL;
+
+			max_phys_bits = 39;
+			break;
+		case SUN4V_CHIP_NIAGARA3:
+			/* T3 supports 48-bit virtual addresses.  */
+			sparc64_va_hole_top =    0xffff800000000000UL;
+			sparc64_va_hole_bottom = 0x0000800000000000UL;
+
+			max_phys_bits = 43;
+			break;
+		case SUN4V_CHIP_NIAGARA4:
+		case SUN4V_CHIP_NIAGARA5:
+		case SUN4V_CHIP_SPARC64X:
+		case SUN4V_CHIP_SPARC_M6:
+			/* T4 and later support 52-bit virtual addresses.  */
+			sparc64_va_hole_top =    0xfff8000000000000UL;
+			sparc64_va_hole_bottom = 0x0008000000000000UL;
+			max_phys_bits = 47;
+			break;
+		case SUN4V_CHIP_SPARC_M7:
+		case SUN4V_CHIP_SPARC_SN:
+			/* M7 and later support 52-bit virtual addresses.  */
+			sparc64_va_hole_top =    0xfff8000000000000UL;
+			sparc64_va_hole_bottom = 0x0008000000000000UL;
+			max_phys_bits = 49;
+			break;
+		case SUN4V_CHIP_SPARC_M8:
+		default:
+			/* M8 and later support 54-bit virtual addresses.
+			 * However, restricting M8 and above VA bits to 53
+			 * as 4-level page table cannot support more than
+			 * 53 VA bits.
+			 */
+			sparc64_va_hole_top =    0xfff0000000000000UL;
+			sparc64_va_hole_bottom = 0x0010000000000000UL;
+			max_phys_bits = 51;
+			break;
+		}
+	}
+
+	if (max_phys_bits > MAX_PHYS_ADDRESS_BITS) {
+		prom_printf("MAX_PHYS_ADDRESS_BITS is too small, need %lu\n",
+			    max_phys_bits);
+		prom_halt();
+	}
+
+	PAGE_OFFSET = sparc64_va_hole_top;
+	VMALLOC_END = ((sparc64_va_hole_bottom >> 1) +
+		       (sparc64_va_hole_bottom >> 2));
+
+	pr_info("MM: PAGE_OFFSET is 0x%016lx (max_phys_bits == %lu)\n",
+		PAGE_OFFSET, max_phys_bits);
+	pr_info("MM: VMALLOC [0x%016lx --> 0x%016lx]\n",
+		VMALLOC_START, VMALLOC_END);
+	pr_info("MM: VMEMMAP [0x%016lx --> 0x%016lx]\n",
+		VMEMMAP_BASE, VMEMMAP_BASE << 1);
+}
+
 static void __init tsb_phys_patch(void)
 {
 	struct tsb_ldquad_phys_patch_entry *pquad;
@@ -1599,21 +2069,42 @@ static void __init tsb_phys_patch(void)
 #define NUM_KTSB_DESCR	1
 #endif
 static struct hv_tsb_descr ktsb_descr[NUM_KTSB_DESCR];
-extern struct tsb swapper_tsb[KERNEL_TSB_NENTRIES];
+
+/* The swapper TSBs are loaded with a base sequence of:
+ *
+ *	sethi	%uhi(SYMBOL), REG1
+ *	sethi	%hi(SYMBOL), REG2
+ *	or	REG1, %ulo(SYMBOL), REG1
+ *	or	REG2, %lo(SYMBOL), REG2
+ *	sllx	REG1, 32, REG1
+ *	or	REG1, REG2, REG1
+ *
+ * When we use physical addressing for the TSB accesses, we patch the
+ * first four instructions in the above sequence.
+ */
 
 static void patch_one_ktsb_phys(unsigned int *start, unsigned int *end, unsigned long pa)
 {
-	pa >>= KTSB_PHYS_SHIFT;
+	unsigned long high_bits, low_bits;
+
+	high_bits = (pa >> 32) & 0xffffffff;
+	low_bits = (pa >> 0) & 0xffffffff;
 
 	while (start < end) {
 		unsigned int *ia = (unsigned int *)(unsigned long)*start;
 
-		ia[0] = (ia[0] & ~0x3fffff) | (pa >> 10);
+		ia[0] = (ia[0] & ~0x3fffff) | (high_bits >> 10);
 		__asm__ __volatile__("flush	%0" : : "r" (ia));
 
-		ia[1] = (ia[1] & ~0x3ff) | (pa & 0x3ff);
+		ia[1] = (ia[1] & ~0x3fffff) | (low_bits >> 10);
 		__asm__ __volatile__("flush	%0" : : "r" (ia + 1));
 
+		ia[2] = (ia[2] & ~0x1fff) | (high_bits & 0x3ff);
+		__asm__ __volatile__("flush	%0" : : "r" (ia + 2));
+
+		ia[3] = (ia[3] & ~0x1fff) | (low_bits & 0x3ff);
+		__asm__ __volatile__("flush	%0" : : "r" (ia + 3));
+
 		start++;
 	}
 }
@@ -1694,7 +2185,7 @@ static void __init sun4v_ktsb_init(void)
 #endif
 }
 
-void __cpuinit sun4v_ktsb_register(void)
+void sun4v_ktsb_register(void)
 {
 	unsigned long pa, ret;
 
@@ -1719,11 +2210,26 @@ static void __init sun4u_linear_pte_xor_finalize(void)
 
 static void __init sun4v_linear_pte_xor_finalize(void)
 {
+	unsigned long pagecv_flag;
+
+	/* Bit 9 of TTE is no longer CV bit on M7 processor and it instead
+	 * enables MCD error. Do not set bit 9 on M7 processor.
+	 */
+	switch (sun4v_chip_type) {
+	case SUN4V_CHIP_SPARC_M7:
+	case SUN4V_CHIP_SPARC_M8:
+	case SUN4V_CHIP_SPARC_SN:
+		pagecv_flag = 0x00;
+		break;
+	default:
+		pagecv_flag = _PAGE_CV_4V;
+		break;
+	}
 #ifndef CONFIG_DEBUG_PAGEALLOC
 	if (cpu_pgsz_mask & HV_PGSZ_MASK_256MB) {
 		kern_linear_pte_xor[1] = (_PAGE_VALID | _PAGE_SZ256MB_4V) ^
-			0xfffff80000000000UL;
-		kern_linear_pte_xor[1] |= (_PAGE_CP_4V | _PAGE_CV_4V |
+			PAGE_OFFSET;
+		kern_linear_pte_xor[1] |= (_PAGE_CP_4V | pagecv_flag |
 					   _PAGE_P_4V | _PAGE_W_4V);
 	} else {
 		kern_linear_pte_xor[1] = kern_linear_pte_xor[0];
@@ -1731,8 +2237,8 @@ static void __init sun4v_linear_pte_xor_finalize(void)
 
 	if (cpu_pgsz_mask & HV_PGSZ_MASK_2GB) {
 		kern_linear_pte_xor[2] = (_PAGE_VALID | _PAGE_SZ2GB_4V) ^
-			0xfffff80000000000UL;
-		kern_linear_pte_xor[2] |= (_PAGE_CP_4V | _PAGE_CV_4V |
+			PAGE_OFFSET;
+		kern_linear_pte_xor[2] |= (_PAGE_CP_4V | pagecv_flag |
 					   _PAGE_P_4V | _PAGE_W_4V);
 	} else {
 		kern_linear_pte_xor[2] = kern_linear_pte_xor[1];
@@ -1740,8 +2246,8 @@ static void __init sun4v_linear_pte_xor_finalize(void)
 
 	if (cpu_pgsz_mask & HV_PGSZ_MASK_16GB) {
 		kern_linear_pte_xor[3] = (_PAGE_VALID | _PAGE_SZ16GB_4V) ^
-			0xfffff80000000000UL;
-		kern_linear_pte_xor[3] |= (_PAGE_CP_4V | _PAGE_CV_4V |
+			PAGE_OFFSET;
+		kern_linear_pte_xor[3] |= (_PAGE_CP_4V | pagecv_flag |
 					   _PAGE_P_4V | _PAGE_W_4V);
 	} else {
 		kern_linear_pte_xor[3] = kern_linear_pte_xor[2];
@@ -1752,22 +2258,39 @@ static void __init sun4v_linear_pte_xor_finalize(void)
 /* paging_init() sets up the page tables */
 
 static unsigned long last_valid_pfn;
-pgd_t swapper_pg_dir[2048];
 
 static void sun4u_pgprot_init(void);
 static void sun4v_pgprot_init(void);
 
+#define _PAGE_CACHE_4U	(_PAGE_CP_4U | _PAGE_CV_4U)
+#define _PAGE_CACHE_4V	(_PAGE_CP_4V | _PAGE_CV_4V)
+#define __DIRTY_BITS_4U	 (_PAGE_MODIFIED_4U | _PAGE_WRITE_4U | _PAGE_W_4U)
+#define __DIRTY_BITS_4V	 (_PAGE_MODIFIED_4V | _PAGE_WRITE_4V | _PAGE_W_4V)
+#define __ACCESS_BITS_4U (_PAGE_ACCESSED_4U | _PAGE_READ_4U | _PAGE_R)
+#define __ACCESS_BITS_4V (_PAGE_ACCESSED_4V | _PAGE_READ_4V | _PAGE_R)
+
+/* We need to exclude reserved regions. This exclusion will include
+ * vmlinux and initrd. To be more precise the initrd size could be used to
+ * compute a new lower limit because it is freed later during initialization.
+ */
+static void __init reduce_memory(phys_addr_t limit_ram)
+{
+	limit_ram += memblock_reserved_size();
+	memblock_enforce_memory_limit(limit_ram);
+}
+
 void __init paging_init(void)
 {
 	unsigned long end_pfn, shift, phys_base;
 	unsigned long real_end, i;
-	int node;
+
+	setup_page_offset();
 
 	/* These build time checkes make sure that the dcache_dirty_cpu()
-	 * page->flags usage will work.
+	 * folio->flags usage will work.
 	 *
 	 * When a page gets marked as dcache-dirty, we store the
-	 * cpu number starting at bit 32 in the page->flags.  Also,
+	 * cpu number starting at bit 32 in the folio->flags.  Also,
 	 * functions like clear_dcache_dirty_cpu use the cpu mask
 	 * in 13-bit signed-immediate instruction fields.
 	 */
@@ -1788,7 +2311,7 @@ void __init paging_init(void)
 
 	BUILD_BUG_ON(NR_CPUS > 4096);
 
-	kern_base = (prom_boot_mapping_phys_low >> 22UL) << 22UL;
+	kern_base = (prom_boot_mapping_phys_low >> ILOG2_4MB) << ILOG2_4MB;
 	kern_size = (unsigned long)&_end - (unsigned long)KERNBASE;
 
 	/* Invalidate both kernel TSBs.  */
@@ -1797,6 +2320,27 @@ void __init paging_init(void)
 	memset(swapper_4m_tsb, 0x40, sizeof(swapper_4m_tsb));
 #endif
 
+	/* TTE.cv bit on sparc v9 occupies the same position as TTE.mcde
+	 * bit on M7 processor. This is a conflicting usage of the same
+	 * bit. Enabling TTE.cv on M7 would turn on Memory Corruption
+	 * Detection error on all pages and this will lead to problems
+	 * later. Kernel does not run with MCD enabled and hence rest
+	 * of the required steps to fully configure memory corruption
+	 * detection are not taken. We need to ensure TTE.mcde is not
+	 * set on M7 processor. Compute the value of cacheability
+	 * flag for use later taking this into consideration.
+	 */
+	switch (sun4v_chip_type) {
+	case SUN4V_CHIP_SPARC_M7:
+	case SUN4V_CHIP_SPARC_M8:
+	case SUN4V_CHIP_SPARC_SN:
+		page_cache4v_flag = _PAGE_CP_4V;
+		break;
+	default:
+		page_cache4v_flag = _PAGE_CACHE_4V;
+		break;
+	}
+
 	if (tlb_type == hypervisor)
 		sun4v_pgprot_init();
 	else
@@ -1834,7 +2378,8 @@ void __init paging_init(void)
 
 	find_ramdisk(phys_base);
 
-	memblock_enforce_memory_limit(cmdline_memory_size);
+	if (cmdline_memory_size)
+		reduce_memory(cmdline_memory_size);
 
 	memblock_allow_resize();
 	memblock_dump_all();
@@ -1844,7 +2389,7 @@ void __init paging_init(void)
 	shift = kern_base + PAGE_OFFSET - ((unsigned long)KERNBASE);
 
 	real_end = (unsigned long)_end;
-	num_kernel_image_mappings = DIV_ROUND_UP(real_end - KERNBASE, 1 << 22);
+	num_kernel_image_mappings = DIV_ROUND_UP(real_end - KERNBASE, 1 << ILOG2_4MB);
 	printk("Kernel: Using %d locked TLB entries for main kernel image.\n",
 	       num_kernel_image_mappings);
 
@@ -1853,16 +2398,10 @@ void __init paging_init(void)
 	 */
 	init_mm.pgd += ((shift) / (sizeof(pgd_t)));
 	
-	memset(swapper_low_pmd_dir, 0, sizeof(swapper_low_pmd_dir));
+	memset(swapper_pg_dir, 0, sizeof(swapper_pg_dir));
 
-	/* Now can init the kernel/bad page tables. */
-	pud_set(pud_offset(&swapper_pg_dir[0], 0),
-		swapper_low_pmd_dir + (shift / sizeof(pgd_t)));
-	
 	inherit_prom_mappings();
 	
-	init_kpte_bitmap();
-
 	/* Ok, we can use our TLB miss and window trap handlers safely.  */
 	setup_tba();
 
@@ -1913,21 +2452,6 @@ void __init paging_init(void)
 	/* Setup bootmem... */
 	last_valid_pfn = end_pfn = bootmem_init(phys_base);
 
-	/* Once the OF device tree and MDESC have been setup, we know
-	 * the list of possible cpus.  Therefore we can allocate the
-	 * IRQ stacks.
-	 */
-	for_each_possible_cpu(i) {
-		node = cpu_to_node(i);
-
-		softirq_stack[i] = __alloc_bootmem_node(NODE_DATA(node),
-							THREAD_SIZE,
-							THREAD_SIZE, 0);
-		hardirq_stack[i] = __alloc_bootmem_node(NODE_DATA(node),
-							THREAD_SIZE,
-							THREAD_SIZE, 0);
-	}
-
 	kernel_physical_mapping_init();
 
 	{
@@ -1937,7 +2461,7 @@ void __init paging_init(void)
 
 		max_zone_pfns[ZONE_NORMAL] = end_pfn;
 
-		free_area_init_nodes(max_zone_pfns);
+		free_area_init(max_zone_pfns);
 	}
 
 	printk("Booting Linux...\n");
@@ -1969,73 +2493,9 @@ int page_in_phys_avail(unsigned long paddr)
 	return 0;
 }
 
-static struct linux_prom64_registers pavail_rescan[MAX_BANKS] __initdata;
-static int pavail_rescan_ents __initdata;
-
-/* Certain OBP calls, such as fetching "available" properties, can
- * claim physical memory.  So, along with initializing the valid
- * address bitmap, what we do here is refetch the physical available
- * memory list again, and make sure it provides at least as much
- * memory as 'pavail' does.
- */
-static void __init setup_valid_addr_bitmap_from_pavail(unsigned long *bitmap)
-{
-	int i;
-
-	read_obp_memory("available", &pavail_rescan[0], &pavail_rescan_ents);
-
-	for (i = 0; i < pavail_ents; i++) {
-		unsigned long old_start, old_end;
-
-		old_start = pavail[i].phys_addr;
-		old_end = old_start + pavail[i].reg_size;
-		while (old_start < old_end) {
-			int n;
-
-			for (n = 0; n < pavail_rescan_ents; n++) {
-				unsigned long new_start, new_end;
-
-				new_start = pavail_rescan[n].phys_addr;
-				new_end = new_start +
-					pavail_rescan[n].reg_size;
-
-				if (new_start <= old_start &&
-				    new_end >= (old_start + PAGE_SIZE)) {
-					set_bit(old_start >> 22, bitmap);
-					goto do_next_page;
-				}
-			}
-
-			prom_printf("mem_init: Lost memory in pavail\n");
-			prom_printf("mem_init: OLD start[%lx] size[%lx]\n",
-				    pavail[i].phys_addr,
-				    pavail[i].reg_size);
-			prom_printf("mem_init: NEW start[%lx] size[%lx]\n",
-				    pavail_rescan[i].phys_addr,
-				    pavail_rescan[i].reg_size);
-			prom_printf("mem_init: Cannot continue, aborting.\n");
-			prom_halt();
-
-		do_next_page:
-			old_start += PAGE_SIZE;
-		}
-	}
-}
-
-static void __init patch_tlb_miss_handler_bitmap(void)
-{
-	extern unsigned int valid_addr_bitmap_insn[];
-	extern unsigned int valid_addr_bitmap_patch[];
-
-	valid_addr_bitmap_insn[1] = valid_addr_bitmap_patch[1];
-	mb();
-	valid_addr_bitmap_insn[0] = valid_addr_bitmap_patch[0];
-	flushi(&valid_addr_bitmap_insn[0]);
-}
-
 static void __init register_page_bootmem_info(void)
 {
-#ifdef CONFIG_NEED_MULTIPLE_NODES
+#ifdef CONFIG_NUMA
 	int i;
 
 	for_each_online_node(i)
@@ -2045,22 +2505,13 @@ static void __init register_page_bootmem_info(void)
 }
 void __init mem_init(void)
 {
-	unsigned long addr, last;
-
-	addr = PAGE_OFFSET + kern_base;
-	last = PAGE_ALIGN(kern_size) + addr;
-	while (addr < last) {
-		set_bit(__pa(addr) >> 22, sparc64_valid_addr_bitmap);
-		addr += PAGE_SIZE;
-	}
-
-	setup_valid_addr_bitmap_from_pavail(sparc64_valid_addr_bitmap);
-	patch_tlb_miss_handler_bitmap();
-
-	high_memory = __va(last_valid_pfn << PAGE_SHIFT);
-
+	/*
+	 * Must be done after boot memory is put on freelist, because here we
+	 * might set fields in deferred struct pages that have not yet been
+	 * initialized, and memblock_free_all() initializes all the reserved
+	 * deferred pages for us.
+	 */
 	register_page_bootmem_info();
-	free_all_bootmem();
 
 	/*
 	 * Set up the zero page, mark it reserved, so that page count
@@ -2073,7 +2524,6 @@ void __init mem_init(void)
 	}
 	mark_page_reserved(mem_map_zero);
 
-	mem_init_print_info(NULL);
 
 	if (tlb_type == cheetah || tlb_type == cheetah_plus)
 		cheetah_ecache_flush_init();
@@ -2110,21 +2560,6 @@ void free_initmem(void)
 	}
 }
 
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
-	free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM,
-			   "initrd");
-}
-#endif
-
-#define _PAGE_CACHE_4U	(_PAGE_CP_4U | _PAGE_CV_4U)
-#define _PAGE_CACHE_4V	(_PAGE_CP_4V | _PAGE_CV_4V)
-#define __DIRTY_BITS_4U	 (_PAGE_MODIFIED_4U | _PAGE_WRITE_4U | _PAGE_W_4U)
-#define __DIRTY_BITS_4V	 (_PAGE_MODIFIED_4V | _PAGE_WRITE_4V | _PAGE_W_4V)
-#define __ACCESS_BITS_4U (_PAGE_ACCESSED_4U | _PAGE_READ_4U | _PAGE_R)
-#define __ACCESS_BITS_4V (_PAGE_ACCESSED_4V | _PAGE_READ_4V | _PAGE_R)
-
 pgprot_t PAGE_KERNEL __read_mostly;
 EXPORT_SYMBOL(PAGE_KERNEL);
 
@@ -2146,18 +2581,9 @@ unsigned long _PAGE_CACHE __read_mostly;
 EXPORT_SYMBOL(_PAGE_CACHE);
 
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
-unsigned long vmemmap_table[VMEMMAP_SIZE];
-
-static long __meminitdata addr_start, addr_end;
-static int __meminitdata node_start;
-
 int __meminit vmemmap_populate(unsigned long vstart, unsigned long vend,
-			       int node)
+			       int node, struct vmem_altmap *altmap)
 {
-	unsigned long phys_start = (vstart - VMEMMAP_BASE);
-	unsigned long phys_end = (vend - VMEMMAP_BASE);
-	unsigned long addr = phys_start & VMEMMAP_CHUNK_MASK;
-	unsigned long end = VMEMMAP_ALIGN(phys_end);
 	unsigned long pte_base;
 
 	pte_base = (_PAGE_VALID | _PAGE_SZ4MB_4U |
@@ -2165,52 +2591,49 @@ int __meminit vmemmap_populate(unsigned long vstart, unsigned long vend,
 		    _PAGE_P_4U | _PAGE_W_4U);
 	if (tlb_type == hypervisor)
 		pte_base = (_PAGE_VALID | _PAGE_SZ4MB_4V |
-			    _PAGE_CP_4V | _PAGE_CV_4V |
-			    _PAGE_P_4V | _PAGE_W_4V);
+			    page_cache4v_flag | _PAGE_P_4V | _PAGE_W_4V);
+
+	pte_base |= _PAGE_PMD_HUGE;
+
+	vstart = vstart & PMD_MASK;
+	vend = ALIGN(vend, PMD_SIZE);
+	for (; vstart < vend; vstart += PMD_SIZE) {
+		pgd_t *pgd = vmemmap_pgd_populate(vstart, node);
+		unsigned long pte;
+		p4d_t *p4d;
+		pud_t *pud;
+		pmd_t *pmd;
 
-	for (; addr < end; addr += VMEMMAP_CHUNK) {
-		unsigned long *vmem_pp =
-			vmemmap_table + (addr >> VMEMMAP_CHUNK_SHIFT);
-		void *block;
+		if (!pgd)
+			return -ENOMEM;
+
+		p4d = vmemmap_p4d_populate(pgd, vstart, node);
+		if (!p4d)
+			return -ENOMEM;
+
+		pud = vmemmap_pud_populate(p4d, vstart, node);
+		if (!pud)
+			return -ENOMEM;
+
+		pmd = pmd_offset(pud, vstart);
+		pte = pmd_val(*pmd);
+		if (!(pte & _PAGE_VALID)) {
+			void *block = vmemmap_alloc_block(PMD_SIZE, node);
 
-		if (!(*vmem_pp & _PAGE_VALID)) {
-			block = vmemmap_alloc_block(1UL << 22, node);
 			if (!block)
 				return -ENOMEM;
 
-			*vmem_pp = pte_base | __pa(block);
-
-			/* check to see if we have contiguous blocks */
-			if (addr_end != addr || node_start != node) {
-				if (addr_start)
-					printk(KERN_DEBUG " [%lx-%lx] on node %d\n",
-					       addr_start, addr_end-1, node_start);
-				addr_start = addr;
-				node_start = node;
-			}
-			addr_end = addr + VMEMMAP_CHUNK;
+			pmd_val(*pmd) = pte_base | __pa(block);
 		}
 	}
-	return 0;
-}
 
-void __meminit vmemmap_populate_print_last(void)
-{
-	if (addr_start) {
-		printk(KERN_DEBUG " [%lx-%lx] on node %d\n",
-		       addr_start, addr_end-1, node_start);
-		addr_start = 0;
-		addr_end = 0;
-		node_start = 0;
-	}
-}
-
-void vmemmap_free(unsigned long start, unsigned long end)
-{
+	return 0;
 }
-
 #endif /* CONFIG_SPARSEMEM_VMEMMAP */
 
+/* These are actually filled in at boot time by sun4{u,v}_pgprot_init() */
+static pgprot_t protection_map[16] __ro_after_init;
+
 static void prot_init_common(unsigned long page_none,
 			     unsigned long page_shared,
 			     unsigned long page_copy,
@@ -2261,10 +2684,10 @@ static void __init sun4u_pgprot_init(void)
 		     __ACCESS_BITS_4U | _PAGE_E_4U);
 
 #ifdef CONFIG_DEBUG_PAGEALLOC
-	kern_linear_pte_xor[0] = _PAGE_VALID ^ 0xfffff80000000000UL;
+	kern_linear_pte_xor[0] = _PAGE_VALID ^ PAGE_OFFSET;
 #else
 	kern_linear_pte_xor[0] = (_PAGE_VALID | _PAGE_SZ4MB_4U) ^
-		0xfffff80000000000UL;
+		PAGE_OFFSET;
 #endif
 	kern_linear_pte_xor[0] |= (_PAGE_CP_4U | _PAGE_CV_4U |
 				   _PAGE_P_4U | _PAGE_W_4U);
@@ -2298,23 +2721,23 @@ static void __init sun4v_pgprot_init(void)
 	int i;
 
 	PAGE_KERNEL = __pgprot (_PAGE_PRESENT_4V | _PAGE_VALID |
-				_PAGE_CACHE_4V | _PAGE_P_4V |
+				page_cache4v_flag | _PAGE_P_4V |
 				__ACCESS_BITS_4V | __DIRTY_BITS_4V |
 				_PAGE_EXEC_4V);
 	PAGE_KERNEL_LOCKED = PAGE_KERNEL;
 
 	_PAGE_IE = _PAGE_IE_4V;
 	_PAGE_E = _PAGE_E_4V;
-	_PAGE_CACHE = _PAGE_CACHE_4V;
+	_PAGE_CACHE = page_cache4v_flag;
 
 #ifdef CONFIG_DEBUG_PAGEALLOC
-	kern_linear_pte_xor[0] = _PAGE_VALID ^ 0xfffff80000000000UL;
+	kern_linear_pte_xor[0] = _PAGE_VALID ^ PAGE_OFFSET;
 #else
 	kern_linear_pte_xor[0] = (_PAGE_VALID | _PAGE_SZ4MB_4V) ^
-		0xfffff80000000000UL;
+		PAGE_OFFSET;
 #endif
-	kern_linear_pte_xor[0] |= (_PAGE_CP_4V | _PAGE_CV_4V |
-				   _PAGE_P_4V | _PAGE_W_4V);
+	kern_linear_pte_xor[0] |= (page_cache4v_flag | _PAGE_P_4V |
+				   _PAGE_W_4V);
 
 	for (i = 1; i < 4; i++)
 		kern_linear_pte_xor[i] = kern_linear_pte_xor[0];
@@ -2327,12 +2750,12 @@ static void __init sun4v_pgprot_init(void)
 			     _PAGE_SZ4MB_4V | _PAGE_SZ512K_4V |
 			     _PAGE_SZ64K_4V | _PAGE_SZ8K_4V);
 
-	page_none = _PAGE_PRESENT_4V | _PAGE_ACCESSED_4V | _PAGE_CACHE_4V;
-	page_shared = (_PAGE_VALID | _PAGE_PRESENT_4V | _PAGE_CACHE_4V |
+	page_none = _PAGE_PRESENT_4V | _PAGE_ACCESSED_4V | page_cache4v_flag;
+	page_shared = (_PAGE_VALID | _PAGE_PRESENT_4V | page_cache4v_flag |
 		       __ACCESS_BITS_4V | _PAGE_WRITE_4V | _PAGE_EXEC_4V);
-	page_copy   = (_PAGE_VALID | _PAGE_PRESENT_4V | _PAGE_CACHE_4V |
+	page_copy   = (_PAGE_VALID | _PAGE_PRESENT_4V | page_cache4v_flag |
 		       __ACCESS_BITS_4V | _PAGE_EXEC_4V);
-	page_readonly = (_PAGE_VALID | _PAGE_PRESENT_4V | _PAGE_CACHE_4V |
+	page_readonly = (_PAGE_VALID | _PAGE_PRESENT_4V | page_cache4v_flag |
 			 __ACCESS_BITS_4V | _PAGE_EXEC_4V);
 
 	page_exec_bit = _PAGE_EXEC_4V;
@@ -2390,7 +2813,7 @@ static unsigned long kern_large_tte(unsigned long paddr)
 	       _PAGE_EXEC_4U | _PAGE_L_4U | _PAGE_W_4U);
 	if (tlb_type == hypervisor)
 		val = (_PAGE_VALID | _PAGE_SZ4MB_4V |
-		       _PAGE_CP_4V | _PAGE_CV_4V | _PAGE_P_4V |
+		       page_cache4v_flag | _PAGE_P_4V |
 		       _PAGE_EXEC_4V | _PAGE_W_4V);
 
 	return val | paddr;
@@ -2455,92 +2878,40 @@ void __flush_tlb_all(void)
 			     : : "r" (pstate));
 }
 
-static pte_t *get_from_cache(struct mm_struct *mm)
+static pte_t *__pte_alloc_one(struct mm_struct *mm)
 {
-	struct page *page;
-	pte_t *ret;
+	struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL | __GFP_ZERO, 0);
 
-	spin_lock(&mm->page_table_lock);
-	page = mm->context.pgtable_page;
-	ret = NULL;
-	if (page) {
-		void *p = page_address(page);
-
-		mm->context.pgtable_page = NULL;
-
-		ret = (pte_t *) (p + (PAGE_SIZE / 2));
+	if (!ptdesc)
+		return NULL;
+	if (!pagetable_pte_ctor(mm, ptdesc)) {
+		pagetable_free(ptdesc);
+		return NULL;
 	}
-	spin_unlock(&mm->page_table_lock);
-
-	return ret;
+	return ptdesc_address(ptdesc);
 }
 
-static struct page *__alloc_for_cache(struct mm_struct *mm)
+pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
 {
-	struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK |
-				       __GFP_REPEAT | __GFP_ZERO);
-
-	if (page) {
-		spin_lock(&mm->page_table_lock);
-		if (!mm->context.pgtable_page) {
-			atomic_set(&page->_count, 2);
-			mm->context.pgtable_page = page;
-		}
-		spin_unlock(&mm->page_table_lock);
-	}
-	return page;
+	return __pte_alloc_one(mm);
 }
 
-pte_t *pte_alloc_one_kernel(struct mm_struct *mm,
-			    unsigned long address)
+pgtable_t pte_alloc_one(struct mm_struct *mm)
 {
-	struct page *page;
-	pte_t *pte;
-
-	pte = get_from_cache(mm);
-	if (pte)
-		return pte;
-
-	page = __alloc_for_cache(mm);
-	if (page)
-		pte = (pte_t *) page_address(page);
-
-	return pte;
+	return __pte_alloc_one(mm);
 }
 
-pgtable_t pte_alloc_one(struct mm_struct *mm,
-			unsigned long address)
+static void __pte_free(pgtable_t pte)
 {
-	struct page *page;
-	pte_t *pte;
-
-	pte = get_from_cache(mm);
-	if (pte)
-		return pte;
-
-	page = __alloc_for_cache(mm);
-	if (page) {
-		pgtable_page_ctor(page);
-		pte = (pte_t *) page_address(page);
-	}
+	struct ptdesc *ptdesc = virt_to_ptdesc(pte);
 
-	return pte;
+	pagetable_dtor(ptdesc);
+	pagetable_free(ptdesc);
 }
 
 void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 {
-	struct page *page = virt_to_page(pte);
-	if (put_page_testzero(page))
-		free_hot_cold_page(page, 0);
-}
-
-static void __pte_free(pgtable_t pte)
-{
-	struct page *page = virt_to_page(pte);
-	if (put_page_testzero(page)) {
-		pgtable_page_dtor(page);
-		free_hot_cold_page(page, 0);
-	}
+	__pte_free(pte);
 }
 
 void pte_free(struct mm_struct *mm, pgtable_t pte)
@@ -2557,92 +2928,20 @@ void pgtable_free(void *table, bool is_page)
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot, bool for_modify)
+static void pte_free_now(struct rcu_head *head)
 {
-	if (pgprot_val(pgprot) & _PAGE_VALID)
-		pmd_val(pmd) |= PMD_HUGE_PRESENT;
-	if (tlb_type == hypervisor) {
-		if (pgprot_val(pgprot) & _PAGE_WRITE_4V)
-			pmd_val(pmd) |= PMD_HUGE_WRITE;
-		if (pgprot_val(pgprot) & _PAGE_EXEC_4V)
-			pmd_val(pmd) |= PMD_HUGE_EXEC;
-
-		if (!for_modify) {
-			if (pgprot_val(pgprot) & _PAGE_ACCESSED_4V)
-				pmd_val(pmd) |= PMD_HUGE_ACCESSED;
-			if (pgprot_val(pgprot) & _PAGE_MODIFIED_4V)
-				pmd_val(pmd) |= PMD_HUGE_DIRTY;
-		}
-	} else {
-		if (pgprot_val(pgprot) & _PAGE_WRITE_4U)
-			pmd_val(pmd) |= PMD_HUGE_WRITE;
-		if (pgprot_val(pgprot) & _PAGE_EXEC_4U)
-			pmd_val(pmd) |= PMD_HUGE_EXEC;
-
-		if (!for_modify) {
-			if (pgprot_val(pgprot) & _PAGE_ACCESSED_4U)
-				pmd_val(pmd) |= PMD_HUGE_ACCESSED;
-			if (pgprot_val(pgprot) & _PAGE_MODIFIED_4U)
-				pmd_val(pmd) |= PMD_HUGE_DIRTY;
-		}
-	}
-
-	return pmd;
-}
-
-pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
-{
-	pmd_t pmd;
-
-	pmd_val(pmd) = (page_nr << ((PAGE_SHIFT - PMD_PADDR_SHIFT)));
-	pmd_val(pmd) |= PMD_ISHUGE;
-	pmd = pmd_set_protbits(pmd, pgprot, false);
-	return pmd;
-}
+	struct page *page;
 
-pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
-{
-	pmd_val(pmd) &= ~(PMD_HUGE_PRESENT |
-			  PMD_HUGE_WRITE |
-			  PMD_HUGE_EXEC);
-	pmd = pmd_set_protbits(pmd, newprot, true);
-	return pmd;
+	page = container_of(head, struct page, rcu_head);
+	__pte_free((pgtable_t)page_address(page));
 }
 
-pgprot_t pmd_pgprot(pmd_t entry)
+void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable)
 {
-	unsigned long pte = 0;
-
-	if (pmd_val(entry) & PMD_HUGE_PRESENT)
-		pte |= _PAGE_VALID;
-
-	if (tlb_type == hypervisor) {
-		if (pmd_val(entry) & PMD_HUGE_PRESENT)
-			pte |= _PAGE_PRESENT_4V;
-		if (pmd_val(entry) & PMD_HUGE_EXEC)
-			pte |= _PAGE_EXEC_4V;
-		if (pmd_val(entry) & PMD_HUGE_WRITE)
-			pte |= _PAGE_W_4V;
-		if (pmd_val(entry) & PMD_HUGE_ACCESSED)
-			pte |= _PAGE_ACCESSED_4V;
-		if (pmd_val(entry) & PMD_HUGE_DIRTY)
-			pte |= _PAGE_MODIFIED_4V;
-		pte |= _PAGE_CP_4V|_PAGE_CV_4V;
-	} else {
-		if (pmd_val(entry) & PMD_HUGE_PRESENT)
-			pte |= _PAGE_PRESENT_4U;
-		if (pmd_val(entry) & PMD_HUGE_EXEC)
-			pte |= _PAGE_EXEC_4U;
-		if (pmd_val(entry) & PMD_HUGE_WRITE)
-			pte |= _PAGE_W_4U;
-		if (pmd_val(entry) & PMD_HUGE_ACCESSED)
-			pte |= _PAGE_ACCESSED_4U;
-		if (pmd_val(entry) & PMD_HUGE_DIRTY)
-			pte |= _PAGE_MODIFIED_4U;
-		pte |= _PAGE_CP_4U|_PAGE_CV_4U;
-	}
+	struct page *page;
 
-	return __pgprot(pte);
+	page = virt_to_page(pgtable);
+	call_rcu(&page->rcu_head, pte_free_now);
 }
 
 void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
@@ -2651,30 +2950,25 @@ void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
 	unsigned long pte, flags;
 	struct mm_struct *mm;
 	pmd_t entry = *pmd;
-	pgprot_t prot;
 
-	if (!pmd_large(entry) || !pmd_young(entry))
+	if (!pmd_leaf(entry) || !pmd_young(entry))
 		return;
 
-	pte = (pmd_val(entry) & ~PMD_HUGE_PROTBITS);
-	pte <<= PMD_PADDR_SHIFT;
-	pte |= _PAGE_VALID;
+	pte = pmd_val(entry);
 
-	prot = pmd_pgprot(entry);
-
-	if (tlb_type == hypervisor)
-		pgprot_val(prot) |= _PAGE_SZHUGE_4V;
-	else
-		pgprot_val(prot) |= _PAGE_SZHUGE_4U;
+	/* Don't insert a non-valid PMD into the TSB, we'll deadlock.  */
+	if (!(pte & _PAGE_VALID))
+		return;
 
-	pte |= pgprot_val(prot);
+	/* We are fabricating 8MB pages using 4MB real hw pages.  */
+	pte |= (addr & (1UL << REAL_HPAGE_SHIFT));
 
 	mm = vma->vm_mm;
 
 	spin_lock_irqsave(&mm->context.lock, flags);
 
 	if (mm->context.tsb_block[MM_TSB_HUGE].tsb != NULL)
-		__update_mmu_tsb_insert(mm, MM_TSB_HUGE, HPAGE_SHIFT,
+		__update_mmu_tsb_insert(mm, MM_TSB_HUGE, REAL_HPAGE_SHIFT,
 					addr, pte);
 
 	spin_unlock_irqrestore(&mm->context.lock, flags);
@@ -2695,7 +2989,7 @@ void hugetlb_setup(struct pt_regs *regs)
 	struct mm_struct *mm = current->mm;
 	struct tsb_config *tp;
 
-	if (in_atomic() || !mm) {
+	if (faulthandler_disabled() || !mm) {
 		const struct exception_table_entry *entry;
 
 		entry = search_exception_tables(regs->tpc);
@@ -2719,9 +3013,10 @@ void hugetlb_setup(struct pt_regs *regs)
 	 * the Data-TLB for huge pages.
 	 */
 	if (tlb_type == cheetah_plus) {
+		bool need_context_reload = false;
 		unsigned long ctx;
 
-		spin_lock(&ctx_alloc_lock);
+		spin_lock_irq(&ctx_alloc_lock);
 		ctx = mm->context.sparc64_ctx_val;
 		ctx &= ~CTX_PGSZ_MASK;
 		ctx |= CTX_PGSZ_BASE << CTX_PGSZ0_SHIFT;
@@ -2740,9 +3035,180 @@ void hugetlb_setup(struct pt_regs *regs)
 			 * also executing in this address space.
 			 */
 			mm->context.sparc64_ctx_val = ctx;
+			need_context_reload = true;
+		}
+		spin_unlock_irq(&ctx_alloc_lock);
+
+		if (need_context_reload)
 			on_each_cpu(context_reload, mm, 0);
+	}
+}
+#endif
+
+static struct resource code_resource = {
+	.name	= "Kernel code",
+	.flags	= IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
+};
+
+static struct resource data_resource = {
+	.name	= "Kernel data",
+	.flags	= IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
+};
+
+static struct resource bss_resource = {
+	.name	= "Kernel bss",
+	.flags	= IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
+};
+
+static inline resource_size_t compute_kern_paddr(void *addr)
+{
+	return (resource_size_t) (addr - KERNBASE + kern_base);
+}
+
+static void __init kernel_lds_init(void)
+{
+	code_resource.start = compute_kern_paddr(_text);
+	code_resource.end   = compute_kern_paddr(_etext - 1);
+	data_resource.start = compute_kern_paddr(_etext);
+	data_resource.end   = compute_kern_paddr(_edata - 1);
+	bss_resource.start  = compute_kern_paddr(__bss_start);
+	bss_resource.end    = compute_kern_paddr(_end - 1);
+}
+
+static int __init report_memory(void)
+{
+	int i;
+	struct resource *res;
+
+	kernel_lds_init();
+
+	for (i = 0; i < pavail_ents; i++) {
+		res = kzalloc(sizeof(struct resource), GFP_KERNEL);
+
+		if (!res) {
+			pr_warn("Failed to allocate source.\n");
+			break;
+		}
+
+		res->name = "System RAM";
+		res->start = pavail[i].phys_addr;
+		res->end = pavail[i].phys_addr + pavail[i].reg_size - 1;
+		res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM;
+
+		if (insert_resource(&iomem_resource, res) < 0) {
+			pr_warn("Resource insertion failed.\n");
+			break;
 		}
-		spin_unlock(&ctx_alloc_lock);
+
+		insert_resource(res, &code_resource);
+		insert_resource(res, &data_resource);
+		insert_resource(res, &bss_resource);
 	}
+
+	return 0;
 }
+arch_initcall(report_memory);
+
+#ifdef CONFIG_SMP
+#define do_flush_tlb_kernel_range	smp_flush_tlb_kernel_range
+#else
+#define do_flush_tlb_kernel_range	__flush_tlb_kernel_range
 #endif
+
+void flush_tlb_kernel_range(unsigned long start, unsigned long end)
+{
+	if (start < HI_OBP_ADDRESS && end > LOW_OBP_ADDRESS) {
+		if (start < LOW_OBP_ADDRESS) {
+			flush_tsb_kernel_range(start, LOW_OBP_ADDRESS);
+			do_flush_tlb_kernel_range(start, LOW_OBP_ADDRESS);
+		}
+		if (end > HI_OBP_ADDRESS) {
+			flush_tsb_kernel_range(HI_OBP_ADDRESS, end);
+			do_flush_tlb_kernel_range(HI_OBP_ADDRESS, end);
+		}
+	} else {
+		flush_tsb_kernel_range(start, end);
+		do_flush_tlb_kernel_range(start, end);
+	}
+}
+
+void copy_user_highpage(struct page *to, struct page *from,
+	unsigned long vaddr, struct vm_area_struct *vma)
+{
+	char *vfrom, *vto;
+
+	vfrom = kmap_atomic(from);
+	vto = kmap_atomic(to);
+	copy_user_page(vto, vfrom, vaddr, to);
+	kunmap_atomic(vto);
+	kunmap_atomic(vfrom);
+
+	/* If this page has ADI enabled, copy over any ADI tags
+	 * as well
+	 */
+	if (vma->vm_flags & VM_SPARC_ADI) {
+		unsigned long pfrom, pto, i, adi_tag;
+
+		pfrom = page_to_phys(from);
+		pto = page_to_phys(to);
+
+		for (i = pfrom; i < (pfrom + PAGE_SIZE); i += adi_blksize()) {
+			asm volatile("ldxa [%1] %2, %0\n\t"
+					: "=r" (adi_tag)
+					:  "r" (i), "i" (ASI_MCD_REAL));
+			asm volatile("stxa %0, [%1] %2\n\t"
+					:
+					: "r" (adi_tag), "r" (pto),
+					  "i" (ASI_MCD_REAL));
+			pto += adi_blksize();
+		}
+		asm volatile("membar #Sync\n\t");
+	}
+}
+EXPORT_SYMBOL(copy_user_highpage);
+
+void copy_highpage(struct page *to, struct page *from)
+{
+	char *vfrom, *vto;
+
+	vfrom = kmap_atomic(from);
+	vto = kmap_atomic(to);
+	copy_page(vto, vfrom);
+	kunmap_atomic(vto);
+	kunmap_atomic(vfrom);
+
+	/* If this platform is ADI enabled, copy any ADI tags
+	 * as well
+	 */
+	if (adi_capable()) {
+		unsigned long pfrom, pto, i, adi_tag;
+
+		pfrom = page_to_phys(from);
+		pto = page_to_phys(to);
+
+		for (i = pfrom; i < (pfrom + PAGE_SIZE); i += adi_blksize()) {
+			asm volatile("ldxa [%1] %2, %0\n\t"
+					: "=r" (adi_tag)
+					:  "r" (i), "i" (ASI_MCD_REAL));
+			asm volatile("stxa %0, [%1] %2\n\t"
+					:
+					: "r" (adi_tag), "r" (pto),
+					  "i" (ASI_MCD_REAL));
+			pto += adi_blksize();
+		}
+		asm volatile("membar #Sync\n\t");
+	}
+}
+EXPORT_SYMBOL(copy_highpage);
+
+pgprot_t vm_get_page_prot(vm_flags_t vm_flags)
+{
+	unsigned long prot = pgprot_val(protection_map[vm_flags &
+					(VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]);
+
+	if (vm_flags & VM_SPARC_ADI)
+		prot |= _PAGE_MCD_4V;
+
+	return __pgprot(prot);
+}
+EXPORT_SYMBOL(vm_get_page_prot);