summaryrefslogtreecommitdiff
path: root/arch/sparc/mm/init_64.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/sparc/mm/init_64.c')
-rw-r--r--arch/sparc/mm/init_64.c1680
1 files changed, 1073 insertions, 607 deletions
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index a9c42a7ffb6a..df9f7c444c39 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* arch/sparc64/mm/init.c
*
@@ -5,12 +6,12 @@
* Copyright (C) 1997-1999 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
*/
-#include <linux/module.h>
+#include <linux/extable.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/init.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/initrd.h>
@@ -22,19 +23,19 @@
#include <linux/kprobes.h>
#include <linux/cache.h>
#include <linux/sort.h>
+#include <linux/ioport.h>
#include <linux/percpu.h>
-#include <linux/memblock.h>
#include <linux/mmzone.h>
#include <linux/gfp.h>
+#include <linux/bootmem_info.h>
#include <asm/head.h>
#include <asm/page.h>
#include <asm/pgalloc.h>
-#include <asm/pgtable.h>
#include <asm/oplib.h>
#include <asm/iommu.h>
#include <asm/io.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/tlbflush.h>
#include <asm/dma.h>
@@ -47,11 +48,13 @@
#include <asm/prom.h>
#include <asm/mdesc.h>
#include <asm/cpudata.h>
+#include <asm/setup.h>
#include <asm/irq.h>
#include "init_64.h"
unsigned long kern_linear_pte_xor[4] __read_mostly;
+static unsigned long page_cache4v_flag;
/* A bitmap, two bits for every 256MB of physical memory. These two
* bits determine what page size we use for kernel linear
@@ -73,7 +76,6 @@ unsigned long kern_linear_pte_xor[4] __read_mostly;
* 'cpu' properties, but we need to have this table setup before the
* MDESC is initialized.
*/
-unsigned long kpte_linear_bitmap[KPTE_BITMAP_BYTES / sizeof(unsigned long)];
#ifndef CONFIG_DEBUG_PAGEALLOC
/* A special kernel TSB for 4MB, 256MB, 2GB and 16GB linear mappings.
@@ -82,14 +84,17 @@ unsigned long kpte_linear_bitmap[KPTE_BITMAP_BYTES / sizeof(unsigned long)];
*/
extern struct tsb swapper_4m_tsb[KERNEL_TSB4M_NENTRIES];
#endif
+extern struct tsb swapper_tsb[KERNEL_TSB_NENTRIES];
static unsigned long cpu_pgsz_mask;
-#define MAX_BANKS 32
+#define MAX_BANKS 1024
static struct linux_prom64_registers pavail[MAX_BANKS];
static int pavail_ents;
+u64 numa_latency[MAX_NUMNODES][MAX_NUMNODES];
+
static int cmp_p64(const void *a, const void *b)
{
const struct linux_prom64_registers *x = a, *y = b;
@@ -163,10 +168,6 @@ static void __init read_obp_memory(const char *property,
cmp_p64, NULL);
}
-unsigned long sparc64_valid_addr_bitmap[VALID_ADDR_BITMAP_BYTES /
- sizeof(unsigned long)];
-EXPORT_SYMBOL(sparc64_valid_addr_bitmap);
-
/* Kernel physical address base and size in bytes. */
unsigned long kern_base __read_mostly;
unsigned long kern_size __read_mostly;
@@ -194,21 +195,26 @@ atomic_t dcpage_flushes_xcall = ATOMIC_INIT(0);
#endif
#endif
-inline void flush_dcache_page_impl(struct page *page)
+inline void flush_dcache_folio_impl(struct folio *folio)
{
+ unsigned int i, nr = folio_nr_pages(folio);
+
BUG_ON(tlb_type == hypervisor);
#ifdef CONFIG_DEBUG_DCFLUSH
atomic_inc(&dcpage_flushes);
#endif
#ifdef DCACHE_ALIASING_POSSIBLE
- __flush_dcache_page(page_address(page),
- ((tlb_type == spitfire) &&
- page_mapping(page) != NULL));
+ for (i = 0; i < nr; i++)
+ __flush_dcache_page(folio_address(folio) + i * PAGE_SIZE,
+ ((tlb_type == spitfire) &&
+ folio_flush_mapping(folio) != NULL));
#else
- if (page_mapping(page) != NULL &&
- tlb_type == spitfire)
- __flush_icache_page(__pa(page_address(page)));
+ if (folio_flush_mapping(folio) != NULL &&
+ tlb_type == spitfire) {
+ for (i = 0; i < nr; i++)
+ __flush_icache_page((pfn + i) * PAGE_SIZE);
+ }
#endif
}
@@ -217,10 +223,10 @@ inline void flush_dcache_page_impl(struct page *page)
#define PG_dcache_cpu_mask \
((1UL<<ilog2(roundup_pow_of_two(NR_CPUS)))-1UL)
-#define dcache_dirty_cpu(page) \
- (((page)->flags >> PG_dcache_cpu_shift) & PG_dcache_cpu_mask)
+#define dcache_dirty_cpu(folio) \
+ (((folio)->flags.f >> PG_dcache_cpu_shift) & PG_dcache_cpu_mask)
-static inline void set_dcache_dirty(struct page *page, int this_cpu)
+static inline void set_dcache_dirty(struct folio *folio, int this_cpu)
{
unsigned long mask = this_cpu;
unsigned long non_cpu_bits;
@@ -237,11 +243,11 @@ static inline void set_dcache_dirty(struct page *page, int this_cpu)
"bne,pn %%xcc, 1b\n\t"
" nop"
: /* no outputs */
- : "r" (mask), "r" (non_cpu_bits), "r" (&page->flags)
+ : "r" (mask), "r" (non_cpu_bits), "r" (&folio->flags.f)
: "g1", "g7");
}
-static inline void clear_dcache_dirty_cpu(struct page *page, unsigned long cpu)
+static inline void clear_dcache_dirty_cpu(struct folio *folio, unsigned long cpu)
{
unsigned long mask = (1UL << PG_dcache_dirty);
@@ -259,7 +265,7 @@ static inline void clear_dcache_dirty_cpu(struct page *page, unsigned long cpu)
" nop\n"
"2:"
: /* no outputs */
- : "r" (cpu), "r" (mask), "r" (&page->flags),
+ : "r" (cpu), "r" (mask), "r" (&folio->flags.f),
"i" (PG_dcache_cpu_mask),
"i" (PG_dcache_cpu_shift)
: "g1", "g7");
@@ -283,9 +289,10 @@ static void flush_dcache(unsigned long pfn)
page = pfn_to_page(pfn);
if (page) {
+ struct folio *folio = page_folio(page);
unsigned long pg_flags;
- pg_flags = page->flags;
+ pg_flags = folio->flags.f;
if (pg_flags & (1UL << PG_dcache_dirty)) {
int cpu = ((pg_flags >> PG_dcache_cpu_shift) &
PG_dcache_cpu_mask);
@@ -295,11 +302,11 @@ static void flush_dcache(unsigned long pfn)
* in the SMP case.
*/
if (cpu == this_cpu)
- flush_dcache_page_impl(page);
+ flush_dcache_folio_impl(folio);
else
- smp_flush_dcache_page_impl(page, cpu);
+ smp_flush_dcache_folio_impl(folio, cpu);
- clear_dcache_dirty_cpu(page, cpu);
+ clear_dcache_dirty_cpu(folio, cpu);
put_cpu();
}
@@ -323,23 +330,78 @@ static void __update_mmu_tsb_insert(struct mm_struct *mm, unsigned long tsb_inde
tsb_insert(tsb, tag, tte);
}
-#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
-static inline bool is_hugetlb_pte(pte_t pte)
+#ifdef CONFIG_HUGETLB_PAGE
+static int __init hugetlbpage_init(void)
{
- if ((tlb_type == hypervisor &&
- (pte_val(pte) & _PAGE_SZALL_4V) == _PAGE_SZHUGE_4V) ||
- (tlb_type != hypervisor &&
- (pte_val(pte) & _PAGE_SZALL_4U) == _PAGE_SZHUGE_4U))
- return true;
- return false;
+ hugetlb_add_hstate(HPAGE_64K_SHIFT - PAGE_SHIFT);
+ hugetlb_add_hstate(HPAGE_SHIFT - PAGE_SHIFT);
+ hugetlb_add_hstate(HPAGE_256MB_SHIFT - PAGE_SHIFT);
+ hugetlb_add_hstate(HPAGE_2GB_SHIFT - PAGE_SHIFT);
+
+ return 0;
}
-#endif
-void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep)
+arch_initcall(hugetlbpage_init);
+
+static void __init pud_huge_patch(void)
+{
+ struct pud_huge_patch_entry *p;
+ unsigned long addr;
+
+ p = &__pud_huge_patch;
+ addr = p->addr;
+ *(unsigned int *)addr = p->insn;
+
+ __asm__ __volatile__("flush %0" : : "r" (addr));
+}
+
+bool __init arch_hugetlb_valid_size(unsigned long size)
+{
+ unsigned int hugepage_shift = ilog2(size);
+ unsigned short hv_pgsz_idx;
+ unsigned int hv_pgsz_mask;
+
+ switch (hugepage_shift) {
+ case HPAGE_16GB_SHIFT:
+ hv_pgsz_mask = HV_PGSZ_MASK_16GB;
+ hv_pgsz_idx = HV_PGSZ_IDX_16GB;
+ pud_huge_patch();
+ break;
+ case HPAGE_2GB_SHIFT:
+ hv_pgsz_mask = HV_PGSZ_MASK_2GB;
+ hv_pgsz_idx = HV_PGSZ_IDX_2GB;
+ break;
+ case HPAGE_256MB_SHIFT:
+ hv_pgsz_mask = HV_PGSZ_MASK_256MB;
+ hv_pgsz_idx = HV_PGSZ_IDX_256MB;
+ break;
+ case HPAGE_SHIFT:
+ hv_pgsz_mask = HV_PGSZ_MASK_4MB;
+ hv_pgsz_idx = HV_PGSZ_IDX_4MB;
+ break;
+ case HPAGE_64K_SHIFT:
+ hv_pgsz_mask = HV_PGSZ_MASK_64K;
+ hv_pgsz_idx = HV_PGSZ_IDX_64K;
+ break;
+ default:
+ hv_pgsz_mask = 0;
+ }
+
+ if ((hv_pgsz_mask & cpu_pgsz_mask) == 0U)
+ return false;
+
+ return true;
+}
+#endif /* CONFIG_HUGETLB_PAGE */
+
+void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep, unsigned int nr)
{
struct mm_struct *mm;
unsigned long flags;
+ bool is_huge_tsb;
pte_t pte = *ptep;
+ unsigned int i;
if (tlb_type != hypervisor) {
unsigned long pfn = pte_pfn(pte);
@@ -350,22 +412,57 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *
mm = vma->vm_mm;
+ /* Don't insert a non-valid PTE into the TSB, we'll deadlock. */
+ if (!pte_accessible(mm, pte))
+ return;
+
spin_lock_irqsave(&mm->context.lock, flags);
+ is_huge_tsb = false;
#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
- if (mm->context.huge_pte_count && is_hugetlb_pte(pte))
- __update_mmu_tsb_insert(mm, MM_TSB_HUGE, HPAGE_SHIFT,
- address, pte_val(pte));
- else
+ if (mm->context.hugetlb_pte_count || mm->context.thp_pte_count) {
+ unsigned long hugepage_size = PAGE_SIZE;
+
+ if (is_vm_hugetlb_page(vma))
+ hugepage_size = huge_page_size(hstate_vma(vma));
+
+ if (hugepage_size >= PUD_SIZE) {
+ unsigned long mask = 0x1ffc00000UL;
+
+ /* Transfer bits [32:22] from address to resolve
+ * at 4M granularity.
+ */
+ pte_val(pte) &= ~mask;
+ pte_val(pte) |= (address & mask);
+ } else if (hugepage_size >= PMD_SIZE) {
+ /* We are fabricating 8MB pages using 4MB
+ * real hw pages.
+ */
+ pte_val(pte) |= (address & (1UL << REAL_HPAGE_SHIFT));
+ }
+
+ if (hugepage_size >= PMD_SIZE) {
+ __update_mmu_tsb_insert(mm, MM_TSB_HUGE,
+ REAL_HPAGE_SHIFT, address, pte_val(pte));
+ is_huge_tsb = true;
+ }
+ }
#endif
- __update_mmu_tsb_insert(mm, MM_TSB_BASE, PAGE_SHIFT,
- address, pte_val(pte));
+ if (!is_huge_tsb) {
+ for (i = 0; i < nr; i++) {
+ __update_mmu_tsb_insert(mm, MM_TSB_BASE, PAGE_SHIFT,
+ address, pte_val(pte));
+ address += PAGE_SIZE;
+ pte_val(pte) += PAGE_SIZE;
+ }
+ }
spin_unlock_irqrestore(&mm->context.lock, flags);
}
-void flush_dcache_page(struct page *page)
+void flush_dcache_folio(struct folio *folio)
{
+ unsigned long pfn = folio_pfn(folio);
struct address_space *mapping;
int this_cpu;
@@ -376,35 +473,35 @@ void flush_dcache_page(struct page *page)
* is merely the zero page. The 'bigcore' testcase in GDB
* causes this case to run millions of times.
*/
- if (page == ZERO_PAGE(0))
+ if (is_zero_pfn(pfn))
return;
this_cpu = get_cpu();
- mapping = page_mapping(page);
+ mapping = folio_flush_mapping(folio);
if (mapping && !mapping_mapped(mapping)) {
- int dirty = test_bit(PG_dcache_dirty, &page->flags);
+ bool dirty = test_bit(PG_dcache_dirty, &folio->flags.f);
if (dirty) {
- int dirty_cpu = dcache_dirty_cpu(page);
+ int dirty_cpu = dcache_dirty_cpu(folio);
if (dirty_cpu == this_cpu)
goto out;
- smp_flush_dcache_page_impl(page, dirty_cpu);
+ smp_flush_dcache_folio_impl(folio, dirty_cpu);
}
- set_dcache_dirty(page, this_cpu);
+ set_dcache_dirty(folio, this_cpu);
} else {
- /* We could delay the flush for the !page_mapping
+ /* We could delay the flush for the !folio_mapping
* case too. But that case is for exec env/arg
* pages and those are %99 certainly going to get
* faulted into the tlb (and thus flushed) anyways.
*/
- flush_dcache_page_impl(page);
+ flush_dcache_folio_impl(folio);
}
out:
put_cpu();
}
-EXPORT_SYMBOL(flush_dcache_page);
+EXPORT_SYMBOL(flush_dcache_folio);
void __kprobes flush_icache_range(unsigned long start, unsigned long end)
{
@@ -421,10 +518,7 @@ void __kprobes flush_icache_range(unsigned long start, unsigned long end)
if (kaddr >= PAGE_OFFSET)
paddr = kaddr & mask;
else {
- pgd_t *pgdp = pgd_offset_k(kaddr);
- pud_t *pudp = pud_offset(pgdp, kaddr);
- pmd_t *pmdp = pmd_offset(pudp, kaddr);
- pte_t *ptep = pte_offset_kernel(pmdp, kaddr);
+ pte_t *ptep = virt_to_kpte(kaddr);
paddr = pte_val(*ptep) & mask;
}
@@ -588,7 +682,7 @@ static void __init remap_kernel(void)
int i, tlb_ent = sparc64_highest_locked_tlbent();
tte_vaddr = (unsigned long) KERNBASE;
- phys_page = (prom_boot_mapping_phys_low >> 22UL) << 22UL;
+ phys_page = (prom_boot_mapping_phys_low >> ILOG2_4MB) << ILOG2_4MB;
tte_data = kern_large_tte(phys_page);
kern_locked_tte_data = tte_data;
@@ -629,9 +723,10 @@ static void __init inherit_prom_mappings(void)
void prom_world(int enter)
{
- if (!enter)
- set_fs(get_fs());
-
+ /*
+ * No need to change the address space any more, just flush
+ * the register windows
+ */
__asm__ __volatile__("flushw");
}
@@ -662,10 +757,58 @@ EXPORT_SYMBOL(__flush_dcache_range);
/* get_new_mmu_context() uses "cache + 1". */
DEFINE_SPINLOCK(ctx_alloc_lock);
-unsigned long tlb_context_cache = CTX_FIRST_VERSION - 1;
+unsigned long tlb_context_cache = CTX_FIRST_VERSION;
#define MAX_CTX_NR (1UL << CTX_NR_BITS)
#define CTX_BMAP_SLOTS BITS_TO_LONGS(MAX_CTX_NR)
DECLARE_BITMAP(mmu_context_bmap, MAX_CTX_NR);
+DEFINE_PER_CPU(struct mm_struct *, per_cpu_secondary_mm) = {0};
+
+static void mmu_context_wrap(void)
+{
+ unsigned long old_ver = tlb_context_cache & CTX_VERSION_MASK;
+ unsigned long new_ver, new_ctx, old_ctx;
+ struct mm_struct *mm;
+ int cpu;
+
+ bitmap_zero(mmu_context_bmap, 1 << CTX_NR_BITS);
+
+ /* Reserve kernel context */
+ set_bit(0, mmu_context_bmap);
+
+ new_ver = (tlb_context_cache & CTX_VERSION_MASK) + CTX_FIRST_VERSION;
+ if (unlikely(new_ver == 0))
+ new_ver = CTX_FIRST_VERSION;
+ tlb_context_cache = new_ver;
+
+ /*
+ * Make sure that any new mm that are added into per_cpu_secondary_mm,
+ * are going to go through get_new_mmu_context() path.
+ */
+ mb();
+
+ /*
+ * Updated versions to current on those CPUs that had valid secondary
+ * contexts
+ */
+ for_each_online_cpu(cpu) {
+ /*
+ * If a new mm is stored after we took this mm from the array,
+ * it will go into get_new_mmu_context() path, because we
+ * already bumped the version in tlb_context_cache.
+ */
+ mm = per_cpu(per_cpu_secondary_mm, cpu);
+
+ if (unlikely(!mm || mm == &init_mm))
+ continue;
+
+ old_ctx = mm->context.sparc64_ctx_val;
+ if (likely((old_ctx & CTX_VERSION_MASK) == old_ver)) {
+ new_ctx = (old_ctx & ~CTX_VERSION_MASK) | new_ver;
+ set_bit(new_ctx & CTX_NR_MASK, mmu_context_bmap);
+ mm->context.sparc64_ctx_val = new_ctx;
+ }
+ }
+}
/* Caller does TLB context flushing on local CPU if necessary.
* The caller also ensures that CTX_VALID(mm->context) is false.
@@ -681,48 +824,30 @@ void get_new_mmu_context(struct mm_struct *mm)
{
unsigned long ctx, new_ctx;
unsigned long orig_pgsz_bits;
- int new_version;
spin_lock(&ctx_alloc_lock);
+retry:
+ /* wrap might have happened, test again if our context became valid */
+ if (unlikely(CTX_VALID(mm->context)))
+ goto out;
orig_pgsz_bits = (mm->context.sparc64_ctx_val & CTX_PGSZ_MASK);
ctx = (tlb_context_cache + 1) & CTX_NR_MASK;
new_ctx = find_next_zero_bit(mmu_context_bmap, 1 << CTX_NR_BITS, ctx);
- new_version = 0;
if (new_ctx >= (1 << CTX_NR_BITS)) {
new_ctx = find_next_zero_bit(mmu_context_bmap, ctx, 1);
if (new_ctx >= ctx) {
- int i;
- new_ctx = (tlb_context_cache & CTX_VERSION_MASK) +
- CTX_FIRST_VERSION;
- if (new_ctx == 1)
- new_ctx = CTX_FIRST_VERSION;
-
- /* Don't call memset, for 16 entries that's just
- * plain silly...
- */
- mmu_context_bmap[0] = 3;
- mmu_context_bmap[1] = 0;
- mmu_context_bmap[2] = 0;
- mmu_context_bmap[3] = 0;
- for (i = 4; i < CTX_BMAP_SLOTS; i += 4) {
- mmu_context_bmap[i + 0] = 0;
- mmu_context_bmap[i + 1] = 0;
- mmu_context_bmap[i + 2] = 0;
- mmu_context_bmap[i + 3] = 0;
- }
- new_version = 1;
- goto out;
+ mmu_context_wrap();
+ goto retry;
}
}
+ if (mm->context.sparc64_ctx_val)
+ cpumask_clear(mm_cpumask(mm));
mmu_context_bmap[new_ctx>>6] |= (1UL << (new_ctx & 63));
new_ctx |= (tlb_context_cache & CTX_VERSION_MASK);
-out:
tlb_context_cache = new_ctx;
mm->context.sparc64_ctx_val = new_ctx | orig_pgsz_bits;
+out:
spin_unlock(&ctx_alloc_lock);
-
- if (unlikely(new_version))
- smp_new_mmu_context_version();
}
static int numa_enabled = 1;
@@ -789,16 +914,26 @@ static void __init find_ramdisk(unsigned long phys_base)
struct node_mem_mask {
unsigned long mask;
- unsigned long val;
+ unsigned long match;
};
static struct node_mem_mask node_masks[MAX_NUMNODES];
static int num_node_masks;
+#ifdef CONFIG_NUMA
+
+struct mdesc_mlgroup {
+ u64 node;
+ u64 latency;
+ u64 match;
+ u64 mask;
+};
+
+static struct mdesc_mlgroup *mlgroups;
+static int num_mlgroups;
+
int numa_cpu_lookup_table[NR_CPUS];
cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES];
-#ifdef CONFIG_NEED_MULTIPLE_NODES
-
struct mdesc_mblock {
u64 base;
u64 size;
@@ -807,52 +942,128 @@ struct mdesc_mblock {
static struct mdesc_mblock *mblocks;
static int num_mblocks;
-static unsigned long ra_to_pa(unsigned long addr)
+static struct mdesc_mblock * __init addr_to_mblock(unsigned long addr)
{
+ struct mdesc_mblock *m = NULL;
int i;
for (i = 0; i < num_mblocks; i++) {
- struct mdesc_mblock *m = &mblocks[i];
+ m = &mblocks[i];
if (addr >= m->base &&
addr < (m->base + m->size)) {
- addr += m->offset;
break;
}
}
- return addr;
+
+ return m;
}
-static int find_node(unsigned long addr)
+static u64 __init memblock_nid_range_sun4u(u64 start, u64 end, int *nid)
{
- int i;
+ int prev_nid, new_nid;
- addr = ra_to_pa(addr);
- for (i = 0; i < num_node_masks; i++) {
- struct node_mem_mask *p = &node_masks[i];
+ prev_nid = NUMA_NO_NODE;
+ for ( ; start < end; start += PAGE_SIZE) {
+ for (new_nid = 0; new_nid < num_node_masks; new_nid++) {
+ struct node_mem_mask *p = &node_masks[new_nid];
+
+ if ((start & p->mask) == p->match) {
+ if (prev_nid == NUMA_NO_NODE)
+ prev_nid = new_nid;
+ break;
+ }
+ }
+
+ if (new_nid == num_node_masks) {
+ prev_nid = 0;
+ WARN_ONCE(1, "addr[%Lx] doesn't match a NUMA node rule. Some memory will be owned by node 0.",
+ start);
+ break;
+ }
- if ((addr & p->mask) == p->val)
- return i;
+ if (prev_nid != new_nid)
+ break;
}
- return -1;
+ *nid = prev_nid;
+
+ return start > end ? end : start;
}
-static u64 memblock_nid_range(u64 start, u64 end, int *nid)
+static u64 __init memblock_nid_range(u64 start, u64 end, int *nid)
{
- *nid = find_node(start);
- start += PAGE_SIZE;
- while (start < end) {
- int n = find_node(start);
+ u64 ret_end, pa_start, m_mask, m_match, m_end;
+ struct mdesc_mblock *mblock;
+ int _nid, i;
+
+ if (tlb_type != hypervisor)
+ return memblock_nid_range_sun4u(start, end, nid);
+
+ mblock = addr_to_mblock(start);
+ if (!mblock) {
+ WARN_ONCE(1, "memblock_nid_range: Can't find mblock addr[%Lx]",
+ start);
+
+ _nid = 0;
+ ret_end = end;
+ goto done;
+ }
- if (n != *nid)
+ pa_start = start + mblock->offset;
+ m_match = 0;
+ m_mask = 0;
+
+ for (_nid = 0; _nid < num_node_masks; _nid++) {
+ struct node_mem_mask *const m = &node_masks[_nid];
+
+ if ((pa_start & m->mask) == m->match) {
+ m_match = m->match;
+ m_mask = m->mask;
break;
- start += PAGE_SIZE;
+ }
}
- if (start > end)
- start = end;
+ if (num_node_masks == _nid) {
+ /* We could not find NUMA group, so default to 0, but lets
+ * search for latency group, so we could calculate the correct
+ * end address that we return
+ */
+ _nid = 0;
+
+ for (i = 0; i < num_mlgroups; i++) {
+ struct mdesc_mlgroup *const m = &mlgroups[i];
- return start;
+ if ((pa_start & m->mask) == m->match) {
+ m_match = m->match;
+ m_mask = m->mask;
+ break;
+ }
+ }
+
+ if (i == num_mlgroups) {
+ WARN_ONCE(1, "memblock_nid_range: Can't find latency group addr[%Lx]",
+ start);
+
+ ret_end = end;
+ goto done;
+ }
+ }
+
+ /*
+ * Each latency group has match and mask, and each memory block has an
+ * offset. An address belongs to a latency group if its address matches
+ * the following formula: ((addr + offset) & mask) == match
+ * It is, however, slow to check every single page if it matches a
+ * particular latency group. As optimization we calculate end value by
+ * using bit arithmetics.
+ */
+ m_end = m_match + (1ul << __ffs(m_mask)) - mblock->offset;
+ m_end += pa_start & ~((1ul << fls64(m_mask)) - 1);
+ ret_end = m_end > end ? end : m_end;
+
+done:
+ *nid = _nid;
+ return ret_end;
}
#endif
@@ -864,16 +1075,9 @@ static void __init allocate_node_data(int nid)
{
struct pglist_data *p;
unsigned long start_pfn, end_pfn;
-#ifdef CONFIG_NEED_MULTIPLE_NODES
- unsigned long paddr;
- paddr = memblock_alloc_try_nid(sizeof(struct pglist_data), SMP_CACHE_BYTES, nid);
- if (!paddr) {
- prom_printf("Cannot allocate pglist_data for nid[%d]\n", nid);
- prom_halt();
- }
- NODE_DATA(nid) = __va(paddr);
- memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
+#ifdef CONFIG_NUMA
+ alloc_node_data(nid);
NODE_DATA(nid)->node_id = nid;
#endif
@@ -887,34 +1091,28 @@ static void __init allocate_node_data(int nid)
static void init_node_masks_nonnuma(void)
{
+#ifdef CONFIG_NUMA
int i;
+#endif
numadbg("Initializing tables for non-numa.\n");
- node_masks[0].mask = node_masks[0].val = 0;
+ node_masks[0].mask = 0;
+ node_masks[0].match = 0;
num_node_masks = 1;
+#ifdef CONFIG_NUMA
for (i = 0; i < NR_CPUS; i++)
numa_cpu_lookup_table[i] = 0;
cpumask_setall(&numa_cpumask_lookup_table[0]);
+#endif
}
-#ifdef CONFIG_NEED_MULTIPLE_NODES
-struct pglist_data *node_data[MAX_NUMNODES];
+#ifdef CONFIG_NUMA
EXPORT_SYMBOL(numa_cpu_lookup_table);
EXPORT_SYMBOL(numa_cpumask_lookup_table);
-EXPORT_SYMBOL(node_data);
-
-struct mdesc_mlgroup {
- u64 node;
- u64 latency;
- u64 match;
- u64 mask;
-};
-static struct mdesc_mlgroup *mlgroups;
-static int num_mlgroups;
static int scan_pio_for_cfg_handle(struct mdesc_handle *md, u64 pio,
u32 cfg_handle)
@@ -987,7 +1185,7 @@ int of_node_to_nid(struct device_node *dp)
md = mdesc_grab();
count = 0;
- nid = -1;
+ nid = NUMA_NO_NODE;
mdesc_for_each_node_by_name(md, grp, "group") {
if (!scan_arcs_for_cfg_handle(md, grp, cfg_handle)) {
nid = count;
@@ -1003,14 +1201,14 @@ int of_node_to_nid(struct device_node *dp)
static void __init add_node_ranges(void)
{
- struct memblock_region *reg;
+ phys_addr_t start, end;
+ unsigned long prev_max;
+ u64 i;
- for_each_memblock(memory, reg) {
- unsigned long size = reg->size;
- unsigned long start, end;
+memblock_resized:
+ prev_max = memblock.memory.max;
- start = reg->base;
- end = start + size;
+ for_each_mem_range(i, &start, &end) {
while (start < end) {
unsigned long this_end;
int nid;
@@ -1018,10 +1216,13 @@ static void __init add_node_ranges(void)
this_end = memblock_nid_range(start, end, &nid);
numadbg("Setting memblock NUMA node nid[%d] "
- "start[%lx] end[%lx]\n",
+ "start[%llx] end[%lx]\n",
nid, start, this_end);
- memblock_set_node(start, this_end - start, nid);
+ memblock_set_node(start, this_end - start,
+ &memblock.memory, nid);
+ if (memblock.memory.max != prev_max)
+ goto memblock_resized;
start = this_end;
}
}
@@ -1038,8 +1239,8 @@ static int __init grab_mlgroups(struct mdesc_handle *md)
if (!count)
return -ENOENT;
- paddr = memblock_alloc(count * sizeof(struct mdesc_mlgroup),
- SMP_CACHE_BYTES);
+ paddr = memblock_phys_alloc(count * sizeof(struct mdesc_mlgroup),
+ SMP_CACHE_BYTES);
if (!paddr)
return -ENOMEM;
@@ -1079,8 +1280,8 @@ static int __init grab_mblocks(struct mdesc_handle *md)
if (!count)
return -ENOENT;
- paddr = memblock_alloc(count * sizeof(struct mdesc_mblock),
- SMP_CACHE_BYTES);
+ paddr = memblock_phys_alloc(count * sizeof(struct mdesc_mblock),
+ SMP_CACHE_BYTES);
if (!paddr)
return -ENOMEM;
@@ -1146,6 +1347,49 @@ static struct mdesc_mlgroup * __init find_mlgroup(u64 node)
return NULL;
}
+int __node_distance(int from, int to)
+{
+ if ((from >= MAX_NUMNODES) || (to >= MAX_NUMNODES)) {
+ pr_warn("Returning default NUMA distance value for %d->%d\n",
+ from, to);
+ return (from == to) ? LOCAL_DISTANCE : REMOTE_DISTANCE;
+ }
+ return numa_latency[from][to];
+}
+EXPORT_SYMBOL(__node_distance);
+
+static int __init find_best_numa_node_for_mlgroup(struct mdesc_mlgroup *grp)
+{
+ int i;
+
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ struct node_mem_mask *n = &node_masks[i];
+
+ if ((grp->mask == n->mask) && (grp->match == n->match))
+ break;
+ }
+ return i;
+}
+
+static void __init find_numa_latencies_for_group(struct mdesc_handle *md,
+ u64 grp, int index)
+{
+ u64 arc;
+
+ mdesc_for_each_arc(arc, md, grp, MDESC_ARC_TYPE_FWD) {
+ int tnode;
+ u64 target = mdesc_arc_target(md, arc);
+ struct mdesc_mlgroup *m = find_mlgroup(target);
+
+ if (!m)
+ continue;
+ tnode = find_best_numa_node_for_mlgroup(m);
+ if (tnode == MAX_NUMNODES)
+ continue;
+ numa_latency[index][tnode] = m->latency;
+ }
+}
+
static int __init numa_attach_mlgroup(struct mdesc_handle *md, u64 grp,
int index)
{
@@ -1176,10 +1420,10 @@ static int __init numa_attach_mlgroup(struct mdesc_handle *md, u64 grp,
n = &node_masks[num_node_masks++];
n->mask = candidate->mask;
- n->val = candidate->match;
+ n->match = candidate->match;
- numadbg("NUMA NODE[%d]: mask[%lx] val[%lx] (latency[%llx])\n",
- index, n->mask, n->val, candidate->latency);
+ numadbg("NUMA NODE[%d]: mask[%lx] match[%lx] (latency[%llx])\n",
+ index, n->mask, n->match, candidate->latency);
return 0;
}
@@ -1209,7 +1453,7 @@ static int __init numa_parse_mdesc_group(struct mdesc_handle *md, u64 grp,
static int __init numa_parse_mdesc(void)
{
struct mdesc_handle *md = mdesc_grab();
- int i, err, count;
+ int i, j, err, count;
u64 node;
node = mdesc_node_by_name(md, MDESC_NODE_NULL, "latency-groups");
@@ -1234,6 +1478,23 @@ static int __init numa_parse_mdesc(void)
count++;
}
+ count = 0;
+ mdesc_for_each_node_by_name(md, node, "group") {
+ find_numa_latencies_for_group(md, node, count);
+ count++;
+ }
+
+ /* Normalize numa latency matrix according to ACPI SLIT spec. */
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ u64 self_latency = numa_latency[i][i];
+
+ for (j = 0; j < MAX_NUMNODES; j++) {
+ numa_latency[i][j] =
+ (numa_latency[i][j] * LOCAL_DISTANCE) /
+ self_latency;
+ }
+ }
+
add_node_ranges();
for (i = 0; i < num_node_masks; i++) {
@@ -1259,7 +1520,7 @@ static int __init numa_parse_jbus(void)
numa_cpu_lookup_table[cpu] = index;
cpumask_copy(&numa_cpumask_lookup_table[index], cpumask_of(cpu));
node_masks[index].mask = ~((1UL << 36UL) - 1UL);
- node_masks[index].val = cpu << 36UL;
+ node_masks[index].match = cpu << 36UL;
index++;
}
@@ -1290,10 +1551,18 @@ static int __init numa_parse_sun4u(void)
static int __init bootmem_init_numa(void)
{
+ int i, j;
int err = -1;
numadbg("bootmem_init_numa()\n");
+ /* Some sane defaults for numa latency values */
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ for (j = 0; j < MAX_NUMNODES; j++)
+ numa_latency[i][j] = (i == j) ?
+ LOCAL_DISTANCE : REMOTE_DISTANCE;
+ }
+
if (numa_enabled) {
if (tlb_type == hypervisor)
err = numa_parse_mdesc();
@@ -1325,7 +1594,7 @@ static void __init bootmem_init_nonnuma(void)
(top_of_ram - total_ram) >> 20);
init_node_masks_nonnuma();
- memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0);
+ memblock_set_node(0, PHYS_ADDR_MAX, &memblock.memory, 0);
allocate_node_data(0);
node_set_online(0);
}
@@ -1346,7 +1615,6 @@ static unsigned long __init bootmem_init(unsigned long phys_base)
/* XXX cpu notifier XXX */
- sparse_memory_present_with_active_regions(MAX_NUMNODES);
sparse_init();
return end_pfn;
@@ -1355,9 +1623,148 @@ static unsigned long __init bootmem_init(unsigned long phys_base)
static struct linux_prom64_registers pall[MAX_BANKS] __initdata;
static int pall_ents __initdata;
-#ifdef CONFIG_DEBUG_PAGEALLOC
+static unsigned long max_phys_bits = 40;
+
+bool kern_addr_valid(unsigned long addr)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ if ((long)addr < 0L) {
+ unsigned long pa = __pa(addr);
+
+ if ((pa >> max_phys_bits) != 0UL)
+ return false;
+
+ return pfn_valid(pa >> PAGE_SHIFT);
+ }
+
+ if (addr >= (unsigned long) KERNBASE &&
+ addr < (unsigned long)&_end)
+ return true;
+
+ pgd = pgd_offset_k(addr);
+ if (pgd_none(*pgd))
+ return false;
+
+ p4d = p4d_offset(pgd, addr);
+ if (p4d_none(*p4d))
+ return false;
+
+ pud = pud_offset(p4d, addr);
+ if (pud_none(*pud))
+ return false;
+
+ if (pud_leaf(*pud))
+ return pfn_valid(pud_pfn(*pud));
+
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none(*pmd))
+ return false;
+
+ if (pmd_leaf(*pmd))
+ return pfn_valid(pmd_pfn(*pmd));
+
+ pte = pte_offset_kernel(pmd, addr);
+ if (pte_none(*pte))
+ return false;
+
+ return pfn_valid(pte_pfn(*pte));
+}
+
+static unsigned long __ref kernel_map_hugepud(unsigned long vstart,
+ unsigned long vend,
+ pud_t *pud)
+{
+ const unsigned long mask16gb = (1UL << 34) - 1UL;
+ u64 pte_val = vstart;
+
+ /* Each PUD is 8GB */
+ if ((vstart & mask16gb) ||
+ (vend - vstart <= mask16gb)) {
+ pte_val ^= kern_linear_pte_xor[2];
+ pud_val(*pud) = pte_val | _PAGE_PUD_HUGE;
+
+ return vstart + PUD_SIZE;
+ }
+
+ pte_val ^= kern_linear_pte_xor[3];
+ pte_val |= _PAGE_PUD_HUGE;
+
+ vend = vstart + mask16gb + 1UL;
+ while (vstart < vend) {
+ pud_val(*pud) = pte_val;
+
+ pte_val += PUD_SIZE;
+ vstart += PUD_SIZE;
+ pud++;
+ }
+ return vstart;
+}
+
+static bool kernel_can_map_hugepud(unsigned long vstart, unsigned long vend,
+ bool guard)
+{
+ if (guard && !(vstart & ~PUD_MASK) && (vend - vstart) >= PUD_SIZE)
+ return true;
+
+ return false;
+}
+
+static unsigned long __ref kernel_map_hugepmd(unsigned long vstart,
+ unsigned long vend,
+ pmd_t *pmd)
+{
+ const unsigned long mask256mb = (1UL << 28) - 1UL;
+ const unsigned long mask2gb = (1UL << 31) - 1UL;
+ u64 pte_val = vstart;
+
+ /* Each PMD is 8MB */
+ if ((vstart & mask256mb) ||
+ (vend - vstart <= mask256mb)) {
+ pte_val ^= kern_linear_pte_xor[0];
+ pmd_val(*pmd) = pte_val | _PAGE_PMD_HUGE;
+
+ return vstart + PMD_SIZE;
+ }
+
+ if ((vstart & mask2gb) ||
+ (vend - vstart <= mask2gb)) {
+ pte_val ^= kern_linear_pte_xor[1];
+ pte_val |= _PAGE_PMD_HUGE;
+ vend = vstart + mask256mb + 1UL;
+ } else {
+ pte_val ^= kern_linear_pte_xor[2];
+ pte_val |= _PAGE_PMD_HUGE;
+ vend = vstart + mask2gb + 1UL;
+ }
+
+ while (vstart < vend) {
+ pmd_val(*pmd) = pte_val;
+
+ pte_val += PMD_SIZE;
+ vstart += PMD_SIZE;
+ pmd++;
+ }
+
+ return vstart;
+}
+
+static bool kernel_can_map_hugepmd(unsigned long vstart, unsigned long vend,
+ bool guard)
+{
+ if (guard && !(vstart & ~PMD_MASK) && (vend - vstart) >= PMD_SIZE)
+ return true;
+
+ return false;
+}
+
static unsigned long __ref kernel_map_range(unsigned long pstart,
- unsigned long pend, pgprot_t prot)
+ unsigned long pend, pgprot_t prot,
+ bool use_huge)
{
unsigned long vstart = PAGE_OFFSET + pstart;
unsigned long vend = PAGE_OFFSET + pend;
@@ -1372,24 +1779,62 @@ static unsigned long __ref kernel_map_range(unsigned long pstart,
while (vstart < vend) {
unsigned long this_end, paddr = __pa(vstart);
pgd_t *pgd = pgd_offset_k(vstart);
+ p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
- pud = pud_offset(pgd, vstart);
+ if (pgd_none(*pgd)) {
+ pud_t *new;
+
+ new = memblock_alloc_from(PAGE_SIZE, PAGE_SIZE,
+ PAGE_SIZE);
+ if (!new)
+ goto err_alloc;
+ alloc_bytes += PAGE_SIZE;
+ pgd_populate(&init_mm, pgd, new);
+ }
+
+ p4d = p4d_offset(pgd, vstart);
+ if (p4d_none(*p4d)) {
+ pud_t *new;
+
+ new = memblock_alloc_from(PAGE_SIZE, PAGE_SIZE,
+ PAGE_SIZE);
+ if (!new)
+ goto err_alloc;
+ alloc_bytes += PAGE_SIZE;
+ p4d_populate(&init_mm, p4d, new);
+ }
+
+ pud = pud_offset(p4d, vstart);
if (pud_none(*pud)) {
pmd_t *new;
- new = __alloc_bootmem(PAGE_SIZE, PAGE_SIZE, PAGE_SIZE);
+ if (kernel_can_map_hugepud(vstart, vend, use_huge)) {
+ vstart = kernel_map_hugepud(vstart, vend, pud);
+ continue;
+ }
+ new = memblock_alloc_from(PAGE_SIZE, PAGE_SIZE,
+ PAGE_SIZE);
+ if (!new)
+ goto err_alloc;
alloc_bytes += PAGE_SIZE;
pud_populate(&init_mm, pud, new);
}
pmd = pmd_offset(pud, vstart);
- if (!pmd_present(*pmd)) {
+ if (pmd_none(*pmd)) {
pte_t *new;
- new = __alloc_bootmem(PAGE_SIZE, PAGE_SIZE, PAGE_SIZE);
+ if (kernel_can_map_hugepmd(vstart, vend, use_huge)) {
+ vstart = kernel_map_hugepmd(vstart, vend, pmd);
+ continue;
+ }
+ new = memblock_alloc_from(PAGE_SIZE, PAGE_SIZE,
+ PAGE_SIZE);
+ if (!new)
+ goto err_alloc;
alloc_bytes += PAGE_SIZE;
pmd_populate_kernel(&init_mm, pmd, new);
}
@@ -1409,102 +1854,41 @@ static unsigned long __ref kernel_map_range(unsigned long pstart,
}
return alloc_bytes;
-}
-
-extern unsigned int kvmap_linear_patch[1];
-#endif /* CONFIG_DEBUG_PAGEALLOC */
-
-static void __init kpte_set_val(unsigned long index, unsigned long val)
-{
- unsigned long *ptr = kpte_linear_bitmap;
- val <<= ((index % (BITS_PER_LONG / 2)) * 2);
- ptr += (index / (BITS_PER_LONG / 2));
-
- *ptr |= val;
+err_alloc:
+ panic("%s: Failed to allocate %lu bytes align=%lx from=%lx\n",
+ __func__, PAGE_SIZE, PAGE_SIZE, PAGE_SIZE);
+ return -ENOMEM;
}
-static const unsigned long kpte_shift_min = 28; /* 256MB */
-static const unsigned long kpte_shift_max = 34; /* 16GB */
-static const unsigned long kpte_shift_incr = 3;
-
-static unsigned long kpte_mark_using_shift(unsigned long start, unsigned long end,
- unsigned long shift)
+static void __init flush_all_kernel_tsbs(void)
{
- unsigned long size = (1UL << shift);
- unsigned long mask = (size - 1UL);
- unsigned long remains = end - start;
- unsigned long val;
-
- if (remains < size || (start & mask))
- return start;
-
- /* VAL maps:
- *
- * shift 28 --> kern_linear_pte_xor index 1
- * shift 31 --> kern_linear_pte_xor index 2
- * shift 34 --> kern_linear_pte_xor index 3
- */
- val = ((shift - kpte_shift_min) / kpte_shift_incr) + 1;
-
- remains &= ~mask;
- if (shift != kpte_shift_max)
- remains = size;
-
- while (remains) {
- unsigned long index = start >> kpte_shift_min;
+ int i;
- kpte_set_val(index, val);
+ for (i = 0; i < KERNEL_TSB_NENTRIES; i++) {
+ struct tsb *ent = &swapper_tsb[i];
- start += 1UL << kpte_shift_min;
- remains -= 1UL << kpte_shift_min;
+ ent->tag = (1UL << TSB_TAG_INVALID_BIT);
}
+#ifndef CONFIG_DEBUG_PAGEALLOC
+ for (i = 0; i < KERNEL_TSB4M_NENTRIES; i++) {
+ struct tsb *ent = &swapper_4m_tsb[i];
- return start;
-}
-
-static void __init mark_kpte_bitmap(unsigned long start, unsigned long end)
-{
- unsigned long smallest_size, smallest_mask;
- unsigned long s;
-
- smallest_size = (1UL << kpte_shift_min);
- smallest_mask = (smallest_size - 1UL);
-
- while (start < end) {
- unsigned long orig_start = start;
-
- for (s = kpte_shift_max; s >= kpte_shift_min; s -= kpte_shift_incr) {
- start = kpte_mark_using_shift(start, end, s);
-
- if (start != orig_start)
- break;
- }
-
- if (start == orig_start)
- start = (start + smallest_size) & ~smallest_mask;
+ ent->tag = (1UL << TSB_TAG_INVALID_BIT);
}
+#endif
}
-static void __init init_kpte_bitmap(void)
-{
- unsigned long i;
-
- for (i = 0; i < pall_ents; i++) {
- unsigned long phys_start, phys_end;
-
- phys_start = pall[i].phys_addr;
- phys_end = phys_start + pall[i].reg_size;
-
- mark_kpte_bitmap(phys_start, phys_end);
- }
-}
+extern unsigned int kvmap_linear_patch[1];
static void __init kernel_physical_mapping_init(void)
{
-#ifdef CONFIG_DEBUG_PAGEALLOC
unsigned long i, mem_alloced = 0UL;
+ bool use_huge = true;
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ use_huge = false;
+#endif
for (i = 0; i < pall_ents; i++) {
unsigned long phys_start, phys_end;
@@ -1512,7 +1896,7 @@ static void __init kernel_physical_mapping_init(void)
phys_end = phys_start + pall[i].reg_size;
mem_alloced += kernel_map_range(phys_start, phys_end,
- PAGE_KERNEL);
+ PAGE_KERNEL, use_huge);
}
printk("Allocated %ld bytes for kernel page tables.\n",
@@ -1521,18 +1905,19 @@ static void __init kernel_physical_mapping_init(void)
kvmap_linear_patch[0] = 0x01000000; /* nop */
flushi(&kvmap_linear_patch[0]);
+ flush_all_kernel_tsbs();
+
__flush_tlb_all();
-#endif
}
#ifdef CONFIG_DEBUG_PAGEALLOC
-void kernel_map_pages(struct page *page, int numpages, int enable)
+void __kernel_map_pages(struct page *page, int numpages, int enable)
{
unsigned long phys_start = page_to_pfn(page) << PAGE_SHIFT;
unsigned long phys_end = phys_start + (numpages * PAGE_SIZE);
kernel_map_range(phys_start, phys_end,
- (enable ? PAGE_KERNEL : __pgprot(0)));
+ (enable ? PAGE_KERNEL : __pgprot(0)), false);
flush_tsb_kernel_range(PAGE_OFFSET + phys_start,
PAGE_OFFSET + phys_end);
@@ -1557,6 +1942,91 @@ unsigned long __init find_ecache_flush_span(unsigned long size)
return ~0UL;
}
+unsigned long PAGE_OFFSET;
+EXPORT_SYMBOL(PAGE_OFFSET);
+
+unsigned long VMALLOC_END = 0x0000010000000000UL;
+EXPORT_SYMBOL(VMALLOC_END);
+
+unsigned long sparc64_va_hole_top = 0xfffff80000000000UL;
+unsigned long sparc64_va_hole_bottom = 0x0000080000000000UL;
+
+static void __init setup_page_offset(void)
+{
+ if (tlb_type == cheetah || tlb_type == cheetah_plus) {
+ /* Cheetah/Panther support a full 64-bit virtual
+ * address, so we can use all that our page tables
+ * support.
+ */
+ sparc64_va_hole_top = 0xfff0000000000000UL;
+ sparc64_va_hole_bottom = 0x0010000000000000UL;
+
+ max_phys_bits = 42;
+ } else if (tlb_type == hypervisor) {
+ switch (sun4v_chip_type) {
+ case SUN4V_CHIP_NIAGARA1:
+ case SUN4V_CHIP_NIAGARA2:
+ /* T1 and T2 support 48-bit virtual addresses. */
+ sparc64_va_hole_top = 0xffff800000000000UL;
+ sparc64_va_hole_bottom = 0x0000800000000000UL;
+
+ max_phys_bits = 39;
+ break;
+ case SUN4V_CHIP_NIAGARA3:
+ /* T3 supports 48-bit virtual addresses. */
+ sparc64_va_hole_top = 0xffff800000000000UL;
+ sparc64_va_hole_bottom = 0x0000800000000000UL;
+
+ max_phys_bits = 43;
+ break;
+ case SUN4V_CHIP_NIAGARA4:
+ case SUN4V_CHIP_NIAGARA5:
+ case SUN4V_CHIP_SPARC64X:
+ case SUN4V_CHIP_SPARC_M6:
+ /* T4 and later support 52-bit virtual addresses. */
+ sparc64_va_hole_top = 0xfff8000000000000UL;
+ sparc64_va_hole_bottom = 0x0008000000000000UL;
+ max_phys_bits = 47;
+ break;
+ case SUN4V_CHIP_SPARC_M7:
+ case SUN4V_CHIP_SPARC_SN:
+ /* M7 and later support 52-bit virtual addresses. */
+ sparc64_va_hole_top = 0xfff8000000000000UL;
+ sparc64_va_hole_bottom = 0x0008000000000000UL;
+ max_phys_bits = 49;
+ break;
+ case SUN4V_CHIP_SPARC_M8:
+ default:
+ /* M8 and later support 54-bit virtual addresses.
+ * However, restricting M8 and above VA bits to 53
+ * as 4-level page table cannot support more than
+ * 53 VA bits.
+ */
+ sparc64_va_hole_top = 0xfff0000000000000UL;
+ sparc64_va_hole_bottom = 0x0010000000000000UL;
+ max_phys_bits = 51;
+ break;
+ }
+ }
+
+ if (max_phys_bits > MAX_PHYS_ADDRESS_BITS) {
+ prom_printf("MAX_PHYS_ADDRESS_BITS is too small, need %lu\n",
+ max_phys_bits);
+ prom_halt();
+ }
+
+ PAGE_OFFSET = sparc64_va_hole_top;
+ VMALLOC_END = ((sparc64_va_hole_bottom >> 1) +
+ (sparc64_va_hole_bottom >> 2));
+
+ pr_info("MM: PAGE_OFFSET is 0x%016lx (max_phys_bits == %lu)\n",
+ PAGE_OFFSET, max_phys_bits);
+ pr_info("MM: VMALLOC [0x%016lx --> 0x%016lx]\n",
+ VMALLOC_START, VMALLOC_END);
+ pr_info("MM: VMEMMAP [0x%016lx --> 0x%016lx]\n",
+ VMEMMAP_BASE, VMEMMAP_BASE << 1);
+}
+
static void __init tsb_phys_patch(void)
{
struct tsb_ldquad_phys_patch_entry *pquad;
@@ -1599,21 +2069,42 @@ static void __init tsb_phys_patch(void)
#define NUM_KTSB_DESCR 1
#endif
static struct hv_tsb_descr ktsb_descr[NUM_KTSB_DESCR];
-extern struct tsb swapper_tsb[KERNEL_TSB_NENTRIES];
+
+/* The swapper TSBs are loaded with a base sequence of:
+ *
+ * sethi %uhi(SYMBOL), REG1
+ * sethi %hi(SYMBOL), REG2
+ * or REG1, %ulo(SYMBOL), REG1
+ * or REG2, %lo(SYMBOL), REG2
+ * sllx REG1, 32, REG1
+ * or REG1, REG2, REG1
+ *
+ * When we use physical addressing for the TSB accesses, we patch the
+ * first four instructions in the above sequence.
+ */
static void patch_one_ktsb_phys(unsigned int *start, unsigned int *end, unsigned long pa)
{
- pa >>= KTSB_PHYS_SHIFT;
+ unsigned long high_bits, low_bits;
+
+ high_bits = (pa >> 32) & 0xffffffff;
+ low_bits = (pa >> 0) & 0xffffffff;
while (start < end) {
unsigned int *ia = (unsigned int *)(unsigned long)*start;
- ia[0] = (ia[0] & ~0x3fffff) | (pa >> 10);
+ ia[0] = (ia[0] & ~0x3fffff) | (high_bits >> 10);
__asm__ __volatile__("flush %0" : : "r" (ia));
- ia[1] = (ia[1] & ~0x3ff) | (pa & 0x3ff);
+ ia[1] = (ia[1] & ~0x3fffff) | (low_bits >> 10);
__asm__ __volatile__("flush %0" : : "r" (ia + 1));
+ ia[2] = (ia[2] & ~0x1fff) | (high_bits & 0x3ff);
+ __asm__ __volatile__("flush %0" : : "r" (ia + 2));
+
+ ia[3] = (ia[3] & ~0x1fff) | (low_bits & 0x3ff);
+ __asm__ __volatile__("flush %0" : : "r" (ia + 3));
+
start++;
}
}
@@ -1694,7 +2185,7 @@ static void __init sun4v_ktsb_init(void)
#endif
}
-void __cpuinit sun4v_ktsb_register(void)
+void sun4v_ktsb_register(void)
{
unsigned long pa, ret;
@@ -1719,11 +2210,26 @@ static void __init sun4u_linear_pte_xor_finalize(void)
static void __init sun4v_linear_pte_xor_finalize(void)
{
+ unsigned long pagecv_flag;
+
+ /* Bit 9 of TTE is no longer CV bit on M7 processor and it instead
+ * enables MCD error. Do not set bit 9 on M7 processor.
+ */
+ switch (sun4v_chip_type) {
+ case SUN4V_CHIP_SPARC_M7:
+ case SUN4V_CHIP_SPARC_M8:
+ case SUN4V_CHIP_SPARC_SN:
+ pagecv_flag = 0x00;
+ break;
+ default:
+ pagecv_flag = _PAGE_CV_4V;
+ break;
+ }
#ifndef CONFIG_DEBUG_PAGEALLOC
if (cpu_pgsz_mask & HV_PGSZ_MASK_256MB) {
kern_linear_pte_xor[1] = (_PAGE_VALID | _PAGE_SZ256MB_4V) ^
- 0xfffff80000000000UL;
- kern_linear_pte_xor[1] |= (_PAGE_CP_4V | _PAGE_CV_4V |
+ PAGE_OFFSET;
+ kern_linear_pte_xor[1] |= (_PAGE_CP_4V | pagecv_flag |
_PAGE_P_4V | _PAGE_W_4V);
} else {
kern_linear_pte_xor[1] = kern_linear_pte_xor[0];
@@ -1731,8 +2237,8 @@ static void __init sun4v_linear_pte_xor_finalize(void)
if (cpu_pgsz_mask & HV_PGSZ_MASK_2GB) {
kern_linear_pte_xor[2] = (_PAGE_VALID | _PAGE_SZ2GB_4V) ^
- 0xfffff80000000000UL;
- kern_linear_pte_xor[2] |= (_PAGE_CP_4V | _PAGE_CV_4V |
+ PAGE_OFFSET;
+ kern_linear_pte_xor[2] |= (_PAGE_CP_4V | pagecv_flag |
_PAGE_P_4V | _PAGE_W_4V);
} else {
kern_linear_pte_xor[2] = kern_linear_pte_xor[1];
@@ -1740,8 +2246,8 @@ static void __init sun4v_linear_pte_xor_finalize(void)
if (cpu_pgsz_mask & HV_PGSZ_MASK_16GB) {
kern_linear_pte_xor[3] = (_PAGE_VALID | _PAGE_SZ16GB_4V) ^
- 0xfffff80000000000UL;
- kern_linear_pte_xor[3] |= (_PAGE_CP_4V | _PAGE_CV_4V |
+ PAGE_OFFSET;
+ kern_linear_pte_xor[3] |= (_PAGE_CP_4V | pagecv_flag |
_PAGE_P_4V | _PAGE_W_4V);
} else {
kern_linear_pte_xor[3] = kern_linear_pte_xor[2];
@@ -1752,22 +2258,39 @@ static void __init sun4v_linear_pte_xor_finalize(void)
/* paging_init() sets up the page tables */
static unsigned long last_valid_pfn;
-pgd_t swapper_pg_dir[2048];
static void sun4u_pgprot_init(void);
static void sun4v_pgprot_init(void);
+#define _PAGE_CACHE_4U (_PAGE_CP_4U | _PAGE_CV_4U)
+#define _PAGE_CACHE_4V (_PAGE_CP_4V | _PAGE_CV_4V)
+#define __DIRTY_BITS_4U (_PAGE_MODIFIED_4U | _PAGE_WRITE_4U | _PAGE_W_4U)
+#define __DIRTY_BITS_4V (_PAGE_MODIFIED_4V | _PAGE_WRITE_4V | _PAGE_W_4V)
+#define __ACCESS_BITS_4U (_PAGE_ACCESSED_4U | _PAGE_READ_4U | _PAGE_R)
+#define __ACCESS_BITS_4V (_PAGE_ACCESSED_4V | _PAGE_READ_4V | _PAGE_R)
+
+/* We need to exclude reserved regions. This exclusion will include
+ * vmlinux and initrd. To be more precise the initrd size could be used to
+ * compute a new lower limit because it is freed later during initialization.
+ */
+static void __init reduce_memory(phys_addr_t limit_ram)
+{
+ limit_ram += memblock_reserved_size();
+ memblock_enforce_memory_limit(limit_ram);
+}
+
void __init paging_init(void)
{
unsigned long end_pfn, shift, phys_base;
unsigned long real_end, i;
- int node;
+
+ setup_page_offset();
/* These build time checkes make sure that the dcache_dirty_cpu()
- * page->flags usage will work.
+ * folio->flags usage will work.
*
* When a page gets marked as dcache-dirty, we store the
- * cpu number starting at bit 32 in the page->flags. Also,
+ * cpu number starting at bit 32 in the folio->flags. Also,
* functions like clear_dcache_dirty_cpu use the cpu mask
* in 13-bit signed-immediate instruction fields.
*/
@@ -1788,7 +2311,7 @@ void __init paging_init(void)
BUILD_BUG_ON(NR_CPUS > 4096);
- kern_base = (prom_boot_mapping_phys_low >> 22UL) << 22UL;
+ kern_base = (prom_boot_mapping_phys_low >> ILOG2_4MB) << ILOG2_4MB;
kern_size = (unsigned long)&_end - (unsigned long)KERNBASE;
/* Invalidate both kernel TSBs. */
@@ -1797,6 +2320,27 @@ void __init paging_init(void)
memset(swapper_4m_tsb, 0x40, sizeof(swapper_4m_tsb));
#endif
+ /* TTE.cv bit on sparc v9 occupies the same position as TTE.mcde
+ * bit on M7 processor. This is a conflicting usage of the same
+ * bit. Enabling TTE.cv on M7 would turn on Memory Corruption
+ * Detection error on all pages and this will lead to problems
+ * later. Kernel does not run with MCD enabled and hence rest
+ * of the required steps to fully configure memory corruption
+ * detection are not taken. We need to ensure TTE.mcde is not
+ * set on M7 processor. Compute the value of cacheability
+ * flag for use later taking this into consideration.
+ */
+ switch (sun4v_chip_type) {
+ case SUN4V_CHIP_SPARC_M7:
+ case SUN4V_CHIP_SPARC_M8:
+ case SUN4V_CHIP_SPARC_SN:
+ page_cache4v_flag = _PAGE_CP_4V;
+ break;
+ default:
+ page_cache4v_flag = _PAGE_CACHE_4V;
+ break;
+ }
+
if (tlb_type == hypervisor)
sun4v_pgprot_init();
else
@@ -1834,7 +2378,8 @@ void __init paging_init(void)
find_ramdisk(phys_base);
- memblock_enforce_memory_limit(cmdline_memory_size);
+ if (cmdline_memory_size)
+ reduce_memory(cmdline_memory_size);
memblock_allow_resize();
memblock_dump_all();
@@ -1844,7 +2389,7 @@ void __init paging_init(void)
shift = kern_base + PAGE_OFFSET - ((unsigned long)KERNBASE);
real_end = (unsigned long)_end;
- num_kernel_image_mappings = DIV_ROUND_UP(real_end - KERNBASE, 1 << 22);
+ num_kernel_image_mappings = DIV_ROUND_UP(real_end - KERNBASE, 1 << ILOG2_4MB);
printk("Kernel: Using %d locked TLB entries for main kernel image.\n",
num_kernel_image_mappings);
@@ -1853,16 +2398,10 @@ void __init paging_init(void)
*/
init_mm.pgd += ((shift) / (sizeof(pgd_t)));
- memset(swapper_low_pmd_dir, 0, sizeof(swapper_low_pmd_dir));
+ memset(swapper_pg_dir, 0, sizeof(swapper_pg_dir));
- /* Now can init the kernel/bad page tables. */
- pud_set(pud_offset(&swapper_pg_dir[0], 0),
- swapper_low_pmd_dir + (shift / sizeof(pgd_t)));
-
inherit_prom_mappings();
- init_kpte_bitmap();
-
/* Ok, we can use our TLB miss and window trap handlers safely. */
setup_tba();
@@ -1913,21 +2452,6 @@ void __init paging_init(void)
/* Setup bootmem... */
last_valid_pfn = end_pfn = bootmem_init(phys_base);
- /* Once the OF device tree and MDESC have been setup, we know
- * the list of possible cpus. Therefore we can allocate the
- * IRQ stacks.
- */
- for_each_possible_cpu(i) {
- node = cpu_to_node(i);
-
- softirq_stack[i] = __alloc_bootmem_node(NODE_DATA(node),
- THREAD_SIZE,
- THREAD_SIZE, 0);
- hardirq_stack[i] = __alloc_bootmem_node(NODE_DATA(node),
- THREAD_SIZE,
- THREAD_SIZE, 0);
- }
-
kernel_physical_mapping_init();
{
@@ -1937,7 +2461,7 @@ void __init paging_init(void)
max_zone_pfns[ZONE_NORMAL] = end_pfn;
- free_area_init_nodes(max_zone_pfns);
+ free_area_init(max_zone_pfns);
}
printk("Booting Linux...\n");
@@ -1969,73 +2493,9 @@ int page_in_phys_avail(unsigned long paddr)
return 0;
}
-static struct linux_prom64_registers pavail_rescan[MAX_BANKS] __initdata;
-static int pavail_rescan_ents __initdata;
-
-/* Certain OBP calls, such as fetching "available" properties, can
- * claim physical memory. So, along with initializing the valid
- * address bitmap, what we do here is refetch the physical available
- * memory list again, and make sure it provides at least as much
- * memory as 'pavail' does.
- */
-static void __init setup_valid_addr_bitmap_from_pavail(unsigned long *bitmap)
-{
- int i;
-
- read_obp_memory("available", &pavail_rescan[0], &pavail_rescan_ents);
-
- for (i = 0; i < pavail_ents; i++) {
- unsigned long old_start, old_end;
-
- old_start = pavail[i].phys_addr;
- old_end = old_start + pavail[i].reg_size;
- while (old_start < old_end) {
- int n;
-
- for (n = 0; n < pavail_rescan_ents; n++) {
- unsigned long new_start, new_end;
-
- new_start = pavail_rescan[n].phys_addr;
- new_end = new_start +
- pavail_rescan[n].reg_size;
-
- if (new_start <= old_start &&
- new_end >= (old_start + PAGE_SIZE)) {
- set_bit(old_start >> 22, bitmap);
- goto do_next_page;
- }
- }
-
- prom_printf("mem_init: Lost memory in pavail\n");
- prom_printf("mem_init: OLD start[%lx] size[%lx]\n",
- pavail[i].phys_addr,
- pavail[i].reg_size);
- prom_printf("mem_init: NEW start[%lx] size[%lx]\n",
- pavail_rescan[i].phys_addr,
- pavail_rescan[i].reg_size);
- prom_printf("mem_init: Cannot continue, aborting.\n");
- prom_halt();
-
- do_next_page:
- old_start += PAGE_SIZE;
- }
- }
-}
-
-static void __init patch_tlb_miss_handler_bitmap(void)
-{
- extern unsigned int valid_addr_bitmap_insn[];
- extern unsigned int valid_addr_bitmap_patch[];
-
- valid_addr_bitmap_insn[1] = valid_addr_bitmap_patch[1];
- mb();
- valid_addr_bitmap_insn[0] = valid_addr_bitmap_patch[0];
- flushi(&valid_addr_bitmap_insn[0]);
-}
-
static void __init register_page_bootmem_info(void)
{
-#ifdef CONFIG_NEED_MULTIPLE_NODES
+#ifdef CONFIG_NUMA
int i;
for_each_online_node(i)
@@ -2045,22 +2505,13 @@ static void __init register_page_bootmem_info(void)
}
void __init mem_init(void)
{
- unsigned long addr, last;
-
- addr = PAGE_OFFSET + kern_base;
- last = PAGE_ALIGN(kern_size) + addr;
- while (addr < last) {
- set_bit(__pa(addr) >> 22, sparc64_valid_addr_bitmap);
- addr += PAGE_SIZE;
- }
-
- setup_valid_addr_bitmap_from_pavail(sparc64_valid_addr_bitmap);
- patch_tlb_miss_handler_bitmap();
-
- high_memory = __va(last_valid_pfn << PAGE_SHIFT);
-
+ /*
+ * Must be done after boot memory is put on freelist, because here we
+ * might set fields in deferred struct pages that have not yet been
+ * initialized, and memblock_free_all() initializes all the reserved
+ * deferred pages for us.
+ */
register_page_bootmem_info();
- free_all_bootmem();
/*
* Set up the zero page, mark it reserved, so that page count
@@ -2073,7 +2524,6 @@ void __init mem_init(void)
}
mark_page_reserved(mem_map_zero);
- mem_init_print_info(NULL);
if (tlb_type == cheetah || tlb_type == cheetah_plus)
cheetah_ecache_flush_init();
@@ -2110,21 +2560,6 @@ void free_initmem(void)
}
}
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
- free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM,
- "initrd");
-}
-#endif
-
-#define _PAGE_CACHE_4U (_PAGE_CP_4U | _PAGE_CV_4U)
-#define _PAGE_CACHE_4V (_PAGE_CP_4V | _PAGE_CV_4V)
-#define __DIRTY_BITS_4U (_PAGE_MODIFIED_4U | _PAGE_WRITE_4U | _PAGE_W_4U)
-#define __DIRTY_BITS_4V (_PAGE_MODIFIED_4V | _PAGE_WRITE_4V | _PAGE_W_4V)
-#define __ACCESS_BITS_4U (_PAGE_ACCESSED_4U | _PAGE_READ_4U | _PAGE_R)
-#define __ACCESS_BITS_4V (_PAGE_ACCESSED_4V | _PAGE_READ_4V | _PAGE_R)
-
pgprot_t PAGE_KERNEL __read_mostly;
EXPORT_SYMBOL(PAGE_KERNEL);
@@ -2146,18 +2581,9 @@ unsigned long _PAGE_CACHE __read_mostly;
EXPORT_SYMBOL(_PAGE_CACHE);
#ifdef CONFIG_SPARSEMEM_VMEMMAP
-unsigned long vmemmap_table[VMEMMAP_SIZE];
-
-static long __meminitdata addr_start, addr_end;
-static int __meminitdata node_start;
-
int __meminit vmemmap_populate(unsigned long vstart, unsigned long vend,
- int node)
+ int node, struct vmem_altmap *altmap)
{
- unsigned long phys_start = (vstart - VMEMMAP_BASE);
- unsigned long phys_end = (vend - VMEMMAP_BASE);
- unsigned long addr = phys_start & VMEMMAP_CHUNK_MASK;
- unsigned long end = VMEMMAP_ALIGN(phys_end);
unsigned long pte_base;
pte_base = (_PAGE_VALID | _PAGE_SZ4MB_4U |
@@ -2165,52 +2591,49 @@ int __meminit vmemmap_populate(unsigned long vstart, unsigned long vend,
_PAGE_P_4U | _PAGE_W_4U);
if (tlb_type == hypervisor)
pte_base = (_PAGE_VALID | _PAGE_SZ4MB_4V |
- _PAGE_CP_4V | _PAGE_CV_4V |
- _PAGE_P_4V | _PAGE_W_4V);
+ page_cache4v_flag | _PAGE_P_4V | _PAGE_W_4V);
+
+ pte_base |= _PAGE_PMD_HUGE;
+
+ vstart = vstart & PMD_MASK;
+ vend = ALIGN(vend, PMD_SIZE);
+ for (; vstart < vend; vstart += PMD_SIZE) {
+ pgd_t *pgd = vmemmap_pgd_populate(vstart, node);
+ unsigned long pte;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
- for (; addr < end; addr += VMEMMAP_CHUNK) {
- unsigned long *vmem_pp =
- vmemmap_table + (addr >> VMEMMAP_CHUNK_SHIFT);
- void *block;
+ if (!pgd)
+ return -ENOMEM;
+
+ p4d = vmemmap_p4d_populate(pgd, vstart, node);
+ if (!p4d)
+ return -ENOMEM;
+
+ pud = vmemmap_pud_populate(p4d, vstart, node);
+ if (!pud)
+ return -ENOMEM;
+
+ pmd = pmd_offset(pud, vstart);
+ pte = pmd_val(*pmd);
+ if (!(pte & _PAGE_VALID)) {
+ void *block = vmemmap_alloc_block(PMD_SIZE, node);
- if (!(*vmem_pp & _PAGE_VALID)) {
- block = vmemmap_alloc_block(1UL << 22, node);
if (!block)
return -ENOMEM;
- *vmem_pp = pte_base | __pa(block);
-
- /* check to see if we have contiguous blocks */
- if (addr_end != addr || node_start != node) {
- if (addr_start)
- printk(KERN_DEBUG " [%lx-%lx] on node %d\n",
- addr_start, addr_end-1, node_start);
- addr_start = addr;
- node_start = node;
- }
- addr_end = addr + VMEMMAP_CHUNK;
+ pmd_val(*pmd) = pte_base | __pa(block);
}
}
- return 0;
-}
-void __meminit vmemmap_populate_print_last(void)
-{
- if (addr_start) {
- printk(KERN_DEBUG " [%lx-%lx] on node %d\n",
- addr_start, addr_end-1, node_start);
- addr_start = 0;
- addr_end = 0;
- node_start = 0;
- }
-}
-
-void vmemmap_free(unsigned long start, unsigned long end)
-{
+ return 0;
}
-
#endif /* CONFIG_SPARSEMEM_VMEMMAP */
+/* These are actually filled in at boot time by sun4{u,v}_pgprot_init() */
+static pgprot_t protection_map[16] __ro_after_init;
+
static void prot_init_common(unsigned long page_none,
unsigned long page_shared,
unsigned long page_copy,
@@ -2261,10 +2684,10 @@ static void __init sun4u_pgprot_init(void)
__ACCESS_BITS_4U | _PAGE_E_4U);
#ifdef CONFIG_DEBUG_PAGEALLOC
- kern_linear_pte_xor[0] = _PAGE_VALID ^ 0xfffff80000000000UL;
+ kern_linear_pte_xor[0] = _PAGE_VALID ^ PAGE_OFFSET;
#else
kern_linear_pte_xor[0] = (_PAGE_VALID | _PAGE_SZ4MB_4U) ^
- 0xfffff80000000000UL;
+ PAGE_OFFSET;
#endif
kern_linear_pte_xor[0] |= (_PAGE_CP_4U | _PAGE_CV_4U |
_PAGE_P_4U | _PAGE_W_4U);
@@ -2298,23 +2721,23 @@ static void __init sun4v_pgprot_init(void)
int i;
PAGE_KERNEL = __pgprot (_PAGE_PRESENT_4V | _PAGE_VALID |
- _PAGE_CACHE_4V | _PAGE_P_4V |
+ page_cache4v_flag | _PAGE_P_4V |
__ACCESS_BITS_4V | __DIRTY_BITS_4V |
_PAGE_EXEC_4V);
PAGE_KERNEL_LOCKED = PAGE_KERNEL;
_PAGE_IE = _PAGE_IE_4V;
_PAGE_E = _PAGE_E_4V;
- _PAGE_CACHE = _PAGE_CACHE_4V;
+ _PAGE_CACHE = page_cache4v_flag;
#ifdef CONFIG_DEBUG_PAGEALLOC
- kern_linear_pte_xor[0] = _PAGE_VALID ^ 0xfffff80000000000UL;
+ kern_linear_pte_xor[0] = _PAGE_VALID ^ PAGE_OFFSET;
#else
kern_linear_pte_xor[0] = (_PAGE_VALID | _PAGE_SZ4MB_4V) ^
- 0xfffff80000000000UL;
+ PAGE_OFFSET;
#endif
- kern_linear_pte_xor[0] |= (_PAGE_CP_4V | _PAGE_CV_4V |
- _PAGE_P_4V | _PAGE_W_4V);
+ kern_linear_pte_xor[0] |= (page_cache4v_flag | _PAGE_P_4V |
+ _PAGE_W_4V);
for (i = 1; i < 4; i++)
kern_linear_pte_xor[i] = kern_linear_pte_xor[0];
@@ -2327,12 +2750,12 @@ static void __init sun4v_pgprot_init(void)
_PAGE_SZ4MB_4V | _PAGE_SZ512K_4V |
_PAGE_SZ64K_4V | _PAGE_SZ8K_4V);
- page_none = _PAGE_PRESENT_4V | _PAGE_ACCESSED_4V | _PAGE_CACHE_4V;
- page_shared = (_PAGE_VALID | _PAGE_PRESENT_4V | _PAGE_CACHE_4V |
+ page_none = _PAGE_PRESENT_4V | _PAGE_ACCESSED_4V | page_cache4v_flag;
+ page_shared = (_PAGE_VALID | _PAGE_PRESENT_4V | page_cache4v_flag |
__ACCESS_BITS_4V | _PAGE_WRITE_4V | _PAGE_EXEC_4V);
- page_copy = (_PAGE_VALID | _PAGE_PRESENT_4V | _PAGE_CACHE_4V |
+ page_copy = (_PAGE_VALID | _PAGE_PRESENT_4V | page_cache4v_flag |
__ACCESS_BITS_4V | _PAGE_EXEC_4V);
- page_readonly = (_PAGE_VALID | _PAGE_PRESENT_4V | _PAGE_CACHE_4V |
+ page_readonly = (_PAGE_VALID | _PAGE_PRESENT_4V | page_cache4v_flag |
__ACCESS_BITS_4V | _PAGE_EXEC_4V);
page_exec_bit = _PAGE_EXEC_4V;
@@ -2390,7 +2813,7 @@ static unsigned long kern_large_tte(unsigned long paddr)
_PAGE_EXEC_4U | _PAGE_L_4U | _PAGE_W_4U);
if (tlb_type == hypervisor)
val = (_PAGE_VALID | _PAGE_SZ4MB_4V |
- _PAGE_CP_4V | _PAGE_CV_4V | _PAGE_P_4V |
+ page_cache4v_flag | _PAGE_P_4V |
_PAGE_EXEC_4V | _PAGE_W_4V);
return val | paddr;
@@ -2455,92 +2878,40 @@ void __flush_tlb_all(void)
: : "r" (pstate));
}
-static pte_t *get_from_cache(struct mm_struct *mm)
+static pte_t *__pte_alloc_one(struct mm_struct *mm)
{
- struct page *page;
- pte_t *ret;
+ struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL | __GFP_ZERO, 0);
- spin_lock(&mm->page_table_lock);
- page = mm->context.pgtable_page;
- ret = NULL;
- if (page) {
- void *p = page_address(page);
-
- mm->context.pgtable_page = NULL;
-
- ret = (pte_t *) (p + (PAGE_SIZE / 2));
+ if (!ptdesc)
+ return NULL;
+ if (!pagetable_pte_ctor(mm, ptdesc)) {
+ pagetable_free(ptdesc);
+ return NULL;
}
- spin_unlock(&mm->page_table_lock);
-
- return ret;
+ return ptdesc_address(ptdesc);
}
-static struct page *__alloc_for_cache(struct mm_struct *mm)
+pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
{
- struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK |
- __GFP_REPEAT | __GFP_ZERO);
-
- if (page) {
- spin_lock(&mm->page_table_lock);
- if (!mm->context.pgtable_page) {
- atomic_set(&page->_count, 2);
- mm->context.pgtable_page = page;
- }
- spin_unlock(&mm->page_table_lock);
- }
- return page;
+ return __pte_alloc_one(mm);
}
-pte_t *pte_alloc_one_kernel(struct mm_struct *mm,
- unsigned long address)
+pgtable_t pte_alloc_one(struct mm_struct *mm)
{
- struct page *page;
- pte_t *pte;
-
- pte = get_from_cache(mm);
- if (pte)
- return pte;
-
- page = __alloc_for_cache(mm);
- if (page)
- pte = (pte_t *) page_address(page);
-
- return pte;
+ return __pte_alloc_one(mm);
}
-pgtable_t pte_alloc_one(struct mm_struct *mm,
- unsigned long address)
+static void __pte_free(pgtable_t pte)
{
- struct page *page;
- pte_t *pte;
-
- pte = get_from_cache(mm);
- if (pte)
- return pte;
-
- page = __alloc_for_cache(mm);
- if (page) {
- pgtable_page_ctor(page);
- pte = (pte_t *) page_address(page);
- }
+ struct ptdesc *ptdesc = virt_to_ptdesc(pte);
- return pte;
+ pagetable_dtor(ptdesc);
+ pagetable_free(ptdesc);
}
void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
{
- struct page *page = virt_to_page(pte);
- if (put_page_testzero(page))
- free_hot_cold_page(page, 0);
-}
-
-static void __pte_free(pgtable_t pte)
-{
- struct page *page = virt_to_page(pte);
- if (put_page_testzero(page)) {
- pgtable_page_dtor(page);
- free_hot_cold_page(page, 0);
- }
+ __pte_free(pte);
}
void pte_free(struct mm_struct *mm, pgtable_t pte)
@@ -2557,92 +2928,20 @@ void pgtable_free(void *table, bool is_page)
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot, bool for_modify)
+static void pte_free_now(struct rcu_head *head)
{
- if (pgprot_val(pgprot) & _PAGE_VALID)
- pmd_val(pmd) |= PMD_HUGE_PRESENT;
- if (tlb_type == hypervisor) {
- if (pgprot_val(pgprot) & _PAGE_WRITE_4V)
- pmd_val(pmd) |= PMD_HUGE_WRITE;
- if (pgprot_val(pgprot) & _PAGE_EXEC_4V)
- pmd_val(pmd) |= PMD_HUGE_EXEC;
-
- if (!for_modify) {
- if (pgprot_val(pgprot) & _PAGE_ACCESSED_4V)
- pmd_val(pmd) |= PMD_HUGE_ACCESSED;
- if (pgprot_val(pgprot) & _PAGE_MODIFIED_4V)
- pmd_val(pmd) |= PMD_HUGE_DIRTY;
- }
- } else {
- if (pgprot_val(pgprot) & _PAGE_WRITE_4U)
- pmd_val(pmd) |= PMD_HUGE_WRITE;
- if (pgprot_val(pgprot) & _PAGE_EXEC_4U)
- pmd_val(pmd) |= PMD_HUGE_EXEC;
-
- if (!for_modify) {
- if (pgprot_val(pgprot) & _PAGE_ACCESSED_4U)
- pmd_val(pmd) |= PMD_HUGE_ACCESSED;
- if (pgprot_val(pgprot) & _PAGE_MODIFIED_4U)
- pmd_val(pmd) |= PMD_HUGE_DIRTY;
- }
- }
-
- return pmd;
-}
-
-pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
-{
- pmd_t pmd;
-
- pmd_val(pmd) = (page_nr << ((PAGE_SHIFT - PMD_PADDR_SHIFT)));
- pmd_val(pmd) |= PMD_ISHUGE;
- pmd = pmd_set_protbits(pmd, pgprot, false);
- return pmd;
-}
+ struct page *page;
-pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
-{
- pmd_val(pmd) &= ~(PMD_HUGE_PRESENT |
- PMD_HUGE_WRITE |
- PMD_HUGE_EXEC);
- pmd = pmd_set_protbits(pmd, newprot, true);
- return pmd;
+ page = container_of(head, struct page, rcu_head);
+ __pte_free((pgtable_t)page_address(page));
}
-pgprot_t pmd_pgprot(pmd_t entry)
+void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable)
{
- unsigned long pte = 0;
-
- if (pmd_val(entry) & PMD_HUGE_PRESENT)
- pte |= _PAGE_VALID;
-
- if (tlb_type == hypervisor) {
- if (pmd_val(entry) & PMD_HUGE_PRESENT)
- pte |= _PAGE_PRESENT_4V;
- if (pmd_val(entry) & PMD_HUGE_EXEC)
- pte |= _PAGE_EXEC_4V;
- if (pmd_val(entry) & PMD_HUGE_WRITE)
- pte |= _PAGE_W_4V;
- if (pmd_val(entry) & PMD_HUGE_ACCESSED)
- pte |= _PAGE_ACCESSED_4V;
- if (pmd_val(entry) & PMD_HUGE_DIRTY)
- pte |= _PAGE_MODIFIED_4V;
- pte |= _PAGE_CP_4V|_PAGE_CV_4V;
- } else {
- if (pmd_val(entry) & PMD_HUGE_PRESENT)
- pte |= _PAGE_PRESENT_4U;
- if (pmd_val(entry) & PMD_HUGE_EXEC)
- pte |= _PAGE_EXEC_4U;
- if (pmd_val(entry) & PMD_HUGE_WRITE)
- pte |= _PAGE_W_4U;
- if (pmd_val(entry) & PMD_HUGE_ACCESSED)
- pte |= _PAGE_ACCESSED_4U;
- if (pmd_val(entry) & PMD_HUGE_DIRTY)
- pte |= _PAGE_MODIFIED_4U;
- pte |= _PAGE_CP_4U|_PAGE_CV_4U;
- }
+ struct page *page;
- return __pgprot(pte);
+ page = virt_to_page(pgtable);
+ call_rcu(&page->rcu_head, pte_free_now);
}
void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
@@ -2651,30 +2950,25 @@ void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
unsigned long pte, flags;
struct mm_struct *mm;
pmd_t entry = *pmd;
- pgprot_t prot;
- if (!pmd_large(entry) || !pmd_young(entry))
+ if (!pmd_leaf(entry) || !pmd_young(entry))
return;
- pte = (pmd_val(entry) & ~PMD_HUGE_PROTBITS);
- pte <<= PMD_PADDR_SHIFT;
- pte |= _PAGE_VALID;
+ pte = pmd_val(entry);
- prot = pmd_pgprot(entry);
-
- if (tlb_type == hypervisor)
- pgprot_val(prot) |= _PAGE_SZHUGE_4V;
- else
- pgprot_val(prot) |= _PAGE_SZHUGE_4U;
+ /* Don't insert a non-valid PMD into the TSB, we'll deadlock. */
+ if (!(pte & _PAGE_VALID))
+ return;
- pte |= pgprot_val(prot);
+ /* We are fabricating 8MB pages using 4MB real hw pages. */
+ pte |= (addr & (1UL << REAL_HPAGE_SHIFT));
mm = vma->vm_mm;
spin_lock_irqsave(&mm->context.lock, flags);
if (mm->context.tsb_block[MM_TSB_HUGE].tsb != NULL)
- __update_mmu_tsb_insert(mm, MM_TSB_HUGE, HPAGE_SHIFT,
+ __update_mmu_tsb_insert(mm, MM_TSB_HUGE, REAL_HPAGE_SHIFT,
addr, pte);
spin_unlock_irqrestore(&mm->context.lock, flags);
@@ -2695,7 +2989,7 @@ void hugetlb_setup(struct pt_regs *regs)
struct mm_struct *mm = current->mm;
struct tsb_config *tp;
- if (in_atomic() || !mm) {
+ if (faulthandler_disabled() || !mm) {
const struct exception_table_entry *entry;
entry = search_exception_tables(regs->tpc);
@@ -2719,9 +3013,10 @@ void hugetlb_setup(struct pt_regs *regs)
* the Data-TLB for huge pages.
*/
if (tlb_type == cheetah_plus) {
+ bool need_context_reload = false;
unsigned long ctx;
- spin_lock(&ctx_alloc_lock);
+ spin_lock_irq(&ctx_alloc_lock);
ctx = mm->context.sparc64_ctx_val;
ctx &= ~CTX_PGSZ_MASK;
ctx |= CTX_PGSZ_BASE << CTX_PGSZ0_SHIFT;
@@ -2740,9 +3035,180 @@ void hugetlb_setup(struct pt_regs *regs)
* also executing in this address space.
*/
mm->context.sparc64_ctx_val = ctx;
+ need_context_reload = true;
+ }
+ spin_unlock_irq(&ctx_alloc_lock);
+
+ if (need_context_reload)
on_each_cpu(context_reload, mm, 0);
+ }
+}
+#endif
+
+static struct resource code_resource = {
+ .name = "Kernel code",
+ .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
+};
+
+static struct resource data_resource = {
+ .name = "Kernel data",
+ .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
+};
+
+static struct resource bss_resource = {
+ .name = "Kernel bss",
+ .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
+};
+
+static inline resource_size_t compute_kern_paddr(void *addr)
+{
+ return (resource_size_t) (addr - KERNBASE + kern_base);
+}
+
+static void __init kernel_lds_init(void)
+{
+ code_resource.start = compute_kern_paddr(_text);
+ code_resource.end = compute_kern_paddr(_etext - 1);
+ data_resource.start = compute_kern_paddr(_etext);
+ data_resource.end = compute_kern_paddr(_edata - 1);
+ bss_resource.start = compute_kern_paddr(__bss_start);
+ bss_resource.end = compute_kern_paddr(_end - 1);
+}
+
+static int __init report_memory(void)
+{
+ int i;
+ struct resource *res;
+
+ kernel_lds_init();
+
+ for (i = 0; i < pavail_ents; i++) {
+ res = kzalloc(sizeof(struct resource), GFP_KERNEL);
+
+ if (!res) {
+ pr_warn("Failed to allocate source.\n");
+ break;
+ }
+
+ res->name = "System RAM";
+ res->start = pavail[i].phys_addr;
+ res->end = pavail[i].phys_addr + pavail[i].reg_size - 1;
+ res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM;
+
+ if (insert_resource(&iomem_resource, res) < 0) {
+ pr_warn("Resource insertion failed.\n");
+ break;
}
- spin_unlock(&ctx_alloc_lock);
+
+ insert_resource(res, &code_resource);
+ insert_resource(res, &data_resource);
+ insert_resource(res, &bss_resource);
}
+
+ return 0;
}
+arch_initcall(report_memory);
+
+#ifdef CONFIG_SMP
+#define do_flush_tlb_kernel_range smp_flush_tlb_kernel_range
+#else
+#define do_flush_tlb_kernel_range __flush_tlb_kernel_range
#endif
+
+void flush_tlb_kernel_range(unsigned long start, unsigned long end)
+{
+ if (start < HI_OBP_ADDRESS && end > LOW_OBP_ADDRESS) {
+ if (start < LOW_OBP_ADDRESS) {
+ flush_tsb_kernel_range(start, LOW_OBP_ADDRESS);
+ do_flush_tlb_kernel_range(start, LOW_OBP_ADDRESS);
+ }
+ if (end > HI_OBP_ADDRESS) {
+ flush_tsb_kernel_range(HI_OBP_ADDRESS, end);
+ do_flush_tlb_kernel_range(HI_OBP_ADDRESS, end);
+ }
+ } else {
+ flush_tsb_kernel_range(start, end);
+ do_flush_tlb_kernel_range(start, end);
+ }
+}
+
+void copy_user_highpage(struct page *to, struct page *from,
+ unsigned long vaddr, struct vm_area_struct *vma)
+{
+ char *vfrom, *vto;
+
+ vfrom = kmap_atomic(from);
+ vto = kmap_atomic(to);
+ copy_user_page(vto, vfrom, vaddr, to);
+ kunmap_atomic(vto);
+ kunmap_atomic(vfrom);
+
+ /* If this page has ADI enabled, copy over any ADI tags
+ * as well
+ */
+ if (vma->vm_flags & VM_SPARC_ADI) {
+ unsigned long pfrom, pto, i, adi_tag;
+
+ pfrom = page_to_phys(from);
+ pto = page_to_phys(to);
+
+ for (i = pfrom; i < (pfrom + PAGE_SIZE); i += adi_blksize()) {
+ asm volatile("ldxa [%1] %2, %0\n\t"
+ : "=r" (adi_tag)
+ : "r" (i), "i" (ASI_MCD_REAL));
+ asm volatile("stxa %0, [%1] %2\n\t"
+ :
+ : "r" (adi_tag), "r" (pto),
+ "i" (ASI_MCD_REAL));
+ pto += adi_blksize();
+ }
+ asm volatile("membar #Sync\n\t");
+ }
+}
+EXPORT_SYMBOL(copy_user_highpage);
+
+void copy_highpage(struct page *to, struct page *from)
+{
+ char *vfrom, *vto;
+
+ vfrom = kmap_atomic(from);
+ vto = kmap_atomic(to);
+ copy_page(vto, vfrom);
+ kunmap_atomic(vto);
+ kunmap_atomic(vfrom);
+
+ /* If this platform is ADI enabled, copy any ADI tags
+ * as well
+ */
+ if (adi_capable()) {
+ unsigned long pfrom, pto, i, adi_tag;
+
+ pfrom = page_to_phys(from);
+ pto = page_to_phys(to);
+
+ for (i = pfrom; i < (pfrom + PAGE_SIZE); i += adi_blksize()) {
+ asm volatile("ldxa [%1] %2, %0\n\t"
+ : "=r" (adi_tag)
+ : "r" (i), "i" (ASI_MCD_REAL));
+ asm volatile("stxa %0, [%1] %2\n\t"
+ :
+ : "r" (adi_tag), "r" (pto),
+ "i" (ASI_MCD_REAL));
+ pto += adi_blksize();
+ }
+ asm volatile("membar #Sync\n\t");
+ }
+}
+EXPORT_SYMBOL(copy_highpage);
+
+pgprot_t vm_get_page_prot(vm_flags_t vm_flags)
+{
+ unsigned long prot = pgprot_val(protection_map[vm_flags &
+ (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]);
+
+ if (vm_flags & VM_SPARC_ADI)
+ prot |= _PAGE_MCD_4V;
+
+ return __pgprot(prot);
+}
+EXPORT_SYMBOL(vm_get_page_prot);