diff options
Diffstat (limited to 'arch/sparc/mm/init_64.c')
| -rw-r--r-- | arch/sparc/mm/init_64.c | 1680 |
1 files changed, 1073 insertions, 607 deletions
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index a9c42a7ffb6a..df9f7c444c39 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * arch/sparc64/mm/init.c * @@ -5,12 +6,12 @@ * Copyright (C) 1997-1999 Jakub Jelinek (jj@sunsite.mff.cuni.cz) */ -#include <linux/module.h> +#include <linux/extable.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/string.h> #include <linux/init.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/mm.h> #include <linux/hugetlb.h> #include <linux/initrd.h> @@ -22,19 +23,19 @@ #include <linux/kprobes.h> #include <linux/cache.h> #include <linux/sort.h> +#include <linux/ioport.h> #include <linux/percpu.h> -#include <linux/memblock.h> #include <linux/mmzone.h> #include <linux/gfp.h> +#include <linux/bootmem_info.h> #include <asm/head.h> #include <asm/page.h> #include <asm/pgalloc.h> -#include <asm/pgtable.h> #include <asm/oplib.h> #include <asm/iommu.h> #include <asm/io.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/mmu_context.h> #include <asm/tlbflush.h> #include <asm/dma.h> @@ -47,11 +48,13 @@ #include <asm/prom.h> #include <asm/mdesc.h> #include <asm/cpudata.h> +#include <asm/setup.h> #include <asm/irq.h> #include "init_64.h" unsigned long kern_linear_pte_xor[4] __read_mostly; +static unsigned long page_cache4v_flag; /* A bitmap, two bits for every 256MB of physical memory. These two * bits determine what page size we use for kernel linear @@ -73,7 +76,6 @@ unsigned long kern_linear_pte_xor[4] __read_mostly; * 'cpu' properties, but we need to have this table setup before the * MDESC is initialized. */ -unsigned long kpte_linear_bitmap[KPTE_BITMAP_BYTES / sizeof(unsigned long)]; #ifndef CONFIG_DEBUG_PAGEALLOC /* A special kernel TSB for 4MB, 256MB, 2GB and 16GB linear mappings. @@ -82,14 +84,17 @@ unsigned long kpte_linear_bitmap[KPTE_BITMAP_BYTES / sizeof(unsigned long)]; */ extern struct tsb swapper_4m_tsb[KERNEL_TSB4M_NENTRIES]; #endif +extern struct tsb swapper_tsb[KERNEL_TSB_NENTRIES]; static unsigned long cpu_pgsz_mask; -#define MAX_BANKS 32 +#define MAX_BANKS 1024 static struct linux_prom64_registers pavail[MAX_BANKS]; static int pavail_ents; +u64 numa_latency[MAX_NUMNODES][MAX_NUMNODES]; + static int cmp_p64(const void *a, const void *b) { const struct linux_prom64_registers *x = a, *y = b; @@ -163,10 +168,6 @@ static void __init read_obp_memory(const char *property, cmp_p64, NULL); } -unsigned long sparc64_valid_addr_bitmap[VALID_ADDR_BITMAP_BYTES / - sizeof(unsigned long)]; -EXPORT_SYMBOL(sparc64_valid_addr_bitmap); - /* Kernel physical address base and size in bytes. */ unsigned long kern_base __read_mostly; unsigned long kern_size __read_mostly; @@ -194,21 +195,26 @@ atomic_t dcpage_flushes_xcall = ATOMIC_INIT(0); #endif #endif -inline void flush_dcache_page_impl(struct page *page) +inline void flush_dcache_folio_impl(struct folio *folio) { + unsigned int i, nr = folio_nr_pages(folio); + BUG_ON(tlb_type == hypervisor); #ifdef CONFIG_DEBUG_DCFLUSH atomic_inc(&dcpage_flushes); #endif #ifdef DCACHE_ALIASING_POSSIBLE - __flush_dcache_page(page_address(page), - ((tlb_type == spitfire) && - page_mapping(page) != NULL)); + for (i = 0; i < nr; i++) + __flush_dcache_page(folio_address(folio) + i * PAGE_SIZE, + ((tlb_type == spitfire) && + folio_flush_mapping(folio) != NULL)); #else - if (page_mapping(page) != NULL && - tlb_type == spitfire) - __flush_icache_page(__pa(page_address(page))); + if (folio_flush_mapping(folio) != NULL && + tlb_type == spitfire) { + for (i = 0; i < nr; i++) + __flush_icache_page((pfn + i) * PAGE_SIZE); + } #endif } @@ -217,10 +223,10 @@ inline void flush_dcache_page_impl(struct page *page) #define PG_dcache_cpu_mask \ ((1UL<<ilog2(roundup_pow_of_two(NR_CPUS)))-1UL) -#define dcache_dirty_cpu(page) \ - (((page)->flags >> PG_dcache_cpu_shift) & PG_dcache_cpu_mask) +#define dcache_dirty_cpu(folio) \ + (((folio)->flags.f >> PG_dcache_cpu_shift) & PG_dcache_cpu_mask) -static inline void set_dcache_dirty(struct page *page, int this_cpu) +static inline void set_dcache_dirty(struct folio *folio, int this_cpu) { unsigned long mask = this_cpu; unsigned long non_cpu_bits; @@ -237,11 +243,11 @@ static inline void set_dcache_dirty(struct page *page, int this_cpu) "bne,pn %%xcc, 1b\n\t" " nop" : /* no outputs */ - : "r" (mask), "r" (non_cpu_bits), "r" (&page->flags) + : "r" (mask), "r" (non_cpu_bits), "r" (&folio->flags.f) : "g1", "g7"); } -static inline void clear_dcache_dirty_cpu(struct page *page, unsigned long cpu) +static inline void clear_dcache_dirty_cpu(struct folio *folio, unsigned long cpu) { unsigned long mask = (1UL << PG_dcache_dirty); @@ -259,7 +265,7 @@ static inline void clear_dcache_dirty_cpu(struct page *page, unsigned long cpu) " nop\n" "2:" : /* no outputs */ - : "r" (cpu), "r" (mask), "r" (&page->flags), + : "r" (cpu), "r" (mask), "r" (&folio->flags.f), "i" (PG_dcache_cpu_mask), "i" (PG_dcache_cpu_shift) : "g1", "g7"); @@ -283,9 +289,10 @@ static void flush_dcache(unsigned long pfn) page = pfn_to_page(pfn); if (page) { + struct folio *folio = page_folio(page); unsigned long pg_flags; - pg_flags = page->flags; + pg_flags = folio->flags.f; if (pg_flags & (1UL << PG_dcache_dirty)) { int cpu = ((pg_flags >> PG_dcache_cpu_shift) & PG_dcache_cpu_mask); @@ -295,11 +302,11 @@ static void flush_dcache(unsigned long pfn) * in the SMP case. */ if (cpu == this_cpu) - flush_dcache_page_impl(page); + flush_dcache_folio_impl(folio); else - smp_flush_dcache_page_impl(page, cpu); + smp_flush_dcache_folio_impl(folio, cpu); - clear_dcache_dirty_cpu(page, cpu); + clear_dcache_dirty_cpu(folio, cpu); put_cpu(); } @@ -323,23 +330,78 @@ static void __update_mmu_tsb_insert(struct mm_struct *mm, unsigned long tsb_inde tsb_insert(tsb, tag, tte); } -#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) -static inline bool is_hugetlb_pte(pte_t pte) +#ifdef CONFIG_HUGETLB_PAGE +static int __init hugetlbpage_init(void) { - if ((tlb_type == hypervisor && - (pte_val(pte) & _PAGE_SZALL_4V) == _PAGE_SZHUGE_4V) || - (tlb_type != hypervisor && - (pte_val(pte) & _PAGE_SZALL_4U) == _PAGE_SZHUGE_4U)) - return true; - return false; + hugetlb_add_hstate(HPAGE_64K_SHIFT - PAGE_SHIFT); + hugetlb_add_hstate(HPAGE_SHIFT - PAGE_SHIFT); + hugetlb_add_hstate(HPAGE_256MB_SHIFT - PAGE_SHIFT); + hugetlb_add_hstate(HPAGE_2GB_SHIFT - PAGE_SHIFT); + + return 0; } -#endif -void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) +arch_initcall(hugetlbpage_init); + +static void __init pud_huge_patch(void) +{ + struct pud_huge_patch_entry *p; + unsigned long addr; + + p = &__pud_huge_patch; + addr = p->addr; + *(unsigned int *)addr = p->insn; + + __asm__ __volatile__("flush %0" : : "r" (addr)); +} + +bool __init arch_hugetlb_valid_size(unsigned long size) +{ + unsigned int hugepage_shift = ilog2(size); + unsigned short hv_pgsz_idx; + unsigned int hv_pgsz_mask; + + switch (hugepage_shift) { + case HPAGE_16GB_SHIFT: + hv_pgsz_mask = HV_PGSZ_MASK_16GB; + hv_pgsz_idx = HV_PGSZ_IDX_16GB; + pud_huge_patch(); + break; + case HPAGE_2GB_SHIFT: + hv_pgsz_mask = HV_PGSZ_MASK_2GB; + hv_pgsz_idx = HV_PGSZ_IDX_2GB; + break; + case HPAGE_256MB_SHIFT: + hv_pgsz_mask = HV_PGSZ_MASK_256MB; + hv_pgsz_idx = HV_PGSZ_IDX_256MB; + break; + case HPAGE_SHIFT: + hv_pgsz_mask = HV_PGSZ_MASK_4MB; + hv_pgsz_idx = HV_PGSZ_IDX_4MB; + break; + case HPAGE_64K_SHIFT: + hv_pgsz_mask = HV_PGSZ_MASK_64K; + hv_pgsz_idx = HV_PGSZ_IDX_64K; + break; + default: + hv_pgsz_mask = 0; + } + + if ((hv_pgsz_mask & cpu_pgsz_mask) == 0U) + return false; + + return true; +} +#endif /* CONFIG_HUGETLB_PAGE */ + +void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, unsigned int nr) { struct mm_struct *mm; unsigned long flags; + bool is_huge_tsb; pte_t pte = *ptep; + unsigned int i; if (tlb_type != hypervisor) { unsigned long pfn = pte_pfn(pte); @@ -350,22 +412,57 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t * mm = vma->vm_mm; + /* Don't insert a non-valid PTE into the TSB, we'll deadlock. */ + if (!pte_accessible(mm, pte)) + return; + spin_lock_irqsave(&mm->context.lock, flags); + is_huge_tsb = false; #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) - if (mm->context.huge_pte_count && is_hugetlb_pte(pte)) - __update_mmu_tsb_insert(mm, MM_TSB_HUGE, HPAGE_SHIFT, - address, pte_val(pte)); - else + if (mm->context.hugetlb_pte_count || mm->context.thp_pte_count) { + unsigned long hugepage_size = PAGE_SIZE; + + if (is_vm_hugetlb_page(vma)) + hugepage_size = huge_page_size(hstate_vma(vma)); + + if (hugepage_size >= PUD_SIZE) { + unsigned long mask = 0x1ffc00000UL; + + /* Transfer bits [32:22] from address to resolve + * at 4M granularity. + */ + pte_val(pte) &= ~mask; + pte_val(pte) |= (address & mask); + } else if (hugepage_size >= PMD_SIZE) { + /* We are fabricating 8MB pages using 4MB + * real hw pages. + */ + pte_val(pte) |= (address & (1UL << REAL_HPAGE_SHIFT)); + } + + if (hugepage_size >= PMD_SIZE) { + __update_mmu_tsb_insert(mm, MM_TSB_HUGE, + REAL_HPAGE_SHIFT, address, pte_val(pte)); + is_huge_tsb = true; + } + } #endif - __update_mmu_tsb_insert(mm, MM_TSB_BASE, PAGE_SHIFT, - address, pte_val(pte)); + if (!is_huge_tsb) { + for (i = 0; i < nr; i++) { + __update_mmu_tsb_insert(mm, MM_TSB_BASE, PAGE_SHIFT, + address, pte_val(pte)); + address += PAGE_SIZE; + pte_val(pte) += PAGE_SIZE; + } + } spin_unlock_irqrestore(&mm->context.lock, flags); } -void flush_dcache_page(struct page *page) +void flush_dcache_folio(struct folio *folio) { + unsigned long pfn = folio_pfn(folio); struct address_space *mapping; int this_cpu; @@ -376,35 +473,35 @@ void flush_dcache_page(struct page *page) * is merely the zero page. The 'bigcore' testcase in GDB * causes this case to run millions of times. */ - if (page == ZERO_PAGE(0)) + if (is_zero_pfn(pfn)) return; this_cpu = get_cpu(); - mapping = page_mapping(page); + mapping = folio_flush_mapping(folio); if (mapping && !mapping_mapped(mapping)) { - int dirty = test_bit(PG_dcache_dirty, &page->flags); + bool dirty = test_bit(PG_dcache_dirty, &folio->flags.f); if (dirty) { - int dirty_cpu = dcache_dirty_cpu(page); + int dirty_cpu = dcache_dirty_cpu(folio); if (dirty_cpu == this_cpu) goto out; - smp_flush_dcache_page_impl(page, dirty_cpu); + smp_flush_dcache_folio_impl(folio, dirty_cpu); } - set_dcache_dirty(page, this_cpu); + set_dcache_dirty(folio, this_cpu); } else { - /* We could delay the flush for the !page_mapping + /* We could delay the flush for the !folio_mapping * case too. But that case is for exec env/arg * pages and those are %99 certainly going to get * faulted into the tlb (and thus flushed) anyways. */ - flush_dcache_page_impl(page); + flush_dcache_folio_impl(folio); } out: put_cpu(); } -EXPORT_SYMBOL(flush_dcache_page); +EXPORT_SYMBOL(flush_dcache_folio); void __kprobes flush_icache_range(unsigned long start, unsigned long end) { @@ -421,10 +518,7 @@ void __kprobes flush_icache_range(unsigned long start, unsigned long end) if (kaddr >= PAGE_OFFSET) paddr = kaddr & mask; else { - pgd_t *pgdp = pgd_offset_k(kaddr); - pud_t *pudp = pud_offset(pgdp, kaddr); - pmd_t *pmdp = pmd_offset(pudp, kaddr); - pte_t *ptep = pte_offset_kernel(pmdp, kaddr); + pte_t *ptep = virt_to_kpte(kaddr); paddr = pte_val(*ptep) & mask; } @@ -588,7 +682,7 @@ static void __init remap_kernel(void) int i, tlb_ent = sparc64_highest_locked_tlbent(); tte_vaddr = (unsigned long) KERNBASE; - phys_page = (prom_boot_mapping_phys_low >> 22UL) << 22UL; + phys_page = (prom_boot_mapping_phys_low >> ILOG2_4MB) << ILOG2_4MB; tte_data = kern_large_tte(phys_page); kern_locked_tte_data = tte_data; @@ -629,9 +723,10 @@ static void __init inherit_prom_mappings(void) void prom_world(int enter) { - if (!enter) - set_fs(get_fs()); - + /* + * No need to change the address space any more, just flush + * the register windows + */ __asm__ __volatile__("flushw"); } @@ -662,10 +757,58 @@ EXPORT_SYMBOL(__flush_dcache_range); /* get_new_mmu_context() uses "cache + 1". */ DEFINE_SPINLOCK(ctx_alloc_lock); -unsigned long tlb_context_cache = CTX_FIRST_VERSION - 1; +unsigned long tlb_context_cache = CTX_FIRST_VERSION; #define MAX_CTX_NR (1UL << CTX_NR_BITS) #define CTX_BMAP_SLOTS BITS_TO_LONGS(MAX_CTX_NR) DECLARE_BITMAP(mmu_context_bmap, MAX_CTX_NR); +DEFINE_PER_CPU(struct mm_struct *, per_cpu_secondary_mm) = {0}; + +static void mmu_context_wrap(void) +{ + unsigned long old_ver = tlb_context_cache & CTX_VERSION_MASK; + unsigned long new_ver, new_ctx, old_ctx; + struct mm_struct *mm; + int cpu; + + bitmap_zero(mmu_context_bmap, 1 << CTX_NR_BITS); + + /* Reserve kernel context */ + set_bit(0, mmu_context_bmap); + + new_ver = (tlb_context_cache & CTX_VERSION_MASK) + CTX_FIRST_VERSION; + if (unlikely(new_ver == 0)) + new_ver = CTX_FIRST_VERSION; + tlb_context_cache = new_ver; + + /* + * Make sure that any new mm that are added into per_cpu_secondary_mm, + * are going to go through get_new_mmu_context() path. + */ + mb(); + + /* + * Updated versions to current on those CPUs that had valid secondary + * contexts + */ + for_each_online_cpu(cpu) { + /* + * If a new mm is stored after we took this mm from the array, + * it will go into get_new_mmu_context() path, because we + * already bumped the version in tlb_context_cache. + */ + mm = per_cpu(per_cpu_secondary_mm, cpu); + + if (unlikely(!mm || mm == &init_mm)) + continue; + + old_ctx = mm->context.sparc64_ctx_val; + if (likely((old_ctx & CTX_VERSION_MASK) == old_ver)) { + new_ctx = (old_ctx & ~CTX_VERSION_MASK) | new_ver; + set_bit(new_ctx & CTX_NR_MASK, mmu_context_bmap); + mm->context.sparc64_ctx_val = new_ctx; + } + } +} /* Caller does TLB context flushing on local CPU if necessary. * The caller also ensures that CTX_VALID(mm->context) is false. @@ -681,48 +824,30 @@ void get_new_mmu_context(struct mm_struct *mm) { unsigned long ctx, new_ctx; unsigned long orig_pgsz_bits; - int new_version; spin_lock(&ctx_alloc_lock); +retry: + /* wrap might have happened, test again if our context became valid */ + if (unlikely(CTX_VALID(mm->context))) + goto out; orig_pgsz_bits = (mm->context.sparc64_ctx_val & CTX_PGSZ_MASK); ctx = (tlb_context_cache + 1) & CTX_NR_MASK; new_ctx = find_next_zero_bit(mmu_context_bmap, 1 << CTX_NR_BITS, ctx); - new_version = 0; if (new_ctx >= (1 << CTX_NR_BITS)) { new_ctx = find_next_zero_bit(mmu_context_bmap, ctx, 1); if (new_ctx >= ctx) { - int i; - new_ctx = (tlb_context_cache & CTX_VERSION_MASK) + - CTX_FIRST_VERSION; - if (new_ctx == 1) - new_ctx = CTX_FIRST_VERSION; - - /* Don't call memset, for 16 entries that's just - * plain silly... - */ - mmu_context_bmap[0] = 3; - mmu_context_bmap[1] = 0; - mmu_context_bmap[2] = 0; - mmu_context_bmap[3] = 0; - for (i = 4; i < CTX_BMAP_SLOTS; i += 4) { - mmu_context_bmap[i + 0] = 0; - mmu_context_bmap[i + 1] = 0; - mmu_context_bmap[i + 2] = 0; - mmu_context_bmap[i + 3] = 0; - } - new_version = 1; - goto out; + mmu_context_wrap(); + goto retry; } } + if (mm->context.sparc64_ctx_val) + cpumask_clear(mm_cpumask(mm)); mmu_context_bmap[new_ctx>>6] |= (1UL << (new_ctx & 63)); new_ctx |= (tlb_context_cache & CTX_VERSION_MASK); -out: tlb_context_cache = new_ctx; mm->context.sparc64_ctx_val = new_ctx | orig_pgsz_bits; +out: spin_unlock(&ctx_alloc_lock); - - if (unlikely(new_version)) - smp_new_mmu_context_version(); } static int numa_enabled = 1; @@ -789,16 +914,26 @@ static void __init find_ramdisk(unsigned long phys_base) struct node_mem_mask { unsigned long mask; - unsigned long val; + unsigned long match; }; static struct node_mem_mask node_masks[MAX_NUMNODES]; static int num_node_masks; +#ifdef CONFIG_NUMA + +struct mdesc_mlgroup { + u64 node; + u64 latency; + u64 match; + u64 mask; +}; + +static struct mdesc_mlgroup *mlgroups; +static int num_mlgroups; + int numa_cpu_lookup_table[NR_CPUS]; cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES]; -#ifdef CONFIG_NEED_MULTIPLE_NODES - struct mdesc_mblock { u64 base; u64 size; @@ -807,52 +942,128 @@ struct mdesc_mblock { static struct mdesc_mblock *mblocks; static int num_mblocks; -static unsigned long ra_to_pa(unsigned long addr) +static struct mdesc_mblock * __init addr_to_mblock(unsigned long addr) { + struct mdesc_mblock *m = NULL; int i; for (i = 0; i < num_mblocks; i++) { - struct mdesc_mblock *m = &mblocks[i]; + m = &mblocks[i]; if (addr >= m->base && addr < (m->base + m->size)) { - addr += m->offset; break; } } - return addr; + + return m; } -static int find_node(unsigned long addr) +static u64 __init memblock_nid_range_sun4u(u64 start, u64 end, int *nid) { - int i; + int prev_nid, new_nid; - addr = ra_to_pa(addr); - for (i = 0; i < num_node_masks; i++) { - struct node_mem_mask *p = &node_masks[i]; + prev_nid = NUMA_NO_NODE; + for ( ; start < end; start += PAGE_SIZE) { + for (new_nid = 0; new_nid < num_node_masks; new_nid++) { + struct node_mem_mask *p = &node_masks[new_nid]; + + if ((start & p->mask) == p->match) { + if (prev_nid == NUMA_NO_NODE) + prev_nid = new_nid; + break; + } + } + + if (new_nid == num_node_masks) { + prev_nid = 0; + WARN_ONCE(1, "addr[%Lx] doesn't match a NUMA node rule. Some memory will be owned by node 0.", + start); + break; + } - if ((addr & p->mask) == p->val) - return i; + if (prev_nid != new_nid) + break; } - return -1; + *nid = prev_nid; + + return start > end ? end : start; } -static u64 memblock_nid_range(u64 start, u64 end, int *nid) +static u64 __init memblock_nid_range(u64 start, u64 end, int *nid) { - *nid = find_node(start); - start += PAGE_SIZE; - while (start < end) { - int n = find_node(start); + u64 ret_end, pa_start, m_mask, m_match, m_end; + struct mdesc_mblock *mblock; + int _nid, i; + + if (tlb_type != hypervisor) + return memblock_nid_range_sun4u(start, end, nid); + + mblock = addr_to_mblock(start); + if (!mblock) { + WARN_ONCE(1, "memblock_nid_range: Can't find mblock addr[%Lx]", + start); + + _nid = 0; + ret_end = end; + goto done; + } - if (n != *nid) + pa_start = start + mblock->offset; + m_match = 0; + m_mask = 0; + + for (_nid = 0; _nid < num_node_masks; _nid++) { + struct node_mem_mask *const m = &node_masks[_nid]; + + if ((pa_start & m->mask) == m->match) { + m_match = m->match; + m_mask = m->mask; break; - start += PAGE_SIZE; + } } - if (start > end) - start = end; + if (num_node_masks == _nid) { + /* We could not find NUMA group, so default to 0, but lets + * search for latency group, so we could calculate the correct + * end address that we return + */ + _nid = 0; + + for (i = 0; i < num_mlgroups; i++) { + struct mdesc_mlgroup *const m = &mlgroups[i]; - return start; + if ((pa_start & m->mask) == m->match) { + m_match = m->match; + m_mask = m->mask; + break; + } + } + + if (i == num_mlgroups) { + WARN_ONCE(1, "memblock_nid_range: Can't find latency group addr[%Lx]", + start); + + ret_end = end; + goto done; + } + } + + /* + * Each latency group has match and mask, and each memory block has an + * offset. An address belongs to a latency group if its address matches + * the following formula: ((addr + offset) & mask) == match + * It is, however, slow to check every single page if it matches a + * particular latency group. As optimization we calculate end value by + * using bit arithmetics. + */ + m_end = m_match + (1ul << __ffs(m_mask)) - mblock->offset; + m_end += pa_start & ~((1ul << fls64(m_mask)) - 1); + ret_end = m_end > end ? end : m_end; + +done: + *nid = _nid; + return ret_end; } #endif @@ -864,16 +1075,9 @@ static void __init allocate_node_data(int nid) { struct pglist_data *p; unsigned long start_pfn, end_pfn; -#ifdef CONFIG_NEED_MULTIPLE_NODES - unsigned long paddr; - paddr = memblock_alloc_try_nid(sizeof(struct pglist_data), SMP_CACHE_BYTES, nid); - if (!paddr) { - prom_printf("Cannot allocate pglist_data for nid[%d]\n", nid); - prom_halt(); - } - NODE_DATA(nid) = __va(paddr); - memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); +#ifdef CONFIG_NUMA + alloc_node_data(nid); NODE_DATA(nid)->node_id = nid; #endif @@ -887,34 +1091,28 @@ static void __init allocate_node_data(int nid) static void init_node_masks_nonnuma(void) { +#ifdef CONFIG_NUMA int i; +#endif numadbg("Initializing tables for non-numa.\n"); - node_masks[0].mask = node_masks[0].val = 0; + node_masks[0].mask = 0; + node_masks[0].match = 0; num_node_masks = 1; +#ifdef CONFIG_NUMA for (i = 0; i < NR_CPUS; i++) numa_cpu_lookup_table[i] = 0; cpumask_setall(&numa_cpumask_lookup_table[0]); +#endif } -#ifdef CONFIG_NEED_MULTIPLE_NODES -struct pglist_data *node_data[MAX_NUMNODES]; +#ifdef CONFIG_NUMA EXPORT_SYMBOL(numa_cpu_lookup_table); EXPORT_SYMBOL(numa_cpumask_lookup_table); -EXPORT_SYMBOL(node_data); - -struct mdesc_mlgroup { - u64 node; - u64 latency; - u64 match; - u64 mask; -}; -static struct mdesc_mlgroup *mlgroups; -static int num_mlgroups; static int scan_pio_for_cfg_handle(struct mdesc_handle *md, u64 pio, u32 cfg_handle) @@ -987,7 +1185,7 @@ int of_node_to_nid(struct device_node *dp) md = mdesc_grab(); count = 0; - nid = -1; + nid = NUMA_NO_NODE; mdesc_for_each_node_by_name(md, grp, "group") { if (!scan_arcs_for_cfg_handle(md, grp, cfg_handle)) { nid = count; @@ -1003,14 +1201,14 @@ int of_node_to_nid(struct device_node *dp) static void __init add_node_ranges(void) { - struct memblock_region *reg; + phys_addr_t start, end; + unsigned long prev_max; + u64 i; - for_each_memblock(memory, reg) { - unsigned long size = reg->size; - unsigned long start, end; +memblock_resized: + prev_max = memblock.memory.max; - start = reg->base; - end = start + size; + for_each_mem_range(i, &start, &end) { while (start < end) { unsigned long this_end; int nid; @@ -1018,10 +1216,13 @@ static void __init add_node_ranges(void) this_end = memblock_nid_range(start, end, &nid); numadbg("Setting memblock NUMA node nid[%d] " - "start[%lx] end[%lx]\n", + "start[%llx] end[%lx]\n", nid, start, this_end); - memblock_set_node(start, this_end - start, nid); + memblock_set_node(start, this_end - start, + &memblock.memory, nid); + if (memblock.memory.max != prev_max) + goto memblock_resized; start = this_end; } } @@ -1038,8 +1239,8 @@ static int __init grab_mlgroups(struct mdesc_handle *md) if (!count) return -ENOENT; - paddr = memblock_alloc(count * sizeof(struct mdesc_mlgroup), - SMP_CACHE_BYTES); + paddr = memblock_phys_alloc(count * sizeof(struct mdesc_mlgroup), + SMP_CACHE_BYTES); if (!paddr) return -ENOMEM; @@ -1079,8 +1280,8 @@ static int __init grab_mblocks(struct mdesc_handle *md) if (!count) return -ENOENT; - paddr = memblock_alloc(count * sizeof(struct mdesc_mblock), - SMP_CACHE_BYTES); + paddr = memblock_phys_alloc(count * sizeof(struct mdesc_mblock), + SMP_CACHE_BYTES); if (!paddr) return -ENOMEM; @@ -1146,6 +1347,49 @@ static struct mdesc_mlgroup * __init find_mlgroup(u64 node) return NULL; } +int __node_distance(int from, int to) +{ + if ((from >= MAX_NUMNODES) || (to >= MAX_NUMNODES)) { + pr_warn("Returning default NUMA distance value for %d->%d\n", + from, to); + return (from == to) ? LOCAL_DISTANCE : REMOTE_DISTANCE; + } + return numa_latency[from][to]; +} +EXPORT_SYMBOL(__node_distance); + +static int __init find_best_numa_node_for_mlgroup(struct mdesc_mlgroup *grp) +{ + int i; + + for (i = 0; i < MAX_NUMNODES; i++) { + struct node_mem_mask *n = &node_masks[i]; + + if ((grp->mask == n->mask) && (grp->match == n->match)) + break; + } + return i; +} + +static void __init find_numa_latencies_for_group(struct mdesc_handle *md, + u64 grp, int index) +{ + u64 arc; + + mdesc_for_each_arc(arc, md, grp, MDESC_ARC_TYPE_FWD) { + int tnode; + u64 target = mdesc_arc_target(md, arc); + struct mdesc_mlgroup *m = find_mlgroup(target); + + if (!m) + continue; + tnode = find_best_numa_node_for_mlgroup(m); + if (tnode == MAX_NUMNODES) + continue; + numa_latency[index][tnode] = m->latency; + } +} + static int __init numa_attach_mlgroup(struct mdesc_handle *md, u64 grp, int index) { @@ -1176,10 +1420,10 @@ static int __init numa_attach_mlgroup(struct mdesc_handle *md, u64 grp, n = &node_masks[num_node_masks++]; n->mask = candidate->mask; - n->val = candidate->match; + n->match = candidate->match; - numadbg("NUMA NODE[%d]: mask[%lx] val[%lx] (latency[%llx])\n", - index, n->mask, n->val, candidate->latency); + numadbg("NUMA NODE[%d]: mask[%lx] match[%lx] (latency[%llx])\n", + index, n->mask, n->match, candidate->latency); return 0; } @@ -1209,7 +1453,7 @@ static int __init numa_parse_mdesc_group(struct mdesc_handle *md, u64 grp, static int __init numa_parse_mdesc(void) { struct mdesc_handle *md = mdesc_grab(); - int i, err, count; + int i, j, err, count; u64 node; node = mdesc_node_by_name(md, MDESC_NODE_NULL, "latency-groups"); @@ -1234,6 +1478,23 @@ static int __init numa_parse_mdesc(void) count++; } + count = 0; + mdesc_for_each_node_by_name(md, node, "group") { + find_numa_latencies_for_group(md, node, count); + count++; + } + + /* Normalize numa latency matrix according to ACPI SLIT spec. */ + for (i = 0; i < MAX_NUMNODES; i++) { + u64 self_latency = numa_latency[i][i]; + + for (j = 0; j < MAX_NUMNODES; j++) { + numa_latency[i][j] = + (numa_latency[i][j] * LOCAL_DISTANCE) / + self_latency; + } + } + add_node_ranges(); for (i = 0; i < num_node_masks; i++) { @@ -1259,7 +1520,7 @@ static int __init numa_parse_jbus(void) numa_cpu_lookup_table[cpu] = index; cpumask_copy(&numa_cpumask_lookup_table[index], cpumask_of(cpu)); node_masks[index].mask = ~((1UL << 36UL) - 1UL); - node_masks[index].val = cpu << 36UL; + node_masks[index].match = cpu << 36UL; index++; } @@ -1290,10 +1551,18 @@ static int __init numa_parse_sun4u(void) static int __init bootmem_init_numa(void) { + int i, j; int err = -1; numadbg("bootmem_init_numa()\n"); + /* Some sane defaults for numa latency values */ + for (i = 0; i < MAX_NUMNODES; i++) { + for (j = 0; j < MAX_NUMNODES; j++) + numa_latency[i][j] = (i == j) ? + LOCAL_DISTANCE : REMOTE_DISTANCE; + } + if (numa_enabled) { if (tlb_type == hypervisor) err = numa_parse_mdesc(); @@ -1325,7 +1594,7 @@ static void __init bootmem_init_nonnuma(void) (top_of_ram - total_ram) >> 20); init_node_masks_nonnuma(); - memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0); + memblock_set_node(0, PHYS_ADDR_MAX, &memblock.memory, 0); allocate_node_data(0); node_set_online(0); } @@ -1346,7 +1615,6 @@ static unsigned long __init bootmem_init(unsigned long phys_base) /* XXX cpu notifier XXX */ - sparse_memory_present_with_active_regions(MAX_NUMNODES); sparse_init(); return end_pfn; @@ -1355,9 +1623,148 @@ static unsigned long __init bootmem_init(unsigned long phys_base) static struct linux_prom64_registers pall[MAX_BANKS] __initdata; static int pall_ents __initdata; -#ifdef CONFIG_DEBUG_PAGEALLOC +static unsigned long max_phys_bits = 40; + +bool kern_addr_valid(unsigned long addr) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + if ((long)addr < 0L) { + unsigned long pa = __pa(addr); + + if ((pa >> max_phys_bits) != 0UL) + return false; + + return pfn_valid(pa >> PAGE_SHIFT); + } + + if (addr >= (unsigned long) KERNBASE && + addr < (unsigned long)&_end) + return true; + + pgd = pgd_offset_k(addr); + if (pgd_none(*pgd)) + return false; + + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) + return false; + + pud = pud_offset(p4d, addr); + if (pud_none(*pud)) + return false; + + if (pud_leaf(*pud)) + return pfn_valid(pud_pfn(*pud)); + + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) + return false; + + if (pmd_leaf(*pmd)) + return pfn_valid(pmd_pfn(*pmd)); + + pte = pte_offset_kernel(pmd, addr); + if (pte_none(*pte)) + return false; + + return pfn_valid(pte_pfn(*pte)); +} + +static unsigned long __ref kernel_map_hugepud(unsigned long vstart, + unsigned long vend, + pud_t *pud) +{ + const unsigned long mask16gb = (1UL << 34) - 1UL; + u64 pte_val = vstart; + + /* Each PUD is 8GB */ + if ((vstart & mask16gb) || + (vend - vstart <= mask16gb)) { + pte_val ^= kern_linear_pte_xor[2]; + pud_val(*pud) = pte_val | _PAGE_PUD_HUGE; + + return vstart + PUD_SIZE; + } + + pte_val ^= kern_linear_pte_xor[3]; + pte_val |= _PAGE_PUD_HUGE; + + vend = vstart + mask16gb + 1UL; + while (vstart < vend) { + pud_val(*pud) = pte_val; + + pte_val += PUD_SIZE; + vstart += PUD_SIZE; + pud++; + } + return vstart; +} + +static bool kernel_can_map_hugepud(unsigned long vstart, unsigned long vend, + bool guard) +{ + if (guard && !(vstart & ~PUD_MASK) && (vend - vstart) >= PUD_SIZE) + return true; + + return false; +} + +static unsigned long __ref kernel_map_hugepmd(unsigned long vstart, + unsigned long vend, + pmd_t *pmd) +{ + const unsigned long mask256mb = (1UL << 28) - 1UL; + const unsigned long mask2gb = (1UL << 31) - 1UL; + u64 pte_val = vstart; + + /* Each PMD is 8MB */ + if ((vstart & mask256mb) || + (vend - vstart <= mask256mb)) { + pte_val ^= kern_linear_pte_xor[0]; + pmd_val(*pmd) = pte_val | _PAGE_PMD_HUGE; + + return vstart + PMD_SIZE; + } + + if ((vstart & mask2gb) || + (vend - vstart <= mask2gb)) { + pte_val ^= kern_linear_pte_xor[1]; + pte_val |= _PAGE_PMD_HUGE; + vend = vstart + mask256mb + 1UL; + } else { + pte_val ^= kern_linear_pte_xor[2]; + pte_val |= _PAGE_PMD_HUGE; + vend = vstart + mask2gb + 1UL; + } + + while (vstart < vend) { + pmd_val(*pmd) = pte_val; + + pte_val += PMD_SIZE; + vstart += PMD_SIZE; + pmd++; + } + + return vstart; +} + +static bool kernel_can_map_hugepmd(unsigned long vstart, unsigned long vend, + bool guard) +{ + if (guard && !(vstart & ~PMD_MASK) && (vend - vstart) >= PMD_SIZE) + return true; + + return false; +} + static unsigned long __ref kernel_map_range(unsigned long pstart, - unsigned long pend, pgprot_t prot) + unsigned long pend, pgprot_t prot, + bool use_huge) { unsigned long vstart = PAGE_OFFSET + pstart; unsigned long vend = PAGE_OFFSET + pend; @@ -1372,24 +1779,62 @@ static unsigned long __ref kernel_map_range(unsigned long pstart, while (vstart < vend) { unsigned long this_end, paddr = __pa(vstart); pgd_t *pgd = pgd_offset_k(vstart); + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; - pud = pud_offset(pgd, vstart); + if (pgd_none(*pgd)) { + pud_t *new; + + new = memblock_alloc_from(PAGE_SIZE, PAGE_SIZE, + PAGE_SIZE); + if (!new) + goto err_alloc; + alloc_bytes += PAGE_SIZE; + pgd_populate(&init_mm, pgd, new); + } + + p4d = p4d_offset(pgd, vstart); + if (p4d_none(*p4d)) { + pud_t *new; + + new = memblock_alloc_from(PAGE_SIZE, PAGE_SIZE, + PAGE_SIZE); + if (!new) + goto err_alloc; + alloc_bytes += PAGE_SIZE; + p4d_populate(&init_mm, p4d, new); + } + + pud = pud_offset(p4d, vstart); if (pud_none(*pud)) { pmd_t *new; - new = __alloc_bootmem(PAGE_SIZE, PAGE_SIZE, PAGE_SIZE); + if (kernel_can_map_hugepud(vstart, vend, use_huge)) { + vstart = kernel_map_hugepud(vstart, vend, pud); + continue; + } + new = memblock_alloc_from(PAGE_SIZE, PAGE_SIZE, + PAGE_SIZE); + if (!new) + goto err_alloc; alloc_bytes += PAGE_SIZE; pud_populate(&init_mm, pud, new); } pmd = pmd_offset(pud, vstart); - if (!pmd_present(*pmd)) { + if (pmd_none(*pmd)) { pte_t *new; - new = __alloc_bootmem(PAGE_SIZE, PAGE_SIZE, PAGE_SIZE); + if (kernel_can_map_hugepmd(vstart, vend, use_huge)) { + vstart = kernel_map_hugepmd(vstart, vend, pmd); + continue; + } + new = memblock_alloc_from(PAGE_SIZE, PAGE_SIZE, + PAGE_SIZE); + if (!new) + goto err_alloc; alloc_bytes += PAGE_SIZE; pmd_populate_kernel(&init_mm, pmd, new); } @@ -1409,102 +1854,41 @@ static unsigned long __ref kernel_map_range(unsigned long pstart, } return alloc_bytes; -} - -extern unsigned int kvmap_linear_patch[1]; -#endif /* CONFIG_DEBUG_PAGEALLOC */ - -static void __init kpte_set_val(unsigned long index, unsigned long val) -{ - unsigned long *ptr = kpte_linear_bitmap; - val <<= ((index % (BITS_PER_LONG / 2)) * 2); - ptr += (index / (BITS_PER_LONG / 2)); - - *ptr |= val; +err_alloc: + panic("%s: Failed to allocate %lu bytes align=%lx from=%lx\n", + __func__, PAGE_SIZE, PAGE_SIZE, PAGE_SIZE); + return -ENOMEM; } -static const unsigned long kpte_shift_min = 28; /* 256MB */ -static const unsigned long kpte_shift_max = 34; /* 16GB */ -static const unsigned long kpte_shift_incr = 3; - -static unsigned long kpte_mark_using_shift(unsigned long start, unsigned long end, - unsigned long shift) +static void __init flush_all_kernel_tsbs(void) { - unsigned long size = (1UL << shift); - unsigned long mask = (size - 1UL); - unsigned long remains = end - start; - unsigned long val; - - if (remains < size || (start & mask)) - return start; - - /* VAL maps: - * - * shift 28 --> kern_linear_pte_xor index 1 - * shift 31 --> kern_linear_pte_xor index 2 - * shift 34 --> kern_linear_pte_xor index 3 - */ - val = ((shift - kpte_shift_min) / kpte_shift_incr) + 1; - - remains &= ~mask; - if (shift != kpte_shift_max) - remains = size; - - while (remains) { - unsigned long index = start >> kpte_shift_min; + int i; - kpte_set_val(index, val); + for (i = 0; i < KERNEL_TSB_NENTRIES; i++) { + struct tsb *ent = &swapper_tsb[i]; - start += 1UL << kpte_shift_min; - remains -= 1UL << kpte_shift_min; + ent->tag = (1UL << TSB_TAG_INVALID_BIT); } +#ifndef CONFIG_DEBUG_PAGEALLOC + for (i = 0; i < KERNEL_TSB4M_NENTRIES; i++) { + struct tsb *ent = &swapper_4m_tsb[i]; - return start; -} - -static void __init mark_kpte_bitmap(unsigned long start, unsigned long end) -{ - unsigned long smallest_size, smallest_mask; - unsigned long s; - - smallest_size = (1UL << kpte_shift_min); - smallest_mask = (smallest_size - 1UL); - - while (start < end) { - unsigned long orig_start = start; - - for (s = kpte_shift_max; s >= kpte_shift_min; s -= kpte_shift_incr) { - start = kpte_mark_using_shift(start, end, s); - - if (start != orig_start) - break; - } - - if (start == orig_start) - start = (start + smallest_size) & ~smallest_mask; + ent->tag = (1UL << TSB_TAG_INVALID_BIT); } +#endif } -static void __init init_kpte_bitmap(void) -{ - unsigned long i; - - for (i = 0; i < pall_ents; i++) { - unsigned long phys_start, phys_end; - - phys_start = pall[i].phys_addr; - phys_end = phys_start + pall[i].reg_size; - - mark_kpte_bitmap(phys_start, phys_end); - } -} +extern unsigned int kvmap_linear_patch[1]; static void __init kernel_physical_mapping_init(void) { -#ifdef CONFIG_DEBUG_PAGEALLOC unsigned long i, mem_alloced = 0UL; + bool use_huge = true; +#ifdef CONFIG_DEBUG_PAGEALLOC + use_huge = false; +#endif for (i = 0; i < pall_ents; i++) { unsigned long phys_start, phys_end; @@ -1512,7 +1896,7 @@ static void __init kernel_physical_mapping_init(void) phys_end = phys_start + pall[i].reg_size; mem_alloced += kernel_map_range(phys_start, phys_end, - PAGE_KERNEL); + PAGE_KERNEL, use_huge); } printk("Allocated %ld bytes for kernel page tables.\n", @@ -1521,18 +1905,19 @@ static void __init kernel_physical_mapping_init(void) kvmap_linear_patch[0] = 0x01000000; /* nop */ flushi(&kvmap_linear_patch[0]); + flush_all_kernel_tsbs(); + __flush_tlb_all(); -#endif } #ifdef CONFIG_DEBUG_PAGEALLOC -void kernel_map_pages(struct page *page, int numpages, int enable) +void __kernel_map_pages(struct page *page, int numpages, int enable) { unsigned long phys_start = page_to_pfn(page) << PAGE_SHIFT; unsigned long phys_end = phys_start + (numpages * PAGE_SIZE); kernel_map_range(phys_start, phys_end, - (enable ? PAGE_KERNEL : __pgprot(0))); + (enable ? PAGE_KERNEL : __pgprot(0)), false); flush_tsb_kernel_range(PAGE_OFFSET + phys_start, PAGE_OFFSET + phys_end); @@ -1557,6 +1942,91 @@ unsigned long __init find_ecache_flush_span(unsigned long size) return ~0UL; } +unsigned long PAGE_OFFSET; +EXPORT_SYMBOL(PAGE_OFFSET); + +unsigned long VMALLOC_END = 0x0000010000000000UL; +EXPORT_SYMBOL(VMALLOC_END); + +unsigned long sparc64_va_hole_top = 0xfffff80000000000UL; +unsigned long sparc64_va_hole_bottom = 0x0000080000000000UL; + +static void __init setup_page_offset(void) +{ + if (tlb_type == cheetah || tlb_type == cheetah_plus) { + /* Cheetah/Panther support a full 64-bit virtual + * address, so we can use all that our page tables + * support. + */ + sparc64_va_hole_top = 0xfff0000000000000UL; + sparc64_va_hole_bottom = 0x0010000000000000UL; + + max_phys_bits = 42; + } else if (tlb_type == hypervisor) { + switch (sun4v_chip_type) { + case SUN4V_CHIP_NIAGARA1: + case SUN4V_CHIP_NIAGARA2: + /* T1 and T2 support 48-bit virtual addresses. */ + sparc64_va_hole_top = 0xffff800000000000UL; + sparc64_va_hole_bottom = 0x0000800000000000UL; + + max_phys_bits = 39; + break; + case SUN4V_CHIP_NIAGARA3: + /* T3 supports 48-bit virtual addresses. */ + sparc64_va_hole_top = 0xffff800000000000UL; + sparc64_va_hole_bottom = 0x0000800000000000UL; + + max_phys_bits = 43; + break; + case SUN4V_CHIP_NIAGARA4: + case SUN4V_CHIP_NIAGARA5: + case SUN4V_CHIP_SPARC64X: + case SUN4V_CHIP_SPARC_M6: + /* T4 and later support 52-bit virtual addresses. */ + sparc64_va_hole_top = 0xfff8000000000000UL; + sparc64_va_hole_bottom = 0x0008000000000000UL; + max_phys_bits = 47; + break; + case SUN4V_CHIP_SPARC_M7: + case SUN4V_CHIP_SPARC_SN: + /* M7 and later support 52-bit virtual addresses. */ + sparc64_va_hole_top = 0xfff8000000000000UL; + sparc64_va_hole_bottom = 0x0008000000000000UL; + max_phys_bits = 49; + break; + case SUN4V_CHIP_SPARC_M8: + default: + /* M8 and later support 54-bit virtual addresses. + * However, restricting M8 and above VA bits to 53 + * as 4-level page table cannot support more than + * 53 VA bits. + */ + sparc64_va_hole_top = 0xfff0000000000000UL; + sparc64_va_hole_bottom = 0x0010000000000000UL; + max_phys_bits = 51; + break; + } + } + + if (max_phys_bits > MAX_PHYS_ADDRESS_BITS) { + prom_printf("MAX_PHYS_ADDRESS_BITS is too small, need %lu\n", + max_phys_bits); + prom_halt(); + } + + PAGE_OFFSET = sparc64_va_hole_top; + VMALLOC_END = ((sparc64_va_hole_bottom >> 1) + + (sparc64_va_hole_bottom >> 2)); + + pr_info("MM: PAGE_OFFSET is 0x%016lx (max_phys_bits == %lu)\n", + PAGE_OFFSET, max_phys_bits); + pr_info("MM: VMALLOC [0x%016lx --> 0x%016lx]\n", + VMALLOC_START, VMALLOC_END); + pr_info("MM: VMEMMAP [0x%016lx --> 0x%016lx]\n", + VMEMMAP_BASE, VMEMMAP_BASE << 1); +} + static void __init tsb_phys_patch(void) { struct tsb_ldquad_phys_patch_entry *pquad; @@ -1599,21 +2069,42 @@ static void __init tsb_phys_patch(void) #define NUM_KTSB_DESCR 1 #endif static struct hv_tsb_descr ktsb_descr[NUM_KTSB_DESCR]; -extern struct tsb swapper_tsb[KERNEL_TSB_NENTRIES]; + +/* The swapper TSBs are loaded with a base sequence of: + * + * sethi %uhi(SYMBOL), REG1 + * sethi %hi(SYMBOL), REG2 + * or REG1, %ulo(SYMBOL), REG1 + * or REG2, %lo(SYMBOL), REG2 + * sllx REG1, 32, REG1 + * or REG1, REG2, REG1 + * + * When we use physical addressing for the TSB accesses, we patch the + * first four instructions in the above sequence. + */ static void patch_one_ktsb_phys(unsigned int *start, unsigned int *end, unsigned long pa) { - pa >>= KTSB_PHYS_SHIFT; + unsigned long high_bits, low_bits; + + high_bits = (pa >> 32) & 0xffffffff; + low_bits = (pa >> 0) & 0xffffffff; while (start < end) { unsigned int *ia = (unsigned int *)(unsigned long)*start; - ia[0] = (ia[0] & ~0x3fffff) | (pa >> 10); + ia[0] = (ia[0] & ~0x3fffff) | (high_bits >> 10); __asm__ __volatile__("flush %0" : : "r" (ia)); - ia[1] = (ia[1] & ~0x3ff) | (pa & 0x3ff); + ia[1] = (ia[1] & ~0x3fffff) | (low_bits >> 10); __asm__ __volatile__("flush %0" : : "r" (ia + 1)); + ia[2] = (ia[2] & ~0x1fff) | (high_bits & 0x3ff); + __asm__ __volatile__("flush %0" : : "r" (ia + 2)); + + ia[3] = (ia[3] & ~0x1fff) | (low_bits & 0x3ff); + __asm__ __volatile__("flush %0" : : "r" (ia + 3)); + start++; } } @@ -1694,7 +2185,7 @@ static void __init sun4v_ktsb_init(void) #endif } -void __cpuinit sun4v_ktsb_register(void) +void sun4v_ktsb_register(void) { unsigned long pa, ret; @@ -1719,11 +2210,26 @@ static void __init sun4u_linear_pte_xor_finalize(void) static void __init sun4v_linear_pte_xor_finalize(void) { + unsigned long pagecv_flag; + + /* Bit 9 of TTE is no longer CV bit on M7 processor and it instead + * enables MCD error. Do not set bit 9 on M7 processor. + */ + switch (sun4v_chip_type) { + case SUN4V_CHIP_SPARC_M7: + case SUN4V_CHIP_SPARC_M8: + case SUN4V_CHIP_SPARC_SN: + pagecv_flag = 0x00; + break; + default: + pagecv_flag = _PAGE_CV_4V; + break; + } #ifndef CONFIG_DEBUG_PAGEALLOC if (cpu_pgsz_mask & HV_PGSZ_MASK_256MB) { kern_linear_pte_xor[1] = (_PAGE_VALID | _PAGE_SZ256MB_4V) ^ - 0xfffff80000000000UL; - kern_linear_pte_xor[1] |= (_PAGE_CP_4V | _PAGE_CV_4V | + PAGE_OFFSET; + kern_linear_pte_xor[1] |= (_PAGE_CP_4V | pagecv_flag | _PAGE_P_4V | _PAGE_W_4V); } else { kern_linear_pte_xor[1] = kern_linear_pte_xor[0]; @@ -1731,8 +2237,8 @@ static void __init sun4v_linear_pte_xor_finalize(void) if (cpu_pgsz_mask & HV_PGSZ_MASK_2GB) { kern_linear_pte_xor[2] = (_PAGE_VALID | _PAGE_SZ2GB_4V) ^ - 0xfffff80000000000UL; - kern_linear_pte_xor[2] |= (_PAGE_CP_4V | _PAGE_CV_4V | + PAGE_OFFSET; + kern_linear_pte_xor[2] |= (_PAGE_CP_4V | pagecv_flag | _PAGE_P_4V | _PAGE_W_4V); } else { kern_linear_pte_xor[2] = kern_linear_pte_xor[1]; @@ -1740,8 +2246,8 @@ static void __init sun4v_linear_pte_xor_finalize(void) if (cpu_pgsz_mask & HV_PGSZ_MASK_16GB) { kern_linear_pte_xor[3] = (_PAGE_VALID | _PAGE_SZ16GB_4V) ^ - 0xfffff80000000000UL; - kern_linear_pte_xor[3] |= (_PAGE_CP_4V | _PAGE_CV_4V | + PAGE_OFFSET; + kern_linear_pte_xor[3] |= (_PAGE_CP_4V | pagecv_flag | _PAGE_P_4V | _PAGE_W_4V); } else { kern_linear_pte_xor[3] = kern_linear_pte_xor[2]; @@ -1752,22 +2258,39 @@ static void __init sun4v_linear_pte_xor_finalize(void) /* paging_init() sets up the page tables */ static unsigned long last_valid_pfn; -pgd_t swapper_pg_dir[2048]; static void sun4u_pgprot_init(void); static void sun4v_pgprot_init(void); +#define _PAGE_CACHE_4U (_PAGE_CP_4U | _PAGE_CV_4U) +#define _PAGE_CACHE_4V (_PAGE_CP_4V | _PAGE_CV_4V) +#define __DIRTY_BITS_4U (_PAGE_MODIFIED_4U | _PAGE_WRITE_4U | _PAGE_W_4U) +#define __DIRTY_BITS_4V (_PAGE_MODIFIED_4V | _PAGE_WRITE_4V | _PAGE_W_4V) +#define __ACCESS_BITS_4U (_PAGE_ACCESSED_4U | _PAGE_READ_4U | _PAGE_R) +#define __ACCESS_BITS_4V (_PAGE_ACCESSED_4V | _PAGE_READ_4V | _PAGE_R) + +/* We need to exclude reserved regions. This exclusion will include + * vmlinux and initrd. To be more precise the initrd size could be used to + * compute a new lower limit because it is freed later during initialization. + */ +static void __init reduce_memory(phys_addr_t limit_ram) +{ + limit_ram += memblock_reserved_size(); + memblock_enforce_memory_limit(limit_ram); +} + void __init paging_init(void) { unsigned long end_pfn, shift, phys_base; unsigned long real_end, i; - int node; + + setup_page_offset(); /* These build time checkes make sure that the dcache_dirty_cpu() - * page->flags usage will work. + * folio->flags usage will work. * * When a page gets marked as dcache-dirty, we store the - * cpu number starting at bit 32 in the page->flags. Also, + * cpu number starting at bit 32 in the folio->flags. Also, * functions like clear_dcache_dirty_cpu use the cpu mask * in 13-bit signed-immediate instruction fields. */ @@ -1788,7 +2311,7 @@ void __init paging_init(void) BUILD_BUG_ON(NR_CPUS > 4096); - kern_base = (prom_boot_mapping_phys_low >> 22UL) << 22UL; + kern_base = (prom_boot_mapping_phys_low >> ILOG2_4MB) << ILOG2_4MB; kern_size = (unsigned long)&_end - (unsigned long)KERNBASE; /* Invalidate both kernel TSBs. */ @@ -1797,6 +2320,27 @@ void __init paging_init(void) memset(swapper_4m_tsb, 0x40, sizeof(swapper_4m_tsb)); #endif + /* TTE.cv bit on sparc v9 occupies the same position as TTE.mcde + * bit on M7 processor. This is a conflicting usage of the same + * bit. Enabling TTE.cv on M7 would turn on Memory Corruption + * Detection error on all pages and this will lead to problems + * later. Kernel does not run with MCD enabled and hence rest + * of the required steps to fully configure memory corruption + * detection are not taken. We need to ensure TTE.mcde is not + * set on M7 processor. Compute the value of cacheability + * flag for use later taking this into consideration. + */ + switch (sun4v_chip_type) { + case SUN4V_CHIP_SPARC_M7: + case SUN4V_CHIP_SPARC_M8: + case SUN4V_CHIP_SPARC_SN: + page_cache4v_flag = _PAGE_CP_4V; + break; + default: + page_cache4v_flag = _PAGE_CACHE_4V; + break; + } + if (tlb_type == hypervisor) sun4v_pgprot_init(); else @@ -1834,7 +2378,8 @@ void __init paging_init(void) find_ramdisk(phys_base); - memblock_enforce_memory_limit(cmdline_memory_size); + if (cmdline_memory_size) + reduce_memory(cmdline_memory_size); memblock_allow_resize(); memblock_dump_all(); @@ -1844,7 +2389,7 @@ void __init paging_init(void) shift = kern_base + PAGE_OFFSET - ((unsigned long)KERNBASE); real_end = (unsigned long)_end; - num_kernel_image_mappings = DIV_ROUND_UP(real_end - KERNBASE, 1 << 22); + num_kernel_image_mappings = DIV_ROUND_UP(real_end - KERNBASE, 1 << ILOG2_4MB); printk("Kernel: Using %d locked TLB entries for main kernel image.\n", num_kernel_image_mappings); @@ -1853,16 +2398,10 @@ void __init paging_init(void) */ init_mm.pgd += ((shift) / (sizeof(pgd_t))); - memset(swapper_low_pmd_dir, 0, sizeof(swapper_low_pmd_dir)); + memset(swapper_pg_dir, 0, sizeof(swapper_pg_dir)); - /* Now can init the kernel/bad page tables. */ - pud_set(pud_offset(&swapper_pg_dir[0], 0), - swapper_low_pmd_dir + (shift / sizeof(pgd_t))); - inherit_prom_mappings(); - init_kpte_bitmap(); - /* Ok, we can use our TLB miss and window trap handlers safely. */ setup_tba(); @@ -1913,21 +2452,6 @@ void __init paging_init(void) /* Setup bootmem... */ last_valid_pfn = end_pfn = bootmem_init(phys_base); - /* Once the OF device tree and MDESC have been setup, we know - * the list of possible cpus. Therefore we can allocate the - * IRQ stacks. - */ - for_each_possible_cpu(i) { - node = cpu_to_node(i); - - softirq_stack[i] = __alloc_bootmem_node(NODE_DATA(node), - THREAD_SIZE, - THREAD_SIZE, 0); - hardirq_stack[i] = __alloc_bootmem_node(NODE_DATA(node), - THREAD_SIZE, - THREAD_SIZE, 0); - } - kernel_physical_mapping_init(); { @@ -1937,7 +2461,7 @@ void __init paging_init(void) max_zone_pfns[ZONE_NORMAL] = end_pfn; - free_area_init_nodes(max_zone_pfns); + free_area_init(max_zone_pfns); } printk("Booting Linux...\n"); @@ -1969,73 +2493,9 @@ int page_in_phys_avail(unsigned long paddr) return 0; } -static struct linux_prom64_registers pavail_rescan[MAX_BANKS] __initdata; -static int pavail_rescan_ents __initdata; - -/* Certain OBP calls, such as fetching "available" properties, can - * claim physical memory. So, along with initializing the valid - * address bitmap, what we do here is refetch the physical available - * memory list again, and make sure it provides at least as much - * memory as 'pavail' does. - */ -static void __init setup_valid_addr_bitmap_from_pavail(unsigned long *bitmap) -{ - int i; - - read_obp_memory("available", &pavail_rescan[0], &pavail_rescan_ents); - - for (i = 0; i < pavail_ents; i++) { - unsigned long old_start, old_end; - - old_start = pavail[i].phys_addr; - old_end = old_start + pavail[i].reg_size; - while (old_start < old_end) { - int n; - - for (n = 0; n < pavail_rescan_ents; n++) { - unsigned long new_start, new_end; - - new_start = pavail_rescan[n].phys_addr; - new_end = new_start + - pavail_rescan[n].reg_size; - - if (new_start <= old_start && - new_end >= (old_start + PAGE_SIZE)) { - set_bit(old_start >> 22, bitmap); - goto do_next_page; - } - } - - prom_printf("mem_init: Lost memory in pavail\n"); - prom_printf("mem_init: OLD start[%lx] size[%lx]\n", - pavail[i].phys_addr, - pavail[i].reg_size); - prom_printf("mem_init: NEW start[%lx] size[%lx]\n", - pavail_rescan[i].phys_addr, - pavail_rescan[i].reg_size); - prom_printf("mem_init: Cannot continue, aborting.\n"); - prom_halt(); - - do_next_page: - old_start += PAGE_SIZE; - } - } -} - -static void __init patch_tlb_miss_handler_bitmap(void) -{ - extern unsigned int valid_addr_bitmap_insn[]; - extern unsigned int valid_addr_bitmap_patch[]; - - valid_addr_bitmap_insn[1] = valid_addr_bitmap_patch[1]; - mb(); - valid_addr_bitmap_insn[0] = valid_addr_bitmap_patch[0]; - flushi(&valid_addr_bitmap_insn[0]); -} - static void __init register_page_bootmem_info(void) { -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA int i; for_each_online_node(i) @@ -2045,22 +2505,13 @@ static void __init register_page_bootmem_info(void) } void __init mem_init(void) { - unsigned long addr, last; - - addr = PAGE_OFFSET + kern_base; - last = PAGE_ALIGN(kern_size) + addr; - while (addr < last) { - set_bit(__pa(addr) >> 22, sparc64_valid_addr_bitmap); - addr += PAGE_SIZE; - } - - setup_valid_addr_bitmap_from_pavail(sparc64_valid_addr_bitmap); - patch_tlb_miss_handler_bitmap(); - - high_memory = __va(last_valid_pfn << PAGE_SHIFT); - + /* + * Must be done after boot memory is put on freelist, because here we + * might set fields in deferred struct pages that have not yet been + * initialized, and memblock_free_all() initializes all the reserved + * deferred pages for us. + */ register_page_bootmem_info(); - free_all_bootmem(); /* * Set up the zero page, mark it reserved, so that page count @@ -2073,7 +2524,6 @@ void __init mem_init(void) } mark_page_reserved(mem_map_zero); - mem_init_print_info(NULL); if (tlb_type == cheetah || tlb_type == cheetah_plus) cheetah_ecache_flush_init(); @@ -2110,21 +2560,6 @@ void free_initmem(void) } } -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM, - "initrd"); -} -#endif - -#define _PAGE_CACHE_4U (_PAGE_CP_4U | _PAGE_CV_4U) -#define _PAGE_CACHE_4V (_PAGE_CP_4V | _PAGE_CV_4V) -#define __DIRTY_BITS_4U (_PAGE_MODIFIED_4U | _PAGE_WRITE_4U | _PAGE_W_4U) -#define __DIRTY_BITS_4V (_PAGE_MODIFIED_4V | _PAGE_WRITE_4V | _PAGE_W_4V) -#define __ACCESS_BITS_4U (_PAGE_ACCESSED_4U | _PAGE_READ_4U | _PAGE_R) -#define __ACCESS_BITS_4V (_PAGE_ACCESSED_4V | _PAGE_READ_4V | _PAGE_R) - pgprot_t PAGE_KERNEL __read_mostly; EXPORT_SYMBOL(PAGE_KERNEL); @@ -2146,18 +2581,9 @@ unsigned long _PAGE_CACHE __read_mostly; EXPORT_SYMBOL(_PAGE_CACHE); #ifdef CONFIG_SPARSEMEM_VMEMMAP -unsigned long vmemmap_table[VMEMMAP_SIZE]; - -static long __meminitdata addr_start, addr_end; -static int __meminitdata node_start; - int __meminit vmemmap_populate(unsigned long vstart, unsigned long vend, - int node) + int node, struct vmem_altmap *altmap) { - unsigned long phys_start = (vstart - VMEMMAP_BASE); - unsigned long phys_end = (vend - VMEMMAP_BASE); - unsigned long addr = phys_start & VMEMMAP_CHUNK_MASK; - unsigned long end = VMEMMAP_ALIGN(phys_end); unsigned long pte_base; pte_base = (_PAGE_VALID | _PAGE_SZ4MB_4U | @@ -2165,52 +2591,49 @@ int __meminit vmemmap_populate(unsigned long vstart, unsigned long vend, _PAGE_P_4U | _PAGE_W_4U); if (tlb_type == hypervisor) pte_base = (_PAGE_VALID | _PAGE_SZ4MB_4V | - _PAGE_CP_4V | _PAGE_CV_4V | - _PAGE_P_4V | _PAGE_W_4V); + page_cache4v_flag | _PAGE_P_4V | _PAGE_W_4V); + + pte_base |= _PAGE_PMD_HUGE; + + vstart = vstart & PMD_MASK; + vend = ALIGN(vend, PMD_SIZE); + for (; vstart < vend; vstart += PMD_SIZE) { + pgd_t *pgd = vmemmap_pgd_populate(vstart, node); + unsigned long pte; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; - for (; addr < end; addr += VMEMMAP_CHUNK) { - unsigned long *vmem_pp = - vmemmap_table + (addr >> VMEMMAP_CHUNK_SHIFT); - void *block; + if (!pgd) + return -ENOMEM; + + p4d = vmemmap_p4d_populate(pgd, vstart, node); + if (!p4d) + return -ENOMEM; + + pud = vmemmap_pud_populate(p4d, vstart, node); + if (!pud) + return -ENOMEM; + + pmd = pmd_offset(pud, vstart); + pte = pmd_val(*pmd); + if (!(pte & _PAGE_VALID)) { + void *block = vmemmap_alloc_block(PMD_SIZE, node); - if (!(*vmem_pp & _PAGE_VALID)) { - block = vmemmap_alloc_block(1UL << 22, node); if (!block) return -ENOMEM; - *vmem_pp = pte_base | __pa(block); - - /* check to see if we have contiguous blocks */ - if (addr_end != addr || node_start != node) { - if (addr_start) - printk(KERN_DEBUG " [%lx-%lx] on node %d\n", - addr_start, addr_end-1, node_start); - addr_start = addr; - node_start = node; - } - addr_end = addr + VMEMMAP_CHUNK; + pmd_val(*pmd) = pte_base | __pa(block); } } - return 0; -} -void __meminit vmemmap_populate_print_last(void) -{ - if (addr_start) { - printk(KERN_DEBUG " [%lx-%lx] on node %d\n", - addr_start, addr_end-1, node_start); - addr_start = 0; - addr_end = 0; - node_start = 0; - } -} - -void vmemmap_free(unsigned long start, unsigned long end) -{ + return 0; } - #endif /* CONFIG_SPARSEMEM_VMEMMAP */ +/* These are actually filled in at boot time by sun4{u,v}_pgprot_init() */ +static pgprot_t protection_map[16] __ro_after_init; + static void prot_init_common(unsigned long page_none, unsigned long page_shared, unsigned long page_copy, @@ -2261,10 +2684,10 @@ static void __init sun4u_pgprot_init(void) __ACCESS_BITS_4U | _PAGE_E_4U); #ifdef CONFIG_DEBUG_PAGEALLOC - kern_linear_pte_xor[0] = _PAGE_VALID ^ 0xfffff80000000000UL; + kern_linear_pte_xor[0] = _PAGE_VALID ^ PAGE_OFFSET; #else kern_linear_pte_xor[0] = (_PAGE_VALID | _PAGE_SZ4MB_4U) ^ - 0xfffff80000000000UL; + PAGE_OFFSET; #endif kern_linear_pte_xor[0] |= (_PAGE_CP_4U | _PAGE_CV_4U | _PAGE_P_4U | _PAGE_W_4U); @@ -2298,23 +2721,23 @@ static void __init sun4v_pgprot_init(void) int i; PAGE_KERNEL = __pgprot (_PAGE_PRESENT_4V | _PAGE_VALID | - _PAGE_CACHE_4V | _PAGE_P_4V | + page_cache4v_flag | _PAGE_P_4V | __ACCESS_BITS_4V | __DIRTY_BITS_4V | _PAGE_EXEC_4V); PAGE_KERNEL_LOCKED = PAGE_KERNEL; _PAGE_IE = _PAGE_IE_4V; _PAGE_E = _PAGE_E_4V; - _PAGE_CACHE = _PAGE_CACHE_4V; + _PAGE_CACHE = page_cache4v_flag; #ifdef CONFIG_DEBUG_PAGEALLOC - kern_linear_pte_xor[0] = _PAGE_VALID ^ 0xfffff80000000000UL; + kern_linear_pte_xor[0] = _PAGE_VALID ^ PAGE_OFFSET; #else kern_linear_pte_xor[0] = (_PAGE_VALID | _PAGE_SZ4MB_4V) ^ - 0xfffff80000000000UL; + PAGE_OFFSET; #endif - kern_linear_pte_xor[0] |= (_PAGE_CP_4V | _PAGE_CV_4V | - _PAGE_P_4V | _PAGE_W_4V); + kern_linear_pte_xor[0] |= (page_cache4v_flag | _PAGE_P_4V | + _PAGE_W_4V); for (i = 1; i < 4; i++) kern_linear_pte_xor[i] = kern_linear_pte_xor[0]; @@ -2327,12 +2750,12 @@ static void __init sun4v_pgprot_init(void) _PAGE_SZ4MB_4V | _PAGE_SZ512K_4V | _PAGE_SZ64K_4V | _PAGE_SZ8K_4V); - page_none = _PAGE_PRESENT_4V | _PAGE_ACCESSED_4V | _PAGE_CACHE_4V; - page_shared = (_PAGE_VALID | _PAGE_PRESENT_4V | _PAGE_CACHE_4V | + page_none = _PAGE_PRESENT_4V | _PAGE_ACCESSED_4V | page_cache4v_flag; + page_shared = (_PAGE_VALID | _PAGE_PRESENT_4V | page_cache4v_flag | __ACCESS_BITS_4V | _PAGE_WRITE_4V | _PAGE_EXEC_4V); - page_copy = (_PAGE_VALID | _PAGE_PRESENT_4V | _PAGE_CACHE_4V | + page_copy = (_PAGE_VALID | _PAGE_PRESENT_4V | page_cache4v_flag | __ACCESS_BITS_4V | _PAGE_EXEC_4V); - page_readonly = (_PAGE_VALID | _PAGE_PRESENT_4V | _PAGE_CACHE_4V | + page_readonly = (_PAGE_VALID | _PAGE_PRESENT_4V | page_cache4v_flag | __ACCESS_BITS_4V | _PAGE_EXEC_4V); page_exec_bit = _PAGE_EXEC_4V; @@ -2390,7 +2813,7 @@ static unsigned long kern_large_tte(unsigned long paddr) _PAGE_EXEC_4U | _PAGE_L_4U | _PAGE_W_4U); if (tlb_type == hypervisor) val = (_PAGE_VALID | _PAGE_SZ4MB_4V | - _PAGE_CP_4V | _PAGE_CV_4V | _PAGE_P_4V | + page_cache4v_flag | _PAGE_P_4V | _PAGE_EXEC_4V | _PAGE_W_4V); return val | paddr; @@ -2455,92 +2878,40 @@ void __flush_tlb_all(void) : : "r" (pstate)); } -static pte_t *get_from_cache(struct mm_struct *mm) +static pte_t *__pte_alloc_one(struct mm_struct *mm) { - struct page *page; - pte_t *ret; + struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL | __GFP_ZERO, 0); - spin_lock(&mm->page_table_lock); - page = mm->context.pgtable_page; - ret = NULL; - if (page) { - void *p = page_address(page); - - mm->context.pgtable_page = NULL; - - ret = (pte_t *) (p + (PAGE_SIZE / 2)); + if (!ptdesc) + return NULL; + if (!pagetable_pte_ctor(mm, ptdesc)) { + pagetable_free(ptdesc); + return NULL; } - spin_unlock(&mm->page_table_lock); - - return ret; + return ptdesc_address(ptdesc); } -static struct page *__alloc_for_cache(struct mm_struct *mm) +pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { - struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK | - __GFP_REPEAT | __GFP_ZERO); - - if (page) { - spin_lock(&mm->page_table_lock); - if (!mm->context.pgtable_page) { - atomic_set(&page->_count, 2); - mm->context.pgtable_page = page; - } - spin_unlock(&mm->page_table_lock); - } - return page; + return __pte_alloc_one(mm); } -pte_t *pte_alloc_one_kernel(struct mm_struct *mm, - unsigned long address) +pgtable_t pte_alloc_one(struct mm_struct *mm) { - struct page *page; - pte_t *pte; - - pte = get_from_cache(mm); - if (pte) - return pte; - - page = __alloc_for_cache(mm); - if (page) - pte = (pte_t *) page_address(page); - - return pte; + return __pte_alloc_one(mm); } -pgtable_t pte_alloc_one(struct mm_struct *mm, - unsigned long address) +static void __pte_free(pgtable_t pte) { - struct page *page; - pte_t *pte; - - pte = get_from_cache(mm); - if (pte) - return pte; - - page = __alloc_for_cache(mm); - if (page) { - pgtable_page_ctor(page); - pte = (pte_t *) page_address(page); - } + struct ptdesc *ptdesc = virt_to_ptdesc(pte); - return pte; + pagetable_dtor(ptdesc); + pagetable_free(ptdesc); } void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { - struct page *page = virt_to_page(pte); - if (put_page_testzero(page)) - free_hot_cold_page(page, 0); -} - -static void __pte_free(pgtable_t pte) -{ - struct page *page = virt_to_page(pte); - if (put_page_testzero(page)) { - pgtable_page_dtor(page); - free_hot_cold_page(page, 0); - } + __pte_free(pte); } void pte_free(struct mm_struct *mm, pgtable_t pte) @@ -2557,92 +2928,20 @@ void pgtable_free(void *table, bool is_page) } #ifdef CONFIG_TRANSPARENT_HUGEPAGE -static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot, bool for_modify) +static void pte_free_now(struct rcu_head *head) { - if (pgprot_val(pgprot) & _PAGE_VALID) - pmd_val(pmd) |= PMD_HUGE_PRESENT; - if (tlb_type == hypervisor) { - if (pgprot_val(pgprot) & _PAGE_WRITE_4V) - pmd_val(pmd) |= PMD_HUGE_WRITE; - if (pgprot_val(pgprot) & _PAGE_EXEC_4V) - pmd_val(pmd) |= PMD_HUGE_EXEC; - - if (!for_modify) { - if (pgprot_val(pgprot) & _PAGE_ACCESSED_4V) - pmd_val(pmd) |= PMD_HUGE_ACCESSED; - if (pgprot_val(pgprot) & _PAGE_MODIFIED_4V) - pmd_val(pmd) |= PMD_HUGE_DIRTY; - } - } else { - if (pgprot_val(pgprot) & _PAGE_WRITE_4U) - pmd_val(pmd) |= PMD_HUGE_WRITE; - if (pgprot_val(pgprot) & _PAGE_EXEC_4U) - pmd_val(pmd) |= PMD_HUGE_EXEC; - - if (!for_modify) { - if (pgprot_val(pgprot) & _PAGE_ACCESSED_4U) - pmd_val(pmd) |= PMD_HUGE_ACCESSED; - if (pgprot_val(pgprot) & _PAGE_MODIFIED_4U) - pmd_val(pmd) |= PMD_HUGE_DIRTY; - } - } - - return pmd; -} - -pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) -{ - pmd_t pmd; - - pmd_val(pmd) = (page_nr << ((PAGE_SHIFT - PMD_PADDR_SHIFT))); - pmd_val(pmd) |= PMD_ISHUGE; - pmd = pmd_set_protbits(pmd, pgprot, false); - return pmd; -} + struct page *page; -pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) -{ - pmd_val(pmd) &= ~(PMD_HUGE_PRESENT | - PMD_HUGE_WRITE | - PMD_HUGE_EXEC); - pmd = pmd_set_protbits(pmd, newprot, true); - return pmd; + page = container_of(head, struct page, rcu_head); + __pte_free((pgtable_t)page_address(page)); } -pgprot_t pmd_pgprot(pmd_t entry) +void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable) { - unsigned long pte = 0; - - if (pmd_val(entry) & PMD_HUGE_PRESENT) - pte |= _PAGE_VALID; - - if (tlb_type == hypervisor) { - if (pmd_val(entry) & PMD_HUGE_PRESENT) - pte |= _PAGE_PRESENT_4V; - if (pmd_val(entry) & PMD_HUGE_EXEC) - pte |= _PAGE_EXEC_4V; - if (pmd_val(entry) & PMD_HUGE_WRITE) - pte |= _PAGE_W_4V; - if (pmd_val(entry) & PMD_HUGE_ACCESSED) - pte |= _PAGE_ACCESSED_4V; - if (pmd_val(entry) & PMD_HUGE_DIRTY) - pte |= _PAGE_MODIFIED_4V; - pte |= _PAGE_CP_4V|_PAGE_CV_4V; - } else { - if (pmd_val(entry) & PMD_HUGE_PRESENT) - pte |= _PAGE_PRESENT_4U; - if (pmd_val(entry) & PMD_HUGE_EXEC) - pte |= _PAGE_EXEC_4U; - if (pmd_val(entry) & PMD_HUGE_WRITE) - pte |= _PAGE_W_4U; - if (pmd_val(entry) & PMD_HUGE_ACCESSED) - pte |= _PAGE_ACCESSED_4U; - if (pmd_val(entry) & PMD_HUGE_DIRTY) - pte |= _PAGE_MODIFIED_4U; - pte |= _PAGE_CP_4U|_PAGE_CV_4U; - } + struct page *page; - return __pgprot(pte); + page = virt_to_page(pgtable); + call_rcu(&page->rcu_head, pte_free_now); } void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, @@ -2651,30 +2950,25 @@ void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, unsigned long pte, flags; struct mm_struct *mm; pmd_t entry = *pmd; - pgprot_t prot; - if (!pmd_large(entry) || !pmd_young(entry)) + if (!pmd_leaf(entry) || !pmd_young(entry)) return; - pte = (pmd_val(entry) & ~PMD_HUGE_PROTBITS); - pte <<= PMD_PADDR_SHIFT; - pte |= _PAGE_VALID; + pte = pmd_val(entry); - prot = pmd_pgprot(entry); - - if (tlb_type == hypervisor) - pgprot_val(prot) |= _PAGE_SZHUGE_4V; - else - pgprot_val(prot) |= _PAGE_SZHUGE_4U; + /* Don't insert a non-valid PMD into the TSB, we'll deadlock. */ + if (!(pte & _PAGE_VALID)) + return; - pte |= pgprot_val(prot); + /* We are fabricating 8MB pages using 4MB real hw pages. */ + pte |= (addr & (1UL << REAL_HPAGE_SHIFT)); mm = vma->vm_mm; spin_lock_irqsave(&mm->context.lock, flags); if (mm->context.tsb_block[MM_TSB_HUGE].tsb != NULL) - __update_mmu_tsb_insert(mm, MM_TSB_HUGE, HPAGE_SHIFT, + __update_mmu_tsb_insert(mm, MM_TSB_HUGE, REAL_HPAGE_SHIFT, addr, pte); spin_unlock_irqrestore(&mm->context.lock, flags); @@ -2695,7 +2989,7 @@ void hugetlb_setup(struct pt_regs *regs) struct mm_struct *mm = current->mm; struct tsb_config *tp; - if (in_atomic() || !mm) { + if (faulthandler_disabled() || !mm) { const struct exception_table_entry *entry; entry = search_exception_tables(regs->tpc); @@ -2719,9 +3013,10 @@ void hugetlb_setup(struct pt_regs *regs) * the Data-TLB for huge pages. */ if (tlb_type == cheetah_plus) { + bool need_context_reload = false; unsigned long ctx; - spin_lock(&ctx_alloc_lock); + spin_lock_irq(&ctx_alloc_lock); ctx = mm->context.sparc64_ctx_val; ctx &= ~CTX_PGSZ_MASK; ctx |= CTX_PGSZ_BASE << CTX_PGSZ0_SHIFT; @@ -2740,9 +3035,180 @@ void hugetlb_setup(struct pt_regs *regs) * also executing in this address space. */ mm->context.sparc64_ctx_val = ctx; + need_context_reload = true; + } + spin_unlock_irq(&ctx_alloc_lock); + + if (need_context_reload) on_each_cpu(context_reload, mm, 0); + } +} +#endif + +static struct resource code_resource = { + .name = "Kernel code", + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM +}; + +static struct resource data_resource = { + .name = "Kernel data", + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM +}; + +static struct resource bss_resource = { + .name = "Kernel bss", + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM +}; + +static inline resource_size_t compute_kern_paddr(void *addr) +{ + return (resource_size_t) (addr - KERNBASE + kern_base); +} + +static void __init kernel_lds_init(void) +{ + code_resource.start = compute_kern_paddr(_text); + code_resource.end = compute_kern_paddr(_etext - 1); + data_resource.start = compute_kern_paddr(_etext); + data_resource.end = compute_kern_paddr(_edata - 1); + bss_resource.start = compute_kern_paddr(__bss_start); + bss_resource.end = compute_kern_paddr(_end - 1); +} + +static int __init report_memory(void) +{ + int i; + struct resource *res; + + kernel_lds_init(); + + for (i = 0; i < pavail_ents; i++) { + res = kzalloc(sizeof(struct resource), GFP_KERNEL); + + if (!res) { + pr_warn("Failed to allocate source.\n"); + break; + } + + res->name = "System RAM"; + res->start = pavail[i].phys_addr; + res->end = pavail[i].phys_addr + pavail[i].reg_size - 1; + res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM; + + if (insert_resource(&iomem_resource, res) < 0) { + pr_warn("Resource insertion failed.\n"); + break; } - spin_unlock(&ctx_alloc_lock); + + insert_resource(res, &code_resource); + insert_resource(res, &data_resource); + insert_resource(res, &bss_resource); } + + return 0; } +arch_initcall(report_memory); + +#ifdef CONFIG_SMP +#define do_flush_tlb_kernel_range smp_flush_tlb_kernel_range +#else +#define do_flush_tlb_kernel_range __flush_tlb_kernel_range #endif + +void flush_tlb_kernel_range(unsigned long start, unsigned long end) +{ + if (start < HI_OBP_ADDRESS && end > LOW_OBP_ADDRESS) { + if (start < LOW_OBP_ADDRESS) { + flush_tsb_kernel_range(start, LOW_OBP_ADDRESS); + do_flush_tlb_kernel_range(start, LOW_OBP_ADDRESS); + } + if (end > HI_OBP_ADDRESS) { + flush_tsb_kernel_range(HI_OBP_ADDRESS, end); + do_flush_tlb_kernel_range(HI_OBP_ADDRESS, end); + } + } else { + flush_tsb_kernel_range(start, end); + do_flush_tlb_kernel_range(start, end); + } +} + +void copy_user_highpage(struct page *to, struct page *from, + unsigned long vaddr, struct vm_area_struct *vma) +{ + char *vfrom, *vto; + + vfrom = kmap_atomic(from); + vto = kmap_atomic(to); + copy_user_page(vto, vfrom, vaddr, to); + kunmap_atomic(vto); + kunmap_atomic(vfrom); + + /* If this page has ADI enabled, copy over any ADI tags + * as well + */ + if (vma->vm_flags & VM_SPARC_ADI) { + unsigned long pfrom, pto, i, adi_tag; + + pfrom = page_to_phys(from); + pto = page_to_phys(to); + + for (i = pfrom; i < (pfrom + PAGE_SIZE); i += adi_blksize()) { + asm volatile("ldxa [%1] %2, %0\n\t" + : "=r" (adi_tag) + : "r" (i), "i" (ASI_MCD_REAL)); + asm volatile("stxa %0, [%1] %2\n\t" + : + : "r" (adi_tag), "r" (pto), + "i" (ASI_MCD_REAL)); + pto += adi_blksize(); + } + asm volatile("membar #Sync\n\t"); + } +} +EXPORT_SYMBOL(copy_user_highpage); + +void copy_highpage(struct page *to, struct page *from) +{ + char *vfrom, *vto; + + vfrom = kmap_atomic(from); + vto = kmap_atomic(to); + copy_page(vto, vfrom); + kunmap_atomic(vto); + kunmap_atomic(vfrom); + + /* If this platform is ADI enabled, copy any ADI tags + * as well + */ + if (adi_capable()) { + unsigned long pfrom, pto, i, adi_tag; + + pfrom = page_to_phys(from); + pto = page_to_phys(to); + + for (i = pfrom; i < (pfrom + PAGE_SIZE); i += adi_blksize()) { + asm volatile("ldxa [%1] %2, %0\n\t" + : "=r" (adi_tag) + : "r" (i), "i" (ASI_MCD_REAL)); + asm volatile("stxa %0, [%1] %2\n\t" + : + : "r" (adi_tag), "r" (pto), + "i" (ASI_MCD_REAL)); + pto += adi_blksize(); + } + asm volatile("membar #Sync\n\t"); + } +} +EXPORT_SYMBOL(copy_highpage); + +pgprot_t vm_get_page_prot(vm_flags_t vm_flags) +{ + unsigned long prot = pgprot_val(protection_map[vm_flags & + (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]); + + if (vm_flags & VM_SPARC_ADI) + prot |= _PAGE_MCD_4V; + + return __pgprot(prot); +} +EXPORT_SYMBOL(vm_get_page_prot); |
