From 0bdf525f04afd3a32c14e5a8778771f9c9e0f074 Mon Sep 17 00:00:00 2001
From: Alexander Duyck
Date: Fri, 16 Nov 2012 13:53:51 -0800
Subject: x86: Improve __phys_addr performance by making use of carry flags
 and inlining

This patch is meant to improve overall system performance when making use
of the __phys_addr call. To do this I have implemented several changes.

First, if CONFIG_DEBUG_VIRTUAL is not defined, __phys_addr is made an
inline, similar to how this is currently handled on 32 bit. However, in
order to do this it is required to export phys_base so that it is
available if __phys_addr is used in kernel modules.

The second change was to streamline the code by making use of the carry
flag on an add operation instead of performing a compare on a 64 bit
value. The advantage to this is that it allows us to significantly reduce
the overall size of the call. On my Xeon E5 system the entire __phys_addr
inline call consumes a little less than 32 bytes and 5 instructions.

I also applied similar logic to the debug version of the function. My
testing shows that the debug version of the function with this patch
applied is slightly faster than the non-debug version without the patch.

Finally, I also applied the same logic changes to __virt_addr_valid since
it used the same general code flow as __phys_addr and could achieve
similar gains through these changes.

Signed-off-by: Alexander Duyck
Link: http://lkml.kernel.org/r/20121116215315.8521.46270.stgit@ahduyck-cp1.jf.intel.com
Signed-off-by: H. Peter Anvin
---
 arch/x86/mm/physaddr.c | 40 +++++++++++++++++++++++++---------------
 1 file changed, 25 insertions(+), 15 deletions(-)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/physaddr.c b/arch/x86/mm/physaddr.c
index d2e2735327b4..fd40d75109ef 100644
--- a/arch/x86/mm/physaddr.c
+++ b/arch/x86/mm/physaddr.c
@@ -8,33 +8,43 @@

 #ifdef CONFIG_X86_64

+#ifdef CONFIG_DEBUG_VIRTUAL
 unsigned long __phys_addr(unsigned long x)
 {
-        if (x >= __START_KERNEL_map) {
-                x -= __START_KERNEL_map;
-                VIRTUAL_BUG_ON(x >= KERNEL_IMAGE_SIZE);
-                x += phys_base;
+        unsigned long y = x - __START_KERNEL_map;
+
+        /* use the carry flag to determine if x was < __START_KERNEL_map */
+        if (unlikely(x > y)) {
+                x = y + phys_base;
+
+                VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE);
         } else {
-                VIRTUAL_BUG_ON(x < PAGE_OFFSET);
-                x -= PAGE_OFFSET;
-                VIRTUAL_BUG_ON(!phys_addr_valid(x));
+                x = y + (__START_KERNEL_map - PAGE_OFFSET);
+
+                /* carry flag will be set if starting x was >= PAGE_OFFSET */
+                VIRTUAL_BUG_ON((x > y) || !phys_addr_valid(x));
         }
+
         return x;
 }
 EXPORT_SYMBOL(__phys_addr);
+#endif

 bool __virt_addr_valid(unsigned long x)
 {
-        if (x >= __START_KERNEL_map) {
-                x -= __START_KERNEL_map;
-                if (x >= KERNEL_IMAGE_SIZE)
+        unsigned long y = x - __START_KERNEL_map;
+
+        /* use the carry flag to determine if x was < __START_KERNEL_map */
+        if (unlikely(x > y)) {
+                x = y + phys_base;
+
+                if (y >= KERNEL_IMAGE_SIZE)
                         return false;
-                x += phys_base;
         } else {
-                if (x < PAGE_OFFSET)
-                        return false;
-                x -= PAGE_OFFSET;
-                if (!phys_addr_valid(x))
+                x = y + (__START_KERNEL_map - PAGE_OFFSET);
+
+                /* carry flag will be set if starting x was >= PAGE_OFFSET */
+                if ((x > y) || !phys_addr_valid(x))
                         return false;
         }

-- cgit
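The change above leans on unsigned wraparound: when x is below
__START_KERNEL_map, the subtraction borrows and y = x - __START_KERNEL_map
wraps to a value larger than x, so the single test x > y is equivalent to
x >= __START_KERNEL_map and compiles down to the carry flag produced by the
subtract rather than a compare against a 64-bit immediate. A minimal
stand-alone sketch of the same test (the constant matches the x86-64
kernel-image base; the function name and test values are illustrative only,
assuming an LP64 target where unsigned long is 64 bits):

#include <stdio.h>

#define START_KERNEL_MAP 0xffffffff80000000UL /* x86-64 kernel-image base */

/* Returns 1 iff addr >= START_KERNEL_MAP, using unsigned wraparound. */
static int in_kernel_image_range(unsigned long addr)
{
        unsigned long y = addr - START_KERNEL_MAP;

        /*
         * If addr < START_KERNEL_MAP the subtraction borrows (sets the
         * carry flag) and y wraps around to a value larger than addr.
         */
        return addr > y;
}

int main(void)
{
        printf("%d\n", in_kernel_image_range(0xffffffff81000000UL)); /* 1 */
        printf("%d\n", in_kernel_image_range(0x00007fffdeadbeefUL)); /* 0 */
        return 0;
}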
From 7d74275d39def4d3ccc8cf4725388bf79ef13861 Mon Sep 17 00:00:00 2001
From: Alexander Duyck
Date: Fri, 16 Nov 2012 13:55:46 -0800
Subject: x86: Make it so that __pa_symbol can only process kernel symbols on
 x86_64

I submitted an earlier patch that made __phys_addr an inline. This
obviously results in an increase in the code size. One step I can take to
reduce that is to make it so that the __pa_symbol call does a direct
translation for kernel addresses instead of covering all of virtual
memory.

On my system this reduced the size for __pa_symbol from 5 instructions
totalling 30 bytes to 3 instructions totalling 16 bytes.

Signed-off-by: Alexander Duyck
Link: http://lkml.kernel.org/r/20121116215356.8521.92472.stgit@ahduyck-cp1.jf.intel.com
Signed-off-by: H. Peter Anvin
---
 arch/x86/mm/physaddr.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/physaddr.c b/arch/x86/mm/physaddr.c
index fd40d75109ef..c73feddd518d 100644
--- a/arch/x86/mm/physaddr.c
+++ b/arch/x86/mm/physaddr.c
@@ -28,6 +28,17 @@ unsigned long __phys_addr(unsigned long x)
         return x;
 }
 EXPORT_SYMBOL(__phys_addr);
+
+unsigned long __phys_addr_symbol(unsigned long x)
+{
+        unsigned long y = x - __START_KERNEL_map;
+
+        /* only check upper bounds since lower bounds will trigger carry */
+        VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE);
+
+        return y + phys_base;
+}
+EXPORT_SYMBOL(__phys_addr_symbol);
 #endif

 bool __virt_addr_valid(unsigned long x)

-- cgit
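The 3-instruction non-debug variant mentioned in the changelog lives in
arch/x86/include/asm/page_64.h and is therefore outside this listing, which
is limited to arch/x86/mm. A sketch of what it plausibly reduces to — an
assumption based on the changelog, not a quote from the diff:

/*
 * Assumed sketch of the non-debug __pa_symbol() path: with the bounds
 * check compiled out, a kernel-image symbol translates with one
 * subtract and one add, and no conditional branch at all.
 */
#define __phys_addr_symbol(x) \
        ((unsigned long)(x) - __START_KERNEL_map + phys_base)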
From fc8d782677f163dee76427fdd8a92bebd2b50b23 Mon Sep 17 00:00:00 2001
From: Alexander Duyck
Date: Fri, 16 Nov 2012 13:57:13 -0800
Subject: x86: Use __pa_symbol instead of __pa on C visible symbols

When I made an attempt at separating __pa_symbol and __pa I found that
there were a number of cases where __pa was used on an obvious symbol.

I also caught one non-obvious case as _brk_start and _brk_end are based
on the address of __brk_base which is a C visible symbol.

In mark_rodata_ro I was able to reduce the overhead of kernel symbol to
virtual memory translation by using a combination of __va(__pa_symbol())
instead of page_address(virt_to_page()).

Signed-off-by: Alexander Duyck
Link: http://lkml.kernel.org/r/20121116215640.8521.80483.stgit@ahduyck-cp1.jf.intel.com
Signed-off-by: H. Peter Anvin
---
 arch/x86/mm/init_64.c  | 18 ++++++++----------
 arch/x86/mm/pageattr.c |  8 ++++----
 2 files changed, 12 insertions(+), 14 deletions(-)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 3baff255adac..0374a10f4fb7 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -770,12 +770,10 @@ void set_kernel_text_ro(void)
 void mark_rodata_ro(void)
 {
         unsigned long start = PFN_ALIGN(_text);
-        unsigned long rodata_start =
-                ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
+        unsigned long rodata_start = PFN_ALIGN(__start_rodata);
         unsigned long end = (unsigned long) &__end_rodata_hpage_align;
-        unsigned long text_end = PAGE_ALIGN((unsigned long) &__stop___ex_table);
-        unsigned long rodata_end = PAGE_ALIGN((unsigned long) &__end_rodata);
-        unsigned long data_start = (unsigned long) &_sdata;
+        unsigned long text_end = PFN_ALIGN(&__stop___ex_table);
+        unsigned long rodata_end = PFN_ALIGN(&__end_rodata);

         printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
                (end - start) >> 10);
@@ -800,12 +798,12 @@ void mark_rodata_ro(void)
 #endif

         free_init_pages("unused kernel memory",
-                        (unsigned long) page_address(virt_to_page(text_end)),
-                        (unsigned long)
-                                page_address(virt_to_page(rodata_start)));
+                        (unsigned long) __va(__pa_symbol(text_end)),
+                        (unsigned long) __va(__pa_symbol(rodata_start)));
+
         free_init_pages("unused kernel memory",
-                        (unsigned long) page_address(virt_to_page(rodata_end)),
-                        (unsigned long) page_address(virt_to_page(data_start)));
+                        (unsigned long) __va(__pa_symbol(rodata_end)),
+                        (unsigned long) __va(__pa_symbol(_sdata)));
 }

 #endif
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index a718e0d23503..40f92f3aea54 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -94,12 +94,12 @@ static inline void split_page_count(int level) { }

 static inline unsigned long highmap_start_pfn(void)
 {
-        return __pa(_text) >> PAGE_SHIFT;
+        return __pa_symbol(_text) >> PAGE_SHIFT;
 }

 static inline unsigned long highmap_end_pfn(void)
 {
-        return __pa(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT;
+        return __pa_symbol(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT;
 }

 #endif
@@ -276,8 +276,8 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
          * The .rodata section needs to be read-only.  Using the pfn
          * catches all aliases.
          */
-        if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT,
-                   __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
+        if (within(pfn, __pa_symbol(__start_rodata) >> PAGE_SHIFT,
+                   __pa_symbol(__end_rodata) >> PAGE_SHIFT))
                 pgprot_val(forbidden) |= _PAGE_RW;

 #if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)

-- cgit
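The mark_rodata_ro() part of this change swaps a virtual-to-struct-page-
and-back round trip for two constant-offset translations. A small
stand-alone model of the arithmetic being composed (the constants, the
symbol address, and phys_base value are all illustrative, and pa_symbol()
and va() are simplified stand-ins for the kernel's __pa_symbol() and
__va() macros):

#include <stdio.h>

#define PAGE_OFFSET      0xffff880000000000UL /* linear-map base */
#define START_KERNEL_MAP 0xffffffff80000000UL /* kernel-image base */

static unsigned long phys_base = 0x1000000UL; /* boot relocation, made up */

/* Stand-in for __pa_symbol(): image address -> physical address */
static unsigned long pa_symbol(unsigned long x)
{
        return x - START_KERNEL_MAP + phys_base;
}

/* Stand-in for __va(): physical address -> linear-map address */
static unsigned long va(unsigned long pa)
{
        return pa + PAGE_OFFSET;
}

int main(void)
{
        unsigned long text_end = 0xffffffff81234000UL; /* hypothetical symbol */

        /* Two adds/subtracts; no pfn -> struct page -> address lookup. */
        printf("linear alias: %#lx\n", va(pa_symbol(text_end)));
        return 0;
}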
From 5e4bf1a55da976a5ed60901bb8801f1024ef9774 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Tue, 20 Nov 2012 13:02:51 +0100
Subject: x86/mm: Don't flush the TLB on #WP pmd fixups

If we have a write protection #PF and fix up the pmd, then the
transparent hugepage code [the only user of pmdp_set_access_flags], in
its do_huge_pmd_wp_page() page fault resolution function, calls
pmdp_set_access_flags() to mark the pmd permissive again, and flushes
the TLB.

This TLB flush is unnecessary: a flush on #PF is guaranteed on most
(all?) x86 CPUs, and even in the worst-case we'll generate a spurious
fault.

So remove it.

Cc: Linus Torvalds
Cc: Andrew Morton
Cc: Peter Zijlstra
Cc: Paul Turner
Cc: Lee Schermerhorn
Cc: Andrea Arcangeli
Cc: Rik van Riel
Cc: Johannes Weiner
Cc: Christoph Lameter
Cc: Mel Gorman
Cc: Hugh Dickins
Link: http://lkml.kernel.org/r/20121120120251.GA15742@gmail.com
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/pgtable.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 8573b83a63d0..8a828d773e58 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -328,7 +328,12 @@ int pmdp_set_access_flags(struct vm_area_struct *vma,
         if (changed && dirty) {
                 *pmdp = entry;
                 pmd_update_defer(vma->vm_mm, address, pmdp);
-                flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+                /*
+                 * We had a write-protection fault here and changed the pmd
+                 * to be more permissive. No need to flush the TLB for that,
+                 * #PF is architecturally guaranteed to do that and in the
+                 * worst-case we'll generate a spurious fault.
+                 */
         }

         return changed;

-- cgit
From a25b9316841c5afa226f8f70a457861b35276a92 Mon Sep 17 00:00:00 2001
From: Dave Hansen
Date: Tue, 22 Jan 2013 13:24:30 -0800
Subject: x86, mm: Make DEBUG_VIRTUAL work earlier in boot

The KVM code has some repeated bugs in it around use of __pa() on
per-cpu data. Those data are not in an area on which using __pa() is
valid. However, they are also called early enough in boot that
__vmalloc_start_set is not set, and thus the CONFIG_DEBUG_VIRTUAL
debugging does not catch them.

This adds a check to also verify __pa() calls against max_low_pfn,
which we can use earlier in boot than is_vmalloc_addr(). However, if
we are super-early in boot, max_low_pfn=0 and this will trip on every
call, so also make sure that max_low_pfn is set before we try to use
it.

With this patch applied, CONFIG_DEBUG_VIRTUAL will actually catch the
bug I was chasing (fixed later in this series).

I'd love to find a generic way so that any __pa() call on percpu areas
could do a BUG_ON(), but there don't appear to be any nice and easy
ways to check if an address is a percpu one. Anybody have ideas on a
way to do this?

Signed-off-by: Dave Hansen
Link: http://lkml.kernel.org/r/20130122212430.F46F8159@kernel.stglabs.ibm.com
Signed-off-by: H. Peter Anvin
---
 arch/x86/mm/numa.c     | 2 +-
 arch/x86/mm/pat.c      | 4 ++--
 arch/x86/mm/physaddr.c | 9 ++++++++-
 3 files changed, 11 insertions(+), 4 deletions(-)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 2d125be1bae9..76604eb9e4b0 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -219,7 +219,7 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
          */
         nd = alloc_remap(nid, nd_size);
         if (nd) {
-                nd_pa = __pa(nd);
+                nd_pa = __phys_addr_nodebug(nd);
                 remapped = true;
         } else {
                 nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid);
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 0eb572eda406..2610bd93c896 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -560,10 +560,10 @@ int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags)
 {
         unsigned long id_sz;

-        if (base >= __pa(high_memory))
+        if (base > __pa(high_memory-1))
                 return 0;

-        id_sz = (__pa(high_memory) < base + size) ?
+        id_sz = (__pa(high_memory-1) <= base + size) ?
                 __pa(high_memory) - base :
                 size;
diff --git a/arch/x86/mm/physaddr.c b/arch/x86/mm/physaddr.c
index c73feddd518d..e666cbbb9261 100644
--- a/arch/x86/mm/physaddr.c
+++ b/arch/x86/mm/physaddr.c
@@ -1,3 +1,4 @@
+#include <linux/bootmem.h>
 #include <linux/mmdebug.h>
 #include <linux/module.h>
 #include <linux/mm.h>
@@ -68,10 +69,16 @@ EXPORT_SYMBOL(__virt_addr_valid);
 #ifdef CONFIG_DEBUG_VIRTUAL
 unsigned long __phys_addr(unsigned long x)
 {
+        unsigned long phys_addr = x - PAGE_OFFSET;
         /* VMALLOC_* aren't constants  */
         VIRTUAL_BUG_ON(x < PAGE_OFFSET);
         VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x));
-        return x - PAGE_OFFSET;
+        /* max_low_pfn is set early, but not _that_ early */
+        if (max_low_pfn) {
+                VIRTUAL_BUG_ON((phys_addr >> PAGE_SHIFT) > max_low_pfn);
+                BUG_ON(slow_virt_to_phys((void *)x) != phys_addr);
+        }
+        return phys_addr;
 }
 EXPORT_SYMBOL(__phys_addr);
 #endif

-- cgit

From f3c4fbb68e93b10c781c0cc462a9d80770244da6 Mon Sep 17 00:00:00 2001
From: Dave Hansen
Date: Tue, 22 Jan 2013 13:24:32 -0800
Subject: x86, mm: Use new pagetable helpers in try_preserve_large_page()

try_preserve_large_page() can be slightly simplified by using the new
page_level_*() helpers. This also moves the 'level' over to the new
pg_level enum type.

Signed-off-by: Dave Hansen
Link: http://lkml.kernel.org/r/20130122212432.14F3D993@kernel.stglabs.ibm.com
Signed-off-by: H. Peter Anvin
---
 arch/x86/mm/pageattr.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 40f92f3aea54..2a5c9ab710b9 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -396,7 +396,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
         pte_t new_pte, old_pte, *tmp;
         pgprot_t old_prot, new_prot, req_prot;
         int i, do_split = 1;
-        unsigned int level;
+        enum pg_level level;

         if (cpa->force_split)
                 return 1;
@@ -412,15 +412,12 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,

         switch (level) {
         case PG_LEVEL_2M:
-                psize = PMD_PAGE_SIZE;
-                pmask = PMD_PAGE_MASK;
-                break;
 #ifdef CONFIG_X86_64
         case PG_LEVEL_1G:
-                psize = PUD_PAGE_SIZE;
-                pmask = PUD_PAGE_MASK;
-                break;
 #endif
+                psize = page_level_size(level);
+                pmask = page_level_mask(level);
+                break;
         default:
                 do_split = -EINVAL;
                 goto out_unlock;

-- cgit
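The collapsed switch depends on page_level_size() and page_level_mask(),
which were added elsewhere in this series (in
arch/x86/include/asm/pgtable_types.h, outside the arch/x86/mm listing). A
plausible shape for those helpers, assuming each paging level contributes
PTE_SHIFT (9) address bits on top of PAGE_SHIFT — an assumption drawn from
the x86 paging layout, not a quote from the series:

/* Assumed sketch; minimal context so the helpers read stand-alone. */
enum pg_level { PG_LEVEL_NONE, PG_LEVEL_4K, PG_LEVEL_2M, PG_LEVEL_1G };

#define PAGE_SHIFT 12
#define PTE_SHIFT  9    /* log2(PTRS_PER_PTE) with 64-bit ptes */

static inline unsigned long page_level_shift(enum pg_level level)
{
        /* PG_LEVEL_4K -> 12, PG_LEVEL_2M -> 21, PG_LEVEL_1G -> 30 */
        return (PAGE_SHIFT - PTE_SHIFT) + level * PTE_SHIFT;
}

static inline unsigned long page_level_size(enum pg_level level)
{
        return 1UL << page_level_shift(level);
}

static inline unsigned long page_level_mask(enum pg_level level)
{
        return ~(page_level_size(level) - 1);
}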
From d765653445129b7c476758040e3079480775f80a Mon Sep 17 00:00:00 2001
From: Dave Hansen
Date: Tue, 22 Jan 2013 13:24:33 -0800
Subject: x86, mm: Create slow_virt_to_phys()

This is necessary because __pa() does not work on some kinds of memory,
like vmalloc() or the alloc_remap() areas on 32-bit NUMA systems. We
have some functions to do conversions _like_ this in the vmalloc() code
(like vmalloc_to_page()), but they do not work on sizes other than 4k
pages. We would potentially need to be able to handle all the page
sizes that we use for the kernel linear mapping (4k, 2M, 1G).

In practice, on 32-bit NUMA systems, the percpu areas get stuck in the
alloc_remap() area. Any __pa() call on them will break and basically
return garbage.

This patch introduces a new function slow_virt_to_phys(), which walks
the kernel page tables on x86 and should do precisely the same logical
thing as __pa(), but actually work on a wider range of memory. It
should work on the normal linear mapping, vmalloc(), kmap(), etc...

Signed-off-by: Dave Hansen
Link: http://lkml.kernel.org/r/20130122212433.4D1FCA62@kernel.stglabs.ibm.com
Acked-by: Rik van Riel
Signed-off-by: H. Peter Anvin
---
 arch/x86/mm/pageattr.c | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 2a5c9ab710b9..6d13d2a3f825 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -363,6 +363,37 @@ pte_t *lookup_address(unsigned long address, unsigned int *level)
 }
 EXPORT_SYMBOL_GPL(lookup_address);

+/*
+ * This is necessary because __pa() does not work on some
+ * kinds of memory, like vmalloc() or the alloc_remap()
+ * areas on 32-bit NUMA systems.  The percpu areas can
+ * end up in this kind of memory, for instance.
+ *
+ * This could be optimized, but it is only intended to be
+ * used at initialization time, and keeping it
+ * unoptimized should increase the testing coverage for
+ * the more obscure platforms.
+ */
+phys_addr_t slow_virt_to_phys(void *__virt_addr)
+{
+        unsigned long virt_addr = (unsigned long)__virt_addr;
+        phys_addr_t phys_addr;
+        unsigned long offset;
+        enum pg_level level;
+        unsigned long psize;
+        unsigned long pmask;
+        pte_t *pte;
+
+        pte = lookup_address(virt_addr, &level);
+        BUG_ON(!pte);
+        psize = page_level_size(level);
+        pmask = page_level_mask(level);
+        offset = virt_addr & ~pmask;
+        phys_addr = pte_pfn(*pte) << PAGE_SHIFT;
+        return (phys_addr | offset);
+}
+EXPORT_SYMBOL_GPL(slow_virt_to_phys);
+
 /*
  * Set the new pmd in all the pgds we know about:
  */

-- cgit
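slow_virt_to_phys() composes its result from the frame number stored in the
pte and the offset bits that a large mapping leaves untranslated. A
stand-alone model of that composition for a 2M mapping, with every value
invented purely for illustration:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t virt  = 0xffff880001234567ULL; /* inside some 2M mapping */
        uint64_t pmask = ~((1ULL << 21) - 1);   /* page_level_mask(PG_LEVEL_2M) */
        uint64_t pfn   = 0x800ULL;              /* as if from pte_pfn(*pte) */

        /* frame base OR offset-within-large-page, as in slow_virt_to_phys() */
        uint64_t phys = (pfn << 12) | (virt & ~pmask);

        printf("phys = %#llx\n", (unsigned long long)phys); /* 0x834567 */
        return 0;
}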
From 1e9209edc71b851d81f0316ca03a0e6335c0ef9a Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Sun, 27 Jan 2013 01:18:21 +0100
Subject: x86/numa: Use __pa_nodebug() instead
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

... and fix the following warning:

  arch/x86/mm/numa.c: In function ‘setup_node_data’:
  arch/x86/mm/numa.c:222:3: warning: passing argument 1 of ‘__phys_addr_nodebug’ makes integer from pointer without a cast

Signed-off-by: Borislav Petkov
Acked-by: Dave Hansen
Link: http://lkml.kernel.org/r/1359245901-8512-1-git-send-email-bp@alien8.de
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/numa.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 76604eb9e4b0..b2313c6739f5 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -219,7 +219,7 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
          */
         nd = alloc_remap(nid, nd_size);
         if (nd) {
-                nd_pa = __phys_addr_nodebug(nd);
+                nd_pa = __pa_nodebug(nd);
                 remapped = true;
         } else {
                 nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid);

-- cgit

From f03574f2d5b2d6229dcdf2d322848065f72953c7 Mon Sep 17 00:00:00 2001
From: Dave Hansen
Date: Wed, 30 Jan 2013 16:56:16 -0800
Subject: x86-32, mm: Rip out x86_32 NUMA remapping code

This code was an optimization for 32-bit NUMA systems. It has probably
been the cause of a number of subtle bugs over the years, although the
conditions to excite them would have been hard to trigger. Essentially,
we remap part of the kernel linear mapping area, and then sometimes part
of that area gets freed back into the bootmem allocator. If those pages
get used by kernel data structures (say mem_map[] or a dentry), there's
no big deal. But, if anyone ever tried to use the linear mapping for
these pages _and_ cared about their physical address, bad things happen.

For instance, say you passed __GFP_ZERO to the page allocator and then
happened to get handed one of these pages: it would zero the remapped
page, but it would make a pte to the _old_ page.

There are probably a hundred other ways that it could screw with things.

We don't need to hang on to performance optimizations for these old
boxes any more. All my 32-bit NUMA systems are long dead and buried,
and I probably had access to more than most people.

This code is causing real things to break today:

  https://lkml.org/lkml/2013/1/9/376

I looked into actually fixing this, but it requires surgery to way too
much brittle code, as well as stuff like per_cpu_ptr_to_phys().

[ hpa: Cc: this for -stable, since it is a memory corruption issue.
  However, an alternative is to simply mark NUMA as depends BROKEN
  rather than EXPERIMENTAL in the X86_32 subclause... ]

Link: http://lkml.kernel.org/r/20130131005616.1C79F411@kernel.stglabs.ibm.com
Signed-off-by: H. Peter Anvin
Cc: <stable@vger.kernel.org>
---
 arch/x86/mm/numa.c          |   3 -
 arch/x86/mm/numa_32.c       | 161 --------------------------------------------
 arch/x86/mm/numa_internal.h |   6 --
 3 files changed, 170 deletions(-)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index b2313c6739f5..61c2b6f5ff88 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -205,9 +205,6 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
         if (end && (end - start) < NODE_MIN_SIZE)
                 return;

-        /* initialize remap allocator before aligning to ZONE_ALIGN */
-        init_alloc_remap(nid, start, end);
-
         start = roundup(start, ZONE_ALIGN);

         printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n",
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 534255a36b6b..73a6d7395bd3 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -73,167 +73,6 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,

 extern unsigned long highend_pfn, highstart_pfn;

-#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
-
-static void *node_remap_start_vaddr[MAX_NUMNODES];
-void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
-
-/*
- * Remap memory allocator
- */
-static unsigned long node_remap_start_pfn[MAX_NUMNODES];
-static void *node_remap_end_vaddr[MAX_NUMNODES];
-static void *node_remap_alloc_vaddr[MAX_NUMNODES];
-
-/**
- * alloc_remap - Allocate remapped memory
- * @nid: NUMA node to allocate memory from
- * @size: The size of allocation
- *
- * Allocate @size bytes from the remap area of NUMA node @nid.  The
- * size of the remap area is predetermined by init_alloc_remap() and
- * only the callers considered there should call this function.  For
- * more info, please read the comment on top of init_alloc_remap().
- *
- * The caller must be ready to handle allocation failure from this
- * function and fall back to regular memory allocator in such cases.
- *
- * CONTEXT:
- * Single CPU early boot context.
- *
- * RETURNS:
- * Pointer to the allocated memory on success, %NULL on failure.
- */
-void *alloc_remap(int nid, unsigned long size)
-{
-        void *allocation = node_remap_alloc_vaddr[nid];
-
-        size = ALIGN(size, L1_CACHE_BYTES);
-
-        if (!allocation || (allocation + size) > node_remap_end_vaddr[nid])
-                return NULL;
-
-        node_remap_alloc_vaddr[nid] += size;
-        memset(allocation, 0, size);
-
-        return allocation;
-}
-
-#ifdef CONFIG_HIBERNATION
-/**
- * resume_map_numa_kva - add KVA mapping to the temporary page tables created
- *                       during resume from hibernation
- * @pgd_base - temporary resume page directory
- */
-void resume_map_numa_kva(pgd_t *pgd_base)
-{
-        int node;
-
-        for_each_online_node(node) {
-                unsigned long start_va, start_pfn, nr_pages, pfn;
-
-                start_va = (unsigned long)node_remap_start_vaddr[node];
-                start_pfn = node_remap_start_pfn[node];
-                nr_pages = (node_remap_end_vaddr[node] -
-                            node_remap_start_vaddr[node]) >> PAGE_SHIFT;
-
-                printk(KERN_DEBUG "%s: node %d\n", __func__, node);
-
-                for (pfn = 0; pfn < nr_pages; pfn += PTRS_PER_PTE) {
-                        unsigned long vaddr = start_va + (pfn << PAGE_SHIFT);
-                        pgd_t *pgd = pgd_base + pgd_index(vaddr);
-                        pud_t *pud = pud_offset(pgd, vaddr);
-                        pmd_t *pmd = pmd_offset(pud, vaddr);
-
-                        set_pmd(pmd, pfn_pmd(start_pfn + pfn,
-                                             PAGE_KERNEL_LARGE_EXEC));
-
-                        printk(KERN_DEBUG "%s: %08lx -> pfn %08lx\n",
-                               __func__, vaddr, start_pfn + pfn);
-                }
-        }
-}
-#endif
-
-/**
- * init_alloc_remap - Initialize remap allocator for a NUMA node
- * @nid: NUMA node to initialize remap allocator for
- *
- * NUMA nodes may end up without any lowmem.  As allocating pgdat and
- * memmap on a different node with lowmem is inefficient, a special
- * remap allocator is implemented which can be used by alloc_remap().
- *
- * For each node, the amount of memory which will be necessary for
- * pgdat and memmap is calculated and two memory areas of the size are
- * allocated - one in the node and the other in lowmem; then, the area
- * in the node is remapped to the lowmem area.
- *
- * As pgdat and memmap must be allocated in lowmem anyway, this
- * doesn't waste lowmem address space; however, the actual lowmem
- * which gets remapped over is wasted.  The amount shouldn't be
- * problematic on machines where this feature will be used.
- *
- * Initialization failure isn't fatal.  alloc_remap() is used
- * opportunistically and the callers will fall back to other memory
- * allocation mechanisms on failure.
- */
-void __init init_alloc_remap(int nid, u64 start, u64 end)
-{
-        unsigned long start_pfn = start >> PAGE_SHIFT;
-        unsigned long end_pfn = end >> PAGE_SHIFT;
-        unsigned long size, pfn;
-        u64 node_pa, remap_pa;
-        void *remap_va;
-
-        /*
-         * The acpi/srat node info can show hot-add memory zones where
-         * memory could be added but not currently present.
-         */
-        printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n",
-               nid, start_pfn, end_pfn);
-
-        /* calculate the necessary space aligned to large page size */
-        size = node_memmap_size_bytes(nid, start_pfn, end_pfn);
-        size += ALIGN(sizeof(pg_data_t), PAGE_SIZE);
-        size = ALIGN(size, LARGE_PAGE_BYTES);
-
-        /* allocate node memory and the lowmem remap area */
-        node_pa = memblock_find_in_range(start, end, size, LARGE_PAGE_BYTES);
-        if (!node_pa) {
-                pr_warning("remap_alloc: failed to allocate %lu bytes for node %d\n",
-                           size, nid);
-                return;
-        }
-        memblock_reserve(node_pa, size);
-
-        remap_pa = memblock_find_in_range(min_low_pfn << PAGE_SHIFT,
-                                          max_low_pfn << PAGE_SHIFT,
-                                          size, LARGE_PAGE_BYTES);
-        if (!remap_pa) {
-                pr_warning("remap_alloc: failed to allocate %lu bytes remap area for node %d\n",
-                           size, nid);
-                memblock_free(node_pa, size);
-                return;
-        }
-        memblock_reserve(remap_pa, size);
-        remap_va = phys_to_virt(remap_pa);
-
-        /* perform actual remap */
-        for (pfn = 0; pfn < size >> PAGE_SHIFT; pfn += PTRS_PER_PTE)
-                set_pmd_pfn((unsigned long)remap_va + (pfn << PAGE_SHIFT),
-                            (node_pa >> PAGE_SHIFT) + pfn,
-                            PAGE_KERNEL_LARGE);
-
-        /* initialize remap allocator parameters */
-        node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT;
-        node_remap_start_vaddr[nid] = remap_va;
-        node_remap_end_vaddr[nid] = remap_va + size;
-        node_remap_alloc_vaddr[nid] = remap_va;
-
-        printk(KERN_DEBUG "remap_alloc: node %d [%08llx-%08llx) -> [%p-%p)\n",
-               nid, node_pa, node_pa + size, remap_va, remap_va + size);
-}
-
 void __init initmem_init(void)
 {
         x86_numa_init();
diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h
index 7178c3afe05e..ad86ec91e640 100644
--- a/arch/x86/mm/numa_internal.h
+++ b/arch/x86/mm/numa_internal.h
@@ -21,12 +21,6 @@ void __init numa_reset_distance(void);

 void __init x86_numa_init(void);

-#ifdef CONFIG_X86_64
-static inline void init_alloc_remap(int nid, u64 start, u64 end) { }
-#else
-void __init init_alloc_remap(int nid, u64 start, u64 end);
-#endif
-
 #ifdef CONFIG_NUMA_EMU
 void __init numa_emulation(struct numa_meminfo *numa_meminfo,
                            int numa_dist_cnt);

-- cgit

From 07f4207a305c834f528d08428df4531744e25678 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin"
Date: Thu, 31 Jan 2013 14:00:48 -0800
Subject: x86-32, mm: Remove reference to alloc_remap()

We have removed the remap allocator for x86-32, and x86-64 never had
it (and doesn't need it). Remove residual reference to it.

Reported-by: Yinghai Lu
Signed-off-by: H. Peter Anvin
Cc: Dave Hansen
Cc:
Link: http://lkml.kernel.org/r/CAE9FiQVn6_QZi3fNQ-JHYiR-7jeDJ5hT0SyT_%2BzVvfOj=PzF3w@mail.gmail.com
---
 arch/x86/mm/numa.c | 29 +++++++++++------------------
 1 file changed, 11 insertions(+), 18 deletions(-)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 61c2b6f5ff88..8504f3698753 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -193,7 +193,6 @@ int __init numa_add_memblk(int nid, u64 start, u64 end)
 static void __init setup_node_data(int nid, u64 start, u64 end)
 {
         const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
-        bool remapped = false;
         u64 nd_pa;
         void *nd;
         int tnid;
@@ -211,28 +210,22 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
                nid, start, end - 1);

         /*
-         * Allocate node data.  Try remap allocator first, node-local
-         * memory and then any node.  Never allocate in DMA zone.
+         * Allocate node data.  Try node-local memory and then any node.
+         * Never allocate in DMA zone.
          */
-        nd = alloc_remap(nid, nd_size);
-        if (nd) {
-                nd_pa = __pa_nodebug(nd);
-                remapped = true;
-        } else {
-                nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid);
-                if (!nd_pa) {
-                        pr_err("Cannot find %zu bytes in node %d\n",
-                               nd_size, nid);
-                        return;
-                }
-                nd = __va(nd_pa);
+        nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid);
+        if (!nd_pa) {
+                pr_err("Cannot find %zu bytes in node %d\n",
+                       nd_size, nid);
+                return;
         }
+        nd = __va(nd_pa);

         /* report and initialize */
-        printk(KERN_INFO "  NODE_DATA [mem %#010Lx-%#010Lx]%s\n",
-               nd_pa, nd_pa + nd_size - 1, remapped ? " (remapped)" : "");
+        printk(KERN_INFO "  NODE_DATA [mem %#010Lx-%#010Lx]\n",
+               nd_pa, nd_pa + nd_size - 1);
         tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
-        if (!remapped && tnid != nid)
+        if (tnid != nid)
                 printk(KERN_INFO "    NODE_DATA(%d) on node %d\n", nid, tnid);

         node_data[nid] = nd;

-- cgit