From 5255e0a79fcc0ff47b387af92bd9ef5729b1b859 Mon Sep 17 00:00:00 2001 From: Yasuaki Ishimatsu Date: Fri, 22 Aug 2014 13:27:31 -0700 Subject: x86/mm/hotplug: Pass sync_global_pgds() a correct argument in remove_pagetable() When hot-adding memory after hot-removing memory, following call traces are shown: kernel BUG at arch/x86/mm/init_64.c:206! ... [] kernel_physical_mapping_init+0x1b2/0x1d2 [] init_memory_mapping+0x1d4/0x380 [] arch_add_memory+0x3d/0xd0 [] add_memory+0xb9/0x1b0 [] acpi_memory_device_add+0x1af/0x28e [] acpi_bus_device_attach+0x8c/0xf0 [] acpi_ns_walk_namespace+0xc8/0x17f [] ? acpi_bus_type_and_status+0xb7/0xb7 [] ? acpi_bus_type_and_status+0xb7/0xb7 [] acpi_walk_namespace+0x95/0xc5 [] acpi_bus_scan+0x9a/0xc2 [] acpi_scan_bus_device_check+0x8b/0x12e [] acpi_scan_device_check+0x13/0x15 [] acpi_os_execute_deferred+0x25/0x32 [] process_one_work+0x17b/0x460 [] worker_thread+0x11b/0x400 [] ? rescuer_thread+0x400/0x400 [] kthread+0xcf/0xe0 [] ? kthread_create_on_node+0x140/0x140 [] ret_from_fork+0x7c/0xb0 [] ? kthread_create_on_node+0x140/0x140 The patch-set fixes the issue. This patch (of 2): remove_pagetable() gets start argument and passes the argument to sync_global_pgds(). In this case, the argument must not be modified. If the argument is modified and passed to sync_global_pgds(), sync_global_pgds() does not correctly synchronize PGD to PGD entries of all processes MM since synchronized range of memory [start, end] is wrong. Unfortunately the start argument is modified in remove_pagetable(). So this patch fixes the issue. Signed-off-by: Yasuaki Ishimatsu Acked-by: Toshi Kani Signed-off-by: Andrew Morton Cc: Tang Chen Cc: Gu Zheng Cc: Zhang Yanfei Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86/mm/init_64.c') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 5621c47d7a1a..0e996c0a7eff 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -976,19 +976,20 @@ static void __meminit remove_pagetable(unsigned long start, unsigned long end, bool direct) { unsigned long next; + unsigned long addr; pgd_t *pgd; pud_t *pud; bool pgd_changed = false; - for (; start < end; start = next) { - next = pgd_addr_end(start, end); + for (addr = start; addr < end; addr = next) { + next = pgd_addr_end(addr, end); - pgd = pgd_offset_k(start); + pgd = pgd_offset_k(addr); if (!pgd_present(*pgd)) continue; pud = (pud_t *)pgd_page_vaddr(*pgd); - remove_pud_table(pud, start, next, direct); + remove_pud_table(pud, addr, next, direct); if (free_pud_table(pud, pgd)) pgd_changed = true; } -- cgit From 9661d5bcd058fe15b4138a00d96bd36516134543 Mon Sep 17 00:00:00 2001 From: Yasuaki Ishimatsu Date: Fri, 22 Aug 2014 13:27:34 -0700 Subject: x86/mm/hotplug: Modify PGD entry when removing memory When hot-adding/removing memory, sync_global_pgds() is called for synchronizing PGD to PGD entries of all processes MM. But when hot-removing memory, sync_global_pgds() does not work correctly. At first, sync_global_pgds() checks whether target PGD is none or not. And if PGD is none, the PGD is skipped. But when hot-removing memory, PGD may be none since PGD may be cleared by free_pud_table(). So when sync_global_pgds() is called after hot-removing memory, sync_global_pgds() should not skip PGD even if the PGD is none. And sync_global_pgds() must clear PGD entries of all processes MM. Currently sync_global_pgds() does not clear PGD entries of all processes MM when hot-removing memory. So when hot adding memory which is same memory range as removed memory after hot-removing memory, following call traces are shown: kernel BUG at arch/x86/mm/init_64.c:206! ... [] kernel_physical_mapping_init+0x1b2/0x1d2 [] init_memory_mapping+0x1d4/0x380 [] arch_add_memory+0x3d/0xd0 [] add_memory+0xb9/0x1b0 [] acpi_memory_device_add+0x1af/0x28e [] acpi_bus_device_attach+0x8c/0xf0 [] acpi_ns_walk_namespace+0xc8/0x17f [] ? acpi_bus_type_and_status+0xb7/0xb7 [] ? acpi_bus_type_and_status+0xb7/0xb7 [] acpi_walk_namespace+0x95/0xc5 [] acpi_bus_scan+0x9a/0xc2 [] acpi_scan_bus_device_check+0x8b/0x12e [] acpi_scan_device_check+0x13/0x15 [] acpi_os_execute_deferred+0x25/0x32 [] process_one_work+0x17b/0x460 [] worker_thread+0x11b/0x400 [] ? rescuer_thread+0x400/0x400 [] kthread+0xcf/0xe0 [] ? kthread_create_on_node+0x140/0x140 [] ret_from_fork+0x7c/0xb0 [] ? kthread_create_on_node+0x140/0x140 This patch clears PGD entries of all processes MM when sync_global_pgds() is called after hot-removing memory Signed-off-by: Yasuaki Ishimatsu Acked-by: Toshi Kani Signed-off-by: Andrew Morton Cc: Tang Chen Cc: Gu Zheng Cc: Zhang Yanfei Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) (limited to 'arch/x86/mm/init_64.c') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 0e996c0a7eff..529625118ff6 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -178,7 +178,7 @@ __setup("noexec32=", nonx32_setup); * When memory was added/removed make sure all the processes MM have * suitable PGD entries in the local PGD level page. */ -void sync_global_pgds(unsigned long start, unsigned long end) +void sync_global_pgds(unsigned long start, unsigned long end, int removed) { unsigned long address; @@ -186,7 +186,12 @@ void sync_global_pgds(unsigned long start, unsigned long end) const pgd_t *pgd_ref = pgd_offset_k(address); struct page *page; - if (pgd_none(*pgd_ref)) + /* + * When it is called after memory hot remove, pgd_none() + * returns true. In this case (removed == 1), we must clear + * the PGD entries in the local PGD level page. + */ + if (pgd_none(*pgd_ref) && !removed) continue; spin_lock(&pgd_lock); @@ -199,12 +204,18 @@ void sync_global_pgds(unsigned long start, unsigned long end) pgt_lock = &pgd_page_get_mm(page)->page_table_lock; spin_lock(pgt_lock); - if (pgd_none(*pgd)) - set_pgd(pgd, *pgd_ref); - else + if (!pgd_none(*pgd_ref) && !pgd_none(*pgd)) BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + if (removed) { + if (pgd_none(*pgd_ref) && !pgd_none(*pgd)) + pgd_clear(pgd); + } else { + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + } + spin_unlock(pgt_lock); } spin_unlock(&pgd_lock); @@ -633,7 +644,7 @@ kernel_physical_mapping_init(unsigned long start, } if (pgd_changed) - sync_global_pgds(addr, end - 1); + sync_global_pgds(addr, end - 1, 0); __flush_tlb_all(); @@ -995,7 +1006,7 @@ remove_pagetable(unsigned long start, unsigned long end, bool direct) } if (pgd_changed) - sync_global_pgds(start, end - 1); + sync_global_pgds(start, end - 1, 1); flush_tlb_all(); } @@ -1342,7 +1353,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) else err = vmemmap_populate_basepages(start, end, node); if (!err) - sync_global_pgds(start, end - 1); + sync_global_pgds(start, end - 1, 0); return err; } -- cgit From f955371ca9d3986bca100666041fcfa9b6d21962 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Tue, 7 Jan 2014 17:03:06 +0000 Subject: x86: remove the Xen-specific _PAGE_IOMAP PTE flag The _PAGE_IO_MAP PTE flag was only used by Xen PV guests to mark PTEs that were used to map I/O regions that are 1:1 in the p2m. This allowed Xen to obtain the correct PFN when converting the MFNs read from a PTE back to their PFN. Xen guests no longer use _PAGE_IOMAP for this. Instead mfn_to_pfn() returns the correct PFN by using a combination of the m2p and p2m to determine if an MFN corresponds to a 1:1 mapping in the the p2m. Remove _PAGE_IOMAP, replacing it with _PAGE_UNUSED2 to allow for future uses of the PTE flag. Signed-off-by: David Vrabel Acked-by: "H. Peter Anvin" --- arch/x86/mm/init_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm/init_64.c') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 5621c47d7a1a..5d984769cbd8 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -151,7 +151,7 @@ early_param("gbpages", parse_direct_gbpages_on); * around without checking the pgd every time. */ -pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP; +pteval_t __supported_pte_mask __read_mostly = ~0; EXPORT_SYMBOL_GPL(__supported_pte_mask); int force_personality32; -- cgit From b93590901a01a6d036b3b7c856bcc5724fdb9911 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 23 Sep 2014 10:50:51 -0700 Subject: x86_64/vsyscall: Move all of the gate_area code to vsyscall_64.c This code exists for the sole purpose of making the vsyscall page look sort of like real userspace memory. Move it so that it lives with the rest of the vsyscall code. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/a7ee266773671a05f00b7175ca65a0dd812d2e4b.1411494540.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 49 ------------------------------------------------- 1 file changed, 49 deletions(-) (limited to 'arch/x86/mm/init_64.c') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 4cb8763868fc..dd9ca9becc0f 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1193,55 +1193,6 @@ int kern_addr_valid(unsigned long addr) return pfn_valid(pte_pfn(*pte)); } -/* - * A pseudo VMA to allow ptrace access for the vsyscall page. This only - * covers the 64bit vsyscall page now. 32bit has a real VMA now and does - * not need special handling anymore: - */ -static const char *gate_vma_name(struct vm_area_struct *vma) -{ - return "[vsyscall]"; -} -static struct vm_operations_struct gate_vma_ops = { - .name = gate_vma_name, -}; -static struct vm_area_struct gate_vma = { - .vm_start = VSYSCALL_ADDR, - .vm_end = VSYSCALL_ADDR + PAGE_SIZE, - .vm_page_prot = PAGE_READONLY_EXEC, - .vm_flags = VM_READ | VM_EXEC, - .vm_ops = &gate_vma_ops, -}; - -struct vm_area_struct *get_gate_vma(struct mm_struct *mm) -{ -#ifdef CONFIG_IA32_EMULATION - if (!mm || mm->context.ia32_compat) - return NULL; -#endif - return &gate_vma; -} - -int in_gate_area(struct mm_struct *mm, unsigned long addr) -{ - struct vm_area_struct *vma = get_gate_vma(mm); - - if (!vma) - return 0; - - return (addr >= vma->vm_start) && (addr < vma->vm_end); -} - -/* - * Use this when you have no reliable mm, typically from interrupt - * context. It is less reliable than using a task's mm and may give - * false positives. - */ -int in_gate_area_no_mm(unsigned long addr) -{ - return (addr & PAGE_MASK) == VSYSCALL_ADDR; -} - static unsigned long probe_memory_block_size(void) { /* start from 2g */ -- cgit From bdee237c0343a5d1a6cf72c7ea68e88338b26e08 Mon Sep 17 00:00:00 2001 From: Daniel J Blueman Date: Tue, 4 Nov 2014 16:29:44 +0800 Subject: x86: mm: Use 2GB memory block size on large-memory x86-64 systems On large-memory x86-64 systems of 64GB or more with memory hot-plug enabled, use a 2GB memory block size. Eg with 64GB memory, this reduces the number of directories in /sys/devices/system/memory from 512 to 32, making it more manageable, and reducing the creation time accordingly. This caveat is that the memory can't be offlined (for hotplug or otherwise) with the finer default 128MB granularity, but this is unimportant due to the high memory densities generally used with such large-memory systems, where eg a single DIMM is the order of 16GB. Signed-off-by: Daniel J Blueman Cc: Steffen Persvold Cc: Bjorn Helgaas Link: http://lkml.kernel.org/r/1415089784-28779-4-git-send-email-daniel@numascale.com Signed-off-by: Thomas Gleixner --- arch/x86/mm/init_64.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'arch/x86/mm/init_64.c') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 4cb8763868fc..ebca30f10708 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -52,7 +52,6 @@ #include #include #include -#include #include #include "mm_internal.h" @@ -1247,12 +1246,10 @@ static unsigned long probe_memory_block_size(void) /* start from 2g */ unsigned long bz = 1UL<<31; -#ifdef CONFIG_X86_UV - if (is_uv_system()) { - printk(KERN_INFO "UV: memory block size 2GB\n"); + if (totalram_pages >= (64ULL << (30 - PAGE_SHIFT))) { + pr_info("Using 2GB memory block size for large-memory system\n"); return 2UL * 1024 * 1024 * 1024; } -#endif /* less than 64g installed */ if ((max_pfn << PAGE_SHIFT) < (16UL << 32)) -- cgit From 2df58b6d35306e8dab48923c9fbe9e1ad17537e2 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 3 Nov 2014 14:01:52 +0100 Subject: x86: Use new cache mode type in arch/x86/mm/init_64.c Instead of directly using the cache mode bits in the pte switch to using the cache mode type. Based-on-patch-by: Stefan Bader Signed-off-by: Juergen Gross Reviewed-by: Thomas Gleixner Cc: stefan.bader@canonical.com Cc: xen-devel@lists.xensource.com Cc: konrad.wilk@oracle.com Cc: ville.syrjala@linux.intel.com Cc: david.vrabel@citrix.com Cc: jbeulich@suse.com Cc: toshi.kani@hp.com Cc: plagnioj@jcrosoft.com Cc: tomi.valkeinen@ti.com Cc: bhelgaas@google.com Link: http://lkml.kernel.org/r/1415019724-4317-7-git-send-email-jgross@suse.com Signed-off-by: Thomas Gleixner --- arch/x86/mm/init_64.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86/mm/init_64.c') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index ebca30f10708..bd42786f078b 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -337,12 +337,15 @@ pte_t * __init populate_extra_pte(unsigned long vaddr) * Create large page table mappings for a range of physical addresses. */ static void __init __init_extra_mapping(unsigned long phys, unsigned long size, - pgprot_t prot) + enum page_cache_mode cache) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; + pgprot_t prot; + pgprot_val(prot) = pgprot_val(PAGE_KERNEL_LARGE) | + pgprot_val(pgprot_4k_2_large(cachemode2pgprot(cache))); BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK)); for (; size; phys += PMD_SIZE, size -= PMD_SIZE) { pgd = pgd_offset_k((unsigned long)__va(phys)); @@ -365,12 +368,12 @@ static void __init __init_extra_mapping(unsigned long phys, unsigned long size, void __init init_extra_mapping_wb(unsigned long phys, unsigned long size) { - __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE); + __init_extra_mapping(phys, size, _PAGE_CACHE_MODE_WB); } void __init init_extra_mapping_uc(unsigned long phys, unsigned long size) { - __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE); + __init_extra_mapping(phys, size, _PAGE_CACHE_MODE_UC); } /* -- cgit From 45e2a9d4701d8c624d4a4bcdd1084eae31e92f58 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 14 Nov 2014 11:47:37 -0800 Subject: x86, mm: Set NX across entire PMD at boot When setting up permissions on kernel memory at boot, the end of the PMD that was split from bss remained executable. It should be NX like the rest. This performs a PMD alignment instead of a PAGE alignment to get the correct span of memory. Before: ---[ High Kernel Mapping ]--- ... 0xffffffff8202d000-0xffffffff82200000 1868K RW GLB NX pte 0xffffffff82200000-0xffffffff82c00000 10M RW PSE GLB NX pmd 0xffffffff82c00000-0xffffffff82df5000 2004K RW GLB NX pte 0xffffffff82df5000-0xffffffff82e00000 44K RW GLB x pte 0xffffffff82e00000-0xffffffffc0000000 978M pmd After: ---[ High Kernel Mapping ]--- ... 0xffffffff8202d000-0xffffffff82200000 1868K RW GLB NX pte 0xffffffff82200000-0xffffffff82e00000 12M RW PSE GLB NX pmd 0xffffffff82e00000-0xffffffffc0000000 978M pmd [ tglx: Changed it to roundup(_brk_end, PMD_SIZE) and added a comment. We really should unmap the reminder along with the holes caused by init,initdata etc. but thats a different issue ] Signed-off-by: Kees Cook Cc: Andy Lutomirski Cc: Toshi Kani Cc: Yasuaki Ishimatsu Cc: David Vrabel Cc: Wang Nan Cc: Yinghai Lu Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20141114194737.GA3091@www.outflux.net Signed-off-by: Thomas Gleixner --- arch/x86/mm/init_64.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'arch/x86/mm/init_64.c') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 4cb8763868fc..4e5dfec750fc 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1123,7 +1123,7 @@ void mark_rodata_ro(void) unsigned long end = (unsigned long) &__end_rodata_hpage_align; unsigned long text_end = PFN_ALIGN(&__stop___ex_table); unsigned long rodata_end = PFN_ALIGN(&__end_rodata); - unsigned long all_end = PFN_ALIGN(&_end); + unsigned long all_end; printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", (end - start) >> 10); @@ -1134,7 +1134,16 @@ void mark_rodata_ro(void) /* * The rodata/data/bss/brk section (but not the kernel text!) * should also be not-executable. + * + * We align all_end to PMD_SIZE because the existing mapping + * is a full PMD. If we would align _brk_end to PAGE_SIZE we + * split the PMD and the reminder between _brk_end and the end + * of the PMD will remain mapped executable. + * + * Any PMD which was setup after the one which covers _brk_end + * has been zapped already via cleanup_highmem(). */ + all_end = roundup((unsigned long)_brk_end, PMD_SIZE); set_memory_nx(rodata_start, (all_end - rodata_start) >> PAGE_SHIFT); rodata_test(); -- cgit