From dc5131641dcbaeb79ce9f4fecb368305e010fc28 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Thu, 6 Jul 2017 15:35:34 -0700 Subject: mn10300: remove wrapper header for asm/device.h mn10300's asm/device.h is merely including asm-generic/device.h. Thus, the arch specific header can be omitted and the generic header can be used directly. Link: http://lkml.kernel.org/r/20170517124857.26834-1-tklauser@distanz.ch Signed-off-by: Tobias Klauser Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mn10300/include/asm/Kbuild | 1 + arch/mn10300/include/asm/device.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) delete mode 100644 arch/mn10300/include/asm/device.h (limited to 'arch') diff --git a/arch/mn10300/include/asm/Kbuild b/arch/mn10300/include/asm/Kbuild index ed810e7206e8..ca413fe69930 100644 --- a/arch/mn10300/include/asm/Kbuild +++ b/arch/mn10300/include/asm/Kbuild @@ -1,6 +1,7 @@ generic-y += barrier.h generic-y += clkdev.h +generic-y += device.h generic-y += exec.h generic-y += extable.h generic-y += irq_work.h diff --git a/arch/mn10300/include/asm/device.h b/arch/mn10300/include/asm/device.h deleted file mode 100644 index f0a4c256403b..000000000000 --- a/arch/mn10300/include/asm/device.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/device.h> -- cgit From 9cfc5e0454701cd3be65fe94fbf18eee41378782 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Thu, 6 Jul 2017 15:35:37 -0700 Subject: mn10300: use generic fb.h The mn10300 arch uses a verbatim copy of the asm-generic version and does not add any implementations of its own to the header, so use asm-generic/fb.h instead of duplicating code. 
Link: http://lkml.kernel.org/r/20170517083348.1815-1-tklauser@distanz.ch Signed-off-by: Tobias Klauser Reviewed-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mn10300/include/asm/Kbuild | 1 + arch/mn10300/include/asm/fb.h | 23 ----------------------- 2 files changed, 1 insertion(+), 23 deletions(-) delete mode 100644 arch/mn10300/include/asm/fb.h (limited to 'arch') diff --git a/arch/mn10300/include/asm/Kbuild b/arch/mn10300/include/asm/Kbuild index ca413fe69930..db5b57829a81 100644 --- a/arch/mn10300/include/asm/Kbuild +++ b/arch/mn10300/include/asm/Kbuild @@ -4,6 +4,7 @@ generic-y += clkdev.h generic-y += device.h generic-y += exec.h generic-y += extable.h +generic-y += fb.h generic-y += irq_work.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h diff --git a/arch/mn10300/include/asm/fb.h b/arch/mn10300/include/asm/fb.h deleted file mode 100644 index 697b24a91e1a..000000000000 --- a/arch/mn10300/include/asm/fb.h +++ /dev/null @@ -1,23 +0,0 @@ -/* MN10300 Frame buffer stuff - * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - */ -#ifndef _ASM_FB_H -#define _ASM_FB_H - -#include - -#define fb_pgprotect(...) do {} while (0) - -static inline int fb_is_primary_device(struct fb_info *info) -{ - return 0; -} - -#endif /* _ASM_FB_H */ -- cgit From 3922920026c0242d752e62a3c88b758715c5c42f Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 6 Jul 2017 15:35:40 -0700 Subject: tile: provide default ioremap declaration Add a default ioremap function which was not provided in all circumstances. (Only when CONFIG_PCI and CONFIG_TILEGX was set). 
I have designs to use them in scatterlist.c where they'd likely never be called with this architecture, but it is needed to compile. Thus, if the function is ever hit it returns NULL. Link: http://lkml.kernel.org/r/1495726904-27380-1-git-send-email-logang@deltatee.com Signed-off-by: Logan Gunthorpe Signed-off-by: Stephen Bates Cc: Chris Metcalf Cc: Mel Gorman Cc: Michal Hocko Cc: Johannes Weiner Cc: Vlastimil Babka Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/tile/mm/pgtable.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'arch') diff --git a/arch/tile/mm/pgtable.c b/arch/tile/mm/pgtable.c index 492a7361e58e..ec5576fd3a86 100644 --- a/arch/tile/mm/pgtable.c +++ b/arch/tile/mm/pgtable.c @@ -503,6 +503,17 @@ void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size, } EXPORT_SYMBOL(ioremap_prot); +#if !defined(CONFIG_PCI) || !defined(CONFIG_TILEGX) +/* ioremap is conditionally declared in pci_gx.c */ + +void __iomem *ioremap(resource_size_t phys_addr, unsigned long size) +{ + return NULL; +} +EXPORT_SYMBOL(ioremap); + +#endif + /* Unmap an MMIO VA mapping. */ void iounmap(volatile void __iomem *addr_in) { -- cgit From 38d8b4e6bdc872f07a3149309ab01719c96f3894 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Thu, 6 Jul 2017 15:37:18 -0700 Subject: mm, THP, swap: delay splitting THP during swap out Patch series "THP swap: Delay splitting THP during swapping out", v11. This patchset is to optimize the performance of Transparent Huge Page (THP) swap. Recently, the performance of the storage devices improved so fast that we cannot saturate the disk bandwidth with single logical CPU when do page swap out even on a high-end server machine. Because the performance of the storage device improved faster than that of single logical CPU. And it seems that the trend will not change in the near future. On the other hand, the THP becomes more and more popular because of increased memory size. 
So it becomes necessary to optimize THP swap performance. The advantages of the THP swap support include: - Batch the swap operations for the THP to reduce lock acquiring/releasing, including allocating/freeing the swap space, adding/deleting to/from the swap cache, and writing/reading the swap space, etc. This will help improve the performance of the THP swap. - The THP swap space read/write will be 2M sequential IO. It is particularly helpful for the swap read, which are usually 4k random IO. This will improve the performance of the THP swap too. - It will help the memory fragmentation, especially when the THP is heavily used by the applications. The 2M continuous pages will be free up after THP swapping out. - It will improve the THP utilization on the system with the swap turned on. Because the speed for khugepaged to collapse the normal pages into the THP is quite slow. After the THP is split during the swapping out, it will take quite long time for the normal pages to collapse back into the THP after being swapped in. The high THP utilization helps the efficiency of the page based memory management too. There are some concerns regarding THP swap in, mainly because possible enlarged read/write IO size (for swap in/out) may put more overhead on the storage device. To deal with that, the THP swap in should be turned on only when necessary. For example, it can be selected via "always/never/madvise" logic, to be turned on globally, turned off globally, or turned on only for VMA with MADV_HUGEPAGE, etc. This patchset is the first step for the THP swap support. The plan is to delay splitting THP step by step, finally avoid splitting THP during the THP swapping out and swap out/in the THP as a whole. As the first step, in this patchset, the splitting huge page is delayed from almost the first step of swapping out to after allocating the swap space for the THP and adding the THP into the swap cache. 
This will reduce lock acquiring/releasing for the locks used for the swap cache management. With the patchset, the swap out throughput improves 15.5% (from about 3.73GB/s to about 4.31GB/s) in the vm-scalability swap-w-seq test case with 8 processes. The test is done on a Xeon E5 v3 system. The swap device used is a RAM simulated PMEM (persistent memory) device. To test the sequential swapping out, the test case creates 8 processes, which sequentially allocate and write to the anonymous pages until the RAM and part of the swap device is used up. This patch (of 5): In this patch, splitting huge page is delayed from almost the first step of swapping out to after allocating the swap space for the THP (Transparent Huge Page) and adding the THP into the swap cache. This will batch the corresponding operation, thus improve THP swap out throughput. This is the first step for the THP swap optimization. The plan is to delay splitting the THP step by step and avoid splitting the THP finally. In this patch, one swap cluster is used to hold the contents of each THP swapped out. So, the size of the swap cluster is changed to that of the THP (Transparent Huge Page) on x86_64 architecture (512). For other architectures which want such THP swap optimization, ARCH_WANTS_THP_SWAP needs to be selected in the Kconfig file for the architecture. In effect, this will enlarge the swap cluster size by a factor of 2 on x86_64, which may make it harder to find a free cluster when the swap space becomes fragmented. In theory, this may therefore reduce contiguous swap space allocation and sequential writes. The performance test in 0day shows no regressions caused by this. In the future of THP swap optimization, some information of the swapped out THP (such as compound map count) will be recorded in the swap_cluster_info data structure. The mem cgroup swap accounting functions are enhanced to support charging or uncharging a swap cluster backing a THP as a whole. 
The swap cluster allocate/free functions are added to allocate/free a swap cluster for a THP. A fairly simple algorithm is used for swap cluster allocation, that is, only the first swap device in the priority list will be tried when allocating the swap cluster. The function will fail if that attempt is not successful, and the caller will fall back to allocating a single swap slot instead. This works well enough for normal cases. If the difference of the number of the free swap clusters among multiple swap devices is significant, it is possible that some THPs are split earlier than necessary. For example, this could be caused by a big size difference among multiple swap devices. The swap cache functions are enhanced to support adding/deleting a THP to/from the swap cache as a set of (HPAGE_PMD_NR) sub-pages. This may be enhanced in the future with a multi-order radix tree. But because we will split the THP soon during swapping out, that optimization doesn't make much sense for this first step. The THP splitting functions are enhanced to support splitting a THP in the swap cache during swapping out. The page lock will be held during allocating the swap cluster, adding the THP into the swap cache and splitting the THP. So in the code path other than swapping out, if the THP needs to be split, the PageSwapCache(THP) will always be false. The swap cluster is only available for SSD, so the THP swap optimization in this patchset has no effect for HDD. [ying.huang@intel.com: fix two issues in THP optimize patch] Link: http://lkml.kernel.org/r/87k25ed8zo.fsf@yhuang-dev.intel.com [hannes@cmpxchg.org: extensive cleanups and simplifications, reduce code size] Link: http://lkml.kernel.org/r/20170515112522.32457-2-ying.huang@intel.com Signed-off-by: "Huang, Ying" Signed-off-by: Johannes Weiner Suggested-by: Andrew Morton [for config option] Acked-by: Kirill A. 
Shutemov [for changes in huge_memory.c and huge_mm.h] Cc: Andrea Arcangeli Cc: Ebru Akagunduz Cc: Johannes Weiner Cc: Michal Hocko Cc: Tejun Heo Cc: Hugh Dickins Cc: Shaohua Li Cc: Minchan Kim Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e767ed24aeb4..1dbbe38f6ec0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -72,6 +72,7 @@ config X86 select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH select ARCH_WANT_FRAME_POINTERS select ARCH_WANTS_DYNAMIC_TASK_STRUCT + select ARCH_WANTS_THP_SWAP if X86_64 select BUILDTIME_EXTABLE_SORT select CLKEVT_I8253 select CLOCKSOURCE_VALIDATE_LAST_CYCLE -- cgit From 1b862aecfbd419cdc4553645bf86d07554279bed Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:37:45 -0700 Subject: mm, memory_hotplug: get rid of is_zone_device_section Device memory hotplug hooks into regular memory hotplug only half way. It needs memory sections to track struct pages but there is no need/desire to associate those sections with memory blocks and export them to the userspace via sysfs because they cannot be onlined anyway. This is currently expressed by for_device argument to arch_add_memory which then makes sure to associate the given memory range with ZONE_DEVICE. register_new_memory then relies on is_zone_device_section to distinguish special memory hotplug from the regular one. While this works now, later patches in this series want to move __add_zone outside of arch_add_memory path so we have to come up with something else. Add want_memblock down the __add_pages path and use it to control whether the section->memblock association should be done. arch_add_memory then just trivially want memblock for everything but for_device hotplug. remove_memory_section doesn't need is_zone_device_section either. 
We can simply skip all the memblock specific cleanup if there is no memblock for the given section. This shouldn't introduce any functional change. Link: http://lkml.kernel.org/r/20170515085827.16474-5-mhocko@kernel.org Signed-off-by: Michal Hocko Tested-by: Dan Williams Acked-by: Vlastimil Babka Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Balbir Singh Cc: Daniel Kiper Cc: David Rientjes Cc: Heiko Carstens Cc: Igor Mammedov Cc: Jerome Glisse Cc: Joonsoo Kim Cc: Martin Schwidefsky Cc: Mel Gorman Cc: Reza Arbab Cc: Tobias Regnery Cc: Toshi Kani Cc: Vitaly Kuznetsov Cc: Xishi Qiu Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/mm/init.c | 2 +- arch/powerpc/mm/mem.c | 2 +- arch/s390/mm/init.c | 2 +- arch/sh/mm/init.c | 2 +- arch/x86/mm/init_32.c | 2 +- arch/x86/mm/init_64.c | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 8f3efa682ee8..39e2aeb4669d 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -658,7 +658,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) zone = pgdat->node_zones + zone_for_memory(nid, start, size, ZONE_NORMAL, for_device); - ret = __add_pages(nid, zone, start_pfn, nr_pages); + ret = __add_pages(nid, zone, start_pfn, nr_pages, !for_device); if (ret) printk("%s: Problem encountered in __add_pages() as ret=%d\n", diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 9ee536ec0739..e6b2e6618b6c 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -151,7 +151,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) zone = pgdata->node_zones + zone_for_memory(nid, start, size, 0, for_device); - return __add_pages(nid, zone, start_pfn, nr_pages); + return __add_pages(nid, zone, start_pfn, nr_pages, !for_device); } #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 3348e60dd8ad..a3d549966b6a 100644 --- 
a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -195,7 +195,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) continue; nr_pages = (start_pfn + size_pages > zone_end_pfn) ? zone_end_pfn - start_pfn : size_pages; - rc = __add_pages(nid, zone, start_pfn, nr_pages); + rc = __add_pages(nid, zone, start_pfn, nr_pages, !for_device); if (rc) break; start_pfn += nr_pages; diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 75491862d900..a9d57f75ae8c 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -498,7 +498,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) ret = __add_pages(nid, pgdat->node_zones + zone_for_memory(nid, start, size, ZONE_NORMAL, for_device), - start_pfn, nr_pages); + start_pfn, nr_pages, !for_device); if (unlikely(ret)) printk("%s: Failed, __add_pages() == %d\n", __func__, ret); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 99fb83819a5f..94594b889144 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -831,7 +831,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - return __add_pages(nid, zone, start_pfn, nr_pages); + return __add_pages(nid, zone, start_pfn, nr_pages, !for_device); } #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index dae6a5e5ad4a..9d64291459b6 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -787,7 +787,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) init_memory_mapping(start, start + size); - ret = __add_pages(nid, zone, start_pfn, nr_pages); + ret = __add_pages(nid, zone, start_pfn, nr_pages, !for_device); WARN_ON_ONCE(ret); /* update max_pfn, max_low_pfn and high_memory */ -- cgit From f1dd2cd13c4bbbc9a7c4617b3b034fa643de98fe Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:38:11 -0700 Subject: mm, memory_hotplug: do not 
associate hotadded memory to zones until online The current memory hotplug implementation relies on having all the struct pages associate with a zone/node during the physical hotplug phase (arch_add_memory->__add_pages->__add_section->__add_zone). In the vast majority of cases this means that they are added to ZONE_NORMAL. This has been so since 9d99aaa31f59 ("[PATCH] x86_64: Support memory hotadd without sparsemem") and it wasn't a big deal back then because movable onlining didn't exist yet. Much later memory hotplug wanted to (ab)use ZONE_MOVABLE for movable onlining 511c2aba8f07 ("mm, memory-hotplug: dynamic configure movable memory and portion memory") and then things got more complicated. Rather than reconsidering the zone association which was no longer needed (because the memory hotplug already depended on SPARSEMEM) a convoluted semantic of zone shifting has been developed. Only the currently last memblock or the one adjacent to the zone_movable can be onlined movable. This essentially means that the online type changes as the new memblocks are added. Let's simulate memory hot online manually $ echo 0x100000000 > /sys/devices/system/memory/probe $ grep . /sys/devices/system/memory/memory32/valid_zones Normal Movable $ echo $((0x100000000+(128<<20))) > /sys/devices/system/memory/probe $ grep . /sys/devices/system/memory/memory3?/valid_zones /sys/devices/system/memory/memory32/valid_zones:Normal /sys/devices/system/memory/memory33/valid_zones:Normal Movable $ echo $((0x100000000+2*(128<<20))) > /sys/devices/system/memory/probe $ grep . /sys/devices/system/memory/memory3?/valid_zones /sys/devices/system/memory/memory32/valid_zones:Normal /sys/devices/system/memory/memory33/valid_zones:Normal /sys/devices/system/memory/memory34/valid_zones:Normal Movable $ echo online_movable > /sys/devices/system/memory/memory34/state $ grep . 
/sys/devices/system/memory/memory3?/valid_zones /sys/devices/system/memory/memory32/valid_zones:Normal /sys/devices/system/memory/memory33/valid_zones:Normal Movable /sys/devices/system/memory/memory34/valid_zones:Movable Normal This is an awkward semantic because an udev event is sent as soon as the block is onlined and an udev handler might want to online it based on some policy (e.g. association with a node) but it will inherently race with new blocks showing up. This patch changes the physical online phase to not associate pages with any zone at all. All the pages are just marked reserved and wait for the onlining phase to be associated with the zone as per the online request. There are only two requirements - existing ZONE_NORMAL and ZONE_MOVABLE cannot overlap - ZONE_NORMAL precedes ZONE_MOVABLE in physical addresses the latter one is not an inherent requirement and can be changed in the future. It preserves the current behavior and made the code slightly simpler. This is subject to change in future. This means that the same physical online steps as above will lead to the following state: Normal Movable /sys/devices/system/memory/memory32/valid_zones:Normal Movable /sys/devices/system/memory/memory33/valid_zones:Normal Movable /sys/devices/system/memory/memory32/valid_zones:Normal Movable /sys/devices/system/memory/memory33/valid_zones:Normal Movable /sys/devices/system/memory/memory34/valid_zones:Normal Movable /sys/devices/system/memory/memory32/valid_zones:Normal Movable /sys/devices/system/memory/memory33/valid_zones:Normal Movable /sys/devices/system/memory/memory34/valid_zones:Movable Implementation: The current move_pfn_range is reimplemented to check the above requirements (allow_online_pfn_range) and then updates the respective zone (move_pfn_range_to_zone), the pgdat and links all the pages in the pfn range with the zone/node. __add_pages is updated to not require the zone and only initializes sections in the range. 
This allowed to simplify the arch_add_memory code (s390 could get rid of quite some of code). devm_memremap_pages is the only user of arch_add_memory which relies on the zone association because it only hooks into the memory hotplug only half way. It uses it to associate the new memory with ZONE_DEVICE but doesn't allow it to be {on,off}lined via sysfs. This means that this particular code path has to call move_pfn_range_to_zone explicitly. The original zone shifting code is kept in place and will be removed in the follow up patch for an easier review. Please note that this patch also changes the original behavior when offlining a memory block adjacent to another zone (Normal vs. Movable) used to allow to change its movable type. This will be handled later. [richard.weiyang@gmail.com: simplify zone_intersects()] Link: http://lkml.kernel.org/r/20170616092335.5177-1-richard.weiyang@gmail.com [richard.weiyang@gmail.com: remove duplicate call for set_page_links] Link: http://lkml.kernel.org/r/20170616092335.5177-2-richard.weiyang@gmail.com [akpm@linux-foundation.org: remove unused local `i'] Link: http://lkml.kernel.org/r/20170515085827.16474-12-mhocko@kernel.org Signed-off-by: Michal Hocko Signed-off-by: Wei Yang Tested-by: Dan Williams Tested-by: Reza Arbab Acked-by: Heiko Carstens # For s390 bits Acked-by: Vlastimil Babka Cc: Martin Schwidefsky Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Balbir Singh Cc: Daniel Kiper Cc: David Rientjes Cc: Igor Mammedov Cc: Jerome Glisse Cc: Joonsoo Kim Cc: Mel Gorman Cc: Tobias Regnery Cc: Toshi Kani Cc: Vitaly Kuznetsov Cc: Xishi Qiu Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/mm/init.c | 9 +-------- arch/powerpc/mm/mem.c | 10 +--------- arch/s390/mm/init.c | 30 ++---------------------------- arch/sh/mm/init.c | 8 +------- arch/x86/mm/init_32.c | 5 +---- arch/x86/mm/init_64.c | 9 +-------- 6 files changed, 7 insertions(+), 64 deletions(-) (limited to 'arch') diff --git 
a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 39e2aeb4669d..80db57d063d0 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -648,18 +648,11 @@ mem_init (void) #ifdef CONFIG_MEMORY_HOTPLUG int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { - pg_data_t *pgdat; - struct zone *zone; unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int ret; - pgdat = NODE_DATA(nid); - - zone = pgdat->node_zones + - zone_for_memory(nid, start, size, ZONE_NORMAL, for_device); - ret = __add_pages(nid, zone, start_pfn, nr_pages, !for_device); - + ret = __add_pages(nid, start_pfn, nr_pages, !for_device); if (ret) printk("%s: Problem encountered in __add_pages() as ret=%d\n", __func__, ret); diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index e6b2e6618b6c..72c46eb53215 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -128,16 +128,12 @@ int __weak remove_section_mapping(unsigned long start, unsigned long end) int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { - struct pglist_data *pgdata; - struct zone *zone; unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int rc; resize_hpt_for_hotplug(memblock_phys_mem_size()); - pgdata = NODE_DATA(nid); - start = (unsigned long)__va(start); rc = create_section_mapping(start, start + size); if (rc) { @@ -147,11 +143,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) return -EFAULT; } - /* this should work for most non-highmem platforms */ - zone = pgdata->node_zones + - zone_for_memory(nid, start, size, 0, for_device); - - return __add_pages(nid, zone, start_pfn, nr_pages, !for_device); + return __add_pages(nid, start_pfn, nr_pages, !for_device); } #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index a3d549966b6a..bfa918e3592b 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -168,41 +168,15 @@ unsigned long 
memory_block_size_bytes(void) #ifdef CONFIG_MEMORY_HOTPLUG int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { - unsigned long zone_start_pfn, zone_end_pfn, nr_pages; unsigned long start_pfn = PFN_DOWN(start); unsigned long size_pages = PFN_DOWN(size); - pg_data_t *pgdat = NODE_DATA(nid); - struct zone *zone; - int rc, i; + int rc; rc = vmem_add_mapping(start, size); if (rc) return rc; - for (i = 0; i < MAX_NR_ZONES; i++) { - zone = pgdat->node_zones + i; - if (zone_idx(zone) != ZONE_MOVABLE) { - /* Add range within existing zone limits, if possible */ - zone_start_pfn = zone->zone_start_pfn; - zone_end_pfn = zone->zone_start_pfn + - zone->spanned_pages; - } else { - /* Add remaining range to ZONE_MOVABLE */ - zone_start_pfn = start_pfn; - zone_end_pfn = start_pfn + size_pages; - } - if (start_pfn < zone_start_pfn || start_pfn >= zone_end_pfn) - continue; - nr_pages = (start_pfn + size_pages > zone_end_pfn) ? - zone_end_pfn - start_pfn : size_pages; - rc = __add_pages(nid, zone, start_pfn, nr_pages, !for_device); - if (rc) - break; - start_pfn += nr_pages; - size_pages -= nr_pages; - if (!size_pages) - break; - } + rc = __add_pages(nid, start_pfn, size_pages, !for_device); if (rc) vmem_remove_mapping(start, size); return rc; diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index a9d57f75ae8c..3813a610a2bb 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -487,18 +487,12 @@ void free_initrd_mem(unsigned long start, unsigned long end) #ifdef CONFIG_MEMORY_HOTPLUG int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { - pg_data_t *pgdat; unsigned long start_pfn = PFN_DOWN(start); unsigned long nr_pages = size >> PAGE_SHIFT; int ret; - pgdat = NODE_DATA(nid); - /* We only have ZONE_NORMAL, so this is easy.. 
*/ - ret = __add_pages(nid, pgdat->node_zones + - zone_for_memory(nid, start, size, ZONE_NORMAL, - for_device), - start_pfn, nr_pages, !for_device); + ret = __add_pages(nid, start_pfn, nr_pages, !for_device); if (unlikely(ret)) printk("%s: Failed, __add_pages() == %d\n", __func__, ret); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 94594b889144..a424066d0552 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -825,13 +825,10 @@ void __init mem_init(void) #ifdef CONFIG_MEMORY_HOTPLUG int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { - struct pglist_data *pgdata = NODE_DATA(nid); - struct zone *zone = pgdata->node_zones + - zone_for_memory(nid, start, size, ZONE_HIGHMEM, for_device); unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - return __add_pages(nid, zone, start_pfn, nr_pages, !for_device); + return __add_pages(nid, start_pfn, nr_pages, !for_device); } #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 9d64291459b6..06afa84ac0a0 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -772,22 +772,15 @@ static void update_end_of_memory_vars(u64 start, u64 size) } } -/* - * Memory is added always to NORMAL zone. This means you will never get - * additional DMA/DMA32 memory. 
- */ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { - struct pglist_data *pgdat = NODE_DATA(nid); - struct zone *zone = pgdat->node_zones + - zone_for_memory(nid, start, size, ZONE_NORMAL, for_device); unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int ret; init_memory_mapping(start, start + size); - ret = __add_pages(nid, zone, start_pfn, nr_pages, !for_device); + ret = __add_pages(nid, start_pfn, nr_pages, !for_device); WARN_ON_ONCE(ret); /* update max_pfn, max_low_pfn and high_memory */ -- cgit From 3d79a728f9b2e6ddcce4e02c91c4de1076548a4c Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:38:21 -0700 Subject: mm, memory_hotplug: replace for_device by want_memblock in arch_add_memory arch_add_memory gets for_device argument which then controls whether we want to create memblocks for created memory sections. Simplify the logic by telling whether we want memblocks directly rather than going through pointless negation. This also makes the api easier to understand because it is clear what we want rather than nothing telling for_device which can mean anything. This shouldn't introduce any functional change. 
Link: http://lkml.kernel.org/r/20170515085827.16474-13-mhocko@kernel.org Signed-off-by: Michal Hocko Tested-by: Dan Williams Acked-by: Vlastimil Babka Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Balbir Singh Cc: Daniel Kiper Cc: David Rientjes Cc: Heiko Carstens Cc: Igor Mammedov Cc: Jerome Glisse Cc: Joonsoo Kim Cc: Martin Schwidefsky Cc: Mel Gorman Cc: Reza Arbab Cc: Tobias Regnery Cc: Toshi Kani Cc: Vitaly Kuznetsov Cc: Xishi Qiu Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/mm/init.c | 4 ++-- arch/powerpc/mm/mem.c | 4 ++-- arch/s390/mm/init.c | 4 ++-- arch/sh/mm/init.c | 4 ++-- arch/x86/mm/init_32.c | 4 ++-- arch/x86/mm/init_64.c | 4 ++-- 6 files changed, 12 insertions(+), 12 deletions(-) (limited to 'arch') diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 80db57d063d0..a4e8d6bd9cfa 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -646,13 +646,13 @@ mem_init (void) } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, bool for_device) +int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int ret; - ret = __add_pages(nid, start_pfn, nr_pages, !for_device); + ret = __add_pages(nid, start_pfn, nr_pages, want_memblock); if (ret) printk("%s: Problem encountered in __add_pages() as ret=%d\n", __func__, ret); diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 72c46eb53215..de5a90e1ceaa 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -126,7 +126,7 @@ int __weak remove_section_mapping(unsigned long start, unsigned long end) return -ENODEV; } -int arch_add_memory(int nid, u64 start, u64 size, bool for_device) +int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; @@ -143,7 +143,7 @@ int arch_add_memory(int nid, u64 start, u64 
size, bool for_device) return -EFAULT; } - return __add_pages(nid, start_pfn, nr_pages, !for_device); + return __add_pages(nid, start_pfn, nr_pages, want_memblock); } #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index bfa918e3592b..8111694ce55a 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -166,7 +166,7 @@ unsigned long memory_block_size_bytes(void) } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, bool for_device) +int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) { unsigned long start_pfn = PFN_DOWN(start); unsigned long size_pages = PFN_DOWN(size); @@ -176,7 +176,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) if (rc) return rc; - rc = __add_pages(nid, start_pfn, size_pages, !for_device); + rc = __add_pages(nid, start_pfn, size_pages, want_memblock); if (rc) vmem_remove_mapping(start, size); return rc; diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 3813a610a2bb..bf726af5f1a5 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -485,14 +485,14 @@ void free_initrd_mem(unsigned long start, unsigned long end) #endif #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, bool for_device) +int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) { unsigned long start_pfn = PFN_DOWN(start); unsigned long nr_pages = size >> PAGE_SHIFT; int ret; /* We only have ZONE_NORMAL, so this is easy.. 
*/ - ret = __add_pages(nid, start_pfn, nr_pages, !for_device); + ret = __add_pages(nid, start_pfn, nr_pages, want_memblock); if (unlikely(ret)) printk("%s: Failed, __add_pages() == %d\n", __func__, ret); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index a424066d0552..8a64a6f2848d 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -823,12 +823,12 @@ void __init mem_init(void) } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, bool for_device) +int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - return __add_pages(nid, start_pfn, nr_pages, !for_device); + return __add_pages(nid, start_pfn, nr_pages, want_memblock); } #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 06afa84ac0a0..136422d7d539 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -772,7 +772,7 @@ static void update_end_of_memory_vars(u64 start, u64 size) } } -int arch_add_memory(int nid, u64 start, u64 size, bool for_device) +int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; @@ -780,7 +780,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) init_memory_mapping(start, start + size); - ret = __add_pages(nid, start_pfn, nr_pages, !for_device); + ret = __add_pages(nid, start_pfn, nr_pages, want_memblock); WARN_ON_ONCE(ret); /* update max_pfn, max_low_pfn and high_memory */ -- cgit From 50791e6de0b5f2fa74b1a5211edd4d2a8354cc53 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 6 Jul 2017 15:38:59 -0700 Subject: powerpc/hugetlb: add follow_huge_pd implementation for ppc64 Link: http://lkml.kernel.org/r/1494926612-23928-8-git-send-email-aneesh.kumar@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Naoya 
Horiguchi Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/mm/hugetlbpage.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index a4f33de4008e..f5ec043d49df 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -17,6 +17,8 @@ #include #include #include +#include +#include #include #include #include @@ -617,6 +619,46 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb, } while (addr = next, addr != end); } +/* + * 64 bit book3s use generic follow_page_mask + */ +#ifdef CONFIG_PPC_BOOK3S_64 + +struct page *follow_huge_pd(struct vm_area_struct *vma, + unsigned long address, hugepd_t hpd, + int flags, int pdshift) +{ + pte_t *ptep; + spinlock_t *ptl; + struct page *page = NULL; + unsigned long mask; + int shift = hugepd_shift(hpd); + struct mm_struct *mm = vma->vm_mm; + +retry: + ptl = &mm->page_table_lock; + spin_lock(ptl); + + ptep = hugepte_offset(hpd, address, pdshift); + if (pte_present(*ptep)) { + mask = (1UL << shift) - 1; + page = pte_page(*ptep); + page += ((address & mask) >> PAGE_SHIFT); + if (flags & FOLL_GET) + get_page(page); + } else { + if (is_hugetlb_entry_migration(*ptep)) { + spin_unlock(ptl); + __migration_entry_wait(mm, ptep, ptl); + goto retry; + } + } + spin_unlock(ptl); + return page; +} + +#else /* !CONFIG_PPC_BOOK3S_64 */ + /* * We are holding mmap_sem, so a parallel huge page collapse cannot run. * To prevent hugepage split, disable irq. 
@@ -672,6 +714,7 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address, BUG(); return NULL; } +#endif static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, unsigned long sz) -- cgit From 28c057160e8ae7538e5237744e6ec845d134975a Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 6 Jul 2017 15:39:02 -0700 Subject: powerpc/mm/hugetlb: remove follow_huge_addr for powerpc With generic code now handling hugetlb entries at pgd level and also supporting hugepage directory format, we can now remove the powerpc specific follow_huge_addr implementation. Link: http://lkml.kernel.org/r/1494926612-23928-9-git-send-email-aneesh.kumar@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Naoya Horiguchi Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/mm/hugetlbpage.c | 64 ------------------------------------------- 1 file changed, 64 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index f5ec043d49df..f0b97d4f4387 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -619,11 +619,6 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb, } while (addr = next, addr != end); } -/* - * 64 bit book3s use generic follow_page_mask - */ -#ifdef CONFIG_PPC_BOOK3S_64 - struct page *follow_huge_pd(struct vm_area_struct *vma, unsigned long address, hugepd_t hpd, int flags, int pdshift) @@ -657,65 +652,6 @@ retry: return page; } -#else /* !CONFIG_PPC_BOOK3S_64 */ - -/* - * We are holding mmap_sem, so a parallel huge page collapse cannot run. - * To prevent hugepage split, disable irq.
- */ -struct page * -follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) -{ - bool is_thp; - pte_t *ptep, pte; - unsigned shift; - unsigned long mask, flags; - struct page *page = ERR_PTR(-EINVAL); - - local_irq_save(flags); - ptep = find_linux_pte_or_hugepte(mm->pgd, address, &is_thp, &shift); - if (!ptep) - goto no_page; - pte = READ_ONCE(*ptep); - /* - * Verify it is a huge page else bail. - * Transparent hugepages are handled by generic code. We can skip them - * here. - */ - if (!shift || is_thp) - goto no_page; - - if (!pte_present(pte)) { - page = NULL; - goto no_page; - } - mask = (1UL << shift) - 1; - page = pte_page(pte); - if (page) - page += (address & mask) / PAGE_SIZE; - -no_page: - local_irq_restore(flags); - return page; -} - -struct page * -follow_huge_pmd(struct mm_struct *mm, unsigned long address, - pmd_t *pmd, int write) -{ - BUG(); - return NULL; -} - -struct page * -follow_huge_pud(struct mm_struct *mm, unsigned long address, - pud_t *pud, int write) -{ - BUG(); - return NULL; -} -#endif - static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, unsigned long sz) { -- cgit From f7fb506fef6e8701bdb0ea7bb4f01148efd7416c Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 6 Jul 2017 15:39:05 -0700 Subject: powerpc/hugetlb: enable hugetlb migration for ppc64 Link: http://lkml.kernel.org/r/1494926612-23928-10-git-send-email-aneesh.kumar@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Naoya Horiguchi Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/platforms/Kconfig.cputype | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 684e886eaae4..d5ea976509d7 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -350,6 +350,11 @@ config 
PPC_RADIX_MMU is only implemented by IBM Power9 CPUs, if you don't have one of them you can probably disable this. +config ARCH_ENABLE_HUGEPAGE_MIGRATION + def_bool y + depends on PPC_BOOK3S_64 && HUGETLB_PAGE && MIGRATION + + config PPC_MMU_NOHASH def_bool y depends on !PPC_STD_MMU -- cgit From e1073d1e7920946ac4776a619cc40668b9e1401b Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 6 Jul 2017 15:39:17 -0700 Subject: mm/hugetlb: clean up ARCH_HAS_GIGANTIC_PAGE This moves the #ifdef in C code to a Kconfig dependency. Also we move the gigantic_page_supported() function to be arch specific. This allows architectures to conditionally enable runtime allocation of gigantic huge page. Architectures like ppc64 supports different gigantic huge page size (16G and 1G) based on the translation mode selected. This provides an opportunity for ppc64 to enable runtime allocation only w.r.t 1G hugepage. No functional change in this patch. Link: http://lkml.kernel.org/r/1494995292-4443-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Michael Ellerman (powerpc) Cc: Anshuman Khandual Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/Kconfig | 2 +- arch/arm64/include/asm/hugetlb.h | 4 ++++ arch/s390/Kconfig | 2 +- arch/s390/include/asm/hugetlb.h | 3 +++ arch/x86/Kconfig | 2 +- arch/x86/include/asm/hugetlb.h | 4 ++++ 6 files changed, 14 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 9f7a934ff707..ff925ece82d6 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -13,7 +13,7 @@ config ARM64 select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_GCOV_PROFILE_ALL - select ARCH_HAS_GIGANTIC_PAGE + select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA select ARCH_HAS_KCOV select ARCH_HAS_SET_MEMORY select ARCH_HAS_SG_CHAIN diff --git a/arch/arm64/include/asm/hugetlb.h 
b/arch/arm64/include/asm/hugetlb.h index bbc1e35aa601..793bd73b0d07 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -83,4 +83,8 @@ extern void huge_ptep_set_wrprotect(struct mm_struct *mm, extern void huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); +#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE +static inline bool gigantic_page_supported(void) { return true; } +#endif + #endif /* __ASM_HUGETLB_H */ diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 37abe86e5bc9..7eeb75d758c1 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -68,7 +68,7 @@ config S390 select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_GCOV_PROFILE_ALL - select ARCH_HAS_GIGANTIC_PAGE + select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA select ARCH_HAS_KCOV select ARCH_HAS_SET_MEMORY select ARCH_HAS_SG_CHAIN diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index cd546a245c68..89057b2cc8fe 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -112,4 +112,7 @@ static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot) return pte_modify(pte, newprot); } +#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE +static inline bool gigantic_page_supported(void) { return true; } +#endif #endif /* _ASM_S390_HUGETLB_H */ diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1dbbe38f6ec0..fe53a3aa805a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -22,7 +22,7 @@ config X86_64 def_bool y depends on 64BIT # Options that are inherently 64-bit kernel only: - select ARCH_HAS_GIGANTIC_PAGE + select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA select ARCH_SUPPORTS_INT128 select ARCH_USE_CMPXCHG_LOCKREF select HAVE_ARCH_SOFT_DIRTY diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h index 3a106165e03a..535af0f2d8ac 100644 --- a/arch/x86/include/asm/hugetlb.h +++ 
b/arch/x86/include/asm/hugetlb.h @@ -85,4 +85,8 @@ static inline void arch_clear_hugepage_flags(struct page *page) { } +#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE +static inline bool gigantic_page_supported(void) { return true; } +#endif + #endif /* _ASM_X86_HUGETLB_H */ -- cgit From 40692eb5eea209c2dd55857f44b4e1d7206e91d6 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 6 Jul 2017 15:39:20 -0700 Subject: powerpc/mm/hugetlb: add support for 1G huge pages POWER9 supports hugepages of size 2M and 1G in radix MMU mode. This patch enables the usage of 1G page size for hugetlbfs. This also updates the helper so that we can do 1G page allocation at runtime. We still don't enable 1G page size on DD1 version. This is to avoid doing the workaround mentioned in commit 6d3a0379ebdc ("powerpc/mm: Add radix__tlb_flush_pte_p9_dd1()"). Link: http://lkml.kernel.org/r/1494995292-4443-2-git-send-email-aneesh.kumar@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Michael Ellerman (powerpc) Cc: Anshuman Khandual Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/include/asm/book3s/64/hugetlb.h | 10 ++++++++++ arch/powerpc/mm/hugetlbpage.c | 7 +++++-- arch/powerpc/platforms/Kconfig.cputype | 1 + 3 files changed, 16 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb.h b/arch/powerpc/include/asm/book3s/64/hugetlb.h index 6666cd366596..5c28bd6f2ae1 100644 --- a/arch/powerpc/include/asm/book3s/64/hugetlb.h +++ b/arch/powerpc/include/asm/book3s/64/hugetlb.h @@ -50,4 +50,14 @@ static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, else return entry; } + +#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE +static inline bool gigantic_page_supported(void) +{ + if (radix_enabled()) + return true; + return false; +} +#endif + #endif diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index f0b97d4f4387..1816b965a142 100644 ---
a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -742,8 +742,11 @@ static int __init add_huge_page_size(unsigned long long size) * Hash: 16M and 16G */ if (radix_enabled()) { - if (mmu_psize != MMU_PAGE_2M) - return -EINVAL; + if (mmu_psize != MMU_PAGE_2M) { + if (cpu_has_feature(CPU_FTR_POWER9_DD1) || + (mmu_psize != MMU_PAGE_1G)) + return -EINVAL; + } } else { if (mmu_psize != MMU_PAGE_16M && mmu_psize != MMU_PAGE_16G) return -EINVAL; diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index d5ea976509d7..2f629e0551e9 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -344,6 +344,7 @@ config PPC_STD_MMU_64 config PPC_RADIX_MMU bool "Radix MMU Support" depends on PPC_BOOK3S_64 + select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA default y help Enable support for the Power ISA 3.0 Radix style MMU. Currently this -- cgit From bb9dd3df8ee9a0995da4c35251e6a8e2eefe0b41 Mon Sep 17 00:00:00 2001 From: Steve Capper Date: Thu, 6 Jul 2017 15:39:29 -0700 Subject: arm64: hugetlb: refactor find_num_contig() Patch series "Support for contiguous pte hugepages", v4. This patchset updates the hugetlb code to fix issues arising from contiguous pte hugepages (such as on arm64). Compared to v3, This version addresses a build failure on arm64 by including two cleanup patches. Other than the arm64 cleanups, the rest are generic code changes. The remaining arm64 support based on these patches will be posted separately. The patches are based on v4.12-rc2. Previous related postings can be found at [0], [1], [2], and [3]. The patches fall into three categories - * Patch 1-2 - arm64 cleanups required to greatly simplify changing huge_pte_offset() prototype in Patch 5. Catalin, Will - are you happy for these patches to go via mm? 
* Patches 3-4 address issues with gup * Patches 5-8 relate to passing a size argument to hugepage helpers to disambiguate the size of the referred page. These changes are required to enable arch code to properly handle swap entries for contiguous pte hugepages. The changes to huge_pte_offset() (patch 5) touch multiple architectures but I've managed to minimise these changes for the other affected functions - huge_pte_clear() and set_huge_pte_at(). These patches gate the enabling of contiguous hugepages support on arm64 which has been requested for systems using !4k page granule. The ARM64 architecture supports two flavours of hugepages - * Block mappings at the pud/pmd level These are regular hugepages where a pmd or a pud page table entry points to a block of memory. Depending on the PAGE_SIZE in use the following size of block mappings are supported - PMD PUD --- --- 4K: 2M 1G 16K: 32M 64K: 512M For certain applications/usecases such as HPC and large enterprise workloads, folks are using 64k page size but the minimum hugepage size of 512MB isn't very practical. To overcome this ... * Using the Contiguous bit The architecture provides a contiguous bit in the translation table entry which acts as a hint to the mmu to indicate that it is one of a contiguous set of entries that can be cached in a single TLB entry. We use the contiguous bit in Linux to increase the mapping size at the pmd and pte (last) level. The number of supported contiguous entries varies by page size and level of the page table. Using the contiguous bit allows additional hugepage sizes - CONT PTE PMD CONT PMD PUD -------- --- -------- --- 4K: 64K 2M 32M 1G 16K: 2M 32M 1G 64K: 2M 512M 16G Of these, 64K with 4K and 2M with 64K pages have been explicitly requested by a few different users. 
Entries with the contiguous bit set are required to be modified all together - which makes things like memory poisoning and migration impossible to do correctly without knowing the size of hugepage being dealt with - the reason for adding size parameter to a few of the hugepage helpers in this series. This patch (of 8): As we regularly check for contiguous pte's in the huge accessors, remove this extra check from find_num_contig. [punit.agrawal@arm.com: resolve rebase conflicts due to patch re-ordering] Link: http://lkml.kernel.org/r/20170524115409.31309-2-punit.agrawal@arm.com Signed-off-by: Steve Capper Signed-off-by: Punit Agrawal Cc: David Woods Cc: Kirill A. Shutemov Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Naoya Horiguchi Cc: Mark Rutland Cc: Hillf Danton Cc: Michal Hocko Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/mm/hugetlbpage.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 69b8200b1cfd..710bf935a473 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -42,15 +42,13 @@ int pud_huge(pud_t pud) } static int find_num_contig(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte, size_t *pgsize) + pte_t *ptep, size_t *pgsize) { pgd_t *pgd = pgd_offset(mm, addr); pud_t *pud; pmd_t *pmd; *pgsize = PAGE_SIZE; - if (!pte_cont(pte)) - return 1; pud = pud_offset(pgd, addr); pmd = pmd_offset(pud, addr); if ((pte_t *)pmd == ptep) { @@ -65,15 +63,16 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, { size_t pgsize; int i; - int ncontig = find_num_contig(mm, addr, ptep, pte, &pgsize); + int ncontig; unsigned long pfn; pgprot_t hugeprot; - if (ncontig == 1) { + if (!pte_cont(pte)) { set_pte_at(mm, addr, ptep, pte); return; } + ncontig = find_num_contig(mm, addr, ptep, &pgsize); pfn = pte_pfn(pte); hugeprot = __pgprot(pte_val(pfn_pte(pfn, 
__pgprot(0))) ^ pte_val(pte)); for (i = 0; i < ncontig; i++) { @@ -188,7 +187,7 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, bool is_dirty = false; cpte = huge_pte_offset(mm, addr); - ncontig = find_num_contig(mm, addr, cpte, *cpte, &pgsize); + ncontig = find_num_contig(mm, addr, cpte, &pgsize); /* save the 1st pte to return */ pte = ptep_get_and_clear(mm, addr, cpte); for (i = 1, addr += pgsize; i < ncontig; ++i, addr += pgsize) { @@ -228,7 +227,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, cpte = huge_pte_offset(vma->vm_mm, addr); pfn = pte_pfn(*cpte); ncontig = find_num_contig(vma->vm_mm, addr, cpte, - *cpte, &pgsize); + &pgsize); for (i = 0; i < ncontig; ++i, ++cpte, addr += pgsize) { changed |= ptep_set_access_flags(vma, addr, cpte, pfn_pte(pfn, @@ -251,7 +250,7 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm, size_t pgsize = 0; cpte = huge_pte_offset(mm, addr); - ncontig = find_num_contig(mm, addr, cpte, *cpte, &pgsize); + ncontig = find_num_contig(mm, addr, cpte, &pgsize); for (i = 0; i < ncontig; ++i, ++cpte, addr += pgsize) ptep_set_wrprotect(mm, addr, cpte); } else { @@ -269,7 +268,7 @@ void huge_ptep_clear_flush(struct vm_area_struct *vma, cpte = huge_pte_offset(vma->vm_mm, addr); ncontig = find_num_contig(vma->vm_mm, addr, cpte, - *cpte, &pgsize); + &pgsize); for (i = 0; i < ncontig; ++i, ++cpte, addr += pgsize) ptep_clear_flush(vma, addr, cpte); } else { -- cgit From f0b38d65c9d0b42f3e6d861a18906d49441bf78e Mon Sep 17 00:00:00 2001 From: Steve Capper Date: Thu, 6 Jul 2017 15:39:33 -0700 Subject: arm64: hugetlb: remove spurious calls to huge_ptep_offset() We don't need to call huge_ptep_offset as our accessors are already supplied with the pte_t *. This patch removes those spurious calls. 
[punit.agrawal@arm.com: resolve rebase conflicts due to patch re-ordering] Link: http://lkml.kernel.org/r/20170524115409.31309-3-punit.agrawal@arm.com Signed-off-by: Steve Capper Signed-off-by: Punit Agrawal Cc: David Woods Cc: Kirill A. Shutemov Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Naoya Horiguchi Cc: Mark Rutland Cc: Hillf Danton Cc: Michal Hocko Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/mm/hugetlbpage.c | 37 ++++++++++++++----------------------- 1 file changed, 14 insertions(+), 23 deletions(-) (limited to 'arch') diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 710bf935a473..f89aa8fa5855 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -183,21 +183,19 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, if (pte_cont(*ptep)) { int ncontig, i; size_t pgsize; - pte_t *cpte; bool is_dirty = false; - cpte = huge_pte_offset(mm, addr); - ncontig = find_num_contig(mm, addr, cpte, &pgsize); + ncontig = find_num_contig(mm, addr, ptep, &pgsize); /* save the 1st pte to return */ - pte = ptep_get_and_clear(mm, addr, cpte); + pte = ptep_get_and_clear(mm, addr, ptep); for (i = 1, addr += pgsize; i < ncontig; ++i, addr += pgsize) { /* * If HW_AFDBM is enabled, then the HW could * turn on the dirty bit for any of the page * in the set, so check them all. 
*/ - ++cpte; - if (pte_dirty(ptep_get_and_clear(mm, addr, cpte))) + ++ptep; + if (pte_dirty(ptep_get_and_clear(mm, addr, ptep))) is_dirty = true; } if (is_dirty) @@ -213,8 +211,6 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty) { - pte_t *cpte; - if (pte_cont(pte)) { int ncontig, i, changed = 0; size_t pgsize = 0; @@ -224,12 +220,11 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte)); - cpte = huge_pte_offset(vma->vm_mm, addr); - pfn = pte_pfn(*cpte); - ncontig = find_num_contig(vma->vm_mm, addr, cpte, + pfn = pte_pfn(pte); + ncontig = find_num_contig(vma->vm_mm, addr, ptep, &pgsize); - for (i = 0; i < ncontig; ++i, ++cpte, addr += pgsize) { - changed |= ptep_set_access_flags(vma, addr, cpte, + for (i = 0; i < ncontig; ++i, ++ptep, addr += pgsize) { + changed |= ptep_set_access_flags(vma, addr, ptep, pfn_pte(pfn, hugeprot), dirty); @@ -246,13 +241,11 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm, { if (pte_cont(*ptep)) { int ncontig, i; - pte_t *cpte; size_t pgsize = 0; - cpte = huge_pte_offset(mm, addr); - ncontig = find_num_contig(mm, addr, cpte, &pgsize); - for (i = 0; i < ncontig; ++i, ++cpte, addr += pgsize) - ptep_set_wrprotect(mm, addr, cpte); + ncontig = find_num_contig(mm, addr, ptep, &pgsize); + for (i = 0; i < ncontig; ++i, ++ptep, addr += pgsize) + ptep_set_wrprotect(mm, addr, ptep); } else { ptep_set_wrprotect(mm, addr, ptep); } @@ -263,14 +256,12 @@ void huge_ptep_clear_flush(struct vm_area_struct *vma, { if (pte_cont(*ptep)) { int ncontig, i; - pte_t *cpte; size_t pgsize = 0; - cpte = huge_pte_offset(vma->vm_mm, addr); - ncontig = find_num_contig(vma->vm_mm, addr, cpte, + ncontig = find_num_contig(vma->vm_mm, addr, ptep, &pgsize); - for (i = 0; i < ncontig; ++i, ++cpte, addr += pgsize) - ptep_clear_flush(vma, addr, cpte); + for (i = 0; i < ncontig; ++i, ++ptep, addr += pgsize) + ptep_clear_flush(vma, addr, 
ptep); } else { ptep_clear_flush(vma, addr, ptep); } -- cgit From 7868a2087ec13ec4a5df0c5e00999863be132ba8 Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Thu, 6 Jul 2017 15:39:42 -0700 Subject: mm/hugetlb: add size parameter to huge_pte_offset() A poisoned or migrated hugepage is stored as a swap entry in the page tables. On architectures that support hugepages consisting of contiguous page table entries (such as on arm64) this leads to ambiguity in determining the page table entry to return in huge_pte_offset() when a poisoned entry is encountered. Let's remove the ambiguity by adding a size parameter to convey additional information about the requested address. Also fixup the definition/usage of huge_pte_offset() throughout the tree. Link: http://lkml.kernel.org/r/20170522133604.11392-4-punit.agrawal@arm.com Signed-off-by: Punit Agrawal Acked-by: Steve Capper Cc: Catalin Marinas Cc: Will Deacon Cc: Tony Luck Cc: Fenghua Yu Cc: James Hogan (odd fixer:METAG ARCHITECTURE) Cc: Ralf Baechle (supporter:MIPS) Cc: "James E.J. Bottomley" Cc: Helge Deller Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Yoshinori Sato Cc: Rich Felker Cc: "David S. Miller" Cc: Chris Metcalf Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Alexander Viro Cc: Michal Hocko Cc: Mike Kravetz Cc: Naoya Horiguchi Cc: "Aneesh Kumar K.V" Cc: "Kirill A. 
Shutemov" Cc: Hillf Danton Cc: Mark Rutland Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/mm/hugetlbpage.c | 3 ++- arch/ia64/mm/hugetlbpage.c | 4 ++-- arch/metag/mm/hugetlbpage.c | 3 ++- arch/mips/mm/hugetlbpage.c | 3 ++- arch/parisc/mm/hugetlbpage.c | 3 ++- arch/powerpc/mm/hugetlbpage.c | 2 +- arch/s390/mm/hugetlbpage.c | 3 ++- arch/sh/mm/hugetlbpage.c | 3 ++- arch/sparc/mm/hugetlbpage.c | 3 ++- arch/tile/mm/hugetlbpage.c | 3 ++- arch/x86/mm/hugetlbpage.c | 2 +- 11 files changed, 20 insertions(+), 12 deletions(-) (limited to 'arch') diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index f89aa8fa5855..656e0ece2289 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -131,7 +131,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, return pte; } -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pgd; pud_t *pud; diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c index 85de86d36fdf..ae35140332f7 100644 --- a/arch/ia64/mm/hugetlbpage.c +++ b/arch/ia64/mm/hugetlbpage.c @@ -44,7 +44,7 @@ huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) } pte_t * -huge_pte_offset (struct mm_struct *mm, unsigned long addr) +huge_pte_offset (struct mm_struct *mm, unsigned long addr, unsigned long sz) { unsigned long taddr = htlbpage_to_page(addr); pgd_t *pgd; @@ -92,7 +92,7 @@ struct page *follow_huge_addr(struct mm_struct *mm, unsigned long addr, int writ if (REGION_NUMBER(addr) != RGN_HPAGE) return ERR_PTR(-EINVAL); - ptep = huge_pte_offset(mm, addr); + ptep = huge_pte_offset(mm, addr, HPAGE_SIZE); if (!ptep || pte_none(*ptep)) return NULL; page = pte_page(*ptep); diff --git a/arch/metag/mm/hugetlbpage.c b/arch/metag/mm/hugetlbpage.c index db1b7da91e4f..67fd53e2935a 100644 --- a/arch/metag/mm/hugetlbpage.c +++ b/arch/metag/mm/hugetlbpage.c @@ -74,7 +74,8 @@ pte_t 
*huge_pte_alloc(struct mm_struct *mm, return pte; } -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pgd; pud_t *pud; diff --git a/arch/mips/mm/hugetlbpage.c b/arch/mips/mm/hugetlbpage.c index 74aa6f62468f..cef152234312 100644 --- a/arch/mips/mm/hugetlbpage.c +++ b/arch/mips/mm/hugetlbpage.c @@ -36,7 +36,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, return pte; } -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, + unsigned long sz) { pgd_t *pgd; pud_t *pud; diff --git a/arch/parisc/mm/hugetlbpage.c b/arch/parisc/mm/hugetlbpage.c index aa50ac090e9b..5eb8f633b282 100644 --- a/arch/parisc/mm/hugetlbpage.c +++ b/arch/parisc/mm/hugetlbpage.c @@ -69,7 +69,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, return pte; } -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pgd; pud_t *pud; diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 1816b965a142..c41dc44472c5 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -57,7 +57,7 @@ static unsigned nr_gpages; #define hugepd_none(hpd) (hpd_val(hpd) == 0) -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz) { /* Only called for hugetlbfs pages, hence can ignore THP */ return __find_linux_pte_or_hugepte(mm->pgd, addr, NULL, NULL); diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index d3a5e39756f6..44a8e6f0391e 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -180,7 +180,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, return (pte_t *) pmdp; } -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) 
+pte_t *huge_pte_offset(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pgdp; p4d_t *p4dp; diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c index cc948db74878..d2412d2d6462 100644 --- a/arch/sh/mm/hugetlbpage.c +++ b/arch/sh/mm/hugetlbpage.c @@ -42,7 +42,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, return pte; } -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pgd; pud_t *pud; diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index 88855e383b34..28ee8d8ffa07 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c @@ -277,7 +277,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, return pte; } -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pgd; pud_t *pud; diff --git a/arch/tile/mm/hugetlbpage.c b/arch/tile/mm/hugetlbpage.c index 03e5cc4e76e4..0986d426a413 100644 --- a/arch/tile/mm/hugetlbpage.c +++ b/arch/tile/mm/hugetlbpage.c @@ -102,7 +102,8 @@ static pte_t *get_pte(pte_t *base, int index, int level) return ptep; } -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pgd; pud_t *pud; diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index adad702b39cd..2824607df108 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -33,7 +33,7 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) if (!vma || !is_vm_hugetlb_page(vma)) return ERR_PTR(-EINVAL); - pte = huge_pte_offset(mm, address); + pte = huge_pte_offset(mm, address, vma_mmu_pagesize(vma)); /* hugetlb should be locked, and hence, prefaulted */ WARN_ON(!pte || pte_none(*pte)); -- cgit From 9386fac34c7cbe39013410b01348e284652ca1cf Mon Sep 17 00:00:00 
2001 From: Punit Agrawal Date: Thu, 6 Jul 2017 15:39:46 -0700 Subject: mm/hugetlb: allow architectures to override huge_pte_clear() When unmapping a hugepage range, huge_pte_clear() is used to clear the page table entries that are marked as not present. huge_pte_clear() internally just ends up calling pte_clear() which does not correctly deal with hugepages consisting of contiguous page table entries. Add a size argument to address this issue and allow architectures to override huge_pte_clear() by wrapping it in a #ifndef block. Update s390 implementation with the size parameter as well. Note that the change only affects huge_pte_clear() - the other generic hugetlb functions don't need any change. Link: http://lkml.kernel.org/r/20170522162555.4313-1-punit.agrawal@arm.com Signed-off-by: Punit Agrawal Acked-by: Martin Schwidefsky [s390 bits] Cc: Heiko Carstens Cc: Arnd Bergmann Cc: "Aneesh Kumar K.V" Cc: Mike Kravetz Cc: Catalin Marinas Cc: Will Deacon Cc: Naoya Horiguchi Cc: "Kirill A. Shutemov" Cc: Steve Capper Cc: Mark Rutland Cc: Hillf Danton Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/include/asm/hugetlb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index 89057b2cc8fe..d95869ce3ca2 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -39,7 +39,7 @@ static inline int prepare_hugepage_range(struct file *file, #define arch_clear_hugepage_flags(page) do { } while (0) static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) + pte_t *ptep, unsigned long sz) { if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) pte_val(*ptep) = _REGION3_ENTRY_EMPTY; -- cgit From 00f3ca2c2d6635d85108571c4dd9a29088668662 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 6 Jul 2017 15:40:52 -0700 Subject: mm: memcontrol: per-lruvec stats 
infrastructure lruvecs are at the intersection of the NUMA node and memcg, which is the scope for most paging activity. Introduce a convenient accounting infrastructure that maintains statistics per node, per memcg, and the lruvec itself. Then convert over accounting sites for statistics that are already tracked in both nodes and memcgs and can be easily switched. [hannes@cmpxchg.org: fix crash in the new cgroup stat keeping code] Link: http://lkml.kernel.org/r/20170531171450.GA10481@cmpxchg.org [hannes@cmpxchg.org: don't track uncharged pages at all] Link: http://lkml.kernel.org/r/20170605175254.GA8547@cmpxchg.org [hannes@cmpxchg.org: add missing free_percpu()] Link: http://lkml.kernel.org/r/20170605175354.GB8547@cmpxchg.org [linux@roeck-us.net: hexagon: fix build error caused by include file order] Link: http://lkml.kernel.org/r/20170617153721.GA4382@roeck-us.net Link: http://lkml.kernel.org/r/20170530181724.27197-6-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Signed-off-by: Guenter Roeck Acked-by: Vladimir Davydov Cc: Josef Bacik Cc: Michal Hocko Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/hexagon/include/asm/pgtable.h | 1 - arch/hexagon/kernel/asm-offsets.c | 1 - arch/hexagon/mm/vm_tlb.c | 1 + 3 files changed, 1 insertion(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/hexagon/include/asm/pgtable.h b/arch/hexagon/include/asm/pgtable.h index 24a9177fb897..aef02f7ca8aa 100644 --- a/arch/hexagon/include/asm/pgtable.h +++ b/arch/hexagon/include/asm/pgtable.h @@ -24,7 +24,6 @@ /* * Page table definitions for Qualcomm Hexagon processor. 
*/ -#include #include #define __ARCH_USE_5LEVEL_HACK #include diff --git a/arch/hexagon/kernel/asm-offsets.c b/arch/hexagon/kernel/asm-offsets.c index 308be68d4fb3..3980c0407aa1 100644 --- a/arch/hexagon/kernel/asm-offsets.c +++ b/arch/hexagon/kernel/asm-offsets.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/hexagon/mm/vm_tlb.c b/arch/hexagon/mm/vm_tlb.c index 9647d00cb761..b474065533ce 100644 --- a/arch/hexagon/mm/vm_tlb.c +++ b/arch/hexagon/mm/vm_tlb.c @@ -24,6 +24,7 @@ * be instantiated for it, differently from a native build. */ #include +#include #include #include -- cgit