From bcf6f55a0d05eedd8ebb6ecc60ae3f93205ad833 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 5 Mar 2019 15:41:27 -0800 Subject: kasan: fix kasan_check_read/write definitions Building little-endian allmodconfig kernels on arm64 started failing with the generated atomic.h implementation, since we now try to call kasan helpers from the EFI stub: aarch64-linux-gnu-ld: drivers/firmware/efi/libstub/arm-stub.stub.o: in function `atomic_set': include/generated/atomic-instrumented.h:44: undefined reference to `__efistub_kasan_check_write' I suspect that we get similar problems in other files that explicitly disable KASAN for some reason but call atomic_t based helper functions. We can fix this by checking the predefined __SANITIZE_ADDRESS__ macro that the compiler sets instead of checking CONFIG_KASAN, but this in turn requires a small hack in mm/kasan/common.c so we do see the extern declaration there instead of the inline function. Link: http://lkml.kernel.org/r/20181211133453.2835077-1-arnd@arndb.de Fixes: b1864b828644 ("locking/atomics: build atomic headers as required") Signed-off-by: Arnd Bergmann Reported-by: Anders Roxell Acked-by: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Will Deacon Cc: Mark Rutland Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Andrey Konovalov Cc: Stephen Rothwell , Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan-checks.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kasan-checks.h b/include/linux/kasan-checks.h index d314150658a4..a61dc075e2ce 100644 --- a/include/linux/kasan-checks.h +++ b/include/linux/kasan-checks.h @@ -2,7 +2,7 @@ #ifndef _LINUX_KASAN_CHECKS_H #define _LINUX_KASAN_CHECKS_H -#ifdef CONFIG_KASAN +#if defined(__SANITIZE_ADDRESS__) || defined(__KASAN_INTERNAL) void kasan_check_read(const volatile void *p, unsigned int size); void kasan_check_write(const volatile void *p, unsigned int size); #else -- cgit From de810f490db7ed4c1db2bbfa458b2e27681d2ccb Mon Sep 17 00:00:00 2001 From: "Tobin C. Harding" Date: Tue, 5 Mar 2019 15:42:07 -0800 Subject: include/linux/slub_def.h: comment fixes Capitialize comment string, use C89 comment style, correct grammar/punctuation in comments. Link: http://lkml.kernel.org/r/20190204005713.9463-2-tobin@kernel.org Link: http://lkml.kernel.org/r/20190204005713.9463-3-tobin@kernel.org Link: http://lkml.kernel.org/r/20190204005713.9463-4-tobin@kernel.org Signed-off-by: Tobin C. Harding Reviewed-by: Andrew Morton Reviewed-by: William Kucharski Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slub_def.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 3a1a1dbc6f49..d2153789bd9f 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -81,12 +81,12 @@ struct kmem_cache_order_objects { */ struct kmem_cache { struct kmem_cache_cpu __percpu *cpu_slab; - /* Used for retriving partial slabs etc */ + /* Used for retrieving partial slabs, etc. */ slab_flags_t flags; unsigned long min_partial; - unsigned int size; /* The size of an object including meta data */ - unsigned int object_size;/* The size of an object without meta data */ - unsigned int offset; /* Free pointer offset. */ + unsigned int size; /* The size of an object including metadata */ + unsigned int object_size;/* The size of an object without metadata */ + unsigned int offset; /* Free pointer offset */ #ifdef CONFIG_SLUB_CPU_PARTIAL /* Number of per cpu partial objects to keep around */ unsigned int cpu_partial; @@ -110,7 +110,7 @@ struct kmem_cache { #endif #ifdef CONFIG_MEMCG struct memcg_cache_params memcg_params; - /* for propagation, maximum size of a stored attr */ + /* For propagation, maximum size of a stored attr */ unsigned int max_attr_size; #ifdef CONFIG_SYSFS struct kset *memcg_kset; @@ -151,7 +151,7 @@ struct kmem_cache { #else #define slub_cpu_partial(s) (0) #define slub_set_cpu_partial(s, n) -#endif // CONFIG_SLUB_CPU_PARTIAL +#endif /* CONFIG_SLUB_CPU_PARTIAL */ #ifdef CONFIG_SYSFS #define SLAB_SUPPORTS_SYSFS -- cgit From a9cd410a3d296846a8125aa43d97a573a354c472 Mon Sep 17 00:00:00 2001 From: Arun KS Date: Tue, 5 Mar 2019 15:42:14 -0800 Subject: mm/page_alloc.c: memory hotplug: free pages as higher order When freeing pages are done with higher order, time spent on coalescing pages by buddy allocator can be reduced. With section size of 256MB, hot add latency of a single section shows improvement from 50-60 ms to less than 1 ms, hence improving the hot add latency by 60 times. Modify external providers of online callback to align with the change. [arunks@codeaurora.org: v11] Link: http://lkml.kernel.org/r/1547792588-18032-1-git-send-email-arunks@codeaurora.org [akpm@linux-foundation.org: remove unused local, per Arun] [akpm@linux-foundation.org: avoid return of void-returning __free_pages_core(), per Oscar] [akpm@linux-foundation.org: fix it for mm-convert-totalram_pages-and-totalhigh_pages-variables-to-atomic.patch] [arunks@codeaurora.org: v8] Link: http://lkml.kernel.org/r/1547032395-24582-1-git-send-email-arunks@codeaurora.org [arunks@codeaurora.org: v9] Link: http://lkml.kernel.org/r/1547098543-26452-1-git-send-email-arunks@codeaurora.org Link: http://lkml.kernel.org/r/1538727006-5727-1-git-send-email-arunks@codeaurora.org Signed-off-by: Arun KS Reviewed-by: Andrew Morton Acked-by: Michal Hocko Reviewed-by: Oscar Salvador Reviewed-by: Alexander Duyck Cc: K. Y. Srinivasan Cc: Haiyang Zhang Cc: Stephen Hemminger Cc: Boris Ostrovsky Cc: Juergen Gross Cc: Dan Williams Cc: Vlastimil Babka Cc: Joonsoo Kim Cc: Greg Kroah-Hartman Cc: Mathieu Malaterre Cc: "Kirill A. Shutemov" Cc: Souptick Joarder Cc: Mel Gorman Cc: Aaron Lu Cc: Srivatsa Vaddagiri Cc: Vinayak Menon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 368267c1b71b..52869d6d38b3 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -89,7 +89,7 @@ extern int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn, unsigned long *valid_start, unsigned long *valid_end); extern void __offline_isolated_pages(unsigned long, unsigned long); -typedef void (*online_page_callback_t)(struct page *page); +typedef void (*online_page_callback_t)(struct page *page, unsigned int order); extern int set_online_page_callback(online_page_callback_t callback); extern int restore_online_page_callback(online_page_callback_t callback); -- cgit From 4d3467e171f8a8ef8f1dd205769cf2f21fbc8e1e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 5 Mar 2019 15:42:18 -0800 Subject: mm: balloon: update comment about isolation/migration/compaction Patch series "mm/kdump: allow to exclude pages that are logically offline" Right now, pages inflated as part of a balloon driver will be dumped by dump tools like makedumpfile. While XEN is able to check in the crash kernel whether a certain pfn is actuall backed by memory in the hypervisor (see xen_oldmem_pfn_is_ram) and optimize this case, dumps of virtio-balloon, hv-balloon and VMWare balloon inflated memory will essentially result in zero pages getting allocated by the hypervisor and the dump getting filled with this data. The allocation and reading of zero pages can directly be avoided if a dumping tool could know which pages only contain stale information not to be dumped. Also for XEN, calling into the kernel and asking the hypervisor if a pfn is backed can be avoided if the duming tool would skip such pages right from the beginning. Dumping tools have no idea whether a given page is part of a balloon driver and shall not be dumped. Esp. PG_reserved cannot be used for that purpose as all memory allocated during early boot is also PG_reserved, see discussion at [1]. So some other way of indication is required and a new page flag is frowned upon. We have PG_balloon (MAPCOUNT value), which is essentially unused now. I suggest renaming it to something more generic (PG_offline) to mark pages as logically offline. This flag can than e.g. also be used by virtio-mem in the future to mark subsections as offline. Or by other code that wants to put pages logically offline (e.g. later maybe poisoned pages that shall no longer be used). This series converts PG_balloon to PG_offline, allows dumping tools to query the value to detect such pages and marks pages in the hv-balloon and XEN balloon properly as PG_offline. Note that virtio-balloon already set pages to PG_balloon (and now PG_offline). Please note that this is also helpful for a problem we were seeing under Hyper-V: Dumping logically offline memory (pages kept fake offline while onlining a section via online_page_callback) would under some condicions result in a kernel panic when dumping them. As I don't have access to neither XEN nor Hyper-V nor VMWare installations, this was only tested with the virtio-balloon and pages were properly skipped when dumping. I'll also attach the makedumpfile patch to this series. [1] https://lkml.org/lkml/2018/7/20/566 This patch (of 8): Commit b1123ea6d3b3 ("mm: balloon: use general non-lru movable page feature") reworked balloon handling to make use of the general non-lru movable page feature. The big comment block in balloon_compaction.h contains quite some outdated information. Let's fix this. Link: http://lkml.kernel.org/r/20181119101616.8901-2-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Michael S. Tsirkin Cc: Matthew Wilcox Cc: Michal Hocko Cc: Alexander Duyck Cc: Alexey Dobriyan Cc: Arnd Bergmann Cc: Baoquan He Cc: Borislav Petkov Cc: Boris Ostrovsky Cc: Christian Hansen Cc: Dave Young Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Jonathan Corbet Cc: Juergen Gross Cc: Julien Freche Cc: Kairui Song Cc: Kazuhito Hagio Cc: "Kirill A. Shutemov" Cc: Konstantin Khlebnikov Cc: "K. Y. Srinivasan" Cc: Len Brown Cc: Lianbo Jiang Cc: Michal Hocko Cc: Mike Rapoport Cc: Miles Chen Cc: Nadav Amit Cc: Naoya Horiguchi Cc: Omar Sandoval Cc: Pankaj gupta Cc: Pavel Machek Cc: Pavel Tatashin Cc: Rafael J. Wysocki Cc: "Rafael J. Wysocki" Cc: Stefano Stabellini Cc: Stephen Hemminger Cc: Stephen Rothwell Cc: Vitaly Kuznetsov Cc: Vlastimil Babka Cc: Xavier Deguillard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/balloon_compaction.h | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h index 53051f3d8f25..cbe50da5a59d 100644 --- a/include/linux/balloon_compaction.h +++ b/include/linux/balloon_compaction.h @@ -4,15 +4,18 @@ * * Common interface definitions for making balloon pages movable by compaction. * - * Despite being perfectly possible to perform ballooned pages migration, they - * make a special corner case to compaction scans because balloon pages are not - * enlisted at any LRU list like the other pages we do compact / migrate. + * Balloon page migration makes use of the general non-lru movable page + * feature. + * + * page->private is used to reference the responsible balloon device. + * page->mapping is used in context of non-lru page migration to reference + * the address space operations for page isolation/migration/compaction. * * As the page isolation scanning step a compaction thread does is a lockless * procedure (from a page standpoint), it might bring some racy situations while * performing balloon page compaction. In order to sort out these racy scenarios * and safely perform balloon's page compaction and migration we must, always, - * ensure following these three simple rules: + * ensure following these simple rules: * * i. when updating a balloon's page ->mapping element, strictly do it under * the following lock order, independently of the far superior @@ -21,19 +24,8 @@ * +--spin_lock_irq(&b_dev_info->pages_lock); * ... page->mapping updates here ... * - * ii. before isolating or dequeueing a balloon page from the balloon device - * pages list, the page reference counter must be raised by one and the - * extra refcount must be dropped when the page is enqueued back into - * the balloon device page list, thus a balloon page keeps its reference - * counter raised only while it is under our special handling; - * - * iii. after the lockless scan step have selected a potential balloon page for - * isolation, re-test the PageBalloon mark and the PagePrivate flag - * under the proper page lock, to ensure isolating a valid balloon page - * (not yet isolated, nor under release procedure) - * - * iv. isolation or dequeueing procedure must clear PagePrivate flag under - * page lock together with removing page from balloon device page list. + * ii. isolation or dequeueing procedure must remove the page from balloon + * device page list under b_dev_info->pages_lock. * * The functions provided by this interface are placed to help on coping with * the aforementioned balloon page corner case, as well as to ensure the simple -- cgit From ca215086b14b89a0e70fc211314944aa6ce50020 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 5 Mar 2019 15:42:23 -0800 Subject: mm: convert PG_balloon to PG_offline PG_balloon was introduced to implement page migration/compaction for pages inflated in virtio-balloon. Nowadays, it is only a marker that a page is part of virtio-balloon and therefore logically offline. We also want to make use of this flag in other balloon drivers - for inflated pages or when onlining a section but keeping some pages offline (e.g. used right now by XEN and Hyper-V via set_online_page_callback()). We are going to expose this flag to dump tools like makedumpfile. But instead of exposing PG_balloon, let's generalize the concept of marking pages as logically offline, so it can be reused for other purposes later on. Rename PG_balloon to PG_offline. This is an indicator that the page is logically offline, the content stale and that it should not be touched (e.g. a hypervisor would have to allocate backing storage in order for the guest to dump an unused page). We can then e.g. exclude such pages from dumps. We replace and reuse KPF_BALLOON (23), as this shouldn't really harm (and for now the semantics stay the same). In following patches, we will make use of this bit also in other balloon drivers. While at it, document PGTABLE. [akpm@linux-foundation.org: fix comment text, per David] Link: http://lkml.kernel.org/r/20181119101616.8901-3-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Konstantin Khlebnikov Acked-by: Michael S. Tsirkin Acked-by: Pankaj gupta Cc: Jonathan Corbet Cc: Alexey Dobriyan Cc: Mike Rapoport Cc: Christian Hansen Cc: Vlastimil Babka Cc: "Kirill A. Shutemov" Cc: Stephen Rothwell Cc: Matthew Wilcox Cc: Michal Hocko Cc: Pavel Tatashin Cc: Alexander Duyck Cc: Naoya Horiguchi Cc: Miles Chen Cc: David Rientjes Cc: Kazuhito Hagio Cc: Arnd Bergmann Cc: Baoquan He Cc: Borislav Petkov Cc: Boris Ostrovsky Cc: Dave Young Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Juergen Gross Cc: Julien Freche Cc: Kairui Song Cc: "K. Y. Srinivasan" Cc: Len Brown Cc: Lianbo Jiang Cc: Michal Hocko Cc: Nadav Amit Cc: Omar Sandoval Cc: Pavel Machek Cc: Rafael J. Wysocki Cc: "Rafael J. Wysocki" Cc: Stefano Stabellini Cc: Stephen Hemminger Cc: Vitaly Kuznetsov Cc: Xavier Deguillard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/balloon_compaction.h | 8 ++++---- include/linux/page-flags.h | 11 +++++++---- 2 files changed, 11 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h index cbe50da5a59d..f111c780ef1d 100644 --- a/include/linux/balloon_compaction.h +++ b/include/linux/balloon_compaction.h @@ -95,7 +95,7 @@ extern int balloon_page_migrate(struct address_space *mapping, static inline void balloon_page_insert(struct balloon_dev_info *balloon, struct page *page) { - __SetPageBalloon(page); + __SetPageOffline(page); __SetPageMovable(page, balloon->inode->i_mapping); set_page_private(page, (unsigned long)balloon); list_add(&page->lru, &balloon->pages); @@ -111,7 +111,7 @@ static inline void balloon_page_insert(struct balloon_dev_info *balloon, */ static inline void balloon_page_delete(struct page *page) { - __ClearPageBalloon(page); + __ClearPageOffline(page); __ClearPageMovable(page); set_page_private(page, 0); /* @@ -141,13 +141,13 @@ static inline gfp_t balloon_mapping_gfp_mask(void) static inline void balloon_page_insert(struct balloon_dev_info *balloon, struct page *page) { - __SetPageBalloon(page); + __SetPageOffline(page); list_add(&page->lru, &balloon->pages); } static inline void balloon_page_delete(struct page *page) { - __ClearPageBalloon(page); + __ClearPageOffline(page); list_del(&page->lru); } diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 39b4494e29f1..808b4183e30d 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -671,7 +671,7 @@ PAGEFLAG_FALSE(DoubleMap) /* Reserve 0x0000007f to catch underflows of page_mapcount */ #define PAGE_MAPCOUNT_RESERVE -128 #define PG_buddy 0x00000080 -#define PG_balloon 0x00000100 +#define PG_offline 0x00000100 #define PG_kmemcg 0x00000200 #define PG_table 0x00000400 @@ -706,10 +706,13 @@ static __always_inline void __ClearPage##uname(struct page *page) \ PAGE_TYPE_OPS(Buddy, buddy) /* - * PageBalloon() is true for pages that are on the balloon page list - * (see mm/balloon_compaction.c). + * PageOffline() indicates that the page is logically offline although the + * containing section is online. (e.g. inflated in a balloon driver or + * not onlined when onlining the section). + * The content of these pages is effectively stale. Such pages should not + * be touched (read/write/dump/save) except by their owner. */ -PAGE_TYPE_OPS(Balloon, balloon) +PAGE_TYPE_OPS(Offline, offline) /* * If kmemcg is enabled, the buddy allocator will set PageKmemcg() on -- cgit From 98fa15f34cb379864757670b8e8743b21456a20e Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 5 Mar 2019 15:42:58 -0800 Subject: mm: replace all open encodings for NUMA_NO_NODE Patch series "Replace all open encodings for NUMA_NO_NODE", v3. All these places for replacement were found by running the following grep patterns on the entire kernel code. Please let me know if this might have missed some instances. This might also have replaced some false positives. I will appreciate suggestions, inputs and review. 1. git grep "nid == -1" 2. git grep "node == -1" 3. git grep "nid = -1" 4. git grep "node = -1" This patch (of 2): At present there are multiple places where invalid node number is encoded as -1. Even though implicitly understood it is always better to have macros in there. Replace these open encodings for an invalid node number with the global macro NUMA_NO_NODE. This helps remove NUMA related assumptions like 'invalid node' from various places redirecting them to a common definition. Link: http://lkml.kernel.org/r/1545127933-10711-2-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: David Hildenbrand Acked-by: Jeff Kirsher [ixgbe] Acked-by: Jens Axboe [mtip32xx] Acked-by: Vinod Koul [dmaengine.c] Acked-by: Michael Ellerman [powerpc] Acked-by: Doug Ledford [drivers/infiniband] Cc: Joseph Qi Cc: Hans Verkuil Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/device.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index 6cb4640b6160..4d2f13e8c540 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1095,7 +1095,7 @@ static inline void set_dev_node(struct device *dev, int node) #else static inline int dev_to_node(struct device *dev) { - return -1; + return NUMA_NO_NODE; } static inline void set_dev_node(struct device *dev, int node) { -- cgit From 52d1e606ee733921e984770d47539a6bb91e8506 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 5 Mar 2019 15:43:06 -0800 Subject: mm: reuse only-pte-mapped KSM page in do_wp_page() Add an optimization for KSM pages almost in the same way that we have for ordinary anonymous pages. If there is a write fault in a page, which is mapped to an only pte, and it is not related to swap cache; the page may be reused without copying its content. [ Note that we do not consider PageSwapCache() pages at least for now, since we don't want to complicate __get_ksm_page(), which has nice optimization based on this (for the migration case). Currenly it is spinning on PageSwapCache() pages, waiting for when they have unfreezed counters (i.e., for the migration finish). But we don't want to make it also spinning on swap cache pages, which we try to reuse, since there is not a very high probability to reuse them. So, for now we do not consider PageSwapCache() pages at all. ] So in reuse_ksm_page() we check for 1) PageSwapCache() and 2) page_stable_node(), to skip a page, which KSM is currently trying to link to stable tree. Then we do page_ref_freeze() to prohibit KSM to merge one more page into the page, we are reusing. After that, nobody can refer to the reusing page: KSM skips !PageSwapCache() pages with zero refcount; and the protection against of all other participants is the same as for reused ordinary anon pages pte lock, page lock and mmap_sem. [akpm@linux-foundation.org: replace BUG_ON()s with WARN_ON()s] Link: http://lkml.kernel.org/r/154471491016.31352.1168978849911555609.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Reviewed-by: Yang Shi Cc: "Kirill A. Shutemov" Cc: Hugh Dickins Cc: Andrea Arcangeli Cc: Christian Koenig Cc: Claudio Imbrenda Cc: Rik van Riel Cc: Huang Ying Cc: Minchan Kim Cc: Kirill Tkhai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ksm.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 161e8164abcf..e48b1e453ff5 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -53,6 +53,8 @@ struct page *ksm_might_need_to_copy(struct page *page, void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc); void ksm_migrate_page(struct page *newpage, struct page *oldpage); +bool reuse_ksm_page(struct page *page, + struct vm_area_struct *vma, unsigned long address); #else /* !CONFIG_KSM */ @@ -86,6 +88,11 @@ static inline void rmap_walk_ksm(struct page *page, static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage) { } +static inline bool reuse_ksm_page(struct page *page, + struct vm_area_struct *vma, unsigned long address) +{ + return false; +} #endif /* CONFIG_MMU */ #endif /* !CONFIG_KSM */ -- cgit From 60cd4bcd62384cfa1e5890cebacccf08b3161156 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Tue, 5 Mar 2019 15:43:13 -0800 Subject: memcg: localize memcg_kmem_enabled() check Move the memcg_kmem_enabled() checks into memcg kmem charge/uncharge functions, so, the users don't have to explicitly check that condition. This is purely code cleanup patch without any functional change. Only the order of checks in memcg_charge_slab() can potentially be changed but the functionally it will be same. This should not matter as memcg_charge_slab() is not in the hot path. Link: http://lkml.kernel.org/r/20190103161203.162375-1-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Vladimir Davydov Cc: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 37 +++++++++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 83ae11cbd12c..b0eb29ea0d9c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1273,12 +1273,12 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep); void memcg_kmem_put_cache(struct kmem_cache *cachep); -int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, - struct mem_cgroup *memcg); #ifdef CONFIG_MEMCG_KMEM -int memcg_kmem_charge(struct page *page, gfp_t gfp, int order); -void memcg_kmem_uncharge(struct page *page, int order); +int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order); +void __memcg_kmem_uncharge(struct page *page, int order); +int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, + struct mem_cgroup *memcg); extern struct static_key_false memcg_kmem_enabled_key; extern struct workqueue_struct *memcg_kmem_cache_wq; @@ -1300,6 +1300,26 @@ static inline bool memcg_kmem_enabled(void) return static_branch_unlikely(&memcg_kmem_enabled_key); } +static inline int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) +{ + if (memcg_kmem_enabled()) + return __memcg_kmem_charge(page, gfp, order); + return 0; +} + +static inline void memcg_kmem_uncharge(struct page *page, int order) +{ + if (memcg_kmem_enabled()) + __memcg_kmem_uncharge(page, order); +} + +static inline int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, + int order, struct mem_cgroup *memcg) +{ + if (memcg_kmem_enabled()) + return __memcg_kmem_charge_memcg(page, gfp, order, memcg); + return 0; +} /* * helper for accessing a memcg's index. It will be used as an index in the * child cache array in kmem_cache, and also to derive its name. This function @@ -1325,6 +1345,15 @@ static inline void memcg_kmem_uncharge(struct page *page, int order) { } +static inline int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order) +{ + return 0; +} + +static inline void __memcg_kmem_uncharge(struct page *page, int order) +{ +} + #define for_each_memcg_cache_index(_idx) \ for (; NULL; ) -- cgit From 6b7e5cad651a2b1031a4c69a98f87e3532dd4cef Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 5 Mar 2019 15:43:41 -0800 Subject: mm: remove sysctl_extfrag_handler() sysctl_extfrag_handler() neglects to propagate the return value from proc_dointvec_minmax() to its caller. It's a wrapper that doesn't need to exist, so just use proc_dointvec_minmax() directly. Link: http://lkml.kernel.org/r/20190104032557.3056-1-willy@infradead.org Signed-off-by: Matthew Wilcox Reported-by: Aditya Pakki Acked-by: Mel Gorman Acked-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 68250a57aace..70d0256edd31 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -88,8 +88,6 @@ extern int sysctl_compact_memory; extern int sysctl_compaction_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos); extern int sysctl_extfrag_threshold; -extern int sysctl_extfrag_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos); extern int sysctl_compact_unevictable_allowed; extern int fragmentation_index(struct zone *zone, unsigned int order); -- cgit From 7ed2c31dabdeb3ee6abe8ff5aac7287821a50cba Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 5 Mar 2019 15:43:44 -0800 Subject: mm/hugetlb: distinguish between migratability and movability Patch series "arm64/mm: Enable HugeTLB migration", v4. This patch series enables HugeTLB migration support for all supported huge page sizes at all levels including contiguous bit implementation. Following HugeTLB migration support matrix has been enabled with this patch series. All permutations have been tested except for the 16GB. CONT PTE PMD CONT PMD PUD -------- --- -------- --- 4K: 64K 2M 32M 1G 16K: 2M 32M 1G 64K: 2M 512M 16G First the series adds migration support for PUD based huge pages. It then adds a platform specific hook to query an architecture if a given huge page size is supported for migration while also providing a default fallback option preserving the existing semantics which just checks for (PMD|PUD|PGDIR)_SHIFT macros. The last two patches enables HugeTLB migration on arm64 and subscribe to this new platform specific hook by defining an override. The second patch differentiates between movability and migratability aspects of huge pages and implements hugepage_movable_supported() which can then be used during allocation to decide whether to place the huge page in movable zone or not. This patch (of 5): During huge page allocation it's migratability is checked to determine if it should be placed under movable zones with GFP_HIGHUSER_MOVABLE. But the movability aspect of the huge page could depend on other factors than just migratability. Movability in itself is a distinct property which should not be tied with migratability alone. This differentiates these two and implements an enhanced movability check which also considers huge page size to determine if it is feasible to be placed under a movable zone. At present it just checks for gigantic pages but going forward it can incorporate other enhanced checks. Link: http://lkml.kernel.org/r/1545121450-1663-2-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: Steve Capper Reviewed-by: Naoya Horiguchi Suggested-by: Michal Hocko Acked-by: Michal Hocko Cc: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 087fd5f48c91..1b858d795731 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -506,6 +506,31 @@ static inline bool hugepage_migration_supported(struct hstate *h) #endif } +/* + * Movability check is different as compared to migration check. + * It determines whether or not a huge page should be placed on + * movable zone or not. Movability of any huge page should be + * required only if huge page size is supported for migration. + * There wont be any reason for the huge page to be movable if + * it is not migratable to start with. Also the size of the huge + * page should be large enough to be placed under a movable zone + * and still feasible enough to be migratable. Just the presence + * in movable zone does not make the migration feasible. + * + * So even though large huge page sizes like the gigantic ones + * are migratable they should not be movable because its not + * feasible to migrate them from movable zone. + */ +static inline bool hugepage_movable_supported(struct hstate *h) +{ + if (!hugepage_migration_supported(h)) + return false; + + if (hstate_is_gigantic(h)) + return false; + return true; +} + static inline spinlock_t *huge_pte_lockptr(struct hstate *h, struct mm_struct *mm, pte_t *pte) { @@ -602,6 +627,11 @@ static inline bool hugepage_migration_supported(struct hstate *h) return false; } +static inline bool hugepage_movable_supported(struct hstate *h) +{ + return false; +} + static inline spinlock_t *huge_pte_lockptr(struct hstate *h, struct mm_struct *mm, pte_t *pte) { -- cgit From 9b553bf5eb99dd1b2d8ae23136da46da5c205dfd Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 5 Mar 2019 15:43:48 -0800 Subject: mm/hugetlb: enable PUD level huge page migration Architectures like arm64 have PUD level HugeTLB pages for certain configs (1GB huge page is PUD based on ARM64_4K_PAGES base page size) that can be enabled for migration. It can be achieved through checking for PUD_SHIFT order based HugeTLB pages during migration. Link: http://lkml.kernel.org/r/1545121450-1663-3-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: Naoya Horiguchi Reviewed-by: Steve Capper Acked-by: Michal Hocko Cc: Catalin Marinas Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 1b858d795731..70bcd8973323 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -497,7 +497,8 @@ static inline bool hugepage_migration_supported(struct hstate *h) { #ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION if ((huge_page_shift(h) == PMD_SHIFT) || - (huge_page_shift(h) == PGDIR_SHIFT)) + (huge_page_shift(h) == PUD_SHIFT) || + (huge_page_shift(h) == PGDIR_SHIFT)) return true; else return false; -- cgit From e693de186414ae66f2a316ff9befcd2b7a6d07b6 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 5 Mar 2019 15:43:51 -0800 Subject: mm/hugetlb: enable arch specific huge page size support for migration Architectures like arm64 have HugeTLB page sizes which are different than generic sizes at PMD, PUD, PGD level and implemented via contiguous bits. At present these special size HugeTLB pages cannot be identified through macros like (PMD|PUD|PGDIR)_SHIFT and hence chosen not be migrated. Enabling migration support for these special HugeTLB page sizes along with the generic ones (PMD|PUD|PGD) would require identifying all of them on a given platform. A platform specific hook can precisely enumerate all huge page sizes supported for migration. Instead of comparing against standard huge page orders let hugetlb_migration_support() function call a platform hook arch_hugetlb_migration_support(). Default definition for the platform hook maintains existing semantics which checks standard huge page order. But an architecture can choose to override the default and provide support for a comprehensive set of huge page sizes. Link: http://lkml.kernel.org/r/1545121450-1663-4-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: Naoya Horiguchi Reviewed-by: Steve Capper Acked-by: Michal Hocko Cc: Catalin Marinas Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 70bcd8973323..4cc3871b65fc 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -493,18 +493,29 @@ static inline pgoff_t basepage_index(struct page *page) extern int dissolve_free_huge_page(struct page *page); extern int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn); -static inline bool hugepage_migration_supported(struct hstate *h) -{ + #ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION +#ifndef arch_hugetlb_migration_supported +static inline bool arch_hugetlb_migration_supported(struct hstate *h) +{ if ((huge_page_shift(h) == PMD_SHIFT) || (huge_page_shift(h) == PUD_SHIFT) || (huge_page_shift(h) == PGDIR_SHIFT)) return true; else return false; +} +#endif #else +static inline bool arch_hugetlb_migration_supported(struct hstate *h) +{ return false; +} #endif + +static inline bool hugepage_migration_supported(struct hstate *h) +{ + return arch_hugetlb_migration_supported(h); } /* -- cgit From d71e53cee7c2e553b85c572e76da778a93d32135 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 5 Mar 2019 15:44:18 -0800 Subject: mm: shuffle GFP_* flags GFP_KERNEL is one of the most used constant but on archs like arm with fixed length instruction some constants are more equal than the others. Constants with tightly packed bits can be injected directly into instruction stream: 0: e3a00d33 mov r0, #3264 ; 0xcc0 Others require multiple instructions or even loading out of instruction stream: 0: e3a000c0 mov r0, #192 ; 0xc0 4: e3400060 movt r0, #96 ; 0x60 Shuffle GFP_* flags so that GFP_KERNEL/GFP_ATOMIC + __GFP_ZERO bits are close to each other. Savings on arm configs are ~0.1%. Link: http://lkml.kernel.org/r/20190109201838.GA9140@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 5f5e25fd6149..fdab7de7490d 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -24,21 +24,21 @@ struct vm_area_struct; #define ___GFP_HIGH 0x20u #define ___GFP_IO 0x40u #define ___GFP_FS 0x80u -#define ___GFP_WRITE 0x100u -#define ___GFP_NOWARN 0x200u -#define ___GFP_RETRY_MAYFAIL 0x400u -#define ___GFP_NOFAIL 0x800u -#define ___GFP_NORETRY 0x1000u -#define ___GFP_MEMALLOC 0x2000u -#define ___GFP_COMP 0x4000u -#define ___GFP_ZERO 0x8000u -#define ___GFP_NOMEMALLOC 0x10000u -#define ___GFP_HARDWALL 0x20000u -#define ___GFP_THISNODE 0x40000u -#define ___GFP_ATOMIC 0x80000u -#define ___GFP_ACCOUNT 0x100000u -#define ___GFP_DIRECT_RECLAIM 0x200000u -#define ___GFP_KSWAPD_RECLAIM 0x400000u +#define ___GFP_ZERO 0x100u +#define ___GFP_ATOMIC 0x200u +#define ___GFP_DIRECT_RECLAIM 0x400u +#define ___GFP_KSWAPD_RECLAIM 0x800u +#define ___GFP_WRITE 0x1000u +#define ___GFP_NOWARN 0x2000u +#define ___GFP_RETRY_MAYFAIL 0x4000u +#define ___GFP_NOFAIL 0x8000u +#define ___GFP_NORETRY 0x10000u +#define ___GFP_MEMALLOC 0x20000u +#define ___GFP_COMP 0x40000u +#define ___GFP_NOMEMALLOC 0x80000u +#define ___GFP_HARDWALL 0x100000u +#define ___GFP_THISNODE 0x200000u +#define ___GFP_ACCOUNT 0x400000u #ifdef CONFIG_LOCKDEP #define ___GFP_NOLOCKDEP 0x800000u #else -- cgit From 70b44595eafe9c7c235f076d653a268ca1ab9fdb Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 5 Mar 2019 15:44:54 -0800 Subject: mm, compaction: use free lists to quickly locate a migration source The migration scanner is a linear scan of a zone with a potentiall large search space. Furthermore, many pageblocks are unusable such as those filled with reserved pages or partially filled with pages that cannot migrate. These still get scanned in the common case of allocating a THP and the cost accumulates. The patch uses a partial search of the free lists to locate a migration source candidate that is marked as MOVABLE when allocating a THP. It prefers picking a block with a larger number of free pages already on the basis that there are fewer pages to migrate to free the entire block. The lowest PFN found during searches is tracked as the basis of the start for the linear search after the first search of the free list fails. After the search, the free list is shuffled so that the next search will not encounter the same page. If the search fails then the subsequent searches will be shorter and the linear scanner is used. If this search fails, or if the request is for a small or unmovable/reclaimable allocation then the linear scanner is still used. It is somewhat pointless to use the list search in those cases. Small free pages must be used for the search and there is no guarantee that movable pages are located within that block that are contiguous. 5.0.0-rc1 5.0.0-rc1 noboost-v3r10 findmig-v3r15 Amean fault-both-3 3771.41 ( 0.00%) 3390.40 ( 10.10%) Amean fault-both-5 5409.05 ( 0.00%) 5082.28 ( 6.04%) Amean fault-both-7 7040.74 ( 0.00%) 7012.51 ( 0.40%) Amean fault-both-12 11887.35 ( 0.00%) 11346.63 ( 4.55%) Amean fault-both-18 16718.19 ( 0.00%) 15324.19 ( 8.34%) Amean fault-both-24 21157.19 ( 0.00%) 16088.50 * 23.96%* Amean fault-both-30 21175.92 ( 0.00%) 18723.42 * 11.58%* Amean fault-both-32 21339.03 ( 0.00%) 18612.01 * 12.78%* 5.0.0-rc1 5.0.0-rc1 noboost-v3r10 findmig-v3r15 Percentage huge-3 86.50 ( 0.00%) 89.83 ( 3.85%) Percentage huge-5 92.52 ( 0.00%) 91.96 ( -0.61%) Percentage huge-7 92.44 ( 0.00%) 92.85 ( 0.44%) Percentage huge-12 92.98 ( 0.00%) 92.74 ( -0.25%) Percentage huge-18 91.70 ( 0.00%) 91.71 ( 0.02%) Percentage huge-24 91.59 ( 0.00%) 92.13 ( 0.60%) Percentage huge-30 90.14 ( 0.00%) 93.79 ( 4.04%) Percentage huge-32 90.03 ( 0.00%) 91.27 ( 1.37%) This shows an improvement in allocation latencies with similar allocation success rates. While not presented, there was a 31% reduction in migration scanning and a 8% reduction on system CPU usage. A 2-socket machine showed similar benefits. [mgorman@techsingularity.net: several fixes] Link: http://lkml.kernel.org/r/20190204120111.GL9565@techsingularity.net [vbabka@suse.cz: migrate block that was found-fast, some optimisations] Link: http://lkml.kernel.org/r/20190118175136.31341-10-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Dan Carpenter Cc: David Rientjes Cc: YueHaibing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/list.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/list.h b/include/linux/list.h index edb7628e46ed..79626b5ab36c 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -206,6 +206,17 @@ static inline void list_bulk_move_tail(struct list_head *head, head->prev = last; } +/** + * list_is_first -- tests whether @ list is the first entry in list @head + * @list: the entry to test + * @head: the head of the list + */ +static inline int list_is_first(const struct list_head *list, + const struct list_head *head) +{ + return list->prev == head; +} + /** * list_is_last - tests whether @list is the last entry in list @head * @list: the entry to test -- cgit From e332f741a8dd1ec9a6dc8aa997296ecbfe64323e Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 5 Mar 2019 15:45:38 -0800 Subject: mm, compaction: be selective about what pageblocks to clear skip hints Pageblock hints are cleared when compaction restarts or kswapd makes enough progress that it can sleep but it's over-eager in that the bit is cleared for migration sources with no LRU pages and migration targets with no free pages. As pageblock skip hint flushes are relatively rare and out-of-band with respect to kswapd, this patch makes a few more expensive checks to see if it's appropriate to even clear the bit. Every pageblock that is not cleared will avoid 512 pages being scanned unnecessarily on x86-64. The impact is variable with different workloads showing small differences in latency, success rates and scan rates. This is expected as clearing the hints is not that common but doing a small amount of work out-of-band to avoid a large amount of work in-band later is generally a good thing. Link: http://lkml.kernel.org/r/20190118175136.31341-22-mgorman@techsingularity.net Signed-off-by: Mel Gorman Signed-off-by: Qian Cai Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Dan Carpenter Cc: David Rientjes Cc: YueHaibing [cai@lca.pw: no stuck in __reset_isolation_pfn()] Link: http://lkml.kernel.org/r/20190206034732.75687-1-cai@lca.pw Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 842f9189537b..90c13cdeefb5 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -480,6 +480,8 @@ struct zone { unsigned long compact_cached_free_pfn; /* pfn where async and sync compaction migration scanner should start */ unsigned long compact_cached_migrate_pfn[2]; + unsigned long compact_init_migrate_pfn; + unsigned long compact_init_free_pfn; #endif #ifdef CONFIG_COMPACTION -- cgit From 5e1f0f098b4649fad53011246bcaeff011ffdf5d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 5 Mar 2019 15:45:41 -0800 Subject: mm, compaction: capture a page under direct compaction Compaction is inherently race-prone as a suitable page freed during compaction can be allocated by any parallel task. This patch uses a capture_control structure to isolate a page immediately when it is freed by a direct compactor in the slow path of the page allocator. The intent is to avoid redundant scanning. 5.0.0-rc1 5.0.0-rc1 selective-v3r17 capture-v3r19 Amean fault-both-1 0.00 ( 0.00%) 0.00 * 0.00%* Amean fault-both-3 2582.11 ( 0.00%) 2563.68 ( 0.71%) Amean fault-both-5 4500.26 ( 0.00%) 4233.52 ( 5.93%) Amean fault-both-7 5819.53 ( 0.00%) 6333.65 ( -8.83%) Amean fault-both-12 9321.18 ( 0.00%) 9759.38 ( -4.70%) Amean fault-both-18 9782.76 ( 0.00%) 10338.76 ( -5.68%) Amean fault-both-24 15272.81 ( 0.00%) 13379.55 * 12.40%* Amean fault-both-30 15121.34 ( 0.00%) 16158.25 ( -6.86%) Amean fault-both-32 18466.67 ( 0.00%) 18971.21 ( -2.73%) Latency is only moderately affected but the devil is in the details. A closer examination indicates that base page fault latency is reduced but latency of huge pages is increased as it takes creater care to succeed. Part of the "problem" is that allocation success rates are close to 100% even when under pressure and compaction gets harder 5.0.0-rc1 5.0.0-rc1 selective-v3r17 capture-v3r19 Percentage huge-3 96.70 ( 0.00%) 98.23 ( 1.58%) Percentage huge-5 96.99 ( 0.00%) 95.30 ( -1.75%) Percentage huge-7 94.19 ( 0.00%) 97.24 ( 3.24%) Percentage huge-12 94.95 ( 0.00%) 97.35 ( 2.53%) Percentage huge-18 96.74 ( 0.00%) 97.30 ( 0.58%) Percentage huge-24 97.07 ( 0.00%) 97.55 ( 0.50%) Percentage huge-30 95.69 ( 0.00%) 98.50 ( 2.95%) Percentage huge-32 96.70 ( 0.00%) 99.27 ( 2.65%) And scan rates are reduced as expected by 6% for the migration scanner and 29% for the free scanner indicating that there is less redundant work. Compaction migrate scanned 20815362 19573286 Compaction free scanned 16352612 11510663 [mgorman@techsingularity.net: remove redundant check] Link: http://lkml.kernel.org/r/20190201143853.GH9565@techsingularity.net Link: http://lkml.kernel.org/r/20190118175136.31341-23-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Dan Carpenter Cc: David Rientjes Cc: YueHaibing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 3 ++- include/linux/sched.h | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 70d0256edd31..c960923d9ec2 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -93,7 +93,8 @@ extern int sysctl_compact_unevictable_allowed; extern int fragmentation_index(struct zone *zone, unsigned int order); extern enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, unsigned int alloc_flags, - const struct alloc_context *ac, enum compact_priority prio); + const struct alloc_context *ac, enum compact_priority prio, + struct page **page); extern void reset_isolation_suitable(pg_data_t *pgdat); extern enum compact_result compaction_suitable(struct zone *zone, int order, unsigned int alloc_flags, int classzone_idx); diff --git a/include/linux/sched.h b/include/linux/sched.h index f9b43c989577..ebfb34fb9b30 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -47,6 +47,7 @@ struct pid_namespace; struct pipe_inode_info; struct rcu_node; struct reclaim_state; +struct capture_control; struct robust_list_head; struct sched_attr; struct sched_param; @@ -958,6 +959,9 @@ struct task_struct { struct io_context *io_context; +#ifdef CONFIG_COMPACTION + struct capture_control *capture_control; +#endif /* Ptrace state: */ unsigned long ptrace_message; kernel_siginfo_t *last_siginfo; -- cgit From 147e1a97c4a0bdd43f55a582a9416bb9092563a9 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 5 Mar 2019 15:45:45 -0800 Subject: fs: kernfs: add poll file operation Patch series "psi: pressure stall monitors", v3. Android is adopting psi to detect and remedy memory pressure that results in stuttering and decreased responsiveness on mobile devices. Psi gives us the stall information, but because we're dealing with latencies in the millisecond range, periodically reading the pressure files to detect stalls in a timely fashion is not feasible. Psi also doesn't aggregate its averages at a high enough frequency right now. This patch series extends the psi interface such that users can configure sensitive latency thresholds and use poll() and friends to be notified when these are breached. As high-frequency aggregation is costly, it implements an aggregation method that is optimized for fast, short-interval averaging, and makes the aggregation frequency adaptive, such that high-frequency updates only happen while monitored stall events are actively occurring. With these patches applied, Android can monitor for, and ward off, mounting memory shortages before they cause problems for the user. For example, using memory stall monitors in userspace low memory killer daemon (lmkd) we can detect mounting pressure and kill less important processes before device becomes visibly sluggish. In our memory stress testing psi memory monitors produce roughly 10x less false positives compared to vmpressure signals. Having ability to specify multiple triggers for the same psi metric allows other parts of Android framework to monitor memory state of the device and act accordingly. The new interface is straightforward. The user opens one of the pressure files for writing and writes a trigger description into the file descriptor that defines the stall state - some or full, and the maximum stall time over a given window of time. E.g.: /* Signal when stall time exceeds 100ms of a 1s window */ char trigger[] = "full 100000 1000000"; fd = open("/proc/pressure/memory"); write(fd, trigger, sizeof(trigger)); while (poll() >= 0) { ... } close(fd); When the monitored stall state is entered, psi adapts its aggregation frequency according to what the configured time window requires in order to emit event signals in a timely fashion. Once the stalling subsides, aggregation reverts back to normal. The trigger is associated with the open file descriptor. To stop monitoring, the user only needs to close the file descriptor and the trigger is discarded. Patches 1-4 prepare the psi code for polling support. Patch 5 implements the adaptive polling logic, the pressure growth detection optimized for short intervals, and hooks up write() and poll() on the pressure files. The patches were developed in collaboration with Johannes Weiner. This patch (of 5): Kernfs has a standardized poll/notification mechanism for waking all pollers on all fds when a filesystem node changes. To allow polling for custom events, add a .poll callback that can override the default. This is in preparation for pollable cgroup pressure files which have per-fd trigger configurations. Link: http://lkml.kernel.org/r/20190124211518.244221-2-surenb@google.com Signed-off-by: Johannes Weiner Signed-off-by: Suren Baghdasaryan Cc: Dennis Zhou Cc: Ingo Molnar Cc: Jens Axboe Cc: Li Zefan Cc: Peter Zijlstra Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernfs.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 5b36b1287a5a..0cac1207bb00 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -25,6 +25,7 @@ struct seq_file; struct vm_area_struct; struct super_block; struct file_system_type; +struct poll_table_struct; struct kernfs_open_node; struct kernfs_iattrs; @@ -261,6 +262,9 @@ struct kernfs_ops { ssize_t (*write)(struct kernfs_open_file *of, char *buf, size_t bytes, loff_t off); + __poll_t (*poll)(struct kernfs_open_file *of, + struct poll_table_struct *pt); + int (*mmap)(struct kernfs_open_file *of, struct vm_area_struct *vma); #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -350,6 +354,8 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name, const void *new_ns); int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr); +__poll_t kernfs_generic_poll(struct kernfs_open_file *of, + struct poll_table_struct *pt); void kernfs_notify(struct kernfs_node *kn); const void *kernfs_super_ns(struct super_block *sb); -- cgit From dc50537bdd1a0804fa2cbc990565ee9a944e66fa Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 5 Mar 2019 15:45:48 -0800 Subject: kernel: cgroup: add poll file operation Cgroup has a standardized poll/notification mechanism for waking all pollers on all fds when a filesystem node changes. To allow polling for custom events, add a .poll callback that can override the default. This is in preparation for pollable cgroup pressure files which have per-fd trigger configurations. Link: http://lkml.kernel.org/r/20190124211518.244221-3-surenb@google.com Signed-off-by: Johannes Weiner Signed-off-by: Suren Baghdasaryan Cc: Dennis Zhou Cc: Ingo Molnar Cc: Jens Axboe Cc: Li Zefan Cc: Peter Zijlstra Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup-defs.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 8fcbae1b8db0..aad3babef007 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -32,6 +32,7 @@ struct kernfs_node; struct kernfs_ops; struct kernfs_open_file; struct seq_file; +struct poll_table_struct; #define MAX_CGROUP_TYPE_NAMELEN 32 #define MAX_CGROUP_ROOT_NAMELEN 64 @@ -574,6 +575,9 @@ struct cftype { ssize_t (*write)(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off); + __poll_t (*poll)(struct kernfs_open_file *of, + struct poll_table_struct *pt); + #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lock_class_key lockdep_key; #endif -- cgit From aa9694bb78bf6eb03810108d5f6064fafa4ae1e1 Mon Sep 17 00:00:00 2001 From: Chris Down Date: Tue, 5 Mar 2019 15:45:52 -0800 Subject: mm, memcg: create mem_cgroup_from_seq This is the start of a series of patches similar to my earlier DEFINE_MEMCG_MAX_OR_VAL work, but with less Macro Magic(tm). There are a bunch of places we go from seq_file to mem_cgroup, which currently requires manually getting the css, then getting the mem_cgroup from the css. It's in enough places now that having mem_cgroup_from_seq makes sense (and also makes the next patch a bit nicer). Link: http://lkml.kernel.org/r/20190124194050.GA31341@chrisdown.name Signed-off-by: Chris Down Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: Tejun Heo Cc: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index b0eb29ea0d9c..1f3d880b7ca1 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -429,6 +429,11 @@ static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) } struct mem_cgroup *mem_cgroup_from_id(unsigned short id); +static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m) +{ + return mem_cgroup_from_css(seq_css(m)); +} + static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec) { struct mem_cgroup_per_node *mz; @@ -937,6 +942,11 @@ static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) return NULL; } +static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m) +{ + return NULL; +} + static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec) { return NULL; -- cgit From 8bb4e7a2ee26c05a94ae6cb0aec2f82a3523cf35 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 5 Mar 2019 15:46:22 -0800 Subject: mm: fix some typos in mm directory No functional change. Link: http://lkml.kernel.org/r/20190118235123.27843-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Pekka Enberg Acked-by: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 90c13cdeefb5..6d3290cd1f6f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1301,7 +1301,7 @@ void memory_present(int nid, unsigned long start, unsigned long end); /* * If it is possible to have holes within a MAX_ORDER_NR_PAGES, then we - * need to check pfn validility within that MAX_ORDER_NR_PAGES block. + * need to check pfn validity within that MAX_ORDER_NR_PAGES block. * pfn_valid_within() should be used in this case; we optimise this away * when we have no holes within a MAX_ORDER_NR_PAGES block. */ -- cgit From 023bdd00235eb0dcb71fd98f0b8347a9bb85d417 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 5 Mar 2019 15:46:37 -0800 Subject: mm/hugetlb: add prot_modify_start/commit sequence for hugetlb update Architectures like ppc64 require to do a conditional tlb flush based on the old and new value of pte. Follow the regular pte change protection sequence for hugetlb too. This allows the architectures to override the update sequence. Link: http://lkml.kernel.org/r/20190116085035.29729-5-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Martin Schwidefsky Cc: Nicholas Piggin Cc: Paul Mackerras Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 4cc3871b65fc..54c317c8355f 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -580,6 +580,26 @@ static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr set_huge_pte_at(mm, addr, ptep, pte); } #endif + +#ifndef huge_ptep_modify_prot_start +#define huge_ptep_modify_prot_start huge_ptep_modify_prot_start +static inline pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); +} +#endif + +#ifndef huge_ptep_modify_prot_commit +#define huge_ptep_modify_prot_commit huge_ptep_modify_prot_commit +static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t old_pte, pte_t pte) +{ + set_huge_pte_at(vma->vm_mm, addr, ptep, pte); +} +#endif + #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; #define alloc_huge_page(v, a, r) NULL -- cgit From b56a2d8af9147a4efe4011b60d93779c0461ca97 Mon Sep 17 00:00:00 2001 From: Vineeth Remanan Pillai Date: Tue, 5 Mar 2019 15:47:03 -0800 Subject: mm: rid swapoff of quadratic complexity This patch was initially posted by Kelley Nielsen. Reposting the patch with all review comments addressed and with minor modifications and optimizations. Also, folding in the fixes offered by Hugh Dickins and Huang Ying. Tests were rerun and commit message updated with new results. try_to_unuse() is of quadratic complexity, with a lot of wasted effort. It unuses swap entries one by one, potentially iterating over all the page tables for all the processes in the system for each one. This new proposed implementation of try_to_unuse simplifies its complexity to linear. It iterates over the system's mms once, unusing all the affected entries as it walks each set of page tables. It also makes similar changes to shmem_unuse. Improvement swapoff was called on a swap partition containing about 6G of data, in a VM(8cpu, 16G RAM), and calls to unuse_pte_range() were counted. Present implementation....about 1200M calls(8min, avg 80% cpu util). Prototype.................about 9.0K calls(3min, avg 5% cpu util). Details In shmem_unuse(), iterate over the shmem_swaplist and, for each shmem_inode_info that contains a swap entry, pass it to shmem_unuse_inode(), along with the swap type. In shmem_unuse_inode(), iterate over its associated xarray, and store the index and value of each swap entry in an array for passing to shmem_swapin_page() outside of the RCU critical section. In try_to_unuse(), instead of iterating over the entries in the type and unusing them one by one, perhaps walking all the page tables for all the processes for each one, iterate over the mmlist, making one pass. Pass each mm to unuse_mm() to begin its page table walk, and during the walk, unuse all the ptes that have backing store in the swap type received by try_to_unuse(). After the walk, check the type for orphaned swap entries with find_next_to_unuse(), and remove them from the swap cache. If find_next_to_unuse() starts over at the beginning of the type, repeat the check of the shmem_swaplist and the walk a maximum of three times. Change unuse_mm() and the intervening walk functions down to unuse_pte_range() to take the type as a parameter, and to iterate over their entire range, calling the next function down on every iteration. In unuse_pte_range(), make a swap entry from each pte in the range using the passed in type. If it has backing store in the type, call swapin_readahead() to retrieve the page and pass it to unuse_pte(). Pass the count of pages_to_unuse down the page table walks in try_to_unuse(), and return from the walk when the desired number of pages has been swapped back in. Link: http://lkml.kernel.org/r/20190114153129.4852-2-vpillai@digitalocean.com Signed-off-by: Vineeth Remanan Pillai Signed-off-by: Kelley Nielsen Signed-off-by: Huang Ying Acked-by: Hugh Dickins Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/frontswap.h | 7 +++++++ include/linux/shmem_fs.h | 3 ++- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h index 011965c08b93..6d775984905b 100644 --- a/include/linux/frontswap.h +++ b/include/linux/frontswap.h @@ -7,6 +7,13 @@ #include #include +/* + * Return code to denote that requested number of + * frontswap pages are unused(moved to page cache). + * Used in in shmem_unuse and try_to_unuse. + */ +#define FRONTSWAP_PAGES_UNUSED 2 + struct frontswap_ops { void (*init)(unsigned); /* this swap type was just swapon'ed */ int (*store)(unsigned, pgoff_t, struct page *); /* store a page */ diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index f155dc607112..f3fb1edb3526 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -72,7 +72,8 @@ extern void shmem_unlock_mapping(struct address_space *mapping); extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); -extern int shmem_unuse(swp_entry_t entry, struct page *page); +extern int shmem_unuse(unsigned int type, bool frontswap, + unsigned long *fs_pages_to_unuse); extern unsigned long shmem_swap_usage(struct vm_area_struct *vma); extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, -- cgit From 6e2e07cd35f6f72d1950453b170f6bfb6c668c46 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 5 Mar 2019 15:47:36 -0800 Subject: mm: better document PG_reserved The usage of PG_reserved and how PG_reserved pages are to be treated is buried deep down in different parts of the kernel. Let's shine some light onto these details by documenting current users and expected behavior. Especially, clarify on the "Some of them might not even exist" case. These are physical memory gaps that will never be dumped as they are not marked as IORESOURCE_SYSRAM. PG_reserved does in general not hinder anybody from dumping or swapping. In some cases, these pages will not be stored in the hibernation image. Link: http://lkml.kernel.org/r/20190114125903.24845-10-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Andrew Morton Cc: Stephen Rothwell Cc: Pavel Tatashin Cc: Michal Hocko Cc: Alexander Duyck Cc: Matthew Wilcox Cc: Anthony Yznaga Cc: Miles Chen Cc: Cc: Dan Williams Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 808b4183e30d..9f8712a4b1a5 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -17,8 +17,37 @@ /* * Various page->flags bits: * - * PG_reserved is set for special pages, which can never be swapped out. Some - * of them might not even exist... + * PG_reserved is set for special pages. The "struct page" of such a page + * should in general not be touched (e.g. set dirty) except by its owner. + * Pages marked as PG_reserved include: + * - Pages part of the kernel image (including vDSO) and similar (e.g. BIOS, + * initrd, HW tables) + * - Pages reserved or allocated early during boot (before the page allocator + * was initialized). This includes (depending on the architecture) the + * initial vmemmap, initial page tables, crashkernel, elfcorehdr, and much + * much more. Once (if ever) freed, PG_reserved is cleared and they will + * be given to the page allocator. + * - Pages falling into physical memory gaps - not IORESOURCE_SYSRAM. Trying + * to read/write these pages might end badly. Don't touch! + * - The zero page(s) + * - Pages not added to the page allocator when onlining a section because + * they were excluded via the online_page_callback() or because they are + * PG_hwpoison. + * - Pages allocated in the context of kexec/kdump (loaded kernel image, + * control pages, vmcoreinfo) + * - MMIO/DMA pages. Some architectures don't allow to ioremap pages that are + * not marked PG_reserved (as they might be in use by somebody else who does + * not respect the caching strategy). + * - Pages part of an offline section (struct pages of offline sections should + * not be trusted as they will be initialized when first onlined). + * - MCA pages on ia64 + * - Pages holding CPU notes for POWER Firmware Assisted Dump + * - Device memory (e.g. PMEM, DAX, HMM) + * Some PG_reserved pages will be excluded from the hibernation image. + * PG_reserved does in general not hinder anybody from dumping or swapping + * and is no longer required for remap_pfn_range(). ioremap might require it. + * Consequently, PG_reserved for a page mapped into user space can indicate + * the zero page, the vDSO, MMIO pages or device memory. * * The PG_private bitflag is set on pagecache pages if they contain filesystem * specific data (which is normally at page->private). It can be used by -- cgit From d7fefcc8de9147cc37d0c00df12e7ea4f77999b5 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 5 Mar 2019 15:47:40 -0800 Subject: mm/cma: add PF flag to force non cma alloc Patch series "mm/kvm/vfio/ppc64: Migrate compound pages out of CMA region", v8. ppc64 uses the CMA area for the allocation of guest page table (hash page table). We won't be able to start guest if we fail to allocate hash page table. We have observed hash table allocation failure because we failed to migrate pages out of CMA region because they were pinned. This happen when we are using VFIO. VFIO on ppc64 pins the entire guest RAM. If the guest RAM pages get allocated out of CMA region, we won't be able to migrate those pages. The pages are also pinned for the lifetime of the guest. Currently we support migration of non-compound pages. With THP and with the addition of hugetlb migration we can end up allocating compound pages from CMA region. This patch series add support for migrating compound pages. This patch (of 4): Add PF_MEMALLOC_NOCMA which make sure any allocation in that context is marked non-movable and hence cannot be satisfied by CMA region. This is useful with get_user_pages_longterm where we want to take a page pin by migrating pages from CMA region. Marking the section PF_MEMALLOC_NOCMA ensures that we avoid unnecessary page migration later. Link: http://lkml.kernel.org/r/20190114095438.32470-2-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Suggested-by: Andrea Arcangeli Reviewed-by: Andrea Arcangeli Cc: Michal Hocko Cc: Alexey Kardashevskiy Cc: David Gibson Cc: Michael Ellerman Cc: Mel Gorman Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 + include/linux/sched/mm.h | 48 ++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 41 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index ebfb34fb9b30..36ec6e7e8291 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1407,6 +1407,7 @@ extern struct pid *cad_pid; #define PF_UMH 0x02000000 /* I'm an Usermodehelper process */ #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ +#define PF_MEMALLOC_NOCMA 0x10000000 /* All allocation request will have _GFP_MOVABLE cleared */ #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 3bfa6a0cbba4..0cd9f10423fb 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -148,17 +148,25 @@ static inline bool in_vfork(struct task_struct *tsk) * Applies per-task gfp context to the given allocation flags. * PF_MEMALLOC_NOIO implies GFP_NOIO * PF_MEMALLOC_NOFS implies GFP_NOFS + * PF_MEMALLOC_NOCMA implies no allocation from CMA region. */ static inline gfp_t current_gfp_context(gfp_t flags) { - /* - * NOIO implies both NOIO and NOFS and it is a weaker context - * so always make sure it makes precedence - */ - if (unlikely(current->flags & PF_MEMALLOC_NOIO)) - flags &= ~(__GFP_IO | __GFP_FS); - else if (unlikely(current->flags & PF_MEMALLOC_NOFS)) - flags &= ~__GFP_FS; + if (unlikely(current->flags & + (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS | PF_MEMALLOC_NOCMA))) { + /* + * NOIO implies both NOIO and NOFS and it is a weaker context + * so always make sure it makes precedence + */ + if (current->flags & PF_MEMALLOC_NOIO) + flags &= ~(__GFP_IO | __GFP_FS); + else if (current->flags & PF_MEMALLOC_NOFS) + flags &= ~__GFP_FS; +#ifdef CONFIG_CMA + if (current->flags & PF_MEMALLOC_NOCMA) + flags &= ~__GFP_MOVABLE; +#endif + } return flags; } @@ -248,6 +256,30 @@ static inline void memalloc_noreclaim_restore(unsigned int flags) current->flags = (current->flags & ~PF_MEMALLOC) | flags; } +#ifdef CONFIG_CMA +static inline unsigned int memalloc_nocma_save(void) +{ + unsigned int flags = current->flags & PF_MEMALLOC_NOCMA; + + current->flags |= PF_MEMALLOC_NOCMA; + return flags; +} + +static inline void memalloc_nocma_restore(unsigned int flags) +{ + current->flags = (current->flags & ~PF_MEMALLOC_NOCMA) | flags; +} +#else +static inline unsigned int memalloc_nocma_save(void) +{ + return 0; +} + +static inline void memalloc_nocma_restore(unsigned int flags) +{ +} +#endif + #ifdef CONFIG_MEMCG /** * memalloc_use_memcg - Starts the remote memcg charging scope. -- cgit From 9a4e9f3b2d7393d50256762c21e7466b4b6b1c9c Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 5 Mar 2019 15:47:44 -0800 Subject: mm: update get_user_pages_longterm to migrate pages allocated from CMA region This patch updates get_user_pages_longterm to migrate pages allocated out of CMA region. This makes sure that we don't keep non-movable pages (due to page reference count) in the CMA area. This will be used by ppc64 in a later patch to avoid pinning pages in the CMA region. ppc64 uses CMA region for allocation of the hardware page table (hash page table) and not able to migrate pages out of CMA region results in page table allocation failures. One case where we hit this easy is when a guest using a VFIO passthrough device. VFIO locks all the guest's memory and if the guest memory is backed by CMA region, it becomes unmovable resulting in fragmenting the CMA and possibly preventing other guests from allocation a large enough hash page table. NOTE: We allocate the new page without using __GFP_THISNODE Link: http://lkml.kernel.org/r/20190114095438.32470-3-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Alexey Kardashevskiy Cc: Andrea Arcangeli Cc: David Gibson Cc: Michael Ellerman Cc: Michal Hocko Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 2 ++ include/linux/mm.h | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 54c317c8355f..ea35263eb76b 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -371,6 +371,8 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask); struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address); +struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, + int nid, nodemask_t *nmask); int huge_add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t idx); diff --git a/include/linux/mm.h b/include/linux/mm.h index 80bb6408fe73..20ec56f8e2bb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1536,7 +1536,8 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked); long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); -#ifdef CONFIG_FS_DAX + +#if defined(CONFIG_FS_DAX) || defined(CONFIG_CMA) long get_user_pages_longterm(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas); -- cgit From 59118c42a60b997d277ad04d2309a6ec30682e5e Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 5 Mar 2019 15:48:02 -0800 Subject: mm: swap: use mem_cgroup_is_root() instead of deferencing css->parent mem_cgroup_is_root() is the preferred API to check if memcg is root or not. Use it instead of deferencing css->parent. Link: http://lkml.kernel.org/r/1547232913-118148-1-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi Acked-by: Michal Hocko Cc: Huang Ying Cc: Tim Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 622025ac1461..649529be91f2 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -625,7 +625,7 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg) return vm_swappiness; /* root ? */ - if (mem_cgroup_disabled() || !memcg->css.parent) + if (mem_cgroup_disabled() || mem_cgroup_is_root(memcg)) return vm_swappiness; return memcg->swappiness; -- cgit From b9726c26dc21b15a2faea96fae3a42f2f7fffdcb Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 5 Mar 2019 15:48:26 -0800 Subject: numa: make "nr_node_ids" unsigned int Number of NUMA nodes can't be negative. This saves a few bytes on x86_64: add/remove: 0/0 grow/shrink: 4/21 up/down: 27/-265 (-238) Function old new delta hv_synic_alloc.cold 88 110 +22 prealloc_shrinker 260 262 +2 bootstrap 249 251 +2 sched_init_numa 1566 1567 +1 show_slab_objects 778 777 -1 s_show 1201 1200 -1 kmem_cache_init 346 345 -1 __alloc_workqueue_key 1146 1145 -1 mem_cgroup_css_alloc 1614 1612 -2 __do_sys_swapon 4702 4699 -3 __list_lru_init 655 651 -4 nic_probe 2379 2374 -5 store_user_store 118 111 -7 red_zone_store 106 99 -7 poison_store 106 99 -7 wq_numa_init 348 338 -10 __kmem_cache_empty 75 65 -10 task_numa_free 186 173 -13 merge_across_nodes_store 351 336 -15 irq_create_affinity_masks 1261 1246 -15 do_numa_crng_init 343 321 -22 task_numa_fault 4760 4737 -23 swapfile_init 179 156 -23 hv_synic_alloc 536 492 -44 apply_wqattrs_prepare 746 695 -51 Link: http://lkml.kernel.org/r/20190201223029.GA15820@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/nodemask.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 5a30ad594ccc..962c5e783d50 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -444,7 +444,7 @@ static inline int next_memory_node(int nid) return next_node(nid, node_states[N_MEMORY]); } -extern int nr_node_ids; +extern unsigned int nr_node_ids; extern int nr_online_nodes; static inline void node_set_online(int nid) @@ -485,7 +485,7 @@ static inline int num_node_state(enum node_states state) #define first_online_node 0 #define first_memory_node 0 #define next_online_node(nid) (MAX_NUMNODES) -#define nr_node_ids 1 +#define nr_node_ids 1U #define nr_online_nodes 1 #define node_set_online(node) node_set_state((node), N_ONLINE) -- cgit From ce0725f78a56a59bdb07cef003bc6fef722da38e Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 5 Mar 2019 15:48:29 -0800 Subject: numa: make "nr_online_nodes" unsigned int Number of online NUMA nodes can't be negative as well. This doesn't save space as the variable is used only in 32-bit context, but do it anyway for consistency. Link: http://lkml.kernel.org/r/20190201223151.GB15820@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/nodemask.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 962c5e783d50..27e7fa36f707 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -445,7 +445,7 @@ static inline int next_memory_node(int nid) } extern unsigned int nr_node_ids; -extern int nr_online_nodes; +extern unsigned int nr_online_nodes; static inline void node_set_online(int nid) { @@ -486,7 +486,7 @@ static inline int num_node_state(enum node_states state) #define first_memory_node 0 #define next_online_node(nid) (MAX_NUMNODES) #define nr_node_ids 1U -#define nr_online_nodes 1 +#define nr_online_nodes 1U #define node_set_online(node) node_set_state((node), N_ONLINE) #define node_set_offline(node) node_clear_state((node), N_ONLINE) -- cgit From 6d2bef9df7ccf3a2db0160be24f8b92a3f24708a Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 5 Mar 2019 15:48:33 -0800 Subject: mm/page_poison: update comment after code moved mm/debug-pagealloc.c is no more, so of course header now needs to be updated. This seems like something checkpatch should be able to catch - worth looking into? Link: http://lkml.kernel.org/r/20190207191113.14039-1-mst@redhat.com Fixes: 8823b1dbc05f ("mm/page_poison.c: enable PAGE_POISONING as a separate option") Signed-off-by: Michael S. Tsirkin Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/poison.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/poison.h b/include/linux/poison.h index 15927ebc22f2..5046bad0c1c5 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -30,7 +30,7 @@ */ #define TIMER_ENTRY_STATIC ((void *) 0x300 + POISON_POINTER_DELTA) -/********** mm/debug-pagealloc.c **********/ +/********** mm/page_poison.c **********/ #ifdef CONFIG_PAGE_POISONING_ZERO #define PAGE_POISON 0x00 #else -- cgit From 494eec70f054965e2e699db450cde2c08db1c008 Mon Sep 17 00:00:00 2001 From: "john.hubbard@gmail.com" Date: Tue, 5 Mar 2019 15:48:49 -0800 Subject: mm: page_cache_add_speculative(): refactor out some code duplication From: John Hubbard This combines the common elements of these routines: page_cache_get_speculative() page_cache_add_speculative() This was anticipated by the original author, as shown by the comment in commit ce0ad7f095258 ("powerpc/mm: Lockless get_user_pages_fast() for 64-bit (v3)"): "Same as above, but add instead of inc (could just be merged)" There is no intention to introduce any behavioral change, but there is a small risk of that, due to slightly differing ways of expressing the TINY_RCU and related configurations. This also removes the VM_BUG_ON(in_interrupt()) that was in page_cache_add_speculative(), but not in page_cache_get_speculative(). This provides slightly less detection of such bugs, but it given that it was only there on the "add" path anyway, we can likely do without it just fine. And it removes the VM_BUG_ON_PAGE(PageCompound(page) && page != compound_head(page), page); that page_cache_add_speculative() had. Link: http://lkml.kernel.org/r/20190206231016.22734-2-jhubbard@nvidia.com Signed-off-by: John Hubbard Reviewed-by: Andrew Morton Cc: Benjamin Herrenschmidt Cc: Dave Kleikamp Cc: Hugh Dickins Cc: Jeff Layton Cc: Matthew Wilcox Cc: Nicholas Piggin Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 31 +++++++++---------------------- 1 file changed, 9 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index e2d7039af6a3..b477a70cc2e4 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -164,7 +164,7 @@ void release_pages(struct page **pages, int nr); * will find the page or it will not. Likewise, the old find_get_page could run * either before the insertion or afterwards, depending on timing. */ -static inline int page_cache_get_speculative(struct page *page) +static inline int __page_cache_add_speculative(struct page *page, int count) { #ifdef CONFIG_TINY_RCU # ifdef CONFIG_PREEMPT_COUNT @@ -180,10 +180,10 @@ static inline int page_cache_get_speculative(struct page *page) * SMP requires. */ VM_BUG_ON_PAGE(page_count(page) == 0, page); - page_ref_inc(page); + page_ref_add(page, count); #else - if (unlikely(!get_page_unless_zero(page))) { + if (unlikely(!page_ref_add_unless(page, count, 0))) { /* * Either the page has been freed, or will be freed. * In either case, retry here and the caller should @@ -197,27 +197,14 @@ static inline int page_cache_get_speculative(struct page *page) return 1; } -/* - * Same as above, but add instead of inc (could just be merged) - */ -static inline int page_cache_add_speculative(struct page *page, int count) +static inline int page_cache_get_speculative(struct page *page) { - VM_BUG_ON(in_interrupt()); - -#if !defined(CONFIG_SMP) && defined(CONFIG_TREE_RCU) -# ifdef CONFIG_PREEMPT_COUNT - VM_BUG_ON(!in_atomic() && !irqs_disabled()); -# endif - VM_BUG_ON_PAGE(page_count(page) == 0, page); - page_ref_add(page, count); - -#else - if (unlikely(!page_ref_add_unless(page, count, 0))) - return 0; -#endif - VM_BUG_ON_PAGE(PageCompound(page) && page != compound_head(page), page); + return __page_cache_add_speculative(page, 1); +} - return 1; +static inline int page_cache_add_speculative(struct page *page, int count) +{ + return __page_cache_add_speculative(page, count); } #ifdef CONFIG_NUMA -- cgit From ace451eb5ec5bb432fc28d8a723838b88e28643e Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Tue, 5 Mar 2019 15:48:56 -0800 Subject: include/linux/compaction.h: fix potential build error Declaration of struct node is required regardless. On UMA systems, including compaction.h without preceding node.h shouldn't cause a build error. Link: http://lkml.kernel.org/r/20190208080437.253322-1-yuzhao@google.com Signed-off-by: Yu Zhao Reviewed-by: Andrew Morton Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index c960923d9ec2..9569e7c786d3 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -226,8 +226,8 @@ static inline void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_i #endif /* CONFIG_COMPACTION */ -#if defined(CONFIG_COMPACTION) && defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) struct node; +#if defined(CONFIG_COMPACTION) && defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) extern int compaction_register_node(struct node *node); extern void compaction_unregister_node(struct node *node); -- cgit From a7ca12f9d905e7437dd3beb9cbb8e85bc2b991f4 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Tue, 5 Mar 2019 15:49:35 -0800 Subject: mm/workingset: remove unused @mapping argument in workingset_eviction() workingset_eviction() doesn't use and never did use the @mapping argument. Remove it. Link: http://lkml.kernel.org/r/20190228083329.31892-1-aryabinin@virtuozzo.com Signed-off-by: Andrey Ryabinin Acked-by: Johannes Weiner Acked-by: Rik van Riel Acked-by: Vlastimil Babka Acked-by: Mel Gorman Cc: Michal Hocko Cc: William Kucharski Cc: John Hubbard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 649529be91f2..fc50e21b3b88 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -307,7 +307,7 @@ struct vma_swap_readahead { }; /* linux/mm/workingset.c */ -void *workingset_eviction(struct address_space *mapping, struct page *page); +void *workingset_eviction(struct page *page); void workingset_refault(struct page *page, void *shadow); void workingset_activation(struct page *page); -- cgit From f4b7e272b5c0425915e2115068e0a5a20a3a628e Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Tue, 5 Mar 2019 15:49:39 -0800 Subject: mm: remove zone_lru_lock() function, access ->lru_lock directly We have common pattern to access lru_lock from a page pointer: zone_lru_lock(page_zone(page)) Which is silly, because it unfolds to this: &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)]->zone_pgdat->lru_lock while we can simply do &NODE_DATA(page_to_nid(page))->lru_lock Remove zone_lru_lock() function, since it's only complicate things. Use 'page_pgdat(page)->lru_lock' pattern instead. [aryabinin@virtuozzo.com: a slightly better version of __split_huge_page()] Link: http://lkml.kernel.org/r/20190301121651.7741-1-aryabinin@virtuozzo.com Link: http://lkml.kernel.org/r/20190228083329.31892-2-aryabinin@virtuozzo.com Signed-off-by: Andrey Ryabinin Acked-by: Vlastimil Babka Acked-by: Mel Gorman Cc: Johannes Weiner Cc: Michal Hocko Cc: Rik van Riel Cc: William Kucharski Cc: John Hubbard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 2 +- include/linux/mmzone.h | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 0a36a22228e7..ab9b48420200 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -80,7 +80,7 @@ struct page { struct { /* Page cache and anonymous pages */ /** * @lru: Pageout list, eg. active_list protected by - * zone_lru_lock. Sometimes used as a generic list + * pgdat->lru_lock. Sometimes used as a generic list * by the page owner. */ struct list_head lru; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 6d3290cd1f6f..fba7741533be 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -730,10 +730,6 @@ typedef struct pglist_data { #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid)) -static inline spinlock_t *zone_lru_lock(struct zone *zone) -{ - return &zone->zone_pgdat->lru_lock; -} static inline struct lruvec *node_lruvec(struct pglist_data *pgdat) { -- cgit From a9519defc771d574888ffe01e84747889152ec35 Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Tue, 5 Mar 2019 15:50:03 -0800 Subject: writeback: fix inode cgroup switching comment Commit 682aa8e1a6a1 ("writeback: implement unlocked_inode_to_wb transaction and use it for stat updates") refers to inode_switch_wb_work_fn() which never got merged. Switch the comments to inode_switch_wbs_work_fn(). Link: http://lkml.kernel.org/r/20190305004617.142590-1-gthelen@google.com Fixes: 682aa8e1a6a1 ("writeback: implement unlocked_inode_to_wb transaction and use it for stat updates") Signed-off-by: Greg Thelen Reviewed-by: Andrew Morton Acked-by: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/backing-dev.h | 2 +- include/linux/fs.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index c28a47cbe355..f9b029180241 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -365,7 +365,7 @@ unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie) rcu_read_lock(); /* - * Paired with store_release in inode_switch_wb_work_fn() and + * Paired with store_release in inode_switch_wbs_work_fn() and * ensures that we see the new wb if we see cleared I_WB_SWITCH. */ cookie->locked = smp_load_acquire(&inode->i_state) & I_WB_SWITCH; diff --git a/include/linux/fs.h b/include/linux/fs.h index fd423fec8d83..08f26046233e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2091,7 +2091,7 @@ static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) * I_WB_SWITCH Cgroup bdi_writeback switching in progress. Used to * synchronize competing switching instances and to tell * wb stat updates to grab the i_pages lock. See - * inode_switch_wb_work_fn() for details. + * inode_switch_wbs_work_fn() for details. * * I_OVL_INUSE Used by overlayfs to get exclusive ownership on upper * and work dirs among overlayfs mounts. -- cgit