From a3d0a918502cc73af4f60da2cc4c5cac5573f183 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 2 Feb 2016 16:57:08 -0800 Subject: thp: make split_queue per-node Andrea Arcangeli suggested to make split queue per-node to improve scalability. Let's do it. Signed-off-by: Kirill A. Shutemov Suggested-by: Andrea Arcangeli Reviewed-by: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Vlastimil Babka Cc: "Aneesh Kumar K.V" Cc: Johannes Weiner Cc: Michal Hocko Cc: Jerome Marchand Cc: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 49 ++++++++++++++++++++++++++----------------------- mm/page_alloc.c | 5 +++++ 2 files changed, 31 insertions(+), 23 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index fd3a07b3e6f4..253a25e007d7 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -138,9 +138,6 @@ static struct khugepaged_scan khugepaged_scan = { .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), }; -static DEFINE_SPINLOCK(split_queue_lock); -static LIST_HEAD(split_queue); -static unsigned long split_queue_len; static struct shrinker deferred_split_shrinker; static void set_recommended_min_free_kbytes(void) @@ -3358,6 +3355,7 @@ int total_mapcount(struct page *page) int split_huge_page_to_list(struct page *page, struct list_head *list) { struct page *head = compound_head(page); + struct pglist_data *pgdata = NODE_DATA(page_to_nid(head)); struct anon_vma *anon_vma; int count, mapcount, ret; bool mlocked; @@ -3401,19 +3399,19 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) lru_add_drain(); /* Prevent deferred_split_scan() touching ->_count */ - spin_lock_irqsave(&split_queue_lock, flags); + spin_lock_irqsave(&pgdata->split_queue_lock, flags); count = page_count(head); mapcount = total_mapcount(head); if (!mapcount && count == 1) { if (!list_empty(page_deferred_list(head))) { - split_queue_len--; + 
pgdata->split_queue_len--; list_del(page_deferred_list(head)); } - spin_unlock_irqrestore(&split_queue_lock, flags); + spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); __split_huge_page(page, list); ret = 0; } else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) { - spin_unlock_irqrestore(&split_queue_lock, flags); + spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); pr_alert("total_mapcount: %u, page_count(): %u\n", mapcount, count); if (PageTail(page)) @@ -3421,7 +3419,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) dump_page(page, "total_mapcount(head) > 0"); BUG(); } else { - spin_unlock_irqrestore(&split_queue_lock, flags); + spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); unfreeze_page(anon_vma, head); ret = -EBUSY; } @@ -3436,52 +3434,56 @@ out: void free_transhuge_page(struct page *page) { + struct pglist_data *pgdata = NODE_DATA(page_to_nid(page)); unsigned long flags; - spin_lock_irqsave(&split_queue_lock, flags); + spin_lock_irqsave(&pgdata->split_queue_lock, flags); if (!list_empty(page_deferred_list(page))) { - split_queue_len--; + pgdata->split_queue_len--; list_del(page_deferred_list(page)); } - spin_unlock_irqrestore(&split_queue_lock, flags); + spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); free_compound_page(page); } void deferred_split_huge_page(struct page *page) { + struct pglist_data *pgdata = NODE_DATA(page_to_nid(page)); unsigned long flags; VM_BUG_ON_PAGE(!PageTransHuge(page), page); - spin_lock_irqsave(&split_queue_lock, flags); + spin_lock_irqsave(&pgdata->split_queue_lock, flags); if (list_empty(page_deferred_list(page))) { - list_add_tail(page_deferred_list(page), &split_queue); - split_queue_len++; + list_add_tail(page_deferred_list(page), &pgdata->split_queue); + pgdata->split_queue_len++; } - spin_unlock_irqrestore(&split_queue_lock, flags); + spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); } static unsigned long deferred_split_count(struct shrinker *shrink, 
struct shrink_control *sc) { + struct pglist_data *pgdata = NODE_DATA(sc->nid); /* * Split a page from split_queue will free up at least one page, * at most HPAGE_PMD_NR - 1. We don't track exact number. * Let's use HPAGE_PMD_NR / 2 as ballpark. */ - return ACCESS_ONCE(split_queue_len) * HPAGE_PMD_NR / 2; + return ACCESS_ONCE(pgdata->split_queue_len) * HPAGE_PMD_NR / 2; } static unsigned long deferred_split_scan(struct shrinker *shrink, struct shrink_control *sc) { + struct pglist_data *pgdata = NODE_DATA(sc->nid); unsigned long flags; LIST_HEAD(list), *pos, *next; struct page *page; int split = 0; - spin_lock_irqsave(&split_queue_lock, flags); - list_splice_init(&split_queue, &list); + spin_lock_irqsave(&pgdata->split_queue_lock, flags); + list_splice_init(&pgdata->split_queue, &list); /* Take pin on all head pages to avoid freeing them under us */ list_for_each_safe(pos, next, &list) { @@ -3490,10 +3492,10 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, /* race with put_compound_page() */ if (!get_page_unless_zero(page)) { list_del_init(page_deferred_list(page)); - split_queue_len--; + pgdata->split_queue_len--; } } - spin_unlock_irqrestore(&split_queue_lock, flags); + spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); list_for_each_safe(pos, next, &list) { page = list_entry((void *)pos, struct page, mapping); @@ -3505,9 +3507,9 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, put_page(page); } - spin_lock_irqsave(&split_queue_lock, flags); - list_splice_tail(&list, &split_queue); - spin_unlock_irqrestore(&split_queue_lock, flags); + spin_lock_irqsave(&pgdata->split_queue_lock, flags); + list_splice_tail(&list, &pgdata->split_queue); + spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); return split * HPAGE_PMD_NR / 2; } @@ -3516,6 +3518,7 @@ static struct shrinker deferred_split_shrinker = { .count_objects = deferred_split_count, .scan_objects = deferred_split_scan, .seeks = DEFAULT_SEEKS, + .flags = 
SHRINKER_NUMA_AWARE, }; #ifdef CONFIG_DEBUG_FS diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 63358d9f9aa9..ea2c4d3e0c03 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5209,6 +5209,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) spin_lock_init(&pgdat->numabalancing_migrate_lock); pgdat->numabalancing_migrate_nr_pages = 0; pgdat->numabalancing_migrate_next_window = jiffies; +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + spin_lock_init(&pgdat->split_queue_lock); + INIT_LIST_HEAD(&pgdat->split_queue); + pgdat->split_queue_len = 0; #endif init_waitqueue_head(&pgdat->kswapd_wait); init_waitqueue_head(&pgdat->pfmemalloc_wait); -- cgit From cb8d68ec16a511f8be7e1028fd8f869ef7c6a1a8 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 2 Feb 2016 16:57:12 -0800 Subject: thp: change deferred_split_count() to return number of THP in queue I got the meaning of shrinker::count_objects() wrong: it should return the number of potentially freeable objects, which does not necessarily correlate with freeable memory. Returning 256 per THP in queue is not reasonable: shrinker::scan_objects() is never called with nr_to_scan > 128 in my setup. Let's return 1 per THP and correct scan_objects() accordingly. Signed-off-by: Kirill A. 
Shutemov Reviewed-by: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Vlastimil Babka Cc: "Aneesh Kumar K.V" Cc: Johannes Weiner Cc: Michal Hocko Cc: Jerome Marchand Cc: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 253a25e007d7..7aae72114583 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3465,12 +3465,7 @@ static unsigned long deferred_split_count(struct shrinker *shrink, struct shrink_control *sc) { struct pglist_data *pgdata = NODE_DATA(sc->nid); - /* - * Split a page from split_queue will free up at least one page, - * at most HPAGE_PMD_NR - 1. We don't track exact number. - * Let's use HPAGE_PMD_NR / 2 as ballpark. - */ - return ACCESS_ONCE(pgdata->split_queue_len) * HPAGE_PMD_NR / 2; + return ACCESS_ONCE(pgdata->split_queue_len); } static unsigned long deferred_split_scan(struct shrinker *shrink, @@ -3511,7 +3506,13 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, list_splice_tail(&list, &pgdata->split_queue); spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); - return split * HPAGE_PMD_NR / 2; + /* + * Stop shrinker if we didn't split any page, but the queue is empty. + * This can happen if pages were freed under us. + */ + if (!split && list_empty(&pgdata->split_queue)) + return SHRINK_STOP; + return split; } static struct shrinker deferred_split_shrinker = { -- cgit From e3ae19535c665771e2c03cdd63df9bc4d6b37941 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 2 Feb 2016 16:57:15 -0800 Subject: thp: limit number of object to scan on deferred_split_scan() If we have a lot of pages in queue to be split, deferred_split_scan() can spend unreasonable amount of time under spinlock with disabled interrupts. Let's cap number of pages to split on scan by sc->nr_to_scan. Signed-off-by: Kirill A. 
Shutemov Reported-by: Andrea Arcangeli Reviewed-by: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Vlastimil Babka Cc: "Aneesh Kumar K.V" Cc: Johannes Weiner Cc: Michal Hocko Cc: Jerome Marchand Cc: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7aae72114583..c1411961167e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3478,17 +3478,19 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, int split = 0; spin_lock_irqsave(&pgdata->split_queue_lock, flags); - list_splice_init(&pgdata->split_queue, &list); - /* Take pin on all head pages to avoid freeing them under us */ list_for_each_safe(pos, next, &list) { page = list_entry((void *)pos, struct page, mapping); page = compound_head(page); - /* race with put_compound_page() */ - if (!get_page_unless_zero(page)) { + if (get_page_unless_zero(page)) { + list_move(page_deferred_list(page), &list); + } else { + /* We lost race with put_compound_page() */ list_del_init(page_deferred_list(page)); pgdata->split_queue_len--; } + if (!--sc->nr_to_scan) + break; } spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); -- cgit From 65376df582174ffcec9e6471bf5b0dd79ba05e4a Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 2 Feb 2016 16:57:29 -0800 Subject: proc: revert /proc/<pid>/maps [stack:TID] annotation Commit b76437579d13 ("procfs: mark thread stack correctly in proc/<pid>/maps") added [stack:TID] annotation to /proc/<pid>/maps. Finding the task of a stack VMA requires walking the entire thread list, turning this into quadratic behavior: a thousand threads means a thousand stacks, so the rendering of /proc/<pid>/maps needs to look at a million combinations. The cost is not in proportion to the usefulness as described in the patch. 
Drop the [stack:TID] annotation to make /proc/<pid>/maps (and /proc/<pid>/numa_maps) usable again for higher thread counts. The [stack] annotation inside /proc/<pid>/task/<tid>/maps is retained, as identifying the stack VMA there is an O(1) operation. Siddhesh said: "The end users needed a way to identify thread stacks programmatically and there wasn't a way to do that. I'm afraid I no longer remember (or have access to the resources that would aid my memory since I changed employers) the details of their requirement. However, I did do this on my own time because I thought it was an interesting project for me and nobody really gave any feedback then as to its utility, so as far as I am concerned you could roll back the main thread maps information since the information is available in the thread-specific files" Signed-off-by: Johannes Weiner Cc: "Kirill A. Shutemov" Cc: Siddhesh Poyarekar Cc: Shaohua Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/util.c | 27 +-------------------------- 1 file changed, 1 insertion(+), 26 deletions(-) (limited to 'mm') diff --git a/mm/util.c b/mm/util.c index c108a6542d05..4fb14ca5a419 100644 --- a/mm/util.c +++ b/mm/util.c @@ -230,36 +230,11 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, } /* Check if the vma is being used as a stack by this task */ -static int vm_is_stack_for_task(struct task_struct *t, - struct vm_area_struct *vma) +int vma_is_stack_for_task(struct vm_area_struct *vma, struct task_struct *t) { return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t)); } -/* - * Check if the vma is being used as a stack. - * If is_group is non-zero, check in the entire thread group or else - * just check in the current task. Returns the task_struct of the task - * that the vma is stack for. Must be called under rcu_read_lock(). 
- */ -struct task_struct *task_of_stack(struct task_struct *task, - struct vm_area_struct *vma, bool in_group) -{ - if (vm_is_stack_for_task(task, vma)) - return task; - - if (in_group) { - struct task_struct *t; - - for_each_thread(task, t) { - if (vm_is_stack_for_task(t, vma)) - return t; - } - } - - return NULL; -} - #if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) void arch_pick_mmap_layout(struct mm_struct *mm) { -- cgit From d977d56ce5b3e8842236f2f9e7483d4914c9592e Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Tue, 2 Feb 2016 16:57:43 -0800 Subject: mm: warn about VmData over RLIMIT_DATA This patch provides a way of working around a slight regression introduced by commit 84638335900f ("mm: rework virtual memory accounting"). Before that commit, RLIMIT_DATA controlled only the size of the brk region. But that change has caused problems with all existing versions of valgrind, because it sets RLIMIT_DATA to zero. This patch fixes the rlimit check (the limit is actually in bytes, not pages) and by default turns it into a warning which is printed at the first VmData misuse: "mmap: top (795): VmData 516096 exceed data ulimit 512000. Will be forbidden soon." Behavior is controlled by the boot param ignore_rlimit_data=y/n and by sysfs /sys/module/kernel/parameters/ignore_rlimit_data. For now it is set to "y". 
[akpm@linux-foundation.org: tweak kernel-parameters.txt text] Signed-off-by: Konstantin Khlebnikov Link: http://lkml.kernel.org/r/20151228211015.GL2194@uranus Reported-by: Christian Borntraeger Cc: Cyrill Gorcunov Cc: Linus Torvalds Cc: Vegard Nossum Cc: Peter Zijlstra Cc: Vladimir Davydov Cc: Andy Lutomirski Cc: Quentin Casasnovas Cc: Kees Cook Cc: Willy Tarreau Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 16 ++++++++++++++++ mm/mmap.c | 23 +++++++++++++++++------ 2 files changed, 33 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index ed8b5ffcf9b1..6e976302ddd8 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -216,6 +216,22 @@ static inline bool is_cow_mapping(vm_flags_t flags) return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; } +static inline bool is_exec_mapping(vm_flags_t flags) +{ + return (flags & (VM_EXEC | VM_WRITE)) == VM_EXEC; +} + +static inline bool is_stack_mapping(vm_flags_t flags) +{ + return (flags & (VM_STACK_FLAGS & (VM_GROWSUP | VM_GROWSDOWN))) != 0; +} + +static inline bool is_data_mapping(vm_flags_t flags) +{ + return (flags & ((VM_STACK_FLAGS & (VM_GROWSUP | VM_GROWSDOWN)) | + VM_WRITE | VM_SHARED)) == VM_WRITE; +} + /* mm/util.c */ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, struct rb_node *rb_parent); diff --git a/mm/mmap.c b/mm/mmap.c index 84b12624ceb0..cfc0cdca421e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include @@ -69,6 +70,8 @@ const int mmap_rnd_compat_bits_max = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX; int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS; #endif +static bool ignore_rlimit_data = true; +core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644); static void unmap_region(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, @@ -2982,9 +2985,17 @@ 
bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages) if (mm->total_vm + npages > rlimit(RLIMIT_AS) >> PAGE_SHIFT) return false; - if ((flags & (VM_WRITE | VM_SHARED | (VM_STACK_FLAGS & - (VM_GROWSUP | VM_GROWSDOWN)))) == VM_WRITE) - return mm->data_vm + npages <= rlimit(RLIMIT_DATA); + if (is_data_mapping(flags) && + mm->data_vm + npages > rlimit(RLIMIT_DATA) >> PAGE_SHIFT) { + if (ignore_rlimit_data) + pr_warn_once("%s (%d): VmData %lu exceed data ulimit " + "%lu. Will be forbidden soon.\n", + current->comm, current->pid, + (mm->data_vm + npages) << PAGE_SHIFT, + rlimit(RLIMIT_DATA)); + else + return false; + } return true; } @@ -2993,11 +3004,11 @@ void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages) { mm->total_vm += npages; - if ((flags & (VM_EXEC | VM_WRITE)) == VM_EXEC) + if (is_exec_mapping(flags)) mm->exec_vm += npages; - else if (flags & (VM_STACK_FLAGS & (VM_GROWSUP | VM_GROWSDOWN))) + else if (is_stack_mapping(flags)) mm->stack_vm += npages; - else if ((flags & (VM_WRITE | VM_SHARED)) == VM_WRITE) + else if (is_data_mapping(flags)) mm->data_vm += npages; } -- cgit From 30bdbb78009e67767983085e302bec6d97afc679 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Tue, 2 Feb 2016 16:57:46 -0800 Subject: mm: polish virtual memory accounting * add VM_STACK as alias for VM_GROWSUP/DOWN depending on architecture * always account VMAs with flag VM_STACK as stack (as it was before) * cleanup classifying helpers * update comments and documentation Signed-off-by: Konstantin Khlebnikov Tested-by: Sudip Mukherjee Cc: Cyrill Gorcunov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index 6e976302ddd8..a38a21ebddb4 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -216,20 +216,35 @@ static inline bool is_cow_mapping(vm_flags_t flags) return (flags & 
(VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; } +/* + * These three helpers classifies VMAs for virtual memory accounting. + */ + +/* + * Executable code area - executable, not writable, not stack + */ static inline bool is_exec_mapping(vm_flags_t flags) { - return (flags & (VM_EXEC | VM_WRITE)) == VM_EXEC; + return (flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC; } +/* + * Stack area - atomatically grows in one direction + * + * VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous: + * do_mmap() forbids all other combinations. + */ static inline bool is_stack_mapping(vm_flags_t flags) { - return (flags & (VM_STACK_FLAGS & (VM_GROWSUP | VM_GROWSDOWN))) != 0; + return (flags & VM_STACK) == VM_STACK; } +/* + * Data area - private, writable, not stack + */ static inline bool is_data_mapping(vm_flags_t flags) { - return (flags & ((VM_STACK_FLAGS & (VM_GROWSUP | VM_GROWSDOWN)) | - VM_WRITE | VM_SHARED)) == VM_WRITE; + return (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE; } /* mm/util.c */ -- cgit From 3c1da7beeee02560cd0f0c66c5a59fce3c6746e3 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Tue, 2 Feb 2016 16:57:49 -0800 Subject: mm/vmpressure.c: fix subtree pressure detection When vmpressure is called for the entire subtree under pressure we mistakenly use vmpressure->scanned instead of vmpressure->tree_scanned when checking if vmpressure work is to be scheduled. This results in suppressing all vmpressure events in the legacy cgroup hierarchy. Fix it. 
Fixes: 8e8ae645249b ("mm: memcontrol: hook up vmpressure to socket pressure") Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmpressure.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 9a6c0704211c..149fdf6c5c56 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -248,9 +248,8 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, if (tree) { spin_lock(&vmpr->sr_lock); - vmpr->tree_scanned += scanned; + scanned = vmpr->tree_scanned += scanned; vmpr->tree_reclaimed += reclaimed; - scanned = vmpr->scanned; spin_unlock(&vmpr->sr_lock); if (scanned < vmpressure_win) -- cgit From 12c9d70bd5056b3ae84746fca973c286f48384cc Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 2 Feb 2016 16:57:57 -0800 Subject: mm: fix memory leak in copy_huge_pmd() We allocate a pgtable but do not attach it to anything if the PMD is in a DAX VMA, causing it to leak. We certainly try to not free pgtables associated with the huge zero page if the zero page is in a DAX VMA, so I think this is the right solution. This needs to be properly audited. Signed-off-by: Matthew Wilcox Cc: Dan Williams Acked-by: Kirill A. 
Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index c1411961167e..36c070167b71 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -858,7 +858,8 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, return false; entry = mk_pmd(zero_page, vma->vm_page_prot); entry = pmd_mkhuge(entry); - pgtable_trans_huge_deposit(mm, pmd, pgtable); + if (pgtable) + pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); atomic_long_inc(&mm->nr_ptes); return true; @@ -1036,13 +1037,15 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, spinlock_t *dst_ptl, *src_ptl; struct page *src_page; pmd_t pmd; - pgtable_t pgtable; + pgtable_t pgtable = NULL; int ret; - ret = -ENOMEM; - pgtable = pte_alloc_one(dst_mm, addr); - if (unlikely(!pgtable)) - goto out; + if (!vma_is_dax(vma)) { + ret = -ENOMEM; + pgtable = pte_alloc_one(dst_mm, addr); + if (unlikely(!pgtable)) + goto out; + } dst_ptl = pmd_lock(dst_mm, dst_pmd); src_ptl = pmd_lockptr(src_mm, src_pmd); @@ -1073,7 +1076,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, goto out_unlock; } - if (pmd_trans_huge(pmd)) { + if (!vma_is_dax(vma)) { /* thp accounting separate from pmd_devmap accounting */ src_page = pmd_page(pmd); VM_BUG_ON_PAGE(!PageHead(src_page), src_page); -- cgit From 464353647427793aef800503ec42acb68e95d9e2 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 30 Jan 2016 18:03:16 -0800 Subject: mm: retire GUP WARN_ON_ONCE that outlived its usefulness Trinity is now hitting the WARN_ON_ONCE we added in v3.15 commit cda540ace6a1 ("mm: get_user_pages(write,force) refuse to COW in shared areas"). The warning has served its purpose, nobody was harmed by that change, so just remove the warning to generate less noise from Trinity. 
Which reminds me of the comment I wrongly left behind with that commit (but was spotted at the time by Kirill), which has since moved into a separate function, and become even more obscure: delete it. Reported-by: Dave Jones Suggested-by: Kirill A. Shutemov Signed-off-by: Hugh Dickins Signed-off-by: Linus Torvalds --- mm/gup.c | 4 +--- mm/memory.c | 5 ----- 2 files changed, 1 insertion(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/gup.c b/mm/gup.c index b64a36175884..7bf19ffa2199 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -430,10 +430,8 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) * Anon pages in shared mappings are surprising: now * just reject it. */ - if (!is_cow_mapping(vm_flags)) { - WARN_ON_ONCE(vm_flags & VM_MAYWRITE); + if (!is_cow_mapping(vm_flags)) return -EFAULT; - } } } else if (!(vm_flags & VM_READ)) { if (!(gup_flags & FOLL_FORCE)) diff --git a/mm/memory.c b/mm/memory.c index 93ce37989471..635451abc8f7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2237,11 +2237,6 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma, page_cache_get(old_page); - /* - * Only catch write-faults on shared writable pages, - * read-only shared pages can get COWed by - * get_user_pages(.write=1, .force=1). - */ if (vma->vm_ops && vma->vm_ops->page_mkwrite) { int tmp; -- cgit From acf128d048c76aeaa99646bce3488d73be215f70 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 5 Feb 2016 15:36:13 -0800 Subject: mm: validate_mm browse_rb SMP race condition The mmap_sem for reading in validate_mm called from expand_stack is not enough to prevent the augmented rbtree rb_subtree_gap information from changing under us, because expand_stack may be running concurrently in other threads, which will hold the mmap_sem for reading too. The augmented rbtree is updated with vma_gap_update under the page_table_lock so use it in browse_rb() too to avoid false positives. 
Signed-off-by: Andrea Arcangeli Reported-by: Dmitry Vyukov Tested-by: Dmitry Vyukov Cc: Konstantin Khlebnikov Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index cfc0cdca421e..918c9ec5043f 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -390,8 +390,9 @@ static long vma_compute_subtree_gap(struct vm_area_struct *vma) } #ifdef CONFIG_DEBUG_VM_RB -static int browse_rb(struct rb_root *root) +static int browse_rb(struct mm_struct *mm) { + struct rb_root *root = &mm->mm_rb; int i = 0, j, bug = 0; struct rb_node *nd, *pn = NULL; unsigned long prev = 0, pend = 0; @@ -414,12 +415,14 @@ static int browse_rb(struct rb_root *root) vma->vm_start, vma->vm_end); bug = 1; } + spin_lock(&mm->page_table_lock); if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) { pr_emerg("free gap %lx, correct %lx\n", vma->rb_subtree_gap, vma_compute_subtree_gap(vma)); bug = 1; } + spin_unlock(&mm->page_table_lock); i++; pn = nd; prev = vma->vm_start; @@ -475,7 +478,7 @@ static void validate_mm(struct mm_struct *mm) mm->highest_vm_end, highest_address); bug = 1; } - i = browse_rb(&mm->mm_rb); + i = browse_rb(mm); if (i != mm->map_count) { if (i != -1) pr_emerg("map_count %d rb %d\n", mm->map_count, i); -- cgit From 1f1ffb8a151e8c59899c78019f76cb8f64be13f5 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Fri, 5 Feb 2016 15:36:19 -0800 Subject: memblock: don't mark memblock_phys_mem_size() as __init At the moment memblock_phys_mem_size() is marked as __init, and so is discarded after boot. This is different from most of the memblock functions which are marked __init_memblock, and are only discarded after boot if memory hotplug is not configured. To allow for upcoming code which will need memblock_phys_mem_size() in the hotplug path, change it from __init to __init_memblock. 
Signed-off-by: David Gibson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index d2ed81e59a94..dd7989929f13 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1448,7 +1448,7 @@ void __init __memblock_free_late(phys_addr_t base, phys_addr_t size) * Remaining API functions */ -phys_addr_t __init memblock_phys_mem_size(void) +phys_addr_t __init_memblock memblock_phys_mem_size(void) { return memblock.memory.total_size; } -- cgit From 1ce221036b83288358d8d5d5ff1784657b3ab4e9 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 5 Feb 2016 15:36:21 -0800 Subject: mm/Kconfig: correct description of DEFERRED_STRUCT_PAGE_INIT The description mentions kswapd threads, while the deferred struct page initialization is actually done by one-off "pgdatinitX" threads. Fix the description so that potentially users are not confused about pgdatinit threads using CPU after boot instead of kswapd. Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index 97a4e06b15c0..03cbfa072f42 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -624,7 +624,7 @@ config ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT bool config DEFERRED_STRUCT_PAGE_INIT - bool "Defer initialisation of struct pages to kswapd" + bool "Defer initialisation of struct pages to kthreads" default n depends on ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT depends on MEMORY_HOTPLUG @@ -633,9 +633,10 @@ config DEFERRED_STRUCT_PAGE_INIT single thread. On very large machines this can take a considerable amount of time. If this option is set, large machines will bring up a subset of memmap at boot and then initialise the rest in parallel - when kswapd starts. 
This has a potential performance impact on - processes running early in the lifetime of the systemm until kswapd - finishes the initialisation. + by starting one-off "pgdatinitX" kernel thread for each node X. This + has a potential performance impact on processes running early in the + lifetime of the system until these kthreads finish the + initialisation. config IDLE_PAGE_TRACKING bool "Enable idle page tracking" -- cgit From f01f17d3705bb6081c9e5728078f64067982be36 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 5 Feb 2016 15:36:24 -0800 Subject: mm, vmstat: make quiet_vmstat lighter Mike has reported a considerable overhead of refresh_cpu_vm_stats from the idle entry during pipe test: 12.89% [kernel] [k] refresh_cpu_vm_stats.isra.12 4.75% [kernel] [k] __schedule 4.70% [kernel] [k] mutex_unlock 3.14% [kernel] [k] __switch_to This is caused by commit 0eb77e988032 ("vmstat: make vmstat_updater deferrable again and shut down on idle") which has placed quiet_vmstat into cpu_idle_loop. The main reason here seems to be that the idle entry has to get over all zones and perform atomic operations for each vmstat entry even though there might be no per cpu diffs. This is a pointless overhead for _each_ idle entry. Make sure that quiet_vmstat is as light as possible. First of all it doesn't make any sense to do any local sync if the current cpu is already set in oncpu_stat_off because vmstat_update puts itself there only if there is nothing to do. Then we can check need_update which should be a cheap way to check for potential per-cpu diffs and only then do refresh_cpu_vm_stats. The original patch also did cancel_delayed_work which we are not doing here. There are two reasons for that. 
Firstly cancel_delayed_work from idle context will blow up on RT kernels (reported by Mike): CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.5.0-rt3 #7 Hardware name: MEDION MS-7848/MS-7848, BIOS M7848W08.20C 09/23/2013 Call Trace: dump_stack+0x49/0x67 ___might_sleep+0xf5/0x180 rt_spin_lock+0x20/0x50 try_to_grab_pending+0x69/0x240 cancel_delayed_work+0x26/0xe0 quiet_vmstat+0x75/0xa0 cpu_idle_loop+0x38/0x3e0 cpu_startup_entry+0x13/0x20 start_secondary+0x114/0x140 And secondly, even on !RT kernels it might add some non trivial overhead which is not necessary. Even if the vmstat worker wakes up and preempts idle then it will be most likely a single shot noop because the stats were already synced and so it would end up on the oncpu_stat_off anyway. We just need to teach both vmstat_shepherd and vmstat_update to stop scheduling the worker if there is nothing to do. [mgalbraith@suse.de: cancel pending work of the cpu_stat_off CPU] Signed-off-by: Michal Hocko Reported-by: Mike Galbraith Acked-by: Christoph Lameter Signed-off-by: Mike Galbraith Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 68 +++++++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 46 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/mm/vmstat.c b/mm/vmstat.c index 40b2c74ddf16..1543f64df3e6 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1396,10 +1396,15 @@ static void vmstat_update(struct work_struct *w) * Counters were updated so we expect more updates * to occur in the future. Keep on running the * update worker thread. + * If we were marked on cpu_stat_off clear the flag + * so that vmstat_shepherd doesn't schedule us again. 
*/ - queue_delayed_work_on(smp_processor_id(), vmstat_wq, - this_cpu_ptr(&vmstat_work), - round_jiffies_relative(sysctl_stat_interval)); + if (!cpumask_test_and_clear_cpu(smp_processor_id(), + cpu_stat_off)) { + queue_delayed_work_on(smp_processor_id(), vmstat_wq, + this_cpu_ptr(&vmstat_work), + round_jiffies_relative(sysctl_stat_interval)); + } } else { /* * We did not update any counters so the app may be in @@ -1417,18 +1422,6 @@ static void vmstat_update(struct work_struct *w) * until the diffs stay at zero. The function is used by NOHZ and can only be * invoked when tick processing is not active. */ -void quiet_vmstat(void) -{ - if (system_state != SYSTEM_RUNNING) - return; - - do { - if (!cpumask_test_and_set_cpu(smp_processor_id(), cpu_stat_off)) - cancel_delayed_work(this_cpu_ptr(&vmstat_work)); - - } while (refresh_cpu_vm_stats(false)); -} - /* * Check if the diffs for a certain cpu indicate that * an update is needed. @@ -1452,6 +1445,30 @@ static bool need_update(int cpu) return false; } +void quiet_vmstat(void) +{ + if (system_state != SYSTEM_RUNNING) + return; + + /* + * If we are already in hands of the shepherd then there + * is nothing for us to do here. + */ + if (cpumask_test_and_set_cpu(smp_processor_id(), cpu_stat_off)) + return; + + if (!need_update(smp_processor_id())) + return; + + /* + * Just refresh counters and do not care about the pending delayed + * vmstat_update. It doesn't fire that often to matter and canceling + * it would be too expensive from this path. + * vmstat_shepherd will take care about that for us. 
+ */ + refresh_cpu_vm_stats(false); +} + /* * Shepherd worker thread that checks the @@ -1469,18 +1486,25 @@ static void vmstat_shepherd(struct work_struct *w) get_online_cpus(); /* Check processors whose vmstat worker threads have been disabled */ - for_each_cpu(cpu, cpu_stat_off) - if (need_update(cpu) && - cpumask_test_and_clear_cpu(cpu, cpu_stat_off)) - - queue_delayed_work_on(cpu, vmstat_wq, - &per_cpu(vmstat_work, cpu), 0); + for_each_cpu(cpu, cpu_stat_off) { + struct delayed_work *dw = &per_cpu(vmstat_work, cpu); + if (need_update(cpu)) { + if (cpumask_test_and_clear_cpu(cpu, cpu_stat_off)) + queue_delayed_work_on(cpu, vmstat_wq, dw, 0); + } else { + /* + * Cancel the work if quiet_vmstat has put this + * cpu on cpu_stat_off because the work item might + * be still scheduled + */ + cancel_delayed_work(dw); + } + } put_online_cpus(); schedule_delayed_work(&shepherd, round_jiffies_relative(sysctl_stat_interval)); - } static void __init start_shepherd_timer(void) -- cgit From ccde8bd4014eb2f01102f7a64f0fad3df193b758 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 5 Feb 2016 15:36:27 -0800 Subject: vmstat: make vmstat_update deferrable Commit 0eb77e988032 ("vmstat: make vmstat_updater deferrable again and shut down on idle") made vmstat_shepherd deferrable. vmstat_update itself is still using a standard timer which might interrupt idle task. This is possible because "mm, vmstat: make quiet_vmstat lighter" removed cancel_delayed_work from the quiet_vmstat. Change vmstat_work to use DEFERRABLE_WORK to prevent pointless wakeups from the idle context.
Acked-by: Christoph Lameter Signed-off-by: Michal Hocko Cc: Mike Galbraith Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmstat.c b/mm/vmstat.c index 1543f64df3e6..084c6725b373 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1512,7 +1512,7 @@ static void __init start_shepherd_timer(void) int cpu; for_each_possible_cpu(cpu) - INIT_DELAYED_WORK(per_cpu_ptr(&vmstat_work, cpu), + INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu), vmstat_update); if (!alloc_cpumask_var(&cpu_stat_off, GFP_KERNEL)) -- cgit From 564e81a57f9788b1475127012e0fd44e9049e342 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Fri, 5 Feb 2016 15:36:30 -0800 Subject: mm, vmstat: fix wrong WQ sleep when memory reclaim doesn't make any progress Jan Stancek has reported that system occasionally hanging after "oom01" testcase from LTP triggers OOM. Guessing from a result that there is a kworker thread doing memory allocation and the values between "Node 0 Normal free:" and "Node 0 Normal:" differs when hanging, vmstat is not up-to-date for some reason. According to commit 373ccbe59270 ("mm, vmstat: allow WQ concurrency to discover memory reclaim doesn't make any progress"), it meant to force the kworker thread to take a short sleep, but it by error used schedule_timeout(1). We missed that schedule_timeout() in state TASK_RUNNING doesn't do anything. Fix it by using schedule_timeout_uninterruptible(1) which forces the kworker thread to take a short sleep in order to make sure that vmstat is up-to-date. 
Fixes: 373ccbe59270 ("mm, vmstat: allow WQ concurrency to discover memory reclaim doesn't make any progress") Signed-off-by: Tetsuo Handa Reported-by: Jan Stancek Acked-by: Michal Hocko Cc: Tejun Heo Cc: Cristopher Lameter Cc: Joonsoo Kim Cc: Arkadiusz Miskiewicz Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/backing-dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/backing-dev.c b/mm/backing-dev.c index cc5d29d2da9b..926c76d56388 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -989,7 +989,7 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout) * here rather than calling cond_resched(). */ if (current->flags & PF_WQ_WORKER) - schedule_timeout(1); + schedule_timeout_uninterruptible(1); else cond_resched(); -- cgit From 77bf45e78050790d8f7fc30b87a0ca674bf6265a Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 5 Feb 2016 15:36:33 -0800 Subject: mempolicy: do not try to queue pages from !vma_migratable() Maybe I miss some point, but I don't see a reason why we try to queue pages from non migratable VMAs. This testcase steps on VM_BUG_ON_PAGE() in isolate_lru_page(): #include #include #include #include #include #define SIZE 0x2000 int foo; int main() { int fd; char *p; unsigned long mask = 2; fd = open("/dev/sg0", O_RDWR); p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); /* Faultin pages */ foo = p[0] + p[0x1000]; mbind(p, SIZE, MPOL_BIND, &mask, 4, MPOL_MF_MOVE | MPOL_MF_STRICT); return 0; } The only case when we can queue pages from such VMA is MPOL_MF_STRICT plus MPOL_MF_MOVE or MPOL_MF_MOVE_ALL for VMA which has pages on LRU, but gfp mask is not suitable for migration (see mapping_gfp_mask() check in vma_migratable()). That looks like a bug to me. Let's filter out non-migratable vma at start of queue_pages_test_walk() and go to queue_pages_pte_range() only if MPOL_MF_MOVE or MPOL_MF_MOVE_ALL flag is set. Signed-off-by: Kirill A.
Shutemov Signed-off-by: Dmitry Vyukov Cc: Vlastimil Babka Cc: David Rientjes Cc: Naoya Horiguchi Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 27d135408a22..4c4187c0e1de 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -548,8 +548,7 @@ retry: goto retry; } - if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) - migrate_page_add(page, qp->pagelist, flags); + migrate_page_add(page, qp->pagelist, flags); } pte_unmap_unlock(pte - 1, ptl); cond_resched(); @@ -625,7 +624,7 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, unsigned long endvma = vma->vm_end; unsigned long flags = qp->flags; - if (vma->vm_flags & VM_PFNMAP) + if (!vma_migratable(vma)) return 1; if (endvma > end) @@ -644,16 +643,13 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, if (flags & MPOL_MF_LAZY) { /* Similar to task_numa_work, skip inaccessible VMAs */ - if (vma_migratable(vma) && - vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) + if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) change_prot_numa(vma, start, endvma); return 1; } - if ((flags & MPOL_MF_STRICT) || - ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && - vma_migratable(vma))) - /* queue pages from current vma */ + /* queue pages from current vma */ + if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) return 0; return 1; } -- cgit From cf2a82ee432730ab25c21d054c67f296af4fc4bd Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 5 Feb 2016 15:36:36 -0800 Subject: mm: downgrade VM_BUG in isolate_lru_page() to warning Calling isolate_lru_page() on a tail page is wrong and shouldn't happen, but it is not necessarily fatal: the page just will not be isolated if it's not on LRU. Let's downgrade the VM_BUG_ON_PAGE() to WARN_RATELIMIT(). Signed-off-by: Kirill A.
Shutemov Cc: Dmitry Vyukov Cc: Vlastimil Babka Cc: David Rientjes Cc: Naoya Horiguchi Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index eb3dd37ccd7c..71b1c29948db 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1443,7 +1443,7 @@ int isolate_lru_page(struct page *page) int ret = -EBUSY; VM_BUG_ON_PAGE(!page_count(page), page); - VM_BUG_ON_PAGE(PageTail(page), page); + WARN_RATELIMIT(PageTail(page), "trying to isolate tail page"); if (PageLRU(page)) { struct zone *zone = page_zone(page); -- cgit From b4330afbed0cdceeba33c4945158c55771047e81 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Fri, 5 Feb 2016 15:36:38 -0800 Subject: mm/hugetlb: fix gigantic page initialization/allocation Attempting to preallocate 1G gigantic huge pages at boot time with "hugepagesz=1G hugepages=1" on the kernel command line will prevent booting with the following: kernel BUG at mm/hugetlb.c:1218! When mapcount accounting was reworked, the setting of compound_mapcount_ptr in prep_compound_gigantic_page was overlooked. As a result, the validation of mapcount in free_huge_page fails. The "BUG_ON" checks in free_huge_page were also changed to "VM_BUG_ON_PAGE" to assist with debugging. Fixes: 53f9263baba69 ("mm: rework mapcount accounting to enable 4k mapping of THPs") Signed-off-by: Mike Kravetz Signed-off-by: Naoya Horiguchi Acked-by: Kirill A. 
Shutemov Acked-by: David Rientjes Tested-by: Vlastimil Babka Cc: "Aneesh Kumar K.V" Cc: Jerome Marchand Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 12908dcf5831..d7a802427ea8 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1214,8 +1214,8 @@ void free_huge_page(struct page *page) set_page_private(page, 0); page->mapping = NULL; - BUG_ON(page_count(page)); - BUG_ON(page_mapcount(page)); + VM_BUG_ON_PAGE(page_count(page), page); + VM_BUG_ON_PAGE(page_mapcount(page), page); restore_reserve = PagePrivate(page); ClearPagePrivate(page); @@ -1286,6 +1286,7 @@ static void prep_compound_gigantic_page(struct page *page, unsigned int order) set_page_count(p, 0); set_compound_head(p, page); } + atomic_set(compound_mapcount_ptr(page), -1); } /* -- cgit From 080fe2068e1c7f19f565b30b78baf78edf16a980 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 5 Feb 2016 15:36:41 -0800 Subject: mm, hugetlb: don't require CMA for runtime gigantic pages Commit 944d9fec8d7a ("hugetlb: add support for gigantic page allocation at runtime") has added the runtime gigantic page allocation via alloc_contig_range(), making this support available only when CONFIG_CMA is enabled. Because it doesn't depend on MIGRATE_CMA pageblocks and the associated infrastructure, it is possible with few simple adjustments to require only CONFIG_MEMORY_ISOLATION instead of full CONFIG_CMA. After this patch, alloc_contig_range() and related functions are available and used for gigantic pages with just CONFIG_MEMORY_ISOLATION enabled. Note CONFIG_CMA selects CONFIG_MEMORY_ISOLATION. This allows supporting runtime gigantic pages without the CMA-specific checks in page allocator fastpaths. Signed-off-by: Vlastimil Babka Cc: Luiz Capitulino Cc: Kirill A. 
Shutemov Cc: Zhang Yanfei Cc: Yasuaki Ishimatsu Cc: Joonsoo Kim Cc: Naoya Horiguchi Cc: Mel Gorman Cc: Davidlohr Bueso Cc: Hillf Danton Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 2 +- mm/page_alloc.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d7a802427ea8..06ae13e869d0 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1001,7 +1001,7 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) ((node = hstate_next_node_to_free(hs, mask)) || 1); \ nr_nodes--) -#if defined(CONFIG_CMA) && defined(CONFIG_X86_64) +#if defined(CONFIG_X86_64) && ((defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)) static void destroy_compound_gigantic_page(struct page *page, unsigned int order) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ea2c4d3e0c03..838ca8bb64f7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6620,7 +6620,7 @@ bool is_pageblock_removable_nolock(struct page *page) return !has_unmovable_pages(zone, page, 0, true); } -#ifdef CONFIG_CMA +#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) static unsigned long pfn_max_align_down(unsigned long pfn) { -- cgit From 12352d3cae2cebe18805a91fab34b534d7444231 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Fri, 5 Feb 2016 15:36:50 -0800 Subject: mm: replace vma_lock_anon_vma with anon_vma_lock_read/write Sequence vma_lock_anon_vma() - vma_unlock_anon_vma() isn't safe if anon_vma appeared between lock and unlock. We have to check anon_vma first or call anon_vma_prepare() to be sure that it's here. There are only few users of these legacy helpers. Let's get rid of them. This patch fixes anon_vma lock imbalance in validate_mm(). Write lock isn't required here, read lock is enough. 
And reorders expand_downwards/expand_upwards: security_mmap_addr() and wrapping-around check don't have to be under anon vma lock. Link: https://lkml.kernel.org/r/CACT4Y+Y908EjM2z=706dv4rV6dWtxTLK9nFg9_7DhRMLppBo2g@mail.gmail.com Signed-off-by: Konstantin Khlebnikov Reported-by: Dmitry Vyukov Acked-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 55 +++++++++++++++++++++++++------------------------------ 1 file changed, 25 insertions(+), 30 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 918c9ec5043f..2f2415a7a688 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -459,12 +459,16 @@ static void validate_mm(struct mm_struct *mm) struct vm_area_struct *vma = mm->mmap; while (vma) { + struct anon_vma *anon_vma = vma->anon_vma; struct anon_vma_chain *avc; - vma_lock_anon_vma(vma); - list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) - anon_vma_interval_tree_verify(avc); - vma_unlock_anon_vma(vma); + if (anon_vma) { + anon_vma_lock_read(anon_vma); + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + anon_vma_interval_tree_verify(avc); + anon_vma_unlock_read(anon_vma); + } + highest_address = vma->vm_end; vma = vma->vm_next; i++; @@ -2145,32 +2149,27 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns int expand_upwards(struct vm_area_struct *vma, unsigned long address) { struct mm_struct *mm = vma->vm_mm; - int error; + int error = 0; if (!(vma->vm_flags & VM_GROWSUP)) return -EFAULT; - /* - * We must make sure the anon_vma is allocated - * so that the anon_vma locking is not a noop. - */ + /* Guard against wrapping around to address 0. */ + if (address < PAGE_ALIGN(address+4)) + address = PAGE_ALIGN(address+4); + else + return -ENOMEM; + + /* We must make sure the anon_vma is allocated. 
*/ if (unlikely(anon_vma_prepare(vma))) return -ENOMEM; - vma_lock_anon_vma(vma); /* * vma->vm_start/vm_end cannot change under us because the caller * is required to hold the mmap_sem in read mode. We need the * anon_vma lock to serialize against concurrent expand_stacks. - * Also guard against wrapping around to address 0. */ - if (address < PAGE_ALIGN(address+4)) - address = PAGE_ALIGN(address+4); - else { - vma_unlock_anon_vma(vma); - return -ENOMEM; - } - error = 0; + anon_vma_lock_write(vma->anon_vma); /* Somebody else might have raced and expanded it already */ if (address > vma->vm_end) { @@ -2188,7 +2187,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) * updates, but we only hold a shared mmap_sem * lock here, so we need to protect against * concurrent vma expansions. - * vma_lock_anon_vma() doesn't help here, as + * anon_vma_lock_write() doesn't help here, as * we don't guarantee that all growable vmas * in a mm share the same root anon vma. * So, we reuse mm->page_table_lock to guard @@ -2211,7 +2210,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) } } } - vma_unlock_anon_vma(vma); + anon_vma_unlock_write(vma->anon_vma); khugepaged_enter_vma_merge(vma, vma->vm_flags); validate_mm(mm); return error; @@ -2227,25 +2226,21 @@ int expand_downwards(struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; int error; - /* - * We must make sure the anon_vma is allocated - * so that the anon_vma locking is not a noop. - */ - if (unlikely(anon_vma_prepare(vma))) - return -ENOMEM; - address &= PAGE_MASK; error = security_mmap_addr(address); if (error) return error; - vma_lock_anon_vma(vma); + /* We must make sure the anon_vma is allocated. */ + if (unlikely(anon_vma_prepare(vma))) + return -ENOMEM; /* * vma->vm_start/vm_end cannot change under us because the caller * is required to hold the mmap_sem in read mode. We need the * anon_vma lock to serialize against concurrent expand_stacks. 
*/ + anon_vma_lock_write(vma->anon_vma); /* Somebody else might have raced and expanded it already */ if (address < vma->vm_start) { @@ -2263,7 +2258,7 @@ int expand_downwards(struct vm_area_struct *vma, * updates, but we only hold a shared mmap_sem * lock here, so we need to protect against * concurrent vma expansions. - * vma_lock_anon_vma() doesn't help here, as + * anon_vma_lock_write() doesn't help here, as * we don't guarantee that all growable vmas * in a mm share the same root anon vma. * So, we reuse mm->page_table_lock to guard @@ -2284,7 +2279,7 @@ int expand_downwards(struct vm_area_struct *vma, } } } - vma_unlock_anon_vma(vma); + anon_vma_unlock_write(vma->anon_vma); khugepaged_enter_vma_merge(vma, vma->vm_flags); validate_mm(mm); return error; -- cgit From ae026204a2b9060817503408906b35cefd824420 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 5 Feb 2016 15:36:53 -0800 Subject: thp: make deferred_split_scan() work again We need to iterate over split_queue, not local empty list to get anything split from the shrinker. Fixes: e3ae19535c66 ("thp: limit number of object to scan on deferred_split_scan()") Signed-off-by: Kirill A. 
Shutemov Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 36c070167b71..08fc0ba2207e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3482,7 +3482,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, spin_lock_irqsave(&pgdata->split_queue_lock, flags); /* Take pin on all head pages to avoid freeing them under us */ - list_for_each_safe(pos, next, &list) { + list_for_each_safe(pos, next, &pgdata->split_queue) { page = list_entry((void *)pos, struct page, mapping); page = compound_head(page); if (get_page_unless_zero(page)) { -- cgit From 62eb320ab077890dbbcc28343fa6432a82a10c35 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 11 Feb 2016 16:12:58 -0800 Subject: mm: fix filemap.c kernel doc warning Add missing kernel-doc notation for function parameter 'gfp_mask' to fix kernel-doc warning. mm/filemap.c:1898: warning: No description found for parameter 'gfp_mask' Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index bc943867d68c..23edccecadb0 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1890,6 +1890,7 @@ EXPORT_SYMBOL(generic_file_read_iter); * page_cache_read - adds requested page to the page cache if not already there * @file: file to read * @offset: page index + * @gfp_mask: memory allocation flags * * This adds the requested page to the page cache if it isn't already there, * and schedules an I/O to read in its contents from disk. -- cgit From 6b9116a652bd9e0e2994505cfaaa5f66deaa2a05 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 11 Feb 2016 16:13:03 -0800 Subject: mm, dax: check for pmd_none() after split_huge_pmd() DAX implements split_huge_pmd() by clearing pmd. 
This simple approach reduces memory overhead, as we don't need to deposit page table on huge page mapping to make split_huge_pmd() never-fail. PTE table can be allocated and populated later on page fault from backing store. But one side effect is that have to check if pmd is pmd_none() after split_huge_pmd(). In most places we do this already to deal with parallel MADV_DONTNEED. But I found two call sites which is not affected by MADV_DONTNEED (due down_write(mmap_sem)), but need to have the check to work with DAX properly. Signed-off-by: Kirill A. Shutemov Cc: Dan Williams Cc: Matthew Wilcox Cc: Andrea Arcangeli Cc: Ross Zwisler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mprotect.c | 6 ++++-- mm/mremap.c | 2 ++ 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/mprotect.c b/mm/mprotect.c index 8eb7bb40dc40..f7cb3d4d9c2e 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -160,9 +160,11 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, } if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { - if (next - addr != HPAGE_PMD_SIZE) + if (next - addr != HPAGE_PMD_SIZE) { split_huge_pmd(vma, pmd, addr); - else { + if (pmd_none(*pmd)) + continue; + } else { int nr_ptes = change_huge_pmd(vma, pmd, addr, newprot, prot_numa); diff --git a/mm/mremap.c b/mm/mremap.c index d77946a997f7..8eeba02fc991 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -210,6 +210,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma, } } split_huge_pmd(vma, old_pmd, old_addr); + if (pmd_none(*old_pmd)) + continue; VM_BUG_ON(pmd_trans_huge(*old_pmd)); } if (pmd_none(*new_pmd) && __pte_alloc(new_vma->vm_mm, new_vma, -- cgit From 078c6c3a5e7dc53a9a23408cc32c83954abb5d0d Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 11 Feb 2016 16:13:06 -0800 Subject: mm/backing-dev.c: fix error path in wb_init() We need to use post-decrement to get percpu_counter_destroy() called on &wb->stat[0]. 
Moreover, the pre-decrement would cause infinite out-of-bounds accesses if the setup code failed at i==0. Signed-off-by: Rasmus Villemoes Cc: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/backing-dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 926c76d56388..c554d173a65f 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -328,7 +328,7 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, return 0; out_destroy_stat: - while (--i) + while (i--) percpu_counter_destroy(&wb->stat[i]); fprop_local_destroy_percpu(&wb->completions); out_put_cong: -- cgit From 6a6ac72fd6ea32594b316513e1826c3f6db4cc93 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Thu, 11 Feb 2016 16:13:09 -0800 Subject: mm,thp: khugepaged: call pte flush at the time of collapse This showed up on ARC when running LMBench bw_mem tests as Overlapping TLB Machine Check Exception triggered due to STLB entry (2M pages) overlapping some NTLB entry (regular 8K page). bw_mem 2m touches a large chunk of vaddr creating NTLB entries. In the interim khugepaged kicks in, collapsing the contiguous ptes into a single pmd. pmdp_collapse_flush()->flush_pmd_tlb_range() is called to flush out NTLB entries for the ptes. This for ARC (by design) can only shootdown STLB entries (for pmd). The stray NTLB entries cause the overlap with the subsequent STLB entry for collapsed page. So make pmdp_collapse_flush() call pte flush interface not pmd flush. Note that originally all thp flush call sites in generic code called flush_tlb_range() leaving it to architecture to implement the flush for pte and/or pmd. Commit 12ebc1581ad11454 changed this by calling a new opt-in API flush_pmd_tlb_range() which made the semantics more explicit but failed to distinguish the pte vs pmd flush in generic code, which is what this patch fixes.
Note that ARC can be fixed w/o touching the generic pmdp_collapse_flush() by defining an ARC version, but that defeats the purpose of the generic version, plus semantically this is the right thing to do. Fixes STAR 9000961194: LMBench on AXS103 triggering duplicate TLB exceptions with super pages Fixes: 12ebc1581ad11454 ("mm,thp: introduce flush_pmd_tlb_range") Signed-off-by: Vineet Gupta Reviewed-by: Aneesh Kumar K.V Acked-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: [4.4] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/pgtable-generic.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 9d4767698a1c..9f131c6034c5 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -195,7 +195,9 @@ pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, VM_BUG_ON(address & ~HPAGE_PMD_MASK); VM_BUG_ON(pmd_trans_huge(*pmdp)); pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); - flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + + /* collapse entails shooting down ptes not pmd */ + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return pmd; } #endif -- cgit From 6b75d14912f2d89a3539c0b3a100519e1eec9a63 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Thu, 11 Feb 2016 16:13:11 -0800 Subject: mm,thp: fix spellos in describing __HAVE_ARCH_FLUSH_PMD_TLB_RANGE [akpm@linux-foundation.org: s/threshhold/threshold/] Signed-off-by: Vineet Gupta Cc: Kirill A.
Shutemov Cc: Aneesh Kumar K.V Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/pgtable-generic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 9f131c6034c5..06a005b979a7 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -90,9 +90,9 @@ pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, * ARCHes with special requirements for evicting THP backing TLB entries can * implement this. Otherwise also, it can help optimize normal TLB flush in * THP regime. stock flush_tlb_range() typically has optimization to nuke the - * entire TLB TLB if flush span is greater than a threshhold, which will + * entire TLB if flush span is greater than a threshold, which will * likely be true for a single huge page. Thus a single thp flush will - * invalidate the entire TLB which is not desitable. + * invalidate the entire TLB which is not desirable. * e.g. see arch/arc: flush_pmd_tlb_range */ #define flush_pmd_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) -- cgit From c777e2a8b65420b31dac28a453e35be984f5808b Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 9 Feb 2016 06:50:31 +0530 Subject: powerpc/mm: Fix Multi hit ERAT cause by recent THP update With ppc64 we use the deposited pgtable_t to store the hash pte slot information. We should not withdraw the deposited pgtable_t without marking the pmd none. This ensure that low level hash fault handling will skip this huge pte and we will handle them at upper levels. Recent change to pmd splitting changed the above in order to handle the race between pmd split and exit_mmap. The race is explained below. 
Consider following race: CPU0 CPU1 shrink_page_list() add_to_swap() split_huge_page_to_list() __split_huge_pmd_locked() pmdp_huge_clear_flush_notify() // pmd_none() == true exit_mmap() unmap_vmas() zap_pmd_range() // no action on pmd since pmd_none() == true pmd_populate() As result the THP will not be freed. The leak is detected by check_mm(): BUG: Bad rss-counter state mm:ffff880058d2e580 idx:1 val:512 The above required us to not mark pmd none during a pmd split. The fix for ppc is to clear the huge pte of _PAGE_USER, so that low level fault handling code skips this pte. At higher level we do take ptl lock. That should serialize us against the pmd split. Once the lock is acquired we do check the pmd again using pmd_same. That should always return false for us and hence we should retry the access. We do the pmd_same check in all cases after taking ptl with THP (do_huge_pmd_wp_page, do_huge_pmd_numa_page and huge_pmd_set_accessed) Also make sure we wait for irq disable section in other cpus to finish before flipping a huge pte entry with a regular pmd entry. Code paths like find_linux_pte_or_hugepte depend on irq disable to get a stable pte_t pointer. A parallel thp split needs to make sure we don't convert a pmd pte to a regular pmd entry without waiting for the irq disable section to finish. Fixes: eef1b3ba053a ("thp: implement split_huge_pmd()") Acked-by: Kirill A.
Shutemov Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- mm/huge_memory.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b1cf73bc3b12..de3f43cde129 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2856,6 +2856,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, young = pmd_young(*pmd); dirty = pmd_dirty(*pmd); + pmdp_huge_split_prepare(vma, haddr, pmd); pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); -- cgit From 69a8ec2d8155b9121ca2990d43f8363b8e2bf550 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 17 Feb 2016 13:11:12 -0800 Subject: thp, dax: do not try to withdraw pgtable from non-anon VMA DAX doesn't deposit pgtables when it maps huge pages: nothing to withdraw. It can lead to crash. Signed-off-by: Kirill A. Shutemov Cc: Dan Williams Cc: Matthew Wilcox Cc: Andrea Arcangeli Cc: Ross Zwisler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 08fc0ba2207e..722546dcfb7e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1700,7 +1700,8 @@ bool move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd); VM_BUG_ON(!pmd_none(*new_pmd)); - if (pmd_move_must_withdraw(new_ptl, old_ptl)) { + if (pmd_move_must_withdraw(new_ptl, old_ptl) && + vma_is_anonymous(vma)) { pgtable_t pgtable; pgtable = pgtable_trans_huge_withdraw(mm, old_pmd); pgtable_trans_huge_deposit(mm, new_pmd, pgtable); -- cgit From 48f7df329474b49d83d0dffec1b6186647f11976 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 17 Feb 2016 13:11:15 -0800 Subject: mm: fix regression in remap_file_pages() emulation Grazvydas Ignotas has reported a regression in remap_file_pages() emulation. 
Testcase: #define _GNU_SOURCE #include #include #include #include #define SIZE (4096 * 3) int main(int argc, char **argv) { unsigned long *p; long i; p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); if (p == MAP_FAILED) { perror("mmap"); return -1; } for (i = 0; i < SIZE / 4096; i++) p[i * 4096 / sizeof(*p)] = i; if (remap_file_pages(p, 4096, 0, 1, 0)) { perror("remap_file_pages"); return -1; } if (remap_file_pages(p, 4096 * 2, 0, 1, 0)) { perror("remap_file_pages"); return -1; } assert(p[0] == 1); munmap(p, SIZE); return 0; } The second remap_file_pages() fails with -EINVAL. The reason is that remap_file_pages() emulation assumes that the target vma covers whole area we want to over map. That assumption is broken by first remap_file_pages() call: it split the area into two vma. The solution is to check next adjacent vmas, if they map the same file with the same flags. Fixes: c8d78c1823f4 ("mm: replace remap_file_pages() syscall with emulation") Signed-off-by: Kirill A. Shutemov Reported-by: Grazvydas Ignotas Tested-by: Grazvydas Ignotas Cc: [4.0+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 2f2415a7a688..76d1ec29149b 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2664,12 +2664,29 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, if (!vma || !(vma->vm_flags & VM_SHARED)) goto out; - if (start < vma->vm_start || start + size > vma->vm_end) + if (start < vma->vm_start) goto out; - if (pgoff == linear_page_index(vma, start)) { - ret = 0; - goto out; + if (start + size > vma->vm_end) { + struct vm_area_struct *next; + + for (next = vma->vm_next; next; next = next->vm_next) { + /* hole between vmas ? 
*/ + if (next->vm_start != next->vm_prev->vm_end) + goto out; + + if (next->vm_file != vma->vm_file) + goto out; + + if (next->vm_flags != vma->vm_flags) + goto out; + + if (start + size <= next->vm_end) + break; + } + + if (!next) + goto out; } prot |= vma->vm_flags & VM_READ ? PROT_READ : 0; @@ -2679,9 +2696,16 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, flags &= MAP_NONBLOCK; flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE; if (vma->vm_flags & VM_LOCKED) { + struct vm_area_struct *tmp; flags |= MAP_LOCKED; + /* drop PG_Mlocked flag for over-mapped range */ - munlock_vma_pages_range(vma, start, start + size); + for (tmp = vma; tmp->vm_start >= start + size; + tmp = tmp->vm_next) { + munlock_vma_pages_range(tmp, + max(tmp->vm_start, start), + min(tmp->vm_end, start + size)); + } } file = get_file(vma->vm_file); -- cgit From f8b74815a452ff2904b5d7fcce1a5ae2a4d7ca5e Mon Sep 17 00:00:00 2001 From: Vaishali Thakkar Date: Wed, 17 Feb 2016 13:11:26 -0800 Subject: mm/hugetlb.c: fix incorrect proc nr_hugepages value Currently incorrect default hugepage pool size is reported by proc nr_hugepages when number of pages for the default huge page size is specified twice. When multiple huge page sizes are supported, /proc/sys/vm/nr_hugepages indicates the current number of pre-allocated huge pages of the default size. Basically /proc/sys/vm/nr_hugepages displays default_hstate-> max_huge_pages and after boot time pre-allocation, max_huge_pages should equal the number of pre-allocated pages (nr_hugepages). Test case: Note that this is specific to x86 architecture. Boot the kernel with command line option 'default_hugepagesz=1G hugepages=X hugepagesz=2M hugepages=Y hugepagesz=1G hugepages=Z'. After boot, 'cat /proc/sys/vm/nr_hugepages' and 'sysctl -a | grep hugepages' returns the value X. However, dmesg output shows that Z huge pages were pre-allocated. 
So, the root cause of the problem here is that the global variable default_hstate_max_huge_pages is set if a default huge page size is specified (directly or indirectly) on the command line. After the command line processing in hugetlb_init, if default_hstate_max_huge_pages is set, the value is assigned to default_hstate.max_huge_pages. However, default_hstate.max_huge_pages may have already been set based on the number of pre-allocated huge pages of default_hstate size. The solution to this problem is that if hstate->max_huge_pages is already set, then it should not be overwritten by the global default_hstate_max_huge_pages value. Basically if the value of the variable hugepages is set multiple times on a command line for a specific supported hugepagesize then the proc layer should consider the last specified value. Signed-off-by: Vaishali Thakkar Reviewed-by: Naoya Horiguchi Cc: Mike Kravetz Cc: Hillf Danton Cc: Kirill A. Shutemov Cc: Dave Hansen Cc: Paul Gortmaker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 06ae13e869d0..01f2b48c8618 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2630,8 +2630,10 @@ static int __init hugetlb_init(void) hugetlb_add_hstate(HUGETLB_PAGE_ORDER); } default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size)); - if (default_hstate_max_huge_pages) - default_hstate.max_huge_pages = default_hstate_max_huge_pages; + if (default_hstate_max_huge_pages) { + if (!default_hstate.max_huge_pages) + default_hstate.max_huge_pages = default_hstate_max_huge_pages; + } hugetlb_init_hstates(); gather_bootmem_prealloc(); -- cgit From 52b4b950b50740bff507a62907e86710743c22e7 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Wed, 17 Feb 2016 13:11:37 -0800 Subject: mm: slab: free kmem_cache_node after destroy sysfs file When slub_debug alloc_calls_show is enabled we will try to track location and user of slab object
on each online node, kmem_cache_node structure and cpu_cache/cpu_slab shouldn't be freed until the last reference to the sysfs file is dropped. This fixes the following panic: BUG: unable to handle kernel NULL pointer dereference at 0000000000000020 IP: list_locations+0x169/0x4e0 PGD 257304067 PUD 438456067 PMD 0 Oops: 0000 [#1] SMP CPU: 3 PID: 973074 Comm: cat ve: 0 Not tainted 3.10.0-229.7.2.ovz.9.30-00007-japdoll-dirty #2 9.30 Hardware name: DEPO Computers To Be Filled By O.E.M./H67DE3, BIOS L1.60c 07/14/2011 task: ffff88042a5dc5b0 ti: ffff88037f8d8000 task.ti: ffff88037f8d8000 RIP: list_locations+0x169/0x4e0 Call Trace: alloc_calls_show+0x1d/0x30 slab_attr_show+0x1b/0x30 sysfs_read_file+0x9a/0x1a0 vfs_read+0x9c/0x170 SyS_read+0x58/0xb0 system_call_fastpath+0x16/0x1b Code: 5e 07 12 00 b9 00 04 00 00 3d 00 04 00 00 0f 4f c1 3d 00 04 00 00 89 45 b0 0f 84 c3 00 00 00 48 63 45 b0 49 8b 9c c4 f8 00 00 00 <48> 8b 43 20 48 85 c0 74 b6 48 89 df e8 46 37 44 00 48 8b 53 10 CR2: 0000000000000020 Separated __kmem_cache_release from __kmem_cache_shutdown; __kmem_cache_release is now called from slab_kmem_cache_release (after the last reference to the sysfs file object has been dropped). Reintroduced locking in free_partial as the sysfs file might access the cache's partial list after shutdown - partial revert of the commit 69cb8e6b7c29 ("slub: free slabs without holding locks").
Zap __remove_partial and use remove_partial (w/o underscores) as free_partial now takes list_lock, which is a partial revert of commit 1e4dd9461fab ("slub: do not assert not having lock in removing freed partial") Signed-off-by: Dmitry Safonov Suggested-by: Vladimir Davydov Acked-by: Vladimir Davydov Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 12 ++++++------ mm/slab.h | 1 + mm/slab_common.c | 1 + mm/slob.c | 4 ++++ mm/slub.c | 38 +++++++++++++++++--------------------- 5 files changed, 29 insertions(+), 27 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 6ecc697a8bc4..621fbcb35a36 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2275,7 +2275,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) err = setup_cpu_cache(cachep, gfp); if (err) { - __kmem_cache_shutdown(cachep); + __kmem_cache_release(cachep); return err; } @@ -2413,13 +2413,14 @@ int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate) } int __kmem_cache_shutdown(struct kmem_cache *cachep) +{ + return __kmem_cache_shrink(cachep, false); +} + +void __kmem_cache_release(struct kmem_cache *cachep) { int i; struct kmem_cache_node *n; - int rc = __kmem_cache_shrink(cachep, false); - - if (rc) - return rc; free_percpu(cachep->cpu_cache); @@ -2430,7 +2431,6 @@ int __kmem_cache_shutdown(struct kmem_cache *cachep) kfree(n); cachep->node[i] = NULL; } - return 0; } /* diff --git a/mm/slab.h b/mm/slab.h index 834ad240c0bb..2eedacea439d 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -140,6 +140,7 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size, #define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) int __kmem_cache_shutdown(struct kmem_cache *); +void __kmem_cache_release(struct kmem_cache *); int __kmem_cache_shrink(struct kmem_cache *, bool); void slab_kmem_cache_release(struct kmem_cache *); diff --git
a/mm/slab_common.c b/mm/slab_common.c index b50aef01ccf7..065b7bdabdc3 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -693,6 +693,7 @@ static inline int shutdown_memcg_caches(struct kmem_cache *s, void slab_kmem_cache_release(struct kmem_cache *s) { + __kmem_cache_release(s); destroy_memcg_params(s); kfree_const(s->name); kmem_cache_free(kmem_cache, s); diff --git a/mm/slob.c b/mm/slob.c index 17e8f8cc7c53..5ec158054ffe 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -630,6 +630,10 @@ int __kmem_cache_shutdown(struct kmem_cache *c) return 0; } +void __kmem_cache_release(struct kmem_cache *c) +{ +} + int __kmem_cache_shrink(struct kmem_cache *d, bool deactivate) { return 0; diff --git a/mm/slub.c b/mm/slub.c index 2e1355ac056b..d8fbd4a6ed59 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1592,18 +1592,12 @@ static inline void add_partial(struct kmem_cache_node *n, __add_partial(n, page, tail); } -static inline void -__remove_partial(struct kmem_cache_node *n, struct page *page) -{ - list_del(&page->lru); - n->nr_partial--; -} - static inline void remove_partial(struct kmem_cache_node *n, struct page *page) { lockdep_assert_held(&n->list_lock); - __remove_partial(n, page); + list_del(&page->lru); + n->nr_partial--; } /* @@ -3184,6 +3178,12 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) } } +void __kmem_cache_release(struct kmem_cache *s) +{ + free_percpu(s->cpu_slab); + free_kmem_cache_nodes(s); +} + static int init_kmem_cache_nodes(struct kmem_cache *s) { int node; @@ -3443,28 +3443,31 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, /* * Attempt to free all partial slabs on a node. - * This is called from kmem_cache_close(). We must be the last thread - * using the cache and therefore we do not need to lock anymore. + * This is called from __kmem_cache_shutdown(). We must take list_lock + * because sysfs file might still access partial list after the shutdowning. 
*/ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) { struct page *page, *h; + BUG_ON(irqs_disabled()); + spin_lock_irq(&n->list_lock); list_for_each_entry_safe(page, h, &n->partial, lru) { if (!page->inuse) { - __remove_partial(n, page); + remove_partial(n, page); discard_slab(s, page); } else { list_slab_objects(s, page, - "Objects remaining in %s on kmem_cache_close()"); + "Objects remaining in %s on __kmem_cache_shutdown()"); } } + spin_unlock_irq(&n->list_lock); } /* * Release all resources used by a slab cache. */ -static inline int kmem_cache_close(struct kmem_cache *s) +int __kmem_cache_shutdown(struct kmem_cache *s) { int node; struct kmem_cache_node *n; @@ -3476,16 +3479,9 @@ static inline int kmem_cache_close(struct kmem_cache *s) if (n->nr_partial || slabs_node(s, node)) return 1; } - free_percpu(s->cpu_slab); - free_kmem_cache_nodes(s); return 0; } -int __kmem_cache_shutdown(struct kmem_cache *s) -{ - return kmem_cache_close(s); -} - /******************************************************************** * Kmalloc subsystem *******************************************************************/ @@ -3980,7 +3976,7 @@ int __kmem_cache_create(struct kmem_cache *s, unsigned long flags) memcg_propagate_slab_attrs(s); err = sysfs_slab_add(s); if (err) - kmem_cache_close(s); + __kmem_cache_release(s); return err; } -- cgit