From 27ba0644ea9dfe6e7693abc85837b60e40583b96 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:09:59 -0800 Subject: rmap: drop support of non-linear mappings We don't create non-linear mappings anymore. Let's drop code which handles them in rmap. Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 4dc2ddade9f1..b379d9abddc7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -438,12 +438,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) atomic_inc(&mapping->i_mmap_writable); flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ - if (unlikely(tmp->vm_flags & VM_NONLINEAR)) - vma_nonlinear_insert(tmp, - &mapping->i_mmap_nonlinear); - else - vma_interval_tree_insert_after(tmp, mpnt, - &mapping->i_mmap); + vma_interval_tree_insert_after(tmp, mpnt, + &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); i_mmap_unlock_write(mapping); } -- cgit From dc6c9a35b66b520cf67e05d8ca60ebecad3b0479 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 11 Feb 2015 15:26:50 -0800 Subject: mm: account pmd page tables to the process Dave noticed that unprivileged process can allocate significant amount of memory -- >500 MiB on x86_64 -- and stay unnoticed by oom-killer and memory cgroup. The trick is to allocate a lot of PMD page tables. Linux kernel doesn't account PMD tables to the process, only PTE. The use-cases below use few tricks to allocate a lot of PMD page tables while keeping VmRSS and VmPTE low. oom_score for the process will be 0. #include #include #include #include #include #include #define PUD_SIZE (1UL << 30) #define PMD_SIZE (1UL << 21) #define NR_PUD 130000 int main(void) { char *addr = NULL; unsigned long i; prctl(PR_SET_THP_DISABLE); for (i = 0; i < NR_PUD ; i++) { addr = mmap(addr + PUD_SIZE, PUD_SIZE, PROT_WRITE|PROT_READ, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); if (addr == MAP_FAILED) { perror("mmap"); break; } *addr = 'x'; munmap(addr, PMD_SIZE); mmap(addr, PMD_SIZE, PROT_WRITE|PROT_READ, MAP_ANONYMOUS|MAP_PRIVATE|MAP_FIXED, -1, 0); if (addr == MAP_FAILED) perror("re-mmap"), exit(1); } printf("PID %d consumed %lu KiB in PMD page tables\n", getpid(), i * 4096 >> 10); return pause(); } The patch addresses the issue by account PMD tables to the process the same way we account PTE. The main place where PMD tables is accounted is __pmd_alloc() and free_pmd_range(). But there're few corner cases: - HugeTLB can share PMD page tables. The patch handles by accounting the table to all processes who share it. - x86 PAE pre-allocates few PMD tables on fork. - Architectures with FIRST_USER_ADDRESS > 0. We need to adjust sanity check on exit(2). Accounting only happens on configuration where PMD page table's level is present (PMD is not folded). As with nr_ptes we use per-mm counter. The counter value is used to calculate baseline for badness score by oom-killer. Signed-off-by: Kirill A. Shutemov Reported-by: Dave Hansen Cc: Hugh Dickins Reviewed-by: Cyrill Gorcunov Cc: Pavel Emelyanov Cc: David Rientjes Tested-by: Sedat Dilek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index b379d9abddc7..c99098c52641 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -555,6 +555,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) INIT_LIST_HEAD(&mm->mmlist); mm->core_state = NULL; atomic_long_set(&mm->nr_ptes, 0); +#ifndef __PAGETABLE_PMD_FOLDED + atomic_long_set(&mm->nr_pmds, 0); +#endif mm->map_count = 0; mm->locked_vm = 0; mm->pinned_vm = 0; -- cgit From b30fe6c7ced70f62862c3d09357e7e8084e98d9f Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 11 Feb 2015 15:26:53 -0800 Subject: mm: fix false-positive warning on exit due mm_nr_pmds(mm) The problem is that we check nr_ptes/nr_pmds in exit_mmap() which happens *before* pgd_free(). And if an arch does pte/pmd allocation in pgd_alloc() and frees them in pgd_free() we see offset in counters by the time of the checks. We tried to workaround this by offsetting expected counter value according to FIRST_USER_ADDRESS for both nr_pte and nr_pmd in exit_mmap(). But it doesn't work in some cases: 1. ARM with LPAE enabled also has non-zero USER_PGTABLES_CEILING, but upper addresses occupied with huge pmd entries, so the trick with offsetting expected counter value will get really ugly: we will have to apply it nr_pmds, but not nr_ptes. 2. Metag has non-zero FIRST_USER_ADDRESS, but doesn't do allocation pte/pmd page tables allocation in pgd_alloc(), just setup a pgd entry which is allocated at boot and shared accross all processes. The proposal is to move the check to check_mm() which happens *after* pgd_free() and do proper accounting during pgd_alloc() and pgd_free() which would bring counters to zero if nothing leaked. Signed-off-by: Kirill A. Shutemov Reported-by: Tyler Baker Tested-by: Tyler Baker Tested-by: Nishanth Menon Cc: Russell King Cc: James Hogan Cc: Guan Xuetao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index c99098c52641..66e19c251581 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -606,6 +606,14 @@ static void check_mm(struct mm_struct *mm) printk(KERN_ALERT "BUG: Bad rss-counter state " "mm:%p idx:%d val:%ld\n", mm, i, x); } + + if (atomic_long_read(&mm->nr_ptes)) + pr_alert("BUG: non-zero nr_ptes on freeing mm: %ld\n", + atomic_long_read(&mm->nr_ptes)); + if (mm_nr_pmds(mm)) + pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n", + mm_nr_pmds(mm)); + #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS VM_BUG_ON_MM(mm->pmd_huge_pte, mm); #endif -- cgit