From 0b9b6fff7b4caf5838550151d15b389aaa217707 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 20 Jan 2016 14:58:09 -0800 Subject: thp: fix interrupt unsafe locking in split_huge_page() split_queue_lock can be taken from interrupt context in some cases, but I forgot to convert locking in split_huge_page() to interrupt-safe primitives. Let's fix this. lockdep output: ====================================================== [ INFO: SOFTIRQ-safe -> SOFTIRQ-unsafe lock order detected ] 4.4.0+ #259 Tainted: G W ------------------------------------------------------ syz-executor/18183 [HC0[0]:SC0[2]:HE0:SE0] is trying to acquire: (split_queue_lock){+.+...}, at: free_transhuge_page+0x24/0x90 mm/huge_memory.c:3436 and this task is already holding: (slock-AF_INET){+.-...}, at: spin_lock_bh include/linux/spinlock.h:307 (slock-AF_INET){+.-...}, at: lock_sock_fast+0x45/0x120 net/core/sock.c:2462 which would create a new lock dependency: (slock-AF_INET){+.-...} -> (split_queue_lock){+.+...} but this new dependency connects a SOFTIRQ-irq-safe lock: (slock-AF_INET){+.-...} ... which became SOFTIRQ-irq-safe at: mark_irqflags kernel/locking/lockdep.c:2799 __lock_acquire+0xfd8/0x4700 kernel/locking/lockdep.c:3162 lock_acquire+0x1dc/0x430 kernel/locking/lockdep.c:3585 __raw_spin_lock include/linux/spinlock_api_smp.h:144 _raw_spin_lock+0x33/0x50 kernel/locking/spinlock.c:151 spin_lock include/linux/spinlock.h:302 udp_queue_rcv_skb+0x781/0x1550 net/ipv4/udp.c:1680 flush_stack+0x50/0x330 net/ipv6/udp.c:799 __udp4_lib_mcast_deliver+0x694/0x7f0 net/ipv4/udp.c:1798 __udp4_lib_rcv+0x17dc/0x23e0 net/ipv4/udp.c:1888 udp_rcv+0x21/0x30 net/ipv4/udp.c:2108 ip_local_deliver_finish+0x2b3/0xa50 net/ipv4/ip_input.c:216 NF_HOOK_THRESH include/linux/netfilter.h:226 NF_HOOK include/linux/netfilter.h:249 ip_local_deliver+0x1c4/0x2f0 net/ipv4/ip_input.c:257 dst_input include/net/dst.h:498 ip_rcv_finish+0x5ec/0x1730 net/ipv4/ip_input.c:365 NF_HOOK_THRESH include/linux/netfilter.h:226 NF_HOOK include/linux/netfilter.h:249 ip_rcv+0x963/0x1080 net/ipv4/ip_input.c:455 __netif_receive_skb_core+0x1620/0x2f80 net/core/dev.c:4154 __netif_receive_skb+0x2a/0x160 net/core/dev.c:4189 netif_receive_skb_internal+0x1b5/0x390 net/core/dev.c:4217 napi_skb_finish net/core/dev.c:4542 napi_gro_receive+0x2bd/0x3c0 net/core/dev.c:4572 e1000_clean_rx_irq+0x4e2/0x1100 drivers/net/ethernet/intel/e1000e/netdev.c:1038 e1000_clean+0xa08/0x24a0 drivers/net/ethernet/intel/e1000/e1000_main.c:3819 napi_poll net/core/dev.c:5074 net_rx_action+0x7eb/0xdf0 net/core/dev.c:5139 __do_softirq+0x26a/0x920 kernel/softirq.c:273 invoke_softirq kernel/softirq.c:350 irq_exit+0x18f/0x1d0 kernel/softirq.c:391 exiting_irq ./arch/x86/include/asm/apic.h:659 do_IRQ+0x86/0x1a0 arch/x86/kernel/irq.c:252 ret_from_intr+0x0/0x20 arch/x86/entry/entry_64.S:520 arch_safe_halt ./arch/x86/include/asm/paravirt.h:117 default_idle+0x52/0x2e0 arch/x86/kernel/process.c:304 arch_cpu_idle+0xa/0x10 arch/x86/kernel/process.c:295 default_idle_call+0x48/0xa0 kernel/sched/idle.c:92 cpuidle_idle_call kernel/sched/idle.c:156 cpu_idle_loop kernel/sched/idle.c:252 cpu_startup_entry+0x554/0x710 kernel/sched/idle.c:300 rest_init+0x192/0x1a0 init/main.c:412 start_kernel+0x678/0x69e init/main.c:683 x86_64_start_reservations+0x2a/0x2c arch/x86/kernel/head64.c:195 x86_64_start_kernel+0x158/0x167 arch/x86/kernel/head64.c:184 to a SOFTIRQ-irq-unsafe lock: (split_queue_lock){+.+...} which became SOFTIRQ-irq-unsafe at: mark_irqflags kernel/locking/lockdep.c:2817 __lock_acquire+0x146e/0x4700 
kernel/locking/lockdep.c:3162 lock_acquire+0x1dc/0x430 kernel/locking/lockdep.c:3585 __raw_spin_lock include/linux/spinlock_api_smp.h:144 _raw_spin_lock+0x33/0x50 kernel/locking/spinlock.c:151 spin_lock include/linux/spinlock.h:302 split_huge_page_to_list+0xcc0/0x1c50 mm/huge_memory.c:3399 split_huge_page include/linux/huge_mm.h:99 queue_pages_pte_range+0xa38/0xef0 mm/mempolicy.c:507 walk_pmd_range mm/pagewalk.c:50 walk_pud_range mm/pagewalk.c:90 walk_pgd_range mm/pagewalk.c:116 __walk_page_range+0x653/0xcd0 mm/pagewalk.c:204 walk_page_range+0xfe/0x2b0 mm/pagewalk.c:281 queue_pages_range+0xfb/0x130 mm/mempolicy.c:687 migrate_to_node mm/mempolicy.c:1004 do_migrate_pages+0x370/0x4e0 mm/mempolicy.c:1109 SYSC_migrate_pages mm/mempolicy.c:1453 SyS_migrate_pages+0x640/0x730 mm/mempolicy.c:1374 entry_SYSCALL_64_fastpath+0x16/0x7a arch/x86/entry/entry_64.S:185 other info that might help us debug this: Possible interrupt unsafe locking scenario: CPU0 CPU1 ---- ---- lock(split_queue_lock); local_irq_disable(); lock(slock-AF_INET); lock(split_queue_lock); lock(slock-AF_INET); Signed-off-by: Kirill A. Shutemov Reported-by: Dmitry Vyukov Acked-by: David Rientjes Reviewed-by: Aneesh Kumar K.V Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b1cf73bc3b12..8ad580273521 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3357,6 +3357,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) struct anon_vma *anon_vma; int count, mapcount, ret; bool mlocked; + unsigned long flags; VM_BUG_ON_PAGE(is_huge_zero_page(page), page); VM_BUG_ON_PAGE(!PageAnon(page), page); @@ -3396,7 +3397,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) lru_add_drain(); /* Prevent deferred_split_scan() touching ->_count */ - spin_lock(&split_queue_lock); + spin_lock_irqsave(&split_queue_lock, flags); count = page_count(head); mapcount = total_mapcount(head); if (!mapcount && count == 1) { @@ -3404,11 +3405,11 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) split_queue_len--; list_del(page_deferred_list(head)); } - spin_unlock(&split_queue_lock); + spin_unlock_irqrestore(&split_queue_lock, flags); __split_huge_page(page, list); ret = 0; } else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) { - spin_unlock(&split_queue_lock); + spin_unlock_irqrestore(&split_queue_lock, flags); pr_alert("total_mapcount: %u, page_count(): %u\n", mapcount, count); if (PageTail(page)) @@ -3416,7 +3417,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) dump_page(page, "total_mapcount(head) > 0"); BUG(); } else { - spin_unlock(&split_queue_lock); + spin_unlock_irqrestore(&split_queue_lock, flags); unfreeze_page(anon_vma, head); ret = -EBUSY; } -- cgit From c102f07ca0b04f2cb49cfc161c83f6239d17f491 Mon Sep 17 00:00:00 2001 From: Junil Lee Date: Wed, 20 Jan 2016 14:58:18 -0800 Subject: zsmalloc: fix migrate_zspage-zs_free race condition record_obj() in migrate_zspage() does not preserve handle's HANDLE_PIN_BIT, set by find_aloced_obj()->trypin_tag(), and implicitly (accidentally) un-pins the handle, while migrate_zspage() still performs an explicit unpin_tag() on the that handle. This additional explicit unpin_tag() introduces a race condition with zs_free(), which can pin that handle by this time, so the handle becomes un-pinned. 
Schematically, it goes like this: CPU0 CPU1 migrate_zspage find_alloced_obj trypin_tag set HANDLE_PIN_BIT zs_free() pin_tag() obj_malloc() -- new object, no tag record_obj() -- remove HANDLE_PIN_BIT set HANDLE_PIN_BIT unpin_tag() -- remove zs_free's HANDLE_PIN_BIT The race condition may result in a NULL pointer dereference: Unable to handle kernel NULL pointer dereference at virtual address 00000000 CPU: 0 PID: 19001 Comm: CookieMonsterCl Tainted: PC is at get_zspage_mapping+0x0/0x24 LR is at obj_free.isra.22+0x64/0x128 Call trace: get_zspage_mapping+0x0/0x24 zs_free+0x88/0x114 zram_free_page+0x64/0xcc zram_slot_free_notify+0x90/0x108 swap_entry_free+0x278/0x294 free_swap_and_cache+0x38/0x11c unmap_single_vma+0x480/0x5c8 unmap_vmas+0x44/0x60 exit_mmap+0x50/0x110 mmput+0x58/0xe0 do_exit+0x320/0x8dc do_group_exit+0x44/0xa8 get_signal+0x538/0x580 do_signal+0x98/0x4b8 do_notify_resume+0x14/0x5c This patch keeps the lock bit in migration path and update value atomically. Signed-off-by: Junil Lee Signed-off-by: Minchan Kim Acked-by: Vlastimil Babka Cc: Sergey Senozhatsky Cc: [4.1+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index e7414cec220b..2d7c4c11fc63 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -309,7 +309,12 @@ static void free_handle(struct zs_pool *pool, unsigned long handle) static void record_obj(unsigned long handle, unsigned long obj) { - *(unsigned long *)handle = obj; + /* + * lsb of @obj represents handle lock while other bits + * represent object value the handle is pointing so + * updating shouldn't do store tearing. + */ + WRITE_ONCE(*(unsigned long *)handle, obj); } /* zpool driver */ @@ -1635,6 +1640,13 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class, free_obj = obj_malloc(d_page, class, handle); zs_object_copy(free_obj, used_obj, class); index++; + /* + * record_obj updates handle's value to free_obj and it will + * invalidate lock bit(ie, HANDLE_PIN_BIT) of handle, which + * breaks synchronization using pin_tag(e,g, zs_free) so + * let's keep the lock bit. + */ + free_obj |= BIT(HANDLE_PIN_BIT); record_obj(handle, free_obj); unpin_tag(handle); obj_free(pool, class, used_obj); -- cgit From caaee6234d05a58c5b4d05e7bf766131b810a657 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 20 Jan 2016 15:00:04 -0800 Subject: ptrace: use fsuid, fsgid, effective creds for fs access checks By checking the effective credentials instead of the real UID / permitted capabilities, ensure that the calling process actually intended to use its credentials. To ensure that all ptrace checks use the correct caller credentials (e.g. in case out-of-tree code or newly added code omits the PTRACE_MODE_*CREDS flag), use two new flags and require one of them to be set. The problem was that when a privileged task had temporarily dropped its privileges, e.g. by calling setreuid(0, user_uid), with the intent to perform following syscalls with the credentials of a user, it still passed ptrace access checks that the user would not be able to pass. While an attacker should not be able to convince the privileged task to perform a ptrace() syscall, this is a problem because the ptrace access check is reused for things in procfs. 
In particular, the following somewhat interesting procfs entries only rely on ptrace access checks: /proc/$pid/stat - uses the check for determining whether pointers should be visible, useful for bypassing ASLR /proc/$pid/maps - also useful for bypassing ASLR /proc/$pid/cwd - useful for gaining access to restricted directories that contain files with lax permissions, e.g. in this scenario: lrwxrwxrwx root root /proc/13020/cwd -> /root/foobar drwx------ root root /root drwxr-xr-x root root /root/foobar -rw-r--r-- root root /root/foobar/secret Therefore, on a system where a root-owned mode 6755 binary changes its effective credentials as described and then dumps a user-specified file, this could be used by an attacker to reveal the memory layout of root's processes or reveal the contents of files he is not allowed to access (through /proc/$pid/cwd). [akpm@linux-foundation.org: fix warning] Signed-off-by: Jann Horn Acked-by: Kees Cook Cc: Casey Schaufler Cc: Oleg Nesterov Cc: Ingo Molnar Cc: James Morris Cc: "Serge E. Hallyn" Cc: Andy Shevchenko Cc: Andy Lutomirski Cc: Al Viro Cc: "Eric W. Biederman" Cc: Willy Tarreau Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/process_vm_access.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index e88d071648c2..5d453e58ddbf 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -194,7 +194,7 @@ static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter, goto free_proc_pages; } - mm = mm_access(task, PTRACE_MODE_ATTACH); + mm = mm_access(task, PTRACE_MODE_ATTACH_REALCREDS); if (!mm || IS_ERR(mm)) { rc = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; /* -- cgit From c6d308534aef6c99904bf5862066360ae067abc4 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Wed, 20 Jan 2016 15:00:55 -0800 Subject: UBSAN: run-time undefined behavior sanity checker UBSAN uses compile-time instrumentation to catch undefined behavior (UB). Compiler inserts code that perform certain kinds of checks before operations that could cause UB. If check fails (i.e. UB detected) __ubsan_handle_* function called to print error message. So the most of the work is done by compiler. This patch just implements ubsan handlers printing errors. GCC has this capability since 4.9.x [1] (see -fsanitize=undefined option and its suboptions). However GCC 5.x has more checkers implemented [2]. Article [3] has a bit more details about UBSAN in the GCC. 
[1] - https://gcc.gnu.org/onlinedocs/gcc-4.9.0/gcc/Debugging-Options.html [2] - https://gcc.gnu.org/onlinedocs/gcc/Debugging-Options.html [3] - http://developerblog.redhat.com/2014/10/16/gcc-undefined-behavior-sanitizer-ubsan/ Issues which UBSAN has found thus far are: Found bugs: * out-of-bounds access - 97840cb67ff5 ("netfilter: nfnetlink: fix insufficient validation in nfnetlink_bind") undefined shifts: * d48458d4a768 ("jbd2: use a better hash function for the revoke table") * 10632008b9e1 ("clockevents: Prevent shift out of bounds") * 'x << -1' shift in ext4 - http://lkml.kernel.org/r/<5444EF21.8020501@samsung.com> * undefined rol32(0) - http://lkml.kernel.org/r/<1449198241-20654-1-git-send-email-sasha.levin@oracle.com> * undefined dirty_ratelimit calculation - http://lkml.kernel.org/r/<566594E2.3050306@odin.com> * undefined roundown_pow_of_two(0) - http://lkml.kernel.org/r/<1449156616-11474-1-git-send-email-sasha.levin@oracle.com> * [WONTFIX] undefined shift in __bpf_prog_run - http://lkml.kernel.org/r/ WONTFIX here because it should be fixed in bpf program, not in kernel. signed overflows: * 32a8df4e0b33f ("sched: Fix odd values in effective_load() calculations") * mul overflow in ntp - http://lkml.kernel.org/r/<1449175608-1146-1-git-send-email-sasha.levin@oracle.com> * incorrect conversion into rtc_time in rtc_time64_to_tm() - http://lkml.kernel.org/r/<1449187944-11730-1-git-send-email-sasha.levin@oracle.com> * unvalidated timespec in io_getevents() - http://lkml.kernel.org/r/ * [NOTABUG] signed overflow in ktime_add_safe() - http://lkml.kernel.org/r/ [akpm@linux-foundation.org: fix unused local warning] [akpm@linux-foundation.org: fix __int128 build woes] Signed-off-by: Andrey Ryabinin Cc: Peter Zijlstra Cc: Sasha Levin Cc: Randy Dunlap Cc: Rasmus Villemoes Cc: Jonathan Corbet Cc: Michal Marek Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Yury Gribov Cc: Dmitry Vyukov Cc: Konstantin Khlebnikov Cc: Kostya Serebryany Cc: Johannes Berg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/Makefile | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile index 64710148941e..a61460d9f5b0 100644 --- a/mm/kasan/Makefile +++ b/mm/kasan/Makefile @@ -1,4 +1,5 @@ KASAN_SANITIZE := n +UBSAN_SANITIZE_kasan.o := n CFLAGS_REMOVE_kasan.o = -pg # Function splitter causes unnecessary splits in __asan_load1/__asan_store1 -- cgit From a3b609ef9f8b1dbfe97034ccad6cd3fe71fbe7ab Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 20 Jan 2016 15:01:05 -0800 Subject: proc read mm's {arg,env}_{start,end} with mmap semaphore taken. Only functions doing more than one read are modified. Consumeres happened to deal with possibly changing data, but it does not seem like a good thing to rely on. Signed-off-by: Mateusz Guzik Acked-by: Cyrill Gorcunov Cc: Alexey Dobriyan Cc: Jarod Wilson Cc: Jan Stancek Cc: Al Viro Cc: Anshuman Khandual Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/util.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/util.c b/mm/util.c index 6d1f9200f74e..c108a6542d05 100644 --- a/mm/util.c +++ b/mm/util.c @@ -476,17 +476,25 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen) int res = 0; unsigned int len; struct mm_struct *mm = get_task_mm(task); + unsigned long arg_start, arg_end, env_start, env_end; if (!mm) goto out; if (!mm->arg_end) goto out_mm; /* Shh! 
No looking before we're done */ - len = mm->arg_end - mm->arg_start; + down_read(&mm->mmap_sem); + arg_start = mm->arg_start; + arg_end = mm->arg_end; + env_start = mm->env_start; + env_end = mm->env_end; + up_read(&mm->mmap_sem); + + len = arg_end - arg_start; if (len > buflen) len = buflen; - res = access_process_vm(task, mm->arg_start, buffer, len, 0); + res = access_process_vm(task, arg_start, buffer, len, 0); /* * If the nul at the end of args has been overwritten, then @@ -497,10 +505,10 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen) if (len < res) { res = len; } else { - len = mm->env_end - mm->env_start; + len = env_end - env_start; if (len > buflen - res) len = buflen - res; - res += access_process_vm(task, mm->env_start, + res += access_process_vm(task, env_start, buffer+res, len, 0); res = strnlen(buffer, res); } -- cgit From 6d378dac7c4905db38f8127c4e618f0f627a4ced Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Jan 2016 15:02:18 -0800 Subject: mm: memcontrol: drop unused @css argument in memcg_init_kmem This series adds accounting of the historical "kmem" memory consumers to the cgroup2 memory controller. These consumers include the dentry cache, the inode cache, kernel stack pages, and a few others that are pointed out in patch 7/8. The footprint of these consumers is directly tied to userspace activity in common workloads, and so they have to be part of the minimally viable configuration in order to present a complete feature to our users. The cgroup2 interface of the memory controller is far from complete, but this series, along with the socket memory accounting series, provides the final semantic changes for the existing memory knobs in the cgroup2 interface, which is scheduled for initial release in the next merge window. This patch (of 8): Remove unused css argument frmo memcg_init_kmem() Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Tejun Heo Acked-by: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0eda67376df4..f21f29cd14f9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3583,7 +3583,7 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, } #ifdef CONFIG_MEMCG_KMEM -static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) +static int memcg_init_kmem(struct mem_cgroup *memcg) { int ret; @@ -3591,7 +3591,7 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) if (ret) return ret; - return tcp_init_cgroup(memcg, ss); + return tcp_init_cgroup(memcg); } static void memcg_deactivate_kmem(struct mem_cgroup *memcg) @@ -4274,7 +4274,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) } mutex_unlock(&memcg_create_mutex); - ret = memcg_init_kmem(memcg, &memory_cgrp_subsys); + ret = memcg_init_kmem(memcg); if (ret) return ret; -- cgit From b15aac110a45c52d7f47ab8ee2d68f98044cfe6c Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Jan 2016 15:02:21 -0800 Subject: mm: memcontrol: remove double kmem page_counter init The kmem page_counter's limit is initialized to PAGE_COUNTER_MAX inside mem_cgroup_css_online(). There is no need to repeat this from memcg_propagate_kmem(). 
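For context, a sketch of the counter setup this change relies on. This is an assumed paraphrase of page_counter_init(), not a verbatim copy from the tree: a freshly initialized counter already starts with an unrestricted limit, which is why re-applying PAGE_COUNTER_MAX from memcg_propagate_kmem() was redundant.

    /* Assumed shape of page_counter_init(), for illustration only. */
    static inline void page_counter_init(struct page_counter *counter,
                                         struct page_counter *parent)
    {
            atomic_long_set(&counter->count, 0);
            counter->limit = PAGE_COUNTER_MAX; /* unlimited until userspace lowers it */
            counter->parent = parent;
    }

mem_cgroup_css_online() initializes memcg->kmem through this helper, so setting PAGE_COUNTER_MAX again during propagation was a no-op; after this patch the user-visible limit is applied only in memcg_update_kmem_limit().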
Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Tejun Heo Acked-by: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f21f29cd14f9..71dced17b16d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2861,8 +2861,7 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, } #ifdef CONFIG_MEMCG_KMEM -static int memcg_activate_kmem(struct mem_cgroup *memcg, - unsigned long nr_pages) +static int memcg_activate_kmem(struct mem_cgroup *memcg) { int err = 0; int memcg_id; @@ -2897,13 +2896,6 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg, goto out; } - /* - * We couldn't have accounted to this cgroup, because it hasn't got - * activated yet, so this should succeed. - */ - err = page_counter_limit(&memcg->kmem, nr_pages); - VM_BUG_ON(err); - static_branch_inc(&memcg_kmem_enabled_key); /* * A memory cgroup is considered kmem-active as soon as it gets @@ -2924,10 +2916,14 @@ static int memcg_update_kmem_limit(struct mem_cgroup *memcg, int ret; mutex_lock(&memcg_limit_mutex); - if (!memcg_kmem_is_active(memcg)) - ret = memcg_activate_kmem(memcg, limit); - else - ret = page_counter_limit(&memcg->kmem, limit); + /* Top-level cgroup doesn't propagate from root */ + if (!memcg_kmem_is_active(memcg)) { + ret = memcg_activate_kmem(memcg); + if (ret) + goto out; + } + ret = page_counter_limit(&memcg->kmem, limit); +out: mutex_unlock(&memcg_limit_mutex); return ret; } @@ -2946,7 +2942,7 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg) * after this point, because it has at least one child already. */ if (memcg_kmem_is_active(parent)) - ret = memcg_activate_kmem(memcg, PAGE_COUNTER_MAX); + ret = memcg_activate_kmem(memcg); mutex_unlock(&memcg_limit_mutex); return ret; } -- cgit From 567e9ab2e614e55feca20e8bcb54b629e9cc1a3b Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Jan 2016 15:02:24 -0800 Subject: mm: memcontrol: give the kmem states more descriptive names On any given memcg, the kmem accounting feature has three separate states: not initialized, structures allocated, and actively accounting slab memory. These are represented through a combination of the kmem_acct_activated and kmem_acct_active flags, which is confusing. Convert to a kmem_state enum with the states NONE, ALLOCATED, and ONLINE. Then rename the functions to modify the state accordingly. This follows the nomenclature of css object states more closely. 
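A minimal sketch of the resulting state machine; the enum and helper names are taken from the hunks in this patch, while the header-side definition itself is assumed for illustration:

    /* Per-memcg kmem accounting lifecycle after the rename (sketch). */
    enum memcg_kmem_state {
            KMEM_NONE,      /* kmem accounting was never set up */
            KMEM_ALLOCATED, /* id and caches exist, but accounting has stopped */
            KMEM_ONLINE,    /* actively accounting kernel memory */
    };

    static inline bool memcg_kmem_online(struct mem_cgroup *memcg)
    {
            return memcg->kmem_state == KMEM_ONLINE;
    }

One enum field replaces the kmem_acct_activated/kmem_acct_active flag pair, and a memcg only moves forward through the states: NONE -> ONLINE (memcg_online_kmem) -> ALLOCATED (memcg_offline_kmem) -> freed (memcg_free_kmem).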
Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Tejun Heo Acked-by: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 52 +++++++++++++++++++++++++--------------------------- mm/slab_common.c | 4 ++-- mm/vmscan.c | 2 +- 3 files changed, 28 insertions(+), 30 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 71dced17b16d..24b6bded13f8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2378,7 +2378,7 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, struct page_counter *counter; int ret; - if (!memcg_kmem_is_active(memcg)) + if (!memcg_kmem_online(memcg)) return 0; if (!page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) @@ -2861,14 +2861,13 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, } #ifdef CONFIG_MEMCG_KMEM -static int memcg_activate_kmem(struct mem_cgroup *memcg) +static int memcg_online_kmem(struct mem_cgroup *memcg) { int err = 0; int memcg_id; BUG_ON(memcg->kmemcg_id >= 0); - BUG_ON(memcg->kmem_acct_activated); - BUG_ON(memcg->kmem_acct_active); + BUG_ON(memcg->kmem_state); /* * For simplicity, we won't allow this to be disabled. It also can't @@ -2898,14 +2897,13 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg) static_branch_inc(&memcg_kmem_enabled_key); /* - * A memory cgroup is considered kmem-active as soon as it gets + * A memory cgroup is considered kmem-online as soon as it gets * kmemcg_id. Setting the id after enabling static branching will * guarantee no one starts accounting before all call sites are * patched. */ memcg->kmemcg_id = memcg_id; - memcg->kmem_acct_activated = true; - memcg->kmem_acct_active = true; + memcg->kmem_state = KMEM_ONLINE; out: return err; } @@ -2917,8 +2915,8 @@ static int memcg_update_kmem_limit(struct mem_cgroup *memcg, mutex_lock(&memcg_limit_mutex); /* Top-level cgroup doesn't propagate from root */ - if (!memcg_kmem_is_active(memcg)) { - ret = memcg_activate_kmem(memcg); + if (!memcg_kmem_online(memcg)) { + ret = memcg_online_kmem(memcg); if (ret) goto out; } @@ -2938,11 +2936,12 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg) mutex_lock(&memcg_limit_mutex); /* - * If the parent cgroup is not kmem-active now, it cannot be activated - * after this point, because it has at least one child already. + * If the parent cgroup is not kmem-online now, it cannot be + * onlined after this point, because it has at least one child + * already. */ - if (memcg_kmem_is_active(parent)) - ret = memcg_activate_kmem(memcg); + if (memcg_kmem_online(parent)) + ret = memcg_online_kmem(memcg); mutex_unlock(&memcg_limit_mutex); return ret; } @@ -3590,22 +3589,21 @@ static int memcg_init_kmem(struct mem_cgroup *memcg) return tcp_init_cgroup(memcg); } -static void memcg_deactivate_kmem(struct mem_cgroup *memcg) +static void memcg_offline_kmem(struct mem_cgroup *memcg) { struct cgroup_subsys_state *css; struct mem_cgroup *parent, *child; int kmemcg_id; - if (!memcg->kmem_acct_active) + if (memcg->kmem_state != KMEM_ONLINE) return; - /* - * Clear the 'active' flag before clearing memcg_caches arrays entries. - * Since we take the slab_mutex in memcg_deactivate_kmem_caches(), it - * guarantees no cache will be created for this cgroup after we are - * done (see memcg_create_kmem_cache()). + * Clear the online state before clearing memcg_caches array + * entries. 
The slab_mutex in memcg_deactivate_kmem_caches() + * guarantees that no cache will be created for this cgroup + * after we are done (see memcg_create_kmem_cache()). */ - memcg->kmem_acct_active = false; + memcg->kmem_state = KMEM_ALLOCATED; memcg_deactivate_kmem_caches(memcg); @@ -3636,9 +3634,9 @@ static void memcg_deactivate_kmem(struct mem_cgroup *memcg) memcg_free_cache_id(kmemcg_id); } -static void memcg_destroy_kmem(struct mem_cgroup *memcg) +static void memcg_free_kmem(struct mem_cgroup *memcg) { - if (memcg->kmem_acct_activated) { + if (memcg->kmem_state == KMEM_ALLOCATED) { memcg_destroy_kmem_caches(memcg); static_branch_dec(&memcg_kmem_enabled_key); WARN_ON(page_counter_read(&memcg->kmem)); @@ -3651,11 +3649,11 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) return 0; } -static void memcg_deactivate_kmem(struct mem_cgroup *memcg) +static void memcg_offline_kmem(struct mem_cgroup *memcg) { } -static void memcg_destroy_kmem(struct mem_cgroup *memcg) +static void memcg_free_kmem(struct mem_cgroup *memcg) { } #endif @@ -4308,7 +4306,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) vmpressure_cleanup(&memcg->vmpressure); - memcg_deactivate_kmem(memcg); + memcg_offline_kmem(memcg); wb_memcg_offline(memcg); } @@ -4324,7 +4322,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - memcg_destroy_kmem(memcg); + memcg_free_kmem(memcg); #ifdef CONFIG_INET if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) static_branch_dec(&memcg_sockets_enabled_key); diff --git a/mm/slab_common.c b/mm/slab_common.c index e016178063e1..8c262e6dc33e 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -503,10 +503,10 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, mutex_lock(&slab_mutex); /* - * The memory cgroup could have been deactivated while the cache + * The memory cgroup could have been offlined while the cache * creation work was pending. */ - if (!memcg_kmem_is_active(memcg)) + if (!memcg_kmem_online(memcg)) goto out_unlock; idx = memcg_cache_id(memcg); diff --git a/mm/vmscan.c b/mm/vmscan.c index 5ac86956ff9d..05dd182f04fd 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -411,7 +411,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct shrinker *shrinker; unsigned long freed = 0; - if (memcg && !memcg_kmem_is_active(memcg)) + if (memcg && !memcg_kmem_online(memcg)) return 0; if (nr_scanned == 0) -- cgit From 8e0a891213fbddcec231c9d1d7577c320c77a25a Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Jan 2016 15:02:26 -0800 Subject: mm: memcontrol: group kmem init and exit functions together Put all the related code to setup and teardown the kmem accounting state into the same location. No functional change intended. 
Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Tejun Heo Acked-by: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 157 +++++++++++++++++++++++++++----------------------------- 1 file changed, 76 insertions(+), 81 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 24b6bded13f8..3dd9fe35fe62 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2945,12 +2945,88 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg) mutex_unlock(&memcg_limit_mutex); return ret; } + +static int memcg_init_kmem(struct mem_cgroup *memcg) +{ + int ret; + + ret = memcg_propagate_kmem(memcg); + if (ret) + return ret; + + return tcp_init_cgroup(memcg); +} + +static void memcg_offline_kmem(struct mem_cgroup *memcg) +{ + struct cgroup_subsys_state *css; + struct mem_cgroup *parent, *child; + int kmemcg_id; + + if (memcg->kmem_state != KMEM_ONLINE) + return; + /* + * Clear the online state before clearing memcg_caches array + * entries. The slab_mutex in memcg_deactivate_kmem_caches() + * guarantees that no cache will be created for this cgroup + * after we are done (see memcg_create_kmem_cache()). + */ + memcg->kmem_state = KMEM_ALLOCATED; + + memcg_deactivate_kmem_caches(memcg); + + kmemcg_id = memcg->kmemcg_id; + BUG_ON(kmemcg_id < 0); + + parent = parent_mem_cgroup(memcg); + if (!parent) + parent = root_mem_cgroup; + + /* + * Change kmemcg_id of this cgroup and all its descendants to the + * parent's id, and then move all entries from this cgroup's list_lrus + * to ones of the parent. After we have finished, all list_lrus + * corresponding to this cgroup are guaranteed to remain empty. The + * ordering is imposed by list_lru_node->lock taken by + * memcg_drain_all_list_lrus(). + */ + css_for_each_descendant_pre(css, &memcg->css) { + child = mem_cgroup_from_css(css); + BUG_ON(child->kmemcg_id != kmemcg_id); + child->kmemcg_id = parent->kmemcg_id; + if (!memcg->use_hierarchy) + break; + } + memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id); + + memcg_free_cache_id(kmemcg_id); +} + +static void memcg_free_kmem(struct mem_cgroup *memcg) +{ + if (memcg->kmem_state == KMEM_ALLOCATED) { + memcg_destroy_kmem_caches(memcg); + static_branch_dec(&memcg_kmem_enabled_key); + WARN_ON(page_counter_read(&memcg->kmem)); + } + tcp_destroy_cgroup(memcg); +} #else static int memcg_update_kmem_limit(struct mem_cgroup *memcg, unsigned long limit) { return -EINVAL; } +static int memcg_init_kmem(struct mem_cgroup *memcg) +{ + return 0; +} +static void memcg_offline_kmem(struct mem_cgroup *memcg) +{ +} +static void memcg_free_kmem(struct mem_cgroup *memcg) +{ +} #endif /* CONFIG_MEMCG_KMEM */ /* @@ -3577,87 +3653,6 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, return 0; } -#ifdef CONFIG_MEMCG_KMEM -static int memcg_init_kmem(struct mem_cgroup *memcg) -{ - int ret; - - ret = memcg_propagate_kmem(memcg); - if (ret) - return ret; - - return tcp_init_cgroup(memcg); -} - -static void memcg_offline_kmem(struct mem_cgroup *memcg) -{ - struct cgroup_subsys_state *css; - struct mem_cgroup *parent, *child; - int kmemcg_id; - - if (memcg->kmem_state != KMEM_ONLINE) - return; - /* - * Clear the online state before clearing memcg_caches array - * entries. The slab_mutex in memcg_deactivate_kmem_caches() - * guarantees that no cache will be created for this cgroup - * after we are done (see memcg_create_kmem_cache()). 
- */ - memcg->kmem_state = KMEM_ALLOCATED; - - memcg_deactivate_kmem_caches(memcg); - - kmemcg_id = memcg->kmemcg_id; - BUG_ON(kmemcg_id < 0); - - parent = parent_mem_cgroup(memcg); - if (!parent) - parent = root_mem_cgroup; - - /* - * Change kmemcg_id of this cgroup and all its descendants to the - * parent's id, and then move all entries from this cgroup's list_lrus - * to ones of the parent. After we have finished, all list_lrus - * corresponding to this cgroup are guaranteed to remain empty. The - * ordering is imposed by list_lru_node->lock taken by - * memcg_drain_all_list_lrus(). - */ - css_for_each_descendant_pre(css, &memcg->css) { - child = mem_cgroup_from_css(css); - BUG_ON(child->kmemcg_id != kmemcg_id); - child->kmemcg_id = parent->kmemcg_id; - if (!memcg->use_hierarchy) - break; - } - memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id); - - memcg_free_cache_id(kmemcg_id); -} - -static void memcg_free_kmem(struct mem_cgroup *memcg) -{ - if (memcg->kmem_state == KMEM_ALLOCATED) { - memcg_destroy_kmem_caches(memcg); - static_branch_dec(&memcg_kmem_enabled_key); - WARN_ON(page_counter_read(&memcg->kmem)); - } - tcp_destroy_cgroup(memcg); -} -#else -static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) -{ - return 0; -} - -static void memcg_offline_kmem(struct mem_cgroup *memcg) -{ -} - -static void memcg_free_kmem(struct mem_cgroup *memcg) -{ -} -#endif - #ifdef CONFIG_CGROUP_WRITEBACK struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg) -- cgit From 3893e302f6a377c4ef0f077f190bf760bf84e0be Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Jan 2016 15:02:29 -0800 Subject: mm: memcontrol: separate kmem code from legacy tcp accounting code The cgroup2 memory controller will include important in-kernel memory consumers per default, including socket memory, but it will no longer carry the historic tcp control interface. Separate the kmem state init from the tcp control interface init in preparation for that. 
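Condensed from the hunks below, a sketch of what the onlining path looks like once the two initializations are driven separately (error handling kept, everything else trimmed):

    #ifdef CONFIG_MEMCG_KMEM
            ret = memcg_propagate_kmem(memcg);      /* kmem accounting state */
            if (ret)
                    return ret;

            ret = tcp_init_cgroup(memcg);           /* legacy tcp limit interface */
            if (ret)
                    return ret;
    #endif

Keeping the calls adjacent but independent is what lets later patches in the series compile the tcp control interface out of the cgroup2 path while kmem accounting stays.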
Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Tejun Heo Acked-by: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 33 ++++++++++++--------------------- 1 file changed, 12 insertions(+), 21 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3dd9fe35fe62..7f8219b58e0c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2946,17 +2946,6 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg) return ret; } -static int memcg_init_kmem(struct mem_cgroup *memcg) -{ - int ret; - - ret = memcg_propagate_kmem(memcg); - if (ret) - return ret; - - return tcp_init_cgroup(memcg); -} - static void memcg_offline_kmem(struct mem_cgroup *memcg) { struct cgroup_subsys_state *css; @@ -3009,7 +2998,6 @@ static void memcg_free_kmem(struct mem_cgroup *memcg) static_branch_dec(&memcg_kmem_enabled_key); WARN_ON(page_counter_read(&memcg->kmem)); } - tcp_destroy_cgroup(memcg); } #else static int memcg_update_kmem_limit(struct mem_cgroup *memcg, @@ -3017,16 +3005,9 @@ static int memcg_update_kmem_limit(struct mem_cgroup *memcg, { return -EINVAL; } -static int memcg_init_kmem(struct mem_cgroup *memcg) -{ - return 0; -} static void memcg_offline_kmem(struct mem_cgroup *memcg) { } -static void memcg_free_kmem(struct mem_cgroup *memcg) -{ -} #endif /* CONFIG_MEMCG_KMEM */ /* @@ -4263,9 +4244,14 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) } mutex_unlock(&memcg_create_mutex); - ret = memcg_init_kmem(memcg); +#ifdef CONFIG_MEMCG_KMEM + ret = memcg_propagate_kmem(memcg); if (ret) return ret; + ret = tcp_init_cgroup(memcg); + if (ret) + return ret; +#endif #ifdef CONFIG_INET if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) @@ -4317,11 +4303,16 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - memcg_free_kmem(memcg); #ifdef CONFIG_INET if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) static_branch_dec(&memcg_sockets_enabled_key); #endif + +#ifdef CONFIG_MEMCG_KMEM + memcg_free_kmem(memcg); + tcp_destroy_cgroup(memcg); +#endif + __mem_cgroup_free(memcg); } -- cgit From 127424c86bb6cb87f0b563d9fdcfbbaf3c86ecec Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Jan 2016 15:02:32 -0800 Subject: mm: memcontrol: move kmem accounting code to CONFIG_MEMCG The cgroup2 memory controller will account important in-kernel memory consumers per default. Move all necessary components to CONFIG_MEMCG. 
Signed-off-by: Johannes Weiner Acked-by: Vladimir Davydov Cc: Michal Hocko Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/list_lru.c | 12 +++++----- mm/memcontrol.c | 69 +++++++++++++++++++++++++++++++++----------------------- mm/slab.h | 6 ++--- mm/slab_common.c | 10 ++++---- mm/slub.c | 10 ++++---- 5 files changed, 60 insertions(+), 47 deletions(-) (limited to 'mm') diff --git a/mm/list_lru.c b/mm/list_lru.c index afc71ea9a381..1d05cb9d363d 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -12,7 +12,7 @@ #include #include -#ifdef CONFIG_MEMCG_KMEM +#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) static LIST_HEAD(list_lrus); static DEFINE_MUTEX(list_lrus_mutex); @@ -37,9 +37,9 @@ static void list_lru_register(struct list_lru *lru) static void list_lru_unregister(struct list_lru *lru) { } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ -#ifdef CONFIG_MEMCG_KMEM +#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) static inline bool list_lru_memcg_aware(struct list_lru *lru) { /* @@ -104,7 +104,7 @@ list_lru_from_kmem(struct list_lru_node *nlru, void *ptr) { return &nlru->lru; } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ bool list_lru_add(struct list_lru *lru, struct list_head *item) { @@ -292,7 +292,7 @@ static void init_one_lru(struct list_lru_one *l) l->nr_items = 0; } -#ifdef CONFIG_MEMCG_KMEM +#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) static void __memcg_destroy_list_lru_node(struct list_lru_memcg *memcg_lrus, int begin, int end) { @@ -529,7 +529,7 @@ static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) static void memcg_destroy_list_lru(struct list_lru *lru) { } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ int __list_lru_init(struct list_lru *lru, bool memcg_aware, struct lock_class_key *key) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7f8219b58e0c..fe51d5e61389 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -297,7 +297,7 @@ static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) return mem_cgroup_from_css(css); } -#ifdef CONFIG_MEMCG_KMEM +#ifndef CONFIG_SLOB /* * This will be the memcg's index in each cache's ->memcg_params.memcg_caches. 
* The main reason for not using cgroup id for this: @@ -349,7 +349,7 @@ void memcg_put_cache_ids(void) DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key); EXPORT_SYMBOL(memcg_kmem_enabled_key); -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* !CONFIG_SLOB */ static struct mem_cgroup_per_zone * mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) @@ -2203,7 +2203,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg, unlock_page_lru(page, isolated); } -#ifdef CONFIG_MEMCG_KMEM +#ifndef CONFIG_SLOB static int memcg_alloc_cache_id(void) { int id, size; @@ -2424,7 +2424,7 @@ void __memcg_kmem_uncharge(struct page *page, int order) page->mem_cgroup = NULL; css_put_many(&memcg->css, nr_pages); } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* !CONFIG_SLOB */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -2860,7 +2860,7 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, } } -#ifdef CONFIG_MEMCG_KMEM +#ifndef CONFIG_SLOB static int memcg_online_kmem(struct mem_cgroup *memcg) { int err = 0; @@ -2908,24 +2908,6 @@ out: return err; } -static int memcg_update_kmem_limit(struct mem_cgroup *memcg, - unsigned long limit) -{ - int ret; - - mutex_lock(&memcg_limit_mutex); - /* Top-level cgroup doesn't propagate from root */ - if (!memcg_kmem_online(memcg)) { - ret = memcg_online_kmem(memcg); - if (ret) - goto out; - } - ret = page_counter_limit(&memcg->kmem, limit); -out: - mutex_unlock(&memcg_limit_mutex); - return ret; -} - static int memcg_propagate_kmem(struct mem_cgroup *memcg) { int ret = 0; @@ -3000,16 +2982,45 @@ static void memcg_free_kmem(struct mem_cgroup *memcg) } } #else +static int memcg_propagate_kmem(struct mem_cgroup *memcg) +{ + return 0; +} +static void memcg_offline_kmem(struct mem_cgroup *memcg) +{ +} +static void memcg_free_kmem(struct mem_cgroup *memcg) +{ +} +#endif /* !CONFIG_SLOB */ + +#ifdef CONFIG_MEMCG_KMEM static int memcg_update_kmem_limit(struct mem_cgroup *memcg, unsigned long limit) { - return -EINVAL; + int ret; + + mutex_lock(&memcg_limit_mutex); + /* Top-level cgroup doesn't propagate from root */ + if (!memcg_kmem_online(memcg)) { + ret = memcg_online_kmem(memcg); + if (ret) + goto out; + } + ret = page_counter_limit(&memcg->kmem, limit); +out: + mutex_unlock(&memcg_limit_mutex); + return ret; } -static void memcg_offline_kmem(struct mem_cgroup *memcg) +#else +static int memcg_update_kmem_limit(struct mem_cgroup *memcg, + unsigned long limit) { + return -EINVAL; } #endif /* CONFIG_MEMCG_KMEM */ + /* * The user of this function is... * RES_LIMIT. 
@@ -4182,7 +4193,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) vmpressure_init(&memcg->vmpressure); INIT_LIST_HEAD(&memcg->event_list); spin_lock_init(&memcg->event_list_lock); -#ifdef CONFIG_MEMCG_KMEM +#ifndef CONFIG_SLOB memcg->kmemcg_id = -1; #endif #ifdef CONFIG_CGROUP_WRITEBACK @@ -4244,10 +4255,11 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) } mutex_unlock(&memcg_create_mutex); -#ifdef CONFIG_MEMCG_KMEM ret = memcg_propagate_kmem(memcg); if (ret) return ret; + +#ifdef CONFIG_MEMCG_KMEM ret = tcp_init_cgroup(memcg); if (ret) return ret; @@ -4308,8 +4320,9 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) static_branch_dec(&memcg_sockets_enabled_key); #endif -#ifdef CONFIG_MEMCG_KMEM memcg_free_kmem(memcg); + +#ifdef CONFIG_MEMCG_KMEM tcp_destroy_cgroup(memcg); #endif diff --git a/mm/slab.h b/mm/slab.h index c63b8699cfa3..834ad240c0bb 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -173,7 +173,7 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer, void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); -#ifdef CONFIG_MEMCG_KMEM +#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) /* * Iterate over all memcg caches of the given root cache. The caller must hold * slab_mutex. @@ -251,7 +251,7 @@ static __always_inline int memcg_charge_slab(struct page *page, extern void slab_init_memcg_params(struct kmem_cache *); -#else /* !CONFIG_MEMCG_KMEM */ +#else /* CONFIG_MEMCG && !CONFIG_SLOB */ #define for_each_memcg_cache(iter, root) \ for ((void)(iter), (void)(root); 0; ) @@ -292,7 +292,7 @@ static inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order, static inline void slab_init_memcg_params(struct kmem_cache *s) { } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) { diff --git a/mm/slab_common.c b/mm/slab_common.c index 8c262e6dc33e..b50aef01ccf7 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -128,7 +128,7 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr, return i; } -#ifdef CONFIG_MEMCG_KMEM +#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) void slab_init_memcg_params(struct kmem_cache *s) { s->memcg_params.is_root_cache = true; @@ -221,7 +221,7 @@ static inline int init_memcg_params(struct kmem_cache *s, static inline void destroy_memcg_params(struct kmem_cache *s) { } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ /* * Find a mergeable slab cache @@ -477,7 +477,7 @@ static void release_caches(struct list_head *release, bool need_rcu_barrier) } } -#ifdef CONFIG_MEMCG_KMEM +#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) /* * memcg_create_kmem_cache - Create a cache for a memory cgroup. * @memcg: The memory cgroup the new cache is for. 
@@ -689,7 +689,7 @@ static inline int shutdown_memcg_caches(struct kmem_cache *s, { return 0; } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ void slab_kmem_cache_release(struct kmem_cache *s) { @@ -1123,7 +1123,7 @@ static int slab_show(struct seq_file *m, void *p) return 0; } -#ifdef CONFIG_MEMCG_KMEM +#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) int memcg_slab_show(struct seq_file *m, void *p) { struct kmem_cache *s = list_entry(p, struct kmem_cache, list); diff --git a/mm/slub.c b/mm/slub.c index b21fd24b08b1..2e1355ac056b 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5207,7 +5207,7 @@ static ssize_t slab_attr_store(struct kobject *kobj, return -EIO; err = attribute->store(s, buf, len); -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG if (slab_state >= FULL && err >= 0 && is_root_cache(s)) { struct kmem_cache *c; @@ -5242,7 +5242,7 @@ static ssize_t slab_attr_store(struct kobject *kobj, static void memcg_propagate_slab_attrs(struct kmem_cache *s) { -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG int i; char *buffer = NULL; struct kmem_cache *root_cache; @@ -5328,7 +5328,7 @@ static struct kset *slab_kset; static inline struct kset *cache_kset(struct kmem_cache *s) { -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG if (!is_root_cache(s)) return s->memcg_params.root_cache->memcg_kset; #endif @@ -5405,7 +5405,7 @@ static int sysfs_slab_add(struct kmem_cache *s) if (err) goto out_del_kobj; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG if (is_root_cache(s)) { s->memcg_kset = kset_create_and_add("cgroup", NULL, &s->kobj); if (!s->memcg_kset) { @@ -5438,7 +5438,7 @@ void sysfs_slab_remove(struct kmem_cache *s) */ return; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG kset_unregister(s->memcg_kset); #endif kobject_uevent(&s->kobj, KOBJ_REMOVE); -- cgit From 52c29b04823cb1bab2805336b80866325fe2bc3f Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Jan 2016 15:02:35 -0800 Subject: mm: memcontrol: account "kmem" consumers in cgroup2 memory controller The original cgroup memory controller has an extension to account slab memory (and other "kernel memory" consumers) in a separate "kmem" counter, once the user set an explicit limit on that "kmem" pool. However, this includes various consumers whose sizes are directly linked to userspace activity. Accounting them as an optional "kmem" extension is problematic for several reasons: 1. It leaves the main memory interface with incomplete semantics. A user who puts their workload into a cgroup and configures a memory limit does not expect us to leave holes in the containment as big as the dentry and inode cache, or the kernel stack pages. 2. If the limit set on this random historical subgroup of consumers is reached, subsequent allocations will fail even when the main memory pool available to the cgroup is not yet exhausted and/or has reclaimable memory in it. 3. Calling it 'kernel memory' is misleading. The dentry and inode caches are no more 'kernel' (or no less 'user') memory than the page cache itself. Treating these consumers as different classes is a historical implementation detail that should not leak to users. 
So, in addition to page cache, anonymous memory, and network socket memory, account the following memory consumers per default in the cgroup2 memory controller: - threadinfo - task_struct - task_delay_info - pid - cred - mm_struct - vm_area_struct and vm_region (nommu) - anon_vma and anon_vma_chain - signal_struct - sighand_struct - fs_struct - files_struct - fdtable and fdtable->full_fds_bits - dentry and external_name - inode for all filesystems. This should give us reasonable memory isolation for most common workloads out of the box. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Tejun Heo Acked-by: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fe51d5e61389..9e7a4e521917 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2381,13 +2381,14 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, if (!memcg_kmem_online(memcg)) return 0; - if (!page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) - return -ENOMEM; - ret = try_charge(memcg, gfp, nr_pages); - if (ret) { - page_counter_uncharge(&memcg->kmem, nr_pages); + if (ret) return ret; + + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && + !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) { + cancel_charge(memcg, nr_pages); + return -ENOMEM; } page->mem_cgroup = memcg; @@ -2416,7 +2417,9 @@ void __memcg_kmem_uncharge(struct page *page, int order) VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); - page_counter_uncharge(&memcg->kmem, nr_pages); + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + page_counter_uncharge(&memcg->kmem, nr_pages); + page_counter_uncharge(&memcg->memory, nr_pages); if (do_memsw_account()) page_counter_uncharge(&memcg->memsw, nr_pages); @@ -2922,7 +2925,8 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg) * onlined after this point, because it has at least one child * already. */ - if (memcg_kmem_online(parent)) + if (cgroup_subsys_on_dfl(memory_cgrp_subsys) || + memcg_kmem_online(parent)) ret = memcg_online_kmem(memcg); mutex_unlock(&memcg_limit_mutex); return ret; -- cgit From 04823c833b3eaef7816e28e3727124394f6bb3c3 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 20 Jan 2016 15:02:38 -0800 Subject: mm: memcontrol: allow to disable kmem accounting for cgroup2 Kmem accounting might incur overhead that some users can't put up with. Besides, the implementation is still considered unstable. So let's provide a way to disable it for those users who aren't happy with it. To disable kmem accounting for cgroup2, pass cgroup.memory=nokmem at boot time. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9e7a4e521917..2239e6dd4d4c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -83,6 +83,9 @@ struct mem_cgroup *root_mem_cgroup __read_mostly; /* Socket memory accounting disabled? */ static bool cgroup_memory_nosocket; +/* Kernel memory accounting disabled? */ +static bool cgroup_memory_nokmem; + /* Whether the swap controller is active */ #ifdef CONFIG_MEMCG_SWAP int do_swap_account __read_mostly; @@ -2925,8 +2928,8 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg) * onlined after this point, because it has at least one child * already. 
*/ - if (cgroup_subsys_on_dfl(memory_cgrp_subsys) || - memcg_kmem_online(parent)) + if (memcg_kmem_online(parent) || + (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nokmem)) ret = memcg_online_kmem(memcg); mutex_unlock(&memcg_limit_mutex); return ret; @@ -5638,6 +5641,8 @@ static int __init cgroup_memory(char *s) continue; if (!strcmp(token, "nosocket")) cgroup_memory_nosocket = true; + if (!strcmp(token, "nokmem")) + cgroup_memory_nokmem = true; } return 0; } -- cgit From 489c2a20a414351fe0813a727c34600c0f7292ae Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Jan 2016 15:02:41 -0800 Subject: mm: memcontrol: introduce CONFIG_MEMCG_LEGACY_KMEM Let the user know that CONFIG_MEMCG_KMEM does not apply to the cgroup2 interface. This also makes legacy-only code sections stand out better. [arnd@arndb.de: mm: memcontrol: only manage socket pressure for CONFIG_INET] Signed-off-by: Johannes Weiner Cc: Michal Hocko Cc: Tejun Heo Acked-by: Vladimir Davydov Signed-off-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 18 +++++++++--------- mm/vmpressure.c | 2 ++ 2 files changed, 11 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2239e6dd4d4c..92e8ab67b6df 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3001,7 +3001,7 @@ static void memcg_free_kmem(struct mem_cgroup *memcg) } #endif /* !CONFIG_SLOB */ -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG_LEGACY_KMEM static int memcg_update_kmem_limit(struct mem_cgroup *memcg, unsigned long limit) { @@ -3025,7 +3025,7 @@ static int memcg_update_kmem_limit(struct mem_cgroup *memcg, { return -EINVAL; } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* CONFIG_MEMCG_LEGACY_KMEM */ /* @@ -4039,7 +4039,7 @@ static struct cftype mem_cgroup_legacy_files[] = { .seq_show = memcg_numa_stat_show, }, #endif -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG_LEGACY_KMEM { .name = "kmem.limit_in_bytes", .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), @@ -4266,13 +4266,13 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) if (ret) return ret; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_INET +#ifdef CONFIG_MEMCG_LEGACY_KMEM ret = tcp_init_cgroup(memcg); if (ret) return ret; #endif -#ifdef CONFIG_INET if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) static_branch_inc(&memcg_sockets_enabled_key); #endif @@ -4329,7 +4329,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) memcg_free_kmem(memcg); -#ifdef CONFIG_MEMCG_KMEM +#if defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET) tcp_destroy_cgroup(memcg); #endif @@ -5558,7 +5558,7 @@ void sock_update_memcg(struct sock *sk) memcg = mem_cgroup_from_task(current); if (memcg == root_mem_cgroup) goto out; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG_LEGACY_KMEM if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcp_mem.active) goto out; #endif @@ -5587,7 +5587,7 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) { gfp_t gfp_mask = GFP_KERNEL; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG_LEGACY_KMEM if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { struct page_counter *counter; @@ -5619,7 +5619,7 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) */ void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) { -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG_LEGACY_KMEM if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { page_counter_uncharge(&memcg->tcp_mem.memory_allocated, 
nr_pages); diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 9a6c0704211c..89b1d441af4b 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -275,6 +275,7 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, level = vmpressure_calc_level(scanned, reclaimed); +#ifdef CONFIG_INET if (level > VMPRESSURE_LOW) { /* * Let the socket buffer allocator know that @@ -286,6 +287,7 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, */ memcg->socket_pressure = jiffies + HZ; } +#endif } } -- cgit From d55f90bfab40e3b5db323711d28186ff09461692 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 20 Jan 2016 15:02:44 -0800 Subject: net: drop tcp_memcontrol.c tcp_memcontrol.c only contains legacy memory.tcp.kmem.* file definitions and mem_cgroup->tcp_mem init/destroy stuff. This doesn't belong to network subsys. Let's move it to memcontrol.c. This also allows us to reuse generic code for handling legacy memcg files. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: "David S. Miller" Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 90 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 92e8ab67b6df..15896708429b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -66,7 +66,6 @@ #include "internal.h" #include #include -#include #include "slab.h" #include @@ -242,6 +241,7 @@ enum res_type { _MEMSWAP, _OOM_TYPE, _KMEM, + _TCP, }; #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) @@ -2842,6 +2842,11 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, case _KMEM: counter = &memcg->kmem; break; +#if defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET) + case _TCP: + counter = &memcg->tcp_mem.memory_allocated; + break; +#endif default: BUG(); } @@ -3028,6 +3033,48 @@ static int memcg_update_kmem_limit(struct mem_cgroup *memcg, #endif /* CONFIG_MEMCG_LEGACY_KMEM */ +#if defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET) +static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit) +{ + int ret; + + mutex_lock(&memcg_limit_mutex); + + ret = page_counter_limit(&memcg->tcp_mem.memory_allocated, limit); + if (ret) + goto out; + + if (!memcg->tcp_mem.active) { + /* + * The active flag needs to be written after the static_key + * update. This is what guarantees that the socket activation + * function is the last one to run. See sock_update_memcg() for + * details, and note that we don't mark any socket as belonging + * to this memcg until that flag is up. + * + * We need to do this, because static_keys will span multiple + * sites, but we can't control their order. If we mark a socket + * as accounted, but the accounting functions are not patched in + * yet, we'll lose accounting. + * + * We never race with the readers in sock_update_memcg(), + * because when this value change, the code to process it is not + * patched in yet. + */ + static_branch_inc(&memcg_sockets_enabled_key); + memcg->tcp_mem.active = true; + } +out: + mutex_unlock(&memcg_limit_mutex); + return ret; +} +#else +static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit) +{ + return -EINVAL; +} +#endif /* CONFIG_MEMCG_LEGACY_KMEM && CONFIG_INET */ + /* * The user of this function is... * RES_LIMIT. 
@@ -3060,6 +3107,9 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of, case _KMEM: ret = memcg_update_kmem_limit(memcg, nr_pages); break; + case _TCP: + ret = memcg_update_tcp_limit(memcg, nr_pages); + break; } break; case RES_SOFT_LIMIT: @@ -3086,6 +3136,11 @@ static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, case _KMEM: counter = &memcg->kmem; break; +#if defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET) + case _TCP: + counter = &memcg->tcp_mem.memory_allocated; + break; +#endif default: BUG(); } @@ -4072,6 +4127,31 @@ static struct cftype mem_cgroup_legacy_files[] = { .seq_show = memcg_slab_show, }, #endif +#ifdef CONFIG_INET + { + .name = "kmem.tcp.limit_in_bytes", + .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT), + .write = mem_cgroup_write, + .read_u64 = mem_cgroup_read_u64, + }, + { + .name = "kmem.tcp.usage_in_bytes", + .private = MEMFILE_PRIVATE(_TCP, RES_USAGE), + .read_u64 = mem_cgroup_read_u64, + }, + { + .name = "kmem.tcp.failcnt", + .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT), + .write = mem_cgroup_reset, + .read_u64 = mem_cgroup_read_u64, + }, + { + .name = "kmem.tcp.max_usage_in_bytes", + .private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE), + .write = mem_cgroup_reset, + .read_u64 = mem_cgroup_read_u64, + }, +#endif #endif { }, /* terminate */ }; @@ -4241,6 +4321,10 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) memcg->soft_limit = PAGE_COUNTER_MAX; page_counter_init(&memcg->memsw, &parent->memsw); page_counter_init(&memcg->kmem, &parent->kmem); +#if defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET) + page_counter_init(&memcg->tcp_mem.memory_allocated, + &parent->tcp_mem.memory_allocated); +#endif /* * No need to take a reference to the parent because cgroup @@ -4252,6 +4336,9 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) memcg->soft_limit = PAGE_COUNTER_MAX; page_counter_init(&memcg->memsw, NULL); page_counter_init(&memcg->kmem, NULL); +#if defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET) + page_counter_init(&memcg->tcp_mem.memory_allocated, NULL); +#endif /* * Deeper hierachy with use_hierarchy == false doesn't make * much sense so let cgroup subsystem know about this @@ -4267,12 +4354,6 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) return ret; #ifdef CONFIG_INET -#ifdef CONFIG_MEMCG_LEGACY_KMEM - ret = tcp_init_cgroup(memcg); - if (ret) - return ret; -#endif - if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) static_branch_inc(&memcg_sockets_enabled_key); #endif @@ -4330,7 +4411,8 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) memcg_free_kmem(memcg); #if defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET) - tcp_destroy_cgroup(memcg); + if (memcg->tcp_mem.active) + static_branch_dec(&memcg_sockets_enabled_key); #endif __mem_cgroup_free(memcg); -- cgit From d886f4e483ce63a3304adc9eda87031b93341c28 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Jan 2016 15:02:47 -0800 Subject: mm: memcontrol: rein in the CONFIG space madness What CONFIG_INET and CONFIG_LEGACY_KMEM guard inside the memory controller code is insignificant, having these conditionals is not worth the complication and fragility that comes with them. 
[akpm@linux-foundation.org: rework mem_cgroup_css_free() statement ordering] Signed-off-by: Johannes Weiner Cc: Michal Hocko Acked-by: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 53 ++++------------------------------------------------- mm/vmpressure.c | 2 -- 2 files changed, 4 insertions(+), 51 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 15896708429b..379f9911b87b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2842,11 +2842,9 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, case _KMEM: counter = &memcg->kmem; break; -#if defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET) case _TCP: counter = &memcg->tcp_mem.memory_allocated; break; -#endif default: BUG(); } @@ -3006,7 +3004,6 @@ static void memcg_free_kmem(struct mem_cgroup *memcg) } #endif /* !CONFIG_SLOB */ -#ifdef CONFIG_MEMCG_LEGACY_KMEM static int memcg_update_kmem_limit(struct mem_cgroup *memcg, unsigned long limit) { @@ -3024,16 +3021,7 @@ out: mutex_unlock(&memcg_limit_mutex); return ret; } -#else -static int memcg_update_kmem_limit(struct mem_cgroup *memcg, - unsigned long limit) -{ - return -EINVAL; -} -#endif /* CONFIG_MEMCG_LEGACY_KMEM */ - -#if defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET) static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit) { int ret; @@ -3068,12 +3056,6 @@ out: mutex_unlock(&memcg_limit_mutex); return ret; } -#else -static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit) -{ - return -EINVAL; -} -#endif /* CONFIG_MEMCG_LEGACY_KMEM && CONFIG_INET */ /* * The user of this function is... @@ -3136,11 +3118,9 @@ static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, case _KMEM: counter = &memcg->kmem; break; -#if defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET) case _TCP: counter = &memcg->tcp_mem.memory_allocated; break; -#endif default: BUG(); } @@ -4094,7 +4074,6 @@ static struct cftype mem_cgroup_legacy_files[] = { .seq_show = memcg_numa_stat_show, }, #endif -#ifdef CONFIG_MEMCG_LEGACY_KMEM { .name = "kmem.limit_in_bytes", .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), @@ -4127,7 +4106,6 @@ static struct cftype mem_cgroup_legacy_files[] = { .seq_show = memcg_slab_show, }, #endif -#ifdef CONFIG_INET { .name = "kmem.tcp.limit_in_bytes", .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT), @@ -4151,8 +4129,6 @@ static struct cftype mem_cgroup_legacy_files[] = { .write = mem_cgroup_reset, .read_u64 = mem_cgroup_read_u64, }, -#endif -#endif { }, /* terminate */ }; @@ -4280,14 +4256,12 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) vmpressure_init(&memcg->vmpressure); INIT_LIST_HEAD(&memcg->event_list); spin_lock_init(&memcg->event_list_lock); + memcg->socket_pressure = jiffies; #ifndef CONFIG_SLOB memcg->kmemcg_id = -1; #endif #ifdef CONFIG_CGROUP_WRITEBACK INIT_LIST_HEAD(&memcg->cgwb_list); -#endif -#ifdef CONFIG_INET - memcg->socket_pressure = jiffies; #endif return &memcg->css; @@ -4321,10 +4295,8 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) memcg->soft_limit = PAGE_COUNTER_MAX; page_counter_init(&memcg->memsw, &parent->memsw); page_counter_init(&memcg->kmem, &parent->kmem); -#if defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET) page_counter_init(&memcg->tcp_mem.memory_allocated, &parent->tcp_mem.memory_allocated); -#endif /* * No need to take a reference to the parent because cgroup @@ -4336,9 +4308,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) 
memcg->soft_limit = PAGE_COUNTER_MAX; page_counter_init(&memcg->memsw, NULL); page_counter_init(&memcg->kmem, NULL); -#if defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET) page_counter_init(&memcg->tcp_mem.memory_allocated, NULL); -#endif /* * Deeper hierachy with use_hierarchy == false doesn't make * much sense so let cgroup subsystem know about this @@ -4353,10 +4323,8 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) if (ret) return ret; -#ifdef CONFIG_INET if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) static_branch_inc(&memcg_sockets_enabled_key); -#endif /* * Make sure the memcg is initialized: mem_cgroup_iter() @@ -4403,18 +4371,13 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); -#ifdef CONFIG_INET if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) static_branch_dec(&memcg_sockets_enabled_key); -#endif - - memcg_free_kmem(memcg); -#if defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET) if (memcg->tcp_mem.active) static_branch_dec(&memcg_sockets_enabled_key); -#endif + memcg_free_kmem(memcg); __mem_cgroup_free(memcg); } @@ -5613,8 +5576,6 @@ void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage) commit_charge(newpage, memcg, true); } -#ifdef CONFIG_INET - DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key); EXPORT_SYMBOL(memcg_sockets_enabled_key); @@ -5640,10 +5601,8 @@ void sock_update_memcg(struct sock *sk) memcg = mem_cgroup_from_task(current); if (memcg == root_mem_cgroup) goto out; -#ifdef CONFIG_MEMCG_LEGACY_KMEM if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcp_mem.active) goto out; -#endif if (css_tryget_online(&memcg->css)) sk->sk_memcg = memcg; out: @@ -5669,7 +5628,6 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) { gfp_t gfp_mask = GFP_KERNEL; -#ifdef CONFIG_MEMCG_LEGACY_KMEM if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { struct page_counter *counter; @@ -5682,7 +5640,7 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) memcg->tcp_mem.memory_pressure = 1; return false; } -#endif + /* Don't block in the packet receive path */ if (in_softirq()) gfp_mask = GFP_NOWAIT; @@ -5701,19 +5659,16 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) */ void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) { -#ifdef CONFIG_MEMCG_LEGACY_KMEM if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { page_counter_uncharge(&memcg->tcp_mem.memory_allocated, nr_pages); return; } -#endif + page_counter_uncharge(&memcg->memory, nr_pages); css_put_many(&memcg->css, nr_pages); } -#endif /* CONFIG_INET */ - static int __init cgroup_memory(char *s) { char *token; diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 89b1d441af4b..9a6c0704211c 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -275,7 +275,6 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, level = vmpressure_calc_level(scanned, reclaimed); -#ifdef CONFIG_INET if (level > VMPRESSURE_LOW) { /* * Let the socket buffer allocator know that @@ -287,7 +286,6 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, */ memcg->socket_pressure = jiffies + HZ; } -#endif } } -- cgit From 0db1529817b7b16226421f01470c5ba982c5f302 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Jan 2016 15:02:50 -0800 Subject: mm: memcontrol: flatten struct cg_proto There are no more external users of struct cg_proto, flatten the structure 
into struct mem_cgroup. Since using those struct members doesn't stand out as much anymore, add cgroup2 static branches to make it clearer which code is legacy. Suggested-by: Vladimir Davydov Signed-off-by: Johannes Weiner Acked-by: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 379f9911b87b..6937f16f5ecb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2843,7 +2843,7 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, counter = &memcg->kmem; break; case _TCP: - counter = &memcg->tcp_mem.memory_allocated; + counter = &memcg->tcpmem; break; default: BUG(); @@ -3028,11 +3028,11 @@ static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit) mutex_lock(&memcg_limit_mutex); - ret = page_counter_limit(&memcg->tcp_mem.memory_allocated, limit); + ret = page_counter_limit(&memcg->tcpmem, limit); if (ret) goto out; - if (!memcg->tcp_mem.active) { + if (!memcg->tcpmem_active) { /* * The active flag needs to be written after the static_key * update. This is what guarantees that the socket activation @@ -3050,7 +3050,7 @@ static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit) * patched in yet. */ static_branch_inc(&memcg_sockets_enabled_key); - memcg->tcp_mem.active = true; + memcg->tcpmem_active = true; } out: mutex_unlock(&memcg_limit_mutex); @@ -3119,7 +3119,7 @@ static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, counter = &memcg->kmem; break; case _TCP: - counter = &memcg->tcp_mem.memory_allocated; + counter = &memcg->tcpmem; break; default: BUG(); @@ -4295,8 +4295,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) memcg->soft_limit = PAGE_COUNTER_MAX; page_counter_init(&memcg->memsw, &parent->memsw); page_counter_init(&memcg->kmem, &parent->kmem); - page_counter_init(&memcg->tcp_mem.memory_allocated, - &parent->tcp_mem.memory_allocated); + page_counter_init(&memcg->tcpmem, &parent->tcpmem); /* * No need to take a reference to the parent because cgroup @@ -4308,7 +4307,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) memcg->soft_limit = PAGE_COUNTER_MAX; page_counter_init(&memcg->memsw, NULL); page_counter_init(&memcg->kmem, NULL); - page_counter_init(&memcg->tcp_mem.memory_allocated, NULL); + page_counter_init(&memcg->tcpmem, NULL); /* * Deeper hierachy with use_hierarchy == false doesn't make * much sense so let cgroup subsystem know about this @@ -4374,7 +4373,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) static_branch_dec(&memcg_sockets_enabled_key); - if (memcg->tcp_mem.active) + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active) static_branch_dec(&memcg_sockets_enabled_key); memcg_free_kmem(memcg); @@ -5601,7 +5600,7 @@ void sock_update_memcg(struct sock *sk) memcg = mem_cgroup_from_task(current); if (memcg == root_mem_cgroup) goto out; - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcp_mem.active) + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active) goto out; if (css_tryget_online(&memcg->css)) sk->sk_memcg = memcg; @@ -5629,15 +5628,14 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) gfp_t gfp_mask = GFP_KERNEL; if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { - struct page_counter *counter; + struct 
page_counter *fail; - if (page_counter_try_charge(&memcg->tcp_mem.memory_allocated, - nr_pages, &counter)) { - memcg->tcp_mem.memory_pressure = 0; + if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) { + memcg->tcpmem_pressure = 0; return true; } - page_counter_charge(&memcg->tcp_mem.memory_allocated, nr_pages); - memcg->tcp_mem.memory_pressure = 1; + page_counter_charge(&memcg->tcpmem, nr_pages); + memcg->tcpmem_pressure = 1; return false; } @@ -5660,8 +5658,7 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) { if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { - page_counter_uncharge(&memcg->tcp_mem.memory_allocated, - nr_pages); + page_counter_uncharge(&memcg->tcpmem, nr_pages); return; } -- cgit From 0b8f73e104285a4badf9d768d1c39b06d77d1f97 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Jan 2016 15:02:53 -0800 Subject: mm: memcontrol: clean up alloc, online, offline, free functions The creation and teardown of struct mem_cgroup is fairly messy and that has attracted mistakes and subtle bugs before. The main cause for this is that there is no clear model about what needs to happen when, and that attracts more chaos. So create one: 1. mem_cgroup_alloc() should allocate struct mem_cgroup and its auxiliary members and initialize work items, locks etc. so that the object it returns is fully initialized and in a neutral state. 2. mem_cgroup_css_alloc() will use mem_cgroup_alloc() to obtain a new memcg object and configure it and the system according to the role of the new memory-controlled cgroup in the hierarchy. 3. mem_cgroup_css_online() is no longer needed to synchronize with iterators, but it verifies css->id which isn't available earlier. 4. mem_cgroup_css_offline() implements stuff that needs to happen upon the user-visible destruction of a cgroup, which includes stopping all user interfacing as well as releasing certain structures when continued memory consumption would be unexpected at that point. 5. mem_cgroup_css_free() prepares the system and the memcg object for the object's disappearance, neutralizes its state, and then gives it back to mem_cgroup_free(). 6. mem_cgroup_free() releases struct mem_cgroup and auxiliary memory. [arnd@arndb.de: fix SLOB build regression] Signed-off-by: Johannes Weiner Acked-by: Vladimir Davydov Cc: Michal Hocko Signed-off-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 257 ++++++++++++++++++-------------------------------------- 1 file changed, 84 insertions(+), 173 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6937f16f5ecb..f6bc78f4ed13 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -250,13 +250,6 @@ enum res_type { /* Used for OOM nofiier */ #define OOM_CONTROL (0) -/* - * The memcg_create_mutex will be held whenever a new cgroup is created. - * As a consequence, any change that needs to protect against new child cgroups - * appearing has to hold it as well. - */ -static DEFINE_MUTEX(memcg_create_mutex); - /* Some nice accessors for the vmpressure. */ struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg) { @@ -899,17 +892,8 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, if (css == &root->css) break; - if (css_tryget(css)) { - /* - * Make sure the memcg is initialized: - * mem_cgroup_css_online() orders the the - * initialization against setting the flag. 
- */ - if (smp_load_acquire(&memcg->initialized)) - break; - - css_put(css); - } + if (css_tryget(css)) + break; memcg = NULL; } @@ -2690,14 +2674,6 @@ static inline bool memcg_has_children(struct mem_cgroup *memcg) { bool ret; - /* - * The lock does not prevent addition or deletion of children, but - * it prevents a new child from being initialized based on this - * parent in css_online(), so it's enough to decide whether - * hierarchically inherited attributes can still be changed or not. - */ - lockdep_assert_held(&memcg_create_mutex); - rcu_read_lock(); ret = css_next_child(NULL, &memcg->css); rcu_read_unlock(); @@ -2760,10 +2736,8 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent); - mutex_lock(&memcg_create_mutex); - if (memcg->use_hierarchy == val) - goto out; + return 0; /* * If parent's use_hierarchy is set, we can't make any modifications @@ -2782,9 +2756,6 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, } else retval = -EINVAL; -out: - mutex_unlock(&memcg_create_mutex); - return retval; } @@ -2872,37 +2843,14 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, #ifndef CONFIG_SLOB static int memcg_online_kmem(struct mem_cgroup *memcg) { - int err = 0; int memcg_id; BUG_ON(memcg->kmemcg_id >= 0); BUG_ON(memcg->kmem_state); - /* - * For simplicity, we won't allow this to be disabled. It also can't - * be changed if the cgroup has children already, or if tasks had - * already joined. - * - * If tasks join before we set the limit, a person looking at - * kmem.usage_in_bytes will have no way to determine when it took - * place, which makes the value quite meaningless. - * - * After it first became limited, changes in the value of the limit are - * of course permitted. 
- */ - mutex_lock(&memcg_create_mutex); - if (cgroup_is_populated(memcg->css.cgroup) || - (memcg->use_hierarchy && memcg_has_children(memcg))) - err = -EBUSY; - mutex_unlock(&memcg_create_mutex); - if (err) - goto out; - memcg_id = memcg_alloc_cache_id(); - if (memcg_id < 0) { - err = memcg_id; - goto out; - } + if (memcg_id < 0) + return memcg_id; static_branch_inc(&memcg_kmem_enabled_key); /* @@ -2913,17 +2861,14 @@ static int memcg_online_kmem(struct mem_cgroup *memcg) */ memcg->kmemcg_id = memcg_id; memcg->kmem_state = KMEM_ONLINE; -out: - return err; + + return 0; } -static int memcg_propagate_kmem(struct mem_cgroup *memcg) +static int memcg_propagate_kmem(struct mem_cgroup *parent, + struct mem_cgroup *memcg) { int ret = 0; - struct mem_cgroup *parent = parent_mem_cgroup(memcg); - - if (!parent) - return 0; mutex_lock(&memcg_limit_mutex); /* @@ -2985,6 +2930,10 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) static void memcg_free_kmem(struct mem_cgroup *memcg) { + /* css_alloc() failed, offlining didn't happen */ + if (unlikely(memcg->kmem_state == KMEM_ONLINE)) + memcg_offline_kmem(memcg); + if (memcg->kmem_state == KMEM_ALLOCATED) { memcg_destroy_kmem_caches(memcg); static_branch_dec(&memcg_kmem_enabled_key); @@ -2992,7 +2941,11 @@ static void memcg_free_kmem(struct mem_cgroup *memcg) } } #else -static int memcg_propagate_kmem(struct mem_cgroup *memcg) +static int memcg_propagate_kmem(struct mem_cgroup *parent, struct mem_cgroup *memcg) +{ + return 0; +} +static int memcg_online_kmem(struct mem_cgroup *memcg) { return 0; } @@ -3007,11 +2960,16 @@ static void memcg_free_kmem(struct mem_cgroup *memcg) static int memcg_update_kmem_limit(struct mem_cgroup *memcg, unsigned long limit) { - int ret; + int ret = 0; mutex_lock(&memcg_limit_mutex); /* Top-level cgroup doesn't propagate from root */ if (!memcg_kmem_online(memcg)) { + if (cgroup_is_populated(memcg->css.cgroup) || + (memcg->use_hierarchy && memcg_has_children(memcg))) + ret = -EBUSY; + if (ret) + goto out; ret = memcg_online_kmem(memcg); if (ret) goto out; @@ -4167,90 +4125,44 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) kfree(memcg->nodeinfo[node]); } -static struct mem_cgroup *mem_cgroup_alloc(void) -{ - struct mem_cgroup *memcg; - size_t size; - - size = sizeof(struct mem_cgroup); - size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); - - memcg = kzalloc(size, GFP_KERNEL); - if (!memcg) - return NULL; - - memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); - if (!memcg->stat) - goto out_free; - - if (memcg_wb_domain_init(memcg, GFP_KERNEL)) - goto out_free_stat; - - return memcg; - -out_free_stat: - free_percpu(memcg->stat); -out_free: - kfree(memcg); - return NULL; -} - -/* - * At destroying mem_cgroup, references from swap_cgroup can remain. - * (scanning all at force_empty is too costly...) - * - * Instead of clearing all references at force_empty, we remember - * the number of reference from swap_cgroup and free mem_cgroup when - * it goes down to 0. - * - * Removal of cgroup itself succeeds regardless of refs from swap. 
- */ - -static void __mem_cgroup_free(struct mem_cgroup *memcg) +static void mem_cgroup_free(struct mem_cgroup *memcg) { int node; - cancel_work_sync(&memcg->high_work); - - mem_cgroup_remove_from_trees(memcg); - + memcg_wb_domain_exit(memcg); for_each_node(node) free_mem_cgroup_per_zone_info(memcg, node); - free_percpu(memcg->stat); - memcg_wb_domain_exit(memcg); kfree(memcg); } -static struct cgroup_subsys_state * __ref -mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) +static struct mem_cgroup *mem_cgroup_alloc(void) { struct mem_cgroup *memcg; - long error = -ENOMEM; + size_t size; int node; - memcg = mem_cgroup_alloc(); + size = sizeof(struct mem_cgroup); + size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); + + memcg = kzalloc(size, GFP_KERNEL); if (!memcg) - return ERR_PTR(error); + return NULL; + + memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); + if (!memcg->stat) + goto fail; for_each_node(node) if (alloc_mem_cgroup_per_zone_info(memcg, node)) - goto free_out; + goto fail; - /* root ? */ - if (parent_css == NULL) { - root_mem_cgroup = memcg; - page_counter_init(&memcg->memory, NULL); - memcg->high = PAGE_COUNTER_MAX; - memcg->soft_limit = PAGE_COUNTER_MAX; - page_counter_init(&memcg->memsw, NULL); - page_counter_init(&memcg->kmem, NULL); - } + if (memcg_wb_domain_init(memcg, GFP_KERNEL)) + goto fail; INIT_WORK(&memcg->high_work, high_work_func); memcg->last_scanned_node = MAX_NUMNODES; INIT_LIST_HEAD(&memcg->oom_notify); - memcg->move_charge_at_immigrate = 0; mutex_init(&memcg->thresholds_lock); spin_lock_init(&memcg->move_lock); vmpressure_init(&memcg->vmpressure); @@ -4263,48 +4175,37 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) #ifdef CONFIG_CGROUP_WRITEBACK INIT_LIST_HEAD(&memcg->cgwb_list); #endif - return &memcg->css; - -free_out: - __mem_cgroup_free(memcg); - return ERR_PTR(error); + return memcg; +fail: + mem_cgroup_free(memcg); + return NULL; } -static int -mem_cgroup_css_online(struct cgroup_subsys_state *css) +static struct cgroup_subsys_state * __ref +mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct mem_cgroup *parent = mem_cgroup_from_css(css->parent); - int ret; - - if (css->id > MEM_CGROUP_ID_MAX) - return -ENOSPC; - - if (!parent) - return 0; - - mutex_lock(&memcg_create_mutex); + struct mem_cgroup *parent = mem_cgroup_from_css(parent_css); + struct mem_cgroup *memcg; + long error = -ENOMEM; - memcg->use_hierarchy = parent->use_hierarchy; - memcg->oom_kill_disable = parent->oom_kill_disable; - memcg->swappiness = mem_cgroup_swappiness(parent); + memcg = mem_cgroup_alloc(); + if (!memcg) + return ERR_PTR(error); - if (parent->use_hierarchy) { + memcg->high = PAGE_COUNTER_MAX; + memcg->soft_limit = PAGE_COUNTER_MAX; + if (parent) { + memcg->swappiness = mem_cgroup_swappiness(parent); + memcg->oom_kill_disable = parent->oom_kill_disable; + } + if (parent && parent->use_hierarchy) { + memcg->use_hierarchy = true; page_counter_init(&memcg->memory, &parent->memory); - memcg->high = PAGE_COUNTER_MAX; - memcg->soft_limit = PAGE_COUNTER_MAX; page_counter_init(&memcg->memsw, &parent->memsw); page_counter_init(&memcg->kmem, &parent->kmem); page_counter_init(&memcg->tcpmem, &parent->tcpmem); - - /* - * No need to take a reference to the parent because cgroup - * core guarantees its existence. 
- */ } else { page_counter_init(&memcg->memory, NULL); - memcg->high = PAGE_COUNTER_MAX; - memcg->soft_limit = PAGE_COUNTER_MAX; page_counter_init(&memcg->memsw, NULL); page_counter_init(&memcg->kmem, NULL); page_counter_init(&memcg->tcpmem, NULL); @@ -4316,21 +4217,31 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) if (parent != root_mem_cgroup) memory_cgrp_subsys.broken_hierarchy = true; } - mutex_unlock(&memcg_create_mutex); - ret = memcg_propagate_kmem(memcg); - if (ret) - return ret; + /* The following stuff does not apply to the root */ + if (!parent) { + root_mem_cgroup = memcg; + return &memcg->css; + } + + error = memcg_propagate_kmem(parent, memcg); + if (error) + goto fail; if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) static_branch_inc(&memcg_sockets_enabled_key); - /* - * Make sure the memcg is initialized: mem_cgroup_iter() - * orders reading memcg->initialized against its callers - * reading the memcg members. - */ - smp_store_release(&memcg->initialized, 1); + return &memcg->css; +fail: + mem_cgroup_free(memcg); + return NULL; +} + +static int +mem_cgroup_css_online(struct cgroup_subsys_state *css) +{ + if (css->id > MEM_CGROUP_ID_MAX) + return -ENOSPC; return 0; } @@ -4352,10 +4263,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) } spin_unlock(&memcg->event_list_lock); - vmpressure_cleanup(&memcg->vmpressure); - memcg_offline_kmem(memcg); - wb_memcg_offline(memcg); } @@ -4376,8 +4284,11 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active) static_branch_dec(&memcg_sockets_enabled_key); + vmpressure_cleanup(&memcg->vmpressure); + cancel_work_sync(&memcg->high_work); + mem_cgroup_remove_from_trees(memcg); memcg_free_kmem(memcg); - __mem_cgroup_free(memcg); + mem_cgroup_free(memcg); } /** -- cgit From 37e84351198be087335ad2b2253b35c7cc76a5ad Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 20 Jan 2016 15:02:56 -0800 Subject: mm: memcontrol: charge swap to cgroup2 This patchset introduces swap accounting to cgroup2. This patch (of 7): In the legacy hierarchy we charge memsw, which is dubious, because: - memsw.limit must be >= memory.limit, so it is impossible to limit swap usage less than memory usage. Taking into account the fact that the primary limiting mechanism in the unified hierarchy is memory.high while memory.limit is either left unset or set to a very large value, moving memsw.limit knob to the unified hierarchy would effectively make it impossible to limit swap usage according to the user preference. - memsw.usage != memory.usage + swap.usage, because a page occupying both swap entry and a swap cache page is charged only once to memsw counter. As a result, it is possible to effectively eat up to memory.limit of memory pages *and* memsw.limit of swap entries, which looks unexpected. That said, we should provide a different swap limiting mechanism for cgroup2. This patch adds mem_cgroup->swap counter, which charges the actual number of swap entries used by a cgroup. It is only charged in the unified hierarchy, while the legacy hierarchy memsw logic is left intact. The swap usage can be monitored using new memory.swap.current file and limited using memory.swap.max. Note, to charge swap resource properly in the unified hierarchy, we have to make swap_entry_free uncharge swap only when ->usage reaches zero, not just ->count, i.e. when all references to a swap entry, including the one taken by swap cache, are gone. 
This is necessary, because otherwise swap-in could result in uncharging swap even if the page is still in swap cache and hence still occupies a swap entry. At the same time, this shouldn't break memsw counter logic, where a page is never charged twice for using both memory and swap, because in case of legacy hierarchy we uncharge swap on commit (see mem_cgroup_commit_charge). Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 118 ++++++++++++++++++++++++++++++++++++++++++++++++++++---- mm/shmem.c | 4 ++ mm/swap_state.c | 5 +++ mm/swapfile.c | 4 +- 4 files changed, 121 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f6bc78f4ed13..1ff552e3722b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1220,7 +1220,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) pr_cont(":"); for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { - if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account()) + if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) continue; pr_cont(" %s:%luKB", mem_cgroup_stat_names[i], K(mem_cgroup_read_stat(iter, i))); @@ -1259,9 +1259,12 @@ static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) limit = memcg->memory.limit; if (mem_cgroup_swappiness(memcg)) { unsigned long memsw_limit; + unsigned long swap_limit; memsw_limit = memcg->memsw.limit; - limit = min(limit + total_swap_pages, memsw_limit); + swap_limit = memcg->swap.limit; + swap_limit = min(swap_limit, (unsigned long)total_swap_pages); + limit = min(limit + swap_limit, memsw_limit); } return limit; } @@ -4201,11 +4204,13 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) if (parent && parent->use_hierarchy) { memcg->use_hierarchy = true; page_counter_init(&memcg->memory, &parent->memory); + page_counter_init(&memcg->swap, &parent->swap); page_counter_init(&memcg->memsw, &parent->memsw); page_counter_init(&memcg->kmem, &parent->kmem); page_counter_init(&memcg->tcpmem, &parent->tcpmem); } else { page_counter_init(&memcg->memory, NULL); + page_counter_init(&memcg->swap, NULL); page_counter_init(&memcg->memsw, NULL); page_counter_init(&memcg->kmem, NULL); page_counter_init(&memcg->tcpmem, NULL); @@ -5224,7 +5229,7 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, if (page->mem_cgroup) goto out; - if (do_memsw_account()) { + if (do_swap_account) { swp_entry_t ent = { .val = page_private(page), }; unsigned short id = lookup_swap_cgroup_id(ent); @@ -5677,26 +5682,66 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) memcg_check_events(memcg, page); } +/* + * mem_cgroup_try_charge_swap - try charging a swap entry + * @page: page being added to swap + * @entry: swap entry to charge + * + * Try to charge @entry to the memcg that @page belongs to. + * + * Returns 0 on success, -ENOMEM on failure. 
+ */ +int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) +{ + struct mem_cgroup *memcg; + struct page_counter *counter; + unsigned short oldid; + + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account) + return 0; + + memcg = page->mem_cgroup; + + /* Readahead page, never charged */ + if (!memcg) + return 0; + + if (!mem_cgroup_is_root(memcg) && + !page_counter_try_charge(&memcg->swap, 1, &counter)) + return -ENOMEM; + + oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); + VM_BUG_ON_PAGE(oldid, page); + mem_cgroup_swap_statistics(memcg, true); + + css_get(&memcg->css); + return 0; +} + /** * mem_cgroup_uncharge_swap - uncharge a swap entry * @entry: swap entry to uncharge * - * Drop the memsw charge associated with @entry. + * Drop the swap charge associated with @entry. */ void mem_cgroup_uncharge_swap(swp_entry_t entry) { struct mem_cgroup *memcg; unsigned short id; - if (!do_memsw_account()) + if (!do_swap_account) return; id = swap_cgroup_record(entry, 0); rcu_read_lock(); memcg = mem_cgroup_from_id(id); if (memcg) { - if (!mem_cgroup_is_root(memcg)) - page_counter_uncharge(&memcg->memsw, 1); + if (!mem_cgroup_is_root(memcg)) { + if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) + page_counter_uncharge(&memcg->swap, 1); + else + page_counter_uncharge(&memcg->memsw, 1); + } mem_cgroup_swap_statistics(memcg, false); css_put(&memcg->css); } @@ -5720,6 +5765,63 @@ static int __init enable_swap_account(char *s) } __setup("swapaccount=", enable_swap_account); +static u64 swap_current_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE; +} + +static int swap_max_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + unsigned long max = READ_ONCE(memcg->swap.limit); + + if (max == PAGE_COUNTER_MAX) + seq_puts(m, "max\n"); + else + seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE); + + return 0; +} + +static ssize_t swap_max_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long max; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "max", &max); + if (err) + return err; + + mutex_lock(&memcg_limit_mutex); + err = page_counter_limit(&memcg->swap, max); + mutex_unlock(&memcg_limit_mutex); + if (err) + return err; + + return nbytes; +} + +static struct cftype swap_files[] = { + { + .name = "swap.current", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = swap_current_read, + }, + { + .name = "swap.max", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = swap_max_show, + .write = swap_max_write, + }, + { } /* terminate */ +}; + static struct cftype memsw_cgroup_files[] = { { .name = "memsw.usage_in_bytes", @@ -5751,6 +5853,8 @@ static int __init mem_cgroup_swap_init(void) { if (!mem_cgroup_disabled() && really_do_swap_account) { do_swap_account = 1; + WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, + swap_files)); WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_cgroup_files)); } diff --git a/mm/shmem.c b/mm/shmem.c index b98e1011858c..fa2ceb2d2655 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -912,6 +912,9 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) if (!swap.val) goto redirty; + if (mem_cgroup_try_charge_swap(page, swap)) + goto free_swap; + /* * Add inode to shmem_unuse()'s list of swapped-out inodes, * if it's not already there. 
Do it now before the page is @@ -940,6 +943,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) } mutex_unlock(&shmem_swaplist_mutex); +free_swap: swapcache_free(swap); redirty: set_page_dirty(page); diff --git a/mm/swap_state.c b/mm/swap_state.c index 676ff2991380..69cb2464e7dc 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -170,6 +170,11 @@ int add_to_swap(struct page *page, struct list_head *list) if (!entry.val) return 0; + if (mem_cgroup_try_charge_swap(page, entry)) { + swapcache_free(entry); + return 0; + } + if (unlikely(PageTransHuge(page))) if (unlikely(split_huge_page_to_list(page, list))) { swapcache_free(entry); diff --git a/mm/swapfile.c b/mm/swapfile.c index 2bb30aa3a412..22a7a1fc1e47 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -785,14 +785,12 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, count--; } - if (!count) - mem_cgroup_uncharge_swap(entry); - usage = count | has_cache; p->swap_map[offset] = usage; /* free if no reference */ if (!usage) { + mem_cgroup_uncharge_swap(entry); dec_cluster_info_page(p, p->cluster_info, offset); if (offset < p->lowest_bit) p->lowest_bit = offset; -- cgit From 3337767850b490eec5ca822f871241c981664737 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 20 Jan 2016 15:02:59 -0800 Subject: mm: vmscan: pass memcg to get_scan_count() memcg will come in handy in get_scan_count(). It can already be used for getting swappiness immediately in get_scan_count() instead of passing it around. The following patches will add more memcg-related values, which will be used there. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 05dd182f04fd..014ff89a4aa5 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1966,10 +1966,11 @@ enum scan_balance { * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan */ -static void get_scan_count(struct lruvec *lruvec, int swappiness, +static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, struct scan_control *sc, unsigned long *nr, unsigned long *lru_pages) { + int swappiness = mem_cgroup_swappiness(memcg); struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; u64 fraction[2]; u64 denominator = 0; /* gcc */ @@ -2193,9 +2194,10 @@ static inline void init_tlb_ubc(void) /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 
*/ -static void shrink_lruvec(struct lruvec *lruvec, int swappiness, - struct scan_control *sc, unsigned long *lru_pages) +static void shrink_zone_memcg(struct zone *zone, struct mem_cgroup *memcg, + struct scan_control *sc, unsigned long *lru_pages) { + struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); unsigned long nr[NR_LRU_LISTS]; unsigned long targets[NR_LRU_LISTS]; unsigned long nr_to_scan; @@ -2205,7 +2207,7 @@ static void shrink_lruvec(struct lruvec *lruvec, int swappiness, struct blk_plug plug; bool scan_adjusted; - get_scan_count(lruvec, swappiness, sc, nr, lru_pages); + get_scan_count(lruvec, memcg, sc, nr, lru_pages); /* Record the original scan target for proportional adjustments later */ memcpy(targets, nr, sizeof(nr)); @@ -2409,8 +2411,6 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, unsigned long lru_pages; unsigned long reclaimed; unsigned long scanned; - struct lruvec *lruvec; - int swappiness; if (mem_cgroup_low(root, memcg)) { if (!sc->may_thrash) @@ -2418,12 +2418,10 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, mem_cgroup_events(memcg, MEMCG_LOW, 1); } - lruvec = mem_cgroup_zone_lruvec(zone, memcg); - swappiness = mem_cgroup_swappiness(memcg); reclaimed = sc->nr_reclaimed; scanned = sc->nr_scanned; - shrink_lruvec(lruvec, swappiness, sc, &lru_pages); + shrink_zone_memcg(zone, memcg, sc, &lru_pages); zone_lru_pages += lru_pages; if (memcg && is_classzone) @@ -2893,8 +2891,6 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, .may_unmap = 1, .may_swap = !noswap, }; - struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); - int swappiness = mem_cgroup_swappiness(memcg); unsigned long lru_pages; sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | @@ -2911,7 +2907,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, * will pick up pages from other mem cgroup's as well. We hack * the priority and make it zero. */ - shrink_lruvec(lruvec, swappiness, &sc, &lru_pages); + shrink_zone_memcg(zone, memcg, &sc, &lru_pages); trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); -- cgit From eb01aaab43084f1c919ce66183fea005033351b9 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 20 Jan 2016 15:03:02 -0800 Subject: mm: memcontrol: replace mem_cgroup_lruvec_online with mem_cgroup_online mem_cgroup_lruvec_online() takes lruvec, but it only needs memcg. Since get_scan_count(), which is the only user of this function, now possesses pointer to memcg, let's pass memcg directly to mem_cgroup_online() instead of picking it out of lruvec and rename the function accordingly. 
Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 014ff89a4aa5..9a8556e49cd9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1997,7 +1997,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, if (current_is_kswapd()) { if (!zone_reclaimable(zone)) force_scan = true; - if (!mem_cgroup_lruvec_online(lruvec)) + if (!mem_cgroup_online(memcg)) force_scan = true; } if (!global_reclaim(sc)) -- cgit From d8b38438a0bcb362c396f49d8279ef7b505917f4 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 20 Jan 2016 15:03:07 -0800 Subject: mm: vmscan: do not scan anon pages if memcg swap limit is hit We don't scan anonymous memory if we ran out of swap, neither should we do it in case memcg swap limit is hit, because swap out is impossible anyway. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 13 +++++++++++++ mm/vmscan.c | 2 +- 2 files changed, 14 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1ff552e3722b..bcb38718e174 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5748,6 +5748,19 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry) rcu_read_unlock(); } +long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) +{ + long nr_swap_pages = get_nr_swap_pages(); + + if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return nr_swap_pages; + for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) + nr_swap_pages = min_t(long, nr_swap_pages, + READ_ONCE(memcg->swap.limit) - + page_counter_read(&memcg->swap)); + return nr_swap_pages; +} + /* for remember boot option*/ #ifdef CONFIG_MEMCG_SWAP_ENABLED static int really_do_swap_account __initdata = 1; diff --git a/mm/vmscan.c b/mm/vmscan.c index 9a8556e49cd9..3be5f9db6c42 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2004,7 +2004,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, force_scan = true; /* If we have no swap space, do not bother scanning anon pages. */ - if (!sc->may_swap || (get_nr_swap_pages() <= 0)) { + if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) { scan_balance = SCAN_FILE; goto out; } -- cgit From 5ccc5abaaf6f9242cc63342c5286990233f392fa Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 20 Jan 2016 15:03:10 -0800 Subject: mm: free swap cache aggressively if memcg swap is full Swap cache pages are freed aggressively if swap is nearly full (>50% currently), because otherwise we are likely to stop scanning anonymous when we near the swap limit even if there is plenty of freeable swap cache pages. We should follow the same trend in case of memory cgroup, which has its own swap limit. 
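Editorial aside, not part of the patch: a minimal userspace sketch of the "more than half full" rule described above. In the patch itself this condition is checked per cgroup, walking up the hierarchy, as page_counter_read(&memcg->swap) * 2 >= memcg->swap.limit; the 1 GiB limit and 600 MiB usage below are made-up numbers used only to show the arithmetic.

/*
 * Illustration only: the threshold at which swap cache starts being
 * reclaimed aggressively, applied to a single (hypothetical) cgroup.
 */
#include <stdbool.h>
#include <stdio.h>

static bool swap_considered_full(unsigned long long usage, unsigned long long limit)
{
	/* Same condition the patch applies per cgroup: usage * 2 >= limit */
	return usage * 2 >= limit;
}

int main(void)
{
	unsigned long long limit = 1024ULL << 20;	/* memory.swap.max = 1 GiB (hypothetical) */
	unsigned long long usage = 600ULL << 20;	/* current swap usage (hypothetical) */

	printf("free swap cache aggressively: %s\n",
	       swap_considered_full(usage, limit) ? "yes" : "no");
	return 0;
}

With these numbers the group is past the halfway mark, so swap cache pages would be freed eagerly even though the global swap pool may still be mostly empty.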
Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 22 ++++++++++++++++++++++ mm/memory.c | 3 ++- mm/swapfile.c | 2 +- mm/vmscan.c | 2 +- 4 files changed, 26 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index bcb38718e174..6a0007965e31 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5761,6 +5761,28 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) return nr_swap_pages; } +bool mem_cgroup_swap_full(struct page *page) +{ + struct mem_cgroup *memcg; + + VM_BUG_ON_PAGE(!PageLocked(page), page); + + if (vm_swap_full()) + return true; + if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return false; + + memcg = page->mem_cgroup; + if (!memcg) + return false; + + for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) + if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.limit) + return true; + + return false; +} + /* for remember boot option*/ #ifdef CONFIG_MEMCG_SWAP_ENABLED static int really_do_swap_account __initdata = 1; diff --git a/mm/memory.c b/mm/memory.c index ff17850a52d9..30991f83d0bf 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2582,7 +2582,8 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, } swap_free(entry); - if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) + if (mem_cgroup_swap_full(page) || + (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) try_to_free_swap(page); unlock_page(page); if (page != swapcache) { diff --git a/mm/swapfile.c b/mm/swapfile.c index 22a7a1fc1e47..c43f654a7b64 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1006,7 +1006,7 @@ int free_swap_and_cache(swp_entry_t entry) * Also recheck PageSwapCache now page is locked (above). */ if (PageSwapCache(page) && !PageWriteback(page) && - (!page_mapped(page) || vm_swap_full())) { + (!page_mapped(page) || mem_cgroup_swap_full(page))) { delete_from_swap_cache(page); SetPageDirty(page); } diff --git a/mm/vmscan.c b/mm/vmscan.c index 3be5f9db6c42..bd620b65db52 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1214,7 +1214,7 @@ cull_mlocked: activate_locked: /* Not a candidate for swapping, so reclaim swap space. */ - if (PageSwapCache(page) && vm_swap_full()) + if (PageSwapCache(page) && mem_cgroup_swap_full(page)) try_to_free_swap(page); VM_BUG_ON_PAGE(PageActive(page), page); SetPageActive(page); -- cgit From 44b7a8d33d666268062e0f725d5f14813a63a6ea Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Jan 2016 15:03:16 -0800 Subject: mm: memcontrol: do not uncharge old page in page cache replacement Changing page->mem_cgroup of a live page is tricky and fragile. In particular, the memcg writeback code relies on that mapping being stable and users of mem_cgroup_replace_page() not overlapping with dirtyable inodes. Page cache replacement doesn't have to do that, though. Instead of being clever and transferring the charge from the old page to the new, force-charge the new page and leave the old page alone. A temporary overcharge won't matter in practice, and the old page is going to be freed shortly after this anyway. And this is not performance critical. 
Signed-off-by: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6a0007965e31..bf35bff282fc 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -366,13 +366,6 @@ mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) * * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup * is returned. - * - * XXX: The above description of behavior on the default hierarchy isn't - * strictly true yet as replace_page_cache_page() can modify the - * association before @page is released even on the default hierarchy; - * however, the current and planned usages don't mix the the two functions - * and replace_page_cache_page() will soon be updated to make the invariant - * actually true. */ struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page) { @@ -5464,7 +5457,8 @@ void mem_cgroup_uncharge_list(struct list_head *page_list) void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage) { struct mem_cgroup *memcg; - int isolated; + unsigned int nr_pages; + bool compound; VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); @@ -5484,11 +5478,21 @@ void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage) if (!memcg) return; - lock_page_lru(oldpage, &isolated); - oldpage->mem_cgroup = NULL; - unlock_page_lru(oldpage, isolated); + /* Force-charge the new page. The old one will be freed soon */ + compound = PageTransHuge(newpage); + nr_pages = compound ? hpage_nr_pages(newpage) : 1; + + page_counter_charge(&memcg->memory, nr_pages); + if (do_memsw_account()) + page_counter_charge(&memcg->memsw, nr_pages); + css_get_many(&memcg->css, nr_pages); commit_charge(newpage, memcg, true); + + local_irq_disable(); + mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages); + memcg_check_events(memcg, newpage); + local_irq_enable(); } DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key); -- cgit From 587d9f726aaec52157e4156e50363dbe6cb82bdb Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Jan 2016 15:03:19 -0800 Subject: mm: memcontrol: basic memory statistics in cgroup2 memory controller Provide a cgroup2 memory.stat that provides statistics on LRU memory and fault event counters. More consumers and breakdowns will follow. 
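Editorial aside, not part of the patch: a small userspace reader for the new memory.stat file, whose fields (anon, file, the per-LRU counts, pgfault, pgmajfault, ...) are defined in the diff below. It assumes the unified hierarchy is mounted at /sys/fs/cgroup and that a cgroup named "mygroup" exists; both the mount point and the group name are hypothetical.

/*
 * Illustration only: dump the cgroup2 memory.stat counters. Memory
 * counters are reported in bytes, pgfault/pgmajfault are event counts.
 */
#include <stdio.h>

int main(void)
{
	char key[64];
	unsigned long long val;
	FILE *f = fopen("/sys/fs/cgroup/mygroup/memory.stat", "r");

	if (!f) {
		perror("memory.stat");
		return 1;
	}
	/* Each line has the form "<counter> <value>", e.g. "anon 4096" */
	while (fscanf(f, "%63s %llu", key, &val) == 2)
		printf("%-16s %llu\n", key, val);
	fclose(f);
	return 0;
}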
Signed-off-by: Johannes Weiner Acked-by: Vladimir Davydov Cc: Michal Hocko Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index bf35bff282fc..98f4109bff6c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2767,6 +2767,18 @@ static unsigned long tree_stat(struct mem_cgroup *memcg, return val; } +static unsigned long tree_events(struct mem_cgroup *memcg, + enum mem_cgroup_events_index idx) +{ + struct mem_cgroup *iter; + unsigned long val = 0; + + for_each_mem_cgroup_tree(iter, memcg) + val += mem_cgroup_read_events(iter, idx); + + return val; +} + static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) { unsigned long val; @@ -5096,6 +5108,57 @@ static int memory_events_show(struct seq_file *m, void *v) return 0; } +static int memory_stat_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + int i; + + /* + * Provide statistics on the state of the memory subsystem as + * well as cumulative event counters that show past behavior. + * + * This list is ordered following a combination of these gradients: + * 1) generic big picture -> specifics and details + * 2) reflecting userspace activity -> reflecting kernel heuristics + * + * Current memory state: + */ + + seq_printf(m, "anon %llu\n", + (u64)tree_stat(memcg, MEM_CGROUP_STAT_RSS) * PAGE_SIZE); + seq_printf(m, "file %llu\n", + (u64)tree_stat(memcg, MEM_CGROUP_STAT_CACHE) * PAGE_SIZE); + + seq_printf(m, "file_mapped %llu\n", + (u64)tree_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED) * + PAGE_SIZE); + seq_printf(m, "file_dirty %llu\n", + (u64)tree_stat(memcg, MEM_CGROUP_STAT_DIRTY) * + PAGE_SIZE); + seq_printf(m, "file_writeback %llu\n", + (u64)tree_stat(memcg, MEM_CGROUP_STAT_WRITEBACK) * + PAGE_SIZE); + + for (i = 0; i < NR_LRU_LISTS; i++) { + struct mem_cgroup *mi; + unsigned long val = 0; + + for_each_mem_cgroup_tree(mi, memcg) + val += mem_cgroup_nr_lru_pages(mi, BIT(i)); + seq_printf(m, "%s %llu\n", + mem_cgroup_lru_names[i], (u64)val * PAGE_SIZE); + } + + /* Accumulated memory events */ + + seq_printf(m, "pgfault %lu\n", + tree_events(memcg, MEM_CGROUP_EVENTS_PGFAULT)); + seq_printf(m, "pgmajfault %lu\n", + tree_events(memcg, MEM_CGROUP_EVENTS_PGMAJFAULT)); + + return 0; +} + static struct cftype memory_files[] = { { .name = "current", @@ -5126,6 +5189,11 @@ static struct cftype memory_files[] = { .file_offset = offsetof(struct mem_cgroup, events_file), .seq_show = memory_events_show, }, + { + .name = "stat", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_stat_show, + }, { } /* terminate */ }; -- cgit From b2807f07f4f87362925b8a5b8cbb7b624da10f03 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Jan 2016 15:03:22 -0800 Subject: mm: memcontrol: add "sock" to cgroup2 memory.stat Provide statistics on how much of a cgroup's memory footprint is made up of socket buffers from network connections owned by the group. 
Signed-off-by: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 98f4109bff6c..ca052f2a4a0b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5128,6 +5128,8 @@ static int memory_stat_show(struct seq_file *m, void *v) (u64)tree_stat(memcg, MEM_CGROUP_STAT_RSS) * PAGE_SIZE); seq_printf(m, "file %llu\n", (u64)tree_stat(memcg, MEM_CGROUP_STAT_CACHE) * PAGE_SIZE); + seq_printf(m, "sock %llu\n", + (u64)tree_stat(memcg, MEMCG_SOCK) * PAGE_SIZE); seq_printf(m, "file_mapped %llu\n", (u64)tree_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED) * @@ -5631,6 +5633,8 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) if (in_softirq()) gfp_mask = GFP_NOWAIT; + this_cpu_add(memcg->stat->count[MEMCG_SOCK], nr_pages); + if (try_charge(memcg, gfp_mask, nr_pages) == 0) return true; @@ -5650,6 +5654,8 @@ void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) return; } + this_cpu_sub(memcg->stat->count[MEMCG_SOCK], nr_pages); + page_counter_uncharge(&memcg->memory, nr_pages); css_put_many(&memcg->css, nr_pages); } -- cgit
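Closing editorial aside, not part of any patch: a sketch of how the memory.swap.max and memory.swap.current files, added by the "mm: memcontrol: charge swap to cgroup2" patch earlier in this section, look from userspace. It assumes the unified hierarchy is mounted at /sys/fs/cgroup, swap accounting is enabled, and a cgroup named "mygroup" exists; the names and the 256 MiB limit are made up for illustration.

/*
 * Illustration only: set a swap limit for a cgroup and read back its
 * current swap usage. Both files speak bytes; memory.swap.max also
 * accepts the literal string "max".
 */
#include <stdio.h>

int main(void)
{
	FILE *f;
	unsigned long long cur;

	/* Cap the group's swap usage at 256 MiB */
	f = fopen("/sys/fs/cgroup/mygroup/memory.swap.max", "w");
	if (!f) {
		perror("memory.swap.max");
		return 1;
	}
	fprintf(f, "%llu\n", 256ULL << 20);
	fclose(f);

	/* Current swap usage of the group, in bytes */
	f = fopen("/sys/fs/cgroup/mygroup/memory.swap.current", "r");
	if (!f || fscanf(f, "%llu", &cur) != 1) {
		perror("memory.swap.current");
		return 1;
	}
	fclose(f);

	printf("swap usage: %llu bytes\n", cur);
	return 0;
}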