From 086f694a75e1a283a11f9afa7bae258f30892b81 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Tue, 22 Mar 2022 14:40:07 -0700 Subject: memcg: replace in_interrupt() with !in_task() Replace the deprecated in_interrupt() with !in_task() because in_interrupt() returns true for BH disabled even if the call happens in the task context. in_task() is the right interface to differentiate task context from NMI, hard IRQ and softirq contexts. Link: https://lkml.kernel.org/r/20220127162636.3461256-1-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Michal Hocko Cc: Vasily Averin Cc: Johannes Weiner Cc: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 36e9f38c919d..209e66893da6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2688,7 +2688,7 @@ done_restock: READ_ONCE(memcg->swap.high); /* Don't bother a random interrupted task */ - if (in_interrupt()) { + if (!in_task()) { if (mem_high) { schedule_work(&memcg->high_work); break; @@ -6968,7 +6968,7 @@ void mem_cgroup_sk_alloc(struct sock *sk) return; /* Do not associate the sock with unrelated interrupted task's memcg. */ - if (in_interrupt()) + if (!in_task()) return; rcu_read_lock(); -- cgit From a8c49af3be5f0b4e105ef678bcf14ef102c270be Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 22 Mar 2022 14:40:10 -0700 Subject: memcg: add per-memcg total kernel memory stat Currently memcg stats show several types of kernel memory: kernel stack, page tables, sock, vmalloc, and slab. However, there are other allocations with __GFP_ACCOUNT (or supersets such as GFP_KERNEL_ACCOUNT) that are not accounted in any of those stats, a few examples are: - various kvm allocations (e.g. 
allocated pages to create vcpus) - io_uring - tmp_page in pipes during pipe_write() - bpf ringbuffers - unix sockets Keeping track of the total kernel memory is essential for the ease of migration from cgroup v1 to v2 as there are large discrepancies between v1's kmem.usage_in_bytes and the sum of the available kernel memory stats in v2. Adding separate memcg stats for all __GFP_ACCOUNT kernel allocations is an impractical maintenance burden as there are a lot of those all over the kernel code, with more use cases likely to show up in the future. Therefore, add a "kernel" memcg stat that is analogous to the kmem page counter, with added benefits such as using rstat infrastructure which aggregates stats more efficiently. Additionally, this provides a lighter alternative in case the legacy kmem is deprecated in the future. [yosryahmed@google.com: v2] Link: https://lkml.kernel.org/r/20220203193856.972500-1-yosryahmed@google.com Link: https://lkml.kernel.org/r/20220201200823.3283171-1-yosryahmed@google.com Signed-off-by: Yosry Ahmed Acked-by: Shakeel Butt Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 209e66893da6..e64a276837b0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1371,6 +1371,7 @@ struct memory_stat { static const struct memory_stat memory_stats[] = { { "anon", NR_ANON_MAPPED }, { "file", NR_FILE_PAGES }, + { "kernel", MEMCG_KMEM }, { "kernel_stack", NR_KERNEL_STACK_KB }, { "pagetables", NR_PAGETABLE }, { "percpu", MEMCG_PERCPU_B }, @@ -2114,6 +2115,7 @@ static DEFINE_MUTEX(percpu_charge_mutex); static void drain_obj_stock(struct obj_stock *stock); static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, struct mem_cgroup *root_memcg); +static void memcg_account_kmem(struct mem_cgroup *memcg, int 
nr_pages); #else static inline void drain_obj_stock(struct obj_stock *stock) @@ -2124,6 +2126,9 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, { return false; } +static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages) +{ +} #endif /** @@ -2979,6 +2984,18 @@ static void memcg_free_cache_id(int id) ida_simple_remove(&memcg_cache_ida, id); } +static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages) +{ + mod_memcg_state(memcg, MEMCG_KMEM, nr_pages); + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { + if (nr_pages > 0) + page_counter_charge(&memcg->kmem, nr_pages); + else + page_counter_uncharge(&memcg->kmem, -nr_pages); + } +} + + /* * obj_cgroup_uncharge_pages: uncharge a number of kernel pages from a objcg * @objcg: object cgroup to uncharge @@ -2991,8 +3008,7 @@ static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg, memcg = get_mem_cgroup_from_objcg(objcg); - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) - page_counter_uncharge(&memcg->kmem, nr_pages); + memcg_account_kmem(memcg, -nr_pages); refill_stock(memcg, nr_pages); css_put(&memcg->css); @@ -3018,8 +3034,7 @@ static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp, if (ret) goto out; - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) - page_counter_charge(&memcg->kmem, nr_pages); + memcg_account_kmem(memcg, nr_pages); out: css_put(&memcg->css); @@ -6801,8 +6816,8 @@ static void uncharge_batch(const struct uncharge_gather *ug) page_counter_uncharge(&ug->memcg->memory, ug->nr_memory); if (do_memsw_account()) page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory); - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem) - page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem); + if (ug->nr_kmem) + memcg_account_kmem(ug->memcg, -ug->nr_kmem); memcg_oom_recover(ug->memcg); } -- cgit From c857266dca8fe7af3d51343d0f4edf3ba4dd1542 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 22 Mar 2022 14:40:13 -0700 Subject: mm/memcg: 
mem_cgroup_per_node is already set to 0 on allocation kzalloc_node() would set data to 0, so it's not necessary to set it again. Link: https://lkml.kernel.org/r/20220201004643.8391-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Muchun Song Acked-by: Michal Hocko Reviewed-by: Roman Gushchin Reviewed-by: Mike Rapoport Reviewed-by: Shakeel Butt Cc: Roman Gushchin Cc: Johannes Weiner Cc: Suren Baghdasaryan Cc: Vladimir Davydov Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e64a276837b0..5318c3fb92f4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5105,8 +5105,6 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) } lruvec_init(&pn->lruvec); - pn->usage_in_excess = 0; - pn->on_tree = false; pn->memcg = memcg; memcg->nodeinfo[node] = pn; -- cgit From becdf89d776c8f59e66071f567effa150068c338 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Tue, 22 Mar 2022 14:40:19 -0700 Subject: memcg: refactor mem_cgroup_oom Patch series "memcg: robust enforcement of memory.high", v2. Due to the semantics of memory.high enforcement i.e. throttle the workload without oom-kill, we are trying to use it for right sizing the workloads in our production environment. However we observed the mechanism fails for some specific applications which do big chunks of allocations in a single syscall. This failure is due to the limitation of the memory.high enforcement's current implementation. This patch series solves this issue by enforcing the memory.high synchronously if the current process has accumulated a large amount of high overcharge. This patch (of 4): The function mem_cgroup_oom returns an enum which has four possible values but the caller does not care about such values and only cares if the return value is OOM_SUCCESS or not. 
So, remove the enum altogether and make mem_cgroup_oom returns a simple bool. Link: https://lkml.kernel.org/r/20220211064917.2028469-1-shakeelb@google.com Link: https://lkml.kernel.org/r/20220211064917.2028469-2-shakeelb@google.com Signed-off-by: Shakeel Butt Reviewed-by: Roman Gushchin Cc: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Chris Down Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 44 +++++++++++++++++--------------------------- 1 file changed, 17 insertions(+), 27 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5318c3fb92f4..0451cc06b157 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1796,20 +1796,16 @@ static void memcg_oom_recover(struct mem_cgroup *memcg) __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); } -enum oom_status { - OOM_SUCCESS, - OOM_FAILED, - OOM_ASYNC, - OOM_SKIPPED -}; - -static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) +/* + * Returns true if successfully killed one or more processes. Though in some + * corner cases it can return true even without killing any process. + */ +static bool mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) { - enum oom_status ret; - bool locked; + bool locked, ret; if (order > PAGE_ALLOC_COSTLY_ORDER) - return OOM_SKIPPED; + return false; memcg_memory_event(memcg, MEMCG_OOM); @@ -1832,14 +1828,13 @@ static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int * victim and then we have to bail out from the charge path. 
*/ if (memcg->oom_kill_disable) { - if (!current->in_user_fault) - return OOM_SKIPPED; - css_get(&memcg->css); - current->memcg_in_oom = memcg; - current->memcg_oom_gfp_mask = mask; - current->memcg_oom_order = order; - - return OOM_ASYNC; + if (current->in_user_fault) { + css_get(&memcg->css); + current->memcg_in_oom = memcg; + current->memcg_oom_gfp_mask = mask; + current->memcg_oom_order = order; + } + return false; } mem_cgroup_mark_under_oom(memcg); @@ -1850,10 +1845,7 @@ static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int mem_cgroup_oom_notify(memcg); mem_cgroup_unmark_under_oom(memcg); - if (mem_cgroup_out_of_memory(memcg, mask, order)) - ret = OOM_SUCCESS; - else - ret = OOM_FAILED; + ret = mem_cgroup_out_of_memory(memcg, mask, order); if (locked) mem_cgroup_oom_unlock(memcg); @@ -2546,7 +2538,6 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, int nr_retries = MAX_RECLAIM_RETRIES; struct mem_cgroup *mem_over_limit; struct page_counter *counter; - enum oom_status oom_status; unsigned long nr_reclaimed; bool passed_oom = false; bool may_swap = true; @@ -2649,9 +2640,8 @@ retry: * a forward progress or bypass the charge if the oom killer * couldn't make any progress. */ - oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask, - get_order(nr_pages * PAGE_SIZE)); - if (oom_status == OOM_SUCCESS) { + if (mem_cgroup_oom(mem_over_limit, gfp_mask, + get_order(nr_pages * PAGE_SIZE))) { passed_oom = true; nr_retries = MAX_RECLAIM_RETRIES; goto retry; -- cgit From 1461e8c2b6af89e9662b5cbb714d7cb80baae3ca Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Tue, 22 Mar 2022 14:40:22 -0700 Subject: memcg: unify force charging conditions Currently the kernel force charges the allocations which have __GFP_HIGH flag without triggering the memory reclaim. 
__GFP_HIGH indicates that the caller is high priority and since commit 869712fd3de5 ("mm: memcontrol: fix network errors from failing __GFP_ATOMIC charges") the kernel lets such allocations do force charging. Please note that __GFP_ATOMIC has been replaced by __GFP_HIGH. __GFP_HIGH does not tell if the caller can block or can trigger reclaim. There are separate checks to determine that. So, there is no need to skip reclaiming for __GFP_HIGH allocations. So, handle __GFP_HIGH together with __GFP_NOFAIL which also does force charging. Please note that this is a noop change as there are no __GFP_HIGH allocators in the kernel which also have __GFP_ACCOUNT (or SLAB_ACCOUNT) and does not allow reclaim for now. Link: https://lkml.kernel.org/r/20220211064917.2028469-3-shakeelb@google.com Signed-off-by: Shakeel Butt Reviewed-by: Roman Gushchin Cc: Roman Gushchin Cc: Chris Down Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0451cc06b157..0e8a58d6e374 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2565,15 +2565,6 @@ retry: goto retry; } - /* - * Memcg doesn't have a dedicated reserve for atomic - * allocations. But like the global atomic pool, we need to - * put the burden of reclaim on regular allocation requests - * and let these go through as privileged allocations. - */ - if (gfp_mask & __GFP_ATOMIC) - goto force; - /* * Prevent unbounded recursion when reclaim operations need to * allocate memory. This might exceed the limits temporarily, @@ -2647,7 +2638,13 @@ retry: goto retry; } nomem: - if (!(gfp_mask & __GFP_NOFAIL)) + /* + * Memcg doesn't have a dedicated reserve for atomic + * allocations. 
But like the global atomic pool, we need to + * put the burden of reclaim on regular allocation requests + * and let these go through as privileged allocations. + */ + if (!(gfp_mask & (__GFP_NOFAIL | __GFP_HIGH))) return -ENOMEM; force: /* -- cgit From c9afe31ec443ea6d81d556159abc7ef0bc462ac0 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Tue, 22 Mar 2022 14:40:28 -0700 Subject: memcg: synchronously enforce memory.high for large overcharges The high limit is used to throttle the workload without invoking the oom-killer. Recently we tried to use the high limit to right size our internal workloads. More specifically, dynamically adjusting the limits of the workload without letting the workload get oom-killed. However due to the limitation of the implementation of high limit enforcement, we observed the mechanism fails for some real workloads. The high limit is enforced on return-to-userspace i.e. the kernel lets the usage go over the limit and when the execution returns to userspace, the high reclaim is triggered and the process can get throttled as well. However this mechanism fails for workloads which do large allocations in a single kernel entry e.g. applications that mlock() a large chunk of memory in a single syscall. Such applications bypass the high limit and can trigger the oom-killer. To make high limit enforcement more robust, this patch makes the limit enforcement synchronous only if the accumulated overcharge becomes larger than MEMCG_CHARGE_BATCH. So, most of the allocations would still be throttled on the return-to-userspace path but only the extreme allocations which accumulate a large amount of overcharge without returning to the userspace will be throttled synchronously. The value MEMCG_CHARGE_BATCH is a bit arbitrary but most other places in the memcg codebase use this constant, therefore for now use the same one. 
Link: https://lkml.kernel.org/r/20220211064917.2028469-5-shakeelb@google.com Signed-off-by: Shakeel Butt Reviewed-by: Roman Gushchin Acked-by: Chris Down Cc: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0e8a58d6e374..17398e7601f6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2704,6 +2704,11 @@ done_restock: } } while ((memcg = parent_mem_cgroup(memcg))); + if (current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH && + !(current->flags & PF_MEMALLOC) && + gfpflags_allow_blocking(gfp_mask)) { + mem_cgroup_handle_over_high(); + } return 0; } -- cgit From 460a79e18842caca6fa0c415de4a3ac1e671ac50 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 22 Mar 2022 14:40:31 -0700 Subject: mm/memcontrol: return 1 from cgroup.memory __setup() handler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit __setup() handlers should return 1 if the command line option is handled and 0 if not (or maybe never return 0; it just pollutes init's environment). The only reason that this particular __setup() handler does not pollute init's environment is that the setup string contains a '.', as in "cgroup.memory". This causes init/main.c::unknown_bootoption() to consider it to be an "Unused module parameter" and ignore it. (This is for parsing of loadable module parameters any time after kernel init.) Otherwise the string "cgroup.memory=whatever" would be added to init's environment strings. Instead of relying on this '.' quirk, just return 1 to indicate that the boot option has been handled. 
Note that there is no warning message if someone enters: cgroup.memory=anything_invalid Link: https://lkml.kernel.org/r/20220222005811.10672-1-rdunlap@infradead.org Fixes: f7e1cb6ec51b0 ("mm: memcontrol: account socket memory in unified hierarchy memory controller") Signed-off-by: Randy Dunlap Reported-by: Igor Zhbanov Link: lore.kernel.org/r/64644a2f-4a20-bab3-1e15-3b2cdd0defe3@omprussia.ru Reviewed-by: Michal Koutný Cc: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 17398e7601f6..416a608939f5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -7058,7 +7058,7 @@ static int __init cgroup_memory(char *s) if (!strcmp(token, "nokmem")) cgroup_memory_nokmem = true; } - return 0; + return 1; } __setup("cgroup.memory=", cgroup_memory); -- cgit From fead2b869764f89d524b79dc8862e61d5191be55 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 22 Mar 2022 14:40:35 -0700 Subject: mm/memcg: revert ("mm/memcg: optimize user context object stock access") MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm/memcg: Address PREEMPT_RT problems instead of disabling it", v5. This series aims to address the memcg related problem on PREEMPT_RT. I tested them on CONFIG_PREEMPT and CONFIG_PREEMPT_RT with the tools/testing/selftests/cgroup/* tests and I haven't observed any regressions (other than the lockdep report that is already there). This patch (of 6): The optimisation is based on a micro benchmark where local_irq_save() is more expensive than a preempt_disable(). There is no evidence that it is visible in a real-world workload and there are CPUs where the opposite is true (local_irq_save() is cheaper than preempt_disable()). 
Based on micro benchmarks, the optimisation makes sense on PREEMPT_NONE where preempt_disable() is optimized away. There is no improvement with PREEMPT_DYNAMIC since the preemption counter is always available. The optimization also makes the PREEMPT_RT integration more complicated since most of the assumptions are not true on PREEMPT_RT. Revert the optimisation since it complicates the PREEMPT_RT integration and the improvement is hardly visible. [bigeasy@linutronix.de: patch body around Michal's diff] Link: https://lkml.kernel.org/r/20220226204144.1008339-1-bigeasy@linutronix.de Link: https://lore.kernel.org/all/YgOGkXXCrD%2F1k+p4@dhcp22.suse.cz Link: https://lkml.kernel.org/r/YdX+INO9gQje6d0S@linutronix.de Link: https://lkml.kernel.org/r/20220226204144.1008339-2-bigeasy@linutronix.de Signed-off-by: Michal Hocko Signed-off-by: Sebastian Andrzej Siewior Acked-by: Roman Gushchin Acked-by: Johannes Weiner Reviewed-by: Shakeel Butt Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vladimir Davydov Cc: Waiman Long Cc: kernel test robot Cc: Michal Hocko Cc: Michal Koutný Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 94 +++++++++++++++++---------------------------------------- 1 file changed, 27 insertions(+), 67 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 416a608939f5..7bf204b2b053 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2078,23 +2078,17 @@ void unlock_page_memcg(struct page *page) folio_memcg_unlock(page_folio(page)); } -struct obj_stock { +struct memcg_stock_pcp { + struct mem_cgroup *cached; /* this never be root cgroup */ + unsigned int nr_pages; + #ifdef CONFIG_MEMCG_KMEM struct obj_cgroup *cached_objcg; struct pglist_data *cached_pgdat; unsigned int nr_bytes; int nr_slab_reclaimable_b; int nr_slab_unreclaimable_b; -#else - int dummy[0]; #endif -}; - -struct memcg_stock_pcp { - struct mem_cgroup *cached; /* this never be root 
cgroup */ - unsigned int nr_pages; - struct obj_stock task_obj; - struct obj_stock irq_obj; struct work_struct work; unsigned long flags; @@ -2104,13 +2098,13 @@ static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); static DEFINE_MUTEX(percpu_charge_mutex); #ifdef CONFIG_MEMCG_KMEM -static void drain_obj_stock(struct obj_stock *stock); +static void drain_obj_stock(struct memcg_stock_pcp *stock); static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, struct mem_cgroup *root_memcg); static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages); #else -static inline void drain_obj_stock(struct obj_stock *stock) +static inline void drain_obj_stock(struct memcg_stock_pcp *stock) { } static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, @@ -2190,9 +2184,7 @@ static void drain_local_stock(struct work_struct *dummy) local_irq_save(flags); stock = this_cpu_ptr(&memcg_stock); - drain_obj_stock(&stock->irq_obj); - if (in_task()) - drain_obj_stock(&stock->task_obj); + drain_obj_stock(stock); drain_stock(stock); clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); @@ -2767,41 +2759,6 @@ retry: */ #define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | __GFP_ACCOUNT) -/* - * Most kmem_cache_alloc() calls are from user context. The irq disable/enable - * sequence used in this case to access content from object stock is slow. - * To optimize for user context access, there are now two object stocks for - * task context and interrupt context access respectively. - * - * The task context object stock can be accessed by disabling preemption only - * which is cheap in non-preempt kernel. The interrupt context object stock - * can only be accessed after disabling interrupt. User context code can - * access interrupt object stock, but not vice versa. 
- */ -static inline struct obj_stock *get_obj_stock(unsigned long *pflags) -{ - struct memcg_stock_pcp *stock; - - if (likely(in_task())) { - *pflags = 0UL; - preempt_disable(); - stock = this_cpu_ptr(&memcg_stock); - return &stock->task_obj; - } - - local_irq_save(*pflags); - stock = this_cpu_ptr(&memcg_stock); - return &stock->irq_obj; -} - -static inline void put_obj_stock(unsigned long flags) -{ - if (likely(in_task())) - preempt_enable(); - else - local_irq_restore(flags); -} - /* * mod_objcg_mlstate() may be called with irq enabled, so * mod_memcg_lruvec_state() should be used. @@ -3082,10 +3039,13 @@ void __memcg_kmem_uncharge_page(struct page *page, int order) void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, enum node_stat_item idx, int nr) { + struct memcg_stock_pcp *stock; unsigned long flags; - struct obj_stock *stock = get_obj_stock(&flags); int *bytes; + local_irq_save(flags); + stock = this_cpu_ptr(&memcg_stock); + /* * Save vmstat data in stock and skip vmstat array update unless * accumulating over a page of vmstat data or when pgdat or idx @@ -3136,26 +3096,29 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, if (nr) mod_objcg_mlstate(objcg, pgdat, idx, nr); - put_obj_stock(flags); + local_irq_restore(flags); } static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) { + struct memcg_stock_pcp *stock; unsigned long flags; - struct obj_stock *stock = get_obj_stock(&flags); bool ret = false; + local_irq_save(flags); + + stock = this_cpu_ptr(&memcg_stock); if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) { stock->nr_bytes -= nr_bytes; ret = true; } - put_obj_stock(flags); + local_irq_restore(flags); return ret; } -static void drain_obj_stock(struct obj_stock *stock) +static void drain_obj_stock(struct memcg_stock_pcp *stock) { struct obj_cgroup *old = stock->cached_objcg; @@ -3211,13 +3174,8 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, { 
struct mem_cgroup *memcg; - if (in_task() && stock->task_obj.cached_objcg) { - memcg = obj_cgroup_memcg(stock->task_obj.cached_objcg); - if (memcg && mem_cgroup_is_descendant(memcg, root_memcg)) - return true; - } - if (stock->irq_obj.cached_objcg) { - memcg = obj_cgroup_memcg(stock->irq_obj.cached_objcg); + if (stock->cached_objcg) { + memcg = obj_cgroup_memcg(stock->cached_objcg); if (memcg && mem_cgroup_is_descendant(memcg, root_memcg)) return true; } @@ -3228,10 +3186,13 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, bool allow_uncharge) { + struct memcg_stock_pcp *stock; unsigned long flags; - struct obj_stock *stock = get_obj_stock(&flags); unsigned int nr_pages = 0; + local_irq_save(flags); + + stock = this_cpu_ptr(&memcg_stock); if (stock->cached_objcg != objcg) { /* reset if necessary */ drain_obj_stock(stock); obj_cgroup_get(objcg); @@ -3247,7 +3208,7 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, stock->nr_bytes &= (PAGE_SIZE - 1); } - put_obj_stock(flags); + local_irq_restore(flags); if (nr_pages) obj_cgroup_uncharge_pages(objcg, nr_pages); @@ -6826,7 +6787,6 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug) long nr_pages; struct mem_cgroup *memcg; struct obj_cgroup *objcg; - bool use_objcg = folio_memcg_kmem(folio); VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); @@ -6835,7 +6795,7 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug) * folio memcg or objcg at this point, we have fully * exclusive access to the folio. 
*/ - if (use_objcg) { + if (folio_memcg_kmem(folio)) { objcg = __folio_objcg(folio); /* * This get matches the put at the end of the function and @@ -6863,7 +6823,7 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug) nr_pages = folio_nr_pages(folio); - if (use_objcg) { + if (folio_memcg_kmem(folio)) { ug->nr_memory += nr_pages; ug->nr_kmem += nr_pages; -- cgit From 2343e88d238f5de973d609d861c505890f94f22e Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 22 Mar 2022 14:40:38 -0700 Subject: mm/memcg: disable threshold event handlers on PREEMPT_RT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During the integration of PREEMPT_RT support, the code flow around memcg_check_events() resulted in `twisted code'. Moving the code around to avoid that would then lead to an additional local-irq-save section within memcg_check_events(). While looking better, it adds a local-irq-save section to code flow which is usually within a local-irq-off block on non-PREEMPT_RT configurations. The threshold event handler is a deprecated memcg v1 feature. Instead of trying to get it to work under PREEMPT_RT just disable it. There should be no users on PREEMPT_RT. From that perspective it makes even less sense to get it to work under PREEMPT_RT while having zero users. Make memory.soft_limit_in_bytes and cgroup.event_control return -EOPNOTSUPP on PREEMPT_RT. Make an empty memcg_check_events() and memcg_write_event_control() which return only -EOPNOTSUPP on PREEMPT_RT. Document that the two knobs are disabled on PREEMPT_RT. 
Link: https://lkml.kernel.org/r/20220226204144.1008339-3-bigeasy@linutronix.de Suggested-by: Michal Hocko Suggested-by: Michal Koutný Signed-off-by: Sebastian Andrzej Siewior Acked-by: Roman Gushchin Acked-by: Johannes Weiner Reviewed-by: Shakeel Butt Acked-by: Michal Hocko Cc: kernel test robot Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vladimir Davydov Cc: Waiman Long Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7bf204b2b053..6c61b4ec041e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -858,6 +858,9 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, */ static void memcg_check_events(struct mem_cgroup *memcg, int nid) { + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + return; + /* threshold event is triggered in finer grain than soft limit */ if (unlikely(mem_cgroup_event_ratelimit(memcg, MEM_CGROUP_TARGET_THRESH))) { @@ -3731,8 +3734,12 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of, } break; case RES_SOFT_LIMIT: - memcg->soft_limit = nr_pages; - ret = 0; + if (IS_ENABLED(CONFIG_PREEMPT_RT)) { + ret = -EOPNOTSUPP; + } else { + memcg->soft_limit = nr_pages; + ret = 0; + } break; } return ret ?: nbytes; @@ -4708,6 +4715,9 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, char *endp; int ret; + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + return -EOPNOTSUPP; + buf = strstrip(buf); efd = simple_strtoul(buf, &endp, 10); -- cgit From be3e67b54b437123e6144da31cf312ddcaa5aef2 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 22 Mar 2022 14:40:41 -0700 Subject: mm/memcg: protect per-CPU counter by disabling preemption on PREEMPT_RT where needed. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The per-CPU counter are modified with the non-atomic modifier. 
The consistency is ensured by disabling interrupts for the update. On non PREEMPT_RT configuration this works because acquiring a spinlock_t typed lock with the _irq() suffix disables interrupts. On PREEMPT_RT configurations the RMW operation can be interrupted. Another problem is that mem_cgroup_swapout() expects to be invoked with disabled interrupts because the caller has to acquire a spinlock_t which is acquired with disabled interrupts. Since spinlock_t never disables interrupts on PREEMPT_RT the interrupts are never disabled at this point. The code is never called from in_irq() context on PREEMPT_RT therefore disabling preemption during the update is sufficient on PREEMPT_RT. The sections which explicitly disable interrupts can remain on PREEMPT_RT because the sections remain short and they don't involve sleeping locks (memcg_check_events() is doing nothing on PREEMPT_RT). Disable preemption during update of the per-CPU variables which do not explicitly disable interrupts. Link: https://lkml.kernel.org/r/20220226204144.1008339-4-bigeasy@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Acked-by: Roman Gushchin Reviewed-by: Shakeel Butt Cc: kernel test robot Cc: Michal Hocko Cc: Michal Hocko Cc: Michal Koutný Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vladimir Davydov Cc: Waiman Long Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6c61b4ec041e..bc60694b4e45 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -629,6 +629,35 @@ static DEFINE_SPINLOCK(stats_flush_lock); static DEFINE_PER_CPU(unsigned int, stats_updates); static atomic_t stats_flush_threshold = ATOMIC_INIT(0); +/* + * Accessors to ensure that preemption is disabled on PREEMPT_RT because it can + * not rely on this as part of an acquired spinlock_t lock. 
These functions are + * never used in hardirq context on PREEMPT_RT and therefore disabling preemtion + * is sufficient. + */ +static void memcg_stats_lock(void) +{ +#ifdef CONFIG_PREEMPT_RT + preempt_disable(); +#else + VM_BUG_ON(!irqs_disabled()); +#endif +} + +static void __memcg_stats_lock(void) +{ +#ifdef CONFIG_PREEMPT_RT + preempt_disable(); +#endif +} + +static void memcg_stats_unlock(void) +{ +#ifdef CONFIG_PREEMPT_RT + preempt_enable(); +#endif +} + static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) { unsigned int x; @@ -705,6 +734,27 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); memcg = pn->memcg; + /* + * The caller from rmap relay on disabled preemption becase they never + * update their counter from in-interrupt context. For these two + * counters we check that the update is never performed from an + * interrupt context while other caller need to have disabled interrupt. 
+ */ + __memcg_stats_lock(); + if (IS_ENABLED(CONFIG_DEBUG_VM) && !IS_ENABLED(CONFIG_PREEMPT_RT)) { + switch (idx) { + case NR_ANON_MAPPED: + case NR_FILE_MAPPED: + case NR_ANON_THPS: + case NR_SHMEM_PMDMAPPED: + case NR_FILE_PMDMAPPED: + WARN_ON_ONCE(!in_task()); + break; + default: + WARN_ON_ONCE(!irqs_disabled()); + } + } + /* Update memcg */ __this_cpu_add(memcg->vmstats_percpu->state[idx], val); @@ -712,6 +762,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val); memcg_rstat_updated(memcg, val); + memcg_stats_unlock(); } /** @@ -794,8 +845,10 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, if (mem_cgroup_disabled()) return; + memcg_stats_lock(); __this_cpu_add(memcg->vmstats_percpu->events[idx], count); memcg_rstat_updated(memcg, count); + memcg_stats_unlock(); } static unsigned long memcg_events(struct mem_cgroup *memcg, int event) @@ -7154,8 +7207,9 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) * important here to have the interrupts disabled because it is the * only synchronisation we have for updating the per-CPU variables. */ - VM_BUG_ON(!irqs_disabled()); + memcg_stats_lock(); mem_cgroup_charge_statistics(memcg, -nr_entries); + memcg_stats_unlock(); memcg_check_events(memcg, page_to_nid(page)); css_put(&memcg->css); -- cgit From af9a3b69e84bef996ce4620282fcf69a5786be3a Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 22 Mar 2022 14:40:44 -0700 Subject: mm/memcg: opencode the inner part of obj_cgroup_uncharge_pages() in drain_obj_stock() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Provide the inner part of refill_stock() as __refill_stock() without disabling interrupts. This eases the integration of local_lock_t where recursive locking must be avoided. Open code obj_cgroup_uncharge_pages() in drain_obj_stock() and use __refill_stock(). 
The caller of drain_obj_stock() already disables interrupts. [bigeasy@linutronix.de: patch body around Johannes' diff] Link: https://lkml.kernel.org/r/20220226204144.1008339-5-bigeasy@linutronix.de Signed-off-by: Johannes Weiner Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Shakeel Butt Reviewed-by: Roman Gushchin Acked-by: Michal Hocko Cc: kernel test robot Cc: Michal Hocko Cc: Michal Koutný Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vladimir Davydov Cc: Waiman Long Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index bc60694b4e45..00bedb9b47b6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2251,12 +2251,9 @@ static void drain_local_stock(struct work_struct *dummy) * Cache charges(val) to local per_cpu area. * This will be consumed by consume_stock() function, later. */ -static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) +static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) { struct memcg_stock_pcp *stock; - unsigned long flags; - - local_irq_save(flags); stock = this_cpu_ptr(&memcg_stock); if (stock->cached != memcg) { /* reset if necessary */ @@ -2268,7 +2265,14 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) if (stock->nr_pages > MEMCG_CHARGE_BATCH) drain_stock(stock); +} + +static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) +{ + unsigned long flags; + local_irq_save(flags); + __refill_stock(memcg, nr_pages); local_irq_restore(flags); } @@ -3185,8 +3189,16 @@ static void drain_obj_stock(struct memcg_stock_pcp *stock) unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT; unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1); - if (nr_pages) - obj_cgroup_uncharge_pages(old, nr_pages); + if (nr_pages) { + struct mem_cgroup *memcg; + + memcg = 
get_mem_cgroup_from_objcg(old); + + memcg_account_kmem(memcg, -nr_pages); + __refill_stock(memcg, nr_pages); + + css_put(&memcg->css); + } /* * The leftover is flushed to the centralized per-memcg value. -- cgit From 5675114623872300aa9fcd72aef2b8b7f421fe12 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 22 Mar 2022 14:40:47 -0700 Subject: mm/memcg: protect memcg_stock with a local_lock_t MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The members of the per-CPU structure memcg_stock_pcp are protected by disabling interrupts. This is not working on PREEMPT_RT because it creates atomic context in which actions are performed which require preemptible context. One example is obj_cgroup_release(). The IRQ-disable sections can be replaced with local_lock_t which preserves the explicit disabling of interrupts while keeping the code preemptible on PREEMPT_RT. drain_obj_stock() drops a reference on obj_cgroup which leads to an invocation of obj_cgroup_release() if it is the last object. This in turn leads to recursive locking of the local_lock_t. To avoid this, obj_cgroup_release() is invoked outside of the locked section. obj_cgroup_uncharge_pages() can be invoked with the local_lock_t acquired and without it. This will lead later to a recursion in refill_stock(). To avoid the locking recursion provide obj_cgroup_uncharge_pages_locked() which uses the locked version of refill_stock(). - Replace disabling interrupts for memcg_stock with a local_lock_t. - Let drain_obj_stock() return the old struct obj_cgroup which is passed to obj_cgroup_put() outside of the locked section. - Provide obj_cgroup_uncharge_pages_locked() which uses the locked version of refill_stock() to avoid recursive locking in drain_obj_stock().
Link: https://lkml.kernel.org/r/20220209014709.GA26885@xsang-OptiPlex-9020 Link: https://lkml.kernel.org/r/20220226204144.1008339-6-bigeasy@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Reported-by: kernel test robot Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Michal Koutný Cc: Peter Zijlstra Cc: Roman Gushchin Cc: Shakeel Butt Cc: Thomas Gleixner Cc: Vladimir Davydov Cc: Waiman Long Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 59 +++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 38 insertions(+), 21 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 00bedb9b47b6..fa381d892422 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2135,6 +2135,7 @@ void unlock_page_memcg(struct page *page) } struct memcg_stock_pcp { + local_lock_t stock_lock; struct mem_cgroup *cached; /* this never be root cgroup */ unsigned int nr_pages; @@ -2150,18 +2151,21 @@ struct memcg_stock_pcp { unsigned long flags; #define FLUSHING_CACHED_CHARGE 0 }; -static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); +static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = { + .stock_lock = INIT_LOCAL_LOCK(stock_lock), +}; static DEFINE_MUTEX(percpu_charge_mutex); #ifdef CONFIG_MEMCG_KMEM -static void drain_obj_stock(struct memcg_stock_pcp *stock); +static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock); static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, struct mem_cgroup *root_memcg); static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages); #else -static inline void drain_obj_stock(struct memcg_stock_pcp *stock) +static inline struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) { + return NULL; } static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, struct mem_cgroup *root_memcg) @@ -2193,7 +2197,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) if (nr_pages > 
MEMCG_CHARGE_BATCH) return ret; - local_irq_save(flags); + local_lock_irqsave(&memcg_stock.stock_lock, flags); stock = this_cpu_ptr(&memcg_stock); if (memcg == stock->cached && stock->nr_pages >= nr_pages) { @@ -2201,7 +2205,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) ret = true; } - local_irq_restore(flags); + local_unlock_irqrestore(&memcg_stock.stock_lock, flags); return ret; } @@ -2230,6 +2234,7 @@ static void drain_stock(struct memcg_stock_pcp *stock) static void drain_local_stock(struct work_struct *dummy) { struct memcg_stock_pcp *stock; + struct obj_cgroup *old = NULL; unsigned long flags; /* @@ -2237,14 +2242,16 @@ static void drain_local_stock(struct work_struct *dummy) * drain_stock races is that we always operate on local CPU stock * here with IRQ disabled */ - local_irq_save(flags); + local_lock_irqsave(&memcg_stock.stock_lock, flags); stock = this_cpu_ptr(&memcg_stock); - drain_obj_stock(stock); + old = drain_obj_stock(stock); drain_stock(stock); clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); - local_irq_restore(flags); + local_unlock_irqrestore(&memcg_stock.stock_lock, flags); + if (old) + obj_cgroup_put(old); } /* @@ -2271,9 +2278,9 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) { unsigned long flags; - local_irq_save(flags); + local_lock_irqsave(&memcg_stock.stock_lock, flags); __refill_stock(memcg, nr_pages); - local_irq_restore(flags); + local_unlock_irqrestore(&memcg_stock.stock_lock, flags); } /* @@ -3100,10 +3107,11 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, enum node_stat_item idx, int nr) { struct memcg_stock_pcp *stock; + struct obj_cgroup *old = NULL; unsigned long flags; int *bytes; - local_irq_save(flags); + local_lock_irqsave(&memcg_stock.stock_lock, flags); stock = this_cpu_ptr(&memcg_stock); /* @@ -3112,7 +3120,7 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, * changes. 
*/ if (stock->cached_objcg != objcg) { - drain_obj_stock(stock); + old = drain_obj_stock(stock); obj_cgroup_get(objcg); stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0; @@ -3156,7 +3164,9 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, if (nr) mod_objcg_mlstate(objcg, pgdat, idx, nr); - local_irq_restore(flags); + local_unlock_irqrestore(&memcg_stock.stock_lock, flags); + if (old) + obj_cgroup_put(old); } static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) @@ -3165,7 +3175,7 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) unsigned long flags; bool ret = false; - local_irq_save(flags); + local_lock_irqsave(&memcg_stock.stock_lock, flags); stock = this_cpu_ptr(&memcg_stock); if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) { @@ -3173,17 +3183,17 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) ret = true; } - local_irq_restore(flags); + local_unlock_irqrestore(&memcg_stock.stock_lock, flags); return ret; } -static void drain_obj_stock(struct memcg_stock_pcp *stock) +static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) { struct obj_cgroup *old = stock->cached_objcg; if (!old) - return; + return NULL; if (stock->nr_bytes) { unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT; @@ -3233,8 +3243,12 @@ static void drain_obj_stock(struct memcg_stock_pcp *stock) stock->cached_pgdat = NULL; } - obj_cgroup_put(old); stock->cached_objcg = NULL; + /* + * The `old' objects needs to be released by the caller via + * obj_cgroup_put() outside of memcg_stock_pcp::stock_lock. 
+ */ + return old; } static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, @@ -3255,14 +3269,15 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, bool allow_uncharge) { struct memcg_stock_pcp *stock; + struct obj_cgroup *old = NULL; unsigned long flags; unsigned int nr_pages = 0; - local_irq_save(flags); + local_lock_irqsave(&memcg_stock.stock_lock, flags); stock = this_cpu_ptr(&memcg_stock); if (stock->cached_objcg != objcg) { /* reset if necessary */ - drain_obj_stock(stock); + old = drain_obj_stock(stock); obj_cgroup_get(objcg); stock->cached_objcg = objcg; stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) @@ -3276,7 +3291,9 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, stock->nr_bytes &= (PAGE_SIZE - 1); } - local_irq_restore(flags); + local_unlock_irqrestore(&memcg_stock.stock_lock, flags); + if (old) + obj_cgroup_put(old); if (nr_pages) obj_cgroup_uncharge_pages(objcg, nr_pages); -- cgit From 0790ed623847bbdd440ae29cc01da81c99834ea5 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 22 Mar 2022 14:40:50 -0700 Subject: mm/memcg: disable migration instead of preemption in drain_all_stock(). MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before the for-each-CPU loop, preemption is disabled so that drain_local_stock() can be invoked directly instead of scheduling a worker. Ensuring that drain_local_stock() completed on the local CPU is not a correctness problem. It _could_ be that the charging path will be forced to reclaim memory because cached charges are still waiting for their draining. Disabling preemption before invoking drain_local_stock() is problematic on PREEMPT_RT due to the sleeping locks involved. To ensure that no CPU migration happens across for_each_online_cpu() it is enough to use migrate_disable() which disables migration and keeps the context preemptible so that a sleeping lock can be acquired.
A race with CPU hotplug is not a problem because pcp data is not going away. In the worst case we just schedule draining of an empty stock. Use migrate_disable() instead of get_cpu() around the for_each_online_cpu() loop. Link: https://lkml.kernel.org/r/20220226204144.1008339-7-bigeasy@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Acked-by: Michal Hocko Cc: Johannes Weiner Cc: kernel test robot Cc: Michal Hocko Cc: Michal Koutný Cc: Peter Zijlstra Cc: Roman Gushchin Cc: Shakeel Butt Cc: Thomas Gleixner Cc: Vladimir Davydov Cc: Waiman Long Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fa381d892422..85a259515e91 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2300,7 +2300,8 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) * as well as workers from this path always operate on the local * per-cpu data. CPU up doesn't touch memcg_stock at all. */ - curcpu = get_cpu(); + migrate_disable(); + curcpu = smp_processor_id(); for_each_online_cpu(cpu) { struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); struct mem_cgroup *memcg; @@ -2323,7 +2324,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) schedule_work_on(cpu, &stock->work); } } - put_cpu(); + migrate_enable(); mutex_unlock(&percpu_charge_mutex); } -- cgit From 88f2ef73fd66491a2f9a82373d22ca6540f23c62 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Tue, 22 Mar 2022 14:40:56 -0700 Subject: mm: introduce kmem_cache_alloc_lru We currently allocate scope for every memcg to be able to be tracked on every superblock instantiated in the system, regardless of whether that superblock is even accessible to that memcg. These huge memcg counts come from container hosts where memcgs are confined to just a small subset of the total number of superblocks that are instantiated at any given point in time.
For these systems with huge container counts, list_lru does not need the capability of tracking every memcg on every superblock. What it comes down to is that adding the memcg to the list_lru at the first insert. So introduce kmem_cache_alloc_lru to allocate objects and its list_lru. In the later patch, we will convert all inode and dentry allocation from kmem_cache_alloc to kmem_cache_alloc_lru. Link: https://lkml.kernel.org/r/20220228122126.37293-3-songmuchun@bytedance.com Signed-off-by: Muchun Song Cc: Alex Shi Cc: Anna Schumaker Cc: Chao Yu Cc: Dave Chinner Cc: Fam Zheng Cc: Jaegeuk Kim Cc: Johannes Weiner Cc: Kari Argillander Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Qi Zheng Cc: Roman Gushchin Cc: Shakeel Butt Cc: Theodore Ts'o Cc: Trond Myklebust Cc: Vladimir Davydov Cc: Vlastimil Babka Cc: Wei Yang Cc: Xiongchun Duan Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 85a259515e91..52835528eb2a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2805,20 +2805,6 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg) folio->memcg_data = (unsigned long)memcg; } -static struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg) -{ - struct mem_cgroup *memcg; - - rcu_read_lock(); -retry: - memcg = obj_cgroup_memcg(objcg); - if (unlikely(!css_tryget(&memcg->css))) - goto retry; - rcu_read_unlock(); - - return memcg; -} - #ifdef CONFIG_MEMCG_KMEM /* * The allocated objcg pointers array is not accounted directly. 
-- cgit From da0efe30944476275c902c52fbac812db0541d87 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Tue, 22 Mar 2022 14:41:15 -0700 Subject: mm: memcontrol: move memcg_online_kmem() to mem_cgroup_css_online() It will simplify the code if moving memcg_online_kmem() to mem_cgroup_css_online() and do not need to set ->kmemcg_id to -1 to indicate the memcg is offline. In the next patch, ->kmemcg_id will be used to sync list lru reparenting which requires not to change ->kmemcg_id. Link: https://lkml.kernel.org/r/20220228122126.37293-10-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Roman Gushchin Cc: Alex Shi Cc: Anna Schumaker Cc: Chao Yu Cc: Dave Chinner Cc: Fam Zheng Cc: Jaegeuk Kim Cc: Johannes Weiner Cc: Kari Argillander Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Qi Zheng Cc: Shakeel Butt Cc: Theodore Ts'o Cc: Trond Myklebust Cc: Vladimir Davydov Cc: Vlastimil Babka Cc: Wei Yang Cc: Xiongchun Duan Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 37 ++++++++++++++++--------------------- 1 file changed, 16 insertions(+), 21 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 52835528eb2a..f08a0dc2ac36 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3670,7 +3670,8 @@ static int memcg_online_kmem(struct mem_cgroup *memcg) if (cgroup_memory_nokmem) return 0; - BUG_ON(memcg->kmemcg_id >= 0); + if (unlikely(mem_cgroup_is_root(memcg))) + return 0; memcg_id = memcg_alloc_cache_id(); if (memcg_id < 0) @@ -3696,7 +3697,10 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) struct mem_cgroup *parent; int kmemcg_id; - if (memcg->kmemcg_id == -1) + if (cgroup_memory_nokmem) + return; + + if (unlikely(mem_cgroup_is_root(memcg))) return; parent = parent_mem_cgroup(memcg); @@ -3706,7 +3710,6 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) memcg_reparent_objcgs(memcg, parent); kmemcg_id = memcg->kmemcg_id; - BUG_ON(kmemcg_id < 0); /* * After 
we have finished memcg_reparent_objcgs(), all list_lrus @@ -3717,7 +3720,6 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) memcg_drain_all_list_lrus(kmemcg_id, parent); memcg_free_cache_id(kmemcg_id); - memcg->kmemcg_id = -1; } #else static int memcg_online_kmem(struct mem_cgroup *memcg) @@ -5237,7 +5239,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { struct mem_cgroup *parent = mem_cgroup_from_css(parent_css); struct mem_cgroup *memcg, *old_memcg; - long error = -ENOMEM; old_memcg = set_active_memcg(parent); memcg = mem_cgroup_alloc(); @@ -5266,34 +5267,26 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) return &memcg->css; } - /* The following stuff does not apply to the root */ - error = memcg_online_kmem(memcg); - if (error) - goto fail; - if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) static_branch_inc(&memcg_sockets_enabled_key); return &memcg->css; -fail: - mem_cgroup_id_remove(memcg); - mem_cgroup_free(memcg); - return ERR_PTR(error); } static int mem_cgroup_css_online(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); + if (memcg_online_kmem(memcg)) + goto remove_id; + /* * A memcg must be visible for expand_shrinker_info() * by the time the maps are allocated. So, we allocate maps * here, when for_each_mem_cgroup() can't skip it. 
*/ - if (alloc_shrinker_info(memcg)) { - mem_cgroup_id_remove(memcg); - return -ENOMEM; - } + if (alloc_shrinker_info(memcg)) + goto offline_kmem; /* Online state pins memcg ID, memcg ID pins CSS */ refcount_set(&memcg->id.ref, 1); @@ -5303,6 +5296,11 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ); return 0; +offline_kmem: + memcg_offline_kmem(memcg); +remove_id: + mem_cgroup_id_remove(memcg); + return -ENOMEM; } static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) @@ -5360,9 +5358,6 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) cancel_work_sync(&memcg->high_work); mem_cgroup_remove_from_trees(memcg); free_shrinker_info(memcg); - - /* Need to offline kmem if online_css() fails */ - memcg_offline_kmem(memcg); mem_cgroup_free(memcg); } -- cgit From 5abc1e37afa0335c52608d640fd30910b2eeda21 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Tue, 22 Mar 2022 14:41:19 -0700 Subject: mm: list_lru: allocate list_lru_one only when needed In our server, we found a suspected memory leak problem. The kmalloc-32 consumes more than 6GB of memory. Other kmem_caches consume less than 2GB memory. After our in-depth analysis, the memory consumption of kmalloc-32 slab cache is the cause of list_lru_one allocation. crash> p memcg_nr_cache_ids memcg_nr_cache_ids = $2 = 24574 memcg_nr_cache_ids is very large and memory consumption of each list_lru can be calculated with the following formula. num_numa_node * memcg_nr_cache_ids * 32 (kmalloc-32) There are 4 numa nodes in our system, so each list_lru consumes ~3MB. crash> list super_blocks | wc -l 952 Every mount will register 2 list lrus, one is for inode, another is for dentry. There are 952 super_blocks. So the total memory is 952 * 2 * 3 MB (~5.6GB). But the number of memory cgroup is less than 500. 
So I guess more than 12286 containers have been deployed on this machine (I do not know why there are so many containers, it may be a user's bug or the user really want to do that). And memcg_nr_cache_ids has not been reduced to a suitable value. This can waste a lot of memory. Now the infrastructure for dynamic list_lru_one allocation is ready, so remove statically allocated memory code to save memory. Link: https://lkml.kernel.org/r/20220228122126.37293-11-songmuchun@bytedance.com Signed-off-by: Muchun Song Cc: Alex Shi Cc: Anna Schumaker Cc: Chao Yu Cc: Dave Chinner Cc: Fam Zheng Cc: Jaegeuk Kim Cc: Johannes Weiner Cc: Kari Argillander Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Qi Zheng Cc: Roman Gushchin Cc: Shakeel Butt Cc: Theodore Ts'o Cc: Trond Myklebust Cc: Vladimir Davydov Cc: Vlastimil Babka Cc: Wei Yang Cc: Xiongchun Duan Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f08a0dc2ac36..69c09efc599d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3709,6 +3709,10 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) memcg_reparent_objcgs(memcg, parent); + /* + * memcg_drain_all_list_lrus() can change memcg->kmemcg_id. + * Cache it to local @kmemcg_id. + */ kmemcg_id = memcg->kmemcg_id; /* @@ -3717,7 +3721,7 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) * The ordering is imposed by list_lru_node->lock taken by * memcg_drain_all_list_lrus(). 
*/ - memcg_drain_all_list_lrus(kmemcg_id, parent); + memcg_drain_all_list_lrus(memcg, parent); memcg_free_cache_id(kmemcg_id); } -- cgit From 1f391eb270791359ee79031945dbe3afeaec6ce3 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Tue, 22 Mar 2022 14:41:22 -0700 Subject: mm: list_lru: rename memcg_drain_all_list_lrus to memcg_reparent_list_lrus The purpose of the memcg_drain_all_list_lrus() is list_lrus reparenting. It is very similar to memcg_reparent_objcgs(). Rename it to memcg_reparent_list_lrus() so that the name can more consistent with memcg_reparent_objcgs(). Link: https://lkml.kernel.org/r/20220228122126.37293-12-songmuchun@bytedance.com Signed-off-by: Muchun Song Cc: Alex Shi Cc: Anna Schumaker Cc: Chao Yu Cc: Dave Chinner Cc: Fam Zheng Cc: Jaegeuk Kim Cc: Johannes Weiner Cc: Kari Argillander Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Qi Zheng Cc: Roman Gushchin Cc: Shakeel Butt Cc: Theodore Ts'o Cc: Trond Myklebust Cc: Vladimir Davydov Cc: Vlastimil Babka Cc: Wei Yang Cc: Xiongchun Duan Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 69c09efc599d..c36b0a0dbc19 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3710,7 +3710,7 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) memcg_reparent_objcgs(memcg, parent); /* - * memcg_drain_all_list_lrus() can change memcg->kmemcg_id. + * memcg_reparent_list_lrus() can change memcg->kmemcg_id. * Cache it to local @kmemcg_id. */ kmemcg_id = memcg->kmemcg_id; @@ -3719,9 +3719,9 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) * After we have finished memcg_reparent_objcgs(), all list_lrus * corresponding to this cgroup are guaranteed to remain empty. * The ordering is imposed by list_lru_node->lock taken by - * memcg_drain_all_list_lrus(). + * memcg_reparent_list_lrus(). 
*/ - memcg_drain_all_list_lrus(memcg, parent); + memcg_reparent_list_lrus(memcg, parent); memcg_free_cache_id(kmemcg_id); } -- cgit From bbca91cca9a902de2e9907370e9c1e0a3d1aab0f Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Tue, 22 Mar 2022 14:41:25 -0700 Subject: mm: list_lru: replace linear array with xarray If we run 10k containers in the system, the size of the list_lru_memcg->lrus can be ~96KB per list_lru. When we decrease the number of containers, the size of the array will not be shrunk. It is not scalable. The xarray is a good choice for this case. We can save a lot of memory when there are tens of thousands of containers in the system. If we use xarray, we can also remove the array-resizing logic, which simplifies the code. [akpm@linux-foundation.org: remove unused local] Link: https://lkml.kernel.org/r/20220228122126.37293-13-songmuchun@bytedance.com Signed-off-by: Muchun Song Cc: Alex Shi Cc: Anna Schumaker Cc: Chao Yu Cc: Dave Chinner Cc: Fam Zheng Cc: Jaegeuk Kim Cc: Johannes Weiner Cc: Kari Argillander Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Qi Zheng Cc: Roman Gushchin Cc: Shakeel Butt Cc: Theodore Ts'o Cc: Trond Myklebust Cc: Vladimir Davydov Cc: Vlastimil Babka Cc: Wei Yang Cc: Xiongchun Duan Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 77 ++++----------------------------------------------------- 1 file changed, 5 insertions(+), 72 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c36b0a0dbc19..68eb62d10c48 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -351,42 +351,17 @@ static void memcg_reparent_objcgs(struct mem_cgroup *memcg, * This will be used as a shrinker list's index. * The main reason for not using cgroup id for this: * this works better in sparse environments, where we have a lot of memcgs, - * but only a few kmem-limited.
Or also, if we have, for instance, 200 - * memcgs, and none but the 200th is kmem-limited, we'd have to have a - * 200 entry array for that. - * - * The current size of the caches array is stored in memcg_nr_cache_ids. It - * will double each time we have to increase it. + * but only a few kmem-limited. */ static DEFINE_IDA(memcg_cache_ida); -int memcg_nr_cache_ids; - -/* Protects memcg_nr_cache_ids */ -static DECLARE_RWSEM(memcg_cache_ids_sem); - -void memcg_get_cache_ids(void) -{ - down_read(&memcg_cache_ids_sem); -} - -void memcg_put_cache_ids(void) -{ - up_read(&memcg_cache_ids_sem); -} /* - * MIN_SIZE is different than 1, because we would like to avoid going through - * the alloc/free process all the time. In a small machine, 4 kmem-limited - * cgroups is a reasonable guess. In the future, it could be a parameter or - * tunable, but that is strictly not necessary. - * * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get * this constant directly from cgroup, but it is understandable that this is * better kept as an internal representation in cgroup.c. In any case, the * cgrp_id space is not getting any smaller, and we don't have to necessarily * increase ours as well if it increases. */ -#define MEMCG_CACHES_MIN_SIZE 4 #define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX /* @@ -2944,49 +2919,6 @@ __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void) return objcg; } -static int memcg_alloc_cache_id(void) -{ - int id, size; - int err; - - id = ida_simple_get(&memcg_cache_ida, - 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); - if (id < 0) - return id; - - if (id < memcg_nr_cache_ids) - return id; - - /* - * There's no space for the new id in memcg_caches arrays, - * so we have to grow them. 
- */ - down_write(&memcg_cache_ids_sem); - - size = 2 * (id + 1); - if (size < MEMCG_CACHES_MIN_SIZE) - size = MEMCG_CACHES_MIN_SIZE; - else if (size > MEMCG_CACHES_MAX_SIZE) - size = MEMCG_CACHES_MAX_SIZE; - - err = memcg_update_all_list_lrus(size); - if (!err) - memcg_nr_cache_ids = size; - - up_write(&memcg_cache_ids_sem); - - if (err) { - ida_simple_remove(&memcg_cache_ida, id); - return err; - } - return id; -} - -static void memcg_free_cache_id(int id) -{ - ida_simple_remove(&memcg_cache_ida, id); -} - static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages) { mod_memcg_state(memcg, MEMCG_KMEM, nr_pages); @@ -3673,13 +3605,14 @@ static int memcg_online_kmem(struct mem_cgroup *memcg) if (unlikely(mem_cgroup_is_root(memcg))) return 0; - memcg_id = memcg_alloc_cache_id(); + memcg_id = ida_alloc_max(&memcg_cache_ida, MEMCG_CACHES_MAX_SIZE - 1, + GFP_KERNEL); if (memcg_id < 0) return memcg_id; objcg = obj_cgroup_alloc(); if (!objcg) { - memcg_free_cache_id(memcg_id); + ida_free(&memcg_cache_ida, memcg_id); return -ENOMEM; } objcg->memcg = memcg; @@ -3723,7 +3656,7 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) */ memcg_reparent_list_lrus(memcg, parent); - memcg_free_cache_id(kmemcg_id); + ida_free(&memcg_cache_ida, kmemcg_id); } #else static int memcg_online_kmem(struct mem_cgroup *memcg) -- cgit From f9c69d6346bc6934369c80b316fa277bc96ffa77 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Tue, 22 Mar 2022 14:41:28 -0700 Subject: mm: memcontrol: reuse memory cgroup ID for kmem ID There are two idrs being used by memory cgroup, one is for kmem ID, another is for memory cgroup ID. The maximum ID of both is 64Ki. Both of them can limit the total number of memory cgroups. Actually, we can reuse memory cgroup ID for kmem ID to simplify the code. 
Link: https://lkml.kernel.org/r/20220228122126.37293-14-songmuchun@bytedance.com Signed-off-by: Muchun Song Cc: Alex Shi Cc: Anna Schumaker Cc: Chao Yu Cc: Dave Chinner Cc: Fam Zheng Cc: Jaegeuk Kim Cc: Johannes Weiner Cc: Kari Argillander Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Qi Zheng Cc: Roman Gushchin Cc: Shakeel Butt Cc: Theodore Ts'o Cc: Trond Myklebust Cc: Vladimir Davydov Cc: Vlastimil Babka Cc: Wei Yang Cc: Xiongchun Duan Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 39 +++------------------------------------ 1 file changed, 3 insertions(+), 36 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 68eb62d10c48..4af673350377 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -347,23 +347,6 @@ static void memcg_reparent_objcgs(struct mem_cgroup *memcg, percpu_ref_kill(&objcg->refcnt); } -/* - * This will be used as a shrinker list's index. - * The main reason for not using cgroup id for this: - * this works better in sparse environments, where we have a lot of memcgs, - * but only a few kmem-limited. - */ -static DEFINE_IDA(memcg_cache_ida); - -/* - * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get - * this constant directly from cgroup, but it is understandable that this is - * better kept as an internal representation in cgroup.c. In any case, the - * cgrp_id space is not getting any smaller, and we don't have to necessarily - * increase ours as well if it increases. - */ -#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX - /* * A lot of the calls to the cache allocation functions are expected to be * inlined by the compiler. 
Since the calls to memcg_slab_pre_alloc_hook() are @@ -3597,7 +3580,6 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, static int memcg_online_kmem(struct mem_cgroup *memcg) { struct obj_cgroup *objcg; - int memcg_id; if (cgroup_memory_nokmem) return 0; @@ -3605,22 +3587,16 @@ static int memcg_online_kmem(struct mem_cgroup *memcg) if (unlikely(mem_cgroup_is_root(memcg))) return 0; - memcg_id = ida_alloc_max(&memcg_cache_ida, MEMCG_CACHES_MAX_SIZE - 1, - GFP_KERNEL); - if (memcg_id < 0) - return memcg_id; - objcg = obj_cgroup_alloc(); - if (!objcg) { - ida_free(&memcg_cache_ida, memcg_id); + if (!objcg) return -ENOMEM; - } + objcg->memcg = memcg; rcu_assign_pointer(memcg->objcg, objcg); static_branch_enable(&memcg_kmem_enabled_key); - memcg->kmemcg_id = memcg_id; + memcg->kmemcg_id = memcg->id.id; return 0; } @@ -3628,7 +3604,6 @@ static int memcg_online_kmem(struct mem_cgroup *memcg) static void memcg_offline_kmem(struct mem_cgroup *memcg) { struct mem_cgroup *parent; - int kmemcg_id; if (cgroup_memory_nokmem) return; @@ -3642,12 +3617,6 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) memcg_reparent_objcgs(memcg, parent); - /* - * memcg_reparent_list_lrus() can change memcg->kmemcg_id. - * Cache it to local @kmemcg_id. - */ - kmemcg_id = memcg->kmemcg_id; - /* * After we have finished memcg_reparent_objcgs(), all list_lrus * corresponding to this cgroup are guaranteed to remain empty. @@ -3655,8 +3624,6 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) * memcg_reparent_list_lrus(). */ memcg_reparent_list_lrus(memcg, parent); - - ida_free(&memcg_cache_ida, kmemcg_id); } #else static int memcg_online_kmem(struct mem_cgroup *memcg) -- cgit From be740503ed03ea04ca362330baf082e6a38fe462 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Tue, 22 Mar 2022 14:41:31 -0700 Subject: mm: memcontrol: fix cannot alloc the maximum memcg ID The idr_alloc() does not include @max ID. 
So in the current implementation, the maximum memcg ID is 65534 instead of 65535. This looks like a bug, so fix it. Link: https://lkml.kernel.org/r/20220228122126.37293-15-songmuchun@bytedance.com Signed-off-by: Muchun Song Cc: Alex Shi Cc: Anna Schumaker Cc: Chao Yu Cc: Dave Chinner Cc: Fam Zheng Cc: Jaegeuk Kim Cc: Johannes Weiner Cc: Kari Argillander Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Qi Zheng Cc: Roman Gushchin Cc: Shakeel Butt Cc: Theodore Ts'o Cc: Trond Myklebust Cc: Vladimir Davydov Cc: Vlastimil Babka Cc: Wei Yang Cc: Xiongchun Duan Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4af673350377..33ad13d69bda 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5088,8 +5088,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void) return ERR_PTR(error); memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL, - 1, MEM_CGROUP_ID_MAX, - GFP_KERNEL); + 1, MEM_CGROUP_ID_MAX + 1, GFP_KERNEL); if (memcg->id.id < 0) { error = memcg->id.id; goto fail; -- cgit From 8c9bb39816f01a309d30243da0ca91bd7e7bd1c2 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 22 Mar 2022 14:47:06 -0700 Subject: memcg: do not tweak node in alloc_mem_cgroup_per_node_info alloc_mem_cgroup_per_node_info is called for each possible node, and this used to be a problem because !node_online nodes didn't have the appropriate data structures allocated. This was changed by "mm: handle uninitialized numa nodes gracefully" so we can drop the special casing here.
Link: https://lkml.kernel.org/r/20220127085305.20890-7-mhocko@kernel.org Signed-off-by: Wei Yang Signed-off-by: Michal Hocko Cc: David Hildenbrand Cc: Alexey Makhalov Cc: Dennis Zhou Cc: Eric Dumazet Cc: Tejun Heo Cc: Christoph Lameter Cc: Nico Pache Cc: Wei Yang Cc: Mike Rapoport Cc: Oscar Salvador Cc: Rafael Aquini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 33ad13d69bda..f5ad1a680494 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5020,18 +5020,8 @@ struct mem_cgroup *mem_cgroup_from_id(unsigned short id) static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) { struct mem_cgroup_per_node *pn; - int tmp = node; - /* - * This routine is called against possible nodes. - * But it's BUG to call kmalloc() against offline node. - * - * TODO: this routine can waste much memory for nodes which will - * never be onlined. It's better to use memory hotplug callback - * function. - */ - if (!node_state(node, N_NORMAL_MEMORY)) - tmp = -1; - pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); + + pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, node); if (!pn) return 1; -- cgit