From 7490a2d248145d8694e1e9828801b496250fd697 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 2 Sep 2021 14:53:27 -0700 Subject: writeback: memcg: simplify cgroup_writeback_by_id Currently cgroup_writeback_by_id calls mem_cgroup_wb_stats() to get dirty pages for a memcg. However mem_cgroup_wb_stats() does a lot more than just get the number of dirty pages. Just directly get the number of dirty pages instead of calling mem_cgroup_wb_stats(). Also cgroup_writeback_by_id() is only called for best-effort dirty flushing, so remove the unused 'nr' parameter and no need to explicitly flush memcg stats. Link: https://lkml.kernel.org/r/20210722182627.2267368-1-shakeelb@google.com Signed-off-by: Shakeel Butt Reviewed-by: Jan Kara Cc: Tejun Heo Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 702a81dfe72d..1047f0271ff8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -645,17 +645,6 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id()); } -/* idx can be of type enum memcg_stat_item or node_stat_item. */ -static unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) -{ - long x = READ_ONCE(memcg->vmstats.state[idx]); -#ifdef CONFIG_SMP - if (x < 0) - x = 0; -#endif - return x; -} - /* idx can be of type enum memcg_stat_item or node_stat_item. */ static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx) { @@ -4668,7 +4657,7 @@ void mem_cgroup_flush_foreign(struct bdi_writeback *wb) atomic_read(&frn->done.cnt) == 1) { frn->at = 0; trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id); - cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0, + cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, WB_REASON_FOREIGN_FLUSH, &frn->done); } -- cgit From 56cab2859fbe08e1f2a5ac3f4655dcc398bc013d Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 2 Sep 2021 14:54:47 -0700 Subject: mm, memcg: add mem_cgroup_disabled checks in vmpressure and swap-related functions Add mem_cgroup_disabled check in vmpressure, mem_cgroup_uncharge_swap and cgroup_throttle_swaprate functions. This minimizes the memcg overhead in the pagefault and exit_mmap paths when memcgs are disabled using cgroup_disable=memory command-line option. This change results in ~2.1% overhead reduction when running PFT test [1] comparing {CONFIG_MEMCG=n, CONFIG_MEMCG_SWAP=n} against {CONFIG_MEMCG=y, CONFIG_MEMCG_SWAP=y, cgroup_disable=memory} configuration on an 8-core ARM64 Android device. [1] https://lkml.org/lkml/2006/8/29/294 also used in mmtests suite Link: https://lkml.kernel.org/r/20210713010934.299876-1-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Shakeel Butt Reviewed-by: Muchun Song Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: Alex Shi Cc: Alistair Popple Cc: David Hildenbrand Cc: Jens Axboe Cc: Joonsoo Kim Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Minchan Kim Cc: Roman Gushchin Cc: Tejun Heo Cc: Wei Yang Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1047f0271ff8..211f3911228f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -7297,6 +7297,9 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) struct mem_cgroup *memcg; unsigned short id; + if (mem_cgroup_disabled()) + return; + id = swap_cgroup_record(entry, 0, nr_pages); rcu_read_lock(); memcg = mem_cgroup_from_id(id); -- cgit From 2c8d8f97ae2272f1455ee31a5af62b326772eb31 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 2 Sep 2021 14:54:50 -0700 Subject: mm, memcg: inline mem_cgroup_{charge/uncharge} to improve disabled memcg config Inline mem_cgroup_{charge/uncharge} and mem_cgroup_uncharge_list functions functions to perform mem_cgroup_disabled static key check inline before calling the main body of the function. This minimizes the memcg overhead in the pagefault and exit_mmap paths when memcgs are disabled using cgroup_disable=memory command-line option. This change results in ~0.4% overhead reduction when running PFT test [1] comparing {CONFIG_MEMCG=n} against {CONFIG_MEMCG=y, cgroup_disable=memory} configuration on an 8-core ARM64 Android device. [1] https://lkml.org/lkml/2006/8/29/294 also used in mmtests suite Link: https://lkml.kernel.org/r/20210713010934.299876-2-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Shakeel Butt Reviewed-by: Muchun Song Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: Alex Shi Cc: Alistair Popple Cc: David Hildenbrand Cc: Jens Axboe Cc: Joonsoo Kim Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Minchan Kim Cc: Roman Gushchin Cc: Tejun Heo Cc: Wei Yang Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 33 ++++++++++++--------------------- 1 file changed, 12 insertions(+), 21 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 211f3911228f..33bb8434eea0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6693,8 +6693,7 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root, atomic_long_read(&parent->memory.children_low_usage))); } -static int __mem_cgroup_charge(struct page *page, struct mem_cgroup *memcg, - gfp_t gfp) +static int charge_memcg(struct page *page, struct mem_cgroup *memcg, gfp_t gfp) { unsigned int nr_pages = thp_nr_pages(page); int ret; @@ -6715,7 +6714,7 @@ out: } /** - * mem_cgroup_charge - charge a newly allocated page to a cgroup + * __mem_cgroup_charge - charge a newly allocated page to a cgroup * @page: page to charge * @mm: mm context of the victim * @gfp_mask: reclaim mode @@ -6728,16 +6727,14 @@ out: * * Returns 0 on success. Otherwise, an error code is returned. */ -int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) +int __mem_cgroup_charge(struct page *page, struct mm_struct *mm, + gfp_t gfp_mask) { struct mem_cgroup *memcg; int ret; - if (mem_cgroup_disabled()) - return 0; - memcg = get_mem_cgroup_from_mm(mm); - ret = __mem_cgroup_charge(page, memcg, gfp_mask); + ret = charge_memcg(page, memcg, gfp_mask); css_put(&memcg->css); return ret; @@ -6772,7 +6769,7 @@ int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm, memcg = get_mem_cgroup_from_mm(mm); rcu_read_unlock(); - ret = __mem_cgroup_charge(page, memcg, gfp); + ret = charge_memcg(page, memcg, gfp); css_put(&memcg->css); return ret; @@ -6908,18 +6905,15 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) } /** - * mem_cgroup_uncharge - uncharge a page + * __mem_cgroup_uncharge - uncharge a page * @page: page to uncharge * - * Uncharge a page previously charged with mem_cgroup_charge(). + * Uncharge a page previously charged with __mem_cgroup_charge(). */ -void mem_cgroup_uncharge(struct page *page) +void __mem_cgroup_uncharge(struct page *page) { struct uncharge_gather ug; - if (mem_cgroup_disabled()) - return; - /* Don't touch page->lru of any random page, pre-check: */ if (!page_memcg(page)) return; @@ -6930,20 +6924,17 @@ void mem_cgroup_uncharge(struct page *page) } /** - * mem_cgroup_uncharge_list - uncharge a list of page + * __mem_cgroup_uncharge_list - uncharge a list of page * @page_list: list of pages to uncharge * * Uncharge a list of pages previously charged with - * mem_cgroup_charge(). + * __mem_cgroup_charge(). */ -void mem_cgroup_uncharge_list(struct list_head *page_list) +void __mem_cgroup_uncharge_list(struct list_head *page_list) { struct uncharge_gather ug; struct page *page; - if (mem_cgroup_disabled()) - return; - uncharge_gather_clear(&ug); list_for_each_entry(page, page_list, lru) uncharge_page(page, &ug); -- cgit From 01c4b28cd2e600160566a7d83b4703800381dae1 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 2 Sep 2021 14:54:54 -0700 Subject: mm, memcg: inline swap-related functions to improve disabled memcg config Inline mem_cgroup_try_charge_swap, mem_cgroup_uncharge_swap and cgroup_throttle_swaprate functions to perform mem_cgroup_disabled static key check inline before calling the main body of the function. This minimizes the memcg overhead in the pagefault and exit_mmap paths when memcgs are disabled using cgroup_disable=memory command-line option. This change results in ~1% overhead reduction when running PFT test [1] comparing {CONFIG_MEMCG=n} against {CONFIG_MEMCG=y, cgroup_disable=memory} configuration on an 8-core ARM64 Android device. [1] https://lkml.org/lkml/2006/8/29/294 also used in mmtests suite Link: https://lkml.kernel.org/r/20210713010934.299876-3-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Shakeel Butt Reviewed-by: Muchun Song Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: Tejun Heo Cc: Roman Gushchin Cc: Yang Shi Cc: Alex Shi Cc: Wei Yang Cc: Jens Axboe Cc: Joonsoo Kim Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Alistair Popple Cc: Minchan Kim Cc: Miaohe Lin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 33bb8434eea0..81e15a67391b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -7226,7 +7226,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) } /** - * mem_cgroup_try_charge_swap - try charging swap space for a page + * __mem_cgroup_try_charge_swap - try charging swap space for a page * @page: page being added to swap * @entry: swap entry to charge * @@ -7234,16 +7234,13 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) * * Returns 0 on success, -ENOMEM on failure. */ -int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) +int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) { unsigned int nr_pages = thp_nr_pages(page); struct page_counter *counter; struct mem_cgroup *memcg; unsigned short oldid; - if (mem_cgroup_disabled()) - return 0; - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) return 0; @@ -7279,18 +7276,15 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) } /** - * mem_cgroup_uncharge_swap - uncharge swap space + * __mem_cgroup_uncharge_swap - uncharge swap space * @entry: swap entry to uncharge * @nr_pages: the amount of swap space to uncharge */ -void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) +void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) { struct mem_cgroup *memcg; unsigned short id; - if (mem_cgroup_disabled()) - return; - id = swap_cgroup_record(entry, 0, nr_pages); rcu_read_lock(); memcg = mem_cgroup_from_id(id); -- cgit From 7e1c0d6f58207e7e60674647d3935f446f05613c Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 2 Sep 2021 14:55:00 -0700 Subject: memcg: switch lruvec stats to rstat MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The commit 2d146aa3aa84 ("mm: memcontrol: switch to rstat") switched memcg stats to rstat infrastructure but skipped the conversion of the lruvec stats as such stats are read in the performance critical code paths and flushing stats may have impacted the performances of the applications. This patch converts the lruvec stats to rstat and later patches add mechanisms to keep the performance impact to minimum. The rstat conversion comes with the price i.e. memory cost. Effectively this patch reverts the savings done by the commit f3344adf38bd ("mm: memcontrol: optimize per-lruvec stats counter memory usage"). However this cost is justified due to negative impact of the inaccurate lruvec stats on many heuristics. One such case is reported in [1]. The memory reclaim code is filled with plethora of heuristics and many of those heuristics reads the lruvec stats. So, inaccurate stats can make such heuristics ineffective. [1] reports the impact of inaccurate lruvec stats on the "cache trim mode" heuristic. Inaccurate lruvec stats can impact the deactivation and aging anon heuristics as well. [1] https://lore.kernel.org/linux-mm/20210311004449.1170308-1-ying.huang@intel.com/ Link: https://lkml.kernel.org/r/20210716212137.1391164-1-shakeelb@google.com Link: https://lkml.kernel.org/r/20210714013948.270662-1-shakeelb@google.com Signed-off-by: Shakeel Butt Cc: Tejun Heo Cc: Johannes Weiner Cc: Muchun Song Cc: Michal Hocko Cc: Roman Gushchin Cc: Huang Ying Cc: Hillf Danton Cc: Michal Koutný Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 114 +++++++++++++++++++------------------------------------- 1 file changed, 38 insertions(+), 76 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 81e15a67391b..400d210e030a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -660,23 +660,11 @@ static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx) return x; } -static struct mem_cgroup_per_node * -parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid) -{ - struct mem_cgroup *parent; - - parent = parent_mem_cgroup(pn->memcg); - if (!parent) - return NULL; - return parent->nodeinfo[nid]; -} - void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { struct mem_cgroup_per_node *pn; struct mem_cgroup *memcg; - long x, threshold = MEMCG_CHARGE_BATCH; pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); memcg = pn->memcg; @@ -685,21 +673,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, __mod_memcg_state(memcg, idx, val); /* Update lruvec */ - __this_cpu_add(pn->lruvec_stat_local->count[idx], val); - - if (vmstat_item_in_bytes(idx)) - threshold <<= PAGE_SHIFT; - - x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]); - if (unlikely(abs(x) > threshold)) { - pg_data_t *pgdat = lruvec_pgdat(lruvec); - struct mem_cgroup_per_node *pi; - - for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id)) - atomic_long_add(x, &pi->lruvec_stat[idx]); - x = 0; - } - __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x); + __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val); } /** @@ -2278,40 +2252,13 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) mutex_unlock(&percpu_charge_mutex); } -static void memcg_flush_lruvec_page_state(struct mem_cgroup *memcg, int cpu) -{ - int nid; - - for_each_node(nid) { - struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid]; - unsigned long stat[NR_VM_NODE_STAT_ITEMS]; - struct batched_lruvec_stat *lstatc; - int i; - - lstatc = per_cpu_ptr(pn->lruvec_stat_cpu, cpu); - for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { - stat[i] = lstatc->count[i]; - lstatc->count[i] = 0; - } - - do { - for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) - atomic_long_add(stat[i], &pn->lruvec_stat[i]); - } while ((pn = parent_nodeinfo(pn, nid))); - } -} - static int memcg_hotplug_cpu_dead(unsigned int cpu) { struct memcg_stock_pcp *stock; - struct mem_cgroup *memcg; stock = &per_cpu(memcg_stock, cpu); drain_stock(stock); - for_each_mem_cgroup(memcg) - memcg_flush_lruvec_page_state(memcg, cpu); - return 0; } @@ -5118,17 +5065,9 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) if (!pn) return 1; - pn->lruvec_stat_local = alloc_percpu_gfp(struct lruvec_stat, - GFP_KERNEL_ACCOUNT); - if (!pn->lruvec_stat_local) { - kfree(pn); - return 1; - } - - pn->lruvec_stat_cpu = alloc_percpu_gfp(struct batched_lruvec_stat, - GFP_KERNEL_ACCOUNT); - if (!pn->lruvec_stat_cpu) { - free_percpu(pn->lruvec_stat_local); + pn->lruvec_stats_percpu = alloc_percpu_gfp(struct lruvec_stats_percpu, + GFP_KERNEL_ACCOUNT); + if (!pn->lruvec_stats_percpu) { kfree(pn); return 1; } @@ -5149,8 +5088,7 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) if (!pn) return; - free_percpu(pn->lruvec_stat_cpu); - free_percpu(pn->lruvec_stat_local); + free_percpu(pn->lruvec_stats_percpu); kfree(pn); } @@ -5166,15 +5104,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) static void mem_cgroup_free(struct mem_cgroup *memcg) { - int cpu; - memcg_wb_domain_exit(memcg); - /* - * Flush percpu lruvec stats to guarantee the value - * correctness on parent's and all ancestor levels. - */ - for_each_online_cpu(cpu) - memcg_flush_lruvec_page_state(memcg, cpu); __mem_cgroup_free(memcg); } @@ -5407,7 +5337,7 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) struct mem_cgroup *parent = parent_mem_cgroup(memcg); struct memcg_vmstats_percpu *statc; long delta, v; - int i; + int i, nid; statc = per_cpu_ptr(memcg->vmstats_percpu, cpu); @@ -5455,6 +5385,36 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) if (parent) parent->vmstats.events_pending[i] += delta; } + + for_each_node_state(nid, N_MEMORY) { + struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid]; + struct mem_cgroup_per_node *ppn = NULL; + struct lruvec_stats_percpu *lstatc; + + if (parent) + ppn = parent->nodeinfo[nid]; + + lstatc = per_cpu_ptr(pn->lruvec_stats_percpu, cpu); + + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { + delta = pn->lruvec_stats.state_pending[i]; + if (delta) + pn->lruvec_stats.state_pending[i] = 0; + + v = READ_ONCE(lstatc->state[i]); + if (v != lstatc->state_prev[i]) { + delta += v - lstatc->state_prev[i]; + lstatc->state_prev[i] = v; + } + + if (!delta) + continue; + + pn->lruvec_stats.state[i] += delta; + if (ppn) + ppn->lruvec_stats.state_pending[i] += delta; + } + } } #ifdef CONFIG_MMU @@ -6388,6 +6348,8 @@ static int memory_numa_stat_show(struct seq_file *m, void *v) int i; struct mem_cgroup *memcg = mem_cgroup_from_seq(m); + cgroup_rstat_flush(memcg->css.cgroup); + for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { int nid; -- cgit From aa48e47e3906c332eaf1e5d7b58be11d3509ad9f Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 2 Sep 2021 14:55:04 -0700 Subject: memcg: infrastructure to flush memcg stats MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit At the moment memcg stats are read in four contexts: 1. memcg stat user interfaces 2. dirty throttling 3. page fault 4. memory reclaim Currently the kernel flushes the stats for first two cases. Flushing the stats for remaining two casese may have performance impact. Always flushing the memcg stats on the page fault code path may negatively impacts the performance of the applications. In addition flushing in the memory reclaim code path, though treated as slowpath, can become the source of contention for the global lock taken for stat flushing because when system or memcg is under memory pressure, many tasks may enter the reclaim path. This patch uses following mechanisms to solve these challenges: 1. Periodically flush the stats from root memcg every 2 seconds. This will time limit the out of sync stats. 2. Asynchronously flush the stats after fixed number of stat updates. In the worst case the stat can be out of sync by O(nr_cpus * BATCH) for 2 seconds. 3. For avoiding thundering herd to flush the stats particularly from the memory reclaim context, introduce memcg local spinlock and let only one flusher active at a time. This could have been done through cgroup_rstat_lock lock but that lock is used by other subsystem and for userspace reading memcg stats. So, it is better to keep flushers introduced by this patch decoupled from cgroup_rstat_lock. However we would have to use irqsafe version of rstat flush but that is fine as this code path will be flushing for whole tree and do the work for everyone. No one will be waiting for that worker. [shakeelb@google.com: fix sleep-in-wrong context bug] Link: https://lkml.kernel.org/r/20210716212137.1391164-2-shakeelb@google.com Link: https://lkml.kernel.org/r/20210714013948.270662-2-shakeelb@google.com Signed-off-by: Shakeel Butt Tested-by: Marek Szyprowski Cc: Hillf Danton Cc: Huang Ying Cc: Johannes Weiner Cc: Michal Hocko Cc: Michal Koutný Cc: Muchun Song Cc: Roman Gushchin Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 400d210e030a..4d8c9afecf98 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -103,6 +103,14 @@ static bool do_memsw_account(void) return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap; } +/* memcg and lruvec stats flushing */ +static void flush_memcg_stats_dwork(struct work_struct *w); +static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork); +static void flush_memcg_stats_work(struct work_struct *w); +static DECLARE_WORK(stats_flush_work, flush_memcg_stats_work); +static DEFINE_PER_CPU(unsigned int, stats_flush_threshold); +static DEFINE_SPINLOCK(stats_flush_lock); + #define THRESHOLDS_EVENTS_TARGET 128 #define SOFTLIMIT_EVENTS_TARGET 1024 @@ -674,6 +682,8 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, /* Update lruvec */ __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val); + if (!(__this_cpu_inc_return(stats_flush_threshold) % MEMCG_CHARGE_BATCH)) + queue_work(system_unbound_wq, &stats_flush_work); } /** @@ -5240,6 +5250,10 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) /* Online state pins memcg ID, memcg ID pins CSS */ refcount_set(&memcg->id.ref, 1); css_get(css); + + if (unlikely(mem_cgroup_is_root(memcg))) + queue_delayed_work(system_unbound_wq, &stats_flush_dwork, + 2UL*HZ); return 0; } @@ -5331,6 +5345,26 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) memcg_wb_domain_size_changed(memcg); } +void mem_cgroup_flush_stats(void) +{ + if (!spin_trylock(&stats_flush_lock)) + return; + + cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup); + spin_unlock(&stats_flush_lock); +} + +static void flush_memcg_stats_dwork(struct work_struct *w) +{ + mem_cgroup_flush_stats(); + queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ); +} + +static void flush_memcg_stats_work(struct work_struct *w) +{ + mem_cgroup_flush_stats(); +} + static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); -- cgit From 55a68c823951855f3ec313fdb85100db84284505 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 2 Sep 2021 14:55:49 -0700 Subject: memcg: replace in_interrupt() by !in_task() in active_memcg() set_active_memcg() uses in_interrupt() check to select proper storage for cgroup: pointer on task struct or per-cpu pointer. It isn't fully correct: obsoleted in_interrupt() includes tasks with disabled BH. It's better to use '!in_task()' instead. Link: https://lkml.org/lkml/2021/7/26/487 Link: https://lkml.kernel.org/r/ed4448b0-4970-616f-7368-ef9dd3cb628d@virtuozzo.com Fixes: 37d5985c003d ("mm: kmem: prepare remote memcg charging infra for interrupt contexts") Signed-off-by: Vasily Averin Reviewed-by: Shakeel Butt Cc: Johannes Weiner Cc: Michal Hocko Cc: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4d8c9afecf98..b09f2639f28b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -878,7 +878,7 @@ EXPORT_SYMBOL(mem_cgroup_from_task); static __always_inline struct mem_cgroup *active_memcg(void) { - if (in_interrupt()) + if (!in_task()) return this_cpu_read(int_active_memcg); else return current->active_memcg; -- cgit From 37bc3cb9bbef86d1ddbbc789e55b588c8a2cac26 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 2 Sep 2021 14:55:53 -0700 Subject: mm: memcontrol: set the correct memcg swappiness restriction Since commit c843966c556d ("mm: allow swappiness that prefers reclaiming anon over the file workingset") has expended the swappiness value to make swap to be preferred in some systems. We should also change the memcg swappiness restriction to allow memcg swap-preferred. Link: https://lkml.kernel.org/r/d77469b90c45c49953ccbc51e54a1d465bc18f70.1627626255.git.baolin.wang@linux.alibaba.com Fixes: c843966c556d ("mm: allow swappiness that prefers reclaiming anon over the file workingset") Signed-off-by: Baolin Wang Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b09f2639f28b..80840e026c2f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4062,7 +4062,7 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - if (val > 100) + if (val > 200) return -EINVAL; if (!mem_cgroup_is_root(memcg)) -- cgit From 27fb0956ed08780e480a14c5cca2fd4818e99fa7 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 2 Sep 2021 14:55:59 -0700 Subject: mm, memcg: save some atomic ops when flush is already true Add 'else' to save some atomic ops in obj_stock_flush_required() when flush is already true. No functional change intended here. Link: https://lkml.kernel.org/r/20210807082835.61281-3-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Shakeel Butt Reviewed-by: Muchun Song Acked-by: Roman Gushchin Acked-by: Michal Hocko Cc: Vladimir Davydov Cc: Muchun Song Cc: Matthew Wilcox (Oracle) Cc: Alex Shi Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 80840e026c2f..894a24d5ec55 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2246,7 +2246,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) if (memcg && stock->nr_pages && mem_cgroup_is_descendant(memcg, root_memcg)) flush = true; - if (obj_stock_flush_required(stock, root_memcg)) + else if (obj_stock_flush_required(stock, root_memcg)) flush = true; rcu_read_unlock(); -- cgit From 5c49cf9ad600f705f595ecbbbac2a5da9a3bb3bd Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 2 Sep 2021 14:56:02 -0700 Subject: memcg: fix up drain_local_stock comment Thomas and Vlastimil have noticed that the comment in drain_local_stock doesn't quite make sense. It talks about a synchronization with the memory hotplug but there is no actual memory hotplug involvement here. I meant to talk about cpu hotplug here. Fix that up and hopefuly make the comment more helpful by referencing the cpu hotplug callback as well. Link: https://lkml.kernel.org/r/YRDwOhVglJmY7ES5@dhcp22.suse.cz Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 894a24d5ec55..81fa83133c2c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2178,8 +2178,9 @@ static void drain_local_stock(struct work_struct *dummy) unsigned long flags; /* - * The only protection from memory hotplug vs. drain_stock races is - * that we always operate on local CPU stock here with IRQ disabled + * The only protection from cpu hotplug (memcg_hotplug_cpu_dead) vs. + * drain_stock races is that we always operate on local CPU stock + * here with IRQ disabled */ local_irq_save(flags); -- cgit From 4ba9515d32bac1c54c2357796e32d68eeab784ce Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 2 Sep 2021 14:56:05 -0700 Subject: memcg: make memcg->event_list_lock irqsafe The memcg->event_list_lock is usually taken in the normal context but when the userspace closes the corresponding eventfd, eventfd_release through memcg_event_wake takes memcg->event_list_lock with interrupts disabled. This is not an issue on its own but it creates a nested dependency from eventfd_ctx->wqh.lock to memcg->event_list_lock. Independently, for unrelated eventfd, eventfd_signal() can be called in the irq context, thus making eventfd_ctx->wqh.lock an irq lock. For example, FPGA DFL driver, VHOST VPDA driver and couple of VFIO drivers. This will force memcg->event_list_lock to be an irqsafe lock as well. One way to break the nested dependency between eventfd_ctx->wqh.lock and memcg->event_list_lock is to add an indirection. However the simplest solution would be to make memcg->event_list_lock irqsafe. This is cgroup v1 feature, is in maintenance and may get deprecated in near future. So, no need to add more code. BTW this has been discussed previously [1] but there weren't irq users of eventfd_signal() at the time. [1] https://www.spinics.net/lists/cgroups/msg06248.html Link: https://lkml.kernel.org/r/20210830172953.207257-1-shakeelb@google.com Signed-off-by: Shakeel Butt Cc: Tejun Heo Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 81fa83133c2c..06ae0075e864 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4839,9 +4839,9 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, vfs_poll(efile.file, &event->pt); - spin_lock(&memcg->event_list_lock); + spin_lock_irq(&memcg->event_list_lock); list_add(&event->list, &memcg->event_list); - spin_unlock(&memcg->event_list_lock); + spin_unlock_irq(&memcg->event_list_lock); fdput(cfile); fdput(efile); @@ -5268,12 +5268,12 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) * Notify userspace about cgroup removing only after rmdir of cgroup * directory to avoid race between userspace and kernelspace. */ - spin_lock(&memcg->event_list_lock); + spin_lock_irq(&memcg->event_list_lock); list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { list_del_init(&event->list); schedule_work(&event->remove); } - spin_unlock(&memcg->event_list_lock); + spin_unlock_irq(&memcg->event_list_lock); page_counter_set_min(&memcg->memory, 0); page_counter_set_low(&memcg->memory, 0); -- cgit From 9647875be52b33fe22cb034ec3074896c581543f Mon Sep 17 00:00:00 2001 From: Hui Su Date: Thu, 2 Sep 2021 14:59:36 -0700 Subject: mm/vmpressure: replace vmpressure_to_css() with vmpressure_to_memcg() We can get memcg directly form vmpr instead of vmpr->memcg->css->memcg, so add a new func helper vmpressure_to_memcg(). And no code will use vmpressure_to_css(), so delete it. Link: https://lkml.kernel.org/r/20210630112146.455103-1-suhui@zeku.com Signed-off-by: Hui Su Acked-by: Michal Hocko Acked-by: Chris Down Cc: Johannes Weiner Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 06ae0075e864..896f0f403c52 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -256,9 +256,9 @@ struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg) return &memcg->vmpressure; } -struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr) +struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr) { - return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; + return container_of(vmpr, struct mem_cgroup, vmpressure); } #ifdef CONFIG_MEMCG_KMEM -- cgit