summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--include/linux/memcontrol.h8
-rw-r--r--mm/memcontrol.c68
-rw-r--r--mm/vmscan.c2
-rw-r--r--mm/workingset.c10
-rw-r--r--mm/zswap.c2
5 files changed, 52 insertions, 38 deletions
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index a308c8eacf20..43b77363ab8e 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1051,8 +1051,8 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
return x;
}
-void mem_cgroup_flush_stats(void);
-void mem_cgroup_flush_stats_ratelimited(void);
+void mem_cgroup_flush_stats(struct mem_cgroup *memcg);
+void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg);
void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
int val);
@@ -1563,11 +1563,11 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
return node_page_state(lruvec_pgdat(lruvec), idx);
}
-static inline void mem_cgroup_flush_stats(void)
+static inline void mem_cgroup_flush_stats(struct mem_cgroup *memcg)
{
}
-static inline void mem_cgroup_flush_stats_ratelimited(void)
+static inline void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg)
{
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c5aa0c2cb68b..b08b9cd4a3a8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -670,7 +670,6 @@ struct memcg_vmstats {
*/
static void flush_memcg_stats_dwork(struct work_struct *w);
static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
-static atomic_t stats_flush_ongoing = ATOMIC_INIT(0);
static u64 flush_last_time;
#define FLUSH_TIME (2UL*HZ)
@@ -731,35 +730,40 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
}
}
-static void do_flush_stats(void)
+static void do_flush_stats(struct mem_cgroup *memcg)
{
- /*
- * We always flush the entire tree, so concurrent flushers can just
- * skip. This avoids a thundering herd problem on the rstat global lock
- * from memcg flushers (e.g. reclaim, refault, etc).
- */
- if (atomic_read(&stats_flush_ongoing) ||
- atomic_xchg(&stats_flush_ongoing, 1))
- return;
-
- WRITE_ONCE(flush_last_time, jiffies_64);
-
- cgroup_rstat_flush(root_mem_cgroup->css.cgroup);
+ if (mem_cgroup_is_root(memcg))
+ WRITE_ONCE(flush_last_time, jiffies_64);
- atomic_set(&stats_flush_ongoing, 0);
+ cgroup_rstat_flush(memcg->css.cgroup);
}
-void mem_cgroup_flush_stats(void)
+/*
+ * mem_cgroup_flush_stats - flush the stats of a memory cgroup subtree
+ * @memcg: root of the subtree to flush
+ *
+ * Flushing is serialized by the underlying global rstat lock. There is also a
+ * minimum amount of work to be done even if there are no stat updates to flush.
+ * Hence, we only flush the stats if the updates delta exceeds a threshold. This
+ * avoids unnecessary work and contention on the underlying lock.
+ */
+void mem_cgroup_flush_stats(struct mem_cgroup *memcg)
{
- if (memcg_should_flush_stats(root_mem_cgroup))
- do_flush_stats();
+ if (mem_cgroup_disabled())
+ return;
+
+ if (!memcg)
+ memcg = root_mem_cgroup;
+
+ if (memcg_should_flush_stats(memcg))
+ do_flush_stats(memcg);
}
-void mem_cgroup_flush_stats_ratelimited(void)
+void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg)
{
/* Only flush if the periodic flusher is one full cycle late */
if (time_after64(jiffies_64, READ_ONCE(flush_last_time) + 2*FLUSH_TIME))
- mem_cgroup_flush_stats();
+ mem_cgroup_flush_stats(memcg);
}
static void flush_memcg_stats_dwork(struct work_struct *w)
@@ -768,7 +772,7 @@ static void flush_memcg_stats_dwork(struct work_struct *w)
* Deliberately ignore memcg_should_flush_stats() here so that flushing
* in latency-sensitive paths is as cheap as possible.
*/
- do_flush_stats();
+ do_flush_stats(root_mem_cgroup);
queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
}
@@ -1643,7 +1647,7 @@ static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
*
* Current memory state:
*/
- mem_cgroup_flush_stats();
+ mem_cgroup_flush_stats(memcg);
for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
u64 size;
@@ -4193,7 +4197,7 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v)
int nid;
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
- mem_cgroup_flush_stats();
+ mem_cgroup_flush_stats(memcg);
for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
seq_printf(m, "%s=%lu", stat->name,
@@ -4274,7 +4278,7 @@ static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
- mem_cgroup_flush_stats();
+ mem_cgroup_flush_stats(memcg);
for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
unsigned long nr;
@@ -4770,7 +4774,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
struct mem_cgroup *parent;
- mem_cgroup_flush_stats();
+ mem_cgroup_flush_stats(memcg);
*pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
*pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
@@ -6865,7 +6869,7 @@ static int memory_numa_stat_show(struct seq_file *m, void *v)
int i;
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
- mem_cgroup_flush_stats();
+ mem_cgroup_flush_stats(memcg);
for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
int nid;
@@ -8096,7 +8100,11 @@ bool obj_cgroup_may_zswap(struct obj_cgroup *objcg)
break;
}
- cgroup_rstat_flush(memcg->css.cgroup);
+ /*
+ * mem_cgroup_flush_stats() ignores small changes. Use
+ * do_flush_stats() directly to get accurate stats for charging.
+ */
+ do_flush_stats(memcg);
pages = memcg_page_state(memcg, MEMCG_ZSWAP_B) / PAGE_SIZE;
if (pages < max)
continue;
@@ -8161,8 +8169,10 @@ void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size)
static u64 zswap_current_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
- cgroup_rstat_flush(css->cgroup);
- return memcg_page_state(mem_cgroup_from_css(css), MEMCG_ZSWAP_B);
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+ mem_cgroup_flush_stats(memcg);
+ return memcg_page_state(memcg, MEMCG_ZSWAP_B);
}
static int zswap_max_show(struct seq_file *m, void *v)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f0eba9ef3332..b4ca3563bcf4 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2226,7 +2226,7 @@ static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc)
* Flush the memory cgroup stats, so that we read accurate per-memcg
* lruvec stats for heuristics.
*/
- mem_cgroup_flush_stats();
+ mem_cgroup_flush_stats(sc->target_mem_cgroup);
/*
* Determine the scan balance between anon and file LRUs.
diff --git a/mm/workingset.c b/mm/workingset.c
index 6b9871f5a2e8..2a2a34234df9 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -464,8 +464,12 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset)
rcu_read_unlock();
- /* Flush stats (and potentially sleep) outside the RCU read section */
- mem_cgroup_flush_stats_ratelimited();
+ /*
+ * Flush stats (and potentially sleep) outside the RCU read section.
+ * XXX: With per-memcg flushing and thresholding, is ratelimiting
+ * still needed here?
+ */
+ mem_cgroup_flush_stats_ratelimited(eviction_memcg);
eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat);
refault = atomic_long_read(&eviction_lruvec->nonresident_age);
@@ -676,7 +680,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
struct lruvec *lruvec;
int i;
- mem_cgroup_flush_stats();
+ mem_cgroup_flush_stats(sc->memcg);
lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));
for (pages = 0, i = 0; i < NR_LRU_LISTS; i++)
pages += lruvec_page_state_local(lruvec,
diff --git a/mm/zswap.c b/mm/zswap.c
index 015425ed9003..ac31fec176e9 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -641,7 +641,7 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
return 0;
#ifdef CONFIG_MEMCG_KMEM
- mem_cgroup_flush_stats();
+ mem_cgroup_flush_stats(memcg);
nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT;
nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED);
#else