Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--  mm/vmscan.c  82
1 files changed, 76 insertions, 6 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e5d52d6a24af..dfefa1d99d1b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2459,17 +2459,80 @@ out:
*lru_pages = 0;
for_each_evictable_lru(lru) {
int file = is_file_lru(lru);
- unsigned long size;
+ unsigned long lruvec_size;
unsigned long scan;
+ unsigned long protection;
+
+ lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
+ protection = mem_cgroup_protection(memcg);
+
+ if (protection > 0) {
+ /*
+ * Scale a cgroup's reclaim pressure by proportioning
+ * its current usage to its memory.low or memory.min
+ * setting.
+ *
+ * This is important, as otherwise scanning aggression
+ * becomes extremely binary -- from nothing as we
+ * approach the memory protection threshold, to totally
+ * nominal as we exceed it. This results in requiring
+ * extremely liberal protection thresholds, and it
+ * means we get no protection at all if we set them
+ * too low, which is not ideal.
+ */
+ unsigned long cgroup_size = mem_cgroup_size(memcg);
+ unsigned long baseline = 0;
+
+ /*
+ * During the first pass of reclaim, we only consider
+ * cgroups in excess of their protection setting, but if
+ * that doesn't produce free pages, we come back for a
+ * second pass where we reclaim from all groups.
+ *
+ * To maintain fairness in both cases, the first pass
+ * targets groups in proportion to their overage, and
+ * the second pass targets groups in proportion to their
+ * protection utilization.
+ *
+ * So on the first pass, a group whose size is 130% of
+ * its protection will be targeted at 30% of its size.
+ * On the second pass, a group whose size is 40% of its
+ * protection will be targeted at 40% of its size.
+ */
+ if (!sc->memcg_low_reclaim)
+ baseline = lruvec_size;
+ scan = lruvec_size * cgroup_size / protection - baseline;
+
+ /*
+ * Don't allow the scan target to exceed the lruvec
+ * size, which otherwise could happen if usage exceeds
+ * 200% of protection in the normal case, or 100% of
+ * protection when sc->memcg_low_reclaim is set.
+ *
+ * This is important because other cgroups without
+ * memory.low have their scan target initially set to
+ * their lruvec size, so allowing values >100% of the
+ * lruvec size here could result in penalising cgroups
+ * with memory.low set even *more* than their peers in
+ * the case of large overages.
+ *
+ * Also, minimally target SWAP_CLUSTER_MAX pages to keep
+ * reclaim moving forwards.
+ */
+ scan = clamp(scan, SWAP_CLUSTER_MAX, lruvec_size);
+ } else {
+ scan = lruvec_size;
+ }
+
+ scan >>= sc->priority;
- size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
- scan = size >> sc->priority;
/*
* If the cgroup's already been deleted, make sure to
* scrape out the remaining cache.
*/
if (!scan && !mem_cgroup_online(memcg))
- scan = min(size, SWAP_CLUSTER_MAX);
+ scan = min(lruvec_size, SWAP_CLUSTER_MAX);
switch (scan_balance) {
case SCAN_EQUAL:
@@ -2489,7 +2552,7 @@ out:
case SCAN_ANON:
/* Scan one type exclusively */
if ((scan_balance == SCAN_FILE) != file) {
- size = 0;
+ lruvec_size = 0;
scan = 0;
}
break;
@@ -2498,7 +2561,7 @@ out:
BUG();
}
- *lru_pages += size;
+ *lru_pages += lruvec_size;
nr[lru] = scan;
}
}
@@ -2742,6 +2805,13 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
memcg_memory_event(memcg, MEMCG_LOW);
break;
case MEMCG_PROT_NONE:
+ /*
+ * All protection thresholds breached. We may
+ * still choose to vary the scan pressure
+ * applied based on how much the cgroup in
+ * question has exceeded its protection
+ * thresholds (see get_scan_count).
+ */
break;
}
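
To make the arithmetic above easier to check, here is a minimal userspace sketch (not kernel code) of the scan-target calculation that get_scan_count() gains in this patch. The helper names (scan_target, clamp_ul) and the page counts are invented for illustration; only the formula and the clamp to [SWAP_CLUSTER_MAX, lruvec_size] mirror the patch, and the subsequent "scan >>= sc->priority" shift is omitted.

#include <stdio.h>

#define SWAP_CLUSTER_MAX 32UL

static unsigned long clamp_ul(unsigned long val, unsigned long lo, unsigned long hi)
{
        return val < lo ? lo : (val > hi ? hi : val);
}

/*
 * Mirror of the patch's calculation: on the first pass
 * (memcg_low_reclaim == 0) the baseline is the lruvec size, so only the
 * overage is targeted; on the second pass the baseline is 0, so pressure
 * is proportional to protection utilisation.  The result is clamped to
 * [SWAP_CLUSTER_MAX, lruvec_size].  As in the patch, the first pass is
 * assumed to reach this point only for cgroups already above their
 * protection, so the subtraction does not underflow.
 */
static unsigned long scan_target(unsigned long lruvec_size,
                                 unsigned long cgroup_size,
                                 unsigned long protection,
                                 int memcg_low_reclaim)
{
        unsigned long baseline = memcg_low_reclaim ? 0 : lruvec_size;
        unsigned long scan;

        if (!protection)
                return lruvec_size;

        scan = lruvec_size * cgroup_size / protection - baseline;
        return clamp_ul(scan, SWAP_CLUSTER_MAX, lruvec_size);
}

int main(void)
{
        /* First pass: usage at 130% of protection -> 30% of the lruvec. */
        printf("first pass:  %lu\n", scan_target(10000, 1300, 1000, 0));
        /* Second pass: usage at 40% of protection -> 40% of the lruvec. */
        printf("second pass: %lu\n", scan_target(10000, 400, 1000, 1));
        return 0;
}

With these hypothetical numbers the first call prints 3000 pages (30% of the lruvec, matching the 130%-of-protection example in the comment) and the second prints 4000 (40%). Cgroups still within their protection are assumed to have been skipped earlier during the first pass, which is why only the second pass scales by protection utilisation.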