-rw-r--r--  Documentation/admin-guide/cgroup-v2.rst |  27
-rw-r--r--  include/linux/memcontrol.h              |  15
-rw-r--r--  include/linux/page_counter.h            |  11
-rw-r--r--  mm/memcontrol.c                         | 118
-rw-r--r--  mm/page_counter.c                       |  63
-rw-r--r--  mm/vmscan.c                             |  18
6 files changed, 202 insertions(+), 50 deletions(-)
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 7b56ca80e37a..e34d3c938729 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1001,6 +1001,29 @@ PAGE_SIZE multiple when read back.
The total amount of memory currently being used by the cgroup
and its descendants.
+ memory.min
+ A read-write single value file which exists on non-root
+ cgroups. The default is "0".
+
+ Hard memory protection. If the memory usage of a cgroup
+ is within its effective min boundary, the cgroup's memory
+ won't be reclaimed under any conditions. If there is no
+ unprotected reclaimable memory available, the OOM killer
+ is invoked.
+
+ Effective min boundary is limited by memory.min values of
+ all ancestor cgroups. If there is memory.min overcommitment
+ (child cgroup or cgroups are requiring more protected memory
+ than parent will allow), then each child cgroup will get
+ the part of parent's protection proportional to its
+ actual memory usage below memory.min.
+
+ Putting more memory than generally available under this
+ protection is discouraged and may lead to constant OOMs.
+
+ If a memory cgroup is not populated with processes,
+ its memory.min is ignored.
+
memory.low
A read-write single value file which exists on non-root
cgroups. The default is "0".
@@ -1012,9 +1035,9 @@ PAGE_SIZE multiple when read back.
Effective low boundary is limited by memory.low values of
all ancestor cgroups. If there is memory.low overcommitment
- (child cgroup or cgroups are requiring more protected memory,
+ (child cgroup or cgroups are requiring more protected memory
than parent will allow), then each child cgroup will get
- the part of parent's protection proportional to the its
+ the part of parent's protection proportional to its
actual memory usage below memory.low.
Putting more memory than generally available under this
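
The memory.min interface added above is a plain cgroup v2 control file, so configuring it is just a write to memory.min in the cgroup's directory. As a minimal sketch (not part of the patch), assuming cgroup2 is mounted at /sys/fs/cgroup and a child group named "mygroup" already exists, a userspace helper could look like this; the path and the 512M value are illustrative assumptions.

/*
 * Hypothetical helper: set memory.min for a cgroup v2 group.  The
 * mount point and group name are assumptions for illustration only.
 */
#include <stdio.h>

static int set_memory_min(const char *cgroup, unsigned long long bytes)
{
        char path[256];
        FILE *f;

        snprintf(path, sizeof(path), "/sys/fs/cgroup/%s/memory.min", cgroup);
        f = fopen(path, "w");
        if (!f)
                return -1;
        /* memory_min_write() below also accepts the literal string "max" */
        fprintf(f, "%llu\n", bytes);
        return fclose(f);
}

int main(void)
{
        /* Protect roughly 512M of "mygroup" (illustrative value). */
        return set_memory_min("mygroup", 512ULL << 20) ? 1 : 0;
}

Writing "max" is accepted as well, handled by page_counter_memparse() in memory_min_write() in the mm/memcontrol.c hunk further down.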
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 10d741e8fe51..9c04cf8e6487 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -58,6 +58,12 @@ enum memcg_memory_event {
MEMCG_NR_MEMORY_EVENTS,
};
+enum mem_cgroup_protection {
+ MEMCG_PROT_NONE,
+ MEMCG_PROT_LOW,
+ MEMCG_PROT_MIN,
+};
+
struct mem_cgroup_reclaim_cookie {
pg_data_t *pgdat;
int priority;
@@ -289,7 +295,8 @@ static inline bool mem_cgroup_disabled(void)
return !cgroup_subsys_enabled(memory_cgrp_subsys);
}
-bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg);
+enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
+ struct mem_cgroup *memcg);
int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
gfp_t gfp_mask, struct mem_cgroup **memcgp,
@@ -734,10 +741,10 @@ static inline void memcg_memory_event(struct mem_cgroup *memcg,
{
}
-static inline bool mem_cgroup_low(struct mem_cgroup *root,
- struct mem_cgroup *memcg)
+static inline enum mem_cgroup_protection mem_cgroup_protected(
+ struct mem_cgroup *root, struct mem_cgroup *memcg)
{
- return false;
+ return MEMCG_PROT_NONE;
}
static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
index 7902a727d3b6..bab7e57f659b 100644
--- a/include/linux/page_counter.h
+++ b/include/linux/page_counter.h
@@ -8,10 +8,16 @@
struct page_counter {
atomic_long_t usage;
- unsigned long max;
+ unsigned long min;
unsigned long low;
+ unsigned long max;
struct page_counter *parent;
+ /* effective memory.min and memory.min usage tracking */
+ unsigned long emin;
+ atomic_long_t min_usage;
+ atomic_long_t children_min_usage;
+
/* effective memory.low and memory.low usage tracking */
unsigned long elow;
atomic_long_t low_usage;
@@ -47,8 +53,9 @@ bool page_counter_try_charge(struct page_counter *counter,
unsigned long nr_pages,
struct page_counter **fail);
void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages);
-int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages);
+void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages);
void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages);
+int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages);
int page_counter_memparse(const char *buf, const char *max,
unsigned long *nr_pages);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e6de0d6a3a8d..e3d56927a724 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4275,6 +4275,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
}
spin_unlock(&memcg->event_list_lock);
+ page_counter_set_min(&memcg->memory, 0);
page_counter_set_low(&memcg->memory, 0);
memcg_offline_kmem(memcg);
@@ -4329,6 +4330,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX);
page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
+ page_counter_set_min(&memcg->memory, 0);
page_counter_set_low(&memcg->memory, 0);
memcg->high = PAGE_COUNTER_MAX;
memcg->soft_limit = PAGE_COUNTER_MAX;
@@ -5066,6 +5068,36 @@ static u64 memory_current_read(struct cgroup_subsys_state *css,
return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
}
+static int memory_min_show(struct seq_file *m, void *v)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ unsigned long min = READ_ONCE(memcg->memory.min);
+
+ if (min == PAGE_COUNTER_MAX)
+ seq_puts(m, "max\n");
+ else
+ seq_printf(m, "%llu\n", (u64)min * PAGE_SIZE);
+
+ return 0;
+}
+
+static ssize_t memory_min_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ unsigned long min;
+ int err;
+
+ buf = strstrip(buf);
+ err = page_counter_memparse(buf, "max", &min);
+ if (err)
+ return err;
+
+ page_counter_set_min(&memcg->memory, min);
+
+ return nbytes;
+}
+
static int memory_low_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
@@ -5301,6 +5333,12 @@ static struct cftype memory_files[] = {
.read_u64 = memory_current_read,
},
{
+ .name = "min",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = memory_min_show,
+ .write = memory_min_write,
+ },
+ {
.name = "low",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = memory_low_show,
@@ -5349,19 +5387,24 @@ struct cgroup_subsys memory_cgrp_subsys = {
};
/**
- * mem_cgroup_low - check if memory consumption is in the normal range
+ * mem_cgroup_protected - check if memory consumption is in the normal range
* @root: the top ancestor of the sub-tree being checked
* @memcg: the memory cgroup to check
*
* WARNING: This function is not stateless! It can only be used as part
* of a top-down tree iteration, not for isolated queries.
*
- * Returns %true if memory consumption of @memcg is in the normal range.
+ * Returns one of the following:
+ * MEMCG_PROT_NONE: cgroup memory is not protected
+ * MEMCG_PROT_LOW: cgroup memory is protected as long as there is
+ * an unprotected supply of reclaimable memory from other cgroups.
+ * MEMCG_PROT_MIN: cgroup memory is protected
*
- * @root is exclusive; it is never low when looked at directly
+ * @root is exclusive; it is never protected when looked at directly
*
- * To provide a proper hierarchical behavior, effective memory.low value
- * is used.
+ * To provide a proper hierarchical behavior, effective memory.min/low values
+ * are used. Below is the description of how effective memory.low is calculated.
+ * The effective memory.min value is calculated in the same way.
*
* Effective memory.low is always equal or less than the original memory.low.
* If there is no memory.low overcommittment (which is always true for
@@ -5406,51 +5449,78 @@ struct cgroup_subsys memory_cgrp_subsys = {
* E/memory.current = 0
*
* These calculations require constant tracking of the actual low usages
- * (see propagate_low_usage()), as well as recursive calculation of
- * effective memory.low values. But as we do call mem_cgroup_low()
+ * (see propagate_protected_usage()), as well as recursive calculation of
+ * effective memory.low values. But as we do call mem_cgroup_protected()
* path for each memory cgroup top-down from the reclaim,
* it's possible to optimize this part, and save calculated elow
* for next usage. This part is intentionally racy, but it's ok,
* as memory.low is a best-effort mechanism.
*/
-bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
+enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
+ struct mem_cgroup *memcg)
{
- unsigned long usage, low_usage, siblings_low_usage;
- unsigned long elow, parent_elow;
struct mem_cgroup *parent;
+ unsigned long emin, parent_emin;
+ unsigned long elow, parent_elow;
+ unsigned long usage;
if (mem_cgroup_disabled())
- return false;
+ return MEMCG_PROT_NONE;
if (!root)
root = root_mem_cgroup;
if (memcg == root)
- return false;
+ return MEMCG_PROT_NONE;
- elow = memcg->memory.low;
usage = page_counter_read(&memcg->memory);
- parent = parent_mem_cgroup(memcg);
+ if (!usage)
+ return MEMCG_PROT_NONE;
+
+ emin = memcg->memory.min;
+ elow = memcg->memory.low;
+ parent = parent_mem_cgroup(memcg);
if (parent == root)
goto exit;
+ parent_emin = READ_ONCE(parent->memory.emin);
+ emin = min(emin, parent_emin);
+ if (emin && parent_emin) {
+ unsigned long min_usage, siblings_min_usage;
+
+ min_usage = min(usage, memcg->memory.min);
+ siblings_min_usage = atomic_long_read(
+ &parent->memory.children_min_usage);
+
+ if (min_usage && siblings_min_usage)
+ emin = min(emin, parent_emin * min_usage /
+ siblings_min_usage);
+ }
+
parent_elow = READ_ONCE(parent->memory.elow);
elow = min(elow, parent_elow);
+ if (elow && parent_elow) {
+ unsigned long low_usage, siblings_low_usage;
- if (!elow || !parent_elow)
- goto exit;
+ low_usage = min(usage, memcg->memory.low);
+ siblings_low_usage = atomic_long_read(
+ &parent->memory.children_low_usage);
- low_usage = min(usage, memcg->memory.low);
- siblings_low_usage = atomic_long_read(
- &parent->memory.children_low_usage);
-
- if (!low_usage || !siblings_low_usage)
- goto exit;
+ if (low_usage && siblings_low_usage)
+ elow = min(elow, parent_elow * low_usage /
+ siblings_low_usage);
+ }
- elow = min(elow, parent_elow * low_usage / siblings_low_usage);
exit:
+ memcg->memory.emin = emin;
memcg->memory.elow = elow;
- return usage && usage <= elow;
+
+ if (usage <= emin)
+ return MEMCG_PROT_MIN;
+ else if (usage <= elow)
+ return MEMCG_PROT_LOW;
+ else
+ return MEMCG_PROT_NONE;
}
/**
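
To make the proportional clamping in mem_cgroup_protected() above easier to follow, here is a small self-contained model (plain C, not kernel code) of the min-side computation, emin = min(emin, parent_emin * min_usage / siblings_min_usage). The hierarchy and the 1G/600M/800M numbers are invented purely for illustration.

/*
 * Standalone model of the effective memory.min clamping:
 *   emin = min(emin, parent_emin * min_usage / siblings_min_usage)
 * Values are in megabytes to keep the arithmetic readable.
 */
#include <stdio.h>

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

static unsigned long effective_min(unsigned long my_min, unsigned long usage,
                                   unsigned long parent_emin,
                                   unsigned long siblings_min_usage)
{
        unsigned long emin = min_ul(my_min, parent_emin);
        unsigned long min_usage = min_ul(usage, my_min);

        if (emin && parent_emin && min_usage && siblings_min_usage)
                emin = min_ul(emin, parent_emin * min_usage / siblings_min_usage);
        return emin;
}

int main(void)
{
        /*
         * Parent's effective min is 1G; two children declare 600M and
         * 800M of memory.min but use 500M and 700M below it, so
         * children_min_usage is 500 + 700 = 1200M.
         */
        unsigned long parent_emin = 1024;
        unsigned long sib = 500 + 700;

        printf("child A emin ~= %luM\n", effective_min(600, 500, parent_emin, sib));
        printf("child B emin ~= %luM\n", effective_min(800, 700, parent_emin, sib));
        return 0;
}

With these made-up numbers the children end up with roughly 426M and 597M of effective protection, i.e. the parent's 1G is split in proportion to each child's protected usage, exactly as the overcommitment rule in the documentation describes.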
diff --git a/mm/page_counter.c b/mm/page_counter.c
index a5ff4cbc355a..de31470655f6 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -13,26 +13,38 @@
#include <linux/bug.h>
#include <asm/page.h>
-static void propagate_low_usage(struct page_counter *c, unsigned long usage)
+static void propagate_protected_usage(struct page_counter *c,
+ unsigned long usage)
{
- unsigned long low_usage, old;
+ unsigned long protected, old_protected;
long delta;
if (!c->parent)
return;
- if (!c->low && !atomic_long_read(&c->low_usage))
- return;
+ if (c->min || atomic_long_read(&c->min_usage)) {
+ if (usage <= c->min)
+ protected = usage;
+ else
+ protected = 0;
+
+ old_protected = atomic_long_xchg(&c->min_usage, protected);
+ delta = protected - old_protected;
+ if (delta)
+ atomic_long_add(delta, &c->parent->children_min_usage);
+ }
- if (usage <= c->low)
- low_usage = usage;
- else
- low_usage = 0;
+ if (c->low || atomic_long_read(&c->low_usage)) {
+ if (usage <= c->low)
+ protected = usage;
+ else
+ protected = 0;
- old = atomic_long_xchg(&c->low_usage, low_usage);
- delta = low_usage - old;
- if (delta)
- atomic_long_add(delta, &c->parent->children_low_usage);
+ old_protected = atomic_long_xchg(&c->low_usage, protected);
+ delta = protected - old_protected;
+ if (delta)
+ atomic_long_add(delta, &c->parent->children_low_usage);
+ }
}
/**
@@ -45,7 +57,7 @@ void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages)
long new;
new = atomic_long_sub_return(nr_pages, &counter->usage);
- propagate_low_usage(counter, new);
+ propagate_protected_usage(counter, new);
/* More uncharges than charges? */
WARN_ON_ONCE(new < 0);
}
@@ -65,7 +77,7 @@ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
long new;
new = atomic_long_add_return(nr_pages, &c->usage);
- propagate_low_usage(counter, new);
+ propagate_protected_usage(counter, new);
/*
* This is indeed racy, but we can live with some
* inaccuracy in the watermark.
@@ -109,7 +121,7 @@ bool page_counter_try_charge(struct page_counter *counter,
new = atomic_long_add_return(nr_pages, &c->usage);
if (new > c->max) {
atomic_long_sub(nr_pages, &c->usage);
- propagate_low_usage(counter, new);
+ propagate_protected_usage(counter, new);
/*
* This is racy, but we can live with some
* inaccuracy in the failcnt.
@@ -118,7 +130,7 @@ bool page_counter_try_charge(struct page_counter *counter,
*fail = c;
goto failed;
}
- propagate_low_usage(counter, new);
+ propagate_protected_usage(counter, new);
/*
* Just like with failcnt, we can live with some
* inaccuracy in the watermark.
@@ -191,6 +203,23 @@ int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages)
}
/**
+ * page_counter_set_min - set the amount of protected memory
+ * @counter: counter
+ * @nr_pages: value to set
+ *
+ * The caller must serialize invocations on the same counter.
+ */
+void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages)
+{
+ struct page_counter *c;
+
+ counter->min = nr_pages;
+
+ for (c = counter; c; c = c->parent)
+ propagate_protected_usage(c, atomic_long_read(&c->usage));
+}
+
+/**
* page_counter_set_low - set the amount of protected memory
* @counter: counter
* @nr_pages: value to set
@@ -204,7 +233,7 @@ void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages)
counter->low = nr_pages;
for (c = counter; c; c = c->parent)
- propagate_low_usage(c, atomic_long_read(&c->usage));
+ propagate_protected_usage(c, atomic_long_read(&c->usage));
}
/**
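
As a rough single-threaded model of the min-side bookkeeping that propagate_protected_usage() performs above (the kernel uses atomics and also handles the low side; this is an illustration, not kernel code): each counter publishes min(usage, min) as its protected usage, and only the delta is added to the parent's children_min_usage, which mem_cgroup_protected() later reads as siblings_min_usage.

/*
 * Simplified model of the min-side delta accounting in
 * propagate_protected_usage(); plain longs stand in for atomics.
 */
#include <stdio.h>

struct counter {
        long usage;
        long min;
        long min_usage;            /* last published protected usage */
        long children_min_usage;   /* sum of children's min_usage */
        struct counter *parent;
};

static void propagate_min_usage(struct counter *c, long usage)
{
        long protected_pages, delta;

        if (!c->parent)
                return;
        if (!c->min && !c->min_usage)
                return;

        protected_pages = (usage <= c->min) ? usage : 0;
        delta = protected_pages - c->min_usage;
        c->min_usage = protected_pages;
        if (delta)
                c->parent->children_min_usage += delta;
}

int main(void)
{
        struct counter parent = { .min = 1200 };
        struct counter child = { .min = 600, .parent = &parent };

        child.usage = 500;                       /* charge 500 pages */
        propagate_min_usage(&child, child.usage);
        printf("children_min_usage = %ld\n", parent.children_min_usage); /* 500 */

        child.usage = 700;                       /* grow past memory.min */
        propagate_min_usage(&child, child.usage);
        printf("children_min_usage = %ld\n", parent.children_min_usage); /* 0 */
        return 0;
}

Once a child's usage grows past its memory.min it stops contributing protected usage, so overcommitted siblings see a correspondingly larger share of the parent's protection on the next pass through mem_cgroup_protected().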
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 0e67b477ecef..03822f86f288 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2544,12 +2544,28 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
unsigned long reclaimed;
unsigned long scanned;
- if (mem_cgroup_low(root, memcg)) {
+ switch (mem_cgroup_protected(root, memcg)) {
+ case MEMCG_PROT_MIN:
+ /*
+ * Hard protection.
+ * If there is no reclaimable memory, OOM.
+ */
+ continue;
+ case MEMCG_PROT_LOW:
+ /*
+ * Soft protection.
+ * Respect the protection only as long as
+ * there is an unprotected supply
+ * of reclaimable memory from other cgroups.
+ */
if (!sc->memcg_low_reclaim) {
sc->memcg_low_skipped = 1;
continue;
}
memcg_memory_event(memcg, MEMCG_LOW);
+ break;
+ case MEMCG_PROT_NONE:
+ break;
}
reclaimed = sc->nr_reclaimed;
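
The MEMCG_LOW event raised above is exported through the cgroup's memory.events file as the "low" counter. A hypothetical watcher (not part of the patch; the cgroup path is an assumption) might read it like this:

/*
 * Hypothetical watcher: report the "low" event count from a cgroup's
 * memory.events file, which is bumped when reclaim proceeds below the
 * low boundary because sc->memcg_low_reclaim is set.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
        /* Path is an assumption; adjust to the cgroup being watched. */
        FILE *f = fopen("/sys/fs/cgroup/mygroup/memory.events", "r");
        char key[32];
        unsigned long long val;

        if (!f) {
                perror("memory.events");
                return 1;
        }
        while (fscanf(f, "%31s %llu", key, &val) == 2) {
                if (!strcmp(key, "low"))
                        printf("low events: %llu\n", val);
        }
        fclose(f);
        return 0;
}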