summaryrefslogtreecommitdiff
path: root/mm/memcontrol.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--mm/memcontrol.c852
1 files changed, 508 insertions, 344 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 0eda67376df4..ca052f2a4a0b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -66,7 +66,6 @@
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
-#include <net/tcp_memcontrol.h>
#include "slab.h"
#include <asm/uaccess.h>
@@ -83,6 +82,9 @@ struct mem_cgroup *root_mem_cgroup __read_mostly;
/* Socket memory accounting disabled? */
static bool cgroup_memory_nosocket;
+/* Kernel memory accounting disabled? */
+static bool cgroup_memory_nokmem;
+
/* Whether the swap controller is active */
#ifdef CONFIG_MEMCG_SWAP
int do_swap_account __read_mostly;
@@ -239,6 +241,7 @@ enum res_type {
_MEMSWAP,
_OOM_TYPE,
_KMEM,
+ _TCP,
};
#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
@@ -247,13 +250,6 @@ enum res_type {
/* Used for OOM nofiier */
#define OOM_CONTROL (0)
-/*
- * The memcg_create_mutex will be held whenever a new cgroup is created.
- * As a consequence, any change that needs to protect against new child cgroups
- * appearing has to hold it as well.
- */
-static DEFINE_MUTEX(memcg_create_mutex);
-
/* Some nice accessors for the vmpressure. */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
@@ -297,7 +293,7 @@ static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
return mem_cgroup_from_css(css);
}
-#ifdef CONFIG_MEMCG_KMEM
+#ifndef CONFIG_SLOB
/*
* This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
* The main reason for not using cgroup id for this:
@@ -349,7 +345,7 @@ void memcg_put_cache_ids(void)
DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
EXPORT_SYMBOL(memcg_kmem_enabled_key);
-#endif /* CONFIG_MEMCG_KMEM */
+#endif /* !CONFIG_SLOB */
static struct mem_cgroup_per_zone *
mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
@@ -370,13 +366,6 @@ mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
*
* If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
* is returned.
- *
- * XXX: The above description of behavior on the default hierarchy isn't
- * strictly true yet as replace_page_cache_page() can modify the
- * association before @page is released even on the default hierarchy;
- * however, the current and planned usages don't mix the the two functions
- * and replace_page_cache_page() will soon be updated to make the invariant
- * actually true.
*/
struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
{
@@ -896,17 +885,8 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
if (css == &root->css)
break;
- if (css_tryget(css)) {
- /*
- * Make sure the memcg is initialized:
- * mem_cgroup_css_online() orders the the
- * initialization against setting the flag.
- */
- if (smp_load_acquire(&memcg->initialized))
- break;
-
- css_put(css);
- }
+ if (css_tryget(css))
+ break;
memcg = NULL;
}
@@ -1233,7 +1213,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
pr_cont(":");
for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
- if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account())
+ if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
continue;
pr_cont(" %s:%luKB", mem_cgroup_stat_names[i],
K(mem_cgroup_read_stat(iter, i)));
@@ -1272,9 +1252,12 @@ static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
limit = memcg->memory.limit;
if (mem_cgroup_swappiness(memcg)) {
unsigned long memsw_limit;
+ unsigned long swap_limit;
memsw_limit = memcg->memsw.limit;
- limit = min(limit + total_swap_pages, memsw_limit);
+ swap_limit = memcg->swap.limit;
+ swap_limit = min(swap_limit, (unsigned long)total_swap_pages);
+ limit = min(limit + swap_limit, memsw_limit);
}
return limit;
}
@@ -2203,7 +2186,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg,
unlock_page_lru(page, isolated);
}
-#ifdef CONFIG_MEMCG_KMEM
+#ifndef CONFIG_SLOB
static int memcg_alloc_cache_id(void)
{
int id, size;
@@ -2378,16 +2361,17 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
struct page_counter *counter;
int ret;
- if (!memcg_kmem_is_active(memcg))
+ if (!memcg_kmem_online(memcg))
return 0;
- if (!page_counter_try_charge(&memcg->kmem, nr_pages, &counter))
- return -ENOMEM;
-
ret = try_charge(memcg, gfp, nr_pages);
- if (ret) {
- page_counter_uncharge(&memcg->kmem, nr_pages);
+ if (ret)
return ret;
+
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
+ !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) {
+ cancel_charge(memcg, nr_pages);
+ return -ENOMEM;
}
page->mem_cgroup = memcg;
@@ -2416,7 +2400,9 @@ void __memcg_kmem_uncharge(struct page *page, int order)
VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
- page_counter_uncharge(&memcg->kmem, nr_pages);
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ page_counter_uncharge(&memcg->kmem, nr_pages);
+
page_counter_uncharge(&memcg->memory, nr_pages);
if (do_memsw_account())
page_counter_uncharge(&memcg->memsw, nr_pages);
@@ -2424,7 +2410,7 @@ void __memcg_kmem_uncharge(struct page *page, int order)
page->mem_cgroup = NULL;
css_put_many(&memcg->css, nr_pages);
}
-#endif /* CONFIG_MEMCG_KMEM */
+#endif /* !CONFIG_SLOB */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -2684,14 +2670,6 @@ static inline bool memcg_has_children(struct mem_cgroup *memcg)
{
bool ret;
- /*
- * The lock does not prevent addition or deletion of children, but
- * it prevents a new child from being initialized based on this
- * parent in css_online(), so it's enough to decide whether
- * hierarchically inherited attributes can still be changed or not.
- */
- lockdep_assert_held(&memcg_create_mutex);
-
rcu_read_lock();
ret = css_next_child(NULL, &memcg->css);
rcu_read_unlock();
@@ -2754,10 +2732,8 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent);
- mutex_lock(&memcg_create_mutex);
-
if (memcg->use_hierarchy == val)
- goto out;
+ return 0;
/*
* If parent's use_hierarchy is set, we can't make any modifications
@@ -2776,9 +2752,6 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
} else
retval = -EINVAL;
-out:
- mutex_unlock(&memcg_create_mutex);
-
return retval;
}
@@ -2794,6 +2767,18 @@ static unsigned long tree_stat(struct mem_cgroup *memcg,
return val;
}
+static unsigned long tree_events(struct mem_cgroup *memcg,
+ enum mem_cgroup_events_index idx)
+{
+ struct mem_cgroup *iter;
+ unsigned long val = 0;
+
+ for_each_mem_cgroup_tree(iter, memcg)
+ val += mem_cgroup_read_events(iter, idx);
+
+ return val;
+}
+
static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
{
unsigned long val;
@@ -2836,6 +2821,9 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
case _KMEM:
counter = &memcg->kmem;
break;
+ case _TCP:
+ counter = &memcg->tcpmem;
+ break;
default:
BUG();
}
@@ -2860,103 +2848,180 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
}
}
-#ifdef CONFIG_MEMCG_KMEM
-static int memcg_activate_kmem(struct mem_cgroup *memcg,
- unsigned long nr_pages)
+#ifndef CONFIG_SLOB
+static int memcg_online_kmem(struct mem_cgroup *memcg)
{
- int err = 0;
int memcg_id;
BUG_ON(memcg->kmemcg_id >= 0);
- BUG_ON(memcg->kmem_acct_activated);
- BUG_ON(memcg->kmem_acct_active);
-
- /*
- * For simplicity, we won't allow this to be disabled. It also can't
- * be changed if the cgroup has children already, or if tasks had
- * already joined.
- *
- * If tasks join before we set the limit, a person looking at
- * kmem.usage_in_bytes will have no way to determine when it took
- * place, which makes the value quite meaningless.
- *
- * After it first became limited, changes in the value of the limit are
- * of course permitted.
- */
- mutex_lock(&memcg_create_mutex);
- if (cgroup_is_populated(memcg->css.cgroup) ||
- (memcg->use_hierarchy && memcg_has_children(memcg)))
- err = -EBUSY;
- mutex_unlock(&memcg_create_mutex);
- if (err)
- goto out;
+ BUG_ON(memcg->kmem_state);
memcg_id = memcg_alloc_cache_id();
- if (memcg_id < 0) {
- err = memcg_id;
- goto out;
- }
-
- /*
- * We couldn't have accounted to this cgroup, because it hasn't got
- * activated yet, so this should succeed.
- */
- err = page_counter_limit(&memcg->kmem, nr_pages);
- VM_BUG_ON(err);
+ if (memcg_id < 0)
+ return memcg_id;
static_branch_inc(&memcg_kmem_enabled_key);
/*
- * A memory cgroup is considered kmem-active as soon as it gets
+ * A memory cgroup is considered kmem-online as soon as it gets
* kmemcg_id. Setting the id after enabling static branching will
* guarantee no one starts accounting before all call sites are
* patched.
*/
memcg->kmemcg_id = memcg_id;
- memcg->kmem_acct_activated = true;
- memcg->kmem_acct_active = true;
-out:
- return err;
+ memcg->kmem_state = KMEM_ONLINE;
+
+ return 0;
}
-static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
- unsigned long limit)
+static int memcg_propagate_kmem(struct mem_cgroup *parent,
+ struct mem_cgroup *memcg)
{
- int ret;
+ int ret = 0;
mutex_lock(&memcg_limit_mutex);
- if (!memcg_kmem_is_active(memcg))
- ret = memcg_activate_kmem(memcg, limit);
- else
- ret = page_counter_limit(&memcg->kmem, limit);
+ /*
+ * If the parent cgroup is not kmem-online now, it cannot be
+ * onlined after this point, because it has at least one child
+ * already.
+ */
+ if (memcg_kmem_online(parent) ||
+ (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nokmem))
+ ret = memcg_online_kmem(memcg);
mutex_unlock(&memcg_limit_mutex);
return ret;
}
-static int memcg_propagate_kmem(struct mem_cgroup *memcg)
+static void memcg_offline_kmem(struct mem_cgroup *memcg)
{
- int ret = 0;
- struct mem_cgroup *parent = parent_mem_cgroup(memcg);
+ struct cgroup_subsys_state *css;
+ struct mem_cgroup *parent, *child;
+ int kmemcg_id;
+
+ if (memcg->kmem_state != KMEM_ONLINE)
+ return;
+ /*
+ * Clear the online state before clearing memcg_caches array
+ * entries. The slab_mutex in memcg_deactivate_kmem_caches()
+ * guarantees that no cache will be created for this cgroup
+ * after we are done (see memcg_create_kmem_cache()).
+ */
+ memcg->kmem_state = KMEM_ALLOCATED;
+ memcg_deactivate_kmem_caches(memcg);
+
+ kmemcg_id = memcg->kmemcg_id;
+ BUG_ON(kmemcg_id < 0);
+
+ parent = parent_mem_cgroup(memcg);
if (!parent)
- return 0;
+ parent = root_mem_cgroup;
- mutex_lock(&memcg_limit_mutex);
/*
- * If the parent cgroup is not kmem-active now, it cannot be activated
- * after this point, because it has at least one child already.
+ * Change kmemcg_id of this cgroup and all its descendants to the
+ * parent's id, and then move all entries from this cgroup's list_lrus
+ * to ones of the parent. After we have finished, all list_lrus
+ * corresponding to this cgroup are guaranteed to remain empty. The
+ * ordering is imposed by list_lru_node->lock taken by
+ * memcg_drain_all_list_lrus().
*/
- if (memcg_kmem_is_active(parent))
- ret = memcg_activate_kmem(memcg, PAGE_COUNTER_MAX);
- mutex_unlock(&memcg_limit_mutex);
- return ret;
+ css_for_each_descendant_pre(css, &memcg->css) {
+ child = mem_cgroup_from_css(css);
+ BUG_ON(child->kmemcg_id != kmemcg_id);
+ child->kmemcg_id = parent->kmemcg_id;
+ if (!memcg->use_hierarchy)
+ break;
+ }
+ memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id);
+
+ memcg_free_cache_id(kmemcg_id);
+}
+
+static void memcg_free_kmem(struct mem_cgroup *memcg)
+{
+ /* css_alloc() failed, offlining didn't happen */
+ if (unlikely(memcg->kmem_state == KMEM_ONLINE))
+ memcg_offline_kmem(memcg);
+
+ if (memcg->kmem_state == KMEM_ALLOCATED) {
+ memcg_destroy_kmem_caches(memcg);
+ static_branch_dec(&memcg_kmem_enabled_key);
+ WARN_ON(page_counter_read(&memcg->kmem));
+ }
}
#else
+static int memcg_propagate_kmem(struct mem_cgroup *parent, struct mem_cgroup *memcg)
+{
+ return 0;
+}
+static int memcg_online_kmem(struct mem_cgroup *memcg)
+{
+ return 0;
+}
+static void memcg_offline_kmem(struct mem_cgroup *memcg)
+{
+}
+static void memcg_free_kmem(struct mem_cgroup *memcg)
+{
+}
+#endif /* !CONFIG_SLOB */
+
static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
unsigned long limit)
{
- return -EINVAL;
+ int ret = 0;
+
+ mutex_lock(&memcg_limit_mutex);
+ /* Top-level cgroup doesn't propagate from root */
+ if (!memcg_kmem_online(memcg)) {
+ if (cgroup_is_populated(memcg->css.cgroup) ||
+ (memcg->use_hierarchy && memcg_has_children(memcg)))
+ ret = -EBUSY;
+ if (ret)
+ goto out;
+ ret = memcg_online_kmem(memcg);
+ if (ret)
+ goto out;
+ }
+ ret = page_counter_limit(&memcg->kmem, limit);
+out:
+ mutex_unlock(&memcg_limit_mutex);
+ return ret;
+}
+
+static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit)
+{
+ int ret;
+
+ mutex_lock(&memcg_limit_mutex);
+
+ ret = page_counter_limit(&memcg->tcpmem, limit);
+ if (ret)
+ goto out;
+
+ if (!memcg->tcpmem_active) {
+ /*
+ * The active flag needs to be written after the static_key
+ * update. This is what guarantees that the socket activation
+ * function is the last one to run. See sock_update_memcg() for
+ * details, and note that we don't mark any socket as belonging
+ * to this memcg until that flag is up.
+ *
+ * We need to do this, because static_keys will span multiple
+ * sites, but we can't control their order. If we mark a socket
+ * as accounted, but the accounting functions are not patched in
+ * yet, we'll lose accounting.
+ *
+ * We never race with the readers in sock_update_memcg(),
+ * because when this value change, the code to process it is not
+ * patched in yet.
+ */
+ static_branch_inc(&memcg_sockets_enabled_key);
+ memcg->tcpmem_active = true;
+ }
+out:
+ mutex_unlock(&memcg_limit_mutex);
+ return ret;
}
-#endif /* CONFIG_MEMCG_KMEM */
/*
* The user of this function is...
@@ -2990,6 +3055,9 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
case _KMEM:
ret = memcg_update_kmem_limit(memcg, nr_pages);
break;
+ case _TCP:
+ ret = memcg_update_tcp_limit(memcg, nr_pages);
+ break;
}
break;
case RES_SOFT_LIMIT:
@@ -3016,6 +3084,9 @@ static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
case _KMEM:
counter = &memcg->kmem;
break;
+ case _TCP:
+ counter = &memcg->tcpmem;
+ break;
default:
BUG();
}
@@ -3582,88 +3653,6 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
return 0;
}
-#ifdef CONFIG_MEMCG_KMEM
-static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
-{
- int ret;
-
- ret = memcg_propagate_kmem(memcg);
- if (ret)
- return ret;
-
- return tcp_init_cgroup(memcg, ss);
-}
-
-static void memcg_deactivate_kmem(struct mem_cgroup *memcg)
-{
- struct cgroup_subsys_state *css;
- struct mem_cgroup *parent, *child;
- int kmemcg_id;
-
- if (!memcg->kmem_acct_active)
- return;
-
- /*
- * Clear the 'active' flag before clearing memcg_caches arrays entries.
- * Since we take the slab_mutex in memcg_deactivate_kmem_caches(), it
- * guarantees no cache will be created for this cgroup after we are
- * done (see memcg_create_kmem_cache()).
- */
- memcg->kmem_acct_active = false;
-
- memcg_deactivate_kmem_caches(memcg);
-
- kmemcg_id = memcg->kmemcg_id;
- BUG_ON(kmemcg_id < 0);
-
- parent = parent_mem_cgroup(memcg);
- if (!parent)
- parent = root_mem_cgroup;
-
- /*
- * Change kmemcg_id of this cgroup and all its descendants to the
- * parent's id, and then move all entries from this cgroup's list_lrus
- * to ones of the parent. After we have finished, all list_lrus
- * corresponding to this cgroup are guaranteed to remain empty. The
- * ordering is imposed by list_lru_node->lock taken by
- * memcg_drain_all_list_lrus().
- */
- css_for_each_descendant_pre(css, &memcg->css) {
- child = mem_cgroup_from_css(css);
- BUG_ON(child->kmemcg_id != kmemcg_id);
- child->kmemcg_id = parent->kmemcg_id;
- if (!memcg->use_hierarchy)
- break;
- }
- memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id);
-
- memcg_free_cache_id(kmemcg_id);
-}
-
-static void memcg_destroy_kmem(struct mem_cgroup *memcg)
-{
- if (memcg->kmem_acct_activated) {
- memcg_destroy_kmem_caches(memcg);
- static_branch_dec(&memcg_kmem_enabled_key);
- WARN_ON(page_counter_read(&memcg->kmem));
- }
- tcp_destroy_cgroup(memcg);
-}
-#else
-static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
-{
- return 0;
-}
-
-static void memcg_deactivate_kmem(struct mem_cgroup *memcg)
-{
-}
-
-static void memcg_destroy_kmem(struct mem_cgroup *memcg)
-{
-}
-#endif
-
#ifdef CONFIG_CGROUP_WRITEBACK
struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg)
@@ -4051,7 +4040,6 @@ static struct cftype mem_cgroup_legacy_files[] = {
.seq_show = memcg_numa_stat_show,
},
#endif
-#ifdef CONFIG_MEMCG_KMEM
{
.name = "kmem.limit_in_bytes",
.private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
@@ -4084,7 +4072,29 @@ static struct cftype mem_cgroup_legacy_files[] = {
.seq_show = memcg_slab_show,
},
#endif
-#endif
+ {
+ .name = "kmem.tcp.limit_in_bytes",
+ .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT),
+ .write = mem_cgroup_write,
+ .read_u64 = mem_cgroup_read_u64,
+ },
+ {
+ .name = "kmem.tcp.usage_in_bytes",
+ .private = MEMFILE_PRIVATE(_TCP, RES_USAGE),
+ .read_u64 = mem_cgroup_read_u64,
+ },
+ {
+ .name = "kmem.tcp.failcnt",
+ .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT),
+ .write = mem_cgroup_reset,
+ .read_u64 = mem_cgroup_read_u64,
+ },
+ {
+ .name = "kmem.tcp.max_usage_in_bytes",
+ .private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE),
+ .write = mem_cgroup_reset,
+ .read_u64 = mem_cgroup_read_u64,
+ },
{ }, /* terminate */
};
@@ -4123,147 +4133,92 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
kfree(memcg->nodeinfo[node]);
}
-static struct mem_cgroup *mem_cgroup_alloc(void)
-{
- struct mem_cgroup *memcg;
- size_t size;
-
- size = sizeof(struct mem_cgroup);
- size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
-
- memcg = kzalloc(size, GFP_KERNEL);
- if (!memcg)
- return NULL;
-
- memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
- if (!memcg->stat)
- goto out_free;
-
- if (memcg_wb_domain_init(memcg, GFP_KERNEL))
- goto out_free_stat;
-
- return memcg;
-
-out_free_stat:
- free_percpu(memcg->stat);
-out_free:
- kfree(memcg);
- return NULL;
-}
-
-/*
- * At destroying mem_cgroup, references from swap_cgroup can remain.
- * (scanning all at force_empty is too costly...)
- *
- * Instead of clearing all references at force_empty, we remember
- * the number of reference from swap_cgroup and free mem_cgroup when
- * it goes down to 0.
- *
- * Removal of cgroup itself succeeds regardless of refs from swap.
- */
-
-static void __mem_cgroup_free(struct mem_cgroup *memcg)
+static void mem_cgroup_free(struct mem_cgroup *memcg)
{
int node;
- cancel_work_sync(&memcg->high_work);
-
- mem_cgroup_remove_from_trees(memcg);
-
+ memcg_wb_domain_exit(memcg);
for_each_node(node)
free_mem_cgroup_per_zone_info(memcg, node);
-
free_percpu(memcg->stat);
- memcg_wb_domain_exit(memcg);
kfree(memcg);
}
-static struct cgroup_subsys_state * __ref
-mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
+static struct mem_cgroup *mem_cgroup_alloc(void)
{
struct mem_cgroup *memcg;
- long error = -ENOMEM;
+ size_t size;
int node;
- memcg = mem_cgroup_alloc();
+ size = sizeof(struct mem_cgroup);
+ size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
+
+ memcg = kzalloc(size, GFP_KERNEL);
if (!memcg)
- return ERR_PTR(error);
+ return NULL;
+
+ memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
+ if (!memcg->stat)
+ goto fail;
for_each_node(node)
if (alloc_mem_cgroup_per_zone_info(memcg, node))
- goto free_out;
+ goto fail;
- /* root ? */
- if (parent_css == NULL) {
- root_mem_cgroup = memcg;
- page_counter_init(&memcg->memory, NULL);
- memcg->high = PAGE_COUNTER_MAX;
- memcg->soft_limit = PAGE_COUNTER_MAX;
- page_counter_init(&memcg->memsw, NULL);
- page_counter_init(&memcg->kmem, NULL);
- }
+ if (memcg_wb_domain_init(memcg, GFP_KERNEL))
+ goto fail;
INIT_WORK(&memcg->high_work, high_work_func);
memcg->last_scanned_node = MAX_NUMNODES;
INIT_LIST_HEAD(&memcg->oom_notify);
- memcg->move_charge_at_immigrate = 0;
mutex_init(&memcg->thresholds_lock);
spin_lock_init(&memcg->move_lock);
vmpressure_init(&memcg->vmpressure);
INIT_LIST_HEAD(&memcg->event_list);
spin_lock_init(&memcg->event_list_lock);
-#ifdef CONFIG_MEMCG_KMEM
+ memcg->socket_pressure = jiffies;
+#ifndef CONFIG_SLOB
memcg->kmemcg_id = -1;
#endif
#ifdef CONFIG_CGROUP_WRITEBACK
INIT_LIST_HEAD(&memcg->cgwb_list);
#endif
-#ifdef CONFIG_INET
- memcg->socket_pressure = jiffies;
-#endif
- return &memcg->css;
-
-free_out:
- __mem_cgroup_free(memcg);
- return ERR_PTR(error);
+ return memcg;
+fail:
+ mem_cgroup_free(memcg);
+ return NULL;
}
-static int
-mem_cgroup_css_online(struct cgroup_subsys_state *css)
+static struct cgroup_subsys_state * __ref
+mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- struct mem_cgroup *parent = mem_cgroup_from_css(css->parent);
- int ret;
-
- if (css->id > MEM_CGROUP_ID_MAX)
- return -ENOSPC;
-
- if (!parent)
- return 0;
-
- mutex_lock(&memcg_create_mutex);
+ struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
+ struct mem_cgroup *memcg;
+ long error = -ENOMEM;
- memcg->use_hierarchy = parent->use_hierarchy;
- memcg->oom_kill_disable = parent->oom_kill_disable;
- memcg->swappiness = mem_cgroup_swappiness(parent);
+ memcg = mem_cgroup_alloc();
+ if (!memcg)
+ return ERR_PTR(error);
- if (parent->use_hierarchy) {
+ memcg->high = PAGE_COUNTER_MAX;
+ memcg->soft_limit = PAGE_COUNTER_MAX;
+ if (parent) {
+ memcg->swappiness = mem_cgroup_swappiness(parent);
+ memcg->oom_kill_disable = parent->oom_kill_disable;
+ }
+ if (parent && parent->use_hierarchy) {
+ memcg->use_hierarchy = true;
page_counter_init(&memcg->memory, &parent->memory);
- memcg->high = PAGE_COUNTER_MAX;
- memcg->soft_limit = PAGE_COUNTER_MAX;
+ page_counter_init(&memcg->swap, &parent->swap);
page_counter_init(&memcg->memsw, &parent->memsw);
page_counter_init(&memcg->kmem, &parent->kmem);
-
- /*
- * No need to take a reference to the parent because cgroup
- * core guarantees its existence.
- */
+ page_counter_init(&memcg->tcpmem, &parent->tcpmem);
} else {
page_counter_init(&memcg->memory, NULL);
- memcg->high = PAGE_COUNTER_MAX;
- memcg->soft_limit = PAGE_COUNTER_MAX;
+ page_counter_init(&memcg->swap, NULL);
page_counter_init(&memcg->memsw, NULL);
page_counter_init(&memcg->kmem, NULL);
+ page_counter_init(&memcg->tcpmem, NULL);
/*
* Deeper hierachy with use_hierarchy == false doesn't make
* much sense so let cgroup subsystem know about this
@@ -4272,23 +4227,31 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
if (parent != root_mem_cgroup)
memory_cgrp_subsys.broken_hierarchy = true;
}
- mutex_unlock(&memcg_create_mutex);
- ret = memcg_init_kmem(memcg, &memory_cgrp_subsys);
- if (ret)
- return ret;
+ /* The following stuff does not apply to the root */
+ if (!parent) {
+ root_mem_cgroup = memcg;
+ return &memcg->css;
+ }
+
+ error = memcg_propagate_kmem(parent, memcg);
+ if (error)
+ goto fail;
-#ifdef CONFIG_INET
if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
static_branch_inc(&memcg_sockets_enabled_key);
-#endif
- /*
- * Make sure the memcg is initialized: mem_cgroup_iter()
- * orders reading memcg->initialized against its callers
- * reading the memcg members.
- */
- smp_store_release(&memcg->initialized, 1);
+ return &memcg->css;
+fail:
+ mem_cgroup_free(memcg);
+ return NULL;
+}
+
+static int
+mem_cgroup_css_online(struct cgroup_subsys_state *css)
+{
+ if (css->id > MEM_CGROUP_ID_MAX)
+ return -ENOSPC;
return 0;
}
@@ -4310,10 +4273,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
}
spin_unlock(&memcg->event_list_lock);
- vmpressure_cleanup(&memcg->vmpressure);
-
- memcg_deactivate_kmem(memcg);
-
+ memcg_offline_kmem(memcg);
wb_memcg_offline(memcg);
}
@@ -4328,12 +4288,17 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- memcg_destroy_kmem(memcg);
-#ifdef CONFIG_INET
if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
static_branch_dec(&memcg_sockets_enabled_key);
-#endif
- __mem_cgroup_free(memcg);
+
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active)
+ static_branch_dec(&memcg_sockets_enabled_key);
+
+ vmpressure_cleanup(&memcg->vmpressure);
+ cancel_work_sync(&memcg->high_work);
+ mem_cgroup_remove_from_trees(memcg);
+ memcg_free_kmem(memcg);
+ mem_cgroup_free(memcg);
}
/**
@@ -5143,6 +5108,59 @@ static int memory_events_show(struct seq_file *m, void *v)
return 0;
}
+static int memory_stat_show(struct seq_file *m, void *v)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ int i;
+
+ /*
+ * Provide statistics on the state of the memory subsystem as
+ * well as cumulative event counters that show past behavior.
+ *
+ * This list is ordered following a combination of these gradients:
+ * 1) generic big picture -> specifics and details
+ * 2) reflecting userspace activity -> reflecting kernel heuristics
+ *
+ * Current memory state:
+ */
+
+ seq_printf(m, "anon %llu\n",
+ (u64)tree_stat(memcg, MEM_CGROUP_STAT_RSS) * PAGE_SIZE);
+ seq_printf(m, "file %llu\n",
+ (u64)tree_stat(memcg, MEM_CGROUP_STAT_CACHE) * PAGE_SIZE);
+ seq_printf(m, "sock %llu\n",
+ (u64)tree_stat(memcg, MEMCG_SOCK) * PAGE_SIZE);
+
+ seq_printf(m, "file_mapped %llu\n",
+ (u64)tree_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED) *
+ PAGE_SIZE);
+ seq_printf(m, "file_dirty %llu\n",
+ (u64)tree_stat(memcg, MEM_CGROUP_STAT_DIRTY) *
+ PAGE_SIZE);
+ seq_printf(m, "file_writeback %llu\n",
+ (u64)tree_stat(memcg, MEM_CGROUP_STAT_WRITEBACK) *
+ PAGE_SIZE);
+
+ for (i = 0; i < NR_LRU_LISTS; i++) {
+ struct mem_cgroup *mi;
+ unsigned long val = 0;
+
+ for_each_mem_cgroup_tree(mi, memcg)
+ val += mem_cgroup_nr_lru_pages(mi, BIT(i));
+ seq_printf(m, "%s %llu\n",
+ mem_cgroup_lru_names[i], (u64)val * PAGE_SIZE);
+ }
+
+ /* Accumulated memory events */
+
+ seq_printf(m, "pgfault %lu\n",
+ tree_events(memcg, MEM_CGROUP_EVENTS_PGFAULT));
+ seq_printf(m, "pgmajfault %lu\n",
+ tree_events(memcg, MEM_CGROUP_EVENTS_PGMAJFAULT));
+
+ return 0;
+}
+
static struct cftype memory_files[] = {
{
.name = "current",
@@ -5173,6 +5191,11 @@ static struct cftype memory_files[] = {
.file_offset = offsetof(struct mem_cgroup, events_file),
.seq_show = memory_events_show,
},
+ {
+ .name = "stat",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = memory_stat_show,
+ },
{ } /* terminate */
};
@@ -5269,7 +5292,7 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
if (page->mem_cgroup)
goto out;
- if (do_memsw_account()) {
+ if (do_swap_account) {
swp_entry_t ent = { .val = page_private(page), };
unsigned short id = lookup_swap_cgroup_id(ent);
@@ -5504,7 +5527,8 @@ void mem_cgroup_uncharge_list(struct list_head *page_list)
void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage)
{
struct mem_cgroup *memcg;
- int isolated;
+ unsigned int nr_pages;
+ bool compound;
VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
@@ -5524,14 +5548,22 @@ void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage)
if (!memcg)
return;
- lock_page_lru(oldpage, &isolated);
- oldpage->mem_cgroup = NULL;
- unlock_page_lru(oldpage, isolated);
+ /* Force-charge the new page. The old one will be freed soon */
+ compound = PageTransHuge(newpage);
+ nr_pages = compound ? hpage_nr_pages(newpage) : 1;
+
+ page_counter_charge(&memcg->memory, nr_pages);
+ if (do_memsw_account())
+ page_counter_charge(&memcg->memsw, nr_pages);
+ css_get_many(&memcg->css, nr_pages);
commit_charge(newpage, memcg, true);
-}
-#ifdef CONFIG_INET
+ local_irq_disable();
+ mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
+ memcg_check_events(memcg, newpage);
+ local_irq_enable();
+}
DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
EXPORT_SYMBOL(memcg_sockets_enabled_key);
@@ -5558,10 +5590,8 @@ void sock_update_memcg(struct sock *sk)
memcg = mem_cgroup_from_task(current);
if (memcg == root_mem_cgroup)
goto out;
-#ifdef CONFIG_MEMCG_KMEM
- if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcp_mem.active)
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active)
goto out;
-#endif
if (css_tryget_online(&memcg->css))
sk->sk_memcg = memcg;
out:
@@ -5587,24 +5617,24 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
{
gfp_t gfp_mask = GFP_KERNEL;
-#ifdef CONFIG_MEMCG_KMEM
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
- struct page_counter *counter;
+ struct page_counter *fail;
- if (page_counter_try_charge(&memcg->tcp_mem.memory_allocated,
- nr_pages, &counter)) {
- memcg->tcp_mem.memory_pressure = 0;
+ if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) {
+ memcg->tcpmem_pressure = 0;
return true;
}
- page_counter_charge(&memcg->tcp_mem.memory_allocated, nr_pages);
- memcg->tcp_mem.memory_pressure = 1;
+ page_counter_charge(&memcg->tcpmem, nr_pages);
+ memcg->tcpmem_pressure = 1;
return false;
}
-#endif
+
/* Don't block in the packet receive path */
if (in_softirq())
gfp_mask = GFP_NOWAIT;
+ this_cpu_add(memcg->stat->count[MEMCG_SOCK], nr_pages);
+
if (try_charge(memcg, gfp_mask, nr_pages) == 0)
return true;
@@ -5619,19 +5649,17 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
*/
void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
{
-#ifdef CONFIG_MEMCG_KMEM
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
- page_counter_uncharge(&memcg->tcp_mem.memory_allocated,
- nr_pages);
+ page_counter_uncharge(&memcg->tcpmem, nr_pages);
return;
}
-#endif
+
+ this_cpu_sub(memcg->stat->count[MEMCG_SOCK], nr_pages);
+
page_counter_uncharge(&memcg->memory, nr_pages);
css_put_many(&memcg->css, nr_pages);
}
-#endif /* CONFIG_INET */
-
static int __init cgroup_memory(char *s)
{
char *token;
@@ -5641,6 +5669,8 @@ static int __init cgroup_memory(char *s)
continue;
if (!strcmp(token, "nosocket"))
cgroup_memory_nosocket = true;
+ if (!strcmp(token, "nokmem"))
+ cgroup_memory_nokmem = true;
}
return 0;
}
@@ -5730,32 +5760,107 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
memcg_check_events(memcg, page);
}
+/*
+ * mem_cgroup_try_charge_swap - try charging a swap entry
+ * @page: page being added to swap
+ * @entry: swap entry to charge
+ *
+ * Try to charge @entry to the memcg that @page belongs to.
+ *
+ * Returns 0 on success, -ENOMEM on failure.
+ */
+int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
+{
+ struct mem_cgroup *memcg;
+ struct page_counter *counter;
+ unsigned short oldid;
+
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account)
+ return 0;
+
+ memcg = page->mem_cgroup;
+
+ /* Readahead page, never charged */
+ if (!memcg)
+ return 0;
+
+ if (!mem_cgroup_is_root(memcg) &&
+ !page_counter_try_charge(&memcg->swap, 1, &counter))
+ return -ENOMEM;
+
+ oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
+ VM_BUG_ON_PAGE(oldid, page);
+ mem_cgroup_swap_statistics(memcg, true);
+
+ css_get(&memcg->css);
+ return 0;
+}
+
/**
* mem_cgroup_uncharge_swap - uncharge a swap entry
* @entry: swap entry to uncharge
*
- * Drop the memsw charge associated with @entry.
+ * Drop the swap charge associated with @entry.
*/
void mem_cgroup_uncharge_swap(swp_entry_t entry)
{
struct mem_cgroup *memcg;
unsigned short id;
- if (!do_memsw_account())
+ if (!do_swap_account)
return;
id = swap_cgroup_record(entry, 0);
rcu_read_lock();
memcg = mem_cgroup_from_id(id);
if (memcg) {
- if (!mem_cgroup_is_root(memcg))
- page_counter_uncharge(&memcg->memsw, 1);
+ if (!mem_cgroup_is_root(memcg)) {
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ page_counter_uncharge(&memcg->swap, 1);
+ else
+ page_counter_uncharge(&memcg->memsw, 1);
+ }
mem_cgroup_swap_statistics(memcg, false);
css_put(&memcg->css);
}
rcu_read_unlock();
}
+long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
+{
+ long nr_swap_pages = get_nr_swap_pages();
+
+ if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ return nr_swap_pages;
+ for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
+ nr_swap_pages = min_t(long, nr_swap_pages,
+ READ_ONCE(memcg->swap.limit) -
+ page_counter_read(&memcg->swap));
+ return nr_swap_pages;
+}
+
+bool mem_cgroup_swap_full(struct page *page)
+{
+ struct mem_cgroup *memcg;
+
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+
+ if (vm_swap_full())
+ return true;
+ if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ return false;
+
+ memcg = page->mem_cgroup;
+ if (!memcg)
+ return false;
+
+ for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
+ if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.limit)
+ return true;
+
+ return false;
+}
+
/* for remember boot option*/
#ifdef CONFIG_MEMCG_SWAP_ENABLED
static int really_do_swap_account __initdata = 1;
@@ -5773,6 +5878,63 @@ static int __init enable_swap_account(char *s)
}
__setup("swapaccount=", enable_swap_account);
+static u64 swap_current_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+ return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
+}
+
+static int swap_max_show(struct seq_file *m, void *v)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ unsigned long max = READ_ONCE(memcg->swap.limit);
+
+ if (max == PAGE_COUNTER_MAX)
+ seq_puts(m, "max\n");
+ else
+ seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
+
+ return 0;
+}
+
+static ssize_t swap_max_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ unsigned long max;
+ int err;
+
+ buf = strstrip(buf);
+ err = page_counter_memparse(buf, "max", &max);
+ if (err)
+ return err;
+
+ mutex_lock(&memcg_limit_mutex);
+ err = page_counter_limit(&memcg->swap, max);
+ mutex_unlock(&memcg_limit_mutex);
+ if (err)
+ return err;
+
+ return nbytes;
+}
+
+static struct cftype swap_files[] = {
+ {
+ .name = "swap.current",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_u64 = swap_current_read,
+ },
+ {
+ .name = "swap.max",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = swap_max_show,
+ .write = swap_max_write,
+ },
+ { } /* terminate */
+};
+
static struct cftype memsw_cgroup_files[] = {
{
.name = "memsw.usage_in_bytes",
@@ -5804,6 +5966,8 @@ static int __init mem_cgroup_swap_init(void)
{
if (!mem_cgroup_disabled() && really_do_swap_account) {
do_swap_account = 1;
+ WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys,
+ swap_files));
WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
memsw_cgroup_files));
}