summaryrefslogtreecommitdiff
path: root/mm/vmpressure.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/vmpressure.c')
-rw-r--r--mm/vmpressure.c265
1 files changed, 186 insertions, 79 deletions
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index 736a6011c2c8..c197ed47bcc4 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Linux VM pressure
*
@@ -6,10 +7,6 @@
*
* Based on ideas from Andrew Morton, David Rientjes, KOSAKI Motohiro,
* Leonid Moiseichuk, Mel Gorman, Minchan Kim and Pekka Enberg.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
*/
#include <linux/cgroup.h>
@@ -19,6 +16,7 @@
#include <linux/mm.h>
#include <linux/vmstat.h>
#include <linux/eventfd.h>
+#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/printk.h>
#include <linux/vmpressure.h>
@@ -74,15 +72,9 @@ static struct vmpressure *work_to_vmpressure(struct work_struct *work)
return container_of(work, struct vmpressure, work);
}
-static struct vmpressure *cg_to_vmpressure(struct cgroup *cg)
-{
- return css_to_vmpressure(cgroup_subsys_state(cg, mem_cgroup_subsys_id));
-}
-
static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr)
{
- struct cgroup *cg = vmpressure_to_css(vmpr)->cgroup;
- struct mem_cgroup *memcg = mem_cgroup_from_cont(cg);
+ struct mem_cgroup *memcg = vmpressure_to_memcg(vmpr);
memcg = parent_mem_cgroup(memcg);
if (!memcg)
@@ -97,12 +89,25 @@ enum vmpressure_levels {
VMPRESSURE_NUM_LEVELS,
};
+enum vmpressure_modes {
+ VMPRESSURE_NO_PASSTHROUGH = 0,
+ VMPRESSURE_HIERARCHY,
+ VMPRESSURE_LOCAL,
+ VMPRESSURE_NUM_MODES,
+};
+
static const char * const vmpressure_str_levels[] = {
[VMPRESSURE_LOW] = "low",
[VMPRESSURE_MEDIUM] = "medium",
[VMPRESSURE_CRITICAL] = "critical",
};
+static const char * const vmpressure_str_modes[] = {
+ [VMPRESSURE_NO_PASSTHROUGH] = "default",
+ [VMPRESSURE_HIERARCHY] = "hierarchy",
+ [VMPRESSURE_LOCAL] = "local",
+};
+
static enum vmpressure_levels vmpressure_level(unsigned long pressure)
{
if (pressure >= vmpressure_level_critical)
@@ -116,9 +121,16 @@ static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned,
unsigned long reclaimed)
{
unsigned long scale = scanned + reclaimed;
- unsigned long pressure;
+ unsigned long pressure = 0;
/*
+ * reclaimed can be greater than scanned for things such as reclaimed
+ * slab pages. shrink_node() just adds reclaimed pages without a
+ * related increment to scanned pages.
+ */
+ if (reclaimed >= scanned)
+ goto out;
+ /*
* We calculate the ratio (in percents) of how many pages were
* scanned vs. reclaimed in a given time frame (window). Note that
* time is in VM reclaimer's "ticks", i.e. number of pages
@@ -128,6 +140,7 @@ static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned,
pressure = scale - (reclaimed * scale / scanned);
pressure = pressure * 100 / scale;
+out:
pr_debug("%s: %3lu (s: %lu r: %lu)\n", __func__, pressure,
scanned, reclaimed);
@@ -137,30 +150,31 @@ static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned,
struct vmpressure_event {
struct eventfd_ctx *efd;
enum vmpressure_levels level;
+ enum vmpressure_modes mode;
struct list_head node;
};
static bool vmpressure_event(struct vmpressure *vmpr,
- unsigned long scanned, unsigned long reclaimed)
+ const enum vmpressure_levels level,
+ bool ancestor, bool signalled)
{
struct vmpressure_event *ev;
- enum vmpressure_levels level;
- bool signalled = false;
-
- level = vmpressure_calc_level(scanned, reclaimed);
+ bool ret = false;
mutex_lock(&vmpr->events_lock);
-
list_for_each_entry(ev, &vmpr->events, node) {
- if (level >= ev->level) {
- eventfd_signal(ev->efd, 1);
- signalled = true;
- }
+ if (ancestor && ev->mode == VMPRESSURE_LOCAL)
+ continue;
+ if (signalled && ev->mode == VMPRESSURE_NO_PASSTHROUGH)
+ continue;
+ if (level < ev->level)
+ continue;
+ eventfd_signal(ev->efd);
+ ret = true;
}
-
mutex_unlock(&vmpr->events_lock);
- return signalled;
+ return ret;
}
static void vmpressure_work_fn(struct work_struct *work)
@@ -168,7 +182,11 @@ static void vmpressure_work_fn(struct work_struct *work)
struct vmpressure *vmpr = work_to_vmpressure(work);
unsigned long scanned;
unsigned long reclaimed;
+ enum vmpressure_levels level;
+ bool ancestor = false;
+ bool signalled = false;
+ spin_lock(&vmpr->sr_lock);
/*
* Several contexts might be calling vmpressure(), so it is
* possible that the work was rescheduled again before the old
@@ -177,23 +195,23 @@ static void vmpressure_work_fn(struct work_struct *work)
* here. No need for any locks here since we don't care if
* vmpr->reclaimed is in sync.
*/
- if (!vmpr->scanned)
+ scanned = vmpr->tree_scanned;
+ if (!scanned) {
+ spin_unlock(&vmpr->sr_lock);
return;
+ }
- mutex_lock(&vmpr->sr_lock);
- scanned = vmpr->scanned;
- reclaimed = vmpr->reclaimed;
- vmpr->scanned = 0;
- vmpr->reclaimed = 0;
- mutex_unlock(&vmpr->sr_lock);
+ reclaimed = vmpr->tree_reclaimed;
+ vmpr->tree_scanned = 0;
+ vmpr->tree_reclaimed = 0;
+ spin_unlock(&vmpr->sr_lock);
+
+ level = vmpressure_calc_level(scanned, reclaimed);
do {
- if (vmpressure_event(vmpr, scanned, reclaimed))
- break;
- /*
- * If not handled, propagate the event upward into the
- * hierarchy.
- */
+ if (vmpressure_event(vmpr, level, ancestor, signalled))
+ signalled = true;
+ ancestor = true;
} while ((vmpr = vmpressure_parent(vmpr)));
}
@@ -201,6 +219,7 @@ static void vmpressure_work_fn(struct work_struct *work)
* vmpressure() - Account memory pressure through scanned/reclaimed ratio
* @gfp: reclaimer's gfp mask
* @memcg: cgroup memory controller handle
+ * @tree: legacy subtree mode
* @scanned: number of pages scanned
* @reclaimed: number of pages reclaimed
*
@@ -208,12 +227,32 @@ static void vmpressure_work_fn(struct work_struct *work)
* "instantaneous" memory pressure (scanned/reclaimed ratio). The raw
* pressure index is then further refined and averaged over time.
*
+ * If @tree is set, vmpressure is in traditional userspace reporting
+ * mode: @memcg is considered the pressure root and userspace is
+ * notified of the entire subtree's reclaim efficiency.
+ *
+ * If @tree is not set, reclaim efficiency is recorded for @memcg, and
+ * only in-kernel users are notified.
+ *
* This function does not return any value.
*/
-void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
+void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
unsigned long scanned, unsigned long reclaimed)
{
- struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
+ struct vmpressure *vmpr;
+
+ if (mem_cgroup_disabled())
+ return;
+
+ /*
+ * The in-kernel users only care about the reclaim efficiency
+ * for this @memcg rather than the whole subtree, and there
+ * isn't and won't be any in-kernel user in a legacy cgroup.
+ */
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !tree)
+ return;
+
+ vmpr = memcg_to_vmpressure(memcg);
/*
* Here we only want to account pressure that userland is able to
@@ -240,15 +279,46 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
if (!scanned)
return;
- mutex_lock(&vmpr->sr_lock);
- vmpr->scanned += scanned;
- vmpr->reclaimed += reclaimed;
- scanned = vmpr->scanned;
- mutex_unlock(&vmpr->sr_lock);
-
- if (scanned < vmpressure_win || work_pending(&vmpr->work))
- return;
- schedule_work(&vmpr->work);
+ if (tree) {
+ spin_lock(&vmpr->sr_lock);
+ scanned = vmpr->tree_scanned += scanned;
+ vmpr->tree_reclaimed += reclaimed;
+ spin_unlock(&vmpr->sr_lock);
+
+ if (scanned < vmpressure_win)
+ return;
+ schedule_work(&vmpr->work);
+ } else {
+ enum vmpressure_levels level;
+
+ /* For now, no users for root-level efficiency */
+ if (!memcg || mem_cgroup_is_root(memcg))
+ return;
+
+ spin_lock(&vmpr->sr_lock);
+ scanned = vmpr->scanned += scanned;
+ reclaimed = vmpr->reclaimed += reclaimed;
+ if (scanned < vmpressure_win) {
+ spin_unlock(&vmpr->sr_lock);
+ return;
+ }
+ vmpr->scanned = vmpr->reclaimed = 0;
+ spin_unlock(&vmpr->sr_lock);
+
+ level = vmpressure_calc_level(scanned, reclaimed);
+
+ if (level > VMPRESSURE_LOW) {
+ /*
+ * Let the socket buffer allocator know that
+ * we are having trouble reclaiming LRU pages.
+ *
+ * For hysteresis keep the pressure state
+ * asserted for a second in which subsequent
+ * pressure events can occur.
+ */
+ mem_cgroup_set_socket_pressure(memcg);
+ }
+ }
}
/**
@@ -278,73 +348,94 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
* to the vmpressure() basically means that we signal 'critical'
* level.
*/
- vmpressure(gfp, memcg, vmpressure_win, 0);
+ vmpressure(gfp, memcg, true, vmpressure_win, 0);
}
+#define MAX_VMPRESSURE_ARGS_LEN (strlen("critical") + strlen("hierarchy") + 2)
+
/**
* vmpressure_register_event() - Bind vmpressure notifications to an eventfd
- * @cg: cgroup that is interested in vmpressure notifications
- * @cft: cgroup control files handle
+ * @memcg: memcg that is interested in vmpressure notifications
* @eventfd: eventfd context to link notifications with
- * @args: event arguments (used to set up a pressure level threshold)
+ * @args: event arguments (pressure level threshold, optional mode)
*
* This function associates eventfd context with the vmpressure
* infrastructure, so that the notifications will be delivered to the
- * @eventfd. The @args parameter is a string that denotes pressure level
- * threshold (one of vmpressure_str_levels, i.e. "low", "medium", or
- * "critical").
+ * @eventfd. The @args parameter is a comma-delimited string that denotes a
+ * pressure level threshold (one of vmpressure_str_levels, i.e. "low", "medium",
+ * or "critical") and an optional mode (one of vmpressure_str_modes, i.e.
+ * "hierarchy" or "local").
*
- * This function should not be used directly, just pass it to (struct
- * cftype).register_event, and then cgroup core will handle everything by
- * itself.
+ * To be used as memcg event method.
+ *
+ * Return: 0 on success, -ENOMEM on memory failure or -EINVAL if @args could
+ * not be parsed.
*/
-int vmpressure_register_event(struct cgroup *cg, struct cftype *cft,
+int vmpressure_register_event(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd, const char *args)
{
- struct vmpressure *vmpr = cg_to_vmpressure(cg);
+ struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
struct vmpressure_event *ev;
- int level;
+ enum vmpressure_modes mode = VMPRESSURE_NO_PASSTHROUGH;
+ enum vmpressure_levels level;
+ char *spec, *spec_orig;
+ char *token;
+ int ret = 0;
- for (level = 0; level < VMPRESSURE_NUM_LEVELS; level++) {
- if (!strcmp(vmpressure_str_levels[level], args))
- break;
- }
+ spec_orig = spec = kstrndup(args, MAX_VMPRESSURE_ARGS_LEN, GFP_KERNEL);
+ if (!spec)
+ return -ENOMEM;
- if (level >= VMPRESSURE_NUM_LEVELS)
- return -EINVAL;
+ /* Find required level */
+ token = strsep(&spec, ",");
+ ret = match_string(vmpressure_str_levels, VMPRESSURE_NUM_LEVELS, token);
+ if (ret < 0)
+ goto out;
+ level = ret;
+
+ /* Find optional mode */
+ token = strsep(&spec, ",");
+ if (token) {
+ ret = match_string(vmpressure_str_modes, VMPRESSURE_NUM_MODES, token);
+ if (ret < 0)
+ goto out;
+ mode = ret;
+ }
ev = kzalloc(sizeof(*ev), GFP_KERNEL);
- if (!ev)
- return -ENOMEM;
+ if (!ev) {
+ ret = -ENOMEM;
+ goto out;
+ }
ev->efd = eventfd;
ev->level = level;
+ ev->mode = mode;
mutex_lock(&vmpr->events_lock);
list_add(&ev->node, &vmpr->events);
mutex_unlock(&vmpr->events_lock);
-
- return 0;
+ ret = 0;
+out:
+ kfree(spec_orig);
+ return ret;
}
/**
* vmpressure_unregister_event() - Unbind eventfd from vmpressure
- * @cg: cgroup handle
- * @cft: cgroup control files handle
+ * @memcg: memcg handle
* @eventfd: eventfd context that was used to link vmpressure with the @cg
*
* This function does internal manipulations to detach the @eventfd from
* the vmpressure notifications, and then frees internal resources
* associated with the @eventfd (but the @eventfd itself is not freed).
*
- * This function should not be used directly, just pass it to (struct
- * cftype).unregister_event, and then cgroup core will handle everything
- * by itself.
+ * To be used as memcg event method.
*/
-void vmpressure_unregister_event(struct cgroup *cg, struct cftype *cft,
+void vmpressure_unregister_event(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd)
{
- struct vmpressure *vmpr = cg_to_vmpressure(cg);
+ struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
struct vmpressure_event *ev;
mutex_lock(&vmpr->events_lock);
@@ -367,8 +458,24 @@ void vmpressure_unregister_event(struct cgroup *cg, struct cftype *cft,
*/
void vmpressure_init(struct vmpressure *vmpr)
{
- mutex_init(&vmpr->sr_lock);
+ spin_lock_init(&vmpr->sr_lock);
mutex_init(&vmpr->events_lock);
INIT_LIST_HEAD(&vmpr->events);
INIT_WORK(&vmpr->work, vmpressure_work_fn);
}
+
+/**
+ * vmpressure_cleanup() - shuts down vmpressure control structure
+ * @vmpr: Structure to be cleaned up
+ *
+ * This function should be called before the structure in which it is
+ * embedded is cleaned up.
+ */
+void vmpressure_cleanup(struct vmpressure *vmpr)
+{
+ /*
+ * Make sure there is no pending work before eventfd infrastructure
+ * goes away.
+ */
+ flush_work(&vmpr->work);
+}