Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--  mm/vmscan.c  482
1 file changed, 321 insertions(+), 161 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f8dfd2864bbf..7de11524a936 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -57,6 +57,7 @@
#include <linux/rculist_nulls.h>
#include <linux/random.h>
#include <linux/mmu_notifier.h>
+#include <linux/parser.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -93,10 +94,8 @@ struct scan_control {
unsigned long anon_cost;
unsigned long file_cost;
-#ifdef CONFIG_MEMCG
/* Swappiness value for proactive reclaim. Always use sc_swappiness()! */
int *proactive_swappiness;
-#endif
/* Can active folios be deactivated as part of reclaim? */
#define DEACTIVATE_ANON 1
@@ -120,7 +119,7 @@ struct scan_control {
/* Has cache_trim_mode failed at least once? */
unsigned int cache_trim_mode_failed:1;
- /* Proactive reclaim invoked by userspace through memory.reclaim */
+ /* Proactive reclaim invoked by userspace */
unsigned int proactive:1;
/*
@@ -652,14 +651,45 @@ typedef enum {
PAGE_CLEAN,
} pageout_t;
+static pageout_t writeout(struct folio *folio, struct address_space *mapping,
+ struct swap_iocb **plug, struct list_head *folio_list)
+{
+ int res;
+
+ folio_set_reclaim(folio);
+
+ /*
+ * The large shmem folio can be split if CONFIG_THP_SWAP is not enabled
+ * or we failed to allocate contiguous swap entries, in which case
+ * the split out folios get added back to folio_list.
+ */
+ if (shmem_mapping(mapping))
+ res = shmem_writeout(folio, plug, folio_list);
+ else
+ res = swap_writeout(folio, plug);
+
+ if (res < 0)
+ handle_write_error(mapping, folio, res);
+ if (res == AOP_WRITEPAGE_ACTIVATE) {
+ folio_clear_reclaim(folio);
+ return PAGE_ACTIVATE;
+ }
+
+ /* synchronous write? */
+ if (!folio_test_writeback(folio))
+ folio_clear_reclaim(folio);
+
+ trace_mm_vmscan_write_folio(folio);
+ node_stat_add_folio(folio, NR_VMSCAN_WRITE);
+ return PAGE_SUCCESS;
+}
+
/*
* pageout is called by shrink_folio_list() for each dirty folio.
*/
static pageout_t pageout(struct folio *folio, struct address_space *mapping,
struct swap_iocb **plug, struct list_head *folio_list)
{
- int (*writeout)(struct folio *, struct writeback_control *);
-
/*
* We no longer attempt to writeback filesystem folios here, other
* than tmpfs/shmem. That's taken care of in page-writeback.
@@ -690,51 +720,12 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping,
}
return PAGE_KEEP;
}
- if (shmem_mapping(mapping))
- writeout = shmem_writeout;
- else if (folio_test_anon(folio))
- writeout = swap_writeout;
- else
- return PAGE_ACTIVATE;
- if (folio_clear_dirty_for_io(folio)) {
- int res;
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_NONE,
- .nr_to_write = SWAP_CLUSTER_MAX,
- .range_start = 0,
- .range_end = LLONG_MAX,
- .for_reclaim = 1,
- .swap_plug = plug,
- };
-
- /*
- * The large shmem folio can be split if CONFIG_THP_SWAP is
- * not enabled or contiguous swap entries are failed to
- * allocate.
- */
- if (shmem_mapping(mapping) && folio_test_large(folio))
- wbc.list = folio_list;
-
- folio_set_reclaim(folio);
- res = writeout(folio, &wbc);
- if (res < 0)
- handle_write_error(mapping, folio, res);
- if (res == AOP_WRITEPAGE_ACTIVATE) {
- folio_clear_reclaim(folio);
- return PAGE_ACTIVATE;
- }
-
- if (!folio_test_writeback(folio)) {
- /* synchronous write? */
- folio_clear_reclaim(folio);
- }
- trace_mm_vmscan_write_folio(folio);
- node_stat_add_folio(folio, NR_VMSCAN_WRITE);
- return PAGE_SUCCESS;
- }
-
- return PAGE_CLEAN;
+ if (!shmem_mapping(mapping) && !folio_test_anon(folio))
+ return PAGE_ACTIVATE;
+ if (!folio_clear_dirty_for_io(folio))
+ return PAGE_CLEAN;
+ return writeout(folio, mapping, plug, folio_list);
}
/*
@@ -915,7 +906,7 @@ static enum folio_references folio_check_references(struct folio *folio,
struct scan_control *sc)
{
int referenced_ptes, referenced_folio;
- unsigned long vm_flags;
+ vm_flags_t vm_flags;
referenced_ptes = folio_referenced(folio, 1, sc->target_mem_cgroup,
&vm_flags);
@@ -1014,7 +1005,8 @@ static void folio_check_dirty_writeback(struct folio *folio,
mapping->a_ops->is_dirty_writeback(folio, dirty, writeback);
}
-struct folio *alloc_migrate_folio(struct folio *src, unsigned long private)
+static struct folio *alloc_demote_folio(struct folio *src,
+ unsigned long private)
{
struct folio *dst;
nodemask_t *allowed_mask;
@@ -1077,7 +1069,7 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
node_get_allowed_targets(pgdat, &allowed_mask);
/* Demotion ignores all cpuset and mempolicy settings */
- migrate_pages(demote_folios, alloc_migrate_folio, NULL,
+ migrate_pages(demote_folios, alloc_demote_folio, NULL,
(unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION,
&nr_succeeded);
@@ -1138,6 +1130,14 @@ retry:
goto keep;
if (folio_contain_hwpoisoned_page(folio)) {
+ /*
+ * unmap_poisoned_folio() can't handle large
+ * folio, just skip it. memory_failure() will
+ * handle it if the UCE is triggered again.
+ */
+ if (folio_test_large(folio))
+ goto keep_locked;
+
unmap_poisoned_folio(folio, folio_pfn(folio), false);
folio_unlock(folio);
folio_put(folio);
@@ -1658,9 +1658,11 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
unsigned int noreclaim_flag;
list_for_each_entry_safe(folio, next, folio_list, lru) {
+ /* TODO: these pages should not even appear in this list. */
+ if (page_has_movable_ops(&folio->page))
+ continue;
if (!folio_test_hugetlb(folio) && folio_is_file_lru(folio) &&
- !folio_test_dirty(folio) && !__folio_test_movable(folio) &&
- !folio_test_unevictable(folio)) {
+ !folio_test_dirty(folio) && !folio_test_unevictable(folio)) {
folio_clear_active(folio);
list_move(&folio->lru, &clean_folios);
}
@@ -2059,9 +2061,9 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
__count_vm_events(item, nr_reclaimed);
count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
__count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
- spin_unlock_irq(&lruvec->lru_lock);
- lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed);
+ lru_note_cost_unlock_irq(lruvec, file, stat.nr_pageout,
+ nr_scanned - nr_reclaimed);
/*
* If dirty folios are scanned that are not queued for IO, it
@@ -2127,7 +2129,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
{
unsigned long nr_taken;
unsigned long nr_scanned;
- unsigned long vm_flags;
+ vm_flags_t vm_flags;
LIST_HEAD(l_hold); /* The folios which were snipped off */
LIST_HEAD(l_active);
LIST_HEAD(l_inactive);
@@ -2207,10 +2209,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
- spin_unlock_irq(&lruvec->lru_lock);
- if (nr_rotated)
- lru_note_cost(lruvec, file, 0, nr_rotated);
+ lru_note_cost_unlock_irq(lruvec, file, 0, nr_rotated);
trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
nr_deactivate, nr_rotated, sc->priority, file);
}
@@ -2482,6 +2482,69 @@ static inline void calculate_pressure_balance(struct scan_control *sc,
*denominator = ap + fp;
}
+static unsigned long apply_proportional_protection(struct mem_cgroup *memcg,
+ struct scan_control *sc, unsigned long scan)
+{
+ unsigned long min, low;
+
+ mem_cgroup_protection(sc->target_mem_cgroup, memcg, &min, &low);
+
+ if (min || low) {
+ /*
+ * Scale a cgroup's reclaim pressure by proportioning
+ * its current usage to its memory.low or memory.min
+ * setting.
+ *
+ * This is important, as otherwise scanning aggression
+ * becomes extremely binary -- from nothing as we
+ * approach the memory protection threshold, to totally
+ * nominal as we exceed it. This results in requiring
+ * setting extremely liberal protection thresholds. It
+ * also means we simply get no protection at all if we
+ * set it too low, which is not ideal.
+ *
+ * If there is any protection in place, we reduce scan
+ * pressure by how much of the total memory used is
+ * within protection thresholds.
+ *
+ * There is one special case: in the first reclaim pass,
+ * we skip over all groups that are within their low
+ * protection. If that fails to reclaim enough pages to
+ * satisfy the reclaim goal, we come back and override
+ * the best-effort low protection. However, we still
+ * ideally want to honor how well-behaved groups are in
+ * that case instead of simply punishing them all
+ * equally. As such, we reclaim them based on how much
+ * memory they are using, reducing the scan pressure
+ * again by how much of the total memory used is under
+ * hard protection.
+ */
+ unsigned long cgroup_size = mem_cgroup_size(memcg);
+ unsigned long protection;
+
+ /* memory.low scaling, make sure we retry before OOM */
+ if (!sc->memcg_low_reclaim && low > min) {
+ protection = low;
+ sc->memcg_low_skipped = 1;
+ } else {
+ protection = min;
+ }
+
+ /* Avoid TOCTOU with earlier protection check */
+ cgroup_size = max(cgroup_size, protection);
+
+ scan -= scan * protection / (cgroup_size + 1);
+
+ /*
+ * Minimally target SWAP_CLUSTER_MAX pages to keep
+ * reclaim moving forwards, avoiding decrementing
+ * sc->priority further than desirable.
+ */
+ scan = max(scan, SWAP_CLUSTER_MAX);
+ }
+ return scan;
+}
+
/*
* Determine how aggressively the anon and file LRU lists should be
* scanned.
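As a worked example of the scaling done by apply_proportional_protection() above
(numbers purely illustrative): the reduction applied is

    scan -= scan * protection / (cgroup_size + 1);

so a memcg currently using 4 GiB with an effective protection (memory.low or
memory.min) of 3 GiB has its scan target cut by roughly 75%, to about a quarter
of its starting value, while the SWAP_CLUSTER_MAX floor keeps reclaim making
forward progress even when nearly all of the usage is protected.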
@@ -2560,70 +2623,10 @@ out:
for_each_evictable_lru(lru) {
bool file = is_file_lru(lru);
unsigned long lruvec_size;
- unsigned long low, min;
unsigned long scan;
lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
- mem_cgroup_protection(sc->target_mem_cgroup, memcg,
- &min, &low);
-
- if (min || low) {
- /*
- * Scale a cgroup's reclaim pressure by proportioning
- * its current usage to its memory.low or memory.min
- * setting.
- *
- * This is important, as otherwise scanning aggression
- * becomes extremely binary -- from nothing as we
- * approach the memory protection threshold, to totally
- * nominal as we exceed it. This results in requiring
- * setting extremely liberal protection thresholds. It
- * also means we simply get no protection at all if we
- * set it too low, which is not ideal.
- *
- * If there is any protection in place, we reduce scan
- * pressure by how much of the total memory used is
- * within protection thresholds.
- *
- * There is one special case: in the first reclaim pass,
- * we skip over all groups that are within their low
- * protection. If that fails to reclaim enough pages to
- * satisfy the reclaim goal, we come back and override
- * the best-effort low protection. However, we still
- * ideally want to honor how well-behaved groups are in
- * that case instead of simply punishing them all
- * equally. As such, we reclaim them based on how much
- * memory they are using, reducing the scan pressure
- * again by how much of the total memory used is under
- * hard protection.
- */
- unsigned long cgroup_size = mem_cgroup_size(memcg);
- unsigned long protection;
-
- /* memory.low scaling, make sure we retry before OOM */
- if (!sc->memcg_low_reclaim && low > min) {
- protection = low;
- sc->memcg_low_skipped = 1;
- } else {
- protection = min;
- }
-
- /* Avoid TOCTOU with earlier protection check */
- cgroup_size = max(cgroup_size, protection);
-
- scan = lruvec_size - lruvec_size * protection /
- (cgroup_size + 1);
-
- /*
- * Minimally target SWAP_CLUSTER_MAX pages to keep
- * reclaim moving forwards, avoiding decrementing
- * sc->priority further than desirable.
- */
- scan = max(scan, SWAP_CLUSTER_MAX);
- } else {
- scan = lruvec_size;
- }
-
+ scan = apply_proportional_protection(memcg, sc, lruvec_size);
scan >>= sc->priority;
/*
@@ -3429,7 +3432,7 @@ static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned
if (!pte_present(pte) || is_zero_pfn(pfn))
return -1;
- if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte)))
+ if (WARN_ON_ONCE(pte_special(pte)))
return -1;
if (!pte_young(pte) && !mm_has_notifiers(vma->vm_mm))
@@ -3454,9 +3457,6 @@ static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned
if (!pmd_present(pmd) || is_huge_zero_pmd(pmd))
return -1;
- if (WARN_ON_ONCE(pmd_devmap(pmd)))
- return -1;
-
if (!pmd_young(pmd) && !mm_has_notifiers(vma->vm_mm))
return -1;
@@ -3927,6 +3927,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, int swappiness)
{
int gen, type, zone;
bool success = false;
+ bool seq_inc_flag = false;
struct lru_gen_folio *lrugen = &lruvec->lrugen;
DEFINE_MIN_SEQ(lruvec);
@@ -3943,11 +3944,20 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, int swappiness)
}
min_seq[type]++;
+ seq_inc_flag = true;
}
next:
;
}
+ /*
+ * If min_seq[type] was not increased for either anon or file, return
+ * false right away to avoid unnecessary checking overhead later.
+ */
+ if (!seq_inc_flag)
+ return success;
+
/* see the comment on lru_gen_folio */
if (swappiness && swappiness <= MAX_SWAPPINESS) {
unsigned long seq = lrugen->max_seq - MIN_NR_GENS;
@@ -4554,8 +4564,9 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca
return true;
}
-static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
- int type, int tier, struct list_head *list)
+static int scan_folios(unsigned long nr_to_scan, struct lruvec *lruvec,
+ struct scan_control *sc, int type, int tier,
+ struct list_head *list)
{
int i;
int gen;
@@ -4564,7 +4575,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
int scanned = 0;
int isolated = 0;
int skipped = 0;
- int remaining = MAX_LRU_BATCH;
+ int remaining = min(nr_to_scan, MAX_LRU_BATCH);
struct lru_gen_folio *lrugen = &lruvec->lrugen;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
@@ -4675,7 +4686,8 @@ static int get_type_to_scan(struct lruvec *lruvec, int swappiness)
return positive_ctrl_err(&sp, &pv);
}
-static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
+static int isolate_folios(unsigned long nr_to_scan, struct lruvec *lruvec,
+ struct scan_control *sc, int swappiness,
int *type_scanned, struct list_head *list)
{
int i;
@@ -4687,7 +4699,7 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw
*type_scanned = type;
- scanned = scan_folios(lruvec, sc, type, tier, list);
+ scanned = scan_folios(nr_to_scan, lruvec, sc, type, tier, list);
if (scanned)
return scanned;
@@ -4697,7 +4709,8 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw
return 0;
}
-static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
+static int evict_folios(unsigned long nr_to_scan, struct lruvec *lruvec,
+ struct scan_control *sc, int swappiness)
{
int type;
int scanned;
@@ -4716,7 +4729,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
spin_lock_irq(&lruvec->lru_lock);
- scanned = isolate_folios(lruvec, sc, swappiness, &type, &list);
+ scanned = isolate_folios(nr_to_scan, lruvec, sc, swappiness, &type, &list);
scanned += try_to_inc_min_seq(lruvec, swappiness);
@@ -4837,6 +4850,8 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, int s
if (nr_to_scan && !mem_cgroup_online(memcg))
return nr_to_scan;
+ nr_to_scan = apply_proportional_protection(memcg, sc, nr_to_scan);
+
/* try to get away with not aging at the default priority */
if (!success || sc->priority == DEF_PRIORITY)
return nr_to_scan >> sc->priority;
@@ -4889,7 +4904,7 @@ static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
if (nr_to_scan <= 0)
break;
- delta = evict_folios(lruvec, sc, swappiness);
+ delta = evict_folios(nr_to_scan, lruvec, sc, swappiness);
if (!delta)
break;
@@ -5420,7 +5435,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
static int lru_gen_seq_show(struct seq_file *m, void *v)
{
unsigned long seq;
- bool full = !debugfs_real_fops(m->file)->write;
+ bool full = debugfs_get_aux_num(m->file);
struct lruvec *lruvec = v;
struct lru_gen_folio *lrugen = &lruvec->lrugen;
int nid = lruvec_pgdat(lruvec)->node_id;
@@ -5510,7 +5525,8 @@ static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_co
if (sc->nr_reclaimed >= nr_to_reclaim)
return 0;
- if (!evict_folios(lruvec, sc, swappiness))
+ if (!evict_folios(nr_to_reclaim - sc->nr_reclaimed, lruvec, sc,
+ swappiness))
return 0;
cond_resched();
@@ -5756,8 +5772,10 @@ static int __init init_lru_gen(void)
if (sysfs_create_group(mm_kobj, &lru_gen_attr_group))
pr_err("lru_gen: failed to create sysfs group\n");
- debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops);
- debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops);
+ debugfs_create_file_aux_num("lru_gen", 0644, NULL, NULL, 0,
+ &lru_gen_rw_fops);
+ debugfs_create_file_aux_num("lru_gen_full", 0444, NULL, NULL, 1,
+ &lru_gen_ro_fops);
return 0;
};
@@ -6713,6 +6731,15 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
return nr_reclaimed;
}
+#else
+unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
+ unsigned long nr_pages,
+ gfp_t gfp_mask,
+ unsigned int reclaim_options,
+ int *swappiness)
+{
+ return 0;
+}
#endif
static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc)
@@ -7607,36 +7634,26 @@ static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat)
/*
* Try to free up some pages from this node through reclaim.
*/
-static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
+static unsigned long __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask,
+ unsigned long nr_pages,
+ struct scan_control *sc)
{
- /* Minimum pages needed in order to stay on node */
- const unsigned long nr_pages = 1 << order;
struct task_struct *p = current;
unsigned int noreclaim_flag;
- struct scan_control sc = {
- .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
- .gfp_mask = current_gfp_context(gfp_mask),
- .order = order,
- .priority = NODE_RECLAIM_PRIORITY,
- .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
- .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
- .may_swap = 1,
- .reclaim_idx = gfp_zone(gfp_mask),
- };
unsigned long pflags;
- trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order,
- sc.gfp_mask);
+ trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, sc->order,
+ sc->gfp_mask);
cond_resched();
psi_memstall_enter(&pflags);
delayacct_freepages_start();
- fs_reclaim_acquire(sc.gfp_mask);
+ fs_reclaim_acquire(sc->gfp_mask);
/*
* We need to be able to allocate from the reserves for RECLAIM_UNMAP
*/
noreclaim_flag = memalloc_noreclaim_save();
- set_task_reclaim_state(p, &sc.reclaim_state);
+ set_task_reclaim_state(p, &sc->reclaim_state);
if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages ||
node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) > pgdat->min_slab_pages) {
@@ -7645,24 +7662,36 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
* priorities until we have enough memory freed.
*/
do {
- shrink_node(pgdat, &sc);
- } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
+ shrink_node(pgdat, sc);
+ } while (sc->nr_reclaimed < nr_pages && --sc->priority >= 0);
}
set_task_reclaim_state(p, NULL);
memalloc_noreclaim_restore(noreclaim_flag);
- fs_reclaim_release(sc.gfp_mask);
- psi_memstall_leave(&pflags);
+ fs_reclaim_release(sc->gfp_mask);
delayacct_freepages_end();
+ psi_memstall_leave(&pflags);
- trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed);
+ trace_mm_vmscan_node_reclaim_end(sc->nr_reclaimed);
- return sc.nr_reclaimed >= nr_pages;
+ return sc->nr_reclaimed;
}
int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
{
int ret;
+ /* Minimum pages needed in order to stay on node */
+ const unsigned long nr_pages = 1 << order;
+ struct scan_control sc = {
+ .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
+ .gfp_mask = current_gfp_context(gfp_mask),
+ .order = order,
+ .priority = NODE_RECLAIM_PRIORITY,
+ .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
+ .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
+ .may_swap = 1,
+ .reclaim_idx = gfp_zone(gfp_mask),
+ };
/*
* Node reclaim reclaims unmapped file backed pages and
@@ -7697,7 +7726,7 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
if (test_and_set_bit_lock(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
return NODE_RECLAIM_NOSCAN;
- ret = __node_reclaim(pgdat, gfp_mask, order);
+ ret = __node_reclaim(pgdat, gfp_mask, nr_pages, &sc) >= nr_pages;
clear_bit_unlock(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
if (ret)
@@ -7707,6 +7736,114 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
return ret;
}
+
+enum {
+ MEMORY_RECLAIM_SWAPPINESS = 0,
+ MEMORY_RECLAIM_SWAPPINESS_MAX,
+ MEMORY_RECLAIM_NULL,
+};
+static const match_table_t tokens = {
+ { MEMORY_RECLAIM_SWAPPINESS, "swappiness=%d"},
+ { MEMORY_RECLAIM_SWAPPINESS_MAX, "swappiness=max"},
+ { MEMORY_RECLAIM_NULL, NULL },
+};
+
+int user_proactive_reclaim(char *buf,
+ struct mem_cgroup *memcg, pg_data_t *pgdat)
+{
+ unsigned int nr_retries = MAX_RECLAIM_RETRIES;
+ unsigned long nr_to_reclaim, nr_reclaimed = 0;
+ int swappiness = -1;
+ char *old_buf, *start;
+ substring_t args[MAX_OPT_ARGS];
+ gfp_t gfp_mask = GFP_KERNEL;
+
+ if (!buf || (!memcg && !pgdat) || (memcg && pgdat))
+ return -EINVAL;
+
+ buf = strstrip(buf);
+
+ old_buf = buf;
+ nr_to_reclaim = memparse(buf, &buf) / PAGE_SIZE;
+ if (buf == old_buf)
+ return -EINVAL;
+
+ buf = strstrip(buf);
+
+ while ((start = strsep(&buf, " ")) != NULL) {
+ if (!strlen(start))
+ continue;
+ switch (match_token(start, tokens, args)) {
+ case MEMORY_RECLAIM_SWAPPINESS:
+ if (match_int(&args[0], &swappiness))
+ return -EINVAL;
+ if (swappiness < MIN_SWAPPINESS ||
+ swappiness > MAX_SWAPPINESS)
+ return -EINVAL;
+ break;
+ case MEMORY_RECLAIM_SWAPPINESS_MAX:
+ swappiness = SWAPPINESS_ANON_ONLY;
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+
+ while (nr_reclaimed < nr_to_reclaim) {
+ /* Will converge on zero, but reclaim enforces a minimum */
+ unsigned long batch_size = (nr_to_reclaim - nr_reclaimed) / 4;
+ unsigned long reclaimed;
+
+ if (signal_pending(current))
+ return -EINTR;
+
+ /*
+ * This is the final attempt, drain percpu lru caches in the
+ * hope of introducing more evictable pages.
+ */
+ if (!nr_retries)
+ lru_add_drain_all();
+
+ if (memcg) {
+ unsigned int reclaim_options;
+
+ reclaim_options = MEMCG_RECLAIM_MAY_SWAP |
+ MEMCG_RECLAIM_PROACTIVE;
+ reclaimed = try_to_free_mem_cgroup_pages(memcg,
+ batch_size, gfp_mask,
+ reclaim_options,
+ swappiness == -1 ? NULL : &swappiness);
+ } else {
+ struct scan_control sc = {
+ .gfp_mask = current_gfp_context(gfp_mask),
+ .reclaim_idx = gfp_zone(gfp_mask),
+ .proactive_swappiness = swappiness == -1 ? NULL : &swappiness,
+ .priority = DEF_PRIORITY,
+ .may_writepage = !laptop_mode,
+ .nr_to_reclaim = max(batch_size, SWAP_CLUSTER_MAX),
+ .may_unmap = 1,
+ .may_swap = 1,
+ .proactive = 1,
+ };
+
+ if (test_and_set_bit_lock(PGDAT_RECLAIM_LOCKED,
+ &pgdat->flags))
+ return -EBUSY;
+
+ reclaimed = __node_reclaim(pgdat, gfp_mask,
+ batch_size, &sc);
+ clear_bit_unlock(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
+ }
+
+ if (!reclaimed && !nr_retries--)
+ return -EAGAIN;
+
+ nr_reclaimed += reclaimed;
+ }
+
+ return 0;
+}
+
#endif
/**
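As a reading aid for user_proactive_reclaim() above: the buffer is a size parsed
by memparse() (e.g. "64M", "1G" or a plain byte count), optionally followed by
"swappiness=<MIN_SWAPPINESS..MAX_SWAPPINESS>" or "swappiness=max". Each loop
iteration then works on a quarter of the outstanding request, so a 1 GiB request
with 4 KiB pages (262144 pages) starts with a 65536-page batch and the batches
shrink geometrically as progress is made, while, as the comment notes, reclaim
enforces a minimum per pass (max(batch_size, SWAP_CLUSTER_MAX) in the node path)
so the tail of the request does not degenerate into zero-sized passes.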
@@ -7754,3 +7891,26 @@ void check_move_unevictable_folios(struct folio_batch *fbatch)
}
}
EXPORT_SYMBOL_GPL(check_move_unevictable_folios);
+
+#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
+static ssize_t reclaim_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ int ret, nid = dev->id;
+
+ ret = user_proactive_reclaim((char *)buf, NULL, NODE_DATA(nid));
+ return ret ? -EAGAIN : count;
+}
+
+static DEVICE_ATTR_WO(reclaim);
+int reclaim_register_node(struct node *node)
+{
+ return device_create_file(&node->dev, &dev_attr_reclaim);
+}
+
+void reclaim_unregister_node(struct node *node)
+{
+ return device_remove_file(&node->dev, &dev_attr_reclaim);
+}
+#endif
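For illustration, a minimal userspace sketch of how the new per-node attribute
could be exercised. It assumes reclaim_register_node() is wired into the generic
node registration code (not part of this diff), so that each online node exposes
/sys/devices/system/node/nodeN/reclaim; the request string follows the format
accepted by user_proactive_reclaim():

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Ask node 0 to proactively reclaim 512M with a swappiness override of 10. */
	const char *req = "512M swappiness=10";
	int fd = open("/sys/devices/system/node/node0/reclaim", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* reclaim_store() maps any failure to -EAGAIN */
	if (write(fd, req, strlen(req)) < 0)
		perror("write");
	close(fd);
	return 0;
}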