diff options
Diffstat (limited to 'mm/vmscan.c')
-rw-r--r-- | mm/vmscan.c | 474 |
1 files changed, 313 insertions, 161 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c index 424412680cfc..7de11524a936 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -57,6 +57,7 @@ #include <linux/rculist_nulls.h> #include <linux/random.h> #include <linux/mmu_notifier.h> +#include <linux/parser.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -93,10 +94,8 @@ struct scan_control { unsigned long anon_cost; unsigned long file_cost; -#ifdef CONFIG_MEMCG /* Swappiness value for proactive reclaim. Always use sc_swappiness()! */ int *proactive_swappiness; -#endif /* Can active folios be deactivated as part of reclaim? */ #define DEACTIVATE_ANON 1 @@ -120,7 +119,7 @@ struct scan_control { /* Has cache_trim_mode failed at least once? */ unsigned int cache_trim_mode_failed:1; - /* Proactive reclaim invoked by userspace through memory.reclaim */ + /* Proactive reclaim invoked by userspace */ unsigned int proactive:1; /* @@ -652,14 +651,45 @@ typedef enum { PAGE_CLEAN, } pageout_t; +static pageout_t writeout(struct folio *folio, struct address_space *mapping, + struct swap_iocb **plug, struct list_head *folio_list) +{ + int res; + + folio_set_reclaim(folio); + + /* + * The large shmem folio can be split if CONFIG_THP_SWAP is not enabled + * or we failed to allocate contiguous swap entries, in which case + * the split out folios get added back to folio_list. + */ + if (shmem_mapping(mapping)) + res = shmem_writeout(folio, plug, folio_list); + else + res = swap_writeout(folio, plug); + + if (res < 0) + handle_write_error(mapping, folio, res); + if (res == AOP_WRITEPAGE_ACTIVATE) { + folio_clear_reclaim(folio); + return PAGE_ACTIVATE; + } + + /* synchronous write? */ + if (!folio_test_writeback(folio)) + folio_clear_reclaim(folio); + + trace_mm_vmscan_write_folio(folio); + node_stat_add_folio(folio, NR_VMSCAN_WRITE); + return PAGE_SUCCESS; +} + /* * pageout is called by shrink_folio_list() for each dirty folio. */ static pageout_t pageout(struct folio *folio, struct address_space *mapping, struct swap_iocb **plug, struct list_head *folio_list) { - int (*writeout)(struct folio *, struct writeback_control *); - /* * We no longer attempt to writeback filesystem folios here, other * than tmpfs/shmem. That's taken care of in page-writeback. @@ -690,51 +720,12 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping, } return PAGE_KEEP; } - if (shmem_mapping(mapping)) - writeout = shmem_writeout; - else if (folio_test_anon(folio)) - writeout = swap_writeout; - else - return PAGE_ACTIVATE; - - if (folio_clear_dirty_for_io(folio)) { - int res; - struct writeback_control wbc = { - .sync_mode = WB_SYNC_NONE, - .nr_to_write = SWAP_CLUSTER_MAX, - .range_start = 0, - .range_end = LLONG_MAX, - .for_reclaim = 1, - .swap_plug = plug, - }; - - /* - * The large shmem folio can be split if CONFIG_THP_SWAP is - * not enabled or contiguous swap entries are failed to - * allocate. - */ - if (shmem_mapping(mapping) && folio_test_large(folio)) - wbc.list = folio_list; - - folio_set_reclaim(folio); - res = writeout(folio, &wbc); - if (res < 0) - handle_write_error(mapping, folio, res); - if (res == AOP_WRITEPAGE_ACTIVATE) { - folio_clear_reclaim(folio); - return PAGE_ACTIVATE; - } - - if (!folio_test_writeback(folio)) { - /* synchronous write? */ - folio_clear_reclaim(folio); - } - trace_mm_vmscan_write_folio(folio); - node_stat_add_folio(folio, NR_VMSCAN_WRITE); - return PAGE_SUCCESS; - } - return PAGE_CLEAN; + if (!shmem_mapping(mapping) && !folio_test_anon(folio)) + return PAGE_ACTIVATE; + if (!folio_clear_dirty_for_io(folio)) + return PAGE_CLEAN; + return writeout(folio, mapping, plug, folio_list); } /* @@ -915,7 +906,7 @@ static enum folio_references folio_check_references(struct folio *folio, struct scan_control *sc) { int referenced_ptes, referenced_folio; - unsigned long vm_flags; + vm_flags_t vm_flags; referenced_ptes = folio_referenced(folio, 1, sc->target_mem_cgroup, &vm_flags); @@ -1014,7 +1005,8 @@ static void folio_check_dirty_writeback(struct folio *folio, mapping->a_ops->is_dirty_writeback(folio, dirty, writeback); } -struct folio *alloc_migrate_folio(struct folio *src, unsigned long private) +static struct folio *alloc_demote_folio(struct folio *src, + unsigned long private) { struct folio *dst; nodemask_t *allowed_mask; @@ -1077,7 +1069,7 @@ static unsigned int demote_folio_list(struct list_head *demote_folios, node_get_allowed_targets(pgdat, &allowed_mask); /* Demotion ignores all cpuset and mempolicy settings */ - migrate_pages(demote_folios, alloc_migrate_folio, NULL, + migrate_pages(demote_folios, alloc_demote_folio, NULL, (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION, &nr_succeeded); @@ -1666,9 +1658,11 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, unsigned int noreclaim_flag; list_for_each_entry_safe(folio, next, folio_list, lru) { + /* TODO: these pages should not even appear in this list. */ + if (page_has_movable_ops(&folio->page)) + continue; if (!folio_test_hugetlb(folio) && folio_is_file_lru(folio) && - !folio_test_dirty(folio) && !__folio_test_movable(folio) && - !folio_test_unevictable(folio)) { + !folio_test_dirty(folio) && !folio_test_unevictable(folio)) { folio_clear_active(folio); list_move(&folio->lru, &clean_folios); } @@ -2067,9 +2061,9 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, __count_vm_events(item, nr_reclaimed); count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed); __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed); - spin_unlock_irq(&lruvec->lru_lock); - lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed); + lru_note_cost_unlock_irq(lruvec, file, stat.nr_pageout, + nr_scanned - nr_reclaimed); /* * If dirty folios are scanned that are not queued for IO, it @@ -2135,7 +2129,7 @@ static void shrink_active_list(unsigned long nr_to_scan, { unsigned long nr_taken; unsigned long nr_scanned; - unsigned long vm_flags; + vm_flags_t vm_flags; LIST_HEAD(l_hold); /* The folios which were snipped off */ LIST_HEAD(l_active); LIST_HEAD(l_inactive); @@ -2215,10 +2209,8 @@ static void shrink_active_list(unsigned long nr_to_scan, count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); - spin_unlock_irq(&lruvec->lru_lock); - if (nr_rotated) - lru_note_cost(lruvec, file, 0, nr_rotated); + lru_note_cost_unlock_irq(lruvec, file, 0, nr_rotated); trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate, nr_deactivate, nr_rotated, sc->priority, file); } @@ -2490,6 +2482,69 @@ static inline void calculate_pressure_balance(struct scan_control *sc, *denominator = ap + fp; } +static unsigned long apply_proportional_protection(struct mem_cgroup *memcg, + struct scan_control *sc, unsigned long scan) +{ + unsigned long min, low; + + mem_cgroup_protection(sc->target_mem_cgroup, memcg, &min, &low); + + if (min || low) { + /* + * Scale a cgroup's reclaim pressure by proportioning + * its current usage to its memory.low or memory.min + * setting. + * + * This is important, as otherwise scanning aggression + * becomes extremely binary -- from nothing as we + * approach the memory protection threshold, to totally + * nominal as we exceed it. This results in requiring + * setting extremely liberal protection thresholds. It + * also means we simply get no protection at all if we + * set it too low, which is not ideal. + * + * If there is any protection in place, we reduce scan + * pressure by how much of the total memory used is + * within protection thresholds. + * + * There is one special case: in the first reclaim pass, + * we skip over all groups that are within their low + * protection. If that fails to reclaim enough pages to + * satisfy the reclaim goal, we come back and override + * the best-effort low protection. However, we still + * ideally want to honor how well-behaved groups are in + * that case instead of simply punishing them all + * equally. As such, we reclaim them based on how much + * memory they are using, reducing the scan pressure + * again by how much of the total memory used is under + * hard protection. + */ + unsigned long cgroup_size = mem_cgroup_size(memcg); + unsigned long protection; + + /* memory.low scaling, make sure we retry before OOM */ + if (!sc->memcg_low_reclaim && low > min) { + protection = low; + sc->memcg_low_skipped = 1; + } else { + protection = min; + } + + /* Avoid TOCTOU with earlier protection check */ + cgroup_size = max(cgroup_size, protection); + + scan -= scan * protection / (cgroup_size + 1); + + /* + * Minimally target SWAP_CLUSTER_MAX pages to keep + * reclaim moving forwards, avoiding decrementing + * sc->priority further than desirable. + */ + scan = max(scan, SWAP_CLUSTER_MAX); + } + return scan; +} + /* * Determine how aggressively the anon and file LRU lists should be * scanned. @@ -2568,70 +2623,10 @@ out: for_each_evictable_lru(lru) { bool file = is_file_lru(lru); unsigned long lruvec_size; - unsigned long low, min; unsigned long scan; lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); - mem_cgroup_protection(sc->target_mem_cgroup, memcg, - &min, &low); - - if (min || low) { - /* - * Scale a cgroup's reclaim pressure by proportioning - * its current usage to its memory.low or memory.min - * setting. - * - * This is important, as otherwise scanning aggression - * becomes extremely binary -- from nothing as we - * approach the memory protection threshold, to totally - * nominal as we exceed it. This results in requiring - * setting extremely liberal protection thresholds. It - * also means we simply get no protection at all if we - * set it too low, which is not ideal. - * - * If there is any protection in place, we reduce scan - * pressure by how much of the total memory used is - * within protection thresholds. - * - * There is one special case: in the first reclaim pass, - * we skip over all groups that are within their low - * protection. If that fails to reclaim enough pages to - * satisfy the reclaim goal, we come back and override - * the best-effort low protection. However, we still - * ideally want to honor how well-behaved groups are in - * that case instead of simply punishing them all - * equally. As such, we reclaim them based on how much - * memory they are using, reducing the scan pressure - * again by how much of the total memory used is under - * hard protection. - */ - unsigned long cgroup_size = mem_cgroup_size(memcg); - unsigned long protection; - - /* memory.low scaling, make sure we retry before OOM */ - if (!sc->memcg_low_reclaim && low > min) { - protection = low; - sc->memcg_low_skipped = 1; - } else { - protection = min; - } - - /* Avoid TOCTOU with earlier protection check */ - cgroup_size = max(cgroup_size, protection); - - scan = lruvec_size - lruvec_size * protection / - (cgroup_size + 1); - - /* - * Minimally target SWAP_CLUSTER_MAX pages to keep - * reclaim moving forwards, avoiding decrementing - * sc->priority further than desirable. - */ - scan = max(scan, SWAP_CLUSTER_MAX); - } else { - scan = lruvec_size; - } - + scan = apply_proportional_protection(memcg, sc, lruvec_size); scan >>= sc->priority; /* @@ -3437,7 +3432,7 @@ static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned if (!pte_present(pte) || is_zero_pfn(pfn)) return -1; - if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte))) + if (WARN_ON_ONCE(pte_special(pte))) return -1; if (!pte_young(pte) && !mm_has_notifiers(vma->vm_mm)) @@ -3462,9 +3457,6 @@ static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned if (!pmd_present(pmd) || is_huge_zero_pmd(pmd)) return -1; - if (WARN_ON_ONCE(pmd_devmap(pmd))) - return -1; - if (!pmd_young(pmd) && !mm_has_notifiers(vma->vm_mm)) return -1; @@ -3935,6 +3927,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, int swappiness) { int gen, type, zone; bool success = false; + bool seq_inc_flag = false; struct lru_gen_folio *lrugen = &lruvec->lrugen; DEFINE_MIN_SEQ(lruvec); @@ -3951,11 +3944,20 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, int swappiness) } min_seq[type]++; + seq_inc_flag = true; } next: ; } + /* + * If min_seq[type] of both anonymous and file is not increased, + * we can directly return false to avoid unnecessary checking + * overhead later. + */ + if (!seq_inc_flag) + return success; + /* see the comment on lru_gen_folio */ if (swappiness && swappiness <= MAX_SWAPPINESS) { unsigned long seq = lrugen->max_seq - MIN_NR_GENS; @@ -4562,8 +4564,9 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca return true; } -static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, - int type, int tier, struct list_head *list) +static int scan_folios(unsigned long nr_to_scan, struct lruvec *lruvec, + struct scan_control *sc, int type, int tier, + struct list_head *list) { int i; int gen; @@ -4572,7 +4575,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, int scanned = 0; int isolated = 0; int skipped = 0; - int remaining = MAX_LRU_BATCH; + int remaining = min(nr_to_scan, MAX_LRU_BATCH); struct lru_gen_folio *lrugen = &lruvec->lrugen; struct mem_cgroup *memcg = lruvec_memcg(lruvec); @@ -4683,7 +4686,8 @@ static int get_type_to_scan(struct lruvec *lruvec, int swappiness) return positive_ctrl_err(&sp, &pv); } -static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness, +static int isolate_folios(unsigned long nr_to_scan, struct lruvec *lruvec, + struct scan_control *sc, int swappiness, int *type_scanned, struct list_head *list) { int i; @@ -4695,7 +4699,7 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw *type_scanned = type; - scanned = scan_folios(lruvec, sc, type, tier, list); + scanned = scan_folios(nr_to_scan, lruvec, sc, type, tier, list); if (scanned) return scanned; @@ -4705,7 +4709,8 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw return 0; } -static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness) +static int evict_folios(unsigned long nr_to_scan, struct lruvec *lruvec, + struct scan_control *sc, int swappiness) { int type; int scanned; @@ -4724,7 +4729,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap spin_lock_irq(&lruvec->lru_lock); - scanned = isolate_folios(lruvec, sc, swappiness, &type, &list); + scanned = isolate_folios(nr_to_scan, lruvec, sc, swappiness, &type, &list); scanned += try_to_inc_min_seq(lruvec, swappiness); @@ -4845,6 +4850,8 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, int s if (nr_to_scan && !mem_cgroup_online(memcg)) return nr_to_scan; + nr_to_scan = apply_proportional_protection(memcg, sc, nr_to_scan); + /* try to get away with not aging at the default priority */ if (!success || sc->priority == DEF_PRIORITY) return nr_to_scan >> sc->priority; @@ -4897,7 +4904,7 @@ static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) if (nr_to_scan <= 0) break; - delta = evict_folios(lruvec, sc, swappiness); + delta = evict_folios(nr_to_scan, lruvec, sc, swappiness); if (!delta) break; @@ -5428,7 +5435,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, static int lru_gen_seq_show(struct seq_file *m, void *v) { unsigned long seq; - bool full = !debugfs_real_fops(m->file)->write; + bool full = debugfs_get_aux_num(m->file); struct lruvec *lruvec = v; struct lru_gen_folio *lrugen = &lruvec->lrugen; int nid = lruvec_pgdat(lruvec)->node_id; @@ -5518,7 +5525,8 @@ static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_co if (sc->nr_reclaimed >= nr_to_reclaim) return 0; - if (!evict_folios(lruvec, sc, swappiness)) + if (!evict_folios(nr_to_reclaim - sc->nr_reclaimed, lruvec, sc, + swappiness)) return 0; cond_resched(); @@ -5764,8 +5772,10 @@ static int __init init_lru_gen(void) if (sysfs_create_group(mm_kobj, &lru_gen_attr_group)) pr_err("lru_gen: failed to create sysfs group\n"); - debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops); - debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops); + debugfs_create_file_aux_num("lru_gen", 0644, NULL, NULL, 1, + &lru_gen_rw_fops); + debugfs_create_file_aux_num("lru_gen_full", 0444, NULL, NULL, 0, + &lru_gen_ro_fops); return 0; }; @@ -6721,6 +6731,15 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, return nr_reclaimed; } +#else +unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, + unsigned long nr_pages, + gfp_t gfp_mask, + unsigned int reclaim_options, + int *swappiness) +{ + return 0; +} #endif static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc) @@ -7615,36 +7634,26 @@ static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat) /* * Try to free up some pages from this node through reclaim. */ -static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) +static unsigned long __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, + unsigned long nr_pages, + struct scan_control *sc) { - /* Minimum pages needed in order to stay on node */ - const unsigned long nr_pages = 1 << order; struct task_struct *p = current; unsigned int noreclaim_flag; - struct scan_control sc = { - .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), - .gfp_mask = current_gfp_context(gfp_mask), - .order = order, - .priority = NODE_RECLAIM_PRIORITY, - .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE), - .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP), - .may_swap = 1, - .reclaim_idx = gfp_zone(gfp_mask), - }; unsigned long pflags; - trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order, - sc.gfp_mask); + trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, sc->order, + sc->gfp_mask); cond_resched(); psi_memstall_enter(&pflags); delayacct_freepages_start(); - fs_reclaim_acquire(sc.gfp_mask); + fs_reclaim_acquire(sc->gfp_mask); /* * We need to be able to allocate from the reserves for RECLAIM_UNMAP */ noreclaim_flag = memalloc_noreclaim_save(); - set_task_reclaim_state(p, &sc.reclaim_state); + set_task_reclaim_state(p, &sc->reclaim_state); if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages || node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) > pgdat->min_slab_pages) { @@ -7653,24 +7662,36 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in * priorities until we have enough memory freed. */ do { - shrink_node(pgdat, &sc); - } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0); + shrink_node(pgdat, sc); + } while (sc->nr_reclaimed < nr_pages && --sc->priority >= 0); } set_task_reclaim_state(p, NULL); memalloc_noreclaim_restore(noreclaim_flag); - fs_reclaim_release(sc.gfp_mask); - psi_memstall_leave(&pflags); + fs_reclaim_release(sc->gfp_mask); delayacct_freepages_end(); + psi_memstall_leave(&pflags); - trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed); + trace_mm_vmscan_node_reclaim_end(sc->nr_reclaimed); - return sc.nr_reclaimed >= nr_pages; + return sc->nr_reclaimed; } int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) { int ret; + /* Minimum pages needed in order to stay on node */ + const unsigned long nr_pages = 1 << order; + struct scan_control sc = { + .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), + .gfp_mask = current_gfp_context(gfp_mask), + .order = order, + .priority = NODE_RECLAIM_PRIORITY, + .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE), + .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP), + .may_swap = 1, + .reclaim_idx = gfp_zone(gfp_mask), + }; /* * Node reclaim reclaims unmapped file backed pages and @@ -7705,7 +7726,7 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) if (test_and_set_bit_lock(PGDAT_RECLAIM_LOCKED, &pgdat->flags)) return NODE_RECLAIM_NOSCAN; - ret = __node_reclaim(pgdat, gfp_mask, order); + ret = __node_reclaim(pgdat, gfp_mask, nr_pages, &sc) >= nr_pages; clear_bit_unlock(PGDAT_RECLAIM_LOCKED, &pgdat->flags); if (ret) @@ -7715,6 +7736,114 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) return ret; } + +enum { + MEMORY_RECLAIM_SWAPPINESS = 0, + MEMORY_RECLAIM_SWAPPINESS_MAX, + MEMORY_RECLAIM_NULL, +}; +static const match_table_t tokens = { + { MEMORY_RECLAIM_SWAPPINESS, "swappiness=%d"}, + { MEMORY_RECLAIM_SWAPPINESS_MAX, "swappiness=max"}, + { MEMORY_RECLAIM_NULL, NULL }, +}; + +int user_proactive_reclaim(char *buf, + struct mem_cgroup *memcg, pg_data_t *pgdat) +{ + unsigned int nr_retries = MAX_RECLAIM_RETRIES; + unsigned long nr_to_reclaim, nr_reclaimed = 0; + int swappiness = -1; + char *old_buf, *start; + substring_t args[MAX_OPT_ARGS]; + gfp_t gfp_mask = GFP_KERNEL; + + if (!buf || (!memcg && !pgdat) || (memcg && pgdat)) + return -EINVAL; + + buf = strstrip(buf); + + old_buf = buf; + nr_to_reclaim = memparse(buf, &buf) / PAGE_SIZE; + if (buf == old_buf) + return -EINVAL; + + buf = strstrip(buf); + + while ((start = strsep(&buf, " ")) != NULL) { + if (!strlen(start)) + continue; + switch (match_token(start, tokens, args)) { + case MEMORY_RECLAIM_SWAPPINESS: + if (match_int(&args[0], &swappiness)) + return -EINVAL; + if (swappiness < MIN_SWAPPINESS || + swappiness > MAX_SWAPPINESS) + return -EINVAL; + break; + case MEMORY_RECLAIM_SWAPPINESS_MAX: + swappiness = SWAPPINESS_ANON_ONLY; + break; + default: + return -EINVAL; + } + } + + while (nr_reclaimed < nr_to_reclaim) { + /* Will converge on zero, but reclaim enforces a minimum */ + unsigned long batch_size = (nr_to_reclaim - nr_reclaimed) / 4; + unsigned long reclaimed; + + if (signal_pending(current)) + return -EINTR; + + /* + * This is the final attempt, drain percpu lru caches in the + * hope of introducing more evictable pages. + */ + if (!nr_retries) + lru_add_drain_all(); + + if (memcg) { + unsigned int reclaim_options; + + reclaim_options = MEMCG_RECLAIM_MAY_SWAP | + MEMCG_RECLAIM_PROACTIVE; + reclaimed = try_to_free_mem_cgroup_pages(memcg, + batch_size, gfp_mask, + reclaim_options, + swappiness == -1 ? NULL : &swappiness); + } else { + struct scan_control sc = { + .gfp_mask = current_gfp_context(gfp_mask), + .reclaim_idx = gfp_zone(gfp_mask), + .proactive_swappiness = swappiness == -1 ? NULL : &swappiness, + .priority = DEF_PRIORITY, + .may_writepage = !laptop_mode, + .nr_to_reclaim = max(batch_size, SWAP_CLUSTER_MAX), + .may_unmap = 1, + .may_swap = 1, + .proactive = 1, + }; + + if (test_and_set_bit_lock(PGDAT_RECLAIM_LOCKED, + &pgdat->flags)) + return -EBUSY; + + reclaimed = __node_reclaim(pgdat, gfp_mask, + batch_size, &sc); + clear_bit_unlock(PGDAT_RECLAIM_LOCKED, &pgdat->flags); + } + + if (!reclaimed && !nr_retries--) + return -EAGAIN; + + nr_reclaimed += reclaimed; + } + + return 0; +} + #endif /** @@ -7762,3 +7891,26 @@ void check_move_unevictable_folios(struct folio_batch *fbatch) } } EXPORT_SYMBOL_GPL(check_move_unevictable_folios); + +#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) +static ssize_t reclaim_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + int ret, nid = dev->id; + + ret = user_proactive_reclaim((char *)buf, NULL, NODE_DATA(nid)); + return ret ? -EAGAIN : count; +} + +static DEVICE_ATTR_WO(reclaim); +int reclaim_register_node(struct node *node) +{ + return device_create_file(&node->dev, &dev_attr_reclaim); +} + +void reclaim_unregister_node(struct node *node) +{ + return device_remove_file(&node->dev, &dev_attr_reclaim); +} +#endif |