summaryrefslogtreecommitdiff
path: root/mm/vmscan.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--mm/vmscan.c1922
1 files changed, 1128 insertions, 794 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2e34de9cd0d4..900c74b6aa62 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -56,6 +56,8 @@
#include <linux/khugepaged.h>
#include <linux/rculist_nulls.h>
#include <linux/random.h>
+#include <linux/mmu_notifier.h>
+#include <linux/parser.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -92,6 +94,9 @@ struct scan_control {
unsigned long anon_cost;
unsigned long file_cost;
+ /* Swappiness value for proactive reclaim. Always use sc_swappiness()! */
+ int *proactive_swappiness;
+
/* Can active folios be deactivated as part of reclaim? */
#define DEACTIVATE_ANON 1
#define DEACTIVATE_FILE 2
@@ -114,7 +119,7 @@ struct scan_control {
/* Has cache_trim_mode failed at least once? */
unsigned int cache_trim_mode_failed:1;
- /* Proactive reclaim invoked by userspace through memory.reclaim */
+ /* Proactive reclaim invoked by userspace */
unsigned int proactive:1;
/*
@@ -128,6 +133,9 @@ struct scan_control {
unsigned int memcg_low_reclaim:1;
unsigned int memcg_low_skipped:1;
+ /* Shared cgroup tree walk failed, rescan the whole tree */
+ unsigned int memcg_full_walk:1;
+
unsigned int hibernation_mode:1;
/* One of the zones is ready for compaction */
@@ -189,7 +197,7 @@ struct scan_control {
#endif
/*
- * From 0 .. 200. Higher means more swappy.
+ * From 0 .. MAX_SWAPPINESS. Higher means more swappy.
*/
int vm_swappiness = 60;
@@ -233,6 +241,13 @@ static bool writeback_throttling_sane(struct scan_control *sc)
#endif
return false;
}
+
+static int sc_swappiness(struct scan_control *sc, struct mem_cgroup *memcg)
+{
+ if (sc->proactive && sc->proactive_swappiness)
+ return *sc->proactive_swappiness;
+ return mem_cgroup_swappiness(memcg);
+}
#else
static bool cgroup_reclaim(struct scan_control *sc)
{
@@ -248,8 +263,32 @@ static bool writeback_throttling_sane(struct scan_control *sc)
{
return true;
}
+
+static int sc_swappiness(struct scan_control *sc, struct mem_cgroup *memcg)
+{
+ return READ_ONCE(vm_swappiness);
+}
#endif
+/* for_each_managed_zone_pgdat - helper macro to iterate over all managed zones in a pgdat up to
+ * and including the specified highidx
+ * @zone: The current zone in the iterator
+ * @pgdat: The pgdat which node_zones are being iterated
+ * @idx: The index variable
+ * @highidx: The index of the highest zone to return
+ *
+ * This macro iterates through all managed zones up to and including the specified highidx.
+ * The zone iterator enters an invalid state after macro call and must be reinitialized
+ * before it can be used again.
+ */
+#define for_each_managed_zone_pgdat(zone, pgdat, idx, highidx) \
+ for ((idx) = 0, (zone) = (pgdat)->node_zones; \
+ (idx) <= (highidx); \
+ (idx)++, (zone)++) \
+ if (!managed_zone(zone)) \
+ continue; \
+ else
+
static void set_task_reclaim_state(struct task_struct *task,
struct reclaim_state *rs)
{
@@ -302,16 +341,22 @@ static void flush_reclaim_state(struct scan_control *sc)
}
}
-static bool can_demote(int nid, struct scan_control *sc)
+static bool can_demote(int nid, struct scan_control *sc,
+ struct mem_cgroup *memcg)
{
+ int demotion_nid;
+
if (!numa_demotion_enabled)
return false;
if (sc && sc->no_demotion)
return false;
- if (next_demotion_node(nid) == NUMA_NO_NODE)
+
+ demotion_nid = next_demotion_node(nid);
+ if (demotion_nid == NUMA_NO_NODE)
return false;
- return true;
+ /* If demotion node isn't in the cgroup's mems_allowed, fall back */
+ return mem_cgroup_node_allowed(memcg, demotion_nid);
}
static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
@@ -336,7 +381,7 @@ static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
*
* Can it be reclaimed from this node via demotion?
*/
- return can_demote(nid, sc);
+ return can_demote(nid, sc, memcg);
}
/*
@@ -368,13 +413,9 @@ static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru,
{
unsigned long size = 0;
int zid;
+ struct zone *zone;
- for (zid = 0; zid <= zone_idx; zid++) {
- struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid];
-
- if (!managed_zone(zone))
- continue;
-
+ for_each_managed_zone_pgdat(zone, lruvec_pgdat(lruvec), zid, zone_idx) {
if (!mem_cgroup_disabled())
size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
else
@@ -413,35 +454,29 @@ void drop_slab(void)
} while ((freed >> shift++) > 1);
}
-static int reclaimer_offset(void)
+#define CHECK_RECLAIMER_OFFSET(type) \
+ do { \
+ BUILD_BUG_ON(PGSTEAL_##type - PGSTEAL_KSWAPD != \
+ PGDEMOTE_##type - PGDEMOTE_KSWAPD); \
+ BUILD_BUG_ON(PGSTEAL_##type - PGSTEAL_KSWAPD != \
+ PGSCAN_##type - PGSCAN_KSWAPD); \
+ } while (0)
+
+static int reclaimer_offset(struct scan_control *sc)
{
- BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
- PGDEMOTE_DIRECT - PGDEMOTE_KSWAPD);
- BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
- PGDEMOTE_KHUGEPAGED - PGDEMOTE_KSWAPD);
- BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
- PGSCAN_DIRECT - PGSCAN_KSWAPD);
- BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
- PGSCAN_KHUGEPAGED - PGSCAN_KSWAPD);
+ CHECK_RECLAIMER_OFFSET(DIRECT);
+ CHECK_RECLAIMER_OFFSET(KHUGEPAGED);
+ CHECK_RECLAIMER_OFFSET(PROACTIVE);
if (current_is_kswapd())
return 0;
if (current_is_khugepaged())
return PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD;
+ if (sc->proactive)
+ return PGSTEAL_PROACTIVE - PGSTEAL_KSWAPD;
return PGSTEAL_DIRECT - PGSTEAL_KSWAPD;
}
-static inline int is_page_cache_freeable(struct folio *folio)
-{
- /*
- * A freeable page cache folio is referenced only by the caller
- * that isolated the folio, the page cache and optional filesystem
- * private data at folio->private.
- */
- return folio_ref_count(folio) - folio_test_private(folio) ==
- 1 + folio_nr_pages(folio);
-}
-
/*
* We detected a synchronous write error writing a folio out. Probably
* -ENOSPC. We need to propagate that into the address_space for a subsequent
@@ -467,12 +502,12 @@ static bool skip_throttle_noprogress(pg_data_t *pgdat)
{
int reclaimable = 0, write_pending = 0;
int i;
-
+ struct zone *zone;
/*
* If kswapd is disabled, reschedule if necessary but do not
* throttle as the system is likely near OOM.
*/
- if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
+ if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES)
return true;
/*
@@ -480,12 +515,7 @@ static bool skip_throttle_noprogress(pg_data_t *pgdat)
* throttle as throttling will occur when the folios cycle
* towards the end of the LRU if still under writeback.
*/
- for (i = 0; i < MAX_NR_ZONES; i++) {
- struct zone *zone = pgdat->node_zones + i;
-
- if (!managed_zone(zone))
- continue;
-
+ for_each_managed_zone_pgdat(zone, pgdat, i, MAX_NR_ZONES - 1) {
reclaimable += zone_reclaimable_pages(zone);
write_pending += zone_page_state_snapshot(zone,
NR_ZONE_WRITE_PENDING);
@@ -603,78 +633,69 @@ typedef enum {
PAGE_CLEAN,
} pageout_t;
+static pageout_t writeout(struct folio *folio, struct address_space *mapping,
+ struct swap_iocb **plug, struct list_head *folio_list)
+{
+ int res;
+
+ folio_set_reclaim(folio);
+
+ /*
+ * The large shmem folio can be split if CONFIG_THP_SWAP is not enabled
+ * or we failed to allocate contiguous swap entries, in which case
+ * the split out folios get added back to folio_list.
+ */
+ if (shmem_mapping(mapping))
+ res = shmem_writeout(folio, plug, folio_list);
+ else
+ res = swap_writeout(folio, plug);
+
+ if (res < 0)
+ handle_write_error(mapping, folio, res);
+ if (res == AOP_WRITEPAGE_ACTIVATE) {
+ folio_clear_reclaim(folio);
+ return PAGE_ACTIVATE;
+ }
+
+ /* synchronous write? */
+ if (!folio_test_writeback(folio))
+ folio_clear_reclaim(folio);
+
+ trace_mm_vmscan_write_folio(folio);
+ node_stat_add_folio(folio, NR_VMSCAN_WRITE);
+ return PAGE_SUCCESS;
+}
+
/*
* pageout is called by shrink_folio_list() for each dirty folio.
- * Calls ->writepage().
*/
static pageout_t pageout(struct folio *folio, struct address_space *mapping,
- struct swap_iocb **plug)
+ struct swap_iocb **plug, struct list_head *folio_list)
{
/*
- * If the folio is dirty, only perform writeback if that write
- * will be non-blocking. To prevent this allocation from being
- * stalled by pagecache activity. But note that there may be
- * stalls if we need to run get_block(). We could test
- * PagePrivate for that.
- *
- * If this process is currently in __generic_file_write_iter() against
- * this folio's queue, we can perform writeback even if that
- * will block.
+ * We no longer attempt to writeback filesystem folios here, other
+ * than tmpfs/shmem. That's taken care of in page-writeback.
+ * If we find a dirty filesystem folio at the end of the LRU list,
+ * typically that means the filesystem is saturating the storage
+ * with contiguous writes and telling it to write a folio here
+ * would only make the situation worse by injecting an element
+ * of random access.
*
* If the folio is swapcache, write it back even if that would
* block, for some throttling. This happens by accident, because
* swap_backing_dev_info is bust: it doesn't reflect the
* congestion state of the swapdevs. Easy to fix, if needed.
+ *
+ * A freeable shmem or swapcache folio is referenced only by the
+ * caller that isolated the folio and the page cache.
*/
- if (!is_page_cache_freeable(folio))
+ if (folio_ref_count(folio) != 1 + folio_nr_pages(folio) || !mapping)
return PAGE_KEEP;
- if (!mapping) {
- /*
- * Some data journaling orphaned folios can have
- * folio->mapping == NULL while being dirty with clean buffers.
- */
- if (folio_test_private(folio)) {
- if (try_to_free_buffers(folio)) {
- folio_clear_dirty(folio);
- pr_info("%s: orphaned folio\n", __func__);
- return PAGE_CLEAN;
- }
- }
- return PAGE_KEEP;
- }
- if (mapping->a_ops->writepage == NULL)
+ if (!shmem_mapping(mapping) && !folio_test_anon(folio))
return PAGE_ACTIVATE;
-
- if (folio_clear_dirty_for_io(folio)) {
- int res;
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_NONE,
- .nr_to_write = SWAP_CLUSTER_MAX,
- .range_start = 0,
- .range_end = LLONG_MAX,
- .for_reclaim = 1,
- .swap_plug = plug,
- };
-
- folio_set_reclaim(folio);
- res = mapping->a_ops->writepage(&folio->page, &wbc);
- if (res < 0)
- handle_write_error(mapping, folio, res);
- if (res == AOP_WRITEPAGE_ACTIVATE) {
- folio_clear_reclaim(folio);
- return PAGE_ACTIVATE;
- }
-
- if (!folio_test_writeback(folio)) {
- /* synchronous write or broken a_ops? */
- folio_clear_reclaim(folio);
- }
- trace_mm_vmscan_write_folio(folio);
- node_stat_add_folio(folio, NR_VMSCAN_WRITE);
- return PAGE_SUCCESS;
- }
-
- return PAGE_CLEAN;
+ if (!folio_clear_dirty_for_io(folio))
+ return PAGE_CLEAN;
+ return writeout(folio, mapping, plug, folio_list);
}
/*
@@ -686,13 +707,18 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio,
{
int refcount;
void *shadow = NULL;
+ struct swap_cluster_info *ci;
BUG_ON(!folio_test_locked(folio));
BUG_ON(mapping != folio_mapping(folio));
- if (!folio_test_swapcache(folio))
+ if (folio_test_swapcache(folio)) {
+ ci = swap_cluster_get_and_lock_irq(folio);
+ } else {
spin_lock(&mapping->host->i_lock);
- xa_lock_irq(&mapping->i_pages);
+ xa_lock_irq(&mapping->i_pages);
+ }
+
/*
* The non racy check for a busy folio.
*
@@ -732,9 +758,9 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio,
if (reclaimed && !mapping_exiting(mapping))
shadow = workingset_eviction(folio, target_memcg);
- __delete_from_swap_cache(folio, swap, shadow);
- mem_cgroup_swapout(folio, swap);
- xa_unlock_irq(&mapping->i_pages);
+ __swap_cache_del_folio(ci, folio, swap, shadow);
+ memcg1_swapout(folio, swap);
+ swap_cluster_unlock_irq(ci);
put_swap_folio(folio, swap);
} else {
void (*free_folio)(struct folio *);
@@ -762,7 +788,7 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio,
__filemap_remove_folio(folio, shadow);
xa_unlock_irq(&mapping->i_pages);
if (mapping_shrinkable(mapping))
- inode_add_lru(mapping->host);
+ inode_lru_list_add(mapping->host);
spin_unlock(&mapping->host->i_lock);
if (free_folio)
@@ -772,9 +798,12 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio,
return 1;
cannot_free:
- xa_unlock_irq(&mapping->i_pages);
- if (!folio_test_swapcache(folio))
+ if (folio_test_swapcache(folio)) {
+ swap_cluster_unlock_irq(ci);
+ } else {
+ xa_unlock_irq(&mapping->i_pages);
spin_unlock(&mapping->host->i_lock);
+ }
return 0;
}
@@ -826,15 +855,39 @@ enum folio_references {
FOLIOREF_ACTIVATE,
};
+#ifdef CONFIG_LRU_GEN
+/*
+ * Only used on a mapped folio in the eviction (rmap walk) path, where promotion
+ * needs to be done by taking the folio off the LRU list and then adding it back
+ * with PG_active set. In contrast, the aging (page table walk) path uses
+ * folio_update_gen().
+ */
+static bool lru_gen_set_refs(struct folio *folio)
+{
+ /* see the comment on LRU_REFS_FLAGS */
+ if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) {
+ set_mask_bits(&folio->flags.f, LRU_REFS_MASK, BIT(PG_referenced));
+ return false;
+ }
+
+ set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS, BIT(PG_workingset));
+ return true;
+}
+#else
+static bool lru_gen_set_refs(struct folio *folio)
+{
+ return false;
+}
+#endif /* CONFIG_LRU_GEN */
+
static enum folio_references folio_check_references(struct folio *folio,
struct scan_control *sc)
{
int referenced_ptes, referenced_folio;
- unsigned long vm_flags;
+ vm_flags_t vm_flags;
referenced_ptes = folio_referenced(folio, 1, sc->target_mem_cgroup,
&vm_flags);
- referenced_folio = folio_test_clear_referenced(folio);
/*
* The supposedly reclaimable folio was found to be in a VM_LOCKED vma.
@@ -843,10 +896,24 @@ static enum folio_references folio_check_references(struct folio *folio,
if (vm_flags & VM_LOCKED)
return FOLIOREF_ACTIVATE;
- /* rmap lock contention: rotate */
+ /*
+ * There are two cases to consider.
+ * 1) Rmap lock contention: rotate.
+ * 2) Skip the non-shared swapbacked folio mapped solely by
+ * the exiting or OOM-reaped process.
+ */
if (referenced_ptes == -1)
return FOLIOREF_KEEP;
+ if (lru_gen_enabled()) {
+ if (!referenced_ptes)
+ return FOLIOREF_RECLAIM;
+
+ return lru_gen_set_refs(folio) ? FOLIOREF_ACTIVATE : FOLIOREF_KEEP;
+ }
+
+ referenced_folio = folio_test_clear_referenced(folio);
+
if (referenced_ptes) {
/*
* All mapped folios start out with page table
@@ -964,7 +1031,7 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
* When this happens, 'page' will likely just be discarded
* instead of migrated.
*/
- .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | __GFP_NOWARN |
+ .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) |
__GFP_NOMEMALLOC | GFP_NOWAIT,
.nid = target_nid,
.nmask = &allowed_mask,
@@ -984,9 +1051,6 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
(unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION,
&nr_succeeded);
- mod_node_page_state(pgdat, PGDEMOTE_KSWAPD + reclaimer_offset(),
- nr_succeeded);
-
return nr_succeeded;
}
@@ -1011,12 +1075,13 @@ static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask)
*/
static unsigned int shrink_folio_list(struct list_head *folio_list,
struct pglist_data *pgdat, struct scan_control *sc,
- struct reclaim_stat *stat, bool ignore_references)
+ struct reclaim_stat *stat, bool ignore_references,
+ struct mem_cgroup *memcg)
{
struct folio_batch free_folios;
LIST_HEAD(ret_folios);
LIST_HEAD(demote_folios);
- unsigned int nr_reclaimed = 0;
+ unsigned int nr_reclaimed = 0, nr_demoted = 0;
unsigned int pgactivate = 0;
bool do_demote_pass;
struct swap_iocb *plug = NULL;
@@ -1024,7 +1089,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
folio_batch_init(&free_folios);
memset(stat, 0, sizeof(*stat));
cond_resched();
- do_demote_pass = can_demote(pgdat->node_id, sc);
+ do_demote_pass = can_demote(pgdat->node_id, sc, memcg);
retry:
while (!list_empty(folio_list)) {
@@ -1042,6 +1107,21 @@ retry:
if (!folio_trylock(folio))
goto keep;
+ if (folio_contain_hwpoisoned_page(folio)) {
+ /*
+ * unmap_poisoned_folio() can't handle large
+ * folio, just skip it. memory_failure() will
+ * handle it if the UCE is triggered again.
+ */
+ if (folio_test_large(folio))
+ goto keep_locked;
+
+ unmap_poisoned_folio(folio, folio_pfn(folio), false);
+ folio_unlock(folio);
+ folio_put(folio);
+ continue;
+ }
+
VM_BUG_ON_FOLIO(folio_test_active(folio), folio);
nr_pages = folio_nr_pages(folio);
@@ -1055,11 +1135,6 @@ retry:
if (!sc->may_unmap && folio_mapped(folio))
goto keep_locked;
- /* folio_update_gen() tried to promote this page? */
- if (lru_gen_enabled() && !ignore_references &&
- folio_mapped(folio) && folio_test_referenced(folio))
- goto keep_locked;
-
/*
* The number of dirty pages determines if a node is marked
* reclaim_congested. kswapd will stall and start writing
@@ -1100,8 +1175,10 @@ retry:
* 2) Global or new memcg reclaim encounters a folio that is
* not marked for immediate reclaim, or the caller does not
* have __GFP_FS (or __GFP_IO if it's simply going to swap,
- * not to fs). In this case mark the folio for immediate
- * reclaim and continue scanning.
+ * not to fs), or the folio belongs to a mapping where
+ * waiting on writeback during reclaim may lead to a deadlock.
+ * In this case mark the folio for immediate reclaim and
+ * continue scanning.
*
* Require may_enter_fs() because we would wait on fs, which
* may not have submitted I/O yet. And the loop driver might
@@ -1126,6 +1203,8 @@ retry:
* takes to write them to disk.
*/
if (folio_test_writeback(folio)) {
+ mapping = folio_mapping(folio);
+
/* Case 1 above */
if (current_is_kswapd() &&
folio_test_reclaim(folio) &&
@@ -1136,7 +1215,9 @@ retry:
/* Case 2 above */
} else if (writeback_throttling_sane(sc) ||
!folio_test_reclaim(folio) ||
- !may_enter_fs(folio, sc->gfp_mask)) {
+ !may_enter_fs(folio, sc->gfp_mask) ||
+ (mapping &&
+ mapping_writeback_may_deadlock_on_reclaim(mapping))) {
/*
* This is slightly racy -
* folio_end_writeback() might have
@@ -1203,17 +1284,18 @@ retry:
goto keep_locked;
if (folio_test_large(folio)) {
/* cannot split folio, skip it */
- if (!can_split_folio(folio, NULL))
+ if (!can_split_folio(folio, 1, NULL))
goto activate_locked;
/*
* Split partially mapped folios right away.
* We can free the unmapped pages without IO.
*/
- if (data_race(!list_empty(&folio->_deferred_list)) &&
+ if (data_race(!list_empty(&folio->_deferred_list) &&
+ folio_test_partially_mapped(folio)) &&
split_folio_to_list(folio, folio_list))
goto activate_locked;
}
- if (!add_to_swap(folio)) {
+ if (folio_alloc_swap(folio)) {
int __maybe_unused order = folio_order(folio);
if (!folio_test_large(folio))
@@ -1227,17 +1309,24 @@ retry:
THP_SWPOUT_FALLBACK, 1);
count_vm_event(THP_SWPOUT_FALLBACK);
}
- count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK);
#endif
- if (!add_to_swap(folio))
+ count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK);
+ if (folio_alloc_swap(folio))
goto activate_locked_split;
}
+ /*
+ * Normally the folio will be dirtied in unmap because its
+ * pte should be dirty. A special case is MADV_FREE page. The
+ * page's pte could have dirty bit cleared but the folio's
+ * SwapBacked flag is still set because clearing the dirty bit
+ * and SwapBacked flag has no lock protected. For such folio,
+ * unmap will not set dirty bit for it, so folio reclaim will
+ * not write the folio out. This can cause data corruption when
+ * the folio is swapped in later. Always setting the dirty flag
+ * for the folio solves the problem.
+ */
+ folio_mark_dirty(folio);
}
- } else if (folio_test_swapbacked(folio) &&
- folio_test_large(folio)) {
- /* Split shmem folio */
- if (split_folio_to_list(folio, folio_list))
- goto keep_locked;
}
/*
@@ -1272,7 +1361,7 @@ retry:
* try_to_unmap acquire PTL from the first PTE,
* eliminating the influence of temporary PTE values.
*/
- if (folio_test_large(folio) && list_empty(&folio->_deferred_list))
+ if (folio_test_large(folio))
flags |= TTU_SYNC;
try_to_unmap(folio, flags);
@@ -1297,21 +1386,7 @@ retry:
mapping = folio_mapping(folio);
if (folio_test_dirty(folio)) {
- /*
- * Only kswapd can writeback filesystem folios
- * to avoid risk of stack overflow. But avoid
- * injecting inefficient single-folio I/O into
- * flusher writeback as much as possible: only
- * write folios when we've encountered many
- * dirty folios, and when we've already scanned
- * the rest of the LRU for clean folios and see
- * the same dirty folios again (with the reclaim
- * flag set).
- */
- if (folio_is_file_lru(folio) &&
- (!current_is_kswapd() ||
- !folio_test_reclaim(folio) ||
- !test_bit(PGDAT_DIRTY, &pgdat->flags))) {
+ if (folio_is_file_lru(folio)) {
/*
* Immediately reclaim when written back.
* Similar in principle to folio_deactivate()
@@ -1320,7 +1395,8 @@ retry:
*/
node_stat_mod_folio(folio, NR_VMSCAN_IMMEDIATE,
nr_pages);
- folio_set_reclaim(folio);
+ if (!folio_test_reclaim(folio))
+ folio_set_reclaim(folio);
goto activate_locked;
}
@@ -1338,12 +1414,25 @@ retry:
* starts and then write it out here.
*/
try_to_unmap_flush_dirty();
- switch (pageout(folio, mapping, &plug)) {
+ switch (pageout(folio, mapping, &plug, folio_list)) {
case PAGE_KEEP:
goto keep_locked;
case PAGE_ACTIVATE:
+ /*
+ * If shmem folio is split when writeback to swap,
+ * the tail pages will make their own pass through
+ * this function and be accounted then.
+ */
+ if (nr_pages > 1 && !folio_test_large(folio)) {
+ sc->nr_scanned -= (nr_pages - 1);
+ nr_pages = 1;
+ }
goto activate_locked;
case PAGE_SUCCESS:
+ if (nr_pages > 1 && !folio_test_large(folio)) {
+ sc->nr_scanned -= (nr_pages - 1);
+ nr_pages = 1;
+ }
stat->nr_pageout += nr_pages;
if (folio_test_writeback(folio))
@@ -1437,9 +1526,7 @@ free_it:
*/
nr_reclaimed += nr_pages;
- if (folio_test_large(folio) &&
- folio_test_large_rmappable(folio))
- folio_undo_large_rmappable(folio);
+ folio_unqueue_deferred_split(folio);
if (folio_batch_add(&free_folios, folio) == 0) {
mem_cgroup_uncharge_folios(&free_folios);
try_to_unmap_flush();
@@ -1478,7 +1565,9 @@ keep:
/* 'folio_list' is always empty here */
/* Migrate folios selected for demotion */
- nr_reclaimed += demote_folio_list(&demote_folios, pgdat);
+ nr_demoted = demote_folio_list(&demote_folios, pgdat);
+ nr_reclaimed += nr_demoted;
+ stat->nr_demoted += nr_demoted;
/* Folios that could not be demoted are still in @demote_folios */
if (!list_empty(&demote_folios)) {
/* Folios which weren't demoted go back on @folio_list */
@@ -1534,9 +1623,11 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
unsigned int noreclaim_flag;
list_for_each_entry_safe(folio, next, folio_list, lru) {
+ /* TODO: these pages should not even appear in this list. */
+ if (page_has_movable_ops(&folio->page))
+ continue;
if (!folio_test_hugetlb(folio) && folio_is_file_lru(folio) &&
- !folio_test_dirty(folio) && !__folio_test_movable(folio) &&
- !folio_test_unevictable(folio)) {
+ !folio_test_dirty(folio) && !folio_test_unevictable(folio)) {
folio_clear_active(folio);
list_move(&folio->lru, &clean_folios);
}
@@ -1550,7 +1641,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
*/
noreclaim_flag = memalloc_noreclaim_save();
nr_reclaimed = shrink_folio_list(&clean_folios, zone->zone_pgdat, &sc,
- &stat, true);
+ &stat, true, NULL);
memalloc_noreclaim_restore(noreclaim_flag);
list_splice(&clean_folios, folio_list);
@@ -1587,25 +1678,6 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec,
}
-#ifdef CONFIG_CMA
-/*
- * It is waste of effort to scan and reclaim CMA pages if it is not available
- * for current allocation context. Kswapd can not be enrolled as it can not
- * distinguish this scenario by using sc->gfp_mask = GFP_KERNEL
- */
-static bool skip_cma(struct folio *folio, struct scan_control *sc)
-{
- return !current_is_kswapd() &&
- gfp_migratetype(sc->gfp_mask) != MIGRATE_MOVABLE &&
- folio_migratetype(folio) == MIGRATE_CMA;
-}
-#else
-static bool skip_cma(struct folio *folio, struct scan_control *sc)
-{
- return false;
-}
-#endif
-
/*
* Isolating page from the lruvec to fill in @dst list by nr_to_scan times.
*
@@ -1636,12 +1708,11 @@ static unsigned long isolate_lru_folios(unsigned long nr_to_scan,
unsigned long nr_taken = 0;
unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
- unsigned long skipped = 0;
- unsigned long scan, total_scan, nr_pages;
+ unsigned long skipped = 0, total_scan = 0, scan = 0;
+ unsigned long nr_pages;
+ unsigned long max_nr_skipped = 0;
LIST_HEAD(folios_skipped);
- total_scan = 0;
- scan = 0;
while (scan < nr_to_scan && !list_empty(src)) {
struct list_head *move_to = src;
struct folio *folio;
@@ -1652,10 +1723,12 @@ static unsigned long isolate_lru_folios(unsigned long nr_to_scan,
nr_pages = folio_nr_pages(folio);
total_scan += nr_pages;
- if (folio_zonenum(folio) > sc->reclaim_idx ||
- skip_cma(folio, sc)) {
+ /* Using max_nr_skipped to prevent hard LOCKUP*/
+ if (max_nr_skipped < SWAP_CLUSTER_MAX_SKIPPED &&
+ (folio_zonenum(folio) > sc->reclaim_idx)) {
nr_skipped[folio_zonenum(folio)] += nr_pages;
move_to = &folios_skipped;
+ max_nr_skipped++;
goto move;
}
@@ -1846,9 +1919,7 @@ static unsigned int move_folios_to_lru(struct lruvec *lruvec,
if (unlikely(folio_put_testzero(folio))) {
__folio_clear_lru_flags(folio);
- if (folio_test_large(folio) &&
- folio_test_large_rmappable(folio))
- folio_undo_large_rmappable(folio);
+ folio_unqueue_deferred_split(folio);
if (folio_batch_add(&free_folios, folio) == 0) {
spin_unlock_irq(&lruvec->lru_lock);
mem_cgroup_uncharge_folios(&free_folios);
@@ -1930,10 +2001,10 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
&nr_scanned, sc, lru);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
- item = PGSCAN_KSWAPD + reclaimer_offset();
+ item = PGSCAN_KSWAPD + reclaimer_offset(sc);
if (!cgroup_reclaim(sc))
__count_vm_events(item, nr_scanned);
- __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
+ count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
__count_vm_events(PGSCAN_ANON + file, nr_scanned);
spin_unlock_irq(&lruvec->lru_lock);
@@ -1941,20 +2012,23 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
if (nr_taken == 0)
return 0;
- nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false);
+ nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false,
+ lruvec_memcg(lruvec));
spin_lock_irq(&lruvec->lru_lock);
move_folios_to_lru(lruvec, &folio_list);
+ mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc),
+ stat.nr_demoted);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
- item = PGSTEAL_KSWAPD + reclaimer_offset();
+ item = PGSTEAL_KSWAPD + reclaimer_offset(sc);
if (!cgroup_reclaim(sc))
__count_vm_events(item, nr_reclaimed);
- __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
+ count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
__count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
- spin_unlock_irq(&lruvec->lru_lock);
- lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed);
+ lru_note_cost_unlock_irq(lruvec, file, stat.nr_pageout,
+ nr_scanned - nr_reclaimed);
/*
* If dirty folios are scanned that are not queued for IO, it
@@ -2020,7 +2094,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
{
unsigned long nr_taken;
unsigned long nr_scanned;
- unsigned long vm_flags;
+ vm_flags_t vm_flags;
LIST_HEAD(l_hold); /* The folios which were snipped off */
LIST_HEAD(l_active);
LIST_HEAD(l_inactive);
@@ -2040,7 +2114,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
if (!cgroup_reclaim(sc))
__count_vm_events(PGREFILL, nr_scanned);
- __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
+ count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
spin_unlock_irq(&lruvec->lru_lock);
@@ -2097,13 +2171,11 @@ static void shrink_active_list(unsigned long nr_to_scan,
nr_deactivate = move_folios_to_lru(lruvec, &l_inactive);
__count_vm_events(PGDEACTIVATE, nr_deactivate);
- __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
+ count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
- spin_unlock_irq(&lruvec->lru_lock);
- if (nr_rotated)
- lru_note_cost(lruvec, file, 0, nr_rotated);
+ lru_note_cost_unlock_irq(lruvec, file, 0, nr_rotated);
trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
nr_deactivate, nr_rotated, sc->priority, file);
}
@@ -2111,7 +2183,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
static unsigned int reclaim_folio_list(struct list_head *folio_list,
struct pglist_data *pgdat)
{
- struct reclaim_stat dummy_stat;
+ struct reclaim_stat stat;
unsigned int nr_reclaimed;
struct folio *folio;
struct scan_control sc = {
@@ -2122,12 +2194,13 @@ static unsigned int reclaim_folio_list(struct list_head *folio_list,
.no_demotion = 1,
};
- nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, true);
+ nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &stat, true, NULL);
while (!list_empty(folio_list)) {
folio = lru_to_folio(folio_list);
list_del(&folio->lru);
folio_putback_lru(folio);
}
+ trace_mm_vmscan_reclaim_pages(pgdat->node_id, sc.nr_scanned, nr_reclaimed, &stat);
return nr_reclaimed;
}
@@ -2244,10 +2317,11 @@ static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc)
target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
/*
- * Flush the memory cgroup stats, so that we read accurate per-memcg
- * lruvec stats for heuristics.
+ * Flush the memory cgroup stats in rate-limited way as we don't need
+ * most accurate stats here. We may switch to regular stats flushing
+ * in the future once it is cheap enough.
*/
- mem_cgroup_flush_stats(sc->target_mem_cgroup);
+ mem_cgroup_flush_stats_ratelimited(sc->target_mem_cgroup);
/*
* Determine the scan balance between anon and file LRUs.
@@ -2312,17 +2386,13 @@ static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc)
unsigned long total_high_wmark = 0;
unsigned long free, anon;
int z;
+ struct zone *zone;
free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
file = node_page_state(pgdat, NR_ACTIVE_FILE) +
node_page_state(pgdat, NR_INACTIVE_FILE);
- for (z = 0; z < MAX_NR_ZONES; z++) {
- struct zone *zone = &pgdat->node_zones[z];
-
- if (!managed_zone(zone))
- continue;
-
+ for_each_managed_zone_pgdat(zone, pgdat, z, MAX_NR_ZONES - 1) {
total_high_wmark += high_wmark_pages(zone);
}
@@ -2340,6 +2410,106 @@ static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc)
}
}
+static inline void calculate_pressure_balance(struct scan_control *sc,
+ int swappiness, u64 *fraction, u64 *denominator)
+{
+ unsigned long anon_cost, file_cost, total_cost;
+ unsigned long ap, fp;
+
+ /*
+ * Calculate the pressure balance between anon and file pages.
+ *
+ * The amount of pressure we put on each LRU is inversely
+ * proportional to the cost of reclaiming each list, as
+ * determined by the share of pages that are refaulting, times
+ * the relative IO cost of bringing back a swapped out
+ * anonymous page vs reloading a filesystem page (swappiness).
+ *
+ * Although we limit that influence to ensure no list gets
+ * left behind completely: at least a third of the pressure is
+ * applied, before swappiness.
+ *
+ * With swappiness at 100, anon and file have equal IO cost.
+ */
+ total_cost = sc->anon_cost + sc->file_cost;
+ anon_cost = total_cost + sc->anon_cost;
+ file_cost = total_cost + sc->file_cost;
+ total_cost = anon_cost + file_cost;
+
+ ap = swappiness * (total_cost + 1);
+ ap /= anon_cost + 1;
+
+ fp = (MAX_SWAPPINESS - swappiness) * (total_cost + 1);
+ fp /= file_cost + 1;
+
+ fraction[WORKINGSET_ANON] = ap;
+ fraction[WORKINGSET_FILE] = fp;
+ *denominator = ap + fp;
+}
+
+static unsigned long apply_proportional_protection(struct mem_cgroup *memcg,
+ struct scan_control *sc, unsigned long scan)
+{
+ unsigned long min, low;
+
+ mem_cgroup_protection(sc->target_mem_cgroup, memcg, &min, &low);
+
+ if (min || low) {
+ /*
+ * Scale a cgroup's reclaim pressure by proportioning
+ * its current usage to its memory.low or memory.min
+ * setting.
+ *
+ * This is important, as otherwise scanning aggression
+ * becomes extremely binary -- from nothing as we
+ * approach the memory protection threshold, to totally
+ * nominal as we exceed it. This results in requiring
+ * setting extremely liberal protection thresholds. It
+ * also means we simply get no protection at all if we
+ * set it too low, which is not ideal.
+ *
+ * If there is any protection in place, we reduce scan
+ * pressure by how much of the total memory used is
+ * within protection thresholds.
+ *
+ * There is one special case: in the first reclaim pass,
+ * we skip over all groups that are within their low
+ * protection. If that fails to reclaim enough pages to
+ * satisfy the reclaim goal, we come back and override
+ * the best-effort low protection. However, we still
+ * ideally want to honor how well-behaved groups are in
+ * that case instead of simply punishing them all
+ * equally. As such, we reclaim them based on how much
+ * memory they are using, reducing the scan pressure
+ * again by how much of the total memory used is under
+ * hard protection.
+ */
+ unsigned long cgroup_size = mem_cgroup_size(memcg);
+ unsigned long protection;
+
+ /* memory.low scaling, make sure we retry before OOM */
+ if (!sc->memcg_low_reclaim && low > min) {
+ protection = low;
+ sc->memcg_low_skipped = 1;
+ } else {
+ protection = min;
+ }
+
+ /* Avoid TOCTOU with earlier protection check */
+ cgroup_size = max(cgroup_size, protection);
+
+ scan -= scan * protection / (cgroup_size + 1);
+
+ /*
+ * Minimally target SWAP_CLUSTER_MAX pages to keep
+ * reclaim moving forwards, avoiding decrementing
+ * sc->priority further than desirable.
+ */
+ scan = max(scan, SWAP_CLUSTER_MAX);
+ }
+ return scan;
+}
+
/*
* Determine how aggressively the anon and file LRU lists should be
* scanned.
@@ -2352,12 +2522,10 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
{
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
- unsigned long anon_cost, file_cost, total_cost;
- int swappiness = mem_cgroup_swappiness(memcg);
+ int swappiness = sc_swappiness(sc, memcg);
u64 fraction[ANON_AND_FILE];
u64 denominator = 0; /* gcc */
enum scan_balance scan_balance;
- unsigned long ap, fp;
enum lru_list lru;
/* If we have no swap space, do not bother scanning anon folios. */
@@ -2378,6 +2546,13 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
goto out;
}
+ /* Proactive reclaim initiated by userspace for anonymous memory only */
+ if (swappiness == SWAPPINESS_ANON_ONLY) {
+ WARN_ON_ONCE(!sc->proactive);
+ scan_balance = SCAN_ANON;
+ goto out;
+ }
+
/*
* Do not apply any pressure balancing cleverness when the
* system is close to OOM, scan both anon and file equally
@@ -2398,7 +2573,8 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
/*
* If there is enough inactive page cache, we do not reclaim
- * anything from the anonymous working right now.
+ * anything from the anonymous working right now to make sure
+ * a streaming file access pattern doesn't cause swapping.
*/
if (sc->cache_trim_mode) {
scan_balance = SCAN_FILE;
@@ -2406,103 +2582,16 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
}
scan_balance = SCAN_FRACT;
- /*
- * Calculate the pressure balance between anon and file pages.
- *
- * The amount of pressure we put on each LRU is inversely
- * proportional to the cost of reclaiming each list, as
- * determined by the share of pages that are refaulting, times
- * the relative IO cost of bringing back a swapped out
- * anonymous page vs reloading a filesystem page (swappiness).
- *
- * Although we limit that influence to ensure no list gets
- * left behind completely: at least a third of the pressure is
- * applied, before swappiness.
- *
- * With swappiness at 100, anon and file have equal IO cost.
- */
- total_cost = sc->anon_cost + sc->file_cost;
- anon_cost = total_cost + sc->anon_cost;
- file_cost = total_cost + sc->file_cost;
- total_cost = anon_cost + file_cost;
-
- ap = swappiness * (total_cost + 1);
- ap /= anon_cost + 1;
-
- fp = (200 - swappiness) * (total_cost + 1);
- fp /= file_cost + 1;
+ calculate_pressure_balance(sc, swappiness, fraction, &denominator);
- fraction[0] = ap;
- fraction[1] = fp;
- denominator = ap + fp;
out:
for_each_evictable_lru(lru) {
bool file = is_file_lru(lru);
unsigned long lruvec_size;
- unsigned long low, min;
unsigned long scan;
lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
- mem_cgroup_protection(sc->target_mem_cgroup, memcg,
- &min, &low);
-
- if (min || low) {
- /*
- * Scale a cgroup's reclaim pressure by proportioning
- * its current usage to its memory.low or memory.min
- * setting.
- *
- * This is important, as otherwise scanning aggression
- * becomes extremely binary -- from nothing as we
- * approach the memory protection threshold, to totally
- * nominal as we exceed it. This results in requiring
- * setting extremely liberal protection thresholds. It
- * also means we simply get no protection at all if we
- * set it too low, which is not ideal.
- *
- * If there is any protection in place, we reduce scan
- * pressure by how much of the total memory used is
- * within protection thresholds.
- *
- * There is one special case: in the first reclaim pass,
- * we skip over all groups that are within their low
- * protection. If that fails to reclaim enough pages to
- * satisfy the reclaim goal, we come back and override
- * the best-effort low protection. However, we still
- * ideally want to honor how well-behaved groups are in
- * that case instead of simply punishing them all
- * equally. As such, we reclaim them based on how much
- * memory they are using, reducing the scan pressure
- * again by how much of the total memory used is under
- * hard protection.
- */
- unsigned long cgroup_size = mem_cgroup_size(memcg);
- unsigned long protection;
-
- /* memory.low scaling, make sure we retry before OOM */
- if (!sc->memcg_low_reclaim && low > min) {
- protection = low;
- sc->memcg_low_skipped = 1;
- } else {
- protection = min;
- }
-
- /* Avoid TOCTOU with earlier protection check */
- cgroup_size = max(cgroup_size, protection);
-
- scan = lruvec_size - lruvec_size * protection /
- (cgroup_size + 1);
-
- /*
- * Minimally target SWAP_CLUSTER_MAX pages to keep
- * reclaim moving forwards, avoiding decrementing
- * sc->priority further than desirable.
- */
- scan = max(scan, SWAP_CLUSTER_MAX);
- } else {
- scan = lruvec_size;
- }
-
+ scan = apply_proportional_protection(memcg, sc, lruvec_size);
scan >>= sc->priority;
/*
@@ -2548,7 +2637,7 @@ out:
* Anonymous LRU management is a waste if there is
* ultimately no way to reclaim the memory.
*/
-static bool can_age_anon_pages(struct pglist_data *pgdat,
+static bool can_age_anon_pages(struct lruvec *lruvec,
struct scan_control *sc)
{
/* Aging the anon LRU is valuable if swap is present: */
@@ -2556,7 +2645,8 @@ static bool can_age_anon_pages(struct pglist_data *pgdat,
return true;
/* Also valuable if anon pages can be demoted: */
- return can_demote(pgdat->node_id, sc);
+ return can_demote(lruvec_pgdat(lruvec)->node_id, sc,
+ lruvec_memcg(lruvec));
}
#ifdef CONFIG_LRU_GEN
@@ -2583,8 +2673,6 @@ static bool should_clear_pmd_young(void)
* shorthand helpers
******************************************************************************/
-#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset))
-
#define DEFINE_MAX_SEQ(lruvec) \
unsigned long max_seq = READ_ONCE((lruvec)->lrugen.max_seq)
@@ -2594,11 +2682,21 @@ static bool should_clear_pmd_young(void)
READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]), \
}
+/* Get the min/max evictable type based on swappiness */
+#define min_type(swappiness) (!(swappiness))
+#define max_type(swappiness) ((swappiness) < SWAPPINESS_ANON_ONLY)
+
+#define evictable_min_seq(min_seq, swappiness) \
+ min((min_seq)[min_type(swappiness)], (min_seq)[max_type(swappiness)])
+
#define for_each_gen_type_zone(gen, type, zone) \
for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \
for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
+#define for_each_evictable_type(type, swappiness) \
+ for ((type) = min_type(swappiness); (type) <= max_type(swappiness); (type)++)
+
#define get_memcg_gen(seq) ((seq) % MEMCG_NR_GENS)
#define get_memcg_bin(bin) ((bin) % MEMCG_NR_BINS)
@@ -2630,11 +2728,11 @@ static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc)
if (!sc->may_swap)
return 0;
- if (!can_demote(pgdat->node_id, sc) &&
+ if (!can_demote(pgdat->node_id, sc, memcg) &&
mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH)
return 0;
- return mem_cgroup_swappiness(memcg);
+ return sc_swappiness(sc, memcg);
}
static int get_nr_gens(struct lruvec *lruvec, int type)
@@ -2644,10 +2742,16 @@ static int get_nr_gens(struct lruvec *lruvec, int type)
static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
{
- /* see the comment on lru_gen_folio */
- return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS &&
- get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) &&
- get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS;
+ int type;
+
+ for (type = 0; type < ANON_AND_FILE; type++) {
+ int n = get_nr_gens(lruvec, type);
+
+ if (n < MIN_NR_GENS || n > MAX_NR_GENS)
+ return false;
+ }
+
+ return true;
}
/******************************************************************************
@@ -3048,16 +3152,20 @@ struct ctrl_pos {
static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain,
struct ctrl_pos *pos)
{
+ int i;
struct lru_gen_folio *lrugen = &lruvec->lrugen;
int hist = lru_hist_from_seq(lrugen->min_seq[type]);
- pos->refaulted = lrugen->avg_refaulted[type][tier] +
- atomic_long_read(&lrugen->refaulted[hist][type][tier]);
- pos->total = lrugen->avg_total[type][tier] +
- atomic_long_read(&lrugen->evicted[hist][type][tier]);
- if (tier)
- pos->total += lrugen->protected[hist][type][tier - 1];
pos->gain = gain;
+ pos->refaulted = pos->total = 0;
+
+ for (i = tier % MAX_NR_TIERS; i <= min(tier, MAX_NR_TIERS - 1); i++) {
+ pos->refaulted += lrugen->avg_refaulted[type][i] +
+ atomic_long_read(&lrugen->refaulted[hist][type][i]);
+ pos->total += lrugen->avg_total[type][i] +
+ lrugen->protected[hist][type][i] +
+ atomic_long_read(&lrugen->evicted[hist][type][i]);
+ }
}
static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover)
@@ -3083,17 +3191,15 @@ static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover)
WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2);
sum = lrugen->avg_total[type][tier] +
+ lrugen->protected[hist][type][tier] +
atomic_long_read(&lrugen->evicted[hist][type][tier]);
- if (tier)
- sum += lrugen->protected[hist][type][tier - 1];
WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2);
}
if (clear) {
atomic_long_set(&lrugen->refaulted[hist][type][tier], 0);
atomic_long_set(&lrugen->evicted[hist][type][tier], 0);
- if (tier)
- WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0);
+ WRITE_ONCE(lrugen->protected[hist][type][tier], 0);
}
}
}
@@ -3116,22 +3222,24 @@ static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv)
/* promote pages accessed through page tables */
static int folio_update_gen(struct folio *folio, int gen)
{
- unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
+ unsigned long new_flags, old_flags = READ_ONCE(folio->flags.f);
VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
- VM_WARN_ON_ONCE(!rcu_read_lock_held());
+
+ /* see the comment on LRU_REFS_FLAGS */
+ if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) {
+ set_mask_bits(&folio->flags.f, LRU_REFS_MASK, BIT(PG_referenced));
+ return -1;
+ }
do {
/* lru_gen_del_folio() has isolated this page? */
- if (!(old_flags & LRU_GEN_MASK)) {
- /* for shrink_folio_list() */
- new_flags = old_flags | BIT(PG_referenced);
- continue;
- }
+ if (!(old_flags & LRU_GEN_MASK))
+ return -1;
- new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
- new_flags |= (gen + 1UL) << LRU_GEN_PGOFF;
- } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
+ new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_FLAGS);
+ new_flags |= ((gen + 1UL) << LRU_GEN_PGOFF) | BIT(PG_workingset);
+ } while (!try_cmpxchg(&folio->flags.f, &old_flags, new_flags));
return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
}
@@ -3142,7 +3250,7 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai
int type = folio_is_file_lru(folio);
struct lru_gen_folio *lrugen = &lruvec->lrugen;
int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
- unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
+ unsigned long new_flags, old_flags = READ_ONCE(folio->flags.f);
VM_WARN_ON_ONCE_FOLIO(!(old_flags & LRU_GEN_MASK), folio);
@@ -3154,12 +3262,12 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai
new_gen = (old_gen + 1) % MAX_NR_GENS;
- new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
+ new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_FLAGS);
new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF;
/* for folio_end_writeback() */
if (reclaiming)
new_flags |= BIT(PG_reclaim);
- } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
+ } while (!try_cmpxchg(&folio->flags.f, &old_flags, new_flags));
lru_gen_update_size(lruvec, folio, old_gen, new_gen);
@@ -3229,7 +3337,7 @@ static int should_skip_vma(unsigned long start, unsigned long end, struct mm_wal
return true;
if (vma_is_anonymous(vma))
- return !walk->can_swap;
+ return !walk->swappiness;
if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping))
return true;
@@ -3239,7 +3347,10 @@ static int should_skip_vma(unsigned long start, unsigned long end, struct mm_wal
return true;
if (shmem_mapping(mapping))
- return !walk->can_swap;
+ return !walk->swappiness;
+
+ if (walk->swappiness > MAX_SWAPPINESS)
+ return true;
/* to exclude special mappings like dax, etc. */
return !mapping->a_ops->read_folio;
@@ -3276,7 +3387,8 @@ static bool get_next_vma(unsigned long mask, unsigned long size, struct mm_walk
return false;
}
-static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr)
+static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr,
+ struct pglist_data *pgdat)
{
unsigned long pfn = pte_pfn(pte);
@@ -3285,16 +3397,23 @@ static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned
if (!pte_present(pte) || is_zero_pfn(pfn))
return -1;
- if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte)))
+ if (WARN_ON_ONCE(pte_special(pte)))
+ return -1;
+
+ if (!pte_young(pte) && !mm_has_notifiers(vma->vm_mm))
return -1;
if (WARN_ON_ONCE(!pfn_valid(pfn)))
return -1;
+ if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
+ return -1;
+
return pfn;
}
-static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr)
+static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr,
+ struct pglist_data *pgdat)
{
unsigned long pfn = pmd_pfn(pmd);
@@ -3303,33 +3422,30 @@ static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned
if (!pmd_present(pmd) || is_huge_zero_pmd(pmd))
return -1;
- if (WARN_ON_ONCE(pmd_devmap(pmd)))
+ if (!pmd_young(pmd) && !mm_has_notifiers(vma->vm_mm))
return -1;
if (WARN_ON_ONCE(!pfn_valid(pfn)))
return -1;
+ if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
+ return -1;
+
return pfn;
}
static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg,
- struct pglist_data *pgdat, bool can_swap)
+ struct pglist_data *pgdat)
{
- struct folio *folio;
+ struct folio *folio = pfn_folio(pfn);
- /* try to avoid unnecessary memory loads */
- if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
+ if (folio_lru_gen(folio) < 0)
return NULL;
- folio = pfn_folio(pfn);
if (folio_nid(folio) != pgdat->node_id)
return NULL;
- if (folio_memcg_rcu(folio) != memcg)
- return NULL;
-
- /* file VMAs can contain anon pages from COW */
- if (!folio_is_file_lru(folio) && !can_swap)
+ if (folio_memcg(folio) != memcg)
return NULL;
return folio;
@@ -3343,26 +3459,59 @@ static bool suitable_to_scan(int total, int young)
return young * n >= total;
}
+static void walk_update_folio(struct lru_gen_mm_walk *walk, struct folio *folio,
+ int new_gen, bool dirty)
+{
+ int old_gen;
+
+ if (!folio)
+ return;
+
+ if (dirty && !folio_test_dirty(folio) &&
+ !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
+ !folio_test_swapcache(folio)))
+ folio_mark_dirty(folio);
+
+ if (walk) {
+ old_gen = folio_update_gen(folio, new_gen);
+ if (old_gen >= 0 && old_gen != new_gen)
+ update_batch_size(walk, folio, old_gen, new_gen);
+ } else if (lru_gen_set_refs(folio)) {
+ old_gen = folio_lru_gen(folio);
+ if (old_gen >= 0 && old_gen != new_gen)
+ folio_activate(folio);
+ }
+}
+
static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
struct mm_walk *args)
{
int i;
+ bool dirty;
pte_t *pte;
spinlock_t *ptl;
unsigned long addr;
int total = 0;
int young = 0;
+ struct folio *last = NULL;
struct lru_gen_mm_walk *walk = args->private;
struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec);
struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
DEFINE_MAX_SEQ(walk->lruvec);
- int old_gen, new_gen = lru_gen_from_seq(max_seq);
+ int gen = lru_gen_from_seq(max_seq);
+ pmd_t pmdval;
- pte = pte_offset_map_nolock(args->mm, pmd, start & PMD_MASK, &ptl);
+ pte = pte_offset_map_rw_nolock(args->mm, pmd, start & PMD_MASK, &pmdval, &ptl);
if (!pte)
return false;
+
if (!spin_trylock(ptl)) {
pte_unmap(pte);
+ return true;
+ }
+
+ if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
+ pte_unmap_unlock(pte, ptl);
return false;
}
@@ -3376,35 +3525,34 @@ restart:
total++;
walk->mm_stats[MM_LEAF_TOTAL]++;
- pfn = get_pte_pfn(ptent, args->vma, addr);
+ pfn = get_pte_pfn(ptent, args->vma, addr, pgdat);
if (pfn == -1)
continue;
- if (!pte_young(ptent)) {
- walk->mm_stats[MM_LEAF_OLD]++;
+ folio = get_pfn_folio(pfn, memcg, pgdat);
+ if (!folio)
continue;
- }
- folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap);
- if (!folio)
+ if (!ptep_clear_young_notify(args->vma, addr, pte + i))
continue;
- if (!ptep_test_and_clear_young(args->vma, addr, pte + i))
- VM_WARN_ON_ONCE(true);
+ if (last != folio) {
+ walk_update_folio(walk, last, gen, dirty);
- young++;
- walk->mm_stats[MM_LEAF_YOUNG]++;
+ last = folio;
+ dirty = false;
+ }
- if (pte_dirty(ptent) && !folio_test_dirty(folio) &&
- !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
- !folio_test_swapcache(folio)))
- folio_mark_dirty(folio);
+ if (pte_dirty(ptent))
+ dirty = true;
- old_gen = folio_update_gen(folio, new_gen);
- if (old_gen >= 0 && old_gen != new_gen)
- update_batch_size(walk, folio, old_gen, new_gen);
+ young++;
+ walk->mm_stats[MM_LEAF_YOUNG]++;
}
+ walk_update_folio(walk, last, gen, dirty);
+ last = NULL;
+
if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end))
goto restart;
@@ -3418,13 +3566,15 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area
struct mm_walk *args, unsigned long *bitmap, unsigned long *first)
{
int i;
+ bool dirty;
pmd_t *pmd;
spinlock_t *ptl;
+ struct folio *last = NULL;
struct lru_gen_mm_walk *walk = args->private;
struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec);
struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
DEFINE_MAX_SEQ(walk->lruvec);
- int old_gen, new_gen = lru_gen_from_seq(max_seq);
+ int gen = lru_gen_from_seq(max_seq);
VM_WARN_ON_ONCE(pud_leaf(*pud));
@@ -3456,37 +3606,44 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area
/* don't round down the first address */
addr = i ? (*first & PMD_MASK) + i * PMD_SIZE : *first;
- pfn = get_pmd_pfn(pmd[i], vma, addr);
- if (pfn == -1)
+ if (!pmd_present(pmd[i]))
goto next;
if (!pmd_trans_huge(pmd[i])) {
- if (should_clear_pmd_young())
+ if (!walk->force_scan && should_clear_pmd_young() &&
+ !mm_has_notifiers(args->mm))
pmdp_test_and_clear_young(vma, addr, pmd + i);
goto next;
}
- folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap);
+ pfn = get_pmd_pfn(pmd[i], vma, addr, pgdat);
+ if (pfn == -1)
+ goto next;
+
+ folio = get_pfn_folio(pfn, memcg, pgdat);
if (!folio)
goto next;
- if (!pmdp_test_and_clear_young(vma, addr, pmd + i))
+ if (!pmdp_clear_young_notify(vma, addr, pmd + i))
goto next;
- walk->mm_stats[MM_LEAF_YOUNG]++;
+ if (last != folio) {
+ walk_update_folio(walk, last, gen, dirty);
- if (pmd_dirty(pmd[i]) && !folio_test_dirty(folio) &&
- !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
- !folio_test_swapcache(folio)))
- folio_mark_dirty(folio);
+ last = folio;
+ dirty = false;
+ }
- old_gen = folio_update_gen(folio, new_gen);
- if (old_gen >= 0 && old_gen != new_gen)
- update_batch_size(walk, folio, old_gen, new_gen);
+ if (pmd_dirty(pmd[i]))
+ dirty = true;
+
+ walk->mm_stats[MM_LEAF_YOUNG]++;
next:
i = i > MIN_LRU_BATCH ? 0 : find_next_bit(bitmap, MIN_LRU_BATCH, i) + 1;
} while (i <= MIN_LRU_BATCH);
+ walk_update_folio(walk, last, gen, dirty);
+
arch_leave_lazy_mmu_mode();
spin_unlock(ptl);
done:
@@ -3528,27 +3685,18 @@ restart:
}
if (pmd_trans_huge(val)) {
- unsigned long pfn = pmd_pfn(val);
struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
+ unsigned long pfn = get_pmd_pfn(val, vma, addr, pgdat);
walk->mm_stats[MM_LEAF_TOTAL]++;
- if (!pmd_young(val)) {
- walk->mm_stats[MM_LEAF_OLD]++;
- continue;
- }
-
- /* try to avoid unnecessary memory loads */
- if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
- continue;
-
- walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first);
+ if (pfn != -1)
+ walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first);
continue;
}
- walk->mm_stats[MM_NONLEAF_TOTAL]++;
-
- if (should_clear_pmd_young()) {
+ if (!walk->force_scan && should_clear_pmd_young() &&
+ !mm_has_notifiers(args->mm)) {
if (!pmd_young(val))
continue;
@@ -3589,7 +3737,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end,
pud = pud_offset(p4d, start & P4D_MASK);
restart:
for (i = pud_index(start), addr = start; addr != end; i++, addr = next) {
- pud_t val = READ_ONCE(pud[i]);
+ pud_t val = pudp_get(pud + i);
next = pud_addr_end(addr, end);
@@ -3624,10 +3772,8 @@ static void walk_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk)
.p4d_entry = walk_pud_range,
.walk_lock = PGWALK_RDLOCK,
};
-
int err;
struct lruvec *lruvec = walk->lruvec;
- struct mem_cgroup *memcg = lruvec_memcg(lruvec);
walk->next_addr = FIRST_USER_ADDRESS;
@@ -3640,10 +3786,6 @@ static void walk_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk)
if (walk->seq != max_seq)
break;
- /* folio_update_gen() requires stable folio_memcg() */
- if (!mem_cgroup_trylock_pages(memcg))
- break;
-
/* the caller might be holding the lock for write */
if (mmap_read_trylock(mm)) {
err = walk_page_range(mm, walk->next_addr, ULONG_MAX, &mm_walk_ops, walk);
@@ -3651,8 +3793,6 @@ static void walk_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk)
mmap_read_unlock(mm);
}
- mem_cgroup_unlock_pages();
-
if (walk->batched) {
spin_lock_irq(&lruvec->lru_lock);
reset_batch_size(walk);
@@ -3695,22 +3835,30 @@ static void clear_mm_walk(void)
kfree(walk);
}
-static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
+static bool inc_min_seq(struct lruvec *lruvec, int type, int swappiness)
{
int zone;
int remaining = MAX_LRU_BATCH;
struct lru_gen_folio *lrugen = &lruvec->lrugen;
+ int hist = lru_hist_from_seq(lrugen->min_seq[type]);
int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
- if (type == LRU_GEN_ANON && !can_swap)
+ /* For file type, skip the check if swappiness is anon only */
+ if (type && (swappiness == SWAPPINESS_ANON_ONLY))
+ goto done;
+
+ /* For anon type, skip the check if swappiness is zero (file only) */
+ if (!type && !swappiness)
goto done;
- /* prevent cold/hot inversion if force_scan is true */
+ /* prevent cold/hot inversion if the type is evictable */
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
struct list_head *head = &lrugen->folios[old_gen][type][zone];
while (!list_empty(head)) {
struct folio *folio = lru_to_folio(head);
+ int refs = folio_lru_refs(folio);
+ bool workingset = folio_test_workingset(folio);
VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
@@ -3720,6 +3868,15 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
new_gen = folio_inc_gen(lruvec, folio, false);
list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]);
+ /* don't count the workingset being lazily promoted */
+ if (refs + workingset != BIT(LRU_REFS_WIDTH) + 1) {
+ int tier = lru_tier_from_refs(refs, workingset);
+ int delta = folio_nr_pages(folio);
+
+ WRITE_ONCE(lrugen->protected[hist][type][tier],
+ lrugen->protected[hist][type][tier] + delta);
+ }
+
if (!--remaining)
return false;
}
@@ -3731,17 +3888,18 @@ done:
return true;
}
-static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
+static bool try_to_inc_min_seq(struct lruvec *lruvec, int swappiness)
{
int gen, type, zone;
bool success = false;
+ bool seq_inc_flag = false;
struct lru_gen_folio *lrugen = &lruvec->lrugen;
DEFINE_MIN_SEQ(lruvec);
VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
/* find the oldest populated generation */
- for (type = !can_swap; type < ANON_AND_FILE; type++) {
+ for_each_evictable_type(type, swappiness) {
while (min_seq[type] + MIN_NR_GENS <= lrugen->max_seq) {
gen = lru_gen_from_seq(min_seq[type]);
@@ -3751,19 +3909,32 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
}
min_seq[type]++;
+ seq_inc_flag = true;
}
next:
;
}
+ /*
+ * If min_seq[type] of both anonymous and file is not increased,
+ * we can directly return false to avoid unnecessary checking
+ * overhead later.
+ */
+ if (!seq_inc_flag)
+ return success;
+
/* see the comment on lru_gen_folio */
- if (can_swap) {
- min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]);
- min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]);
+ if (swappiness && swappiness <= MAX_SWAPPINESS) {
+ unsigned long seq = lrugen->max_seq - MIN_NR_GENS;
+
+ if (min_seq[LRU_GEN_ANON] > seq && min_seq[LRU_GEN_FILE] < seq)
+ min_seq[LRU_GEN_ANON] = seq;
+ else if (min_seq[LRU_GEN_FILE] > seq && min_seq[LRU_GEN_ANON] < seq)
+ min_seq[LRU_GEN_FILE] = seq;
}
- for (type = !can_swap; type < ANON_AND_FILE; type++) {
- if (min_seq[type] == lrugen->min_seq[type])
+ for_each_evictable_type(type, swappiness) {
+ if (min_seq[type] <= lrugen->min_seq[type])
continue;
reset_ctrl_pos(lruvec, type, true);
@@ -3774,8 +3945,7 @@ next:
return success;
}
-static bool inc_max_seq(struct lruvec *lruvec, unsigned long seq,
- bool can_swap, bool force_scan)
+static bool inc_max_seq(struct lruvec *lruvec, unsigned long seq, int swappiness)
{
bool success;
int prev, next;
@@ -3793,13 +3963,11 @@ restart:
if (!success)
goto unlock;
- for (type = ANON_AND_FILE - 1; type >= 0; type--) {
+ for (type = 0; type < ANON_AND_FILE; type++) {
if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
continue;
- VM_WARN_ON_ONCE(!force_scan && (type == LRU_GEN_FILE || can_swap));
-
- if (inc_min_seq(lruvec, type, can_swap))
+ if (inc_min_seq(lruvec, type, swappiness))
continue;
spin_unlock_irq(&lruvec->lru_lock);
@@ -3843,7 +4011,7 @@ unlock:
}
static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq,
- bool can_swap, bool force_scan)
+ int swappiness, bool force_scan)
{
bool success;
struct lru_gen_mm_walk *walk;
@@ -3854,7 +4022,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq,
VM_WARN_ON_ONCE(seq > READ_ONCE(lrugen->max_seq));
if (!mm_state)
- return inc_max_seq(lruvec, seq, can_swap, force_scan);
+ return inc_max_seq(lruvec, seq, swappiness);
/* see the comment in iterate_mm_list() */
if (seq <= READ_ONCE(mm_state->seq))
@@ -3879,7 +4047,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq,
walk->lruvec = lruvec;
walk->seq = seq;
- walk->can_swap = can_swap;
+ walk->swappiness = swappiness;
walk->force_scan = force_scan;
do {
@@ -3889,7 +4057,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq,
} while (mm);
done:
if (success) {
- success = inc_max_seq(lruvec, seq, can_swap, force_scan);
+ success = inc_max_seq(lruvec, seq, swappiness);
WARN_ON_ONCE(!success);
}
@@ -3900,17 +4068,43 @@ done:
* working set protection
******************************************************************************/
+static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc)
+{
+ int priority;
+ unsigned long reclaimable;
+
+ if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH)
+ return;
+ /*
+ * Determine the initial priority based on
+ * (total >> priority) * reclaimed_to_scanned_ratio = nr_to_reclaim,
+ * where reclaimed_to_scanned_ratio = inactive / total.
+ */
+ reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE);
+ if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc))
+ reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON);
+
+ /* round down reclaimable and round up sc->nr_to_reclaim */
+ priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1);
+
+ /*
+ * The estimation is based on LRU pages only, so cap it to prevent
+ * overshoots of shrinker objects by large margins.
+ */
+ sc->priority = clamp(priority, DEF_PRIORITY / 2, DEF_PRIORITY);
+}
+
static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc)
{
int gen, type, zone;
unsigned long total = 0;
- bool can_swap = get_swappiness(lruvec, sc);
+ int swappiness = get_swappiness(lruvec, sc);
struct lru_gen_folio *lrugen = &lruvec->lrugen;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
DEFINE_MAX_SEQ(lruvec);
DEFINE_MIN_SEQ(lruvec);
- for (type = !can_swap; type < ANON_AND_FILE; type++) {
+ for_each_evictable_type(type, swappiness) {
unsigned long seq;
for (seq = min_seq[type]; seq <= max_seq; seq++) {
@@ -3930,22 +4124,20 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc
{
int gen;
unsigned long birth;
+ int swappiness = get_swappiness(lruvec, sc);
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
DEFINE_MIN_SEQ(lruvec);
- /* see the comment on lru_gen_folio */
- gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
- birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
-
- if (time_is_after_jiffies(birth + min_ttl))
+ if (mem_cgroup_below_min(NULL, memcg))
return false;
if (!lruvec_is_sizable(lruvec, sc))
return false;
- mem_cgroup_calculate_protection(NULL, memcg);
+ gen = lru_gen_from_seq(evictable_min_seq(min_seq, swappiness));
+ birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
- return !mem_cgroup_below_min(NULL, memcg);
+ return time_is_before_jiffies(birth + min_ttl);
}
/* to protect the working set of the last N jiffies */
@@ -3955,23 +4147,20 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
{
struct mem_cgroup *memcg;
unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl);
+ bool reclaimable = !min_ttl;
VM_WARN_ON_ONCE(!current_is_kswapd());
- /* check the order to exclude compaction-induced reclaim */
- if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY)
- return;
+ set_initial_priority(pgdat, sc);
memcg = mem_cgroup_iter(NULL, NULL, NULL);
do {
struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
- if (lruvec_is_reclaimable(lruvec, sc, min_ttl)) {
- mem_cgroup_iter_break(NULL, memcg);
- return;
- }
+ mem_cgroup_calculate_protection(NULL, memcg);
- cond_resched();
+ if (!reclaimable)
+ reclaimable = lruvec_is_reclaimable(lruvec, sc, min_ttl);
} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
/*
@@ -3979,7 +4168,7 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
* younger than min_ttl. However, another possibility is all memcgs are
* either too small or below min.
*/
- if (mutex_trylock(&oom_lock)) {
+ if (!reclaimable && mutex_trylock(&oom_lock)) {
struct oom_control oc = {
.gfp_mask = sc->gfp_mask,
};
@@ -4001,34 +4190,38 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
* the PTE table to the Bloom filter. This forms a feedback loop between the
* eviction and the aging.
*/
-void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
+bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
{
int i;
+ bool dirty;
unsigned long start;
unsigned long end;
struct lru_gen_mm_walk *walk;
- int young = 0;
+ struct folio *last = NULL;
+ int young = 1;
pte_t *pte = pvmw->pte;
unsigned long addr = pvmw->address;
struct vm_area_struct *vma = pvmw->vma;
struct folio *folio = pfn_folio(pvmw->pfn);
- bool can_swap = !folio_is_file_lru(folio);
struct mem_cgroup *memcg = folio_memcg(folio);
struct pglist_data *pgdat = folio_pgdat(folio);
struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
DEFINE_MAX_SEQ(lruvec);
- int old_gen, new_gen = lru_gen_from_seq(max_seq);
+ int gen = lru_gen_from_seq(max_seq);
lockdep_assert_held(pvmw->ptl);
VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio);
+ if (!ptep_clear_young_notify(vma, addr, pte))
+ return false;
+
if (spin_is_contended(pvmw->ptl))
- return;
+ return true;
/* exclude special VMAs containing anon pages from COW */
if (vma->vm_flags & VM_SPECIAL)
- return;
+ return true;
/* avoid taking the LRU lock under the PTL when possible */
walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL;
@@ -4036,6 +4229,9 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
start = max(addr & PMD_MASK, vma->vm_start);
end = min(addr | ~PMD_MASK, vma->vm_end - 1) + 1;
+ if (end - start == PAGE_SIZE)
+ return true;
+
if (end - start > MIN_LRU_BATCH * PAGE_SIZE) {
if (addr - start < MIN_LRU_BATCH * PAGE_SIZE / 2)
end = start + MIN_LRU_BATCH * PAGE_SIZE;
@@ -4047,10 +4243,6 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
}
}
- /* folio_update_gen() requires stable folio_memcg() */
- if (!mem_cgroup_trylock_pages(memcg))
- return;
-
arch_enter_lazy_mmu_mode();
pte -= (addr - start) / PAGE_SIZE;
@@ -4059,48 +4251,39 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
unsigned long pfn;
pte_t ptent = ptep_get(pte + i);
- pfn = get_pte_pfn(ptent, vma, addr);
+ pfn = get_pte_pfn(ptent, vma, addr, pgdat);
if (pfn == -1)
continue;
- if (!pte_young(ptent))
- continue;
-
- folio = get_pfn_folio(pfn, memcg, pgdat, can_swap);
+ folio = get_pfn_folio(pfn, memcg, pgdat);
if (!folio)
continue;
- if (!ptep_test_and_clear_young(vma, addr, pte + i))
- VM_WARN_ON_ONCE(true);
-
- young++;
-
- if (pte_dirty(ptent) && !folio_test_dirty(folio) &&
- !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
- !folio_test_swapcache(folio)))
- folio_mark_dirty(folio);
+ if (!ptep_clear_young_notify(vma, addr, pte + i))
+ continue;
- if (walk) {
- old_gen = folio_update_gen(folio, new_gen);
- if (old_gen >= 0 && old_gen != new_gen)
- update_batch_size(walk, folio, old_gen, new_gen);
+ if (last != folio) {
+ walk_update_folio(walk, last, gen, dirty);
- continue;
+ last = folio;
+ dirty = false;
}
- old_gen = folio_lru_gen(folio);
- if (old_gen < 0)
- folio_set_referenced(folio);
- else if (old_gen != new_gen)
- folio_activate(folio);
+ if (pte_dirty(ptent))
+ dirty = true;
+
+ young++;
}
+ walk_update_folio(walk, last, gen, dirty);
+
arch_leave_lazy_mmu_mode();
- mem_cgroup_unlock_pages();
/* feedback from rmap walkers to page table walkers */
if (mm_state && suitable_to_scan(i, young))
update_bloom_filter(mm_state, max_seq, pvmw->pmd);
+
+ return true;
}
/******************************************************************************
@@ -4245,12 +4428,14 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
int tier_idx)
{
bool success;
+ bool dirty, writeback;
int gen = folio_lru_gen(folio);
int type = folio_is_file_lru(folio);
int zone = folio_zonenum(folio);
int delta = folio_nr_pages(folio);
int refs = folio_lru_refs(folio);
- int tier = lru_tier_from_refs(refs);
+ bool workingset = folio_test_workingset(folio);
+ int tier = lru_tier_from_refs(refs, workingset);
struct lru_gen_folio *lrugen = &lruvec->lrugen;
VM_WARN_ON_ONCE_FOLIO(gen >= MAX_NR_GENS, folio);
@@ -4265,15 +4450,6 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
return true;
}
- /* dirty lazyfree */
- if (type == LRU_GEN_FILE && folio_test_anon(folio) && folio_test_dirty(folio)) {
- success = lru_gen_del_folio(lruvec, folio, true);
- VM_WARN_ON_ONCE_FOLIO(!success, folio);
- folio_set_swapbacked(folio);
- lruvec_add_folio_tail(lruvec, folio);
- return true;
- }
-
/* promoted */
if (gen != lru_gen_from_seq(lrugen->min_seq[type])) {
list_move(&folio->lru, &lrugen->folios[gen][type][zone]);
@@ -4281,27 +4457,37 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
}
/* protected */
- if (tier > tier_idx || refs == BIT(LRU_REFS_WIDTH)) {
- int hist = lru_hist_from_seq(lrugen->min_seq[type]);
-
+ if (tier > tier_idx || refs + workingset == BIT(LRU_REFS_WIDTH) + 1) {
gen = folio_inc_gen(lruvec, folio, false);
- list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
+ list_move(&folio->lru, &lrugen->folios[gen][type][zone]);
+
+ /* don't count the workingset being lazily promoted */
+ if (refs + workingset != BIT(LRU_REFS_WIDTH) + 1) {
+ int hist = lru_hist_from_seq(lrugen->min_seq[type]);
- WRITE_ONCE(lrugen->protected[hist][type][tier - 1],
- lrugen->protected[hist][type][tier - 1] + delta);
+ WRITE_ONCE(lrugen->protected[hist][type][tier],
+ lrugen->protected[hist][type][tier] + delta);
+ }
return true;
}
/* ineligible */
- if (zone > sc->reclaim_idx || skip_cma(folio, sc)) {
+ if (zone > sc->reclaim_idx) {
gen = folio_inc_gen(lruvec, folio, false);
list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
return true;
}
+ dirty = folio_test_dirty(folio);
+ writeback = folio_test_writeback(folio);
+ if (type == LRU_GEN_FILE && dirty) {
+ sc->nr.file_taken += delta;
+ if (!writeback)
+ sc->nr.unqueued_dirty += delta;
+ }
+
/* waiting for writeback */
- if (folio_test_locked(folio) || folio_test_writeback(folio) ||
- (type == LRU_GEN_FILE && folio_test_dirty(folio))) {
+ if (writeback || (type == LRU_GEN_FILE && dirty)) {
gen = folio_inc_gen(lruvec, folio, true);
list_move(&folio->lru, &lrugen->folios[gen][type][zone]);
return true;
@@ -4330,13 +4516,12 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca
return false;
}
- /* see the comment on MAX_NR_TIERS */
+ /* see the comment on LRU_REFS_FLAGS */
if (!folio_test_referenced(folio))
- set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0);
+ set_mask_bits(&folio->flags.f, LRU_REFS_MASK, 0);
/* for shrink_folio_list() */
folio_clear_reclaim(folio);
- folio_clear_referenced(folio);
success = lru_gen_del_folio(lruvec, folio, true);
VM_WARN_ON_ONCE_FOLIO(!success, folio);
@@ -4344,8 +4529,9 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca
return true;
}
-static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
- int type, int tier, struct list_head *list)
+static int scan_folios(unsigned long nr_to_scan, struct lruvec *lruvec,
+ struct scan_control *sc, int type, int tier,
+ struct list_head *list)
{
int i;
int gen;
@@ -4354,7 +4540,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
int scanned = 0;
int isolated = 0;
int skipped = 0;
- int remaining = MAX_LRU_BATCH;
+ int remaining = min(nr_to_scan, MAX_LRU_BATCH);
struct lru_gen_folio *lrugen = &lruvec->lrugen;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
@@ -4406,18 +4592,19 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
break;
}
- item = PGSCAN_KSWAPD + reclaimer_offset();
+ item = PGSCAN_KSWAPD + reclaimer_offset(sc);
if (!cgroup_reclaim(sc)) {
__count_vm_events(item, isolated);
__count_vm_events(PGREFILL, sorted);
}
- __count_memcg_events(memcg, item, isolated);
- __count_memcg_events(memcg, PGREFILL, sorted);
+ count_memcg_events(memcg, item, isolated);
+ count_memcg_events(memcg, PGREFILL, sorted);
__count_vm_events(PGSCAN_ANON + type, isolated);
trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, MAX_LRU_BATCH,
scanned, skipped, isolated,
type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON);
-
+ if (type == LRU_GEN_FILE)
+ sc->nr.file_taken += isolated;
/*
* There might not be eligible folios due to reclaim_idx. Check the
* remaining to prevent livelock if it's not making progress.
@@ -4431,13 +4618,13 @@ static int get_tier_idx(struct lruvec *lruvec, int type)
struct ctrl_pos sp, pv;
/*
- * To leave a margin for fluctuations, use a larger gain factor (1:2).
+ * To leave a margin for fluctuations, use a larger gain factor (2:3).
* This value is chosen because any other tier would have at least twice
* as many refaults as the first tier.
*/
- read_ctrl_pos(lruvec, type, 0, 1, &sp);
+ read_ctrl_pos(lruvec, type, 0, 2, &sp);
for (tier = 1; tier < MAX_NR_TIERS; tier++) {
- read_ctrl_pos(lruvec, type, tier, 2, &pv);
+ read_ctrl_pos(lruvec, type, tier, 3, &pv);
if (!positive_ctrl_err(&sp, &pv))
break;
}
@@ -4445,82 +4632,50 @@ static int get_tier_idx(struct lruvec *lruvec, int type)
return tier - 1;
}
-static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx)
+static int get_type_to_scan(struct lruvec *lruvec, int swappiness)
{
- int type, tier;
struct ctrl_pos sp, pv;
- int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness };
+ if (swappiness <= MIN_SWAPPINESS + 1)
+ return LRU_GEN_FILE;
+
+ if (swappiness >= MAX_SWAPPINESS)
+ return LRU_GEN_ANON;
/*
- * Compare the first tier of anon with that of file to determine which
- * type to scan. Also need to compare other tiers of the selected type
- * with the first tier of the other type to determine the last tier (of
- * the selected type) to evict.
+ * Compare the sum of all tiers of anon with that of file to determine
+ * which type to scan.
*/
- read_ctrl_pos(lruvec, LRU_GEN_ANON, 0, gain[LRU_GEN_ANON], &sp);
- read_ctrl_pos(lruvec, LRU_GEN_FILE, 0, gain[LRU_GEN_FILE], &pv);
- type = positive_ctrl_err(&sp, &pv);
-
- read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp);
- for (tier = 1; tier < MAX_NR_TIERS; tier++) {
- read_ctrl_pos(lruvec, type, tier, gain[type], &pv);
- if (!positive_ctrl_err(&sp, &pv))
- break;
- }
-
- *tier_idx = tier - 1;
+ read_ctrl_pos(lruvec, LRU_GEN_ANON, MAX_NR_TIERS, swappiness, &sp);
+ read_ctrl_pos(lruvec, LRU_GEN_FILE, MAX_NR_TIERS, MAX_SWAPPINESS - swappiness, &pv);
- return type;
+ return positive_ctrl_err(&sp, &pv);
}
-static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
+static int isolate_folios(unsigned long nr_to_scan, struct lruvec *lruvec,
+ struct scan_control *sc, int swappiness,
int *type_scanned, struct list_head *list)
{
int i;
- int type;
- int scanned;
- int tier = -1;
- DEFINE_MIN_SEQ(lruvec);
+ int type = get_type_to_scan(lruvec, swappiness);
- /*
- * Try to make the obvious choice first, and if anon and file are both
- * available from the same generation,
- * 1. Interpret swappiness 1 as file first and MAX_SWAPPINESS as anon
- * first.
- * 2. If !__GFP_IO, file first since clean pagecache is more likely to
- * exist than clean swapcache.
- */
- if (!swappiness)
- type = LRU_GEN_FILE;
- else if (min_seq[LRU_GEN_ANON] < min_seq[LRU_GEN_FILE])
- type = LRU_GEN_ANON;
- else if (swappiness == 1)
- type = LRU_GEN_FILE;
- else if (swappiness == 200)
- type = LRU_GEN_ANON;
- else if (!(sc->gfp_mask & __GFP_IO))
- type = LRU_GEN_FILE;
- else
- type = get_type_to_scan(lruvec, swappiness, &tier);
+ for_each_evictable_type(i, swappiness) {
+ int scanned;
+ int tier = get_tier_idx(lruvec, type);
- for (i = !swappiness; i < ANON_AND_FILE; i++) {
- if (tier < 0)
- tier = get_tier_idx(lruvec, type);
+ *type_scanned = type;
- scanned = scan_folios(lruvec, sc, type, tier, list);
+ scanned = scan_folios(nr_to_scan, lruvec, sc, type, tier, list);
if (scanned)
- break;
+ return scanned;
type = !type;
- tier = -1;
}
- *type_scanned = type;
-
- return scanned;
+ return 0;
}
-static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
+static int evict_folios(unsigned long nr_to_scan, struct lruvec *lruvec,
+ struct scan_control *sc, int swappiness)
{
int type;
int scanned;
@@ -4533,16 +4688,17 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
struct reclaim_stat stat;
struct lru_gen_mm_walk *walk;
bool skip_retry = false;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
spin_lock_irq(&lruvec->lru_lock);
- scanned = isolate_folios(lruvec, sc, swappiness, &type, &list);
+ scanned = isolate_folios(nr_to_scan, lruvec, sc, swappiness, &type, &list);
scanned += try_to_inc_min_seq(lruvec, swappiness);
- if (get_nr_gens(lruvec, !swappiness) == MIN_NR_GENS)
+ if (evictable_min_seq(lrugen->min_seq, swappiness) + MIN_NR_GENS > lrugen->max_seq)
scanned = 0;
spin_unlock_irq(&lruvec->lru_lock);
@@ -4550,39 +4706,32 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
if (list_empty(&list))
return scanned;
retry:
- reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false);
+ reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false, memcg);
+ sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
sc->nr_reclaimed += reclaimed;
trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
scanned, reclaimed, &stat, sc->priority,
type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON);
list_for_each_entry_safe_reverse(folio, next, &list, lru) {
+ DEFINE_MIN_SEQ(lruvec);
+
if (!folio_evictable(folio)) {
list_del(&folio->lru);
folio_putback_lru(folio);
continue;
}
- if (folio_test_reclaim(folio) &&
- (folio_test_dirty(folio) || folio_test_writeback(folio))) {
- /* restore LRU_REFS_FLAGS cleared by isolate_folio() */
- if (folio_test_workingset(folio))
- folio_set_referenced(folio);
- continue;
- }
-
- if (skip_retry || folio_test_active(folio) || folio_test_referenced(folio) ||
- folio_mapped(folio) || folio_test_locked(folio) ||
- folio_test_dirty(folio) || folio_test_writeback(folio)) {
- /* don't add rejected folios to the oldest generation */
- set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS,
- BIT(PG_active));
+ /* retry folios that may have missed folio_rotate_reclaimable() */
+ if (!skip_retry && !folio_test_active(folio) && !folio_mapped(folio) &&
+ !folio_test_dirty(folio) && !folio_test_writeback(folio)) {
+ list_move(&folio->lru, &clean);
continue;
}
- /* retry folios that may have missed folio_rotate_reclaimable() */
- list_move(&folio->lru, &clean);
- sc->nr_scanned -= folio_nr_pages(folio);
+ /* don't add rejected folios to the oldest generation */
+ if (lru_gen_folio_seq(lruvec, folio, false) == min_seq[type])
+ set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS, BIT(PG_active));
}
spin_lock_irq(&lruvec->lru_lock);
@@ -4595,10 +4744,13 @@ retry:
reset_batch_size(walk);
}
- item = PGSTEAL_KSWAPD + reclaimer_offset();
+ mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc),
+ stat.nr_demoted);
+
+ item = PGSTEAL_KSWAPD + reclaimer_offset(sc);
if (!cgroup_reclaim(sc))
__count_vm_events(item, reclaimed);
- __count_memcg_events(memcg, item, reclaimed);
+ count_memcg_events(memcg, item, reclaimed);
__count_vm_events(PGSTEAL_ANON + type, reclaimed);
spin_unlock_irq(&lruvec->lru_lock);
@@ -4614,63 +4766,32 @@ retry:
}
static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq,
- bool can_swap, unsigned long *nr_to_scan)
+ int swappiness, unsigned long *nr_to_scan)
{
int gen, type, zone;
- unsigned long old = 0;
- unsigned long young = 0;
- unsigned long total = 0;
+ unsigned long size = 0;
struct lru_gen_folio *lrugen = &lruvec->lrugen;
DEFINE_MIN_SEQ(lruvec);
- /* whether this lruvec is completely out of cold folios */
- if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) {
- *nr_to_scan = 0;
+ *nr_to_scan = 0;
+ /* have to run aging, since eviction is not possible anymore */
+ if (evictable_min_seq(min_seq, swappiness) + MIN_NR_GENS > max_seq)
return true;
- }
- for (type = !can_swap; type < ANON_AND_FILE; type++) {
+ for_each_evictable_type(type, swappiness) {
unsigned long seq;
for (seq = min_seq[type]; seq <= max_seq; seq++) {
- unsigned long size = 0;
-
gen = lru_gen_from_seq(seq);
for (zone = 0; zone < MAX_NR_ZONES; zone++)
size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
-
- total += size;
- if (seq == max_seq)
- young += size;
- else if (seq + MIN_NR_GENS == max_seq)
- old += size;
}
}
- *nr_to_scan = total;
-
- /*
- * The aging tries to be lazy to reduce the overhead, while the eviction
- * stalls when the number of generations reaches MIN_NR_GENS. Hence, the
- * ideal number of generations is MIN_NR_GENS+1.
- */
- if (min_seq[!can_swap] + MIN_NR_GENS < max_seq)
- return false;
-
- /*
- * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1)
- * of the total number of pages for each generation. A reasonable range
- * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The
- * aging cares about the upper bound of hot pages, while the eviction
- * cares about the lower bound of cold pages.
- */
- if (young * MIN_NR_GENS > total)
- return true;
- if (old * (MIN_NR_GENS + 2) < total)
- return true;
-
- return false;
+ *nr_to_scan = size;
+ /* better to run aging even though eviction is still possible */
+ return evictable_min_seq(min_seq, swappiness) + MIN_NR_GENS == max_seq;
}
/*
@@ -4678,7 +4799,7 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq,
* 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg
* reclaim.
*/
-static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool can_swap)
+static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
{
bool success;
unsigned long nr_to_scan;
@@ -4688,18 +4809,20 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool
if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg))
return -1;
- success = should_run_aging(lruvec, max_seq, can_swap, &nr_to_scan);
+ success = should_run_aging(lruvec, max_seq, swappiness, &nr_to_scan);
/* try to scrape all its memory if this memcg was deleted */
if (nr_to_scan && !mem_cgroup_online(memcg))
return nr_to_scan;
+ nr_to_scan = apply_proportional_protection(memcg, sc, nr_to_scan);
+
/* try to get away with not aging at the default priority */
if (!success || sc->priority == DEF_PRIORITY)
return nr_to_scan >> sc->priority;
/* stop scanning this lruvec as it's low on cold folios */
- return try_to_inc_max_seq(lruvec, max_seq, can_swap, false) ? -1 : 0;
+ return try_to_inc_max_seq(lruvec, max_seq, swappiness, false) ? -1 : 0;
}
static bool should_abort_scan(struct lruvec *lruvec, struct scan_control *sc)
@@ -4746,7 +4869,7 @@ static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
if (nr_to_scan <= 0)
break;
- delta = evict_folios(lruvec, sc, swappiness);
+ delta = evict_folios(nr_to_scan, lruvec, sc, swappiness);
if (!delta)
break;
@@ -4760,6 +4883,13 @@ static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
cond_resched();
}
+ /*
+ * If too many file cache in the coldest generation can't be evicted
+ * due to being dirty, wake up the flusher.
+ */
+ if (sc->nr.unqueued_dirty && sc->nr.unqueued_dirty == sc->nr.file_taken)
+ wakeup_flusher_threads(WB_REASON_VMSCAN);
+
/* whether this lruvec should be rotated */
return nr_to_scan < 0;
}
@@ -4772,8 +4902,7 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
- mem_cgroup_calculate_protection(NULL, memcg);
-
+ /* lru_gen_age_node() called mem_cgroup_calculate_protection() */
if (mem_cgroup_below_min(NULL, memcg))
return MEMCG_LRU_YOUNG;
@@ -4897,28 +5026,6 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
blk_finish_plug(&plug);
}
-static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc)
-{
- int priority;
- unsigned long reclaimable;
-
- if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH)
- return;
- /*
- * Determine the initial priority based on
- * (total >> priority) * reclaimed_to_scanned_ratio = nr_to_reclaim,
- * where reclaimed_to_scanned_ratio = inactive / total.
- */
- reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE);
- if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc))
- reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON);
-
- /* round down reclaimable and round up sc->nr_to_reclaim */
- priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1);
-
- sc->priority = clamp(priority, 0, DEF_PRIORITY);
-}
-
static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
{
struct blk_plug plug;
@@ -4957,8 +5064,8 @@ static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *
blk_finish_plug(&plug);
done:
- /* kswapd should never fail */
- pgdat->kswapd_failures = 0;
+ if (sc->nr_reclaimed > reclaimed)
+ atomic_set(&pgdat->kswapd_failures, 0);
}
/******************************************************************************
@@ -5248,19 +5355,18 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
for (tier = 0; tier < MAX_NR_TIERS; tier++) {
seq_printf(m, " %10d", tier);
for (type = 0; type < ANON_AND_FILE; type++) {
- const char *s = " ";
+ const char *s = "xxx";
unsigned long n[3] = {};
if (seq == max_seq) {
- s = "RT ";
+ s = "RTx";
n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]);
n[1] = READ_ONCE(lrugen->avg_total[type][tier]);
} else if (seq == min_seq[type] || NR_HIST_GENS > 1) {
s = "rep";
n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]);
n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]);
- if (tier)
- n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]);
+ n[2] = READ_ONCE(lrugen->protected[hist][type][tier]);
}
for (i = 0; i < 3; i++)
@@ -5274,14 +5380,14 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
seq_puts(m, " ");
for (i = 0; i < NR_MM_STATS; i++) {
- const char *s = " ";
+ const char *s = "xxxx";
unsigned long n = 0;
if (seq == max_seq && NR_HIST_GENS == 1) {
- s = "LOYNFA";
+ s = "TYFA";
n = READ_ONCE(mm_state->stats[hist][i]);
} else if (seq != max_seq && NR_HIST_GENS > 1) {
- s = "loynfa";
+ s = "tyfa";
n = READ_ONCE(mm_state->stats[hist][i]);
}
@@ -5294,7 +5400,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
static int lru_gen_seq_show(struct seq_file *m, void *v)
{
unsigned long seq;
- bool full = !debugfs_real_fops(m->file)->write;
+ bool full = debugfs_get_aux_num(m->file);
struct lruvec *lruvec = v;
struct lru_gen_folio *lrugen = &lruvec->lrugen;
int nid = lruvec_pgdat(lruvec)->node_id;
@@ -5315,7 +5421,7 @@ static int lru_gen_seq_show(struct seq_file *m, void *v)
seq_printf(m, " node %5d\n", nid);
if (!full)
- seq = min_seq[LRU_GEN_ANON];
+ seq = evictable_min_seq(min_seq, MAX_SWAPPINESS / 2);
else if (max_seq >= MAX_NR_GENS)
seq = max_seq - MAX_NR_GENS + 1;
else
@@ -5355,23 +5461,14 @@ static const struct seq_operations lru_gen_seq_ops = {
};
static int run_aging(struct lruvec *lruvec, unsigned long seq,
- bool can_swap, bool force_scan)
+ int swappiness, bool force_scan)
{
DEFINE_MAX_SEQ(lruvec);
- DEFINE_MIN_SEQ(lruvec);
-
- if (seq < max_seq)
- return 0;
if (seq > max_seq)
return -EINVAL;
- if (!force_scan && min_seq[!can_swap] + MAX_NR_GENS - 1 <= max_seq)
- return -ERANGE;
-
- try_to_inc_max_seq(lruvec, max_seq, can_swap, force_scan);
-
- return 0;
+ return try_to_inc_max_seq(lruvec, max_seq, swappiness, force_scan) ? 0 : -EEXIST;
}
static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc,
@@ -5387,13 +5484,14 @@ static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_co
while (!signal_pending(current)) {
DEFINE_MIN_SEQ(lruvec);
- if (seq < min_seq[!swappiness])
+ if (seq < evictable_min_seq(min_seq, swappiness))
return 0;
if (sc->nr_reclaimed >= nr_to_reclaim)
return 0;
- if (!evict_folios(lruvec, sc, swappiness))
+ if (!evict_folios(nr_to_reclaim - sc->nr_reclaimed, lruvec, sc,
+ swappiness))
return 0;
cond_resched();
@@ -5428,11 +5526,12 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq,
if (memcg_id != mem_cgroup_id(memcg))
goto done;
+ sc->target_mem_cgroup = memcg;
lruvec = get_lruvec(memcg, nid);
- if (swappiness < 0)
+ if (swappiness < MIN_SWAPPINESS)
swappiness = get_swappiness(lruvec, sc);
- else if (swappiness > 200)
+ else if (swappiness > SWAPPINESS_ANON_ONLY)
goto done;
switch (cmd) {
@@ -5464,6 +5563,7 @@ static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
.may_swap = true,
.reclaim_idx = MAX_NR_ZONES - 1,
.gfp_mask = GFP_KERNEL,
+ .proactive = true,
};
buf = kvmalloc(len + 1, GFP_KERNEL);
@@ -5489,24 +5589,35 @@ static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
while ((cur = strsep(&next, ",;\n"))) {
int n;
int end;
- char cmd;
+ char cmd, swap_string[5];
unsigned int memcg_id;
unsigned int nid;
unsigned long seq;
- unsigned int swappiness = -1;
+ unsigned int swappiness;
unsigned long opt = -1;
cur = skip_spaces(cur);
if (!*cur)
continue;
- n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid,
- &seq, &end, &swappiness, &end, &opt, &end);
+ n = sscanf(cur, "%c %u %u %lu %n %4s %n %lu %n", &cmd, &memcg_id, &nid,
+ &seq, &end, swap_string, &end, &opt, &end);
if (n < 4 || cur[end]) {
err = -EINVAL;
break;
}
+ if (n == 4) {
+ swappiness = -1;
+ } else if (!strcmp("max", swap_string)) {
+ /* set by userspace for anonymous memory only */
+ swappiness = SWAPPINESS_ANON_ONLY;
+ } else {
+ err = kstrtouint(swap_string, 0, &swappiness);
+ if (err)
+ break;
+ }
+
err = run_cmd(cmd, memcg_id, nid, seq, &sc, swappiness, opt);
if (err)
break;
@@ -5628,8 +5739,10 @@ static int __init init_lru_gen(void)
if (sysfs_create_group(mm_kobj, &lru_gen_attr_group))
pr_err("lru_gen: failed to create sysfs group\n");
- debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops);
- debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops);
+ debugfs_create_file_aux_num("lru_gen", 0644, NULL, NULL, false,
+ &lru_gen_rw_fops);
+ debugfs_create_file_aux_num("lru_gen_full", 0444, NULL, NULL, true,
+ &lru_gen_ro_fops);
return 0;
};
@@ -5766,7 +5879,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
* Even if we did not try to evict anon pages at all, we want to
* rebalance the anon lru active/inactive ratio.
*/
- if (can_age_anon_pages(lruvec_pgdat(lruvec), sc) &&
+ if (can_age_anon_pages(lruvec, sc) &&
inactive_is_low(lruvec, LRU_INACTIVE_ANON))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
@@ -5797,6 +5910,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
unsigned long pages_for_compaction;
unsigned long inactive_lru_pages;
int z;
+ struct zone *zone;
/* If not in reclaim/compaction mode, stop */
if (!in_reclaim_compaction(sc))
@@ -5816,17 +5930,16 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
return false;
/* If compaction would go ahead or the allocation would succeed, stop */
- for (z = 0; z <= sc->reclaim_idx; z++) {
- struct zone *zone = &pgdat->node_zones[z];
- if (!managed_zone(zone))
- continue;
+ for_each_managed_zone_pgdat(zone, pgdat, z, sc->reclaim_idx) {
+ unsigned long watermark = min_wmark_pages(zone);
/* Allocation can already succeed, nothing to do */
- if (zone_watermark_ok(zone, sc->order, min_wmark_pages(zone),
+ if (zone_watermark_ok(zone, sc->order, watermark,
sc->reclaim_idx, 0))
return false;
- if (compaction_suitable(zone, sc->order, sc->reclaim_idx))
+ if (compaction_suitable(zone, sc->order, watermark,
+ sc->reclaim_idx))
return false;
}
@@ -5845,9 +5958,25 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
{
struct mem_cgroup *target_memcg = sc->target_mem_cgroup;
+ struct mem_cgroup_reclaim_cookie reclaim = {
+ .pgdat = pgdat,
+ };
+ struct mem_cgroup_reclaim_cookie *partial = &reclaim;
struct mem_cgroup *memcg;
- memcg = mem_cgroup_iter(target_memcg, NULL, NULL);
+ /*
+ * In most cases, direct reclaimers can do partial walks
+ * through the cgroup tree, using an iterator state that
+ * persists across invocations. This strikes a balance between
+ * fairness and allocation latency.
+ *
+ * For kswapd, reliable forward progress is more important
+ * than a quick return to idle. Always do full walks.
+ */
+ if (current_is_kswapd() || sc->memcg_full_walk)
+ partial = NULL;
+
+ memcg = mem_cgroup_iter(target_memcg, NULL, partial);
do {
struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
unsigned long reclaimed;
@@ -5897,7 +6026,12 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
sc->nr_scanned - scanned,
sc->nr_reclaimed - reclaimed);
- } while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL)));
+ /* If partial walks are allowed, bail once goal is reached */
+ if (partial && sc->nr_reclaimed >= sc->nr_to_reclaim) {
+ mem_cgroup_iter_break(target_memcg, memcg);
+ break;
+ }
+ } while ((memcg = mem_cgroup_iter(target_memcg, memcg, partial)));
}
static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
@@ -5907,6 +6041,7 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
bool reclaimable = false;
if (lru_gen_enabled() && root_reclaim(sc)) {
+ memset(&sc->nr, 0, sizeof(sc->nr));
lru_gen_shrink_node(pgdat, sc);
return;
}
@@ -5956,10 +6091,6 @@ again:
if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
set_bit(PGDAT_WRITEBACK, &pgdat->flags);
- /* Allow kswapd to start writing pages during reclaim.*/
- if (sc->nr.unqueued_dirty == sc->nr.file_taken)
- set_bit(PGDAT_DIRTY, &pgdat->flags);
-
/*
* If kswapd scans pages marked for immediate
* reclaim and under writeback (nr_immediate), it
@@ -6008,7 +6139,7 @@ again:
* successful direct reclaim run will revive a dormant kswapd.
*/
if (reclaimable)
- pgdat->kswapd_failures = 0;
+ atomic_set(&pgdat->kswapd_failures, 0);
else if (sc->cache_trim_mode)
sc->cache_trim_mode_failed = 1;
}
@@ -6030,22 +6161,21 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
sc->reclaim_idx, 0))
return true;
- /* Compaction cannot yet proceed. Do reclaim. */
- if (!compaction_suitable(zone, sc->order, sc->reclaim_idx))
- return false;
-
/*
- * Compaction is already possible, but it takes time to run and there
- * are potentially other callers using the pages just freed. So proceed
- * with reclaim to make a buffer of free pages available to give
- * compaction a reasonable chance of completing and allocating the page.
+ * Direct reclaim usually targets the min watermark, but compaction
+ * takes time to run and there are potentially other callers using the
+ * pages just freed. So target a higher buffer to give compaction a
+ * reasonable chance of completing and allocating the pages.
+ *
* Note that we won't actually reclaim the whole buffer in one attempt
* as the target watermark in should_continue_reclaim() is lower. But if
* we are already above the high+gap watermark, don't reclaim at all.
*/
- watermark = high_wmark_pages(zone) + compact_gap(sc->order);
+ watermark = high_wmark_pages(zone);
+ if (compaction_suitable(zone, sc->order, watermark, sc->reclaim_idx))
+ return true;
- return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);
+ return false;
}
static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc)
@@ -6150,9 +6280,9 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
* and balancing, not for a memcg's limit.
*/
nr_soft_scanned = 0;
- nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone->zone_pgdat,
- sc->order, sc->gfp_mask,
- &nr_soft_scanned);
+ nr_soft_reclaimed = memcg1_soft_limit_reclaim(zone->zone_pgdat,
+ sc->order, sc->gfp_mask,
+ &nr_soft_scanned);
sc->nr_reclaimed += nr_soft_reclaimed;
sc->nr_scanned += nr_soft_scanned;
/* need some check for avoid more shrink_zone() */
@@ -6271,6 +6401,21 @@ retry:
return 1;
/*
+ * In most cases, direct reclaimers can do partial walks
+ * through the cgroup tree to meet the reclaim goal while
+ * keeping latency low. Since the iterator state is shared
+ * among all direct reclaim invocations (to retain fairness
+ * among cgroups), though, high concurrency can result in
+ * individual threads not seeing enough cgroups to make
+ * meaningful forward progress. Avoid false OOMs in this case.
+ */
+ if (!sc->memcg_full_walk) {
+ sc->priority = initial_priority;
+ sc->memcg_full_walk = 1;
+ goto retry;
+ }
+
+ /*
* We make inactive:active ratio decisions based on the node's
* composition of memory, but a restrictive reclaim_idx or a
* memory.low cgroup setting can exempt large amounts of
@@ -6306,15 +6451,11 @@ static bool allow_direct_reclaim(pg_data_t *pgdat)
int i;
bool wmark_ok;
- if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
+ if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES)
return true;
- for (i = 0; i <= ZONE_NORMAL; i++) {
- zone = &pgdat->node_zones[i];
- if (!managed_zone(zone))
- continue;
-
- if (!zone_reclaimable_pages(zone))
+ for_each_managed_zone_pgdat(zone, pgdat, i, ZONE_NORMAL) {
+ if (!zone_reclaimable_pages(zone) && zone_page_state_snapshot(zone, NR_FREE_PAGES))
continue;
pfmemalloc_reserve += min_wmark_pages(zone);
@@ -6515,12 +6656,14 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
unsigned long nr_pages,
gfp_t gfp_mask,
- unsigned int reclaim_options)
+ unsigned int reclaim_options,
+ int *swappiness)
{
unsigned long nr_reclaimed;
unsigned int noreclaim_flag;
struct scan_control sc = {
.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
+ .proactive_swappiness = swappiness,
.gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
.reclaim_idx = MAX_NR_ZONES - 1,
@@ -6550,6 +6693,15 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
return nr_reclaimed;
}
+#else
+unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
+ unsigned long nr_pages,
+ gfp_t gfp_mask,
+ unsigned int reclaim_options,
+ int *swappiness)
+{
+ return 0;
+}
#endif
static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc)
@@ -6562,10 +6714,10 @@ static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc)
return;
}
- if (!can_age_anon_pages(pgdat, sc))
+ lruvec = mem_cgroup_lruvec(NULL, pgdat);
+ if (!can_age_anon_pages(lruvec, sc))
return;
- lruvec = mem_cgroup_lruvec(NULL, pgdat);
if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON))
return;
@@ -6616,17 +6768,48 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
* Check watermarks bottom-up as lower zones are more likely to
* meet watermarks.
*/
- for (i = 0; i <= highest_zoneidx; i++) {
- zone = pgdat->node_zones + i;
-
- if (!managed_zone(zone))
- continue;
+ for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) {
+ enum zone_stat_item item;
+ unsigned long free_pages;
if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)
- mark = wmark_pages(zone, WMARK_PROMO);
+ mark = promo_wmark_pages(zone);
else
mark = high_wmark_pages(zone);
- if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx))
+
+ /*
+ * In defrag_mode, watermarks must be met in whole
+ * blocks to avoid polluting allocator fallbacks.
+ *
+ * However, kswapd usually cannot accomplish this on
+ * its own and needs kcompactd support. Once it's
+ * reclaimed a compaction gap, and kswapd_shrink_node
+ * has dropped order, simply ensure there are enough
+ * base pages for compaction, wake kcompactd & sleep.
+ */
+ if (defrag_mode && order)
+ item = NR_FREE_PAGES_BLOCKS;
+ else
+ item = NR_FREE_PAGES;
+
+ /*
+ * When there is a high number of CPUs in the system,
+ * the cumulative error from the vmstat per-cpu cache
+ * can blur the line between the watermarks. In that
+ * case, be safe and get an accurate snapshot.
+ *
+ * TODO: NR_FREE_PAGES_BLOCKS moves in steps of
+ * pageblock_nr_pages, while the vmstat pcp threshold
+ * is limited to 125. On many configurations that
+ * counter won't actually be per-cpu cached. But keep
+ * things simple for now; revisit when somebody cares.
+ */
+ free_pages = zone_page_state(zone, item);
+ if (zone->percpu_drift_mark && free_pages < zone->percpu_drift_mark)
+ free_pages = zone_page_state_snapshot(zone, item);
+
+ if (__zone_watermark_ok(zone, order, mark, highest_zoneidx,
+ 0, free_pages))
return true;
}
@@ -6648,7 +6831,6 @@ static void clear_pgdat_congested(pg_data_t *pgdat)
clear_bit(LRUVEC_NODE_CONGESTED, &lruvec->flags);
clear_bit(LRUVEC_CGROUP_CONGESTED, &lruvec->flags);
- clear_bit(PGDAT_DIRTY, &pgdat->flags);
clear_bit(PGDAT_WRITEBACK, &pgdat->flags);
}
@@ -6678,7 +6860,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order,
wake_up_all(&pgdat->pfmemalloc_wait);
/* Hopeless node, leave it to direct reclaim */
- if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
+ if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES)
return true;
if (pgdat_balanced(pgdat, order, highest_zoneidx)) {
@@ -6702,14 +6884,11 @@ static bool kswapd_shrink_node(pg_data_t *pgdat,
{
struct zone *zone;
int z;
+ unsigned long nr_reclaimed = sc->nr_reclaimed;
/* Reclaim a number of pages proportional to the number of zones */
sc->nr_to_reclaim = 0;
- for (z = 0; z <= sc->reclaim_idx; z++) {
- zone = pgdat->node_zones + z;
- if (!managed_zone(zone))
- continue;
-
+ for_each_managed_zone_pgdat(zone, pgdat, z, sc->reclaim_idx) {
sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX);
}
@@ -6729,7 +6908,8 @@ static bool kswapd_shrink_node(pg_data_t *pgdat,
if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order))
sc->order = 0;
- return sc->nr_scanned >= sc->nr_to_reclaim;
+ /* account for progress from mm_account_reclaimed_pages() */
+ return max(sc->nr_scanned, sc->nr_reclaimed - nr_reclaimed) >= sc->nr_to_reclaim;
}
/* Page allocator PCP high watermark is lowered if reclaim is active. */
@@ -6739,12 +6919,7 @@ update_reclaim_active(pg_data_t *pgdat, int highest_zoneidx, bool active)
int i;
struct zone *zone;
- for (i = 0; i <= highest_zoneidx; i++) {
- zone = pgdat->node_zones + i;
-
- if (!managed_zone(zone))
- continue;
-
+ for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) {
if (active)
set_bit(ZONE_RECLAIM_ACTIVE, &zone->flags);
else
@@ -6805,11 +6980,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
* stall or direct reclaim until kswapd is finished.
*/
nr_boost_reclaim = 0;
- for (i = 0; i <= highest_zoneidx; i++) {
- zone = pgdat->node_zones + i;
- if (!managed_zone(zone))
- continue;
-
+ for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) {
nr_boost_reclaim += zone->watermark_boost;
zone_boosts[i] = zone->watermark_boost;
}
@@ -6899,8 +7070,8 @@ restart:
/* Call soft limit reclaim before calling shrink_node. */
sc.nr_scanned = 0;
nr_soft_scanned = 0;
- nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order,
- sc.gfp_mask, &nr_soft_scanned);
+ nr_soft_reclaimed = memcg1_soft_limit_reclaim(pgdat, sc.order,
+ sc.gfp_mask, &nr_soft_scanned);
sc.nr_reclaimed += nr_soft_reclaimed;
/*
@@ -6956,8 +7127,13 @@ restart:
goto restart;
}
- if (!sc.nr_reclaimed)
- pgdat->kswapd_failures++;
+ /*
+ * If the reclaim was boosted, we might still be far from the
+ * watermark_high at this point. We need to avoid increasing the
+ * failure count to prevent the kswapd thread from stopping.
+ */
+ if (!sc.nr_reclaimed && !boosted)
+ atomic_inc(&pgdat->kswapd_failures);
out:
clear_reclaim_active(pgdat, highest_zoneidx);
@@ -7116,10 +7292,6 @@ static int kswapd(void *p)
unsigned int highest_zoneidx = MAX_NR_ZONES - 1;
pg_data_t *pgdat = (pg_data_t *)p;
struct task_struct *tsk = current;
- const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
-
- if (!cpumask_empty(cpumask))
- set_cpus_allowed_ptr(tsk, cpumask);
/*
* Tell the memory management that we're a "memory allocator",
@@ -7220,7 +7392,7 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
return;
/* Hopeless node, leave it to direct reclaim if possible */
- if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
+ if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES ||
(pgdat_balanced(pgdat, order, highest_zoneidx) &&
!pgdat_watermark_boosted(pgdat, highest_zoneidx))) {
/*
@@ -7288,13 +7460,15 @@ void __meminit kswapd_run(int nid)
pgdat_kswapd_lock(pgdat);
if (!pgdat->kswapd) {
- pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
+ pgdat->kswapd = kthread_create_on_node(kswapd, pgdat, nid, "kswapd%d", nid);
if (IS_ERR(pgdat->kswapd)) {
/* failure at boot is fatal */
pr_err("Failed to start kswapd on node %d,ret=%ld\n",
nid, PTR_ERR(pgdat->kswapd));
BUG_ON(system_state < SYSTEM_RUNNING);
pgdat->kswapd = NULL;
+ } else {
+ wake_up_process(pgdat->kswapd);
}
}
pgdat_kswapd_unlock(pgdat);
@@ -7318,6 +7492,28 @@ void __meminit kswapd_stop(int nid)
pgdat_kswapd_unlock(pgdat);
}
+static const struct ctl_table vmscan_sysctl_table[] = {
+ {
+ .procname = "swappiness",
+ .data = &vm_swappiness,
+ .maxlen = sizeof(vm_swappiness),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_TWO_HUNDRED,
+ },
+#ifdef CONFIG_NUMA
+ {
+ .procname = "zone_reclaim_mode",
+ .data = &node_reclaim_mode,
+ .maxlen = sizeof(node_reclaim_mode),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ }
+#endif
+};
+
static int __init kswapd_init(void)
{
int nid;
@@ -7325,6 +7521,7 @@ static int __init kswapd_init(void)
swap_setup();
for_each_node_state(nid, N_MEMORY)
kswapd_run(nid);
+ register_sysctl_init("vm", vmscan_sysctl_table);
return 0;
}
@@ -7389,9 +7586,11 @@ static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat)
else
nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat);
- /* If we can't clean pages, remove dirty pages from consideration */
- if (!(node_reclaim_mode & RECLAIM_WRITE))
- delta += node_page_state(pgdat, NR_FILE_DIRTY);
+ /*
+ * Since we can't clean folios through reclaim, remove dirty file
+ * folios from consideration.
+ */
+ delta += node_page_state(pgdat, NR_FILE_DIRTY);
/* Watch for any possible underflows due to delta */
if (unlikely(delta > nr_pagecache_reclaimable))
@@ -7403,36 +7602,26 @@ static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat)
/*
* Try to free up some pages from this node through reclaim.
*/
-static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
+static unsigned long __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask,
+ unsigned long nr_pages,
+ struct scan_control *sc)
{
- /* Minimum pages needed in order to stay on node */
- const unsigned long nr_pages = 1 << order;
struct task_struct *p = current;
unsigned int noreclaim_flag;
- struct scan_control sc = {
- .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
- .gfp_mask = current_gfp_context(gfp_mask),
- .order = order,
- .priority = NODE_RECLAIM_PRIORITY,
- .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
- .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
- .may_swap = 1,
- .reclaim_idx = gfp_zone(gfp_mask),
- };
unsigned long pflags;
- trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order,
- sc.gfp_mask);
+ trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, sc->order,
+ sc->gfp_mask);
cond_resched();
psi_memstall_enter(&pflags);
delayacct_freepages_start();
- fs_reclaim_acquire(sc.gfp_mask);
+ fs_reclaim_acquire(sc->gfp_mask);
/*
* We need to be able to allocate from the reserves for RECLAIM_UNMAP
*/
noreclaim_flag = memalloc_noreclaim_save();
- set_task_reclaim_state(p, &sc.reclaim_state);
+ set_task_reclaim_state(p, &sc->reclaim_state);
if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages ||
node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) > pgdat->min_slab_pages) {
@@ -7441,24 +7630,36 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
* priorities until we have enough memory freed.
*/
do {
- shrink_node(pgdat, &sc);
- } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
+ shrink_node(pgdat, sc);
+ } while (sc->nr_reclaimed < nr_pages && --sc->priority >= 0);
}
set_task_reclaim_state(p, NULL);
memalloc_noreclaim_restore(noreclaim_flag);
- fs_reclaim_release(sc.gfp_mask);
- psi_memstall_leave(&pflags);
+ fs_reclaim_release(sc->gfp_mask);
delayacct_freepages_end();
+ psi_memstall_leave(&pflags);
- trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed);
+ trace_mm_vmscan_node_reclaim_end(sc->nr_reclaimed);
- return sc.nr_reclaimed >= nr_pages;
+ return sc->nr_reclaimed;
}
int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
{
int ret;
+ /* Minimum pages needed in order to stay on node */
+ const unsigned long nr_pages = 1 << order;
+ struct scan_control sc = {
+ .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
+ .gfp_mask = current_gfp_context(gfp_mask),
+ .order = order,
+ .priority = NODE_RECLAIM_PRIORITY,
+ .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
+ .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
+ .may_swap = 1,
+ .reclaim_idx = gfp_zone(gfp_mask),
+ };
/*
* Node reclaim reclaims unmapped file backed pages and
@@ -7490,17 +7691,127 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id())
return NODE_RECLAIM_NOSCAN;
- if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
+ if (test_and_set_bit_lock(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
return NODE_RECLAIM_NOSCAN;
- ret = __node_reclaim(pgdat, gfp_mask, order);
- clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
+ ret = __node_reclaim(pgdat, gfp_mask, nr_pages, &sc) >= nr_pages;
+ clear_bit_unlock(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
- if (!ret)
+ if (ret)
+ count_vm_event(PGSCAN_ZONE_RECLAIM_SUCCESS);
+ else
count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
return ret;
}
+
+enum {
+ MEMORY_RECLAIM_SWAPPINESS = 0,
+ MEMORY_RECLAIM_SWAPPINESS_MAX,
+ MEMORY_RECLAIM_NULL,
+};
+static const match_table_t tokens = {
+ { MEMORY_RECLAIM_SWAPPINESS, "swappiness=%d"},
+ { MEMORY_RECLAIM_SWAPPINESS_MAX, "swappiness=max"},
+ { MEMORY_RECLAIM_NULL, NULL },
+};
+
+int user_proactive_reclaim(char *buf,
+ struct mem_cgroup *memcg, pg_data_t *pgdat)
+{
+ unsigned int nr_retries = MAX_RECLAIM_RETRIES;
+ unsigned long nr_to_reclaim, nr_reclaimed = 0;
+ int swappiness = -1;
+ char *old_buf, *start;
+ substring_t args[MAX_OPT_ARGS];
+ gfp_t gfp_mask = GFP_KERNEL;
+
+ if (!buf || (!memcg && !pgdat) || (memcg && pgdat))
+ return -EINVAL;
+
+ buf = strstrip(buf);
+
+ old_buf = buf;
+ nr_to_reclaim = memparse(buf, &buf) / PAGE_SIZE;
+ if (buf == old_buf)
+ return -EINVAL;
+
+ buf = strstrip(buf);
+
+ while ((start = strsep(&buf, " ")) != NULL) {
+ if (!strlen(start))
+ continue;
+ switch (match_token(start, tokens, args)) {
+ case MEMORY_RECLAIM_SWAPPINESS:
+ if (match_int(&args[0], &swappiness))
+ return -EINVAL;
+ if (swappiness < MIN_SWAPPINESS ||
+ swappiness > MAX_SWAPPINESS)
+ return -EINVAL;
+ break;
+ case MEMORY_RECLAIM_SWAPPINESS_MAX:
+ swappiness = SWAPPINESS_ANON_ONLY;
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+
+ while (nr_reclaimed < nr_to_reclaim) {
+ /* Will converge on zero, but reclaim enforces a minimum */
+ unsigned long batch_size = (nr_to_reclaim - nr_reclaimed) / 4;
+ unsigned long reclaimed;
+
+ if (signal_pending(current))
+ return -EINTR;
+
+ /*
+ * This is the final attempt, drain percpu lru caches in the
+ * hope of introducing more evictable pages.
+ */
+ if (!nr_retries)
+ lru_add_drain_all();
+
+ if (memcg) {
+ unsigned int reclaim_options;
+
+ reclaim_options = MEMCG_RECLAIM_MAY_SWAP |
+ MEMCG_RECLAIM_PROACTIVE;
+ reclaimed = try_to_free_mem_cgroup_pages(memcg,
+ batch_size, gfp_mask,
+ reclaim_options,
+ swappiness == -1 ? NULL : &swappiness);
+ } else {
+ struct scan_control sc = {
+ .gfp_mask = current_gfp_context(gfp_mask),
+ .reclaim_idx = gfp_zone(gfp_mask),
+ .proactive_swappiness = swappiness == -1 ? NULL : &swappiness,
+ .priority = DEF_PRIORITY,
+ .may_writepage = !laptop_mode,
+ .nr_to_reclaim = max(batch_size, SWAP_CLUSTER_MAX),
+ .may_unmap = 1,
+ .may_swap = 1,
+ .proactive = 1,
+ };
+
+ if (test_and_set_bit_lock(PGDAT_RECLAIM_LOCKED,
+ &pgdat->flags))
+ return -EBUSY;
+
+ reclaimed = __node_reclaim(pgdat, gfp_mask,
+ batch_size, &sc);
+ clear_bit_unlock(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
+ }
+
+ if (!reclaimed && !nr_retries--)
+ return -EAGAIN;
+
+ nr_reclaimed += reclaimed;
+ }
+
+ return 0;
+}
+
#endif
/**
@@ -7548,3 +7859,26 @@ void check_move_unevictable_folios(struct folio_batch *fbatch)
}
}
EXPORT_SYMBOL_GPL(check_move_unevictable_folios);
+
+#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
+static ssize_t reclaim_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ int ret, nid = dev->id;
+
+ ret = user_proactive_reclaim((char *)buf, NULL, NODE_DATA(nid));
+ return ret ? -EAGAIN : count;
+}
+
+static DEVICE_ATTR_WO(reclaim);
+int reclaim_register_node(struct node *node)
+{
+ return device_create_file(&node->dev, &dev_attr_reclaim);
+}
+
+void reclaim_unregister_node(struct node *node)
+{
+ return device_remove_file(&node->dev, &dev_attr_reclaim);
+}
+#endif