author	Linus Torvalds <torvalds@linux-foundation.org>	2021-11-06 14:08:17 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2021-11-06 14:08:17 -0700
commit	512b7931ad0561ffe14265f9ff554a3c081b476b (patch)
tree	a94450d08468e094d2d92a495de4650faab09c1f /mm/vmscan.c
parent	fe91c4725aeed35023ba4f7a1e1adfebb6878c23 (diff)
parent	658f9ae761b5965893727dd4edcdad56e5a439bb (diff)
Merge branch 'akpm' (patches from Andrew)
Merge misc updates from Andrew Morton:
 "257 patches.

  Subsystems affected by this patch series: scripts, ocfs2, vfs, and mm
  (slab-generic, slab, slub, kconfig, dax, kasan, debug, pagecache, gup,
  swap, memcg, pagemap, mprotect, mremap, iomap, tracing, vmalloc,
  pagealloc, memory-failure, hugetlb, userfaultfd, vmscan, tools,
  memblock, oom-kill, hugetlbfs, migration, thp, readahead, nommu, ksm,
  vmstat, madvise, memory-hotplug, rmap, zsmalloc, highmem, zram,
  cleanups, kfence, and damon)"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (257 commits)
  mm/damon: remove return value from before_terminate callback
  mm/damon: fix a few spelling mistakes in comments and a pr_debug message
  mm/damon: simplify stop mechanism
  Docs/admin-guide/mm/pagemap: wordsmith page flags descriptions
  Docs/admin-guide/mm/damon/start: simplify the content
  Docs/admin-guide/mm/damon/start: fix a wrong link
  Docs/admin-guide/mm/damon/start: fix wrong example commands
  mm/damon/dbgfs: add adaptive_targets list check before enable monitor_on
  mm/damon: remove unnecessary variable initialization
  Documentation/admin-guide/mm/damon: add a document for DAMON_RECLAIM
  mm/damon: introduce DAMON-based Reclamation (DAMON_RECLAIM)
  selftests/damon: support watermarks
  mm/damon/dbgfs: support watermarks
  mm/damon/schemes: activate schemes based on a watermarks mechanism
  tools/selftests/damon: update for regions prioritization of schemes
  mm/damon/dbgfs: support prioritization weights
  mm/damon/vaddr,paddr: support pageout prioritization
  mm/damon/schemes: prioritize regions within the quotas
  mm/damon/selftests: support schemes quotas
  mm/damon/dbgfs: support quotas of schemes
  ...
Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--	mm/vmscan.c	154
1 file changed, 138 insertions(+), 16 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 71f178f85f5b..ef4a6dc7f000 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1021,6 +1021,91 @@ static void handle_write_error(struct address_space *mapping,
unlock_page(page);
}
+void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason)
+{
+ wait_queue_head_t *wqh = &pgdat->reclaim_wait[reason];
+ long timeout, ret;
+ DEFINE_WAIT(wait);
+
+ /*
+ * Do not throttle IO workers, kthreads other than kswapd or
+ * workqueues. They may be required for reclaim to make
+ * forward progress (e.g. journalling workqueues or kthreads).
+ */
+ if (!current_is_kswapd() &&
+ current->flags & (PF_IO_WORKER|PF_KTHREAD))
+ return;
+
+ /*
+ * These figures are pulled out of thin air.
+ * VMSCAN_THROTTLE_ISOLATED is a transient condition based on too many
+ * parallel reclaimers which is a short-lived event so the timeout is
+ * short. Failing to make progress or waiting on writeback are
+ * potentially long-lived events so use a longer timeout. This is shaky
+ * logic as a failure to make progress could be due to anything from
+ * writeback to a slow device to excessively referenced pages at the tail
+ * of the inactive LRU.
+ */
+ switch (reason) {
+ case VMSCAN_THROTTLE_WRITEBACK:
+ timeout = HZ/10;
+
+ if (atomic_inc_return(&pgdat->nr_writeback_throttled) == 1) {
+ WRITE_ONCE(pgdat->nr_reclaim_start,
+ node_page_state(pgdat, NR_THROTTLED_WRITTEN));
+ }
+
+ break;
+ case VMSCAN_THROTTLE_NOPROGRESS:
+ timeout = HZ/2;
+ break;
+ case VMSCAN_THROTTLE_ISOLATED:
+ timeout = HZ/50;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ timeout = HZ;
+ break;
+ }
+
+ prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
+ ret = schedule_timeout(timeout);
+ finish_wait(wqh, &wait);
+
+ if (reason == VMSCAN_THROTTLE_WRITEBACK)
+ atomic_dec(&pgdat->nr_writeback_throttled);
+
+ trace_mm_vmscan_throttled(pgdat->node_id, jiffies_to_usecs(timeout),
+ jiffies_to_usecs(timeout - ret),
+ reason);
+}
+
+/*
+ * Account for pages written if tasks are throttled waiting on dirty
+ * pages to clean. If enough pages have been cleaned since throttling
+ * started then wake up the throttled tasks.
+ */
+void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio,
+ int nr_throttled)
+{
+ unsigned long nr_written;
+
+ node_stat_add_folio(folio, NR_THROTTLED_WRITTEN);
+
+ /*
+ * This is an inaccurate read as the per-cpu deltas may not
+ * be synchronised. However, given that the system is
+ * writeback throttled, it is not worth taking the penalty
+ * of getting an accurate count. At worst, the throttle
+ * timeout guarantees forward progress.
+ */
+ nr_written = node_page_state(pgdat, NR_THROTTLED_WRITTEN) -
+ READ_ONCE(pgdat->nr_reclaim_start);
+
+ if (nr_written > SWAP_CLUSTER_MAX * nr_throttled)
+ wake_up(&pgdat->reclaim_wait[VMSCAN_THROTTLE_WRITEBACK]);
+}
+
/* possible outcome of pageout() */
typedef enum {
/* failed to write page out, page is locked */
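The wakeup condition in __acct_reclaim_writeback() above is a simple proportionality test: with nr_throttled tasks sleeping on the WRITEBACK waitqueue, a wakeup is issued once more than SWAP_CLUSTER_MAX pages per throttled task have been cleaned since pgdat->nr_reclaim_start was sampled. A minimal userspace sketch of the same arithmetic (SWAP_CLUSTER_MAX hardcoded to the kernel's value of 32 from include/linux/swap.h; the program itself is only illustrative):

#include <stdbool.h>
#include <stdio.h>

#define SWAP_CLUSTER_MAX 32UL	/* kernel value, include/linux/swap.h */

/*
 * Mirrors the wakeup test above: nr_written is the number of pages
 * cleaned since throttling started, nr_throttled the number of tasks
 * sleeping on the WRITEBACK waitqueue.
 */
static bool should_wake(unsigned long nr_written, int nr_throttled)
{
	return nr_written > SWAP_CLUSTER_MAX * nr_throttled;
}

int main(void)
{
	/* Four throttled tasks: wake only after more than 128 clean pages. */
	printf("%d\n", should_wake(128, 4));	/* 0: keep sleeping */
	printf("%d\n", should_wake(129, 4));	/* 1: wake_up() the queue */
	return 0;
}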
@@ -1352,7 +1437,6 @@ static unsigned int demote_page_list(struct list_head *demote_pages,
{
int target_nid = next_demotion_node(pgdat->node_id);
unsigned int nr_succeeded;
- int err;
if (list_empty(demote_pages))
return 0;
@@ -1361,7 +1445,7 @@ static unsigned int demote_page_list(struct list_head *demote_pages,
return 0;
/* Demotion ignores all cpuset and mempolicy settings */
- err = migrate_pages(demote_pages, alloc_demote_page, NULL,
+ migrate_pages(demote_pages, alloc_demote_page, NULL,
target_nid, MIGRATE_ASYNC, MR_DEMOTION,
&nr_succeeded);
@@ -1427,9 +1511,8 @@ retry:
/*
* The number of dirty pages determines if a node is marked
- * reclaim_congested which affects wait_iff_congested. kswapd
- * will stall and start writing pages if the tail of the LRU
- * is all dirty unqueued pages.
+ * reclaim_congested. kswapd will stall and start writing
+ * pages if the tail of the LRU is all dirty unqueued pages.
*/
page_check_dirty_writeback(page, &dirty, &writeback);
if (dirty || writeback)
@@ -2135,6 +2218,7 @@ static int too_many_isolated(struct pglist_data *pgdat, int file,
struct scan_control *sc)
{
unsigned long inactive, isolated;
+ bool too_many;
if (current_is_kswapd())
return 0;
@@ -2158,7 +2242,13 @@ static int too_many_isolated(struct pglist_data *pgdat, int file,
if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
inactive >>= 3;
- return isolated > inactive;
+ too_many = isolated > inactive;
+
+ /* Wake up tasks throttled due to too_many_isolated. */
+ if (!too_many)
+ wake_throttle_isolated(pgdat);
+
+ return too_many;
}
/*
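The !too_many branch above wakes tasks throttled on VMSCAN_THROTTLE_ISOLATED as soon as isolation pressure clears. wake_throttle_isolated() itself is introduced elsewhere in this series (in mm/internal.h); a hedged sketch of what such a helper amounts to, reusing the waitqueue_active()-before-wake_up() pattern that consider_reclaim_throttle() uses later in this diff:

/*
 * Sketch only; the real helper lives in mm/internal.h in this series.
 * waitqueue_active() is a lockless check that skips the wake_up() (and
 * its waitqueue lock) when no task is currently throttled.
 */
static inline void wake_throttle_isolated(pg_data_t *pgdat)
{
	wait_queue_head_t *wqh;

	wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_ISOLATED];
	if (waitqueue_active(wqh))
		wake_up(wqh);
}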
@@ -2267,8 +2357,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
return 0;
/* wait a bit for the reclaimer. */
- msleep(100);
stalled = true;
+ reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED);
/* We are about to die and free our memory. Return now. */
if (fatal_signal_pending(current))
@@ -3196,19 +3286,19 @@ again:
* If kswapd scans pages marked for immediate
* reclaim and under writeback (nr_immediate), it
* implies that pages are cycling through the LRU
- * faster than they are written so also forcibly stall.
+ * faster than they are written so forcibly stall
+ * until some pages complete writeback.
*/
if (sc->nr.immediate)
- congestion_wait(BLK_RW_ASYNC, HZ/10);
+ reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK);
}
/*
- * Tag a node/memcg as congested if all the dirty pages
- * scanned were backed by a congested BDI and
- * wait_iff_congested will stall.
+ * Tag a node/memcg as congested if all the dirty pages were marked
+ * for writeback and immediate reclaim (counted in nr.congested).
*
* Legacy memcg will stall in page writeback so avoid forcibly
- * stalling in wait_iff_congested().
+ * stalling in reclaim_throttle().
*/
if ((current_is_kswapd() ||
(cgroup_reclaim(sc) && writeback_throttling_sane(sc))) &&
@@ -3216,15 +3306,15 @@ again:
set_bit(LRUVEC_CONGESTED, &target_lruvec->flags);
/*
- * Stall direct reclaim for IO completions if underlying BDIs
- * and node is congested. Allow kswapd to continue until it
+ * Stall direct reclaim for IO completions if the lruvec
+ * node is congested. Allow kswapd to continue until it
* starts encountering unqueued dirty pages or cycling through
* the LRU too quickly.
*/
if (!current_is_kswapd() && current_may_throttle() &&
!sc->hibernation_mode &&
test_bit(LRUVEC_CONGESTED, &target_lruvec->flags))
- wait_iff_congested(BLK_RW_ASYNC, HZ/10);
+ reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK);
if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
sc))
@@ -3272,6 +3362,36 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);
}
+static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc)
+{
+ /*
+ * If reclaim is making progress greater than 12.5% efficiency
+ * (more than one page reclaimed per eight scanned), wake all the
+ * NOPROGRESS throttled tasks.
+ */
+ if (sc->nr_reclaimed > (sc->nr_scanned >> 3)) {
+ wait_queue_head_t *wqh;
+
+ wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_NOPROGRESS];
+ if (waitqueue_active(wqh))
+ wake_up(wqh);
+
+ return;
+ }
+
+ /*
+ * Do not throttle kswapd on NOPROGRESS as it will throttle on
+ * VMSCAN_THROTTLE_WRITEBACK if there are too many pages under
+ * writeback and marked for immediate reclaim at the tail of
+ * the LRU.
+ */
+ if (current_is_kswapd())
+ return;
+
+ /* Throttle if making no progress at high priorities. */
+ if (sc->priority < DEF_PRIORITY - 2)
+ reclaim_throttle(pgdat, VMSCAN_THROTTLE_NOPROGRESS);
+}
+
/*
* This is the direct reclaim path, for page-allocating processes. We only
* try to reclaim pages from zones which will satisfy the caller's allocation
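The "12.5% efficiency" figure above corresponds exactly to the shift in the test: reclaim counts as productive when more than one of every eight scanned pages was actually reclaimed. A standalone restatement (parameter names borrowed from the scan_control fields; illustrative only):

#include <stdbool.h>

/*
 * Restates the progress check in consider_reclaim_throttle():
 * nr_reclaimed > nr_scanned / 8, i.e. better than 12.5% efficiency.
 */
static bool making_progress(unsigned long nr_reclaimed,
			    unsigned long nr_scanned)
{
	return nr_reclaimed > (nr_scanned >> 3);
}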
@@ -3356,6 +3476,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
continue;
last_pgdat = zone->zone_pgdat;
shrink_node(zone->zone_pgdat, sc);
+ consider_reclaim_throttle(zone->zone_pgdat, sc);
}
/*
@@ -4302,6 +4423,7 @@ static int kswapd(void *p)
WRITE_ONCE(pgdat->kswapd_order, 0);
WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
+ atomic_set(&pgdat->nr_writeback_throttled, 0);
for ( ; ; ) {
bool ret;
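As a closing note on scale: the reclaim_throttle() timeouts are written in jiffies, so they express the same wall-clock durations whatever tick rate the kernel is built with. A small conversion sketch (the HZ value here is illustrative; CONFIG_HZ is a build-time choice):

#include <stdio.h>

#define HZ 250	/* illustrative; CONFIG_HZ varies by kernel config */

static void show(const char *reason, long jiffies)
{
	/* jiffies -> milliseconds: one jiffy lasts 1000/HZ ms */
	printf("%-11s %4ld jiffies = %4ld ms\n", reason, jiffies,
	       jiffies * 1000 / HZ);
}

int main(void)
{
	show("WRITEBACK:", HZ / 10);	/*  100 ms */
	show("NOPROGRESS:", HZ / 2);	/*  500 ms */
	show("ISOLATED:", HZ / 50);	/*   20 ms */
	show("default:", HZ);		/* 1000 ms */
	return 0;
}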