summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/backing-dev.c48
-rw-r--r--mm/filemap.c1
-rw-r--r--mm/internal.h11
-rw-r--r--mm/page_alloc.c5
-rw-r--r--mm/vmscan.c82
-rw-r--r--mm/vmstat.c1
6 files changed, 88 insertions, 60 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 5ccb25089808..3d2983752e24 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -1038,51 +1038,3 @@ long congestion_wait(int sync, long timeout)
return ret;
}
EXPORT_SYMBOL(congestion_wait);
-
-/**
- * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a pgdat to complete writes
- * @sync: SYNC or ASYNC IO
- * @timeout: timeout in jiffies
- *
- * In the event of a congested backing_dev (any backing_dev) this waits
- * for up to @timeout jiffies for either a BDI to exit congestion of the
- * given @sync queue or a write to complete.
- *
- * The return value is 0 if the sleep is for the full timeout. Otherwise,
- * it is the number of jiffies that were still remaining when the function
- * returned. return_value == timeout implies the function did not sleep.
- */
-long wait_iff_congested(int sync, long timeout)
-{
- long ret;
- unsigned long start = jiffies;
- DEFINE_WAIT(wait);
- wait_queue_head_t *wqh = &congestion_wqh[sync];
-
- /*
- * If there is no congestion, yield if necessary instead
- * of sleeping on the congestion queue
- */
- if (atomic_read(&nr_wb_congested[sync]) == 0) {
- cond_resched();
-
- /* In case we scheduled, work out time remaining */
- ret = timeout - (jiffies - start);
- if (ret < 0)
- ret = 0;
-
- goto out;
- }
-
- /* Sleep until uncongested or a write happens */
- prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
- ret = io_schedule_timeout(timeout);
- finish_wait(wqh, &wait);
-
-out:
- trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout),
- jiffies_to_usecs(jiffies - start));
-
- return ret;
-}
-EXPORT_SYMBOL(wait_iff_congested);
diff --git a/mm/filemap.c b/mm/filemap.c
index 6f2c4bd6a571..b6140debc2da 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1612,6 +1612,7 @@ void end_page_writeback(struct page *page)
smp_mb__after_atomic();
wake_up_page(page, PG_writeback);
+ acct_reclaim_writeback(page);
put_page(page);
}
EXPORT_SYMBOL(end_page_writeback);
diff --git a/mm/internal.h b/mm/internal.h
index 6c3e1a9f8c5a..a59b5626f968 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -34,6 +34,17 @@
void page_writeback_init(void);
+void __acct_reclaim_writeback(pg_data_t *pgdat, struct page *page,
+ int nr_throttled);
+static inline void acct_reclaim_writeback(struct page *page)
+{
+ pg_data_t *pgdat = page_pgdat(page);
+ int nr_throttled = atomic_read(&pgdat->nr_writeback_throttled);
+
+ if (nr_throttled)
+ __acct_reclaim_writeback(pgdat, page, nr_throttled);
+}
+
vm_fault_t do_swap_page(struct vm_fault *vmf);
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 09a0f1c5d5d2..0f1f1f353211 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -7408,6 +7408,8 @@ static void pgdat_init_kcompactd(struct pglist_data *pgdat) {}
static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
{
+ int i;
+
pgdat_resize_init(pgdat);
pgdat_init_split_queue(pgdat);
@@ -7416,6 +7418,9 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
init_waitqueue_head(&pgdat->kswapd_wait);
init_waitqueue_head(&pgdat->pfmemalloc_wait);
+ for (i = 0; i < NR_VMSCAN_THROTTLE; i++)
+ init_waitqueue_head(&pgdat->reclaim_wait[i]);
+
pgdat_page_ext_init(pgdat);
lruvec_init(&pgdat->__lruvec);
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9483dd6af550..09beeb55546c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1006,6 +1006,64 @@ static void handle_write_error(struct address_space *mapping,
unlock_page(page);
}
+static void
+reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason,
+ long timeout)
+{
+ wait_queue_head_t *wqh = &pgdat->reclaim_wait[reason];
+ long ret;
+ DEFINE_WAIT(wait);
+
+ /*
+ * Do not throttle IO workers, kthreads other than kswapd or
+ * workqueues. They may be required for reclaim to make
+ * forward progress (e.g. journalling workqueues or kthreads).
+ */
+ if (!current_is_kswapd() &&
+ current->flags & (PF_IO_WORKER|PF_KTHREAD))
+ return;
+
+ if (atomic_inc_return(&pgdat->nr_writeback_throttled) == 1) {
+ WRITE_ONCE(pgdat->nr_reclaim_start,
+ node_page_state(pgdat, NR_THROTTLED_WRITTEN));
+ }
+
+ prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
+ ret = schedule_timeout(timeout);
+ finish_wait(wqh, &wait);
+ atomic_dec(&pgdat->nr_writeback_throttled);
+
+ trace_mm_vmscan_throttled(pgdat->node_id, jiffies_to_usecs(timeout),
+ jiffies_to_usecs(timeout - ret),
+ reason);
+}
+
+/*
+ * Account for pages written if tasks are throttled waiting on dirty
+ * pages to clean. If enough pages have been cleaned since throttling
+ * started then wakeup the throttled tasks.
+ */
+void __acct_reclaim_writeback(pg_data_t *pgdat, struct page *page,
+ int nr_throttled)
+{
+ unsigned long nr_written;
+
+ inc_node_page_state(page, NR_THROTTLED_WRITTEN);
+
+ /*
+ * This is an inaccurate read as the per-cpu deltas may not
+ * be synchronised. However, given that the system is
+ * writeback throttled, it is not worth taking the penalty
+ * of getting an accurate count. At worst, the throttle
+ * timeout guarantees forward progress.
+ */
+ nr_written = node_page_state(pgdat, NR_THROTTLED_WRITTEN) -
+ READ_ONCE(pgdat->nr_reclaim_start);
+
+ if (nr_written > SWAP_CLUSTER_MAX * nr_throttled)
+ wake_up(&pgdat->reclaim_wait[VMSCAN_THROTTLE_WRITEBACK]);
+}
+
/* possible outcome of pageout() */
typedef enum {
/* failed to write page out, page is locked */
@@ -1411,9 +1469,8 @@ retry:
/*
* The number of dirty pages determines if a node is marked
- * reclaim_congested which affects wait_iff_congested. kswapd
- * will stall and start writing pages if the tail of the LRU
- * is all dirty unqueued pages.
+ * reclaim_congested. kswapd will stall and start writing
+ * pages if the tail of the LRU is all dirty unqueued pages.
*/
page_check_dirty_writeback(page, &dirty, &writeback);
if (dirty || writeback)
@@ -3179,19 +3236,19 @@ again:
* If kswapd scans pages marked for immediate
* reclaim and under writeback (nr_immediate), it
* implies that pages are cycling through the LRU
- * faster than they are written so also forcibly stall.
+ * faster than they are written so forcibly stall
+ * until some pages complete writeback.
*/
if (sc->nr.immediate)
- congestion_wait(BLK_RW_ASYNC, HZ/10);
+ reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK, HZ/10);
}
/*
- * Tag a node/memcg as congested if all the dirty pages
- * scanned were backed by a congested BDI and
- * wait_iff_congested will stall.
+ * Tag a node/memcg as congested if all the dirty pages were marked
+ * for writeback and immediate reclaim (counted in nr.congested).
*
* Legacy memcg will stall in page writeback so avoid forcibly
- * stalling in wait_iff_congested().
+ * stalling in reclaim_throttle().
*/
if ((current_is_kswapd() ||
(cgroup_reclaim(sc) && writeback_throttling_sane(sc))) &&
@@ -3199,15 +3256,15 @@ again:
set_bit(LRUVEC_CONGESTED, &target_lruvec->flags);
/*
- * Stall direct reclaim for IO completions if underlying BDIs
- * and node is congested. Allow kswapd to continue until it
+ * Stall direct reclaim for IO completions if the lruvec is
+ * node is congested. Allow kswapd to continue until it
* starts encountering unqueued dirty pages or cycling through
* the LRU too quickly.
*/
if (!current_is_kswapd() && current_may_throttle() &&
!sc->hibernation_mode &&
test_bit(LRUVEC_CONGESTED, &target_lruvec->flags))
- wait_iff_congested(BLK_RW_ASYNC, HZ/10);
+ reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK, HZ/10);
if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
sc))
@@ -4285,6 +4342,7 @@ static int kswapd(void *p)
WRITE_ONCE(pgdat->kswapd_order, 0);
WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
+ atomic_set(&pgdat->nr_writeback_throttled, 0);
for ( ; ; ) {
bool ret;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index ae87c90e0b4e..0f4643e5324d 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1225,6 +1225,7 @@ const char * const vmstat_text[] = {
"nr_vmscan_immediate_reclaim",
"nr_dirtied",
"nr_written",
+ "nr_throttled_written",
"nr_kernel_misc_reclaimable",
"nr_foll_pin_acquired",
"nr_foll_pin_released",