summaryrefslogtreecommitdiff
path: root/mm/page-writeback.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/page-writeback.c')
-rw-r--r--mm/page-writeback.c621
1 files changed, 274 insertions, 347 deletions
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 8a1c92090129..ccdeb0e84d39 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -38,6 +38,7 @@
#include <linux/sched/rt.h>
#include <linux/sched/signal.h>
#include <linux/mm_inline.h>
+#include <linux/shmem_fs.h>
#include <trace/events/writeback.h>
#include "internal.h"
@@ -54,7 +55,7 @@
#define DIRTY_POLL_THRESH (128 >> (PAGE_SHIFT - 10))
/*
- * Estimate write bandwidth at 200ms intervals.
+ * Estimate write bandwidth or update dirty limit at 200ms intervals.
*/
#define BANDWIDTH_INTERVAL max(HZ/5, 1)
@@ -120,27 +121,6 @@ EXPORT_SYMBOL(laptop_mode);
struct wb_domain global_wb_domain;
-/* consolidated parameters for balance_dirty_pages() and its subroutines */
-struct dirty_throttle_control {
-#ifdef CONFIG_CGROUP_WRITEBACK
- struct wb_domain *dom;
- struct dirty_throttle_control *gdtc; /* only set in memcg dtc's */
-#endif
- struct bdi_writeback *wb;
- struct fprop_local_percpu *wb_completions;
-
- unsigned long avail; /* dirtyable */
- unsigned long dirty; /* file_dirty + write + nfs */
- unsigned long thresh; /* dirty threshold */
- unsigned long bg_thresh; /* dirty background threshold */
-
- unsigned long wb_dirty; /* per-wb counterparts */
- unsigned long wb_thresh;
- unsigned long wb_bg_thresh;
-
- unsigned long pos_ratio;
-};
-
/*
* Length of period for aging writeout fractions of bdis. This is an
* arbitrarily chosen number. The longer the period, the slower fractions will
@@ -416,7 +396,7 @@ static void domain_dirty_limits(struct dirty_throttle_control *dtc)
bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE;
tsk = current;
- if (rt_task(tsk)) {
+ if (rt_or_dl_task(tsk)) {
bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32;
thresh += thresh / 4 + global_wb_domain.dirty_limit / 32;
}
@@ -475,7 +455,7 @@ static unsigned long node_dirty_limit(struct pglist_data *pgdat)
else
dirty = vm_dirty_ratio * node_memory / 100;
- if (rt_task(tsk))
+ if (rt_or_dl_task(tsk))
dirty += dirty / 4;
/*
@@ -504,7 +484,7 @@ bool node_dirty_ok(struct pglist_data *pgdat)
}
#ifdef CONFIG_SYSCTL
-static int dirty_background_ratio_handler(struct ctl_table *table, int write,
+static int dirty_background_ratio_handler(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
@@ -515,7 +495,7 @@ static int dirty_background_ratio_handler(struct ctl_table *table, int write,
return ret;
}
-static int dirty_background_bytes_handler(struct ctl_table *table, int write,
+static int dirty_background_bytes_handler(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
@@ -533,7 +513,7 @@ static int dirty_background_bytes_handler(struct ctl_table *table, int write,
return ret;
}
-static int dirty_ratio_handler(struct ctl_table *table, int write, void *buffer,
+static int dirty_ratio_handler(const struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
int old_ratio = vm_dirty_ratio;
@@ -541,13 +521,13 @@ static int dirty_ratio_handler(struct ctl_table *table, int write, void *buffer,
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
- writeback_set_ratelimit();
vm_dirty_bytes = 0;
+ writeback_set_ratelimit();
}
return ret;
}
-static int dirty_bytes_handler(struct ctl_table *table, int write,
+static int dirty_bytes_handler(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
unsigned long old_bytes = vm_dirty_bytes;
@@ -584,7 +564,7 @@ static void wb_domain_writeout_add(struct wb_domain *dom,
/* First event after period switching was turned off? */
if (unlikely(!dom->period_time)) {
/*
- * We can race with other __bdi_writeout_inc calls here but
+ * We can race with other wb_domain_writeout_add calls here but
* it does not cause any harm since the resulting time when
* timer will fire and what is in writeout_period_time will be
* roughly the same.
@@ -628,7 +608,7 @@ EXPORT_SYMBOL_GPL(wb_writeout_inc);
*/
static void writeout_period(struct timer_list *t)
{
- struct wb_domain *dom = from_timer(dom, t, period_timer);
+ struct wb_domain *dom = timer_container_of(dom, t, period_timer);
int miss_periods = (jiffies - dom->period_time) /
VM_COMPLETIONS_PERIOD_LEN;
@@ -661,7 +641,7 @@ int wb_domain_init(struct wb_domain *dom, gfp_t gfp)
#ifdef CONFIG_CGROUP_WRITEBACK
void wb_domain_exit(struct wb_domain *dom)
{
- del_timer_sync(&dom->period_timer);
+ timer_delete_sync(&dom->period_timer);
fprop_global_destroy(&dom->completions);
}
#endif
@@ -690,6 +670,8 @@ static unsigned long bdi_ratio_from_pages(unsigned long pages)
unsigned long ratio;
global_dirty_limits(&background_thresh, &dirty_thresh);
+ if (!dirty_thresh)
+ return -EINVAL;
ratio = div64_u64(pages * 100ULL * BDI_RATIO_SCALE, dirty_thresh);
return ratio;
@@ -788,13 +770,15 @@ int bdi_set_min_bytes(struct backing_dev_info *bdi, u64 min_bytes)
{
int ret;
unsigned long pages = min_bytes >> PAGE_SHIFT;
- unsigned long min_ratio;
+ long min_ratio;
ret = bdi_check_pages_limit(pages);
if (ret)
return ret;
min_ratio = bdi_ratio_from_pages(pages);
+ if (min_ratio < 0)
+ return min_ratio;
return __bdi_set_min_ratio(bdi, min_ratio);
}
@@ -807,13 +791,15 @@ int bdi_set_max_bytes(struct backing_dev_info *bdi, u64 max_bytes)
{
int ret;
unsigned long pages = max_bytes >> PAGE_SHIFT;
- unsigned long max_ratio;
+ long max_ratio;
ret = bdi_check_pages_limit(pages);
if (ret)
return ret;
max_ratio = bdi_ratio_from_pages(pages);
+ if (max_ratio < 0)
+ return max_ratio;
return __bdi_set_max_ratio(bdi, max_ratio);
}
@@ -859,6 +845,34 @@ static void mdtc_calc_avail(struct dirty_throttle_control *mdtc,
mdtc->avail = filepages + min(headroom, other_clean);
}
+static inline bool dtc_is_global(struct dirty_throttle_control *dtc)
+{
+ return mdtc_gdtc(dtc) == NULL;
+}
+
+/*
+ * Dirty background will ignore pages being written as we're trying to
+ * decide whether to put more under writeback.
+ */
+static void domain_dirty_avail(struct dirty_throttle_control *dtc,
+ bool include_writeback)
+{
+ if (dtc_is_global(dtc)) {
+ dtc->avail = global_dirtyable_memory();
+ dtc->dirty = global_node_page_state(NR_FILE_DIRTY);
+ if (include_writeback)
+ dtc->dirty += global_node_page_state(NR_WRITEBACK);
+ } else {
+ unsigned long filepages = 0, headroom = 0, writeback = 0;
+
+ mem_cgroup_wb_stats(dtc->wb, &filepages, &headroom, &dtc->dirty,
+ &writeback);
+ if (include_writeback)
+ dtc->dirty += writeback;
+ mdtc_calc_avail(dtc, filepages, headroom);
+ }
+}
+
/**
* __wb_calc_thresh - @wb's share of dirty threshold
* @dtc: dirty_throttle_context of interest
@@ -887,7 +901,9 @@ static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc,
unsigned long thresh)
{
struct wb_domain *dom = dtc_dom(dtc);
+ struct bdi_writeback *wb = dtc->wb;
u64 wb_thresh;
+ u64 wb_max_thresh;
unsigned long numerator, denominator;
unsigned long wb_min_ratio, wb_max_ratio;
@@ -901,11 +917,27 @@ static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc,
wb_thresh *= numerator;
wb_thresh = div64_ul(wb_thresh, denominator);
- wb_min_max_ratio(dtc->wb, &wb_min_ratio, &wb_max_ratio);
+ wb_min_max_ratio(wb, &wb_min_ratio, &wb_max_ratio);
wb_thresh += (thresh * wb_min_ratio) / (100 * BDI_RATIO_SCALE);
- if (wb_thresh > (thresh * wb_max_ratio) / (100 * BDI_RATIO_SCALE))
- wb_thresh = thresh * wb_max_ratio / (100 * BDI_RATIO_SCALE);
+
+ /*
+ * It's very possible that wb_thresh is close to 0 not because the
+ * device is slow, but that it has remained inactive for long time.
+ * Honour such devices a reasonable good (hopefully IO efficient)
+ * threshold, so that the occasional writes won't be blocked and active
+ * writes can rampup the threshold quickly.
+ */
+ if (thresh > dtc->dirty) {
+ if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT))
+ wb_thresh = max(wb_thresh, (thresh - dtc->dirty) / 100);
+ else
+ wb_thresh = max(wb_thresh, (thresh - dtc->dirty) / 8);
+ }
+
+ wb_max_thresh = thresh * wb_max_ratio / (100 * BDI_RATIO_SCALE);
+ if (wb_thresh > wb_max_thresh)
+ wb_thresh = wb_max_thresh;
return wb_thresh;
}
@@ -914,6 +946,7 @@ unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh)
{
struct dirty_throttle_control gdtc = { GDTC_INIT(wb) };
+ domain_dirty_avail(&gdtc, true);
return __wb_calc_thresh(&gdtc, thresh);
}
@@ -921,16 +954,9 @@ unsigned long cgwb_calc_thresh(struct bdi_writeback *wb)
{
struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB };
struct dirty_throttle_control mdtc = { MDTC_INIT(wb, &gdtc) };
- unsigned long filepages = 0, headroom = 0, writeback = 0;
- gdtc.avail = global_dirtyable_memory();
- gdtc.dirty = global_node_page_state(NR_FILE_DIRTY) +
- global_node_page_state(NR_WRITEBACK);
-
- mem_cgroup_wb_stats(wb, &filepages, &headroom,
- &mdtc.dirty, &writeback);
- mdtc.dirty += writeback;
- mdtc_calc_avail(&mdtc, filepages, headroom);
+ domain_dirty_avail(&gdtc, true);
+ domain_dirty_avail(&mdtc, true);
domain_dirty_limits(&mdtc);
return __wb_calc_thresh(&mdtc, mdtc.thresh);
@@ -1047,7 +1073,7 @@ static void wb_position_ratio(struct dirty_throttle_control *dtc)
struct bdi_writeback *wb = dtc->wb;
unsigned long write_bw = READ_ONCE(wb->avg_write_bandwidth);
unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
- unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
+ unsigned long limit = dtc->limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
unsigned long wb_thresh = dtc->wb_thresh;
unsigned long x_intercept;
unsigned long setpoint; /* dirty pages' target balance point */
@@ -1075,9 +1101,7 @@ static void wb_position_ratio(struct dirty_throttle_control *dtc)
* such filesystems balance_dirty_pages always checks wb counters
* against wb limits. Even if global "nr_dirty" is under "freerun".
* This is especially important for fuse which sets bdi->max_ratio to
- * 1% by default. Without strictlimit feature, fuse writeback may
- * consume arbitrary amount of RAM because it is accounted in
- * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty".
+ * 1% by default.
*
* Here, in wb_position_ratio(), we calculate pos_ratio based on
* two values: wb_dirty and wb_thresh. Let's consider an example:
@@ -1097,12 +1121,6 @@ static void wb_position_ratio(struct dirty_throttle_control *dtc)
if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
long long wb_pos_ratio;
- if (dtc->wb_dirty < 8) {
- dtc->pos_ratio = min_t(long long, pos_ratio * 2,
- 2 << RATELIMIT_CALC_SHIFT);
- return;
- }
-
if (dtc->wb_dirty >= wb_thresh)
return;
@@ -1174,14 +1192,6 @@ static void wb_position_ratio(struct dirty_throttle_control *dtc)
if (unlikely(wb_thresh > dtc->thresh))
wb_thresh = dtc->thresh;
/*
- * It's very possible that wb_thresh is close to 0 not because the
- * device is slow, but that it has remained inactive for long time.
- * Honour such devices a reasonable good (hopefully IO efficient)
- * threshold, so that the occasional writes won't be blocked and active
- * writes can rampup the threshold quickly.
- */
- wb_thresh = max(wb_thresh, (limit - dtc->dirty) / 8);
- /*
* scale global setpoint to wb's:
* wb_setpoint = setpoint * wb_thresh / thresh
*/
@@ -1436,17 +1446,10 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
* balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate).
* Hence, to calculate "step" properly, we have to use wb_dirty as
* "dirty" and wb_setpoint as "setpoint".
- *
- * We rampup dirty_ratelimit forcibly if wb_dirty is low because
- * it's possible that wb_thresh is close to zero due to inactivity
- * of backing device.
*/
if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
dirty = dtc->wb_dirty;
- if (dtc->wb_dirty < 8)
- setpoint = dtc->wb_dirty + 1;
- else
- setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2;
+ setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2;
}
if (dirty < setpoint) {
@@ -1703,6 +1706,100 @@ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc)
}
}
+static unsigned long domain_poll_intv(struct dirty_throttle_control *dtc,
+ bool strictlimit)
+{
+ unsigned long dirty, thresh;
+
+ if (strictlimit) {
+ dirty = dtc->wb_dirty;
+ thresh = dtc->wb_thresh;
+ } else {
+ dirty = dtc->dirty;
+ thresh = dtc->thresh;
+ }
+
+ return dirty_poll_interval(dirty, thresh);
+}
+
+/*
+ * Throttle it only when the background writeback cannot catch-up. This avoids
+ * (excessively) small writeouts when the wb limits are ramping up in case of
+ * !strictlimit.
+ *
+ * In strictlimit case make decision based on the wb counters and limits. Small
+ * writeouts when the wb limits are ramping up are the price we consciously pay
+ * for strictlimit-ing.
+ */
+static void domain_dirty_freerun(struct dirty_throttle_control *dtc,
+ bool strictlimit)
+{
+ unsigned long dirty, thresh, bg_thresh;
+
+ if (unlikely(strictlimit)) {
+ wb_dirty_limits(dtc);
+ dirty = dtc->wb_dirty;
+ thresh = dtc->wb_thresh;
+ bg_thresh = dtc->wb_bg_thresh;
+ } else {
+ dirty = dtc->dirty;
+ thresh = dtc->thresh;
+ bg_thresh = dtc->bg_thresh;
+ }
+ dtc->freerun = dirty <= dirty_freerun_ceiling(thresh, bg_thresh);
+}
+
+static void balance_domain_limits(struct dirty_throttle_control *dtc,
+ bool strictlimit)
+{
+ domain_dirty_avail(dtc, true);
+ domain_dirty_limits(dtc);
+ domain_dirty_freerun(dtc, strictlimit);
+}
+
+static void wb_dirty_freerun(struct dirty_throttle_control *dtc,
+ bool strictlimit)
+{
+ dtc->freerun = false;
+
+ /* was already handled in domain_dirty_freerun */
+ if (strictlimit)
+ return;
+
+ wb_dirty_limits(dtc);
+ /*
+ * LOCAL_THROTTLE tasks must not be throttled when below the per-wb
+ * freerun ceiling.
+ */
+ if (!(current->flags & PF_LOCAL_THROTTLE))
+ return;
+
+ dtc->freerun = dtc->wb_dirty <
+ dirty_freerun_ceiling(dtc->wb_thresh, dtc->wb_bg_thresh);
+}
+
+static inline void wb_dirty_exceeded(struct dirty_throttle_control *dtc,
+ bool strictlimit)
+{
+ dtc->dirty_exceeded = (dtc->wb_dirty > dtc->wb_thresh) &&
+ ((dtc->dirty > dtc->thresh) || strictlimit);
+}
+
+/*
+ * The limits fields dirty_exceeded and pos_ratio won't be updated if wb is
+ * in freerun state. Please don't use these invalid fields in freerun case.
+ */
+static void balance_wb_limits(struct dirty_throttle_control *dtc,
+ bool strictlimit)
+{
+ wb_dirty_freerun(dtc, strictlimit);
+ if (dtc->freerun)
+ return;
+
+ wb_dirty_exceeded(dtc, strictlimit);
+ wb_position_ratio(dtc);
+}
+
/*
* balance_dirty_pages() must be called by processes which are generating dirty
* data. It looks at the number of dirty pages in the machine and will force
@@ -1725,7 +1822,6 @@ static int balance_dirty_pages(struct bdi_writeback *wb,
long max_pause;
long min_pause;
int nr_dirtied_pause;
- bool dirty_exceeded = false;
unsigned long task_ratelimit;
unsigned long dirty_ratelimit;
struct backing_dev_info *bdi = wb->bdi;
@@ -1735,53 +1831,16 @@ static int balance_dirty_pages(struct bdi_writeback *wb,
for (;;) {
unsigned long now = jiffies;
- unsigned long dirty, thresh, bg_thresh;
- unsigned long m_dirty = 0; /* stop bogus uninit warnings */
- unsigned long m_thresh = 0;
- unsigned long m_bg_thresh = 0;
nr_dirty = global_node_page_state(NR_FILE_DIRTY);
- gdtc->avail = global_dirtyable_memory();
- gdtc->dirty = nr_dirty + global_node_page_state(NR_WRITEBACK);
-
- domain_dirty_limits(gdtc);
-
- if (unlikely(strictlimit)) {
- wb_dirty_limits(gdtc);
-
- dirty = gdtc->wb_dirty;
- thresh = gdtc->wb_thresh;
- bg_thresh = gdtc->wb_bg_thresh;
- } else {
- dirty = gdtc->dirty;
- thresh = gdtc->thresh;
- bg_thresh = gdtc->bg_thresh;
- }
+ balance_domain_limits(gdtc, strictlimit);
if (mdtc) {
- unsigned long filepages, headroom, writeback;
-
/*
* If @wb belongs to !root memcg, repeat the same
* basic calculations for the memcg domain.
*/
- mem_cgroup_wb_stats(wb, &filepages, &headroom,
- &mdtc->dirty, &writeback);
- mdtc->dirty += writeback;
- mdtc_calc_avail(mdtc, filepages, headroom);
-
- domain_dirty_limits(mdtc);
-
- if (unlikely(strictlimit)) {
- wb_dirty_limits(mdtc);
- m_dirty = mdtc->wb_dirty;
- m_thresh = mdtc->wb_thresh;
- m_bg_thresh = mdtc->wb_bg_thresh;
- } else {
- m_dirty = mdtc->dirty;
- m_thresh = mdtc->thresh;
- m_bg_thresh = mdtc->bg_thresh;
- }
+ balance_domain_limits(mdtc, strictlimit);
}
/*
@@ -1798,31 +1857,21 @@ static int balance_dirty_pages(struct bdi_writeback *wb,
wb_start_background_writeback(wb);
/*
- * Throttle it only when the background writeback cannot
- * catch-up. This avoids (excessively) small writeouts
- * when the wb limits are ramping up in case of !strictlimit.
- *
- * In strictlimit case make decision based on the wb counters
- * and limits. Small writeouts when the wb limits are ramping
- * up are the price we consciously pay for strictlimit-ing.
- *
* If memcg domain is in effect, @dirty should be under
* both global and memcg freerun ceilings.
*/
- if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) &&
- (!mdtc ||
- m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) {
+ if (gdtc->freerun && (!mdtc || mdtc->freerun)) {
unsigned long intv;
unsigned long m_intv;
free_running:
- intv = dirty_poll_interval(dirty, thresh);
+ intv = domain_poll_intv(gdtc, strictlimit);
m_intv = ULONG_MAX;
current->dirty_paused_when = now;
current->nr_dirtied = 0;
if (mdtc)
- m_intv = dirty_poll_interval(m_dirty, m_thresh);
+ m_intv = domain_poll_intv(mdtc, strictlimit);
current->nr_dirtied_pause = min(intv, m_intv);
break;
}
@@ -1837,24 +1886,9 @@ free_running:
* Calculate global domain's pos_ratio and select the
* global dtc by default.
*/
- if (!strictlimit) {
- wb_dirty_limits(gdtc);
-
- if ((current->flags & PF_LOCAL_THROTTLE) &&
- gdtc->wb_dirty <
- dirty_freerun_ceiling(gdtc->wb_thresh,
- gdtc->wb_bg_thresh))
- /*
- * LOCAL_THROTTLE tasks must not be throttled
- * when below the per-wb freerun ceiling.
- */
- goto free_running;
- }
-
- dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) &&
- ((gdtc->dirty > gdtc->thresh) || strictlimit);
-
- wb_position_ratio(gdtc);
+ balance_wb_limits(gdtc, strictlimit);
+ if (gdtc->freerun)
+ goto free_running;
sdtc = gdtc;
if (mdtc) {
@@ -1864,31 +1898,15 @@ free_running:
* both global and memcg domains. Choose the one
* w/ lower pos_ratio.
*/
- if (!strictlimit) {
- wb_dirty_limits(mdtc);
-
- if ((current->flags & PF_LOCAL_THROTTLE) &&
- mdtc->wb_dirty <
- dirty_freerun_ceiling(mdtc->wb_thresh,
- mdtc->wb_bg_thresh))
- /*
- * LOCAL_THROTTLE tasks must not be
- * throttled when below the per-wb
- * freerun ceiling.
- */
- goto free_running;
- }
- dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) &&
- ((mdtc->dirty > mdtc->thresh) || strictlimit);
-
- wb_position_ratio(mdtc);
+ balance_wb_limits(mdtc, strictlimit);
+ if (mdtc->freerun)
+ goto free_running;
if (mdtc->pos_ratio < gdtc->pos_ratio)
sdtc = mdtc;
}
- if (dirty_exceeded != wb->dirty_exceeded)
- wb->dirty_exceeded = dirty_exceeded;
-
+ wb->dirty_exceeded = gdtc->dirty_exceeded ||
+ (mdtc && mdtc->dirty_exceeded);
if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) +
BANDWIDTH_INTERVAL))
__wb_update_bandwidth(gdtc, mdtc, true);
@@ -1920,11 +1938,7 @@ free_running:
*/
if (pause < min_pause) {
trace_balance_dirty_pages(wb,
- sdtc->thresh,
- sdtc->bg_thresh,
- sdtc->dirty,
- sdtc->wb_thresh,
- sdtc->wb_dirty,
+ sdtc,
dirty_ratelimit,
task_ratelimit,
pages_dirtied,
@@ -1949,11 +1963,7 @@ free_running:
pause:
trace_balance_dirty_pages(wb,
- sdtc->thresh,
- sdtc->bg_thresh,
- sdtc->dirty,
- sdtc->wb_thresh,
- sdtc->wb_dirty,
+ sdtc,
dirty_ratelimit,
task_ratelimit,
pages_dirtied,
@@ -2109,6 +2119,35 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping)
}
EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
+/*
+ * Similar to wb_dirty_limits, wb_bg_dirty_limits also calculates dirty
+ * and thresh, but it's for background writeback.
+ */
+static void wb_bg_dirty_limits(struct dirty_throttle_control *dtc)
+{
+ struct bdi_writeback *wb = dtc->wb;
+
+ dtc->wb_bg_thresh = __wb_calc_thresh(dtc, dtc->bg_thresh);
+ if (dtc->wb_bg_thresh < 2 * wb_stat_error())
+ dtc->wb_dirty = wb_stat_sum(wb, WB_RECLAIMABLE);
+ else
+ dtc->wb_dirty = wb_stat(wb, WB_RECLAIMABLE);
+}
+
+static bool domain_over_bg_thresh(struct dirty_throttle_control *dtc)
+{
+ domain_dirty_avail(dtc, false);
+ domain_dirty_limits(dtc);
+ if (dtc->dirty > dtc->bg_thresh)
+ return true;
+
+ wb_bg_dirty_limits(dtc);
+ if (dtc->wb_dirty > dtc->wb_bg_thresh)
+ return true;
+
+ return false;
+}
+
/**
* wb_over_bg_thresh - does @wb need to be written back?
* @wb: bdi_writeback of interest
@@ -2120,54 +2159,14 @@ EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
*/
bool wb_over_bg_thresh(struct bdi_writeback *wb)
{
- struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
- struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
- struct dirty_throttle_control * const gdtc = &gdtc_stor;
- struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
- &mdtc_stor : NULL;
- unsigned long reclaimable;
- unsigned long thresh;
-
- /*
- * Similar to balance_dirty_pages() but ignores pages being written
- * as we're trying to decide whether to put more under writeback.
- */
- gdtc->avail = global_dirtyable_memory();
- gdtc->dirty = global_node_page_state(NR_FILE_DIRTY);
- domain_dirty_limits(gdtc);
-
- if (gdtc->dirty > gdtc->bg_thresh)
- return true;
-
- thresh = __wb_calc_thresh(gdtc, gdtc->bg_thresh);
- if (thresh < 2 * wb_stat_error())
- reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE);
- else
- reclaimable = wb_stat(wb, WB_RECLAIMABLE);
+ struct dirty_throttle_control gdtc = { GDTC_INIT(wb) };
+ struct dirty_throttle_control mdtc = { MDTC_INIT(wb, &gdtc) };
- if (reclaimable > thresh)
+ if (domain_over_bg_thresh(&gdtc))
return true;
- if (mdtc) {
- unsigned long filepages, headroom, writeback;
-
- mem_cgroup_wb_stats(wb, &filepages, &headroom, &mdtc->dirty,
- &writeback);
- mdtc_calc_avail(mdtc, filepages, headroom);
- domain_dirty_limits(mdtc); /* ditto, ignore writeback */
-
- if (mdtc->dirty > mdtc->bg_thresh)
- return true;
-
- thresh = __wb_calc_thresh(mdtc, mdtc->bg_thresh);
- if (thresh < 2 * wb_stat_error())
- reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE);
- else
- reclaimable = wb_stat(wb, WB_RECLAIMABLE);
-
- if (reclaimable > thresh)
- return true;
- }
+ if (mdtc_valid(&mdtc))
+ return domain_over_bg_thresh(&mdtc);
return false;
}
@@ -2176,7 +2175,7 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb)
/*
* sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
*/
-static int dirty_writeback_centisecs_handler(struct ctl_table *table, int write,
+static int dirty_writeback_centisecs_handler(const struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
unsigned int old_interval = dirty_writeback_interval;
@@ -2202,7 +2201,7 @@ static int dirty_writeback_centisecs_handler(struct ctl_table *table, int write,
void laptop_mode_timer_fn(struct timer_list *t)
{
struct backing_dev_info *backing_dev_info =
- from_timer(backing_dev_info, t, laptop_mode_wb_timer);
+ timer_container_of(backing_dev_info, t, laptop_mode_wb_timer);
wakeup_flusher_threads_bdi(backing_dev_info, WB_REASON_LAPTOP_TIMER);
}
@@ -2229,7 +2228,7 @@ void laptop_sync_completion(void)
rcu_read_lock();
list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
- del_timer(&bdi->laptop_mode_wb_timer);
+ timer_delete(&bdi->laptop_mode_wb_timer);
rcu_read_unlock();
}
@@ -2267,7 +2266,7 @@ static int page_writeback_cpu_online(unsigned int cpu)
/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */
static const unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
-static struct ctl_table vm_page_writeback_sysctls[] = {
+static const struct ctl_table vm_page_writeback_sysctls[] = {
{
.procname = "dirty_background_ratio",
.data = &dirty_background_ratio,
@@ -2435,12 +2434,6 @@ static bool folio_prepare_writeback(struct address_space *mapping,
return true;
}
-static xa_mark_t wbc_to_tag(struct writeback_control *wbc)
-{
- if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
- return PAGECACHE_TAG_TOWRITE;
- return PAGECACHE_TAG_DIRTY;
-}
static pgoff_t wbc_end(struct writeback_control *wbc)
{
@@ -2564,11 +2557,11 @@ struct folio *writeback_iter(struct address_space *mapping,
if (!folio) {
/*
* To avoid deadlocks between range_cyclic writeback and callers
- * that hold pages in PageWriteback to aggregate I/O until
+ * that hold folios in writeback to aggregate I/O until
* the writeback iteration finishes, we do not loop back to the
- * start of the file. Doing so causes a page lock/page
+ * start of the file. Doing so causes a folio lock/folio
* writeback access order inversion - we should only ever lock
- * multiple pages in ascending page->index order, and looping
+ * multiple folios in ascending folio->index order, and looping
* back to the start of the file violates that rule and causes
* deadlocks.
*/
@@ -2585,63 +2578,12 @@ struct folio *writeback_iter(struct address_space *mapping,
done:
if (wbc->range_cyclic)
- mapping->writeback_index = folio->index + folio_nr_pages(folio);
+ mapping->writeback_index = folio_next_index(folio);
folio_batch_release(&wbc->fbatch);
return NULL;
}
EXPORT_SYMBOL_GPL(writeback_iter);
-/**
- * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
- * @mapping: address space structure to write
- * @wbc: subtract the number of written pages from *@wbc->nr_to_write
- * @writepage: function called for each page
- * @data: data passed to writepage function
- *
- * Return: %0 on success, negative error code otherwise
- *
- * Note: please use writeback_iter() instead.
- */
-int write_cache_pages(struct address_space *mapping,
- struct writeback_control *wbc, writepage_t writepage,
- void *data)
-{
- struct folio *folio = NULL;
- int error;
-
- while ((folio = writeback_iter(mapping, wbc, folio, &error))) {
- error = writepage(folio, wbc, data);
- if (error == AOP_WRITEPAGE_ACTIVATE) {
- folio_unlock(folio);
- error = 0;
- }
- }
-
- return error;
-}
-EXPORT_SYMBOL(write_cache_pages);
-
-static int writeback_use_writepage(struct address_space *mapping,
- struct writeback_control *wbc)
-{
- struct folio *folio = NULL;
- struct blk_plug plug;
- int err;
-
- blk_start_plug(&plug);
- while ((folio = writeback_iter(mapping, wbc, folio, &err))) {
- err = mapping->a_ops->writepage(&folio->page, wbc);
- if (err == AOP_WRITEPAGE_ACTIVATE) {
- folio_unlock(folio);
- err = 0;
- }
- mapping_set_error(mapping, err);
- }
- blk_finish_plug(&plug);
-
- return err;
-}
-
int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
int ret;
@@ -2652,14 +2594,11 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
wb = inode_to_wb_wbc(mapping->host, wbc);
wb_bandwidth_estimate_start(wb);
while (1) {
- if (mapping->a_ops->writepages) {
+ if (mapping->a_ops->writepages)
ret = mapping->a_ops->writepages(mapping, wbc);
- } else if (mapping->a_ops->writepage) {
- ret = writeback_use_writepage(mapping, wbc);
- } else {
+ else
/* deal with chardevs and other special files */
ret = 0;
- }
if (ret != -ENOMEM || wbc->sync_mode != WB_SYNC_ALL)
break;
@@ -2697,8 +2636,6 @@ EXPORT_SYMBOL(noop_dirty_folio);
/*
* Helper function for set_page_dirty family.
*
- * Caller must hold folio_memcg_lock().
- *
* NOTE: This relies on being atomic wrt interrupts.
*/
static void folio_account_dirtied(struct folio *folio,
@@ -2715,7 +2652,7 @@ static void folio_account_dirtied(struct folio *folio,
inode_attach_wb(inode, folio);
wb = inode_to_wb(inode);
- __lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, nr);
+ lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, nr);
__zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr);
__node_stat_mod_folio(folio, NR_DIRTIED, nr);
wb_stat_mod(wb, WB_RECLAIMABLE, nr);
@@ -2731,7 +2668,6 @@ static void folio_account_dirtied(struct folio *folio,
/*
* Helper function for deaccounting dirty page without writeback.
*
- * Caller must hold folio_memcg_lock().
*/
void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb)
{
@@ -2749,9 +2685,8 @@ void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb)
* If warn is true, then emit a warning if the folio is not uptodate and has
* not been truncated.
*
- * The caller must hold folio_memcg_lock(). It is the caller's
- * responsibility to prevent the folio from being truncated while
- * this function is in progress, although it may have been truncated
+ * It is the caller's responsibility to prevent the folio from being truncated
+ * while this function is in progress, although it may have been truncated
* before this function is called. Most callers have the folio locked.
* A few have the folio blocked from truncation through other means (e.g.
* zap_vma_pages() has it mapped and is holding the page table lock).
@@ -2764,12 +2699,18 @@ void __folio_mark_dirty(struct folio *folio, struct address_space *mapping,
{
unsigned long flags;
+ /*
+ * Shmem writeback relies on swap, and swap writeback is LRU based,
+ * not using the dirty mark.
+ */
+ VM_WARN_ON_ONCE(folio_test_swapcache(folio) || shmem_mapping(mapping));
+
xa_lock_irqsave(&mapping->i_pages, flags);
if (folio->mapping) { /* Race with truncate? */
WARN_ON_ONCE(warn && !folio_test_uptodate(folio));
folio_account_dirtied(folio, mapping);
- __xa_set_mark(&mapping->i_pages, folio_index(folio),
- PAGECACHE_TAG_DIRTY);
+ __xa_set_mark(&mapping->i_pages, folio->index,
+ PAGECACHE_TAG_DIRTY);
}
xa_unlock_irqrestore(&mapping->i_pages, flags);
}
@@ -2795,14 +2736,10 @@ void __folio_mark_dirty(struct folio *folio, struct address_space *mapping,
*/
bool filemap_dirty_folio(struct address_space *mapping, struct folio *folio)
{
- folio_memcg_lock(folio);
- if (folio_test_set_dirty(folio)) {
- folio_memcg_unlock(folio);
+ if (folio_test_set_dirty(folio))
return false;
- }
__folio_mark_dirty(folio, mapping, !folio_test_private(folio));
- folio_memcg_unlock(folio);
if (mapping->host) {
/* !PageAnon && !swapper_space */
@@ -2887,25 +2824,25 @@ bool folio_mark_dirty(struct folio *folio)
EXPORT_SYMBOL(folio_mark_dirty);
/*
- * set_page_dirty() is racy if the caller has no reference against
- * page->mapping->host, and if the page is unlocked. This is because another
- * CPU could truncate the page off the mapping and then free the mapping.
+ * folio_mark_dirty() is racy if the caller has no reference against
+ * folio->mapping->host, and if the folio is unlocked. This is because another
+ * CPU could truncate the folio off the mapping and then free the mapping.
*
- * Usually, the page _is_ locked, or the caller is a user-space process which
+ * Usually, the folio _is_ locked, or the caller is a user-space process which
* holds a reference on the inode by having an open file.
*
- * In other cases, the page should be locked before running set_page_dirty().
+ * In other cases, the folio should be locked before running folio_mark_dirty().
*/
-int set_page_dirty_lock(struct page *page)
+bool folio_mark_dirty_lock(struct folio *folio)
{
- int ret;
+ bool ret;
- lock_page(page);
- ret = set_page_dirty(page);
- unlock_page(page);
+ folio_lock(folio);
+ ret = folio_mark_dirty(folio);
+ folio_unlock(folio);
return ret;
}
-EXPORT_SYMBOL(set_page_dirty_lock);
+EXPORT_SYMBOL(folio_mark_dirty_lock);
/*
* This cancels just the dirty bit on the kernel page itself, it does NOT
@@ -2929,14 +2866,12 @@ void __folio_cancel_dirty(struct folio *folio)
struct bdi_writeback *wb;
struct wb_lock_cookie cookie = {};
- folio_memcg_lock(folio);
wb = unlocked_inode_to_wb_begin(inode, &cookie);
if (folio_test_clear_dirty(folio))
folio_account_cleaned(folio, wb);
unlocked_inode_to_wb_end(inode, &cookie);
- folio_memcg_unlock(folio);
} else {
folio_clear_dirty(folio);
}
@@ -3047,29 +2982,25 @@ bool __folio_end_writeback(struct folio *folio)
struct address_space *mapping = folio_mapping(folio);
bool ret;
- folio_memcg_lock(folio);
if (mapping && mapping_use_writeback_tags(mapping)) {
struct inode *inode = mapping->host;
- struct backing_dev_info *bdi = inode_to_bdi(inode);
+ struct bdi_writeback *wb;
unsigned long flags;
xa_lock_irqsave(&mapping->i_pages, flags);
ret = folio_xor_flags_has_waiters(folio, 1 << PG_writeback);
- __xa_clear_mark(&mapping->i_pages, folio_index(folio),
+ __xa_clear_mark(&mapping->i_pages, folio->index,
PAGECACHE_TAG_WRITEBACK);
- if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) {
- struct bdi_writeback *wb = inode_to_wb(inode);
- wb_stat_mod(wb, WB_WRITEBACK, -nr);
- __wb_writeout_add(wb, nr);
- if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
- wb_inode_writeback_end(wb);
+ wb = inode_to_wb(inode);
+ wb_stat_mod(wb, WB_WRITEBACK, -nr);
+ __wb_writeout_add(wb, nr);
+ if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) {
+ wb_inode_writeback_end(wb);
+ if (mapping->host)
+ sb_clear_inode_writeback(mapping->host);
}
- if (mapping->host && !mapping_tagged(mapping,
- PAGECACHE_TAG_WRITEBACK))
- sb_clear_inode_writeback(mapping->host);
-
xa_unlock_irqrestore(&mapping->i_pages, flags);
} else {
ret = folio_xor_flags_has_waiters(folio, 1 << PG_writeback);
@@ -3078,7 +3009,6 @@ bool __folio_end_writeback(struct folio *folio)
lruvec_stat_mod_folio(folio, NR_WRITEBACK, -nr);
zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr);
node_stat_mod_folio(folio, NR_WRITTEN, nr);
- folio_memcg_unlock(folio);
return ret;
}
@@ -3090,12 +3020,12 @@ void __folio_start_writeback(struct folio *folio, bool keep_write)
int access_ret;
VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
- folio_memcg_lock(folio);
if (mapping && mapping_use_writeback_tags(mapping)) {
- XA_STATE(xas, &mapping->i_pages, folio_index(folio));
+ XA_STATE(xas, &mapping->i_pages, folio->index);
struct inode *inode = mapping->host;
- struct backing_dev_info *bdi = inode_to_bdi(inode);
+ struct bdi_writeback *wb;
unsigned long flags;
bool on_wblist;
@@ -3106,21 +3036,19 @@ void __folio_start_writeback(struct folio *folio, bool keep_write)
on_wblist = mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK);
xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK);
- if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) {
- struct bdi_writeback *wb = inode_to_wb(inode);
-
- wb_stat_mod(wb, WB_WRITEBACK, nr);
- if (!on_wblist)
- wb_inode_writeback_start(wb);
+ wb = inode_to_wb(inode);
+ wb_stat_mod(wb, WB_WRITEBACK, nr);
+ if (!on_wblist) {
+ wb_inode_writeback_start(wb);
+ /*
+ * We can come through here when swapping anonymous
+ * folios, so we don't necessarily have an inode to
+ * track for sync.
+ */
+ if (mapping->host)
+ sb_mark_inode_writeback(mapping->host);
}
- /*
- * We can come through here when swapping anonymous
- * folios, so we don't necessarily have an inode to
- * track for sync.
- */
- if (mapping->host && !on_wblist)
- sb_mark_inode_writeback(mapping->host);
if (!folio_test_dirty(folio))
xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
if (!keep_write)
@@ -3132,7 +3060,6 @@ void __folio_start_writeback(struct folio *folio, bool keep_write)
lruvec_stat_mod_folio(folio, NR_WRITEBACK, nr);
zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr);
- folio_memcg_unlock(folio);
access_ret = arch_make_folio_accessible(folio);
/*