Diffstat (limited to 'mm/page-writeback.c')
-rw-r--r--  mm/page-writeback.c | 603
1 file changed, 307 insertions, 296 deletions
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 3e19b87049db..72b0ff0d4bae 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -41,6 +41,7 @@ #include <trace/events/writeback.h> #include "internal.h" +#include "swap.h" /* * Sleep at most 200ms at a time in balance_dirty_pages(). @@ -54,7 +55,7 @@ #define DIRTY_POLL_THRESH (128 >> (PAGE_SHIFT - 10)) /* - * Estimate write bandwidth at 200ms intervals. + * Estimate write bandwidth or update dirty limit at 200ms intervals. */ #define BANDWIDTH_INTERVAL max(HZ/5, 1) @@ -120,27 +121,6 @@ EXPORT_SYMBOL(laptop_mode); struct wb_domain global_wb_domain; -/* consolidated parameters for balance_dirty_pages() and its subroutines */ -struct dirty_throttle_control { -#ifdef CONFIG_CGROUP_WRITEBACK - struct wb_domain *dom; - struct dirty_throttle_control *gdtc; /* only set in memcg dtc's */ -#endif - struct bdi_writeback *wb; - struct fprop_local_percpu *wb_completions; - - unsigned long avail; /* dirtyable */ - unsigned long dirty; /* file_dirty + write + nfs */ - unsigned long thresh; /* dirty threshold */ - unsigned long bg_thresh; /* dirty background threshold */ - - unsigned long wb_dirty; /* per-wb counterparts */ - unsigned long wb_thresh; - unsigned long wb_bg_thresh; - - unsigned long pos_ratio; -}; - /* * Length of period for aging writeout fractions of bdis. This is an * arbitrarily chosen number. The longer the period, the slower fractions will @@ -415,13 +395,20 @@ static void domain_dirty_limits(struct dirty_throttle_control *dtc) else bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE; - if (bg_thresh >= thresh) - bg_thresh = thresh / 2; tsk = current; - if (rt_task(tsk)) { + if (rt_or_dl_task(tsk)) { bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32; thresh += thresh / 4 + global_wb_domain.dirty_limit / 32; } + /* + * Dirty throttling logic assumes the limits in page units fit into + * 32-bits. This gives 16TB dirty limits max which is hopefully enough. + */ + if (thresh > UINT_MAX) + thresh = UINT_MAX; + /* This makes sure bg_thresh is within 32-bits as well */ + if (bg_thresh >= thresh) + bg_thresh = thresh / 2; dtc->thresh = thresh; dtc->bg_thresh = bg_thresh; @@ -468,10 +455,14 @@ static unsigned long node_dirty_limit(struct pglist_data *pgdat) else dirty = vm_dirty_ratio * node_memory / 100; - if (rt_task(tsk)) + if (rt_or_dl_task(tsk)) dirty += dirty / 4; - return dirty; + /* + * Dirty throttling logic assumes the limits in page units fit into + * 32-bits. This gives 16TB dirty limits max which is hopefully enough. 
+ */ + return min_t(unsigned long, dirty, UINT_MAX); } /** @@ -493,7 +484,7 @@ bool node_dirty_ok(struct pglist_data *pgdat) } #ifdef CONFIG_SYSCTL -static int dirty_background_ratio_handler(struct ctl_table *table, int write, +static int dirty_background_ratio_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -504,18 +495,25 @@ static int dirty_background_ratio_handler(struct ctl_table *table, int write, return ret; } -static int dirty_background_bytes_handler(struct ctl_table *table, int write, +static int dirty_background_bytes_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; + unsigned long old_bytes = dirty_background_bytes; ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); - if (ret == 0 && write) + if (ret == 0 && write) { + if (DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE) > + UINT_MAX) { + dirty_background_bytes = old_bytes; + return -ERANGE; + } dirty_background_ratio = 0; + } return ret; } -static int dirty_ratio_handler(struct ctl_table *table, int write, void *buffer, +static int dirty_ratio_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int old_ratio = vm_dirty_ratio; @@ -523,13 +521,13 @@ static int dirty_ratio_handler(struct ctl_table *table, int write, void *buffer, ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write && vm_dirty_ratio != old_ratio) { - writeback_set_ratelimit(); vm_dirty_bytes = 0; + writeback_set_ratelimit(); } return ret; } -static int dirty_bytes_handler(struct ctl_table *table, int write, +static int dirty_bytes_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { unsigned long old_bytes = vm_dirty_bytes; @@ -537,6 +535,10 @@ static int dirty_bytes_handler(struct ctl_table *table, int write, ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write && vm_dirty_bytes != old_bytes) { + if (DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) > UINT_MAX) { + vm_dirty_bytes = old_bytes; + return -ERANGE; + } writeback_set_ratelimit(); vm_dirty_ratio = 0; } @@ -562,7 +564,7 @@ static void wb_domain_writeout_add(struct wb_domain *dom, /* First event after period switching was turned off? */ if (unlikely(!dom->period_time)) { /* - * We can race with other __bdi_writeout_inc calls here but + * We can race with other wb_domain_writeout_add calls here but * it does not cause any harm since the resulting time when * timer will fire and what is in writeout_period_time will be * roughly the same. 
@@ -606,7 +608,7 @@ EXPORT_SYMBOL_GPL(wb_writeout_inc); */ static void writeout_period(struct timer_list *t) { - struct wb_domain *dom = from_timer(dom, t, period_timer); + struct wb_domain *dom = timer_container_of(dom, t, period_timer); int miss_periods = (jiffies - dom->period_time) / VM_COMPLETIONS_PERIOD_LEN; @@ -639,7 +641,7 @@ int wb_domain_init(struct wb_domain *dom, gfp_t gfp) #ifdef CONFIG_CGROUP_WRITEBACK void wb_domain_exit(struct wb_domain *dom) { - del_timer_sync(&dom->period_timer); + timer_delete_sync(&dom->period_timer); fprop_global_destroy(&dom->completions); } #endif @@ -668,6 +670,8 @@ static unsigned long bdi_ratio_from_pages(unsigned long pages) unsigned long ratio; global_dirty_limits(&background_thresh, &dirty_thresh); + if (!dirty_thresh) + return -EINVAL; ratio = div64_u64(pages * 100ULL * BDI_RATIO_SCALE, dirty_thresh); return ratio; @@ -766,13 +770,15 @@ int bdi_set_min_bytes(struct backing_dev_info *bdi, u64 min_bytes) { int ret; unsigned long pages = min_bytes >> PAGE_SHIFT; - unsigned long min_ratio; + long min_ratio; ret = bdi_check_pages_limit(pages); if (ret) return ret; min_ratio = bdi_ratio_from_pages(pages); + if (min_ratio < 0) + return min_ratio; return __bdi_set_min_ratio(bdi, min_ratio); } @@ -785,13 +791,15 @@ int bdi_set_max_bytes(struct backing_dev_info *bdi, u64 max_bytes) { int ret; unsigned long pages = max_bytes >> PAGE_SHIFT; - unsigned long max_ratio; + long max_ratio; ret = bdi_check_pages_limit(pages); if (ret) return ret; max_ratio = bdi_ratio_from_pages(pages); + if (max_ratio < 0) + return max_ratio; return __bdi_set_max_ratio(bdi, max_ratio); } @@ -837,14 +845,44 @@ static void mdtc_calc_avail(struct dirty_throttle_control *mdtc, mdtc->avail = filepages + min(headroom, other_clean); } +static inline bool dtc_is_global(struct dirty_throttle_control *dtc) +{ + return mdtc_gdtc(dtc) == NULL; +} + +/* + * Dirty background will ignore pages being written as we're trying to + * decide whether to put more under writeback. + */ +static void domain_dirty_avail(struct dirty_throttle_control *dtc, + bool include_writeback) +{ + if (dtc_is_global(dtc)) { + dtc->avail = global_dirtyable_memory(); + dtc->dirty = global_node_page_state(NR_FILE_DIRTY); + if (include_writeback) + dtc->dirty += global_node_page_state(NR_WRITEBACK); + } else { + unsigned long filepages = 0, headroom = 0, writeback = 0; + + mem_cgroup_wb_stats(dtc->wb, &filepages, &headroom, &dtc->dirty, + &writeback); + if (include_writeback) + dtc->dirty += writeback; + mdtc_calc_avail(dtc, filepages, headroom); + } +} + /** - * __wb_calc_thresh - @wb's share of dirty throttling threshold + * __wb_calc_thresh - @wb's share of dirty threshold * @dtc: dirty_throttle_context of interest + * @thresh: dirty throttling or dirty background threshold of wb_domain in @dtc * - * Note that balance_dirty_pages() will only seriously take it as a hard limit - * when sleeping max_pause per page is not enough to keep the dirty pages under - * control. For example, when the device is completely stalled due to some error - * conditions, or when there are 1000 dd tasks writing to a slow 10MB/s USB key. + * Note that balance_dirty_pages() will only seriously take dirty throttling + * threshold as a hard limit when sleeping max_pause per page is not enough + * to keep the dirty pages under control. For example, when the device is + * completely stalled due to some error conditions, or when there are 1000 + * dd tasks writing to a slow 10MB/s USB key. 
* In the other normal situations, it acts more gently by throttling the tasks * more (rather than completely block them) when the wb dirty pages go high. * @@ -855,19 +893,22 @@ static void mdtc_calc_avail(struct dirty_throttle_control *mdtc, * The wb's share of dirty limit will be adapting to its throughput and * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set. * - * Return: @wb's dirty limit in pages. The term "dirty" in the context of - * dirty balancing includes all PG_dirty and PG_writeback pages. + * Return: @wb's dirty limit in pages. For dirty throttling limit, the term + * "dirty" in the context of dirty balancing includes all PG_dirty and + * PG_writeback pages. */ -static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc) +static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc, + unsigned long thresh) { struct wb_domain *dom = dtc_dom(dtc); - unsigned long thresh = dtc->thresh; + struct bdi_writeback *wb = dtc->wb; u64 wb_thresh; + u64 wb_max_thresh; unsigned long numerator, denominator; unsigned long wb_min_ratio, wb_max_ratio; /* - * Calculate this BDI's share of the thresh ratio. + * Calculate this wb's share of the thresh ratio. */ fprop_fraction_percpu(&dom->completions, dtc->wb_completions, &numerator, &denominator); @@ -876,20 +917,49 @@ static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc) wb_thresh *= numerator; wb_thresh = div64_ul(wb_thresh, denominator); - wb_min_max_ratio(dtc->wb, &wb_min_ratio, &wb_max_ratio); + wb_min_max_ratio(wb, &wb_min_ratio, &wb_max_ratio); wb_thresh += (thresh * wb_min_ratio) / (100 * BDI_RATIO_SCALE); - if (wb_thresh > (thresh * wb_max_ratio) / (100 * BDI_RATIO_SCALE)) - wb_thresh = thresh * wb_max_ratio / (100 * BDI_RATIO_SCALE); + + /* + * It's very possible that wb_thresh is close to 0 not because the + * device is slow, but that it has remained inactive for long time. + * Honour such devices a reasonable good (hopefully IO efficient) + * threshold, so that the occasional writes won't be blocked and active + * writes can rampup the threshold quickly. 
+ */ + if (thresh > dtc->dirty) { + if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) + wb_thresh = max(wb_thresh, (thresh - dtc->dirty) / 100); + else + wb_thresh = max(wb_thresh, (thresh - dtc->dirty) / 8); + } + + wb_max_thresh = thresh * wb_max_ratio / (100 * BDI_RATIO_SCALE); + if (wb_thresh > wb_max_thresh) + wb_thresh = wb_max_thresh; return wb_thresh; } unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh) { - struct dirty_throttle_control gdtc = { GDTC_INIT(wb), - .thresh = thresh }; - return __wb_calc_thresh(&gdtc); + struct dirty_throttle_control gdtc = { GDTC_INIT(wb) }; + + domain_dirty_avail(&gdtc, true); + return __wb_calc_thresh(&gdtc, thresh); +} + +unsigned long cgwb_calc_thresh(struct bdi_writeback *wb) +{ + struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB }; + struct dirty_throttle_control mdtc = { MDTC_INIT(wb, &gdtc) }; + + domain_dirty_avail(&gdtc, true); + domain_dirty_avail(&mdtc, true); + domain_dirty_limits(&mdtc); + + return __wb_calc_thresh(&mdtc, mdtc.thresh); } /* @@ -1003,7 +1073,7 @@ static void wb_position_ratio(struct dirty_throttle_control *dtc) struct bdi_writeback *wb = dtc->wb; unsigned long write_bw = READ_ONCE(wb->avg_write_bandwidth); unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh); - unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh); + unsigned long limit = dtc->limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh); unsigned long wb_thresh = dtc->wb_thresh; unsigned long x_intercept; unsigned long setpoint; /* dirty pages' target balance point */ @@ -1053,12 +1123,6 @@ static void wb_position_ratio(struct dirty_throttle_control *dtc) if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) { long long wb_pos_ratio; - if (dtc->wb_dirty < 8) { - dtc->pos_ratio = min_t(long long, pos_ratio * 2, - 2 << RATELIMIT_CALC_SHIFT); - return; - } - if (dtc->wb_dirty >= wb_thresh) return; @@ -1130,14 +1194,6 @@ static void wb_position_ratio(struct dirty_throttle_control *dtc) if (unlikely(wb_thresh > dtc->thresh)) wb_thresh = dtc->thresh; /* - * It's very possible that wb_thresh is close to 0 not because the - * device is slow, but that it has remained inactive for long time. - * Honour such devices a reasonable good (hopefully IO efficient) - * threshold, so that the occasional writes won't be blocked and active - * writes can rampup the threshold quickly. - */ - wb_thresh = max(wb_thresh, (limit - dtc->dirty) / 8); - /* * scale global setpoint to wb's: * wb_setpoint = setpoint * wb_thresh / thresh */ @@ -1392,17 +1448,10 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc, * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate). * Hence, to calculate "step" properly, we have to use wb_dirty as * "dirty" and wb_setpoint as "setpoint". - * - * We rampup dirty_ratelimit forcibly if wb_dirty is low because - * it's possible that wb_thresh is close to zero due to inactivity - * of backing device. */ if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) { dirty = dtc->wb_dirty; - if (dtc->wb_dirty < 8) - setpoint = dtc->wb_dirty + 1; - else - setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2; + setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2; } if (dirty < setpoint) { @@ -1636,9 +1685,9 @@ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc) * wb_position_ratio() will let the dirtier task progress * at some rate <= (write_bw / 2) for bringing down wb_dirty. 
*/ - dtc->wb_thresh = __wb_calc_thresh(dtc); + dtc->wb_thresh = __wb_calc_thresh(dtc, dtc->thresh); dtc->wb_bg_thresh = dtc->thresh ? - div64_u64(dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0; + div_u64((u64)dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0; /* * In order to avoid the stacked BDI deadlock we need @@ -1659,6 +1708,100 @@ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc) } } +static unsigned long domain_poll_intv(struct dirty_throttle_control *dtc, + bool strictlimit) +{ + unsigned long dirty, thresh; + + if (strictlimit) { + dirty = dtc->wb_dirty; + thresh = dtc->wb_thresh; + } else { + dirty = dtc->dirty; + thresh = dtc->thresh; + } + + return dirty_poll_interval(dirty, thresh); +} + +/* + * Throttle it only when the background writeback cannot catch-up. This avoids + * (excessively) small writeouts when the wb limits are ramping up in case of + * !strictlimit. + * + * In strictlimit case make decision based on the wb counters and limits. Small + * writeouts when the wb limits are ramping up are the price we consciously pay + * for strictlimit-ing. + */ +static void domain_dirty_freerun(struct dirty_throttle_control *dtc, + bool strictlimit) +{ + unsigned long dirty, thresh, bg_thresh; + + if (unlikely(strictlimit)) { + wb_dirty_limits(dtc); + dirty = dtc->wb_dirty; + thresh = dtc->wb_thresh; + bg_thresh = dtc->wb_bg_thresh; + } else { + dirty = dtc->dirty; + thresh = dtc->thresh; + bg_thresh = dtc->bg_thresh; + } + dtc->freerun = dirty <= dirty_freerun_ceiling(thresh, bg_thresh); +} + +static void balance_domain_limits(struct dirty_throttle_control *dtc, + bool strictlimit) +{ + domain_dirty_avail(dtc, true); + domain_dirty_limits(dtc); + domain_dirty_freerun(dtc, strictlimit); +} + +static void wb_dirty_freerun(struct dirty_throttle_control *dtc, + bool strictlimit) +{ + dtc->freerun = false; + + /* was already handled in domain_dirty_freerun */ + if (strictlimit) + return; + + wb_dirty_limits(dtc); + /* + * LOCAL_THROTTLE tasks must not be throttled when below the per-wb + * freerun ceiling. + */ + if (!(current->flags & PF_LOCAL_THROTTLE)) + return; + + dtc->freerun = dtc->wb_dirty < + dirty_freerun_ceiling(dtc->wb_thresh, dtc->wb_bg_thresh); +} + +static inline void wb_dirty_exceeded(struct dirty_throttle_control *dtc, + bool strictlimit) +{ + dtc->dirty_exceeded = (dtc->wb_dirty > dtc->wb_thresh) && + ((dtc->dirty > dtc->thresh) || strictlimit); +} + +/* + * The limits fields dirty_exceeded and pos_ratio won't be updated if wb is + * in freerun state. Please don't use these invalid fields in freerun case. + */ +static void balance_wb_limits(struct dirty_throttle_control *dtc, + bool strictlimit) +{ + wb_dirty_freerun(dtc, strictlimit); + if (dtc->freerun) + return; + + wb_dirty_exceeded(dtc, strictlimit); + wb_position_ratio(dtc); +} + /* * balance_dirty_pages() must be called by processes which are generating dirty * data. It looks at the number of dirty pages in the machine and will force @@ -1675,13 +1818,12 @@ static int balance_dirty_pages(struct bdi_writeback *wb, struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ? 
&mdtc_stor : NULL; struct dirty_throttle_control *sdtc; - unsigned long nr_reclaimable; /* = file_dirty */ + unsigned long nr_dirty; long period; long pause; long max_pause; long min_pause; int nr_dirtied_pause; - bool dirty_exceeded = false; unsigned long task_ratelimit; unsigned long dirty_ratelimit; struct backing_dev_info *bdi = wb->bdi; @@ -1691,53 +1833,16 @@ static int balance_dirty_pages(struct bdi_writeback *wb, for (;;) { unsigned long now = jiffies; - unsigned long dirty, thresh, bg_thresh; - unsigned long m_dirty = 0; /* stop bogus uninit warnings */ - unsigned long m_thresh = 0; - unsigned long m_bg_thresh = 0; - - nr_reclaimable = global_node_page_state(NR_FILE_DIRTY); - gdtc->avail = global_dirtyable_memory(); - gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK); - domain_dirty_limits(gdtc); - - if (unlikely(strictlimit)) { - wb_dirty_limits(gdtc); - - dirty = gdtc->wb_dirty; - thresh = gdtc->wb_thresh; - bg_thresh = gdtc->wb_bg_thresh; - } else { - dirty = gdtc->dirty; - thresh = gdtc->thresh; - bg_thresh = gdtc->bg_thresh; - } + nr_dirty = global_node_page_state(NR_FILE_DIRTY); + balance_domain_limits(gdtc, strictlimit); if (mdtc) { - unsigned long filepages, headroom, writeback; - /* * If @wb belongs to !root memcg, repeat the same * basic calculations for the memcg domain. */ - mem_cgroup_wb_stats(wb, &filepages, &headroom, - &mdtc->dirty, &writeback); - mdtc->dirty += writeback; - mdtc_calc_avail(mdtc, filepages, headroom); - - domain_dirty_limits(mdtc); - - if (unlikely(strictlimit)) { - wb_dirty_limits(mdtc); - m_dirty = mdtc->wb_dirty; - m_thresh = mdtc->wb_thresh; - m_bg_thresh = mdtc->wb_bg_thresh; - } else { - m_dirty = mdtc->dirty; - m_thresh = mdtc->thresh; - m_bg_thresh = mdtc->bg_thresh; - } + balance_domain_limits(mdtc, strictlimit); } /* @@ -1749,36 +1854,26 @@ static int balance_dirty_pages(struct bdi_writeback *wb, * In normal mode, we start background writeout at the lower * background_thresh, to keep the amount of dirty memory low. */ - if (!laptop_mode && nr_reclaimable > gdtc->bg_thresh && + if (!laptop_mode && nr_dirty > gdtc->bg_thresh && !writeback_in_progress(wb)) wb_start_background_writeback(wb); /* - * Throttle it only when the background writeback cannot - * catch-up. This avoids (excessively) small writeouts - * when the wb limits are ramping up in case of !strictlimit. - * - * In strictlimit case make decision based on the wb counters - * and limits. Small writeouts when the wb limits are ramping - * up are the price we consciously pay for strictlimit-ing. - * * If memcg domain is in effect, @dirty should be under * both global and memcg freerun ceilings. */ - if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) && - (!mdtc || - m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) { + if (gdtc->freerun && (!mdtc || mdtc->freerun)) { unsigned long intv; unsigned long m_intv; free_running: - intv = dirty_poll_interval(dirty, thresh); + intv = domain_poll_intv(gdtc, strictlimit); m_intv = ULONG_MAX; current->dirty_paused_when = now; current->nr_dirtied = 0; if (mdtc) - m_intv = dirty_poll_interval(m_dirty, m_thresh); + m_intv = domain_poll_intv(mdtc, strictlimit); current->nr_dirtied_pause = min(intv, m_intv); break; } @@ -1793,24 +1888,9 @@ free_running: * Calculate global domain's pos_ratio and select the * global dtc by default. 
*/ - if (!strictlimit) { - wb_dirty_limits(gdtc); - - if ((current->flags & PF_LOCAL_THROTTLE) && - gdtc->wb_dirty < - dirty_freerun_ceiling(gdtc->wb_thresh, - gdtc->wb_bg_thresh)) - /* - * LOCAL_THROTTLE tasks must not be throttled - * when below the per-wb freerun ceiling. - */ - goto free_running; - } - - dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) && - ((gdtc->dirty > gdtc->thresh) || strictlimit); - - wb_position_ratio(gdtc); + balance_wb_limits(gdtc, strictlimit); + if (gdtc->freerun) + goto free_running; sdtc = gdtc; if (mdtc) { @@ -1820,31 +1900,15 @@ free_running: * both global and memcg domains. Choose the one * w/ lower pos_ratio. */ - if (!strictlimit) { - wb_dirty_limits(mdtc); - - if ((current->flags & PF_LOCAL_THROTTLE) && - mdtc->wb_dirty < - dirty_freerun_ceiling(mdtc->wb_thresh, - mdtc->wb_bg_thresh)) - /* - * LOCAL_THROTTLE tasks must not be - * throttled when below the per-wb - * freerun ceiling. - */ - goto free_running; - } - dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) && - ((mdtc->dirty > mdtc->thresh) || strictlimit); - - wb_position_ratio(mdtc); + balance_wb_limits(mdtc, strictlimit); + if (mdtc->freerun) + goto free_running; if (mdtc->pos_ratio < gdtc->pos_ratio) sdtc = mdtc; } - if (dirty_exceeded != wb->dirty_exceeded) - wb->dirty_exceeded = dirty_exceeded; - + wb->dirty_exceeded = gdtc->dirty_exceeded || + (mdtc && mdtc->dirty_exceeded); if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) + BANDWIDTH_INTERVAL)) __wb_update_bandwidth(gdtc, mdtc, true); @@ -1876,11 +1940,7 @@ free_running: */ if (pause < min_pause) { trace_balance_dirty_pages(wb, - sdtc->thresh, - sdtc->bg_thresh, - sdtc->dirty, - sdtc->wb_thresh, - sdtc->wb_dirty, + sdtc, dirty_ratelimit, task_ratelimit, pages_dirtied, @@ -1905,11 +1965,7 @@ free_running: pause: trace_balance_dirty_pages(wb, - sdtc->thresh, - sdtc->bg_thresh, - sdtc->dirty, - sdtc->wb_thresh, - sdtc->wb_dirty, + sdtc, dirty_ratelimit, task_ratelimit, pages_dirtied, @@ -2065,6 +2121,35 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping) } EXPORT_SYMBOL(balance_dirty_pages_ratelimited); +/* + * Similar to wb_dirty_limits, wb_bg_dirty_limits also calculates dirty + * and thresh, but it's for background writeback. + */ +static void wb_bg_dirty_limits(struct dirty_throttle_control *dtc) +{ + struct bdi_writeback *wb = dtc->wb; + + dtc->wb_bg_thresh = __wb_calc_thresh(dtc, dtc->bg_thresh); + if (dtc->wb_bg_thresh < 2 * wb_stat_error()) + dtc->wb_dirty = wb_stat_sum(wb, WB_RECLAIMABLE); + else + dtc->wb_dirty = wb_stat(wb, WB_RECLAIMABLE); +} + +static bool domain_over_bg_thresh(struct dirty_throttle_control *dtc) +{ + domain_dirty_avail(dtc, false); + domain_dirty_limits(dtc); + if (dtc->dirty > dtc->bg_thresh) + return true; + + wb_bg_dirty_limits(dtc); + if (dtc->wb_dirty > dtc->wb_bg_thresh) + return true; + + return false; +} + /** * wb_over_bg_thresh - does @wb need to be written back? * @wb: bdi_writeback of interest @@ -2076,54 +2161,14 @@ EXPORT_SYMBOL(balance_dirty_pages_ratelimited); */ bool wb_over_bg_thresh(struct bdi_writeback *wb) { - struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) }; - struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) }; - struct dirty_throttle_control * const gdtc = &gdtc_stor; - struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ? 
- &mdtc_stor : NULL; - unsigned long reclaimable; - unsigned long thresh; - - /* - * Similar to balance_dirty_pages() but ignores pages being written - * as we're trying to decide whether to put more under writeback. - */ - gdtc->avail = global_dirtyable_memory(); - gdtc->dirty = global_node_page_state(NR_FILE_DIRTY); - domain_dirty_limits(gdtc); - - if (gdtc->dirty > gdtc->bg_thresh) - return true; - - thresh = wb_calc_thresh(gdtc->wb, gdtc->bg_thresh); - if (thresh < 2 * wb_stat_error()) - reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE); - else - reclaimable = wb_stat(wb, WB_RECLAIMABLE); + struct dirty_throttle_control gdtc = { GDTC_INIT(wb) }; + struct dirty_throttle_control mdtc = { MDTC_INIT(wb, &gdtc) }; - if (reclaimable > thresh) + if (domain_over_bg_thresh(&gdtc)) return true; - if (mdtc) { - unsigned long filepages, headroom, writeback; - - mem_cgroup_wb_stats(wb, &filepages, &headroom, &mdtc->dirty, - &writeback); - mdtc_calc_avail(mdtc, filepages, headroom); - domain_dirty_limits(mdtc); /* ditto, ignore writeback */ - - if (mdtc->dirty > mdtc->bg_thresh) - return true; - - thresh = wb_calc_thresh(mdtc->wb, mdtc->bg_thresh); - if (thresh < 2 * wb_stat_error()) - reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE); - else - reclaimable = wb_stat(wb, WB_RECLAIMABLE); - - if (reclaimable > thresh) - return true; - } + if (mdtc_valid(&mdtc)) + return domain_over_bg_thresh(&mdtc); return false; } @@ -2132,7 +2177,7 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb) /* * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs */ -static int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, +static int dirty_writeback_centisecs_handler(const struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { unsigned int old_interval = dirty_writeback_interval; @@ -2158,7 +2203,7 @@ static int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, void laptop_mode_timer_fn(struct timer_list *t) { struct backing_dev_info *backing_dev_info = - from_timer(backing_dev_info, t, laptop_mode_wb_timer); + timer_container_of(backing_dev_info, t, laptop_mode_wb_timer); wakeup_flusher_threads_bdi(backing_dev_info, WB_REASON_LAPTOP_TIMER); } @@ -2185,7 +2230,7 @@ void laptop_sync_completion(void) rcu_read_lock(); list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) - del_timer(&bdi->laptop_mode_wb_timer); + timer_delete(&bdi->laptop_mode_wb_timer); rcu_read_unlock(); } @@ -2223,7 +2268,7 @@ static int page_writeback_cpu_online(unsigned int cpu) /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ static const unsigned long dirty_bytes_min = 2 * PAGE_SIZE; -static struct ctl_table vm_page_writeback_sysctls[] = { +static const struct ctl_table vm_page_writeback_sysctls[] = { { .procname = "dirty_background_ratio", .data = &dirty_background_ratio, @@ -2291,7 +2336,6 @@ static struct ctl_table vm_page_writeback_sysctls[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - {} }; #endif @@ -2521,11 +2565,11 @@ struct folio *writeback_iter(struct address_space *mapping, if (!folio) { /* * To avoid deadlocks between range_cyclic writeback and callers - * that hold pages in PageWriteback to aggregate I/O until + * that hold folios in writeback to aggregate I/O until * the writeback iteration finishes, we do not loop back to the - * start of the file. Doing so causes a page lock/page + * start of the file. 
Doing so causes a folio lock/folio * writeback access order inversion - we should only ever lock - * multiple pages in ascending page->index order, and looping + * multiple folios in ascending folio->index order, and looping * back to the start of the file violates that rule and causes * deadlocks. */ @@ -2542,10 +2586,11 @@ struct folio *writeback_iter(struct address_space *mapping, done: if (wbc->range_cyclic) - mapping->writeback_index = folio->index + folio_nr_pages(folio); + mapping->writeback_index = folio_next_index(folio); folio_batch_release(&wbc->fbatch); return NULL; } +EXPORT_SYMBOL_GPL(writeback_iter); /** * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. @@ -2577,27 +2622,6 @@ int write_cache_pages(struct address_space *mapping, } EXPORT_SYMBOL(write_cache_pages); -static int writeback_use_writepage(struct address_space *mapping, - struct writeback_control *wbc) -{ - struct folio *folio = NULL; - struct blk_plug plug; - int err; - - blk_start_plug(&plug); - while ((folio = writeback_iter(mapping, wbc, folio, &err))) { - err = mapping->a_ops->writepage(&folio->page, wbc); - if (err == AOP_WRITEPAGE_ACTIVATE) { - folio_unlock(folio); - err = 0; - } - mapping_set_error(mapping, err); - } - blk_finish_plug(&plug); - - return err; -} - int do_writepages(struct address_space *mapping, struct writeback_control *wbc) { int ret; @@ -2608,14 +2632,11 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc) wb = inode_to_wb_wbc(mapping->host, wbc); wb_bandwidth_estimate_start(wb); while (1) { - if (mapping->a_ops->writepages) { + if (mapping->a_ops->writepages) ret = mapping->a_ops->writepages(mapping, wbc); - } else if (mapping->a_ops->writepage) { - ret = writeback_use_writepage(mapping, wbc); - } else { + else /* deal with chardevs and other special files */ ret = 0; - } if (ret != -ENOMEM || wbc->sync_mode != WB_SYNC_ALL) break; @@ -2653,8 +2674,6 @@ EXPORT_SYMBOL(noop_dirty_folio); /* * Helper function for set_page_dirty family. * - * Caller must hold folio_memcg_lock(). - * * NOTE: This relies on being atomic wrt interrupts. */ static void folio_account_dirtied(struct folio *folio, @@ -2687,7 +2706,6 @@ static void folio_account_dirtied(struct folio *folio, /* * Helper function for deaccounting dirty page without writeback. * - * Caller must hold folio_memcg_lock(). */ void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb) { @@ -2700,17 +2718,19 @@ void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb) } /* - * Mark the folio dirty, and set it dirty in the page cache, and mark - * the inode dirty. + * Mark the folio dirty, and set it dirty in the page cache. * * If warn is true, then emit a warning if the folio is not uptodate and has * not been truncated. * - * The caller must hold folio_memcg_lock(). Most callers have the folio - * locked. A few have the folio blocked from truncation through other - * means (eg zap_vma_pages() has it mapped and is holding the page table - * lock). This can also be called from mark_buffer_dirty(), which I - * cannot prove is always protected against truncate. + * It is the caller's responsibility to prevent the folio from being truncated + * while this function is in progress, although it may have been truncated + * before this function is called. Most callers have the folio locked. + * A few have the folio blocked from truncation through other means (e.g. + * zap_vma_pages() has it mapped and is holding the page table lock). 
+ * When called from mark_buffer_dirty(), the filesystem should hold a + * reference to the buffer_head that is being marked dirty, which causes + * try_to_free_buffers() to fail. */ void __folio_mark_dirty(struct folio *folio, struct address_space *mapping, int warn) @@ -2748,14 +2768,10 @@ void __folio_mark_dirty(struct folio *folio, struct address_space *mapping, */ bool filemap_dirty_folio(struct address_space *mapping, struct folio *folio) { - folio_memcg_lock(folio); - if (folio_test_set_dirty(folio)) { - folio_memcg_unlock(folio); + if (folio_test_set_dirty(folio)) return false; - } __folio_mark_dirty(folio, mapping, !folio_test_private(folio)); - folio_memcg_unlock(folio); if (mapping->host) { /* !PageAnon && !swapper_space */ @@ -2840,25 +2856,25 @@ bool folio_mark_dirty(struct folio *folio) EXPORT_SYMBOL(folio_mark_dirty); /* - * set_page_dirty() is racy if the caller has no reference against - * page->mapping->host, and if the page is unlocked. This is because another - * CPU could truncate the page off the mapping and then free the mapping. + * folio_mark_dirty() is racy if the caller has no reference against + * folio->mapping->host, and if the folio is unlocked. This is because another + * CPU could truncate the folio off the mapping and then free the mapping. * - * Usually, the page _is_ locked, or the caller is a user-space process which + * Usually, the folio _is_ locked, or the caller is a user-space process which * holds a reference on the inode by having an open file. * - * In other cases, the page should be locked before running set_page_dirty(). + * In other cases, the folio should be locked before running folio_mark_dirty(). */ -int set_page_dirty_lock(struct page *page) +bool folio_mark_dirty_lock(struct folio *folio) { - int ret; + bool ret; - lock_page(page); - ret = set_page_dirty(page); - unlock_page(page); + folio_lock(folio); + ret = folio_mark_dirty(folio); + folio_unlock(folio); return ret; } -EXPORT_SYMBOL(set_page_dirty_lock); +EXPORT_SYMBOL(folio_mark_dirty_lock); /* * This cancels just the dirty bit on the kernel page itself, it does NOT @@ -2882,14 +2898,12 @@ void __folio_cancel_dirty(struct folio *folio) struct bdi_writeback *wb; struct wb_lock_cookie cookie = {}; - folio_memcg_lock(folio); wb = unlocked_inode_to_wb_begin(inode, &cookie); if (folio_test_clear_dirty(folio)) folio_account_cleaned(folio, wb); unlocked_inode_to_wb_end(inode, &cookie); - folio_memcg_unlock(folio); } else { folio_clear_dirty(folio); } @@ -3000,7 +3014,6 @@ bool __folio_end_writeback(struct folio *folio) struct address_space *mapping = folio_mapping(folio); bool ret; - folio_memcg_lock(folio); if (mapping && mapping_use_writeback_tags(mapping)) { struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); @@ -3031,7 +3044,6 @@ bool __folio_end_writeback(struct folio *folio) lruvec_stat_mod_folio(folio, NR_WRITEBACK, -nr); zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr); node_stat_mod_folio(folio, NR_WRITTEN, nr); - folio_memcg_unlock(folio); return ret; } @@ -3043,8 +3055,8 @@ void __folio_start_writeback(struct folio *folio, bool keep_write) int access_ret; VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); - folio_memcg_lock(folio); if (mapping && mapping_use_writeback_tags(mapping)) { XA_STATE(xas, &mapping->i_pages, folio_index(folio)); struct inode *inode = mapping->host; @@ -3085,7 +3097,6 @@ void __folio_start_writeback(struct folio *folio, bool keep_write) 
lruvec_stat_mod_folio(folio, NR_WRITEBACK, nr); zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr); - folio_memcg_unlock(folio); access_ret = arch_make_folio_accessible(folio); /*
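
A note on the new 32-bit clamp: the hunks in domain_dirty_limits(), node_dirty_limit() and the dirty_bytes / dirty_background_bytes sysctl handlers all enforce that dirty limits expressed in pages fit into 32 bits, which with 4 KiB pages works out to the "16TB" ceiling mentioned in the added comments (UINT_MAX pages of 4 KiB is just under 16 TiB). The following is an illustrative user-space sketch of that arithmetic, not kernel code and not part of the diff; PAGE_SIZE and the function name are stand-ins and a 4 KiB page size is assumed.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096ULL	/* assumption: 4 KiB pages */

/*
 * Mirrors the DIV_ROUND_UP(bytes, PAGE_SIZE) > UINT_MAX check that makes
 * the byte-based sysctl handlers restore the old value and return -ERANGE.
 */
static int dirty_bytes_in_range(unsigned long long bytes)
{
	unsigned long long pages = (bytes + PAGE_SIZE - 1) / PAGE_SIZE;

	return pages <= UINT32_MAX;
}

int main(void)
{
	/* Largest accepted value: UINT_MAX pages, i.e. 16 TiB minus one page. */
	unsigned long long max_ok = (unsigned long long)UINT32_MAX * PAGE_SIZE;

	printf("max_ok accepted:     %d\n", dirty_bytes_in_range(max_ok));	/* 1 */
	printf("max_ok + 1 accepted: %d\n", dirty_bytes_in_range(max_ok + 1));	/* 0 */
	return 0;
}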
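
The "honour inactive devices a reasonable threshold" logic moves out of wb_position_ratio() into __wb_calc_thresh(), and the related wb_dirty < 8 special cases in the strictlimit paths are dropped: while the domain is below its threshold, an idle wb whose completion fraction has decayed toward zero is still granted at least (thresh - dirty) / 8 pages, or (thresh - dirty) / 100 under BDI_CAP_STRICTLIMIT, before the wb_max_ratio cap is applied. Below is a sketch of just that arithmetic with made-up numbers, not kernel code; wb_max_thresh stands in for thresh * wb_max_ratio / (100 * BDI_RATIO_SCALE).

#include <stdio.h>

static unsigned long wb_thresh_with_floor(unsigned long wb_thresh,
					  unsigned long thresh,
					  unsigned long dirty,
					  unsigned long wb_max_thresh,
					  int strictlimit)
{
	if (thresh > dirty) {
		/* ramp-up floor for devices that have been idle for a while */
		unsigned long floor = (thresh - dirty) / (strictlimit ? 100 : 8);

		if (wb_thresh < floor)
			wb_thresh = floor;
	}
	if (wb_thresh > wb_max_thresh)	/* still bounded by bdi->max_ratio */
		wb_thresh = wb_max_thresh;
	return wb_thresh;
}

int main(void)
{
	/* 100000-page domain threshold, 20000 pages dirty, idle wb share ~0 */
	printf("%lu\n", wb_thresh_with_floor(0, 100000, 20000, 50000, 0));	/* 10000 */
	printf("%lu\n", wb_thresh_with_floor(0, 100000, 20000, 50000, 1));	/* 800 */
	return 0;
}

With these made-up numbers an idle device can absorb roughly 10000 dirty pages (800 in strictlimit mode) before throttling kicks in, instead of being blocked on the first occasional write.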
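
balance_dirty_pages() also stops open-coding the freerun test: domain_dirty_freerun() and wb_dirty_freerun() compute it once and cache the result in the new dtc->freerun field, using the per-wb counters in strictlimit mode and the domain-wide ones otherwise. dirty_freerun_ceiling() itself (not touched by this diff) is the midpoint of the background and hard thresholds. Here is an illustrative sketch of the decision, not kernel code, using a trimmed hypothetical stand-in for struct dirty_throttle_control and omitting the wb_dirty_limits() refresh done in the real strictlimit path.

#include <stdbool.h>
#include <stdio.h>

struct dtc_sketch {	/* trimmed stand-in for dirty_throttle_control */
	unsigned long dirty, thresh, bg_thresh;
	unsigned long wb_dirty, wb_thresh, wb_bg_thresh;
	bool freerun;
};

static unsigned long freerun_ceiling(unsigned long thresh, unsigned long bg)
{
	return (thresh + bg) / 2;	/* midpoint between bg_thresh and thresh */
}

static void domain_dirty_freerun_sketch(struct dtc_sketch *dtc, bool strictlimit)
{
	unsigned long dirty = strictlimit ? dtc->wb_dirty : dtc->dirty;
	unsigned long thresh = strictlimit ? dtc->wb_thresh : dtc->thresh;
	unsigned long bg = strictlimit ? dtc->wb_bg_thresh : dtc->bg_thresh;

	dtc->freerun = dirty <= freerun_ceiling(thresh, bg);
}

int main(void)
{
	struct dtc_sketch dtc = { .dirty = 700, .thresh = 1000, .bg_thresh = 500 };

	domain_dirty_freerun_sketch(&dtc, false);
	printf("freerun: %d\n", dtc.freerun);	/* 1: 700 <= (1000 + 500) / 2 */
	return 0;
}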
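
Finally, wb_over_bg_thresh() is reduced to two calls of the new domain_over_bg_thresh(), which works on dirty counts that exclude pages under writeback (domain_dirty_avail(dtc, false)) and only falls back to the per-wb reclaimable share computed by wb_bg_dirty_limits() when the domain as a whole is still under its background threshold. A minimal sketch of that two-level decision follows; it is not kernel code, and the struct fields and values are stand-ins.

#include <stdbool.h>
#include <stdio.h>

struct bg_sketch {
	unsigned long dirty, bg_thresh;		/* domain-wide, writeback excluded */
	unsigned long wb_dirty, wb_bg_thresh;	/* this wb's reclaimable pages and share */
};

static bool domain_over_bg_thresh_sketch(const struct bg_sketch *s)
{
	if (s->dirty > s->bg_thresh)
		return true;			/* whole domain above background */
	return s->wb_dirty > s->wb_bg_thresh;	/* this wb alone above its share */
}

int main(void)
{
	struct bg_sketch s = { .dirty = 400, .bg_thresh = 500,
			       .wb_dirty = 120, .wb_bg_thresh = 100 };

	/* domain is below background, but this wb's share is exceeded */
	printf("over bg: %d\n", domain_over_bg_thresh_sketch(&s));	/* 1 */
	return 0;
}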