diff options
Diffstat (limited to 'mm/page-writeback.c')
| -rw-r--r-- | mm/page-writeback.c | 1977 |
1 files changed, 1149 insertions, 828 deletions
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7d1010453fb9..ccdeb0e84d39 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * mm/page-writeback.c * @@ -12,6 +13,7 @@ */ #include <linux/kernel.h> +#include <linux/math64.h> #include <linux/export.h> #include <linux/spinlock.h> #include <linux/fs.h> @@ -31,12 +33,12 @@ #include <linux/sysctl.h> #include <linux/cpu.h> #include <linux/syscalls.h> -#include <linux/buffer_head.h> /* __set_page_dirty_buffers */ #include <linux/pagevec.h> #include <linux/timer.h> #include <linux/sched/rt.h> #include <linux/sched/signal.h> #include <linux/mm_inline.h> +#include <linux/shmem_fs.h> #include <trace/events/writeback.h> #include "internal.h" @@ -53,7 +55,7 @@ #define DIRTY_POLL_THRESH (128 >> (PAGE_SHIFT - 10)) /* - * Estimate write bandwidth at 200ms intervals. + * Estimate write bandwidth or update dirty limit at 200ms intervals. */ #define BANDWIDTH_INTERVAL max(HZ/5, 1) @@ -70,30 +72,30 @@ static long ratelimit_pages = 32; /* * Start background writeback (via writeback threads) at this percentage */ -int dirty_background_ratio = 10; +static int dirty_background_ratio = 10; /* * dirty_background_bytes starts at 0 (disabled) so that it is a function of * dirty_background_ratio * the amount of dirtyable memory */ -unsigned long dirty_background_bytes; +static unsigned long dirty_background_bytes; /* * free highmem will not be subtracted from the total free memory * for calculating free ratios if vm_highmem_is_dirtyable is true */ -int vm_highmem_is_dirtyable; +static int vm_highmem_is_dirtyable; /* * The generator of dirty data starts writeback at this percentage */ -int vm_dirty_ratio = 20; +static int vm_dirty_ratio = 20; /* * vm_dirty_bytes starts at 0 (disabled) so that it is a function of * vm_dirty_ratio * the amount of dirtyable memory */ -unsigned long vm_dirty_bytes; +static unsigned long vm_dirty_bytes; /* * The interval between `kupdate'-style writebacks @@ -108,11 +110,6 @@ EXPORT_SYMBOL_GPL(dirty_writeback_interval); unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */ /* - * Flag that makes the machine dump writes/reads and block dirtyings. - */ -int block_dump; - -/* * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies: * a full sync is triggered after this time elapses without any disk activity. */ @@ -124,27 +121,6 @@ EXPORT_SYMBOL(laptop_mode); struct wb_domain global_wb_domain; -/* consolidated parameters for balance_dirty_pages() and its subroutines */ -struct dirty_throttle_control { -#ifdef CONFIG_CGROUP_WRITEBACK - struct wb_domain *dom; - struct dirty_throttle_control *gdtc; /* only set in memcg dtc's */ -#endif - struct bdi_writeback *wb; - struct fprop_local_percpu *wb_completions; - - unsigned long avail; /* dirtyable */ - unsigned long dirty; /* file_dirty + write + nfs */ - unsigned long thresh; /* dirty threshold */ - unsigned long bg_thresh; /* dirty background threshold */ - - unsigned long wb_dirty; /* per-wb counterparts */ - unsigned long wb_thresh; - unsigned long wb_bg_thresh; - - unsigned long pos_ratio; -}; - /* * Length of period for aging writeout fractions of bdis. This is an * arbitrarily chosen number. The longer the period, the slower fractions will @@ -188,7 +164,7 @@ static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb) static void wb_min_max_ratio(struct bdi_writeback *wb, unsigned long *minp, unsigned long *maxp) { - unsigned long this_bw = wb->avg_write_bandwidth; + unsigned long this_bw = READ_ONCE(wb->avg_write_bandwidth); unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth); unsigned long long min = wb->bdi->min_ratio; unsigned long long max = wb->bdi->max_ratio; @@ -200,11 +176,11 @@ static void wb_min_max_ratio(struct bdi_writeback *wb, if (this_bw < tot_bw) { if (min) { min *= this_bw; - do_div(min, tot_bw); + min = div64_ul(min, tot_bw); } - if (max < 100) { + if (max < 100 * BDI_RATIO_SCALE) { max *= this_bw; - do_div(max, tot_bw); + max = div64_ul(max, tot_bw); } } @@ -256,7 +232,7 @@ static void wb_min_max_ratio(struct bdi_writeback *wb, * requiring writeback. * * This number of dirtyable pages is the base value of which the - * user-configurable dirty ratio is the effictive number of pages that + * user-configurable dirty ratio is the effective number of pages that * are allowed to be actually dirtied. Per individual zone, or * globally by using the sum of dirtyable pages over all zones. * @@ -270,7 +246,7 @@ static void wb_min_max_ratio(struct bdi_writeback *wb, * node_dirtyable_memory - number of dirtyable pages in a node * @pgdat: the node * - * Returns the node's number of pages potentially available for dirty + * Return: the node's number of pages potentially available for dirty * page cache. This is the base value for the per-node dirty limits. */ static unsigned long node_dirtyable_memory(struct pglist_data *pgdat) @@ -329,18 +305,6 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) } /* - * Unreclaimable memory (kernel memory or anonymous memory - * without swap) can bring down the dirtyable pages below - * the zone's dirty balance reserve and the above calculation - * will underflow. However we still want to add in nodes - * which are below threshold (negative values) to get a more - * accurate calculation but make sure that the total never - * underflows. - */ - if ((long)x < 0) - x = 0; - - /* * Make sure that the number of highmem pages is never larger * than the number of the total dirtyable memory. This can only * occur in very strange VM situations but we want to make sure @@ -355,7 +319,7 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) /** * global_dirtyable_memory - number of globally dirtyable pages * - * Returns the global number of pages potentially available for dirty + * Return: the global number of pages potentially available for dirty * page cache. This is the base value for the global dirty limits. */ static unsigned long global_dirtyable_memory(void) @@ -386,8 +350,7 @@ static unsigned long global_dirtyable_memory(void) * Calculate @dtc->thresh and ->bg_thresh considering * vm_dirty_{bytes|ratio} and dirty_background_{bytes|ratio}. The caller * must ensure that @dtc->avail is set before calling this function. The - * dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and - * real-time tasks. + * dirty limits will be lifted by 1/4 for real-time tasks. */ static void domain_dirty_limits(struct dirty_throttle_control *dtc) { @@ -432,13 +395,20 @@ static void domain_dirty_limits(struct dirty_throttle_control *dtc) else bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE; - if (bg_thresh >= thresh) - bg_thresh = thresh / 2; tsk = current; - if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { + if (rt_or_dl_task(tsk)) { bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32; thresh += thresh / 4 + global_wb_domain.dirty_limit / 32; } + /* + * Dirty throttling logic assumes the limits in page units fit into + * 32-bits. This gives 16TB dirty limits max which is hopefully enough. + */ + if (thresh > UINT_MAX) + thresh = UINT_MAX; + /* This makes sure bg_thresh is within 32-bits as well */ + if (bg_thresh >= thresh) + bg_thresh = thresh / 2; dtc->thresh = thresh; dtc->bg_thresh = bg_thresh; @@ -470,7 +440,7 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) * node_dirty_limit - maximum number of dirty pages allowed in a node * @pgdat: the node * - * Returns the maximum number of dirty pages allowed in a node, based + * Return: the maximum number of dirty pages allowed in a node, based * on the node's dirtyable memory. */ static unsigned long node_dirty_limit(struct pglist_data *pgdat) @@ -485,17 +455,21 @@ static unsigned long node_dirty_limit(struct pglist_data *pgdat) else dirty = vm_dirty_ratio * node_memory / 100; - if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) + if (rt_or_dl_task(tsk)) dirty += dirty / 4; - return dirty; + /* + * Dirty throttling logic assumes the limits in page units fit into + * 32-bits. This gives 16TB dirty limits max which is hopefully enough. + */ + return min_t(unsigned long, dirty, UINT_MAX); } /** * node_dirty_ok - tells whether a node is within its dirty limits * @pgdat: the node to check * - * Returns %true when the dirty pages in @pgdat are within the node's + * Return: %true when the dirty pages in @pgdat are within the node's * dirty limit, %false if the limit is exceeded. */ bool node_dirty_ok(struct pglist_data *pgdat) @@ -504,15 +478,14 @@ bool node_dirty_ok(struct pglist_data *pgdat) unsigned long nr_pages = 0; nr_pages += node_page_state(pgdat, NR_FILE_DIRTY); - nr_pages += node_page_state(pgdat, NR_UNSTABLE_NFS); nr_pages += node_page_state(pgdat, NR_WRITEBACK); return nr_pages <= limit; } -int dirty_background_ratio_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +#ifdef CONFIG_SYSCTL +static int dirty_background_ratio_handler(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -522,47 +495,56 @@ int dirty_background_ratio_handler(struct ctl_table *table, int write, return ret; } -int dirty_background_bytes_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +static int dirty_background_bytes_handler(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) { int ret; + unsigned long old_bytes = dirty_background_bytes; ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); - if (ret == 0 && write) + if (ret == 0 && write) { + if (DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE) > + UINT_MAX) { + dirty_background_bytes = old_bytes; + return -ERANGE; + } dirty_background_ratio = 0; + } return ret; } -int dirty_ratio_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +static int dirty_ratio_handler(const struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int old_ratio = vm_dirty_ratio; int ret; ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write && vm_dirty_ratio != old_ratio) { - writeback_set_ratelimit(); vm_dirty_bytes = 0; + writeback_set_ratelimit(); } return ret; } -int dirty_bytes_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +static int dirty_bytes_handler(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) { unsigned long old_bytes = vm_dirty_bytes; int ret; ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write && vm_dirty_bytes != old_bytes) { + if (DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) > UINT_MAX) { + vm_dirty_bytes = old_bytes; + return -ERANGE; + } writeback_set_ratelimit(); vm_dirty_ratio = 0; } return ret; } +#endif static unsigned long wp_next_time(unsigned long cur_time) { @@ -573,16 +555,16 @@ static unsigned long wp_next_time(unsigned long cur_time) return cur_time; } -static void wb_domain_writeout_inc(struct wb_domain *dom, +static void wb_domain_writeout_add(struct wb_domain *dom, struct fprop_local_percpu *completions, - unsigned int max_prop_frac) + unsigned int max_prop_frac, long nr) { - __fprop_inc_percpu_max(&dom->completions, completions, - max_prop_frac); + __fprop_add_percpu_max(&dom->completions, completions, + max_prop_frac, nr); /* First event after period switching was turned off? */ if (unlikely(!dom->period_time)) { /* - * We can race with other __bdi_writeout_inc calls here but + * We can race with other wb_domain_writeout_add calls here but * it does not cause any harm since the resulting time when * timer will fire and what is in writeout_period_time will be * roughly the same. @@ -594,20 +576,20 @@ static void wb_domain_writeout_inc(struct wb_domain *dom, /* * Increment @wb's writeout completion count and the global writeout - * completion count. Called from test_clear_page_writeback(). + * completion count. Called from __folio_end_writeback(). */ -static inline void __wb_writeout_inc(struct bdi_writeback *wb) +static inline void __wb_writeout_add(struct bdi_writeback *wb, long nr) { struct wb_domain *cgdom; - inc_wb_stat(wb, WB_WRITTEN); - wb_domain_writeout_inc(&global_wb_domain, &wb->completions, - wb->bdi->max_prop_frac); + wb_stat_mod(wb, WB_WRITTEN, nr); + wb_domain_writeout_add(&global_wb_domain, &wb->completions, + wb->bdi->max_prop_frac, nr); cgdom = mem_cgroup_wb_domain(wb); if (cgdom) - wb_domain_writeout_inc(cgdom, wb_memcg_completions(wb), - wb->bdi->max_prop_frac); + wb_domain_writeout_add(cgdom, wb_memcg_completions(wb), + wb->bdi->max_prop_frac, nr); } void wb_writeout_inc(struct bdi_writeback *wb) @@ -615,7 +597,7 @@ void wb_writeout_inc(struct bdi_writeback *wb) unsigned long flags; local_irq_save(flags); - __wb_writeout_inc(wb); + __wb_writeout_add(wb, 1); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(wb_writeout_inc); @@ -626,7 +608,7 @@ EXPORT_SYMBOL_GPL(wb_writeout_inc); */ static void writeout_period(struct timer_list *t) { - struct wb_domain *dom = from_timer(dom, t, period_timer); + struct wb_domain *dom = timer_container_of(dom, t, period_timer); int miss_periods = (jiffies - dom->period_time) / VM_COMPLETIONS_PERIOD_LEN; @@ -659,7 +641,7 @@ int wb_domain_init(struct wb_domain *dom, gfp_t gfp) #ifdef CONFIG_CGROUP_WRITEBACK void wb_domain_exit(struct wb_domain *dom) { - del_timer_sync(&dom->period_timer); + timer_delete_sync(&dom->period_timer); fprop_global_destroy(&dom->completions); } #endif @@ -671,20 +653,66 @@ void wb_domain_exit(struct wb_domain *dom) */ static unsigned int bdi_min_ratio; -int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) +static int bdi_check_pages_limit(unsigned long pages) +{ + unsigned long max_dirty_pages = global_dirtyable_memory(); + + if (pages > max_dirty_pages) + return -EINVAL; + + return 0; +} + +static unsigned long bdi_ratio_from_pages(unsigned long pages) +{ + unsigned long background_thresh; + unsigned long dirty_thresh; + unsigned long ratio; + + global_dirty_limits(&background_thresh, &dirty_thresh); + if (!dirty_thresh) + return -EINVAL; + ratio = div64_u64(pages * 100ULL * BDI_RATIO_SCALE, dirty_thresh); + + return ratio; +} + +static u64 bdi_get_bytes(unsigned int ratio) { + unsigned long background_thresh; + unsigned long dirty_thresh; + u64 bytes; + + global_dirty_limits(&background_thresh, &dirty_thresh); + bytes = (dirty_thresh * PAGE_SIZE * ratio) / BDI_RATIO_SCALE / 100; + + return bytes; +} + +static int __bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) +{ + unsigned int delta; int ret = 0; + if (min_ratio > 100 * BDI_RATIO_SCALE) + return -EINVAL; + spin_lock_bh(&bdi_lock); if (min_ratio > bdi->max_ratio) { ret = -EINVAL; } else { - min_ratio -= bdi->min_ratio; - if (bdi_min_ratio + min_ratio < 100) { - bdi_min_ratio += min_ratio; - bdi->min_ratio += min_ratio; + if (min_ratio < bdi->min_ratio) { + delta = bdi->min_ratio - min_ratio; + bdi_min_ratio -= delta; + bdi->min_ratio = min_ratio; } else { - ret = -EINVAL; + delta = min_ratio - bdi->min_ratio; + if (bdi_min_ratio + delta < 100 * BDI_RATIO_SCALE) { + bdi_min_ratio += delta; + bdi->min_ratio = min_ratio; + } else { + ret = -EINVAL; + } } } spin_unlock_bh(&bdi_lock); @@ -692,11 +720,11 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) return ret; } -int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) +static int __bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio) { int ret = 0; - if (max_ratio > 100) + if (max_ratio > 100 * BDI_RATIO_SCALE) return -EINVAL; spin_lock_bh(&bdi_lock); @@ -704,14 +732,92 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) ret = -EINVAL; } else { bdi->max_ratio = max_ratio; - bdi->max_prop_frac = (FPROP_FRAC_BASE * max_ratio) / 100; + bdi->max_prop_frac = (FPROP_FRAC_BASE * max_ratio) / + (100 * BDI_RATIO_SCALE); } spin_unlock_bh(&bdi_lock); return ret; } + +int bdi_set_min_ratio_no_scale(struct backing_dev_info *bdi, unsigned int min_ratio) +{ + return __bdi_set_min_ratio(bdi, min_ratio); +} + +int bdi_set_max_ratio_no_scale(struct backing_dev_info *bdi, unsigned int max_ratio) +{ + return __bdi_set_max_ratio(bdi, max_ratio); +} + +int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) +{ + return __bdi_set_min_ratio(bdi, min_ratio * BDI_RATIO_SCALE); +} + +int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio) +{ + return __bdi_set_max_ratio(bdi, max_ratio * BDI_RATIO_SCALE); +} EXPORT_SYMBOL(bdi_set_max_ratio); +u64 bdi_get_min_bytes(struct backing_dev_info *bdi) +{ + return bdi_get_bytes(bdi->min_ratio); +} + +int bdi_set_min_bytes(struct backing_dev_info *bdi, u64 min_bytes) +{ + int ret; + unsigned long pages = min_bytes >> PAGE_SHIFT; + long min_ratio; + + ret = bdi_check_pages_limit(pages); + if (ret) + return ret; + + min_ratio = bdi_ratio_from_pages(pages); + if (min_ratio < 0) + return min_ratio; + return __bdi_set_min_ratio(bdi, min_ratio); +} + +u64 bdi_get_max_bytes(struct backing_dev_info *bdi) +{ + return bdi_get_bytes(bdi->max_ratio); +} + +int bdi_set_max_bytes(struct backing_dev_info *bdi, u64 max_bytes) +{ + int ret; + unsigned long pages = max_bytes >> PAGE_SHIFT; + long max_ratio; + + ret = bdi_check_pages_limit(pages); + if (ret) + return ret; + + max_ratio = bdi_ratio_from_pages(pages); + if (max_ratio < 0) + return max_ratio; + return __bdi_set_max_ratio(bdi, max_ratio); +} + +int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit) +{ + if (strict_limit > 1) + return -EINVAL; + + spin_lock_bh(&bdi_lock); + if (strict_limit) + bdi->capabilities |= BDI_CAP_STRICTLIMIT; + else + bdi->capabilities &= ~BDI_CAP_STRICTLIMIT; + spin_unlock_bh(&bdi_lock); + + return 0; +} + static unsigned long dirty_freerun_ceiling(unsigned long thresh, unsigned long bg_thresh) { @@ -739,17 +845,44 @@ static void mdtc_calc_avail(struct dirty_throttle_control *mdtc, mdtc->avail = filepages + min(headroom, other_clean); } +static inline bool dtc_is_global(struct dirty_throttle_control *dtc) +{ + return mdtc_gdtc(dtc) == NULL; +} + +/* + * Dirty background will ignore pages being written as we're trying to + * decide whether to put more under writeback. + */ +static void domain_dirty_avail(struct dirty_throttle_control *dtc, + bool include_writeback) +{ + if (dtc_is_global(dtc)) { + dtc->avail = global_dirtyable_memory(); + dtc->dirty = global_node_page_state(NR_FILE_DIRTY); + if (include_writeback) + dtc->dirty += global_node_page_state(NR_WRITEBACK); + } else { + unsigned long filepages = 0, headroom = 0, writeback = 0; + + mem_cgroup_wb_stats(dtc->wb, &filepages, &headroom, &dtc->dirty, + &writeback); + if (include_writeback) + dtc->dirty += writeback; + mdtc_calc_avail(dtc, filepages, headroom); + } +} + /** - * __wb_calc_thresh - @wb's share of dirty throttling threshold + * __wb_calc_thresh - @wb's share of dirty threshold * @dtc: dirty_throttle_context of interest + * @thresh: dirty throttling or dirty background threshold of wb_domain in @dtc * - * Returns @wb's dirty limit in pages. The term "dirty" in the context of - * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages. - * - * Note that balance_dirty_pages() will only seriously take it as a hard limit - * when sleeping max_pause per page is not enough to keep the dirty pages under - * control. For example, when the device is completely stalled due to some error - * conditions, or when there are 1000 dd tasks writing to a slow 10MB/s USB key. + * Note that balance_dirty_pages() will only seriously take dirty throttling + * threshold as a hard limit when sleeping max_pause per page is not enough + * to keep the dirty pages under control. For example, when the device is + * completely stalled due to some error conditions, or when there are 1000 + * dd tasks writing to a slow 10MB/s USB key. * In the other normal situations, it acts more gently by throttling the tasks * more (rather than completely block them) when the wb dirty pages go high. * @@ -759,39 +892,74 @@ static void mdtc_calc_avail(struct dirty_throttle_control *mdtc, * * The wb's share of dirty limit will be adapting to its throughput and * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set. + * + * Return: @wb's dirty limit in pages. For dirty throttling limit, the term + * "dirty" in the context of dirty balancing includes all PG_dirty and + * PG_writeback pages. */ -static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc) +static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc, + unsigned long thresh) { struct wb_domain *dom = dtc_dom(dtc); - unsigned long thresh = dtc->thresh; + struct bdi_writeback *wb = dtc->wb; u64 wb_thresh; - long numerator, denominator; + u64 wb_max_thresh; + unsigned long numerator, denominator; unsigned long wb_min_ratio, wb_max_ratio; /* - * Calculate this BDI's share of the thresh ratio. + * Calculate this wb's share of the thresh ratio. */ fprop_fraction_percpu(&dom->completions, dtc->wb_completions, &numerator, &denominator); - wb_thresh = (thresh * (100 - bdi_min_ratio)) / 100; + wb_thresh = (thresh * (100 * BDI_RATIO_SCALE - bdi_min_ratio)) / (100 * BDI_RATIO_SCALE); wb_thresh *= numerator; - do_div(wb_thresh, denominator); + wb_thresh = div64_ul(wb_thresh, denominator); + + wb_min_max_ratio(wb, &wb_min_ratio, &wb_max_ratio); - wb_min_max_ratio(dtc->wb, &wb_min_ratio, &wb_max_ratio); + wb_thresh += (thresh * wb_min_ratio) / (100 * BDI_RATIO_SCALE); + + /* + * It's very possible that wb_thresh is close to 0 not because the + * device is slow, but that it has remained inactive for long time. + * Honour such devices a reasonable good (hopefully IO efficient) + * threshold, so that the occasional writes won't be blocked and active + * writes can rampup the threshold quickly. + */ + if (thresh > dtc->dirty) { + if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) + wb_thresh = max(wb_thresh, (thresh - dtc->dirty) / 100); + else + wb_thresh = max(wb_thresh, (thresh - dtc->dirty) / 8); + } - wb_thresh += (thresh * wb_min_ratio) / 100; - if (wb_thresh > (thresh * wb_max_ratio) / 100) - wb_thresh = thresh * wb_max_ratio / 100; + wb_max_thresh = thresh * wb_max_ratio / (100 * BDI_RATIO_SCALE); + if (wb_thresh > wb_max_thresh) + wb_thresh = wb_max_thresh; return wb_thresh; } unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh) { - struct dirty_throttle_control gdtc = { GDTC_INIT(wb), - .thresh = thresh }; - return __wb_calc_thresh(&gdtc); + struct dirty_throttle_control gdtc = { GDTC_INIT(wb) }; + + domain_dirty_avail(&gdtc, true); + return __wb_calc_thresh(&gdtc, thresh); +} + +unsigned long cgwb_calc_thresh(struct bdi_writeback *wb) +{ + struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB }; + struct dirty_throttle_control mdtc = { MDTC_INIT(wb, &gdtc) }; + + domain_dirty_avail(&gdtc, true); + domain_dirty_avail(&mdtc, true); + domain_dirty_limits(&mdtc); + + return __wb_calc_thresh(&mdtc, mdtc.thresh); } /* @@ -850,7 +1018,7 @@ static long long pos_ratio_polynom(unsigned long setpoint, * ^ pos_ratio * | * | |<===== global dirty control scope ======>| - * 2.0 .............* + * 2.0 * * * * * * * * | .* * | . * * | . * @@ -903,9 +1071,9 @@ static long long pos_ratio_polynom(unsigned long setpoint, static void wb_position_ratio(struct dirty_throttle_control *dtc) { struct bdi_writeback *wb = dtc->wb; - unsigned long write_bw = wb->avg_write_bandwidth; + unsigned long write_bw = READ_ONCE(wb->avg_write_bandwidth); unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh); - unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh); + unsigned long limit = dtc->limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh); unsigned long wb_thresh = dtc->wb_thresh; unsigned long x_intercept; unsigned long setpoint; /* dirty pages' target balance point */ @@ -933,9 +1101,7 @@ static void wb_position_ratio(struct dirty_throttle_control *dtc) * such filesystems balance_dirty_pages always checks wb counters * against wb limits. Even if global "nr_dirty" is under "freerun". * This is especially important for fuse which sets bdi->max_ratio to - * 1% by default. Without strictlimit feature, fuse writeback may - * consume arbitrary amount of RAM because it is accounted in - * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty". + * 1% by default. * * Here, in wb_position_ratio(), we calculate pos_ratio based on * two values: wb_dirty and wb_thresh. Let's consider an example: @@ -955,12 +1121,6 @@ static void wb_position_ratio(struct dirty_throttle_control *dtc) if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) { long long wb_pos_ratio; - if (dtc->wb_dirty < 8) { - dtc->pos_ratio = min_t(long long, pos_ratio * 2, - 2 << RATELIMIT_CALC_SHIFT); - return; - } - if (dtc->wb_dirty >= wb_thresh) return; @@ -1032,14 +1192,6 @@ static void wb_position_ratio(struct dirty_throttle_control *dtc) if (unlikely(wb_thresh > dtc->thresh)) wb_thresh = dtc->thresh; /* - * It's very possible that wb_thresh is close to 0 not because the - * device is slow, but that it has remained inactive for long time. - * Honour such devices a reasonable good (hopefully IO efficient) - * threshold, so that the occasional writes won't be blocked and active - * writes can rampup the threshold quickly. - */ - wb_thresh = max(wb_thresh, (limit - dtc->dirty) / 8); - /* * scale global setpoint to wb's: * wb_setpoint = setpoint * wb_thresh / thresh */ @@ -1095,13 +1247,13 @@ static void wb_update_write_bandwidth(struct bdi_writeback *wb, * write_bandwidth = --------------------------------------------------- * period * - * @written may have decreased due to account_page_redirty(). + * @written may have decreased due to folio_redirty_for_writepage(). * Avoid underflowing @bw calculation. */ bw = written - min(written, wb->written_stamp); bw *= HZ; if (unlikely(elapsed > period)) { - do_div(bw, elapsed); + bw = div64_ul(bw, elapsed); avg = bw; goto out; } @@ -1126,7 +1278,7 @@ out: &wb->bdi->tot_write_bandwidth) <= 0); } wb->write_bandwidth = bw; - wb->avg_write_bandwidth = avg; + WRITE_ONCE(wb->avg_write_bandwidth, avg); } static void update_dirty_limit(struct dirty_throttle_control *dtc) @@ -1158,8 +1310,8 @@ update: dom->dirty_limit = limit; } -static void domain_update_bandwidth(struct dirty_throttle_control *dtc, - unsigned long now) +static void domain_update_dirty_limit(struct dirty_throttle_control *dtc, + unsigned long now) { struct wb_domain *dom = dtc_dom(dtc); @@ -1294,17 +1446,10 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc, * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate). * Hence, to calculate "step" properly, we have to use wb_dirty as * "dirty" and wb_setpoint as "setpoint". - * - * We rampup dirty_ratelimit forcibly if wb_dirty is low because - * it's possible that wb_thresh is close to zero due to inactivity - * of backing device. */ if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) { dirty = dtc->wb_dirty; - if (dtc->wb_dirty < 8) - setpoint = dtc->wb_dirty + 1; - else - setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2; + setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2; } if (dirty < setpoint) { @@ -1335,7 +1480,7 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc, else dirty_ratelimit -= step; - wb->dirty_ratelimit = max(dirty_ratelimit, 1UL); + WRITE_ONCE(wb->dirty_ratelimit, max(dirty_ratelimit, 1UL)); wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit; trace_bdi_dirty_ratelimit(wb, dirty_rate, task_ratelimit); @@ -1343,35 +1488,28 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc, static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc, struct dirty_throttle_control *mdtc, - unsigned long start_time, bool update_ratelimit) { struct bdi_writeback *wb = gdtc->wb; unsigned long now = jiffies; - unsigned long elapsed = now - wb->bw_time_stamp; + unsigned long elapsed; unsigned long dirtied; unsigned long written; - lockdep_assert_held(&wb->list_lock); + spin_lock(&wb->list_lock); /* - * rate-limit, only update once every 200ms. + * Lockless checks for elapsed time are racy and delayed update after + * IO completion doesn't do it at all (to make sure written pages are + * accounted reasonably quickly). Make sure elapsed >= 1 to avoid + * division errors. */ - if (elapsed < BANDWIDTH_INTERVAL) - return; - + elapsed = max(now - wb->bw_time_stamp, 1UL); dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]); written = percpu_counter_read(&wb->stat[WB_WRITTEN]); - /* - * Skip quiet periods when disk bandwidth is under-utilized. - * (at least 1s idle time between two flusher runs) - */ - if (elapsed > HZ && time_before(wb->bw_time_stamp, start_time)) - goto snapshot; - if (update_ratelimit) { - domain_update_bandwidth(gdtc, now); + domain_update_dirty_limit(gdtc, now); wb_update_dirty_ratelimit(gdtc, dirtied, elapsed); /* @@ -1379,23 +1517,41 @@ static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc, * compiler has no way to figure that out. Help it. */ if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) && mdtc) { - domain_update_bandwidth(mdtc, now); + domain_update_dirty_limit(mdtc, now); wb_update_dirty_ratelimit(mdtc, dirtied, elapsed); } } wb_update_write_bandwidth(wb, elapsed, written); -snapshot: wb->dirtied_stamp = dirtied; wb->written_stamp = written; - wb->bw_time_stamp = now; + WRITE_ONCE(wb->bw_time_stamp, now); + spin_unlock(&wb->list_lock); } -void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time) +void wb_update_bandwidth(struct bdi_writeback *wb) { struct dirty_throttle_control gdtc = { GDTC_INIT(wb) }; - __wb_update_bandwidth(&gdtc, NULL, start_time, false); + __wb_update_bandwidth(&gdtc, NULL, false); +} + +/* Interval after which we consider wb idle and don't estimate bandwidth */ +#define WB_BANDWIDTH_IDLE_JIF (HZ) + +static void wb_bandwidth_estimate_start(struct bdi_writeback *wb) +{ + unsigned long now = jiffies; + unsigned long elapsed = now - READ_ONCE(wb->bw_time_stamp); + + if (elapsed > WB_BANDWIDTH_IDLE_JIF && + !atomic_read(&wb->writeback_inodes)) { + spin_lock(&wb->list_lock); + wb->dirtied_stamp = wb_stat(wb, WB_DIRTIED); + wb->written_stamp = wb_stat(wb, WB_WRITTEN); + WRITE_ONCE(wb->bw_time_stamp, now); + spin_unlock(&wb->list_lock); + } } /* @@ -1418,7 +1574,7 @@ static unsigned long dirty_poll_interval(unsigned long dirty, static unsigned long wb_max_pause(struct bdi_writeback *wb, unsigned long wb_dirty) { - unsigned long bw = wb->avg_write_bandwidth; + unsigned long bw = READ_ONCE(wb->avg_write_bandwidth); unsigned long t; /* @@ -1440,8 +1596,8 @@ static long wb_min_pause(struct bdi_writeback *wb, unsigned long dirty_ratelimit, int *nr_dirtied_pause) { - long hi = ilog2(wb->avg_write_bandwidth); - long lo = ilog2(wb->dirty_ratelimit); + long hi = ilog2(READ_ONCE(wb->avg_write_bandwidth)); + long lo = ilog2(READ_ONCE(wb->dirty_ratelimit)); long t; /* target pause */ long pause; /* estimated next pause */ int pages; /* target nr_dirtied_pause */ @@ -1527,7 +1683,7 @@ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc) * wb_position_ratio() will let the dirtier task progress * at some rate <= (write_bw / 2) for bringing down wb_dirty. */ - dtc->wb_thresh = __wb_calc_thresh(dtc); + dtc->wb_thresh = __wb_calc_thresh(dtc, dtc->thresh); dtc->wb_bg_thresh = dtc->thresh ? div_u64((u64)dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0; @@ -1550,6 +1706,100 @@ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc) } } +static unsigned long domain_poll_intv(struct dirty_throttle_control *dtc, + bool strictlimit) +{ + unsigned long dirty, thresh; + + if (strictlimit) { + dirty = dtc->wb_dirty; + thresh = dtc->wb_thresh; + } else { + dirty = dtc->dirty; + thresh = dtc->thresh; + } + + return dirty_poll_interval(dirty, thresh); +} + +/* + * Throttle it only when the background writeback cannot catch-up. This avoids + * (excessively) small writeouts when the wb limits are ramping up in case of + * !strictlimit. + * + * In strictlimit case make decision based on the wb counters and limits. Small + * writeouts when the wb limits are ramping up are the price we consciously pay + * for strictlimit-ing. + */ +static void domain_dirty_freerun(struct dirty_throttle_control *dtc, + bool strictlimit) +{ + unsigned long dirty, thresh, bg_thresh; + + if (unlikely(strictlimit)) { + wb_dirty_limits(dtc); + dirty = dtc->wb_dirty; + thresh = dtc->wb_thresh; + bg_thresh = dtc->wb_bg_thresh; + } else { + dirty = dtc->dirty; + thresh = dtc->thresh; + bg_thresh = dtc->bg_thresh; + } + dtc->freerun = dirty <= dirty_freerun_ceiling(thresh, bg_thresh); +} + +static void balance_domain_limits(struct dirty_throttle_control *dtc, + bool strictlimit) +{ + domain_dirty_avail(dtc, true); + domain_dirty_limits(dtc); + domain_dirty_freerun(dtc, strictlimit); +} + +static void wb_dirty_freerun(struct dirty_throttle_control *dtc, + bool strictlimit) +{ + dtc->freerun = false; + + /* was already handled in domain_dirty_freerun */ + if (strictlimit) + return; + + wb_dirty_limits(dtc); + /* + * LOCAL_THROTTLE tasks must not be throttled when below the per-wb + * freerun ceiling. + */ + if (!(current->flags & PF_LOCAL_THROTTLE)) + return; + + dtc->freerun = dtc->wb_dirty < + dirty_freerun_ceiling(dtc->wb_thresh, dtc->wb_bg_thresh); +} + +static inline void wb_dirty_exceeded(struct dirty_throttle_control *dtc, + bool strictlimit) +{ + dtc->dirty_exceeded = (dtc->wb_dirty > dtc->wb_thresh) && + ((dtc->dirty > dtc->thresh) || strictlimit); +} + +/* + * The limits fields dirty_exceeded and pos_ratio won't be updated if wb is + * in freerun state. Please don't use these invalid fields in freerun case. + */ +static void balance_wb_limits(struct dirty_throttle_control *dtc, + bool strictlimit) +{ + wb_dirty_freerun(dtc, strictlimit); + if (dtc->freerun) + return; + + wb_dirty_exceeded(dtc, strictlimit); + wb_position_ratio(dtc); +} + /* * balance_dirty_pages() must be called by processes which are generating dirty * data. It looks at the number of dirty pages in the machine and will force @@ -1557,8 +1807,8 @@ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc) * If we're over `background_thresh' then the writeback threads are woken to * perform some writeout. */ -static void balance_dirty_pages(struct bdi_writeback *wb, - unsigned long pages_dirtied) +static int balance_dirty_pages(struct bdi_writeback *wb, + unsigned long pages_dirtied, unsigned int flags) { struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) }; struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) }; @@ -1566,117 +1816,79 @@ static void balance_dirty_pages(struct bdi_writeback *wb, struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ? &mdtc_stor : NULL; struct dirty_throttle_control *sdtc; - unsigned long nr_reclaimable; /* = file_dirty + unstable_nfs */ + unsigned long nr_dirty; long period; long pause; long max_pause; long min_pause; int nr_dirtied_pause; - bool dirty_exceeded = false; unsigned long task_ratelimit; unsigned long dirty_ratelimit; struct backing_dev_info *bdi = wb->bdi; bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT; unsigned long start_time = jiffies; + int ret = 0; for (;;) { unsigned long now = jiffies; - unsigned long dirty, thresh, bg_thresh; - unsigned long m_dirty = 0; /* stop bogus uninit warnings */ - unsigned long m_thresh = 0; - unsigned long m_bg_thresh = 0; - /* - * Unstable writes are a feature of certain networked - * filesystems (i.e. NFS) in which data may have been - * written to the server's write cache, but has not yet - * been flushed to permanent storage. - */ - nr_reclaimable = global_node_page_state(NR_FILE_DIRTY) + - global_node_page_state(NR_UNSTABLE_NFS); - gdtc->avail = global_dirtyable_memory(); - gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK); - - domain_dirty_limits(gdtc); - - if (unlikely(strictlimit)) { - wb_dirty_limits(gdtc); - - dirty = gdtc->wb_dirty; - thresh = gdtc->wb_thresh; - bg_thresh = gdtc->wb_bg_thresh; - } else { - dirty = gdtc->dirty; - thresh = gdtc->thresh; - bg_thresh = gdtc->bg_thresh; - } + nr_dirty = global_node_page_state(NR_FILE_DIRTY); + balance_domain_limits(gdtc, strictlimit); if (mdtc) { - unsigned long filepages, headroom, writeback; - /* * If @wb belongs to !root memcg, repeat the same * basic calculations for the memcg domain. */ - mem_cgroup_wb_stats(wb, &filepages, &headroom, - &mdtc->dirty, &writeback); - mdtc->dirty += writeback; - mdtc_calc_avail(mdtc, filepages, headroom); - - domain_dirty_limits(mdtc); - - if (unlikely(strictlimit)) { - wb_dirty_limits(mdtc); - m_dirty = mdtc->wb_dirty; - m_thresh = mdtc->wb_thresh; - m_bg_thresh = mdtc->wb_bg_thresh; - } else { - m_dirty = mdtc->dirty; - m_thresh = mdtc->thresh; - m_bg_thresh = mdtc->bg_thresh; - } + balance_domain_limits(mdtc, strictlimit); } /* - * Throttle it only when the background writeback cannot - * catch-up. This avoids (excessively) small writeouts - * when the wb limits are ramping up in case of !strictlimit. - * - * In strictlimit case make decision based on the wb counters - * and limits. Small writeouts when the wb limits are ramping - * up are the price we consciously pay for strictlimit-ing. + * In laptop mode, we wait until hitting the higher threshold + * before starting background writeout, and then write out all + * the way down to the lower threshold. So slow writers cause + * minimal disk activity. * + * In normal mode, we start background writeout at the lower + * background_thresh, to keep the amount of dirty memory low. + */ + if (!laptop_mode && nr_dirty > gdtc->bg_thresh && + !writeback_in_progress(wb)) + wb_start_background_writeback(wb); + + /* * If memcg domain is in effect, @dirty should be under * both global and memcg freerun ceilings. */ - if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) && - (!mdtc || - m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) { - unsigned long intv = dirty_poll_interval(dirty, thresh); - unsigned long m_intv = ULONG_MAX; + if (gdtc->freerun && (!mdtc || mdtc->freerun)) { + unsigned long intv; + unsigned long m_intv; + +free_running: + intv = domain_poll_intv(gdtc, strictlimit); + m_intv = ULONG_MAX; current->dirty_paused_when = now; current->nr_dirtied = 0; if (mdtc) - m_intv = dirty_poll_interval(m_dirty, m_thresh); + m_intv = domain_poll_intv(mdtc, strictlimit); current->nr_dirtied_pause = min(intv, m_intv); break; } + /* Start writeback even when in laptop mode */ if (unlikely(!writeback_in_progress(wb))) wb_start_background_writeback(wb); + mem_cgroup_flush_foreign(wb); + /* * Calculate global domain's pos_ratio and select the * global dtc by default. */ - if (!strictlimit) - wb_dirty_limits(gdtc); - - dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) && - ((gdtc->dirty > gdtc->thresh) || strictlimit); - - wb_position_ratio(gdtc); + balance_wb_limits(gdtc, strictlimit); + if (gdtc->freerun) + goto free_running; sdtc = gdtc; if (mdtc) { @@ -1686,29 +1898,21 @@ static void balance_dirty_pages(struct bdi_writeback *wb, * both global and memcg domains. Choose the one * w/ lower pos_ratio. */ - if (!strictlimit) - wb_dirty_limits(mdtc); - - dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) && - ((mdtc->dirty > mdtc->thresh) || strictlimit); - - wb_position_ratio(mdtc); + balance_wb_limits(mdtc, strictlimit); + if (mdtc->freerun) + goto free_running; if (mdtc->pos_ratio < gdtc->pos_ratio) sdtc = mdtc; } - if (dirty_exceeded && !wb->dirty_exceeded) - wb->dirty_exceeded = 1; - - if (time_is_before_jiffies(wb->bw_time_stamp + - BANDWIDTH_INTERVAL)) { - spin_lock(&wb->list_lock); - __wb_update_bandwidth(gdtc, mdtc, start_time, true); - spin_unlock(&wb->list_lock); - } + wb->dirty_exceeded = gdtc->dirty_exceeded || + (mdtc && mdtc->dirty_exceeded); + if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) + + BANDWIDTH_INTERVAL)) + __wb_update_bandwidth(gdtc, mdtc, true); /* throttle according to the chosen dtc */ - dirty_ratelimit = wb->dirty_ratelimit; + dirty_ratelimit = READ_ONCE(wb->dirty_ratelimit); task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >> RATELIMIT_CALC_SHIFT; max_pause = wb_max_pause(wb, sdtc->wb_dirty); @@ -1734,11 +1938,7 @@ static void balance_dirty_pages(struct bdi_writeback *wb, */ if (pause < min_pause) { trace_balance_dirty_pages(wb, - sdtc->thresh, - sdtc->bg_thresh, - sdtc->dirty, - sdtc->wb_thresh, - sdtc->wb_dirty, + sdtc, dirty_ratelimit, task_ratelimit, pages_dirtied, @@ -1763,19 +1963,19 @@ static void balance_dirty_pages(struct bdi_writeback *wb, pause: trace_balance_dirty_pages(wb, - sdtc->thresh, - sdtc->bg_thresh, - sdtc->dirty, - sdtc->wb_thresh, - sdtc->wb_dirty, + sdtc, dirty_ratelimit, task_ratelimit, pages_dirtied, period, pause, start_time); + if (flags & BDP_ASYNC) { + ret = -EAGAIN; + break; + } __set_current_state(TASK_KILLABLE); - wb->dirty_sleep = now; + bdi->last_bdp_sleep = jiffies; io_schedule_timeout(pause); current->dirty_paused_when = now + pause; @@ -1790,7 +1990,7 @@ pause: break; /* - * In the case of an unresponding NFS server and the NFS dirty + * In the case of an unresponsive NFS server and the NFS dirty * pages exceeds dirty_thresh, give the other good wb's a pipe * to go through, so that tasks on them still remain responsive. * @@ -1805,26 +2005,7 @@ pause: if (fatal_signal_pending(current)) break; } - - if (!dirty_exceeded && wb->dirty_exceeded) - wb->dirty_exceeded = 0; - - if (writeback_in_progress(wb)) - return; - - /* - * In laptop mode, we wait until hitting the higher threshold before - * starting background writeout, and then write out all the way down - * to the lower threshold. So slow writers cause minimal disk activity. - * - * In normal mode, we start background writeout at the lower - * background_thresh, to keep the amount of dirty memory low. - */ - if (laptop_mode) - return; - - if (nr_reclaimable > gdtc->bg_thresh) - wb_start_background_writeback(wb); + return ret; } static DEFINE_PER_CPU(int, bdp_ratelimits); @@ -1846,28 +2027,34 @@ static DEFINE_PER_CPU(int, bdp_ratelimits); DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0; /** - * balance_dirty_pages_ratelimited - balance dirty memory state - * @mapping: address_space which was dirtied + * balance_dirty_pages_ratelimited_flags - Balance dirty memory state. + * @mapping: address_space which was dirtied. + * @flags: BDP flags. * * Processes which are dirtying memory should call in here once for each page * which was newly dirtied. The function will periodically check the system's * dirty state and will initiate writeback if needed. * - * On really big machines, get_writeback_state is expensive, so try to avoid - * calling it too often (ratelimiting). But once we're over the dirty memory - * limit we decrease the ratelimiting by a lot, to prevent individual processes - * from overshooting the limit by (ratelimit_pages) each. + * See balance_dirty_pages_ratelimited() for details. + * + * Return: If @flags contains BDP_ASYNC, it may return -EAGAIN to + * indicate that memory is out of balance and the caller must wait + * for I/O to complete. Otherwise, it will return 0 to indicate + * that either memory was already in balance, or it was able to sleep + * until the amount of dirty memory returned to balance. */ -void balance_dirty_pages_ratelimited(struct address_space *mapping) +int balance_dirty_pages_ratelimited_flags(struct address_space *mapping, + unsigned int flags) { struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); struct bdi_writeback *wb = NULL; int ratelimit; + int ret = 0; int *p; - if (!bdi_cap_account_dirty(bdi)) - return; + if (!(bdi->capabilities & BDI_CAP_WRITEBACK)) + return ret; if (inode_cgwb_enabled(inode)) wb = wb_get_create_current(bdi, GFP_KERNEL); @@ -1907,67 +2094,89 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping) preempt_enable(); if (unlikely(current->nr_dirtied >= ratelimit)) - balance_dirty_pages(wb, current->nr_dirtied); + ret = balance_dirty_pages(wb, current->nr_dirtied, flags); wb_put(wb); + return ret; } -EXPORT_SYMBOL(balance_dirty_pages_ratelimited); +EXPORT_SYMBOL_GPL(balance_dirty_pages_ratelimited_flags); /** - * wb_over_bg_thresh - does @wb need to be written back? - * @wb: bdi_writeback of interest + * balance_dirty_pages_ratelimited - balance dirty memory state. + * @mapping: address_space which was dirtied. * - * Determines whether background writeback should keep writing @wb or it's - * clean enough. Returns %true if writeback should continue. + * Processes which are dirtying memory should call in here once for each page + * which was newly dirtied. The function will periodically check the system's + * dirty state and will initiate writeback if needed. + * + * Once we're over the dirty memory limit we decrease the ratelimiting + * by a lot, to prevent individual processes from overshooting the limit + * by (ratelimit_pages) each. */ -bool wb_over_bg_thresh(struct bdi_writeback *wb) +void balance_dirty_pages_ratelimited(struct address_space *mapping) { - struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) }; - struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) }; - struct dirty_throttle_control * const gdtc = &gdtc_stor; - struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ? - &mdtc_stor : NULL; + balance_dirty_pages_ratelimited_flags(mapping, 0); +} +EXPORT_SYMBOL(balance_dirty_pages_ratelimited); - /* - * Similar to balance_dirty_pages() but ignores pages being written - * as we're trying to decide whether to put more under writeback. - */ - gdtc->avail = global_dirtyable_memory(); - gdtc->dirty = global_node_page_state(NR_FILE_DIRTY) + - global_node_page_state(NR_UNSTABLE_NFS); - domain_dirty_limits(gdtc); +/* + * Similar to wb_dirty_limits, wb_bg_dirty_limits also calculates dirty + * and thresh, but it's for background writeback. + */ +static void wb_bg_dirty_limits(struct dirty_throttle_control *dtc) +{ + struct bdi_writeback *wb = dtc->wb; - if (gdtc->dirty > gdtc->bg_thresh) + dtc->wb_bg_thresh = __wb_calc_thresh(dtc, dtc->bg_thresh); + if (dtc->wb_bg_thresh < 2 * wb_stat_error()) + dtc->wb_dirty = wb_stat_sum(wb, WB_RECLAIMABLE); + else + dtc->wb_dirty = wb_stat(wb, WB_RECLAIMABLE); +} + +static bool domain_over_bg_thresh(struct dirty_throttle_control *dtc) +{ + domain_dirty_avail(dtc, false); + domain_dirty_limits(dtc); + if (dtc->dirty > dtc->bg_thresh) return true; - if (wb_stat(wb, WB_RECLAIMABLE) > - wb_calc_thresh(gdtc->wb, gdtc->bg_thresh)) + wb_bg_dirty_limits(dtc); + if (dtc->wb_dirty > dtc->wb_bg_thresh) return true; - if (mdtc) { - unsigned long filepages, headroom, writeback; + return false; +} - mem_cgroup_wb_stats(wb, &filepages, &headroom, &mdtc->dirty, - &writeback); - mdtc_calc_avail(mdtc, filepages, headroom); - domain_dirty_limits(mdtc); /* ditto, ignore writeback */ +/** + * wb_over_bg_thresh - does @wb need to be written back? + * @wb: bdi_writeback of interest + * + * Determines whether background writeback should keep writing @wb or it's + * clean enough. + * + * Return: %true if writeback should continue. + */ +bool wb_over_bg_thresh(struct bdi_writeback *wb) +{ + struct dirty_throttle_control gdtc = { GDTC_INIT(wb) }; + struct dirty_throttle_control mdtc = { MDTC_INIT(wb, &gdtc) }; - if (mdtc->dirty > mdtc->bg_thresh) - return true; + if (domain_over_bg_thresh(&gdtc)) + return true; - if (wb_stat(wb, WB_RECLAIMABLE) > - wb_calc_thresh(mdtc->wb, mdtc->bg_thresh)) - return true; - } + if (mdtc_valid(&mdtc)) + return domain_over_bg_thresh(&mdtc); return false; } +#ifdef CONFIG_SYSCTL /* * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs */ -int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) +static int dirty_writeback_centisecs_handler(const struct ctl_table *table, int write, + void *buffer, size_t *length, loff_t *ppos) { unsigned int old_interval = dirty_writeback_interval; int ret; @@ -1987,12 +2196,12 @@ int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, return ret; } +#endif -#ifdef CONFIG_BLOCK void laptop_mode_timer_fn(struct timer_list *t) { struct backing_dev_info *backing_dev_info = - from_timer(backing_dev_info, t, laptop_mode_wb_timer); + timer_container_of(backing_dev_info, t, laptop_mode_wb_timer); wakeup_flusher_threads_bdi(backing_dev_info, WB_REASON_LAPTOP_TIMER); } @@ -2019,17 +2228,14 @@ void laptop_sync_completion(void) rcu_read_lock(); list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) - del_timer(&bdi->laptop_mode_wb_timer); + timer_delete(&bdi->laptop_mode_wb_timer); rcu_read_unlock(); } -#endif /* * If ratelimit_pages is too high then we can get into dirty-data overload * if a large number of processes all perform writes at the same time. - * If it is too low then SMP machines will call the (expensive) - * get_writeback_state too often. * * Here we set ratelimit_pages to a level which ensures that when all CPUs are * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory @@ -2055,17 +2261,91 @@ static int page_writeback_cpu_online(unsigned int cpu) return 0; } +#ifdef CONFIG_SYSCTL + +/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ +static const unsigned long dirty_bytes_min = 2 * PAGE_SIZE; + +static const struct ctl_table vm_page_writeback_sysctls[] = { + { + .procname = "dirty_background_ratio", + .data = &dirty_background_ratio, + .maxlen = sizeof(dirty_background_ratio), + .mode = 0644, + .proc_handler = dirty_background_ratio_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, + }, + { + .procname = "dirty_background_bytes", + .data = &dirty_background_bytes, + .maxlen = sizeof(dirty_background_bytes), + .mode = 0644, + .proc_handler = dirty_background_bytes_handler, + .extra1 = SYSCTL_LONG_ONE, + }, + { + .procname = "dirty_ratio", + .data = &vm_dirty_ratio, + .maxlen = sizeof(vm_dirty_ratio), + .mode = 0644, + .proc_handler = dirty_ratio_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, + }, + { + .procname = "dirty_bytes", + .data = &vm_dirty_bytes, + .maxlen = sizeof(vm_dirty_bytes), + .mode = 0644, + .proc_handler = dirty_bytes_handler, + .extra1 = (void *)&dirty_bytes_min, + }, + { + .procname = "dirty_writeback_centisecs", + .data = &dirty_writeback_interval, + .maxlen = sizeof(dirty_writeback_interval), + .mode = 0644, + .proc_handler = dirty_writeback_centisecs_handler, + }, + { + .procname = "dirty_expire_centisecs", + .data = &dirty_expire_interval, + .maxlen = sizeof(dirty_expire_interval), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, +#ifdef CONFIG_HIGHMEM + { + .procname = "highmem_is_dirtyable", + .data = &vm_highmem_is_dirtyable, + .maxlen = sizeof(vm_highmem_is_dirtyable), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif + { + .procname = "laptop_mode", + .data = &laptop_mode, + .maxlen = sizeof(laptop_mode), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, +}; +#endif + /* * Called early on to tune the page writeback dirty limits. * * We used to scale dirty pages according to how total memory - * related to pages that could be allocated for buffers (by - * comparing nr_free_buffer_pages() to vm_total_pages. + * related to pages that could be allocated for buffers. * * However, that was when we used "dirty_ratio" to scale with * all memory, and we don't do that any more. "dirty_ratio" - * is now applied to total non-HIGHPAGE memory (by subtracting - * totalhigh_pages from vm_total_pages), and as such we can't + * is now applied to total non-HIGHPAGE memory, and as such we can't * get into the old insane situation any more where we had * large amounts of dirty pages compared to a small amount of * non-HIGHMEM memory. @@ -2081,21 +2361,24 @@ void __init page_writeback_init(void) page_writeback_cpu_online, NULL); cpuhp_setup_state(CPUHP_MM_WRITEBACK_DEAD, "mm/writeback:dead", NULL, page_writeback_cpu_online); +#ifdef CONFIG_SYSCTL + register_sysctl_init("vm", vm_page_writeback_sysctls); +#endif } /** - * tag_pages_for_writeback - tag pages to be written by write_cache_pages + * tag_pages_for_writeback - tag pages to be written by writeback * @mapping: address space structure to write * @start: starting page index * @end: ending page index (inclusive) * * This function scans the page range from @start to @end (inclusive) and tags - * all pages that have DIRTY tag set with a special TOWRITE tag. The idea is - * that write_cache_pages (or whoever calls this function) will then use - * TOWRITE tag to identify pages eligible for writeback. This mechanism is - * used to avoid livelocking of writeback by a process steadily creating new - * dirty pages in the file (thus it is important for this function to be quick - * so that it can tag pages faster than a dirtying process can create them). + * all pages that have DIRTY tag set with a special TOWRITE tag. The caller + * can then use the TOWRITE tag to identify pages eligible for writeback. + * This mechanism is used to avoid livelocking of writeback by a process + * steadily creating new dirty pages in the file (thus it is important for this + * function to be quick so that it can tag pages faster than a dirtying process + * can create them). */ void tag_pages_for_writeback(struct address_space *mapping, pgoff_t start, pgoff_t end) @@ -2119,474 +2402,447 @@ void tag_pages_for_writeback(struct address_space *mapping, } EXPORT_SYMBOL(tag_pages_for_writeback); -/** - * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. - * @mapping: address space structure to write - * @wbc: subtract the number of written pages from *@wbc->nr_to_write - * @writepage: function called for each page - * @data: data passed to writepage function - * - * If a page is already under I/O, write_cache_pages() skips it, even - * if it's dirty. This is desirable behaviour for memory-cleaning writeback, - * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() - * and msync() need to guarantee that all the data which was dirty at the time - * the call was made get new I/O started against them. If wbc->sync_mode is - * WB_SYNC_ALL then we were called for data integrity and we must wait for - * existing IO to complete. - * - * To avoid livelocks (when other process dirties new pages), we first tag - * pages which should be written back with TOWRITE tag and only then start - * writing them. For data-integrity sync we have to be careful so that we do - * not miss some pages (e.g., because some other process has cleared TOWRITE - * tag we set). The rule we follow is that TOWRITE tag can be cleared only - * by the process clearing the DIRTY tag (and submitting the page for IO). - * - * To avoid deadlocks between range_cyclic writeback and callers that hold - * pages in PageWriteback to aggregate IO until write_cache_pages() returns, - * we do not loop back to the start of the file. Doing so causes a page - * lock/page writeback access order inversion - we should only ever lock - * multiple pages in ascending page->index order, and looping back to the start - * of the file violates that rule and causes deadlocks. - */ -int write_cache_pages(struct address_space *mapping, - struct writeback_control *wbc, writepage_t writepage, - void *data) +static bool folio_prepare_writeback(struct address_space *mapping, + struct writeback_control *wbc, struct folio *folio) { - int ret = 0; - int done = 0; - int error; - struct pagevec pvec; - int nr_pages; - pgoff_t uninitialized_var(writeback_index); - pgoff_t index; - pgoff_t end; /* Inclusive */ - pgoff_t done_index; - int range_whole = 0; - xa_mark_t tag; - - pagevec_init(&pvec); - if (wbc->range_cyclic) { - writeback_index = mapping->writeback_index; /* prev offset */ - index = writeback_index; - end = -1; - } else { - index = wbc->range_start >> PAGE_SHIFT; - end = wbc->range_end >> PAGE_SHIFT; - if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) - range_whole = 1; - } - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) - tag = PAGECACHE_TAG_TOWRITE; - else - tag = PAGECACHE_TAG_DIRTY; - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) - tag_pages_for_writeback(mapping, index, end); - done_index = index; - while (!done && (index <= end)) { - int i; - - nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, - tag); - if (nr_pages == 0) - break; + /* + * Folio truncated or invalidated. We can freely skip it then, + * even for data integrity operations: the folio has disappeared + * concurrently, so there could be no real expectation of this + * data integrity operation even if there is now a new, dirty + * folio at the same pagecache index. + */ + if (unlikely(folio->mapping != mapping)) + return false; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; + /* + * Did somebody else write it for us? + */ + if (!folio_test_dirty(folio)) + return false; - done_index = page->index; + if (folio_test_writeback(folio)) { + if (wbc->sync_mode == WB_SYNC_NONE) + return false; + folio_wait_writeback(folio); + } + BUG_ON(folio_test_writeback(folio)); - lock_page(page); + if (!folio_clear_dirty_for_io(folio)) + return false; - /* - * Page truncated or invalidated. We can freely skip it - * then, even for data integrity operations: the page - * has disappeared concurrently, so there could be no - * real expectation of this data interity operation - * even if there is now a new, dirty page at the same - * pagecache address. - */ - if (unlikely(page->mapping != mapping)) { -continue_unlock: - unlock_page(page); - continue; - } + return true; +} - if (!PageDirty(page)) { - /* someone wrote it for us */ - goto continue_unlock; - } - if (PageWriteback(page)) { - if (wbc->sync_mode != WB_SYNC_NONE) - wait_on_page_writeback(page); - else - goto continue_unlock; - } +static pgoff_t wbc_end(struct writeback_control *wbc) +{ + if (wbc->range_cyclic) + return -1; + return wbc->range_end >> PAGE_SHIFT; +} - BUG_ON(PageWriteback(page)); - if (!clear_page_dirty_for_io(page)) - goto continue_unlock; - - trace_wbc_writepage(wbc, inode_to_bdi(mapping->host)); - error = (*writepage)(page, wbc, data); - if (unlikely(error)) { - /* - * Handle errors according to the type of - * writeback. There's no need to continue for - * background writeback. Just push done_index - * past this page so media errors won't choke - * writeout for the entire file. For integrity - * writeback, we must process the entire dirty - * set regardless of errors because the fs may - * still have state to clear for each page. In - * that case we continue processing and return - * the first error. - */ - if (error == AOP_WRITEPAGE_ACTIVATE) { - unlock_page(page); - error = 0; - } else if (wbc->sync_mode != WB_SYNC_ALL) { - ret = error; - done_index = page->index + 1; - done = 1; - break; - } - if (!ret) - ret = error; - } +static struct folio *writeback_get_folio(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct folio *folio; - /* - * We stop writing back only if we are not doing - * integrity sync. In case of integrity sync we have to - * keep going until we have written all the pages - * we tagged for writeback prior to entering this loop. - */ - if (--wbc->nr_to_write <= 0 && - wbc->sync_mode == WB_SYNC_NONE) { - done = 1; - break; - } - } - pagevec_release(&pvec); +retry: + folio = folio_batch_next(&wbc->fbatch); + if (!folio) { + folio_batch_release(&wbc->fbatch); cond_resched(); + filemap_get_folios_tag(mapping, &wbc->index, wbc_end(wbc), + wbc_to_tag(wbc), &wbc->fbatch); + folio = folio_batch_next(&wbc->fbatch); + if (!folio) + return NULL; } - /* - * If we hit the last page and there is more work to be done: wrap - * back the index back to the start of the file for the next - * time we are called. - */ - if (wbc->range_cyclic && !done) - done_index = 0; - if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) - mapping->writeback_index = done_index; - - return ret; -} -EXPORT_SYMBOL(write_cache_pages); + folio_lock(folio); + if (unlikely(!folio_prepare_writeback(mapping, wbc, folio))) { + folio_unlock(folio); + goto retry; + } -/* - * Function used by generic_writepages to call the real writepage - * function and set the mapping flags on error - */ -static int __writepage(struct page *page, struct writeback_control *wbc, - void *data) -{ - struct address_space *mapping = data; - int ret = mapping->a_ops->writepage(page, wbc); - mapping_set_error(mapping, ret); - return ret; + trace_wbc_writepage(wbc, inode_to_bdi(mapping->host)); + return folio; } /** - * generic_writepages - walk the list of dirty pages of the given address space and writepage() all of them. + * writeback_iter - iterate folio of a mapping for writeback * @mapping: address space structure to write - * @wbc: subtract the number of written pages from *@wbc->nr_to_write + * @wbc: writeback context + * @folio: previously iterated folio (%NULL to start) + * @error: in-out pointer for writeback errors (see below) + * + * This function returns the next folio for the writeback operation described by + * @wbc on @mapping and should be called in a while loop in the ->writepages + * implementation. + * + * To start the writeback operation, %NULL is passed in the @folio argument, and + * for every subsequent iteration the folio returned previously should be passed + * back in. + * + * If there was an error in the per-folio writeback inside the writeback_iter() + * loop, @error should be set to the error value. * - * This is a library function, which implements the writepages() - * address_space_operation. + * Once the writeback described in @wbc has finished, this function will return + * %NULL and if there was an error in any iteration restore it to @error. + * + * Note: callers should not manually break out of the loop using break or goto + * but must keep calling writeback_iter() until it returns %NULL. + * + * Return: the folio to write or %NULL if the loop is done. */ -int generic_writepages(struct address_space *mapping, - struct writeback_control *wbc) +struct folio *writeback_iter(struct address_space *mapping, + struct writeback_control *wbc, struct folio *folio, int *error) { - struct blk_plug plug; - int ret; + if (!folio) { + folio_batch_init(&wbc->fbatch); + wbc->saved_err = *error = 0; - /* deal with chardevs and other special file */ - if (!mapping->a_ops->writepage) - return 0; + /* + * For range cyclic writeback we remember where we stopped so + * that we can continue where we stopped. + * + * For non-cyclic writeback we always start at the beginning of + * the passed in range. + */ + if (wbc->range_cyclic) + wbc->index = mapping->writeback_index; + else + wbc->index = wbc->range_start >> PAGE_SHIFT; - blk_start_plug(&plug); - ret = write_cache_pages(mapping, wbc, __writepage, mapping); - blk_finish_plug(&plug); - return ret; -} + /* + * To avoid livelocks when other processes dirty new pages, we + * first tag pages which should be written back and only then + * start writing them. + * + * For data-integrity writeback we have to be careful so that we + * do not miss some pages (e.g., because some other process has + * cleared the TOWRITE tag we set). The rule we follow is that + * TOWRITE tag can be cleared only by the process clearing the + * DIRTY tag (and submitting the page for I/O). + */ + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + tag_pages_for_writeback(mapping, wbc->index, + wbc_end(wbc)); + } else { + wbc->nr_to_write -= folio_nr_pages(folio); -EXPORT_SYMBOL(generic_writepages); + WARN_ON_ONCE(*error > 0); + + /* + * For integrity writeback we have to keep going until we have + * written all the folios we tagged for writeback above, even if + * we run past wbc->nr_to_write or encounter errors. + * We stash away the first error we encounter in wbc->saved_err + * so that it can be retrieved when we're done. This is because + * the file system may still have state to clear for each folio. + * + * For background writeback we exit as soon as we run past + * wbc->nr_to_write or encounter the first error. + */ + if (wbc->sync_mode == WB_SYNC_ALL) { + if (*error && !wbc->saved_err) + wbc->saved_err = *error; + } else { + if (*error || wbc->nr_to_write <= 0) + goto done; + } + } + + folio = writeback_get_folio(mapping, wbc); + if (!folio) { + /* + * To avoid deadlocks between range_cyclic writeback and callers + * that hold folios in writeback to aggregate I/O until + * the writeback iteration finishes, we do not loop back to the + * start of the file. Doing so causes a folio lock/folio + * writeback access order inversion - we should only ever lock + * multiple folios in ascending folio->index order, and looping + * back to the start of the file violates that rule and causes + * deadlocks. + */ + if (wbc->range_cyclic) + mapping->writeback_index = 0; + + /* + * Return the first error we encountered (if there was any) to + * the caller. + */ + *error = wbc->saved_err; + } + return folio; + +done: + if (wbc->range_cyclic) + mapping->writeback_index = folio_next_index(folio); + folio_batch_release(&wbc->fbatch); + return NULL; +} +EXPORT_SYMBOL_GPL(writeback_iter); int do_writepages(struct address_space *mapping, struct writeback_control *wbc) { int ret; + struct bdi_writeback *wb; if (wbc->nr_to_write <= 0) return 0; + wb = inode_to_wb_wbc(mapping->host, wbc); + wb_bandwidth_estimate_start(wb); while (1) { if (mapping->a_ops->writepages) ret = mapping->a_ops->writepages(mapping, wbc); else - ret = generic_writepages(mapping, wbc); - if ((ret != -ENOMEM) || (wbc->sync_mode != WB_SYNC_ALL)) + /* deal with chardevs and other special files */ + ret = 0; + if (ret != -ENOMEM || wbc->sync_mode != WB_SYNC_ALL) break; - cond_resched(); - congestion_wait(BLK_RW_ASYNC, HZ/50); - } - return ret; -} - -/** - * write_one_page - write out a single page and wait on I/O - * @page: the page to write - * - * The page must be locked by the caller and will be unlocked upon return. - * - * Note that the mapping's AS_EIO/AS_ENOSPC flags will be cleared when this - * function returns. - */ -int write_one_page(struct page *page) -{ - struct address_space *mapping = page->mapping; - int ret = 0; - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = 1, - }; - - BUG_ON(!PageLocked(page)); - wait_on_page_writeback(page); - - if (clear_page_dirty_for_io(page)) { - get_page(page); - ret = mapping->a_ops->writepage(page, &wbc); - if (ret == 0) - wait_on_page_writeback(page); - put_page(page); - } else { - unlock_page(page); + /* + * Lacking an allocation context or the locality or writeback + * state of any of the inode's pages, throttle based on + * writeback activity on the local node. It's as good a + * guess as any. + */ + reclaim_throttle(NODE_DATA(numa_node_id()), + VMSCAN_THROTTLE_WRITEBACK); } - - if (!ret) - ret = filemap_check_errors(mapping); + /* + * Usually few pages are written by now from those we've just submitted + * but if there's constant writeback being submitted, this makes sure + * writeback bandwidth is updated once in a while. + */ + if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) + + BANDWIDTH_INTERVAL)) + wb_update_bandwidth(wb); return ret; } -EXPORT_SYMBOL(write_one_page); /* * For address_spaces which do not use buffers nor write back. */ -int __set_page_dirty_no_writeback(struct page *page) +bool noop_dirty_folio(struct address_space *mapping, struct folio *folio) { - if (!PageDirty(page)) - return !TestSetPageDirty(page); - return 0; + if (!folio_test_dirty(folio)) + return !folio_test_set_dirty(folio); + return false; } +EXPORT_SYMBOL(noop_dirty_folio); /* * Helper function for set_page_dirty family. * - * Caller must hold lock_page_memcg(). - * * NOTE: This relies on being atomic wrt interrupts. */ -void account_page_dirtied(struct page *page, struct address_space *mapping) +static void folio_account_dirtied(struct folio *folio, + struct address_space *mapping) { struct inode *inode = mapping->host; - trace_writeback_dirty_page(page, mapping); + trace_writeback_dirty_folio(folio, mapping); - if (mapping_cap_account_dirty(mapping)) { + if (mapping_can_writeback(mapping)) { struct bdi_writeback *wb; + long nr = folio_nr_pages(folio); - inode_attach_wb(inode, page); + inode_attach_wb(inode, folio); wb = inode_to_wb(inode); - __inc_lruvec_page_state(page, NR_FILE_DIRTY); - __inc_zone_page_state(page, NR_ZONE_WRITE_PENDING); - __inc_node_page_state(page, NR_DIRTIED); - inc_wb_stat(wb, WB_RECLAIMABLE); - inc_wb_stat(wb, WB_DIRTIED); - task_io_account_write(PAGE_SIZE); - current->nr_dirtied++; - this_cpu_inc(bdp_ratelimits); + lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, nr); + __zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr); + __node_stat_mod_folio(folio, NR_DIRTIED, nr); + wb_stat_mod(wb, WB_RECLAIMABLE, nr); + wb_stat_mod(wb, WB_DIRTIED, nr); + task_io_account_write(nr * PAGE_SIZE); + current->nr_dirtied += nr; + __this_cpu_add(bdp_ratelimits, nr); + + mem_cgroup_track_foreign_dirty(folio, wb); } } -EXPORT_SYMBOL(account_page_dirtied); /* * Helper function for deaccounting dirty page without writeback. * - * Caller must hold lock_page_memcg(). */ -void account_page_cleaned(struct page *page, struct address_space *mapping, - struct bdi_writeback *wb) -{ - if (mapping_cap_account_dirty(mapping)) { - dec_lruvec_page_state(page, NR_FILE_DIRTY); - dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); - dec_wb_stat(wb, WB_RECLAIMABLE); - task_io_account_cancelled_write(PAGE_SIZE); - } +void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb) +{ + long nr = folio_nr_pages(folio); + + lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr); + zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr); + wb_stat_mod(wb, WB_RECLAIMABLE, -nr); + task_io_account_cancelled_write(nr * PAGE_SIZE); } /* - * For address_spaces which do not use buffers. Just tag the page as dirty in - * the xarray. + * Mark the folio dirty, and set it dirty in the page cache. * - * This is also used when a single buffer is being dirtied: we want to set the - * page dirty in that case, but not all the buffers. This is a "bottom-up" - * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying. + * If warn is true, then emit a warning if the folio is not uptodate and has + * not been truncated. * - * The caller must ensure this doesn't race with truncation. Most will simply - * hold the page lock, but e.g. zap_pte_range() calls with the page mapped and - * the pte lock held, which also locks out truncation. + * It is the caller's responsibility to prevent the folio from being truncated + * while this function is in progress, although it may have been truncated + * before this function is called. Most callers have the folio locked. + * A few have the folio blocked from truncation through other means (e.g. + * zap_vma_pages() has it mapped and is holding the page table lock). + * When called from mark_buffer_dirty(), the filesystem should hold a + * reference to the buffer_head that is being marked dirty, which causes + * try_to_free_buffers() to fail. */ -int __set_page_dirty_nobuffers(struct page *page) +void __folio_mark_dirty(struct folio *folio, struct address_space *mapping, + int warn) { - lock_page_memcg(page); - if (!TestSetPageDirty(page)) { - struct address_space *mapping = page_mapping(page); - unsigned long flags; + unsigned long flags; - if (!mapping) { - unlock_page_memcg(page); - return 1; - } + /* + * Shmem writeback relies on swap, and swap writeback is LRU based, + * not using the dirty mark. + */ + VM_WARN_ON_ONCE(folio_test_swapcache(folio) || shmem_mapping(mapping)); + + xa_lock_irqsave(&mapping->i_pages, flags); + if (folio->mapping) { /* Race with truncate? */ + WARN_ON_ONCE(warn && !folio_test_uptodate(folio)); + folio_account_dirtied(folio, mapping); + __xa_set_mark(&mapping->i_pages, folio->index, + PAGECACHE_TAG_DIRTY); + } + xa_unlock_irqrestore(&mapping->i_pages, flags); +} - xa_lock_irqsave(&mapping->i_pages, flags); - BUG_ON(page_mapping(page) != mapping); - WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); - account_page_dirtied(page, mapping); - __xa_set_mark(&mapping->i_pages, page_index(page), - PAGECACHE_TAG_DIRTY); - xa_unlock_irqrestore(&mapping->i_pages, flags); - unlock_page_memcg(page); +/** + * filemap_dirty_folio - Mark a folio dirty for filesystems which do not use buffer_heads. + * @mapping: Address space this folio belongs to. + * @folio: Folio to be marked as dirty. + * + * Filesystems which do not use buffer heads should call this function + * from their dirty_folio address space operation. It ignores the + * contents of folio_get_private(), so if the filesystem marks individual + * blocks as dirty, the filesystem should handle that itself. + * + * This is also sometimes used by filesystems which use buffer_heads when + * a single buffer is being dirtied: we want to set the folio dirty in + * that case, but not all the buffers. This is a "bottom-up" dirtying, + * whereas block_dirty_folio() is a "top-down" dirtying. + * + * The caller must ensure this doesn't race with truncation. Most will + * simply hold the folio lock, but e.g. zap_pte_range() calls with the + * folio mapped and the pte lock held, which also locks out truncation. + */ +bool filemap_dirty_folio(struct address_space *mapping, struct folio *folio) +{ + if (folio_test_set_dirty(folio)) + return false; - if (mapping->host) { - /* !PageAnon && !swapper_space */ - __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); - } - return 1; + __folio_mark_dirty(folio, mapping, !folio_test_private(folio)); + + if (mapping->host) { + /* !PageAnon && !swapper_space */ + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } - unlock_page_memcg(page); - return 0; + return true; } -EXPORT_SYMBOL(__set_page_dirty_nobuffers); +EXPORT_SYMBOL(filemap_dirty_folio); -/* - * Call this whenever redirtying a page, to de-account the dirty counters - * (NR_DIRTIED, WB_DIRTIED, tsk->nr_dirtied), so that they match the written - * counters (NR_WRITTEN, WB_WRITTEN) in long term. The mismatches will lead to - * systematic errors in balanced_dirty_ratelimit and the dirty pages position - * control. +/** + * folio_redirty_for_writepage - Decline to write a dirty folio. + * @wbc: The writeback control. + * @folio: The folio. + * + * When a writepage implementation decides that it doesn't want to write + * @folio for some reason, it should call this function, unlock @folio and + * return 0. + * + * Return: True if we redirtied the folio. False if someone else dirtied + * it first. */ -void account_page_redirty(struct page *page) +bool folio_redirty_for_writepage(struct writeback_control *wbc, + struct folio *folio) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = folio->mapping; + long nr = folio_nr_pages(folio); + bool ret; - if (mapping && mapping_cap_account_dirty(mapping)) { + wbc->pages_skipped += nr; + ret = filemap_dirty_folio(mapping, folio); + if (mapping && mapping_can_writeback(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; struct wb_lock_cookie cookie = {}; wb = unlocked_inode_to_wb_begin(inode, &cookie); - current->nr_dirtied--; - dec_node_page_state(page, NR_DIRTIED); - dec_wb_stat(wb, WB_DIRTIED); + current->nr_dirtied -= nr; + node_stat_mod_folio(folio, NR_DIRTIED, -nr); + wb_stat_mod(wb, WB_DIRTIED, -nr); unlocked_inode_to_wb_end(inode, &cookie); } -} -EXPORT_SYMBOL(account_page_redirty); - -/* - * When a writepage implementation decides that it doesn't want to write this - * page for some reason, it should redirty the locked page via - * redirty_page_for_writepage() and it should then unlock the page and return 0 - */ -int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page) -{ - int ret; - - wbc->pages_skipped++; - ret = __set_page_dirty_nobuffers(page); - account_page_redirty(page); return ret; } -EXPORT_SYMBOL(redirty_page_for_writepage); +EXPORT_SYMBOL(folio_redirty_for_writepage); -/* - * Dirty a page. +/** + * folio_mark_dirty - Mark a folio as being modified. + * @folio: The folio. * - * For pages with a mapping this should be done under the page lock - * for the benefit of asynchronous memory errors who prefer a consistent - * dirty state. This rule can be broken in some special cases, - * but should be better not to. + * The folio may not be truncated while this function is running. + * Holding the folio lock is sufficient to prevent truncation, but some + * callers cannot acquire a sleeping lock. These callers instead hold + * the page table lock for a page table which contains at least one page + * in this folio. Truncation will block on the page table lock as it + * unmaps pages before removing the folio from its mapping. * - * If the mapping doesn't provide a set_page_dirty a_op, then - * just fall through and assume that it wants buffer_heads. + * Return: True if the folio was newly dirtied, false if it was already dirty. */ -int set_page_dirty(struct page *page) +bool folio_mark_dirty(struct folio *folio) { - struct address_space *mapping = page_mapping(page); + struct address_space *mapping = folio_mapping(folio); - page = compound_head(page); if (likely(mapping)) { - int (*spd)(struct page *) = mapping->a_ops->set_page_dirty; /* - * readahead/lru_deactivate_page could remain - * PG_readahead/PG_reclaim due to race with end_page_writeback - * About readahead, if the page is written, the flags would be + * readahead/folio_deactivate could remain + * PG_readahead/PG_reclaim due to race with folio_end_writeback + * About readahead, if the folio is written, the flags would be * reset. So no problem. - * About lru_deactivate_page, if the page is redirty, the flag - * will be reset. So no problem. but if the page is used by readahead - * it will confuse readahead and make it restart the size rampup - * process. But it's a trivial problem. + * About folio_deactivate, if the folio is redirtied, + * the flag will be reset. So no problem. but if the + * folio is used by readahead it will confuse readahead + * and make it restart the size rampup process. But it's + * a trivial problem. */ - if (PageReclaim(page)) - ClearPageReclaim(page); -#ifdef CONFIG_BLOCK - if (!spd) - spd = __set_page_dirty_buffers; -#endif - return (*spd)(page); + if (folio_test_reclaim(folio)) + folio_clear_reclaim(folio); + return mapping->a_ops->dirty_folio(mapping, folio); } - if (!PageDirty(page)) { - if (!TestSetPageDirty(page)) - return 1; - } - return 0; + + return noop_dirty_folio(mapping, folio); } -EXPORT_SYMBOL(set_page_dirty); +EXPORT_SYMBOL(folio_mark_dirty); /* - * set_page_dirty() is racy if the caller has no reference against - * page->mapping->host, and if the page is unlocked. This is because another - * CPU could truncate the page off the mapping and then free the mapping. + * folio_mark_dirty() is racy if the caller has no reference against + * folio->mapping->host, and if the folio is unlocked. This is because another + * CPU could truncate the folio off the mapping and then free the mapping. * - * Usually, the page _is_ locked, or the caller is a user-space process which + * Usually, the folio _is_ locked, or the caller is a user-space process which * holds a reference on the inode by having an open file. * - * In other cases, the page should be locked before running set_page_dirty(). + * In other cases, the folio should be locked before running folio_mark_dirty(). */ -int set_page_dirty_lock(struct page *page) +bool folio_mark_dirty_lock(struct folio *folio) { - int ret; + bool ret; - lock_page(page); - ret = set_page_dirty(page); - unlock_page(page); + folio_lock(folio); + ret = folio_mark_dirty(folio); + folio_unlock(folio); return ret; } -EXPORT_SYMBOL(set_page_dirty_lock); +EXPORT_SYMBOL(folio_mark_dirty_lock); /* * This cancels just the dirty bit on the kernel page itself, it does NOT @@ -2601,51 +2857,49 @@ EXPORT_SYMBOL(set_page_dirty_lock); * page without actually doing it through the VM. Can you say "ext3 is * horribly ugly"? Thought you could. */ -void __cancel_dirty_page(struct page *page) +void __folio_cancel_dirty(struct folio *folio) { - struct address_space *mapping = page_mapping(page); + struct address_space *mapping = folio_mapping(folio); - if (mapping_cap_account_dirty(mapping)) { + if (mapping_can_writeback(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; struct wb_lock_cookie cookie = {}; - lock_page_memcg(page); wb = unlocked_inode_to_wb_begin(inode, &cookie); - if (TestClearPageDirty(page)) - account_page_cleaned(page, mapping, wb); + if (folio_test_clear_dirty(folio)) + folio_account_cleaned(folio, wb); unlocked_inode_to_wb_end(inode, &cookie); - unlock_page_memcg(page); } else { - ClearPageDirty(page); + folio_clear_dirty(folio); } } -EXPORT_SYMBOL(__cancel_dirty_page); +EXPORT_SYMBOL(__folio_cancel_dirty); /* - * Clear a page's dirty flag, while caring for dirty memory accounting. - * Returns true if the page was previously dirty. - * - * This is for preparing to put the page under writeout. We leave the page - * tagged as dirty in the xarray so that a concurrent write-for-sync - * can discover it via a PAGECACHE_TAG_DIRTY walk. The ->writepage - * implementation will run either set_page_writeback() or set_page_dirty(), - * at which stage we bring the page's dirty flag and xarray dirty tag - * back into sync. - * - * This incoherency between the page's dirty flag and xarray tag is - * unfortunate, but it only exists while the page is locked. + * Clear a folio's dirty flag, while caring for dirty memory accounting. + * Returns true if the folio was previously dirty. + * + * This is for preparing to put the folio under writeout. We leave + * the folio tagged as dirty in the xarray so that a concurrent + * write-for-sync can discover it via a PAGECACHE_TAG_DIRTY walk. + * The ->writepage implementation will run either folio_start_writeback() + * or folio_mark_dirty(), at which stage we bring the folio's dirty flag + * and xarray dirty tag back into sync. + * + * This incoherency between the folio's dirty flag and xarray tag is + * unfortunate, but it only exists while the folio is locked. */ -int clear_page_dirty_for_io(struct page *page) +bool folio_clear_dirty_for_io(struct folio *folio) { - struct address_space *mapping = page_mapping(page); - int ret = 0; + struct address_space *mapping = folio_mapping(folio); + bool ret = false; - BUG_ON(!PageLocked(page)); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); - if (mapping && mapping_cap_account_dirty(mapping)) { + if (mapping && mapping_can_writeback(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; struct wb_lock_cookie cookie = {}; @@ -2656,161 +2910,228 @@ int clear_page_dirty_for_io(struct page *page) * We use this sequence to make sure that * (a) we account for dirty stats properly * (b) we tell the low-level filesystem to - * mark the whole page dirty if it was + * mark the whole folio dirty if it was * dirty in a pagetable. Only to then - * (c) clean the page again and return 1 to + * (c) clean the folio again and return 1 to * cause the writeback. * * This way we avoid all nasty races with the * dirty bit in multiple places and clearing * them concurrently from different threads. * - * Note! Normally the "set_page_dirty(page)" + * Note! Normally the "folio_mark_dirty(folio)" * has no effect on the actual dirty bit - since * that will already usually be set. But we * need the side effects, and it can help us * avoid races. * - * We basically use the page "master dirty bit" + * We basically use the folio "master dirty bit" * as a serialization point for all the different * threads doing their things. */ - if (page_mkclean(page)) - set_page_dirty(page); + if (folio_mkclean(folio)) + folio_mark_dirty(folio); /* * We carefully synchronise fault handlers against - * installing a dirty pte and marking the page dirty + * installing a dirty pte and marking the folio dirty * at this point. We do this by having them hold the - * page lock while dirtying the page, and pages are + * page lock while dirtying the folio, and folios are * always locked coming in here, so we get the desired * exclusion. */ wb = unlocked_inode_to_wb_begin(inode, &cookie); - if (TestClearPageDirty(page)) { - dec_lruvec_page_state(page, NR_FILE_DIRTY); - dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); - dec_wb_stat(wb, WB_RECLAIMABLE); - ret = 1; + if (folio_test_clear_dirty(folio)) { + long nr = folio_nr_pages(folio); + lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr); + zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr); + wb_stat_mod(wb, WB_RECLAIMABLE, -nr); + ret = true; } unlocked_inode_to_wb_end(inode, &cookie); return ret; } - return TestClearPageDirty(page); + return folio_test_clear_dirty(folio); } -EXPORT_SYMBOL(clear_page_dirty_for_io); +EXPORT_SYMBOL(folio_clear_dirty_for_io); -int test_clear_page_writeback(struct page *page) +static void wb_inode_writeback_start(struct bdi_writeback *wb) { - struct address_space *mapping = page_mapping(page); - struct mem_cgroup *memcg; - struct lruvec *lruvec; - int ret; + atomic_inc(&wb->writeback_inodes); +} + +static void wb_inode_writeback_end(struct bdi_writeback *wb) +{ + unsigned long flags; + atomic_dec(&wb->writeback_inodes); + /* + * Make sure estimate of writeback throughput gets updated after + * writeback completed. We delay the update by BANDWIDTH_INTERVAL + * (which is the interval other bandwidth updates use for batching) so + * that if multiple inodes end writeback at a similar time, they get + * batched into one bandwidth update. + */ + spin_lock_irqsave(&wb->work_lock, flags); + if (test_bit(WB_registered, &wb->state)) + queue_delayed_work(bdi_wq, &wb->bw_dwork, BANDWIDTH_INTERVAL); + spin_unlock_irqrestore(&wb->work_lock, flags); +} + +bool __folio_end_writeback(struct folio *folio) +{ + long nr = folio_nr_pages(folio); + struct address_space *mapping = folio_mapping(folio); + bool ret; - memcg = lock_page_memcg(page); - lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page)); if (mapping && mapping_use_writeback_tags(mapping)) { struct inode *inode = mapping->host; - struct backing_dev_info *bdi = inode_to_bdi(inode); + struct bdi_writeback *wb; unsigned long flags; xa_lock_irqsave(&mapping->i_pages, flags); - ret = TestClearPageWriteback(page); - if (ret) { - __xa_clear_mark(&mapping->i_pages, page_index(page), - PAGECACHE_TAG_WRITEBACK); - if (bdi_cap_account_writeback(bdi)) { - struct bdi_writeback *wb = inode_to_wb(inode); - - dec_wb_stat(wb, WB_WRITEBACK); - __wb_writeout_inc(wb); - } - } + ret = folio_xor_flags_has_waiters(folio, 1 << PG_writeback); + __xa_clear_mark(&mapping->i_pages, folio->index, + PAGECACHE_TAG_WRITEBACK); - if (mapping->host && !mapping_tagged(mapping, - PAGECACHE_TAG_WRITEBACK)) - sb_clear_inode_writeback(mapping->host); + wb = inode_to_wb(inode); + wb_stat_mod(wb, WB_WRITEBACK, -nr); + __wb_writeout_add(wb, nr); + if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) { + wb_inode_writeback_end(wb); + if (mapping->host) + sb_clear_inode_writeback(mapping->host); + } xa_unlock_irqrestore(&mapping->i_pages, flags); } else { - ret = TestClearPageWriteback(page); + ret = folio_xor_flags_has_waiters(folio, 1 << PG_writeback); } - /* - * NOTE: Page might be free now! Writeback doesn't hold a page - * reference on its own, it relies on truncation to wait for - * the clearing of PG_writeback. The below can only access - * page state that is static across allocation cycles. - */ - if (ret) { - dec_lruvec_state(lruvec, NR_WRITEBACK); - dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); - inc_node_page_state(page, NR_WRITTEN); - } - __unlock_page_memcg(memcg); + + lruvec_stat_mod_folio(folio, NR_WRITEBACK, -nr); + zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr); + node_stat_mod_folio(folio, NR_WRITTEN, nr); + return ret; } -int __test_set_page_writeback(struct page *page, bool keep_write) +void __folio_start_writeback(struct folio *folio, bool keep_write) { - struct address_space *mapping = page_mapping(page); - int ret; + long nr = folio_nr_pages(folio); + struct address_space *mapping = folio_mapping(folio); + int access_ret; + + VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); - lock_page_memcg(page); if (mapping && mapping_use_writeback_tags(mapping)) { - XA_STATE(xas, &mapping->i_pages, page_index(page)); + XA_STATE(xas, &mapping->i_pages, folio->index); struct inode *inode = mapping->host; - struct backing_dev_info *bdi = inode_to_bdi(inode); + struct bdi_writeback *wb; unsigned long flags; + bool on_wblist; xas_lock_irqsave(&xas, flags); xas_load(&xas); - ret = TestSetPageWriteback(page); - if (!ret) { - bool on_wblist; + folio_test_set_writeback(folio); - on_wblist = mapping_tagged(mapping, - PAGECACHE_TAG_WRITEBACK); - - xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK); - if (bdi_cap_account_writeback(bdi)) - inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK); + on_wblist = mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK); + xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK); + wb = inode_to_wb(inode); + wb_stat_mod(wb, WB_WRITEBACK, nr); + if (!on_wblist) { + wb_inode_writeback_start(wb); /* * We can come through here when swapping anonymous - * pages, so we don't necessarily have an inode to track - * for sync. + * folios, so we don't necessarily have an inode to + * track for sync. */ - if (mapping->host && !on_wblist) + if (mapping->host) sb_mark_inode_writeback(mapping->host); } - if (!PageDirty(page)) + + if (!folio_test_dirty(folio)) xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY); if (!keep_write) xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE); xas_unlock_irqrestore(&xas, flags); } else { - ret = TestSetPageWriteback(page); + folio_test_set_writeback(folio); } - if (!ret) { - inc_lruvec_page_state(page, NR_WRITEBACK); - inc_zone_page_state(page, NR_ZONE_WRITE_PENDING); + + lruvec_stat_mod_folio(folio, NR_WRITEBACK, nr); + zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr); + + access_ret = arch_make_folio_accessible(folio); + /* + * If writeback has been triggered on a page that cannot be made + * accessible, it is too late to recover here. + */ + VM_BUG_ON_FOLIO(access_ret != 0, folio); +} +EXPORT_SYMBOL(__folio_start_writeback); + +/** + * folio_wait_writeback - Wait for a folio to finish writeback. + * @folio: The folio to wait for. + * + * If the folio is currently being written back to storage, wait for the + * I/O to complete. + * + * Context: Sleeps. Must be called in process context and with + * no spinlocks held. Caller should hold a reference on the folio. + * If the folio is not locked, writeback may start again after writeback + * has finished. + */ +void folio_wait_writeback(struct folio *folio) +{ + while (folio_test_writeback(folio)) { + trace_folio_wait_writeback(folio, folio_mapping(folio)); + folio_wait_bit(folio, PG_writeback); + } +} +EXPORT_SYMBOL_GPL(folio_wait_writeback); + +/** + * folio_wait_writeback_killable - Wait for a folio to finish writeback. + * @folio: The folio to wait for. + * + * If the folio is currently being written back to storage, wait for the + * I/O to complete or a fatal signal to arrive. + * + * Context: Sleeps. Must be called in process context and with + * no spinlocks held. Caller should hold a reference on the folio. + * If the folio is not locked, writeback may start again after writeback + * has finished. + * Return: 0 on success, -EINTR if we get a fatal signal while waiting. + */ +int folio_wait_writeback_killable(struct folio *folio) +{ + while (folio_test_writeback(folio)) { + trace_folio_wait_writeback(folio, folio_mapping(folio)); + if (folio_wait_bit_killable(folio, PG_writeback)) + return -EINTR; } - unlock_page_memcg(page); - return ret; + return 0; } -EXPORT_SYMBOL(__test_set_page_writeback); +EXPORT_SYMBOL_GPL(folio_wait_writeback_killable); /** - * wait_for_stable_page() - wait for writeback to finish, if necessary. - * @page: The page to wait on. + * folio_wait_stable() - wait for writeback to finish, if necessary. + * @folio: The folio to wait on. + * + * This function determines if the given folio is related to a backing + * device that requires folio contents to be held stable during writeback. + * If so, then it will wait for any pending writeback to complete. * - * This function determines if the given page is related to a backing device - * that requires page contents to be held stable during writeback. If so, then - * it will wait for any pending writeback to complete. + * Context: Sleeps. Must be called in process context and with + * no spinlocks held. Caller should hold a reference on the folio. + * If the folio is not locked, writeback may start again after writeback + * has finished. */ -void wait_for_stable_page(struct page *page) +void folio_wait_stable(struct folio *folio) { - if (bdi_cap_stable_pages_required(inode_to_bdi(page->mapping->host))) - wait_on_page_writeback(page); + if (mapping_stable_writes(folio_mapping(folio))) + folio_wait_writeback(folio); } -EXPORT_SYMBOL_GPL(wait_for_stable_page); +EXPORT_SYMBOL_GPL(folio_wait_stable); |
