Diffstat (limited to 'mm/page-writeback.c')
| Mode | File | Lines changed |
| --- | --- | --- |
| -rw-r--r-- | mm/page-writeback.c | 3060 |
1 file changed, 1940 insertions(+), 1120 deletions(-)
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 4514ad7415c3..ccdeb0e84d39 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1,8 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * mm/page-writeback.c * * Copyright (C) 2002, Linus Torvalds. - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra * * Contains functions related to writing back dirty pages at the * address_space level. @@ -12,6 +13,7 @@ */ #include <linux/kernel.h> +#include <linux/math64.h> #include <linux/export.h> #include <linux/spinlock.h> #include <linux/fs.h> @@ -27,17 +29,20 @@ #include <linux/mpage.h> #include <linux/rmap.h> #include <linux/percpu.h> -#include <linux/notifier.h> #include <linux/smp.h> #include <linux/sysctl.h> #include <linux/cpu.h> #include <linux/syscalls.h> -#include <linux/buffer_head.h> /* __set_page_dirty_buffers */ #include <linux/pagevec.h> #include <linux/timer.h> #include <linux/sched/rt.h> +#include <linux/sched/signal.h> +#include <linux/mm_inline.h> +#include <linux/shmem_fs.h> #include <trace/events/writeback.h> +#include "internal.h" + /* * Sleep at most 200ms at a time in balance_dirty_pages(). */ @@ -50,7 +55,7 @@ #define DIRTY_POLL_THRESH (128 >> (PAGE_SHIFT - 10)) /* - * Estimate write bandwidth at 200ms intervals. + * Estimate write bandwidth or update dirty limit at 200ms intervals. */ #define BANDWIDTH_INTERVAL max(HZ/5, 1) @@ -67,30 +72,30 @@ static long ratelimit_pages = 32; /* * Start background writeback (via writeback threads) at this percentage */ -int dirty_background_ratio = 10; +static int dirty_background_ratio = 10; /* * dirty_background_bytes starts at 0 (disabled) so that it is a function of * dirty_background_ratio * the amount of dirtyable memory */ -unsigned long dirty_background_bytes; +static unsigned long dirty_background_bytes; /* * free highmem will not be subtracted from the total free memory * for calculating free ratios if vm_highmem_is_dirtyable is true */ -int vm_highmem_is_dirtyable; +static int vm_highmem_is_dirtyable; /* * The generator of dirty data starts writeback at this percentage */ -int vm_dirty_ratio = 20; +static int vm_dirty_ratio = 20; /* * vm_dirty_bytes starts at 0 (disabled) so that it is a function of * vm_dirty_ratio * the amount of dirtyable memory */ -unsigned long vm_dirty_bytes; +static unsigned long vm_dirty_bytes; /* * The interval between `kupdate'-style writebacks @@ -105,11 +110,6 @@ EXPORT_SYMBOL_GPL(dirty_writeback_interval); unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */ /* - * Flag that makes the machine dump writes/reads and block dirtyings. - */ -int block_dump; - -/* * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies: * a full sync is triggered after this time elapses without any disk activity. */ @@ -119,31 +119,7 @@ EXPORT_SYMBOL(laptop_mode); /* End of sysctl-exported parameters */ -unsigned long global_dirty_limit; - -/* - * Scale the writeback cache size proportional to the relative writeout speeds. - * - * We do this by keeping a floating proportion between BDIs, based on page - * writeback completions [end_page_writeback()]. Those devices that write out - * pages fastest will get the larger share, while the slower will get a smaller - * share. - * - * We use page writeout completions because we are interested in getting rid of - * dirty pages. Having them written out is the primary goal. 
- * - * We introduce a concept of time, a period over which we measure these events, - * because demand can/will vary over time. The length of this period itself is - * measured in page writeback completions. - * - */ -static struct fprop_global writeout_completions; - -static void writeout_period(unsigned long t); -/* Timer for aging of writeout_completions */ -static struct timer_list writeout_period_timer = - TIMER_DEFERRED_INITIALIZER(writeout_period, 0, 0); -static unsigned long writeout_period_time = 0; +struct wb_domain global_wb_domain; /* * Length of period for aging writeout fractions of bdis. This is an @@ -152,23 +128,101 @@ static unsigned long writeout_period_time = 0; */ #define VM_COMPLETIONS_PERIOD_LEN (3*HZ) -/* - * Work out the current dirty-memory clamping and background writeout - * thresholds. - * - * The main aim here is to lower them aggressively if there is a lot of mapped - * memory around. To avoid stressing page reclaim with lots of unreclaimable - * pages. It is better to clamp down on writers than to start swapping, and - * performing lots of scanning. - * - * We only allow 1/2 of the currently-unmapped memory to be dirtied. - * - * We don't permit the clamping level to fall below 5% - that is getting rather - * excessive. - * - * We make sure that the background writeout level is below the adjusted - * clamping level. - */ +#ifdef CONFIG_CGROUP_WRITEBACK + +#define GDTC_INIT(__wb) .wb = (__wb), \ + .dom = &global_wb_domain, \ + .wb_completions = &(__wb)->completions + +#define GDTC_INIT_NO_WB .dom = &global_wb_domain + +#define MDTC_INIT(__wb, __gdtc) .wb = (__wb), \ + .dom = mem_cgroup_wb_domain(__wb), \ + .wb_completions = &(__wb)->memcg_completions, \ + .gdtc = __gdtc + +static bool mdtc_valid(struct dirty_throttle_control *dtc) +{ + return dtc->dom; +} + +static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc) +{ + return dtc->dom; +} + +static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc) +{ + return mdtc->gdtc; +} + +static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb) +{ + return &wb->memcg_completions; +} + +static void wb_min_max_ratio(struct bdi_writeback *wb, + unsigned long *minp, unsigned long *maxp) +{ + unsigned long this_bw = READ_ONCE(wb->avg_write_bandwidth); + unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth); + unsigned long long min = wb->bdi->min_ratio; + unsigned long long max = wb->bdi->max_ratio; + + /* + * @wb may already be clean by the time control reaches here and + * the total may not include its bw. 
+ */ + if (this_bw < tot_bw) { + if (min) { + min *= this_bw; + min = div64_ul(min, tot_bw); + } + if (max < 100 * BDI_RATIO_SCALE) { + max *= this_bw; + max = div64_ul(max, tot_bw); + } + } + + *minp = min; + *maxp = max; +} + +#else /* CONFIG_CGROUP_WRITEBACK */ + +#define GDTC_INIT(__wb) .wb = (__wb), \ + .wb_completions = &(__wb)->completions +#define GDTC_INIT_NO_WB +#define MDTC_INIT(__wb, __gdtc) + +static bool mdtc_valid(struct dirty_throttle_control *dtc) +{ + return false; +} + +static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc) +{ + return &global_wb_domain; +} + +static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc) +{ + return NULL; +} + +static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb) +{ + return NULL; +} + +static void wb_min_max_ratio(struct bdi_writeback *wb, + unsigned long *minp, unsigned long *maxp) +{ + *minp = wb->bdi->min_ratio; + *maxp = wb->bdi->max_ratio; +} + +#endif /* CONFIG_CGROUP_WRITEBACK */ /* * In a memory zone, there is a certain amount of pages we consider @@ -178,7 +232,7 @@ static unsigned long writeout_period_time = 0; * requiring writeback. * * This number of dirtyable pages is the base value of which the - * user-configurable dirty ratio is the effictive number of pages that + * user-configurable dirty ratio is the effective number of pages that * are allowed to be actually dirtied. Per individual zone, or * globally by using the sum of dirtyable pages over all zones. * @@ -188,30 +242,67 @@ static unsigned long writeout_period_time = 0; * global dirtyable memory first. */ +/** + * node_dirtyable_memory - number of dirtyable pages in a node + * @pgdat: the node + * + * Return: the node's number of pages potentially available for dirty + * page cache. This is the base value for the per-node dirty limits. + */ +static unsigned long node_dirtyable_memory(struct pglist_data *pgdat) +{ + unsigned long nr_pages = 0; + int z; + + for (z = 0; z < MAX_NR_ZONES; z++) { + struct zone *zone = pgdat->node_zones + z; + + if (!populated_zone(zone)) + continue; + + nr_pages += zone_page_state(zone, NR_FREE_PAGES); + } + + /* + * Pages reserved for the kernel should not be considered + * dirtyable, to prevent a situation where reclaim has to + * clean pages in order to balance the zones. + */ + nr_pages -= min(nr_pages, pgdat->totalreserve_pages); + + nr_pages += node_page_state(pgdat, NR_INACTIVE_FILE); + nr_pages += node_page_state(pgdat, NR_ACTIVE_FILE); + + return nr_pages; +} + static unsigned long highmem_dirtyable_memory(unsigned long total) { #ifdef CONFIG_HIGHMEM int node; unsigned long x = 0; + int i; for_each_node_state(node, N_HIGH_MEMORY) { - struct zone *z = - &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; + for (i = ZONE_NORMAL + 1; i < MAX_NR_ZONES; i++) { + struct zone *z; + unsigned long nr_pages; + + if (!is_highmem_idx(i)) + continue; + + z = &NODE_DATA(node)->node_zones[i]; + if (!populated_zone(z)) + continue; - x += zone_page_state(z, NR_FREE_PAGES) + - zone_reclaimable_pages(z) - z->dirty_balance_reserve; + nr_pages = zone_page_state(z, NR_FREE_PAGES); + /* watch for underflows */ + nr_pages -= min(nr_pages, high_wmark_pages(z)); + nr_pages += zone_page_state(z, NR_ZONE_INACTIVE_FILE); + nr_pages += zone_page_state(z, NR_ZONE_ACTIVE_FILE); + x += nr_pages; + } } - /* - * Unreclaimable memory (kernel memory or anonymous memory - * without swap) can bring down the dirtyable pages below - * the zone's dirty balance reserve and the above calculation - * will underflow. 
However we still want to add in nodes - * which are below threshold (negative values) to get a more - * accurate calculation but make sure that the total never - * underflows. - */ - if ((long)x < 0) - x = 0; /* * Make sure that the number of highmem pages is never larger @@ -228,136 +319,173 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) /** * global_dirtyable_memory - number of globally dirtyable pages * - * Returns the global number of pages potentially available for dirty + * Return: the global number of pages potentially available for dirty * page cache. This is the base value for the global dirty limits. */ static unsigned long global_dirtyable_memory(void) { unsigned long x; - x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages(); - x -= min(x, dirty_balance_reserve); + x = global_zone_page_state(NR_FREE_PAGES); + /* + * Pages reserved for the kernel should not be considered + * dirtyable, to prevent a situation where reclaim has to + * clean pages in order to balance the zones. + */ + x -= min(x, totalreserve_pages); + + x += global_node_page_state(NR_INACTIVE_FILE); + x += global_node_page_state(NR_ACTIVE_FILE); if (!vm_highmem_is_dirtyable) x -= highmem_dirtyable_memory(x); - /* Subtract min_free_kbytes */ - x -= min_t(unsigned long, x, min_free_kbytes >> (PAGE_SHIFT - 10)); - return x + 1; /* Ensure that we never return 0 */ } -/* - * global_dirty_limits - background-writeback and dirty-throttling thresholds +/** + * domain_dirty_limits - calculate thresh and bg_thresh for a wb_domain + * @dtc: dirty_throttle_control of interest * - * Calculate the dirty thresholds based on sysctl parameters - * - vm.dirty_background_ratio or vm.dirty_background_bytes - * - vm.dirty_ratio or vm.dirty_bytes - * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and - * real-time tasks. + * Calculate @dtc->thresh and ->bg_thresh considering + * vm_dirty_{bytes|ratio} and dirty_background_{bytes|ratio}. The caller + * must ensure that @dtc->avail is set before calling this function. The + * dirty limits will be lifted by 1/4 for real-time tasks. */ -void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) +static void domain_dirty_limits(struct dirty_throttle_control *dtc) { - unsigned long background; - unsigned long dirty; - unsigned long uninitialized_var(available_memory); + const unsigned long available_memory = dtc->avail; + struct dirty_throttle_control *gdtc = mdtc_gdtc(dtc); + unsigned long bytes = vm_dirty_bytes; + unsigned long bg_bytes = dirty_background_bytes; + /* convert ratios to per-PAGE_SIZE for higher precision */ + unsigned long ratio = (vm_dirty_ratio * PAGE_SIZE) / 100; + unsigned long bg_ratio = (dirty_background_ratio * PAGE_SIZE) / 100; + unsigned long thresh; + unsigned long bg_thresh; struct task_struct *tsk; - if (!vm_dirty_bytes || !dirty_background_bytes) - available_memory = global_dirtyable_memory(); + /* gdtc is !NULL iff @dtc is for memcg domain */ + if (gdtc) { + unsigned long global_avail = gdtc->avail; - if (vm_dirty_bytes) - dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); + /* + * The byte settings can't be applied directly to memcg + * domains. Convert them to ratios by scaling against + * globally available memory. As the ratios are in + * per-PAGE_SIZE, they can be obtained by dividing bytes by + * number of pages. 
+ */ + if (bytes) + ratio = min(DIV_ROUND_UP(bytes, global_avail), + PAGE_SIZE); + if (bg_bytes) + bg_ratio = min(DIV_ROUND_UP(bg_bytes, global_avail), + PAGE_SIZE); + bytes = bg_bytes = 0; + } + + if (bytes) + thresh = DIV_ROUND_UP(bytes, PAGE_SIZE); else - dirty = (vm_dirty_ratio * available_memory) / 100; + thresh = (ratio * available_memory) / PAGE_SIZE; - if (dirty_background_bytes) - background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE); + if (bg_bytes) + bg_thresh = DIV_ROUND_UP(bg_bytes, PAGE_SIZE); else - background = (dirty_background_ratio * available_memory) / 100; + bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE; - if (background >= dirty) - background = dirty / 2; tsk = current; - if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { - background += background / 4; - dirty += dirty / 4; + if (rt_or_dl_task(tsk)) { + bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32; + thresh += thresh / 4 + global_wb_domain.dirty_limit / 32; } - *pbackground = background; - *pdirty = dirty; - trace_global_dirty_state(background, dirty); + /* + * Dirty throttling logic assumes the limits in page units fit into + * 32-bits. This gives 16TB dirty limits max which is hopefully enough. + */ + if (thresh > UINT_MAX) + thresh = UINT_MAX; + /* This makes sure bg_thresh is within 32-bits as well */ + if (bg_thresh >= thresh) + bg_thresh = thresh / 2; + dtc->thresh = thresh; + dtc->bg_thresh = bg_thresh; + + /* we should eventually report the domain in the TP */ + if (!gdtc) + trace_global_dirty_state(bg_thresh, thresh); } /** - * zone_dirtyable_memory - number of dirtyable pages in a zone - * @zone: the zone + * global_dirty_limits - background-writeback and dirty-throttling thresholds + * @pbackground: out parameter for bg_thresh + * @pdirty: out parameter for thresh * - * Returns the zone's number of pages potentially available for dirty - * page cache. This is the base value for the per-zone dirty limits. + * Calculate bg_thresh and thresh for global_wb_domain. See + * domain_dirty_limits() for details. */ -static unsigned long zone_dirtyable_memory(struct zone *zone) +void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) { - /* - * The effective global number of dirtyable pages may exclude - * highmem as a big-picture measure to keep the ratio between - * dirty memory and lowmem reasonable. - * - * But this function is purely about the individual zone and a - * highmem zone can hold its share of dirty pages, so we don't - * care about vm_highmem_is_dirtyable here. - */ - unsigned long nr_pages = zone_page_state(zone, NR_FREE_PAGES) + - zone_reclaimable_pages(zone); + struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB }; - /* don't allow this to underflow */ - nr_pages -= min(nr_pages, zone->dirty_balance_reserve); - return nr_pages; + gdtc.avail = global_dirtyable_memory(); + domain_dirty_limits(&gdtc); + + *pbackground = gdtc.bg_thresh; + *pdirty = gdtc.thresh; } /** - * zone_dirty_limit - maximum number of dirty pages allowed in a zone - * @zone: the zone + * node_dirty_limit - maximum number of dirty pages allowed in a node + * @pgdat: the node * - * Returns the maximum number of dirty pages allowed in a zone, based - * on the zone's dirtyable memory. + * Return: the maximum number of dirty pages allowed in a node, based + * on the node's dirtyable memory. 
*/ -static unsigned long zone_dirty_limit(struct zone *zone) +static unsigned long node_dirty_limit(struct pglist_data *pgdat) { - unsigned long zone_memory = zone_dirtyable_memory(zone); + unsigned long node_memory = node_dirtyable_memory(pgdat); struct task_struct *tsk = current; unsigned long dirty; if (vm_dirty_bytes) dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) * - zone_memory / global_dirtyable_memory(); + node_memory / global_dirtyable_memory(); else - dirty = vm_dirty_ratio * zone_memory / 100; + dirty = vm_dirty_ratio * node_memory / 100; - if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) + if (rt_or_dl_task(tsk)) dirty += dirty / 4; - return dirty; + /* + * Dirty throttling logic assumes the limits in page units fit into + * 32-bits. This gives 16TB dirty limits max which is hopefully enough. + */ + return min_t(unsigned long, dirty, UINT_MAX); } /** - * zone_dirty_ok - tells whether a zone is within its dirty limits - * @zone: the zone to check + * node_dirty_ok - tells whether a node is within its dirty limits + * @pgdat: the node to check * - * Returns %true when the dirty pages in @zone are within the zone's + * Return: %true when the dirty pages in @pgdat are within the node's * dirty limit, %false if the limit is exceeded. */ -bool zone_dirty_ok(struct zone *zone) +bool node_dirty_ok(struct pglist_data *pgdat) { - unsigned long limit = zone_dirty_limit(zone); + unsigned long limit = node_dirty_limit(pgdat); + unsigned long nr_pages = 0; - return zone_page_state(zone, NR_FILE_DIRTY) + - zone_page_state(zone, NR_UNSTABLE_NFS) + - zone_page_state(zone, NR_WRITEBACK) <= limit; + nr_pages += node_page_state(pgdat, NR_FILE_DIRTY); + nr_pages += node_page_state(pgdat, NR_WRITEBACK); + + return nr_pages <= limit; } -int dirty_background_ratio_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +#ifdef CONFIG_SYSCTL +static int dirty_background_ratio_handler(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -367,47 +495,56 @@ int dirty_background_ratio_handler(struct ctl_table *table, int write, return ret; } -int dirty_background_bytes_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +static int dirty_background_bytes_handler(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) { int ret; + unsigned long old_bytes = dirty_background_bytes; ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); - if (ret == 0 && write) + if (ret == 0 && write) { + if (DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE) > + UINT_MAX) { + dirty_background_bytes = old_bytes; + return -ERANGE; + } dirty_background_ratio = 0; + } return ret; } -int dirty_ratio_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +static int dirty_ratio_handler(const struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int old_ratio = vm_dirty_ratio; int ret; ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write && vm_dirty_ratio != old_ratio) { - writeback_set_ratelimit(); vm_dirty_bytes = 0; + writeback_set_ratelimit(); } return ret; } -int dirty_bytes_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +static int dirty_bytes_handler(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) { unsigned long old_bytes = vm_dirty_bytes; int ret; ret = 
proc_doulongvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write && vm_dirty_bytes != old_bytes) { + if (DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) > UINT_MAX) { + vm_dirty_bytes = old_bytes; + return -ERANGE; + } writeback_set_ratelimit(); vm_dirty_ratio = 0; } return ret; } +#endif static unsigned long wp_next_time(unsigned long cur_time) { @@ -418,70 +555,97 @@ static unsigned long wp_next_time(unsigned long cur_time) return cur_time; } -/* - * Increment the BDI's writeout completion count and the global writeout - * completion count. Called from test_clear_page_writeback(). - */ -static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) +static void wb_domain_writeout_add(struct wb_domain *dom, + struct fprop_local_percpu *completions, + unsigned int max_prop_frac, long nr) { - __inc_bdi_stat(bdi, BDI_WRITTEN); - __fprop_inc_percpu_max(&writeout_completions, &bdi->completions, - bdi->max_prop_frac); + __fprop_add_percpu_max(&dom->completions, completions, + max_prop_frac, nr); /* First event after period switching was turned off? */ - if (!unlikely(writeout_period_time)) { + if (unlikely(!dom->period_time)) { /* - * We can race with other __bdi_writeout_inc calls here but + * We can race with other wb_domain_writeout_add calls here but * it does not cause any harm since the resulting time when * timer will fire and what is in writeout_period_time will be * roughly the same. */ - writeout_period_time = wp_next_time(jiffies); - mod_timer(&writeout_period_timer, writeout_period_time); + dom->period_time = wp_next_time(jiffies); + mod_timer(&dom->period_timer, dom->period_time); } } -void bdi_writeout_inc(struct backing_dev_info *bdi) +/* + * Increment @wb's writeout completion count and the global writeout + * completion count. Called from __folio_end_writeback(). + */ +static inline void __wb_writeout_add(struct bdi_writeback *wb, long nr) { - unsigned long flags; + struct wb_domain *cgdom; - local_irq_save(flags); - __bdi_writeout_inc(bdi); - local_irq_restore(flags); + wb_stat_mod(wb, WB_WRITTEN, nr); + wb_domain_writeout_add(&global_wb_domain, &wb->completions, + wb->bdi->max_prop_frac, nr); + + cgdom = mem_cgroup_wb_domain(wb); + if (cgdom) + wb_domain_writeout_add(cgdom, wb_memcg_completions(wb), + wb->bdi->max_prop_frac, nr); } -EXPORT_SYMBOL_GPL(bdi_writeout_inc); -/* - * Obtain an accurate fraction of the BDI's portion. - */ -static void bdi_writeout_fraction(struct backing_dev_info *bdi, - long *numerator, long *denominator) +void wb_writeout_inc(struct bdi_writeback *wb) { - fprop_fraction_percpu(&writeout_completions, &bdi->completions, - numerator, denominator); + unsigned long flags; + + local_irq_save(flags); + __wb_writeout_add(wb, 1); + local_irq_restore(flags); } +EXPORT_SYMBOL_GPL(wb_writeout_inc); /* * On idle system, we can be called long after we scheduled because we use * deferred timers so count with missed periods. 
*/ -static void writeout_period(unsigned long t) +static void writeout_period(struct timer_list *t) { - int miss_periods = (jiffies - writeout_period_time) / + struct wb_domain *dom = timer_container_of(dom, t, period_timer); + int miss_periods = (jiffies - dom->period_time) / VM_COMPLETIONS_PERIOD_LEN; - if (fprop_new_period(&writeout_completions, miss_periods + 1)) { - writeout_period_time = wp_next_time(writeout_period_time + + if (fprop_new_period(&dom->completions, miss_periods + 1)) { + dom->period_time = wp_next_time(dom->period_time + miss_periods * VM_COMPLETIONS_PERIOD_LEN); - mod_timer(&writeout_period_timer, writeout_period_time); + mod_timer(&dom->period_timer, dom->period_time); } else { /* * Aging has zeroed all fractions. Stop wasting CPU on period * updates. */ - writeout_period_time = 0; + dom->period_time = 0; } } +int wb_domain_init(struct wb_domain *dom, gfp_t gfp) +{ + memset(dom, 0, sizeof(*dom)); + + spin_lock_init(&dom->lock); + + timer_setup(&dom->period_timer, writeout_period, TIMER_DEFERRABLE); + + dom->dirty_limit_tstamp = jiffies; + + return fprop_global_init(&dom->completions, gfp); +} + +#ifdef CONFIG_CGROUP_WRITEBACK +void wb_domain_exit(struct wb_domain *dom) +{ + timer_delete_sync(&dom->period_timer); + fprop_global_destroy(&dom->completions); +} +#endif + /* * bdi_min_ratio keeps the sum of the minimum dirty shares of all * registered backing devices, which, for obvious reasons, can not @@ -489,20 +653,66 @@ static void writeout_period(unsigned long t) */ static unsigned int bdi_min_ratio; -int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) +static int bdi_check_pages_limit(unsigned long pages) +{ + unsigned long max_dirty_pages = global_dirtyable_memory(); + + if (pages > max_dirty_pages) + return -EINVAL; + + return 0; +} + +static unsigned long bdi_ratio_from_pages(unsigned long pages) +{ + unsigned long background_thresh; + unsigned long dirty_thresh; + unsigned long ratio; + + global_dirty_limits(&background_thresh, &dirty_thresh); + if (!dirty_thresh) + return -EINVAL; + ratio = div64_u64(pages * 100ULL * BDI_RATIO_SCALE, dirty_thresh); + + return ratio; +} + +static u64 bdi_get_bytes(unsigned int ratio) +{ + unsigned long background_thresh; + unsigned long dirty_thresh; + u64 bytes; + + global_dirty_limits(&background_thresh, &dirty_thresh); + bytes = (dirty_thresh * PAGE_SIZE * ratio) / BDI_RATIO_SCALE / 100; + + return bytes; +} + +static int __bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) { + unsigned int delta; int ret = 0; + if (min_ratio > 100 * BDI_RATIO_SCALE) + return -EINVAL; + spin_lock_bh(&bdi_lock); if (min_ratio > bdi->max_ratio) { ret = -EINVAL; } else { - min_ratio -= bdi->min_ratio; - if (bdi_min_ratio + min_ratio < 100) { - bdi_min_ratio += min_ratio; - bdi->min_ratio += min_ratio; + if (min_ratio < bdi->min_ratio) { + delta = bdi->min_ratio - min_ratio; + bdi_min_ratio -= delta; + bdi->min_ratio = min_ratio; } else { - ret = -EINVAL; + delta = min_ratio - bdi->min_ratio; + if (bdi_min_ratio + delta < 100 * BDI_RATIO_SCALE) { + bdi_min_ratio += delta; + bdi->min_ratio = min_ratio; + } else { + ret = -EINVAL; + } } } spin_unlock_bh(&bdi_lock); @@ -510,11 +720,11 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) return ret; } -int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) +static int __bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio) { int ret = 0; - if (max_ratio > 100) + if (max_ratio > 100 * 
BDI_RATIO_SCALE) return -EINVAL; spin_lock_bh(&bdi_lock); @@ -522,66 +732,265 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) ret = -EINVAL; } else { bdi->max_ratio = max_ratio; - bdi->max_prop_frac = (FPROP_FRAC_BASE * max_ratio) / 100; + bdi->max_prop_frac = (FPROP_FRAC_BASE * max_ratio) / + (100 * BDI_RATIO_SCALE); } spin_unlock_bh(&bdi_lock); return ret; } + +int bdi_set_min_ratio_no_scale(struct backing_dev_info *bdi, unsigned int min_ratio) +{ + return __bdi_set_min_ratio(bdi, min_ratio); +} + +int bdi_set_max_ratio_no_scale(struct backing_dev_info *bdi, unsigned int max_ratio) +{ + return __bdi_set_max_ratio(bdi, max_ratio); +} + +int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) +{ + return __bdi_set_min_ratio(bdi, min_ratio * BDI_RATIO_SCALE); +} + +int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio) +{ + return __bdi_set_max_ratio(bdi, max_ratio * BDI_RATIO_SCALE); +} EXPORT_SYMBOL(bdi_set_max_ratio); +u64 bdi_get_min_bytes(struct backing_dev_info *bdi) +{ + return bdi_get_bytes(bdi->min_ratio); +} + +int bdi_set_min_bytes(struct backing_dev_info *bdi, u64 min_bytes) +{ + int ret; + unsigned long pages = min_bytes >> PAGE_SHIFT; + long min_ratio; + + ret = bdi_check_pages_limit(pages); + if (ret) + return ret; + + min_ratio = bdi_ratio_from_pages(pages); + if (min_ratio < 0) + return min_ratio; + return __bdi_set_min_ratio(bdi, min_ratio); +} + +u64 bdi_get_max_bytes(struct backing_dev_info *bdi) +{ + return bdi_get_bytes(bdi->max_ratio); +} + +int bdi_set_max_bytes(struct backing_dev_info *bdi, u64 max_bytes) +{ + int ret; + unsigned long pages = max_bytes >> PAGE_SHIFT; + long max_ratio; + + ret = bdi_check_pages_limit(pages); + if (ret) + return ret; + + max_ratio = bdi_ratio_from_pages(pages); + if (max_ratio < 0) + return max_ratio; + return __bdi_set_max_ratio(bdi, max_ratio); +} + +int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit) +{ + if (strict_limit > 1) + return -EINVAL; + + spin_lock_bh(&bdi_lock); + if (strict_limit) + bdi->capabilities |= BDI_CAP_STRICTLIMIT; + else + bdi->capabilities &= ~BDI_CAP_STRICTLIMIT; + spin_unlock_bh(&bdi_lock); + + return 0; +} + static unsigned long dirty_freerun_ceiling(unsigned long thresh, unsigned long bg_thresh) { return (thresh + bg_thresh) / 2; } -static unsigned long hard_dirty_limit(unsigned long thresh) +static unsigned long hard_dirty_limit(struct wb_domain *dom, + unsigned long thresh) { - return max(thresh, global_dirty_limit); + return max(thresh, dom->dirty_limit); +} + +/* + * Memory which can be further allocated to a memcg domain is capped by + * system-wide clean memory excluding the amount being used in the domain. + */ +static void mdtc_calc_avail(struct dirty_throttle_control *mdtc, + unsigned long filepages, unsigned long headroom) +{ + struct dirty_throttle_control *gdtc = mdtc_gdtc(mdtc); + unsigned long clean = filepages - min(filepages, mdtc->dirty); + unsigned long global_clean = gdtc->avail - min(gdtc->avail, gdtc->dirty); + unsigned long other_clean = global_clean - min(global_clean, clean); + + mdtc->avail = filepages + min(headroom, other_clean); +} + +static inline bool dtc_is_global(struct dirty_throttle_control *dtc) +{ + return mdtc_gdtc(dtc) == NULL; +} + +/* + * Dirty background will ignore pages being written as we're trying to + * decide whether to put more under writeback. 
+ */ +static void domain_dirty_avail(struct dirty_throttle_control *dtc, + bool include_writeback) +{ + if (dtc_is_global(dtc)) { + dtc->avail = global_dirtyable_memory(); + dtc->dirty = global_node_page_state(NR_FILE_DIRTY); + if (include_writeback) + dtc->dirty += global_node_page_state(NR_WRITEBACK); + } else { + unsigned long filepages = 0, headroom = 0, writeback = 0; + + mem_cgroup_wb_stats(dtc->wb, &filepages, &headroom, &dtc->dirty, + &writeback); + if (include_writeback) + dtc->dirty += writeback; + mdtc_calc_avail(dtc, filepages, headroom); + } } /** - * bdi_dirty_limit - @bdi's share of dirty throttling threshold - * @bdi: the backing_dev_info to query - * @dirty: global dirty limit in pages - * - * Returns @bdi's dirty limit in pages. The term "dirty" in the context of - * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages. + * __wb_calc_thresh - @wb's share of dirty threshold + * @dtc: dirty_throttle_context of interest + * @thresh: dirty throttling or dirty background threshold of wb_domain in @dtc * - * Note that balance_dirty_pages() will only seriously take it as a hard limit - * when sleeping max_pause per page is not enough to keep the dirty pages under - * control. For example, when the device is completely stalled due to some error - * conditions, or when there are 1000 dd tasks writing to a slow 10MB/s USB key. + * Note that balance_dirty_pages() will only seriously take dirty throttling + * threshold as a hard limit when sleeping max_pause per page is not enough + * to keep the dirty pages under control. For example, when the device is + * completely stalled due to some error conditions, or when there are 1000 + * dd tasks writing to a slow 10MB/s USB key. * In the other normal situations, it acts more gently by throttling the tasks - * more (rather than completely block them) when the bdi dirty pages go high. + * more (rather than completely block them) when the wb dirty pages go high. * * It allocates high/low dirty limits to fast/slow devices, in order to prevent * - starving fast devices * - piling up dirty pages (that will take long time to sync) on slow devices * - * The bdi's share of dirty limit will be adapting to its throughput and + * The wb's share of dirty limit will be adapting to its throughput and * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set. + * + * Return: @wb's dirty limit in pages. For dirty throttling limit, the term + * "dirty" in the context of dirty balancing includes all PG_dirty and + * PG_writeback pages. */ -unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty) +static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc, + unsigned long thresh) { - u64 bdi_dirty; - long numerator, denominator; + struct wb_domain *dom = dtc_dom(dtc); + struct bdi_writeback *wb = dtc->wb; + u64 wb_thresh; + u64 wb_max_thresh; + unsigned long numerator, denominator; + unsigned long wb_min_ratio, wb_max_ratio; /* - * Calculate this BDI's share of the dirty ratio. + * Calculate this wb's share of the thresh ratio. 
+ */ + fprop_fraction_percpu(&dom->completions, dtc->wb_completions, + &numerator, &denominator); + + wb_thresh = (thresh * (100 * BDI_RATIO_SCALE - bdi_min_ratio)) / (100 * BDI_RATIO_SCALE); + wb_thresh *= numerator; + wb_thresh = div64_ul(wb_thresh, denominator); + + wb_min_max_ratio(wb, &wb_min_ratio, &wb_max_ratio); + + wb_thresh += (thresh * wb_min_ratio) / (100 * BDI_RATIO_SCALE); + + /* + * It's very possible that wb_thresh is close to 0 not because the + * device is slow, but that it has remained inactive for long time. + * Honour such devices a reasonable good (hopefully IO efficient) + * threshold, so that the occasional writes won't be blocked and active + * writes can rampup the threshold quickly. */ - bdi_writeout_fraction(bdi, &numerator, &denominator); + if (thresh > dtc->dirty) { + if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) + wb_thresh = max(wb_thresh, (thresh - dtc->dirty) / 100); + else + wb_thresh = max(wb_thresh, (thresh - dtc->dirty) / 8); + } + + wb_max_thresh = thresh * wb_max_ratio / (100 * BDI_RATIO_SCALE); + if (wb_thresh > wb_max_thresh) + wb_thresh = wb_max_thresh; + + return wb_thresh; +} + +unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh) +{ + struct dirty_throttle_control gdtc = { GDTC_INIT(wb) }; + + domain_dirty_avail(&gdtc, true); + return __wb_calc_thresh(&gdtc, thresh); +} + +unsigned long cgwb_calc_thresh(struct bdi_writeback *wb) +{ + struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB }; + struct dirty_throttle_control mdtc = { MDTC_INIT(wb, &gdtc) }; - bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100; - bdi_dirty *= numerator; - do_div(bdi_dirty, denominator); + domain_dirty_avail(&gdtc, true); + domain_dirty_avail(&mdtc, true); + domain_dirty_limits(&mdtc); - bdi_dirty += (dirty * bdi->min_ratio) / 100; - if (bdi_dirty > (dirty * bdi->max_ratio) / 100) - bdi_dirty = dirty * bdi->max_ratio / 100; + return __wb_calc_thresh(&mdtc, mdtc.thresh); +} - return bdi_dirty; +/* + * setpoint - dirty 3 + * f(dirty) := 1.0 + (----------------) + * limit - setpoint + * + * it's a 3rd order polynomial that subjects to + * + * (1) f(freerun) = 2.0 => rampup dirty_ratelimit reasonably fast + * (2) f(setpoint) = 1.0 => the balance point + * (3) f(limit) = 0 => the hard limit + * (4) df/dx <= 0 => negative feedback control + * (5) the closer to setpoint, the smaller |df/dx| (and the reverse) + * => fast response on large errors; small oscillation near setpoint + */ +static long long pos_ratio_polynom(unsigned long setpoint, + unsigned long dirty, + unsigned long limit) +{ + long long pos_ratio; + long x; + + x = div64_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT, + (limit - setpoint) | 1); + pos_ratio = x; + pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; + pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; + pos_ratio += 1 << RATELIMIT_CALC_SHIFT; + + return clamp(pos_ratio, 0LL, 2LL << RATELIMIT_CALC_SHIFT); } /* @@ -589,7 +998,7 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty) * * (o) global/bdi setpoints * - * We want the dirty pages be balanced around the global/bdi setpoints. + * We want the dirty pages be balanced around the global/wb setpoints. * When the number of dirty pages is higher/lower than the setpoint, the * dirty position control ratio (and hence task dirty ratelimit) will be * decreased/increased to bring the dirty pages back to the setpoint. 
@@ -599,8 +1008,8 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty) * if (dirty < setpoint) scale up pos_ratio * if (dirty > setpoint) scale down pos_ratio * - * if (bdi_dirty < bdi_setpoint) scale up pos_ratio - * if (bdi_dirty > bdi_setpoint) scale down pos_ratio + * if (wb_dirty < wb_setpoint) scale up pos_ratio + * if (wb_dirty > wb_setpoint) scale down pos_ratio * * task_ratelimit = dirty_ratelimit * pos_ratio >> RATELIMIT_CALC_SHIFT * @@ -609,7 +1018,7 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty) * ^ pos_ratio * | * | |<===== global dirty control scope ======>| - * 2.0 .............* + * 2.0 * * * * * * * * | .* * | . * * | . * @@ -625,7 +1034,7 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty) * 0 +------------.------------------.----------------------*-------------> * freerun^ setpoint^ limit^ dirty pages * - * (o) bdi control line + * (o) wb control line * * ^ pos_ratio * | @@ -651,145 +1060,184 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty) * | . . * | . . * 0 +----------------------.-------------------------------.-------------> - * bdi_setpoint^ x_intercept^ + * wb_setpoint^ x_intercept^ * - * The bdi control line won't drop below pos_ratio=1/4, so that bdi_dirty can + * The wb control line won't drop below pos_ratio=1/4, so that wb_dirty can * be smoothly throttled down to normal if it starts high in situations like * - start writing to a slow SD card and a fast disk at the same time. The SD - * card's bdi_dirty may rush to many times higher than bdi_setpoint. - * - the bdi dirty thresh drops quickly due to change of JBOD workload + * card's wb_dirty may rush to many times higher than wb_setpoint. + * - the wb dirty thresh drops quickly due to change of JBOD workload */ -static unsigned long bdi_position_ratio(struct backing_dev_info *bdi, - unsigned long thresh, - unsigned long bg_thresh, - unsigned long dirty, - unsigned long bdi_thresh, - unsigned long bdi_dirty) -{ - unsigned long write_bw = bdi->avg_write_bandwidth; - unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh); - unsigned long limit = hard_dirty_limit(thresh); +static void wb_position_ratio(struct dirty_throttle_control *dtc) +{ + struct bdi_writeback *wb = dtc->wb; + unsigned long write_bw = READ_ONCE(wb->avg_write_bandwidth); + unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh); + unsigned long limit = dtc->limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh); + unsigned long wb_thresh = dtc->wb_thresh; unsigned long x_intercept; unsigned long setpoint; /* dirty pages' target balance point */ - unsigned long bdi_setpoint; + unsigned long wb_setpoint; unsigned long span; long long pos_ratio; /* for scaling up/down the rate limit */ long x; - if (unlikely(dirty >= limit)) - return 0; + dtc->pos_ratio = 0; + + if (unlikely(dtc->dirty >= limit)) + return; /* * global setpoint * - * setpoint - dirty 3 - * f(dirty) := 1.0 + (----------------) - * limit - setpoint + * See comment for pos_ratio_polynom(). + */ + setpoint = (freerun + limit) / 2; + pos_ratio = pos_ratio_polynom(setpoint, dtc->dirty, limit); + + /* + * The strictlimit feature is a tool preventing mistrusted filesystems + * from growing a large number of dirty pages before throttling. For + * such filesystems balance_dirty_pages always checks wb counters + * against wb limits. Even if global "nr_dirty" is under "freerun". 
+ * This is especially important for fuse which sets bdi->max_ratio to + * 1% by default. * - * it's a 3rd order polynomial that subjects to + * Here, in wb_position_ratio(), we calculate pos_ratio based on + * two values: wb_dirty and wb_thresh. Let's consider an example: + * total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global + * limits are set by default to 10% and 20% (background and throttle). + * Then wb_thresh is 1% of 20% of 16GB. This amounts to ~8K pages. + * wb_calc_thresh(wb, bg_thresh) is about ~4K pages. wb_setpoint is + * about ~6K pages (as the average of background and throttle wb + * limits). The 3rd order polynomial will provide positive feedback if + * wb_dirty is under wb_setpoint and vice versa. * - * (1) f(freerun) = 2.0 => rampup dirty_ratelimit reasonably fast - * (2) f(setpoint) = 1.0 => the balance point - * (3) f(limit) = 0 => the hard limit - * (4) df/dx <= 0 => negative feedback control - * (5) the closer to setpoint, the smaller |df/dx| (and the reverse) - * => fast response on large errors; small oscillation near setpoint + * Note, that we cannot use global counters in these calculations + * because we want to throttle process writing to a strictlimit wb + * much earlier than global "freerun" is reached (~23MB vs. ~2.3GB + * in the example above). */ - setpoint = (freerun + limit) / 2; - x = div_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT, - limit - setpoint + 1); - pos_ratio = x; - pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; - pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; - pos_ratio += 1 << RATELIMIT_CALC_SHIFT; + if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) { + long long wb_pos_ratio; + + if (dtc->wb_dirty >= wb_thresh) + return; + + wb_setpoint = dirty_freerun_ceiling(wb_thresh, + dtc->wb_bg_thresh); + + if (wb_setpoint == 0 || wb_setpoint == wb_thresh) + return; + + wb_pos_ratio = pos_ratio_polynom(wb_setpoint, dtc->wb_dirty, + wb_thresh); + + /* + * Typically, for strictlimit case, wb_setpoint << setpoint + * and pos_ratio >> wb_pos_ratio. In the other words global + * state ("dirty") is not limiting factor and we have to + * make decision based on wb counters. But there is an + * important case when global pos_ratio should get precedence: + * global limits are exceeded (e.g. due to activities on other + * wb's) while given strictlimit wb is below limit. + * + * "pos_ratio * wb_pos_ratio" would work for the case above, + * but it would look too non-natural for the case of all + * activity in the system coming from a single strictlimit wb + * with bdi->max_ratio == 100%. + * + * Note that min() below somewhat changes the dynamics of the + * control system. Normally, pos_ratio value can be well over 3 + * (when globally we are at freerun and wb is well below wb + * setpoint). Now the maximum pos_ratio in the same situation + * is 2. We might want to tweak this if we observe the control + * system is too slow to adapt. + */ + dtc->pos_ratio = min(pos_ratio, wb_pos_ratio); + return; + } /* * We have computed basic pos_ratio above based on global situation. If - * the bdi is over/under its share of dirty pages, we want to scale + * the wb is over/under its share of dirty pages, we want to scale * pos_ratio further down/up. That is done by the following mechanism. 
*/ /* - * bdi setpoint + * wb setpoint * - * f(bdi_dirty) := 1.0 + k * (bdi_dirty - bdi_setpoint) + * f(wb_dirty) := 1.0 + k * (wb_dirty - wb_setpoint) * - * x_intercept - bdi_dirty + * x_intercept - wb_dirty * := -------------------------- - * x_intercept - bdi_setpoint + * x_intercept - wb_setpoint * - * The main bdi control line is a linear function that subjects to + * The main wb control line is a linear function that subjects to * - * (1) f(bdi_setpoint) = 1.0 - * (2) k = - 1 / (8 * write_bw) (in single bdi case) - * or equally: x_intercept = bdi_setpoint + 8 * write_bw + * (1) f(wb_setpoint) = 1.0 + * (2) k = - 1 / (8 * write_bw) (in single wb case) + * or equally: x_intercept = wb_setpoint + 8 * write_bw * - * For single bdi case, the dirty pages are observed to fluctuate + * For single wb case, the dirty pages are observed to fluctuate * regularly within range - * [bdi_setpoint - write_bw/2, bdi_setpoint + write_bw/2] + * [wb_setpoint - write_bw/2, wb_setpoint + write_bw/2] * for various filesystems, where (2) can yield in a reasonable 12.5% * fluctuation range for pos_ratio. * - * For JBOD case, bdi_thresh (not bdi_dirty!) could fluctuate up to its + * For JBOD case, wb_thresh (not wb_dirty!) could fluctuate up to its * own size, so move the slope over accordingly and choose a slope that - * yields 100% pos_ratio fluctuation on suddenly doubled bdi_thresh. - */ - if (unlikely(bdi_thresh > thresh)) - bdi_thresh = thresh; - /* - * It's very possible that bdi_thresh is close to 0 not because the - * device is slow, but that it has remained inactive for long time. - * Honour such devices a reasonable good (hopefully IO efficient) - * threshold, so that the occasional writes won't be blocked and active - * writes can rampup the threshold quickly. + * yields 100% pos_ratio fluctuation on suddenly doubled wb_thresh. */ - bdi_thresh = max(bdi_thresh, (limit - dirty) / 8); + if (unlikely(wb_thresh > dtc->thresh)) + wb_thresh = dtc->thresh; /* - * scale global setpoint to bdi's: - * bdi_setpoint = setpoint * bdi_thresh / thresh + * scale global setpoint to wb's: + * wb_setpoint = setpoint * wb_thresh / thresh */ - x = div_u64((u64)bdi_thresh << 16, thresh + 1); - bdi_setpoint = setpoint * (u64)x >> 16; + x = div_u64((u64)wb_thresh << 16, dtc->thresh | 1); + wb_setpoint = setpoint * (u64)x >> 16; /* - * Use span=(8*write_bw) in single bdi case as indicated by - * (thresh - bdi_thresh ~= 0) and transit to bdi_thresh in JBOD case. + * Use span=(8*write_bw) in single wb case as indicated by + * (thresh - wb_thresh ~= 0) and transit to wb_thresh in JBOD case. 
* - * bdi_thresh thresh - bdi_thresh - * span = ---------- * (8 * write_bw) + ------------------- * bdi_thresh - * thresh thresh + * wb_thresh thresh - wb_thresh + * span = --------- * (8 * write_bw) + ------------------ * wb_thresh + * thresh thresh */ - span = (thresh - bdi_thresh + 8 * write_bw) * (u64)x >> 16; - x_intercept = bdi_setpoint + span; + span = (dtc->thresh - wb_thresh + 8 * write_bw) * (u64)x >> 16; + x_intercept = wb_setpoint + span; - if (bdi_dirty < x_intercept - span / 4) { - pos_ratio = div_u64(pos_ratio * (x_intercept - bdi_dirty), - x_intercept - bdi_setpoint + 1); + if (dtc->wb_dirty < x_intercept - span / 4) { + pos_ratio = div64_u64(pos_ratio * (x_intercept - dtc->wb_dirty), + (x_intercept - wb_setpoint) | 1); } else pos_ratio /= 4; /* - * bdi reserve area, safeguard against dirty pool underrun and disk idle + * wb reserve area, safeguard against dirty pool underrun and disk idle * It may push the desired control point of global dirty pages higher * than setpoint. */ - x_intercept = bdi_thresh / 2; - if (bdi_dirty < x_intercept) { - if (bdi_dirty > x_intercept / 8) - pos_ratio = div_u64(pos_ratio * x_intercept, bdi_dirty); + x_intercept = wb_thresh / 2; + if (dtc->wb_dirty < x_intercept) { + if (dtc->wb_dirty > x_intercept / 8) + pos_ratio = div_u64(pos_ratio * x_intercept, + dtc->wb_dirty); else pos_ratio *= 8; } - return pos_ratio; + dtc->pos_ratio = pos_ratio; } -static void bdi_update_write_bandwidth(struct backing_dev_info *bdi, - unsigned long elapsed, - unsigned long written) +static void wb_update_write_bandwidth(struct bdi_writeback *wb, + unsigned long elapsed, + unsigned long written) { const unsigned long period = roundup_pow_of_two(3 * HZ); - unsigned long avg = bdi->avg_write_bandwidth; - unsigned long old = bdi->write_bandwidth; + unsigned long avg = wb->avg_write_bandwidth; + unsigned long old = wb->write_bandwidth; u64 bw; /* @@ -798,15 +1246,18 @@ static void bdi_update_write_bandwidth(struct backing_dev_info *bdi, * bw * elapsed + write_bandwidth * (period - elapsed) * write_bandwidth = --------------------------------------------------- * period + * + * @written may have decreased due to folio_redirty_for_writepage(). + * Avoid underflowing @bw calculation. */ - bw = written - bdi->written_stamp; + bw = written - min(written, wb->written_stamp); bw *= HZ; if (unlikely(elapsed > period)) { - do_div(bw, elapsed); + bw = div64_ul(bw, elapsed); avg = bw; goto out; } - bw += (u64)bdi->write_bandwidth * (period - elapsed); + bw += (u64)wb->write_bandwidth * (period - elapsed); bw >>= ilog2(period); /* @@ -819,21 +1270,22 @@ static void bdi_update_write_bandwidth(struct backing_dev_info *bdi, avg += (old - avg) >> 3; out: - bdi->write_bandwidth = bw; - bdi->avg_write_bandwidth = avg; + /* keep avg > 0 to guarantee that tot > 0 if there are dirty wbs */ + avg = max(avg, 1LU); + if (wb_has_dirty_io(wb)) { + long delta = avg - wb->avg_write_bandwidth; + WARN_ON_ONCE(atomic_long_add_return(delta, + &wb->bdi->tot_write_bandwidth) <= 0); + } + wb->write_bandwidth = bw; + WRITE_ONCE(wb->avg_write_bandwidth, avg); } -/* - * The global dirtyable memory and dirty threshold could be suddenly knocked - * down by a large amount (eg. on the startup of KVM in a swapless system). - * This may throw the system into deep dirty exceeded state and throttle - * heavy/light dirtiers alike. To retain good responsiveness, maintain - * global_dirty_limit for tracking slowly down to the knocked down dirty - * threshold. 
- */ -static void update_dirty_limit(unsigned long thresh, unsigned long dirty) +static void update_dirty_limit(struct dirty_throttle_control *dtc) { - unsigned long limit = global_dirty_limit; + struct wb_domain *dom = dtc_dom(dtc); + unsigned long thresh = dtc->thresh; + unsigned long limit = dom->dirty_limit; /* * Follow up in one step. @@ -846,84 +1298,77 @@ static void update_dirty_limit(unsigned long thresh, unsigned long dirty) /* * Follow down slowly. Use the higher one as the target, because thresh * may drop below dirty. This is exactly the reason to introduce - * global_dirty_limit which is guaranteed to lie above the dirty pages. + * dom->dirty_limit which is guaranteed to lie above the dirty pages. */ - thresh = max(thresh, dirty); + thresh = max(thresh, dtc->dirty); if (limit > thresh) { limit -= (limit - thresh) >> 5; goto update; } return; update: - global_dirty_limit = limit; + dom->dirty_limit = limit; } -static void global_update_bandwidth(unsigned long thresh, - unsigned long dirty, - unsigned long now) +static void domain_update_dirty_limit(struct dirty_throttle_control *dtc, + unsigned long now) { - static DEFINE_SPINLOCK(dirty_lock); - static unsigned long update_time; + struct wb_domain *dom = dtc_dom(dtc); /* * check locklessly first to optimize away locking for the most time */ - if (time_before(now, update_time + BANDWIDTH_INTERVAL)) + if (time_before(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL)) return; - spin_lock(&dirty_lock); - if (time_after_eq(now, update_time + BANDWIDTH_INTERVAL)) { - update_dirty_limit(thresh, dirty); - update_time = now; + spin_lock(&dom->lock); + if (time_after_eq(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL)) { + update_dirty_limit(dtc); + dom->dirty_limit_tstamp = now; } - spin_unlock(&dirty_lock); + spin_unlock(&dom->lock); } /* - * Maintain bdi->dirty_ratelimit, the base dirty throttle rate. + * Maintain wb->dirty_ratelimit, the base dirty throttle rate. * - * Normal bdi tasks will be curbed at or below it in long term. + * Normal wb tasks will be curbed at or below it in long term. * Obviously it should be around (write_bw / N) when there are N dd tasks. */ -static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, - unsigned long thresh, - unsigned long bg_thresh, - unsigned long dirty, - unsigned long bdi_thresh, - unsigned long bdi_dirty, - unsigned long dirtied, - unsigned long elapsed) -{ - unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh); - unsigned long limit = hard_dirty_limit(thresh); +static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc, + unsigned long dirtied, + unsigned long elapsed) +{ + struct bdi_writeback *wb = dtc->wb; + unsigned long dirty = dtc->dirty; + unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh); + unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh); unsigned long setpoint = (freerun + limit) / 2; - unsigned long write_bw = bdi->avg_write_bandwidth; - unsigned long dirty_ratelimit = bdi->dirty_ratelimit; + unsigned long write_bw = wb->avg_write_bandwidth; + unsigned long dirty_ratelimit = wb->dirty_ratelimit; unsigned long dirty_rate; unsigned long task_ratelimit; unsigned long balanced_dirty_ratelimit; - unsigned long pos_ratio; unsigned long step; unsigned long x; + unsigned long shift; /* * The dirty rate will match the writeout rate in long term, except * when dirty pages are truncated by userspace or re-dirtied by FS. 
*/ - dirty_rate = (dirtied - bdi->dirtied_stamp) * HZ / elapsed; + dirty_rate = (dirtied - wb->dirtied_stamp) * HZ / elapsed; - pos_ratio = bdi_position_ratio(bdi, thresh, bg_thresh, dirty, - bdi_thresh, bdi_dirty); /* * task_ratelimit reflects each dd's dirty rate for the past 200ms. */ task_ratelimit = (u64)dirty_ratelimit * - pos_ratio >> RATELIMIT_CALC_SHIFT; + dtc->pos_ratio >> RATELIMIT_CALC_SHIFT; task_ratelimit++; /* it helps rampup dirty_ratelimit from tiny values */ /* * A linear estimation of the "balanced" throttle rate. The theory is, - * if there are N dd tasks, each throttled at task_ratelimit, the bdi's + * if there are N dd tasks, each throttled at task_ratelimit, the wb's * dirty_rate will be measured to be (N * task_ratelimit). So the below * formula will yield the balanced rate limit (write_bw / N). * @@ -962,7 +1407,7 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, /* * We could safely do this and return immediately: * - * bdi->dirty_ratelimit = balanced_dirty_ratelimit; + * wb->dirty_ratelimit = balanced_dirty_ratelimit; * * However to get a more stable dirty_ratelimit, the below elaborated * code makes use of task_ratelimit to filter out singular points and @@ -994,14 +1439,27 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, * keep that period small to reduce time lags). */ step = 0; + + /* + * For strictlimit case, calculations above were based on wb counters + * and limits (starting from pos_ratio = wb_position_ratio() and up to + * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate). + * Hence, to calculate "step" properly, we have to use wb_dirty as + * "dirty" and wb_setpoint as "setpoint". + */ + if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) { + dirty = dtc->wb_dirty; + setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2; + } + if (dirty < setpoint) { - x = min(bdi->balanced_dirty_ratelimit, - min(balanced_dirty_ratelimit, task_ratelimit)); + x = min3(wb->balanced_dirty_ratelimit, + balanced_dirty_ratelimit, task_ratelimit); if (dirty_ratelimit < x) step = x - dirty_ratelimit; } else { - x = max(bdi->balanced_dirty_ratelimit, - max(balanced_dirty_ratelimit, task_ratelimit)); + x = max3(wb->balanced_dirty_ratelimit, + balanced_dirty_ratelimit, task_ratelimit); if (dirty_ratelimit > x) step = dirty_ratelimit - x; } @@ -1011,80 +1469,89 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, * rate itself is constantly fluctuating. So decrease the track speed * when it gets close to the target. Helps eliminate pointless tremors. */ - step >>= dirty_ratelimit / (2 * step + 1); - /* - * Limit the tracking speed to avoid overshooting. 
- */ - step = (step + 7) / 8; + shift = dirty_ratelimit / (2 * step + 1); + if (shift < BITS_PER_LONG) + step = DIV_ROUND_UP(step >> shift, 8); + else + step = 0; if (dirty_ratelimit < balanced_dirty_ratelimit) dirty_ratelimit += step; else dirty_ratelimit -= step; - bdi->dirty_ratelimit = max(dirty_ratelimit, 1UL); - bdi->balanced_dirty_ratelimit = balanced_dirty_ratelimit; + WRITE_ONCE(wb->dirty_ratelimit, max(dirty_ratelimit, 1UL)); + wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit; - trace_bdi_dirty_ratelimit(bdi, dirty_rate, task_ratelimit); + trace_bdi_dirty_ratelimit(wb, dirty_rate, task_ratelimit); } -void __bdi_update_bandwidth(struct backing_dev_info *bdi, - unsigned long thresh, - unsigned long bg_thresh, - unsigned long dirty, - unsigned long bdi_thresh, - unsigned long bdi_dirty, - unsigned long start_time) +static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc, + struct dirty_throttle_control *mdtc, + bool update_ratelimit) { + struct bdi_writeback *wb = gdtc->wb; unsigned long now = jiffies; - unsigned long elapsed = now - bdi->bw_time_stamp; + unsigned long elapsed; unsigned long dirtied; unsigned long written; + spin_lock(&wb->list_lock); + /* - * rate-limit, only update once every 200ms. + * Lockless checks for elapsed time are racy and delayed update after + * IO completion doesn't do it at all (to make sure written pages are + * accounted reasonably quickly). Make sure elapsed >= 1 to avoid + * division errors. */ - if (elapsed < BANDWIDTH_INTERVAL) - return; + elapsed = max(now - wb->bw_time_stamp, 1UL); + dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]); + written = percpu_counter_read(&wb->stat[WB_WRITTEN]); - dirtied = percpu_counter_read(&bdi->bdi_stat[BDI_DIRTIED]); - written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]); + if (update_ratelimit) { + domain_update_dirty_limit(gdtc, now); + wb_update_dirty_ratelimit(gdtc, dirtied, elapsed); - /* - * Skip quiet periods when disk bandwidth is under-utilized. - * (at least 1s idle time between two flusher runs) - */ - if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time)) - goto snapshot; - - if (thresh) { - global_update_bandwidth(thresh, dirty, now); - bdi_update_dirty_ratelimit(bdi, thresh, bg_thresh, dirty, - bdi_thresh, bdi_dirty, - dirtied, elapsed); + /* + * @mdtc is always NULL if !CGROUP_WRITEBACK but the + * compiler has no way to figure that out. Help it. 
+ */ + if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) && mdtc) { + domain_update_dirty_limit(mdtc, now); + wb_update_dirty_ratelimit(mdtc, dirtied, elapsed); + } } - bdi_update_write_bandwidth(bdi, elapsed, written); + wb_update_write_bandwidth(wb, elapsed, written); -snapshot: - bdi->dirtied_stamp = dirtied; - bdi->written_stamp = written; - bdi->bw_time_stamp = now; + wb->dirtied_stamp = dirtied; + wb->written_stamp = written; + WRITE_ONCE(wb->bw_time_stamp, now); + spin_unlock(&wb->list_lock); } -static void bdi_update_bandwidth(struct backing_dev_info *bdi, - unsigned long thresh, - unsigned long bg_thresh, - unsigned long dirty, - unsigned long bdi_thresh, - unsigned long bdi_dirty, - unsigned long start_time) +void wb_update_bandwidth(struct bdi_writeback *wb) { - if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL)) - return; - spin_lock(&bdi->wb.list_lock); - __bdi_update_bandwidth(bdi, thresh, bg_thresh, dirty, - bdi_thresh, bdi_dirty, start_time); - spin_unlock(&bdi->wb.list_lock); + struct dirty_throttle_control gdtc = { GDTC_INIT(wb) }; + + __wb_update_bandwidth(&gdtc, NULL, false); +} + +/* Interval after which we consider wb idle and don't estimate bandwidth */ +#define WB_BANDWIDTH_IDLE_JIF (HZ) + +static void wb_bandwidth_estimate_start(struct bdi_writeback *wb) +{ + unsigned long now = jiffies; + unsigned long elapsed = now - READ_ONCE(wb->bw_time_stamp); + + if (elapsed > WB_BANDWIDTH_IDLE_JIF && + !atomic_read(&wb->writeback_inodes)) { + spin_lock(&wb->list_lock); + wb->dirtied_stamp = wb_stat(wb, WB_DIRTIED); + wb->written_stamp = wb_stat(wb, WB_WRITTEN); + WRITE_ONCE(wb->bw_time_stamp, now); + spin_unlock(&wb->list_lock); + } } /* @@ -1092,7 +1559,7 @@ static void bdi_update_bandwidth(struct backing_dev_info *bdi, * will look to see if it needs to start dirty throttling. * * If dirty_poll_interval is too low, big NUMA machines will call the expensive - * global_page_state() too often. So scale it near-sqrt to the safety margin + * global_zone_page_state() too often. So scale it near-sqrt to the safety margin * (the number of pages we may dirty without exceeding the dirty limits). */ static unsigned long dirty_poll_interval(unsigned long dirty, @@ -1104,11 +1571,11 @@ static unsigned long dirty_poll_interval(unsigned long dirty, return 1; } -static long bdi_max_pause(struct backing_dev_info *bdi, - unsigned long bdi_dirty) +static unsigned long wb_max_pause(struct bdi_writeback *wb, + unsigned long wb_dirty) { - long bw = bdi->avg_write_bandwidth; - long t; + unsigned long bw = READ_ONCE(wb->avg_write_bandwidth); + unsigned long t; /* * Limit pause time for small memory systems. If sleeping for too long @@ -1117,20 +1584,20 @@ static long bdi_max_pause(struct backing_dev_info *bdi, * * 8 serves as the safety ratio. 
*/ - t = bdi_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8)); + t = wb_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8)); t++; - return min_t(long, t, MAX_PAUSE); + return min_t(unsigned long, t, MAX_PAUSE); } -static long bdi_min_pause(struct backing_dev_info *bdi, - long max_pause, - unsigned long task_ratelimit, - unsigned long dirty_ratelimit, - int *nr_dirtied_pause) +static long wb_min_pause(struct bdi_writeback *wb, + long max_pause, + unsigned long task_ratelimit, + unsigned long dirty_ratelimit, + int *nr_dirtied_pause) { - long hi = ilog2(bdi->avg_write_bandwidth); - long lo = ilog2(bdi->dirty_ratelimit); + long hi = ilog2(READ_ONCE(wb->avg_write_bandwidth)); + long lo = ilog2(READ_ONCE(wb->dirty_ratelimit)); long t; /* target pause */ long pause; /* estimated next pause */ int pages; /* target nr_dirtied_pause */ @@ -1198,6 +1665,141 @@ static long bdi_min_pause(struct backing_dev_info *bdi, return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t; } +static inline void wb_dirty_limits(struct dirty_throttle_control *dtc) +{ + struct bdi_writeback *wb = dtc->wb; + unsigned long wb_reclaimable; + + /* + * wb_thresh is not treated as some limiting factor as + * dirty_thresh, due to reasons + * - in JBOD setup, wb_thresh can fluctuate a lot + * - in a system with HDD and USB key, the USB key may somehow + * go into state (wb_dirty >> wb_thresh) either because + * wb_dirty starts high, or because wb_thresh drops low. + * In this case we don't want to hard throttle the USB key + * dirtiers for 100 seconds until wb_dirty drops under + * wb_thresh. Instead the auxiliary wb control line in + * wb_position_ratio() will let the dirtier task progress + * at some rate <= (write_bw / 2) for bringing down wb_dirty. + */ + dtc->wb_thresh = __wb_calc_thresh(dtc, dtc->thresh); + dtc->wb_bg_thresh = dtc->thresh ? + div_u64((u64)dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0; + + /* + * In order to avoid the stacked BDI deadlock we need + * to ensure we accurately count the 'dirty' pages when + * the threshold is low. + * + * Otherwise it would be possible to get thresh+n pages + * reported dirty, even though there are thresh-m pages + * actually dirty; with m+n sitting in the percpu + * deltas. + */ + if (dtc->wb_thresh < 2 * wb_stat_error()) { + wb_reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE); + dtc->wb_dirty = wb_reclaimable + wb_stat_sum(wb, WB_WRITEBACK); + } else { + wb_reclaimable = wb_stat(wb, WB_RECLAIMABLE); + dtc->wb_dirty = wb_reclaimable + wb_stat(wb, WB_WRITEBACK); + } +} + +static unsigned long domain_poll_intv(struct dirty_throttle_control *dtc, + bool strictlimit) +{ + unsigned long dirty, thresh; + + if (strictlimit) { + dirty = dtc->wb_dirty; + thresh = dtc->wb_thresh; + } else { + dirty = dtc->dirty; + thresh = dtc->thresh; + } + + return dirty_poll_interval(dirty, thresh); +} + +/* + * Throttle it only when the background writeback cannot catch-up. This avoids + * (excessively) small writeouts when the wb limits are ramping up in case of + * !strictlimit. + * + * In strictlimit case make decision based on the wb counters and limits. Small + * writeouts when the wb limits are ramping up are the price we consciously pay + * for strictlimit-ing. 
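
As a point of reference (not part of this patch), the freerun test used by domain_dirty_freerun() below reduces to comparing the dirty count against the midpoint of the background and hard thresholds. A minimal sketch, assuming dirty_freerun_ceiling() keeps its usual midpoint definition elsewhere in this file:

	/* Illustrative only: "freerun" means dirty is below the midpoint. */
	static bool dirty_is_freerun(unsigned long dirty, unsigned long thresh,
				     unsigned long bg_thresh)
	{
		return dirty <= (thresh + bg_thresh) / 2;
	}
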
+ */ +static void domain_dirty_freerun(struct dirty_throttle_control *dtc, + bool strictlimit) +{ + unsigned long dirty, thresh, bg_thresh; + + if (unlikely(strictlimit)) { + wb_dirty_limits(dtc); + dirty = dtc->wb_dirty; + thresh = dtc->wb_thresh; + bg_thresh = dtc->wb_bg_thresh; + } else { + dirty = dtc->dirty; + thresh = dtc->thresh; + bg_thresh = dtc->bg_thresh; + } + dtc->freerun = dirty <= dirty_freerun_ceiling(thresh, bg_thresh); +} + +static void balance_domain_limits(struct dirty_throttle_control *dtc, + bool strictlimit) +{ + domain_dirty_avail(dtc, true); + domain_dirty_limits(dtc); + domain_dirty_freerun(dtc, strictlimit); +} + +static void wb_dirty_freerun(struct dirty_throttle_control *dtc, + bool strictlimit) +{ + dtc->freerun = false; + + /* was already handled in domain_dirty_freerun */ + if (strictlimit) + return; + + wb_dirty_limits(dtc); + /* + * LOCAL_THROTTLE tasks must not be throttled when below the per-wb + * freerun ceiling. + */ + if (!(current->flags & PF_LOCAL_THROTTLE)) + return; + + dtc->freerun = dtc->wb_dirty < + dirty_freerun_ceiling(dtc->wb_thresh, dtc->wb_bg_thresh); +} + +static inline void wb_dirty_exceeded(struct dirty_throttle_control *dtc, + bool strictlimit) +{ + dtc->dirty_exceeded = (dtc->wb_dirty > dtc->wb_thresh) && + ((dtc->dirty > dtc->thresh) || strictlimit); +} + +/* + * The limits fields dirty_exceeded and pos_ratio won't be updated if wb is + * in freerun state. Please don't use these invalid fields in freerun case. + */ +static void balance_wb_limits(struct dirty_throttle_control *dtc, + bool strictlimit) +{ + wb_dirty_freerun(dtc, strictlimit); + if (dtc->freerun) + return; + + wb_dirty_exceeded(dtc, strictlimit); + wb_position_ratio(dtc); +} + /* * balance_dirty_pages() must be called by processes which are generating dirty * data. It looks at the number of dirty pages in the machine and will force @@ -1205,116 +1807,118 @@ static long bdi_min_pause(struct backing_dev_info *bdi, * If we're over `background_thresh' then the writeback threads are woken to * perform some writeout. */ -static void balance_dirty_pages(struct address_space *mapping, - unsigned long pages_dirtied) -{ - unsigned long nr_reclaimable; /* = file_dirty + unstable_nfs */ - unsigned long bdi_reclaimable; - unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */ - unsigned long bdi_dirty; - unsigned long freerun; - unsigned long background_thresh; - unsigned long dirty_thresh; - unsigned long bdi_thresh; +static int balance_dirty_pages(struct bdi_writeback *wb, + unsigned long pages_dirtied, unsigned int flags) +{ + struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) }; + struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) }; + struct dirty_throttle_control * const gdtc = &gdtc_stor; + struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ? 
+ &mdtc_stor : NULL; + struct dirty_throttle_control *sdtc; + unsigned long nr_dirty; long period; long pause; long max_pause; long min_pause; int nr_dirtied_pause; - bool dirty_exceeded = false; unsigned long task_ratelimit; unsigned long dirty_ratelimit; - unsigned long pos_ratio; - struct backing_dev_info *bdi = mapping->backing_dev_info; + struct backing_dev_info *bdi = wb->bdi; + bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT; unsigned long start_time = jiffies; + int ret = 0; for (;;) { unsigned long now = jiffies; + nr_dirty = global_node_page_state(NR_FILE_DIRTY); + + balance_domain_limits(gdtc, strictlimit); + if (mdtc) { + /* + * If @wb belongs to !root memcg, repeat the same + * basic calculations for the memcg domain. + */ + balance_domain_limits(mdtc, strictlimit); + } + /* - * Unstable writes are a feature of certain networked - * filesystems (i.e. NFS) in which data may have been - * written to the server's write cache, but has not yet - * been flushed to permanent storage. + * In laptop mode, we wait until hitting the higher threshold + * before starting background writeout, and then write out all + * the way down to the lower threshold. So slow writers cause + * minimal disk activity. + * + * In normal mode, we start background writeout at the lower + * background_thresh, to keep the amount of dirty memory low. */ - nr_reclaimable = global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS); - nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK); - - global_dirty_limits(&background_thresh, &dirty_thresh); + if (!laptop_mode && nr_dirty > gdtc->bg_thresh && + !writeback_in_progress(wb)) + wb_start_background_writeback(wb); /* - * Throttle it only when the background writeback cannot - * catch-up. This avoids (excessively) small writeouts - * when the bdi limits are ramping up. + * If memcg domain is in effect, @dirty should be under + * both global and memcg freerun ceilings. */ - freerun = dirty_freerun_ceiling(dirty_thresh, - background_thresh); - if (nr_dirty <= freerun) { + if (gdtc->freerun && (!mdtc || mdtc->freerun)) { + unsigned long intv; + unsigned long m_intv; + +free_running: + intv = domain_poll_intv(gdtc, strictlimit); + m_intv = ULONG_MAX; + current->dirty_paused_when = now; current->nr_dirtied = 0; - current->nr_dirtied_pause = - dirty_poll_interval(nr_dirty, dirty_thresh); + if (mdtc) + m_intv = domain_poll_intv(mdtc, strictlimit); + current->nr_dirtied_pause = min(intv, m_intv); break; } - if (unlikely(!writeback_in_progress(bdi))) - bdi_start_background_writeback(bdi); + /* Start writeback even when in laptop mode */ + if (unlikely(!writeback_in_progress(wb))) + wb_start_background_writeback(wb); - /* - * bdi_thresh is not treated as some limiting factor as - * dirty_thresh, due to reasons - * - in JBOD setup, bdi_thresh can fluctuate a lot - * - in a system with HDD and USB key, the USB key may somehow - * go into state (bdi_dirty >> bdi_thresh) either because - * bdi_dirty starts high, or because bdi_thresh drops low. - * In this case we don't want to hard throttle the USB key - * dirtiers for 100 seconds until bdi_dirty drops under - * bdi_thresh. Instead the auxiliary bdi control line in - * bdi_position_ratio() will let the dirtier task progress - * at some rate <= (write_bw / 2) for bringing down bdi_dirty. 
- */ - bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); + mem_cgroup_flush_foreign(wb); /* - * In order to avoid the stacked BDI deadlock we need - * to ensure we accurately count the 'dirty' pages when - * the threshold is low. - * - * Otherwise it would be possible to get thresh+n pages - * reported dirty, even though there are thresh-m pages - * actually dirty; with m+n sitting in the percpu - * deltas. + * Calculate global domain's pos_ratio and select the + * global dtc by default. */ - if (bdi_thresh < 2 * bdi_stat_error(bdi)) { - bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); - bdi_dirty = bdi_reclaimable + - bdi_stat_sum(bdi, BDI_WRITEBACK); - } else { - bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); - bdi_dirty = bdi_reclaimable + - bdi_stat(bdi, BDI_WRITEBACK); - } + balance_wb_limits(gdtc, strictlimit); + if (gdtc->freerun) + goto free_running; + sdtc = gdtc; - dirty_exceeded = (bdi_dirty > bdi_thresh) && - (nr_dirty > dirty_thresh); - if (dirty_exceeded && !bdi->dirty_exceeded) - bdi->dirty_exceeded = 1; + if (mdtc) { + /* + * If memcg domain is in effect, calculate its + * pos_ratio. @wb should satisfy constraints from + * both global and memcg domains. Choose the one + * w/ lower pos_ratio. + */ + balance_wb_limits(mdtc, strictlimit); + if (mdtc->freerun) + goto free_running; + if (mdtc->pos_ratio < gdtc->pos_ratio) + sdtc = mdtc; + } - bdi_update_bandwidth(bdi, dirty_thresh, background_thresh, - nr_dirty, bdi_thresh, bdi_dirty, - start_time); + wb->dirty_exceeded = gdtc->dirty_exceeded || + (mdtc && mdtc->dirty_exceeded); + if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) + + BANDWIDTH_INTERVAL)) + __wb_update_bandwidth(gdtc, mdtc, true); - dirty_ratelimit = bdi->dirty_ratelimit; - pos_ratio = bdi_position_ratio(bdi, dirty_thresh, - background_thresh, nr_dirty, - bdi_thresh, bdi_dirty); - task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >> + /* throttle according to the chosen dtc */ + dirty_ratelimit = READ_ONCE(wb->dirty_ratelimit); + task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >> RATELIMIT_CALC_SHIFT; - max_pause = bdi_max_pause(bdi, bdi_dirty); - min_pause = bdi_min_pause(bdi, max_pause, - task_ratelimit, dirty_ratelimit, - &nr_dirtied_pause); + max_pause = wb_max_pause(wb, sdtc->wb_dirty); + min_pause = wb_min_pause(wb, max_pause, + task_ratelimit, dirty_ratelimit, + &nr_dirtied_pause); if (unlikely(task_ratelimit == 0)) { period = max_pause; @@ -1333,12 +1937,8 @@ static void balance_dirty_pages(struct address_space *mapping, * do a reset, as it may be a light dirtier. */ if (pause < min_pause) { - trace_balance_dirty_pages(bdi, - dirty_thresh, - background_thresh, - nr_dirty, - bdi_thresh, - bdi_dirty, + trace_balance_dirty_pages(wb, + sdtc, dirty_ratelimit, task_ratelimit, pages_dirtied, @@ -1362,19 +1962,20 @@ static void balance_dirty_pages(struct address_space *mapping, } pause: - trace_balance_dirty_pages(bdi, - dirty_thresh, - background_thresh, - nr_dirty, - bdi_thresh, - bdi_dirty, + trace_balance_dirty_pages(wb, + sdtc, dirty_ratelimit, task_ratelimit, pages_dirtied, period, pause, start_time); + if (flags & BDP_ASYNC) { + ret = -EAGAIN; + break; + } __set_current_state(TASK_KILLABLE); + bdi->last_bdp_sleep = jiffies; io_schedule_timeout(pause); current->dirty_paused_when = now + pause; @@ -1382,58 +1983,29 @@ pause: current->nr_dirtied_pause = nr_dirtied_pause; /* - * This is typically equal to (nr_dirty < dirty_thresh) and can - * also keep "1000+ dd on a slow USB stick" under control. 
+ * This is typically equal to (dirty < thresh) and can also + * keep "1000+ dd on a slow USB stick" under control. */ if (task_ratelimit) break; /* - * In the case of an unresponding NFS server and the NFS dirty - * pages exceeds dirty_thresh, give the other good bdi's a pipe + * In the case of an unresponsive NFS server and the NFS dirty + * pages exceeds dirty_thresh, give the other good wb's a pipe * to go through, so that tasks on them still remain responsive. * - * In theory 1 page is enough to keep the comsumer-producer + * In theory 1 page is enough to keep the consumer-producer * pipe going: the flusher cleans 1 page => the task dirties 1 - * more page. However bdi_dirty has accounting errors. So use - * the larger and more IO friendly bdi_stat_error. + * more page. However wb_dirty has accounting errors. So use + * the larger and more IO friendly wb_stat_error. */ - if (bdi_dirty <= bdi_stat_error(bdi)) + if (sdtc->wb_dirty <= wb_stat_error()) break; if (fatal_signal_pending(current)) break; } - - if (!dirty_exceeded && bdi->dirty_exceeded) - bdi->dirty_exceeded = 0; - - if (writeback_in_progress(bdi)) - return; - - /* - * In laptop mode, we wait until hitting the higher threshold before - * starting background writeout, and then write out all the way down - * to the lower threshold. So slow writers cause minimal disk activity. - * - * In normal mode, we start background writeout at the lower - * background_thresh, to keep the amount of dirty memory low. - */ - if (laptop_mode) - return; - - if (nr_reclaimable > background_thresh) - bdi_start_background_writeback(bdi); -} - -void set_page_dirty_balance(struct page *page, int page_mkwrite) -{ - if (set_page_dirty(page) || page_mkwrite) { - struct address_space *mapping = page_mapping(page); - - if (mapping) - balance_dirty_pages_ratelimited(mapping); - } + return ret; } static DEFINE_PER_CPU(int, bdp_ratelimits); @@ -1455,29 +2027,42 @@ static DEFINE_PER_CPU(int, bdp_ratelimits); DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0; /** - * balance_dirty_pages_ratelimited - balance dirty memory state - * @mapping: address_space which was dirtied + * balance_dirty_pages_ratelimited_flags - Balance dirty memory state. + * @mapping: address_space which was dirtied. + * @flags: BDP flags. * * Processes which are dirtying memory should call in here once for each page * which was newly dirtied. The function will periodically check the system's * dirty state and will initiate writeback if needed. * - * On really big machines, get_writeback_state is expensive, so try to avoid - * calling it too often (ratelimiting). But once we're over the dirty memory - * limit we decrease the ratelimiting by a lot, to prevent individual processes - * from overshooting the limit by (ratelimit_pages) each. + * See balance_dirty_pages_ratelimited() for details. + * + * Return: If @flags contains BDP_ASYNC, it may return -EAGAIN to + * indicate that memory is out of balance and the caller must wait + * for I/O to complete. Otherwise, it will return 0 to indicate + * that either memory was already in balance, or it was able to sleep + * until the amount of dirty memory returned to balance. 
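
To make the -EAGAIN contract documented above concrete, here is a rough sketch of how a non-blocking buffered-write path might call the helper. The surrounding function and its nowait handling are assumptions for illustration, not code from this patch:

	/* Hedged sketch: punt back to a sleepable context on -EAGAIN. */
	static int myfs_balance_dirty(struct address_space *mapping, bool nowait)
	{
		unsigned int flags = nowait ? BDP_ASYNC : 0;
		int ret;

		ret = balance_dirty_pages_ratelimited_flags(mapping, flags);
		if (ret == -EAGAIN)
			return -EAGAIN;	/* caller retries from a context that can sleep */
		return 0;
	}
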
*/ -void balance_dirty_pages_ratelimited(struct address_space *mapping) +int balance_dirty_pages_ratelimited_flags(struct address_space *mapping, + unsigned int flags) { - struct backing_dev_info *bdi = mapping->backing_dev_info; + struct inode *inode = mapping->host; + struct backing_dev_info *bdi = inode_to_bdi(inode); + struct bdi_writeback *wb = NULL; int ratelimit; + int ret = 0; int *p; - if (!bdi_cap_account_dirty(bdi)) - return; + if (!(bdi->capabilities & BDI_CAP_WRITEBACK)) + return ret; + + if (inode_cgwb_enabled(inode)) + wb = wb_get_create_current(bdi, GFP_KERNEL); + if (!wb) + wb = &bdi->wb; ratelimit = current->nr_dirtied_pause; - if (bdi->dirty_exceeded) + if (wb->dirty_exceeded) ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10)); preempt_disable(); @@ -1487,7 +2072,7 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping) * 1000+ tasks, all of them start dirtying pages at exactly the same * time, hence all honoured too large initial task->nr_dirtied_pause. */ - p = &__get_cpu_var(bdp_ratelimits); + p = this_cpu_ptr(&bdp_ratelimits); if (unlikely(current->nr_dirtied >= ratelimit)) *p = 0; else if (unlikely(*p >= ratelimit_pages)) { @@ -1499,7 +2084,7 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping) * short-lived tasks (eg. gcc invocations in a kernel build) escaping * the dirty throttling and livelock other long-run dirtiers. */ - p = &__get_cpu_var(dirty_throttle_leaks); + p = this_cpu_ptr(&dirty_throttle_leaks); if (*p > 0 && current->nr_dirtied < ratelimit) { unsigned long nr_pages_dirtied; nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied); @@ -1509,64 +2094,116 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping) preempt_enable(); if (unlikely(current->nr_dirtied >= ratelimit)) - balance_dirty_pages(mapping, current->nr_dirtied); + ret = balance_dirty_pages(wb, current->nr_dirtied, flags); + + wb_put(wb); + return ret; +} +EXPORT_SYMBOL_GPL(balance_dirty_pages_ratelimited_flags); + +/** + * balance_dirty_pages_ratelimited - balance dirty memory state. + * @mapping: address_space which was dirtied. + * + * Processes which are dirtying memory should call in here once for each page + * which was newly dirtied. The function will periodically check the system's + * dirty state and will initiate writeback if needed. + * + * Once we're over the dirty memory limit we decrease the ratelimiting + * by a lot, to prevent individual processes from overshooting the limit + * by (ratelimit_pages) each. + */ +void balance_dirty_pages_ratelimited(struct address_space *mapping) +{ + balance_dirty_pages_ratelimited_flags(mapping, 0); } EXPORT_SYMBOL(balance_dirty_pages_ratelimited); -void throttle_vm_writeout(gfp_t gfp_mask) +/* + * Similar to wb_dirty_limits, wb_bg_dirty_limits also calculates dirty + * and thresh, but it's for background writeback. 
+ */ +static void wb_bg_dirty_limits(struct dirty_throttle_control *dtc) { - unsigned long background_thresh; - unsigned long dirty_thresh; + struct bdi_writeback *wb = dtc->wb; - for ( ; ; ) { - global_dirty_limits(&background_thresh, &dirty_thresh); - dirty_thresh = hard_dirty_limit(dirty_thresh); + dtc->wb_bg_thresh = __wb_calc_thresh(dtc, dtc->bg_thresh); + if (dtc->wb_bg_thresh < 2 * wb_stat_error()) + dtc->wb_dirty = wb_stat_sum(wb, WB_RECLAIMABLE); + else + dtc->wb_dirty = wb_stat(wb, WB_RECLAIMABLE); +} - /* - * Boost the allowable dirty threshold a bit for page - * allocators so they don't get DoS'ed by heavy writers - */ - dirty_thresh += dirty_thresh / 10; /* wheeee... */ +static bool domain_over_bg_thresh(struct dirty_throttle_control *dtc) +{ + domain_dirty_avail(dtc, false); + domain_dirty_limits(dtc); + if (dtc->dirty > dtc->bg_thresh) + return true; - if (global_page_state(NR_UNSTABLE_NFS) + - global_page_state(NR_WRITEBACK) <= dirty_thresh) - break; - congestion_wait(BLK_RW_ASYNC, HZ/10); + wb_bg_dirty_limits(dtc); + if (dtc->wb_dirty > dtc->wb_bg_thresh) + return true; - /* - * The caller might hold locks which can prevent IO completion - * or progress in the filesystem. So we cannot just sit here - * waiting for IO to complete. - */ - if ((gfp_mask & (__GFP_FS|__GFP_IO)) != (__GFP_FS|__GFP_IO)) - break; - } + return false; } -/* - * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs +/** + * wb_over_bg_thresh - does @wb need to be written back? + * @wb: bdi_writeback of interest + * + * Determines whether background writeback should keep writing @wb or it's + * clean enough. + * + * Return: %true if writeback should continue. */ -int dirty_writeback_centisecs_handler(ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) +bool wb_over_bg_thresh(struct bdi_writeback *wb) { - proc_dointvec(table, write, buffer, length, ppos); - return 0; + struct dirty_throttle_control gdtc = { GDTC_INIT(wb) }; + struct dirty_throttle_control mdtc = { MDTC_INIT(wb, &gdtc) }; + + if (domain_over_bg_thresh(&gdtc)) + return true; + + if (mdtc_valid(&mdtc)) + return domain_over_bg_thresh(&mdtc); + + return false; } -#ifdef CONFIG_BLOCK -void laptop_mode_timer_fn(unsigned long data) +#ifdef CONFIG_SYSCTL +/* + * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs + */ +static int dirty_writeback_centisecs_handler(const struct ctl_table *table, int write, + void *buffer, size_t *length, loff_t *ppos) { - struct request_queue *q = (struct request_queue *)data; - int nr_pages = global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS); + unsigned int old_interval = dirty_writeback_interval; + int ret; + + ret = proc_dointvec(table, write, buffer, length, ppos); /* - * We want to write everything out, not just down to the dirty - * threshold + * Writing 0 to dirty_writeback_interval will disable periodic writeback + * and a different non-zero value will wakeup the writeback threads. + * wb_wakeup_delayed() would be more appropriate, but it's a pain to + * iterate over all bdis and wbs. + * The reason we do this is to make the change take effect immediately. 
*/ - if (bdi_has_dirty_io(&q->backing_dev_info)) - bdi_start_writeback(&q->backing_dev_info, nr_pages, - WB_REASON_LAPTOP_TIMER); + if (!ret && write && dirty_writeback_interval && + dirty_writeback_interval != old_interval) + wakeup_flusher_threads(WB_REASON_PERIODIC); + + return ret; +} +#endif + +void laptop_mode_timer_fn(struct timer_list *t) +{ + struct backing_dev_info *backing_dev_info = + timer_container_of(backing_dev_info, t, laptop_mode_wb_timer); + + wakeup_flusher_threads_bdi(backing_dev_info, WB_REASON_LAPTOP_TIMER); } /* @@ -1591,17 +2228,14 @@ void laptop_sync_completion(void) rcu_read_lock(); list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) - del_timer(&bdi->laptop_mode_wb_timer); + timer_delete(&bdi->laptop_mode_wb_timer); rcu_read_unlock(); } -#endif /* * If ratelimit_pages is too high then we can get into dirty-data overload * if a large number of processes all perform writes at the same time. - * If it is too low then SMP machines will call the (expensive) - * get_writeback_state too often. * * Here we set ratelimit_pages to a level which ensures that when all CPUs are * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory @@ -1610,46 +2244,108 @@ void laptop_sync_completion(void) void writeback_set_ratelimit(void) { + struct wb_domain *dom = &global_wb_domain; unsigned long background_thresh; unsigned long dirty_thresh; + global_dirty_limits(&background_thresh, &dirty_thresh); - global_dirty_limit = dirty_thresh; + dom->dirty_limit = dirty_thresh; ratelimit_pages = dirty_thresh / (num_online_cpus() * 32); if (ratelimit_pages < 16) ratelimit_pages = 16; } -static int __cpuinit -ratelimit_handler(struct notifier_block *self, unsigned long action, - void *hcpu) +static int page_writeback_cpu_online(unsigned int cpu) { - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_ONLINE: - case CPU_DEAD: - writeback_set_ratelimit(); - return NOTIFY_OK; - default: - return NOTIFY_DONE; - } + writeback_set_ratelimit(); + return 0; } -static struct notifier_block __cpuinitdata ratelimit_nb = { - .notifier_call = ratelimit_handler, - .next = NULL, +#ifdef CONFIG_SYSCTL + +/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ +static const unsigned long dirty_bytes_min = 2 * PAGE_SIZE; + +static const struct ctl_table vm_page_writeback_sysctls[] = { + { + .procname = "dirty_background_ratio", + .data = &dirty_background_ratio, + .maxlen = sizeof(dirty_background_ratio), + .mode = 0644, + .proc_handler = dirty_background_ratio_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, + }, + { + .procname = "dirty_background_bytes", + .data = &dirty_background_bytes, + .maxlen = sizeof(dirty_background_bytes), + .mode = 0644, + .proc_handler = dirty_background_bytes_handler, + .extra1 = SYSCTL_LONG_ONE, + }, + { + .procname = "dirty_ratio", + .data = &vm_dirty_ratio, + .maxlen = sizeof(vm_dirty_ratio), + .mode = 0644, + .proc_handler = dirty_ratio_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, + }, + { + .procname = "dirty_bytes", + .data = &vm_dirty_bytes, + .maxlen = sizeof(vm_dirty_bytes), + .mode = 0644, + .proc_handler = dirty_bytes_handler, + .extra1 = (void *)&dirty_bytes_min, + }, + { + .procname = "dirty_writeback_centisecs", + .data = &dirty_writeback_interval, + .maxlen = sizeof(dirty_writeback_interval), + .mode = 0644, + .proc_handler = dirty_writeback_centisecs_handler, + }, + { + .procname = "dirty_expire_centisecs", + .data = &dirty_expire_interval, + .maxlen = sizeof(dirty_expire_interval), + .mode = 
0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, +#ifdef CONFIG_HIGHMEM + { + .procname = "highmem_is_dirtyable", + .data = &vm_highmem_is_dirtyable, + .maxlen = sizeof(vm_highmem_is_dirtyable), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif + { + .procname = "laptop_mode", + .data = &laptop_mode, + .maxlen = sizeof(laptop_mode), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, }; +#endif /* * Called early on to tune the page writeback dirty limits. * * We used to scale dirty pages according to how total memory - * related to pages that could be allocated for buffers (by - * comparing nr_free_buffer_pages() to vm_total_pages. + * related to pages that could be allocated for buffers. * * However, that was when we used "dirty_ratio" to scale with * all memory, and we don't do that any more. "dirty_ratio" - * is now applied to total non-HIGHPAGE memory (by subtracting - * totalhigh_pages from vm_total_pages), and as such we can't + * is now applied to total non-HIGHPAGE memory, and as such we can't * get into the old insane situation any more where we had * large amounts of dirty pages compared to a small amount of * non-HIGHMEM memory. @@ -1659,659 +2355,783 @@ static struct notifier_block __cpuinitdata ratelimit_nb = { */ void __init page_writeback_init(void) { - writeback_set_ratelimit(); - register_cpu_notifier(&ratelimit_nb); - - fprop_global_init(&writeout_completions); + BUG_ON(wb_domain_init(&global_wb_domain, GFP_KERNEL)); + + cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mm/writeback:online", + page_writeback_cpu_online, NULL); + cpuhp_setup_state(CPUHP_MM_WRITEBACK_DEAD, "mm/writeback:dead", NULL, + page_writeback_cpu_online); +#ifdef CONFIG_SYSCTL + register_sysctl_init("vm", vm_page_writeback_sysctls); +#endif } /** - * tag_pages_for_writeback - tag pages to be written by write_cache_pages + * tag_pages_for_writeback - tag pages to be written by writeback * @mapping: address space structure to write * @start: starting page index * @end: ending page index (inclusive) * * This function scans the page range from @start to @end (inclusive) and tags - * all pages that have DIRTY tag set with a special TOWRITE tag. The idea is - * that write_cache_pages (or whoever calls this function) will then use - * TOWRITE tag to identify pages eligible for writeback. This mechanism is - * used to avoid livelocking of writeback by a process steadily creating new - * dirty pages in the file (thus it is important for this function to be quick - * so that it can tag pages faster than a dirtying process can create them). - */ -/* - * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce tree_lock latency. + * all pages that have DIRTY tag set with a special TOWRITE tag. The caller + * can then use the TOWRITE tag to identify pages eligible for writeback. + * This mechanism is used to avoid livelocking of writeback by a process + * steadily creating new dirty pages in the file (thus it is important for this + * function to be quick so that it can tag pages faster than a dirtying process + * can create them). 
*/ void tag_pages_for_writeback(struct address_space *mapping, pgoff_t start, pgoff_t end) { -#define WRITEBACK_TAG_BATCH 4096 - unsigned long tagged; - - do { - spin_lock_irq(&mapping->tree_lock); - tagged = radix_tree_range_tag_if_tagged(&mapping->page_tree, - &start, end, WRITEBACK_TAG_BATCH, - PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE); - spin_unlock_irq(&mapping->tree_lock); - WARN_ON_ONCE(tagged > WRITEBACK_TAG_BATCH); + XA_STATE(xas, &mapping->i_pages, start); + unsigned int tagged = 0; + void *page; + + xas_lock_irq(&xas); + xas_for_each_marked(&xas, page, end, PAGECACHE_TAG_DIRTY) { + xas_set_mark(&xas, PAGECACHE_TAG_TOWRITE); + if (++tagged % XA_CHECK_SCHED) + continue; + + xas_pause(&xas); + xas_unlock_irq(&xas); cond_resched(); - /* We check 'start' to handle wrapping when end == ~0UL */ - } while (tagged >= WRITEBACK_TAG_BATCH && start); + xas_lock_irq(&xas); + } + xas_unlock_irq(&xas); } EXPORT_SYMBOL(tag_pages_for_writeback); -/** - * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. - * @mapping: address space structure to write - * @wbc: subtract the number of written pages from *@wbc->nr_to_write - * @writepage: function called for each page - * @data: data passed to writepage function - * - * If a page is already under I/O, write_cache_pages() skips it, even - * if it's dirty. This is desirable behaviour for memory-cleaning writeback, - * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() - * and msync() need to guarantee that all the data which was dirty at the time - * the call was made get new I/O started against them. If wbc->sync_mode is - * WB_SYNC_ALL then we were called for data integrity and we must wait for - * existing IO to complete. - * - * To avoid livelocks (when other process dirties new pages), we first tag - * pages which should be written back with TOWRITE tag and only then start - * writing them. For data-integrity sync we have to be careful so that we do - * not miss some pages (e.g., because some other process has cleared TOWRITE - * tag we set). The rule we follow is that TOWRITE tag can be cleared only - * by the process clearing the DIRTY tag (and submitting the page for IO). 
- */ -int write_cache_pages(struct address_space *mapping, - struct writeback_control *wbc, writepage_t writepage, - void *data) +static bool folio_prepare_writeback(struct address_space *mapping, + struct writeback_control *wbc, struct folio *folio) { - int ret = 0; - int done = 0; - struct pagevec pvec; - int nr_pages; - pgoff_t uninitialized_var(writeback_index); - pgoff_t index; - pgoff_t end; /* Inclusive */ - pgoff_t done_index; - int cycled; - int range_whole = 0; - int tag; - - pagevec_init(&pvec, 0); - if (wbc->range_cyclic) { - writeback_index = mapping->writeback_index; /* prev offset */ - index = writeback_index; - if (index == 0) - cycled = 1; - else - cycled = 0; - end = -1; - } else { - index = wbc->range_start >> PAGE_CACHE_SHIFT; - end = wbc->range_end >> PAGE_CACHE_SHIFT; - if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) - range_whole = 1; - cycled = 1; /* ignore range_cyclic tests */ - } - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) - tag = PAGECACHE_TAG_TOWRITE; - else - tag = PAGECACHE_TAG_DIRTY; -retry: - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) - tag_pages_for_writeback(mapping, index, end); - done_index = index; - while (!done && (index <= end)) { - int i; - - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); - if (nr_pages == 0) - break; - - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; + /* + * Folio truncated or invalidated. We can freely skip it then, + * even for data integrity operations: the folio has disappeared + * concurrently, so there could be no real expectation of this + * data integrity operation even if there is now a new, dirty + * folio at the same pagecache index. + */ + if (unlikely(folio->mapping != mapping)) + return false; - /* - * At this point, the page may be truncated or - * invalidated (changing page->mapping to NULL), or - * even swizzled back from swapper_space to tmpfs file - * mapping. However, page->index will not change - * because we have a reference on the page. - */ - if (page->index > end) { - /* - * can't be range_cyclic (1st pass) because - * end == -1 in that case. - */ - done = 1; - break; - } + /* + * Did somebody else write it for us? + */ + if (!folio_test_dirty(folio)) + return false; - done_index = page->index; + if (folio_test_writeback(folio)) { + if (wbc->sync_mode == WB_SYNC_NONE) + return false; + folio_wait_writeback(folio); + } + BUG_ON(folio_test_writeback(folio)); - lock_page(page); + if (!folio_clear_dirty_for_io(folio)) + return false; - /* - * Page truncated or invalidated. We can freely skip it - * then, even for data integrity operations: the page - * has disappeared concurrently, so there could be no - * real expectation of this data interity operation - * even if there is now a new, dirty page at the same - * pagecache address. 
- */ - if (unlikely(page->mapping != mapping)) { -continue_unlock: - unlock_page(page); - continue; - } + return true; +} - if (!PageDirty(page)) { - /* someone wrote it for us */ - goto continue_unlock; - } - if (PageWriteback(page)) { - if (wbc->sync_mode != WB_SYNC_NONE) - wait_on_page_writeback(page); - else - goto continue_unlock; - } +static pgoff_t wbc_end(struct writeback_control *wbc) +{ + if (wbc->range_cyclic) + return -1; + return wbc->range_end >> PAGE_SHIFT; +} - BUG_ON(PageWriteback(page)); - if (!clear_page_dirty_for_io(page)) - goto continue_unlock; - - trace_wbc_writepage(wbc, mapping->backing_dev_info); - ret = (*writepage)(page, wbc, data); - if (unlikely(ret)) { - if (ret == AOP_WRITEPAGE_ACTIVATE) { - unlock_page(page); - ret = 0; - } else { - /* - * done_index is set past this page, - * so media errors will not choke - * background writeout for the entire - * file. This has consequences for - * range_cyclic semantics (ie. it may - * not be suitable for data integrity - * writeout). - */ - done_index = page->index + 1; - done = 1; - break; - } - } +static struct folio *writeback_get_folio(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct folio *folio; - /* - * We stop writing back only if we are not doing - * integrity sync. In case of integrity sync we have to - * keep going until we have written all the pages - * we tagged for writeback prior to entering this loop. - */ - if (--wbc->nr_to_write <= 0 && - wbc->sync_mode == WB_SYNC_NONE) { - done = 1; - break; - } - } - pagevec_release(&pvec); +retry: + folio = folio_batch_next(&wbc->fbatch); + if (!folio) { + folio_batch_release(&wbc->fbatch); cond_resched(); + filemap_get_folios_tag(mapping, &wbc->index, wbc_end(wbc), + wbc_to_tag(wbc), &wbc->fbatch); + folio = folio_batch_next(&wbc->fbatch); + if (!folio) + return NULL; } - if (!cycled && !done) { - /* - * range_cyclic: - * We hit the last page and there is more work to be done: wrap - * back to the start of the file - */ - cycled = 1; - index = 0; - end = writeback_index - 1; + + folio_lock(folio); + if (unlikely(!folio_prepare_writeback(mapping, wbc, folio))) { + folio_unlock(folio); goto retry; } - if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) - mapping->writeback_index = done_index; - - return ret; -} -EXPORT_SYMBOL(write_cache_pages); -/* - * Function used by generic_writepages to call the real writepage - * function and set the mapping flags on error - */ -static int __writepage(struct page *page, struct writeback_control *wbc, - void *data) -{ - struct address_space *mapping = data; - int ret = mapping->a_ops->writepage(page, wbc); - mapping_set_error(mapping, ret); - return ret; + trace_wbc_writepage(wbc, inode_to_bdi(mapping->host)); + return folio; } /** - * generic_writepages - walk the list of dirty pages of the given address space and writepage() all of them. + * writeback_iter - iterate folio of a mapping for writeback * @mapping: address space structure to write - * @wbc: subtract the number of written pages from *@wbc->nr_to_write + * @wbc: writeback context + * @folio: previously iterated folio (%NULL to start) + * @error: in-out pointer for writeback errors (see below) + * + * This function returns the next folio for the writeback operation described by + * @wbc on @mapping and should be called in a while loop in the ->writepages + * implementation. * - * This is a library function, which implements the writepages() - * address_space_operation. 
+ * To start the writeback operation, %NULL is passed in the @folio argument, and + * for every subsequent iteration the folio returned previously should be passed + * back in. + * + * If there was an error in the per-folio writeback inside the writeback_iter() + * loop, @error should be set to the error value. + * + * Once the writeback described in @wbc has finished, this function will return + * %NULL and if there was an error in any iteration restore it to @error. + * + * Note: callers should not manually break out of the loop using break or goto + * but must keep calling writeback_iter() until it returns %NULL. + * + * Return: the folio to write or %NULL if the loop is done. */ -int generic_writepages(struct address_space *mapping, - struct writeback_control *wbc) +struct folio *writeback_iter(struct address_space *mapping, + struct writeback_control *wbc, struct folio *folio, int *error) { - struct blk_plug plug; - int ret; + if (!folio) { + folio_batch_init(&wbc->fbatch); + wbc->saved_err = *error = 0; - /* deal with chardevs and other special file */ - if (!mapping->a_ops->writepage) - return 0; + /* + * For range cyclic writeback we remember where we stopped so + * that we can continue where we stopped. + * + * For non-cyclic writeback we always start at the beginning of + * the passed in range. + */ + if (wbc->range_cyclic) + wbc->index = mapping->writeback_index; + else + wbc->index = wbc->range_start >> PAGE_SHIFT; - blk_start_plug(&plug); - ret = write_cache_pages(mapping, wbc, __writepage, mapping); - blk_finish_plug(&plug); - return ret; -} + /* + * To avoid livelocks when other processes dirty new pages, we + * first tag pages which should be written back and only then + * start writing them. + * + * For data-integrity writeback we have to be careful so that we + * do not miss some pages (e.g., because some other process has + * cleared the TOWRITE tag we set). The rule we follow is that + * TOWRITE tag can be cleared only by the process clearing the + * DIRTY tag (and submitting the page for I/O). + */ + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + tag_pages_for_writeback(mapping, wbc->index, + wbc_end(wbc)); + } else { + wbc->nr_to_write -= folio_nr_pages(folio); + + WARN_ON_ONCE(*error > 0); + + /* + * For integrity writeback we have to keep going until we have + * written all the folios we tagged for writeback above, even if + * we run past wbc->nr_to_write or encounter errors. + * We stash away the first error we encounter in wbc->saved_err + * so that it can be retrieved when we're done. This is because + * the file system may still have state to clear for each folio. + * + * For background writeback we exit as soon as we run past + * wbc->nr_to_write or encounter the first error. + */ + if (wbc->sync_mode == WB_SYNC_ALL) { + if (*error && !wbc->saved_err) + wbc->saved_err = *error; + } else { + if (*error || wbc->nr_to_write <= 0) + goto done; + } + } + + folio = writeback_get_folio(mapping, wbc); + if (!folio) { + /* + * To avoid deadlocks between range_cyclic writeback and callers + * that hold folios in writeback to aggregate I/O until + * the writeback iteration finishes, we do not loop back to the + * start of the file. Doing so causes a folio lock/folio + * writeback access order inversion - we should only ever lock + * multiple folios in ascending folio->index order, and looping + * back to the start of the file violates that rule and causes + * deadlocks. 
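
Per the kernel-doc above, the iterator is meant to drive a simple while loop from a ->writepages implementation. A minimal sketch, with write_one_folio() standing in for the filesystem's per-folio writeout (a hypothetical name, assumed to unlock the folio as writepage-style callbacks do):

	static int myfs_writepages(struct address_space *mapping,
				   struct writeback_control *wbc)
	{
		struct folio *folio = NULL;
		int error;

		while ((folio = writeback_iter(mapping, wbc, folio, &error)))
			error = write_one_folio(folio, wbc);

		return error;
	}
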
+ */ + if (wbc->range_cyclic) + mapping->writeback_index = 0; -EXPORT_SYMBOL(generic_writepages); + /* + * Return the first error we encountered (if there was any) to + * the caller. + */ + *error = wbc->saved_err; + } + return folio; + +done: + if (wbc->range_cyclic) + mapping->writeback_index = folio_next_index(folio); + folio_batch_release(&wbc->fbatch); + return NULL; +} +EXPORT_SYMBOL_GPL(writeback_iter); int do_writepages(struct address_space *mapping, struct writeback_control *wbc) { int ret; + struct bdi_writeback *wb; if (wbc->nr_to_write <= 0) return 0; - if (mapping->a_ops->writepages) - ret = mapping->a_ops->writepages(mapping, wbc); - else - ret = generic_writepages(mapping, wbc); - return ret; -} + wb = inode_to_wb_wbc(mapping->host, wbc); + wb_bandwidth_estimate_start(wb); + while (1) { + if (mapping->a_ops->writepages) + ret = mapping->a_ops->writepages(mapping, wbc); + else + /* deal with chardevs and other special files */ + ret = 0; + if (ret != -ENOMEM || wbc->sync_mode != WB_SYNC_ALL) + break; -/** - * write_one_page - write out a single page and optionally wait on I/O - * @page: the page to write - * @wait: if true, wait on writeout - * - * The page must be locked by the caller and will be unlocked upon return. - * - * write_one_page() returns a negative error code if I/O failed. - */ -int write_one_page(struct page *page, int wait) -{ - struct address_space *mapping = page->mapping; - int ret = 0; - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = 1, - }; - - BUG_ON(!PageLocked(page)); - - if (wait) - wait_on_page_writeback(page); - - if (clear_page_dirty_for_io(page)) { - page_cache_get(page); - ret = mapping->a_ops->writepage(page, &wbc); - if (ret == 0 && wait) { - wait_on_page_writeback(page); - if (PageError(page)) - ret = -EIO; - } - page_cache_release(page); - } else { - unlock_page(page); + /* + * Lacking an allocation context or the locality or writeback + * state of any of the inode's pages, throttle based on + * writeback activity on the local node. It's as good a + * guess as any. + */ + reclaim_throttle(NODE_DATA(numa_node_id()), + VMSCAN_THROTTLE_WRITEBACK); } + /* + * Usually few pages are written by now from those we've just submitted + * but if there's constant writeback being submitted, this makes sure + * writeback bandwidth is updated once in a while. + */ + if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) + + BANDWIDTH_INTERVAL)) + wb_update_bandwidth(wb); return ret; } -EXPORT_SYMBOL(write_one_page); /* * For address_spaces which do not use buffers nor write back. */ -int __set_page_dirty_no_writeback(struct page *page) +bool noop_dirty_folio(struct address_space *mapping, struct folio *folio) { - if (!PageDirty(page)) - return !TestSetPageDirty(page); - return 0; + if (!folio_test_dirty(folio)) + return !folio_test_set_dirty(folio); + return false; } +EXPORT_SYMBOL(noop_dirty_folio); /* * Helper function for set_page_dirty family. + * * NOTE: This relies on being atomic wrt interrupts. 
*/ -void account_page_dirtied(struct page *page, struct address_space *mapping) -{ - trace_writeback_dirty_page(page, mapping); - - if (mapping_cap_account_dirty(mapping)) { - __inc_zone_page_state(page, NR_FILE_DIRTY); - __inc_zone_page_state(page, NR_DIRTIED); - __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); - __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED); - task_io_account_write(PAGE_CACHE_SIZE); - current->nr_dirtied++; - this_cpu_inc(bdp_ratelimits); +static void folio_account_dirtied(struct folio *folio, + struct address_space *mapping) +{ + struct inode *inode = mapping->host; + + trace_writeback_dirty_folio(folio, mapping); + + if (mapping_can_writeback(mapping)) { + struct bdi_writeback *wb; + long nr = folio_nr_pages(folio); + + inode_attach_wb(inode, folio); + wb = inode_to_wb(inode); + + lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, nr); + __zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr); + __node_stat_mod_folio(folio, NR_DIRTIED, nr); + wb_stat_mod(wb, WB_RECLAIMABLE, nr); + wb_stat_mod(wb, WB_DIRTIED, nr); + task_io_account_write(nr * PAGE_SIZE); + current->nr_dirtied += nr; + __this_cpu_add(bdp_ratelimits, nr); + + mem_cgroup_track_foreign_dirty(folio, wb); } } -EXPORT_SYMBOL(account_page_dirtied); /* - * Helper function for set_page_writeback family. - * NOTE: Unlike account_page_dirtied this does not rely on being atomic - * wrt interrupts. + * Helper function for deaccounting dirty page without writeback. + * */ -void account_page_writeback(struct page *page) +void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb) { - inc_zone_page_state(page, NR_WRITEBACK); + long nr = folio_nr_pages(folio); + + lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr); + zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr); + wb_stat_mod(wb, WB_RECLAIMABLE, -nr); + task_io_account_cancelled_write(nr * PAGE_SIZE); } -EXPORT_SYMBOL(account_page_writeback); /* - * For address_spaces which do not use buffers. Just tag the page as dirty in - * its radix tree. - * - * This is also used when a single buffer is being dirtied: we want to set the - * page dirty in that case, but not all the buffers. This is a "bottom-up" - * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying. + * Mark the folio dirty, and set it dirty in the page cache. * - * Most callers have locked the page, which pins the address_space in memory. - * But zap_pte_range() does not lock the page, however in that case the - * mapping is pinned by the vma's ->vm_file reference. + * If warn is true, then emit a warning if the folio is not uptodate and has + * not been truncated. * - * We take care to handle the case where the page was truncated from the - * mapping by re-checking page_mapping() inside tree_lock. + * It is the caller's responsibility to prevent the folio from being truncated + * while this function is in progress, although it may have been truncated + * before this function is called. Most callers have the folio locked. + * A few have the folio blocked from truncation through other means (e.g. + * zap_vma_pages() has it mapped and is holding the page table lock). + * When called from mark_buffer_dirty(), the filesystem should hold a + * reference to the buffer_head that is being marked dirty, which causes + * try_to_free_buffers() to fail. 
*/ -int __set_page_dirty_nobuffers(struct page *page) -{ - if (!TestSetPageDirty(page)) { - struct address_space *mapping = page_mapping(page); - struct address_space *mapping2; - - if (!mapping) - return 1; - - spin_lock_irq(&mapping->tree_lock); - mapping2 = page_mapping(page); - if (mapping2) { /* Race with truncate? */ - BUG_ON(mapping2 != mapping); - WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); - account_page_dirtied(page, mapping); - radix_tree_tag_set(&mapping->page_tree, - page_index(page), PAGECACHE_TAG_DIRTY); - } - spin_unlock_irq(&mapping->tree_lock); - if (mapping->host) { - /* !PageAnon && !swapper_space */ - __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); - } - return 1; +void __folio_mark_dirty(struct folio *folio, struct address_space *mapping, + int warn) +{ + unsigned long flags; + + /* + * Shmem writeback relies on swap, and swap writeback is LRU based, + * not using the dirty mark. + */ + VM_WARN_ON_ONCE(folio_test_swapcache(folio) || shmem_mapping(mapping)); + + xa_lock_irqsave(&mapping->i_pages, flags); + if (folio->mapping) { /* Race with truncate? */ + WARN_ON_ONCE(warn && !folio_test_uptodate(folio)); + folio_account_dirtied(folio, mapping); + __xa_set_mark(&mapping->i_pages, folio->index, + PAGECACHE_TAG_DIRTY); } - return 0; + xa_unlock_irqrestore(&mapping->i_pages, flags); } -EXPORT_SYMBOL(__set_page_dirty_nobuffers); -/* - * Call this whenever redirtying a page, to de-account the dirty counters - * (NR_DIRTIED, BDI_DIRTIED, tsk->nr_dirtied), so that they match the written - * counters (NR_WRITTEN, BDI_WRITTEN) in long term. The mismatches will lead to - * systematic errors in balanced_dirty_ratelimit and the dirty pages position - * control. +/** + * filemap_dirty_folio - Mark a folio dirty for filesystems which do not use buffer_heads. + * @mapping: Address space this folio belongs to. + * @folio: Folio to be marked as dirty. + * + * Filesystems which do not use buffer heads should call this function + * from their dirty_folio address space operation. It ignores the + * contents of folio_get_private(), so if the filesystem marks individual + * blocks as dirty, the filesystem should handle that itself. + * + * This is also sometimes used by filesystems which use buffer_heads when + * a single buffer is being dirtied: we want to set the folio dirty in + * that case, but not all the buffers. This is a "bottom-up" dirtying, + * whereas block_dirty_folio() is a "top-down" dirtying. + * + * The caller must ensure this doesn't race with truncation. Most will + * simply hold the folio lock, but e.g. zap_pte_range() calls with the + * folio mapped and the pte lock held, which also locks out truncation. 
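
Filesystems that rely on this helper typically just wire it up as their dirty_folio operation. A hedged sketch of such an address_space_operations table (the myfs_* names are placeholders, not from this patch):

	static const struct address_space_operations myfs_aops = {
		.dirty_folio	= filemap_dirty_folio,
		.writepages	= myfs_writepages,
		/* read_folio and friends omitted for brevity */
	};
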
*/ -void account_page_redirty(struct page *page) +bool filemap_dirty_folio(struct address_space *mapping, struct folio *folio) { - struct address_space *mapping = page->mapping; - if (mapping && mapping_cap_account_dirty(mapping)) { - current->nr_dirtied--; - dec_zone_page_state(page, NR_DIRTIED); - dec_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED); + if (folio_test_set_dirty(folio)) + return false; + + __folio_mark_dirty(folio, mapping, !folio_test_private(folio)); + + if (mapping->host) { + /* !PageAnon && !swapper_space */ + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } + return true; } -EXPORT_SYMBOL(account_page_redirty); +EXPORT_SYMBOL(filemap_dirty_folio); -/* - * When a writepage implementation decides that it doesn't want to write this - * page for some reason, it should redirty the locked page via - * redirty_page_for_writepage() and it should then unlock the page and return 0 +/** + * folio_redirty_for_writepage - Decline to write a dirty folio. + * @wbc: The writeback control. + * @folio: The folio. + * + * When a writepage implementation decides that it doesn't want to write + * @folio for some reason, it should call this function, unlock @folio and + * return 0. + * + * Return: True if we redirtied the folio. False if someone else dirtied + * it first. */ -int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page) +bool folio_redirty_for_writepage(struct writeback_control *wbc, + struct folio *folio) { - wbc->pages_skipped++; - account_page_redirty(page); - return __set_page_dirty_nobuffers(page); + struct address_space *mapping = folio->mapping; + long nr = folio_nr_pages(folio); + bool ret; + + wbc->pages_skipped += nr; + ret = filemap_dirty_folio(mapping, folio); + if (mapping && mapping_can_writeback(mapping)) { + struct inode *inode = mapping->host; + struct bdi_writeback *wb; + struct wb_lock_cookie cookie = {}; + + wb = unlocked_inode_to_wb_begin(inode, &cookie); + current->nr_dirtied -= nr; + node_stat_mod_folio(folio, NR_DIRTIED, -nr); + wb_stat_mod(wb, WB_DIRTIED, -nr); + unlocked_inode_to_wb_end(inode, &cookie); + } + return ret; } -EXPORT_SYMBOL(redirty_page_for_writepage); +EXPORT_SYMBOL(folio_redirty_for_writepage); -/* - * Dirty a page. +/** + * folio_mark_dirty - Mark a folio as being modified. + * @folio: The folio. * - * For pages with a mapping this should be done under the page lock - * for the benefit of asynchronous memory errors who prefer a consistent - * dirty state. This rule can be broken in some special cases, - * but should be better not to. + * The folio may not be truncated while this function is running. + * Holding the folio lock is sufficient to prevent truncation, but some + * callers cannot acquire a sleeping lock. These callers instead hold + * the page table lock for a page table which contains at least one page + * in this folio. Truncation will block on the page table lock as it + * unmaps pages before removing the folio from its mapping. * - * If the mapping doesn't provide a set_page_dirty a_op, then - * just fall through and assume that it wants buffer_heads. + * Return: True if the folio was newly dirtied, false if it was already dirty. 
*/ -int set_page_dirty(struct page *page) +bool folio_mark_dirty(struct folio *folio) { - struct address_space *mapping = page_mapping(page); + struct address_space *mapping = folio_mapping(folio); if (likely(mapping)) { - int (*spd)(struct page *) = mapping->a_ops->set_page_dirty; /* - * readahead/lru_deactivate_page could remain - * PG_readahead/PG_reclaim due to race with end_page_writeback - * About readahead, if the page is written, the flags would be + * readahead/folio_deactivate could remain + * PG_readahead/PG_reclaim due to race with folio_end_writeback + * About readahead, if the folio is written, the flags would be * reset. So no problem. - * About lru_deactivate_page, if the page is redirty, the flag - * will be reset. So no problem. but if the page is used by readahead - * it will confuse readahead and make it restart the size rampup - * process. But it's a trivial problem. + * About folio_deactivate, if the folio is redirtied, + * the flag will be reset. So no problem. but if the + * folio is used by readahead it will confuse readahead + * and make it restart the size rampup process. But it's + * a trivial problem. */ - ClearPageReclaim(page); -#ifdef CONFIG_BLOCK - if (!spd) - spd = __set_page_dirty_buffers; -#endif - return (*spd)(page); - } - if (!PageDirty(page)) { - if (!TestSetPageDirty(page)) - return 1; + if (folio_test_reclaim(folio)) + folio_clear_reclaim(folio); + return mapping->a_ops->dirty_folio(mapping, folio); } - return 0; + + return noop_dirty_folio(mapping, folio); } -EXPORT_SYMBOL(set_page_dirty); +EXPORT_SYMBOL(folio_mark_dirty); /* - * set_page_dirty() is racy if the caller has no reference against - * page->mapping->host, and if the page is unlocked. This is because another - * CPU could truncate the page off the mapping and then free the mapping. + * folio_mark_dirty() is racy if the caller has no reference against + * folio->mapping->host, and if the folio is unlocked. This is because another + * CPU could truncate the folio off the mapping and then free the mapping. * - * Usually, the page _is_ locked, or the caller is a user-space process which + * Usually, the folio _is_ locked, or the caller is a user-space process which * holds a reference on the inode by having an open file. * - * In other cases, the page should be locked before running set_page_dirty(). + * In other cases, the folio should be locked before running folio_mark_dirty(). */ -int set_page_dirty_lock(struct page *page) +bool folio_mark_dirty_lock(struct folio *folio) { - int ret; + bool ret; - lock_page(page); - ret = set_page_dirty(page); - unlock_page(page); + folio_lock(folio); + ret = folio_mark_dirty(folio); + folio_unlock(folio); return ret; } -EXPORT_SYMBOL(set_page_dirty_lock); +EXPORT_SYMBOL(folio_mark_dirty_lock); /* - * Clear a page's dirty flag, while caring for dirty memory accounting. - * Returns true if the page was previously dirty. - * - * This is for preparing to put the page under writeout. We leave the page - * tagged as dirty in the radix tree so that a concurrent write-for-sync - * can discover it via a PAGECACHE_TAG_DIRTY walk. The ->writepage - * implementation will run either set_page_writeback() or set_page_dirty(), - * at which stage we bring the page's dirty flag and radix-tree dirty tag - * back into sync. - * - * This incoherency between the page's dirty flag and radix-tree tag is - * unfortunate, but it only exists while the page is locked. 
+ * This cancels just the dirty bit on the kernel page itself, it does NOT + * actually remove dirty bits on any mmap's that may be around. It also + * leaves the page tagged dirty, so any sync activity will still find it on + * the dirty lists, and in particular, clear_page_dirty_for_io() will still + * look at the dirty bits in the VM. + * + * Doing this should *normally* only ever be done when a page is truncated, + * and is not actually mapped anywhere at all. However, fs/buffer.c does + * this when it notices that somebody has cleaned out all the buffers on a + * page without actually doing it through the VM. Can you say "ext3 is + * horribly ugly"? Thought you could. */ -int clear_page_dirty_for_io(struct page *page) +void __folio_cancel_dirty(struct folio *folio) { - struct address_space *mapping = page_mapping(page); + struct address_space *mapping = folio_mapping(folio); - BUG_ON(!PageLocked(page)); + if (mapping_can_writeback(mapping)) { + struct inode *inode = mapping->host; + struct bdi_writeback *wb; + struct wb_lock_cookie cookie = {}; + + wb = unlocked_inode_to_wb_begin(inode, &cookie); + + if (folio_test_clear_dirty(folio)) + folio_account_cleaned(folio, wb); + + unlocked_inode_to_wb_end(inode, &cookie); + } else { + folio_clear_dirty(folio); + } +} +EXPORT_SYMBOL(__folio_cancel_dirty); + +/* + * Clear a folio's dirty flag, while caring for dirty memory accounting. + * Returns true if the folio was previously dirty. + * + * This is for preparing to put the folio under writeout. We leave + * the folio tagged as dirty in the xarray so that a concurrent + * write-for-sync can discover it via a PAGECACHE_TAG_DIRTY walk. + * The ->writepage implementation will run either folio_start_writeback() + * or folio_mark_dirty(), at which stage we bring the folio's dirty flag + * and xarray dirty tag back into sync. + * + * This incoherency between the folio's dirty flag and xarray tag is + * unfortunate, but it only exists while the folio is locked. + */ +bool folio_clear_dirty_for_io(struct folio *folio) +{ + struct address_space *mapping = folio_mapping(folio); + bool ret = false; + + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + + if (mapping && mapping_can_writeback(mapping)) { + struct inode *inode = mapping->host; + struct bdi_writeback *wb; + struct wb_lock_cookie cookie = {}; - if (mapping && mapping_cap_account_dirty(mapping)) { /* * Yes, Virginia, this is indeed insane. * * We use this sequence to make sure that * (a) we account for dirty stats properly * (b) we tell the low-level filesystem to - * mark the whole page dirty if it was + * mark the whole folio dirty if it was * dirty in a pagetable. Only to then - * (c) clean the page again and return 1 to + * (c) clean the folio again and return 1 to * cause the writeback. * * This way we avoid all nasty races with the * dirty bit in multiple places and clearing * them concurrently from different threads. * - * Note! Normally the "set_page_dirty(page)" + * Note! Normally the "folio_mark_dirty(folio)" * has no effect on the actual dirty bit - since * that will already usually be set. But we * need the side effects, and it can help us * avoid races. * - * We basically use the page "master dirty bit" + * We basically use the folio "master dirty bit" * as a serialization point for all the different * threads doing their things. 
*/ - if (page_mkclean(page)) - set_page_dirty(page); + if (folio_mkclean(folio)) + folio_mark_dirty(folio); /* * We carefully synchronise fault handlers against - * installing a dirty pte and marking the page dirty - * at this point. We do this by having them hold the - * page lock at some point after installing their - * pte, but before marking the page dirty. - * Pages are always locked coming in here, so we get - * the desired exclusion. See mm/memory.c:do_wp_page() - * for more comments. + * installing a dirty pte and marking the folio dirty + * at this point. We do this by having them hold the + * page lock while dirtying the folio, and folios are + * always locked coming in here, so we get the desired + * exclusion. */ - if (TestClearPageDirty(page)) { - dec_zone_page_state(page, NR_FILE_DIRTY); - dec_bdi_stat(mapping->backing_dev_info, - BDI_RECLAIMABLE); - return 1; + wb = unlocked_inode_to_wb_begin(inode, &cookie); + if (folio_test_clear_dirty(folio)) { + long nr = folio_nr_pages(folio); + lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr); + zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr); + wb_stat_mod(wb, WB_RECLAIMABLE, -nr); + ret = true; } - return 0; + unlocked_inode_to_wb_end(inode, &cookie); + return ret; } - return TestClearPageDirty(page); + return folio_test_clear_dirty(folio); } -EXPORT_SYMBOL(clear_page_dirty_for_io); +EXPORT_SYMBOL(folio_clear_dirty_for_io); -int test_clear_page_writeback(struct page *page) +static void wb_inode_writeback_start(struct bdi_writeback *wb) { - struct address_space *mapping = page_mapping(page); - int ret; + atomic_inc(&wb->writeback_inodes); +} - if (mapping) { - struct backing_dev_info *bdi = mapping->backing_dev_info; +static void wb_inode_writeback_end(struct bdi_writeback *wb) +{ + unsigned long flags; + atomic_dec(&wb->writeback_inodes); + /* + * Make sure estimate of writeback throughput gets updated after + * writeback completed. We delay the update by BANDWIDTH_INTERVAL + * (which is the interval other bandwidth updates use for batching) so + * that if multiple inodes end writeback at a similar time, they get + * batched into one bandwidth update. 
+ */ + spin_lock_irqsave(&wb->work_lock, flags); + if (test_bit(WB_registered, &wb->state)) + queue_delayed_work(bdi_wq, &wb->bw_dwork, BANDWIDTH_INTERVAL); + spin_unlock_irqrestore(&wb->work_lock, flags); +} + +bool __folio_end_writeback(struct folio *folio) +{ + long nr = folio_nr_pages(folio); + struct address_space *mapping = folio_mapping(folio); + bool ret; + + if (mapping && mapping_use_writeback_tags(mapping)) { + struct inode *inode = mapping->host; + struct bdi_writeback *wb; unsigned long flags; - spin_lock_irqsave(&mapping->tree_lock, flags); - ret = TestClearPageWriteback(page); - if (ret) { - radix_tree_tag_clear(&mapping->page_tree, - page_index(page), - PAGECACHE_TAG_WRITEBACK); - if (bdi_cap_account_writeback(bdi)) { - __dec_bdi_stat(bdi, BDI_WRITEBACK); - __bdi_writeout_inc(bdi); - } + xa_lock_irqsave(&mapping->i_pages, flags); + ret = folio_xor_flags_has_waiters(folio, 1 << PG_writeback); + __xa_clear_mark(&mapping->i_pages, folio->index, + PAGECACHE_TAG_WRITEBACK); + + wb = inode_to_wb(inode); + wb_stat_mod(wb, WB_WRITEBACK, -nr); + __wb_writeout_add(wb, nr); + if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) { + wb_inode_writeback_end(wb); + if (mapping->host) + sb_clear_inode_writeback(mapping->host); } - spin_unlock_irqrestore(&mapping->tree_lock, flags); + + xa_unlock_irqrestore(&mapping->i_pages, flags); } else { - ret = TestClearPageWriteback(page); - } - if (ret) { - dec_zone_page_state(page, NR_WRITEBACK); - inc_zone_page_state(page, NR_WRITTEN); + ret = folio_xor_flags_has_waiters(folio, 1 << PG_writeback); } + + lruvec_stat_mod_folio(folio, NR_WRITEBACK, -nr); + zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr); + node_stat_mod_folio(folio, NR_WRITTEN, nr); + return ret; } -int test_set_page_writeback(struct page *page) +void __folio_start_writeback(struct folio *folio, bool keep_write) { - struct address_space *mapping = page_mapping(page); - int ret; + long nr = folio_nr_pages(folio); + struct address_space *mapping = folio_mapping(folio); + int access_ret; - if (mapping) { - struct backing_dev_info *bdi = mapping->backing_dev_info; + VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + + if (mapping && mapping_use_writeback_tags(mapping)) { + XA_STATE(xas, &mapping->i_pages, folio->index); + struct inode *inode = mapping->host; + struct bdi_writeback *wb; unsigned long flags; + bool on_wblist; + + xas_lock_irqsave(&xas, flags); + xas_load(&xas); + folio_test_set_writeback(folio); + + on_wblist = mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK); - spin_lock_irqsave(&mapping->tree_lock, flags); - ret = TestSetPageWriteback(page); - if (!ret) { - radix_tree_tag_set(&mapping->page_tree, - page_index(page), - PAGECACHE_TAG_WRITEBACK); - if (bdi_cap_account_writeback(bdi)) - __inc_bdi_stat(bdi, BDI_WRITEBACK); + xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK); + wb = inode_to_wb(inode); + wb_stat_mod(wb, WB_WRITEBACK, nr); + if (!on_wblist) { + wb_inode_writeback_start(wb); + /* + * We can come through here when swapping anonymous + * folios, so we don't necessarily have an inode to + * track for sync. 
+ */ + if (mapping->host) + sb_mark_inode_writeback(mapping->host); } - if (!PageDirty(page)) - radix_tree_tag_clear(&mapping->page_tree, - page_index(page), - PAGECACHE_TAG_DIRTY); - radix_tree_tag_clear(&mapping->page_tree, - page_index(page), - PAGECACHE_TAG_TOWRITE); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + + if (!folio_test_dirty(folio)) + xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY); + if (!keep_write) + xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE); + xas_unlock_irqrestore(&xas, flags); } else { - ret = TestSetPageWriteback(page); + folio_test_set_writeback(folio); } - if (!ret) - account_page_writeback(page); - return ret; + lruvec_stat_mod_folio(folio, NR_WRITEBACK, nr); + zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr); + + access_ret = arch_make_folio_accessible(folio); + /* + * If writeback has been triggered on a page that cannot be made + * accessible, it is too late to recover here. + */ + VM_BUG_ON_FOLIO(access_ret != 0, folio); } -EXPORT_SYMBOL(test_set_page_writeback); +EXPORT_SYMBOL(__folio_start_writeback); -/* - * Return true if any of the pages in the mapping are marked with the - * passed tag. +/** + * folio_wait_writeback - Wait for a folio to finish writeback. + * @folio: The folio to wait for. + * + * If the folio is currently being written back to storage, wait for the + * I/O to complete. + * + * Context: Sleeps. Must be called in process context and with + * no spinlocks held. Caller should hold a reference on the folio. + * If the folio is not locked, writeback may start again after writeback + * has finished. */ -int mapping_tagged(struct address_space *mapping, int tag) +void folio_wait_writeback(struct folio *folio) { - return radix_tree_tagged(&mapping->page_tree, tag); + while (folio_test_writeback(folio)) { + trace_folio_wait_writeback(folio, folio_mapping(folio)); + folio_wait_bit(folio, PG_writeback); + } } -EXPORT_SYMBOL(mapping_tagged); +EXPORT_SYMBOL_GPL(folio_wait_writeback); /** - * wait_for_stable_page() - wait for writeback to finish, if necessary. - * @page: The page to wait on. + * folio_wait_writeback_killable - Wait for a folio to finish writeback. + * @folio: The folio to wait for. * - * This function determines if the given page is related to a backing device - * that requires page contents to be held stable during writeback. If so, then - * it will wait for any pending writeback to complete. + * If the folio is currently being written back to storage, wait for the + * I/O to complete or a fatal signal to arrive. + * + * Context: Sleeps. Must be called in process context and with + * no spinlocks held. Caller should hold a reference on the folio. + * If the folio is not locked, writeback may start again after writeback + * has finished. + * Return: 0 on success, -EINTR if we get a fatal signal while waiting. */ -void wait_for_stable_page(struct page *page) +int folio_wait_writeback_killable(struct folio *folio) { - struct address_space *mapping = page_mapping(page); - struct backing_dev_info *bdi = mapping->backing_dev_info; + while (folio_test_writeback(folio)) { + trace_folio_wait_writeback(folio, folio_mapping(folio)); + if (folio_wait_bit_killable(folio, PG_writeback)) + return -EINTR; + } - if (!bdi_cap_stable_pages_required(bdi)) - return; + return 0; +} +EXPORT_SYMBOL_GPL(folio_wait_writeback_killable); - wait_on_page_writeback(page); +/** + * folio_wait_stable() - wait for writeback to finish, if necessary. + * @folio: The folio to wait on. 
+ * + * This function determines if the given folio is related to a backing + * device that requires folio contents to be held stable during writeback. + * If so, then it will wait for any pending writeback to complete. + * + * Context: Sleeps. Must be called in process context and with + * no spinlocks held. Caller should hold a reference on the folio. + * If the folio is not locked, writeback may start again after writeback + * has finished. + */ +void folio_wait_stable(struct folio *folio) +{ + if (mapping_stable_writes(folio_mapping(folio))) + folio_wait_writeback(folio); } -EXPORT_SYMBOL_GPL(wait_for_stable_page); +EXPORT_SYMBOL_GPL(folio_wait_stable);
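
For context, a minimal sketch (not part of the patch) of how a filesystem is expected to plug into the folio dirtying path above, assuming a hypothetical "myfs". folio_mark_dirty() now dispatches through mapping->a_ops->dirty_folio(), so a plain page-cache filesystem can point that method at filemap_dirty_folio(); a writeback callback that declines to write a folio pairs folio_redirty_for_writepage() with unlocking, as the kernel-doc above describes.

#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>

/* Route folio_mark_dirty() to the generic page-cache implementation. */
static const struct address_space_operations myfs_aops = {
	.dirty_folio	= filemap_dirty_folio,
	/* other address_space operations elided for brevity */
};

/* Decline to write a locked, dirty folio: redirty it, then unlock. */
static int myfs_decline_writeback(struct folio *folio,
				  struct writeback_control *wbc)
{
	folio_redirty_for_writepage(wbc, folio);
	folio_unlock(folio);
	return 0;
}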
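
The expected pairing of folio_clear_dirty_for_io() with the writeback flag and tag helpers can be sketched as below. Again this is not from the patch: myfs_submit_io() is a hypothetical, assumed-synchronous I/O step, and a real filesystem would normally call folio_end_writeback() from its bio completion handler instead.

#include <linux/mm.h>
#include <linux/pagemap.h>

/* Hypothetical synchronous write of the folio's data; details elided. */
static void myfs_submit_io(struct folio *folio)
{
}

static void myfs_writeout_folio(struct folio *folio)
{
	/* Caller holds the folio lock. */
	if (!folio_clear_dirty_for_io(folio)) {
		/* Someone else cleaned the folio; nothing to write. */
		folio_unlock(folio);
		return;
	}

	/*
	 * Set PG_writeback and the xarray WRITEBACK tag (clearing TOWRITE),
	 * bringing the dirty flag and dirty tag back into sync as described
	 * in the comment above folio_clear_dirty_for_io().
	 */
	folio_start_writeback(folio);
	folio_unlock(folio);

	myfs_submit_io(folio);

	/* On completion, drop PG_writeback and wake any waiters. */
	folio_end_writeback(folio);
}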
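
The stable-pages contract of folio_wait_stable() shows up most clearly in a ->page_mkwrite() handler: dirty the folio first, then wait for any writeback the backing device wants kept stable. The sketch below is modelled loosely on the generic fault path and is not part of the patch.

#include <linux/mm.h>
#include <linux/pagemap.h>

static vm_fault_t myfs_page_mkwrite(struct vm_fault *vmf)
{
	struct folio *folio = page_folio(vmf->page);

	folio_lock(folio);
	if (!folio->mapping) {
		/* Raced with truncation; retry the fault. */
		folio_unlock(folio);
		return VM_FAULT_NOPAGE;
	}

	folio_mark_dirty(folio);
	/* Only waits if the mapping requires stable writes. */
	folio_wait_stable(folio);
	return VM_FAULT_LOCKED;	/* folio stays locked for the caller */
}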
