diff options
Diffstat (limited to 'include/linux/backing-dev.h')
| -rw-r--r-- | include/linux/backing-dev.h | 509 |
1 files changed, 257 insertions, 252 deletions
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index c3881553f7d1..0c8342747cab 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * include/linux/backing-dev.h * @@ -8,352 +9,356 @@ #ifndef _LINUX_BACKING_DEV_H #define _LINUX_BACKING_DEV_H -#include <linux/percpu_counter.h> -#include <linux/log2.h> -#include <linux/flex_proportions.h> #include <linux/kernel.h> #include <linux/fs.h> #include <linux/sched.h> -#include <linux/timer.h> +#include <linux/device.h> #include <linux/writeback.h> -#include <linux/atomic.h> -#include <linux/sysctl.h> -#include <linux/workqueue.h> +#include <linux/backing-dev-defs.h> +#include <linux/slab.h> -struct page; -struct device; -struct dentry; - -/* - * Bits in backing_dev_info.state - */ -enum bdi_state { - BDI_wb_alloc, /* Default embedded wb allocated */ - BDI_async_congested, /* The async (write) queue is getting full */ - BDI_sync_congested, /* The sync queue is getting full */ - BDI_registered, /* bdi_register() was done */ - BDI_writeback_running, /* Writeback is in progress */ - BDI_unused, /* Available bits start here */ -}; - -typedef int (congested_fn)(void *, int); - -enum bdi_stat_item { - BDI_RECLAIMABLE, - BDI_WRITEBACK, - BDI_DIRTIED, - BDI_WRITTEN, - NR_BDI_STAT_ITEMS -}; - -#define BDI_STAT_BATCH (8*(1+ilog2(nr_cpu_ids))) - -struct bdi_writeback { - struct backing_dev_info *bdi; /* our parent bdi */ - unsigned int nr; - - unsigned long last_old_flush; /* last old data flush */ - - struct delayed_work dwork; /* work item used for writeback */ - struct list_head b_dirty; /* dirty inodes */ - struct list_head b_io; /* parked for writeback */ - struct list_head b_more_io; /* parked for more writeback */ - spinlock_t list_lock; /* protects the b_* lists */ -}; - -struct backing_dev_info { - struct list_head bdi_list; - unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */ - unsigned long state; /* 
Always use atomic bitops on this */ - unsigned int capabilities; /* Device capabilities */ - congested_fn *congested_fn; /* Function pointer if device is md/dm */ - void *congested_data; /* Pointer to aux data for congested func */ - - char *name; - - struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS]; - - unsigned long bw_time_stamp; /* last time write bw is updated */ - unsigned long dirtied_stamp; - unsigned long written_stamp; /* pages written at bw_time_stamp */ - unsigned long write_bandwidth; /* the estimated write bandwidth */ - unsigned long avg_write_bandwidth; /* further smoothed write bw */ - - /* - * The base dirty throttle rate, re-calculated on every 200ms. - * All the bdi tasks' dirty rate will be curbed under it. - * @dirty_ratelimit tracks the estimated @balanced_dirty_ratelimit - * in small steps and is much more smooth/stable than the latter. - */ - unsigned long dirty_ratelimit; - unsigned long balanced_dirty_ratelimit; - - struct fprop_local_percpu completions; - int dirty_exceeded; - - unsigned int min_ratio; - unsigned int max_ratio, max_prop_frac; - - struct bdi_writeback wb; /* default writeback info for this bdi */ - spinlock_t wb_lock; /* protects work_list */ - - struct list_head work_list; +static inline struct backing_dev_info *bdi_get(struct backing_dev_info *bdi) +{ + kref_get(&bdi->refcnt); + return bdi; +} - struct device *dev; +struct backing_dev_info *bdi_get_by_id(u64 id); +void bdi_put(struct backing_dev_info *bdi); - struct timer_list laptop_mode_wb_timer; +__printf(2, 3) +int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...); +__printf(2, 0) +int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, + va_list args); +void bdi_set_owner(struct backing_dev_info *bdi, struct device *owner); +void bdi_unregister(struct backing_dev_info *bdi); -#ifdef CONFIG_DEBUG_FS - struct dentry *debug_dir; - struct dentry *debug_stats; -#endif -}; +struct backing_dev_info *bdi_alloc(int node_id); -int bdi_init(struct 
backing_dev_info *bdi); -void bdi_destroy(struct backing_dev_info *bdi); +void wb_start_background_writeback(struct bdi_writeback *wb); +void wb_workfn(struct work_struct *work); -__printf(3, 4) -int bdi_register(struct backing_dev_info *bdi, struct device *parent, - const char *fmt, ...); -int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev); -void bdi_unregister(struct backing_dev_info *bdi); -int bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int); -void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, - enum wb_reason reason); -void bdi_start_background_writeback(struct backing_dev_info *bdi); -void bdi_writeback_workfn(struct work_struct *work); -int bdi_has_dirty_io(struct backing_dev_info *bdi); -void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi); -void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2); +void wb_wait_for_completion(struct wb_completion *done); extern spinlock_t bdi_lock; extern struct list_head bdi_list; extern struct workqueue_struct *bdi_wq; -static inline int wb_has_dirty_io(struct bdi_writeback *wb) -{ - return !list_empty(&wb->b_dirty) || - !list_empty(&wb->b_io) || - !list_empty(&wb->b_more_io); -} - -static inline void __add_bdi_stat(struct backing_dev_info *bdi, - enum bdi_stat_item item, s64 amount) -{ - __percpu_counter_add(&bdi->bdi_stat[item], amount, BDI_STAT_BATCH); -} - -static inline void __inc_bdi_stat(struct backing_dev_info *bdi, - enum bdi_stat_item item) +static inline bool wb_has_dirty_io(struct bdi_writeback *wb) { - __add_bdi_stat(bdi, item, 1); + return test_bit(WB_has_dirty_io, &wb->state); } -static inline void inc_bdi_stat(struct backing_dev_info *bdi, - enum bdi_stat_item item) +static inline bool bdi_has_dirty_io(struct backing_dev_info *bdi) { - unsigned long flags; - - local_irq_save(flags); - __inc_bdi_stat(bdi, item); - local_irq_restore(flags); -} - -static inline void __dec_bdi_stat(struct backing_dev_info *bdi, - enum bdi_stat_item 
item) -{ - __add_bdi_stat(bdi, item, -1); -} - -static inline void dec_bdi_stat(struct backing_dev_info *bdi, - enum bdi_stat_item item) -{ - unsigned long flags; - - local_irq_save(flags); - __dec_bdi_stat(bdi, item); - local_irq_restore(flags); + /* + * @bdi->tot_write_bandwidth is guaranteed to be > 0 if there are + * any dirty wbs. See wb_update_write_bandwidth(). + */ + return atomic_long_read(&bdi->tot_write_bandwidth); } -static inline s64 bdi_stat(struct backing_dev_info *bdi, - enum bdi_stat_item item) +static inline void wb_stat_mod(struct bdi_writeback *wb, + enum wb_stat_item item, s64 amount) { - return percpu_counter_read_positive(&bdi->bdi_stat[item]); + percpu_counter_add_batch(&wb->stat[item], amount, WB_STAT_BATCH); } -static inline s64 __bdi_stat_sum(struct backing_dev_info *bdi, - enum bdi_stat_item item) +static inline s64 wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) { - return percpu_counter_sum_positive(&bdi->bdi_stat[item]); + return percpu_counter_read_positive(&wb->stat[item]); } -static inline s64 bdi_stat_sum(struct backing_dev_info *bdi, - enum bdi_stat_item item) +static inline s64 wb_stat_sum(struct bdi_writeback *wb, enum wb_stat_item item) { - s64 sum; - unsigned long flags; - - local_irq_save(flags); - sum = __bdi_stat_sum(bdi, item); - local_irq_restore(flags); - - return sum; + return percpu_counter_sum_positive(&wb->stat[item]); } -extern void bdi_writeout_inc(struct backing_dev_info *bdi); +extern void wb_writeout_inc(struct bdi_writeback *wb); /* * maximal error of a stat counter. */ -static inline unsigned long bdi_stat_error(struct backing_dev_info *bdi) +static inline unsigned long wb_stat_error(void) { #ifdef CONFIG_SMP - return nr_cpu_ids * BDI_STAT_BATCH; + return nr_cpu_ids * WB_STAT_BATCH; #else return 1; #endif } +/* BDI ratio is expressed as part per 1000000 for finer granularity. 
*/ +#define BDI_RATIO_SCALE 10000 + +u64 bdi_get_min_bytes(struct backing_dev_info *bdi); +u64 bdi_get_max_bytes(struct backing_dev_info *bdi); int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio); int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); +int bdi_set_min_ratio_no_scale(struct backing_dev_info *bdi, unsigned int min_ratio); +int bdi_set_max_ratio_no_scale(struct backing_dev_info *bdi, unsigned int max_ratio); +int bdi_set_min_bytes(struct backing_dev_info *bdi, u64 min_bytes); +int bdi_set_max_bytes(struct backing_dev_info *bdi, u64 max_bytes); +int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit); /* * Flags in backing_dev_info::capability * - * The first three flags control whether dirty pages will contribute to the - * VM's accounting and whether writepages() should be called for dirty pages - * (something that would not, for example, be appropriate for ramfs) - * - * WARNING: these flags are closely related and should not normally be - * used separately. The BDI_CAP_NO_ACCT_AND_WRITEBACK combines these - * three flags into a single convenience macro. + * BDI_CAP_WRITEBACK: Supports dirty page writeback, and dirty pages + * should contribute to accounting + * BDI_CAP_STRICTLIMIT: Keep number of dirty pages below bdi threshold + */ +#define BDI_CAP_WRITEBACK (1 << 0) +#define BDI_CAP_STRICTLIMIT (1 << 1) + +extern struct backing_dev_info noop_backing_dev_info; + +int bdi_init(struct backing_dev_info *bdi); + +/** + * writeback_in_progress - determine whether there is writeback in progress + * @wb: bdi_writeback of interest * - * BDI_CAP_NO_ACCT_DIRTY: Dirty pages shouldn't contribute to accounting - * BDI_CAP_NO_WRITEBACK: Don't write pages back - * BDI_CAP_NO_ACCT_WB: Don't automatically account writeback pages + * Determine whether there is writeback waiting to be handled against a + * bdi_writeback. 
+ */ +static inline bool writeback_in_progress(struct bdi_writeback *wb) +{ + return test_bit(WB_writeback_running, &wb->state); +} + +struct backing_dev_info *inode_to_bdi(struct inode *inode); + +static inline bool mapping_can_writeback(struct address_space *mapping) +{ + return inode_to_bdi(mapping->host)->capabilities & BDI_CAP_WRITEBACK; +} + +#ifdef CONFIG_CGROUP_WRITEBACK + +struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi, + struct cgroup_subsys_state *memcg_css); +struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, + struct cgroup_subsys_state *memcg_css, + gfp_t gfp); +void wb_memcg_offline(struct mem_cgroup *memcg); +void wb_blkcg_offline(struct cgroup_subsys_state *css); + +/** + * inode_cgwb_enabled - test whether cgroup writeback is enabled on an inode + * @inode: inode of interest * - * These flags let !MMU mmap() govern direct device mapping vs immediate - * copying more easily for MAP_PRIVATE, especially for ROM filesystems. + * Cgroup writeback requires support from the filesystem. Also, both memcg and + * iocg have to be on the default hierarchy. Test whether all conditions are + * met. * - * BDI_CAP_MAP_COPY: Copy can be mapped (MAP_PRIVATE) - * BDI_CAP_MAP_DIRECT: Can be mapped directly (MAP_SHARED) - * BDI_CAP_READ_MAP: Can be mapped for reading - * BDI_CAP_WRITE_MAP: Can be mapped for writing - * BDI_CAP_EXEC_MAP: Can be mapped for execution + * Note that the test result may change dynamically on the same inode + * depending on how memcg and iocg are configured. 
+ */ +static inline bool inode_cgwb_enabled(struct inode *inode) +{ + struct backing_dev_info *bdi = inode_to_bdi(inode); + + return cgroup_subsys_on_dfl(memory_cgrp_subsys) && + cgroup_subsys_on_dfl(io_cgrp_subsys) && + (bdi->capabilities & BDI_CAP_WRITEBACK) && + (inode->i_sb->s_iflags & SB_I_CGROUPWB); +} + +/** + * wb_find_current - find wb for %current on a bdi + * @bdi: bdi of interest * - * BDI_CAP_SWAP_BACKED: Count shmem/tmpfs objects as swap-backed. + * Find the wb of @bdi which matches both the memcg and blkcg of %current. + * Must be called under rcu_read_lock() which protects the returned wb. + * NULL if not found. */ -#define BDI_CAP_NO_ACCT_DIRTY 0x00000001 -#define BDI_CAP_NO_WRITEBACK 0x00000002 -#define BDI_CAP_MAP_COPY 0x00000004 -#define BDI_CAP_MAP_DIRECT 0x00000008 -#define BDI_CAP_READ_MAP 0x00000010 -#define BDI_CAP_WRITE_MAP 0x00000020 -#define BDI_CAP_EXEC_MAP 0x00000040 -#define BDI_CAP_NO_ACCT_WB 0x00000080 -#define BDI_CAP_SWAP_BACKED 0x00000100 -#define BDI_CAP_STABLE_WRITES 0x00000200 - -#define BDI_CAP_VMFLAGS \ - (BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP) - -#define BDI_CAP_NO_ACCT_AND_WRITEBACK \ - (BDI_CAP_NO_WRITEBACK | BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_ACCT_WB) - -#if defined(VM_MAYREAD) && \ - (BDI_CAP_READ_MAP != VM_MAYREAD || \ - BDI_CAP_WRITE_MAP != VM_MAYWRITE || \ - BDI_CAP_EXEC_MAP != VM_MAYEXEC) -#error please change backing_dev_info::capabilities flags -#endif +static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi) +{ + struct cgroup_subsys_state *memcg_css; + struct bdi_writeback *wb; -extern struct backing_dev_info default_backing_dev_info; -extern struct backing_dev_info noop_backing_dev_info; + memcg_css = task_css(current, memory_cgrp_id); + if (!memcg_css->parent) + return &bdi->wb; -int writeback_in_progress(struct backing_dev_info *bdi); + wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id); -static inline int bdi_congested(struct backing_dev_info *bdi, int bdi_bits) 
+ /* + * %current's blkcg equals the effective blkcg of its memcg. No + * need to use the relatively expensive cgroup_get_e_css(). + */ + if (likely(wb && wb->blkcg_css == task_css(current, io_cgrp_id))) + return wb; + return NULL; +} + +/** + * wb_get_create_current - get or create wb for %current on a bdi + * @bdi: bdi of interest + * @gfp: allocation mask + * + * Equivalent to wb_get_create() on %current's memcg. This function is + * called from a relatively hot path and optimizes the common cases using + * wb_find_current(). + */ +static inline struct bdi_writeback * +wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp) { - if (bdi->congested_fn) - return bdi->congested_fn(bdi->congested_data, bdi_bits); - return (bdi->state & bdi_bits); + struct bdi_writeback *wb; + + rcu_read_lock(); + wb = wb_find_current(bdi); + if (wb && unlikely(!wb_tryget(wb))) + wb = NULL; + rcu_read_unlock(); + + if (unlikely(!wb)) { + struct cgroup_subsys_state *memcg_css; + + memcg_css = task_get_css(current, memory_cgrp_id); + wb = wb_get_create(bdi, memcg_css, gfp); + css_put(memcg_css); + } + return wb; } -static inline int bdi_read_congested(struct backing_dev_info *bdi) +/** + * inode_to_wb - determine the wb of an inode + * @inode: inode of interest + * + * Returns the wb @inode is currently associated with. The caller must be + * holding either @inode->i_lock, the i_pages lock, or the + * associated wb's list_lock. 
+ */ +static inline struct bdi_writeback *inode_to_wb(const struct inode *inode) { - return bdi_congested(bdi, 1 << BDI_sync_congested); +#ifdef CONFIG_LOCKDEP + WARN_ON_ONCE(debug_locks && + (inode->i_sb->s_iflags & SB_I_CGROUPWB) && + (!lockdep_is_held(&inode->i_lock) && + !lockdep_is_held(&inode->i_mapping->i_pages.xa_lock) && + !lockdep_is_held(&inode->i_wb->list_lock))); +#endif + return inode->i_wb; } -static inline int bdi_write_congested(struct backing_dev_info *bdi) +static inline struct bdi_writeback *inode_to_wb_wbc( + struct inode *inode, + struct writeback_control *wbc) { - return bdi_congested(bdi, 1 << BDI_async_congested); + /* + * If wbc does not have inode attached, it means cgroup writeback was + * disabled when wbc started. Just use the default wb in that case. + */ + return wbc->wb ? wbc->wb : &inode_to_bdi(inode)->wb; } -static inline int bdi_rw_congested(struct backing_dev_info *bdi) +/** + * unlocked_inode_to_wb_begin - begin unlocked inode wb access transaction + * @inode: target inode + * @cookie: output param, to be passed to the end function + * + * The caller wants to access the wb associated with @inode but isn't + * holding inode->i_lock, the i_pages lock or wb->list_lock. This + * function determines the wb associated with @inode and ensures that the + * association doesn't change until the transaction is finished with + * unlocked_inode_to_wb_end(). + * + * The caller must call unlocked_inode_to_wb_end() with *@cookie afterwards and + * can't sleep during the transaction. IRQs may or may not be disabled on + * return. + */ +static inline struct bdi_writeback * +unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie) { - return bdi_congested(bdi, (1 << BDI_sync_congested) | - (1 << BDI_async_congested)); + rcu_read_lock(); + + /* + * Paired with a release fence in inode_do_switch_wbs() and + * ensures that we see the new wb if we see cleared I_WB_SWITCH. 
+ */ + cookie->locked = inode_state_read_once(inode) & I_WB_SWITCH; + smp_rmb(); + + if (unlikely(cookie->locked)) + xa_lock_irqsave(&inode->i_mapping->i_pages, cookie->flags); + + /* + * Protected by either !I_WB_SWITCH + rcu_read_lock() or the i_pages + * lock. inode_to_wb() will bark. Deref directly. + */ + return inode->i_wb; } -enum { - BLK_RW_ASYNC = 0, - BLK_RW_SYNC = 1, -}; +/** + * unlocked_inode_to_wb_end - end inode wb access transaction + * @inode: target inode + * @cookie: @cookie from unlocked_inode_to_wb_begin() + */ +static inline void unlocked_inode_to_wb_end(struct inode *inode, + struct wb_lock_cookie *cookie) +{ + if (unlikely(cookie->locked)) + xa_unlock_irqrestore(&inode->i_mapping->i_pages, cookie->flags); -void clear_bdi_congested(struct backing_dev_info *bdi, int sync); -void set_bdi_congested(struct backing_dev_info *bdi, int sync); -long congestion_wait(int sync, long timeout); -long wait_iff_congested(struct zone *zone, int sync, long timeout); -int pdflush_proc_obsolete(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); + rcu_read_unlock(); +} + +#else /* CONFIG_CGROUP_WRITEBACK */ -static inline bool bdi_cap_stable_pages_required(struct backing_dev_info *bdi) +static inline bool inode_cgwb_enabled(struct inode *inode) { - return bdi->capabilities & BDI_CAP_STABLE_WRITES; + return false; } -static inline bool bdi_cap_writeback_dirty(struct backing_dev_info *bdi) +static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi) { - return !(bdi->capabilities & BDI_CAP_NO_WRITEBACK); + return &bdi->wb; } -static inline bool bdi_cap_account_dirty(struct backing_dev_info *bdi) +static inline struct bdi_writeback * +wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp) { - return !(bdi->capabilities & BDI_CAP_NO_ACCT_DIRTY); + return &bdi->wb; } -static inline bool bdi_cap_account_writeback(struct backing_dev_info *bdi) +static inline struct bdi_writeback *inode_to_wb(struct 
inode *inode) { - /* Paranoia: BDI_CAP_NO_WRITEBACK implies BDI_CAP_NO_ACCT_WB */ - return !(bdi->capabilities & (BDI_CAP_NO_ACCT_WB | - BDI_CAP_NO_WRITEBACK)); + return &inode_to_bdi(inode)->wb; } -static inline bool bdi_cap_swap_backed(struct backing_dev_info *bdi) +static inline struct bdi_writeback *inode_to_wb_wbc( + struct inode *inode, + struct writeback_control *wbc) { - return bdi->capabilities & BDI_CAP_SWAP_BACKED; + return inode_to_wb(inode); } -static inline bool mapping_cap_writeback_dirty(struct address_space *mapping) + +static inline struct bdi_writeback * +unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie) { - return bdi_cap_writeback_dirty(mapping->backing_dev_info); + return inode_to_wb(inode); } -static inline bool mapping_cap_account_dirty(struct address_space *mapping) +static inline void unlocked_inode_to_wb_end(struct inode *inode, + struct wb_lock_cookie *cookie) { - return bdi_cap_account_dirty(mapping->backing_dev_info); } -static inline bool mapping_cap_swap_backed(struct address_space *mapping) +static inline void wb_memcg_offline(struct mem_cgroup *memcg) { - return bdi_cap_swap_backed(mapping->backing_dev_info); } -static inline int bdi_sched_wait(void *word) +static inline void wb_blkcg_offline(struct cgroup_subsys_state *css) { - schedule(); - return 0; } -#endif /* _LINUX_BACKING_DEV_H */ +#endif /* CONFIG_CGROUP_WRITEBACK */ + +const char *bdi_dev_name(struct backing_dev_info *bdi); + +#endif /* _LINUX_BACKING_DEV_H */ |
