diff options
Diffstat (limited to 'include/linux/swap.h')
| -rw-r--r-- | include/linux/swap.h | 329 |
1 files changed, 139 insertions, 190 deletions
diff --git a/include/linux/swap.h b/include/linux/swap.h index 2787b84eaf12..38ca3df68716 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -24,7 +24,6 @@ struct pagevec; #define SWAP_FLAG_PREFER 0x8000 /* set if swap priority specified */ #define SWAP_FLAG_PRIO_MASK 0x7fff -#define SWAP_FLAG_PRIO_SHIFT 0 #define SWAP_FLAG_DISCARD 0x10000 /* enable discard for swap */ #define SWAP_FLAG_DISCARD_ONCE 0x20000 /* discard swap area at swapon-time */ #define SWAP_FLAG_DISCARD_PAGES 0x40000 /* discard page-clusters after use */ @@ -74,14 +73,13 @@ static inline int current_is_kswapd(void) * to a special SWP_DEVICE_{READ|WRITE} entry. * * When a page is mapped by the device for exclusive access we set the CPU page - * table entries to special SWP_DEVICE_EXCLUSIVE_* entries. + * table entries to a special SWP_DEVICE_EXCLUSIVE entry. */ #ifdef CONFIG_DEVICE_PRIVATE -#define SWP_DEVICE_NUM 4 +#define SWP_DEVICE_NUM 3 #define SWP_DEVICE_WRITE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM) #define SWP_DEVICE_READ (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+1) -#define SWP_DEVICE_EXCLUSIVE_WRITE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+2) -#define SWP_DEVICE_EXCLUSIVE_READ (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+3) +#define SWP_DEVICE_EXCLUSIVE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+2) #else #define SWP_DEVICE_NUM 0 #endif @@ -153,13 +151,28 @@ union swap_header { * memory reclaim */ struct reclaim_state { - unsigned long reclaimed_slab; + /* pages reclaimed outside of LRU-based reclaim */ + unsigned long reclaimed; #ifdef CONFIG_LRU_GEN /* per-thread mm walk data */ struct lru_gen_mm_walk *mm_walk; #endif }; +/* + * mm_account_reclaimed_pages(): account reclaimed pages outside of LRU-based + * reclaim + * @pages: number of pages reclaimed + * + * If the current process is undergoing a reclaim operation, increment the + * number of reclaimed pages by @pages. + */ +static inline void mm_account_reclaimed_pages(unsigned long pages) +{ + if (current->reclaim_state) + current->reclaim_state->reclaimed += pages; +} + #ifdef __KERNEL__ struct address_space; @@ -204,10 +217,10 @@ enum { SWP_STABLE_WRITES = (1 << 11), /* no overwrite PG_writeback pages */ SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */ /* add others here before... */ - SWP_SCANNING = (1 << 14), /* refcount in scan_swap_map */ }; #define SWAP_CLUSTER_MAX 32UL +#define SWAP_CLUSTER_MAX_SKIPPED (SWAP_CLUSTER_MAX << 10) #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX /* Bit flag in swap_map */ @@ -223,42 +236,25 @@ enum { #define SWAP_CONT_MAX 0x7f /* Max count */ /* - * We use this to track usage of a cluster. A cluster is a block of swap disk - * space with SWAPFILE_CLUSTER pages long and naturally aligns in disk. All - * free clusters are organized into a list. We fetch an entry from the list to - * get a free cluster. - * - * The data field stores next cluster if the cluster is free or cluster usage - * counter otherwise. The flags field determines if a cluster is free. This is - * protected by swap_info_struct.lock. + * The first page in the swap file is the swap header, which is always marked + * bad to prevent it from being allocated as an entry. This also prevents the + * cluster to which it belongs being marked free. Therefore 0 is safe to use as + * a sentinel to indicate an entry is not valid. */ -struct swap_cluster_info { - spinlock_t lock; /* - * Protect swap_cluster_info fields - * and swap_info_struct->swap_map - * elements correspond to the swap - * cluster - */ - unsigned int data:24; - unsigned int flags:8; -}; -#define CLUSTER_FLAG_FREE 1 /* This cluster is free */ -#define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */ -#define CLUSTER_FLAG_HUGE 4 /* This cluster is backing a transparent huge page */ +#define SWAP_ENTRY_INVALID 0 + +#ifdef CONFIG_THP_SWAP +#define SWAP_NR_ORDERS (PMD_ORDER + 1) +#else +#define SWAP_NR_ORDERS 1 +#endif /* - * We assign a cluster to each CPU, so each CPU can allocate swap entry from - * its own cluster and swapout sequentially. The purpose is to optimize swapout - * throughput. + * We keep using same cluster for rotational device so IO will be sequential. + * The purpose is to optimize SWAP throughput on these device. */ -struct percpu_cluster { - struct swap_cluster_info index; /* Current cluster index */ - unsigned int next; /* Likely next allocation offset */ -}; - -struct swap_cluster_list { - struct swap_cluster_info head; - struct swap_cluster_info tail; +struct swap_sequential_cluster { + unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */ }; /* @@ -272,32 +268,26 @@ struct swap_info_struct { signed char type; /* strange name for an index */ unsigned int max; /* extent of the swap_map */ unsigned char *swap_map; /* vmalloc'ed array of usage counts */ + unsigned long *zeromap; /* kvmalloc'ed bitmap to track zero pages */ struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ - struct swap_cluster_list free_clusters; /* free clusters list */ - unsigned int lowest_bit; /* index of first free in swap_map */ - unsigned int highest_bit; /* index of last free in swap_map */ + struct list_head free_clusters; /* free clusters list */ + struct list_head full_clusters; /* full clusters list */ + struct list_head nonfull_clusters[SWAP_NR_ORDERS]; + /* list of cluster that contains at least one free slot */ + struct list_head frag_clusters[SWAP_NR_ORDERS]; + /* list of cluster that are fragmented or contented */ unsigned int pages; /* total of usable pages of swap */ - unsigned int inuse_pages; /* number of those currently in use */ - unsigned int cluster_next; /* likely index for next allocation */ - unsigned int cluster_nr; /* countdown to next cluster search */ - unsigned int __percpu *cluster_next_cpu; /*percpu index for next allocation */ - struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */ + atomic_long_t inuse_pages; /* number of those currently in use */ + struct swap_sequential_cluster *global_cluster; /* Use one global cluster for rotating device */ + spinlock_t global_cluster_lock; /* Serialize usage of global cluster */ struct rb_root swap_extent_root;/* root of the swap extent rbtree */ struct block_device *bdev; /* swap device or bdev of swap file */ struct file *swap_file; /* seldom referenced */ - unsigned int old_block_size; /* seldom referenced */ struct completion comp; /* seldom referenced */ -#ifdef CONFIG_FRONTSWAP - unsigned long *frontswap_map; /* frontswap in-use, one bit per page */ - atomic_t frontswap_pages; /* frontswap pages in-use counter */ -#endif spinlock_t lock; /* * protect map scan related fields like - * swap_map, lowest_bit, highest_bit, - * inuse_pages, cluster_next, - * cluster_nr, lowest_alloc, - * highest_alloc, free/discard cluster - * list. other fields are only changed + * swap_map, inuse_pages and all cluster + * lists. other fields are only changed * at swapon/swapoff, so are protected * by swap_lock. changing flags need * hold this lock and swap_lock. If @@ -309,65 +299,28 @@ struct swap_info_struct { * list. */ struct work_struct discard_work; /* discard worker */ - struct swap_cluster_list discard_clusters; /* discard clusters list */ - struct plist_node avail_lists[]; /* - * entries in swap_avail_heads, one - * entry per node. - * Must be last as the number of the - * array is nr_node_ids, which is not - * a fixed value so have to allocate - * dynamically. - * And it has to be an array so that - * plist_for_each_* can work. - */ + struct work_struct reclaim_work; /* reclaim worker */ + struct list_head discard_clusters; /* discard clusters list */ + struct plist_node avail_list; /* entry in swap_avail_head */ }; -#ifdef CONFIG_64BIT -#define SWAP_RA_ORDER_CEILING 5 -#else -/* Avoid stack overflow, because we need to save part of page table */ -#define SWAP_RA_ORDER_CEILING 3 -#define SWAP_RA_PTE_CACHE_SIZE (1 << SWAP_RA_ORDER_CEILING) -#endif - -struct vma_swap_readahead { - unsigned short win; - unsigned short offset; - unsigned short nr_pte; -#ifdef CONFIG_64BIT - pte_t *ptes; -#else - pte_t ptes[SWAP_RA_PTE_CACHE_SIZE]; -#endif -}; - -static inline swp_entry_t folio_swap_entry(struct folio *folio) +static inline swp_entry_t page_swap_entry(struct page *page) { - swp_entry_t entry = { .val = page_private(&folio->page) }; - return entry; -} + struct folio *folio = page_folio(page); + swp_entry_t entry = folio->swap; -static inline void folio_set_swap_entry(struct folio *folio, swp_entry_t entry) -{ - folio->private = (void *)entry.val; + entry.val += folio_page_idx(folio, page); + return entry; } /* linux/mm/workingset.c */ +bool workingset_test_recent(void *shadow, bool file, bool *workingset, + bool flush); void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages); void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg); void workingset_refault(struct folio *folio, void *shadow); void workingset_activation(struct folio *folio); -/* Only track the nodes of mappings with shadow entries */ -void workingset_update_node(struct xa_node *node); -extern struct list_lru shadow_nodes; -#define mapping_set_update(xas, mapping) do { \ - if (!dax_mapping(mapping) && !shmem_mapping(mapping)) { \ - xas_set_update(xas, workingset_update_node); \ - xas_set_lru(xas, &shadow_nodes); \ - } \ -} while (0) - /* linux/mm/page_alloc.c */ extern unsigned long totalreserve_pages; @@ -376,14 +329,25 @@ extern unsigned long totalreserve_pages; /* linux/mm/swap.c */ -void lru_note_cost(struct lruvec *lruvec, bool file, - unsigned int nr_io, unsigned int nr_rotated); +void lru_note_cost_unlock_irq(struct lruvec *lruvec, bool file, + unsigned int nr_io, unsigned int nr_rotated) + __releases(lruvec->lru_lock); void lru_note_cost_refault(struct folio *); void folio_add_lru(struct folio *); void folio_add_lru_vma(struct folio *, struct vm_area_struct *); void mark_page_accessed(struct page *); void folio_mark_accessed(struct folio *); +static inline bool folio_may_be_lru_cached(struct folio *folio) +{ + /* + * Holding PMD-sized folios in per-CPU LRU cache unbalances accounting. + * Holding small numbers of low-order mTHP folios in per-CPU LRU cache + * will be sensible, but nobody has implemented and tested that yet. + */ + return !folio_test_large(folio); +} + extern atomic_t lru_disable_count; static inline bool lru_cache_disabled(void) @@ -401,13 +365,10 @@ extern void lru_add_drain(void); extern void lru_add_drain_cpu(int cpu); extern void lru_add_drain_cpu_zone(struct zone *zone); extern void lru_add_drain_all(void); -extern void deactivate_page(struct page *page); -extern void mark_page_lazyfree(struct page *page); +void folio_deactivate(struct folio *folio); +void folio_mark_lazyfree(struct folio *folio); extern void swap_setup(void); -extern void lru_cache_add_inactive_or_unevictable(struct page *page, - struct vm_area_struct *vma); - /* linux/mm/vmscan.c */ extern unsigned long zone_reclaimable_pages(struct zone *zone); extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, @@ -415,11 +376,17 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, #define MEMCG_RECLAIM_MAY_SWAP (1 << 1) #define MEMCG_RECLAIM_PROACTIVE (1 << 2) +#define MIN_SWAPPINESS 0 +#define MAX_SWAPPINESS 200 + +/* Just reclaim from anon folios in proactive memory reclaim */ +#define SWAPPINESS_ANON_ONLY (MAX_SWAPPINESS + 1) + extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, unsigned int reclaim_options, - nodemask_t *nodemask); + int *swappiness); extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem, gfp_t gfp_mask, bool noswap, pg_data_t *pgdat, @@ -428,26 +395,31 @@ extern unsigned long shrink_all_memory(unsigned long nr_pages); extern int vm_swappiness; long remove_mapping(struct address_space *mapping, struct folio *folio); -extern unsigned long reclaim_pages(struct list_head *page_list); -#ifdef CONFIG_NUMA -extern int node_reclaim_mode; -extern int sysctl_min_unmapped_ratio; -extern int sysctl_min_slab_ratio; +#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) +extern int reclaim_register_node(struct node *node); +extern void reclaim_unregister_node(struct node *node); + #else -#define node_reclaim_mode 0 -#endif -static inline bool node_reclaim_enabled(void) +static inline int reclaim_register_node(struct node *node) +{ + return 0; +} + +static inline void reclaim_unregister_node(struct node *node) { - /* Is any node_reclaim_mode bit set? */ - return node_reclaim_mode & (RECLAIM_ZONE|RECLAIM_WRITE|RECLAIM_UNMAP); } +#endif /* CONFIG_SYSFS && CONFIG_NUMA */ + +#ifdef CONFIG_NUMA +extern int sysctl_min_unmapped_ratio; +extern int sysctl_min_slab_ratio; +#endif void check_move_unevictable_folios(struct folio_batch *fbatch); -void check_move_unevictable_pages(struct pagevec *pvec); -extern void kswapd_run(int nid); -extern void kswapd_stop(int nid); +extern void __meminit kswapd_run(int nid); +extern void __meminit kswapd_stop(int nid); #ifdef CONFIG_SWAP @@ -461,14 +433,13 @@ static inline unsigned long total_swapcache_pages(void) return global_node_page_state(NR_SWAPCACHE); } -extern void free_swap_cache(struct page *page); -extern void free_page_and_swap_cache(struct page *); -extern void free_pages_and_swap_cache(struct encoded_page **, int); +void free_swap_cache(struct folio *folio); +void free_folio_and_swap_cache(struct folio *folio); +void free_pages_and_swap_cache(struct encoded_page **, int); /* linux/mm/swapfile.c */ extern atomic_long_t nr_swap_pages; extern long total_swap_pages; extern atomic_t nr_rotate_swap; -extern bool has_usable_swap(void); /* Swap 50% full? Release swapcache more aggressively.. */ static inline bool vm_swap_full(void) @@ -482,32 +453,26 @@ static inline long get_nr_swap_pages(void) } extern void si_swapinfo(struct sysinfo *); -swp_entry_t folio_alloc_swap(struct folio *folio); +int folio_alloc_swap(struct folio *folio); bool folio_free_swap(struct folio *folio); void put_swap_folio(struct folio *folio, swp_entry_t entry); extern swp_entry_t get_swap_page_of_type(int); -extern int get_swap_pages(int n, swp_entry_t swp_entries[], int entry_size); extern int add_swap_count_continuation(swp_entry_t, gfp_t); -extern void swap_shmem_alloc(swp_entry_t); +extern void swap_shmem_alloc(swp_entry_t, int); extern int swap_duplicate(swp_entry_t); -extern int swapcache_prepare(swp_entry_t); -extern void swap_free(swp_entry_t); -extern void swapcache_free_entries(swp_entry_t *entries, int n); -extern int free_swap_and_cache(swp_entry_t); +extern int swapcache_prepare(swp_entry_t entry, int nr); +extern void swap_free_nr(swp_entry_t entry, int nr_pages); +extern void free_swap_and_cache_nr(swp_entry_t entry, int nr); int swap_type_of(dev_t device, sector_t offset); int find_first_swap(dev_t *device); extern unsigned int count_swap_pages(int, int); extern sector_t swapdev_block(int, pgoff_t); extern int __swap_count(swp_entry_t entry); -extern int __swp_swapcount(swp_entry_t entry); +extern bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); -extern struct swap_info_struct *page_swap_info(struct page *); -extern struct swap_info_struct *swp_swap_info(swp_entry_t entry); struct backing_dev_info; -extern int init_swap_address_space(unsigned int type, unsigned long nr_pages); -extern void exit_swap_address_space(unsigned int type); extern struct swap_info_struct *get_swap_device(swp_entry_t entry); -sector_t swap_page_sector(struct page *page); +sector_t swap_folio_sector(struct folio *folio); static inline void put_swap_device(struct swap_info_struct *si) { @@ -515,11 +480,6 @@ static inline void put_swap_device(struct swap_info_struct *si) } #else /* CONFIG_SWAP */ -static inline struct swap_info_struct *swp_swap_info(swp_entry_t entry) -{ - return NULL; -} - static inline struct swap_info_struct *get_swap_device(swp_entry_t entry) { return NULL; @@ -536,17 +496,16 @@ static inline void put_swap_device(struct swap_info_struct *si) #define si_swapinfo(val) \ do { (val)->freeswap = (val)->totalswap = 0; } while (0) -/* only sparc can not include linux/pagemap.h in this file - * so leave put_page and release_pages undeclared... */ -#define free_page_and_swap_cache(page) \ - put_page(page) +#define free_folio_and_swap_cache(folio) \ + folio_put(folio) #define free_pages_and_swap_cache(pages, nr) \ release_pages((pages), (nr)); -/* used to sanity check ptes in zap_pte_range when CONFIG_SWAP=0 */ -#define free_swap_and_cache(e) is_pfn_swap_entry(e) +static inline void free_swap_and_cache_nr(swp_entry_t entry, int nr) +{ +} -static inline void free_swap_cache(struct page *page) +static inline void free_swap_cache(struct folio *folio) { } @@ -555,7 +514,7 @@ static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask) return 0; } -static inline void swap_shmem_alloc(swp_entry_t swp) +static inline void swap_shmem_alloc(swp_entry_t swp, int nr) { } @@ -564,7 +523,12 @@ static inline int swap_duplicate(swp_entry_t swp) return 0; } -static inline void swap_free(swp_entry_t swp) +static inline int swapcache_prepare(swp_entry_t swp, int nr) +{ + return 0; +} + +static inline void swap_free_nr(swp_entry_t entry, int nr_pages) { } @@ -577,9 +541,9 @@ static inline int __swap_count(swp_entry_t entry) return 0; } -static inline int __swp_swapcount(swp_entry_t entry) +static inline bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry) { - return 0; + return false; } static inline int swp_swapcount(swp_entry_t entry) @@ -587,11 +551,9 @@ static inline int swp_swapcount(swp_entry_t entry) return 0; } -static inline swp_entry_t folio_alloc_swap(struct folio *folio) +static inline int folio_alloc_swap(struct folio *folio) { - swp_entry_t entry; - entry.val = 0; - return entry; + return -EINVAL; } static inline bool folio_free_swap(struct folio *folio) @@ -607,60 +569,51 @@ static inline int add_swap_extent(struct swap_info_struct *sis, } #endif /* CONFIG_SWAP */ -#ifdef CONFIG_THP_SWAP -extern int split_swap_cluster(swp_entry_t entry); -#else -static inline int split_swap_cluster(swp_entry_t entry) +static inline void free_swap_and_cache(swp_entry_t entry) { - return 0; + free_swap_and_cache_nr(entry, 1); +} + +static inline void swap_free(swp_entry_t entry) +{ + swap_free_nr(entry, 1); } -#endif #ifdef CONFIG_MEMCG static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg) { /* Cgroup2 doesn't have per-cgroup swappiness */ if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) - return vm_swappiness; + return READ_ONCE(vm_swappiness); /* root ? */ if (mem_cgroup_disabled() || mem_cgroup_is_root(memcg)) - return vm_swappiness; + return READ_ONCE(vm_swappiness); - return memcg->swappiness; + return READ_ONCE(memcg->swappiness); } #else static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) { - return vm_swappiness; + return READ_ONCE(vm_swappiness); } #endif -#ifdef CONFIG_ZSWAP -extern u64 zswap_pool_total_size; -extern atomic_t zswap_stored_pages; -#endif - #if defined(CONFIG_SWAP) && defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) -extern void __cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask); -static inline void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask) +void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp); +static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp) { if (mem_cgroup_disabled()) return; - __cgroup_throttle_swaprate(page, gfp_mask); + __folio_throttle_swaprate(folio, gfp); } #else -static inline void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask) -{ -} -#endif static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp) { - cgroup_throttle_swaprate(&folio->page, gfp); } +#endif #if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP) -void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry); int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry); static inline int mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) @@ -681,10 +634,6 @@ static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_p extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg); extern bool mem_cgroup_swap_full(struct folio *folio); #else -static inline void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) -{ -} - static inline int mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) { |
