diff options
Diffstat (limited to 'mm/swapfile.c')
-rw-r--r-- | mm/swapfile.c | 566 |
1 files changed, 363 insertions, 203 deletions
diff --git a/mm/swapfile.c b/mm/swapfile.c index b4f3cc712580..10760240a3a2 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -46,6 +46,7 @@ #include <asm/tlbflush.h> #include <linux/swapops.h> #include <linux/swap_cgroup.h> +#include "swap_table.h" #include "internal.h" #include "swap.h" @@ -58,9 +59,9 @@ static void swap_entries_free(struct swap_info_struct *si, static void swap_range_alloc(struct swap_info_struct *si, unsigned int nr_entries); static bool folio_swapcache_freeable(struct folio *folio); -static struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, - unsigned long offset); -static inline void unlock_cluster(struct swap_cluster_info *ci); +static void move_cluster(struct swap_info_struct *si, + struct swap_cluster_info *ci, struct list_head *list, + enum swap_cluster_flags new_flags); static DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; @@ -105,7 +106,9 @@ static PLIST_HEAD(swap_active_head); static struct plist_head *swap_avail_heads; static DEFINE_SPINLOCK(swap_avail_lock); -static struct swap_info_struct *swap_info[MAX_SWAPFILES]; +struct swap_info_struct *swap_info[MAX_SWAPFILES]; + +static struct kmem_cache *swap_table_cachep; static DEFINE_MUTEX(swapon_mutex); @@ -127,14 +130,20 @@ static DEFINE_PER_CPU(struct percpu_swap_cluster, percpu_swap_cluster) = { .lock = INIT_LOCAL_LOCK(), }; -static struct swap_info_struct *swap_type_to_swap_info(int type) +/* May return NULL on invalid type, caller must check for NULL return */ +static struct swap_info_struct *swap_type_to_info(int type) { if (type >= MAX_SWAPFILES) return NULL; - return READ_ONCE(swap_info[type]); /* rcu_dereference() */ } +/* May return NULL on invalid entry, caller must check for NULL return */ +static struct swap_info_struct *swap_entry_to_info(swp_entry_t entry) +{ + return swap_type_to_info(swp_type(entry)); +} + static inline unsigned char swap_count(unsigned char ent) { return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */ @@ -212,16 +221,15 @@ static bool swap_is_last_map(struct swap_info_struct *si, static int __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset, unsigned long flags) { - swp_entry_t entry = swp_entry(si->type, offset); - struct address_space *address_space = swap_address_space(entry); + const swp_entry_t entry = swp_entry(si->type, offset); struct swap_cluster_info *ci; struct folio *folio; int ret, nr_pages; bool need_reclaim; again: - folio = filemap_get_folio(address_space, swap_cache_index(entry)); - if (IS_ERR(folio)) + folio = swap_cache_get_folio(entry); + if (!folio) return 0; nr_pages = folio_nr_pages(folio); @@ -241,13 +249,12 @@ again: * Offset could point to the middle of a large folio, or folio * may no longer point to the expected offset before it's locked. */ - entry = folio->swap; - if (offset < swp_offset(entry) || offset >= swp_offset(entry) + nr_pages) { + if (!folio_matches_swap_entry(folio, entry)) { folio_unlock(folio); folio_put(folio); goto again; } - offset = swp_offset(entry); + offset = swp_offset(folio->swap); need_reclaim = ((flags & TTRS_ANYWAY) || ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) || @@ -260,13 +267,13 @@ again: * swap_map is HAS_CACHE only, which means the slots have no page table * reference or pending writeback, and can't be allocated to others. */ - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); need_reclaim = swap_only_has_cache(si, offset, nr_pages); - unlock_cluster(ci); + swap_cluster_unlock(ci); if (!need_reclaim) goto out_unlock; - delete_from_swap_cache(folio); + swap_cache_del_folio(folio); folio_set_dirty(folio); ret = nr_pages; out_unlock: @@ -347,7 +354,7 @@ offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset) sector_t swap_folio_sector(struct folio *folio) { - struct swap_info_struct *sis = swp_swap_info(folio->swap); + struct swap_info_struct *sis = __swap_entry_to_info(folio->swap); struct swap_extent *se; sector_t sector; pgoff_t offset; @@ -387,19 +394,6 @@ static void discard_swap_cluster(struct swap_info_struct *si, } } -#ifdef CONFIG_THP_SWAP -#define SWAPFILE_CLUSTER HPAGE_PMD_NR - -#define swap_entry_order(order) (order) -#else -#define SWAPFILE_CLUSTER 256 - -/* - * Define swap_entry_order() as constant to let compiler to optimize - * out some code if !CONFIG_THP_SWAP - */ -#define swap_entry_order(order) 0 -#endif #define LATENCY_LIMIT 256 static inline bool cluster_is_empty(struct swap_cluster_info *info) @@ -412,10 +406,17 @@ static inline bool cluster_is_discard(struct swap_cluster_info *info) return info->flags == CLUSTER_FLAG_DISCARD; } +static inline bool cluster_table_is_alloced(struct swap_cluster_info *ci) +{ + return rcu_dereference_protected(ci->table, lockdep_is_held(&ci->lock)); +} + static inline bool cluster_is_usable(struct swap_cluster_info *ci, int order) { if (unlikely(ci->flags > CLUSTER_FLAG_USABLE)) return false; + if (!cluster_table_is_alloced(ci)) + return false; if (!order) return true; return cluster_is_empty(ci) || order == ci->order; @@ -427,32 +428,126 @@ static inline unsigned int cluster_index(struct swap_info_struct *si, return ci - si->cluster_info; } -static inline struct swap_cluster_info *offset_to_cluster(struct swap_info_struct *si, - unsigned long offset) -{ - return &si->cluster_info[offset / SWAPFILE_CLUSTER]; -} - static inline unsigned int cluster_offset(struct swap_info_struct *si, struct swap_cluster_info *ci) { return cluster_index(si, ci) * SWAPFILE_CLUSTER; } -static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, - unsigned long offset) +static struct swap_table *swap_table_alloc(gfp_t gfp) { - struct swap_cluster_info *ci; + struct folio *folio; - ci = offset_to_cluster(si, offset); - spin_lock(&ci->lock); + if (!SWP_TABLE_USE_PAGE) + return kmem_cache_zalloc(swap_table_cachep, gfp); - return ci; + folio = folio_alloc(gfp | __GFP_ZERO, 0); + if (folio) + return folio_address(folio); + return NULL; +} + +static void swap_table_free_folio_rcu_cb(struct rcu_head *head) +{ + struct folio *folio; + + folio = page_folio(container_of(head, struct page, rcu_head)); + folio_put(folio); +} + +static void swap_table_free(struct swap_table *table) +{ + if (!SWP_TABLE_USE_PAGE) { + kmem_cache_free(swap_table_cachep, table); + return; + } + + call_rcu(&(folio_page(virt_to_folio(table), 0)->rcu_head), + swap_table_free_folio_rcu_cb); +} + +static void swap_cluster_free_table(struct swap_cluster_info *ci) +{ + unsigned int ci_off; + struct swap_table *table; + + /* Only empty cluster's table is allow to be freed */ + lockdep_assert_held(&ci->lock); + VM_WARN_ON_ONCE(!cluster_is_empty(ci)); + for (ci_off = 0; ci_off < SWAPFILE_CLUSTER; ci_off++) + VM_WARN_ON_ONCE(!swp_tb_is_null(__swap_table_get(ci, ci_off))); + table = (void *)rcu_dereference_protected(ci->table, true); + rcu_assign_pointer(ci->table, NULL); + + swap_table_free(table); } -static inline void unlock_cluster(struct swap_cluster_info *ci) +/* + * Allocate swap table for one cluster. Attempt an atomic allocation first, + * then fallback to sleeping allocation. + */ +static struct swap_cluster_info * +swap_cluster_alloc_table(struct swap_info_struct *si, + struct swap_cluster_info *ci) { + struct swap_table *table; + + /* + * Only cluster isolation from the allocator does table allocation. + * Swap allocator uses percpu clusters and holds the local lock. + */ + lockdep_assert_held(&ci->lock); + lockdep_assert_held(&this_cpu_ptr(&percpu_swap_cluster)->lock); + + /* The cluster must be free and was just isolated from the free list. */ + VM_WARN_ON_ONCE(ci->flags || !cluster_is_empty(ci)); + + table = swap_table_alloc(__GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); + if (table) { + rcu_assign_pointer(ci->table, table); + return ci; + } + + /* + * Try a sleep allocation. Each isolated free cluster may cause + * a sleep allocation, but there is a limited number of them, so + * the potential recursive allocation is limited. + */ spin_unlock(&ci->lock); + if (!(si->flags & SWP_SOLIDSTATE)) + spin_unlock(&si->global_cluster_lock); + local_unlock(&percpu_swap_cluster.lock); + + table = swap_table_alloc(__GFP_HIGH | __GFP_NOMEMALLOC | GFP_KERNEL); + + /* + * Back to atomic context. We might have migrated to a new CPU with a + * usable percpu cluster. But just keep using the isolated cluster to + * make things easier. Migration indicates a slight change of workload + * so using a new free cluster might not be a bad idea, and the worst + * could happen with ignoring the percpu cluster is fragmentation, + * which is acceptable since this fallback and race is rare. + */ + local_lock(&percpu_swap_cluster.lock); + if (!(si->flags & SWP_SOLIDSTATE)) + spin_lock(&si->global_cluster_lock); + spin_lock(&ci->lock); + + /* Nothing except this helper should touch a dangling empty cluster. */ + if (WARN_ON_ONCE(cluster_table_is_alloced(ci))) { + if (table) + swap_table_free(table); + return ci; + } + + if (!table) { + move_cluster(si, ci, &si->free_clusters, CLUSTER_FLAG_FREE); + spin_unlock(&ci->lock); + return NULL; + } + + rcu_assign_pointer(ci->table, table); + return ci; } static void move_cluster(struct swap_info_struct *si, @@ -470,11 +565,6 @@ static void move_cluster(struct swap_info_struct *si, else list_move_tail(&ci->list, list); spin_unlock(&si->lock); - - if (ci->flags == CLUSTER_FLAG_FRAG) - atomic_long_dec(&si->frag_cluster_nr[ci->order]); - else if (new_flags == CLUSTER_FLAG_FRAG) - atomic_long_inc(&si->frag_cluster_nr[ci->order]); ci->flags = new_flags; } @@ -489,7 +579,7 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si, static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) { - lockdep_assert_held(&ci->lock); + swap_cluster_free_table(ci); move_cluster(si, ci, &si->free_clusters, CLUSTER_FLAG_FREE); ci->order = 0; } @@ -504,15 +594,11 @@ static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info * this returns NULL for an non-empty list. */ static struct swap_cluster_info *isolate_lock_cluster( - struct swap_info_struct *si, struct list_head *list) + struct swap_info_struct *si, struct list_head *list, int order) { - struct swap_cluster_info *ci, *ret = NULL; + struct swap_cluster_info *ci, *found = NULL; spin_lock(&si->lock); - - if (unlikely(!(si->flags & SWP_WRITEOK))) - goto out; - list_for_each_entry(ci, list, list) { if (!spin_trylock(&ci->lock)) continue; @@ -524,13 +610,19 @@ static struct swap_cluster_info *isolate_lock_cluster( list_del(&ci->list); ci->flags = CLUSTER_FLAG_NONE; - ret = ci; + found = ci; break; } -out: spin_unlock(&si->lock); - return ret; + if (found && !cluster_table_is_alloced(found)) { + /* Only an empty free cluster's swap table can be freed. */ + VM_WARN_ON_ONCE(list != &si->free_clusters); + VM_WARN_ON_ONCE(!cluster_is_empty(found)); + return swap_cluster_alloc_table(si, found); + } + + return found; } /* @@ -663,17 +755,27 @@ static void relocate_cluster(struct swap_info_struct *si, * added to free cluster list and its usage counter will be increased by 1. * Only used for initialization. */ -static void inc_cluster_info_page(struct swap_info_struct *si, +static int inc_cluster_info_page(struct swap_info_struct *si, struct swap_cluster_info *cluster_info, unsigned long page_nr) { unsigned long idx = page_nr / SWAPFILE_CLUSTER; + struct swap_table *table; struct swap_cluster_info *ci; ci = cluster_info + idx; + if (!ci->table) { + table = swap_table_alloc(GFP_KERNEL); + if (!table) + return -ENOMEM; + rcu_assign_pointer(ci->table, table); + } + ci->count++; VM_BUG_ON(ci->count > SWAPFILE_CLUSTER); VM_BUG_ON(ci->flags); + + return 0; } static bool cluster_reclaim_range(struct swap_info_struct *si, @@ -742,6 +844,26 @@ static bool cluster_scan_range(struct swap_info_struct *si, return true; } +/* + * Currently, the swap table is not used for count tracking, just + * do a sanity check here to ensure nothing leaked, so the swap + * table should be empty upon freeing. + */ +static void swap_cluster_assert_table_empty(struct swap_cluster_info *ci, + unsigned int start, unsigned int nr) +{ + unsigned int ci_off = start % SWAPFILE_CLUSTER; + unsigned int ci_end = ci_off + nr; + unsigned long swp_tb; + + if (IS_ENABLED(CONFIG_DEBUG_VM)) { + do { + swp_tb = __swap_table_get(ci, ci_off); + VM_WARN_ON_ONCE(!swp_tb_is_null(swp_tb)); + } while (++ci_off < ci_end); + } +} + static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci, unsigned int start, unsigned char usage, unsigned int order) @@ -761,6 +883,7 @@ static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster ci->order = order; memset(si->swap_map + start, usage, nr_pages); + swap_cluster_assert_table_empty(ci, start, nr_pages); swap_range_alloc(si, nr_pages); ci->count += nr_pages; @@ -815,7 +938,7 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, } out: relocate_cluster(si, ci); - unlock_cluster(ci); + swap_cluster_unlock(ci); if (si->flags & SWP_SOLIDSTATE) { this_cpu_write(percpu_swap_cluster.offset[order], next); this_cpu_write(percpu_swap_cluster.si[order], si); @@ -825,6 +948,29 @@ out: return found; } +static unsigned int alloc_swap_scan_list(struct swap_info_struct *si, + struct list_head *list, + unsigned int order, + unsigned char usage, + bool scan_all) +{ + unsigned int found = SWAP_ENTRY_INVALID; + + do { + struct swap_cluster_info *ci = isolate_lock_cluster(si, list, order); + unsigned long offset; + + if (!ci) + break; + offset = cluster_offset(si, ci); + found = alloc_swap_scan_cluster(si, ci, offset, order, usage); + if (found) + break; + } while (scan_all); + + return found; +} + static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) { long to_scan = 1; @@ -836,7 +982,7 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) if (force) to_scan = swap_usage_in_pages(si) / SWAPFILE_CLUSTER; - while ((ci = isolate_lock_cluster(si, &si->full_clusters))) { + while ((ci = isolate_lock_cluster(si, &si->full_clusters, 0))) { offset = cluster_offset(si, ci); end = min(si->max, offset + SWAPFILE_CLUSTER); to_scan--; @@ -859,7 +1005,7 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) if (ci->flags == CLUSTER_FLAG_NONE) relocate_cluster(si, ci); - unlock_cluster(ci); + swap_cluster_unlock(ci); if (to_scan <= 0) break; } @@ -898,7 +1044,7 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o if (offset == SWAP_ENTRY_INVALID) goto new_cluster; - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); /* Cluster could have been used by another order */ if (cluster_is_usable(ci, order)) { if (cluster_is_empty(ci)) @@ -906,53 +1052,53 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o found = alloc_swap_scan_cluster(si, ci, offset, order, usage); } else { - unlock_cluster(ci); + swap_cluster_unlock(ci); } if (found) goto done; } new_cluster: - ci = isolate_lock_cluster(si, &si->free_clusters); - if (ci) { - found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), - order, usage); + /* + * If the device need discard, prefer new cluster over nonfull + * to spread out the writes. + */ + if (si->flags & SWP_PAGE_DISCARD) { + found = alloc_swap_scan_list(si, &si->free_clusters, order, usage, + false); + if (found) + goto done; + } + + if (order < PMD_ORDER) { + found = alloc_swap_scan_list(si, &si->nonfull_clusters[order], + order, usage, true); + if (found) + goto done; + } + + if (!(si->flags & SWP_PAGE_DISCARD)) { + found = alloc_swap_scan_list(si, &si->free_clusters, order, usage, + false); if (found) goto done; } - /* Try reclaim from full clusters if free clusters list is drained */ + /* Try reclaim full clusters if free and nonfull lists are drained */ if (vm_swap_full()) swap_reclaim_full_clusters(si, false); if (order < PMD_ORDER) { - unsigned int frags = 0, frags_existing; - - while ((ci = isolate_lock_cluster(si, &si->nonfull_clusters[order]))) { - found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), - order, usage); - if (found) - goto done; - /* Clusters failed to allocate are moved to frag_clusters */ - frags++; - } - - frags_existing = atomic_long_read(&si->frag_cluster_nr[order]); - while (frags < frags_existing && - (ci = isolate_lock_cluster(si, &si->frag_clusters[order]))) { - atomic_long_dec(&si->frag_cluster_nr[order]); - /* - * Rotate the frag list to iterate, they were all - * failing high order allocation or moved here due to - * per-CPU usage, but they could contain newly released - * reclaimable (eg. lazy-freed swap cache) slots. - */ - found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), - order, usage); - if (found) - goto done; - frags++; - } + /* + * Scan only one fragment cluster is good enough. Order 0 + * allocation will surely success, and large allocation + * failure is not critical. Scanning one cluster still + * keeps the list rotated and reclaimed (for HAS_CACHE). + */ + found = alloc_swap_scan_list(si, &si->frag_clusters[order], order, + usage, false); + if (found) + goto done; } /* @@ -971,24 +1117,20 @@ new_cluster: * Clusters here have at least one usable slots and can't fail order 0 * allocation, but reclaim may drop si->lock and race with another user. */ - while ((ci = isolate_lock_cluster(si, &si->frag_clusters[o]))) { - atomic_long_dec(&si->frag_cluster_nr[o]); - found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), - 0, usage); - if (found) - goto done; - } + found = alloc_swap_scan_list(si, &si->frag_clusters[o], + 0, usage, true); + if (found) + goto done; - while ((ci = isolate_lock_cluster(si, &si->nonfull_clusters[o]))) { - found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), - 0, usage); - if (found) - goto done; - } + found = alloc_swap_scan_list(si, &si->nonfull_clusters[o], + 0, usage, true); + if (found) + goto done; } done: if (!(si->flags & SWP_SOLIDSTATE)) spin_unlock(&si->global_cluster_lock); + return found; } @@ -1145,7 +1287,7 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, swap_slot_free_notify(si->bdev, offset); offset++; } - clear_shadow_from_swap_cache(si->type, begin, end); + __swap_cache_clear_shadow(swp_entry(si->type, begin), nr_entries); /* * Make sure that try_to_unuse() observes si->inuse_pages reaching 0 @@ -1192,7 +1334,7 @@ static bool swap_alloc_fast(swp_entry_t *entry, if (!si || !offset || !get_swap_device_info(si)) return false; - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); if (cluster_is_usable(ci, order)) { if (cluster_is_empty(ci)) offset = cluster_offset(si, ci); @@ -1200,7 +1342,7 @@ static bool swap_alloc_fast(swp_entry_t *entry, if (found) *entry = swp_entry(si->type, found); } else { - unlock_cluster(ci); + swap_cluster_unlock(ci); } put_swap_device(si); @@ -1302,16 +1444,7 @@ int folio_alloc_swap(struct folio *folio, gfp_t gfp) if (!entry.val) return -ENOMEM; - /* - * XArray node allocations from PF_MEMALLOC contexts could - * completely exhaust the page allocator. __GFP_NOMEMALLOC - * stops emergency reserves from being allocated. - * - * TODO: this could cause a theoretical memory reclaim - * deadlock in the swap out path. - */ - if (add_to_swap_cache(folio, entry, gfp | __GFP_NOMEMALLOC, NULL)) - goto out_free; + swap_cache_add_folio(folio, entry, NULL); return 0; @@ -1327,7 +1460,7 @@ static struct swap_info_struct *_swap_info_get(swp_entry_t entry) if (!entry.val) goto out; - si = swp_swap_info(entry); + si = swap_entry_to_info(entry); if (!si) goto bad_nofile; if (data_race(!(si->flags & SWP_USED))) @@ -1442,7 +1575,7 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry) if (!entry.val) goto out; - si = swp_swap_info(entry); + si = swap_entry_to_info(entry); if (!si) goto bad_nofile; if (!get_swap_device_info(si)) @@ -1468,14 +1601,14 @@ static void swap_entries_put_cache(struct swap_info_struct *si, unsigned long offset = swp_offset(entry); struct swap_cluster_info *ci; - ci = lock_cluster(si, offset); - if (swap_only_has_cache(si, offset, nr)) + ci = swap_cluster_lock(si, offset); + if (swap_only_has_cache(si, offset, nr)) { swap_entries_free(si, ci, entry, nr); - else { + } else { for (int i = 0; i < nr; i++, entry.val++) swap_entry_put_locked(si, ci, entry, SWAP_HAS_CACHE); } - unlock_cluster(ci); + swap_cluster_unlock(ci); } static bool swap_entries_put_map(struct swap_info_struct *si, @@ -1493,7 +1626,7 @@ static bool swap_entries_put_map(struct swap_info_struct *si, if (count != 1 && count != SWAP_MAP_SHMEM) goto fallback; - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); if (!swap_is_last_map(si, offset, nr, &has_cache)) { goto locked_fallback; } @@ -1502,21 +1635,20 @@ static bool swap_entries_put_map(struct swap_info_struct *si, else for (i = 0; i < nr; i++) WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE); - unlock_cluster(ci); + swap_cluster_unlock(ci); return has_cache; fallback: - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); locked_fallback: for (i = 0; i < nr; i++, entry.val++) { count = swap_entry_put_locked(si, ci, entry, 1); if (count == SWAP_HAS_CACHE) has_cache = true; } - unlock_cluster(ci); + swap_cluster_unlock(ci); return has_cache; - } /* @@ -1566,7 +1698,7 @@ static void swap_entries_free(struct swap_info_struct *si, unsigned char *map_end = map + nr_pages; /* It should never free entries across different clusters */ - VM_BUG_ON(ci != offset_to_cluster(si, offset + nr_pages - 1)); + VM_BUG_ON(ci != __swap_offset_to_cluster(si, offset + nr_pages - 1)); VM_BUG_ON(cluster_is_empty(ci)); VM_BUG_ON(ci->count < nr_pages); @@ -1578,6 +1710,7 @@ static void swap_entries_free(struct swap_info_struct *si, mem_cgroup_uncharge_swap(entry, nr_pages); swap_range_free(si, offset, nr_pages); + swap_cluster_assert_table_empty(ci, offset, nr_pages); if (!ci->count) free_cluster(si, ci); @@ -1624,7 +1757,7 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) int __swap_count(swp_entry_t entry) { - struct swap_info_struct *si = swp_swap_info(entry); + struct swap_info_struct *si = __swap_entry_to_info(entry); pgoff_t offset = swp_offset(entry); return swap_count(si->swap_map[offset]); @@ -1641,9 +1774,9 @@ bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry) struct swap_cluster_info *ci; int count; - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); count = swap_count(si->swap_map[offset]); - unlock_cluster(ci); + swap_cluster_unlock(ci); return !!count; } @@ -1666,7 +1799,7 @@ int swp_swapcount(swp_entry_t entry) offset = swp_offset(entry); - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); count = swap_count(si->swap_map[offset]); if (!(count & COUNT_CONTINUED)) @@ -1689,7 +1822,7 @@ int swp_swapcount(swp_entry_t entry) n *= (SWAP_CONT_MAX + 1); } while (tmp_count & COUNT_CONTINUED); out: - unlock_cluster(ci); + swap_cluster_unlock(ci); return count; } @@ -1704,7 +1837,7 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, int i; bool ret = false; - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); if (nr_pages == 1) { if (swap_count(map[roffset])) ret = true; @@ -1717,7 +1850,7 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, } } unlock_out: - unlock_cluster(ci); + swap_cluster_unlock(ci); return ret; } @@ -1781,7 +1914,7 @@ bool folio_free_swap(struct folio *folio) if (folio_swapped(folio)) return false; - delete_from_swap_cache(folio); + swap_cache_del_folio(folio); folio_set_dirty(folio); return true; } @@ -1855,7 +1988,7 @@ out: swp_entry_t get_swap_page_of_type(int type) { - struct swap_info_struct *si = swap_type_to_swap_info(type); + struct swap_info_struct *si = swap_type_to_info(type); unsigned long offset; swp_entry_t entry = {0}; @@ -1865,7 +1998,13 @@ swp_entry_t get_swap_page_of_type(int type) /* This is called for allocating swap entry, not cache */ if (get_swap_device_info(si)) { if (si->flags & SWP_WRITEOK) { + /* + * Grab the local lock to be complaint + * with swap table allocation. + */ + local_lock(&percpu_swap_cluster.lock); offset = cluster_alloc_swap_entry(si, 0, 1); + local_unlock(&percpu_swap_cluster.lock); if (offset) { entry = swp_entry(si->type, offset); atomic_long_dec(&nr_swap_pages); @@ -1936,7 +2075,7 @@ int find_first_swap(dev_t *device) */ sector_t swapdev_block(int type, pgoff_t offset) { - struct swap_info_struct *si = swap_type_to_swap_info(type); + struct swap_info_struct *si = swap_type_to_info(type); struct swap_extent *se; if (!si || !(si->flags & SWP_WRITEOK)) @@ -1992,6 +2131,13 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, bool hwpoisoned = false; int ret = 1; + /* + * If the folio is removed from swap cache by others, continue to + * unuse other PTEs. try_to_unuse may try again if we missed this one. + */ + if (!folio_matches_swap_entry(folio, entry)) + return 0; + swapcache = folio; folio = ksm_might_need_to_copy(folio, vma, addr); if (unlikely(!folio)) @@ -2118,7 +2264,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, pte_unmap(pte); pte = NULL; - folio = swap_cache_get_folio(entry, vma, addr); + folio = swap_cache_get_folio(entry); if (!folio) { struct vm_fault vmf = { .vma = vma, @@ -2243,6 +2389,8 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type) VMA_ITERATOR(vmi, mm, 0); mmap_read_lock(mm); + if (check_stable_address_space(mm)) + goto unlock; for_each_vma(vmi, vma) { if (vma->anon_vma && !is_vm_hugetlb_page(vma)) { ret = unuse_vma(vma, type); @@ -2252,6 +2400,7 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type) cond_resched(); } +unlock: mmap_read_unlock(mm); return ret; } @@ -2344,8 +2493,8 @@ retry: (i = find_next_to_unuse(si, i)) != 0) { entry = swp_entry(type, i); - folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry)); - if (IS_ERR(folio)) + folio = swap_cache_get_folio(entry); + if (!folio) continue; /* @@ -2644,9 +2793,30 @@ static void wait_for_allocation(struct swap_info_struct *si) BUG_ON(si->flags & SWP_WRITEOK); for (offset = 0; offset < end; offset += SWAPFILE_CLUSTER) { - ci = lock_cluster(si, offset); - unlock_cluster(ci); + ci = swap_cluster_lock(si, offset); + swap_cluster_unlock(ci); + } +} + +static void free_cluster_info(struct swap_cluster_info *cluster_info, + unsigned long maxpages) +{ + struct swap_cluster_info *ci; + int i, nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); + + if (!cluster_info) + return; + for (i = 0; i < nr_clusters; i++) { + ci = cluster_info + i; + /* Cluster with bad marks count will have a remaining table */ + spin_lock(&ci->lock); + if (rcu_dereference_protected(ci->table, true)) { + ci->count = 0; + swap_cluster_free_table(ci); + } + spin_unlock(&ci->lock); } + kvfree(cluster_info); } /* @@ -2681,6 +2851,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) struct address_space *mapping; struct inode *inode; struct filename *pathname; + unsigned int maxpages; int err, found = 0; if (!capable(CAP_SYS_ADMIN)) @@ -2783,12 +2954,13 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) swap_file = p->swap_file; p->swap_file = NULL; - p->max = 0; swap_map = p->swap_map; p->swap_map = NULL; zeromap = p->zeromap; p->zeromap = NULL; + maxpages = p->max; cluster_info = p->cluster_info; + p->max = 0; p->cluster_info = NULL; spin_unlock(&p->lock); spin_unlock(&swap_lock); @@ -2799,10 +2971,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) p->global_cluster = NULL; vfree(swap_map); kvfree(zeromap); - kvfree(cluster_info); + free_cluster_info(cluster_info, maxpages); /* Destroy swap account information */ swap_cgroup_swapoff(p->type); - exit_swap_address_space(p->type); inode = mapping->host; @@ -2858,7 +3029,7 @@ static void *swap_start(struct seq_file *swap, loff_t *pos) if (!l) return SEQ_START_TOKEN; - for (type = 0; (si = swap_type_to_swap_info(type)); type++) { + for (type = 0; (si = swap_type_to_info(type)); type++) { if (!(si->flags & SWP_USED) || !si->swap_map) continue; if (!--l) @@ -2879,7 +3050,7 @@ static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) type = si->type + 1; ++(*pos); - for (; (si = swap_type_to_swap_info(type)); type++) { + for (; (si = swap_type_to_info(type)); type++) { if (!(si->flags & SWP_USED) || !si->swap_map) continue; return si; @@ -3166,21 +3337,14 @@ static int setup_swap_map(struct swap_info_struct *si, return 0; } -#define SWAP_CLUSTER_INFO_COLS \ - DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info)) -#define SWAP_CLUSTER_SPACE_COLS \ - DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER) -#define SWAP_CLUSTER_COLS \ - max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS) - static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, union swap_header *swap_header, unsigned long maxpages) { unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); struct swap_cluster_info *cluster_info; - unsigned long i, j, idx; int err = -ENOMEM; + unsigned long i; cluster_info = kvcalloc(nr_clusters, sizeof(*cluster_info), GFP_KERNEL); if (!cluster_info) @@ -3206,16 +3370,23 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, * See setup_swap_map(): header page, bad pages, * and the EOF part of the last cluster. */ - inc_cluster_info_page(si, cluster_info, 0); + err = inc_cluster_info_page(si, cluster_info, 0); + if (err) + goto err; for (i = 0; i < swap_header->info.nr_badpages; i++) { unsigned int page_nr = swap_header->info.badpages[i]; if (page_nr >= maxpages) continue; - inc_cluster_info_page(si, cluster_info, page_nr); + err = inc_cluster_info_page(si, cluster_info, page_nr); + if (err) + goto err; + } + for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) { + err = inc_cluster_info_page(si, cluster_info, i); + if (err) + goto err; } - for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) - inc_cluster_info_page(si, cluster_info, i); INIT_LIST_HEAD(&si->free_clusters); INIT_LIST_HEAD(&si->full_clusters); @@ -3224,34 +3395,23 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, for (i = 0; i < SWAP_NR_ORDERS; i++) { INIT_LIST_HEAD(&si->nonfull_clusters[i]); INIT_LIST_HEAD(&si->frag_clusters[i]); - atomic_long_set(&si->frag_cluster_nr[i], 0); } - /* - * Reduce false cache line sharing between cluster_info and - * sharing same address space. - */ - for (j = 0; j < SWAP_CLUSTER_COLS; j++) { - for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) { - struct swap_cluster_info *ci; - idx = i * SWAP_CLUSTER_COLS + j; - ci = cluster_info + idx; - if (idx >= nr_clusters) - continue; - if (ci->count) { - ci->flags = CLUSTER_FLAG_NONFULL; - list_add_tail(&ci->list, &si->nonfull_clusters[0]); - continue; - } + for (i = 0; i < nr_clusters; i++) { + struct swap_cluster_info *ci = &cluster_info[i]; + + if (ci->count) { + ci->flags = CLUSTER_FLAG_NONFULL; + list_add_tail(&ci->list, &si->nonfull_clusters[0]); + } else { ci->flags = CLUSTER_FLAG_FREE; list_add_tail(&ci->list, &si->free_clusters); } } return cluster_info; - err_free: - kvfree(cluster_info); + free_cluster_info(cluster_info, maxpages); err: return ERR_PTR(err); } @@ -3445,13 +3605,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) } } - error = init_swap_address_space(si->type, maxpages); - if (error) - goto bad_swap_unlock_inode; - error = zswap_swapon(si->type, maxpages); if (error) - goto free_swap_address_space; + goto bad_swap_unlock_inode; /* * Flush any pending IO and dirty mappings before we start using this @@ -3486,8 +3642,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) goto out; free_swap_zswap: zswap_swapoff(si->type); -free_swap_address_space: - exit_swap_address_space(si->type); bad_swap_unlock_inode: inode_unlock(inode); bad_swap: @@ -3502,7 +3656,8 @@ bad_swap: spin_unlock(&swap_lock); vfree(swap_map); kvfree(zeromap); - kvfree(cluster_info); + if (cluster_info) + free_cluster_info(cluster_info, maxpages); if (inced_nr_rotate_swap) atomic_dec(&nr_rotate_swap); if (swap_file) @@ -3553,7 +3708,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) unsigned char has_cache; int err, i; - si = swp_swap_info(entry); + si = swap_entry_to_info(entry); if (WARN_ON_ONCE(!si)) { pr_err("%s%08lx\n", Bad_file, entry.val); return -EINVAL; @@ -3562,7 +3717,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) offset = swp_offset(entry); VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); VM_WARN_ON(usage == 1 && nr > 1); - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); err = 0; for (i = 0; i < nr; i++) { @@ -3617,7 +3772,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) } unlock_out: - unlock_cluster(ci); + swap_cluster_unlock(ci); return err; } @@ -3668,11 +3823,6 @@ void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr) swap_entries_put_cache(si, entry, nr); } -struct swap_info_struct *swp_swap_info(swp_entry_t entry) -{ - return swap_type_to_swap_info(swp_type(entry)); -} - /* * add_swap_count_continuation - called when a swap count is duplicated * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's @@ -3716,7 +3866,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) offset = swp_offset(entry); - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); count = swap_count(si->swap_map[offset]); @@ -3776,7 +3926,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) out_unlock_cont: spin_unlock(&si->cont_lock); out: - unlock_cluster(ci); + swap_cluster_unlock(ci); put_swap_device(si); outer: if (page) @@ -3950,6 +4100,16 @@ static int __init swapfile_init(void) swapfile_maximum_size = arch_max_swapfile_size(); + /* + * Once a cluster is freed, it's swap table content is read + * only, and all swap cache readers (swap_cache_*) verifies + * the content before use. So it's safe to use RCU slab here. + */ + if (!SWP_TABLE_USE_PAGE) + swap_table_cachep = kmem_cache_create("swap_table", + sizeof(struct swap_table), + 0, SLAB_PANIC | SLAB_TYPESAFE_BY_RCU, NULL); + #ifdef CONFIG_MIGRATION if (swapfile_maximum_size >= (1UL << SWP_MIG_TOTAL_BITS)) swap_migration_ad_supported = true; |