Diffstat (limited to 'mm/zswap.c')
-rw-r--r-- | mm/zswap.c | 1109
1 file changed, 576 insertions, 533 deletions
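
Editor's note on the central data-structure change: the hunks below replace zswap's per-swapfile red-black trees (each guarded by a tree spinlock) with xarrays indexed by swap offset, so lookups become RCU reads and insert/erase rely on the xarray's internal locking. The following is a minimal sketch of the resulting access pattern, not code from the patch; sketch_tree, sketch_lookup and sketch_store are illustrative names standing in for swap_zswap_tree() and the store/load paths shown in the diff.

#include <linux/xarray.h>
#include <linux/gfp.h>

static DEFINE_XARRAY(sketch_tree);	/* stands in for one swap_zswap_tree(swp) slot */

/* Lockless RCU lookup; the old rbtree required taking the tree spinlock here. */
static struct zswap_entry *sketch_lookup(pgoff_t offset)
{
	return xa_load(&sketch_tree, offset);
}

/* Store a new entry; a stale entry left at the same offset is returned and freed. */
static int sketch_store(pgoff_t offset, struct zswap_entry *entry)
{
	struct zswap_entry *old;

	old = xa_store(&sketch_tree, offset, entry, GFP_KERNEL);
	if (xa_is_err(old))
		return xa_err(old);	/* typically -ENOMEM */
	if (old)
		zswap_entry_free(old);	/* folio was redirtied and swapped out again */
	return 0;
}
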
diff --git a/mm/zswap.c b/mm/zswap.c index 6f8850c44b61..455e9425c5f5 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -20,7 +20,6 @@ #include <linux/spinlock.h> #include <linux/types.h> #include <linux/atomic.h> -#include <linux/rbtree.h> #include <linux/swap.h> #include <linux/crypto.h> #include <linux/scatterlist.h> @@ -43,12 +42,8 @@ /********************************* * statistics **********************************/ -/* Total bytes used by the compressed storage */ -u64 zswap_pool_total_size; /* The number of compressed pages currently stored in zswap */ -atomic_t zswap_stored_pages = ATOMIC_INIT(0); -/* The number of same-value filled pages currently stored in zswap */ -static atomic_t zswap_same_filled_pages = ATOMIC_INIT(0); +atomic_long_t zswap_stored_pages = ATOMIC_LONG_INIT(0); /* * The statistics below are not protected from concurrent access for @@ -67,6 +62,8 @@ static u64 zswap_reject_reclaim_fail; static u64 zswap_reject_compress_fail; /* Compressed page was too big for the allocator to (optimally) store */ static u64 zswap_reject_compress_poor; +/* Load or writeback failed due to decompression failure */ +static u64 zswap_decompress_fail; /* Store failed because underlying allocator could not get memory */ static u64 zswap_reject_alloc_fail; /* Store failed because the entry metadata could not be allocated (rare) */ @@ -86,6 +83,7 @@ static bool zswap_pool_reached_full; static int zswap_setup(void); /* Enable/disable zswap */ +static DEFINE_STATIC_KEY_MAYBE(CONFIG_ZSWAP_DEFAULT_ON, zswap_ever_enabled); static bool zswap_enabled = IS_ENABLED(CONFIG_ZSWAP_DEFAULT_ON); static int zswap_enabled_param_set(const char *, const struct kernel_param *); @@ -126,32 +124,21 @@ static unsigned int zswap_accept_thr_percent = 90; /* of max pool size */ module_param_named(accept_threshold_percent, zswap_accept_thr_percent, uint, 0644); -/* - * Enable/disable handling same-value filled pages (enabled by default). - * If disabled every page is considered non-same-value filled. - */ -static bool zswap_same_filled_pages_enabled = true; -module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled, - bool, 0644); - -/* Enable/disable handling non-same-value filled pages (enabled by default) */ -static bool zswap_non_same_filled_pages_enabled = true; -module_param_named(non_same_filled_pages_enabled, zswap_non_same_filled_pages_enabled, - bool, 0644); - -/* Number of zpools in zswap_pool (empirically determined for scalability) */ -#define ZSWAP_NR_ZPOOLS 32 - /* Enable/disable memory pressure-based shrinker. */ static bool zswap_shrinker_enabled = IS_ENABLED( CONFIG_ZSWAP_SHRINKER_DEFAULT_ON); module_param_named(shrinker_enabled, zswap_shrinker_enabled, bool, 0644); -bool is_zswap_enabled(void) +bool zswap_is_enabled(void) { return zswap_enabled; } +bool zswap_never_enabled(void) +{ + return !static_branch_maybe(CONFIG_ZSWAP_DEFAULT_ON, &zswap_ever_enabled); +} + /********************************* * data structures **********************************/ @@ -172,7 +159,7 @@ struct crypto_acomp_ctx { * needs to be verified that it's still valid in the tree. */ struct zswap_pool { - struct zpool *zpools[ZSWAP_NR_ZPOOLS]; + struct zpool *zpool; struct crypto_acomp_ctx __percpu *acomp_ctx; struct percpu_ref ref; struct list_head list; @@ -183,8 +170,6 @@ struct zswap_pool { /* Global LRU lists shared by all zswap pools. */ static struct list_lru zswap_list_lru; -/* counter of pages stored in all zswap pools. 
*/ -static atomic_t zswap_nr_stored = ATOMIC_INIT(0); /* The lock protects zswap_next_shrink updates. */ static DEFINE_SPINLOCK(zswap_shrink_lock); @@ -198,36 +183,29 @@ static struct shrinker *zswap_shrinker; * This structure contains the metadata for tracking a single compressed * page within zswap. * - * rbnode - links the entry into red-black tree for the appropriate swap type * swpentry - associated swap entry, the offset indexes into the red-black tree * length - the length in bytes of the compressed page data. Needed during - * decompression. For a same value filled page length is 0, and both - * pool and lru are invalid and must be ignored. + * decompression. + * referenced - true if the entry recently entered the zswap pool. Unset by the + * writeback logic. The entry is only reclaimed by the writeback + * logic if referenced is unset. See comments in the shrinker + * section for context. * pool - the zswap_pool the entry's data is in * handle - zpool allocation handle that stores the compressed page data - * value - value of the same-value filled pages which have same content * objcg - the obj_cgroup that the compressed memory is charged to * lru - handle to the pool's lru used to evict pages. */ struct zswap_entry { - struct rb_node rbnode; swp_entry_t swpentry; unsigned int length; + bool referenced; struct zswap_pool *pool; - union { - unsigned long handle; - unsigned long value; - }; + unsigned long handle; struct obj_cgroup *objcg; struct list_head lru; }; -struct zswap_tree { - struct rb_root rbroot; - spinlock_t lock; -}; - -static struct zswap_tree *zswap_trees[MAX_SWAPFILES]; +static struct xarray *zswap_trees[MAX_SWAPFILES]; static unsigned int nr_zswap_trees[MAX_SWAPFILES]; /* RCU-protected iteration */ @@ -255,7 +233,7 @@ static bool zswap_has_pool; * helpers and fwd declarations **********************************/ -static inline struct zswap_tree *swap_zswap_tree(swp_entry_t swp) +static inline struct xarray *swap_zswap_tree(swp_entry_t swp) { return &zswap_trees[swp_type(swp)][swp_offset(swp) >> SWAP_ADDRESS_SPACE_SHIFT]; @@ -263,46 +241,7 @@ static inline struct zswap_tree *swap_zswap_tree(swp_entry_t swp) #define zswap_pool_debug(msg, p) \ pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \ - zpool_get_type((p)->zpools[0])) - -static bool zswap_is_full(void) -{ - return totalram_pages() * zswap_max_pool_percent / 100 < - DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); -} - -static bool zswap_can_accept(void) -{ - return totalram_pages() * zswap_accept_thr_percent / 100 * - zswap_max_pool_percent / 100 > - DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); -} - -static u64 get_zswap_pool_size(struct zswap_pool *pool) -{ - u64 pool_size = 0; - int i; - - for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) - pool_size += zpool_get_total_size(pool->zpools[i]); - - return pool_size; -} - -static void zswap_update_total_size(void) -{ - struct zswap_pool *pool; - u64 total = 0; - - rcu_read_lock(); - - list_for_each_entry_rcu(pool, &zswap_pools, list) - total += get_zswap_pool_size(pool); - - rcu_read_unlock(); - - zswap_pool_total_size = total; -} + zpool_get_type((p)->zpool)) /********************************* * pool functions @@ -311,11 +250,10 @@ static void __zswap_pool_empty(struct percpu_ref *ref); static struct zswap_pool *zswap_pool_create(char *type, char *compressor) { - int i; struct zswap_pool *pool; char name[38]; /* 'zswap' + 32 char (max) num + \0 */ gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; - int ret; + int ret, cpu; if (!zswap_has_pool) { /* if 
either are unset, pool initialization failed, and we @@ -332,18 +270,14 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) if (!pool) return NULL; - for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) { - /* unique name for each pool specifically required by zsmalloc */ - snprintf(name, 38, "zswap%x", - atomic_inc_return(&zswap_pools_count)); - - pool->zpools[i] = zpool_create_pool(type, name, gfp); - if (!pool->zpools[i]) { - pr_err("%s zpool not available\n", type); - goto error; - } + /* unique name for each pool specifically required by zsmalloc */ + snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count)); + pool->zpool = zpool_create_pool(type, name, gfp); + if (!pool->zpool) { + pr_err("%s zpool not available\n", type); + goto error; } - pr_debug("using %s zpool\n", zpool_get_type(pool->zpools[0])); + pr_debug("using %s zpool\n", zpool_get_type(pool->zpool)); strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name)); @@ -353,6 +287,9 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) goto error; } + for_each_possible_cpu(cpu) + mutex_init(&per_cpu_ptr(pool->acomp_ctx, cpu)->mutex); + ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); if (ret) @@ -376,8 +313,8 @@ ref_fail: error: if (pool->acomp_ctx) free_percpu(pool->acomp_ctx); - while (i--) - zpool_destroy_pool(pool->zpools[i]); + if (pool->zpool) + zpool_destroy_pool(pool->zpool); kfree(pool); return NULL; } @@ -426,15 +363,12 @@ static struct zswap_pool *__zswap_pool_create_fallback(void) static void zswap_pool_destroy(struct zswap_pool *pool) { - int i; - zswap_pool_debug("destroying", pool); cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); free_percpu(pool->acomp_ctx); - for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) - zpool_destroy_pool(pool->zpools[i]); + zpool_destroy_pool(pool->zpool); kfree(pool); } @@ -473,7 +407,7 @@ static void __zswap_pool_empty(struct percpu_ref *ref) spin_unlock_bh(&zswap_pools_lock); } -static int __must_check zswap_pool_get(struct zswap_pool *pool) +static int __must_check zswap_pool_tryget(struct zswap_pool *pool) { if (!pool) return 0; @@ -481,6 +415,12 @@ static int __must_check zswap_pool_get(struct zswap_pool *pool) return percpu_ref_tryget(&pool->ref); } +/* The caller must already have a reference. 
*/ +static void zswap_pool_get(struct zswap_pool *pool) +{ + percpu_ref_get(&pool->ref); +} + static void zswap_pool_put(struct zswap_pool *pool) { percpu_ref_put(&pool->ref); @@ -511,7 +451,7 @@ static struct zswap_pool *zswap_pool_current_get(void) rcu_read_lock(); pool = __zswap_pool_current(); - if (!zswap_pool_get(pool)) + if (!zswap_pool_tryget(pool)) pool = NULL; rcu_read_unlock(); @@ -529,11 +469,10 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) list_for_each_entry_rcu(pool, &zswap_pools, list) { if (strcmp(pool->tfm_name, compressor)) continue; - /* all zpools share the same type */ - if (strcmp(zpool_get_type(pool->zpools[0]), type)) + if (strcmp(zpool_get_type(pool->zpool), type)) continue; /* if we can't get it, it's about to be destroyed */ - if (!zswap_pool_get(pool)) + if (!zswap_pool_tryget(pool)) continue; return pool; } @@ -541,6 +480,44 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) return NULL; } +static unsigned long zswap_max_pages(void) +{ + return totalram_pages() * zswap_max_pool_percent / 100; +} + +static unsigned long zswap_accept_thr_pages(void) +{ + return zswap_max_pages() * zswap_accept_thr_percent / 100; +} + +unsigned long zswap_total_pages(void) +{ + struct zswap_pool *pool; + unsigned long total = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(pool, &zswap_pools, list) + total += zpool_get_total_pages(pool->zpool); + rcu_read_unlock(); + + return total; +} + +static bool zswap_check_limits(void) +{ + unsigned long cur_pages = zswap_total_pages(); + unsigned long max_pages = zswap_max_pages(); + + if (cur_pages >= max_pages) { + zswap_pool_limit_hit++; + zswap_pool_reached_full = true; + } else if (zswap_pool_reached_full && + cur_pages <= zswap_accept_thr_pages()) { + zswap_pool_reached_full = false; + } + return zswap_pool_reached_full; +} + /********************************* * param callbacks **********************************/ @@ -732,20 +709,16 @@ static inline int entry_to_nid(struct zswap_entry *entry) static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry) { - atomic_long_t *nr_zswap_protected; - unsigned long lru_size, old, new; int nid = entry_to_nid(entry); struct mem_cgroup *memcg; - struct lruvec *lruvec; /* * Note that it is safe to use rcu_read_lock() here, even in the face of - * concurrent memcg offlining. Thanks to the memcg->kmemcg_id indirection - * used in list_lru lookup, only two scenarios are possible: + * concurrent memcg offlining: * - * 1. list_lru_add() is called before memcg->kmemcg_id is updated. The + * 1. list_lru_add() is called before list_lru_one is dead. The * new entry will be reparented to memcg's parent's list_lru. - * 2. list_lru_add() is called after memcg->kmemcg_id is updated. The + * 2. list_lru_add() is called after list_lru_one is dead. The * new entry will be added directly to memcg's parent's list_lru. * * Similar reasoning holds for list_lru_del(). @@ -754,19 +727,6 @@ static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry) memcg = mem_cgroup_from_entry(entry); /* will always succeed */ list_lru_add(list_lru, &entry->lru, nid, memcg); - - /* Update the protection area */ - lru_size = list_lru_count_one(list_lru, nid, memcg); - lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); - nr_zswap_protected = &lruvec->zswap_lruvec_state.nr_zswap_protected; - old = atomic_long_inc_return(nr_zswap_protected); - /* - * Decay to avoid overflow and adapt to changing workloads. 
- * This is based on LRU reclaim cost decaying heuristics. - */ - do { - new = old > lru_size / 4 ? old / 2 : old; - } while (!atomic_long_try_cmpxchg(nr_zswap_protected, &old, new)); rcu_read_unlock(); } @@ -784,7 +744,7 @@ static void zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry) void zswap_lruvec_state_init(struct lruvec *lruvec) { - atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0); + atomic_long_set(&lruvec->zswap_lruvec_state.nr_disk_swapins, 0); } void zswap_folio_swapin(struct folio *folio) @@ -793,74 +753,30 @@ void zswap_folio_swapin(struct folio *folio) if (folio) { lruvec = folio_lruvec(folio); - atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); + atomic_long_inc(&lruvec->zswap_lruvec_state.nr_disk_swapins); } } +/* + * This function should be called when a memcg is being offlined. + * + * Since the global shrinker shrink_worker() may hold a reference + * of the memcg, we must check and release the reference in + * zswap_next_shrink. + * + * shrink_worker() must handle the case where this function releases + * the reference of memcg being shrunk. + */ void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) { /* lock out zswap shrinker walking memcg tree */ spin_lock(&zswap_shrink_lock); - if (zswap_next_shrink == memcg) - zswap_next_shrink = mem_cgroup_iter(NULL, zswap_next_shrink, NULL); - spin_unlock(&zswap_shrink_lock); -} - -/********************************* -* rbtree functions -**********************************/ -static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset) -{ - struct rb_node *node = root->rb_node; - struct zswap_entry *entry; - pgoff_t entry_offset; - - while (node) { - entry = rb_entry(node, struct zswap_entry, rbnode); - entry_offset = swp_offset(entry->swpentry); - if (entry_offset > offset) - node = node->rb_left; - else if (entry_offset < offset) - node = node->rb_right; - else - return entry; + if (zswap_next_shrink == memcg) { + do { + zswap_next_shrink = mem_cgroup_iter(NULL, zswap_next_shrink, NULL); + } while (zswap_next_shrink && !mem_cgroup_online(zswap_next_shrink)); } - return NULL; -} - -/* - * In the case that a entry with the same offset is found, a pointer to - * the existing entry is stored in dupentry and the function returns -EEXIST - */ -static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry, - struct zswap_entry **dupentry) -{ - struct rb_node **link = &root->rb_node, *parent = NULL; - struct zswap_entry *myentry; - pgoff_t myentry_offset, entry_offset = swp_offset(entry->swpentry); - - while (*link) { - parent = *link; - myentry = rb_entry(parent, struct zswap_entry, rbnode); - myentry_offset = swp_offset(myentry->swpentry); - if (myentry_offset > entry_offset) - link = &(*link)->rb_left; - else if (myentry_offset < entry_offset) - link = &(*link)->rb_right; - else { - *dupentry = myentry; - return -EEXIST; - } - } - rb_link_node(&entry->rbnode, parent, link); - rb_insert_color(&entry->rbnode, root); - return 0; -} - -static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) -{ - rb_erase(&entry->rbnode, root); - RB_CLEAR_NODE(&entry->rbnode); + spin_unlock(&zswap_shrink_lock); } /********************************* @@ -874,7 +790,6 @@ static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid) entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid); if (!entry) return NULL; - RB_CLEAR_NODE(&entry->rbnode); return entry; } @@ -883,48 +798,21 @@ static void zswap_entry_cache_free(struct zswap_entry *entry) 
kmem_cache_free(zswap_entry_cache, entry); } -static struct zpool *zswap_find_zpool(struct zswap_entry *entry) -{ - int i = 0; - - if (ZSWAP_NR_ZPOOLS > 1) - i = hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS)); - - return entry->pool->zpools[i]; -} - /* * Carries out the common pattern of freeing and entry's zpool allocation, * freeing the entry itself, and decrementing the number of stored pages. */ static void zswap_entry_free(struct zswap_entry *entry) { - if (!entry->length) - atomic_dec(&zswap_same_filled_pages); - else { - zswap_lru_del(&zswap_list_lru, entry); - zpool_free(zswap_find_zpool(entry), entry->handle); - atomic_dec(&zswap_nr_stored); - zswap_pool_put(entry->pool); - } + zswap_lru_del(&zswap_list_lru, entry); + zpool_free(entry->pool->zpool, entry->handle); + zswap_pool_put(entry->pool); if (entry->objcg) { obj_cgroup_uncharge_zswap(entry->objcg, entry->length); obj_cgroup_put(entry->objcg); } zswap_entry_cache_free(entry); - atomic_dec(&zswap_stored_pages); - zswap_update_total_size(); -} - -/* - * The caller hold the tree lock and search the entry from the tree, - * so it must be on the tree, remove it from the tree and free it. - */ -static void zswap_invalidate_entry(struct zswap_tree *tree, - struct zswap_entry *entry) -{ - zswap_rb_erase(&tree->rbroot, entry); - zswap_entry_free(entry); + atomic_long_dec(&zswap_stored_pages); } /********************************* @@ -934,36 +822,41 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) { struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); - struct crypto_acomp *acomp; - struct acomp_req *req; + struct crypto_acomp *acomp = NULL; + struct acomp_req *req = NULL; + u8 *buffer = NULL; int ret; - mutex_init(&acomp_ctx->mutex); - - acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); - if (!acomp_ctx->buffer) - return -ENOMEM; + buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); + if (!buffer) { + ret = -ENOMEM; + goto fail; + } acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu)); if (IS_ERR(acomp)) { pr_err("could not alloc crypto acomp %s : %ld\n", pool->tfm_name, PTR_ERR(acomp)); ret = PTR_ERR(acomp); - goto acomp_fail; + goto fail; } - acomp_ctx->acomp = acomp; - acomp_ctx->is_sleepable = acomp_is_async(acomp); - req = acomp_request_alloc(acomp_ctx->acomp); + req = acomp_request_alloc(acomp); if (!req) { pr_err("could not alloc crypto acomp_request %s\n", pool->tfm_name); ret = -ENOMEM; - goto req_fail; + goto fail; } - acomp_ctx->req = req; + /* + * Only hold the mutex after completing allocations, otherwise we may + * recurse into zswap through reclaim and attempt to hold the mutex + * again resulting in a deadlock. 
+ */ + mutex_lock(&acomp_ctx->mutex); crypto_init_wait(&acomp_ctx->wait); + /* * if the backend of acomp is async zip, crypto_req_done() will wakeup * crypto_wait_req(); if the backend of acomp is scomp, the callback @@ -972,12 +865,17 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, crypto_req_done, &acomp_ctx->wait); + acomp_ctx->buffer = buffer; + acomp_ctx->acomp = acomp; + acomp_ctx->is_sleepable = acomp_is_async(acomp); + acomp_ctx->req = req; + mutex_unlock(&acomp_ctx->mutex); return 0; -req_fail: - crypto_free_acomp(acomp_ctx->acomp); -acomp_fail: - kfree(acomp_ctx->buffer); +fail: + if (acomp) + crypto_free_acomp(acomp); + kfree(buffer); return ret; } @@ -985,19 +883,62 @@ static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) { struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); + struct acomp_req *req; + struct crypto_acomp *acomp; + u8 *buffer; - if (!IS_ERR_OR_NULL(acomp_ctx)) { - if (!IS_ERR_OR_NULL(acomp_ctx->req)) - acomp_request_free(acomp_ctx->req); - if (!IS_ERR_OR_NULL(acomp_ctx->acomp)) - crypto_free_acomp(acomp_ctx->acomp); - kfree(acomp_ctx->buffer); - } + if (IS_ERR_OR_NULL(acomp_ctx)) + return 0; + + mutex_lock(&acomp_ctx->mutex); + req = acomp_ctx->req; + acomp = acomp_ctx->acomp; + buffer = acomp_ctx->buffer; + acomp_ctx->req = NULL; + acomp_ctx->acomp = NULL; + acomp_ctx->buffer = NULL; + mutex_unlock(&acomp_ctx->mutex); + + /* + * Do the actual freeing after releasing the mutex to avoid subtle + * locking dependencies causing deadlocks. + */ + if (!IS_ERR_OR_NULL(req)) + acomp_request_free(req); + if (!IS_ERR_OR_NULL(acomp)) + crypto_free_acomp(acomp); + kfree(buffer); return 0; } -static bool zswap_compress(struct folio *folio, struct zswap_entry *entry) +static struct crypto_acomp_ctx *acomp_ctx_get_cpu_lock(struct zswap_pool *pool) +{ + struct crypto_acomp_ctx *acomp_ctx; + + for (;;) { + acomp_ctx = raw_cpu_ptr(pool->acomp_ctx); + mutex_lock(&acomp_ctx->mutex); + if (likely(acomp_ctx->req)) + return acomp_ctx; + /* + * It is possible that we were migrated to a different CPU after + * getting the per-CPU ctx but before the mutex was acquired. If + * the old CPU got offlined, zswap_cpu_comp_dead() could have + * already freed ctx->req (among other things) and set it to + * NULL. Just try again on the new CPU that we ended up on. 
+ */ + mutex_unlock(&acomp_ctx->mutex); + } +} + +static void acomp_ctx_put_unlock(struct crypto_acomp_ctx *acomp_ctx) +{ + mutex_unlock(&acomp_ctx->mutex); +} + +static bool zswap_compress(struct page *page, struct zswap_entry *entry, + struct zswap_pool *pool) { struct crypto_acomp_ctx *acomp_ctx; struct scatterlist input, output; @@ -1005,17 +946,13 @@ static bool zswap_compress(struct folio *folio, struct zswap_entry *entry) unsigned int dlen = PAGE_SIZE; unsigned long handle; struct zpool *zpool; - char *buf; gfp_t gfp; u8 *dst; - acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); - - mutex_lock(&acomp_ctx->mutex); - + acomp_ctx = acomp_ctx_get_cpu_lock(pool); dst = acomp_ctx->buffer; sg_init_table(&input, 1); - sg_set_page(&input, &folio->page, PAGE_SIZE, 0); + sg_set_page(&input, page, PAGE_SIZE, 0); /* * We need PAGE_SIZE * 2 here since there maybe over-compression case, @@ -1042,18 +979,13 @@ static bool zswap_compress(struct folio *folio, struct zswap_entry *entry) if (comp_ret) goto unlock; - zpool = zswap_find_zpool(entry); - gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; - if (zpool_malloc_support_movable(zpool)) - gfp |= __GFP_HIGHMEM | __GFP_MOVABLE; - alloc_ret = zpool_malloc(zpool, dlen, gfp, &handle); + zpool = pool->zpool; + gfp = GFP_NOWAIT | __GFP_NORETRY | __GFP_HIGHMEM | __GFP_MOVABLE; + alloc_ret = zpool_malloc(zpool, dlen, gfp, &handle, page_to_nid(page)); if (alloc_ret) goto unlock; - buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO); - memcpy(buf, dst, dlen); - zpool_unmap_handle(zpool, handle); - + zpool_obj_write(zpool, handle, dst, dlen); entry->handle = handle; entry->length = dlen; @@ -1065,47 +997,53 @@ unlock: else if (alloc_ret) zswap_reject_alloc_fail++; - mutex_unlock(&acomp_ctx->mutex); + acomp_ctx_put_unlock(acomp_ctx); return comp_ret == 0 && alloc_ret == 0; } -static void zswap_decompress(struct zswap_entry *entry, struct page *page) +static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio) { - struct zpool *zpool = zswap_find_zpool(entry); + struct zpool *zpool = entry->pool->zpool; struct scatterlist input, output; struct crypto_acomp_ctx *acomp_ctx; - u8 *src; + int decomp_ret, dlen; + u8 *src, *obj; - acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); - mutex_lock(&acomp_ctx->mutex); + acomp_ctx = acomp_ctx_get_cpu_lock(entry->pool); + obj = zpool_obj_read_begin(zpool, entry->handle, acomp_ctx->buffer); - src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO); /* - * If zpool_map_handle is atomic, we cannot reliably utilize its mapped buffer - * to do crypto_acomp_decompress() which might sleep. In such cases, we must - * resort to copying the buffer to a temporary one. - * Meanwhile, zpool_map_handle() might return a non-linearly mapped buffer, - * such as a kmap address of high memory or even ever a vmap address. - * However, sg_init_one is only equipped to handle linearly mapped low memory. - * In such cases, we also must copy the buffer to a temporary and lowmem one. + * zpool_obj_read_begin() might return a kmap address of highmem when + * acomp_ctx->buffer is not used. However, sg_init_one() does not + * handle highmem addresses, so copy the object to acomp_ctx->buffer. 
*/ - if ((acomp_ctx->is_sleepable && !zpool_can_sleep_mapped(zpool)) || - !virt_addr_valid(src)) { - memcpy(acomp_ctx->buffer, src, entry->length); + if (virt_addr_valid(obj)) { + src = obj; + } else { + WARN_ON_ONCE(obj == acomp_ctx->buffer); + memcpy(acomp_ctx->buffer, obj, entry->length); src = acomp_ctx->buffer; - zpool_unmap_handle(zpool, entry->handle); } sg_init_one(&input, src, entry->length); sg_init_table(&output, 1); - sg_set_page(&output, page, PAGE_SIZE, 0); + sg_set_folio(&output, folio, PAGE_SIZE, 0); acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE); - BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait)); - BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE); - mutex_unlock(&acomp_ctx->mutex); + decomp_ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait); + dlen = acomp_ctx->req->dlen; - if (src != acomp_ctx->buffer) - zpool_unmap_handle(zpool, entry->handle); + zpool_obj_read_end(zpool, entry->handle, obj); + acomp_ctx_put_unlock(acomp_ctx); + + if (!decomp_ret && dlen == PAGE_SIZE) + return true; + + zswap_decompress_fail++; + pr_alert_ratelimited("Decompression error from zswap (%d:%lu %s %u->%d)\n", + swp_type(entry->swpentry), + swp_offset(entry->swpentry), + entry->pool->tfm_name, entry->length, dlen); + return false; } /********************************* @@ -1126,18 +1064,26 @@ static void zswap_decompress(struct zswap_entry *entry, struct page *page) static int zswap_writeback_entry(struct zswap_entry *entry, swp_entry_t swpentry) { - struct zswap_tree *tree; + struct xarray *tree; + pgoff_t offset = swp_offset(swpentry); struct folio *folio; struct mempolicy *mpol; bool folio_was_allocated; + struct swap_info_struct *si; struct writeback_control wbc = { .sync_mode = WB_SYNC_NONE, }; + int ret = 0; /* try to allocate swap cache folio */ + si = get_swap_device(swpentry); + if (!si) + return -EEXIST; + mpol = get_task_policy(current); folio = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol, - NO_INTERLEAVE_INDEX, &folio_was_allocated, true); + NO_INTERLEAVE_INDEX, &folio_was_allocated, true); + put_swap_device(si); if (!folio) return -ENOMEM; @@ -1149,8 +1095,8 @@ static int zswap_writeback_entry(struct zswap_entry *entry, * and freed when invalidated by the concurrent shrinker anyway. */ if (!folio_was_allocated) { - folio_put(folio); - return -EEXIST; + ret = -EEXIST; + goto out; } /* @@ -1163,24 +1109,21 @@ static int zswap_writeback_entry(struct zswap_entry *entry, * be dereferenced. */ tree = swap_zswap_tree(swpentry); - spin_lock(&tree->lock); - if (zswap_rb_search(&tree->rbroot, swp_offset(swpentry)) != entry) { - spin_unlock(&tree->lock); - delete_from_swap_cache(folio); - folio_unlock(folio); - folio_put(folio); - return -ENOMEM; + if (entry != xa_load(tree, offset)) { + ret = -ENOMEM; + goto out; } - /* Safe to deref entry after the entry is verified above. 
*/ - zswap_rb_erase(&tree->rbroot, entry); - spin_unlock(&tree->lock); + if (!zswap_decompress(entry, folio)) { + ret = -EIO; + goto out; + } - zswap_decompress(entry, &folio->page); + xa_erase(tree, offset); count_vm_event(ZSWPWB); if (entry->objcg) - count_objcg_event(entry->objcg, ZSWPWB); + count_objcg_events(entry->objcg, ZSWPWB, 1); zswap_entry_free(entry); @@ -1192,16 +1135,43 @@ static int zswap_writeback_entry(struct zswap_entry *entry, /* start writeback */ __swap_writepage(folio, &wbc); - folio_put(folio); - return 0; +out: + if (ret && ret != -EEXIST) { + delete_from_swap_cache(folio); + folio_unlock(folio); + } + folio_put(folio); + return ret; } /********************************* * shrinker functions **********************************/ +/* + * The dynamic shrinker is modulated by the following factors: + * + * 1. Each zswap entry has a referenced bit, which the shrinker unsets (giving + * the entry a second chance) before rotating it in the LRU list. If the + * entry is considered again by the shrinker, with its referenced bit unset, + * it is written back. The writeback rate as a result is dynamically + * adjusted by the pool activities - if the pool is dominated by new entries + * (i.e lots of recent zswapouts), these entries will be protected and + * the writeback rate will slow down. On the other hand, if the pool has a + * lot of stagnant entries, these entries will be reclaimed immediately, + * effectively increasing the writeback rate. + * + * 2. Swapins counter: If we observe swapins, it is a sign that we are + * overshrinking and should slow down. We maintain a swapins counter, which + * is consumed and subtract from the number of eligible objects on the LRU + * in zswap_shrinker_count(). + * + * 3. Compression ratio. The better the workload compresses, the less gains we + * can expect from writeback. We scale down the number of objects available + * for reclaim by this ratio. + */ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, - spinlock_t *lock, void *arg) + void *arg) { struct zswap_entry *entry = container_of(item, struct zswap_entry, lru); bool *encountered_page_in_swapcache = (bool *)arg; @@ -1210,6 +1180,16 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o int writeback_result; /* + * Second chance algorithm: if the entry has its referenced bit set, give it + * a second chance. Only clear the referenced bit and rotate it in the + * zswap's LRU list. + */ + if (entry->referenced) { + entry->referenced = false; + return LRU_ROTATE; + } + + /* * As soon as we drop the LRU lock, the entry can be freed by * a concurrent invalidation. This means the following: * @@ -1245,9 +1225,9 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o /* * It's safe to drop the lock here because we return either - * LRU_REMOVED_RETRY or LRU_RETRY. + * LRU_REMOVED_RETRY, LRU_RETRY or LRU_STOP. 
*/ - spin_unlock(lock); + spin_unlock(&l->lock); writeback_result = zswap_writeback_entry(entry, swpentry); @@ -1268,15 +1248,13 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o zswap_written_back_pages++; } - spin_lock(lock); return ret; } static unsigned long zswap_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc) { - struct lruvec *lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid)); - unsigned long shrink_ret, nr_protected, lru_size; + unsigned long shrink_ret; bool encountered_page_in_swapcache = false; if (!zswap_shrinker_enabled || @@ -1285,25 +1263,6 @@ static unsigned long zswap_shrinker_scan(struct shrinker *shrinker, return SHRINK_STOP; } - nr_protected = - atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected); - lru_size = list_lru_shrink_count(&zswap_list_lru, sc); - - /* - * Abort if we are shrinking into the protected region. - * - * This short-circuiting is necessary because if we have too many multiple - * concurrent reclaimers getting the freeable zswap object counts at the - * same time (before any of them made reasonable progress), the total - * number of reclaimed objects might be more than the number of unprotected - * objects (i.e the reclaimers will reclaim into the protected area of the - * zswap LRU). - */ - if (nr_protected >= lru_size - sc->nr_to_scan) { - sc->nr_scanned = 0; - return SHRINK_STOP; - } - shrink_ret = list_lru_shrink_walk(&zswap_list_lru, sc, &shrink_memcg_cb, &encountered_page_in_swapcache); @@ -1318,7 +1277,10 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker, { struct mem_cgroup *memcg = sc->memcg; struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(sc->nid)); - unsigned long nr_backing, nr_stored, nr_freeable, nr_protected; + atomic_long_t *nr_disk_swapins = + &lruvec->zswap_lruvec_state.nr_disk_swapins; + unsigned long nr_backing, nr_stored, nr_freeable, nr_disk_swapins_cur, + nr_remain; if (!zswap_shrinker_enabled || !mem_cgroup_zswap_writeback_enabled(memcg)) return 0; @@ -1344,21 +1306,34 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker, nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT; nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED); } else { - nr_backing = zswap_pool_total_size >> PAGE_SHIFT; - nr_stored = atomic_read(&zswap_nr_stored); + nr_backing = zswap_total_pages(); + nr_stored = atomic_long_read(&zswap_stored_pages); } if (!nr_stored) return 0; - nr_protected = - atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected); nr_freeable = list_lru_shrink_count(&zswap_list_lru, sc); + if (!nr_freeable) + return 0; + /* - * Subtract the lru size by an estimate of the number of pages - * that should be protected. + * Subtract from the lru size the number of pages that are recently swapped + * in from disk. The idea is that had we protect the zswap's LRU by this + * amount of pages, these disk swapins would not have happened. */ - nr_freeable = nr_freeable > nr_protected ? nr_freeable - nr_protected : 0; + nr_disk_swapins_cur = atomic_long_read(nr_disk_swapins); + do { + if (nr_freeable >= nr_disk_swapins_cur) + nr_remain = 0; + else + nr_remain = nr_disk_swapins_cur - nr_freeable; + } while (!atomic_long_try_cmpxchg( + nr_disk_swapins, &nr_disk_swapins_cur, nr_remain)); + + nr_freeable -= nr_disk_swapins_cur - nr_remain; + if (!nr_freeable) + return 0; /* * Scale the number of freeable pages by the memory saving factor. 
@@ -1387,10 +1362,10 @@ static struct shrinker *zswap_alloc_shrinker(void) static int shrink_memcg(struct mem_cgroup *memcg) { - int nid, shrunk = 0; + int nid, shrunk = 0, scanned = 0; if (!mem_cgroup_zswap_writeback_enabled(memcg)) - return -EINVAL; + return -ENOENT; /* * Skip zombies because their LRUs are reparented and we would be @@ -1404,113 +1379,196 @@ static int shrink_memcg(struct mem_cgroup *memcg) shrunk += list_lru_walk_one(&zswap_list_lru, nid, memcg, &shrink_memcg_cb, NULL, &nr_to_walk); + scanned += 1 - nr_to_walk; } + + if (!scanned) + return -ENOENT; + return shrunk ? 0 : -EAGAIN; } static void shrink_worker(struct work_struct *w) { struct mem_cgroup *memcg; - int ret, failures = 0; + int ret, failures = 0, attempts = 0; + unsigned long thr; - /* global reclaim will select cgroup in a round-robin fashion. */ - do { - spin_lock(&zswap_shrink_lock); - zswap_next_shrink = mem_cgroup_iter(NULL, zswap_next_shrink, NULL); - memcg = zswap_next_shrink; + /* Reclaim down to the accept threshold */ + thr = zswap_accept_thr_pages(); + /* + * Global reclaim will select cgroup in a round-robin fashion from all + * online memcgs, but memcgs that have no pages in zswap and + * writeback-disabled memcgs (memory.zswap.writeback=0) are not + * candidates for shrinking. + * + * Shrinking will be aborted if we encounter the following + * MAX_RECLAIM_RETRIES times: + * - No writeback-candidate memcgs found in a memcg tree walk. + * - Shrinking a writeback-candidate memcg failed. + * + * We save iteration cursor memcg into zswap_next_shrink, + * which can be modified by the offline memcg cleaner + * zswap_memcg_offline_cleanup(). + * + * Since the offline cleaner is called only once, we cannot leave an + * offline memcg reference in zswap_next_shrink. + * We can rely on the cleaner only if we get online memcg under lock. + * + * If we get an offline memcg, we cannot determine if the cleaner has + * already been called or will be called later. We must put back the + * reference before returning from this function. Otherwise, the + * offline memcg left in zswap_next_shrink will hold the reference + * until the next run of shrink_worker(). + */ + do { /* - * We need to retry if we have gone through a full round trip, or if we - * got an offline memcg (or else we risk undoing the effect of the - * zswap memcg offlining cleanup callback). This is not catastrophic - * per se, but it will keep the now offlined memcg hostage for a while. + * Start shrinking from the next memcg after zswap_next_shrink. + * When the offline cleaner has already advanced the cursor, + * advancing the cursor here overlooks one memcg, but this + * should be negligibly rare. * - * Note that if we got an online memcg, we will keep the extra - * reference in case the original reference obtained by mem_cgroup_iter - * is dropped by the zswap memcg offlining callback, ensuring that the - * memcg is not killed when we are reclaiming. + * If we get an online memcg, keep the extra reference in case + * the original one obtained by mem_cgroup_iter() is dropped by + * zswap_memcg_offline_cleanup() while we are shrinking the + * memcg. 
*/ - if (!memcg) { - spin_unlock(&zswap_shrink_lock); - if (++failures == MAX_RECLAIM_RETRIES) - break; - - goto resched; - } - - if (!mem_cgroup_tryget_online(memcg)) { - /* drop the reference from mem_cgroup_iter() */ - mem_cgroup_iter_break(NULL, memcg); - zswap_next_shrink = NULL; - spin_unlock(&zswap_shrink_lock); + spin_lock(&zswap_shrink_lock); + do { + memcg = mem_cgroup_iter(NULL, zswap_next_shrink, NULL); + zswap_next_shrink = memcg; + } while (memcg && !mem_cgroup_tryget_online(memcg)); + spin_unlock(&zswap_shrink_lock); - if (++failures == MAX_RECLAIM_RETRIES) + if (!memcg) { + /* + * Continue shrinking without incrementing failures if + * we found candidate memcgs in the last tree walk. + */ + if (!attempts && ++failures == MAX_RECLAIM_RETRIES) break; + attempts = 0; goto resched; } - spin_unlock(&zswap_shrink_lock); ret = shrink_memcg(memcg); /* drop the extra reference */ mem_cgroup_put(memcg); - if (ret == -EINVAL) - break; + /* + * There are no writeback-candidate pages in the memcg. + * This is not an issue as long as we can find another memcg + * with pages in zswap. Skip this without incrementing attempts + * and failures. + */ + if (ret == -ENOENT) + continue; + ++attempts; + if (ret && ++failures == MAX_RECLAIM_RETRIES) break; - resched: cond_resched(); - } while (!zswap_can_accept()); + } while (zswap_total_pages() > thr); } -static int zswap_is_page_same_filled(void *ptr, unsigned long *value) +/********************************* +* main API +**********************************/ + +static bool zswap_store_page(struct page *page, + struct obj_cgroup *objcg, + struct zswap_pool *pool) { - unsigned long *page; - unsigned long val; - unsigned int pos, last_pos = PAGE_SIZE / sizeof(*page) - 1; + swp_entry_t page_swpentry = page_swap_entry(page); + struct zswap_entry *entry, *old; - page = (unsigned long *)ptr; - val = page[0]; + /* allocate entry */ + entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page)); + if (!entry) { + zswap_reject_kmemcache_fail++; + return false; + } - if (val != page[last_pos]) - return 0; + if (!zswap_compress(page, entry, pool)) + goto compress_failed; - for (pos = 1; pos < last_pos; pos++) { - if (val != page[pos]) - return 0; + old = xa_store(swap_zswap_tree(page_swpentry), + swp_offset(page_swpentry), + entry, GFP_KERNEL); + if (xa_is_err(old)) { + int err = xa_err(old); + + WARN_ONCE(err != -ENOMEM, "unexpected xarray error: %d\n", err); + zswap_reject_alloc_fail++; + goto store_failed; } - *value = val; + /* + * We may have had an existing entry that became stale when + * the folio was redirtied and now the new version is being + * swapped out. Get rid of the old. + */ + if (old) + zswap_entry_free(old); - return 1; -} + /* + * The entry is successfully compressed and stored in the tree, there is + * no further possibility of failure. Grab refs to the pool and objcg, + * charge zswap memory, and increment zswap_stored_pages. + * The opposite actions will be performed by zswap_entry_free() + * when the entry is removed from the tree. + */ + zswap_pool_get(pool); + if (objcg) { + obj_cgroup_get(objcg); + obj_cgroup_charge_zswap(objcg, entry->length); + } + atomic_long_inc(&zswap_stored_pages); -static void zswap_fill_page(void *ptr, unsigned long value) -{ - unsigned long *page; + /* + * We finish initializing the entry while it's already in xarray. + * This is safe because: + * + * 1. Concurrent stores and invalidations are excluded by folio lock. + * + * 2. Writeback is excluded by the entry not being on the LRU yet. 
+ * The publishing order matters to prevent writeback from seeing + * an incoherent entry. + */ + entry->pool = pool; + entry->swpentry = page_swpentry; + entry->objcg = objcg; + entry->referenced = true; + if (entry->length) { + INIT_LIST_HEAD(&entry->lru); + zswap_lru_add(&zswap_list_lru, entry); + } + + return true; - page = (unsigned long *)ptr; - memset_l(page, value, PAGE_SIZE / sizeof(unsigned long)); +store_failed: + zpool_free(pool->zpool, entry->handle); +compress_failed: + zswap_entry_cache_free(entry); + return false; } bool zswap_store(struct folio *folio) { + long nr_pages = folio_nr_pages(folio); swp_entry_t swp = folio->swap; - pgoff_t offset = swp_offset(swp); - struct zswap_tree *tree = swap_zswap_tree(swp); - struct zswap_entry *entry, *dupentry; struct obj_cgroup *objcg = NULL; struct mem_cgroup *memcg = NULL; + struct zswap_pool *pool; + bool ret = false; + long index; VM_WARN_ON_ONCE(!folio_test_locked(folio)); VM_WARN_ON_ONCE(!folio_test_swapcache(folio)); - /* Large folios aren't supported */ - if (folio_test_large(folio)) - return false; - if (!zswap_enabled) goto check_old; @@ -1519,54 +1577,17 @@ bool zswap_store(struct folio *folio) memcg = get_mem_cgroup_from_objcg(objcg); if (shrink_memcg(memcg)) { mem_cgroup_put(memcg); - goto reject; + goto put_objcg; } mem_cgroup_put(memcg); } - /* reclaim space if needed */ - if (zswap_is_full()) { - zswap_pool_limit_hit++; - zswap_pool_reached_full = true; - goto shrink; - } - - if (zswap_pool_reached_full) { - if (!zswap_can_accept()) - goto shrink; - else - zswap_pool_reached_full = false; - } + if (zswap_check_limits()) + goto put_objcg; - /* allocate entry */ - entry = zswap_entry_cache_alloc(GFP_KERNEL, folio_nid(folio)); - if (!entry) { - zswap_reject_kmemcache_fail++; - goto reject; - } - - if (zswap_same_filled_pages_enabled) { - unsigned long value; - u8 *src; - - src = kmap_local_folio(folio, 0); - if (zswap_is_page_same_filled(src, &value)) { - kunmap_local(src); - entry->length = 0; - entry->value = value; - atomic_inc(&zswap_same_filled_pages); - goto insert_entry; - } - kunmap_local(src); - } - - if (!zswap_non_same_filled_pages_enabled) - goto freepage; - - /* if entry is successfully added, it keeps the reference */ - entry->pool = zswap_pool_current_get(); - if (!entry->pool) - goto freepage; + pool = zswap_pool_current_get(); + if (!pool) + goto put_objcg; if (objcg) { memcg = get_mem_cgroup_from_objcg(objcg); @@ -1577,85 +1598,108 @@ bool zswap_store(struct folio *folio) mem_cgroup_put(memcg); } - if (!zswap_compress(folio, entry)) - goto put_pool; + for (index = 0; index < nr_pages; ++index) { + struct page *page = folio_page(folio, index); -insert_entry: - entry->swpentry = swp; - entry->objcg = objcg; - if (objcg) { - obj_cgroup_charge_zswap(objcg, entry->length); - /* Account before objcg ref is moved to tree */ - count_objcg_event(objcg, ZSWPOUT); + if (!zswap_store_page(page, objcg, pool)) + goto put_pool; } - /* map */ - spin_lock(&tree->lock); - /* - * The folio may have been dirtied again, invalidate the - * possibly stale entry before inserting the new entry. 
- */ - if (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) { - zswap_invalidate_entry(tree, dupentry); - WARN_ON(zswap_rb_insert(&tree->rbroot, entry, &dupentry)); - } - if (entry->length) { - INIT_LIST_HEAD(&entry->lru); - zswap_lru_add(&zswap_list_lru, entry); - atomic_inc(&zswap_nr_stored); - } - spin_unlock(&tree->lock); + if (objcg) + count_objcg_events(objcg, ZSWPOUT, nr_pages); - /* update stats */ - atomic_inc(&zswap_stored_pages); - zswap_update_total_size(); - count_vm_event(ZSWPOUT); + count_vm_events(ZSWPOUT, nr_pages); - return true; + ret = true; put_pool: - zswap_pool_put(entry->pool); -freepage: - zswap_entry_cache_free(entry); -reject: - if (objcg) - obj_cgroup_put(objcg); + zswap_pool_put(pool); +put_objcg: + obj_cgroup_put(objcg); + if (!ret && zswap_pool_reached_full) + queue_work(shrink_wq, &zswap_shrink_work); check_old: /* - * If the zswap store fails or zswap is disabled, we must invalidate the - * possibly stale entry which was previously stored at this offset. - * Otherwise, writeback could overwrite the new data in the swapfile. + * If the zswap store fails or zswap is disabled, we must invalidate + * the possibly stale entries which were previously stored at the + * offsets corresponding to each page of the folio. Otherwise, + * writeback could overwrite the new data in the swapfile. */ - spin_lock(&tree->lock); - entry = zswap_rb_search(&tree->rbroot, offset); - if (entry) - zswap_invalidate_entry(tree, entry); - spin_unlock(&tree->lock); - return false; + if (!ret) { + unsigned type = swp_type(swp); + pgoff_t offset = swp_offset(swp); + struct zswap_entry *entry; + struct xarray *tree; + + for (index = 0; index < nr_pages; ++index) { + tree = swap_zswap_tree(swp_entry(type, offset + index)); + entry = xa_erase(tree, offset + index); + if (entry) + zswap_entry_free(entry); + } + } -shrink: - queue_work(shrink_wq, &zswap_shrink_work); - goto reject; + return ret; } -bool zswap_load(struct folio *folio) +/** + * zswap_load() - load a folio from zswap + * @folio: folio to load + * + * Return: 0 on success, with the folio unlocked and marked up-to-date, or one + * of the following error codes: + * + * -EIO: if the swapped out content was in zswap, but could not be loaded + * into the page due to a decompression failure. The folio is unlocked, but + * NOT marked up-to-date, so that an IO error is emitted (e.g. do_swap_page() + * will SIGBUS). + * + * -EINVAL: if the swapped out content was in zswap, but the page belongs + * to a large folio, which is not supported by zswap. The folio is unlocked, + * but NOT marked up-to-date, so that an IO error is emitted (e.g. + * do_swap_page() will SIGBUS). + * + * -ENOENT: if the swapped out content was not in zswap. The folio remains + * locked on return. + */ +int zswap_load(struct folio *folio) { swp_entry_t swp = folio->swap; pgoff_t offset = swp_offset(swp); - struct page *page = &folio->page; bool swapcache = folio_test_swapcache(folio); - struct zswap_tree *tree = swap_zswap_tree(swp); + struct xarray *tree = swap_zswap_tree(swp); struct zswap_entry *entry; - u8 *dst; VM_WARN_ON_ONCE(!folio_test_locked(folio)); - spin_lock(&tree->lock); - entry = zswap_rb_search(&tree->rbroot, offset); - if (!entry) { - spin_unlock(&tree->lock); - return false; + if (zswap_never_enabled()) + return -ENOENT; + + /* + * Large folios should not be swapped in while zswap is being used, as + * they are not properly handled. Zswap does not properly load large + * folios, and a large folio may only be partially in zswap. 
+ */ + if (WARN_ON_ONCE(folio_test_large(folio))) { + folio_unlock(folio); + return -EINVAL; + } + + entry = xa_load(tree, offset); + if (!entry) + return -ENOENT; + + if (!zswap_decompress(entry, folio)) { + folio_unlock(folio); + return -EIO; } + + folio_mark_uptodate(folio); + + count_vm_event(ZSWPIN); + if (entry->objcg) + count_objcg_events(entry->objcg, ZSWPIN, 1); + /* * When reading into the swapcache, invalidate our entry. The * swapcache can be the authoritative owner of the page and @@ -1668,46 +1712,33 @@ bool zswap_load(struct folio *folio) * files, which reads into a private page and may free it if * the fault fails. We remain the primary owner of the entry.) */ - if (swapcache) - zswap_rb_erase(&tree->rbroot, entry); - spin_unlock(&tree->lock); - - if (entry->length) - zswap_decompress(entry, page); - else { - dst = kmap_local_page(page); - zswap_fill_page(dst, entry->value); - kunmap_local(dst); - } - - count_vm_event(ZSWPIN); - if (entry->objcg) - count_objcg_event(entry->objcg, ZSWPIN); - if (swapcache) { - zswap_entry_free(entry); folio_mark_dirty(folio); + xa_erase(tree, offset); + zswap_entry_free(entry); } - return true; + folio_unlock(folio); + return 0; } void zswap_invalidate(swp_entry_t swp) { pgoff_t offset = swp_offset(swp); - struct zswap_tree *tree = swap_zswap_tree(swp); + struct xarray *tree = swap_zswap_tree(swp); struct zswap_entry *entry; - spin_lock(&tree->lock); - entry = zswap_rb_search(&tree->rbroot, offset); + if (xa_empty(tree)) + return; + + entry = xa_erase(tree, offset); if (entry) - zswap_invalidate_entry(tree, entry); - spin_unlock(&tree->lock); + zswap_entry_free(entry); } int zswap_swapon(int type, unsigned long nr_pages) { - struct zswap_tree *trees, *tree; + struct xarray *trees, *tree; unsigned int nr, i; nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES); @@ -1717,11 +1748,8 @@ int zswap_swapon(int type, unsigned long nr_pages) return -ENOMEM; } - for (i = 0; i < nr; i++) { - tree = trees + i; - tree->rbroot = RB_ROOT; - spin_lock_init(&tree->lock); - } + for (i = 0; i < nr; i++) + xa_init(trees + i); nr_zswap_trees[type] = nr; zswap_trees[type] = trees; @@ -1730,7 +1758,7 @@ int zswap_swapon(int type, unsigned long nr_pages) void zswap_swapoff(int type) { - struct zswap_tree *trees = zswap_trees[type]; + struct xarray *trees = zswap_trees[type]; unsigned int i; if (!trees) @@ -1738,7 +1766,7 @@ void zswap_swapoff(int type) /* try_to_unuse() invalidated all the entries already */ for (i = 0; i < nr_zswap_trees[type]; i++) - WARN_ON_ONCE(!RB_EMPTY_ROOT(&trees[i].rbroot)); + WARN_ON_ONCE(!xa_empty(trees + i)); kvfree(trees); nr_zswap_trees[type] = 0; @@ -1753,6 +1781,20 @@ void zswap_swapoff(int type) static struct dentry *zswap_debugfs_root; +static int debugfs_get_total_size(void *data, u64 *val) +{ + *val = zswap_total_pages() * PAGE_SIZE; + return 0; +} +DEFINE_DEBUGFS_ATTRIBUTE(total_size_fops, debugfs_get_total_size, NULL, "%llu\n"); + +static int debugfs_get_stored_pages(void *data, u64 *val) +{ + *val = atomic_long_read(&zswap_stored_pages); + return 0; +} +DEFINE_DEBUGFS_ATTRIBUTE(stored_pages_fops, debugfs_get_stored_pages, NULL, "%llu\n"); + static int zswap_debugfs_init(void) { if (!debugfs_initialized()) @@ -1772,14 +1814,14 @@ static int zswap_debugfs_init(void) zswap_debugfs_root, &zswap_reject_compress_fail); debugfs_create_u64("reject_compress_poor", 0444, zswap_debugfs_root, &zswap_reject_compress_poor); + debugfs_create_u64("decompress_fail", 0444, + zswap_debugfs_root, &zswap_decompress_fail); 
debugfs_create_u64("written_back_pages", 0444, zswap_debugfs_root, &zswap_written_back_pages); - debugfs_create_u64("pool_total_size", 0444, - zswap_debugfs_root, &zswap_pool_total_size); - debugfs_create_atomic_t("stored_pages", 0444, - zswap_debugfs_root, &zswap_stored_pages); - debugfs_create_atomic_t("same_filled_pages", 0444, - zswap_debugfs_root, &zswap_same_filled_pages); + debugfs_create_file("pool_total_size", 0444, + zswap_debugfs_root, NULL, &total_size_fops); + debugfs_create_file("stored_pages", 0444, + zswap_debugfs_root, NULL, &stored_pages_fops); return 0; } @@ -1828,9 +1870,10 @@ static int zswap_setup(void) pool = __zswap_pool_create_fallback(); if (pool) { pr_info("loaded using pool %s/%s\n", pool->tfm_name, - zpool_get_type(pool->zpools[0])); + zpool_get_type(pool->zpool)); list_add(&pool->list, &zswap_pools); zswap_has_pool = true; + static_branch_enable(&zswap_ever_enabled); } else { pr_err("pool creation failed\n"); zswap_enabled = false; |