diff options
Diffstat (limited to 'mm/zswap.c')
-rw-r--r-- | mm/zswap.c | 307 |
1 files changed, 150 insertions, 157 deletions
diff --git a/mm/zswap.c b/mm/zswap.c index adeaf9c97fde..449914ea9919 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -44,8 +44,6 @@ **********************************/ /* The number of compressed pages currently stored in zswap */ atomic_t zswap_stored_pages = ATOMIC_INIT(0); -/* The number of same-value filled pages currently stored in zswap */ -static atomic_t zswap_same_filled_pages = ATOMIC_INIT(0); /* * The statistics below are not protected from concurrent access for @@ -185,8 +183,11 @@ static struct shrinker *zswap_shrinker; * * swpentry - associated swap entry, the offset indexes into the red-black tree * length - the length in bytes of the compressed page data. Needed during - * decompression. For a same value filled page length is 0, and both - * pool and lru are invalid and must be ignored. + * decompression. + * referenced - true if the entry recently entered the zswap pool. Unset by the + * writeback logic. The entry is only reclaimed by the writeback + * logic if referenced is unset. See comments in the shrinker + * section for context. * pool - the zswap_pool the entry's data is in * handle - zpool allocation handle that stores the compressed page data * value - value of the same-value filled pages which have same content @@ -196,11 +197,9 @@ static struct shrinker *zswap_shrinker; struct zswap_entry { swp_entry_t swpentry; unsigned int length; + bool referenced; struct zswap_pool *pool; - union { - unsigned long handle; - unsigned long value; - }; + unsigned long handle; struct obj_cgroup *objcg; struct list_head lru; }; @@ -700,11 +699,8 @@ static inline int entry_to_nid(struct zswap_entry *entry) static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry) { - atomic_long_t *nr_zswap_protected; - unsigned long lru_size, old, new; int nid = entry_to_nid(entry); struct mem_cgroup *memcg; - struct lruvec *lruvec; /* * Note that it is safe to use rcu_read_lock() here, even in the face of @@ -722,19 +718,6 @@ static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry) memcg = mem_cgroup_from_entry(entry); /* will always succeed */ list_lru_add(list_lru, &entry->lru, nid, memcg); - - /* Update the protection area */ - lru_size = list_lru_count_one(list_lru, nid, memcg); - lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); - nr_zswap_protected = &lruvec->zswap_lruvec_state.nr_zswap_protected; - old = atomic_long_inc_return(nr_zswap_protected); - /* - * Decay to avoid overflow and adapt to changing workloads. - * This is based on LRU reclaim cost decaying heuristics. - */ - do { - new = old > lru_size / 4 ? old / 2 : old; - } while (!atomic_long_try_cmpxchg(nr_zswap_protected, &old, new)); rcu_read_unlock(); } @@ -752,7 +735,7 @@ static void zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry) void zswap_lruvec_state_init(struct lruvec *lruvec) { - atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0); + atomic_long_set(&lruvec->zswap_lruvec_state.nr_disk_swapins, 0); } void zswap_folio_swapin(struct folio *folio) @@ -761,16 +744,29 @@ void zswap_folio_swapin(struct folio *folio) if (folio) { lruvec = folio_lruvec(folio); - atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); + atomic_long_inc(&lruvec->zswap_lruvec_state.nr_disk_swapins); } } +/* + * This function should be called when a memcg is being offlined. + * + * Since the global shrinker shrink_worker() may hold a reference + * of the memcg, we must check and release the reference in + * zswap_next_shrink. + * + * shrink_worker() must handle the case where this function releases + * the reference of memcg being shrunk. + */ void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) { /* lock out zswap shrinker walking memcg tree */ spin_lock(&zswap_shrink_lock); - if (zswap_next_shrink == memcg) - zswap_next_shrink = mem_cgroup_iter(NULL, zswap_next_shrink, NULL); + if (zswap_next_shrink == memcg) { + do { + zswap_next_shrink = mem_cgroup_iter(NULL, zswap_next_shrink, NULL); + } while (zswap_next_shrink && !mem_cgroup_online(zswap_next_shrink)); + } spin_unlock(&zswap_shrink_lock); } @@ -799,13 +795,9 @@ static void zswap_entry_cache_free(struct zswap_entry *entry) */ static void zswap_entry_free(struct zswap_entry *entry) { - if (!entry->length) - atomic_dec(&zswap_same_filled_pages); - else { - zswap_lru_del(&zswap_list_lru, entry); - zpool_free(entry->pool->zpool, entry->handle); - zswap_pool_put(entry->pool); - } + zswap_lru_del(&zswap_list_lru, entry); + zpool_free(entry->pool->zpool, entry->handle); + zswap_pool_put(entry->pool); if (entry->objcg) { obj_cgroup_uncharge_zswap(entry->objcg, entry->length); obj_cgroup_put(entry->objcg); @@ -1082,6 +1074,28 @@ static int zswap_writeback_entry(struct zswap_entry *entry, /********************************* * shrinker functions **********************************/ +/* + * The dynamic shrinker is modulated by the following factors: + * + * 1. Each zswap entry has a referenced bit, which the shrinker unsets (giving + * the entry a second chance) before rotating it in the LRU list. If the + * entry is considered again by the shrinker, with its referenced bit unset, + * it is written back. The writeback rate as a result is dynamically + * adjusted by the pool activities - if the pool is dominated by new entries + * (i.e lots of recent zswapouts), these entries will be protected and + * the writeback rate will slow down. On the other hand, if the pool has a + * lot of stagnant entries, these entries will be reclaimed immediately, + * effectively increasing the writeback rate. + * + * 2. Swapins counter: If we observe swapins, it is a sign that we are + * overshrinking and should slow down. We maintain a swapins counter, which + * is consumed and subtract from the number of eligible objects on the LRU + * in zswap_shrinker_count(). + * + * 3. Compression ratio. The better the workload compresses, the less gains we + * can expect from writeback. We scale down the number of objects available + * for reclaim by this ratio. + */ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, spinlock_t *lock, void *arg) { @@ -1092,6 +1106,16 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o int writeback_result; /* + * Second chance algorithm: if the entry has its referenced bit set, give it + * a second chance. Only clear the referenced bit and rotate it in the + * zswap's LRU list. + */ + if (entry->referenced) { + entry->referenced = false; + return LRU_ROTATE; + } + + /* * As soon as we drop the LRU lock, the entry can be freed by * a concurrent invalidation. This means the following: * @@ -1157,8 +1181,7 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o static unsigned long zswap_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc) { - struct lruvec *lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid)); - unsigned long shrink_ret, nr_protected, lru_size; + unsigned long shrink_ret; bool encountered_page_in_swapcache = false; if (!zswap_shrinker_enabled || @@ -1167,25 +1190,6 @@ static unsigned long zswap_shrinker_scan(struct shrinker *shrinker, return SHRINK_STOP; } - nr_protected = - atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected); - lru_size = list_lru_shrink_count(&zswap_list_lru, sc); - - /* - * Abort if we are shrinking into the protected region. - * - * This short-circuiting is necessary because if we have too many multiple - * concurrent reclaimers getting the freeable zswap object counts at the - * same time (before any of them made reasonable progress), the total - * number of reclaimed objects might be more than the number of unprotected - * objects (i.e the reclaimers will reclaim into the protected area of the - * zswap LRU). - */ - if (nr_protected >= lru_size - sc->nr_to_scan) { - sc->nr_scanned = 0; - return SHRINK_STOP; - } - shrink_ret = list_lru_shrink_walk(&zswap_list_lru, sc, &shrink_memcg_cb, &encountered_page_in_swapcache); @@ -1200,7 +1204,10 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker, { struct mem_cgroup *memcg = sc->memcg; struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(sc->nid)); - unsigned long nr_backing, nr_stored, nr_freeable, nr_protected; + atomic_long_t *nr_disk_swapins = + &lruvec->zswap_lruvec_state.nr_disk_swapins; + unsigned long nr_backing, nr_stored, nr_freeable, nr_disk_swapins_cur, + nr_remain; if (!zswap_shrinker_enabled || !mem_cgroup_zswap_writeback_enabled(memcg)) return 0; @@ -1233,25 +1240,33 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker, if (!nr_stored) return 0; - nr_protected = - atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected); nr_freeable = list_lru_shrink_count(&zswap_list_lru, sc); + if (!nr_freeable) + return 0; + /* - * Subtract the lru size by an estimate of the number of pages - * that should be protected. + * Subtract from the lru size the number of pages that are recently swapped + * in from disk. The idea is that had we protect the zswap's LRU by this + * amount of pages, these disk swapins would not have happened. */ - nr_freeable = nr_freeable > nr_protected ? nr_freeable - nr_protected : 0; + nr_disk_swapins_cur = atomic_long_read(nr_disk_swapins); + do { + if (nr_freeable >= nr_disk_swapins_cur) + nr_remain = 0; + else + nr_remain = nr_disk_swapins_cur - nr_freeable; + } while (!atomic_long_try_cmpxchg( + nr_disk_swapins, &nr_disk_swapins_cur, nr_remain)); + + nr_freeable -= nr_disk_swapins_cur - nr_remain; + if (!nr_freeable) + return 0; /* * Scale the number of freeable pages by the memory saving factor. * This ensures that the better zswap compresses memory, the fewer * pages we will evict to swap (as it will otherwise incur IO for * relatively small memory saving). - * - * The memory saving factor calculated here takes same-filled pages into - * account, but those are not freeable since they almost occupy no - * space. Hence, we may scale nr_freeable down a little bit more than we - * should if we have a lot of same-filled pages. */ return mult_frac(nr_freeable, nr_backing, nr_stored); } @@ -1274,10 +1289,10 @@ static struct shrinker *zswap_alloc_shrinker(void) static int shrink_memcg(struct mem_cgroup *memcg) { - int nid, shrunk = 0; + int nid, shrunk = 0, scanned = 0; if (!mem_cgroup_zswap_writeback_enabled(memcg)) - return -EINVAL; + return -ENOENT; /* * Skip zombies because their LRUs are reparented and we would be @@ -1291,63 +1306,94 @@ static int shrink_memcg(struct mem_cgroup *memcg) shrunk += list_lru_walk_one(&zswap_list_lru, nid, memcg, &shrink_memcg_cb, NULL, &nr_to_walk); + scanned += 1 - nr_to_walk; } + + if (!scanned) + return -ENOENT; + return shrunk ? 0 : -EAGAIN; } static void shrink_worker(struct work_struct *w) { struct mem_cgroup *memcg; - int ret, failures = 0; + int ret, failures = 0, attempts = 0; unsigned long thr; /* Reclaim down to the accept threshold */ thr = zswap_accept_thr_pages(); - /* global reclaim will select cgroup in a round-robin fashion. */ + /* + * Global reclaim will select cgroup in a round-robin fashion from all + * online memcgs, but memcgs that have no pages in zswap and + * writeback-disabled memcgs (memory.zswap.writeback=0) are not + * candidates for shrinking. + * + * Shrinking will be aborted if we encounter the following + * MAX_RECLAIM_RETRIES times: + * - No writeback-candidate memcgs found in a memcg tree walk. + * - Shrinking a writeback-candidate memcg failed. + * + * We save iteration cursor memcg into zswap_next_shrink, + * which can be modified by the offline memcg cleaner + * zswap_memcg_offline_cleanup(). + * + * Since the offline cleaner is called only once, we cannot leave an + * offline memcg reference in zswap_next_shrink. + * We can rely on the cleaner only if we get online memcg under lock. + * + * If we get an offline memcg, we cannot determine if the cleaner has + * already been called or will be called later. We must put back the + * reference before returning from this function. Otherwise, the + * offline memcg left in zswap_next_shrink will hold the reference + * until the next run of shrink_worker(). + */ do { - spin_lock(&zswap_shrink_lock); - zswap_next_shrink = mem_cgroup_iter(NULL, zswap_next_shrink, NULL); - memcg = zswap_next_shrink; - /* - * We need to retry if we have gone through a full round trip, or if we - * got an offline memcg (or else we risk undoing the effect of the - * zswap memcg offlining cleanup callback). This is not catastrophic - * per se, but it will keep the now offlined memcg hostage for a while. + * Start shrinking from the next memcg after zswap_next_shrink. + * When the offline cleaner has already advanced the cursor, + * advancing the cursor here overlooks one memcg, but this + * should be negligibly rare. * - * Note that if we got an online memcg, we will keep the extra - * reference in case the original reference obtained by mem_cgroup_iter - * is dropped by the zswap memcg offlining callback, ensuring that the - * memcg is not killed when we are reclaiming. + * If we get an online memcg, keep the extra reference in case + * the original one obtained by mem_cgroup_iter() is dropped by + * zswap_memcg_offline_cleanup() while we are shrinking the + * memcg. */ - if (!memcg) { - spin_unlock(&zswap_shrink_lock); - if (++failures == MAX_RECLAIM_RETRIES) - break; - - goto resched; - } - - if (!mem_cgroup_tryget_online(memcg)) { - /* drop the reference from mem_cgroup_iter() */ - mem_cgroup_iter_break(NULL, memcg); - zswap_next_shrink = NULL; - spin_unlock(&zswap_shrink_lock); + spin_lock(&zswap_shrink_lock); + do { + memcg = mem_cgroup_iter(NULL, zswap_next_shrink, NULL); + zswap_next_shrink = memcg; + } while (memcg && !mem_cgroup_tryget_online(memcg)); + spin_unlock(&zswap_shrink_lock); - if (++failures == MAX_RECLAIM_RETRIES) + if (!memcg) { + /* + * Continue shrinking without incrementing failures if + * we found candidate memcgs in the last tree walk. + */ + if (!attempts && ++failures == MAX_RECLAIM_RETRIES) break; + attempts = 0; goto resched; } - spin_unlock(&zswap_shrink_lock); ret = shrink_memcg(memcg); /* drop the extra reference */ mem_cgroup_put(memcg); - if (ret == -EINVAL) - break; + /* + * There are no writeback-candidate pages in the memcg. + * This is not an issue as long as we can find another memcg + * with pages in zswap. Skip this without incrementing attempts + * and failures. + */ + if (ret == -ENOENT) + continue; + ++attempts; + if (ret && ++failures == MAX_RECLAIM_RETRIES) break; resched: @@ -1356,42 +1402,6 @@ resched: } /********************************* -* same-filled functions -**********************************/ -static bool zswap_is_folio_same_filled(struct folio *folio, unsigned long *value) -{ - unsigned long *data; - unsigned long val; - unsigned int pos, last_pos = PAGE_SIZE / sizeof(*data) - 1; - bool ret = false; - - data = kmap_local_folio(folio, 0); - val = data[0]; - - if (val != data[last_pos]) - goto out; - - for (pos = 1; pos < last_pos; pos++) { - if (val != data[pos]) - goto out; - } - - *value = val; - ret = true; -out: - kunmap_local(data); - return ret; -} - -static void zswap_fill_folio(struct folio *folio, unsigned long value) -{ - unsigned long *data = kmap_local_folio(folio, 0); - - memset_l(data, value, PAGE_SIZE / sizeof(unsigned long)); - kunmap_local(data); -} - -/********************************* * main API **********************************/ bool zswap_store(struct folio *folio) @@ -1402,7 +1412,6 @@ bool zswap_store(struct folio *folio) struct zswap_entry *entry, *old; struct obj_cgroup *objcg = NULL; struct mem_cgroup *memcg = NULL; - unsigned long value; VM_WARN_ON_ONCE(!folio_test_locked(folio)); VM_WARN_ON_ONCE(!folio_test_swapcache(folio)); @@ -1435,13 +1444,6 @@ bool zswap_store(struct folio *folio) goto reject; } - if (zswap_is_folio_same_filled(folio, &value)) { - entry->length = 0; - entry->value = value; - atomic_inc(&zswap_same_filled_pages); - goto store_entry; - } - /* if entry is successfully added, it keeps the reference */ entry->pool = zswap_pool_current_get(); if (!entry->pool) @@ -1459,9 +1461,9 @@ bool zswap_store(struct folio *folio) if (!zswap_compress(folio, entry)) goto put_pool; -store_entry: entry->swpentry = swp; entry->objcg = objcg; + entry->referenced = true; old = xa_store(tree, offset, entry, GFP_KERNEL); if (xa_is_err(old)) { @@ -1507,13 +1509,9 @@ store_entry: return true; store_failed: - if (!entry->length) - atomic_dec(&zswap_same_filled_pages); - else { - zpool_free(entry->pool->zpool, entry->handle); + zpool_free(entry->pool->zpool, entry->handle); put_pool: - zswap_pool_put(entry->pool); - } + zswap_pool_put(entry->pool); freepage: zswap_entry_cache_free(entry); reject: @@ -1576,10 +1574,7 @@ bool zswap_load(struct folio *folio) if (!entry) return false; - if (entry->length) - zswap_decompress(entry, folio); - else - zswap_fill_folio(folio, entry->value); + zswap_decompress(entry, folio); count_vm_event(ZSWPIN); if (entry->objcg) @@ -1682,8 +1677,6 @@ static int zswap_debugfs_init(void) zswap_debugfs_root, NULL, &total_size_fops); debugfs_create_atomic_t("stored_pages", 0444, zswap_debugfs_root, &zswap_stored_pages); - debugfs_create_atomic_t("same_filled_pages", 0444, - zswap_debugfs_root, &zswap_same_filled_pages); return 0; } |