summaryrefslogtreecommitdiff
path: root/mm/zswap.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/zswap.c')
-rw-r--r--mm/zswap.c307
1 files changed, 150 insertions, 157 deletions
diff --git a/mm/zswap.c b/mm/zswap.c
index adeaf9c97fde..449914ea9919 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -44,8 +44,6 @@
**********************************/
/* The number of compressed pages currently stored in zswap */
atomic_t zswap_stored_pages = ATOMIC_INIT(0);
-/* The number of same-value filled pages currently stored in zswap */
-static atomic_t zswap_same_filled_pages = ATOMIC_INIT(0);
/*
* The statistics below are not protected from concurrent access for
@@ -185,8 +183,11 @@ static struct shrinker *zswap_shrinker;
*
* swpentry - associated swap entry, the offset indexes into the red-black tree
* length - the length in bytes of the compressed page data. Needed during
- * decompression. For a same value filled page length is 0, and both
- * pool and lru are invalid and must be ignored.
+ * decompression.
+ * referenced - true if the entry recently entered the zswap pool. Unset by the
+ * writeback logic. The entry is only reclaimed by the writeback
+ * logic if referenced is unset. See comments in the shrinker
+ * section for context.
* pool - the zswap_pool the entry's data is in
* handle - zpool allocation handle that stores the compressed page data
* value - value of the same-value filled pages which have same content
@@ -196,11 +197,9 @@ static struct shrinker *zswap_shrinker;
struct zswap_entry {
swp_entry_t swpentry;
unsigned int length;
+ bool referenced;
struct zswap_pool *pool;
- union {
- unsigned long handle;
- unsigned long value;
- };
+ unsigned long handle;
struct obj_cgroup *objcg;
struct list_head lru;
};
@@ -700,11 +699,8 @@ static inline int entry_to_nid(struct zswap_entry *entry)
static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry)
{
- atomic_long_t *nr_zswap_protected;
- unsigned long lru_size, old, new;
int nid = entry_to_nid(entry);
struct mem_cgroup *memcg;
- struct lruvec *lruvec;
/*
* Note that it is safe to use rcu_read_lock() here, even in the face of
@@ -722,19 +718,6 @@ static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry)
memcg = mem_cgroup_from_entry(entry);
/* will always succeed */
list_lru_add(list_lru, &entry->lru, nid, memcg);
-
- /* Update the protection area */
- lru_size = list_lru_count_one(list_lru, nid, memcg);
- lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
- nr_zswap_protected = &lruvec->zswap_lruvec_state.nr_zswap_protected;
- old = atomic_long_inc_return(nr_zswap_protected);
- /*
- * Decay to avoid overflow and adapt to changing workloads.
- * This is based on LRU reclaim cost decaying heuristics.
- */
- do {
- new = old > lru_size / 4 ? old / 2 : old;
- } while (!atomic_long_try_cmpxchg(nr_zswap_protected, &old, new));
rcu_read_unlock();
}
@@ -752,7 +735,7 @@ static void zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry)
void zswap_lruvec_state_init(struct lruvec *lruvec)
{
- atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0);
+ atomic_long_set(&lruvec->zswap_lruvec_state.nr_disk_swapins, 0);
}
void zswap_folio_swapin(struct folio *folio)
@@ -761,16 +744,29 @@ void zswap_folio_swapin(struct folio *folio)
if (folio) {
lruvec = folio_lruvec(folio);
- atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected);
+ atomic_long_inc(&lruvec->zswap_lruvec_state.nr_disk_swapins);
}
}
+/*
+ * This function should be called when a memcg is being offlined.
+ *
+ * Since the global shrinker shrink_worker() may hold a reference
+ * of the memcg, we must check and release the reference in
+ * zswap_next_shrink.
+ *
+ * shrink_worker() must handle the case where this function releases
+ * the reference of memcg being shrunk.
+ */
void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg)
{
/* lock out zswap shrinker walking memcg tree */
spin_lock(&zswap_shrink_lock);
- if (zswap_next_shrink == memcg)
- zswap_next_shrink = mem_cgroup_iter(NULL, zswap_next_shrink, NULL);
+ if (zswap_next_shrink == memcg) {
+ do {
+ zswap_next_shrink = mem_cgroup_iter(NULL, zswap_next_shrink, NULL);
+ } while (zswap_next_shrink && !mem_cgroup_online(zswap_next_shrink));
+ }
spin_unlock(&zswap_shrink_lock);
}
@@ -799,13 +795,9 @@ static void zswap_entry_cache_free(struct zswap_entry *entry)
*/
static void zswap_entry_free(struct zswap_entry *entry)
{
- if (!entry->length)
- atomic_dec(&zswap_same_filled_pages);
- else {
- zswap_lru_del(&zswap_list_lru, entry);
- zpool_free(entry->pool->zpool, entry->handle);
- zswap_pool_put(entry->pool);
- }
+ zswap_lru_del(&zswap_list_lru, entry);
+ zpool_free(entry->pool->zpool, entry->handle);
+ zswap_pool_put(entry->pool);
if (entry->objcg) {
obj_cgroup_uncharge_zswap(entry->objcg, entry->length);
obj_cgroup_put(entry->objcg);
@@ -1082,6 +1074,28 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
/*********************************
* shrinker functions
**********************************/
+/*
+ * The dynamic shrinker is modulated by the following factors:
+ *
+ * 1. Each zswap entry has a referenced bit, which the shrinker unsets (giving
+ * the entry a second chance) before rotating it in the LRU list. If the
+ * entry is considered again by the shrinker, with its referenced bit unset,
+ * it is written back. The writeback rate as a result is dynamically
+ * adjusted by the pool activities - if the pool is dominated by new entries
+ * (i.e lots of recent zswapouts), these entries will be protected and
+ * the writeback rate will slow down. On the other hand, if the pool has a
+ * lot of stagnant entries, these entries will be reclaimed immediately,
+ * effectively increasing the writeback rate.
+ *
+ * 2. Swapins counter: If we observe swapins, it is a sign that we are
+ * overshrinking and should slow down. We maintain a swapins counter, which
+ * is consumed and subtract from the number of eligible objects on the LRU
+ * in zswap_shrinker_count().
+ *
+ * 3. Compression ratio. The better the workload compresses, the less gains we
+ * can expect from writeback. We scale down the number of objects available
+ * for reclaim by this ratio.
+ */
static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l,
spinlock_t *lock, void *arg)
{
@@ -1092,6 +1106,16 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
int writeback_result;
/*
+ * Second chance algorithm: if the entry has its referenced bit set, give it
+ * a second chance. Only clear the referenced bit and rotate it in the
+ * zswap's LRU list.
+ */
+ if (entry->referenced) {
+ entry->referenced = false;
+ return LRU_ROTATE;
+ }
+
+ /*
* As soon as we drop the LRU lock, the entry can be freed by
* a concurrent invalidation. This means the following:
*
@@ -1157,8 +1181,7 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
static unsigned long zswap_shrinker_scan(struct shrinker *shrinker,
struct shrink_control *sc)
{
- struct lruvec *lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));
- unsigned long shrink_ret, nr_protected, lru_size;
+ unsigned long shrink_ret;
bool encountered_page_in_swapcache = false;
if (!zswap_shrinker_enabled ||
@@ -1167,25 +1190,6 @@ static unsigned long zswap_shrinker_scan(struct shrinker *shrinker,
return SHRINK_STOP;
}
- nr_protected =
- atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected);
- lru_size = list_lru_shrink_count(&zswap_list_lru, sc);
-
- /*
- * Abort if we are shrinking into the protected region.
- *
- * This short-circuiting is necessary because if we have too many multiple
- * concurrent reclaimers getting the freeable zswap object counts at the
- * same time (before any of them made reasonable progress), the total
- * number of reclaimed objects might be more than the number of unprotected
- * objects (i.e the reclaimers will reclaim into the protected area of the
- * zswap LRU).
- */
- if (nr_protected >= lru_size - sc->nr_to_scan) {
- sc->nr_scanned = 0;
- return SHRINK_STOP;
- }
-
shrink_ret = list_lru_shrink_walk(&zswap_list_lru, sc, &shrink_memcg_cb,
&encountered_page_in_swapcache);
@@ -1200,7 +1204,10 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
{
struct mem_cgroup *memcg = sc->memcg;
struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(sc->nid));
- unsigned long nr_backing, nr_stored, nr_freeable, nr_protected;
+ atomic_long_t *nr_disk_swapins =
+ &lruvec->zswap_lruvec_state.nr_disk_swapins;
+ unsigned long nr_backing, nr_stored, nr_freeable, nr_disk_swapins_cur,
+ nr_remain;
if (!zswap_shrinker_enabled || !mem_cgroup_zswap_writeback_enabled(memcg))
return 0;
@@ -1233,25 +1240,33 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
if (!nr_stored)
return 0;
- nr_protected =
- atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected);
nr_freeable = list_lru_shrink_count(&zswap_list_lru, sc);
+ if (!nr_freeable)
+ return 0;
+
/*
- * Subtract the lru size by an estimate of the number of pages
- * that should be protected.
+ * Subtract from the lru size the number of pages that are recently swapped
+ * in from disk. The idea is that had we protect the zswap's LRU by this
+ * amount of pages, these disk swapins would not have happened.
*/
- nr_freeable = nr_freeable > nr_protected ? nr_freeable - nr_protected : 0;
+ nr_disk_swapins_cur = atomic_long_read(nr_disk_swapins);
+ do {
+ if (nr_freeable >= nr_disk_swapins_cur)
+ nr_remain = 0;
+ else
+ nr_remain = nr_disk_swapins_cur - nr_freeable;
+ } while (!atomic_long_try_cmpxchg(
+ nr_disk_swapins, &nr_disk_swapins_cur, nr_remain));
+
+ nr_freeable -= nr_disk_swapins_cur - nr_remain;
+ if (!nr_freeable)
+ return 0;
/*
* Scale the number of freeable pages by the memory saving factor.
* This ensures that the better zswap compresses memory, the fewer
* pages we will evict to swap (as it will otherwise incur IO for
* relatively small memory saving).
- *
- * The memory saving factor calculated here takes same-filled pages into
- * account, but those are not freeable since they almost occupy no
- * space. Hence, we may scale nr_freeable down a little bit more than we
- * should if we have a lot of same-filled pages.
*/
return mult_frac(nr_freeable, nr_backing, nr_stored);
}
@@ -1274,10 +1289,10 @@ static struct shrinker *zswap_alloc_shrinker(void)
static int shrink_memcg(struct mem_cgroup *memcg)
{
- int nid, shrunk = 0;
+ int nid, shrunk = 0, scanned = 0;
if (!mem_cgroup_zswap_writeback_enabled(memcg))
- return -EINVAL;
+ return -ENOENT;
/*
* Skip zombies because their LRUs are reparented and we would be
@@ -1291,63 +1306,94 @@ static int shrink_memcg(struct mem_cgroup *memcg)
shrunk += list_lru_walk_one(&zswap_list_lru, nid, memcg,
&shrink_memcg_cb, NULL, &nr_to_walk);
+ scanned += 1 - nr_to_walk;
}
+
+ if (!scanned)
+ return -ENOENT;
+
return shrunk ? 0 : -EAGAIN;
}
static void shrink_worker(struct work_struct *w)
{
struct mem_cgroup *memcg;
- int ret, failures = 0;
+ int ret, failures = 0, attempts = 0;
unsigned long thr;
/* Reclaim down to the accept threshold */
thr = zswap_accept_thr_pages();
- /* global reclaim will select cgroup in a round-robin fashion. */
+ /*
+ * Global reclaim will select cgroup in a round-robin fashion from all
+ * online memcgs, but memcgs that have no pages in zswap and
+ * writeback-disabled memcgs (memory.zswap.writeback=0) are not
+ * candidates for shrinking.
+ *
+ * Shrinking will be aborted if we encounter the following
+ * MAX_RECLAIM_RETRIES times:
+ * - No writeback-candidate memcgs found in a memcg tree walk.
+ * - Shrinking a writeback-candidate memcg failed.
+ *
+ * We save iteration cursor memcg into zswap_next_shrink,
+ * which can be modified by the offline memcg cleaner
+ * zswap_memcg_offline_cleanup().
+ *
+ * Since the offline cleaner is called only once, we cannot leave an
+ * offline memcg reference in zswap_next_shrink.
+ * We can rely on the cleaner only if we get online memcg under lock.
+ *
+ * If we get an offline memcg, we cannot determine if the cleaner has
+ * already been called or will be called later. We must put back the
+ * reference before returning from this function. Otherwise, the
+ * offline memcg left in zswap_next_shrink will hold the reference
+ * until the next run of shrink_worker().
+ */
do {
- spin_lock(&zswap_shrink_lock);
- zswap_next_shrink = mem_cgroup_iter(NULL, zswap_next_shrink, NULL);
- memcg = zswap_next_shrink;
-
/*
- * We need to retry if we have gone through a full round trip, or if we
- * got an offline memcg (or else we risk undoing the effect of the
- * zswap memcg offlining cleanup callback). This is not catastrophic
- * per se, but it will keep the now offlined memcg hostage for a while.
+ * Start shrinking from the next memcg after zswap_next_shrink.
+ * When the offline cleaner has already advanced the cursor,
+ * advancing the cursor here overlooks one memcg, but this
+ * should be negligibly rare.
*
- * Note that if we got an online memcg, we will keep the extra
- * reference in case the original reference obtained by mem_cgroup_iter
- * is dropped by the zswap memcg offlining callback, ensuring that the
- * memcg is not killed when we are reclaiming.
+ * If we get an online memcg, keep the extra reference in case
+ * the original one obtained by mem_cgroup_iter() is dropped by
+ * zswap_memcg_offline_cleanup() while we are shrinking the
+ * memcg.
*/
- if (!memcg) {
- spin_unlock(&zswap_shrink_lock);
- if (++failures == MAX_RECLAIM_RETRIES)
- break;
-
- goto resched;
- }
-
- if (!mem_cgroup_tryget_online(memcg)) {
- /* drop the reference from mem_cgroup_iter() */
- mem_cgroup_iter_break(NULL, memcg);
- zswap_next_shrink = NULL;
- spin_unlock(&zswap_shrink_lock);
+ spin_lock(&zswap_shrink_lock);
+ do {
+ memcg = mem_cgroup_iter(NULL, zswap_next_shrink, NULL);
+ zswap_next_shrink = memcg;
+ } while (memcg && !mem_cgroup_tryget_online(memcg));
+ spin_unlock(&zswap_shrink_lock);
- if (++failures == MAX_RECLAIM_RETRIES)
+ if (!memcg) {
+ /*
+ * Continue shrinking without incrementing failures if
+ * we found candidate memcgs in the last tree walk.
+ */
+ if (!attempts && ++failures == MAX_RECLAIM_RETRIES)
break;
+ attempts = 0;
goto resched;
}
- spin_unlock(&zswap_shrink_lock);
ret = shrink_memcg(memcg);
/* drop the extra reference */
mem_cgroup_put(memcg);
- if (ret == -EINVAL)
- break;
+ /*
+ * There are no writeback-candidate pages in the memcg.
+ * This is not an issue as long as we can find another memcg
+ * with pages in zswap. Skip this without incrementing attempts
+ * and failures.
+ */
+ if (ret == -ENOENT)
+ continue;
+ ++attempts;
+
if (ret && ++failures == MAX_RECLAIM_RETRIES)
break;
resched:
@@ -1356,42 +1402,6 @@ resched:
}
/*********************************
-* same-filled functions
-**********************************/
-static bool zswap_is_folio_same_filled(struct folio *folio, unsigned long *value)
-{
- unsigned long *data;
- unsigned long val;
- unsigned int pos, last_pos = PAGE_SIZE / sizeof(*data) - 1;
- bool ret = false;
-
- data = kmap_local_folio(folio, 0);
- val = data[0];
-
- if (val != data[last_pos])
- goto out;
-
- for (pos = 1; pos < last_pos; pos++) {
- if (val != data[pos])
- goto out;
- }
-
- *value = val;
- ret = true;
-out:
- kunmap_local(data);
- return ret;
-}
-
-static void zswap_fill_folio(struct folio *folio, unsigned long value)
-{
- unsigned long *data = kmap_local_folio(folio, 0);
-
- memset_l(data, value, PAGE_SIZE / sizeof(unsigned long));
- kunmap_local(data);
-}
-
-/*********************************
* main API
**********************************/
bool zswap_store(struct folio *folio)
@@ -1402,7 +1412,6 @@ bool zswap_store(struct folio *folio)
struct zswap_entry *entry, *old;
struct obj_cgroup *objcg = NULL;
struct mem_cgroup *memcg = NULL;
- unsigned long value;
VM_WARN_ON_ONCE(!folio_test_locked(folio));
VM_WARN_ON_ONCE(!folio_test_swapcache(folio));
@@ -1435,13 +1444,6 @@ bool zswap_store(struct folio *folio)
goto reject;
}
- if (zswap_is_folio_same_filled(folio, &value)) {
- entry->length = 0;
- entry->value = value;
- atomic_inc(&zswap_same_filled_pages);
- goto store_entry;
- }
-
/* if entry is successfully added, it keeps the reference */
entry->pool = zswap_pool_current_get();
if (!entry->pool)
@@ -1459,9 +1461,9 @@ bool zswap_store(struct folio *folio)
if (!zswap_compress(folio, entry))
goto put_pool;
-store_entry:
entry->swpentry = swp;
entry->objcg = objcg;
+ entry->referenced = true;
old = xa_store(tree, offset, entry, GFP_KERNEL);
if (xa_is_err(old)) {
@@ -1507,13 +1509,9 @@ store_entry:
return true;
store_failed:
- if (!entry->length)
- atomic_dec(&zswap_same_filled_pages);
- else {
- zpool_free(entry->pool->zpool, entry->handle);
+ zpool_free(entry->pool->zpool, entry->handle);
put_pool:
- zswap_pool_put(entry->pool);
- }
+ zswap_pool_put(entry->pool);
freepage:
zswap_entry_cache_free(entry);
reject:
@@ -1576,10 +1574,7 @@ bool zswap_load(struct folio *folio)
if (!entry)
return false;
- if (entry->length)
- zswap_decompress(entry, folio);
- else
- zswap_fill_folio(folio, entry->value);
+ zswap_decompress(entry, folio);
count_vm_event(ZSWPIN);
if (entry->objcg)
@@ -1682,8 +1677,6 @@ static int zswap_debugfs_init(void)
zswap_debugfs_root, NULL, &total_size_fops);
debugfs_create_atomic_t("stored_pages", 0444,
zswap_debugfs_root, &zswap_stored_pages);
- debugfs_create_atomic_t("same_filled_pages", 0444,
- zswap_debugfs_root, &zswap_same_filled_pages);
return 0;
}