Diffstat (limited to 'mm')
-rw-r--r--  mm/compaction.c         2
-rw-r--r--  mm/kasan/common.c       4
-rw-r--r--  mm/kasan/hw_tags.c     32
-rw-r--r--  mm/kasan/sw_tags.c      7
-rw-r--r--  mm/memcontrol.c        11
-rw-r--r--  mm/memfd.c              4
-rw-r--r--  mm/mempool.c            6
-rw-r--r--  mm/mlock.c             22
-rw-r--r--  mm/mmap.c               4
-rw-r--r--  mm/page-writeback.c     5
-rw-r--r--  mm/page_alloc.c        66
-rw-r--r--  mm/percpu-internal.h   56
-rw-r--r--  mm/percpu-km.c         10
-rw-r--r--  mm/percpu-stats.c      44
-rw-r--r--  mm/percpu-vm.c         35
-rw-r--r--  mm/percpu.c           368
-rw-r--r--  mm/shmem.c             10
17 files changed, 446 insertions(+), 240 deletions(-)
diff --git a/mm/compaction.c b/mm/compaction.c
index 4796c197295f..621508e0ecd5 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1953,7 +1953,7 @@ static inline bool is_via_compact_memory(int order)
static bool kswapd_is_running(pg_data_t *pgdat)
{
- return pgdat->kswapd && (pgdat->kswapd->state == TASK_RUNNING);
+ return pgdat->kswapd && task_is_running(pgdat->kswapd);
}
/*
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 267500896b1e..2baf121fb8c5 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -100,7 +100,7 @@ slab_flags_t __kasan_never_merge(void)
return 0;
}
-void __kasan_alloc_pages(struct page *page, unsigned int order, bool init)
+void __kasan_unpoison_pages(struct page *page, unsigned int order, bool init)
{
u8 tag;
unsigned long i;
@@ -114,7 +114,7 @@ void __kasan_alloc_pages(struct page *page, unsigned int order, bool init)
kasan_unpoison(page_address(page), PAGE_SIZE << order, init);
}
-void __kasan_free_pages(struct page *page, unsigned int order, bool init)
+void __kasan_poison_pages(struct page *page, unsigned int order, bool init)
{
if (likely(!PageHighMem(page)))
kasan_poison(page_address(page), PAGE_SIZE << order,
diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c
index d867b22ddbb7..4ea8c368b5b8 100644
--- a/mm/kasan/hw_tags.c
+++ b/mm/kasan/hw_tags.c
@@ -216,6 +216,38 @@ void __init kasan_init_hw_tags(void)
pr_info("KernelAddressSanitizer initialized\n");
}
+void kasan_alloc_pages(struct page *page, unsigned int order, gfp_t flags)
+{
+ /*
+ * This condition should match the one in post_alloc_hook() in
+ * page_alloc.c.
+ */
+ bool init = !want_init_on_free() && want_init_on_alloc(flags);
+
+ if (flags & __GFP_SKIP_KASAN_POISON)
+ SetPageSkipKASanPoison(page);
+
+ if (flags & __GFP_ZEROTAGS) {
+ int i;
+
+ for (i = 0; i != 1 << order; ++i)
+ tag_clear_highpage(page + i);
+ } else {
+ kasan_unpoison_pages(page, order, init);
+ }
+}
+
+void kasan_free_pages(struct page *page, unsigned int order)
+{
+ /*
+ * This condition should match the one in free_pages_prepare() in
+ * page_alloc.c.
+ */
+ bool init = want_init_on_free();
+
+ kasan_poison_pages(page, order, init);
+}
+
#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
void kasan_set_tagging_report_once(bool state)
diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c
index 675e67375fb5..bd3f540feb47 100644
--- a/mm/kasan/sw_tags.c
+++ b/mm/kasan/sw_tags.c
@@ -166,3 +166,10 @@ void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size)
kasan_poison((void *)addr, size, tag, false);
}
EXPORT_SYMBOL(__hwasan_tag_memory);
+
+void kasan_tag_mismatch(unsigned long addr, unsigned long access_info,
+ unsigned long ret_ip)
+{
+ kasan_report(addr, 1 << (access_info & 0xf), access_info & 0x10,
+ ret_ip);
+}
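[Reviewer note: the tag-mismatch callback added above decodes its access_info argument exactly as the hunk shows: the low four bits hold log2 of the access size and bit 4 distinguishes writes from reads. A minimal user-space sketch of that decoding; the sample values are made up for illustration.]

#include <stdbool.h>
#include <stdio.h>

/* Mirrors the decoding in kasan_tag_mismatch(): low 4 bits = log2(size),
 * bit 4 = write access. The sample values below are purely illustrative. */
static void decode_access_info(unsigned long access_info)
{
	unsigned long size = 1UL << (access_info & 0xf);
	bool is_write = access_info & 0x10;

	printf("%s of %lu byte(s)\n", is_write ? "write" : "read", size);
}

int main(void)
{
	decode_access_info(0x13);	/* write, 8 bytes */
	decode_access_info(0x02);	/* read, 4 bytes */
	return 0;
}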
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b826dad6fa36..ae1f5d0cb581 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -81,14 +81,14 @@ DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
EXPORT_PER_CPU_SYMBOL_GPL(int_active_memcg);
/* Socket memory accounting disabled? */
-static bool cgroup_memory_nosocket;
+static bool cgroup_memory_nosocket __ro_after_init;
/* Kernel memory accounting disabled? */
-bool cgroup_memory_nokmem;
+bool cgroup_memory_nokmem __ro_after_init;
/* Whether the swap controller is active */
#ifdef CONFIG_MEMCG_SWAP
-bool cgroup_memory_noswap __read_mostly;
+bool cgroup_memory_noswap __ro_after_init;
#else
#define cgroup_memory_noswap 1
#endif
@@ -256,6 +256,11 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
#ifdef CONFIG_MEMCG_KMEM
extern spinlock_t css_set_lock;
+bool mem_cgroup_kmem_disabled(void)
+{
+ return cgroup_memory_nokmem;
+}
+
static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
unsigned int nr_pages);
diff --git a/mm/memfd.c b/mm/memfd.c
index 2647c898990c..081dd33e6a61 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -297,9 +297,9 @@ SYSCALL_DEFINE2(memfd_create,
}
if (flags & MFD_HUGETLB) {
- struct user_struct *user = NULL;
+ struct ucounts *ucounts = NULL;
- file = hugetlb_file_setup(name, 0, VM_NORESERVE, &user,
+ file = hugetlb_file_setup(name, 0, VM_NORESERVE, &ucounts,
HUGETLB_ANONHUGE_INODE,
(flags >> MFD_HUGE_SHIFT) &
MFD_HUGE_MASK);
diff --git a/mm/mempool.c b/mm/mempool.c
index a258cf4de575..0b8afbec3e35 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -106,7 +106,8 @@ static __always_inline void kasan_poison_element(mempool_t *pool, void *element)
if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
kasan_slab_free_mempool(element);
else if (pool->alloc == mempool_alloc_pages)
- kasan_free_pages(element, (unsigned long)pool->pool_data, false);
+ kasan_poison_pages(element, (unsigned long)pool->pool_data,
+ false);
}
static void kasan_unpoison_element(mempool_t *pool, void *element)
@@ -114,7 +115,8 @@ static void kasan_unpoison_element(mempool_t *pool, void *element)
if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
kasan_unpoison_range(element, __ksize(element));
else if (pool->alloc == mempool_alloc_pages)
- kasan_alloc_pages(element, (unsigned long)pool->pool_data, false);
+ kasan_unpoison_pages(element, (unsigned long)pool->pool_data,
+ false);
}
static __always_inline void add_element(mempool_t *pool, void *element)
diff --git a/mm/mlock.c b/mm/mlock.c
index 4ab757ab6fe8..0d639bf48794 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -817,9 +817,10 @@ SYSCALL_DEFINE0(munlockall)
*/
static DEFINE_SPINLOCK(shmlock_user_lock);
-int user_shm_lock(size_t size, struct user_struct *user)
+int user_shm_lock(size_t size, struct ucounts *ucounts)
{
unsigned long lock_limit, locked;
+ long memlock;
int allowed = 0;
locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
@@ -828,21 +829,26 @@ int user_shm_lock(size_t size, struct user_struct *user)
allowed = 1;
lock_limit >>= PAGE_SHIFT;
spin_lock(&shmlock_user_lock);
- if (!allowed &&
- locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK))
+ memlock = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
+
+ if (!allowed && (memlock == LONG_MAX || memlock > lock_limit) && !capable(CAP_IPC_LOCK)) {
+ dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
+ goto out;
+ }
+ if (!get_ucounts(ucounts)) {
+ dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
goto out;
- get_uid(user);
- user->locked_shm += locked;
+ }
allowed = 1;
out:
spin_unlock(&shmlock_user_lock);
return allowed;
}
-void user_shm_unlock(size_t size, struct user_struct *user)
+void user_shm_unlock(size_t size, struct ucounts *ucounts)
{
spin_lock(&shmlock_user_lock);
- user->locked_shm -= (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, (size + PAGE_SIZE - 1) >> PAGE_SHIFT);
spin_unlock(&shmlock_user_lock);
- free_uid(user);
+ put_ucounts(ucounts);
}
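[Reviewer note: the user_shm_lock() rewrite above follows a charge-then-check pattern: bump the UCOUNT_RLIMIT_MEMLOCK counter first, then roll the charge back if the limit is exceeded or the ucounts reference cannot be taken. A stand-alone sketch of that pattern with stubbed-in counters standing in for the ucounts API; all names and limits here are illustrative, not the kernel interfaces.]

#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-in for inc_rlimit_ucounts()/dec_rlimit_ucounts():
 * a single counter that reports LONG_MAX on overflow. */
static long memlock_pages;

static long charge(long pages)
{
	if (memlock_pages > LONG_MAX - pages)
		return LONG_MAX;		/* overflow: report saturation */
	memlock_pages += pages;
	return memlock_pages;
}

static void uncharge(long pages)
{
	memlock_pages -= pages;
}

/* Mirrors the control flow of the new user_shm_lock(): charge first,
 * then undo the charge if the result is over the limit. */
static bool shm_lock(long pages, long lock_limit, bool cap_ipc_lock)
{
	long charged = charge(pages);

	if ((charged == LONG_MAX || charged > lock_limit) && !cap_ipc_lock) {
		uncharge(pages);
		return false;
	}
	return true;
}

int main(void)
{
	printf("first lock: %s\n", shm_lock(16, 64, false) ? "ok" : "denied");
	printf("over limit: %s\n", shm_lock(64, 64, false) ? "ok" : "denied");
	printf("charged pages now: %ld\n", memlock_pages);
	return 0;
}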
diff --git a/mm/mmap.c b/mm/mmap.c
index d8c92ae50565..aa9de981b659 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1609,7 +1609,7 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
goto out_fput;
}
} else if (flags & MAP_HUGETLB) {
- struct user_struct *user = NULL;
+ struct ucounts *ucounts = NULL;
struct hstate *hs;
hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
@@ -1625,7 +1625,7 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
*/
file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
VM_NORESERVE,
- &user, HUGETLB_ANONHUGE_INODE,
+ &ucounts, HUGETLB_ANONHUGE_INODE,
(flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
if (IS_ERR(file))
return PTR_ERR(file);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index e5b38ffe9fca..9f63548f247c 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -108,11 +108,6 @@ EXPORT_SYMBOL_GPL(dirty_writeback_interval);
unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */
/*
- * Flag that makes the machine dump writes/reads and block dirtyings.
- */
-int block_dump;
-
-/*
* Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies:
* a full sync is triggered after this time elapses without any disk activity.
*/
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2f71d92c45b4..d6e94cc8066c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -387,7 +387,7 @@ int page_group_by_mobility_disabled __read_mostly;
static DEFINE_STATIC_KEY_TRUE(deferred_pages);
/*
- * Calling kasan_free_pages() only after deferred memory initialization
+ * Calling kasan_poison_pages() only after deferred memory initialization
* has completed. Poisoning pages during deferred memory init will greatly
* lengthen the process and cause problem in large memory systems as the
* deferred pages initialization is done with interrupt disabled.
@@ -399,15 +399,12 @@ static DEFINE_STATIC_KEY_TRUE(deferred_pages);
* on-demand allocation and then freed again before the deferred pages
* initialization is done, but this is not likely to happen.
*/
-static inline void kasan_free_nondeferred_pages(struct page *page, int order,
- bool init, fpi_t fpi_flags)
+static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
{
- if (static_branch_unlikely(&deferred_pages))
- return;
- if (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
- (fpi_flags & FPI_SKIP_KASAN_POISON))
- return;
- kasan_free_pages(page, order, init);
+ return static_branch_unlikely(&deferred_pages) ||
+ (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
+ (fpi_flags & FPI_SKIP_KASAN_POISON)) ||
+ PageSkipKASanPoison(page);
}
/* Returns true if the struct page for the pfn is uninitialised */
@@ -458,13 +455,11 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
return false;
}
#else
-static inline void kasan_free_nondeferred_pages(struct page *page, int order,
- bool init, fpi_t fpi_flags)
+static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
{
- if (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
- (fpi_flags & FPI_SKIP_KASAN_POISON))
- return;
- kasan_free_pages(page, order, init);
+ return (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
+ (fpi_flags & FPI_SKIP_KASAN_POISON)) ||
+ PageSkipKASanPoison(page);
}
static inline bool early_page_uninitialised(unsigned long pfn)
@@ -1281,10 +1276,16 @@ out:
return ret;
}
-static void kernel_init_free_pages(struct page *page, int numpages)
+static void kernel_init_free_pages(struct page *page, int numpages, bool zero_tags)
{
int i;
+ if (zero_tags) {
+ for (i = 0; i < numpages; i++)
+ tag_clear_highpage(page + i);
+ return;
+ }
+
/* s390's use of memset() could override KASAN redzones. */
kasan_disable_current();
for (i = 0; i < numpages; i++) {
@@ -1300,7 +1301,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
unsigned int order, bool check_free, fpi_t fpi_flags)
{
int bad = 0;
- bool init;
+ bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags);
VM_BUG_ON_PAGE(PageTail(page), page);
@@ -1369,10 +1370,17 @@ static __always_inline bool free_pages_prepare(struct page *page,
* With hardware tag-based KASAN, memory tags must be set before the
* page becomes unavailable via debug_pagealloc or arch_free_page.
*/
- init = want_init_on_free();
- if (init && !kasan_has_integrated_init())
- kernel_init_free_pages(page, 1 << order);
- kasan_free_nondeferred_pages(page, order, init, fpi_flags);
+ if (kasan_has_integrated_init()) {
+ if (!skip_kasan_poison)
+ kasan_free_pages(page, order);
+ } else {
+ bool init = want_init_on_free();
+
+ if (init)
+ kernel_init_free_pages(page, 1 << order, false);
+ if (!skip_kasan_poison)
+ kasan_poison_pages(page, order, init);
+ }
/*
* arch_free_page() can make the page's contents inaccessible. s390
@@ -2398,8 +2406,6 @@ static bool check_new_pages(struct page *page, unsigned int order)
inline void post_alloc_hook(struct page *page, unsigned int order,
gfp_t gfp_flags)
{
- bool init;
-
set_page_private(page, 0);
set_page_refcounted(page);
@@ -2418,10 +2424,16 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
* kasan_alloc_pages and kernel_init_free_pages must be
* kept together to avoid discrepancies in behavior.
*/
- init = !want_init_on_free() && want_init_on_alloc(gfp_flags);
- kasan_alloc_pages(page, order, init);
- if (init && !kasan_has_integrated_init())
- kernel_init_free_pages(page, 1 << order);
+ if (kasan_has_integrated_init()) {
+ kasan_alloc_pages(page, order, gfp_flags);
+ } else {
+ bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags);
+
+ kasan_unpoison_pages(page, order, init);
+ if (init)
+ kernel_init_free_pages(page, 1 << order,
+ gfp_flags & __GFP_ZEROTAGS);
+ }
set_page_owner(page, order, gfp_flags);
}
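[Reviewer note: the page_alloc.c changes above split the alloc/free paths on kasan_has_integrated_init(): with integrated init, KASAN's own hooks both retag and initialise the memory, otherwise the kernel zeroes (or zero-tags) the pages itself and only then poisons/unpoisons them. A compressed user-space sketch of the free-path decision; every function here is a placeholder for the kernel helper of the same name, not the real API.]

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stubs for the kernel helpers named in the diff; they only
 * print what the real code would do. */
static bool kasan_has_integrated_init(void) { return false; }
static bool want_init_on_free(void)         { return true; }
static bool skip_kasan_poison;              /* should_skip_kasan_poison() result */

static void kasan_free_pages(int order)     { printf("kasan_free_pages(order=%d)\n", order); }
static void kernel_init_free_pages(int n)   { printf("zero %d page(s)\n", n); }
static void kasan_poison_pages(int order, bool init)
{
	printf("kasan_poison_pages(order=%d, init=%d)\n", order, init);
}

/* Mirrors the free_pages_prepare() hunk: integrated init lets KASAN do both
 * jobs; otherwise init and poison happen as two explicit steps. */
static void free_path(int order)
{
	if (kasan_has_integrated_init()) {
		if (!skip_kasan_poison)
			kasan_free_pages(order);
	} else {
		bool init = want_init_on_free();

		if (init)
			kernel_init_free_pages(1 << order);
		if (!skip_kasan_poison)
			kasan_poison_pages(order, init);
	}
}

int main(void)
{
	free_path(2);	/* order-2 block: expect zeroing then poisoning */
	return 0;
}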
diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h
index ae26b118e246..639662c20c82 100644
--- a/mm/percpu-internal.h
+++ b/mm/percpu-internal.h
@@ -6,25 +6,6 @@
#include <linux/percpu.h>
/*
- * There are two chunk types: root and memcg-aware.
- * Chunks of each type have separate slots list.
- *
- * Memcg-aware chunks have an attached vector of obj_cgroup pointers, which is
- * used to store memcg membership data of a percpu object. Obj_cgroups are
- * ref-counted pointers to a memory cgroup with an ability to switch dynamically
- * to the parent memory cgroup. This allows to reclaim a deleted memory cgroup
- * without reclaiming of all outstanding objects, which hold a reference at it.
- */
-enum pcpu_chunk_type {
- PCPU_CHUNK_ROOT,
-#ifdef CONFIG_MEMCG_KMEM
- PCPU_CHUNK_MEMCG,
-#endif
- PCPU_NR_CHUNK_TYPES,
- PCPU_FAIL_ALLOC = PCPU_NR_CHUNK_TYPES
-};
-
-/*
* pcpu_block_md is the metadata block struct.
* Each chunk's bitmap is split into a number of full blocks.
* All units are in terms of bits.
@@ -67,6 +48,8 @@ struct pcpu_chunk {
void *data; /* chunk data */
bool immutable; /* no [de]population allowed */
+ bool isolated; /* isolated from active chunk
+ slots */
int start_offset; /* the overlap with the previous
region to have a page aligned
base_addr */
@@ -87,7 +70,9 @@ extern spinlock_t pcpu_lock;
extern struct list_head *pcpu_chunk_lists;
extern int pcpu_nr_slots;
-extern int pcpu_nr_empty_pop_pages[];
+extern int pcpu_sidelined_slot;
+extern int pcpu_to_depopulate_slot;
+extern int pcpu_nr_empty_pop_pages;
extern struct pcpu_chunk *pcpu_first_chunk;
extern struct pcpu_chunk *pcpu_reserved_chunk;
@@ -128,37 +113,6 @@ static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk)
return pcpu_nr_pages_to_map_bits(chunk->nr_pages);
}
-#ifdef CONFIG_MEMCG_KMEM
-static inline enum pcpu_chunk_type pcpu_chunk_type(struct pcpu_chunk *chunk)
-{
- if (chunk->obj_cgroups)
- return PCPU_CHUNK_MEMCG;
- return PCPU_CHUNK_ROOT;
-}
-
-static inline bool pcpu_is_memcg_chunk(enum pcpu_chunk_type chunk_type)
-{
- return chunk_type == PCPU_CHUNK_MEMCG;
-}
-
-#else
-static inline enum pcpu_chunk_type pcpu_chunk_type(struct pcpu_chunk *chunk)
-{
- return PCPU_CHUNK_ROOT;
-}
-
-static inline bool pcpu_is_memcg_chunk(enum pcpu_chunk_type chunk_type)
-{
- return false;
-}
-#endif
-
-static inline struct list_head *pcpu_chunk_list(enum pcpu_chunk_type chunk_type)
-{
- return &pcpu_chunk_lists[pcpu_nr_slots *
- pcpu_is_memcg_chunk(chunk_type)];
-}
-
#ifdef CONFIG_PERCPU_STATS
#include <linux/spinlock.h>
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
index 35c9941077ee..c9d529dc7651 100644
--- a/mm/percpu-km.c
+++ b/mm/percpu-km.c
@@ -44,8 +44,7 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
/* nada */
}
-static struct pcpu_chunk *pcpu_create_chunk(enum pcpu_chunk_type type,
- gfp_t gfp)
+static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
{
const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT;
struct pcpu_chunk *chunk;
@@ -53,7 +52,7 @@ static struct pcpu_chunk *pcpu_create_chunk(enum pcpu_chunk_type type,
unsigned long flags;
int i;
- chunk = pcpu_alloc_chunk(type, gfp);
+ chunk = pcpu_alloc_chunk(gfp);
if (!chunk)
return NULL;
@@ -118,3 +117,8 @@ static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai)
return 0;
}
+
+static bool pcpu_should_reclaim_chunk(struct pcpu_chunk *chunk)
+{
+ return false;
+}
diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c
index f6026dbcdf6b..c6bd092ff7a3 100644
--- a/mm/percpu-stats.c
+++ b/mm/percpu-stats.c
@@ -34,15 +34,11 @@ static int find_max_nr_alloc(void)
{
struct pcpu_chunk *chunk;
int slot, max_nr_alloc;
- enum pcpu_chunk_type type;
max_nr_alloc = 0;
- for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++)
- for (slot = 0; slot < pcpu_nr_slots; slot++)
- list_for_each_entry(chunk, &pcpu_chunk_list(type)[slot],
- list)
- max_nr_alloc = max(max_nr_alloc,
- chunk->nr_alloc);
+ for (slot = 0; slot < pcpu_nr_slots; slot++)
+ list_for_each_entry(chunk, &pcpu_chunk_lists[slot], list)
+ max_nr_alloc = max(max_nr_alloc, chunk->nr_alloc);
return max_nr_alloc;
}
@@ -133,9 +129,6 @@ static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk,
P("cur_min_alloc", cur_min_alloc);
P("cur_med_alloc", cur_med_alloc);
P("cur_max_alloc", cur_max_alloc);
-#ifdef CONFIG_MEMCG_KMEM
- P("memcg_aware", pcpu_is_memcg_chunk(pcpu_chunk_type(chunk)));
-#endif
seq_putc(m, '\n');
}
@@ -144,8 +137,6 @@ static int percpu_stats_show(struct seq_file *m, void *v)
struct pcpu_chunk *chunk;
int slot, max_nr_alloc;
int *buffer;
- enum pcpu_chunk_type type;
- int nr_empty_pop_pages;
alloc_buffer:
spin_lock_irq(&pcpu_lock);
@@ -166,10 +157,6 @@ alloc_buffer:
goto alloc_buffer;
}
- nr_empty_pop_pages = 0;
- for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++)
- nr_empty_pop_pages += pcpu_nr_empty_pop_pages[type];
-
#define PL(X) \
seq_printf(m, " %-20s: %12lld\n", #X, (long long int)pcpu_stats_ai.X)
@@ -201,7 +188,7 @@ alloc_buffer:
PU(nr_max_chunks);
PU(min_alloc_size);
PU(max_alloc_size);
- P("empty_pop_pages", nr_empty_pop_pages);
+ P("empty_pop_pages", pcpu_nr_empty_pop_pages);
seq_putc(m, '\n');
#undef PU
@@ -215,18 +202,17 @@ alloc_buffer:
chunk_map_stats(m, pcpu_reserved_chunk, buffer);
}
- for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++) {
- for (slot = 0; slot < pcpu_nr_slots; slot++) {
- list_for_each_entry(chunk, &pcpu_chunk_list(type)[slot],
- list) {
- if (chunk == pcpu_first_chunk) {
- seq_puts(m, "Chunk: <- First Chunk\n");
- chunk_map_stats(m, chunk, buffer);
- } else {
- seq_puts(m, "Chunk:\n");
- chunk_map_stats(m, chunk, buffer);
- }
- }
+ for (slot = 0; slot < pcpu_nr_slots; slot++) {
+ list_for_each_entry(chunk, &pcpu_chunk_lists[slot], list) {
+ if (chunk == pcpu_first_chunk)
+ seq_puts(m, "Chunk: <- First Chunk\n");
+ else if (slot == pcpu_to_depopulate_slot)
+ seq_puts(m, "Chunk (to_depopulate)\n");
+ else if (slot == pcpu_sidelined_slot)
+ seq_puts(m, "Chunk (sidelined):\n");
+ else
+ seq_puts(m, "Chunk:\n");
+ chunk_map_stats(m, chunk, buffer);
}
}
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 8d3844bc0c7c..ee5d89fcd66f 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -329,13 +329,12 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
pcpu_free_pages(chunk, pages, page_start, page_end);
}
-static struct pcpu_chunk *pcpu_create_chunk(enum pcpu_chunk_type type,
- gfp_t gfp)
+static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
{
struct pcpu_chunk *chunk;
struct vm_struct **vms;
- chunk = pcpu_alloc_chunk(type, gfp);
+ chunk = pcpu_alloc_chunk(gfp);
if (!chunk)
return NULL;
@@ -378,3 +377,33 @@ static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai)
/* no extra restriction */
return 0;
}
+
+/**
+ * pcpu_should_reclaim_chunk - determine if a chunk should go into reclaim
+ * @chunk: chunk of interest
+ *
+ * This is the entry point for percpu reclaim. If a chunk qualifies, it is then
+ * isolated and managed in separate lists at the back of pcpu_slot: sidelined
+ * and to_depopulate respectively. The to_depopulate list holds chunks slated
+ * for depopulation. They no longer contribute to pcpu_nr_empty_pop_pages once
+ * they are on this list. Once depopulated, they are moved onto the sidelined
+ * list which enables them to be pulled back in for allocation if no other chunk
+ * can satisfy the allocation.
+ */
+static bool pcpu_should_reclaim_chunk(struct pcpu_chunk *chunk)
+{
+ /* do not reclaim either the first chunk or reserved chunk */
+ if (chunk == pcpu_first_chunk || chunk == pcpu_reserved_chunk)
+ return false;
+
+ /*
+ * If it is isolated, it may be on the sidelined list so move it back to
+ * the to_depopulate list. If at least 1/4 of its pages are empty AND
+ * there is no system-wide shortage of empty pages aside from this
+ * chunk, move it to the to_depopulate list.
+ */
+ return ((chunk->isolated && chunk->nr_empty_pop_pages) ||
+ (pcpu_nr_empty_pop_pages >
+ (PCPU_EMPTY_POP_PAGES_HIGH + chunk->nr_empty_pop_pages) &&
+ chunk->nr_empty_pop_pages >= chunk->nr_pages / 4));
+}
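[Reviewer note: the heuristic in pcpu_should_reclaim_chunk() above can be checked in isolation: a chunk is reclaimable if it is already isolated and still has empty populated pages, or if at least a quarter of its pages are empty while the rest of the system would keep more than PCPU_EMPTY_POP_PAGES_HIGH empty pages without it. A small stand-alone sketch with made-up numbers; the threshold value is an assumption for illustration, not taken from this diff.]

#include <stdbool.h>
#include <stdio.h>

#define PCPU_EMPTY_POP_PAGES_HIGH 4	/* assumed value for illustration */

struct chunk {
	bool isolated;
	int nr_empty_pop_pages;
	int nr_pages;
};

/* Same condition as pcpu_should_reclaim_chunk(), minus the first/reserved
 * chunk checks, evaluated against a global empty-page count. */
static bool should_reclaim(const struct chunk *c, int global_empty_pop_pages)
{
	return (c->isolated && c->nr_empty_pop_pages) ||
	       (global_empty_pop_pages >
		(PCPU_EMPTY_POP_PAGES_HIGH + c->nr_empty_pop_pages) &&
		c->nr_empty_pop_pages >= c->nr_pages / 4);
}

int main(void)
{
	struct chunk busy  = { .isolated = false, .nr_empty_pop_pages = 1, .nr_pages = 16 };
	struct chunk slack = { .isolated = false, .nr_empty_pop_pages = 8, .nr_pages = 16 };

	printf("busy chunk:  %d\n", should_reclaim(&busy, 20));	/* 0: below 1/4 empty */
	printf("slack chunk: %d\n", should_reclaim(&slack, 20));	/* 1: half empty, surplus elsewhere */
	return 0;
}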
diff --git a/mm/percpu.c b/mm/percpu.c
index f99e9306b939..b4cebeca4c0c 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -99,7 +99,10 @@
#include "percpu-internal.h"
-/* the slots are sorted by free bytes left, 1-31 bytes share the same slot */
+/*
+ * The slots are sorted by the size of the biggest continuous free area.
+ * 1-31 bytes share the same slot.
+ */
#define PCPU_SLOT_BASE_SHIFT 5
/* chunks in slots below this are subject to being sidelined on failed alloc */
#define PCPU_SLOT_FAIL_THRESHOLD 3
@@ -132,6 +135,9 @@ static int pcpu_unit_size __ro_after_init;
static int pcpu_nr_units __ro_after_init;
static int pcpu_atom_size __ro_after_init;
int pcpu_nr_slots __ro_after_init;
+static int pcpu_free_slot __ro_after_init;
+int pcpu_sidelined_slot __ro_after_init;
+int pcpu_to_depopulate_slot __ro_after_init;
static size_t pcpu_chunk_struct_size __ro_after_init;
/* cpus with the lowest and highest unit addresses */
@@ -173,10 +179,10 @@ struct list_head *pcpu_chunk_lists __ro_after_init; /* chunk list slots */
static LIST_HEAD(pcpu_map_extend_chunks);
/*
- * The number of empty populated pages by chunk type, protected by pcpu_lock.
+ * The number of empty populated pages, protected by pcpu_lock.
* The reserved chunk doesn't contribute to the count.
*/
-int pcpu_nr_empty_pop_pages[PCPU_NR_CHUNK_TYPES];
+int pcpu_nr_empty_pop_pages;
/*
* The number of populated pages in use by the allocator, protected by
@@ -234,7 +240,7 @@ static int __pcpu_size_to_slot(int size)
static int pcpu_size_to_slot(int size)
{
if (size == pcpu_unit_size)
- return pcpu_nr_slots - 1;
+ return pcpu_free_slot;
return __pcpu_size_to_slot(size);
}
@@ -303,6 +309,25 @@ static unsigned long pcpu_block_off_to_off(int index, int off)
return index * PCPU_BITMAP_BLOCK_BITS + off;
}
+/**
+ * pcpu_check_block_hint - check against the contig hint
+ * @block: block of interest
+ * @bits: size of allocation
+ * @align: alignment of area (max PAGE_SIZE)
+ *
+ * Check to see if the allocation can fit in the block's contig hint.
+ * Note, a chunk uses the same hints as a block so this can also check against
+ * the chunk's contig hint.
+ */
+static bool pcpu_check_block_hint(struct pcpu_block_md *block, int bits,
+ size_t align)
+{
+ int bit_off = ALIGN(block->contig_hint_start, align) -
+ block->contig_hint_start;
+
+ return bit_off + bits <= block->contig_hint;
+}
+
/*
* pcpu_next_hint - determine which hint to use
* @block: block of interest
@@ -507,13 +532,10 @@ static void __pcpu_chunk_move(struct pcpu_chunk *chunk, int slot,
bool move_front)
{
if (chunk != pcpu_reserved_chunk) {
- struct list_head *pcpu_slot;
-
- pcpu_slot = pcpu_chunk_list(pcpu_chunk_type(chunk));
if (move_front)
- list_move(&chunk->list, &pcpu_slot[slot]);
+ list_move(&chunk->list, &pcpu_chunk_lists[slot]);
else
- list_move_tail(&chunk->list, &pcpu_slot[slot]);
+ list_move_tail(&chunk->list, &pcpu_chunk_lists[slot]);
}
}
@@ -539,10 +561,36 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
{
int nslot = pcpu_chunk_slot(chunk);
+ /* leave isolated chunks in-place */
+ if (chunk->isolated)
+ return;
+
if (oslot != nslot)
__pcpu_chunk_move(chunk, nslot, oslot < nslot);
}
+static void pcpu_isolate_chunk(struct pcpu_chunk *chunk)
+{
+ lockdep_assert_held(&pcpu_lock);
+
+ if (!chunk->isolated) {
+ chunk->isolated = true;
+ pcpu_nr_empty_pop_pages -= chunk->nr_empty_pop_pages;
+ }
+ list_move(&chunk->list, &pcpu_chunk_lists[pcpu_to_depopulate_slot]);
+}
+
+static void pcpu_reintegrate_chunk(struct pcpu_chunk *chunk)
+{
+ lockdep_assert_held(&pcpu_lock);
+
+ if (chunk->isolated) {
+ chunk->isolated = false;
+ pcpu_nr_empty_pop_pages += chunk->nr_empty_pop_pages;
+ pcpu_chunk_relocate(chunk, -1);
+ }
+}
+
/*
* pcpu_update_empty_pages - update empty page counters
* @chunk: chunk of interest
@@ -555,8 +603,8 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
static inline void pcpu_update_empty_pages(struct pcpu_chunk *chunk, int nr)
{
chunk->nr_empty_pop_pages += nr;
- if (chunk != pcpu_reserved_chunk)
- pcpu_nr_empty_pop_pages[pcpu_chunk_type(chunk)] += nr;
+ if (chunk != pcpu_reserved_chunk && !chunk->isolated)
+ pcpu_nr_empty_pop_pages += nr;
}
/*
@@ -1063,14 +1111,11 @@ static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits,
int bit_off, bits, next_off;
/*
- * Check to see if the allocation can fit in the chunk's contig hint.
- * This is an optimization to prevent scanning by assuming if it
- * cannot fit in the global hint, there is memory pressure and creating
- * a new chunk would happen soon.
+ * This is an optimization to prevent scanning by assuming if the
+ * allocation cannot fit in the global hint, there is memory pressure
+ * and creating a new chunk would happen soon.
*/
- bit_off = ALIGN(chunk_md->contig_hint_start, align) -
- chunk_md->contig_hint_start;
- if (bit_off + alloc_bits > chunk_md->contig_hint)
+ if (!pcpu_check_block_hint(chunk_md, alloc_bits, align))
return -1;
bit_off = pcpu_next_hint(chunk_md, alloc_bits);
@@ -1352,7 +1397,7 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
alloc_size);
#ifdef CONFIG_MEMCG_KMEM
- /* first chunk isn't memcg-aware */
+ /* first chunk is free to use */
chunk->obj_cgroups = NULL;
#endif
pcpu_init_md_blocks(chunk);
@@ -1394,7 +1439,7 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
return chunk;
}
-static struct pcpu_chunk *pcpu_alloc_chunk(enum pcpu_chunk_type type, gfp_t gfp)
+static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp)
{
struct pcpu_chunk *chunk;
int region_bits;
@@ -1423,7 +1468,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(enum pcpu_chunk_type type, gfp_t gfp)
goto md_blocks_fail;
#ifdef CONFIG_MEMCG_KMEM
- if (pcpu_is_memcg_chunk(type)) {
+ if (!mem_cgroup_kmem_disabled()) {
chunk->obj_cgroups =
pcpu_mem_zalloc(pcpu_chunk_map_bits(chunk) *
sizeof(struct obj_cgroup *), gfp);
@@ -1536,8 +1581,7 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
int page_start, int page_end, gfp_t gfp);
static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
int page_start, int page_end);
-static struct pcpu_chunk *pcpu_create_chunk(enum pcpu_chunk_type type,
- gfp_t gfp);
+static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp);
static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
static struct page *pcpu_addr_to_page(void *addr);
static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);
@@ -1580,25 +1624,25 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
}
#ifdef CONFIG_MEMCG_KMEM
-static enum pcpu_chunk_type pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp,
- struct obj_cgroup **objcgp)
+static bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp,
+ struct obj_cgroup **objcgp)
{
struct obj_cgroup *objcg;
if (!memcg_kmem_enabled() || !(gfp & __GFP_ACCOUNT))
- return PCPU_CHUNK_ROOT;
+ return true;
objcg = get_obj_cgroup_from_current();
if (!objcg)
- return PCPU_CHUNK_ROOT;
+ return true;
if (obj_cgroup_charge(objcg, gfp, size * num_possible_cpus())) {
obj_cgroup_put(objcg);
- return PCPU_FAIL_ALLOC;
+ return false;
}
*objcgp = objcg;
- return PCPU_CHUNK_MEMCG;
+ return true;
}
static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
@@ -1608,7 +1652,7 @@ static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
if (!objcg)
return;
- if (chunk) {
+ if (likely(chunk && chunk->obj_cgroups)) {
chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = objcg;
rcu_read_lock();
@@ -1625,10 +1669,12 @@ static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
{
struct obj_cgroup *objcg;
- if (!pcpu_is_memcg_chunk(pcpu_chunk_type(chunk)))
+ if (unlikely(!chunk->obj_cgroups))
return;
objcg = chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT];
+ if (!objcg)
+ return;
chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = NULL;
obj_cgroup_uncharge(objcg, size * num_possible_cpus());
@@ -1642,10 +1688,10 @@ static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
}
#else /* CONFIG_MEMCG_KMEM */
-static enum pcpu_chunk_type
+static bool
pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, struct obj_cgroup **objcgp)
{
- return PCPU_CHUNK_ROOT;
+ return true;
}
static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
@@ -1680,8 +1726,6 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
gfp_t pcpu_gfp;
bool is_atomic;
bool do_warn;
- enum pcpu_chunk_type type;
- struct list_head *pcpu_slot;
struct obj_cgroup *objcg = NULL;
static int warn_limit = 10;
struct pcpu_chunk *chunk, *next;
@@ -1717,10 +1761,8 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
return NULL;
}
- type = pcpu_memcg_pre_alloc_hook(size, gfp, &objcg);
- if (unlikely(type == PCPU_FAIL_ALLOC))
+ if (unlikely(!pcpu_memcg_pre_alloc_hook(size, gfp, &objcg)))
return NULL;
- pcpu_slot = pcpu_chunk_list(type);
if (!is_atomic) {
/*
@@ -1758,8 +1800,9 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
restart:
/* search through normal chunks */
- for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
- list_for_each_entry_safe(chunk, next, &pcpu_slot[slot], list) {
+ for (slot = pcpu_size_to_slot(size); slot <= pcpu_free_slot; slot++) {
+ list_for_each_entry_safe(chunk, next, &pcpu_chunk_lists[slot],
+ list) {
off = pcpu_find_block_fit(chunk, bits, bit_align,
is_atomic);
if (off < 0) {
@@ -1769,9 +1812,10 @@ restart:
}
off = pcpu_alloc_area(chunk, bits, bit_align, off);
- if (off >= 0)
+ if (off >= 0) {
+ pcpu_reintegrate_chunk(chunk);
goto area_found;
-
+ }
}
}
@@ -1787,8 +1831,8 @@ restart:
goto fail;
}
- if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
- chunk = pcpu_create_chunk(type, pcpu_gfp);
+ if (list_empty(&pcpu_chunk_lists[pcpu_free_slot])) {
+ chunk = pcpu_create_chunk(pcpu_gfp);
if (!chunk) {
err = "failed to allocate new chunk";
goto fail;
@@ -1832,7 +1876,7 @@ area_found:
mutex_unlock(&pcpu_alloc_mutex);
}
- if (pcpu_nr_empty_pop_pages[type] < PCPU_EMPTY_POP_PAGES_LOW)
+ if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
pcpu_schedule_balance_work();
/* clear the areas and return address relative to base address */
@@ -1930,33 +1974,28 @@ void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
}
/**
- * __pcpu_balance_workfn - manage the amount of free chunks and populated pages
- * @type: chunk type
+ * pcpu_balance_free - manage the amount of free chunks
+ * @empty_only: free chunks only if there are no populated pages
*
- * Reclaim all fully free chunks except for the first one. This is also
- * responsible for maintaining the pool of empty populated pages. However,
- * it is possible that this is called when physical memory is scarce causing
- * OOM killer to be triggered. We should avoid doing so until an actual
- * allocation causes the failure as it is possible that requests can be
- * serviced from already backed regions.
+ * If empty_only is %false, reclaim all fully free chunks regardless of the
+ * number of populated pages. Otherwise, only reclaim chunks that have no
+ * populated pages.
+ *
+ * CONTEXT:
+ * pcpu_lock (can be dropped temporarily)
*/
-static void __pcpu_balance_workfn(enum pcpu_chunk_type type)
+static void pcpu_balance_free(bool empty_only)
{
- /* gfp flags passed to underlying allocators */
- const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
LIST_HEAD(to_free);
- struct list_head *pcpu_slot = pcpu_chunk_list(type);
- struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1];
+ struct list_head *free_head = &pcpu_chunk_lists[pcpu_free_slot];
struct pcpu_chunk *chunk, *next;
- int slot, nr_to_pop, ret;
+
+ lockdep_assert_held(&pcpu_lock);
/*
* There's no reason to keep around multiple unused chunks and VM
* areas can be scarce. Destroy all free chunks except for one.
*/
- mutex_lock(&pcpu_alloc_mutex);
- spin_lock_irq(&pcpu_lock);
-
list_for_each_entry_safe(chunk, next, free_head, list) {
WARN_ON(chunk->immutable);
@@ -1964,11 +2003,14 @@ static void __pcpu_balance_workfn(enum pcpu_chunk_type type)
if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
continue;
- list_move(&chunk->list, &to_free);
+ if (!empty_only || chunk->nr_empty_pop_pages == 0)
+ list_move(&chunk->list, &to_free);
}
- spin_unlock_irq(&pcpu_lock);
+ if (list_empty(&to_free))
+ return;
+ spin_unlock_irq(&pcpu_lock);
list_for_each_entry_safe(chunk, next, &to_free, list) {
unsigned int rs, re;
@@ -1982,6 +2024,29 @@ static void __pcpu_balance_workfn(enum pcpu_chunk_type type)
pcpu_destroy_chunk(chunk);
cond_resched();
}
+ spin_lock_irq(&pcpu_lock);
+}
+
+/**
+ * pcpu_balance_populated - manage the amount of populated pages
+ *
+ * Maintain a certain amount of populated pages to satisfy atomic allocations.
+ * It is possible that this is called when physical memory is scarce causing
+ * OOM killer to be triggered. We should avoid doing so until an actual
+ * allocation causes the failure as it is possible that requests can be
+ * serviced from already backed regions.
+ *
+ * CONTEXT:
+ * pcpu_lock (can be dropped temporarily)
+ */
+static void pcpu_balance_populated(void)
+{
+ /* gfp flags passed to underlying allocators */
+ const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
+ struct pcpu_chunk *chunk;
+ int slot, nr_to_pop, ret;
+
+ lockdep_assert_held(&pcpu_lock);
/*
* Ensure there are certain number of free populated pages for
@@ -2000,23 +2065,21 @@ retry_pop:
pcpu_atomic_alloc_failed = false;
} else {
nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH -
- pcpu_nr_empty_pop_pages[type],
+ pcpu_nr_empty_pop_pages,
0, PCPU_EMPTY_POP_PAGES_HIGH);
}
- for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) {
+ for (slot = pcpu_size_to_slot(PAGE_SIZE); slot <= pcpu_free_slot; slot++) {
unsigned int nr_unpop = 0, rs, re;
if (!nr_to_pop)
break;
- spin_lock_irq(&pcpu_lock);
- list_for_each_entry(chunk, &pcpu_slot[slot], list) {
+ list_for_each_entry(chunk, &pcpu_chunk_lists[slot], list) {
nr_unpop = chunk->nr_pages - chunk->nr_populated;
if (nr_unpop)
break;
}
- spin_unlock_irq(&pcpu_lock);
if (!nr_unpop)
continue;
@@ -2026,12 +2089,13 @@ retry_pop:
chunk->nr_pages) {
int nr = min_t(int, re - rs, nr_to_pop);
+ spin_unlock_irq(&pcpu_lock);
ret = pcpu_populate_chunk(chunk, rs, rs + nr, gfp);
+ cond_resched();
+ spin_lock_irq(&pcpu_lock);
if (!ret) {
nr_to_pop -= nr;
- spin_lock_irq(&pcpu_lock);
pcpu_chunk_populated(chunk, rs, rs + nr);
- spin_unlock_irq(&pcpu_lock);
} else {
nr_to_pop = 0;
}
@@ -2043,30 +2107,133 @@ retry_pop:
if (nr_to_pop) {
/* ran out of chunks to populate, create a new one and retry */
- chunk = pcpu_create_chunk(type, gfp);
+ spin_unlock_irq(&pcpu_lock);
+ chunk = pcpu_create_chunk(gfp);
+ cond_resched();
+ spin_lock_irq(&pcpu_lock);
if (chunk) {
- spin_lock_irq(&pcpu_lock);
pcpu_chunk_relocate(chunk, -1);
- spin_unlock_irq(&pcpu_lock);
goto retry_pop;
}
}
+}
- mutex_unlock(&pcpu_alloc_mutex);
+/**
+ * pcpu_reclaim_populated - scan over to_depopulate chunks and free empty pages
+ *
+ * Scan over chunks in the depopulate list and try to release unused populated
+ * pages back to the system. Depopulated chunks are sidelined to prevent
+ * repopulating these pages unless required. Fully free chunks are reintegrated
+ * and freed accordingly (1 is kept around). If we drop below the empty
+ * populated pages threshold, reintegrate the chunk if it has empty free pages.
+ * Each chunk is scanned in the reverse order to keep populated pages close to
+ * the beginning of the chunk.
+ *
+ * CONTEXT:
+ * pcpu_lock (can be dropped temporarily)
+ *
+ */
+static void pcpu_reclaim_populated(void)
+{
+ struct pcpu_chunk *chunk;
+ struct pcpu_block_md *block;
+ int i, end;
+
+ lockdep_assert_held(&pcpu_lock);
+
+restart:
+ /*
+ * Once a chunk is isolated to the to_depopulate list, the chunk is no
+ * longer discoverable to allocations which may populate pages. The only
+ * other accessor is the free path which only returns area back to the
+ * allocator not touching the populated bitmap.
+ */
+ while (!list_empty(&pcpu_chunk_lists[pcpu_to_depopulate_slot])) {
+ chunk = list_first_entry(&pcpu_chunk_lists[pcpu_to_depopulate_slot],
+ struct pcpu_chunk, list);
+ WARN_ON(chunk->immutable);
+
+ /*
+ * Scan chunk's pages in the reverse order to keep populated
+ * pages close to the beginning of the chunk.
+ */
+ for (i = chunk->nr_pages - 1, end = -1; i >= 0; i--) {
+ /* no more work to do */
+ if (chunk->nr_empty_pop_pages == 0)
+ break;
+
+ /* reintegrate chunk to prevent atomic alloc failures */
+ if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_HIGH) {
+ pcpu_reintegrate_chunk(chunk);
+ goto restart;
+ }
+
+ /*
+ * If the page is empty and populated, start or
+ * extend the (i, end) range. If i == 0, decrease
+ * i and perform the depopulation to cover the last
+ * (first) page in the chunk.
+ */
+ block = chunk->md_blocks + i;
+ if (block->contig_hint == PCPU_BITMAP_BLOCK_BITS &&
+ test_bit(i, chunk->populated)) {
+ if (end == -1)
+ end = i;
+ if (i > 0)
+ continue;
+ i--;
+ }
+
+ /* depopulate if there is an active range */
+ if (end == -1)
+ continue;
+
+ spin_unlock_irq(&pcpu_lock);
+ pcpu_depopulate_chunk(chunk, i + 1, end + 1);
+ cond_resched();
+ spin_lock_irq(&pcpu_lock);
+
+ pcpu_chunk_depopulated(chunk, i + 1, end + 1);
+
+ /* reset the range and continue */
+ end = -1;
+ }
+
+ if (chunk->free_bytes == pcpu_unit_size)
+ pcpu_reintegrate_chunk(chunk);
+ else
+ list_move(&chunk->list,
+ &pcpu_chunk_lists[pcpu_sidelined_slot]);
+ }
}
/**
* pcpu_balance_workfn - manage the amount of free chunks and populated pages
* @work: unused
*
- * Call __pcpu_balance_workfn() for each chunk type.
+ * Manage the number of fully free chunks and the number of populated pages. An
+ * important thing to consider is when pages are freed and how they contribute
+ * to the global counts.
*/
static void pcpu_balance_workfn(struct work_struct *work)
{
- enum pcpu_chunk_type type;
+ /*
+ * pcpu_balance_free() is called twice because the first time we may
+ * trim pages in the active pcpu_nr_empty_pop_pages which may cause us
+ * to grow other chunks. This then gives pcpu_reclaim_populated() time
+ * to move fully free chunks to the active list to be freed if
+ * appropriate.
+ */
+ mutex_lock(&pcpu_alloc_mutex);
+ spin_lock_irq(&pcpu_lock);
+
+ pcpu_balance_free(false);
+ pcpu_reclaim_populated();
+ pcpu_balance_populated();
+ pcpu_balance_free(true);
- for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++)
- __pcpu_balance_workfn(type);
+ spin_unlock_irq(&pcpu_lock);
+ mutex_unlock(&pcpu_alloc_mutex);
}
/**
@@ -2085,7 +2252,6 @@ void free_percpu(void __percpu *ptr)
unsigned long flags;
int size, off;
bool need_balance = false;
- struct list_head *pcpu_slot;
if (!ptr)
return;
@@ -2101,19 +2267,24 @@ void free_percpu(void __percpu *ptr)
size = pcpu_free_area(chunk, off);
- pcpu_slot = pcpu_chunk_list(pcpu_chunk_type(chunk));
-
pcpu_memcg_free_hook(chunk, off, size);
- /* if there are more than one fully free chunks, wake up grim reaper */
- if (chunk->free_bytes == pcpu_unit_size) {
+ /*
+ * If there is more than one fully free chunk, wake up the grim reaper.
+ * If the chunk is isolated, it may be in the process of being
+ * reclaimed. Let reclaim manage cleaning up of that chunk.
+ */
+ if (!chunk->isolated && chunk->free_bytes == pcpu_unit_size) {
struct pcpu_chunk *pos;
- list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
+ list_for_each_entry(pos, &pcpu_chunk_lists[pcpu_free_slot], list)
if (pos != chunk) {
need_balance = true;
break;
}
+ } else if (pcpu_should_reclaim_chunk(chunk)) {
+ pcpu_isolate_chunk(chunk);
+ need_balance = true;
}
trace_percpu_free_percpu(chunk->base_addr, off, ptr);
@@ -2414,7 +2585,6 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
int map_size;
unsigned long tmp_addr;
size_t alloc_size;
- enum pcpu_chunk_type type;
#define PCPU_SETUP_BUG_ON(cond) do { \
if (unlikely(cond)) { \
@@ -2528,22 +2698,24 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
pcpu_stats_save_ai(ai);
/*
- * Allocate chunk slots. The additional last slot is for
- * empty chunks.
+ * Allocate chunk slots. The slots after the active slots are:
+ * sidelined_slot - isolated, depopulated chunks
+ * free_slot - fully free chunks
+ * to_depopulate_slot - isolated, chunks to depopulate
*/
- pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
+ pcpu_sidelined_slot = __pcpu_size_to_slot(pcpu_unit_size) + 1;
+ pcpu_free_slot = pcpu_sidelined_slot + 1;
+ pcpu_to_depopulate_slot = pcpu_free_slot + 1;
+ pcpu_nr_slots = pcpu_to_depopulate_slot + 1;
pcpu_chunk_lists = memblock_alloc(pcpu_nr_slots *
- sizeof(pcpu_chunk_lists[0]) *
- PCPU_NR_CHUNK_TYPES,
+ sizeof(pcpu_chunk_lists[0]),
SMP_CACHE_BYTES);
if (!pcpu_chunk_lists)
panic("%s: Failed to allocate %zu bytes\n", __func__,
- pcpu_nr_slots * sizeof(pcpu_chunk_lists[0]) *
- PCPU_NR_CHUNK_TYPES);
+ pcpu_nr_slots * sizeof(pcpu_chunk_lists[0]));
- for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++)
- for (i = 0; i < pcpu_nr_slots; i++)
- INIT_LIST_HEAD(&pcpu_chunk_list(type)[i]);
+ for (i = 0; i < pcpu_nr_slots; i++)
+ INIT_LIST_HEAD(&pcpu_chunk_lists[i]);
/*
* The end of the static region needs to be aligned with the
@@ -2580,7 +2752,7 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
/* link the first chunk in */
pcpu_first_chunk = chunk;
- pcpu_nr_empty_pop_pages[PCPU_CHUNK_ROOT] = pcpu_first_chunk->nr_empty_pop_pages;
+ pcpu_nr_empty_pop_pages = pcpu_first_chunk->nr_empty_pop_pages;
pcpu_chunk_relocate(pcpu_first_chunk, -1);
/* include all regions of the first chunk */
@@ -2733,6 +2905,7 @@ static struct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info(
* Related to atom_size, which could be much larger than the unit_size.
*/
last_allocs = INT_MAX;
+ best_upa = 0;
for (upa = max_upa; upa; upa--) {
int allocs = 0, wasted = 0;
@@ -2759,6 +2932,7 @@ static struct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info(
last_allocs = allocs;
best_upa = upa;
}
+ BUG_ON(!best_upa);
upa = best_upa;
/* allocate and fill alloc_info */
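[Reviewer note: the slot-allocation hunk in pcpu_setup_first_chunk() above replaces the old "two extra slots" scheme with three named slots past the last size-indexed slot. Assuming __pcpu_size_to_slot(pcpu_unit_size) returns some value N (the formula itself is not part of this diff, so the number below is hypothetical), the layout works out as sketched here.]

#include <stdio.h>

/* Hypothetical result of __pcpu_size_to_slot(pcpu_unit_size); the real value
 * depends on the unit size and PCPU_SLOT_BASE_SHIFT. */
#define LAST_SIZE_SLOT 11

int main(void)
{
	int pcpu_sidelined_slot     = LAST_SIZE_SLOT + 1;       /* isolated, depopulated chunks */
	int pcpu_free_slot          = pcpu_sidelined_slot + 1;   /* fully free chunks */
	int pcpu_to_depopulate_slot = pcpu_free_slot + 1;        /* isolated, chunks to depopulate */
	int pcpu_nr_slots           = pcpu_to_depopulate_slot + 1;

	printf("sidelined=%d free=%d to_depopulate=%d nr_slots=%d\n",
	       pcpu_sidelined_slot, pcpu_free_slot,
	       pcpu_to_depopulate_slot, pcpu_nr_slots);
	return 0;
}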
diff --git a/mm/shmem.c b/mm/shmem.c
index bd2ffd861ddb..70d9ce294bb4 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2249,7 +2249,7 @@ static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
}
#endif
-int shmem_lock(struct file *file, int lock, struct user_struct *user)
+int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
{
struct inode *inode = file_inode(file);
struct shmem_inode_info *info = SHMEM_I(inode);
@@ -2261,13 +2261,13 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
* no serialization needed when called from shm_destroy().
*/
if (lock && !(info->flags & VM_LOCKED)) {
- if (!user_shm_lock(inode->i_size, user))
+ if (!user_shm_lock(inode->i_size, ucounts))
goto out_nomem;
info->flags |= VM_LOCKED;
mapping_set_unevictable(file->f_mapping);
}
- if (!lock && (info->flags & VM_LOCKED) && user) {
- user_shm_unlock(inode->i_size, user);
+ if (!lock && (info->flags & VM_LOCKED) && ucounts) {
+ user_shm_unlock(inode->i_size, ucounts);
info->flags &= ~VM_LOCKED;
mapping_clear_unevictable(file->f_mapping);
}
@@ -4059,7 +4059,7 @@ int shmem_unuse(unsigned int type, bool frontswap,
return 0;
}
-int shmem_lock(struct file *file, int lock, struct user_struct *user)
+int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
{
return 0;
}