Diffstat (limited to 'arch/s390/mm')
| Mode | File | Lines |
| --- | --- | --- |
| -rw-r--r-- | arch/s390/mm/Makefile | 7 |
| -rw-r--r-- | arch/s390/mm/cmm.c | 54 |
| -rw-r--r-- | arch/s390/mm/dump_pagetables.c | 287 |
| -rw-r--r-- | arch/s390/mm/extable.c | 96 |
| -rw-r--r-- | arch/s390/mm/extmem.c | 83 |
| -rw-r--r-- | arch/s390/mm/fault.c | 886 |
| -rw-r--r-- | arch/s390/mm/gmap.c | 1004 |
| -rw-r--r-- | arch/s390/mm/gmap_helpers.c | 233 |
| -rw-r--r-- | arch/s390/mm/hugetlbpage.c | 165 |
| -rw-r--r-- | arch/s390/mm/init.c | 195 |
| -rw-r--r-- | arch/s390/mm/kasan_init.c | 403 |
| -rw-r--r-- | arch/s390/mm/maccess.c | 48 |
| -rw-r--r-- | arch/s390/mm/mmap.c | 101 |
| -rw-r--r-- | arch/s390/mm/page-states.c | 211 |
| -rw-r--r-- | arch/s390/mm/pageattr.c | 140 |
| -rw-r--r-- | arch/s390/mm/pfault.c | 248 |
| -rw-r--r-- | arch/s390/mm/pgalloc.c | 381 |
| -rw-r--r-- | arch/s390/mm/pgtable.c | 195 |
| -rw-r--r-- | arch/s390/mm/physaddr.c | 15 |
| -rw-r--r-- | arch/s390/mm/vmem.c | 204 |
20 files changed, 1951 insertions, 3005 deletions
diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile index 57e4f3a24829..bd0401cc7ca5 100644 --- a/arch/s390/mm/Makefile +++ b/arch/s390/mm/Makefile @@ -7,9 +7,10 @@ obj-y := init.o fault.o extmem.o mmap.o vmem.o maccess.o obj-y += page-states.o pageattr.o pgtable.o pgalloc.o extable.o obj-$(CONFIG_CMM) += cmm.o +obj-$(CONFIG_DEBUG_VIRTUAL) += physaddr.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o -obj-$(CONFIG_PTDUMP_CORE) += dump_pagetables.o +obj-$(CONFIG_PTDUMP) += dump_pagetables.o obj-$(CONFIG_PGSTE) += gmap.o +obj-$(CONFIG_PFAULT) += pfault.o -KASAN_SANITIZE_kasan_init.o := n -obj-$(CONFIG_KASAN) += kasan_init.o +obj-$(subst m,y,$(CONFIG_KVM)) += gmap_helpers.o diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c index 9141ed4c52e9..eb7ef63fab1e 100644 --- a/arch/s390/mm/cmm.c +++ b/arch/s390/mm/cmm.c @@ -90,16 +90,17 @@ static long cmm_alloc_pages(long nr, long *counter, } else free_page((unsigned long) npa); } - diag10_range(virt_to_pfn(addr), 1); + diag10_range(virt_to_pfn((void *)addr), 1); pa->pages[pa->index++] = addr; (*counter)++; spin_unlock(&cmm_lock); nr--; + cond_resched(); } return nr; } -static long cmm_free_pages(long nr, long *counter, struct cmm_page_array **list) +static long __cmm_free_pages(long nr, long *counter, struct cmm_page_array **list) { struct cmm_page_array *pa; unsigned long addr; @@ -123,6 +124,21 @@ static long cmm_free_pages(long nr, long *counter, struct cmm_page_array **list) return nr; } +static long cmm_free_pages(long nr, long *counter, struct cmm_page_array **list) +{ + long inc = 0; + + while (nr) { + inc = min(256L, nr); + nr -= inc; + inc = __cmm_free_pages(inc, counter, list); + if (inc) + break; + cond_resched(); + } + return nr + inc; +} + static int cmm_oom_notify(struct notifier_block *self, unsigned long dummy, void *parm) { @@ -185,10 +201,10 @@ static void cmm_set_timer(void) { if (cmm_timed_pages_target <= 0 || cmm_timeout_seconds <= 0) { if (timer_pending(&cmm_timer)) - del_timer(&cmm_timer); + timer_delete(&cmm_timer); return; } - mod_timer(&cmm_timer, jiffies + msecs_to_jiffies(cmm_timeout_seconds * MSEC_PER_SEC)); + mod_timer(&cmm_timer, jiffies + secs_to_jiffies(cmm_timeout_seconds)); } static void cmm_timer_fn(struct timer_list *unused) @@ -243,7 +259,7 @@ static int cmm_skip_blanks(char *cp, char **endp) return str != cp; } -static int cmm_pages_handler(struct ctl_table *ctl, int write, +static int cmm_pages_handler(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { long nr = cmm_get_pages(); @@ -262,7 +278,7 @@ static int cmm_pages_handler(struct ctl_table *ctl, int write, return 0; } -static int cmm_timed_pages_handler(struct ctl_table *ctl, int write, +static int cmm_timed_pages_handler(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -282,7 +298,7 @@ static int cmm_timed_pages_handler(struct ctl_table *ctl, int write, return 0; } -static int cmm_timeout_handler(struct ctl_table *ctl, int write, +static int cmm_timeout_handler(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { char buf[64], *p; @@ -305,8 +321,8 @@ static int cmm_timeout_handler(struct ctl_table *ctl, int write, cmm_set_timeout(nr, seconds); *ppos += *lenp; } else { - len = sprintf(buf, "%ld %ld\n", - cmm_timeout_pages, cmm_timeout_seconds); + len = scnprintf(buf, sizeof(buf), "%ld %ld\n", + cmm_timeout_pages, cmm_timeout_seconds); if (len > *lenp) len = *lenp; memcpy(buffer, buf, len); @@ -316,7 +332,7 @@ static int cmm_timeout_handler(struct 
ctl_table *ctl, int write, return 0; } -static struct ctl_table cmm_table[] = { +static const struct ctl_table cmm_table[] = { { .procname = "cmm_pages", .mode = 0644, @@ -332,17 +348,6 @@ static struct ctl_table cmm_table[] = { .mode = 0644, .proc_handler = cmm_timeout_handler, }, - { } -}; - -static struct ctl_table cmm_dir_table[] = { - { - .procname = "vm", - .maxlen = 0, - .mode = 0555, - .child = cmm_table, - }, - { } }; #ifdef CONFIG_CMM_IUCV @@ -389,7 +394,7 @@ static int __init cmm_init(void) { int rc = -ENOMEM; - cmm_sysctl_header = register_sysctl_table(cmm_dir_table); + cmm_sysctl_header = register_sysctl("vm", cmm_table); if (!cmm_sysctl_header) goto out_sysctl; #ifdef CONFIG_CMM_IUCV @@ -419,7 +424,7 @@ out_smsg: #endif unregister_sysctl_table(cmm_sysctl_header); out_sysctl: - del_timer_sync(&cmm_timer); + timer_delete_sync(&cmm_timer); return rc; } module_init(cmm_init); @@ -432,10 +437,11 @@ static void __exit cmm_exit(void) #endif unregister_oom_notifier(&cmm_oom_nb); kthread_stop(cmm_thread_ptr); - del_timer_sync(&cmm_timer); + timer_delete_sync(&cmm_timer); cmm_free_pages(cmm_pages, &cmm_pages, &cmm_page_list); cmm_free_pages(cmm_timed_pages, &cmm_timed_pages, &cmm_timed_page_list); } module_exit(cmm_exit); +MODULE_DESCRIPTION("Cooperative memory management interface"); MODULE_LICENSE("GPL"); diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c index 9953819d7959..89badbe72ae7 100644 --- a/arch/s390/mm/dump_pagetables.c +++ b/arch/s390/mm/dump_pagetables.c @@ -1,12 +1,14 @@ // SPDX-License-Identifier: GPL-2.0 + +#include <linux/cpufeature.h> #include <linux/set_memory.h> #include <linux/ptdump.h> #include <linux/seq_file.h> #include <linux/debugfs.h> +#include <linux/sort.h> #include <linux/mm.h> #include <linux/kfence.h> #include <linux/kasan.h> -#include <asm/ptdump.h> #include <asm/kasan.h> #include <asm/abs_lowcore.h> #include <asm/nospec-branch.h> @@ -16,68 +18,14 @@ static unsigned long max_addr; struct addr_marker { + int is_start; unsigned long start_address; + unsigned long size; const char *name; }; -enum address_markers_idx { - IDENTITY_BEFORE_NR = 0, - IDENTITY_BEFORE_END_NR, - AMODE31_START_NR, - AMODE31_END_NR, - KERNEL_START_NR, - KERNEL_END_NR, -#ifdef CONFIG_KFENCE - KFENCE_START_NR, - KFENCE_END_NR, -#endif - IDENTITY_AFTER_NR, - IDENTITY_AFTER_END_NR, -#ifdef CONFIG_KASAN - KASAN_SHADOW_START_NR, - KASAN_SHADOW_END_NR, -#endif - VMEMMAP_NR, - VMEMMAP_END_NR, - VMALLOC_NR, - VMALLOC_END_NR, - MODULES_NR, - MODULES_END_NR, - ABS_LOWCORE_NR, - ABS_LOWCORE_END_NR, - MEMCPY_REAL_NR, - MEMCPY_REAL_END_NR, -}; - -static struct addr_marker address_markers[] = { - [IDENTITY_BEFORE_NR] = {0, "Identity Mapping Start"}, - [IDENTITY_BEFORE_END_NR] = {(unsigned long)_stext, "Identity Mapping End"}, - [AMODE31_START_NR] = {0, "Amode31 Area Start"}, - [AMODE31_END_NR] = {0, "Amode31 Area End"}, - [KERNEL_START_NR] = {(unsigned long)_stext, "Kernel Image Start"}, - [KERNEL_END_NR] = {(unsigned long)_end, "Kernel Image End"}, -#ifdef CONFIG_KFENCE - [KFENCE_START_NR] = {0, "KFence Pool Start"}, - [KFENCE_END_NR] = {0, "KFence Pool End"}, -#endif - [IDENTITY_AFTER_NR] = {(unsigned long)_end, "Identity Mapping Start"}, - [IDENTITY_AFTER_END_NR] = {0, "Identity Mapping End"}, -#ifdef CONFIG_KASAN - [KASAN_SHADOW_START_NR] = {KASAN_SHADOW_START, "Kasan Shadow Start"}, - [KASAN_SHADOW_END_NR] = {KASAN_SHADOW_END, "Kasan Shadow End"}, -#endif - [VMEMMAP_NR] = {0, "vmemmap Area Start"}, - [VMEMMAP_END_NR] = {0, "vmemmap Area End"}, - [VMALLOC_NR] = {0, 
"vmalloc Area Start"}, - [VMALLOC_END_NR] = {0, "vmalloc Area End"}, - [MODULES_NR] = {0, "Modules Area Start"}, - [MODULES_END_NR] = {0, "Modules Area End"}, - [ABS_LOWCORE_NR] = {0, "Lowcore Area Start"}, - [ABS_LOWCORE_END_NR] = {0, "Lowcore Area End"}, - [MEMCPY_REAL_NR] = {0, "Real Memory Copy Area Start"}, - [MEMCPY_REAL_END_NR] = {0, "Real Memory Copy Area End"}, - { -1, NULL } -}; +static struct addr_marker *markers; +static unsigned int markers_cnt; struct pg_state { struct ptdump_state ptdump; @@ -103,7 +51,7 @@ struct pg_state { struct seq_file *__m = (m); \ \ if (__m) \ - seq_printf(__m, fmt); \ + seq_puts(__m, fmt); \ }) static void print_prot(struct seq_file *m, unsigned int pr, int level) @@ -122,7 +70,6 @@ static void print_prot(struct seq_file *m, unsigned int pr, int level) static void note_prot_wx(struct pg_state *st, unsigned long addr) { -#ifdef CONFIG_DEBUG_WX if (!st->check_wx) return; if (st->current_prot & _PAGE_INVALID) @@ -137,12 +84,26 @@ static void note_prot_wx(struct pg_state *st, unsigned long addr) * in which case we have two lpswe instructions in lowcore that need * to be executable. */ - if (addr == PAGE_SIZE && (nospec_uses_trampoline() || !static_key_enabled(&cpu_has_bear))) + if (addr == PAGE_SIZE && (nospec_uses_trampoline() || !cpu_has_bear())) return; - WARN_ONCE(1, "s390/mm: Found insecure W+X mapping at address %pS\n", + WARN_ONCE(IS_ENABLED(CONFIG_DEBUG_WX), + "s390/mm: Found insecure W+X mapping at address %pS\n", (void *)st->start_address); st->wx_pages += (addr - st->start_address) / PAGE_SIZE; -#endif /* CONFIG_DEBUG_WX */ +} + +static void note_page_update_state(struct pg_state *st, unsigned long addr, unsigned int prot, int level) +{ + struct seq_file *m = st->seq; + + while (addr >= st->marker[1].start_address) { + st->marker++; + pt_dump_seq_printf(m, "---[ %s %s ]---\n", st->marker->name, + st->marker->is_start ? 
"Start" : "End"); + } + st->start_address = addr; + st->current_prot = prot; + st->level = level; } static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, u64 val) @@ -167,10 +128,8 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, if (level == -1) addr = max_addr; if (st->level == -1) { - pt_dump_seq_printf(m, "---[ %s ]---\n", st->marker->name); - st->start_address = addr; - st->current_prot = prot; - st->level = level; + pt_dump_seq_puts(m, "---[ Kernel Virtual Address Space ]---\n"); + note_page_update_state(st, addr, prot, level); } else if (prot != st->current_prot || level != st->level || addr >= st->marker[1].start_address) { note_prot_wx(st, addr); @@ -184,22 +143,52 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, } pt_dump_seq_printf(m, "%9lu%c ", delta, *unit); print_prot(m, st->current_prot, st->level); - while (addr >= st->marker[1].start_address) { - st->marker++; - pt_dump_seq_printf(m, "---[ %s ]---\n", st->marker->name); - } - st->start_address = addr; - st->current_prot = prot; - st->level = level; + note_page_update_state(st, addr, prot, level); } } -#ifdef CONFIG_DEBUG_WX -void ptdump_check_wx(void) +static void note_page_pte(struct ptdump_state *pt_st, unsigned long addr, pte_t pte) +{ + note_page(pt_st, addr, 4, pte_val(pte)); +} + +static void note_page_pmd(struct ptdump_state *pt_st, unsigned long addr, pmd_t pmd) +{ + note_page(pt_st, addr, 3, pmd_val(pmd)); +} + +static void note_page_pud(struct ptdump_state *pt_st, unsigned long addr, pud_t pud) +{ + note_page(pt_st, addr, 2, pud_val(pud)); +} + +static void note_page_p4d(struct ptdump_state *pt_st, unsigned long addr, p4d_t p4d) +{ + note_page(pt_st, addr, 1, p4d_val(p4d)); +} + +static void note_page_pgd(struct ptdump_state *pt_st, unsigned long addr, pgd_t pgd) +{ + note_page(pt_st, addr, 0, pgd_val(pgd)); +} + +static void note_page_flush(struct ptdump_state *pt_st) +{ + pte_t pte_zero = {0}; + + note_page(pt_st, 0, -1, pte_val(pte_zero)); +} + +bool ptdump_check_wx(void) { struct pg_state st = { .ptdump = { - .note_page = note_page, + .note_page_pte = note_page_pte, + .note_page_pmd = note_page_pmd, + .note_page_pud = note_page_pud, + .note_page_p4d = note_page_p4d, + .note_page_pgd = note_page_pgd, + .note_page_flush = note_page_flush, .range = (struct ptdump_range[]) { {.start = 0, .end = max_addr}, {.start = 0, .end = 0}, @@ -217,24 +206,33 @@ void ptdump_check_wx(void) }, }; - if (!MACHINE_HAS_NX) - return; + if (!cpu_has_nx()) + return true; ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); - if (st.wx_pages) + if (st.wx_pages) { pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found\n", st.wx_pages); - else + + return false; + } else { pr_info("Checked W+X mappings: passed, no %sW+X pages found\n", - (nospec_uses_trampoline() || !static_key_enabled(&cpu_has_bear)) ? + (nospec_uses_trampoline() || !cpu_has_bear()) ? 
"unexpected " : ""); + + return true; + } } -#endif /* CONFIG_DEBUG_WX */ #ifdef CONFIG_PTDUMP_DEBUGFS static int ptdump_show(struct seq_file *m, void *v) { struct pg_state st = { .ptdump = { - .note_page = note_page, + .note_page_pte = note_page_pte, + .note_page_pmd = note_page_pmd, + .note_page_pud = note_page_pud, + .note_page_p4d = note_page_p4d, + .note_page_pgd = note_page_pgd, + .note_page_flush = note_page_flush, .range = (struct ptdump_range[]) { {.start = 0, .end = max_addr}, {.start = 0, .end = 0}, @@ -246,35 +244,72 @@ static int ptdump_show(struct seq_file *m, void *v) .check_wx = false, .wx_pages = 0, .start_address = 0, - .marker = address_markers, + .marker = markers, }; - get_online_mems(); mutex_lock(&cpa_mutex); ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); mutex_unlock(&cpa_mutex); - put_online_mems(); return 0; } DEFINE_SHOW_ATTRIBUTE(ptdump); #endif /* CONFIG_PTDUMP_DEBUGFS */ -/* - * Heapsort from lib/sort.c is not a stable sorting algorithm, do a simple - * insertion sort to preserve the original order of markers with the same - * start address. - */ -static void sort_address_markers(void) +static int ptdump_cmp(const void *a, const void *b) { - struct addr_marker tmp; - int i, j; - - for (i = 1; i < ARRAY_SIZE(address_markers) - 1; i++) { - tmp = address_markers[i]; - for (j = i - 1; j >= 0 && address_markers[j].start_address > tmp.start_address; j--) - address_markers[j + 1] = address_markers[j]; - address_markers[j + 1] = tmp; + const struct addr_marker *ama = a; + const struct addr_marker *amb = b; + + if (ama->start_address > amb->start_address) + return 1; + if (ama->start_address < amb->start_address) + return -1; + /* + * If the start addresses of two markers are identical sort markers in an + * order that considers areas contained within other areas correctly. + */ + if (ama->is_start && amb->is_start) { + if (ama->size > amb->size) + return -1; + if (ama->size < amb->size) + return 1; + return 0; } + if (!ama->is_start && !amb->is_start) { + if (ama->size > amb->size) + return 1; + if (ama->size < amb->size) + return -1; + return 0; + } + if (ama->is_start) + return 1; + if (amb->is_start) + return -1; + return 0; +} + +static int add_marker(unsigned long start, unsigned long end, const char *name) +{ + struct addr_marker *new; + size_t newsize; + + newsize = (markers_cnt + 2) * sizeof(*markers); + new = kvrealloc(markers, newsize, GFP_KERNEL); + if (!new) + return -ENOMEM; + markers = new; + markers[markers_cnt].is_start = 1; + markers[markers_cnt].start_address = start; + markers[markers_cnt].size = end - start; + markers[markers_cnt].name = name; + markers_cnt++; + markers[markers_cnt].is_start = 0; + markers[markers_cnt].start_address = end; + markers[markers_cnt].size = end - start; + markers[markers_cnt].name = name; + markers_cnt++; + return 0; } static int pt_dump_init(void) @@ -282,34 +317,48 @@ static int pt_dump_init(void) #ifdef CONFIG_KFENCE unsigned long kfence_start = (unsigned long)__kfence_pool; #endif + unsigned long lowcore = (unsigned long)get_lowcore(); + int rc; + /* * Figure out the maximum virtual address being accessible with the * kernel ASCE. We need this to keep the page table walker functions * from accessing non-existent entries. 
*/ - max_addr = (S390_lowcore.kernel_asce & _REGION_ENTRY_TYPE_MASK) >> 2; + max_addr = (get_lowcore()->kernel_asce.val & _REGION_ENTRY_TYPE_MASK) >> 2; max_addr = 1UL << (max_addr * 11 + 31); - address_markers[IDENTITY_AFTER_END_NR].start_address = ident_map_size; - address_markers[AMODE31_START_NR].start_address = __samode31; - address_markers[AMODE31_END_NR].start_address = __eamode31; - address_markers[MODULES_NR].start_address = MODULES_VADDR; - address_markers[MODULES_END_NR].start_address = MODULES_END; - address_markers[ABS_LOWCORE_NR].start_address = __abs_lowcore; - address_markers[ABS_LOWCORE_END_NR].start_address = __abs_lowcore + ABS_LOWCORE_MAP_SIZE; - address_markers[MEMCPY_REAL_NR].start_address = __memcpy_real_area; - address_markers[MEMCPY_REAL_END_NR].start_address = __memcpy_real_area + PAGE_SIZE; - address_markers[VMEMMAP_NR].start_address = (unsigned long) vmemmap; - address_markers[VMEMMAP_END_NR].start_address = (unsigned long)vmemmap + vmemmap_size; - address_markers[VMALLOC_NR].start_address = VMALLOC_START; - address_markers[VMALLOC_END_NR].start_address = VMALLOC_END; + /* start + end markers - must be added first */ + rc = add_marker(0, -1UL, NULL); + rc |= add_marker((unsigned long)_stext, (unsigned long)_end, "Kernel Image"); + rc |= add_marker(lowcore, lowcore + sizeof(struct lowcore), "Lowcore"); + rc |= add_marker(__identity_base, __identity_base + ident_map_size, "Identity Mapping"); + rc |= add_marker((unsigned long)__samode31, (unsigned long)__eamode31, "Amode31 Area"); + rc |= add_marker(MODULES_VADDR, MODULES_END, "Modules Area"); + rc |= add_marker(__abs_lowcore, __abs_lowcore + ABS_LOWCORE_MAP_SIZE, "Lowcore Area"); + rc |= add_marker(__memcpy_real_area, __memcpy_real_area + MEMCPY_REAL_SIZE, "Real Memory Copy Area"); + rc |= add_marker((unsigned long)vmemmap, (unsigned long)vmemmap + vmemmap_size, "vmemmap Area"); + rc |= add_marker(VMALLOC_START, VMALLOC_END, "vmalloc Area"); #ifdef CONFIG_KFENCE - address_markers[KFENCE_START_NR].start_address = kfence_start; - address_markers[KFENCE_END_NR].start_address = kfence_start + KFENCE_POOL_SIZE; + rc |= add_marker(kfence_start, kfence_start + KFENCE_POOL_SIZE, "KFence Pool"); +#endif +#ifdef CONFIG_KMSAN + rc |= add_marker(KMSAN_VMALLOC_SHADOW_START, KMSAN_VMALLOC_SHADOW_END, "Kmsan vmalloc Shadow"); + rc |= add_marker(KMSAN_VMALLOC_ORIGIN_START, KMSAN_VMALLOC_ORIGIN_END, "Kmsan vmalloc Origins"); + rc |= add_marker(KMSAN_MODULES_SHADOW_START, KMSAN_MODULES_SHADOW_END, "Kmsan Modules Shadow"); + rc |= add_marker(KMSAN_MODULES_ORIGIN_START, KMSAN_MODULES_ORIGIN_END, "Kmsan Modules Origins"); +#endif +#ifdef CONFIG_KASAN + rc |= add_marker(KASAN_SHADOW_START, KASAN_SHADOW_END, "Kasan Shadow"); #endif - sort_address_markers(); + if (rc) + goto error; + sort(&markers[1], markers_cnt - 1, sizeof(*markers), ptdump_cmp, NULL); #ifdef CONFIG_PTDUMP_DEBUGFS debugfs_create_file("kernel_page_tables", 0400, NULL, NULL, &ptdump_fops); #endif /* CONFIG_PTDUMP_DEBUGFS */ return 0; +error: + kvfree(markers); + return -ENOMEM; } device_initcall(pt_dump_init); diff --git a/arch/s390/mm/extable.c b/arch/s390/mm/extable.c index 1e4d2187541a..7498e858c401 100644 --- a/arch/s390/mm/extable.c +++ b/arch/s390/mm/extable.c @@ -7,6 +7,7 @@ #include <linux/panic.h> #include <asm/asm-extable.h> #include <asm/extable.h> +#include <asm/fpu.h> const struct exception_table_entry *s390_search_extables(unsigned long addr) { @@ -26,7 +27,7 @@ static bool ex_handler_fixup(const struct exception_table_entry *ex, struct pt_r return true; } 
-static bool ex_handler_ua_store(const struct exception_table_entry *ex, struct pt_regs *regs) +static bool ex_handler_ua_fault(const struct exception_table_entry *ex, struct pt_regs *regs) { unsigned int reg_err = FIELD_GET(EX_DATA_REG_ERR, ex->data); @@ -35,26 +36,83 @@ static bool ex_handler_ua_store(const struct exception_table_entry *ex, struct p return true; } -static bool ex_handler_ua_load_mem(const struct exception_table_entry *ex, struct pt_regs *regs) +static bool ex_handler_ua_load_reg(const struct exception_table_entry *ex, + bool pair, struct pt_regs *regs) { - unsigned int reg_addr = FIELD_GET(EX_DATA_REG_ADDR, ex->data); + unsigned int reg_zero = FIELD_GET(EX_DATA_REG_ADDR, ex->data); unsigned int reg_err = FIELD_GET(EX_DATA_REG_ERR, ex->data); - size_t len = FIELD_GET(EX_DATA_LEN, ex->data); regs->gprs[reg_err] = -EFAULT; - memset((void *)regs->gprs[reg_addr], 0, len); + regs->gprs[reg_zero] = 0; + if (pair) + regs->gprs[reg_zero + 1] = 0; regs->psw.addr = extable_fixup(ex); return true; } -static bool ex_handler_ua_load_reg(const struct exception_table_entry *ex, struct pt_regs *regs) +static bool ex_handler_zeropad(const struct exception_table_entry *ex, struct pt_regs *regs) { - unsigned int reg_zero = FIELD_GET(EX_DATA_REG_ADDR, ex->data); - unsigned int reg_err = FIELD_GET(EX_DATA_REG_ERR, ex->data); + unsigned int reg_addr = FIELD_GET(EX_DATA_REG_ADDR, ex->data); + unsigned int reg_data = FIELD_GET(EX_DATA_REG_ERR, ex->data); + unsigned long data, addr, offset; - regs->gprs[reg_err] = -EFAULT; - regs->gprs[reg_zero] = 0; + addr = regs->gprs[reg_addr]; + offset = addr & (sizeof(unsigned long) - 1); + addr &= ~(sizeof(unsigned long) - 1); + data = *(unsigned long *)addr; + data <<= BITS_PER_BYTE * offset; + regs->gprs[reg_data] = data; + regs->psw.addr = extable_fixup(ex); + return true; +} + +static bool ex_handler_fpc(const struct exception_table_entry *ex, struct pt_regs *regs) +{ + fpu_sfpc(0); + regs->psw.addr = extable_fixup(ex); + return true; +} + +struct insn_ssf { + u64 opc1 : 8; + u64 r3 : 4; + u64 opc2 : 4; + u64 b1 : 4; + u64 d1 : 12; + u64 b2 : 4; + u64 d2 : 12; +} __packed; + +static bool ex_handler_ua_mvcos(const struct exception_table_entry *ex, + bool from, struct pt_regs *regs) +{ + unsigned long uaddr, remainder; + struct insn_ssf *insn; + + /* + * If the faulting user space access crossed a page boundary retry by + * limiting the access to the first page (adjust length accordingly). + * Then the mvcos instruction will either complete with condition code + * zero, or generate another fault where the user space access did not + * cross a page boundary. + * If the faulting user space access did not cross a page boundary set + * length to zero and retry. In this case no user space access will + * happen, and the mvcos instruction will complete with condition code + * zero. + * In both cases the instruction will complete with condition code + * zero (copying finished), and the register which contains the + * length, indicates the number of bytes copied. 
+ */ regs->psw.addr = extable_fixup(ex); + insn = (struct insn_ssf *)regs->psw.addr; + if (from) + uaddr = regs->gprs[insn->b2] + insn->d2; + else + uaddr = regs->gprs[insn->b1] + insn->d1; + remainder = PAGE_SIZE - (uaddr & (PAGE_SIZE - 1)); + if (regs->gprs[insn->r3] <= remainder) + remainder = 0; + regs->gprs[insn->r3] = remainder; return true; } @@ -70,12 +128,20 @@ bool fixup_exception(struct pt_regs *regs) return ex_handler_fixup(ex, regs); case EX_TYPE_BPF: return ex_handler_bpf(ex, regs); - case EX_TYPE_UA_STORE: - return ex_handler_ua_store(ex, regs); - case EX_TYPE_UA_LOAD_MEM: - return ex_handler_ua_load_mem(ex, regs); + case EX_TYPE_UA_FAULT: + return ex_handler_ua_fault(ex, regs); case EX_TYPE_UA_LOAD_REG: - return ex_handler_ua_load_reg(ex, regs); + return ex_handler_ua_load_reg(ex, false, regs); + case EX_TYPE_UA_LOAD_REGPAIR: + return ex_handler_ua_load_reg(ex, true, regs); + case EX_TYPE_ZEROPAD: + return ex_handler_zeropad(ex, regs); + case EX_TYPE_FPC: + return ex_handler_fpc(ex, regs); + case EX_TYPE_UA_MVCOS_TO: + return ex_handler_ua_mvcos(ex, false, regs); + case EX_TYPE_UA_MVCOS_FROM: + return ex_handler_ua_mvcos(ex, true, regs); } panic("invalid exception table entry"); } diff --git a/arch/s390/mm/extmem.c b/arch/s390/mm/extmem.c index 5060956b8e7d..6cc33c705de2 100644 --- a/arch/s390/mm/extmem.c +++ b/arch/s390/mm/extmem.c @@ -7,8 +7,7 @@ * Copyright IBM Corp. 2002, 2004 */ -#define KMSG_COMPONENT "extmem" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "extmem: " fmt #include <linux/kernel.h> #include <linux/string.h> @@ -21,6 +20,7 @@ #include <linux/ioport.h> #include <linux/refcount.h> #include <linux/pgtable.h> +#include <asm/machine.h> #include <asm/diag.h> #include <asm/page.h> #include <asm/ebcdic.h> @@ -28,6 +28,7 @@ #include <asm/extmem.h> #include <asm/cpcmd.h> #include <asm/setup.h> +#include <asm/asm.h> #define DCSS_PURGESEG 0x08 #define DCSS_LOADSHRX 0x20 @@ -134,20 +135,21 @@ dcss_diag(int *func, void *parameter, unsigned long *ret1, unsigned long *ret2) { unsigned long rx, ry; - int rc; + int cc; - rx = (unsigned long) parameter; + rx = virt_to_phys(parameter); ry = (unsigned long) *func; diag_stat_inc(DIAG_STAT_X064); asm volatile( - " diag %0,%1,0x64\n" - " ipm %2\n" - " srl %2,28\n" - : "+d" (rx), "+d" (ry), "=d" (rc) : : "cc"); + " diag %[rx],%[ry],0x64\n" + CC_IPM(cc) + : CC_OUT(cc, cc), [rx] "+d" (rx), [ry] "+d" (ry) + : + : CC_CLOBBER); *ret1 = rx; *ret2 = ry; - return rc; + return CC_TRANSFORM(cc); } static inline int @@ -178,7 +180,7 @@ query_segment_type (struct dcss_segment *seg) /* initialize diag input parameters */ qin->qopcode = DCSS_FINDSEGA; - qin->qoutptr = (unsigned long) qout; + qin->qoutptr = virt_to_phys(qout); qin->qoutlen = sizeof(struct qout64); memcpy (qin->qname, seg->dcss_name, 8); @@ -253,7 +255,7 @@ segment_type (char* name) int rc; struct dcss_segment seg; - if (!MACHINE_IS_VM) + if (!machine_is_vm()) return -ENOSYS; dcss_mkname(name, seg.dcss_name); @@ -289,15 +291,17 @@ segment_overlaps_others (struct dcss_segment *seg) /* * real segment loading function, called from segment_load + * Must return either an error code < 0, or the segment type code >= 0 */ static int __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long *end) { unsigned long start_addr, end_addr, dummy; struct dcss_segment *seg; - int rc, diag_cc; + int rc, diag_cc, segtype; start_addr = end_addr = 0; + segtype = -1; seg = kmalloc(sizeof(*seg), GFP_KERNEL | GFP_DMA); if (seg == NULL) { rc = -ENOMEM; @@ 
-326,9 +330,9 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long seg->res_name[8] = '\0'; strlcat(seg->res_name, " (DCSS)", sizeof(seg->res_name)); seg->res->name = seg->res_name; - rc = seg->vm_segtype; - if (rc == SEG_TYPE_SC || - ((rc == SEG_TYPE_SR || rc == SEG_TYPE_ER) && !do_nonshared)) + segtype = seg->vm_segtype; + if (segtype == SEG_TYPE_SC || + ((segtype == SEG_TYPE_SR || segtype == SEG_TYPE_ER) && !do_nonshared)) seg->res->flags |= IORESOURCE_READONLY; /* Check for overlapping resources before adding the mapping. */ @@ -386,7 +390,7 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long out_free: kfree(seg); out: - return rc; + return rc < 0 ? rc : segtype; } /* @@ -414,7 +418,7 @@ segment_load (char *name, int do_nonshared, unsigned long *addr, struct dcss_segment *seg; int rc; - if (!MACHINE_IS_VM) + if (!machine_is_vm()) return -ENOSYS; mutex_lock(&dcss_lock); @@ -525,6 +529,14 @@ segment_modify_shared (char *name, int do_nonshared) return rc; } +static void __dcss_diag_purge_on_cpu_0(void *data) +{ + struct dcss_segment *seg = (struct dcss_segment *)data; + unsigned long dummy; + + dcss_diag(&purgeseg_scode, seg->dcss_name, &dummy, &dummy); +} + /* * Decrease the use count of a DCSS segment and remove * it from the address space if nobody is using it @@ -533,10 +545,9 @@ segment_modify_shared (char *name, int do_nonshared) void segment_unload(char *name) { - unsigned long dummy; struct dcss_segment *seg; - if (!MACHINE_IS_VM) + if (!machine_is_vm()) return; mutex_lock(&dcss_lock); @@ -551,7 +562,14 @@ segment_unload(char *name) kfree(seg->res); vmem_remove_mapping(seg->start_addr, seg->end - seg->start_addr + 1); list_del(&seg->list); - dcss_diag(&purgeseg_scode, seg->dcss_name, &dummy, &dummy); + /* + * Workaround for z/VM issue, where calling the DCSS unload diag on + * a non-IPL CPU would cause bogus sclp maximum memory detection on + * next IPL. + * IPL CPU 0 cannot be set offline, so the dcss_diag() call can + * directly be scheduled to that CPU. 
+ */ + smp_call_function_single(0, __dcss_diag_purge_on_cpu_0, seg, 1); kfree(seg); out_unlock: mutex_unlock(&dcss_lock); @@ -568,7 +586,7 @@ segment_save(char *name) char cmd2[80]; int i, response; - if (!MACHINE_IS_VM) + if (!machine_is_vm()) return; mutex_lock(&dcss_lock); @@ -579,14 +597,16 @@ segment_save(char *name) goto out; } - sprintf(cmd1, "DEFSEG %s", name); + snprintf(cmd1, sizeof(cmd1), "DEFSEG %s", name); for (i=0; i<seg->segcnt; i++) { - sprintf(cmd1+strlen(cmd1), " %lX-%lX %s", - seg->range[i].start >> PAGE_SHIFT, - seg->range[i].end >> PAGE_SHIFT, - segtype_string[seg->range[i].start & 0xff]); + size_t len = strlen(cmd1); + + snprintf(cmd1 + len, sizeof(cmd1) - len, " %lX-%lX %s", + seg->range[i].start >> PAGE_SHIFT, + seg->range[i].end >> PAGE_SHIFT, + segtype_string[seg->range[i].start & 0xff]); } - sprintf(cmd2, "SAVESEG %s", name); + snprintf(cmd2, sizeof(cmd2), "SAVESEG %s", name); response = 0; cpcmd(cmd1, NULL, 0, &response); if (response) { @@ -638,10 +658,13 @@ void segment_warning(int rc, char *seg_name) pr_err("There is not enough memory to load or query " "DCSS %s\n", seg_name); break; - case -ERANGE: - pr_err("DCSS %s exceeds the kernel mapping range (%lu) " - "and cannot be loaded\n", seg_name, VMEM_MAX_PHYS); + case -ERANGE: { + struct range mhp_range = arch_get_mappable_range(); + + pr_err("DCSS %s exceeds the kernel mapping range (%llu) " + "and cannot be loaded\n", seg_name, mhp_range.end + 1); break; + } default: break; } diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 9649d9382e0a..e2e13778c36a 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -3,13 +3,15 @@ * S390 version * Copyright IBM Corp. 1999 * Author(s): Hartmut Penner (hp@de.ibm.com) - * Ulrich Weigand (uweigand@de.ibm.com) + * Ulrich Weigand (uweigand@de.ibm.com) * * Derived from "arch/i386/mm/fault.c" * Copyright (C) 1995 Linus Torvalds */ #include <linux/kernel_stat.h> +#include <linux/mmu_context.h> +#include <linux/cpufeature.h> #include <linux/perf_event.h> #include <linux/signal.h> #include <linux/sched.h> @@ -21,7 +23,6 @@ #include <linux/ptrace.h> #include <linux/mman.h> #include <linux/mm.h> -#include <linux/compat.h> #include <linux/smp.h> #include <linux/kdebug.h> #include <linux/init.h> @@ -32,123 +33,93 @@ #include <linux/uaccess.h> #include <linux/hugetlb.h> #include <linux/kfence.h> +#include <linux/pagewalk.h> #include <asm/asm-extable.h> #include <asm/asm-offsets.h> +#include <asm/ptrace.h> +#include <asm/fault.h> #include <asm/diag.h> -#include <asm/gmap.h> #include <asm/irq.h> -#include <asm/mmu_context.h> #include <asm/facility.h> #include <asm/uv.h> #include "../kernel/entry.h" -#define __FAIL_ADDR_MASK -4096L -#define __SUBCODE_MASK 0x0600 -#define __PF_RES_FIELD 0x8000000000000000ULL - -#define VM_FAULT_BADCONTEXT ((__force vm_fault_t) 0x010000) -#define VM_FAULT_BADMAP ((__force vm_fault_t) 0x020000) -#define VM_FAULT_BADACCESS ((__force vm_fault_t) 0x040000) -#define VM_FAULT_SIGNAL ((__force vm_fault_t) 0x080000) -#define VM_FAULT_PFAULT ((__force vm_fault_t) 0x100000) - -enum fault_type { - KERNEL_FAULT, - USER_FAULT, - GMAP_FAULT, -}; - -static unsigned long store_indication __read_mostly; - -static int __init fault_init(void) -{ - if (test_facility(75)) - store_indication = 0xc00; - return 0; -} -early_initcall(fault_init); - /* * Find out which address space caused the exception. 
*/ -static enum fault_type get_fault_type(struct pt_regs *regs) +static bool is_kernel_fault(struct pt_regs *regs) { - unsigned long trans_exc_code; + union teid teid = { .val = regs->int_parm_long }; - trans_exc_code = regs->int_parm_long & 3; - if (likely(trans_exc_code == 0)) { - /* primary space exception */ - if (user_mode(regs)) - return USER_FAULT; - if (!IS_ENABLED(CONFIG_PGSTE)) - return KERNEL_FAULT; - if (test_pt_regs_flag(regs, PIF_GUEST_FAULT)) - return GMAP_FAULT; - return KERNEL_FAULT; - } - if (trans_exc_code == 2) - return USER_FAULT; - if (trans_exc_code == 1) { - /* access register mode, not used in the kernel */ - return USER_FAULT; - } - /* home space exception -> access via kernel ASCE */ - return KERNEL_FAULT; + if (user_mode(regs)) + return false; + if (teid.as == PSW_BITS_AS_SECONDARY) + return false; + return true; } -static int bad_address(void *p) +static unsigned long get_fault_address(struct pt_regs *regs) { - unsigned long dummy; + union teid teid = { .val = regs->int_parm_long }; - return get_kernel_nofault(dummy, (unsigned long *)p); + return teid.addr * PAGE_SIZE; +} + +static __always_inline bool fault_is_write(struct pt_regs *regs) +{ + union teid teid = { .val = regs->int_parm_long }; + + if (test_facility(75)) + return teid.fsi == TEID_FSI_STORE; + return false; } static void dump_pagetable(unsigned long asce, unsigned long address) { - unsigned long *table = __va(asce & _ASCE_ORIGIN); + unsigned long entry, *table = __va(asce & _ASCE_ORIGIN); pr_alert("AS:%016lx ", asce); switch (asce & _ASCE_TYPE_MASK) { case _ASCE_TYPE_REGION1: table += (address & _REGION1_INDEX) >> _REGION1_SHIFT; - if (bad_address(table)) + if (get_kernel_nofault(entry, table)) goto bad; - pr_cont("R1:%016lx ", *table); - if (*table & _REGION_ENTRY_INVALID) + pr_cont("R1:%016lx ", entry); + if (entry & _REGION_ENTRY_INVALID) goto out; - table = __va(*table & _REGION_ENTRY_ORIGIN); + table = __va(entry & _REGION_ENTRY_ORIGIN); fallthrough; case _ASCE_TYPE_REGION2: table += (address & _REGION2_INDEX) >> _REGION2_SHIFT; - if (bad_address(table)) + if (get_kernel_nofault(entry, table)) goto bad; - pr_cont("R2:%016lx ", *table); - if (*table & _REGION_ENTRY_INVALID) + pr_cont("R2:%016lx ", entry); + if (entry & _REGION_ENTRY_INVALID) goto out; - table = __va(*table & _REGION_ENTRY_ORIGIN); + table = __va(entry & _REGION_ENTRY_ORIGIN); fallthrough; case _ASCE_TYPE_REGION3: table += (address & _REGION3_INDEX) >> _REGION3_SHIFT; - if (bad_address(table)) + if (get_kernel_nofault(entry, table)) goto bad; - pr_cont("R3:%016lx ", *table); - if (*table & (_REGION_ENTRY_INVALID | _REGION3_ENTRY_LARGE)) + pr_cont("R3:%016lx ", entry); + if (entry & (_REGION_ENTRY_INVALID | _REGION3_ENTRY_LARGE)) goto out; - table = __va(*table & _REGION_ENTRY_ORIGIN); + table = __va(entry & _REGION_ENTRY_ORIGIN); fallthrough; case _ASCE_TYPE_SEGMENT: table += (address & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; - if (bad_address(table)) + if (get_kernel_nofault(entry, table)) goto bad; - pr_cont("S:%016lx ", *table); - if (*table & (_SEGMENT_ENTRY_INVALID | _SEGMENT_ENTRY_LARGE)) + pr_cont("S:%016lx ", entry); + if (entry & (_SEGMENT_ENTRY_INVALID | _SEGMENT_ENTRY_LARGE)) goto out; - table = __va(*table & _SEGMENT_ENTRY_ORIGIN); + table = __va(entry & _SEGMENT_ENTRY_ORIGIN); } - table += (address & _PAGE_INDEX) >> _PAGE_SHIFT; - if (bad_address(table)) + table += (address & _PAGE_INDEX) >> PAGE_SHIFT; + if (get_kernel_nofault(entry, table)) goto bad; - pr_cont("P:%016lx ", *table); + pr_cont("P:%016lx ", entry); out: 
pr_cont("\n"); return; @@ -158,162 +129,127 @@ bad: static void dump_fault_info(struct pt_regs *regs) { + union teid teid = { .val = regs->int_parm_long }; unsigned long asce; - pr_alert("Failing address: %016lx TEID: %016lx\n", - regs->int_parm_long & __FAIL_ADDR_MASK, regs->int_parm_long); + pr_alert("Failing address: %016lx TEID: %016lx", + get_fault_address(regs), teid.val); + if (test_facility(131)) + pr_cont(" ESOP-2"); + else if (machine_has_esop()) + pr_cont(" ESOP-1"); + else + pr_cont(" SOP"); + if (test_facility(75)) + pr_cont(" FSI"); + pr_cont("\n"); pr_alert("Fault in "); - switch (regs->int_parm_long & 3) { - case 3: + switch (teid.as) { + case PSW_BITS_AS_HOME: pr_cont("home space "); break; - case 2: + case PSW_BITS_AS_SECONDARY: pr_cont("secondary space "); break; - case 1: + case PSW_BITS_AS_ACCREG: pr_cont("access register "); break; - case 0: + case PSW_BITS_AS_PRIMARY: pr_cont("primary space "); break; } pr_cont("mode while using "); - switch (get_fault_type(regs)) { - case USER_FAULT: - asce = S390_lowcore.user_asce; - pr_cont("user "); - break; - case GMAP_FAULT: - asce = ((struct gmap *) S390_lowcore.gmap)->asce; - pr_cont("gmap "); - break; - case KERNEL_FAULT: - asce = S390_lowcore.kernel_asce; + if (is_kernel_fault(regs)) { + asce = get_lowcore()->kernel_asce.val; pr_cont("kernel "); - break; - default: - unreachable(); + } else { + asce = get_lowcore()->user_asce.val; + pr_cont("user "); } pr_cont("ASCE.\n"); - dump_pagetable(asce, regs->int_parm_long & __FAIL_ADDR_MASK); + dump_pagetable(asce, get_fault_address(regs)); } int show_unhandled_signals = 1; +static const struct ctl_table s390_fault_sysctl_table[] = { + { + .procname = "userprocess_debug", + .data = &show_unhandled_signals, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +}; + +static int __init init_s390_fault_sysctls(void) +{ + register_sysctl_init("kernel", s390_fault_sysctl_table); + return 0; +} +arch_initcall(init_s390_fault_sysctls); + void report_user_fault(struct pt_regs *regs, long signr, int is_mm_fault) { + static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); + if ((task_pid_nr(current) > 1) && !show_unhandled_signals) return; if (!unhandled_signal(current, signr)) return; - if (!printk_ratelimit()) + if (!__ratelimit(&rs)) return; - printk(KERN_ALERT "User process fault: interruption code %04x ilc:%d ", - regs->int_code & 0xffff, regs->int_code >> 17); + pr_alert("User process fault: interruption code %04x ilc:%d ", + regs->int_code & 0xffff, regs->int_code >> 17); print_vma_addr(KERN_CONT "in ", regs->psw.addr); - printk(KERN_CONT "\n"); + pr_cont("\n"); if (is_mm_fault) dump_fault_info(regs); show_regs(regs); } -/* - * Send SIGSEGV to task. This is an external routine - * to keep the stack usage of do_page_fault small. - */ -static noinline void do_sigsegv(struct pt_regs *regs, int si_code) +static void do_sigsegv(struct pt_regs *regs, int si_code) { report_user_fault(regs, SIGSEGV, 1); - force_sig_fault(SIGSEGV, si_code, - (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK)); + force_sig_fault(SIGSEGV, si_code, (void __user *)get_fault_address(regs)); } -static noinline void do_no_context(struct pt_regs *regs) +static void handle_fault_error_nolock(struct pt_regs *regs, int si_code) { + unsigned long address; + bool is_write; + + if (user_mode(regs)) { + if (WARN_ON_ONCE(!si_code)) + si_code = SEGV_MAPERR; + return do_sigsegv(regs, si_code); + } if (fixup_exception(regs)) return; - /* - * Oops. 
The kernel tried to access some bad page. We'll have to - * terminate things with extreme prejudice. - */ - if (get_fault_type(regs) == KERNEL_FAULT) - printk(KERN_ALERT "Unable to handle kernel pointer dereference" - " in virtual kernel address space\n"); - else - printk(KERN_ALERT "Unable to handle kernel paging request" - " in virtual user address space\n"); + if (is_kernel_fault(regs)) { + address = get_fault_address(regs); + is_write = fault_is_write(regs); + if (kfence_handle_page_fault(address, is_write, regs)) + return; + pr_alert("Unable to handle kernel pointer dereference in virtual kernel address space\n"); + } else { + pr_alert("Unable to handle kernel paging request in virtual user address space\n"); + } dump_fault_info(regs); die(regs, "Oops"); } -static noinline void do_low_address(struct pt_regs *regs) +static void handle_fault_error(struct pt_regs *regs, int si_code) { - /* Low-address protection hit in kernel mode means - NULL pointer write access in kernel mode. */ - if (regs->psw.mask & PSW_MASK_PSTATE) { - /* Low-address protection hit in user mode 'cannot happen'. */ - die (regs, "Low-address protection"); - } - - do_no_context(regs); -} + struct mm_struct *mm = current->mm; -static noinline void do_sigbus(struct pt_regs *regs) -{ - /* - * Send a sigbus, regardless of whether we were in kernel - * or user mode. - */ - force_sig_fault(SIGBUS, BUS_ADRERR, - (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK)); + mmap_read_unlock(mm); + handle_fault_error_nolock(regs, si_code); } -static noinline void do_fault_error(struct pt_regs *regs, vm_fault_t fault) +static void do_sigbus(struct pt_regs *regs) { - int si_code; - - switch (fault) { - case VM_FAULT_BADACCESS: - case VM_FAULT_BADMAP: - /* Bad memory access. Check if it is kernel or user space. */ - if (user_mode(regs)) { - /* User mode accesses just cause a SIGSEGV */ - si_code = (fault == VM_FAULT_BADMAP) ? - SEGV_MAPERR : SEGV_ACCERR; - do_sigsegv(regs, si_code); - break; - } - fallthrough; - case VM_FAULT_BADCONTEXT: - case VM_FAULT_PFAULT: - do_no_context(regs); - break; - case VM_FAULT_SIGNAL: - if (!user_mode(regs)) - do_no_context(regs); - break; - default: /* fault & VM_FAULT_ERROR */ - if (fault & VM_FAULT_OOM) { - if (!user_mode(regs)) - do_no_context(regs); - else - pagefault_out_of_memory(); - } else if (fault & VM_FAULT_SIGSEGV) { - /* Kernel mode? Handle exceptions or die */ - if (!user_mode(regs)) - do_no_context(regs); - else - do_sigsegv(regs, SEGV_MAPERR); - } else if (fault & VM_FAULT_SIGBUS) { - /* Kernel mode? Handle exceptions or die */ - if (!user_mode(regs)) - do_no_context(regs); - else - do_sigbus(regs); - } else - BUG(); - break; - } + force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)get_fault_address(regs)); } /* @@ -322,58 +258,32 @@ static noinline void do_fault_error(struct pt_regs *regs, vm_fault_t fault) * routines. * * interruption code (int_code): - * 04 Protection -> Write-Protection (suppression) - * 10 Segment translation -> Not present (nullification) - * 11 Page translation -> Not present (nullification) - * 3b Region third trans. -> Not present (nullification) + * 04 Protection -> Write-Protection (suppression) + * 10 Segment translation -> Not present (nullification) + * 11 Page translation -> Not present (nullification) + * 3b Region third trans. 
-> Not present (nullification) */ -static inline vm_fault_t do_exception(struct pt_regs *regs, int access) +static void do_exception(struct pt_regs *regs, int access) { - struct gmap *gmap; - struct task_struct *tsk; - struct mm_struct *mm; struct vm_area_struct *vma; - enum fault_type type; - unsigned long trans_exc_code; unsigned long address; + struct mm_struct *mm; unsigned int flags; vm_fault_t fault; bool is_write; - tsk = current; /* * The instruction that caused the program check has * been nullified. Don't signal single step via SIGTRAP. */ clear_thread_flag(TIF_PER_TRAP); - if (kprobe_page_fault(regs, 14)) - return 0; - - mm = tsk->mm; - trans_exc_code = regs->int_parm_long; - address = trans_exc_code & __FAIL_ADDR_MASK; - is_write = (trans_exc_code & store_indication) == 0x400; - - /* - * Verify that the fault happened in user space, that - * we are not in an interrupt and that there is a - * user context. - */ - fault = VM_FAULT_BADCONTEXT; - type = get_fault_type(regs); - switch (type) { - case KERNEL_FAULT: - if (kfence_handle_page_fault(address, is_write, regs)) - return 0; - goto out; - case USER_FAULT: - case GMAP_FAULT: - if (faulthandler_disabled() || !mm) - goto out; - break; - } - + return; + mm = current->mm; + address = get_fault_address(regs); + is_write = fault_is_write(regs); + if (is_kernel_fault(regs) || faulthandler_disabled() || !mm) + return handle_fault_error_nolock(regs, 0); perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); flags = FAULT_FLAG_DEFAULT; if (user_mode(regs)) @@ -382,401 +292,136 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access) access = VM_WRITE; if (access == VM_WRITE) flags |= FAULT_FLAG_WRITE; - mmap_read_lock(mm); - - gmap = NULL; - if (IS_ENABLED(CONFIG_PGSTE) && type == GMAP_FAULT) { - gmap = (struct gmap *) S390_lowcore.gmap; - current->thread.gmap_addr = address; - current->thread.gmap_write_flag = !!(flags & FAULT_FLAG_WRITE); - current->thread.gmap_int_code = regs->int_code & 0xffff; - address = __gmap_translate(gmap, address); - if (address == -EFAULT) { - fault = VM_FAULT_BADMAP; - goto out_up; - } - if (gmap->pfault_enabled) - flags |= FAULT_FLAG_RETRY_NOWAIT; + if (!(flags & FAULT_FLAG_USER)) + goto lock_mmap; + vma = lock_vma_under_rcu(mm, address); + if (!vma) + goto lock_mmap; + if (!(vma->vm_flags & access)) { + vma_end_read(vma); + count_vm_vma_lock_event(VMA_LOCK_SUCCESS); + return handle_fault_error_nolock(regs, SEGV_ACCERR); } - + fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) + vma_end_read(vma); + if (!(fault & VM_FAULT_RETRY)) { + count_vm_vma_lock_event(VMA_LOCK_SUCCESS); + goto done; + } + count_vm_vma_lock_event(VMA_LOCK_RETRY); + if (fault & VM_FAULT_MAJOR) + flags |= FAULT_FLAG_TRIED; + /* Quick path to respond to signals */ + if (fault_signal_pending(fault, regs)) { + if (!user_mode(regs)) + handle_fault_error_nolock(regs, 0); + return; + } +lock_mmap: retry: - fault = VM_FAULT_BADMAP; - vma = find_vma(mm, address); + vma = lock_mm_and_find_vma(mm, address, regs); if (!vma) - goto out_up; - - if (unlikely(vma->vm_start > address)) { - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto out_up; - if (expand_stack(vma, address)) - goto out_up; - } - - /* - * Ok, we have a good vm_area for this memory access, so - * we can handle it.. 
- */ - fault = VM_FAULT_BADACCESS; + return handle_fault_error_nolock(regs, SEGV_MAPERR); if (unlikely(!(vma->vm_flags & access))) - goto out_up; - - /* - * If for any reason at all we couldn't handle the fault, - * make sure we exit gracefully rather than endlessly redo - * the fault. - */ + return handle_fault_error(regs, SEGV_ACCERR); fault = handle_mm_fault(vma, address, flags, regs); if (fault_signal_pending(fault, regs)) { - fault = VM_FAULT_SIGNAL; - if (flags & FAULT_FLAG_RETRY_NOWAIT) - goto out_up; - goto out; + if (!user_mode(regs)) + handle_fault_error_nolock(regs, 0); + return; } - /* The fault is fully completed (including releasing mmap lock) */ - if (fault & VM_FAULT_COMPLETED) { - if (gmap) { - mmap_read_lock(mm); - goto out_gmap; - } - fault = 0; - goto out; - } - - if (unlikely(fault & VM_FAULT_ERROR)) - goto out_up; - + if (fault & VM_FAULT_COMPLETED) + return; if (fault & VM_FAULT_RETRY) { - if (IS_ENABLED(CONFIG_PGSTE) && gmap && - (flags & FAULT_FLAG_RETRY_NOWAIT)) { - /* - * FAULT_FLAG_RETRY_NOWAIT has been set, mmap_lock has - * not been released - */ - current->thread.gmap_pfault = 1; - fault = VM_FAULT_PFAULT; - goto out_up; - } - flags &= ~FAULT_FLAG_RETRY_NOWAIT; flags |= FAULT_FLAG_TRIED; - mmap_read_lock(mm); goto retry; } -out_gmap: - if (IS_ENABLED(CONFIG_PGSTE) && gmap) { - address = __gmap_link(gmap, current->thread.gmap_addr, - address); - if (address == -EFAULT) { - fault = VM_FAULT_BADMAP; - goto out_up; - } - if (address == -ENOMEM) { - fault = VM_FAULT_OOM; - goto out_up; - } - } - fault = 0; -out_up: mmap_read_unlock(mm); -out: - return fault; +done: + if (!(fault & VM_FAULT_ERROR)) + return; + if (fault & VM_FAULT_OOM) { + if (!user_mode(regs)) + handle_fault_error_nolock(regs, 0); + else + pagefault_out_of_memory(); + } else if (fault & VM_FAULT_SIGSEGV) { + if (!user_mode(regs)) + handle_fault_error_nolock(regs, 0); + else + do_sigsegv(regs, SEGV_MAPERR); + } else if (fault & (VM_FAULT_SIGBUS | VM_FAULT_HWPOISON | + VM_FAULT_HWPOISON_LARGE)) { + if (!user_mode(regs)) + handle_fault_error_nolock(regs, 0); + else + do_sigbus(regs); + } else { + pr_emerg("Unexpected fault flags: %08x\n", fault); + BUG(); + } } void do_protection_exception(struct pt_regs *regs) { - unsigned long trans_exc_code; - int access; - vm_fault_t fault; + union teid teid = { .val = regs->int_parm_long }; - trans_exc_code = regs->int_parm_long; /* * Protection exceptions are suppressing, decrement psw address. * The exception to this rule are aborted transactions, for these * the PSW already points to the correct location. */ - if (!(regs->int_code & 0x200)) + if (!(regs->int_code & 0x200)) { regs->psw.addr = __rewind_psw(regs->psw, regs->int_code >> 16); + set_pt_regs_flag(regs, PIF_PSW_ADDR_ADJUSTED); + } /* - * Check for low-address protection. This needs to be treated - * as a special case because the translation exception code - * field is not guaranteed to contain valid data in this case. + * If bit 61 if the TEID is not set, the remainder of the + * TEID is unpredictable. Special handling is required. */ - if (unlikely(!(trans_exc_code & 4))) { - do_low_address(regs); - return; + if (unlikely(!teid.b61)) { + if (user_mode(regs)) { + dump_fault_info(regs); + die(regs, "Unexpected TEID"); + } + /* Assume low-address protection in kernel mode. 
*/ + return handle_fault_error_nolock(regs, 0); } - if (unlikely(MACHINE_HAS_NX && (trans_exc_code & 0x80))) { - regs->int_parm_long = (trans_exc_code & ~PAGE_MASK) | - (regs->psw.addr & PAGE_MASK); - access = VM_EXEC; - fault = VM_FAULT_BADACCESS; - } else { - access = VM_WRITE; - fault = do_exception(regs, access); + if (unlikely(cpu_has_nx() && teid.b56)) { + regs->int_parm_long = (teid.addr * PAGE_SIZE) | (regs->psw.addr & PAGE_MASK); + return handle_fault_error_nolock(regs, SEGV_ACCERR); } - if (unlikely(fault)) - do_fault_error(regs, fault); + do_exception(regs, VM_WRITE); } NOKPROBE_SYMBOL(do_protection_exception); void do_dat_exception(struct pt_regs *regs) { - int access; - vm_fault_t fault; - - access = VM_ACCESS_FLAGS; - fault = do_exception(regs, access); - if (unlikely(fault)) - do_fault_error(regs, fault); + do_exception(regs, VM_ACCESS_FLAGS); } NOKPROBE_SYMBOL(do_dat_exception); -#ifdef CONFIG_PFAULT -/* - * 'pfault' pseudo page faults routines. - */ -static int pfault_disable; - -static int __init nopfault(char *str) -{ - pfault_disable = 1; - return 1; -} - -__setup("nopfault", nopfault); - -struct pfault_refbk { - u16 refdiagc; - u16 reffcode; - u16 refdwlen; - u16 refversn; - u64 refgaddr; - u64 refselmk; - u64 refcmpmk; - u64 reserved; -} __attribute__ ((packed, aligned(8))); - -static struct pfault_refbk pfault_init_refbk = { - .refdiagc = 0x258, - .reffcode = 0, - .refdwlen = 5, - .refversn = 2, - .refgaddr = __LC_LPP, - .refselmk = 1ULL << 48, - .refcmpmk = 1ULL << 48, - .reserved = __PF_RES_FIELD -}; - -int pfault_init(void) -{ - int rc; - - if (pfault_disable) - return -1; - diag_stat_inc(DIAG_STAT_X258); - asm volatile( - " diag %1,%0,0x258\n" - "0: j 2f\n" - "1: la %0,8\n" - "2:\n" - EX_TABLE(0b,1b) - : "=d" (rc) - : "a" (&pfault_init_refbk), "m" (pfault_init_refbk) : "cc"); - return rc; -} - -static struct pfault_refbk pfault_fini_refbk = { - .refdiagc = 0x258, - .reffcode = 1, - .refdwlen = 5, - .refversn = 2, -}; - -void pfault_fini(void) -{ - - if (pfault_disable) - return; - diag_stat_inc(DIAG_STAT_X258); - asm volatile( - " diag %0,0,0x258\n" - "0: nopr %%r7\n" - EX_TABLE(0b,0b) - : : "a" (&pfault_fini_refbk), "m" (pfault_fini_refbk) : "cc"); -} - -static DEFINE_SPINLOCK(pfault_lock); -static LIST_HEAD(pfault_list); - -#define PF_COMPLETE 0x0080 - -/* - * The mechanism of our pfault code: if Linux is running as guest, runs a user - * space process and the user space process accesses a page that the host has - * paged out we get a pfault interrupt. - * - * This allows us, within the guest, to schedule a different process. Without - * this mechanism the host would have to suspend the whole virtual cpu until - * the page has been paged in. - * - * So when we get such an interrupt then we set the state of the current task - * to uninterruptible and also set the need_resched flag. Both happens within - * interrupt context(!). If we later on want to return to user space we - * recognize the need_resched flag and then call schedule(). It's not very - * obvious how this works... - * - * Of course we have a lot of additional fun with the completion interrupt (-> - * host signals that a page of a process has been paged in and the process can - * continue to run). This interrupt can arrive on any cpu and, since we have - * virtual cpus, actually appear before the interrupt that signals that a page - * is missing. 
- */ -static void pfault_interrupt(struct ext_code ext_code, - unsigned int param32, unsigned long param64) -{ - struct task_struct *tsk; - __u16 subcode; - pid_t pid; - - /* - * Get the external interruption subcode & pfault initial/completion - * signal bit. VM stores this in the 'cpu address' field associated - * with the external interrupt. - */ - subcode = ext_code.subcode; - if ((subcode & 0xff00) != __SUBCODE_MASK) - return; - inc_irq_stat(IRQEXT_PFL); - /* Get the token (= pid of the affected task). */ - pid = param64 & LPP_PID_MASK; - rcu_read_lock(); - tsk = find_task_by_pid_ns(pid, &init_pid_ns); - if (tsk) - get_task_struct(tsk); - rcu_read_unlock(); - if (!tsk) - return; - spin_lock(&pfault_lock); - if (subcode & PF_COMPLETE) { - /* signal bit is set -> a page has been swapped in by VM */ - if (tsk->thread.pfault_wait == 1) { - /* Initial interrupt was faster than the completion - * interrupt. pfault_wait is valid. Set pfault_wait - * back to zero and wake up the process. This can - * safely be done because the task is still sleeping - * and can't produce new pfaults. */ - tsk->thread.pfault_wait = 0; - list_del(&tsk->thread.list); - wake_up_process(tsk); - put_task_struct(tsk); - } else { - /* Completion interrupt was faster than initial - * interrupt. Set pfault_wait to -1 so the initial - * interrupt doesn't put the task to sleep. - * If the task is not running, ignore the completion - * interrupt since it must be a leftover of a PFAULT - * CANCEL operation which didn't remove all pending - * completion interrupts. */ - if (task_is_running(tsk)) - tsk->thread.pfault_wait = -1; - } - } else { - /* signal bit not set -> a real page is missing. */ - if (WARN_ON_ONCE(tsk != current)) - goto out; - if (tsk->thread.pfault_wait == 1) { - /* Already on the list with a reference: put to sleep */ - goto block; - } else if (tsk->thread.pfault_wait == -1) { - /* Completion interrupt was faster than the initial - * interrupt (pfault_wait == -1). Set pfault_wait - * back to zero and exit. */ - tsk->thread.pfault_wait = 0; - } else { - /* Initial interrupt arrived before completion - * interrupt. Let the task sleep. - * An extra task reference is needed since a different - * cpu may set the task state to TASK_RUNNING again - * before the scheduler is reached. */ - get_task_struct(tsk); - tsk->thread.pfault_wait = 1; - list_add(&tsk->thread.list, &pfault_list); -block: - /* Since this must be a userspace fault, there - * is no kernel task state to trample. Rely on the - * return to userspace schedule() to block. */ - __set_current_state(TASK_UNINTERRUPTIBLE); - set_tsk_need_resched(tsk); - set_preempt_need_resched(); - } - } -out: - spin_unlock(&pfault_lock); - put_task_struct(tsk); -} - -static int pfault_cpu_dead(unsigned int cpu) -{ - struct thread_struct *thread, *next; - struct task_struct *tsk; - - spin_lock_irq(&pfault_lock); - list_for_each_entry_safe(thread, next, &pfault_list, list) { - thread->pfault_wait = 0; - list_del(&thread->list); - tsk = container_of(thread, struct task_struct, thread); - wake_up_process(tsk); - put_task_struct(tsk); - } - spin_unlock_irq(&pfault_lock); - return 0; -} - -static int __init pfault_irq_init(void) -{ - int rc; - - rc = register_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt); - if (rc) - goto out_extint; - rc = pfault_init() == 0 ? 
0 : -EOPNOTSUPP; - if (rc) - goto out_pfault; - irq_subclass_register(IRQ_SUBCLASS_SERVICE_SIGNAL); - cpuhp_setup_state_nocalls(CPUHP_S390_PFAULT_DEAD, "s390/pfault:dead", - NULL, pfault_cpu_dead); - return 0; - -out_pfault: - unregister_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt); -out_extint: - pfault_disable = 1; - return rc; -} -early_initcall(pfault_irq_init); - -#endif /* CONFIG_PFAULT */ - #if IS_ENABLED(CONFIG_PGSTE) void do_secure_storage_access(struct pt_regs *regs) { - unsigned long addr = regs->int_parm_long & __FAIL_ADDR_MASK; + union teid teid = { .val = regs->int_parm_long }; + unsigned long addr = get_fault_address(regs); struct vm_area_struct *vma; + struct folio_walk fw; struct mm_struct *mm; - struct page *page; - struct gmap *gmap; + struct folio *folio; int rc; /* - * bit 61 tells us if the address is valid, if it's not we - * have a major problem and should stop the kernel or send a - * SIGSEGV to the process. Unfortunately bit 61 is not - * reliable without the misc UV feature so we need to check - * for that as well. + * Bit 61 indicates if the address is valid, if it is not the + * kernel should be stopped or SIGSEGV should be sent to the + * process. Bit 61 is not reliable without the misc UV feature, + * therefore this needs to be checked too. */ - if (test_bit_inv(BIT_UV_FEAT_MISC, &uv_info.uv_feature_indications) && - !test_bit_inv(61, ®s->int_parm_long)) { + if (uv_has_feature(BIT_UV_FEAT_MISC) && !teid.b61) { /* * When this happens, userspace did something that it * was not supposed to do, e.g. branching into secure @@ -786,100 +431,43 @@ void do_secure_storage_access(struct pt_regs *regs) send_sig(SIGSEGV, current, 0); return; } - /* - * The kernel should never run into this case and we - * have no way out of this situation. + * The kernel should never run into this case and + * there is no way out of this situation. */ panic("Unexpected PGM 0x3d with TEID bit 61=0"); } - - switch (get_fault_type(regs)) { - case GMAP_FAULT: - mm = current->mm; - gmap = (struct gmap *)S390_lowcore.gmap; - mmap_read_lock(mm); - addr = __gmap_translate(gmap, addr); - mmap_read_unlock(mm); - if (IS_ERR_VALUE(addr)) { - do_fault_error(regs, VM_FAULT_BADMAP); - break; - } - fallthrough; - case USER_FAULT: + if (is_kernel_fault(regs)) { + folio = phys_to_folio(addr); + if (unlikely(!folio_try_get(folio))) + return; + rc = arch_make_folio_accessible(folio); + folio_put(folio); + if (rc) + BUG(); + } else { + if (faulthandler_disabled()) + return handle_fault_error_nolock(regs, 0); mm = current->mm; mmap_read_lock(mm); vma = find_vma(mm, addr); - if (!vma) { + if (!vma) + return handle_fault_error(regs, SEGV_MAPERR); + folio = folio_walk_start(&fw, vma, addr, 0); + if (!folio) { mmap_read_unlock(mm); - do_fault_error(regs, VM_FAULT_BADMAP); - break; - } - page = follow_page(vma, addr, FOLL_WRITE | FOLL_GET); - if (IS_ERR_OR_NULL(page)) { - mmap_read_unlock(mm); - break; + return; } - if (arch_make_page_accessible(page)) + /* arch_make_folio_accessible() needs a raised refcount. 
*/ + folio_get(folio); + rc = arch_make_folio_accessible(folio); + folio_put(folio); + folio_walk_end(&fw, vma); + if (rc) send_sig(SIGSEGV, current, 0); - put_page(page); mmap_read_unlock(mm); - break; - case KERNEL_FAULT: - page = phys_to_page(addr); - if (unlikely(!try_get_page(page))) - break; - rc = arch_make_page_accessible(page); - put_page(page); - if (rc) - BUG(); - break; - default: - do_fault_error(regs, VM_FAULT_BADMAP); - WARN_ON_ONCE(1); } } NOKPROBE_SYMBOL(do_secure_storage_access); -void do_non_secure_storage_access(struct pt_regs *regs) -{ - unsigned long gaddr = regs->int_parm_long & __FAIL_ADDR_MASK; - struct gmap *gmap = (struct gmap *)S390_lowcore.gmap; - - if (get_fault_type(regs) != GMAP_FAULT) { - do_fault_error(regs, VM_FAULT_BADMAP); - WARN_ON_ONCE(1); - return; - } - - if (gmap_convert_to_secure(gmap, gaddr) == -EINVAL) - send_sig(SIGSEGV, current, 0); -} -NOKPROBE_SYMBOL(do_non_secure_storage_access); - -void do_secure_storage_violation(struct pt_regs *regs) -{ - unsigned long gaddr = regs->int_parm_long & __FAIL_ADDR_MASK; - struct gmap *gmap = (struct gmap *)S390_lowcore.gmap; - - /* - * If the VM has been rebooted, its address space might still contain - * secure pages from the previous boot. - * Clear the page so it can be reused. - */ - if (!gmap_destroy_page(gmap, gaddr)) - return; - /* - * Either KVM messed up the secure guest mapping or the same - * page is mapped into multiple secure guests. - * - * This exception is only triggered when a guest 2 is running - * and can therefore never occur in kernel context. - */ - printk_ratelimited(KERN_WARNING - "Secure storage violation in task: %s, pid %d\n", - current->comm, current->pid); - send_sig(SIGSEGV, current, 0); -} - #endif /* CONFIG_PGSTE */ diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 02d15c8dc92e..dd85bcca817d 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -8,6 +8,8 @@ * Janosch Frank <frankja@linux.vnet.ibm.com> */ +#include <linux/cpufeature.h> +#include <linux/export.h> #include <linux/kernel.h> #include <linux/pagewalk.h> #include <linux/swap.h> @@ -18,20 +20,43 @@ #include <linux/ksm.h> #include <linux/mman.h> #include <linux/pgtable.h> - +#include <asm/page-states.h> #include <asm/pgalloc.h> +#include <asm/machine.h> +#include <asm/gmap_helpers.h> #include <asm/gmap.h> -#include <asm/tlb.h> +#include <asm/page.h> + +/* + * The address is saved in a radix tree directly; NULL would be ambiguous, + * since 0 is a valid address, and NULL is returned when nothing was found. + * The lower bits are ignored by all users of the macro, so it can be used + * to distinguish a valid address 0 from a NULL. + */ +#define VALID_GADDR_FLAG 1 +#define IS_GADDR_VALID(gaddr) ((gaddr) & VALID_GADDR_FLAG) +#define MAKE_VALID_GADDR(gaddr) (((gaddr) & HPAGE_MASK) | VALID_GADDR_FLAG) #define GMAP_SHADOW_FAKE_TABLE 1ULL +static struct page *gmap_alloc_crst(void) +{ + struct page *page; + + page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); + if (!page) + return NULL; + __arch_set_page_dat(page_to_virt(page), 1UL << CRST_ALLOC_ORDER); + return page; +} + /** * gmap_alloc - allocate and initialize a guest address space * @limit: maximum address of the gmap address space * * Returns a guest address space structure. 
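 *
 * gmap_alloc() and gmap_free() are now exported; a minimal lifecycle for
 * a user outside this file looks roughly as follows (illustrative sketch
 * only; the 8 PiB limit is an arbitrary example and error handling is
 * elided):
 *
 *	struct gmap *g;
 *
 *	g = gmap_alloc((1UL << 53) - 1);
 *	if (!g)
 *		return -ENOMEM;
 *	... populate via the exported __gmap_link() and friends ...
 *	gmap_free(g);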
*/ -static struct gmap *gmap_alloc(unsigned long limit) +struct gmap *gmap_alloc(unsigned long limit) { struct gmap *gmap; struct page *page; @@ -58,21 +83,17 @@ static struct gmap *gmap_alloc(unsigned long limit) gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL_ACCOUNT); if (!gmap) goto out; - INIT_LIST_HEAD(&gmap->crst_list); INIT_LIST_HEAD(&gmap->children); - INIT_LIST_HEAD(&gmap->pt_list); INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL_ACCOUNT); INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC | __GFP_ACCOUNT); INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC | __GFP_ACCOUNT); spin_lock_init(&gmap->guest_table_lock); spin_lock_init(&gmap->shadow_lock); refcount_set(&gmap->ref_count, 1); - page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); + page = gmap_alloc_crst(); if (!page) goto out_free; - page->index = 0; - list_add(&page->lru, &gmap->crst_list); - table = (unsigned long *) page_to_phys(page); + table = page_to_virt(page); crst_table_init(table, etype); gmap->table = table; gmap->asce = atype | _ASCE_TABLE_LENGTH | @@ -85,6 +106,7 @@ out_free: out: return NULL; } +EXPORT_SYMBOL_GPL(gmap_alloc); /** * gmap_create - create a guest address space @@ -116,10 +138,7 @@ EXPORT_SYMBOL_GPL(gmap_create); static void gmap_flush_tlb(struct gmap *gmap) { - if (MACHINE_HAS_IDTE) - __tlb_flush_idte(gmap->asce); - else - __tlb_flush_global(); + __tlb_flush_idte(gmap->asce); } static void gmap_radix_tree_free(struct radix_tree_root *root) @@ -173,30 +192,46 @@ static void gmap_rmap_radix_tree_free(struct radix_tree_root *root) } while (nr > 0); } +static void gmap_free_crst(unsigned long *table, bool free_ptes) +{ + bool is_segment = (table[0] & _SEGMENT_ENTRY_TYPE_MASK) == 0; + int i; + + if (is_segment) { + if (!free_ptes) + goto out; + for (i = 0; i < _CRST_ENTRIES; i++) + if (!(table[i] & _SEGMENT_ENTRY_INVALID)) + page_table_free_pgste(page_ptdesc(phys_to_page(table[i]))); + } else { + for (i = 0; i < _CRST_ENTRIES; i++) + if (!(table[i] & _REGION_ENTRY_INVALID)) + gmap_free_crst(__va(table[i] & PAGE_MASK), free_ptes); + } + +out: + free_pages((unsigned long)table, CRST_ALLOC_ORDER); +} + /** * gmap_free - free a guest address space * @gmap: pointer to the guest address space structure * * No locks required. There are no references to this gmap anymore. */ -static void gmap_free(struct gmap *gmap) +void gmap_free(struct gmap *gmap) { - struct page *page, *next; - /* Flush tlb of all gmaps (if not already done for shadows) */ if (!(gmap_is_shadow(gmap) && gmap->removed)) gmap_flush_tlb(gmap); /* Free all segment & region tables. */ - list_for_each_entry_safe(page, next, &gmap->crst_list, lru) - __free_pages(page, CRST_ALLOC_ORDER); + gmap_free_crst(gmap->table, gmap_is_shadow(gmap)); + gmap_radix_tree_free(&gmap->guest_to_host); gmap_radix_tree_free(&gmap->host_to_guest); /* Free additional data for a shadow gmap */ if (gmap_is_shadow(gmap)) { - /* Free all page tables. 
*/ - list_for_each_entry_safe(page, next, &gmap->pt_list, lru) - page_table_free_pgste(page); gmap_rmap_radix_tree_free(&gmap->host_to_rmap); /* Release reference to the parent */ gmap_put(gmap->parent); @@ -204,6 +239,7 @@ static void gmap_free(struct gmap *gmap) kfree(gmap); } +EXPORT_SYMBOL_GPL(gmap_free); /** * gmap_get - increase reference counter for guest address space @@ -267,37 +303,6 @@ void gmap_remove(struct gmap *gmap) } EXPORT_SYMBOL_GPL(gmap_remove); -/** - * gmap_enable - switch primary space to the guest address space - * @gmap: pointer to the guest address space structure - */ -void gmap_enable(struct gmap *gmap) -{ - S390_lowcore.gmap = (unsigned long) gmap; -} -EXPORT_SYMBOL_GPL(gmap_enable); - -/** - * gmap_disable - switch back to the standard primary address space - * @gmap: pointer to the guest address space structure - */ -void gmap_disable(struct gmap *gmap) -{ - S390_lowcore.gmap = 0UL; -} -EXPORT_SYMBOL_GPL(gmap_disable); - -/** - * gmap_get_enabled - get a pointer to the currently enabled gmap - * - * Returns a pointer to the currently enabled gmap. 0 if none is enabled. - */ -struct gmap *gmap_get_enabled(void) -{ - return (struct gmap *) S390_lowcore.gmap; -} -EXPORT_SYMBOL_GPL(gmap_get_enabled); - /* * gmap_alloc_table is assumed to be called with mmap_lock held */ @@ -308,17 +313,15 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, unsigned long *new; /* since we dont free the gmap table until gmap_free we can unlock */ - page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); + page = gmap_alloc_crst(); if (!page) return -ENOMEM; - new = (unsigned long *) page_to_phys(page); + new = page_to_virt(page); crst_table_init(new, init); spin_lock(&gmap->guest_table_lock); if (*table & _REGION_ENTRY_INVALID) { - list_add(&page->lru, &gmap->crst_list); - *table = (unsigned long) new | _REGION_ENTRY_LENGTH | + *table = __pa(new) | _REGION_ENTRY_LENGTH | (*table & _REGION_ENTRY_TYPE_MASK); - page->index = gaddr; page = NULL; } spin_unlock(&gmap->guest_table_lock); @@ -327,22 +330,23 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, return 0; } -/** - * __gmap_segment_gaddr - find virtual address from segment pointer - * @entry: pointer to a segment table entry in the guest address space - * - * Returns the virtual address in the guest address space for the segment - */ -static unsigned long __gmap_segment_gaddr(unsigned long *entry) +static unsigned long host_to_guest_lookup(struct gmap *gmap, unsigned long vmaddr) { - struct page *page; - unsigned long offset, mask; + return (unsigned long)radix_tree_lookup(&gmap->host_to_guest, vmaddr >> PMD_SHIFT); +} - offset = (unsigned long) entry / sizeof(unsigned long); - offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE; - mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1); - page = virt_to_page((void *)((unsigned long) entry & mask)); - return page->index + offset; +static unsigned long host_to_guest_delete(struct gmap *gmap, unsigned long vmaddr) +{ + return (unsigned long)radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT); +} + +static pmd_t *host_to_guest_pmd_delete(struct gmap *gmap, unsigned long vmaddr, + unsigned long *gaddr) +{ + *gaddr = host_to_guest_delete(gmap, vmaddr); + if (IS_GADDR_VALID(*gaddr)) + return (pmd_t *)gmap_table_walk(gmap, *gaddr, 1); + return NULL; } /** @@ -354,16 +358,19 @@ static unsigned long __gmap_segment_gaddr(unsigned long *entry) */ static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr) { - unsigned long *entry; + 
unsigned long gaddr; int flush = 0; + pmd_t *pmdp; BUG_ON(gmap_is_shadow(gmap)); spin_lock(&gmap->guest_table_lock); - entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT); - if (entry) { - flush = (*entry != _SEGMENT_ENTRY_EMPTY); - *entry = _SEGMENT_ENTRY_EMPTY; + + pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr); + if (pmdp) { + flush = (pmd_val(*pmdp) != _SEGMENT_ENTRY_EMPTY); + *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY); } + spin_unlock(&gmap->guest_table_lock); return flush; } @@ -482,26 +489,6 @@ unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr) EXPORT_SYMBOL_GPL(__gmap_translate); /** - * gmap_translate - translate a guest address to a user space address - * @gmap: pointer to guest mapping meta data structure - * @gaddr: guest address - * - * Returns user space address which corresponds to the guest address or - * -EFAULT if no such mapping exists. - * This function does not establish potentially missing page table entries. - */ -unsigned long gmap_translate(struct gmap *gmap, unsigned long gaddr) -{ - unsigned long rc; - - mmap_read_lock(gmap->mm); - rc = __gmap_translate(gmap, gaddr); - mmap_read_unlock(gmap->mm); - return rc; -} -EXPORT_SYMBOL_GPL(gmap_translate); - -/** * gmap_unlink - disconnect a page table from the gmap shadow tables * @mm: pointer to the parent mm_struct * @table: pointer to the host page table @@ -557,7 +544,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY, gaddr & _REGION1_MASK)) return -ENOMEM; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = __va(*table & _REGION_ENTRY_ORIGIN); } if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) { table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT; @@ -565,7 +552,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY, gaddr & _REGION2_MASK)) return -ENOMEM; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = __va(*table & _REGION_ENTRY_ORIGIN); } if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) { table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT; @@ -573,7 +560,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY, gaddr & _REGION3_MASK)) return -ENOMEM; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = __va(*table & _REGION_ENTRY_ORIGIN); } table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; /* Walk the parent mm page table */ @@ -585,12 +572,12 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) pud = pud_offset(p4d, vmaddr); VM_BUG_ON(pud_none(*pud)); /* large puds cannot yet be handled */ - if (pud_large(*pud)) + if (pud_leaf(*pud)) return -EFAULT; pmd = pmd_offset(pud, vmaddr); VM_BUG_ON(pmd_none(*pmd)); /* Are we allowed to use huge pages? */ - if (pmd_large(*pmd) && !gmap->mm->context.allow_gmap_hpage_1m) + if (pmd_leaf(*pmd) && !gmap->mm->context.allow_gmap_hpage_1m) return -EFAULT; /* Link gmap segment table entry location to page table. 
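 *
 * Note the key change below: the host_to_guest radix tree now stores a
 * tagged guest address (MAKE_VALID_GADDR) instead of a pointer into the
 * gmap segment table, so guest address 0 stays distinguishable from an
 * empty slot. The read-back pattern, mirroring host_to_guest_pmd_delete()
 * earlier in this diff (illustrative, not additional patch code):
 *
 *	unsigned long gaddr;
 *	pmd_t *pmdp = NULL;
 *
 *	gaddr = host_to_guest_lookup(gmap, vmaddr);
 *	if (IS_GADDR_VALID(gaddr))
 *		pmdp = (pmd_t *)gmap_table_walk(gmap, gaddr, 1);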
*/ rc = radix_tree_preload(GFP_KERNEL_ACCOUNT); @@ -600,15 +587,18 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) spin_lock(&gmap->guest_table_lock); if (*table == _SEGMENT_ENTRY_EMPTY) { rc = radix_tree_insert(&gmap->host_to_guest, - vmaddr >> PMD_SHIFT, table); + vmaddr >> PMD_SHIFT, + (void *)MAKE_VALID_GADDR(gaddr)); if (!rc) { - if (pmd_large(*pmd)) { + if (pmd_leaf(*pmd)) { *table = (pmd_val(*pmd) & _SEGMENT_ENTRY_HARDWARE_BITS_LARGE) - | _SEGMENT_ENTRY_GMAP_UC; + | _SEGMENT_ENTRY_GMAP_UC + | _SEGMENT_ENTRY; } else - *table = pmd_val(*pmd) & - _SEGMENT_ENTRY_HARDWARE_BITS; + *table = (pmd_val(*pmd) & + _SEGMENT_ENTRY_HARDWARE_BITS) + | _SEGMENT_ENTRY; } } else if (*table & _SEGMENT_ENTRY_PROTECT && !(pmd_val(*pmd) & _SEGMENT_ENTRY_PROTECT)) { @@ -622,113 +612,27 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) radix_tree_preload_end(); return rc; } - -/** - * gmap_fault - resolve a fault on a guest address - * @gmap: pointer to guest mapping meta data structure - * @gaddr: guest address - * @fault_flags: flags to pass down to handle_mm_fault() - * - * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT - * if the vm address is already mapped to a different guest segment. - */ -int gmap_fault(struct gmap *gmap, unsigned long gaddr, - unsigned int fault_flags) -{ - unsigned long vmaddr; - int rc; - bool unlocked; - - mmap_read_lock(gmap->mm); - -retry: - unlocked = false; - vmaddr = __gmap_translate(gmap, gaddr); - if (IS_ERR_VALUE(vmaddr)) { - rc = vmaddr; - goto out_up; - } - if (fixup_user_fault(gmap->mm, vmaddr, fault_flags, - &unlocked)) { - rc = -EFAULT; - goto out_up; - } - /* - * In the case that fixup_user_fault unlocked the mmap_lock during - * faultin redo __gmap_translate to not race with a map/unmap_segment. - */ - if (unlocked) - goto retry; - - rc = __gmap_link(gmap, gaddr, vmaddr); -out_up: - mmap_read_unlock(gmap->mm); - return rc; -} -EXPORT_SYMBOL_GPL(gmap_fault); +EXPORT_SYMBOL(__gmap_link); /* * this function is assumed to be called with mmap_lock held */ void __gmap_zap(struct gmap *gmap, unsigned long gaddr) { - struct vm_area_struct *vma; unsigned long vmaddr; - spinlock_t *ptl; - pte_t *ptep; + + mmap_assert_locked(gmap->mm); /* Find the vm address for the guest address */ vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT); if (vmaddr) { vmaddr |= gaddr & ~PMD_MASK; - - vma = vma_lookup(gmap->mm, vmaddr); - if (!vma || is_vm_hugetlb_page(vma)) - return; - - /* Get pointer to the page table entry */ - ptep = get_locked_pte(gmap->mm, vmaddr, &ptl); - if (likely(ptep)) { - ptep_zap_unused(gmap->mm, vmaddr, ptep, 0); - pte_unmap_unlock(ptep, ptl); - } + gmap_helper_zap_one_page(gmap->mm, vmaddr); } } EXPORT_SYMBOL_GPL(__gmap_zap); -void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to) -{ - unsigned long gaddr, vmaddr, size; - struct vm_area_struct *vma; - - mmap_read_lock(gmap->mm); - for (gaddr = from; gaddr < to; - gaddr = (gaddr + PMD_SIZE) & PMD_MASK) { - /* Find the vm address for the guest address */ - vmaddr = (unsigned long) - radix_tree_lookup(&gmap->guest_to_host, - gaddr >> PMD_SHIFT); - if (!vmaddr) - continue; - vmaddr |= gaddr & ~PMD_MASK; - /* Find vma in the parent mm */ - vma = find_vma(gmap->mm, vmaddr); - if (!vma) - continue; - /* - * We do not discard pages that are backed by - * hugetlbfs, so we don't have to refault them. 
- */ - if (is_vm_hugetlb_page(vma)) - continue; - size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK)); - zap_page_range(vma, vmaddr, size); - } - mmap_read_unlock(gmap->mm); -} -EXPORT_SYMBOL_GPL(gmap_discard); - static LIST_HEAD(gmap_notifier_list); static DEFINE_SPINLOCK(gmap_notifier_lock); @@ -790,8 +694,7 @@ static void gmap_call_notifier(struct gmap *gmap, unsigned long start, * * Note: Can also be called for shadow gmaps. */ -static inline unsigned long *gmap_table_walk(struct gmap *gmap, - unsigned long gaddr, int level) +unsigned long *gmap_table_walk(struct gmap *gmap, unsigned long gaddr, int level) { const int asce_type = gmap->asce & _ASCE_TYPE_MASK; unsigned long *table = gmap->table; @@ -813,7 +716,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap, break; if (*table & _REGION_ENTRY_INVALID) return NULL; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = __va(*table & _REGION_ENTRY_ORIGIN); fallthrough; case _ASCE_TYPE_REGION2: table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT; @@ -821,7 +724,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap, break; if (*table & _REGION_ENTRY_INVALID) return NULL; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = __va(*table & _REGION_ENTRY_ORIGIN); fallthrough; case _ASCE_TYPE_REGION3: table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT; @@ -829,7 +732,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap, break; if (*table & _REGION_ENTRY_INVALID) return NULL; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = __va(*table & _REGION_ENTRY_ORIGIN); fallthrough; case _ASCE_TYPE_SEGMENT: table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; @@ -837,11 +740,12 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap, break; if (*table & _REGION_ENTRY_INVALID) return NULL; - table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN); - table += (gaddr & _PAGE_INDEX) >> _PAGE_SHIFT; + table = __va(*table & _SEGMENT_ENTRY_ORIGIN); + table += (gaddr & _PAGE_INDEX) >> PAGE_SHIFT; } return table; } +EXPORT_SYMBOL(gmap_table_walk); /** * gmap_pte_op_walk - walk the gmap page table, get the page table lock @@ -896,12 +800,12 @@ static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr, /** * gmap_pte_op_end - release the page table lock - * @ptl: pointer to the spinlock pointer + * @ptep: pointer to the locked pte + * @ptl: pointer to the page table spinlock */ -static void gmap_pte_op_end(spinlock_t *ptl) +static void gmap_pte_op_end(pte_t *ptep, spinlock_t *ptl) { - if (ptl) - spin_unlock(ptl); + pte_unmap_unlock(ptep, ptl); } /** @@ -932,7 +836,7 @@ static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr) } /* 4k page table entries are locked via the pte (pte_alloc_map_lock). */ - if (!pmd_large(*pmdp)) + if (!pmd_leaf(*pmdp)) spin_unlock(&gmap->guest_table_lock); return pmdp; } @@ -944,7 +848,7 @@ static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr) */ static inline void gmap_pmd_op_end(struct gmap *gmap, pmd_t *pmdp) { - if (pmd_large(*pmdp)) + if (pmd_leaf(*pmdp)) spin_unlock(&gmap->guest_table_lock); } @@ -1012,7 +916,7 @@ static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr, { int rc; pte_t *ptep; - spinlock_t *ptl = NULL; + spinlock_t *ptl; unsigned long pbits = 0; if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID) @@ -1026,7 +930,7 @@ static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr, pbits |= (bits & GMAP_NOTIFY_SHADOW) ? 
PGSTE_VSIE_BIT : 0; /* Protect and unlock. */ rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, pbits); - gmap_pte_op_end(ptl); + gmap_pte_op_end(ptep, ptl); return rc; } @@ -1038,86 +942,40 @@ static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr, * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE * @bits: pgste notification bits to set * - * Returns 0 if successfully protected, -ENOMEM if out of memory and - * -EFAULT if gaddr is invalid (or mapping for shadows is missing). + * Returns: + * PAGE_SIZE if a small page was successfully protected; + * HPAGE_SIZE if a large page was successfully protected; + * -ENOMEM if out of memory; + * -EFAULT if gaddr is invalid (or mapping for shadows is missing); + * -EAGAIN if the guest mapping is missing and should be fixed by the caller. * - * Called with sg->mm->mmap_lock in read. + * Context: Called with sg->mm->mmap_lock in read. */ -static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr, - unsigned long len, int prot, unsigned long bits) +int gmap_protect_one(struct gmap *gmap, unsigned long gaddr, int prot, unsigned long bits) { - unsigned long vmaddr, dist; pmd_t *pmdp; - int rc; + int rc = 0; BUG_ON(gmap_is_shadow(gmap)); - while (len) { - rc = -EAGAIN; - pmdp = gmap_pmd_op_walk(gmap, gaddr); - if (pmdp) { - if (!pmd_large(*pmdp)) { - rc = gmap_protect_pte(gmap, gaddr, pmdp, prot, - bits); - if (!rc) { - len -= PAGE_SIZE; - gaddr += PAGE_SIZE; - } - } else { - rc = gmap_protect_pmd(gmap, gaddr, pmdp, prot, - bits); - if (!rc) { - dist = HPAGE_SIZE - (gaddr & ~HPAGE_MASK); - len = len < dist ? 0 : len - dist; - gaddr = (gaddr & HPAGE_MASK) + HPAGE_SIZE; - } - } - gmap_pmd_op_end(gmap, pmdp); - } - if (rc) { - if (rc == -EINVAL) - return rc; - /* -EAGAIN, fixup of userspace mm and gmap */ - vmaddr = __gmap_translate(gmap, gaddr); - if (IS_ERR_VALUE(vmaddr)) - return vmaddr; - rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, prot); - if (rc) - return rc; - } - } - return 0; -} + pmdp = gmap_pmd_op_walk(gmap, gaddr); + if (!pmdp) + return -EAGAIN; -/** - * gmap_mprotect_notify - change access rights for a range of ptes and - * call the notifier if any pte changes again - * @gmap: pointer to guest mapping meta data structure - * @gaddr: virtual address in the guest address space - * @len: size of area - * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE - * - * Returns 0 if for each page in the given range a gmap mapping exists, - * the new access rights could be set and the notifier could be armed. - * If the gmap mapping is missing for one or more pages -EFAULT is - * returned. If no memory could be allocated -ENOMEM is returned. - * This function establishes missing page table entries. 
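
With gmap_protect_range() and gmap_mprotect_notify() removed, the
page-at-a-time loop moves to the caller of the new gmap_protect_one().
A simplified sketch of such a loop, modelled on the removed
gmap_protect_range() above (illustrative only; the in-tree caller lives
outside this file, handles the large-page stride more precisely, and
needs its own fault fixup since gmap_pte_op_fixup() is static here):

	while (len) {
		rc = gmap_protect_one(gmap, gaddr, prot, bits);
		if (rc == -EAGAIN) {
			/* Fault in the host mapping, then retry. */
			vmaddr = __gmap_translate(gmap, gaddr);
			if (IS_ERR_VALUE(vmaddr))
				return vmaddr;
			rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, prot);
			if (rc)
				return rc;
			continue;
		}
		if (rc < 0)
			return rc;	/* -EFAULT or -ENOMEM */
		gaddr += rc;		/* PAGE_SIZE or HPAGE_SIZE */
		len -= min_t(unsigned long, len, rc);
	}
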
- */ -int gmap_mprotect_notify(struct gmap *gmap, unsigned long gaddr, - unsigned long len, int prot) -{ - int rc; + if (!pmd_leaf(*pmdp)) { + rc = gmap_protect_pte(gmap, gaddr, pmdp, prot, bits); + if (!rc) + rc = PAGE_SIZE; + } else { + rc = gmap_protect_pmd(gmap, gaddr, pmdp, prot, bits); + if (!rc) + rc = HPAGE_SIZE; + } + gmap_pmd_op_end(gmap, pmdp); - if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK) || gmap_is_shadow(gmap)) - return -EINVAL; - if (!MACHINE_HAS_ESOP && prot == PROT_READ) - return -EINVAL; - mmap_read_lock(gmap->mm); - rc = gmap_protect_range(gmap, gaddr, len, prot, GMAP_NOTIFY_MPROT); - mmap_read_unlock(gmap->mm); return rc; } -EXPORT_SYMBOL_GPL(gmap_mprotect_notify); +EXPORT_SYMBOL_GPL(gmap_protect_one); /** * gmap_read_table - get an unsigned long value from a guest page table using @@ -1150,12 +1008,12 @@ int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val) if (pte_present(pte) && (pte_val(pte) & _PAGE_READ)) { address = pte_val(pte) & PAGE_MASK; address += gaddr & ~PAGE_MASK; - *val = *(unsigned long *) address; + *val = *(unsigned long *)__va(address); set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_YOUNG))); /* Do *NOT* clear the _PAGE_INVALID bit! */ rc = 0; } - gmap_pte_op_end(ptl); + gmap_pte_op_end(ptep, ptl); } if (!rc) break; @@ -1249,7 +1107,7 @@ static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr, if (!rc) gmap_insert_rmap(sg, vmaddr, rmap); spin_unlock(&sg->guest_table_lock); - gmap_pte_op_end(ptl); + gmap_pte_op_end(ptep, ptl); } radix_tree_preload_end(); if (rc) { @@ -1304,7 +1162,7 @@ static void gmap_unshadow_page(struct gmap *sg, unsigned long raddr) table = gmap_table_walk(sg, raddr, 0); /* get page table pointer */ if (!table || *table & _PAGE_INVALID) return; - gmap_call_notifier(sg, raddr, raddr + _PAGE_SIZE - 1); + gmap_call_notifier(sg, raddr, raddr + PAGE_SIZE - 1); ptep_unshadow_pte(sg->mm, raddr, (pte_t *) table); } @@ -1322,7 +1180,7 @@ static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr, int i; BUG_ON(!gmap_is_shadow(sg)); - for (i = 0; i < _PAGE_ENTRIES; i++, raddr += _PAGE_SIZE) + for (i = 0; i < _PAGE_ENTRIES; i++, raddr += PAGE_SIZE) pgt[i] = _PAGE_INVALID; } @@ -1335,23 +1193,23 @@ static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr, */ static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr) { - unsigned long sto, *ste, *pgt; - struct page *page; + unsigned long *ste; + phys_addr_t sto, pgt; + struct ptdesc *ptdesc; BUG_ON(!gmap_is_shadow(sg)); ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */ if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN)) return; gmap_call_notifier(sg, raddr, raddr + _SEGMENT_SIZE - 1); - sto = (unsigned long) (ste - ((raddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT)); + sto = __pa(ste - ((raddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT)); gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr); - pgt = (unsigned long *)(*ste & _SEGMENT_ENTRY_ORIGIN); + pgt = *ste & _SEGMENT_ENTRY_ORIGIN; *ste = _SEGMENT_ENTRY_EMPTY; - __gmap_unshadow_pgt(sg, raddr, pgt); + __gmap_unshadow_pgt(sg, raddr, __va(pgt)); /* Free page table */ - page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT); - list_del(&page->lru); - page_table_free_pgste(page); + ptdesc = page_ptdesc(phys_to_page(pgt)); + page_table_free_pgste(ptdesc); } /** @@ -1365,21 +1223,20 @@ static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr) static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr, unsigned long *sgt) { - unsigned long *pgt; - struct page *page; + struct ptdesc 
*ptdesc; + phys_addr_t pgt; int i; BUG_ON(!gmap_is_shadow(sg)); for (i = 0; i < _CRST_ENTRIES; i++, raddr += _SEGMENT_SIZE) { if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN)) continue; - pgt = (unsigned long *)(sgt[i] & _REGION_ENTRY_ORIGIN); + pgt = sgt[i] & _REGION_ENTRY_ORIGIN; sgt[i] = _SEGMENT_ENTRY_EMPTY; - __gmap_unshadow_pgt(sg, raddr, pgt); + __gmap_unshadow_pgt(sg, raddr, __va(pgt)); /* Free page table */ - page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT); - list_del(&page->lru); - page_table_free_pgste(page); + ptdesc = page_ptdesc(phys_to_page(pgt)); + page_table_free_pgste(ptdesc); } } @@ -1392,7 +1249,8 @@ static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr, */ static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr) { - unsigned long r3o, *r3e, *sgt; + unsigned long r3o, *r3e; + phys_addr_t sgt; struct page *page; BUG_ON(!gmap_is_shadow(sg)); @@ -1401,13 +1259,12 @@ static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr) return; gmap_call_notifier(sg, raddr, raddr + _REGION3_SIZE - 1); r3o = (unsigned long) (r3e - ((raddr & _REGION3_INDEX) >> _REGION3_SHIFT)); - gmap_idte_one(r3o | _ASCE_TYPE_REGION3, raddr); - sgt = (unsigned long *)(*r3e & _REGION_ENTRY_ORIGIN); + gmap_idte_one(__pa(r3o) | _ASCE_TYPE_REGION3, raddr); + sgt = *r3e & _REGION_ENTRY_ORIGIN; *r3e = _REGION3_ENTRY_EMPTY; - __gmap_unshadow_sgt(sg, raddr, sgt); + __gmap_unshadow_sgt(sg, raddr, __va(sgt)); /* Free segment table */ - page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT); - list_del(&page->lru); + page = phys_to_page(sgt); __free_pages(page, CRST_ALLOC_ORDER); } @@ -1422,20 +1279,19 @@ static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr) static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr, unsigned long *r3t) { - unsigned long *sgt; struct page *page; + phys_addr_t sgt; int i; BUG_ON(!gmap_is_shadow(sg)); for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION3_SIZE) { if (!(r3t[i] & _REGION_ENTRY_ORIGIN)) continue; - sgt = (unsigned long *)(r3t[i] & _REGION_ENTRY_ORIGIN); + sgt = r3t[i] & _REGION_ENTRY_ORIGIN; r3t[i] = _REGION3_ENTRY_EMPTY; - __gmap_unshadow_sgt(sg, raddr, sgt); + __gmap_unshadow_sgt(sg, raddr, __va(sgt)); /* Free segment table */ - page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT); - list_del(&page->lru); + page = phys_to_page(sgt); __free_pages(page, CRST_ALLOC_ORDER); } } @@ -1449,7 +1305,8 @@ static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr, */ static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr) { - unsigned long r2o, *r2e, *r3t; + unsigned long r2o, *r2e; + phys_addr_t r3t; struct page *page; BUG_ON(!gmap_is_shadow(sg)); @@ -1458,13 +1315,12 @@ static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr) return; gmap_call_notifier(sg, raddr, raddr + _REGION2_SIZE - 1); r2o = (unsigned long) (r2e - ((raddr & _REGION2_INDEX) >> _REGION2_SHIFT)); - gmap_idte_one(r2o | _ASCE_TYPE_REGION2, raddr); - r3t = (unsigned long *)(*r2e & _REGION_ENTRY_ORIGIN); + gmap_idte_one(__pa(r2o) | _ASCE_TYPE_REGION2, raddr); + r3t = *r2e & _REGION_ENTRY_ORIGIN; *r2e = _REGION2_ENTRY_EMPTY; - __gmap_unshadow_r3t(sg, raddr, r3t); + __gmap_unshadow_r3t(sg, raddr, __va(r3t)); /* Free region 3 table */ - page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT); - list_del(&page->lru); + page = phys_to_page(r3t); __free_pages(page, CRST_ALLOC_ORDER); } @@ -1479,7 +1335,7 @@ static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr) static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr, unsigned long *r2t) { - 
unsigned long *r3t; + phys_addr_t r3t; struct page *page; int i; @@ -1487,12 +1343,11 @@ static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr, for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION2_SIZE) { if (!(r2t[i] & _REGION_ENTRY_ORIGIN)) continue; - r3t = (unsigned long *)(r2t[i] & _REGION_ENTRY_ORIGIN); + r3t = r2t[i] & _REGION_ENTRY_ORIGIN; r2t[i] = _REGION2_ENTRY_EMPTY; - __gmap_unshadow_r3t(sg, raddr, r3t); + __gmap_unshadow_r3t(sg, raddr, __va(r3t)); /* Free region 3 table */ - page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT); - list_del(&page->lru); + page = phys_to_page(r3t); __free_pages(page, CRST_ALLOC_ORDER); } } @@ -1506,8 +1361,9 @@ static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr, */ static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr) { - unsigned long r1o, *r1e, *r2t; + unsigned long r1o, *r1e; struct page *page; + phys_addr_t r2t; BUG_ON(!gmap_is_shadow(sg)); r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */ @@ -1515,13 +1371,12 @@ static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr) return; gmap_call_notifier(sg, raddr, raddr + _REGION1_SIZE - 1); r1o = (unsigned long) (r1e - ((raddr & _REGION1_INDEX) >> _REGION1_SHIFT)); - gmap_idte_one(r1o | _ASCE_TYPE_REGION1, raddr); - r2t = (unsigned long *)(*r1e & _REGION_ENTRY_ORIGIN); + gmap_idte_one(__pa(r1o) | _ASCE_TYPE_REGION1, raddr); + r2t = *r1e & _REGION_ENTRY_ORIGIN; *r1e = _REGION1_ENTRY_EMPTY; - __gmap_unshadow_r2t(sg, raddr, r2t); + __gmap_unshadow_r2t(sg, raddr, __va(r2t)); /* Free region 2 table */ - page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT); - list_del(&page->lru); + page = phys_to_page(r2t); __free_pages(page, CRST_ALLOC_ORDER); } @@ -1536,23 +1391,23 @@ static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr) static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr, unsigned long *r1t) { - unsigned long asce, *r2t; + unsigned long asce; struct page *page; + phys_addr_t r2t; int i; BUG_ON(!gmap_is_shadow(sg)); - asce = (unsigned long) r1t | _ASCE_TYPE_REGION1; + asce = __pa(r1t) | _ASCE_TYPE_REGION1; for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION1_SIZE) { if (!(r1t[i] & _REGION_ENTRY_ORIGIN)) continue; - r2t = (unsigned long *)(r1t[i] & _REGION_ENTRY_ORIGIN); - __gmap_unshadow_r2t(sg, raddr, r2t); + r2t = r1t[i] & _REGION_ENTRY_ORIGIN; + __gmap_unshadow_r2t(sg, raddr, __va(r2t)); /* Clear entry and flush translation r1t -> r2t */ gmap_idte_one(asce, raddr); r1t[i] = _REGION1_ENTRY_EMPTY; /* Free region 2 table */ - page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT); - list_del(&page->lru); + page = phys_to_page(r2t); __free_pages(page, CRST_ALLOC_ORDER); } } @@ -1563,7 +1418,7 @@ static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr, * * Called with sg->guest_table_lock */ -static void gmap_unshadow(struct gmap *sg) +void gmap_unshadow(struct gmap *sg) { unsigned long *table; @@ -1573,7 +1428,7 @@ static void gmap_unshadow(struct gmap *sg) sg->removed = 1; gmap_call_notifier(sg, 0, -1UL); gmap_flush_tlb(sg); - table = (unsigned long *)(sg->asce & _ASCE_ORIGIN); + table = __va(sg->asce & _ASCE_ORIGIN); switch (sg->asce & _ASCE_TYPE_MASK) { case _ASCE_TYPE_REGION1: __gmap_unshadow_r1t(sg, 0, table); @@ -1589,142 +1444,7 @@ static void gmap_unshadow(struct gmap *sg) break; } } - -/** - * gmap_find_shadow - find a specific asce in the list of shadow tables - * @parent: pointer to the parent gmap - * @asce: ASCE for which the shadow table is created - * @edat_level: edat level to be used for the shadow 
translation - * - * Returns the pointer to a gmap if a shadow table with the given asce is - * already available, ERR_PTR(-EAGAIN) if another one is just being created, - * otherwise NULL - */ -static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce, - int edat_level) -{ - struct gmap *sg; - - list_for_each_entry(sg, &parent->children, list) { - if (sg->orig_asce != asce || sg->edat_level != edat_level || - sg->removed) - continue; - if (!sg->initialized) - return ERR_PTR(-EAGAIN); - refcount_inc(&sg->ref_count); - return sg; - } - return NULL; -} - -/** - * gmap_shadow_valid - check if a shadow guest address space matches the - * given properties and is still valid - * @sg: pointer to the shadow guest address space structure - * @asce: ASCE for which the shadow table is requested - * @edat_level: edat level to be used for the shadow translation - * - * Returns 1 if the gmap shadow is still valid and matches the given - * properties, the caller can continue using it. Returns 0 otherwise, the - * caller has to request a new shadow gmap in this case. - * - */ -int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level) -{ - if (sg->removed) - return 0; - return sg->orig_asce == asce && sg->edat_level == edat_level; -} -EXPORT_SYMBOL_GPL(gmap_shadow_valid); - -/** - * gmap_shadow - create/find a shadow guest address space - * @parent: pointer to the parent gmap - * @asce: ASCE for which the shadow table is created - * @edat_level: edat level to be used for the shadow translation - * - * The pages of the top level page table referred by the asce parameter - * will be set to read-only and marked in the PGSTEs of the kvm process. - * The shadow table will be removed automatically on any change to the - * PTE mapping for the source table. - * - * Returns a guest address space structure, ERR_PTR(-ENOMEM) if out of memory, - * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the - * parent gmap table could not be protected. 
- */ -struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, - int edat_level) -{ - struct gmap *sg, *new; - unsigned long limit; - int rc; - - BUG_ON(parent->mm->context.allow_gmap_hpage_1m); - BUG_ON(gmap_is_shadow(parent)); - spin_lock(&parent->shadow_lock); - sg = gmap_find_shadow(parent, asce, edat_level); - spin_unlock(&parent->shadow_lock); - if (sg) - return sg; - /* Create a new shadow gmap */ - limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11)); - if (asce & _ASCE_REAL_SPACE) - limit = -1UL; - new = gmap_alloc(limit); - if (!new) - return ERR_PTR(-ENOMEM); - new->mm = parent->mm; - new->parent = gmap_get(parent); - new->orig_asce = asce; - new->edat_level = edat_level; - new->initialized = false; - spin_lock(&parent->shadow_lock); - /* Recheck if another CPU created the same shadow */ - sg = gmap_find_shadow(parent, asce, edat_level); - if (sg) { - spin_unlock(&parent->shadow_lock); - gmap_free(new); - return sg; - } - if (asce & _ASCE_REAL_SPACE) { - /* only allow one real-space gmap shadow */ - list_for_each_entry(sg, &parent->children, list) { - if (sg->orig_asce & _ASCE_REAL_SPACE) { - spin_lock(&sg->guest_table_lock); - gmap_unshadow(sg); - spin_unlock(&sg->guest_table_lock); - list_del(&sg->list); - gmap_put(sg); - break; - } - } - } - refcount_set(&new->ref_count, 2); - list_add(&new->list, &parent->children); - if (asce & _ASCE_REAL_SPACE) { - /* nothing to protect, return right away */ - new->initialized = true; - spin_unlock(&parent->shadow_lock); - return new; - } - spin_unlock(&parent->shadow_lock); - /* protect after insertion, so it will get properly invalidated */ - mmap_read_lock(parent->mm); - rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN, - ((asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE, - PROT_READ, GMAP_NOTIFY_SHADOW); - mmap_read_unlock(parent->mm); - spin_lock(&parent->shadow_lock); - new->initialized = true; - if (rc) { - list_del(&new->list); - gmap_free(new); - new = ERR_PTR(rc); - } - spin_unlock(&parent->shadow_lock); - return new; -} -EXPORT_SYMBOL_GPL(gmap_shadow); +EXPORT_SYMBOL(gmap_unshadow); /** * gmap_shadow_r2t - create an empty shadow region 2 table @@ -1736,7 +1456,7 @@ EXPORT_SYMBOL_GPL(gmap_shadow); * The r2t parameter specifies the address of the source table. The * four pages of the source table are made read-only in the parent gmap * address space. A write to the source table area @r2t will automatically - * remove the shadow r2 table and all of its decendents. + * remove the shadow r2 table and all of its descendants. 
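 *
 * All four gmap_shadow_*t() helpers share the same two-phase install
 * protocol (a summary of the code below, not new behaviour):
 *
 *	1. allocate the shadow table (gmap_alloc_crst(), or
 *	   page_table_alloc_pgste() at the pte level) and hook it up
 *	   with the invalid bit still set;
 *	2. drop guest_table_lock and write-protect the source table in
 *	   the parent gmap via gmap_protect_rmap(), which may sleep;
 *	3. retake the lock, re-walk to the entry and clear the invalid
 *	   bit, unless an unshadow operation raced in between, in which
 *	   case -EAGAIN is returned. Fake tables reference a contiguous
 *	   guest memory block instead of a guest table, need no
 *	   protection, and skip step 2.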
* * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the * shadow table structure is incomplete, -ENOMEM if out of memory and @@ -1748,19 +1468,17 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, int fake) { unsigned long raddr, origin, offset, len; - unsigned long *s_r2t, *table; + unsigned long *table; + phys_addr_t s_r2t; struct page *page; int rc; BUG_ON(!gmap_is_shadow(sg)); /* Allocate a shadow region second table */ - page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); + page = gmap_alloc_crst(); if (!page) return -ENOMEM; - page->index = r2t & _REGION_ENTRY_ORIGIN; - if (fake) - page->index |= GMAP_SHADOW_FAKE_TABLE; - s_r2t = (unsigned long *) page_to_phys(page); + s_r2t = page_to_phys(page); /* Install shadow region second table */ spin_lock(&sg->guest_table_lock); table = gmap_table_walk(sg, saddr, 4); /* get region-1 pointer */ @@ -1775,13 +1493,12 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, rc = -EAGAIN; /* Race with shadow */ goto out_free; } - crst_table_init(s_r2t, _REGION2_ENTRY_EMPTY); + crst_table_init(__va(s_r2t), _REGION2_ENTRY_EMPTY); /* mark as invalid as long as the parent table is not protected */ - *table = (unsigned long) s_r2t | _REGION_ENTRY_LENGTH | + *table = s_r2t | _REGION_ENTRY_LENGTH | _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID; if (sg->edat_level >= 1) *table |= (r2t & _REGION_ENTRY_PROTECT); - list_add(&page->lru, &sg->crst_list); if (fake) { /* nothing to protect for fake tables */ *table &= ~_REGION_ENTRY_INVALID; @@ -1798,8 +1515,7 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, spin_lock(&sg->guest_table_lock); if (!rc) { table = gmap_table_walk(sg, saddr, 4); - if (!table || (*table & _REGION_ENTRY_ORIGIN) != - (unsigned long) s_r2t) + if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r2t) rc = -EAGAIN; /* Race with unshadow */ else *table &= ~_REGION_ENTRY_INVALID; @@ -1832,19 +1548,17 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, int fake) { unsigned long raddr, origin, offset, len; - unsigned long *s_r3t, *table; + unsigned long *table; + phys_addr_t s_r3t; struct page *page; int rc; BUG_ON(!gmap_is_shadow(sg)); /* Allocate a shadow region second table */ - page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); + page = gmap_alloc_crst(); if (!page) return -ENOMEM; - page->index = r3t & _REGION_ENTRY_ORIGIN; - if (fake) - page->index |= GMAP_SHADOW_FAKE_TABLE; - s_r3t = (unsigned long *) page_to_phys(page); + s_r3t = page_to_phys(page); /* Install shadow region second table */ spin_lock(&sg->guest_table_lock); table = gmap_table_walk(sg, saddr, 3); /* get region-2 pointer */ @@ -1859,13 +1573,12 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, rc = -EAGAIN; /* Race with shadow */ goto out_free; } - crst_table_init(s_r3t, _REGION3_ENTRY_EMPTY); + crst_table_init(__va(s_r3t), _REGION3_ENTRY_EMPTY); /* mark as invalid as long as the parent table is not protected */ - *table = (unsigned long) s_r3t | _REGION_ENTRY_LENGTH | + *table = s_r3t | _REGION_ENTRY_LENGTH | _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID; if (sg->edat_level >= 1) *table |= (r3t & _REGION_ENTRY_PROTECT); - list_add(&page->lru, &sg->crst_list); if (fake) { /* nothing to protect for fake tables */ *table &= ~_REGION_ENTRY_INVALID; @@ -1882,8 +1595,7 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, spin_lock(&sg->guest_table_lock); if (!rc) { table = 
gmap_table_walk(sg, saddr, 3); - if (!table || (*table & _REGION_ENTRY_ORIGIN) != - (unsigned long) s_r3t) + if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r3t) rc = -EAGAIN; /* Race with unshadow */ else *table &= ~_REGION_ENTRY_INVALID; @@ -1916,19 +1628,17 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, int fake) { unsigned long raddr, origin, offset, len; - unsigned long *s_sgt, *table; + unsigned long *table; + phys_addr_t s_sgt; struct page *page; int rc; BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE)); /* Allocate a shadow segment table */ - page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); + page = gmap_alloc_crst(); if (!page) return -ENOMEM; - page->index = sgt & _REGION_ENTRY_ORIGIN; - if (fake) - page->index |= GMAP_SHADOW_FAKE_TABLE; - s_sgt = (unsigned long *) page_to_phys(page); + s_sgt = page_to_phys(page); /* Install shadow region second table */ spin_lock(&sg->guest_table_lock); table = gmap_table_walk(sg, saddr, 2); /* get region-3 pointer */ @@ -1943,13 +1653,12 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, rc = -EAGAIN; /* Race with shadow */ goto out_free; } - crst_table_init(s_sgt, _SEGMENT_ENTRY_EMPTY); + crst_table_init(__va(s_sgt), _SEGMENT_ENTRY_EMPTY); /* mark as invalid as long as the parent table is not protected */ - *table = (unsigned long) s_sgt | _REGION_ENTRY_LENGTH | + *table = s_sgt | _REGION_ENTRY_LENGTH | _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID; if (sg->edat_level >= 1) *table |= sgt & _REGION_ENTRY_PROTECT; - list_add(&page->lru, &sg->crst_list); if (fake) { /* nothing to protect for fake tables */ *table &= ~_REGION_ENTRY_INVALID; @@ -1966,8 +1675,7 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, spin_lock(&sg->guest_table_lock); if (!rc) { table = gmap_table_walk(sg, saddr, 2); - if (!table || (*table & _REGION_ENTRY_ORIGIN) != - (unsigned long) s_sgt) + if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_sgt) rc = -EAGAIN; /* Race with unshadow */ else *table &= ~_REGION_ENTRY_INVALID; @@ -1983,45 +1691,22 @@ out_free: } EXPORT_SYMBOL_GPL(gmap_shadow_sgt); -/** - * gmap_shadow_pgt_lookup - find a shadow page table - * @sg: pointer to the shadow guest address space structure - * @saddr: the address in the shadow aguest address space - * @pgt: parent gmap address of the page table to get shadowed - * @dat_protection: if the pgtable is marked as protected by dat - * @fake: pgt references contiguous guest memory block, not a pgtable - * - * Returns 0 if the shadow page table was found and -EAGAIN if the page - * table was not found. - * - * Called with sg->mm->mmap_lock in read. 
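
With page->index and the gmap pt_list gone, the origin of the shadowed
guest page table is stashed 16 bits at a time in the ST2 software field
of the first four PGSTEs by gmap_pgste_set_pgt_addr() below. A
hypothetical read-back helper (illustrative; the matching in-tree lookup
is not part of this hunk) would invert that encoding:

	static unsigned long gmap_pgste_get_pgt_addr(unsigned long *pgt)
	{
		unsigned long *pgstes = pgt + _PAGE_ENTRIES;
		unsigned long res;

		res  = (pgstes[0] & PGSTE_ST2_MASK) << 16;	/* address bits 0..15 */
		res |=  pgstes[1] & PGSTE_ST2_MASK;		/* address bits 16..31 */
		res |= (pgstes[2] & PGSTE_ST2_MASK) >> 16;	/* address bits 32..47 */
		res |= (pgstes[3] & PGSTE_ST2_MASK) >> 32;	/* address bits 48..63 */
		return res;
	}
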
- */ -int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr, - unsigned long *pgt, int *dat_protection, - int *fake) +static void gmap_pgste_set_pgt_addr(struct ptdesc *ptdesc, unsigned long pgt_addr) { - unsigned long *table; - struct page *page; - int rc; + unsigned long *pgstes = page_to_virt(ptdesc_page(ptdesc)); - BUG_ON(!gmap_is_shadow(sg)); - spin_lock(&sg->guest_table_lock); - table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */ - if (table && !(*table & _SEGMENT_ENTRY_INVALID)) { - /* Shadow page tables are full pages (pte+pgste) */ - page = pfn_to_page(*table >> PAGE_SHIFT); - *pgt = page->index & ~GMAP_SHADOW_FAKE_TABLE; - *dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT); - *fake = !!(page->index & GMAP_SHADOW_FAKE_TABLE); - rc = 0; - } else { - rc = -EAGAIN; - } - spin_unlock(&sg->guest_table_lock); - return rc; + pgstes += _PAGE_ENTRIES; + + pgstes[0] &= ~PGSTE_ST2_MASK; + pgstes[1] &= ~PGSTE_ST2_MASK; + pgstes[2] &= ~PGSTE_ST2_MASK; + pgstes[3] &= ~PGSTE_ST2_MASK; + pgstes[0] |= (pgt_addr >> 16) & PGSTE_ST2_MASK; + pgstes[1] |= pgt_addr & PGSTE_ST2_MASK; + pgstes[2] |= (pgt_addr << 16) & PGSTE_ST2_MASK; + pgstes[3] |= (pgt_addr << 32) & PGSTE_ST2_MASK; } -EXPORT_SYMBOL_GPL(gmap_shadow_pgt_lookup); /** * gmap_shadow_pgt - instantiate a shadow page table @@ -2040,19 +1725,21 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, int fake) { unsigned long raddr, origin; - unsigned long *s_pgt, *table; - struct page *page; + unsigned long *table; + struct ptdesc *ptdesc; + phys_addr_t s_pgt; int rc; BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE)); /* Allocate a shadow page table */ - page = page_table_alloc_pgste(sg->mm); - if (!page) + ptdesc = page_table_alloc_pgste(sg->mm); + if (!ptdesc) return -ENOMEM; - page->index = pgt & _SEGMENT_ENTRY_ORIGIN; + origin = pgt & _SEGMENT_ENTRY_ORIGIN; if (fake) - page->index |= GMAP_SHADOW_FAKE_TABLE; - s_pgt = (unsigned long *) page_to_phys(page); + origin |= GMAP_SHADOW_FAKE_TABLE; + gmap_pgste_set_pgt_addr(ptdesc, origin); + s_pgt = page_to_phys(ptdesc_page(ptdesc)); /* Install shadow page table */ spin_lock(&sg->guest_table_lock); table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */ @@ -2070,7 +1757,6 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, /* mark as invalid as long as the parent table is not protected */ *table = (unsigned long) s_pgt | _SEGMENT_ENTRY | (pgt & _SEGMENT_ENTRY_PROTECT) | _SEGMENT_ENTRY_INVALID; - list_add(&page->lru, &sg->pt_list); if (fake) { /* nothing to protect for fake tables */ *table &= ~_SEGMENT_ENTRY_INVALID; @@ -2085,8 +1771,7 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, spin_lock(&sg->guest_table_lock); if (!rc) { table = gmap_table_walk(sg, saddr, 1); - if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) != - (unsigned long) s_pgt) + if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) != s_pgt) rc = -EAGAIN; /* Race with unshadow */ else *table &= ~_SEGMENT_ENTRY_INVALID; @@ -2097,7 +1782,7 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, return rc; out_free: spin_unlock(&sg->guest_table_lock); - page_table_free_pgste(page); + page_table_free_pgste(ptdesc); return rc; } @@ -2152,7 +1837,7 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte) tptep = (pte_t *) gmap_table_walk(sg, saddr, 0); if (!tptep) { spin_unlock(&sg->guest_table_lock); - gmap_pte_op_end(ptl); + gmap_pte_op_end(sptep, ptl); radix_tree_preload_end(); 
break; } @@ -2163,7 +1848,7 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte) rmap = NULL; rc = 0; } - gmap_pte_op_end(ptl); + gmap_pte_op_end(sptep, ptl); spin_unlock(&sg->guest_table_lock); } radix_tree_preload_end(); @@ -2249,7 +1934,6 @@ void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, pte_t *pte, unsigned long bits) { unsigned long offset, gaddr = 0; - unsigned long *table; struct gmap *gmap, *sg, *next; offset = ((unsigned long) pte) & (255 * sizeof(pte_t)); @@ -2257,12 +1941,9 @@ void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, rcu_read_lock(); list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { spin_lock(&gmap->guest_table_lock); - table = radix_tree_lookup(&gmap->host_to_guest, - vmaddr >> PMD_SHIFT); - if (table) - gaddr = __gmap_segment_gaddr(table) + offset; + gaddr = host_to_guest_lookup(gmap, vmaddr) + offset; spin_unlock(&gmap->guest_table_lock); - if (!table) + if (!IS_GADDR_VALID(gaddr)) continue; if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) { @@ -2302,13 +1983,11 @@ static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new, gaddr &= HPAGE_MASK; pmdp_notify_gmap(gmap, pmdp, gaddr); new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_GMAP_IN)); - if (MACHINE_HAS_TLB_GUEST) + if (machine_has_tlb_guest()) __pmdp_idte(gaddr, (pmd_t *)pmdp, IDTE_GUEST_ASCE, gmap->asce, IDTE_GLOBAL); - else if (MACHINE_HAS_IDTE) - __pmdp_idte(gaddr, (pmd_t *)pmdp, 0, 0, IDTE_GLOBAL); else - __pmdp_csp(pmdp); + __pmdp_idte(gaddr, (pmd_t *)pmdp, 0, 0, IDTE_GLOBAL); set_pmd(pmdp, new); } @@ -2322,15 +2001,14 @@ static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr, rcu_read_lock(); list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { spin_lock(&gmap->guest_table_lock); - pmdp = (pmd_t *)radix_tree_delete(&gmap->host_to_guest, - vmaddr >> PMD_SHIFT); + pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr); if (pmdp) { - gaddr = __gmap_segment_gaddr((unsigned long *)pmdp); pmdp_notify_gmap(gmap, pmdp, gaddr); WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | - _SEGMENT_ENTRY_GMAP_UC)); + _SEGMENT_ENTRY_GMAP_UC | + _SEGMENT_ENTRY)); if (purge) - __pmdp_csp(pmdp); + __pmdp_cspg(pmdp); set_pmd(pmdp, __pmd(_SEGMENT_ENTRY_EMPTY)); } spin_unlock(&gmap->guest_table_lock); @@ -2351,44 +2029,31 @@ void gmap_pmdp_invalidate(struct mm_struct *mm, unsigned long vmaddr) EXPORT_SYMBOL_GPL(gmap_pmdp_invalidate); /** - * gmap_pmdp_csp - csp all affected guest pmd entries - * @mm: pointer to the process mm_struct - * @vmaddr: virtual address in the process address space - */ -void gmap_pmdp_csp(struct mm_struct *mm, unsigned long vmaddr) -{ - gmap_pmdp_clear(mm, vmaddr, 1); -} -EXPORT_SYMBOL_GPL(gmap_pmdp_csp); - -/** * gmap_pmdp_idte_local - invalidate and clear a guest pmd entry * @mm: pointer to the process mm_struct * @vmaddr: virtual address in the process address space */ void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr) { - unsigned long *entry, gaddr; + unsigned long gaddr; struct gmap *gmap; pmd_t *pmdp; rcu_read_lock(); list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { spin_lock(&gmap->guest_table_lock); - entry = radix_tree_delete(&gmap->host_to_guest, - vmaddr >> PMD_SHIFT); - if (entry) { - pmdp = (pmd_t *)entry; - gaddr = __gmap_segment_gaddr(entry); + pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr); + if (pmdp) { pmdp_notify_gmap(gmap, pmdp, gaddr); - WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | - _SEGMENT_ENTRY_GMAP_UC)); - if 
(MACHINE_HAS_TLB_GUEST) + WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | + _SEGMENT_ENTRY_GMAP_UC | + _SEGMENT_ENTRY)); + if (machine_has_tlb_guest()) __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE, gmap->asce, IDTE_LOCAL); - else if (MACHINE_HAS_IDTE) + else __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_LOCAL); - *entry = _SEGMENT_ENTRY_EMPTY; + *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY); } spin_unlock(&gmap->guest_table_lock); } @@ -2403,29 +2068,25 @@ EXPORT_SYMBOL_GPL(gmap_pmdp_idte_local); */ void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr) { - unsigned long *entry, gaddr; + unsigned long gaddr; struct gmap *gmap; pmd_t *pmdp; rcu_read_lock(); list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { spin_lock(&gmap->guest_table_lock); - entry = radix_tree_delete(&gmap->host_to_guest, - vmaddr >> PMD_SHIFT); - if (entry) { - pmdp = (pmd_t *)entry; - gaddr = __gmap_segment_gaddr(entry); + pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr); + if (pmdp) { pmdp_notify_gmap(gmap, pmdp, gaddr); - WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | - _SEGMENT_ENTRY_GMAP_UC)); - if (MACHINE_HAS_TLB_GUEST) + WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | + _SEGMENT_ENTRY_GMAP_UC | + _SEGMENT_ENTRY)); + if (machine_has_tlb_guest()) __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE, gmap->asce, IDTE_GLOBAL); - else if (MACHINE_HAS_IDTE) - __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_GLOBAL); else - __pmdp_csp(pmdp); - *entry = _SEGMENT_ENTRY_EMPTY; + __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_GLOBAL); + *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY); } spin_unlock(&gmap->guest_table_lock); } @@ -2481,7 +2142,7 @@ void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4], if (!pmdp) return; - if (pmd_large(*pmdp)) { + if (pmd_leaf(*pmdp)) { if (gmap_test_and_clear_dirty_pmd(gmap, pmdp, gaddr)) bitmap_fill(bitmap, _PAGE_ENTRIES); } else { @@ -2491,7 +2152,7 @@ void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4], continue; if (ptep_test_and_clear_uc(gmap->mm, vmaddr, ptep)) set_bit(i, bitmap); - spin_unlock(ptl); + pte_unmap_unlock(ptep, ptl); } } gmap_pmd_op_end(gmap, pmdp); @@ -2510,6 +2171,7 @@ static int thp_split_walk_pmd_entry(pmd_t *pmd, unsigned long addr, static const struct mm_walk_ops thp_split_walk_ops = { .pmd_entry = thp_split_walk_pmd_entry, + .walk_lock = PGWALK_WRLOCK_VERIFY, }; static inline void thp_split_mm(struct mm_struct *mm) @@ -2518,8 +2180,7 @@ static inline void thp_split_mm(struct mm_struct *mm) VMA_ITERATOR(vmi, mm, 0); for_each_vma(vmi, vma) { - vma->vm_flags &= ~VM_HUGEPAGE; - vma->vm_flags |= VM_NOHUGEPAGE; + vm_flags_mod(vma, VM_NOHUGEPAGE, VM_HUGEPAGE); walk_page_vma(vma, &thp_split_walk_ops, NULL); } mm->def_flags |= VM_NOHUGEPAGE; @@ -2531,33 +2192,6 @@ static inline void thp_split_mm(struct mm_struct *mm) #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* - * Remove all empty zero pages from the mapping for lazy refaulting - * - This must be called after mm->context.has_pgste is set, to avoid - * future creation of zero pages - * - This must be called after THP was enabled - */ -static int __zap_zero_pages(pmd_t *pmd, unsigned long start, - unsigned long end, struct mm_walk *walk) -{ - unsigned long addr; - - for (addr = start; addr != end; addr += PAGE_SIZE) { - pte_t *ptep; - spinlock_t *ptl; - - ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); - if (is_zero_pfn(pte_pfn(*ptep))) - ptep_xchg_direct(walk->mm, addr, ptep, __pte(_PAGE_INVALID)); - pte_unmap_unlock(ptep, ptl); - } - return 0; -} - -static const struct 
mm_walk_ops zap_zero_walk_ops = { - .pmd_entry = __zap_zero_pages, -}; - -/* * switch on pgstes for its userspace process (for kvm) */ int s390_enable_sie(void) @@ -2567,37 +2201,15 @@ int s390_enable_sie(void) /* Do we have pgstes? if yes, we are done */ if (mm_has_pgste(mm)) return 0; - /* Fail if the page tables are 2K */ - if (!mm_alloc_pgste(mm)) - return -EINVAL; mmap_write_lock(mm); mm->context.has_pgste = 1; /* split thp mappings and disable thp for future mappings */ thp_split_mm(mm); - walk_page_range(mm, 0, TASK_SIZE, &zap_zero_walk_ops, NULL); mmap_write_unlock(mm); return 0; } EXPORT_SYMBOL_GPL(s390_enable_sie); -int gmap_mark_unmergeable(void) -{ - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - int ret; - VMA_ITERATOR(vmi, mm, 0); - - for_each_vma(vmi, vma) { - ret = ksm_madvise(vma, vma->vm_start, vma->vm_end, - MADV_UNMERGEABLE, &vma->vm_flags); - if (ret) - return ret; - } - mm->def_flags &= ~VM_MERGEABLE; - return 0; -} -EXPORT_SYMBOL_GPL(gmap_mark_unmergeable); - /* * Enable storage key handling from now on and initialize the storage * keys with the default key. @@ -2628,7 +2240,7 @@ static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr, { pmd_t *pmd = (pmd_t *)pte; unsigned long start, end; - struct page *page = pmd_page(*pmd); + struct folio *folio = page_folio(pmd_page(*pmd)); /* * The write check makes sure we do not set a key on shared @@ -2641,9 +2253,9 @@ static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr, return 0; start = pmd_val(*pmd) & HPAGE_MASK; - end = start + HPAGE_SIZE - 1; + end = start + HPAGE_SIZE; __storage_key_init_range(start, end); - set_bit(PG_arch_1, &page->flags); + set_bit(PG_arch_1, &folio->flags.f); cond_resched(); return 0; } @@ -2652,6 +2264,7 @@ static const struct mm_walk_ops enable_skey_walk_ops = { .hugetlb_entry = __s390_enable_skey_hugetlb, .pte_entry = __s390_enable_skey_pte, .pmd_entry = __s390_enable_skey_pmd, + .walk_lock = PGWALK_WRLOCK, }; int s390_enable_skey(void) @@ -2664,7 +2277,7 @@ int s390_enable_skey(void) goto out_up; mm->context.uses_skeys = 1; - rc = gmap_mark_unmergeable(); + rc = gmap_helper_disable_cow_sharing(); if (rc) { mm->context.uses_skeys = 0; goto out_up; @@ -2689,6 +2302,7 @@ static int __s390_reset_cmma(pte_t *pte, unsigned long addr, static const struct mm_walk_ops reset_cmma_walk_ops = { .pte_entry = __s390_reset_cmma, + .walk_lock = PGWALK_WRLOCK, }; void s390_reset_cmma(struct mm_struct *mm) @@ -2725,6 +2339,7 @@ static int s390_gather_pages(pte_t *ptep, unsigned long addr, static const struct mm_walk_ops gather_pages_ops = { .pte_entry = s390_gather_pages, + .walk_lock = PGWALK_RDLOCK, }; /* @@ -2733,13 +2348,15 @@ static const struct mm_walk_ops gather_pages_ops = { */ void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns) { + struct folio *folio; unsigned long i; for (i = 0; i < count; i++) { + folio = pfn_folio(pfns[i]); /* we always have an extra reference */ - uv_destroy_owned_page(pfn_to_phys(pfns[i])); + uv_destroy_folio(folio); /* get rid of the extra reference */ - put_page(pfn_to_page(pfns[i])); + folio_put(folio); cond_resched(); } } @@ -2780,52 +2397,12 @@ int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start, EXPORT_SYMBOL_GPL(__s390_uv_destroy_range); /** - * s390_unlist_old_asce - Remove the topmost level of page tables from the - * list of page tables of the gmap. 
- * @gmap: the gmap whose table is to be removed - * - * On s390x, KVM keeps a list of all pages containing the page tables of the - * gmap (the CRST list). This list is used at tear down time to free all - * pages that are now not needed anymore. - * - * This function removes the topmost page of the tree (the one pointed to by - * the ASCE) from the CRST list. - * - * This means that it will not be freed when the VM is torn down, and needs - * to be handled separately by the caller, unless a leak is actually - * intended. Notice that this function will only remove the page from the - * list, the page will still be used as a top level page table (and ASCE). - */ -void s390_unlist_old_asce(struct gmap *gmap) -{ - struct page *old; - - old = virt_to_page(gmap->table); - spin_lock(&gmap->guest_table_lock); - list_del(&old->lru); - /* - * Sometimes the topmost page might need to be "removed" multiple - * times, for example if the VM is rebooted into secure mode several - * times concurrently, or if s390_replace_asce fails after calling - * s390_remove_old_asce and is attempted again later. In that case - * the old asce has been removed from the list, and therefore it - * will not be freed when the VM terminates, but the ASCE is still - * in use and still pointed to. - * A subsequent call to replace_asce will follow the pointer and try - * to remove the same page from the list again. - * Therefore it's necessary that the page of the ASCE has valid - * pointers, so list_del can work (and do nothing) without - * dereferencing stale or invalid pointers. - */ - INIT_LIST_HEAD(&old->lru); - spin_unlock(&gmap->guest_table_lock); -} -EXPORT_SYMBOL_GPL(s390_unlist_old_asce); - -/** * s390_replace_asce - Try to replace the current ASCE of a gmap with a copy * @gmap: the gmap whose ASCE needs to be replaced * + * If the ASCE is a SEGMENT type then this function will return -EINVAL; + * replacing it would leave the pointers in the host_to_guest radix tree + * pointing to the wrong pages, causing use-after-free and memory corruption. * If the allocation of the new top level page table fails, the ASCE is not * replaced. * In any case, the old ASCE is always removed from the gmap CRST list. @@ -2838,23 +2415,16 @@ int s390_replace_asce(struct gmap *gmap) struct page *page; void *table; - s390_unlist_old_asce(gmap); + /* Replacing segment type ASCEs would cause serious issues */ + if ((gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT) + return -EINVAL; - page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); + page = gmap_alloc_crst(); if (!page) return -ENOMEM; table = page_to_virt(page); memcpy(table, gmap->table, 1UL << (CRST_ALLOC_ORDER + PAGE_SHIFT)); - /* - * The caller has to deal with the old ASCE, but here we make sure - * the new one is properly added to the CRST list, so that - * it will be freed when the VM is torn down. - */ - spin_lock(&gmap->guest_table_lock); - list_add(&page->lru, &gmap->crst_list); - spin_unlock(&gmap->guest_table_lock); - /* Set new table origin while preserving existing ASCE control bits */ asce = (gmap->asce & ~_ASCE_ORIGIN) | __pa(table); WRITE_ONCE(gmap->asce, asce); diff --git a/arch/s390/mm/gmap_helpers.c b/arch/s390/mm/gmap_helpers.c new file mode 100644 index 000000000000..549f14ad08af --- /dev/null +++ b/arch/s390/mm/gmap_helpers.c @@ -0,0 +1,233 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Helper functions for KVM guest address space mapping code + * + * Copyright IBM Corp.
2007, 2025 + */ + +#include <linux/export.h> +#include <linux/mm_types.h> +#include <linux/mmap_lock.h> +#include <linux/mm.h> +#include <linux/hugetlb.h> +#include <linux/swap.h> +#include <linux/leafops.h> +#include <linux/pagewalk.h> +#include <linux/ksm.h> +#include <asm/gmap_helpers.h> +#include <asm/pgtable.h> + +/** + * ptep_zap_softleaf_entry() - discard a software leaf entry. + * @mm: the mm + * @entry: the software leaf entry that needs to be zapped + * + * Discards the given software leaf entry. If the leaf entry was an actual + * swap entry (and not a migration entry, for example), the actual swapped + * page is also discarded from swap. + */ +static void ptep_zap_softleaf_entry(struct mm_struct *mm, softleaf_t entry) +{ + if (softleaf_is_swap(entry)) + dec_mm_counter(mm, MM_SWAPENTS); + else if (softleaf_is_migration(entry)) + dec_mm_counter(mm, mm_counter(softleaf_to_folio(entry))); + free_swap_and_cache(entry); +} + +/** + * gmap_helper_zap_one_page() - discard a page if it was swapped. + * @mm: the mm + * @vmaddr: the userspace virtual address that needs to be discarded + * + * If the given address maps to a swap entry, discard it. + * + * Context: needs to be called while holding the mmap lock. + */ +void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr) +{ + struct vm_area_struct *vma; + spinlock_t *ptl; + pgste_t pgste; + pte_t *ptep; + + mmap_assert_locked(mm); + + /* Find the vm address for the guest address */ + vma = vma_lookup(mm, vmaddr); + if (!vma || is_vm_hugetlb_page(vma)) + return; + + /* Get pointer to the page table entry */ + ptep = get_locked_pte(mm, vmaddr, &ptl); + if (unlikely(!ptep)) + return; + if (pte_swap(*ptep)) { + preempt_disable(); + pgste = pgste_get_lock(ptep); + + ptep_zap_softleaf_entry(mm, softleaf_from_pte(*ptep)); + pte_clear(mm, vmaddr, ptep); + + pgste_set_unlock(ptep, pgste); + preempt_enable(); + } + pte_unmap_unlock(ptep, ptl); +} +EXPORT_SYMBOL_GPL(gmap_helper_zap_one_page); + +/** + * gmap_helper_discard() - discard user pages in the given range + * @mm: the mm + * @vmaddr: starting userspace address + * @end: end address (first address outside the range) + * + * All userspace pages in the range [@vmaddr, @end) are discarded and unmapped. + * + * Context: needs to be called while holding the mmap lock. + */ +void gmap_helper_discard(struct mm_struct *mm, unsigned long vmaddr, unsigned long end) +{ + struct vm_area_struct *vma; + + mmap_assert_locked(mm); + + while (vmaddr < end) { + vma = find_vma_intersection(mm, vmaddr, end); + if (!vma) + return; + if (!is_vm_hugetlb_page(vma)) + zap_page_range_single(vma, vmaddr, min(end, vma->vm_end) - vmaddr, NULL); + vmaddr = vma->vm_end; + } +} +EXPORT_SYMBOL_GPL(gmap_helper_discard); + +static int find_zeropage_pte_entry(pte_t *pte, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + unsigned long *found_addr = walk->private; + + /* Return 1 if the page is a zeropage. */ + if (is_zero_pfn(pte_pfn(*pte))) { + /* + * Shared zeropage in e.g., a FS DAX mapping? We cannot do the + * right thing and likely don't care: FAULT_FLAG_UNSHARE + * currently only works in COW mappings, which is also where + * mm_forbids_zeropage() is checked.
+ */ + if (!is_cow_mapping(walk->vma->vm_flags)) + return -EFAULT; + + *found_addr = addr; + return 1; + } + return 0; +} + +static const struct mm_walk_ops find_zeropage_ops = { + .pte_entry = find_zeropage_pte_entry, + .walk_lock = PGWALK_WRLOCK, +}; + +/** + * __gmap_helper_unshare_zeropages() - unshare all shared zeropages + * @mm: the mm whose zero pages are to be unshared + * + * Unshare all shared zeropages, replacing them by anonymous pages. Note that + * we cannot simply zap all shared zeropages, because this could later + * trigger unexpected userfaultfd missing events. + * + * This must be called after mm->context.allow_cow_sharing was + * set to 0, to avoid future mappings of shared zeropages. + * + * mm contracts with s390 that, even if mm were to remove a page table + * and a racing walk_page_range_vma() calling pte_offset_map_lock() + * would fail, it will never insert a page table containing empty zero + * pages once mm_forbids_zeropage(mm), i.e. + * mm->context.allow_cow_sharing, is set to 0. + */ +static int __gmap_helper_unshare_zeropages(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); + unsigned long addr; + vm_fault_t fault; + int rc; + + for_each_vma(vmi, vma) { + /* + * We could only look at COW mappings, but it's more future + * proof to catch unexpected zeropages in other mappings and + * fail. + */ + if ((vma->vm_flags & VM_PFNMAP) || is_vm_hugetlb_page(vma)) + continue; + addr = vma->vm_start; + +retry: + rc = walk_page_range_vma(vma, addr, vma->vm_end, + &find_zeropage_ops, &addr); + if (rc < 0) + return rc; + else if (!rc) + continue; + + /* addr was updated by find_zeropage_pte_entry() */ + fault = handle_mm_fault(vma, addr, + FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE, + NULL); + if (fault & VM_FAULT_OOM) + return -ENOMEM; + /* + * See break_ksm(): even after handle_mm_fault() returned 0, we + * must start the lookup from the current address, because + * handle_mm_fault() may back out if there's any difficulty. + * + * VM_FAULT_SIGBUS and VM_FAULT_SIGSEGV are unexpected but + * maybe they could trigger in the future on concurrent + * truncation. In that case, the shared zeropage would be gone + * and we can simply retry and make progress. + */ + cond_resched(); + goto retry; + } + + return 0; +} + +/** + * gmap_helper_disable_cow_sharing() - disable all COW sharing + * + * Disable most COW-sharing of memory pages for the whole process: + * (1) Disable KSM and unmerge/unshare any KSM pages. + * (2) Disallow shared zeropages and unshare any zeropages that are mapped. + * + * Note that we currently don't bother with COW-shared pages that are shared + * with parent/child processes due to fork(). + */ +int gmap_helper_disable_cow_sharing(void) +{ + struct mm_struct *mm = current->mm; + int rc; + + mmap_assert_write_locked(mm); + + if (!mm->context.allow_cow_sharing) + return 0; + + mm->context.allow_cow_sharing = 0; + + /* Replace all shared zeropages by anonymous pages. */ + rc = __gmap_helper_unshare_zeropages(mm); + /* + * Make sure to disable KSM (if enabled for the whole process or + * individual VMAs). Note that nothing currently hinders user space + * from re-enabling it.
+ */ + if (!rc) + rc = ksm_disable(mm); + if (rc) + mm->context.allow_cow_sharing = 1; + return rc; +} +EXPORT_SYMBOL_GPL(gmap_helper_disable_cow_sharing); diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index c299a18273ff..d42e61c7594e 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -6,15 +6,15 @@ * Author(s): Gerald Schaefer <gerald.schaefer@de.ibm.com> */ -#define KMSG_COMPONENT "hugetlb" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "hugetlb: " fmt -#include <asm/pgalloc.h> +#include <linux/cpufeature.h> #include <linux/mm.h> #include <linux/hugetlb.h> #include <linux/mman.h> #include <linux/sched/mm.h> #include <linux/security.h> +#include <asm/pgalloc.h> /* * If the bit selected by single-bit bitmask "a" is set within "x", move @@ -24,6 +24,7 @@ static inline unsigned long __pte_to_rste(pte_t pte) { + swp_entry_t arch_entry; unsigned long rste; /* @@ -48,6 +49,7 @@ static inline unsigned long __pte_to_rste(pte_t pte) */ if (pte_present(pte)) { rste = pte_val(pte) & PAGE_MASK; + rste |= _SEGMENT_ENTRY_PRESENT; rste |= move_set_bit(pte_val(pte), _PAGE_READ, _SEGMENT_ENTRY_READ); rste |= move_set_bit(pte_val(pte), _PAGE_WRITE, @@ -66,6 +68,10 @@ static inline unsigned long __pte_to_rste(pte_t pte) #endif rste |= move_set_bit(pte_val(pte), _PAGE_NOEXEC, _SEGMENT_ENTRY_NOEXEC); + } else if (!pte_none(pte)) { + /* swap pte */ + arch_entry = __pte_to_swp_entry(pte); + rste = mk_swap_rste(__swp_type(arch_entry), __swp_offset(arch_entry)); } else rste = _SEGMENT_ENTRY_EMPTY; return rste; @@ -73,13 +79,18 @@ static inline unsigned long __pte_to_rste(pte_t pte) static inline pte_t __rste_to_pte(unsigned long rste) { + swp_entry_t arch_entry; unsigned long pteval; - int present; + int present, none; + pte_t pte; - if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) + if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) { present = pud_present(__pud(rste)); - else + none = pud_none(__pud(rste)); + } else { present = pmd_present(__pmd(rste)); + none = pmd_none(__pmd(rste)); + } /* * Convert encoding pmd / pud bits pte bits @@ -114,6 +125,11 @@ static inline pte_t __rste_to_pte(unsigned long rste) pteval |= move_set_bit(rste, _SEGMENT_ENTRY_SOFT_DIRTY, _PAGE_SOFT_DIRTY); #endif pteval |= move_set_bit(rste, _SEGMENT_ENTRY_NOEXEC, _PAGE_NOEXEC); + } else if (!none) { + /* swap rste */ + arch_entry = __rste_to_swp_entry(rste); + pte = mk_swap_pte(__swp_type_rste(arch_entry), __swp_offset_rste(arch_entry)); + pteval = pte_val(pte); } else pteval = _PAGE_INVALID; return __pte(pteval); @@ -121,7 +137,7 @@ static inline pte_t __rste_to_pte(unsigned long rste) static void clear_huge_pte_skeys(struct mm_struct *mm, unsigned long rste) { - struct page *page; + struct folio *folio; unsigned long size, paddr; if (!mm_uses_skeys(mm) || @@ -129,27 +145,25 @@ static void clear_huge_pte_skeys(struct mm_struct *mm, unsigned long rste) return; if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) { - page = pud_page(__pud(rste)); + folio = page_folio(pud_page(__pud(rste))); size = PUD_SIZE; paddr = rste & PUD_MASK; } else { - page = pmd_page(__pmd(rste)); + folio = page_folio(pmd_page(__pmd(rste))); size = PMD_SIZE; paddr = rste & PMD_MASK; } - if (!test_and_set_bit(PG_arch_1, &page->flags)) - __storage_key_init_range(paddr, paddr + size - 1); + if (!test_and_set_bit(PG_arch_1, &folio->flags.f)) + __storage_key_init_range(paddr, paddr + size); } -void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, +void 
__set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { unsigned long rste; rste = __pte_to_rste(pte); - if (!MACHINE_HAS_NX) - rste &= ~_SEGMENT_ENTRY_NOEXEC; /* Set correct table type for 2G hugepages */ if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) { @@ -163,15 +177,21 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, set_pte(ptep, __pte(rste)); } -pte_t huge_ptep_get(pte_t *ptep) +void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned long sz) +{ + __set_huge_pte_at(mm, addr, ptep, pte); +} + +pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { return __rste_to_pte(pte_val(*ptep)); } -pte_t huge_ptep_get_and_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) +pte_t __huge_ptep_get_and_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) { - pte_t pte = huge_ptep_get(ptep); + pte_t pte = huge_ptep_get(mm, addr, ptep); pmd_t *pmdp = (pmd_t *) ptep; pud_t *pudp = (pud_t *) ptep; @@ -217,120 +237,21 @@ pte_t *huge_pte_offset(struct mm_struct *mm, p4dp = p4d_offset(pgdp, addr); if (p4d_present(*p4dp)) { pudp = pud_offset(p4dp, addr); - if (pud_present(*pudp)) { - if (pud_large(*pudp)) - return (pte_t *) pudp; + if (sz == PUD_SIZE) + return (pte_t *)pudp; + if (pud_present(*pudp)) pmdp = pmd_offset(pudp, addr); - } } } return (pte_t *) pmdp; } -int pmd_huge(pmd_t pmd) -{ - return pmd_large(pmd); -} - -int pud_huge(pud_t pud) -{ - return pud_large(pud); -} - bool __init arch_hugetlb_valid_size(unsigned long size) { - if (MACHINE_HAS_EDAT1 && size == PMD_SIZE) + if (cpu_has_edat1() && size == PMD_SIZE) return true; - else if (MACHINE_HAS_EDAT2 && size == PUD_SIZE) + else if (cpu_has_edat2() && size == PUD_SIZE) return true; else return false; } - -static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, - unsigned long addr, unsigned long len, - unsigned long pgoff, unsigned long flags) -{ - struct hstate *h = hstate_file(file); - struct vm_unmapped_area_info info; - - info.flags = 0; - info.length = len; - info.low_limit = current->mm->mmap_base; - info.high_limit = TASK_SIZE; - info.align_mask = PAGE_MASK & ~huge_page_mask(h); - info.align_offset = 0; - return vm_unmapped_area(&info); -} - -static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, - unsigned long addr0, unsigned long len, - unsigned long pgoff, unsigned long flags) -{ - struct hstate *h = hstate_file(file); - struct vm_unmapped_area_info info; - unsigned long addr; - - info.flags = VM_UNMAPPED_AREA_TOPDOWN; - info.length = len; - info.low_limit = max(PAGE_SIZE, mmap_min_addr); - info.high_limit = current->mm->mmap_base; - info.align_mask = PAGE_MASK & ~huge_page_mask(h); - info.align_offset = 0; - addr = vm_unmapped_area(&info); - - /* - * A failed mmap() very likely causes application failure, - * so fall back to the bottom-up function here. This scenario - * can happen with large stack limits and large mmap() - * allocations. 
- */ - if (addr & ~PAGE_MASK) { - VM_BUG_ON(addr != -ENOMEM); - info.flags = 0; - info.low_limit = TASK_UNMAPPED_BASE; - info.high_limit = TASK_SIZE; - addr = vm_unmapped_area(&info); - } - - return addr; -} - -unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) -{ - struct hstate *h = hstate_file(file); - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - - if (len & ~huge_page_mask(h)) - return -EINVAL; - if (len > TASK_SIZE - mmap_min_addr) - return -ENOMEM; - - if (flags & MAP_FIXED) { - if (prepare_hugepage_range(file, addr, len)) - return -EINVAL; - goto check_asce_limit; - } - - if (addr) { - addr = ALIGN(addr, huge_page_size(h)); - vma = find_vma(mm, addr); - if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && - (!vma || addr + len <= vm_start_gap(vma))) - goto check_asce_limit; - } - - if (mm->get_unmapped_area == arch_get_unmapped_area) - addr = hugetlb_get_unmapped_area_bottomup(file, addr, len, - pgoff, flags); - else - addr = hugetlb_get_unmapped_area_topdown(file, addr, len, - pgoff, flags); - if (offset_in_page(addr)) - return addr; - -check_asce_limit: - return check_asce_limit(mm, addr, len); -} diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 97d66a3e60fb..e4953453d254 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -8,6 +8,7 @@ * Copyright (C) 1995 Linus Torvalds */ +#include <linux/cpufeature.h> #include <linux/signal.h> #include <linux/sched.h> #include <linux/kernel.h> @@ -31,17 +32,16 @@ #include <linux/cma.h> #include <linux/gfp.h> #include <linux/dma-direct.h> +#include <linux/percpu.h> #include <asm/processor.h> #include <linux/uaccess.h> #include <asm/pgalloc.h> +#include <asm/ctlreg.h> #include <asm/kfence.h> -#include <asm/ptdump.h> #include <asm/dma.h> #include <asm/abs_lowcore.h> -#include <asm/tlb.h> #include <asm/tlbflush.h> #include <asm/sections.h> -#include <asm/ctl_reg.h> #include <asm/sclp.h> #include <asm/set_memory.h> #include <asm/kasan.h> @@ -49,11 +49,21 @@ #include <asm/uv.h> #include <linux/virtio_anchor.h> #include <linux/virtio_config.h> +#include <linux/execmem.h> pgd_t swapper_pg_dir[PTRS_PER_PGD] __section(".bss..swapper_pg_dir"); -static pgd_t invalid_pg_dir[PTRS_PER_PGD] __section(".bss..invalid_pg_dir"); +pgd_t invalid_pg_dir[PTRS_PER_PGD] __section(".bss..invalid_pg_dir"); -unsigned long s390_invalid_asce; +struct ctlreg __bootdata_preserved(s390_invalid_asce); + +unsigned long __bootdata_preserved(page_noexec_mask); +EXPORT_SYMBOL(page_noexec_mask); + +unsigned long __bootdata_preserved(segment_noexec_mask); +EXPORT_SYMBOL(segment_noexec_mask); + +unsigned long __bootdata_preserved(region_noexec_mask); +EXPORT_SYMBOL(region_noexec_mask); unsigned long empty_zero_page, zero_page_mask; EXPORT_SYMBOL(empty_zero_page); @@ -61,27 +71,17 @@ EXPORT_SYMBOL(zero_page_mask); static void __init setup_zero_pages(void) { + unsigned long total_pages = memblock_estimated_nr_free_pages(); unsigned int order; - struct page *page; - int i; /* Latest machines require a mapping granularity of 512KB */ order = 7; /* Limit number of empty zero pages for small memory sizes */ - while (order > 2 && (totalram_pages() >> 10) < (1UL << order)) + while (order > 2 && (total_pages >> 10) < (1UL << order)) order--; - empty_zero_page = __get_free_pages(GFP_KERNEL | __GFP_ZERO, order); - if (!empty_zero_page) - panic("Out of memory in setup_zero_pages"); - - page = virt_to_page((void *) empty_zero_page); - split_page(page, order); - 
for (i = 1 << order; i > 0; i--) { - mark_page_reserved(page); - page++; - } + empty_zero_page = (unsigned long)memblock_alloc_or_panic(PAGE_SIZE << order, PAGE_SIZE); zero_page_mask = ((PAGE_SIZE << order) - 1) & PAGE_MASK; } @@ -92,41 +92,12 @@ static void __init setup_zero_pages(void) void __init paging_init(void) { unsigned long max_zone_pfns[MAX_NR_ZONES]; - unsigned long pgd_type, asce_bits; - psw_t psw; - - s390_invalid_asce = (unsigned long)invalid_pg_dir; - s390_invalid_asce |= _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH; - crst_table_init((unsigned long *)invalid_pg_dir, _REGION3_ENTRY_EMPTY); - init_mm.pgd = swapper_pg_dir; - if (VMALLOC_END > _REGION2_SIZE) { - asce_bits = _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH; - pgd_type = _REGION2_ENTRY_EMPTY; - } else { - asce_bits = _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH; - pgd_type = _REGION3_ENTRY_EMPTY; - } - init_mm.context.asce = (__pa(init_mm.pgd) & PAGE_MASK) | asce_bits; - S390_lowcore.kernel_asce = init_mm.context.asce; - S390_lowcore.user_asce = s390_invalid_asce; - crst_table_init((unsigned long *) init_mm.pgd, pgd_type); - vmem_map_init(); - kasan_copy_shadow_mapping(); - - /* enable virtual mapping in kernel mode */ - __ctl_load(S390_lowcore.kernel_asce, 1, 1); - __ctl_load(S390_lowcore.user_asce, 7, 7); - __ctl_load(S390_lowcore.kernel_asce, 13, 13); - psw.mask = __extract_psw(); - psw_bits(psw).dat = 1; - psw_bits(psw).as = PSW_BITS_AS_HOME; - __load_psw_mask(psw.mask); - kasan_free_early_identity(); + vmem_map_init(); sparse_init(); - zone_dma_bits = 31; + zone_dma_limit = DMA_BIT_MASK(31); memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); - max_zone_pfns[ZONE_DMA] = PFN_DOWN(MAX_DMA_ADDRESS); + max_zone_pfns[ZONE_DMA] = virt_to_pfn(MAX_DMA_ADDRESS); max_zone_pfns[ZONE_NORMAL] = max_low_pfn; free_area_init(max_zone_pfns); } @@ -135,30 +106,31 @@ void mark_rodata_ro(void) { unsigned long size = __end_ro_after_init - __start_ro_after_init; - set_memory_ro((unsigned long)__start_ro_after_init, size >> PAGE_SHIFT); + if (cpu_has_nx()) + system_ctl_set_bit(0, CR0_INSTRUCTION_EXEC_PROTECTION_BIT); + __set_memory_ro(__start_ro_after_init, __end_ro_after_init); pr_info("Write protected read-only-after-init data: %luk\n", size >> 10); - debug_checkwx(); } -int set_memory_encrypted(unsigned long addr, int numpages) +int set_memory_encrypted(unsigned long vaddr, int numpages) { int i; /* make specified pages unshared (swiotlb, dma_free) */ for (i = 0; i < numpages; ++i) { - uv_remove_shared(addr); - addr += PAGE_SIZE; + uv_remove_shared(virt_to_phys((void *)vaddr)); + vaddr += PAGE_SIZE; } return 0; } -int set_memory_decrypted(unsigned long addr, int numpages) +int set_memory_decrypted(unsigned long vaddr, int numpages) { int i; /* make specified pages shared (swiotlb, dma_alloc) */ for (i = 0; i < numpages; ++i) { - uv_set_shared(addr); - addr += PAGE_SIZE; + uv_set_shared(virt_to_phys((void *)vaddr)); + vaddr += PAGE_SIZE; } return 0; } @@ -170,7 +142,7 @@ bool force_dma_unencrypted(struct device *dev) } /* protected virtualization */ -static void pv_init(void) +static void __init pv_init(void) { if (!is_prot_virt_guest()) return; @@ -182,35 +154,14 @@ static void pv_init(void) swiotlb_update_mem_attributes(); } -void __init mem_init(void) +void __init arch_mm_preinit(void) { cpumask_set_cpu(0, &init_mm.context.cpu_attach_mask); cpumask_set_cpu(0, mm_cpumask(&init_mm)); - set_max_mapnr(max_low_pfn); - high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); - pv_init(); - kfence_split_mapping(); - /* Setup guest page hinting */ -
cmma_init(); - /* this will put all low memory onto the freelists */ - memblock_free_all(); setup_zero_pages(); /* Setup zeroed pages. */ - - cmma_init_nodat(); -} - -void free_initmem(void) -{ - __set_memory((unsigned long)_sinittext, - (unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT, - SET_MEMORY_RW | SET_MEMORY_NX); - free_reserved_area(sclp_early_sccb, - sclp_early_sccb + EXT_SCCB_READ_SCP, - POISON_FREE_INITMEM, "unused early sccb"); - free_initmem_default(POISON_FREE_INITMEM); } unsigned long memory_block_size_bytes(void) @@ -222,6 +173,41 @@ unsigned long memory_block_size_bytes(void) return max_t(unsigned long, MIN_MEMORY_BLOCK_SIZE, sclp.rzm); } +unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; +EXPORT_SYMBOL(__per_cpu_offset); + +static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) +{ + return LOCAL_DISTANCE; +} + +static int __init pcpu_cpu_to_node(int cpu) +{ + return 0; +} + +void __init setup_per_cpu_areas(void) +{ + unsigned long delta; + unsigned int cpu; + int rc; + + /* + * Always reserve area for module percpu variables. That's + * what the legacy allocator did. + */ + rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, + PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, + pcpu_cpu_distance, + pcpu_cpu_to_node); + if (rc < 0) + panic("Failed to initialize percpu areas."); + + delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; + for_each_possible_cpu(cpu) + __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu]; +} + #ifdef CONFIG_MEMORY_HOTPLUG #ifdef CONFIG_CMA @@ -236,16 +222,13 @@ struct s390_cma_mem_data { static int s390_cma_check_range(struct cma *cma, void *data) { struct s390_cma_mem_data *mem_data; - unsigned long start, end; mem_data = data; - start = cma_get_base(cma); - end = start + cma_get_size(cma); - if (end < mem_data->start) - return 0; - if (start >= mem_data->end) - return 0; - return -EBUSY; + + if (cma_intersects(cma, mem_data->start, mem_data->end)) + return -EBUSY; + + return 0; } static int s390_cma_mem_notifier(struct notifier_block *nb, @@ -282,10 +265,7 @@ int arch_add_memory(int nid, u64 start, u64 size, unsigned long size_pages = PFN_DOWN(size); int rc; - if (WARN_ON_ONCE(params->altmap)) - return -EINVAL; - - if (WARN_ON_ONCE(params->pgprot.pgprot != PAGE_KERNEL.pgprot)) + if (WARN_ON_ONCE(pgprot_val(params->pgprot) != pgprot_val(PAGE_KERNEL))) return -EINVAL; VM_BUG_ON(!mhp_range_allowed(start, size, true)); @@ -308,3 +288,32 @@ void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) vmem_remove_mapping(start, size); } #endif /* CONFIG_MEMORY_HOTPLUG */ + +#ifdef CONFIG_EXECMEM +static struct execmem_info execmem_info __ro_after_init; + +struct execmem_info __init *execmem_arch_setup(void) +{ + unsigned long module_load_offset = 0; + unsigned long start; + + if (kaslr_enabled()) + module_load_offset = get_random_u32_inclusive(1, 1024) * PAGE_SIZE; + + start = MODULES_VADDR + module_load_offset; + + execmem_info = (struct execmem_info){ + .ranges = { + [EXECMEM_DEFAULT] = { + .flags = EXECMEM_KASAN_SHADOW, + .start = start, + .end = MODULES_END, + .pgprot = PAGE_KERNEL, + .alignment = MODULE_ALIGN, + }, + }, + }; + + return &execmem_info; +} +#endif /* CONFIG_EXECMEM */ diff --git a/arch/s390/mm/kasan_init.c b/arch/s390/mm/kasan_init.c deleted file mode 100644 index 9f988d4582ed..000000000000 --- a/arch/s390/mm/kasan_init.c +++ /dev/null @@ -1,403 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/kasan.h> -#include <linux/sched/task.h> -#include <linux/memblock.h> -#include 
<linux/pgtable.h> -#include <asm/pgalloc.h> -#include <asm/kasan.h> -#include <asm/mem_detect.h> -#include <asm/processor.h> -#include <asm/sclp.h> -#include <asm/facility.h> -#include <asm/sections.h> -#include <asm/setup.h> -#include <asm/uv.h> - -static unsigned long segment_pos __initdata; -static unsigned long segment_low __initdata; -static unsigned long pgalloc_pos __initdata; -static unsigned long pgalloc_low __initdata; -static unsigned long pgalloc_freeable __initdata; -static bool has_edat __initdata; -static bool has_nx __initdata; - -#define __sha(x) ((unsigned long)kasan_mem_to_shadow((void *)x)) - -static pgd_t early_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE); - -static void __init kasan_early_panic(const char *reason) -{ - sclp_early_printk("The Linux kernel failed to boot with the KernelAddressSanitizer:\n"); - sclp_early_printk(reason); - disabled_wait(); -} - -static void * __init kasan_early_alloc_segment(void) -{ - segment_pos -= _SEGMENT_SIZE; - - if (segment_pos < segment_low) - kasan_early_panic("out of memory during initialisation\n"); - - return (void *)segment_pos; -} - -static void * __init kasan_early_alloc_pages(unsigned int order) -{ - pgalloc_pos -= (PAGE_SIZE << order); - - if (pgalloc_pos < pgalloc_low) - kasan_early_panic("out of memory during initialisation\n"); - - return (void *)pgalloc_pos; -} - -static void * __init kasan_early_crst_alloc(unsigned long val) -{ - unsigned long *table; - - table = kasan_early_alloc_pages(CRST_ALLOC_ORDER); - if (table) - crst_table_init(table, val); - return table; -} - -static pte_t * __init kasan_early_pte_alloc(void) -{ - static void *pte_leftover; - pte_t *pte; - - BUILD_BUG_ON(_PAGE_TABLE_SIZE * 2 != PAGE_SIZE); - - if (!pte_leftover) { - pte_leftover = kasan_early_alloc_pages(0); - pte = pte_leftover + _PAGE_TABLE_SIZE; - } else { - pte = pte_leftover; - pte_leftover = NULL; - } - memset64((u64 *)pte, _PAGE_INVALID, PTRS_PER_PTE); - return pte; -} - -enum populate_mode { - POPULATE_ONE2ONE, - POPULATE_MAP, - POPULATE_ZERO_SHADOW, - POPULATE_SHALLOW -}; -static void __init kasan_early_pgtable_populate(unsigned long address, - unsigned long end, - enum populate_mode mode) -{ - unsigned long pgt_prot_zero, pgt_prot, sgt_prot; - pgd_t *pg_dir; - p4d_t *p4_dir; - pud_t *pu_dir; - pmd_t *pm_dir; - pte_t *pt_dir; - - pgt_prot_zero = pgprot_val(PAGE_KERNEL_RO); - if (!has_nx) - pgt_prot_zero &= ~_PAGE_NOEXEC; - pgt_prot = pgprot_val(PAGE_KERNEL); - sgt_prot = pgprot_val(SEGMENT_KERNEL); - if (!has_nx || mode == POPULATE_ONE2ONE) { - pgt_prot &= ~_PAGE_NOEXEC; - sgt_prot &= ~_SEGMENT_ENTRY_NOEXEC; - } - - /* - * The first 1MB of 1:1 mapping is mapped with 4KB pages - */ - while (address < end) { - pg_dir = pgd_offset_k(address); - if (pgd_none(*pg_dir)) { - if (mode == POPULATE_ZERO_SHADOW && - IS_ALIGNED(address, PGDIR_SIZE) && - end - address >= PGDIR_SIZE) { - pgd_populate(&init_mm, pg_dir, - kasan_early_shadow_p4d); - address = (address + PGDIR_SIZE) & PGDIR_MASK; - continue; - } - p4_dir = kasan_early_crst_alloc(_REGION2_ENTRY_EMPTY); - pgd_populate(&init_mm, pg_dir, p4_dir); - } - - if (mode == POPULATE_SHALLOW) { - address = (address + P4D_SIZE) & P4D_MASK; - continue; - } - - p4_dir = p4d_offset(pg_dir, address); - if (p4d_none(*p4_dir)) { - if (mode == POPULATE_ZERO_SHADOW && - IS_ALIGNED(address, P4D_SIZE) && - end - address >= P4D_SIZE) { - p4d_populate(&init_mm, p4_dir, - kasan_early_shadow_pud); - address = (address + P4D_SIZE) & P4D_MASK; - continue; - } - pu_dir = 
kasan_early_crst_alloc(_REGION3_ENTRY_EMPTY); - p4d_populate(&init_mm, p4_dir, pu_dir); - } - - pu_dir = pud_offset(p4_dir, address); - if (pud_none(*pu_dir)) { - if (mode == POPULATE_ZERO_SHADOW && - IS_ALIGNED(address, PUD_SIZE) && - end - address >= PUD_SIZE) { - pud_populate(&init_mm, pu_dir, - kasan_early_shadow_pmd); - address = (address + PUD_SIZE) & PUD_MASK; - continue; - } - pm_dir = kasan_early_crst_alloc(_SEGMENT_ENTRY_EMPTY); - pud_populate(&init_mm, pu_dir, pm_dir); - } - - pm_dir = pmd_offset(pu_dir, address); - if (pmd_none(*pm_dir)) { - if (IS_ALIGNED(address, PMD_SIZE) && - end - address >= PMD_SIZE) { - if (mode == POPULATE_ZERO_SHADOW) { - pmd_populate(&init_mm, pm_dir, kasan_early_shadow_pte); - address = (address + PMD_SIZE) & PMD_MASK; - continue; - } else if (has_edat && address) { - void *page; - - if (mode == POPULATE_ONE2ONE) { - page = (void *)address; - } else { - page = kasan_early_alloc_segment(); - memset(page, 0, _SEGMENT_SIZE); - } - set_pmd(pm_dir, __pmd(__pa(page) | sgt_prot)); - address = (address + PMD_SIZE) & PMD_MASK; - continue; - } - } - pt_dir = kasan_early_pte_alloc(); - pmd_populate(&init_mm, pm_dir, pt_dir); - } else if (pmd_large(*pm_dir)) { - address = (address + PMD_SIZE) & PMD_MASK; - continue; - } - - pt_dir = pte_offset_kernel(pm_dir, address); - if (pte_none(*pt_dir)) { - void *page; - - switch (mode) { - case POPULATE_ONE2ONE: - page = (void *)address; - set_pte(pt_dir, __pte(__pa(page) | pgt_prot)); - break; - case POPULATE_MAP: - page = kasan_early_alloc_pages(0); - memset(page, 0, PAGE_SIZE); - set_pte(pt_dir, __pte(__pa(page) | pgt_prot)); - break; - case POPULATE_ZERO_SHADOW: - page = kasan_early_shadow_page; - set_pte(pt_dir, __pte(__pa(page) | pgt_prot_zero)); - break; - case POPULATE_SHALLOW: - /* should never happen */ - break; - } - } - address += PAGE_SIZE; - } -} - -static void __init kasan_set_pgd(pgd_t *pgd, unsigned long asce_type) -{ - unsigned long asce_bits; - - asce_bits = asce_type | _ASCE_TABLE_LENGTH; - S390_lowcore.kernel_asce = (__pa(pgd) & PAGE_MASK) | asce_bits; - S390_lowcore.user_asce = S390_lowcore.kernel_asce; - - __ctl_load(S390_lowcore.kernel_asce, 1, 1); - __ctl_load(S390_lowcore.kernel_asce, 7, 7); - __ctl_load(S390_lowcore.kernel_asce, 13, 13); -} - -static void __init kasan_enable_dat(void) -{ - psw_t psw; - - psw.mask = __extract_psw(); - psw_bits(psw).dat = 1; - psw_bits(psw).as = PSW_BITS_AS_HOME; - __load_psw_mask(psw.mask); -} - -static void __init kasan_early_detect_facilities(void) -{ - if (test_facility(8)) { - has_edat = true; - __ctl_set_bit(0, 23); - } - if (!noexec_disabled && test_facility(130)) { - has_nx = true; - __ctl_set_bit(0, 20); - } -} - -void __init kasan_early_init(void) -{ - unsigned long shadow_alloc_size; - unsigned long initrd_end; - unsigned long memsize; - unsigned long pgt_prot = pgprot_val(PAGE_KERNEL_RO); - pte_t pte_z; - pmd_t pmd_z = __pmd(__pa(kasan_early_shadow_pte) | _SEGMENT_ENTRY); - pud_t pud_z = __pud(__pa(kasan_early_shadow_pmd) | _REGION3_ENTRY); - p4d_t p4d_z = __p4d(__pa(kasan_early_shadow_pud) | _REGION2_ENTRY); - - kasan_early_detect_facilities(); - if (!has_nx) - pgt_prot &= ~_PAGE_NOEXEC; - pte_z = __pte(__pa(kasan_early_shadow_page) | pgt_prot); - - memsize = get_mem_detect_end(); - if (!memsize) - kasan_early_panic("cannot detect physical memory size\n"); - /* - * Kasan currently supports standby memory but only if it follows - * online memory (default allocation), i.e. no memory holes. 
- * - memsize represents end of online memory - * - ident_map_size represents online + standby and memory limits - * accounted. - * Kasan maps "memsize" right away. - * [0, memsize] - as identity mapping - * [__sha(0), __sha(memsize)] - shadow memory for identity mapping - * The rest [memsize, ident_map_size] if memsize < ident_map_size - * could be mapped/unmapped dynamically later during memory hotplug. - */ - memsize = min(memsize, ident_map_size); - - BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_START, P4D_SIZE)); - BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END, P4D_SIZE)); - crst_table_init((unsigned long *)early_pg_dir, _REGION2_ENTRY_EMPTY); - - /* init kasan zero shadow */ - crst_table_init((unsigned long *)kasan_early_shadow_p4d, - p4d_val(p4d_z)); - crst_table_init((unsigned long *)kasan_early_shadow_pud, - pud_val(pud_z)); - crst_table_init((unsigned long *)kasan_early_shadow_pmd, - pmd_val(pmd_z)); - memset64((u64 *)kasan_early_shadow_pte, pte_val(pte_z), PTRS_PER_PTE); - - shadow_alloc_size = memsize >> KASAN_SHADOW_SCALE_SHIFT; - pgalloc_low = round_up((unsigned long)_end, _SEGMENT_SIZE); - if (IS_ENABLED(CONFIG_BLK_DEV_INITRD)) { - initrd_end = - round_up(initrd_data.start + initrd_data.size, _SEGMENT_SIZE); - pgalloc_low = max(pgalloc_low, initrd_end); - } - - if (pgalloc_low + shadow_alloc_size > memsize) - kasan_early_panic("out of memory during initialisation\n"); - - if (has_edat) { - segment_pos = round_down(memsize, _SEGMENT_SIZE); - segment_low = segment_pos - shadow_alloc_size; - pgalloc_pos = segment_low; - } else { - pgalloc_pos = memsize; - } - init_mm.pgd = early_pg_dir; - /* - * Current memory layout: - * +- 0 -------------+ +- shadow start -+ - * | 1:1 ram mapping | /| 1/8 ram | - * | | / | | - * +- end of ram ----+ / +----------------+ - * | ... gap ... | / | | - * | |/ | kasan | - * +- shadow start --+ | zero | - * | 1/8 addr space | | page | - * +- shadow end -+ | mapping | - * | ... gap ... |\ | (untracked) | - * +- vmalloc area -+ \ | | - * | vmalloc_size | \ | | - * +- modules vaddr -+ \ +----------------+ - * | 2Gb | \| unmapped | allocated per module - * +-----------------+ +- shadow end ---+ - * - * Current memory layout (KASAN_VMALLOC): - * +- 0 -------------+ +- shadow start -+ - * | 1:1 ram mapping | /| 1/8 ram | - * | | / | | - * +- end of ram ----+ / +----------------+ - * | ... gap ... | / | kasan | - * | |/ | zero | - * +- shadow start --+ | page | - * | 1/8 addr space | | mapping | - * +- shadow end -+ | (untracked) | - * | ... gap ... |\ | | - * +- vmalloc area -+ \ +- vmalloc area -+ - * | vmalloc_size | \ |shallow populate| - * +- modules vaddr -+ \ +- modules area -+ - * | 2Gb | \|shallow populate| - * +-----------------+ +- shadow end ---+ - */ - /* populate kasan shadow (for identity mapping and zero page mapping) */ - kasan_early_pgtable_populate(__sha(0), __sha(memsize), POPULATE_MAP); - if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) { - /* shallowly populate kasan shadow for vmalloc and modules */ - kasan_early_pgtable_populate(__sha(VMALLOC_START), __sha(MODULES_END), - POPULATE_SHALLOW); - } - /* populate kasan shadow for untracked memory */ - kasan_early_pgtable_populate(__sha(ident_map_size), - IS_ENABLED(CONFIG_KASAN_VMALLOC) ? 
- __sha(VMALLOC_START) : - __sha(MODULES_VADDR), - POPULATE_ZERO_SHADOW); - kasan_early_pgtable_populate(__sha(MODULES_END), __sha(_REGION1_SIZE), - POPULATE_ZERO_SHADOW); - /* memory allocated for identity mapping structs will be freed later */ - pgalloc_freeable = pgalloc_pos; - /* populate identity mapping */ - kasan_early_pgtable_populate(0, memsize, POPULATE_ONE2ONE); - kasan_set_pgd(early_pg_dir, _ASCE_TYPE_REGION2); - kasan_enable_dat(); - /* enable kasan */ - init_task.kasan_depth = 0; - memblock_reserve(pgalloc_pos, memsize - pgalloc_pos); - sclp_early_printk("KernelAddressSanitizer initialized\n"); -} - -void __init kasan_copy_shadow_mapping(void) -{ - /* - * At this point we are still running on early pages setup early_pg_dir, - * while swapper_pg_dir has just been initialized with identity mapping. - * Carry over shadow memory region from early_pg_dir to swapper_pg_dir. - */ - - pgd_t *pg_dir_src; - pgd_t *pg_dir_dst; - p4d_t *p4_dir_src; - p4d_t *p4_dir_dst; - - pg_dir_src = pgd_offset_raw(early_pg_dir, KASAN_SHADOW_START); - pg_dir_dst = pgd_offset_raw(init_mm.pgd, KASAN_SHADOW_START); - p4_dir_src = p4d_offset(pg_dir_src, KASAN_SHADOW_START); - p4_dir_dst = p4d_offset(pg_dir_dst, KASAN_SHADOW_START); - memcpy(p4_dir_dst, p4_dir_src, - (KASAN_SHADOW_SIZE >> P4D_SHIFT) * sizeof(p4d_t)); -} - -void __init kasan_free_early_identity(void) -{ - memblock_phys_free(pgalloc_pos, pgalloc_freeable - pgalloc_pos); -} diff --git a/arch/s390/mm/maccess.c b/arch/s390/mm/maccess.c index 1571cdcb0c50..cfd219fe495c 100644 --- a/arch/s390/mm/maccess.c +++ b/arch/s390/mm/maccess.c @@ -13,15 +13,16 @@ #include <linux/gfp.h> #include <linux/cpu.h> #include <linux/uio.h> +#include <linux/io.h> #include <asm/asm-extable.h> -#include <asm/ctl_reg.h> -#include <asm/io.h> #include <asm/abs_lowcore.h> #include <asm/stacktrace.h> +#include <asm/sections.h> #include <asm/maccess.h> +#include <asm/ctlreg.h> unsigned long __bootdata_preserved(__memcpy_real_area); -static __ro_after_init pte_t *memcpy_real_ptep; +pte_t *__bootdata_preserved(memcpy_real_ptep); static DEFINE_MUTEX(memcpy_real_mutex); static notrace long s390_kernel_write_odd(void *dst, const void *src, size_t size) @@ -40,7 +41,7 @@ static notrace long s390_kernel_write_odd(void *dst, const void *src, size_t siz " ex %1,0(1)\n" " lg %1,0(%3)\n" " lra %0,0(%0)\n" - " sturg %1,%0\n" + " sturg %1,%0" : "+&a" (aligned), "+&a" (count), "=m" (tmp) : "a" (&tmp), "a" (&tmp[offset]), "a" (src) : "cc", "memory", "1"); @@ -48,7 +49,7 @@ static notrace long s390_kernel_write_odd(void *dst, const void *src, size_t siz } /* - * s390_kernel_write - write to kernel memory bypassing DAT + * __s390_kernel_write - write to kernel memory bypassing DAT * @dst: destination address * @src: source address * @size: number of bytes to copy @@ -61,35 +62,24 @@ static notrace long s390_kernel_write_odd(void *dst, const void *src, size_t siz */ static DEFINE_SPINLOCK(s390_kernel_write_lock); -notrace void *s390_kernel_write(void *dst, const void *src, size_t size) +notrace void *__s390_kernel_write(void *dst, const void *src, size_t size) { void *tmp = dst; unsigned long flags; long copied; spin_lock_irqsave(&s390_kernel_write_lock, flags); - if (!(flags & PSW_MASK_DAT)) { - memcpy(dst, src, size); - } else { - while (size) { - copied = s390_kernel_write_odd(tmp, src, size); - tmp += copied; - src += copied; - size -= copied; - } + while (size) { + copied = s390_kernel_write_odd(tmp, src, size); + tmp += copied; + src += copied; + size -= copied; } 
spin_unlock_irqrestore(&s390_kernel_write_lock, flags); return dst; } -void __init memcpy_real_init(void) -{ - memcpy_real_ptep = vmem_get_alloc_pte(__memcpy_real_area, true); - if (!memcpy_real_ptep) - panic("Couldn't setup memcpy real area"); -} - size_t memcpy_real_iter(struct iov_iter *iter, unsigned long src, size_t count) { size_t len, copied, res = 0; @@ -97,11 +87,12 @@ size_t memcpy_real_iter(struct iov_iter *iter, unsigned long src, size_t count) void *chunk; pte_t pte; + BUILD_BUG_ON(MEMCPY_REAL_SIZE != PAGE_SIZE); while (count) { - phys = src & PAGE_MASK; - offset = src & ~PAGE_MASK; + phys = src & MEMCPY_REAL_MASK; + offset = src & ~MEMCPY_REAL_MASK; chunk = (void *)(__memcpy_real_area + offset); - len = min(count, PAGE_SIZE - offset); + len = min(count, MEMCPY_REAL_SIZE - offset); pte = mk_pte_phys(phys, PAGE_KERNEL_RO); mutex_lock(&memcpy_real_mutex); @@ -128,7 +119,7 @@ int memcpy_real(void *dest, unsigned long src, size_t count) kvec.iov_base = dest; kvec.iov_len = count; - iov_iter_kvec(&iter, WRITE, &kvec, 1, count); + iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, count); if (memcpy_real_iter(&iter, src, count) < count) return -EFAULT; return 0; @@ -162,7 +153,6 @@ void *xlate_dev_mem_ptr(phys_addr_t addr) void *ptr = phys_to_virt(addr); void *bounce = ptr; struct lowcore *abs_lc; - unsigned long flags; unsigned long size; int this_cpu, cpu; @@ -178,10 +168,10 @@ void *xlate_dev_mem_ptr(phys_addr_t addr) goto out; size = PAGE_SIZE - (addr & ~PAGE_MASK); if (addr < sizeof(struct lowcore)) { - abs_lc = get_abs_lowcore(&flags); + abs_lc = get_abs_lowcore(); ptr = (void *)abs_lc + addr; memcpy(bounce, ptr, size); - put_abs_lowcore(abs_lc, flags); + put_abs_lowcore(abs_lc); } else if (cpu == this_cpu) { ptr = (void *)(addr - virt_to_phys(lowcore_ptr[cpu])); memcpy(bounce, ptr, size); diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index 3327c47bc181..2a222a7e14f4 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -15,8 +15,8 @@ #include <linux/sched/signal.h> #include <linux/sched/mm.h> #include <linux/random.h> -#include <linux/compat.h> #include <linux/security.h> +#include <linux/hugetlb.h> #include <asm/elf.h> static unsigned long stack_maxrandom_size(void) @@ -26,7 +26,7 @@ static unsigned long stack_maxrandom_size(void) return STACK_RND_MASK << PAGE_SHIFT; } -static inline int mmap_is_legacy(struct rlimit *rlim_stack) +static inline int mmap_is_legacy(const struct rlimit *rlim_stack) { if (current->personality & ADDR_COMPAT_LAYOUT) return 1; @@ -46,11 +46,10 @@ static unsigned long mmap_base_legacy(unsigned long rnd) } static inline unsigned long mmap_base(unsigned long rnd, - struct rlimit *rlim_stack) + const struct rlimit *rlim_stack) { unsigned long gap = rlim_stack->rlim_cur; unsigned long pad = stack_maxrandom_size() + stack_guard_gap; - unsigned long gap_min, gap_max; /* Values close to RLIM_INFINITY can overflow. */ if (gap + pad > gap) @@ -60,24 +59,29 @@ static inline unsigned long mmap_base(unsigned long rnd, * Top of mmap area (just below the process stack). * Leave at least a ~128 MB hole. 
*/ - gap_min = SZ_128M; - gap_max = (STACK_TOP / 6) * 5; - - if (gap < gap_min) - gap = gap_min; - else if (gap > gap_max) - gap = gap_max; + gap = clamp(gap, SZ_128M, (STACK_TOP / 6) * 5); return PAGE_ALIGN(STACK_TOP - gap - rnd); } +static int get_align_mask(struct file *filp, unsigned long flags) +{ + if (filp && is_file_hugepages(filp)) + return huge_page_mask_align(filp); + if (!(current->flags & PF_RANDOMIZE)) + return 0; + if (filp || (flags & MAP_SHARED)) + return MMAP_ALIGN_MASK << PAGE_SHIFT; + return 0; +} + unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, - unsigned long flags) + unsigned long flags, vm_flags_t vm_flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; - struct vm_unmapped_area_info info; + struct vm_unmapped_area_info info = {}; if (len > TASK_SIZE - mmap_min_addr) return -ENOMEM; @@ -93,15 +97,12 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, goto check_asce_limit; } - info.flags = 0; info.length = len; info.low_limit = mm->mmap_base; info.high_limit = TASK_SIZE; - if (filp || (flags & MAP_SHARED)) - info.align_mask = MMAP_ALIGN_MASK << PAGE_SHIFT; - else - info.align_mask = 0; - info.align_offset = pgoff << PAGE_SHIFT; + info.align_mask = get_align_mask(filp, flags); + if (!(filp && is_file_hugepages(filp))) + info.align_offset = pgoff << PAGE_SHIFT; addr = vm_unmapped_area(&info); if (offset_in_page(addr)) return addr; @@ -112,11 +113,11 @@ check_asce_limit: unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, - unsigned long flags) + unsigned long flags, vm_flags_t vm_flags) { struct vm_area_struct *vma; struct mm_struct *mm = current->mm; - struct vm_unmapped_area_info info; + struct vm_unmapped_area_info info = {}; /* requested length too big for entire address space */ if (len > TASK_SIZE - mmap_min_addr) @@ -136,13 +137,11 @@ unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long ad info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; - info.low_limit = max(PAGE_SIZE, mmap_min_addr); + info.low_limit = PAGE_SIZE; info.high_limit = mm->mmap_base; - if (filp || (flags & MAP_SHARED)) - info.align_mask = MMAP_ALIGN_MASK << PAGE_SHIFT; - else - info.align_mask = 0; - info.align_offset = pgoff << PAGE_SHIFT; + info.align_mask = get_align_mask(filp, flags); + if (!(filp && is_file_hugepages(filp))) + info.align_offset = pgoff << PAGE_SHIFT; addr = vm_unmapped_area(&info); /* @@ -169,7 +168,7 @@ check_asce_limit: * This function, called very early during the creation of a new * process VM image, sets up which VM layout function to use: */ -void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) +void arch_pick_mmap_layout(struct mm_struct *mm, const struct rlimit *rlim_stack) { unsigned long random_factor = 0UL; @@ -182,29 +181,35 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) */ if (mmap_is_legacy(rlim_stack)) { mm->mmap_base = mmap_base_legacy(random_factor); - mm->get_unmapped_area = arch_get_unmapped_area; + mm_flags_clear(MMF_TOPDOWN, mm); } else { mm->mmap_base = mmap_base(random_factor, rlim_stack); - mm->get_unmapped_area = arch_get_unmapped_area_topdown; + mm_flags_set(MMF_TOPDOWN, mm); } } -static const pgprot_t protection_map[16] = { - [VM_NONE] = PAGE_NONE, - [VM_READ] = PAGE_RO, - [VM_WRITE] = PAGE_RO, - [VM_WRITE | VM_READ] = PAGE_RO, - [VM_EXEC] = PAGE_RX, - [VM_EXEC | VM_READ] = 
PAGE_RX, - [VM_EXEC | VM_WRITE] = PAGE_RX, - [VM_EXEC | VM_WRITE | VM_READ] = PAGE_RX, - [VM_SHARED] = PAGE_NONE, - [VM_SHARED | VM_READ] = PAGE_RO, - [VM_SHARED | VM_WRITE] = PAGE_RW, - [VM_SHARED | VM_WRITE | VM_READ] = PAGE_RW, - [VM_SHARED | VM_EXEC] = PAGE_RX, - [VM_SHARED | VM_EXEC | VM_READ] = PAGE_RX, - [VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_RWX, - [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_RWX -}; +static pgprot_t protection_map[16] __ro_after_init; + +void __init setup_protection_map(void) +{ + pgprot_t *pm = protection_map; + + pm[VM_NONE] = PAGE_NONE; + pm[VM_READ] = PAGE_RO; + pm[VM_WRITE] = PAGE_RO; + pm[VM_WRITE | VM_READ] = PAGE_RO; + pm[VM_EXEC] = PAGE_RX; + pm[VM_EXEC | VM_READ] = PAGE_RX; + pm[VM_EXEC | VM_WRITE] = PAGE_RX; + pm[VM_EXEC | VM_WRITE | VM_READ] = PAGE_RX; + pm[VM_SHARED] = PAGE_NONE; + pm[VM_SHARED | VM_READ] = PAGE_RO; + pm[VM_SHARED | VM_WRITE] = PAGE_RW; + pm[VM_SHARED | VM_WRITE | VM_READ] = PAGE_RW; + pm[VM_SHARED | VM_EXEC] = PAGE_RX; + pm[VM_SHARED | VM_EXEC | VM_READ] = PAGE_RX; + pm[VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_RWX; + pm[VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_RWX; +} + DECLARE_VM_GET_PAGE_PROT diff --git a/arch/s390/mm/page-states.c b/arch/s390/mm/page-states.c index d5ea09d78938..01f9b39e65f5 100644 --- a/arch/s390/mm/page-states.c +++ b/arch/s390/mm/page-states.c @@ -7,210 +7,18 @@ * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> */ -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/types.h> #include <linux/mm.h> -#include <linux/memblock.h> -#include <linux/gfp.h> -#include <linux/init.h> -#include <asm/asm-extable.h> -#include <asm/facility.h> #include <asm/page-states.h> +#include <asm/sections.h> +#include <asm/page.h> -static int cmma_flag = 1; - -static int __init cmma(char *str) -{ - bool enabled; - - if (!kstrtobool(str, &enabled)) - cmma_flag = enabled; - return 1; -} -__setup("cmma=", cmma); - -static inline int cmma_test_essa(void) -{ - unsigned long tmp = 0; - int rc = -EOPNOTSUPP; - - /* test ESSA_GET_STATE */ - asm volatile( - " .insn rrf,0xb9ab0000,%[tmp],%[tmp],%[cmd],0\n" - "0: la %[rc],0\n" - "1:\n" - EX_TABLE(0b,1b) - : [rc] "+&d" (rc), [tmp] "+&d" (tmp) - : [cmd] "i" (ESSA_GET_STATE)); - return rc; -} - -void __init cmma_init(void) -{ - if (!cmma_flag) - return; - if (cmma_test_essa()) { - cmma_flag = 0; - return; - } - if (test_facility(147)) - cmma_flag = 2; -} - -static inline unsigned char get_page_state(struct page *page) -{ - unsigned char state; - - asm volatile(" .insn rrf,0xb9ab0000,%0,%1,%2,0" - : "=&d" (state) - : "a" (page_to_phys(page)), - "i" (ESSA_GET_STATE)); - return state & 0x3f; -} - -static inline void set_page_unused(struct page *page, int order) -{ - int i, rc; - - for (i = 0; i < (1 << order); i++) - asm volatile(".insn rrf,0xb9ab0000,%0,%1,%2,0" - : "=&d" (rc) - : "a" (page_to_phys(page + i)), - "i" (ESSA_SET_UNUSED)); -} - -static inline void set_page_stable_dat(struct page *page, int order) -{ - int i, rc; - - for (i = 0; i < (1 << order); i++) - asm volatile(".insn rrf,0xb9ab0000,%0,%1,%2,0" - : "=&d" (rc) - : "a" (page_to_phys(page + i)), - "i" (ESSA_SET_STABLE)); -} - -static inline void set_page_stable_nodat(struct page *page, int order) -{ - int i, rc; - - for (i = 0; i < (1 << order); i++) - asm volatile(".insn rrf,0xb9ab0000,%0,%1,%2,0" - : "=&d" (rc) - : "a" (page_to_phys(page + i)), - "i" (ESSA_SET_STABLE_NODAT)); -} - -static void mark_kernel_pmd(pud_t *pud, unsigned long addr, unsigned long end) -{ - unsigned long next; - struct page 
*page; - pmd_t *pmd; - - pmd = pmd_offset(pud, addr); - do { - next = pmd_addr_end(addr, end); - if (pmd_none(*pmd) || pmd_large(*pmd)) - continue; - page = phys_to_page(pmd_val(*pmd)); - set_bit(PG_arch_1, &page->flags); - } while (pmd++, addr = next, addr != end); -} - -static void mark_kernel_pud(p4d_t *p4d, unsigned long addr, unsigned long end) -{ - unsigned long next; - struct page *page; - pud_t *pud; - int i; - - pud = pud_offset(p4d, addr); - do { - next = pud_addr_end(addr, end); - if (pud_none(*pud) || pud_large(*pud)) - continue; - if (!pud_folded(*pud)) { - page = phys_to_page(pud_val(*pud)); - for (i = 0; i < 3; i++) - set_bit(PG_arch_1, &page[i].flags); - } - mark_kernel_pmd(pud, addr, next); - } while (pud++, addr = next, addr != end); -} - -static void mark_kernel_p4d(pgd_t *pgd, unsigned long addr, unsigned long end) -{ - unsigned long next; - struct page *page; - p4d_t *p4d; - int i; - - p4d = p4d_offset(pgd, addr); - do { - next = p4d_addr_end(addr, end); - if (p4d_none(*p4d)) - continue; - if (!p4d_folded(*p4d)) { - page = phys_to_page(p4d_val(*p4d)); - for (i = 0; i < 3; i++) - set_bit(PG_arch_1, &page[i].flags); - } - mark_kernel_pud(p4d, addr, next); - } while (p4d++, addr = next, addr != end); -} - -static void mark_kernel_pgd(void) -{ - unsigned long addr, next; - struct page *page; - pgd_t *pgd; - int i; - - addr = 0; - pgd = pgd_offset_k(addr); - do { - next = pgd_addr_end(addr, MODULES_END); - if (pgd_none(*pgd)) - continue; - if (!pgd_folded(*pgd)) { - page = phys_to_page(pgd_val(*pgd)); - for (i = 0; i < 3; i++) - set_bit(PG_arch_1, &page[i].flags); - } - mark_kernel_p4d(pgd, addr, next); - } while (pgd++, addr = next, addr != MODULES_END); -} - -void __init cmma_init_nodat(void) -{ - struct page *page; - unsigned long start, end, ix; - int i; - - if (cmma_flag < 2) - return; - /* Mark pages used in kernel page tables */ - mark_kernel_pgd(); - - /* Set all kernel pages not used for page tables to stable/no-dat */ - for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, NULL) { - page = pfn_to_page(start); - for (ix = start; ix < end; ix++, page++) { - if (__test_and_clear_bit(PG_arch_1, &page->flags)) - continue; /* skip page table pages */ - if (!list_empty(&page->lru)) - continue; /* skip free pages */ - set_page_stable_nodat(page, 0); - } - } -} +int __bootdata_preserved(cmma_flag); void arch_free_page(struct page *page, int order) { if (!cmma_flag) return; - set_page_unused(page, order); + __set_page_unused(page_to_virt(page), 1UL << order); } void arch_alloc_page(struct page *page, int order) @@ -218,14 +26,7 @@ void arch_alloc_page(struct page *page, int order) if (!cmma_flag) return; if (cmma_flag < 2) - set_page_stable_dat(page, order); + __set_page_stable_dat(page_to_virt(page), 1UL << order); else - set_page_stable_nodat(page, order); -} - -void arch_set_page_dat(struct page *page, int order) -{ - if (!cmma_flag) - return; - set_page_stable_dat(page, order); + __set_page_stable_nodat(page_to_virt(page), 1UL << order); } diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c index 85195c18b2e8..3042647c9dbf 100644 --- a/arch/s390/mm/pageattr.c +++ b/arch/s390/mm/pageattr.c @@ -3,13 +3,17 @@ * Copyright IBM Corp. 
2011 * Author(s): Jan Glauber <jang@linux.vnet.ibm.com> */ +#include <linux/cpufeature.h> #include <linux/hugetlb.h> +#include <linux/proc_fs.h> +#include <linux/vmalloc.h> #include <linux/mm.h> #include <asm/cacheflush.h> #include <asm/facility.h> #include <asm/pgalloc.h> #include <asm/kfence.h> #include <asm/page.h> +#include <asm/asm.h> #include <asm/set_memory.h> static inline unsigned long sske_frame(unsigned long addr, unsigned char skey) @@ -24,7 +28,7 @@ void __storage_key_init_range(unsigned long start, unsigned long end) unsigned long boundary, size; while (start < end) { - if (MACHINE_HAS_EDAT1) { + if (cpu_has_edat1()) { /* set storage keys for a 1MB frame */ size = 1UL << 20; boundary = (start + size) & ~(size - 1); @@ -41,7 +45,7 @@ void __storage_key_init_range(unsigned long start, unsigned long end) } #ifdef CONFIG_PROC_FS -atomic_long_t direct_pages_count[PG_DIRECT_MAP_MAX]; +atomic_long_t __bootdata_preserved(direct_pages_count[PG_DIRECT_MAP_MAX]); void arch_report_meminfo(struct seq_file *m) { @@ -60,7 +64,7 @@ static void pgt_set(unsigned long *old, unsigned long new, unsigned long addr, unsigned long *table, mask; mask = 0; - if (MACHINE_HAS_EDAT2) { + if (cpu_has_edat2()) { switch (dtt) { case CRDTE_DTT_REGION3: mask = ~(PTRS_PER_PUD * sizeof(pud_t) - 1); @@ -73,11 +77,9 @@ static void pgt_set(unsigned long *old, unsigned long new, unsigned long addr, break; } table = (unsigned long *)((unsigned long)old & mask); - crdte(*old, new, table, dtt, addr, S390_lowcore.kernel_asce); - } else if (MACHINE_HAS_IDTE) { - cspg(old, *old, new); + crdte(*old, new, table, dtt, addr, get_lowcore()->kernel_asce.val); } else { - csp((unsigned int *)old + 1, *old, new); + cspg(old, *old, new); } } @@ -96,11 +98,17 @@ static int walk_pte_level(pmd_t *pmdp, unsigned long addr, unsigned long end, if (flags & SET_MEMORY_RO) new = pte_wrprotect(new); else if (flags & SET_MEMORY_RW) - new = pte_mkwrite(pte_mkdirty(new)); + new = pte_mkwrite_novma(pte_mkdirty(new)); if (flags & SET_MEMORY_NX) new = set_pte_bit(new, __pgprot(_PAGE_NOEXEC)); else if (flags & SET_MEMORY_X) new = clear_pte_bit(new, __pgprot(_PAGE_NOEXEC)); + if (flags & SET_MEMORY_INV) { + new = set_pte_bit(new, __pgprot(_PAGE_INVALID)); + } else if (flags & SET_MEMORY_DEF) { + new = __pte(pte_val(new) & PAGE_MASK); + new = set_pte_bit(new, PAGE_KERNEL); + } pgt_set((unsigned long *)ptep, pte_val(new), addr, CRDTE_DTT_PAGE); ptep++; addr += PAGE_SIZE; @@ -146,11 +154,17 @@ static void modify_pmd_page(pmd_t *pmdp, unsigned long addr, if (flags & SET_MEMORY_RO) new = pmd_wrprotect(new); else if (flags & SET_MEMORY_RW) - new = pmd_mkwrite(pmd_mkdirty(new)); + new = pmd_mkwrite_novma(pmd_mkdirty(new)); if (flags & SET_MEMORY_NX) new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_NOEXEC)); else if (flags & SET_MEMORY_X) new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_NOEXEC)); + if (flags & SET_MEMORY_INV) { + new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID)); + } else if (flags & SET_MEMORY_DEF) { + new = __pmd(pmd_val(new) & PMD_MASK); + new = set_pmd_bit(new, SEGMENT_KERNEL); + } pgt_set((unsigned long *)pmdp, pmd_val(new), addr, CRDTE_DTT_SEGMENT); } @@ -167,7 +181,7 @@ static int walk_pmd_level(pud_t *pudp, unsigned long addr, unsigned long end, if (pmd_none(*pmdp)) return -EINVAL; next = pmd_addr_end(addr, end); - if (pmd_large(*pmdp)) { + if (pmd_leaf(*pmdp)) { need_split = !!(flags & SET_MEMORY_4K); need_split |= !!(addr & ~PMD_MASK); need_split |= !!(addr + PMD_SIZE > next); @@ -232,6 +246,12 @@ static void 
modify_pud_page(pud_t *pudp, unsigned long addr, new = set_pud_bit(new, __pgprot(_REGION_ENTRY_NOEXEC)); else if (flags & SET_MEMORY_X) new = clear_pud_bit(new, __pgprot(_REGION_ENTRY_NOEXEC)); + if (flags & SET_MEMORY_INV) { + new = set_pud_bit(new, __pgprot(_REGION_ENTRY_INVALID)); + } else if (flags & SET_MEMORY_DEF) { + new = __pud(pud_val(new) & PUD_MASK); + new = set_pud_bit(new, REGION3_KERNEL); + } pgt_set((unsigned long *)pudp, pud_val(new), addr, CRDTE_DTT_REGION3); } @@ -248,7 +268,7 @@ static int walk_pud_level(p4d_t *p4d, unsigned long addr, unsigned long end, if (pud_none(*pudp)) return -EINVAL; next = pud_addr_end(addr, end); - if (pud_large(*pudp)) { + if (pud_leaf(*pudp)) { need_split = !!(flags & SET_MEMORY_4K); need_split |= !!(addr & ~PUD_MASK); need_split |= !!(addr + PUD_SIZE > next); @@ -298,11 +318,6 @@ static int change_page_attr(unsigned long addr, unsigned long end, int rc = -EINVAL; pgd_t *pgdp; - if (addr == end) - return 0; - if (end >= MODULES_END) - return -EINVAL; - mutex_lock(&cpa_mutex); pgdp = pgd_offset_k(addr); do { if (pgd_none(*pgdp)) @@ -313,18 +328,103 @@ static int change_page_attr(unsigned long addr, unsigned long end, break; cond_resched(); } while (pgdp++, addr = next, addr < end && !rc); - mutex_unlock(&cpa_mutex); return rc; } -int __set_memory(unsigned long addr, int numpages, unsigned long flags) +static int change_page_attr_alias(unsigned long addr, unsigned long end, + unsigned long flags) +{ + unsigned long alias, offset, va_start, va_end; + struct vm_struct *area; + int rc = 0; + + /* + * Changes to read-only permissions on kernel VA mappings are also + * applied to the kernel direct mapping. Execute permissions are + * intentionally not transferred to keep all allocated pages within + * the direct mapping non-executable. 
+ */ + flags &= SET_MEMORY_RO | SET_MEMORY_RW; + if (!flags) + return 0; + area = NULL; + while (addr < end) { + if (!area) + area = find_vm_area((void *)addr); + if (!area || !(area->flags & VM_ALLOC)) + return 0; + va_start = (unsigned long)area->addr; + va_end = va_start + area->nr_pages * PAGE_SIZE; + offset = (addr - va_start) >> PAGE_SHIFT; + alias = (unsigned long)page_address(area->pages[offset]); + rc = change_page_attr(alias, alias + PAGE_SIZE, flags); + if (rc) + break; + addr += PAGE_SIZE; + if (addr >= va_end) + area = NULL; + } + return rc; +} + +int __set_memory(unsigned long addr, unsigned long numpages, unsigned long flags) { - if (!MACHINE_HAS_NX) + unsigned long end; + int rc; + + if (!cpu_has_nx()) flags &= ~(SET_MEMORY_NX | SET_MEMORY_X); if (!flags) return 0; + if (!numpages) + return 0; addr &= PAGE_MASK; - return change_page_attr(addr, addr + numpages * PAGE_SIZE, flags); + end = addr + numpages * PAGE_SIZE; + mutex_lock(&cpa_mutex); + rc = change_page_attr(addr, end, flags); + if (rc) + goto out; + rc = change_page_attr_alias(addr, end, flags); +out: + mutex_unlock(&cpa_mutex); + return rc; +} + +int set_direct_map_invalid_noflush(struct page *page) +{ + return __set_memory((unsigned long)page_to_virt(page), 1, SET_MEMORY_INV); +} + +int set_direct_map_default_noflush(struct page *page) +{ + return __set_memory((unsigned long)page_to_virt(page), 1, SET_MEMORY_DEF); +} + +int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid) +{ + unsigned long flags; + + if (valid) + flags = SET_MEMORY_DEF; + else + flags = SET_MEMORY_INV; + + return __set_memory((unsigned long)page_to_virt(page), nr, flags); +} + +bool kernel_page_present(struct page *page) +{ + unsigned long addr; + unsigned int cc; + + addr = (unsigned long)page_address(page); + asm volatile( + " lra %[addr],0(%[addr])\n" + CC_IPM(cc) + : CC_OUT(cc, cc), [addr] "+a" (addr) + : + : CC_CLOBBER); + return CC_TRANSFORM(cc) == 0; } #if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE) diff --git a/arch/s390/mm/pfault.c b/arch/s390/mm/pfault.c new file mode 100644 index 000000000000..2f829448c719 --- /dev/null +++ b/arch/s390/mm/pfault.c @@ -0,0 +1,248 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 1999, 2023 + */ + +#include <linux/cpuhotplug.h> +#include <linux/sched/task.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/irq.h> +#include <asm/asm-extable.h> +#include <asm/asm-offsets.h> +#include <asm/pfault.h> +#include <asm/diag.h> + +#define __SUBCODE_MASK 0x0600 +#define __PF_RES_FIELD 0x8000000000000000UL + +/* + * 'pfault' pseudo page faults routines. 
+ */ +static int pfault_disable; + +static int __init nopfault(char *str) +{ + pfault_disable = 1; + return 1; +} +early_param("nopfault", nopfault); + +struct pfault_refbk { + u16 refdiagc; + u16 reffcode; + u16 refdwlen; + u16 refversn; + u64 refgaddr; + u64 refselmk; + u64 refcmpmk; + u64 reserved; +}; + +static struct pfault_refbk pfault_init_refbk = { + .refdiagc = 0x258, + .reffcode = 0, + .refdwlen = 5, + .refversn = 2, + .refgaddr = __LC_LPP, + .refselmk = 1UL << 48, + .refcmpmk = 1UL << 48, + .reserved = __PF_RES_FIELD +}; + +int __pfault_init(void) +{ + int rc = -EOPNOTSUPP; + + if (pfault_disable) + return rc; + diag_stat_inc(DIAG_STAT_X258); + asm_inline volatile( + " diag %[refbk],%[rc],0x258\n" + "0: nopr %%r7\n" + EX_TABLE(0b, 0b) + : [rc] "+d" (rc) + : [refbk] "a" (&pfault_init_refbk), "m" (pfault_init_refbk) + : "cc"); + return rc; +} + +static struct pfault_refbk pfault_fini_refbk = { + .refdiagc = 0x258, + .reffcode = 1, + .refdwlen = 5, + .refversn = 2, +}; + +void __pfault_fini(void) +{ + if (pfault_disable) + return; + diag_stat_inc(DIAG_STAT_X258); + asm_inline volatile( + " diag %[refbk],0,0x258\n" + "0: nopr %%r7\n" + EX_TABLE(0b, 0b) + : + : [refbk] "a" (&pfault_fini_refbk), "m" (pfault_fini_refbk) + : "cc"); +} + +static DEFINE_SPINLOCK(pfault_lock); +static LIST_HEAD(pfault_list); + +#define PF_COMPLETE 0x0080 + +/* + * The mechanism of our pfault code: if Linux is running as guest, runs a user + * space process and the user space process accesses a page that the host has + * paged out we get a pfault interrupt. + * + * This allows us, within the guest, to schedule a different process. Without + * this mechanism the host would have to suspend the whole virtual cpu until + * the page has been paged in. + * + * So when we get such an interrupt then we set the state of the current task + * to uninterruptible and also set the need_resched flag. Both happens within + * interrupt context(!). If we later on want to return to user space we + * recognize the need_resched flag and then call schedule(). It's not very + * obvious how this works... + * + * Of course we have a lot of additional fun with the completion interrupt (-> + * host signals that a page of a process has been paged in and the process can + * continue to run). This interrupt can arrive on any cpu and, since we have + * virtual cpus, actually appear before the interrupt that signals that a page + * is missing. + */ +static void pfault_interrupt(struct ext_code ext_code, + unsigned int param32, unsigned long param64) +{ + struct task_struct *tsk; + __u16 subcode; + pid_t pid; + + /* + * Get the external interruption subcode & pfault initial/completion + * signal bit. VM stores this in the 'cpu address' field associated + * with the external interrupt. + */ + subcode = ext_code.subcode; + if ((subcode & 0xff00) != __SUBCODE_MASK) + return; + inc_irq_stat(IRQEXT_PFL); + /* Get the token (= pid of the affected task). */ + pid = param64 & LPP_PID_MASK; + rcu_read_lock(); + tsk = find_task_by_pid_ns(pid, &init_pid_ns); + if (tsk) + get_task_struct(tsk); + rcu_read_unlock(); + if (!tsk) + return; + spin_lock(&pfault_lock); + if (subcode & PF_COMPLETE) { + /* signal bit is set -> a page has been swapped in by VM */ + if (tsk->thread.pfault_wait == 1) { + /* + * Initial interrupt was faster than the completion + * interrupt. pfault_wait is valid. Set pfault_wait + * back to zero and wake up the process. This can + * safely be done because the task is still sleeping + * and can't produce new pfaults. 
+ */ + tsk->thread.pfault_wait = 0; + list_del(&tsk->thread.list); + wake_up_process(tsk); + put_task_struct(tsk); + } else { + /* + * Completion interrupt was faster than initial + * interrupt. Set pfault_wait to -1 so the initial + * interrupt doesn't put the task to sleep. + * If the task is not running, ignore the completion + * interrupt since it must be a leftover of a PFAULT + * CANCEL operation which didn't remove all pending + * completion interrupts. + */ + if (task_is_running(tsk)) + tsk->thread.pfault_wait = -1; + } + } else { + /* signal bit not set -> a real page is missing. */ + if (WARN_ON_ONCE(tsk != current)) + goto out; + if (tsk->thread.pfault_wait == 1) { + /* Already on the list with a reference: put to sleep */ + goto block; + } else if (tsk->thread.pfault_wait == -1) { + /* + * Completion interrupt was faster than the initial + * interrupt (pfault_wait == -1). Set pfault_wait + * back to zero and exit. + */ + tsk->thread.pfault_wait = 0; + } else { + /* + * Initial interrupt arrived before completion + * interrupt. Let the task sleep. + * An extra task reference is needed since a different + * cpu may set the task state to TASK_RUNNING again + * before the scheduler is reached. + */ + get_task_struct(tsk); + tsk->thread.pfault_wait = 1; + list_add(&tsk->thread.list, &pfault_list); +block: + /* + * Since this must be a userspace fault, there + * is no kernel task state to trample. Rely on the + * return to userspace schedule() to block. + */ + __set_current_state(TASK_UNINTERRUPTIBLE); + set_need_resched_current(); + } + } +out: + spin_unlock(&pfault_lock); + put_task_struct(tsk); +} + +static int pfault_cpu_dead(unsigned int cpu) +{ + struct thread_struct *thread, *next; + struct task_struct *tsk; + + spin_lock_irq(&pfault_lock); + list_for_each_entry_safe(thread, next, &pfault_list, list) { + thread->pfault_wait = 0; + list_del(&thread->list); + tsk = container_of(thread, struct task_struct, thread); + wake_up_process(tsk); + put_task_struct(tsk); + } + spin_unlock_irq(&pfault_lock); + return 0; +} + +static int __init pfault_irq_init(void) +{ + int rc; + + rc = register_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt); + if (rc) + goto out_extint; + rc = pfault_init() == 0 ? 
0 : -EOPNOTSUPP; + if (rc) + goto out_pfault; + irq_subclass_register(IRQ_SUBCLASS_SERVICE_SIGNAL); + cpuhp_setup_state_nocalls(CPUHP_S390_PFAULT_DEAD, "s390/pfault:dead", + NULL, pfault_cpu_dead); + return 0; + +out_pfault: + unregister_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt); +out_extint: + pfault_disable = 1; + return rc; +} +early_initcall(pfault_irq_init); diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index 2de48b2c1b04..7df23528c01b 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -10,70 +10,45 @@ #include <linux/slab.h> #include <linux/mm.h> #include <asm/mmu_context.h> +#include <asm/page-states.h> #include <asm/pgalloc.h> -#include <asm/gmap.h> -#include <asm/tlb.h> #include <asm/tlbflush.h> -#ifdef CONFIG_PGSTE - -int page_table_allocate_pgste = 0; -EXPORT_SYMBOL(page_table_allocate_pgste); - -static struct ctl_table page_table_sysctl[] = { - { - .procname = "allocate_pgste", - .data = &page_table_allocate_pgste, - .maxlen = sizeof(int), - .mode = S_IRUGO | S_IWUSR, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { } -}; - -static struct ctl_table page_table_sysctl_dir[] = { - { - .procname = "vm", - .maxlen = 0, - .mode = 0555, - .child = page_table_sysctl, - }, - { } -}; - -static int __init page_table_register_sysctl(void) -{ - return register_sysctl_table(page_table_sysctl_dir) ? 0 : -ENOMEM; -} -__initcall(page_table_register_sysctl); - -#endif /* CONFIG_PGSTE */ - -unsigned long *crst_table_alloc(struct mm_struct *mm) +unsigned long *crst_table_alloc_noprof(struct mm_struct *mm) { - struct page *page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); + gfp_t gfp = GFP_KERNEL_ACCOUNT; + struct ptdesc *ptdesc; + unsigned long *table; - if (!page) + if (mm == &init_mm) + gfp &= ~__GFP_ACCOUNT; + ptdesc = pagetable_alloc_noprof(gfp, CRST_ALLOC_ORDER); + if (!ptdesc) return NULL; - arch_set_page_dat(page, CRST_ALLOC_ORDER); - return (unsigned long *) page_to_virt(page); + table = ptdesc_address(ptdesc); + __arch_set_page_dat(table, 1UL << CRST_ALLOC_ORDER); + return table; } void crst_table_free(struct mm_struct *mm, unsigned long *table) { - free_pages((unsigned long)table, CRST_ALLOC_ORDER); + if (!table) + return; + pagetable_free(virt_to_ptdesc(table)); } static void __crst_table_upgrade(void *arg) { struct mm_struct *mm = arg; + struct ctlreg asce; /* change all active ASCEs to avoid the creation of new TLBs */ if (current->active_mm == mm) { - S390_lowcore.user_asce = mm->context.asce; - __ctl_load(S390_lowcore.user_asce, 7, 7); + asce.val = mm->context.asce; + get_lowcore()->user_asce = asce; + local_ctl_load(7, &asce); + if (!test_thread_flag(TIF_ASCE_PRIMARY)) + local_ctl_load(1, &asce); } __tlb_flush_local(); } @@ -83,6 +58,8 @@ int crst_table_upgrade(struct mm_struct *mm, unsigned long end) unsigned long *pgd = NULL, *p4d = NULL, *__pgd; unsigned long asce_limit = mm->context.asce_limit; + mmap_assert_write_locked(mm); + /* upgrade should only happen from 3 to 4, 3 to 5, or 4 to 5 levels */ VM_BUG_ON(asce_limit < _REGION2_SIZE); @@ -94,23 +71,18 @@ int crst_table_upgrade(struct mm_struct *mm, unsigned long end) if (unlikely(!p4d)) goto err_p4d; crst_table_init(p4d, _REGION2_ENTRY_EMPTY); + pagetable_p4d_ctor(virt_to_ptdesc(p4d)); } if (end > _REGION1_SIZE) { pgd = crst_table_alloc(mm); if (unlikely(!pgd)) goto err_pgd; crst_table_init(pgd, _REGION1_ENTRY_EMPTY); + pagetable_pgd_ctor(virt_to_ptdesc(pgd)); } spin_lock_bh(&mm->page_table_lock); - /* - * This routine gets called with 
mmap_lock lock held and there is - * no reason to optimize for the case of otherwise. However, if - * that would ever change, the below check will let us know. - */ - VM_BUG_ON(asce_limit != mm->context.asce_limit); - if (p4d) { __pgd = (unsigned long *) mm->pgd; p4d_populate(mm, (p4d_t *) p4d, (pud_t *) __pgd); @@ -136,292 +108,82 @@ int crst_table_upgrade(struct mm_struct *mm, unsigned long end) return 0; err_pgd: + pagetable_dtor(virt_to_ptdesc(p4d)); crst_table_free(mm, p4d); err_p4d: return -ENOMEM; } -static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits) -{ - unsigned int old, new; - - do { - old = atomic_read(v); - new = old ^ bits; - } while (atomic_cmpxchg(v, old, new) != old); - return new; -} - #ifdef CONFIG_PGSTE -struct page *page_table_alloc_pgste(struct mm_struct *mm) +struct ptdesc *page_table_alloc_pgste_noprof(struct mm_struct *mm) { - struct page *page; + struct ptdesc *ptdesc; u64 *table; - page = alloc_page(GFP_KERNEL); - if (page) { - table = (u64 *)page_to_virt(page); + ptdesc = pagetable_alloc_noprof(GFP_KERNEL_ACCOUNT, 0); + if (ptdesc) { + table = (u64 *)ptdesc_address(ptdesc); + __arch_set_page_dat(table, 1); memset64(table, _PAGE_INVALID, PTRS_PER_PTE); memset64(table + PTRS_PER_PTE, 0, PTRS_PER_PTE); } - return page; + return ptdesc; } -void page_table_free_pgste(struct page *page) +void page_table_free_pgste(struct ptdesc *ptdesc) { - __free_page(page); + pagetable_free(ptdesc); } #endif /* CONFIG_PGSTE */ -/* - * A 2KB-pgtable is either upper or lower half of a normal page. - * The second half of the page may be unused or used as another - * 2KB-pgtable. - * - * Whenever possible the parent page for a new 2KB-pgtable is picked - * from the list of partially allocated pages mm_context_t::pgtable_list. - * In case the list is empty a new parent page is allocated and added to - * the list. - * - * When a parent page gets fully allocated it contains 2KB-pgtables in both - * upper and lower halves and is removed from mm_context_t::pgtable_list. - * - * When 2KB-pgtable is freed from to fully allocated parent page that - * page turns partially allocated and added to mm_context_t::pgtable_list. - * - * If 2KB-pgtable is freed from the partially allocated parent page that - * page turns unused and gets removed from mm_context_t::pgtable_list. - * Furthermore, the unused parent page is released. - * - * As follows from the above, no unallocated or fully allocated parent - * pages are contained in mm_context_t::pgtable_list. - * - * The upper byte (bits 24-31) of the parent page _refcount is used - * for tracking contained 2KB-pgtables and has the following format: - * - * PP AA - * 01234567 upper byte (bits 24-31) of struct page::_refcount - * || || - * || |+--- upper 2KB-pgtable is allocated - * || +---- lower 2KB-pgtable is allocated - * |+------- upper 2KB-pgtable is pending for removal - * +-------- lower 2KB-pgtable is pending for removal - * - * (See commit 620b4e903179 ("s390: use _refcount for pgtables") on why - * using _refcount is possible). - * - * When 2KB-pgtable is allocated the corresponding AA bit is set to 1. - * The parent page is either: - * - added to mm_context_t::pgtable_list in case the second half of the - * parent page is still unallocated; - * - removed from mm_context_t::pgtable_list in case both hales of the - * parent page are allocated; - * These operations are protected with mm_context_t::lock. 
- * - * When 2KB-pgtable is deallocated the corresponding AA bit is set to 0 - * and the corresponding PP bit is set to 1 in a single atomic operation. - * Thus, PP and AA bits corresponding to the same 2KB-pgtable are mutually - * exclusive and may never be both set to 1! - * The parent page is either: - * - added to mm_context_t::pgtable_list in case the second half of the - * parent page is still allocated; - * - removed from mm_context_t::pgtable_list in case the second half of - * the parent page is unallocated; - * These operations are protected with mm_context_t::lock. - * - * It is important to understand that mm_context_t::lock only protects - * mm_context_t::pgtable_list and AA bits, but not the parent page itself - * and PP bits. - * - * Releasing the parent page happens whenever the PP bit turns from 1 to 0, - * while both AA bits and the second PP bit are already unset. Then the - * parent page does not contain any 2KB-pgtable fragment anymore, and it has - * also been removed from mm_context_t::pgtable_list. It is safe to release - * the page therefore. - * - * PGSTE memory spaces use full 4KB-pgtables and do not need most of the - * logic described above. Both AA bits are set to 1 to denote a 4KB-pgtable - * while the PP bits are never used, nor such a page is added to or removed - * from mm_context_t::pgtable_list. - */ -unsigned long *page_table_alloc(struct mm_struct *mm) +unsigned long *page_table_alloc_noprof(struct mm_struct *mm) { + gfp_t gfp = GFP_KERNEL_ACCOUNT; + struct ptdesc *ptdesc; unsigned long *table; - struct page *page; - unsigned int mask, bit; - - /* Try to get a fragment of a 4K page as a 2K page table */ - if (!mm_alloc_pgste(mm)) { - table = NULL; - spin_lock_bh(&mm->context.lock); - if (!list_empty(&mm->context.pgtable_list)) { - page = list_first_entry(&mm->context.pgtable_list, - struct page, lru); - mask = atomic_read(&page->_refcount) >> 24; - /* - * The pending removal bits must also be checked. - * Failure to do so might lead to an impossible - * value of (i.e 0x13 or 0x23) written to _refcount. - * Such values violate the assumption that pending and - * allocation bits are mutually exclusive, and the rest - * of the code unrails as result. That could lead to - * a whole bunch of races and corruptions. 
- */ - mask = (mask | (mask >> 4)) & 0x03U; - if (mask != 0x03U) { - table = (unsigned long *) page_to_virt(page); - bit = mask & 1; /* =1 -> second 2K */ - if (bit) - table += PTRS_PER_PTE; - atomic_xor_bits(&page->_refcount, - 0x01U << (bit + 24)); - list_del(&page->lru); - } - } - spin_unlock_bh(&mm->context.lock); - if (table) - return table; - } - /* Allocate a fresh page */ - page = alloc_page(GFP_KERNEL); - if (!page) + + if (mm == &init_mm) + gfp &= ~__GFP_ACCOUNT; + ptdesc = pagetable_alloc_noprof(gfp, 0); + if (!ptdesc) return NULL; - if (!pgtable_pte_page_ctor(page)) { - __free_page(page); + if (!pagetable_pte_ctor(mm, ptdesc)) { + pagetable_free(ptdesc); return NULL; } - arch_set_page_dat(page, 0); - /* Initialize page table */ - table = (unsigned long *) page_to_virt(page); - if (mm_alloc_pgste(mm)) { - /* Return 4K page table with PGSTEs */ - atomic_xor_bits(&page->_refcount, 0x03U << 24); - memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE); - memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE); - } else { - /* Return the first 2K fragment of the page */ - atomic_xor_bits(&page->_refcount, 0x01U << 24); - memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE); - spin_lock_bh(&mm->context.lock); - list_add(&page->lru, &mm->context.pgtable_list); - spin_unlock_bh(&mm->context.lock); - } + table = ptdesc_address(ptdesc); + __arch_set_page_dat(table, 1); + memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE); + memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE); return table; } -static void page_table_release_check(struct page *page, void *table, - unsigned int half, unsigned int mask) -{ - char msg[128]; - - if (!IS_ENABLED(CONFIG_DEBUG_VM) || !mask) - return; - snprintf(msg, sizeof(msg), - "Invalid pgtable %p release half 0x%02x mask 0x%02x", - table, half, mask); - dump_page(page, msg); -} - void page_table_free(struct mm_struct *mm, unsigned long *table) { - unsigned int mask, bit, half; - struct page *page; - - page = virt_to_page(table); - if (!mm_alloc_pgste(mm)) { - /* Free 2K page table fragment of a 4K page */ - bit = ((unsigned long) table & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)); - spin_lock_bh(&mm->context.lock); - /* - * Mark the page for delayed release. 
The actual release - * will happen outside of the critical section from this - * function or from __tlb_remove_table() - */ - mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24)); - mask >>= 24; - if (mask & 0x03U) - list_add(&page->lru, &mm->context.pgtable_list); - else - list_del(&page->lru); - spin_unlock_bh(&mm->context.lock); - mask = atomic_xor_bits(&page->_refcount, 0x10U << (bit + 24)); - mask >>= 24; - if (mask != 0x00U) - return; - half = 0x01U << bit; - } else { - half = 0x03U; - mask = atomic_xor_bits(&page->_refcount, 0x03U << 24); - mask >>= 24; - } + struct ptdesc *ptdesc = virt_to_ptdesc(table); - page_table_release_check(page, table, half, mask); - pgtable_pte_page_dtor(page); - __free_page(page); + if (pagetable_is_reserved(ptdesc)) + return free_reserved_ptdesc(ptdesc); + pagetable_dtor_free(ptdesc); } -void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, - unsigned long vmaddr) +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static void pte_free_now(struct rcu_head *head) { - struct mm_struct *mm; - struct page *page; - unsigned int bit, mask; - - mm = tlb->mm; - page = virt_to_page(table); - if (mm_alloc_pgste(mm)) { - gmap_unlink(mm, table, vmaddr); - table = (unsigned long *) ((unsigned long)table | 0x03U); - tlb_remove_table(tlb, table); - return; - } - bit = ((unsigned long) table & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)); - spin_lock_bh(&mm->context.lock); - /* - * Mark the page for delayed release. The actual release will happen - * outside of the critical section from __tlb_remove_table() or from - * page_table_free() - */ - mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24)); - mask >>= 24; - if (mask & 0x03U) - list_add_tail(&page->lru, &mm->context.pgtable_list); - else - list_del(&page->lru); - spin_unlock_bh(&mm->context.lock); - table = (unsigned long *) ((unsigned long) table | (0x01U << bit)); - tlb_remove_table(tlb, table); + struct ptdesc *ptdesc = container_of(head, struct ptdesc, pt_rcu_head); + + pagetable_dtor_free(ptdesc); } -void __tlb_remove_table(void *_table) +void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable) { - unsigned int mask = (unsigned long) _table & 0x03U, half = mask; - void *table = (void *)((unsigned long) _table ^ mask); - struct page *page = virt_to_page(table); - - switch (half) { - case 0x00U: /* pmd, pud, or p4d */ - free_pages((unsigned long)table, CRST_ALLOC_ORDER); - return; - case 0x01U: /* lower 2K of a 4K page table */ - case 0x02U: /* higher 2K of a 4K page table */ - mask = atomic_xor_bits(&page->_refcount, mask << (4 + 24)); - mask >>= 24; - if (mask != 0x00U) - return; - break; - case 0x03U: /* 4K page table with pgstes */ - mask = atomic_xor_bits(&page->_refcount, 0x03U << 24); - mask >>= 24; - break; - } + struct ptdesc *ptdesc = virt_to_ptdesc(pgtable); - page_table_release_check(page, table, half, mask); - pgtable_pte_page_dtor(page); - __free_page(page); + call_rcu(&ptdesc->pt_rcu_head, pte_free_now); } +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* * Base infrastructure required to generate basic asces, region, segment, @@ -448,16 +210,21 @@ static void base_pgt_free(unsigned long *table) static unsigned long *base_crst_alloc(unsigned long val) { unsigned long *table; + struct ptdesc *ptdesc; - table = (unsigned long *)__get_free_pages(GFP_KERNEL, CRST_ALLOC_ORDER); - if (table) - crst_table_init(table, val); + ptdesc = pagetable_alloc(GFP_KERNEL, CRST_ALLOC_ORDER); + if (!ptdesc) + return NULL; + table = ptdesc_address(ptdesc); + crst_table_init(table, val); return table; 
} static void base_crst_free(unsigned long *table) { - free_pages((unsigned long)table, CRST_ALLOC_ORDER); + if (!table) + return; + pagetable_free(virt_to_ptdesc(table)); } #define BASE_ADDR_END_FUNC(NAME, SIZE) \ @@ -469,7 +236,7 @@ static inline unsigned long base_##NAME##_addr_end(unsigned long addr, \ return (next - 1) < (end - 1) ? next : end; \ } -BASE_ADDR_END_FUNC(page, _PAGE_SIZE) +BASE_ADDR_END_FUNC(page, PAGE_SIZE) BASE_ADDR_END_FUNC(segment, _SEGMENT_SIZE) BASE_ADDR_END_FUNC(region3, _REGION3_SIZE) BASE_ADDR_END_FUNC(region2, _REGION2_SIZE) @@ -480,7 +247,7 @@ static inline unsigned long base_lra(unsigned long address) unsigned long real; asm volatile( - " lra %0,0(%1)\n" + " lra %0,0(%1)" : "=d" (real) : "a" (address) : "cc"); return real; } @@ -493,7 +260,7 @@ static int base_page_walk(unsigned long *origin, unsigned long addr, if (!alloc) return 0; pte = origin; - pte += (addr & _PAGE_INDEX) >> _PAGE_SHIFT; + pte += (addr & _PAGE_INDEX) >> PAGE_SHIFT; do { next = base_page_addr_end(addr, end); *pte = base_lra(addr); diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 4909dcd762e8..666adcd681ab 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -4,6 +4,8 @@ * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> */ +#include <linux/cpufeature.h> +#include <linux/export.h> #include <linux/sched.h> #include <linux/kernel.h> #include <linux/errno.h> @@ -14,15 +16,16 @@ #include <linux/spinlock.h> #include <linux/rcupdate.h> #include <linux/slab.h> -#include <linux/swapops.h> +#include <linux/leafops.h> #include <linux/sysctl.h> #include <linux/ksm.h> #include <linux/mman.h> -#include <asm/tlb.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> #include <asm/page-states.h> +#include <asm/pgtable.h> +#include <asm/machine.h> pgprot_t pgprot_writecombine(pgprot_t prot) { @@ -34,22 +37,12 @@ pgprot_t pgprot_writecombine(pgprot_t prot) } EXPORT_SYMBOL_GPL(pgprot_writecombine); -pgprot_t pgprot_writethrough(pgprot_t prot) -{ - /* - * mio_wb_bit_mask may be set on a different CPU, but it is only set - * once at init and only read afterwards. 
- */ - return __pgprot(pgprot_val(prot) & ~mio_wb_bit_mask); -} -EXPORT_SYMBOL_GPL(pgprot_writethrough); - static inline void ptep_ipte_local(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int nodat) { unsigned long opt, asce; - if (MACHINE_HAS_TLB_GUEST) { + if (machine_has_tlb_guest()) { opt = 0; asce = READ_ONCE(mm->context.gmap_asce); if (asce == 0UL || nodat) @@ -69,7 +62,7 @@ static inline void ptep_ipte_global(struct mm_struct *mm, unsigned long addr, { unsigned long opt, asce; - if (MACHINE_HAS_TLB_GUEST) { + if (machine_has_tlb_guest()) { opt = 0; asce = READ_ONCE(mm->context.gmap_asce); if (asce == 0UL || nodat) @@ -94,7 +87,7 @@ static inline pte_t ptep_flush_direct(struct mm_struct *mm, if (unlikely(pte_val(old) & _PAGE_INVALID)) return old; atomic_inc(&mm->context.flush_count); - if (MACHINE_HAS_TLB_LC && + if (cpu_has_tlb_lc() && cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) ptep_ipte_local(mm, addr, ptep, nodat); else @@ -123,37 +116,6 @@ static inline pte_t ptep_flush_lazy(struct mm_struct *mm, return old; } -static inline pgste_t pgste_get_lock(pte_t *ptep) -{ - unsigned long new = 0; -#ifdef CONFIG_PGSTE - unsigned long old; - - asm( - " lg %0,%2\n" - "0: lgr %1,%0\n" - " nihh %0,0xff7f\n" /* clear PCL bit in old */ - " oihh %1,0x0080\n" /* set PCL bit in new */ - " csg %0,%1,%2\n" - " jl 0b\n" - : "=&d" (old), "=&d" (new), "=Q" (ptep[PTRS_PER_PTE]) - : "Q" (ptep[PTRS_PER_PTE]) : "cc", "memory"); -#endif - return __pgste(new); -} - -static inline void pgste_set_unlock(pte_t *ptep, pgste_t pgste) -{ -#ifdef CONFIG_PGSTE - asm( - " nihh %1,0xff7f\n" /* clear PCL bit */ - " stg %1,%0\n" - : "=Q" (ptep[PTRS_PER_PTE]) - : "d" (pgste_val(pgste)), "Q" (ptep[PTRS_PER_PTE]) - : "cc", "memory"); -#endif -} - static inline pgste_t pgste_get(pte_t *ptep) { unsigned long pgste = 0; @@ -182,10 +144,10 @@ static inline pgste_t pgste_update_all(pte_t pte, pgste_t pgste, skey = (unsigned long) page_get_storage_key(address); bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED); /* Transfer page changed & referenced bit to guest bits in pgste */ - pgste_val(pgste) |= bits << 48; /* GR bit & GC bit */ + pgste = set_pgste_bit(pgste, bits << 48); /* GR bit & GC bit */ /* Copy page access key and fetch protection bit to pgste */ - pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT); - pgste_val(pgste) |= (skey & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56; + pgste = clear_pgste_bit(pgste, PGSTE_ACC_BITS | PGSTE_FP_BIT); + pgste = set_pgste_bit(pgste, (skey & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56); #endif return pgste; @@ -219,7 +181,7 @@ static inline pgste_t pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry) if ((pte_val(entry) & _PAGE_PRESENT) && (pte_val(entry) & _PAGE_WRITE) && !(pte_val(entry) & _PAGE_INVALID)) { - if (!MACHINE_HAS_ESOP) { + if (!machine_has_esop()) { /* * Without enhanced suppression-on-protection force * the dirty bit on for all writable ptes. 
@@ -229,7 +191,7 @@ static inline pgste_t pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry) } if (!(pte_val(entry) & _PAGE_PROTECT)) /* This pte allows write access, set user-dirty */ - pgste_val(pgste) |= PGSTE_UC_BIT; + pgste = set_pgste_bit(pgste, PGSTE_UC_BIT); } #endif set_pte(ptep, entry); @@ -245,7 +207,7 @@ static inline pgste_t pgste_pte_notify(struct mm_struct *mm, bits = pgste_val(pgste) & (PGSTE_IN_BIT | PGSTE_VSIE_BIT); if (bits) { - pgste_val(pgste) ^= bits; + pgste = __pgste(pgste_val(pgste) ^ bits); ptep_notify(mm, addr, ptep, bits); } #endif @@ -302,6 +264,31 @@ pte_t ptep_xchg_direct(struct mm_struct *mm, unsigned long addr, } EXPORT_SYMBOL(ptep_xchg_direct); +/* + * Caller must check that new PTE only differs in _PAGE_PROTECT HW bit, so that + * RDP can be used instead of IPTE. See also comments at pte_allow_rdp(). + */ +void ptep_reset_dat_prot(struct mm_struct *mm, unsigned long addr, pte_t *ptep, + pte_t new) +{ + preempt_disable(); + atomic_inc(&mm->context.flush_count); + if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) + __ptep_rdp(addr, ptep, 1); + else + __ptep_rdp(addr, ptep, 0); + /* + * PTE is not invalidated by RDP, only _PAGE_PROTECT is cleared. That + * means it is still valid and active, and must not be changed according + * to the architecture. But writing a new value that only differs in SW + * bits is allowed. + */ + set_pte(ptep, new); + atomic_dec(&mm->context.flush_count); + preempt_enable(); +} +EXPORT_SYMBOL(ptep_reset_dat_prot); + pte_t ptep_xchg_lazy(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t new) { @@ -327,7 +314,6 @@ pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, int nodat; struct mm_struct *mm = vma->vm_mm; - preempt_disable(); pgste = ptep_xchg_start(mm, addr, ptep); nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); old = ptep_flush_lazy(mm, addr, ptep, nodat); @@ -344,8 +330,6 @@ void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pgste_t pgste; struct mm_struct *mm = vma->vm_mm; - if (!MACHINE_HAS_NX) - pte = clear_pte_bit(pte, __pgprot(_PAGE_NOEXEC)); if (mm_has_pgste(mm)) { pgste = pgste_get(ptep); pgste_set_key(ptep, pgste, pte, mm); @@ -354,13 +338,12 @@ void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, } else { set_pte(ptep, pte); } - preempt_enable(); } static inline void pmdp_idte_local(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { - if (MACHINE_HAS_TLB_GUEST) + if (machine_has_tlb_guest()) __pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE, mm->context.asce, IDTE_LOCAL); else @@ -372,19 +355,15 @@ static inline void pmdp_idte_local(struct mm_struct *mm, static inline void pmdp_idte_global(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { - if (MACHINE_HAS_TLB_GUEST) { + if (machine_has_tlb_guest()) { __pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE, mm->context.asce, IDTE_GLOBAL); if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m) gmap_pmdp_idte_global(mm, addr); - } else if (MACHINE_HAS_IDTE) { + } else { __pmdp_idte(addr, pmdp, 0, 0, IDTE_GLOBAL); if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m) gmap_pmdp_idte_global(mm, addr); - } else { - __pmdp_csp(pmdp); - if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m) - gmap_pmdp_csp(mm, addr); } } @@ -397,7 +376,7 @@ static inline pmd_t pmdp_flush_direct(struct mm_struct *mm, if (pmd_val(old) & _SEGMENT_ENTRY_INVALID) return old; atomic_inc(&mm->context.flush_count); - if (MACHINE_HAS_TLB_LC && + if (cpu_has_tlb_lc() && 
cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) pmdp_idte_local(mm, addr, pmdp); else @@ -454,7 +433,7 @@ static int pmd_lookup(struct mm_struct *mm, unsigned long addr, pmd_t **pmdp) return -ENOENT; /* Large PUDs are not supported yet. */ - if (pud_large(*pud)) + if (pud_leaf(*pud)) return -EFAULT; *pmdp = pmd_offset(pud, addr); @@ -491,7 +470,7 @@ EXPORT_SYMBOL(pmdp_xchg_lazy); static inline void pudp_idte_local(struct mm_struct *mm, unsigned long addr, pud_t *pudp) { - if (MACHINE_HAS_TLB_GUEST) + if (machine_has_tlb_guest()) __pudp_idte(addr, pudp, IDTE_NODAT | IDTE_GUEST_ASCE, mm->context.asce, IDTE_LOCAL); else @@ -501,17 +480,11 @@ static inline void pudp_idte_local(struct mm_struct *mm, static inline void pudp_idte_global(struct mm_struct *mm, unsigned long addr, pud_t *pudp) { - if (MACHINE_HAS_TLB_GUEST) + if (machine_has_tlb_guest()) __pudp_idte(addr, pudp, IDTE_NODAT | IDTE_GUEST_ASCE, mm->context.asce, IDTE_GLOBAL); - else if (MACHINE_HAS_IDTE) - __pudp_idte(addr, pudp, 0, 0, IDTE_GLOBAL); else - /* - * Invalid bit position is the same for pmd and pud, so we can - * re-use _pmd_csp() here - */ - __pmdp_csp((pmd_t *) pudp); + __pudp_idte(addr, pudp, 0, 0, IDTE_GLOBAL); } static inline pud_t pudp_flush_direct(struct mm_struct *mm, @@ -523,7 +496,7 @@ static inline pud_t pudp_flush_direct(struct mm_struct *mm, if (pud_val(old) & _REGION_ENTRY_INVALID) return old; atomic_inc(&mm->context.flush_count); - if (MACHINE_HAS_TLB_LC && + if (cpu_has_tlb_lc() && cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) pudp_idte_local(mm, addr, pudp); else @@ -595,7 +568,7 @@ void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr, /* the mm_has_pgste() check is done in set_pte_at() */ preempt_disable(); pgste = pgste_get_lock(ptep); - pgste_val(pgste) &= ~_PGSTE_GPS_ZERO; + pgste = clear_pgste_bit(pgste, _PGSTE_GPS_ZERO); pgste_set_key(ptep, pgste, entry, mm); pgste = pgste_set_pte(ptep, pgste, entry); pgste_set_unlock(ptep, pgste); @@ -608,7 +581,7 @@ void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep) preempt_disable(); pgste = pgste_get_lock(ptep); - pgste_val(pgste) |= PGSTE_IN_BIT; + pgste = set_pgste_bit(pgste, PGSTE_IN_BIT); pgste_set_unlock(ptep, pgste); preempt_enable(); } @@ -653,7 +626,7 @@ int ptep_force_prot(struct mm_struct *mm, unsigned long addr, entry = clear_pte_bit(entry, __pgprot(_PAGE_INVALID)); entry = set_pte_bit(entry, __pgprot(_PAGE_PROTECT)); } - pgste_val(pgste) |= bit; + pgste = set_pgste_bit(pgste, bit); pgste = pgste_set_pte(ptep, pgste, entry); pgste_set_unlock(ptep, pgste); return 0; @@ -673,7 +646,7 @@ int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr, if (!(pte_val(spte) & _PAGE_INVALID) && !((pte_val(spte) & _PAGE_PROTECT) && !(pte_val(pte) & _PAGE_PROTECT))) { - pgste_val(spgste) |= PGSTE_VSIE_BIT; + spgste = set_pgste_bit(spgste, PGSTE_VSIE_BIT); tpgste = pgste_get_lock(tptep); tpte = __pte((pte_val(spte) & PAGE_MASK) | (pte_val(pte) & _PAGE_PROTECT)); @@ -700,14 +673,14 @@ void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep) pgste_set_unlock(ptep, pgste); } -static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry) +static void ptep_zap_softleaf_entry(struct mm_struct *mm, softleaf_t entry) { - if (!non_swap_entry(entry)) + if (softleaf_is_swap(entry)) dec_mm_counter(mm, MM_SWAPENTS); - else if (is_migration_entry(entry)) { - struct page *page = pfn_swap_entry_to_page(entry); + else if (softleaf_is_migration(entry)) { + struct folio *folio = 
softleaf_to_folio(entry); - dec_mm_counter(mm, mm_counter(page)); + dec_mm_counter(mm, mm_counter(folio)); } free_swap_and_cache(entry); } @@ -727,11 +700,11 @@ void ptep_zap_unused(struct mm_struct *mm, unsigned long addr, if (!reset && pte_swap(pte) && ((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED || (pgstev & _PGSTE_GPS_ZERO))) { - ptep_zap_swap_entry(mm, pte_to_swp_entry(pte)); + ptep_zap_softleaf_entry(mm, softleaf_from_pte(pte)); pte_clear(mm, addr, ptep); } if (reset) - pgste_val(pgste) &= ~_PGSTE_GPS_USAGE_MASK; + pgste = clear_pgste_bit(pgste, _PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT); pgste_set_unlock(ptep, pgste); preempt_enable(); } @@ -744,8 +717,8 @@ void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep) /* Clear storage key ACC and F, but set R/C */ preempt_disable(); pgste = pgste_get_lock(ptep); - pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT); - pgste_val(pgste) |= PGSTE_GR_BIT | PGSTE_GC_BIT; + pgste = clear_pgste_bit(pgste, PGSTE_ACC_BITS | PGSTE_FP_BIT); + pgste = set_pgste_bit(pgste, PGSTE_GR_BIT | PGSTE_GC_BIT); ptev = pte_val(*ptep); if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE)) page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 0); @@ -766,13 +739,13 @@ bool ptep_test_and_clear_uc(struct mm_struct *mm, unsigned long addr, pgste = pgste_get_lock(ptep); dirty = !!(pgste_val(pgste) & PGSTE_UC_BIT); - pgste_val(pgste) &= ~PGSTE_UC_BIT; + pgste = clear_pgste_bit(pgste, PGSTE_UC_BIT); pte = *ptep; if (dirty && (pte_val(pte) & _PAGE_PRESENT)) { pgste = pgste_pte_notify(mm, addr, ptep, pgste); nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); ptep_ipte_global(mm, addr, ptep, nodat); - if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE)) + if (machine_has_esop() || !(pte_val(pte) & _PAGE_WRITE)) pte = set_pte_bit(pte, __pgprot(_PAGE_PROTECT)); else pte = set_pte_bit(pte, __pgprot(_PAGE_INVALID)); @@ -804,14 +777,14 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, default: return -EFAULT; } - +again: ptl = pmd_lock(mm, pmdp); if (!pmd_present(*pmdp)) { spin_unlock(ptl); return key ? 
-EFAULT : 0; } - if (pmd_large(*pmdp)) { + if (pmd_leaf(*pmdp)) { paddr = pmd_val(*pmdp) & HPAGE_MASK; paddr |= addr & ~HPAGE_MASK; /* @@ -825,12 +798,14 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, spin_unlock(ptl); ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + if (!ptep) + goto again; new = old = pgste_get_lock(ptep); - pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT | - PGSTE_ACC_BITS | PGSTE_FP_BIT); + new = clear_pgste_bit(new, PGSTE_GR_BIT | PGSTE_GC_BIT | + PGSTE_ACC_BITS | PGSTE_FP_BIT); keyul = (unsigned long) key; - pgste_val(new) |= (keyul & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48; - pgste_val(new) |= (keyul & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56; + new = set_pgste_bit(new, (keyul & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48); + new = set_pgste_bit(new, (keyul & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56); if (!(pte_val(*ptep) & _PAGE_INVALID)) { unsigned long bits, skey; @@ -841,12 +816,12 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, /* Set storage key ACC and FP */ page_set_storage_key(paddr, skey, !nq); /* Merge host changed & referenced into pgste */ - pgste_val(new) |= bits << 52; + new = set_pgste_bit(new, bits << 52); } /* changing the guest storage key is considered a change of the page */ if ((pgste_val(new) ^ pgste_val(old)) & (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT)) - pgste_val(new) |= PGSTE_UC_BIT; + new = set_pgste_bit(new, PGSTE_UC_BIT); pgste_set_unlock(ptep, new); pte_unmap_unlock(ptep, ptl); @@ -913,14 +888,14 @@ int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr) default: return -EFAULT; } - +again: ptl = pmd_lock(mm, pmdp); if (!pmd_present(*pmdp)) { spin_unlock(ptl); return 0; } - if (pmd_large(*pmdp)) { + if (pmd_leaf(*pmdp)) { paddr = pmd_val(*pmdp) & HPAGE_MASK; paddr |= addr & ~HPAGE_MASK; cc = page_reset_referenced(paddr); @@ -930,21 +905,23 @@ int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr) spin_unlock(ptl); ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + if (!ptep) + goto again; new = old = pgste_get_lock(ptep); /* Reset guest reference bit only */ - pgste_val(new) &= ~PGSTE_GR_BIT; + new = clear_pgste_bit(new, PGSTE_GR_BIT); if (!(pte_val(*ptep) & _PAGE_INVALID)) { paddr = pte_val(*ptep) & PAGE_MASK; cc = page_reset_referenced(paddr); /* Merge real referenced bit into host-set */ - pgste_val(new) |= ((unsigned long) cc << 53) & PGSTE_HR_BIT; + new = set_pgste_bit(new, ((unsigned long)cc << 53) & PGSTE_HR_BIT); } /* Reflect guest's logical view, not physical */ cc |= (pgste_val(old) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 49; /* Changing the guest storage key is considered a change of the page */ if ((pgste_val(new) ^ pgste_val(old)) & PGSTE_GR_BIT) - pgste_val(new) |= PGSTE_UC_BIT; + new = set_pgste_bit(new, PGSTE_UC_BIT); pgste_set_unlock(ptep, new); pte_unmap_unlock(ptep, ptl); @@ -975,14 +952,14 @@ int get_guest_storage_key(struct mm_struct *mm, unsigned long addr, default: return -EFAULT; } - +again: ptl = pmd_lock(mm, pmdp); if (!pmd_present(*pmdp)) { spin_unlock(ptl); return 0; } - if (pmd_large(*pmdp)) { + if (pmd_leaf(*pmdp)) { paddr = pmd_val(*pmdp) & HPAGE_MASK; paddr |= addr & ~HPAGE_MASK; *key = page_get_storage_key(paddr); @@ -992,6 +969,8 @@ int get_guest_storage_key(struct mm_struct *mm, unsigned long addr, spin_unlock(ptl); ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + if (!ptep) + goto again; pgste = pgste_get_lock(ptep); *key = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56; paddr = 
pte_val(*ptep) & PAGE_MASK; @@ -1106,7 +1085,7 @@ int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc, if (res) pgstev |= _PGSTE_GPS_ZERO; - pgste_val(pgste) = pgstev; + pgste = __pgste(pgstev); pgste_set_unlock(ptep, pgste); pte_unmap_unlock(ptep, ptl); return res; @@ -1139,8 +1118,8 @@ int set_pgste_bits(struct mm_struct *mm, unsigned long hva, return -EFAULT; new = pgste_get_lock(ptep); - pgste_val(new) &= ~bits; - pgste_val(new) |= value & bits; + new = clear_pgste_bit(new, bits); + new = set_pgste_bit(new, value & bits); pgste_set_unlock(ptep, new); pte_unmap_unlock(ptep, ptl); diff --git a/arch/s390/mm/physaddr.c b/arch/s390/mm/physaddr.c new file mode 100644 index 000000000000..59de866c72d9 --- /dev/null +++ b/arch/s390/mm/physaddr.c @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/mmdebug.h> +#include <linux/export.h> +#include <linux/mm.h> +#include <asm/page.h> + +unsigned long __phys_addr(unsigned long x, bool is_31bit) +{ + VIRTUAL_BUG_ON(is_vmalloc_or_module_addr((void *)(x))); + x = __pa_nodebug(x); + if (is_31bit) + VIRTUAL_BUG_ON(x >> 31); + return x; +} +EXPORT_SYMBOL(__phys_addr); diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index ee1a97078527..d96587b84e81 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -4,6 +4,8 @@ */ #include <linux/memory_hotplug.h> +#include <linux/bootmem_info.h> +#include <linux/cpufeature.h> #include <linux/memblock.h> #include <linux/pfn.h> #include <linux/mm.h> @@ -11,13 +13,19 @@ #include <linux/list.h> #include <linux/hugetlb.h> #include <linux/slab.h> +#include <linux/sort.h> +#include <asm/page-states.h> +#include <asm/abs_lowcore.h> #include <asm/cacheflush.h> +#include <asm/maccess.h> #include <asm/nospec-branch.h> +#include <asm/ctlreg.h> #include <asm/pgalloc.h> #include <asm/setup.h> #include <asm/tlbflush.h> #include <asm/sections.h> #include <asm/set_memory.h> +#include <asm/physmem_info.h> static DEFINE_MUTEX(vmem_mutex); @@ -30,13 +38,23 @@ static void __ref *vmem_alloc_pages(unsigned int order) return memblock_alloc(size, size); } -static void vmem_free_pages(unsigned long addr, int order) +static void vmem_free_pages(unsigned long addr, int order, struct vmem_altmap *altmap) { - /* We don't expect boot memory to be removed ever. 
*/ - if (!slab_is_available() || - WARN_ON_ONCE(PageReserved(virt_to_page(addr)))) + unsigned int nr_pages = 1 << order; + struct page *page; + + if (altmap) { + vmem_altmap_free(altmap, 1 << order); return; - free_pages(addr, order); + } + page = virt_to_page((void *)addr); + if (PageReserved(page)) { + /* allocated from memblock */ + while (nr_pages--) + free_bootmem_page(page++); + } else { + free_pages(addr, order); + } } void *vmem_crst_alloc(unsigned long val) @@ -44,32 +62,30 @@ void *vmem_crst_alloc(unsigned long val) unsigned long *table; table = vmem_alloc_pages(CRST_ALLOC_ORDER); - if (table) - crst_table_init(table, val); + if (!table) + return NULL; + crst_table_init(table, val); + __arch_set_page_dat(table, 1UL << CRST_ALLOC_ORDER); return table; } pte_t __ref *vmem_pte_alloc(void) { - unsigned long size = PTRS_PER_PTE * sizeof(pte_t); pte_t *pte; if (slab_is_available()) - pte = (pte_t *) page_table_alloc(&init_mm); + pte = (pte_t *)page_table_alloc(&init_mm); else - pte = (pte_t *) memblock_alloc(size, size); + pte = (pte_t *)memblock_alloc(PAGE_SIZE, PAGE_SIZE); if (!pte) return NULL; memset64((u64 *)pte, _PAGE_INVALID, PTRS_PER_PTE); + __arch_set_page_dat(pte, 1); return pte; } static void vmem_pte_free(unsigned long *table) { - /* We don't expect boot memory to be removed ever. */ - if (!slab_is_available() || - WARN_ON_ONCE(PageReserved(virt_to_page(table)))) - return; page_table_free(&init_mm, table); } @@ -150,27 +166,25 @@ static bool vmemmap_unuse_sub_pmd(unsigned long start, unsigned long end) /* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */ static int __ref modify_pte_table(pmd_t *pmd, unsigned long addr, - unsigned long end, bool add, bool direct) + unsigned long end, bool add, bool direct, + struct vmem_altmap *altmap) { unsigned long prot, pages = 0; int ret = -ENOMEM; pte_t *pte; prot = pgprot_val(PAGE_KERNEL); - if (!MACHINE_HAS_NX) - prot &= ~_PAGE_NOEXEC; - pte = pte_offset_kernel(pmd, addr); for (; addr < end; addr += PAGE_SIZE, pte++) { if (!add) { if (pte_none(*pte)) continue; if (!direct) - vmem_free_pages((unsigned long) pfn_to_virt(pte_pfn(*pte)), 0); + vmem_free_pages((unsigned long)pfn_to_virt(pte_pfn(*pte)), get_order(PAGE_SIZE), altmap); pte_clear(&init_mm, addr, pte); } else if (pte_none(*pte)) { if (!direct) { - void *new_page = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE); + void *new_page = vmemmap_alloc_block_buf(PAGE_SIZE, NUMA_NO_NODE, altmap); if (!new_page) goto out; @@ -207,7 +221,8 @@ static void try_free_pte_table(pmd_t *pmd, unsigned long start) /* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, - unsigned long end, bool add, bool direct) + unsigned long end, bool add, bool direct, + struct vmem_altmap *altmap) { unsigned long next, prot, pages = 0; int ret = -ENOMEM; @@ -215,24 +230,21 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, pte_t *pte; prot = pgprot_val(SEGMENT_KERNEL); - if (!MACHINE_HAS_NX) - prot &= ~_SEGMENT_ENTRY_NOEXEC; - pmd = pmd_offset(pud, addr); for (; addr < end; addr = next, pmd++) { next = pmd_addr_end(addr, end); if (!add) { if (pmd_none(*pmd)) continue; - if (pmd_large(*pmd)) { + if (pmd_leaf(*pmd)) { if (IS_ALIGNED(addr, PMD_SIZE) && IS_ALIGNED(next, PMD_SIZE)) { if (!direct) - vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE)); + vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE), altmap); pmd_clear(pmd); pages++; } else if (!direct && vmemmap_unuse_sub_pmd(addr, next)) 
{ - vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE)); + vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE), altmap); pmd_clear(pmd); } continue; @@ -240,12 +252,12 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, } else if (pmd_none(*pmd)) { if (IS_ALIGNED(addr, PMD_SIZE) && IS_ALIGNED(next, PMD_SIZE) && - MACHINE_HAS_EDAT1 && direct && + cpu_has_edat1() && direct && !debug_pagealloc_enabled()) { set_pmd(pmd, __pmd(__pa(addr) | prot)); pages++; continue; - } else if (!direct && MACHINE_HAS_EDAT1) { + } else if (!direct && cpu_has_edat1()) { void *new_page; /* @@ -255,7 +267,7 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, * page tables since vmemmap_populate gets * called for each section separately. */ - new_page = vmemmap_alloc_block(PMD_SIZE, NUMA_NO_NODE); + new_page = vmemmap_alloc_block_buf(PMD_SIZE, NUMA_NO_NODE, altmap); if (new_page) { set_pmd(pmd, __pmd(__pa(new_page) | prot)); if (!IS_ALIGNED(addr, PMD_SIZE) || @@ -269,12 +281,12 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, if (!pte) goto out; pmd_populate(&init_mm, pmd, pte); - } else if (pmd_large(*pmd)) { + } else if (pmd_leaf(*pmd)) { if (!direct) vmemmap_use_sub_pmd(addr, next); continue; } - ret = modify_pte_table(pmd, addr, next, add, direct); + ret = modify_pte_table(pmd, addr, next, add, direct, altmap); if (ret) goto out; if (!add) @@ -289,27 +301,19 @@ out: static void try_free_pmd_table(pud_t *pud, unsigned long start) { - const unsigned long end = start + PUD_SIZE; pmd_t *pmd; int i; - /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */ - if (end > VMALLOC_START) - return; -#ifdef CONFIG_KASAN - if (start < KASAN_SHADOW_END && KASAN_SHADOW_START > end) - return; -#endif pmd = pmd_offset(pud, start); for (i = 0; i < PTRS_PER_PMD; i++, pmd++) if (!pmd_none(*pmd)) return; - vmem_free_pages(pud_deref(*pud), CRST_ALLOC_ORDER); + vmem_free_pages(pud_deref(*pud), CRST_ALLOC_ORDER, NULL); pud_clear(pud); } static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end, - bool add, bool direct) + bool add, bool direct, struct vmem_altmap *altmap) { unsigned long next, prot, pages = 0; int ret = -ENOMEM; @@ -317,15 +321,13 @@ static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end, pmd_t *pmd; prot = pgprot_val(REGION3_KERNEL); - if (!MACHINE_HAS_NX) - prot &= ~_REGION_ENTRY_NOEXEC; pud = pud_offset(p4d, addr); for (; addr < end; addr = next, pud++) { next = pud_addr_end(addr, end); if (!add) { if (pud_none(*pud)) continue; - if (pud_large(*pud)) { + if (pud_leaf(*pud)) { if (IS_ALIGNED(addr, PUD_SIZE) && IS_ALIGNED(next, PUD_SIZE)) { pud_clear(pud); @@ -336,7 +338,7 @@ static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end, } else if (pud_none(*pud)) { if (IS_ALIGNED(addr, PUD_SIZE) && IS_ALIGNED(next, PUD_SIZE) && - MACHINE_HAS_EDAT2 && direct && + cpu_has_edat2() && direct && !debug_pagealloc_enabled()) { set_pud(pud, __pud(__pa(addr) | prot)); pages++; @@ -346,10 +348,10 @@ static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end, if (!pmd) goto out; pud_populate(&init_mm, pud, pmd); - } else if (pud_large(*pud)) { + } else if (pud_leaf(*pud)) { continue; } - ret = modify_pmd_table(pud, addr, next, add, direct); + ret = modify_pmd_table(pud, addr, next, add, direct, altmap); if (ret) goto out; if (!add) @@ -364,29 +366,20 @@ out: static void try_free_pud_table(p4d_t *p4d, unsigned long start) { - const unsigned long end = start + P4D_SIZE; pud_t 
*pud; int i; - /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */ - if (end > VMALLOC_START) - return; -#ifdef CONFIG_KASAN - if (start < KASAN_SHADOW_END && KASAN_SHADOW_START > end) - return; -#endif - pud = pud_offset(p4d, start); for (i = 0; i < PTRS_PER_PUD; i++, pud++) { if (!pud_none(*pud)) return; } - vmem_free_pages(p4d_deref(*p4d), CRST_ALLOC_ORDER); + vmem_free_pages(p4d_deref(*p4d), CRST_ALLOC_ORDER, NULL); p4d_clear(p4d); } static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end, - bool add, bool direct) + bool add, bool direct, struct vmem_altmap *altmap) { unsigned long next; int ret = -ENOMEM; @@ -405,7 +398,7 @@ static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end, goto out; p4d_populate(&init_mm, p4d, pud); } - ret = modify_pud_table(p4d, addr, next, add, direct); + ret = modify_pud_table(p4d, addr, next, add, direct, altmap); if (ret) goto out; if (!add) @@ -418,29 +411,20 @@ out: static void try_free_p4d_table(pgd_t *pgd, unsigned long start) { - const unsigned long end = start + PGDIR_SIZE; p4d_t *p4d; int i; - /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */ - if (end > VMALLOC_START) - return; -#ifdef CONFIG_KASAN - if (start < KASAN_SHADOW_END && KASAN_SHADOW_START > end) - return; -#endif - p4d = p4d_offset(pgd, start); for (i = 0; i < PTRS_PER_P4D; i++, p4d++) { if (!p4d_none(*p4d)) return; } - vmem_free_pages(pgd_deref(*pgd), CRST_ALLOC_ORDER); + vmem_free_pages(pgd_deref(*pgd), CRST_ALLOC_ORDER, NULL); pgd_clear(pgd); } static int modify_pagetable(unsigned long start, unsigned long end, bool add, - bool direct) + bool direct, struct vmem_altmap *altmap) { unsigned long addr, next; int ret = -ENOMEM; @@ -449,6 +433,9 @@ static int modify_pagetable(unsigned long start, unsigned long end, bool add, if (WARN_ON_ONCE(!PAGE_ALIGNED(start | end))) return -EINVAL; + /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */ + if (WARN_ON_ONCE(end > __abs_lowcore)) + return -EINVAL; for (addr = start; addr < end; addr = next) { next = pgd_addr_end(addr, end); pgd = pgd_offset_k(addr); @@ -462,7 +449,7 @@ static int modify_pagetable(unsigned long start, unsigned long end, bool add, goto out; pgd_populate(&init_mm, pgd, p4d); } - ret = modify_p4d_table(pgd, addr, next, add, direct); + ret = modify_p4d_table(pgd, addr, next, add, direct, altmap); if (ret) goto out; if (!add) @@ -475,14 +462,16 @@ out: return ret; } -static int add_pagetable(unsigned long start, unsigned long end, bool direct) +static int add_pagetable(unsigned long start, unsigned long end, bool direct, + struct vmem_altmap *altmap) { - return modify_pagetable(start, end, true, direct); + return modify_pagetable(start, end, true, direct, altmap); } -static int remove_pagetable(unsigned long start, unsigned long end, bool direct) +static int remove_pagetable(unsigned long start, unsigned long end, bool direct, + struct vmem_altmap *altmap) { - return modify_pagetable(start, end, false, direct); + return modify_pagetable(start, end, false, direct, altmap); } /* @@ -490,7 +479,8 @@ static int remove_pagetable(unsigned long start, unsigned long end, bool direct) */ static int vmem_add_range(unsigned long start, unsigned long size) { - return add_pagetable(start, start + size, true); + start = (unsigned long)__va(start); + return add_pagetable(start, start + size, true, NULL); } /* @@ -498,7 +488,8 @@ static int vmem_add_range(unsigned long start, unsigned long size) */ static void 
vmem_remove_range(unsigned long start, unsigned long size) { - remove_pagetable(start, start + size, true); + start = (unsigned long)__va(start); + remove_pagetable(start, start + size, true, NULL); } /* @@ -511,21 +502,25 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, mutex_lock(&vmem_mutex); /* We don't care about the node, just use NUMA_NO_NODE on allocations */ - ret = add_pagetable(start, end, false); + ret = add_pagetable(start, end, false, altmap); if (ret) - remove_pagetable(start, end, false); + remove_pagetable(start, end, false, altmap); mutex_unlock(&vmem_mutex); return ret; } +#ifdef CONFIG_MEMORY_HOTPLUG + void vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *altmap) { mutex_lock(&vmem_mutex); - remove_pagetable(start, end, false); + remove_pagetable(start, end, false, altmap); mutex_unlock(&vmem_mutex); } +#endif + void vmem_remove_mapping(unsigned long start, unsigned long size) { mutex_lock(&vmem_mutex); @@ -538,7 +533,7 @@ struct range arch_get_mappable_range(void) struct range mhp_range; mhp_range.start = 0; - mhp_range.end = VMEM_MAX_PHYS - 1; + mhp_range.end = max_mappable - 1; return mhp_range; } @@ -565,7 +560,7 @@ int vmem_add_mapping(unsigned long start, unsigned long size) * to any physical address. If missing, allocate segment- and region- * table entries along. Meeting a large segment- or region-table entry * while traversing is an error, since the function is expected to be - * called against virtual regions reserverd for 4KB mappings only. + * called against virtual regions reserved for 4KB mappings only. */ pte_t *vmem_get_alloc_pte(unsigned long addr, bool alloc) { @@ -602,7 +597,7 @@ pte_t *vmem_get_alloc_pte(unsigned long addr, bool alloc) if (!pmd) goto out; pud_populate(&init_mm, pud, pmd); - } else if (WARN_ON_ONCE(pud_large(*pud))) { + } else if (WARN_ON_ONCE(pud_leaf(*pud))) { goto out; } pmd = pmd_offset(pud, addr); @@ -613,7 +608,7 @@ pte_t *vmem_get_alloc_pte(unsigned long addr, bool alloc) if (!pte) goto out; pmd_populate(&init_mm, pmd, pte); - } else if (WARN_ON_ONCE(pmd_large(*pmd))) { + } else if (WARN_ON_ONCE(pmd_leaf(*pmd))) { goto out; } ptep = pte_offset_kernel(pmd, addr); @@ -657,37 +652,20 @@ void vmem_unmap_4k_page(unsigned long addr) mutex_unlock(&vmem_mutex); } -/* - * map whole physical memory to virtual memory (identity mapping) - * we reserve enough space in the vmalloc area for vmemmap to hotplug - * additional memory segments. 
- */ void __init vmem_map_init(void) { - phys_addr_t base, end; - u64 i; - - for_each_mem_range(i, &base, &end) - vmem_add_range(base, end - base); - __set_memory((unsigned long)_stext, - (unsigned long)(_etext - _stext) >> PAGE_SHIFT, - SET_MEMORY_RO | SET_MEMORY_X); - __set_memory((unsigned long)_etext, - (unsigned long)(__end_rodata - _etext) >> PAGE_SHIFT, - SET_MEMORY_RO); - __set_memory((unsigned long)_sinittext, - (unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT, - SET_MEMORY_RO | SET_MEMORY_X); - __set_memory(__stext_amode31, (__etext_amode31 - __stext_amode31) >> PAGE_SHIFT, - SET_MEMORY_RO | SET_MEMORY_X); - - /* lowcore requires 4k mapping for real addresses / prefixing */ - set_memory_4k(0, LC_PAGES); - - /* lowcore must be executable for LPSWE */ - if (!static_key_enabled(&cpu_has_bear)) + __set_memory_rox(_stext, _etext); + __set_memory_ro(_etext, __end_rodata); + __set_memory_rox(__stext_amode31, __etext_amode31); + /* + * If the BEAR-enhancement facility is not installed the first + * prefix page is used to return to the previous context with + * an LPSWE instruction and therefore must be executable. + */ + if (!cpu_has_bear()) set_memory_x(0, 1); - + if (debug_pagealloc_enabled()) + __set_memory_4k(__va(0), absolute_pointer(__va(0)) + ident_map_size); pr_info("Write protected kernel read-only data: %luk\n", (unsigned long)(__end_rodata - _stext) >> 10); } |
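A detail of the new change_page_attr_alias() added in pageattr.c above that is easy to miss: when a read-only or read-write change is requested on a vmalloc mapping, the same change is also applied to the direct-map alias of every page backing that mapping, while execute permissions are intentionally not propagated so the direct mapping stays non-executable. The toy program below is a minimal sketch of that flag filtering and per-page alias walk; all names, the fixed 4 KiB page size, and the pfn array are hypothetical stand-ins, not the kernel's find_vm_area()/page_address() machinery.

```c
#include <stdio.h>

#define SET_MEMORY_RO 0x01UL
#define SET_MEMORY_RW 0x02UL
#define SET_MEMORY_X  0x04UL	/* deliberately never mirrored to the direct map */

/* Toy model of a vmalloc area: nr_pages virtual pages, each backed by a pfn. */
struct toy_vm_area {
	unsigned long va_start;
	unsigned long nr_pages;
	unsigned long pfns[4];
};

/* Stand-in for applying attributes to one direct-map page. */
static void toy_change_direct_map(unsigned long pfn, unsigned long flags)
{
	printf("direct map pfn %lu: %s\n", pfn,
	       (flags & SET_MEMORY_RO) ? "read-only" : "read-write");
}

/*
 * Mirrors the intent of change_page_attr_alias(): only RO/RW requests are
 * forwarded to the direct-map alias of each backing page; executable
 * permissions stay confined to the vmalloc mapping itself.
 */
static void toy_change_alias(struct toy_vm_area *area, unsigned long addr,
			     unsigned long end, unsigned long flags)
{
	unsigned long offset;

	flags &= SET_MEMORY_RO | SET_MEMORY_RW;
	if (!flags)
		return;
	for (; addr < end; addr += 4096) {
		offset = (addr - area->va_start) / 4096;
		if (offset >= area->nr_pages)
			break;
		toy_change_direct_map(area->pfns[offset], flags);
	}
}

int main(void)
{
	struct toy_vm_area area = {
		.va_start = 0x100000, .nr_pages = 4,
		.pfns = { 10, 42, 7, 99 },
	};

	/* make the first two pages of the area read-only + executable */
	toy_change_alias(&area, area.va_start, area.va_start + 2 * 4096,
			 SET_MEMORY_RO | SET_MEMORY_X);
	return 0;
}
```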

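For readers unfamiliar with the pfault handshake implemented in the new arch/s390/mm/pfault.c above: the handler tracks per-task state in thread.pfault_wait, where 0 means no fault is outstanding, 1 means the initial "page missing" interrupt arrived and the task should sleep, and -1 means the completion interrupt overtook the initial one on another virtual CPU. The standalone C sketch below only traces those transitions for both interrupt orderings; it is a hypothetical userspace model that ignores the spinlock, task references, the pfault_list, and the scheduler interaction described in the comments.

```c
#include <stdio.h>

/* Mirrors thread.pfault_wait: 0 = idle, 1 = waiting, -1 = completion came first. */
struct task {
	int pfault_wait;
	int running;
};

/* Initial interrupt: the host reports a missing page for the current task. */
static void pfault_initial(struct task *t)
{
	if (t->pfault_wait == 1) {
		/* already accounted for: just block again */
		printf("initial: already waiting -> block\n");
	} else if (t->pfault_wait == -1) {
		/* completion overtook us: clear state, do not sleep */
		t->pfault_wait = 0;
		printf("initial: completion was faster -> keep running\n");
	} else {
		t->pfault_wait = 1;
		printf("initial: mark waiting -> block until completion\n");
	}
}

/* Completion interrupt: the host signals that the page was paged in. */
static void pfault_complete(struct task *t)
{
	if (t->pfault_wait == 1) {
		t->pfault_wait = 0;
		printf("complete: wake up sleeping task\n");
	} else if (t->running) {
		/* completion beat the initial interrupt */
		t->pfault_wait = -1;
		printf("complete: arrived first -> remember via -1\n");
	} else {
		printf("complete: stale leftover -> ignore\n");
	}
}

int main(void)
{
	struct task t = { .pfault_wait = 0, .running = 1 };

	/* normal ordering: initial interrupt, then completion */
	pfault_initial(&t);
	pfault_complete(&t);

	/* reversed ordering, as can happen with virtual CPUs */
	pfault_complete(&t);
	pfault_initial(&t);
	return 0;
}
```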