Diffstat (limited to 'arch/s390/mm')
-rw-r--r--  arch/s390/mm/Makefile           |   12
-rw-r--r--  arch/s390/mm/cmm.c              |  212
-rw-r--r--  arch/s390/mm/dump_pagetables.c  |  450
-rw-r--r--  arch/s390/mm/extable.c          |  202
-rw-r--r--  arch/s390/mm/extmem.c           |  295
-rw-r--r--  arch/s390/mm/fault.c            |  893
-rw-r--r--  arch/s390/mm/gmap.c             | 2436
-rw-r--r--  arch/s390/mm/gmap_helpers.c     |  233
-rw-r--r--  arch/s390/mm/gup.c              |  281
-rw-r--r--  arch/s390/mm/hugetlbpage.c      |  289
-rw-r--r--  arch/s390/mm/init.c             |  362
-rw-r--r--  arch/s390/mm/maccess.c          |  301
-rw-r--r--  arch/s390/mm/mem_detect.c       |  135
-rw-r--r--  arch/s390/mm/mmap.c             |  266
-rw-r--r--  arch/s390/mm/page-states.c      |  102
-rw-r--r--  arch/s390/mm/pageattr.c         |  475
-rw-r--r--  arch/s390/mm/pfault.c           |  248
-rw-r--r--  arch/s390/mm/pgalloc.c          |  481
-rw-r--r--  arch/s390/mm/pgtable.c          | 1933
-rw-r--r--  arch/s390/mm/physaddr.c         |   15
-rw-r--r--  arch/s390/mm/vmem.c             |  838
21 files changed, 6970 insertions(+), 3489 deletions(-)
diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile index 839592ca265c..bd0401cc7ca5 100644 --- a/arch/s390/mm/Makefile +++ b/arch/s390/mm/Makefile @@ -1,10 +1,16 @@ +# SPDX-License-Identifier: GPL-2.0 # # Makefile for the linux s390-specific parts of the memory manager. # -obj-y := init.o fault.o extmem.o mmap.o vmem.o pgtable.o maccess.o -obj-y += page-states.o gup.o extable.o pageattr.o mem_detect.o +obj-y := init.o fault.o extmem.o mmap.o vmem.o maccess.o +obj-y += page-states.o pageattr.o pgtable.o pgalloc.o extable.o obj-$(CONFIG_CMM) += cmm.o +obj-$(CONFIG_DEBUG_VIRTUAL) += physaddr.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o -obj-$(CONFIG_S390_PTDUMP) += dump_pagetables.o +obj-$(CONFIG_PTDUMP) += dump_pagetables.o +obj-$(CONFIG_PGSTE) += gmap.o +obj-$(CONFIG_PFAULT) += pfault.o + +obj-$(subst m,y,$(CONFIG_KVM)) += gmap_helpers.o diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c index 9d84a1feefef..eb7ef63fab1e 100644 --- a/arch/s390/mm/cmm.c +++ b/arch/s390/mm/cmm.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Collaborative memory management interface. * @@ -10,17 +11,16 @@ #include <linux/fs.h> #include <linux/init.h> #include <linux/module.h> +#include <linux/moduleparam.h> #include <linux/gfp.h> #include <linux/sched.h> +#include <linux/string_helpers.h> #include <linux/sysctl.h> -#include <linux/ctype.h> #include <linux/swap.h> #include <linux/kthread.h> #include <linux/oom.h> -#include <linux/suspend.h> #include <linux/uaccess.h> -#include <asm/pgalloc.h> #include <asm/diag.h> #ifdef CONFIG_CMM_IUCV @@ -47,7 +47,6 @@ static volatile long cmm_pages_target; static volatile long cmm_timed_pages_target; static long cmm_timeout_pages; static long cmm_timeout_seconds; -static int cmm_suspended; static struct cmm_page_array *cmm_page_list; static struct cmm_page_array *cmm_timed_page_list; @@ -55,10 +54,10 @@ static DEFINE_SPINLOCK(cmm_lock); static struct task_struct *cmm_thread_ptr; static DECLARE_WAIT_QUEUE_HEAD(cmm_thread_wait); -static DEFINE_TIMER(cmm_timer, NULL, 0, 0); -static void cmm_timer_fn(unsigned long); +static void cmm_timer_fn(struct timer_list *); static void cmm_set_timer(void); +static DEFINE_TIMER(cmm_timer, cmm_timer_fn); static long cmm_alloc_pages(long nr, long *counter, struct cmm_page_array **list) @@ -91,16 +90,17 @@ static long cmm_alloc_pages(long nr, long *counter, } else free_page((unsigned long) npa); } - diag10_range(addr >> PAGE_SHIFT, 1); + diag10_range(virt_to_pfn((void *)addr), 1); pa->pages[pa->index++] = addr; (*counter)++; spin_unlock(&cmm_lock); nr--; + cond_resched(); } return nr; } -static long cmm_free_pages(long nr, long *counter, struct cmm_page_array **list) +static long __cmm_free_pages(long nr, long *counter, struct cmm_page_array **list) { struct cmm_page_array *pa; unsigned long addr; @@ -124,6 +124,21 @@ static long cmm_free_pages(long nr, long *counter, struct cmm_page_array **list) return nr; } +static long cmm_free_pages(long nr, long *counter, struct cmm_page_array **list) +{ + long inc = 0; + + while (nr) { + inc = min(256L, nr); + nr -= inc; + inc = __cmm_free_pages(inc, counter, list); + if (inc) + break; + cond_resched(); + } + return nr + inc; +} + static int cmm_oom_notify(struct notifier_block *self, unsigned long dummy, void *parm) { @@ -149,9 +164,9 @@ static int cmm_thread(void *dummy) while (1) { rc = wait_event_interruptible(cmm_thread_wait, - (!cmm_suspended && (cmm_pages != cmm_pages_target || - cmm_timed_pages != cmm_timed_pages_target)) || - kthread_should_stop()); + cmm_pages != 
cmm_pages_target || + cmm_timed_pages != cmm_timed_pages_target || + kthread_should_stop()); if (kthread_should_stop() || rc == -ERESTARTSYS) { cmm_pages_target = cmm_pages; cmm_timed_pages_target = cmm_timed_pages; @@ -186,20 +201,13 @@ static void cmm_set_timer(void) { if (cmm_timed_pages_target <= 0 || cmm_timeout_seconds <= 0) { if (timer_pending(&cmm_timer)) - del_timer(&cmm_timer); + timer_delete(&cmm_timer); return; } - if (timer_pending(&cmm_timer)) { - if (mod_timer(&cmm_timer, jiffies + cmm_timeout_seconds*HZ)) - return; - } - cmm_timer.function = cmm_timer_fn; - cmm_timer.data = 0; - cmm_timer.expires = jiffies + cmm_timeout_seconds*HZ; - add_timer(&cmm_timer); + mod_timer(&cmm_timer, jiffies + secs_to_jiffies(cmm_timeout_seconds)); } -static void cmm_timer_fn(unsigned long ignored) +static void cmm_timer_fn(struct timer_list *unused) { long nr; @@ -251,54 +259,51 @@ static int cmm_skip_blanks(char *cp, char **endp) return str != cp; } -static struct ctl_table cmm_table[]; +static int cmm_pages_handler(const struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + long nr = cmm_get_pages(); + struct ctl_table ctl_entry = { + .procname = ctl->procname, + .data = &nr, + .maxlen = sizeof(long), + }; + int rc; + + rc = proc_doulongvec_minmax(&ctl_entry, write, buffer, lenp, ppos); + if (rc < 0 || !write) + return rc; -static int cmm_pages_handler(ctl_table *ctl, int write, void __user *buffer, - size_t *lenp, loff_t *ppos) + cmm_set_pages(nr); + return 0; +} + +static int cmm_timed_pages_handler(const struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, + loff_t *ppos) { - char buf[16], *p; - long nr; - int len; + long nr = cmm_get_timed_pages(); + struct ctl_table ctl_entry = { + .procname = ctl->procname, + .data = &nr, + .maxlen = sizeof(long), + }; + int rc; - if (!*lenp || (*ppos && !write)) { - *lenp = 0; - return 0; - } + rc = proc_doulongvec_minmax(&ctl_entry, write, buffer, lenp, ppos); + if (rc < 0 || !write) + return rc; - if (write) { - len = *lenp; - if (copy_from_user(buf, buffer, - len > sizeof(buf) ? sizeof(buf) : len)) - return -EFAULT; - buf[sizeof(buf) - 1] = '\0'; - cmm_skip_blanks(buf, &p); - nr = simple_strtoul(p, &p, 0); - if (ctl == &cmm_table[0]) - cmm_set_pages(nr); - else - cmm_add_timed_pages(nr); - } else { - if (ctl == &cmm_table[0]) - nr = cmm_get_pages(); - else - nr = cmm_get_timed_pages(); - len = sprintf(buf, "%ld\n", nr); - if (len > *lenp) - len = *lenp; - if (copy_to_user(buffer, buf, len)) - return -EFAULT; - } - *lenp = len; - *ppos += len; + cmm_add_timed_pages(nr); return 0; } -static int cmm_timeout_handler(ctl_table *ctl, int write, void __user *buffer, - size_t *lenp, loff_t *ppos) +static int cmm_timeout_handler(const struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) { char buf[64], *p; long nr, seconds; - int len; + unsigned int len; if (!*lenp || (*ppos && !write)) { *lenp = 0; @@ -306,30 +311,28 @@ static int cmm_timeout_handler(ctl_table *ctl, int write, void __user *buffer, } if (write) { - len = *lenp; - if (copy_from_user(buf, buffer, - len > sizeof(buf) ? 
sizeof(buf) : len)) - return -EFAULT; - buf[sizeof(buf) - 1] = '\0'; + len = min(*lenp, sizeof(buf)); + memcpy(buf, buffer, len); + buf[len - 1] = '\0'; cmm_skip_blanks(buf, &p); nr = simple_strtoul(p, &p, 0); cmm_skip_blanks(p, &p); seconds = simple_strtoul(p, &p, 0); cmm_set_timeout(nr, seconds); + *ppos += *lenp; } else { - len = sprintf(buf, "%ld %ld\n", - cmm_timeout_pages, cmm_timeout_seconds); + len = scnprintf(buf, sizeof(buf), "%ld %ld\n", + cmm_timeout_pages, cmm_timeout_seconds); if (len > *lenp) len = *lenp; - if (copy_to_user(buffer, buf, len)) - return -EFAULT; + memcpy(buffer, buf, len); + *lenp = len; + *ppos += len; } - *lenp = len; - *ppos += len; return 0; } -static struct ctl_table cmm_table[] = { +static const struct ctl_table cmm_table[] = { { .procname = "cmm_pages", .mode = 0644, @@ -338,24 +341,13 @@ static struct ctl_table cmm_table[] = { { .procname = "cmm_timed_pages", .mode = 0644, - .proc_handler = cmm_pages_handler, + .proc_handler = cmm_timed_pages_handler, }, { .procname = "cmm_timeout", .mode = 0644, .proc_handler = cmm_timeout_handler, }, - { } -}; - -static struct ctl_table cmm_dir_table[] = { - { - .procname = "vm", - .maxlen = 0, - .mode = 0555, - .child = cmm_table, - }, - { } }; #ifdef CONFIG_CMM_IUCV @@ -398,54 +390,19 @@ static void cmm_smsg_target(const char *from, char *msg) static struct ctl_table_header *cmm_sysctl_header; -static int cmm_suspend(void) -{ - cmm_suspended = 1; - cmm_free_pages(cmm_pages, &cmm_pages, &cmm_page_list); - cmm_free_pages(cmm_timed_pages, &cmm_timed_pages, &cmm_timed_page_list); - return 0; -} - -static int cmm_resume(void) -{ - cmm_suspended = 0; - cmm_kick_thread(); - return 0; -} - -static int cmm_power_event(struct notifier_block *this, - unsigned long event, void *ptr) -{ - switch (event) { - case PM_POST_HIBERNATION: - return cmm_resume(); - case PM_HIBERNATION_PREPARE: - return cmm_suspend(); - default: - return NOTIFY_DONE; - } -} - -static struct notifier_block cmm_power_notifier = { - .notifier_call = cmm_power_event, -}; - static int __init cmm_init(void) { int rc = -ENOMEM; - cmm_sysctl_header = register_sysctl_table(cmm_dir_table); + cmm_sysctl_header = register_sysctl("vm", cmm_table); if (!cmm_sysctl_header) goto out_sysctl; #ifdef CONFIG_CMM_IUCV /* convert sender to uppercase characters */ - if (sender) { - int len = strlen(sender); - while (len--) - sender[len] = toupper(sender[len]); - } else { + if (sender) + string_upper(sender, sender); + else sender = cmm_default_sender; - } rc = smsg_register_callback(SMSG_PREFIX, cmm_smsg_target); if (rc < 0) @@ -454,16 +411,11 @@ static int __init cmm_init(void) rc = register_oom_notifier(&cmm_oom_nb); if (rc < 0) goto out_oom_notify; - rc = register_pm_notifier(&cmm_power_notifier); - if (rc) - goto out_pm; cmm_thread_ptr = kthread_run(cmm_thread, NULL, "cmmthread"); if (!IS_ERR(cmm_thread_ptr)) return 0; rc = PTR_ERR(cmm_thread_ptr); - unregister_pm_notifier(&cmm_power_notifier); -out_pm: unregister_oom_notifier(&cmm_oom_nb); out_oom_notify: #ifdef CONFIG_CMM_IUCV @@ -472,7 +424,7 @@ out_smsg: #endif unregister_sysctl_table(cmm_sysctl_header); out_sysctl: - del_timer_sync(&cmm_timer); + timer_delete_sync(&cmm_timer); return rc; } module_init(cmm_init); @@ -483,13 +435,13 @@ static void __exit cmm_exit(void) #ifdef CONFIG_CMM_IUCV smsg_unregister_callback(SMSG_PREFIX, cmm_smsg_target); #endif - unregister_pm_notifier(&cmm_power_notifier); unregister_oom_notifier(&cmm_oom_nb); kthread_stop(cmm_thread_ptr); - del_timer_sync(&cmm_timer); + 
timer_delete_sync(&cmm_timer); cmm_free_pages(cmm_pages, &cmm_pages, &cmm_page_list); cmm_free_pages(cmm_timed_pages, &cmm_timed_pages, &cmm_timed_page_list); } module_exit(cmm_exit); +MODULE_DESCRIPTION("Cooperative memory management interface"); MODULE_LICENSE("GPL"); diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c index 3ad65b04ac15..89badbe72ae7 100644 --- a/arch/s390/mm/dump_pagetables.c +++ b/arch/s390/mm/dump_pagetables.c @@ -1,246 +1,364 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/cpufeature.h> +#include <linux/set_memory.h> +#include <linux/ptdump.h> #include <linux/seq_file.h> #include <linux/debugfs.h> -#include <linux/module.h> +#include <linux/sort.h> #include <linux/mm.h> +#include <linux/kfence.h> +#include <linux/kasan.h> +#include <asm/kasan.h> +#include <asm/abs_lowcore.h> +#include <asm/nospec-branch.h> #include <asm/sections.h> -#include <asm/pgtable.h> +#include <asm/maccess.h> static unsigned long max_addr; struct addr_marker { + int is_start; unsigned long start_address; + unsigned long size; const char *name; }; -enum address_markers_idx { - IDENTITY_NR = 0, - KERNEL_START_NR, - KERNEL_END_NR, - VMEMMAP_NR, - VMALLOC_NR, -#ifdef CONFIG_64BIT - MODULES_NR, -#endif -}; - -static struct addr_marker address_markers[] = { - [IDENTITY_NR] = {0, "Identity Mapping"}, - [KERNEL_START_NR] = {(unsigned long)&_stext, "Kernel Image Start"}, - [KERNEL_END_NR] = {(unsigned long)&_end, "Kernel Image End"}, - [VMEMMAP_NR] = {0, "vmemmap Area"}, - [VMALLOC_NR] = {0, "vmalloc Area"}, -#ifdef CONFIG_64BIT - [MODULES_NR] = {0, "Modules Area"}, -#endif - { -1, NULL } -}; +static struct addr_marker *markers; +static unsigned int markers_cnt; struct pg_state { + struct ptdump_state ptdump; + struct seq_file *seq; int level; unsigned int current_prot; + bool check_wx; + unsigned long wx_pages; unsigned long start_address; - unsigned long current_address; const struct addr_marker *marker; }; +#define pt_dump_seq_printf(m, fmt, args...) \ +({ \ + struct seq_file *__m = (m); \ + \ + if (__m) \ + seq_printf(__m, fmt, ##args); \ +}) + +#define pt_dump_seq_puts(m, fmt) \ +({ \ + struct seq_file *__m = (m); \ + \ + if (__m) \ + seq_puts(__m, fmt); \ +}) + static void print_prot(struct seq_file *m, unsigned int pr, int level) { static const char * const level_name[] = { "ASCE", "PGD", "PUD", "PMD", "PTE" }; - seq_printf(m, "%s ", level_name[level]); + pt_dump_seq_printf(m, "%s ", level_name[level]); if (pr & _PAGE_INVALID) { - seq_printf(m, "I\n"); + pt_dump_seq_printf(m, "I\n"); return; } - seq_printf(m, "%s", pr & _PAGE_RO ? "RO " : "RW "); - seq_printf(m, "%s", pr & _PAGE_CO ? "CO " : " "); - seq_putc(m, '\n'); + pt_dump_seq_puts(m, (pr & _PAGE_PROTECT) ? "RO " : "RW "); + pt_dump_seq_puts(m, (pr & _PAGE_NOEXEC) ? "NX\n" : "X\n"); } -static void note_page(struct seq_file *m, struct pg_state *st, - unsigned int new_prot, int level) +static void note_prot_wx(struct pg_state *st, unsigned long addr) +{ + if (!st->check_wx) + return; + if (st->current_prot & _PAGE_INVALID) + return; + if (st->current_prot & _PAGE_PROTECT) + return; + if (st->current_prot & _PAGE_NOEXEC) + return; + /* + * The first lowcore page is W+X if spectre mitigations are using + * trampolines or the BEAR enhancements facility is not installed, + * in which case we have two lpswe instructions in lowcore that need + * to be executable. 
+ */ + if (addr == PAGE_SIZE && (nospec_uses_trampoline() || !cpu_has_bear())) + return; + WARN_ONCE(IS_ENABLED(CONFIG_DEBUG_WX), + "s390/mm: Found insecure W+X mapping at address %pS\n", + (void *)st->start_address); + st->wx_pages += (addr - st->start_address) / PAGE_SIZE; +} + +static void note_page_update_state(struct pg_state *st, unsigned long addr, unsigned int prot, int level) +{ + struct seq_file *m = st->seq; + + while (addr >= st->marker[1].start_address) { + st->marker++; + pt_dump_seq_printf(m, "---[ %s %s ]---\n", st->marker->name, + st->marker->is_start ? "Start" : "End"); + } + st->start_address = addr; + st->current_prot = prot; + st->level = level; +} + +static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, u64 val) { - static const char units[] = "KMGTPE"; int width = sizeof(unsigned long) * 2; + static const char units[] = "KMGTPE"; const char *unit = units; - unsigned int prot, cur; unsigned long delta; + struct pg_state *st; + struct seq_file *m; + unsigned int prot; - /* - * If we have a "break" in the series, we need to flush the state - * that we have now. "break" is either changing perms, levels or - * address space marker. - */ - prot = new_prot; - cur = st->current_prot; - - if (!st->level) { - /* First entry */ - st->current_prot = new_prot; - st->level = level; - st->marker = address_markers; - seq_printf(m, "---[ %s ]---\n", st->marker->name); - } else if (prot != cur || level != st->level || - st->current_address >= st->marker[1].start_address) { - /* Print the actual finished series */ - seq_printf(m, "0x%0*lx-0x%0*lx", - width, st->start_address, - width, st->current_address); - delta = (st->current_address - st->start_address) >> 10; + st = container_of(pt_st, struct pg_state, ptdump); + m = st->seq; + prot = val & (_PAGE_PROTECT | _PAGE_NOEXEC); + if (level == 4 && (val & _PAGE_INVALID)) + prot = _PAGE_INVALID; + /* For pmd_none() & friends val gets passed as zero. */ + if (level != 4 && !val) + prot = _PAGE_INVALID; + /* Final flush from generic code. */ + if (level == -1) + addr = max_addr; + if (st->level == -1) { + pt_dump_seq_puts(m, "---[ Kernel Virtual Address Space ]---\n"); + note_page_update_state(st, addr, prot, level); + } else if (prot != st->current_prot || level != st->level || + addr >= st->marker[1].start_address) { + note_prot_wx(st, addr); + pt_dump_seq_printf(m, "0x%0*lx-0x%0*lx ", + width, st->start_address, + width, addr); + delta = (addr - st->start_address) >> 10; while (!(delta & 0x3ff) && unit[1]) { delta >>= 10; unit++; } - seq_printf(m, "%9lu%c ", delta, *unit); + pt_dump_seq_printf(m, "%9lu%c ", delta, *unit); print_prot(m, st->current_prot, st->level); - if (st->current_address >= st->marker[1].start_address) { - st->marker++; - seq_printf(m, "---[ %s ]---\n", st->marker->name); - } - st->start_address = st->current_address; - st->current_prot = new_prot; - st->level = level; + note_page_update_state(st, addr, prot, level); } } -/* - * The actual page table walker functions. In order to keep the implementation - * of print_prot() short, we only check and pass _PAGE_INVALID and _PAGE_RO - * flags to note_page() if a region, segment or page table entry is invalid or - * read-only. - * After all it's just a hint that the current level being walked contains an - * invalid or read-only entry. 
- */ -static void walk_pte_level(struct seq_file *m, struct pg_state *st, - pmd_t *pmd, unsigned long addr) +static void note_page_pte(struct ptdump_state *pt_st, unsigned long addr, pte_t pte) { - unsigned int prot; - pte_t *pte; - int i; - - for (i = 0; i < PTRS_PER_PTE && addr < max_addr; i++) { - st->current_address = addr; - pte = pte_offset_kernel(pmd, addr); - prot = pte_val(*pte) & (_PAGE_RO | _PAGE_INVALID); - note_page(m, st, prot, 4); - addr += PAGE_SIZE; - } + note_page(pt_st, addr, 4, pte_val(pte)); } -#ifdef CONFIG_64BIT -#define _PMD_PROT_MASK (_SEGMENT_ENTRY_RO | _SEGMENT_ENTRY_CO) -#else -#define _PMD_PROT_MASK 0 -#endif +static void note_page_pmd(struct ptdump_state *pt_st, unsigned long addr, pmd_t pmd) +{ + note_page(pt_st, addr, 3, pmd_val(pmd)); +} -static void walk_pmd_level(struct seq_file *m, struct pg_state *st, - pud_t *pud, unsigned long addr) +static void note_page_pud(struct ptdump_state *pt_st, unsigned long addr, pud_t pud) { - unsigned int prot; - pmd_t *pmd; - int i; - - for (i = 0; i < PTRS_PER_PMD && addr < max_addr; i++) { - st->current_address = addr; - pmd = pmd_offset(pud, addr); - if (!pmd_none(*pmd)) { - if (pmd_large(*pmd)) { - prot = pmd_val(*pmd) & _PMD_PROT_MASK; - note_page(m, st, prot, 3); - } else - walk_pte_level(m, st, pmd, addr); - } else - note_page(m, st, _PAGE_INVALID, 3); - addr += PMD_SIZE; - } + note_page(pt_st, addr, 2, pud_val(pud)); } -#ifdef CONFIG_64BIT -#define _PUD_PROT_MASK (_REGION3_ENTRY_RO | _REGION3_ENTRY_CO) -#else -#define _PUD_PROT_MASK 0 -#endif +static void note_page_p4d(struct ptdump_state *pt_st, unsigned long addr, p4d_t p4d) +{ + note_page(pt_st, addr, 1, p4d_val(p4d)); +} -static void walk_pud_level(struct seq_file *m, struct pg_state *st, - pgd_t *pgd, unsigned long addr) +static void note_page_pgd(struct ptdump_state *pt_st, unsigned long addr, pgd_t pgd) { - unsigned int prot; - pud_t *pud; - int i; - - for (i = 0; i < PTRS_PER_PUD && addr < max_addr; i++) { - st->current_address = addr; - pud = pud_offset(pgd, addr); - if (!pud_none(*pud)) - if (pud_large(*pud)) { - prot = pud_val(*pud) & _PUD_PROT_MASK; - note_page(m, st, prot, 2); - } else - walk_pmd_level(m, st, pud, addr); - else - note_page(m, st, _PAGE_INVALID, 2); - addr += PUD_SIZE; - } + note_page(pt_st, addr, 0, pgd_val(pgd)); +} + +static void note_page_flush(struct ptdump_state *pt_st) +{ + pte_t pte_zero = {0}; + + note_page(pt_st, 0, -1, pte_val(pte_zero)); } -static void walk_pgd_level(struct seq_file *m) +bool ptdump_check_wx(void) { - unsigned long addr = 0; - struct pg_state st; - pgd_t *pgd; - int i; - - memset(&st, 0, sizeof(st)); - for (i = 0; i < PTRS_PER_PGD && addr < max_addr; i++) { - st.current_address = addr; - pgd = pgd_offset_k(addr); - if (!pgd_none(*pgd)) - walk_pud_level(m, &st, pgd, addr); - else - note_page(m, &st, _PAGE_INVALID, 1); - addr += PGDIR_SIZE; + struct pg_state st = { + .ptdump = { + .note_page_pte = note_page_pte, + .note_page_pmd = note_page_pmd, + .note_page_pud = note_page_pud, + .note_page_p4d = note_page_p4d, + .note_page_pgd = note_page_pgd, + .note_page_flush = note_page_flush, + .range = (struct ptdump_range[]) { + {.start = 0, .end = max_addr}, + {.start = 0, .end = 0}, + } + }, + .seq = NULL, + .level = -1, + .current_prot = 0, + .check_wx = true, + .wx_pages = 0, + .start_address = 0, + .marker = (struct addr_marker[]) { + { .start_address = 0, .name = NULL}, + { .start_address = -1, .name = NULL}, + }, + }; + + if (!cpu_has_nx()) + return true; + ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); + if 
(st.wx_pages) { + pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found\n", st.wx_pages); + + return false; + } else { + pr_info("Checked W+X mappings: passed, no %sW+X pages found\n", + (nospec_uses_trampoline() || !cpu_has_bear()) ? + "unexpected " : ""); + + return true; } - /* Flush out the last page */ - st.current_address = max_addr; - note_page(m, &st, 0, 0); } +#ifdef CONFIG_PTDUMP_DEBUGFS static int ptdump_show(struct seq_file *m, void *v) { - walk_pgd_level(m); + struct pg_state st = { + .ptdump = { + .note_page_pte = note_page_pte, + .note_page_pmd = note_page_pmd, + .note_page_pud = note_page_pud, + .note_page_p4d = note_page_p4d, + .note_page_pgd = note_page_pgd, + .note_page_flush = note_page_flush, + .range = (struct ptdump_range[]) { + {.start = 0, .end = max_addr}, + {.start = 0, .end = 0}, + } + }, + .seq = m, + .level = -1, + .current_prot = 0, + .check_wx = false, + .wx_pages = 0, + .start_address = 0, + .marker = markers, + }; + + mutex_lock(&cpa_mutex); + ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); + mutex_unlock(&cpa_mutex); return 0; } +DEFINE_SHOW_ATTRIBUTE(ptdump); +#endif /* CONFIG_PTDUMP_DEBUGFS */ -static int ptdump_open(struct inode *inode, struct file *filp) +static int ptdump_cmp(const void *a, const void *b) { - return single_open(filp, ptdump_show, NULL); + const struct addr_marker *ama = a; + const struct addr_marker *amb = b; + + if (ama->start_address > amb->start_address) + return 1; + if (ama->start_address < amb->start_address) + return -1; + /* + * If the start addresses of two markers are identical sort markers in an + * order that considers areas contained within other areas correctly. + */ + if (ama->is_start && amb->is_start) { + if (ama->size > amb->size) + return -1; + if (ama->size < amb->size) + return 1; + return 0; + } + if (!ama->is_start && !amb->is_start) { + if (ama->size > amb->size) + return 1; + if (ama->size < amb->size) + return -1; + return 0; + } + if (ama->is_start) + return 1; + if (amb->is_start) + return -1; + return 0; } -static const struct file_operations ptdump_fops = { - .open = ptdump_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +static int add_marker(unsigned long start, unsigned long end, const char *name) +{ + struct addr_marker *new; + size_t newsize; + + newsize = (markers_cnt + 2) * sizeof(*markers); + new = kvrealloc(markers, newsize, GFP_KERNEL); + if (!new) + return -ENOMEM; + markers = new; + markers[markers_cnt].is_start = 1; + markers[markers_cnt].start_address = start; + markers[markers_cnt].size = end - start; + markers[markers_cnt].name = name; + markers_cnt++; + markers[markers_cnt].is_start = 0; + markers[markers_cnt].start_address = end; + markers[markers_cnt].size = end - start; + markers[markers_cnt].name = name; + markers_cnt++; + return 0; +} static int pt_dump_init(void) { +#ifdef CONFIG_KFENCE + unsigned long kfence_start = (unsigned long)__kfence_pool; +#endif + unsigned long lowcore = (unsigned long)get_lowcore(); + int rc; + /* * Figure out the maximum virtual address being accessible with the * kernel ASCE. We need this to keep the page table walker functions * from accessing non-existent entries. 
*/ -#ifdef CONFIG_32BIT - max_addr = 1UL << 31; -#else - max_addr = (S390_lowcore.kernel_asce & _REGION_ENTRY_TYPE_MASK) >> 2; + max_addr = (get_lowcore()->kernel_asce.val & _REGION_ENTRY_TYPE_MASK) >> 2; max_addr = 1UL << (max_addr * 11 + 31); - address_markers[MODULES_NR].start_address = MODULES_VADDR; + /* start + end markers - must be added first */ + rc = add_marker(0, -1UL, NULL); + rc |= add_marker((unsigned long)_stext, (unsigned long)_end, "Kernel Image"); + rc |= add_marker(lowcore, lowcore + sizeof(struct lowcore), "Lowcore"); + rc |= add_marker(__identity_base, __identity_base + ident_map_size, "Identity Mapping"); + rc |= add_marker((unsigned long)__samode31, (unsigned long)__eamode31, "Amode31 Area"); + rc |= add_marker(MODULES_VADDR, MODULES_END, "Modules Area"); + rc |= add_marker(__abs_lowcore, __abs_lowcore + ABS_LOWCORE_MAP_SIZE, "Lowcore Area"); + rc |= add_marker(__memcpy_real_area, __memcpy_real_area + MEMCPY_REAL_SIZE, "Real Memory Copy Area"); + rc |= add_marker((unsigned long)vmemmap, (unsigned long)vmemmap + vmemmap_size, "vmemmap Area"); + rc |= add_marker(VMALLOC_START, VMALLOC_END, "vmalloc Area"); +#ifdef CONFIG_KFENCE + rc |= add_marker(kfence_start, kfence_start + KFENCE_POOL_SIZE, "KFence Pool"); +#endif +#ifdef CONFIG_KMSAN + rc |= add_marker(KMSAN_VMALLOC_SHADOW_START, KMSAN_VMALLOC_SHADOW_END, "Kmsan vmalloc Shadow"); + rc |= add_marker(KMSAN_VMALLOC_ORIGIN_START, KMSAN_VMALLOC_ORIGIN_END, "Kmsan vmalloc Origins"); + rc |= add_marker(KMSAN_MODULES_SHADOW_START, KMSAN_MODULES_SHADOW_END, "Kmsan Modules Shadow"); + rc |= add_marker(KMSAN_MODULES_ORIGIN_START, KMSAN_MODULES_ORIGIN_END, "Kmsan Modules Origins"); +#endif +#ifdef CONFIG_KASAN + rc |= add_marker(KASAN_SHADOW_START, KASAN_SHADOW_END, "Kasan Shadow"); #endif - address_markers[VMEMMAP_NR].start_address = (unsigned long) vmemmap; - address_markers[VMALLOC_NR].start_address = VMALLOC_START; + if (rc) + goto error; + sort(&markers[1], markers_cnt - 1, sizeof(*markers), ptdump_cmp, NULL); +#ifdef CONFIG_PTDUMP_DEBUGFS debugfs_create_file("kernel_page_tables", 0400, NULL, NULL, &ptdump_fops); +#endif /* CONFIG_PTDUMP_DEBUGFS */ return 0; +error: + kvfree(markers); + return -ENOMEM; } device_initcall(pt_dump_init); diff --git a/arch/s390/mm/extable.c b/arch/s390/mm/extable.c index 4d1ee88864e8..7498e858c401 100644 --- a/arch/s390/mm/extable.c +++ b/arch/s390/mm/extable.c @@ -1,81 +1,147 @@ -#include <linux/module.h> -#include <linux/sort.h> -#include <asm/uaccess.h> - -/* - * Search one exception table for an entry corresponding to the - * given instruction address, and return the address of the entry, - * or NULL if none is found. - * We use a binary search, and thus we assume that the table is - * already sorted. 
- */ -const struct exception_table_entry * -search_extable(const struct exception_table_entry *first, - const struct exception_table_entry *last, - unsigned long value) +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/bitfield.h> +#include <linux/extable.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/panic.h> +#include <asm/asm-extable.h> +#include <asm/extable.h> +#include <asm/fpu.h> + +const struct exception_table_entry *s390_search_extables(unsigned long addr) { - const struct exception_table_entry *mid; - unsigned long addr; - - while (first <= last) { - mid = ((last - first) >> 1) + first; - addr = extable_insn(mid); - if (addr < value) - first = mid + 1; - else if (addr > value) - last = mid - 1; - else - return mid; - } - return NULL; + const struct exception_table_entry *fixup; + size_t num; + + fixup = search_exception_tables(addr); + if (fixup) + return fixup; + num = __stop_amode31_ex_table - __start_amode31_ex_table; + return search_extable(__start_amode31_ex_table, num, addr); } -/* - * The exception table needs to be sorted so that the binary - * search that we use to find entries in it works properly. - * This is used both for the kernel exception table and for - * the exception tables of modules that get loaded. - * - */ -static int cmp_ex(const void *a, const void *b) +static bool ex_handler_fixup(const struct exception_table_entry *ex, struct pt_regs *regs) { - const struct exception_table_entry *x = a, *y = b; + regs->psw.addr = extable_fixup(ex); + return true; +} - /* This compare is only valid after normalization. */ - return x->insn - y->insn; +static bool ex_handler_ua_fault(const struct exception_table_entry *ex, struct pt_regs *regs) +{ + unsigned int reg_err = FIELD_GET(EX_DATA_REG_ERR, ex->data); + + regs->gprs[reg_err] = -EFAULT; + regs->psw.addr = extable_fixup(ex); + return true; } -void sort_extable(struct exception_table_entry *start, - struct exception_table_entry *finish) +static bool ex_handler_ua_load_reg(const struct exception_table_entry *ex, + bool pair, struct pt_regs *regs) { - struct exception_table_entry *p; - int i; - - /* Normalize entries to being relative to the start of the section */ - for (p = start, i = 0; p < finish; p++, i += 8) - p->insn += i; - sort(start, finish - start, sizeof(*start), cmp_ex, NULL); - /* Denormalize all entries */ - for (p = start, i = 0; p < finish; p++, i += 8) - p->insn -= i; + unsigned int reg_zero = FIELD_GET(EX_DATA_REG_ADDR, ex->data); + unsigned int reg_err = FIELD_GET(EX_DATA_REG_ERR, ex->data); + + regs->gprs[reg_err] = -EFAULT; + regs->gprs[reg_zero] = 0; + if (pair) + regs->gprs[reg_zero + 1] = 0; + regs->psw.addr = extable_fixup(ex); + return true; } -#ifdef CONFIG_MODULES -/* - * If the exception table is sorted, any referring to the module init - * will be at the beginning or the end. 
- */ -void trim_init_extable(struct module *m) +static bool ex_handler_zeropad(const struct exception_table_entry *ex, struct pt_regs *regs) { - /* Trim the beginning */ - while (m->num_exentries && - within_module_init(extable_insn(&m->extable[0]), m)) { - m->extable++; - m->num_exentries--; + unsigned int reg_addr = FIELD_GET(EX_DATA_REG_ADDR, ex->data); + unsigned int reg_data = FIELD_GET(EX_DATA_REG_ERR, ex->data); + unsigned long data, addr, offset; + + addr = regs->gprs[reg_addr]; + offset = addr & (sizeof(unsigned long) - 1); + addr &= ~(sizeof(unsigned long) - 1); + data = *(unsigned long *)addr; + data <<= BITS_PER_BYTE * offset; + regs->gprs[reg_data] = data; + regs->psw.addr = extable_fixup(ex); + return true; +} + +static bool ex_handler_fpc(const struct exception_table_entry *ex, struct pt_regs *regs) +{ + fpu_sfpc(0); + regs->psw.addr = extable_fixup(ex); + return true; +} + +struct insn_ssf { + u64 opc1 : 8; + u64 r3 : 4; + u64 opc2 : 4; + u64 b1 : 4; + u64 d1 : 12; + u64 b2 : 4; + u64 d2 : 12; +} __packed; + +static bool ex_handler_ua_mvcos(const struct exception_table_entry *ex, + bool from, struct pt_regs *regs) +{ + unsigned long uaddr, remainder; + struct insn_ssf *insn; + + /* + * If the faulting user space access crossed a page boundary retry by + * limiting the access to the first page (adjust length accordingly). + * Then the mvcos instruction will either complete with condition code + * zero, or generate another fault where the user space access did not + * cross a page boundary. + * If the faulting user space access did not cross a page boundary set + * length to zero and retry. In this case no user space access will + * happen, and the mvcos instruction will complete with condition code + * zero. + * In both cases the instruction will complete with condition code + * zero (copying finished), and the register which contains the + * length, indicates the number of bytes copied. 
+ */ + regs->psw.addr = extable_fixup(ex); + insn = (struct insn_ssf *)regs->psw.addr; + if (from) + uaddr = regs->gprs[insn->b2] + insn->d2; + else + uaddr = regs->gprs[insn->b1] + insn->d1; + remainder = PAGE_SIZE - (uaddr & (PAGE_SIZE - 1)); + if (regs->gprs[insn->r3] <= remainder) + remainder = 0; + regs->gprs[insn->r3] = remainder; + return true; +} + +bool fixup_exception(struct pt_regs *regs) +{ + const struct exception_table_entry *ex; + + ex = s390_search_extables(instruction_pointer(regs)); + if (!ex) + return false; + switch (ex->type) { + case EX_TYPE_FIXUP: + return ex_handler_fixup(ex, regs); + case EX_TYPE_BPF: + return ex_handler_bpf(ex, regs); + case EX_TYPE_UA_FAULT: + return ex_handler_ua_fault(ex, regs); + case EX_TYPE_UA_LOAD_REG: + return ex_handler_ua_load_reg(ex, false, regs); + case EX_TYPE_UA_LOAD_REGPAIR: + return ex_handler_ua_load_reg(ex, true, regs); + case EX_TYPE_ZEROPAD: + return ex_handler_zeropad(ex, regs); + case EX_TYPE_FPC: + return ex_handler_fpc(ex, regs); + case EX_TYPE_UA_MVCOS_TO: + return ex_handler_ua_mvcos(ex, false, regs); + case EX_TYPE_UA_MVCOS_FROM: + return ex_handler_ua_mvcos(ex, true, regs); } - /* Trim the end */ - while (m->num_exentries && - within_module_init(extable_insn(&m->extable[m->num_exentries-1]), m)) - m->num_exentries--; + panic("invalid exception table entry"); } -#endif /* CONFIG_MODULES */ diff --git a/arch/s390/mm/extmem.c b/arch/s390/mm/extmem.c index 519bba716cc3..6cc33c705de2 100644 --- a/arch/s390/mm/extmem.c +++ b/arch/s390/mm/extmem.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Author(s)......: Carsten Otte <cotte@de.ibm.com> * Rob M van der Heij <rvdheij@nl.ibm.com> @@ -6,32 +7,30 @@ * Copyright IBM Corp. 2002, 2004 */ -#define KMSG_COMPONENT "extmem" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "extmem: " fmt #include <linux/kernel.h> #include <linux/string.h> #include <linux/spinlock.h> #include <linux/list.h> #include <linux/slab.h> -#include <linux/module.h> -#include <linux/bootmem.h> +#include <linux/export.h> +#include <linux/memblock.h> #include <linux/ctype.h> #include <linux/ioport.h> +#include <linux/refcount.h> +#include <linux/pgtable.h> +#include <asm/machine.h> +#include <asm/diag.h> #include <asm/page.h> -#include <asm/pgtable.h> #include <asm/ebcdic.h> #include <asm/errno.h> #include <asm/extmem.h> #include <asm/cpcmd.h> #include <asm/setup.h> +#include <asm/asm.h> -#define DCSS_LOADSHR 0x00 -#define DCSS_LOADNSR 0x04 #define DCSS_PURGESEG 0x08 -#define DCSS_FINDSEG 0x0c -#define DCSS_LOADNOLY 0x10 -#define DCSS_SEGEXT 0x18 #define DCSS_LOADSHRX 0x20 #define DCSS_LOADNSRX 0x24 #define DCSS_FINDSEGX 0x2c @@ -51,22 +50,6 @@ struct qout64 { struct qrange range[6]; }; -#ifdef CONFIG_64BIT -struct qrange_old { - unsigned int start; /* last byte type */ - unsigned int end; /* last byte reserved */ -}; - -/* output area format for the Diag x'64' old subcode x'18' */ -struct qout64_old { - int segstart; - int segend; - int segcnt; - int segrcnt; - struct qrange_old range[6]; -}; -#endif - struct qin64 { char qopcode; char rsrv1[3]; @@ -80,10 +63,10 @@ struct qin64 { struct dcss_segment { struct list_head list; char dcss_name[8]; - char res_name[15]; + char res_name[16]; unsigned long start_addr; unsigned long end; - atomic_t ref_count; + refcount_t ref_count; int do_nonshared; unsigned int vm_segtype; struct qrange range[6]; @@ -95,55 +78,10 @@ static DEFINE_MUTEX(dcss_lock); static LIST_HEAD(dcss_list); static char *segtype_string[] = { "SW", "EW", "SR", "ER", "SN", 
"EN", "SC", "EW/EN-MIXED" }; -static int loadshr_scode, loadnsr_scode, findseg_scode; -static int segext_scode, purgeseg_scode; -static int scode_set; - -/* set correct Diag x'64' subcodes. */ -static int -dcss_set_subcodes(void) -{ -#ifdef CONFIG_64BIT - char *name = kmalloc(8 * sizeof(char), GFP_KERNEL | GFP_DMA); - unsigned long rx, ry; - int rc; - - if (name == NULL) - return -ENOMEM; - - rx = (unsigned long) name; - ry = DCSS_FINDSEGX; - - strcpy(name, "dummy"); - asm volatile( - " diag %0,%1,0x64\n" - "0: ipm %2\n" - " srl %2,28\n" - " j 2f\n" - "1: la %2,3\n" - "2:\n" - EX_TABLE(0b, 1b) - : "+d" (rx), "+d" (ry), "=d" (rc) : : "cc"); - - kfree(name); - /* Diag x'64' new subcodes are supported, set to new subcodes */ - if (rc != 3) { - loadshr_scode = DCSS_LOADSHRX; - loadnsr_scode = DCSS_LOADNSRX; - purgeseg_scode = DCSS_PURGESEG; - findseg_scode = DCSS_FINDSEGX; - segext_scode = DCSS_SEGEXTX; - return 0; - } -#endif - /* Diag x'64' new subcodes are not supported, set to old subcodes */ - loadshr_scode = DCSS_LOADNOLY; - loadnsr_scode = DCSS_LOADNSR; - purgeseg_scode = DCSS_PURGESEG; - findseg_scode = DCSS_FINDSEG; - segext_scode = DCSS_SEGEXT; - return 0; -} +static int loadshr_scode = DCSS_LOADSHRX; +static int loadnsr_scode = DCSS_LOADNSRX; +static int purgeseg_scode = DCSS_PURGESEG; +static int segext_scode = DCSS_SEGEXTX; /* * Create the 8 bytes, ebcdic VM segment name from @@ -158,7 +96,7 @@ dcss_mkname(char *name, char *dcss_name) if (name[i] == '\0') break; dcss_name[i] = toupper(name[i]); - }; + } for (; i < 8; i++) dcss_name[i] = ' '; ASCEBC(dcss_name, 8); @@ -197,44 +135,21 @@ dcss_diag(int *func, void *parameter, unsigned long *ret1, unsigned long *ret2) { unsigned long rx, ry; - int rc; + int cc; - if (scode_set == 0) { - rc = dcss_set_subcodes(); - if (rc < 0) - return rc; - scode_set = 1; - } - rx = (unsigned long) parameter; + rx = virt_to_phys(parameter); ry = (unsigned long) *func; -#ifdef CONFIG_64BIT - /* 64-bit Diag x'64' new subcode, keep in 64-bit addressing mode */ - if (*func > DCSS_SEGEXT) - asm volatile( - " diag %0,%1,0x64\n" - " ipm %2\n" - " srl %2,28\n" - : "+d" (rx), "+d" (ry), "=d" (rc) : : "cc"); - /* 31-bit Diag x'64' old subcode, switch to 31-bit addressing mode */ - else - asm volatile( - " sam31\n" - " diag %0,%1,0x64\n" - " sam64\n" - " ipm %2\n" - " srl %2,28\n" - : "+d" (rx), "+d" (ry), "=d" (rc) : : "cc"); -#else + diag_stat_inc(DIAG_STAT_X064); asm volatile( - " diag %0,%1,0x64\n" - " ipm %2\n" - " srl %2,28\n" - : "+d" (rx), "+d" (ry), "=d" (rc) : : "cc"); -#endif + " diag %[rx],%[ry],0x64\n" + CC_IPM(cc) + : CC_OUT(cc, cc), [rx] "+d" (rx), [ry] "+d" (ry) + : + : CC_CLOBBER); *ret1 = rx; *ret2 = ry; - return rc; + return CC_TRANSFORM(cc); } static inline int @@ -265,7 +180,7 @@ query_segment_type (struct dcss_segment *seg) /* initialize diag input parameters */ qin->qopcode = DCSS_FINDSEGA; - qin->qoutptr = (unsigned long) qout; + qin->qoutptr = virt_to_phys(qout); qin->qoutlen = sizeof(struct qout64); memcpy (qin->qname, seg->dcss_name, 8); @@ -276,38 +191,11 @@ query_segment_type (struct dcss_segment *seg) goto out_free; } if (diag_cc > 1) { - pr_warning("Querying a DCSS type failed with rc=%ld\n", vmrc); + pr_warn("Querying a DCSS type failed with rc=%ld\n", vmrc); rc = dcss_diag_translate_rc (vmrc); goto out_free; } -#ifdef CONFIG_64BIT - /* Only old format of output area of Diagnose x'64' is supported, - copy data for the new format. 
*/ - if (segext_scode == DCSS_SEGEXT) { - struct qout64_old *qout_old; - qout_old = kzalloc(sizeof(*qout_old), GFP_KERNEL | GFP_DMA); - if (qout_old == NULL) { - rc = -ENOMEM; - goto out_free; - } - memcpy(qout_old, qout, sizeof(struct qout64_old)); - qout->segstart = (unsigned long) qout_old->segstart; - qout->segend = (unsigned long) qout_old->segend; - qout->segcnt = qout_old->segcnt; - qout->segrcnt = qout_old->segrcnt; - - if (qout->segcnt > 6) - qout->segrcnt = 6; - for (i = 0; i < qout->segrcnt; i++) { - qout->range[i].start = - (unsigned long) qout_old->range[i].start; - qout->range[i].end = - (unsigned long) qout_old->range[i].end; - } - kfree(qout_old); - } -#endif if (qout->segcnt > 6) { rc = -EOPNOTSUPP; goto out_free; @@ -367,7 +255,7 @@ segment_type (char* name) int rc; struct dcss_segment seg; - if (!MACHINE_IS_VM) + if (!machine_is_vm()) return -ENOSYS; dcss_mkname(name, seg.dcss_name); @@ -403,15 +291,17 @@ segment_overlaps_others (struct dcss_segment *seg) /* * real segment loading function, called from segment_load + * Must return either an error code < 0, or the segment type code >= 0 */ static int __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long *end) { unsigned long start_addr, end_addr, dummy; struct dcss_segment *seg; - int rc, diag_cc; + int rc, diag_cc, segtype; start_addr = end_addr = 0; + segtype = -1; seg = kmalloc(sizeof(*seg), GFP_KERNEL | GFP_DMA); if (seg == NULL) { rc = -ENOMEM; @@ -422,22 +312,15 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long if (rc < 0) goto out_free; - if (loadshr_scode == DCSS_LOADSHRX) { - if (segment_overlaps_others(seg)) { - rc = -EBUSY; - goto out_free; - } - } - - rc = vmem_add_mapping(seg->start_addr, seg->end - seg->start_addr + 1); - - if (rc) + if (segment_overlaps_others(seg)) { + rc = -EBUSY; goto out_free; + } seg->res = kzalloc(sizeof(struct resource), GFP_KERNEL); if (seg->res == NULL) { rc = -ENOMEM; - goto out_shared; + goto out_free; } seg->res->flags = IORESOURCE_BUSY | IORESOURCE_MEM; seg->res->start = seg->start_addr; @@ -445,18 +328,23 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long memcpy(&seg->res_name, seg->dcss_name, 8); EBCASC(seg->res_name, 8); seg->res_name[8] = '\0'; - strncat(seg->res_name, " (DCSS)", 7); + strlcat(seg->res_name, " (DCSS)", sizeof(seg->res_name)); seg->res->name = seg->res_name; - rc = seg->vm_segtype; - if (rc == SEG_TYPE_SC || - ((rc == SEG_TYPE_SR || rc == SEG_TYPE_ER) && !do_nonshared)) + segtype = seg->vm_segtype; + if (segtype == SEG_TYPE_SC || + ((segtype == SEG_TYPE_SR || segtype == SEG_TYPE_ER) && !do_nonshared)) seg->res->flags |= IORESOURCE_READONLY; + + /* Check for overlapping resources before adding the mapping. 
*/ if (request_resource(&iomem_resource, seg->res)) { rc = -EBUSY; - kfree(seg->res); - goto out_shared; + goto out_free_resource; } + rc = vmem_add_mapping(seg->start_addr, seg->end - seg->start_addr + 1); + if (rc) + goto out_resource; + if (do_nonshared) diag_cc = dcss_diag(&loadnsr_scode, seg->dcss_name, &start_addr, &end_addr); @@ -467,42 +355,42 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long dcss_diag(&purgeseg_scode, seg->dcss_name, &dummy, &dummy); rc = diag_cc; - goto out_resource; + goto out_mapping; } if (diag_cc > 1) { - pr_warning("Loading DCSS %s failed with rc=%ld\n", name, - end_addr); + pr_warn("Loading DCSS %s failed with rc=%ld\n", name, end_addr); rc = dcss_diag_translate_rc(end_addr); dcss_diag(&purgeseg_scode, seg->dcss_name, &dummy, &dummy); - goto out_resource; + goto out_mapping; } seg->start_addr = start_addr; seg->end = end_addr; seg->do_nonshared = do_nonshared; - atomic_set(&seg->ref_count, 1); + refcount_set(&seg->ref_count, 1); list_add(&seg->list, &dcss_list); *addr = seg->start_addr; *end = seg->end; if (do_nonshared) - pr_info("DCSS %s of range %p to %p and type %s loaded as " + pr_info("DCSS %s of range %px to %px and type %s loaded as " "exclusive-writable\n", name, (void*) seg->start_addr, (void*) seg->end, segtype_string[seg->vm_segtype]); else { - pr_info("DCSS %s of range %p to %p and type %s loaded in " + pr_info("DCSS %s of range %px to %px and type %s loaded in " "shared access mode\n", name, (void*) seg->start_addr, (void*) seg->end, segtype_string[seg->vm_segtype]); } goto out; + out_mapping: + vmem_remove_mapping(seg->start_addr, seg->end - seg->start_addr + 1); out_resource: release_resource(seg->res); + out_free_resource: kfree(seg->res); - out_shared: - vmem_remove_mapping(seg->start_addr, seg->end - seg->start_addr + 1); out_free: kfree(seg); out: - return rc; + return rc < 0 ? 
rc : segtype; } /* @@ -517,8 +405,7 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long * -EIO : could not perform query or load diagnose * -ENOENT : no such segment * -EOPNOTSUPP: multi-part segment cannot be used with linux - * -ENOSPC : segment cannot be used (overlaps with storage) - * -EBUSY : segment can temporarily not be used (overlaps with dcss) + * -EBUSY : segment cannot be used (overlaps with dcss or storage) * -ERANGE : segment cannot be used (exceeds kernel mapping range) * -EPERM : segment is currently loaded with incompatible permissions * -ENOMEM : out of memory @@ -531,7 +418,7 @@ segment_load (char *name, int do_nonshared, unsigned long *addr, struct dcss_segment *seg; int rc; - if (!MACHINE_IS_VM) + if (!machine_is_vm()) return -ENOSYS; mutex_lock(&dcss_lock); @@ -540,7 +427,7 @@ segment_load (char *name, int do_nonshared, unsigned long *addr, rc = __segment_load (name, do_nonshared, addr, end); else { if (do_nonshared == seg->do_nonshared) { - atomic_inc(&seg->ref_count); + refcount_inc(&seg->ref_count); *addr = seg->start_addr; *end = seg->end; rc = seg->vm_segtype; @@ -586,9 +473,8 @@ segment_modify_shared (char *name, int do_nonshared) rc = 0; goto out_unlock; } - if (atomic_read (&seg->ref_count) != 1) { - pr_warning("DCSS %s is in use and cannot be reloaded\n", - name); + if (refcount_read(&seg->ref_count) != 1) { + pr_warn("DCSS %s is in use and cannot be reloaded\n", name); rc = -EAGAIN; goto out_unlock; } @@ -601,8 +487,8 @@ segment_modify_shared (char *name, int do_nonshared) seg->res->flags |= IORESOURCE_READONLY; if (request_resource(&iomem_resource, seg->res)) { - pr_warning("DCSS %s overlaps with used memory resources " - "and cannot be reloaded\n", name); + pr_warn("DCSS %s overlaps with used memory resources and cannot be reloaded\n", + name); rc = -EBUSY; kfree(seg->res); goto out_del_mem; @@ -620,8 +506,8 @@ segment_modify_shared (char *name, int do_nonshared) goto out_del_res; } if (diag_cc > 1) { - pr_warning("Reloading DCSS %s failed with rc=%ld\n", name, - end_addr); + pr_warn("Reloading DCSS %s failed with rc=%ld\n", + name, end_addr); rc = dcss_diag_translate_rc(end_addr); goto out_del_res; } @@ -643,6 +529,14 @@ segment_modify_shared (char *name, int do_nonshared) return rc; } +static void __dcss_diag_purge_on_cpu_0(void *data) +{ + struct dcss_segment *seg = (struct dcss_segment *)data; + unsigned long dummy; + + dcss_diag(&purgeseg_scode, seg->dcss_name, &dummy, &dummy); +} + /* * Decrease the use count of a DCSS segment and remove * it from the address space if nobody is using it @@ -651,10 +545,9 @@ segment_modify_shared (char *name, int do_nonshared) void segment_unload(char *name) { - unsigned long dummy; struct dcss_segment *seg; - if (!MACHINE_IS_VM) + if (!machine_is_vm()) return; mutex_lock(&dcss_lock); @@ -663,13 +556,20 @@ segment_unload(char *name) pr_err("Unloading unknown DCSS %s failed\n", name); goto out_unlock; } - if (atomic_dec_return(&seg->ref_count) != 0) + if (!refcount_dec_and_test(&seg->ref_count)) goto out_unlock; release_resource(seg->res); kfree(seg->res); vmem_remove_mapping(seg->start_addr, seg->end - seg->start_addr + 1); list_del(&seg->list); - dcss_diag(&purgeseg_scode, seg->dcss_name, &dummy, &dummy); + /* + * Workaround for z/VM issue, where calling the DCSS unload diag on + * a non-IPL CPU would cause bogus sclp maximum memory detection on + * next IPL. + * IPL CPU 0 cannot be set offline, so the dcss_diag() call can + * directly be scheduled to that CPU. 
+ */ + smp_call_function_single(0, __dcss_diag_purge_on_cpu_0, seg, 1); kfree(seg); out_unlock: mutex_unlock(&dcss_lock); @@ -686,7 +586,7 @@ segment_save(char *name) char cmd2[80]; int i, response; - if (!MACHINE_IS_VM) + if (!machine_is_vm()) return; mutex_lock(&dcss_lock); @@ -697,14 +597,16 @@ segment_save(char *name) goto out; } - sprintf(cmd1, "DEFSEG %s", name); + snprintf(cmd1, sizeof(cmd1), "DEFSEG %s", name); for (i=0; i<seg->segcnt; i++) { - sprintf(cmd1+strlen(cmd1), " %lX-%lX %s", - seg->range[i].start >> PAGE_SHIFT, - seg->range[i].end >> PAGE_SHIFT, - segtype_string[seg->range[i].start & 0xff]); + size_t len = strlen(cmd1); + + snprintf(cmd1 + len, sizeof(cmd1) - len, " %lX-%lX %s", + seg->range[i].start >> PAGE_SHIFT, + seg->range[i].end >> PAGE_SHIFT, + segtype_string[seg->range[i].start & 0xff]); } - sprintf(cmd2, "SAVESEG %s", name); + snprintf(cmd2, sizeof(cmd2), "SAVESEG %s", name); response = 0; cpcmd(cmd1, NULL, 0, &response); if (response) { @@ -744,10 +646,6 @@ void segment_warning(int rc, char *seg_name) pr_err("DCSS %s has multiple page ranges and cannot be " "loaded or queried\n", seg_name); break; - case -ENOSPC: - pr_err("DCSS %s overlaps with used storage and cannot " - "be loaded\n", seg_name); - break; case -EBUSY: pr_err("%s needs used memory resources and cannot be " "loaded or queried\n", seg_name); @@ -760,10 +658,13 @@ void segment_warning(int rc, char *seg_name) pr_err("There is not enough memory to load or query " "DCSS %s\n", seg_name); break; - case -ERANGE: - pr_err("DCSS %s exceeds the kernel mapping range (%lu) " - "and cannot be loaded\n", seg_name, VMEM_MAX_PHYS); + case -ERANGE: { + struct range mhp_range = arch_get_mappable_range(); + + pr_err("DCSS %s exceeds the kernel mapping range (%llu) " + "and cannot be loaded\n", seg_name, mhp_range.end + 1); break; + } default: break; } diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 047c3e4c59a2..e2e13778c36a 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -1,17 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0 /* * S390 version * Copyright IBM Corp. 
1999 * Author(s): Hartmut Penner (hp@de.ibm.com) - * Ulrich Weigand (uweigand@de.ibm.com) + * Ulrich Weigand (uweigand@de.ibm.com) * * Derived from "arch/i386/mm/fault.c" * Copyright (C) 1995 Linus Torvalds */ #include <linux/kernel_stat.h> +#include <linux/mmu_context.h> +#include <linux/cpufeature.h> #include <linux/perf_event.h> #include <linux/signal.h> #include <linux/sched.h> +#include <linux/sched/debug.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/string.h> @@ -19,241 +23,233 @@ #include <linux/ptrace.h> #include <linux/mman.h> #include <linux/mm.h> -#include <linux/compat.h> #include <linux/smp.h> #include <linux/kdebug.h> #include <linux/init.h> #include <linux/console.h> -#include <linux/module.h> +#include <linux/extable.h> #include <linux/hardirq.h> #include <linux/kprobes.h> #include <linux/uaccess.h> #include <linux/hugetlb.h> +#include <linux/kfence.h> +#include <linux/pagewalk.h> +#include <asm/asm-extable.h> #include <asm/asm-offsets.h> -#include <asm/pgtable.h> +#include <asm/ptrace.h> +#include <asm/fault.h> +#include <asm/diag.h> #include <asm/irq.h> -#include <asm/mmu_context.h> #include <asm/facility.h> +#include <asm/uv.h> #include "../kernel/entry.h" -#ifndef CONFIG_64BIT -#define __FAIL_ADDR_MASK 0x7ffff000 -#define __SUBCODE_MASK 0x0200 -#define __PF_RES_FIELD 0ULL -#else /* CONFIG_64BIT */ -#define __FAIL_ADDR_MASK -4096L -#define __SUBCODE_MASK 0x0600 -#define __PF_RES_FIELD 0x8000000000000000ULL -#endif /* CONFIG_64BIT */ +/* + * Find out which address space caused the exception. + */ +static bool is_kernel_fault(struct pt_regs *regs) +{ + union teid teid = { .val = regs->int_parm_long }; -#define VM_FAULT_BADCONTEXT 0x010000 -#define VM_FAULT_BADMAP 0x020000 -#define VM_FAULT_BADACCESS 0x040000 -#define VM_FAULT_SIGNAL 0x080000 + if (user_mode(regs)) + return false; + if (teid.as == PSW_BITS_AS_SECONDARY) + return false; + return true; +} + +static unsigned long get_fault_address(struct pt_regs *regs) +{ + union teid teid = { .val = regs->int_parm_long }; -static unsigned long store_indication __read_mostly; + return teid.addr * PAGE_SIZE; +} -#ifdef CONFIG_64BIT -static int __init fault_init(void) +static __always_inline bool fault_is_write(struct pt_regs *regs) { + union teid teid = { .val = regs->int_parm_long }; + if (test_facility(75)) - store_indication = 0xc00; - return 0; + return teid.fsi == TEID_FSI_STORE; + return false; } -early_initcall(fault_init); -#endif -static inline int notify_page_fault(struct pt_regs *regs) +static void dump_pagetable(unsigned long asce, unsigned long address) { - int ret = 0; - - /* kprobe_running() needs smp_processor_id() */ - if (kprobes_built_in() && !user_mode(regs)) { - preempt_disable(); - if (kprobe_running() && kprobe_fault_handler(regs, 14)) - ret = 1; - preempt_enable(); + unsigned long entry, *table = __va(asce & _ASCE_ORIGIN); + + pr_alert("AS:%016lx ", asce); + switch (asce & _ASCE_TYPE_MASK) { + case _ASCE_TYPE_REGION1: + table += (address & _REGION1_INDEX) >> _REGION1_SHIFT; + if (get_kernel_nofault(entry, table)) + goto bad; + pr_cont("R1:%016lx ", entry); + if (entry & _REGION_ENTRY_INVALID) + goto out; + table = __va(entry & _REGION_ENTRY_ORIGIN); + fallthrough; + case _ASCE_TYPE_REGION2: + table += (address & _REGION2_INDEX) >> _REGION2_SHIFT; + if (get_kernel_nofault(entry, table)) + goto bad; + pr_cont("R2:%016lx ", entry); + if (entry & _REGION_ENTRY_INVALID) + goto out; + table = __va(entry & _REGION_ENTRY_ORIGIN); + fallthrough; + case _ASCE_TYPE_REGION3: + table += 
(address & _REGION3_INDEX) >> _REGION3_SHIFT; + if (get_kernel_nofault(entry, table)) + goto bad; + pr_cont("R3:%016lx ", entry); + if (entry & (_REGION_ENTRY_INVALID | _REGION3_ENTRY_LARGE)) + goto out; + table = __va(entry & _REGION_ENTRY_ORIGIN); + fallthrough; + case _ASCE_TYPE_SEGMENT: + table += (address & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; + if (get_kernel_nofault(entry, table)) + goto bad; + pr_cont("S:%016lx ", entry); + if (entry & (_SEGMENT_ENTRY_INVALID | _SEGMENT_ENTRY_LARGE)) + goto out; + table = __va(entry & _SEGMENT_ENTRY_ORIGIN); } - return ret; + table += (address & _PAGE_INDEX) >> PAGE_SHIFT; + if (get_kernel_nofault(entry, table)) + goto bad; + pr_cont("P:%016lx ", entry); +out: + pr_cont("\n"); + return; +bad: + pr_cont("BAD\n"); } - -/* - * Unlock any spinlocks which will prevent us from getting the - * message out. - */ -void bust_spinlocks(int yes) +static void dump_fault_info(struct pt_regs *regs) { - if (yes) { - oops_in_progress = 1; + union teid teid = { .val = regs->int_parm_long }; + unsigned long asce; + + pr_alert("Failing address: %016lx TEID: %016lx", + get_fault_address(regs), teid.val); + if (test_facility(131)) + pr_cont(" ESOP-2"); + else if (machine_has_esop()) + pr_cont(" ESOP-1"); + else + pr_cont(" SOP"); + if (test_facility(75)) + pr_cont(" FSI"); + pr_cont("\n"); + pr_alert("Fault in "); + switch (teid.as) { + case PSW_BITS_AS_HOME: + pr_cont("home space "); + break; + case PSW_BITS_AS_SECONDARY: + pr_cont("secondary space "); + break; + case PSW_BITS_AS_ACCREG: + pr_cont("access register "); + break; + case PSW_BITS_AS_PRIMARY: + pr_cont("primary space "); + break; + } + pr_cont("mode while using "); + if (is_kernel_fault(regs)) { + asce = get_lowcore()->kernel_asce.val; + pr_cont("kernel "); } else { - int loglevel_save = console_loglevel; - console_unblank(); - oops_in_progress = 0; - /* - * OK, the message is on the console. Now we call printk() - * without oops_in_progress set so that printk will give klogd - * a poke. Hold onto your hats... - */ - console_loglevel = 15; - printk(" "); - console_loglevel = loglevel_save; + asce = get_lowcore()->user_asce.val; + pr_cont("user "); } + pr_cont("ASCE.\n"); + dump_pagetable(asce, get_fault_address(regs)); } -/* - * Returns the address space associated with the fault. - * Returns 0 for kernel space and 1 for user space. - */ -static inline int user_space_fault(unsigned long trans_exc_code) +int show_unhandled_signals = 1; + +static const struct ctl_table s390_fault_sysctl_table[] = { + { + .procname = "userprocess_debug", + .data = &show_unhandled_signals, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +}; + +static int __init init_s390_fault_sysctls(void) { - /* - * The lowest two bits of the translation exception - * identification indicate which paging table was used. - */ - trans_exc_code &= 3; - if (trans_exc_code == 2) - /* Access via secondary space, set_fs setting decides */ - return current->thread.mm_segment.ar4; - if (s390_user_mode == HOME_SPACE_MODE) - /* User space if the access has been done via home space. */ - return trans_exc_code == 3; - /* - * If the user space is not the home space the kernel runs in home - * space. Access via secondary space has already been covered, - * access via primary space or access register is from user space - * and access via home space is from the kernel. 
- */ - return trans_exc_code != 3; + register_sysctl_init("kernel", s390_fault_sysctl_table); + return 0; } +arch_initcall(init_s390_fault_sysctls); -static inline void report_user_fault(struct pt_regs *regs, long signr) +void report_user_fault(struct pt_regs *regs, long signr, int is_mm_fault) { + static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); + if ((task_pid_nr(current) > 1) && !show_unhandled_signals) return; if (!unhandled_signal(current, signr)) return; - if (!printk_ratelimit()) + if (!__ratelimit(&rs)) return; - printk(KERN_ALERT "User process fault: interruption code 0x%X ", - regs->int_code); - print_vma_addr(KERN_CONT "in ", regs->psw.addr & PSW_ADDR_INSN); - printk(KERN_CONT "\n"); - printk(KERN_ALERT "failing address: %lX\n", - regs->int_parm_long & __FAIL_ADDR_MASK); + pr_alert("User process fault: interruption code %04x ilc:%d ", + regs->int_code & 0xffff, regs->int_code >> 17); + print_vma_addr(KERN_CONT "in ", regs->psw.addr); + pr_cont("\n"); + if (is_mm_fault) + dump_fault_info(regs); show_regs(regs); } -/* - * Send SIGSEGV to task. This is an external routine - * to keep the stack usage of do_page_fault small. - */ -static noinline void do_sigsegv(struct pt_regs *regs, int si_code) +static void do_sigsegv(struct pt_regs *regs, int si_code) { - struct siginfo si; - - report_user_fault(regs, SIGSEGV); - si.si_signo = SIGSEGV; - si.si_code = si_code; - si.si_addr = (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK); - force_sig_info(SIGSEGV, &si, current); + report_user_fault(regs, SIGSEGV, 1); + force_sig_fault(SIGSEGV, si_code, (void __user *)get_fault_address(regs)); } -static noinline void do_no_context(struct pt_regs *regs) +static void handle_fault_error_nolock(struct pt_regs *regs, int si_code) { - const struct exception_table_entry *fixup; unsigned long address; + bool is_write; - /* Are we prepared to handle this kernel fault? */ - fixup = search_exception_tables(regs->psw.addr & PSW_ADDR_INSN); - if (fixup) { - regs->psw.addr = extable_fixup(fixup) | PSW_ADDR_AMODE; + if (user_mode(regs)) { + if (WARN_ON_ONCE(!si_code)) + si_code = SEGV_MAPERR; + return do_sigsegv(regs, si_code); + } + if (fixup_exception(regs)) return; + if (is_kernel_fault(regs)) { + address = get_fault_address(regs); + is_write = fault_is_write(regs); + if (kfence_handle_page_fault(address, is_write, regs)) + return; + pr_alert("Unable to handle kernel pointer dereference in virtual kernel address space\n"); + } else { + pr_alert("Unable to handle kernel paging request in virtual user address space\n"); } - - /* - * Oops. The kernel tried to access some bad page. We'll have to - * terminate things with extreme prejudice. - */ - address = regs->int_parm_long & __FAIL_ADDR_MASK; - if (!user_space_fault(regs->int_parm_long)) - printk(KERN_ALERT "Unable to handle kernel pointer dereference" - " at virtual kernel address %p\n", (void *)address); - else - printk(KERN_ALERT "Unable to handle kernel paging request" - " at virtual user address %p\n", (void *)address); - + dump_fault_info(regs); die(regs, "Oops"); - do_exit(SIGKILL); } -static noinline void do_low_address(struct pt_regs *regs) +static void handle_fault_error(struct pt_regs *regs, int si_code) { - /* Low-address protection hit in kernel mode means - NULL pointer write access in kernel mode. */ - if (regs->psw.mask & PSW_MASK_PSTATE) { - /* Low-address protection hit in user mode 'cannot happen'. 
*/ - die (regs, "Low-address protection"); - do_exit(SIGKILL); - } + struct mm_struct *mm = current->mm; - do_no_context(regs); + mmap_read_unlock(mm); + handle_fault_error_nolock(regs, si_code); } -static noinline void do_sigbus(struct pt_regs *regs) +static void do_sigbus(struct pt_regs *regs) { - struct task_struct *tsk = current; - struct siginfo si; - - /* - * Send a sigbus, regardless of whether we were in kernel - * or user mode. - */ - si.si_signo = SIGBUS; - si.si_errno = 0; - si.si_code = BUS_ADRERR; - si.si_addr = (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK); - force_sig_info(SIGBUS, &si, tsk); -} - -static noinline void do_fault_error(struct pt_regs *regs, int fault) -{ - int si_code; - - switch (fault) { - case VM_FAULT_BADACCESS: - case VM_FAULT_BADMAP: - /* Bad memory access. Check if it is kernel or user space. */ - if (user_mode(regs)) { - /* User mode accesses just cause a SIGSEGV */ - si_code = (fault == VM_FAULT_BADMAP) ? - SEGV_MAPERR : SEGV_ACCERR; - do_sigsegv(regs, si_code); - return; - } - case VM_FAULT_BADCONTEXT: - do_no_context(regs); - break; - case VM_FAULT_SIGNAL: - if (!user_mode(regs)) - do_no_context(regs); - break; - default: /* fault & VM_FAULT_ERROR */ - if (fault & VM_FAULT_OOM) { - if (!user_mode(regs)) - do_no_context(regs); - else - pagefault_out_of_memory(); - } else if (fault & VM_FAULT_SIGBUS) { - /* Kernel mode? Handle exceptions or die */ - if (!user_mode(regs)) - do_no_context(regs); - else - do_sigbus(regs); - } else - BUG(); - break; - } + force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)get_fault_address(regs)); } /* @@ -262,427 +258,216 @@ static noinline void do_fault_error(struct pt_regs *regs, int fault) * routines. * * interruption code (int_code): - * 04 Protection -> Write-Protection (suprression) - * 10 Segment translation -> Not present (nullification) - * 11 Page translation -> Not present (nullification) - * 3b Region third trans. -> Not present (nullification) + * 04 Protection -> Write-Protection (suppression) + * 10 Segment translation -> Not present (nullification) + * 11 Page translation -> Not present (nullification) + * 3b Region third trans. -> Not present (nullification) */ -static inline int do_exception(struct pt_regs *regs, int access) +static void do_exception(struct pt_regs *regs, int access) { - struct task_struct *tsk; - struct mm_struct *mm; struct vm_area_struct *vma; - unsigned long trans_exc_code; unsigned long address; + struct mm_struct *mm; unsigned int flags; - int fault; + vm_fault_t fault; + bool is_write; - tsk = current; /* * The instruction that caused the program check has * been nullified. Don't signal single step via SIGTRAP. */ - clear_tsk_thread_flag(tsk, TIF_PER_TRAP); - - if (notify_page_fault(regs)) - return 0; - - mm = tsk->mm; - trans_exc_code = regs->int_parm_long; - - /* - * Verify that the fault happened in user space, that - * we are not in an interrupt and that there is a - * user context. 
- */ - fault = VM_FAULT_BADCONTEXT; - if (unlikely(!user_space_fault(trans_exc_code) || in_atomic() || !mm)) - goto out; - - address = trans_exc_code & __FAIL_ADDR_MASK; + clear_thread_flag(TIF_PER_TRAP); + if (kprobe_page_fault(regs, 14)) + return; + mm = current->mm; + address = get_fault_address(regs); + is_write = fault_is_write(regs); + if (is_kernel_fault(regs) || faulthandler_disabled() || !mm) + return handle_fault_error_nolock(regs, 0); perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); - flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; - if (access == VM_WRITE || (trans_exc_code & store_indication) == 0x400) + flags = FAULT_FLAG_DEFAULT; + if (user_mode(regs)) + flags |= FAULT_FLAG_USER; + if (is_write) + access = VM_WRITE; + if (access == VM_WRITE) flags |= FAULT_FLAG_WRITE; - down_read(&mm->mmap_sem); - -#ifdef CONFIG_PGSTE - if ((current->flags & PF_VCPU) && S390_lowcore.gmap) { - address = __gmap_fault(address, - (struct gmap *) S390_lowcore.gmap); - if (address == -EFAULT) { - fault = VM_FAULT_BADMAP; - goto out_up; - } - if (address == -ENOMEM) { - fault = VM_FAULT_OOM; - goto out_up; - } + if (!(flags & FAULT_FLAG_USER)) + goto lock_mmap; + vma = lock_vma_under_rcu(mm, address); + if (!vma) + goto lock_mmap; + if (!(vma->vm_flags & access)) { + vma_end_read(vma); + count_vm_vma_lock_event(VMA_LOCK_SUCCESS); + return handle_fault_error_nolock(regs, SEGV_ACCERR); } -#endif - + fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) + vma_end_read(vma); + if (!(fault & VM_FAULT_RETRY)) { + count_vm_vma_lock_event(VMA_LOCK_SUCCESS); + goto done; + } + count_vm_vma_lock_event(VMA_LOCK_RETRY); + if (fault & VM_FAULT_MAJOR) + flags |= FAULT_FLAG_TRIED; + /* Quick path to respond to signals */ + if (fault_signal_pending(fault, regs)) { + if (!user_mode(regs)) + handle_fault_error_nolock(regs, 0); + return; + } +lock_mmap: retry: - fault = VM_FAULT_BADMAP; - vma = find_vma(mm, address); + vma = lock_mm_and_find_vma(mm, address, regs); if (!vma) - goto out_up; - - if (unlikely(vma->vm_start > address)) { - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto out_up; - if (expand_stack(vma, address)) - goto out_up; - } - - /* - * Ok, we have a good vm_area for this memory access, so - * we can handle it.. - */ - fault = VM_FAULT_BADACCESS; + return handle_fault_error_nolock(regs, SEGV_MAPERR); if (unlikely(!(vma->vm_flags & access))) - goto out_up; - - if (is_vm_hugetlb_page(vma)) - address &= HPAGE_MASK; - /* - * If for any reason at all we couldn't handle the fault, - * make sure we exit gracefully rather than endlessly redo - * the fault. - */ - fault = handle_mm_fault(mm, vma, address, flags); - /* No reason to continue if interrupted by SIGKILL. */ - if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) { - fault = VM_FAULT_SIGNAL; - goto out; + return handle_fault_error(regs, SEGV_ACCERR); + fault = handle_mm_fault(vma, address, flags, regs); + if (fault_signal_pending(fault, regs)) { + if (!user_mode(regs)) + handle_fault_error_nolock(regs, 0); + return; } - if (unlikely(fault & VM_FAULT_ERROR)) - goto out_up; - - /* - * Major/minor page fault accounting is only done on the - * initial attempt. If we go through a retry, it is extremely - * likely that the page will be found in page cache at that point. 
-	 */
-	if (flags & FAULT_FLAG_ALLOW_RETRY) {
-		if (fault & VM_FAULT_MAJOR) {
-			tsk->maj_flt++;
-			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
-				      regs, address);
-		} else {
-			tsk->min_flt++;
-			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
-				      regs, address);
-		}
-		if (fault & VM_FAULT_RETRY) {
-			/* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
-			 * of starvation. */
-			flags &= ~FAULT_FLAG_ALLOW_RETRY;
-			flags |= FAULT_FLAG_TRIED;
-			down_read(&mm->mmap_sem);
-			goto retry;
-		}
+	/* The fault is fully completed (including releasing mmap lock) */
+	if (fault & VM_FAULT_COMPLETED)
+		return;
+	if (fault & VM_FAULT_RETRY) {
+		flags |= FAULT_FLAG_TRIED;
+		goto retry;
+	}
+	mmap_read_unlock(mm);
+done:
+	if (!(fault & VM_FAULT_ERROR))
+		return;
+	if (fault & VM_FAULT_OOM) {
+		if (!user_mode(regs))
+			handle_fault_error_nolock(regs, 0);
+		else
+			pagefault_out_of_memory();
+	} else if (fault & VM_FAULT_SIGSEGV) {
+		if (!user_mode(regs))
+			handle_fault_error_nolock(regs, 0);
+		else
+			do_sigsegv(regs, SEGV_MAPERR);
+	} else if (fault & (VM_FAULT_SIGBUS | VM_FAULT_HWPOISON |
+			    VM_FAULT_HWPOISON_LARGE)) {
+		if (!user_mode(regs))
+			handle_fault_error_nolock(regs, 0);
+		else
+			do_sigbus(regs);
+	} else {
+		pr_emerg("Unexpected fault flags: %08x\n", fault);
+		BUG();
 	}
-	fault = 0;
-out_up:
-	up_read(&mm->mmap_sem);
-out:
-	return fault;
 }
 
-void __kprobes do_protection_exception(struct pt_regs *regs)
+void do_protection_exception(struct pt_regs *regs)
 {
-	unsigned long trans_exc_code;
-	int fault;
+	union teid teid = { .val = regs->int_parm_long };
 
-	trans_exc_code = regs->int_parm_long;
 	/*
 	 * Protection exceptions are suppressing, decrement psw address.
 	 * The exception to this rule are aborted transactions, for these
 	 * the PSW already points to the correct location.
 	 */
-	if (!(regs->int_code & 0x200))
+	if (!(regs->int_code & 0x200)) {
 		regs->psw.addr = __rewind_psw(regs->psw, regs->int_code >> 16);
-	/*
-	 * Check for low-address protection. This needs to be treated
-	 * as a special case because the translation exception code
-	 * field is not guaranteed to contain valid data in this case.
-	 */
-	if (unlikely(!(trans_exc_code & 4))) {
-		do_low_address(regs);
-		return;
+		set_pt_regs_flag(regs, PIF_PSW_ADDR_ADJUSTED);
 	}
-	fault = do_exception(regs, VM_WRITE);
-	if (unlikely(fault))
-		do_fault_error(regs, fault);
-}
-
-void __kprobes do_dat_exception(struct pt_regs *regs)
-{
-	int access, fault;
-
-	access = VM_READ | VM_EXEC | VM_WRITE;
-	fault = do_exception(regs, access);
-	if (unlikely(fault))
-		do_fault_error(regs, fault);
-}
-
-#ifdef CONFIG_64BIT
-void __kprobes do_asce_exception(struct pt_regs *regs)
-{
-	struct mm_struct *mm = current->mm;
-	struct vm_area_struct *vma;
-	unsigned long trans_exc_code;
-
 	/*
-	 * The instruction that caused the program check has
-	 * been nullified. Don't signal single step via SIGTRAP.
+	 * If bit 61 of the TEID is not set, the remainder of the
+	 * TEID is unpredictable. Special handling is required.
 	 */
-	clear_tsk_thread_flag(current, TIF_PER_TRAP);
-
-	trans_exc_code = regs->int_parm_long;
-	if (unlikely(!user_space_fault(trans_exc_code) || in_atomic() || !mm))
-		goto no_context;
-
-	down_read(&mm->mmap_sem);
-	vma = find_vma(mm, trans_exc_code & __FAIL_ADDR_MASK);
-	up_read(&mm->mmap_sem);
-
-	if (vma) {
-		update_mm(mm, current);
-		return;
+	if (unlikely(!teid.b61)) {
+		if (user_mode(regs)) {
+			dump_fault_info(regs);
+			die(regs, "Unexpected TEID");
+		}
+		/* Assume low-address protection in kernel mode. 
*/ + return handle_fault_error_nolock(regs, 0); } - - /* User mode accesses just cause a SIGSEGV */ - if (user_mode(regs)) { - do_sigsegv(regs, SEGV_MAPERR); - return; + if (unlikely(cpu_has_nx() && teid.b56)) { + regs->int_parm_long = (teid.addr * PAGE_SIZE) | (regs->psw.addr & PAGE_MASK); + return handle_fault_error_nolock(regs, SEGV_ACCERR); } - -no_context: - do_no_context(regs); -} -#endif - -int __handle_fault(unsigned long uaddr, unsigned long pgm_int_code, int write) -{ - struct pt_regs regs; - int access, fault; - - /* Emulate a uaccess fault from kernel mode. */ - regs.psw.mask = psw_kernel_bits | PSW_MASK_DAT | PSW_MASK_MCHECK; - if (!irqs_disabled()) - regs.psw.mask |= PSW_MASK_IO | PSW_MASK_EXT; - regs.psw.addr = (unsigned long) __builtin_return_address(0); - regs.psw.addr |= PSW_ADDR_AMODE; - regs.int_code = pgm_int_code; - regs.int_parm_long = (uaddr & PAGE_MASK) | 2; - access = write ? VM_WRITE : VM_READ; - fault = do_exception(®s, access); - /* - * Since the fault happened in kernel mode while performing a uaccess - * all we need to do now is emulating a fixup in case "fault" is not - * zero. - * For the calling uaccess functions this results always in -EFAULT. - */ - return fault ? -EFAULT : 0; + do_exception(regs, VM_WRITE); } +NOKPROBE_SYMBOL(do_protection_exception); -#ifdef CONFIG_PFAULT -/* - * 'pfault' pseudo page faults routines. - */ -static int pfault_disable; - -static int __init nopfault(char *str) +void do_dat_exception(struct pt_regs *regs) { - pfault_disable = 1; - return 1; + do_exception(regs, VM_ACCESS_FLAGS); } +NOKPROBE_SYMBOL(do_dat_exception); -__setup("nopfault", nopfault); - -struct pfault_refbk { - u16 refdiagc; - u16 reffcode; - u16 refdwlen; - u16 refversn; - u64 refgaddr; - u64 refselmk; - u64 refcmpmk; - u64 reserved; -} __attribute__ ((packed, aligned(8))); - -int pfault_init(void) -{ - struct pfault_refbk refbk = { - .refdiagc = 0x258, - .reffcode = 0, - .refdwlen = 5, - .refversn = 2, - .refgaddr = __LC_CURRENT_PID, - .refselmk = 1ULL << 48, - .refcmpmk = 1ULL << 48, - .reserved = __PF_RES_FIELD }; - int rc; - - if (pfault_disable) - return -1; - asm volatile( - " diag %1,%0,0x258\n" - "0: j 2f\n" - "1: la %0,8\n" - "2:\n" - EX_TABLE(0b,1b) - : "=d" (rc) : "a" (&refbk), "m" (refbk) : "cc"); - return rc; -} +#if IS_ENABLED(CONFIG_PGSTE) -void pfault_fini(void) +void do_secure_storage_access(struct pt_regs *regs) { - struct pfault_refbk refbk = { - .refdiagc = 0x258, - .reffcode = 1, - .refdwlen = 5, - .refversn = 2, - }; - - if (pfault_disable) - return; - asm volatile( - " diag %0,0,0x258\n" - "0:\n" - EX_TABLE(0b,0b) - : : "a" (&refbk), "m" (refbk) : "cc"); -} - -static DEFINE_SPINLOCK(pfault_lock); -static LIST_HEAD(pfault_list); - -static void pfault_interrupt(struct ext_code ext_code, - unsigned int param32, unsigned long param64) -{ - struct task_struct *tsk; - __u16 subcode; - pid_t pid; + union teid teid = { .val = regs->int_parm_long }; + unsigned long addr = get_fault_address(regs); + struct vm_area_struct *vma; + struct folio_walk fw; + struct mm_struct *mm; + struct folio *folio; + int rc; /* - * Get the external interruption subcode & pfault - * initial/completion signal bit. VM stores this - * in the 'cpu address' field associated with the - * external interrupt. + * Bit 61 indicates if the address is valid, if it is not the + * kernel should be stopped or SIGSEGV should be sent to the + * process. Bit 61 is not reliable without the misc UV feature, + * therefore this needs to be checked too. 
*/ - subcode = ext_code.subcode; - if ((subcode & 0xff00) != __SUBCODE_MASK) - return; - inc_irq_stat(IRQEXT_PFL); - /* Get the token (= pid of the affected task). */ - pid = sizeof(void *) == 4 ? param32 : param64; - rcu_read_lock(); - tsk = find_task_by_pid_ns(pid, &init_pid_ns); - if (tsk) - get_task_struct(tsk); - rcu_read_unlock(); - if (!tsk) - return; - spin_lock(&pfault_lock); - if (subcode & 0x0080) { - /* signal bit is set -> a page has been swapped in by VM */ - if (tsk->thread.pfault_wait == 1) { - /* Initial interrupt was faster than the completion - * interrupt. pfault_wait is valid. Set pfault_wait - * back to zero and wake up the process. This can - * safely be done because the task is still sleeping - * and can't produce new pfaults. */ - tsk->thread.pfault_wait = 0; - list_del(&tsk->thread.list); - wake_up_process(tsk); - put_task_struct(tsk); - } else { - /* Completion interrupt was faster than initial - * interrupt. Set pfault_wait to -1 so the initial - * interrupt doesn't put the task to sleep. - * If the task is not running, ignore the completion - * interrupt since it must be a leftover of a PFAULT - * CANCEL operation which didn't remove all pending - * completion interrupts. */ - if (tsk->state == TASK_RUNNING) - tsk->thread.pfault_wait = -1; - } - } else { - /* signal bit not set -> a real page is missing. */ - if (WARN_ON_ONCE(tsk != current)) - goto out; - if (tsk->thread.pfault_wait == 1) { - /* Already on the list with a reference: put to sleep */ - __set_task_state(tsk, TASK_UNINTERRUPTIBLE); - set_tsk_need_resched(tsk); - } else if (tsk->thread.pfault_wait == -1) { - /* Completion interrupt was faster than the initial - * interrupt (pfault_wait == -1). Set pfault_wait - * back to zero and exit. */ - tsk->thread.pfault_wait = 0; - } else { - /* Initial interrupt arrived before completion - * interrupt. Let the task sleep. - * An extra task reference is needed since a different - * cpu may set the task state to TASK_RUNNING again - * before the scheduler is reached. */ - get_task_struct(tsk); - tsk->thread.pfault_wait = 1; - list_add(&tsk->thread.list, &pfault_list); - __set_task_state(tsk, TASK_UNINTERRUPTIBLE); - set_tsk_need_resched(tsk); + if (uv_has_feature(BIT_UV_FEAT_MISC) && !teid.b61) { + /* + * When this happens, userspace did something that it + * was not supposed to do, e.g. branching into secure + * memory. Trigger a segmentation fault. + */ + if (user_mode(regs)) { + send_sig(SIGSEGV, current, 0); + return; } + /* + * The kernel should never run into this case and + * there is no way out of this situation. 
+ */ + panic("Unexpected PGM 0x3d with TEID bit 61=0"); } -out: - spin_unlock(&pfault_lock); - put_task_struct(tsk); -} - -static int __cpuinit pfault_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - struct thread_struct *thread, *next; - struct task_struct *tsk; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DEAD: - spin_lock_irq(&pfault_lock); - list_for_each_entry_safe(thread, next, &pfault_list, list) { - thread->pfault_wait = 0; - list_del(&thread->list); - tsk = container_of(thread, struct task_struct, thread); - wake_up_process(tsk); - put_task_struct(tsk); + if (is_kernel_fault(regs)) { + folio = phys_to_folio(addr); + if (unlikely(!folio_try_get(folio))) + return; + rc = arch_make_folio_accessible(folio); + folio_put(folio); + if (rc) + BUG(); + } else { + if (faulthandler_disabled()) + return handle_fault_error_nolock(regs, 0); + mm = current->mm; + mmap_read_lock(mm); + vma = find_vma(mm, addr); + if (!vma) + return handle_fault_error(regs, SEGV_MAPERR); + folio = folio_walk_start(&fw, vma, addr, 0); + if (!folio) { + mmap_read_unlock(mm); + return; } - spin_unlock_irq(&pfault_lock); - break; - default: - break; + /* arch_make_folio_accessible() needs a raised refcount. */ + folio_get(folio); + rc = arch_make_folio_accessible(folio); + folio_put(folio); + folio_walk_end(&fw, vma); + if (rc) + send_sig(SIGSEGV, current, 0); + mmap_read_unlock(mm); } - return NOTIFY_OK; -} - -static int __init pfault_irq_init(void) -{ - int rc; - - rc = register_external_interrupt(0x2603, pfault_interrupt); - if (rc) - goto out_extint; - rc = pfault_init() == 0 ? 0 : -EOPNOTSUPP; - if (rc) - goto out_pfault; - service_subclass_irq_register(); - hotcpu_notifier(pfault_cpu_notify, 0); - return 0; - -out_pfault: - unregister_external_interrupt(0x2603, pfault_interrupt); -out_extint: - pfault_disable = 1; - return rc; } -early_initcall(pfault_irq_init); +NOKPROBE_SYMBOL(do_secure_storage_access); -#endif /* CONFIG_PFAULT */ +#endif /* CONFIG_PGSTE */ diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c new file mode 100644 index 000000000000..dd85bcca817d --- /dev/null +++ b/arch/s390/mm/gmap.c @@ -0,0 +1,2436 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KVM guest address space mapping code + * + * Copyright IBM Corp. 2007, 2020 + * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> + * David Hildenbrand <david@redhat.com> + * Janosch Frank <frankja@linux.vnet.ibm.com> + */ + +#include <linux/cpufeature.h> +#include <linux/export.h> +#include <linux/kernel.h> +#include <linux/pagewalk.h> +#include <linux/swap.h> +#include <linux/smp.h> +#include <linux/spinlock.h> +#include <linux/slab.h> +#include <linux/swapops.h> +#include <linux/ksm.h> +#include <linux/mman.h> +#include <linux/pgtable.h> +#include <asm/page-states.h> +#include <asm/pgalloc.h> +#include <asm/machine.h> +#include <asm/gmap_helpers.h> +#include <asm/gmap.h> +#include <asm/page.h> + +/* + * The address is saved in a radix tree directly; NULL would be ambiguous, + * since 0 is a valid address, and NULL is returned when nothing was found. + * The lower bits are ignored by all users of the macro, so it can be used + * to distinguish a valid address 0 from a NULL. 
+ */ +#define VALID_GADDR_FLAG 1 +#define IS_GADDR_VALID(gaddr) ((gaddr) & VALID_GADDR_FLAG) +#define MAKE_VALID_GADDR(gaddr) (((gaddr) & HPAGE_MASK) | VALID_GADDR_FLAG) + +#define GMAP_SHADOW_FAKE_TABLE 1ULL + +static struct page *gmap_alloc_crst(void) +{ + struct page *page; + + page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); + if (!page) + return NULL; + __arch_set_page_dat(page_to_virt(page), 1UL << CRST_ALLOC_ORDER); + return page; +} + +/** + * gmap_alloc - allocate and initialize a guest address space + * @limit: maximum address of the gmap address space + * + * Returns a guest address space structure. + */ +struct gmap *gmap_alloc(unsigned long limit) +{ + struct gmap *gmap; + struct page *page; + unsigned long *table; + unsigned long etype, atype; + + if (limit < _REGION3_SIZE) { + limit = _REGION3_SIZE - 1; + atype = _ASCE_TYPE_SEGMENT; + etype = _SEGMENT_ENTRY_EMPTY; + } else if (limit < _REGION2_SIZE) { + limit = _REGION2_SIZE - 1; + atype = _ASCE_TYPE_REGION3; + etype = _REGION3_ENTRY_EMPTY; + } else if (limit < _REGION1_SIZE) { + limit = _REGION1_SIZE - 1; + atype = _ASCE_TYPE_REGION2; + etype = _REGION2_ENTRY_EMPTY; + } else { + limit = -1UL; + atype = _ASCE_TYPE_REGION1; + etype = _REGION1_ENTRY_EMPTY; + } + gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL_ACCOUNT); + if (!gmap) + goto out; + INIT_LIST_HEAD(&gmap->children); + INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL_ACCOUNT); + INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC | __GFP_ACCOUNT); + INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC | __GFP_ACCOUNT); + spin_lock_init(&gmap->guest_table_lock); + spin_lock_init(&gmap->shadow_lock); + refcount_set(&gmap->ref_count, 1); + page = gmap_alloc_crst(); + if (!page) + goto out_free; + table = page_to_virt(page); + crst_table_init(table, etype); + gmap->table = table; + gmap->asce = atype | _ASCE_TABLE_LENGTH | + _ASCE_USER_BITS | __pa(table); + gmap->asce_end = limit; + return gmap; + +out_free: + kfree(gmap); +out: + return NULL; +} +EXPORT_SYMBOL_GPL(gmap_alloc); + +/** + * gmap_create - create a guest address space + * @mm: pointer to the parent mm_struct + * @limit: maximum size of the gmap address space + * + * Returns a guest address space structure. 
+ */ +struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit) +{ + struct gmap *gmap; + unsigned long gmap_asce; + + gmap = gmap_alloc(limit); + if (!gmap) + return NULL; + gmap->mm = mm; + spin_lock(&mm->context.lock); + list_add_rcu(&gmap->list, &mm->context.gmap_list); + if (list_is_singular(&mm->context.gmap_list)) + gmap_asce = gmap->asce; + else + gmap_asce = -1UL; + WRITE_ONCE(mm->context.gmap_asce, gmap_asce); + spin_unlock(&mm->context.lock); + return gmap; +} +EXPORT_SYMBOL_GPL(gmap_create); + +static void gmap_flush_tlb(struct gmap *gmap) +{ + __tlb_flush_idte(gmap->asce); +} + +static void gmap_radix_tree_free(struct radix_tree_root *root) +{ + struct radix_tree_iter iter; + unsigned long indices[16]; + unsigned long index; + void __rcu **slot; + int i, nr; + + /* A radix tree is freed by deleting all of its entries */ + index = 0; + do { + nr = 0; + radix_tree_for_each_slot(slot, root, &iter, index) { + indices[nr] = iter.index; + if (++nr == 16) + break; + } + for (i = 0; i < nr; i++) { + index = indices[i]; + radix_tree_delete(root, index); + } + } while (nr > 0); +} + +static void gmap_rmap_radix_tree_free(struct radix_tree_root *root) +{ + struct gmap_rmap *rmap, *rnext, *head; + struct radix_tree_iter iter; + unsigned long indices[16]; + unsigned long index; + void __rcu **slot; + int i, nr; + + /* A radix tree is freed by deleting all of its entries */ + index = 0; + do { + nr = 0; + radix_tree_for_each_slot(slot, root, &iter, index) { + indices[nr] = iter.index; + if (++nr == 16) + break; + } + for (i = 0; i < nr; i++) { + index = indices[i]; + head = radix_tree_delete(root, index); + gmap_for_each_rmap_safe(rmap, rnext, head) + kfree(rmap); + } + } while (nr > 0); +} + +static void gmap_free_crst(unsigned long *table, bool free_ptes) +{ + bool is_segment = (table[0] & _SEGMENT_ENTRY_TYPE_MASK) == 0; + int i; + + if (is_segment) { + if (!free_ptes) + goto out; + for (i = 0; i < _CRST_ENTRIES; i++) + if (!(table[i] & _SEGMENT_ENTRY_INVALID)) + page_table_free_pgste(page_ptdesc(phys_to_page(table[i]))); + } else { + for (i = 0; i < _CRST_ENTRIES; i++) + if (!(table[i] & _REGION_ENTRY_INVALID)) + gmap_free_crst(__va(table[i] & PAGE_MASK), free_ptes); + } + +out: + free_pages((unsigned long)table, CRST_ALLOC_ORDER); +} + +/** + * gmap_free - free a guest address space + * @gmap: pointer to the guest address space structure + * + * No locks required. There are no references to this gmap anymore. + */ +void gmap_free(struct gmap *gmap) +{ + /* Flush tlb of all gmaps (if not already done for shadows) */ + if (!(gmap_is_shadow(gmap) && gmap->removed)) + gmap_flush_tlb(gmap); + /* Free all segment & region tables. 
*/ + gmap_free_crst(gmap->table, gmap_is_shadow(gmap)); + + gmap_radix_tree_free(&gmap->guest_to_host); + gmap_radix_tree_free(&gmap->host_to_guest); + + /* Free additional data for a shadow gmap */ + if (gmap_is_shadow(gmap)) { + gmap_rmap_radix_tree_free(&gmap->host_to_rmap); + /* Release reference to the parent */ + gmap_put(gmap->parent); + } + + kfree(gmap); +} +EXPORT_SYMBOL_GPL(gmap_free); + +/** + * gmap_get - increase reference counter for guest address space + * @gmap: pointer to the guest address space structure + * + * Returns the gmap pointer + */ +struct gmap *gmap_get(struct gmap *gmap) +{ + refcount_inc(&gmap->ref_count); + return gmap; +} +EXPORT_SYMBOL_GPL(gmap_get); + +/** + * gmap_put - decrease reference counter for guest address space + * @gmap: pointer to the guest address space structure + * + * If the reference counter reaches zero the guest address space is freed. + */ +void gmap_put(struct gmap *gmap) +{ + if (refcount_dec_and_test(&gmap->ref_count)) + gmap_free(gmap); +} +EXPORT_SYMBOL_GPL(gmap_put); + +/** + * gmap_remove - remove a guest address space but do not free it yet + * @gmap: pointer to the guest address space structure + */ +void gmap_remove(struct gmap *gmap) +{ + struct gmap *sg, *next; + unsigned long gmap_asce; + + /* Remove all shadow gmaps linked to this gmap */ + if (!list_empty(&gmap->children)) { + spin_lock(&gmap->shadow_lock); + list_for_each_entry_safe(sg, next, &gmap->children, list) { + list_del(&sg->list); + gmap_put(sg); + } + spin_unlock(&gmap->shadow_lock); + } + /* Remove gmap from the pre-mm list */ + spin_lock(&gmap->mm->context.lock); + list_del_rcu(&gmap->list); + if (list_empty(&gmap->mm->context.gmap_list)) + gmap_asce = 0; + else if (list_is_singular(&gmap->mm->context.gmap_list)) + gmap_asce = list_first_entry(&gmap->mm->context.gmap_list, + struct gmap, list)->asce; + else + gmap_asce = -1UL; + WRITE_ONCE(gmap->mm->context.gmap_asce, gmap_asce); + spin_unlock(&gmap->mm->context.lock); + synchronize_rcu(); + /* Put reference */ + gmap_put(gmap); +} +EXPORT_SYMBOL_GPL(gmap_remove); + +/* + * gmap_alloc_table is assumed to be called with mmap_lock held + */ +static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, + unsigned long init, unsigned long gaddr) +{ + struct page *page; + unsigned long *new; + + /* since we dont free the gmap table until gmap_free we can unlock */ + page = gmap_alloc_crst(); + if (!page) + return -ENOMEM; + new = page_to_virt(page); + crst_table_init(new, init); + spin_lock(&gmap->guest_table_lock); + if (*table & _REGION_ENTRY_INVALID) { + *table = __pa(new) | _REGION_ENTRY_LENGTH | + (*table & _REGION_ENTRY_TYPE_MASK); + page = NULL; + } + spin_unlock(&gmap->guest_table_lock); + if (page) + __free_pages(page, CRST_ALLOC_ORDER); + return 0; +} + +static unsigned long host_to_guest_lookup(struct gmap *gmap, unsigned long vmaddr) +{ + return (unsigned long)radix_tree_lookup(&gmap->host_to_guest, vmaddr >> PMD_SHIFT); +} + +static unsigned long host_to_guest_delete(struct gmap *gmap, unsigned long vmaddr) +{ + return (unsigned long)radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT); +} + +static pmd_t *host_to_guest_pmd_delete(struct gmap *gmap, unsigned long vmaddr, + unsigned long *gaddr) +{ + *gaddr = host_to_guest_delete(gmap, vmaddr); + if (IS_GADDR_VALID(*gaddr)) + return (pmd_t *)gmap_table_walk(gmap, *gaddr, 1); + return NULL; +} + +/** + * __gmap_unlink_by_vmaddr - unlink a single segment via a host address + * @gmap: pointer to the guest address space structure + * 
@vmaddr: address in the host process address space + * + * Returns 1 if a TLB flush is required + */ +static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr) +{ + unsigned long gaddr; + int flush = 0; + pmd_t *pmdp; + + BUG_ON(gmap_is_shadow(gmap)); + spin_lock(&gmap->guest_table_lock); + + pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr); + if (pmdp) { + flush = (pmd_val(*pmdp) != _SEGMENT_ENTRY_EMPTY); + *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY); + } + + spin_unlock(&gmap->guest_table_lock); + return flush; +} + +/** + * __gmap_unmap_by_gaddr - unmap a single segment via a guest address + * @gmap: pointer to the guest address space structure + * @gaddr: address in the guest address space + * + * Returns 1 if a TLB flush is required + */ +static int __gmap_unmap_by_gaddr(struct gmap *gmap, unsigned long gaddr) +{ + unsigned long vmaddr; + + vmaddr = (unsigned long) radix_tree_delete(&gmap->guest_to_host, + gaddr >> PMD_SHIFT); + return vmaddr ? __gmap_unlink_by_vmaddr(gmap, vmaddr) : 0; +} + +/** + * gmap_unmap_segment - unmap segment from the guest address space + * @gmap: pointer to the guest address space structure + * @to: address in the guest address space + * @len: length of the memory area to unmap + * + * Returns 0 if the unmap succeeded, -EINVAL if not. + */ +int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len) +{ + unsigned long off; + int flush; + + BUG_ON(gmap_is_shadow(gmap)); + if ((to | len) & (PMD_SIZE - 1)) + return -EINVAL; + if (len == 0 || to + len < to) + return -EINVAL; + + flush = 0; + mmap_write_lock(gmap->mm); + for (off = 0; off < len; off += PMD_SIZE) + flush |= __gmap_unmap_by_gaddr(gmap, to + off); + mmap_write_unlock(gmap->mm); + if (flush) + gmap_flush_tlb(gmap); + return 0; +} +EXPORT_SYMBOL_GPL(gmap_unmap_segment); + +/** + * gmap_map_segment - map a segment to the guest address space + * @gmap: pointer to the guest address space structure + * @from: source address in the parent address space + * @to: target address in the guest address space + * @len: length of the memory area to map + * + * Returns 0 if the mmap succeeded, -EINVAL or -ENOMEM if not. + */ +int gmap_map_segment(struct gmap *gmap, unsigned long from, + unsigned long to, unsigned long len) +{ + unsigned long off; + int flush; + + BUG_ON(gmap_is_shadow(gmap)); + if ((from | to | len) & (PMD_SIZE - 1)) + return -EINVAL; + if (len == 0 || from + len < from || to + len < to || + from + len - 1 > TASK_SIZE_MAX || to + len - 1 > gmap->asce_end) + return -EINVAL; + + flush = 0; + mmap_write_lock(gmap->mm); + for (off = 0; off < len; off += PMD_SIZE) { + /* Remove old translation */ + flush |= __gmap_unmap_by_gaddr(gmap, to + off); + /* Store new translation */ + if (radix_tree_insert(&gmap->guest_to_host, + (to + off) >> PMD_SHIFT, + (void *) from + off)) + break; + } + mmap_write_unlock(gmap->mm); + if (flush) + gmap_flush_tlb(gmap); + if (off >= len) + return 0; + gmap_unmap_segment(gmap, to, len); + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(gmap_map_segment); + +/** + * __gmap_translate - translate a guest address to a user space address + * @gmap: pointer to guest mapping meta data structure + * @gaddr: guest address + * + * Returns user space address which corresponds to the guest address or + * -EFAULT if no such mapping exists. + * This function does not establish potentially missing page table entries. + * The mmap_lock of the mm that belongs to the address space must be held + * when this function gets called. 
+ * + * Note: Can also be called for shadow gmaps. + */ +unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr) +{ + unsigned long vmaddr; + + vmaddr = (unsigned long) + radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT); + /* Note: guest_to_host is empty for a shadow gmap */ + return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT; +} +EXPORT_SYMBOL_GPL(__gmap_translate); + +/** + * gmap_unlink - disconnect a page table from the gmap shadow tables + * @mm: pointer to the parent mm_struct + * @table: pointer to the host page table + * @vmaddr: vm address associated with the host page table + */ +void gmap_unlink(struct mm_struct *mm, unsigned long *table, + unsigned long vmaddr) +{ + struct gmap *gmap; + int flush; + + rcu_read_lock(); + list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { + flush = __gmap_unlink_by_vmaddr(gmap, vmaddr); + if (flush) + gmap_flush_tlb(gmap); + } + rcu_read_unlock(); +} + +static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *old, pmd_t new, + unsigned long gaddr); + +/** + * __gmap_link - set up shadow page tables to connect a host to a guest address + * @gmap: pointer to guest mapping meta data structure + * @gaddr: guest address + * @vmaddr: vm address + * + * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT + * if the vm address is already mapped to a different guest segment. + * The mmap_lock of the mm that belongs to the address space must be held + * when this function gets called. + */ +int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) +{ + struct mm_struct *mm; + unsigned long *table; + spinlock_t *ptl; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + u64 unprot; + int rc; + + BUG_ON(gmap_is_shadow(gmap)); + /* Create higher level tables in the gmap page table */ + table = gmap->table; + if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) { + table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT; + if ((*table & _REGION_ENTRY_INVALID) && + gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY, + gaddr & _REGION1_MASK)) + return -ENOMEM; + table = __va(*table & _REGION_ENTRY_ORIGIN); + } + if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) { + table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT; + if ((*table & _REGION_ENTRY_INVALID) && + gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY, + gaddr & _REGION2_MASK)) + return -ENOMEM; + table = __va(*table & _REGION_ENTRY_ORIGIN); + } + if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) { + table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT; + if ((*table & _REGION_ENTRY_INVALID) && + gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY, + gaddr & _REGION3_MASK)) + return -ENOMEM; + table = __va(*table & _REGION_ENTRY_ORIGIN); + } + table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; + /* Walk the parent mm page table */ + mm = gmap->mm; + pgd = pgd_offset(mm, vmaddr); + VM_BUG_ON(pgd_none(*pgd)); + p4d = p4d_offset(pgd, vmaddr); + VM_BUG_ON(p4d_none(*p4d)); + pud = pud_offset(p4d, vmaddr); + VM_BUG_ON(pud_none(*pud)); + /* large puds cannot yet be handled */ + if (pud_leaf(*pud)) + return -EFAULT; + pmd = pmd_offset(pud, vmaddr); + VM_BUG_ON(pmd_none(*pmd)); + /* Are we allowed to use huge pages? */ + if (pmd_leaf(*pmd) && !gmap->mm->context.allow_gmap_hpage_1m) + return -EFAULT; + /* Link gmap segment table entry location to page table. 
*/ + rc = radix_tree_preload(GFP_KERNEL_ACCOUNT); + if (rc) + return rc; + ptl = pmd_lock(mm, pmd); + spin_lock(&gmap->guest_table_lock); + if (*table == _SEGMENT_ENTRY_EMPTY) { + rc = radix_tree_insert(&gmap->host_to_guest, + vmaddr >> PMD_SHIFT, + (void *)MAKE_VALID_GADDR(gaddr)); + if (!rc) { + if (pmd_leaf(*pmd)) { + *table = (pmd_val(*pmd) & + _SEGMENT_ENTRY_HARDWARE_BITS_LARGE) + | _SEGMENT_ENTRY_GMAP_UC + | _SEGMENT_ENTRY; + } else + *table = (pmd_val(*pmd) & + _SEGMENT_ENTRY_HARDWARE_BITS) + | _SEGMENT_ENTRY; + } + } else if (*table & _SEGMENT_ENTRY_PROTECT && + !(pmd_val(*pmd) & _SEGMENT_ENTRY_PROTECT)) { + unprot = (u64)*table; + unprot &= ~_SEGMENT_ENTRY_PROTECT; + unprot |= _SEGMENT_ENTRY_GMAP_UC; + gmap_pmdp_xchg(gmap, (pmd_t *)table, __pmd(unprot), gaddr); + } + spin_unlock(&gmap->guest_table_lock); + spin_unlock(ptl); + radix_tree_preload_end(); + return rc; +} +EXPORT_SYMBOL(__gmap_link); + +/* + * this function is assumed to be called with mmap_lock held + */ +void __gmap_zap(struct gmap *gmap, unsigned long gaddr) +{ + unsigned long vmaddr; + + mmap_assert_locked(gmap->mm); + + /* Find the vm address for the guest address */ + vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host, + gaddr >> PMD_SHIFT); + if (vmaddr) { + vmaddr |= gaddr & ~PMD_MASK; + gmap_helper_zap_one_page(gmap->mm, vmaddr); + } +} +EXPORT_SYMBOL_GPL(__gmap_zap); + +static LIST_HEAD(gmap_notifier_list); +static DEFINE_SPINLOCK(gmap_notifier_lock); + +/** + * gmap_register_pte_notifier - register a pte invalidation callback + * @nb: pointer to the gmap notifier block + */ +void gmap_register_pte_notifier(struct gmap_notifier *nb) +{ + spin_lock(&gmap_notifier_lock); + list_add_rcu(&nb->list, &gmap_notifier_list); + spin_unlock(&gmap_notifier_lock); +} +EXPORT_SYMBOL_GPL(gmap_register_pte_notifier); + +/** + * gmap_unregister_pte_notifier - remove a pte invalidation callback + * @nb: pointer to the gmap notifier block + */ +void gmap_unregister_pte_notifier(struct gmap_notifier *nb) +{ + spin_lock(&gmap_notifier_lock); + list_del_rcu(&nb->list); + spin_unlock(&gmap_notifier_lock); + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(gmap_unregister_pte_notifier); + +/** + * gmap_call_notifier - call all registered invalidation callbacks + * @gmap: pointer to guest mapping meta data structure + * @start: start virtual address in the guest address space + * @end: end virtual address in the guest address space + */ +static void gmap_call_notifier(struct gmap *gmap, unsigned long start, + unsigned long end) +{ + struct gmap_notifier *nb; + + list_for_each_entry(nb, &gmap_notifier_list, list) + nb->notifier_call(gmap, start, end); +} + +/** + * gmap_table_walk - walk the gmap page tables + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * @level: page table level to stop at + * + * Returns a table entry pointer for the given guest address and @level + * @level=0 : returns a pointer to a page table table entry (or NULL) + * @level=1 : returns a pointer to a segment table entry (or NULL) + * @level=2 : returns a pointer to a region-3 table entry (or NULL) + * @level=3 : returns a pointer to a region-2 table entry (or NULL) + * @level=4 : returns a pointer to a region-1 table entry (or NULL) + * + * Returns NULL if the gmap page tables could not be walked to the + * requested level. + * + * Note: Can also be called for shadow gmaps. 
+ */ +unsigned long *gmap_table_walk(struct gmap *gmap, unsigned long gaddr, int level) +{ + const int asce_type = gmap->asce & _ASCE_TYPE_MASK; + unsigned long *table = gmap->table; + + if (gmap_is_shadow(gmap) && gmap->removed) + return NULL; + + if (WARN_ON_ONCE(level > (asce_type >> 2) + 1)) + return NULL; + + if (asce_type != _ASCE_TYPE_REGION1 && + gaddr & (-1UL << (31 + (asce_type >> 2) * 11))) + return NULL; + + switch (asce_type) { + case _ASCE_TYPE_REGION1: + table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT; + if (level == 4) + break; + if (*table & _REGION_ENTRY_INVALID) + return NULL; + table = __va(*table & _REGION_ENTRY_ORIGIN); + fallthrough; + case _ASCE_TYPE_REGION2: + table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT; + if (level == 3) + break; + if (*table & _REGION_ENTRY_INVALID) + return NULL; + table = __va(*table & _REGION_ENTRY_ORIGIN); + fallthrough; + case _ASCE_TYPE_REGION3: + table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT; + if (level == 2) + break; + if (*table & _REGION_ENTRY_INVALID) + return NULL; + table = __va(*table & _REGION_ENTRY_ORIGIN); + fallthrough; + case _ASCE_TYPE_SEGMENT: + table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; + if (level == 1) + break; + if (*table & _REGION_ENTRY_INVALID) + return NULL; + table = __va(*table & _SEGMENT_ENTRY_ORIGIN); + table += (gaddr & _PAGE_INDEX) >> PAGE_SHIFT; + } + return table; +} +EXPORT_SYMBOL(gmap_table_walk); + +/** + * gmap_pte_op_walk - walk the gmap page table, get the page table lock + * and return the pte pointer + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * @ptl: pointer to the spinlock pointer + * + * Returns a pointer to the locked pte for a guest address, or NULL + */ +static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr, + spinlock_t **ptl) +{ + unsigned long *table; + + BUG_ON(gmap_is_shadow(gmap)); + /* Walk the gmap page table, lock and get pte pointer */ + table = gmap_table_walk(gmap, gaddr, 1); /* get segment pointer */ + if (!table || *table & _SEGMENT_ENTRY_INVALID) + return NULL; + return pte_alloc_map_lock(gmap->mm, (pmd_t *) table, gaddr, ptl); +} + +/** + * gmap_pte_op_fixup - force a page in and connect the gmap page table + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * @vmaddr: address in the host process address space + * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE + * + * Returns 0 if the caller can retry __gmap_translate (might fail again), + * -ENOMEM if out of memory and -EFAULT if anything goes wrong while fixing + * up or connecting the gmap page table. + */ +static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr, + unsigned long vmaddr, int prot) +{ + struct mm_struct *mm = gmap->mm; + unsigned int fault_flags; + bool unlocked = false; + + BUG_ON(gmap_is_shadow(gmap)); + fault_flags = (prot == PROT_WRITE) ? 
FAULT_FLAG_WRITE : 0; + if (fixup_user_fault(mm, vmaddr, fault_flags, &unlocked)) + return -EFAULT; + if (unlocked) + /* lost mmap_lock, caller has to retry __gmap_translate */ + return 0; + /* Connect the page tables */ + return __gmap_link(gmap, gaddr, vmaddr); +} + +/** + * gmap_pte_op_end - release the page table lock + * @ptep: pointer to the locked pte + * @ptl: pointer to the page table spinlock + */ +static void gmap_pte_op_end(pte_t *ptep, spinlock_t *ptl) +{ + pte_unmap_unlock(ptep, ptl); +} + +/** + * gmap_pmd_op_walk - walk the gmap tables, get the guest table lock + * and return the pmd pointer + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * + * Returns a pointer to the pmd for a guest address, or NULL + */ +static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr) +{ + pmd_t *pmdp; + + BUG_ON(gmap_is_shadow(gmap)); + pmdp = (pmd_t *) gmap_table_walk(gmap, gaddr, 1); + if (!pmdp) + return NULL; + + /* without huge pages, there is no need to take the table lock */ + if (!gmap->mm->context.allow_gmap_hpage_1m) + return pmd_none(*pmdp) ? NULL : pmdp; + + spin_lock(&gmap->guest_table_lock); + if (pmd_none(*pmdp)) { + spin_unlock(&gmap->guest_table_lock); + return NULL; + } + + /* 4k page table entries are locked via the pte (pte_alloc_map_lock). */ + if (!pmd_leaf(*pmdp)) + spin_unlock(&gmap->guest_table_lock); + return pmdp; +} + +/** + * gmap_pmd_op_end - release the guest_table_lock if needed + * @gmap: pointer to the guest mapping meta data structure + * @pmdp: pointer to the pmd + */ +static inline void gmap_pmd_op_end(struct gmap *gmap, pmd_t *pmdp) +{ + if (pmd_leaf(*pmdp)) + spin_unlock(&gmap->guest_table_lock); +} + +/* + * gmap_protect_pmd - remove access rights to memory and set pmd notification bits + * @pmdp: pointer to the pmd to be protected + * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE + * @bits: notification bits to set + * + * Returns: + * 0 if successfully protected + * -EAGAIN if a fixup is needed + * -EINVAL if unsupported notifier bits have been specified + * + * Expected to be called with sg->mm->mmap_lock in read and + * guest_table_lock held. 
+ */ +static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr, + pmd_t *pmdp, int prot, unsigned long bits) +{ + int pmd_i = pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID; + int pmd_p = pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT; + pmd_t new = *pmdp; + + /* Fixup needed */ + if ((pmd_i && (prot != PROT_NONE)) || (pmd_p && (prot == PROT_WRITE))) + return -EAGAIN; + + if (prot == PROT_NONE && !pmd_i) { + new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID)); + gmap_pmdp_xchg(gmap, pmdp, new, gaddr); + } + + if (prot == PROT_READ && !pmd_p) { + new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID)); + new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_PROTECT)); + gmap_pmdp_xchg(gmap, pmdp, new, gaddr); + } + + if (bits & GMAP_NOTIFY_MPROT) + set_pmd(pmdp, set_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN))); + + /* Shadow GMAP protection needs split PMDs */ + if (bits & GMAP_NOTIFY_SHADOW) + return -EINVAL; + + return 0; +} + +/* + * gmap_protect_pte - remove access rights to memory and set pgste bits + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * @pmdp: pointer to the pmd associated with the pte + * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE + * @bits: notification bits to set + * + * Returns 0 if successfully protected, -ENOMEM if out of memory and + * -EAGAIN if a fixup is needed. + * + * Expected to be called with sg->mm->mmap_lock in read + */ +static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr, + pmd_t *pmdp, int prot, unsigned long bits) +{ + int rc; + pte_t *ptep; + spinlock_t *ptl; + unsigned long pbits = 0; + + if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID) + return -EAGAIN; + + ptep = pte_alloc_map_lock(gmap->mm, pmdp, gaddr, &ptl); + if (!ptep) + return -ENOMEM; + + pbits |= (bits & GMAP_NOTIFY_MPROT) ? PGSTE_IN_BIT : 0; + pbits |= (bits & GMAP_NOTIFY_SHADOW) ? PGSTE_VSIE_BIT : 0; + /* Protect and unlock. */ + rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, pbits); + gmap_pte_op_end(ptep, ptl); + return rc; +} + +/* + * gmap_protect_range - remove access rights to memory and set pgste bits + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * @len: size of area + * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE + * @bits: pgste notification bits to set + * + * Returns: + * PAGE_SIZE if a small page was successfully protected; + * HPAGE_SIZE if a large page was successfully protected; + * -ENOMEM if out of memory; + * -EFAULT if gaddr is invalid (or mapping for shadows is missing); + * -EAGAIN if the guest mapping is missing and should be fixed by the caller. + * + * Context: Called with sg->mm->mmap_lock in read. + */ +int gmap_protect_one(struct gmap *gmap, unsigned long gaddr, int prot, unsigned long bits) +{ + pmd_t *pmdp; + int rc = 0; + + BUG_ON(gmap_is_shadow(gmap)); + + pmdp = gmap_pmd_op_walk(gmap, gaddr); + if (!pmdp) + return -EAGAIN; + + if (!pmd_leaf(*pmdp)) { + rc = gmap_protect_pte(gmap, gaddr, pmdp, prot, bits); + if (!rc) + rc = PAGE_SIZE; + } else { + rc = gmap_protect_pmd(gmap, gaddr, pmdp, prot, bits); + if (!rc) + rc = HPAGE_SIZE; + } + gmap_pmd_op_end(gmap, pmdp); + + return rc; +} +EXPORT_SYMBOL_GPL(gmap_protect_one); + +/** + * gmap_read_table - get an unsigned long value from a guest page table using + * absolute addressing, without marking the page referenced. 
+ * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * @val: pointer to the unsigned long value to return + * + * Returns 0 if the value was read, -ENOMEM if out of memory and -EFAULT + * if reading using the virtual address failed. -EINVAL if called on a gmap + * shadow. + * + * Called with gmap->mm->mmap_lock in read. + */ +int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val) +{ + unsigned long address, vmaddr; + spinlock_t *ptl; + pte_t *ptep, pte; + int rc; + + if (gmap_is_shadow(gmap)) + return -EINVAL; + + while (1) { + rc = -EAGAIN; + ptep = gmap_pte_op_walk(gmap, gaddr, &ptl); + if (ptep) { + pte = *ptep; + if (pte_present(pte) && (pte_val(pte) & _PAGE_READ)) { + address = pte_val(pte) & PAGE_MASK; + address += gaddr & ~PAGE_MASK; + *val = *(unsigned long *)__va(address); + set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_YOUNG))); + /* Do *NOT* clear the _PAGE_INVALID bit! */ + rc = 0; + } + gmap_pte_op_end(ptep, ptl); + } + if (!rc) + break; + vmaddr = __gmap_translate(gmap, gaddr); + if (IS_ERR_VALUE(vmaddr)) { + rc = vmaddr; + break; + } + rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, PROT_READ); + if (rc) + break; + } + return rc; +} +EXPORT_SYMBOL_GPL(gmap_read_table); + +/** + * gmap_insert_rmap - add a rmap to the host_to_rmap radix tree + * @sg: pointer to the shadow guest address space structure + * @vmaddr: vm address associated with the rmap + * @rmap: pointer to the rmap structure + * + * Called with the sg->guest_table_lock + */ +static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr, + struct gmap_rmap *rmap) +{ + struct gmap_rmap *temp; + void __rcu **slot; + + BUG_ON(!gmap_is_shadow(sg)); + slot = radix_tree_lookup_slot(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT); + if (slot) { + rmap->next = radix_tree_deref_slot_protected(slot, + &sg->guest_table_lock); + for (temp = rmap->next; temp; temp = temp->next) { + if (temp->raddr == rmap->raddr) { + kfree(rmap); + return; + } + } + radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap); + } else { + rmap->next = NULL; + radix_tree_insert(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT, + rmap); + } +} + +/** + * gmap_protect_rmap - restrict access rights to memory (RO) and create an rmap + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow gmap + * @paddr: address in the parent guest address space + * @len: length of the memory area to protect + * + * Returns 0 if successfully protected and the rmap was created, -ENOMEM + * if out of memory and -EFAULT if paddr is invalid. 
+ */ +static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr, + unsigned long paddr, unsigned long len) +{ + struct gmap *parent; + struct gmap_rmap *rmap; + unsigned long vmaddr; + spinlock_t *ptl; + pte_t *ptep; + int rc; + + BUG_ON(!gmap_is_shadow(sg)); + parent = sg->parent; + while (len) { + vmaddr = __gmap_translate(parent, paddr); + if (IS_ERR_VALUE(vmaddr)) + return vmaddr; + rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT); + if (!rmap) + return -ENOMEM; + rmap->raddr = raddr; + rc = radix_tree_preload(GFP_KERNEL_ACCOUNT); + if (rc) { + kfree(rmap); + return rc; + } + rc = -EAGAIN; + ptep = gmap_pte_op_walk(parent, paddr, &ptl); + if (ptep) { + spin_lock(&sg->guest_table_lock); + rc = ptep_force_prot(parent->mm, paddr, ptep, PROT_READ, + PGSTE_VSIE_BIT); + if (!rc) + gmap_insert_rmap(sg, vmaddr, rmap); + spin_unlock(&sg->guest_table_lock); + gmap_pte_op_end(ptep, ptl); + } + radix_tree_preload_end(); + if (rc) { + kfree(rmap); + rc = gmap_pte_op_fixup(parent, paddr, vmaddr, PROT_READ); + if (rc) + return rc; + continue; + } + paddr += PAGE_SIZE; + len -= PAGE_SIZE; + } + return 0; +} + +#define _SHADOW_RMAP_MASK 0x7 +#define _SHADOW_RMAP_REGION1 0x5 +#define _SHADOW_RMAP_REGION2 0x4 +#define _SHADOW_RMAP_REGION3 0x3 +#define _SHADOW_RMAP_SEGMENT 0x2 +#define _SHADOW_RMAP_PGTABLE 0x1 + +/** + * gmap_idte_one - invalidate a single region or segment table entry + * @asce: region or segment table *origin* + table-type bits + * @vaddr: virtual address to identify the table entry to flush + * + * The invalid bit of a single region or segment table entry is set + * and the associated TLB entries depending on the entry are flushed. + * The table-type of the @asce identifies the portion of the @vaddr + * that is used as the invalidation index. 
+ */ +static inline void gmap_idte_one(unsigned long asce, unsigned long vaddr) +{ + asm volatile( + " idte %0,0,%1" + : : "a" (asce), "a" (vaddr) : "cc", "memory"); +} + +/** + * gmap_unshadow_page - remove a page from a shadow page table + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * + * Called with the sg->guest_table_lock + */ +static void gmap_unshadow_page(struct gmap *sg, unsigned long raddr) +{ + unsigned long *table; + + BUG_ON(!gmap_is_shadow(sg)); + table = gmap_table_walk(sg, raddr, 0); /* get page table pointer */ + if (!table || *table & _PAGE_INVALID) + return; + gmap_call_notifier(sg, raddr, raddr + PAGE_SIZE - 1); + ptep_unshadow_pte(sg->mm, raddr, (pte_t *) table); +} + +/** + * __gmap_unshadow_pgt - remove all entries from a shadow page table + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * @pgt: pointer to the start of a shadow page table + * + * Called with the sg->guest_table_lock + */ +static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr, + unsigned long *pgt) +{ + int i; + + BUG_ON(!gmap_is_shadow(sg)); + for (i = 0; i < _PAGE_ENTRIES; i++, raddr += PAGE_SIZE) + pgt[i] = _PAGE_INVALID; +} + +/** + * gmap_unshadow_pgt - remove a shadow page table from a segment entry + * @sg: pointer to the shadow guest address space structure + * @raddr: address in the shadow guest address space + * + * Called with the sg->guest_table_lock + */ +static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr) +{ + unsigned long *ste; + phys_addr_t sto, pgt; + struct ptdesc *ptdesc; + + BUG_ON(!gmap_is_shadow(sg)); + ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */ + if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN)) + return; + gmap_call_notifier(sg, raddr, raddr + _SEGMENT_SIZE - 1); + sto = __pa(ste - ((raddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT)); + gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr); + pgt = *ste & _SEGMENT_ENTRY_ORIGIN; + *ste = _SEGMENT_ENTRY_EMPTY; + __gmap_unshadow_pgt(sg, raddr, __va(pgt)); + /* Free page table */ + ptdesc = page_ptdesc(phys_to_page(pgt)); + page_table_free_pgste(ptdesc); +} + +/** + * __gmap_unshadow_sgt - remove all entries from a shadow segment table + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * @sgt: pointer to the start of a shadow segment table + * + * Called with the sg->guest_table_lock + */ +static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr, + unsigned long *sgt) +{ + struct ptdesc *ptdesc; + phys_addr_t pgt; + int i; + + BUG_ON(!gmap_is_shadow(sg)); + for (i = 0; i < _CRST_ENTRIES; i++, raddr += _SEGMENT_SIZE) { + if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN)) + continue; + pgt = sgt[i] & _REGION_ENTRY_ORIGIN; + sgt[i] = _SEGMENT_ENTRY_EMPTY; + __gmap_unshadow_pgt(sg, raddr, __va(pgt)); + /* Free page table */ + ptdesc = page_ptdesc(phys_to_page(pgt)); + page_table_free_pgste(ptdesc); + } +} + +/** + * gmap_unshadow_sgt - remove a shadow segment table from a region-3 entry + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * + * Called with the shadow->guest_table_lock + */ +static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr) +{ + unsigned long r3o, *r3e; + phys_addr_t sgt; + struct page *page; + + BUG_ON(!gmap_is_shadow(sg)); + r3e = gmap_table_walk(sg, raddr, 2); /* get region-3 pointer 
*/ + if (!r3e || !(*r3e & _REGION_ENTRY_ORIGIN)) + return; + gmap_call_notifier(sg, raddr, raddr + _REGION3_SIZE - 1); + r3o = (unsigned long) (r3e - ((raddr & _REGION3_INDEX) >> _REGION3_SHIFT)); + gmap_idte_one(__pa(r3o) | _ASCE_TYPE_REGION3, raddr); + sgt = *r3e & _REGION_ENTRY_ORIGIN; + *r3e = _REGION3_ENTRY_EMPTY; + __gmap_unshadow_sgt(sg, raddr, __va(sgt)); + /* Free segment table */ + page = phys_to_page(sgt); + __free_pages(page, CRST_ALLOC_ORDER); +} + +/** + * __gmap_unshadow_r3t - remove all entries from a shadow region-3 table + * @sg: pointer to the shadow guest address space structure + * @raddr: address in the shadow guest address space + * @r3t: pointer to the start of a shadow region-3 table + * + * Called with the sg->guest_table_lock + */ +static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr, + unsigned long *r3t) +{ + struct page *page; + phys_addr_t sgt; + int i; + + BUG_ON(!gmap_is_shadow(sg)); + for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION3_SIZE) { + if (!(r3t[i] & _REGION_ENTRY_ORIGIN)) + continue; + sgt = r3t[i] & _REGION_ENTRY_ORIGIN; + r3t[i] = _REGION3_ENTRY_EMPTY; + __gmap_unshadow_sgt(sg, raddr, __va(sgt)); + /* Free segment table */ + page = phys_to_page(sgt); + __free_pages(page, CRST_ALLOC_ORDER); + } +} + +/** + * gmap_unshadow_r3t - remove a shadow region-3 table from a region-2 entry + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * + * Called with the sg->guest_table_lock + */ +static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr) +{ + unsigned long r2o, *r2e; + phys_addr_t r3t; + struct page *page; + + BUG_ON(!gmap_is_shadow(sg)); + r2e = gmap_table_walk(sg, raddr, 3); /* get region-2 pointer */ + if (!r2e || !(*r2e & _REGION_ENTRY_ORIGIN)) + return; + gmap_call_notifier(sg, raddr, raddr + _REGION2_SIZE - 1); + r2o = (unsigned long) (r2e - ((raddr & _REGION2_INDEX) >> _REGION2_SHIFT)); + gmap_idte_one(__pa(r2o) | _ASCE_TYPE_REGION2, raddr); + r3t = *r2e & _REGION_ENTRY_ORIGIN; + *r2e = _REGION2_ENTRY_EMPTY; + __gmap_unshadow_r3t(sg, raddr, __va(r3t)); + /* Free region 3 table */ + page = phys_to_page(r3t); + __free_pages(page, CRST_ALLOC_ORDER); +} + +/** + * __gmap_unshadow_r2t - remove all entries from a shadow region-2 table + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * @r2t: pointer to the start of a shadow region-2 table + * + * Called with the sg->guest_table_lock + */ +static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr, + unsigned long *r2t) +{ + phys_addr_t r3t; + struct page *page; + int i; + + BUG_ON(!gmap_is_shadow(sg)); + for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION2_SIZE) { + if (!(r2t[i] & _REGION_ENTRY_ORIGIN)) + continue; + r3t = r2t[i] & _REGION_ENTRY_ORIGIN; + r2t[i] = _REGION2_ENTRY_EMPTY; + __gmap_unshadow_r3t(sg, raddr, __va(r3t)); + /* Free region 3 table */ + page = phys_to_page(r3t); + __free_pages(page, CRST_ALLOC_ORDER); + } +} + +/** + * gmap_unshadow_r2t - remove a shadow region-2 table from a region-1 entry + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * + * Called with the sg->guest_table_lock + */ +static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr) +{ + unsigned long r1o, *r1e; + struct page *page; + phys_addr_t r2t; + + BUG_ON(!gmap_is_shadow(sg)); + r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */ + if 
(!r1e || !(*r1e & _REGION_ENTRY_ORIGIN)) + return; + gmap_call_notifier(sg, raddr, raddr + _REGION1_SIZE - 1); + r1o = (unsigned long) (r1e - ((raddr & _REGION1_INDEX) >> _REGION1_SHIFT)); + gmap_idte_one(__pa(r1o) | _ASCE_TYPE_REGION1, raddr); + r2t = *r1e & _REGION_ENTRY_ORIGIN; + *r1e = _REGION1_ENTRY_EMPTY; + __gmap_unshadow_r2t(sg, raddr, __va(r2t)); + /* Free region 2 table */ + page = phys_to_page(r2t); + __free_pages(page, CRST_ALLOC_ORDER); +} + +/** + * __gmap_unshadow_r1t - remove all entries from a shadow region-1 table + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * @r1t: pointer to the start of a shadow region-1 table + * + * Called with the shadow->guest_table_lock + */ +static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr, + unsigned long *r1t) +{ + unsigned long asce; + struct page *page; + phys_addr_t r2t; + int i; + + BUG_ON(!gmap_is_shadow(sg)); + asce = __pa(r1t) | _ASCE_TYPE_REGION1; + for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION1_SIZE) { + if (!(r1t[i] & _REGION_ENTRY_ORIGIN)) + continue; + r2t = r1t[i] & _REGION_ENTRY_ORIGIN; + __gmap_unshadow_r2t(sg, raddr, __va(r2t)); + /* Clear entry and flush translation r1t -> r2t */ + gmap_idte_one(asce, raddr); + r1t[i] = _REGION1_ENTRY_EMPTY; + /* Free region 2 table */ + page = phys_to_page(r2t); + __free_pages(page, CRST_ALLOC_ORDER); + } +} + +/** + * gmap_unshadow - remove a shadow page table completely + * @sg: pointer to the shadow guest address space structure + * + * Called with sg->guest_table_lock + */ +void gmap_unshadow(struct gmap *sg) +{ + unsigned long *table; + + BUG_ON(!gmap_is_shadow(sg)); + if (sg->removed) + return; + sg->removed = 1; + gmap_call_notifier(sg, 0, -1UL); + gmap_flush_tlb(sg); + table = __va(sg->asce & _ASCE_ORIGIN); + switch (sg->asce & _ASCE_TYPE_MASK) { + case _ASCE_TYPE_REGION1: + __gmap_unshadow_r1t(sg, 0, table); + break; + case _ASCE_TYPE_REGION2: + __gmap_unshadow_r2t(sg, 0, table); + break; + case _ASCE_TYPE_REGION3: + __gmap_unshadow_r3t(sg, 0, table); + break; + case _ASCE_TYPE_SEGMENT: + __gmap_unshadow_sgt(sg, 0, table); + break; + } +} +EXPORT_SYMBOL(gmap_unshadow); + +/** + * gmap_shadow_r2t - create an empty shadow region 2 table + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * @r2t: parent gmap address of the region 2 table to get shadowed + * @fake: r2t references contiguous guest memory block, not a r2t + * + * The r2t parameter specifies the address of the source table. The + * four pages of the source table are made read-only in the parent gmap + * address space. A write to the source table area @r2t will automatically + * remove the shadow r2 table and all of its descendants. + * + * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the + * shadow table structure is incomplete, -ENOMEM if out of memory and + * -EFAULT if an address in the parent gmap could not be resolved. + * + * Called with sg->mm->mmap_lock in read. 
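+ *
+ * As a worked example for the protection range computed below (derived from
+ * the code, shown here for illustration): the table offset (TF) and table
+ * length (TL) bits encoded in @r2t select which of the four 4 KB pages of
+ * the source table are valid. With TF = 0 and TL = 3 this yields
+ * offset = 0 and len = 4 * PAGE_SIZE, i.e. the complete 16 KB region-2
+ * table is made read-only in the parent gmap.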
+ */ +int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, + int fake) +{ + unsigned long raddr, origin, offset, len; + unsigned long *table; + phys_addr_t s_r2t; + struct page *page; + int rc; + + BUG_ON(!gmap_is_shadow(sg)); + /* Allocate a shadow region second table */ + page = gmap_alloc_crst(); + if (!page) + return -ENOMEM; + s_r2t = page_to_phys(page); + /* Install shadow region second table */ + spin_lock(&sg->guest_table_lock); + table = gmap_table_walk(sg, saddr, 4); /* get region-1 pointer */ + if (!table) { + rc = -EAGAIN; /* Race with unshadow */ + goto out_free; + } + if (!(*table & _REGION_ENTRY_INVALID)) { + rc = 0; /* Already established */ + goto out_free; + } else if (*table & _REGION_ENTRY_ORIGIN) { + rc = -EAGAIN; /* Race with shadow */ + goto out_free; + } + crst_table_init(__va(s_r2t), _REGION2_ENTRY_EMPTY); + /* mark as invalid as long as the parent table is not protected */ + *table = s_r2t | _REGION_ENTRY_LENGTH | + _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID; + if (sg->edat_level >= 1) + *table |= (r2t & _REGION_ENTRY_PROTECT); + if (fake) { + /* nothing to protect for fake tables */ + *table &= ~_REGION_ENTRY_INVALID; + spin_unlock(&sg->guest_table_lock); + return 0; + } + spin_unlock(&sg->guest_table_lock); + /* Make r2t read-only in parent gmap page table */ + raddr = (saddr & _REGION1_MASK) | _SHADOW_RMAP_REGION1; + origin = r2t & _REGION_ENTRY_ORIGIN; + offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE; + len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset; + rc = gmap_protect_rmap(sg, raddr, origin + offset, len); + spin_lock(&sg->guest_table_lock); + if (!rc) { + table = gmap_table_walk(sg, saddr, 4); + if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r2t) + rc = -EAGAIN; /* Race with unshadow */ + else + *table &= ~_REGION_ENTRY_INVALID; + } else { + gmap_unshadow_r2t(sg, raddr); + } + spin_unlock(&sg->guest_table_lock); + return rc; +out_free: + spin_unlock(&sg->guest_table_lock); + __free_pages(page, CRST_ALLOC_ORDER); + return rc; +} +EXPORT_SYMBOL_GPL(gmap_shadow_r2t); + +/** + * gmap_shadow_r3t - create a shadow region 3 table + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * @r3t: parent gmap address of the region 3 table to get shadowed + * @fake: r3t references contiguous guest memory block, not a r3t + * + * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the + * shadow table structure is incomplete, -ENOMEM if out of memory and + * -EFAULT if an address in the parent gmap could not be resolved. + * + * Called with sg->mm->mmap_lock in read. 
+ */ +int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, + int fake) +{ + unsigned long raddr, origin, offset, len; + unsigned long *table; + phys_addr_t s_r3t; + struct page *page; + int rc; + + BUG_ON(!gmap_is_shadow(sg)); + /* Allocate a shadow region second table */ + page = gmap_alloc_crst(); + if (!page) + return -ENOMEM; + s_r3t = page_to_phys(page); + /* Install shadow region second table */ + spin_lock(&sg->guest_table_lock); + table = gmap_table_walk(sg, saddr, 3); /* get region-2 pointer */ + if (!table) { + rc = -EAGAIN; /* Race with unshadow */ + goto out_free; + } + if (!(*table & _REGION_ENTRY_INVALID)) { + rc = 0; /* Already established */ + goto out_free; + } else if (*table & _REGION_ENTRY_ORIGIN) { + rc = -EAGAIN; /* Race with shadow */ + goto out_free; + } + crst_table_init(__va(s_r3t), _REGION3_ENTRY_EMPTY); + /* mark as invalid as long as the parent table is not protected */ + *table = s_r3t | _REGION_ENTRY_LENGTH | + _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID; + if (sg->edat_level >= 1) + *table |= (r3t & _REGION_ENTRY_PROTECT); + if (fake) { + /* nothing to protect for fake tables */ + *table &= ~_REGION_ENTRY_INVALID; + spin_unlock(&sg->guest_table_lock); + return 0; + } + spin_unlock(&sg->guest_table_lock); + /* Make r3t read-only in parent gmap page table */ + raddr = (saddr & _REGION2_MASK) | _SHADOW_RMAP_REGION2; + origin = r3t & _REGION_ENTRY_ORIGIN; + offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE; + len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset; + rc = gmap_protect_rmap(sg, raddr, origin + offset, len); + spin_lock(&sg->guest_table_lock); + if (!rc) { + table = gmap_table_walk(sg, saddr, 3); + if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r3t) + rc = -EAGAIN; /* Race with unshadow */ + else + *table &= ~_REGION_ENTRY_INVALID; + } else { + gmap_unshadow_r3t(sg, raddr); + } + spin_unlock(&sg->guest_table_lock); + return rc; +out_free: + spin_unlock(&sg->guest_table_lock); + __free_pages(page, CRST_ALLOC_ORDER); + return rc; +} +EXPORT_SYMBOL_GPL(gmap_shadow_r3t); + +/** + * gmap_shadow_sgt - create a shadow segment table + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * @sgt: parent gmap address of the segment table to get shadowed + * @fake: sgt references contiguous guest memory block, not a sgt + * + * Returns: 0 if successfully shadowed or already shadowed, -EAGAIN if the + * shadow table structure is incomplete, -ENOMEM if out of memory and + * -EFAULT if an address in the parent gmap could not be resolved. + * + * Called with sg->mm->mmap_lock in read. 
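+ *
+ * A rough caller sketch (illustrative only; r2t_entry, r3t_entry, sgt_entry
+ * and pgt_entry are hypothetical names for the guest's table entries, and
+ * which levels are actually needed depends on the shadow ASCE type):
+ *
+ *	rc = gmap_shadow_r2t(sg, saddr, r2t_entry, fake);
+ *	if (!rc)
+ *		rc = gmap_shadow_r3t(sg, saddr, r3t_entry, fake);
+ *	if (!rc)
+ *		rc = gmap_shadow_sgt(sg, saddr, sgt_entry, fake);
+ *	if (!rc)
+ *		rc = gmap_shadow_pgt(sg, saddr, pgt_entry, fake);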
+ */ +int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, + int fake) +{ + unsigned long raddr, origin, offset, len; + unsigned long *table; + phys_addr_t s_sgt; + struct page *page; + int rc; + + BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE)); + /* Allocate a shadow segment table */ + page = gmap_alloc_crst(); + if (!page) + return -ENOMEM; + s_sgt = page_to_phys(page); + /* Install shadow region second table */ + spin_lock(&sg->guest_table_lock); + table = gmap_table_walk(sg, saddr, 2); /* get region-3 pointer */ + if (!table) { + rc = -EAGAIN; /* Race with unshadow */ + goto out_free; + } + if (!(*table & _REGION_ENTRY_INVALID)) { + rc = 0; /* Already established */ + goto out_free; + } else if (*table & _REGION_ENTRY_ORIGIN) { + rc = -EAGAIN; /* Race with shadow */ + goto out_free; + } + crst_table_init(__va(s_sgt), _SEGMENT_ENTRY_EMPTY); + /* mark as invalid as long as the parent table is not protected */ + *table = s_sgt | _REGION_ENTRY_LENGTH | + _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID; + if (sg->edat_level >= 1) + *table |= sgt & _REGION_ENTRY_PROTECT; + if (fake) { + /* nothing to protect for fake tables */ + *table &= ~_REGION_ENTRY_INVALID; + spin_unlock(&sg->guest_table_lock); + return 0; + } + spin_unlock(&sg->guest_table_lock); + /* Make sgt read-only in parent gmap page table */ + raddr = (saddr & _REGION3_MASK) | _SHADOW_RMAP_REGION3; + origin = sgt & _REGION_ENTRY_ORIGIN; + offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE; + len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset; + rc = gmap_protect_rmap(sg, raddr, origin + offset, len); + spin_lock(&sg->guest_table_lock); + if (!rc) { + table = gmap_table_walk(sg, saddr, 2); + if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_sgt) + rc = -EAGAIN; /* Race with unshadow */ + else + *table &= ~_REGION_ENTRY_INVALID; + } else { + gmap_unshadow_sgt(sg, raddr); + } + spin_unlock(&sg->guest_table_lock); + return rc; +out_free: + spin_unlock(&sg->guest_table_lock); + __free_pages(page, CRST_ALLOC_ORDER); + return rc; +} +EXPORT_SYMBOL_GPL(gmap_shadow_sgt); + +static void gmap_pgste_set_pgt_addr(struct ptdesc *ptdesc, unsigned long pgt_addr) +{ + unsigned long *pgstes = page_to_virt(ptdesc_page(ptdesc)); + + pgstes += _PAGE_ENTRIES; + + pgstes[0] &= ~PGSTE_ST2_MASK; + pgstes[1] &= ~PGSTE_ST2_MASK; + pgstes[2] &= ~PGSTE_ST2_MASK; + pgstes[3] &= ~PGSTE_ST2_MASK; + + pgstes[0] |= (pgt_addr >> 16) & PGSTE_ST2_MASK; + pgstes[1] |= pgt_addr & PGSTE_ST2_MASK; + pgstes[2] |= (pgt_addr << 16) & PGSTE_ST2_MASK; + pgstes[3] |= (pgt_addr << 32) & PGSTE_ST2_MASK; +} + +/** + * gmap_shadow_pgt - instantiate a shadow page table + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * @pgt: parent gmap address of the page table to get shadowed + * @fake: pgt references contiguous guest memory block, not a pgtable + * + * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the + * shadow table structure is incomplete, -ENOMEM if out of memory, + * -EFAULT if an address in the parent gmap could not be resolved and + * + * Called with gmap->mm->mmap_lock in read + */ +int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, + int fake) +{ + unsigned long raddr, origin; + unsigned long *table; + struct ptdesc *ptdesc; + phys_addr_t s_pgt; + int rc; + + BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE)); + /* Allocate a shadow page table */ + ptdesc = page_table_alloc_pgste(sg->mm); + if 
(!ptdesc) + return -ENOMEM; + origin = pgt & _SEGMENT_ENTRY_ORIGIN; + if (fake) + origin |= GMAP_SHADOW_FAKE_TABLE; + gmap_pgste_set_pgt_addr(ptdesc, origin); + s_pgt = page_to_phys(ptdesc_page(ptdesc)); + /* Install shadow page table */ + spin_lock(&sg->guest_table_lock); + table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */ + if (!table) { + rc = -EAGAIN; /* Race with unshadow */ + goto out_free; + } + if (!(*table & _SEGMENT_ENTRY_INVALID)) { + rc = 0; /* Already established */ + goto out_free; + } else if (*table & _SEGMENT_ENTRY_ORIGIN) { + rc = -EAGAIN; /* Race with shadow */ + goto out_free; + } + /* mark as invalid as long as the parent table is not protected */ + *table = (unsigned long) s_pgt | _SEGMENT_ENTRY | + (pgt & _SEGMENT_ENTRY_PROTECT) | _SEGMENT_ENTRY_INVALID; + if (fake) { + /* nothing to protect for fake tables */ + *table &= ~_SEGMENT_ENTRY_INVALID; + spin_unlock(&sg->guest_table_lock); + return 0; + } + spin_unlock(&sg->guest_table_lock); + /* Make pgt read-only in parent gmap page table (not the pgste) */ + raddr = (saddr & _SEGMENT_MASK) | _SHADOW_RMAP_SEGMENT; + origin = pgt & _SEGMENT_ENTRY_ORIGIN & PAGE_MASK; + rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE); + spin_lock(&sg->guest_table_lock); + if (!rc) { + table = gmap_table_walk(sg, saddr, 1); + if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) != s_pgt) + rc = -EAGAIN; /* Race with unshadow */ + else + *table &= ~_SEGMENT_ENTRY_INVALID; + } else { + gmap_unshadow_pgt(sg, raddr); + } + spin_unlock(&sg->guest_table_lock); + return rc; +out_free: + spin_unlock(&sg->guest_table_lock); + page_table_free_pgste(ptdesc); + return rc; + +} +EXPORT_SYMBOL_GPL(gmap_shadow_pgt); + +/** + * gmap_shadow_page - create a shadow page mapping + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * @pte: pte in parent gmap address space to get shadowed + * + * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the + * shadow table structure is incomplete, -ENOMEM if out of memory and + * -EFAULT if an address in the parent gmap could not be resolved. + * + * Called with sg->mm->mmap_lock in read. + */ +int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte) +{ + struct gmap *parent; + struct gmap_rmap *rmap; + unsigned long vmaddr, paddr; + spinlock_t *ptl; + pte_t *sptep, *tptep; + int prot; + int rc; + + BUG_ON(!gmap_is_shadow(sg)); + parent = sg->parent; + prot = (pte_val(pte) & _PAGE_PROTECT) ? 
PROT_READ : PROT_WRITE; + + rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT); + if (!rmap) + return -ENOMEM; + rmap->raddr = (saddr & PAGE_MASK) | _SHADOW_RMAP_PGTABLE; + + while (1) { + paddr = pte_val(pte) & PAGE_MASK; + vmaddr = __gmap_translate(parent, paddr); + if (IS_ERR_VALUE(vmaddr)) { + rc = vmaddr; + break; + } + rc = radix_tree_preload(GFP_KERNEL_ACCOUNT); + if (rc) + break; + rc = -EAGAIN; + sptep = gmap_pte_op_walk(parent, paddr, &ptl); + if (sptep) { + spin_lock(&sg->guest_table_lock); + /* Get page table pointer */ + tptep = (pte_t *) gmap_table_walk(sg, saddr, 0); + if (!tptep) { + spin_unlock(&sg->guest_table_lock); + gmap_pte_op_end(sptep, ptl); + radix_tree_preload_end(); + break; + } + rc = ptep_shadow_pte(sg->mm, saddr, sptep, tptep, pte); + if (rc > 0) { + /* Success and a new mapping */ + gmap_insert_rmap(sg, vmaddr, rmap); + rmap = NULL; + rc = 0; + } + gmap_pte_op_end(sptep, ptl); + spin_unlock(&sg->guest_table_lock); + } + radix_tree_preload_end(); + if (!rc) + break; + rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot); + if (rc) + break; + } + kfree(rmap); + return rc; +} +EXPORT_SYMBOL_GPL(gmap_shadow_page); + +/* + * gmap_shadow_notify - handle notifications for shadow gmap + * + * Called with sg->parent->shadow_lock. + */ +static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr, + unsigned long gaddr) +{ + struct gmap_rmap *rmap, *rnext, *head; + unsigned long start, end, bits, raddr; + + BUG_ON(!gmap_is_shadow(sg)); + + spin_lock(&sg->guest_table_lock); + if (sg->removed) { + spin_unlock(&sg->guest_table_lock); + return; + } + /* Check for top level table */ + start = sg->orig_asce & _ASCE_ORIGIN; + end = start + ((sg->orig_asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE; + if (!(sg->orig_asce & _ASCE_REAL_SPACE) && gaddr >= start && + gaddr < end) { + /* The complete shadow table has to go */ + gmap_unshadow(sg); + spin_unlock(&sg->guest_table_lock); + list_del(&sg->list); + gmap_put(sg); + return; + } + /* Remove the page table tree from on specific entry */ + head = radix_tree_delete(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT); + gmap_for_each_rmap_safe(rmap, rnext, head) { + bits = rmap->raddr & _SHADOW_RMAP_MASK; + raddr = rmap->raddr ^ bits; + switch (bits) { + case _SHADOW_RMAP_REGION1: + gmap_unshadow_r2t(sg, raddr); + break; + case _SHADOW_RMAP_REGION2: + gmap_unshadow_r3t(sg, raddr); + break; + case _SHADOW_RMAP_REGION3: + gmap_unshadow_sgt(sg, raddr); + break; + case _SHADOW_RMAP_SEGMENT: + gmap_unshadow_pgt(sg, raddr); + break; + case _SHADOW_RMAP_PGTABLE: + gmap_unshadow_page(sg, raddr); + break; + } + kfree(rmap); + } + spin_unlock(&sg->guest_table_lock); +} + +/** + * ptep_notify - call all invalidation callbacks for a specific pte. + * @mm: pointer to the process mm_struct + * @vmaddr: virtual address in the process address space + * @pte: pointer to the page table entry + * @bits: bits from the pgste that caused the notify call + * + * This function is assumed to be called with the page table lock held + * for the pte to notify. 
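+ *
+ * The offset arithmetic at the start of this function converts the position
+ * of @pte within its 256-entry page table into a guest address offset:
+ * entry index i sits at byte offset i * sizeof(pte_t) = 8 * i, and
+ * multiplying by PAGE_SIZE / sizeof(pte_t) = 512 turns that into
+ * i * PAGE_SIZE. Worked example: index 5 -> byte offset 40 ->
+ * 40 * 512 = 20480 = 5 * 4096.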
+ */ +void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, + pte_t *pte, unsigned long bits) +{ + unsigned long offset, gaddr = 0; + struct gmap *gmap, *sg, *next; + + offset = ((unsigned long) pte) & (255 * sizeof(pte_t)); + offset = offset * (PAGE_SIZE / sizeof(pte_t)); + rcu_read_lock(); + list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { + spin_lock(&gmap->guest_table_lock); + gaddr = host_to_guest_lookup(gmap, vmaddr) + offset; + spin_unlock(&gmap->guest_table_lock); + if (!IS_GADDR_VALID(gaddr)) + continue; + + if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) { + spin_lock(&gmap->shadow_lock); + list_for_each_entry_safe(sg, next, + &gmap->children, list) + gmap_shadow_notify(sg, vmaddr, gaddr); + spin_unlock(&gmap->shadow_lock); + } + if (bits & PGSTE_IN_BIT) + gmap_call_notifier(gmap, gaddr, gaddr + PAGE_SIZE - 1); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(ptep_notify); + +static void pmdp_notify_gmap(struct gmap *gmap, pmd_t *pmdp, + unsigned long gaddr) +{ + set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN))); + gmap_call_notifier(gmap, gaddr, gaddr + HPAGE_SIZE - 1); +} + +/** + * gmap_pmdp_xchg - exchange a gmap pmd with another + * @gmap: pointer to the guest address space structure + * @pmdp: pointer to the pmd entry + * @new: replacement entry + * @gaddr: the affected guest address + * + * This function is assumed to be called with the guest_table_lock + * held. + */ +static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new, + unsigned long gaddr) +{ + gaddr &= HPAGE_MASK; + pmdp_notify_gmap(gmap, pmdp, gaddr); + new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_GMAP_IN)); + if (machine_has_tlb_guest()) + __pmdp_idte(gaddr, (pmd_t *)pmdp, IDTE_GUEST_ASCE, gmap->asce, + IDTE_GLOBAL); + else + __pmdp_idte(gaddr, (pmd_t *)pmdp, 0, 0, IDTE_GLOBAL); + set_pmd(pmdp, new); +} + +static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr, + int purge) +{ + pmd_t *pmdp; + struct gmap *gmap; + unsigned long gaddr; + + rcu_read_lock(); + list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { + spin_lock(&gmap->guest_table_lock); + pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr); + if (pmdp) { + pmdp_notify_gmap(gmap, pmdp, gaddr); + WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | + _SEGMENT_ENTRY_GMAP_UC | + _SEGMENT_ENTRY)); + if (purge) + __pmdp_cspg(pmdp); + set_pmd(pmdp, __pmd(_SEGMENT_ENTRY_EMPTY)); + } + spin_unlock(&gmap->guest_table_lock); + } + rcu_read_unlock(); +} + +/** + * gmap_pmdp_invalidate - invalidate all affected guest pmd entries without + * flushing + * @mm: pointer to the process mm_struct + * @vmaddr: virtual address in the process address space + */ +void gmap_pmdp_invalidate(struct mm_struct *mm, unsigned long vmaddr) +{ + gmap_pmdp_clear(mm, vmaddr, 0); +} +EXPORT_SYMBOL_GPL(gmap_pmdp_invalidate); + +/** + * gmap_pmdp_idte_local - invalidate and clear a guest pmd entry + * @mm: pointer to the process mm_struct + * @vmaddr: virtual address in the process address space + */ +void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr) +{ + unsigned long gaddr; + struct gmap *gmap; + pmd_t *pmdp; + + rcu_read_lock(); + list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { + spin_lock(&gmap->guest_table_lock); + pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr); + if (pmdp) { + pmdp_notify_gmap(gmap, pmdp, gaddr); + WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | + _SEGMENT_ENTRY_GMAP_UC | + _SEGMENT_ENTRY)); + if 
(machine_has_tlb_guest()) + __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE, + gmap->asce, IDTE_LOCAL); + else + __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_LOCAL); + *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY); + } + spin_unlock(&gmap->guest_table_lock); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(gmap_pmdp_idte_local); + +/** + * gmap_pmdp_idte_global - invalidate and clear a guest pmd entry + * @mm: pointer to the process mm_struct + * @vmaddr: virtual address in the process address space + */ +void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr) +{ + unsigned long gaddr; + struct gmap *gmap; + pmd_t *pmdp; + + rcu_read_lock(); + list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { + spin_lock(&gmap->guest_table_lock); + pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr); + if (pmdp) { + pmdp_notify_gmap(gmap, pmdp, gaddr); + WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | + _SEGMENT_ENTRY_GMAP_UC | + _SEGMENT_ENTRY)); + if (machine_has_tlb_guest()) + __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE, + gmap->asce, IDTE_GLOBAL); + else + __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_GLOBAL); + *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY); + } + spin_unlock(&gmap->guest_table_lock); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(gmap_pmdp_idte_global); + +/** + * gmap_test_and_clear_dirty_pmd - test and reset segment dirty status + * @gmap: pointer to guest address space + * @pmdp: pointer to the pmd to be tested + * @gaddr: virtual address in the guest address space + * + * This function is assumed to be called with the guest_table_lock + * held. + */ +static bool gmap_test_and_clear_dirty_pmd(struct gmap *gmap, pmd_t *pmdp, + unsigned long gaddr) +{ + if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID) + return false; + + /* Already protected memory, which did not change is clean */ + if (pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT && + !(pmd_val(*pmdp) & _SEGMENT_ENTRY_GMAP_UC)) + return false; + + /* Clear UC indication and reset protection */ + set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_UC))); + gmap_protect_pmd(gmap, gaddr, pmdp, PROT_READ, 0); + return true; +} + +/** + * gmap_sync_dirty_log_pmd - set bitmap based on dirty status of segment + * @gmap: pointer to guest address space + * @bitmap: dirty bitmap for this pmd + * @gaddr: virtual address in the guest address space + * @vmaddr: virtual address in the host address space + * + * This function is assumed to be called with the guest_table_lock + * held. 
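+ *
+ * A minimal caller sketch (illustrative only; kvm, gaddr and vmaddr are
+ * assumed to be set up by the caller). @bitmap covers one 1 MB segment,
+ * one bit per 4 KB page:
+ *
+ *	unsigned long bitmap[4] = {};
+ *	unsigned int i;
+ *
+ *	gmap_sync_dirty_log_pmd(gmap, bitmap, gaddr, vmaddr);
+ *	for_each_set_bit(i, bitmap, _PAGE_ENTRIES)
+ *		mark_page_dirty(kvm, gpa_to_gfn(gaddr) + i);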
+ */ +void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4], + unsigned long gaddr, unsigned long vmaddr) +{ + int i; + pmd_t *pmdp; + pte_t *ptep; + spinlock_t *ptl; + + pmdp = gmap_pmd_op_walk(gmap, gaddr); + if (!pmdp) + return; + + if (pmd_leaf(*pmdp)) { + if (gmap_test_and_clear_dirty_pmd(gmap, pmdp, gaddr)) + bitmap_fill(bitmap, _PAGE_ENTRIES); + } else { + for (i = 0; i < _PAGE_ENTRIES; i++, vmaddr += PAGE_SIZE) { + ptep = pte_alloc_map_lock(gmap->mm, pmdp, vmaddr, &ptl); + if (!ptep) + continue; + if (ptep_test_and_clear_uc(gmap->mm, vmaddr, ptep)) + set_bit(i, bitmap); + pte_unmap_unlock(ptep, ptl); + } + } + gmap_pmd_op_end(gmap, pmdp); +} +EXPORT_SYMBOL_GPL(gmap_sync_dirty_log_pmd); + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static int thp_split_walk_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + struct vm_area_struct *vma = walk->vma; + + split_huge_pmd(vma, pmd, addr); + return 0; +} + +static const struct mm_walk_ops thp_split_walk_ops = { + .pmd_entry = thp_split_walk_pmd_entry, + .walk_lock = PGWALK_WRLOCK_VERIFY, +}; + +static inline void thp_split_mm(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); + + for_each_vma(vmi, vma) { + vm_flags_mod(vma, VM_NOHUGEPAGE, VM_HUGEPAGE); + walk_page_vma(vma, &thp_split_walk_ops, NULL); + } + mm->def_flags |= VM_NOHUGEPAGE; +} +#else +static inline void thp_split_mm(struct mm_struct *mm) +{ +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +/* + * switch on pgstes for its userspace process (for kvm) + */ +int s390_enable_sie(void) +{ + struct mm_struct *mm = current->mm; + + /* Do we have pgstes? if yes, we are done */ + if (mm_has_pgste(mm)) + return 0; + mmap_write_lock(mm); + mm->context.has_pgste = 1; + /* split thp mappings and disable thp for future mappings */ + thp_split_mm(mm); + mmap_write_unlock(mm); + return 0; +} +EXPORT_SYMBOL_GPL(s390_enable_sie); + +/* + * Enable storage key handling from now on and initialize the storage + * keys with the default key. + */ +static int __s390_enable_skey_pte(pte_t *pte, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + /* Clear storage key */ + ptep_zap_key(walk->mm, addr, pte); + return 0; +} + +/* + * Give a chance to schedule after setting a key to 256 pages. + * We only hold the mm lock, which is a rwsem and the kvm srcu. + * Both can sleep. + */ +static int __s390_enable_skey_pmd(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + cond_resched(); + return 0; +} + +static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr, + unsigned long hmask, unsigned long next, + struct mm_walk *walk) +{ + pmd_t *pmd = (pmd_t *)pte; + unsigned long start, end; + struct folio *folio = page_folio(pmd_page(*pmd)); + + /* + * The write check makes sure we do not set a key on shared + * memory. This is needed as the walker does not differentiate + * between actual guest memory and the process executable or + * shared libraries. 
+ */ + if (pmd_val(*pmd) & _SEGMENT_ENTRY_INVALID || + !(pmd_val(*pmd) & _SEGMENT_ENTRY_WRITE)) + return 0; + + start = pmd_val(*pmd) & HPAGE_MASK; + end = start + HPAGE_SIZE; + __storage_key_init_range(start, end); + set_bit(PG_arch_1, &folio->flags.f); + cond_resched(); + return 0; +} + +static const struct mm_walk_ops enable_skey_walk_ops = { + .hugetlb_entry = __s390_enable_skey_hugetlb, + .pte_entry = __s390_enable_skey_pte, + .pmd_entry = __s390_enable_skey_pmd, + .walk_lock = PGWALK_WRLOCK, +}; + +int s390_enable_skey(void) +{ + struct mm_struct *mm = current->mm; + int rc = 0; + + mmap_write_lock(mm); + if (mm_uses_skeys(mm)) + goto out_up; + + mm->context.uses_skeys = 1; + rc = gmap_helper_disable_cow_sharing(); + if (rc) { + mm->context.uses_skeys = 0; + goto out_up; + } + walk_page_range(mm, 0, TASK_SIZE, &enable_skey_walk_ops, NULL); + +out_up: + mmap_write_unlock(mm); + return rc; +} +EXPORT_SYMBOL_GPL(s390_enable_skey); + +/* + * Reset CMMA state, make all pages stable again. + */ +static int __s390_reset_cmma(pte_t *pte, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + ptep_zap_unused(walk->mm, addr, pte, 1); + return 0; +} + +static const struct mm_walk_ops reset_cmma_walk_ops = { + .pte_entry = __s390_reset_cmma, + .walk_lock = PGWALK_WRLOCK, +}; + +void s390_reset_cmma(struct mm_struct *mm) +{ + mmap_write_lock(mm); + walk_page_range(mm, 0, TASK_SIZE, &reset_cmma_walk_ops, NULL); + mmap_write_unlock(mm); +} +EXPORT_SYMBOL_GPL(s390_reset_cmma); + +#define GATHER_GET_PAGES 32 + +struct reset_walk_state { + unsigned long next; + unsigned long count; + unsigned long pfns[GATHER_GET_PAGES]; +}; + +static int s390_gather_pages(pte_t *ptep, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct reset_walk_state *p = walk->private; + pte_t pte = READ_ONCE(*ptep); + + if (pte_present(pte)) { + /* we have a reference from the mapping, take an extra one */ + get_page(phys_to_page(pte_val(pte))); + p->pfns[p->count] = phys_to_pfn(pte_val(pte)); + p->next = next; + p->count++; + } + return p->count >= GATHER_GET_PAGES; +} + +static const struct mm_walk_ops gather_pages_ops = { + .pte_entry = s390_gather_pages, + .walk_lock = PGWALK_RDLOCK, +}; + +/* + * Call the Destroy secure page UVC on each page in the given array of PFNs. + * Each page needs to have an extra reference, which will be released here. + */ +void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns) +{ + struct folio *folio; + unsigned long i; + + for (i = 0; i < count; i++) { + folio = pfn_folio(pfns[i]); + /* we always have an extra reference */ + uv_destroy_folio(folio); + /* get rid of the extra reference */ + folio_put(folio); + cond_resched(); + } +} +EXPORT_SYMBOL_GPL(s390_uv_destroy_pfns); + +/** + * __s390_uv_destroy_range - Call the destroy secure page UVC on each page + * in the given range of the given address space. + * @mm: the mm to operate on + * @start: the start of the range + * @end: the end of the range + * @interruptible: if not 0, stop when a fatal signal is received + * + * Walk the given range of the given address space and call the destroy + * secure page UVC on each page. Optionally exit early if a fatal signal is + * pending. 
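+ *
+ * A minimal usage sketch (illustrative, error handling elided): destroying
+ * all secure pages of an entire address space could look like
+ *
+ *	__s390_uv_destroy_range(mm, 0, TASK_SIZE, false);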
+ * + * Return: 0 on success, -EINTR if the function stopped before completing + */ +int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start, + unsigned long end, bool interruptible) +{ + struct reset_walk_state state = { .next = start }; + int r = 1; + + while (r > 0) { + state.count = 0; + mmap_read_lock(mm); + r = walk_page_range(mm, state.next, end, &gather_pages_ops, &state); + mmap_read_unlock(mm); + cond_resched(); + s390_uv_destroy_pfns(state.count, state.pfns); + if (interruptible && fatal_signal_pending(current)) + return -EINTR; + } + return 0; +} +EXPORT_SYMBOL_GPL(__s390_uv_destroy_range); + +/** + * s390_replace_asce - Try to replace the current ASCE of a gmap with a copy + * @gmap: the gmap whose ASCE needs to be replaced + * + * If the ASCE is a SEGMENT type then this function will return -EINVAL, + * otherwise the pointers in the host_to_guest radix tree will keep pointing + * to the wrong pages, causing use-after-free and memory corruption. + * If the allocation of the new top level page table fails, the ASCE is not + * replaced. + * In any case, the old ASCE is always removed from the gmap CRST list. + * Therefore the caller has to make sure to save a pointer to it + * beforehand, unless a leak is actually intended. + */ +int s390_replace_asce(struct gmap *gmap) +{ + unsigned long asce; + struct page *page; + void *table; + + /* Replacing segment type ASCEs would cause serious issues */ + if ((gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT) + return -EINVAL; + + page = gmap_alloc_crst(); + if (!page) + return -ENOMEM; + table = page_to_virt(page); + memcpy(table, gmap->table, 1UL << (CRST_ALLOC_ORDER + PAGE_SHIFT)); + + /* Set new table origin while preserving existing ASCE control bits */ + asce = (gmap->asce & ~_ASCE_ORIGIN) | __pa(table); + WRITE_ONCE(gmap->asce, asce); + WRITE_ONCE(gmap->mm->context.gmap_asce, asce); + WRITE_ONCE(gmap->table, table); + + return 0; +} +EXPORT_SYMBOL_GPL(s390_replace_asce); diff --git a/arch/s390/mm/gmap_helpers.c b/arch/s390/mm/gmap_helpers.c new file mode 100644 index 000000000000..549f14ad08af --- /dev/null +++ b/arch/s390/mm/gmap_helpers.c @@ -0,0 +1,233 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Helper functions for KVM guest address space mapping code + * + * Copyright IBM Corp. 2007, 2025 + */ + +#include <linux/export.h> +#include <linux/mm_types.h> +#include <linux/mmap_lock.h> +#include <linux/mm.h> +#include <linux/hugetlb.h> +#include <linux/swap.h> +#include <linux/leafops.h> +#include <linux/pagewalk.h> +#include <linux/ksm.h> +#include <asm/gmap_helpers.h> +#include <asm/pgtable.h> + +/** + * ptep_zap_softleaf_entry() - discard a software leaf entry. + * @mm: the mm + * @entry: the software leaf entry that needs to be zapped + * + * Discards the given software leaf entry. If the leaf entry was an actual + * swap entry (and not a migration entry, for example), the actual swapped + * page is also discarded from swap. + */ +static void ptep_zap_softleaf_entry(struct mm_struct *mm, softleaf_t entry) +{ + if (softleaf_is_swap(entry)) + dec_mm_counter(mm, MM_SWAPENTS); + else if (softleaf_is_migration(entry)) + dec_mm_counter(mm, mm_counter(softleaf_to_folio(entry))); + free_swap_and_cache(entry); +} + +/** + * gmap_helper_zap_one_page() - discard a page if it was swapped. + * @mm: the mm + * @vmaddr: the userspace virtual address that needs to be discarded + * + * If the given address maps to a swap entry, discard it. + * + * Context: needs to be called while holding the mmap lock. 
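+ *
+ * A minimal usage sketch (illustrative):
+ *
+ *	mmap_read_lock(mm);
+ *	gmap_helper_zap_one_page(mm, vmaddr);
+ *	mmap_read_unlock(mm);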
+ */
+void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr)
+{
+	struct vm_area_struct *vma;
+	spinlock_t *ptl;
+	pgste_t pgste;
+	pte_t *ptep;
+
+	mmap_assert_locked(mm);
+
+	/* Find the vm address for the guest address */
+	vma = vma_lookup(mm, vmaddr);
+	if (!vma || is_vm_hugetlb_page(vma))
+		return;
+
+	/* Get pointer to the page table entry */
+	ptep = get_locked_pte(mm, vmaddr, &ptl);
+	if (unlikely(!ptep))
+		return;
+	if (pte_swap(*ptep)) {
+		preempt_disable();
+		pgste = pgste_get_lock(ptep);
+
+		ptep_zap_softleaf_entry(mm, softleaf_from_pte(*ptep));
+		pte_clear(mm, vmaddr, ptep);
+
+		pgste_set_unlock(ptep, pgste);
+		preempt_enable();
+	}
+	pte_unmap_unlock(ptep, ptl);
+}
+EXPORT_SYMBOL_GPL(gmap_helper_zap_one_page);
+
+/**
+ * gmap_helper_discard() - discard user pages in the given range
+ * @mm: the mm
+ * @vmaddr: starting userspace address
+ * @end: end address (first address outside the range)
+
+ * All userspace pages in the range [@vmaddr, @end) are discarded and unmapped.
+ *
+ * Context: needs to be called while holding the mmap lock.
+ */
+void gmap_helper_discard(struct mm_struct *mm, unsigned long vmaddr, unsigned long end)
+{
+	struct vm_area_struct *vma;
+
+	mmap_assert_locked(mm);
+
+	while (vmaddr < end) {
+		vma = find_vma_intersection(mm, vmaddr, end);
+		if (!vma)
+			return;
+		if (!is_vm_hugetlb_page(vma))
+			zap_page_range_single(vma, vmaddr, min(end, vma->vm_end) - vmaddr, NULL);
+		vmaddr = vma->vm_end;
+	}
+}
+EXPORT_SYMBOL_GPL(gmap_helper_discard);
+
+static int find_zeropage_pte_entry(pte_t *pte, unsigned long addr,
+				   unsigned long end, struct mm_walk *walk)
+{
+	unsigned long *found_addr = walk->private;
+
+	/* Return 1 if the page is a zeropage. */
+	if (is_zero_pfn(pte_pfn(*pte))) {
+		/*
+		 * Shared zeropage in e.g., a FS DAX mapping? We cannot do the
+		 * right thing and likely don't care: FAULT_FLAG_UNSHARE
+		 * currently only works in COW mappings, which is also where
+		 * mm_forbids_zeropage() is checked.
+		 */
+		if (!is_cow_mapping(walk->vma->vm_flags))
+			return -EFAULT;
+
+		*found_addr = addr;
+		return 1;
+	}
+	return 0;
+}
+
+static const struct mm_walk_ops find_zeropage_ops = {
+	.pte_entry = find_zeropage_pte_entry,
+	.walk_lock = PGWALK_WRLOCK,
+};
+
+/** __gmap_helper_unshare_zeropages() - unshare all shared zeropages
+ * @mm: the mm whose zero pages are to be unshared
+ *
+ * Unshare all shared zeropages, replacing them by anonymous pages. Note that
+ * we cannot simply zap all shared zeropages, because this could later
+ * trigger unexpected userfaultfd missing events.
+ *
+ * This must be called after mm->context.allow_cow_sharing was
+ * set to 0, to avoid future mappings of shared zeropages.
+ *
+ * mm contracts with s390, that even if mm were to remove a page table,
+ * and racing with walk_page_range_vma() calling pte_offset_map_lock()
+ * would fail, it will never insert a page table containing empty zero
+ * pages once mm_forbids_zeropage(mm) i.e.
+ * mm->context.allow_cow_sharing is set to 0.
+ */
+static int __gmap_helper_unshare_zeropages(struct mm_struct *mm)
+{
+	struct vm_area_struct *vma;
+	VMA_ITERATOR(vmi, mm, 0);
+	unsigned long addr;
+	vm_fault_t fault;
+	int rc;
+
+	for_each_vma(vmi, vma) {
+		/*
+		 * We could only look at COW mappings, but it's more future
+		 * proof to catch unexpected zeropages in other mappings and
+		 * fail.
+		 */
+		if ((vma->vm_flags & VM_PFNMAP) || is_vm_hugetlb_page(vma))
+			continue;
+		addr = vma->vm_start;
+
+retry:
+		rc = walk_page_range_vma(vma, addr, vma->vm_end,
+					 &find_zeropage_ops, &addr);
+		if (rc < 0)
+			return rc;
+		else if (!rc)
+			continue;
+
+		/* addr was updated by find_zeropage_pte_entry() */
+		fault = handle_mm_fault(vma, addr,
+					FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE,
+					NULL);
+		if (fault & VM_FAULT_OOM)
+			return -ENOMEM;
+		/*
+		 * See break_ksm(): even after handle_mm_fault() returned 0, we
+		 * must start the lookup from the current address, because
+		 * handle_mm_fault() may back out if there's any difficulty.
+		 *
+		 * VM_FAULT_SIGBUS and VM_FAULT_SIGSEGV are unexpected but
+		 * maybe they could trigger in the future on concurrent
+		 * truncation. In that case, the shared zeropage would be gone
+		 * and we can simply retry and make progress.
+		 */
+		cond_resched();
+		goto retry;
+	}
+
+	return 0;
+}
+
+/**
+ * gmap_helper_disable_cow_sharing() - disable all COW sharing
+ *
+ * Disable most COW-sharing of memory pages for the whole process:
+ * (1) Disable KSM and unmerge/unshare any KSM pages.
+ * (2) Disallow shared zeropages and unshare any zeropages that are mapped.
+ *
+ * Note that we currently don't bother with COW-shared pages that are shared
+ * with parent/child processes due to fork().
+ */
+int gmap_helper_disable_cow_sharing(void)
+{
+	struct mm_struct *mm = current->mm;
+	int rc;
+
+	mmap_assert_write_locked(mm);
+
+	if (!mm->context.allow_cow_sharing)
+		return 0;
+
+	mm->context.allow_cow_sharing = 0;
+
+	/* Replace all shared zeropages by anonymous pages. */
+	rc = __gmap_helper_unshare_zeropages(mm);
+	/*
+	 * Make sure to disable KSM (if enabled for the whole process or
+	 * individual VMAs). Note that nothing currently hinders user space
+	 * from re-enabling it.
+	 */
+	if (!rc)
+		rc = ksm_disable(mm);
+	if (rc)
+		mm->context.allow_cow_sharing = 1;
+	return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_helper_disable_cow_sharing);
diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c
deleted file mode 100644
index 1f5315d1215c..000000000000
--- a/arch/s390/mm/gup.c
+++ /dev/null
@@ -1,281 +0,0 @@
-/*
- * Lockless get_user_pages_fast for s390
- *
- * Copyright IBM Corp. 2010
- * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
- */
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/hugetlb.h>
-#include <linux/vmstat.h>
-#include <linux/pagemap.h>
-#include <linux/rwsem.h>
-#include <asm/pgtable.h>
-
-/*
- * The performance critical leaf functions are made noinline otherwise gcc
- * inlines everything into a single function which results in too much
- * register pressure.
- */
-static inline int gup_pte_range(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
-		unsigned long end, int write, struct page **pages, int *nr)
-{
-	unsigned long mask;
-	pte_t *ptep, pte;
-	struct page *page;
-
-	mask = (write ?
_PAGE_RO : 0) | _PAGE_INVALID | _PAGE_SPECIAL; - - ptep = ((pte_t *) pmd_deref(pmd)) + pte_index(addr); - do { - pte = *ptep; - barrier(); - if ((pte_val(pte) & mask) != 0) - return 0; - VM_BUG_ON(!pfn_valid(pte_pfn(pte))); - page = pte_page(pte); - if (!page_cache_get_speculative(page)) - return 0; - if (unlikely(pte_val(pte) != pte_val(*ptep))) { - put_page(page); - return 0; - } - pages[*nr] = page; - (*nr)++; - - } while (ptep++, addr += PAGE_SIZE, addr != end); - - return 1; -} - -static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) -{ - unsigned long mask, result; - struct page *head, *page, *tail; - int refs; - - result = write ? 0 : _SEGMENT_ENTRY_RO; - mask = result | _SEGMENT_ENTRY_INV; - if ((pmd_val(pmd) & mask) != result) - return 0; - VM_BUG_ON(!pfn_valid(pmd_val(pmd) >> PAGE_SHIFT)); - - refs = 0; - head = pmd_page(pmd); - page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); - tail = page; - do { - VM_BUG_ON(compound_head(page) != head); - pages[*nr] = page; - (*nr)++; - page++; - refs++; - } while (addr += PAGE_SIZE, addr != end); - - if (!page_cache_add_speculative(head, refs)) { - *nr -= refs; - return 0; - } - - if (unlikely(pmd_val(pmd) != pmd_val(*pmdp))) { - *nr -= refs; - while (refs--) - put_page(head); - return 0; - } - - /* - * Any tail page need their mapcount reference taken before we - * return. - */ - while (refs--) { - if (PageTail(tail)) - get_huge_page_tail(tail); - tail++; - } - - return 1; -} - - -static inline int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) -{ - unsigned long next; - pmd_t *pmdp, pmd; - - pmdp = (pmd_t *) pudp; -#ifdef CONFIG_64BIT - if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) - pmdp = (pmd_t *) pud_deref(pud); - pmdp += pmd_index(addr); -#endif - do { - pmd = *pmdp; - barrier(); - next = pmd_addr_end(addr, end); - /* - * The pmd_trans_splitting() check below explains why - * pmdp_splitting_flush() has to serialize with - * smp_call_function() against our disabled IRQs, to stop - * this gup-fast code from running while we set the - * splitting bit in the pmd. Returning zero will take - * the slow path that will call wait_split_huge_page() - * if the pmd is still in splitting state. - */ - if (pmd_none(pmd) || pmd_trans_splitting(pmd)) - return 0; - if (unlikely(pmd_large(pmd))) { - if (!gup_huge_pmd(pmdp, pmd, addr, next, - write, pages, nr)) - return 0; - } else if (!gup_pte_range(pmdp, pmd, addr, next, - write, pages, nr)) - return 0; - } while (pmdp++, addr = next, addr != end); - - return 1; -} - -static inline int gup_pud_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) -{ - unsigned long next; - pud_t *pudp, pud; - - pudp = (pud_t *) pgdp; -#ifdef CONFIG_64BIT - if ((pgd_val(pgd) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R2) - pudp = (pud_t *) pgd_deref(pgd); - pudp += pud_index(addr); -#endif - do { - pud = *pudp; - barrier(); - next = pud_addr_end(addr, end); - if (pud_none(pud)) - return 0; - if (!gup_pmd_range(pudp, pud, addr, next, write, pages, nr)) - return 0; - } while (pudp++, addr = next, addr != end); - - return 1; -} - -/* - * Like get_user_pages_fast() except its IRQ-safe in that it won't fall - * back to the regular GUP. 
- */ -int __get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) -{ - struct mm_struct *mm = current->mm; - unsigned long addr, len, end; - unsigned long next, flags; - pgd_t *pgdp, pgd; - int nr = 0; - - start &= PAGE_MASK; - addr = start; - len = (unsigned long) nr_pages << PAGE_SHIFT; - end = start + len; - if ((end < start) || (end > TASK_SIZE)) - return 0; - - local_irq_save(flags); - pgdp = pgd_offset(mm, addr); - do { - pgd = *pgdp; - barrier(); - next = pgd_addr_end(addr, end); - if (pgd_none(pgd)) - break; - if (!gup_pud_range(pgdp, pgd, addr, next, write, pages, &nr)) - break; - } while (pgdp++, addr = next, addr != end); - local_irq_restore(flags); - - return nr; -} - -/** - * get_user_pages_fast() - pin user pages in memory - * @start: starting user address - * @nr_pages: number of pages from start to pin - * @write: whether pages will be written to - * @pages: array that receives pointers to the pages pinned. - * Should be at least nr_pages long. - * - * Attempt to pin user pages in memory without taking mm->mmap_sem. - * If not successful, it will fall back to taking the lock and - * calling get_user_pages(). - * - * Returns number of pages pinned. This may be fewer than the number - * requested. If nr_pages is 0 or negative, returns 0. If no pages - * were pinned, returns -errno. - */ -int get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) -{ - struct mm_struct *mm = current->mm; - unsigned long addr, len, end; - unsigned long next; - pgd_t *pgdp, pgd; - int nr = 0; - - start &= PAGE_MASK; - addr = start; - len = (unsigned long) nr_pages << PAGE_SHIFT; - end = start + len; - if ((end < start) || (end > TASK_SIZE)) - goto slow_irqon; - - /* - * local_irq_disable() doesn't prevent pagetable teardown, but does - * prevent the pagetables from being freed on s390. - * - * So long as we atomically load page table pointers versus teardown, - * we can follow the address down to the the page and take a ref on it. - */ - local_irq_disable(); - pgdp = pgd_offset(mm, addr); - do { - pgd = *pgdp; - barrier(); - next = pgd_addr_end(addr, end); - if (pgd_none(pgd)) - goto slow; - if (!gup_pud_range(pgdp, pgd, addr, next, write, pages, &nr)) - goto slow; - } while (pgdp++, addr = next, addr != end); - local_irq_enable(); - - VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT); - return nr; - - { - int ret; -slow: - local_irq_enable(); -slow_irqon: - /* Try to get the remaining pages with get_user_pages */ - start += nr << PAGE_SHIFT; - pages += nr; - - down_read(&mm->mmap_sem); - ret = get_user_pages(current, mm, start, - (end - start) >> PAGE_SHIFT, write, 0, pages, NULL); - up_read(&mm->mmap_sem); - - /* Have to be a bit careful with return values */ - if (nr > 0) { - if (ret < 0) - ret = nr; - else - ret += nr; - } - - return ret; - } -} diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index 121089d57802..d42e61c7594e 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -1,132 +1,257 @@ +// SPDX-License-Identifier: GPL-2.0 /* * IBM System z Huge TLB Page Support for Kernel. * - * Copyright IBM Corp. 2007 + * Copyright IBM Corp. 
2007,2020 * Author(s): Gerald Schaefer <gerald.schaefer@de.ibm.com> */ +#define pr_fmt(fmt) "hugetlb: " fmt + +#include <linux/cpufeature.h> #include <linux/mm.h> #include <linux/hugetlb.h> +#include <linux/mman.h> +#include <linux/sched/mm.h> +#include <linux/security.h> +#include <asm/pgalloc.h> +/* + * If the bit selected by single-bit bitmask "a" is set within "x", move + * it to the position indicated by single-bit bitmask "b". + */ +#define move_set_bit(x, a, b) (((x) & (a)) >> ilog2(a) << ilog2(b)) -void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *pteptr, pte_t pteval) +static inline unsigned long __pte_to_rste(pte_t pte) { - pmd_t *pmdp = (pmd_t *) pteptr; - unsigned long mask; - - if (!MACHINE_HAS_HPAGE) { - pteptr = (pte_t *) pte_page(pteval)[1].index; - mask = pte_val(pteval) & - (_SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO); - pte_val(pteval) = (_SEGMENT_ENTRY + __pa(pteptr)) | mask; - } + swp_entry_t arch_entry; + unsigned long rste; - pmd_val(*pmdp) = pte_val(pteval); + /* + * Convert encoding pte bits pmd / pud bits + * lIR.uswrdy.p dy..R...I...wr + * empty 010.000000.0 -> 00..0...1...00 + * prot-none, clean, old 111.000000.1 -> 00..1...1...00 + * prot-none, clean, young 111.000001.1 -> 01..1...1...00 + * prot-none, dirty, old 111.000010.1 -> 10..1...1...00 + * prot-none, dirty, young 111.000011.1 -> 11..1...1...00 + * read-only, clean, old 111.000100.1 -> 00..1...1...01 + * read-only, clean, young 101.000101.1 -> 01..1...0...01 + * read-only, dirty, old 111.000110.1 -> 10..1...1...01 + * read-only, dirty, young 101.000111.1 -> 11..1...0...01 + * read-write, clean, old 111.001100.1 -> 00..1...1...11 + * read-write, clean, young 101.001101.1 -> 01..1...0...11 + * read-write, dirty, old 110.001110.1 -> 10..0...1...11 + * read-write, dirty, young 100.001111.1 -> 11..0...0...11 + * HW-bits: R read-only, I invalid + * SW-bits: p present, y young, d dirty, r read, w write, s special, + * u unused, l large + */ + if (pte_present(pte)) { + rste = pte_val(pte) & PAGE_MASK; + rste |= _SEGMENT_ENTRY_PRESENT; + rste |= move_set_bit(pte_val(pte), _PAGE_READ, + _SEGMENT_ENTRY_READ); + rste |= move_set_bit(pte_val(pte), _PAGE_WRITE, + _SEGMENT_ENTRY_WRITE); + rste |= move_set_bit(pte_val(pte), _PAGE_INVALID, + _SEGMENT_ENTRY_INVALID); + rste |= move_set_bit(pte_val(pte), _PAGE_PROTECT, + _SEGMENT_ENTRY_PROTECT); + rste |= move_set_bit(pte_val(pte), _PAGE_DIRTY, + _SEGMENT_ENTRY_DIRTY); + rste |= move_set_bit(pte_val(pte), _PAGE_YOUNG, + _SEGMENT_ENTRY_YOUNG); +#ifdef CONFIG_MEM_SOFT_DIRTY + rste |= move_set_bit(pte_val(pte), _PAGE_SOFT_DIRTY, + _SEGMENT_ENTRY_SOFT_DIRTY); +#endif + rste |= move_set_bit(pte_val(pte), _PAGE_NOEXEC, + _SEGMENT_ENTRY_NOEXEC); + } else if (!pte_none(pte)) { + /* swap pte */ + arch_entry = __pte_to_swp_entry(pte); + rste = mk_swap_rste(__swp_type(arch_entry), __swp_offset(arch_entry)); + } else + rste = _SEGMENT_ENTRY_EMPTY; + return rste; } -int arch_prepare_hugepage(struct page *page) +static inline pte_t __rste_to_pte(unsigned long rste) { - unsigned long addr = page_to_phys(page); + swp_entry_t arch_entry; + unsigned long pteval; + int present, none; pte_t pte; - pte_t *ptep; - int i; - if (MACHINE_HAS_HPAGE) - return 0; + if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) { + present = pud_present(__pud(rste)); + none = pud_none(__pud(rste)); + } else { + present = pmd_present(__pmd(rste)); + none = pmd_none(__pmd(rste)); + } - ptep = (pte_t *) pte_alloc_one(&init_mm, addr); - if (!ptep) - return -ENOMEM; + /* + * Convert 
encoding pmd / pud bits pte bits + * dy..R...I...wr lIR.uswrdy.p + * empty 00..0...1...00 -> 010.000000.0 + * prot-none, clean, old 00..1...1...00 -> 111.000000.1 + * prot-none, clean, young 01..1...1...00 -> 111.000001.1 + * prot-none, dirty, old 10..1...1...00 -> 111.000010.1 + * prot-none, dirty, young 11..1...1...00 -> 111.000011.1 + * read-only, clean, old 00..1...1...01 -> 111.000100.1 + * read-only, clean, young 01..1...0...01 -> 101.000101.1 + * read-only, dirty, old 10..1...1...01 -> 111.000110.1 + * read-only, dirty, young 11..1...0...01 -> 101.000111.1 + * read-write, clean, old 00..1...1...11 -> 111.001100.1 + * read-write, clean, young 01..1...0...11 -> 101.001101.1 + * read-write, dirty, old 10..0...1...11 -> 110.001110.1 + * read-write, dirty, young 11..0...0...11 -> 100.001111.1 + * HW-bits: R read-only, I invalid + * SW-bits: p present, y young, d dirty, r read, w write, s special, + * u unused, l large + */ + if (present) { + pteval = rste & _SEGMENT_ENTRY_ORIGIN_LARGE; + pteval |= _PAGE_LARGE | _PAGE_PRESENT; + pteval |= move_set_bit(rste, _SEGMENT_ENTRY_READ, _PAGE_READ); + pteval |= move_set_bit(rste, _SEGMENT_ENTRY_WRITE, _PAGE_WRITE); + pteval |= move_set_bit(rste, _SEGMENT_ENTRY_INVALID, _PAGE_INVALID); + pteval |= move_set_bit(rste, _SEGMENT_ENTRY_PROTECT, _PAGE_PROTECT); + pteval |= move_set_bit(rste, _SEGMENT_ENTRY_DIRTY, _PAGE_DIRTY); + pteval |= move_set_bit(rste, _SEGMENT_ENTRY_YOUNG, _PAGE_YOUNG); +#ifdef CONFIG_MEM_SOFT_DIRTY + pteval |= move_set_bit(rste, _SEGMENT_ENTRY_SOFT_DIRTY, _PAGE_SOFT_DIRTY); +#endif + pteval |= move_set_bit(rste, _SEGMENT_ENTRY_NOEXEC, _PAGE_NOEXEC); + } else if (!none) { + /* swap rste */ + arch_entry = __rste_to_swp_entry(rste); + pte = mk_swap_pte(__swp_type_rste(arch_entry), __swp_offset_rste(arch_entry)); + pteval = pte_val(pte); + } else + pteval = _PAGE_INVALID; + return __pte(pteval); +} + +static void clear_huge_pte_skeys(struct mm_struct *mm, unsigned long rste) +{ + struct folio *folio; + unsigned long size, paddr; + + if (!mm_uses_skeys(mm) || + rste & _SEGMENT_ENTRY_INVALID) + return; - pte_val(pte) = addr; - for (i = 0; i < PTRS_PER_PTE; i++) { - set_pte_at(&init_mm, addr + i * PAGE_SIZE, ptep + i, pte); - pte_val(pte) += PAGE_SIZE; + if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) { + folio = page_folio(pud_page(__pud(rste))); + size = PUD_SIZE; + paddr = rste & PUD_MASK; + } else { + folio = page_folio(pmd_page(__pmd(rste))); + size = PMD_SIZE; + paddr = rste & PMD_MASK; } - page[1].index = (unsigned long) ptep; - return 0; + + if (!test_and_set_bit(PG_arch_1, &folio->flags.f)) + __storage_key_init_range(paddr, paddr + size); } -void arch_release_hugepage(struct page *page) +void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) { - pte_t *ptep; + unsigned long rste; - if (MACHINE_HAS_HPAGE) - return; + rste = __pte_to_rste(pte); - ptep = (pte_t *) page[1].index; - if (!ptep) - return; - clear_table((unsigned long *) ptep, _PAGE_TYPE_EMPTY, - PTRS_PER_PTE * sizeof(pte_t)); - page_table_free(&init_mm, (unsigned long *) ptep); - page[1].index = 0; + /* Set correct table type for 2G hugepages */ + if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) { + if (likely(pte_present(pte))) + rste |= _REGION3_ENTRY_LARGE; + rste |= _REGION_ENTRY_TYPE_R3; + } else if (likely(pte_present(pte))) + rste |= _SEGMENT_ENTRY_LARGE; + + clear_huge_pte_skeys(mm, rste); + set_pte(ptep, __pte(rste)); +} + +void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, 
+ pte_t *ptep, pte_t pte, unsigned long sz) +{ + __set_huge_pte_at(mm, addr, ptep, pte); +} + +pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +{ + return __rste_to_pte(pte_val(*ptep)); +} + +pte_t __huge_ptep_get_and_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + pte_t pte = huge_ptep_get(mm, addr, ptep); + pmd_t *pmdp = (pmd_t *) ptep; + pud_t *pudp = (pud_t *) ptep; + + if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) + pudp_xchg_direct(mm, addr, pudp, __pud(_REGION3_ENTRY_EMPTY)); + else + pmdp_xchg_direct(mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_EMPTY)); + return pte; } -pte_t *huge_pte_alloc(struct mm_struct *mm, +pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long sz) { pgd_t *pgdp; + p4d_t *p4dp; pud_t *pudp; pmd_t *pmdp = NULL; pgdp = pgd_offset(mm, addr); - pudp = pud_alloc(mm, pgdp, addr); - if (pudp) - pmdp = pmd_alloc(mm, pudp, addr); + p4dp = p4d_alloc(mm, pgdp, addr); + if (p4dp) { + pudp = pud_alloc(mm, p4dp, addr); + if (pudp) { + if (sz == PUD_SIZE) + return (pte_t *) pudp; + else if (sz == PMD_SIZE) + pmdp = pmd_alloc(mm, pudp, addr); + } + } return (pte_t *) pmdp; } -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pgdp; + p4d_t *p4dp; pud_t *pudp; pmd_t *pmdp = NULL; pgdp = pgd_offset(mm, addr); if (pgd_present(*pgdp)) { - pudp = pud_offset(pgdp, addr); - if (pud_present(*pudp)) - pmdp = pmd_offset(pudp, addr); + p4dp = p4d_offset(pgdp, addr); + if (p4d_present(*p4dp)) { + pudp = pud_offset(p4dp, addr); + if (sz == PUD_SIZE) + return (pte_t *)pudp; + if (pud_present(*pudp)) + pmdp = pmd_offset(pudp, addr); + } } return (pte_t *) pmdp; } -int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) +bool __init arch_hugetlb_valid_size(unsigned long size) { - return 0; -} - -struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, - int write) -{ - return ERR_PTR(-EINVAL); -} - -int pmd_huge(pmd_t pmd) -{ - if (!MACHINE_HAS_HPAGE) - return 0; - - return !!(pmd_val(pmd) & _SEGMENT_ENTRY_LARGE); -} - -int pud_huge(pud_t pud) -{ - return 0; -} - -struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, - pmd_t *pmdp, int write) -{ - struct page *page; - - if (!MACHINE_HAS_HPAGE) - return NULL; - - page = pmd_page(*pmdp); - if (page) - page += ((address & ~HPAGE_MASK) >> PAGE_SHIFT); - return page; + if (cpu_has_edat1() && size == PMD_SIZE) + return true; + else if (cpu_has_edat2() && size == PUD_SIZE) + return true; + else + return false; } diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index ce36ea80e4f9..e4953453d254 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * S390 version * Copyright IBM Corp. 
1999 @@ -7,6 +8,7 @@ * Copyright (C) 1995 Linus Torvalds */ +#include <linux/cpufeature.h> #include <linux/signal.h> #include <linux/sched.h> #include <linux/kernel.h> @@ -17,76 +19,69 @@ #include <linux/mman.h> #include <linux/mm.h> #include <linux/swap.h> +#include <linux/swiotlb.h> #include <linux/smp.h> #include <linux/init.h> #include <linux/pagemap.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/memory.h> #include <linux/pfn.h> #include <linux/poison.h> #include <linux/initrd.h> #include <linux/export.h> +#include <linux/cma.h> #include <linux/gfp.h> +#include <linux/dma-direct.h> +#include <linux/percpu.h> #include <asm/processor.h> -#include <asm/uaccess.h> -#include <asm/pgtable.h> +#include <linux/uaccess.h> #include <asm/pgalloc.h> +#include <asm/ctlreg.h> +#include <asm/kfence.h> #include <asm/dma.h> -#include <asm/lowcore.h> -#include <asm/tlb.h> +#include <asm/abs_lowcore.h> #include <asm/tlbflush.h> #include <asm/sections.h> -#include <asm/ctl_reg.h> #include <asm/sclp.h> +#include <asm/set_memory.h> +#include <asm/kasan.h> +#include <asm/dma-mapping.h> +#include <asm/uv.h> +#include <linux/virtio_anchor.h> +#include <linux/virtio_config.h> +#include <linux/execmem.h> -pgd_t swapper_pg_dir[PTRS_PER_PGD] __attribute__((__aligned__(PAGE_SIZE))); +pgd_t swapper_pg_dir[PTRS_PER_PGD] __section(".bss..swapper_pg_dir"); +pgd_t invalid_pg_dir[PTRS_PER_PGD] __section(".bss..invalid_pg_dir"); + +struct ctlreg __bootdata_preserved(s390_invalid_asce); + +unsigned long __bootdata_preserved(page_noexec_mask); +EXPORT_SYMBOL(page_noexec_mask); + +unsigned long __bootdata_preserved(segment_noexec_mask); +EXPORT_SYMBOL(segment_noexec_mask); + +unsigned long __bootdata_preserved(region_noexec_mask); +EXPORT_SYMBOL(region_noexec_mask); unsigned long empty_zero_page, zero_page_mask; EXPORT_SYMBOL(empty_zero_page); +EXPORT_SYMBOL(zero_page_mask); static void __init setup_zero_pages(void) { - struct cpuid cpu_id; + unsigned long total_pages = memblock_estimated_nr_free_pages(); unsigned int order; - struct page *page; - int i; - get_cpu_id(&cpu_id); - switch (cpu_id.machine) { - case 0x9672: /* g5 */ - case 0x2064: /* z900 */ - case 0x2066: /* z900 */ - case 0x2084: /* z990 */ - case 0x2086: /* z990 */ - case 0x2094: /* z9-109 */ - case 0x2096: /* z9-109 */ - order = 0; - break; - case 0x2097: /* z10 */ - case 0x2098: /* z10 */ - case 0x2817: /* z196 */ - case 0x2818: /* z196 */ - order = 2; - break; - case 0x2827: /* zEC12 */ - default: - order = 5; - break; - } + /* Latest machines require a mapping granularity of 512KB */ + order = 7; + /* Limit number of empty zero pages for small memory sizes */ - if (order > 2 && totalram_pages <= 16384) - order = 2; - - empty_zero_page = __get_free_pages(GFP_KERNEL | __GFP_ZERO, order); - if (!empty_zero_page) - panic("Out of memory in setup_zero_pages"); - - page = virt_to_page((void *) empty_zero_page); - split_page(page, order); - for (i = 1 << order; i > 0; i--) { - mark_page_reserved(page); - page++; - } + while (order > 2 && (total_pages >> 10) < (1UL << order)) + order--; + + empty_zero_page = (unsigned long)memblock_alloc_or_panic(PAGE_SIZE << order, PAGE_SIZE); zero_page_mask = ((PAGE_SIZE << order) - 1) & PAGE_MASK; } @@ -97,131 +92,228 @@ static void __init setup_zero_pages(void) void __init paging_init(void) { unsigned long max_zone_pfns[MAX_NR_ZONES]; - unsigned long pgd_type, asce_bits; - - init_mm.pgd = swapper_pg_dir; -#ifdef CONFIG_64BIT - if (VMALLOC_END > (1UL << 42)) { - asce_bits = _ASCE_TYPE_REGION2 | 
_ASCE_TABLE_LENGTH; - pgd_type = _REGION2_ENTRY_EMPTY; - } else { - asce_bits = _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH; - pgd_type = _REGION3_ENTRY_EMPTY; - } -#else - asce_bits = _ASCE_TABLE_LENGTH; - pgd_type = _SEGMENT_ENTRY_EMPTY; -#endif - S390_lowcore.kernel_asce = (__pa(init_mm.pgd) & PAGE_MASK) | asce_bits; - clear_table((unsigned long *) init_mm.pgd, pgd_type, - sizeof(unsigned long)*2048); - vmem_map_init(); - - /* enable virtual mapping in kernel mode */ - __ctl_load(S390_lowcore.kernel_asce, 1, 1); - __ctl_load(S390_lowcore.kernel_asce, 7, 7); - __ctl_load(S390_lowcore.kernel_asce, 13, 13); - arch_local_irq_restore(4UL << (BITS_PER_LONG - 8)); - atomic_set(&init_mm.context.attach_count, 1); - - sparse_memory_present_with_active_regions(MAX_NUMNODES); + vmem_map_init(); sparse_init(); + zone_dma_limit = DMA_BIT_MASK(31); memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); - max_zone_pfns[ZONE_DMA] = PFN_DOWN(MAX_DMA_ADDRESS); + max_zone_pfns[ZONE_DMA] = virt_to_pfn(MAX_DMA_ADDRESS); max_zone_pfns[ZONE_NORMAL] = max_low_pfn; - free_area_init_nodes(max_zone_pfns); + free_area_init(max_zone_pfns); } -void __init mem_init(void) +void mark_rodata_ro(void) { - max_mapnr = max_low_pfn; - high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); + unsigned long size = __end_ro_after_init - __start_ro_after_init; + + if (cpu_has_nx()) + system_ctl_set_bit(0, CR0_INSTRUCTION_EXEC_PROTECTION_BIT); + __set_memory_ro(__start_ro_after_init, __end_ro_after_init); + pr_info("Write protected read-only-after-init data: %luk\n", size >> 10); +} - /* Setup guest page hinting */ - cmma_init(); +int set_memory_encrypted(unsigned long vaddr, int numpages) +{ + int i; + + /* make specified pages unshared, (swiotlb, dma_free) */ + for (i = 0; i < numpages; ++i) { + uv_remove_shared(virt_to_phys((void *)vaddr)); + vaddr += PAGE_SIZE; + } + return 0; +} + +int set_memory_decrypted(unsigned long vaddr, int numpages) +{ + int i; + /* make specified pages shared (swiotlb, dma_alloca) */ + for (i = 0; i < numpages; ++i) { + uv_set_shared(virt_to_phys((void *)vaddr)); + vaddr += PAGE_SIZE; + } + return 0; +} + +/* are we a protected virtualization guest? */ +bool force_dma_unencrypted(struct device *dev) +{ + return is_prot_virt_guest(); +} + +/* protected virtualization */ +static void __init pv_init(void) +{ + if (!is_prot_virt_guest()) + return; + + virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc); + + /* make sure bounce buffers are shared */ + swiotlb_init(true, SWIOTLB_FORCE | SWIOTLB_VERBOSE); + swiotlb_update_mem_attributes(); +} + +void __init arch_mm_preinit(void) +{ + cpumask_set_cpu(0, &init_mm.context.cpu_attach_mask); + cpumask_set_cpu(0, mm_cpumask(&init_mm)); + + pv_init(); - /* this will put all low memory onto the freelists */ - free_all_bootmem(); setup_zero_pages(); /* Setup zeroed pages. */ +} + +unsigned long memory_block_size_bytes(void) +{ + /* + * Make sure the memory block size is always greater + * or equal than the memory increment size. 
+ */ + return max_t(unsigned long, MIN_MEMORY_BLOCK_SIZE, sclp.rzm); +} + +unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; +EXPORT_SYMBOL(__per_cpu_offset); - mem_init_print_info(NULL); - printk("Write protected kernel read-only data: %#lx - %#lx\n", - (unsigned long)&_stext, - PFN_ALIGN((unsigned long)&_eshared) - 1); +static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) +{ + return LOCAL_DISTANCE; } -void free_initmem(void) +static int __init pcpu_cpu_to_node(int cpu) { - free_initmem_default(POISON_FREE_INITMEM); + return 0; } -#ifdef CONFIG_BLK_DEV_INITRD -void __init free_initrd_mem(unsigned long start, unsigned long end) +void __init setup_per_cpu_areas(void) { - free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM, - "initrd"); + unsigned long delta; + unsigned int cpu; + int rc; + + /* + * Always reserve area for module percpu variables. That's + * what the legacy allocator did. + */ + rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, + PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, + pcpu_cpu_distance, + pcpu_cpu_to_node); + if (rc < 0) + panic("Failed to initialize percpu areas."); + + delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; + for_each_possible_cpu(cpu) + __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu]; } -#endif #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size) + +#ifdef CONFIG_CMA + +/* Prevent memory blocks which contain cma regions from going offline */ + +struct s390_cma_mem_data { + unsigned long start; + unsigned long end; +}; + +static int s390_cma_check_range(struct cma *cma, void *data) +{ + struct s390_cma_mem_data *mem_data; + + mem_data = data; + + if (cma_intersects(cma, mem_data->start, mem_data->end)) + return -EBUSY; + + return 0; +} + +static int s390_cma_mem_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct s390_cma_mem_data mem_data; + struct memory_notify *arg; + int rc = 0; + + arg = data; + mem_data.start = arg->start_pfn << PAGE_SHIFT; + mem_data.end = mem_data.start + (arg->nr_pages << PAGE_SHIFT); + if (action == MEM_GOING_OFFLINE) + rc = cma_for_each_area(s390_cma_check_range, &mem_data); + return notifier_from_errno(rc); +} + +static struct notifier_block s390_cma_mem_nb = { + .notifier_call = s390_cma_mem_notifier, +}; + +static int __init s390_cma_mem_init(void) +{ + return register_memory_notifier(&s390_cma_mem_nb); +} +device_initcall(s390_cma_mem_init); + +#endif /* CONFIG_CMA */ + +int arch_add_memory(int nid, u64 start, u64 size, + struct mhp_params *params) { - unsigned long zone_start_pfn, zone_end_pfn, nr_pages; unsigned long start_pfn = PFN_DOWN(start); unsigned long size_pages = PFN_DOWN(size); - struct zone *zone; int rc; + if (WARN_ON_ONCE(pgprot_val(params->pgprot) != pgprot_val(PAGE_KERNEL))) + return -EINVAL; + + VM_BUG_ON(!mhp_range_allowed(start, size, true)); rc = vmem_add_mapping(start, size); if (rc) return rc; - for_each_zone(zone) { - if (zone_idx(zone) != ZONE_MOVABLE) { - /* Add range within existing zone limits */ - zone_start_pfn = zone->zone_start_pfn; - zone_end_pfn = zone->zone_start_pfn + - zone->spanned_pages; - } else { - /* Add remaining range to ZONE_MOVABLE */ - zone_start_pfn = start_pfn; - zone_end_pfn = start_pfn + size_pages; - } - if (start_pfn < zone_start_pfn || start_pfn >= zone_end_pfn) - continue; - nr_pages = (start_pfn + size_pages > zone_end_pfn) ? 
- zone_end_pfn - start_pfn : size_pages; - rc = __add_pages(nid, zone, start_pfn, nr_pages); - if (rc) - break; - start_pfn += nr_pages; - size_pages -= nr_pages; - if (!size_pages) - break; - } + + rc = __add_pages(nid, start_pfn, size_pages, params); if (rc) vmem_remove_mapping(start, size); return rc; } -unsigned long memory_block_size_bytes(void) +void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) { - /* - * Make sure the memory block size is always greater - * or equal than the memory increment size. - */ - return max_t(unsigned long, MIN_MEMORY_BLOCK_SIZE, sclp_get_rzm()); + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + + __remove_pages(start_pfn, nr_pages, altmap); + vmem_remove_mapping(start, size); } +#endif /* CONFIG_MEMORY_HOTPLUG */ -#ifdef CONFIG_MEMORY_HOTREMOVE -int arch_remove_memory(u64 start, u64 size) +#ifdef CONFIG_EXECMEM +static struct execmem_info execmem_info __ro_after_init; + +struct execmem_info __init *execmem_arch_setup(void) { - /* - * There is no hardware or firmware interface which could trigger a - * hot memory remove on s390. So there is nothing that needs to be - * implemented. - */ - return -EBUSY; + unsigned long module_load_offset = 0; + unsigned long start; + + if (kaslr_enabled()) + module_load_offset = get_random_u32_inclusive(1, 1024) * PAGE_SIZE; + + start = MODULES_VADDR + module_load_offset; + + execmem_info = (struct execmem_info){ + .ranges = { + [EXECMEM_DEFAULT] = { + .flags = EXECMEM_KASAN_SHADOW, + .start = start, + .end = MODULES_END, + .pgprot = PAGE_KERNEL, + .alignment = MODULE_ALIGN, + }, + }, + }; + + return &execmem_info; } -#endif -#endif /* CONFIG_MEMORY_HOTPLUG */ +#endif /* CONFIG_EXECMEM */ diff --git a/arch/s390/mm/maccess.c b/arch/s390/mm/maccess.c index 921fa541dc04..cfd219fe495c 100644 --- a/arch/s390/mm/maccess.c +++ b/arch/s390/mm/maccess.c @@ -1,9 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Access kernel memory without faulting -- s390 specific implementation. * - * Copyright IBM Corp. 2009 - * - * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>, + * Copyright IBM Corp. 2009, 2015 * */ @@ -13,186 +12,134 @@ #include <linux/errno.h> #include <linux/gfp.h> #include <linux/cpu.h> -#include <asm/ctl_reg.h> - -/* - * This function writes to kernel memory bypassing DAT and possible - * write protection. It copies one to four bytes from src to dst - * using the stura instruction. - * Returns the number of bytes copied or -EFAULT. 
- */ -static long probe_kernel_write_odd(void *dst, const void *src, size_t size) +#include <linux/uio.h> +#include <linux/io.h> +#include <asm/asm-extable.h> +#include <asm/abs_lowcore.h> +#include <asm/stacktrace.h> +#include <asm/sections.h> +#include <asm/maccess.h> +#include <asm/ctlreg.h> + +unsigned long __bootdata_preserved(__memcpy_real_area); +pte_t *__bootdata_preserved(memcpy_real_ptep); +static DEFINE_MUTEX(memcpy_real_mutex); + +static notrace long s390_kernel_write_odd(void *dst, const void *src, size_t size) { - unsigned long count, aligned; - int offset, mask; - int rc = -EFAULT; - - aligned = (unsigned long) dst & ~3UL; - offset = (unsigned long) dst & 3; - count = min_t(unsigned long, 4 - offset, size); - mask = (0xf << (4 - count)) & 0xf; - mask >>= offset; + unsigned long aligned, offset, count; + char tmp[8]; + + aligned = (unsigned long) dst & ~7UL; + offset = (unsigned long) dst & 7UL; + size = min(8UL - offset, size); + count = size - 1; asm volatile( " bras 1,0f\n" - " icm 0,0,0(%3)\n" - "0: l 0,0(%1)\n" - " lra %1,0(%1)\n" - "1: ex %2,0(1)\n" - "2: stura 0,%1\n" - " la %0,0\n" - "3:\n" - EX_TABLE(0b,3b) EX_TABLE(1b,3b) EX_TABLE(2b,3b) - : "+d" (rc), "+a" (aligned) - : "a" (mask), "a" (src) : "cc", "memory", "0", "1"); - return rc ? rc : count; + " mvc 0(1,%4),0(%5)\n" + "0: mvc 0(8,%3),0(%0)\n" + " ex %1,0(1)\n" + " lg %1,0(%3)\n" + " lra %0,0(%0)\n" + " sturg %1,%0" + : "+&a" (aligned), "+&a" (count), "=m" (tmp) + : "a" (&tmp), "a" (&tmp[offset]), "a" (src) + : "cc", "memory", "1"); + return size; } -long probe_kernel_write(void *dst, const void *src, size_t size) +/* + * __s390_kernel_write - write to kernel memory bypassing DAT + * @dst: destination address + * @src: source address + * @size: number of bytes to copy + * + * This function writes to kernel memory bypassing DAT and possible page table + * write protection. It writes to the destination using the sturg instruction. + * Therefore we have a read-modify-write sequence: the function reads eight + * bytes from destination at an eight byte boundary, modifies the bytes + * requested and writes the result back in a loop. + */ +static DEFINE_SPINLOCK(s390_kernel_write_lock); + +notrace void *__s390_kernel_write(void *dst, const void *src, size_t size) { - long copied = 0; + void *tmp = dst; + unsigned long flags; + long copied; + spin_lock_irqsave(&s390_kernel_write_lock, flags); while (size) { - copied = probe_kernel_write_odd(dst, src, size); - if (copied < 0) - break; - dst += copied; + copied = s390_kernel_write_odd(tmp, src, size); + tmp += copied; src += copied; size -= copied; } - return copied < 0 ? 
-EFAULT : 0; -} + spin_unlock_irqrestore(&s390_kernel_write_lock, flags); -static int __memcpy_real(void *dest, void *src, size_t count) -{ - register unsigned long _dest asm("2") = (unsigned long) dest; - register unsigned long _len1 asm("3") = (unsigned long) count; - register unsigned long _src asm("4") = (unsigned long) src; - register unsigned long _len2 asm("5") = (unsigned long) count; - int rc = -EFAULT; - - asm volatile ( - "0: mvcle %1,%2,0x0\n" - "1: jo 0b\n" - " lhi %0,0x0\n" - "2:\n" - EX_TABLE(1b,2b) - : "+d" (rc), "+d" (_dest), "+d" (_src), "+d" (_len1), - "+d" (_len2), "=m" (*((long *) dest)) - : "m" (*((long *) src)) - : "cc", "memory"); - return rc; + return dst; } -/* - * Copy memory in real mode (kernel to kernel) - */ -int memcpy_real(void *dest, void *src, size_t count) +size_t memcpy_real_iter(struct iov_iter *iter, unsigned long src, size_t count) { - unsigned long flags; - int rc; - - if (!count) - return 0; - local_irq_save(flags); - __arch_local_irq_stnsm(0xfbUL); - rc = __memcpy_real(dest, src, count); - local_irq_restore(flags); - return rc; -} - -/* - * Copy memory in absolute mode (kernel to kernel) - */ -void memcpy_absolute(void *dest, void *src, size_t count) -{ - unsigned long cr0, flags, prefix; - - flags = arch_local_irq_save(); - __ctl_store(cr0, 0, 0); - __ctl_clear_bit(0, 28); /* disable lowcore protection */ - prefix = store_prefix(); - if (prefix) { - local_mcck_disable(); - set_prefix(0); - memcpy(dest, src, count); - set_prefix(prefix); - local_mcck_enable(); - } else { - memcpy(dest, src, count); - } - __ctl_load(cr0, 0, 0); - arch_local_irq_restore(flags); -} - -/* - * Copy memory from kernel (real) to user (virtual) - */ -int copy_to_user_real(void __user *dest, void *src, size_t count) -{ - int offs = 0, size, rc; - char *buf; - - buf = (char *) __get_free_page(GFP_KERNEL); - if (!buf) - return -ENOMEM; - rc = -EFAULT; - while (offs < count) { - size = min(PAGE_SIZE, count - offs); - if (memcpy_real(buf, src + offs, size)) - goto out; - if (copy_to_user(dest + offs, buf, size)) - goto out; - offs += size; + size_t len, copied, res = 0; + unsigned long phys, offset; + void *chunk; + pte_t pte; + + BUILD_BUG_ON(MEMCPY_REAL_SIZE != PAGE_SIZE); + while (count) { + phys = src & MEMCPY_REAL_MASK; + offset = src & ~MEMCPY_REAL_MASK; + chunk = (void *)(__memcpy_real_area + offset); + len = min(count, MEMCPY_REAL_SIZE - offset); + pte = mk_pte_phys(phys, PAGE_KERNEL_RO); + + mutex_lock(&memcpy_real_mutex); + if (pte_val(pte) != pte_val(*memcpy_real_ptep)) { + __ptep_ipte(__memcpy_real_area, memcpy_real_ptep, 0, 0, IPTE_GLOBAL); + set_pte(memcpy_real_ptep, pte); + } + copied = copy_to_iter(chunk, len, iter); + mutex_unlock(&memcpy_real_mutex); + + count -= copied; + src += copied; + res += copied; + if (copied < len) + break; } - rc = 0; -out: - free_page((unsigned long) buf); - return rc; + return res; } -/* - * Copy memory from user (virtual) to kernel (real) - */ -int copy_from_user_real(void *dest, void __user *src, size_t count) +int memcpy_real(void *dest, unsigned long src, size_t count) { - int offs = 0, size, rc; - char *buf; - - buf = (char *) __get_free_page(GFP_KERNEL); - if (!buf) - return -ENOMEM; - rc = -EFAULT; - while (offs < count) { - size = min(PAGE_SIZE, count - offs); - if (copy_from_user(buf, src + offs, size)) - goto out; - if (memcpy_real(dest + offs, buf, size)) - goto out; - offs += size; - } - rc = 0; -out: - free_page((unsigned long) buf); - return rc; + struct iov_iter iter; + struct kvec kvec; + + kvec.iov_base = dest; + 
kvec.iov_len = count; + iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, count); + if (memcpy_real_iter(&iter, src, count) < count) + return -EFAULT; + return 0; } /* - * Check if physical address is within prefix or zero page + * Find CPU that owns swapped prefix page */ -static int is_swapped(unsigned long addr) +static int get_swapped_owner(phys_addr_t addr) { - unsigned long lc; + phys_addr_t lc; int cpu; - if (addr < sizeof(struct _lowcore)) - return 1; for_each_online_cpu(cpu) { - lc = (unsigned long) lowcore_ptr[cpu]; - if (addr > lc + sizeof(struct _lowcore) - 1 || addr < lc) + lc = virt_to_phys(lowcore_ptr[cpu]); + if (addr > lc + sizeof(struct lowcore) - 1 || addr < lc) continue; - return 1; + return cpu; } - return 0; + return -1; } /* @@ -201,29 +148,47 @@ static int is_swapped(unsigned long addr) * For swapped prefix pages a new buffer is returned that contains a copy of * the absolute memory. The buffer size is maximum one page large. */ -void *xlate_dev_mem_ptr(unsigned long addr) +void *xlate_dev_mem_ptr(phys_addr_t addr) { - void *bounce = (void *) addr; + void *ptr = phys_to_virt(addr); + void *bounce = ptr; + struct lowcore *abs_lc; unsigned long size; + int this_cpu, cpu; - get_online_cpus(); - preempt_disable(); - if (is_swapped(addr)) { - size = PAGE_SIZE - (addr & ~PAGE_MASK); - bounce = (void *) __get_free_page(GFP_ATOMIC); - if (bounce) - memcpy_absolute(bounce, (void *) addr, size); + cpus_read_lock(); + this_cpu = get_cpu(); + if (addr >= sizeof(struct lowcore)) { + cpu = get_swapped_owner(addr); + if (cpu < 0) + goto out; } - preempt_enable(); - put_online_cpus(); + bounce = (void *)__get_free_page(GFP_ATOMIC); + if (!bounce) + goto out; + size = PAGE_SIZE - (addr & ~PAGE_MASK); + if (addr < sizeof(struct lowcore)) { + abs_lc = get_abs_lowcore(); + ptr = (void *)abs_lc + addr; + memcpy(bounce, ptr, size); + put_abs_lowcore(abs_lc); + } else if (cpu == this_cpu) { + ptr = (void *)(addr - virt_to_phys(lowcore_ptr[cpu])); + memcpy(bounce, ptr, size); + } else { + memcpy(bounce, ptr, size); + } +out: + put_cpu(); + cpus_read_unlock(); return bounce; } /* * Free converted buffer for /dev/mem access (if necessary) */ -void unxlate_dev_mem_ptr(unsigned long addr, void *buf) +void unxlate_dev_mem_ptr(phys_addr_t addr, void *ptr) { - if ((void *) addr != buf) - free_page((unsigned long) buf); + if (addr != virt_to_phys(ptr)) + free_page((unsigned long)ptr); } diff --git a/arch/s390/mm/mem_detect.c b/arch/s390/mm/mem_detect.c deleted file mode 100644 index cca388253a39..000000000000 --- a/arch/s390/mm/mem_detect.c +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Copyright IBM Corp. 2008, 2009 - * - * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com> - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <asm/ipl.h> -#include <asm/sclp.h> -#include <asm/setup.h> - -#define ADDR2G (1ULL << 31) - -static void find_memory_chunks(struct mem_chunk chunk[], unsigned long maxsize) -{ - unsigned long long memsize, rnmax, rzm; - unsigned long addr = 0, size; - int i = 0, type; - - rzm = sclp_get_rzm(); - rnmax = sclp_get_rnmax(); - memsize = rzm * rnmax; - if (!rzm) - rzm = 1ULL << 17; - if (sizeof(long) == 4) { - rzm = min(ADDR2G, rzm); - memsize = memsize ? min(ADDR2G, memsize) : ADDR2G; - } - if (maxsize) - memsize = memsize ? 
min((unsigned long)memsize, maxsize) : maxsize; - do { - size = 0; - type = tprot(addr); - do { - size += rzm; - if (memsize && addr + size >= memsize) - break; - } while (type == tprot(addr + size)); - if (type == CHUNK_READ_WRITE || type == CHUNK_READ_ONLY) { - if (memsize && (addr + size > memsize)) - size = memsize - addr; - chunk[i].addr = addr; - chunk[i].size = size; - chunk[i].type = type; - i++; - } - addr += size; - } while (addr < memsize && i < MEMORY_CHUNKS); -} - -/** - * detect_memory_layout - fill mem_chunk array with memory layout data - * @chunk: mem_chunk array to be filled - * @maxsize: maximum address where memory detection should stop - * - * Fills the passed in memory chunk array with the memory layout of the - * machine. The array must have a size of at least MEMORY_CHUNKS and will - * be fully initialized afterwards. - * If the maxsize paramater has a value > 0 memory detection will stop at - * that address. It is guaranteed that all chunks have an ending address - * that is smaller than maxsize. - * If maxsize is 0 all memory will be detected. - */ -void detect_memory_layout(struct mem_chunk chunk[], unsigned long maxsize) -{ - unsigned long flags, flags_dat, cr0; - - memset(chunk, 0, MEMORY_CHUNKS * sizeof(struct mem_chunk)); - /* - * Disable IRQs, DAT and low address protection so tprot does the - * right thing and we don't get scheduled away with low address - * protection disabled. - */ - local_irq_save(flags); - flags_dat = __arch_local_irq_stnsm(0xfb); - /* - * In case DAT was enabled, make sure chunk doesn't reside in vmalloc - * space. We have disabled DAT and any access to vmalloc area will - * cause an exception. - * If DAT was disabled we are called from early ipl code. - */ - if (test_bit(5, &flags_dat)) { - if (WARN_ON_ONCE(is_vmalloc_or_module_addr(chunk))) - goto out; - } - __ctl_store(cr0, 0, 0); - __ctl_clear_bit(0, 28); - find_memory_chunks(chunk, maxsize); - __ctl_load(cr0, 0, 0); -out: - __arch_local_irq_ssm(flags_dat); - local_irq_restore(flags); -} -EXPORT_SYMBOL(detect_memory_layout); - -/* - * Create memory hole with given address and size. - */ -void create_mem_hole(struct mem_chunk mem_chunk[], unsigned long addr, - unsigned long size) -{ - int i; - - for (i = 0; i < MEMORY_CHUNKS; i++) { - struct mem_chunk *chunk = &mem_chunk[i]; - - if (chunk->size == 0) - continue; - if (addr > chunk->addr + chunk->size) - continue; - if (addr + size <= chunk->addr) - continue; - /* Split */ - if ((addr > chunk->addr) && - (addr + size < chunk->addr + chunk->size)) { - struct mem_chunk *new = chunk + 1; - - memmove(new, chunk, (MEMORY_CHUNKS-i-1) * sizeof(*new)); - new->addr = addr + size; - new->size = chunk->addr + chunk->size - new->addr; - chunk->size = addr - chunk->addr; - continue; - } else if ((addr <= chunk->addr) && - (addr + size >= chunk->addr + chunk->size)) { - memmove(chunk, chunk + 1, (MEMORY_CHUNKS-i-1) * sizeof(*chunk)); - memset(&mem_chunk[MEMORY_CHUNKS-1], 0, sizeof(*chunk)); - } else if (addr + size < chunk->addr + chunk->size) { - chunk->size = chunk->addr + chunk->size - addr - size; - chunk->addr = addr + size; - } else if (addr > chunk->addr) { - chunk->size = addr - chunk->addr; - } - } -} diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index 40023290ee5b..2a222a7e14f4 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -1,183 +1,215 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * flexible mmap layout support * * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. * All Rights Reserved. 
* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * * Started by Ingo Molnar <mingo@elte.hu> */ +#include <linux/elf-randomize.h> #include <linux/personality.h> #include <linux/mm.h> #include <linux/mman.h> -#include <linux/module.h> +#include <linux/sched/signal.h> +#include <linux/sched/mm.h> #include <linux/random.h> -#include <linux/compat.h> -#include <asm/pgalloc.h> +#include <linux/security.h> +#include <linux/hugetlb.h> +#include <asm/elf.h> static unsigned long stack_maxrandom_size(void) { if (!(current->flags & PF_RANDOMIZE)) return 0; - if (current->personality & ADDR_NO_RANDOMIZE) - return 0; return STACK_RND_MASK << PAGE_SHIFT; } -/* - * Top of mmap area (just below the process stack). - * - * Leave at least a ~32 MB hole. - */ -#define MIN_GAP (32*1024*1024) -#define MAX_GAP (STACK_TOP/6*5) - -static inline int mmap_is_legacy(void) +static inline int mmap_is_legacy(const struct rlimit *rlim_stack) { if (current->personality & ADDR_COMPAT_LAYOUT) return 1; - if (rlimit(RLIMIT_STACK) == RLIM_INFINITY) + if (rlim_stack->rlim_cur == RLIM_INFINITY) return 1; return sysctl_legacy_va_layout; } -static unsigned long mmap_rnd(void) +unsigned long arch_mmap_rnd(void) { - if (!(current->flags & PF_RANDOMIZE)) - return 0; - /* 8MB randomization for mmap_base */ - return (get_random_int() & 0x7ffUL) << PAGE_SHIFT; + return (get_random_u32() & MMAP_RND_MASK) << PAGE_SHIFT; } -static inline unsigned long mmap_base(void) +static unsigned long mmap_base_legacy(unsigned long rnd) { - unsigned long gap = rlimit(RLIMIT_STACK); - - if (gap < MIN_GAP) - gap = MIN_GAP; - else if (gap > MAX_GAP) - gap = MAX_GAP; - gap &= PAGE_MASK; - return STACK_TOP - stack_maxrandom_size() - mmap_rnd() - gap; + return TASK_UNMAPPED_BASE + rnd; } -#ifndef CONFIG_64BIT - -/* - * This function, called very early during the creation of a new - * process VM image, sets up which VM layout function to use: - */ -void arch_pick_mmap_layout(struct mm_struct *mm) +static inline unsigned long mmap_base(unsigned long rnd, + const struct rlimit *rlim_stack) { + unsigned long gap = rlim_stack->rlim_cur; + unsigned long pad = stack_maxrandom_size() + stack_guard_gap; + + /* Values close to RLIM_INFINITY can overflow. */ + if (gap + pad > gap) + gap += pad; + /* - * Fall back to the standard layout if the personality - * bit is set, or if the expected stack growth is unlimited: + * Top of mmap area (just below the process stack). + * Leave at least a ~128 MB hole. 
*/ - if (mmap_is_legacy()) { - mm->mmap_base = TASK_UNMAPPED_BASE; - mm->get_unmapped_area = arch_get_unmapped_area; - } else { - mm->mmap_base = mmap_base(); - mm->get_unmapped_area = arch_get_unmapped_area_topdown; - } -} + gap = clamp(gap, SZ_128M, (STACK_TOP / 6) * 5); -#else + return PAGE_ALIGN(STACK_TOP - gap - rnd); +} -int s390_mmap_check(unsigned long addr, unsigned long len, unsigned long flags) +static int get_align_mask(struct file *filp, unsigned long flags) { - int rc; - - if (is_compat_task() || (TASK_SIZE >= (1UL << 53))) + if (filp && is_file_hugepages(filp)) + return huge_page_mask_align(filp); + if (!(current->flags & PF_RANDOMIZE)) return 0; - if (!(flags & MAP_FIXED)) - addr = 0; - if ((addr + len) >= TASK_SIZE) { - rc = crst_table_upgrade(current->mm, 1UL << 53); - if (rc) - return rc; - update_mm(current->mm, current); - } + if (filp || (flags & MAP_SHARED)) + return MMAP_ALIGN_MASK << PAGE_SHIFT; return 0; } -static unsigned long -s390_get_unmapped_area(struct file *filp, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) +unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags, vm_flags_t vm_flags) { struct mm_struct *mm = current->mm; - unsigned long area; - int rc; - - area = arch_get_unmapped_area(filp, addr, len, pgoff, flags); - if (!(area & ~PAGE_MASK)) - return area; - if (area == -ENOMEM && !is_compat_task() && TASK_SIZE < (1UL << 53)) { - /* Upgrade the page table to 4 levels and retry. */ - rc = crst_table_upgrade(mm, 1UL << 53); - if (rc) - return (unsigned long) rc; - update_mm(mm, current); - area = arch_get_unmapped_area(filp, addr, len, pgoff, flags); + struct vm_area_struct *vma; + struct vm_unmapped_area_info info = {}; + + if (len > TASK_SIZE - mmap_min_addr) + return -ENOMEM; + + if (flags & MAP_FIXED) + goto check_asce_limit; + + if (addr) { + addr = PAGE_ALIGN(addr); + vma = find_vma(mm, addr); + if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && + (!vma || addr + len <= vm_start_gap(vma))) + goto check_asce_limit; } - return area; + + info.length = len; + info.low_limit = mm->mmap_base; + info.high_limit = TASK_SIZE; + info.align_mask = get_align_mask(filp, flags); + if (!(filp && is_file_hugepages(filp))) + info.align_offset = pgoff << PAGE_SHIFT; + addr = vm_unmapped_area(&info); + if (offset_in_page(addr)) + return addr; + +check_asce_limit: + return check_asce_limit(mm, addr, len); } -static unsigned long -s390_get_unmapped_area_topdown(struct file *filp, const unsigned long addr, - const unsigned long len, const unsigned long pgoff, - const unsigned long flags) +unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags, vm_flags_t vm_flags) { + struct vm_area_struct *vma; struct mm_struct *mm = current->mm; - unsigned long area; - int rc; - - area = arch_get_unmapped_area_topdown(filp, addr, len, pgoff, flags); - if (!(area & ~PAGE_MASK)) - return area; - if (area == -ENOMEM && !is_compat_task() && TASK_SIZE < (1UL << 53)) { - /* Upgrade the page table to 4 levels and retry. 
*/ - rc = crst_table_upgrade(mm, 1UL << 53); - if (rc) - return (unsigned long) rc; - update_mm(mm, current); - area = arch_get_unmapped_area_topdown(filp, addr, len, - pgoff, flags); + struct vm_unmapped_area_info info = {}; + + /* requested length too big for entire address space */ + if (len > TASK_SIZE - mmap_min_addr) + return -ENOMEM; + + if (flags & MAP_FIXED) + goto check_asce_limit; + + /* requesting a specific address */ + if (addr) { + addr = PAGE_ALIGN(addr); + vma = find_vma(mm, addr); + if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && + (!vma || addr + len <= vm_start_gap(vma))) + goto check_asce_limit; + } + + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + info.length = len; + info.low_limit = PAGE_SIZE; + info.high_limit = mm->mmap_base; + info.align_mask = get_align_mask(filp, flags); + if (!(filp && is_file_hugepages(filp))) + info.align_offset = pgoff << PAGE_SHIFT; + addr = vm_unmapped_area(&info); + + /* + * A failed mmap() very likely causes application failure, + * so fall back to the bottom-up function here. This scenario + * can happen with large stack limits and large mmap() + * allocations. + */ + if (offset_in_page(addr)) { + VM_BUG_ON(addr != -ENOMEM); + info.flags = 0; + info.low_limit = TASK_UNMAPPED_BASE; + info.high_limit = TASK_SIZE; + addr = vm_unmapped_area(&info); + if (offset_in_page(addr)) + return addr; } - return area; + +check_asce_limit: + return check_asce_limit(mm, addr, len); } + /* * This function, called very early during the creation of a new * process VM image, sets up which VM layout function to use: */ -void arch_pick_mmap_layout(struct mm_struct *mm) +void arch_pick_mmap_layout(struct mm_struct *mm, const struct rlimit *rlim_stack) { + unsigned long random_factor = 0UL; + + if (current->flags & PF_RANDOMIZE) + random_factor = arch_mmap_rnd(); + /* * Fall back to the standard layout if the personality * bit is set, or if the expected stack growth is unlimited: */ - if (mmap_is_legacy()) { - mm->mmap_base = TASK_UNMAPPED_BASE; - mm->get_unmapped_area = s390_get_unmapped_area; + if (mmap_is_legacy(rlim_stack)) { + mm->mmap_base = mmap_base_legacy(random_factor); + mm_flags_clear(MMF_TOPDOWN, mm); } else { - mm->mmap_base = mmap_base(); - mm->get_unmapped_area = s390_get_unmapped_area_topdown; + mm->mmap_base = mmap_base(random_factor, rlim_stack); + mm_flags_set(MMF_TOPDOWN, mm); } } -#endif +static pgprot_t protection_map[16] __ro_after_init; + +void __init setup_protection_map(void) +{ + pgprot_t *pm = protection_map; + + pm[VM_NONE] = PAGE_NONE; + pm[VM_READ] = PAGE_RO; + pm[VM_WRITE] = PAGE_RO; + pm[VM_WRITE | VM_READ] = PAGE_RO; + pm[VM_EXEC] = PAGE_RX; + pm[VM_EXEC | VM_READ] = PAGE_RX; + pm[VM_EXEC | VM_WRITE] = PAGE_RX; + pm[VM_EXEC | VM_WRITE | VM_READ] = PAGE_RX; + pm[VM_SHARED] = PAGE_NONE; + pm[VM_SHARED | VM_READ] = PAGE_RO; + pm[VM_SHARED | VM_WRITE] = PAGE_RW; + pm[VM_SHARED | VM_WRITE | VM_READ] = PAGE_RW; + pm[VM_SHARED | VM_EXEC] = PAGE_RX; + pm[VM_SHARED | VM_EXEC | VM_READ] = PAGE_RX; + pm[VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_RWX; + pm[VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_RWX; +} + +DECLARE_VM_GET_PAGE_PROT diff --git a/arch/s390/mm/page-states.c b/arch/s390/mm/page-states.c index a90d45e9dfb0..01f9b39e65f5 100644 --- a/arch/s390/mm/page-states.c +++ b/arch/s390/mm/page-states.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright IBM Corp. 
2008 * @@ -6,109 +7,26 @@ * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> */ -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/types.h> #include <linux/mm.h> -#include <linux/gfp.h> -#include <linux/init.h> +#include <asm/page-states.h> +#include <asm/sections.h> +#include <asm/page.h> -#define ESSA_SET_STABLE 1 -#define ESSA_SET_UNUSED 2 - -static int cmma_flag = 1; - -static int __init cmma(char *str) -{ - char *parm; - - parm = strstrip(str); - if (strcmp(parm, "yes") == 0 || strcmp(parm, "on") == 0) { - cmma_flag = 1; - return 1; - } - cmma_flag = 0; - if (strcmp(parm, "no") == 0 || strcmp(parm, "off") == 0) - return 1; - return 0; -} -__setup("cmma=", cmma); - -void __init cmma_init(void) -{ - register unsigned long tmp asm("0") = 0; - register int rc asm("1") = -EOPNOTSUPP; - - if (!cmma_flag) - return; - asm volatile( - " .insn rrf,0xb9ab0000,%1,%1,0,0\n" - "0: la %0,0\n" - "1:\n" - EX_TABLE(0b,1b) - : "+&d" (rc), "+&d" (tmp)); - if (rc) - cmma_flag = 0; -} - -static inline void set_page_unstable(struct page *page, int order) -{ - int i, rc; - - for (i = 0; i < (1 << order); i++) - asm volatile(".insn rrf,0xb9ab0000,%0,%1,%2,0" - : "=&d" (rc) - : "a" (page_to_phys(page + i)), - "i" (ESSA_SET_UNUSED)); -} +int __bootdata_preserved(cmma_flag); void arch_free_page(struct page *page, int order) { if (!cmma_flag) return; - set_page_unstable(page, order); -} - -static inline void set_page_stable(struct page *page, int order) -{ - int i, rc; - - for (i = 0; i < (1 << order); i++) - asm volatile(".insn rrf,0xb9ab0000,%0,%1,%2,0" - : "=&d" (rc) - : "a" (page_to_phys(page + i)), - "i" (ESSA_SET_STABLE)); + __set_page_unused(page_to_virt(page), 1UL << order); } void arch_alloc_page(struct page *page, int order) { if (!cmma_flag) return; - set_page_stable(page, order); -} - -void arch_set_page_states(int make_stable) -{ - unsigned long flags, order, t; - struct list_head *l; - struct page *page; - struct zone *zone; - - if (!cmma_flag) - return; - if (make_stable) - drain_local_pages(NULL); - for_each_populated_zone(zone) { - spin_lock_irqsave(&zone->lock, flags); - for_each_migratetype_order(order, t) { - list_for_each(l, &zone->free_area[order].free_list[t]) { - page = list_entry(l, struct page, lru); - if (make_stable) - set_page_stable(page, order); - else - set_page_unstable(page, order); - } - } - spin_unlock_irqrestore(&zone->lock, flags); - } + if (cmma_flag < 2) + __set_page_stable_dat(page_to_virt(page), 1UL << order); + else + __set_page_stable_nodat(page_to_virt(page), 1UL << order); } diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c index 80adfbf75065..3042647c9dbf 100644 --- a/arch/s390/mm/pageattr.c +++ b/arch/s390/mm/pageattr.c @@ -1,27 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright IBM Corp. 
2011 * Author(s): Jan Glauber <jang@linux.vnet.ibm.com> */ +#include <linux/cpufeature.h> #include <linux/hugetlb.h> -#include <linux/module.h> +#include <linux/proc_fs.h> +#include <linux/vmalloc.h> #include <linux/mm.h> #include <asm/cacheflush.h> -#include <asm/pgtable.h> +#include <asm/facility.h> +#include <asm/pgalloc.h> +#include <asm/kfence.h> #include <asm/page.h> +#include <asm/asm.h> +#include <asm/set_memory.h> static inline unsigned long sske_frame(unsigned long addr, unsigned char skey) { - asm volatile(".insn rrf,0xb22b0000,%[skey],%[addr],9,0" + asm volatile(".insn rrf,0xb22b0000,%[skey],%[addr],1,0" : [addr] "+a" (addr) : [skey] "d" (skey)); return addr; } -void storage_key_init_range(unsigned long start, unsigned long end) +void __storage_key_init_range(unsigned long start, unsigned long end) { unsigned long boundary, size; while (start < end) { - if (MACHINE_HAS_EDAT1) { + if (cpu_has_edat1()) { /* set storage keys for a 1MB frame */ size = 1UL << 20; boundary = (start + size) & ~(size - 1); @@ -32,113 +39,435 @@ void storage_key_init_range(unsigned long start, unsigned long end) continue; } } - page_set_storage_key(start, PAGE_DEFAULT_KEY, 0); + page_set_storage_key(start, PAGE_DEFAULT_KEY, 1); start += PAGE_SIZE; } } -static pte_t *walk_page_table(unsigned long addr) -{ - pgd_t *pgdp; - pud_t *pudp; - pmd_t *pmdp; - pte_t *ptep; +#ifdef CONFIG_PROC_FS +atomic_long_t __bootdata_preserved(direct_pages_count[PG_DIRECT_MAP_MAX]); - pgdp = pgd_offset_k(addr); - if (pgd_none(*pgdp)) - return NULL; - pudp = pud_offset(pgdp, addr); - if (pud_none(*pudp) || pud_large(*pudp)) - return NULL; - pmdp = pmd_offset(pudp, addr); - if (pmd_none(*pmdp) || pmd_large(*pmdp)) - return NULL; - ptep = pte_offset_kernel(pmdp, addr); - if (pte_none(*ptep)) - return NULL; - return ptep; +void arch_report_meminfo(struct seq_file *m) +{ + seq_printf(m, "DirectMap4k: %8lu kB\n", + atomic_long_read(&direct_pages_count[PG_DIRECT_MAP_4K]) << 2); + seq_printf(m, "DirectMap1M: %8lu kB\n", + atomic_long_read(&direct_pages_count[PG_DIRECT_MAP_1M]) << 10); + seq_printf(m, "DirectMap2G: %8lu kB\n", + atomic_long_read(&direct_pages_count[PG_DIRECT_MAP_2G]) << 21); } +#endif /* CONFIG_PROC_FS */ -static void change_page_attr(unsigned long addr, int numpages, - pte_t (*set) (pte_t)) +static void pgt_set(unsigned long *old, unsigned long new, unsigned long addr, + unsigned long dtt) { - pte_t *ptep, pte; - int i; + unsigned long *table, mask; - for (i = 0; i < numpages; i++) { - ptep = walk_page_table(addr); - if (WARN_ON_ONCE(!ptep)) + mask = 0; + if (cpu_has_edat2()) { + switch (dtt) { + case CRDTE_DTT_REGION3: + mask = ~(PTRS_PER_PUD * sizeof(pud_t) - 1); break; - pte = *ptep; - pte = set(pte); - __ptep_ipte(addr, ptep); - *ptep = pte; - addr += PAGE_SIZE; + case CRDTE_DTT_SEGMENT: + mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1); + break; + case CRDTE_DTT_PAGE: + mask = ~(PTRS_PER_PTE * sizeof(pte_t) - 1); + break; + } + table = (unsigned long *)((unsigned long)old & mask); + crdte(*old, new, table, dtt, addr, get_lowcore()->kernel_asce.val); + } else { + cspg(old, *old, new); } } -int set_memory_ro(unsigned long addr, int numpages) +static int walk_pte_level(pmd_t *pmdp, unsigned long addr, unsigned long end, + unsigned long flags) { - change_page_attr(addr, numpages, pte_wrprotect); + pte_t *ptep, new; + + if (flags == SET_MEMORY_4K) + return 0; + ptep = pte_offset_kernel(pmdp, addr); + do { + new = *ptep; + if (pte_none(new)) + return -EINVAL; + if (flags & SET_MEMORY_RO) + new = pte_wrprotect(new); + else 
if (flags & SET_MEMORY_RW) + new = pte_mkwrite_novma(pte_mkdirty(new)); + if (flags & SET_MEMORY_NX) + new = set_pte_bit(new, __pgprot(_PAGE_NOEXEC)); + else if (flags & SET_MEMORY_X) + new = clear_pte_bit(new, __pgprot(_PAGE_NOEXEC)); + if (flags & SET_MEMORY_INV) { + new = set_pte_bit(new, __pgprot(_PAGE_INVALID)); + } else if (flags & SET_MEMORY_DEF) { + new = __pte(pte_val(new) & PAGE_MASK); + new = set_pte_bit(new, PAGE_KERNEL); + } + pgt_set((unsigned long *)ptep, pte_val(new), addr, CRDTE_DTT_PAGE); + ptep++; + addr += PAGE_SIZE; + cond_resched(); + } while (addr < end); return 0; } -int set_memory_rw(unsigned long addr, int numpages) +static int split_pmd_page(pmd_t *pmdp, unsigned long addr) { - change_page_attr(addr, numpages, pte_mkwrite); + unsigned long pte_addr, prot; + pte_t *pt_dir, *ptep; + pmd_t new; + int i, ro, nx; + + pt_dir = vmem_pte_alloc(); + if (!pt_dir) + return -ENOMEM; + pte_addr = pmd_pfn(*pmdp) << PAGE_SHIFT; + ro = !!(pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT); + nx = !!(pmd_val(*pmdp) & _SEGMENT_ENTRY_NOEXEC); + prot = pgprot_val(ro ? PAGE_KERNEL_RO : PAGE_KERNEL); + if (!nx) + prot &= ~_PAGE_NOEXEC; + ptep = pt_dir; + for (i = 0; i < PTRS_PER_PTE; i++) { + set_pte(ptep, __pte(pte_addr | prot)); + pte_addr += PAGE_SIZE; + ptep++; + } + new = __pmd(__pa(pt_dir) | _SEGMENT_ENTRY); + pgt_set((unsigned long *)pmdp, pmd_val(new), addr, CRDTE_DTT_SEGMENT); + update_page_count(PG_DIRECT_MAP_4K, PTRS_PER_PTE); + update_page_count(PG_DIRECT_MAP_1M, -1); return 0; } -/* not possible */ -int set_memory_nx(unsigned long addr, int numpages) +static void modify_pmd_page(pmd_t *pmdp, unsigned long addr, + unsigned long flags) { - return 0; + pmd_t new = *pmdp; + + if (flags & SET_MEMORY_RO) + new = pmd_wrprotect(new); + else if (flags & SET_MEMORY_RW) + new = pmd_mkwrite_novma(pmd_mkdirty(new)); + if (flags & SET_MEMORY_NX) + new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_NOEXEC)); + else if (flags & SET_MEMORY_X) + new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_NOEXEC)); + if (flags & SET_MEMORY_INV) { + new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID)); + } else if (flags & SET_MEMORY_DEF) { + new = __pmd(pmd_val(new) & PMD_MASK); + new = set_pmd_bit(new, SEGMENT_KERNEL); + } + pgt_set((unsigned long *)pmdp, pmd_val(new), addr, CRDTE_DTT_SEGMENT); +} + +static int walk_pmd_level(pud_t *pudp, unsigned long addr, unsigned long end, + unsigned long flags) +{ + unsigned long next; + int need_split; + pmd_t *pmdp; + int rc = 0; + + pmdp = pmd_offset(pudp, addr); + do { + if (pmd_none(*pmdp)) + return -EINVAL; + next = pmd_addr_end(addr, end); + if (pmd_leaf(*pmdp)) { + need_split = !!(flags & SET_MEMORY_4K); + need_split |= !!(addr & ~PMD_MASK); + need_split |= !!(addr + PMD_SIZE > next); + if (need_split) { + rc = split_pmd_page(pmdp, addr); + if (rc) + return rc; + continue; + } + modify_pmd_page(pmdp, addr, flags); + } else { + rc = walk_pte_level(pmdp, addr, next, flags); + if (rc) + return rc; + } + pmdp++; + addr = next; + cond_resched(); + } while (addr < end); + return rc; } -int set_memory_x(unsigned long addr, int numpages) +static int split_pud_page(pud_t *pudp, unsigned long addr) { + unsigned long pmd_addr, prot; + pmd_t *pm_dir, *pmdp; + pud_t new; + int i, ro, nx; + + pm_dir = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY); + if (!pm_dir) + return -ENOMEM; + pmd_addr = pud_pfn(*pudp) << PAGE_SHIFT; + ro = !!(pud_val(*pudp) & _REGION_ENTRY_PROTECT); + nx = !!(pud_val(*pudp) & _REGION_ENTRY_NOEXEC); + prot = pgprot_val(ro ? 
SEGMENT_KERNEL_RO : SEGMENT_KERNEL); + if (!nx) + prot &= ~_SEGMENT_ENTRY_NOEXEC; + pmdp = pm_dir; + for (i = 0; i < PTRS_PER_PMD; i++) { + set_pmd(pmdp, __pmd(pmd_addr | prot)); + pmd_addr += PMD_SIZE; + pmdp++; + } + new = __pud(__pa(pm_dir) | _REGION3_ENTRY); + pgt_set((unsigned long *)pudp, pud_val(new), addr, CRDTE_DTT_REGION3); + update_page_count(PG_DIRECT_MAP_1M, PTRS_PER_PMD); + update_page_count(PG_DIRECT_MAP_2G, -1); return 0; } -#ifdef CONFIG_DEBUG_PAGEALLOC -void kernel_map_pages(struct page *page, int numpages, int enable) +static void modify_pud_page(pud_t *pudp, unsigned long addr, + unsigned long flags) { - unsigned long address; - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - int i; + pud_t new = *pudp; - for (i = 0; i < numpages; i++) { - address = page_to_phys(page + i); - pgd = pgd_offset_k(address); - pud = pud_offset(pgd, address); - pmd = pmd_offset(pud, address); - pte = pte_offset_kernel(pmd, address); - if (!enable) { - __ptep_ipte(address, pte); - pte_val(*pte) = _PAGE_TYPE_EMPTY; - continue; + if (flags & SET_MEMORY_RO) + new = pud_wrprotect(new); + else if (flags & SET_MEMORY_RW) + new = pud_mkwrite(pud_mkdirty(new)); + if (flags & SET_MEMORY_NX) + new = set_pud_bit(new, __pgprot(_REGION_ENTRY_NOEXEC)); + else if (flags & SET_MEMORY_X) + new = clear_pud_bit(new, __pgprot(_REGION_ENTRY_NOEXEC)); + if (flags & SET_MEMORY_INV) { + new = set_pud_bit(new, __pgprot(_REGION_ENTRY_INVALID)); + } else if (flags & SET_MEMORY_DEF) { + new = __pud(pud_val(new) & PUD_MASK); + new = set_pud_bit(new, REGION3_KERNEL); + } + pgt_set((unsigned long *)pudp, pud_val(new), addr, CRDTE_DTT_REGION3); +} + +static int walk_pud_level(p4d_t *p4d, unsigned long addr, unsigned long end, + unsigned long flags) +{ + unsigned long next; + int need_split; + pud_t *pudp; + int rc = 0; + + pudp = pud_offset(p4d, addr); + do { + if (pud_none(*pudp)) + return -EINVAL; + next = pud_addr_end(addr, end); + if (pud_leaf(*pudp)) { + need_split = !!(flags & SET_MEMORY_4K); + need_split |= !!(addr & ~PUD_MASK); + need_split |= !!(addr + PUD_SIZE > next); + if (need_split) { + rc = split_pud_page(pudp, addr); + if (rc) + break; + continue; + } + modify_pud_page(pudp, addr, flags); + } else { + rc = walk_pmd_level(pudp, addr, next, flags); } - pte_val(*pte) = __pa(address); + pudp++; + addr = next; + cond_resched(); + } while (addr < end && !rc); + return rc; +} + +static int walk_p4d_level(pgd_t *pgd, unsigned long addr, unsigned long end, + unsigned long flags) +{ + unsigned long next; + p4d_t *p4dp; + int rc = 0; + + p4dp = p4d_offset(pgd, addr); + do { + if (p4d_none(*p4dp)) + return -EINVAL; + next = p4d_addr_end(addr, end); + rc = walk_pud_level(p4dp, addr, next, flags); + p4dp++; + addr = next; + cond_resched(); + } while (addr < end && !rc); + return rc; +} + +DEFINE_MUTEX(cpa_mutex); + +static int change_page_attr(unsigned long addr, unsigned long end, + unsigned long flags) +{ + unsigned long next; + int rc = -EINVAL; + pgd_t *pgdp; + + pgdp = pgd_offset_k(addr); + do { + if (pgd_none(*pgdp)) + break; + next = pgd_addr_end(addr, end); + rc = walk_p4d_level(pgdp, addr, next, flags); + if (rc) + break; + cond_resched(); + } while (pgdp++, addr = next, addr < end && !rc); + return rc; +} + +static int change_page_attr_alias(unsigned long addr, unsigned long end, + unsigned long flags) +{ + unsigned long alias, offset, va_start, va_end; + struct vm_struct *area; + int rc = 0; + + /* + * Changes to read-only permissions on kernel VA mappings are also + * applied to the kernel direct 
mapping. Execute permissions are + * intentionally not transferred to keep all allocated pages within + * the direct mapping non-executable. + */ + flags &= SET_MEMORY_RO | SET_MEMORY_RW; + if (!flags) + return 0; + area = NULL; + while (addr < end) { + if (!area) + area = find_vm_area((void *)addr); + if (!area || !(area->flags & VM_ALLOC)) + return 0; + va_start = (unsigned long)area->addr; + va_end = va_start + area->nr_pages * PAGE_SIZE; + offset = (addr - va_start) >> PAGE_SHIFT; + alias = (unsigned long)page_address(area->pages[offset]); + rc = change_page_attr(alias, alias + PAGE_SIZE, flags); + if (rc) + break; + addr += PAGE_SIZE; + if (addr >= va_end) + area = NULL; } + return rc; +} + +int __set_memory(unsigned long addr, unsigned long numpages, unsigned long flags) +{ + unsigned long end; + int rc; + + if (!cpu_has_nx()) + flags &= ~(SET_MEMORY_NX | SET_MEMORY_X); + if (!flags) + return 0; + if (!numpages) + return 0; + addr &= PAGE_MASK; + end = addr + numpages * PAGE_SIZE; + mutex_lock(&cpa_mutex); + rc = change_page_attr(addr, end, flags); + if (rc) + goto out; + rc = change_page_attr_alias(addr, end, flags); +out: + mutex_unlock(&cpa_mutex); + return rc; +} + +int set_direct_map_invalid_noflush(struct page *page) +{ + return __set_memory((unsigned long)page_to_virt(page), 1, SET_MEMORY_INV); +} + +int set_direct_map_default_noflush(struct page *page) +{ + return __set_memory((unsigned long)page_to_virt(page), 1, SET_MEMORY_DEF); +} + +int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid) +{ + unsigned long flags; + + if (valid) + flags = SET_MEMORY_DEF; + else + flags = SET_MEMORY_INV; + + return __set_memory((unsigned long)page_to_virt(page), nr, flags); } -#ifdef CONFIG_HIBERNATION bool kernel_page_present(struct page *page) { unsigned long addr; - int cc; + unsigned int cc; - addr = page_to_phys(page); + addr = (unsigned long)page_address(page); asm volatile( - " lra %1,0(%1)\n" - " ipm %0\n" - " srl %0,28" - : "=d" (cc), "+a" (addr) : : "cc"); - return cc == 0; + " lra %[addr],0(%[addr])\n" + CC_IPM(cc) + : CC_OUT(cc, cc), [addr] "+a" (addr) + : + : CC_CLOBBER); + return CC_TRANSFORM(cc) == 0; +} + +#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE) + +static void ipte_range(pte_t *pte, unsigned long address, int nr) +{ + int i; + + if (test_facility(13)) { + __ptep_ipte_range(address, nr - 1, pte, IPTE_GLOBAL); + return; + } + for (i = 0; i < nr; i++) { + __ptep_ipte(address, pte, 0, 0, IPTE_GLOBAL); + address += PAGE_SIZE; + pte++; + } +} + +void __kernel_map_pages(struct page *page, int numpages, int enable) +{ + unsigned long address; + pte_t *ptep, pte; + int nr, i, j; + + for (i = 0; i < numpages;) { + address = (unsigned long)page_to_virt(page + i); + ptep = virt_to_kpte(address); + nr = (unsigned long)ptep >> ilog2(sizeof(long)); + nr = PTRS_PER_PTE - (nr & (PTRS_PER_PTE - 1)); + nr = min(numpages - i, nr); + if (enable) { + for (j = 0; j < nr; j++) { + pte = clear_pte_bit(*ptep, __pgprot(_PAGE_INVALID)); + set_pte(ptep, pte); + address += PAGE_SIZE; + ptep++; + } + } else { + ipte_range(ptep, address, nr); + } + i += nr; + } } -#endif /* CONFIG_HIBERNATION */ #endif /* CONFIG_DEBUG_PAGEALLOC */ diff --git a/arch/s390/mm/pfault.c b/arch/s390/mm/pfault.c new file mode 100644 index 000000000000..2f829448c719 --- /dev/null +++ b/arch/s390/mm/pfault.c @@ -0,0 +1,248 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 
1999, 2023 + */ + +#include <linux/cpuhotplug.h> +#include <linux/sched/task.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/irq.h> +#include <asm/asm-extable.h> +#include <asm/asm-offsets.h> +#include <asm/pfault.h> +#include <asm/diag.h> + +#define __SUBCODE_MASK 0x0600 +#define __PF_RES_FIELD 0x8000000000000000UL + +/* + * 'pfault' pseudo page faults routines. + */ +static int pfault_disable; + +static int __init nopfault(char *str) +{ + pfault_disable = 1; + return 1; +} +early_param("nopfault", nopfault); + +struct pfault_refbk { + u16 refdiagc; + u16 reffcode; + u16 refdwlen; + u16 refversn; + u64 refgaddr; + u64 refselmk; + u64 refcmpmk; + u64 reserved; +}; + +static struct pfault_refbk pfault_init_refbk = { + .refdiagc = 0x258, + .reffcode = 0, + .refdwlen = 5, + .refversn = 2, + .refgaddr = __LC_LPP, + .refselmk = 1UL << 48, + .refcmpmk = 1UL << 48, + .reserved = __PF_RES_FIELD +}; + +int __pfault_init(void) +{ + int rc = -EOPNOTSUPP; + + if (pfault_disable) + return rc; + diag_stat_inc(DIAG_STAT_X258); + asm_inline volatile( + " diag %[refbk],%[rc],0x258\n" + "0: nopr %%r7\n" + EX_TABLE(0b, 0b) + : [rc] "+d" (rc) + : [refbk] "a" (&pfault_init_refbk), "m" (pfault_init_refbk) + : "cc"); + return rc; +} + +static struct pfault_refbk pfault_fini_refbk = { + .refdiagc = 0x258, + .reffcode = 1, + .refdwlen = 5, + .refversn = 2, +}; + +void __pfault_fini(void) +{ + if (pfault_disable) + return; + diag_stat_inc(DIAG_STAT_X258); + asm_inline volatile( + " diag %[refbk],0,0x258\n" + "0: nopr %%r7\n" + EX_TABLE(0b, 0b) + : + : [refbk] "a" (&pfault_fini_refbk), "m" (pfault_fini_refbk) + : "cc"); +} + +static DEFINE_SPINLOCK(pfault_lock); +static LIST_HEAD(pfault_list); + +#define PF_COMPLETE 0x0080 + +/* + * The mechanism of our pfault code: if Linux is running as guest, runs a user + * space process and the user space process accesses a page that the host has + * paged out we get a pfault interrupt. + * + * This allows us, within the guest, to schedule a different process. Without + * this mechanism the host would have to suspend the whole virtual cpu until + * the page has been paged in. + * + * So when we get such an interrupt then we set the state of the current task + * to uninterruptible and also set the need_resched flag. Both happens within + * interrupt context(!). If we later on want to return to user space we + * recognize the need_resched flag and then call schedule(). It's not very + * obvious how this works... + * + * Of course we have a lot of additional fun with the completion interrupt (-> + * host signals that a page of a process has been paged in and the process can + * continue to run). This interrupt can arrive on any cpu and, since we have + * virtual cpus, actually appear before the interrupt that signals that a page + * is missing. + */ +static void pfault_interrupt(struct ext_code ext_code, + unsigned int param32, unsigned long param64) +{ + struct task_struct *tsk; + __u16 subcode; + pid_t pid; + + /* + * Get the external interruption subcode & pfault initial/completion + * signal bit. VM stores this in the 'cpu address' field associated + * with the external interrupt. + */ + subcode = ext_code.subcode; + if ((subcode & 0xff00) != __SUBCODE_MASK) + return; + inc_irq_stat(IRQEXT_PFL); + /* Get the token (= pid of the affected task). 
*/ + pid = param64 & LPP_PID_MASK; + rcu_read_lock(); + tsk = find_task_by_pid_ns(pid, &init_pid_ns); + if (tsk) + get_task_struct(tsk); + rcu_read_unlock(); + if (!tsk) + return; + spin_lock(&pfault_lock); + if (subcode & PF_COMPLETE) { + /* signal bit is set -> a page has been swapped in by VM */ + if (tsk->thread.pfault_wait == 1) { + /* + * Initial interrupt was faster than the completion + * interrupt. pfault_wait is valid. Set pfault_wait + * back to zero and wake up the process. This can + * safely be done because the task is still sleeping + * and can't produce new pfaults. + */ + tsk->thread.pfault_wait = 0; + list_del(&tsk->thread.list); + wake_up_process(tsk); + put_task_struct(tsk); + } else { + /* + * Completion interrupt was faster than initial + * interrupt. Set pfault_wait to -1 so the initial + * interrupt doesn't put the task to sleep. + * If the task is not running, ignore the completion + * interrupt since it must be a leftover of a PFAULT + * CANCEL operation which didn't remove all pending + * completion interrupts. + */ + if (task_is_running(tsk)) + tsk->thread.pfault_wait = -1; + } + } else { + /* signal bit not set -> a real page is missing. */ + if (WARN_ON_ONCE(tsk != current)) + goto out; + if (tsk->thread.pfault_wait == 1) { + /* Already on the list with a reference: put to sleep */ + goto block; + } else if (tsk->thread.pfault_wait == -1) { + /* + * Completion interrupt was faster than the initial + * interrupt (pfault_wait == -1). Set pfault_wait + * back to zero and exit. + */ + tsk->thread.pfault_wait = 0; + } else { + /* + * Initial interrupt arrived before completion + * interrupt. Let the task sleep. + * An extra task reference is needed since a different + * cpu may set the task state to TASK_RUNNING again + * before the scheduler is reached. + */ + get_task_struct(tsk); + tsk->thread.pfault_wait = 1; + list_add(&tsk->thread.list, &pfault_list); +block: + /* + * Since this must be a userspace fault, there + * is no kernel task state to trample. Rely on the + * return to userspace schedule() to block. + */ + __set_current_state(TASK_UNINTERRUPTIBLE); + set_need_resched_current(); + } + } +out: + spin_unlock(&pfault_lock); + put_task_struct(tsk); +} + +static int pfault_cpu_dead(unsigned int cpu) +{ + struct thread_struct *thread, *next; + struct task_struct *tsk; + + spin_lock_irq(&pfault_lock); + list_for_each_entry_safe(thread, next, &pfault_list, list) { + thread->pfault_wait = 0; + list_del(&thread->list); + tsk = container_of(thread, struct task_struct, thread); + wake_up_process(tsk); + put_task_struct(tsk); + } + spin_unlock_irq(&pfault_lock); + return 0; +} + +static int __init pfault_irq_init(void) +{ + int rc; + + rc = register_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt); + if (rc) + goto out_extint; + rc = pfault_init() == 0 ? 0 : -EOPNOTSUPP; + if (rc) + goto out_pfault; + irq_subclass_register(IRQ_SUBCLASS_SERVICE_SIGNAL); + cpuhp_setup_state_nocalls(CPUHP_S390_PFAULT_DEAD, "s390/pfault:dead", + NULL, pfault_cpu_dead); + return 0; + +out_pfault: + unregister_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt); +out_extint: + pfault_disable = 1; + return rc; +} +early_initcall(pfault_irq_init); diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c new file mode 100644 index 000000000000..7df23528c01b --- /dev/null +++ b/arch/s390/mm/pgalloc.c @@ -0,0 +1,481 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Page table allocation functions + * + * Copyright IBM Corp. 
2016 + * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> + */ + +#include <linux/sysctl.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <asm/mmu_context.h> +#include <asm/page-states.h> +#include <asm/pgalloc.h> +#include <asm/tlbflush.h> + +unsigned long *crst_table_alloc_noprof(struct mm_struct *mm) +{ + gfp_t gfp = GFP_KERNEL_ACCOUNT; + struct ptdesc *ptdesc; + unsigned long *table; + + if (mm == &init_mm) + gfp &= ~__GFP_ACCOUNT; + ptdesc = pagetable_alloc_noprof(gfp, CRST_ALLOC_ORDER); + if (!ptdesc) + return NULL; + table = ptdesc_address(ptdesc); + __arch_set_page_dat(table, 1UL << CRST_ALLOC_ORDER); + return table; +} + +void crst_table_free(struct mm_struct *mm, unsigned long *table) +{ + if (!table) + return; + pagetable_free(virt_to_ptdesc(table)); +} + +static void __crst_table_upgrade(void *arg) +{ + struct mm_struct *mm = arg; + struct ctlreg asce; + + /* change all active ASCEs to avoid the creation of new TLBs */ + if (current->active_mm == mm) { + asce.val = mm->context.asce; + get_lowcore()->user_asce = asce; + local_ctl_load(7, &asce); + if (!test_thread_flag(TIF_ASCE_PRIMARY)) + local_ctl_load(1, &asce); + } + __tlb_flush_local(); +} + +int crst_table_upgrade(struct mm_struct *mm, unsigned long end) +{ + unsigned long *pgd = NULL, *p4d = NULL, *__pgd; + unsigned long asce_limit = mm->context.asce_limit; + + mmap_assert_write_locked(mm); + + /* upgrade should only happen from 3 to 4, 3 to 5, or 4 to 5 levels */ + VM_BUG_ON(asce_limit < _REGION2_SIZE); + + if (end <= asce_limit) + return 0; + + if (asce_limit == _REGION2_SIZE) { + p4d = crst_table_alloc(mm); + if (unlikely(!p4d)) + goto err_p4d; + crst_table_init(p4d, _REGION2_ENTRY_EMPTY); + pagetable_p4d_ctor(virt_to_ptdesc(p4d)); + } + if (end > _REGION1_SIZE) { + pgd = crst_table_alloc(mm); + if (unlikely(!pgd)) + goto err_pgd; + crst_table_init(pgd, _REGION1_ENTRY_EMPTY); + pagetable_pgd_ctor(virt_to_ptdesc(pgd)); + } + + spin_lock_bh(&mm->page_table_lock); + + if (p4d) { + __pgd = (unsigned long *) mm->pgd; + p4d_populate(mm, (p4d_t *) p4d, (pud_t *) __pgd); + mm->pgd = (pgd_t *) p4d; + mm->context.asce_limit = _REGION1_SIZE; + mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | + _ASCE_USER_BITS | _ASCE_TYPE_REGION2; + mm_inc_nr_puds(mm); + } + if (pgd) { + __pgd = (unsigned long *) mm->pgd; + pgd_populate(mm, (pgd_t *) pgd, (p4d_t *) __pgd); + mm->pgd = (pgd_t *) pgd; + mm->context.asce_limit = TASK_SIZE_MAX; + mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | + _ASCE_USER_BITS | _ASCE_TYPE_REGION1; + } + + spin_unlock_bh(&mm->page_table_lock); + + on_each_cpu(__crst_table_upgrade, mm, 0); + + return 0; + +err_pgd: + pagetable_dtor(virt_to_ptdesc(p4d)); + crst_table_free(mm, p4d); +err_p4d: + return -ENOMEM; +} + +#ifdef CONFIG_PGSTE + +struct ptdesc *page_table_alloc_pgste_noprof(struct mm_struct *mm) +{ + struct ptdesc *ptdesc; + u64 *table; + + ptdesc = pagetable_alloc_noprof(GFP_KERNEL_ACCOUNT, 0); + if (ptdesc) { + table = (u64 *)ptdesc_address(ptdesc); + __arch_set_page_dat(table, 1); + memset64(table, _PAGE_INVALID, PTRS_PER_PTE); + memset64(table + PTRS_PER_PTE, 0, PTRS_PER_PTE); + } + return ptdesc; +} + +void page_table_free_pgste(struct ptdesc *ptdesc) +{ + pagetable_free(ptdesc); +} + +#endif /* CONFIG_PGSTE */ + +unsigned long *page_table_alloc_noprof(struct mm_struct *mm) +{ + gfp_t gfp = GFP_KERNEL_ACCOUNT; + struct ptdesc *ptdesc; + unsigned long *table; + + if (mm == &init_mm) + gfp &= ~__GFP_ACCOUNT; + ptdesc = pagetable_alloc_noprof(gfp, 0); + if (!ptdesc) + return 
NULL; + if (!pagetable_pte_ctor(mm, ptdesc)) { + pagetable_free(ptdesc); + return NULL; + } + table = ptdesc_address(ptdesc); + __arch_set_page_dat(table, 1); + memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE); + memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE); + return table; +} + +void page_table_free(struct mm_struct *mm, unsigned long *table) +{ + struct ptdesc *ptdesc = virt_to_ptdesc(table); + + if (pagetable_is_reserved(ptdesc)) + return free_reserved_ptdesc(ptdesc); + pagetable_dtor_free(ptdesc); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static void pte_free_now(struct rcu_head *head) +{ + struct ptdesc *ptdesc = container_of(head, struct ptdesc, pt_rcu_head); + + pagetable_dtor_free(ptdesc); +} + +void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable) +{ + struct ptdesc *ptdesc = virt_to_ptdesc(pgtable); + + call_rcu(&ptdesc->pt_rcu_head, pte_free_now); +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +/* + * Base infrastructure required to generate basic asces, region, segment, + * and page tables that do not make use of enhanced features like EDAT1. + */ + +static struct kmem_cache *base_pgt_cache; + +static unsigned long *base_pgt_alloc(void) +{ + unsigned long *table; + + table = kmem_cache_alloc(base_pgt_cache, GFP_KERNEL); + if (table) + memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE); + return table; +} + +static void base_pgt_free(unsigned long *table) +{ + kmem_cache_free(base_pgt_cache, table); +} + +static unsigned long *base_crst_alloc(unsigned long val) +{ + unsigned long *table; + struct ptdesc *ptdesc; + + ptdesc = pagetable_alloc(GFP_KERNEL, CRST_ALLOC_ORDER); + if (!ptdesc) + return NULL; + table = ptdesc_address(ptdesc); + crst_table_init(table, val); + return table; +} + +static void base_crst_free(unsigned long *table) +{ + if (!table) + return; + pagetable_free(virt_to_ptdesc(table)); +} + +#define BASE_ADDR_END_FUNC(NAME, SIZE) \ +static inline unsigned long base_##NAME##_addr_end(unsigned long addr, \ + unsigned long end) \ +{ \ + unsigned long next = (addr + (SIZE)) & ~((SIZE) - 1); \ + \ + return (next - 1) < (end - 1) ? 
next : end; \ +} + +BASE_ADDR_END_FUNC(page, PAGE_SIZE) +BASE_ADDR_END_FUNC(segment, _SEGMENT_SIZE) +BASE_ADDR_END_FUNC(region3, _REGION3_SIZE) +BASE_ADDR_END_FUNC(region2, _REGION2_SIZE) +BASE_ADDR_END_FUNC(region1, _REGION1_SIZE) + +static inline unsigned long base_lra(unsigned long address) +{ + unsigned long real; + + asm volatile( + " lra %0,0(%1)" + : "=d" (real) : "a" (address) : "cc"); + return real; +} + +static int base_page_walk(unsigned long *origin, unsigned long addr, + unsigned long end, int alloc) +{ + unsigned long *pte, next; + + if (!alloc) + return 0; + pte = origin; + pte += (addr & _PAGE_INDEX) >> PAGE_SHIFT; + do { + next = base_page_addr_end(addr, end); + *pte = base_lra(addr); + } while (pte++, addr = next, addr < end); + return 0; +} + +static int base_segment_walk(unsigned long *origin, unsigned long addr, + unsigned long end, int alloc) +{ + unsigned long *ste, next, *table; + int rc; + + ste = origin; + ste += (addr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; + do { + next = base_segment_addr_end(addr, end); + if (*ste & _SEGMENT_ENTRY_INVALID) { + if (!alloc) + continue; + table = base_pgt_alloc(); + if (!table) + return -ENOMEM; + *ste = __pa(table) | _SEGMENT_ENTRY; + } + table = __va(*ste & _SEGMENT_ENTRY_ORIGIN); + rc = base_page_walk(table, addr, next, alloc); + if (rc) + return rc; + if (!alloc) + base_pgt_free(table); + cond_resched(); + } while (ste++, addr = next, addr < end); + return 0; +} + +static int base_region3_walk(unsigned long *origin, unsigned long addr, + unsigned long end, int alloc) +{ + unsigned long *rtte, next, *table; + int rc; + + rtte = origin; + rtte += (addr & _REGION3_INDEX) >> _REGION3_SHIFT; + do { + next = base_region3_addr_end(addr, end); + if (*rtte & _REGION_ENTRY_INVALID) { + if (!alloc) + continue; + table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY); + if (!table) + return -ENOMEM; + *rtte = __pa(table) | _REGION3_ENTRY; + } + table = __va(*rtte & _REGION_ENTRY_ORIGIN); + rc = base_segment_walk(table, addr, next, alloc); + if (rc) + return rc; + if (!alloc) + base_crst_free(table); + } while (rtte++, addr = next, addr < end); + return 0; +} + +static int base_region2_walk(unsigned long *origin, unsigned long addr, + unsigned long end, int alloc) +{ + unsigned long *rste, next, *table; + int rc; + + rste = origin; + rste += (addr & _REGION2_INDEX) >> _REGION2_SHIFT; + do { + next = base_region2_addr_end(addr, end); + if (*rste & _REGION_ENTRY_INVALID) { + if (!alloc) + continue; + table = base_crst_alloc(_REGION3_ENTRY_EMPTY); + if (!table) + return -ENOMEM; + *rste = __pa(table) | _REGION2_ENTRY; + } + table = __va(*rste & _REGION_ENTRY_ORIGIN); + rc = base_region3_walk(table, addr, next, alloc); + if (rc) + return rc; + if (!alloc) + base_crst_free(table); + } while (rste++, addr = next, addr < end); + return 0; +} + +static int base_region1_walk(unsigned long *origin, unsigned long addr, + unsigned long end, int alloc) +{ + unsigned long *rfte, next, *table; + int rc; + + rfte = origin; + rfte += (addr & _REGION1_INDEX) >> _REGION1_SHIFT; + do { + next = base_region1_addr_end(addr, end); + if (*rfte & _REGION_ENTRY_INVALID) { + if (!alloc) + continue; + table = base_crst_alloc(_REGION2_ENTRY_EMPTY); + if (!table) + return -ENOMEM; + *rfte = __pa(table) | _REGION1_ENTRY; + } + table = __va(*rfte & _REGION_ENTRY_ORIGIN); + rc = base_region2_walk(table, addr, next, alloc); + if (rc) + return rc; + if (!alloc) + base_crst_free(table); + } while (rfte++, addr = next, addr < end); + return 0; +} + +/** + * base_asce_free - free asce 
and tables returned from base_asce_alloc() + * @asce: asce to be freed + * + * Frees all region, segment, and page tables that were allocated with a + * corresponding base_asce_alloc() call. + */ +void base_asce_free(unsigned long asce) +{ + unsigned long *table = __va(asce & _ASCE_ORIGIN); + + if (!asce) + return; + switch (asce & _ASCE_TYPE_MASK) { + case _ASCE_TYPE_SEGMENT: + base_segment_walk(table, 0, _REGION3_SIZE, 0); + break; + case _ASCE_TYPE_REGION3: + base_region3_walk(table, 0, _REGION2_SIZE, 0); + break; + case _ASCE_TYPE_REGION2: + base_region2_walk(table, 0, _REGION1_SIZE, 0); + break; + case _ASCE_TYPE_REGION1: + base_region1_walk(table, 0, TASK_SIZE_MAX, 0); + break; + } + base_crst_free(table); +} + +static int base_pgt_cache_init(void) +{ + static DEFINE_MUTEX(base_pgt_cache_mutex); + unsigned long sz = _PAGE_TABLE_SIZE; + + if (base_pgt_cache) + return 0; + mutex_lock(&base_pgt_cache_mutex); + if (!base_pgt_cache) + base_pgt_cache = kmem_cache_create("base_pgt", sz, sz, 0, NULL); + mutex_unlock(&base_pgt_cache_mutex); + return base_pgt_cache ? 0 : -ENOMEM; +} + +/** + * base_asce_alloc - create kernel mapping without enhanced DAT features + * @addr: virtual start address of kernel mapping + * @num_pages: number of consecutive pages + * + * Generate an asce, including all required region, segment and page tables, + * that can be used to access the virtual kernel mapping. The difference is + * that the returned asce does not make use of any enhanced DAT features like + * e.g. large pages. This is required for some I/O functions that pass an + * asce, like e.g. some service call requests. + * + * Note: the returned asce may NEVER be attached to any cpu. It may only be + * used for I/O requests. tlb entries that might result because the + * asce was attached to a cpu won't be cleared. + */ +unsigned long base_asce_alloc(unsigned long addr, unsigned long num_pages) +{ + unsigned long asce, *table, end; + int rc; + + if (base_pgt_cache_init()) + return 0; + end = addr + num_pages * PAGE_SIZE; + if (end <= _REGION3_SIZE) { + table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY); + if (!table) + return 0; + rc = base_segment_walk(table, addr, end, 1); + asce = __pa(table) | _ASCE_TYPE_SEGMENT | _ASCE_TABLE_LENGTH; + } else if (end <= _REGION2_SIZE) { + table = base_crst_alloc(_REGION3_ENTRY_EMPTY); + if (!table) + return 0; + rc = base_region3_walk(table, addr, end, 1); + asce = __pa(table) | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH; + } else if (end <= _REGION1_SIZE) { + table = base_crst_alloc(_REGION2_ENTRY_EMPTY); + if (!table) + return 0; + rc = base_region2_walk(table, addr, end, 1); + asce = __pa(table) | _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH; + } else { + table = base_crst_alloc(_REGION1_ENTRY_EMPTY); + if (!table) + return 0; + rc = base_region1_walk(table, addr, end, 1); + asce = __pa(table) | _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH; + } + if (rc) { + base_asce_free(asce); + asce = 0; + } + return asce; +} diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index a8154a1a2c94..666adcd681ab 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -1,8 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright IBM Corp. 
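To make the calling pattern described in the base_asce_alloc()/base_asce_free() comments concrete, here is a hedged sketch (not part of the patch); the buffer and the service-call wrapper are hypothetical placeholders.

/*
 * Sketch only, not from this patch: build a non-EDAT asce over an existing
 * kernel buffer, hand it to an I/O service call, then free it again.
 * "buf" and "issue_service_call()" are hypothetical.
 */
static int do_base_asce_request(void *buf, unsigned long num_pages)
{
	unsigned long asce;
	int rc;

	asce = base_asce_alloc((unsigned long)buf, num_pages);
	if (!asce)
		return -ENOMEM;
	rc = issue_service_call(asce, (unsigned long)buf);
	base_asce_free(asce);	/* tears down all region/segment/page tables */
	return rc;
}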
2007, 2011 * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> */ +#include <linux/cpufeature.h> +#include <linux/export.h> #include <linux/sched.h> #include <linux/kernel.h> #include <linux/errno.h> @@ -10,1197 +13,1143 @@ #include <linux/mm.h> #include <linux/swap.h> #include <linux/smp.h> -#include <linux/highmem.h> -#include <linux/pagemap.h> #include <linux/spinlock.h> -#include <linux/module.h> -#include <linux/quicklist.h> #include <linux/rcupdate.h> #include <linux/slab.h> +#include <linux/leafops.h> +#include <linux/sysctl.h> +#include <linux/ksm.h> +#include <linux/mman.h> -#include <asm/pgtable.h> -#include <asm/pgalloc.h> -#include <asm/tlb.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> +#include <asm/page-states.h> +#include <asm/pgtable.h> +#include <asm/machine.h> -#ifndef CONFIG_64BIT -#define ALLOC_ORDER 1 -#define FRAG_MASK 0x0f -#else -#define ALLOC_ORDER 2 -#define FRAG_MASK 0x03 -#endif - - -unsigned long *crst_table_alloc(struct mm_struct *mm) +pgprot_t pgprot_writecombine(pgprot_t prot) { - struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER); - - if (!page) - return NULL; - return (unsigned long *) page_to_phys(page); + /* + * mio_wb_bit_mask may be set on a different CPU, but it is only set + * once at init and only read afterwards. + */ + return __pgprot(pgprot_val(prot) | mio_wb_bit_mask); } +EXPORT_SYMBOL_GPL(pgprot_writecombine); -void crst_table_free(struct mm_struct *mm, unsigned long *table) +static inline void ptep_ipte_local(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, int nodat) { - free_pages((unsigned long) table, ALLOC_ORDER); + unsigned long opt, asce; + + if (machine_has_tlb_guest()) { + opt = 0; + asce = READ_ONCE(mm->context.gmap_asce); + if (asce == 0UL || nodat) + opt |= IPTE_NODAT; + if (asce != -1UL) { + asce = asce ? : mm->context.asce; + opt |= IPTE_GUEST_ASCE; + } + __ptep_ipte(addr, ptep, opt, asce, IPTE_LOCAL); + } else { + __ptep_ipte(addr, ptep, 0, 0, IPTE_LOCAL); + } } -#ifdef CONFIG_64BIT -int crst_table_upgrade(struct mm_struct *mm, unsigned long limit) +static inline void ptep_ipte_global(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, int nodat) { - unsigned long *table, *pgd; - unsigned long entry; - - BUG_ON(limit > (1UL << 53)); -repeat: - table = crst_table_alloc(mm); - if (!table) - return -ENOMEM; - spin_lock_bh(&mm->page_table_lock); - if (mm->context.asce_limit < limit) { - pgd = (unsigned long *) mm->pgd; - if (mm->context.asce_limit <= (1UL << 31)) { - entry = _REGION3_ENTRY_EMPTY; - mm->context.asce_limit = 1UL << 42; - mm->context.asce_bits = _ASCE_TABLE_LENGTH | - _ASCE_USER_BITS | - _ASCE_TYPE_REGION3; - } else { - entry = _REGION2_ENTRY_EMPTY; - mm->context.asce_limit = 1UL << 53; - mm->context.asce_bits = _ASCE_TABLE_LENGTH | - _ASCE_USER_BITS | - _ASCE_TYPE_REGION2; + unsigned long opt, asce; + + if (machine_has_tlb_guest()) { + opt = 0; + asce = READ_ONCE(mm->context.gmap_asce); + if (asce == 0UL || nodat) + opt |= IPTE_NODAT; + if (asce != -1UL) { + asce = asce ? 
: mm->context.asce; + opt |= IPTE_GUEST_ASCE; } - crst_table_init(table, entry); - pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd); - mm->pgd = (pgd_t *) table; - mm->task_size = mm->context.asce_limit; - table = NULL; + __ptep_ipte(addr, ptep, opt, asce, IPTE_GLOBAL); + } else { + __ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL); } - spin_unlock_bh(&mm->page_table_lock); - if (table) - crst_table_free(mm, table); - if (mm->context.asce_limit < limit) - goto repeat; - return 0; } -void crst_table_downgrade(struct mm_struct *mm, unsigned long limit) +static inline pte_t ptep_flush_direct(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + int nodat) { - pgd_t *pgd; + pte_t old; + + old = *ptep; + if (unlikely(pte_val(old) & _PAGE_INVALID)) + return old; + atomic_inc(&mm->context.flush_count); + if (cpu_has_tlb_lc() && + cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) + ptep_ipte_local(mm, addr, ptep, nodat); + else + ptep_ipte_global(mm, addr, ptep, nodat); + atomic_dec(&mm->context.flush_count); + return old; +} - while (mm->context.asce_limit > limit) { - pgd = mm->pgd; - switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) { - case _REGION_ENTRY_TYPE_R2: - mm->context.asce_limit = 1UL << 42; - mm->context.asce_bits = _ASCE_TABLE_LENGTH | - _ASCE_USER_BITS | - _ASCE_TYPE_REGION3; - break; - case _REGION_ENTRY_TYPE_R3: - mm->context.asce_limit = 1UL << 31; - mm->context.asce_bits = _ASCE_TABLE_LENGTH | - _ASCE_USER_BITS | - _ASCE_TYPE_SEGMENT; - break; - default: - BUG(); - } - mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN); - mm->task_size = mm->context.asce_limit; - crst_table_free(mm, (unsigned long *) pgd); - } +static inline pte_t ptep_flush_lazy(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + int nodat) +{ + pte_t old; + + old = *ptep; + if (unlikely(pte_val(old) & _PAGE_INVALID)) + return old; + atomic_inc(&mm->context.flush_count); + if (cpumask_equal(&mm->context.cpu_attach_mask, + cpumask_of(smp_processor_id()))) { + set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_INVALID))); + mm->context.flush_mm = 1; + } else + ptep_ipte_global(mm, addr, ptep, nodat); + atomic_dec(&mm->context.flush_count); + return old; } + +static inline pgste_t pgste_get(pte_t *ptep) +{ + unsigned long pgste = 0; +#ifdef CONFIG_PGSTE + pgste = *(unsigned long *)(ptep + PTRS_PER_PTE); #endif + return __pgste(pgste); +} +static inline void pgste_set(pte_t *ptep, pgste_t pgste) +{ #ifdef CONFIG_PGSTE + *(pgste_t *)(ptep + PTRS_PER_PTE) = pgste; +#endif +} -/** - * gmap_alloc - allocate a guest address space - * @mm: pointer to the parent mm_struct - * - * Returns a guest address space structure. 
- */ -struct gmap *gmap_alloc(struct mm_struct *mm) +static inline pgste_t pgste_update_all(pte_t pte, pgste_t pgste, + struct mm_struct *mm) { - struct gmap *gmap; - struct page *page; - unsigned long *table; - - gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL); - if (!gmap) - goto out; - INIT_LIST_HEAD(&gmap->crst_list); - gmap->mm = mm; - page = alloc_pages(GFP_KERNEL, ALLOC_ORDER); - if (!page) - goto out_free; - list_add(&page->lru, &gmap->crst_list); - table = (unsigned long *) page_to_phys(page); - crst_table_init(table, _REGION1_ENTRY_EMPTY); - gmap->table = table; - gmap->asce = _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH | - _ASCE_USER_BITS | __pa(table); - list_add(&gmap->list, &mm->context.gmap_list); - return gmap; - -out_free: - kfree(gmap); -out: - return NULL; +#ifdef CONFIG_PGSTE + unsigned long address, bits, skey; + + if (!mm_uses_skeys(mm) || pte_val(pte) & _PAGE_INVALID) + return pgste; + address = pte_val(pte) & PAGE_MASK; + skey = (unsigned long) page_get_storage_key(address); + bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED); + /* Transfer page changed & referenced bit to guest bits in pgste */ + pgste = set_pgste_bit(pgste, bits << 48); /* GR bit & GC bit */ + /* Copy page access key and fetch protection bit to pgste */ + pgste = clear_pgste_bit(pgste, PGSTE_ACC_BITS | PGSTE_FP_BIT); + pgste = set_pgste_bit(pgste, (skey & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56); +#endif + return pgste; + } -EXPORT_SYMBOL_GPL(gmap_alloc); -static int gmap_unlink_segment(struct gmap *gmap, unsigned long *table) +static inline void pgste_set_key(pte_t *ptep, pgste_t pgste, pte_t entry, + struct mm_struct *mm) { - struct gmap_pgtable *mp; - struct gmap_rmap *rmap; - struct page *page; +#ifdef CONFIG_PGSTE + unsigned long address; + unsigned long nkey; - if (*table & _SEGMENT_ENTRY_INV) - return 0; - page = pfn_to_page(*table >> PAGE_SHIFT); - mp = (struct gmap_pgtable *) page->index; - list_for_each_entry(rmap, &mp->mapper, list) { - if (rmap->entry != table) - continue; - list_del(&rmap->list); - kfree(rmap); - break; - } - *table = _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | mp->vmaddr; - return 1; + if (!mm_uses_skeys(mm) || pte_val(entry) & _PAGE_INVALID) + return; + VM_BUG_ON(!(pte_val(*ptep) & _PAGE_INVALID)); + address = pte_val(entry) & PAGE_MASK; + /* + * Set page access key and fetch protection bit from pgste. + * The guest C/R information is still in the PGSTE, set real + * key C/R to 0. + */ + nkey = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56; + nkey |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48; + page_set_storage_key(address, nkey, 0); +#endif } -static void gmap_flush_tlb(struct gmap *gmap) +static inline pgste_t pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry) { - if (MACHINE_HAS_IDTE) - __tlb_flush_idte((unsigned long) gmap->table | - _ASCE_TYPE_REGION1); - else - __tlb_flush_global(); +#ifdef CONFIG_PGSTE + if ((pte_val(entry) & _PAGE_PRESENT) && + (pte_val(entry) & _PAGE_WRITE) && + !(pte_val(entry) & _PAGE_INVALID)) { + if (!machine_has_esop()) { + /* + * Without enhanced suppression-on-protection force + * the dirty bit on for all writable ptes. 
+ */ + entry = set_pte_bit(entry, __pgprot(_PAGE_DIRTY)); + entry = clear_pte_bit(entry, __pgprot(_PAGE_PROTECT)); + } + if (!(pte_val(entry) & _PAGE_PROTECT)) + /* This pte allows write access, set user-dirty */ + pgste = set_pgste_bit(pgste, PGSTE_UC_BIT); + } +#endif + set_pte(ptep, entry); + return pgste; } -/** - * gmap_free - free a guest address space - * @gmap: pointer to the guest address space structure - */ -void gmap_free(struct gmap *gmap) +static inline pgste_t pgste_pte_notify(struct mm_struct *mm, + unsigned long addr, + pte_t *ptep, pgste_t pgste) { - struct page *page, *next; - unsigned long *table; - int i; +#ifdef CONFIG_PGSTE + unsigned long bits; + + bits = pgste_val(pgste) & (PGSTE_IN_BIT | PGSTE_VSIE_BIT); + if (bits) { + pgste = __pgste(pgste_val(pgste) ^ bits); + ptep_notify(mm, addr, ptep, bits); + } +#endif + return pgste; +} +static inline pgste_t ptep_xchg_start(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + pgste_t pgste = __pgste(0); - /* Flush tlb. */ - if (MACHINE_HAS_IDTE) - __tlb_flush_idte((unsigned long) gmap->table | - _ASCE_TYPE_REGION1); - else - __tlb_flush_global(); - - /* Free all segment & region tables. */ - down_read(&gmap->mm->mmap_sem); - spin_lock(&gmap->mm->page_table_lock); - list_for_each_entry_safe(page, next, &gmap->crst_list, lru) { - table = (unsigned long *) page_to_phys(page); - if ((*table & _REGION_ENTRY_TYPE_MASK) == 0) - /* Remove gmap rmap structures for segment table. */ - for (i = 0; i < PTRS_PER_PMD; i++, table++) - gmap_unlink_segment(gmap, table); - __free_pages(page, ALLOC_ORDER); + if (mm_has_pgste(mm)) { + pgste = pgste_get_lock(ptep); + pgste = pgste_pte_notify(mm, addr, ptep, pgste); } - spin_unlock(&gmap->mm->page_table_lock); - up_read(&gmap->mm->mmap_sem); - list_del(&gmap->list); - kfree(gmap); + return pgste; } -EXPORT_SYMBOL_GPL(gmap_free); -/** - * gmap_enable - switch primary space to the guest address space - * @gmap: pointer to the guest address space structure - */ -void gmap_enable(struct gmap *gmap) +static inline pte_t ptep_xchg_commit(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + pgste_t pgste, pte_t old, pte_t new) { - S390_lowcore.gmap = (unsigned long) gmap; + if (mm_has_pgste(mm)) { + if (pte_val(old) & _PAGE_INVALID) + pgste_set_key(ptep, pgste, new, mm); + if (pte_val(new) & _PAGE_INVALID) { + pgste = pgste_update_all(old, pgste, mm); + if ((pgste_val(pgste) & _PGSTE_GPS_USAGE_MASK) == + _PGSTE_GPS_USAGE_UNUSED) + old = set_pte_bit(old, __pgprot(_PAGE_UNUSED)); + } + pgste = pgste_set_pte(ptep, pgste, new); + pgste_set_unlock(ptep, pgste); + } else { + set_pte(ptep, new); + } + return old; } -EXPORT_SYMBOL_GPL(gmap_enable); -/** - * gmap_disable - switch back to the standard primary address space - * @gmap: pointer to the guest address space structure - */ -void gmap_disable(struct gmap *gmap) +pte_t ptep_xchg_direct(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t new) { - S390_lowcore.gmap = 0UL; + pgste_t pgste; + pte_t old; + int nodat; + + preempt_disable(); + pgste = ptep_xchg_start(mm, addr, ptep); + nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); + old = ptep_flush_direct(mm, addr, ptep, nodat); + old = ptep_xchg_commit(mm, addr, ptep, pgste, old, new); + preempt_enable(); + return old; } -EXPORT_SYMBOL_GPL(gmap_disable); +EXPORT_SYMBOL(ptep_xchg_direct); /* - * gmap_alloc_table is assumed to be called with mmap_sem held + * Caller must check that new PTE only differs in _PAGE_PROTECT HW bit, so that + * RDP can be used instead of IPTE. 
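A short illustrative sketch (not part of the patch) of exchanging a pte through ptep_xchg_direct(); ptep_xchg_lazy() is the drop-in alternative when a deferred flush is acceptable. The wrapper name is hypothetical and the caller is assumed to hold the page table lock.

/*
 * Sketch only, not from this patch. ptep_xchg_direct() flushes the TLB
 * entry immediately; ptep_xchg_lazy() may defer the flush while only the
 * local CPU has the mm attached.
 */
static pte_t invalidate_one_pte(struct mm_struct *mm, unsigned long addr,
				pte_t *ptep)
{
	/* returns the old pte, including its dirty/referenced state */
	return ptep_xchg_direct(mm, addr, ptep, __pte(_PAGE_INVALID));
}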
See also comments at pte_allow_rdp(). */ -static int gmap_alloc_table(struct gmap *gmap, - unsigned long *table, unsigned long init) +void ptep_reset_dat_prot(struct mm_struct *mm, unsigned long addr, pte_t *ptep, + pte_t new) { - struct page *page; - unsigned long *new; - - /* since we dont free the gmap table until gmap_free we can unlock */ - spin_unlock(&gmap->mm->page_table_lock); - page = alloc_pages(GFP_KERNEL, ALLOC_ORDER); - spin_lock(&gmap->mm->page_table_lock); - if (!page) - return -ENOMEM; - new = (unsigned long *) page_to_phys(page); - crst_table_init(new, init); - if (*table & _REGION_ENTRY_INV) { - list_add(&page->lru, &gmap->crst_list); - *table = (unsigned long) new | _REGION_ENTRY_LENGTH | - (*table & _REGION_ENTRY_TYPE_MASK); - } else - __free_pages(page, ALLOC_ORDER); - return 0; + preempt_disable(); + atomic_inc(&mm->context.flush_count); + if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) + __ptep_rdp(addr, ptep, 1); + else + __ptep_rdp(addr, ptep, 0); + /* + * PTE is not invalidated by RDP, only _PAGE_PROTECT is cleared. That + * means it is still valid and active, and must not be changed according + * to the architecture. But writing a new value that only differs in SW + * bits is allowed. + */ + set_pte(ptep, new); + atomic_dec(&mm->context.flush_count); + preempt_enable(); } +EXPORT_SYMBOL(ptep_reset_dat_prot); -/** - * gmap_unmap_segment - unmap segment from the guest address space - * @gmap: pointer to the guest address space structure - * @addr: address in the guest address space - * @len: length of the memory area to unmap - * - * Returns 0 if the unmap succeded, -EINVAL if not. - */ -int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len) +pte_t ptep_xchg_lazy(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t new) { - unsigned long *table; - unsigned long off; - int flush; + pgste_t pgste; + pte_t old; + int nodat; - if ((to | len) & (PMD_SIZE - 1)) - return -EINVAL; - if (len == 0 || to + len < to) - return -EINVAL; + preempt_disable(); + pgste = ptep_xchg_start(mm, addr, ptep); + nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); + old = ptep_flush_lazy(mm, addr, ptep, nodat); + old = ptep_xchg_commit(mm, addr, ptep, pgste, old, new); + preempt_enable(); + return old; +} +EXPORT_SYMBOL(ptep_xchg_lazy); - flush = 0; - down_read(&gmap->mm->mmap_sem); - spin_lock(&gmap->mm->page_table_lock); - for (off = 0; off < len; off += PMD_SIZE) { - /* Walk the guest addr space page table */ - table = gmap->table + (((to + off) >> 53) & 0x7ff); - if (*table & _REGION_ENTRY_INV) - goto out; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - table = table + (((to + off) >> 42) & 0x7ff); - if (*table & _REGION_ENTRY_INV) - goto out; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - table = table + (((to + off) >> 31) & 0x7ff); - if (*table & _REGION_ENTRY_INV) - goto out; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - table = table + (((to + off) >> 20) & 0x7ff); - - /* Clear segment table entry in guest address space. 
*/ - flush |= gmap_unlink_segment(gmap, table); - *table = _SEGMENT_ENTRY_INV; +pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, + pte_t *ptep) +{ + pgste_t pgste; + pte_t old; + int nodat; + struct mm_struct *mm = vma->vm_mm; + + pgste = ptep_xchg_start(mm, addr, ptep); + nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); + old = ptep_flush_lazy(mm, addr, ptep, nodat); + if (mm_has_pgste(mm)) { + pgste = pgste_update_all(old, pgste, mm); + pgste_set(ptep, pgste); } -out: - spin_unlock(&gmap->mm->page_table_lock); - up_read(&gmap->mm->mmap_sem); - if (flush) - gmap_flush_tlb(gmap); - return 0; + return old; } -EXPORT_SYMBOL_GPL(gmap_unmap_segment); -/** - * gmap_mmap_segment - map a segment to the guest address space - * @gmap: pointer to the guest address space structure - * @from: source address in the parent address space - * @to: target address in the guest address space - * - * Returns 0 if the mmap succeded, -EINVAL or -ENOMEM if not. - */ -int gmap_map_segment(struct gmap *gmap, unsigned long from, - unsigned long to, unsigned long len) +void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, + pte_t *ptep, pte_t old_pte, pte_t pte) { - unsigned long *table; - unsigned long off; - int flush; - - if ((from | to | len) & (PMD_SIZE - 1)) - return -EINVAL; - if (len == 0 || from + len > PGDIR_SIZE || - from + len < from || to + len < to) - return -EINVAL; + pgste_t pgste; + struct mm_struct *mm = vma->vm_mm; - flush = 0; - down_read(&gmap->mm->mmap_sem); - spin_lock(&gmap->mm->page_table_lock); - for (off = 0; off < len; off += PMD_SIZE) { - /* Walk the gmap address space page table */ - table = gmap->table + (((to + off) >> 53) & 0x7ff); - if ((*table & _REGION_ENTRY_INV) && - gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY)) - goto out_unmap; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - table = table + (((to + off) >> 42) & 0x7ff); - if ((*table & _REGION_ENTRY_INV) && - gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY)) - goto out_unmap; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - table = table + (((to + off) >> 31) & 0x7ff); - if ((*table & _REGION_ENTRY_INV) && - gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY)) - goto out_unmap; - table = (unsigned long *) (*table & _REGION_ENTRY_ORIGIN); - table = table + (((to + off) >> 20) & 0x7ff); - - /* Store 'from' address in an invalid segment table entry. 
*/ - flush |= gmap_unlink_segment(gmap, table); - *table = _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | (from + off); + if (mm_has_pgste(mm)) { + pgste = pgste_get(ptep); + pgste_set_key(ptep, pgste, pte, mm); + pgste = pgste_set_pte(ptep, pgste, pte); + pgste_set_unlock(ptep, pgste); + } else { + set_pte(ptep, pte); } - spin_unlock(&gmap->mm->page_table_lock); - up_read(&gmap->mm->mmap_sem); - if (flush) - gmap_flush_tlb(gmap); - return 0; - -out_unmap: - spin_unlock(&gmap->mm->page_table_lock); - up_read(&gmap->mm->mmap_sem); - gmap_unmap_segment(gmap, to, len); - return -ENOMEM; } -EXPORT_SYMBOL_GPL(gmap_map_segment); -static unsigned long *gmap_table_walk(unsigned long address, struct gmap *gmap) +static inline void pmdp_idte_local(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) { - unsigned long *table; - - table = gmap->table + ((address >> 53) & 0x7ff); - if (unlikely(*table & _REGION_ENTRY_INV)) - return ERR_PTR(-EFAULT); - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - table = table + ((address >> 42) & 0x7ff); - if (unlikely(*table & _REGION_ENTRY_INV)) - return ERR_PTR(-EFAULT); - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - table = table + ((address >> 31) & 0x7ff); - if (unlikely(*table & _REGION_ENTRY_INV)) - return ERR_PTR(-EFAULT); - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - table = table + ((address >> 20) & 0x7ff); - return table; + if (machine_has_tlb_guest()) + __pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE, + mm->context.asce, IDTE_LOCAL); + else + __pmdp_idte(addr, pmdp, 0, 0, IDTE_LOCAL); + if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m) + gmap_pmdp_idte_local(mm, addr); } -/** - * __gmap_translate - translate a guest address to a user space address - * @address: guest address - * @gmap: pointer to guest mapping meta data structure - * - * Returns user space address which corresponds to the guest address or - * -EFAULT if no such mapping exists. - * This function does not establish potentially missing page table entries. - * The mmap_sem of the mm that belongs to the address space must be held - * when this function gets called. - */ -unsigned long __gmap_translate(unsigned long address, struct gmap *gmap) +static inline void pmdp_idte_global(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) { - unsigned long *segment_ptr, vmaddr, segment; - struct gmap_pgtable *mp; - struct page *page; - - current->thread.gmap_addr = address; - segment_ptr = gmap_table_walk(address, gmap); - if (IS_ERR(segment_ptr)) - return PTR_ERR(segment_ptr); - /* Convert the gmap address to an mm address. 
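Purely for illustration (not part of the patch), the ptep_modify_prot_start()/_commit() pair defined nearby can be used like this to change protection without losing the pgste bookkeeping; the wrapper name is hypothetical.

/*
 * Sketch only, not from this patch: write-protect a single pte via the
 * start/commit pair. start invalidates and returns the old pte, commit
 * installs the new one.
 */
static void wrprotect_one_pte(struct vm_area_struct *vma, unsigned long addr,
			      pte_t *ptep)
{
	pte_t old, new;

	old = ptep_modify_prot_start(vma, addr, ptep);
	new = pte_wrprotect(old);
	ptep_modify_prot_commit(vma, addr, ptep, old, new);
}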
*/ - segment = *segment_ptr; - if (!(segment & _SEGMENT_ENTRY_INV)) { - page = pfn_to_page(segment >> PAGE_SHIFT); - mp = (struct gmap_pgtable *) page->index; - return mp->vmaddr | (address & ~PMD_MASK); - } else if (segment & _SEGMENT_ENTRY_RO) { - vmaddr = segment & _SEGMENT_ENTRY_ORIGIN; - return vmaddr | (address & ~PMD_MASK); + if (machine_has_tlb_guest()) { + __pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE, + mm->context.asce, IDTE_GLOBAL); + if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m) + gmap_pmdp_idte_global(mm, addr); + } else { + __pmdp_idte(addr, pmdp, 0, 0, IDTE_GLOBAL); + if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m) + gmap_pmdp_idte_global(mm, addr); } - return -EFAULT; } -EXPORT_SYMBOL_GPL(__gmap_translate); -/** - * gmap_translate - translate a guest address to a user space address - * @address: guest address - * @gmap: pointer to guest mapping meta data structure - * - * Returns user space address which corresponds to the guest address or - * -EFAULT if no such mapping exists. - * This function does not establish potentially missing page table entries. - */ -unsigned long gmap_translate(unsigned long address, struct gmap *gmap) +static inline pmd_t pmdp_flush_direct(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) { - unsigned long rc; + pmd_t old; + + old = *pmdp; + if (pmd_val(old) & _SEGMENT_ENTRY_INVALID) + return old; + atomic_inc(&mm->context.flush_count); + if (cpu_has_tlb_lc() && + cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) + pmdp_idte_local(mm, addr, pmdp); + else + pmdp_idte_global(mm, addr, pmdp); + atomic_dec(&mm->context.flush_count); + return old; +} - down_read(&gmap->mm->mmap_sem); - rc = __gmap_translate(address, gmap); - up_read(&gmap->mm->mmap_sem); - return rc; +static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + pmd_t old; + + old = *pmdp; + if (pmd_val(old) & _SEGMENT_ENTRY_INVALID) + return old; + atomic_inc(&mm->context.flush_count); + if (cpumask_equal(&mm->context.cpu_attach_mask, + cpumask_of(smp_processor_id()))) { + set_pmd(pmdp, set_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_INVALID))); + mm->context.flush_mm = 1; + if (mm_has_pgste(mm)) + gmap_pmdp_invalidate(mm, addr); + } else { + pmdp_idte_global(mm, addr, pmdp); + } + atomic_dec(&mm->context.flush_count); + return old; } -EXPORT_SYMBOL_GPL(gmap_translate); -static int gmap_connect_pgtable(unsigned long address, unsigned long segment, - unsigned long *segment_ptr, struct gmap *gmap) +#ifdef CONFIG_PGSTE +static int pmd_lookup(struct mm_struct *mm, unsigned long addr, pmd_t **pmdp) { - unsigned long vmaddr; struct vm_area_struct *vma; - struct gmap_pgtable *mp; - struct gmap_rmap *rmap; - struct mm_struct *mm; - struct page *page; pgd_t *pgd; + p4d_t *p4d; pud_t *pud; - pmd_t *pmd; - mm = gmap->mm; - vmaddr = segment & _SEGMENT_ENTRY_ORIGIN; - vma = find_vma(mm, vmaddr); - if (!vma || vma->vm_start > vmaddr) + /* We need a valid VMA, otherwise this is clearly a fault. */ + vma = vma_lookup(mm, addr); + if (!vma) return -EFAULT; - /* Walk the parent mm page table */ - pgd = pgd_offset(mm, vmaddr); - pud = pud_alloc(mm, pgd, vmaddr); - if (!pud) - return -ENOMEM; - pmd = pmd_alloc(mm, pud, vmaddr); - if (!pmd) - return -ENOMEM; - if (!pmd_present(*pmd) && - __pte_alloc(mm, vma, pmd, vmaddr)) - return -ENOMEM; - /* pmd now points to a valid segment table entry. 
*/ - rmap = kmalloc(sizeof(*rmap), GFP_KERNEL|__GFP_REPEAT); - if (!rmap) - return -ENOMEM; - /* Link gmap segment table entry location to page table. */ - page = pmd_page(*pmd); - mp = (struct gmap_pgtable *) page->index; - rmap->gmap = gmap; - rmap->entry = segment_ptr; - rmap->vmaddr = address & PMD_MASK; - spin_lock(&mm->page_table_lock); - if (*segment_ptr == segment) { - list_add(&rmap->list, &mp->mapper); - /* Set gmap segment table entry to page table. */ - *segment_ptr = pmd_val(*pmd) & PAGE_MASK; - rmap = NULL; - } - spin_unlock(&mm->page_table_lock); - kfree(rmap); - return 0; -} -static void gmap_disconnect_pgtable(struct mm_struct *mm, unsigned long *table) -{ - struct gmap_rmap *rmap, *next; - struct gmap_pgtable *mp; - struct page *page; - int flush; - - flush = 0; - spin_lock(&mm->page_table_lock); - page = pfn_to_page(__pa(table) >> PAGE_SHIFT); - mp = (struct gmap_pgtable *) page->index; - list_for_each_entry_safe(rmap, next, &mp->mapper, list) { - *rmap->entry = - _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | mp->vmaddr; - list_del(&rmap->list); - kfree(rmap); - flush = 1; - } - spin_unlock(&mm->page_table_lock); - if (flush) - __tlb_flush_global(); -} + pgd = pgd_offset(mm, addr); + if (!pgd_present(*pgd)) + return -ENOENT; -/* - * this function is assumed to be called with mmap_sem held - */ -unsigned long __gmap_fault(unsigned long address, struct gmap *gmap) -{ - unsigned long *segment_ptr, segment; - struct gmap_pgtable *mp; - struct page *page; - int rc; + p4d = p4d_offset(pgd, addr); + if (!p4d_present(*p4d)) + return -ENOENT; - current->thread.gmap_addr = address; - segment_ptr = gmap_table_walk(address, gmap); - if (IS_ERR(segment_ptr)) + pud = pud_offset(p4d, addr); + if (!pud_present(*pud)) + return -ENOENT; + + /* Large PUDs are not supported yet. */ + if (pud_leaf(*pud)) return -EFAULT; - /* Convert the gmap address to an mm address. */ - while (1) { - segment = *segment_ptr; - if (!(segment & _SEGMENT_ENTRY_INV)) { - /* Page table is present */ - page = pfn_to_page(segment >> PAGE_SHIFT); - mp = (struct gmap_pgtable *) page->index; - return mp->vmaddr | (address & ~PMD_MASK); - } - if (!(segment & _SEGMENT_ENTRY_RO)) - /* Nothing mapped in the gmap address space. 
*/ - break; - rc = gmap_connect_pgtable(address, segment, segment_ptr, gmap); - if (rc) - return rc; - } - return -EFAULT; + + *pmdp = pmd_offset(pud, addr); + return 0; } +#endif -unsigned long gmap_fault(unsigned long address, struct gmap *gmap) +pmd_t pmdp_xchg_direct(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t new) { - unsigned long rc; - - down_read(&gmap->mm->mmap_sem); - rc = __gmap_fault(address, gmap); - up_read(&gmap->mm->mmap_sem); + pmd_t old; - return rc; + preempt_disable(); + old = pmdp_flush_direct(mm, addr, pmdp); + set_pmd(pmdp, new); + preempt_enable(); + return old; } -EXPORT_SYMBOL_GPL(gmap_fault); +EXPORT_SYMBOL(pmdp_xchg_direct); -void gmap_discard(unsigned long from, unsigned long to, struct gmap *gmap) +pmd_t pmdp_xchg_lazy(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t new) { + pmd_t old; - unsigned long *table, address, size; - struct vm_area_struct *vma; - struct gmap_pgtable *mp; - struct page *page; - - down_read(&gmap->mm->mmap_sem); - address = from; - while (address < to) { - /* Walk the gmap address space page table */ - table = gmap->table + ((address >> 53) & 0x7ff); - if (unlikely(*table & _REGION_ENTRY_INV)) { - address = (address + PMD_SIZE) & PMD_MASK; - continue; - } - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - table = table + ((address >> 42) & 0x7ff); - if (unlikely(*table & _REGION_ENTRY_INV)) { - address = (address + PMD_SIZE) & PMD_MASK; - continue; - } - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - table = table + ((address >> 31) & 0x7ff); - if (unlikely(*table & _REGION_ENTRY_INV)) { - address = (address + PMD_SIZE) & PMD_MASK; - continue; - } - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - table = table + ((address >> 20) & 0x7ff); - if (unlikely(*table & _SEGMENT_ENTRY_INV)) { - address = (address + PMD_SIZE) & PMD_MASK; - continue; - } - page = pfn_to_page(*table >> PAGE_SHIFT); - mp = (struct gmap_pgtable *) page->index; - vma = find_vma(gmap->mm, mp->vmaddr); - size = min(to - address, PMD_SIZE - (address & ~PMD_MASK)); - zap_page_range(vma, mp->vmaddr | (address & ~PMD_MASK), - size, NULL); - address = (address + PMD_SIZE) & PMD_MASK; - } - up_read(&gmap->mm->mmap_sem); + preempt_disable(); + old = pmdp_flush_lazy(mm, addr, pmdp); + set_pmd(pmdp, new); + preempt_enable(); + return old; } -EXPORT_SYMBOL_GPL(gmap_discard); +EXPORT_SYMBOL(pmdp_xchg_lazy); -static LIST_HEAD(gmap_notifier_list); -static DEFINE_SPINLOCK(gmap_notifier_lock); - -/** - * gmap_register_ipte_notifier - register a pte invalidation callback - * @nb: pointer to the gmap notifier block - */ -void gmap_register_ipte_notifier(struct gmap_notifier *nb) +static inline void pudp_idte_local(struct mm_struct *mm, + unsigned long addr, pud_t *pudp) { - spin_lock(&gmap_notifier_lock); - list_add(&nb->list, &gmap_notifier_list); - spin_unlock(&gmap_notifier_lock); + if (machine_has_tlb_guest()) + __pudp_idte(addr, pudp, IDTE_NODAT | IDTE_GUEST_ASCE, + mm->context.asce, IDTE_LOCAL); + else + __pudp_idte(addr, pudp, 0, 0, IDTE_LOCAL); } -EXPORT_SYMBOL_GPL(gmap_register_ipte_notifier); -/** - * gmap_unregister_ipte_notifier - remove a pte invalidation callback - * @nb: pointer to the gmap notifier block - */ -void gmap_unregister_ipte_notifier(struct gmap_notifier *nb) +static inline void pudp_idte_global(struct mm_struct *mm, + unsigned long addr, pud_t *pudp) { - spin_lock(&gmap_notifier_lock); - list_del_init(&nb->list); - spin_unlock(&gmap_notifier_lock); + if (machine_has_tlb_guest()) + 
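Segment (pmd) entries follow the same direct/lazy exchange pattern as ptes; a hedged sketch (not part of the patch, wrapper name hypothetical):

/*
 * Sketch only, not from this patch. The IDTE flush variant is chosen
 * inside pmdp_flush_direct()/_lazy().
 */
static pmd_t clear_one_segment(struct mm_struct *mm, unsigned long addr,
			       pmd_t *pmdp)
{
	return pmdp_xchg_direct(mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_EMPTY));
}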
__pudp_idte(addr, pudp, IDTE_NODAT | IDTE_GUEST_ASCE, + mm->context.asce, IDTE_GLOBAL); + else + __pudp_idte(addr, pudp, 0, 0, IDTE_GLOBAL); } -EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier); -/** - * gmap_ipte_notify - mark a range of ptes for invalidation notification - * @gmap: pointer to guest mapping meta data structure - * @address: virtual address in the guest address space - * @len: size of area - * - * Returns 0 if for each page in the given range a gmap mapping exists and - * the invalidation notification could be set. If the gmap mapping is missing - * for one or more pages -EFAULT is returned. If no memory could be allocated - * -ENOMEM is returned. This function establishes missing page table entries. - */ -int gmap_ipte_notify(struct gmap *gmap, unsigned long start, unsigned long len) +static inline pud_t pudp_flush_direct(struct mm_struct *mm, + unsigned long addr, pud_t *pudp) { - unsigned long addr; - spinlock_t *ptl; - pte_t *ptep, entry; - pgste_t pgste; - int rc = 0; - - if ((start & ~PAGE_MASK) || (len & ~PAGE_MASK)) - return -EINVAL; - down_read(&gmap->mm->mmap_sem); - while (len) { - /* Convert gmap address and connect the page tables */ - addr = __gmap_fault(start, gmap); - if (IS_ERR_VALUE(addr)) { - rc = addr; - break; - } - /* Get the page mapped */ - if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE)) { - rc = -EFAULT; - break; - } - /* Walk the process page table, lock and get pte pointer */ - ptep = get_locked_pte(gmap->mm, addr, &ptl); - if (unlikely(!ptep)) - continue; - /* Set notification bit in the pgste of the pte */ - entry = *ptep; - if ((pte_val(entry) & (_PAGE_INVALID | _PAGE_RO)) == 0) { - pgste = pgste_get_lock(ptep); - pgste_val(pgste) |= PGSTE_IN_BIT; - pgste_set_unlock(ptep, pgste); - start += PAGE_SIZE; - len -= PAGE_SIZE; - } - spin_unlock(ptl); - } - up_read(&gmap->mm->mmap_sem); - return rc; + pud_t old; + + old = *pudp; + if (pud_val(old) & _REGION_ENTRY_INVALID) + return old; + atomic_inc(&mm->context.flush_count); + if (cpu_has_tlb_lc() && + cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) + pudp_idte_local(mm, addr, pudp); + else + pudp_idte_global(mm, addr, pudp); + atomic_dec(&mm->context.flush_count); + return old; } -EXPORT_SYMBOL_GPL(gmap_ipte_notify); -/** - * gmap_do_ipte_notify - call all invalidation callbacks for a specific pte. - * @mm: pointer to the process mm_struct - * @addr: virtual address in the process address space - * @pte: pointer to the page table entry - * - * This function is assumed to be called with the page table lock held - * for the pte to notify. 
- */ -void gmap_do_ipte_notify(struct mm_struct *mm, unsigned long addr, pte_t *pte) +pud_t pudp_xchg_direct(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, pud_t new) { - unsigned long segment_offset; - struct gmap_notifier *nb; - struct gmap_pgtable *mp; - struct gmap_rmap *rmap; - struct page *page; - - segment_offset = ((unsigned long) pte) & (255 * sizeof(pte_t)); - segment_offset = segment_offset * (4096 / sizeof(pte_t)); - page = pfn_to_page(__pa(pte) >> PAGE_SHIFT); - mp = (struct gmap_pgtable *) page->index; - spin_lock(&gmap_notifier_lock); - list_for_each_entry(rmap, &mp->mapper, list) { - list_for_each_entry(nb, &gmap_notifier_list, list) - nb->notifier_call(rmap->gmap, - rmap->vmaddr + segment_offset); - } - spin_unlock(&gmap_notifier_lock); -} + pud_t old; -static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm, - unsigned long vmaddr) -{ - struct page *page; - unsigned long *table; - struct gmap_pgtable *mp; - - page = alloc_page(GFP_KERNEL|__GFP_REPEAT); - if (!page) - return NULL; - mp = kmalloc(sizeof(*mp), GFP_KERNEL|__GFP_REPEAT); - if (!mp) { - __free_page(page); - return NULL; - } - pgtable_page_ctor(page); - mp->vmaddr = vmaddr & PMD_MASK; - INIT_LIST_HEAD(&mp->mapper); - page->index = (unsigned long) mp; - atomic_set(&page->_mapcount, 3); - table = (unsigned long *) page_to_phys(page); - clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/2); - clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2); - return table; + preempt_disable(); + old = pudp_flush_direct(mm, addr, pudp); + set_pud(pudp, new); + preempt_enable(); + return old; } +EXPORT_SYMBOL(pudp_xchg_direct); -static inline void page_table_free_pgste(unsigned long *table) +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable) { - struct page *page; - struct gmap_pgtable *mp; - - page = pfn_to_page(__pa(table) >> PAGE_SHIFT); - mp = (struct gmap_pgtable *) page->index; - BUG_ON(!list_empty(&mp->mapper)); - pgtable_page_dtor(page); - atomic_set(&page->_mapcount, -1); - kfree(mp); - __free_page(page); + struct list_head *lh = (struct list_head *) pgtable; + + assert_spin_locked(pmd_lockptr(mm, pmdp)); + + /* FIFO */ + if (!pmd_huge_pte(mm, pmdp)) + INIT_LIST_HEAD(lh); + else + list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp)); + pmd_huge_pte(mm, pmdp) = pgtable; } -int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, - unsigned long key, bool nq) +pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) { - spinlock_t *ptl; - pgste_t old, new; + struct list_head *lh; + pgtable_t pgtable; pte_t *ptep; - down_read(&mm->mmap_sem); - ptep = get_locked_pte(current->mm, addr, &ptl); - if (unlikely(!ptep)) { - up_read(&mm->mmap_sem); - return -EFAULT; - } - - new = old = pgste_get_lock(ptep); - pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT | - PGSTE_ACC_BITS | PGSTE_FP_BIT); - pgste_val(new) |= (key & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48; - pgste_val(new) |= (key & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56; - if (!(pte_val(*ptep) & _PAGE_INVALID)) { - unsigned long address, bits; - unsigned char skey; + assert_spin_locked(pmd_lockptr(mm, pmdp)); - address = pte_val(*ptep) & PAGE_MASK; - skey = page_get_storage_key(address); - bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED); - /* Set storage key ACC and FP */ - page_set_storage_key(address, - (key & (_PAGE_ACC_BITS | _PAGE_FP_BIT)), - !nq); - - /* Merge host changed & referenced into pgste */ - pgste_val(new) |= bits << 52; - /* Transfer 
skey changed & referenced bit to kvm user bits */ - pgste_val(new) |= bits << 45; /* PGSTE_UR_BIT & PGSTE_UC_BIT */ + /* FIFO */ + pgtable = pmd_huge_pte(mm, pmdp); + lh = (struct list_head *) pgtable; + if (list_empty(lh)) + pmd_huge_pte(mm, pmdp) = NULL; + else { + pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next; + list_del(lh); } - /* changing the guest storage key is considered a change of the page */ - if ((pgste_val(new) ^ pgste_val(old)) & - (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT)) - pgste_val(new) |= PGSTE_UC_BIT; - - pgste_set_unlock(ptep, new); - pte_unmap_unlock(*ptep, ptl); - up_read(&mm->mmap_sem); - return 0; -} -EXPORT_SYMBOL(set_guest_storage_key); - -#else /* CONFIG_PGSTE */ - -static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm, - unsigned long vmaddr) -{ - return NULL; + ptep = (pte_t *) pgtable; + set_pte(ptep, __pte(_PAGE_INVALID)); + ptep++; + set_pte(ptep, __pte(_PAGE_INVALID)); + return pgtable; } +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -static inline void page_table_free_pgste(unsigned long *table) +#ifdef CONFIG_PGSTE +void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t entry) { -} + pgste_t pgste; -static inline void gmap_disconnect_pgtable(struct mm_struct *mm, - unsigned long *table) -{ + /* the mm_has_pgste() check is done in set_pte_at() */ + preempt_disable(); + pgste = pgste_get_lock(ptep); + pgste = clear_pgste_bit(pgste, _PGSTE_GPS_ZERO); + pgste_set_key(ptep, pgste, entry, mm); + pgste = pgste_set_pte(ptep, pgste, entry); + pgste_set_unlock(ptep, pgste); + preempt_enable(); } -#endif /* CONFIG_PGSTE */ - -static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits) +void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - unsigned int old, new; + pgste_t pgste; - do { - old = atomic_read(v); - new = old ^ bits; - } while (atomic_cmpxchg(v, old, new) != old); - return new; + preempt_disable(); + pgste = pgste_get_lock(ptep); + pgste = set_pgste_bit(pgste, PGSTE_IN_BIT); + pgste_set_unlock(ptep, pgste); + preempt_enable(); } -/* - * page table entry allocation/free routines. +/** + * ptep_force_prot - change access rights of a locked pte + * @mm: pointer to the process mm_struct + * @addr: virtual address in the guest address space + * @ptep: pointer to the page table entry + * @prot: indicates guest access rights: PROT_NONE, PROT_READ or PROT_WRITE + * @bit: pgste bit to set (e.g. for notification) + * + * Returns 0 if the access rights were changed and -EAGAIN if the current + * and requested access rights are incompatible. 
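A small sketch (not part of the patch) of the deposit/withdraw pairing for pte tables parked behind a huge pmd; the real callers are the generic THP paths, and the wrapper below is hypothetical and only shows the pairing and locking.

/*
 * Sketch only, not from this patch: deposited pte tables are kept in FIFO
 * order behind the huge pmd and come back on withdraw when the huge
 * mapping is split.
 */
static void deposit_then_withdraw(struct mm_struct *mm, pmd_t *pmdp,
				  pgtable_t pgtable)
{
	spinlock_t *ptl = pmd_lock(mm, pmdp);

	pgtable_trans_huge_deposit(mm, pmdp, pgtable);
	/* ... huge pmd is in use here ... */
	pgtable = pgtable_trans_huge_withdraw(mm, pmdp);
	spin_unlock(ptl);
}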
*/ -unsigned long *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr) +int ptep_force_prot(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, int prot, unsigned long bit) { - unsigned long *uninitialized_var(table); - struct page *uninitialized_var(page); - unsigned int mask, bit; - - if (mm_has_pgste(mm)) - return page_table_alloc_pgste(mm, vmaddr); - /* Allocate fragments of a 4K page as 1K/2K page table */ - spin_lock_bh(&mm->context.list_lock); - mask = FRAG_MASK; - if (!list_empty(&mm->context.pgtable_list)) { - page = list_first_entry(&mm->context.pgtable_list, - struct page, lru); - table = (unsigned long *) page_to_phys(page); - mask = atomic_read(&page->_mapcount); - mask = mask | (mask >> 4); + pte_t entry; + pgste_t pgste; + int pte_i, pte_p, nodat; + + pgste = pgste_get_lock(ptep); + entry = *ptep; + /* Check pte entry after all locks have been acquired */ + pte_i = pte_val(entry) & _PAGE_INVALID; + pte_p = pte_val(entry) & _PAGE_PROTECT; + if ((pte_i && (prot != PROT_NONE)) || + (pte_p && (prot & PROT_WRITE))) { + pgste_set_unlock(ptep, pgste); + return -EAGAIN; } - if ((mask & FRAG_MASK) == FRAG_MASK) { - spin_unlock_bh(&mm->context.list_lock); - page = alloc_page(GFP_KERNEL|__GFP_REPEAT); - if (!page) - return NULL; - pgtable_page_ctor(page); - atomic_set(&page->_mapcount, 1); - table = (unsigned long *) page_to_phys(page); - clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE); - spin_lock_bh(&mm->context.list_lock); - list_add(&page->lru, &mm->context.pgtable_list); - } else { - for (bit = 1; mask & bit; bit <<= 1) - table += PTRS_PER_PTE; - mask = atomic_xor_bits(&page->_mapcount, bit); - if ((mask & FRAG_MASK) == FRAG_MASK) - list_del(&page->lru); + /* Change access rights and set pgste bit */ + nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); + if (prot == PROT_NONE && !pte_i) { + ptep_flush_direct(mm, addr, ptep, nodat); + pgste = pgste_update_all(entry, pgste, mm); + entry = set_pte_bit(entry, __pgprot(_PAGE_INVALID)); } - spin_unlock_bh(&mm->context.list_lock); - return table; + if (prot == PROT_READ && !pte_p) { + ptep_flush_direct(mm, addr, ptep, nodat); + entry = clear_pte_bit(entry, __pgprot(_PAGE_INVALID)); + entry = set_pte_bit(entry, __pgprot(_PAGE_PROTECT)); + } + pgste = set_pgste_bit(pgste, bit); + pgste = pgste_set_pte(ptep, pgste, entry); + pgste_set_unlock(ptep, pgste); + return 0; } -void page_table_free(struct mm_struct *mm, unsigned long *table) +int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr, + pte_t *sptep, pte_t *tptep, pte_t pte) { - struct page *page; - unsigned int bit, mask; - - if (mm_has_pgste(mm)) { - gmap_disconnect_pgtable(mm, table); - return page_table_free_pgste(table); - } - /* Free 1K/2K page table fragment of a 4K page */ - page = pfn_to_page(__pa(table) >> PAGE_SHIFT); - bit = 1 << ((__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t))); - spin_lock_bh(&mm->context.list_lock); - if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK) - list_del(&page->lru); - mask = atomic_xor_bits(&page->_mapcount, bit); - if (mask & FRAG_MASK) - list_add(&page->lru, &mm->context.pgtable_list); - spin_unlock_bh(&mm->context.list_lock); - if (mask == 0) { - pgtable_page_dtor(page); - atomic_set(&page->_mapcount, -1); - __free_page(page); + pgste_t spgste, tpgste; + pte_t spte, tpte; + int rc = -EAGAIN; + + if (!(pte_val(*tptep) & _PAGE_INVALID)) + return 0; /* already shadowed */ + spgste = pgste_get_lock(sptep); + spte = *sptep; + if (!(pte_val(spte) & _PAGE_INVALID) && + !((pte_val(spte) & _PAGE_PROTECT) && + 
!(pte_val(pte) & _PAGE_PROTECT))) { + spgste = set_pgste_bit(spgste, PGSTE_VSIE_BIT); + tpgste = pgste_get_lock(tptep); + tpte = __pte((pte_val(spte) & PAGE_MASK) | + (pte_val(pte) & _PAGE_PROTECT)); + /* don't touch the storage key - it belongs to parent pgste */ + tpgste = pgste_set_pte(tptep, tpgste, tpte); + pgste_set_unlock(tptep, tpgste); + rc = 1; } + pgste_set_unlock(sptep, spgste); + return rc; } -static void __page_table_free_rcu(void *table, unsigned bit) +void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep) { - struct page *page; - - if (bit == FRAG_MASK) - return page_table_free_pgste(table); - /* Free 1K/2K page table fragment of a 4K page */ - page = pfn_to_page(__pa(table) >> PAGE_SHIFT); - if (atomic_xor_bits(&page->_mapcount, bit) == 0) { - pgtable_page_dtor(page); - atomic_set(&page->_mapcount, -1); - __free_page(page); - } + pgste_t pgste; + int nodat; + + pgste = pgste_get_lock(ptep); + /* notifier is called by the caller */ + nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); + ptep_flush_direct(mm, saddr, ptep, nodat); + /* don't touch the storage key - it belongs to parent pgste */ + pgste = pgste_set_pte(ptep, pgste, __pte(_PAGE_INVALID)); + pgste_set_unlock(ptep, pgste); } -void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table) +static void ptep_zap_softleaf_entry(struct mm_struct *mm, softleaf_t entry) { - struct mm_struct *mm; - struct page *page; - unsigned int bit, mask; + if (softleaf_is_swap(entry)) + dec_mm_counter(mm, MM_SWAPENTS); + else if (softleaf_is_migration(entry)) { + struct folio *folio = softleaf_to_folio(entry); - mm = tlb->mm; - if (mm_has_pgste(mm)) { - gmap_disconnect_pgtable(mm, table); - table = (unsigned long *) (__pa(table) | FRAG_MASK); - tlb_remove_table(tlb, table); - return; + dec_mm_counter(mm, mm_counter(folio)); } - bit = 1 << ((__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t))); - page = pfn_to_page(__pa(table) >> PAGE_SHIFT); - spin_lock_bh(&mm->context.list_lock); - if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK) - list_del(&page->lru); - mask = atomic_xor_bits(&page->_mapcount, bit | (bit << 4)); - if (mask & FRAG_MASK) - list_add_tail(&page->lru, &mm->context.pgtable_list); - spin_unlock_bh(&mm->context.list_lock); - table = (unsigned long *) (__pa(table) | (bit << 4)); - tlb_remove_table(tlb, table); + free_swap_and_cache(entry); } -void __tlb_remove_table(void *_table) +void ptep_zap_unused(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, int reset) { - const unsigned long mask = (FRAG_MASK << 4) | FRAG_MASK; - void *table = (void *)((unsigned long) _table & ~mask); - unsigned type = (unsigned long) _table & mask; + unsigned long pgstev; + pgste_t pgste; + pte_t pte; - if (type) - __page_table_free_rcu(table, type); - else - free_pages((unsigned long) table, ALLOC_ORDER); + /* Zap unused and logically-zero pages */ + preempt_disable(); + pgste = pgste_get_lock(ptep); + pgstev = pgste_val(pgste); + pte = *ptep; + if (!reset && pte_swap(pte) && + ((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED || + (pgstev & _PGSTE_GPS_ZERO))) { + ptep_zap_softleaf_entry(mm, softleaf_from_pte(pte)); + pte_clear(mm, addr, ptep); + } + if (reset) + pgste = clear_pgste_bit(pgste, _PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT); + pgste_set_unlock(ptep, pgste); + preempt_enable(); } -static void tlb_remove_table_smp_sync(void *arg) +void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - /* Simply deliver the interrupt */ -} + unsigned long ptev; + 
pgste_t pgste; -static void tlb_remove_table_one(void *table) -{ - /* - * This isn't an RCU grace period and hence the page-tables cannot be - * assumed to be actually RCU-freed. - * - * It is however sufficient for software page-table walkers that rely - * on IRQ disabling. See the comment near struct mmu_table_batch. - */ - smp_call_function(tlb_remove_table_smp_sync, NULL, 1); - __tlb_remove_table(table); + /* Clear storage key ACC and F, but set R/C */ + preempt_disable(); + pgste = pgste_get_lock(ptep); + pgste = clear_pgste_bit(pgste, PGSTE_ACC_BITS | PGSTE_FP_BIT); + pgste = set_pgste_bit(pgste, PGSTE_GR_BIT | PGSTE_GC_BIT); + ptev = pte_val(*ptep); + if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE)) + page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 0); + pgste_set_unlock(ptep, pgste); + preempt_enable(); } -static void tlb_remove_table_rcu(struct rcu_head *head) +/* + * Test and reset if a guest page is dirty + */ +bool ptep_test_and_clear_uc(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) { - struct mmu_table_batch *batch; - int i; - - batch = container_of(head, struct mmu_table_batch, rcu); - - for (i = 0; i < batch->nr; i++) - __tlb_remove_table(batch->tables[i]); - - free_page((unsigned long)batch); + pgste_t pgste; + pte_t pte; + bool dirty; + int nodat; + + pgste = pgste_get_lock(ptep); + dirty = !!(pgste_val(pgste) & PGSTE_UC_BIT); + pgste = clear_pgste_bit(pgste, PGSTE_UC_BIT); + pte = *ptep; + if (dirty && (pte_val(pte) & _PAGE_PRESENT)) { + pgste = pgste_pte_notify(mm, addr, ptep, pgste); + nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); + ptep_ipte_global(mm, addr, ptep, nodat); + if (machine_has_esop() || !(pte_val(pte) & _PAGE_WRITE)) + pte = set_pte_bit(pte, __pgprot(_PAGE_PROTECT)); + else + pte = set_pte_bit(pte, __pgprot(_PAGE_INVALID)); + set_pte(ptep, pte); + } + pgste_set_unlock(ptep, pgste); + return dirty; } +EXPORT_SYMBOL_GPL(ptep_test_and_clear_uc); -void tlb_table_flush(struct mmu_gather *tlb) +int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, + unsigned char key, bool nq) { - struct mmu_table_batch **batch = &tlb->batch; + unsigned long keyul, paddr; + spinlock_t *ptl; + pgste_t old, new; + pmd_t *pmdp; + pte_t *ptep; - if (*batch) { - __tlb_flush_mm(tlb->mm); - call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu); - *batch = NULL; + /* + * If we don't have a PTE table and if there is no huge page mapped, + * we can ignore attempts to set the key to 0, because it already is 0. + */ + switch (pmd_lookup(mm, addr, &pmdp)) { + case -ENOENT: + return key ? -EFAULT : 0; + case 0: + break; + default: + return -EFAULT; + } +again: + ptl = pmd_lock(mm, pmdp); + if (!pmd_present(*pmdp)) { + spin_unlock(ptl); + return key ? -EFAULT : 0; } -} -void tlb_remove_table(struct mmu_gather *tlb, void *table) -{ - struct mmu_table_batch **batch = &tlb->batch; - - if (*batch == NULL) { - *batch = (struct mmu_table_batch *) - __get_free_page(GFP_NOWAIT | __GFP_NOWARN); - if (*batch == NULL) { - __tlb_flush_mm(tlb->mm); - tlb_remove_table_one(table); - return; - } - (*batch)->nr = 0; + if (pmd_leaf(*pmdp)) { + paddr = pmd_val(*pmdp) & HPAGE_MASK; + paddr |= addr & ~HPAGE_MASK; + /* + * Huge pmds need quiescing operations, they are + * always mapped. 
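For orientation, a hedged sketch (not part of the patch) of a per-page dirty-log scan built on ptep_test_and_clear_uc(); the wrapper and the mark_dirty() consumer are hypothetical.

/*
 * Sketch only, not from this patch: test and clear the guest-dirty (UC)
 * bit kept in the pgste, with the host page table lock held.
 */
static void sync_one_dirty_bit(struct mm_struct *mm, unsigned long addr)
{
	spinlock_t *ptl;
	pte_t *ptep;

	ptep = get_locked_pte(mm, addr, &ptl);
	if (!ptep)
		return;
	if (ptep_test_and_clear_uc(mm, addr, ptep))
		mark_dirty(addr);		/* hypothetical consumer */
	pte_unmap_unlock(ptep, ptl);
}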
+ */ + page_set_storage_key(paddr, key, 1); + spin_unlock(ptl); + return 0; } - (*batch)->tables[(*batch)->nr++] = table; - if ((*batch)->nr == MAX_TABLE_BATCH) - tlb_table_flush(tlb); -} + spin_unlock(ptl); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -void thp_split_vma(struct vm_area_struct *vma) -{ - unsigned long addr; - struct page *page; + ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + if (!ptep) + goto again; + new = old = pgste_get_lock(ptep); + new = clear_pgste_bit(new, PGSTE_GR_BIT | PGSTE_GC_BIT | + PGSTE_ACC_BITS | PGSTE_FP_BIT); + keyul = (unsigned long) key; + new = set_pgste_bit(new, (keyul & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48); + new = set_pgste_bit(new, (keyul & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56); + if (!(pte_val(*ptep) & _PAGE_INVALID)) { + unsigned long bits, skey; - for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) { - page = follow_page(vma, addr, FOLL_SPLIT); + paddr = pte_val(*ptep) & PAGE_MASK; + skey = (unsigned long) page_get_storage_key(paddr); + bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED); + skey = key & (_PAGE_ACC_BITS | _PAGE_FP_BIT); + /* Set storage key ACC and FP */ + page_set_storage_key(paddr, skey, !nq); + /* Merge host changed & referenced into pgste */ + new = set_pgste_bit(new, bits << 52); } + /* changing the guest storage key is considered a change of the page */ + if ((pgste_val(new) ^ pgste_val(old)) & + (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT)) + new = set_pgste_bit(new, PGSTE_UC_BIT); + + pgste_set_unlock(ptep, new); + pte_unmap_unlock(ptep, ptl); + return 0; } +EXPORT_SYMBOL(set_guest_storage_key); -void thp_split_mm(struct mm_struct *mm) +/* + * Conditionally set a guest storage key (handling csske). + * oldkey will be updated when either mr or mc is set and a pointer is given. + * + * Returns 0 if a guests storage key update wasn't necessary, 1 if the guest + * storage key was updated and -EFAULT on access errors. + */ +int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr, + unsigned char key, unsigned char *oldkey, + bool nq, bool mr, bool mc) { - struct vm_area_struct *vma = mm->mmap; + unsigned char tmp, mask = _PAGE_ACC_BITS | _PAGE_FP_BIT; + int rc; - while (vma != NULL) { - thp_split_vma(vma); - vma->vm_flags &= ~VM_HUGEPAGE; - vma->vm_flags |= VM_NOHUGEPAGE; - vma = vma->vm_next; + /* we can drop the pgste lock between getting and setting the key */ + if (mr | mc) { + rc = get_guest_storage_key(current->mm, addr, &tmp); + if (rc) + return rc; + if (oldkey) + *oldkey = tmp; + if (!mr) + mask |= _PAGE_REFERENCED; + if (!mc) + mask |= _PAGE_CHANGED; + if (!((tmp ^ key) & mask)) + return 0; } + rc = set_guest_storage_key(current->mm, addr, key, nq); + return rc < 0 ? rc : 1; } -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +EXPORT_SYMBOL(cond_set_guest_storage_key); /* - * switch on pgstes for its userspace process (for kvm) + * Reset a guest reference bit (rrbe), returning the reference and changed bit. + * + * Returns < 0 in case of error, otherwise the cc to be reported to the guest. */ -int s390_enable_sie(void) +int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr) { - struct task_struct *tsk = current; - struct mm_struct *mm, *old_mm; - - /* Do we have switched amode? If no, we cannot do sie */ - if (s390_user_mode == HOME_SPACE_MODE) - return -EINVAL; + spinlock_t *ptl; + unsigned long paddr; + pgste_t old, new; + pmd_t *pmdp; + pte_t *ptep; + int cc = 0; - /* Do we have pgstes? 
if yes, we are done */ - if (mm_has_pgste(tsk->mm)) + /* + * If we don't have a PTE table and if there is no huge page mapped, + * the storage key is 0 and there is nothing for us to do. + */ + switch (pmd_lookup(mm, addr, &pmdp)) { + case -ENOENT: + return 0; + case 0: + break; + default: + return -EFAULT; + } +again: + ptl = pmd_lock(mm, pmdp); + if (!pmd_present(*pmdp)) { + spin_unlock(ptl); return 0; - - /* lets check if we are allowed to replace the mm */ - task_lock(tsk); - if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 || -#ifdef CONFIG_AIO - !hlist_empty(&tsk->mm->ioctx_list) || -#endif - tsk->mm != tsk->active_mm) { - task_unlock(tsk); - return -EINVAL; } - task_unlock(tsk); - /* we copy the mm and let dup_mm create the page tables with_pgstes */ - tsk->mm->context.alloc_pgste = 1; - /* make sure that both mms have a correct rss state */ - sync_mm_rss(tsk->mm); - mm = dup_mm(tsk); - tsk->mm->context.alloc_pgste = 0; - if (!mm) - return -ENOMEM; + if (pmd_leaf(*pmdp)) { + paddr = pmd_val(*pmdp) & HPAGE_MASK; + paddr |= addr & ~HPAGE_MASK; + cc = page_reset_referenced(paddr); + spin_unlock(ptl); + return cc; + } + spin_unlock(ptl); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - /* split thp mappings and disable thp for future mappings */ - thp_split_mm(mm); - mm->def_flags |= VM_NOHUGEPAGE; -#endif + ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + if (!ptep) + goto again; + new = old = pgste_get_lock(ptep); + /* Reset guest reference bit only */ + new = clear_pgste_bit(new, PGSTE_GR_BIT); - /* Now lets check again if something happened */ - task_lock(tsk); - if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 || -#ifdef CONFIG_AIO - !hlist_empty(&tsk->mm->ioctx_list) || -#endif - tsk->mm != tsk->active_mm) { - mmput(mm); - task_unlock(tsk); - return -EINVAL; + if (!(pte_val(*ptep) & _PAGE_INVALID)) { + paddr = pte_val(*ptep) & PAGE_MASK; + cc = page_reset_referenced(paddr); + /* Merge real referenced bit into host-set */ + new = set_pgste_bit(new, ((unsigned long)cc << 53) & PGSTE_HR_BIT); } + /* Reflect guest's logical view, not physical */ + cc |= (pgste_val(old) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 49; + /* Changing the guest storage key is considered a change of the page */ + if ((pgste_val(new) ^ pgste_val(old)) & PGSTE_GR_BIT) + new = set_pgste_bit(new, PGSTE_UC_BIT); - /* ok, we are alone. No ptrace, no threads, etc. 
*/ - old_mm = tsk->mm; - tsk->mm = tsk->active_mm = mm; - preempt_disable(); - update_mm(mm, tsk); - atomic_inc(&mm->context.attach_count); - atomic_dec(&old_mm->context.attach_count); - cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm)); - preempt_enable(); - task_unlock(tsk); - mmput(old_mm); - return 0; + pgste_set_unlock(ptep, new); + pte_unmap_unlock(ptep, ptl); + return cc; } -EXPORT_SYMBOL_GPL(s390_enable_sie); +EXPORT_SYMBOL(reset_guest_reference_bit); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmdp) +int get_guest_storage_key(struct mm_struct *mm, unsigned long addr, + unsigned char *key) { - VM_BUG_ON(address & ~HPAGE_PMD_MASK); - /* No need to flush TLB - * On s390 reference bits are in storage key and never in TLB */ - return pmdp_test_and_clear_young(vma, address, pmdp); -} + unsigned long paddr; + spinlock_t *ptl; + pgste_t pgste; + pmd_t *pmdp; + pte_t *ptep; -int pmdp_set_access_flags(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp, - pmd_t entry, int dirty) -{ - VM_BUG_ON(address & ~HPAGE_PMD_MASK); + /* + * If we don't have a PTE table and if there is no huge page mapped, + * the storage key is 0. + */ + *key = 0; - if (pmd_same(*pmdp, entry)) + switch (pmd_lookup(mm, addr, &pmdp)) { + case -ENOENT: return 0; - pmdp_invalidate(vma, address, pmdp); - set_pmd_at(vma->vm_mm, address, pmdp, entry); - return 1; -} + case 0: + break; + default: + return -EFAULT; + } +again: + ptl = pmd_lock(mm, pmdp); + if (!pmd_present(*pmdp)) { + spin_unlock(ptl); + return 0; + } -static void pmdp_splitting_flush_sync(void *arg) -{ - /* Simply deliver the interrupt */ + if (pmd_leaf(*pmdp)) { + paddr = pmd_val(*pmdp) & HPAGE_MASK; + paddr |= addr & ~HPAGE_MASK; + *key = page_get_storage_key(paddr); + spin_unlock(ptl); + return 0; + } + spin_unlock(ptl); + + ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + if (!ptep) + goto again; + pgste = pgste_get_lock(ptep); + *key = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56; + paddr = pte_val(*ptep) & PAGE_MASK; + if (!(pte_val(*ptep) & _PAGE_INVALID)) + *key = page_get_storage_key(paddr); + /* Reflect guest's logical view, not physical */ + *key |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48; + pgste_set_unlock(ptep, pgste); + pte_unmap_unlock(ptep, ptl); + return 0; } +EXPORT_SYMBOL(get_guest_storage_key); -void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmdp) +/** + * pgste_perform_essa - perform ESSA actions on the PGSTE. + * @mm: the memory context. It must have PGSTEs, no check is performed here! + * @hva: the host virtual address of the page whose PGSTE is to be processed + * @orc: the specific action to perform, see the ESSA_SET_* macros. + * @oldpte: the PTE will be saved there if the pointer is not NULL. + * @oldpgste: the old PGSTE will be saved there if the pointer is not NULL. + * + * Return: 1 if the page is to be added to the CBRL, otherwise 0, + * or < 0 in case of error. -EINVAL is returned for invalid values + * of orc, -EFAULT for invalid addresses. 
+ */ +int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc, + unsigned long *oldpte, unsigned long *oldpgste) { - VM_BUG_ON(address & ~HPAGE_PMD_MASK); - if (!test_and_set_bit(_SEGMENT_ENTRY_SPLIT_BIT, - (unsigned long *) pmdp)) { - /* need to serialize against gup-fast (IRQ disabled) */ - smp_call_function(pmdp_splitting_flush_sync, NULL, 1); + struct vm_area_struct *vma; + unsigned long pgstev; + spinlock_t *ptl; + pgste_t pgste; + pte_t *ptep; + int res = 0; + + WARN_ON_ONCE(orc > ESSA_MAX); + if (unlikely(orc > ESSA_MAX)) + return -EINVAL; + + vma = vma_lookup(mm, hva); + if (!vma || is_vm_hugetlb_page(vma)) + return -EFAULT; + ptep = get_locked_pte(mm, hva, &ptl); + if (unlikely(!ptep)) + return -EFAULT; + pgste = pgste_get_lock(ptep); + pgstev = pgste_val(pgste); + if (oldpte) + *oldpte = pte_val(*ptep); + if (oldpgste) + *oldpgste = pgstev; + + switch (orc) { + case ESSA_GET_STATE: + break; + case ESSA_SET_STABLE: + pgstev &= ~(_PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT); + pgstev |= _PGSTE_GPS_USAGE_STABLE; + break; + case ESSA_SET_UNUSED: + pgstev &= ~_PGSTE_GPS_USAGE_MASK; + pgstev |= _PGSTE_GPS_USAGE_UNUSED; + if (pte_val(*ptep) & _PAGE_INVALID) + res = 1; + break; + case ESSA_SET_VOLATILE: + pgstev &= ~_PGSTE_GPS_USAGE_MASK; + pgstev |= _PGSTE_GPS_USAGE_VOLATILE; + if (pte_val(*ptep) & _PAGE_INVALID) + res = 1; + break; + case ESSA_SET_POT_VOLATILE: + pgstev &= ~_PGSTE_GPS_USAGE_MASK; + if (!(pte_val(*ptep) & _PAGE_INVALID)) { + pgstev |= _PGSTE_GPS_USAGE_POT_VOLATILE; + break; + } + if (pgstev & _PGSTE_GPS_ZERO) { + pgstev |= _PGSTE_GPS_USAGE_VOLATILE; + break; + } + if (!(pgstev & PGSTE_GC_BIT)) { + pgstev |= _PGSTE_GPS_USAGE_VOLATILE; + res = 1; + break; + } + break; + case ESSA_SET_STABLE_RESIDENT: + pgstev &= ~_PGSTE_GPS_USAGE_MASK; + pgstev |= _PGSTE_GPS_USAGE_STABLE; + /* + * Since the resident state can go away any time after this + * call, we will not make this page resident. We can revisit + * this decision if a guest will ever start using this. + */ + break; + case ESSA_SET_STABLE_IF_RESIDENT: + if (!(pte_val(*ptep) & _PAGE_INVALID)) { + pgstev &= ~_PGSTE_GPS_USAGE_MASK; + pgstev |= _PGSTE_GPS_USAGE_STABLE; + } + break; + case ESSA_SET_STABLE_NODAT: + pgstev &= ~_PGSTE_GPS_USAGE_MASK; + pgstev |= _PGSTE_GPS_USAGE_STABLE | _PGSTE_GPS_NODAT; + break; + default: + /* we should never get here! */ + break; } + /* If we are discarding a page, set it to logical zero */ + if (res) + pgstev |= _PGSTE_GPS_ZERO; + + pgste = __pgste(pgstev); + pgste_set_unlock(ptep, pgste); + pte_unmap_unlock(ptep, ptl); + return res; } +EXPORT_SYMBOL(pgste_perform_essa); -void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, - pgtable_t pgtable) +/** + * set_pgste_bits - set specific PGSTE bits. + * @mm: the memory context. It must have PGSTEs, no check is performed here! + * @hva: the host virtual address of the page whose PGSTE is to be processed + * @bits: a bitmask representing the bits that will be touched + * @value: the values of the bits to be written. Only the bits in the mask + * will be written. + * + * Return: 0 on success, < 0 in case of error. 
+ */ +int set_pgste_bits(struct mm_struct *mm, unsigned long hva, + unsigned long bits, unsigned long value) { - struct list_head *lh = (struct list_head *) pgtable; + struct vm_area_struct *vma; + spinlock_t *ptl; + pgste_t new; + pte_t *ptep; - assert_spin_locked(&mm->page_table_lock); + vma = vma_lookup(mm, hva); + if (!vma || is_vm_hugetlb_page(vma)) + return -EFAULT; + ptep = get_locked_pte(mm, hva, &ptl); + if (unlikely(!ptep)) + return -EFAULT; + new = pgste_get_lock(ptep); - /* FIFO */ - if (!mm->pmd_huge_pte) - INIT_LIST_HEAD(lh); - else - list_add(lh, (struct list_head *) mm->pmd_huge_pte); - mm->pmd_huge_pte = pgtable; + new = clear_pgste_bit(new, bits); + new = set_pgste_bit(new, value & bits); + + pgste_set_unlock(ptep, new); + pte_unmap_unlock(ptep, ptl); + return 0; } +EXPORT_SYMBOL(set_pgste_bits); -pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) +/** + * get_pgste - get the current PGSTE for the given address. + * @mm: the memory context. It must have PGSTEs, no check is performed here! + * @hva: the host virtual address of the page whose PGSTE is to be processed + * @pgstep: will be written with the current PGSTE for the given address. + * + * Return: 0 on success, < 0 in case of error. + */ +int get_pgste(struct mm_struct *mm, unsigned long hva, unsigned long *pgstep) { - struct list_head *lh; - pgtable_t pgtable; + struct vm_area_struct *vma; + spinlock_t *ptl; pte_t *ptep; - assert_spin_locked(&mm->page_table_lock); - - /* FIFO */ - pgtable = mm->pmd_huge_pte; - lh = (struct list_head *) pgtable; - if (list_empty(lh)) - mm->pmd_huge_pte = NULL; - else { - mm->pmd_huge_pte = (pgtable_t) lh->next; - list_del(lh); - } - ptep = (pte_t *) pgtable; - pte_val(*ptep) = _PAGE_TYPE_EMPTY; - ptep++; - pte_val(*ptep) = _PAGE_TYPE_EMPTY; - return pgtable; + vma = vma_lookup(mm, hva); + if (!vma || is_vm_hugetlb_page(vma)) + return -EFAULT; + ptep = get_locked_pte(mm, hva, &ptl); + if (unlikely(!ptep)) + return -EFAULT; + *pgstep = pgste_val(pgste_get(ptep)); + pte_unmap_unlock(ptep, ptl); + return 0; } -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +EXPORT_SYMBOL(get_pgste); +#endif diff --git a/arch/s390/mm/physaddr.c b/arch/s390/mm/physaddr.c new file mode 100644 index 000000000000..59de866c72d9 --- /dev/null +++ b/arch/s390/mm/physaddr.c @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/mmdebug.h> +#include <linux/export.h> +#include <linux/mm.h> +#include <asm/page.h> + +unsigned long __phys_addr(unsigned long x, bool is_31bit) +{ + VIRTUAL_BUG_ON(is_vmalloc_or_module_addr((void *)(x))); + x = __pa_nodebug(x); + if (is_31bit) + VIRTUAL_BUG_ON(x >> 31); + return x; +} +EXPORT_SYMBOL(__phys_addr); diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index 8b268fcc4612..d96587b84e81 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -1,425 +1,671 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright IBM Corp. 
2006 - * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com> */ -#include <linux/bootmem.h> +#include <linux/memory_hotplug.h> +#include <linux/bootmem_info.h> +#include <linux/cpufeature.h> +#include <linux/memblock.h> #include <linux/pfn.h> #include <linux/mm.h> -#include <linux/module.h> +#include <linux/init.h> #include <linux/list.h> #include <linux/hugetlb.h> #include <linux/slab.h> +#include <linux/sort.h> +#include <asm/page-states.h> +#include <asm/abs_lowcore.h> +#include <asm/cacheflush.h> +#include <asm/maccess.h> +#include <asm/nospec-branch.h> +#include <asm/ctlreg.h> #include <asm/pgalloc.h> -#include <asm/pgtable.h> #include <asm/setup.h> #include <asm/tlbflush.h> #include <asm/sections.h> +#include <asm/set_memory.h> +#include <asm/physmem_info.h> static DEFINE_MUTEX(vmem_mutex); -struct memory_segment { - struct list_head list; - unsigned long start; - unsigned long size; -}; - -static LIST_HEAD(mem_segs); - static void __ref *vmem_alloc_pages(unsigned int order) { + unsigned long size = PAGE_SIZE << order; + if (slab_is_available()) return (void *)__get_free_pages(GFP_KERNEL, order); - return alloc_bootmem_pages((1 << order) * PAGE_SIZE); + return memblock_alloc(size, size); } -static inline pud_t *vmem_pud_alloc(void) +static void vmem_free_pages(unsigned long addr, int order, struct vmem_altmap *altmap) { - pud_t *pud = NULL; + unsigned int nr_pages = 1 << order; + struct page *page; -#ifdef CONFIG_64BIT - pud = vmem_alloc_pages(2); - if (!pud) - return NULL; - clear_table((unsigned long *) pud, _REGION3_ENTRY_EMPTY, PAGE_SIZE * 4); -#endif - return pud; + if (altmap) { + vmem_altmap_free(altmap, 1 << order); + return; + } + page = virt_to_page((void *)addr); + if (PageReserved(page)) { + /* allocated from memblock */ + while (nr_pages--) + free_bootmem_page(page++); + } else { + free_pages(addr, order); + } } -static inline pmd_t *vmem_pmd_alloc(void) +void *vmem_crst_alloc(unsigned long val) { - pmd_t *pmd = NULL; + unsigned long *table; -#ifdef CONFIG_64BIT - pmd = vmem_alloc_pages(2); - if (!pmd) + table = vmem_alloc_pages(CRST_ALLOC_ORDER); + if (!table) return NULL; - clear_table((unsigned long *) pmd, _SEGMENT_ENTRY_EMPTY, PAGE_SIZE * 4); -#endif - return pmd; + crst_table_init(table, val); + __arch_set_page_dat(table, 1UL << CRST_ALLOC_ORDER); + return table; } -static pte_t __ref *vmem_pte_alloc(unsigned long address) +pte_t __ref *vmem_pte_alloc(void) { pte_t *pte; if (slab_is_available()) - pte = (pte_t *) page_table_alloc(&init_mm, address); + pte = (pte_t *)page_table_alloc(&init_mm); else - pte = alloc_bootmem(PTRS_PER_PTE * sizeof(pte_t)); + pte = (pte_t *)memblock_alloc(PAGE_SIZE, PAGE_SIZE); if (!pte) return NULL; - clear_table((unsigned long *) pte, _PAGE_TYPE_EMPTY, - PTRS_PER_PTE * sizeof(pte_t)); + memset64((u64 *)pte, _PAGE_INVALID, PTRS_PER_PTE); + __arch_set_page_dat(pte, 1); return pte; } +static void vmem_pte_free(unsigned long *table) +{ + page_table_free(&init_mm, table); +} + +#define PAGE_UNUSED 0xFD + /* - * Add a physical memory range to the 1:1 mapping. + * The unused vmemmap range, which was not yet memset(PAGE_UNUSED) ranges + * from unused_sub_pmd_start to next PMD_SIZE boundary. 
*/ -static int vmem_add_mem(unsigned long start, unsigned long size, int ro) -{ - unsigned long end = start + size; - unsigned long address = start; - pgd_t *pg_dir; - pud_t *pu_dir; - pmd_t *pm_dir; - pte_t *pt_dir; +static unsigned long unused_sub_pmd_start; + +static void vmemmap_flush_unused_sub_pmd(void) +{ + if (!unused_sub_pmd_start) + return; + memset((void *)unused_sub_pmd_start, PAGE_UNUSED, + ALIGN(unused_sub_pmd_start, PMD_SIZE) - unused_sub_pmd_start); + unused_sub_pmd_start = 0; +} + +static void vmemmap_mark_sub_pmd_used(unsigned long start, unsigned long end) +{ + /* + * As we expect to add in the same granularity as we remove, it's + * sufficient to mark only some piece used to block the memmap page from + * getting removed (just in case the memmap never gets initialized, + * e.g., because the memory block never gets onlined). + */ + memset((void *)start, 0, sizeof(struct page)); +} + +static void vmemmap_use_sub_pmd(unsigned long start, unsigned long end) +{ + /* + * We only optimize if the new used range directly follows the + * previously unused range (esp., when populating consecutive sections). + */ + if (unused_sub_pmd_start == start) { + unused_sub_pmd_start = end; + if (likely(IS_ALIGNED(unused_sub_pmd_start, PMD_SIZE))) + unused_sub_pmd_start = 0; + return; + } + vmemmap_flush_unused_sub_pmd(); + vmemmap_mark_sub_pmd_used(start, end); +} + +static void vmemmap_use_new_sub_pmd(unsigned long start, unsigned long end) +{ + unsigned long page = ALIGN_DOWN(start, PMD_SIZE); + + vmemmap_flush_unused_sub_pmd(); + + /* Could be our memmap page is filled with PAGE_UNUSED already ... */ + vmemmap_mark_sub_pmd_used(start, end); + + /* Mark the unused parts of the new memmap page PAGE_UNUSED. */ + if (!IS_ALIGNED(start, PMD_SIZE)) + memset((void *)page, PAGE_UNUSED, start - page); + /* + * We want to avoid memset(PAGE_UNUSED) when populating the vmemmap of + * consecutive sections. Remember for the last added PMD the last + * unused range in the populated PMD. + */ + if (!IS_ALIGNED(end, PMD_SIZE)) + unused_sub_pmd_start = end; +} + +/* Returns true if the PMD is completely unused and can be freed. */ +static bool vmemmap_unuse_sub_pmd(unsigned long start, unsigned long end) +{ + unsigned long page = ALIGN_DOWN(start, PMD_SIZE); + + vmemmap_flush_unused_sub_pmd(); + memset((void *)start, PAGE_UNUSED, end - start); + return !memchr_inv((void *)page, PAGE_UNUSED, PMD_SIZE); +} + +/* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */ +static int __ref modify_pte_table(pmd_t *pmd, unsigned long addr, + unsigned long end, bool add, bool direct, + struct vmem_altmap *altmap) +{ + unsigned long prot, pages = 0; int ret = -ENOMEM; + pte_t *pte; - while (address < end) { - pg_dir = pgd_offset_k(address); - if (pgd_none(*pg_dir)) { - pu_dir = vmem_pud_alloc(); - if (!pu_dir) - goto out; - pgd_populate(&init_mm, pg_dir, pu_dir); - } - pu_dir = pud_offset(pg_dir, address); -#if defined(CONFIG_64BIT) && !defined(CONFIG_DEBUG_PAGEALLOC) - if (MACHINE_HAS_EDAT2 && pud_none(*pu_dir) && address && - !(address & ~PUD_MASK) && (address + PUD_SIZE <= end)) { - pud_val(*pu_dir) = __pa(address) | - _REGION_ENTRY_TYPE_R3 | _REGION3_ENTRY_LARGE | - (ro ? 
_REGION_ENTRY_RO : 0); - address += PUD_SIZE; - continue; - } -#endif - if (pud_none(*pu_dir)) { - pm_dir = vmem_pmd_alloc(); - if (!pm_dir) - goto out; - pud_populate(&init_mm, pu_dir, pm_dir); - } - pm_dir = pmd_offset(pu_dir, address); -#if defined(CONFIG_64BIT) && !defined(CONFIG_DEBUG_PAGEALLOC) - if (MACHINE_HAS_EDAT1 && pmd_none(*pm_dir) && address && - !(address & ~PMD_MASK) && (address + PMD_SIZE <= end)) { - pmd_val(*pm_dir) = __pa(address) | - _SEGMENT_ENTRY | _SEGMENT_ENTRY_LARGE | - (ro ? _SEGMENT_ENTRY_RO : 0); - address += PMD_SIZE; + prot = pgprot_val(PAGE_KERNEL); + pte = pte_offset_kernel(pmd, addr); + for (; addr < end; addr += PAGE_SIZE, pte++) { + if (!add) { + if (pte_none(*pte)) + continue; + if (!direct) + vmem_free_pages((unsigned long)pfn_to_virt(pte_pfn(*pte)), get_order(PAGE_SIZE), altmap); + pte_clear(&init_mm, addr, pte); + } else if (pte_none(*pte)) { + if (!direct) { + void *new_page = vmemmap_alloc_block_buf(PAGE_SIZE, NUMA_NO_NODE, altmap); + + if (!new_page) + goto out; + set_pte(pte, __pte(__pa(new_page) | prot)); + } else { + set_pte(pte, __pte(__pa(addr) | prot)); + } + } else { continue; } -#endif - if (pmd_none(*pm_dir)) { - pt_dir = vmem_pte_alloc(address); - if (!pt_dir) - goto out; - pmd_populate(&init_mm, pm_dir, pt_dir); - } - - pt_dir = pte_offset_kernel(pm_dir, address); - pte_val(*pt_dir) = __pa(address) | (ro ? _PAGE_RO : 0); - address += PAGE_SIZE; + pages++; } ret = 0; out: - flush_tlb_kernel_range(start, end); + if (direct) + update_page_count(PG_DIRECT_MAP_4K, add ? pages : -pages); return ret; } -/* - * Remove a physical memory range from the 1:1 mapping. - * Currently only invalidates page table entries. - */ -static void vmem_remove_range(unsigned long start, unsigned long size) +static void try_free_pte_table(pmd_t *pmd, unsigned long start) { - unsigned long end = start + size; - unsigned long address = start; - pgd_t *pg_dir; - pud_t *pu_dir; - pmd_t *pm_dir; - pte_t *pt_dir; - pte_t pte; - - pte_val(pte) = _PAGE_TYPE_EMPTY; - while (address < end) { - pg_dir = pgd_offset_k(address); - if (pgd_none(*pg_dir)) { - address += PGDIR_SIZE; - continue; - } - pu_dir = pud_offset(pg_dir, address); - if (pud_none(*pu_dir)) { - address += PUD_SIZE; - continue; - } - if (pud_large(*pu_dir)) { - pud_clear(pu_dir); - address += PUD_SIZE; - continue; - } - pm_dir = pmd_offset(pu_dir, address); - if (pmd_none(*pm_dir)) { - address += PMD_SIZE; - continue; - } - if (pmd_large(*pm_dir)) { - pmd_clear(pm_dir); - address += PMD_SIZE; - continue; - } - pt_dir = pte_offset_kernel(pm_dir, address); - *pt_dir = pte; - address += PAGE_SIZE; + pte_t *pte; + int i; + + /* We can safely assume this is fully in 1:1 mapping & vmemmap area */ + pte = pte_offset_kernel(pmd, start); + for (i = 0; i < PTRS_PER_PTE; i++, pte++) { + if (!pte_none(*pte)) + return; } - flush_tlb_kernel_range(start, end); + vmem_pte_free((unsigned long *) pmd_deref(*pmd)); + pmd_clear(pmd); } -/* - * Add a backed mem_map array to the virtual mem_map array. 
- */ -int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) +/* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */ +static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, + unsigned long end, bool add, bool direct, + struct vmem_altmap *altmap) { - unsigned long address = start; - pgd_t *pg_dir; - pud_t *pu_dir; - pmd_t *pm_dir; - pte_t *pt_dir; + unsigned long next, prot, pages = 0; int ret = -ENOMEM; + pmd_t *pmd; + pte_t *pte; - for (address = start; address < end;) { - pg_dir = pgd_offset_k(address); - if (pgd_none(*pg_dir)) { - pu_dir = vmem_pud_alloc(); - if (!pu_dir) - goto out; - pgd_populate(&init_mm, pg_dir, pu_dir); - } + prot = pgprot_val(SEGMENT_KERNEL); + pmd = pmd_offset(pud, addr); + for (; addr < end; addr = next, pmd++) { + next = pmd_addr_end(addr, end); + if (!add) { + if (pmd_none(*pmd)) + continue; + if (pmd_leaf(*pmd)) { + if (IS_ALIGNED(addr, PMD_SIZE) && + IS_ALIGNED(next, PMD_SIZE)) { + if (!direct) + vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE), altmap); + pmd_clear(pmd); + pages++; + } else if (!direct && vmemmap_unuse_sub_pmd(addr, next)) { + vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE), altmap); + pmd_clear(pmd); + } + continue; + } + } else if (pmd_none(*pmd)) { + if (IS_ALIGNED(addr, PMD_SIZE) && + IS_ALIGNED(next, PMD_SIZE) && + cpu_has_edat1() && direct && + !debug_pagealloc_enabled()) { + set_pmd(pmd, __pmd(__pa(addr) | prot)); + pages++; + continue; + } else if (!direct && cpu_has_edat1()) { + void *new_page; - pu_dir = pud_offset(pg_dir, address); - if (pud_none(*pu_dir)) { - pm_dir = vmem_pmd_alloc(); - if (!pm_dir) + /* + * Use 1MB frames for vmemmap if available. We + * always use large frames even if they are only + * partially used. Otherwise we would have also + * page tables since vmemmap_populate gets + * called for each section separately. + */ + new_page = vmemmap_alloc_block_buf(PMD_SIZE, NUMA_NO_NODE, altmap); + if (new_page) { + set_pmd(pmd, __pmd(__pa(new_page) | prot)); + if (!IS_ALIGNED(addr, PMD_SIZE) || + !IS_ALIGNED(next, PMD_SIZE)) { + vmemmap_use_new_sub_pmd(addr, next); + } + continue; + } + } + pte = vmem_pte_alloc(); + if (!pte) goto out; - pud_populate(&init_mm, pu_dir, pm_dir); + pmd_populate(&init_mm, pmd, pte); + } else if (pmd_leaf(*pmd)) { + if (!direct) + vmemmap_use_sub_pmd(addr, next); + continue; } + ret = modify_pte_table(pmd, addr, next, add, direct, altmap); + if (ret) + goto out; + if (!add) + try_free_pte_table(pmd, addr & PMD_MASK); + } + ret = 0; +out: + if (direct) + update_page_count(PG_DIRECT_MAP_1M, add ? pages : -pages); + return ret; +} - pm_dir = pmd_offset(pu_dir, address); - if (pmd_none(*pm_dir)) { -#ifdef CONFIG_64BIT - /* Use 1MB frames for vmemmap if available. We always - * use large frames even if they are only partially - * used. - * Otherwise we would have also page tables since - * vmemmap_populate gets called for each section - * separately. 
*/ - if (MACHINE_HAS_EDAT1) { - void *new_page; +static void try_free_pmd_table(pud_t *pud, unsigned long start) +{ + pmd_t *pmd; + int i; - new_page = vmemmap_alloc_block(PMD_SIZE, node); - if (!new_page) - goto out; - pmd_val(*pm_dir) = __pa(new_page) | - _SEGMENT_ENTRY | _SEGMENT_ENTRY_LARGE | - _SEGMENT_ENTRY_CO; - address = (address + PMD_SIZE) & PMD_MASK; + pmd = pmd_offset(pud, start); + for (i = 0; i < PTRS_PER_PMD; i++, pmd++) + if (!pmd_none(*pmd)) + return; + vmem_free_pages(pud_deref(*pud), CRST_ALLOC_ORDER, NULL); + pud_clear(pud); +} + +static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end, + bool add, bool direct, struct vmem_altmap *altmap) +{ + unsigned long next, prot, pages = 0; + int ret = -ENOMEM; + pud_t *pud; + pmd_t *pmd; + + prot = pgprot_val(REGION3_KERNEL); + pud = pud_offset(p4d, addr); + for (; addr < end; addr = next, pud++) { + next = pud_addr_end(addr, end); + if (!add) { + if (pud_none(*pud)) + continue; + if (pud_leaf(*pud)) { + if (IS_ALIGNED(addr, PUD_SIZE) && + IS_ALIGNED(next, PUD_SIZE)) { + pud_clear(pud); + pages++; + } continue; } -#endif - pt_dir = vmem_pte_alloc(address); - if (!pt_dir) + } else if (pud_none(*pud)) { + if (IS_ALIGNED(addr, PUD_SIZE) && + IS_ALIGNED(next, PUD_SIZE) && + cpu_has_edat2() && direct && + !debug_pagealloc_enabled()) { + set_pud(pud, __pud(__pa(addr) | prot)); + pages++; + continue; + } + pmd = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY); + if (!pmd) goto out; - pmd_populate(&init_mm, pm_dir, pt_dir); - } else if (pmd_large(*pm_dir)) { - address = (address + PMD_SIZE) & PMD_MASK; + pud_populate(&init_mm, pud, pmd); + } else if (pud_leaf(*pud)) { continue; } + ret = modify_pmd_table(pud, addr, next, add, direct, altmap); + if (ret) + goto out; + if (!add) + try_free_pmd_table(pud, addr & PUD_MASK); + } + ret = 0; +out: + if (direct) + update_page_count(PG_DIRECT_MAP_2G, add ? 
pages : -pages); + return ret; +} - pt_dir = pte_offset_kernel(pm_dir, address); - if (pte_none(*pt_dir)) { - unsigned long new_page; +static void try_free_pud_table(p4d_t *p4d, unsigned long start) +{ + pud_t *pud; + int i; + + pud = pud_offset(p4d, start); + for (i = 0; i < PTRS_PER_PUD; i++, pud++) { + if (!pud_none(*pud)) + return; + } + vmem_free_pages(p4d_deref(*p4d), CRST_ALLOC_ORDER, NULL); + p4d_clear(p4d); +} - new_page =__pa(vmem_alloc_pages(0)); - if (!new_page) +static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end, + bool add, bool direct, struct vmem_altmap *altmap) +{ + unsigned long next; + int ret = -ENOMEM; + p4d_t *p4d; + pud_t *pud; + + p4d = p4d_offset(pgd, addr); + for (; addr < end; addr = next, p4d++) { + next = p4d_addr_end(addr, end); + if (!add) { + if (p4d_none(*p4d)) + continue; + } else if (p4d_none(*p4d)) { + pud = vmem_crst_alloc(_REGION3_ENTRY_EMPTY); + if (!pud) goto out; - pte_val(*pt_dir) = __pa(new_page); + p4d_populate(&init_mm, p4d, pud); } - address += PAGE_SIZE; + ret = modify_pud_table(p4d, addr, next, add, direct, altmap); + if (ret) + goto out; + if (!add) + try_free_pud_table(p4d, addr & P4D_MASK); } - memset((void *)start, 0, end - start); ret = 0; out: - flush_tlb_kernel_range(start, end); return ret; } -void vmemmap_free(unsigned long start, unsigned long end) +static void try_free_p4d_table(pgd_t *pgd, unsigned long start) { + p4d_t *p4d; + int i; + + p4d = p4d_offset(pgd, start); + for (i = 0; i < PTRS_PER_P4D; i++, p4d++) { + if (!p4d_none(*p4d)) + return; + } + vmem_free_pages(pgd_deref(*pgd), CRST_ALLOC_ORDER, NULL); + pgd_clear(pgd); } -/* - * Add memory segment to the segment list if it doesn't overlap with - * an already present segment. - */ -static int insert_memory_segment(struct memory_segment *seg) +static int modify_pagetable(unsigned long start, unsigned long end, bool add, + bool direct, struct vmem_altmap *altmap) { - struct memory_segment *tmp; + unsigned long addr, next; + int ret = -ENOMEM; + pgd_t *pgd; + p4d_t *p4d; + + if (WARN_ON_ONCE(!PAGE_ALIGNED(start | end))) + return -EINVAL; + /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */ + if (WARN_ON_ONCE(end > __abs_lowcore)) + return -EINVAL; + for (addr = start; addr < end; addr = next) { + next = pgd_addr_end(addr, end); + pgd = pgd_offset_k(addr); + + if (!add) { + if (pgd_none(*pgd)) + continue; + } else if (pgd_none(*pgd)) { + p4d = vmem_crst_alloc(_REGION2_ENTRY_EMPTY); + if (!p4d) + goto out; + pgd_populate(&init_mm, pgd, p4d); + } + ret = modify_p4d_table(pgd, addr, next, add, direct, altmap); + if (ret) + goto out; + if (!add) + try_free_p4d_table(pgd, addr & PGDIR_MASK); + } + ret = 0; +out: + if (!add) + flush_tlb_kernel_range(start, end); + return ret; +} - if (seg->start + seg->size > VMEM_MAX_PHYS || - seg->start + seg->size < seg->start) - return -ERANGE; +static int add_pagetable(unsigned long start, unsigned long end, bool direct, + struct vmem_altmap *altmap) +{ + return modify_pagetable(start, end, true, direct, altmap); +} - list_for_each_entry(tmp, &mem_segs, list) { - if (seg->start >= tmp->start + tmp->size) - continue; - if (seg->start + seg->size <= tmp->start) - continue; - return -ENOSPC; - } - list_add(&seg->list, &mem_segs); - return 0; +static int remove_pagetable(unsigned long start, unsigned long end, bool direct, + struct vmem_altmap *altmap) +{ + return modify_pagetable(start, end, false, direct, altmap); } /* - * Remove memory segment from the segment list. 
+ * Add a physical memory range to the 1:1 mapping. */ -static void remove_memory_segment(struct memory_segment *seg) +static int vmem_add_range(unsigned long start, unsigned long size) { - list_del(&seg->list); + start = (unsigned long)__va(start); + return add_pagetable(start, start + size, true, NULL); } -static void __remove_shared_memory(struct memory_segment *seg) +/* + * Remove a physical memory range from the 1:1 mapping. + */ +static void vmem_remove_range(unsigned long start, unsigned long size) { - remove_memory_segment(seg); - vmem_remove_range(seg->start, seg->size); + start = (unsigned long)__va(start); + remove_pagetable(start, start + size, true, NULL); } -int vmem_remove_mapping(unsigned long start, unsigned long size) +/* + * Add a backed mem_map array to the virtual mem_map array. + */ +int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, + struct vmem_altmap *altmap) { - struct memory_segment *seg; int ret; mutex_lock(&vmem_mutex); + /* We don't care about the node, just use NUMA_NO_NODE on allocations */ + ret = add_pagetable(start, end, false, altmap); + if (ret) + remove_pagetable(start, end, false, altmap); + mutex_unlock(&vmem_mutex); + return ret; +} - ret = -ENOENT; - list_for_each_entry(seg, &mem_segs, list) { - if (seg->start == start && seg->size == size) - break; - } +#ifdef CONFIG_MEMORY_HOTPLUG - if (seg->start != start || seg->size != size) - goto out; +void vmemmap_free(unsigned long start, unsigned long end, + struct vmem_altmap *altmap) +{ + mutex_lock(&vmem_mutex); + remove_pagetable(start, end, false, altmap); + mutex_unlock(&vmem_mutex); +} - ret = 0; - __remove_shared_memory(seg); - kfree(seg); -out: +#endif + +void vmem_remove_mapping(unsigned long start, unsigned long size) +{ + mutex_lock(&vmem_mutex); + vmem_remove_range(start, size); mutex_unlock(&vmem_mutex); - return ret; +} + +struct range arch_get_mappable_range(void) +{ + struct range mhp_range; + + mhp_range.start = 0; + mhp_range.end = max_mappable - 1; + return mhp_range; } int vmem_add_mapping(unsigned long start, unsigned long size) { - struct memory_segment *seg; + struct range range = arch_get_mappable_range(); int ret; - mutex_lock(&vmem_mutex); - ret = -ENOMEM; - seg = kzalloc(sizeof(*seg), GFP_KERNEL); - if (!seg) - goto out; - seg->start = start; - seg->size = size; - - ret = insert_memory_segment(seg); - if (ret) - goto out_free; + if (start < range.start || + start + size > range.end + 1 || + start + size < start) + return -ERANGE; - ret = vmem_add_mem(start, size, 0); + mutex_lock(&vmem_mutex); + ret = vmem_add_range(start, size); if (ret) - goto out_remove; - goto out; - -out_remove: - __remove_shared_memory(seg); -out_free: - kfree(seg); -out: + vmem_remove_range(start, size); mutex_unlock(&vmem_mutex); return ret; } /* - * map whole physical memory to virtual memory (identity mapping) - * we reserve enough space in the vmalloc area for vmemmap to hotplug - * additional memory segments. + * Allocate new or return existing page-table entry, but do not map it + * to any physical address. If missing, allocate segment- and region- + * table entries along. Meeting a large segment- or region-table entry + * while traversing is an error, since the function is expected to be + * called against virtual regions reserved for 4KB mappings only. 
*/ -void __init vmem_map_init(void) +pte_t *vmem_get_alloc_pte(unsigned long addr, bool alloc) { - unsigned long ro_start, ro_end; - unsigned long start, end; - int i; + pte_t *ptep = NULL; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; - ro_start = PFN_ALIGN((unsigned long)&_stext); - ro_end = (unsigned long)&_eshared & PAGE_MASK; - for (i = 0; i < MEMORY_CHUNKS; i++) { - if (!memory_chunk[i].size) - continue; - start = memory_chunk[i].addr; - end = memory_chunk[i].addr + memory_chunk[i].size; - if (start >= ro_end || end <= ro_start) - vmem_add_mem(start, end - start, 0); - else if (start >= ro_start && end <= ro_end) - vmem_add_mem(start, end - start, 1); - else if (start >= ro_start) { - vmem_add_mem(start, ro_end - start, 1); - vmem_add_mem(ro_end, end - ro_end, 0); - } else if (end < ro_end) { - vmem_add_mem(start, ro_start - start, 0); - vmem_add_mem(ro_start, end - ro_start, 1); - } else { - vmem_add_mem(start, ro_start - start, 0); - vmem_add_mem(ro_start, ro_end - ro_start, 1); - vmem_add_mem(ro_end, end - ro_end, 0); - } + pgd = pgd_offset_k(addr); + if (pgd_none(*pgd)) { + if (!alloc) + goto out; + p4d = vmem_crst_alloc(_REGION2_ENTRY_EMPTY); + if (!p4d) + goto out; + pgd_populate(&init_mm, pgd, p4d); } + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) { + if (!alloc) + goto out; + pud = vmem_crst_alloc(_REGION3_ENTRY_EMPTY); + if (!pud) + goto out; + p4d_populate(&init_mm, p4d, pud); + } + pud = pud_offset(p4d, addr); + if (pud_none(*pud)) { + if (!alloc) + goto out; + pmd = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY); + if (!pmd) + goto out; + pud_populate(&init_mm, pud, pmd); + } else if (WARN_ON_ONCE(pud_leaf(*pud))) { + goto out; + } + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) { + if (!alloc) + goto out; + pte = vmem_pte_alloc(); + if (!pte) + goto out; + pmd_populate(&init_mm, pmd, pte); + } else if (WARN_ON_ONCE(pmd_leaf(*pmd))) { + goto out; + } + ptep = pte_offset_kernel(pmd, addr); +out: + return ptep; } -/* - * Convert memory chunk array to a memory segment list so there is a single - * list that contains both r/w memory and shared memory segments. 
- */ -static int __init vmem_convert_memory_chunk(void) +int __vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot, bool alloc) { - struct memory_segment *seg; - int i; + pte_t *ptep, pte; + + if (!IS_ALIGNED(addr, PAGE_SIZE)) + return -EINVAL; + ptep = vmem_get_alloc_pte(addr, alloc); + if (!ptep) + return -ENOMEM; + __ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL); + pte = mk_pte_phys(phys, prot); + set_pte(ptep, pte); + return 0; +} + +int vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot) +{ + int rc; mutex_lock(&vmem_mutex); - for (i = 0; i < MEMORY_CHUNKS; i++) { - if (!memory_chunk[i].size) - continue; - seg = kzalloc(sizeof(*seg), GFP_KERNEL); - if (!seg) - panic("Out of memory...\n"); - seg->start = memory_chunk[i].addr; - seg->size = memory_chunk[i].size; - insert_memory_segment(seg); - } + rc = __vmem_map_4k_page(addr, phys, prot, true); + mutex_unlock(&vmem_mutex); + return rc; +} + +void vmem_unmap_4k_page(unsigned long addr) +{ + pte_t *ptep; + + mutex_lock(&vmem_mutex); + ptep = virt_to_kpte(addr); + __ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL); + pte_clear(&init_mm, addr, ptep); mutex_unlock(&vmem_mutex); - return 0; } -core_initcall(vmem_convert_memory_chunk); +void __init vmem_map_init(void) +{ + __set_memory_rox(_stext, _etext); + __set_memory_ro(_etext, __end_rodata); + __set_memory_rox(__stext_amode31, __etext_amode31); + /* + * If the BEAR-enhancement facility is not installed the first + * prefix page is used to return to the previous context with + * an LPSWE instruction and therefore must be executable. + */ + if (!cpu_has_bear()) + set_memory_x(0, 1); + if (debug_pagealloc_enabled()) + __set_memory_4k(__va(0), absolute_pointer(__va(0)) + ident_map_size); + pr_info("Write protected kernel read-only data: %luk\n", + (unsigned long)(__end_rodata - _stext) >> 10); +} |
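Note on the guest storage key helpers introduced in pgtable.c above: set_guest_storage_key(), cond_set_guest_storage_key(), reset_guest_reference_bit() and get_guest_storage_key() are the exported interface through which KVM-side code can emulate the SSKE/ISKE/RRBE storage-key instructions against the host page tables. The following is a minimal caller sketch, not taken from the kernel: only the helper names, signatures and return conventions come from the diff above; the wrapper function, its name, the include paths and the locking context are assumptions for illustration.

/*
 * Illustrative only: conditionally update the storage key backing a guest
 * page at host virtual address "hva". The prototypes are assumed to come
 * from arch/s390/include/asm/pgtable.h.
 */
#include <linux/mm.h>
#include <asm/pgtable.h>

static int example_emulate_sske(unsigned long hva, unsigned char key,
				bool nq, bool mr, bool mc)
{
	unsigned char oldkey = 0;
	int rc;

	mmap_read_lock(current->mm);
	rc = cond_set_guest_storage_key(current->mm, hva, key, &oldkey,
					nq, mr, mc);
	mmap_read_unlock(current->mm);
	/* rc: < 0 on access error, 0 if no update was needed, 1 if changed */
	return rc;
}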

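Similarly, the reworked vmem.c above adds vmem_map_4k_page() and vmem_unmap_4k_page() for establishing and tearing down single 4KB kernel mappings, allocating any missing region, segment and page tables along the way. A short usage sketch follows; the helper names, signatures and error values are taken from the code above, while the wrapper, its parameters, the include paths and the assumption that "addr" lies in a region reserved for 4KB mappings (as required by the comment above vmem_get_alloc_pte()) are illustrative only.

/*
 * Illustrative only: map one 4KB physical page read-write at a kernel
 * virtual address, use it, then remove the mapping again.
 */
#include <linux/mm.h>
#include <asm/pgtable.h>

static int example_map_one_page(unsigned long addr, unsigned long phys)
{
	int rc;

	/* -EINVAL if addr is not page aligned, -ENOMEM if a table allocation fails */
	rc = vmem_map_4k_page(addr, phys, PAGE_KERNEL);
	if (rc)
		return rc;

	/* ... access the new mapping here ... */

	vmem_unmap_4k_page(addr);
	return 0;
}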