Diffstat (limited to 'arch/s390/mm')
-rw-r--r--  arch/s390/mm/Makefile           |   1
-rw-r--r--  arch/s390/mm/cmm.c              |  29
-rw-r--r--  arch/s390/mm/dump_pagetables.c  | 232
-rw-r--r--  arch/s390/mm/extable.c          |  30
-rw-r--r--  arch/s390/mm/extmem.c           |  18
-rw-r--r--  arch/s390/mm/fault.c            | 232
-rw-r--r--  arch/s390/mm/gmap.c             | 823
-rw-r--r--  arch/s390/mm/hugetlbpage.c      | 147
-rw-r--r--  arch/s390/mm/init.c             |  58
-rw-r--r--  arch/s390/mm/maccess.c          |   5
-rw-r--r--  arch/s390/mm/mmap.c             |  83
-rw-r--r--  arch/s390/mm/pageattr.c         |  40
-rw-r--r--  arch/s390/mm/pgalloc.c          |  50
-rw-r--r--  arch/s390/mm/pgtable.c          |  16
-rw-r--r--  arch/s390/mm/physaddr.c         |  15
-rw-r--r--  arch/s390/mm/vmem.c             | 100
16 files changed, 739 insertions(+), 1140 deletions(-)
diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile
index 352ff520fd94..f6c2db7a8669 100644
--- a/arch/s390/mm/Makefile
+++ b/arch/s390/mm/Makefile
@@ -7,6 +7,7 @@ obj-y := init.o fault.o extmem.o mmap.o vmem.o maccess.o
obj-y += page-states.o pageattr.o pgtable.o pgalloc.o extable.o
obj-$(CONFIG_CMM) += cmm.o
+obj-$(CONFIG_DEBUG_VIRTUAL) += physaddr.o
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
obj-$(CONFIG_PTDUMP_CORE) += dump_pagetables.o
obj-$(CONFIG_PGSTE) += gmap.o
diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c
index f8b13f247646..39f44b6256e0 100644
--- a/arch/s390/mm/cmm.c
+++ b/arch/s390/mm/cmm.c
@@ -95,11 +95,12 @@ static long cmm_alloc_pages(long nr, long *counter,
(*counter)++;
spin_unlock(&cmm_lock);
nr--;
+ cond_resched();
}
return nr;
}
-static long cmm_free_pages(long nr, long *counter, struct cmm_page_array **list)
+static long __cmm_free_pages(long nr, long *counter, struct cmm_page_array **list)
{
struct cmm_page_array *pa;
unsigned long addr;
@@ -123,6 +124,21 @@ static long cmm_free_pages(long nr, long *counter, struct cmm_page_array **list)
return nr;
}
+static long cmm_free_pages(long nr, long *counter, struct cmm_page_array **list)
+{
+ long inc = 0;
+
+ while (nr) {
+ inc = min(256L, nr);
+ nr -= inc;
+ inc = __cmm_free_pages(inc, counter, list);
+ if (inc)
+ break;
+ cond_resched();
+ }
+ return nr + inc;
+}
+
static int cmm_oom_notify(struct notifier_block *self,
unsigned long dummy, void *parm)
{
@@ -188,7 +204,7 @@ static void cmm_set_timer(void)
del_timer(&cmm_timer);
return;
}
- mod_timer(&cmm_timer, jiffies + msecs_to_jiffies(cmm_timeout_seconds * MSEC_PER_SEC));
+ mod_timer(&cmm_timer, jiffies + secs_to_jiffies(cmm_timeout_seconds));
}
static void cmm_timer_fn(struct timer_list *unused)
@@ -243,7 +259,7 @@ static int cmm_skip_blanks(char *cp, char **endp)
return str != cp;
}
-static int cmm_pages_handler(struct ctl_table *ctl, int write,
+static int cmm_pages_handler(const struct ctl_table *ctl, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
long nr = cmm_get_pages();
@@ -262,7 +278,7 @@ static int cmm_pages_handler(struct ctl_table *ctl, int write,
return 0;
}
-static int cmm_timed_pages_handler(struct ctl_table *ctl, int write,
+static int cmm_timed_pages_handler(const struct ctl_table *ctl, int write,
void *buffer, size_t *lenp,
loff_t *ppos)
{
@@ -282,7 +298,7 @@ static int cmm_timed_pages_handler(struct ctl_table *ctl, int write,
return 0;
}
-static int cmm_timeout_handler(struct ctl_table *ctl, int write,
+static int cmm_timeout_handler(const struct ctl_table *ctl, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
char buf[64], *p;
@@ -316,7 +332,7 @@ static int cmm_timeout_handler(struct ctl_table *ctl, int write,
return 0;
}
-static struct ctl_table cmm_table[] = {
+static const struct ctl_table cmm_table[] = {
{
.procname = "cmm_pages",
.mode = 0644,
@@ -427,4 +443,5 @@ static void __exit cmm_exit(void)
}
module_exit(cmm_exit);
+MODULE_DESCRIPTION("Cooperative memory management interface");
MODULE_LICENSE("GPL");
diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c
index d37a8f607b71..fa54f3bc0c8d 100644
--- a/arch/s390/mm/dump_pagetables.c
+++ b/arch/s390/mm/dump_pagetables.c
@@ -3,10 +3,10 @@
#include <linux/ptdump.h>
#include <linux/seq_file.h>
#include <linux/debugfs.h>
+#include <linux/sort.h>
#include <linux/mm.h>
#include <linux/kfence.h>
#include <linux/kasan.h>
-#include <asm/ptdump.h>
#include <asm/kasan.h>
#include <asm/abs_lowcore.h>
#include <asm/nospec-branch.h>
@@ -16,68 +16,14 @@
static unsigned long max_addr;
struct addr_marker {
+ int is_start;
unsigned long start_address;
+ unsigned long size;
const char *name;
};
-enum address_markers_idx {
- IDENTITY_BEFORE_NR = 0,
- IDENTITY_BEFORE_END_NR,
- AMODE31_START_NR,
- AMODE31_END_NR,
- KERNEL_START_NR,
- KERNEL_END_NR,
-#ifdef CONFIG_KFENCE
- KFENCE_START_NR,
- KFENCE_END_NR,
-#endif
- IDENTITY_AFTER_NR,
- IDENTITY_AFTER_END_NR,
- VMEMMAP_NR,
- VMEMMAP_END_NR,
- VMALLOC_NR,
- VMALLOC_END_NR,
- MODULES_NR,
- MODULES_END_NR,
- ABS_LOWCORE_NR,
- ABS_LOWCORE_END_NR,
- MEMCPY_REAL_NR,
- MEMCPY_REAL_END_NR,
-#ifdef CONFIG_KASAN
- KASAN_SHADOW_START_NR,
- KASAN_SHADOW_END_NR,
-#endif
-};
-
-static struct addr_marker address_markers[] = {
- [IDENTITY_BEFORE_NR] = {0, "Identity Mapping Start"},
- [IDENTITY_BEFORE_END_NR] = {(unsigned long)_stext, "Identity Mapping End"},
- [AMODE31_START_NR] = {0, "Amode31 Area Start"},
- [AMODE31_END_NR] = {0, "Amode31 Area End"},
- [KERNEL_START_NR] = {(unsigned long)_stext, "Kernel Image Start"},
- [KERNEL_END_NR] = {(unsigned long)_end, "Kernel Image End"},
-#ifdef CONFIG_KFENCE
- [KFENCE_START_NR] = {0, "KFence Pool Start"},
- [KFENCE_END_NR] = {0, "KFence Pool End"},
-#endif
- [IDENTITY_AFTER_NR] = {(unsigned long)_end, "Identity Mapping Start"},
- [IDENTITY_AFTER_END_NR] = {0, "Identity Mapping End"},
- [VMEMMAP_NR] = {0, "vmemmap Area Start"},
- [VMEMMAP_END_NR] = {0, "vmemmap Area End"},
- [VMALLOC_NR] = {0, "vmalloc Area Start"},
- [VMALLOC_END_NR] = {0, "vmalloc Area End"},
- [MODULES_NR] = {0, "Modules Area Start"},
- [MODULES_END_NR] = {0, "Modules Area End"},
- [ABS_LOWCORE_NR] = {0, "Lowcore Area Start"},
- [ABS_LOWCORE_END_NR] = {0, "Lowcore Area End"},
- [MEMCPY_REAL_NR] = {0, "Real Memory Copy Area Start"},
- [MEMCPY_REAL_END_NR] = {0, "Real Memory Copy Area End"},
-#ifdef CONFIG_KASAN
- [KASAN_SHADOW_START_NR] = {KASAN_SHADOW_START, "Kasan Shadow Start"},
- [KASAN_SHADOW_END_NR] = {KASAN_SHADOW_END, "Kasan Shadow End"},
-#endif
- { -1, NULL }
-};
+static struct addr_marker *markers;
+static unsigned int markers_cnt;
struct pg_state {
struct ptdump_state ptdump;
@@ -122,7 +68,6 @@ static void print_prot(struct seq_file *m, unsigned int pr, int level)
static void note_prot_wx(struct pg_state *st, unsigned long addr)
{
-#ifdef CONFIG_DEBUG_WX
if (!st->check_wx)
return;
if (st->current_prot & _PAGE_INVALID)
@@ -139,10 +84,24 @@ static void note_prot_wx(struct pg_state *st, unsigned long addr)
*/
if (addr == PAGE_SIZE && (nospec_uses_trampoline() || !static_key_enabled(&cpu_has_bear)))
return;
- WARN_ONCE(1, "s390/mm: Found insecure W+X mapping at address %pS\n",
+ WARN_ONCE(IS_ENABLED(CONFIG_DEBUG_WX),
+ "s390/mm: Found insecure W+X mapping at address %pS\n",
(void *)st->start_address);
st->wx_pages += (addr - st->start_address) / PAGE_SIZE;
-#endif /* CONFIG_DEBUG_WX */
+}
+
+static void note_page_update_state(struct pg_state *st, unsigned long addr, unsigned int prot, int level)
+{
+ struct seq_file *m = st->seq;
+
+ while (addr >= st->marker[1].start_address) {
+ st->marker++;
+ pt_dump_seq_printf(m, "---[ %s %s ]---\n", st->marker->name,
+ st->marker->is_start ? "Start" : "End");
+ }
+ st->start_address = addr;
+ st->current_prot = prot;
+ st->level = level;
}
static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, u64 val)
@@ -167,10 +126,8 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
if (level == -1)
addr = max_addr;
if (st->level == -1) {
- pt_dump_seq_printf(m, "---[ %s ]---\n", st->marker->name);
- st->start_address = addr;
- st->current_prot = prot;
- st->level = level;
+ pt_dump_seq_puts(m, "---[ Kernel Virtual Address Space ]---\n");
+ note_page_update_state(st, addr, prot, level);
} else if (prot != st->current_prot || level != st->level ||
addr >= st->marker[1].start_address) {
note_prot_wx(st, addr);
@@ -184,18 +141,11 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
}
pt_dump_seq_printf(m, "%9lu%c ", delta, *unit);
print_prot(m, st->current_prot, st->level);
- while (addr >= st->marker[1].start_address) {
- st->marker++;
- pt_dump_seq_printf(m, "---[ %s ]---\n", st->marker->name);
- }
- st->start_address = addr;
- st->current_prot = prot;
- st->level = level;
+ note_page_update_state(st, addr, prot, level);
}
}
-#ifdef CONFIG_DEBUG_WX
-void ptdump_check_wx(void)
+bool ptdump_check_wx(void)
{
struct pg_state st = {
.ptdump = {
@@ -218,16 +168,20 @@ void ptdump_check_wx(void)
};
if (!MACHINE_HAS_NX)
- return;
+ return true;
ptdump_walk_pgd(&st.ptdump, &init_mm, NULL);
- if (st.wx_pages)
+ if (st.wx_pages) {
pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found\n", st.wx_pages);
- else
+
+ return false;
+ } else {
pr_info("Checked W+X mappings: passed, no %sW+X pages found\n",
(nospec_uses_trampoline() || !static_key_enabled(&cpu_has_bear)) ?
"unexpected " : "");
+
+ return true;
+ }
}
-#endif /* CONFIG_DEBUG_WX */
#ifdef CONFIG_PTDUMP_DEBUGFS
static int ptdump_show(struct seq_file *m, void *v)
@@ -246,7 +200,7 @@ static int ptdump_show(struct seq_file *m, void *v)
.check_wx = false,
.wx_pages = 0,
.start_address = 0,
- .marker = address_markers,
+ .marker = markers,
};
get_online_mems();
@@ -259,22 +213,66 @@ static int ptdump_show(struct seq_file *m, void *v)
DEFINE_SHOW_ATTRIBUTE(ptdump);
#endif /* CONFIG_PTDUMP_DEBUGFS */
-/*
- * Heapsort from lib/sort.c is not a stable sorting algorithm, do a simple
- * insertion sort to preserve the original order of markers with the same
- * start address.
- */
-static void sort_address_markers(void)
+static int ptdump_cmp(const void *a, const void *b)
{
- struct addr_marker tmp;
- int i, j;
+ const struct addr_marker *ama = a;
+ const struct addr_marker *amb = b;
- for (i = 1; i < ARRAY_SIZE(address_markers) - 1; i++) {
- tmp = address_markers[i];
- for (j = i - 1; j >= 0 && address_markers[j].start_address > tmp.start_address; j--)
- address_markers[j + 1] = address_markers[j];
- address_markers[j + 1] = tmp;
+ if (ama->start_address > amb->start_address)
+ return 1;
+ if (ama->start_address < amb->start_address)
+ return -1;
+ /*
+ * If the start addresses of two markers are identical sort markers in an
+ * order that considers areas contained within other areas correctly.
+ */
+ if (ama->is_start && amb->is_start) {
+ if (ama->size > amb->size)
+ return -1;
+ if (ama->size < amb->size)
+ return 1;
+ return 0;
+ }
+ if (!ama->is_start && !amb->is_start) {
+ if (ama->size > amb->size)
+ return 1;
+ if (ama->size < amb->size)
+ return -1;
+ return 0;
}
+ if (ama->is_start)
+ return 1;
+ if (amb->is_start)
+ return -1;
+ return 0;
+}
+
+static int add_marker(unsigned long start, unsigned long end, const char *name)
+{
+ size_t oldsize, newsize;
+
+ oldsize = markers_cnt * sizeof(*markers);
+ newsize = oldsize + 2 * sizeof(*markers);
+ if (!oldsize)
+ markers = kvmalloc(newsize, GFP_KERNEL);
+ else
+ markers = kvrealloc(markers, newsize, GFP_KERNEL);
+ if (!markers)
+ goto error;
+ markers[markers_cnt].is_start = 1;
+ markers[markers_cnt].start_address = start;
+ markers[markers_cnt].size = end - start;
+ markers[markers_cnt].name = name;
+ markers_cnt++;
+ markers[markers_cnt].is_start = 0;
+ markers[markers_cnt].start_address = end;
+ markers[markers_cnt].size = end - start;
+ markers[markers_cnt].name = name;
+ markers_cnt++;
+ return 0;
+error:
+ markers_cnt = 0;
+ return -ENOMEM;
}
static int pt_dump_init(void)
@@ -282,34 +280,48 @@ static int pt_dump_init(void)
#ifdef CONFIG_KFENCE
unsigned long kfence_start = (unsigned long)__kfence_pool;
#endif
+ unsigned long lowcore = (unsigned long)get_lowcore();
+ int rc;
+
/*
* Figure out the maximum virtual address being accessible with the
* kernel ASCE. We need this to keep the page table walker functions
* from accessing non-existent entries.
*/
- max_addr = (S390_lowcore.kernel_asce.val & _REGION_ENTRY_TYPE_MASK) >> 2;
+ max_addr = (get_lowcore()->kernel_asce.val & _REGION_ENTRY_TYPE_MASK) >> 2;
max_addr = 1UL << (max_addr * 11 + 31);
- address_markers[IDENTITY_AFTER_END_NR].start_address = ident_map_size;
- address_markers[AMODE31_START_NR].start_address = (unsigned long)__samode31;
- address_markers[AMODE31_END_NR].start_address = (unsigned long)__eamode31;
- address_markers[MODULES_NR].start_address = MODULES_VADDR;
- address_markers[MODULES_END_NR].start_address = MODULES_END;
- address_markers[ABS_LOWCORE_NR].start_address = __abs_lowcore;
- address_markers[ABS_LOWCORE_END_NR].start_address = __abs_lowcore + ABS_LOWCORE_MAP_SIZE;
- address_markers[MEMCPY_REAL_NR].start_address = __memcpy_real_area;
- address_markers[MEMCPY_REAL_END_NR].start_address = __memcpy_real_area + MEMCPY_REAL_SIZE;
- address_markers[VMEMMAP_NR].start_address = (unsigned long) vmemmap;
- address_markers[VMEMMAP_END_NR].start_address = (unsigned long)vmemmap + vmemmap_size;
- address_markers[VMALLOC_NR].start_address = VMALLOC_START;
- address_markers[VMALLOC_END_NR].start_address = VMALLOC_END;
+ /* start + end markers - must be added first */
+ rc = add_marker(0, -1UL, NULL);
+ rc |= add_marker((unsigned long)_stext, (unsigned long)_end, "Kernel Image");
+ rc |= add_marker(lowcore, lowcore + sizeof(struct lowcore), "Lowcore");
+ rc |= add_marker(__identity_base, __identity_base + ident_map_size, "Identity Mapping");
+ rc |= add_marker((unsigned long)__samode31, (unsigned long)__eamode31, "Amode31 Area");
+ rc |= add_marker(MODULES_VADDR, MODULES_END, "Modules Area");
+ rc |= add_marker(__abs_lowcore, __abs_lowcore + ABS_LOWCORE_MAP_SIZE, "Lowcore Area");
+ rc |= add_marker(__memcpy_real_area, __memcpy_real_area + MEMCPY_REAL_SIZE, "Real Memory Copy Area");
+ rc |= add_marker((unsigned long)vmemmap, (unsigned long)vmemmap + vmemmap_size, "vmemmap Area");
+ rc |= add_marker(VMALLOC_START, VMALLOC_END, "vmalloc Area");
#ifdef CONFIG_KFENCE
- address_markers[KFENCE_START_NR].start_address = kfence_start;
- address_markers[KFENCE_END_NR].start_address = kfence_start + KFENCE_POOL_SIZE;
+ rc |= add_marker(kfence_start, kfence_start + KFENCE_POOL_SIZE, "KFence Pool");
+#endif
+#ifdef CONFIG_KMSAN
+ rc |= add_marker(KMSAN_VMALLOC_SHADOW_START, KMSAN_VMALLOC_SHADOW_END, "Kmsan vmalloc Shadow");
+ rc |= add_marker(KMSAN_VMALLOC_ORIGIN_START, KMSAN_VMALLOC_ORIGIN_END, "Kmsan vmalloc Origins");
+ rc |= add_marker(KMSAN_MODULES_SHADOW_START, KMSAN_MODULES_SHADOW_END, "Kmsan Modules Shadow");
+ rc |= add_marker(KMSAN_MODULES_ORIGIN_START, KMSAN_MODULES_ORIGIN_END, "Kmsan Modules Origins");
+#endif
+#ifdef CONFIG_KASAN
+ rc |= add_marker(KASAN_SHADOW_START, KASAN_SHADOW_END, "Kasan Shadow");
#endif
- sort_address_markers();
+ if (rc)
+ goto error;
+ sort(&markers[1], markers_cnt - 1, sizeof(*markers), ptdump_cmp, NULL);
#ifdef CONFIG_PTDUMP_DEBUGFS
debugfs_create_file("kernel_page_tables", 0400, NULL, NULL, &ptdump_fops);
#endif /* CONFIG_PTDUMP_DEBUGFS */
return 0;
+error:
+ kvfree(markers);
+ return -ENOMEM;
}
device_initcall(pt_dump_init);
diff --git a/arch/s390/mm/extable.c b/arch/s390/mm/extable.c
index 0a0738a473af..a046be1715cf 100644
--- a/arch/s390/mm/extable.c
+++ b/arch/s390/mm/extable.c
@@ -7,6 +7,7 @@
#include <linux/panic.h>
#include <asm/asm-extable.h>
#include <asm/extable.h>
+#include <asm/fpu.h>
const struct exception_table_entry *s390_search_extables(unsigned long addr)
{
@@ -26,7 +27,7 @@ static bool ex_handler_fixup(const struct exception_table_entry *ex, struct pt_r
return true;
}
-static bool ex_handler_ua_store(const struct exception_table_entry *ex, struct pt_regs *regs)
+static bool ex_handler_ua_fault(const struct exception_table_entry *ex, struct pt_regs *regs)
{
unsigned int reg_err = FIELD_GET(EX_DATA_REG_ERR, ex->data);
@@ -35,18 +36,6 @@ static bool ex_handler_ua_store(const struct exception_table_entry *ex, struct p
return true;
}
-static bool ex_handler_ua_load_mem(const struct exception_table_entry *ex, struct pt_regs *regs)
-{
- unsigned int reg_addr = FIELD_GET(EX_DATA_REG_ADDR, ex->data);
- unsigned int reg_err = FIELD_GET(EX_DATA_REG_ERR, ex->data);
- size_t len = FIELD_GET(EX_DATA_LEN, ex->data);
-
- regs->gprs[reg_err] = -EFAULT;
- memset((void *)regs->gprs[reg_addr], 0, len);
- regs->psw.addr = extable_fixup(ex);
- return true;
-}
-
static bool ex_handler_ua_load_reg(const struct exception_table_entry *ex,
bool pair, struct pt_regs *regs)
{
@@ -77,6 +66,13 @@ static bool ex_handler_zeropad(const struct exception_table_entry *ex, struct pt
return true;
}
+static bool ex_handler_fpc(const struct exception_table_entry *ex, struct pt_regs *regs)
+{
+ fpu_sfpc(0);
+ regs->psw.addr = extable_fixup(ex);
+ return true;
+}
+
bool fixup_exception(struct pt_regs *regs)
{
const struct exception_table_entry *ex;
@@ -89,16 +85,16 @@ bool fixup_exception(struct pt_regs *regs)
return ex_handler_fixup(ex, regs);
case EX_TYPE_BPF:
return ex_handler_bpf(ex, regs);
- case EX_TYPE_UA_STORE:
- return ex_handler_ua_store(ex, regs);
- case EX_TYPE_UA_LOAD_MEM:
- return ex_handler_ua_load_mem(ex, regs);
+ case EX_TYPE_UA_FAULT:
+ return ex_handler_ua_fault(ex, regs);
case EX_TYPE_UA_LOAD_REG:
return ex_handler_ua_load_reg(ex, false, regs);
case EX_TYPE_UA_LOAD_REGPAIR:
return ex_handler_ua_load_reg(ex, true, regs);
case EX_TYPE_ZEROPAD:
return ex_handler_zeropad(ex, regs);
+ case EX_TYPE_FPC:
+ return ex_handler_fpc(ex, regs);
}
panic("invalid exception table entry");
}
diff --git a/arch/s390/mm/extmem.c b/arch/s390/mm/extmem.c
index e41869f5cc95..4692136c0af1 100644
--- a/arch/s390/mm/extmem.c
+++ b/arch/s390/mm/extmem.c
@@ -28,6 +28,7 @@
#include <asm/extmem.h>
#include <asm/cpcmd.h>
#include <asm/setup.h>
+#include <asm/asm.h>
#define DCSS_PURGESEG 0x08
#define DCSS_LOADSHRX 0x20
@@ -134,20 +135,21 @@ dcss_diag(int *func, void *parameter,
unsigned long *ret1, unsigned long *ret2)
{
unsigned long rx, ry;
- int rc;
+ int cc;
- rx = (unsigned long) parameter;
+ rx = virt_to_phys(parameter);
ry = (unsigned long) *func;
diag_stat_inc(DIAG_STAT_X064);
asm volatile(
- " diag %0,%1,0x64\n"
- " ipm %2\n"
- " srl %2,28\n"
- : "+d" (rx), "+d" (ry), "=d" (rc) : : "cc");
+ " diag %[rx],%[ry],0x64\n"
+ CC_IPM(cc)
+ : CC_OUT(cc, cc), [rx] "+d" (rx), [ry] "+d" (ry)
+ :
+ : CC_CLOBBER);
*ret1 = rx;
*ret2 = ry;
- return rc;
+ return CC_TRANSFORM(cc);
}
static inline int
@@ -178,7 +180,7 @@ query_segment_type (struct dcss_segment *seg)
/* initialize diag input parameters */
qin->qopcode = DCSS_FINDSEGA;
- qin->qoutptr = (unsigned long) qout;
+ qin->qoutptr = virt_to_phys(qout);
qin->qoutlen = sizeof(struct qout64);
memcpy (qin->qname, seg->dcss_name, 8);
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index ac4c78546d97..9b681f74dccc 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -34,6 +34,7 @@
#include <linux/uaccess.h>
#include <linux/hugetlb.h>
#include <linux/kfence.h>
+#include <linux/pagewalk.h>
#include <asm/asm-extable.h>
#include <asm/asm-offsets.h>
#include <asm/ptrace.h>
@@ -45,12 +46,6 @@
#include <asm/uv.h>
#include "../kernel/entry.h"
-enum fault_type {
- KERNEL_FAULT,
- USER_FAULT,
- GMAP_FAULT,
-};
-
static DEFINE_STATIC_KEY_FALSE(have_store_indication);
static int __init fault_init(void)
@@ -64,26 +59,15 @@ early_initcall(fault_init);
/*
* Find out which address space caused the exception.
*/
-static enum fault_type get_fault_type(struct pt_regs *regs)
+static bool is_kernel_fault(struct pt_regs *regs)
{
union teid teid = { .val = regs->int_parm_long };
- if (likely(teid.as == PSW_BITS_AS_PRIMARY)) {
- if (user_mode(regs))
- return USER_FAULT;
- if (!IS_ENABLED(CONFIG_PGSTE))
- return KERNEL_FAULT;
- if (test_pt_regs_flag(regs, PIF_GUEST_FAULT))
- return GMAP_FAULT;
- return KERNEL_FAULT;
- }
+ if (user_mode(regs))
+ return false;
if (teid.as == PSW_BITS_AS_SECONDARY)
- return USER_FAULT;
- /* Access register mode, not used in the kernel */
- if (teid.as == PSW_BITS_AS_ACCREG)
- return USER_FAULT;
- /* Home space -> access via kernel ASCE */
- return KERNEL_FAULT;
+ return false;
+ return true;
}
static unsigned long get_fault_address(struct pt_regs *regs)
@@ -144,7 +128,7 @@ static void dump_pagetable(unsigned long asce, unsigned long address)
goto out;
table = __va(entry & _SEGMENT_ENTRY_ORIGIN);
}
- table += (address & _PAGE_INDEX) >> _PAGE_SHIFT;
+ table += (address & _PAGE_INDEX) >> PAGE_SHIFT;
if (get_kernel_nofault(entry, table))
goto bad;
pr_cont("P:%016lx ", entry);
@@ -178,21 +162,12 @@ static void dump_fault_info(struct pt_regs *regs)
break;
}
pr_cont("mode while using ");
- switch (get_fault_type(regs)) {
- case USER_FAULT:
- asce = S390_lowcore.user_asce.val;
- pr_cont("user ");
- break;
- case GMAP_FAULT:
- asce = ((struct gmap *)S390_lowcore.gmap)->asce;
- pr_cont("gmap ");
- break;
- case KERNEL_FAULT:
- asce = S390_lowcore.kernel_asce.val;
+ if (is_kernel_fault(regs)) {
+ asce = get_lowcore()->kernel_asce.val;
pr_cont("kernel ");
- break;
- default:
- unreachable();
+ } else {
+ asce = get_lowcore()->user_asce.val;
+ pr_cont("user ");
}
pr_cont("ASCE.\n");
dump_pagetable(asce, get_fault_address(regs));
@@ -227,7 +202,6 @@ static void do_sigsegv(struct pt_regs *regs, int si_code)
static void handle_fault_error_nolock(struct pt_regs *regs, int si_code)
{
- enum fault_type fault_type;
unsigned long address;
bool is_write;
@@ -238,17 +212,15 @@ static void handle_fault_error_nolock(struct pt_regs *regs, int si_code)
}
if (fixup_exception(regs))
return;
- fault_type = get_fault_type(regs);
- if (fault_type == KERNEL_FAULT) {
+ if (is_kernel_fault(regs)) {
address = get_fault_address(regs);
is_write = fault_is_write(regs);
if (kfence_handle_page_fault(address, is_write, regs))
return;
- }
- if (fault_type == KERNEL_FAULT)
pr_alert("Unable to handle kernel pointer dereference in virtual kernel address space\n");
- else
+ } else {
pr_alert("Unable to handle kernel paging request in virtual user address space\n");
+ }
dump_fault_info(regs);
die(regs, "Oops");
}
@@ -282,9 +254,7 @@ static void do_exception(struct pt_regs *regs, int access)
struct vm_area_struct *vma;
unsigned long address;
struct mm_struct *mm;
- enum fault_type type;
unsigned int flags;
- struct gmap *gmap;
vm_fault_t fault;
bool is_write;
@@ -298,16 +268,8 @@ static void do_exception(struct pt_regs *regs, int access)
mm = current->mm;
address = get_fault_address(regs);
is_write = fault_is_write(regs);
- type = get_fault_type(regs);
- switch (type) {
- case KERNEL_FAULT:
+ if (is_kernel_fault(regs) || faulthandler_disabled() || !mm)
return handle_fault_error_nolock(regs, 0);
- case USER_FAULT:
- case GMAP_FAULT:
- if (faulthandler_disabled() || !mm)
- return handle_fault_error_nolock(regs, 0);
- break;
- }
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
flags = FAULT_FLAG_DEFAULT;
if (user_mode(regs))
@@ -323,21 +285,19 @@ static void do_exception(struct pt_regs *regs, int access)
goto lock_mmap;
if (!(vma->vm_flags & access)) {
vma_end_read(vma);
- goto lock_mmap;
+ count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
+ return handle_fault_error_nolock(regs, SEGV_ACCERR);
}
fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
vma_end_read(vma);
if (!(fault & VM_FAULT_RETRY)) {
count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
- if (unlikely(fault & VM_FAULT_ERROR))
- goto error;
- return;
+ goto done;
}
count_vm_vma_lock_event(VMA_LOCK_RETRY);
if (fault & VM_FAULT_MAJOR)
flags |= FAULT_FLAG_TRIED;
-
/* Quick path to respond to signals */
if (fault_signal_pending(fault, regs)) {
if (!user_mode(regs))
@@ -345,81 +305,29 @@ static void do_exception(struct pt_regs *regs, int access)
return;
}
lock_mmap:
- mmap_read_lock(mm);
- gmap = NULL;
- if (IS_ENABLED(CONFIG_PGSTE) && type == GMAP_FAULT) {
- gmap = (struct gmap *)S390_lowcore.gmap;
- current->thread.gmap_addr = address;
- current->thread.gmap_write_flag = !!(flags & FAULT_FLAG_WRITE);
- current->thread.gmap_int_code = regs->int_code & 0xffff;
- address = __gmap_translate(gmap, address);
- if (address == -EFAULT)
- return handle_fault_error(regs, SEGV_MAPERR);
- if (gmap->pfault_enabled)
- flags |= FAULT_FLAG_RETRY_NOWAIT;
- }
retry:
- vma = find_vma(mm, address);
+ vma = lock_mm_and_find_vma(mm, address, regs);
if (!vma)
- return handle_fault_error(regs, SEGV_MAPERR);
- if (unlikely(vma->vm_start > address)) {
- if (!(vma->vm_flags & VM_GROWSDOWN))
- return handle_fault_error(regs, SEGV_MAPERR);
- vma = expand_stack(mm, address);
- if (!vma)
- return handle_fault_error_nolock(regs, SEGV_MAPERR);
- }
+ return handle_fault_error_nolock(regs, SEGV_MAPERR);
if (unlikely(!(vma->vm_flags & access)))
return handle_fault_error(regs, SEGV_ACCERR);
fault = handle_mm_fault(vma, address, flags, regs);
if (fault_signal_pending(fault, regs)) {
- if (flags & FAULT_FLAG_RETRY_NOWAIT)
- mmap_read_unlock(mm);
if (!user_mode(regs))
handle_fault_error_nolock(regs, 0);
return;
}
/* The fault is fully completed (including releasing mmap lock) */
- if (fault & VM_FAULT_COMPLETED) {
- if (gmap) {
- mmap_read_lock(mm);
- goto gmap;
- }
+ if (fault & VM_FAULT_COMPLETED)
return;
- }
- if (unlikely(fault & VM_FAULT_ERROR)) {
- mmap_read_unlock(mm);
- goto error;
- }
if (fault & VM_FAULT_RETRY) {
- if (IS_ENABLED(CONFIG_PGSTE) && gmap && (flags & FAULT_FLAG_RETRY_NOWAIT)) {
- /*
- * FAULT_FLAG_RETRY_NOWAIT has been set,
- * mmap_lock has not been released
- */
- current->thread.gmap_pfault = 1;
- return handle_fault_error(regs, 0);
- }
- flags &= ~FAULT_FLAG_RETRY_NOWAIT;
flags |= FAULT_FLAG_TRIED;
- mmap_read_lock(mm);
goto retry;
}
-gmap:
- if (IS_ENABLED(CONFIG_PGSTE) && gmap) {
- address = __gmap_link(gmap, current->thread.gmap_addr,
- address);
- if (address == -EFAULT)
- return handle_fault_error(regs, SEGV_MAPERR);
- if (address == -ENOMEM) {
- fault = VM_FAULT_OOM;
- mmap_read_unlock(mm);
- goto error;
- }
- }
mmap_read_unlock(mm);
- return;
-error:
+done:
+ if (!(fault & VM_FAULT_ERROR))
+ return;
if (fault & VM_FAULT_OOM) {
if (!user_mode(regs))
handle_fault_error_nolock(regs, 0);
@@ -430,12 +338,14 @@ error:
handle_fault_error_nolock(regs, 0);
else
do_sigsegv(regs, SEGV_MAPERR);
- } else if (fault & VM_FAULT_SIGBUS) {
+ } else if (fault & (VM_FAULT_SIGBUS | VM_FAULT_HWPOISON |
+ VM_FAULT_HWPOISON_LARGE)) {
if (!user_mode(regs))
handle_fault_error_nolock(regs, 0);
else
do_sigbus(regs);
} else {
+ pr_emerg("Unexpected fault flags: %08x\n", fault);
BUG();
}
}
@@ -488,9 +398,9 @@ void do_secure_storage_access(struct pt_regs *regs)
union teid teid = { .val = regs->int_parm_long };
unsigned long addr = get_fault_address(regs);
struct vm_area_struct *vma;
+ struct folio_walk fw;
struct mm_struct *mm;
- struct page *page;
- struct gmap *gmap;
+ struct folio *folio;
int rc;
/*
@@ -515,81 +425,35 @@ void do_secure_storage_access(struct pt_regs *regs)
*/
panic("Unexpected PGM 0x3d with TEID bit 61=0");
}
- switch (get_fault_type(regs)) {
- case GMAP_FAULT:
- mm = current->mm;
- gmap = (struct gmap *)S390_lowcore.gmap;
- mmap_read_lock(mm);
- addr = __gmap_translate(gmap, addr);
- mmap_read_unlock(mm);
- if (IS_ERR_VALUE(addr))
- return handle_fault_error_nolock(regs, SEGV_MAPERR);
- fallthrough;
- case USER_FAULT:
+ if (is_kernel_fault(regs)) {
+ folio = phys_to_folio(addr);
+ if (unlikely(!folio_try_get(folio)))
+ return;
+ rc = arch_make_folio_accessible(folio);
+ folio_put(folio);
+ if (rc)
+ BUG();
+ } else {
mm = current->mm;
mmap_read_lock(mm);
vma = find_vma(mm, addr);
if (!vma)
return handle_fault_error(regs, SEGV_MAPERR);
- page = follow_page(vma, addr, FOLL_WRITE | FOLL_GET);
- if (IS_ERR_OR_NULL(page)) {
+ folio = folio_walk_start(&fw, vma, addr, 0);
+ if (!folio) {
mmap_read_unlock(mm);
- break;
+ return;
}
- if (arch_make_page_accessible(page))
+ /* arch_make_folio_accessible() needs a raised refcount. */
+ folio_get(folio);
+ rc = arch_make_folio_accessible(folio);
+ folio_put(folio);
+ folio_walk_end(&fw, vma);
+ if (rc)
send_sig(SIGSEGV, current, 0);
- put_page(page);
mmap_read_unlock(mm);
- break;
- case KERNEL_FAULT:
- page = phys_to_page(addr);
- if (unlikely(!try_get_page(page)))
- break;
- rc = arch_make_page_accessible(page);
- put_page(page);
- if (rc)
- BUG();
- break;
- default:
- unreachable();
}
}
NOKPROBE_SYMBOL(do_secure_storage_access);
-void do_non_secure_storage_access(struct pt_regs *regs)
-{
- struct gmap *gmap = (struct gmap *)S390_lowcore.gmap;
- unsigned long gaddr = get_fault_address(regs);
-
- if (WARN_ON_ONCE(get_fault_type(regs) != GMAP_FAULT))
- return handle_fault_error_nolock(regs, SEGV_MAPERR);
- if (gmap_convert_to_secure(gmap, gaddr) == -EINVAL)
- send_sig(SIGSEGV, current, 0);
-}
-NOKPROBE_SYMBOL(do_non_secure_storage_access);
-
-void do_secure_storage_violation(struct pt_regs *regs)
-{
- struct gmap *gmap = (struct gmap *)S390_lowcore.gmap;
- unsigned long gaddr = get_fault_address(regs);
-
- /*
- * If the VM has been rebooted, its address space might still contain
- * secure pages from the previous boot.
- * Clear the page so it can be reused.
- */
- if (!gmap_destroy_page(gmap, gaddr))
- return;
- /*
- * Either KVM messed up the secure guest mapping or the same
- * page is mapped into multiple secure guests.
- *
- * This exception is only triggered when a guest 2 is running
- * and can therefore never occur in kernel context.
- */
- pr_warn_ratelimited("Secure storage violation in task: %s, pid %d\n",
- current->comm, current->pid);
- send_sig(SIGSEGV, current, 0);
-}
-
#endif /* CONFIG_PGSTE */
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 8da39deb56ca..d14b488e7a1f 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -24,6 +24,16 @@
#include <asm/page.h>
#include <asm/tlb.h>
+/*
+ * The address is saved in a radix tree directly; NULL would be ambiguous,
+ * since 0 is a valid address, and NULL is returned when nothing was found.
+ * The lower bits are ignored by all users of the macro, so it can be used
+ * to distinguish a valid address 0 from a NULL.
+ */
+#define VALID_GADDR_FLAG 1
+#define IS_GADDR_VALID(gaddr) ((gaddr) & VALID_GADDR_FLAG)
+#define MAKE_VALID_GADDR(gaddr) (((gaddr) & HPAGE_MASK) | VALID_GADDR_FLAG)
+
#define GMAP_SHADOW_FAKE_TABLE 1ULL
static struct page *gmap_alloc_crst(void)
@@ -43,7 +53,7 @@ static struct page *gmap_alloc_crst(void)
*
* Returns a guest address space structure.
*/
-static struct gmap *gmap_alloc(unsigned long limit)
+struct gmap *gmap_alloc(unsigned long limit)
{
struct gmap *gmap;
struct page *page;
@@ -70,9 +80,7 @@ static struct gmap *gmap_alloc(unsigned long limit)
gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL_ACCOUNT);
if (!gmap)
goto out;
- INIT_LIST_HEAD(&gmap->crst_list);
INIT_LIST_HEAD(&gmap->children);
- INIT_LIST_HEAD(&gmap->pt_list);
INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL_ACCOUNT);
INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC | __GFP_ACCOUNT);
INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC | __GFP_ACCOUNT);
@@ -82,8 +90,6 @@ static struct gmap *gmap_alloc(unsigned long limit)
page = gmap_alloc_crst();
if (!page)
goto out_free;
- page->index = 0;
- list_add(&page->lru, &gmap->crst_list);
table = page_to_virt(page);
crst_table_init(table, etype);
gmap->table = table;
@@ -97,6 +103,7 @@ out_free:
out:
return NULL;
}
+EXPORT_SYMBOL_GPL(gmap_alloc);
/**
* gmap_create - create a guest address space
@@ -185,30 +192,46 @@ static void gmap_rmap_radix_tree_free(struct radix_tree_root *root)
} while (nr > 0);
}
+static void gmap_free_crst(unsigned long *table, bool free_ptes)
+{
+ bool is_segment = (table[0] & _SEGMENT_ENTRY_TYPE_MASK) == 0;
+ int i;
+
+ if (is_segment) {
+ if (!free_ptes)
+ goto out;
+ for (i = 0; i < _CRST_ENTRIES; i++)
+ if (!(table[i] & _SEGMENT_ENTRY_INVALID))
+ page_table_free_pgste(page_ptdesc(phys_to_page(table[i])));
+ } else {
+ for (i = 0; i < _CRST_ENTRIES; i++)
+ if (!(table[i] & _REGION_ENTRY_INVALID))
+ gmap_free_crst(__va(table[i] & PAGE_MASK), free_ptes);
+ }
+
+out:
+ free_pages((unsigned long)table, CRST_ALLOC_ORDER);
+}
+
/**
* gmap_free - free a guest address space
* @gmap: pointer to the guest address space structure
*
* No locks required. There are no references to this gmap anymore.
*/
-static void gmap_free(struct gmap *gmap)
+void gmap_free(struct gmap *gmap)
{
- struct page *page, *next;
-
/* Flush tlb of all gmaps (if not already done for shadows) */
if (!(gmap_is_shadow(gmap) && gmap->removed))
gmap_flush_tlb(gmap);
/* Free all segment & region tables. */
- list_for_each_entry_safe(page, next, &gmap->crst_list, lru)
- __free_pages(page, CRST_ALLOC_ORDER);
+ gmap_free_crst(gmap->table, gmap_is_shadow(gmap));
+
gmap_radix_tree_free(&gmap->guest_to_host);
gmap_radix_tree_free(&gmap->host_to_guest);
/* Free additional data for a shadow gmap */
if (gmap_is_shadow(gmap)) {
- /* Free all page tables. */
- list_for_each_entry_safe(page, next, &gmap->pt_list, lru)
- page_table_free_pgste(page);
gmap_rmap_radix_tree_free(&gmap->host_to_rmap);
/* Release reference to the parent */
gmap_put(gmap->parent);
@@ -216,6 +239,7 @@ static void gmap_free(struct gmap *gmap)
kfree(gmap);
}
+EXPORT_SYMBOL_GPL(gmap_free);
/**
* gmap_get - increase reference counter for guest address space
@@ -279,37 +303,6 @@ void gmap_remove(struct gmap *gmap)
}
EXPORT_SYMBOL_GPL(gmap_remove);
-/**
- * gmap_enable - switch primary space to the guest address space
- * @gmap: pointer to the guest address space structure
- */
-void gmap_enable(struct gmap *gmap)
-{
- S390_lowcore.gmap = (unsigned long) gmap;
-}
-EXPORT_SYMBOL_GPL(gmap_enable);
-
-/**
- * gmap_disable - switch back to the standard primary address space
- * @gmap: pointer to the guest address space structure
- */
-void gmap_disable(struct gmap *gmap)
-{
- S390_lowcore.gmap = 0UL;
-}
-EXPORT_SYMBOL_GPL(gmap_disable);
-
-/**
- * gmap_get_enabled - get a pointer to the currently enabled gmap
- *
- * Returns a pointer to the currently enabled gmap. 0 if none is enabled.
- */
-struct gmap *gmap_get_enabled(void)
-{
- return (struct gmap *) S390_lowcore.gmap;
-}
-EXPORT_SYMBOL_GPL(gmap_get_enabled);
-
/*
* gmap_alloc_table is assumed to be called with mmap_lock held
*/
@@ -327,10 +320,8 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
crst_table_init(new, init);
spin_lock(&gmap->guest_table_lock);
if (*table & _REGION_ENTRY_INVALID) {
- list_add(&page->lru, &gmap->crst_list);
*table = __pa(new) | _REGION_ENTRY_LENGTH |
(*table & _REGION_ENTRY_TYPE_MASK);
- page->index = gaddr;
page = NULL;
}
spin_unlock(&gmap->guest_table_lock);
@@ -339,21 +330,23 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
return 0;
}
-/**
- * __gmap_segment_gaddr - find virtual address from segment pointer
- * @entry: pointer to a segment table entry in the guest address space
- *
- * Returns the virtual address in the guest address space for the segment
- */
-static unsigned long __gmap_segment_gaddr(unsigned long *entry)
+static unsigned long host_to_guest_lookup(struct gmap *gmap, unsigned long vmaddr)
{
- struct page *page;
- unsigned long offset;
+ return (unsigned long)radix_tree_lookup(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
+}
- offset = (unsigned long) entry / sizeof(unsigned long);
- offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE;
- page = pmd_pgtable_page((pmd_t *) entry);
- return page->index + offset;
+static unsigned long host_to_guest_delete(struct gmap *gmap, unsigned long vmaddr)
+{
+ return (unsigned long)radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
+}
+
+static pmd_t *host_to_guest_pmd_delete(struct gmap *gmap, unsigned long vmaddr,
+ unsigned long *gaddr)
+{
+ *gaddr = host_to_guest_delete(gmap, vmaddr);
+ if (IS_GADDR_VALID(*gaddr))
+ return (pmd_t *)gmap_table_walk(gmap, *gaddr, 1);
+ return NULL;
}
/**
@@ -365,16 +358,19 @@ static unsigned long __gmap_segment_gaddr(unsigned long *entry)
*/
static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr)
{
- unsigned long *entry;
+ unsigned long gaddr;
int flush = 0;
+ pmd_t *pmdp;
BUG_ON(gmap_is_shadow(gmap));
spin_lock(&gmap->guest_table_lock);
- entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
- if (entry) {
- flush = (*entry != _SEGMENT_ENTRY_EMPTY);
- *entry = _SEGMENT_ENTRY_EMPTY;
+
+ pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr);
+ if (pmdp) {
+ flush = (pmd_val(*pmdp) != _SEGMENT_ENTRY_EMPTY);
+ *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY);
}
+
spin_unlock(&gmap->guest_table_lock);
return flush;
}
@@ -493,26 +489,6 @@ unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
EXPORT_SYMBOL_GPL(__gmap_translate);
/**
- * gmap_translate - translate a guest address to a user space address
- * @gmap: pointer to guest mapping meta data structure
- * @gaddr: guest address
- *
- * Returns user space address which corresponds to the guest address or
- * -EFAULT if no such mapping exists.
- * This function does not establish potentially missing page table entries.
- */
-unsigned long gmap_translate(struct gmap *gmap, unsigned long gaddr)
-{
- unsigned long rc;
-
- mmap_read_lock(gmap->mm);
- rc = __gmap_translate(gmap, gaddr);
- mmap_read_unlock(gmap->mm);
- return rc;
-}
-EXPORT_SYMBOL_GPL(gmap_translate);
-
-/**
* gmap_unlink - disconnect a page table from the gmap shadow tables
* @mm: pointer to the parent mm_struct
* @table: pointer to the host page table
@@ -596,12 +572,12 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
pud = pud_offset(p4d, vmaddr);
VM_BUG_ON(pud_none(*pud));
/* large puds cannot yet be handled */
- if (pud_large(*pud))
+ if (pud_leaf(*pud))
return -EFAULT;
pmd = pmd_offset(pud, vmaddr);
VM_BUG_ON(pmd_none(*pmd));
/* Are we allowed to use huge pages? */
- if (pmd_large(*pmd) && !gmap->mm->context.allow_gmap_hpage_1m)
+ if (pmd_leaf(*pmd) && !gmap->mm->context.allow_gmap_hpage_1m)
return -EFAULT;
/* Link gmap segment table entry location to page table. */
rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
@@ -611,12 +587,14 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
spin_lock(&gmap->guest_table_lock);
if (*table == _SEGMENT_ENTRY_EMPTY) {
rc = radix_tree_insert(&gmap->host_to_guest,
- vmaddr >> PMD_SHIFT, table);
+ vmaddr >> PMD_SHIFT,
+ (void *)MAKE_VALID_GADDR(gaddr));
if (!rc) {
- if (pmd_large(*pmd)) {
+ if (pmd_leaf(*pmd)) {
*table = (pmd_val(*pmd) &
_SEGMENT_ENTRY_HARDWARE_BITS_LARGE)
- | _SEGMENT_ENTRY_GMAP_UC;
+ | _SEGMENT_ENTRY_GMAP_UC
+ | _SEGMENT_ENTRY;
} else
*table = pmd_val(*pmd) &
_SEGMENT_ENTRY_HARDWARE_BITS;
@@ -633,50 +611,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
radix_tree_preload_end();
return rc;
}
-
-/**
- * gmap_fault - resolve a fault on a guest address
- * @gmap: pointer to guest mapping meta data structure
- * @gaddr: guest address
- * @fault_flags: flags to pass down to handle_mm_fault()
- *
- * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
- * if the vm address is already mapped to a different guest segment.
- */
-int gmap_fault(struct gmap *gmap, unsigned long gaddr,
- unsigned int fault_flags)
-{
- unsigned long vmaddr;
- int rc;
- bool unlocked;
-
- mmap_read_lock(gmap->mm);
-
-retry:
- unlocked = false;
- vmaddr = __gmap_translate(gmap, gaddr);
- if (IS_ERR_VALUE(vmaddr)) {
- rc = vmaddr;
- goto out_up;
- }
- if (fixup_user_fault(gmap->mm, vmaddr, fault_flags,
- &unlocked)) {
- rc = -EFAULT;
- goto out_up;
- }
- /*
- * In the case that fixup_user_fault unlocked the mmap_lock during
- * faultin redo __gmap_translate to not race with a map/unmap_segment.
- */
- if (unlocked)
- goto retry;
-
- rc = __gmap_link(gmap, gaddr, vmaddr);
-out_up:
- mmap_read_unlock(gmap->mm);
- return rc;
-}
-EXPORT_SYMBOL_GPL(gmap_fault);
+EXPORT_SYMBOL(__gmap_link);
/*
* this function is assumed to be called with mmap_lock held
@@ -801,8 +736,7 @@ static void gmap_call_notifier(struct gmap *gmap, unsigned long start,
*
* Note: Can also be called for shadow gmaps.
*/
-static inline unsigned long *gmap_table_walk(struct gmap *gmap,
- unsigned long gaddr, int level)
+unsigned long *gmap_table_walk(struct gmap *gmap, unsigned long gaddr, int level)
{
const int asce_type = gmap->asce & _ASCE_TYPE_MASK;
unsigned long *table = gmap->table;
@@ -849,10 +783,11 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap,
if (*table & _REGION_ENTRY_INVALID)
return NULL;
table = __va(*table & _SEGMENT_ENTRY_ORIGIN);
- table += (gaddr & _PAGE_INDEX) >> _PAGE_SHIFT;
+ table += (gaddr & _PAGE_INDEX) >> PAGE_SHIFT;
}
return table;
}
+EXPORT_SYMBOL(gmap_table_walk);
/**
* gmap_pte_op_walk - walk the gmap page table, get the page table lock
@@ -943,7 +878,7 @@ static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr)
}
/* 4k page table entries are locked via the pte (pte_alloc_map_lock). */
- if (!pmd_large(*pmdp))
+ if (!pmd_leaf(*pmdp))
spin_unlock(&gmap->guest_table_lock);
return pmdp;
}
@@ -955,7 +890,7 @@ static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr)
*/
static inline void gmap_pmd_op_end(struct gmap *gmap, pmd_t *pmdp)
{
- if (pmd_large(*pmdp))
+ if (pmd_leaf(*pmdp))
spin_unlock(&gmap->guest_table_lock);
}
@@ -1049,86 +984,40 @@ static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr,
* @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
* @bits: pgste notification bits to set
*
- * Returns 0 if successfully protected, -ENOMEM if out of memory and
- * -EFAULT if gaddr is invalid (or mapping for shadows is missing).
+ * Returns:
+ * PAGE_SIZE if a small page was successfully protected;
+ * HPAGE_SIZE if a large page was successfully protected;
+ * -ENOMEM if out of memory;
+ * -EFAULT if gaddr is invalid (or mapping for shadows is missing);
+ * -EAGAIN if the guest mapping is missing and should be fixed by the caller.
*
- * Called with sg->mm->mmap_lock in read.
+ * Context: Called with sg->mm->mmap_lock in read.
*/
-static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr,
- unsigned long len, int prot, unsigned long bits)
+int gmap_protect_one(struct gmap *gmap, unsigned long gaddr, int prot, unsigned long bits)
{
- unsigned long vmaddr, dist;
pmd_t *pmdp;
- int rc;
+ int rc = 0;
BUG_ON(gmap_is_shadow(gmap));
- while (len) {
- rc = -EAGAIN;
- pmdp = gmap_pmd_op_walk(gmap, gaddr);
- if (pmdp) {
- if (!pmd_large(*pmdp)) {
- rc = gmap_protect_pte(gmap, gaddr, pmdp, prot,
- bits);
- if (!rc) {
- len -= PAGE_SIZE;
- gaddr += PAGE_SIZE;
- }
- } else {
- rc = gmap_protect_pmd(gmap, gaddr, pmdp, prot,
- bits);
- if (!rc) {
- dist = HPAGE_SIZE - (gaddr & ~HPAGE_MASK);
- len = len < dist ? 0 : len - dist;
- gaddr = (gaddr & HPAGE_MASK) + HPAGE_SIZE;
- }
- }
- gmap_pmd_op_end(gmap, pmdp);
- }
- if (rc) {
- if (rc == -EINVAL)
- return rc;
- /* -EAGAIN, fixup of userspace mm and gmap */
- vmaddr = __gmap_translate(gmap, gaddr);
- if (IS_ERR_VALUE(vmaddr))
- return vmaddr;
- rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, prot);
- if (rc)
- return rc;
- }
- }
- return 0;
-}
+ pmdp = gmap_pmd_op_walk(gmap, gaddr);
+ if (!pmdp)
+ return -EAGAIN;
-/**
- * gmap_mprotect_notify - change access rights for a range of ptes and
- * call the notifier if any pte changes again
- * @gmap: pointer to guest mapping meta data structure
- * @gaddr: virtual address in the guest address space
- * @len: size of area
- * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
- *
- * Returns 0 if for each page in the given range a gmap mapping exists,
- * the new access rights could be set and the notifier could be armed.
- * If the gmap mapping is missing for one or more pages -EFAULT is
- * returned. If no memory could be allocated -ENOMEM is returned.
- * This function establishes missing page table entries.
- */
-int gmap_mprotect_notify(struct gmap *gmap, unsigned long gaddr,
- unsigned long len, int prot)
-{
- int rc;
+ if (!pmd_leaf(*pmdp)) {
+ rc = gmap_protect_pte(gmap, gaddr, pmdp, prot, bits);
+ if (!rc)
+ rc = PAGE_SIZE;
+ } else {
+ rc = gmap_protect_pmd(gmap, gaddr, pmdp, prot, bits);
+ if (!rc)
+ rc = HPAGE_SIZE;
+ }
+ gmap_pmd_op_end(gmap, pmdp);
- if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK) || gmap_is_shadow(gmap))
- return -EINVAL;
- if (!MACHINE_HAS_ESOP && prot == PROT_READ)
- return -EINVAL;
- mmap_read_lock(gmap->mm);
- rc = gmap_protect_range(gmap, gaddr, len, prot, GMAP_NOTIFY_MPROT);
- mmap_read_unlock(gmap->mm);
return rc;
}
-EXPORT_SYMBOL_GPL(gmap_mprotect_notify);
+EXPORT_SYMBOL_GPL(gmap_protect_one);
/**
* gmap_read_table - get an unsigned long value from a guest page table using
@@ -1315,7 +1204,7 @@ static void gmap_unshadow_page(struct gmap *sg, unsigned long raddr)
table = gmap_table_walk(sg, raddr, 0); /* get page table pointer */
if (!table || *table & _PAGE_INVALID)
return;
- gmap_call_notifier(sg, raddr, raddr + _PAGE_SIZE - 1);
+ gmap_call_notifier(sg, raddr, raddr + PAGE_SIZE - 1);
ptep_unshadow_pte(sg->mm, raddr, (pte_t *) table);
}
@@ -1333,7 +1222,7 @@ static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr,
int i;
BUG_ON(!gmap_is_shadow(sg));
- for (i = 0; i < _PAGE_ENTRIES; i++, raddr += _PAGE_SIZE)
+ for (i = 0; i < _PAGE_ENTRIES; i++, raddr += PAGE_SIZE)
pgt[i] = _PAGE_INVALID;
}
@@ -1348,7 +1237,7 @@ static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr)
{
unsigned long *ste;
phys_addr_t sto, pgt;
- struct page *page;
+ struct ptdesc *ptdesc;
BUG_ON(!gmap_is_shadow(sg));
ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */
@@ -1361,9 +1250,8 @@ static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr)
*ste = _SEGMENT_ENTRY_EMPTY;
__gmap_unshadow_pgt(sg, raddr, __va(pgt));
/* Free page table */
- page = phys_to_page(pgt);
- list_del(&page->lru);
- page_table_free_pgste(page);
+ ptdesc = page_ptdesc(phys_to_page(pgt));
+ page_table_free_pgste(ptdesc);
}
/**
@@ -1377,7 +1265,7 @@ static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr)
static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr,
unsigned long *sgt)
{
- struct page *page;
+ struct ptdesc *ptdesc;
phys_addr_t pgt;
int i;
@@ -1389,9 +1277,8 @@ static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr,
sgt[i] = _SEGMENT_ENTRY_EMPTY;
__gmap_unshadow_pgt(sg, raddr, __va(pgt));
/* Free page table */
- page = phys_to_page(pgt);
- list_del(&page->lru);
- page_table_free_pgste(page);
+ ptdesc = page_ptdesc(phys_to_page(pgt));
+ page_table_free_pgste(ptdesc);
}
}
@@ -1420,7 +1307,6 @@ static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr)
__gmap_unshadow_sgt(sg, raddr, __va(sgt));
/* Free segment table */
page = phys_to_page(sgt);
- list_del(&page->lru);
__free_pages(page, CRST_ALLOC_ORDER);
}
@@ -1448,7 +1334,6 @@ static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr,
__gmap_unshadow_sgt(sg, raddr, __va(sgt));
/* Free segment table */
page = phys_to_page(sgt);
- list_del(&page->lru);
__free_pages(page, CRST_ALLOC_ORDER);
}
}
@@ -1478,7 +1363,6 @@ static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr)
__gmap_unshadow_r3t(sg, raddr, __va(r3t));
/* Free region 3 table */
page = phys_to_page(r3t);
- list_del(&page->lru);
__free_pages(page, CRST_ALLOC_ORDER);
}
@@ -1506,7 +1390,6 @@ static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr,
__gmap_unshadow_r3t(sg, raddr, __va(r3t));
/* Free region 3 table */
page = phys_to_page(r3t);
- list_del(&page->lru);
__free_pages(page, CRST_ALLOC_ORDER);
}
}
@@ -1536,7 +1419,6 @@ static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr)
__gmap_unshadow_r2t(sg, raddr, __va(r2t));
/* Free region 2 table */
page = phys_to_page(r2t);
- list_del(&page->lru);
__free_pages(page, CRST_ALLOC_ORDER);
}
@@ -1568,7 +1450,6 @@ static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr,
r1t[i] = _REGION1_ENTRY_EMPTY;
/* Free region 2 table */
page = phys_to_page(r2t);
- list_del(&page->lru);
__free_pages(page, CRST_ALLOC_ORDER);
}
}
@@ -1579,7 +1460,7 @@ static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr,
*
* Called with sg->guest_table_lock
*/
-static void gmap_unshadow(struct gmap *sg)
+void gmap_unshadow(struct gmap *sg)
{
unsigned long *table;
@@ -1605,143 +1486,7 @@ static void gmap_unshadow(struct gmap *sg)
break;
}
}
-
-/**
- * gmap_find_shadow - find a specific asce in the list of shadow tables
- * @parent: pointer to the parent gmap
- * @asce: ASCE for which the shadow table is created
- * @edat_level: edat level to be used for the shadow translation
- *
- * Returns the pointer to a gmap if a shadow table with the given asce is
- * already available, ERR_PTR(-EAGAIN) if another one is just being created,
- * otherwise NULL
- */
-static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce,
- int edat_level)
-{
- struct gmap *sg;
-
- list_for_each_entry(sg, &parent->children, list) {
- if (sg->orig_asce != asce || sg->edat_level != edat_level ||
- sg->removed)
- continue;
- if (!sg->initialized)
- return ERR_PTR(-EAGAIN);
- refcount_inc(&sg->ref_count);
- return sg;
- }
- return NULL;
-}
-
-/**
- * gmap_shadow_valid - check if a shadow guest address space matches the
- * given properties and is still valid
- * @sg: pointer to the shadow guest address space structure
- * @asce: ASCE for which the shadow table is requested
- * @edat_level: edat level to be used for the shadow translation
- *
- * Returns 1 if the gmap shadow is still valid and matches the given
- * properties, the caller can continue using it. Returns 0 otherwise, the
- * caller has to request a new shadow gmap in this case.
- *
- */
-int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level)
-{
- if (sg->removed)
- return 0;
- return sg->orig_asce == asce && sg->edat_level == edat_level;
-}
-EXPORT_SYMBOL_GPL(gmap_shadow_valid);
-
-/**
- * gmap_shadow - create/find a shadow guest address space
- * @parent: pointer to the parent gmap
- * @asce: ASCE for which the shadow table is created
- * @edat_level: edat level to be used for the shadow translation
- *
- * The pages of the top level page table referred by the asce parameter
- * will be set to read-only and marked in the PGSTEs of the kvm process.
- * The shadow table will be removed automatically on any change to the
- * PTE mapping for the source table.
- *
- * Returns a guest address space structure, ERR_PTR(-ENOMEM) if out of memory,
- * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the
- * parent gmap table could not be protected.
- */
-struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce,
- int edat_level)
-{
- struct gmap *sg, *new;
- unsigned long limit;
- int rc;
-
- BUG_ON(parent->mm->context.allow_gmap_hpage_1m);
- BUG_ON(gmap_is_shadow(parent));
- spin_lock(&parent->shadow_lock);
- sg = gmap_find_shadow(parent, asce, edat_level);
- spin_unlock(&parent->shadow_lock);
- if (sg)
- return sg;
- /* Create a new shadow gmap */
- limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11));
- if (asce & _ASCE_REAL_SPACE)
- limit = -1UL;
- new = gmap_alloc(limit);
- if (!new)
- return ERR_PTR(-ENOMEM);
- new->mm = parent->mm;
- new->parent = gmap_get(parent);
- new->private = parent->private;
- new->orig_asce = asce;
- new->edat_level = edat_level;
- new->initialized = false;
- spin_lock(&parent->shadow_lock);
- /* Recheck if another CPU created the same shadow */
- sg = gmap_find_shadow(parent, asce, edat_level);
- if (sg) {
- spin_unlock(&parent->shadow_lock);
- gmap_free(new);
- return sg;
- }
- if (asce & _ASCE_REAL_SPACE) {
- /* only allow one real-space gmap shadow */
- list_for_each_entry(sg, &parent->children, list) {
- if (sg->orig_asce & _ASCE_REAL_SPACE) {
- spin_lock(&sg->guest_table_lock);
- gmap_unshadow(sg);
- spin_unlock(&sg->guest_table_lock);
- list_del(&sg->list);
- gmap_put(sg);
- break;
- }
- }
- }
- refcount_set(&new->ref_count, 2);
- list_add(&new->list, &parent->children);
- if (asce & _ASCE_REAL_SPACE) {
- /* nothing to protect, return right away */
- new->initialized = true;
- spin_unlock(&parent->shadow_lock);
- return new;
- }
- spin_unlock(&parent->shadow_lock);
- /* protect after insertion, so it will get properly invalidated */
- mmap_read_lock(parent->mm);
- rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN,
- ((asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE,
- PROT_READ, GMAP_NOTIFY_SHADOW);
- mmap_read_unlock(parent->mm);
- spin_lock(&parent->shadow_lock);
- new->initialized = true;
- if (rc) {
- list_del(&new->list);
- gmap_free(new);
- new = ERR_PTR(rc);
- }
- spin_unlock(&parent->shadow_lock);
- return new;
-}
-EXPORT_SYMBOL_GPL(gmap_shadow);
+EXPORT_SYMBOL(gmap_unshadow);
/**
* gmap_shadow_r2t - create an empty shadow region 2 table
@@ -1775,9 +1520,6 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
page = gmap_alloc_crst();
if (!page)
return -ENOMEM;
- page->index = r2t & _REGION_ENTRY_ORIGIN;
- if (fake)
- page->index |= GMAP_SHADOW_FAKE_TABLE;
s_r2t = page_to_phys(page);
/* Install shadow region second table */
spin_lock(&sg->guest_table_lock);
@@ -1799,7 +1541,6 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
_REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID;
if (sg->edat_level >= 1)
*table |= (r2t & _REGION_ENTRY_PROTECT);
- list_add(&page->lru, &sg->crst_list);
if (fake) {
/* nothing to protect for fake tables */
*table &= ~_REGION_ENTRY_INVALID;
@@ -1859,9 +1600,6 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
page = gmap_alloc_crst();
if (!page)
return -ENOMEM;
- page->index = r3t & _REGION_ENTRY_ORIGIN;
- if (fake)
- page->index |= GMAP_SHADOW_FAKE_TABLE;
s_r3t = page_to_phys(page);
/* Install shadow region second table */
spin_lock(&sg->guest_table_lock);
@@ -1883,7 +1621,6 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
_REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID;
if (sg->edat_level >= 1)
*table |= (r3t & _REGION_ENTRY_PROTECT);
- list_add(&page->lru, &sg->crst_list);
if (fake) {
/* nothing to protect for fake tables */
*table &= ~_REGION_ENTRY_INVALID;
@@ -1943,9 +1680,6 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
page = gmap_alloc_crst();
if (!page)
return -ENOMEM;
- page->index = sgt & _REGION_ENTRY_ORIGIN;
- if (fake)
- page->index |= GMAP_SHADOW_FAKE_TABLE;
s_sgt = page_to_phys(page);
/* Install shadow region second table */
spin_lock(&sg->guest_table_lock);
@@ -1967,7 +1701,6 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
_REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID;
if (sg->edat_level >= 1)
*table |= sgt & _REGION_ENTRY_PROTECT;
- list_add(&page->lru, &sg->crst_list);
if (fake) {
/* nothing to protect for fake tables */
*table &= ~_REGION_ENTRY_INVALID;
@@ -2000,45 +1733,22 @@ out_free:
}
EXPORT_SYMBOL_GPL(gmap_shadow_sgt);
-/**
- * gmap_shadow_pgt_lookup - find a shadow page table
- * @sg: pointer to the shadow guest address space structure
- * @saddr: the address in the shadow aguest address space
- * @pgt: parent gmap address of the page table to get shadowed
- * @dat_protection: if the pgtable is marked as protected by dat
- * @fake: pgt references contiguous guest memory block, not a pgtable
- *
- * Returns 0 if the shadow page table was found and -EAGAIN if the page
- * table was not found.
- *
- * Called with sg->mm->mmap_lock in read.
- */
-int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr,
- unsigned long *pgt, int *dat_protection,
- int *fake)
+static void gmap_pgste_set_pgt_addr(struct ptdesc *ptdesc, unsigned long pgt_addr)
{
- unsigned long *table;
- struct page *page;
- int rc;
+ unsigned long *pgstes = page_to_virt(ptdesc_page(ptdesc));
- BUG_ON(!gmap_is_shadow(sg));
- spin_lock(&sg->guest_table_lock);
- table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
- if (table && !(*table & _SEGMENT_ENTRY_INVALID)) {
- /* Shadow page tables are full pages (pte+pgste) */
- page = pfn_to_page(*table >> PAGE_SHIFT);
- *pgt = page->index & ~GMAP_SHADOW_FAKE_TABLE;
- *dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT);
- *fake = !!(page->index & GMAP_SHADOW_FAKE_TABLE);
- rc = 0;
- } else {
- rc = -EAGAIN;
- }
- spin_unlock(&sg->guest_table_lock);
- return rc;
+ pgstes += _PAGE_ENTRIES;
+
+ pgstes[0] &= ~PGSTE_ST2_MASK;
+ pgstes[1] &= ~PGSTE_ST2_MASK;
+ pgstes[2] &= ~PGSTE_ST2_MASK;
+ pgstes[3] &= ~PGSTE_ST2_MASK;
+ pgstes[0] |= (pgt_addr >> 16) & PGSTE_ST2_MASK;
+ pgstes[1] |= pgt_addr & PGSTE_ST2_MASK;
+ pgstes[2] |= (pgt_addr << 16) & PGSTE_ST2_MASK;
+ pgstes[3] |= (pgt_addr << 32) & PGSTE_ST2_MASK;
}
-EXPORT_SYMBOL_GPL(gmap_shadow_pgt_lookup);
/**
* gmap_shadow_pgt - instantiate a shadow page table
@@ -2058,19 +1768,20 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
{
unsigned long raddr, origin;
unsigned long *table;
- struct page *page;
+ struct ptdesc *ptdesc;
phys_addr_t s_pgt;
int rc;
BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE));
/* Allocate a shadow page table */
- page = page_table_alloc_pgste(sg->mm);
- if (!page)
+ ptdesc = page_table_alloc_pgste(sg->mm);
+ if (!ptdesc)
return -ENOMEM;
- page->index = pgt & _SEGMENT_ENTRY_ORIGIN;
+ origin = pgt & _SEGMENT_ENTRY_ORIGIN;
if (fake)
- page->index |= GMAP_SHADOW_FAKE_TABLE;
- s_pgt = page_to_phys(page);
+ origin |= GMAP_SHADOW_FAKE_TABLE;
+ gmap_pgste_set_pgt_addr(ptdesc, origin);
+ s_pgt = page_to_phys(ptdesc_page(ptdesc));
/* Install shadow page table */
spin_lock(&sg->guest_table_lock);
table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
@@ -2088,7 +1799,6 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
/* mark as invalid as long as the parent table is not protected */
*table = (unsigned long) s_pgt | _SEGMENT_ENTRY |
(pgt & _SEGMENT_ENTRY_PROTECT) | _SEGMENT_ENTRY_INVALID;
- list_add(&page->lru, &sg->pt_list);
if (fake) {
/* nothing to protect for fake tables */
*table &= ~_SEGMENT_ENTRY_INVALID;
@@ -2114,7 +1824,7 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
return rc;
out_free:
spin_unlock(&sg->guest_table_lock);
- page_table_free_pgste(page);
+ page_table_free_pgste(ptdesc);
return rc;
}
@@ -2266,7 +1976,6 @@ void ptep_notify(struct mm_struct *mm, unsigned long vmaddr,
pte_t *pte, unsigned long bits)
{
unsigned long offset, gaddr = 0;
- unsigned long *table;
struct gmap *gmap, *sg, *next;
offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
@@ -2274,12 +1983,9 @@ void ptep_notify(struct mm_struct *mm, unsigned long vmaddr,
rcu_read_lock();
list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
spin_lock(&gmap->guest_table_lock);
- table = radix_tree_lookup(&gmap->host_to_guest,
- vmaddr >> PMD_SHIFT);
- if (table)
- gaddr = __gmap_segment_gaddr(table) + offset;
+ gaddr = host_to_guest_lookup(gmap, vmaddr) + offset;
spin_unlock(&gmap->guest_table_lock);
- if (!table)
+ if (!IS_GADDR_VALID(gaddr))
continue;
if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) {
@@ -2339,13 +2045,12 @@ static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr,
rcu_read_lock();
list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
spin_lock(&gmap->guest_table_lock);
- pmdp = (pmd_t *)radix_tree_delete(&gmap->host_to_guest,
- vmaddr >> PMD_SHIFT);
+ pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr);
if (pmdp) {
- gaddr = __gmap_segment_gaddr((unsigned long *)pmdp);
pmdp_notify_gmap(gmap, pmdp, gaddr);
WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
- _SEGMENT_ENTRY_GMAP_UC));
+ _SEGMENT_ENTRY_GMAP_UC |
+ _SEGMENT_ENTRY));
if (purge)
__pmdp_csp(pmdp);
set_pmd(pmdp, __pmd(_SEGMENT_ENTRY_EMPTY));
@@ -2385,27 +2090,25 @@ EXPORT_SYMBOL_GPL(gmap_pmdp_csp);
*/
void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr)
{
- unsigned long *entry, gaddr;
+ unsigned long gaddr;
struct gmap *gmap;
pmd_t *pmdp;
rcu_read_lock();
list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
spin_lock(&gmap->guest_table_lock);
- entry = radix_tree_delete(&gmap->host_to_guest,
- vmaddr >> PMD_SHIFT);
- if (entry) {
- pmdp = (pmd_t *)entry;
- gaddr = __gmap_segment_gaddr(entry);
+ pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr);
+ if (pmdp) {
pmdp_notify_gmap(gmap, pmdp, gaddr);
- WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
- _SEGMENT_ENTRY_GMAP_UC));
+ WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
+ _SEGMENT_ENTRY_GMAP_UC |
+ _SEGMENT_ENTRY));
if (MACHINE_HAS_TLB_GUEST)
__pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE,
gmap->asce, IDTE_LOCAL);
else if (MACHINE_HAS_IDTE)
__pmdp_idte(gaddr, pmdp, 0, 0, IDTE_LOCAL);
- *entry = _SEGMENT_ENTRY_EMPTY;
+ *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY);
}
spin_unlock(&gmap->guest_table_lock);
}
@@ -2420,21 +2123,19 @@ EXPORT_SYMBOL_GPL(gmap_pmdp_idte_local);
*/
void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr)
{
- unsigned long *entry, gaddr;
+ unsigned long gaddr;
struct gmap *gmap;
pmd_t *pmdp;
rcu_read_lock();
list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
spin_lock(&gmap->guest_table_lock);
- entry = radix_tree_delete(&gmap->host_to_guest,
- vmaddr >> PMD_SHIFT);
- if (entry) {
- pmdp = (pmd_t *)entry;
- gaddr = __gmap_segment_gaddr(entry);
+ pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr);
+ if (pmdp) {
pmdp_notify_gmap(gmap, pmdp, gaddr);
- WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
- _SEGMENT_ENTRY_GMAP_UC));
+ WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
+ _SEGMENT_ENTRY_GMAP_UC |
+ _SEGMENT_ENTRY));
if (MACHINE_HAS_TLB_GUEST)
__pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE,
gmap->asce, IDTE_GLOBAL);
@@ -2442,7 +2143,7 @@ void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr)
__pmdp_idte(gaddr, pmdp, 0, 0, IDTE_GLOBAL);
else
__pmdp_csp(pmdp);
- *entry = _SEGMENT_ENTRY_EMPTY;
+ *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY);
}
spin_unlock(&gmap->guest_table_lock);
}
@@ -2498,7 +2199,7 @@ void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4],
if (!pmdp)
return;
- if (pmd_large(*pmdp)) {
+ if (pmd_leaf(*pmdp)) {
if (gmap_test_and_clear_dirty_pmd(gmap, pmdp, gaddr))
bitmap_fill(bitmap, _PAGE_ENTRIES);
} else {
@@ -2548,41 +2249,6 @@ static inline void thp_split_mm(struct mm_struct *mm)
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
/*
- * Remove all empty zero pages from the mapping for lazy refaulting
- * - This must be called after mm->context.has_pgste is set, to avoid
- * future creation of zero pages
- * - This must be called after THP was disabled.
- *
- * mm contracts with s390, that even if mm were to remove a page table,
- * racing with the loop below and so causing pte_offset_map_lock() to fail,
- * it will never insert a page table containing empty zero pages once
- * mm_forbids_zeropage(mm) i.e. mm->context.has_pgste is set.
- */
-static int __zap_zero_pages(pmd_t *pmd, unsigned long start,
- unsigned long end, struct mm_walk *walk)
-{
- unsigned long addr;
-
- for (addr = start; addr != end; addr += PAGE_SIZE) {
- pte_t *ptep;
- spinlock_t *ptl;
-
- ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
- if (!ptep)
- break;
- if (is_zero_pfn(pte_pfn(*ptep)))
- ptep_xchg_direct(walk->mm, addr, ptep, __pte(_PAGE_INVALID));
- pte_unmap_unlock(ptep, ptl);
- }
- return 0;
-}
-
-static const struct mm_walk_ops zap_zero_walk_ops = {
- .pmd_entry = __zap_zero_pages,
- .walk_lock = PGWALK_WRLOCK,
-};
-
-/*
* switch on pgstes for its userspace process (for kvm)
*/
int s390_enable_sie(void)
@@ -2599,22 +2265,142 @@ int s390_enable_sie(void)
mm->context.has_pgste = 1;
/* split thp mappings and disable thp for future mappings */
thp_split_mm(mm);
- walk_page_range(mm, 0, TASK_SIZE, &zap_zero_walk_ops, NULL);
mmap_write_unlock(mm);
return 0;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);
-int gmap_mark_unmergeable(void)
+static int find_zeropage_pte_entry(pte_t *pte, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
{
+ unsigned long *found_addr = walk->private;
+
+	/* Return 1 if the page is a zeropage. */
+ if (is_zero_pfn(pte_pfn(*pte))) {
+ /*
+ * Shared zeropage in e.g., a FS DAX mapping? We cannot do the
+ * right thing and likely don't care: FAULT_FLAG_UNSHARE
+ * currently only works in COW mappings, which is also where
+ * mm_forbids_zeropage() is checked.
+ */
+ if (!is_cow_mapping(walk->vma->vm_flags))
+ return -EFAULT;
+
+ *found_addr = addr;
+ return 1;
+ }
+ return 0;
+}
+
+static const struct mm_walk_ops find_zeropage_ops = {
+ .pte_entry = find_zeropage_pte_entry,
+ .walk_lock = PGWALK_WRLOCK,
+};
+
+/*
+ * Unshare all shared zeropages, replacing them by anonymous pages. Note that
+ * we cannot simply zap all shared zeropages, because this could later
+ * trigger unexpected userfaultfd missing events.
+ *
+ * This must be called after mm->context.allow_cow_sharing was
+ * set to 0, to avoid future mappings of shared zeropages.
+ *
+ * mm contracts with s390, that even if mm were to remove a page table,
+ * and racing with walk_page_range_vma() calling pte_offset_map_lock()
+ * would fail, it will never insert a page table containing empty zero
+ * pages once mm_forbids_zeropage(mm) i.e.
+ * mm->context.allow_cow_sharing is set to 0.
+ */
+static int __s390_unshare_zeropages(struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+ VMA_ITERATOR(vmi, mm, 0);
+ unsigned long addr;
+ vm_fault_t fault;
+ int rc;
+
+ for_each_vma(vmi, vma) {
+ /*
+ * We could only look at COW mappings, but it's more future
+ * proof to catch unexpected zeropages in other mappings and
+ * fail.
+ */
+ if ((vma->vm_flags & VM_PFNMAP) || is_vm_hugetlb_page(vma))
+ continue;
+ addr = vma->vm_start;
+
+retry:
+ rc = walk_page_range_vma(vma, addr, vma->vm_end,
+ &find_zeropage_ops, &addr);
+ if (rc < 0)
+ return rc;
+ else if (!rc)
+ continue;
+
+ /* addr was updated by find_zeropage_pte_entry() */
+ fault = handle_mm_fault(vma, addr,
+ FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE,
+ NULL);
+ if (fault & VM_FAULT_OOM)
+ return -ENOMEM;
+ /*
+ * See break_ksm(): even after handle_mm_fault() returned 0, we
+ * must start the lookup from the current address, because
+ * handle_mm_fault() may back out if there's any difficulty.
+ *
+ * VM_FAULT_SIGBUS and VM_FAULT_SIGSEGV are unexpected but
+ * maybe they could trigger in the future on concurrent
+ * truncation. In that case, the shared zeropage would be gone
+ * and we can simply retry and make progress.
+ */
+ cond_resched();
+ goto retry;
+ }
+
+ return 0;
+}
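The scan/fault/retry structure of __s390_unshare_zeropages() mirrors break_ksm(): after each resolved hit the walk restarts from the same address, because handle_mm_fault() may back out without making progress. A stand-alone toy sketch of that control flow; all helpers here are hypothetical stand-ins, not kernel APIs:

#include <stdio.h>

#define N 16
static int needs_work[N] = { 0, 1, 0, 1, 1, 0, 0, 1 };
static int backoff = 1;	/* make the first resolution "back out" once */

/* Stand-in for the page walk that reports the next offending index. */
static int find_next_target(int start, int *idx)
{
	for (int i = start; i < N; i++) {
		if (needs_work[i]) {
			*idx = i;
			return 1;	/* found one */
		}
	}
	return 0;			/* nothing left */
}

/* Stand-in for the unshare fault: it is allowed to back out and do nothing. */
static int resolve_target(int idx)
{
	if (backoff) {
		backoff = 0;
		return 0;		/* backed out, nothing changed */
	}
	needs_work[idx] = 0;
	return 0;
}

int main(void)
{
	int idx = 0;

retry:
	if (find_next_target(idx, &idx)) {
		resolve_target(idx);
		/* restart from the same index: the resolver may have backed out */
		goto retry;
	}
	puts("all targets resolved");
	return 0;
}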
+
+static int __s390_disable_cow_sharing(struct mm_struct *mm)
+{
+ int rc;
+
+ if (!mm->context.allow_cow_sharing)
+ return 0;
+
+ mm->context.allow_cow_sharing = 0;
+
+ /* Replace all shared zeropages by anonymous pages. */
+ rc = __s390_unshare_zeropages(mm);
/*
* Make sure to disable KSM (if enabled for the whole process or
* individual VMAs). Note that nothing currently hinders user space
* from re-enabling it.
*/
- return ksm_disable(current->mm);
+ if (!rc)
+ rc = ksm_disable(mm);
+ if (rc)
+ mm->context.allow_cow_sharing = 1;
+ return rc;
+}
+
+/*
+ * Disable most COW-sharing of memory pages for the whole process:
+ * (1) Disable KSM and unmerge/unshare any KSM pages.
+ * (2) Disallow shared zeropages and unshare any zeropages that are mapped.
+ *
+ * Note that we currently don't bother with COW-shared pages that are shared
+ * with parent/child processes due to fork().
+ */
+int s390_disable_cow_sharing(void)
+{
+ int rc;
+
+ mmap_write_lock(current->mm);
+ rc = __s390_disable_cow_sharing(current->mm);
+ mmap_write_unlock(current->mm);
+ return rc;
}
-EXPORT_SYMBOL_GPL(gmap_mark_unmergeable);
+EXPORT_SYMBOL_GPL(s390_disable_cow_sharing);
/*
* Enable storage key handling from now on and initialize the storage
@@ -2646,7 +2432,7 @@ static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr,
{
pmd_t *pmd = (pmd_t *)pte;
unsigned long start, end;
- struct page *page = pmd_page(*pmd);
+ struct folio *folio = page_folio(pmd_page(*pmd));
/*
* The write check makes sure we do not set a key on shared
@@ -2659,9 +2445,9 @@ static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr,
return 0;
start = pmd_val(*pmd) & HPAGE_MASK;
- end = start + HPAGE_SIZE - 1;
+ end = start + HPAGE_SIZE;
__storage_key_init_range(start, end);
- set_bit(PG_arch_1, &page->flags);
+ set_bit(PG_arch_1, &folio->flags);
cond_resched();
return 0;
}
@@ -2683,7 +2469,7 @@ int s390_enable_skey(void)
goto out_up;
mm->context.uses_skeys = 1;
- rc = gmap_mark_unmergeable();
+ rc = __s390_disable_cow_sharing(mm);
if (rc) {
mm->context.uses_skeys = 0;
goto out_up;
@@ -2754,13 +2540,15 @@ static const struct mm_walk_ops gather_pages_ops = {
*/
void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns)
{
+ struct folio *folio;
unsigned long i;
for (i = 0; i < count; i++) {
+ folio = pfn_folio(pfns[i]);
/* we always have an extra reference */
- uv_destroy_owned_page(pfn_to_phys(pfns[i]));
+ uv_destroy_folio(folio);
/* get rid of the extra reference */
- put_page(pfn_to_page(pfns[i]));
+ folio_put(folio);
cond_resched();
}
}
@@ -2801,49 +2589,6 @@ int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start,
EXPORT_SYMBOL_GPL(__s390_uv_destroy_range);
/**
- * s390_unlist_old_asce - Remove the topmost level of page tables from the
- * list of page tables of the gmap.
- * @gmap: the gmap whose table is to be removed
- *
- * On s390x, KVM keeps a list of all pages containing the page tables of the
- * gmap (the CRST list). This list is used at tear down time to free all
- * pages that are now not needed anymore.
- *
- * This function removes the topmost page of the tree (the one pointed to by
- * the ASCE) from the CRST list.
- *
- * This means that it will not be freed when the VM is torn down, and needs
- * to be handled separately by the caller, unless a leak is actually
- * intended. Notice that this function will only remove the page from the
- * list, the page will still be used as a top level page table (and ASCE).
- */
-void s390_unlist_old_asce(struct gmap *gmap)
-{
- struct page *old;
-
- old = virt_to_page(gmap->table);
- spin_lock(&gmap->guest_table_lock);
- list_del(&old->lru);
- /*
- * Sometimes the topmost page might need to be "removed" multiple
- * times, for example if the VM is rebooted into secure mode several
- * times concurrently, or if s390_replace_asce fails after calling
- * s390_remove_old_asce and is attempted again later. In that case
- * the old asce has been removed from the list, and therefore it
- * will not be freed when the VM terminates, but the ASCE is still
- * in use and still pointed to.
- * A subsequent call to replace_asce will follow the pointer and try
- * to remove the same page from the list again.
- * Therefore it's necessary that the page of the ASCE has valid
- * pointers, so list_del can work (and do nothing) without
- * dereferencing stale or invalid pointers.
- */
- INIT_LIST_HEAD(&old->lru);
- spin_unlock(&gmap->guest_table_lock);
-}
-EXPORT_SYMBOL_GPL(s390_unlist_old_asce);
-
-/**
* s390_replace_asce - Try to replace the current ASCE of a gmap with a copy
* @gmap: the gmap whose ASCE needs to be replaced
*
@@ -2862,8 +2607,6 @@ int s390_replace_asce(struct gmap *gmap)
struct page *page;
void *table;
- s390_unlist_old_asce(gmap);
-
/* Replacing segment type ASCEs would cause serious issues */
if ((gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT)
return -EINVAL;
@@ -2871,19 +2614,9 @@ int s390_replace_asce(struct gmap *gmap)
page = gmap_alloc_crst();
if (!page)
return -ENOMEM;
- page->index = 0;
table = page_to_virt(page);
memcpy(table, gmap->table, 1UL << (CRST_ALLOC_ORDER + PAGE_SHIFT));
- /*
- * The caller has to deal with the old ASCE, but here we make sure
- * the new one is properly added to the CRST list, so that
- * it will be freed when the VM is torn down.
- */
- spin_lock(&gmap->guest_table_lock);
- list_add(&page->lru, &gmap->crst_list);
- spin_unlock(&gmap->guest_table_lock);
-
/* Set new table origin while preserving existing ASCE control bits */
asce = (gmap->asce & ~_ASCE_ORIGIN) | __pa(table);
WRITE_ONCE(gmap->asce, asce);
diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c
index 297a6d897d5a..2e568f175cd4 100644
--- a/arch/s390/mm/hugetlbpage.c
+++ b/arch/s390/mm/hugetlbpage.c
@@ -24,6 +24,7 @@
static inline unsigned long __pte_to_rste(pte_t pte)
{
+ swp_entry_t arch_entry;
unsigned long rste;
/*
@@ -48,6 +49,7 @@ static inline unsigned long __pte_to_rste(pte_t pte)
*/
if (pte_present(pte)) {
rste = pte_val(pte) & PAGE_MASK;
+ rste |= _SEGMENT_ENTRY_PRESENT;
rste |= move_set_bit(pte_val(pte), _PAGE_READ,
_SEGMENT_ENTRY_READ);
rste |= move_set_bit(pte_val(pte), _PAGE_WRITE,
@@ -66,6 +68,10 @@ static inline unsigned long __pte_to_rste(pte_t pte)
#endif
rste |= move_set_bit(pte_val(pte), _PAGE_NOEXEC,
_SEGMENT_ENTRY_NOEXEC);
+ } else if (!pte_none(pte)) {
+ /* swap pte */
+ arch_entry = __pte_to_swp_entry(pte);
+ rste = mk_swap_rste(__swp_type(arch_entry), __swp_offset(arch_entry));
} else
rste = _SEGMENT_ENTRY_EMPTY;
return rste;
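__pte_to_rste()/__rste_to_pte() relocate individual pte bits into their segment/region-entry positions (and back) via move_set_bit(). A user-space sketch of the idea with made-up bit values; the stand-in helper only handles single-bit masks and is not the in-tree implementation:

#include <assert.h>

/* Made-up bit positions, for illustration only. */
#define FAKE_PAGE_READ		0x010UL
#define FAKE_PAGE_WRITE		0x020UL
#define FAKE_SEG_READ		0x0001UL
#define FAKE_SEG_WRITE		0x0002UL

/* If single-bit mask "from" is set in x, return single-bit mask "to". */
static unsigned long move_set_bit(unsigned long x, unsigned long from,
				  unsigned long to)
{
	return (x & from) ? to : 0;
}

int main(void)
{
	unsigned long pteval = FAKE_PAGE_READ | FAKE_PAGE_WRITE;
	unsigned long rste = 0;

	rste |= move_set_bit(pteval, FAKE_PAGE_READ, FAKE_SEG_READ);
	rste |= move_set_bit(pteval, FAKE_PAGE_WRITE, FAKE_SEG_WRITE);
	assert(rste == (FAKE_SEG_READ | FAKE_SEG_WRITE));
	return 0;
}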
@@ -73,13 +79,18 @@ static inline unsigned long __pte_to_rste(pte_t pte)
static inline pte_t __rste_to_pte(unsigned long rste)
{
+ swp_entry_t arch_entry;
unsigned long pteval;
- int present;
+ int present, none;
+ pte_t pte;
- if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3)
+ if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) {
present = pud_present(__pud(rste));
- else
+ none = pud_none(__pud(rste));
+ } else {
present = pmd_present(__pmd(rste));
+ none = pmd_none(__pmd(rste));
+ }
/*
* Convert encoding pmd / pud bits pte bits
@@ -114,6 +125,11 @@ static inline pte_t __rste_to_pte(unsigned long rste)
pteval |= move_set_bit(rste, _SEGMENT_ENTRY_SOFT_DIRTY, _PAGE_SOFT_DIRTY);
#endif
pteval |= move_set_bit(rste, _SEGMENT_ENTRY_NOEXEC, _PAGE_NOEXEC);
+ } else if (!none) {
+ /* swap rste */
+ arch_entry = __rste_to_swp_entry(rste);
+ pte = mk_swap_pte(__swp_type_rste(arch_entry), __swp_offset_rste(arch_entry));
+ pteval = pte_val(pte);
} else
pteval = _PAGE_INVALID;
return __pte(pteval);
@@ -121,7 +137,7 @@ static inline pte_t __rste_to_pte(unsigned long rste)
static void clear_huge_pte_skeys(struct mm_struct *mm, unsigned long rste)
{
- struct page *page;
+ struct folio *folio;
unsigned long size, paddr;
if (!mm_uses_skeys(mm) ||
@@ -129,17 +145,17 @@ static void clear_huge_pte_skeys(struct mm_struct *mm, unsigned long rste)
return;
if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) {
- page = pud_page(__pud(rste));
+ folio = page_folio(pud_page(__pud(rste)));
size = PUD_SIZE;
paddr = rste & PUD_MASK;
} else {
- page = pmd_page(__pmd(rste));
+ folio = page_folio(pmd_page(__pmd(rste)));
size = PMD_SIZE;
paddr = rste & PMD_MASK;
}
- if (!test_and_set_bit(PG_arch_1, &page->flags))
- __storage_key_init_range(paddr, paddr + size - 1);
+ if (!test_and_set_bit(PG_arch_1, &folio->flags))
+ __storage_key_init_range(paddr, paddr + size);
}
void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
@@ -148,8 +164,6 @@ void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
unsigned long rste;
rste = __pte_to_rste(pte);
- if (!MACHINE_HAS_NX)
- rste &= ~_SEGMENT_ENTRY_NOEXEC;
/* Set correct table type for 2G hugepages */
if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) {
@@ -169,15 +183,15 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
__set_huge_pte_at(mm, addr, ptep, pte);
}
-pte_t huge_ptep_get(pte_t *ptep)
+pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
return __rste_to_pte(pte_val(*ptep));
}
-pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
- unsigned long addr, pte_t *ptep)
+pte_t __huge_ptep_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
{
- pte_t pte = huge_ptep_get(ptep);
+ pte_t pte = huge_ptep_get(mm, addr, ptep);
pmd_t *pmdp = (pmd_t *) ptep;
pud_t *pudp = (pud_t *) ptep;
@@ -223,26 +237,15 @@ pte_t *huge_pte_offset(struct mm_struct *mm,
p4dp = p4d_offset(pgdp, addr);
if (p4d_present(*p4dp)) {
pudp = pud_offset(p4dp, addr);
- if (pud_present(*pudp)) {
- if (pud_large(*pudp))
- return (pte_t *) pudp;
+ if (sz == PUD_SIZE)
+ return (pte_t *)pudp;
+ if (pud_present(*pudp))
pmdp = pmd_offset(pudp, addr);
- }
}
}
return (pte_t *) pmdp;
}
-int pmd_huge(pmd_t pmd)
-{
- return pmd_large(pmd);
-}
-
-int pud_huge(pud_t pud)
-{
- return pud_large(pud);
-}
-
bool __init arch_hugetlb_valid_size(unsigned long size)
{
if (MACHINE_HAS_EDAT1 && size == PMD_SIZE)
@@ -252,91 +255,3 @@ bool __init arch_hugetlb_valid_size(unsigned long size)
else
return false;
}
-
-static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
- unsigned long addr, unsigned long len,
- unsigned long pgoff, unsigned long flags)
-{
- struct hstate *h = hstate_file(file);
- struct vm_unmapped_area_info info;
-
- info.flags = 0;
- info.length = len;
- info.low_limit = current->mm->mmap_base;
- info.high_limit = TASK_SIZE;
- info.align_mask = PAGE_MASK & ~huge_page_mask(h);
- info.align_offset = 0;
- return vm_unmapped_area(&info);
-}
-
-static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
- unsigned long addr0, unsigned long len,
- unsigned long pgoff, unsigned long flags)
-{
- struct hstate *h = hstate_file(file);
- struct vm_unmapped_area_info info;
- unsigned long addr;
-
- info.flags = VM_UNMAPPED_AREA_TOPDOWN;
- info.length = len;
- info.low_limit = PAGE_SIZE;
- info.high_limit = current->mm->mmap_base;
- info.align_mask = PAGE_MASK & ~huge_page_mask(h);
- info.align_offset = 0;
- addr = vm_unmapped_area(&info);
-
- /*
- * A failed mmap() very likely causes application failure,
- * so fall back to the bottom-up function here. This scenario
- * can happen with large stack limits and large mmap()
- * allocations.
- */
- if (addr & ~PAGE_MASK) {
- VM_BUG_ON(addr != -ENOMEM);
- info.flags = 0;
- info.low_limit = TASK_UNMAPPED_BASE;
- info.high_limit = TASK_SIZE;
- addr = vm_unmapped_area(&info);
- }
-
- return addr;
-}
-
-unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
- unsigned long len, unsigned long pgoff, unsigned long flags)
-{
- struct hstate *h = hstate_file(file);
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
-
- if (len & ~huge_page_mask(h))
- return -EINVAL;
- if (len > TASK_SIZE - mmap_min_addr)
- return -ENOMEM;
-
- if (flags & MAP_FIXED) {
- if (prepare_hugepage_range(file, addr, len))
- return -EINVAL;
- goto check_asce_limit;
- }
-
- if (addr) {
- addr = ALIGN(addr, huge_page_size(h));
- vma = find_vma(mm, addr);
- if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
- (!vma || addr + len <= vm_start_gap(vma)))
- goto check_asce_limit;
- }
-
- if (mm->get_unmapped_area == arch_get_unmapped_area)
- addr = hugetlb_get_unmapped_area_bottomup(file, addr, len,
- pgoff, flags);
- else
- addr = hugetlb_get_unmapped_area_topdown(file, addr, len,
- pgoff, flags);
- if (offset_in_page(addr))
- return addr;
-
-check_asce_limit:
- return check_asce_limit(mm, addr, len);
-}
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 43e612bc2bcd..f2298f7a3f21 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -37,7 +37,6 @@
#include <asm/pgalloc.h>
#include <asm/ctlreg.h>
#include <asm/kfence.h>
-#include <asm/ptdump.h>
#include <asm/dma.h>
#include <asm/abs_lowcore.h>
#include <asm/tlb.h>
@@ -50,18 +49,29 @@
#include <asm/uv.h>
#include <linux/virtio_anchor.h>
#include <linux/virtio_config.h>
+#include <linux/execmem.h>
pgd_t swapper_pg_dir[PTRS_PER_PGD] __section(".bss..swapper_pg_dir");
pgd_t invalid_pg_dir[PTRS_PER_PGD] __section(".bss..invalid_pg_dir");
struct ctlreg __bootdata_preserved(s390_invalid_asce);
+unsigned long __bootdata_preserved(page_noexec_mask);
+EXPORT_SYMBOL(page_noexec_mask);
+
+unsigned long __bootdata_preserved(segment_noexec_mask);
+EXPORT_SYMBOL(segment_noexec_mask);
+
+unsigned long __bootdata_preserved(region_noexec_mask);
+EXPORT_SYMBOL(region_noexec_mask);
+
unsigned long empty_zero_page, zero_page_mask;
EXPORT_SYMBOL(empty_zero_page);
EXPORT_SYMBOL(zero_page_mask);
static void __init setup_zero_pages(void)
{
+ unsigned long total_pages = memblock_estimated_nr_free_pages();
unsigned int order;
struct page *page;
int i;
@@ -70,7 +80,7 @@ static void __init setup_zero_pages(void)
order = 7;
/* Limit number of empty zero pages for small memory sizes */
- while (order > 2 && (totalram_pages() >> 10) < (1UL << order))
+ while (order > 2 && (total_pages >> 10) < (1UL << order))
order--;
empty_zero_page = __get_free_pages(GFP_KERNEL | __GFP_ZERO, order);
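The loop above shrinks the allocation order so that the block of empty zero pages stays at or below roughly 0.1% of the estimated free pages, with a floor of order 2 (4 pages) and a cap of order 7 (128 pages). A small user-space sketch of that sizing rule, assuming 4 KiB pages:

#include <stdio.h>

/* Pick the zero-page block order for a given number of free 4 KiB pages. */
static unsigned int zero_page_order(unsigned long total_pages)
{
	unsigned int order = 7;			/* start at 128 pages (512 KiB) */

	while (order > 2 && (total_pages >> 10) < (1UL << order))
		order--;
	return order;
}

int main(void)
{
	unsigned long sizes_mb[] = { 64, 256, 1024, 16384 };

	for (unsigned int i = 0; i < 4; i++) {
		unsigned long pages = sizes_mb[i] << 8;	/* MiB -> 4 KiB pages */

		printf("%6lu MiB -> order %u\n", sizes_mb[i], zero_page_order(pages));
	}
	return 0;
}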
@@ -96,7 +106,7 @@ void __init paging_init(void)
vmem_map_init();
sparse_init();
- zone_dma_bits = 31;
+ zone_dma_limit = DMA_BIT_MASK(31);
memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
max_zone_pfns[ZONE_DMA] = virt_to_pfn(MAX_DMA_ADDRESS);
max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
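zone_dma_limit now carries an address limit rather than a bit count: DMA_BIT_MASK(31) is the highest address reachable with 31-bit addressing, i.e. the top of the 2 GiB ZONE_DMA range. A trivial check, assuming the usual DMA_BIT_MASK() definition from linux/dma-mapping.h:

#include <assert.h>
#include <stdio.h>

/* Assumed to match the definition in linux/dma-mapping.h. */
#define DMA_BIT_MASK(n)	(((n) == 64) ? ~0ULL : ((1ULL << (n)) - 1))

int main(void)
{
	assert(DMA_BIT_MASK(31) == 0x7fffffffULL);
	printf("ZONE_DMA limit: %#llx (%llu MiB)\n",
	       DMA_BIT_MASK(31), (DMA_BIT_MASK(31) + 1) >> 20);
	return 0;
}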
@@ -107,9 +117,10 @@ void mark_rodata_ro(void)
{
unsigned long size = __end_ro_after_init - __start_ro_after_init;
+ if (MACHINE_HAS_NX)
+ system_ctl_set_bit(0, CR0_INSTRUCTION_EXEC_PROTECTION_BIT);
__set_memory_ro(__start_ro_after_init, __end_ro_after_init);
pr_info("Write protected read-only-after-init data: %luk\n", size >> 10);
- debug_checkwx();
}
int set_memory_encrypted(unsigned long vaddr, int numpages)
@@ -170,13 +181,6 @@ void __init mem_init(void)
setup_zero_pages(); /* Setup zeroed pages. */
}
-void free_initmem(void)
-{
- set_memory_rwnx((unsigned long)_sinittext,
- (unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT);
- free_initmem_default(POISON_FREE_INITMEM);
-}
-
unsigned long memory_block_size_bytes(void)
{
/*
@@ -281,9 +285,6 @@ int arch_add_memory(int nid, u64 start, u64 size,
unsigned long size_pages = PFN_DOWN(size);
int rc;
- if (WARN_ON_ONCE(params->altmap))
- return -EINVAL;
-
if (WARN_ON_ONCE(params->pgprot.pgprot != PAGE_KERNEL.pgprot))
return -EINVAL;
@@ -307,3 +308,32 @@ void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
vmem_remove_mapping(start, size);
}
#endif /* CONFIG_MEMORY_HOTPLUG */
+
+#ifdef CONFIG_EXECMEM
+static struct execmem_info execmem_info __ro_after_init;
+
+struct execmem_info __init *execmem_arch_setup(void)
+{
+ unsigned long module_load_offset = 0;
+ unsigned long start;
+
+ if (kaslr_enabled())
+ module_load_offset = get_random_u32_inclusive(1, 1024) * PAGE_SIZE;
+
+ start = MODULES_VADDR + module_load_offset;
+
+ execmem_info = (struct execmem_info){
+ .ranges = {
+ [EXECMEM_DEFAULT] = {
+ .flags = EXECMEM_KASAN_SHADOW,
+ .start = start,
+ .end = MODULES_END,
+ .pgprot = PAGE_KERNEL,
+ .alignment = MODULE_ALIGN,
+ },
+ },
+ };
+
+ return &execmem_info;
+}
+#endif /* CONFIG_EXECMEM */
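With KASLR enabled, the module area start is pushed up by a random whole number of pages between 1 and 1024, i.e. between 4 KiB and 4 MiB above MODULES_VADDR in 4 KiB steps. A user-space sketch of just the offset computation, with get_random_u32_inclusive() replaced by a plain PRNG for illustration:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define PAGE_SIZE	4096UL

/* Stand-in for get_random_u32_inclusive(1, 1024). */
static unsigned long random_inclusive(unsigned long lo, unsigned long hi)
{
	return lo + (unsigned long)rand() % (hi - lo + 1);
}

int main(void)
{
	unsigned long offset;

	srand((unsigned int)time(NULL));
	offset = random_inclusive(1, 1024) * PAGE_SIZE;
	printf("module load offset above MODULES_VADDR: %#lx (%lu KiB)\n",
	       offset, offset >> 10);
	return 0;
}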
diff --git a/arch/s390/mm/maccess.c b/arch/s390/mm/maccess.c
index 632c3a55feed..44426e0f2944 100644
--- a/arch/s390/mm/maccess.c
+++ b/arch/s390/mm/maccess.c
@@ -17,6 +17,7 @@
#include <asm/asm-extable.h>
#include <asm/abs_lowcore.h>
#include <asm/stacktrace.h>
+#include <asm/sections.h>
#include <asm/maccess.h>
#include <asm/ctlreg.h>
@@ -48,7 +49,7 @@ static notrace long s390_kernel_write_odd(void *dst, const void *src, size_t siz
}
/*
- * s390_kernel_write - write to kernel memory bypassing DAT
+ * __s390_kernel_write - write to kernel memory bypassing DAT
* @dst: destination address
* @src: source address
* @size: number of bytes to copy
@@ -61,7 +62,7 @@ static notrace long s390_kernel_write_odd(void *dst, const void *src, size_t siz
*/
static DEFINE_SPINLOCK(s390_kernel_write_lock);
-notrace void *s390_kernel_write(void *dst, const void *src, size_t size)
+notrace void *__s390_kernel_write(void *dst, const void *src, size_t size)
{
void *tmp = dst;
unsigned long flags;
diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c
index fc9a7dc26c5e..76f376876e0d 100644
--- a/arch/s390/mm/mmap.c
+++ b/arch/s390/mm/mmap.c
@@ -17,6 +17,7 @@
#include <linux/random.h>
#include <linux/compat.h>
#include <linux/security.h>
+#include <linux/hugetlb.h>
#include <asm/elf.h>
static unsigned long stack_maxrandom_size(void)
@@ -71,13 +72,24 @@ static inline unsigned long mmap_base(unsigned long rnd,
return PAGE_ALIGN(STACK_TOP - gap - rnd);
}
+static int get_align_mask(struct file *filp, unsigned long flags)
+{
+ if (filp && is_file_hugepages(filp))
+ return huge_page_mask_align(filp);
+ if (!(current->flags & PF_RANDOMIZE))
+ return 0;
+ if (filp || (flags & MAP_SHARED))
+ return MMAP_ALIGN_MASK << PAGE_SHIFT;
+ return 0;
+}
+
unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
unsigned long len, unsigned long pgoff,
- unsigned long flags)
+ unsigned long flags, vm_flags_t vm_flags)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
- struct vm_unmapped_area_info info;
+ struct vm_unmapped_area_info info = {};
if (len > TASK_SIZE - mmap_min_addr)
return -ENOMEM;
@@ -93,15 +105,12 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
goto check_asce_limit;
}
- info.flags = 0;
info.length = len;
info.low_limit = mm->mmap_base;
info.high_limit = TASK_SIZE;
- if (filp || (flags & MAP_SHARED))
- info.align_mask = MMAP_ALIGN_MASK << PAGE_SHIFT;
- else
- info.align_mask = 0;
- info.align_offset = pgoff << PAGE_SHIFT;
+ info.align_mask = get_align_mask(filp, flags);
+ if (!(filp && is_file_hugepages(filp)))
+ info.align_offset = pgoff << PAGE_SHIFT;
addr = vm_unmapped_area(&info);
if (offset_in_page(addr))
return addr;
@@ -112,11 +121,11 @@ check_asce_limit:
unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
unsigned long len, unsigned long pgoff,
- unsigned long flags)
+ unsigned long flags, vm_flags_t vm_flags)
{
struct vm_area_struct *vma;
struct mm_struct *mm = current->mm;
- struct vm_unmapped_area_info info;
+ struct vm_unmapped_area_info info = {};
/* requested length too big for entire address space */
if (len > TASK_SIZE - mmap_min_addr)
@@ -138,11 +147,9 @@ unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long ad
info.length = len;
info.low_limit = PAGE_SIZE;
info.high_limit = mm->mmap_base;
- if (filp || (flags & MAP_SHARED))
- info.align_mask = MMAP_ALIGN_MASK << PAGE_SHIFT;
- else
- info.align_mask = 0;
- info.align_offset = pgoff << PAGE_SHIFT;
+ info.align_mask = get_align_mask(filp, flags);
+ if (!(filp && is_file_hugepages(filp)))
+ info.align_offset = pgoff << PAGE_SHIFT;
addr = vm_unmapped_area(&info);
/*
@@ -182,29 +189,35 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
*/
if (mmap_is_legacy(rlim_stack)) {
mm->mmap_base = mmap_base_legacy(random_factor);
- mm->get_unmapped_area = arch_get_unmapped_area;
+ clear_bit(MMF_TOPDOWN, &mm->flags);
} else {
mm->mmap_base = mmap_base(random_factor, rlim_stack);
- mm->get_unmapped_area = arch_get_unmapped_area_topdown;
+ set_bit(MMF_TOPDOWN, &mm->flags);
}
}
-static const pgprot_t protection_map[16] = {
- [VM_NONE] = PAGE_NONE,
- [VM_READ] = PAGE_RO,
- [VM_WRITE] = PAGE_RO,
- [VM_WRITE | VM_READ] = PAGE_RO,
- [VM_EXEC] = PAGE_RX,
- [VM_EXEC | VM_READ] = PAGE_RX,
- [VM_EXEC | VM_WRITE] = PAGE_RX,
- [VM_EXEC | VM_WRITE | VM_READ] = PAGE_RX,
- [VM_SHARED] = PAGE_NONE,
- [VM_SHARED | VM_READ] = PAGE_RO,
- [VM_SHARED | VM_WRITE] = PAGE_RW,
- [VM_SHARED | VM_WRITE | VM_READ] = PAGE_RW,
- [VM_SHARED | VM_EXEC] = PAGE_RX,
- [VM_SHARED | VM_EXEC | VM_READ] = PAGE_RX,
- [VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_RWX,
- [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_RWX
-};
+static pgprot_t protection_map[16] __ro_after_init;
+
+void __init setup_protection_map(void)
+{
+ pgprot_t *pm = protection_map;
+
+ pm[VM_NONE] = PAGE_NONE;
+ pm[VM_READ] = PAGE_RO;
+ pm[VM_WRITE] = PAGE_RO;
+ pm[VM_WRITE | VM_READ] = PAGE_RO;
+ pm[VM_EXEC] = PAGE_RX;
+ pm[VM_EXEC | VM_READ] = PAGE_RX;
+ pm[VM_EXEC | VM_WRITE] = PAGE_RX;
+ pm[VM_EXEC | VM_WRITE | VM_READ] = PAGE_RX;
+ pm[VM_SHARED] = PAGE_NONE;
+ pm[VM_SHARED | VM_READ] = PAGE_RO;
+ pm[VM_SHARED | VM_WRITE] = PAGE_RW;
+ pm[VM_SHARED | VM_WRITE | VM_READ] = PAGE_RW;
+ pm[VM_SHARED | VM_EXEC] = PAGE_RX;
+ pm[VM_SHARED | VM_EXEC | VM_READ] = PAGE_RX;
+ pm[VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_RWX;
+ pm[VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_RWX;
+}
+
DECLARE_VM_GET_PAGE_PROT
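The table is consumed by vm_get_page_prot(), which DECLARE_VM_GET_PAGE_PROT generates from the generic mm code. A user-space sketch mirroring that lookup; the table contents are copied from above, while the VM_* values and the index expression are assumptions based on the generic implementation:

#include <stdio.h>

/* Assumed VM_* bit values; together they form the 4-bit table index. */
#define VM_READ		0x1UL
#define VM_WRITE	0x2UL
#define VM_EXEC		0x4UL
#define VM_SHARED	0x8UL

static const char *protection_map[16] = {
	[0]					 = "PAGE_NONE",
	[VM_READ]				 = "PAGE_RO",
	[VM_WRITE]				 = "PAGE_RO",
	[VM_WRITE | VM_READ]			 = "PAGE_RO",
	[VM_EXEC]				 = "PAGE_RX",
	[VM_EXEC | VM_READ]			 = "PAGE_RX",
	[VM_EXEC | VM_WRITE]			 = "PAGE_RX",
	[VM_EXEC | VM_WRITE | VM_READ]		 = "PAGE_RX",
	[VM_SHARED]				 = "PAGE_NONE",
	[VM_SHARED | VM_READ]			 = "PAGE_RO",
	[VM_SHARED | VM_WRITE]			 = "PAGE_RW",
	[VM_SHARED | VM_WRITE | VM_READ]	 = "PAGE_RW",
	[VM_SHARED | VM_EXEC]			 = "PAGE_RX",
	[VM_SHARED | VM_EXEC | VM_READ]		 = "PAGE_RX",
	[VM_SHARED | VM_EXEC | VM_WRITE]	 = "PAGE_RWX",
	[VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = "PAGE_RWX",
};

/* Same shape of lookup that the kernel helper performs on its pgprot table. */
static const char *vm_get_page_prot(unsigned long vm_flags)
{
	return protection_map[vm_flags & (VM_READ | VM_WRITE | VM_EXEC | VM_SHARED)];
}

int main(void)
{
	printf("shared rw mapping -> %s\n",
	       vm_get_page_prot(VM_SHARED | VM_READ | VM_WRITE));
	return 0;
}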
diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c
index 631e3a4ee2de..eae97fb61712 100644
--- a/arch/s390/mm/pageattr.c
+++ b/arch/s390/mm/pageattr.c
@@ -12,6 +12,7 @@
#include <asm/pgalloc.h>
#include <asm/kfence.h>
#include <asm/page.h>
+#include <asm/asm.h>
#include <asm/set_memory.h>
static inline unsigned long sske_frame(unsigned long addr, unsigned char skey)
@@ -75,7 +76,7 @@ static void pgt_set(unsigned long *old, unsigned long new, unsigned long addr,
break;
}
table = (unsigned long *)((unsigned long)old & mask);
- crdte(*old, new, table, dtt, addr, S390_lowcore.kernel_asce.val);
+ crdte(*old, new, table, dtt, addr, get_lowcore()->kernel_asce.val);
} else if (MACHINE_HAS_IDTE) {
cspg(old, *old, new);
} else {
@@ -108,8 +109,6 @@ static int walk_pte_level(pmd_t *pmdp, unsigned long addr, unsigned long end,
} else if (flags & SET_MEMORY_DEF) {
new = __pte(pte_val(new) & PAGE_MASK);
new = set_pte_bit(new, PAGE_KERNEL);
- if (!MACHINE_HAS_NX)
- new = clear_pte_bit(new, __pgprot(_PAGE_NOEXEC));
}
pgt_set((unsigned long *)ptep, pte_val(new), addr, CRDTE_DTT_PAGE);
ptep++;
@@ -166,8 +165,6 @@ static void modify_pmd_page(pmd_t *pmdp, unsigned long addr,
} else if (flags & SET_MEMORY_DEF) {
new = __pmd(pmd_val(new) & PMD_MASK);
new = set_pmd_bit(new, SEGMENT_KERNEL);
- if (!MACHINE_HAS_NX)
- new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_NOEXEC));
}
pgt_set((unsigned long *)pmdp, pmd_val(new), addr, CRDTE_DTT_SEGMENT);
}
@@ -185,7 +182,7 @@ static int walk_pmd_level(pud_t *pudp, unsigned long addr, unsigned long end,
if (pmd_none(*pmdp))
return -EINVAL;
next = pmd_addr_end(addr, end);
- if (pmd_large(*pmdp)) {
+ if (pmd_leaf(*pmdp)) {
need_split = !!(flags & SET_MEMORY_4K);
need_split |= !!(addr & ~PMD_MASK);
need_split |= !!(addr + PMD_SIZE > next);
@@ -255,8 +252,6 @@ static void modify_pud_page(pud_t *pudp, unsigned long addr,
} else if (flags & SET_MEMORY_DEF) {
new = __pud(pud_val(new) & PUD_MASK);
new = set_pud_bit(new, REGION3_KERNEL);
- if (!MACHINE_HAS_NX)
- new = clear_pud_bit(new, __pgprot(_REGION_ENTRY_NOEXEC));
}
pgt_set((unsigned long *)pudp, pud_val(new), addr, CRDTE_DTT_REGION3);
}
@@ -274,7 +269,7 @@ static int walk_pud_level(p4d_t *p4d, unsigned long addr, unsigned long end,
if (pud_none(*pudp))
return -EINVAL;
next = pud_addr_end(addr, end);
- if (pud_large(*pudp)) {
+ if (pud_leaf(*pudp)) {
need_split = !!(flags & SET_MEMORY_4K);
need_split |= !!(addr & ~PUD_MASK);
need_split |= !!(addr + PUD_SIZE > next);
@@ -406,6 +401,33 @@ int set_direct_map_default_noflush(struct page *page)
return __set_memory((unsigned long)page_to_virt(page), 1, SET_MEMORY_DEF);
}
+int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid)
+{
+ unsigned long flags;
+
+ if (valid)
+ flags = SET_MEMORY_DEF;
+ else
+ flags = SET_MEMORY_INV;
+
+ return __set_memory((unsigned long)page_to_virt(page), nr, flags);
+}
+
+bool kernel_page_present(struct page *page)
+{
+ unsigned long addr;
+ unsigned int cc;
+
+ addr = (unsigned long)page_address(page);
+ asm volatile(
+ " lra %[addr],0(%[addr])\n"
+ CC_IPM(cc)
+ : CC_OUT(cc, cc), [addr] "+a" (addr)
+ :
+ : CC_CLOBBER);
+ return CC_TRANSFORM(cc) == 0;
+}
+
#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE)
static void ipte_range(pte_t *pte, unsigned long address, int nr)
diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
index 008e487c94a6..30387a6e98ff 100644
--- a/arch/s390/mm/pgalloc.c
+++ b/arch/s390/mm/pgalloc.c
@@ -21,7 +21,7 @@
int page_table_allocate_pgste = 0;
EXPORT_SYMBOL(page_table_allocate_pgste);
-static struct ctl_table page_table_sysctl[] = {
+static const struct ctl_table page_table_sysctl[] = {
{
.procname = "allocate_pgste",
.data = &page_table_allocate_pgste,
@@ -55,6 +55,8 @@ unsigned long *crst_table_alloc(struct mm_struct *mm)
void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
+ if (!table)
+ return;
pagetable_free(virt_to_ptdesc(table));
}
@@ -64,8 +66,8 @@ static void __crst_table_upgrade(void *arg)
/* change all active ASCEs to avoid the creation of new TLBs */
if (current->active_mm == mm) {
- S390_lowcore.user_asce.val = mm->context.asce;
- local_ctl_load(7, &S390_lowcore.user_asce);
+ get_lowcore()->user_asce.val = mm->context.asce;
+ local_ctl_load(7, &get_lowcore()->user_asce);
}
__tlb_flush_local();
}
@@ -86,12 +88,14 @@ int crst_table_upgrade(struct mm_struct *mm, unsigned long end)
if (unlikely(!p4d))
goto err_p4d;
crst_table_init(p4d, _REGION2_ENTRY_EMPTY);
+ pagetable_p4d_ctor(virt_to_ptdesc(p4d));
}
if (end > _REGION1_SIZE) {
pgd = crst_table_alloc(mm);
if (unlikely(!pgd))
goto err_pgd;
crst_table_init(pgd, _REGION1_ENTRY_EMPTY);
+ pagetable_pgd_ctor(virt_to_ptdesc(pgd));
}
spin_lock_bh(&mm->page_table_lock);
@@ -128,6 +132,7 @@ int crst_table_upgrade(struct mm_struct *mm, unsigned long end)
return 0;
err_pgd:
+ pagetable_dtor(virt_to_ptdesc(p4d));
crst_table_free(mm, p4d);
err_p4d:
return -ENOMEM;
@@ -135,7 +140,7 @@ err_p4d:
#ifdef CONFIG_PGSTE
-struct page *page_table_alloc_pgste(struct mm_struct *mm)
+struct ptdesc *page_table_alloc_pgste(struct mm_struct *mm)
{
struct ptdesc *ptdesc;
u64 *table;
@@ -147,12 +152,12 @@ struct page *page_table_alloc_pgste(struct mm_struct *mm)
memset64(table, _PAGE_INVALID, PTRS_PER_PTE);
memset64(table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
}
- return ptdesc_page(ptdesc);
+ return ptdesc;
}
-void page_table_free_pgste(struct page *page)
+void page_table_free_pgste(struct ptdesc *ptdesc)
{
- pagetable_free(page_ptdesc(page));
+ pagetable_free(ptdesc);
}
#endif /* CONFIG_PGSTE */
@@ -171,37 +176,16 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
}
table = ptdesc_to_virt(ptdesc);
__arch_set_page_dat(table, 1);
- /* pt_list is used by gmap only */
- INIT_LIST_HEAD(&ptdesc->pt_list);
memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
return table;
}
-static void pagetable_pte_dtor_free(struct ptdesc *ptdesc)
-{
- pagetable_pte_dtor(ptdesc);
- pagetable_free(ptdesc);
-}
-
void page_table_free(struct mm_struct *mm, unsigned long *table)
{
struct ptdesc *ptdesc = virt_to_ptdesc(table);
- pagetable_pte_dtor_free(ptdesc);
-}
-
-void __tlb_remove_table(void *table)
-{
- struct ptdesc *ptdesc = virt_to_ptdesc(table);
- struct page *page = ptdesc_page(ptdesc);
-
- if (compound_order(page) == CRST_ALLOC_ORDER) {
- /* pmd, pud, or p4d */
- pagetable_free(ptdesc);
- return;
- }
- pagetable_pte_dtor_free(ptdesc);
+ pagetable_dtor_free(ptdesc);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -209,7 +193,7 @@ static void pte_free_now(struct rcu_head *head)
{
struct ptdesc *ptdesc = container_of(head, struct ptdesc, pt_rcu_head);
- pagetable_pte_dtor_free(ptdesc);
+ pagetable_dtor_free(ptdesc);
}
void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable)
@@ -262,6 +246,8 @@ static unsigned long *base_crst_alloc(unsigned long val)
static void base_crst_free(unsigned long *table)
{
+ if (!table)
+ return;
pagetable_free(virt_to_ptdesc(table));
}
@@ -274,7 +260,7 @@ static inline unsigned long base_##NAME##_addr_end(unsigned long addr, \
return (next - 1) < (end - 1) ? next : end; \
}
-BASE_ADDR_END_FUNC(page, _PAGE_SIZE)
+BASE_ADDR_END_FUNC(page, PAGE_SIZE)
BASE_ADDR_END_FUNC(segment, _SEGMENT_SIZE)
BASE_ADDR_END_FUNC(region3, _REGION3_SIZE)
BASE_ADDR_END_FUNC(region2, _REGION2_SIZE)
@@ -298,7 +284,7 @@ static int base_page_walk(unsigned long *origin, unsigned long addr,
if (!alloc)
return 0;
pte = origin;
- pte += (addr & _PAGE_INDEX) >> _PAGE_SHIFT;
+ pte += (addr & _PAGE_INDEX) >> PAGE_SHIFT;
do {
next = base_page_addr_end(addr, end);
*pte = base_lra(addr);
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 99422926efe1..f05e62e037c2 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -360,8 +360,6 @@ void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
pgste_t pgste;
struct mm_struct *mm = vma->vm_mm;
- if (!MACHINE_HAS_NX)
- pte = clear_pte_bit(pte, __pgprot(_PAGE_NOEXEC));
if (mm_has_pgste(mm)) {
pgste = pgste_get(ptep);
pgste_set_key(ptep, pgste, pte, mm);
@@ -470,7 +468,7 @@ static int pmd_lookup(struct mm_struct *mm, unsigned long addr, pmd_t **pmdp)
return -ENOENT;
/* Large PUDs are not supported yet. */
- if (pud_large(*pud))
+ if (pud_leaf(*pud))
return -EFAULT;
*pmdp = pmd_offset(pud, addr);
@@ -525,7 +523,7 @@ static inline void pudp_idte_global(struct mm_struct *mm,
else
/*
* Invalid bit position is the same for pmd and pud, so we can
- * re-use _pmd_csp() here
+ * reuse _pmd_csp() here
*/
__pmdp_csp((pmd_t *) pudp);
}
@@ -721,9 +719,9 @@ static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry)
if (!non_swap_entry(entry))
dec_mm_counter(mm, MM_SWAPENTS);
else if (is_migration_entry(entry)) {
- struct page *page = pfn_swap_entry_to_page(entry);
+ struct folio *folio = pfn_swap_entry_folio(entry);
- dec_mm_counter(mm, mm_counter(page));
+ dec_mm_counter(mm, mm_counter(folio));
}
free_swap_and_cache(entry);
}
@@ -827,7 +825,7 @@ again:
return key ? -EFAULT : 0;
}
- if (pmd_large(*pmdp)) {
+ if (pmd_leaf(*pmdp)) {
paddr = pmd_val(*pmdp) & HPAGE_MASK;
paddr |= addr & ~HPAGE_MASK;
/*
@@ -938,7 +936,7 @@ again:
return 0;
}
- if (pmd_large(*pmdp)) {
+ if (pmd_leaf(*pmdp)) {
paddr = pmd_val(*pmdp) & HPAGE_MASK;
paddr |= addr & ~HPAGE_MASK;
cc = page_reset_referenced(paddr);
@@ -1002,7 +1000,7 @@ again:
return 0;
}
- if (pmd_large(*pmdp)) {
+ if (pmd_leaf(*pmdp)) {
paddr = pmd_val(*pmdp) & HPAGE_MASK;
paddr |= addr & ~HPAGE_MASK;
*key = page_get_storage_key(paddr);
diff --git a/arch/s390/mm/physaddr.c b/arch/s390/mm/physaddr.c
new file mode 100644
index 000000000000..59de866c72d9
--- /dev/null
+++ b/arch/s390/mm/physaddr.c
@@ -0,0 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/mmdebug.h>
+#include <linux/export.h>
+#include <linux/mm.h>
+#include <asm/page.h>
+
+unsigned long __phys_addr(unsigned long x, bool is_31bit)
+{
+ VIRTUAL_BUG_ON(is_vmalloc_or_module_addr((void *)(x)));
+ x = __pa_nodebug(x);
+ if (is_31bit)
+ VIRTUAL_BUG_ON(x >> 31);
+ return x;
+}
+EXPORT_SYMBOL(__phys_addr);
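The new helper backs the CONFIG_DEBUG_VIRTUAL checks: the translation itself is still __pa_nodebug(), but vmalloc/module addresses trip VIRTUAL_BUG_ON(), and the is_31bit variant additionally rejects results at or above 2 GiB (any bit set above bit 30). A trivial user-space illustration of that last check:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* A physical address is 31-bit addressable iff it lies below 2 GiB. */
static int fits_31bit(uint64_t phys)
{
	return (phys >> 31) == 0;
}

int main(void)
{
	assert(fits_31bit(0x7fffffffULL));	/* last byte below 2 GiB: ok */
	assert(!fits_31bit(0x80000000ULL));	/* 2 GiB and above: would trip VIRTUAL_BUG_ON */
	puts("31-bit checks behave as expected");
	return 0;
}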
diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index 186a020857cf..8ead999e340b 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -13,7 +13,9 @@
#include <linux/slab.h>
#include <linux/sort.h>
#include <asm/page-states.h>
+#include <asm/abs_lowcore.h>
#include <asm/cacheflush.h>
+#include <asm/maccess.h>
#include <asm/nospec-branch.h>
#include <asm/ctlreg.h>
#include <asm/pgalloc.h>
@@ -21,6 +23,7 @@
#include <asm/tlbflush.h>
#include <asm/sections.h>
#include <asm/set_memory.h>
+#include <asm/physmem_info.h>
static DEFINE_MUTEX(vmem_mutex);
@@ -33,8 +36,12 @@ static void __ref *vmem_alloc_pages(unsigned int order)
return memblock_alloc(size, size);
}
-static void vmem_free_pages(unsigned long addr, int order)
+static void vmem_free_pages(unsigned long addr, int order, struct vmem_altmap *altmap)
{
+ if (altmap) {
+ vmem_altmap_free(altmap, 1 << order);
+ return;
+ }
/* We don't expect boot memory to be removed ever. */
if (!slab_is_available() ||
WARN_ON_ONCE(PageReserved(virt_to_page((void *)addr))))
@@ -156,27 +163,25 @@ static bool vmemmap_unuse_sub_pmd(unsigned long start, unsigned long end)
/* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */
static int __ref modify_pte_table(pmd_t *pmd, unsigned long addr,
- unsigned long end, bool add, bool direct)
+ unsigned long end, bool add, bool direct,
+ struct vmem_altmap *altmap)
{
unsigned long prot, pages = 0;
int ret = -ENOMEM;
pte_t *pte;
prot = pgprot_val(PAGE_KERNEL);
- if (!MACHINE_HAS_NX)
- prot &= ~_PAGE_NOEXEC;
-
pte = pte_offset_kernel(pmd, addr);
for (; addr < end; addr += PAGE_SIZE, pte++) {
if (!add) {
if (pte_none(*pte))
continue;
if (!direct)
- vmem_free_pages((unsigned long) pfn_to_virt(pte_pfn(*pte)), 0);
+ vmem_free_pages((unsigned long)pfn_to_virt(pte_pfn(*pte)), get_order(PAGE_SIZE), altmap);
pte_clear(&init_mm, addr, pte);
} else if (pte_none(*pte)) {
if (!direct) {
- void *new_page = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE);
+ void *new_page = vmemmap_alloc_block_buf(PAGE_SIZE, NUMA_NO_NODE, altmap);
if (!new_page)
goto out;
@@ -213,7 +218,8 @@ static void try_free_pte_table(pmd_t *pmd, unsigned long start)
/* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */
static int __ref modify_pmd_table(pud_t *pud, unsigned long addr,
- unsigned long end, bool add, bool direct)
+ unsigned long end, bool add, bool direct,
+ struct vmem_altmap *altmap)
{
unsigned long next, prot, pages = 0;
int ret = -ENOMEM;
@@ -221,24 +227,21 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr,
pte_t *pte;
prot = pgprot_val(SEGMENT_KERNEL);
- if (!MACHINE_HAS_NX)
- prot &= ~_SEGMENT_ENTRY_NOEXEC;
-
pmd = pmd_offset(pud, addr);
for (; addr < end; addr = next, pmd++) {
next = pmd_addr_end(addr, end);
if (!add) {
if (pmd_none(*pmd))
continue;
- if (pmd_large(*pmd)) {
+ if (pmd_leaf(*pmd)) {
if (IS_ALIGNED(addr, PMD_SIZE) &&
IS_ALIGNED(next, PMD_SIZE)) {
if (!direct)
- vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE));
+ vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE), altmap);
pmd_clear(pmd);
pages++;
} else if (!direct && vmemmap_unuse_sub_pmd(addr, next)) {
- vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE));
+ vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE), altmap);
pmd_clear(pmd);
}
continue;
@@ -261,7 +264,7 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr,
* page tables since vmemmap_populate gets
* called for each section separately.
*/
- new_page = vmemmap_alloc_block(PMD_SIZE, NUMA_NO_NODE);
+ new_page = vmemmap_alloc_block_buf(PMD_SIZE, NUMA_NO_NODE, altmap);
if (new_page) {
set_pmd(pmd, __pmd(__pa(new_page) | prot));
if (!IS_ALIGNED(addr, PMD_SIZE) ||
@@ -275,12 +278,12 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr,
if (!pte)
goto out;
pmd_populate(&init_mm, pmd, pte);
- } else if (pmd_large(*pmd)) {
+ } else if (pmd_leaf(*pmd)) {
if (!direct)
vmemmap_use_sub_pmd(addr, next);
continue;
}
- ret = modify_pte_table(pmd, addr, next, add, direct);
+ ret = modify_pte_table(pmd, addr, next, add, direct, altmap);
if (ret)
goto out;
if (!add)
@@ -302,12 +305,12 @@ static void try_free_pmd_table(pud_t *pud, unsigned long start)
for (i = 0; i < PTRS_PER_PMD; i++, pmd++)
if (!pmd_none(*pmd))
return;
- vmem_free_pages(pud_deref(*pud), CRST_ALLOC_ORDER);
+ vmem_free_pages(pud_deref(*pud), CRST_ALLOC_ORDER, NULL);
pud_clear(pud);
}
static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end,
- bool add, bool direct)
+ bool add, bool direct, struct vmem_altmap *altmap)
{
unsigned long next, prot, pages = 0;
int ret = -ENOMEM;
@@ -315,15 +318,13 @@ static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end,
pmd_t *pmd;
prot = pgprot_val(REGION3_KERNEL);
- if (!MACHINE_HAS_NX)
- prot &= ~_REGION_ENTRY_NOEXEC;
pud = pud_offset(p4d, addr);
for (; addr < end; addr = next, pud++) {
next = pud_addr_end(addr, end);
if (!add) {
if (pud_none(*pud))
continue;
- if (pud_large(*pud)) {
+ if (pud_leaf(*pud)) {
if (IS_ALIGNED(addr, PUD_SIZE) &&
IS_ALIGNED(next, PUD_SIZE)) {
pud_clear(pud);
@@ -344,10 +345,10 @@ static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end,
if (!pmd)
goto out;
pud_populate(&init_mm, pud, pmd);
- } else if (pud_large(*pud)) {
+ } else if (pud_leaf(*pud)) {
continue;
}
- ret = modify_pmd_table(pud, addr, next, add, direct);
+ ret = modify_pmd_table(pud, addr, next, add, direct, altmap);
if (ret)
goto out;
if (!add)
@@ -370,12 +371,12 @@ static void try_free_pud_table(p4d_t *p4d, unsigned long start)
if (!pud_none(*pud))
return;
}
- vmem_free_pages(p4d_deref(*p4d), CRST_ALLOC_ORDER);
+ vmem_free_pages(p4d_deref(*p4d), CRST_ALLOC_ORDER, NULL);
p4d_clear(p4d);
}
static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end,
- bool add, bool direct)
+ bool add, bool direct, struct vmem_altmap *altmap)
{
unsigned long next;
int ret = -ENOMEM;
@@ -394,7 +395,7 @@ static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end,
goto out;
p4d_populate(&init_mm, p4d, pud);
}
- ret = modify_pud_table(p4d, addr, next, add, direct);
+ ret = modify_pud_table(p4d, addr, next, add, direct, altmap);
if (ret)
goto out;
if (!add)
@@ -415,12 +416,12 @@ static void try_free_p4d_table(pgd_t *pgd, unsigned long start)
if (!p4d_none(*p4d))
return;
}
- vmem_free_pages(pgd_deref(*pgd), CRST_ALLOC_ORDER);
+ vmem_free_pages(pgd_deref(*pgd), CRST_ALLOC_ORDER, NULL);
pgd_clear(pgd);
}
static int modify_pagetable(unsigned long start, unsigned long end, bool add,
- bool direct)
+ bool direct, struct vmem_altmap *altmap)
{
unsigned long addr, next;
int ret = -ENOMEM;
@@ -430,7 +431,7 @@ static int modify_pagetable(unsigned long start, unsigned long end, bool add,
if (WARN_ON_ONCE(!PAGE_ALIGNED(start | end)))
return -EINVAL;
/* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */
- if (WARN_ON_ONCE(end > VMALLOC_START))
+ if (WARN_ON_ONCE(end > __abs_lowcore))
return -EINVAL;
for (addr = start; addr < end; addr = next) {
next = pgd_addr_end(addr, end);
@@ -445,7 +446,7 @@ static int modify_pagetable(unsigned long start, unsigned long end, bool add,
goto out;
pgd_populate(&init_mm, pgd, p4d);
}
- ret = modify_p4d_table(pgd, addr, next, add, direct);
+ ret = modify_p4d_table(pgd, addr, next, add, direct, altmap);
if (ret)
goto out;
if (!add)
@@ -458,14 +459,16 @@ out:
return ret;
}
-static int add_pagetable(unsigned long start, unsigned long end, bool direct)
+static int add_pagetable(unsigned long start, unsigned long end, bool direct,
+ struct vmem_altmap *altmap)
{
- return modify_pagetable(start, end, true, direct);
+ return modify_pagetable(start, end, true, direct, altmap);
}
-static int remove_pagetable(unsigned long start, unsigned long end, bool direct)
+static int remove_pagetable(unsigned long start, unsigned long end, bool direct,
+ struct vmem_altmap *altmap)
{
- return modify_pagetable(start, end, false, direct);
+ return modify_pagetable(start, end, false, direct, altmap);
}
/*
@@ -474,7 +477,7 @@ static int remove_pagetable(unsigned long start, unsigned long end, bool direct)
static int vmem_add_range(unsigned long start, unsigned long size)
{
start = (unsigned long)__va(start);
- return add_pagetable(start, start + size, true);
+ return add_pagetable(start, start + size, true, NULL);
}
/*
@@ -483,7 +486,7 @@ static int vmem_add_range(unsigned long start, unsigned long size)
static void vmem_remove_range(unsigned long start, unsigned long size)
{
start = (unsigned long)__va(start);
- remove_pagetable(start, start + size, true);
+ remove_pagetable(start, start + size, true, NULL);
}
/*
@@ -496,9 +499,9 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
mutex_lock(&vmem_mutex);
/* We don't care about the node, just use NUMA_NO_NODE on allocations */
- ret = add_pagetable(start, end, false);
+ ret = add_pagetable(start, end, false, altmap);
if (ret)
- remove_pagetable(start, end, false);
+ remove_pagetable(start, end, false, altmap);
mutex_unlock(&vmem_mutex);
return ret;
}
@@ -509,7 +512,7 @@ void vmemmap_free(unsigned long start, unsigned long end,
struct vmem_altmap *altmap)
{
mutex_lock(&vmem_mutex);
- remove_pagetable(start, end, false);
+ remove_pagetable(start, end, false, altmap);
mutex_unlock(&vmem_mutex);
}
@@ -591,7 +594,7 @@ pte_t *vmem_get_alloc_pte(unsigned long addr, bool alloc)
if (!pmd)
goto out;
pud_populate(&init_mm, pud, pmd);
- } else if (WARN_ON_ONCE(pud_large(*pud))) {
+ } else if (WARN_ON_ONCE(pud_leaf(*pud))) {
goto out;
}
pmd = pmd_offset(pud, addr);
@@ -602,7 +605,7 @@ pte_t *vmem_get_alloc_pte(unsigned long addr, bool alloc)
if (!pte)
goto out;
pmd_populate(&init_mm, pmd, pte);
- } else if (WARN_ON_ONCE(pmd_large(*pmd))) {
+ } else if (WARN_ON_ONCE(pmd_leaf(*pmd))) {
goto out;
}
ptep = pte_offset_kernel(pmd, addr);
@@ -650,7 +653,6 @@ void __init vmem_map_init(void)
{
__set_memory_rox(_stext, _etext);
__set_memory_ro(_etext, __end_rodata);
- __set_memory_rox(_sinittext, _einittext);
__set_memory_rox(__stext_amode31, __etext_amode31);
/*
* If the BEAR-enhancement facility is not installed the first
@@ -659,16 +661,8 @@ void __init vmem_map_init(void)
*/
if (!static_key_enabled(&cpu_has_bear))
set_memory_x(0, 1);
- if (debug_pagealloc_enabled()) {
- /*
- * Use RELOC_HIDE() as long as __va(0) translates to NULL,
- * since performing pointer arithmetic on a NULL pointer
- * has undefined behavior and generates compiler warnings.
- */
- __set_memory_4k(__va(0), RELOC_HIDE(__va(0), ident_map_size));
- }
- if (MACHINE_HAS_NX)
- system_ctl_set_bit(0, CR0_INSTRUCTION_EXEC_PROTECTION_BIT);
+ if (debug_pagealloc_enabled())
+ __set_memory_4k(__va(0), absolute_pointer(__va(0)) + ident_map_size);
pr_info("Write protected kernel read-only data: %luk\n",
(unsigned long)(__end_rodata - _stext) >> 10);
}