Diffstat (limited to 'arch/s390/mm')
| Mode | File | Lines |
| --- | --- | --- |
| -rw-r--r-- | arch/s390/mm/Makefile | 7 |
| -rw-r--r-- | arch/s390/mm/cmm.c | 54 |
| -rw-r--r-- | arch/s390/mm/dump_pagetables.c | 287 |
| -rw-r--r-- | arch/s390/mm/extable.c | 96 |
| -rw-r--r-- | arch/s390/mm/extmem.c | 83 |
| -rw-r--r-- | arch/s390/mm/fault.c | 886 |
| -rw-r--r-- | arch/s390/mm/gmap.c | 1004 |
| -rw-r--r-- | arch/s390/mm/gmap_helpers.c | 233 |
| -rw-r--r-- | arch/s390/mm/hugetlbpage.c | 165 |
| -rw-r--r-- | arch/s390/mm/init.c | 195 |
| -rw-r--r-- | arch/s390/mm/kasan_init.c | 403 |
| -rw-r--r-- | arch/s390/mm/maccess.c | 48 |
| -rw-r--r-- | arch/s390/mm/mmap.c | 101 |
| -rw-r--r-- | arch/s390/mm/page-states.c | 211 |
| -rw-r--r-- | arch/s390/mm/pageattr.c | 140 |
| -rw-r--r-- | arch/s390/mm/pfault.c | 248 |
| -rw-r--r-- | arch/s390/mm/pgalloc.c | 381 |
| -rw-r--r-- | arch/s390/mm/pgtable.c | 195 |
| -rw-r--r-- | arch/s390/mm/physaddr.c | 15 |
| -rw-r--r-- | arch/s390/mm/vmem.c | 204 |
20 files changed, 1951 insertions, 3005 deletions
diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile index 57e4f3a24829..bd0401cc7ca5 100644 --- a/arch/s390/mm/Makefile +++ b/arch/s390/mm/Makefile @@ -7,9 +7,10 @@ obj-y := init.o fault.o extmem.o mmap.o vmem.o maccess.o obj-y += page-states.o pageattr.o pgtable.o pgalloc.o extable.o obj-$(CONFIG_CMM) += cmm.o +obj-$(CONFIG_DEBUG_VIRTUAL) += physaddr.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o -obj-$(CONFIG_PTDUMP_CORE) += dump_pagetables.o +obj-$(CONFIG_PTDUMP) += dump_pagetables.o obj-$(CONFIG_PGSTE) += gmap.o +obj-$(CONFIG_PFAULT) += pfault.o -KASAN_SANITIZE_kasan_init.o := n -obj-$(CONFIG_KASAN) += kasan_init.o +obj-$(subst m,y,$(CONFIG_KVM)) += gmap_helpers.o diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c index 9141ed4c52e9..eb7ef63fab1e 100644 --- a/arch/s390/mm/cmm.c +++ b/arch/s390/mm/cmm.c @@ -90,16 +90,17 @@ static long cmm_alloc_pages(long nr, long *counter, } else free_page((unsigned long) npa); } - diag10_range(virt_to_pfn(addr), 1); + diag10_range(virt_to_pfn((void *)addr), 1); pa->pages[pa->index++] = addr; (*counter)++; spin_unlock(&cmm_lock); nr--; + cond_resched(); } return nr; } -static long cmm_free_pages(long nr, long *counter, struct cmm_page_array **list) +static long __cmm_free_pages(long nr, long *counter, struct cmm_page_array **list) { struct cmm_page_array *pa; unsigned long addr; @@ -123,6 +124,21 @@ static long cmm_free_pages(long nr, long *counter, struct cmm_page_array **list) return nr; } +static long cmm_free_pages(long nr, long *counter, struct cmm_page_array **list) +{ + long inc = 0; + + while (nr) { + inc = min(256L, nr); + nr -= inc; + inc = __cmm_free_pages(inc, counter, list); + if (inc) + break; + cond_resched(); + } + return nr + inc; +} + static int cmm_oom_notify(struct notifier_block *self, unsigned long dummy, void *parm) { @@ -185,10 +201,10 @@ static void cmm_set_timer(void) { if (cmm_timed_pages_target <= 0 || cmm_timeout_seconds <= 0) { if (timer_pending(&cmm_timer)) - del_timer(&cmm_timer); + timer_delete(&cmm_timer); return; } - mod_timer(&cmm_timer, jiffies + msecs_to_jiffies(cmm_timeout_seconds * MSEC_PER_SEC)); + mod_timer(&cmm_timer, jiffies + secs_to_jiffies(cmm_timeout_seconds)); } static void cmm_timer_fn(struct timer_list *unused) @@ -243,7 +259,7 @@ static int cmm_skip_blanks(char *cp, char **endp) return str != cp; } -static int cmm_pages_handler(struct ctl_table *ctl, int write, +static int cmm_pages_handler(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { long nr = cmm_get_pages(); @@ -262,7 +278,7 @@ static int cmm_pages_handler(struct ctl_table *ctl, int write, return 0; } -static int cmm_timed_pages_handler(struct ctl_table *ctl, int write, +static int cmm_timed_pages_handler(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -282,7 +298,7 @@ static int cmm_timed_pages_handler(struct ctl_table *ctl, int write, return 0; } -static int cmm_timeout_handler(struct ctl_table *ctl, int write, +static int cmm_timeout_handler(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { char buf[64], *p; @@ -305,8 +321,8 @@ static int cmm_timeout_handler(struct ctl_table *ctl, int write, cmm_set_timeout(nr, seconds); *ppos += *lenp; } else { - len = sprintf(buf, "%ld %ld\n", - cmm_timeout_pages, cmm_timeout_seconds); + len = scnprintf(buf, sizeof(buf), "%ld %ld\n", + cmm_timeout_pages, cmm_timeout_seconds); if (len > *lenp) len = *lenp; memcpy(buffer, buf, len); @@ -316,7 +332,7 @@ static int cmm_timeout_handler(struct 
ctl_table *ctl, int write, return 0; } -static struct ctl_table cmm_table[] = { +static const struct ctl_table cmm_table[] = { { .procname = "cmm_pages", .mode = 0644, @@ -332,17 +348,6 @@ static struct ctl_table cmm_table[] = { .mode = 0644, .proc_handler = cmm_timeout_handler, }, - { } -}; - -static struct ctl_table cmm_dir_table[] = { - { - .procname = "vm", - .maxlen = 0, - .mode = 0555, - .child = cmm_table, - }, - { } }; #ifdef CONFIG_CMM_IUCV @@ -389,7 +394,7 @@ static int __init cmm_init(void) { int rc = -ENOMEM; - cmm_sysctl_header = register_sysctl_table(cmm_dir_table); + cmm_sysctl_header = register_sysctl("vm", cmm_table); if (!cmm_sysctl_header) goto out_sysctl; #ifdef CONFIG_CMM_IUCV @@ -419,7 +424,7 @@ out_smsg: #endif unregister_sysctl_table(cmm_sysctl_header); out_sysctl: - del_timer_sync(&cmm_timer); + timer_delete_sync(&cmm_timer); return rc; } module_init(cmm_init); @@ -432,10 +437,11 @@ static void __exit cmm_exit(void) #endif unregister_oom_notifier(&cmm_oom_nb); kthread_stop(cmm_thread_ptr); - del_timer_sync(&cmm_timer); + timer_delete_sync(&cmm_timer); cmm_free_pages(cmm_pages, &cmm_pages, &cmm_page_list); cmm_free_pages(cmm_timed_pages, &cmm_timed_pages, &cmm_timed_page_list); } module_exit(cmm_exit); +MODULE_DESCRIPTION("Cooperative memory management interface"); MODULE_LICENSE("GPL"); diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c index 9953819d7959..89badbe72ae7 100644 --- a/arch/s390/mm/dump_pagetables.c +++ b/arch/s390/mm/dump_pagetables.c @@ -1,12 +1,14 @@ // SPDX-License-Identifier: GPL-2.0 + +#include <linux/cpufeature.h> #include <linux/set_memory.h> #include <linux/ptdump.h> #include <linux/seq_file.h> #include <linux/debugfs.h> +#include <linux/sort.h> #include <linux/mm.h> #include <linux/kfence.h> #include <linux/kasan.h> -#include <asm/ptdump.h> #include <asm/kasan.h> #include <asm/abs_lowcore.h> #include <asm/nospec-branch.h> @@ -16,68 +18,14 @@ static unsigned long max_addr; struct addr_marker { + int is_start; unsigned long start_address; + unsigned long size; const char *name; }; -enum address_markers_idx { - IDENTITY_BEFORE_NR = 0, - IDENTITY_BEFORE_END_NR, - AMODE31_START_NR, - AMODE31_END_NR, - KERNEL_START_NR, - KERNEL_END_NR, -#ifdef CONFIG_KFENCE - KFENCE_START_NR, - KFENCE_END_NR, -#endif - IDENTITY_AFTER_NR, - IDENTITY_AFTER_END_NR, -#ifdef CONFIG_KASAN - KASAN_SHADOW_START_NR, - KASAN_SHADOW_END_NR, -#endif - VMEMMAP_NR, - VMEMMAP_END_NR, - VMALLOC_NR, - VMALLOC_END_NR, - MODULES_NR, - MODULES_END_NR, - ABS_LOWCORE_NR, - ABS_LOWCORE_END_NR, - MEMCPY_REAL_NR, - MEMCPY_REAL_END_NR, -}; - -static struct addr_marker address_markers[] = { - [IDENTITY_BEFORE_NR] = {0, "Identity Mapping Start"}, - [IDENTITY_BEFORE_END_NR] = {(unsigned long)_stext, "Identity Mapping End"}, - [AMODE31_START_NR] = {0, "Amode31 Area Start"}, - [AMODE31_END_NR] = {0, "Amode31 Area End"}, - [KERNEL_START_NR] = {(unsigned long)_stext, "Kernel Image Start"}, - [KERNEL_END_NR] = {(unsigned long)_end, "Kernel Image End"}, -#ifdef CONFIG_KFENCE - [KFENCE_START_NR] = {0, "KFence Pool Start"}, - [KFENCE_END_NR] = {0, "KFence Pool End"}, -#endif - [IDENTITY_AFTER_NR] = {(unsigned long)_end, "Identity Mapping Start"}, - [IDENTITY_AFTER_END_NR] = {0, "Identity Mapping End"}, -#ifdef CONFIG_KASAN - [KASAN_SHADOW_START_NR] = {KASAN_SHADOW_START, "Kasan Shadow Start"}, - [KASAN_SHADOW_END_NR] = {KASAN_SHADOW_END, "Kasan Shadow End"}, -#endif - [VMEMMAP_NR] = {0, "vmemmap Area Start"}, - [VMEMMAP_END_NR] = {0, "vmemmap Area End"}, - [VMALLOC_NR] = {0, 
"vmalloc Area Start"}, - [VMALLOC_END_NR] = {0, "vmalloc Area End"}, - [MODULES_NR] = {0, "Modules Area Start"}, - [MODULES_END_NR] = {0, "Modules Area End"}, - [ABS_LOWCORE_NR] = {0, "Lowcore Area Start"}, - [ABS_LOWCORE_END_NR] = {0, "Lowcore Area End"}, - [MEMCPY_REAL_NR] = {0, "Real Memory Copy Area Start"}, - [MEMCPY_REAL_END_NR] = {0, "Real Memory Copy Area End"}, - { -1, NULL } -}; +static struct addr_marker *markers; +static unsigned int markers_cnt; struct pg_state { struct ptdump_state ptdump; @@ -103,7 +51,7 @@ struct pg_state { struct seq_file *__m = (m); \ \ if (__m) \ - seq_printf(__m, fmt); \ + seq_puts(__m, fmt); \ }) static void print_prot(struct seq_file *m, unsigned int pr, int level) @@ -122,7 +70,6 @@ static void print_prot(struct seq_file *m, unsigned int pr, int level) static void note_prot_wx(struct pg_state *st, unsigned long addr) { -#ifdef CONFIG_DEBUG_WX if (!st->check_wx) return; if (st->current_prot & _PAGE_INVALID) @@ -137,12 +84,26 @@ static void note_prot_wx(struct pg_state *st, unsigned long addr) * in which case we have two lpswe instructions in lowcore that need * to be executable. */ - if (addr == PAGE_SIZE && (nospec_uses_trampoline() || !static_key_enabled(&cpu_has_bear))) + if (addr == PAGE_SIZE && (nospec_uses_trampoline() || !cpu_has_bear())) return; - WARN_ONCE(1, "s390/mm: Found insecure W+X mapping at address %pS\n", + WARN_ONCE(IS_ENABLED(CONFIG_DEBUG_WX), + "s390/mm: Found insecure W+X mapping at address %pS\n", (void *)st->start_address); st->wx_pages += (addr - st->start_address) / PAGE_SIZE; -#endif /* CONFIG_DEBUG_WX */ +} + +static void note_page_update_state(struct pg_state *st, unsigned long addr, unsigned int prot, int level) +{ + struct seq_file *m = st->seq; + + while (addr >= st->marker[1].start_address) { + st->marker++; + pt_dump_seq_printf(m, "---[ %s %s ]---\n", st->marker->name, + st->marker->is_start ? 
"Start" : "End"); + } + st->start_address = addr; + st->current_prot = prot; + st->level = level; } static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, u64 val) @@ -167,10 +128,8 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, if (level == -1) addr = max_addr; if (st->level == -1) { - pt_dump_seq_printf(m, "---[ %s ]---\n", st->marker->name); - st->start_address = addr; - st->current_prot = prot; - st->level = level; + pt_dump_seq_puts(m, "---[ Kernel Virtual Address Space ]---\n"); + note_page_update_state(st, addr, prot, level); } else if (prot != st->current_prot || level != st->level || addr >= st->marker[1].start_address) { note_prot_wx(st, addr); @@ -184,22 +143,52 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, } pt_dump_seq_printf(m, "%9lu%c ", delta, *unit); print_prot(m, st->current_prot, st->level); - while (addr >= st->marker[1].start_address) { - st->marker++; - pt_dump_seq_printf(m, "---[ %s ]---\n", st->marker->name); - } - st->start_address = addr; - st->current_prot = prot; - st->level = level; + note_page_update_state(st, addr, prot, level); } } -#ifdef CONFIG_DEBUG_WX -void ptdump_check_wx(void) +static void note_page_pte(struct ptdump_state *pt_st, unsigned long addr, pte_t pte) +{ + note_page(pt_st, addr, 4, pte_val(pte)); +} + +static void note_page_pmd(struct ptdump_state *pt_st, unsigned long addr, pmd_t pmd) +{ + note_page(pt_st, addr, 3, pmd_val(pmd)); +} + +static void note_page_pud(struct ptdump_state *pt_st, unsigned long addr, pud_t pud) +{ + note_page(pt_st, addr, 2, pud_val(pud)); +} + +static void note_page_p4d(struct ptdump_state *pt_st, unsigned long addr, p4d_t p4d) +{ + note_page(pt_st, addr, 1, p4d_val(p4d)); +} + +static void note_page_pgd(struct ptdump_state *pt_st, unsigned long addr, pgd_t pgd) +{ + note_page(pt_st, addr, 0, pgd_val(pgd)); +} + +static void note_page_flush(struct ptdump_state *pt_st) +{ + pte_t pte_zero = {0}; + + note_page(pt_st, 0, -1, pte_val(pte_zero)); +} + +bool ptdump_check_wx(void) { struct pg_state st = { .ptdump = { - .note_page = note_page, + .note_page_pte = note_page_pte, + .note_page_pmd = note_page_pmd, + .note_page_pud = note_page_pud, + .note_page_p4d = note_page_p4d, + .note_page_pgd = note_page_pgd, + .note_page_flush = note_page_flush, .range = (struct ptdump_range[]) { {.start = 0, .end = max_addr}, {.start = 0, .end = 0}, @@ -217,24 +206,33 @@ void ptdump_check_wx(void) }, }; - if (!MACHINE_HAS_NX) - return; + if (!cpu_has_nx()) + return true; ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); - if (st.wx_pages) + if (st.wx_pages) { pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found\n", st.wx_pages); - else + + return false; + } else { pr_info("Checked W+X mappings: passed, no %sW+X pages found\n", - (nospec_uses_trampoline() || !static_key_enabled(&cpu_has_bear)) ? + (nospec_uses_trampoline() || !cpu_has_bear()) ? 
"unexpected " : ""); + + return true; + } } -#endif /* CONFIG_DEBUG_WX */ #ifdef CONFIG_PTDUMP_DEBUGFS static int ptdump_show(struct seq_file *m, void *v) { struct pg_state st = { .ptdump = { - .note_page = note_page, + .note_page_pte = note_page_pte, + .note_page_pmd = note_page_pmd, + .note_page_pud = note_page_pud, + .note_page_p4d = note_page_p4d, + .note_page_pgd = note_page_pgd, + .note_page_flush = note_page_flush, .range = (struct ptdump_range[]) { {.start = 0, .end = max_addr}, {.start = 0, .end = 0}, @@ -246,35 +244,72 @@ static int ptdump_show(struct seq_file *m, void *v) .check_wx = false, .wx_pages = 0, .start_address = 0, - .marker = address_markers, + .marker = markers, }; - get_online_mems(); mutex_lock(&cpa_mutex); ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); mutex_unlock(&cpa_mutex); - put_online_mems(); return 0; } DEFINE_SHOW_ATTRIBUTE(ptdump); #endif /* CONFIG_PTDUMP_DEBUGFS */ -/* - * Heapsort from lib/sort.c is not a stable sorting algorithm, do a simple - * insertion sort to preserve the original order of markers with the same - * start address. - */ -static void sort_address_markers(void) +static int ptdump_cmp(const void *a, const void *b) { - struct addr_marker tmp; - int i, j; - - for (i = 1; i < ARRAY_SIZE(address_markers) - 1; i++) { - tmp = address_markers[i]; - for (j = i - 1; j >= 0 && address_markers[j].start_address > tmp.start_address; j--) - address_markers[j + 1] = address_markers[j]; - address_markers[j + 1] = tmp; + const struct addr_marker *ama = a; + const struct addr_marker *amb = b; + + if (ama->start_address > amb->start_address) + return 1; + if (ama->start_address < amb->start_address) + return -1; + /* + * If the start addresses of two markers are identical sort markers in an + * order that considers areas contained within other areas correctly. + */ + if (ama->is_start && amb->is_start) { + if (ama->size > amb->size) + return -1; + if (ama->size < amb->size) + return 1; + return 0; } + if (!ama->is_start && !amb->is_start) { + if (ama->size > amb->size) + return 1; + if (ama->size < amb->size) + return -1; + return 0; + } + if (ama->is_start) + return 1; + if (amb->is_start) + return -1; + return 0; +} + +static int add_marker(unsigned long start, unsigned long end, const char *name) +{ + struct addr_marker *new; + size_t newsize; + + newsize = (markers_cnt + 2) * sizeof(*markers); + new = kvrealloc(markers, newsize, GFP_KERNEL); + if (!new) + return -ENOMEM; + markers = new; + markers[markers_cnt].is_start = 1; + markers[markers_cnt].start_address = start; + markers[markers_cnt].size = end - start; + markers[markers_cnt].name = name; + markers_cnt++; + markers[markers_cnt].is_start = 0; + markers[markers_cnt].start_address = end; + markers[markers_cnt].size = end - start; + markers[markers_cnt].name = name; + markers_cnt++; + return 0; } static int pt_dump_init(void) @@ -282,34 +317,48 @@ static int pt_dump_init(void) #ifdef CONFIG_KFENCE unsigned long kfence_start = (unsigned long)__kfence_pool; #endif + unsigned long lowcore = (unsigned long)get_lowcore(); + int rc; + /* * Figure out the maximum virtual address being accessible with the * kernel ASCE. We need this to keep the page table walker functions * from accessing non-existent entries. 
*/ - max_addr = (S390_lowcore.kernel_asce & _REGION_ENTRY_TYPE_MASK) >> 2; + max_addr = (get_lowcore()->kernel_asce.val & _REGION_ENTRY_TYPE_MASK) >> 2; max_addr = 1UL << (max_addr * 11 + 31); - address_markers[IDENTITY_AFTER_END_NR].start_address = ident_map_size; - address_markers[AMODE31_START_NR].start_address = __samode31; - address_markers[AMODE31_END_NR].start_address = __eamode31; - address_markers[MODULES_NR].start_address = MODULES_VADDR; - address_markers[MODULES_END_NR].start_address = MODULES_END; - address_markers[ABS_LOWCORE_NR].start_address = __abs_lowcore; - address_markers[ABS_LOWCORE_END_NR].start_address = __abs_lowcore + ABS_LOWCORE_MAP_SIZE; - address_markers[MEMCPY_REAL_NR].start_address = __memcpy_real_area; - address_markers[MEMCPY_REAL_END_NR].start_address = __memcpy_real_area + PAGE_SIZE; - address_markers[VMEMMAP_NR].start_address = (unsigned long) vmemmap; - address_markers[VMEMMAP_END_NR].start_address = (unsigned long)vmemmap + vmemmap_size; - address_markers[VMALLOC_NR].start_address = VMALLOC_START; - address_markers[VMALLOC_END_NR].start_address = VMALLOC_END; + /* start + end markers - must be added first */ + rc = add_marker(0, -1UL, NULL); + rc |= add_marker((unsigned long)_stext, (unsigned long)_end, "Kernel Image"); + rc |= add_marker(lowcore, lowcore + sizeof(struct lowcore), "Lowcore"); + rc |= add_marker(__identity_base, __identity_base + ident_map_size, "Identity Mapping"); + rc |= add_marker((unsigned long)__samode31, (unsigned long)__eamode31, "Amode31 Area"); + rc |= add_marker(MODULES_VADDR, MODULES_END, "Modules Area"); + rc |= add_marker(__abs_lowcore, __abs_lowcore + ABS_LOWCORE_MAP_SIZE, "Lowcore Area"); + rc |= add_marker(__memcpy_real_area, __memcpy_real_area + MEMCPY_REAL_SIZE, "Real Memory Copy Area"); + rc |= add_marker((unsigned long)vmemmap, (unsigned long)vmemmap + vmemmap_size, "vmemmap Area"); + rc |= add_marker(VMALLOC_START, VMALLOC_END, "vmalloc Area"); #ifdef CONFIG_KFENCE - address_markers[KFENCE_START_NR].start_address = kfence_start; - address_markers[KFENCE_END_NR].start_address = kfence_start + KFENCE_POOL_SIZE; + rc |= add_marker(kfence_start, kfence_start + KFENCE_POOL_SIZE, "KFence Pool"); +#endif +#ifdef CONFIG_KMSAN + rc |= add_marker(KMSAN_VMALLOC_SHADOW_START, KMSAN_VMALLOC_SHADOW_END, "Kmsan vmalloc Shadow"); + rc |= add_marker(KMSAN_VMALLOC_ORIGIN_START, KMSAN_VMALLOC_ORIGIN_END, "Kmsan vmalloc Origins"); + rc |= add_marker(KMSAN_MODULES_SHADOW_START, KMSAN_MODULES_SHADOW_END, "Kmsan Modules Shadow"); + rc |= add_marker(KMSAN_MODULES_ORIGIN_START, KMSAN_MODULES_ORIGIN_END, "Kmsan Modules Origins"); +#endif +#ifdef CONFIG_KASAN + rc |= add_marker(KASAN_SHADOW_START, KASAN_SHADOW_END, "Kasan Shadow"); #endif - sort_address_markers(); + if (rc) + goto error; + sort(&markers[1], markers_cnt - 1, sizeof(*markers), ptdump_cmp, NULL); #ifdef CONFIG_PTDUMP_DEBUGFS debugfs_create_file("kernel_page_tables", 0400, NULL, NULL, &ptdump_fops); #endif /* CONFIG_PTDUMP_DEBUGFS */ return 0; +error: + kvfree(markers); + return -ENOMEM; } device_initcall(pt_dump_init); diff --git a/arch/s390/mm/extable.c b/arch/s390/mm/extable.c index 1e4d2187541a..7498e858c401 100644 --- a/arch/s390/mm/extable.c +++ b/arch/s390/mm/extable.c @@ -7,6 +7,7 @@ #include <linux/panic.h> #include <asm/asm-extable.h> #include <asm/extable.h> +#include <asm/fpu.h> const struct exception_table_entry *s390_search_extables(unsigned long addr) { @@ -26,7 +27,7 @@ static bool ex_handler_fixup(const struct exception_table_entry *ex, struct pt_r return true; } 
-static bool ex_handler_ua_store(const struct exception_table_entry *ex, struct pt_regs *regs) +static bool ex_handler_ua_fault(const struct exception_table_entry *ex, struct pt_regs *regs) { unsigned int reg_err = FIELD_GET(EX_DATA_REG_ERR, ex->data); @@ -35,26 +36,83 @@ static bool ex_handler_ua_store(const struct exception_table_entry *ex, struct p return true; } -static bool ex_handler_ua_load_mem(const struct exception_table_entry *ex, struct pt_regs *regs) +static bool ex_handler_ua_load_reg(const struct exception_table_entry *ex, + bool pair, struct pt_regs *regs) { - unsigned int reg_addr = FIELD_GET(EX_DATA_REG_ADDR, ex->data); + unsigned int reg_zero = FIELD_GET(EX_DATA_REG_ADDR, ex->data); unsigned int reg_err = FIELD_GET(EX_DATA_REG_ERR, ex->data); - size_t len = FIELD_GET(EX_DATA_LEN, ex->data); regs->gprs[reg_err] = -EFAULT; - memset((void *)regs->gprs[reg_addr], 0, len); + regs->gprs[reg_zero] = 0; + if (pair) + regs->gprs[reg_zero + 1] = 0; regs->psw.addr = extable_fixup(ex); return true; } -static bool ex_handler_ua_load_reg(const struct exception_table_entry *ex, struct pt_regs *regs) +static bool ex_handler_zeropad(const struct exception_table_entry *ex, struct pt_regs *regs) { - unsigned int reg_zero = FIELD_GET(EX_DATA_REG_ADDR, ex->data); - unsigned int reg_err = FIELD_GET(EX_DATA_REG_ERR, ex->data); + unsigned int reg_addr = FIELD_GET(EX_DATA_REG_ADDR, ex->data); + unsigned int reg_data = FIELD_GET(EX_DATA_REG_ERR, ex->data); + unsigned long data, addr, offset; - regs->gprs[reg_err] = -EFAULT; - regs->gprs[reg_zero] = 0; + addr = regs->gprs[reg_addr]; + offset = addr & (sizeof(unsigned long) - 1); + addr &= ~(sizeof(unsigned long) - 1); + data = *(unsigned long *)addr; + data <<= BITS_PER_BYTE * offset; + regs->gprs[reg_data] = data; + regs->psw.addr = extable_fixup(ex); + return true; +} + +static bool ex_handler_fpc(const struct exception_table_entry *ex, struct pt_regs *regs) +{ + fpu_sfpc(0); + regs->psw.addr = extable_fixup(ex); + return true; +} + +struct insn_ssf { + u64 opc1 : 8; + u64 r3 : 4; + u64 opc2 : 4; + u64 b1 : 4; + u64 d1 : 12; + u64 b2 : 4; + u64 d2 : 12; +} __packed; + +static bool ex_handler_ua_mvcos(const struct exception_table_entry *ex, + bool from, struct pt_regs *regs) +{ + unsigned long uaddr, remainder; + struct insn_ssf *insn; + + /* + * If the faulting user space access crossed a page boundary retry by + * limiting the access to the first page (adjust length accordingly). + * Then the mvcos instruction will either complete with condition code + * zero, or generate another fault where the user space access did not + * cross a page boundary. + * If the faulting user space access did not cross a page boundary set + * length to zero and retry. In this case no user space access will + * happen, and the mvcos instruction will complete with condition code + * zero. + * In both cases the instruction will complete with condition code + * zero (copying finished), and the register which contains the + * length, indicates the number of bytes copied. 
+ */ regs->psw.addr = extable_fixup(ex); + insn = (struct insn_ssf *)regs->psw.addr; + if (from) + uaddr = regs->gprs[insn->b2] + insn->d2; + else + uaddr = regs->gprs[insn->b1] + insn->d1; + remainder = PAGE_SIZE - (uaddr & (PAGE_SIZE - 1)); + if (regs->gprs[insn->r3] <= remainder) + remainder = 0; + regs->gprs[insn->r3] = remainder; return true; } @@ -70,12 +128,20 @@ bool fixup_exception(struct pt_regs *regs) return ex_handler_fixup(ex, regs); case EX_TYPE_BPF: return ex_handler_bpf(ex, regs); - case EX_TYPE_UA_STORE: - return ex_handler_ua_store(ex, regs); - case EX_TYPE_UA_LOAD_MEM: - return ex_handler_ua_load_mem(ex, regs); + case EX_TYPE_UA_FAULT: + return ex_handler_ua_fault(ex, regs); case EX_TYPE_UA_LOAD_REG: - return ex_handler_ua_load_reg(ex, regs); + return ex_handler_ua_load_reg(ex, false, regs); + case EX_TYPE_UA_LOAD_REGPAIR: + return ex_handler_ua_load_reg(ex, true, regs); + case EX_TYPE_ZEROPAD: + return ex_handler_zeropad(ex, regs); + case EX_TYPE_FPC: + return ex_handler_fpc(ex, regs); + case EX_TYPE_UA_MVCOS_TO: + return ex_handler_ua_mvcos(ex, false, regs); + case EX_TYPE_UA_MVCOS_FROM: + return ex_handler_ua_mvcos(ex, true, regs); } panic("invalid exception table entry"); } diff --git a/arch/s390/mm/extmem.c b/arch/s390/mm/extmem.c index 5060956b8e7d..6cc33c705de2 100644 --- a/arch/s390/mm/extmem.c +++ b/arch/s390/mm/extmem.c @@ -7,8 +7,7 @@ * Copyright IBM Corp. 2002, 2004 */ -#define KMSG_COMPONENT "extmem" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "extmem: " fmt #include <linux/kernel.h> #include <linux/string.h> @@ -21,6 +20,7 @@ #include <linux/ioport.h> #include <linux/refcount.h> #include <linux/pgtable.h> +#include <asm/machine.h> #include <asm/diag.h> #include <asm/page.h> #include <asm/ebcdic.h> @@ -28,6 +28,7 @@ #include <asm/extmem.h> #include <asm/cpcmd.h> #include <asm/setup.h> +#include <asm/asm.h> #define DCSS_PURGESEG 0x08 #define DCSS_LOADSHRX 0x20 @@ -134,20 +135,21 @@ dcss_diag(int *func, void *parameter, unsigned long *ret1, unsigned long *ret2) { unsigned long rx, ry; - int rc; + int cc; - rx = (unsigned long) parameter; + rx = virt_to_phys(parameter); ry = (unsigned long) *func; diag_stat_inc(DIAG_STAT_X064); asm volatile( - " diag %0,%1,0x64\n" - " ipm %2\n" - " srl %2,28\n" - : "+d" (rx), "+d" (ry), "=d" (rc) : : "cc"); + " diag %[rx],%[ry],0x64\n" + CC_IPM(cc) + : CC_OUT(cc, cc), [rx] "+d" (rx), [ry] "+d" (ry) + : + : CC_CLOBBER); *ret1 = rx; *ret2 = ry; - return rc; + return CC_TRANSFORM(cc); } static inline int @@ -178,7 +180,7 @@ query_segment_type (struct dcss_segment *seg) /* initialize diag input parameters */ qin->qopcode = DCSS_FINDSEGA; - qin->qoutptr = (unsigned long) qout; + qin->qoutptr = virt_to_phys(qout); qin->qoutlen = sizeof(struct qout64); memcpy (qin->qname, seg->dcss_name, 8); @@ -253,7 +255,7 @@ segment_type (char* name) int rc; struct dcss_segment seg; - if (!MACHINE_IS_VM) + if (!machine_is_vm()) return -ENOSYS; dcss_mkname(name, seg.dcss_name); @@ -289,15 +291,17 @@ segment_overlaps_others (struct dcss_segment *seg) /* * real segment loading function, called from segment_load + * Must return either an error code < 0, or the segment type code >= 0 */ static int __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long *end) { unsigned long start_addr, end_addr, dummy; struct dcss_segment *seg; - int rc, diag_cc; + int rc, diag_cc, segtype; start_addr = end_addr = 0; + segtype = -1; seg = kmalloc(sizeof(*seg), GFP_KERNEL | GFP_DMA); if (seg == NULL) { rc = -ENOMEM; @@ 
-326,9 +330,9 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long seg->res_name[8] = '\0'; strlcat(seg->res_name, " (DCSS)", sizeof(seg->res_name)); seg->res->name = seg->res_name; - rc = seg->vm_segtype; - if (rc == SEG_TYPE_SC || - ((rc == SEG_TYPE_SR || rc == SEG_TYPE_ER) && !do_nonshared)) + segtype = seg->vm_segtype; + if (segtype == SEG_TYPE_SC || + ((segtype == SEG_TYPE_SR || segtype == SEG_TYPE_ER) && !do_nonshared)) seg->res->flags |= IORESOURCE_READONLY; /* Check for overlapping resources before adding the mapping. */ @@ -386,7 +390,7 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long out_free: kfree(seg); out: - return rc; + return rc < 0 ? rc : segtype; } /* @@ -414,7 +418,7 @@ segment_load (char *name, int do_nonshared, unsigned long *addr, struct dcss_segment *seg; int rc; - if (!MACHINE_IS_VM) + if (!machine_is_vm()) return -ENOSYS; mutex_lock(&dcss_lock); @@ -525,6 +529,14 @@ segment_modify_shared (char *name, int do_nonshared) return rc; } +static void __dcss_diag_purge_on_cpu_0(void *data) +{ + struct dcss_segment *seg = (struct dcss_segment *)data; + unsigned long dummy; + + dcss_diag(&purgeseg_scode, seg->dcss_name, &dummy, &dummy); +} + /* * Decrease the use count of a DCSS segment and remove * it from the address space if nobody is using it @@ -533,10 +545,9 @@ segment_modify_shared (char *name, int do_nonshared) void segment_unload(char *name) { - unsigned long dummy; struct dcss_segment *seg; - if (!MACHINE_IS_VM) + if (!machine_is_vm()) return; mutex_lock(&dcss_lock); @@ -551,7 +562,14 @@ segment_unload(char *name) kfree(seg->res); vmem_remove_mapping(seg->start_addr, seg->end - seg->start_addr + 1); list_del(&seg->list); - dcss_diag(&purgeseg_scode, seg->dcss_name, &dummy, &dummy); + /* + * Workaround for z/VM issue, where calling the DCSS unload diag on + * a non-IPL CPU would cause bogus sclp maximum memory detection on + * next IPL. + * IPL CPU 0 cannot be set offline, so the dcss_diag() call can + * directly be scheduled to that CPU. 
+ */ + smp_call_function_single(0, __dcss_diag_purge_on_cpu_0, seg, 1); kfree(seg); out_unlock: mutex_unlock(&dcss_lock); @@ -568,7 +586,7 @@ segment_save(char *name) char cmd2[80]; int i, response; - if (!MACHINE_IS_VM) + if (!machine_is_vm()) return; mutex_lock(&dcss_lock); @@ -579,14 +597,16 @@ segment_save(char *name) goto out; } - sprintf(cmd1, "DEFSEG %s", name); + snprintf(cmd1, sizeof(cmd1), "DEFSEG %s", name); for (i=0; i<seg->segcnt; i++) { - sprintf(cmd1+strlen(cmd1), " %lX-%lX %s", - seg->range[i].start >> PAGE_SHIFT, - seg->range[i].end >> PAGE_SHIFT, - segtype_string[seg->range[i].start & 0xff]); + size_t len = strlen(cmd1); + + snprintf(cmd1 + len, sizeof(cmd1) - len, " %lX-%lX %s", + seg->range[i].start >> PAGE_SHIFT, + seg->range[i].end >> PAGE_SHIFT, + segtype_string[seg->range[i].start & 0xff]); } - sprintf(cmd2, "SAVESEG %s", name); + snprintf(cmd2, sizeof(cmd2), "SAVESEG %s", name); response = 0; cpcmd(cmd1, NULL, 0, &response); if (response) { @@ -638,10 +658,13 @@ void segment_warning(int rc, char *seg_name) pr_err("There is not enough memory to load or query " "DCSS %s\n", seg_name); break; - case -ERANGE: - pr_err("DCSS %s exceeds the kernel mapping range (%lu) " - "and cannot be loaded\n", seg_name, VMEM_MAX_PHYS); + case -ERANGE: { + struct range mhp_range = arch_get_mappable_range(); + + pr_err("DCSS %s exceeds the kernel mapping range (%llu) " + "and cannot be loaded\n", seg_name, mhp_range.end + 1); break; + } default: break; } diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 9649d9382e0a..e2e13778c36a 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -3,13 +3,15 @@ * S390 version * Copyright IBM Corp. 1999 * Author(s): Hartmut Penner (hp@de.ibm.com) - * Ulrich Weigand (uweigand@de.ibm.com) + * Ulrich Weigand (uweigand@de.ibm.com) * * Derived from "arch/i386/mm/fault.c" * Copyright (C) 1995 Linus Torvalds */ #include <linux/kernel_stat.h> +#include <linux/mmu_context.h> +#include <linux/cpufeature.h> #include <linux/perf_event.h> #include <linux/signal.h> #include <linux/sched.h> @@ -21,7 +23,6 @@ #include <linux/ptrace.h> #include <linux/mman.h> #include <linux/mm.h> -#include <linux/compat.h> #include <linux/smp.h> #include <linux/kdebug.h> #include <linux/init.h> @@ -32,123 +33,93 @@ #include <linux/uaccess.h> #include <linux/hugetlb.h> #include <linux/kfence.h> +#include <linux/pagewalk.h> #include <asm/asm-extable.h> #include <asm/asm-offsets.h> +#include <asm/ptrace.h> +#include <asm/fault.h> #include <asm/diag.h> -#include <asm/gmap.h> #include <asm/irq.h> -#include <asm/mmu_context.h> #include <asm/facility.h> #include <asm/uv.h> #include "../kernel/entry.h" -#define __FAIL_ADDR_MASK -4096L -#define __SUBCODE_MASK 0x0600 -#define __PF_RES_FIELD 0x8000000000000000ULL - -#define VM_FAULT_BADCONTEXT ((__force vm_fault_t) 0x010000) -#define VM_FAULT_BADMAP ((__force vm_fault_t) 0x020000) -#define VM_FAULT_BADACCESS ((__force vm_fault_t) 0x040000) -#define VM_FAULT_SIGNAL ((__force vm_fault_t) 0x080000) -#define VM_FAULT_PFAULT ((__force vm_fault_t) 0x100000) - -enum fault_type { - KERNEL_FAULT, - USER_FAULT, - GMAP_FAULT, -}; - -static unsigned long store_indication __read_mostly; - -static int __init fault_init(void) -{ - if (test_facility(75)) - store_indication = 0xc00; - return 0; -} -early_initcall(fault_init); - /* * Find out which address space caused the exception. 
*/ -static enum fault_type get_fault_type(struct pt_regs *regs) +static bool is_kernel_fault(struct pt_regs *regs) { - unsigned long trans_exc_code; + union teid teid = { .val = regs->int_parm_long }; - trans_exc_code = regs->int_parm_long & 3; - if (likely(trans_exc_code == 0)) { - /* primary space exception */ - if (user_mode(regs)) - return USER_FAULT; - if (!IS_ENABLED(CONFIG_PGSTE)) - return KERNEL_FAULT; - if (test_pt_regs_flag(regs, PIF_GUEST_FAULT)) - return GMAP_FAULT; - return KERNEL_FAULT; - } - if (trans_exc_code == 2) - return USER_FAULT; - if (trans_exc_code == 1) { - /* access register mode, not used in the kernel */ - return USER_FAULT; - } - /* home space exception -> access via kernel ASCE */ - return KERNEL_FAULT; + if (user_mode(regs)) + return false; + if (teid.as == PSW_BITS_AS_SECONDARY) + return false; + return true; } -static int bad_address(void *p) +static unsigned long get_fault_address(struct pt_regs *regs) { - unsigned long dummy; + union teid teid = { .val = regs->int_parm_long }; - return get_kernel_nofault(dummy, (unsigned long *)p); + return teid.addr * PAGE_SIZE; +} + +static __always_inline bool fault_is_write(struct pt_regs *regs) +{ + union teid teid = { .val = regs->int_parm_long }; + + if (test_facility(75)) + return teid.fsi == TEID_FSI_STORE; + return false; } static void dump_pagetable(unsigned long asce, unsigned long address) { - unsigned long *table = __va(asce & _ASCE_ORIGIN); + unsigned long entry, *table = __va(asce & _ASCE_ORIGIN); pr_alert("AS:%016lx ", asce); switch (asce & _ASCE_TYPE_MASK) { case _ASCE_TYPE_REGION1: table += (address & _REGION1_INDEX) >> _REGION1_SHIFT; - if (bad_address(table)) + if (get_kernel_nofault(entry, table)) goto bad; - pr_cont("R1:%016lx ", *table); - if (*table & _REGION_ENTRY_INVALID) + pr_cont("R1:%016lx ", entry); + if (entry & _REGION_ENTRY_INVALID) goto out; - table = __va(*table & _REGION_ENTRY_ORIGIN); + table = __va(entry & _REGION_ENTRY_ORIGIN); fallthrough; case _ASCE_TYPE_REGION2: table += (address & _REGION2_INDEX) >> _REGION2_SHIFT; - if (bad_address(table)) + if (get_kernel_nofault(entry, table)) goto bad; - pr_cont("R2:%016lx ", *table); - if (*table & _REGION_ENTRY_INVALID) + pr_cont("R2:%016lx ", entry); + if (entry & _REGION_ENTRY_INVALID) goto out; - table = __va(*table & _REGION_ENTRY_ORIGIN); + table = __va(entry & _REGION_ENTRY_ORIGIN); fallthrough; case _ASCE_TYPE_REGION3: table += (address & _REGION3_INDEX) >> _REGION3_SHIFT; - if (bad_address(table)) + if (get_kernel_nofault(entry, table)) goto bad; - pr_cont("R3:%016lx ", *table); - if (*table & (_REGION_ENTRY_INVALID | _REGION3_ENTRY_LARGE)) + pr_cont("R3:%016lx ", entry); + if (entry & (_REGION_ENTRY_INVALID | _REGION3_ENTRY_LARGE)) goto out; - table = __va(*table & _REGION_ENTRY_ORIGIN); + table = __va(entry & _REGION_ENTRY_ORIGIN); fallthrough; case _ASCE_TYPE_SEGMENT: table += (address & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; - if (bad_address(table)) + if (get_kernel_nofault(entry, table)) goto bad; - pr_cont("S:%016lx ", *table); - if (*table & (_SEGMENT_ENTRY_INVALID | _SEGMENT_ENTRY_LARGE)) + pr_cont("S:%016lx ", entry); + if (entry & (_SEGMENT_ENTRY_INVALID | _SEGMENT_ENTRY_LARGE)) goto out; - table = __va(*table & _SEGMENT_ENTRY_ORIGIN); + table = __va(entry & _SEGMENT_ENTRY_ORIGIN); } - table += (address & _PAGE_INDEX) >> _PAGE_SHIFT; - if (bad_address(table)) + table += (address & _PAGE_INDEX) >> PAGE_SHIFT; + if (get_kernel_nofault(entry, table)) goto bad; - pr_cont("P:%016lx ", *table); + pr_cont("P:%016lx ", entry); out: 
pr_cont("\n"); return; @@ -158,162 +129,127 @@ bad: static void dump_fault_info(struct pt_regs *regs) { + union teid teid = { .val = regs->int_parm_long }; unsigned long asce; - pr_alert("Failing address: %016lx TEID: %016lx\n", - regs->int_parm_long & __FAIL_ADDR_MASK, regs->int_parm_long); + pr_alert("Failing address: %016lx TEID: %016lx", + get_fault_address(regs), teid.val); + if (test_facility(131)) + pr_cont(" ESOP-2"); + else if (machine_has_esop()) + pr_cont(" ESOP-1"); + else + pr_cont(" SOP"); + if (test_facility(75)) + pr_cont(" FSI"); + pr_cont("\n"); pr_alert("Fault in "); - switch (regs->int_parm_long & 3) { - case 3: + switch (teid.as) { + case PSW_BITS_AS_HOME: pr_cont("home space "); break; - case 2: + case PSW_BITS_AS_SECONDARY: pr_cont("secondary space "); break; - case 1: + case PSW_BITS_AS_ACCREG: pr_cont("access register "); break; - case 0: + case PSW_BITS_AS_PRIMARY: pr_cont("primary space "); break; } pr_cont("mode while using "); - switch (get_fault_type(regs)) { - case USER_FAULT: - asce = S390_lowcore.user_asce; - pr_cont("user "); - break; - case GMAP_FAULT: - asce = ((struct gmap *) S390_lowcore.gmap)->asce; - pr_cont("gmap "); - break; - case KERNEL_FAULT: - asce = S390_lowcore.kernel_asce; + if (is_kernel_fault(regs)) { + asce = get_lowcore()->kernel_asce.val; pr_cont("kernel "); - break; - default: - unreachable(); + } else { + asce = get_lowcore()->user_asce.val; + pr_cont("user "); } pr_cont("ASCE.\n"); - dump_pagetable(asce, regs->int_parm_long & __FAIL_ADDR_MASK); + dump_pagetable(asce, get_fault_address(regs)); } int show_unhandled_signals = 1; +static const struct ctl_table s390_fault_sysctl_table[] = { + { + .procname = "userprocess_debug", + .data = &show_unhandled_signals, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +}; + +static int __init init_s390_fault_sysctls(void) +{ + register_sysctl_init("kernel", s390_fault_sysctl_table); + return 0; +} +arch_initcall(init_s390_fault_sysctls); + void report_user_fault(struct pt_regs *regs, long signr, int is_mm_fault) { + static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); + if ((task_pid_nr(current) > 1) && !show_unhandled_signals) return; if (!unhandled_signal(current, signr)) return; - if (!printk_ratelimit()) + if (!__ratelimit(&rs)) return; - printk(KERN_ALERT "User process fault: interruption code %04x ilc:%d ", - regs->int_code & 0xffff, regs->int_code >> 17); + pr_alert("User process fault: interruption code %04x ilc:%d ", + regs->int_code & 0xffff, regs->int_code >> 17); print_vma_addr(KERN_CONT "in ", regs->psw.addr); - printk(KERN_CONT "\n"); + pr_cont("\n"); if (is_mm_fault) dump_fault_info(regs); show_regs(regs); } -/* - * Send SIGSEGV to task. This is an external routine - * to keep the stack usage of do_page_fault small. - */ -static noinline void do_sigsegv(struct pt_regs *regs, int si_code) +static void do_sigsegv(struct pt_regs *regs, int si_code) { report_user_fault(regs, SIGSEGV, 1); - force_sig_fault(SIGSEGV, si_code, - (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK)); + force_sig_fault(SIGSEGV, si_code, (void __user *)get_fault_address(regs)); } -static noinline void do_no_context(struct pt_regs *regs) +static void handle_fault_error_nolock(struct pt_regs *regs, int si_code) { + unsigned long address; + bool is_write; + + if (user_mode(regs)) { + if (WARN_ON_ONCE(!si_code)) + si_code = SEGV_MAPERR; + return do_sigsegv(regs, si_code); + } if (fixup_exception(regs)) return; - /* - * Oops. 
The kernel tried to access some bad page. We'll have to - * terminate things with extreme prejudice. - */ - if (get_fault_type(regs) == KERNEL_FAULT) - printk(KERN_ALERT "Unable to handle kernel pointer dereference" - " in virtual kernel address space\n"); - else - printk(KERN_ALERT "Unable to handle kernel paging request" - " in virtual user address space\n"); + if (is_kernel_fault(regs)) { + address = get_fault_address(regs); + is_write = fault_is_write(regs); + if (kfence_handle_page_fault(address, is_write, regs)) + return; + pr_alert("Unable to handle kernel pointer dereference in virtual kernel address space\n"); + } else { + pr_alert("Unable to handle kernel paging request in virtual user address space\n"); + } dump_fault_info(regs); die(regs, "Oops"); } -static noinline void do_low_address(struct pt_regs *regs) +static void handle_fault_error(struct pt_regs *regs, int si_code) { - /* Low-address protection hit in kernel mode means - NULL pointer write access in kernel mode. */ - if (regs->psw.mask & PSW_MASK_PSTATE) { - /* Low-address protection hit in user mode 'cannot happen'. */ - die (regs, "Low-address protection"); - } - - do_no_context(regs); -} + struct mm_struct *mm = current->mm; -static noinline void do_sigbus(struct pt_regs *regs) -{ - /* - * Send a sigbus, regardless of whether we were in kernel - * or user mode. - */ - force_sig_fault(SIGBUS, BUS_ADRERR, - (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK)); + mmap_read_unlock(mm); + handle_fault_error_nolock(regs, si_code); } -static noinline void do_fault_error(struct pt_regs *regs, vm_fault_t fault) +static void do_sigbus(struct pt_regs *regs) { - int si_code; - - switch (fault) { - case VM_FAULT_BADACCESS: - case VM_FAULT_BADMAP: - /* Bad memory access. Check if it is kernel or user space. */ - if (user_mode(regs)) { - /* User mode accesses just cause a SIGSEGV */ - si_code = (fault == VM_FAULT_BADMAP) ? - SEGV_MAPERR : SEGV_ACCERR; - do_sigsegv(regs, si_code); - break; - } - fallthrough; - case VM_FAULT_BADCONTEXT: - case VM_FAULT_PFAULT: - do_no_context(regs); - break; - case VM_FAULT_SIGNAL: - if (!user_mode(regs)) - do_no_context(regs); - break; - default: /* fault & VM_FAULT_ERROR */ - if (fault & VM_FAULT_OOM) { - if (!user_mode(regs)) - do_no_context(regs); - else - pagefault_out_of_memory(); - } else if (fault & VM_FAULT_SIGSEGV) { - /* Kernel mode? Handle exceptions or die */ - if (!user_mode(regs)) - do_no_context(regs); - else - do_sigsegv(regs, SEGV_MAPERR); - } else if (fault & VM_FAULT_SIGBUS) { - /* Kernel mode? Handle exceptions or die */ - if (!user_mode(regs)) - do_no_context(regs); - else - do_sigbus(regs); - } else - BUG(); - break; - } + force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)get_fault_address(regs)); } /* @@ -322,58 +258,32 @@ static noinline void do_fault_error(struct pt_regs *regs, vm_fault_t fault) * routines. * * interruption code (int_code): - * 04 Protection -> Write-Protection (suppression) - * 10 Segment translation -> Not present (nullification) - * 11 Page translation -> Not present (nullification) - * 3b Region third trans. -> Not present (nullification) + * 04 Protection -> Write-Protection (suppression) + * 10 Segment translation -> Not present (nullification) + * 11 Page translation -> Not present (nullification) + * 3b Region third trans. 
-> Not present (nullification) */ -static inline vm_fault_t do_exception(struct pt_regs *regs, int access) +static void do_exception(struct pt_regs *regs, int access) { - struct gmap *gmap; - struct task_struct *tsk; - struct mm_struct *mm; struct vm_area_struct *vma; - enum fault_type type; - unsigned long trans_exc_code; unsigned long address; + struct mm_struct *mm; unsigned int flags; vm_fault_t fault; bool is_write; - tsk = current; /* * The instruction that caused the program check has * been nullified. Don't signal single step via SIGTRAP. */ clear_thread_flag(TIF_PER_TRAP); - if (kprobe_page_fault(regs, 14)) - return 0; - - mm = tsk->mm; - trans_exc_code = regs->int_parm_long; - address = trans_exc_code & __FAIL_ADDR_MASK; - is_write = (trans_exc_code & store_indication) == 0x400; - - /* - * Verify that the fault happened in user space, that - * we are not in an interrupt and that there is a - * user context. - */ - fault = VM_FAULT_BADCONTEXT; - type = get_fault_type(regs); - switch (type) { - case KERNEL_FAULT: - if (kfence_handle_page_fault(address, is_write, regs)) - return 0; - goto out; - case USER_FAULT: - case GMAP_FAULT: - if (faulthandler_disabled() || !mm) - goto out; - break; - } - + return; + mm = current->mm; + address = get_fault_address(regs); + is_write = fault_is_write(regs); + if (is_kernel_fault(regs) || faulthandler_disabled() || !mm) + return handle_fault_error_nolock(regs, 0); perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); flags = FAULT_FLAG_DEFAULT; if (user_mode(regs)) @@ -382,401 +292,136 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access) access = VM_WRITE; if (access == VM_WRITE) flags |= FAULT_FLAG_WRITE; - mmap_read_lock(mm); - - gmap = NULL; - if (IS_ENABLED(CONFIG_PGSTE) && type == GMAP_FAULT) { - gmap = (struct gmap *) S390_lowcore.gmap; - current->thread.gmap_addr = address; - current->thread.gmap_write_flag = !!(flags & FAULT_FLAG_WRITE); - current->thread.gmap_int_code = regs->int_code & 0xffff; - address = __gmap_translate(gmap, address); - if (address == -EFAULT) { - fault = VM_FAULT_BADMAP; - goto out_up; - } - if (gmap->pfault_enabled) - flags |= FAULT_FLAG_RETRY_NOWAIT; + if (!(flags & FAULT_FLAG_USER)) + goto lock_mmap; + vma = lock_vma_under_rcu(mm, address); + if (!vma) + goto lock_mmap; + if (!(vma->vm_flags & access)) { + vma_end_read(vma); + count_vm_vma_lock_event(VMA_LOCK_SUCCESS); + return handle_fault_error_nolock(regs, SEGV_ACCERR); } - + fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) + vma_end_read(vma); + if (!(fault & VM_FAULT_RETRY)) { + count_vm_vma_lock_event(VMA_LOCK_SUCCESS); + goto done; + } + count_vm_vma_lock_event(VMA_LOCK_RETRY); + if (fault & VM_FAULT_MAJOR) + flags |= FAULT_FLAG_TRIED; + /* Quick path to respond to signals */ + if (fault_signal_pending(fault, regs)) { + if (!user_mode(regs)) + handle_fault_error_nolock(regs, 0); + return; + } +lock_mmap: retry: - fault = VM_FAULT_BADMAP; - vma = find_vma(mm, address); + vma = lock_mm_and_find_vma(mm, address, regs); if (!vma) - goto out_up; - - if (unlikely(vma->vm_start > address)) { - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto out_up; - if (expand_stack(vma, address)) - goto out_up; - } - - /* - * Ok, we have a good vm_area for this memory access, so - * we can handle it.. 
- */ - fault = VM_FAULT_BADACCESS; + return handle_fault_error_nolock(regs, SEGV_MAPERR); if (unlikely(!(vma->vm_flags & access))) - goto out_up; - - /* - * If for any reason at all we couldn't handle the fault, - * make sure we exit gracefully rather than endlessly redo - * the fault. - */ + return handle_fault_error(regs, SEGV_ACCERR); fault = handle_mm_fault(vma, address, flags, regs); if (fault_signal_pending(fault, regs)) { - fault = VM_FAULT_SIGNAL; - if (flags & FAULT_FLAG_RETRY_NOWAIT) - goto out_up; - goto out; + if (!user_mode(regs)) + handle_fault_error_nolock(regs, 0); + return; } - /* The fault is fully completed (including releasing mmap lock) */ - if (fault & VM_FAULT_COMPLETED) { - if (gmap) { - mmap_read_lock(mm); - goto out_gmap; - } - fault = 0; - goto out; - } - - if (unlikely(fault & VM_FAULT_ERROR)) - goto out_up; - + if (fault & VM_FAULT_COMPLETED) + return; if (fault & VM_FAULT_RETRY) { - if (IS_ENABLED(CONFIG_PGSTE) && gmap && - (flags & FAULT_FLAG_RETRY_NOWAIT)) { - /* - * FAULT_FLAG_RETRY_NOWAIT has been set, mmap_lock has - * not been released - */ - current->thread.gmap_pfault = 1; - fault = VM_FAULT_PFAULT; - goto out_up; - } - flags &= ~FAULT_FLAG_RETRY_NOWAIT; flags |= FAULT_FLAG_TRIED; - mmap_read_lock(mm); goto retry; } -out_gmap: - if (IS_ENABLED(CONFIG_PGSTE) && gmap) { - address = __gmap_link(gmap, current->thread.gmap_addr, - address); - if (address == -EFAULT) { - fault = VM_FAULT_BADMAP; - goto out_up; - } - if (address == -ENOMEM) { - fault = VM_FAULT_OOM; - goto out_up; - } - } - fault = 0; -out_up: mmap_read_unlock(mm); -out: - return fault; +done: + if (!(fault & VM_FAULT_ERROR)) + return; + if (fault & VM_FAULT_OOM) { + if (!user_mode(regs)) + handle_fault_error_nolock(regs, 0); + else + pagefault_out_of_memory(); + } else if (fault & VM_FAULT_SIGSEGV) { + if (!user_mode(regs)) + handle_fault_error_nolock(regs, 0); + else + do_sigsegv(regs, SEGV_MAPERR); + } else if (fault & (VM_FAULT_SIGBUS | VM_FAULT_HWPOISON | + VM_FAULT_HWPOISON_LARGE)) { + if (!user_mode(regs)) + handle_fault_error_nolock(regs, 0); + else + do_sigbus(regs); + } else { + pr_emerg("Unexpected fault flags: %08x\n", fault); + BUG(); + } } void do_protection_exception(struct pt_regs *regs) { - unsigned long trans_exc_code; - int access; - vm_fault_t fault; + union teid teid = { .val = regs->int_parm_long }; - trans_exc_code = regs->int_parm_long; /* * Protection exceptions are suppressing, decrement psw address. * The exception to this rule are aborted transactions, for these * the PSW already points to the correct location. */ - if (!(regs->int_code & 0x200)) + if (!(regs->int_code & 0x200)) { regs->psw.addr = __rewind_psw(regs->psw, regs->int_code >> 16); + set_pt_regs_flag(regs, PIF_PSW_ADDR_ADJUSTED); + } /* - * Check for low-address protection. This needs to be treated - * as a special case because the translation exception code - * field is not guaranteed to contain valid data in this case. + * If bit 61 if the TEID is not set, the remainder of the + * TEID is unpredictable. Special handling is required. */ - if (unlikely(!(trans_exc_code & 4))) { - do_low_address(regs); - return; + if (unlikely(!teid.b61)) { + if (user_mode(regs)) { + dump_fault_info(regs); + die(regs, "Unexpected TEID"); + } + /* Assume low-address protection in kernel mode. 
*/ + return handle_fault_error_nolock(regs, 0); } - if (unlikely(MACHINE_HAS_NX && (trans_exc_code & 0x80))) { - regs->int_parm_long = (trans_exc_code & ~PAGE_MASK) | - (regs->psw.addr & PAGE_MASK); - access = VM_EXEC; - fault = VM_FAULT_BADACCESS; - } else { - access = VM_WRITE; - fault = do_exception(regs, access); + if (unlikely(cpu_has_nx() && teid.b56)) { + regs->int_parm_long = (teid.addr * PAGE_SIZE) | (regs->psw.addr & PAGE_MASK); + return handle_fault_error_nolock(regs, SEGV_ACCERR); } - if (unlikely(fault)) - do_fault_error(regs, fault); + do_exception(regs, VM_WRITE); } NOKPROBE_SYMBOL(do_protection_exception); void do_dat_exception(struct pt_regs *regs) { - int access; - vm_fault_t fault; - - access = VM_ACCESS_FLAGS; - fault = do_exception(regs, access); - if (unlikely(fault)) - do_fault_error(regs, fault); + do_exception(regs, VM_ACCESS_FLAGS); } NOKPROBE_SYMBOL(do_dat_exception); -#ifdef CONFIG_PFAULT -/* - * 'pfault' pseudo page faults routines. - */ -static int pfault_disable; - -static int __init nopfault(char *str) -{ - pfault_disable = 1; - return 1; -} - -__setup("nopfault", nopfault); - -struct pfault_refbk { - u16 refdiagc; - u16 reffcode; - u16 refdwlen; - u16 refversn; - u64 refgaddr; - u64 refselmk; - u64 refcmpmk; - u64 reserved; -} __attribute__ ((packed, aligned(8))); - -static struct pfault_refbk pfault_init_refbk = { - .refdiagc = 0x258, - .reffcode = 0, - .refdwlen = 5, - .refversn = 2, - .refgaddr = __LC_LPP, - .refselmk = 1ULL << 48, - .refcmpmk = 1ULL << 48, - .reserved = __PF_RES_FIELD -}; - -int pfault_init(void) -{ - int rc; - - if (pfault_disable) - return -1; - diag_stat_inc(DIAG_STAT_X258); - asm volatile( - " diag %1,%0,0x258\n" - "0: j 2f\n" - "1: la %0,8\n" - "2:\n" - EX_TABLE(0b,1b) - : "=d" (rc) - : "a" (&pfault_init_refbk), "m" (pfault_init_refbk) : "cc"); - return rc; -} - -static struct pfault_refbk pfault_fini_refbk = { - .refdiagc = 0x258, - .reffcode = 1, - .refdwlen = 5, - .refversn = 2, -}; - -void pfault_fini(void) -{ - - if (pfault_disable) - return; - diag_stat_inc(DIAG_STAT_X258); - asm volatile( - " diag %0,0,0x258\n" - "0: nopr %%r7\n" - EX_TABLE(0b,0b) - : : "a" (&pfault_fini_refbk), "m" (pfault_fini_refbk) : "cc"); -} - -static DEFINE_SPINLOCK(pfault_lock); -static LIST_HEAD(pfault_list); - -#define PF_COMPLETE 0x0080 - -/* - * The mechanism of our pfault code: if Linux is running as guest, runs a user - * space process and the user space process accesses a page that the host has - * paged out we get a pfault interrupt. - * - * This allows us, within the guest, to schedule a different process. Without - * this mechanism the host would have to suspend the whole virtual cpu until - * the page has been paged in. - * - * So when we get such an interrupt then we set the state of the current task - * to uninterruptible and also set the need_resched flag. Both happens within - * interrupt context(!). If we later on want to return to user space we - * recognize the need_resched flag and then call schedule(). It's not very - * obvious how this works... - * - * Of course we have a lot of additional fun with the completion interrupt (-> - * host signals that a page of a process has been paged in and the process can - * continue to run). This interrupt can arrive on any cpu and, since we have - * virtual cpus, actually appear before the interrupt that signals that a page - * is missing. 
- */ -static void pfault_interrupt(struct ext_code ext_code, - unsigned int param32, unsigned long param64) -{ - struct task_struct *tsk; - __u16 subcode; - pid_t pid; - - /* - * Get the external interruption subcode & pfault initial/completion - * signal bit. VM stores this in the 'cpu address' field associated - * with the external interrupt. - */ - subcode = ext_code.subcode; - if ((subcode & 0xff00) != __SUBCODE_MASK) - return; - inc_irq_stat(IRQEXT_PFL); - /* Get the token (= pid of the affected task). */ - pid = param64 & LPP_PID_MASK; - rcu_read_lock(); - tsk = find_task_by_pid_ns(pid, &init_pid_ns); - if (tsk) - get_task_struct(tsk); - rcu_read_unlock(); - if (!tsk) - return; - spin_lock(&pfault_lock); - if (subcode & PF_COMPLETE) { - /* signal bit is set -> a page has been swapped in by VM */ - if (tsk->thread.pfault_wait == 1) { - /* Initial interrupt was faster than the completion - * interrupt. pfault_wait is valid. Set pfault_wait - * back to zero and wake up the process. This can - * safely be done because the task is still sleeping - * and can't produce new pfaults. */ - tsk->thread.pfault_wait = 0; - list_del(&tsk->thread.list); - wake_up_process(tsk); - put_task_struct(tsk); - } else { - /* Completion interrupt was faster than initial - * interrupt. Set pfault_wait to -1 so the initial - * interrupt doesn't put the task to sleep. - * If the task is not running, ignore the completion - * interrupt since it must be a leftover of a PFAULT - * CANCEL operation which didn't remove all pending - * completion interrupts. */ - if (task_is_running(tsk)) - tsk->thread.pfault_wait = -1; - } - } else { - /* signal bit not set -> a real page is missing. */ - if (WARN_ON_ONCE(tsk != current)) - goto out; - if (tsk->thread.pfault_wait == 1) { - /* Already on the list with a reference: put to sleep */ - goto block; - } else if (tsk->thread.pfault_wait == -1) { - /* Completion interrupt was faster than the initial - * interrupt (pfault_wait == -1). Set pfault_wait - * back to zero and exit. */ - tsk->thread.pfault_wait = 0; - } else { - /* Initial interrupt arrived before completion - * interrupt. Let the task sleep. - * An extra task reference is needed since a different - * cpu may set the task state to TASK_RUNNING again - * before the scheduler is reached. */ - get_task_struct(tsk); - tsk->thread.pfault_wait = 1; - list_add(&tsk->thread.list, &pfault_list); -block: - /* Since this must be a userspace fault, there - * is no kernel task state to trample. Rely on the - * return to userspace schedule() to block. */ - __set_current_state(TASK_UNINTERRUPTIBLE); - set_tsk_need_resched(tsk); - set_preempt_need_resched(); - } - } -out: - spin_unlock(&pfault_lock); - put_task_struct(tsk); -} - -static int pfault_cpu_dead(unsigned int cpu) -{ - struct thread_struct *thread, *next; - struct task_struct *tsk; - - spin_lock_irq(&pfault_lock); - list_for_each_entry_safe(thread, next, &pfault_list, list) { - thread->pfault_wait = 0; - list_del(&thread->list); - tsk = container_of(thread, struct task_struct, thread); - wake_up_process(tsk); - put_task_struct(tsk); - } - spin_unlock_irq(&pfault_lock); - return 0; -} - -static int __init pfault_irq_init(void) -{ - int rc; - - rc = register_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt); - if (rc) - goto out_extint; - rc = pfault_init() == 0 ? 
0 : -EOPNOTSUPP; - if (rc) - goto out_pfault; - irq_subclass_register(IRQ_SUBCLASS_SERVICE_SIGNAL); - cpuhp_setup_state_nocalls(CPUHP_S390_PFAULT_DEAD, "s390/pfault:dead", - NULL, pfault_cpu_dead); - return 0; - -out_pfault: - unregister_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt); -out_extint: - pfault_disable = 1; - return rc; -} -early_initcall(pfault_irq_init); - -#endif /* CONFIG_PFAULT */ - #if IS_ENABLED(CONFIG_PGSTE) void do_secure_storage_access(struct pt_regs *regs) { - unsigned long addr = regs->int_parm_long & __FAIL_ADDR_MASK; + union teid teid = { .val = regs->int_parm_long }; + unsigned long addr = get_fault_address(regs); struct vm_area_struct *vma; + struct folio_walk fw; struct mm_struct *mm; - struct page *page; - struct gmap *gmap; + struct folio *folio; int rc; /* - * bit 61 tells us if the address is valid, if it's not we - * have a major problem and should stop the kernel or send a - * SIGSEGV to the process. Unfortunately bit 61 is not - * reliable without the misc UV feature so we need to check - * for that as well. + * Bit 61 indicates if the address is valid, if it is not the + * kernel should be stopped or SIGSEGV should be sent to the + * process. Bit 61 is not reliable without the misc UV feature, + * therefore this needs to be checked too. */ - if (test_bit_inv(BIT_UV_FEAT_MISC, &uv_info.uv_feature_indications) && - !test_bit_inv(61, ®s->int_parm_long)) { + if (uv_has_feature(BIT_UV_FEAT_MISC) && !teid.b61) { /* * When this happens, userspace did something that it * was not supposed to do, e.g. branching into secure @@ -786,100 +431,43 @@ void do_secure_storage_access(struct pt_regs *regs) send_sig(SIGSEGV, current, 0); return; } - /* - * The kernel should never run into this case and we - * have no way out of this situation. + * The kernel should never run into this case and + * there is no way out of this situation. */ panic("Unexpected PGM 0x3d with TEID bit 61=0"); } - - switch (get_fault_type(regs)) { - case GMAP_FAULT: - mm = current->mm; - gmap = (struct gmap *)S390_lowcore.gmap; - mmap_read_lock(mm); - addr = __gmap_translate(gmap, addr); - mmap_read_unlock(mm); - if (IS_ERR_VALUE(addr)) { - do_fault_error(regs, VM_FAULT_BADMAP); - break; - } - fallthrough; - case USER_FAULT: + if (is_kernel_fault(regs)) { + folio = phys_to_folio(addr); + if (unlikely(!folio_try_get(folio))) + return; + rc = arch_make_folio_accessible(folio); + folio_put(folio); + if (rc) + BUG(); + } else { + if (faulthandler_disabled()) + return handle_fault_error_nolock(regs, 0); mm = current->mm; mmap_read_lock(mm); vma = find_vma(mm, addr); - if (!vma) { + if (!vma) + return handle_fault_error(regs, SEGV_MAPERR); + folio = folio_walk_start(&fw, vma, addr, 0); + if (!folio) { mmap_read_unlock(mm); - do_fault_error(regs, VM_FAULT_BADMAP); - break; - } - page = follow_page(vma, addr, FOLL_WRITE | FOLL_GET); - if (IS_ERR_OR_NULL(page)) { - mmap_read_unlock(mm); - break; + return; } - if (arch_make_page_accessible(page)) + /* arch_make_folio_accessible() needs a raised refcount. 
*/ + folio_get(folio); + rc = arch_make_folio_accessible(folio); + folio_put(folio); + folio_walk_end(&fw, vma); + if (rc) send_sig(SIGSEGV, current, 0); - put_page(page); mmap_read_unlock(mm); - break; - case KERNEL_FAULT: - page = phys_to_page(addr); - if (unlikely(!try_get_page(page))) - break; - rc = arch_make_page_accessible(page); - put_page(page); - if (rc) - BUG(); - break; - default: - do_fault_error(regs, VM_FAULT_BADMAP); - WARN_ON_ONCE(1); } } NOKPROBE_SYMBOL(do_secure_storage_access); -void do_non_secure_storage_access(struct pt_regs *regs) -{ - unsigned long gaddr = regs->int_parm_long & __FAIL_ADDR_MASK; - struct gmap *gmap = (struct gmap *)S390_lowcore.gmap; - - if (get_fault_type(regs) != GMAP_FAULT) { - do_fault_error(regs, VM_FAULT_BADMAP); - WARN_ON_ONCE(1); - return; - } - - if (gmap_convert_to_secure(gmap, gaddr) == -EINVAL) - send_sig(SIGSEGV, current, 0); -} -NOKPROBE_SYMBOL(do_non_secure_storage_access); - -void do_secure_storage_violation(struct pt_regs *regs) -{ - unsigned long gaddr = regs->int_parm_long & __FAIL_ADDR_MASK; - struct gmap *gmap = (struct gmap *)S390_lowcore.gmap; - - /* - * If the VM has been rebooted, its address space might still contain - * secure pages from the previous boot. - * Clear the page so it can be reused. - */ - if (!gmap_destroy_page(gmap, gaddr)) - return; - /* - * Either KVM messed up the secure guest mapping or the same - * page is mapped into multiple secure guests. - * - * This exception is only triggered when a guest 2 is running - * and can therefore never occur in kernel context. - */ - printk_ratelimited(KERN_WARNING - "Secure storage violation in task: %s, pid %d\n", - current->comm, current->pid); - send_sig(SIGSEGV, current, 0); -} - #endif /* CONFIG_PGSTE */ diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 02d15c8dc92e..dd85bcca817d 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -8,6 +8,8 @@ * Janosch Frank <frankja@linux.vnet.ibm.com> */ +#include <linux/cpufeature.h> +#include <linux/export.h> #include <linux/kernel.h> #include <linux/pagewalk.h> #include <linux/swap.h> @@ -18,20 +20,43 @@ #include <linux/ksm.h> #include <linux/mman.h> #include <linux/pgtable.h> - +#include <asm/page-states.h> #include <asm/pgalloc.h> +#include <asm/machine.h> +#include <asm/gmap_helpers.h> #include <asm/gmap.h> -#include <asm/tlb.h> +#include <asm/page.h> + +/* + * The address is saved in a radix tree directly; NULL would be ambiguous, + * since 0 is a valid address, and NULL is returned when nothing was found. + * The lower bits are ignored by all users of the macro, so it can be used + * to distinguish a valid address 0 from a NULL. + */ +#define VALID_GADDR_FLAG 1 +#define IS_GADDR_VALID(gaddr) ((gaddr) & VALID_GADDR_FLAG) +#define MAKE_VALID_GADDR(gaddr) (((gaddr) & HPAGE_MASK) | VALID_GADDR_FLAG) #define GMAP_SHADOW_FAKE_TABLE 1ULL +static struct page *gmap_alloc_crst(void) +{ + struct page *page; + + page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); + if (!page) + return NULL; + __arch_set_page_dat(page_to_virt(page), 1UL << CRST_ALLOC_ORDER); + return page; +} + /** * gmap_alloc - allocate and initialize a guest address space * @limit: maximum address of the gmap address space * * Returns a guest address space structure. 
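 *
 * gmap_alloc() and gmap_free() are now exported; a minimal lifecycle for
 * a user outside this file looks roughly as follows (illustrative sketch
 * only; the 8 PiB limit is an arbitrary example and error handling is
 * elided):
 *
 *	struct gmap *g;
 *
 *	g = gmap_alloc((1UL << 53) - 1);
 *	if (!g)
 *		return -ENOMEM;
 *	... populate via the exported __gmap_link() and friends ...
 *	gmap_free(g);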
*/ -static struct gmap *gmap_alloc(unsigned long limit) +struct gmap *gmap_alloc(unsigned long limit) { struct gmap *gmap; struct page *page; @@ -58,21 +83,17 @@ static struct gmap *gmap_alloc(unsigned long limit) gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL_ACCOUNT); if (!gmap) goto out; - INIT_LIST_HEAD(&gmap->crst_list); INIT_LIST_HEAD(&gmap->children); - INIT_LIST_HEAD(&gmap->pt_list); INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL_ACCOUNT); INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC | __GFP_ACCOUNT); INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC | __GFP_ACCOUNT); spin_lock_init(&gmap->guest_table_lock); spin_lock_init(&gmap->shadow_lock); refcount_set(&gmap->ref_count, 1); - page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); + page = gmap_alloc_crst(); if (!page) goto out_free; - page->index = 0; - list_add(&page->lru, &gmap->crst_list); - table = (unsigned long *) page_to_phys(page); + table = page_to_virt(page); crst_table_init(table, etype); gmap->table = table; gmap->asce = atype | _ASCE_TABLE_LENGTH | @@ -85,6 +106,7 @@ out_free: out: return NULL; } +EXPORT_SYMBOL_GPL(gmap_alloc); /** * gmap_create - create a guest address space @@ -116,10 +138,7 @@ EXPORT_SYMBOL_GPL(gmap_create); static void gmap_flush_tlb(struct gmap *gmap) { - if (MACHINE_HAS_IDTE) - __tlb_flush_idte(gmap->asce); - else - __tlb_flush_global(); + __tlb_flush_idte(gmap->asce); } static void gmap_radix_tree_free(struct radix_tree_root *root) @@ -173,30 +192,46 @@ static void gmap_rmap_radix_tree_free(struct radix_tree_root *root) } while (nr > 0); } +static void gmap_free_crst(unsigned long *table, bool free_ptes) +{ + bool is_segment = (table[0] & _SEGMENT_ENTRY_TYPE_MASK) == 0; + int i; + + if (is_segment) { + if (!free_ptes) + goto out; + for (i = 0; i < _CRST_ENTRIES; i++) + if (!(table[i] & _SEGMENT_ENTRY_INVALID)) + page_table_free_pgste(page_ptdesc(phys_to_page(table[i]))); + } else { + for (i = 0; i < _CRST_ENTRIES; i++) + if (!(table[i] & _REGION_ENTRY_INVALID)) + gmap_free_crst(__va(table[i] & PAGE_MASK), free_ptes); + } + +out: + free_pages((unsigned long)table, CRST_ALLOC_ORDER); +} + /** * gmap_free - free a guest address space * @gmap: pointer to the guest address space structure * * No locks required. There are no references to this gmap anymore. */ -static void gmap_free(struct gmap *gmap) +void gmap_free(struct gmap *gmap) { - struct page *page, *next; - /* Flush tlb of all gmaps (if not already done for shadows) */ if (!(gmap_is_shadow(gmap) && gmap->removed)) gmap_flush_tlb(gmap); /* Free all segment & region tables. */ - list_for_each_entry_safe(page, next, &gmap->crst_list, lru) - __free_pages(page, CRST_ALLOC_ORDER); + gmap_free_crst(gmap->table, gmap_is_shadow(gmap)); + gmap_radix_tree_free(&gmap->guest_to_host); gmap_radix_tree_free(&gmap->host_to_guest); /* Free additional data for a shadow gmap */ if (gmap_is_shadow(gmap)) { - /* Free all page tables. 
*/ - list_for_each_entry_safe(page, next, &gmap->pt_list, lru) - page_table_free_pgste(page); gmap_rmap_radix_tree_free(&gmap->host_to_rmap); /* Release reference to the parent */ gmap_put(gmap->parent); @@ -204,6 +239,7 @@ static void gmap_free(struct gmap *gmap) kfree(gmap); } +EXPORT_SYMBOL_GPL(gmap_free); /** * gmap_get - increase reference counter for guest address space @@ -267,37 +303,6 @@ void gmap_remove(struct gmap *gmap) } EXPORT_SYMBOL_GPL(gmap_remove); -/** - * gmap_enable - switch primary space to the guest address space - * @gmap: pointer to the guest address space structure - */ -void gmap_enable(struct gmap *gmap) -{ - S390_lowcore.gmap = (unsigned long) gmap; -} -EXPORT_SYMBOL_GPL(gmap_enable); - -/** - * gmap_disable - switch back to the standard primary address space - * @gmap: pointer to the guest address space structure - */ -void gmap_disable(struct gmap *gmap) -{ - S390_lowcore.gmap = 0UL; -} -EXPORT_SYMBOL_GPL(gmap_disable); - -/** - * gmap_get_enabled - get a pointer to the currently enabled gmap - * - * Returns a pointer to the currently enabled gmap. 0 if none is enabled. - */ -struct gmap *gmap_get_enabled(void) -{ - return (struct gmap *) S390_lowcore.gmap; -} -EXPORT_SYMBOL_GPL(gmap_get_enabled); - /* * gmap_alloc_table is assumed to be called with mmap_lock held */ @@ -308,17 +313,15 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, unsigned long *new; /* since we dont free the gmap table until gmap_free we can unlock */ - page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); + page = gmap_alloc_crst(); if (!page) return -ENOMEM; - new = (unsigned long *) page_to_phys(page); + new = page_to_virt(page); crst_table_init(new, init); spin_lock(&gmap->guest_table_lock); if (*table & _REGION_ENTRY_INVALID) { - list_add(&page->lru, &gmap->crst_list); - *table = (unsigned long) new | _REGION_ENTRY_LENGTH | + *table = __pa(new) | _REGION_ENTRY_LENGTH | (*table & _REGION_ENTRY_TYPE_MASK); - page->index = gaddr; page = NULL; } spin_unlock(&gmap->guest_table_lock); @@ -327,22 +330,23 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, return 0; } -/** - * __gmap_segment_gaddr - find virtual address from segment pointer - * @entry: pointer to a segment table entry in the guest address space - * - * Returns the virtual address in the guest address space for the segment - */ -static unsigned long __gmap_segment_gaddr(unsigned long *entry) +static unsigned long host_to_guest_lookup(struct gmap *gmap, unsigned long vmaddr) { - struct page *page; - unsigned long offset, mask; + return (unsigned long)radix_tree_lookup(&gmap->host_to_guest, vmaddr >> PMD_SHIFT); +} - offset = (unsigned long) entry / sizeof(unsigned long); - offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE; - mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1); - page = virt_to_page((void *)((unsigned long) entry & mask)); - return page->index + offset; +static unsigned long host_to_guest_delete(struct gmap *gmap, unsigned long vmaddr) +{ + return (unsigned long)radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT); +} + +static pmd_t *host_to_guest_pmd_delete(struct gmap *gmap, unsigned long vmaddr, + unsigned long *gaddr) +{ + *gaddr = host_to_guest_delete(gmap, vmaddr); + if (IS_GADDR_VALID(*gaddr)) + return (pmd_t *)gmap_table_walk(gmap, *gaddr, 1); + return NULL; } /** @@ -354,16 +358,19 @@ static unsigned long __gmap_segment_gaddr(unsigned long *entry) */ static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr) { - unsigned long *entry; + 
unsigned long gaddr; int flush = 0; + pmd_t *pmdp; BUG_ON(gmap_is_shadow(gmap)); spin_lock(&gmap->guest_table_lock); - entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT); - if (entry) { - flush = (*entry != _SEGMENT_ENTRY_EMPTY); - *entry = _SEGMENT_ENTRY_EMPTY; + + pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr); + if (pmdp) { + flush = (pmd_val(*pmdp) != _SEGMENT_ENTRY_EMPTY); + *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY); } + spin_unlock(&gmap->guest_table_lock); return flush; } @@ -482,26 +489,6 @@ unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr) EXPORT_SYMBOL_GPL(__gmap_translate); /** - * gmap_translate - translate a guest address to a user space address - * @gmap: pointer to guest mapping meta data structure - * @gaddr: guest address - * - * Returns user space address which corresponds to the guest address or - * -EFAULT if no such mapping exists. - * This function does not establish potentially missing page table entries. - */ -unsigned long gmap_translate(struct gmap *gmap, unsigned long gaddr) -{ - unsigned long rc; - - mmap_read_lock(gmap->mm); - rc = __gmap_translate(gmap, gaddr); - mmap_read_unlock(gmap->mm); - return rc; -} -EXPORT_SYMBOL_GPL(gmap_translate); - -/** * gmap_unlink - disconnect a page table from the gmap shadow tables * @mm: pointer to the parent mm_struct * @table: pointer to the host page table @@ -557,7 +544,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY, gaddr & _REGION1_MASK)) return -ENOMEM; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = __va(*table & _REGION_ENTRY_ORIGIN); } if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) { table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT; @@ -565,7 +552,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY, gaddr & _REGION2_MASK)) return -ENOMEM; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = __va(*table & _REGION_ENTRY_ORIGIN); } if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) { table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT; @@ -573,7 +560,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY, gaddr & _REGION3_MASK)) return -ENOMEM; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = __va(*table & _REGION_ENTRY_ORIGIN); } table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; /* Walk the parent mm page table */ @@ -585,12 +572,12 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) pud = pud_offset(p4d, vmaddr); VM_BUG_ON(pud_none(*pud)); /* large puds cannot yet be handled */ - if (pud_large(*pud)) + if (pud_leaf(*pud)) return -EFAULT; pmd = pmd_offset(pud, vmaddr); VM_BUG_ON(pmd_none(*pmd)); /* Are we allowed to use huge pages? */ - if (pmd_large(*pmd) && !gmap->mm->context.allow_gmap_hpage_1m) + if (pmd_leaf(*pmd) && !gmap->mm->context.allow_gmap_hpage_1m) return -EFAULT; /* Link gmap segment table entry location to page table. 
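 *
 * Note the key change below: the host_to_guest radix tree now stores a
 * tagged guest address (MAKE_VALID_GADDR) instead of a pointer into the
 * gmap segment table, so guest address 0 stays distinguishable from an
 * empty slot. The read-back pattern, mirroring host_to_guest_pmd_delete()
 * earlier in this diff (illustrative, not additional patch code):
 *
 *	unsigned long gaddr;
 *	pmd_t *pmdp = NULL;
 *
 *	gaddr = host_to_guest_lookup(gmap, vmaddr);
 *	if (IS_GADDR_VALID(gaddr))
 *		pmdp = (pmd_t *)gmap_table_walk(gmap, gaddr, 1);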
*/ rc = radix_tree_preload(GFP_KERNEL_ACCOUNT); @@ -600,15 +587,18 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) spin_lock(&gmap->guest_table_lock); if (*table == _SEGMENT_ENTRY_EMPTY) { rc = radix_tree_insert(&gmap->host_to_guest, - vmaddr >> PMD_SHIFT, table); + vmaddr >> PMD_SHIFT, + (void *)MAKE_VALID_GADDR(gaddr)); if (!rc) { - if (pmd_large(*pmd)) { + if (pmd_leaf(*pmd)) { *table = (pmd_val(*pmd) & _SEGMENT_ENTRY_HARDWARE_BITS_LARGE) - | _SEGMENT_ENTRY_GMAP_UC; + | _SEGMENT_ENTRY_GMAP_UC + | _SEGMENT_ENTRY; } else - *table = pmd_val(*pmd) & - _SEGMENT_ENTRY_HARDWARE_BITS; + *table = (pmd_val(*pmd) & + _SEGMENT_ENTRY_HARDWARE_BITS) + | _SEGMENT_ENTRY; } } else if (*table & _SEGMENT_ENTRY_PROTECT && !(pmd_val(*pmd) & _SEGMENT_ENTRY_PROTECT)) { @@ -622,113 +612,27 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) radix_tree_preload_end(); return rc; } - -/** - * gmap_fault - resolve a fault on a guest address - * @gmap: pointer to guest mapping meta data structure - * @gaddr: guest address - * @fault_flags: flags to pass down to handle_mm_fault() - * - * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT - * if the vm address is already mapped to a different guest segment. - */ -int gmap_fault(struct gmap *gmap, unsigned long gaddr, - unsigned int fault_flags) -{ - unsigned long vmaddr; - int rc; - bool unlocked; - - mmap_read_lock(gmap->mm); - -retry: - unlocked = false; - vmaddr = __gmap_translate(gmap, gaddr); - if (IS_ERR_VALUE(vmaddr)) { - rc = vmaddr; - goto out_up; - } - if (fixup_user_fault(gmap->mm, vmaddr, fault_flags, - &unlocked)) { - rc = -EFAULT; - goto out_up; - } - /* - * In the case that fixup_user_fault unlocked the mmap_lock during - * faultin redo __gmap_translate to not race with a map/unmap_segment. - */ - if (unlocked) - goto retry; - - rc = __gmap_link(gmap, gaddr, vmaddr); -out_up: - mmap_read_unlock(gmap->mm); - return rc; -} -EXPORT_SYMBOL_GPL(gmap_fault); +EXPORT_SYMBOL(__gmap_link); /* * this function is assumed to be called with mmap_lock held */ void __gmap_zap(struct gmap *gmap, unsigned long gaddr) { - struct vm_area_struct *vma; unsigned long vmaddr; - spinlock_t *ptl; - pte_t *ptep; + + mmap_assert_locked(gmap->mm); /* Find the vm address for the guest address */ vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT); if (vmaddr) { vmaddr |= gaddr & ~PMD_MASK; - - vma = vma_lookup(gmap->mm, vmaddr); - if (!vma || is_vm_hugetlb_page(vma)) - return; - - /* Get pointer to the page table entry */ - ptep = get_locked_pte(gmap->mm, vmaddr, &ptl); - if (likely(ptep)) { - ptep_zap_unused(gmap->mm, vmaddr, ptep, 0); - pte_unmap_unlock(ptep, ptl); - } + gmap_helper_zap_one_page(gmap->mm, vmaddr); } } EXPORT_SYMBOL_GPL(__gmap_zap); -void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to) -{ - unsigned long gaddr, vmaddr, size; - struct vm_area_struct *vma; - - mmap_read_lock(gmap->mm); - for (gaddr = from; gaddr < to; - gaddr = (gaddr + PMD_SIZE) & PMD_MASK) { - /* Find the vm address for the guest address */ - vmaddr = (unsigned long) - radix_tree_lookup(&gmap->guest_to_host, - gaddr >> PMD_SHIFT); - if (!vmaddr) - continue; - vmaddr |= gaddr & ~PMD_MASK; - /* Find vma in the parent mm */ - vma = find_vma(gmap->mm, vmaddr); - if (!vma) - continue; - /* - * We do not discard pages that are backed by - * hugetlbfs, so we don't have to refault them. 
- */ - if (is_vm_hugetlb_page(vma)) - continue; - size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK)); - zap_page_range(vma, vmaddr, size); - } - mmap_read_unlock(gmap->mm); -} -EXPORT_SYMBOL_GPL(gmap_discard); - static LIST_HEAD(gmap_notifier_list); static DEFINE_SPINLOCK(gmap_notifier_lock); @@ -790,8 +694,7 @@ static void gmap_call_notifier(struct gmap *gmap, unsigned long start, * * Note: Can also be called for shadow gmaps. */ -static inline unsigned long *gmap_table_walk(struct gmap *gmap, - unsigned long gaddr, int level) +unsigned long *gmap_table_walk(struct gmap *gmap, unsigned long gaddr, int level) { const int asce_type = gmap->asce & _ASCE_TYPE_MASK; unsigned long *table = gmap->table; @@ -813,7 +716,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap, break; if (*table & _REGION_ENTRY_INVALID) return NULL; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = __va(*table & _REGION_ENTRY_ORIGIN); fallthrough; case _ASCE_TYPE_REGION2: table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT; @@ -821,7 +724,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap, break; if (*table & _REGION_ENTRY_INVALID) return NULL; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = __va(*table & _REGION_ENTRY_ORIGIN); fallthrough; case _ASCE_TYPE_REGION3: table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT; @@ -829,7 +732,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap, break; if (*table & _REGION_ENTRY_INVALID) return NULL; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = __va(*table & _REGION_ENTRY_ORIGIN); fallthrough; case _ASCE_TYPE_SEGMENT: table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; @@ -837,11 +740,12 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap, break; if (*table & _REGION_ENTRY_INVALID) return NULL; - table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN); - table += (gaddr & _PAGE_INDEX) >> _PAGE_SHIFT; + table = __va(*table & _SEGMENT_ENTRY_ORIGIN); + table += (gaddr & _PAGE_INDEX) >> PAGE_SHIFT; } return table; } +EXPORT_SYMBOL(gmap_table_walk); /** * gmap_pte_op_walk - walk the gmap page table, get the page table lock @@ -896,12 +800,12 @@ static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr, /** * gmap_pte_op_end - release the page table lock - * @ptl: pointer to the spinlock pointer + * @ptep: pointer to the locked pte + * @ptl: pointer to the page table spinlock */ -static void gmap_pte_op_end(spinlock_t *ptl) +static void gmap_pte_op_end(pte_t *ptep, spinlock_t *ptl) { - if (ptl) - spin_unlock(ptl); + pte_unmap_unlock(ptep, ptl); } /** @@ -932,7 +836,7 @@ static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr) } /* 4k page table entries are locked via the pte (pte_alloc_map_lock). */ - if (!pmd_large(*pmdp)) + if (!pmd_leaf(*pmdp)) spin_unlock(&gmap->guest_table_lock); return pmdp; } @@ -944,7 +848,7 @@ static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr) */ static inline void gmap_pmd_op_end(struct gmap *gmap, pmd_t *pmdp) { - if (pmd_large(*pmdp)) + if (pmd_leaf(*pmdp)) spin_unlock(&gmap->guest_table_lock); } @@ -1012,7 +916,7 @@ static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr, { int rc; pte_t *ptep; - spinlock_t *ptl = NULL; + spinlock_t *ptl; unsigned long pbits = 0; if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID) @@ -1026,7 +930,7 @@ static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr, pbits |= (bits & GMAP_NOTIFY_SHADOW) ? 
PGSTE_VSIE_BIT : 0; /* Protect and unlock. */ rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, pbits); - gmap_pte_op_end(ptl); + gmap_pte_op_end(ptep, ptl); return rc; } @@ -1038,86 +942,40 @@ static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr, * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE * @bits: pgste notification bits to set * - * Returns 0 if successfully protected, -ENOMEM if out of memory and - * -EFAULT if gaddr is invalid (or mapping for shadows is missing). + * Returns: + * PAGE_SIZE if a small page was successfully protected; + * HPAGE_SIZE if a large page was successfully protected; + * -ENOMEM if out of memory; + * -EFAULT if gaddr is invalid (or mapping for shadows is missing); + * -EAGAIN if the guest mapping is missing and should be fixed by the caller. * - * Called with sg->mm->mmap_lock in read. + * Context: Called with sg->mm->mmap_lock in read. */ -static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr, - unsigned long len, int prot, unsigned long bits) +int gmap_protect_one(struct gmap *gmap, unsigned long gaddr, int prot, unsigned long bits) { - unsigned long vmaddr, dist; pmd_t *pmdp; - int rc; + int rc = 0; BUG_ON(gmap_is_shadow(gmap)); - while (len) { - rc = -EAGAIN; - pmdp = gmap_pmd_op_walk(gmap, gaddr); - if (pmdp) { - if (!pmd_large(*pmdp)) { - rc = gmap_protect_pte(gmap, gaddr, pmdp, prot, - bits); - if (!rc) { - len -= PAGE_SIZE; - gaddr += PAGE_SIZE; - } - } else { - rc = gmap_protect_pmd(gmap, gaddr, pmdp, prot, - bits); - if (!rc) { - dist = HPAGE_SIZE - (gaddr & ~HPAGE_MASK); - len = len < dist ? 0 : len - dist; - gaddr = (gaddr & HPAGE_MASK) + HPAGE_SIZE; - } - } - gmap_pmd_op_end(gmap, pmdp); - } - if (rc) { - if (rc == -EINVAL) - return rc; - /* -EAGAIN, fixup of userspace mm and gmap */ - vmaddr = __gmap_translate(gmap, gaddr); - if (IS_ERR_VALUE(vmaddr)) - return vmaddr; - rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, prot); - if (rc) - return rc; - } - } - return 0; -} + pmdp = gmap_pmd_op_walk(gmap, gaddr); + if (!pmdp) + return -EAGAIN; -/** - * gmap_mprotect_notify - change access rights for a range of ptes and - * call the notifier if any pte changes again - * @gmap: pointer to guest mapping meta data structure - * @gaddr: virtual address in the guest address space - * @len: size of area - * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE - * - * Returns 0 if for each page in the given range a gmap mapping exists, - * the new access rights could be set and the notifier could be armed. - * If the gmap mapping is missing for one or more pages -EFAULT is - * returned. If no memory could be allocated -ENOMEM is returned. - * This function establishes missing page table entries. 
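
With gmap_protect_range() and gmap_mprotect_notify() removed, the
page-at-a-time loop moves to the caller of the new gmap_protect_one().
A simplified sketch of such a loop, modelled on the removed
gmap_protect_range() above (illustrative only; the in-tree caller lives
outside this file, handles the large-page stride more precisely, and
needs its own fault fixup since gmap_pte_op_fixup() is static here):

	while (len) {
		rc = gmap_protect_one(gmap, gaddr, prot, bits);
		if (rc == -EAGAIN) {
			/* Fault in the host mapping, then retry. */
			vmaddr = __gmap_translate(gmap, gaddr);
			if (IS_ERR_VALUE(vmaddr))
				return vmaddr;
			rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, prot);
			if (rc)
				return rc;
			continue;
		}
		if (rc < 0)
			return rc;	/* -EFAULT or -ENOMEM */
		gaddr += rc;		/* PAGE_SIZE or HPAGE_SIZE */
		len -= min_t(unsigned long, len, rc);
	}
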
- */ -int gmap_mprotect_notify(struct gmap *gmap, unsigned long gaddr, - unsigned long len, int prot) -{ - int rc; + if (!pmd_leaf(*pmdp)) { + rc = gmap_protect_pte(gmap, gaddr, pmdp, prot, bits); + if (!rc) + rc = PAGE_SIZE; + } else { + rc = gmap_protect_pmd(gmap, gaddr, pmdp, prot, bits); + if (!rc) + rc = HPAGE_SIZE; + } + gmap_pmd_op_end(gmap, pmdp); - if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK) || gmap_is_shadow(gmap)) - return -EINVAL; - if (!MACHINE_HAS_ESOP && prot == PROT_READ) - return -EINVAL; - mmap_read_lock(gmap->mm); - rc = gmap_protect_range(gmap, gaddr, len, prot, GMAP_NOTIFY_MPROT); - mmap_read_unlock(gmap->mm); return rc; } -EXPORT_SYMBOL_GPL(gmap_mprotect_notify); +EXPORT_SYMBOL_GPL(gmap_protect_one); /** * gmap_read_table - get an unsigned long value from a guest page table using @@ -1150,12 +1008,12 @@ int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val) if (pte_present(pte) && (pte_val(pte) & _PAGE_READ)) { address = pte_val(pte) & PAGE_MASK; address += gaddr & ~PAGE_MASK; - *val = *(unsigned long *) address; + *val = *(unsigned long *)__va(address); set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_YOUNG))); /* Do *NOT* clear the _PAGE_INVALID bit! */ rc = 0; } - gmap_pte_op_end(ptl); + gmap_pte_op_end(ptep, ptl); } if (!rc) break; @@ -1249,7 +1107,7 @@ static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr, if (!rc) gmap_insert_rmap(sg, vmaddr, rmap); spin_unlock(&sg->guest_table_lock); - gmap_pte_op_end(ptl); + gmap_pte_op_end(ptep, ptl); } radix_tree_preload_end(); if (rc) { @@ -1304,7 +1162,7 @@ static void gmap_unshadow_page(struct gmap *sg, unsigned long raddr) table = gmap_table_walk(sg, raddr, 0); /* get page table pointer */ if (!table || *table & _PAGE_INVALID) return; - gmap_call_notifier(sg, raddr, raddr + _PAGE_SIZE - 1); + gmap_call_notifier(sg, raddr, raddr + PAGE_SIZE - 1); ptep_unshadow_pte(sg->mm, raddr, (pte_t *) table); } @@ -1322,7 +1180,7 @@ static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr, int i; BUG_ON(!gmap_is_shadow(sg)); - for (i = 0; i < _PAGE_ENTRIES; i++, raddr += _PAGE_SIZE) + for (i = 0; i < _PAGE_ENTRIES; i++, raddr += PAGE_SIZE) pgt[i] = _PAGE_INVALID; } @@ -1335,23 +1193,23 @@ static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr, */ static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr) { - unsigned long sto, *ste, *pgt; - struct page *page; + unsigned long *ste; + phys_addr_t sto, pgt; + struct ptdesc *ptdesc; BUG_ON(!gmap_is_shadow(sg)); ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */ if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN)) return; gmap_call_notifier(sg, raddr, raddr + _SEGMENT_SIZE - 1); - sto = (unsigned long) (ste - ((raddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT)); + sto = __pa(ste - ((raddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT)); gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr); - pgt = (unsigned long *)(*ste & _SEGMENT_ENTRY_ORIGIN); + pgt = *ste & _SEGMENT_ENTRY_ORIGIN; *ste = _SEGMENT_ENTRY_EMPTY; - __gmap_unshadow_pgt(sg, raddr, pgt); + __gmap_unshadow_pgt(sg, raddr, __va(pgt)); /* Free page table */ - page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT); - list_del(&page->lru); - page_table_free_pgste(page); + ptdesc = page_ptdesc(phys_to_page(pgt)); + page_table_free_pgste(ptdesc); } /** @@ -1365,21 +1223,20 @@ static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr) static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr, unsigned long *sgt) { - unsigned long *pgt; - struct page *page; + struct ptdesc 
*ptdesc; + phys_addr_t pgt; int i; BUG_ON(!gmap_is_shadow(sg)); for (i = 0; i < _CRST_ENTRIES; i++, raddr += _SEGMENT_SIZE) { if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN)) continue; - pgt = (unsigned long *)(sgt[i] & _REGION_ENTRY_ORIGIN); + pgt = sgt[i] & _REGION_ENTRY_ORIGIN; sgt[i] = _SEGMENT_ENTRY_EMPTY; - __gmap_unshadow_pgt(sg, raddr, pgt); + __gmap_unshadow_pgt(sg, raddr, __va(pgt)); /* Free page table */ - page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT); - list_del(&page->lru); - page_table_free_pgste(page); + ptdesc = page_ptdesc(phys_to_page(pgt)); + page_table_free_pgste(ptdesc); } } @@ -1392,7 +1249,8 @@ static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr, */ static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr) { - unsigned long r3o, *r3e, *sgt; + unsigned long r3o, *r3e; + phys_addr_t sgt; struct page *page; BUG_ON(!gmap_is_shadow(sg)); @@ -1401,13 +1259,12 @@ static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr) return; gmap_call_notifier(sg, raddr, raddr + _REGION3_SIZE - 1); r3o = (unsigned long) (r3e - ((raddr & _REGION3_INDEX) >> _REGION3_SHIFT)); - gmap_idte_one(r3o | _ASCE_TYPE_REGION3, raddr); - sgt = (unsigned long *)(*r3e & _REGION_ENTRY_ORIGIN); + gmap_idte_one(__pa(r3o) | _ASCE_TYPE_REGION3, raddr); + sgt = *r3e & _REGION_ENTRY_ORIGIN; *r3e = _REGION3_ENTRY_EMPTY; - __gmap_unshadow_sgt(sg, raddr, sgt); + __gmap_unshadow_sgt(sg, raddr, __va(sgt)); /* Free segment table */ - page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT); - list_del(&page->lru); + page = phys_to_page(sgt); __free_pages(page, CRST_ALLOC_ORDER); } @@ -1422,20 +1279,19 @@ static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr) static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr, unsigned long *r3t) { - unsigned long *sgt; struct page *page; + phys_addr_t sgt; int i; BUG_ON(!gmap_is_shadow(sg)); for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION3_SIZE) { if (!(r3t[i] & _REGION_ENTRY_ORIGIN)) continue; - sgt = (unsigned long *)(r3t[i] & _REGION_ENTRY_ORIGIN); + sgt = r3t[i] & _REGION_ENTRY_ORIGIN; r3t[i] = _REGION3_ENTRY_EMPTY; - __gmap_unshadow_sgt(sg, raddr, sgt); + __gmap_unshadow_sgt(sg, raddr, __va(sgt)); /* Free segment table */ - page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT); - list_del(&page->lru); + page = phys_to_page(sgt); __free_pages(page, CRST_ALLOC_ORDER); } } @@ -1449,7 +1305,8 @@ static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr, */ static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr) { - unsigned long r2o, *r2e, *r3t; + unsigned long r2o, *r2e; + phys_addr_t r3t; struct page *page; BUG_ON(!gmap_is_shadow(sg)); @@ -1458,13 +1315,12 @@ static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr) return; gmap_call_notifier(sg, raddr, raddr + _REGION2_SIZE - 1); r2o = (unsigned long) (r2e - ((raddr & _REGION2_INDEX) >> _REGION2_SHIFT)); - gmap_idte_one(r2o | _ASCE_TYPE_REGION2, raddr); - r3t = (unsigned long *)(*r2e & _REGION_ENTRY_ORIGIN); + gmap_idte_one(__pa(r2o) | _ASCE_TYPE_REGION2, raddr); + r3t = *r2e & _REGION_ENTRY_ORIGIN; *r2e = _REGION2_ENTRY_EMPTY; - __gmap_unshadow_r3t(sg, raddr, r3t); + __gmap_unshadow_r3t(sg, raddr, __va(r3t)); /* Free region 3 table */ - page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT); - list_del(&page->lru); + page = phys_to_page(r3t); __free_pages(page, CRST_ALLOC_ORDER); } @@ -1479,7 +1335,7 @@ static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr) static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr, unsigned long *r2t) { - 
unsigned long *r3t; + phys_addr_t r3t; struct page *page; int i; @@ -1487,12 +1343,11 @@ static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr, for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION2_SIZE) { if (!(r2t[i] & _REGION_ENTRY_ORIGIN)) continue; - r3t = (unsigned long *)(r2t[i] & _REGION_ENTRY_ORIGIN); + r3t = r2t[i] & _REGION_ENTRY_ORIGIN; r2t[i] = _REGION2_ENTRY_EMPTY; - __gmap_unshadow_r3t(sg, raddr, r3t); + __gmap_unshadow_r3t(sg, raddr, __va(r3t)); /* Free region 3 table */ - page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT); - list_del(&page->lru); + page = phys_to_page(r3t); __free_pages(page, CRST_ALLOC_ORDER); } } @@ -1506,8 +1361,9 @@ static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr, */ static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr) { - unsigned long r1o, *r1e, *r2t; + unsigned long r1o, *r1e; struct page *page; + phys_addr_t r2t; BUG_ON(!gmap_is_shadow(sg)); r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */ @@ -1515,13 +1371,12 @@ static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr) return; gmap_call_notifier(sg, raddr, raddr + _REGION1_SIZE - 1); r1o = (unsigned long) (r1e - ((raddr & _REGION1_INDEX) >> _REGION1_SHIFT)); - gmap_idte_one(r1o | _ASCE_TYPE_REGION1, raddr); - r2t = (unsigned long *)(*r1e & _REGION_ENTRY_ORIGIN); + gmap_idte_one(__pa(r1o) | _ASCE_TYPE_REGION1, raddr); + r2t = *r1e & _REGION_ENTRY_ORIGIN; *r1e = _REGION1_ENTRY_EMPTY; - __gmap_unshadow_r2t(sg, raddr, r2t); + __gmap_unshadow_r2t(sg, raddr, __va(r2t)); /* Free region 2 table */ - page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT); - list_del(&page->lru); + page = phys_to_page(r2t); __free_pages(page, CRST_ALLOC_ORDER); } @@ -1536,23 +1391,23 @@ static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr) static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr, unsigned long *r1t) { - unsigned long asce, *r2t; + unsigned long asce; struct page *page; + phys_addr_t r2t; int i; BUG_ON(!gmap_is_shadow(sg)); - asce = (unsigned long) r1t | _ASCE_TYPE_REGION1; + asce = __pa(r1t) | _ASCE_TYPE_REGION1; for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION1_SIZE) { if (!(r1t[i] & _REGION_ENTRY_ORIGIN)) continue; - r2t = (unsigned long *)(r1t[i] & _REGION_ENTRY_ORIGIN); - __gmap_unshadow_r2t(sg, raddr, r2t); + r2t = r1t[i] & _REGION_ENTRY_ORIGIN; + __gmap_unshadow_r2t(sg, raddr, __va(r2t)); /* Clear entry and flush translation r1t -> r2t */ gmap_idte_one(asce, raddr); r1t[i] = _REGION1_ENTRY_EMPTY; /* Free region 2 table */ - page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT); - list_del(&page->lru); + page = phys_to_page(r2t); __free_pages(page, CRST_ALLOC_ORDER); } } @@ -1563,7 +1418,7 @@ static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr, * * Called with sg->guest_table_lock */ -static void gmap_unshadow(struct gmap *sg) +void gmap_unshadow(struct gmap *sg) { unsigned long *table; @@ -1573,7 +1428,7 @@ static void gmap_unshadow(struct gmap *sg) sg->removed = 1; gmap_call_notifier(sg, 0, -1UL); gmap_flush_tlb(sg); - table = (unsigned long *)(sg->asce & _ASCE_ORIGIN); + table = __va(sg->asce & _ASCE_ORIGIN); switch (sg->asce & _ASCE_TYPE_MASK) { case _ASCE_TYPE_REGION1: __gmap_unshadow_r1t(sg, 0, table); @@ -1589,142 +1444,7 @@ static void gmap_unshadow(struct gmap *sg) break; } } - -/** - * gmap_find_shadow - find a specific asce in the list of shadow tables - * @parent: pointer to the parent gmap - * @asce: ASCE for which the shadow table is created - * @edat_level: edat level to be used for the shadow 
translation - * - * Returns the pointer to a gmap if a shadow table with the given asce is - * already available, ERR_PTR(-EAGAIN) if another one is just being created, - * otherwise NULL - */ -static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce, - int edat_level) -{ - struct gmap *sg; - - list_for_each_entry(sg, &parent->children, list) { - if (sg->orig_asce != asce || sg->edat_level != edat_level || - sg->removed) - continue; - if (!sg->initialized) - return ERR_PTR(-EAGAIN); - refcount_inc(&sg->ref_count); - return sg; - } - return NULL; -} - -/** - * gmap_shadow_valid - check if a shadow guest address space matches the - * given properties and is still valid - * @sg: pointer to the shadow guest address space structure - * @asce: ASCE for which the shadow table is requested - * @edat_level: edat level to be used for the shadow translation - * - * Returns 1 if the gmap shadow is still valid and matches the given - * properties, the caller can continue using it. Returns 0 otherwise, the - * caller has to request a new shadow gmap in this case. - * - */ -int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level) -{ - if (sg->removed) - return 0; - return sg->orig_asce == asce && sg->edat_level == edat_level; -} -EXPORT_SYMBOL_GPL(gmap_shadow_valid); - -/** - * gmap_shadow - create/find a shadow guest address space - * @parent: pointer to the parent gmap - * @asce: ASCE for which the shadow table is created - * @edat_level: edat level to be used for the shadow translation - * - * The pages of the top level page table referred by the asce parameter - * will be set to read-only and marked in the PGSTEs of the kvm process. - * The shadow table will be removed automatically on any change to the - * PTE mapping for the source table. - * - * Returns a guest address space structure, ERR_PTR(-ENOMEM) if out of memory, - * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the - * parent gmap table could not be protected. 
- */ -struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, - int edat_level) -{ - struct gmap *sg, *new; - unsigned long limit; - int rc; - - BUG_ON(parent->mm->context.allow_gmap_hpage_1m); - BUG_ON(gmap_is_shadow(parent)); - spin_lock(&parent->shadow_lock); - sg = gmap_find_shadow(parent, asce, edat_level); - spin_unlock(&parent->shadow_lock); - if (sg) - return sg; - /* Create a new shadow gmap */ - limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11)); - if (asce & _ASCE_REAL_SPACE) - limit = -1UL; - new = gmap_alloc(limit); - if (!new) - return ERR_PTR(-ENOMEM); - new->mm = parent->mm; - new->parent = gmap_get(parent); - new->orig_asce = asce; - new->edat_level = edat_level; - new->initialized = false; - spin_lock(&parent->shadow_lock); - /* Recheck if another CPU created the same shadow */ - sg = gmap_find_shadow(parent, asce, edat_level); - if (sg) { - spin_unlock(&parent->shadow_lock); - gmap_free(new); - return sg; - } - if (asce & _ASCE_REAL_SPACE) { - /* only allow one real-space gmap shadow */ - list_for_each_entry(sg, &parent->children, list) { - if (sg->orig_asce & _ASCE_REAL_SPACE) { - spin_lock(&sg->guest_table_lock); - gmap_unshadow(sg); - spin_unlock(&sg->guest_table_lock); - list_del(&sg->list); - gmap_put(sg); - break; - } - } - } - refcount_set(&new->ref_count, 2); - list_add(&new->list, &parent->children); - if (asce & _ASCE_REAL_SPACE) { - /* nothing to protect, return right away */ - new->initialized = true; - spin_unlock(&parent->shadow_lock); - return new; - } - spin_unlock(&parent->shadow_lock); - /* protect after insertion, so it will get properly invalidated */ - mmap_read_lock(parent->mm); - rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN, - ((asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE, - PROT_READ, GMAP_NOTIFY_SHADOW); - mmap_read_unlock(parent->mm); - spin_lock(&parent->shadow_lock); - new->initialized = true; - if (rc) { - list_del(&new->list); - gmap_free(new); - new = ERR_PTR(rc); - } - spin_unlock(&parent->shadow_lock); - return new; -} -EXPORT_SYMBOL_GPL(gmap_shadow); +EXPORT_SYMBOL(gmap_unshadow); /** * gmap_shadow_r2t - create an empty shadow region 2 table @@ -1736,7 +1456,7 @@ EXPORT_SYMBOL_GPL(gmap_shadow); * The r2t parameter specifies the address of the source table. The * four pages of the source table are made read-only in the parent gmap * address space. A write to the source table area @r2t will automatically - * remove the shadow r2 table and all of its decendents. + * remove the shadow r2 table and all of its descendants. 
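 *
 * All four gmap_shadow_*t() helpers share the same two-phase install
 * protocol (a summary of the code below, not new behaviour):
 *
 *	1. allocate the shadow table (gmap_alloc_crst(), or
 *	   page_table_alloc_pgste() at the pte level) and hook it up
 *	   with the invalid bit still set;
 *	2. drop guest_table_lock and write-protect the source table in
 *	   the parent gmap via gmap_protect_rmap(), which may sleep;
 *	3. retake the lock, re-walk to the entry and clear the invalid
 *	   bit, unless an unshadow operation raced in between, in which
 *	   case -EAGAIN is returned. Fake tables reference a contiguous
 *	   guest memory block instead of a guest table, need no
 *	   protection, and skip step 2.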
* * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the * shadow table structure is incomplete, -ENOMEM if out of memory and @@ -1748,19 +1468,17 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, int fake) { unsigned long raddr, origin, offset, len; - unsigned long *s_r2t, *table; + unsigned long *table; + phys_addr_t s_r2t; struct page *page; int rc; BUG_ON(!gmap_is_shadow(sg)); /* Allocate a shadow region second table */ - page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); + page = gmap_alloc_crst(); if (!page) return -ENOMEM; - page->index = r2t & _REGION_ENTRY_ORIGIN; - if (fake) - page->index |= GMAP_SHADOW_FAKE_TABLE; - s_r2t = (unsigned long *) page_to_phys(page); + s_r2t = page_to_phys(page); /* Install shadow region second table */ spin_lock(&sg->guest_table_lock); table = gmap_table_walk(sg, saddr, 4); /* get region-1 pointer */ @@ -1775,13 +1493,12 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, rc = -EAGAIN; /* Race with shadow */ goto out_free; } - crst_table_init(s_r2t, _REGION2_ENTRY_EMPTY); + crst_table_init(__va(s_r2t), _REGION2_ENTRY_EMPTY); /* mark as invalid as long as the parent table is not protected */ - *table = (unsigned long) s_r2t | _REGION_ENTRY_LENGTH | + *table = s_r2t | _REGION_ENTRY_LENGTH | _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID; if (sg->edat_level >= 1) *table |= (r2t & _REGION_ENTRY_PROTECT); - list_add(&page->lru, &sg->crst_list); if (fake) { /* nothing to protect for fake tables */ *table &= ~_REGION_ENTRY_INVALID; @@ -1798,8 +1515,7 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, spin_lock(&sg->guest_table_lock); if (!rc) { table = gmap_table_walk(sg, saddr, 4); - if (!table || (*table & _REGION_ENTRY_ORIGIN) != - (unsigned long) s_r2t) + if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r2t) rc = -EAGAIN; /* Race with unshadow */ else *table &= ~_REGION_ENTRY_INVALID; @@ -1832,19 +1548,17 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, int fake) { unsigned long raddr, origin, offset, len; - unsigned long *s_r3t, *table; + unsigned long *table; + phys_addr_t s_r3t; struct page *page; int rc; BUG_ON(!gmap_is_shadow(sg)); /* Allocate a shadow region second table */ - page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); + page = gmap_alloc_crst(); if (!page) return -ENOMEM; - page->index = r3t & _REGION_ENTRY_ORIGIN; - if (fake) - page->index |= GMAP_SHADOW_FAKE_TABLE; - s_r3t = (unsigned long *) page_to_phys(page); + s_r3t = page_to_phys(page); /* Install shadow region second table */ spin_lock(&sg->guest_table_lock); table = gmap_table_walk(sg, saddr, 3); /* get region-2 pointer */ @@ -1859,13 +1573,12 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, rc = -EAGAIN; /* Race with shadow */ goto out_free; } - crst_table_init(s_r3t, _REGION3_ENTRY_EMPTY); + crst_table_init(__va(s_r3t), _REGION3_ENTRY_EMPTY); /* mark as invalid as long as the parent table is not protected */ - *table = (unsigned long) s_r3t | _REGION_ENTRY_LENGTH | + *table = s_r3t | _REGION_ENTRY_LENGTH | _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID; if (sg->edat_level >= 1) *table |= (r3t & _REGION_ENTRY_PROTECT); - list_add(&page->lru, &sg->crst_list); if (fake) { /* nothing to protect for fake tables */ *table &= ~_REGION_ENTRY_INVALID; @@ -1882,8 +1595,7 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, spin_lock(&sg->guest_table_lock); if (!rc) { table = 
gmap_table_walk(sg, saddr, 3); - if (!table || (*table & _REGION_ENTRY_ORIGIN) != - (unsigned long) s_r3t) + if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r3t) rc = -EAGAIN; /* Race with unshadow */ else *table &= ~_REGION_ENTRY_INVALID; @@ -1916,19 +1628,17 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, int fake) { unsigned long raddr, origin, offset, len; - unsigned long *s_sgt, *table; + unsigned long *table; + phys_addr_t s_sgt; struct page *page; int rc; BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE)); /* Allocate a shadow segment table */ - page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); + page = gmap_alloc_crst(); if (!page) return -ENOMEM; - page->index = sgt & _REGION_ENTRY_ORIGIN; - if (fake) - page->index |= GMAP_SHADOW_FAKE_TABLE; - s_sgt = (unsigned long *) page_to_phys(page); + s_sgt = page_to_phys(page); /* Install shadow region second table */ spin_lock(&sg->guest_table_lock); table = gmap_table_walk(sg, saddr, 2); /* get region-3 pointer */ @@ -1943,13 +1653,12 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, rc = -EAGAIN; /* Race with shadow */ goto out_free; } - crst_table_init(s_sgt, _SEGMENT_ENTRY_EMPTY); + crst_table_init(__va(s_sgt), _SEGMENT_ENTRY_EMPTY); /* mark as invalid as long as the parent table is not protected */ - *table = (unsigned long) s_sgt | _REGION_ENTRY_LENGTH | + *table = s_sgt | _REGION_ENTRY_LENGTH | _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID; if (sg->edat_level >= 1) *table |= sgt & _REGION_ENTRY_PROTECT; - list_add(&page->lru, &sg->crst_list); if (fake) { /* nothing to protect for fake tables */ *table &= ~_REGION_ENTRY_INVALID; @@ -1966,8 +1675,7 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, spin_lock(&sg->guest_table_lock); if (!rc) { table = gmap_table_walk(sg, saddr, 2); - if (!table || (*table & _REGION_ENTRY_ORIGIN) != - (unsigned long) s_sgt) + if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_sgt) rc = -EAGAIN; /* Race with unshadow */ else *table &= ~_REGION_ENTRY_INVALID; @@ -1983,45 +1691,22 @@ out_free: } EXPORT_SYMBOL_GPL(gmap_shadow_sgt); -/** - * gmap_shadow_pgt_lookup - find a shadow page table - * @sg: pointer to the shadow guest address space structure - * @saddr: the address in the shadow aguest address space - * @pgt: parent gmap address of the page table to get shadowed - * @dat_protection: if the pgtable is marked as protected by dat - * @fake: pgt references contiguous guest memory block, not a pgtable - * - * Returns 0 if the shadow page table was found and -EAGAIN if the page - * table was not found. - * - * Called with sg->mm->mmap_lock in read. 
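
With page->index and the gmap pt_list gone, the origin of the shadowed
guest page table is stashed 16 bits at a time in the ST2 software field
of the first four PGSTEs by gmap_pgste_set_pgt_addr() below. A
hypothetical read-back helper (illustrative; the matching in-tree lookup
is not part of this hunk) would invert that encoding:

	static unsigned long gmap_pgste_get_pgt_addr(unsigned long *pgt)
	{
		unsigned long *pgstes = pgt + _PAGE_ENTRIES;
		unsigned long res;

		res  = (pgstes[0] & PGSTE_ST2_MASK) << 16;	/* address bits 0..15 */
		res |=  pgstes[1] & PGSTE_ST2_MASK;		/* address bits 16..31 */
		res |= (pgstes[2] & PGSTE_ST2_MASK) >> 16;	/* address bits 32..47 */
		res |= (pgstes[3] & PGSTE_ST2_MASK) >> 32;	/* address bits 48..63 */
		return res;
	}
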
- */ -int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr, - unsigned long *pgt, int *dat_protection, - int *fake) +static void gmap_pgste_set_pgt_addr(struct ptdesc *ptdesc, unsigned long pgt_addr) { - unsigned long *table; - struct page *page; - int rc; + unsigned long *pgstes = page_to_virt(ptdesc_page(ptdesc)); - BUG_ON(!gmap_is_shadow(sg)); - spin_lock(&sg->guest_table_lock); - table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */ - if (table && !(*table & _SEGMENT_ENTRY_INVALID)) { - /* Shadow page tables are full pages (pte+pgste) */ - page = pfn_to_page(*table >> PAGE_SHIFT); - *pgt = page->index & ~GMAP_SHADOW_FAKE_TABLE; - *dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT); - *fake = !!(page->index & GMAP_SHADOW_FAKE_TABLE); - rc = 0; - } else { - rc = -EAGAIN; - } - spin_unlock(&sg->guest_table_lock); - return rc; + pgstes += _PAGE_ENTRIES; + + pgstes[0] &= ~PGSTE_ST2_MASK; + pgstes[1] &= ~PGSTE_ST2_MASK; + pgstes[2] &= ~PGSTE_ST2_MASK; + pgstes[3] &= ~PGSTE_ST2_MASK; + pgstes[0] |= (pgt_addr >> 16) & PGSTE_ST2_MASK; + pgstes[1] |= pgt_addr & PGSTE_ST2_MASK; + pgstes[2] |= (pgt_addr << 16) & PGSTE_ST2_MASK; + pgstes[3] |= (pgt_addr << 32) & PGSTE_ST2_MASK; } -EXPORT_SYMBOL_GPL(gmap_shadow_pgt_lookup); /** * gmap_shadow_pgt - instantiate a shadow page table @@ -2040,19 +1725,21 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, int fake) { unsigned long raddr, origin; - unsigned long *s_pgt, *table; - struct page *page; + unsigned long *table; + struct ptdesc *ptdesc; + phys_addr_t s_pgt; int rc; BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE)); /* Allocate a shadow page table */ - page = page_table_alloc_pgste(sg->mm); - if (!page) + ptdesc = page_table_alloc_pgste(sg->mm); + if (!ptdesc) return -ENOMEM; - page->index = pgt & _SEGMENT_ENTRY_ORIGIN; + origin = pgt & _SEGMENT_ENTRY_ORIGIN; if (fake) - page->index |= GMAP_SHADOW_FAKE_TABLE; - s_pgt = (unsigned long *) page_to_phys(page); + origin |= GMAP_SHADOW_FAKE_TABLE; + gmap_pgste_set_pgt_addr(ptdesc, origin); + s_pgt = page_to_phys(ptdesc_page(ptdesc)); /* Install shadow page table */ spin_lock(&sg->guest_table_lock); table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */ @@ -2070,7 +1757,6 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, /* mark as invalid as long as the parent table is not protected */ *table = (unsigned long) s_pgt | _SEGMENT_ENTRY | (pgt & _SEGMENT_ENTRY_PROTECT) | _SEGMENT_ENTRY_INVALID; - list_add(&page->lru, &sg->pt_list); if (fake) { /* nothing to protect for fake tables */ *table &= ~_SEGMENT_ENTRY_INVALID; @@ -2085,8 +1771,7 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, spin_lock(&sg->guest_table_lock); if (!rc) { table = gmap_table_walk(sg, saddr, 1); - if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) != - (unsigned long) s_pgt) + if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) != s_pgt) rc = -EAGAIN; /* Race with unshadow */ else *table &= ~_SEGMENT_ENTRY_INVALID; @@ -2097,7 +1782,7 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, return rc; out_free: spin_unlock(&sg->guest_table_lock); - page_table_free_pgste(page); + page_table_free_pgste(ptdesc); return rc; } @@ -2152,7 +1837,7 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte) tptep = (pte_t *) gmap_table_walk(sg, saddr, 0); if (!tptep) { spin_unlock(&sg->guest_table_lock); - gmap_pte_op_end(ptl); + gmap_pte_op_end(sptep, ptl); radix_tree_preload_end(); 
break; } @@ -2163,7 +1848,7 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte) rmap = NULL; rc = 0; } - gmap_pte_op_end(ptl); + gmap_pte_op_end(sptep, ptl); spin_unlock(&sg->guest_table_lock); } radix_tree_preload_end(); @@ -2249,7 +1934,6 @@ void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, pte_t *pte, unsigned long bits) { unsigned long offset, gaddr = 0; - unsigned long *table; struct gmap *gmap, *sg, *next; offset = ((unsigned long) pte) & (255 * sizeof(pte_t)); @@ -2257,12 +1941,9 @@ void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, rcu_read_lock(); list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { spin_lock(&gmap->guest_table_lock); - table = radix_tree_lookup(&gmap->host_to_guest, - vmaddr >> PMD_SHIFT); - if (table) - gaddr = __gmap_segment_gaddr(table) + offset; + gaddr = host_to_guest_lookup(gmap, vmaddr) + offset; spin_unlock(&gmap->guest_table_lock); - if (!table) + if (!IS_GADDR_VALID(gaddr)) continue; if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) { @@ -2302,13 +1983,11 @@ static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new, gaddr &= HPAGE_MASK; pmdp_notify_gmap(gmap, pmdp, gaddr); new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_GMAP_IN)); - if (MACHINE_HAS_TLB_GUEST) + if (machine_has_tlb_guest()) __pmdp_idte(gaddr, (pmd_t *)pmdp, IDTE_GUEST_ASCE, gmap->asce, IDTE_GLOBAL); - else if (MACHINE_HAS_IDTE) - __pmdp_idte(gaddr, (pmd_t *)pmdp, 0, 0, IDTE_GLOBAL); else - __pmdp_csp(pmdp); + __pmdp_idte(gaddr, (pmd_t *)pmdp, 0, 0, IDTE_GLOBAL); set_pmd(pmdp, new); } @@ -2322,15 +2001,14 @@ static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr, rcu_read_lock(); list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { spin_lock(&gmap->guest_table_lock); - pmdp = (pmd_t *)radix_tree_delete(&gmap->host_to_guest, - vmaddr >> PMD_SHIFT); + pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr); if (pmdp) { - gaddr = __gmap_segment_gaddr((unsigned long *)pmdp); pmdp_notify_gmap(gmap, pmdp, gaddr); WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | - _SEGMENT_ENTRY_GMAP_UC)); + _SEGMENT_ENTRY_GMAP_UC | + _SEGMENT_ENTRY)); if (purge) - __pmdp_csp(pmdp); + __pmdp_cspg(pmdp); set_pmd(pmdp, __pmd(_SEGMENT_ENTRY_EMPTY)); } spin_unlock(&gmap->guest_table_lock); @@ -2351,44 +2029,31 @@ void gmap_pmdp_invalidate(struct mm_struct *mm, unsigned long vmaddr) EXPORT_SYMBOL_GPL(gmap_pmdp_invalidate); /** - * gmap_pmdp_csp - csp all affected guest pmd entries - * @mm: pointer to the process mm_struct - * @vmaddr: virtual address in the process address space - */ -void gmap_pmdp_csp(struct mm_struct *mm, unsigned long vmaddr) -{ - gmap_pmdp_clear(mm, vmaddr, 1); -} -EXPORT_SYMBOL_GPL(gmap_pmdp_csp); - -/** * gmap_pmdp_idte_local - invalidate and clear a guest pmd entry * @mm: pointer to the process mm_struct * @vmaddr: virtual address in the process address space */ void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr) { - unsigned long *entry, gaddr; + unsigned long gaddr; struct gmap *gmap; pmd_t *pmdp; rcu_read_lock(); list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { spin_lock(&gmap->guest_table_lock); - entry = radix_tree_delete(&gmap->host_to_guest, - vmaddr >> PMD_SHIFT); - if (entry) { - pmdp = (pmd_t *)entry; - gaddr = __gmap_segment_gaddr(entry); + pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr); + if (pmdp) { pmdp_notify_gmap(gmap, pmdp, gaddr); - WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | - _SEGMENT_ENTRY_GMAP_UC)); - if 
(MACHINE_HAS_TLB_GUEST) + WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | + _SEGMENT_ENTRY_GMAP_UC | + _SEGMENT_ENTRY)); + if (machine_has_tlb_guest()) __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE, gmap->asce, IDTE_LOCAL); - else if (MACHINE_HAS_IDTE) + else __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_LOCAL); - *entry = _SEGMENT_ENTRY_EMPTY; + *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY); } spin_unlock(&gmap->guest_table_lock); } @@ -2403,29 +2068,25 @@ EXPORT_SYMBOL_GPL(gmap_pmdp_idte_local); */ void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr) { - unsigned long *entry, gaddr; + unsigned long gaddr; struct gmap *gmap; pmd_t *pmdp; rcu_read_lock(); list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { spin_lock(&gmap->guest_table_lock); - entry = radix_tree_delete(&gmap->host_to_guest, - vmaddr >> PMD_SHIFT); - if (entry) { - pmdp = (pmd_t *)entry; - gaddr = __gmap_segment_gaddr(entry); + pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr); + if (pmdp) { pmdp_notify_gmap(gmap, pmdp, gaddr); - WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | - _SEGMENT_ENTRY_GMAP_UC)); - if (MACHINE_HAS_TLB_GUEST) + WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | + _SEGMENT_ENTRY_GMAP_UC | + _SEGMENT_ENTRY)); + if (machine_has_tlb_guest()) __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE, gmap->asce, IDTE_GLOBAL); - else if (MACHINE_HAS_IDTE) - __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_GLOBAL); else - __pmdp_csp(pmdp); - *entry = _SEGMENT_ENTRY_EMPTY; + __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_GLOBAL); + *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY); } spin_unlock(&gmap->guest_table_lock); } @@ -2481,7 +2142,7 @@ void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4], if (!pmdp) return; - if (pmd_large(*pmdp)) { + if (pmd_leaf(*pmdp)) { if (gmap_test_and_clear_dirty_pmd(gmap, pmdp, gaddr)) bitmap_fill(bitmap, _PAGE_ENTRIES); } else { @@ -2491,7 +2152,7 @@ void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4], continue; if (ptep_test_and_clear_uc(gmap->mm, vmaddr, ptep)) set_bit(i, bitmap); - spin_unlock(ptl); + pte_unmap_unlock(ptep, ptl); } } gmap_pmd_op_end(gmap, pmdp); @@ -2510,6 +2171,7 @@ static int thp_split_walk_pmd_entry(pmd_t *pmd, unsigned long addr, static const struct mm_walk_ops thp_split_walk_ops = { .pmd_entry = thp_split_walk_pmd_entry, + .walk_lock = PGWALK_WRLOCK_VERIFY, }; static inline void thp_split_mm(struct mm_struct *mm) @@ -2518,8 +2180,7 @@ static inline void thp_split_mm(struct mm_struct *mm) VMA_ITERATOR(vmi, mm, 0); for_each_vma(vmi, vma) { - vma->vm_flags &= ~VM_HUGEPAGE; - vma->vm_flags |= VM_NOHUGEPAGE; + vm_flags_mod(vma, VM_NOHUGEPAGE, VM_HUGEPAGE); walk_page_vma(vma, &thp_split_walk_ops, NULL); } mm->def_flags |= VM_NOHUGEPAGE; @@ -2531,33 +2192,6 @@ static inline void thp_split_mm(struct mm_struct *mm) #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* - * Remove all empty zero pages from the mapping for lazy refaulting - * - This must be called after mm->context.has_pgste is set, to avoid - * future creation of zero pages - * - This must be called after THP was enabled - */ -static int __zap_zero_pages(pmd_t *pmd, unsigned long start, - unsigned long end, struct mm_walk *walk) -{ - unsigned long addr; - - for (addr = start; addr != end; addr += PAGE_SIZE) { - pte_t *ptep; - spinlock_t *ptl; - - ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); - if (is_zero_pfn(pte_pfn(*ptep))) - ptep_xchg_direct(walk->mm, addr, ptep, __pte(_PAGE_INVALID)); - pte_unmap_unlock(ptep, ptl); - } - return 0; -} - -static const struct 
mm_walk_ops zap_zero_walk_ops = { - .pmd_entry = __zap_zero_pages, -}; - -/* * switch on pgstes for its userspace process (for kvm) */ int s390_enable_sie(void) @@ -2567,37 +2201,15 @@ int s390_enable_sie(void) /* Do we have pgstes? if yes, we are done */ if (mm_has_pgste(mm)) return 0; - /* Fail if the page tables are 2K */ - if (!mm_alloc_pgste(mm)) - return -EINVAL; mmap_write_lock(mm); mm->context.has_pgste = 1; /* split thp mappings and disable thp for future mappings */ thp_split_mm(mm); - walk_page_range(mm, 0, TASK_SIZE, &zap_zero_walk_ops, NULL); mmap_write_unlock(mm); return 0; } EXPORT_SYMBOL_GPL(s390_enable_sie); -int gmap_mark_unmergeable(void) -{ - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - int ret; - VMA_ITERATOR(vmi, mm, 0); - - for_each_vma(vmi, vma) { - ret = ksm_madvise(vma, vma->vm_start, vma->vm_end, - MADV_UNMERGEABLE, &vma->vm_flags); - if (ret) - return ret; - } - mm->def_flags &= ~VM_MERGEABLE; - return 0; -} -EXPORT_SYMBOL_GPL(gmap_mark_unmergeable); - /* * Enable storage key handling from now on and initialize the storage * keys with the default key. @@ -2628,7 +2240,7 @@ static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr, { pmd_t *pmd = (pmd_t *)pte; unsigned long start, end; - struct page *page = pmd_page(*pmd); + struct folio *folio = page_folio(pmd_page(*pmd)); /* * The write check makes sure we do not set a key on shared @@ -2641,9 +2253,9 @@ static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr, return 0; start = pmd_val(*pmd) & HPAGE_MASK; - end = start + HPAGE_SIZE - 1; + end = start + HPAGE_SIZE; __storage_key_init_range(start, end); - set_bit(PG_arch_1, &page->flags); + set_bit(PG_arch_1, &folio->flags.f); cond_resched(); return 0; } @@ -2652,6 +2264,7 @@ static const struct mm_walk_ops enable_skey_walk_ops = { .hugetlb_entry = __s390_enable_skey_hugetlb, .pte_entry = __s390_enable_skey_pte, .pmd_entry = __s390_enable_skey_pmd, + .walk_lock = PGWALK_WRLOCK, }; int s390_enable_skey(void) @@ -2664,7 +2277,7 @@ int s390_enable_skey(void) goto out_up; mm->context.uses_skeys = 1; - rc = gmap_mark_unmergeable(); + rc = gmap_helper_disable_cow_sharing(); if (rc) { mm->context.uses_skeys = 0; goto out_up; @@ -2689,6 +2302,7 @@ static int __s390_reset_cmma(pte_t *pte, unsigned long addr, static const struct mm_walk_ops reset_cmma_walk_ops = { .pte_entry = __s390_reset_cmma, + .walk_lock = PGWALK_WRLOCK, }; void s390_reset_cmma(struct mm_struct *mm) @@ -2725,6 +2339,7 @@ static int s390_gather_pages(pte_t *ptep, unsigned long addr, static const struct mm_walk_ops gather_pages_ops = { .pte_entry = s390_gather_pages, + .walk_lock = PGWALK_RDLOCK, }; /* @@ -2733,13 +2348,15 @@ static const struct mm_walk_ops gather_pages_ops = { */ void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns) { + struct folio *folio; unsigned long i; for (i = 0; i < count; i++) { + folio = pfn_folio(pfns[i]); /* we always have an extra reference */ - uv_destroy_owned_page(pfn_to_phys(pfns[i])); + uv_destroy_folio(folio); /* get rid of the extra reference */ - put_page(pfn_to_page(pfns[i])); + folio_put(folio); cond_resched(); } } @@ -2780,52 +2397,12 @@ int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start, EXPORT_SYMBOL_GPL(__s390_uv_destroy_range); /** - * s390_unlist_old_asce - Remove the topmost level of page tables from the - * list of page tables of the gmap. 
- * @gmap: the gmap whose table is to be removed - * - * On s390x, KVM keeps a list of all pages containing the page tables of the - * gmap (the CRST list). This list is used at tear down time to free all - * pages that are now not needed anymore. - * - * This function removes the topmost page of the tree (the one pointed to by - * the ASCE) from the CRST list. - * - * This means that it will not be freed when the VM is torn down, and needs - * to be handled separately by the caller, unless a leak is actually - * intended. Notice that this function will only remove the page from the - * list, the page will still be used as a top level page table (and ASCE). - */ -void s390_unlist_old_asce(struct gmap *gmap) -{ - struct page *old; - - old = virt_to_page(gmap->table); - spin_lock(&gmap->guest_table_lock); - list_del(&old->lru); - /* - * Sometimes the topmost page might need to be "removed" multiple - * times, for example if the VM is rebooted into secure mode several - * times concurrently, or if s390_replace_asce fails after calling - * s390_remove_old_asce and is attempted again later. In that case - * the old asce has been removed from the list, and therefore it - * will not be freed when the VM terminates, but the ASCE is still - * in use and still pointed to. - * A subsequent call to replace_asce will follow the pointer and try - * to remove the same page from the list again. - * Therefore it's necessary that the page of the ASCE has valid - * pointers, so list_del can work (and do nothing) without - * dereferencing stale or invalid pointers. - */ - INIT_LIST_HEAD(&old->lru); - spin_unlock(&gmap->guest_table_lock); -} -EXPORT_SYMBOL_GPL(s390_unlist_old_asce); - -/** * s390_replace_asce - Try to replace the current ASCE of a gmap with a copy * @gmap: the gmap whose ASCE needs to be replaced * + * If the ASCE is a SEGMENT type then this function will return -EINVAL; + * replacing it would leave the pointers in the host_to_guest radix tree + * pointing to the wrong pages, causing use-after-free and memory corruption. * If the allocation of the new top level page table fails, the ASCE is not * replaced. * In any case, the old ASCE is always removed from the gmap CRST list. @@ -2838,23 +2415,16 @@ int s390_replace_asce(struct gmap *gmap) struct page *page; void *table; - s390_unlist_old_asce(gmap); + /* Replacing segment type ASCEs would cause serious issues */ + if ((gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT) + return -EINVAL; - page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); + page = gmap_alloc_crst(); if (!page) return -ENOMEM; table = page_to_virt(page); memcpy(table, gmap->table, 1UL << (CRST_ALLOC_ORDER + PAGE_SHIFT)); - /* - * The caller has to deal with the old ASCE, but here we make sure - * the new one is properly added to the CRST list, so that - * it will be freed when the VM is torn down. - */ - spin_lock(&gmap->guest_table_lock); - list_add(&page->lru, &gmap->crst_list); - spin_unlock(&gmap->guest_table_lock); - /* Set new table origin while preserving existing ASCE control bits */ asce = (gmap->asce & ~_ASCE_ORIGIN) | __pa(table); WRITE_ONCE(gmap->asce, asce); diff --git a/arch/s390/mm/gmap_helpers.c b/arch/s390/mm/gmap_helpers.c new file mode 100644 index 000000000000..549f14ad08af --- /dev/null +++ b/arch/s390/mm/gmap_helpers.c @@ -0,0 +1,233 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Helper functions for KVM guest address space mapping code + * + * Copyright IBM Corp.
2007, 2025 + */ + +#include <linux/export.h> +#include <linux/mm_types.h> +#include <linux/mmap_lock.h> +#include <linux/mm.h> +#include <linux/hugetlb.h> +#include <linux/swap.h> +#include <linux/leafops.h> +#include <linux/pagewalk.h> +#include <linux/ksm.h> +#include <asm/gmap_helpers.h> +#include <asm/pgtable.h> + +/** + * ptep_zap_softleaf_entry() - discard a software leaf entry. + * @mm: the mm + * @entry: the software leaf entry that needs to be zapped + * + * Discards the given software leaf entry. If the leaf entry was an actual + * swap entry (and not a migration entry, for example), the actual swapped + * page is also discarded from swap. + */ +static void ptep_zap_softleaf_entry(struct mm_struct *mm, softleaf_t entry) +{ + if (softleaf_is_swap(entry)) + dec_mm_counter(mm, MM_SWAPENTS); + else if (softleaf_is_migration(entry)) + dec_mm_counter(mm, mm_counter(softleaf_to_folio(entry))); + free_swap_and_cache(entry); +} + +/** + * gmap_helper_zap_one_page() - discard a page if it was swapped. + * @mm: the mm + * @vmaddr: the userspace virtual address that needs to be discarded + * + * If the given address maps to a swap entry, discard it. + * + * Context: needs to be called while holding the mmap lock. + */ +void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr) +{ + struct vm_area_struct *vma; + spinlock_t *ptl; + pgste_t pgste; + pte_t *ptep; + + mmap_assert_locked(mm); + + /* Find the vm address for the guest address */ + vma = vma_lookup(mm, vmaddr); + if (!vma || is_vm_hugetlb_page(vma)) + return; + + /* Get pointer to the page table entry */ + ptep = get_locked_pte(mm, vmaddr, &ptl); + if (unlikely(!ptep)) + return; + if (pte_swap(*ptep)) { + preempt_disable(); + pgste = pgste_get_lock(ptep); + + ptep_zap_softleaf_entry(mm, softleaf_from_pte(*ptep)); + pte_clear(mm, vmaddr, ptep); + + pgste_set_unlock(ptep, pgste); + preempt_enable(); + } + pte_unmap_unlock(ptep, ptl); +} +EXPORT_SYMBOL_GPL(gmap_helper_zap_one_page); + +/** + * gmap_helper_discard() - discard user pages in the given range + * @mm: the mm + * @vmaddr: starting userspace address + * @end: end address (first address outside the range) + * + * All userspace pages in the range [@vmaddr, @end) are discarded and unmapped. + * + * Context: needs to be called while holding the mmap lock. + */ +void gmap_helper_discard(struct mm_struct *mm, unsigned long vmaddr, unsigned long end) +{ + struct vm_area_struct *vma; + + mmap_assert_locked(mm); + + while (vmaddr < end) { + vma = find_vma_intersection(mm, vmaddr, end); + if (!vma) + return; + if (!is_vm_hugetlb_page(vma)) + zap_page_range_single(vma, vmaddr, min(end, vma->vm_end) - vmaddr, NULL); + vmaddr = vma->vm_end; + } +} +EXPORT_SYMBOL_GPL(gmap_helper_discard); + +static int find_zeropage_pte_entry(pte_t *pte, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + unsigned long *found_addr = walk->private; + + /* Return 1 if the page is a zeropage. */ + if (is_zero_pfn(pte_pfn(*pte))) { + /* + * Shared zeropage in e.g., a FS DAX mapping? We cannot do the + * right thing and likely don't care: FAULT_FLAG_UNSHARE + * currently only works in COW mappings, which is also where + * mm_forbids_zeropage() is checked.
+ */ + if (!is_cow_mapping(walk->vma->vm_flags)) + return -EFAULT; + + *found_addr = addr; + return 1; + } + return 0; +} + +static const struct mm_walk_ops find_zeropage_ops = { + .pte_entry = find_zeropage_pte_entry, + .walk_lock = PGWALK_WRLOCK, +}; + +/** + * __gmap_helper_unshare_zeropages() - unshare all shared zeropages + * @mm: the mm whose zero pages are to be unshared + * + * Unshare all shared zeropages, replacing them by anonymous pages. Note that + * we cannot simply zap all shared zeropages, because this could later + * trigger unexpected userfaultfd missing events. + * + * This must be called after mm->context.allow_cow_sharing was + * set to 0, to avoid future mappings of shared zeropages. + * + * mm contracts with s390 that, even if mm were to remove a page table + * and a racing walk_page_range_vma() calling pte_offset_map_lock() + * would fail, it will never insert a page table containing empty zero + * pages once mm_forbids_zeropage(mm), i.e. + * mm->context.allow_cow_sharing, is set to 0. + */ +static int __gmap_helper_unshare_zeropages(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); + unsigned long addr; + vm_fault_t fault; + int rc; + + for_each_vma(vmi, vma) { + /* + * We could only look at COW mappings, but it's more future + * proof to catch unexpected zeropages in other mappings and + * fail. + */ + if ((vma->vm_flags & VM_PFNMAP) || is_vm_hugetlb_page(vma)) + continue; + addr = vma->vm_start; + +retry: + rc = walk_page_range_vma(vma, addr, vma->vm_end, + &find_zeropage_ops, &addr); + if (rc < 0) + return rc; + else if (!rc) + continue; + + /* addr was updated by find_zeropage_pte_entry() */ + fault = handle_mm_fault(vma, addr, + FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE, + NULL); + if (fault & VM_FAULT_OOM) + return -ENOMEM; + /* + * See break_ksm(): even after handle_mm_fault() returned 0, we + * must start the lookup from the current address, because + * handle_mm_fault() may back out if there's any difficulty. + * + * VM_FAULT_SIGBUS and VM_FAULT_SIGSEGV are unexpected but + * maybe they could trigger in the future on concurrent + * truncation. In that case, the shared zeropage would be gone + * and we can simply retry and make progress. + */ + cond_resched(); + goto retry; + } + + return 0; +} + +/** + * gmap_helper_disable_cow_sharing() - disable all COW sharing + * + * Disable most COW-sharing of memory pages for the whole process: + * (1) Disable KSM and unmerge/unshare any KSM pages. + * (2) Disallow shared zeropages and unshare any zeropages that are mapped. + * + * Note that we currently don't bother with COW-shared pages that are shared + * with parent/child processes due to fork(). + */ +int gmap_helper_disable_cow_sharing(void) +{ + struct mm_struct *mm = current->mm; + int rc; + + mmap_assert_write_locked(mm); + + if (!mm->context.allow_cow_sharing) + return 0; + + mm->context.allow_cow_sharing = 0; + + /* Replace all shared zeropages by anonymous pages. */ + rc = __gmap_helper_unshare_zeropages(mm); + /* + * Make sure to disable KSM (if enabled for the whole process or + * individual VMAs). Note that nothing currently hinders user space + * from re-enabling it.
+ */ + if (!rc) + rc = ksm_disable(mm); + if (rc) + mm->context.allow_cow_sharing = 1; + return rc; +} +EXPORT_SYMBOL_GPL(gmap_helper_disable_cow_sharing); diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index c299a18273ff..d42e61c7594e 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -6,15 +6,15 @@ * Author(s): Gerald Schaefer <gerald.schaefer@de.ibm.com> */ -#define KMSG_COMPONENT "hugetlb" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "hugetlb: " fmt -#include <asm/pgalloc.h> +#include <linux/cpufeature.h> #include <linux/mm.h> #include <linux/hugetlb.h> #include <linux/mman.h> #include <linux/sched/mm.h> #include <linux/security.h> +#include <asm/pgalloc.h> /* * If the bit selected by single-bit bitmask "a" is set within "x", move @@ -24,6 +24,7 @@ static inline unsigned long __pte_to_rste(pte_t pte) { + swp_entry_t arch_entry; unsigned long rste; /* @@ -48,6 +49,7 @@ static inline unsigned long __pte_to_rste(pte_t pte) */ if (pte_present(pte)) { rste = pte_val(pte) & PAGE_MASK; + rste |= _SEGMENT_ENTRY_PRESENT; rste |= move_set_bit(pte_val(pte), _PAGE_READ, _SEGMENT_ENTRY_READ); rste |= move_set_bit(pte_val(pte), _PAGE_WRITE, @@ -66,6 +68,10 @@ static inline unsigned long __pte_to_rste(pte_t pte) #endif rste |= move_set_bit(pte_val(pte), _PAGE_NOEXEC, _SEGMENT_ENTRY_NOEXEC); + } else if (!pte_none(pte)) { + /* swap pte */ + arch_entry = __pte_to_swp_entry(pte); + rste = mk_swap_rste(__swp_type(arch_entry), __swp_offset(arch_entry)); } else rste = _SEGMENT_ENTRY_EMPTY; return rste; @@ -73,13 +79,18 @@ static inline unsigned long __pte_to_rste(pte_t pte) static inline pte_t __rste_to_pte(unsigned long rste) { + swp_entry_t arch_entry; unsigned long pteval; - int present; + int present, none; + pte_t pte; - if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) + if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) { present = pud_present(__pud(rste)); - else + none = pud_none(__pud(rste)); + } else { present = pmd_present(__pmd(rste)); + none = pmd_none(__pmd(rste)); + } /* * Convert encoding pmd / pud bits pte bits @@ -114,6 +125,11 @@ static inline pte_t __rste_to_pte(unsigned long rste) pteval |= move_set_bit(rste, _SEGMENT_ENTRY_SOFT_DIRTY, _PAGE_SOFT_DIRTY); #endif pteval |= move_set_bit(rste, _SEGMENT_ENTRY_NOEXEC, _PAGE_NOEXEC); + } else if (!none) { + /* swap rste */ + arch_entry = __rste_to_swp_entry(rste); + pte = mk_swap_pte(__swp_type_rste(arch_entry), __swp_offset_rste(arch_entry)); + pteval = pte_val(pte); } else pteval = _PAGE_INVALID; return __pte(pteval); @@ -121,7 +137,7 @@ static inline pte_t __rste_to_pte(unsigned long rste) static void clear_huge_pte_skeys(struct mm_struct *mm, unsigned long rste) { - struct page *page; + struct folio *folio; unsigned long size, paddr; if (!mm_uses_skeys(mm) || @@ -129,27 +145,25 @@ static void clear_huge_pte_skeys(struct mm_struct *mm, unsigned long rste) return; if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) { - page = pud_page(__pud(rste)); + folio = page_folio(pud_page(__pud(rste))); size = PUD_SIZE; paddr = rste & PUD_MASK; } else { - page = pmd_page(__pmd(rste)); + folio = page_folio(pmd_page(__pmd(rste))); size = PMD_SIZE; paddr = rste & PMD_MASK; } - if (!test_and_set_bit(PG_arch_1, &page->flags)) - __storage_key_init_range(paddr, paddr + size - 1); + if (!test_and_set_bit(PG_arch_1, &folio->flags.f)) + __storage_key_init_range(paddr, paddr + size); } -void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, +void 
__set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { unsigned long rste; rste = __pte_to_rste(pte); - if (!MACHINE_HAS_NX) - rste &= ~_SEGMENT_ENTRY_NOEXEC; /* Set correct table type for 2G hugepages */ if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) { @@ -163,15 +177,21 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, set_pte(ptep, __pte(rste)); } -pte_t huge_ptep_get(pte_t *ptep) +void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned long sz) +{ + __set_huge_pte_at(mm, addr, ptep, pte); +} + +pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { return __rste_to_pte(pte_val(*ptep)); } -pte_t huge_ptep_get_and_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) +pte_t __huge_ptep_get_and_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) { - pte_t pte = huge_ptep_get(ptep); + pte_t pte = huge_ptep_get(mm, addr, ptep); pmd_t *pmdp = (pmd_t *) ptep; pud_t *pudp = (pud_t *) ptep; @@ -217,120 +237,21 @@ pte_t *huge_pte_offset(struct mm_struct *mm, p4dp = p4d_offset(pgdp, addr); if (p4d_present(*p4dp)) { pudp = pud_offset(p4dp, addr); - if (pud_present(*pudp)) { - if (pud_large(*pudp)) - return (pte_t *) pudp; + if (sz == PUD_SIZE) + return (pte_t *)pudp; + if (pud_present(*pudp)) pmdp = pmd_offset(pudp, addr); - } } } return (pte_t *) pmdp; } -int pmd_huge(pmd_t pmd) -{ - return pmd_large(pmd); -} - -int pud_huge(pud_t pud) -{ - return pud_large(pud); -} - bool __init arch_hugetlb_valid_size(unsigned long size) { - if (MACHINE_HAS_EDAT1 && size == PMD_SIZE) + if (cpu_has_edat1() && size == PMD_SIZE) return true; - else if (MACHINE_HAS_EDAT2 && size == PUD_SIZE) + else if (cpu_has_edat2() && size == PUD_SIZE) return true; else return false; } - -static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, - unsigned long addr, unsigned long len, - unsigned long pgoff, unsigned long flags) -{ - struct hstate *h = hstate_file(file); - struct vm_unmapped_area_info info; - - info.flags = 0; - info.length = len; - info.low_limit = current->mm->mmap_base; - info.high_limit = TASK_SIZE; - info.align_mask = PAGE_MASK & ~huge_page_mask(h); - info.align_offset = 0; - return vm_unmapped_area(&info); -} - -static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, - unsigned long addr0, unsigned long len, - unsigned long pgoff, unsigned long flags) -{ - struct hstate *h = hstate_file(file); - struct vm_unmapped_area_info info; - unsigned long addr; - - info.flags = VM_UNMAPPED_AREA_TOPDOWN; - info.length = len; - info.low_limit = max(PAGE_SIZE, mmap_min_addr); - info.high_limit = current->mm->mmap_base; - info.align_mask = PAGE_MASK & ~huge_page_mask(h); - info.align_offset = 0; - addr = vm_unmapped_area(&info); - - /* - * A failed mmap() very likely causes application failure, - * so fall back to the bottom-up function here. This scenario - * can happen with large stack limits and large mmap() - * allocations. 
- */ - if (addr & ~PAGE_MASK) { - VM_BUG_ON(addr != -ENOMEM); - info.flags = 0; - info.low_limit = TASK_UNMAPPED_BASE; - info.high_limit = TASK_SIZE; - addr = vm_unmapped_area(&info); - } - - return addr; -} - -unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) -{ - struct hstate *h = hstate_file(file); - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - - if (len & ~huge_page_mask(h)) - return -EINVAL; - if (len > TASK_SIZE - mmap_min_addr) - return -ENOMEM; - - if (flags & MAP_FIXED) { - if (prepare_hugepage_range(file, addr, len)) - return -EINVAL; - goto check_asce_limit; - } - - if (addr) { - addr = ALIGN(addr, huge_page_size(h)); - vma = find_vma(mm, addr); - if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && - (!vma || addr + len <= vm_start_gap(vma))) - goto check_asce_limit; - } - - if (mm->get_unmapped_area == arch_get_unmapped_area) - addr = hugetlb_get_unmapped_area_bottomup(file, addr, len, - pgoff, flags); - else - addr = hugetlb_get_unmapped_area_topdown(file, addr, len, - pgoff, flags); - if (offset_in_page(addr)) - return addr; - -check_asce_limit: - return check_asce_limit(mm, addr, len); -} diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 97d66a3e60fb..e4953453d254 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -8,6 +8,7 @@ * Copyright (C) 1995 Linus Torvalds */ +#include <linux/cpufeature.h> #include <linux/signal.h> #include <linux/sched.h> #include <linux/kernel.h> @@ -31,17 +32,16 @@ #include <linux/cma.h> #include <linux/gfp.h> #include <linux/dma-direct.h> +#include <linux/percpu.h> #include <asm/processor.h> #include <linux/uaccess.h> #include <asm/pgalloc.h> +#include <asm/ctlreg.h> #include <asm/kfence.h> -#include <asm/ptdump.h> #include <asm/dma.h> #include <asm/abs_lowcore.h> -#include <asm/tlb.h> #include <asm/tlbflush.h> #include <asm/sections.h> -#include <asm/ctl_reg.h> #include <asm/sclp.h> #include <asm/set_memory.h> #include <asm/kasan.h> @@ -49,11 +49,21 @@ #include <asm/uv.h> #include <linux/virtio_anchor.h> #include <linux/virtio_config.h> +#include <linux/execmem.h> pgd_t swapper_pg_dir[PTRS_PER_PGD] __section(".bss..swapper_pg_dir"); -static pgd_t invalid_pg_dir[PTRS_PER_PGD] __section(".bss..invalid_pg_dir"); +pgd_t invalid_pg_dir[PTRS_PER_PGD] __section(".bss..invalid_pg_dir"); -unsigned long s390_invalid_asce; +struct ctlreg __bootdata_preserved(s390_invalid_asce); + +unsigned long __bootdata_preserved(page_noexec_mask); +EXPORT_SYMBOL(page_noexec_mask); + +unsigned long __bootdata_preserved(segment_noexec_mask); +EXPORT_SYMBOL(segment_noexec_mask); + +unsigned long __bootdata_preserved(region_noexec_mask); +EXPORT_SYMBOL(region_noexec_mask); unsigned long empty_zero_page, zero_page_mask; EXPORT_SYMBOL(empty_zero_page); @@ -61,27 +71,17 @@ EXPORT_SYMBOL(zero_page_mask); static void __init setup_zero_pages(void) { + unsigned long total_pages = memblock_estimated_nr_free_pages(); unsigned int order; - struct page *page; - int i; /* Latest machines require a mapping granularity of 512KB */ order = 7; /* Limit number of empty zero pages for small memory sizes */ - while (order > 2 && (totalram_pages() >> 10) < (1UL << order)) + while (order > 2 && (total_pages >> 10) < (1UL << order)) order--; - empty_zero_page = __get_free_pages(GFP_KERNEL | __GFP_ZERO, order); - if (!empty_zero_page) - panic("Out of memory in setup_zero_pages"); - - page = virt_to_page((void *) empty_zero_page); - split_page(page, order); - 
for (i = 1 << order; i > 0; i--) { - mark_page_reserved(page); - page++; - } + empty_zero_page = (unsigned long)memblock_alloc_or_panic(PAGE_SIZE << order, PAGE_SIZE); zero_page_mask = ((PAGE_SIZE << order) - 1) & PAGE_MASK; } @@ -92,41 +92,12 @@ static void __init setup_zero_pages(void) void __init paging_init(void) { unsigned long max_zone_pfns[MAX_NR_ZONES]; - unsigned long pgd_type, asce_bits; - psw_t psw; - - s390_invalid_asce = (unsigned long)invalid_pg_dir; - s390_invalid_asce |= _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH; - crst_table_init((unsigned long *)invalid_pg_dir, _REGION3_ENTRY_EMPTY); - init_mm.pgd = swapper_pg_dir; - if (VMALLOC_END > _REGION2_SIZE) { - asce_bits = _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH; - pgd_type = _REGION2_ENTRY_EMPTY; - } else { - asce_bits = _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH; - pgd_type = _REGION3_ENTRY_EMPTY; - } - init_mm.context.asce = (__pa(init_mm.pgd) & PAGE_MASK) | asce_bits; - S390_lowcore.kernel_asce = init_mm.context.asce; - S390_lowcore.user_asce = s390_invalid_asce; - crst_table_init((unsigned long *) init_mm.pgd, pgd_type); - vmem_map_init(); - kasan_copy_shadow_mapping(); - - /* enable virtual mapping in kernel mode */ - __ctl_load(S390_lowcore.kernel_asce, 1, 1); - __ctl_load(S390_lowcore.user_asce, 7, 7); - __ctl_load(S390_lowcore.kernel_asce, 13, 13); - psw.mask = __extract_psw(); - psw_bits(psw).dat = 1; - psw_bits(psw).as = PSW_BITS_AS_HOME; - __load_psw_mask(psw.mask); - kasan_free_early_identity(); + vmem_map_init(); sparse_init(); - zone_dma_bits = 31; + zone_dma_limit = DMA_BIT_MASK(31); memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); - max_zone_pfns[ZONE_DMA] = PFN_DOWN(MAX_DMA_ADDRESS); + max_zone_pfns[ZONE_DMA] = virt_to_pfn(MAX_DMA_ADDRESS); max_zone_pfns[ZONE_NORMAL] = max_low_pfn; free_area_init(max_zone_pfns); } @@ -135,30 +106,31 @@ void mark_rodata_ro(void) { unsigned long size = __end_ro_after_init - __start_ro_after_init; - set_memory_ro((unsigned long)__start_ro_after_init, size >> PAGE_SHIFT); + if (cpu_has_nx()) + system_ctl_set_bit(0, CR0_INSTRUCTION_EXEC_PROTECTION_BIT); + __set_memory_ro(__start_ro_after_init, __end_ro_after_init); pr_info("Write protected read-only-after-init data: %luk\n", size >> 10); - debug_checkwx(); } -int set_memory_encrypted(unsigned long addr, int numpages) +int set_memory_encrypted(unsigned long vaddr, int numpages) { int i; /* make specified pages unshared (swiotlb, dma_free) */ for (i = 0; i < numpages; ++i) { - uv_remove_shared(addr); - addr += PAGE_SIZE; + uv_remove_shared(virt_to_phys((void *)vaddr)); + vaddr += PAGE_SIZE; } return 0; } -int set_memory_decrypted(unsigned long addr, int numpages) +int set_memory_decrypted(unsigned long vaddr, int numpages) { int i; /* make specified pages shared (swiotlb, dma_alloc) */ for (i = 0; i < numpages; ++i) { - uv_set_shared(addr); - addr += PAGE_SIZE; + uv_set_shared(virt_to_phys((void *)vaddr)); + vaddr += PAGE_SIZE; } return 0; } @@ -170,7 +142,7 @@ bool force_dma_unencrypted(struct device *dev) } /* protected virtualization */ -static void pv_init(void) +static void __init pv_init(void) { if (!is_prot_virt_guest()) return; @@ -182,35 +154,14 @@ static void pv_init(void) swiotlb_update_mem_attributes(); } -void __init mem_init(void) +void __init arch_mm_preinit(void) { cpumask_set_cpu(0, &init_mm.context.cpu_attach_mask); cpumask_set_cpu(0, mm_cpumask(&init_mm)); - set_max_mapnr(max_low_pfn); - high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); - pv_init(); - kfence_split_mapping(); - /* Setup guest page hinting */ -
cmma_init(); - /* this will put all low memory onto the freelists */ - memblock_free_all(); setup_zero_pages(); /* Setup zeroed pages. */ - - cmma_init_nodat(); -} - -void free_initmem(void) -{ - __set_memory((unsigned long)_sinittext, - (unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT, - SET_MEMORY_RW | SET_MEMORY_NX); - free_reserved_area(sclp_early_sccb, - sclp_early_sccb + EXT_SCCB_READ_SCP, - POISON_FREE_INITMEM, "unused early sccb"); - free_initmem_default(POISON_FREE_INITMEM); } unsigned long memory_block_size_bytes(void) @@ -222,6 +173,41 @@ unsigned long memory_block_size_bytes(void) return max_t(unsigned long, MIN_MEMORY_BLOCK_SIZE, sclp.rzm); } +unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; +EXPORT_SYMBOL(__per_cpu_offset); + +static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) +{ + return LOCAL_DISTANCE; +} + +static int __init pcpu_cpu_to_node(int cpu) +{ + return 0; +} + +void __init setup_per_cpu_areas(void) +{ + unsigned long delta; + unsigned int cpu; + int rc; + + /* + * Always reserve area for module percpu variables. That's + * what the legacy allocator did. + */ + rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, + PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, + pcpu_cpu_distance, + pcpu_cpu_to_node); + if (rc < 0) + panic("Failed to initialize percpu areas."); + + delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; + for_each_possible_cpu(cpu) + __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu]; +} + #ifdef CONFIG_MEMORY_HOTPLUG #ifdef CONFIG_CMA @@ -236,16 +222,13 @@ struct s390_cma_mem_data { static int s390_cma_check_range(struct cma *cma, void *data) { struct s390_cma_mem_data *mem_data; - unsigned long start, end; mem_data = data; - start = cma_get_base(cma); - end = start + cma_get_size(cma); - if (end < mem_data->start) - return 0; - if (start >= mem_data->end) - return 0; - return -EBUSY; + + if (cma_intersects(cma, mem_data->start, mem_data->end)) + return -EBUSY; + + return 0; } static int s390_cma_mem_notifier(struct notifier_block *nb, @@ -282,10 +265,7 @@ int arch_add_memory(int nid, u64 start, u64 size, unsigned long size_pages = PFN_DOWN(size); int rc; - if (WARN_ON_ONCE(params->altmap)) - return -EINVAL; - - if (WARN_ON_ONCE(params->pgprot.pgprot != PAGE_KERNEL.pgprot)) + if (WARN_ON_ONCE(pgprot_val(params->pgprot) != pgprot_val(PAGE_KERNEL))) return -EINVAL; VM_BUG_ON(!mhp_range_allowed(start, size, true)); @@ -308,3 +288,32 @@ void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) vmem_remove_mapping(start, size); } #endif /* CONFIG_MEMORY_HOTPLUG */ + +#ifdef CONFIG_EXECMEM +static struct execmem_info execmem_info __ro_after_init; + +struct execmem_info __init *execmem_arch_setup(void) +{ + unsigned long module_load_offset = 0; + unsigned long start; + + if (kaslr_enabled()) + module_load_offset = get_random_u32_inclusive(1, 1024) * PAGE_SIZE; + + start = MODULES_VADDR + module_load_offset; + + execmem_info = (struct execmem_info){ + .ranges = { + [EXECMEM_DEFAULT] = { + .flags = EXECMEM_KASAN_SHADOW, + .start = start, + .end = MODULES_END, + .pgprot = PAGE_KERNEL, + .alignment = MODULE_ALIGN, + }, + }, + }; + + return &execmem_info; +} +#endif /* CONFIG_EXECMEM */ diff --git a/arch/s390/mm/kasan_init.c b/arch/s390/mm/kasan_init.c deleted file mode 100644 index 9f988d4582ed..000000000000 --- a/arch/s390/mm/kasan_init.c +++ /dev/null @@ -1,403 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/kasan.h> -#include <linux/sched/task.h> -#include <linux/memblock.h> -#include 
<linux/pgtable.h> -#include <asm/pgalloc.h> -#include <asm/kasan.h> -#include <asm/mem_detect.h> -#include <asm/processor.h> -#include <asm/sclp.h> -#include <asm/facility.h> -#include <asm/sections.h> -#include <asm/setup.h> -#include <asm/uv.h> - -static unsigned long segment_pos __initdata; -static unsigned long segment_low __initdata; -static unsigned long pgalloc_pos __initdata; -static unsigned long pgalloc_low __initdata; -static unsigned long pgalloc_freeable __initdata; -static bool has_edat __initdata; -static bool has_nx __initdata; - -#define __sha(x) ((unsigned long)kasan_mem_to_shadow((void *)x)) - -static pgd_t early_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE); - -static void __init kasan_early_panic(const char *reason) -{ - sclp_early_printk("The Linux kernel failed to boot with the KernelAddressSanitizer:\n"); - sclp_early_printk(reason); - disabled_wait(); -} - -static void * __init kasan_early_alloc_segment(void) -{ - segment_pos -= _SEGMENT_SIZE; - - if (segment_pos < segment_low) - kasan_early_panic("out of memory during initialisation\n"); - - return (void *)segment_pos; -} - -static void * __init kasan_early_alloc_pages(unsigned int order) -{ - pgalloc_pos -= (PAGE_SIZE << order); - - if (pgalloc_pos < pgalloc_low) - kasan_early_panic("out of memory during initialisation\n"); - - return (void *)pgalloc_pos; -} - -static void * __init kasan_early_crst_alloc(unsigned long val) -{ - unsigned long *table; - - table = kasan_early_alloc_pages(CRST_ALLOC_ORDER); - if (table) - crst_table_init(table, val); - return table; -} - -static pte_t * __init kasan_early_pte_alloc(void) -{ - static void *pte_leftover; - pte_t *pte; - - BUILD_BUG_ON(_PAGE_TABLE_SIZE * 2 != PAGE_SIZE); - - if (!pte_leftover) { - pte_leftover = kasan_early_alloc_pages(0); - pte = pte_leftover + _PAGE_TABLE_SIZE; - } else { - pte = pte_leftover; - pte_leftover = NULL; - } - memset64((u64 *)pte, _PAGE_INVALID, PTRS_PER_PTE); - return pte; -} - -enum populate_mode { - POPULATE_ONE2ONE, - POPULATE_MAP, - POPULATE_ZERO_SHADOW, - POPULATE_SHALLOW -}; -static void __init kasan_early_pgtable_populate(unsigned long address, - unsigned long end, - enum populate_mode mode) -{ - unsigned long pgt_prot_zero, pgt_prot, sgt_prot; - pgd_t *pg_dir; - p4d_t *p4_dir; - pud_t *pu_dir; - pmd_t *pm_dir; - pte_t *pt_dir; - - pgt_prot_zero = pgprot_val(PAGE_KERNEL_RO); - if (!has_nx) - pgt_prot_zero &= ~_PAGE_NOEXEC; - pgt_prot = pgprot_val(PAGE_KERNEL); - sgt_prot = pgprot_val(SEGMENT_KERNEL); - if (!has_nx || mode == POPULATE_ONE2ONE) { - pgt_prot &= ~_PAGE_NOEXEC; - sgt_prot &= ~_SEGMENT_ENTRY_NOEXEC; - } - - /* - * The first 1MB of 1:1 mapping is mapped with 4KB pages - */ - while (address < end) { - pg_dir = pgd_offset_k(address); - if (pgd_none(*pg_dir)) { - if (mode == POPULATE_ZERO_SHADOW && - IS_ALIGNED(address, PGDIR_SIZE) && - end - address >= PGDIR_SIZE) { - pgd_populate(&init_mm, pg_dir, - kasan_early_shadow_p4d); - address = (address + PGDIR_SIZE) & PGDIR_MASK; - continue; - } - p4_dir = kasan_early_crst_alloc(_REGION2_ENTRY_EMPTY); - pgd_populate(&init_mm, pg_dir, p4_dir); - } - - if (mode == POPULATE_SHALLOW) { - address = (address + P4D_SIZE) & P4D_MASK; - continue; - } - - p4_dir = p4d_offset(pg_dir, address); - if (p4d_none(*p4_dir)) { - if (mode == POPULATE_ZERO_SHADOW && - IS_ALIGNED(address, P4D_SIZE) && - end - address >= P4D_SIZE) { - p4d_populate(&init_mm, p4_dir, - kasan_early_shadow_pud); - address = (address + P4D_SIZE) & P4D_MASK; - continue; - } - pu_dir = 
kasan_early_crst_alloc(_REGION3_ENTRY_EMPTY); - p4d_populate(&init_mm, p4_dir, pu_dir); - } - - pu_dir = pud_offset(p4_dir, address); - if (pud_none(*pu_dir)) { - if (mode == POPULATE_ZERO_SHADOW && - IS_ALIGNED(address, PUD_SIZE) && - end - address >= PUD_SIZE) { - pud_populate(&init_mm, pu_dir, - kasan_early_shadow_pmd); - address = (address + PUD_SIZE) & PUD_MASK; - continue; - } - pm_dir = kasan_early_crst_alloc(_SEGMENT_ENTRY_EMPTY); - pud_populate(&init_mm, pu_dir, pm_dir); - } - - pm_dir = pmd_offset(pu_dir, address); - if (pmd_none(*pm_dir)) { - if (IS_ALIGNED(address, PMD_SIZE) && - end - address >= PMD_SIZE) { - if (mode == POPULATE_ZERO_SHADOW) { - pmd_populate(&init_mm, pm_dir, kasan_early_shadow_pte); - address = (address + PMD_SIZE) & PMD_MASK; - continue; - } else if (has_edat && address) { - void *page; - - if (mode == POPULATE_ONE2ONE) { - page = (void *)address; - } else { - page = kasan_early_alloc_segment(); - memset(page, 0, _SEGMENT_SIZE); - } - set_pmd(pm_dir, __pmd(__pa(page) | sgt_prot)); - address = (address + PMD_SIZE) & PMD_MASK; - continue; - } - } - pt_dir = kasan_early_pte_alloc(); - pmd_populate(&init_mm, pm_dir, pt_dir); - } else if (pmd_large(*pm_dir)) { - address = (address + PMD_SIZE) & PMD_MASK; - continue; - } - - pt_dir = pte_offset_kernel(pm_dir, address); - if (pte_none(*pt_dir)) { - void *page; - - switch (mode) { - case POPULATE_ONE2ONE: - page = (void *)address; - set_pte(pt_dir, __pte(__pa(page) | pgt_prot)); - break; - case POPULATE_MAP: - page = kasan_early_alloc_pages(0); - memset(page, 0, PAGE_SIZE); - set_pte(pt_dir, __pte(__pa(page) | pgt_prot)); - break; - case POPULATE_ZERO_SHADOW: - page = kasan_early_shadow_page; - set_pte(pt_dir, __pte(__pa(page) | pgt_prot_zero)); - break; - case POPULATE_SHALLOW: - /* should never happen */ - break; - } - } - address += PAGE_SIZE; - } -} - -static void __init kasan_set_pgd(pgd_t *pgd, unsigned long asce_type) -{ - unsigned long asce_bits; - - asce_bits = asce_type | _ASCE_TABLE_LENGTH; - S390_lowcore.kernel_asce = (__pa(pgd) & PAGE_MASK) | asce_bits; - S390_lowcore.user_asce = S390_lowcore.kernel_asce; - - __ctl_load(S390_lowcore.kernel_asce, 1, 1); - __ctl_load(S390_lowcore.kernel_asce, 7, 7); - __ctl_load(S390_lowcore.kernel_asce, 13, 13); -} - -static void __init kasan_enable_dat(void) -{ - psw_t psw; - - psw.mask = __extract_psw(); - psw_bits(psw).dat = 1; - psw_bits(psw).as = PSW_BITS_AS_HOME; - __load_psw_mask(psw.mask); -} - -static void __init kasan_early_detect_facilities(void) -{ - if (test_facility(8)) { - has_edat = true; - __ctl_set_bit(0, 23); - } - if (!noexec_disabled && test_facility(130)) { - has_nx = true; - __ctl_set_bit(0, 20); - } -} - -void __init kasan_early_init(void) -{ - unsigned long shadow_alloc_size; - unsigned long initrd_end; - unsigned long memsize; - unsigned long pgt_prot = pgprot_val(PAGE_KERNEL_RO); - pte_t pte_z; - pmd_t pmd_z = __pmd(__pa(kasan_early_shadow_pte) | _SEGMENT_ENTRY); - pud_t pud_z = __pud(__pa(kasan_early_shadow_pmd) | _REGION3_ENTRY); - p4d_t p4d_z = __p4d(__pa(kasan_early_shadow_pud) | _REGION2_ENTRY); - - kasan_early_detect_facilities(); - if (!has_nx) - pgt_prot &= ~_PAGE_NOEXEC; - pte_z = __pte(__pa(kasan_early_shadow_page) | pgt_prot); - - memsize = get_mem_detect_end(); - if (!memsize) - kasan_early_panic("cannot detect physical memory size\n"); - /* - * Kasan currently supports standby memory but only if it follows - * online memory (default allocation), i.e. no memory holes. 
- * - memsize represents end of online memory - * - ident_map_size represents online + standby and memory limits - * accounted. - * Kasan maps "memsize" right away. - * [0, memsize] - as identity mapping - * [__sha(0), __sha(memsize)] - shadow memory for identity mapping - * The rest [memsize, ident_map_size] if memsize < ident_map_size - * could be mapped/unmapped dynamically later during memory hotplug. - */ - memsize = min(memsize, ident_map_size); - - BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_START, P4D_SIZE)); - BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END, P4D_SIZE)); - crst_table_init((unsigned long *)early_pg_dir, _REGION2_ENTRY_EMPTY); - - /* init kasan zero shadow */ - crst_table_init((unsigned long *)kasan_early_shadow_p4d, - p4d_val(p4d_z)); - crst_table_init((unsigned long *)kasan_early_shadow_pud, - pud_val(pud_z)); - crst_table_init((unsigned long *)kasan_early_shadow_pmd, - pmd_val(pmd_z)); - memset64((u64 *)kasan_early_shadow_pte, pte_val(pte_z), PTRS_PER_PTE); - - shadow_alloc_size = memsize >> KASAN_SHADOW_SCALE_SHIFT; - pgalloc_low = round_up((unsigned long)_end, _SEGMENT_SIZE); - if (IS_ENABLED(CONFIG_BLK_DEV_INITRD)) { - initrd_end = - round_up(initrd_data.start + initrd_data.size, _SEGMENT_SIZE); - pgalloc_low = max(pgalloc_low, initrd_end); - } - - if (pgalloc_low + shadow_alloc_size > memsize) - kasan_early_panic("out of memory during initialisation\n"); - - if (has_edat) { - segment_pos = round_down(memsize, _SEGMENT_SIZE); - segment_low = segment_pos - shadow_alloc_size; - pgalloc_pos = segment_low; - } else { - pgalloc_pos = memsize; - } - init_mm.pgd = early_pg_dir; - /* - * Current memory layout: - * +- 0 -------------+ +- shadow start -+ - * | 1:1 ram mapping | /| 1/8 ram | - * | | / | | - * +- end of ram ----+ / +----------------+ - * | ... gap ... | / | | - * | |/ | kasan | - * +- shadow start --+ | zero | - * | 1/8 addr space | | page | - * +- shadow end -+ | mapping | - * | ... gap ... |\ | (untracked) | - * +- vmalloc area -+ \ | | - * | vmalloc_size | \ | | - * +- modules vaddr -+ \ +----------------+ - * | 2Gb | \| unmapped | allocated per module - * +-----------------+ +- shadow end ---+ - * - * Current memory layout (KASAN_VMALLOC): - * +- 0 -------------+ +- shadow start -+ - * | 1:1 ram mapping | /| 1/8 ram | - * | | / | | - * +- end of ram ----+ / +----------------+ - * | ... gap ... | / | kasan | - * | |/ | zero | - * +- shadow start --+ | page | - * | 1/8 addr space | | mapping | - * +- shadow end -+ | (untracked) | - * | ... gap ... |\ | | - * +- vmalloc area -+ \ +- vmalloc area -+ - * | vmalloc_size | \ |shallow populate| - * +- modules vaddr -+ \ +- modules area -+ - * | 2Gb | \|shallow populate| - * +-----------------+ +- shadow end ---+ - */ - /* populate kasan shadow (for identity mapping and zero page mapping) */ - kasan_early_pgtable_populate(__sha(0), __sha(memsize), POPULATE_MAP); - if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) { - /* shallowly populate kasan shadow for vmalloc and modules */ - kasan_early_pgtable_populate(__sha(VMALLOC_START), __sha(MODULES_END), - POPULATE_SHALLOW); - } - /* populate kasan shadow for untracked memory */ - kasan_early_pgtable_populate(__sha(ident_map_size), - IS_ENABLED(CONFIG_KASAN_VMALLOC) ? 
- __sha(VMALLOC_START) : - __sha(MODULES_VADDR), - POPULATE_ZERO_SHADOW); - kasan_early_pgtable_populate(__sha(MODULES_END), __sha(_REGION1_SIZE), - POPULATE_ZERO_SHADOW); - /* memory allocated for identity mapping structs will be freed later */ - pgalloc_freeable = pgalloc_pos; - /* populate identity mapping */ - kasan_early_pgtable_populate(0, memsize, POPULATE_ONE2ONE); - kasan_set_pgd(early_pg_dir, _ASCE_TYPE_REGION2); - kasan_enable_dat(); - /* enable kasan */ - init_task.kasan_depth = 0; - memblock_reserve(pgalloc_pos, memsize - pgalloc_pos); - sclp_early_printk("KernelAddressSanitizer initialized\n"); -} - -void __init kasan_copy_shadow_mapping(void) -{ - /* - * At this point we are still running on early pages setup early_pg_dir, - * while swapper_pg_dir has just been initialized with identity mapping. - * Carry over shadow memory region from early_pg_dir to swapper_pg_dir. - */ - - pgd_t *pg_dir_src; - pgd_t *pg_dir_dst; - p4d_t *p4_dir_src; - p4d_t *p4_dir_dst; - - pg_dir_src = pgd_offset_raw(early_pg_dir, KASAN_SHADOW_START); - pg_dir_dst = pgd_offset_raw(init_mm.pgd, KASAN_SHADOW_START); - p4_dir_src = p4d_offset(pg_dir_src, KASAN_SHADOW_START); - p4_dir_dst = p4d_offset(pg_dir_dst, KASAN_SHADOW_START); - memcpy(p4_dir_dst, p4_dir_src, - (KASAN_SHADOW_SIZE >> P4D_SHIFT) * sizeof(p4d_t)); -} - -void __init kasan_free_early_identity(void) -{ - memblock_phys_free(pgalloc_pos, pgalloc_freeable - pgalloc_pos); -} diff --git a/arch/s390/mm/maccess.c b/arch/s390/mm/maccess.c index 1571cdcb0c50..cfd219fe495c 100644 --- a/arch/s390/mm/maccess.c +++ b/arch/s390/mm/maccess.c @@ -13,15 +13,16 @@ #include <linux/gfp.h> #include <linux/cpu.h> #include <linux/uio.h> +#include <linux/io.h> #include <asm/asm-extable.h> -#include <asm/ctl_reg.h> -#include <asm/io.h> #include <asm/abs_lowcore.h> #include <asm/stacktrace.h> +#include <asm/sections.h> #include <asm/maccess.h> +#include <asm/ctlreg.h> unsigned long __bootdata_preserved(__memcpy_real_area); -static __ro_after_init pte_t *memcpy_real_ptep; +pte_t *__bootdata_preserved(memcpy_real_ptep); static DEFINE_MUTEX(memcpy_real_mutex); static notrace long s390_kernel_write_odd(void *dst, const void *src, size_t size) @@ -40,7 +41,7 @@ static notrace long s390_kernel_write_odd(void *dst, const void *src, size_t siz " ex %1,0(1)\n" " lg %1,0(%3)\n" " lra %0,0(%0)\n" - " sturg %1,%0\n" + " sturg %1,%0" : "+&a" (aligned), "+&a" (count), "=m" (tmp) : "a" (&tmp), "a" (&tmp[offset]), "a" (src) : "cc", "memory", "1"); @@ -48,7 +49,7 @@ static notrace long s390_kernel_write_odd(void *dst, const void *src, size_t siz } /* - * s390_kernel_write - write to kernel memory bypassing DAT + * __s390_kernel_write - write to kernel memory bypassing DAT * @dst: destination address * @src: source address * @size: number of bytes to copy @@ -61,35 +62,24 @@ static notrace long s390_kernel_write_odd(void *dst, const void *src, size_t siz */ static DEFINE_SPINLOCK(s390_kernel_write_lock); -notrace void *s390_kernel_write(void *dst, const void *src, size_t size) +notrace void *__s390_kernel_write(void *dst, const void *src, size_t size) { void *tmp = dst; unsigned long flags; long copied; spin_lock_irqsave(&s390_kernel_write_lock, flags); - if (!(flags & PSW_MASK_DAT)) { - memcpy(dst, src, size); - } else { - while (size) { - copied = s390_kernel_write_odd(tmp, src, size); - tmp += copied; - src += copied; - size -= copied; - } + while (size) { + copied = s390_kernel_write_odd(tmp, src, size); + tmp += copied; + src += copied; + size -= copied; } 
spin_unlock_irqrestore(&s390_kernel_write_lock, flags); return dst; } -void __init memcpy_real_init(void) -{ - memcpy_real_ptep = vmem_get_alloc_pte(__memcpy_real_area, true); - if (!memcpy_real_ptep) - panic("Couldn't setup memcpy real area"); -} - size_t memcpy_real_iter(struct iov_iter *iter, unsigned long src, size_t count) { size_t len, copied, res = 0; @@ -97,11 +87,12 @@ size_t memcpy_real_iter(struct iov_iter *iter, unsigned long src, size_t count) void *chunk; pte_t pte; + BUILD_BUG_ON(MEMCPY_REAL_SIZE != PAGE_SIZE); while (count) { - phys = src & PAGE_MASK; - offset = src & ~PAGE_MASK; + phys = src & MEMCPY_REAL_MASK; + offset = src & ~MEMCPY_REAL_MASK; chunk = (void *)(__memcpy_real_area + offset); - len = min(count, PAGE_SIZE - offset); + len = min(count, MEMCPY_REAL_SIZE - offset); pte = mk_pte_phys(phys, PAGE_KERNEL_RO); mutex_lock(&memcpy_real_mutex); @@ -128,7 +119,7 @@ int memcpy_real(void *dest, unsigned long src, size_t count) kvec.iov_base = dest; kvec.iov_len = count; - iov_iter_kvec(&iter, WRITE, &kvec, 1, count); + iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, count); if (memcpy_real_iter(&iter, src, count) < count) return -EFAULT; return 0; @@ -162,7 +153,6 @@ void *xlate_dev_mem_ptr(phys_addr_t addr) void *ptr = phys_to_virt(addr); void *bounce = ptr; struct lowcore *abs_lc; - unsigned long flags; unsigned long size; int this_cpu, cpu; @@ -178,10 +168,10 @@ void *xlate_dev_mem_ptr(phys_addr_t addr) goto out; size = PAGE_SIZE - (addr & ~PAGE_MASK); if (addr < sizeof(struct lowcore)) { - abs_lc = get_abs_lowcore(&flags); + abs_lc = get_abs_lowcore(); ptr = (void *)abs_lc + addr; memcpy(bounce, ptr, size); - put_abs_lowcore(abs_lc, flags); + put_abs_lowcore(abs_lc); } else if (cpu == this_cpu) { ptr = (void *)(addr - virt_to_phys(lowcore_ptr[cpu])); memcpy(bounce, ptr, size); diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index 3327c47bc181..2a222a7e14f4 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -15,8 +15,8 @@ #include <linux/sched/signal.h> #include <linux/sched/mm.h> #include <linux/random.h> -#include <linux/compat.h> #include <linux/security.h> +#include <linux/hugetlb.h> #include <asm/elf.h> static unsigned long stack_maxrandom_size(void) @@ -26,7 +26,7 @@ static unsigned long stack_maxrandom_size(void) return STACK_RND_MASK << PAGE_SHIFT; } -static inline int mmap_is_legacy(struct rlimit *rlim_stack) +static inline int mmap_is_legacy(const struct rlimit *rlim_stack) { if (current->personality & ADDR_COMPAT_LAYOUT) return 1; @@ -46,11 +46,10 @@ static unsigned long mmap_base_legacy(unsigned long rnd) } static inline unsigned long mmap_base(unsigned long rnd, - struct rlimit *rlim_stack) + const struct rlimit *rlim_stack) { unsigned long gap = rlim_stack->rlim_cur; unsigned long pad = stack_maxrandom_size() + stack_guard_gap; - unsigned long gap_min, gap_max; /* Values close to RLIM_INFINITY can overflow. */ if (gap + pad > gap) @@ -60,24 +59,29 @@ static inline unsigned long mmap_base(unsigned long rnd, * Top of mmap area (just below the process stack). * Leave at least a ~128 MB hole. 
*/ - gap_min = SZ_128M; - gap_max = (STACK_TOP / 6) * 5; - - if (gap < gap_min) - gap = gap_min; - else if (gap > gap_max) - gap = gap_max; + gap = clamp(gap, SZ_128M, (STACK_TOP / 6) * 5); return PAGE_ALIGN(STACK_TOP - gap - rnd); } +static int get_align_mask(struct file *filp, unsigned long flags) +{ + if (filp && is_file_hugepages(filp)) + return huge_page_mask_align(filp); + if (!(current->flags & PF_RANDOMIZE)) + return 0; + if (filp || (flags & MAP_SHARED)) + return MMAP_ALIGN_MASK << PAGE_SHIFT; + return 0; +} + unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, - unsigned long flags) + unsigned long flags, vm_flags_t vm_flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; - struct vm_unmapped_area_info info; + struct vm_unmapped_area_info info = {}; if (len > TASK_SIZE - mmap_min_addr) return -ENOMEM; @@ -93,15 +97,12 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, goto check_asce_limit; } - info.flags = 0; info.length = len; info.low_limit = mm->mmap_base; info.high_limit = TASK_SIZE; - if (filp || (flags & MAP_SHARED)) - info.align_mask = MMAP_ALIGN_MASK << PAGE_SHIFT; - else - info.align_mask = 0; - info.align_offset = pgoff << PAGE_SHIFT; + info.align_mask = get_align_mask(filp, flags); + if (!(filp && is_file_hugepages(filp))) + info.align_offset = pgoff << PAGE_SHIFT; addr = vm_unmapped_area(&info); if (offset_in_page(addr)) return addr; @@ -112,11 +113,11 @@ check_asce_limit: unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, - unsigned long flags) + unsigned long flags, vm_flags_t vm_flags) { struct vm_area_struct *vma; struct mm_struct *mm = current->mm; - struct vm_unmapped_area_info info; + struct vm_unmapped_area_info info = {}; /* requested length too big for entire address space */ if (len > TASK_SIZE - mmap_min_addr) @@ -136,13 +137,11 @@ unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long ad info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; - info.low_limit = max(PAGE_SIZE, mmap_min_addr); + info.low_limit = PAGE_SIZE; info.high_limit = mm->mmap_base; - if (filp || (flags & MAP_SHARED)) - info.align_mask = MMAP_ALIGN_MASK << PAGE_SHIFT; - else - info.align_mask = 0; - info.align_offset = pgoff << PAGE_SHIFT; + info.align_mask = get_align_mask(filp, flags); + if (!(filp && is_file_hugepages(filp))) + info.align_offset = pgoff << PAGE_SHIFT; addr = vm_unmapped_area(&info); /* @@ -169,7 +168,7 @@ check_asce_limit: * This function, called very early during the creation of a new * process VM image, sets up which VM layout function to use: */ -void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) +void arch_pick_mmap_layout(struct mm_struct *mm, const struct rlimit *rlim_stack) { unsigned long random_factor = 0UL; @@ -182,29 +181,35 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) */ if (mmap_is_legacy(rlim_stack)) { mm->mmap_base = mmap_base_legacy(random_factor); - mm->get_unmapped_area = arch_get_unmapped_area; + mm_flags_clear(MMF_TOPDOWN, mm); } else { mm->mmap_base = mmap_base(random_factor, rlim_stack); - mm->get_unmapped_area = arch_get_unmapped_area_topdown; + mm_flags_set(MMF_TOPDOWN, mm); } } -static const pgprot_t protection_map[16] = { - [VM_NONE] = PAGE_NONE, - [VM_READ] = PAGE_RO, - [VM_WRITE] = PAGE_RO, - [VM_WRITE | VM_READ] = PAGE_RO, - [VM_EXEC] = PAGE_RX, - [VM_EXEC | VM_READ] = 
PAGE_RX, - [VM_EXEC | VM_WRITE] = PAGE_RX, - [VM_EXEC | VM_WRITE | VM_READ] = PAGE_RX, - [VM_SHARED] = PAGE_NONE, - [VM_SHARED | VM_READ] = PAGE_RO, - [VM_SHARED | VM_WRITE] = PAGE_RW, - [VM_SHARED | VM_WRITE | VM_READ] = PAGE_RW, - [VM_SHARED | VM_EXEC] = PAGE_RX, - [VM_SHARED | VM_EXEC | VM_READ] = PAGE_RX, - [VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_RWX, - [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_RWX -}; +static pgprot_t protection_map[16] __ro_after_init; + +void __init setup_protection_map(void) +{ + pgprot_t *pm = protection_map; + + pm[VM_NONE] = PAGE_NONE; + pm[VM_READ] = PAGE_RO; + pm[VM_WRITE] = PAGE_RO; + pm[VM_WRITE | VM_READ] = PAGE_RO; + pm[VM_EXEC] = PAGE_RX; + pm[VM_EXEC | VM_READ] = PAGE_RX; + pm[VM_EXEC | VM_WRITE] = PAGE_RX; + pm[VM_EXEC | VM_WRITE | VM_READ] = PAGE_RX; + pm[VM_SHARED] = PAGE_NONE; + pm[VM_SHARED | VM_READ] = PAGE_RO; + pm[VM_SHARED | VM_WRITE] = PAGE_RW; + pm[VM_SHARED | VM_WRITE | VM_READ] = PAGE_RW; + pm[VM_SHARED | VM_EXEC] = PAGE_RX; + pm[VM_SHARED | VM_EXEC | VM_READ] = PAGE_RX; + pm[VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_RWX; + pm[VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_RWX; +} + DECLARE_VM_GET_PAGE_PROT diff --git a/arch/s390/mm/page-states.c b/arch/s390/mm/page-states.c index d5ea09d78938..01f9b39e65f5 100644 --- a/arch/s390/mm/page-states.c +++ b/arch/s390/mm/page-states.c @@ -7,210 +7,18 @@ * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> */ -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/types.h> #include <linux/mm.h> -#include <linux/memblock.h> -#include <linux/gfp.h> -#include <linux/init.h> -#include <asm/asm-extable.h> -#include <asm/facility.h> #include <asm/page-states.h> +#include <asm/sections.h> +#include <asm/page.h> -static int cmma_flag = 1; - -static int __init cmma(char *str) -{ - bool enabled; - - if (!kstrtobool(str, &enabled)) - cmma_flag = enabled; - return 1; -} -__setup("cmma=", cmma); - -static inline int cmma_test_essa(void) -{ - unsigned long tmp = 0; - int rc = -EOPNOTSUPP; - - /* test ESSA_GET_STATE */ - asm volatile( - " .insn rrf,0xb9ab0000,%[tmp],%[tmp],%[cmd],0\n" - "0: la %[rc],0\n" - "1:\n" - EX_TABLE(0b,1b) - : [rc] "+&d" (rc), [tmp] "+&d" (tmp) - : [cmd] "i" (ESSA_GET_STATE)); - return rc; -} - -void __init cmma_init(void) -{ - if (!cmma_flag) - return; - if (cmma_test_essa()) { - cmma_flag = 0; - return; - } - if (test_facility(147)) - cmma_flag = 2; -} - -static inline unsigned char get_page_state(struct page *page) -{ - unsigned char state; - - asm volatile(" .insn rrf,0xb9ab0000,%0,%1,%2,0" - : "=&d" (state) - : "a" (page_to_phys(page)), - "i" (ESSA_GET_STATE)); - return state & 0x3f; -} - -static inline void set_page_unused(struct page *page, int order) -{ - int i, rc; - - for (i = 0; i < (1 << order); i++) - asm volatile(".insn rrf,0xb9ab0000,%0,%1,%2,0" - : "=&d" (rc) - : "a" (page_to_phys(page + i)), - "i" (ESSA_SET_UNUSED)); -} - -static inline void set_page_stable_dat(struct page *page, int order) -{ - int i, rc; - - for (i = 0; i < (1 << order); i++) - asm volatile(".insn rrf,0xb9ab0000,%0,%1,%2,0" - : "=&d" (rc) - : "a" (page_to_phys(page + i)), - "i" (ESSA_SET_STABLE)); -} - -static inline void set_page_stable_nodat(struct page *page, int order) -{ - int i, rc; - - for (i = 0; i < (1 << order); i++) - asm volatile(".insn rrf,0xb9ab0000,%0,%1,%2,0" - : "=&d" (rc) - : "a" (page_to_phys(page + i)), - "i" (ESSA_SET_STABLE_NODAT)); -} - -static void mark_kernel_pmd(pud_t *pud, unsigned long addr, unsigned long end) -{ - unsigned long next; - struct page 
*page; - pmd_t *pmd; - - pmd = pmd_offset(pud, addr); - do { - next = pmd_addr_end(addr, end); - if (pmd_none(*pmd) || pmd_large(*pmd)) - continue; - page = phys_to_page(pmd_val(*pmd)); - set_bit(PG_arch_1, &page->flags); - } while (pmd++, addr = next, addr != end); -} - -static void mark_kernel_pud(p4d_t *p4d, unsigned long addr, unsigned long end) -{ - unsigned long next; - struct page *page; - pud_t *pud; - int i; - - pud = pud_offset(p4d, addr); - do { - next = pud_addr_end(addr, end); - if (pud_none(*pud) || pud_large(*pud)) - continue; - if (!pud_folded(*pud)) { - page = phys_to_page(pud_val(*pud)); - for (i = 0; i < 3; i++) - set_bit(PG_arch_1, &page[i].flags); - } - mark_kernel_pmd(pud, addr, next); - } while (pud++, addr = next, addr != end); -} - -static void mark_kernel_p4d(pgd_t *pgd, unsigned long addr, unsigned long end) -{ - unsigned long next; - struct page *page; - p4d_t *p4d; - int i; - - p4d = p4d_offset(pgd, addr); - do { - next = p4d_addr_end(addr, end); - if (p4d_none(*p4d)) - continue; - if (!p4d_folded(*p4d)) { - page = phys_to_page(p4d_val(*p4d)); - for (i = 0; i < 3; i++) - set_bit(PG_arch_1, &page[i].flags); - } - mark_kernel_pud(p4d, addr, next); - } while (p4d++, addr = next, addr != end); -} - -static void mark_kernel_pgd(void) -{ - unsigned long addr, next; - struct page *page; - pgd_t *pgd; - int i; - - addr = 0; - pgd = pgd_offset_k(addr); - do { - next = pgd_addr_end(addr, MODULES_END); - if (pgd_none(*pgd)) - continue; - if (!pgd_folded(*pgd)) { - page = phys_to_page(pgd_val(*pgd)); - for (i = 0; i < 3; i++) - set_bit(PG_arch_1, &page[i].flags); - } - mark_kernel_p4d(pgd, addr, next); - } while (pgd++, addr = next, addr != MODULES_END); -} - -void __init cmma_init_nodat(void) -{ - struct page *page; - unsigned long start, end, ix; - int i; - - if (cmma_flag < 2) - return; - /* Mark pages used in kernel page tables */ - mark_kernel_pgd(); - - /* Set all kernel pages not used for page tables to stable/no-dat */ - for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, NULL) { - page = pfn_to_page(start); - for (ix = start; ix < end; ix++, page++) { - if (__test_and_clear_bit(PG_arch_1, &page->flags)) - continue; /* skip page table pages */ - if (!list_empty(&page->lru)) - continue; /* skip free pages */ - set_page_stable_nodat(page, 0); - } - } -} +int __bootdata_preserved(cmma_flag); void arch_free_page(struct page *page, int order) { if (!cmma_flag) return; - set_page_unused(page, order); + __set_page_unused(page_to_virt(page), 1UL << order); } void arch_alloc_page(struct page *page, int order) @@ -218,14 +26,7 @@ void arch_alloc_page(struct page *page, int order) if (!cmma_flag) return; if (cmma_flag < 2) - set_page_stable_dat(page, order); + __set_page_stable_dat(page_to_virt(page), 1UL << order); else - set_page_stable_nodat(page, order); -} - -void arch_set_page_dat(struct page *page, int order) -{ - if (!cmma_flag) - return; - set_page_stable_dat(page, order); + __set_page_stable_nodat(page_to_virt(page), 1UL << order); } diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c index 85195c18b2e8..3042647c9dbf 100644 --- a/arch/s390/mm/pageattr.c +++ b/arch/s390/mm/pageattr.c @@ -3,13 +3,17 @@ * Copyright IBM Corp. 
2011 * Author(s): Jan Glauber <jang@linux.vnet.ibm.com> */ +#include <linux/cpufeature.h> #include <linux/hugetlb.h> +#include <linux/proc_fs.h> +#include <linux/vmalloc.h> #include <linux/mm.h> #include <asm/cacheflush.h> #include <asm/facility.h> #include <asm/pgalloc.h> #include <asm/kfence.h> #include <asm/page.h> +#include <asm/asm.h> #include <asm/set_memory.h> static inline unsigned long sske_frame(unsigned long addr, unsigned char skey) @@ -24,7 +28,7 @@ void __storage_key_init_range(unsigned long start, unsigned long end) unsigned long boundary, size; while (start < end) { - if (MACHINE_HAS_EDAT1) { + if (cpu_has_edat1()) { /* set storage keys for a 1MB frame */ size = 1UL << 20; boundary = (start + size) & ~(size - 1); @@ -41,7 +45,7 @@ void __storage_key_init_range(unsigned long start, unsigned long end) } #ifdef CONFIG_PROC_FS -atomic_long_t direct_pages_count[PG_DIRECT_MAP_MAX]; +atomic_long_t __bootdata_preserved(direct_pages_count[PG_DIRECT_MAP_MAX]); void arch_report_meminfo(struct seq_file *m) { @@ -60,7 +64,7 @@ static void pgt_set(unsigned long *old, unsigned long new, unsigned long addr, unsigned long *table, mask; mask = 0; - if (MACHINE_HAS_EDAT2) { + if (cpu_has_edat2()) { switch (dtt) { case CRDTE_DTT_REGION3: mask = ~(PTRS_PER_PUD * sizeof(pud_t) - 1); @@ -73,11 +77,9 @@ static void pgt_set(unsigned long *old, unsigned long new, unsigned long addr, break; } table = (unsigned long *)((unsigned long)old & mask); - crdte(*old, new, table, dtt, addr, S390_lowcore.kernel_asce); - } else if (MACHINE_HAS_IDTE) { - cspg(old, *old, new); + crdte(*old, new, table, dtt, addr, get_lowcore()->kernel_asce.val); } else { - csp((unsigned int *)old + 1, *old, new); + cspg(old, *old, new); } } @@ -96,11 +98,17 @@ static int walk_pte_level(pmd_t *pmdp, unsigned long addr, unsigned long end, if (flags & SET_MEMORY_RO) new = pte_wrprotect(new); else if (flags & SET_MEMORY_RW) - new = pte_mkwrite(pte_mkdirty(new)); + new = pte_mkwrite_novma(pte_mkdirty(new)); if (flags & SET_MEMORY_NX) new = set_pte_bit(new, __pgprot(_PAGE_NOEXEC)); else if (flags & SET_MEMORY_X) new = clear_pte_bit(new, __pgprot(_PAGE_NOEXEC)); + if (flags & SET_MEMORY_INV) { + new = set_pte_bit(new, __pgprot(_PAGE_INVALID)); + } else if (flags & SET_MEMORY_DEF) { + new = __pte(pte_val(new) & PAGE_MASK); + new = set_pte_bit(new, PAGE_KERNEL); + } pgt_set((unsigned long *)ptep, pte_val(new), addr, CRDTE_DTT_PAGE); ptep++; addr += PAGE_SIZE; @@ -146,11 +154,17 @@ static void modify_pmd_page(pmd_t *pmdp, unsigned long addr, if (flags & SET_MEMORY_RO) new = pmd_wrprotect(new); else if (flags & SET_MEMORY_RW) - new = pmd_mkwrite(pmd_mkdirty(new)); + new = pmd_mkwrite_novma(pmd_mkdirty(new)); if (flags & SET_MEMORY_NX) new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_NOEXEC)); else if (flags & SET_MEMORY_X) new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_NOEXEC)); + if (flags & SET_MEMORY_INV) { + new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID)); + } else if (flags & SET_MEMORY_DEF) { + new = __pmd(pmd_val(new) & PMD_MASK); + new = set_pmd_bit(new, SEGMENT_KERNEL); + } pgt_set((unsigned long *)pmdp, pmd_val(new), addr, CRDTE_DTT_SEGMENT); } @@ -167,7 +181,7 @@ static int walk_pmd_level(pud_t *pudp, unsigned long addr, unsigned long end, if (pmd_none(*pmdp)) return -EINVAL; next = pmd_addr_end(addr, end); - if (pmd_large(*pmdp)) { + if (pmd_leaf(*pmdp)) { need_split = !!(flags & SET_MEMORY_4K); need_split |= !!(addr & ~PMD_MASK); need_split |= !!(addr + PMD_SIZE > next); @@ -232,6 +246,12 @@ static void 
modify_pud_page(pud_t *pudp, unsigned long addr, new = set_pud_bit(new, __pgprot(_REGION_ENTRY_NOEXEC)); else if (flags & SET_MEMORY_X) new = clear_pud_bit(new, __pgprot(_REGION_ENTRY_NOEXEC)); + if (flags & SET_MEMORY_INV) { + new = set_pud_bit(new, __pgprot(_REGION_ENTRY_INVALID)); + } else if (flags & SET_MEMORY_DEF) { + new = __pud(pud_val(new) & PUD_MASK); + new = set_pud_bit(new, REGION3_KERNEL); + } pgt_set((unsigned long *)pudp, pud_val(new), addr, CRDTE_DTT_REGION3); } @@ -248,7 +268,7 @@ static int walk_pud_level(p4d_t *p4d, unsigned long addr, unsigned long end, if (pud_none(*pudp)) return -EINVAL; next = pud_addr_end(addr, end); - if (pud_large(*pudp)) { + if (pud_leaf(*pudp)) { need_split = !!(flags & SET_MEMORY_4K); need_split |= !!(addr & ~PUD_MASK); need_split |= !!(addr + PUD_SIZE > next); @@ -298,11 +318,6 @@ static int change_page_attr(unsigned long addr, unsigned long end, int rc = -EINVAL; pgd_t *pgdp; - if (addr == end) - return 0; - if (end >= MODULES_END) - return -EINVAL; - mutex_lock(&cpa_mutex); pgdp = pgd_offset_k(addr); do { if (pgd_none(*pgdp)) @@ -313,18 +328,103 @@ static int change_page_attr(unsigned long addr, unsigned long end, break; cond_resched(); } while (pgdp++, addr = next, addr < end && !rc); - mutex_unlock(&cpa_mutex); return rc; } -int __set_memory(unsigned long addr, int numpages, unsigned long flags) +static int change_page_attr_alias(unsigned long addr, unsigned long end, + unsigned long flags) +{ + unsigned long alias, offset, va_start, va_end; + struct vm_struct *area; + int rc = 0; + + /* + * Changes to read-only permissions on kernel VA mappings are also + * applied to the kernel direct mapping. Execute permissions are + * intentionally not transferred to keep all allocated pages within + * the direct mapping non-executable. 
+ */ + flags &= SET_MEMORY_RO | SET_MEMORY_RW; + if (!flags) + return 0; + area = NULL; + while (addr < end) { + if (!area) + area = find_vm_area((void *)addr); + if (!area || !(area->flags & VM_ALLOC)) + return 0; + va_start = (unsigned long)area->addr; + va_end = va_start + area->nr_pages * PAGE_SIZE; + offset = (addr - va_start) >> PAGE_SHIFT; + alias = (unsigned long)page_address(area->pages[offset]); + rc = change_page_attr(alias, alias + PAGE_SIZE, flags); + if (rc) + break; + addr += PAGE_SIZE; + if (addr >= va_end) + area = NULL; + } + return rc; +} + +int __set_memory(unsigned long addr, unsigned long numpages, unsigned long flags) { - if (!MACHINE_HAS_NX) + unsigned long end; + int rc; + + if (!cpu_has_nx()) flags &= ~(SET_MEMORY_NX | SET_MEMORY_X); if (!flags) return 0; + if (!numpages) + return 0; addr &= PAGE_MASK; - return change_page_attr(addr, addr + numpages * PAGE_SIZE, flags); + end = addr + numpages * PAGE_SIZE; + mutex_lock(&cpa_mutex); + rc = change_page_attr(addr, end, flags); + if (rc) + goto out; + rc = change_page_attr_alias(addr, end, flags); +out: + mutex_unlock(&cpa_mutex); + return rc; +} + +int set_direct_map_invalid_noflush(struct page *page) +{ + return __set_memory((unsigned long)page_to_virt(page), 1, SET_MEMORY_INV); +} + +int set_direct_map_default_noflush(struct page *page) +{ + return __set_memory((unsigned long)page_to_virt(page), 1, SET_MEMORY_DEF); +} + +int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid) +{ + unsigned long flags; + + if (valid) + flags = SET_MEMORY_DEF; + else + flags = SET_MEMORY_INV; + + return __set_memory((unsigned long)page_to_virt(page), nr, flags); +} + +bool kernel_page_present(struct page *page) +{ + unsigned long addr; + unsigned int cc; + + addr = (unsigned long)page_address(page); + asm volatile( + " lra %[addr],0(%[addr])\n" + CC_IPM(cc) + : CC_OUT(cc, cc), [addr] "+a" (addr) + : + : CC_CLOBBER); + return CC_TRANSFORM(cc) == 0; } #if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE) diff --git a/arch/s390/mm/pfault.c b/arch/s390/mm/pfault.c new file mode 100644 index 000000000000..2f829448c719 --- /dev/null +++ b/arch/s390/mm/pfault.c @@ -0,0 +1,248 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 1999, 2023 + */ + +#include <linux/cpuhotplug.h> +#include <linux/sched/task.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/irq.h> +#include <asm/asm-extable.h> +#include <asm/asm-offsets.h> +#include <asm/pfault.h> +#include <asm/diag.h> + +#define __SUBCODE_MASK 0x0600 +#define __PF_RES_FIELD 0x8000000000000000UL + +/* + * 'pfault' pseudo page faults routines. 
+ */ +static int pfault_disable; + +static int __init nopfault(char *str) +{ + pfault_disable = 1; + return 1; +} +early_param("nopfault", nopfault); + +struct pfault_refbk { + u16 refdiagc; + u16 reffcode; + u16 refdwlen; + u16 refversn; + u64 refgaddr; + u64 refselmk; + u64 refcmpmk; + u64 reserved; +}; + +static struct pfault_refbk pfault_init_refbk = { + .refdiagc = 0x258, + .reffcode = 0, + .refdwlen = 5, + .refversn = 2, + .refgaddr = __LC_LPP, + .refselmk = 1UL << 48, + .refcmpmk = 1UL << 48, + .reserved = __PF_RES_FIELD +}; + +int __pfault_init(void) +{ + int rc = -EOPNOTSUPP; + + if (pfault_disable) + return rc; + diag_stat_inc(DIAG_STAT_X258); + asm_inline volatile( + " diag %[refbk],%[rc],0x258\n" + "0: nopr %%r7\n" + EX_TABLE(0b, 0b) + : [rc] "+d" (rc) + : [refbk] "a" (&pfault_init_refbk), "m" (pfault_init_refbk) + : "cc"); + return rc; +} + +static struct pfault_refbk pfault_fini_refbk = { + .refdiagc = 0x258, + .reffcode = 1, + .refdwlen = 5, + .refversn = 2, +}; + +void __pfault_fini(void) +{ + if (pfault_disable) + return; + diag_stat_inc(DIAG_STAT_X258); + asm_inline volatile( + " diag %[refbk],0,0x258\n" + "0: nopr %%r7\n" + EX_TABLE(0b, 0b) + : + : [refbk] "a" (&pfault_fini_refbk), "m" (pfault_fini_refbk) + : "cc"); +} + +static DEFINE_SPINLOCK(pfault_lock); +static LIST_HEAD(pfault_list); + +#define PF_COMPLETE 0x0080 + +/* + * The mechanism of our pfault code: if Linux is running as guest, runs a user + * space process and the user space process accesses a page that the host has + * paged out we get a pfault interrupt. + * + * This allows us, within the guest, to schedule a different process. Without + * this mechanism the host would have to suspend the whole virtual cpu until + * the page has been paged in. + * + * So when we get such an interrupt then we set the state of the current task + * to uninterruptible and also set the need_resched flag. Both happens within + * interrupt context(!). If we later on want to return to user space we + * recognize the need_resched flag and then call schedule(). It's not very + * obvious how this works... + * + * Of course we have a lot of additional fun with the completion interrupt (-> + * host signals that a page of a process has been paged in and the process can + * continue to run). This interrupt can arrive on any cpu and, since we have + * virtual cpus, actually appear before the interrupt that signals that a page + * is missing. + */ +static void pfault_interrupt(struct ext_code ext_code, + unsigned int param32, unsigned long param64) +{ + struct task_struct *tsk; + __u16 subcode; + pid_t pid; + + /* + * Get the external interruption subcode & pfault initial/completion + * signal bit. VM stores this in the 'cpu address' field associated + * with the external interrupt. + */ + subcode = ext_code.subcode; + if ((subcode & 0xff00) != __SUBCODE_MASK) + return; + inc_irq_stat(IRQEXT_PFL); + /* Get the token (= pid of the affected task). */ + pid = param64 & LPP_PID_MASK; + rcu_read_lock(); + tsk = find_task_by_pid_ns(pid, &init_pid_ns); + if (tsk) + get_task_struct(tsk); + rcu_read_unlock(); + if (!tsk) + return; + spin_lock(&pfault_lock); + if (subcode & PF_COMPLETE) { + /* signal bit is set -> a page has been swapped in by VM */ + if (tsk->thread.pfault_wait == 1) { + /* + * Initial interrupt was faster than the completion + * interrupt. pfault_wait is valid. Set pfault_wait + * back to zero and wake up the process. This can + * safely be done because the task is still sleeping + * and can't produce new pfaults. 
+ */ + tsk->thread.pfault_wait = 0; + list_del(&tsk->thread.list); + wake_up_process(tsk); + put_task_struct(tsk); + } else { + /* + * Completion interrupt was faster than initial + * interrupt. Set pfault_wait to -1 so the initial + * interrupt doesn't put the task to sleep. + * If the task is not running, ignore the completion + * interrupt since it must be a leftover of a PFAULT + * CANCEL operation which didn't remove all pending + * completion interrupts. + */ + if (task_is_running(tsk)) + tsk->thread.pfault_wait = -1; + } + } else { + /* signal bit not set -> a real page is missing. */ + if (WARN_ON_ONCE(tsk != current)) + goto out; + if (tsk->thread.pfault_wait == 1) { + /* Already on the list with a reference: put to sleep */ + goto block; + } else if (tsk->thread.pfault_wait == -1) { + /* + * Completion interrupt was faster than the initial + * interrupt (pfault_wait == -1). Set pfault_wait + * back to zero and exit. + */ + tsk->thread.pfault_wait = 0; + } else { + /* + * Initial interrupt arrived before completion + * interrupt. Let the task sleep. + * An extra task reference is needed since a different + * cpu may set the task state to TASK_RUNNING again + * before the scheduler is reached. + */ + get_task_struct(tsk); + tsk->thread.pfault_wait = 1; + list_add(&tsk->thread.list, &pfault_list); +block: + /* + * Since this must be a userspace fault, there + * is no kernel task state to trample. Rely on the + * return to userspace schedule() to block. + */ + __set_current_state(TASK_UNINTERRUPTIBLE); + set_need_resched_current(); + } + } +out: + spin_unlock(&pfault_lock); + put_task_struct(tsk); +} + +static int pfault_cpu_dead(unsigned int cpu) +{ + struct thread_struct *thread, *next; + struct task_struct *tsk; + + spin_lock_irq(&pfault_lock); + list_for_each_entry_safe(thread, next, &pfault_list, list) { + thread->pfault_wait = 0; + list_del(&thread->list); + tsk = container_of(thread, struct task_struct, thread); + wake_up_process(tsk); + put_task_struct(tsk); + } + spin_unlock_irq(&pfault_lock); + return 0; +} + +static int __init pfault_irq_init(void) +{ + int rc; + + rc = register_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt); + if (rc) + goto out_extint; + rc = pfault_init() == 0 ? 
0 : -EOPNOTSUPP; + if (rc) + goto out_pfault; + irq_subclass_register(IRQ_SUBCLASS_SERVICE_SIGNAL); + cpuhp_setup_state_nocalls(CPUHP_S390_PFAULT_DEAD, "s390/pfault:dead", + NULL, pfault_cpu_dead); + return 0; + +out_pfault: + unregister_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt); +out_extint: + pfault_disable = 1; + return rc; +} +early_initcall(pfault_irq_init); diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index 2de48b2c1b04..7df23528c01b 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -10,70 +10,45 @@ #include <linux/slab.h> #include <linux/mm.h> #include <asm/mmu_context.h> +#include <asm/page-states.h> #include <asm/pgalloc.h> -#include <asm/gmap.h> -#include <asm/tlb.h> #include <asm/tlbflush.h> -#ifdef CONFIG_PGSTE - -int page_table_allocate_pgste = 0; -EXPORT_SYMBOL(page_table_allocate_pgste); - -static struct ctl_table page_table_sysctl[] = { - { - .procname = "allocate_pgste", - .data = &page_table_allocate_pgste, - .maxlen = sizeof(int), - .mode = S_IRUGO | S_IWUSR, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { } -}; - -static struct ctl_table page_table_sysctl_dir[] = { - { - .procname = "vm", - .maxlen = 0, - .mode = 0555, - .child = page_table_sysctl, - }, - { } -}; - -static int __init page_table_register_sysctl(void) -{ - return register_sysctl_table(page_table_sysctl_dir) ? 0 : -ENOMEM; -} -__initcall(page_table_register_sysctl); - -#endif /* CONFIG_PGSTE */ - -unsigned long *crst_table_alloc(struct mm_struct *mm) +unsigned long *crst_table_alloc_noprof(struct mm_struct *mm) { - struct page *page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); + gfp_t gfp = GFP_KERNEL_ACCOUNT; + struct ptdesc *ptdesc; + unsigned long *table; - if (!page) + if (mm == &init_mm) + gfp &= ~__GFP_ACCOUNT; + ptdesc = pagetable_alloc_noprof(gfp, CRST_ALLOC_ORDER); + if (!ptdesc) return NULL; - arch_set_page_dat(page, CRST_ALLOC_ORDER); - return (unsigned long *) page_to_virt(page); + table = ptdesc_address(ptdesc); + __arch_set_page_dat(table, 1UL << CRST_ALLOC_ORDER); + return table; } void crst_table_free(struct mm_struct *mm, unsigned long *table) { - free_pages((unsigned long)table, CRST_ALLOC_ORDER); + if (!table) + return; + pagetable_free(virt_to_ptdesc(table)); } static void __crst_table_upgrade(void *arg) { struct mm_struct *mm = arg; + struct ctlreg asce; /* change all active ASCEs to avoid the creation of new TLBs */ if (current->active_mm == mm) { - S390_lowcore.user_asce = mm->context.asce; - __ctl_load(S390_lowcore.user_asce, 7, 7); + asce.val = mm->context.asce; + get_lowcore()->user_asce = asce; + local_ctl_load(7, &asce); + if (!test_thread_flag(TIF_ASCE_PRIMARY)) + local_ctl_load(1, &asce); } __tlb_flush_local(); } @@ -83,6 +58,8 @@ int crst_table_upgrade(struct mm_struct *mm, unsigned long end) unsigned long *pgd = NULL, *p4d = NULL, *__pgd; unsigned long asce_limit = mm->context.asce_limit; + mmap_assert_write_locked(mm); + /* upgrade should only happen from 3 to 4, 3 to 5, or 4 to 5 levels */ VM_BUG_ON(asce_limit < _REGION2_SIZE); @@ -94,23 +71,18 @@ int crst_table_upgrade(struct mm_struct *mm, unsigned long end) if (unlikely(!p4d)) goto err_p4d; crst_table_init(p4d, _REGION2_ENTRY_EMPTY); + pagetable_p4d_ctor(virt_to_ptdesc(p4d)); } if (end > _REGION1_SIZE) { pgd = crst_table_alloc(mm); if (unlikely(!pgd)) goto err_pgd; crst_table_init(pgd, _REGION1_ENTRY_EMPTY); + pagetable_pgd_ctor(virt_to_ptdesc(pgd)); } spin_lock_bh(&mm->page_table_lock); - /* - * This routine gets called with 
mmap_lock lock held and there is - * no reason to optimize for the case of otherwise. However, if - * that would ever change, the below check will let us know. - */ - VM_BUG_ON(asce_limit != mm->context.asce_limit); - if (p4d) { __pgd = (unsigned long *) mm->pgd; p4d_populate(mm, (p4d_t *) p4d, (pud_t *) __pgd); @@ -136,292 +108,82 @@ int crst_table_upgrade(struct mm_struct *mm, unsigned long end) return 0; err_pgd: + pagetable_dtor(virt_to_ptdesc(p4d)); crst_table_free(mm, p4d); err_p4d: return -ENOMEM; } -static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits) -{ - unsigned int old, new; - - do { - old = atomic_read(v); - new = old ^ bits; - } while (atomic_cmpxchg(v, old, new) != old); - return new; -} - #ifdef CONFIG_PGSTE -struct page *page_table_alloc_pgste(struct mm_struct *mm) +struct ptdesc *page_table_alloc_pgste_noprof(struct mm_struct *mm) { - struct page *page; + struct ptdesc *ptdesc; u64 *table; - page = alloc_page(GFP_KERNEL); - if (page) { - table = (u64 *)page_to_virt(page); + ptdesc = pagetable_alloc_noprof(GFP_KERNEL_ACCOUNT, 0); + if (ptdesc) { + table = (u64 *)ptdesc_address(ptdesc); + __arch_set_page_dat(table, 1); memset64(table, _PAGE_INVALID, PTRS_PER_PTE); memset64(table + PTRS_PER_PTE, 0, PTRS_PER_PTE); } - return page; + return ptdesc; } -void page_table_free_pgste(struct page *page) +void page_table_free_pgste(struct ptdesc *ptdesc) { - __free_page(page); + pagetable_free(ptdesc); } #endif /* CONFIG_PGSTE */ -/* - * A 2KB-pgtable is either upper or lower half of a normal page. - * The second half of the page may be unused or used as another - * 2KB-pgtable. - * - * Whenever possible the parent page for a new 2KB-pgtable is picked - * from the list of partially allocated pages mm_context_t::pgtable_list. - * In case the list is empty a new parent page is allocated and added to - * the list. - * - * When a parent page gets fully allocated it contains 2KB-pgtables in both - * upper and lower halves and is removed from mm_context_t::pgtable_list. - * - * When 2KB-pgtable is freed from to fully allocated parent page that - * page turns partially allocated and added to mm_context_t::pgtable_list. - * - * If 2KB-pgtable is freed from the partially allocated parent page that - * page turns unused and gets removed from mm_context_t::pgtable_list. - * Furthermore, the unused parent page is released. - * - * As follows from the above, no unallocated or fully allocated parent - * pages are contained in mm_context_t::pgtable_list. - * - * The upper byte (bits 24-31) of the parent page _refcount is used - * for tracking contained 2KB-pgtables and has the following format: - * - * PP AA - * 01234567 upper byte (bits 24-31) of struct page::_refcount - * || || - * || |+--- upper 2KB-pgtable is allocated - * || +---- lower 2KB-pgtable is allocated - * |+------- upper 2KB-pgtable is pending for removal - * +-------- lower 2KB-pgtable is pending for removal - * - * (See commit 620b4e903179 ("s390: use _refcount for pgtables") on why - * using _refcount is possible). - * - * When 2KB-pgtable is allocated the corresponding AA bit is set to 1. - * The parent page is either: - * - added to mm_context_t::pgtable_list in case the second half of the - * parent page is still unallocated; - * - removed from mm_context_t::pgtable_list in case both hales of the - * parent page are allocated; - * These operations are protected with mm_context_t::lock. 
- * - * When 2KB-pgtable is deallocated the corresponding AA bit is set to 0 - * and the corresponding PP bit is set to 1 in a single atomic operation. - * Thus, PP and AA bits corresponding to the same 2KB-pgtable are mutually - * exclusive and may never be both set to 1! - * The parent page is either: - * - added to mm_context_t::pgtable_list in case the second half of the - * parent page is still allocated; - * - removed from mm_context_t::pgtable_list in case the second half of - * the parent page is unallocated; - * These operations are protected with mm_context_t::lock. - * - * It is important to understand that mm_context_t::lock only protects - * mm_context_t::pgtable_list and AA bits, but not the parent page itself - * and PP bits. - * - * Releasing the parent page happens whenever the PP bit turns from 1 to 0, - * while both AA bits and the second PP bit are already unset. Then the - * parent page does not contain any 2KB-pgtable fragment anymore, and it has - * also been removed from mm_context_t::pgtable_list. It is safe to release - * the page therefore. - * - * PGSTE memory spaces use full 4KB-pgtables and do not need most of the - * logic described above. Both AA bits are set to 1 to denote a 4KB-pgtable - * while the PP bits are never used, nor such a page is added to or removed - * from mm_context_t::pgtable_list. - */ -unsigned long *page_table_alloc(struct mm_struct *mm) +unsigned long *page_table_alloc_noprof(struct mm_struct *mm) { + gfp_t gfp = GFP_KERNEL_ACCOUNT; + struct ptdesc *ptdesc; unsigned long *table; - struct page *page; - unsigned int mask, bit; - - /* Try to get a fragment of a 4K page as a 2K page table */ - if (!mm_alloc_pgste(mm)) { - table = NULL; - spin_lock_bh(&mm->context.lock); - if (!list_empty(&mm->context.pgtable_list)) { - page = list_first_entry(&mm->context.pgtable_list, - struct page, lru); - mask = atomic_read(&page->_refcount) >> 24; - /* - * The pending removal bits must also be checked. - * Failure to do so might lead to an impossible - * value of (i.e 0x13 or 0x23) written to _refcount. - * Such values violate the assumption that pending and - * allocation bits are mutually exclusive, and the rest - * of the code unrails as result. That could lead to - * a whole bunch of races and corruptions. 
- */ - mask = (mask | (mask >> 4)) & 0x03U; - if (mask != 0x03U) { - table = (unsigned long *) page_to_virt(page); - bit = mask & 1; /* =1 -> second 2K */ - if (bit) - table += PTRS_PER_PTE; - atomic_xor_bits(&page->_refcount, - 0x01U << (bit + 24)); - list_del(&page->lru); - } - } - spin_unlock_bh(&mm->context.lock); - if (table) - return table; - } - /* Allocate a fresh page */ - page = alloc_page(GFP_KERNEL); - if (!page) + + if (mm == &init_mm) + gfp &= ~__GFP_ACCOUNT; + ptdesc = pagetable_alloc_noprof(gfp, 0); + if (!ptdesc) return NULL; - if (!pgtable_pte_page_ctor(page)) { - __free_page(page); + if (!pagetable_pte_ctor(mm, ptdesc)) { + pagetable_free(ptdesc); return NULL; } - arch_set_page_dat(page, 0); - /* Initialize page table */ - table = (unsigned long *) page_to_virt(page); - if (mm_alloc_pgste(mm)) { - /* Return 4K page table with PGSTEs */ - atomic_xor_bits(&page->_refcount, 0x03U << 24); - memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE); - memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE); - } else { - /* Return the first 2K fragment of the page */ - atomic_xor_bits(&page->_refcount, 0x01U << 24); - memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE); - spin_lock_bh(&mm->context.lock); - list_add(&page->lru, &mm->context.pgtable_list); - spin_unlock_bh(&mm->context.lock); - } + table = ptdesc_address(ptdesc); + __arch_set_page_dat(table, 1); + memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE); + memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE); return table; } -static void page_table_release_check(struct page *page, void *table, - unsigned int half, unsigned int mask) -{ - char msg[128]; - - if (!IS_ENABLED(CONFIG_DEBUG_VM) || !mask) - return; - snprintf(msg, sizeof(msg), - "Invalid pgtable %p release half 0x%02x mask 0x%02x", - table, half, mask); - dump_page(page, msg); -} - void page_table_free(struct mm_struct *mm, unsigned long *table) { - unsigned int mask, bit, half; - struct page *page; - - page = virt_to_page(table); - if (!mm_alloc_pgste(mm)) { - /* Free 2K page table fragment of a 4K page */ - bit = ((unsigned long) table & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)); - spin_lock_bh(&mm->context.lock); - /* - * Mark the page for delayed release. 
The actual release - * will happen outside of the critical section from this - * function or from __tlb_remove_table() - */ - mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24)); - mask >>= 24; - if (mask & 0x03U) - list_add(&page->lru, &mm->context.pgtable_list); - else - list_del(&page->lru); - spin_unlock_bh(&mm->context.lock); - mask = atomic_xor_bits(&page->_refcount, 0x10U << (bit + 24)); - mask >>= 24; - if (mask != 0x00U) - return; - half = 0x01U << bit; - } else { - half = 0x03U; - mask = atomic_xor_bits(&page->_refcount, 0x03U << 24); - mask >>= 24; - } + struct ptdesc *ptdesc = virt_to_ptdesc(table); - page_table_release_check(page, table, half, mask); - pgtable_pte_page_dtor(page); - __free_page(page); + if (pagetable_is_reserved(ptdesc)) + return free_reserved_ptdesc(ptdesc); + pagetable_dtor_free(ptdesc); } -void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, - unsigned long vmaddr) +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static void pte_free_now(struct rcu_head *head) { - struct mm_struct *mm; - struct page *page; - unsigned int bit, mask; - - mm = tlb->mm; - page = virt_to_page(table); - if (mm_alloc_pgste(mm)) { - gmap_unlink(mm, table, vmaddr); - table = (unsigned long *) ((unsigned long)table | 0x03U); - tlb_remove_table(tlb, table); - return; - } - bit = ((unsigned long) table & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)); - spin_lock_bh(&mm->context.lock); - /* - * Mark the page for delayed release. The actual release will happen - * outside of the critical section from __tlb_remove_table() or from - * page_table_free() - */ - mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24)); - mask >>= 24; - if (mask & 0x03U) - list_add_tail(&page->lru, &mm->context.pgtable_list); - else - list_del(&page->lru); - spin_unlock_bh(&mm->context.lock); - table = (unsigned long *) ((unsigned long) table | (0x01U << bit)); - tlb_remove_table(tlb, table); + struct ptdesc *ptdesc = container_of(head, struct ptdesc, pt_rcu_head); + + pagetable_dtor_free(ptdesc); } -void __tlb_remove_table(void *_table) +void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable) { - unsigned int mask = (unsigned long) _table & 0x03U, half = mask; - void *table = (void *)((unsigned long) _table ^ mask); - struct page *page = virt_to_page(table); - - switch (half) { - case 0x00U: /* pmd, pud, or p4d */ - free_pages((unsigned long)table, CRST_ALLOC_ORDER); - return; - case 0x01U: /* lower 2K of a 4K page table */ - case 0x02U: /* higher 2K of a 4K page table */ - mask = atomic_xor_bits(&page->_refcount, mask << (4 + 24)); - mask >>= 24; - if (mask != 0x00U) - return; - break; - case 0x03U: /* 4K page table with pgstes */ - mask = atomic_xor_bits(&page->_refcount, 0x03U << 24); - mask >>= 24; - break; - } + struct ptdesc *ptdesc = virt_to_ptdesc(pgtable); - page_table_release_check(page, table, half, mask); - pgtable_pte_page_dtor(page); - __free_page(page); + call_rcu(&ptdesc->pt_rcu_head, pte_free_now); } +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* * Base infrastructure required to generate basic asces, region, segment, @@ -448,16 +210,21 @@ static void base_pgt_free(unsigned long *table) static unsigned long *base_crst_alloc(unsigned long val) { unsigned long *table; + struct ptdesc *ptdesc; - table = (unsigned long *)__get_free_pages(GFP_KERNEL, CRST_ALLOC_ORDER); - if (table) - crst_table_init(table, val); + ptdesc = pagetable_alloc(GFP_KERNEL, CRST_ALLOC_ORDER); + if (!ptdesc) + return NULL; + table = ptdesc_address(ptdesc); + crst_table_init(table, val); return table; 
} static void base_crst_free(unsigned long *table) { - free_pages((unsigned long)table, CRST_ALLOC_ORDER); + if (!table) + return; + pagetable_free(virt_to_ptdesc(table)); } #define BASE_ADDR_END_FUNC(NAME, SIZE) \ @@ -469,7 +236,7 @@ static inline unsigned long base_##NAME##_addr_end(unsigned long addr, \ return (next - 1) < (end - 1) ? next : end; \ } -BASE_ADDR_END_FUNC(page, _PAGE_SIZE) +BASE_ADDR_END_FUNC(page, PAGE_SIZE) BASE_ADDR_END_FUNC(segment, _SEGMENT_SIZE) BASE_ADDR_END_FUNC(region3, _REGION3_SIZE) BASE_ADDR_END_FUNC(region2, _REGION2_SIZE) @@ -480,7 +247,7 @@ static inline unsigned long base_lra(unsigned long address) unsigned long real; asm volatile( - " lra %0,0(%1)\n" + " lra %0,0(%1)" : "=d" (real) : "a" (address) : "cc"); return real; } @@ -493,7 +260,7 @@ static int base_page_walk(unsigned long *origin, unsigned long addr, if (!alloc) return 0; pte = origin; - pte += (addr & _PAGE_INDEX) >> _PAGE_SHIFT; + pte += (addr & _PAGE_INDEX) >> PAGE_SHIFT; do { next = base_page_addr_end(addr, end); *pte = base_lra(addr); diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 4909dcd762e8..666adcd681ab 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -4,6 +4,8 @@ * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> */ +#include <linux/cpufeature.h> +#include <linux/export.h> #include <linux/sched.h> #include <linux/kernel.h> #include <linux/errno.h> @@ -14,15 +16,16 @@ #include <linux/spinlock.h> #include <linux/rcupdate.h> #include <linux/slab.h> -#include <linux/swapops.h> +#include <linux/leafops.h> #include <linux/sysctl.h> #include <linux/ksm.h> #include <linux/mman.h> -#include <asm/tlb.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> #include <asm/page-states.h> +#include <asm/pgtable.h> +#include <asm/machine.h> pgprot_t pgprot_writecombine(pgprot_t prot) { @@ -34,22 +37,12 @@ pgprot_t pgprot_writecombine(pgprot_t prot) } EXPORT_SYMBOL_GPL(pgprot_writecombine); -pgprot_t pgprot_writethrough(pgprot_t prot) -{ - /* - * mio_wb_bit_mask may be set on a different CPU, but it is only set - * once at init and only read afterwards. 
- */ - return __pgprot(pgprot_val(prot) & ~mio_wb_bit_mask); -} -EXPORT_SYMBOL_GPL(pgprot_writethrough); - static inline void ptep_ipte_local(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int nodat) { unsigned long opt, asce; - if (MACHINE_HAS_TLB_GUEST) { + if (machine_has_tlb_guest()) { opt = 0; asce = READ_ONCE(mm->context.gmap_asce); if (asce == 0UL || nodat) @@ -69,7 +62,7 @@ static inline void ptep_ipte_global(struct mm_struct *mm, unsigned long addr, { unsigned long opt, asce; - if (MACHINE_HAS_TLB_GUEST) { + if (machine_has_tlb_guest()) { opt = 0; asce = READ_ONCE(mm->context.gmap_asce); if (asce == 0UL || nodat) @@ -94,7 +87,7 @@ static inline pte_t ptep_flush_direct(struct mm_struct *mm, if (unlikely(pte_val(old) & _PAGE_INVALID)) return old; atomic_inc(&mm->context.flush_count); - if (MACHINE_HAS_TLB_LC && + if (cpu_has_tlb_lc() && cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) ptep_ipte_local(mm, addr, ptep, nodat); else @@ -123,37 +116,6 @@ static inline pte_t ptep_flush_lazy(struct mm_struct *mm, return old; } -static inline pgste_t pgste_get_lock(pte_t *ptep) -{ - unsigned long new = 0; -#ifdef CONFIG_PGSTE - unsigned long old; - - asm( - " lg %0,%2\n" - "0: lgr %1,%0\n" - " nihh %0,0xff7f\n" /* clear PCL bit in old */ - " oihh %1,0x0080\n" /* set PCL bit in new */ - " csg %0,%1,%2\n" - " jl 0b\n" - : "=&d" (old), "=&d" (new), "=Q" (ptep[PTRS_PER_PTE]) - : "Q" (ptep[PTRS_PER_PTE]) : "cc", "memory"); -#endif - return __pgste(new); -} - -static inline void pgste_set_unlock(pte_t *ptep, pgste_t pgste) -{ -#ifdef CONFIG_PGSTE - asm( - " nihh %1,0xff7f\n" /* clear PCL bit */ - " stg %1,%0\n" - : "=Q" (ptep[PTRS_PER_PTE]) - : "d" (pgste_val(pgste)), "Q" (ptep[PTRS_PER_PTE]) - : "cc", "memory"); -#endif -} - static inline pgste_t pgste_get(pte_t *ptep) { unsigned long pgste = 0; @@ -182,10 +144,10 @@ static inline pgste_t pgste_update_all(pte_t pte, pgste_t pgste, skey = (unsigned long) page_get_storage_key(address); bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED); /* Transfer page changed & referenced bit to guest bits in pgste */ - pgste_val(pgste) |= bits << 48; /* GR bit & GC bit */ + pgste = set_pgste_bit(pgste, bits << 48); /* GR bit & GC bit */ /* Copy page access key and fetch protection bit to pgste */ - pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT); - pgste_val(pgste) |= (skey & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56; + pgste = clear_pgste_bit(pgste, PGSTE_ACC_BITS | PGSTE_FP_BIT); + pgste = set_pgste_bit(pgste, (skey & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56); #endif return pgste; @@ -219,7 +181,7 @@ static inline pgste_t pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry) if ((pte_val(entry) & _PAGE_PRESENT) && (pte_val(entry) & _PAGE_WRITE) && !(pte_val(entry) & _PAGE_INVALID)) { - if (!MACHINE_HAS_ESOP) { + if (!machine_has_esop()) { /* * Without enhanced suppression-on-protection force * the dirty bit on for all writable ptes. 
@@ -229,7 +191,7 @@ static inline pgste_t pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry) } if (!(pte_val(entry) & _PAGE_PROTECT)) /* This pte allows write access, set user-dirty */ - pgste_val(pgste) |= PGSTE_UC_BIT; + pgste = set_pgste_bit(pgste, PGSTE_UC_BIT); } #endif set_pte(ptep, entry); @@ -245,7 +207,7 @@ static inline pgste_t pgste_pte_notify(struct mm_struct *mm, bits = pgste_val(pgste) & (PGSTE_IN_BIT | PGSTE_VSIE_BIT); if (bits) { - pgste_val(pgste) ^= bits; + pgste = __pgste(pgste_val(pgste) ^ bits); ptep_notify(mm, addr, ptep, bits); } #endif @@ -302,6 +264,31 @@ pte_t ptep_xchg_direct(struct mm_struct *mm, unsigned long addr, } EXPORT_SYMBOL(ptep_xchg_direct); +/* + * Caller must check that new PTE only differs in _PAGE_PROTECT HW bit, so that + * RDP can be used instead of IPTE. See also comments at pte_allow_rdp(). + */ +void ptep_reset_dat_prot(struct mm_struct *mm, unsigned long addr, pte_t *ptep, + pte_t new) +{ + preempt_disable(); + atomic_inc(&mm->context.flush_count); + if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) + __ptep_rdp(addr, ptep, 1); + else + __ptep_rdp(addr, ptep, 0); + /* + * PTE is not invalidated by RDP, only _PAGE_PROTECT is cleared. That + * means it is still valid and active, and must not be changed according + * to the architecture. But writing a new value that only differs in SW + * bits is allowed. + */ + set_pte(ptep, new); + atomic_dec(&mm->context.flush_count); + preempt_enable(); +} +EXPORT_SYMBOL(ptep_reset_dat_prot); + pte_t ptep_xchg_lazy(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t new) { @@ -327,7 +314,6 @@ pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, int nodat; struct mm_struct *mm = vma->vm_mm; - preempt_disable(); pgste = ptep_xchg_start(mm, addr, ptep); nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); old = ptep_flush_lazy(mm, addr, ptep, nodat); @@ -344,8 +330,6 @@ void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pgste_t pgste; struct mm_struct *mm = vma->vm_mm; - if (!MACHINE_HAS_NX) - pte = clear_pte_bit(pte, __pgprot(_PAGE_NOEXEC)); if (mm_has_pgste(mm)) { pgste = pgste_get(ptep); pgste_set_key(ptep, pgste, pte, mm); @@ -354,13 +338,12 @@ void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, } else { set_pte(ptep, pte); } - preempt_enable(); } static inline void pmdp_idte_local(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { - if (MACHINE_HAS_TLB_GUEST) + if (machine_has_tlb_guest()) __pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE, mm->context.asce, IDTE_LOCAL); else @@ -372,19 +355,15 @@ static inline void pmdp_idte_local(struct mm_struct *mm, static inline void pmdp_idte_global(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { - if (MACHINE_HAS_TLB_GUEST) { + if (machine_has_tlb_guest()) { __pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE, mm->context.asce, IDTE_GLOBAL); if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m) gmap_pmdp_idte_global(mm, addr); - } else if (MACHINE_HAS_IDTE) { + } else { __pmdp_idte(addr, pmdp, 0, 0, IDTE_GLOBAL); if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m) gmap_pmdp_idte_global(mm, addr); - } else { - __pmdp_csp(pmdp); - if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m) - gmap_pmdp_csp(mm, addr); } } @@ -397,7 +376,7 @@ static inline pmd_t pmdp_flush_direct(struct mm_struct *mm, if (pmd_val(old) & _SEGMENT_ENTRY_INVALID) return old; atomic_inc(&mm->context.flush_count); - if (MACHINE_HAS_TLB_LC && + if (cpu_has_tlb_lc() && 
cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) pmdp_idte_local(mm, addr, pmdp); else @@ -454,7 +433,7 @@ static int pmd_lookup(struct mm_struct *mm, unsigned long addr, pmd_t **pmdp) return -ENOENT; /* Large PUDs are not supported yet. */ - if (pud_large(*pud)) + if (pud_leaf(*pud)) return -EFAULT; *pmdp = pmd_offset(pud, addr); @@ -491,7 +470,7 @@ EXPORT_SYMBOL(pmdp_xchg_lazy); static inline void pudp_idte_local(struct mm_struct *mm, unsigned long addr, pud_t *pudp) { - if (MACHINE_HAS_TLB_GUEST) + if (machine_has_tlb_guest()) __pudp_idte(addr, pudp, IDTE_NODAT | IDTE_GUEST_ASCE, mm->context.asce, IDTE_LOCAL); else @@ -501,17 +480,11 @@ static inline void pudp_idte_local(struct mm_struct *mm, static inline void pudp_idte_global(struct mm_struct *mm, unsigned long addr, pud_t *pudp) { - if (MACHINE_HAS_TLB_GUEST) + if (machine_has_tlb_guest()) __pudp_idte(addr, pudp, IDTE_NODAT | IDTE_GUEST_ASCE, mm->context.asce, IDTE_GLOBAL); - else if (MACHINE_HAS_IDTE) - __pudp_idte(addr, pudp, 0, 0, IDTE_GLOBAL); else - /* - * Invalid bit position is the same for pmd and pud, so we can - * re-use _pmd_csp() here - */ - __pmdp_csp((pmd_t *) pudp); + __pudp_idte(addr, pudp, 0, 0, IDTE_GLOBAL); } static inline pud_t pudp_flush_direct(struct mm_struct *mm, @@ -523,7 +496,7 @@ static inline pud_t pudp_flush_direct(struct mm_struct *mm, if (pud_val(old) & _REGION_ENTRY_INVALID) return old; atomic_inc(&mm->context.flush_count); - if (MACHINE_HAS_TLB_LC && + if (cpu_has_tlb_lc() && cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) pudp_idte_local(mm, addr, pudp); else @@ -595,7 +568,7 @@ void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr, /* the mm_has_pgste() check is done in set_pte_at() */ preempt_disable(); pgste = pgste_get_lock(ptep); - pgste_val(pgste) &= ~_PGSTE_GPS_ZERO; + pgste = clear_pgste_bit(pgste, _PGSTE_GPS_ZERO); pgste_set_key(ptep, pgste, entry, mm); pgste = pgste_set_pte(ptep, pgste, entry); pgste_set_unlock(ptep, pgste); @@ -608,7 +581,7 @@ void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep) preempt_disable(); pgste = pgste_get_lock(ptep); - pgste_val(pgste) |= PGSTE_IN_BIT; + pgste = set_pgste_bit(pgste, PGSTE_IN_BIT); pgste_set_unlock(ptep, pgste); preempt_enable(); } @@ -653,7 +626,7 @@ int ptep_force_prot(struct mm_struct *mm, unsigned long addr, entry = clear_pte_bit(entry, __pgprot(_PAGE_INVALID)); entry = set_pte_bit(entry, __pgprot(_PAGE_PROTECT)); } - pgste_val(pgste) |= bit; + pgste = set_pgste_bit(pgste, bit); pgste = pgste_set_pte(ptep, pgste, entry); pgste_set_unlock(ptep, pgste); return 0; @@ -673,7 +646,7 @@ int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr, if (!(pte_val(spte) & _PAGE_INVALID) && !((pte_val(spte) & _PAGE_PROTECT) && !(pte_val(pte) & _PAGE_PROTECT))) { - pgste_val(spgste) |= PGSTE_VSIE_BIT; + spgste = set_pgste_bit(spgste, PGSTE_VSIE_BIT); tpgste = pgste_get_lock(tptep); tpte = __pte((pte_val(spte) & PAGE_MASK) | (pte_val(pte) & _PAGE_PROTECT)); @@ -700,14 +673,14 @@ void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep) pgste_set_unlock(ptep, pgste); } -static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry) +static void ptep_zap_softleaf_entry(struct mm_struct *mm, softleaf_t entry) { - if (!non_swap_entry(entry)) + if (softleaf_is_swap(entry)) dec_mm_counter(mm, MM_SWAPENTS); - else if (is_migration_entry(entry)) { - struct page *page = pfn_swap_entry_to_page(entry); + else if (softleaf_is_migration(entry)) { + struct folio *folio = 
softleaf_to_folio(entry); - dec_mm_counter(mm, mm_counter(page)); + dec_mm_counter(mm, mm_counter(folio)); } free_swap_and_cache(entry); } @@ -727,11 +700,11 @@ void ptep_zap_unused(struct mm_struct *mm, unsigned long addr, if (!reset && pte_swap(pte) && ((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED || (pgstev & _PGSTE_GPS_ZERO))) { - ptep_zap_swap_entry(mm, pte_to_swp_entry(pte)); + ptep_zap_softleaf_entry(mm, softleaf_from_pte(pte)); pte_clear(mm, addr, ptep); } if (reset) - pgste_val(pgste) &= ~_PGSTE_GPS_USAGE_MASK; + pgste = clear_pgste_bit(pgste, _PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT); pgste_set_unlock(ptep, pgste); preempt_enable(); } @@ -744,8 +717,8 @@ void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep) /* Clear storage key ACC and F, but set R/C */ preempt_disable(); pgste = pgste_get_lock(ptep); - pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT); - pgste_val(pgste) |= PGSTE_GR_BIT | PGSTE_GC_BIT; + pgste = clear_pgste_bit(pgste, PGSTE_ACC_BITS | PGSTE_FP_BIT); + pgste = set_pgste_bit(pgste, PGSTE_GR_BIT | PGSTE_GC_BIT); ptev = pte_val(*ptep); if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE)) page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 0); @@ -766,13 +739,13 @@ bool ptep_test_and_clear_uc(struct mm_struct *mm, unsigned long addr, pgste = pgste_get_lock(ptep); dirty = !!(pgste_val(pgste) & PGSTE_UC_BIT); - pgste_val(pgste) &= ~PGSTE_UC_BIT; + pgste = clear_pgste_bit(pgste, PGSTE_UC_BIT); pte = *ptep; if (dirty && (pte_val(pte) & _PAGE_PRESENT)) { pgste = pgste_pte_notify(mm, addr, ptep, pgste); nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); ptep_ipte_global(mm, addr, ptep, nodat); - if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE)) + if (machine_has_esop() || !(pte_val(pte) & _PAGE_WRITE)) pte = set_pte_bit(pte, __pgprot(_PAGE_PROTECT)); else pte = set_pte_bit(pte, __pgprot(_PAGE_INVALID)); @@ -804,14 +777,14 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, default: return -EFAULT; } - +again: ptl = pmd_lock(mm, pmdp); if (!pmd_present(*pmdp)) { spin_unlock(ptl); return key ? 
-EFAULT : 0; } - if (pmd_large(*pmdp)) { + if (pmd_leaf(*pmdp)) { paddr = pmd_val(*pmdp) & HPAGE_MASK; paddr |= addr & ~HPAGE_MASK; /* @@ -825,12 +798,14 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, spin_unlock(ptl); ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + if (!ptep) + goto again; new = old = pgste_get_lock(ptep); - pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT | - PGSTE_ACC_BITS | PGSTE_FP_BIT); + new = clear_pgste_bit(new, PGSTE_GR_BIT | PGSTE_GC_BIT | + PGSTE_ACC_BITS | PGSTE_FP_BIT); keyul = (unsigned long) key; - pgste_val(new) |= (keyul & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48; - pgste_val(new) |= (keyul & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56; + new = set_pgste_bit(new, (keyul & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48); + new = set_pgste_bit(new, (keyul & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56); if (!(pte_val(*ptep) & _PAGE_INVALID)) { unsigned long bits, skey; @@ -841,12 +816,12 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, /* Set storage key ACC and FP */ page_set_storage_key(paddr, skey, !nq); /* Merge host changed & referenced into pgste */ - pgste_val(new) |= bits << 52; + new = set_pgste_bit(new, bits << 52); } /* changing the guest storage key is considered a change of the page */ if ((pgste_val(new) ^ pgste_val(old)) & (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT)) - pgste_val(new) |= PGSTE_UC_BIT; + new = set_pgste_bit(new, PGSTE_UC_BIT); pgste_set_unlock(ptep, new); pte_unmap_unlock(ptep, ptl); @@ -913,14 +888,14 @@ int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr) default: return -EFAULT; } - +again: ptl = pmd_lock(mm, pmdp); if (!pmd_present(*pmdp)) { spin_unlock(ptl); return 0; } - if (pmd_large(*pmdp)) { + if (pmd_leaf(*pmdp)) { paddr = pmd_val(*pmdp) & HPAGE_MASK; paddr |= addr & ~HPAGE_MASK; cc = page_reset_referenced(paddr); @@ -930,21 +905,23 @@ int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr) spin_unlock(ptl); ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + if (!ptep) + goto again; new = old = pgste_get_lock(ptep); /* Reset guest reference bit only */ - pgste_val(new) &= ~PGSTE_GR_BIT; + new = clear_pgste_bit(new, PGSTE_GR_BIT); if (!(pte_val(*ptep) & _PAGE_INVALID)) { paddr = pte_val(*ptep) & PAGE_MASK; cc = page_reset_referenced(paddr); /* Merge real referenced bit into host-set */ - pgste_val(new) |= ((unsigned long) cc << 53) & PGSTE_HR_BIT; + new = set_pgste_bit(new, ((unsigned long)cc << 53) & PGSTE_HR_BIT); } /* Reflect guest's logical view, not physical */ cc |= (pgste_val(old) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 49; /* Changing the guest storage key is considered a change of the page */ if ((pgste_val(new) ^ pgste_val(old)) & PGSTE_GR_BIT) - pgste_val(new) |= PGSTE_UC_BIT; + new = set_pgste_bit(new, PGSTE_UC_BIT); pgste_set_unlock(ptep, new); pte_unmap_unlock(ptep, ptl); @@ -975,14 +952,14 @@ int get_guest_storage_key(struct mm_struct *mm, unsigned long addr, default: return -EFAULT; } - +again: ptl = pmd_lock(mm, pmdp); if (!pmd_present(*pmdp)) { spin_unlock(ptl); return 0; } - if (pmd_large(*pmdp)) { + if (pmd_leaf(*pmdp)) { paddr = pmd_val(*pmdp) & HPAGE_MASK; paddr |= addr & ~HPAGE_MASK; *key = page_get_storage_key(paddr); @@ -992,6 +969,8 @@ int get_guest_storage_key(struct mm_struct *mm, unsigned long addr, spin_unlock(ptl); ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + if (!ptep) + goto again; pgste = pgste_get_lock(ptep); *key = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56; paddr = 
pte_val(*ptep) & PAGE_MASK; @@ -1106,7 +1085,7 @@ int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc, if (res) pgstev |= _PGSTE_GPS_ZERO; - pgste_val(pgste) = pgstev; + pgste = __pgste(pgstev); pgste_set_unlock(ptep, pgste); pte_unmap_unlock(ptep, ptl); return res; @@ -1139,8 +1118,8 @@ int set_pgste_bits(struct mm_struct *mm, unsigned long hva, return -EFAULT; new = pgste_get_lock(ptep); - pgste_val(new) &= ~bits; - pgste_val(new) |= value & bits; + new = clear_pgste_bit(new, bits); + new = set_pgste_bit(new, value & bits); pgste_set_unlock(ptep, new); pte_unmap_unlock(ptep, ptl); diff --git a/arch/s390/mm/physaddr.c b/arch/s390/mm/physaddr.c new file mode 100644 index 000000000000..59de866c72d9 --- /dev/null +++ b/arch/s390/mm/physaddr.c @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/mmdebug.h> +#include <linux/export.h> +#include <linux/mm.h> +#include <asm/page.h> + +unsigned long __phys_addr(unsigned long x, bool is_31bit) +{ + VIRTUAL_BUG_ON(is_vmalloc_or_module_addr((void *)(x))); + x = __pa_nodebug(x); + if (is_31bit) + VIRTUAL_BUG_ON(x >> 31); + return x; +} +EXPORT_SYMBOL(__phys_addr); diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index ee1a97078527..d96587b84e81 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -4,6 +4,8 @@ */ #include <linux/memory_hotplug.h> +#include <linux/bootmem_info.h> +#include <linux/cpufeature.h> #include <linux/memblock.h> #include <linux/pfn.h> #include <linux/mm.h> @@ -11,13 +13,19 @@ #include <linux/list.h> #include <linux/hugetlb.h> #include <linux/slab.h> +#include <linux/sort.h> +#include <asm/page-states.h> +#include <asm/abs_lowcore.h> #include <asm/cacheflush.h> +#include <asm/maccess.h> #include <asm/nospec-branch.h> +#include <asm/ctlreg.h> #include <asm/pgalloc.h> #include <asm/setup.h> #include <asm/tlbflush.h> #include <asm/sections.h> #include <asm/set_memory.h> +#include <asm/physmem_info.h> static DEFINE_MUTEX(vmem_mutex); @@ -30,13 +38,23 @@ static void __ref *vmem_alloc_pages(unsigned int order) return memblock_alloc(size, size); } -static void vmem_free_pages(unsigned long addr, int order) +static void vmem_free_pages(unsigned long addr, int order, struct vmem_altmap *altmap) { - /* We don't expect boot memory to be removed ever. 
*/ - if (!slab_is_available() || - WARN_ON_ONCE(PageReserved(virt_to_page(addr)))) + unsigned int nr_pages = 1 << order; + struct page *page; + + if (altmap) { + vmem_altmap_free(altmap, 1 << order); return; - free_pages(addr, order); + } + page = virt_to_page((void *)addr); + if (PageReserved(page)) { + /* allocated from memblock */ + while (nr_pages--) + free_bootmem_page(page++); + } else { + free_pages(addr, order); + } } void *vmem_crst_alloc(unsigned long val) @@ -44,32 +62,30 @@ void *vmem_crst_alloc(unsigned long val) unsigned long *table; table = vmem_alloc_pages(CRST_ALLOC_ORDER); - if (table) - crst_table_init(table, val); + if (!table) + return NULL; + crst_table_init(table, val); + __arch_set_page_dat(table, 1UL << CRST_ALLOC_ORDER); return table; } pte_t __ref *vmem_pte_alloc(void) { - unsigned long size = PTRS_PER_PTE * sizeof(pte_t); pte_t *pte; if (slab_is_available()) - pte = (pte_t *) page_table_alloc(&init_mm); + pte = (pte_t *)page_table_alloc(&init_mm); else - pte = (pte_t *) memblock_alloc(size, size); + pte = (pte_t *)memblock_alloc(PAGE_SIZE, PAGE_SIZE); if (!pte) return NULL; memset64((u64 *)pte, _PAGE_INVALID, PTRS_PER_PTE); + __arch_set_page_dat(pte, 1); return pte; } static void vmem_pte_free(unsigned long *table) { - /* We don't expect boot memory to be removed ever. */ - if (!slab_is_available() || - WARN_ON_ONCE(PageReserved(virt_to_page(table)))) - return; page_table_free(&init_mm, table); } @@ -150,27 +166,25 @@ static bool vmemmap_unuse_sub_pmd(unsigned long start, unsigned long end) /* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */ static int __ref modify_pte_table(pmd_t *pmd, unsigned long addr, - unsigned long end, bool add, bool direct) + unsigned long end, bool add, bool direct, + struct vmem_altmap *altmap) { unsigned long prot, pages = 0; int ret = -ENOMEM; pte_t *pte; prot = pgprot_val(PAGE_KERNEL); - if (!MACHINE_HAS_NX) - prot &= ~_PAGE_NOEXEC; - pte = pte_offset_kernel(pmd, addr); for (; addr < end; addr += PAGE_SIZE, pte++) { if (!add) { if (pte_none(*pte)) continue; if (!direct) - vmem_free_pages((unsigned long) pfn_to_virt(pte_pfn(*pte)), 0); + vmem_free_pages((unsigned long)pfn_to_virt(pte_pfn(*pte)), get_order(PAGE_SIZE), altmap); pte_clear(&init_mm, addr, pte); } else if (pte_none(*pte)) { if (!direct) { - void *new_page = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE); + void *new_page = vmemmap_alloc_block_buf(PAGE_SIZE, NUMA_NO_NODE, altmap); if (!new_page) goto out; @@ -207,7 +221,8 @@ static void try_free_pte_table(pmd_t *pmd, unsigned long start) /* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, - unsigned long end, bool add, bool direct) + unsigned long end, bool add, bool direct, + struct vmem_altmap *altmap) { unsigned long next, prot, pages = 0; int ret = -ENOMEM; @@ -215,24 +230,21 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, pte_t *pte; prot = pgprot_val(SEGMENT_KERNEL); - if (!MACHINE_HAS_NX) - prot &= ~_SEGMENT_ENTRY_NOEXEC; - pmd = pmd_offset(pud, addr); for (; addr < end; addr = next, pmd++) { next = pmd_addr_end(addr, end); if (!add) { if (pmd_none(*pmd)) continue; - if (pmd_large(*pmd)) { + if (pmd_leaf(*pmd)) { if (IS_ALIGNED(addr, PMD_SIZE) && IS_ALIGNED(next, PMD_SIZE)) { if (!direct) - vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE)); + vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE), altmap); pmd_clear(pmd); pages++; } else if (!direct && vmemmap_unuse_sub_pmd(addr, next)) 
{ - vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE)); + vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE), altmap); pmd_clear(pmd); } continue; @@ -240,12 +252,12 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, } else if (pmd_none(*pmd)) { if (IS_ALIGNED(addr, PMD_SIZE) && IS_ALIGNED(next, PMD_SIZE) && - MACHINE_HAS_EDAT1 && direct && + cpu_has_edat1() && direct && !debug_pagealloc_enabled()) { set_pmd(pmd, __pmd(__pa(addr) | prot)); pages++; continue; - } else if (!direct && MACHINE_HAS_EDAT1) { + } else if (!direct && cpu_has_edat1()) { void *new_page; /* @@ -255,7 +267,7 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, * page tables since vmemmap_populate gets * called for each section separately. */ - new_page = vmemmap_alloc_block(PMD_SIZE, NUMA_NO_NODE); + new_page = vmemmap_alloc_block_buf(PMD_SIZE, NUMA_NO_NODE, altmap); if (new_page) { set_pmd(pmd, __pmd(__pa(new_page) | prot)); if (!IS_ALIGNED(addr, PMD_SIZE) || @@ -269,12 +281,12 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, if (!pte) goto out; pmd_populate(&init_mm, pmd, pte); - } else if (pmd_large(*pmd)) { + } else if (pmd_leaf(*pmd)) { if (!direct) vmemmap_use_sub_pmd(addr, next); continue; } - ret = modify_pte_table(pmd, addr, next, add, direct); + ret = modify_pte_table(pmd, addr, next, add, direct, altmap); if (ret) goto out; if (!add) @@ -289,27 +301,19 @@ out: static void try_free_pmd_table(pud_t *pud, unsigned long start) { - const unsigned long end = start + PUD_SIZE; pmd_t *pmd; int i; - /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */ - if (end > VMALLOC_START) - return; -#ifdef CONFIG_KASAN - if (start < KASAN_SHADOW_END && KASAN_SHADOW_START > end) - return; -#endif pmd = pmd_offset(pud, start); for (i = 0; i < PTRS_PER_PMD; i++, pmd++) if (!pmd_none(*pmd)) return; - vmem_free_pages(pud_deref(*pud), CRST_ALLOC_ORDER); + vmem_free_pages(pud_deref(*pud), CRST_ALLOC_ORDER, NULL); pud_clear(pud); } static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end, - bool add, bool direct) + bool add, bool direct, struct vmem_altmap *altmap) { unsigned long next, prot, pages = 0; int ret = -ENOMEM; @@ -317,15 +321,13 @@ static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end, pmd_t *pmd; prot = pgprot_val(REGION3_KERNEL); - if (!MACHINE_HAS_NX) - prot &= ~_REGION_ENTRY_NOEXEC; pud = pud_offset(p4d, addr); for (; addr < end; addr = next, pud++) { next = pud_addr_end(addr, end); if (!add) { if (pud_none(*pud)) continue; - if (pud_large(*pud)) { + if (pud_leaf(*pud)) { if (IS_ALIGNED(addr, PUD_SIZE) && IS_ALIGNED(next, PUD_SIZE)) { pud_clear(pud); @@ -336,7 +338,7 @@ static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end, } else if (pud_none(*pud)) { if (IS_ALIGNED(addr, PUD_SIZE) && IS_ALIGNED(next, PUD_SIZE) && - MACHINE_HAS_EDAT2 && direct && + cpu_has_edat2() && direct && !debug_pagealloc_enabled()) { set_pud(pud, __pud(__pa(addr) | prot)); pages++; @@ -346,10 +348,10 @@ static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end, if (!pmd) goto out; pud_populate(&init_mm, pud, pmd); - } else if (pud_large(*pud)) { + } else if (pud_leaf(*pud)) { continue; } - ret = modify_pmd_table(pud, addr, next, add, direct); + ret = modify_pmd_table(pud, addr, next, add, direct, altmap); if (ret) goto out; if (!add) @@ -364,29 +366,20 @@ out: static void try_free_pud_table(p4d_t *p4d, unsigned long start) { - const unsigned long end = start + P4D_SIZE; pud_t 
*pud; int i; - /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */ - if (end > VMALLOC_START) - return; -#ifdef CONFIG_KASAN - if (start < KASAN_SHADOW_END && KASAN_SHADOW_START > end) - return; -#endif - pud = pud_offset(p4d, start); for (i = 0; i < PTRS_PER_PUD; i++, pud++) { if (!pud_none(*pud)) return; } - vmem_free_pages(p4d_deref(*p4d), CRST_ALLOC_ORDER); + vmem_free_pages(p4d_deref(*p4d), CRST_ALLOC_ORDER, NULL); p4d_clear(p4d); } static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end, - bool add, bool direct) + bool add, bool direct, struct vmem_altmap *altmap) { unsigned long next; int ret = -ENOMEM; @@ -405,7 +398,7 @@ static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end, goto out; p4d_populate(&init_mm, p4d, pud); } - ret = modify_pud_table(p4d, addr, next, add, direct); + ret = modify_pud_table(p4d, addr, next, add, direct, altmap); if (ret) goto out; if (!add) @@ -418,29 +411,20 @@ out: static void try_free_p4d_table(pgd_t *pgd, unsigned long start) { - const unsigned long end = start + PGDIR_SIZE; p4d_t *p4d; int i; - /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */ - if (end > VMALLOC_START) - return; -#ifdef CONFIG_KASAN - if (start < KASAN_SHADOW_END && KASAN_SHADOW_START > end) - return; -#endif - p4d = p4d_offset(pgd, start); for (i = 0; i < PTRS_PER_P4D; i++, p4d++) { if (!p4d_none(*p4d)) return; } - vmem_free_pages(pgd_deref(*pgd), CRST_ALLOC_ORDER); + vmem_free_pages(pgd_deref(*pgd), CRST_ALLOC_ORDER, NULL); pgd_clear(pgd); } static int modify_pagetable(unsigned long start, unsigned long end, bool add, - bool direct) + bool direct, struct vmem_altmap *altmap) { unsigned long addr, next; int ret = -ENOMEM; @@ -449,6 +433,9 @@ static int modify_pagetable(unsigned long start, unsigned long end, bool add, if (WARN_ON_ONCE(!PAGE_ALIGNED(start | end))) return -EINVAL; + /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */ + if (WARN_ON_ONCE(end > __abs_lowcore)) + return -EINVAL; for (addr = start; addr < end; addr = next) { next = pgd_addr_end(addr, end); pgd = pgd_offset_k(addr); @@ -462,7 +449,7 @@ static int modify_pagetable(unsigned long start, unsigned long end, bool add, goto out; pgd_populate(&init_mm, pgd, p4d); } - ret = modify_p4d_table(pgd, addr, next, add, direct); + ret = modify_p4d_table(pgd, addr, next, add, direct, altmap); if (ret) goto out; if (!add) @@ -475,14 +462,16 @@ out: return ret; } -static int add_pagetable(unsigned long start, unsigned long end, bool direct) +static int add_pagetable(unsigned long start, unsigned long end, bool direct, + struct vmem_altmap *altmap) { - return modify_pagetable(start, end, true, direct); + return modify_pagetable(start, end, true, direct, altmap); } -static int remove_pagetable(unsigned long start, unsigned long end, bool direct) +static int remove_pagetable(unsigned long start, unsigned long end, bool direct, + struct vmem_altmap *altmap) { - return modify_pagetable(start, end, false, direct); + return modify_pagetable(start, end, false, direct, altmap); } /* @@ -490,7 +479,8 @@ static int remove_pagetable(unsigned long start, unsigned long end, bool direct) */ static int vmem_add_range(unsigned long start, unsigned long size) { - return add_pagetable(start, start + size, true); + start = (unsigned long)__va(start); + return add_pagetable(start, start + size, true, NULL); } /* @@ -498,7 +488,8 @@ static int vmem_add_range(unsigned long start, unsigned long size) */ static void 
vmem_remove_range(unsigned long start, unsigned long size) { - remove_pagetable(start, start + size, true); + start = (unsigned long)__va(start); + remove_pagetable(start, start + size, true, NULL); } /* @@ -511,21 +502,25 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, mutex_lock(&vmem_mutex); /* We don't care about the node, just use NUMA_NO_NODE on allocations */ - ret = add_pagetable(start, end, false); + ret = add_pagetable(start, end, false, altmap); if (ret) - remove_pagetable(start, end, false); + remove_pagetable(start, end, false, altmap); mutex_unlock(&vmem_mutex); return ret; } +#ifdef CONFIG_MEMORY_HOTPLUG + void vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *altmap) { mutex_lock(&vmem_mutex); - remove_pagetable(start, end, false); + remove_pagetable(start, end, false, altmap); mutex_unlock(&vmem_mutex); } +#endif + void vmem_remove_mapping(unsigned long start, unsigned long size) { mutex_lock(&vmem_mutex); @@ -538,7 +533,7 @@ struct range arch_get_mappable_range(void) struct range mhp_range; mhp_range.start = 0; - mhp_range.end = VMEM_MAX_PHYS - 1; + mhp_range.end = max_mappable - 1; return mhp_range; } @@ -565,7 +560,7 @@ int vmem_add_mapping(unsigned long start, unsigned long size) * to any physical address. If missing, allocate segment- and region- * table entries along. Meeting a large segment- or region-table entry * while traversing is an error, since the function is expected to be - * called against virtual regions reserverd for 4KB mappings only. + * called against virtual regions reserved for 4KB mappings only. */ pte_t *vmem_get_alloc_pte(unsigned long addr, bool alloc) { @@ -602,7 +597,7 @@ pte_t *vmem_get_alloc_pte(unsigned long addr, bool alloc) if (!pmd) goto out; pud_populate(&init_mm, pud, pmd); - } else if (WARN_ON_ONCE(pud_large(*pud))) { + } else if (WARN_ON_ONCE(pud_leaf(*pud))) { goto out; } pmd = pmd_offset(pud, addr); @@ -613,7 +608,7 @@ pte_t *vmem_get_alloc_pte(unsigned long addr, bool alloc) if (!pte) goto out; pmd_populate(&init_mm, pmd, pte); - } else if (WARN_ON_ONCE(pmd_large(*pmd))) { + } else if (WARN_ON_ONCE(pmd_leaf(*pmd))) { goto out; } ptep = pte_offset_kernel(pmd, addr); @@ -657,37 +652,20 @@ void vmem_unmap_4k_page(unsigned long addr) mutex_unlock(&vmem_mutex); } -/* - * map whole physical memory to virtual memory (identity mapping) - * we reserve enough space in the vmalloc area for vmemmap to hotplug - * additional memory segments. 
- */ void __init vmem_map_init(void) { - phys_addr_t base, end; - u64 i; - - for_each_mem_range(i, &base, &end) - vmem_add_range(base, end - base); - __set_memory((unsigned long)_stext, - (unsigned long)(_etext - _stext) >> PAGE_SHIFT, - SET_MEMORY_RO | SET_MEMORY_X); - __set_memory((unsigned long)_etext, - (unsigned long)(__end_rodata - _etext) >> PAGE_SHIFT, - SET_MEMORY_RO); - __set_memory((unsigned long)_sinittext, - (unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT, - SET_MEMORY_RO | SET_MEMORY_X); - __set_memory(__stext_amode31, (__etext_amode31 - __stext_amode31) >> PAGE_SHIFT, - SET_MEMORY_RO | SET_MEMORY_X); - - /* lowcore requires 4k mapping for real addresses / prefixing */ - set_memory_4k(0, LC_PAGES); - - /* lowcore must be executable for LPSWE */ - if (!static_key_enabled(&cpu_has_bear)) + __set_memory_rox(_stext, _etext); + __set_memory_ro(_etext, __end_rodata); + __set_memory_rox(__stext_amode31, __etext_amode31); + /* + * If the BEAR-enhancement facility is not installed the first + * prefix page is used to return to the previous context with + * an LPSWE instruction and therefore must be executable. + */ + if (!cpu_has_bear()) set_memory_x(0, 1); - + if (debug_pagealloc_enabled()) + __set_memory_4k(__va(0), absolute_pointer(__va(0)) + ident_map_size); pr_info("Write protected kernel read-only data: %luk\n", (unsigned long)(__end_rodata - _stext) >> 10); } |
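A detail of the new change_page_attr_alias() added in pageattr.c above that is easy to miss: when a read-only or read-write change is requested on a vmalloc mapping, the same change is also applied to the direct-map alias of every page backing that mapping, while execute permissions are intentionally not propagated so the direct mapping stays non-executable. The toy program below is a minimal sketch of that flag filtering and per-page alias walk; all names, the fixed 4 KiB page size, and the pfn array are hypothetical stand-ins, not the kernel's find_vm_area()/page_address() machinery.

```c
#include <stdio.h>

#define SET_MEMORY_RO 0x01UL
#define SET_MEMORY_RW 0x02UL
#define SET_MEMORY_X  0x04UL	/* deliberately never mirrored to the direct map */

/* Toy model of a vmalloc area: nr_pages virtual pages, each backed by a pfn. */
struct toy_vm_area {
	unsigned long va_start;
	unsigned long nr_pages;
	unsigned long pfns[4];
};

/* Stand-in for applying attributes to one direct-map page. */
static void toy_change_direct_map(unsigned long pfn, unsigned long flags)
{
	printf("direct map pfn %lu: %s\n", pfn,
	       (flags & SET_MEMORY_RO) ? "read-only" : "read-write");
}

/*
 * Mirrors the intent of change_page_attr_alias(): only RO/RW requests are
 * forwarded to the direct-map alias of each backing page; executable
 * permissions stay confined to the vmalloc mapping itself.
 */
static void toy_change_alias(struct toy_vm_area *area, unsigned long addr,
			     unsigned long end, unsigned long flags)
{
	unsigned long offset;

	flags &= SET_MEMORY_RO | SET_MEMORY_RW;
	if (!flags)
		return;
	for (; addr < end; addr += 4096) {
		offset = (addr - area->va_start) / 4096;
		if (offset >= area->nr_pages)
			break;
		toy_change_direct_map(area->pfns[offset], flags);
	}
}

int main(void)
{
	struct toy_vm_area area = {
		.va_start = 0x100000, .nr_pages = 4,
		.pfns = { 10, 42, 7, 99 },
	};

	/* make the first two pages of the area read-only + executable */
	toy_change_alias(&area, area.va_start, area.va_start + 2 * 4096,
			 SET_MEMORY_RO | SET_MEMORY_X);
	return 0;
}
```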

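For readers unfamiliar with the pfault handshake implemented in the new arch/s390/mm/pfault.c above: the handler tracks per-task state in thread.pfault_wait, where 0 means no fault is outstanding, 1 means the initial "page missing" interrupt arrived and the task should sleep, and -1 means the completion interrupt overtook the initial one on another virtual CPU. The standalone C sketch below only traces those transitions for both interrupt orderings; it is a hypothetical userspace model that ignores the spinlock, task references, the pfault_list, and the scheduler interaction described in the comments.

```c
#include <stdio.h>

/* Mirrors thread.pfault_wait: 0 = idle, 1 = waiting, -1 = completion came first. */
struct task {
	int pfault_wait;
	int running;
};

/* Initial interrupt: the host reports a missing page for the current task. */
static void pfault_initial(struct task *t)
{
	if (t->pfault_wait == 1) {
		/* already accounted for: just block again */
		printf("initial: already waiting -> block\n");
	} else if (t->pfault_wait == -1) {
		/* completion overtook us: clear state, do not sleep */
		t->pfault_wait = 0;
		printf("initial: completion was faster -> keep running\n");
	} else {
		t->pfault_wait = 1;
		printf("initial: mark waiting -> block until completion\n");
	}
}

/* Completion interrupt: the host signals that the page was paged in. */
static void pfault_complete(struct task *t)
{
	if (t->pfault_wait == 1) {
		t->pfault_wait = 0;
		printf("complete: wake up sleeping task\n");
	} else if (t->running) {
		/* completion beat the initial interrupt */
		t->pfault_wait = -1;
		printf("complete: arrived first -> remember via -1\n");
	} else {
		printf("complete: stale leftover -> ignore\n");
	}
}

int main(void)
{
	struct task t = { .pfault_wait = 0, .running = 1 };

	/* normal ordering: initial interrupt, then completion */
	pfault_initial(&t);
	pfault_complete(&t);

	/* reversed ordering, as can happen with virtual CPUs */
	pfault_complete(&t);
	pfault_initial(&t);
	return 0;
}
```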