Diffstat (limited to 'arch/s390/mm')
-rw-r--r--  arch/s390/mm/Makefile           |   12
-rw-r--r--  arch/s390/mm/cmm.c              |  212
-rw-r--r--  arch/s390/mm/dump_pagetables.c  |  450
-rw-r--r--  arch/s390/mm/extable.c          |  202
-rw-r--r--  arch/s390/mm/extmem.c           |  295
-rw-r--r--  arch/s390/mm/fault.c            |  893
-rw-r--r--  arch/s390/mm/gmap.c             | 2436
-rw-r--r--  arch/s390/mm/gmap_helpers.c     |  233
-rw-r--r--  arch/s390/mm/gup.c              |  281
-rw-r--r--  arch/s390/mm/hugetlbpage.c      |  289
-rw-r--r--  arch/s390/mm/init.c             |  362
-rw-r--r--  arch/s390/mm/maccess.c          |  301
-rw-r--r--  arch/s390/mm/mem_detect.c       |  135
-rw-r--r--  arch/s390/mm/mmap.c             |  266
-rw-r--r--  arch/s390/mm/page-states.c      |  102
-rw-r--r--  arch/s390/mm/pageattr.c         |  475
-rw-r--r--  arch/s390/mm/pfault.c           |  248
-rw-r--r--  arch/s390/mm/pgalloc.c          |  481
-rw-r--r--  arch/s390/mm/pgtable.c          | 1933
-rw-r--r--  arch/s390/mm/physaddr.c         |   15
-rw-r--r--  arch/s390/mm/vmem.c             |  838
21 files changed, 6970 insertions(+), 3489 deletions(-)
diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile index 839592ca265c..bd0401cc7ca5 100644 --- a/arch/s390/mm/Makefile +++ b/arch/s390/mm/Makefile @@ -1,10 +1,16 @@ +# SPDX-License-Identifier: GPL-2.0 # # Makefile for the linux s390-specific parts of the memory manager. # -obj-y := init.o fault.o extmem.o mmap.o vmem.o pgtable.o maccess.o -obj-y += page-states.o gup.o extable.o pageattr.o mem_detect.o +obj-y := init.o fault.o extmem.o mmap.o vmem.o maccess.o +obj-y += page-states.o pageattr.o pgtable.o pgalloc.o extable.o obj-$(CONFIG_CMM) += cmm.o +obj-$(CONFIG_DEBUG_VIRTUAL) += physaddr.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o -obj-$(CONFIG_S390_PTDUMP) += dump_pagetables.o +obj-$(CONFIG_PTDUMP) += dump_pagetables.o +obj-$(CONFIG_PGSTE) += gmap.o +obj-$(CONFIG_PFAULT) += pfault.o + +obj-$(subst m,y,$(CONFIG_KVM)) += gmap_helpers.o diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c index 9d84a1feefef..eb7ef63fab1e 100644 --- a/arch/s390/mm/cmm.c +++ b/arch/s390/mm/cmm.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Collaborative memory management interface. * @@ -10,17 +11,16 @@ #include <linux/fs.h> #include <linux/init.h> #include <linux/module.h> +#include <linux/moduleparam.h> #include <linux/gfp.h> #include <linux/sched.h> +#include <linux/string_helpers.h> #include <linux/sysctl.h> -#include <linux/ctype.h> #include <linux/swap.h> #include <linux/kthread.h> #include <linux/oom.h> -#include <linux/suspend.h> #include <linux/uaccess.h> -#include <asm/pgalloc.h> #include <asm/diag.h> #ifdef CONFIG_CMM_IUCV @@ -47,7 +47,6 @@ static volatile long cmm_pages_target; static volatile long cmm_timed_pages_target; static long cmm_timeout_pages; static long cmm_timeout_seconds; -static int cmm_suspended; static struct cmm_page_array *cmm_page_list; static struct cmm_page_array *cmm_timed_page_list; @@ -55,10 +54,10 @@ static DEFINE_SPINLOCK(cmm_lock); static struct task_struct *cmm_thread_ptr; static DECLARE_WAIT_QUEUE_HEAD(cmm_thread_wait); -static DEFINE_TIMER(cmm_timer, NULL, 0, 0); -static void cmm_timer_fn(unsigned long); +static void cmm_timer_fn(struct timer_list *); static void cmm_set_timer(void); +static DEFINE_TIMER(cmm_timer, cmm_timer_fn); static long cmm_alloc_pages(long nr, long *counter, struct cmm_page_array **list) @@ -91,16 +90,17 @@ static long cmm_alloc_pages(long nr, long *counter, } else free_page((unsigned long) npa); } - diag10_range(addr >> PAGE_SHIFT, 1); + diag10_range(virt_to_pfn((void *)addr), 1); pa->pages[pa->index++] = addr; (*counter)++; spin_unlock(&cmm_lock); nr--; + cond_resched(); } return nr; } -static long cmm_free_pages(long nr, long *counter, struct cmm_page_array **list) +static long __cmm_free_pages(long nr, long *counter, struct cmm_page_array **list) { struct cmm_page_array *pa; unsigned long addr; @@ -124,6 +124,21 @@ static long cmm_free_pages(long nr, long *counter, struct cmm_page_array **list) return nr; } +static long cmm_free_pages(long nr, long *counter, struct cmm_page_array **list) +{ + long inc = 0; + + while (nr) { + inc = min(256L, nr); + nr -= inc; + inc = __cmm_free_pages(inc, counter, list); + if (inc) + break; + cond_resched(); + } + return nr + inc; +} + static int cmm_oom_notify(struct notifier_block *self, unsigned long dummy, void *parm) { @@ -149,9 +164,9 @@ static int cmm_thread(void *dummy) while (1) { rc = wait_event_interruptible(cmm_thread_wait, - (!cmm_suspended && (cmm_pages != cmm_pages_target || - cmm_timed_pages != cmm_timed_pages_target)) || - kthread_should_stop()); + cmm_pages != 
cmm_pages_target || + cmm_timed_pages != cmm_timed_pages_target || + kthread_should_stop()); if (kthread_should_stop() || rc == -ERESTARTSYS) { cmm_pages_target = cmm_pages; cmm_timed_pages_target = cmm_timed_pages; @@ -186,20 +201,13 @@ static void cmm_set_timer(void) { if (cmm_timed_pages_target <= 0 || cmm_timeout_seconds <= 0) { if (timer_pending(&cmm_timer)) - del_timer(&cmm_timer); + timer_delete(&cmm_timer); return; } - if (timer_pending(&cmm_timer)) { - if (mod_timer(&cmm_timer, jiffies + cmm_timeout_seconds*HZ)) - return; - } - cmm_timer.function = cmm_timer_fn; - cmm_timer.data = 0; - cmm_timer.expires = jiffies + cmm_timeout_seconds*HZ; - add_timer(&cmm_timer); + mod_timer(&cmm_timer, jiffies + secs_to_jiffies(cmm_timeout_seconds)); } -static void cmm_timer_fn(unsigned long ignored) +static void cmm_timer_fn(struct timer_list *unused) { long nr; @@ -251,54 +259,51 @@ static int cmm_skip_blanks(char *cp, char **endp) return str != cp; } -static struct ctl_table cmm_table[]; +static int cmm_pages_handler(const struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + long nr = cmm_get_pages(); + struct ctl_table ctl_entry = { + .procname = ctl->procname, + .data = &nr, + .maxlen = sizeof(long), + }; + int rc; + + rc = proc_doulongvec_minmax(&ctl_entry, write, buffer, lenp, ppos); + if (rc < 0 || !write) + return rc; -static int cmm_pages_handler(ctl_table *ctl, int write, void __user *buffer, - size_t *lenp, loff_t *ppos) + cmm_set_pages(nr); + return 0; +} + +static int cmm_timed_pages_handler(const struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, + loff_t *ppos) { - char buf[16], *p; - long nr; - int len; + long nr = cmm_get_timed_pages(); + struct ctl_table ctl_entry = { + .procname = ctl->procname, + .data = &nr, + .maxlen = sizeof(long), + }; + int rc; - if (!*lenp || (*ppos && !write)) { - *lenp = 0; - return 0; - } + rc = proc_doulongvec_minmax(&ctl_entry, write, buffer, lenp, ppos); + if (rc < 0 || !write) + return rc; - if (write) { - len = *lenp; - if (copy_from_user(buf, buffer, - len > sizeof(buf) ? sizeof(buf) : len)) - return -EFAULT; - buf[sizeof(buf) - 1] = '\0'; - cmm_skip_blanks(buf, &p); - nr = simple_strtoul(p, &p, 0); - if (ctl == &cmm_table[0]) - cmm_set_pages(nr); - else - cmm_add_timed_pages(nr); - } else { - if (ctl == &cmm_table[0]) - nr = cmm_get_pages(); - else - nr = cmm_get_timed_pages(); - len = sprintf(buf, "%ld\n", nr); - if (len > *lenp) - len = *lenp; - if (copy_to_user(buffer, buf, len)) - return -EFAULT; - } - *lenp = len; - *ppos += len; + cmm_add_timed_pages(nr); return 0; } -static int cmm_timeout_handler(ctl_table *ctl, int write, void __user *buffer, - size_t *lenp, loff_t *ppos) +static int cmm_timeout_handler(const struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) { char buf[64], *p; long nr, seconds; - int len; + unsigned int len; if (!*lenp || (*ppos && !write)) { *lenp = 0; @@ -306,30 +311,28 @@ static int cmm_timeout_handler(ctl_table *ctl, int write, void __user *buffer, } if (write) { - len = *lenp; - if (copy_from_user(buf, buffer, - len > sizeof(buf) ? 
sizeof(buf) : len)) - return -EFAULT; - buf[sizeof(buf) - 1] = '\0'; + len = min(*lenp, sizeof(buf)); + memcpy(buf, buffer, len); + buf[len - 1] = '\0'; cmm_skip_blanks(buf, &p); nr = simple_strtoul(p, &p, 0); cmm_skip_blanks(p, &p); seconds = simple_strtoul(p, &p, 0); cmm_set_timeout(nr, seconds); + *ppos += *lenp; } else { - len = sprintf(buf, "%ld %ld\n", - cmm_timeout_pages, cmm_timeout_seconds); + len = scnprintf(buf, sizeof(buf), "%ld %ld\n", + cmm_timeout_pages, cmm_timeout_seconds); if (len > *lenp) len = *lenp; - if (copy_to_user(buffer, buf, len)) - return -EFAULT; + memcpy(buffer, buf, len); + *lenp = len; + *ppos += len; } - *lenp = len; - *ppos += len; return 0; } -static struct ctl_table cmm_table[] = { +static const struct ctl_table cmm_table[] = { { .procname = "cmm_pages", .mode = 0644, @@ -338,24 +341,13 @@ static struct ctl_table cmm_table[] = { { .procname = "cmm_timed_pages", .mode = 0644, - .proc_handler = cmm_pages_handler, + .proc_handler = cmm_timed_pages_handler, }, { .procname = "cmm_timeout", .mode = 0644, .proc_handler = cmm_timeout_handler, }, - { } -}; - -static struct ctl_table cmm_dir_table[] = { - { - .procname = "vm", - .maxlen = 0, - .mode = 0555, - .child = cmm_table, - }, - { } }; #ifdef CONFIG_CMM_IUCV @@ -398,54 +390,19 @@ static void cmm_smsg_target(const char *from, char *msg) static struct ctl_table_header *cmm_sysctl_header; -static int cmm_suspend(void) -{ - cmm_suspended = 1; - cmm_free_pages(cmm_pages, &cmm_pages, &cmm_page_list); - cmm_free_pages(cmm_timed_pages, &cmm_timed_pages, &cmm_timed_page_list); - return 0; -} - -static int cmm_resume(void) -{ - cmm_suspended = 0; - cmm_kick_thread(); - return 0; -} - -static int cmm_power_event(struct notifier_block *this, - unsigned long event, void *ptr) -{ - switch (event) { - case PM_POST_HIBERNATION: - return cmm_resume(); - case PM_HIBERNATION_PREPARE: - return cmm_suspend(); - default: - return NOTIFY_DONE; - } -} - -static struct notifier_block cmm_power_notifier = { - .notifier_call = cmm_power_event, -}; - static int __init cmm_init(void) { int rc = -ENOMEM; - cmm_sysctl_header = register_sysctl_table(cmm_dir_table); + cmm_sysctl_header = register_sysctl("vm", cmm_table); if (!cmm_sysctl_header) goto out_sysctl; #ifdef CONFIG_CMM_IUCV /* convert sender to uppercase characters */ - if (sender) { - int len = strlen(sender); - while (len--) - sender[len] = toupper(sender[len]); - } else { + if (sender) + string_upper(sender, sender); + else sender = cmm_default_sender; - } rc = smsg_register_callback(SMSG_PREFIX, cmm_smsg_target); if (rc < 0) @@ -454,16 +411,11 @@ static int __init cmm_init(void) rc = register_oom_notifier(&cmm_oom_nb); if (rc < 0) goto out_oom_notify; - rc = register_pm_notifier(&cmm_power_notifier); - if (rc) - goto out_pm; cmm_thread_ptr = kthread_run(cmm_thread, NULL, "cmmthread"); if (!IS_ERR(cmm_thread_ptr)) return 0; rc = PTR_ERR(cmm_thread_ptr); - unregister_pm_notifier(&cmm_power_notifier); -out_pm: unregister_oom_notifier(&cmm_oom_nb); out_oom_notify: #ifdef CONFIG_CMM_IUCV @@ -472,7 +424,7 @@ out_smsg: #endif unregister_sysctl_table(cmm_sysctl_header); out_sysctl: - del_timer_sync(&cmm_timer); + timer_delete_sync(&cmm_timer); return rc; } module_init(cmm_init); @@ -483,13 +435,13 @@ static void __exit cmm_exit(void) #ifdef CONFIG_CMM_IUCV smsg_unregister_callback(SMSG_PREFIX, cmm_smsg_target); #endif - unregister_pm_notifier(&cmm_power_notifier); unregister_oom_notifier(&cmm_oom_nb); kthread_stop(cmm_thread_ptr); - del_timer_sync(&cmm_timer); + 
timer_delete_sync(&cmm_timer); cmm_free_pages(cmm_pages, &cmm_pages, &cmm_page_list); cmm_free_pages(cmm_timed_pages, &cmm_timed_pages, &cmm_timed_page_list); } module_exit(cmm_exit); +MODULE_DESCRIPTION("Cooperative memory management interface"); MODULE_LICENSE("GPL"); diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c index 3ad65b04ac15..89badbe72ae7 100644 --- a/arch/s390/mm/dump_pagetables.c +++ b/arch/s390/mm/dump_pagetables.c @@ -1,246 +1,364 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/cpufeature.h> +#include <linux/set_memory.h> +#include <linux/ptdump.h> #include <linux/seq_file.h> #include <linux/debugfs.h> -#include <linux/module.h> +#include <linux/sort.h> #include <linux/mm.h> +#include <linux/kfence.h> +#include <linux/kasan.h> +#include <asm/kasan.h> +#include <asm/abs_lowcore.h> +#include <asm/nospec-branch.h> #include <asm/sections.h> -#include <asm/pgtable.h> +#include <asm/maccess.h> static unsigned long max_addr; struct addr_marker { + int is_start; unsigned long start_address; + unsigned long size; const char *name; }; -enum address_markers_idx { - IDENTITY_NR = 0, - KERNEL_START_NR, - KERNEL_END_NR, - VMEMMAP_NR, - VMALLOC_NR, -#ifdef CONFIG_64BIT - MODULES_NR, -#endif -}; - -static struct addr_marker address_markers[] = { - [IDENTITY_NR] = {0, "Identity Mapping"}, - [KERNEL_START_NR] = {(unsigned long)&_stext, "Kernel Image Start"}, - [KERNEL_END_NR] = {(unsigned long)&_end, "Kernel Image End"}, - [VMEMMAP_NR] = {0, "vmemmap Area"}, - [VMALLOC_NR] = {0, "vmalloc Area"}, -#ifdef CONFIG_64BIT - [MODULES_NR] = {0, "Modules Area"}, -#endif - { -1, NULL } -}; +static struct addr_marker *markers; +static unsigned int markers_cnt; struct pg_state { + struct ptdump_state ptdump; + struct seq_file *seq; int level; unsigned int current_prot; + bool check_wx; + unsigned long wx_pages; unsigned long start_address; - unsigned long current_address; const struct addr_marker *marker; }; +#define pt_dump_seq_printf(m, fmt, args...) \ +({ \ + struct seq_file *__m = (m); \ + \ + if (__m) \ + seq_printf(__m, fmt, ##args); \ +}) + +#define pt_dump_seq_puts(m, fmt) \ +({ \ + struct seq_file *__m = (m); \ + \ + if (__m) \ + seq_puts(__m, fmt); \ +}) + static void print_prot(struct seq_file *m, unsigned int pr, int level) { static const char * const level_name[] = { "ASCE", "PGD", "PUD", "PMD", "PTE" }; - seq_printf(m, "%s ", level_name[level]); + pt_dump_seq_printf(m, "%s ", level_name[level]); if (pr & _PAGE_INVALID) { - seq_printf(m, "I\n"); + pt_dump_seq_printf(m, "I\n"); return; } - seq_printf(m, "%s", pr & _PAGE_RO ? "RO " : "RW "); - seq_printf(m, "%s", pr & _PAGE_CO ? "CO " : " "); - seq_putc(m, '\n'); + pt_dump_seq_puts(m, (pr & _PAGE_PROTECT) ? "RO " : "RW "); + pt_dump_seq_puts(m, (pr & _PAGE_NOEXEC) ? "NX\n" : "X\n"); } -static void note_page(struct seq_file *m, struct pg_state *st, - unsigned int new_prot, int level) +static void note_prot_wx(struct pg_state *st, unsigned long addr) +{ + if (!st->check_wx) + return; + if (st->current_prot & _PAGE_INVALID) + return; + if (st->current_prot & _PAGE_PROTECT) + return; + if (st->current_prot & _PAGE_NOEXEC) + return; + /* + * The first lowcore page is W+X if spectre mitigations are using + * trampolines or the BEAR enhancements facility is not installed, + * in which case we have two lpswe instructions in lowcore that need + * to be executable. 
+ */ + if (addr == PAGE_SIZE && (nospec_uses_trampoline() || !cpu_has_bear())) + return; + WARN_ONCE(IS_ENABLED(CONFIG_DEBUG_WX), + "s390/mm: Found insecure W+X mapping at address %pS\n", + (void *)st->start_address); + st->wx_pages += (addr - st->start_address) / PAGE_SIZE; +} + +static void note_page_update_state(struct pg_state *st, unsigned long addr, unsigned int prot, int level) +{ + struct seq_file *m = st->seq; + + while (addr >= st->marker[1].start_address) { + st->marker++; + pt_dump_seq_printf(m, "---[ %s %s ]---\n", st->marker->name, + st->marker->is_start ? "Start" : "End"); + } + st->start_address = addr; + st->current_prot = prot; + st->level = level; +} + +static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, u64 val) { - static const char units[] = "KMGTPE"; int width = sizeof(unsigned long) * 2; + static const char units[] = "KMGTPE"; const char *unit = units; - unsigned int prot, cur; unsigned long delta; + struct pg_state *st; + struct seq_file *m; + unsigned int prot; - /* - * If we have a "break" in the series, we need to flush the state - * that we have now. "break" is either changing perms, levels or - * address space marker. - */ - prot = new_prot; - cur = st->current_prot; - - if (!st->level) { - /* First entry */ - st->current_prot = new_prot; - st->level = level; - st->marker = address_markers; - seq_printf(m, "---[ %s ]---\n", st->marker->name); - } else if (prot != cur || level != st->level || - st->current_address >= st->marker[1].start_address) { - /* Print the actual finished series */ - seq_printf(m, "0x%0*lx-0x%0*lx", - width, st->start_address, - width, st->current_address); - delta = (st->current_address - st->start_address) >> 10; + st = container_of(pt_st, struct pg_state, ptdump); + m = st->seq; + prot = val & (_PAGE_PROTECT | _PAGE_NOEXEC); + if (level == 4 && (val & _PAGE_INVALID)) + prot = _PAGE_INVALID; + /* For pmd_none() & friends val gets passed as zero. */ + if (level != 4 && !val) + prot = _PAGE_INVALID; + /* Final flush from generic code. */ + if (level == -1) + addr = max_addr; + if (st->level == -1) { + pt_dump_seq_puts(m, "---[ Kernel Virtual Address Space ]---\n"); + note_page_update_state(st, addr, prot, level); + } else if (prot != st->current_prot || level != st->level || + addr >= st->marker[1].start_address) { + note_prot_wx(st, addr); + pt_dump_seq_printf(m, "0x%0*lx-0x%0*lx ", + width, st->start_address, + width, addr); + delta = (addr - st->start_address) >> 10; while (!(delta & 0x3ff) && unit[1]) { delta >>= 10; unit++; } - seq_printf(m, "%9lu%c ", delta, *unit); + pt_dump_seq_printf(m, "%9lu%c ", delta, *unit); print_prot(m, st->current_prot, st->level); - if (st->current_address >= st->marker[1].start_address) { - st->marker++; - seq_printf(m, "---[ %s ]---\n", st->marker->name); - } - st->start_address = st->current_address; - st->current_prot = new_prot; - st->level = level; + note_page_update_state(st, addr, prot, level); } } -/* - * The actual page table walker functions. In order to keep the implementation - * of print_prot() short, we only check and pass _PAGE_INVALID and _PAGE_RO - * flags to note_page() if a region, segment or page table entry is invalid or - * read-only. - * After all it's just a hint that the current level being walked contains an - * invalid or read-only entry. 
- */ -static void walk_pte_level(struct seq_file *m, struct pg_state *st, - pmd_t *pmd, unsigned long addr) +static void note_page_pte(struct ptdump_state *pt_st, unsigned long addr, pte_t pte) { - unsigned int prot; - pte_t *pte; - int i; - - for (i = 0; i < PTRS_PER_PTE && addr < max_addr; i++) { - st->current_address = addr; - pte = pte_offset_kernel(pmd, addr); - prot = pte_val(*pte) & (_PAGE_RO | _PAGE_INVALID); - note_page(m, st, prot, 4); - addr += PAGE_SIZE; - } + note_page(pt_st, addr, 4, pte_val(pte)); } -#ifdef CONFIG_64BIT -#define _PMD_PROT_MASK (_SEGMENT_ENTRY_RO | _SEGMENT_ENTRY_CO) -#else -#define _PMD_PROT_MASK 0 -#endif +static void note_page_pmd(struct ptdump_state *pt_st, unsigned long addr, pmd_t pmd) +{ + note_page(pt_st, addr, 3, pmd_val(pmd)); +} -static void walk_pmd_level(struct seq_file *m, struct pg_state *st, - pud_t *pud, unsigned long addr) +static void note_page_pud(struct ptdump_state *pt_st, unsigned long addr, pud_t pud) { - unsigned int prot; - pmd_t *pmd; - int i; - - for (i = 0; i < PTRS_PER_PMD && addr < max_addr; i++) { - st->current_address = addr; - pmd = pmd_offset(pud, addr); - if (!pmd_none(*pmd)) { - if (pmd_large(*pmd)) { - prot = pmd_val(*pmd) & _PMD_PROT_MASK; - note_page(m, st, prot, 3); - } else - walk_pte_level(m, st, pmd, addr); - } else - note_page(m, st, _PAGE_INVALID, 3); - addr += PMD_SIZE; - } + note_page(pt_st, addr, 2, pud_val(pud)); } -#ifdef CONFIG_64BIT -#define _PUD_PROT_MASK (_REGION3_ENTRY_RO | _REGION3_ENTRY_CO) -#else -#define _PUD_PROT_MASK 0 -#endif +static void note_page_p4d(struct ptdump_state *pt_st, unsigned long addr, p4d_t p4d) +{ + note_page(pt_st, addr, 1, p4d_val(p4d)); +} -static void walk_pud_level(struct seq_file *m, struct pg_state *st, - pgd_t *pgd, unsigned long addr) +static void note_page_pgd(struct ptdump_state *pt_st, unsigned long addr, pgd_t pgd) { - unsigned int prot; - pud_t *pud; - int i; - - for (i = 0; i < PTRS_PER_PUD && addr < max_addr; i++) { - st->current_address = addr; - pud = pud_offset(pgd, addr); - if (!pud_none(*pud)) - if (pud_large(*pud)) { - prot = pud_val(*pud) & _PUD_PROT_MASK; - note_page(m, st, prot, 2); - } else - walk_pmd_level(m, st, pud, addr); - else - note_page(m, st, _PAGE_INVALID, 2); - addr += PUD_SIZE; - } + note_page(pt_st, addr, 0, pgd_val(pgd)); +} + +static void note_page_flush(struct ptdump_state *pt_st) +{ + pte_t pte_zero = {0}; + + note_page(pt_st, 0, -1, pte_val(pte_zero)); } -static void walk_pgd_level(struct seq_file *m) +bool ptdump_check_wx(void) { - unsigned long addr = 0; - struct pg_state st; - pgd_t *pgd; - int i; - - memset(&st, 0, sizeof(st)); - for (i = 0; i < PTRS_PER_PGD && addr < max_addr; i++) { - st.current_address = addr; - pgd = pgd_offset_k(addr); - if (!pgd_none(*pgd)) - walk_pud_level(m, &st, pgd, addr); - else - note_page(m, &st, _PAGE_INVALID, 1); - addr += PGDIR_SIZE; + struct pg_state st = { + .ptdump = { + .note_page_pte = note_page_pte, + .note_page_pmd = note_page_pmd, + .note_page_pud = note_page_pud, + .note_page_p4d = note_page_p4d, + .note_page_pgd = note_page_pgd, + .note_page_flush = note_page_flush, + .range = (struct ptdump_range[]) { + {.start = 0, .end = max_addr}, + {.start = 0, .end = 0}, + } + }, + .seq = NULL, + .level = -1, + .current_prot = 0, + .check_wx = true, + .wx_pages = 0, + .start_address = 0, + .marker = (struct addr_marker[]) { + { .start_address = 0, .name = NULL}, + { .start_address = -1, .name = NULL}, + }, + }; + + if (!cpu_has_nx()) + return true; + ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); + if 
(st.wx_pages) { + pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found\n", st.wx_pages); + + return false; + } else { + pr_info("Checked W+X mappings: passed, no %sW+X pages found\n", + (nospec_uses_trampoline() || !cpu_has_bear()) ? + "unexpected " : ""); + + return true; } - /* Flush out the last page */ - st.current_address = max_addr; - note_page(m, &st, 0, 0); } +#ifdef CONFIG_PTDUMP_DEBUGFS static int ptdump_show(struct seq_file *m, void *v) { - walk_pgd_level(m); + struct pg_state st = { + .ptdump = { + .note_page_pte = note_page_pte, + .note_page_pmd = note_page_pmd, + .note_page_pud = note_page_pud, + .note_page_p4d = note_page_p4d, + .note_page_pgd = note_page_pgd, + .note_page_flush = note_page_flush, + .range = (struct ptdump_range[]) { + {.start = 0, .end = max_addr}, + {.start = 0, .end = 0}, + } + }, + .seq = m, + .level = -1, + .current_prot = 0, + .check_wx = false, + .wx_pages = 0, + .start_address = 0, + .marker = markers, + }; + + mutex_lock(&cpa_mutex); + ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); + mutex_unlock(&cpa_mutex); return 0; } +DEFINE_SHOW_ATTRIBUTE(ptdump); +#endif /* CONFIG_PTDUMP_DEBUGFS */ -static int ptdump_open(struct inode *inode, struct file *filp) +static int ptdump_cmp(const void *a, const void *b) { - return single_open(filp, ptdump_show, NULL); + const struct addr_marker *ama = a; + const struct addr_marker *amb = b; + + if (ama->start_address > amb->start_address) + return 1; + if (ama->start_address < amb->start_address) + return -1; + /* + * If the start addresses of two markers are identical sort markers in an + * order that considers areas contained within other areas correctly. + */ + if (ama->is_start && amb->is_start) { + if (ama->size > amb->size) + return -1; + if (ama->size < amb->size) + return 1; + return 0; + } + if (!ama->is_start && !amb->is_start) { + if (ama->size > amb->size) + return 1; + if (ama->size < amb->size) + return -1; + return 0; + } + if (ama->is_start) + return 1; + if (amb->is_start) + return -1; + return 0; } -static const struct file_operations ptdump_fops = { - .open = ptdump_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +static int add_marker(unsigned long start, unsigned long end, const char *name) +{ + struct addr_marker *new; + size_t newsize; + + newsize = (markers_cnt + 2) * sizeof(*markers); + new = kvrealloc(markers, newsize, GFP_KERNEL); + if (!new) + return -ENOMEM; + markers = new; + markers[markers_cnt].is_start = 1; + markers[markers_cnt].start_address = start; + markers[markers_cnt].size = end - start; + markers[markers_cnt].name = name; + markers_cnt++; + markers[markers_cnt].is_start = 0; + markers[markers_cnt].start_address = end; + markers[markers_cnt].size = end - start; + markers[markers_cnt].name = name; + markers_cnt++; + return 0; +} static int pt_dump_init(void) { +#ifdef CONFIG_KFENCE + unsigned long kfence_start = (unsigned long)__kfence_pool; +#endif + unsigned long lowcore = (unsigned long)get_lowcore(); + int rc; + /* * Figure out the maximum virtual address being accessible with the * kernel ASCE. We need this to keep the page table walker functions * from accessing non-existent entries. 
*/ -#ifdef CONFIG_32BIT - max_addr = 1UL << 31; -#else - max_addr = (S390_lowcore.kernel_asce & _REGION_ENTRY_TYPE_MASK) >> 2; + max_addr = (get_lowcore()->kernel_asce.val & _REGION_ENTRY_TYPE_MASK) >> 2; max_addr = 1UL << (max_addr * 11 + 31); - address_markers[MODULES_NR].start_address = MODULES_VADDR; + /* start + end markers - must be added first */ + rc = add_marker(0, -1UL, NULL); + rc |= add_marker((unsigned long)_stext, (unsigned long)_end, "Kernel Image"); + rc |= add_marker(lowcore, lowcore + sizeof(struct lowcore), "Lowcore"); + rc |= add_marker(__identity_base, __identity_base + ident_map_size, "Identity Mapping"); + rc |= add_marker((unsigned long)__samode31, (unsigned long)__eamode31, "Amode31 Area"); + rc |= add_marker(MODULES_VADDR, MODULES_END, "Modules Area"); + rc |= add_marker(__abs_lowcore, __abs_lowcore + ABS_LOWCORE_MAP_SIZE, "Lowcore Area"); + rc |= add_marker(__memcpy_real_area, __memcpy_real_area + MEMCPY_REAL_SIZE, "Real Memory Copy Area"); + rc |= add_marker((unsigned long)vmemmap, (unsigned long)vmemmap + vmemmap_size, "vmemmap Area"); + rc |= add_marker(VMALLOC_START, VMALLOC_END, "vmalloc Area"); +#ifdef CONFIG_KFENCE + rc |= add_marker(kfence_start, kfence_start + KFENCE_POOL_SIZE, "KFence Pool"); +#endif +#ifdef CONFIG_KMSAN + rc |= add_marker(KMSAN_VMALLOC_SHADOW_START, KMSAN_VMALLOC_SHADOW_END, "Kmsan vmalloc Shadow"); + rc |= add_marker(KMSAN_VMALLOC_ORIGIN_START, KMSAN_VMALLOC_ORIGIN_END, "Kmsan vmalloc Origins"); + rc |= add_marker(KMSAN_MODULES_SHADOW_START, KMSAN_MODULES_SHADOW_END, "Kmsan Modules Shadow"); + rc |= add_marker(KMSAN_MODULES_ORIGIN_START, KMSAN_MODULES_ORIGIN_END, "Kmsan Modules Origins"); +#endif +#ifdef CONFIG_KASAN + rc |= add_marker(KASAN_SHADOW_START, KASAN_SHADOW_END, "Kasan Shadow"); #endif - address_markers[VMEMMAP_NR].start_address = (unsigned long) vmemmap; - address_markers[VMALLOC_NR].start_address = VMALLOC_START; + if (rc) + goto error; + sort(&markers[1], markers_cnt - 1, sizeof(*markers), ptdump_cmp, NULL); +#ifdef CONFIG_PTDUMP_DEBUGFS debugfs_create_file("kernel_page_tables", 0400, NULL, NULL, &ptdump_fops); +#endif /* CONFIG_PTDUMP_DEBUGFS */ return 0; +error: + kvfree(markers); + return -ENOMEM; } device_initcall(pt_dump_init); diff --git a/arch/s390/mm/extable.c b/arch/s390/mm/extable.c index 4d1ee88864e8..7498e858c401 100644 --- a/arch/s390/mm/extable.c +++ b/arch/s390/mm/extable.c @@ -1,81 +1,147 @@ -#include <linux/module.h> -#include <linux/sort.h> -#include <asm/uaccess.h> - -/* - * Search one exception table for an entry corresponding to the - * given instruction address, and return the address of the entry, - * or NULL if none is found. - * We use a binary search, and thus we assume that the table is - * already sorted. 
- */ -const struct exception_table_entry * -search_extable(const struct exception_table_entry *first, - const struct exception_table_entry *last, - unsigned long value) +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/bitfield.h> +#include <linux/extable.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/panic.h> +#include <asm/asm-extable.h> +#include <asm/extable.h> +#include <asm/fpu.h> + +const struct exception_table_entry *s390_search_extables(unsigned long addr) { - const struct exception_table_entry *mid; - unsigned long addr; - - while (first <= last) { - mid = ((last - first) >> 1) + first; - addr = extable_insn(mid); - if (addr < value) - first = mid + 1; - else if (addr > value) - last = mid - 1; - else - return mid; - } - return NULL; + const struct exception_table_entry *fixup; + size_t num; + + fixup = search_exception_tables(addr); + if (fixup) + return fixup; + num = __stop_amode31_ex_table - __start_amode31_ex_table; + return search_extable(__start_amode31_ex_table, num, addr); } -/* - * The exception table needs to be sorted so that the binary - * search that we use to find entries in it works properly. - * This is used both for the kernel exception table and for - * the exception tables of modules that get loaded. - * - */ -static int cmp_ex(const void *a, const void *b) +static bool ex_handler_fixup(const struct exception_table_entry *ex, struct pt_regs *regs) { - const struct exception_table_entry *x = a, *y = b; + regs->psw.addr = extable_fixup(ex); + return true; +} - /* This compare is only valid after normalization. */ - return x->insn - y->insn; +static bool ex_handler_ua_fault(const struct exception_table_entry *ex, struct pt_regs *regs) +{ + unsigned int reg_err = FIELD_GET(EX_DATA_REG_ERR, ex->data); + + regs->gprs[reg_err] = -EFAULT; + regs->psw.addr = extable_fixup(ex); + return true; } -void sort_extable(struct exception_table_entry *start, - struct exception_table_entry *finish) +static bool ex_handler_ua_load_reg(const struct exception_table_entry *ex, + bool pair, struct pt_regs *regs) { - struct exception_table_entry *p; - int i; - - /* Normalize entries to being relative to the start of the section */ - for (p = start, i = 0; p < finish; p++, i += 8) - p->insn += i; - sort(start, finish - start, sizeof(*start), cmp_ex, NULL); - /* Denormalize all entries */ - for (p = start, i = 0; p < finish; p++, i += 8) - p->insn -= i; + unsigned int reg_zero = FIELD_GET(EX_DATA_REG_ADDR, ex->data); + unsigned int reg_err = FIELD_GET(EX_DATA_REG_ERR, ex->data); + + regs->gprs[reg_err] = -EFAULT; + regs->gprs[reg_zero] = 0; + if (pair) + regs->gprs[reg_zero + 1] = 0; + regs->psw.addr = extable_fixup(ex); + return true; } -#ifdef CONFIG_MODULES -/* - * If the exception table is sorted, any referring to the module init - * will be at the beginning or the end. 
- */ -void trim_init_extable(struct module *m) +static bool ex_handler_zeropad(const struct exception_table_entry *ex, struct pt_regs *regs) { - /* Trim the beginning */ - while (m->num_exentries && - within_module_init(extable_insn(&m->extable[0]), m)) { - m->extable++; - m->num_exentries--; + unsigned int reg_addr = FIELD_GET(EX_DATA_REG_ADDR, ex->data); + unsigned int reg_data = FIELD_GET(EX_DATA_REG_ERR, ex->data); + unsigned long data, addr, offset; + + addr = regs->gprs[reg_addr]; + offset = addr & (sizeof(unsigned long) - 1); + addr &= ~(sizeof(unsigned long) - 1); + data = *(unsigned long *)addr; + data <<= BITS_PER_BYTE * offset; + regs->gprs[reg_data] = data; + regs->psw.addr = extable_fixup(ex); + return true; +} + +static bool ex_handler_fpc(const struct exception_table_entry *ex, struct pt_regs *regs) +{ + fpu_sfpc(0); + regs->psw.addr = extable_fixup(ex); + return true; +} + +struct insn_ssf { + u64 opc1 : 8; + u64 r3 : 4; + u64 opc2 : 4; + u64 b1 : 4; + u64 d1 : 12; + u64 b2 : 4; + u64 d2 : 12; +} __packed; + +static bool ex_handler_ua_mvcos(const struct exception_table_entry *ex, + bool from, struct pt_regs *regs) +{ + unsigned long uaddr, remainder; + struct insn_ssf *insn; + + /* + * If the faulting user space access crossed a page boundary retry by + * limiting the access to the first page (adjust length accordingly). + * Then the mvcos instruction will either complete with condition code + * zero, or generate another fault where the user space access did not + * cross a page boundary. + * If the faulting user space access did not cross a page boundary set + * length to zero and retry. In this case no user space access will + * happen, and the mvcos instruction will complete with condition code + * zero. + * In both cases the instruction will complete with condition code + * zero (copying finished), and the register which contains the + * length, indicates the number of bytes copied. 
+ */ + regs->psw.addr = extable_fixup(ex); + insn = (struct insn_ssf *)regs->psw.addr; + if (from) + uaddr = regs->gprs[insn->b2] + insn->d2; + else + uaddr = regs->gprs[insn->b1] + insn->d1; + remainder = PAGE_SIZE - (uaddr & (PAGE_SIZE - 1)); + if (regs->gprs[insn->r3] <= remainder) + remainder = 0; + regs->gprs[insn->r3] = remainder; + return true; +} + +bool fixup_exception(struct pt_regs *regs) +{ + const struct exception_table_entry *ex; + + ex = s390_search_extables(instruction_pointer(regs)); + if (!ex) + return false; + switch (ex->type) { + case EX_TYPE_FIXUP: + return ex_handler_fixup(ex, regs); + case EX_TYPE_BPF: + return ex_handler_bpf(ex, regs); + case EX_TYPE_UA_FAULT: + return ex_handler_ua_fault(ex, regs); + case EX_TYPE_UA_LOAD_REG: + return ex_handler_ua_load_reg(ex, false, regs); + case EX_TYPE_UA_LOAD_REGPAIR: + return ex_handler_ua_load_reg(ex, true, regs); + case EX_TYPE_ZEROPAD: + return ex_handler_zeropad(ex, regs); + case EX_TYPE_FPC: + return ex_handler_fpc(ex, regs); + case EX_TYPE_UA_MVCOS_TO: + return ex_handler_ua_mvcos(ex, false, regs); + case EX_TYPE_UA_MVCOS_FROM: + return ex_handler_ua_mvcos(ex, true, regs); } - /* Trim the end */ - while (m->num_exentries && - within_module_init(extable_insn(&m->extable[m->num_exentries-1]), m)) - m->num_exentries--; + panic("invalid exception table entry"); } -#endif /* CONFIG_MODULES */ diff --git a/arch/s390/mm/extmem.c b/arch/s390/mm/extmem.c index 519bba716cc3..6cc33c705de2 100644 --- a/arch/s390/mm/extmem.c +++ b/arch/s390/mm/extmem.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Author(s)......: Carsten Otte <cotte@de.ibm.com> * Rob M van der Heij <rvdheij@nl.ibm.com> @@ -6,32 +7,30 @@ * Copyright IBM Corp. 2002, 2004 */ -#define KMSG_COMPONENT "extmem" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "extmem: " fmt #include <linux/kernel.h> #include <linux/string.h> #include <linux/spinlock.h> #include <linux/list.h> #include <linux/slab.h> -#include <linux/module.h> -#include <linux/bootmem.h> +#include <linux/export.h> +#include <linux/memblock.h> #include <linux/ctype.h> #include <linux/ioport.h> +#include <linux/refcount.h> +#include <linux/pgtable.h> +#include <asm/machine.h> +#include <asm/diag.h> #include <asm/page.h> -#include <asm/pgtable.h> #include <asm/ebcdic.h> #include <asm/errno.h> #include <asm/extmem.h> #include <asm/cpcmd.h> #include <asm/setup.h> +#include <asm/asm.h> -#define DCSS_LOADSHR 0x00 -#define DCSS_LOADNSR 0x04 #define DCSS_PURGESEG 0x08 -#define DCSS_FINDSEG 0x0c -#define DCSS_LOADNOLY 0x10 -#define DCSS_SEGEXT 0x18 #define DCSS_LOADSHRX 0x20 #define DCSS_LOADNSRX 0x24 #define DCSS_FINDSEGX 0x2c @@ -51,22 +50,6 @@ struct qout64 { struct qrange range[6]; }; -#ifdef CONFIG_64BIT -struct qrange_old { - unsigned int start; /* last byte type */ - unsigned int end; /* last byte reserved */ -}; - -/* output area format for the Diag x'64' old subcode x'18' */ -struct qout64_old { - int segstart; - int segend; - int segcnt; - int segrcnt; - struct qrange_old range[6]; -}; -#endif - struct qin64 { char qopcode; char rsrv1[3]; @@ -80,10 +63,10 @@ struct qin64 { struct dcss_segment { struct list_head list; char dcss_name[8]; - char res_name[15]; + char res_name[16]; unsigned long start_addr; unsigned long end; - atomic_t ref_count; + refcount_t ref_count; int do_nonshared; unsigned int vm_segtype; struct qrange range[6]; @@ -95,55 +78,10 @@ static DEFINE_MUTEX(dcss_lock); static LIST_HEAD(dcss_list); static char *segtype_string[] = { "SW", "EW", "SR", "ER", "SN", 
"EN", "SC", "EW/EN-MIXED" }; -static int loadshr_scode, loadnsr_scode, findseg_scode; -static int segext_scode, purgeseg_scode; -static int scode_set; - -/* set correct Diag x'64' subcodes. */ -static int -dcss_set_subcodes(void) -{ -#ifdef CONFIG_64BIT - char *name = kmalloc(8 * sizeof(char), GFP_KERNEL | GFP_DMA); - unsigned long rx, ry; - int rc; - - if (name == NULL) - return -ENOMEM; - - rx = (unsigned long) name; - ry = DCSS_FINDSEGX; - - strcpy(name, "dummy"); - asm volatile( - " diag %0,%1,0x64\n" - "0: ipm %2\n" - " srl %2,28\n" - " j 2f\n" - "1: la %2,3\n" - "2:\n" - EX_TABLE(0b, 1b) - : "+d" (rx), "+d" (ry), "=d" (rc) : : "cc"); - - kfree(name); - /* Diag x'64' new subcodes are supported, set to new subcodes */ - if (rc != 3) { - loadshr_scode = DCSS_LOADSHRX; - loadnsr_scode = DCSS_LOADNSRX; - purgeseg_scode = DCSS_PURGESEG; - findseg_scode = DCSS_FINDSEGX; - segext_scode = DCSS_SEGEXTX; - return 0; - } -#endif - /* Diag x'64' new subcodes are not supported, set to old subcodes */ - loadshr_scode = DCSS_LOADNOLY; - loadnsr_scode = DCSS_LOADNSR; - purgeseg_scode = DCSS_PURGESEG; - findseg_scode = DCSS_FINDSEG; - segext_scode = DCSS_SEGEXT; - return 0; -} +static int loadshr_scode = DCSS_LOADSHRX; +static int loadnsr_scode = DCSS_LOADNSRX; +static int purgeseg_scode = DCSS_PURGESEG; +static int segext_scode = DCSS_SEGEXTX; /* * Create the 8 bytes, ebcdic VM segment name from @@ -158,7 +96,7 @@ dcss_mkname(char *name, char *dcss_name) if (name[i] == '\0') break; dcss_name[i] = toupper(name[i]); - }; + } for (; i < 8; i++) dcss_name[i] = ' '; ASCEBC(dcss_name, 8); @@ -197,44 +135,21 @@ dcss_diag(int *func, void *parameter, unsigned long *ret1, unsigned long *ret2) { unsigned long rx, ry; - int rc; + int cc; - if (scode_set == 0) { - rc = dcss_set_subcodes(); - if (rc < 0) - return rc; - scode_set = 1; - } - rx = (unsigned long) parameter; + rx = virt_to_phys(parameter); ry = (unsigned long) *func; -#ifdef CONFIG_64BIT - /* 64-bit Diag x'64' new subcode, keep in 64-bit addressing mode */ - if (*func > DCSS_SEGEXT) - asm volatile( - " diag %0,%1,0x64\n" - " ipm %2\n" - " srl %2,28\n" - : "+d" (rx), "+d" (ry), "=d" (rc) : : "cc"); - /* 31-bit Diag x'64' old subcode, switch to 31-bit addressing mode */ - else - asm volatile( - " sam31\n" - " diag %0,%1,0x64\n" - " sam64\n" - " ipm %2\n" - " srl %2,28\n" - : "+d" (rx), "+d" (ry), "=d" (rc) : : "cc"); -#else + diag_stat_inc(DIAG_STAT_X064); asm volatile( - " diag %0,%1,0x64\n" - " ipm %2\n" - " srl %2,28\n" - : "+d" (rx), "+d" (ry), "=d" (rc) : : "cc"); -#endif + " diag %[rx],%[ry],0x64\n" + CC_IPM(cc) + : CC_OUT(cc, cc), [rx] "+d" (rx), [ry] "+d" (ry) + : + : CC_CLOBBER); *ret1 = rx; *ret2 = ry; - return rc; + return CC_TRANSFORM(cc); } static inline int @@ -265,7 +180,7 @@ query_segment_type (struct dcss_segment *seg) /* initialize diag input parameters */ qin->qopcode = DCSS_FINDSEGA; - qin->qoutptr = (unsigned long) qout; + qin->qoutptr = virt_to_phys(qout); qin->qoutlen = sizeof(struct qout64); memcpy (qin->qname, seg->dcss_name, 8); @@ -276,38 +191,11 @@ query_segment_type (struct dcss_segment *seg) goto out_free; } if (diag_cc > 1) { - pr_warning("Querying a DCSS type failed with rc=%ld\n", vmrc); + pr_warn("Querying a DCSS type failed with rc=%ld\n", vmrc); rc = dcss_diag_translate_rc (vmrc); goto out_free; } -#ifdef CONFIG_64BIT - /* Only old format of output area of Diagnose x'64' is supported, - copy data for the new format. 
*/ - if (segext_scode == DCSS_SEGEXT) { - struct qout64_old *qout_old; - qout_old = kzalloc(sizeof(*qout_old), GFP_KERNEL | GFP_DMA); - if (qout_old == NULL) { - rc = -ENOMEM; - goto out_free; - } - memcpy(qout_old, qout, sizeof(struct qout64_old)); - qout->segstart = (unsigned long) qout_old->segstart; - qout->segend = (unsigned long) qout_old->segend; - qout->segcnt = qout_old->segcnt; - qout->segrcnt = qout_old->segrcnt; - - if (qout->segcnt > 6) - qout->segrcnt = 6; - for (i = 0; i < qout->segrcnt; i++) { - qout->range[i].start = - (unsigned long) qout_old->range[i].start; - qout->range[i].end = - (unsigned long) qout_old->range[i].end; - } - kfree(qout_old); - } -#endif if (qout->segcnt > 6) { rc = -EOPNOTSUPP; goto out_free; @@ -367,7 +255,7 @@ segment_type (char* name) int rc; struct dcss_segment seg; - if (!MACHINE_IS_VM) + if (!machine_is_vm()) return -ENOSYS; dcss_mkname(name, seg.dcss_name); @@ -403,15 +291,17 @@ segment_overlaps_others (struct dcss_segment *seg) /* * real segment loading function, called from segment_load + * Must return either an error code < 0, or the segment type code >= 0 */ static int __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long *end) { unsigned long start_addr, end_addr, dummy; struct dcss_segment *seg; - int rc, diag_cc; + int rc, diag_cc, segtype; start_addr = end_addr = 0; + segtype = -1; seg = kmalloc(sizeof(*seg), GFP_KERNEL | GFP_DMA); if (seg == NULL) { rc = -ENOMEM; @@ -422,22 +312,15 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long if (rc < 0) goto out_free; - if (loadshr_scode == DCSS_LOADSHRX) { - if (segment_overlaps_others(seg)) { - rc = -EBUSY; - goto out_free; - } - } - - rc = vmem_add_mapping(seg->start_addr, seg->end - seg->start_addr + 1); - - if (rc) + if (segment_overlaps_others(seg)) { + rc = -EBUSY; goto out_free; + } seg->res = kzalloc(sizeof(struct resource), GFP_KERNEL); if (seg->res == NULL) { rc = -ENOMEM; - goto out_shared; + goto out_free; } seg->res->flags = IORESOURCE_BUSY | IORESOURCE_MEM; seg->res->start = seg->start_addr; @@ -445,18 +328,23 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long memcpy(&seg->res_name, seg->dcss_name, 8); EBCASC(seg->res_name, 8); seg->res_name[8] = '\0'; - strncat(seg->res_name, " (DCSS)", 7); + strlcat(seg->res_name, " (DCSS)", sizeof(seg->res_name)); seg->res->name = seg->res_name; - rc = seg->vm_segtype; - if (rc == SEG_TYPE_SC || - ((rc == SEG_TYPE_SR || rc == SEG_TYPE_ER) && !do_nonshared)) + segtype = seg->vm_segtype; + if (segtype == SEG_TYPE_SC || + ((segtype == SEG_TYPE_SR || segtype == SEG_TYPE_ER) && !do_nonshared)) seg->res->flags |= IORESOURCE_READONLY; + + /* Check for overlapping resources before adding the mapping. 
*/ if (request_resource(&iomem_resource, seg->res)) { rc = -EBUSY; - kfree(seg->res); - goto out_shared; + goto out_free_resource; } + rc = vmem_add_mapping(seg->start_addr, seg->end - seg->start_addr + 1); + if (rc) + goto out_resource; + if (do_nonshared) diag_cc = dcss_diag(&loadnsr_scode, seg->dcss_name, &start_addr, &end_addr); @@ -467,42 +355,42 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long dcss_diag(&purgeseg_scode, seg->dcss_name, &dummy, &dummy); rc = diag_cc; - goto out_resource; + goto out_mapping; } if (diag_cc > 1) { - pr_warning("Loading DCSS %s failed with rc=%ld\n", name, - end_addr); + pr_warn("Loading DCSS %s failed with rc=%ld\n", name, end_addr); rc = dcss_diag_translate_rc(end_addr); dcss_diag(&purgeseg_scode, seg->dcss_name, &dummy, &dummy); - goto out_resource; + goto out_mapping; } seg->start_addr = start_addr; seg->end = end_addr; seg->do_nonshared = do_nonshared; - atomic_set(&seg->ref_count, 1); + refcount_set(&seg->ref_count, 1); list_add(&seg->list, &dcss_list); *addr = seg->start_addr; *end = seg->end; if (do_nonshared) - pr_info("DCSS %s of range %p to %p and type %s loaded as " + pr_info("DCSS %s of range %px to %px and type %s loaded as " "exclusive-writable\n", name, (void*) seg->start_addr, (void*) seg->end, segtype_string[seg->vm_segtype]); else { - pr_info("DCSS %s of range %p to %p and type %s loaded in " + pr_info("DCSS %s of range %px to %px and type %s loaded in " "shared access mode\n", name, (void*) seg->start_addr, (void*) seg->end, segtype_string[seg->vm_segtype]); } goto out; + out_mapping: + vmem_remove_mapping(seg->start_addr, seg->end - seg->start_addr + 1); out_resource: release_resource(seg->res); + out_free_resource: kfree(seg->res); - out_shared: - vmem_remove_mapping(seg->start_addr, seg->end - seg->start_addr + 1); out_free: kfree(seg); out: - return rc; + return rc < 0 ? 
rc : segtype; } /* @@ -517,8 +405,7 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long * -EIO : could not perform query or load diagnose * -ENOENT : no such segment * -EOPNOTSUPP: multi-part segment cannot be used with linux - * -ENOSPC : segment cannot be used (overlaps with storage) - * -EBUSY : segment can temporarily not be used (overlaps with dcss) + * -EBUSY : segment cannot be used (overlaps with dcss or storage) * -ERANGE : segment cannot be used (exceeds kernel mapping range) * -EPERM : segment is currently loaded with incompatible permissions * -ENOMEM : out of memory @@ -531,7 +418,7 @@ segment_load (char *name, int do_nonshared, unsigned long *addr, struct dcss_segment *seg; int rc; - if (!MACHINE_IS_VM) + if (!machine_is_vm()) return -ENOSYS; mutex_lock(&dcss_lock); @@ -540,7 +427,7 @@ segment_load (char *name, int do_nonshared, unsigned long *addr, rc = __segment_load (name, do_nonshared, addr, end); else { if (do_nonshared == seg->do_nonshared) { - atomic_inc(&seg->ref_count); + refcount_inc(&seg->ref_count); *addr = seg->start_addr; *end = seg->end; rc = seg->vm_segtype; @@ -586,9 +473,8 @@ segment_modify_shared (char *name, int do_nonshared) rc = 0; goto out_unlock; } - if (atomic_read (&seg->ref_count) != 1) { - pr_warning("DCSS %s is in use and cannot be reloaded\n", - name); + if (refcount_read(&seg->ref_count) != 1) { + pr_warn("DCSS %s is in use and cannot be reloaded\n", name); rc = -EAGAIN; goto out_unlock; } @@ -601,8 +487,8 @@ segment_modify_shared (char *name, int do_nonshared) seg->res->flags |= IORESOURCE_READONLY; if (request_resource(&iomem_resource, seg->res)) { - pr_warning("DCSS %s overlaps with used memory resources " - "and cannot be reloaded\n", name); + pr_warn("DCSS %s overlaps with used memory resources and cannot be reloaded\n", + name); rc = -EBUSY; kfree(seg->res); goto out_del_mem; @@ -620,8 +506,8 @@ segment_modify_shared (char *name, int do_nonshared) goto out_del_res; } if (diag_cc > 1) { - pr_warning("Reloading DCSS %s failed with rc=%ld\n", name, - end_addr); + pr_warn("Reloading DCSS %s failed with rc=%ld\n", + name, end_addr); rc = dcss_diag_translate_rc(end_addr); goto out_del_res; } @@ -643,6 +529,14 @@ segment_modify_shared (char *name, int do_nonshared) return rc; } +static void __dcss_diag_purge_on_cpu_0(void *data) +{ + struct dcss_segment *seg = (struct dcss_segment *)data; + unsigned long dummy; + + dcss_diag(&purgeseg_scode, seg->dcss_name, &dummy, &dummy); +} + /* * Decrease the use count of a DCSS segment and remove * it from the address space if nobody is using it @@ -651,10 +545,9 @@ segment_modify_shared (char *name, int do_nonshared) void segment_unload(char *name) { - unsigned long dummy; struct dcss_segment *seg; - if (!MACHINE_IS_VM) + if (!machine_is_vm()) return; mutex_lock(&dcss_lock); @@ -663,13 +556,20 @@ segment_unload(char *name) pr_err("Unloading unknown DCSS %s failed\n", name); goto out_unlock; } - if (atomic_dec_return(&seg->ref_count) != 0) + if (!refcount_dec_and_test(&seg->ref_count)) goto out_unlock; release_resource(seg->res); kfree(seg->res); vmem_remove_mapping(seg->start_addr, seg->end - seg->start_addr + 1); list_del(&seg->list); - dcss_diag(&purgeseg_scode, seg->dcss_name, &dummy, &dummy); + /* + * Workaround for z/VM issue, where calling the DCSS unload diag on + * a non-IPL CPU would cause bogus sclp maximum memory detection on + * next IPL. + * IPL CPU 0 cannot be set offline, so the dcss_diag() call can + * directly be scheduled to that CPU. 
+ */ + smp_call_function_single(0, __dcss_diag_purge_on_cpu_0, seg, 1); kfree(seg); out_unlock: mutex_unlock(&dcss_lock); @@ -686,7 +586,7 @@ segment_save(char *name) char cmd2[80]; int i, response; - if (!MACHINE_IS_VM) + if (!machine_is_vm()) return; mutex_lock(&dcss_lock); @@ -697,14 +597,16 @@ segment_save(char *name) goto out; } - sprintf(cmd1, "DEFSEG %s", name); + snprintf(cmd1, sizeof(cmd1), "DEFSEG %s", name); for (i=0; i<seg->segcnt; i++) { - sprintf(cmd1+strlen(cmd1), " %lX-%lX %s", - seg->range[i].start >> PAGE_SHIFT, - seg->range[i].end >> PAGE_SHIFT, - segtype_string[seg->range[i].start & 0xff]); + size_t len = strlen(cmd1); + + snprintf(cmd1 + len, sizeof(cmd1) - len, " %lX-%lX %s", + seg->range[i].start >> PAGE_SHIFT, + seg->range[i].end >> PAGE_SHIFT, + segtype_string[seg->range[i].start & 0xff]); } - sprintf(cmd2, "SAVESEG %s", name); + snprintf(cmd2, sizeof(cmd2), "SAVESEG %s", name); response = 0; cpcmd(cmd1, NULL, 0, &response); if (response) { @@ -744,10 +646,6 @@ void segment_warning(int rc, char *seg_name) pr_err("DCSS %s has multiple page ranges and cannot be " "loaded or queried\n", seg_name); break; - case -ENOSPC: - pr_err("DCSS %s overlaps with used storage and cannot " - "be loaded\n", seg_name); - break; case -EBUSY: pr_err("%s needs used memory resources and cannot be " "loaded or queried\n", seg_name); @@ -760,10 +658,13 @@ void segment_warning(int rc, char *seg_name) pr_err("There is not enough memory to load or query " "DCSS %s\n", seg_name); break; - case -ERANGE: - pr_err("DCSS %s exceeds the kernel mapping range (%lu) " - "and cannot be loaded\n", seg_name, VMEM_MAX_PHYS); + case -ERANGE: { + struct range mhp_range = arch_get_mappable_range(); + + pr_err("DCSS %s exceeds the kernel mapping range (%llu) " + "and cannot be loaded\n", seg_name, mhp_range.end + 1); break; + } default: break; } diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 047c3e4c59a2..e2e13778c36a 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -1,17 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0 /* * S390 version * Copyright IBM Corp. 
1999 * Author(s): Hartmut Penner (hp@de.ibm.com) - * Ulrich Weigand (uweigand@de.ibm.com) + * Ulrich Weigand (uweigand@de.ibm.com) * * Derived from "arch/i386/mm/fault.c" * Copyright (C) 1995 Linus Torvalds */ #include <linux/kernel_stat.h> +#include <linux/mmu_context.h> +#include <linux/cpufeature.h> #include <linux/perf_event.h> #include <linux/signal.h> #include <linux/sched.h> +#include <linux/sched/debug.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/string.h> @@ -19,241 +23,233 @@ #include <linux/ptrace.h> #include <linux/mman.h> #include <linux/mm.h> -#include <linux/compat.h> #include <linux/smp.h> #include <linux/kdebug.h> #include <linux/init.h> #include <linux/console.h> -#include <linux/module.h> +#include <linux/extable.h> #include <linux/hardirq.h> #include <linux/kprobes.h> #include <linux/uaccess.h> #include <linux/hugetlb.h> +#include <linux/kfence.h> +#include <linux/pagewalk.h> +#include <asm/asm-extable.h> #include <asm/asm-offsets.h> -#include <asm/pgtable.h> +#include <asm/ptrace.h> +#include <asm/fault.h> +#include <asm/diag.h> #include <asm/irq.h> -#include <asm/mmu_context.h> #include <asm/facility.h> +#include <asm/uv.h> #include "../kernel/entry.h" -#ifndef CONFIG_64BIT -#define __FAIL_ADDR_MASK 0x7ffff000 -#define __SUBCODE_MASK 0x0200 -#define __PF_RES_FIELD 0ULL -#else /* CONFIG_64BIT */ -#define __FAIL_ADDR_MASK -4096L -#define __SUBCODE_MASK 0x0600 -#define __PF_RES_FIELD 0x8000000000000000ULL -#endif /* CONFIG_64BIT */ +/* + * Find out which address space caused the exception. + */ +static bool is_kernel_fault(struct pt_regs *regs) +{ + union teid teid = { .val = regs->int_parm_long }; -#define VM_FAULT_BADCONTEXT 0x010000 -#define VM_FAULT_BADMAP 0x020000 -#define VM_FAULT_BADACCESS 0x040000 -#define VM_FAULT_SIGNAL 0x080000 + if (user_mode(regs)) + return false; + if (teid.as == PSW_BITS_AS_SECONDARY) + return false; + return true; +} + +static unsigned long get_fault_address(struct pt_regs *regs) +{ + union teid teid = { .val = regs->int_parm_long }; -static unsigned long store_indication __read_mostly; + return teid.addr * PAGE_SIZE; +} -#ifdef CONFIG_64BIT -static int __init fault_init(void) +static __always_inline bool fault_is_write(struct pt_regs *regs) { + union teid teid = { .val = regs->int_parm_long }; + if (test_facility(75)) - store_indication = 0xc00; - return 0; + return teid.fsi == TEID_FSI_STORE; + return false; } -early_initcall(fault_init); -#endif -static inline int notify_page_fault(struct pt_regs *regs) +static void dump_pagetable(unsigned long asce, unsigned long address) { - int ret = 0; - - /* kprobe_running() needs smp_processor_id() */ - if (kprobes_built_in() && !user_mode(regs)) { - preempt_disable(); - if (kprobe_running() && kprobe_fault_handler(regs, 14)) - ret = 1; - preempt_enable(); + unsigned long entry, *table = __va(asce & _ASCE_ORIGIN); + + pr_alert("AS:%016lx ", asce); + switch (asce & _ASCE_TYPE_MASK) { + case _ASCE_TYPE_REGION1: + table += (address & _REGION1_INDEX) >> _REGION1_SHIFT; + if (get_kernel_nofault(entry, table)) + goto bad; + pr_cont("R1:%016lx ", entry); + if (entry & _REGION_ENTRY_INVALID) + goto out; + table = __va(entry & _REGION_ENTRY_ORIGIN); + fallthrough; + case _ASCE_TYPE_REGION2: + table += (address & _REGION2_INDEX) >> _REGION2_SHIFT; + if (get_kernel_nofault(entry, table)) + goto bad; + pr_cont("R2:%016lx ", entry); + if (entry & _REGION_ENTRY_INVALID) + goto out; + table = __va(entry & _REGION_ENTRY_ORIGIN); + fallthrough; + case _ASCE_TYPE_REGION3: + table += 
(address & _REGION3_INDEX) >> _REGION3_SHIFT; + if (get_kernel_nofault(entry, table)) + goto bad; + pr_cont("R3:%016lx ", entry); + if (entry & (_REGION_ENTRY_INVALID | _REGION3_ENTRY_LARGE)) + goto out; + table = __va(entry & _REGION_ENTRY_ORIGIN); + fallthrough; + case _ASCE_TYPE_SEGMENT: + table += (address & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; + if (get_kernel_nofault(entry, table)) + goto bad; + pr_cont("S:%016lx ", entry); + if (entry & (_SEGMENT_ENTRY_INVALID | _SEGMENT_ENTRY_LARGE)) + goto out; + table = __va(entry & _SEGMENT_ENTRY_ORIGIN); } - return ret; + table += (address & _PAGE_INDEX) >> PAGE_SHIFT; + if (get_kernel_nofault(entry, table)) + goto bad; + pr_cont("P:%016lx ", entry); +out: + pr_cont("\n"); + return; +bad: + pr_cont("BAD\n"); } - -/* - * Unlock any spinlocks which will prevent us from getting the - * message out. - */ -void bust_spinlocks(int yes) +static void dump_fault_info(struct pt_regs *regs) { - if (yes) { - oops_in_progress = 1; + union teid teid = { .val = regs->int_parm_long }; + unsigned long asce; + + pr_alert("Failing address: %016lx TEID: %016lx", + get_fault_address(regs), teid.val); + if (test_facility(131)) + pr_cont(" ESOP-2"); + else if (machine_has_esop()) + pr_cont(" ESOP-1"); + else + pr_cont(" SOP"); + if (test_facility(75)) + pr_cont(" FSI"); + pr_cont("\n"); + pr_alert("Fault in "); + switch (teid.as) { + case PSW_BITS_AS_HOME: + pr_cont("home space "); + break; + case PSW_BITS_AS_SECONDARY: + pr_cont("secondary space "); + break; + case PSW_BITS_AS_ACCREG: + pr_cont("access register "); + break; + case PSW_BITS_AS_PRIMARY: + pr_cont("primary space "); + break; + } + pr_cont("mode while using "); + if (is_kernel_fault(regs)) { + asce = get_lowcore()->kernel_asce.val; + pr_cont("kernel "); } else { - int loglevel_save = console_loglevel; - console_unblank(); - oops_in_progress = 0; - /* - * OK, the message is on the console. Now we call printk() - * without oops_in_progress set so that printk will give klogd - * a poke. Hold onto your hats... - */ - console_loglevel = 15; - printk(" "); - console_loglevel = loglevel_save; + asce = get_lowcore()->user_asce.val; + pr_cont("user "); } + pr_cont("ASCE.\n"); + dump_pagetable(asce, get_fault_address(regs)); } -/* - * Returns the address space associated with the fault. - * Returns 0 for kernel space and 1 for user space. - */ -static inline int user_space_fault(unsigned long trans_exc_code) +int show_unhandled_signals = 1; + +static const struct ctl_table s390_fault_sysctl_table[] = { + { + .procname = "userprocess_debug", + .data = &show_unhandled_signals, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +}; + +static int __init init_s390_fault_sysctls(void) { - /* - * The lowest two bits of the translation exception - * identification indicate which paging table was used. - */ - trans_exc_code &= 3; - if (trans_exc_code == 2) - /* Access via secondary space, set_fs setting decides */ - return current->thread.mm_segment.ar4; - if (s390_user_mode == HOME_SPACE_MODE) - /* User space if the access has been done via home space. */ - return trans_exc_code == 3; - /* - * If the user space is not the home space the kernel runs in home - * space. Access via secondary space has already been covered, - * access via primary space or access register is from user space - * and access via home space is from the kernel. 
- */ - return trans_exc_code != 3; + register_sysctl_init("kernel", s390_fault_sysctl_table); + return 0; } +arch_initcall(init_s390_fault_sysctls); -static inline void report_user_fault(struct pt_regs *regs, long signr) +void report_user_fault(struct pt_regs *regs, long signr, int is_mm_fault) { + static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); + if ((task_pid_nr(current) > 1) && !show_unhandled_signals) return; if (!unhandled_signal(current, signr)) return; - if (!printk_ratelimit()) + if (!__ratelimit(&rs)) return; - printk(KERN_ALERT "User process fault: interruption code 0x%X ", - regs->int_code); - print_vma_addr(KERN_CONT "in ", regs->psw.addr & PSW_ADDR_INSN); - printk(KERN_CONT "\n"); - printk(KERN_ALERT "failing address: %lX\n", - regs->int_parm_long & __FAIL_ADDR_MASK); + pr_alert("User process fault: interruption code %04x ilc:%d ", + regs->int_code & 0xffff, regs->int_code >> 17); + print_vma_addr(KERN_CONT "in ", regs->psw.addr); + pr_cont("\n"); + if (is_mm_fault) + dump_fault_info(regs); show_regs(regs); } -/* - * Send SIGSEGV to task. This is an external routine - * to keep the stack usage of do_page_fault small. - */ -static noinline void do_sigsegv(struct pt_regs *regs, int si_code) +static void do_sigsegv(struct pt_regs *regs, int si_code) { - struct siginfo si; - - report_user_fault(regs, SIGSEGV); - si.si_signo = SIGSEGV; - si.si_code = si_code; - si.si_addr = (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK); - force_sig_info(SIGSEGV, &si, current); + report_user_fault(regs, SIGSEGV, 1); + force_sig_fault(SIGSEGV, si_code, (void __user *)get_fault_address(regs)); } -static noinline void do_no_context(struct pt_regs *regs) +static void handle_fault_error_nolock(struct pt_regs *regs, int si_code) { - const struct exception_table_entry *fixup; unsigned long address; + bool is_write; - /* Are we prepared to handle this kernel fault? */ - fixup = search_exception_tables(regs->psw.addr & PSW_ADDR_INSN); - if (fixup) { - regs->psw.addr = extable_fixup(fixup) | PSW_ADDR_AMODE; + if (user_mode(regs)) { + if (WARN_ON_ONCE(!si_code)) + si_code = SEGV_MAPERR; + return do_sigsegv(regs, si_code); + } + if (fixup_exception(regs)) return; + if (is_kernel_fault(regs)) { + address = get_fault_address(regs); + is_write = fault_is_write(regs); + if (kfence_handle_page_fault(address, is_write, regs)) + return; + pr_alert("Unable to handle kernel pointer dereference in virtual kernel address space\n"); + } else { + pr_alert("Unable to handle kernel paging request in virtual user address space\n"); } - - /* - * Oops. The kernel tried to access some bad page. We'll have to - * terminate things with extreme prejudice. - */ - address = regs->int_parm_long & __FAIL_ADDR_MASK; - if (!user_space_fault(regs->int_parm_long)) - printk(KERN_ALERT "Unable to handle kernel pointer dereference" - " at virtual kernel address %p\n", (void *)address); - else - printk(KERN_ALERT "Unable to handle kernel paging request" - " at virtual user address %p\n", (void *)address); - + dump_fault_info(regs); die(regs, "Oops"); - do_exit(SIGKILL); } -static noinline void do_low_address(struct pt_regs *regs) +static void handle_fault_error(struct pt_regs *regs, int si_code) { - /* Low-address protection hit in kernel mode means - NULL pointer write access in kernel mode. */ - if (regs->psw.mask & PSW_MASK_PSTATE) { - /* Low-address protection hit in user mode 'cannot happen'. 
*/ - die (regs, "Low-address protection"); - do_exit(SIGKILL); - } + struct mm_struct *mm = current->mm; - do_no_context(regs); + mmap_read_unlock(mm); + handle_fault_error_nolock(regs, si_code); } -static noinline void do_sigbus(struct pt_regs *regs) +static void do_sigbus(struct pt_regs *regs) { - struct task_struct *tsk = current; - struct siginfo si; - - /* - * Send a sigbus, regardless of whether we were in kernel - * or user mode. - */ - si.si_signo = SIGBUS; - si.si_errno = 0; - si.si_code = BUS_ADRERR; - si.si_addr = (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK); - force_sig_info(SIGBUS, &si, tsk); -} - -static noinline void do_fault_error(struct pt_regs *regs, int fault) -{ - int si_code; - - switch (fault) { - case VM_FAULT_BADACCESS: - case VM_FAULT_BADMAP: - /* Bad memory access. Check if it is kernel or user space. */ - if (user_mode(regs)) { - /* User mode accesses just cause a SIGSEGV */ - si_code = (fault == VM_FAULT_BADMAP) ? - SEGV_MAPERR : SEGV_ACCERR; - do_sigsegv(regs, si_code); - return; - } - case VM_FAULT_BADCONTEXT: - do_no_context(regs); - break; - case VM_FAULT_SIGNAL: - if (!user_mode(regs)) - do_no_context(regs); - break; - default: /* fault & VM_FAULT_ERROR */ - if (fault & VM_FAULT_OOM) { - if (!user_mode(regs)) - do_no_context(regs); - else - pagefault_out_of_memory(); - } else if (fault & VM_FAULT_SIGBUS) { - /* Kernel mode? Handle exceptions or die */ - if (!user_mode(regs)) - do_no_context(regs); - else - do_sigbus(regs); - } else - BUG(); - break; - } + force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)get_fault_address(regs)); } /* @@ -262,427 +258,216 @@ static noinline void do_fault_error(struct pt_regs *regs, int fault) * routines. * * interruption code (int_code): - * 04 Protection -> Write-Protection (suprression) - * 10 Segment translation -> Not present (nullification) - * 11 Page translation -> Not present (nullification) - * 3b Region third trans. -> Not present (nullification) + * 04 Protection -> Write-Protection (suppression) + * 10 Segment translation -> Not present (nullification) + * 11 Page translation -> Not present (nullification) + * 3b Region third trans. -> Not present (nullification) */ -static inline int do_exception(struct pt_regs *regs, int access) +static void do_exception(struct pt_regs *regs, int access) { - struct task_struct *tsk; - struct mm_struct *mm; struct vm_area_struct *vma; - unsigned long trans_exc_code; unsigned long address; + struct mm_struct *mm; unsigned int flags; - int fault; + vm_fault_t fault; + bool is_write; - tsk = current; /* * The instruction that caused the program check has * been nullified. Don't signal single step via SIGTRAP. */ - clear_tsk_thread_flag(tsk, TIF_PER_TRAP); - - if (notify_page_fault(regs)) - return 0; - - mm = tsk->mm; - trans_exc_code = regs->int_parm_long; - - /* - * Verify that the fault happened in user space, that - * we are not in an interrupt and that there is a - * user context. 
- */ - fault = VM_FAULT_BADCONTEXT; - if (unlikely(!user_space_fault(trans_exc_code) || in_atomic() || !mm)) - goto out; - - address = trans_exc_code & __FAIL_ADDR_MASK; + clear_thread_flag(TIF_PER_TRAP); + if (kprobe_page_fault(regs, 14)) + return; + mm = current->mm; + address = get_fault_address(regs); + is_write = fault_is_write(regs); + if (is_kernel_fault(regs) || faulthandler_disabled() || !mm) + return handle_fault_error_nolock(regs, 0); perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); - flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; - if (access == VM_WRITE || (trans_exc_code & store_indication) == 0x400) + flags = FAULT_FLAG_DEFAULT; + if (user_mode(regs)) + flags |= FAULT_FLAG_USER; + if (is_write) + access = VM_WRITE; + if (access == VM_WRITE) flags |= FAULT_FLAG_WRITE; - down_read(&mm->mmap_sem); - -#ifdef CONFIG_PGSTE - if ((current->flags & PF_VCPU) && S390_lowcore.gmap) { - address = __gmap_fault(address, - (struct gmap *) S390_lowcore.gmap); - if (address == -EFAULT) { - fault = VM_FAULT_BADMAP; - goto out_up; - } - if (address == -ENOMEM) { - fault = VM_FAULT_OOM; - goto out_up; - } + if (!(flags & FAULT_FLAG_USER)) + goto lock_mmap; + vma = lock_vma_under_rcu(mm, address); + if (!vma) + goto lock_mmap; + if (!(vma->vm_flags & access)) { + vma_end_read(vma); + count_vm_vma_lock_event(VMA_LOCK_SUCCESS); + return handle_fault_error_nolock(regs, SEGV_ACCERR); } -#endif - + fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) + vma_end_read(vma); + if (!(fault & VM_FAULT_RETRY)) { + count_vm_vma_lock_event(VMA_LOCK_SUCCESS); + goto done; + } + count_vm_vma_lock_event(VMA_LOCK_RETRY); + if (fault & VM_FAULT_MAJOR) + flags |= FAULT_FLAG_TRIED; + /* Quick path to respond to signals */ + if (fault_signal_pending(fault, regs)) { + if (!user_mode(regs)) + handle_fault_error_nolock(regs, 0); + return; + } +lock_mmap: retry: - fault = VM_FAULT_BADMAP; - vma = find_vma(mm, address); + vma = lock_mm_and_find_vma(mm, address, regs); if (!vma) - goto out_up; - - if (unlikely(vma->vm_start > address)) { - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto out_up; - if (expand_stack(vma, address)) - goto out_up; - } - - /* - * Ok, we have a good vm_area for this memory access, so - * we can handle it.. - */ - fault = VM_FAULT_BADACCESS; + return handle_fault_error_nolock(regs, SEGV_MAPERR); if (unlikely(!(vma->vm_flags & access))) - goto out_up; - - if (is_vm_hugetlb_page(vma)) - address &= HPAGE_MASK; - /* - * If for any reason at all we couldn't handle the fault, - * make sure we exit gracefully rather than endlessly redo - * the fault. - */ - fault = handle_mm_fault(mm, vma, address, flags); - /* No reason to continue if interrupted by SIGKILL. */ - if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) { - fault = VM_FAULT_SIGNAL; - goto out; + return handle_fault_error(regs, SEGV_ACCERR); + fault = handle_mm_fault(vma, address, flags, regs); + if (fault_signal_pending(fault, regs)) { + if (!user_mode(regs)) + handle_fault_error_nolock(regs, 0); + return; } - if (unlikely(fault & VM_FAULT_ERROR)) - goto out_up; - - /* - * Major/minor page fault accounting is only done on the - * initial attempt. If we go through a retry, it is extremely - * likely that the page will be found in page cache at that point. 
-	 */
-	if (flags & FAULT_FLAG_ALLOW_RETRY) {
-		if (fault & VM_FAULT_MAJOR) {
-			tsk->maj_flt++;
-			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
-				      regs, address);
-		} else {
-			tsk->min_flt++;
-			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
-				      regs, address);
-		}
-		if (fault & VM_FAULT_RETRY) {
-			/* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
-			 * of starvation. */
-			flags &= ~FAULT_FLAG_ALLOW_RETRY;
-			flags |= FAULT_FLAG_TRIED;
-			down_read(&mm->mmap_sem);
-			goto retry;
-		}
+	/* The fault is fully completed (including releasing mmap lock) */
+	if (fault & VM_FAULT_COMPLETED)
+		return;
+	if (fault & VM_FAULT_RETRY) {
+		flags |= FAULT_FLAG_TRIED;
+		goto retry;
+	}
+	mmap_read_unlock(mm);
+done:
+	if (!(fault & VM_FAULT_ERROR))
+		return;
+	if (fault & VM_FAULT_OOM) {
+		if (!user_mode(regs))
+			handle_fault_error_nolock(regs, 0);
+		else
+			pagefault_out_of_memory();
+	} else if (fault & VM_FAULT_SIGSEGV) {
+		if (!user_mode(regs))
+			handle_fault_error_nolock(regs, 0);
+		else
+			do_sigsegv(regs, SEGV_MAPERR);
+	} else if (fault & (VM_FAULT_SIGBUS | VM_FAULT_HWPOISON |
+			    VM_FAULT_HWPOISON_LARGE)) {
+		if (!user_mode(regs))
+			handle_fault_error_nolock(regs, 0);
+		else
+			do_sigbus(regs);
+	} else {
+		pr_emerg("Unexpected fault flags: %08x\n", fault);
+		BUG();
 	}
-	fault = 0;
-out_up:
-	up_read(&mm->mmap_sem);
-out:
-	return fault;
 }
 
-void __kprobes do_protection_exception(struct pt_regs *regs)
+void do_protection_exception(struct pt_regs *regs)
 {
-	unsigned long trans_exc_code;
-	int fault;
+	union teid teid = { .val = regs->int_parm_long };
 
-	trans_exc_code = regs->int_parm_long;
 	/*
 	 * Protection exceptions are suppressing, decrement psw address.
 	 * The exception to this rule are aborted transactions, for these
 	 * the PSW already points to the correct location.
 	 */
-	if (!(regs->int_code & 0x200))
+	if (!(regs->int_code & 0x200)) {
 		regs->psw.addr = __rewind_psw(regs->psw, regs->int_code >> 16);
-	/*
-	 * Check for low-address protection. This needs to be treated
-	 * as a special case because the translation exception code
-	 * field is not guaranteed to contain valid data in this case.
-	 */
-	if (unlikely(!(trans_exc_code & 4))) {
-		do_low_address(regs);
-		return;
+		set_pt_regs_flag(regs, PIF_PSW_ADDR_ADJUSTED);
 	}
-	fault = do_exception(regs, VM_WRITE);
-	if (unlikely(fault))
-		do_fault_error(regs, fault);
-}
-
-void __kprobes do_dat_exception(struct pt_regs *regs)
-{
-	int access, fault;
-
-	access = VM_READ | VM_EXEC | VM_WRITE;
-	fault = do_exception(regs, access);
-	if (unlikely(fault))
-		do_fault_error(regs, fault);
-}
-
-#ifdef CONFIG_64BIT
-void __kprobes do_asce_exception(struct pt_regs *regs)
-{
-	struct mm_struct *mm = current->mm;
-	struct vm_area_struct *vma;
-	unsigned long trans_exc_code;
-
 	/*
-	 * The instruction that caused the program check has
-	 * been nullified. Don't signal single step via SIGTRAP.
+	 * If bit 61 of the TEID is not set, the remainder of the
+	 * TEID is unpredictable. Special handling is required.
 	 */
-	clear_tsk_thread_flag(current, TIF_PER_TRAP);
-
-	trans_exc_code = regs->int_parm_long;
-	if (unlikely(!user_space_fault(trans_exc_code) || in_atomic() || !mm))
-		goto no_context;
-
-	down_read(&mm->mmap_sem);
-	vma = find_vma(mm, trans_exc_code & __FAIL_ADDR_MASK);
-	up_read(&mm->mmap_sem);
-
-	if (vma) {
-		update_mm(mm, current);
-		return;
+	if (unlikely(!teid.b61)) {
+		if (user_mode(regs)) {
+			dump_fault_info(regs);
+			die(regs, "Unexpected TEID");
+		}
+		/* Assume low-address protection in kernel mode. 
*/ + return handle_fault_error_nolock(regs, 0); } - - /* User mode accesses just cause a SIGSEGV */ - if (user_mode(regs)) { - do_sigsegv(regs, SEGV_MAPERR); - return; + if (unlikely(cpu_has_nx() && teid.b56)) { + regs->int_parm_long = (teid.addr * PAGE_SIZE) | (regs->psw.addr & PAGE_MASK); + return handle_fault_error_nolock(regs, SEGV_ACCERR); } - -no_context: - do_no_context(regs); -} -#endif - -int __handle_fault(unsigned long uaddr, unsigned long pgm_int_code, int write) -{ - struct pt_regs regs; - int access, fault; - - /* Emulate a uaccess fault from kernel mode. */ - regs.psw.mask = psw_kernel_bits | PSW_MASK_DAT | PSW_MASK_MCHECK; - if (!irqs_disabled()) - regs.psw.mask |= PSW_MASK_IO | PSW_MASK_EXT; - regs.psw.addr = (unsigned long) __builtin_return_address(0); - regs.psw.addr |= PSW_ADDR_AMODE; - regs.int_code = pgm_int_code; - regs.int_parm_long = (uaddr & PAGE_MASK) | 2; - access = write ? VM_WRITE : VM_READ; - fault = do_exception(®s, access); - /* - * Since the fault happened in kernel mode while performing a uaccess - * all we need to do now is emulating a fixup in case "fault" is not - * zero. - * For the calling uaccess functions this results always in -EFAULT. - */ - return fault ? -EFAULT : 0; + do_exception(regs, VM_WRITE); } +NOKPROBE_SYMBOL(do_protection_exception); -#ifdef CONFIG_PFAULT -/* - * 'pfault' pseudo page faults routines. - */ -static int pfault_disable; - -static int __init nopfault(char *str) +void do_dat_exception(struct pt_regs *regs) { - pfault_disable = 1; - return 1; + do_exception(regs, VM_ACCESS_FLAGS); } +NOKPROBE_SYMBOL(do_dat_exception); -__setup("nopfault", nopfault); - -struct pfault_refbk { - u16 refdiagc; - u16 reffcode; - u16 refdwlen; - u16 refversn; - u64 refgaddr; - u64 refselmk; - u64 refcmpmk; - u64 reserved; -} __attribute__ ((packed, aligned(8))); - -int pfault_init(void) -{ - struct pfault_refbk refbk = { - .refdiagc = 0x258, - .reffcode = 0, - .refdwlen = 5, - .refversn = 2, - .refgaddr = __LC_CURRENT_PID, - .refselmk = 1ULL << 48, - .refcmpmk = 1ULL << 48, - .reserved = __PF_RES_FIELD }; - int rc; - - if (pfault_disable) - return -1; - asm volatile( - " diag %1,%0,0x258\n" - "0: j 2f\n" - "1: la %0,8\n" - "2:\n" - EX_TABLE(0b,1b) - : "=d" (rc) : "a" (&refbk), "m" (refbk) : "cc"); - return rc; -} +#if IS_ENABLED(CONFIG_PGSTE) -void pfault_fini(void) +void do_secure_storage_access(struct pt_regs *regs) { - struct pfault_refbk refbk = { - .refdiagc = 0x258, - .reffcode = 1, - .refdwlen = 5, - .refversn = 2, - }; - - if (pfault_disable) - return; - asm volatile( - " diag %0,0,0x258\n" - "0:\n" - EX_TABLE(0b,0b) - : : "a" (&refbk), "m" (refbk) : "cc"); -} - -static DEFINE_SPINLOCK(pfault_lock); -static LIST_HEAD(pfault_list); - -static void pfault_interrupt(struct ext_code ext_code, - unsigned int param32, unsigned long param64) -{ - struct task_struct *tsk; - __u16 subcode; - pid_t pid; + union teid teid = { .val = regs->int_parm_long }; + unsigned long addr = get_fault_address(regs); + struct vm_area_struct *vma; + struct folio_walk fw; + struct mm_struct *mm; + struct folio *folio; + int rc; /* - * Get the external interruption subcode & pfault - * initial/completion signal bit. VM stores this - * in the 'cpu address' field associated with the - * external interrupt. + * Bit 61 indicates if the address is valid, if it is not the + * kernel should be stopped or SIGSEGV should be sent to the + * process. Bit 61 is not reliable without the misc UV feature, + * therefore this needs to be checked too. 
*/ - subcode = ext_code.subcode; - if ((subcode & 0xff00) != __SUBCODE_MASK) - return; - inc_irq_stat(IRQEXT_PFL); - /* Get the token (= pid of the affected task). */ - pid = sizeof(void *) == 4 ? param32 : param64; - rcu_read_lock(); - tsk = find_task_by_pid_ns(pid, &init_pid_ns); - if (tsk) - get_task_struct(tsk); - rcu_read_unlock(); - if (!tsk) - return; - spin_lock(&pfault_lock); - if (subcode & 0x0080) { - /* signal bit is set -> a page has been swapped in by VM */ - if (tsk->thread.pfault_wait == 1) { - /* Initial interrupt was faster than the completion - * interrupt. pfault_wait is valid. Set pfault_wait - * back to zero and wake up the process. This can - * safely be done because the task is still sleeping - * and can't produce new pfaults. */ - tsk->thread.pfault_wait = 0; - list_del(&tsk->thread.list); - wake_up_process(tsk); - put_task_struct(tsk); - } else { - /* Completion interrupt was faster than initial - * interrupt. Set pfault_wait to -1 so the initial - * interrupt doesn't put the task to sleep. - * If the task is not running, ignore the completion - * interrupt since it must be a leftover of a PFAULT - * CANCEL operation which didn't remove all pending - * completion interrupts. */ - if (tsk->state == TASK_RUNNING) - tsk->thread.pfault_wait = -1; - } - } else { - /* signal bit not set -> a real page is missing. */ - if (WARN_ON_ONCE(tsk != current)) - goto out; - if (tsk->thread.pfault_wait == 1) { - /* Already on the list with a reference: put to sleep */ - __set_task_state(tsk, TASK_UNINTERRUPTIBLE); - set_tsk_need_resched(tsk); - } else if (tsk->thread.pfault_wait == -1) { - /* Completion interrupt was faster than the initial - * interrupt (pfault_wait == -1). Set pfault_wait - * back to zero and exit. */ - tsk->thread.pfault_wait = 0; - } else { - /* Initial interrupt arrived before completion - * interrupt. Let the task sleep. - * An extra task reference is needed since a different - * cpu may set the task state to TASK_RUNNING again - * before the scheduler is reached. */ - get_task_struct(tsk); - tsk->thread.pfault_wait = 1; - list_add(&tsk->thread.list, &pfault_list); - __set_task_state(tsk, TASK_UNINTERRUPTIBLE); - set_tsk_need_resched(tsk); + if (uv_has_feature(BIT_UV_FEAT_MISC) && !teid.b61) { + /* + * When this happens, userspace did something that it + * was not supposed to do, e.g. branching into secure + * memory. Trigger a segmentation fault. + */ + if (user_mode(regs)) { + send_sig(SIGSEGV, current, 0); + return; } + /* + * The kernel should never run into this case and + * there is no way out of this situation. 
+ */ + panic("Unexpected PGM 0x3d with TEID bit 61=0"); } -out: - spin_unlock(&pfault_lock); - put_task_struct(tsk); -} - -static int __cpuinit pfault_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - struct thread_struct *thread, *next; - struct task_struct *tsk; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DEAD: - spin_lock_irq(&pfault_lock); - list_for_each_entry_safe(thread, next, &pfault_list, list) { - thread->pfault_wait = 0; - list_del(&thread->list); - tsk = container_of(thread, struct task_struct, thread); - wake_up_process(tsk); - put_task_struct(tsk); + if (is_kernel_fault(regs)) { + folio = phys_to_folio(addr); + if (unlikely(!folio_try_get(folio))) + return; + rc = arch_make_folio_accessible(folio); + folio_put(folio); + if (rc) + BUG(); + } else { + if (faulthandler_disabled()) + return handle_fault_error_nolock(regs, 0); + mm = current->mm; + mmap_read_lock(mm); + vma = find_vma(mm, addr); + if (!vma) + return handle_fault_error(regs, SEGV_MAPERR); + folio = folio_walk_start(&fw, vma, addr, 0); + if (!folio) { + mmap_read_unlock(mm); + return; } - spin_unlock_irq(&pfault_lock); - break; - default: - break; + /* arch_make_folio_accessible() needs a raised refcount. */ + folio_get(folio); + rc = arch_make_folio_accessible(folio); + folio_put(folio); + folio_walk_end(&fw, vma); + if (rc) + send_sig(SIGSEGV, current, 0); + mmap_read_unlock(mm); } - return NOTIFY_OK; -} - -static int __init pfault_irq_init(void) -{ - int rc; - - rc = register_external_interrupt(0x2603, pfault_interrupt); - if (rc) - goto out_extint; - rc = pfault_init() == 0 ? 0 : -EOPNOTSUPP; - if (rc) - goto out_pfault; - service_subclass_irq_register(); - hotcpu_notifier(pfault_cpu_notify, 0); - return 0; - -out_pfault: - unregister_external_interrupt(0x2603, pfault_interrupt); -out_extint: - pfault_disable = 1; - return rc; } -early_initcall(pfault_irq_init); +NOKPROBE_SYMBOL(do_secure_storage_access); -#endif /* CONFIG_PFAULT */ +#endif /* CONFIG_PGSTE */ diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c new file mode 100644 index 000000000000..dd85bcca817d --- /dev/null +++ b/arch/s390/mm/gmap.c @@ -0,0 +1,2436 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KVM guest address space mapping code + * + * Copyright IBM Corp. 2007, 2020 + * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> + * David Hildenbrand <david@redhat.com> + * Janosch Frank <frankja@linux.vnet.ibm.com> + */ + +#include <linux/cpufeature.h> +#include <linux/export.h> +#include <linux/kernel.h> +#include <linux/pagewalk.h> +#include <linux/swap.h> +#include <linux/smp.h> +#include <linux/spinlock.h> +#include <linux/slab.h> +#include <linux/swapops.h> +#include <linux/ksm.h> +#include <linux/mman.h> +#include <linux/pgtable.h> +#include <asm/page-states.h> +#include <asm/pgalloc.h> +#include <asm/machine.h> +#include <asm/gmap_helpers.h> +#include <asm/gmap.h> +#include <asm/page.h> + +/* + * The address is saved in a radix tree directly; NULL would be ambiguous, + * since 0 is a valid address, and NULL is returned when nothing was found. + * The lower bits are ignored by all users of the macro, so it can be used + * to distinguish a valid address 0 from a NULL. 
+ */ +#define VALID_GADDR_FLAG 1 +#define IS_GADDR_VALID(gaddr) ((gaddr) & VALID_GADDR_FLAG) +#define MAKE_VALID_GADDR(gaddr) (((gaddr) & HPAGE_MASK) | VALID_GADDR_FLAG) + +#define GMAP_SHADOW_FAKE_TABLE 1ULL + +static struct page *gmap_alloc_crst(void) +{ + struct page *page; + + page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); + if (!page) + return NULL; + __arch_set_page_dat(page_to_virt(page), 1UL << CRST_ALLOC_ORDER); + return page; +} + +/** + * gmap_alloc - allocate and initialize a guest address space + * @limit: maximum address of the gmap address space + * + * Returns a guest address space structure. + */ +struct gmap *gmap_alloc(unsigned long limit) +{ + struct gmap *gmap; + struct page *page; + unsigned long *table; + unsigned long etype, atype; + + if (limit < _REGION3_SIZE) { + limit = _REGION3_SIZE - 1; + atype = _ASCE_TYPE_SEGMENT; + etype = _SEGMENT_ENTRY_EMPTY; + } else if (limit < _REGION2_SIZE) { + limit = _REGION2_SIZE - 1; + atype = _ASCE_TYPE_REGION3; + etype = _REGION3_ENTRY_EMPTY; + } else if (limit < _REGION1_SIZE) { + limit = _REGION1_SIZE - 1; + atype = _ASCE_TYPE_REGION2; + etype = _REGION2_ENTRY_EMPTY; + } else { + limit = -1UL; + atype = _ASCE_TYPE_REGION1; + etype = _REGION1_ENTRY_EMPTY; + } + gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL_ACCOUNT); + if (!gmap) + goto out; + INIT_LIST_HEAD(&gmap->children); + INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL_ACCOUNT); + INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC | __GFP_ACCOUNT); + INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC | __GFP_ACCOUNT); + spin_lock_init(&gmap->guest_table_lock); + spin_lock_init(&gmap->shadow_lock); + refcount_set(&gmap->ref_count, 1); + page = gmap_alloc_crst(); + if (!page) + goto out_free; + table = page_to_virt(page); + crst_table_init(table, etype); + gmap->table = table; + gmap->asce = atype | _ASCE_TABLE_LENGTH | + _ASCE_USER_BITS | __pa(table); + gmap->asce_end = limit; + return gmap; + +out_free: + kfree(gmap); +out: + return NULL; +} +EXPORT_SYMBOL_GPL(gmap_alloc); + +/** + * gmap_create - create a guest address space + * @mm: pointer to the parent mm_struct + * @limit: maximum size of the gmap address space + * + * Returns a guest address space structure. 
+ */ +struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit) +{ + struct gmap *gmap; + unsigned long gmap_asce; + + gmap = gmap_alloc(limit); + if (!gmap) + return NULL; + gmap->mm = mm; + spin_lock(&mm->context.lock); + list_add_rcu(&gmap->list, &mm->context.gmap_list); + if (list_is_singular(&mm->context.gmap_list)) + gmap_asce = gmap->asce; + else + gmap_asce = -1UL; + WRITE_ONCE(mm->context.gmap_asce, gmap_asce); + spin_unlock(&mm->context.lock); + return gmap; +} +EXPORT_SYMBOL_GPL(gmap_create); + +static void gmap_flush_tlb(struct gmap *gmap) +{ + __tlb_flush_idte(gmap->asce); +} + +static void gmap_radix_tree_free(struct radix_tree_root *root) +{ + struct radix_tree_iter iter; + unsigned long indices[16]; + unsigned long index; + void __rcu **slot; + int i, nr; + + /* A radix tree is freed by deleting all of its entries */ + index = 0; + do { + nr = 0; + radix_tree_for_each_slot(slot, root, &iter, index) { + indices[nr] = iter.index; + if (++nr == 16) + break; + } + for (i = 0; i < nr; i++) { + index = indices[i]; + radix_tree_delete(root, index); + } + } while (nr > 0); +} + +static void gmap_rmap_radix_tree_free(struct radix_tree_root *root) +{ + struct gmap_rmap *rmap, *rnext, *head; + struct radix_tree_iter iter; + unsigned long indices[16]; + unsigned long index; + void __rcu **slot; + int i, nr; + + /* A radix tree is freed by deleting all of its entries */ + index = 0; + do { + nr = 0; + radix_tree_for_each_slot(slot, root, &iter, index) { + indices[nr] = iter.index; + if (++nr == 16) + break; + } + for (i = 0; i < nr; i++) { + index = indices[i]; + head = radix_tree_delete(root, index); + gmap_for_each_rmap_safe(rmap, rnext, head) + kfree(rmap); + } + } while (nr > 0); +} + +static void gmap_free_crst(unsigned long *table, bool free_ptes) +{ + bool is_segment = (table[0] & _SEGMENT_ENTRY_TYPE_MASK) == 0; + int i; + + if (is_segment) { + if (!free_ptes) + goto out; + for (i = 0; i < _CRST_ENTRIES; i++) + if (!(table[i] & _SEGMENT_ENTRY_INVALID)) + page_table_free_pgste(page_ptdesc(phys_to_page(table[i]))); + } else { + for (i = 0; i < _CRST_ENTRIES; i++) + if (!(table[i] & _REGION_ENTRY_INVALID)) + gmap_free_crst(__va(table[i] & PAGE_MASK), free_ptes); + } + +out: + free_pages((unsigned long)table, CRST_ALLOC_ORDER); +} + +/** + * gmap_free - free a guest address space + * @gmap: pointer to the guest address space structure + * + * No locks required. There are no references to this gmap anymore. + */ +void gmap_free(struct gmap *gmap) +{ + /* Flush tlb of all gmaps (if not already done for shadows) */ + if (!(gmap_is_shadow(gmap) && gmap->removed)) + gmap_flush_tlb(gmap); + /* Free all segment & region tables. 
*/ + gmap_free_crst(gmap->table, gmap_is_shadow(gmap)); + + gmap_radix_tree_free(&gmap->guest_to_host); + gmap_radix_tree_free(&gmap->host_to_guest); + + /* Free additional data for a shadow gmap */ + if (gmap_is_shadow(gmap)) { + gmap_rmap_radix_tree_free(&gmap->host_to_rmap); + /* Release reference to the parent */ + gmap_put(gmap->parent); + } + + kfree(gmap); +} +EXPORT_SYMBOL_GPL(gmap_free); + +/** + * gmap_get - increase reference counter for guest address space + * @gmap: pointer to the guest address space structure + * + * Returns the gmap pointer + */ +struct gmap *gmap_get(struct gmap *gmap) +{ + refcount_inc(&gmap->ref_count); + return gmap; +} +EXPORT_SYMBOL_GPL(gmap_get); + +/** + * gmap_put - decrease reference counter for guest address space + * @gmap: pointer to the guest address space structure + * + * If the reference counter reaches zero the guest address space is freed. + */ +void gmap_put(struct gmap *gmap) +{ + if (refcount_dec_and_test(&gmap->ref_count)) + gmap_free(gmap); +} +EXPORT_SYMBOL_GPL(gmap_put); + +/** + * gmap_remove - remove a guest address space but do not free it yet + * @gmap: pointer to the guest address space structure + */ +void gmap_remove(struct gmap *gmap) +{ + struct gmap *sg, *next; + unsigned long gmap_asce; + + /* Remove all shadow gmaps linked to this gmap */ + if (!list_empty(&gmap->children)) { + spin_lock(&gmap->shadow_lock); + list_for_each_entry_safe(sg, next, &gmap->children, list) { + list_del(&sg->list); + gmap_put(sg); + } + spin_unlock(&gmap->shadow_lock); + } + /* Remove gmap from the pre-mm list */ + spin_lock(&gmap->mm->context.lock); + list_del_rcu(&gmap->list); + if (list_empty(&gmap->mm->context.gmap_list)) + gmap_asce = 0; + else if (list_is_singular(&gmap->mm->context.gmap_list)) + gmap_asce = list_first_entry(&gmap->mm->context.gmap_list, + struct gmap, list)->asce; + else + gmap_asce = -1UL; + WRITE_ONCE(gmap->mm->context.gmap_asce, gmap_asce); + spin_unlock(&gmap->mm->context.lock); + synchronize_rcu(); + /* Put reference */ + gmap_put(gmap); +} +EXPORT_SYMBOL_GPL(gmap_remove); + +/* + * gmap_alloc_table is assumed to be called with mmap_lock held + */ +static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, + unsigned long init, unsigned long gaddr) +{ + struct page *page; + unsigned long *new; + + /* since we dont free the gmap table until gmap_free we can unlock */ + page = gmap_alloc_crst(); + if (!page) + return -ENOMEM; + new = page_to_virt(page); + crst_table_init(new, init); + spin_lock(&gmap->guest_table_lock); + if (*table & _REGION_ENTRY_INVALID) { + *table = __pa(new) | _REGION_ENTRY_LENGTH | + (*table & _REGION_ENTRY_TYPE_MASK); + page = NULL; + } + spin_unlock(&gmap->guest_table_lock); + if (page) + __free_pages(page, CRST_ALLOC_ORDER); + return 0; +} + +static unsigned long host_to_guest_lookup(struct gmap *gmap, unsigned long vmaddr) +{ + return (unsigned long)radix_tree_lookup(&gmap->host_to_guest, vmaddr >> PMD_SHIFT); +} + +static unsigned long host_to_guest_delete(struct gmap *gmap, unsigned long vmaddr) +{ + return (unsigned long)radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT); +} + +static pmd_t *host_to_guest_pmd_delete(struct gmap *gmap, unsigned long vmaddr, + unsigned long *gaddr) +{ + *gaddr = host_to_guest_delete(gmap, vmaddr); + if (IS_GADDR_VALID(*gaddr)) + return (pmd_t *)gmap_table_walk(gmap, *gaddr, 1); + return NULL; +} + +/** + * __gmap_unlink_by_vmaddr - unlink a single segment via a host address + * @gmap: pointer to the guest address space structure + * 
@vmaddr: address in the host process address space + * + * Returns 1 if a TLB flush is required + */ +static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr) +{ + unsigned long gaddr; + int flush = 0; + pmd_t *pmdp; + + BUG_ON(gmap_is_shadow(gmap)); + spin_lock(&gmap->guest_table_lock); + + pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr); + if (pmdp) { + flush = (pmd_val(*pmdp) != _SEGMENT_ENTRY_EMPTY); + *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY); + } + + spin_unlock(&gmap->guest_table_lock); + return flush; +} + +/** + * __gmap_unmap_by_gaddr - unmap a single segment via a guest address + * @gmap: pointer to the guest address space structure + * @gaddr: address in the guest address space + * + * Returns 1 if a TLB flush is required + */ +static int __gmap_unmap_by_gaddr(struct gmap *gmap, unsigned long gaddr) +{ + unsigned long vmaddr; + + vmaddr = (unsigned long) radix_tree_delete(&gmap->guest_to_host, + gaddr >> PMD_SHIFT); + return vmaddr ? __gmap_unlink_by_vmaddr(gmap, vmaddr) : 0; +} + +/** + * gmap_unmap_segment - unmap segment from the guest address space + * @gmap: pointer to the guest address space structure + * @to: address in the guest address space + * @len: length of the memory area to unmap + * + * Returns 0 if the unmap succeeded, -EINVAL if not. + */ +int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len) +{ + unsigned long off; + int flush; + + BUG_ON(gmap_is_shadow(gmap)); + if ((to | len) & (PMD_SIZE - 1)) + return -EINVAL; + if (len == 0 || to + len < to) + return -EINVAL; + + flush = 0; + mmap_write_lock(gmap->mm); + for (off = 0; off < len; off += PMD_SIZE) + flush |= __gmap_unmap_by_gaddr(gmap, to + off); + mmap_write_unlock(gmap->mm); + if (flush) + gmap_flush_tlb(gmap); + return 0; +} +EXPORT_SYMBOL_GPL(gmap_unmap_segment); + +/** + * gmap_map_segment - map a segment to the guest address space + * @gmap: pointer to the guest address space structure + * @from: source address in the parent address space + * @to: target address in the guest address space + * @len: length of the memory area to map + * + * Returns 0 if the mmap succeeded, -EINVAL or -ENOMEM if not. + */ +int gmap_map_segment(struct gmap *gmap, unsigned long from, + unsigned long to, unsigned long len) +{ + unsigned long off; + int flush; + + BUG_ON(gmap_is_shadow(gmap)); + if ((from | to | len) & (PMD_SIZE - 1)) + return -EINVAL; + if (len == 0 || from + len < from || to + len < to || + from + len - 1 > TASK_SIZE_MAX || to + len - 1 > gmap->asce_end) + return -EINVAL; + + flush = 0; + mmap_write_lock(gmap->mm); + for (off = 0; off < len; off += PMD_SIZE) { + /* Remove old translation */ + flush |= __gmap_unmap_by_gaddr(gmap, to + off); + /* Store new translation */ + if (radix_tree_insert(&gmap->guest_to_host, + (to + off) >> PMD_SHIFT, + (void *) from + off)) + break; + } + mmap_write_unlock(gmap->mm); + if (flush) + gmap_flush_tlb(gmap); + if (off >= len) + return 0; + gmap_unmap_segment(gmap, to, len); + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(gmap_map_segment); + +/** + * __gmap_translate - translate a guest address to a user space address + * @gmap: pointer to guest mapping meta data structure + * @gaddr: guest address + * + * Returns user space address which corresponds to the guest address or + * -EFAULT if no such mapping exists. + * This function does not establish potentially missing page table entries. + * The mmap_lock of the mm that belongs to the address space must be held + * when this function gets called. 
+ * + * Note: Can also be called for shadow gmaps. + */ +unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr) +{ + unsigned long vmaddr; + + vmaddr = (unsigned long) + radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT); + /* Note: guest_to_host is empty for a shadow gmap */ + return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT; +} +EXPORT_SYMBOL_GPL(__gmap_translate); + +/** + * gmap_unlink - disconnect a page table from the gmap shadow tables + * @mm: pointer to the parent mm_struct + * @table: pointer to the host page table + * @vmaddr: vm address associated with the host page table + */ +void gmap_unlink(struct mm_struct *mm, unsigned long *table, + unsigned long vmaddr) +{ + struct gmap *gmap; + int flush; + + rcu_read_lock(); + list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { + flush = __gmap_unlink_by_vmaddr(gmap, vmaddr); + if (flush) + gmap_flush_tlb(gmap); + } + rcu_read_unlock(); +} + +static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *old, pmd_t new, + unsigned long gaddr); + +/** + * __gmap_link - set up shadow page tables to connect a host to a guest address + * @gmap: pointer to guest mapping meta data structure + * @gaddr: guest address + * @vmaddr: vm address + * + * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT + * if the vm address is already mapped to a different guest segment. + * The mmap_lock of the mm that belongs to the address space must be held + * when this function gets called. + */ +int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) +{ + struct mm_struct *mm; + unsigned long *table; + spinlock_t *ptl; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + u64 unprot; + int rc; + + BUG_ON(gmap_is_shadow(gmap)); + /* Create higher level tables in the gmap page table */ + table = gmap->table; + if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) { + table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT; + if ((*table & _REGION_ENTRY_INVALID) && + gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY, + gaddr & _REGION1_MASK)) + return -ENOMEM; + table = __va(*table & _REGION_ENTRY_ORIGIN); + } + if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) { + table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT; + if ((*table & _REGION_ENTRY_INVALID) && + gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY, + gaddr & _REGION2_MASK)) + return -ENOMEM; + table = __va(*table & _REGION_ENTRY_ORIGIN); + } + if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) { + table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT; + if ((*table & _REGION_ENTRY_INVALID) && + gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY, + gaddr & _REGION3_MASK)) + return -ENOMEM; + table = __va(*table & _REGION_ENTRY_ORIGIN); + } + table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; + /* Walk the parent mm page table */ + mm = gmap->mm; + pgd = pgd_offset(mm, vmaddr); + VM_BUG_ON(pgd_none(*pgd)); + p4d = p4d_offset(pgd, vmaddr); + VM_BUG_ON(p4d_none(*p4d)); + pud = pud_offset(p4d, vmaddr); + VM_BUG_ON(pud_none(*pud)); + /* large puds cannot yet be handled */ + if (pud_leaf(*pud)) + return -EFAULT; + pmd = pmd_offset(pud, vmaddr); + VM_BUG_ON(pmd_none(*pmd)); + /* Are we allowed to use huge pages? */ + if (pmd_leaf(*pmd) && !gmap->mm->context.allow_gmap_hpage_1m) + return -EFAULT; + /* Link gmap segment table entry location to page table. 
*/ + rc = radix_tree_preload(GFP_KERNEL_ACCOUNT); + if (rc) + return rc; + ptl = pmd_lock(mm, pmd); + spin_lock(&gmap->guest_table_lock); + if (*table == _SEGMENT_ENTRY_EMPTY) { + rc = radix_tree_insert(&gmap->host_to_guest, + vmaddr >> PMD_SHIFT, + (void *)MAKE_VALID_GADDR(gaddr)); + if (!rc) { + if (pmd_leaf(*pmd)) { + *table = (pmd_val(*pmd) & + _SEGMENT_ENTRY_HARDWARE_BITS_LARGE) + | _SEGMENT_ENTRY_GMAP_UC + | _SEGMENT_ENTRY; + } else + *table = (pmd_val(*pmd) & + _SEGMENT_ENTRY_HARDWARE_BITS) + | _SEGMENT_ENTRY; + } + } else if (*table & _SEGMENT_ENTRY_PROTECT && + !(pmd_val(*pmd) & _SEGMENT_ENTRY_PROTECT)) { + unprot = (u64)*table; + unprot &= ~_SEGMENT_ENTRY_PROTECT; + unprot |= _SEGMENT_ENTRY_GMAP_UC; + gmap_pmdp_xchg(gmap, (pmd_t *)table, __pmd(unprot), gaddr); + } + spin_unlock(&gmap->guest_table_lock); + spin_unlock(ptl); + radix_tree_preload_end(); + return rc; +} +EXPORT_SYMBOL(__gmap_link); + +/* + * this function is assumed to be called with mmap_lock held + */ +void __gmap_zap(struct gmap *gmap, unsigned long gaddr) +{ + unsigned long vmaddr; + + mmap_assert_locked(gmap->mm); + + /* Find the vm address for the guest address */ + vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host, + gaddr >> PMD_SHIFT); + if (vmaddr) { + vmaddr |= gaddr & ~PMD_MASK; + gmap_helper_zap_one_page(gmap->mm, vmaddr); + } +} +EXPORT_SYMBOL_GPL(__gmap_zap); + +static LIST_HEAD(gmap_notifier_list); +static DEFINE_SPINLOCK(gmap_notifier_lock); + +/** + * gmap_register_pte_notifier - register a pte invalidation callback + * @nb: pointer to the gmap notifier block + */ +void gmap_register_pte_notifier(struct gmap_notifier *nb) +{ + spin_lock(&gmap_notifier_lock); + list_add_rcu(&nb->list, &gmap_notifier_list); + spin_unlock(&gmap_notifier_lock); +} +EXPORT_SYMBOL_GPL(gmap_register_pte_notifier); + +/** + * gmap_unregister_pte_notifier - remove a pte invalidation callback + * @nb: pointer to the gmap notifier block + */ +void gmap_unregister_pte_notifier(struct gmap_notifier *nb) +{ + spin_lock(&gmap_notifier_lock); + list_del_rcu(&nb->list); + spin_unlock(&gmap_notifier_lock); + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(gmap_unregister_pte_notifier); + +/** + * gmap_call_notifier - call all registered invalidation callbacks + * @gmap: pointer to guest mapping meta data structure + * @start: start virtual address in the guest address space + * @end: end virtual address in the guest address space + */ +static void gmap_call_notifier(struct gmap *gmap, unsigned long start, + unsigned long end) +{ + struct gmap_notifier *nb; + + list_for_each_entry(nb, &gmap_notifier_list, list) + nb->notifier_call(gmap, start, end); +} + +/** + * gmap_table_walk - walk the gmap page tables + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * @level: page table level to stop at + * + * Returns a table entry pointer for the given guest address and @level + * @level=0 : returns a pointer to a page table table entry (or NULL) + * @level=1 : returns a pointer to a segment table entry (or NULL) + * @level=2 : returns a pointer to a region-3 table entry (or NULL) + * @level=3 : returns a pointer to a region-2 table entry (or NULL) + * @level=4 : returns a pointer to a region-1 table entry (or NULL) + * + * Returns NULL if the gmap page tables could not be walked to the + * requested level. + * + * Note: Can also be called for shadow gmaps. 
+ */ +unsigned long *gmap_table_walk(struct gmap *gmap, unsigned long gaddr, int level) +{ + const int asce_type = gmap->asce & _ASCE_TYPE_MASK; + unsigned long *table = gmap->table; + + if (gmap_is_shadow(gmap) && gmap->removed) + return NULL; + + if (WARN_ON_ONCE(level > (asce_type >> 2) + 1)) + return NULL; + + if (asce_type != _ASCE_TYPE_REGION1 && + gaddr & (-1UL << (31 + (asce_type >> 2) * 11))) + return NULL; + + switch (asce_type) { + case _ASCE_TYPE_REGION1: + table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT; + if (level == 4) + break; + if (*table & _REGION_ENTRY_INVALID) + return NULL; + table = __va(*table & _REGION_ENTRY_ORIGIN); + fallthrough; + case _ASCE_TYPE_REGION2: + table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT; + if (level == 3) + break; + if (*table & _REGION_ENTRY_INVALID) + return NULL; + table = __va(*table & _REGION_ENTRY_ORIGIN); + fallthrough; + case _ASCE_TYPE_REGION3: + table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT; + if (level == 2) + break; + if (*table & _REGION_ENTRY_INVALID) + return NULL; + table = __va(*table & _REGION_ENTRY_ORIGIN); + fallthrough; + case _ASCE_TYPE_SEGMENT: + table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; + if (level == 1) + break; + if (*table & _REGION_ENTRY_INVALID) + return NULL; + table = __va(*table & _SEGMENT_ENTRY_ORIGIN); + table += (gaddr & _PAGE_INDEX) >> PAGE_SHIFT; + } + return table; +} +EXPORT_SYMBOL(gmap_table_walk); + +/** + * gmap_pte_op_walk - walk the gmap page table, get the page table lock + * and return the pte pointer + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * @ptl: pointer to the spinlock pointer + * + * Returns a pointer to the locked pte for a guest address, or NULL + */ +static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr, + spinlock_t **ptl) +{ + unsigned long *table; + + BUG_ON(gmap_is_shadow(gmap)); + /* Walk the gmap page table, lock and get pte pointer */ + table = gmap_table_walk(gmap, gaddr, 1); /* get segment pointer */ + if (!table || *table & _SEGMENT_ENTRY_INVALID) + return NULL; + return pte_alloc_map_lock(gmap->mm, (pmd_t *) table, gaddr, ptl); +} + +/** + * gmap_pte_op_fixup - force a page in and connect the gmap page table + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * @vmaddr: address in the host process address space + * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE + * + * Returns 0 if the caller can retry __gmap_translate (might fail again), + * -ENOMEM if out of memory and -EFAULT if anything goes wrong while fixing + * up or connecting the gmap page table. + */ +static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr, + unsigned long vmaddr, int prot) +{ + struct mm_struct *mm = gmap->mm; + unsigned int fault_flags; + bool unlocked = false; + + BUG_ON(gmap_is_shadow(gmap)); + fault_flags = (prot == PROT_WRITE) ? 
FAULT_FLAG_WRITE : 0; + if (fixup_user_fault(mm, vmaddr, fault_flags, &unlocked)) + return -EFAULT; + if (unlocked) + /* lost mmap_lock, caller has to retry __gmap_translate */ + return 0; + /* Connect the page tables */ + return __gmap_link(gmap, gaddr, vmaddr); +} + +/** + * gmap_pte_op_end - release the page table lock + * @ptep: pointer to the locked pte + * @ptl: pointer to the page table spinlock + */ +static void gmap_pte_op_end(pte_t *ptep, spinlock_t *ptl) +{ + pte_unmap_unlock(ptep, ptl); +} + +/** + * gmap_pmd_op_walk - walk the gmap tables, get the guest table lock + * and return the pmd pointer + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * + * Returns a pointer to the pmd for a guest address, or NULL + */ +static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr) +{ + pmd_t *pmdp; + + BUG_ON(gmap_is_shadow(gmap)); + pmdp = (pmd_t *) gmap_table_walk(gmap, gaddr, 1); + if (!pmdp) + return NULL; + + /* without huge pages, there is no need to take the table lock */ + if (!gmap->mm->context.allow_gmap_hpage_1m) + return pmd_none(*pmdp) ? NULL : pmdp; + + spin_lock(&gmap->guest_table_lock); + if (pmd_none(*pmdp)) { + spin_unlock(&gmap->guest_table_lock); + return NULL; + } + + /* 4k page table entries are locked via the pte (pte_alloc_map_lock). */ + if (!pmd_leaf(*pmdp)) + spin_unlock(&gmap->guest_table_lock); + return pmdp; +} + +/** + * gmap_pmd_op_end - release the guest_table_lock if needed + * @gmap: pointer to the guest mapping meta data structure + * @pmdp: pointer to the pmd + */ +static inline void gmap_pmd_op_end(struct gmap *gmap, pmd_t *pmdp) +{ + if (pmd_leaf(*pmdp)) + spin_unlock(&gmap->guest_table_lock); +} + +/* + * gmap_protect_pmd - remove access rights to memory and set pmd notification bits + * @pmdp: pointer to the pmd to be protected + * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE + * @bits: notification bits to set + * + * Returns: + * 0 if successfully protected + * -EAGAIN if a fixup is needed + * -EINVAL if unsupported notifier bits have been specified + * + * Expected to be called with sg->mm->mmap_lock in read and + * guest_table_lock held. 
+ */ +static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr, + pmd_t *pmdp, int prot, unsigned long bits) +{ + int pmd_i = pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID; + int pmd_p = pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT; + pmd_t new = *pmdp; + + /* Fixup needed */ + if ((pmd_i && (prot != PROT_NONE)) || (pmd_p && (prot == PROT_WRITE))) + return -EAGAIN; + + if (prot == PROT_NONE && !pmd_i) { + new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID)); + gmap_pmdp_xchg(gmap, pmdp, new, gaddr); + } + + if (prot == PROT_READ && !pmd_p) { + new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID)); + new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_PROTECT)); + gmap_pmdp_xchg(gmap, pmdp, new, gaddr); + } + + if (bits & GMAP_NOTIFY_MPROT) + set_pmd(pmdp, set_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN))); + + /* Shadow GMAP protection needs split PMDs */ + if (bits & GMAP_NOTIFY_SHADOW) + return -EINVAL; + + return 0; +} + +/* + * gmap_protect_pte - remove access rights to memory and set pgste bits + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * @pmdp: pointer to the pmd associated with the pte + * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE + * @bits: notification bits to set + * + * Returns 0 if successfully protected, -ENOMEM if out of memory and + * -EAGAIN if a fixup is needed. + * + * Expected to be called with sg->mm->mmap_lock in read + */ +static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr, + pmd_t *pmdp, int prot, unsigned long bits) +{ + int rc; + pte_t *ptep; + spinlock_t *ptl; + unsigned long pbits = 0; + + if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID) + return -EAGAIN; + + ptep = pte_alloc_map_lock(gmap->mm, pmdp, gaddr, &ptl); + if (!ptep) + return -ENOMEM; + + pbits |= (bits & GMAP_NOTIFY_MPROT) ? PGSTE_IN_BIT : 0; + pbits |= (bits & GMAP_NOTIFY_SHADOW) ? PGSTE_VSIE_BIT : 0; + /* Protect and unlock. */ + rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, pbits); + gmap_pte_op_end(ptep, ptl); + return rc; +} + +/* + * gmap_protect_range - remove access rights to memory and set pgste bits + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * @len: size of area + * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE + * @bits: pgste notification bits to set + * + * Returns: + * PAGE_SIZE if a small page was successfully protected; + * HPAGE_SIZE if a large page was successfully protected; + * -ENOMEM if out of memory; + * -EFAULT if gaddr is invalid (or mapping for shadows is missing); + * -EAGAIN if the guest mapping is missing and should be fixed by the caller. + * + * Context: Called with sg->mm->mmap_lock in read. + */ +int gmap_protect_one(struct gmap *gmap, unsigned long gaddr, int prot, unsigned long bits) +{ + pmd_t *pmdp; + int rc = 0; + + BUG_ON(gmap_is_shadow(gmap)); + + pmdp = gmap_pmd_op_walk(gmap, gaddr); + if (!pmdp) + return -EAGAIN; + + if (!pmd_leaf(*pmdp)) { + rc = gmap_protect_pte(gmap, gaddr, pmdp, prot, bits); + if (!rc) + rc = PAGE_SIZE; + } else { + rc = gmap_protect_pmd(gmap, gaddr, pmdp, prot, bits); + if (!rc) + rc = HPAGE_SIZE; + } + gmap_pmd_op_end(gmap, pmdp); + + return rc; +} +EXPORT_SYMBOL_GPL(gmap_protect_one); + +/** + * gmap_read_table - get an unsigned long value from a guest page table using + * absolute addressing, without marking the page referenced. 
+ * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * @val: pointer to the unsigned long value to return + * + * Returns 0 if the value was read, -ENOMEM if out of memory and -EFAULT + * if reading using the virtual address failed. -EINVAL if called on a gmap + * shadow. + * + * Called with gmap->mm->mmap_lock in read. + */ +int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val) +{ + unsigned long address, vmaddr; + spinlock_t *ptl; + pte_t *ptep, pte; + int rc; + + if (gmap_is_shadow(gmap)) + return -EINVAL; + + while (1) { + rc = -EAGAIN; + ptep = gmap_pte_op_walk(gmap, gaddr, &ptl); + if (ptep) { + pte = *ptep; + if (pte_present(pte) && (pte_val(pte) & _PAGE_READ)) { + address = pte_val(pte) & PAGE_MASK; + address += gaddr & ~PAGE_MASK; + *val = *(unsigned long *)__va(address); + set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_YOUNG))); + /* Do *NOT* clear the _PAGE_INVALID bit! */ + rc = 0; + } + gmap_pte_op_end(ptep, ptl); + } + if (!rc) + break; + vmaddr = __gmap_translate(gmap, gaddr); + if (IS_ERR_VALUE(vmaddr)) { + rc = vmaddr; + break; + } + rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, PROT_READ); + if (rc) + break; + } + return rc; +} +EXPORT_SYMBOL_GPL(gmap_read_table); + +/** + * gmap_insert_rmap - add a rmap to the host_to_rmap radix tree + * @sg: pointer to the shadow guest address space structure + * @vmaddr: vm address associated with the rmap + * @rmap: pointer to the rmap structure + * + * Called with the sg->guest_table_lock + */ +static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr, + struct gmap_rmap *rmap) +{ + struct gmap_rmap *temp; + void __rcu **slot; + + BUG_ON(!gmap_is_shadow(sg)); + slot = radix_tree_lookup_slot(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT); + if (slot) { + rmap->next = radix_tree_deref_slot_protected(slot, + &sg->guest_table_lock); + for (temp = rmap->next; temp; temp = temp->next) { + if (temp->raddr == rmap->raddr) { + kfree(rmap); + return; + } + } + radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap); + } else { + rmap->next = NULL; + radix_tree_insert(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT, + rmap); + } +} + +/** + * gmap_protect_rmap - restrict access rights to memory (RO) and create an rmap + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow gmap + * @paddr: address in the parent guest address space + * @len: length of the memory area to protect + * + * Returns 0 if successfully protected and the rmap was created, -ENOMEM + * if out of memory and -EFAULT if paddr is invalid. 
+ */ +static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr, + unsigned long paddr, unsigned long len) +{ + struct gmap *parent; + struct gmap_rmap *rmap; + unsigned long vmaddr; + spinlock_t *ptl; + pte_t *ptep; + int rc; + + BUG_ON(!gmap_is_shadow(sg)); + parent = sg->parent; + while (len) { + vmaddr = __gmap_translate(parent, paddr); + if (IS_ERR_VALUE(vmaddr)) + return vmaddr; + rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT); + if (!rmap) + return -ENOMEM; + rmap->raddr = raddr; + rc = radix_tree_preload(GFP_KERNEL_ACCOUNT); + if (rc) { + kfree(rmap); + return rc; + } + rc = -EAGAIN; + ptep = gmap_pte_op_walk(parent, paddr, &ptl); + if (ptep) { + spin_lock(&sg->guest_table_lock); + rc = ptep_force_prot(parent->mm, paddr, ptep, PROT_READ, + PGSTE_VSIE_BIT); + if (!rc) + gmap_insert_rmap(sg, vmaddr, rmap); + spin_unlock(&sg->guest_table_lock); + gmap_pte_op_end(ptep, ptl); + } + radix_tree_preload_end(); + if (rc) { + kfree(rmap); + rc = gmap_pte_op_fixup(parent, paddr, vmaddr, PROT_READ); + if (rc) + return rc; + continue; + } + paddr += PAGE_SIZE; + len -= PAGE_SIZE; + } + return 0; +} + +#define _SHADOW_RMAP_MASK 0x7 +#define _SHADOW_RMAP_REGION1 0x5 +#define _SHADOW_RMAP_REGION2 0x4 +#define _SHADOW_RMAP_REGION3 0x3 +#define _SHADOW_RMAP_SEGMENT 0x2 +#define _SHADOW_RMAP_PGTABLE 0x1 + +/** + * gmap_idte_one - invalidate a single region or segment table entry + * @asce: region or segment table *origin* + table-type bits + * @vaddr: virtual address to identify the table entry to flush + * + * The invalid bit of a single region or segment table entry is set + * and the associated TLB entries depending on the entry are flushed. + * The table-type of the @asce identifies the portion of the @vaddr + * that is used as the invalidation index. 
+ */ +static inline void gmap_idte_one(unsigned long asce, unsigned long vaddr) +{ + asm volatile( + " idte %0,0,%1" + : : "a" (asce), "a" (vaddr) : "cc", "memory"); +} + +/** + * gmap_unshadow_page - remove a page from a shadow page table + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * + * Called with the sg->guest_table_lock + */ +static void gmap_unshadow_page(struct gmap *sg, unsigned long raddr) +{ + unsigned long *table; + + BUG_ON(!gmap_is_shadow(sg)); + table = gmap_table_walk(sg, raddr, 0); /* get page table pointer */ + if (!table || *table & _PAGE_INVALID) + return; + gmap_call_notifier(sg, raddr, raddr + PAGE_SIZE - 1); + ptep_unshadow_pte(sg->mm, raddr, (pte_t *) table); +} + +/** + * __gmap_unshadow_pgt - remove all entries from a shadow page table + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * @pgt: pointer to the start of a shadow page table + * + * Called with the sg->guest_table_lock + */ +static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr, + unsigned long *pgt) +{ + int i; + + BUG_ON(!gmap_is_shadow(sg)); + for (i = 0; i < _PAGE_ENTRIES; i++, raddr += PAGE_SIZE) + pgt[i] = _PAGE_INVALID; +} + +/** + * gmap_unshadow_pgt - remove a shadow page table from a segment entry + * @sg: pointer to the shadow guest address space structure + * @raddr: address in the shadow guest address space + * + * Called with the sg->guest_table_lock + */ +static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr) +{ + unsigned long *ste; + phys_addr_t sto, pgt; + struct ptdesc *ptdesc; + + BUG_ON(!gmap_is_shadow(sg)); + ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */ + if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN)) + return; + gmap_call_notifier(sg, raddr, raddr + _SEGMENT_SIZE - 1); + sto = __pa(ste - ((raddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT)); + gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr); + pgt = *ste & _SEGMENT_ENTRY_ORIGIN; + *ste = _SEGMENT_ENTRY_EMPTY; + __gmap_unshadow_pgt(sg, raddr, __va(pgt)); + /* Free page table */ + ptdesc = page_ptdesc(phys_to_page(pgt)); + page_table_free_pgste(ptdesc); +} + +/** + * __gmap_unshadow_sgt - remove all entries from a shadow segment table + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * @sgt: pointer to the start of a shadow segment table + * + * Called with the sg->guest_table_lock + */ +static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr, + unsigned long *sgt) +{ + struct ptdesc *ptdesc; + phys_addr_t pgt; + int i; + + BUG_ON(!gmap_is_shadow(sg)); + for (i = 0; i < _CRST_ENTRIES; i++, raddr += _SEGMENT_SIZE) { + if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN)) + continue; + pgt = sgt[i] & _REGION_ENTRY_ORIGIN; + sgt[i] = _SEGMENT_ENTRY_EMPTY; + __gmap_unshadow_pgt(sg, raddr, __va(pgt)); + /* Free page table */ + ptdesc = page_ptdesc(phys_to_page(pgt)); + page_table_free_pgste(ptdesc); + } +} + +/** + * gmap_unshadow_sgt - remove a shadow segment table from a region-3 entry + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * + * Called with the shadow->guest_table_lock + */ +static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr) +{ + unsigned long r3o, *r3e; + phys_addr_t sgt; + struct page *page; + + BUG_ON(!gmap_is_shadow(sg)); + r3e = gmap_table_walk(sg, raddr, 2); /* get region-3 pointer 
*/ + if (!r3e || !(*r3e & _REGION_ENTRY_ORIGIN)) + return; + gmap_call_notifier(sg, raddr, raddr + _REGION3_SIZE - 1); + r3o = (unsigned long) (r3e - ((raddr & _REGION3_INDEX) >> _REGION3_SHIFT)); + gmap_idte_one(__pa(r3o) | _ASCE_TYPE_REGION3, raddr); + sgt = *r3e & _REGION_ENTRY_ORIGIN; + *r3e = _REGION3_ENTRY_EMPTY; + __gmap_unshadow_sgt(sg, raddr, __va(sgt)); + /* Free segment table */ + page = phys_to_page(sgt); + __free_pages(page, CRST_ALLOC_ORDER); +} + +/** + * __gmap_unshadow_r3t - remove all entries from a shadow region-3 table + * @sg: pointer to the shadow guest address space structure + * @raddr: address in the shadow guest address space + * @r3t: pointer to the start of a shadow region-3 table + * + * Called with the sg->guest_table_lock + */ +static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr, + unsigned long *r3t) +{ + struct page *page; + phys_addr_t sgt; + int i; + + BUG_ON(!gmap_is_shadow(sg)); + for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION3_SIZE) { + if (!(r3t[i] & _REGION_ENTRY_ORIGIN)) + continue; + sgt = r3t[i] & _REGION_ENTRY_ORIGIN; + r3t[i] = _REGION3_ENTRY_EMPTY; + __gmap_unshadow_sgt(sg, raddr, __va(sgt)); + /* Free segment table */ + page = phys_to_page(sgt); + __free_pages(page, CRST_ALLOC_ORDER); + } +} + +/** + * gmap_unshadow_r3t - remove a shadow region-3 table from a region-2 entry + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * + * Called with the sg->guest_table_lock + */ +static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr) +{ + unsigned long r2o, *r2e; + phys_addr_t r3t; + struct page *page; + + BUG_ON(!gmap_is_shadow(sg)); + r2e = gmap_table_walk(sg, raddr, 3); /* get region-2 pointer */ + if (!r2e || !(*r2e & _REGION_ENTRY_ORIGIN)) + return; + gmap_call_notifier(sg, raddr, raddr + _REGION2_SIZE - 1); + r2o = (unsigned long) (r2e - ((raddr & _REGION2_INDEX) >> _REGION2_SHIFT)); + gmap_idte_one(__pa(r2o) | _ASCE_TYPE_REGION2, raddr); + r3t = *r2e & _REGION_ENTRY_ORIGIN; + *r2e = _REGION2_ENTRY_EMPTY; + __gmap_unshadow_r3t(sg, raddr, __va(r3t)); + /* Free region 3 table */ + page = phys_to_page(r3t); + __free_pages(page, CRST_ALLOC_ORDER); +} + +/** + * __gmap_unshadow_r2t - remove all entries from a shadow region-2 table + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * @r2t: pointer to the start of a shadow region-2 table + * + * Called with the sg->guest_table_lock + */ +static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr, + unsigned long *r2t) +{ + phys_addr_t r3t; + struct page *page; + int i; + + BUG_ON(!gmap_is_shadow(sg)); + for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION2_SIZE) { + if (!(r2t[i] & _REGION_ENTRY_ORIGIN)) + continue; + r3t = r2t[i] & _REGION_ENTRY_ORIGIN; + r2t[i] = _REGION2_ENTRY_EMPTY; + __gmap_unshadow_r3t(sg, raddr, __va(r3t)); + /* Free region 3 table */ + page = phys_to_page(r3t); + __free_pages(page, CRST_ALLOC_ORDER); + } +} + +/** + * gmap_unshadow_r2t - remove a shadow region-2 table from a region-1 entry + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * + * Called with the sg->guest_table_lock + */ +static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr) +{ + unsigned long r1o, *r1e; + struct page *page; + phys_addr_t r2t; + + BUG_ON(!gmap_is_shadow(sg)); + r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */ + if 
(!r1e || !(*r1e & _REGION_ENTRY_ORIGIN)) + return; + gmap_call_notifier(sg, raddr, raddr + _REGION1_SIZE - 1); + r1o = (unsigned long) (r1e - ((raddr & _REGION1_INDEX) >> _REGION1_SHIFT)); + gmap_idte_one(__pa(r1o) | _ASCE_TYPE_REGION1, raddr); + r2t = *r1e & _REGION_ENTRY_ORIGIN; + *r1e = _REGION1_ENTRY_EMPTY; + __gmap_unshadow_r2t(sg, raddr, __va(r2t)); + /* Free region 2 table */ + page = phys_to_page(r2t); + __free_pages(page, CRST_ALLOC_ORDER); +} + +/** + * __gmap_unshadow_r1t - remove all entries from a shadow region-1 table + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * @r1t: pointer to the start of a shadow region-1 table + * + * Called with the shadow->guest_table_lock + */ +static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr, + unsigned long *r1t) +{ + unsigned long asce; + struct page *page; + phys_addr_t r2t; + int i; + + BUG_ON(!gmap_is_shadow(sg)); + asce = __pa(r1t) | _ASCE_TYPE_REGION1; + for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION1_SIZE) { + if (!(r1t[i] & _REGION_ENTRY_ORIGIN)) + continue; + r2t = r1t[i] & _REGION_ENTRY_ORIGIN; + __gmap_unshadow_r2t(sg, raddr, __va(r2t)); + /* Clear entry and flush translation r1t -> r2t */ + gmap_idte_one(asce, raddr); + r1t[i] = _REGION1_ENTRY_EMPTY; + /* Free region 2 table */ + page = phys_to_page(r2t); + __free_pages(page, CRST_ALLOC_ORDER); + } +} + +/** + * gmap_unshadow - remove a shadow page table completely + * @sg: pointer to the shadow guest address space structure + * + * Called with sg->guest_table_lock + */ +void gmap_unshadow(struct gmap *sg) +{ + unsigned long *table; + + BUG_ON(!gmap_is_shadow(sg)); + if (sg->removed) + return; + sg->removed = 1; + gmap_call_notifier(sg, 0, -1UL); + gmap_flush_tlb(sg); + table = __va(sg->asce & _ASCE_ORIGIN); + switch (sg->asce & _ASCE_TYPE_MASK) { + case _ASCE_TYPE_REGION1: + __gmap_unshadow_r1t(sg, 0, table); + break; + case _ASCE_TYPE_REGION2: + __gmap_unshadow_r2t(sg, 0, table); + break; + case _ASCE_TYPE_REGION3: + __gmap_unshadow_r3t(sg, 0, table); + break; + case _ASCE_TYPE_SEGMENT: + __gmap_unshadow_sgt(sg, 0, table); + break; + } +} +EXPORT_SYMBOL(gmap_unshadow); + +/** + * gmap_shadow_r2t - create an empty shadow region 2 table + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * @r2t: parent gmap address of the region 2 table to get shadowed + * @fake: r2t references contiguous guest memory block, not a r2t + * + * The r2t parameter specifies the address of the source table. The + * four pages of the source table are made read-only in the parent gmap + * address space. A write to the source table area @r2t will automatically + * remove the shadow r2 table and all of its descendants. + * + * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the + * shadow table structure is incomplete, -ENOMEM if out of memory and + * -EFAULT if an address in the parent gmap could not be resolved. + * + * Called with sg->mm->mmap_lock in read. 
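+ *
+ * As a worked example for the protection range computed below (derived from
+ * the code, shown here for illustration): the table offset (TF) and table
+ * length (TL) bits encoded in @r2t select which of the four 4 KB pages of
+ * the source table are valid. With TF = 0 and TL = 3 this yields
+ * offset = 0 and len = 4 * PAGE_SIZE, i.e. the complete 16 KB region-2
+ * table is made read-only in the parent gmap.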
+ */ +int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, + int fake) +{ + unsigned long raddr, origin, offset, len; + unsigned long *table; + phys_addr_t s_r2t; + struct page *page; + int rc; + + BUG_ON(!gmap_is_shadow(sg)); + /* Allocate a shadow region second table */ + page = gmap_alloc_crst(); + if (!page) + return -ENOMEM; + s_r2t = page_to_phys(page); + /* Install shadow region second table */ + spin_lock(&sg->guest_table_lock); + table = gmap_table_walk(sg, saddr, 4); /* get region-1 pointer */ + if (!table) { + rc = -EAGAIN; /* Race with unshadow */ + goto out_free; + } + if (!(*table & _REGION_ENTRY_INVALID)) { + rc = 0; /* Already established */ + goto out_free; + } else if (*table & _REGION_ENTRY_ORIGIN) { + rc = -EAGAIN; /* Race with shadow */ + goto out_free; + } + crst_table_init(__va(s_r2t), _REGION2_ENTRY_EMPTY); + /* mark as invalid as long as the parent table is not protected */ + *table = s_r2t | _REGION_ENTRY_LENGTH | + _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID; + if (sg->edat_level >= 1) + *table |= (r2t & _REGION_ENTRY_PROTECT); + if (fake) { + /* nothing to protect for fake tables */ + *table &= ~_REGION_ENTRY_INVALID; + spin_unlock(&sg->guest_table_lock); + return 0; + } + spin_unlock(&sg->guest_table_lock); + /* Make r2t read-only in parent gmap page table */ + raddr = (saddr & _REGION1_MASK) | _SHADOW_RMAP_REGION1; + origin = r2t & _REGION_ENTRY_ORIGIN; + offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE; + len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset; + rc = gmap_protect_rmap(sg, raddr, origin + offset, len); + spin_lock(&sg->guest_table_lock); + if (!rc) { + table = gmap_table_walk(sg, saddr, 4); + if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r2t) + rc = -EAGAIN; /* Race with unshadow */ + else + *table &= ~_REGION_ENTRY_INVALID; + } else { + gmap_unshadow_r2t(sg, raddr); + } + spin_unlock(&sg->guest_table_lock); + return rc; +out_free: + spin_unlock(&sg->guest_table_lock); + __free_pages(page, CRST_ALLOC_ORDER); + return rc; +} +EXPORT_SYMBOL_GPL(gmap_shadow_r2t); + +/** + * gmap_shadow_r3t - create a shadow region 3 table + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * @r3t: parent gmap address of the region 3 table to get shadowed + * @fake: r3t references contiguous guest memory block, not a r3t + * + * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the + * shadow table structure is incomplete, -ENOMEM if out of memory and + * -EFAULT if an address in the parent gmap could not be resolved. + * + * Called with sg->mm->mmap_lock in read. 
+ */ +int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, + int fake) +{ + unsigned long raddr, origin, offset, len; + unsigned long *table; + phys_addr_t s_r3t; + struct page *page; + int rc; + + BUG_ON(!gmap_is_shadow(sg)); + /* Allocate a shadow region second table */ + page = gmap_alloc_crst(); + if (!page) + return -ENOMEM; + s_r3t = page_to_phys(page); + /* Install shadow region second table */ + spin_lock(&sg->guest_table_lock); + table = gmap_table_walk(sg, saddr, 3); /* get region-2 pointer */ + if (!table) { + rc = -EAGAIN; /* Race with unshadow */ + goto out_free; + } + if (!(*table & _REGION_ENTRY_INVALID)) { + rc = 0; /* Already established */ + goto out_free; + } else if (*table & _REGION_ENTRY_ORIGIN) { + rc = -EAGAIN; /* Race with shadow */ + goto out_free; + } + crst_table_init(__va(s_r3t), _REGION3_ENTRY_EMPTY); + /* mark as invalid as long as the parent table is not protected */ + *table = s_r3t | _REGION_ENTRY_LENGTH | + _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID; + if (sg->edat_level >= 1) + *table |= (r3t & _REGION_ENTRY_PROTECT); + if (fake) { + /* nothing to protect for fake tables */ + *table &= ~_REGION_ENTRY_INVALID; + spin_unlock(&sg->guest_table_lock); + return 0; + } + spin_unlock(&sg->guest_table_lock); + /* Make r3t read-only in parent gmap page table */ + raddr = (saddr & _REGION2_MASK) | _SHADOW_RMAP_REGION2; + origin = r3t & _REGION_ENTRY_ORIGIN; + offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE; + len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset; + rc = gmap_protect_rmap(sg, raddr, origin + offset, len); + spin_lock(&sg->guest_table_lock); + if (!rc) { + table = gmap_table_walk(sg, saddr, 3); + if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r3t) + rc = -EAGAIN; /* Race with unshadow */ + else + *table &= ~_REGION_ENTRY_INVALID; + } else { + gmap_unshadow_r3t(sg, raddr); + } + spin_unlock(&sg->guest_table_lock); + return rc; +out_free: + spin_unlock(&sg->guest_table_lock); + __free_pages(page, CRST_ALLOC_ORDER); + return rc; +} +EXPORT_SYMBOL_GPL(gmap_shadow_r3t); + +/** + * gmap_shadow_sgt - create a shadow segment table + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * @sgt: parent gmap address of the segment table to get shadowed + * @fake: sgt references contiguous guest memory block, not a sgt + * + * Returns: 0 if successfully shadowed or already shadowed, -EAGAIN if the + * shadow table structure is incomplete, -ENOMEM if out of memory and + * -EFAULT if an address in the parent gmap could not be resolved. + * + * Called with sg->mm->mmap_lock in read. 
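+ *
+ * A rough caller sketch (illustrative only; r2t_entry, r3t_entry, sgt_entry
+ * and pgt_entry are hypothetical names for the guest's table entries, and
+ * which levels are actually needed depends on the shadow ASCE type):
+ *
+ *	rc = gmap_shadow_r2t(sg, saddr, r2t_entry, fake);
+ *	if (!rc)
+ *		rc = gmap_shadow_r3t(sg, saddr, r3t_entry, fake);
+ *	if (!rc)
+ *		rc = gmap_shadow_sgt(sg, saddr, sgt_entry, fake);
+ *	if (!rc)
+ *		rc = gmap_shadow_pgt(sg, saddr, pgt_entry, fake);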
+ */ +int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, + int fake) +{ + unsigned long raddr, origin, offset, len; + unsigned long *table; + phys_addr_t s_sgt; + struct page *page; + int rc; + + BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE)); + /* Allocate a shadow segment table */ + page = gmap_alloc_crst(); + if (!page) + return -ENOMEM; + s_sgt = page_to_phys(page); + /* Install shadow region second table */ + spin_lock(&sg->guest_table_lock); + table = gmap_table_walk(sg, saddr, 2); /* get region-3 pointer */ + if (!table) { + rc = -EAGAIN; /* Race with unshadow */ + goto out_free; + } + if (!(*table & _REGION_ENTRY_INVALID)) { + rc = 0; /* Already established */ + goto out_free; + } else if (*table & _REGION_ENTRY_ORIGIN) { + rc = -EAGAIN; /* Race with shadow */ + goto out_free; + } + crst_table_init(__va(s_sgt), _SEGMENT_ENTRY_EMPTY); + /* mark as invalid as long as the parent table is not protected */ + *table = s_sgt | _REGION_ENTRY_LENGTH | + _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID; + if (sg->edat_level >= 1) + *table |= sgt & _REGION_ENTRY_PROTECT; + if (fake) { + /* nothing to protect for fake tables */ + *table &= ~_REGION_ENTRY_INVALID; + spin_unlock(&sg->guest_table_lock); + return 0; + } + spin_unlock(&sg->guest_table_lock); + /* Make sgt read-only in parent gmap page table */ + raddr = (saddr & _REGION3_MASK) | _SHADOW_RMAP_REGION3; + origin = sgt & _REGION_ENTRY_ORIGIN; + offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE; + len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset; + rc = gmap_protect_rmap(sg, raddr, origin + offset, len); + spin_lock(&sg->guest_table_lock); + if (!rc) { + table = gmap_table_walk(sg, saddr, 2); + if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_sgt) + rc = -EAGAIN; /* Race with unshadow */ + else + *table &= ~_REGION_ENTRY_INVALID; + } else { + gmap_unshadow_sgt(sg, raddr); + } + spin_unlock(&sg->guest_table_lock); + return rc; +out_free: + spin_unlock(&sg->guest_table_lock); + __free_pages(page, CRST_ALLOC_ORDER); + return rc; +} +EXPORT_SYMBOL_GPL(gmap_shadow_sgt); + +static void gmap_pgste_set_pgt_addr(struct ptdesc *ptdesc, unsigned long pgt_addr) +{ + unsigned long *pgstes = page_to_virt(ptdesc_page(ptdesc)); + + pgstes += _PAGE_ENTRIES; + + pgstes[0] &= ~PGSTE_ST2_MASK; + pgstes[1] &= ~PGSTE_ST2_MASK; + pgstes[2] &= ~PGSTE_ST2_MASK; + pgstes[3] &= ~PGSTE_ST2_MASK; + + pgstes[0] |= (pgt_addr >> 16) & PGSTE_ST2_MASK; + pgstes[1] |= pgt_addr & PGSTE_ST2_MASK; + pgstes[2] |= (pgt_addr << 16) & PGSTE_ST2_MASK; + pgstes[3] |= (pgt_addr << 32) & PGSTE_ST2_MASK; +} + +/** + * gmap_shadow_pgt - instantiate a shadow page table + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * @pgt: parent gmap address of the page table to get shadowed + * @fake: pgt references contiguous guest memory block, not a pgtable + * + * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the + * shadow table structure is incomplete, -ENOMEM if out of memory, + * -EFAULT if an address in the parent gmap could not be resolved and + * + * Called with gmap->mm->mmap_lock in read + */ +int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, + int fake) +{ + unsigned long raddr, origin; + unsigned long *table; + struct ptdesc *ptdesc; + phys_addr_t s_pgt; + int rc; + + BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE)); + /* Allocate a shadow page table */ + ptdesc = page_table_alloc_pgste(sg->mm); + if 
(!ptdesc) + return -ENOMEM; + origin = pgt & _SEGMENT_ENTRY_ORIGIN; + if (fake) + origin |= GMAP_SHADOW_FAKE_TABLE; + gmap_pgste_set_pgt_addr(ptdesc, origin); + s_pgt = page_to_phys(ptdesc_page(ptdesc)); + /* Install shadow page table */ + spin_lock(&sg->guest_table_lock); + table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */ + if (!table) { + rc = -EAGAIN; /* Race with unshadow */ + goto out_free; + } + if (!(*table & _SEGMENT_ENTRY_INVALID)) { + rc = 0; /* Already established */ + goto out_free; + } else if (*table & _SEGMENT_ENTRY_ORIGIN) { + rc = -EAGAIN; /* Race with shadow */ + goto out_free; + } + /* mark as invalid as long as the parent table is not protected */ + *table = (unsigned long) s_pgt | _SEGMENT_ENTRY | + (pgt & _SEGMENT_ENTRY_PROTECT) | _SEGMENT_ENTRY_INVALID; + if (fake) { + /* nothing to protect for fake tables */ + *table &= ~_SEGMENT_ENTRY_INVALID; + spin_unlock(&sg->guest_table_lock); + return 0; + } + spin_unlock(&sg->guest_table_lock); + /* Make pgt read-only in parent gmap page table (not the pgste) */ + raddr = (saddr & _SEGMENT_MASK) | _SHADOW_RMAP_SEGMENT; + origin = pgt & _SEGMENT_ENTRY_ORIGIN & PAGE_MASK; + rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE); + spin_lock(&sg->guest_table_lock); + if (!rc) { + table = gmap_table_walk(sg, saddr, 1); + if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) != s_pgt) + rc = -EAGAIN; /* Race with unshadow */ + else + *table &= ~_SEGMENT_ENTRY_INVALID; + } else { + gmap_unshadow_pgt(sg, raddr); + } + spin_unlock(&sg->guest_table_lock); + return rc; +out_free: + spin_unlock(&sg->guest_table_lock); + page_table_free_pgste(ptdesc); + return rc; + +} +EXPORT_SYMBOL_GPL(gmap_shadow_pgt); + +/** + * gmap_shadow_page - create a shadow page mapping + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * @pte: pte in parent gmap address space to get shadowed + * + * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the + * shadow table structure is incomplete, -ENOMEM if out of memory and + * -EFAULT if an address in the parent gmap could not be resolved. + * + * Called with sg->mm->mmap_lock in read. + */ +int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte) +{ + struct gmap *parent; + struct gmap_rmap *rmap; + unsigned long vmaddr, paddr; + spinlock_t *ptl; + pte_t *sptep, *tptep; + int prot; + int rc; + + BUG_ON(!gmap_is_shadow(sg)); + parent = sg->parent; + prot = (pte_val(pte) & _PAGE_PROTECT) ? 
PROT_READ : PROT_WRITE; + + rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT); + if (!rmap) + return -ENOMEM; + rmap->raddr = (saddr & PAGE_MASK) | _SHADOW_RMAP_PGTABLE; + + while (1) { + paddr = pte_val(pte) & PAGE_MASK; + vmaddr = __gmap_translate(parent, paddr); + if (IS_ERR_VALUE(vmaddr)) { + rc = vmaddr; + break; + } + rc = radix_tree_preload(GFP_KERNEL_ACCOUNT); + if (rc) + break; + rc = -EAGAIN; + sptep = gmap_pte_op_walk(parent, paddr, &ptl); + if (sptep) { + spin_lock(&sg->guest_table_lock); + /* Get page table pointer */ + tptep = (pte_t *) gmap_table_walk(sg, saddr, 0); + if (!tptep) { + spin_unlock(&sg->guest_table_lock); + gmap_pte_op_end(sptep, ptl); + radix_tree_preload_end(); + break; + } + rc = ptep_shadow_pte(sg->mm, saddr, sptep, tptep, pte); + if (rc > 0) { + /* Success and a new mapping */ + gmap_insert_rmap(sg, vmaddr, rmap); + rmap = NULL; + rc = 0; + } + gmap_pte_op_end(sptep, ptl); + spin_unlock(&sg->guest_table_lock); + } + radix_tree_preload_end(); + if (!rc) + break; + rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot); + if (rc) + break; + } + kfree(rmap); + return rc; +} +EXPORT_SYMBOL_GPL(gmap_shadow_page); + +/* + * gmap_shadow_notify - handle notifications for shadow gmap + * + * Called with sg->parent->shadow_lock. + */ +static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr, + unsigned long gaddr) +{ + struct gmap_rmap *rmap, *rnext, *head; + unsigned long start, end, bits, raddr; + + BUG_ON(!gmap_is_shadow(sg)); + + spin_lock(&sg->guest_table_lock); + if (sg->removed) { + spin_unlock(&sg->guest_table_lock); + return; + } + /* Check for top level table */ + start = sg->orig_asce & _ASCE_ORIGIN; + end = start + ((sg->orig_asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE; + if (!(sg->orig_asce & _ASCE_REAL_SPACE) && gaddr >= start && + gaddr < end) { + /* The complete shadow table has to go */ + gmap_unshadow(sg); + spin_unlock(&sg->guest_table_lock); + list_del(&sg->list); + gmap_put(sg); + return; + } + /* Remove the page table tree from on specific entry */ + head = radix_tree_delete(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT); + gmap_for_each_rmap_safe(rmap, rnext, head) { + bits = rmap->raddr & _SHADOW_RMAP_MASK; + raddr = rmap->raddr ^ bits; + switch (bits) { + case _SHADOW_RMAP_REGION1: + gmap_unshadow_r2t(sg, raddr); + break; + case _SHADOW_RMAP_REGION2: + gmap_unshadow_r3t(sg, raddr); + break; + case _SHADOW_RMAP_REGION3: + gmap_unshadow_sgt(sg, raddr); + break; + case _SHADOW_RMAP_SEGMENT: + gmap_unshadow_pgt(sg, raddr); + break; + case _SHADOW_RMAP_PGTABLE: + gmap_unshadow_page(sg, raddr); + break; + } + kfree(rmap); + } + spin_unlock(&sg->guest_table_lock); +} + +/** + * ptep_notify - call all invalidation callbacks for a specific pte. + * @mm: pointer to the process mm_struct + * @vmaddr: virtual address in the process address space + * @pte: pointer to the page table entry + * @bits: bits from the pgste that caused the notify call + * + * This function is assumed to be called with the page table lock held + * for the pte to notify. 
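+ *
+ * The offset arithmetic at the start of this function converts the position
+ * of @pte within its 256-entry page table into a guest address offset:
+ * entry index i sits at byte offset i * sizeof(pte_t) = 8 * i, and
+ * multiplying by PAGE_SIZE / sizeof(pte_t) = 512 turns that into
+ * i * PAGE_SIZE. Worked example: index 5 -> byte offset 40 ->
+ * 40 * 512 = 20480 = 5 * 4096.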
+ */ +void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, + pte_t *pte, unsigned long bits) +{ + unsigned long offset, gaddr = 0; + struct gmap *gmap, *sg, *next; + + offset = ((unsigned long) pte) & (255 * sizeof(pte_t)); + offset = offset * (PAGE_SIZE / sizeof(pte_t)); + rcu_read_lock(); + list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { + spin_lock(&gmap->guest_table_lock); + gaddr = host_to_guest_lookup(gmap, vmaddr) + offset; + spin_unlock(&gmap->guest_table_lock); + if (!IS_GADDR_VALID(gaddr)) + continue; + + if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) { + spin_lock(&gmap->shadow_lock); + list_for_each_entry_safe(sg, next, + &gmap->children, list) + gmap_shadow_notify(sg, vmaddr, gaddr); + spin_unlock(&gmap->shadow_lock); + } + if (bits & PGSTE_IN_BIT) + gmap_call_notifier(gmap, gaddr, gaddr + PAGE_SIZE - 1); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(ptep_notify); + +static void pmdp_notify_gmap(struct gmap *gmap, pmd_t *pmdp, + unsigned long gaddr) +{ + set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN))); + gmap_call_notifier(gmap, gaddr, gaddr + HPAGE_SIZE - 1); +} + +/** + * gmap_pmdp_xchg - exchange a gmap pmd with another + * @gmap: pointer to the guest address space structure + * @pmdp: pointer to the pmd entry + * @new: replacement entry + * @gaddr: the affected guest address + * + * This function is assumed to be called with the guest_table_lock + * held. + */ +static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new, + unsigned long gaddr) +{ + gaddr &= HPAGE_MASK; + pmdp_notify_gmap(gmap, pmdp, gaddr); + new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_GMAP_IN)); + if (machine_has_tlb_guest()) + __pmdp_idte(gaddr, (pmd_t *)pmdp, IDTE_GUEST_ASCE, gmap->asce, + IDTE_GLOBAL); + else + __pmdp_idte(gaddr, (pmd_t *)pmdp, 0, 0, IDTE_GLOBAL); + set_pmd(pmdp, new); +} + +static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr, + int purge) +{ + pmd_t *pmdp; + struct gmap *gmap; + unsigned long gaddr; + + rcu_read_lock(); + list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { + spin_lock(&gmap->guest_table_lock); + pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr); + if (pmdp) { + pmdp_notify_gmap(gmap, pmdp, gaddr); + WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | + _SEGMENT_ENTRY_GMAP_UC | + _SEGMENT_ENTRY)); + if (purge) + __pmdp_cspg(pmdp); + set_pmd(pmdp, __pmd(_SEGMENT_ENTRY_EMPTY)); + } + spin_unlock(&gmap->guest_table_lock); + } + rcu_read_unlock(); +} + +/** + * gmap_pmdp_invalidate - invalidate all affected guest pmd entries without + * flushing + * @mm: pointer to the process mm_struct + * @vmaddr: virtual address in the process address space + */ +void gmap_pmdp_invalidate(struct mm_struct *mm, unsigned long vmaddr) +{ + gmap_pmdp_clear(mm, vmaddr, 0); +} +EXPORT_SYMBOL_GPL(gmap_pmdp_invalidate); + +/** + * gmap_pmdp_idte_local - invalidate and clear a guest pmd entry + * @mm: pointer to the process mm_struct + * @vmaddr: virtual address in the process address space + */ +void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr) +{ + unsigned long gaddr; + struct gmap *gmap; + pmd_t *pmdp; + + rcu_read_lock(); + list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { + spin_lock(&gmap->guest_table_lock); + pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr); + if (pmdp) { + pmdp_notify_gmap(gmap, pmdp, gaddr); + WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | + _SEGMENT_ENTRY_GMAP_UC | + _SEGMENT_ENTRY)); + if 
(machine_has_tlb_guest()) + __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE, + gmap->asce, IDTE_LOCAL); + else + __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_LOCAL); + *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY); + } + spin_unlock(&gmap->guest_table_lock); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(gmap_pmdp_idte_local); + +/** + * gmap_pmdp_idte_global - invalidate and clear a guest pmd entry + * @mm: pointer to the process mm_struct + * @vmaddr: virtual address in the process address space + */ +void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr) +{ + unsigned long gaddr; + struct gmap *gmap; + pmd_t *pmdp; + + rcu_read_lock(); + list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { + spin_lock(&gmap->guest_table_lock); + pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr); + if (pmdp) { + pmdp_notify_gmap(gmap, pmdp, gaddr); + WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | + _SEGMENT_ENTRY_GMAP_UC | + _SEGMENT_ENTRY)); + if (machine_has_tlb_guest()) + __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE, + gmap->asce, IDTE_GLOBAL); + else + __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_GLOBAL); + *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY); + } + spin_unlock(&gmap->guest_table_lock); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(gmap_pmdp_idte_global); + +/** + * gmap_test_and_clear_dirty_pmd - test and reset segment dirty status + * @gmap: pointer to guest address space + * @pmdp: pointer to the pmd to be tested + * @gaddr: virtual address in the guest address space + * + * This function is assumed to be called with the guest_table_lock + * held. + */ +static bool gmap_test_and_clear_dirty_pmd(struct gmap *gmap, pmd_t *pmdp, + unsigned long gaddr) +{ + if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID) + return false; + + /* Already protected memory, which did not change is clean */ + if (pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT && + !(pmd_val(*pmdp) & _SEGMENT_ENTRY_GMAP_UC)) + return false; + + /* Clear UC indication and reset protection */ + set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_UC))); + gmap_protect_pmd(gmap, gaddr, pmdp, PROT_READ, 0); + return true; +} + +/** + * gmap_sync_dirty_log_pmd - set bitmap based on dirty status of segment + * @gmap: pointer to guest address space + * @bitmap: dirty bitmap for this pmd + * @gaddr: virtual address in the guest address space + * @vmaddr: virtual address in the host address space + * + * This function is assumed to be called with the guest_table_lock + * held. 
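+ *
+ * A minimal caller sketch (illustrative only; kvm, gaddr and vmaddr are
+ * assumed to be set up by the caller). @bitmap covers one 1 MB segment,
+ * one bit per 4 KB page:
+ *
+ *	unsigned long bitmap[4] = {};
+ *	unsigned int i;
+ *
+ *	gmap_sync_dirty_log_pmd(gmap, bitmap, gaddr, vmaddr);
+ *	for_each_set_bit(i, bitmap, _PAGE_ENTRIES)
+ *		mark_page_dirty(kvm, gpa_to_gfn(gaddr) + i);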
+ */ +void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4], + unsigned long gaddr, unsigned long vmaddr) +{ + int i; + pmd_t *pmdp; + pte_t *ptep; + spinlock_t *ptl; + + pmdp = gmap_pmd_op_walk(gmap, gaddr); + if (!pmdp) + return; + + if (pmd_leaf(*pmdp)) { + if (gmap_test_and_clear_dirty_pmd(gmap, pmdp, gaddr)) + bitmap_fill(bitmap, _PAGE_ENTRIES); + } else { + for (i = 0; i < _PAGE_ENTRIES; i++, vmaddr += PAGE_SIZE) { + ptep = pte_alloc_map_lock(gmap->mm, pmdp, vmaddr, &ptl); + if (!ptep) + continue; + if (ptep_test_and_clear_uc(gmap->mm, vmaddr, ptep)) + set_bit(i, bitmap); + pte_unmap_unlock(ptep, ptl); + } + } + gmap_pmd_op_end(gmap, pmdp); +} +EXPORT_SYMBOL_GPL(gmap_sync_dirty_log_pmd); + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static int thp_split_walk_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + struct vm_area_struct *vma = walk->vma; + + split_huge_pmd(vma, pmd, addr); + return 0; +} + +static const struct mm_walk_ops thp_split_walk_ops = { + .pmd_entry = thp_split_walk_pmd_entry, + .walk_lock = PGWALK_WRLOCK_VERIFY, +}; + +static inline void thp_split_mm(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); + + for_each_vma(vmi, vma) { + vm_flags_mod(vma, VM_NOHUGEPAGE, VM_HUGEPAGE); + walk_page_vma(vma, &thp_split_walk_ops, NULL); + } + mm->def_flags |= VM_NOHUGEPAGE; +} +#else +static inline void thp_split_mm(struct mm_struct *mm) +{ +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +/* + * switch on pgstes for its userspace process (for kvm) + */ +int s390_enable_sie(void) +{ + struct mm_struct *mm = current->mm; + + /* Do we have pgstes? if yes, we are done */ + if (mm_has_pgste(mm)) + return 0; + mmap_write_lock(mm); + mm->context.has_pgste = 1; + /* split thp mappings and disable thp for future mappings */ + thp_split_mm(mm); + mmap_write_unlock(mm); + return 0; +} +EXPORT_SYMBOL_GPL(s390_enable_sie); + +/* + * Enable storage key handling from now on and initialize the storage + * keys with the default key. + */ +static int __s390_enable_skey_pte(pte_t *pte, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + /* Clear storage key */ + ptep_zap_key(walk->mm, addr, pte); + return 0; +} + +/* + * Give a chance to schedule after setting a key to 256 pages. + * We only hold the mm lock, which is a rwsem and the kvm srcu. + * Both can sleep. + */ +static int __s390_enable_skey_pmd(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + cond_resched(); + return 0; +} + +static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr, + unsigned long hmask, unsigned long next, + struct mm_walk *walk) +{ + pmd_t *pmd = (pmd_t *)pte; + unsigned long start, end; + struct folio *folio = page_folio(pmd_page(*pmd)); + + /* + * The write check makes sure we do not set a key on shared + * memory. This is needed as the walker does not differentiate + * between actual guest memory and the process executable or + * shared libraries. 
+ */ + if (pmd_val(*pmd) & _SEGMENT_ENTRY_INVALID || + !(pmd_val(*pmd) & _SEGMENT_ENTRY_WRITE)) + return 0; + + start = pmd_val(*pmd) & HPAGE_MASK; + end = start + HPAGE_SIZE; + __storage_key_init_range(start, end); + set_bit(PG_arch_1, &folio->flags.f); + cond_resched(); + return 0; +} + +static const struct mm_walk_ops enable_skey_walk_ops = { + .hugetlb_entry = __s390_enable_skey_hugetlb, + .pte_entry = __s390_enable_skey_pte, + .pmd_entry = __s390_enable_skey_pmd, + .walk_lock = PGWALK_WRLOCK, +}; + +int s390_enable_skey(void) +{ + struct mm_struct *mm = current->mm; + int rc = 0; + + mmap_write_lock(mm); + if (mm_uses_skeys(mm)) + goto out_up; + + mm->context.uses_skeys = 1; + rc = gmap_helper_disable_cow_sharing(); + if (rc) { + mm->context.uses_skeys = 0; + goto out_up; + } + walk_page_range(mm, 0, TASK_SIZE, &enable_skey_walk_ops, NULL); + +out_up: + mmap_write_unlock(mm); + return rc; +} +EXPORT_SYMBOL_GPL(s390_enable_skey); + +/* + * Reset CMMA state, make all pages stable again. + */ +static int __s390_reset_cmma(pte_t *pte, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + ptep_zap_unused(walk->mm, addr, pte, 1); + return 0; +} + +static const struct mm_walk_ops reset_cmma_walk_ops = { + .pte_entry = __s390_reset_cmma, + .walk_lock = PGWALK_WRLOCK, +}; + +void s390_reset_cmma(struct mm_struct *mm) +{ + mmap_write_lock(mm); + walk_page_range(mm, 0, TASK_SIZE, &reset_cmma_walk_ops, NULL); + mmap_write_unlock(mm); +} +EXPORT_SYMBOL_GPL(s390_reset_cmma); + +#define GATHER_GET_PAGES 32 + +struct reset_walk_state { + unsigned long next; + unsigned long count; + unsigned long pfns[GATHER_GET_PAGES]; +}; + +static int s390_gather_pages(pte_t *ptep, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct reset_walk_state *p = walk->private; + pte_t pte = READ_ONCE(*ptep); + + if (pte_present(pte)) { + /* we have a reference from the mapping, take an extra one */ + get_page(phys_to_page(pte_val(pte))); + p->pfns[p->count] = phys_to_pfn(pte_val(pte)); + p->next = next; + p->count++; + } + return p->count >= GATHER_GET_PAGES; +} + +static const struct mm_walk_ops gather_pages_ops = { + .pte_entry = s390_gather_pages, + .walk_lock = PGWALK_RDLOCK, +}; + +/* + * Call the Destroy secure page UVC on each page in the given array of PFNs. + * Each page needs to have an extra reference, which will be released here. + */ +void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns) +{ + struct folio *folio; + unsigned long i; + + for (i = 0; i < count; i++) { + folio = pfn_folio(pfns[i]); + /* we always have an extra reference */ + uv_destroy_folio(folio); + /* get rid of the extra reference */ + folio_put(folio); + cond_resched(); + } +} +EXPORT_SYMBOL_GPL(s390_uv_destroy_pfns); + +/** + * __s390_uv_destroy_range - Call the destroy secure page UVC on each page + * in the given range of the given address space. + * @mm: the mm to operate on + * @start: the start of the range + * @end: the end of the range + * @interruptible: if not 0, stop when a fatal signal is received + * + * Walk the given range of the given address space and call the destroy + * secure page UVC on each page. Optionally exit early if a fatal signal is + * pending. 
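+ *
+ * A minimal usage sketch (illustrative, error handling elided): destroying
+ * all secure pages of an entire address space could look like
+ *
+ *	__s390_uv_destroy_range(mm, 0, TASK_SIZE, false);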
+ * + * Return: 0 on success, -EINTR if the function stopped before completing + */ +int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start, + unsigned long end, bool interruptible) +{ + struct reset_walk_state state = { .next = start }; + int r = 1; + + while (r > 0) { + state.count = 0; + mmap_read_lock(mm); + r = walk_page_range(mm, state.next, end, &gather_pages_ops, &state); + mmap_read_unlock(mm); + cond_resched(); + s390_uv_destroy_pfns(state.count, state.pfns); + if (interruptible && fatal_signal_pending(current)) + return -EINTR; + } + return 0; +} +EXPORT_SYMBOL_GPL(__s390_uv_destroy_range); + +/** + * s390_replace_asce - Try to replace the current ASCE of a gmap with a copy + * @gmap: the gmap whose ASCE needs to be replaced + * + * If the ASCE is a SEGMENT type then this function will return -EINVAL, + * otherwise the pointers in the host_to_guest radix tree will keep pointing + * to the wrong pages, causing use-after-free and memory corruption. + * If the allocation of the new top level page table fails, the ASCE is not + * replaced. + * In any case, the old ASCE is always removed from the gmap CRST list. + * Therefore the caller has to make sure to save a pointer to it + * beforehand, unless a leak is actually intended. + */ +int s390_replace_asce(struct gmap *gmap) +{ + unsigned long asce; + struct page *page; + void *table; + + /* Replacing segment type ASCEs would cause serious issues */ + if ((gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT) + return -EINVAL; + + page = gmap_alloc_crst(); + if (!page) + return -ENOMEM; + table = page_to_virt(page); + memcpy(table, gmap->table, 1UL << (CRST_ALLOC_ORDER + PAGE_SHIFT)); + + /* Set new table origin while preserving existing ASCE control bits */ + asce = (gmap->asce & ~_ASCE_ORIGIN) | __pa(table); + WRITE_ONCE(gmap->asce, asce); + WRITE_ONCE(gmap->mm->context.gmap_asce, asce); + WRITE_ONCE(gmap->table, table); + + return 0; +} +EXPORT_SYMBOL_GPL(s390_replace_asce); diff --git a/arch/s390/mm/gmap_helpers.c b/arch/s390/mm/gmap_helpers.c new file mode 100644 index 000000000000..549f14ad08af --- /dev/null +++ b/arch/s390/mm/gmap_helpers.c @@ -0,0 +1,233 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Helper functions for KVM guest address space mapping code + * + * Copyright IBM Corp. 2007, 2025 + */ + +#include <linux/export.h> +#include <linux/mm_types.h> +#include <linux/mmap_lock.h> +#include <linux/mm.h> +#include <linux/hugetlb.h> +#include <linux/swap.h> +#include <linux/leafops.h> +#include <linux/pagewalk.h> +#include <linux/ksm.h> +#include <asm/gmap_helpers.h> +#include <asm/pgtable.h> + +/** + * ptep_zap_softleaf_entry() - discard a software leaf entry. + * @mm: the mm + * @entry: the software leaf entry that needs to be zapped + * + * Discards the given software leaf entry. If the leaf entry was an actual + * swap entry (and not a migration entry, for example), the actual swapped + * page is also discarded from swap. + */ +static void ptep_zap_softleaf_entry(struct mm_struct *mm, softleaf_t entry) +{ + if (softleaf_is_swap(entry)) + dec_mm_counter(mm, MM_SWAPENTS); + else if (softleaf_is_migration(entry)) + dec_mm_counter(mm, mm_counter(softleaf_to_folio(entry))); + free_swap_and_cache(entry); +} + +/** + * gmap_helper_zap_one_page() - discard a page if it was swapped. + * @mm: the mm + * @vmaddr: the userspace virtual address that needs to be discarded + * + * If the given address maps to a swap entry, discard it. + * + * Context: needs to be called while holding the mmap lock. 
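+ *
+ * A minimal usage sketch (illustrative):
+ *
+ *	mmap_read_lock(mm);
+ *	gmap_helper_zap_one_page(mm, vmaddr);
+ *	mmap_read_unlock(mm);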
+ */
+void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr)
+{
+	struct vm_area_struct *vma;
+	spinlock_t *ptl;
+	pgste_t pgste;
+	pte_t *ptep;
+
+	mmap_assert_locked(mm);
+
+	/* Find the vm address for the guest address */
+	vma = vma_lookup(mm, vmaddr);
+	if (!vma || is_vm_hugetlb_page(vma))
+		return;
+
+	/* Get pointer to the page table entry */
+	ptep = get_locked_pte(mm, vmaddr, &ptl);
+	if (unlikely(!ptep))
+		return;
+	if (pte_swap(*ptep)) {
+		preempt_disable();
+		pgste = pgste_get_lock(ptep);
+
+		ptep_zap_softleaf_entry(mm, softleaf_from_pte(*ptep));
+		pte_clear(mm, vmaddr, ptep);
+
+		pgste_set_unlock(ptep, pgste);
+		preempt_enable();
+	}
+	pte_unmap_unlock(ptep, ptl);
+}
+EXPORT_SYMBOL_GPL(gmap_helper_zap_one_page);
+
+/**
+ * gmap_helper_discard() - discard user pages in the given range
+ * @mm: the mm
+ * @vmaddr: starting userspace address
+ * @end: end address (first address outside the range)
+
+ * All userspace pages in the range [@vmaddr, @end) are discarded and unmapped.
+ *
+ * Context: needs to be called while holding the mmap lock.
+ */
+void gmap_helper_discard(struct mm_struct *mm, unsigned long vmaddr, unsigned long end)
+{
+	struct vm_area_struct *vma;
+
+	mmap_assert_locked(mm);
+
+	while (vmaddr < end) {
+		vma = find_vma_intersection(mm, vmaddr, end);
+		if (!vma)
+			return;
+		if (!is_vm_hugetlb_page(vma))
+			zap_page_range_single(vma, vmaddr, min(end, vma->vm_end) - vmaddr, NULL);
+		vmaddr = vma->vm_end;
+	}
+}
+EXPORT_SYMBOL_GPL(gmap_helper_discard);
+
+static int find_zeropage_pte_entry(pte_t *pte, unsigned long addr,
+				   unsigned long end, struct mm_walk *walk)
+{
+	unsigned long *found_addr = walk->private;
+
+	/* Return 1 if the page is a zeropage. */
+	if (is_zero_pfn(pte_pfn(*pte))) {
+		/*
+		 * Shared zeropage in e.g., a FS DAX mapping? We cannot do the
+		 * right thing and likely don't care: FAULT_FLAG_UNSHARE
+		 * currently only works in COW mappings, which is also where
+		 * mm_forbids_zeropage() is checked.
+		 */
+		if (!is_cow_mapping(walk->vma->vm_flags))
+			return -EFAULT;
+
+		*found_addr = addr;
+		return 1;
+	}
+	return 0;
+}
+
+static const struct mm_walk_ops find_zeropage_ops = {
+	.pte_entry = find_zeropage_pte_entry,
+	.walk_lock = PGWALK_WRLOCK,
+};
+
+/** __gmap_helper_unshare_zeropages() - unshare all shared zeropages
+ * @mm: the mm whose zero pages are to be unshared
+ *
+ * Unshare all shared zeropages, replacing them by anonymous pages. Note that
+ * we cannot simply zap all shared zeropages, because this could later
+ * trigger unexpected userfaultfd missing events.
+ *
+ * This must be called after mm->context.allow_cow_sharing was
+ * set to 0, to avoid future mappings of shared zeropages.
+ *
+ * mm contracts with s390, that even if mm were to remove a page table,
+ * and racing with walk_page_range_vma() calling pte_offset_map_lock()
+ * would fail, it will never insert a page table containing empty zero
+ * pages once mm_forbids_zeropage(mm) i.e.
+ * mm->context.allow_cow_sharing is set to 0.
+ */
+static int __gmap_helper_unshare_zeropages(struct mm_struct *mm)
+{
+	struct vm_area_struct *vma;
+	VMA_ITERATOR(vmi, mm, 0);
+	unsigned long addr;
+	vm_fault_t fault;
+	int rc;
+
+	for_each_vma(vmi, vma) {
+		/*
+		 * We could only look at COW mappings, but it's more future
+		 * proof to catch unexpected zeropages in other mappings and
+		 * fail.
+		 */
+		if ((vma->vm_flags & VM_PFNMAP) || is_vm_hugetlb_page(vma))
+			continue;
+		addr = vma->vm_start;
+
+retry:
+		rc = walk_page_range_vma(vma, addr, vma->vm_end,
+					 &find_zeropage_ops, &addr);
+		if (rc < 0)
+			return rc;
+		else if (!rc)
+			continue;
+
+		/* addr was updated by find_zeropage_pte_entry() */
+		fault = handle_mm_fault(vma, addr,
+					FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE,
+					NULL);
+		if (fault & VM_FAULT_OOM)
+			return -ENOMEM;
+		/*
+		 * See break_ksm(): even after handle_mm_fault() returned 0, we
+		 * must start the lookup from the current address, because
+		 * handle_mm_fault() may back out if there's any difficulty.
+		 *
+		 * VM_FAULT_SIGBUS and VM_FAULT_SIGSEGV are unexpected but
+		 * maybe they could trigger in the future on concurrent
+		 * truncation. In that case, the shared zeropage would be gone
+		 * and we can simply retry and make progress.
+		 */
+		cond_resched();
+		goto retry;
+	}
+
+	return 0;
+}
+
+/**
+ * gmap_helper_disable_cow_sharing() - disable all COW sharing
+ *
+ * Disable most COW-sharing of memory pages for the whole process:
+ * (1) Disable KSM and unmerge/unshare any KSM pages.
+ * (2) Disallow shared zeropages and unshare any zeropages that are mapped.
+ *
+ * Note that we currently don't bother with COW-shared pages that are shared
+ * with parent/child processes due to fork().
+ */
+int gmap_helper_disable_cow_sharing(void)
+{
+	struct mm_struct *mm = current->mm;
+	int rc;
+
+	mmap_assert_write_locked(mm);
+
+	if (!mm->context.allow_cow_sharing)
+		return 0;
+
+	mm->context.allow_cow_sharing = 0;
+
+	/* Replace all shared zeropages by anonymous pages. */
+	rc = __gmap_helper_unshare_zeropages(mm);
+	/*
+	 * Make sure to disable KSM (if enabled for the whole process or
+	 * individual VMAs). Note that nothing currently hinders user space
+	 * from re-enabling it.
+	 */
+	if (!rc)
+		rc = ksm_disable(mm);
+	if (rc)
+		mm->context.allow_cow_sharing = 1;
+	return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_helper_disable_cow_sharing);
diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c
deleted file mode 100644
index 1f5315d1215c..000000000000
--- a/arch/s390/mm/gup.c
+++ /dev/null
@@ -1,281 +0,0 @@
-/*
- * Lockless get_user_pages_fast for s390
- *
- * Copyright IBM Corp. 2010
- * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
- */
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/hugetlb.h>
-#include <linux/vmstat.h>
-#include <linux/pagemap.h>
-#include <linux/rwsem.h>
-#include <asm/pgtable.h>
-
-/*
- * The performance critical leaf functions are made noinline otherwise gcc
- * inlines everything into a single function which results in too much
- * register pressure.
- */
-static inline int gup_pte_range(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
-		unsigned long end, int write, struct page **pages, int *nr)
-{
-	unsigned long mask;
-	pte_t *ptep, pte;
-	struct page *page;
-
-	mask = (write ?
_PAGE_RO : 0) | _PAGE_INVALID | _PAGE_SPECIAL; - - ptep = ((pte_t *) pmd_deref(pmd)) + pte_index(addr); - do { - pte = *ptep; - barrier(); - if ((pte_val(pte) & mask) != 0) - return 0; - VM_BUG_ON(!pfn_valid(pte_pfn(pte))); - page = pte_page(pte); - if (!page_cache_get_speculative(page)) - return 0; - if (unlikely(pte_val(pte) != pte_val(*ptep))) { - put_page(page); - return 0; - } - pages[*nr] = page; - (*nr)++; - - } while (ptep++, addr += PAGE_SIZE, addr != end); - - return 1; -} - -static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) -{ - unsigned long mask, result; - struct page *head, *page, *tail; - int refs; - - result = write ? 0 : _SEGMENT_ENTRY_RO; - mask = result | _SEGMENT_ENTRY_INV; - if ((pmd_val(pmd) & mask) != result) - return 0; - VM_BUG_ON(!pfn_valid(pmd_val(pmd) >> PAGE_SHIFT)); - - refs = 0; - head = pmd_page(pmd); - page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); - tail = page; - do { - VM_BUG_ON(compound_head(page) != head); - pages[*nr] = page; - (*nr)++; - page++; - refs++; - } while (addr += PAGE_SIZE, addr != end); - - if (!page_cache_add_speculative(head, refs)) { - *nr -= refs; - return 0; - } - - if (unlikely(pmd_val(pmd) != pmd_val(*pmdp))) { - *nr -= refs; - while (refs--) - put_page(head); - return 0; - } - - /* - * Any tail page need their mapcount reference taken before we - * return. - */ - while (refs--) { - if (PageTail(tail)) - get_huge_page_tail(tail); - tail++; - } - - return 1; -} - - -static inline int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) -{ - unsigned long next; - pmd_t *pmdp, pmd; - - pmdp = (pmd_t *) pudp; -#ifdef CONFIG_64BIT - if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) - pmdp = (pmd_t *) pud_deref(pud); - pmdp += pmd_index(addr); -#endif - do { - pmd = *pmdp; - barrier(); - next = pmd_addr_end(addr, end); - /* - * The pmd_trans_splitting() check below explains why - * pmdp_splitting_flush() has to serialize with - * smp_call_function() against our disabled IRQs, to stop - * this gup-fast code from running while we set the - * splitting bit in the pmd. Returning zero will take - * the slow path that will call wait_split_huge_page() - * if the pmd is still in splitting state. - */ - if (pmd_none(pmd) || pmd_trans_splitting(pmd)) - return 0; - if (unlikely(pmd_large(pmd))) { - if (!gup_huge_pmd(pmdp, pmd, addr, next, - write, pages, nr)) - return 0; - } else if (!gup_pte_range(pmdp, pmd, addr, next, - write, pages, nr)) - return 0; - } while (pmdp++, addr = next, addr != end); - - return 1; -} - -static inline int gup_pud_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) -{ - unsigned long next; - pud_t *pudp, pud; - - pudp = (pud_t *) pgdp; -#ifdef CONFIG_64BIT - if ((pgd_val(pgd) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R2) - pudp = (pud_t *) pgd_deref(pgd); - pudp += pud_index(addr); -#endif - do { - pud = *pudp; - barrier(); - next = pud_addr_end(addr, end); - if (pud_none(pud)) - return 0; - if (!gup_pmd_range(pudp, pud, addr, next, write, pages, nr)) - return 0; - } while (pudp++, addr = next, addr != end); - - return 1; -} - -/* - * Like get_user_pages_fast() except its IRQ-safe in that it won't fall - * back to the regular GUP. 
- */ -int __get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) -{ - struct mm_struct *mm = current->mm; - unsigned long addr, len, end; - unsigned long next, flags; - pgd_t *pgdp, pgd; - int nr = 0; - - start &= PAGE_MASK; - addr = start; - len = (unsigned long) nr_pages << PAGE_SHIFT; - end = start + len; - if ((end < start) || (end > TASK_SIZE)) - return 0; - - local_irq_save(flags); - pgdp = pgd_offset(mm, addr); - do { - pgd = *pgdp; - barrier(); - next = pgd_addr_end(addr, end); - if (pgd_none(pgd)) - break; - if (!gup_pud_range(pgdp, pgd, addr, next, write, pages, &nr)) - break; - } while (pgdp++, addr = next, addr != end); - local_irq_restore(flags); - - return nr; -} - -/** - * get_user_pages_fast() - pin user pages in memory - * @start: starting user address - * @nr_pages: number of pages from start to pin - * @write: whether pages will be written to - * @pages: array that receives pointers to the pages pinned. - * Should be at least nr_pages long. - * - * Attempt to pin user pages in memory without taking mm->mmap_sem. - * If not successful, it will fall back to taking the lock and - * calling get_user_pages(). - * - * Returns number of pages pinned. This may be fewer than the number - * requested. If nr_pages is 0 or negative, returns 0. If no pages - * were pinned, returns -errno. - */ -int get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) -{ - struct mm_struct *mm = current->mm; - unsigned long addr, len, end; - unsigned long next; - pgd_t *pgdp, pgd; - int nr = 0; - - start &= PAGE_MASK; - addr = start; - len = (unsigned long) nr_pages << PAGE_SHIFT; - end = start + len; - if ((end < start) || (end > TASK_SIZE)) - goto slow_irqon; - - /* - * local_irq_disable() doesn't prevent pagetable teardown, but does - * prevent the pagetables from being freed on s390. - * - * So long as we atomically load page table pointers versus teardown, - * we can follow the address down to the the page and take a ref on it. - */ - local_irq_disable(); - pgdp = pgd_offset(mm, addr); - do { - pgd = *pgdp; - barrier(); - next = pgd_addr_end(addr, end); - if (pgd_none(pgd)) - goto slow; - if (!gup_pud_range(pgdp, pgd, addr, next, write, pages, &nr)) - goto slow; - } while (pgdp++, addr = next, addr != end); - local_irq_enable(); - - VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT); - return nr; - - { - int ret; -slow: - local_irq_enable(); -slow_irqon: - /* Try to get the remaining pages with get_user_pages */ - start += nr << PAGE_SHIFT; - pages += nr; - - down_read(&mm->mmap_sem); - ret = get_user_pages(current, mm, start, - (end - start) >> PAGE_SHIFT, write, 0, pages, NULL); - up_read(&mm->mmap_sem); - - /* Have to be a bit careful with return values */ - if (nr > 0) { - if (ret < 0) - ret = nr; - else - ret += nr; - } - - return ret; - } -} diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index 121089d57802..d42e61c7594e 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -1,132 +1,257 @@ +// SPDX-License-Identifier: GPL-2.0 /* * IBM System z Huge TLB Page Support for Kernel. * - * Copyright IBM Corp. 2007 + * Copyright IBM Corp. 
2007,2020 * Author(s): Gerald Schaefer <gerald.schaefer@de.ibm.com> */ +#define pr_fmt(fmt) "hugetlb: " fmt + +#include <linux/cpufeature.h> #include <linux/mm.h> #include <linux/hugetlb.h> +#include <linux/mman.h> +#include <linux/sched/mm.h> +#include <linux/security.h> +#include <asm/pgalloc.h> +/* + * If the bit selected by single-bit bitmask "a" is set within "x", move + * it to the position indicated by single-bit bitmask "b". + */ +#define move_set_bit(x, a, b) (((x) & (a)) >> ilog2(a) << ilog2(b)) -void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *pteptr, pte_t pteval) +static inline unsigned long __pte_to_rste(pte_t pte) { - pmd_t *pmdp = (pmd_t *) pteptr; - unsigned long mask; - - if (!MACHINE_HAS_HPAGE) { - pteptr = (pte_t *) pte_page(pteval)[1].index; - mask = pte_val(pteval) & - (_SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO); - pte_val(pteval) = (_SEGMENT_ENTRY + __pa(pteptr)) | mask; - } + swp_entry_t arch_entry; + unsigned long rste; - pmd_val(*pmdp) = pte_val(pteval); + /* + * Convert encoding pte bits pmd / pud bits + * lIR.uswrdy.p dy..R...I...wr + * empty 010.000000.0 -> 00..0...1...00 + * prot-none, clean, old 111.000000.1 -> 00..1...1...00 + * prot-none, clean, young 111.000001.1 -> 01..1...1...00 + * prot-none, dirty, old 111.000010.1 -> 10..1...1...00 + * prot-none, dirty, young 111.000011.1 -> 11..1...1...00 + * read-only, clean, old 111.000100.1 -> 00..1...1...01 + * read-only, clean, young 101.000101.1 -> 01..1...0...01 + * read-only, dirty, old 111.000110.1 -> 10..1...1...01 + * read-only, dirty, young 101.000111.1 -> 11..1...0...01 + * read-write, clean, old 111.001100.1 -> 00..1...1...11 + * read-write, clean, young 101.001101.1 -> 01..1...0...11 + * read-write, dirty, old 110.001110.1 -> 10..0...1...11 + * read-write, dirty, young 100.001111.1 -> 11..0...0...11 + * HW-bits: R read-only, I invalid + * SW-bits: p present, y young, d dirty, r read, w write, s special, + * u unused, l large + */ + if (pte_present(pte)) { + rste = pte_val(pte) & PAGE_MASK; + rste |= _SEGMENT_ENTRY_PRESENT; + rste |= move_set_bit(pte_val(pte), _PAGE_READ, + _SEGMENT_ENTRY_READ); + rste |= move_set_bit(pte_val(pte), _PAGE_WRITE, + _SEGMENT_ENTRY_WRITE); + rste |= move_set_bit(pte_val(pte), _PAGE_INVALID, + _SEGMENT_ENTRY_INVALID); + rste |= move_set_bit(pte_val(pte), _PAGE_PROTECT, + _SEGMENT_ENTRY_PROTECT); + rste |= move_set_bit(pte_val(pte), _PAGE_DIRTY, + _SEGMENT_ENTRY_DIRTY); + rste |= move_set_bit(pte_val(pte), _PAGE_YOUNG, + _SEGMENT_ENTRY_YOUNG); +#ifdef CONFIG_MEM_SOFT_DIRTY + rste |= move_set_bit(pte_val(pte), _PAGE_SOFT_DIRTY, + _SEGMENT_ENTRY_SOFT_DIRTY); +#endif + rste |= move_set_bit(pte_val(pte), _PAGE_NOEXEC, + _SEGMENT_ENTRY_NOEXEC); + } else if (!pte_none(pte)) { + /* swap pte */ + arch_entry = __pte_to_swp_entry(pte); + rste = mk_swap_rste(__swp_type(arch_entry), __swp_offset(arch_entry)); + } else + rste = _SEGMENT_ENTRY_EMPTY; + return rste; } -int arch_prepare_hugepage(struct page *page) +static inline pte_t __rste_to_pte(unsigned long rste) { - unsigned long addr = page_to_phys(page); + swp_entry_t arch_entry; + unsigned long pteval; + int present, none; pte_t pte; - pte_t *ptep; - int i; - if (MACHINE_HAS_HPAGE) - return 0; + if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) { + present = pud_present(__pud(rste)); + none = pud_none(__pud(rste)); + } else { + present = pmd_present(__pmd(rste)); + none = pmd_none(__pmd(rste)); + } - ptep = (pte_t *) pte_alloc_one(&init_mm, addr); - if (!ptep) - return -ENOMEM; + /* + * Convert 
encoding pmd / pud bits pte bits + * dy..R...I...wr lIR.uswrdy.p + * empty 00..0...1...00 -> 010.000000.0 + * prot-none, clean, old 00..1...1...00 -> 111.000000.1 + * prot-none, clean, young 01..1...1...00 -> 111.000001.1 + * prot-none, dirty, old 10..1...1...00 -> 111.000010.1 + * prot-none, dirty, young 11..1...1...00 -> 111.000011.1 + * read-only, clean, old 00..1...1...01 -> 111.000100.1 + * read-only, clean, young 01..1...0...01 -> 101.000101.1 + * read-only, dirty, old 10..1...1...01 -> 111.000110.1 + * read-only, dirty, young 11..1...0...01 -> 101.000111.1 + * read-write, clean, old 00..1...1...11 -> 111.001100.1 + * read-write, clean, young 01..1...0...11 -> 101.001101.1 + * read-write, dirty, old 10..0...1...11 -> 110.001110.1 + * read-write, dirty, young 11..0...0...11 -> 100.001111.1 + * HW-bits: R read-only, I invalid + * SW-bits: p present, y young, d dirty, r read, w write, s special, + * u unused, l large + */ + if (present) { + pteval = rste & _SEGMENT_ENTRY_ORIGIN_LARGE; + pteval |= _PAGE_LARGE | _PAGE_PRESENT; + pteval |= move_set_bit(rste, _SEGMENT_ENTRY_READ, _PAGE_READ); + pteval |= move_set_bit(rste, _SEGMENT_ENTRY_WRITE, _PAGE_WRITE); + pteval |= move_set_bit(rste, _SEGMENT_ENTRY_INVALID, _PAGE_INVALID); + pteval |= move_set_bit(rste, _SEGMENT_ENTRY_PROTECT, _PAGE_PROTECT); + pteval |= move_set_bit(rste, _SEGMENT_ENTRY_DIRTY, _PAGE_DIRTY); + pteval |= move_set_bit(rste, _SEGMENT_ENTRY_YOUNG, _PAGE_YOUNG); +#ifdef CONFIG_MEM_SOFT_DIRTY + pteval |= move_set_bit(rste, _SEGMENT_ENTRY_SOFT_DIRTY, _PAGE_SOFT_DIRTY); +#endif + pteval |= move_set_bit(rste, _SEGMENT_ENTRY_NOEXEC, _PAGE_NOEXEC); + } else if (!none) { + /* swap rste */ + arch_entry = __rste_to_swp_entry(rste); + pte = mk_swap_pte(__swp_type_rste(arch_entry), __swp_offset_rste(arch_entry)); + pteval = pte_val(pte); + } else + pteval = _PAGE_INVALID; + return __pte(pteval); +} + +static void clear_huge_pte_skeys(struct mm_struct *mm, unsigned long rste) +{ + struct folio *folio; + unsigned long size, paddr; + + if (!mm_uses_skeys(mm) || + rste & _SEGMENT_ENTRY_INVALID) + return; - pte_val(pte) = addr; - for (i = 0; i < PTRS_PER_PTE; i++) { - set_pte_at(&init_mm, addr + i * PAGE_SIZE, ptep + i, pte); - pte_val(pte) += PAGE_SIZE; + if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) { + folio = page_folio(pud_page(__pud(rste))); + size = PUD_SIZE; + paddr = rste & PUD_MASK; + } else { + folio = page_folio(pmd_page(__pmd(rste))); + size = PMD_SIZE; + paddr = rste & PMD_MASK; } - page[1].index = (unsigned long) ptep; - return 0; + + if (!test_and_set_bit(PG_arch_1, &folio->flags.f)) + __storage_key_init_range(paddr, paddr + size); } -void arch_release_hugepage(struct page *page) +void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) { - pte_t *ptep; + unsigned long rste; - if (MACHINE_HAS_HPAGE) - return; + rste = __pte_to_rste(pte); - ptep = (pte_t *) page[1].index; - if (!ptep) - return; - clear_table((unsigned long *) ptep, _PAGE_TYPE_EMPTY, - PTRS_PER_PTE * sizeof(pte_t)); - page_table_free(&init_mm, (unsigned long *) ptep); - page[1].index = 0; + /* Set correct table type for 2G hugepages */ + if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) { + if (likely(pte_present(pte))) + rste |= _REGION3_ENTRY_LARGE; + rste |= _REGION_ENTRY_TYPE_R3; + } else if (likely(pte_present(pte))) + rste |= _SEGMENT_ENTRY_LARGE; + + clear_huge_pte_skeys(mm, rste); + set_pte(ptep, __pte(rste)); +} + +void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, 
+ pte_t *ptep, pte_t pte, unsigned long sz) +{ + __set_huge_pte_at(mm, addr, ptep, pte); +} + +pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +{ + return __rste_to_pte(pte_val(*ptep)); +} + +pte_t __huge_ptep_get_and_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + pte_t pte = huge_ptep_get(mm, addr, ptep); + pmd_t *pmdp = (pmd_t *) ptep; + pud_t *pudp = (pud_t *) ptep; + + if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) + pudp_xchg_direct(mm, addr, pudp, __pud(_REGION3_ENTRY_EMPTY)); + else + pmdp_xchg_direct(mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_EMPTY)); + return pte; } -pte_t *huge_pte_alloc(struct mm_struct *mm, +pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long sz) { pgd_t *pgdp; + p4d_t *p4dp; pud_t *pudp; pmd_t *pmdp = NULL; pgdp = pgd_offset(mm, addr); - pudp = pud_alloc(mm, pgdp, addr); - if (pudp) - pmdp = pmd_alloc(mm, pudp, addr); + p4dp = p4d_alloc(mm, pgdp, addr); + if (p4dp) { + pudp = pud_alloc(mm, p4dp, addr); + if (pudp) { + if (sz == PUD_SIZE) + return (pte_t *) pudp; + else if (sz == PMD_SIZE) + pmdp = pmd_alloc(mm, pudp, addr); + } + } return (pte_t *) pmdp; } -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pgdp; + p4d_t *p4dp; pud_t *pudp; pmd_t *pmdp = NULL; pgdp = pgd_offset(mm, addr); if (pgd_present(*pgdp)) { - pudp = pud_offset(pgdp, addr); - if (pud_present(*pudp)) - pmdp = pmd_offset(pudp, addr); + p4dp = p4d_offset(pgdp, addr); + if (p4d_present(*p4dp)) { + pudp = pud_offset(p4dp, addr); + if (sz == PUD_SIZE) + return (pte_t *)pudp; + if (pud_present(*pudp)) + pmdp = pmd_offset(pudp, addr); + } } return (pte_t *) pmdp; } -int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) +bool __init arch_hugetlb_valid_size(unsigned long size) { - return 0; -} - -struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, - int write) -{ - return ERR_PTR(-EINVAL); -} - -int pmd_huge(pmd_t pmd) -{ - if (!MACHINE_HAS_HPAGE) - return 0; - - return !!(pmd_val(pmd) & _SEGMENT_ENTRY_LARGE); -} - -int pud_huge(pud_t pud) -{ - return 0; -} - -struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, - pmd_t *pmdp, int write) -{ - struct page *page; - - if (!MACHINE_HAS_HPAGE) - return NULL; - - page = pmd_page(*pmdp); - if (page) - page += ((address & ~HPAGE_MASK) >> PAGE_SHIFT); - return page; + if (cpu_has_edat1() && size == PMD_SIZE) + return true; + else if (cpu_has_edat2() && size == PUD_SIZE) + return true; + else + return false; } diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index ce36ea80e4f9..e4953453d254 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * S390 version * Copyright IBM Corp. 
1999 @@ -7,6 +8,7 @@ * Copyright (C) 1995 Linus Torvalds */ +#include <linux/cpufeature.h> #include <linux/signal.h> #include <linux/sched.h> #include <linux/kernel.h> @@ -17,76 +19,69 @@ #include <linux/mman.h> #include <linux/mm.h> #include <linux/swap.h> +#include <linux/swiotlb.h> #include <linux/smp.h> #include <linux/init.h> #include <linux/pagemap.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/memory.h> #include <linux/pfn.h> #include <linux/poison.h> #include <linux/initrd.h> #include <linux/export.h> +#include <linux/cma.h> #include <linux/gfp.h> +#include <linux/dma-direct.h> +#include <linux/percpu.h> #include <asm/processor.h> -#include <asm/uaccess.h> -#include <asm/pgtable.h> +#include <linux/uaccess.h> #include <asm/pgalloc.h> +#include <asm/ctlreg.h> +#include <asm/kfence.h> #include <asm/dma.h> -#include <asm/lowcore.h> -#include <asm/tlb.h> +#include <asm/abs_lowcore.h> #include <asm/tlbflush.h> #include <asm/sections.h> -#include <asm/ctl_reg.h> #include <asm/sclp.h> +#include <asm/set_memory.h> +#include <asm/kasan.h> +#include <asm/dma-mapping.h> +#include <asm/uv.h> +#include <linux/virtio_anchor.h> +#include <linux/virtio_config.h> +#include <linux/execmem.h> -pgd_t swapper_pg_dir[PTRS_PER_PGD] __attribute__((__aligned__(PAGE_SIZE))); +pgd_t swapper_pg_dir[PTRS_PER_PGD] __section(".bss..swapper_pg_dir"); +pgd_t invalid_pg_dir[PTRS_PER_PGD] __section(".bss..invalid_pg_dir"); + +struct ctlreg __bootdata_preserved(s390_invalid_asce); + +unsigned long __bootdata_preserved(page_noexec_mask); +EXPORT_SYMBOL(page_noexec_mask); + +unsigned long __bootdata_preserved(segment_noexec_mask); +EXPORT_SYMBOL(segment_noexec_mask); + +unsigned long __bootdata_preserved(region_noexec_mask); +EXPORT_SYMBOL(region_noexec_mask); unsigned long empty_zero_page, zero_page_mask; EXPORT_SYMBOL(empty_zero_page); +EXPORT_SYMBOL(zero_page_mask); static void __init setup_zero_pages(void) { - struct cpuid cpu_id; + unsigned long total_pages = memblock_estimated_nr_free_pages(); unsigned int order; - struct page *page; - int i; - get_cpu_id(&cpu_id); - switch (cpu_id.machine) { - case 0x9672: /* g5 */ - case 0x2064: /* z900 */ - case 0x2066: /* z900 */ - case 0x2084: /* z990 */ - case 0x2086: /* z990 */ - case 0x2094: /* z9-109 */ - case 0x2096: /* z9-109 */ - order = 0; - break; - case 0x2097: /* z10 */ - case 0x2098: /* z10 */ - case 0x2817: /* z196 */ - case 0x2818: /* z196 */ - order = 2; - break; - case 0x2827: /* zEC12 */ - default: - order = 5; - break; - } + /* Latest machines require a mapping granularity of 512KB */ + order = 7; + /* Limit number of empty zero pages for small memory sizes */ - if (order > 2 && totalram_pages <= 16384) - order = 2; - - empty_zero_page = __get_free_pages(GFP_KERNEL | __GFP_ZERO, order); - if (!empty_zero_page) - panic("Out of memory in setup_zero_pages"); - - page = virt_to_page((void *) empty_zero_page); - split_page(page, order); - for (i = 1 << order; i > 0; i--) { - mark_page_reserved(page); - page++; - } + while (order > 2 && (total_pages >> 10) < (1UL << order)) + order--; + + empty_zero_page = (unsigned long)memblock_alloc_or_panic(PAGE_SIZE << order, PAGE_SIZE); zero_page_mask = ((PAGE_SIZE << order) - 1) & PAGE_MASK; } @@ -97,131 +92,228 @@ static void __init setup_zero_pages(void) void __init paging_init(void) { unsigned long max_zone_pfns[MAX_NR_ZONES]; - unsigned long pgd_type, asce_bits; - - init_mm.pgd = swapper_pg_dir; -#ifdef CONFIG_64BIT - if (VMALLOC_END > (1UL << 42)) { - asce_bits = _ASCE_TYPE_REGION2 | 
_ASCE_TABLE_LENGTH; - pgd_type = _REGION2_ENTRY_EMPTY; - } else { - asce_bits = _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH; - pgd_type = _REGION3_ENTRY_EMPTY; - } -#else - asce_bits = _ASCE_TABLE_LENGTH; - pgd_type = _SEGMENT_ENTRY_EMPTY; -#endif - S390_lowcore.kernel_asce = (__pa(init_mm.pgd) & PAGE_MASK) | asce_bits; - clear_table((unsigned long *) init_mm.pgd, pgd_type, - sizeof(unsigned long)*2048); - vmem_map_init(); - - /* enable virtual mapping in kernel mode */ - __ctl_load(S390_lowcore.kernel_asce, 1, 1); - __ctl_load(S390_lowcore.kernel_asce, 7, 7); - __ctl_load(S390_lowcore.kernel_asce, 13, 13); - arch_local_irq_restore(4UL << (BITS_PER_LONG - 8)); - atomic_set(&init_mm.context.attach_count, 1); - - sparse_memory_present_with_active_regions(MAX_NUMNODES); + vmem_map_init(); sparse_init(); + zone_dma_limit = DMA_BIT_MASK(31); memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); - max_zone_pfns[ZONE_DMA] = PFN_DOWN(MAX_DMA_ADDRESS); + max_zone_pfns[ZONE_DMA] = virt_to_pfn(MAX_DMA_ADDRESS); max_zone_pfns[ZONE_NORMAL] = max_low_pfn; - free_area_init_nodes(max_zone_pfns); + free_area_init(max_zone_pfns); } -void __init mem_init(void) +void mark_rodata_ro(void) { - max_mapnr = max_low_pfn; - high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); + unsigned long size = __end_ro_after_init - __start_ro_after_init; + + if (cpu_has_nx()) + system_ctl_set_bit(0, CR0_INSTRUCTION_EXEC_PROTECTION_BIT); + __set_memory_ro(__start_ro_after_init, __end_ro_after_init); + pr_info("Write protected read-only-after-init data: %luk\n", size >> 10); +} - /* Setup guest page hinting */ - cmma_init(); +int set_memory_encrypted(unsigned long vaddr, int numpages) +{ + int i; + + /* make specified pages unshared, (swiotlb, dma_free) */ + for (i = 0; i < numpages; ++i) { + uv_remove_shared(virt_to_phys((void *)vaddr)); + vaddr += PAGE_SIZE; + } + return 0; +} + +int set_memory_decrypted(unsigned long vaddr, int numpages) +{ + int i; + /* make specified pages shared (swiotlb, dma_alloca) */ + for (i = 0; i < numpages; ++i) { + uv_set_shared(virt_to_phys((void *)vaddr)); + vaddr += PAGE_SIZE; + } + return 0; +} + +/* are we a protected virtualization guest? */ +bool force_dma_unencrypted(struct device *dev) +{ + return is_prot_virt_guest(); +} + +/* protected virtualization */ +static void __init pv_init(void) +{ + if (!is_prot_virt_guest()) + return; + + virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc); + + /* make sure bounce buffers are shared */ + swiotlb_init(true, SWIOTLB_FORCE | SWIOTLB_VERBOSE); + swiotlb_update_mem_attributes(); +} + +void __init arch_mm_preinit(void) +{ + cpumask_set_cpu(0, &init_mm.context.cpu_attach_mask); + cpumask_set_cpu(0, mm_cpumask(&init_mm)); + + pv_init(); - /* this will put all low memory onto the freelists */ - free_all_bootmem(); setup_zero_pages(); /* Setup zeroed pages. */ +} + +unsigned long memory_block_size_bytes(void) +{ + /* + * Make sure the memory block size is always greater + * or equal than the memory increment size. 
+ */ + return max_t(unsigned long, MIN_MEMORY_BLOCK_SIZE, sclp.rzm); +} + +unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; +EXPORT_SYMBOL(__per_cpu_offset); - mem_init_print_info(NULL); - printk("Write protected kernel read-only data: %#lx - %#lx\n", - (unsigned long)&_stext, - PFN_ALIGN((unsigned long)&_eshared) - 1); +static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) +{ + return LOCAL_DISTANCE; } -void free_initmem(void) +static int __init pcpu_cpu_to_node(int cpu) { - free_initmem_default(POISON_FREE_INITMEM); + return 0; } -#ifdef CONFIG_BLK_DEV_INITRD -void __init free_initrd_mem(unsigned long start, unsigned long end) +void __init setup_per_cpu_areas(void) { - free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM, - "initrd"); + unsigned long delta; + unsigned int cpu; + int rc; + + /* + * Always reserve area for module percpu variables. That's + * what the legacy allocator did. + */ + rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, + PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, + pcpu_cpu_distance, + pcpu_cpu_to_node); + if (rc < 0) + panic("Failed to initialize percpu areas."); + + delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; + for_each_possible_cpu(cpu) + __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu]; } -#endif #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size) + +#ifdef CONFIG_CMA + +/* Prevent memory blocks which contain cma regions from going offline */ + +struct s390_cma_mem_data { + unsigned long start; + unsigned long end; +}; + +static int s390_cma_check_range(struct cma *cma, void *data) +{ + struct s390_cma_mem_data *mem_data; + + mem_data = data; + + if (cma_intersects(cma, mem_data->start, mem_data->end)) + return -EBUSY; + + return 0; +} + +static int s390_cma_mem_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct s390_cma_mem_data mem_data; + struct memory_notify *arg; + int rc = 0; + + arg = data; + mem_data.start = arg->start_pfn << PAGE_SHIFT; + mem_data.end = mem_data.start + (arg->nr_pages << PAGE_SHIFT); + if (action == MEM_GOING_OFFLINE) + rc = cma_for_each_area(s390_cma_check_range, &mem_data); + return notifier_from_errno(rc); +} + +static struct notifier_block s390_cma_mem_nb = { + .notifier_call = s390_cma_mem_notifier, +}; + +static int __init s390_cma_mem_init(void) +{ + return register_memory_notifier(&s390_cma_mem_nb); +} +device_initcall(s390_cma_mem_init); + +#endif /* CONFIG_CMA */ + +int arch_add_memory(int nid, u64 start, u64 size, + struct mhp_params *params) { - unsigned long zone_start_pfn, zone_end_pfn, nr_pages; unsigned long start_pfn = PFN_DOWN(start); unsigned long size_pages = PFN_DOWN(size); - struct zone *zone; int rc; + if (WARN_ON_ONCE(pgprot_val(params->pgprot) != pgprot_val(PAGE_KERNEL))) + return -EINVAL; + + VM_BUG_ON(!mhp_range_allowed(start, size, true)); rc = vmem_add_mapping(start, size); if (rc) return rc; - for_each_zone(zone) { - if (zone_idx(zone) != ZONE_MOVABLE) { - /* Add range within existing zone limits */ - zone_start_pfn = zone->zone_start_pfn; - zone_end_pfn = zone->zone_start_pfn + - zone->spanned_pages; - } else { - /* Add remaining range to ZONE_MOVABLE */ - zone_start_pfn = start_pfn; - zone_end_pfn = start_pfn + size_pages; - } - if (start_pfn < zone_start_pfn || start_pfn >= zone_end_pfn) - continue; - nr_pages = (start_pfn + size_pages > zone_end_pfn) ? 
- zone_end_pfn - start_pfn : size_pages; - rc = __add_pages(nid, zone, start_pfn, nr_pages); - if (rc) - break; - start_pfn += nr_pages; - size_pages -= nr_pages; - if (!size_pages) - break; - } + + rc = __add_pages(nid, start_pfn, size_pages, params); if (rc) vmem_remove_mapping(start, size); return rc; } -unsigned long memory_block_size_bytes(void) +void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) { - /* - * Make sure the memory block size is always greater - * or equal than the memory increment size. - */ - return max_t(unsigned long, MIN_MEMORY_BLOCK_SIZE, sclp_get_rzm()); + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + + __remove_pages(start_pfn, nr_pages, altmap); + vmem_remove_mapping(start, size); } +#endif /* CONFIG_MEMORY_HOTPLUG */ -#ifdef CONFIG_MEMORY_HOTREMOVE -int arch_remove_memory(u64 start, u64 size) +#ifdef CONFIG_EXECMEM +static struct execmem_info execmem_info __ro_after_init; + +struct execmem_info __init *execmem_arch_setup(void) { - /* - * There is no hardware or firmware interface which could trigger a - * hot memory remove on s390. So there is nothing that needs to be - * implemented. - */ - return -EBUSY; + unsigned long module_load_offset = 0; + unsigned long start; + + if (kaslr_enabled()) + module_load_offset = get_random_u32_inclusive(1, 1024) * PAGE_SIZE; + + start = MODULES_VADDR + module_load_offset; + + execmem_info = (struct execmem_info){ + .ranges = { + [EXECMEM_DEFAULT] = { + .flags = EXECMEM_KASAN_SHADOW, + .start = start, + .end = MODULES_END, + .pgprot = PAGE_KERNEL, + .alignment = MODULE_ALIGN, + }, + }, + }; + + return &execmem_info; } -#endif -#endif /* CONFIG_MEMORY_HOTPLUG */ +#endif /* CONFIG_EXECMEM */ diff --git a/arch/s390/mm/maccess.c b/arch/s390/mm/maccess.c index 921fa541dc04..cfd219fe495c 100644 --- a/arch/s390/mm/maccess.c +++ b/arch/s390/mm/maccess.c @@ -1,9 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Access kernel memory without faulting -- s390 specific implementation. * - * Copyright IBM Corp. 2009 - * - * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>, + * Copyright IBM Corp. 2009, 2015 * */ @@ -13,186 +12,134 @@ #include <linux/errno.h> #include <linux/gfp.h> #include <linux/cpu.h> -#include <asm/ctl_reg.h> - -/* - * This function writes to kernel memory bypassing DAT and possible - * write protection. It copies one to four bytes from src to dst - * using the stura instruction. - * Returns the number of bytes copied or -EFAULT. 
- */ -static long probe_kernel_write_odd(void *dst, const void *src, size_t size) +#include <linux/uio.h> +#include <linux/io.h> +#include <asm/asm-extable.h> +#include <asm/abs_lowcore.h> +#include <asm/stacktrace.h> +#include <asm/sections.h> +#include <asm/maccess.h> +#include <asm/ctlreg.h> + +unsigned long __bootdata_preserved(__memcpy_real_area); +pte_t *__bootdata_preserved(memcpy_real_ptep); +static DEFINE_MUTEX(memcpy_real_mutex); + +static notrace long s390_kernel_write_odd(void *dst, const void *src, size_t size) { - unsigned long count, aligned; - int offset, mask; - int rc = -EFAULT; - - aligned = (unsigned long) dst & ~3UL; - offset = (unsigned long) dst & 3; - count = min_t(unsigned long, 4 - offset, size); - mask = (0xf << (4 - count)) & 0xf; - mask >>= offset; + unsigned long aligned, offset, count; + char tmp[8]; + + aligned = (unsigned long) dst & ~7UL; + offset = (unsigned long) dst & 7UL; + size = min(8UL - offset, size); + count = size - 1; asm volatile( " bras 1,0f\n" - " icm 0,0,0(%3)\n" - "0: l 0,0(%1)\n" - " lra %1,0(%1)\n" - "1: ex %2,0(1)\n" - "2: stura 0,%1\n" - " la %0,0\n" - "3:\n" - EX_TABLE(0b,3b) EX_TABLE(1b,3b) EX_TABLE(2b,3b) - : "+d" (rc), "+a" (aligned) - : "a" (mask), "a" (src) : "cc", "memory", "0", "1"); - return rc ? rc : count; + " mvc 0(1,%4),0(%5)\n" + "0: mvc 0(8,%3),0(%0)\n" + " ex %1,0(1)\n" + " lg %1,0(%3)\n" + " lra %0,0(%0)\n" + " sturg %1,%0" + : "+&a" (aligned), "+&a" (count), "=m" (tmp) + : "a" (&tmp), "a" (&tmp[offset]), "a" (src) + : "cc", "memory", "1"); + return size; } -long probe_kernel_write(void *dst, const void *src, size_t size) +/* + * __s390_kernel_write - write to kernel memory bypassing DAT + * @dst: destination address + * @src: source address + * @size: number of bytes to copy + * + * This function writes to kernel memory bypassing DAT and possible page table + * write protection. It writes to the destination using the sturg instruction. + * Therefore we have a read-modify-write sequence: the function reads eight + * bytes from destination at an eight byte boundary, modifies the bytes + * requested and writes the result back in a loop. + */ +static DEFINE_SPINLOCK(s390_kernel_write_lock); + +notrace void *__s390_kernel_write(void *dst, const void *src, size_t size) { - long copied = 0; + void *tmp = dst; + unsigned long flags; + long copied; + spin_lock_irqsave(&s390_kernel_write_lock, flags); while (size) { - copied = probe_kernel_write_odd(dst, src, size); - if (copied < 0) - break; - dst += copied; + copied = s390_kernel_write_odd(tmp, src, size); + tmp += copied; src += copied; size -= copied; } - return copied < 0 ? 
-EFAULT : 0; -} + spin_unlock_irqrestore(&s390_kernel_write_lock, flags); -static int __memcpy_real(void *dest, void *src, size_t count) -{ - register unsigned long _dest asm("2") = (unsigned long) dest; - register unsigned long _len1 asm("3") = (unsigned long) count; - register unsigned long _src asm("4") = (unsigned long) src; - register unsigned long _len2 asm("5") = (unsigned long) count; - int rc = -EFAULT; - - asm volatile ( - "0: mvcle %1,%2,0x0\n" - "1: jo 0b\n" - " lhi %0,0x0\n" - "2:\n" - EX_TABLE(1b,2b) - : "+d" (rc), "+d" (_dest), "+d" (_src), "+d" (_len1), - "+d" (_len2), "=m" (*((long *) dest)) - : "m" (*((long *) src)) - : "cc", "memory"); - return rc; + return dst; } -/* - * Copy memory in real mode (kernel to kernel) - */ -int memcpy_real(void *dest, void *src, size_t count) +size_t memcpy_real_iter(struct iov_iter *iter, unsigned long src, size_t count) { - unsigned long flags; - int rc; - - if (!count) - return 0; - local_irq_save(flags); - __arch_local_irq_stnsm(0xfbUL); - rc = __memcpy_real(dest, src, count); - local_irq_restore(flags); - return rc; -} - -/* - * Copy memory in absolute mode (kernel to kernel) - */ -void memcpy_absolute(void *dest, void *src, size_t count) -{ - unsigned long cr0, flags, prefix; - - flags = arch_local_irq_save(); - __ctl_store(cr0, 0, 0); - __ctl_clear_bit(0, 28); /* disable lowcore protection */ - prefix = store_prefix(); - if (prefix) { - local_mcck_disable(); - set_prefix(0); - memcpy(dest, src, count); - set_prefix(prefix); - local_mcck_enable(); - } else { - memcpy(dest, src, count); - } - __ctl_load(cr0, 0, 0); - arch_local_irq_restore(flags); -} - -/* - * Copy memory from kernel (real) to user (virtual) - */ -int copy_to_user_real(void __user *dest, void *src, size_t count) -{ - int offs = 0, size, rc; - char *buf; - - buf = (char *) __get_free_page(GFP_KERNEL); - if (!buf) - return -ENOMEM; - rc = -EFAULT; - while (offs < count) { - size = min(PAGE_SIZE, count - offs); - if (memcpy_real(buf, src + offs, size)) - goto out; - if (copy_to_user(dest + offs, buf, size)) - goto out; - offs += size; + size_t len, copied, res = 0; + unsigned long phys, offset; + void *chunk; + pte_t pte; + + BUILD_BUG_ON(MEMCPY_REAL_SIZE != PAGE_SIZE); + while (count) { + phys = src & MEMCPY_REAL_MASK; + offset = src & ~MEMCPY_REAL_MASK; + chunk = (void *)(__memcpy_real_area + offset); + len = min(count, MEMCPY_REAL_SIZE - offset); + pte = mk_pte_phys(phys, PAGE_KERNEL_RO); + + mutex_lock(&memcpy_real_mutex); + if (pte_val(pte) != pte_val(*memcpy_real_ptep)) { + __ptep_ipte(__memcpy_real_area, memcpy_real_ptep, 0, 0, IPTE_GLOBAL); + set_pte(memcpy_real_ptep, pte); + } + copied = copy_to_iter(chunk, len, iter); + mutex_unlock(&memcpy_real_mutex); + + count -= copied; + src += copied; + res += copied; + if (copied < len) + break; } - rc = 0; -out: - free_page((unsigned long) buf); - return rc; + return res; } -/* - * Copy memory from user (virtual) to kernel (real) - */ -int copy_from_user_real(void *dest, void __user *src, size_t count) +int memcpy_real(void *dest, unsigned long src, size_t count) { - int offs = 0, size, rc; - char *buf; - - buf = (char *) __get_free_page(GFP_KERNEL); - if (!buf) - return -ENOMEM; - rc = -EFAULT; - while (offs < count) { - size = min(PAGE_SIZE, count - offs); - if (copy_from_user(buf, src + offs, size)) - goto out; - if (memcpy_real(dest + offs, buf, size)) - goto out; - offs += size; - } - rc = 0; -out: - free_page((unsigned long) buf); - return rc; + struct iov_iter iter; + struct kvec kvec; + + kvec.iov_base = dest; + 
kvec.iov_len = count; + iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, count); + if (memcpy_real_iter(&iter, src, count) < count) + return -EFAULT; + return 0; } /* - * Check if physical address is within prefix or zero page + * Find CPU that owns swapped prefix page */ -static int is_swapped(unsigned long addr) +static int get_swapped_owner(phys_addr_t addr) { - unsigned long lc; + phys_addr_t lc; int cpu; - if (addr < sizeof(struct _lowcore)) - return 1; for_each_online_cpu(cpu) { - lc = (unsigned long) lowcore_ptr[cpu]; - if (addr > lc + sizeof(struct _lowcore) - 1 || addr < lc) + lc = virt_to_phys(lowcore_ptr[cpu]); + if (addr > lc + sizeof(struct lowcore) - 1 || addr < lc) continue; - return 1; + return cpu; } - return 0; + return -1; } /* @@ -201,29 +148,47 @@ static int is_swapped(unsigned long addr) * For swapped prefix pages a new buffer is returned that contains a copy of * the absolute memory. The buffer size is maximum one page large. */ -void *xlate_dev_mem_ptr(unsigned long addr) +void *xlate_dev_mem_ptr(phys_addr_t addr) { - void *bounce = (void *) addr; + void *ptr = phys_to_virt(addr); + void *bounce = ptr; + struct lowcore *abs_lc; unsigned long size; + int this_cpu, cpu; - get_online_cpus(); - preempt_disable(); - if (is_swapped(addr)) { - size = PAGE_SIZE - (addr & ~PAGE_MASK); - bounce = (void *) __get_free_page(GFP_ATOMIC); - if (bounce) - memcpy_absolute(bounce, (void *) addr, size); + cpus_read_lock(); + this_cpu = get_cpu(); + if (addr >= sizeof(struct lowcore)) { + cpu = get_swapped_owner(addr); + if (cpu < 0) + goto out; } - preempt_enable(); - put_online_cpus(); + bounce = (void *)__get_free_page(GFP_ATOMIC); + if (!bounce) + goto out; + size = PAGE_SIZE - (addr & ~PAGE_MASK); + if (addr < sizeof(struct lowcore)) { + abs_lc = get_abs_lowcore(); + ptr = (void *)abs_lc + addr; + memcpy(bounce, ptr, size); + put_abs_lowcore(abs_lc); + } else if (cpu == this_cpu) { + ptr = (void *)(addr - virt_to_phys(lowcore_ptr[cpu])); + memcpy(bounce, ptr, size); + } else { + memcpy(bounce, ptr, size); + } +out: + put_cpu(); + cpus_read_unlock(); return bounce; } /* * Free converted buffer for /dev/mem access (if necessary) */ -void unxlate_dev_mem_ptr(unsigned long addr, void *buf) +void unxlate_dev_mem_ptr(phys_addr_t addr, void *ptr) { - if ((void *) addr != buf) - free_page((unsigned long) buf); + if (addr != virt_to_phys(ptr)) + free_page((unsigned long)ptr); } diff --git a/arch/s390/mm/mem_detect.c b/arch/s390/mm/mem_detect.c deleted file mode 100644 index cca388253a39..000000000000 --- a/arch/s390/mm/mem_detect.c +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Copyright IBM Corp. 2008, 2009 - * - * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com> - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <asm/ipl.h> -#include <asm/sclp.h> -#include <asm/setup.h> - -#define ADDR2G (1ULL << 31) - -static void find_memory_chunks(struct mem_chunk chunk[], unsigned long maxsize) -{ - unsigned long long memsize, rnmax, rzm; - unsigned long addr = 0, size; - int i = 0, type; - - rzm = sclp_get_rzm(); - rnmax = sclp_get_rnmax(); - memsize = rzm * rnmax; - if (!rzm) - rzm = 1ULL << 17; - if (sizeof(long) == 4) { - rzm = min(ADDR2G, rzm); - memsize = memsize ? min(ADDR2G, memsize) : ADDR2G; - } - if (maxsize) - memsize = memsize ? 
min((unsigned long)memsize, maxsize) : maxsize; - do { - size = 0; - type = tprot(addr); - do { - size += rzm; - if (memsize && addr + size >= memsize) - break; - } while (type == tprot(addr + size)); - if (type == CHUNK_READ_WRITE || type == CHUNK_READ_ONLY) { - if (memsize && (addr + size > memsize)) - size = memsize - addr; - chunk[i].addr = addr; - chunk[i].size = size; - chunk[i].type = type; - i++; - } - addr += size; - } while (addr < memsize && i < MEMORY_CHUNKS); -} - -/** - * detect_memory_layout - fill mem_chunk array with memory layout data - * @chunk: mem_chunk array to be filled - * @maxsize: maximum address where memory detection should stop - * - * Fills the passed in memory chunk array with the memory layout of the - * machine. The array must have a size of at least MEMORY_CHUNKS and will - * be fully initialized afterwards. - * If the maxsize paramater has a value > 0 memory detection will stop at - * that address. It is guaranteed that all chunks have an ending address - * that is smaller than maxsize. - * If maxsize is 0 all memory will be detected. - */ -void detect_memory_layout(struct mem_chunk chunk[], unsigned long maxsize) -{ - unsigned long flags, flags_dat, cr0; - - memset(chunk, 0, MEMORY_CHUNKS * sizeof(struct mem_chunk)); - /* - * Disable IRQs, DAT and low address protection so tprot does the - * right thing and we don't get scheduled away with low address - * protection disabled. - */ - local_irq_save(flags); - flags_dat = __arch_local_irq_stnsm(0xfb); - /* - * In case DAT was enabled, make sure chunk doesn't reside in vmalloc - * space. We have disabled DAT and any access to vmalloc area will - * cause an exception. - * If DAT was disabled we are called from early ipl code. - */ - if (test_bit(5, &flags_dat)) { - if (WARN_ON_ONCE(is_vmalloc_or_module_addr(chunk))) - goto out; - } - __ctl_store(cr0, 0, 0); - __ctl_clear_bit(0, 28); - find_memory_chunks(chunk, maxsize); - __ctl_load(cr0, 0, 0); -out: - __arch_local_irq_ssm(flags_dat); - local_irq_restore(flags); -} -EXPORT_SYMBOL(detect_memory_layout); - -/* - * Create memory hole with given address and size. - */ -void create_mem_hole(struct mem_chunk mem_chunk[], unsigned long addr, - unsigned long size) -{ - int i; - - for (i = 0; i < MEMORY_CHUNKS; i++) { - struct mem_chunk *chunk = &mem_chunk[i]; - - if (chunk->size == 0) - continue; - if (addr > chunk->addr + chunk->size) - continue; - if (addr + size <= chunk->addr) - continue; - /* Split */ - if ((addr > chunk->addr) && - (addr + size < chunk->addr + chunk->size)) { - struct mem_chunk *new = chunk + 1; - - memmove(new, chunk, (MEMORY_CHUNKS-i-1) * sizeof(*new)); - new->addr = addr + size; - new->size = chunk->addr + chunk->size - new->addr; - chunk->size = addr - chunk->addr; - continue; - } else if ((addr <= chunk->addr) && - (addr + size >= chunk->addr + chunk->size)) { - memmove(chunk, chunk + 1, (MEMORY_CHUNKS-i-1) * sizeof(*chunk)); - memset(&mem_chunk[MEMORY_CHUNKS-1], 0, sizeof(*chunk)); - } else if (addr + size < chunk->addr + chunk->size) { - chunk->size = chunk->addr + chunk->size - addr - size; - chunk->addr = addr + size; - } else if (addr > chunk->addr) { - chunk->size = addr - chunk->addr; - } - } -} diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index 40023290ee5b..2a222a7e14f4 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -1,183 +1,215 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * flexible mmap layout support * * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. * All Rights Reserved. 
* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * * Started by Ingo Molnar <mingo@elte.hu> */ +#include <linux/elf-randomize.h> #include <linux/personality.h> #include <linux/mm.h> #include <linux/mman.h> -#include <linux/module.h> +#include <linux/sched/signal.h> +#include <linux/sched/mm.h> #include <linux/random.h> -#include <linux/compat.h> -#include <asm/pgalloc.h> +#include <linux/security.h> +#include <linux/hugetlb.h> +#include <asm/elf.h> static unsigned long stack_maxrandom_size(void) { if (!(current->flags & PF_RANDOMIZE)) return 0; - if (current->personality & ADDR_NO_RANDOMIZE) - return 0; return STACK_RND_MASK << PAGE_SHIFT; } -/* - * Top of mmap area (just below the process stack). - * - * Leave at least a ~32 MB hole. - */ -#define MIN_GAP (32*1024*1024) -#define MAX_GAP (STACK_TOP/6*5) - -static inline int mmap_is_legacy(void) +static inline int mmap_is_legacy(const struct rlimit *rlim_stack) { if (current->personality & ADDR_COMPAT_LAYOUT) return 1; - if (rlimit(RLIMIT_STACK) == RLIM_INFINITY) + if (rlim_stack->rlim_cur == RLIM_INFINITY) return 1; return sysctl_legacy_va_layout; } -static unsigned long mmap_rnd(void) +unsigned long arch_mmap_rnd(void) { - if (!(current->flags & PF_RANDOMIZE)) - return 0; - /* 8MB randomization for mmap_base */ - return (get_random_int() & 0x7ffUL) << PAGE_SHIFT; + return (get_random_u32() & MMAP_RND_MASK) << PAGE_SHIFT; } -static inline unsigned long mmap_base(void) +static unsigned long mmap_base_legacy(unsigned long rnd) { - unsigned long gap = rlimit(RLIMIT_STACK); - - if (gap < MIN_GAP) - gap = MIN_GAP; - else if (gap > MAX_GAP) - gap = MAX_GAP; - gap &= PAGE_MASK; - return STACK_TOP - stack_maxrandom_size() - mmap_rnd() - gap; + return TASK_UNMAPPED_BASE + rnd; } -#ifndef CONFIG_64BIT - -/* - * This function, called very early during the creation of a new - * process VM image, sets up which VM layout function to use: - */ -void arch_pick_mmap_layout(struct mm_struct *mm) +static inline unsigned long mmap_base(unsigned long rnd, + const struct rlimit *rlim_stack) { + unsigned long gap = rlim_stack->rlim_cur; + unsigned long pad = stack_maxrandom_size() + stack_guard_gap; + + /* Values close to RLIM_INFINITY can overflow. */ + if (gap + pad > gap) + gap += pad; + /* - * Fall back to the standard layout if the personality - * bit is set, or if the expected stack growth is unlimited: + * Top of mmap area (just below the process stack). + * Leave at least a ~128 MB hole. 
*/ - if (mmap_is_legacy()) { - mm->mmap_base = TASK_UNMAPPED_BASE; - mm->get_unmapped_area = arch_get_unmapped_area; - } else { - mm->mmap_base = mmap_base(); - mm->get_unmapped_area = arch_get_unmapped_area_topdown; - } -} + gap = clamp(gap, SZ_128M, (STACK_TOP / 6) * 5); -#else + return PAGE_ALIGN(STACK_TOP - gap - rnd); +} -int s390_mmap_check(unsigned long addr, unsigned long len, unsigned long flags) +static int get_align_mask(struct file *filp, unsigned long flags) { - int rc; - - if (is_compat_task() || (TASK_SIZE >= (1UL << 53))) + if (filp && is_file_hugepages(filp)) + return huge_page_mask_align(filp); + if (!(current->flags & PF_RANDOMIZE)) return 0; - if (!(flags & MAP_FIXED)) - addr = 0; - if ((addr + len) >= TASK_SIZE) { - rc = crst_table_upgrade(current->mm, 1UL << 53); - if (rc) - return rc; - update_mm(current->mm, current); - } + if (filp || (flags & MAP_SHARED)) + return MMAP_ALIGN_MASK << PAGE_SHIFT; return 0; } -static unsigned long -s390_get_unmapped_area(struct file *filp, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) +unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags, vm_flags_t vm_flags) { struct mm_struct *mm = current->mm; - unsigned long area; - int rc; - - area = arch_get_unmapped_area(filp, addr, len, pgoff, flags); - if (!(area & ~PAGE_MASK)) - return area; - if (area == -ENOMEM && !is_compat_task() && TASK_SIZE < (1UL << 53)) { - /* Upgrade the page table to 4 levels and retry. */ - rc = crst_table_upgrade(mm, 1UL << 53); - if (rc) - return (unsigned long) rc; - update_mm(mm, current); - area = arch_get_unmapped_area(filp, addr, len, pgoff, flags); + struct vm_area_struct *vma; + struct vm_unmapped_area_info info = {}; + + if (len > TASK_SIZE - mmap_min_addr) + return -ENOMEM; + + if (flags & MAP_FIXED) + goto check_asce_limit; + + if (addr) { + addr = PAGE_ALIGN(addr); + vma = find_vma(mm, addr); + if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && + (!vma || addr + len <= vm_start_gap(vma))) + goto check_asce_limit; } - return area; + + info.length = len; + info.low_limit = mm->mmap_base; + info.high_limit = TASK_SIZE; + info.align_mask = get_align_mask(filp, flags); + if (!(filp && is_file_hugepages(filp))) + info.align_offset = pgoff << PAGE_SHIFT; + addr = vm_unmapped_area(&info); + if (offset_in_page(addr)) + return addr; + +check_asce_limit: + return check_asce_limit(mm, addr, len); } -static unsigned long -s390_get_unmapped_area_topdown(struct file *filp, const unsigned long addr, - const unsigned long len, const unsigned long pgoff, - const unsigned long flags) +unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags, vm_flags_t vm_flags) { + struct vm_area_struct *vma; struct mm_struct *mm = current->mm; - unsigned long area; - int rc; - - area = arch_get_unmapped_area_topdown(filp, addr, len, pgoff, flags); - if (!(area & ~PAGE_MASK)) - return area; - if (area == -ENOMEM && !is_compat_task() && TASK_SIZE < (1UL << 53)) { - /* Upgrade the page table to 4 levels and retry. 
*/ - rc = crst_table_upgrade(mm, 1UL << 53); - if (rc) - return (unsigned long) rc; - update_mm(mm, current); - area = arch_get_unmapped_area_topdown(filp, addr, len, - pgoff, flags); + struct vm_unmapped_area_info info = {}; + + /* requested length too big for entire address space */ + if (len > TASK_SIZE - mmap_min_addr) + return -ENOMEM; + + if (flags & MAP_FIXED) + goto check_asce_limit; + + /* requesting a specific address */ + if (addr) { + addr = PAGE_ALIGN(addr); + vma = find_vma(mm, addr); + if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && + (!vma || addr + len <= vm_start_gap(vma))) + goto check_asce_limit; + } + + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + info.length = len; + info.low_limit = PAGE_SIZE; + info.high_limit = mm->mmap_base; + info.align_mask = get_align_mask(filp, flags); + if (!(filp && is_file_hugepages(filp))) + info.align_offset = pgoff << PAGE_SHIFT; + addr = vm_unmapped_area(&info); + + /* + * A failed mmap() very likely causes application failure, + * so fall back to the bottom-up function here. This scenario + * can happen with large stack limits and large mmap() + * allocations. + */ + if (offset_in_page(addr)) { + VM_BUG_ON(addr != -ENOMEM); + info.flags = 0; + info.low_limit = TASK_UNMAPPED_BASE; + info.high_limit = TASK_SIZE; + addr = vm_unmapped_area(&info); + if (offset_in_page(addr)) + return addr; } - return area; + +check_asce_limit: + return check_asce_limit(mm, addr, len); } + /* * This function, called very early during the creation of a new * process VM image, sets up which VM layout function to use: */ -void arch_pick_mmap_layout(struct mm_struct *mm) +void arch_pick_mmap_layout(struct mm_struct *mm, const struct rlimit *rlim_stack) { + unsigned long random_factor = 0UL; + + if (current->flags & PF_RANDOMIZE) + random_factor = arch_mmap_rnd(); + /* * Fall back to the standard layout if the personality * bit is set, or if the expected stack growth is unlimited: */ - if (mmap_is_legacy()) { - mm->mmap_base = TASK_UNMAPPED_BASE; - mm->get_unmapped_area = s390_get_unmapped_area; + if (mmap_is_legacy(rlim_stack)) { + mm->mmap_base = mmap_base_legacy(random_factor); + mm_flags_clear(MMF_TOPDOWN, mm); } else { - mm->mmap_base = mmap_base(); - mm->get_unmapped_area = s390_get_unmapped_area_topdown; + mm->mmap_base = mmap_base(random_factor, rlim_stack); + mm_flags_set(MMF_TOPDOWN, mm); } } -#endif +static pgprot_t protection_map[16] __ro_after_init; + +void __init setup_protection_map(void) +{ + pgprot_t *pm = protection_map; + + pm[VM_NONE] = PAGE_NONE; + pm[VM_READ] = PAGE_RO; + pm[VM_WRITE] = PAGE_RO; + pm[VM_WRITE | VM_READ] = PAGE_RO; + pm[VM_EXEC] = PAGE_RX; + pm[VM_EXEC | VM_READ] = PAGE_RX; + pm[VM_EXEC | VM_WRITE] = PAGE_RX; + pm[VM_EXEC | VM_WRITE | VM_READ] = PAGE_RX; + pm[VM_SHARED] = PAGE_NONE; + pm[VM_SHARED | VM_READ] = PAGE_RO; + pm[VM_SHARED | VM_WRITE] = PAGE_RW; + pm[VM_SHARED | VM_WRITE | VM_READ] = PAGE_RW; + pm[VM_SHARED | VM_EXEC] = PAGE_RX; + pm[VM_SHARED | VM_EXEC | VM_READ] = PAGE_RX; + pm[VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_RWX; + pm[VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_RWX; +} + +DECLARE_VM_GET_PAGE_PROT diff --git a/arch/s390/mm/page-states.c b/arch/s390/mm/page-states.c index a90d45e9dfb0..01f9b39e65f5 100644 --- a/arch/s390/mm/page-states.c +++ b/arch/s390/mm/page-states.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright IBM Corp. 
2008 * @@ -6,109 +7,26 @@ * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> */ -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/types.h> #include <linux/mm.h> -#include <linux/gfp.h> -#include <linux/init.h> +#include <asm/page-states.h> +#include <asm/sections.h> +#include <asm/page.h> -#define ESSA_SET_STABLE 1 -#define ESSA_SET_UNUSED 2 - -static int cmma_flag = 1; - -static int __init cmma(char *str) -{ - char *parm; - - parm = strstrip(str); - if (strcmp(parm, "yes") == 0 || strcmp(parm, "on") == 0) { - cmma_flag = 1; - return 1; - } - cmma_flag = 0; - if (strcmp(parm, "no") == 0 || strcmp(parm, "off") == 0) - return 1; - return 0; -} -__setup("cmma=", cmma); - -void __init cmma_init(void) -{ - register unsigned long tmp asm("0") = 0; - register int rc asm("1") = -EOPNOTSUPP; - - if (!cmma_flag) - return; - asm volatile( - " .insn rrf,0xb9ab0000,%1,%1,0,0\n" - "0: la %0,0\n" - "1:\n" - EX_TABLE(0b,1b) - : "+&d" (rc), "+&d" (tmp)); - if (rc) - cmma_flag = 0; -} - -static inline void set_page_unstable(struct page *page, int order) -{ - int i, rc; - - for (i = 0; i < (1 << order); i++) - asm volatile(".insn rrf,0xb9ab0000,%0,%1,%2,0" - : "=&d" (rc) - : "a" (page_to_phys(page + i)), - "i" (ESSA_SET_UNUSED)); -} +int __bootdata_preserved(cmma_flag); void arch_free_page(struct page *page, int order) { if (!cmma_flag) return; - set_page_unstable(page, order); -} - -static inline void set_page_stable(struct page *page, int order) -{ - int i, rc; - - for (i = 0; i < (1 << order); i++) - asm volatile(".insn rrf,0xb9ab0000,%0,%1,%2,0" - : "=&d" (rc) - : "a" (page_to_phys(page + i)), - "i" (ESSA_SET_STABLE)); + __set_page_unused(page_to_virt(page), 1UL << order); } void arch_alloc_page(struct page *page, int order) { if (!cmma_flag) return; - set_page_stable(page, order); -} - -void arch_set_page_states(int make_stable) -{ - unsigned long flags, order, t; - struct list_head *l; - struct page *page; - struct zone *zone; - - if (!cmma_flag) - return; - if (make_stable) - drain_local_pages(NULL); - for_each_populated_zone(zone) { - spin_lock_irqsave(&zone->lock, flags); - for_each_migratetype_order(order, t) { - list_for_each(l, &zone->free_area[order].free_list[t]) { - page = list_entry(l, struct page, lru); - if (make_stable) - set_page_stable(page, order); - else - set_page_unstable(page, order); - } - } - spin_unlock_irqrestore(&zone->lock, flags); - } + if (cmma_flag < 2) + __set_page_stable_dat(page_to_virt(page), 1UL << order); + else + __set_page_stable_nodat(page_to_virt(page), 1UL << order); } diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c index 80adfbf75065..3042647c9dbf 100644 --- a/arch/s390/mm/pageattr.c +++ b/arch/s390/mm/pageattr.c @@ -1,27 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright IBM Corp. 
2011 * Author(s): Jan Glauber <jang@linux.vnet.ibm.com> */ +#include <linux/cpufeature.h> #include <linux/hugetlb.h> -#include <linux/module.h> +#include <linux/proc_fs.h> +#include <linux/vmalloc.h> #include <linux/mm.h> #include <asm/cacheflush.h> -#include <asm/pgtable.h> +#include <asm/facility.h> +#include <asm/pgalloc.h> +#include <asm/kfence.h> #include <asm/page.h> +#include <asm/asm.h> +#include <asm/set_memory.h> static inline unsigned long sske_frame(unsigned long addr, unsigned char skey) { - asm volatile(".insn rrf,0xb22b0000,%[skey],%[addr],9,0" + asm volatile(".insn rrf,0xb22b0000,%[skey],%[addr],1,0" : [addr] "+a" (addr) : [skey] "d" (skey)); return addr; } -void storage_key_init_range(unsigned long start, unsigned long end) +void __storage_key_init_range(unsigned long start, unsigned long end) { unsigned long boundary, size; while (start < end) { - if (MACHINE_HAS_EDAT1) { + if (cpu_has_edat1()) { /* set storage keys for a 1MB frame */ size = 1UL << 20; boundary = (start + size) & ~(size - 1); @@ -32,113 +39,435 @@ void storage_key_init_range(unsigned long start, unsigned long end) continue; } } - page_set_storage_key(start, PAGE_DEFAULT_KEY, 0); + page_set_storage_key(start, PAGE_DEFAULT_KEY, 1); start += PAGE_SIZE; } } -static pte_t *walk_page_table(unsigned long addr) -{ - pgd_t *pgdp; - pud_t *pudp; - pmd_t *pmdp; - pte_t *ptep; +#ifdef CONFIG_PROC_FS +atomic_long_t __bootdata_preserved(direct_pages_count[PG_DIRECT_MAP_MAX]); - pgdp = pgd_offset_k(addr); - if (pgd_none(*pgdp)) - return NULL; - pudp = pud_offset(pgdp, addr); - if (pud_none(*pudp) || pud_large(*pudp)) - return NULL; - pmdp = pmd_offset(pudp, addr); - if (pmd_none(*pmdp) || pmd_large(*pmdp)) - return NULL; - ptep = pte_offset_kernel(pmdp, addr); - if (pte_none(*ptep)) - return NULL; - return ptep; +void arch_report_meminfo(struct seq_file *m) +{ + seq_printf(m, "DirectMap4k: %8lu kB\n", + atomic_long_read(&direct_pages_count[PG_DIRECT_MAP_4K]) << 2); + seq_printf(m, "DirectMap1M: %8lu kB\n", + atomic_long_read(&direct_pages_count[PG_DIRECT_MAP_1M]) << 10); + seq_printf(m, "DirectMap2G: %8lu kB\n", + atomic_long_read(&direct_pages_count[PG_DIRECT_MAP_2G]) << 21); } +#endif /* CONFIG_PROC_FS */ -static void change_page_attr(unsigned long addr, int numpages, - pte_t (*set) (pte_t)) +static void pgt_set(unsigned long *old, unsigned long new, unsigned long addr, + unsigned long dtt) { - pte_t *ptep, pte; - int i; + unsigned long *table, mask; - for (i = 0; i < numpages; i++) { - ptep = walk_page_table(addr); - if (WARN_ON_ONCE(!ptep)) + mask = 0; + if (cpu_has_edat2()) { + switch (dtt) { + case CRDTE_DTT_REGION3: + mask = ~(PTRS_PER_PUD * sizeof(pud_t) - 1); break; - pte = *ptep; - pte = set(pte); - __ptep_ipte(addr, ptep); - *ptep = pte; - addr += PAGE_SIZE; + case CRDTE_DTT_SEGMENT: + mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1); + break; + case CRDTE_DTT_PAGE: + mask = ~(PTRS_PER_PTE * sizeof(pte_t) - 1); + break; + } + table = (unsigned long *)((unsigned long)old & mask); + crdte(*old, new, table, dtt, addr, get_lowcore()->kernel_asce.val); + } else { + cspg(old, *old, new); } } -int set_memory_ro(unsigned long addr, int numpages) +static int walk_pte_level(pmd_t *pmdp, unsigned long addr, unsigned long end, + unsigned long flags) { - change_page_attr(addr, numpages, pte_wrprotect); + pte_t *ptep, new; + + if (flags == SET_MEMORY_4K) + return 0; + ptep = pte_offset_kernel(pmdp, addr); + do { + new = *ptep; + if (pte_none(new)) + return -EINVAL; + if (flags & SET_MEMORY_RO) + new = pte_wrprotect(new); + else 
if (flags & SET_MEMORY_RW) + new = pte_mkwrite_novma(pte_mkdirty(new)); + if (flags & SET_MEMORY_NX) + new = set_pte_bit(new, __pgprot(_PAGE_NOEXEC)); + else if (flags & SET_MEMORY_X) + new = clear_pte_bit(new, __pgprot(_PAGE_NOEXEC)); + if (flags & SET_MEMORY_INV) { + new = set_pte_bit(new, __pgprot(_PAGE_INVALID)); + } else if (flags & SET_MEMORY_DEF) { + new = __pte(pte_val(new) & PAGE_MASK); + new = set_pte_bit(new, PAGE_KERNEL); + } + pgt_set((unsigned long *)ptep, pte_val(new), addr, CRDTE_DTT_PAGE); + ptep++; + addr += PAGE_SIZE; + cond_resched(); + } while (addr < end); return 0; } -int set_memory_rw(unsigned long addr, int numpages) +static int split_pmd_page(pmd_t *pmdp, unsigned long addr) { - change_page_attr(addr, numpages, pte_mkwrite); + unsigned long pte_addr, prot; + pte_t *pt_dir, *ptep; + pmd_t new; + int i, ro, nx; + + pt_dir = vmem_pte_alloc(); + if (!pt_dir) + return -ENOMEM; + pte_addr = pmd_pfn(*pmdp) << PAGE_SHIFT; + ro = !!(pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT); + nx = !!(pmd_val(*pmdp) & _SEGMENT_ENTRY_NOEXEC); + prot = pgprot_val(ro ? PAGE_KERNEL_RO : PAGE_KERNEL); + if (!nx) + prot &= ~_PAGE_NOEXEC; + ptep = pt_dir; + for (i = 0; i < PTRS_PER_PTE; i++) { + set_pte(ptep, __pte(pte_addr | prot)); + pte_addr += PAGE_SIZE; + ptep++; + } + new = __pmd(__pa(pt_dir) | _SEGMENT_ENTRY); + pgt_set((unsigned long *)pmdp, pmd_val(new), addr, CRDTE_DTT_SEGMENT); + update_page_count(PG_DIRECT_MAP_4K, PTRS_PER_PTE); + update_page_count(PG_DIRECT_MAP_1M, -1); return 0; } -/* not possible */ -int set_memory_nx(unsigned long addr, int numpages) +static void modify_pmd_page(pmd_t *pmdp, unsigned long addr, + unsigned long flags) { - return 0; + pmd_t new = *pmdp; + + if (flags & SET_MEMORY_RO) + new = pmd_wrprotect(new); + else if (flags & SET_MEMORY_RW) + new = pmd_mkwrite_novma(pmd_mkdirty(new)); + if (flags & SET_MEMORY_NX) + new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_NOEXEC)); + else if (flags & SET_MEMORY_X) + new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_NOEXEC)); + if (flags & SET_MEMORY_INV) { + new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID)); + } else if (flags & SET_MEMORY_DEF) { + new = __pmd(pmd_val(new) & PMD_MASK); + new = set_pmd_bit(new, SEGMENT_KERNEL); + } + pgt_set((unsigned long *)pmdp, pmd_val(new), addr, CRDTE_DTT_SEGMENT); +} + +static int walk_pmd_level(pud_t *pudp, unsigned long addr, unsigned long end, + unsigned long flags) +{ + unsigned long next; + int need_split; + pmd_t *pmdp; + int rc = 0; + + pmdp = pmd_offset(pudp, addr); + do { + if (pmd_none(*pmdp)) + return -EINVAL; + next = pmd_addr_end(addr, end); + if (pmd_leaf(*pmdp)) { + need_split = !!(flags & SET_MEMORY_4K); + need_split |= !!(addr & ~PMD_MASK); + need_split |= !!(addr + PMD_SIZE > next); + if (need_split) { + rc = split_pmd_page(pmdp, addr); + if (rc) + return rc; + continue; + } + modify_pmd_page(pmdp, addr, flags); + } else { + rc = walk_pte_level(pmdp, addr, next, flags); + if (rc) + return rc; + } + pmdp++; + addr = next; + cond_resched(); + } while (addr < end); + return rc; } -int set_memory_x(unsigned long addr, int numpages) +static int split_pud_page(pud_t *pudp, unsigned long addr) { + unsigned long pmd_addr, prot; + pmd_t *pm_dir, *pmdp; + pud_t new; + int i, ro, nx; + + pm_dir = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY); + if (!pm_dir) + return -ENOMEM; + pmd_addr = pud_pfn(*pudp) << PAGE_SHIFT; + ro = !!(pud_val(*pudp) & _REGION_ENTRY_PROTECT); + nx = !!(pud_val(*pudp) & _REGION_ENTRY_NOEXEC); + prot = pgprot_val(ro ? 
SEGMENT_KERNEL_RO : SEGMENT_KERNEL); + if (!nx) + prot &= ~_SEGMENT_ENTRY_NOEXEC; + pmdp = pm_dir; + for (i = 0; i < PTRS_PER_PMD; i++) { + set_pmd(pmdp, __pmd(pmd_addr | prot)); + pmd_addr += PMD_SIZE; + pmdp++; + } + new = __pud(__pa(pm_dir) | _REGION3_ENTRY); + pgt_set((unsigned long *)pudp, pud_val(new), addr, CRDTE_DTT_REGION3); + update_page_count(PG_DIRECT_MAP_1M, PTRS_PER_PMD); + update_page_count(PG_DIRECT_MAP_2G, -1); return 0; } -#ifdef CONFIG_DEBUG_PAGEALLOC -void kernel_map_pages(struct page *page, int numpages, int enable) +static void modify_pud_page(pud_t *pudp, unsigned long addr, + unsigned long flags) { - unsigned long address; - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - int i; + pud_t new = *pudp; - for (i = 0; i < numpages; i++) { - address = page_to_phys(page + i); - pgd = pgd_offset_k(address); - pud = pud_offset(pgd, address); - pmd = pmd_offset(pud, address); - pte = pte_offset_kernel(pmd, address); - if (!enable) { - __ptep_ipte(address, pte); - pte_val(*pte) = _PAGE_TYPE_EMPTY; - continue; + if (flags & SET_MEMORY_RO) + new = pud_wrprotect(new); + else if (flags & SET_MEMORY_RW) + new = pud_mkwrite(pud_mkdirty(new)); + if (flags & SET_MEMORY_NX) + new = set_pud_bit(new, __pgprot(_REGION_ENTRY_NOEXEC)); + else if (flags & SET_MEMORY_X) + new = clear_pud_bit(new, __pgprot(_REGION_ENTRY_NOEXEC)); + if (flags & SET_MEMORY_INV) { + new = set_pud_bit(new, __pgprot(_REGION_ENTRY_INVALID)); + } else if (flags & SET_MEMORY_DEF) { + new = __pud(pud_val(new) & PUD_MASK); + new = set_pud_bit(new, REGION3_KERNEL); + } + pgt_set((unsigned long *)pudp, pud_val(new), addr, CRDTE_DTT_REGION3); +} + +static int walk_pud_level(p4d_t *p4d, unsigned long addr, unsigned long end, + unsigned long flags) +{ + unsigned long next; + int need_split; + pud_t *pudp; + int rc = 0; + + pudp = pud_offset(p4d, addr); + do { + if (pud_none(*pudp)) + return -EINVAL; + next = pud_addr_end(addr, end); + if (pud_leaf(*pudp)) { + need_split = !!(flags & SET_MEMORY_4K); + need_split |= !!(addr & ~PUD_MASK); + need_split |= !!(addr + PUD_SIZE > next); + if (need_split) { + rc = split_pud_page(pudp, addr); + if (rc) + break; + continue; + } + modify_pud_page(pudp, addr, flags); + } else { + rc = walk_pmd_level(pudp, addr, next, flags); } - pte_val(*pte) = __pa(address); + pudp++; + addr = next; + cond_resched(); + } while (addr < end && !rc); + return rc; +} + +static int walk_p4d_level(pgd_t *pgd, unsigned long addr, unsigned long end, + unsigned long flags) +{ + unsigned long next; + p4d_t *p4dp; + int rc = 0; + + p4dp = p4d_offset(pgd, addr); + do { + if (p4d_none(*p4dp)) + return -EINVAL; + next = p4d_addr_end(addr, end); + rc = walk_pud_level(p4dp, addr, next, flags); + p4dp++; + addr = next; + cond_resched(); + } while (addr < end && !rc); + return rc; +} + +DEFINE_MUTEX(cpa_mutex); + +static int change_page_attr(unsigned long addr, unsigned long end, + unsigned long flags) +{ + unsigned long next; + int rc = -EINVAL; + pgd_t *pgdp; + + pgdp = pgd_offset_k(addr); + do { + if (pgd_none(*pgdp)) + break; + next = pgd_addr_end(addr, end); + rc = walk_p4d_level(pgdp, addr, next, flags); + if (rc) + break; + cond_resched(); + } while (pgdp++, addr = next, addr < end && !rc); + return rc; +} + +static int change_page_attr_alias(unsigned long addr, unsigned long end, + unsigned long flags) +{ + unsigned long alias, offset, va_start, va_end; + struct vm_struct *area; + int rc = 0; + + /* + * Changes to read-only permissions on kernel VA mappings are also + * applied to the kernel direct 
mapping. Execute permissions are + * intentionally not transferred to keep all allocated pages within + * the direct mapping non-executable. + */ + flags &= SET_MEMORY_RO | SET_MEMORY_RW; + if (!flags) + return 0; + area = NULL; + while (addr < end) { + if (!area) + area = find_vm_area((void *)addr); + if (!area || !(area->flags & VM_ALLOC)) + return 0; + va_start = (unsigned long)area->addr; + va_end = va_start + area->nr_pages * PAGE_SIZE; + offset = (addr - va_start) >> PAGE_SHIFT; + alias = (unsigned long)page_address(area->pages[offset]); + rc = change_page_attr(alias, alias + PAGE_SIZE, flags); + if (rc) + break; + addr += PAGE_SIZE; + if (addr >= va_end) + area = NULL; } + return rc; +} + +int __set_memory(unsigned long addr, unsigned long numpages, unsigned long flags) +{ + unsigned long end; + int rc; + + if (!cpu_has_nx()) + flags &= ~(SET_MEMORY_NX | SET_MEMORY_X); + if (!flags) + return 0; + if (!numpages) + return 0; + addr &= PAGE_MASK; + end = addr + numpages * PAGE_SIZE; + mutex_lock(&cpa_mutex); + rc = change_page_attr(addr, end, flags); + if (rc) + goto out; + rc = change_page_attr_alias(addr, end, flags); +out: + mutex_unlock(&cpa_mutex); + return rc; +} + +int set_direct_map_invalid_noflush(struct page *page) +{ + return __set_memory((unsigned long)page_to_virt(page), 1, SET_MEMORY_INV); +} + +int set_direct_map_default_noflush(struct page *page) +{ + return __set_memory((unsigned long)page_to_virt(page), 1, SET_MEMORY_DEF); +} + +int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid) +{ + unsigned long flags; + + if (valid) + flags = SET_MEMORY_DEF; + else + flags = SET_MEMORY_INV; + + return __set_memory((unsigned long)page_to_virt(page), nr, flags); } -#ifdef CONFIG_HIBERNATION bool kernel_page_present(struct page *page) { unsigned long addr; - int cc; + unsigned int cc; - addr = page_to_phys(page); + addr = (unsigned long)page_address(page); asm volatile( - " lra %1,0(%1)\n" - " ipm %0\n" - " srl %0,28" - : "=d" (cc), "+a" (addr) : : "cc"); - return cc == 0; + " lra %[addr],0(%[addr])\n" + CC_IPM(cc) + : CC_OUT(cc, cc), [addr] "+a" (addr) + : + : CC_CLOBBER); + return CC_TRANSFORM(cc) == 0; +} + +#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE) + +static void ipte_range(pte_t *pte, unsigned long address, int nr) +{ + int i; + + if (test_facility(13)) { + __ptep_ipte_range(address, nr - 1, pte, IPTE_GLOBAL); + return; + } + for (i = 0; i < nr; i++) { + __ptep_ipte(address, pte, 0, 0, IPTE_GLOBAL); + address += PAGE_SIZE; + pte++; + } +} + +void __kernel_map_pages(struct page *page, int numpages, int enable) +{ + unsigned long address; + pte_t *ptep, pte; + int nr, i, j; + + for (i = 0; i < numpages;) { + address = (unsigned long)page_to_virt(page + i); + ptep = virt_to_kpte(address); + nr = (unsigned long)ptep >> ilog2(sizeof(long)); + nr = PTRS_PER_PTE - (nr & (PTRS_PER_PTE - 1)); + nr = min(numpages - i, nr); + if (enable) { + for (j = 0; j < nr; j++) { + pte = clear_pte_bit(*ptep, __pgprot(_PAGE_INVALID)); + set_pte(ptep, pte); + address += PAGE_SIZE; + ptep++; + } + } else { + ipte_range(ptep, address, nr); + } + i += nr; + } } -#endif /* CONFIG_HIBERNATION */ #endif /* CONFIG_DEBUG_PAGEALLOC */ diff --git a/arch/s390/mm/pfault.c b/arch/s390/mm/pfault.c new file mode 100644 index 000000000000..2f829448c719 --- /dev/null +++ b/arch/s390/mm/pfault.c @@ -0,0 +1,248 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 
1999, 2023 + */ + +#include <linux/cpuhotplug.h> +#include <linux/sched/task.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/irq.h> +#include <asm/asm-extable.h> +#include <asm/asm-offsets.h> +#include <asm/pfault.h> +#include <asm/diag.h> + +#define __SUBCODE_MASK 0x0600 +#define __PF_RES_FIELD 0x8000000000000000UL + +/* + * 'pfault' pseudo page faults routines. + */ +static int pfault_disable; + +static int __init nopfault(char *str) +{ + pfault_disable = 1; + return 1; +} +early_param("nopfault", nopfault); + +struct pfault_refbk { + u16 refdiagc; + u16 reffcode; + u16 refdwlen; + u16 refversn; + u64 refgaddr; + u64 refselmk; + u64 refcmpmk; + u64 reserved; +}; + +static struct pfault_refbk pfault_init_refbk = { + .refdiagc = 0x258, + .reffcode = 0, + .refdwlen = 5, + .refversn = 2, + .refgaddr = __LC_LPP, + .refselmk = 1UL << 48, + .refcmpmk = 1UL << 48, + .reserved = __PF_RES_FIELD +}; + +int __pfault_init(void) +{ + int rc = -EOPNOTSUPP; + + if (pfault_disable) + return rc; + diag_stat_inc(DIAG_STAT_X258); + asm_inline volatile( + " diag %[refbk],%[rc],0x258\n" + "0: nopr %%r7\n" + EX_TABLE(0b, 0b) + : [rc] "+d" (rc) + : [refbk] "a" (&pfault_init_refbk), "m" (pfault_init_refbk) + : "cc"); + return rc; +} + +static struct pfault_refbk pfault_fini_refbk = { + .refdiagc = 0x258, + .reffcode = 1, + .refdwlen = 5, + .refversn = 2, +}; + +void __pfault_fini(void) +{ + if (pfault_disable) + return; + diag_stat_inc(DIAG_STAT_X258); + asm_inline volatile( + " diag %[refbk],0,0x258\n" + "0: nopr %%r7\n" + EX_TABLE(0b, 0b) + : + : [refbk] "a" (&pfault_fini_refbk), "m" (pfault_fini_refbk) + : "cc"); +} + +static DEFINE_SPINLOCK(pfault_lock); +static LIST_HEAD(pfault_list); + +#define PF_COMPLETE 0x0080 + +/* + * The mechanism of our pfault code: if Linux is running as guest, runs a user + * space process and the user space process accesses a page that the host has + * paged out we get a pfault interrupt. + * + * This allows us, within the guest, to schedule a different process. Without + * this mechanism the host would have to suspend the whole virtual cpu until + * the page has been paged in. + * + * So when we get such an interrupt then we set the state of the current task + * to uninterruptible and also set the need_resched flag. Both happens within + * interrupt context(!). If we later on want to return to user space we + * recognize the need_resched flag and then call schedule(). It's not very + * obvious how this works... + * + * Of course we have a lot of additional fun with the completion interrupt (-> + * host signals that a page of a process has been paged in and the process can + * continue to run). This interrupt can arrive on any cpu and, since we have + * virtual cpus, actually appear before the interrupt that signals that a page + * is missing. + */ +static void pfault_interrupt(struct ext_code ext_code, + unsigned int param32, unsigned long param64) +{ + struct task_struct *tsk; + __u16 subcode; + pid_t pid; + + /* + * Get the external interruption subcode & pfault initial/completion + * signal bit. VM stores this in the 'cpu address' field associated + * with the external interrupt. + */ + subcode = ext_code.subcode; + if ((subcode & 0xff00) != __SUBCODE_MASK) + return; + inc_irq_stat(IRQEXT_PFL); + /* Get the token (= pid of the affected task). 
*/ + pid = param64 & LPP_PID_MASK; + rcu_read_lock(); + tsk = find_task_by_pid_ns(pid, &init_pid_ns); + if (tsk) + get_task_struct(tsk); + rcu_read_unlock(); + if (!tsk) + return; + spin_lock(&pfault_lock); + if (subcode & PF_COMPLETE) { + /* signal bit is set -> a page has been swapped in by VM */ + if (tsk->thread.pfault_wait == 1) { + /* + * Initial interrupt was faster than the completion + * interrupt. pfault_wait is valid. Set pfault_wait + * back to zero and wake up the process. This can + * safely be done because the task is still sleeping + * and can't produce new pfaults. + */ + tsk->thread.pfault_wait = 0; + list_del(&tsk->thread.list); + wake_up_process(tsk); + put_task_struct(tsk); + } else { + /* + * Completion interrupt was faster than initial + * interrupt. Set pfault_wait to -1 so the initial + * interrupt doesn't put the task to sleep. + * If the task is not running, ignore the completion + * interrupt since it must be a leftover of a PFAULT + * CANCEL operation which didn't remove all pending + * completion interrupts. + */ + if (task_is_running(tsk)) + tsk->thread.pfault_wait = -1; + } + } else { + /* signal bit not set -> a real page is missing. */ + if (WARN_ON_ONCE(tsk != current)) + goto out; + if (tsk->thread.pfault_wait == 1) { + /* Already on the list with a reference: put to sleep */ + goto block; + } else if (tsk->thread.pfault_wait == -1) { + /* + * Completion interrupt was faster than the initial + * interrupt (pfault_wait == -1). Set pfault_wait + * back to zero and exit. + */ + tsk->thread.pfault_wait = 0; + } else { + /* + * Initial interrupt arrived before completion + * interrupt. Let the task sleep. + * An extra task reference is needed since a different + * cpu may set the task state to TASK_RUNNING again + * before the scheduler is reached. + */ + get_task_struct(tsk); + tsk->thread.pfault_wait = 1; + list_add(&tsk->thread.list, &pfault_list); +block: + /* + * Since this must be a userspace fault, there + * is no kernel task state to trample. Rely on the + * return to userspace schedule() to block. + */ + __set_current_state(TASK_UNINTERRUPTIBLE); + set_need_resched_current(); + } + } +out: + spin_unlock(&pfault_lock); + put_task_struct(tsk); +} + +static int pfault_cpu_dead(unsigned int cpu) +{ + struct thread_struct *thread, *next; + struct task_struct *tsk; + + spin_lock_irq(&pfault_lock); + list_for_each_entry_safe(thread, next, &pfault_list, list) { + thread->pfault_wait = 0; + list_del(&thread->list); + tsk = container_of(thread, struct task_struct, thread); + wake_up_process(tsk); + put_task_struct(tsk); + } + spin_unlock_irq(&pfault_lock); + return 0; +} + +static int __init pfault_irq_init(void) +{ + int rc; + + rc = register_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt); + if (rc) + goto out_extint; + rc = pfault_init() == 0 ? 0 : -EOPNOTSUPP; + if (rc) + goto out_pfault; + irq_subclass_register(IRQ_SUBCLASS_SERVICE_SIGNAL); + cpuhp_setup_state_nocalls(CPUHP_S390_PFAULT_DEAD, "s390/pfault:dead", + NULL, pfault_cpu_dead); + return 0; + +out_pfault: + unregister_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt); +out_extint: + pfault_disable = 1; + return rc; +} +early_initcall(pfault_irq_init); diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c new file mode 100644 index 000000000000..7df23528c01b --- /dev/null +++ b/arch/s390/mm/pgalloc.c @@ -0,0 +1,481 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Page table allocation functions + * + * Copyright IBM Corp. 
2016 + * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> + */ + +#include <linux/sysctl.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <asm/mmu_context.h> +#include <asm/page-states.h> +#include <asm/pgalloc.h> +#include <asm/tlbflush.h> + +unsigned long *crst_table_alloc_noprof(struct mm_struct *mm) +{ + gfp_t gfp = GFP_KERNEL_ACCOUNT; + struct ptdesc *ptdesc; + unsigned long *table; + + if (mm == &init_mm) + gfp &= ~__GFP_ACCOUNT; + ptdesc = pagetable_alloc_noprof(gfp, CRST_ALLOC_ORDER); + if (!ptdesc) + return NULL; + table = ptdesc_address(ptdesc); + __arch_set_page_dat(table, 1UL << CRST_ALLOC_ORDER); + return table; +} + +void crst_table_free(struct mm_struct *mm, unsigned long *table) +{ + if (!table) + return; + pagetable_free(virt_to_ptdesc(table)); +} + +static void __crst_table_upgrade(void *arg) +{ + struct mm_struct *mm = arg; + struct ctlreg asce; + + /* change all active ASCEs to avoid the creation of new TLBs */ + if (current->active_mm == mm) { + asce.val = mm->context.asce; + get_lowcore()->user_asce = asce; + local_ctl_load(7, &asce); + if (!test_thread_flag(TIF_ASCE_PRIMARY)) + local_ctl_load(1, &asce); + } + __tlb_flush_local(); +} + +int crst_table_upgrade(struct mm_struct *mm, unsigned long end) +{ + unsigned long *pgd = NULL, *p4d = NULL, *__pgd; + unsigned long asce_limit = mm->context.asce_limit; + + mmap_assert_write_locked(mm); + + /* upgrade should only happen from 3 to 4, 3 to 5, or 4 to 5 levels */ + VM_BUG_ON(asce_limit < _REGION2_SIZE); + + if (end <= asce_limit) + return 0; + + if (asce_limit == _REGION2_SIZE) { + p4d = crst_table_alloc(mm); + if (unlikely(!p4d)) + goto err_p4d; + crst_table_init(p4d, _REGION2_ENTRY_EMPTY); + pagetable_p4d_ctor(virt_to_ptdesc(p4d)); + } + if (end > _REGION1_SIZE) { + pgd = crst_table_alloc(mm); + if (unlikely(!pgd)) + goto err_pgd; + crst_table_init(pgd, _REGION1_ENTRY_EMPTY); + pagetable_pgd_ctor(virt_to_ptdesc(pgd)); + } + + spin_lock_bh(&mm->page_table_lock); + + if (p4d) { + __pgd = (unsigned long *) mm->pgd; + p4d_populate(mm, (p4d_t *) p4d, (pud_t *) __pgd); + mm->pgd = (pgd_t *) p4d; + mm->context.asce_limit = _REGION1_SIZE; + mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | + _ASCE_USER_BITS | _ASCE_TYPE_REGION2; + mm_inc_nr_puds(mm); + } + if (pgd) { + __pgd = (unsigned long *) mm->pgd; + pgd_populate(mm, (pgd_t *) pgd, (p4d_t *) __pgd); + mm->pgd = (pgd_t *) pgd; + mm->context.asce_limit = TASK_SIZE_MAX; + mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | + _ASCE_USER_BITS | _ASCE_TYPE_REGION1; + } + + spin_unlock_bh(&mm->page_table_lock); + + on_each_cpu(__crst_table_upgrade, mm, 0); + + return 0; + +err_pgd: + pagetable_dtor(virt_to_ptdesc(p4d)); + crst_table_free(mm, p4d); +err_p4d: + return -ENOMEM; +} + +#ifdef CONFIG_PGSTE + +struct ptdesc *page_table_alloc_pgste_noprof(struct mm_struct *mm) +{ + struct ptdesc *ptdesc; + u64 *table; + + ptdesc = pagetable_alloc_noprof(GFP_KERNEL_ACCOUNT, 0); + if (ptdesc) { + table = (u64 *)ptdesc_address(ptdesc); + __arch_set_page_dat(table, 1); + memset64(table, _PAGE_INVALID, PTRS_PER_PTE); + memset64(table + PTRS_PER_PTE, 0, PTRS_PER_PTE); + } + return ptdesc; +} + +void page_table_free_pgste(struct ptdesc *ptdesc) +{ + pagetable_free(ptdesc); +} + +#endif /* CONFIG_PGSTE */ + +unsigned long *page_table_alloc_noprof(struct mm_struct *mm) +{ + gfp_t gfp = GFP_KERNEL_ACCOUNT; + struct ptdesc *ptdesc; + unsigned long *table; + + if (mm == &init_mm) + gfp &= ~__GFP_ACCOUNT; + ptdesc = pagetable_alloc_noprof(gfp, 0); + if (!ptdesc) + return 
NULL; + if (!pagetable_pte_ctor(mm, ptdesc)) { + pagetable_free(ptdesc); + return NULL; + } + table = ptdesc_address(ptdesc); + __arch_set_page_dat(table, 1); + memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE); + memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE); + return table; +} + +void page_table_free(struct mm_struct *mm, unsigned long *table) +{ + struct ptdesc *ptdesc = virt_to_ptdesc(table); + + if (pagetable_is_reserved(ptdesc)) + return free_reserved_ptdesc(ptdesc); + pagetable_dtor_free(ptdesc); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static void pte_free_now(struct rcu_head *head) +{ + struct ptdesc *ptdesc = container_of(head, struct ptdesc, pt_rcu_head); + + pagetable_dtor_free(ptdesc); +} + +void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable) +{ + struct ptdesc *ptdesc = virt_to_ptdesc(pgtable); + + call_rcu(&ptdesc->pt_rcu_head, pte_free_now); +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +/* + * Base infrastructure required to generate basic asces, region, segment, + * and page tables that do not make use of enhanced features like EDAT1. + */ + +static struct kmem_cache *base_pgt_cache; + +static unsigned long *base_pgt_alloc(void) +{ + unsigned long *table; + + table = kmem_cache_alloc(base_pgt_cache, GFP_KERNEL); + if (table) + memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE); + return table; +} + +static void base_pgt_free(unsigned long *table) +{ + kmem_cache_free(base_pgt_cache, table); +} + +static unsigned long *base_crst_alloc(unsigned long val) +{ + unsigned long *table; + struct ptdesc *ptdesc; + + ptdesc = pagetable_alloc(GFP_KERNEL, CRST_ALLOC_ORDER); + if (!ptdesc) + return NULL; + table = ptdesc_address(ptdesc); + crst_table_init(table, val); + return table; +} + +static void base_crst_free(unsigned long *table) +{ + if (!table) + return; + pagetable_free(virt_to_ptdesc(table)); +} + +#define BASE_ADDR_END_FUNC(NAME, SIZE) \ +static inline unsigned long base_##NAME##_addr_end(unsigned long addr, \ + unsigned long end) \ +{ \ + unsigned long next = (addr + (SIZE)) & ~((SIZE) - 1); \ + \ + return (next - 1) < (end - 1) ? 
next : end; \ +} + +BASE_ADDR_END_FUNC(page, PAGE_SIZE) +BASE_ADDR_END_FUNC(segment, _SEGMENT_SIZE) +BASE_ADDR_END_FUNC(region3, _REGION3_SIZE) +BASE_ADDR_END_FUNC(region2, _REGION2_SIZE) +BASE_ADDR_END_FUNC(region1, _REGION1_SIZE) + +static inline unsigned long base_lra(unsigned long address) +{ + unsigned long real; + + asm volatile( + " lra %0,0(%1)" + : "=d" (real) : "a" (address) : "cc"); + return real; +} + +static int base_page_walk(unsigned long *origin, unsigned long addr, + unsigned long end, int alloc) +{ + unsigned long *pte, next; + + if (!alloc) + return 0; + pte = origin; + pte += (addr & _PAGE_INDEX) >> PAGE_SHIFT; + do { + next = base_page_addr_end(addr, end); + *pte = base_lra(addr); + } while (pte++, addr = next, addr < end); + return 0; +} + +static int base_segment_walk(unsigned long *origin, unsigned long addr, + unsigned long end, int alloc) +{ + unsigned long *ste, next, *table; + int rc; + + ste = origin; + ste += (addr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; + do { + next = base_segment_addr_end(addr, end); + if (*ste & _SEGMENT_ENTRY_INVALID) { + if (!alloc) + continue; + table = base_pgt_alloc(); + if (!table) + return -ENOMEM; + *ste = __pa(table) | _SEGMENT_ENTRY; + } + table = __va(*ste & _SEGMENT_ENTRY_ORIGIN); + rc = base_page_walk(table, addr, next, alloc); + if (rc) + return rc; + if (!alloc) + base_pgt_free(table); + cond_resched(); + } while (ste++, addr = next, addr < end); + return 0; +} + +static int base_region3_walk(unsigned long *origin, unsigned long addr, + unsigned long end, int alloc) +{ + unsigned long *rtte, next, *table; + int rc; + + rtte = origin; + rtte += (addr & _REGION3_INDEX) >> _REGION3_SHIFT; + do { + next = base_region3_addr_end(addr, end); + if (*rtte & _REGION_ENTRY_INVALID) { + if (!alloc) + continue; + table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY); + if (!table) + return -ENOMEM; + *rtte = __pa(table) | _REGION3_ENTRY; + } + table = __va(*rtte & _REGION_ENTRY_ORIGIN); + rc = base_segment_walk(table, addr, next, alloc); + if (rc) + return rc; + if (!alloc) + base_crst_free(table); + } while (rtte++, addr = next, addr < end); + return 0; +} + +static int base_region2_walk(unsigned long *origin, unsigned long addr, + unsigned long end, int alloc) +{ + unsigned long *rste, next, *table; + int rc; + + rste = origin; + rste += (addr & _REGION2_INDEX) >> _REGION2_SHIFT; + do { + next = base_region2_addr_end(addr, end); + if (*rste & _REGION_ENTRY_INVALID) { + if (!alloc) + continue; + table = base_crst_alloc(_REGION3_ENTRY_EMPTY); + if (!table) + return -ENOMEM; + *rste = __pa(table) | _REGION2_ENTRY; + } + table = __va(*rste & _REGION_ENTRY_ORIGIN); + rc = base_region3_walk(table, addr, next, alloc); + if (rc) + return rc; + if (!alloc) + base_crst_free(table); + } while (rste++, addr = next, addr < end); + return 0; +} + +static int base_region1_walk(unsigned long *origin, unsigned long addr, + unsigned long end, int alloc) +{ + unsigned long *rfte, next, *table; + int rc; + + rfte = origin; + rfte += (addr & _REGION1_INDEX) >> _REGION1_SHIFT; + do { + next = base_region1_addr_end(addr, end); + if (*rfte & _REGION_ENTRY_INVALID) { + if (!alloc) + continue; + table = base_crst_alloc(_REGION2_ENTRY_EMPTY); + if (!table) + return -ENOMEM; + *rfte = __pa(table) | _REGION1_ENTRY; + } + table = __va(*rfte & _REGION_ENTRY_ORIGIN); + rc = base_region2_walk(table, addr, next, alloc); + if (rc) + return rc; + if (!alloc) + base_crst_free(table); + } while (rfte++, addr = next, addr < end); + return 0; +} + +/** + * base_asce_free - free asce 
and tables returned from base_asce_alloc() + * @asce: asce to be freed + * + * Frees all region, segment, and page tables that were allocated with a + * corresponding base_asce_alloc() call. + */ +void base_asce_free(unsigned long asce) +{ + unsigned long *table = __va(asce & _ASCE_ORIGIN); + + if (!asce) + return; + switch (asce & _ASCE_TYPE_MASK) { + case _ASCE_TYPE_SEGMENT: + base_segment_walk(table, 0, _REGION3_SIZE, 0); + break; + case _ASCE_TYPE_REGION3: + base_region3_walk(table, 0, _REGION2_SIZE, 0); + break; + case _ASCE_TYPE_REGION2: + base_region2_walk(table, 0, _REGION1_SIZE, 0); + break; + case _ASCE_TYPE_REGION1: + base_region1_walk(table, 0, TASK_SIZE_MAX, 0); + break; + } + base_crst_free(table); +} + +static int base_pgt_cache_init(void) +{ + static DEFINE_MUTEX(base_pgt_cache_mutex); + unsigned long sz = _PAGE_TABLE_SIZE; + + if (base_pgt_cache) + return 0; + mutex_lock(&base_pgt_cache_mutex); + if (!base_pgt_cache) + base_pgt_cache = kmem_cache_create("base_pgt", sz, sz, 0, NULL); + mutex_unlock(&base_pgt_cache_mutex); + return base_pgt_cache ? 0 : -ENOMEM; +} + +/** + * base_asce_alloc - create kernel mapping without enhanced DAT features + * @addr: virtual start address of kernel mapping + * @num_pages: number of consecutive pages + * + * Generate an asce, including all required region, segment and page tables, + * that can be used to access the virtual kernel mapping. The difference is + * that the returned asce does not make use of any enhanced DAT features like + * e.g. large pages. This is required for some I/O functions that pass an + * asce, like e.g. some service call requests. + * + * Note: the returned asce may NEVER be attached to any cpu. It may only be + * used for I/O requests. tlb entries that might result because the + * asce was attached to a cpu won't be cleared. + */ +unsigned long base_asce_alloc(unsigned long addr, unsigned long num_pages) +{ + unsigned long asce, *table, end; + int rc; + + if (base_pgt_cache_init()) + return 0; + end = addr + num_pages * PAGE_SIZE; + if (end <= _REGION3_SIZE) { + table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY); + if (!table) + return 0; + rc = base_segment_walk(table, addr, end, 1); + asce = __pa(table) | _ASCE_TYPE_SEGMENT | _ASCE_TABLE_LENGTH; + } else if (end <= _REGION2_SIZE) { + table = base_crst_alloc(_REGION3_ENTRY_EMPTY); + if (!table) + return 0; + rc = base_region3_walk(table, addr, end, 1); + asce = __pa(table) | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH; + } else if (end <= _REGION1_SIZE) { + table = base_crst_alloc(_REGION2_ENTRY_EMPTY); + if (!table) + return 0; + rc = base_region2_walk(table, addr, end, 1); + asce = __pa(table) | _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH; + } else { + table = base_crst_alloc(_REGION1_ENTRY_EMPTY); + if (!table) + return 0; + rc = base_region1_walk(table, addr, end, 1); + asce = __pa(table) | _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH; + } + if (rc) { + base_asce_free(asce); + asce = 0; + } + return asce; +} diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index a8154a1a2c94..666adcd681ab 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -1,8 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright IBM Corp. 
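To make the calling pattern described in the base_asce_alloc()/base_asce_free() comments concrete, here is a hedged sketch (not part of the patch); the buffer and the service-call wrapper are hypothetical placeholders.

/*
 * Sketch only, not from this patch: build a non-EDAT asce over an existing
 * kernel buffer, hand it to an I/O service call, then free it again.
 * "buf" and "issue_service_call()" are hypothetical.
 */
static int do_base_asce_request(void *buf, unsigned long num_pages)
{
	unsigned long asce;
	int rc;

	asce = base_asce_alloc((unsigned long)buf, num_pages);
	if (!asce)
		return -ENOMEM;
	rc = issue_service_call(asce, (unsigned long)buf);
	base_asce_free(asce);	/* tears down all region/segment/page tables */
	return rc;
}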
2007, 2011 * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> */ +#include <linux/cpufeature.h> +#include <linux/export.h> #include <linux/sched.h> #include <linux/kernel.h> #include <linux/errno.h> @@ -10,1197 +13,1143 @@ #include <linux/mm.h> #include <linux/swap.h> #include <linux/smp.h> -#include <linux/highmem.h> -#include <linux/pagemap.h> #include <linux/spinlock.h> -#include <linux/module.h> -#include <linux/quicklist.h> #include <linux/rcupdate.h> #include <linux/slab.h> +#include <linux/leafops.h> +#include <linux/sysctl.h> +#include <linux/ksm.h> +#include <linux/mman.h> -#include <asm/pgtable.h> -#include <asm/pgalloc.h> -#include <asm/tlb.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> +#include <asm/page-states.h> +#include <asm/pgtable.h> +#include <asm/machine.h> -#ifndef CONFIG_64BIT -#define ALLOC_ORDER 1 -#define FRAG_MASK 0x0f -#else -#define ALLOC_ORDER 2 -#define FRAG_MASK 0x03 -#endif - - -unsigned long *crst_table_alloc(struct mm_struct *mm) +pgprot_t pgprot_writecombine(pgprot_t prot) { - struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER); - - if (!page) - return NULL; - return (unsigned long *) page_to_phys(page); + /* + * mio_wb_bit_mask may be set on a different CPU, but it is only set + * once at init and only read afterwards. + */ + return __pgprot(pgprot_val(prot) | mio_wb_bit_mask); } +EXPORT_SYMBOL_GPL(pgprot_writecombine); -void crst_table_free(struct mm_struct *mm, unsigned long *table) +static inline void ptep_ipte_local(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, int nodat) { - free_pages((unsigned long) table, ALLOC_ORDER); + unsigned long opt, asce; + + if (machine_has_tlb_guest()) { + opt = 0; + asce = READ_ONCE(mm->context.gmap_asce); + if (asce == 0UL || nodat) + opt |= IPTE_NODAT; + if (asce != -1UL) { + asce = asce ? : mm->context.asce; + opt |= IPTE_GUEST_ASCE; + } + __ptep_ipte(addr, ptep, opt, asce, IPTE_LOCAL); + } else { + __ptep_ipte(addr, ptep, 0, 0, IPTE_LOCAL); + } } -#ifdef CONFIG_64BIT -int crst_table_upgrade(struct mm_struct *mm, unsigned long limit) +static inline void ptep_ipte_global(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, int nodat) { - unsigned long *table, *pgd; - unsigned long entry; - - BUG_ON(limit > (1UL << 53)); -repeat: - table = crst_table_alloc(mm); - if (!table) - return -ENOMEM; - spin_lock_bh(&mm->page_table_lock); - if (mm->context.asce_limit < limit) { - pgd = (unsigned long *) mm->pgd; - if (mm->context.asce_limit <= (1UL << 31)) { - entry = _REGION3_ENTRY_EMPTY; - mm->context.asce_limit = 1UL << 42; - mm->context.asce_bits = _ASCE_TABLE_LENGTH | - _ASCE_USER_BITS | - _ASCE_TYPE_REGION3; - } else { - entry = _REGION2_ENTRY_EMPTY; - mm->context.asce_limit = 1UL << 53; - mm->context.asce_bits = _ASCE_TABLE_LENGTH | - _ASCE_USER_BITS | - _ASCE_TYPE_REGION2; + unsigned long opt, asce; + + if (machine_has_tlb_guest()) { + opt = 0; + asce = READ_ONCE(mm->context.gmap_asce); + if (asce == 0UL || nodat) + opt |= IPTE_NODAT; + if (asce != -1UL) { + asce = asce ? 
: mm->context.asce; + opt |= IPTE_GUEST_ASCE; } - crst_table_init(table, entry); - pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd); - mm->pgd = (pgd_t *) table; - mm->task_size = mm->context.asce_limit; - table = NULL; + __ptep_ipte(addr, ptep, opt, asce, IPTE_GLOBAL); + } else { + __ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL); } - spin_unlock_bh(&mm->page_table_lock); - if (table) - crst_table_free(mm, table); - if (mm->context.asce_limit < limit) - goto repeat; - return 0; } -void crst_table_downgrade(struct mm_struct *mm, unsigned long limit) +static inline pte_t ptep_flush_direct(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + int nodat) { - pgd_t *pgd; + pte_t old; + + old = *ptep; + if (unlikely(pte_val(old) & _PAGE_INVALID)) + return old; + atomic_inc(&mm->context.flush_count); + if (cpu_has_tlb_lc() && + cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) + ptep_ipte_local(mm, addr, ptep, nodat); + else + ptep_ipte_global(mm, addr, ptep, nodat); + atomic_dec(&mm->context.flush_count); + return old; +} - while (mm->context.asce_limit > limit) { - pgd = mm->pgd; - switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) { - case _REGION_ENTRY_TYPE_R2: - mm->context.asce_limit = 1UL << 42; - mm->context.asce_bits = _ASCE_TABLE_LENGTH | - _ASCE_USER_BITS | - _ASCE_TYPE_REGION3; - break; - case _REGION_ENTRY_TYPE_R3: - mm->context.asce_limit = 1UL << 31; - mm->context.asce_bits = _ASCE_TABLE_LENGTH | - _ASCE_USER_BITS | - _ASCE_TYPE_SEGMENT; - break; - default: - BUG(); - } - mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN); - mm->task_size = mm->context.asce_limit; - crst_table_free(mm, (unsigned long *) pgd); - } +static inline pte_t ptep_flush_lazy(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + int nodat) +{ + pte_t old; + + old = *ptep; + if (unlikely(pte_val(old) & _PAGE_INVALID)) + return old; + atomic_inc(&mm->context.flush_count); + if (cpumask_equal(&mm->context.cpu_attach_mask, + cpumask_of(smp_processor_id()))) { + set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_INVALID))); + mm->context.flush_mm = 1; + } else + ptep_ipte_global(mm, addr, ptep, nodat); + atomic_dec(&mm->context.flush_count); + return old; } + +static inline pgste_t pgste_get(pte_t *ptep) +{ + unsigned long pgste = 0; +#ifdef CONFIG_PGSTE + pgste = *(unsigned long *)(ptep + PTRS_PER_PTE); #endif + return __pgste(pgste); +} +static inline void pgste_set(pte_t *ptep, pgste_t pgste) +{ #ifdef CONFIG_PGSTE + *(pgste_t *)(ptep + PTRS_PER_PTE) = pgste; +#endif +} -/** - * gmap_alloc - allocate a guest address space - * @mm: pointer to the parent mm_struct - * - * Returns a guest address space structure. 
- */ -struct gmap *gmap_alloc(struct mm_struct *mm) +static inline pgste_t pgste_update_all(pte_t pte, pgste_t pgste, + struct mm_struct *mm) { - struct gmap *gmap; - struct page *page; - unsigned long *table; - - gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL); - if (!gmap) - goto out; - INIT_LIST_HEAD(&gmap->crst_list); - gmap->mm = mm; - page = alloc_pages(GFP_KERNEL, ALLOC_ORDER); - if (!page) - goto out_free; - list_add(&page->lru, &gmap->crst_list); - table = (unsigned long *) page_to_phys(page); - crst_table_init(table, _REGION1_ENTRY_EMPTY); - gmap->table = table; - gmap->asce = _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH | - _ASCE_USER_BITS | __pa(table); - list_add(&gmap->list, &mm->context.gmap_list); - return gmap; - -out_free: - kfree(gmap); -out: - return NULL; +#ifdef CONFIG_PGSTE + unsigned long address, bits, skey; + + if (!mm_uses_skeys(mm) || pte_val(pte) & _PAGE_INVALID) + return pgste; + address = pte_val(pte) & PAGE_MASK; + skey = (unsigned long) page_get_storage_key(address); + bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED); + /* Transfer page changed & referenced bit to guest bits in pgste */ + pgste = set_pgste_bit(pgste, bits << 48); /* GR bit & GC bit */ + /* Copy page access key and fetch protection bit to pgste */ + pgste = clear_pgste_bit(pgste, PGSTE_ACC_BITS | PGSTE_FP_BIT); + pgste = set_pgste_bit(pgste, (skey & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56); +#endif + return pgste; + } -EXPORT_SYMBOL_GPL(gmap_alloc); -static int gmap_unlink_segment(struct gmap *gmap, unsigned long *table) +static inline void pgste_set_key(pte_t *ptep, pgste_t pgste, pte_t entry, + struct mm_struct *mm) { - struct gmap_pgtable *mp; - struct gmap_rmap *rmap; - struct page *page; +#ifdef CONFIG_PGSTE + unsigned long address; + unsigned long nkey; - if (*table & _SEGMENT_ENTRY_INV) - return 0; - page = pfn_to_page(*table >> PAGE_SHIFT); - mp = (struct gmap_pgtable *) page->index; - list_for_each_entry(rmap, &mp->mapper, list) { - if (rmap->entry != table) - continue; - list_del(&rmap->list); - kfree(rmap); - break; - } - *table = _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | mp->vmaddr; - return 1; + if (!mm_uses_skeys(mm) || pte_val(entry) & _PAGE_INVALID) + return; + VM_BUG_ON(!(pte_val(*ptep) & _PAGE_INVALID)); + address = pte_val(entry) & PAGE_MASK; + /* + * Set page access key and fetch protection bit from pgste. + * The guest C/R information is still in the PGSTE, set real + * key C/R to 0. + */ + nkey = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56; + nkey |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48; + page_set_storage_key(address, nkey, 0); +#endif } -static void gmap_flush_tlb(struct gmap *gmap) +static inline pgste_t pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry) { - if (MACHINE_HAS_IDTE) - __tlb_flush_idte((unsigned long) gmap->table | - _ASCE_TYPE_REGION1); - else - __tlb_flush_global(); +#ifdef CONFIG_PGSTE + if ((pte_val(entry) & _PAGE_PRESENT) && + (pte_val(entry) & _PAGE_WRITE) && + !(pte_val(entry) & _PAGE_INVALID)) { + if (!machine_has_esop()) { + /* + * Without enhanced suppression-on-protection force + * the dirty bit on for all writable ptes. 
+ */ + entry = set_pte_bit(entry, __pgprot(_PAGE_DIRTY)); + entry = clear_pte_bit(entry, __pgprot(_PAGE_PROTECT)); + } + if (!(pte_val(entry) & _PAGE_PROTECT)) + /* This pte allows write access, set user-dirty */ + pgste = set_pgste_bit(pgste, PGSTE_UC_BIT); + } +#endif + set_pte(ptep, entry); + return pgste; } -/** - * gmap_free - free a guest address space - * @gmap: pointer to the guest address space structure - */ -void gmap_free(struct gmap *gmap) +static inline pgste_t pgste_pte_notify(struct mm_struct *mm, + unsigned long addr, + pte_t *ptep, pgste_t pgste) { - struct page *page, *next; - unsigned long *table; - int i; +#ifdef CONFIG_PGSTE + unsigned long bits; + + bits = pgste_val(pgste) & (PGSTE_IN_BIT | PGSTE_VSIE_BIT); + if (bits) { + pgste = __pgste(pgste_val(pgste) ^ bits); + ptep_notify(mm, addr, ptep, bits); + } +#endif + return pgste; +} +static inline pgste_t ptep_xchg_start(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + pgste_t pgste = __pgste(0); - /* Flush tlb. */ - if (MACHINE_HAS_IDTE) - __tlb_flush_idte((unsigned long) gmap->table | - _ASCE_TYPE_REGION1); - else - __tlb_flush_global(); - - /* Free all segment & region tables. */ - down_read(&gmap->mm->mmap_sem); - spin_lock(&gmap->mm->page_table_lock); - list_for_each_entry_safe(page, next, &gmap->crst_list, lru) { - table = (unsigned long *) page_to_phys(page); - if ((*table & _REGION_ENTRY_TYPE_MASK) == 0) - /* Remove gmap rmap structures for segment table. */ - for (i = 0; i < PTRS_PER_PMD; i++, table++) - gmap_unlink_segment(gmap, table); - __free_pages(page, ALLOC_ORDER); + if (mm_has_pgste(mm)) { + pgste = pgste_get_lock(ptep); + pgste = pgste_pte_notify(mm, addr, ptep, pgste); } - spin_unlock(&gmap->mm->page_table_lock); - up_read(&gmap->mm->mmap_sem); - list_del(&gmap->list); - kfree(gmap); + return pgste; } -EXPORT_SYMBOL_GPL(gmap_free); -/** - * gmap_enable - switch primary space to the guest address space - * @gmap: pointer to the guest address space structure - */ -void gmap_enable(struct gmap *gmap) +static inline pte_t ptep_xchg_commit(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + pgste_t pgste, pte_t old, pte_t new) { - S390_lowcore.gmap = (unsigned long) gmap; + if (mm_has_pgste(mm)) { + if (pte_val(old) & _PAGE_INVALID) + pgste_set_key(ptep, pgste, new, mm); + if (pte_val(new) & _PAGE_INVALID) { + pgste = pgste_update_all(old, pgste, mm); + if ((pgste_val(pgste) & _PGSTE_GPS_USAGE_MASK) == + _PGSTE_GPS_USAGE_UNUSED) + old = set_pte_bit(old, __pgprot(_PAGE_UNUSED)); + } + pgste = pgste_set_pte(ptep, pgste, new); + pgste_set_unlock(ptep, pgste); + } else { + set_pte(ptep, new); + } + return old; } -EXPORT_SYMBOL_GPL(gmap_enable); -/** - * gmap_disable - switch back to the standard primary address space - * @gmap: pointer to the guest address space structure - */ -void gmap_disable(struct gmap *gmap) +pte_t ptep_xchg_direct(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t new) { - S390_lowcore.gmap = 0UL; + pgste_t pgste; + pte_t old; + int nodat; + + preempt_disable(); + pgste = ptep_xchg_start(mm, addr, ptep); + nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); + old = ptep_flush_direct(mm, addr, ptep, nodat); + old = ptep_xchg_commit(mm, addr, ptep, pgste, old, new); + preempt_enable(); + return old; } -EXPORT_SYMBOL_GPL(gmap_disable); +EXPORT_SYMBOL(ptep_xchg_direct); /* - * gmap_alloc_table is assumed to be called with mmap_sem held + * Caller must check that new PTE only differs in _PAGE_PROTECT HW bit, so that + * RDP can be used instead of IPTE. 
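A short illustrative sketch (not part of the patch) of exchanging a pte through ptep_xchg_direct(); ptep_xchg_lazy() is the drop-in alternative when a deferred flush is acceptable. The wrapper name is hypothetical and the caller is assumed to hold the page table lock.

/*
 * Sketch only, not from this patch. ptep_xchg_direct() flushes the TLB
 * entry immediately; ptep_xchg_lazy() may defer the flush while only the
 * local CPU has the mm attached.
 */
static pte_t invalidate_one_pte(struct mm_struct *mm, unsigned long addr,
				pte_t *ptep)
{
	/* returns the old pte, including its dirty/referenced state */
	return ptep_xchg_direct(mm, addr, ptep, __pte(_PAGE_INVALID));
}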
See also comments at pte_allow_rdp(). */ -static int gmap_alloc_table(struct gmap *gmap, - unsigned long *table, unsigned long init) +void ptep_reset_dat_prot(struct mm_struct *mm, unsigned long addr, pte_t *ptep, + pte_t new) { - struct page *page; - unsigned long *new; - - /* since we dont free the gmap table until gmap_free we can unlock */ - spin_unlock(&gmap->mm->page_table_lock); - page = alloc_pages(GFP_KERNEL, ALLOC_ORDER); - spin_lock(&gmap->mm->page_table_lock); - if (!page) - return -ENOMEM; - new = (unsigned long *) page_to_phys(page); - crst_table_init(new, init); - if (*table & _REGION_ENTRY_INV) { - list_add(&page->lru, &gmap->crst_list); - *table = (unsigned long) new | _REGION_ENTRY_LENGTH | - (*table & _REGION_ENTRY_TYPE_MASK); - } else - __free_pages(page, ALLOC_ORDER); - return 0; + preempt_disable(); + atomic_inc(&mm->context.flush_count); + if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) + __ptep_rdp(addr, ptep, 1); + else + __ptep_rdp(addr, ptep, 0); + /* + * PTE is not invalidated by RDP, only _PAGE_PROTECT is cleared. That + * means it is still valid and active, and must not be changed according + * to the architecture. But writing a new value that only differs in SW + * bits is allowed. + */ + set_pte(ptep, new); + atomic_dec(&mm->context.flush_count); + preempt_enable(); } +EXPORT_SYMBOL(ptep_reset_dat_prot); -/** - * gmap_unmap_segment - unmap segment from the guest address space - * @gmap: pointer to the guest address space structure - * @addr: address in the guest address space - * @len: length of the memory area to unmap - * - * Returns 0 if the unmap succeded, -EINVAL if not. - */ -int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len) +pte_t ptep_xchg_lazy(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t new) { - unsigned long *table; - unsigned long off; - int flush; + pgste_t pgste; + pte_t old; + int nodat; - if ((to | len) & (PMD_SIZE - 1)) - return -EINVAL; - if (len == 0 || to + len < to) - return -EINVAL; + preempt_disable(); + pgste = ptep_xchg_start(mm, addr, ptep); + nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); + old = ptep_flush_lazy(mm, addr, ptep, nodat); + old = ptep_xchg_commit(mm, addr, ptep, pgste, old, new); + preempt_enable(); + return old; +} +EXPORT_SYMBOL(ptep_xchg_lazy); - flush = 0; - down_read(&gmap->mm->mmap_sem); - spin_lock(&gmap->mm->page_table_lock); - for (off = 0; off < len; off += PMD_SIZE) { - /* Walk the guest addr space page table */ - table = gmap->table + (((to + off) >> 53) & 0x7ff); - if (*table & _REGION_ENTRY_INV) - goto out; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - table = table + (((to + off) >> 42) & 0x7ff); - if (*table & _REGION_ENTRY_INV) - goto out; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - table = table + (((to + off) >> 31) & 0x7ff); - if (*table & _REGION_ENTRY_INV) - goto out; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - table = table + (((to + off) >> 20) & 0x7ff); - - /* Clear segment table entry in guest address space. 
*/ - flush |= gmap_unlink_segment(gmap, table); - *table = _SEGMENT_ENTRY_INV; +pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, + pte_t *ptep) +{ + pgste_t pgste; + pte_t old; + int nodat; + struct mm_struct *mm = vma->vm_mm; + + pgste = ptep_xchg_start(mm, addr, ptep); + nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); + old = ptep_flush_lazy(mm, addr, ptep, nodat); + if (mm_has_pgste(mm)) { + pgste = pgste_update_all(old, pgste, mm); + pgste_set(ptep, pgste); } -out: - spin_unlock(&gmap->mm->page_table_lock); - up_read(&gmap->mm->mmap_sem); - if (flush) - gmap_flush_tlb(gmap); - return 0; + return old; } -EXPORT_SYMBOL_GPL(gmap_unmap_segment); -/** - * gmap_mmap_segment - map a segment to the guest address space - * @gmap: pointer to the guest address space structure - * @from: source address in the parent address space - * @to: target address in the guest address space - * - * Returns 0 if the mmap succeded, -EINVAL or -ENOMEM if not. - */ -int gmap_map_segment(struct gmap *gmap, unsigned long from, - unsigned long to, unsigned long len) +void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, + pte_t *ptep, pte_t old_pte, pte_t pte) { - unsigned long *table; - unsigned long off; - int flush; - - if ((from | to | len) & (PMD_SIZE - 1)) - return -EINVAL; - if (len == 0 || from + len > PGDIR_SIZE || - from + len < from || to + len < to) - return -EINVAL; + pgste_t pgste; + struct mm_struct *mm = vma->vm_mm; - flush = 0; - down_read(&gmap->mm->mmap_sem); - spin_lock(&gmap->mm->page_table_lock); - for (off = 0; off < len; off += PMD_SIZE) { - /* Walk the gmap address space page table */ - table = gmap->table + (((to + off) >> 53) & 0x7ff); - if ((*table & _REGION_ENTRY_INV) && - gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY)) - goto out_unmap; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - table = table + (((to + off) >> 42) & 0x7ff); - if ((*table & _REGION_ENTRY_INV) && - gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY)) - goto out_unmap; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - table = table + (((to + off) >> 31) & 0x7ff); - if ((*table & _REGION_ENTRY_INV) && - gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY)) - goto out_unmap; - table = (unsigned long *) (*table & _REGION_ENTRY_ORIGIN); - table = table + (((to + off) >> 20) & 0x7ff); - - /* Store 'from' address in an invalid segment table entry. 
*/ - flush |= gmap_unlink_segment(gmap, table); - *table = _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | (from + off); + if (mm_has_pgste(mm)) { + pgste = pgste_get(ptep); + pgste_set_key(ptep, pgste, pte, mm); + pgste = pgste_set_pte(ptep, pgste, pte); + pgste_set_unlock(ptep, pgste); + } else { + set_pte(ptep, pte); } - spin_unlock(&gmap->mm->page_table_lock); - up_read(&gmap->mm->mmap_sem); - if (flush) - gmap_flush_tlb(gmap); - return 0; - -out_unmap: - spin_unlock(&gmap->mm->page_table_lock); - up_read(&gmap->mm->mmap_sem); - gmap_unmap_segment(gmap, to, len); - return -ENOMEM; } -EXPORT_SYMBOL_GPL(gmap_map_segment); -static unsigned long *gmap_table_walk(unsigned long address, struct gmap *gmap) +static inline void pmdp_idte_local(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) { - unsigned long *table; - - table = gmap->table + ((address >> 53) & 0x7ff); - if (unlikely(*table & _REGION_ENTRY_INV)) - return ERR_PTR(-EFAULT); - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - table = table + ((address >> 42) & 0x7ff); - if (unlikely(*table & _REGION_ENTRY_INV)) - return ERR_PTR(-EFAULT); - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - table = table + ((address >> 31) & 0x7ff); - if (unlikely(*table & _REGION_ENTRY_INV)) - return ERR_PTR(-EFAULT); - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - table = table + ((address >> 20) & 0x7ff); - return table; + if (machine_has_tlb_guest()) + __pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE, + mm->context.asce, IDTE_LOCAL); + else + __pmdp_idte(addr, pmdp, 0, 0, IDTE_LOCAL); + if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m) + gmap_pmdp_idte_local(mm, addr); } -/** - * __gmap_translate - translate a guest address to a user space address - * @address: guest address - * @gmap: pointer to guest mapping meta data structure - * - * Returns user space address which corresponds to the guest address or - * -EFAULT if no such mapping exists. - * This function does not establish potentially missing page table entries. - * The mmap_sem of the mm that belongs to the address space must be held - * when this function gets called. - */ -unsigned long __gmap_translate(unsigned long address, struct gmap *gmap) +static inline void pmdp_idte_global(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) { - unsigned long *segment_ptr, vmaddr, segment; - struct gmap_pgtable *mp; - struct page *page; - - current->thread.gmap_addr = address; - segment_ptr = gmap_table_walk(address, gmap); - if (IS_ERR(segment_ptr)) - return PTR_ERR(segment_ptr); - /* Convert the gmap address to an mm address. 
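Purely for illustration (not part of the patch), the ptep_modify_prot_start()/_commit() pair defined nearby can be used like this to change protection without losing the pgste bookkeeping; the wrapper name is hypothetical.

/*
 * Sketch only, not from this patch: write-protect a single pte via the
 * start/commit pair. start invalidates and returns the old pte, commit
 * installs the new one.
 */
static void wrprotect_one_pte(struct vm_area_struct *vma, unsigned long addr,
			      pte_t *ptep)
{
	pte_t old, new;

	old = ptep_modify_prot_start(vma, addr, ptep);
	new = pte_wrprotect(old);
	ptep_modify_prot_commit(vma, addr, ptep, old, new);
}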
*/ - segment = *segment_ptr; - if (!(segment & _SEGMENT_ENTRY_INV)) { - page = pfn_to_page(segment >> PAGE_SHIFT); - mp = (struct gmap_pgtable *) page->index; - return mp->vmaddr | (address & ~PMD_MASK); - } else if (segment & _SEGMENT_ENTRY_RO) { - vmaddr = segment & _SEGMENT_ENTRY_ORIGIN; - return vmaddr | (address & ~PMD_MASK); + if (machine_has_tlb_guest()) { + __pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE, + mm->context.asce, IDTE_GLOBAL); + if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m) + gmap_pmdp_idte_global(mm, addr); + } else { + __pmdp_idte(addr, pmdp, 0, 0, IDTE_GLOBAL); + if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m) + gmap_pmdp_idte_global(mm, addr); } - return -EFAULT; } -EXPORT_SYMBOL_GPL(__gmap_translate); -/** - * gmap_translate - translate a guest address to a user space address - * @address: guest address - * @gmap: pointer to guest mapping meta data structure - * - * Returns user space address which corresponds to the guest address or - * -EFAULT if no such mapping exists. - * This function does not establish potentially missing page table entries. - */ -unsigned long gmap_translate(unsigned long address, struct gmap *gmap) +static inline pmd_t pmdp_flush_direct(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) { - unsigned long rc; + pmd_t old; + + old = *pmdp; + if (pmd_val(old) & _SEGMENT_ENTRY_INVALID) + return old; + atomic_inc(&mm->context.flush_count); + if (cpu_has_tlb_lc() && + cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) + pmdp_idte_local(mm, addr, pmdp); + else + pmdp_idte_global(mm, addr, pmdp); + atomic_dec(&mm->context.flush_count); + return old; +} - down_read(&gmap->mm->mmap_sem); - rc = __gmap_translate(address, gmap); - up_read(&gmap->mm->mmap_sem); - return rc; +static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + pmd_t old; + + old = *pmdp; + if (pmd_val(old) & _SEGMENT_ENTRY_INVALID) + return old; + atomic_inc(&mm->context.flush_count); + if (cpumask_equal(&mm->context.cpu_attach_mask, + cpumask_of(smp_processor_id()))) { + set_pmd(pmdp, set_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_INVALID))); + mm->context.flush_mm = 1; + if (mm_has_pgste(mm)) + gmap_pmdp_invalidate(mm, addr); + } else { + pmdp_idte_global(mm, addr, pmdp); + } + atomic_dec(&mm->context.flush_count); + return old; } -EXPORT_SYMBOL_GPL(gmap_translate); -static int gmap_connect_pgtable(unsigned long address, unsigned long segment, - unsigned long *segment_ptr, struct gmap *gmap) +#ifdef CONFIG_PGSTE +static int pmd_lookup(struct mm_struct *mm, unsigned long addr, pmd_t **pmdp) { - unsigned long vmaddr; struct vm_area_struct *vma; - struct gmap_pgtable *mp; - struct gmap_rmap *rmap; - struct mm_struct *mm; - struct page *page; pgd_t *pgd; + p4d_t *p4d; pud_t *pud; - pmd_t *pmd; - mm = gmap->mm; - vmaddr = segment & _SEGMENT_ENTRY_ORIGIN; - vma = find_vma(mm, vmaddr); - if (!vma || vma->vm_start > vmaddr) + /* We need a valid VMA, otherwise this is clearly a fault. */ + vma = vma_lookup(mm, addr); + if (!vma) return -EFAULT; - /* Walk the parent mm page table */ - pgd = pgd_offset(mm, vmaddr); - pud = pud_alloc(mm, pgd, vmaddr); - if (!pud) - return -ENOMEM; - pmd = pmd_alloc(mm, pud, vmaddr); - if (!pmd) - return -ENOMEM; - if (!pmd_present(*pmd) && - __pte_alloc(mm, vma, pmd, vmaddr)) - return -ENOMEM; - /* pmd now points to a valid segment table entry. 
*/ - rmap = kmalloc(sizeof(*rmap), GFP_KERNEL|__GFP_REPEAT); - if (!rmap) - return -ENOMEM; - /* Link gmap segment table entry location to page table. */ - page = pmd_page(*pmd); - mp = (struct gmap_pgtable *) page->index; - rmap->gmap = gmap; - rmap->entry = segment_ptr; - rmap->vmaddr = address & PMD_MASK; - spin_lock(&mm->page_table_lock); - if (*segment_ptr == segment) { - list_add(&rmap->list, &mp->mapper); - /* Set gmap segment table entry to page table. */ - *segment_ptr = pmd_val(*pmd) & PAGE_MASK; - rmap = NULL; - } - spin_unlock(&mm->page_table_lock); - kfree(rmap); - return 0; -} -static void gmap_disconnect_pgtable(struct mm_struct *mm, unsigned long *table) -{ - struct gmap_rmap *rmap, *next; - struct gmap_pgtable *mp; - struct page *page; - int flush; - - flush = 0; - spin_lock(&mm->page_table_lock); - page = pfn_to_page(__pa(table) >> PAGE_SHIFT); - mp = (struct gmap_pgtable *) page->index; - list_for_each_entry_safe(rmap, next, &mp->mapper, list) { - *rmap->entry = - _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | mp->vmaddr; - list_del(&rmap->list); - kfree(rmap); - flush = 1; - } - spin_unlock(&mm->page_table_lock); - if (flush) - __tlb_flush_global(); -} + pgd = pgd_offset(mm, addr); + if (!pgd_present(*pgd)) + return -ENOENT; -/* - * this function is assumed to be called with mmap_sem held - */ -unsigned long __gmap_fault(unsigned long address, struct gmap *gmap) -{ - unsigned long *segment_ptr, segment; - struct gmap_pgtable *mp; - struct page *page; - int rc; + p4d = p4d_offset(pgd, addr); + if (!p4d_present(*p4d)) + return -ENOENT; - current->thread.gmap_addr = address; - segment_ptr = gmap_table_walk(address, gmap); - if (IS_ERR(segment_ptr)) + pud = pud_offset(p4d, addr); + if (!pud_present(*pud)) + return -ENOENT; + + /* Large PUDs are not supported yet. */ + if (pud_leaf(*pud)) return -EFAULT; - /* Convert the gmap address to an mm address. */ - while (1) { - segment = *segment_ptr; - if (!(segment & _SEGMENT_ENTRY_INV)) { - /* Page table is present */ - page = pfn_to_page(segment >> PAGE_SHIFT); - mp = (struct gmap_pgtable *) page->index; - return mp->vmaddr | (address & ~PMD_MASK); - } - if (!(segment & _SEGMENT_ENTRY_RO)) - /* Nothing mapped in the gmap address space. 
*/ - break; - rc = gmap_connect_pgtable(address, segment, segment_ptr, gmap); - if (rc) - return rc; - } - return -EFAULT; + + *pmdp = pmd_offset(pud, addr); + return 0; } +#endif -unsigned long gmap_fault(unsigned long address, struct gmap *gmap) +pmd_t pmdp_xchg_direct(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t new) { - unsigned long rc; - - down_read(&gmap->mm->mmap_sem); - rc = __gmap_fault(address, gmap); - up_read(&gmap->mm->mmap_sem); + pmd_t old; - return rc; + preempt_disable(); + old = pmdp_flush_direct(mm, addr, pmdp); + set_pmd(pmdp, new); + preempt_enable(); + return old; } -EXPORT_SYMBOL_GPL(gmap_fault); +EXPORT_SYMBOL(pmdp_xchg_direct); -void gmap_discard(unsigned long from, unsigned long to, struct gmap *gmap) +pmd_t pmdp_xchg_lazy(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t new) { + pmd_t old; - unsigned long *table, address, size; - struct vm_area_struct *vma; - struct gmap_pgtable *mp; - struct page *page; - - down_read(&gmap->mm->mmap_sem); - address = from; - while (address < to) { - /* Walk the gmap address space page table */ - table = gmap->table + ((address >> 53) & 0x7ff); - if (unlikely(*table & _REGION_ENTRY_INV)) { - address = (address + PMD_SIZE) & PMD_MASK; - continue; - } - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - table = table + ((address >> 42) & 0x7ff); - if (unlikely(*table & _REGION_ENTRY_INV)) { - address = (address + PMD_SIZE) & PMD_MASK; - continue; - } - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - table = table + ((address >> 31) & 0x7ff); - if (unlikely(*table & _REGION_ENTRY_INV)) { - address = (address + PMD_SIZE) & PMD_MASK; - continue; - } - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - table = table + ((address >> 20) & 0x7ff); - if (unlikely(*table & _SEGMENT_ENTRY_INV)) { - address = (address + PMD_SIZE) & PMD_MASK; - continue; - } - page = pfn_to_page(*table >> PAGE_SHIFT); - mp = (struct gmap_pgtable *) page->index; - vma = find_vma(gmap->mm, mp->vmaddr); - size = min(to - address, PMD_SIZE - (address & ~PMD_MASK)); - zap_page_range(vma, mp->vmaddr | (address & ~PMD_MASK), - size, NULL); - address = (address + PMD_SIZE) & PMD_MASK; - } - up_read(&gmap->mm->mmap_sem); + preempt_disable(); + old = pmdp_flush_lazy(mm, addr, pmdp); + set_pmd(pmdp, new); + preempt_enable(); + return old; } -EXPORT_SYMBOL_GPL(gmap_discard); +EXPORT_SYMBOL(pmdp_xchg_lazy); -static LIST_HEAD(gmap_notifier_list); -static DEFINE_SPINLOCK(gmap_notifier_lock); - -/** - * gmap_register_ipte_notifier - register a pte invalidation callback - * @nb: pointer to the gmap notifier block - */ -void gmap_register_ipte_notifier(struct gmap_notifier *nb) +static inline void pudp_idte_local(struct mm_struct *mm, + unsigned long addr, pud_t *pudp) { - spin_lock(&gmap_notifier_lock); - list_add(&nb->list, &gmap_notifier_list); - spin_unlock(&gmap_notifier_lock); + if (machine_has_tlb_guest()) + __pudp_idte(addr, pudp, IDTE_NODAT | IDTE_GUEST_ASCE, + mm->context.asce, IDTE_LOCAL); + else + __pudp_idte(addr, pudp, 0, 0, IDTE_LOCAL); } -EXPORT_SYMBOL_GPL(gmap_register_ipte_notifier); -/** - * gmap_unregister_ipte_notifier - remove a pte invalidation callback - * @nb: pointer to the gmap notifier block - */ -void gmap_unregister_ipte_notifier(struct gmap_notifier *nb) +static inline void pudp_idte_global(struct mm_struct *mm, + unsigned long addr, pud_t *pudp) { - spin_lock(&gmap_notifier_lock); - list_del_init(&nb->list); - spin_unlock(&gmap_notifier_lock); + if (machine_has_tlb_guest()) + 
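Segment (pmd) entries follow the same direct/lazy exchange pattern as ptes; a hedged sketch (not part of the patch, wrapper name hypothetical):

/*
 * Sketch only, not from this patch. The IDTE flush variant is chosen
 * inside pmdp_flush_direct()/_lazy().
 */
static pmd_t clear_one_segment(struct mm_struct *mm, unsigned long addr,
			       pmd_t *pmdp)
{
	return pmdp_xchg_direct(mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_EMPTY));
}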
__pudp_idte(addr, pudp, IDTE_NODAT | IDTE_GUEST_ASCE, + mm->context.asce, IDTE_GLOBAL); + else + __pudp_idte(addr, pudp, 0, 0, IDTE_GLOBAL); } -EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier); -/** - * gmap_ipte_notify - mark a range of ptes for invalidation notification - * @gmap: pointer to guest mapping meta data structure - * @address: virtual address in the guest address space - * @len: size of area - * - * Returns 0 if for each page in the given range a gmap mapping exists and - * the invalidation notification could be set. If the gmap mapping is missing - * for one or more pages -EFAULT is returned. If no memory could be allocated - * -ENOMEM is returned. This function establishes missing page table entries. - */ -int gmap_ipte_notify(struct gmap *gmap, unsigned long start, unsigned long len) +static inline pud_t pudp_flush_direct(struct mm_struct *mm, + unsigned long addr, pud_t *pudp) { - unsigned long addr; - spinlock_t *ptl; - pte_t *ptep, entry; - pgste_t pgste; - int rc = 0; - - if ((start & ~PAGE_MASK) || (len & ~PAGE_MASK)) - return -EINVAL; - down_read(&gmap->mm->mmap_sem); - while (len) { - /* Convert gmap address and connect the page tables */ - addr = __gmap_fault(start, gmap); - if (IS_ERR_VALUE(addr)) { - rc = addr; - break; - } - /* Get the page mapped */ - if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE)) { - rc = -EFAULT; - break; - } - /* Walk the process page table, lock and get pte pointer */ - ptep = get_locked_pte(gmap->mm, addr, &ptl); - if (unlikely(!ptep)) - continue; - /* Set notification bit in the pgste of the pte */ - entry = *ptep; - if ((pte_val(entry) & (_PAGE_INVALID | _PAGE_RO)) == 0) { - pgste = pgste_get_lock(ptep); - pgste_val(pgste) |= PGSTE_IN_BIT; - pgste_set_unlock(ptep, pgste); - start += PAGE_SIZE; - len -= PAGE_SIZE; - } - spin_unlock(ptl); - } - up_read(&gmap->mm->mmap_sem); - return rc; + pud_t old; + + old = *pudp; + if (pud_val(old) & _REGION_ENTRY_INVALID) + return old; + atomic_inc(&mm->context.flush_count); + if (cpu_has_tlb_lc() && + cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) + pudp_idte_local(mm, addr, pudp); + else + pudp_idte_global(mm, addr, pudp); + atomic_dec(&mm->context.flush_count); + return old; } -EXPORT_SYMBOL_GPL(gmap_ipte_notify); -/** - * gmap_do_ipte_notify - call all invalidation callbacks for a specific pte. - * @mm: pointer to the process mm_struct - * @addr: virtual address in the process address space - * @pte: pointer to the page table entry - * - * This function is assumed to be called with the page table lock held - * for the pte to notify. 
- */ -void gmap_do_ipte_notify(struct mm_struct *mm, unsigned long addr, pte_t *pte) +pud_t pudp_xchg_direct(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, pud_t new) { - unsigned long segment_offset; - struct gmap_notifier *nb; - struct gmap_pgtable *mp; - struct gmap_rmap *rmap; - struct page *page; - - segment_offset = ((unsigned long) pte) & (255 * sizeof(pte_t)); - segment_offset = segment_offset * (4096 / sizeof(pte_t)); - page = pfn_to_page(__pa(pte) >> PAGE_SHIFT); - mp = (struct gmap_pgtable *) page->index; - spin_lock(&gmap_notifier_lock); - list_for_each_entry(rmap, &mp->mapper, list) { - list_for_each_entry(nb, &gmap_notifier_list, list) - nb->notifier_call(rmap->gmap, - rmap->vmaddr + segment_offset); - } - spin_unlock(&gmap_notifier_lock); -} + pud_t old; -static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm, - unsigned long vmaddr) -{ - struct page *page; - unsigned long *table; - struct gmap_pgtable *mp; - - page = alloc_page(GFP_KERNEL|__GFP_REPEAT); - if (!page) - return NULL; - mp = kmalloc(sizeof(*mp), GFP_KERNEL|__GFP_REPEAT); - if (!mp) { - __free_page(page); - return NULL; - } - pgtable_page_ctor(page); - mp->vmaddr = vmaddr & PMD_MASK; - INIT_LIST_HEAD(&mp->mapper); - page->index = (unsigned long) mp; - atomic_set(&page->_mapcount, 3); - table = (unsigned long *) page_to_phys(page); - clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/2); - clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2); - return table; + preempt_disable(); + old = pudp_flush_direct(mm, addr, pudp); + set_pud(pudp, new); + preempt_enable(); + return old; } +EXPORT_SYMBOL(pudp_xchg_direct); -static inline void page_table_free_pgste(unsigned long *table) +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable) { - struct page *page; - struct gmap_pgtable *mp; - - page = pfn_to_page(__pa(table) >> PAGE_SHIFT); - mp = (struct gmap_pgtable *) page->index; - BUG_ON(!list_empty(&mp->mapper)); - pgtable_page_dtor(page); - atomic_set(&page->_mapcount, -1); - kfree(mp); - __free_page(page); + struct list_head *lh = (struct list_head *) pgtable; + + assert_spin_locked(pmd_lockptr(mm, pmdp)); + + /* FIFO */ + if (!pmd_huge_pte(mm, pmdp)) + INIT_LIST_HEAD(lh); + else + list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp)); + pmd_huge_pte(mm, pmdp) = pgtable; } -int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, - unsigned long key, bool nq) +pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) { - spinlock_t *ptl; - pgste_t old, new; + struct list_head *lh; + pgtable_t pgtable; pte_t *ptep; - down_read(&mm->mmap_sem); - ptep = get_locked_pte(current->mm, addr, &ptl); - if (unlikely(!ptep)) { - up_read(&mm->mmap_sem); - return -EFAULT; - } - - new = old = pgste_get_lock(ptep); - pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT | - PGSTE_ACC_BITS | PGSTE_FP_BIT); - pgste_val(new) |= (key & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48; - pgste_val(new) |= (key & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56; - if (!(pte_val(*ptep) & _PAGE_INVALID)) { - unsigned long address, bits; - unsigned char skey; + assert_spin_locked(pmd_lockptr(mm, pmdp)); - address = pte_val(*ptep) & PAGE_MASK; - skey = page_get_storage_key(address); - bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED); - /* Set storage key ACC and FP */ - page_set_storage_key(address, - (key & (_PAGE_ACC_BITS | _PAGE_FP_BIT)), - !nq); - - /* Merge host changed & referenced into pgste */ - pgste_val(new) |= bits << 52; - /* Transfer 
skey changed & referenced bit to kvm user bits */ - pgste_val(new) |= bits << 45; /* PGSTE_UR_BIT & PGSTE_UC_BIT */ + /* FIFO */ + pgtable = pmd_huge_pte(mm, pmdp); + lh = (struct list_head *) pgtable; + if (list_empty(lh)) + pmd_huge_pte(mm, pmdp) = NULL; + else { + pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next; + list_del(lh); } - /* changing the guest storage key is considered a change of the page */ - if ((pgste_val(new) ^ pgste_val(old)) & - (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT)) - pgste_val(new) |= PGSTE_UC_BIT; - - pgste_set_unlock(ptep, new); - pte_unmap_unlock(*ptep, ptl); - up_read(&mm->mmap_sem); - return 0; -} -EXPORT_SYMBOL(set_guest_storage_key); - -#else /* CONFIG_PGSTE */ - -static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm, - unsigned long vmaddr) -{ - return NULL; + ptep = (pte_t *) pgtable; + set_pte(ptep, __pte(_PAGE_INVALID)); + ptep++; + set_pte(ptep, __pte(_PAGE_INVALID)); + return pgtable; } +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -static inline void page_table_free_pgste(unsigned long *table) +#ifdef CONFIG_PGSTE +void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t entry) { -} + pgste_t pgste; -static inline void gmap_disconnect_pgtable(struct mm_struct *mm, - unsigned long *table) -{ + /* the mm_has_pgste() check is done in set_pte_at() */ + preempt_disable(); + pgste = pgste_get_lock(ptep); + pgste = clear_pgste_bit(pgste, _PGSTE_GPS_ZERO); + pgste_set_key(ptep, pgste, entry, mm); + pgste = pgste_set_pte(ptep, pgste, entry); + pgste_set_unlock(ptep, pgste); + preempt_enable(); } -#endif /* CONFIG_PGSTE */ - -static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits) +void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - unsigned int old, new; + pgste_t pgste; - do { - old = atomic_read(v); - new = old ^ bits; - } while (atomic_cmpxchg(v, old, new) != old); - return new; + preempt_disable(); + pgste = pgste_get_lock(ptep); + pgste = set_pgste_bit(pgste, PGSTE_IN_BIT); + pgste_set_unlock(ptep, pgste); + preempt_enable(); } -/* - * page table entry allocation/free routines. +/** + * ptep_force_prot - change access rights of a locked pte + * @mm: pointer to the process mm_struct + * @addr: virtual address in the guest address space + * @ptep: pointer to the page table entry + * @prot: indicates guest access rights: PROT_NONE, PROT_READ or PROT_WRITE + * @bit: pgste bit to set (e.g. for notification) + * + * Returns 0 if the access rights were changed and -EAGAIN if the current + * and requested access rights are incompatible. 
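A small sketch (not part of the patch) of the deposit/withdraw pairing for pte tables parked behind a huge pmd; the real callers are the generic THP paths, and the wrapper below is hypothetical and only shows the pairing and locking.

/*
 * Sketch only, not from this patch: deposited pte tables are kept in FIFO
 * order behind the huge pmd and come back on withdraw when the huge
 * mapping is split.
 */
static void deposit_then_withdraw(struct mm_struct *mm, pmd_t *pmdp,
				  pgtable_t pgtable)
{
	spinlock_t *ptl = pmd_lock(mm, pmdp);

	pgtable_trans_huge_deposit(mm, pmdp, pgtable);
	/* ... huge pmd is in use here ... */
	pgtable = pgtable_trans_huge_withdraw(mm, pmdp);
	spin_unlock(ptl);
}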
*/ -unsigned long *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr) +int ptep_force_prot(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, int prot, unsigned long bit) { - unsigned long *uninitialized_var(table); - struct page *uninitialized_var(page); - unsigned int mask, bit; - - if (mm_has_pgste(mm)) - return page_table_alloc_pgste(mm, vmaddr); - /* Allocate fragments of a 4K page as 1K/2K page table */ - spin_lock_bh(&mm->context.list_lock); - mask = FRAG_MASK; - if (!list_empty(&mm->context.pgtable_list)) { - page = list_first_entry(&mm->context.pgtable_list, - struct page, lru); - table = (unsigned long *) page_to_phys(page); - mask = atomic_read(&page->_mapcount); - mask = mask | (mask >> 4); + pte_t entry; + pgste_t pgste; + int pte_i, pte_p, nodat; + + pgste = pgste_get_lock(ptep); + entry = *ptep; + /* Check pte entry after all locks have been acquired */ + pte_i = pte_val(entry) & _PAGE_INVALID; + pte_p = pte_val(entry) & _PAGE_PROTECT; + if ((pte_i && (prot != PROT_NONE)) || + (pte_p && (prot & PROT_WRITE))) { + pgste_set_unlock(ptep, pgste); + return -EAGAIN; } - if ((mask & FRAG_MASK) == FRAG_MASK) { - spin_unlock_bh(&mm->context.list_lock); - page = alloc_page(GFP_KERNEL|__GFP_REPEAT); - if (!page) - return NULL; - pgtable_page_ctor(page); - atomic_set(&page->_mapcount, 1); - table = (unsigned long *) page_to_phys(page); - clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE); - spin_lock_bh(&mm->context.list_lock); - list_add(&page->lru, &mm->context.pgtable_list); - } else { - for (bit = 1; mask & bit; bit <<= 1) - table += PTRS_PER_PTE; - mask = atomic_xor_bits(&page->_mapcount, bit); - if ((mask & FRAG_MASK) == FRAG_MASK) - list_del(&page->lru); + /* Change access rights and set pgste bit */ + nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); + if (prot == PROT_NONE && !pte_i) { + ptep_flush_direct(mm, addr, ptep, nodat); + pgste = pgste_update_all(entry, pgste, mm); + entry = set_pte_bit(entry, __pgprot(_PAGE_INVALID)); } - spin_unlock_bh(&mm->context.list_lock); - return table; + if (prot == PROT_READ && !pte_p) { + ptep_flush_direct(mm, addr, ptep, nodat); + entry = clear_pte_bit(entry, __pgprot(_PAGE_INVALID)); + entry = set_pte_bit(entry, __pgprot(_PAGE_PROTECT)); + } + pgste = set_pgste_bit(pgste, bit); + pgste = pgste_set_pte(ptep, pgste, entry); + pgste_set_unlock(ptep, pgste); + return 0; } -void page_table_free(struct mm_struct *mm, unsigned long *table) +int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr, + pte_t *sptep, pte_t *tptep, pte_t pte) { - struct page *page; - unsigned int bit, mask; - - if (mm_has_pgste(mm)) { - gmap_disconnect_pgtable(mm, table); - return page_table_free_pgste(table); - } - /* Free 1K/2K page table fragment of a 4K page */ - page = pfn_to_page(__pa(table) >> PAGE_SHIFT); - bit = 1 << ((__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t))); - spin_lock_bh(&mm->context.list_lock); - if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK) - list_del(&page->lru); - mask = atomic_xor_bits(&page->_mapcount, bit); - if (mask & FRAG_MASK) - list_add(&page->lru, &mm->context.pgtable_list); - spin_unlock_bh(&mm->context.list_lock); - if (mask == 0) { - pgtable_page_dtor(page); - atomic_set(&page->_mapcount, -1); - __free_page(page); + pgste_t spgste, tpgste; + pte_t spte, tpte; + int rc = -EAGAIN; + + if (!(pte_val(*tptep) & _PAGE_INVALID)) + return 0; /* already shadowed */ + spgste = pgste_get_lock(sptep); + spte = *sptep; + if (!(pte_val(spte) & _PAGE_INVALID) && + !((pte_val(spte) & _PAGE_PROTECT) && + 
!(pte_val(pte) & _PAGE_PROTECT))) { + spgste = set_pgste_bit(spgste, PGSTE_VSIE_BIT); + tpgste = pgste_get_lock(tptep); + tpte = __pte((pte_val(spte) & PAGE_MASK) | + (pte_val(pte) & _PAGE_PROTECT)); + /* don't touch the storage key - it belongs to parent pgste */ + tpgste = pgste_set_pte(tptep, tpgste, tpte); + pgste_set_unlock(tptep, tpgste); + rc = 1; } + pgste_set_unlock(sptep, spgste); + return rc; } -static void __page_table_free_rcu(void *table, unsigned bit) +void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep) { - struct page *page; - - if (bit == FRAG_MASK) - return page_table_free_pgste(table); - /* Free 1K/2K page table fragment of a 4K page */ - page = pfn_to_page(__pa(table) >> PAGE_SHIFT); - if (atomic_xor_bits(&page->_mapcount, bit) == 0) { - pgtable_page_dtor(page); - atomic_set(&page->_mapcount, -1); - __free_page(page); - } + pgste_t pgste; + int nodat; + + pgste = pgste_get_lock(ptep); + /* notifier is called by the caller */ + nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); + ptep_flush_direct(mm, saddr, ptep, nodat); + /* don't touch the storage key - it belongs to parent pgste */ + pgste = pgste_set_pte(ptep, pgste, __pte(_PAGE_INVALID)); + pgste_set_unlock(ptep, pgste); } -void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table) +static void ptep_zap_softleaf_entry(struct mm_struct *mm, softleaf_t entry) { - struct mm_struct *mm; - struct page *page; - unsigned int bit, mask; + if (softleaf_is_swap(entry)) + dec_mm_counter(mm, MM_SWAPENTS); + else if (softleaf_is_migration(entry)) { + struct folio *folio = softleaf_to_folio(entry); - mm = tlb->mm; - if (mm_has_pgste(mm)) { - gmap_disconnect_pgtable(mm, table); - table = (unsigned long *) (__pa(table) | FRAG_MASK); - tlb_remove_table(tlb, table); - return; + dec_mm_counter(mm, mm_counter(folio)); } - bit = 1 << ((__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t))); - page = pfn_to_page(__pa(table) >> PAGE_SHIFT); - spin_lock_bh(&mm->context.list_lock); - if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK) - list_del(&page->lru); - mask = atomic_xor_bits(&page->_mapcount, bit | (bit << 4)); - if (mask & FRAG_MASK) - list_add_tail(&page->lru, &mm->context.pgtable_list); - spin_unlock_bh(&mm->context.list_lock); - table = (unsigned long *) (__pa(table) | (bit << 4)); - tlb_remove_table(tlb, table); + free_swap_and_cache(entry); } -void __tlb_remove_table(void *_table) +void ptep_zap_unused(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, int reset) { - const unsigned long mask = (FRAG_MASK << 4) | FRAG_MASK; - void *table = (void *)((unsigned long) _table & ~mask); - unsigned type = (unsigned long) _table & mask; + unsigned long pgstev; + pgste_t pgste; + pte_t pte; - if (type) - __page_table_free_rcu(table, type); - else - free_pages((unsigned long) table, ALLOC_ORDER); + /* Zap unused and logically-zero pages */ + preempt_disable(); + pgste = pgste_get_lock(ptep); + pgstev = pgste_val(pgste); + pte = *ptep; + if (!reset && pte_swap(pte) && + ((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED || + (pgstev & _PGSTE_GPS_ZERO))) { + ptep_zap_softleaf_entry(mm, softleaf_from_pte(pte)); + pte_clear(mm, addr, ptep); + } + if (reset) + pgste = clear_pgste_bit(pgste, _PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT); + pgste_set_unlock(ptep, pgste); + preempt_enable(); } -static void tlb_remove_table_smp_sync(void *arg) +void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - /* Simply deliver the interrupt */ -} + unsigned long ptev; + 
pgste_t pgste; -static void tlb_remove_table_one(void *table) -{ - /* - * This isn't an RCU grace period and hence the page-tables cannot be - * assumed to be actually RCU-freed. - * - * It is however sufficient for software page-table walkers that rely - * on IRQ disabling. See the comment near struct mmu_table_batch. - */ - smp_call_function(tlb_remove_table_smp_sync, NULL, 1); - __tlb_remove_table(table); + /* Clear storage key ACC and F, but set R/C */ + preempt_disable(); + pgste = pgste_get_lock(ptep); + pgste = clear_pgste_bit(pgste, PGSTE_ACC_BITS | PGSTE_FP_BIT); + pgste = set_pgste_bit(pgste, PGSTE_GR_BIT | PGSTE_GC_BIT); + ptev = pte_val(*ptep); + if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE)) + page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 0); + pgste_set_unlock(ptep, pgste); + preempt_enable(); } -static void tlb_remove_table_rcu(struct rcu_head *head) +/* + * Test and reset if a guest page is dirty + */ +bool ptep_test_and_clear_uc(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) { - struct mmu_table_batch *batch; - int i; - - batch = container_of(head, struct mmu_table_batch, rcu); - - for (i = 0; i < batch->nr; i++) - __tlb_remove_table(batch->tables[i]); - - free_page((unsigned long)batch); + pgste_t pgste; + pte_t pte; + bool dirty; + int nodat; + + pgste = pgste_get_lock(ptep); + dirty = !!(pgste_val(pgste) & PGSTE_UC_BIT); + pgste = clear_pgste_bit(pgste, PGSTE_UC_BIT); + pte = *ptep; + if (dirty && (pte_val(pte) & _PAGE_PRESENT)) { + pgste = pgste_pte_notify(mm, addr, ptep, pgste); + nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); + ptep_ipte_global(mm, addr, ptep, nodat); + if (machine_has_esop() || !(pte_val(pte) & _PAGE_WRITE)) + pte = set_pte_bit(pte, __pgprot(_PAGE_PROTECT)); + else + pte = set_pte_bit(pte, __pgprot(_PAGE_INVALID)); + set_pte(ptep, pte); + } + pgste_set_unlock(ptep, pgste); + return dirty; } +EXPORT_SYMBOL_GPL(ptep_test_and_clear_uc); -void tlb_table_flush(struct mmu_gather *tlb) +int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, + unsigned char key, bool nq) { - struct mmu_table_batch **batch = &tlb->batch; + unsigned long keyul, paddr; + spinlock_t *ptl; + pgste_t old, new; + pmd_t *pmdp; + pte_t *ptep; - if (*batch) { - __tlb_flush_mm(tlb->mm); - call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu); - *batch = NULL; + /* + * If we don't have a PTE table and if there is no huge page mapped, + * we can ignore attempts to set the key to 0, because it already is 0. + */ + switch (pmd_lookup(mm, addr, &pmdp)) { + case -ENOENT: + return key ? -EFAULT : 0; + case 0: + break; + default: + return -EFAULT; + } +again: + ptl = pmd_lock(mm, pmdp); + if (!pmd_present(*pmdp)) { + spin_unlock(ptl); + return key ? -EFAULT : 0; } -} -void tlb_remove_table(struct mmu_gather *tlb, void *table) -{ - struct mmu_table_batch **batch = &tlb->batch; - - if (*batch == NULL) { - *batch = (struct mmu_table_batch *) - __get_free_page(GFP_NOWAIT | __GFP_NOWARN); - if (*batch == NULL) { - __tlb_flush_mm(tlb->mm); - tlb_remove_table_one(table); - return; - } - (*batch)->nr = 0; + if (pmd_leaf(*pmdp)) { + paddr = pmd_val(*pmdp) & HPAGE_MASK; + paddr |= addr & ~HPAGE_MASK; + /* + * Huge pmds need quiescing operations, they are + * always mapped. 
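For orientation, a hedged sketch (not part of the patch) of a per-page dirty-log scan built on ptep_test_and_clear_uc(); the wrapper and the mark_dirty() consumer are hypothetical.

/*
 * Sketch only, not from this patch: test and clear the guest-dirty (UC)
 * bit kept in the pgste, with the host page table lock held.
 */
static void sync_one_dirty_bit(struct mm_struct *mm, unsigned long addr)
{
	spinlock_t *ptl;
	pte_t *ptep;

	ptep = get_locked_pte(mm, addr, &ptl);
	if (!ptep)
		return;
	if (ptep_test_and_clear_uc(mm, addr, ptep))
		mark_dirty(addr);		/* hypothetical consumer */
	pte_unmap_unlock(ptep, ptl);
}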
+ */ + page_set_storage_key(paddr, key, 1); + spin_unlock(ptl); + return 0; } - (*batch)->tables[(*batch)->nr++] = table; - if ((*batch)->nr == MAX_TABLE_BATCH) - tlb_table_flush(tlb); -} + spin_unlock(ptl); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -void thp_split_vma(struct vm_area_struct *vma) -{ - unsigned long addr; - struct page *page; + ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + if (!ptep) + goto again; + new = old = pgste_get_lock(ptep); + new = clear_pgste_bit(new, PGSTE_GR_BIT | PGSTE_GC_BIT | + PGSTE_ACC_BITS | PGSTE_FP_BIT); + keyul = (unsigned long) key; + new = set_pgste_bit(new, (keyul & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48); + new = set_pgste_bit(new, (keyul & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56); + if (!(pte_val(*ptep) & _PAGE_INVALID)) { + unsigned long bits, skey; - for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) { - page = follow_page(vma, addr, FOLL_SPLIT); + paddr = pte_val(*ptep) & PAGE_MASK; + skey = (unsigned long) page_get_storage_key(paddr); + bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED); + skey = key & (_PAGE_ACC_BITS | _PAGE_FP_BIT); + /* Set storage key ACC and FP */ + page_set_storage_key(paddr, skey, !nq); + /* Merge host changed & referenced into pgste */ + new = set_pgste_bit(new, bits << 52); } + /* changing the guest storage key is considered a change of the page */ + if ((pgste_val(new) ^ pgste_val(old)) & + (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT)) + new = set_pgste_bit(new, PGSTE_UC_BIT); + + pgste_set_unlock(ptep, new); + pte_unmap_unlock(ptep, ptl); + return 0; } +EXPORT_SYMBOL(set_guest_storage_key); -void thp_split_mm(struct mm_struct *mm) +/* + * Conditionally set a guest storage key (handling csske). + * oldkey will be updated when either mr or mc is set and a pointer is given. + * + * Returns 0 if a guests storage key update wasn't necessary, 1 if the guest + * storage key was updated and -EFAULT on access errors. + */ +int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr, + unsigned char key, unsigned char *oldkey, + bool nq, bool mr, bool mc) { - struct vm_area_struct *vma = mm->mmap; + unsigned char tmp, mask = _PAGE_ACC_BITS | _PAGE_FP_BIT; + int rc; - while (vma != NULL) { - thp_split_vma(vma); - vma->vm_flags &= ~VM_HUGEPAGE; - vma->vm_flags |= VM_NOHUGEPAGE; - vma = vma->vm_next; + /* we can drop the pgste lock between getting and setting the key */ + if (mr | mc) { + rc = get_guest_storage_key(current->mm, addr, &tmp); + if (rc) + return rc; + if (oldkey) + *oldkey = tmp; + if (!mr) + mask |= _PAGE_REFERENCED; + if (!mc) + mask |= _PAGE_CHANGED; + if (!((tmp ^ key) & mask)) + return 0; } + rc = set_guest_storage_key(current->mm, addr, key, nq); + return rc < 0 ? rc : 1; } -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +EXPORT_SYMBOL(cond_set_guest_storage_key); /* - * switch on pgstes for its userspace process (for kvm) + * Reset a guest reference bit (rrbe), returning the reference and changed bit. + * + * Returns < 0 in case of error, otherwise the cc to be reported to the guest. */ -int s390_enable_sie(void) +int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr) { - struct task_struct *tsk = current; - struct mm_struct *mm, *old_mm; - - /* Do we have switched amode? If no, we cannot do sie */ - if (s390_user_mode == HOME_SPACE_MODE) - return -EINVAL; + spinlock_t *ptl; + unsigned long paddr; + pgste_t old, new; + pmd_t *pmdp; + pte_t *ptep; + int cc = 0; - /* Do we have pgstes? 
if yes, we are done */ - if (mm_has_pgste(tsk->mm)) + /* + * If we don't have a PTE table and if there is no huge page mapped, + * the storage key is 0 and there is nothing for us to do. + */ + switch (pmd_lookup(mm, addr, &pmdp)) { + case -ENOENT: + return 0; + case 0: + break; + default: + return -EFAULT; + } +again: + ptl = pmd_lock(mm, pmdp); + if (!pmd_present(*pmdp)) { + spin_unlock(ptl); return 0; - - /* lets check if we are allowed to replace the mm */ - task_lock(tsk); - if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 || -#ifdef CONFIG_AIO - !hlist_empty(&tsk->mm->ioctx_list) || -#endif - tsk->mm != tsk->active_mm) { - task_unlock(tsk); - return -EINVAL; } - task_unlock(tsk); - /* we copy the mm and let dup_mm create the page tables with_pgstes */ - tsk->mm->context.alloc_pgste = 1; - /* make sure that both mms have a correct rss state */ - sync_mm_rss(tsk->mm); - mm = dup_mm(tsk); - tsk->mm->context.alloc_pgste = 0; - if (!mm) - return -ENOMEM; + if (pmd_leaf(*pmdp)) { + paddr = pmd_val(*pmdp) & HPAGE_MASK; + paddr |= addr & ~HPAGE_MASK; + cc = page_reset_referenced(paddr); + spin_unlock(ptl); + return cc; + } + spin_unlock(ptl); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - /* split thp mappings and disable thp for future mappings */ - thp_split_mm(mm); - mm->def_flags |= VM_NOHUGEPAGE; -#endif + ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + if (!ptep) + goto again; + new = old = pgste_get_lock(ptep); + /* Reset guest reference bit only */ + new = clear_pgste_bit(new, PGSTE_GR_BIT); - /* Now lets check again if something happened */ - task_lock(tsk); - if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 || -#ifdef CONFIG_AIO - !hlist_empty(&tsk->mm->ioctx_list) || -#endif - tsk->mm != tsk->active_mm) { - mmput(mm); - task_unlock(tsk); - return -EINVAL; + if (!(pte_val(*ptep) & _PAGE_INVALID)) { + paddr = pte_val(*ptep) & PAGE_MASK; + cc = page_reset_referenced(paddr); + /* Merge real referenced bit into host-set */ + new = set_pgste_bit(new, ((unsigned long)cc << 53) & PGSTE_HR_BIT); } + /* Reflect guest's logical view, not physical */ + cc |= (pgste_val(old) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 49; + /* Changing the guest storage key is considered a change of the page */ + if ((pgste_val(new) ^ pgste_val(old)) & PGSTE_GR_BIT) + new = set_pgste_bit(new, PGSTE_UC_BIT); - /* ok, we are alone. No ptrace, no threads, etc. 
*/ - old_mm = tsk->mm; - tsk->mm = tsk->active_mm = mm; - preempt_disable(); - update_mm(mm, tsk); - atomic_inc(&mm->context.attach_count); - atomic_dec(&old_mm->context.attach_count); - cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm)); - preempt_enable(); - task_unlock(tsk); - mmput(old_mm); - return 0; + pgste_set_unlock(ptep, new); + pte_unmap_unlock(ptep, ptl); + return cc; } -EXPORT_SYMBOL_GPL(s390_enable_sie); +EXPORT_SYMBOL(reset_guest_reference_bit); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmdp) +int get_guest_storage_key(struct mm_struct *mm, unsigned long addr, + unsigned char *key) { - VM_BUG_ON(address & ~HPAGE_PMD_MASK); - /* No need to flush TLB - * On s390 reference bits are in storage key and never in TLB */ - return pmdp_test_and_clear_young(vma, address, pmdp); -} + unsigned long paddr; + spinlock_t *ptl; + pgste_t pgste; + pmd_t *pmdp; + pte_t *ptep; -int pmdp_set_access_flags(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp, - pmd_t entry, int dirty) -{ - VM_BUG_ON(address & ~HPAGE_PMD_MASK); + /* + * If we don't have a PTE table and if there is no huge page mapped, + * the storage key is 0. + */ + *key = 0; - if (pmd_same(*pmdp, entry)) + switch (pmd_lookup(mm, addr, &pmdp)) { + case -ENOENT: return 0; - pmdp_invalidate(vma, address, pmdp); - set_pmd_at(vma->vm_mm, address, pmdp, entry); - return 1; -} + case 0: + break; + default: + return -EFAULT; + } +again: + ptl = pmd_lock(mm, pmdp); + if (!pmd_present(*pmdp)) { + spin_unlock(ptl); + return 0; + } -static void pmdp_splitting_flush_sync(void *arg) -{ - /* Simply deliver the interrupt */ + if (pmd_leaf(*pmdp)) { + paddr = pmd_val(*pmdp) & HPAGE_MASK; + paddr |= addr & ~HPAGE_MASK; + *key = page_get_storage_key(paddr); + spin_unlock(ptl); + return 0; + } + spin_unlock(ptl); + + ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + if (!ptep) + goto again; + pgste = pgste_get_lock(ptep); + *key = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56; + paddr = pte_val(*ptep) & PAGE_MASK; + if (!(pte_val(*ptep) & _PAGE_INVALID)) + *key = page_get_storage_key(paddr); + /* Reflect guest's logical view, not physical */ + *key |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48; + pgste_set_unlock(ptep, pgste); + pte_unmap_unlock(ptep, ptl); + return 0; } +EXPORT_SYMBOL(get_guest_storage_key); -void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmdp) +/** + * pgste_perform_essa - perform ESSA actions on the PGSTE. + * @mm: the memory context. It must have PGSTEs, no check is performed here! + * @hva: the host virtual address of the page whose PGSTE is to be processed + * @orc: the specific action to perform, see the ESSA_SET_* macros. + * @oldpte: the PTE will be saved there if the pointer is not NULL. + * @oldpgste: the old PGSTE will be saved there if the pointer is not NULL. + * + * Return: 1 if the page is to be added to the CBRL, otherwise 0, + * or < 0 in case of error. -EINVAL is returned for invalid values + * of orc, -EFAULT for invalid addresses. 
+ */ +int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc, + unsigned long *oldpte, unsigned long *oldpgste) { - VM_BUG_ON(address & ~HPAGE_PMD_MASK); - if (!test_and_set_bit(_SEGMENT_ENTRY_SPLIT_BIT, - (unsigned long *) pmdp)) { - /* need to serialize against gup-fast (IRQ disabled) */ - smp_call_function(pmdp_splitting_flush_sync, NULL, 1); + struct vm_area_struct *vma; + unsigned long pgstev; + spinlock_t *ptl; + pgste_t pgste; + pte_t *ptep; + int res = 0; + + WARN_ON_ONCE(orc > ESSA_MAX); + if (unlikely(orc > ESSA_MAX)) + return -EINVAL; + + vma = vma_lookup(mm, hva); + if (!vma || is_vm_hugetlb_page(vma)) + return -EFAULT; + ptep = get_locked_pte(mm, hva, &ptl); + if (unlikely(!ptep)) + return -EFAULT; + pgste = pgste_get_lock(ptep); + pgstev = pgste_val(pgste); + if (oldpte) + *oldpte = pte_val(*ptep); + if (oldpgste) + *oldpgste = pgstev; + + switch (orc) { + case ESSA_GET_STATE: + break; + case ESSA_SET_STABLE: + pgstev &= ~(_PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT); + pgstev |= _PGSTE_GPS_USAGE_STABLE; + break; + case ESSA_SET_UNUSED: + pgstev &= ~_PGSTE_GPS_USAGE_MASK; + pgstev |= _PGSTE_GPS_USAGE_UNUSED; + if (pte_val(*ptep) & _PAGE_INVALID) + res = 1; + break; + case ESSA_SET_VOLATILE: + pgstev &= ~_PGSTE_GPS_USAGE_MASK; + pgstev |= _PGSTE_GPS_USAGE_VOLATILE; + if (pte_val(*ptep) & _PAGE_INVALID) + res = 1; + break; + case ESSA_SET_POT_VOLATILE: + pgstev &= ~_PGSTE_GPS_USAGE_MASK; + if (!(pte_val(*ptep) & _PAGE_INVALID)) { + pgstev |= _PGSTE_GPS_USAGE_POT_VOLATILE; + break; + } + if (pgstev & _PGSTE_GPS_ZERO) { + pgstev |= _PGSTE_GPS_USAGE_VOLATILE; + break; + } + if (!(pgstev & PGSTE_GC_BIT)) { + pgstev |= _PGSTE_GPS_USAGE_VOLATILE; + res = 1; + break; + } + break; + case ESSA_SET_STABLE_RESIDENT: + pgstev &= ~_PGSTE_GPS_USAGE_MASK; + pgstev |= _PGSTE_GPS_USAGE_STABLE; + /* + * Since the resident state can go away any time after this + * call, we will not make this page resident. We can revisit + * this decision if a guest will ever start using this. + */ + break; + case ESSA_SET_STABLE_IF_RESIDENT: + if (!(pte_val(*ptep) & _PAGE_INVALID)) { + pgstev &= ~_PGSTE_GPS_USAGE_MASK; + pgstev |= _PGSTE_GPS_USAGE_STABLE; + } + break; + case ESSA_SET_STABLE_NODAT: + pgstev &= ~_PGSTE_GPS_USAGE_MASK; + pgstev |= _PGSTE_GPS_USAGE_STABLE | _PGSTE_GPS_NODAT; + break; + default: + /* we should never get here! */ + break; } + /* If we are discarding a page, set it to logical zero */ + if (res) + pgstev |= _PGSTE_GPS_ZERO; + + pgste = __pgste(pgstev); + pgste_set_unlock(ptep, pgste); + pte_unmap_unlock(ptep, ptl); + return res; } +EXPORT_SYMBOL(pgste_perform_essa); -void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, - pgtable_t pgtable) +/** + * set_pgste_bits - set specific PGSTE bits. + * @mm: the memory context. It must have PGSTEs, no check is performed here! + * @hva: the host virtual address of the page whose PGSTE is to be processed + * @bits: a bitmask representing the bits that will be touched + * @value: the values of the bits to be written. Only the bits in the mask + * will be written. + * + * Return: 0 on success, < 0 in case of error. 
+ */ +int set_pgste_bits(struct mm_struct *mm, unsigned long hva, + unsigned long bits, unsigned long value) { - struct list_head *lh = (struct list_head *) pgtable; + struct vm_area_struct *vma; + spinlock_t *ptl; + pgste_t new; + pte_t *ptep; - assert_spin_locked(&mm->page_table_lock); + vma = vma_lookup(mm, hva); + if (!vma || is_vm_hugetlb_page(vma)) + return -EFAULT; + ptep = get_locked_pte(mm, hva, &ptl); + if (unlikely(!ptep)) + return -EFAULT; + new = pgste_get_lock(ptep); - /* FIFO */ - if (!mm->pmd_huge_pte) - INIT_LIST_HEAD(lh); - else - list_add(lh, (struct list_head *) mm->pmd_huge_pte); - mm->pmd_huge_pte = pgtable; + new = clear_pgste_bit(new, bits); + new = set_pgste_bit(new, value & bits); + + pgste_set_unlock(ptep, new); + pte_unmap_unlock(ptep, ptl); + return 0; } +EXPORT_SYMBOL(set_pgste_bits); -pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) +/** + * get_pgste - get the current PGSTE for the given address. + * @mm: the memory context. It must have PGSTEs, no check is performed here! + * @hva: the host virtual address of the page whose PGSTE is to be processed + * @pgstep: will be written with the current PGSTE for the given address. + * + * Return: 0 on success, < 0 in case of error. + */ +int get_pgste(struct mm_struct *mm, unsigned long hva, unsigned long *pgstep) { - struct list_head *lh; - pgtable_t pgtable; + struct vm_area_struct *vma; + spinlock_t *ptl; pte_t *ptep; - assert_spin_locked(&mm->page_table_lock); - - /* FIFO */ - pgtable = mm->pmd_huge_pte; - lh = (struct list_head *) pgtable; - if (list_empty(lh)) - mm->pmd_huge_pte = NULL; - else { - mm->pmd_huge_pte = (pgtable_t) lh->next; - list_del(lh); - } - ptep = (pte_t *) pgtable; - pte_val(*ptep) = _PAGE_TYPE_EMPTY; - ptep++; - pte_val(*ptep) = _PAGE_TYPE_EMPTY; - return pgtable; + vma = vma_lookup(mm, hva); + if (!vma || is_vm_hugetlb_page(vma)) + return -EFAULT; + ptep = get_locked_pte(mm, hva, &ptl); + if (unlikely(!ptep)) + return -EFAULT; + *pgstep = pgste_val(pgste_get(ptep)); + pte_unmap_unlock(ptep, ptl); + return 0; } -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +EXPORT_SYMBOL(get_pgste); +#endif diff --git a/arch/s390/mm/physaddr.c b/arch/s390/mm/physaddr.c new file mode 100644 index 000000000000..59de866c72d9 --- /dev/null +++ b/arch/s390/mm/physaddr.c @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/mmdebug.h> +#include <linux/export.h> +#include <linux/mm.h> +#include <asm/page.h> + +unsigned long __phys_addr(unsigned long x, bool is_31bit) +{ + VIRTUAL_BUG_ON(is_vmalloc_or_module_addr((void *)(x))); + x = __pa_nodebug(x); + if (is_31bit) + VIRTUAL_BUG_ON(x >> 31); + return x; +} +EXPORT_SYMBOL(__phys_addr); diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index 8b268fcc4612..d96587b84e81 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -1,425 +1,671 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright IBM Corp. 
2006 - * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com> */ -#include <linux/bootmem.h> +#include <linux/memory_hotplug.h> +#include <linux/bootmem_info.h> +#include <linux/cpufeature.h> +#include <linux/memblock.h> #include <linux/pfn.h> #include <linux/mm.h> -#include <linux/module.h> +#include <linux/init.h> #include <linux/list.h> #include <linux/hugetlb.h> #include <linux/slab.h> +#include <linux/sort.h> +#include <asm/page-states.h> +#include <asm/abs_lowcore.h> +#include <asm/cacheflush.h> +#include <asm/maccess.h> +#include <asm/nospec-branch.h> +#include <asm/ctlreg.h> #include <asm/pgalloc.h> -#include <asm/pgtable.h> #include <asm/setup.h> #include <asm/tlbflush.h> #include <asm/sections.h> +#include <asm/set_memory.h> +#include <asm/physmem_info.h> static DEFINE_MUTEX(vmem_mutex); -struct memory_segment { - struct list_head list; - unsigned long start; - unsigned long size; -}; - -static LIST_HEAD(mem_segs); - static void __ref *vmem_alloc_pages(unsigned int order) { + unsigned long size = PAGE_SIZE << order; + if (slab_is_available()) return (void *)__get_free_pages(GFP_KERNEL, order); - return alloc_bootmem_pages((1 << order) * PAGE_SIZE); + return memblock_alloc(size, size); } -static inline pud_t *vmem_pud_alloc(void) +static void vmem_free_pages(unsigned long addr, int order, struct vmem_altmap *altmap) { - pud_t *pud = NULL; + unsigned int nr_pages = 1 << order; + struct page *page; -#ifdef CONFIG_64BIT - pud = vmem_alloc_pages(2); - if (!pud) - return NULL; - clear_table((unsigned long *) pud, _REGION3_ENTRY_EMPTY, PAGE_SIZE * 4); -#endif - return pud; + if (altmap) { + vmem_altmap_free(altmap, 1 << order); + return; + } + page = virt_to_page((void *)addr); + if (PageReserved(page)) { + /* allocated from memblock */ + while (nr_pages--) + free_bootmem_page(page++); + } else { + free_pages(addr, order); + } } -static inline pmd_t *vmem_pmd_alloc(void) +void *vmem_crst_alloc(unsigned long val) { - pmd_t *pmd = NULL; + unsigned long *table; -#ifdef CONFIG_64BIT - pmd = vmem_alloc_pages(2); - if (!pmd) + table = vmem_alloc_pages(CRST_ALLOC_ORDER); + if (!table) return NULL; - clear_table((unsigned long *) pmd, _SEGMENT_ENTRY_EMPTY, PAGE_SIZE * 4); -#endif - return pmd; + crst_table_init(table, val); + __arch_set_page_dat(table, 1UL << CRST_ALLOC_ORDER); + return table; } -static pte_t __ref *vmem_pte_alloc(unsigned long address) +pte_t __ref *vmem_pte_alloc(void) { pte_t *pte; if (slab_is_available()) - pte = (pte_t *) page_table_alloc(&init_mm, address); + pte = (pte_t *)page_table_alloc(&init_mm); else - pte = alloc_bootmem(PTRS_PER_PTE * sizeof(pte_t)); + pte = (pte_t *)memblock_alloc(PAGE_SIZE, PAGE_SIZE); if (!pte) return NULL; - clear_table((unsigned long *) pte, _PAGE_TYPE_EMPTY, - PTRS_PER_PTE * sizeof(pte_t)); + memset64((u64 *)pte, _PAGE_INVALID, PTRS_PER_PTE); + __arch_set_page_dat(pte, 1); return pte; } +static void vmem_pte_free(unsigned long *table) +{ + page_table_free(&init_mm, table); +} + +#define PAGE_UNUSED 0xFD + /* - * Add a physical memory range to the 1:1 mapping. + * The unused vmemmap range, which was not yet memset(PAGE_UNUSED) ranges + * from unused_sub_pmd_start to next PMD_SIZE boundary. 
*/ -static int vmem_add_mem(unsigned long start, unsigned long size, int ro) -{ - unsigned long end = start + size; - unsigned long address = start; - pgd_t *pg_dir; - pud_t *pu_dir; - pmd_t *pm_dir; - pte_t *pt_dir; +static unsigned long unused_sub_pmd_start; + +static void vmemmap_flush_unused_sub_pmd(void) +{ + if (!unused_sub_pmd_start) + return; + memset((void *)unused_sub_pmd_start, PAGE_UNUSED, + ALIGN(unused_sub_pmd_start, PMD_SIZE) - unused_sub_pmd_start); + unused_sub_pmd_start = 0; +} + +static void vmemmap_mark_sub_pmd_used(unsigned long start, unsigned long end) +{ + /* + * As we expect to add in the same granularity as we remove, it's + * sufficient to mark only some piece used to block the memmap page from + * getting removed (just in case the memmap never gets initialized, + * e.g., because the memory block never gets onlined). + */ + memset((void *)start, 0, sizeof(struct page)); +} + +static void vmemmap_use_sub_pmd(unsigned long start, unsigned long end) +{ + /* + * We only optimize if the new used range directly follows the + * previously unused range (esp., when populating consecutive sections). + */ + if (unused_sub_pmd_start == start) { + unused_sub_pmd_start = end; + if (likely(IS_ALIGNED(unused_sub_pmd_start, PMD_SIZE))) + unused_sub_pmd_start = 0; + return; + } + vmemmap_flush_unused_sub_pmd(); + vmemmap_mark_sub_pmd_used(start, end); +} + +static void vmemmap_use_new_sub_pmd(unsigned long start, unsigned long end) +{ + unsigned long page = ALIGN_DOWN(start, PMD_SIZE); + + vmemmap_flush_unused_sub_pmd(); + + /* Could be our memmap page is filled with PAGE_UNUSED already ... */ + vmemmap_mark_sub_pmd_used(start, end); + + /* Mark the unused parts of the new memmap page PAGE_UNUSED. */ + if (!IS_ALIGNED(start, PMD_SIZE)) + memset((void *)page, PAGE_UNUSED, start - page); + /* + * We want to avoid memset(PAGE_UNUSED) when populating the vmemmap of + * consecutive sections. Remember for the last added PMD the last + * unused range in the populated PMD. + */ + if (!IS_ALIGNED(end, PMD_SIZE)) + unused_sub_pmd_start = end; +} + +/* Returns true if the PMD is completely unused and can be freed. */ +static bool vmemmap_unuse_sub_pmd(unsigned long start, unsigned long end) +{ + unsigned long page = ALIGN_DOWN(start, PMD_SIZE); + + vmemmap_flush_unused_sub_pmd(); + memset((void *)start, PAGE_UNUSED, end - start); + return !memchr_inv((void *)page, PAGE_UNUSED, PMD_SIZE); +} + +/* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */ +static int __ref modify_pte_table(pmd_t *pmd, unsigned long addr, + unsigned long end, bool add, bool direct, + struct vmem_altmap *altmap) +{ + unsigned long prot, pages = 0; int ret = -ENOMEM; + pte_t *pte; - while (address < end) { - pg_dir = pgd_offset_k(address); - if (pgd_none(*pg_dir)) { - pu_dir = vmem_pud_alloc(); - if (!pu_dir) - goto out; - pgd_populate(&init_mm, pg_dir, pu_dir); - } - pu_dir = pud_offset(pg_dir, address); -#if defined(CONFIG_64BIT) && !defined(CONFIG_DEBUG_PAGEALLOC) - if (MACHINE_HAS_EDAT2 && pud_none(*pu_dir) && address && - !(address & ~PUD_MASK) && (address + PUD_SIZE <= end)) { - pud_val(*pu_dir) = __pa(address) | - _REGION_ENTRY_TYPE_R3 | _REGION3_ENTRY_LARGE | - (ro ? 
_REGION_ENTRY_RO : 0); - address += PUD_SIZE; - continue; - } -#endif - if (pud_none(*pu_dir)) { - pm_dir = vmem_pmd_alloc(); - if (!pm_dir) - goto out; - pud_populate(&init_mm, pu_dir, pm_dir); - } - pm_dir = pmd_offset(pu_dir, address); -#if defined(CONFIG_64BIT) && !defined(CONFIG_DEBUG_PAGEALLOC) - if (MACHINE_HAS_EDAT1 && pmd_none(*pm_dir) && address && - !(address & ~PMD_MASK) && (address + PMD_SIZE <= end)) { - pmd_val(*pm_dir) = __pa(address) | - _SEGMENT_ENTRY | _SEGMENT_ENTRY_LARGE | - (ro ? _SEGMENT_ENTRY_RO : 0); - address += PMD_SIZE; + prot = pgprot_val(PAGE_KERNEL); + pte = pte_offset_kernel(pmd, addr); + for (; addr < end; addr += PAGE_SIZE, pte++) { + if (!add) { + if (pte_none(*pte)) + continue; + if (!direct) + vmem_free_pages((unsigned long)pfn_to_virt(pte_pfn(*pte)), get_order(PAGE_SIZE), altmap); + pte_clear(&init_mm, addr, pte); + } else if (pte_none(*pte)) { + if (!direct) { + void *new_page = vmemmap_alloc_block_buf(PAGE_SIZE, NUMA_NO_NODE, altmap); + + if (!new_page) + goto out; + set_pte(pte, __pte(__pa(new_page) | prot)); + } else { + set_pte(pte, __pte(__pa(addr) | prot)); + } + } else { continue; } -#endif - if (pmd_none(*pm_dir)) { - pt_dir = vmem_pte_alloc(address); - if (!pt_dir) - goto out; - pmd_populate(&init_mm, pm_dir, pt_dir); - } - - pt_dir = pte_offset_kernel(pm_dir, address); - pte_val(*pt_dir) = __pa(address) | (ro ? _PAGE_RO : 0); - address += PAGE_SIZE; + pages++; } ret = 0; out: - flush_tlb_kernel_range(start, end); + if (direct) + update_page_count(PG_DIRECT_MAP_4K, add ? pages : -pages); return ret; } -/* - * Remove a physical memory range from the 1:1 mapping. - * Currently only invalidates page table entries. - */ -static void vmem_remove_range(unsigned long start, unsigned long size) +static void try_free_pte_table(pmd_t *pmd, unsigned long start) { - unsigned long end = start + size; - unsigned long address = start; - pgd_t *pg_dir; - pud_t *pu_dir; - pmd_t *pm_dir; - pte_t *pt_dir; - pte_t pte; - - pte_val(pte) = _PAGE_TYPE_EMPTY; - while (address < end) { - pg_dir = pgd_offset_k(address); - if (pgd_none(*pg_dir)) { - address += PGDIR_SIZE; - continue; - } - pu_dir = pud_offset(pg_dir, address); - if (pud_none(*pu_dir)) { - address += PUD_SIZE; - continue; - } - if (pud_large(*pu_dir)) { - pud_clear(pu_dir); - address += PUD_SIZE; - continue; - } - pm_dir = pmd_offset(pu_dir, address); - if (pmd_none(*pm_dir)) { - address += PMD_SIZE; - continue; - } - if (pmd_large(*pm_dir)) { - pmd_clear(pm_dir); - address += PMD_SIZE; - continue; - } - pt_dir = pte_offset_kernel(pm_dir, address); - *pt_dir = pte; - address += PAGE_SIZE; + pte_t *pte; + int i; + + /* We can safely assume this is fully in 1:1 mapping & vmemmap area */ + pte = pte_offset_kernel(pmd, start); + for (i = 0; i < PTRS_PER_PTE; i++, pte++) { + if (!pte_none(*pte)) + return; } - flush_tlb_kernel_range(start, end); + vmem_pte_free((unsigned long *) pmd_deref(*pmd)); + pmd_clear(pmd); } -/* - * Add a backed mem_map array to the virtual mem_map array. 
- */ -int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) +/* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */ +static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, + unsigned long end, bool add, bool direct, + struct vmem_altmap *altmap) { - unsigned long address = start; - pgd_t *pg_dir; - pud_t *pu_dir; - pmd_t *pm_dir; - pte_t *pt_dir; + unsigned long next, prot, pages = 0; int ret = -ENOMEM; + pmd_t *pmd; + pte_t *pte; - for (address = start; address < end;) { - pg_dir = pgd_offset_k(address); - if (pgd_none(*pg_dir)) { - pu_dir = vmem_pud_alloc(); - if (!pu_dir) - goto out; - pgd_populate(&init_mm, pg_dir, pu_dir); - } + prot = pgprot_val(SEGMENT_KERNEL); + pmd = pmd_offset(pud, addr); + for (; addr < end; addr = next, pmd++) { + next = pmd_addr_end(addr, end); + if (!add) { + if (pmd_none(*pmd)) + continue; + if (pmd_leaf(*pmd)) { + if (IS_ALIGNED(addr, PMD_SIZE) && + IS_ALIGNED(next, PMD_SIZE)) { + if (!direct) + vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE), altmap); + pmd_clear(pmd); + pages++; + } else if (!direct && vmemmap_unuse_sub_pmd(addr, next)) { + vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE), altmap); + pmd_clear(pmd); + } + continue; + } + } else if (pmd_none(*pmd)) { + if (IS_ALIGNED(addr, PMD_SIZE) && + IS_ALIGNED(next, PMD_SIZE) && + cpu_has_edat1() && direct && + !debug_pagealloc_enabled()) { + set_pmd(pmd, __pmd(__pa(addr) | prot)); + pages++; + continue; + } else if (!direct && cpu_has_edat1()) { + void *new_page; - pu_dir = pud_offset(pg_dir, address); - if (pud_none(*pu_dir)) { - pm_dir = vmem_pmd_alloc(); - if (!pm_dir) + /* + * Use 1MB frames for vmemmap if available. We + * always use large frames even if they are only + * partially used. Otherwise we would have also + * page tables since vmemmap_populate gets + * called for each section separately. + */ + new_page = vmemmap_alloc_block_buf(PMD_SIZE, NUMA_NO_NODE, altmap); + if (new_page) { + set_pmd(pmd, __pmd(__pa(new_page) | prot)); + if (!IS_ALIGNED(addr, PMD_SIZE) || + !IS_ALIGNED(next, PMD_SIZE)) { + vmemmap_use_new_sub_pmd(addr, next); + } + continue; + } + } + pte = vmem_pte_alloc(); + if (!pte) goto out; - pud_populate(&init_mm, pu_dir, pm_dir); + pmd_populate(&init_mm, pmd, pte); + } else if (pmd_leaf(*pmd)) { + if (!direct) + vmemmap_use_sub_pmd(addr, next); + continue; } + ret = modify_pte_table(pmd, addr, next, add, direct, altmap); + if (ret) + goto out; + if (!add) + try_free_pte_table(pmd, addr & PMD_MASK); + } + ret = 0; +out: + if (direct) + update_page_count(PG_DIRECT_MAP_1M, add ? pages : -pages); + return ret; +} - pm_dir = pmd_offset(pu_dir, address); - if (pmd_none(*pm_dir)) { -#ifdef CONFIG_64BIT - /* Use 1MB frames for vmemmap if available. We always - * use large frames even if they are only partially - * used. - * Otherwise we would have also page tables since - * vmemmap_populate gets called for each section - * separately. 
*/ - if (MACHINE_HAS_EDAT1) { - void *new_page; +static void try_free_pmd_table(pud_t *pud, unsigned long start) +{ + pmd_t *pmd; + int i; - new_page = vmemmap_alloc_block(PMD_SIZE, node); - if (!new_page) - goto out; - pmd_val(*pm_dir) = __pa(new_page) | - _SEGMENT_ENTRY | _SEGMENT_ENTRY_LARGE | - _SEGMENT_ENTRY_CO; - address = (address + PMD_SIZE) & PMD_MASK; + pmd = pmd_offset(pud, start); + for (i = 0; i < PTRS_PER_PMD; i++, pmd++) + if (!pmd_none(*pmd)) + return; + vmem_free_pages(pud_deref(*pud), CRST_ALLOC_ORDER, NULL); + pud_clear(pud); +} + +static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end, + bool add, bool direct, struct vmem_altmap *altmap) +{ + unsigned long next, prot, pages = 0; + int ret = -ENOMEM; + pud_t *pud; + pmd_t *pmd; + + prot = pgprot_val(REGION3_KERNEL); + pud = pud_offset(p4d, addr); + for (; addr < end; addr = next, pud++) { + next = pud_addr_end(addr, end); + if (!add) { + if (pud_none(*pud)) + continue; + if (pud_leaf(*pud)) { + if (IS_ALIGNED(addr, PUD_SIZE) && + IS_ALIGNED(next, PUD_SIZE)) { + pud_clear(pud); + pages++; + } continue; } -#endif - pt_dir = vmem_pte_alloc(address); - if (!pt_dir) + } else if (pud_none(*pud)) { + if (IS_ALIGNED(addr, PUD_SIZE) && + IS_ALIGNED(next, PUD_SIZE) && + cpu_has_edat2() && direct && + !debug_pagealloc_enabled()) { + set_pud(pud, __pud(__pa(addr) | prot)); + pages++; + continue; + } + pmd = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY); + if (!pmd) goto out; - pmd_populate(&init_mm, pm_dir, pt_dir); - } else if (pmd_large(*pm_dir)) { - address = (address + PMD_SIZE) & PMD_MASK; + pud_populate(&init_mm, pud, pmd); + } else if (pud_leaf(*pud)) { continue; } + ret = modify_pmd_table(pud, addr, next, add, direct, altmap); + if (ret) + goto out; + if (!add) + try_free_pmd_table(pud, addr & PUD_MASK); + } + ret = 0; +out: + if (direct) + update_page_count(PG_DIRECT_MAP_2G, add ? 
pages : -pages); + return ret; +} - pt_dir = pte_offset_kernel(pm_dir, address); - if (pte_none(*pt_dir)) { - unsigned long new_page; +static void try_free_pud_table(p4d_t *p4d, unsigned long start) +{ + pud_t *pud; + int i; + + pud = pud_offset(p4d, start); + for (i = 0; i < PTRS_PER_PUD; i++, pud++) { + if (!pud_none(*pud)) + return; + } + vmem_free_pages(p4d_deref(*p4d), CRST_ALLOC_ORDER, NULL); + p4d_clear(p4d); +} - new_page =__pa(vmem_alloc_pages(0)); - if (!new_page) +static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end, + bool add, bool direct, struct vmem_altmap *altmap) +{ + unsigned long next; + int ret = -ENOMEM; + p4d_t *p4d; + pud_t *pud; + + p4d = p4d_offset(pgd, addr); + for (; addr < end; addr = next, p4d++) { + next = p4d_addr_end(addr, end); + if (!add) { + if (p4d_none(*p4d)) + continue; + } else if (p4d_none(*p4d)) { + pud = vmem_crst_alloc(_REGION3_ENTRY_EMPTY); + if (!pud) goto out; - pte_val(*pt_dir) = __pa(new_page); + p4d_populate(&init_mm, p4d, pud); } - address += PAGE_SIZE; + ret = modify_pud_table(p4d, addr, next, add, direct, altmap); + if (ret) + goto out; + if (!add) + try_free_pud_table(p4d, addr & P4D_MASK); } - memset((void *)start, 0, end - start); ret = 0; out: - flush_tlb_kernel_range(start, end); return ret; } -void vmemmap_free(unsigned long start, unsigned long end) +static void try_free_p4d_table(pgd_t *pgd, unsigned long start) { + p4d_t *p4d; + int i; + + p4d = p4d_offset(pgd, start); + for (i = 0; i < PTRS_PER_P4D; i++, p4d++) { + if (!p4d_none(*p4d)) + return; + } + vmem_free_pages(pgd_deref(*pgd), CRST_ALLOC_ORDER, NULL); + pgd_clear(pgd); } -/* - * Add memory segment to the segment list if it doesn't overlap with - * an already present segment. - */ -static int insert_memory_segment(struct memory_segment *seg) +static int modify_pagetable(unsigned long start, unsigned long end, bool add, + bool direct, struct vmem_altmap *altmap) { - struct memory_segment *tmp; + unsigned long addr, next; + int ret = -ENOMEM; + pgd_t *pgd; + p4d_t *p4d; + + if (WARN_ON_ONCE(!PAGE_ALIGNED(start | end))) + return -EINVAL; + /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */ + if (WARN_ON_ONCE(end > __abs_lowcore)) + return -EINVAL; + for (addr = start; addr < end; addr = next) { + next = pgd_addr_end(addr, end); + pgd = pgd_offset_k(addr); + + if (!add) { + if (pgd_none(*pgd)) + continue; + } else if (pgd_none(*pgd)) { + p4d = vmem_crst_alloc(_REGION2_ENTRY_EMPTY); + if (!p4d) + goto out; + pgd_populate(&init_mm, pgd, p4d); + } + ret = modify_p4d_table(pgd, addr, next, add, direct, altmap); + if (ret) + goto out; + if (!add) + try_free_p4d_table(pgd, addr & PGDIR_MASK); + } + ret = 0; +out: + if (!add) + flush_tlb_kernel_range(start, end); + return ret; +} - if (seg->start + seg->size > VMEM_MAX_PHYS || - seg->start + seg->size < seg->start) - return -ERANGE; +static int add_pagetable(unsigned long start, unsigned long end, bool direct, + struct vmem_altmap *altmap) +{ + return modify_pagetable(start, end, true, direct, altmap); +} - list_for_each_entry(tmp, &mem_segs, list) { - if (seg->start >= tmp->start + tmp->size) - continue; - if (seg->start + seg->size <= tmp->start) - continue; - return -ENOSPC; - } - list_add(&seg->list, &mem_segs); - return 0; +static int remove_pagetable(unsigned long start, unsigned long end, bool direct, + struct vmem_altmap *altmap) +{ + return modify_pagetable(start, end, false, direct, altmap); } /* - * Remove memory segment from the segment list. 
+ * Add a physical memory range to the 1:1 mapping. */ -static void remove_memory_segment(struct memory_segment *seg) +static int vmem_add_range(unsigned long start, unsigned long size) { - list_del(&seg->list); + start = (unsigned long)__va(start); + return add_pagetable(start, start + size, true, NULL); } -static void __remove_shared_memory(struct memory_segment *seg) +/* + * Remove a physical memory range from the 1:1 mapping. + */ +static void vmem_remove_range(unsigned long start, unsigned long size) { - remove_memory_segment(seg); - vmem_remove_range(seg->start, seg->size); + start = (unsigned long)__va(start); + remove_pagetable(start, start + size, true, NULL); } -int vmem_remove_mapping(unsigned long start, unsigned long size) +/* + * Add a backed mem_map array to the virtual mem_map array. + */ +int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, + struct vmem_altmap *altmap) { - struct memory_segment *seg; int ret; mutex_lock(&vmem_mutex); + /* We don't care about the node, just use NUMA_NO_NODE on allocations */ + ret = add_pagetable(start, end, false, altmap); + if (ret) + remove_pagetable(start, end, false, altmap); + mutex_unlock(&vmem_mutex); + return ret; +} - ret = -ENOENT; - list_for_each_entry(seg, &mem_segs, list) { - if (seg->start == start && seg->size == size) - break; - } +#ifdef CONFIG_MEMORY_HOTPLUG - if (seg->start != start || seg->size != size) - goto out; +void vmemmap_free(unsigned long start, unsigned long end, + struct vmem_altmap *altmap) +{ + mutex_lock(&vmem_mutex); + remove_pagetable(start, end, false, altmap); + mutex_unlock(&vmem_mutex); +} - ret = 0; - __remove_shared_memory(seg); - kfree(seg); -out: +#endif + +void vmem_remove_mapping(unsigned long start, unsigned long size) +{ + mutex_lock(&vmem_mutex); + vmem_remove_range(start, size); mutex_unlock(&vmem_mutex); - return ret; +} + +struct range arch_get_mappable_range(void) +{ + struct range mhp_range; + + mhp_range.start = 0; + mhp_range.end = max_mappable - 1; + return mhp_range; } int vmem_add_mapping(unsigned long start, unsigned long size) { - struct memory_segment *seg; + struct range range = arch_get_mappable_range(); int ret; - mutex_lock(&vmem_mutex); - ret = -ENOMEM; - seg = kzalloc(sizeof(*seg), GFP_KERNEL); - if (!seg) - goto out; - seg->start = start; - seg->size = size; - - ret = insert_memory_segment(seg); - if (ret) - goto out_free; + if (start < range.start || + start + size > range.end + 1 || + start + size < start) + return -ERANGE; - ret = vmem_add_mem(start, size, 0); + mutex_lock(&vmem_mutex); + ret = vmem_add_range(start, size); if (ret) - goto out_remove; - goto out; - -out_remove: - __remove_shared_memory(seg); -out_free: - kfree(seg); -out: + vmem_remove_range(start, size); mutex_unlock(&vmem_mutex); return ret; } /* - * map whole physical memory to virtual memory (identity mapping) - * we reserve enough space in the vmalloc area for vmemmap to hotplug - * additional memory segments. + * Allocate new or return existing page-table entry, but do not map it + * to any physical address. If missing, allocate segment- and region- + * table entries along. Meeting a large segment- or region-table entry + * while traversing is an error, since the function is expected to be + * called against virtual regions reserved for 4KB mappings only. 
*/ -void __init vmem_map_init(void) +pte_t *vmem_get_alloc_pte(unsigned long addr, bool alloc) { - unsigned long ro_start, ro_end; - unsigned long start, end; - int i; + pte_t *ptep = NULL; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; - ro_start = PFN_ALIGN((unsigned long)&_stext); - ro_end = (unsigned long)&_eshared & PAGE_MASK; - for (i = 0; i < MEMORY_CHUNKS; i++) { - if (!memory_chunk[i].size) - continue; - start = memory_chunk[i].addr; - end = memory_chunk[i].addr + memory_chunk[i].size; - if (start >= ro_end || end <= ro_start) - vmem_add_mem(start, end - start, 0); - else if (start >= ro_start && end <= ro_end) - vmem_add_mem(start, end - start, 1); - else if (start >= ro_start) { - vmem_add_mem(start, ro_end - start, 1); - vmem_add_mem(ro_end, end - ro_end, 0); - } else if (end < ro_end) { - vmem_add_mem(start, ro_start - start, 0); - vmem_add_mem(ro_start, end - ro_start, 1); - } else { - vmem_add_mem(start, ro_start - start, 0); - vmem_add_mem(ro_start, ro_end - ro_start, 1); - vmem_add_mem(ro_end, end - ro_end, 0); - } + pgd = pgd_offset_k(addr); + if (pgd_none(*pgd)) { + if (!alloc) + goto out; + p4d = vmem_crst_alloc(_REGION2_ENTRY_EMPTY); + if (!p4d) + goto out; + pgd_populate(&init_mm, pgd, p4d); } + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) { + if (!alloc) + goto out; + pud = vmem_crst_alloc(_REGION3_ENTRY_EMPTY); + if (!pud) + goto out; + p4d_populate(&init_mm, p4d, pud); + } + pud = pud_offset(p4d, addr); + if (pud_none(*pud)) { + if (!alloc) + goto out; + pmd = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY); + if (!pmd) + goto out; + pud_populate(&init_mm, pud, pmd); + } else if (WARN_ON_ONCE(pud_leaf(*pud))) { + goto out; + } + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) { + if (!alloc) + goto out; + pte = vmem_pte_alloc(); + if (!pte) + goto out; + pmd_populate(&init_mm, pmd, pte); + } else if (WARN_ON_ONCE(pmd_leaf(*pmd))) { + goto out; + } + ptep = pte_offset_kernel(pmd, addr); +out: + return ptep; } -/* - * Convert memory chunk array to a memory segment list so there is a single - * list that contains both r/w memory and shared memory segments. 
- */ -static int __init vmem_convert_memory_chunk(void) +int __vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot, bool alloc) { - struct memory_segment *seg; - int i; + pte_t *ptep, pte; + + if (!IS_ALIGNED(addr, PAGE_SIZE)) + return -EINVAL; + ptep = vmem_get_alloc_pte(addr, alloc); + if (!ptep) + return -ENOMEM; + __ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL); + pte = mk_pte_phys(phys, prot); + set_pte(ptep, pte); + return 0; +} + +int vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot) +{ + int rc; mutex_lock(&vmem_mutex); - for (i = 0; i < MEMORY_CHUNKS; i++) { - if (!memory_chunk[i].size) - continue; - seg = kzalloc(sizeof(*seg), GFP_KERNEL); - if (!seg) - panic("Out of memory...\n"); - seg->start = memory_chunk[i].addr; - seg->size = memory_chunk[i].size; - insert_memory_segment(seg); - } + rc = __vmem_map_4k_page(addr, phys, prot, true); + mutex_unlock(&vmem_mutex); + return rc; +} + +void vmem_unmap_4k_page(unsigned long addr) +{ + pte_t *ptep; + + mutex_lock(&vmem_mutex); + ptep = virt_to_kpte(addr); + __ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL); + pte_clear(&init_mm, addr, ptep); mutex_unlock(&vmem_mutex); - return 0; } -core_initcall(vmem_convert_memory_chunk); +void __init vmem_map_init(void) +{ + __set_memory_rox(_stext, _etext); + __set_memory_ro(_etext, __end_rodata); + __set_memory_rox(__stext_amode31, __etext_amode31); + /* + * If the BEAR-enhancement facility is not installed the first + * prefix page is used to return to the previous context with + * an LPSWE instruction and therefore must be executable. + */ + if (!cpu_has_bear()) + set_memory_x(0, 1); + if (debug_pagealloc_enabled()) + __set_memory_4k(__va(0), absolute_pointer(__va(0)) + ident_map_size); + pr_info("Write protected kernel read-only data: %luk\n", + (unsigned long)(__end_rodata - _stext) >> 10); +} |
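Note on the guest storage key helpers introduced in pgtable.c above: set_guest_storage_key(), cond_set_guest_storage_key(), reset_guest_reference_bit() and get_guest_storage_key() are the exported interface through which KVM-side code can emulate the SSKE/ISKE/RRBE storage-key instructions against the host page tables. The following is a minimal caller sketch, not taken from the kernel: only the helper names, signatures and return conventions come from the diff above; the wrapper function, its name, the include paths and the locking context are assumptions for illustration.

/*
 * Illustrative only: conditionally update the storage key backing a guest
 * page at host virtual address "hva". The prototypes are assumed to come
 * from arch/s390/include/asm/pgtable.h.
 */
#include <linux/mm.h>
#include <asm/pgtable.h>

static int example_emulate_sske(unsigned long hva, unsigned char key,
				bool nq, bool mr, bool mc)
{
	unsigned char oldkey = 0;
	int rc;

	mmap_read_lock(current->mm);
	rc = cond_set_guest_storage_key(current->mm, hva, key, &oldkey,
					nq, mr, mc);
	mmap_read_unlock(current->mm);
	/* rc: < 0 on access error, 0 if no update was needed, 1 if changed */
	return rc;
}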

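Similarly, the reworked vmem.c above adds vmem_map_4k_page() and vmem_unmap_4k_page() for establishing and tearing down single 4KB kernel mappings, allocating any missing region, segment and page tables along the way. A short usage sketch follows; the helper names, signatures and error values are taken from the code above, while the wrapper, its parameters, the include paths and the assumption that "addr" lies in a region reserved for 4KB mappings (as required by the comment above vmem_get_alloc_pte()) are illustrative only.

/*
 * Illustrative only: map one 4KB physical page read-write at a kernel
 * virtual address, use it, then remove the mapping again.
 */
#include <linux/mm.h>
#include <asm/pgtable.h>

static int example_map_one_page(unsigned long addr, unsigned long phys)
{
	int rc;

	/* -EINVAL if addr is not page aligned, -ENOMEM if a table allocation fails */
	rc = vmem_map_4k_page(addr, phys, PAGE_KERNEL);
	if (rc)
		return rc;

	/* ... access the new mapping here ... */

	vmem_unmap_4k_page(addr);
	return 0;
}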