diff options
Diffstat (limited to 'arch/x86/mm')
-rw-r--r-- | arch/x86/mm/debug_pagetables.c | 58 | ||||
-rw-r--r-- | arch/x86/mm/dump_pagetables.c | 26 | ||||
-rw-r--r-- | arch/x86/mm/fault.c | 244 | ||||
-rw-r--r-- | arch/x86/mm/init.c | 4 | ||||
-rw-r--r-- | arch/x86/mm/init_32.c | 2 | ||||
-rw-r--r-- | arch/x86/mm/init_64.c | 33 | ||||
-rw-r--r-- | arch/x86/mm/kasan_init_64.c | 55 | ||||
-rw-r--r-- | arch/x86/mm/mem_encrypt.c | 7 | ||||
-rw-r--r-- | arch/x86/mm/mm_internal.h | 2 | ||||
-rw-r--r-- | arch/x86/mm/pageattr-test.c | 31 | ||||
-rw-r--r-- | arch/x86/mm/pageattr.c | 309 | ||||
-rw-r--r-- | arch/x86/mm/pat.c | 13 | ||||
-rw-r--r-- | arch/x86/mm/pgtable.c | 14 | ||||
-rw-r--r-- | arch/x86/mm/pkeys.c | 1 | ||||
-rw-r--r-- | arch/x86/mm/tlb.c | 119 |
15 files changed, 501 insertions, 417 deletions
diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c index 225fe2f0bfec..cd84f067e41d 100644 --- a/arch/x86/mm/debug_pagetables.c +++ b/arch/x86/mm/debug_pagetables.c @@ -10,20 +10,9 @@ static int ptdump_show(struct seq_file *m, void *v) return 0; } -static int ptdump_open(struct inode *inode, struct file *filp) -{ - return single_open(filp, ptdump_show, NULL); -} - -static const struct file_operations ptdump_fops = { - .owner = THIS_MODULE, - .open = ptdump_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(ptdump); -static int ptdump_show_curknl(struct seq_file *m, void *v) +static int ptdump_curknl_show(struct seq_file *m, void *v) { if (current->mm->pgd) { down_read(¤t->mm->mmap_sem); @@ -33,23 +22,12 @@ static int ptdump_show_curknl(struct seq_file *m, void *v) return 0; } -static int ptdump_open_curknl(struct inode *inode, struct file *filp) -{ - return single_open(filp, ptdump_show_curknl, NULL); -} - -static const struct file_operations ptdump_curknl_fops = { - .owner = THIS_MODULE, - .open = ptdump_open_curknl, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(ptdump_curknl); #ifdef CONFIG_PAGE_TABLE_ISOLATION static struct dentry *pe_curusr; -static int ptdump_show_curusr(struct seq_file *m, void *v) +static int ptdump_curusr_show(struct seq_file *m, void *v) { if (current->mm->pgd) { down_read(¤t->mm->mmap_sem); @@ -59,42 +37,20 @@ static int ptdump_show_curusr(struct seq_file *m, void *v) return 0; } -static int ptdump_open_curusr(struct inode *inode, struct file *filp) -{ - return single_open(filp, ptdump_show_curusr, NULL); -} - -static const struct file_operations ptdump_curusr_fops = { - .owner = THIS_MODULE, - .open = ptdump_open_curusr, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(ptdump_curusr); #endif #if defined(CONFIG_EFI) && defined(CONFIG_X86_64) static struct dentry *pe_efi; -static int ptdump_show_efi(struct seq_file *m, void *v) +static int ptdump_efi_show(struct seq_file *m, void *v) { if (efi_mm.pgd) ptdump_walk_pgd_level_debugfs(m, efi_mm.pgd, false); return 0; } -static int ptdump_open_efi(struct inode *inode, struct file *filp) -{ - return single_open(filp, ptdump_show_efi, NULL); -} - -static const struct file_operations ptdump_efi_fops = { - .owner = THIS_MODULE, - .open = ptdump_open_efi, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(ptdump_efi); #endif static struct dentry *dir, *pe_knl, *pe_curknl; diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index fc37bbd23eb8..e3cdc85ce5b6 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -55,10 +55,10 @@ struct addr_marker { enum address_markers_idx { USER_SPACE_NR = 0, KERNEL_SPACE_NR, - LOW_KERNEL_NR, -#if defined(CONFIG_MODIFY_LDT_SYSCALL) && defined(CONFIG_X86_5LEVEL) +#ifdef CONFIG_MODIFY_LDT_SYSCALL LDT_NR, #endif + LOW_KERNEL_NR, VMALLOC_START_NR, VMEMMAP_START_NR, #ifdef CONFIG_KASAN @@ -66,9 +66,6 @@ enum address_markers_idx { KASAN_SHADOW_END_NR, #endif CPU_ENTRY_AREA_NR, -#if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL) - LDT_NR, -#endif #ifdef CONFIG_X86_ESPFIX64 ESPFIX_START_NR, #endif @@ -380,7 +377,7 @@ static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, /* * This is an optimization for KASAN=y case. Since all kasan page tables - * eventually point to the kasan_zero_page we could call note_page() + * eventually point to the kasan_early_shadow_page we could call note_page() * right away without walking through lower level page tables. This saves * us dozens of seconds (minutes for 5-level config) while checking for * W+X mapping or reading kernel_page_tables debugfs file. @@ -388,10 +385,11 @@ static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st, void *pt) { - if (__pa(pt) == __pa(kasan_zero_pmd) || - (pgtable_l5_enabled() && __pa(pt) == __pa(kasan_zero_p4d)) || - __pa(pt) == __pa(kasan_zero_pud)) { - pgprotval_t prot = pte_flags(kasan_zero_pte[0]); + if (__pa(pt) == __pa(kasan_early_shadow_pmd) || + (pgtable_l5_enabled() && + __pa(pt) == __pa(kasan_early_shadow_p4d)) || + __pa(pt) == __pa(kasan_early_shadow_pud)) { + pgprotval_t prot = pte_flags(kasan_early_shadow_pte[0]); note_page(m, st, __pgprot(prot), 0, 5); return true; } @@ -512,11 +510,11 @@ static inline bool is_hypervisor_range(int idx) { #ifdef CONFIG_X86_64 /* - * ffff800000000000 - ffff87ffffffffff is reserved for - * the hypervisor. + * A hole in the beginning of kernel address space reserved + * for a hypervisor. */ - return (idx >= pgd_index(__PAGE_OFFSET) - 16) && - (idx < pgd_index(__PAGE_OFFSET)); + return (idx >= pgd_index(GUARD_HOLE_BASE_ADDR)) && + (idx < pgd_index(GUARD_HOLE_END_ADDR)); #else return false; #endif diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 71d4b9d4d43f..2ff25ad33233 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -27,6 +27,7 @@ #include <asm/vm86.h> /* struct vm86 */ #include <asm/mmu_context.h> /* vma_pkey() */ #include <asm/efi.h> /* efi_recover_from_page_fault()*/ +#include <asm/desc.h> /* store_idt(), ... */ #define CREATE_TRACE_POINTS #include <asm/trace/exceptions.h> @@ -571,10 +572,55 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address) return 0; } +static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index) +{ + u32 offset = (index >> 3) * sizeof(struct desc_struct); + unsigned long addr; + struct ldttss_desc desc; + + if (index == 0) { + pr_alert("%s: NULL\n", name); + return; + } + + if (offset + sizeof(struct ldttss_desc) >= gdt->size) { + pr_alert("%s: 0x%hx -- out of bounds\n", name, index); + return; + } + + if (probe_kernel_read(&desc, (void *)(gdt->address + offset), + sizeof(struct ldttss_desc))) { + pr_alert("%s: 0x%hx -- GDT entry is not readable\n", + name, index); + return; + } + + addr = desc.base0 | (desc.base1 << 16) | (desc.base2 << 24); +#ifdef CONFIG_X86_64 + addr |= ((u64)desc.base3 << 32); +#endif + pr_alert("%s: 0x%hx -- base=0x%lx limit=0x%x\n", + name, index, addr, (desc.limit0 | (desc.limit1 << 16))); +} + +/* + * This helper function transforms the #PF error_code bits into + * "[PROT] [USER]" type of descriptive, almost human-readable error strings: + */ +static void err_str_append(unsigned long error_code, char *buf, unsigned long mask, const char *txt) +{ + if (error_code & mask) { + if (buf[0]) + strcat(buf, " "); + strcat(buf, txt); + } +} + static void -show_fault_oops(struct pt_regs *regs, unsigned long error_code, - unsigned long address) +show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address) { + char err_txt[64]; + if (!oops_may_print()) return; @@ -602,6 +648,52 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, address < PAGE_SIZE ? "NULL pointer dereference" : "paging request", (void *)address); + err_txt[0] = 0; + + /* + * Note: length of these appended strings including the separation space and the + * zero delimiter must fit into err_txt[]. + */ + err_str_append(error_code, err_txt, X86_PF_PROT, "[PROT]" ); + err_str_append(error_code, err_txt, X86_PF_WRITE, "[WRITE]"); + err_str_append(error_code, err_txt, X86_PF_USER, "[USER]" ); + err_str_append(error_code, err_txt, X86_PF_RSVD, "[RSVD]" ); + err_str_append(error_code, err_txt, X86_PF_INSTR, "[INSTR]"); + err_str_append(error_code, err_txt, X86_PF_PK, "[PK]" ); + + pr_alert("#PF error: %s\n", error_code ? err_txt : "[normal kernel read fault]"); + + if (!(error_code & X86_PF_USER) && user_mode(regs)) { + struct desc_ptr idt, gdt; + u16 ldtr, tr; + + pr_alert("This was a system access from user code\n"); + + /* + * This can happen for quite a few reasons. The more obvious + * ones are faults accessing the GDT, or LDT. Perhaps + * surprisingly, if the CPU tries to deliver a benign or + * contributory exception from user code and gets a page fault + * during delivery, the page fault can be delivered as though + * it originated directly from user code. This could happen + * due to wrong permissions on the IDT, GDT, LDT, TSS, or + * kernel or IST stack. + */ + store_idt(&idt); + + /* Usable even on Xen PV -- it's just slow. */ + native_store_gdt(&gdt); + + pr_alert("IDT: 0x%lx (limit=0x%hx) GDT: 0x%lx (limit=0x%hx)\n", + idt.address, idt.size, gdt.address, gdt.size); + + store_ldt(ldtr); + show_ldttss(&gdt, "LDTR", ldtr); + + store_tr(tr); + show_ldttss(&gdt, "TR", tr); + } + dump_pagetable(address); } @@ -621,16 +713,30 @@ pgtable_bad(struct pt_regs *regs, unsigned long error_code, tsk->comm, address); dump_pagetable(address); - tsk->thread.cr2 = address; - tsk->thread.trap_nr = X86_TRAP_PF; - tsk->thread.error_code = error_code; - if (__die("Bad pagetable", regs, error_code)) sig = 0; oops_end(flags, regs, sig); } +static void set_signal_archinfo(unsigned long address, + unsigned long error_code) +{ + struct task_struct *tsk = current; + + /* + * To avoid leaking information about the kernel page + * table layout, pretend that user-mode accesses to + * kernel addresses are always protection faults. + */ + if (address >= TASK_SIZE_MAX) + error_code |= X86_PF_PROT; + + tsk->thread.trap_nr = X86_TRAP_PF; + tsk->thread.error_code = error_code | X86_PF_USER; + tsk->thread.cr2 = address; +} + static noinline void no_context(struct pt_regs *regs, unsigned long error_code, unsigned long address, int signal, int si_code) @@ -639,6 +745,15 @@ no_context(struct pt_regs *regs, unsigned long error_code, unsigned long flags; int sig; + if (user_mode(regs)) { + /* + * This is an implicit supervisor-mode access from user + * mode. Bypass all the kernel-mode recovery code and just + * OOPS. + */ + goto oops; + } + /* Are we prepared to handle this kernel fault? */ if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) { /* @@ -656,9 +771,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, * faulting through the emulate_vsyscall() logic. */ if (current->thread.sig_on_uaccess_err && signal) { - tsk->thread.trap_nr = X86_TRAP_PF; - tsk->thread.error_code = error_code | X86_PF_USER; - tsk->thread.cr2 = address; + set_signal_archinfo(address, error_code); /* XXX: hwpoison faults will set the wrong code. */ force_sig_fault(signal, si_code, (void __user *)address, @@ -726,6 +839,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, if (IS_ENABLED(CONFIG_EFI)) efi_recover_from_page_fault(address); +oops: /* * Oops. The kernel tried to access some bad page. We'll have to * terminate things with extreme prejudice: @@ -737,10 +851,6 @@ no_context(struct pt_regs *regs, unsigned long error_code, if (task_stack_end_corrupted(tsk)) printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); - tsk->thread.cr2 = address; - tsk->thread.trap_nr = X86_TRAP_PF; - tsk->thread.error_code = error_code; - sig = SIGKILL; if (__die("Oops", regs, error_code)) sig = 0; @@ -794,7 +904,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, struct task_struct *tsk = current; /* User mode accesses just cause a SIGSEGV */ - if (error_code & X86_PF_USER) { + if (user_mode(regs) && (error_code & X86_PF_USER)) { /* * It's possible to have interrupts off here: */ @@ -821,9 +931,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, if (likely(show_unhandled_signals)) show_signal_msg(regs, error_code, address, tsk); - tsk->thread.cr2 = address; - tsk->thread.error_code = error_code; - tsk->thread.trap_nr = X86_TRAP_PF; + set_signal_archinfo(address, error_code); if (si_code == SEGV_PKUERR) force_sig_pkuerr((void __user *)address, pkey); @@ -937,9 +1045,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, if (is_prefetch(regs, error_code, address)) return; - tsk->thread.cr2 = address; - tsk->thread.error_code = error_code; - tsk->thread.trap_nr = X86_TRAP_PF; + set_signal_archinfo(address, error_code); #ifdef CONFIG_MEMORY_FAILURE if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) { @@ -1148,23 +1254,6 @@ static int fault_in_kernel_space(unsigned long address) return address >= TASK_SIZE_MAX; } -static inline bool smap_violation(int error_code, struct pt_regs *regs) -{ - if (!IS_ENABLED(CONFIG_X86_SMAP)) - return false; - - if (!static_cpu_has(X86_FEATURE_SMAP)) - return false; - - if (error_code & X86_PF_USER) - return false; - - if (!user_mode(regs) && (regs->flags & X86_EFLAGS_AC)) - return false; - - return true; -} - /* * Called for all faults where 'address' is part of the kernel address * space. Might get called for faults that originate from *code* that @@ -1230,7 +1319,6 @@ void do_user_addr_fault(struct pt_regs *regs, unsigned long hw_error_code, unsigned long address) { - unsigned long sw_error_code; struct vm_area_struct *vma; struct task_struct *tsk; struct mm_struct *mm; @@ -1252,10 +1340,16 @@ void do_user_addr_fault(struct pt_regs *regs, pgtable_bad(regs, hw_error_code, address); /* - * Check for invalid kernel (supervisor) access to user - * pages in the user address space. + * If SMAP is on, check for invalid kernel (supervisor) access to user + * pages in the user address space. The odd case here is WRUSS, + * which, according to the preliminary documentation, does not respect + * SMAP and will have the USER bit set so, in all cases, SMAP + * enforcement appears to be consistent with the USER bit. */ - if (unlikely(smap_violation(hw_error_code, regs))) { + if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) && + !(hw_error_code & X86_PF_USER) && + !(regs->flags & X86_EFLAGS_AC))) + { bad_area_nosemaphore(regs, hw_error_code, address); return; } @@ -1270,13 +1364,6 @@ void do_user_addr_fault(struct pt_regs *regs, } /* - * hw_error_code is literally the "page fault error code" passed to - * the kernel directly from the hardware. But, we will shortly be - * modifying it in software, so give it a new name. - */ - sw_error_code = hw_error_code; - - /* * It's safe to allow irq's after cr2 has been saved and the * vmalloc fault has been handled. * @@ -1285,26 +1372,6 @@ void do_user_addr_fault(struct pt_regs *regs, */ if (user_mode(regs)) { local_irq_enable(); - /* - * Up to this point, X86_PF_USER set in hw_error_code - * indicated a user-mode access. But, after this, - * X86_PF_USER in sw_error_code will indicate either - * that, *or* an implicit kernel(supervisor)-mode access - * which originated from user mode. - */ - if (!(hw_error_code & X86_PF_USER)) { - /* - * The CPU was in user mode, but the CPU says - * the fault was not a user-mode access. - * Must be an implicit kernel-mode access, - * which we do not expect to happen in the - * user address space. - */ - pr_warn_once("kernel-mode error from user-mode: %lx\n", - hw_error_code); - - sw_error_code |= X86_PF_USER; - } flags |= FAULT_FLAG_USER; } else { if (regs->flags & X86_EFLAGS_IF) @@ -1313,9 +1380,9 @@ void do_user_addr_fault(struct pt_regs *regs, perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); - if (sw_error_code & X86_PF_WRITE) + if (hw_error_code & X86_PF_WRITE) flags |= FAULT_FLAG_WRITE; - if (sw_error_code & X86_PF_INSTR) + if (hw_error_code & X86_PF_INSTR) flags |= FAULT_FLAG_INSTRUCTION; #ifdef CONFIG_X86_64 @@ -1328,7 +1395,7 @@ void do_user_addr_fault(struct pt_regs *regs, * The vsyscall page does not have a "real" VMA, so do this * emulation before we go searching for VMAs. */ - if ((sw_error_code & X86_PF_INSTR) && is_vsyscall_vaddr(address)) { + if ((hw_error_code & X86_PF_INSTR) && is_vsyscall_vaddr(address)) { if (emulate_vsyscall(regs, address)) return; } @@ -1344,18 +1411,15 @@ void do_user_addr_fault(struct pt_regs *regs, * Only do the expensive exception table search when we might be at * risk of a deadlock. This happens if we * 1. Failed to acquire mmap_sem, and - * 2. The access did not originate in userspace. Note: either the - * hardware or earlier page fault code may set X86_PF_USER - * in sw_error_code. + * 2. The access did not originate in userspace. */ if (unlikely(!down_read_trylock(&mm->mmap_sem))) { - if (!(sw_error_code & X86_PF_USER) && - !search_exception_tables(regs->ip)) { + if (!user_mode(regs) && !search_exception_tables(regs->ip)) { /* * Fault from code in kernel from * which we do not expect faults. */ - bad_area_nosemaphore(regs, sw_error_code, address); + bad_area_nosemaphore(regs, hw_error_code, address); return; } retry: @@ -1371,29 +1435,17 @@ retry: vma = find_vma(mm, address); if (unlikely(!vma)) { - bad_area(regs, sw_error_code, address); + bad_area(regs, hw_error_code, address); return; } if (likely(vma->vm_start <= address)) goto good_area; if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { - bad_area(regs, sw_error_code, address); + bad_area(regs, hw_error_code, address); return; } - if (sw_error_code & X86_PF_USER) { - /* - * Accessing the stack below %sp is always a bug. - * The large cushion allows instructions like enter - * and pusha to work. ("enter $65535, $31" pushes - * 32 pointers and then decrements %sp by 65535.) - */ - if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) { - bad_area(regs, sw_error_code, address); - return; - } - } if (unlikely(expand_stack(vma, address))) { - bad_area(regs, sw_error_code, address); + bad_area(regs, hw_error_code, address); return; } @@ -1402,8 +1454,8 @@ retry: * we can handle it.. */ good_area: - if (unlikely(access_error(sw_error_code, vma))) { - bad_area_access_error(regs, sw_error_code, address, vma); + if (unlikely(access_error(hw_error_code, vma))) { + bad_area_access_error(regs, hw_error_code, address, vma); return; } @@ -1442,13 +1494,13 @@ good_area: return; /* Not returning to user mode? Handle exceptions or die: */ - no_context(regs, sw_error_code, address, SIGBUS, BUS_ADRERR); + no_context(regs, hw_error_code, address, SIGBUS, BUS_ADRERR); return; } up_read(&mm->mmap_sem); if (unlikely(fault & VM_FAULT_ERROR)) { - mm_fault_error(regs, sw_error_code, address, fault); + mm_fault_error(regs, hw_error_code, address, fault); return; } diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index ef99f3892e1f..f905a2371080 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -742,7 +742,7 @@ int devmem_is_allowed(unsigned long pagenr) return 1; } -void free_init_pages(char *what, unsigned long begin, unsigned long end) +void free_init_pages(const char *what, unsigned long begin, unsigned long end) { unsigned long begin_aligned, end_aligned; @@ -931,7 +931,7 @@ unsigned long max_swapfile_size(void) pages = generic_max_swapfile_size(); - if (boot_cpu_has_bug(X86_BUG_L1TF)) { + if (boot_cpu_has_bug(X86_BUG_L1TF) && l1tf_mitigation != L1TF_MITIGATION_OFF) { /* Limit the swap file size to MAX_PA/2 for L1TF workaround */ unsigned long long l1tf_limit = l1tf_pfn_limit(); /* diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 49ecf5ecf6d3..85c94f9a87f8 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -860,7 +860,7 @@ int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, } #ifdef CONFIG_MEMORY_HOTREMOVE -int arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) +int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 5fab264948c2..bccff68e3267 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -432,7 +432,7 @@ phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end, E820_TYPE_RAM) && !e820__mapped_any(paddr & PAGE_MASK, paddr_next, E820_TYPE_RESERVED_KERN)) - set_pte(pte, __pte(0)); + set_pte_safe(pte, __pte(0)); continue; } @@ -452,7 +452,7 @@ phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end, pr_info(" pte=%p addr=%lx pte=%016lx\n", pte, paddr, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL).pte); pages++; - set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot)); + set_pte_safe(pte, pfn_pte(paddr >> PAGE_SHIFT, prot)); paddr_last = (paddr & PAGE_MASK) + PAGE_SIZE; } @@ -487,7 +487,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end, E820_TYPE_RAM) && !e820__mapped_any(paddr & PMD_MASK, paddr_next, E820_TYPE_RESERVED_KERN)) - set_pmd(pmd, __pmd(0)); + set_pmd_safe(pmd, __pmd(0)); continue; } @@ -524,7 +524,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end, if (page_size_mask & (1<<PG_LEVEL_2M)) { pages++; spin_lock(&init_mm.page_table_lock); - set_pte((pte_t *)pmd, + set_pte_safe((pte_t *)pmd, pfn_pte((paddr & PMD_MASK) >> PAGE_SHIFT, __pgprot(pgprot_val(prot) | _PAGE_PSE))); spin_unlock(&init_mm.page_table_lock); @@ -536,7 +536,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end, paddr_last = phys_pte_init(pte, paddr, paddr_end, new_prot); spin_lock(&init_mm.page_table_lock); - pmd_populate_kernel(&init_mm, pmd, pte); + pmd_populate_kernel_safe(&init_mm, pmd, pte); spin_unlock(&init_mm.page_table_lock); } update_page_count(PG_LEVEL_2M, pages); @@ -573,7 +573,7 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end, E820_TYPE_RAM) && !e820__mapped_any(paddr & PUD_MASK, paddr_next, E820_TYPE_RESERVED_KERN)) - set_pud(pud, __pud(0)); + set_pud_safe(pud, __pud(0)); continue; } @@ -584,7 +584,6 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end, paddr_end, page_size_mask, prot); - __flush_tlb_all(); continue; } /* @@ -611,7 +610,7 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end, if (page_size_mask & (1<<PG_LEVEL_1G)) { pages++; spin_lock(&init_mm.page_table_lock); - set_pte((pte_t *)pud, + set_pte_safe((pte_t *)pud, pfn_pte((paddr & PUD_MASK) >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); spin_unlock(&init_mm.page_table_lock); @@ -624,10 +623,9 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end, page_size_mask, prot); spin_lock(&init_mm.page_table_lock); - pud_populate(&init_mm, pud, pmd); + pud_populate_safe(&init_mm, pud, pmd); spin_unlock(&init_mm.page_table_lock); } - __flush_tlb_all(); update_page_count(PG_LEVEL_1G, pages); @@ -659,7 +657,7 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end, E820_TYPE_RAM) && !e820__mapped_any(paddr & P4D_MASK, paddr_next, E820_TYPE_RESERVED_KERN)) - set_p4d(p4d, __p4d(0)); + set_p4d_safe(p4d, __p4d(0)); continue; } @@ -668,7 +666,6 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end, paddr_last = phys_pud_init(pud, paddr, paddr_end, page_size_mask); - __flush_tlb_all(); continue; } @@ -677,10 +674,9 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end, page_size_mask); spin_lock(&init_mm.page_table_lock); - p4d_populate(&init_mm, p4d, pud); + p4d_populate_safe(&init_mm, p4d, pud); spin_unlock(&init_mm.page_table_lock); } - __flush_tlb_all(); return paddr_last; } @@ -723,9 +719,9 @@ kernel_physical_mapping_init(unsigned long paddr_start, spin_lock(&init_mm.page_table_lock); if (pgtable_l5_enabled()) - pgd_populate(&init_mm, pgd, p4d); + pgd_populate_safe(&init_mm, pgd, p4d); else - p4d_populate(&init_mm, p4d_offset(pgd, vaddr), (pud_t *) p4d); + p4d_populate_safe(&init_mm, p4d_offset(pgd, vaddr), (pud_t *) p4d); spin_unlock(&init_mm.page_table_lock); pgd_changed = true; } @@ -733,8 +729,6 @@ kernel_physical_mapping_init(unsigned long paddr_start, if (pgd_changed) sync_global_pgds(vaddr_start, vaddr_end - 1); - __flush_tlb_all(); - return paddr_last; } @@ -1147,7 +1141,8 @@ kernel_physical_mapping_remove(unsigned long start, unsigned long end) remove_pagetable(start, end, true, NULL); } -int __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) +int __ref arch_remove_memory(int nid, u64 start, u64 size, + struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 04a9cf6b034f..462fde83b515 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -211,7 +211,8 @@ static void __init kasan_early_p4d_populate(pgd_t *pgd, unsigned long next; if (pgd_none(*pgd)) { - pgd_entry = __pgd(_KERNPG_TABLE | __pa_nodebug(kasan_zero_p4d)); + pgd_entry = __pgd(_KERNPG_TABLE | + __pa_nodebug(kasan_early_shadow_p4d)); set_pgd(pgd, pgd_entry); } @@ -222,7 +223,8 @@ static void __init kasan_early_p4d_populate(pgd_t *pgd, if (!p4d_none(*p4d)) continue; - p4d_entry = __p4d(_KERNPG_TABLE | __pa_nodebug(kasan_zero_pud)); + p4d_entry = __p4d(_KERNPG_TABLE | + __pa_nodebug(kasan_early_shadow_pud)); set_p4d(p4d, p4d_entry); } while (p4d++, addr = next, addr != end && p4d_none(*p4d)); } @@ -261,10 +263,11 @@ static struct notifier_block kasan_die_notifier = { void __init kasan_early_init(void) { int i; - pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL | _PAGE_ENC; - pmdval_t pmd_val = __pa_nodebug(kasan_zero_pte) | _KERNPG_TABLE; - pudval_t pud_val = __pa_nodebug(kasan_zero_pmd) | _KERNPG_TABLE; - p4dval_t p4d_val = __pa_nodebug(kasan_zero_pud) | _KERNPG_TABLE; + pteval_t pte_val = __pa_nodebug(kasan_early_shadow_page) | + __PAGE_KERNEL | _PAGE_ENC; + pmdval_t pmd_val = __pa_nodebug(kasan_early_shadow_pte) | _KERNPG_TABLE; + pudval_t pud_val = __pa_nodebug(kasan_early_shadow_pmd) | _KERNPG_TABLE; + p4dval_t p4d_val = __pa_nodebug(kasan_early_shadow_pud) | _KERNPG_TABLE; /* Mask out unsupported __PAGE_KERNEL bits: */ pte_val &= __default_kernel_pte_mask; @@ -273,16 +276,16 @@ void __init kasan_early_init(void) p4d_val &= __default_kernel_pte_mask; for (i = 0; i < PTRS_PER_PTE; i++) - kasan_zero_pte[i] = __pte(pte_val); + kasan_early_shadow_pte[i] = __pte(pte_val); for (i = 0; i < PTRS_PER_PMD; i++) - kasan_zero_pmd[i] = __pmd(pmd_val); + kasan_early_shadow_pmd[i] = __pmd(pmd_val); for (i = 0; i < PTRS_PER_PUD; i++) - kasan_zero_pud[i] = __pud(pud_val); + kasan_early_shadow_pud[i] = __pud(pud_val); for (i = 0; pgtable_l5_enabled() && i < PTRS_PER_P4D; i++) - kasan_zero_p4d[i] = __p4d(p4d_val); + kasan_early_shadow_p4d[i] = __p4d(p4d_val); kasan_map_early_shadow(early_top_pgt); kasan_map_early_shadow(init_top_pgt); @@ -326,7 +329,7 @@ void __init kasan_init(void) clear_pgds(KASAN_SHADOW_START & PGDIR_MASK, KASAN_SHADOW_END); - kasan_populate_zero_shadow((void *)(KASAN_SHADOW_START & PGDIR_MASK), + kasan_populate_early_shadow((void *)(KASAN_SHADOW_START & PGDIR_MASK), kasan_mem_to_shadow((void *)PAGE_OFFSET)); for (i = 0; i < E820_MAX_ENTRIES; i++) { @@ -338,41 +341,41 @@ void __init kasan_init(void) shadow_cpu_entry_begin = (void *)CPU_ENTRY_AREA_BASE; shadow_cpu_entry_begin = kasan_mem_to_shadow(shadow_cpu_entry_begin); - shadow_cpu_entry_begin = (void *)round_down((unsigned long)shadow_cpu_entry_begin, - PAGE_SIZE); + shadow_cpu_entry_begin = (void *)round_down( + (unsigned long)shadow_cpu_entry_begin, PAGE_SIZE); shadow_cpu_entry_end = (void *)(CPU_ENTRY_AREA_BASE + CPU_ENTRY_AREA_MAP_SIZE); shadow_cpu_entry_end = kasan_mem_to_shadow(shadow_cpu_entry_end); - shadow_cpu_entry_end = (void *)round_up((unsigned long)shadow_cpu_entry_end, - PAGE_SIZE); + shadow_cpu_entry_end = (void *)round_up( + (unsigned long)shadow_cpu_entry_end, PAGE_SIZE); - kasan_populate_zero_shadow( + kasan_populate_early_shadow( kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM), shadow_cpu_entry_begin); kasan_populate_shadow((unsigned long)shadow_cpu_entry_begin, (unsigned long)shadow_cpu_entry_end, 0); - kasan_populate_zero_shadow(shadow_cpu_entry_end, - kasan_mem_to_shadow((void *)__START_KERNEL_map)); + kasan_populate_early_shadow(shadow_cpu_entry_end, + kasan_mem_to_shadow((void *)__START_KERNEL_map)); kasan_populate_shadow((unsigned long)kasan_mem_to_shadow(_stext), (unsigned long)kasan_mem_to_shadow(_end), early_pfn_to_nid(__pa(_stext))); - kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), - (void *)KASAN_SHADOW_END); + kasan_populate_early_shadow(kasan_mem_to_shadow((void *)MODULES_END), + (void *)KASAN_SHADOW_END); load_cr3(init_top_pgt); __flush_tlb_all(); /* - * kasan_zero_page has been used as early shadow memory, thus it may - * contain some garbage. Now we can clear and write protect it, since - * after the TLB flush no one should write to it. + * kasan_early_shadow_page has been used as early shadow memory, thus + * it may contain some garbage. Now we can clear and write protect it, + * since after the TLB flush no one should write to it. */ - memset(kasan_zero_page, 0, PAGE_SIZE); + memset(kasan_early_shadow_page, 0, PAGE_SIZE); for (i = 0; i < PTRS_PER_PTE; i++) { pte_t pte; pgprot_t prot; @@ -380,8 +383,8 @@ void __init kasan_init(void) prot = __pgprot(__PAGE_KERNEL_RO | _PAGE_ENC); pgprot_val(prot) &= __default_kernel_pte_mask; - pte = __pte(__pa(kasan_zero_page) | pgprot_val(prot)); - set_pte(&kasan_zero_pte[i], pte); + pte = __pte(__pa(kasan_early_shadow_page) | pgprot_val(prot)); + set_pte(&kasan_early_shadow_pte[i], pte); } /* Flush TLBs again to be sure that write protection applied. */ __flush_tlb_all(); diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 006f373f54ab..385afa2b9e17 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -381,13 +381,6 @@ void __init mem_encrypt_init(void) swiotlb_update_mem_attributes(); /* - * With SEV, DMA operations cannot use encryption, we need to use - * SWIOTLB to bounce buffer DMA operation. - */ - if (sev_active()) - dma_ops = &swiotlb_dma_ops; - - /* * With SEV, we need to unroll the rep string I/O instructions. */ if (sev_active()) diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h index 4e1f6e1b8159..319bde386d5f 100644 --- a/arch/x86/mm/mm_internal.h +++ b/arch/x86/mm/mm_internal.h @@ -19,4 +19,6 @@ extern int after_bootmem; void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache); +extern unsigned long tlb_single_page_flush_ceiling; + #endif /* __X86_MM_INTERNAL_H */ diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c index 08f8f76a4852..facce271e8b9 100644 --- a/arch/x86/mm/pageattr-test.c +++ b/arch/x86/mm/pageattr-test.c @@ -23,7 +23,8 @@ static __read_mostly int print = 1; enum { - NTEST = 400, + NTEST = 3 * 100, + NPAGES = 100, #ifdef CONFIG_X86_64 LPS = (1 << PMD_SHIFT), #elif defined(CONFIG_X86_PAE) @@ -110,6 +111,9 @@ static int print_split(struct split_state *s) static unsigned long addr[NTEST]; static unsigned int len[NTEST]; +static struct page *pages[NPAGES]; +static unsigned long addrs[NPAGES]; + /* Change the global bit on random pages in the direct mapping */ static int pageattr_test(void) { @@ -120,7 +124,6 @@ static int pageattr_test(void) unsigned int level; int i, k; int err; - unsigned long test_addr; if (print) printk(KERN_INFO "CPA self-test:\n"); @@ -137,7 +140,7 @@ static int pageattr_test(void) unsigned long pfn = prandom_u32() % max_pfn_mapped; addr[i] = (unsigned long)__va(pfn << PAGE_SHIFT); - len[i] = prandom_u32() % 100; + len[i] = prandom_u32() % NPAGES; len[i] = min_t(unsigned long, len[i], max_pfn_mapped - pfn - 1); if (len[i] == 0) @@ -167,14 +170,29 @@ static int pageattr_test(void) break; } __set_bit(pfn + k, bm); + addrs[k] = addr[i] + k*PAGE_SIZE; + pages[k] = pfn_to_page(pfn + k); } if (!addr[i] || !pte || !k) { addr[i] = 0; continue; } - test_addr = addr[i]; - err = change_page_attr_set(&test_addr, len[i], PAGE_CPA_TEST, 0); + switch (i % 3) { + case 0: + err = change_page_attr_set(&addr[i], len[i], PAGE_CPA_TEST, 0); + break; + + case 1: + err = change_page_attr_set(addrs, len[1], PAGE_CPA_TEST, 1); + break; + + case 2: + err = cpa_set_pages_array(pages, len[i], PAGE_CPA_TEST); + break; + } + + if (err < 0) { printk(KERN_ERR "CPA %d failed %d\n", i, err); failed++; @@ -206,8 +224,7 @@ static int pageattr_test(void) failed++; continue; } - test_addr = addr[i]; - err = change_page_attr_clear(&test_addr, len[i], PAGE_CPA_TEST, 0); + err = change_page_attr_clear(&addr[i], len[i], PAGE_CPA_TEST, 0); if (err < 0) { printk(KERN_ERR "CPA reverting failed: %d\n", err); failed++; diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index db7a10082238..4f8972311a77 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -26,6 +26,8 @@ #include <asm/pat.h> #include <asm/set_memory.h> +#include "mm_internal.h" + /* * The current flushing context - we pass it instead of 5 arguments: */ @@ -35,11 +37,11 @@ struct cpa_data { pgprot_t mask_set; pgprot_t mask_clr; unsigned long numpages; - int flags; + unsigned long curpage; unsigned long pfn; - unsigned force_split : 1, + unsigned int flags; + unsigned int force_split : 1, force_static_prot : 1; - int curpage; struct page **pages; }; @@ -228,19 +230,28 @@ static bool __cpa_pfn_in_highmap(unsigned long pfn) #endif +static unsigned long __cpa_addr(struct cpa_data *cpa, unsigned long idx) +{ + if (cpa->flags & CPA_PAGES_ARRAY) { + struct page *page = cpa->pages[idx]; + + if (unlikely(PageHighMem(page))) + return 0; + + return (unsigned long)page_address(page); + } + + if (cpa->flags & CPA_ARRAY) + return cpa->vaddr[idx]; + + return *cpa->vaddr + idx * PAGE_SIZE; +} + /* * Flushing functions */ -/** - * clflush_cache_range - flush a cache range with clflush - * @vaddr: virtual start address - * @size: number of bytes to flush - * - * clflushopt is an unordered instruction which needs fencing with mfence or - * sfence to avoid ordering issues. - */ -void clflush_cache_range(void *vaddr, unsigned int size) +static void clflush_cache_range_opt(void *vaddr, unsigned int size) { const unsigned long clflush_size = boot_cpu_data.x86_clflush_size; void *p = (void *)((unsigned long)vaddr & ~(clflush_size - 1)); @@ -249,11 +260,22 @@ void clflush_cache_range(void *vaddr, unsigned int size) if (p >= vend) return; - mb(); - for (; p < vend; p += clflush_size) clflushopt(p); +} +/** + * clflush_cache_range - flush a cache range with clflush + * @vaddr: virtual start address + * @size: number of bytes to flush + * + * CLFLUSHOPT is an unordered instruction which needs fencing with MFENCE or + * SFENCE to avoid ordering issues. + */ +void clflush_cache_range(void *vaddr, unsigned int size) +{ + mb(); + clflush_cache_range_opt(vaddr, size); mb(); } EXPORT_SYMBOL_GPL(clflush_cache_range); @@ -285,79 +307,49 @@ static void cpa_flush_all(unsigned long cache) on_each_cpu(__cpa_flush_all, (void *) cache, 1); } -static bool __cpa_flush_range(unsigned long start, int numpages, int cache) +void __cpa_flush_tlb(void *data) { - BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); - - WARN_ON(PAGE_ALIGN(start) != start); - - if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) { - cpa_flush_all(cache); - return true; - } + struct cpa_data *cpa = data; + unsigned int i; - flush_tlb_kernel_range(start, start + PAGE_SIZE * numpages); - - return !cache; + for (i = 0; i < cpa->numpages; i++) + __flush_tlb_one_kernel(__cpa_addr(cpa, i)); } -static void cpa_flush_range(unsigned long start, int numpages, int cache) +static void cpa_flush(struct cpa_data *data, int cache) { - unsigned int i, level; - unsigned long addr; + struct cpa_data *cpa = data; + unsigned int i; - if (__cpa_flush_range(start, numpages, cache)) - return; - - /* - * We only need to flush on one CPU, - * clflush is a MESI-coherent instruction that - * will cause all other CPUs to flush the same - * cachelines: - */ - for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) { - pte_t *pte = lookup_address(addr, &level); + BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); - /* - * Only flush present addresses: - */ - if (pte && (pte_val(*pte) & _PAGE_PRESENT)) - clflush_cache_range((void *) addr, PAGE_SIZE); + if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) { + cpa_flush_all(cache); + return; } -} -static void cpa_flush_array(unsigned long baddr, unsigned long *start, - int numpages, int cache, - int in_flags, struct page **pages) -{ - unsigned int i, level; + if (cpa->numpages <= tlb_single_page_flush_ceiling) + on_each_cpu(__cpa_flush_tlb, cpa, 1); + else + flush_tlb_all(); - if (__cpa_flush_range(baddr, numpages, cache)) + if (!cache) return; - /* - * We only need to flush on one CPU, - * clflush is a MESI-coherent instruction that - * will cause all other CPUs to flush the same - * cachelines: - */ - for (i = 0; i < numpages; i++) { - unsigned long addr; - pte_t *pte; - - if (in_flags & CPA_PAGES_ARRAY) - addr = (unsigned long)page_address(pages[i]); - else - addr = start[i]; + mb(); + for (i = 0; i < cpa->numpages; i++) { + unsigned long addr = __cpa_addr(cpa, i); + unsigned int level; - pte = lookup_address(addr, &level); + pte_t *pte = lookup_address(addr, &level); /* * Only flush present addresses: */ if (pte && (pte_val(*pte) & _PAGE_PRESENT)) - clflush_cache_range((void *)addr, PAGE_SIZE); + clflush_cache_range_opt((void *)addr, PAGE_SIZE); } + mb(); } static bool overlaps(unsigned long r1_start, unsigned long r1_end, @@ -1468,15 +1460,7 @@ static int __change_page_attr(struct cpa_data *cpa, int primary) unsigned int level; pte_t *kpte, old_pte; - if (cpa->flags & CPA_PAGES_ARRAY) { - struct page *page = cpa->pages[cpa->curpage]; - if (unlikely(PageHighMem(page))) - return 0; - address = (unsigned long)page_address(page); - } else if (cpa->flags & CPA_ARRAY) - address = cpa->vaddr[cpa->curpage]; - else - address = *cpa->vaddr; + address = __cpa_addr(cpa, cpa->curpage); repeat: kpte = _lookup_address_cpa(cpa, address, &level); if (!kpte) @@ -1557,22 +1541,14 @@ static int cpa_process_alias(struct cpa_data *cpa) * No need to redo, when the primary call touched the direct * mapping already: */ - if (cpa->flags & CPA_PAGES_ARRAY) { - struct page *page = cpa->pages[cpa->curpage]; - if (unlikely(PageHighMem(page))) - return 0; - vaddr = (unsigned long)page_address(page); - } else if (cpa->flags & CPA_ARRAY) - vaddr = cpa->vaddr[cpa->curpage]; - else - vaddr = *cpa->vaddr; - + vaddr = __cpa_addr(cpa, cpa->curpage); if (!(within(vaddr, PAGE_OFFSET, PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) { alias_cpa = *cpa; alias_cpa.vaddr = &laddr; alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); + alias_cpa.curpage = 0; ret = __change_page_attr_set_clr(&alias_cpa, 0); if (ret) @@ -1592,6 +1568,7 @@ static int cpa_process_alias(struct cpa_data *cpa) alias_cpa = *cpa; alias_cpa.vaddr = &temp_cpa_vaddr; alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); + alias_cpa.curpage = 0; /* * The high mapping range is imprecise, so ignore the @@ -1607,14 +1584,15 @@ static int cpa_process_alias(struct cpa_data *cpa) static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) { unsigned long numpages = cpa->numpages; - int ret; + unsigned long rempages = numpages; + int ret = 0; - while (numpages) { + while (rempages) { /* * Store the remaining nr of pages for the large page * preservation check. */ - cpa->numpages = numpages; + cpa->numpages = rempages; /* for array changes, we can't use large page */ if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY)) cpa->numpages = 1; @@ -1625,12 +1603,12 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) if (!debug_pagealloc_enabled()) spin_unlock(&cpa_lock); if (ret) - return ret; + goto out; if (checkalias) { ret = cpa_process_alias(cpa); if (ret) - return ret; + goto out; } /* @@ -1638,15 +1616,15 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) * CPA operation. Either a large page has been * preserved or a single page update happened. */ - BUG_ON(cpa->numpages > numpages || !cpa->numpages); - numpages -= cpa->numpages; - if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) - cpa->curpage++; - else - *cpa->vaddr += cpa->numpages * PAGE_SIZE; - + BUG_ON(cpa->numpages > rempages || !cpa->numpages); + rempages -= cpa->numpages; + cpa->curpage += cpa->numpages; } - return 0; + +out: + /* Restore the original numpages */ + cpa->numpages = numpages; + return ret; } /* @@ -1679,7 +1657,6 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, { struct cpa_data cpa; int ret, cache, checkalias; - unsigned long baddr = 0; memset(&cpa, 0, sizeof(cpa)); @@ -1704,7 +1681,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, } else if (!(in_flag & CPA_PAGES_ARRAY)) { /* * in_flag of CPA_PAGES_ARRAY implies it is aligned. - * No need to cehck in that case + * No need to check in that case */ if (*addr & ~PAGE_MASK) { *addr &= PAGE_MASK; @@ -1713,11 +1690,6 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, */ WARN_ON_ONCE(1); } - /* - * Save address for cache flush. *addr is modified in the call - * to __change_page_attr_set_clr() below. - */ - baddr = make_addr_canonical_again(*addr); } /* Must avoid aliasing mappings in the highmem code */ @@ -1765,13 +1737,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, goto out; } - if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) { - cpa_flush_array(baddr, addr, numpages, cache, - cpa.flags, pages); - } else { - cpa_flush_range(baddr, numpages, cache); - } - + cpa_flush(&cpa, cache); out: return ret; } @@ -1842,14 +1808,14 @@ out_err: } EXPORT_SYMBOL(set_memory_uc); -static int _set_memory_array(unsigned long *addr, int addrinarray, +static int _set_memory_array(unsigned long *addr, int numpages, enum page_cache_mode new_type) { enum page_cache_mode set_type; int i, j; int ret; - for (i = 0; i < addrinarray; i++) { + for (i = 0; i < numpages; i++) { ret = reserve_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE, new_type, NULL); if (ret) @@ -1860,11 +1826,11 @@ static int _set_memory_array(unsigned long *addr, int addrinarray, set_type = (new_type == _PAGE_CACHE_MODE_WC) ? _PAGE_CACHE_MODE_UC_MINUS : new_type; - ret = change_page_attr_set(addr, addrinarray, + ret = change_page_attr_set(addr, numpages, cachemode2pgprot(set_type), 1); if (!ret && new_type == _PAGE_CACHE_MODE_WC) - ret = change_page_attr_set_clr(addr, addrinarray, + ret = change_page_attr_set_clr(addr, numpages, cachemode2pgprot( _PAGE_CACHE_MODE_WC), __pgprot(_PAGE_CACHE_MASK), @@ -1881,36 +1847,34 @@ out_free: return ret; } -int set_memory_array_uc(unsigned long *addr, int addrinarray) +int set_memory_array_uc(unsigned long *addr, int numpages) { - return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_UC_MINUS); + return _set_memory_array(addr, numpages, _PAGE_CACHE_MODE_UC_MINUS); } EXPORT_SYMBOL(set_memory_array_uc); -int set_memory_array_wc(unsigned long *addr, int addrinarray) +int set_memory_array_wc(unsigned long *addr, int numpages) { - return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_WC); + return _set_memory_array(addr, numpages, _PAGE_CACHE_MODE_WC); } EXPORT_SYMBOL(set_memory_array_wc); -int set_memory_array_wt(unsigned long *addr, int addrinarray) +int set_memory_array_wt(unsigned long *addr, int numpages) { - return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_WT); + return _set_memory_array(addr, numpages, _PAGE_CACHE_MODE_WT); } EXPORT_SYMBOL_GPL(set_memory_array_wt); int _set_memory_wc(unsigned long addr, int numpages) { int ret; - unsigned long addr_copy = addr; ret = change_page_attr_set(&addr, numpages, cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS), 0); if (!ret) { - ret = change_page_attr_set_clr(&addr_copy, numpages, - cachemode2pgprot( - _PAGE_CACHE_MODE_WC), + ret = change_page_attr_set_clr(&addr, numpages, + cachemode2pgprot(_PAGE_CACHE_MODE_WC), __pgprot(_PAGE_CACHE_MASK), 0, 0, NULL); } @@ -1977,18 +1941,18 @@ int set_memory_wb(unsigned long addr, int numpages) } EXPORT_SYMBOL(set_memory_wb); -int set_memory_array_wb(unsigned long *addr, int addrinarray) +int set_memory_array_wb(unsigned long *addr, int numpages) { int i; int ret; /* WB cache mode is hard wired to all cache attribute bits being 0 */ - ret = change_page_attr_clear(addr, addrinarray, + ret = change_page_attr_clear(addr, numpages, __pgprot(_PAGE_CACHE_MASK), 1); if (ret) return ret; - for (i = 0; i < addrinarray; i++) + for (i = 0; i < numpages; i++) free_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE); return 0; @@ -2058,7 +2022,6 @@ int set_memory_global(unsigned long addr, int numpages) static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) { struct cpa_data cpa; - unsigned long start; int ret; /* Nothing to do if memory encryption is not active */ @@ -2069,8 +2032,6 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) if (WARN_ONCE(addr & ~PAGE_MASK, "misaligned address: %#lx\n", addr)) addr &= PAGE_MASK; - start = addr; - memset(&cpa, 0, sizeof(cpa)); cpa.vaddr = &addr; cpa.numpages = numpages; @@ -2085,18 +2046,18 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) /* * Before changing the encryption attribute, we need to flush caches. */ - cpa_flush_range(start, numpages, 1); + cpa_flush(&cpa, 1); ret = __change_page_attr_set_clr(&cpa, 1); /* - * After changing the encryption attribute, we need to flush TLBs - * again in case any speculative TLB caching occurred (but no need - * to flush caches again). We could just use cpa_flush_all(), but - * in case TLB flushing gets optimized in the cpa_flush_range() - * path use the same logic as above. + * After changing the encryption attribute, we need to flush TLBs again + * in case any speculative TLB caching occurred (but no need to flush + * caches again). We could just use cpa_flush_all(), but in case TLB + * flushing gets optimized in the cpa_flush() path use the same logic + * as above. */ - cpa_flush_range(start, numpages, 0); + cpa_flush(&cpa, 0); return ret; } @@ -2121,7 +2082,7 @@ int set_pages_uc(struct page *page, int numpages) } EXPORT_SYMBOL(set_pages_uc); -static int _set_pages_array(struct page **pages, int addrinarray, +static int _set_pages_array(struct page **pages, int numpages, enum page_cache_mode new_type) { unsigned long start; @@ -2131,7 +2092,7 @@ static int _set_pages_array(struct page **pages, int addrinarray, int free_idx; int ret; - for (i = 0; i < addrinarray; i++) { + for (i = 0; i < numpages; i++) { if (PageHighMem(pages[i])) continue; start = page_to_pfn(pages[i]) << PAGE_SHIFT; @@ -2144,10 +2105,10 @@ static int _set_pages_array(struct page **pages, int addrinarray, set_type = (new_type == _PAGE_CACHE_MODE_WC) ? _PAGE_CACHE_MODE_UC_MINUS : new_type; - ret = cpa_set_pages_array(pages, addrinarray, + ret = cpa_set_pages_array(pages, numpages, cachemode2pgprot(set_type)); if (!ret && new_type == _PAGE_CACHE_MODE_WC) - ret = change_page_attr_set_clr(NULL, addrinarray, + ret = change_page_attr_set_clr(NULL, numpages, cachemode2pgprot( _PAGE_CACHE_MODE_WC), __pgprot(_PAGE_CACHE_MASK), @@ -2167,21 +2128,21 @@ err_out: return -EINVAL; } -int set_pages_array_uc(struct page **pages, int addrinarray) +int set_pages_array_uc(struct page **pages, int numpages) { - return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_UC_MINUS); + return _set_pages_array(pages, numpages, _PAGE_CACHE_MODE_UC_MINUS); } EXPORT_SYMBOL(set_pages_array_uc); -int set_pages_array_wc(struct page **pages, int addrinarray) +int set_pages_array_wc(struct page **pages, int numpages) { - return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_WC); + return _set_pages_array(pages, numpages, _PAGE_CACHE_MODE_WC); } EXPORT_SYMBOL(set_pages_array_wc); -int set_pages_array_wt(struct page **pages, int addrinarray) +int set_pages_array_wt(struct page **pages, int numpages) { - return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_WT); + return _set_pages_array(pages, numpages, _PAGE_CACHE_MODE_WT); } EXPORT_SYMBOL_GPL(set_pages_array_wt); @@ -2193,7 +2154,7 @@ int set_pages_wb(struct page *page, int numpages) } EXPORT_SYMBOL(set_pages_wb); -int set_pages_array_wb(struct page **pages, int addrinarray) +int set_pages_array_wb(struct page **pages, int numpages) { int retval; unsigned long start; @@ -2201,12 +2162,12 @@ int set_pages_array_wb(struct page **pages, int addrinarray) int i; /* WB cache mode is hard wired to all cache attribute bits being 0 */ - retval = cpa_clear_pages_array(pages, addrinarray, + retval = cpa_clear_pages_array(pages, numpages, __pgprot(_PAGE_CACHE_MASK)); if (retval) return retval; - for (i = 0; i < addrinarray; i++) { + for (i = 0; i < numpages; i++) { if (PageHighMem(pages[i])) continue; start = page_to_pfn(pages[i]) << PAGE_SHIFT; @@ -2338,8 +2299,8 @@ bool kernel_page_present(struct page *page) #endif /* CONFIG_DEBUG_PAGEALLOC */ -int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, - unsigned numpages, unsigned long page_flags) +int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, + unsigned numpages, unsigned long page_flags) { int retval = -EINVAL; @@ -2353,6 +2314,8 @@ int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, .flags = 0, }; + WARN_ONCE(num_online_cpus() > 1, "Don't call after initializing SMP"); + if (!(__supported_pte_mask & _PAGE_NX)) goto out; @@ -2375,6 +2338,40 @@ out: } /* + * __flush_tlb_all() flushes mappings only on current CPU and hence this + * function shouldn't be used in an SMP environment. Presently, it's used only + * during boot (way before smp_init()) by EFI subsystem and hence is ok. + */ +int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address, + unsigned long numpages) +{ + int retval; + + /* + * The typical sequence for unmapping is to find a pte through + * lookup_address_in_pgd() (ideally, it should never return NULL because + * the address is already mapped) and change it's protections. As pfn is + * the *target* of a mapping, it's not useful while unmapping. + */ + struct cpa_data cpa = { + .vaddr = &address, + .pfn = 0, + .pgd = pgd, + .numpages = numpages, + .mask_set = __pgprot(0), + .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW), + .flags = 0, + }; + + WARN_ONCE(num_online_cpus() > 1, "Don't call after initializing SMP"); + + retval = __change_page_attr_set_clr(&cpa, 0); + __flush_tlb_all(); + + return retval; +} + +/* * The testcases use internal knowledge of the implementation that shouldn't * be exposed to the rest of the kernel. Include these directly here. */ diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 08013524fba1..4fe956a63b25 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -519,8 +519,13 @@ static u64 sanitize_phys(u64 address) * for a "decoy" virtual address (bit 63 clear) passed to * set_memory_X(). __pa() on a "decoy" address results in a * physical address with bit 63 set. + * + * Decoy addresses are not present for 32-bit builds, see + * set_mce_nospec(). */ - return address & __PHYSICAL_MASK; + if (IS_ENABLED(CONFIG_X86_64)) + return address & __PHYSICAL_MASK; + return address; } /* @@ -546,7 +551,11 @@ int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type, start = sanitize_phys(start); end = sanitize_phys(end); - BUG_ON(start >= end); /* end is exclusive */ + if (start >= end) { + WARN(1, "%s failed: [mem %#010Lx-%#010Lx], req %s\n", __func__, + start, end - 1, cattr_name(req_type)); + return -EINVAL; + } if (!pat_enabled()) { /* This is identical to page table setting without PAT */ diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 59274e2c1ac4..b0284eab14dc 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -794,6 +794,14 @@ int pmd_clear_huge(pmd_t *pmd) return 0; } +/* + * Until we support 512GB pages, skip them in the vmap area. + */ +int p4d_free_pud_page(p4d_t *p4d, unsigned long addr) +{ + return 0; +} + #ifdef CONFIG_X86_64 /** * pud_free_pmd_page - Clear pud entry and free pmd page. @@ -811,9 +819,6 @@ int pud_free_pmd_page(pud_t *pud, unsigned long addr) pte_t *pte; int i; - if (pud_none(*pud)) - return 1; - pmd = (pmd_t *)pud_page_vaddr(*pud); pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL); if (!pmd_sv) @@ -855,9 +860,6 @@ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) { pte_t *pte; - if (pmd_none(*pmd)) - return 1; - pte = (pte_t *)pmd_page_vaddr(*pmd); pmd_clear(pmd); diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c index 6e98e0a7c923..047a77f6a10c 100644 --- a/arch/x86/mm/pkeys.c +++ b/arch/x86/mm/pkeys.c @@ -131,6 +131,7 @@ int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey * in the process's lifetime will not accidentally get access * to data which is pkey-protected later on. */ +static u32 init_pkru_value = PKRU_AD_KEY( 1) | PKRU_AD_KEY( 2) | PKRU_AD_KEY( 3) | PKRU_AD_KEY( 4) | PKRU_AD_KEY( 5) | PKRU_AD_KEY( 6) | PKRU_AD_KEY( 7) | PKRU_AD_KEY( 8) | PKRU_AD_KEY( 9) | diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index bddd6b3cee1d..999d6d8f0bef 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -7,7 +7,6 @@ #include <linux/export.h> #include <linux/cpu.h> #include <linux/debugfs.h> -#include <linux/ptrace.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> @@ -16,6 +15,8 @@ #include <asm/apic.h> #include <asm/uv/uv.h> +#include "mm_internal.h" + /* * TLB flushing, formerly SMP-only * c/o Linus Torvalds. @@ -31,6 +32,12 @@ */ /* + * Use bit 0 to mangle the TIF_SPEC_IB state into the mm pointer which is + * stored in cpu_tlb_state.last_user_mm_ibpb. + */ +#define LAST_USER_MM_IBPB 0x1UL + +/* * We get here when we do something requiring a TLB invalidation * but could not go invalidate all of the contexts. We do the * necessary invalidation by clearing out the 'ctx_id' which @@ -181,17 +188,87 @@ static void sync_current_stack_to_mm(struct mm_struct *mm) } } -static bool ibpb_needed(struct task_struct *tsk, u64 last_ctx_id) +static inline unsigned long mm_mangle_tif_spec_ib(struct task_struct *next) +{ + unsigned long next_tif = task_thread_info(next)->flags; + unsigned long ibpb = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_IBPB; + + return (unsigned long)next->mm | ibpb; +} + +static void cond_ibpb(struct task_struct *next) { + if (!next || !next->mm) + return; + /* - * Check if the current (previous) task has access to the memory - * of the @tsk (next) task. If access is denied, make sure to - * issue a IBPB to stop user->user Spectre-v2 attacks. - * - * Note: __ptrace_may_access() returns 0 or -ERRNO. + * Both, the conditional and the always IBPB mode use the mm + * pointer to avoid the IBPB when switching between tasks of the + * same process. Using the mm pointer instead of mm->context.ctx_id + * opens a hypothetical hole vs. mm_struct reuse, which is more or + * less impossible to control by an attacker. Aside of that it + * would only affect the first schedule so the theoretically + * exposed data is not really interesting. */ - return (tsk && tsk->mm && tsk->mm->context.ctx_id != last_ctx_id && - ptrace_may_access_sched(tsk, PTRACE_MODE_SPEC_IBPB)); + if (static_branch_likely(&switch_mm_cond_ibpb)) { + unsigned long prev_mm, next_mm; + + /* + * This is a bit more complex than the always mode because + * it has to handle two cases: + * + * 1) Switch from a user space task (potential attacker) + * which has TIF_SPEC_IB set to a user space task + * (potential victim) which has TIF_SPEC_IB not set. + * + * 2) Switch from a user space task (potential attacker) + * which has TIF_SPEC_IB not set to a user space task + * (potential victim) which has TIF_SPEC_IB set. + * + * This could be done by unconditionally issuing IBPB when + * a task which has TIF_SPEC_IB set is either scheduled in + * or out. Though that results in two flushes when: + * + * - the same user space task is scheduled out and later + * scheduled in again and only a kernel thread ran in + * between. + * + * - a user space task belonging to the same process is + * scheduled in after a kernel thread ran in between + * + * - a user space task belonging to the same process is + * scheduled in immediately. + * + * Optimize this with reasonably small overhead for the + * above cases. Mangle the TIF_SPEC_IB bit into the mm + * pointer of the incoming task which is stored in + * cpu_tlbstate.last_user_mm_ibpb for comparison. + */ + next_mm = mm_mangle_tif_spec_ib(next); + prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_ibpb); + + /* + * Issue IBPB only if the mm's are different and one or + * both have the IBPB bit set. + */ + if (next_mm != prev_mm && + (next_mm | prev_mm) & LAST_USER_MM_IBPB) + indirect_branch_prediction_barrier(); + + this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, next_mm); + } + + if (static_branch_unlikely(&switch_mm_always_ibpb)) { + /* + * Only flush when switching to a user space task with a + * different context than the user space task which ran + * last on this CPU. + */ + if (this_cpu_read(cpu_tlbstate.last_user_mm) != next->mm) { + indirect_branch_prediction_barrier(); + this_cpu_write(cpu_tlbstate.last_user_mm, next->mm); + } + } } void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, @@ -292,22 +369,12 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, new_asid = prev_asid; need_flush = true; } else { - u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id); - /* * Avoid user/user BTB poisoning by flushing the branch * predictor when switching between processes. This stops * one process from doing Spectre-v2 attacks on another. - * - * As an optimization, flush indirect branches only when - * switching into a processes that can't be ptrace by the - * current one (as in such case, attacker has much more - * convenient way how to tamper with the next process than - * branch buffer poisoning). */ - if (static_cpu_has(X86_FEATURE_USE_IBPB) && - ibpb_needed(tsk, last_ctx_id)) - indirect_branch_prediction_barrier(); + cond_ibpb(tsk); if (IS_ENABLED(CONFIG_VMAP_STACK)) { /* @@ -365,14 +432,6 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); } - /* - * Record last user mm's context id, so we can avoid - * flushing branch buffer with IBPB if we switch back - * to the same user. - */ - if (next != &init_mm) - this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id); - /* Make sure we write CR3 before loaded_mm. */ barrier(); @@ -441,7 +500,7 @@ void initialize_tlbstate_and_flush(void) write_cr3(build_cr3(mm->pgd, 0)); /* Reinitialize tlbstate. */ - this_cpu_write(cpu_tlbstate.last_ctx_id, mm->context.ctx_id); + this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, LAST_USER_MM_IBPB); this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0); this_cpu_write(cpu_tlbstate.next_asid, 1); this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id); @@ -664,7 +723,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask, * * This is in units of pages. */ -static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; +unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, unsigned long end, unsigned int stride_shift, |