From 3f148f3318140035e87decc1214795ff0755757b Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 28 Oct 2022 00:31:04 +0300 Subject: x86/kasan: Map shadow for percpu pages on demand KASAN maps shadow for the entire CPU-entry-area: [CPU_ENTRY_AREA_BASE, CPU_ENTRY_AREA_BASE + CPU_ENTRY_AREA_MAP_SIZE] This will explode once the per-cpu entry areas are randomized, since that will increase CPU_ENTRY_AREA_MAP_SIZE to 512 GB and KASAN fails to allocate shadow for such a big area. Fix this by allocating KASAN shadow only for the CPU entry area addresses actually mapped by cea_map_percpu_pages(). Thanks to the 0day folks for finding and reporting this to be an issue. [ dhansen: tweak changelog since this will get committed before peterz's actual cpu-entry-area randomization ] Signed-off-by: Andrey Ryabinin Signed-off-by: Dave Hansen Tested-by: Yujie Liu Cc: kernel test robot Link: https://lore.kernel.org/r/202210241508.2e203c3d-yujie.liu@intel.com --- arch/x86/mm/cpu_entry_area.c | 8 +++++++- arch/x86/mm/kasan_init_64.c | 15 ++++++++++++--- 2 files changed, 19 insertions(+), 4 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c index 6c2f1b76a0b6..d7081b1accca 100644 --- a/arch/x86/mm/cpu_entry_area.c +++ b/arch/x86/mm/cpu_entry_area.c @@ -9,6 +9,7 @@ #include <asm/cpu_entry_area.h> #include <asm/fixmap.h> #include <asm/desc.h> +#include <asm/kasan.h> static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage); @@ -53,8 +54,13 @@ void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags) static void __init cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot) { + phys_addr_t pa = per_cpu_ptr_to_phys(ptr); + + kasan_populate_shadow_for_vaddr(cea_vaddr, pages * PAGE_SIZE, + early_pfn_to_nid(PFN_DOWN(pa))); + for ( ; pages; pages--, cea_vaddr+= PAGE_SIZE, ptr += PAGE_SIZE) - cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot); + cea_set_pte(cea_vaddr, pa, prot); } static void __init percpu_setup_debug_store(unsigned int cpu) diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index e7b9b464a82f..d1416926ad52 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -316,6 +316,18 @@ void __init kasan_early_init(void) kasan_map_early_shadow(init_top_pgt); } +void __init kasan_populate_shadow_for_vaddr(void *va, size_t size, int nid) +{ + unsigned long shadow_start, shadow_end; + + shadow_start = (unsigned long)kasan_mem_to_shadow(va); + shadow_start = round_down(shadow_start, PAGE_SIZE); + shadow_end = (unsigned long)kasan_mem_to_shadow(va + size); + shadow_end = round_up(shadow_end, PAGE_SIZE); + + kasan_populate_shadow(shadow_start, shadow_end, nid); +} + void __init kasan_init(void) { int i; @@ -393,9 +405,6 @@ void __init kasan_init(void) kasan_mem_to_shadow((void *)VMALLOC_END + 1), shadow_cpu_entry_begin); - kasan_populate_shadow((unsigned long)shadow_cpu_entry_begin, - (unsigned long)shadow_cpu_entry_end, 0); - kasan_populate_early_shadow(shadow_cpu_entry_end, kasan_mem_to_shadow((void *)__START_KERNEL_map)); -- cgit From 97e3d26b5e5f371b3ee223d94dd123e6c442ba80 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 27 Oct 2022 14:54:41 -0700 Subject: x86/mm: Randomize per-cpu entry area Seth found that the CPU-entry-area, the piece of per-cpu data that is mapped into the userspace page-tables for kPTI, is not subject to any randomization -- irrespective of kASLR settings.
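Concretely, the pre-patch layout is completely deterministic; the line removed in the diff below computes every CPU's entry area as a fixed linear function of the CPU number (sketch, reproduced from get_cpu_entry_area()):

	/* Pre-patch: fixed placement, no randomness at all. */
	unsigned long va = CPU_ENTRY_AREA_PER_CPU + cpu * CPU_ENTRY_AREA_SIZE;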
On x86_64 a whole P4D (512 GB) of virtual address space is reserved for this structure, which is plenty large enough to randomize things a little. As such, use a straightforward randomization scheme that avoids duplicates to spread the existing CPUs over the available space. [ bp: Fix le build. ] Reported-by: Seth Jenkins Reviewed-by: Kees Cook Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Dave Hansen Signed-off-by: Borislav Petkov --- arch/x86/mm/cpu_entry_area.c | 46 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 42 insertions(+), 4 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c index d7081b1accca..dff9001e5e12 100644 --- a/arch/x86/mm/cpu_entry_area.c +++ b/arch/x86/mm/cpu_entry_area.c @@ -16,16 +16,53 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage) #ifdef CONFIG_X86_64 static DEFINE_PER_CPU_PAGE_ALIGNED(struct exception_stacks, exception_stacks); DEFINE_PER_CPU(struct cea_exception_stacks*, cea_exception_stacks); -#endif -#ifdef CONFIG_X86_32 +static DEFINE_PER_CPU_READ_MOSTLY(unsigned long, _cea_offset); + +static __always_inline unsigned int cea_offset(unsigned int cpu) +{ + return per_cpu(_cea_offset, cpu); +} + +static __init void init_cea_offsets(void) +{ + unsigned int max_cea; + unsigned int i, j; + + max_cea = (CPU_ENTRY_AREA_MAP_SIZE - PAGE_SIZE) / CPU_ENTRY_AREA_SIZE; + + /* O(sodding terrible) */ + for_each_possible_cpu(i) { + unsigned int cea; + +again: + cea = prandom_u32_max(max_cea); + + for_each_possible_cpu(j) { + if (cea_offset(j) == cea) + goto again; + + if (i == j) + break; + } + + per_cpu(_cea_offset, i) = cea; + } +} +#else /* !X86_64 */ DECLARE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack); + +static __always_inline unsigned int cea_offset(unsigned int cpu) +{ + return cpu; +} +static inline void init_cea_offsets(void) { } #endif /* Is called from entry code, so must be noinstr */ noinstr struct cpu_entry_area *get_cpu_entry_area(int cpu) { - unsigned long va = CPU_ENTRY_AREA_PER_CPU + cpu * CPU_ENTRY_AREA_SIZE; + unsigned long va = CPU_ENTRY_AREA_PER_CPU + cea_offset(cpu) * CPU_ENTRY_AREA_SIZE; BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0); return (struct cpu_entry_area *) va; @@ -211,7 +248,6 @@ static __init void setup_cpu_entry_area_ptes(void) /* The +1 is for the readonly IDT: */ BUILD_BUG_ON((CPU_ENTRY_AREA_PAGES+1)*PAGE_SIZE != CPU_ENTRY_AREA_MAP_SIZE); - BUILD_BUG_ON(CPU_ENTRY_AREA_TOTAL_SIZE != CPU_ENTRY_AREA_MAP_SIZE); BUG_ON(CPU_ENTRY_AREA_BASE & ~PMD_MASK); start = CPU_ENTRY_AREA_BASE; @@ -227,6 +263,8 @@ void __init setup_cpu_entry_areas(void) { unsigned int cpu; + init_cea_offsets(); + setup_cpu_entry_area_ptes(); for_each_possible_cpu(cpu) -- cgit From 3f4c8211d982099be693be9aa7d6fc4607dff290 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 25 Oct 2022 21:38:21 +0200 Subject: x86/mm: Use mm_alloc() in poking_init() Instead of duplicating init_mm, allocate a fresh mm. The advantage is that mm_alloc() has much simpler dependencies. Additionally, it makes more conceptual sense: init_mm has no (and must not have) user state to duplicate.
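For context, a rough sketch of how poking_mm is consumed by __text_poke() (simplified from arch/x86/kernel/alternative.c; error handling and page-offset details elided): the private mm is switched in only long enough to write through a temporary alias of the target text, so it never needs any user state of its own:

	/* Sketch: patch kernel text via a CPU-private temporary mapping. */
	prev = use_temporary_mm(poking_mm);	/* no other CPU sees this mm */
	memcpy((u8 *)poking_addr + offset, opcode, len);
	unuse_temporary_mm(prev);		/* restore the previous mm  */

Starting from an empty mm_alloc() mm is therefore both cheaper and conceptually cleaner than cloning init_mm.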
Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20221025201057.816175235@infradead.org --- arch/x86/mm/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 9121bc1b9453..d3987359d441 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -801,7 +801,7 @@ void __init poking_init(void) spinlock_t *ptl; pte_t *ptep; - poking_mm = copy_init_mm(); + poking_mm = mm_alloc(); BUG_ON(!poking_mm); /* -- cgit From 414ebf148cb5c5fa727ec51fdb69c4ab82dccf3b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 25 Oct 2022 21:39:43 +0200 Subject: x86/mm: Do verify W^X at boot up Straight up revert of commit: a970174d7a10 ("x86/mm: Do not verify W^X at boot up") now that the root cause has been fixed. Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20221025201058.011279208@infradead.org --- arch/x86/mm/pat/set_memory.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 2e5a045731de..97342c42dda8 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -587,10 +587,6 @@ static inline pgprot_t verify_rwx(pgprot_t old, pgprot_t new, unsigned long star { unsigned long end; - /* Kernel text is rw at boot up */ - if (system_state == SYSTEM_BOOTING) - return new; - /* * 32-bit has some unfixable W+X issues, like EFI code * and writeable data being in the same page. Disable -- cgit From 60463628c9e0a8060ac6bef0457b0505c7532c7c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 29 Oct 2022 13:19:31 +0200 Subject: x86/mm: Implement native set_memory_rox() Provide a native implementation of set_memory_rox(), avoiding the double set_memory_ro();set_memory_x(); calls. Suggested-by: Linus Torvalds Signed-off-by: Peter Zijlstra (Intel) --- arch/x86/mm/pat/set_memory.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 97342c42dda8..f275605892df 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -2025,6 +2025,16 @@ int set_memory_ro(unsigned long addr, int numpages) return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0); } +int set_memory_rox(unsigned long addr, int numpages) +{ + pgprot_t clr = __pgprot(_PAGE_RW); + + if (__supported_pte_mask & _PAGE_NX) + clr.pgprot |= _PAGE_NX; + + return change_page_attr_clear(&addr, numpages, clr, 0); +} + int set_memory_rw(unsigned long addr, int numpages) { return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0); -- cgit From 82328227db8f0b9b5f77bb5afcd47e59d0e4d08f Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Mon, 16 May 2022 18:52:02 +0000 Subject: x86/mm: Remove P*D_PAGE_MASK and P*D_PAGE_SIZE macros Other architectures and the common mm/ use P*D_MASK and P*D_SIZE. Remove the duplicated P*D_PAGE_MASK and P*D_PAGE_SIZE macros, which are only used in x86/*.
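As a quick illustration of why this is a pure rename (the constants below are the standard x86_64 values, shown for reference only):

	/*
	 * PMD_PAGE_SIZE == PMD_SIZE == 1UL << PMD_SHIFT  (2 MiB, PMD_SHIFT == 21)
	 * PMD_PAGE_MASK == PMD_MASK == ~(PMD_SIZE - 1)
	 * PUD_PAGE_SIZE == PUD_SIZE == 1UL << PUD_SHIFT  (1 GiB, PUD_SHIFT == 30)
	 */
	end   = ALIGN(addr, PMD_SIZE);	/* round up to the next 2 MiB boundary */
	start = addr & PMD_MASK;	/* round down to the 2 MiB base        */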
Signed-off-by: Pasha Tatashin Signed-off-by: Borislav Petkov Reviewed-by: Anshuman Khandual Acked-by: Mike Rapoport Link: https://lore.kernel.org/r/20220516185202.604654-1-tatashin@google.com --- arch/x86/mm/mem_encrypt_boot.S | 4 ++-- arch/x86/mm/mem_encrypt_identity.c | 18 +++++++++--------- arch/x86/mm/pat/set_memory.c | 6 +++--- arch/x86/mm/pti.c | 2 +- 4 files changed, 15 insertions(+), 15 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S index 9de3d900bc92..e25288ee33c2 100644 --- a/arch/x86/mm/mem_encrypt_boot.S +++ b/arch/x86/mm/mem_encrypt_boot.S @@ -26,7 +26,7 @@ SYM_FUNC_START(sme_encrypt_execute) * RCX - virtual address of the encryption workarea, including: * - stack page (PAGE_SIZE) * - encryption routine page (PAGE_SIZE) - * - intermediate copy buffer (PMD_PAGE_SIZE) + * - intermediate copy buffer (PMD_SIZE) * R8 - physical address of the pagetables to use for encryption */ @@ -123,7 +123,7 @@ SYM_FUNC_START(__enc_copy) wbinvd /* Invalidate any cache entries */ /* Copy/encrypt up to 2MB at a time */ - movq $PMD_PAGE_SIZE, %r12 + movq $PMD_SIZE, %r12 1: cmpq %r12, %r9 jnb 2f diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c index f415498d3175..88cccd65029d 100644 --- a/arch/x86/mm/mem_encrypt_identity.c +++ b/arch/x86/mm/mem_encrypt_identity.c @@ -93,7 +93,7 @@ struct sme_populate_pgd_data { * section is 2MB aligned to allow for simple pagetable setup using only * PMD entries (see vmlinux.lds.S). */ -static char sme_workarea[2 * PMD_PAGE_SIZE] __section(".init.scratch"); +static char sme_workarea[2 * PMD_SIZE] __section(".init.scratch"); static char sme_cmdline_arg[] __initdata = "mem_encrypt"; static char sme_cmdline_on[] __initdata = "on"; @@ -198,8 +198,8 @@ static void __init __sme_map_range_pmd(struct sme_populate_pgd_data *ppd) while (ppd->vaddr < ppd->vaddr_end) { sme_populate_pgd_large(ppd); - ppd->vaddr += PMD_PAGE_SIZE; - ppd->paddr += PMD_PAGE_SIZE; + ppd->vaddr += PMD_SIZE; + ppd->paddr += PMD_SIZE; } } @@ -225,11 +225,11 @@ static void __init __sme_map_range(struct sme_populate_pgd_data *ppd, vaddr_end = ppd->vaddr_end; /* If start is not 2MB aligned, create PTE entries */ - ppd->vaddr_end = ALIGN(ppd->vaddr, PMD_PAGE_SIZE); + ppd->vaddr_end = ALIGN(ppd->vaddr, PMD_SIZE); __sme_map_range_pte(ppd); /* Create PMD entries */ - ppd->vaddr_end = vaddr_end & PMD_PAGE_MASK; + ppd->vaddr_end = vaddr_end & PMD_MASK; __sme_map_range_pmd(ppd); /* If end is not 2MB aligned, create PTE entries */ @@ -325,7 +325,7 @@ void __init sme_encrypt_kernel(struct boot_params *bp) /* Physical addresses gives us the identity mapped virtual addresses */ kernel_start = __pa_symbol(_text); - kernel_end = ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE); + kernel_end = ALIGN(__pa_symbol(_end), PMD_SIZE); kernel_len = kernel_end - kernel_start; initrd_start = 0; @@ -355,12 +355,12 @@ void __init sme_encrypt_kernel(struct boot_params *bp) * executable encryption area size: * stack page (PAGE_SIZE) * encryption routine page (PAGE_SIZE) - * intermediate copy buffer (PMD_PAGE_SIZE) + * intermediate copy buffer (PMD_SIZE) * pagetable structures for the encryption of the kernel * pagetable structures for workarea (in case not currently mapped) */ execute_start = workarea_start; - execute_end = execute_start + (PAGE_SIZE * 2) + PMD_PAGE_SIZE; + execute_end = execute_start + (PAGE_SIZE * 2) + PMD_SIZE; execute_len = execute_end - execute_start; /* @@ -383,7 +383,7 @@ void __init sme_encrypt_kernel(struct 
boot_params *bp) * before it is mapped. */ workarea_len = execute_len + pgtable_area_len; - workarea_end = ALIGN(workarea_start + workarea_len, PMD_PAGE_SIZE); + workarea_end = ALIGN(workarea_start + workarea_len, PMD_SIZE); /* * Set the address to the start of where newly created pagetable diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index f275605892df..06eb8910462f 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -743,11 +743,11 @@ phys_addr_t slow_virt_to_phys(void *__virt_addr) switch (level) { case PG_LEVEL_1G: phys_addr = (phys_addr_t)pud_pfn(*(pud_t *)pte) << PAGE_SHIFT; - offset = virt_addr & ~PUD_PAGE_MASK; + offset = virt_addr & ~PUD_MASK; break; case PG_LEVEL_2M: phys_addr = (phys_addr_t)pmd_pfn(*(pmd_t *)pte) << PAGE_SHIFT; - offset = virt_addr & ~PMD_PAGE_MASK; + offset = virt_addr & ~PMD_MASK; break; default: phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT; @@ -1037,7 +1037,7 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, case PG_LEVEL_1G: ref_prot = pud_pgprot(*(pud_t *)kpte); ref_pfn = pud_pfn(*(pud_t *)kpte); - pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT; + pfninc = PMD_SIZE >> PAGE_SHIFT; lpaddr = address & PUD_MASK; lpinc = PMD_SIZE; /* diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index ffe3b3a087fe..78414c6d1b5e 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -592,7 +592,7 @@ static void pti_set_kernel_image_nonglobal(void) * of the image. */ unsigned long start = PFN_ALIGN(_text); - unsigned long end = ALIGN((unsigned long)_end, PMD_PAGE_SIZE); + unsigned long end = ALIGN((unsigned long)_end, PMD_SIZE); /* * This clears _PAGE_GLOBAL from the entire kernel image. -- cgit From 5ceeee7571b7628f439ae0444ec41d132558f47e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 10 Nov 2022 13:33:50 +0100 Subject: x86/mm: Add a few comments It's a shame to hide useful comments in Changelogs; add some to the code. Shamelessly stolen from commit: c40a56a7818c ("x86/mm/init: Remove freed kernel image areas from alias mapping") Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20221110125544.460677011%40infradead.org --- arch/x86/mm/pat/set_memory.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 06eb8910462f..50f81ea1fbad 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -219,6 +219,23 @@ within_inclusive(unsigned long addr, unsigned long start, unsigned long end) #ifdef CONFIG_X86_64 +/* + * The kernel image is mapped into two places in the virtual address space + * (addresses without KASLR, of course): + * + * 1. The kernel direct map (0xffff880000000000) + * 2. The "high kernel map" (0xffffffff81000000) + * + * We actually execute out of #2. If we get the address of a kernel symbol, it + * points to #2, but almost all physical-to-virtual translations point to #1. + * + * This is so that we can have both a directmap of all physical memory *and* + * take full advantage of the limited (s32) immediate addressing range (2G) + * of x86_64. + * + * See Documentation/x86/x86_64/mm.rst for more detail. + */ + static inline unsigned long highmap_start_pfn(void) { return __pa_symbol(_text) >> PAGE_SHIFT; @@ -1626,6 +1643,9 @@ repeat: static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias); +/* + * Check the directmap and "high kernel map" 'aliases'.
+ */ static int cpa_process_alias(struct cpa_data *cpa) { struct cpa_data alias_cpa; -- cgit From ef9ab81af6e1f7b7ff589aa1504434aa5915c1df Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 10 Nov 2022 13:33:54 +0100 Subject: x86/mm: Untangle __change_page_attr_set_clr(.checkalias) The .checkalias argument to __change_page_attr_set_clr() is overloaded and serves two different purposes: - it inhibits the call to cpa_process_alias() -- as suggested by the name; however, - it also serves as 'primary' indicator for __change_page_attr() ( which in turn also serves as a recursion terminator for cpa_process_alias() ). Untangle these by extending the use of CPA_NO_CHECK_ALIAS to all callsites that currently use .checkalias=0 for this purpose. Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20221110125544.527267183%40infradead.org --- arch/x86/mm/pat/set_memory.c | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 50f81ea1fbad..4943f6c5d8d2 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -1727,7 +1727,7 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) if (ret) goto out; - if (checkalias) { + if (checkalias && !(cpa->flags & CPA_NO_CHECK_ALIAS)) { ret = cpa_process_alias(cpa); if (ret) goto out; @@ -1801,18 +1801,12 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, cpa.numpages = numpages; cpa.mask_set = mask_set; cpa.mask_clr = mask_clr; - cpa.flags = 0; + cpa.flags = in_flag; cpa.curpage = 0; cpa.force_split = force_split; - if (in_flag & (CPA_ARRAY | CPA_PAGES_ARRAY)) - cpa.flags |= in_flag; - /* No alias checking for _NX bit modifications */ checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX; - /* Has caller explicitly disabled alias checking? */ - if (in_flag & CPA_NO_CHECK_ALIAS) - checkalias = 0; ret = __change_page_attr_set_clr(&cpa, checkalias); @@ -2067,11 +2061,9 @@ int set_memory_np(unsigned long addr, int numpages) int set_memory_np_noalias(unsigned long addr, int numpages) { - int cpa_flags = CPA_NO_CHECK_ALIAS; - return change_page_attr_set_clr(&addr, numpages, __pgprot(0), __pgprot(_PAGE_PRESENT), 0, - cpa_flags, NULL); + CPA_NO_CHECK_ALIAS, NULL); } int set_memory_4k(unsigned long addr, int numpages) @@ -2288,7 +2280,7 @@ static int __set_pages_p(struct page *page, int numpages) .numpages = numpages, .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW), .mask_clr = __pgprot(0), - .flags = 0}; + .flags = CPA_NO_CHECK_ALIAS }; /* * No alias checking needed for setting present flag. otherwise, @@ -2296,7 +2288,7 @@ static int __set_pages_p(struct page *page, int numpages) * mappings (this adds to complexity if we want to do this from * atomic context especially). Let's keep it simple! */ - return __change_page_attr_set_clr(&cpa, 0); + return __change_page_attr_set_clr(&cpa, 1); } static int __set_pages_np(struct page *page, int numpages) @@ -2307,7 +2299,7 @@ static int __set_pages_np(struct page *page, int numpages) .numpages = numpages, .mask_set = __pgprot(0), .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW), - .flags = 0}; + .flags = CPA_NO_CHECK_ALIAS }; /* * No alias checking needed for setting not present flag. otherwise, @@ -2315,7 +2307,7 @@ static int __set_pages_np(struct page *page, int numpages) * mappings (this adds to complexity if we want to do this from * atomic context especially). Let's keep it simple! 
*/ - return __change_page_attr_set_clr(&cpa, 0); + return __change_page_attr_set_clr(&cpa, 1); } int set_direct_map_invalid_noflush(struct page *page) @@ -2386,7 +2378,7 @@ int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, .numpages = numpages, .mask_set = __pgprot(0), .mask_clr = __pgprot(~page_flags & (_PAGE_NX|_PAGE_RW)), - .flags = 0, + .flags = CPA_NO_CHECK_ALIAS, }; WARN_ONCE(num_online_cpus() > 1, "Don't call after initializing SMP"); @@ -2399,7 +2391,7 @@ int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags); - retval = __change_page_attr_set_clr(&cpa, 0); + retval = __change_page_attr_set_clr(&cpa, 1); __flush_tlb_all(); out: @@ -2429,12 +2421,12 @@ int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address, .numpages = numpages, .mask_set = __pgprot(0), .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW), - .flags = 0, + .flags = CPA_NO_CHECK_ALIAS, }; WARN_ONCE(num_online_cpus() > 1, "Don't call after initializing SMP"); - retval = __change_page_attr_set_clr(&cpa, 0); + retval = __change_page_attr_set_clr(&cpa, 1); __flush_tlb_all(); return retval; -- cgit From d597416683d587e940faa35945fba162329b5a71 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 10 Nov 2022 13:33:57 +0100 Subject: x86/mm: Inhibit _PAGE_NX changes from cpa_process_alias() There is a kludge in change_page_attr_set_clr() that inhibits propagating NX changes to the aliases (directmap and highmap) -- this kludge is twofold: - it also inhibits the primary checks in __change_page_attr(); - it hard depends on single-bit changes. The introduction of set_memory_rox(), which clears both _PAGE_RW and _PAGE_NX, triggered this last issue. Explicitly ignore _PAGE_NX in cpa_process_alias() instead. Fixes: b38994948567 ("x86/mm: Implement native set_memory_rox()") Reported-by: kernel test robot Debugged-by: Dave Hansen Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20221110125544.594991716%40infradead.org --- arch/x86/mm/pat/set_memory.c | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 4943f6c5d8d2..beef77417115 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -1669,6 +1669,12 @@ static int cpa_process_alias(struct cpa_data *cpa) alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); alias_cpa.curpage = 0; + /* Directmap always has NX set, do not modify. */ + if (__supported_pte_mask & _PAGE_NX) { + alias_cpa.mask_clr.pgprot &= ~_PAGE_NX; + alias_cpa.mask_set.pgprot &= ~_PAGE_NX; + } + cpa->force_flush_all = 1; ret = __change_page_attr_set_clr(&alias_cpa, 0); @@ -1691,6 +1697,15 @@ static int cpa_process_alias(struct cpa_data *cpa) alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); alias_cpa.curpage = 0; + /* + * [_text, _brk_end) also covers data, do not modify NX except + * in cases where the highmap is the primary target. + */ + if (__supported_pte_mask & _PAGE_NX) { + alias_cpa.mask_clr.pgprot &= ~_PAGE_NX; + alias_cpa.mask_set.pgprot &= ~_PAGE_NX; + } + cpa->force_flush_all = 1; /* * The high mapping range is imprecise, so ignore the * return value. @@ -1709,6 +1724,12 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) { unsigned long numpages = cpa->numpages; unsigned long rempages = numpages; int ret = 0; + /* + * No changes, easy!
+ */ + if (!(pgprot_val(cpa->mask_set) | pgprot_val(cpa->mask_clr))) + return ret; + while (rempages) { /* * Store the remaining nr of pages for the large page * preservation check. @@ -1755,7 +1776,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, struct page **pages) { struct cpa_data cpa; - int ret, cache, checkalias; + int ret, cache; memset(&cpa, 0, sizeof(cpa)); @@ -1805,10 +1826,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, cpa.curpage = 0; cpa.force_split = force_split; - /* No alias checking for _NX bit modifications */ - checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX; - - ret = __change_page_attr_set_clr(&cpa, checkalias); + ret = __change_page_attr_set_clr(&cpa, 1); /* * Check whether we really changed something: -- cgit From e996365ee7475805d2a01312532855004e89df84 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 10 Nov 2022 13:34:00 +0100 Subject: x86/mm: Rename __change_page_attr_set_clr(.checkalias) Now that the checkalias functionality is taken over by CPA_NO_CHECK_ALIAS, rename the argument to better match its remaining purpose: primary, matching __change_page_attr(). Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20221110125544.661001508%40infradead.org --- arch/x86/mm/pat/set_memory.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index beef77417115..220361ceb997 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -1641,7 +1641,7 @@ repeat: return err; } -static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias); +static int __change_page_attr_set_clr(struct cpa_data *cpa, int primary); /* * Check the directmap and "high kernel map" 'aliases'. @@ -1718,7 +1718,7 @@ static int cpa_process_alias(struct cpa_data *cpa) return 0; } -static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) +static int __change_page_attr_set_clr(struct cpa_data *cpa, int primary) { unsigned long numpages = cpa->numpages; unsigned long rempages = numpages; @@ -1742,13 +1742,13 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) if (!debug_pagealloc_enabled()) spin_lock(&cpa_lock); - ret = __change_page_attr(cpa, checkalias); + ret = __change_page_attr(cpa, primary); if (!debug_pagealloc_enabled()) spin_unlock(&cpa_lock); if (ret) goto out; - if (checkalias && !(cpa->flags & CPA_NO_CHECK_ALIAS)) { + if (primary && !(cpa->flags & CPA_NO_CHECK_ALIAS)) { ret = cpa_process_alias(cpa); if (ret) goto out; -- cgit From 80d72a8f76e8f3f0b5a70b8c7022578e17bde8e7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Nov 2022 20:35:00 +0000 Subject: x86/mm: Recompute physical address for every page of per-CPU CEA mapping Recompute the physical address for each per-CPU page in the CPU entry area; a recent commit inadvertently modified cea_map_percpu_pages() such that every PTE is mapped to the physical address of the first page.
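For clarity, the broken loop (introduced by the on-demand shadow patch at the top of this series) hoisted the physical-address lookup out of the loop even though ptr advances on every iteration:

	phys_addr_t pa = per_cpu_ptr_to_phys(ptr);	/* computed once...  */

	for ( ; pages; pages--, cea_vaddr += PAGE_SIZE, ptr += PAGE_SIZE)
		cea_set_pte(cea_vaddr, pa, prot);	/* ...so every PTE
							 * gets the first
							 * page's pa */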
Fixes: 9fd429c28073 ("x86/kasan: Map shadow for percpu pages on demand") Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andrey Ryabinin Link: https://lkml.kernel.org/r/20221110203504.1985010-2-seanjc@google.com --- arch/x86/mm/cpu_entry_area.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c index dff9001e5e12..d831aae94b41 100644 --- a/arch/x86/mm/cpu_entry_area.c +++ b/arch/x86/mm/cpu_entry_area.c @@ -97,7 +97,7 @@ cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot) early_pfn_to_nid(PFN_DOWN(pa))); for ( ; pages; pages--, cea_vaddr+= PAGE_SIZE, ptr += PAGE_SIZE) - cea_set_pte(cea_vaddr, pa, prot); + cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot); } static void __init percpu_setup_debug_store(unsigned int cpu) -- cgit From 97650148a15e0b30099d6175ffe278b9f55ec66a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Nov 2022 20:35:01 +0000 Subject: x86/mm: Populate KASAN shadow for entire per-CPU range of CPU entry area Populate a KASAN shadow for the entire possible per-CPU range of the CPU entry area instead of requiring that each individual chunk map a shadow. Mapping shadows individually is error-prone, e.g. the per-CPU GDT mapping was left behind, which can lead to not-present page faults during KASAN validation if the kernel performs a software lookup into the GDT. The DS buffer is also likely affected. The motivation for mapping the per-CPU areas on-demand was to avoid mapping the entire 512GiB range that's reserved for the CPU entry area; shaving a few bytes by not creating shadows for potentially unused memory was not a goal. The bug is most easily reproduced by doing a sigreturn with a garbage CS in the sigcontext, e.g. int main(void) { struct sigcontext regs; syscall(__NR_mmap, 0x1ffff000ul, 0x1000ul, 0ul, 0x32ul, -1, 0ul); syscall(__NR_mmap, 0x20000000ul, 0x1000000ul, 7ul, 0x32ul, -1, 0ul); syscall(__NR_mmap, 0x21000000ul, 0x1000ul, 0ul, 0x32ul, -1, 0ul); memset(&regs, 0, sizeof(regs)); regs.cs = 0x1d0; syscall(__NR_rt_sigreturn); return 0; } to coerce the kernel into doing a GDT lookup to compute CS.base when reading the instruction bytes on the subsequent #GP to determine whether or not the #GP is something the kernel should handle, e.g. to fixup UMIP violations or to emulate CLI/STI for IOPL=3 applications. BUG: unable to handle page fault for address: fffffbc8379ace00 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 16c03a067 P4D 16c03a067 PUD 15b990067 PMD 15b98f067 PTE 0 Oops: 0000 [#1] PREEMPT SMP KASAN CPU: 3 PID: 851 Comm: r2 Not tainted 6.1.0-rc3-next-20221103+ #432 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 RIP: 0010:kasan_check_range+0xdf/0x190 Call Trace: get_desc+0xb0/0x1d0 insn_get_seg_base+0x104/0x270 insn_fetch_from_user+0x66/0x80 fixup_umip_exception+0xb1/0x530 exc_general_protection+0x181/0x210 asm_exc_general_protection+0x22/0x30 RIP: 0003:0x0 Code: Unable to access opcode bytes at 0xffffffffffffffd6.
RSP: 0003:0000000000000000 EFLAGS: 00000202 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 00000000000001d0 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 Fixes: 9fd429c28073 ("x86/kasan: Map shadow for percpu pages on demand") Reported-by: syzbot+ffb4f000dc2872c93f62@syzkaller.appspotmail.com Suggested-by: Andrey Ryabinin Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andrey Ryabinin Link: https://lkml.kernel.org/r/20221110203504.1985010-3-seanjc@google.com --- arch/x86/mm/cpu_entry_area.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c index d831aae94b41..7c855dffcdc2 100644 --- a/arch/x86/mm/cpu_entry_area.c +++ b/arch/x86/mm/cpu_entry_area.c @@ -91,11 +91,6 @@ void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags) static void __init cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot) { - phys_addr_t pa = per_cpu_ptr_to_phys(ptr); - - kasan_populate_shadow_for_vaddr(cea_vaddr, pages * PAGE_SIZE, - early_pfn_to_nid(PFN_DOWN(pa))); - for ( ; pages; pages--, cea_vaddr+= PAGE_SIZE, ptr += PAGE_SIZE) cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot); } @@ -195,6 +190,9 @@ static void __init setup_cpu_entry_area(unsigned int cpu) pgprot_t tss_prot = PAGE_KERNEL; #endif + kasan_populate_shadow_for_vaddr(cea, CPU_ENTRY_AREA_SIZE, + early_cpu_to_node(cpu)); + cea_set_pte(&cea->gdt, get_cpu_gdt_paddr(cpu), gdt_prot); cea_map_percpu_pages(&cea->entry_stack_page, -- cgit From 7077d2ccb94dafd00b29cc2d601c9f6891648f5b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Nov 2022 20:35:02 +0000 Subject: x86/kasan: Rename local CPU_ENTRY_AREA variables to shorten names Rename the CPU entry area variables in kasan_init() to shorten their names; a future fix will reference the beginning of the per-CPU portion of the CPU entry area, and shadow_cpu_entry_per_cpu_begin is a bit much. No functional change intended.
Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andrey Ryabinin Link: https://lkml.kernel.org/r/20221110203504.1985010-4-seanjc@google.com --- arch/x86/mm/kasan_init_64.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index d1416926ad52..ad7872ae10ed 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -331,7 +331,7 @@ void __init kasan_populate_shadow_for_vaddr(void *va, size_t size, int nid) void __init kasan_init(void) { int i; - void *shadow_cpu_entry_begin, *shadow_cpu_entry_end; + void *shadow_cea_begin, *shadow_cea_end; memcpy(early_top_pgt, init_top_pgt, sizeof(early_top_pgt)); @@ -372,16 +372,16 @@ void __init kasan_init(void) map_range(&pfn_mapped[i]); } - shadow_cpu_entry_begin = (void *)CPU_ENTRY_AREA_BASE; - shadow_cpu_entry_begin = kasan_mem_to_shadow(shadow_cpu_entry_begin); - shadow_cpu_entry_begin = (void *)round_down( - (unsigned long)shadow_cpu_entry_begin, PAGE_SIZE); + shadow_cea_begin = (void *)CPU_ENTRY_AREA_BASE; + shadow_cea_begin = kasan_mem_to_shadow(shadow_cea_begin); + shadow_cea_begin = (void *)round_down( + (unsigned long)shadow_cea_begin, PAGE_SIZE); - shadow_cpu_entry_end = (void *)(CPU_ENTRY_AREA_BASE + + shadow_cea_end = (void *)(CPU_ENTRY_AREA_BASE + CPU_ENTRY_AREA_MAP_SIZE); - shadow_cpu_entry_end = kasan_mem_to_shadow(shadow_cpu_entry_end); - shadow_cpu_entry_end = (void *)round_up( - (unsigned long)shadow_cpu_entry_end, PAGE_SIZE); + shadow_cea_end = kasan_mem_to_shadow(shadow_cea_end); + shadow_cea_end = (void *)round_up( + (unsigned long)shadow_cea_end, PAGE_SIZE); kasan_populate_early_shadow( kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM), @@ -403,9 +403,9 @@ void __init kasan_init(void) kasan_populate_early_shadow( kasan_mem_to_shadow((void *)VMALLOC_END + 1), - shadow_cpu_entry_begin); + shadow_cea_begin); - kasan_populate_early_shadow(shadow_cpu_entry_end, + kasan_populate_early_shadow(shadow_cea_end, kasan_mem_to_shadow((void *)__START_KERNEL_map)); kasan_populate_shadow((unsigned long)kasan_mem_to_shadow(_stext), -- cgit From bde258d97409f2a45243cb393a55ea9ecfc7aba5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Nov 2022 20:35:03 +0000 Subject: x86/kasan: Add helpers to align shadow addresses up and down Add helpers to dedup code for aligning shadow addresses up/down to page boundaries when translating an address to its shadow. No functional change intended.
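For context, a sketch of the translation these helpers wrap: generic KASAN maps eight bytes of memory to one shadow byte, so kasan_mem_to_shadow() (include/linux/kasan.h) is essentially:

	static inline void *kasan_mem_to_shadow(const void *addr)
	{
		return (void *)((unsigned long)addr >> KASAN_SHADOW_SCALE_SHIFT)
			+ KASAN_SHADOW_OFFSET;	/* scale shift is 3 */
	}

Because the shadow advances one byte per eight bytes of memory, a page-aligned [shadow_start, shadow_end) range covers whole shadow pages, each spanning 8 * PAGE_SIZE of real memory.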
Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andrey Ryabinin Link: https://lkml.kernel.org/r/20221110203504.1985010-5-seanjc@google.com --- arch/x86/mm/kasan_init_64.c | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index ad7872ae10ed..afc5e129ca7b 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -316,22 +316,33 @@ void __init kasan_early_init(void) kasan_map_early_shadow(init_top_pgt); } +static unsigned long kasan_mem_to_shadow_align_down(unsigned long va) +{ + unsigned long shadow = (unsigned long)kasan_mem_to_shadow((void *)va); + + return round_down(shadow, PAGE_SIZE); +} + +static unsigned long kasan_mem_to_shadow_align_up(unsigned long va) +{ + unsigned long shadow = (unsigned long)kasan_mem_to_shadow((void *)va); + + return round_up(shadow, PAGE_SIZE); +} + void __init kasan_populate_shadow_for_vaddr(void *va, size_t size, int nid) { unsigned long shadow_start, shadow_end; - shadow_start = (unsigned long)kasan_mem_to_shadow(va); - shadow_start = round_down(shadow_start, PAGE_SIZE); - shadow_end = (unsigned long)kasan_mem_to_shadow(va + size); - shadow_end = round_up(shadow_end, PAGE_SIZE); - + shadow_start = kasan_mem_to_shadow_align_down((unsigned long)va); + shadow_end = kasan_mem_to_shadow_align_up((unsigned long)va + size); kasan_populate_shadow(shadow_start, shadow_end, nid); } void __init kasan_init(void) { + unsigned long shadow_cea_begin, shadow_cea_end; int i; - void *shadow_cea_begin, *shadow_cea_end; memcpy(early_top_pgt, init_top_pgt, sizeof(early_top_pgt)); @@ -372,16 +383,9 @@ void __init kasan_init(void) map_range(&pfn_mapped[i]); } - shadow_cea_begin = (void *)CPU_ENTRY_AREA_BASE; - shadow_cea_begin = kasan_mem_to_shadow(shadow_cea_begin); - shadow_cea_begin = (void *)round_down( - (unsigned long)shadow_cea_begin, PAGE_SIZE); - - shadow_cea_end = (void *)(CPU_ENTRY_AREA_BASE + - CPU_ENTRY_AREA_MAP_SIZE); - shadow_cea_end = kasan_mem_to_shadow(shadow_cea_end); - shadow_cea_end = (void *)round_up( - (unsigned long)shadow_cea_end, PAGE_SIZE); + shadow_cea_begin = kasan_mem_to_shadow_align_down(CPU_ENTRY_AREA_BASE); + shadow_cea_end = kasan_mem_to_shadow_align_up(CPU_ENTRY_AREA_BASE + + CPU_ENTRY_AREA_MAP_SIZE); kasan_populate_early_shadow( kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM), @@ -403,9 +407,9 @@ void __init kasan_init(void) kasan_populate_early_shadow( kasan_mem_to_shadow((void *)VMALLOC_END + 1), - shadow_cea_begin); + (void *)shadow_cea_begin); - kasan_populate_early_shadow(shadow_cea_end, + kasan_populate_early_shadow((void *)shadow_cea_end, kasan_mem_to_shadow((void *)__START_KERNEL_map)); kasan_populate_shadow((unsigned long)kasan_mem_to_shadow(_stext), -- cgit From 1cfaac2400c73378e78182a706be0f3ac8b93cd7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Nov 2022 20:35:04 +0000 Subject: x86/kasan: Populate shadow for shared chunk of the CPU entry area Populate the shadow for the shared portion of the CPU entry area, i.e. the read-only IDT mapping, during KASAN initialization. A recent change modified KASAN to map the per-CPU areas on-demand, but forgot to keep a shadow for the common area that is shared amongst all CPUs. Map the common area in KASAN init instead of letting idt_map_in_cea() do the dirty work so that it Just Works in the unlikely event more shared data is shoved into the CPU entry area.
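To make the shared/per-CPU split concrete, a sketch of the CPU entry area layout (as laid out in asm/cpu_entry_area.h, where the read-only IDT occupies the single shared page):

	/*
	 * CPU_ENTRY_AREA_BASE:    read-only IDT page, shared by all CPUs
	 * CPU_ENTRY_AREA_PER_CPU: CPU_ENTRY_AREA_BASE + PAGE_SIZE; the
	 *                         randomized per-CPU areas start here
	 */

The patch below populates shadow for [CPU_ENTRY_AREA_BASE, CPU_ENTRY_AREA_PER_CPU) at init time; the per-CPU slice keeps its on-demand shadow.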
The bug manifests as a not-present #PF when software attempts to look up an IDT entry, e.g. when KVM is handling IRQs on Intel CPUs (KVM performs direct CALL to the IRQ handler to avoid the overhead of INTn): BUG: unable to handle page fault for address: fffffbc0000001d8 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 16c03a067 P4D 16c03a067 PUD 0 Oops: 0000 [#1] PREEMPT SMP KASAN CPU: 5 PID: 901 Comm: repro Tainted: G W 6.1.0-rc3+ #410 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 RIP: 0010:kasan_check_range+0xdf/0x190 vmx_handle_exit_irqoff+0x152/0x290 [kvm_intel] vcpu_run+0x1d89/0x2bd0 [kvm] kvm_arch_vcpu_ioctl_run+0x3ce/0xa70 [kvm] kvm_vcpu_ioctl+0x349/0x900 [kvm] __x64_sys_ioctl+0xb8/0xf0 do_syscall_64+0x2b/0x50 entry_SYSCALL_64_after_hwframe+0x46/0xb0 Fixes: 9fd429c28073 ("x86/kasan: Map shadow for percpu pages on demand") Reported-by: syzbot+8cdd16fd5a6c0565e227@syzkaller.appspotmail.com Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20221110203504.1985010-6-seanjc@google.com --- arch/x86/mm/kasan_init_64.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index afc5e129ca7b..0302491d799d 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -341,7 +341,7 @@ void __init kasan_populate_shadow_for_vaddr(void *va, size_t size, int nid) void __init kasan_init(void) { - unsigned long shadow_cea_begin, shadow_cea_end; + unsigned long shadow_cea_begin, shadow_cea_per_cpu_begin, shadow_cea_end; int i; memcpy(early_top_pgt, init_top_pgt, sizeof(early_top_pgt)); @@ -384,6 +384,7 @@ void __init kasan_init(void) } shadow_cea_begin = kasan_mem_to_shadow_align_down(CPU_ENTRY_AREA_BASE); + shadow_cea_per_cpu_begin = kasan_mem_to_shadow_align_up(CPU_ENTRY_AREA_PER_CPU); shadow_cea_end = kasan_mem_to_shadow_align_up(CPU_ENTRY_AREA_BASE + CPU_ENTRY_AREA_MAP_SIZE); @@ -409,6 +410,15 @@ void __init kasan_init(void) kasan_mem_to_shadow((void *)VMALLOC_END + 1), (void *)shadow_cea_begin); + /* + * Populate the shadow for the shared portion of the CPU entry area. + * Shadows for the per-CPU areas are mapped on-demand, as each CPU's + * area is randomly placed somewhere in the 512GiB range and mapping + * the entire 512GiB range is prohibitively expensive. + */ + kasan_populate_shadow(shadow_cea_begin, + shadow_cea_per_cpu_begin, 0); + kasan_populate_early_shadow((void *)shadow_cea_end, kasan_mem_to_shadow((void *)__START_KERNEL_map)); -- cgit From 3e844d842d49cdbe61a4b338bdd512654179488a Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 18 Nov 2022 07:16:16 -0800 Subject: x86/mm: Ensure forced page table splitting There are a few kernel users like kfence that require 4k pages to work correctly and do not support large mappings. They use set_memory_4k() to break down those large mappings. That, in turn, relies on the cpa_data->force_split option to indicate to the set_memory code that it should split page tables regardless of whether they need to be. But a recent change added an optimization which would return early if a set_memory request came in that did not change permissions. It did not consult ->force_split and would mistakenly optimize away the splitting that set_memory_4k() needs. This broke kfence. Skip the same-permission optimization when ->force_split is set.
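To make the breakage concrete: set_memory_4k() passes empty set/clear masks and relies solely on ->force_split, so the empty-mask early return skipped it entirely. An illustrative kfence-style call:

	/* Split any 2M/1G mapping covering addr down to 4k PTEs; no
	 * permission bits change, only the page-table structure does. */
	set_memory_4k((unsigned long)addr, 1);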
Fixes: 127960a05548 ("x86/mm: Inhibit _PAGE_NX changes from cpa_process_alias()") Signed-off-by: Dave Hansen Tested-by: Marco Elver Cc: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/CA+G9fYuFxZTxkeS35VTZMXwQvohu73W3xbZ5NtjebsVvH6hCuA@mail.gmail.com/ --- arch/x86/mm/pat/set_memory.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 220361ceb997..0db69514fe29 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -1727,7 +1727,8 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int primary) /* * No changes, easy! */ - if (!(pgprot_val(cpa->mask_set) | pgprot_val(cpa->mask_clr))) + if (!(pgprot_val(cpa->mask_set) | pgprot_val(cpa->mask_clr)) && + !cpa->force_split) return ret; while (rempages) { -- cgit