diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2022-12-19 07:13:33 -0600 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2022-12-19 07:13:33 -0600 |
commit | 5f6e430f931d245da838db3e10e918681207029b (patch) | |
tree | 6adc54a582652ae470ce95d9205f798568339ff0 /arch/powerpc/lib/code-patching.c | |
parent | a6e3e6f138058ff184d8ef5064a033b3f5fee8f8 (diff) | |
parent | 980411a4d1bb925d28cd9e8d8301dc982ece788d (diff) |
Merge tag 'powerpc-6.2-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux
Pull powerpc updates from Michael Ellerman:
- Add powerpc qspinlock implementation optimised for large system
scalability and paravirt. See the merge message for more details
- Enable objtool to be built on powerpc to generate mcount locations
- Use a temporary mm for code patching with the Radix MMU, so the
writable mapping is restricted to the patching CPU
- Add an option to build the 64-bit big-endian kernel with the ELFv2
ABI
- Sanitise user registers on interrupt entry on 64-bit Book3S
- Many other small features and fixes
Thanks to Aboorva Devarajan, Angel Iglesias, Benjamin Gray, Bjorn
Helgaas, Bo Liu, Chen Lifu, Christoph Hellwig, Christophe JAILLET,
Christophe Leroy, Christopher M. Riedl, Colin Ian King, Deming Wang,
Disha Goel, Dmitry Torokhov, Finn Thain, Geert Uytterhoeven, Gustavo A.
R. Silva, Haowen Bai, Joel Stanley, Jordan Niethe, Julia Lawall, Kajol
Jain, Laurent Dufour, Li zeming, Miaoqian Lin, Michael Jeanson, Nathan
Lynch, Naveen N. Rao, Nayna Jain, Nicholas Miehlbradt, Nicholas Piggin,
Pali Rohár, Randy Dunlap, Rohan McLure, Russell Currey, Sathvika
Vasireddy, Shaomin Deng, Stephen Kitt, Stephen Rothwell, Thomas
Weißschuh, Tiezhu Yang, Uwe Kleine-König, Xie Shaowen, Xiu Jianfeng,
XueBing Chen, Yang Yingliang, Zhang Jiaming, ruanjinjie, Jessica Yu,
and Wolfram Sang.
* tag 'powerpc-6.2-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux: (181 commits)
powerpc/code-patching: Fix oops with DEBUG_VM enabled
powerpc/qspinlock: Fix 32-bit build
powerpc/prom: Fix 32-bit build
powerpc/rtas: mandate RTAS syscall filtering
powerpc/rtas: define pr_fmt and convert printk call sites
powerpc/rtas: clean up includes
powerpc/rtas: clean up rtas_error_log_max initialization
powerpc/pseries/eeh: use correct API for error log size
powerpc/rtas: avoid scheduling in rtas_os_term()
powerpc/rtas: avoid device tree lookups in rtas_os_term()
powerpc/rtasd: use correct OF API for event scan rate
powerpc/rtas: document rtas_call()
powerpc/pseries: unregister VPA when hot unplugging a CPU
powerpc/pseries: reset the RCU watchdogs after a LPM
powerpc: Take in account addition CPU node when building kexec FDT
powerpc: export the CPU node count
powerpc/cpuidle: Set CPUIDLE_FLAG_POLLING for snooze state
powerpc/dts/fsl: Fix pca954x i2c-mux node names
cxl: Remove unnecessary cxl_pci_window_alignment()
selftests/powerpc: Fix resource leaks
...
Diffstat (limited to 'arch/powerpc/lib/code-patching.c')
-rw-r--r-- | arch/powerpc/lib/code-patching.c | 238 |
1 files changed, 202 insertions, 36 deletions
diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index ad0cf3108dd0..b00112d7ad46 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -4,12 +4,17 @@ */ #include <linux/kprobes.h> +#include <linux/mmu_context.h> +#include <linux/random.h> #include <linux/vmalloc.h> #include <linux/init.h> #include <linux/cpuhotplug.h> #include <linux/uaccess.h> #include <linux/jump_label.h> +#include <asm/debug.h> +#include <asm/pgalloc.h> +#include <asm/tlb.h> #include <asm/tlbflush.h> #include <asm/page.h> #include <asm/code-patching.h> @@ -41,12 +46,59 @@ int raw_patch_instruction(u32 *addr, ppc_inst_t instr) return __patch_instruction(addr, instr, addr); } -#ifdef CONFIG_STRICT_KERNEL_RWX -static DEFINE_PER_CPU(struct vm_struct *, text_poke_area); +struct patch_context { + union { + struct vm_struct *area; + struct mm_struct *mm; + }; + unsigned long addr; + pte_t *pte; +}; + +static DEFINE_PER_CPU(struct patch_context, cpu_patching_context); static int map_patch_area(void *addr, unsigned long text_poke_addr); static void unmap_patch_area(unsigned long addr); +static bool mm_patch_enabled(void) +{ + return IS_ENABLED(CONFIG_SMP) && radix_enabled(); +} + +/* + * The following applies for Radix MMU. Hash MMU has different requirements, + * and so is not supported. + * + * Changing mm requires context synchronising instructions on both sides of + * the context switch, as well as a hwsync between the last instruction for + * which the address of an associated storage access was translated using + * the current context. + * + * switch_mm_irqs_off() performs an isync after the context switch. It is + * the responsibility of the caller to perform the CSI and hwsync before + * starting/stopping the temp mm. + */ +static struct mm_struct *start_using_temp_mm(struct mm_struct *temp_mm) +{ + struct mm_struct *orig_mm = current->active_mm; + + lockdep_assert_irqs_disabled(); + switch_mm_irqs_off(orig_mm, temp_mm, current); + + WARN_ON(!mm_is_thread_local(temp_mm)); + + suspend_breakpoints(); + return orig_mm; +} + +static void stop_using_temp_mm(struct mm_struct *temp_mm, + struct mm_struct *orig_mm) +{ + lockdep_assert_irqs_disabled(); + switch_mm_irqs_off(temp_mm, orig_mm, current); + restore_breakpoints(); +} + static int text_area_cpu_up(unsigned int cpu) { struct vm_struct *area; @@ -68,29 +120,108 @@ static int text_area_cpu_up(unsigned int cpu) unmap_patch_area(addr); - this_cpu_write(text_poke_area, area); + this_cpu_write(cpu_patching_context.area, area); + this_cpu_write(cpu_patching_context.addr, addr); + this_cpu_write(cpu_patching_context.pte, virt_to_kpte(addr)); return 0; } static int text_area_cpu_down(unsigned int cpu) { - free_vm_area(this_cpu_read(text_poke_area)); + free_vm_area(this_cpu_read(cpu_patching_context.area)); + this_cpu_write(cpu_patching_context.area, NULL); + this_cpu_write(cpu_patching_context.addr, 0); + this_cpu_write(cpu_patching_context.pte, NULL); + return 0; +} + +static void put_patching_mm(struct mm_struct *mm, unsigned long patching_addr) +{ + struct mmu_gather tlb; + + tlb_gather_mmu(&tlb, mm); + free_pgd_range(&tlb, patching_addr, patching_addr + PAGE_SIZE, 0, 0); + mmput(mm); +} + +static int text_area_cpu_up_mm(unsigned int cpu) +{ + struct mm_struct *mm; + unsigned long addr; + pte_t *pte; + spinlock_t *ptl; + + mm = mm_alloc(); + if (WARN_ON(!mm)) + goto fail_no_mm; + + /* + * Choose a random page-aligned address from the interval + * [PAGE_SIZE .. DEFAULT_MAP_WINDOW - PAGE_SIZE]. + * The lower address bound is PAGE_SIZE to avoid the zero-page. + */ + addr = (1 + (get_random_long() % (DEFAULT_MAP_WINDOW / PAGE_SIZE - 2))) << PAGE_SHIFT; + + /* + * PTE allocation uses GFP_KERNEL which means we need to + * pre-allocate the PTE here because we cannot do the + * allocation during patching when IRQs are disabled. + * + * Using get_locked_pte() to avoid open coding, the lock + * is unnecessary. + */ + pte = get_locked_pte(mm, addr, &ptl); + if (!pte) + goto fail_no_pte; + pte_unmap_unlock(pte, ptl); + + this_cpu_write(cpu_patching_context.mm, mm); + this_cpu_write(cpu_patching_context.addr, addr); + + return 0; + +fail_no_pte: + put_patching_mm(mm, addr); +fail_no_mm: + return -ENOMEM; +} + +static int text_area_cpu_down_mm(unsigned int cpu) +{ + put_patching_mm(this_cpu_read(cpu_patching_context.mm), + this_cpu_read(cpu_patching_context.addr)); + + this_cpu_write(cpu_patching_context.mm, NULL); + this_cpu_write(cpu_patching_context.addr, 0); + return 0; } static __ro_after_init DEFINE_STATIC_KEY_FALSE(poking_init_done); -/* - * Although BUG_ON() is rude, in this case it should only happen if ENOMEM, and - * we judge it as being preferable to a kernel that will crash later when - * someone tries to use patch_instruction(). - */ void __init poking_init(void) { - BUG_ON(!cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, - "powerpc/text_poke:online", text_area_cpu_up, - text_area_cpu_down)); + int ret; + + if (!IS_ENABLED(CONFIG_STRICT_KERNEL_RWX)) + return; + + if (mm_patch_enabled()) + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, + "powerpc/text_poke_mm:online", + text_area_cpu_up_mm, + text_area_cpu_down_mm); + else + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, + "powerpc/text_poke:online", + text_area_cpu_up, + text_area_cpu_down); + + /* cpuhp_setup_state returns >= 0 on success */ + if (WARN_ON(ret < 0)) + return; + static_branch_enable(&poking_init_done); } @@ -147,6 +278,56 @@ static void unmap_patch_area(unsigned long addr) flush_tlb_kernel_range(addr, addr + PAGE_SIZE); } +static int __do_patch_instruction_mm(u32 *addr, ppc_inst_t instr) +{ + int err; + u32 *patch_addr; + unsigned long text_poke_addr; + pte_t *pte; + unsigned long pfn = get_patch_pfn(addr); + struct mm_struct *patching_mm; + struct mm_struct *orig_mm; + spinlock_t *ptl; + + patching_mm = __this_cpu_read(cpu_patching_context.mm); + text_poke_addr = __this_cpu_read(cpu_patching_context.addr); + patch_addr = (u32 *)(text_poke_addr + offset_in_page(addr)); + + pte = get_locked_pte(patching_mm, text_poke_addr, &ptl); + if (!pte) + return -ENOMEM; + + __set_pte_at(patching_mm, text_poke_addr, pte, pfn_pte(pfn, PAGE_KERNEL), 0); + + /* order PTE update before use, also serves as the hwsync */ + asm volatile("ptesync": : :"memory"); + + /* order context switch after arbitrary prior code */ + isync(); + + orig_mm = start_using_temp_mm(patching_mm); + + err = __patch_instruction(addr, instr, patch_addr); + + /* hwsync performed by __patch_instruction (sync) if successful */ + if (err) + mb(); /* sync */ + + /* context synchronisation performed by __patch_instruction (isync or exception) */ + stop_using_temp_mm(patching_mm, orig_mm); + + pte_clear(patching_mm, text_poke_addr, pte); + /* + * ptesync to order PTE update before TLB invalidation done + * by radix__local_flush_tlb_page_psize (in _tlbiel_va) + */ + local_flush_tlb_page_psize(patching_mm, text_poke_addr, mmu_virtual_psize); + + pte_unmap_unlock(pte, ptl); + + return err; +} + static int __do_patch_instruction(u32 *addr, ppc_inst_t instr) { int err; @@ -155,10 +336,10 @@ static int __do_patch_instruction(u32 *addr, ppc_inst_t instr) pte_t *pte; unsigned long pfn = get_patch_pfn(addr); - text_poke_addr = (unsigned long)__this_cpu_read(text_poke_area)->addr & PAGE_MASK; + text_poke_addr = (unsigned long)__this_cpu_read(cpu_patching_context.addr) & PAGE_MASK; patch_addr = (u32 *)(text_poke_addr + offset_in_page(addr)); - pte = virt_to_kpte(text_poke_addr); + pte = __this_cpu_read(cpu_patching_context.pte); __set_pte_at(&init_mm, text_poke_addr, pte, pfn_pte(pfn, PAGE_KERNEL), 0); /* See ptesync comment in radix__set_pte_at() */ if (radix_enabled()) @@ -172,7 +353,7 @@ static int __do_patch_instruction(u32 *addr, ppc_inst_t instr) return err; } -static int do_patch_instruction(u32 *addr, ppc_inst_t instr) +int patch_instruction(u32 *addr, ppc_inst_t instr) { int err; unsigned long flags; @@ -182,34 +363,19 @@ static int do_patch_instruction(u32 *addr, ppc_inst_t instr) * when text_poke_area is not ready, but we still need * to allow patching. We just do the plain old patching */ - if (!static_branch_likely(&poking_init_done)) + if (!IS_ENABLED(CONFIG_STRICT_KERNEL_RWX) || + !static_branch_likely(&poking_init_done)) return raw_patch_instruction(addr, instr); local_irq_save(flags); - err = __do_patch_instruction(addr, instr); + if (mm_patch_enabled()) + err = __do_patch_instruction_mm(addr, instr); + else + err = __do_patch_instruction(addr, instr); local_irq_restore(flags); return err; } -#else /* !CONFIG_STRICT_KERNEL_RWX */ - -static int do_patch_instruction(u32 *addr, ppc_inst_t instr) -{ - return raw_patch_instruction(addr, instr); -} - -#endif /* CONFIG_STRICT_KERNEL_RWX */ - -__ro_after_init DEFINE_STATIC_KEY_FALSE(init_mem_is_free); - -int patch_instruction(u32 *addr, ppc_inst_t instr) -{ - /* Make sure we aren't patching a freed init section */ - if (static_branch_likely(&init_mem_is_free) && init_section_contains(addr, 4)) - return 0; - - return do_patch_instruction(addr, instr); -} NOKPROBE_SYMBOL(patch_instruction); int patch_branch(u32 *addr, unsigned long target, int flags) |