diff options
Diffstat (limited to 'arch/powerpc/kernel')
211 files changed, 25260 insertions, 23279 deletions
diff --git a/arch/powerpc/kernel/.gitignore b/arch/powerpc/kernel/.gitignore index 67ebd3003c05..d71179d3ffe9 100644 --- a/arch/powerpc/kernel/.gitignore +++ b/arch/powerpc/kernel/.gitignore @@ -1,2 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only prom_init_check vmlinux.lds diff --git a/arch/powerpc/kernel/fsl_booke_entry_mapping.S b/arch/powerpc/kernel/85xx_entry_mapping.S index 8bccce6544b5..dedc17fac8f8 100644 --- a/arch/powerpc/kernel/fsl_booke_entry_mapping.S +++ b/arch/powerpc/kernel/85xx_entry_mapping.S @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* 1. Find the index of the entry we're executing in */ - bl invstr /* Find our address */ + bcl 20,31,$+4 /* Find our address */ invstr: mflr r6 /* Make it accessible */ mfmsr r7 rlwinm r4,r7,27,31,31 /* extract MSR[IS] */ @@ -85,7 +85,7 @@ skpinv: addi r6,r6,1 /* Increment */ addi r6,r6,10 slw r6,r8,r6 /* convert to mask */ - bl 1f /* Find our address */ + bcl 20,31,$+4 /* Find our address */ 1: mflr r7 mfspr r8,SPRN_MAS3 @@ -117,7 +117,7 @@ skpinv: addi r6,r6,1 /* Increment */ xori r6,r4,1 slwi r6,r6,5 /* setup new context with other address space */ - bl 1f /* Find our address */ + bcl 20,31,$+4 /* Find our address */ 1: mflr r9 rlwimi r7,r9,0,20,31 addi r7,r7,(2f - 1b) @@ -207,7 +207,7 @@ next_tlb_setup: lis r7,MSR_KERNEL@h ori r7,r7,MSR_KERNEL@l - bl 1f /* Find our address */ + bcl 20,31,$+4 /* Find our address */ 1: mflr r9 rlwimi r6,r9,0,20,31 addi r6,r6,(2f - 1b) diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 157b0147921f..2919433be355 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -3,8 +3,6 @@ # Makefile for the linux kernel. # -CFLAGS_ptrace.o += -DUTS_MACHINE='"$(UTS_MACHINE)"' - ifdef CONFIG_PPC64 CFLAGS_prom_init.o += $(NO_MINIMAL_TOC) endif @@ -13,14 +11,16 @@ CFLAGS_prom_init.o += -fPIC CFLAGS_btext.o += -fPIC endif +CFLAGS_early_32.o += $(DISABLE_LATENT_ENTROPY_PLUGIN) CFLAGS_cputable.o += $(DISABLE_LATENT_ENTROPY_PLUGIN) CFLAGS_prom_init.o += $(DISABLE_LATENT_ENTROPY_PLUGIN) CFLAGS_btext.o += $(DISABLE_LATENT_ENTROPY_PLUGIN) CFLAGS_prom.o += $(DISABLE_LATENT_ENTROPY_PLUGIN) -CFLAGS_prom_init.o += $(call cc-option, -fno-stack-protector) +CFLAGS_prom_init.o += -fno-stack-protector CFLAGS_prom_init.o += -DDISABLE_BRANCH_PROFILING CFLAGS_prom_init.o += -ffreestanding +CFLAGS_prom_init.o += $(call cc-option, -ftrivial-auto-var-init=uninitialized) ifdef CONFIG_FUNCTION_TRACER # Do not trace early boot code @@ -34,6 +34,19 @@ KASAN_SANITIZE_early_32.o := n KASAN_SANITIZE_cputable.o := n KASAN_SANITIZE_prom_init.o := n KASAN_SANITIZE_btext.o := n +KASAN_SANITIZE_paca.o := n +KASAN_SANITIZE_setup_64.o := n +KASAN_SANITIZE_mce.o := n +KASAN_SANITIZE_mce_power.o := n +KASAN_SANITIZE_udbg.o := n +KASAN_SANITIZE_udbg_16550.o := n + +# we have to be particularly careful in ppc64 to exclude code that +# runs with translations off, as we cannot access the shadow with +# translations off. However, ppc32 can sanitize this. +ifdef CONFIG_PPC64 +KASAN_SANITIZE_traps.o := n +endif ifdef CONFIG_KASAN CFLAGS_early_32.o += -DDISABLE_BRANCH_PROFILING @@ -41,71 +54,87 @@ CFLAGS_cputable.o += -DDISABLE_BRANCH_PROFILING CFLAGS_btext.o += -DDISABLE_BRANCH_PROFILING endif -obj-y := cputable.o ptrace.o syscalls.o \ - irq.o align.o signal_32.o pmc.o vdso.o \ +KCSAN_SANITIZE_early_32.o := n +KCSAN_SANITIZE_early_64.o := n +KCSAN_SANITIZE_cputable.o := n +KCSAN_SANITIZE_btext.o := n +KCSAN_SANITIZE_paca.o := n +KCSAN_SANITIZE_setup_64.o := n + +#ifdef CONFIG_RANDOMIZE_KSTACK_OFFSET +# Remove stack protector to avoid triggering unneeded stack canary +# checks due to randomize_kstack_offset. +CFLAGS_REMOVE_syscall.o = -fstack-protector -fstack-protector-strong +CFLAGS_syscall.o += -fno-stack-protector +#endif + +obj-y := cputable.o syscalls.o switch.o \ + irq.o align.o signal_$(BITS).o pmc.o vdso.o \ process.o systbl.o idle.o \ signal.o sysfs.o cacheinfo.o time.o \ prom.o traps.o setup-common.o \ udbg.o misc.o io.o misc_$(BITS).o \ - of_platform.o prom_parse.o -obj-$(CONFIG_PPC64) += setup_64.o sys_ppc32.o \ - signal_64.o ptrace32.o \ - paca.o nvram_64.o firmware.o note.o -obj-$(CONFIG_VDSO32) += vdso32/ + of_platform.o prom_parse.o firmware.o \ + hw_breakpoint_constraints.o interrupt.o \ + kdebugfs.o stacktrace.o syscall.o +obj-y += ptrace/ +obj-$(CONFIG_PPC64) += setup_64.o irq_64.o\ + paca.o nvram_64.o note.o +obj-$(CONFIG_PPC32) += sys_ppc32.o +obj-$(CONFIG_COMPAT) += sys_ppc32.o signal_32.o +obj-$(CONFIG_VDSO32) += vdso32_wrapper.o obj-$(CONFIG_PPC_WATCHDOG) += watchdog.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_PPC_DAWR) += dawr.o obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_ppc970.o cpu_setup_pa6t.o obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_power.o obj-$(CONFIG_PPC_BOOK3S_64) += mce.o mce_power.o -obj-$(CONFIG_PPC_BOOK3E_64) += exceptions-64e.o idle_book3e.o +obj-$(CONFIG_PPC_BOOK3E_64) += exceptions-64e.o idle_64e.o obj-$(CONFIG_PPC_BARRIER_NOSPEC) += security.o -obj-$(CONFIG_PPC64) += vdso64/ +obj-$(CONFIG_PPC64) += vdso64_wrapper.o obj-$(CONFIG_ALTIVEC) += vecemu.o -obj-$(CONFIG_PPC_970_NAP) += idle_power4.o -obj-$(CONFIG_PPC_P7_NAP) += idle_book3s.o +obj-$(CONFIG_PPC_BOOK3S_IDLE) += idle_book3s.o procfs-y := proc_powerpc.o obj-$(CONFIG_PROC_FS) += $(procfs-y) rtaspci-$(CONFIG_PPC64)-$(CONFIG_PCI) := rtas_pci.o -obj-$(CONFIG_PPC_RTAS) += rtas.o rtas-rtc.o $(rtaspci-y-y) +obj-$(CONFIG_PPC_RTAS) += rtas_entry.o rtas.o rtas-rtc.o $(rtaspci-y-y) obj-$(CONFIG_PPC_RTAS_DAEMON) += rtasd.o obj-$(CONFIG_RTAS_FLASH) += rtas_flash.o obj-$(CONFIG_RTAS_PROC) += rtas-proc.o obj-$(CONFIG_PPC_DT_CPU_FTRS) += dt_cpu_ftrs.o -obj-$(CONFIG_EEH) += eeh.o eeh_pe.o eeh_dev.o eeh_cache.o \ +obj-$(CONFIG_EEH) += eeh.o eeh_pe.o eeh_cache.o \ eeh_driver.o eeh_event.o eeh_sysfs.o obj-$(CONFIG_GENERIC_TBSYNC) += smp-tbsync.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_FA_DUMP) += fadump.o obj-$(CONFIG_PRESERVE_FA_DUMP) += fadump.o -ifdef CONFIG_PPC32 -obj-$(CONFIG_E500) += idle_e500.o -endif +obj-$(CONFIG_PPC_85xx) += idle_85xx.o obj-$(CONFIG_PPC_BOOK3S_32) += idle_6xx.o l2cr_6xx.o cpu_setup_6xx.o obj-$(CONFIG_TAU) += tau_6xx.o obj-$(CONFIG_HIBERNATION) += swsusp.o suspend.o -ifdef CONFIG_FSL_BOOKE -obj-$(CONFIG_HIBERNATION) += swsusp_booke.o +ifdef CONFIG_PPC_85xx +obj-$(CONFIG_HIBERNATION) += swsusp_85xx.o else obj-$(CONFIG_HIBERNATION) += swsusp_$(BITS).o endif obj64-$(CONFIG_HIBERNATION) += swsusp_asm64.o obj-$(CONFIG_MODULES) += module.o module_$(BITS).o obj-$(CONFIG_44x) += cpu_setup_44x.o -obj-$(CONFIG_PPC_FSL_BOOK3E) += cpu_setup_fsl_booke.o +obj-$(CONFIG_PPC_E500) += cpu_setup_e500.o obj-$(CONFIG_PPC_DOORBELL) += dbell.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o -extra-y := head_$(BITS).o -extra-$(CONFIG_40x) := head_40x.o -extra-$(CONFIG_44x) := head_44x.o -extra-$(CONFIG_FSL_BOOKE) := head_fsl_booke.o -extra-$(CONFIG_PPC_8xx) := head_8xx.o +obj-$(CONFIG_PPC64) += head_64.o +obj-$(CONFIG_PPC_BOOK3S_32) += head_book3s_32.o +obj-$(CONFIG_40x) += head_40x.o +obj-$(CONFIG_44x) += head_44x.o +obj-$(CONFIG_PPC_8xx) += head_8xx.o +obj-$(CONFIG_PPC_85xx) += head_85xx.o extra-y += vmlinux.lds obj-$(CONFIG_RELOCATABLE) += reloc_$(BITS).o -obj-$(CONFIG_PPC32) += entry_32.o setup_32.o early_32.o +obj-$(CONFIG_PPC32) += entry_32.o setup_32.o early_32.o static_call.o obj-$(CONFIG_PPC64) += dma-iommu.o iommu.o obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_BOOTX_TEXT) += btext.o @@ -115,7 +144,6 @@ obj-$(CONFIG_OPTPROBES) += optprobes.o optprobes_head.o obj-$(CONFIG_KPROBES_ON_FTRACE) += kprobes-ftrace.o obj-$(CONFIG_UPROBES) += uprobes.o obj-$(CONFIG_PPC_UDBG_16550) += legacy_serial.o udbg_16550.o -obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-$(CONFIG_SWIOTLB) += dma-swiotlb.o obj-$(CONFIG_ARCH_HAS_DMA_SET_MASK) += dma-mask.o @@ -137,9 +165,6 @@ endif obj64-$(CONFIG_PPC_TRANSACTIONAL_MEM) += tm.o -obj-$(CONFIG_PPC64) += $(obj64-y) -obj-$(CONFIG_PPC32) += $(obj32-y) - ifneq ($(CONFIG_XMON)$(CONFIG_KEXEC_CORE)(CONFIG_PPC_BOOK3S),) obj-y += ppc_save_regs.o endif @@ -156,13 +181,19 @@ obj-$(CONFIG_PPC_SECVAR_SYSFS) += secvar-sysfs.o # Disable GCOV, KCOV & sanitizers in odd or sensitive code GCOV_PROFILE_prom_init.o := n KCOV_INSTRUMENT_prom_init.o := n +KCSAN_SANITIZE_prom_init.o := n UBSAN_SANITIZE_prom_init.o := n GCOV_PROFILE_kprobes.o := n KCOV_INSTRUMENT_kprobes.o := n +KCSAN_SANITIZE_kprobes.o := n UBSAN_SANITIZE_kprobes.o := n GCOV_PROFILE_kprobes-ftrace.o := n KCOV_INSTRUMENT_kprobes-ftrace.o := n +KCSAN_SANITIZE_kprobes-ftrace.o := n UBSAN_SANITIZE_kprobes-ftrace.o := n +GCOV_PROFILE_syscall_64.o := n +KCOV_INSTRUMENT_syscall_64.o := n +UBSAN_SANITIZE_syscall_64.o := n UBSAN_SANITIZE_vdso.o := n # Necessary for booting with kcov enabled on book3e machines @@ -170,12 +201,18 @@ KCOV_INSTRUMENT_cputable.o := n KCOV_INSTRUMENT_setup_64.o := n KCOV_INSTRUMENT_paca.o := n -extra-$(CONFIG_PPC_FPU) += fpu.o -extra-$(CONFIG_ALTIVEC) += vector.o -extra-$(CONFIG_PPC64) += entry_64.o -extra-$(CONFIG_PPC_OF_BOOT_TRAMPOLINE) += prom_init.o +CFLAGS_setup_64.o += -fno-stack-protector +CFLAGS_paca.o += -fno-stack-protector + +obj-$(CONFIG_PPC_FPU) += fpu.o +obj-$(CONFIG_ALTIVEC) += vector.o -extra-$(CONFIG_PPC_OF_BOOT_TRAMPOLINE) += prom_init_check +obj-$(CONFIG_PPC_OF_BOOT_TRAMPOLINE) += prom_init.o +obj64-$(CONFIG_PPC_OF_BOOT_TRAMPOLINE) += prom_entry_64.o +extra-$(CONFIG_PPC_OF_BOOT_TRAMPOLINE) += prom_init_check + +obj-$(CONFIG_PPC64) += $(obj64-y) +obj-$(CONFIG_PPC32) += $(obj32-y) quiet_cmd_prom_init_check = PROMCHK $@ cmd_prom_init_check = $(CONFIG_SHELL) $< "$(NM)" $(obj)/prom_init.o; touch $@ @@ -185,3 +222,10 @@ $(obj)/prom_init_check: $(src)/prom_init_check.sh $(obj)/prom_init.o FORCE targets += prom_init_check clean-files := vmlinux.lds + +# Force dependency (incbin is bad) +$(obj)/vdso32_wrapper.o : $(obj)/vdso/vdso32.so.dbg +$(obj)/vdso64_wrapper.o : $(obj)/vdso/vdso64.so.dbg + +# for cleaning +subdir- += vdso diff --git a/arch/powerpc/kernel/align.c b/arch/powerpc/kernel/align.c index 92045ed64976..3e37ece06739 100644 --- a/arch/powerpc/kernel/align.c +++ b/arch/powerpc/kernel/align.c @@ -24,6 +24,7 @@ #include <asm/disassemble.h> #include <asm/cpu_has_feature.h> #include <asm/sstep.h> +#include <asm/inst.h> struct aligninfo { unsigned char len; @@ -104,9 +105,8 @@ static struct aligninfo spe_aligninfo[32] = { * so we don't need the address swizzling. */ static int emulate_spe(struct pt_regs *regs, unsigned int reg, - unsigned int instr) + ppc_inst_t ppc_instr) { - int ret; union { u64 ll; u32 w[2]; @@ -115,8 +115,9 @@ static int emulate_spe(struct pt_regs *regs, unsigned int reg, } data, temp; unsigned char __user *p, *addr; unsigned long *evr = ¤t->thread.evr[reg]; - unsigned int nb, flags; + unsigned int nb, flags, instr; + instr = ppc_inst_val(ppc_instr); instr = (instr >> 1) & 0x1f; /* DAR has the operand effective address */ @@ -125,11 +126,6 @@ static int emulate_spe(struct pt_regs *regs, unsigned int reg, nb = spe_aligninfo[instr].len; flags = spe_aligninfo[instr].flags; - /* Verify the address of the operand */ - if (unlikely(user_mode(regs) && - !access_ok(addr, nb))) - return -EFAULT; - /* userland only */ if (unlikely(!user_mode(regs))) return 0; @@ -167,26 +163,27 @@ static int emulate_spe(struct pt_regs *regs, unsigned int reg, } } else { temp.ll = data.ll = 0; - ret = 0; p = addr; + if (!user_read_access_begin(addr, nb)) + return -EFAULT; + switch (nb) { case 8: - ret |= __get_user_inatomic(temp.v[0], p++); - ret |= __get_user_inatomic(temp.v[1], p++); - ret |= __get_user_inatomic(temp.v[2], p++); - ret |= __get_user_inatomic(temp.v[3], p++); - /* fall through */ + unsafe_get_user(temp.v[0], p++, Efault_read); + unsafe_get_user(temp.v[1], p++, Efault_read); + unsafe_get_user(temp.v[2], p++, Efault_read); + unsafe_get_user(temp.v[3], p++, Efault_read); + fallthrough; case 4: - ret |= __get_user_inatomic(temp.v[4], p++); - ret |= __get_user_inatomic(temp.v[5], p++); - /* fall through */ + unsafe_get_user(temp.v[4], p++, Efault_read); + unsafe_get_user(temp.v[5], p++, Efault_read); + fallthrough; case 2: - ret |= __get_user_inatomic(temp.v[6], p++); - ret |= __get_user_inatomic(temp.v[7], p++); - if (unlikely(ret)) - return -EFAULT; + unsafe_get_user(temp.v[6], p++, Efault_read); + unsafe_get_user(temp.v[7], p++, Efault_read); } + user_read_access_end(); switch (instr) { case EVLDD: @@ -253,31 +250,41 @@ static int emulate_spe(struct pt_regs *regs, unsigned int reg, /* Store result to memory or update registers */ if (flags & ST) { - ret = 0; p = addr; + + if (!user_write_access_begin(addr, nb)) + return -EFAULT; + switch (nb) { case 8: - ret |= __put_user_inatomic(data.v[0], p++); - ret |= __put_user_inatomic(data.v[1], p++); - ret |= __put_user_inatomic(data.v[2], p++); - ret |= __put_user_inatomic(data.v[3], p++); - /* fall through */ + unsafe_put_user(data.v[0], p++, Efault_write); + unsafe_put_user(data.v[1], p++, Efault_write); + unsafe_put_user(data.v[2], p++, Efault_write); + unsafe_put_user(data.v[3], p++, Efault_write); + fallthrough; case 4: - ret |= __put_user_inatomic(data.v[4], p++); - ret |= __put_user_inatomic(data.v[5], p++); - /* fall through */ + unsafe_put_user(data.v[4], p++, Efault_write); + unsafe_put_user(data.v[5], p++, Efault_write); + fallthrough; case 2: - ret |= __put_user_inatomic(data.v[6], p++); - ret |= __put_user_inatomic(data.v[7], p++); + unsafe_put_user(data.v[6], p++, Efault_write); + unsafe_put_user(data.v[7], p++, Efault_write); } - if (unlikely(ret)) - return -EFAULT; + user_write_access_end(); } else { *evr = data.w[0]; regs->gpr[reg] = data.w[1]; } return 1; + +Efault_read: + user_read_access_end(); + return -EFAULT; + +Efault_write: + user_write_access_end(); + return -EFAULT; } #endif /* CONFIG_SPE */ @@ -293,28 +300,27 @@ static int emulate_spe(struct pt_regs *regs, unsigned int reg, int fix_alignment(struct pt_regs *regs) { - unsigned int instr; + ppc_inst_t instr; struct instruction_op op; int r, type; - /* - * We require a complete register set, if not, then our assembly - * is broken - */ - CHECK_FULL_REGS(regs); + if (is_kernel_addr(regs->nip)) + r = copy_inst_from_kernel_nofault(&instr, (void *)regs->nip); + else + r = __get_user_instr(instr, (void __user *)regs->nip); - if (unlikely(__get_user(instr, (unsigned int __user *)regs->nip))) + if (unlikely(r)) return -EFAULT; if ((regs->msr & MSR_LE) != (MSR_KERNEL & MSR_LE)) { /* We don't handle PPC little-endian any more... */ if (cpu_has_feature(CPU_FTR_PPC_LE)) return -EIO; - instr = swab32(instr); + instr = ppc_inst_swab(instr); } #ifdef CONFIG_SPE - if ((instr >> 26) == 0x4) { - int reg = (instr >> 21) & 0x1f; + if (ppc_inst_primary_opcode(instr) == 0x4) { + int reg = (ppc_inst_val(instr) >> 21) & 0x1f; PPC_WARN_ALIGNMENT(spe, regs); return emulate_spe(regs, reg, instr); } @@ -331,7 +337,7 @@ int fix_alignment(struct pt_regs *regs) * when pasting to a co-processor. Furthermore, paste_last is the * synchronisation point for preceding copy/paste sequences. */ - if ((instr & 0xfc0006fe) == (PPC_INST_COPY & 0xfc0006fe)) + if ((ppc_inst_val(instr) & 0xfc0006fe) == (PPC_INST_COPY & 0xfc0006fe)) return -EIO; r = analyse_instr(&op, regs, instr); @@ -343,6 +349,7 @@ int fix_alignment(struct pt_regs *regs) if (op.type != CACHEOP + DCBZ) return -EINVAL; PPC_WARN_ALIGNMENT(dcbz, regs); + WARN_ON_ONCE(!user_mode(regs)); r = emulate_dcbz(op.ea, regs); } else { if (type == LARX || type == STCX) diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 3d47aec7becf..9f14d95b8b32 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -9,8 +9,6 @@ * #defines from the assembly-language output. */ -#define GENERATING_ASM_OFFSETS /* asm/smp.h */ - #include <linux/compat.h> #include <linux/signal.h> #include <linux/sched.h> @@ -30,7 +28,6 @@ #include <asm/io.h> #include <asm/page.h> -#include <asm/pgtable.h> #include <asm/processor.h> #include <asm/cputable.h> #include <asm/thread_info.h> @@ -57,12 +54,12 @@ #endif #ifdef CONFIG_PPC32 -#if defined(CONFIG_BOOKE) || defined(CONFIG_40x) +#ifdef CONFIG_BOOKE_OR_40x #include "head_booke.h" #endif #endif -#if defined(CONFIG_PPC_FSL_BOOK3E) +#if defined(CONFIG_PPC_E500) #include "../mm/mmu_decl.h" #endif @@ -70,8 +67,12 @@ #include <asm/fixmap.h> #endif +#ifdef CONFIG_XMON +#include "../xmon/xmon_bpts.h" +#endif + #define STACK_PT_REGS_OFFSET(sym, val) \ - DEFINE(sym, STACK_FRAME_OVERHEAD + offsetof(struct pt_regs, val)) + DEFINE(sym, STACK_INT_FRAME_REGS + offsetof(struct pt_regs, val)) int main(void) { @@ -83,22 +84,17 @@ int main(void) OFFSET(PACA_CANARY, paca_struct, canary); #endif #endif - OFFSET(MMCONTEXTID, mm_struct, context.id); -#ifdef CONFIG_PPC64 - DEFINE(SIGSEGV, SIGSEGV); - DEFINE(NMI_MASK, NMI_MASK); -#else - OFFSET(KSP_LIMIT, thread_struct, ksp_limit); +#ifdef CONFIG_PPC32 #ifdef CONFIG_PPC_RTAS OFFSET(RTAS_SP, thread_struct, rtas_sp); #endif #endif /* CONFIG_PPC64 */ OFFSET(TASK_STACK, task_struct, stack); #ifdef CONFIG_SMP - OFFSET(TASK_CPU, task_struct, cpu); + OFFSET(TASK_CPU, task_struct, thread_info.cpu); #endif -#ifdef CONFIG_LIVEPATCH +#ifdef CONFIG_LIVEPATCH_64 OFFSET(TI_livepatch_sp, thread_info, livepatch_sp); #endif @@ -107,15 +103,16 @@ int main(void) #ifdef CONFIG_BOOKE OFFSET(THREAD_NORMSAVES, thread_struct, normsave[0]); #endif +#ifdef CONFIG_PPC_FPU OFFSET(THREAD_FPEXC_MODE, thread_struct, fpexc_mode); OFFSET(THREAD_FPSTATE, thread_struct, fp_state.fpr); OFFSET(THREAD_FPSAVEAREA, thread_struct, fp_save_area); +#endif OFFSET(FPSTATE_FPSCR, thread_fp_state, fpscr); OFFSET(THREAD_LOAD_FP, thread_struct, load_fp); #ifdef CONFIG_ALTIVEC OFFSET(THREAD_VRSTATE, thread_struct, vr_state.vr); OFFSET(THREAD_VRSAVEAREA, thread_struct, vr_save_area); - OFFSET(THREAD_VRSAVE, thread_struct, vrsave); OFFSET(THREAD_USED_VR, thread_struct, used_vr); OFFSET(VRSTATE_VSCR, thread_vr_state, vscr); OFFSET(THREAD_LOAD_VEC, thread_struct, load_vec); @@ -127,25 +124,35 @@ int main(void) OFFSET(KSP_VSID, thread_struct, ksp_vsid); #else /* CONFIG_PPC64 */ OFFSET(PGDIR, thread_struct, pgdir); + OFFSET(SRR0, thread_struct, srr0); + OFFSET(SRR1, thread_struct, srr1); + OFFSET(DAR, thread_struct, dar); + OFFSET(DSISR, thread_struct, dsisr); +#ifdef CONFIG_PPC_BOOK3S_32 + OFFSET(THR0, thread_struct, r0); + OFFSET(THR3, thread_struct, r3); + OFFSET(THR4, thread_struct, r4); + OFFSET(THR5, thread_struct, r5); + OFFSET(THR6, thread_struct, r6); + OFFSET(THR8, thread_struct, r8); + OFFSET(THR9, thread_struct, r9); + OFFSET(THR11, thread_struct, r11); + OFFSET(THLR, thread_struct, lr); + OFFSET(THCTR, thread_struct, ctr); + OFFSET(THSR0, thread_struct, sr0); +#endif #ifdef CONFIG_SPE OFFSET(THREAD_EVR0, thread_struct, evr[0]); OFFSET(THREAD_ACC, thread_struct, acc); - OFFSET(THREAD_SPEFSCR, thread_struct, spefscr); OFFSET(THREAD_USED_SPE, thread_struct, used_spe); #endif /* CONFIG_SPE */ #endif /* CONFIG_PPC64 */ -#if defined(CONFIG_4xx) || defined(CONFIG_BOOKE) - OFFSET(THREAD_DBCR0, thread_struct, debug.dbcr0); -#endif #ifdef CONFIG_KVM_BOOK3S_32_HANDLER OFFSET(THREAD_KVM_SVCPU, thread_struct, kvm_shadow_vcpu); #endif #if defined(CONFIG_KVM) && defined(CONFIG_BOOKE) OFFSET(THREAD_KVM_VCPU, thread_struct, kvm_vcpu); #endif -#if defined(CONFIG_PPC_BOOK3S_32) && defined(CONFIG_PPC_KUAP) - OFFSET(KUAP, thread_struct, kuap); -#endif #ifdef CONFIG_PPC_TRANSACTIONAL_MEM OFFSET(PACATMSCRATCH, paca_struct, tm_scratch); @@ -155,28 +162,21 @@ int main(void) OFFSET(THREAD_TM_TAR, thread_struct, tm_tar); OFFSET(THREAD_TM_PPR, thread_struct, tm_ppr); OFFSET(THREAD_TM_DSCR, thread_struct, tm_dscr); + OFFSET(THREAD_TM_AMR, thread_struct, tm_amr); OFFSET(PT_CKPT_REGS, thread_struct, ckpt_regs); OFFSET(THREAD_CKVRSTATE, thread_struct, ckvr_state.vr); OFFSET(THREAD_CKVRSAVE, thread_struct, ckvrsave); OFFSET(THREAD_CKFPSTATE, thread_struct, ckfp_state.fpr); - /* Local pt_regs on stack for Transactional Memory funcs. */ - DEFINE(TM_FRAME_SIZE, STACK_FRAME_OVERHEAD + - sizeof(struct pt_regs) + 16); + /* Local pt_regs on stack in int frame form, plus 16 bytes for TM */ + DEFINE(TM_FRAME_SIZE, STACK_INT_FRAME_SIZE + 16); #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ - OFFSET(TI_FLAGS, thread_info, flags); OFFSET(TI_LOCAL_FLAGS, thread_info, local_flags); - OFFSET(TI_PREEMPT, thread_info, preempt_count); #ifdef CONFIG_PPC64 OFFSET(DCACHEL1BLOCKSIZE, ppc64_caches, l1d.block_size); OFFSET(DCACHEL1LOGBLOCKSIZE, ppc64_caches, l1d.log_block_size); - OFFSET(DCACHEL1BLOCKSPERPAGE, ppc64_caches, l1d.blocks_per_page); - OFFSET(ICACHEL1BLOCKSIZE, ppc64_caches, l1i.block_size); - OFFSET(ICACHEL1LOGBLOCKSIZE, ppc64_caches, l1i.log_block_size); - OFFSET(ICACHEL1BLOCKSPERPAGE, ppc64_caches, l1i.blocks_per_page); /* paca */ - DEFINE(PACA_SIZE, sizeof(struct paca_struct)); OFFSET(PACAPACAINDEX, paca_struct, paca_index); OFFSET(PACAPROCSTART, paca_struct, cpu_start); OFFSET(PACAKSAVE, paca_struct, kstack); @@ -185,23 +185,20 @@ int main(void) offsetof(struct task_struct, thread_info)); OFFSET(PACASAVEDMSR, paca_struct, saved_msr); OFFSET(PACAR1, paca_struct, saved_r1); +#ifndef CONFIG_PPC_KERNEL_PCREL OFFSET(PACATOC, paca_struct, kernel_toc); +#endif OFFSET(PACAKBASE, paca_struct, kernelbase); OFFSET(PACAKMSR, paca_struct, kernel_msr); +#ifdef CONFIG_PPC_BOOK3S_64 + OFFSET(PACAHSRR_VALID, paca_struct, hsrr_valid); + OFFSET(PACASRR_VALID, paca_struct, srr_valid); +#endif OFFSET(PACAIRQSOFTMASK, paca_struct, irq_soft_mask); OFFSET(PACAIRQHAPPENED, paca_struct, irq_happened); OFFSET(PACA_FTRACE_ENABLED, paca_struct, ftrace_enabled); -#ifdef CONFIG_PPC_BOOK3S - OFFSET(PACACONTEXTID, paca_struct, mm_ctx_id); -#ifdef CONFIG_PPC_MM_SLICES - OFFSET(PACALOWSLICESPSIZE, paca_struct, mm_ctx_low_slices_psize); - OFFSET(PACAHIGHSLICEPSIZE, paca_struct, mm_ctx_high_slices_psize); - OFFSET(PACA_SLB_ADDR_LIMIT, paca_struct, mm_ctx_slb_addr_limit); - DEFINE(MMUPSIZEDEFSIZE, sizeof(struct mmu_psize_def)); -#endif /* CONFIG_PPC_MM_SLICES */ -#endif -#ifdef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_BOOK3E_64 OFFSET(PACAPGD, paca_struct, pgd); OFFSET(PACA_KERNELPGD, paca_struct, kernel_pgd); OFFSET(PACA_EXGEN, paca_struct, exgen); @@ -217,36 +214,23 @@ int main(void) OFFSET(TCD_ESEL_NEXT, tlb_core_data, esel_next); OFFSET(TCD_ESEL_MAX, tlb_core_data, esel_max); OFFSET(TCD_ESEL_FIRST, tlb_core_data, esel_first); -#endif /* CONFIG_PPC_BOOK3E */ +#endif /* CONFIG_PPC_BOOK3E_64 */ #ifdef CONFIG_PPC_BOOK3S_64 - OFFSET(PACASLBCACHE, paca_struct, slb_cache); - OFFSET(PACASLBCACHEPTR, paca_struct, slb_cache_ptr); - OFFSET(PACASTABRR, paca_struct, stab_rr); - OFFSET(PACAVMALLOCSLLP, paca_struct, vmalloc_sllp); -#ifdef CONFIG_PPC_MM_SLICES - OFFSET(MMUPSIZESLLP, mmu_psize_def, sllp); -#else - OFFSET(PACACONTEXTSLLP, paca_struct, mm_ctx_sllp); -#endif /* CONFIG_PPC_MM_SLICES */ OFFSET(PACA_EXGEN, paca_struct, exgen); OFFSET(PACA_EXMC, paca_struct, exmc); - OFFSET(PACA_EXSLB, paca_struct, exslb); OFFSET(PACA_EXNMI, paca_struct, exnmi); -#ifdef CONFIG_PPC_PSERIES - OFFSET(PACALPPACAPTR, paca_struct, lppaca_ptr); -#endif +#ifdef CONFIG_PPC_64S_HASH_MMU OFFSET(PACA_SLBSHADOWPTR, paca_struct, slb_shadow_ptr); OFFSET(SLBSHADOW_STACKVSID, slb_shadow, save_area[SLB_NUM_BOLTED - 1].vsid); OFFSET(SLBSHADOW_STACKESID, slb_shadow, save_area[SLB_NUM_BOLTED - 1].esid); OFFSET(SLBSHADOW_SAVEAREA, slb_shadow, save_area); +#endif OFFSET(LPPACA_PMCINUSE, lppaca, pmcregs_in_use); #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE OFFSET(PACA_PMCINUSE, paca_struct, pmcregs_in_use); #endif - OFFSET(LPPACA_DTLIDX, lppaca, dtl_idx); OFFSET(LPPACA_YIELDCOUNT, lppaca, yield_count); - OFFSET(PACA_DTL_RIDX, paca_struct, dtl_ridx); #endif /* CONFIG_PPC_BOOK3S_64 */ OFFSET(PACAEMERGSP, paca_struct, emergency_sp); #ifdef CONFIG_PPC_BOOK3S_64 @@ -262,21 +246,14 @@ int main(void) OFFSET(PACAHWCPUID, paca_struct, hw_cpu_id); OFFSET(PACAKEXECSTATE, paca_struct, kexec_state); OFFSET(PACA_DSCR_DEFAULT, paca_struct, dscr_default); - OFFSET(ACCOUNT_STARTTIME, paca_struct, accounting.starttime); - OFFSET(ACCOUNT_STARTTIME_USER, paca_struct, accounting.starttime_user); - OFFSET(ACCOUNT_USER_TIME, paca_struct, accounting.utime); - OFFSET(ACCOUNT_SYSTEM_TIME, paca_struct, accounting.stime); -#ifdef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC64 + OFFSET(PACA_EXIT_SAVE_R1, paca_struct, exit_save_r1); +#endif +#ifdef CONFIG_PPC_BOOK3E_64 OFFSET(PACA_TRAP_SAVE, paca_struct, trap_save); #endif OFFSET(PACA_SPRG_VDSO, paca_struct, sprg_vdso); #else /* CONFIG_PPC64 */ -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE - OFFSET(ACCOUNT_STARTTIME, thread_info, accounting.starttime); - OFFSET(ACCOUNT_STARTTIME_USER, thread_info, accounting.starttime_user); - OFFSET(ACCOUNT_USER_TIME, thread_info, accounting.utime); - OFFSET(ACCOUNT_SYSTEM_TIME, thread_info, accounting.stime); -#endif #endif /* CONFIG_PPC64 */ /* RTAS */ @@ -285,7 +262,7 @@ int main(void) /* Interrupt register frame */ DEFINE(INT_FRAME_SIZE, STACK_INT_FRAME_SIZE); - DEFINE(SWITCH_FRAME_SIZE, STACK_FRAME_OVERHEAD + sizeof(struct pt_regs)); + DEFINE(SWITCH_FRAME_SIZE, STACK_SWITCH_FRAME_SIZE); STACK_PT_REGS_OFFSET(GPR0, gpr[0]); STACK_PT_REGS_OFFSET(GPR1, gpr[1]); STACK_PT_REGS_OFFSET(GPR2, gpr[2]); @@ -300,9 +277,6 @@ int main(void) STACK_PT_REGS_OFFSET(GPR11, gpr[11]); STACK_PT_REGS_OFFSET(GPR12, gpr[12]); STACK_PT_REGS_OFFSET(GPR13, gpr[13]); -#ifndef CONFIG_PPC64 - STACK_PT_REGS_OFFSET(GPR14, gpr[14]); -#endif /* CONFIG_PPC64 */ /* * Note: these symbols include _ because they overlap with special * register names @@ -314,53 +288,39 @@ int main(void) STACK_PT_REGS_OFFSET(_CCR, ccr); STACK_PT_REGS_OFFSET(_XER, xer); STACK_PT_REGS_OFFSET(_DAR, dar); + STACK_PT_REGS_OFFSET(_DEAR, dear); STACK_PT_REGS_OFFSET(_DSISR, dsisr); + STACK_PT_REGS_OFFSET(_ESR, esr); STACK_PT_REGS_OFFSET(ORIG_GPR3, orig_gpr3); STACK_PT_REGS_OFFSET(RESULT, result); STACK_PT_REGS_OFFSET(_TRAP, trap); -#ifndef CONFIG_PPC64 - /* - * The PowerPC 400-class & Book-E processors have neither the DAR - * nor the DSISR SPRs. Hence, we overload them to hold the similar - * DEAR and ESR SPRs for such processors. For critical interrupts - * we use them to hold SRR0 and SRR1. - */ - STACK_PT_REGS_OFFSET(_DEAR, dar); - STACK_PT_REGS_OFFSET(_ESR, dsisr); -#else /* CONFIG_PPC64 */ +#ifdef CONFIG_PPC64 STACK_PT_REGS_OFFSET(SOFTE, softe); STACK_PT_REGS_OFFSET(_PPR, ppr); -#endif /* CONFIG_PPC64 */ +#endif -#ifdef CONFIG_PPC_KUAP - STACK_PT_REGS_OFFSET(STACK_REGS_KUAP, kuap); +#ifdef CONFIG_PPC_PKEY + STACK_PT_REGS_OFFSET(STACK_REGS_AMR, amr); + STACK_PT_REGS_OFFSET(STACK_REGS_IAMR, iamr); #endif -#if defined(CONFIG_PPC32) -#if defined(CONFIG_BOOKE) || defined(CONFIG_40x) - DEFINE(EXC_LVL_SIZE, STACK_EXC_LVL_FRAME_SIZE); - DEFINE(MAS0, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, mas0)); +#if defined(CONFIG_PPC32) && defined(CONFIG_BOOKE) + STACK_PT_REGS_OFFSET(MAS0, mas0); /* we overload MMUCR for 44x on MAS0 since they are mutually exclusive */ - DEFINE(MMUCR, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, mas0)); - DEFINE(MAS1, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, mas1)); - DEFINE(MAS2, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, mas2)); - DEFINE(MAS3, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, mas3)); - DEFINE(MAS6, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, mas6)); - DEFINE(MAS7, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, mas7)); - DEFINE(_SRR0, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, srr0)); - DEFINE(_SRR1, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, srr1)); - DEFINE(_CSRR0, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, csrr0)); - DEFINE(_CSRR1, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, csrr1)); - DEFINE(_DSRR0, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, dsrr0)); - DEFINE(_DSRR1, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, dsrr1)); - DEFINE(SAVED_KSP_LIMIT, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, saved_ksp_limit)); -#endif + STACK_PT_REGS_OFFSET(MMUCR, mas0); + STACK_PT_REGS_OFFSET(MAS1, mas1); + STACK_PT_REGS_OFFSET(MAS2, mas2); + STACK_PT_REGS_OFFSET(MAS3, mas3); + STACK_PT_REGS_OFFSET(MAS6, mas6); + STACK_PT_REGS_OFFSET(MAS7, mas7); + STACK_PT_REGS_OFFSET(_SRR0, srr0); + STACK_PT_REGS_OFFSET(_SRR1, srr1); + STACK_PT_REGS_OFFSET(_CSRR0, csrr0); + STACK_PT_REGS_OFFSET(_CSRR1, csrr1); + STACK_PT_REGS_OFFSET(_DSRR0, dsrr0); + STACK_PT_REGS_OFFSET(_DSRR1, dsrr1); #endif -#ifndef CONFIG_PPC64 - OFFSET(MM_PGD, mm_struct, pgd); -#endif /* ! CONFIG_PPC64 */ - /* About the CPU features table */ OFFSET(CPU_SPEC_FEATURES, cpu_spec, cpu_features); OFFSET(CPU_SPEC_SETUP, cpu_spec, cpu_setup); @@ -376,56 +336,23 @@ int main(void) #endif /* ! CONFIG_PPC64 */ /* datapage offsets for use by vdso */ - OFFSET(CFG_TB_ORIG_STAMP, vdso_data, tb_orig_stamp); - OFFSET(CFG_TB_TICKS_PER_SEC, vdso_data, tb_ticks_per_sec); - OFFSET(CFG_TB_TO_XS, vdso_data, tb_to_xs); - OFFSET(CFG_TB_UPDATE_COUNT, vdso_data, tb_update_count); - OFFSET(CFG_TZ_MINUTEWEST, vdso_data, tz_minuteswest); - OFFSET(CFG_TZ_DSTTIME, vdso_data, tz_dsttime); - OFFSET(CFG_SYSCALL_MAP32, vdso_data, syscall_map_32); - OFFSET(WTOM_CLOCK_SEC, vdso_data, wtom_clock_sec); - OFFSET(WTOM_CLOCK_NSEC, vdso_data, wtom_clock_nsec); - OFFSET(STAMP_XTIME_SEC, vdso_data, stamp_xtime_sec); - OFFSET(STAMP_XTIME_NSEC, vdso_data, stamp_xtime_nsec); - OFFSET(STAMP_SEC_FRAC, vdso_data, stamp_sec_fraction); - OFFSET(CLOCK_HRTIMER_RES, vdso_data, hrtimer_res); - OFFSET(CFG_ICACHE_BLOCKSZ, vdso_data, icache_block_size); - OFFSET(CFG_DCACHE_BLOCKSZ, vdso_data, dcache_block_size); - OFFSET(CFG_ICACHE_LOGBLOCKSZ, vdso_data, icache_log_block_size); - OFFSET(CFG_DCACHE_LOGBLOCKSZ, vdso_data, dcache_log_block_size); + OFFSET(VDSO_DATA_OFFSET, vdso_arch_data, data); + OFFSET(CFG_TB_TICKS_PER_SEC, vdso_arch_data, tb_ticks_per_sec); #ifdef CONFIG_PPC64 - OFFSET(CFG_SYSCALL_MAP64, vdso_data, syscall_map_64); - OFFSET(TVAL64_TV_SEC, __kernel_old_timeval, tv_sec); - OFFSET(TVAL64_TV_USEC, __kernel_old_timeval, tv_usec); -#endif - OFFSET(TSPC64_TV_SEC, __kernel_timespec, tv_sec); - OFFSET(TSPC64_TV_NSEC, __kernel_timespec, tv_nsec); - OFFSET(TVAL32_TV_SEC, old_timeval32, tv_sec); - OFFSET(TVAL32_TV_USEC, old_timeval32, tv_usec); - OFFSET(TSPC32_TV_SEC, old_timespec32, tv_sec); - OFFSET(TSPC32_TV_NSEC, old_timespec32, tv_nsec); - /* timeval/timezone offsets for use by vdso */ - OFFSET(TZONE_TZ_MINWEST, timezone, tz_minuteswest); - OFFSET(TZONE_TZ_DSTTIME, timezone, tz_dsttime); - - /* Other bits used by the vdso */ - DEFINE(CLOCK_REALTIME, CLOCK_REALTIME); - DEFINE(CLOCK_MONOTONIC, CLOCK_MONOTONIC); - DEFINE(CLOCK_REALTIME_COARSE, CLOCK_REALTIME_COARSE); - DEFINE(CLOCK_MONOTONIC_COARSE, CLOCK_MONOTONIC_COARSE); - DEFINE(NSEC_PER_SEC, NSEC_PER_SEC); + OFFSET(CFG_ICACHE_BLOCKSZ, vdso_arch_data, icache_block_size); + OFFSET(CFG_DCACHE_BLOCKSZ, vdso_arch_data, dcache_block_size); + OFFSET(CFG_ICACHE_LOGBLOCKSZ, vdso_arch_data, icache_log_block_size); + OFFSET(CFG_DCACHE_LOGBLOCKSZ, vdso_arch_data, dcache_log_block_size); + OFFSET(CFG_SYSCALL_MAP64, vdso_arch_data, syscall_map); + OFFSET(CFG_SYSCALL_MAP32, vdso_arch_data, compat_syscall_map); +#else + OFFSET(CFG_SYSCALL_MAP32, vdso_arch_data, syscall_map); +#endif #ifdef CONFIG_BUG DEFINE(BUG_ENTRY_SIZE, sizeof(struct bug_entry)); #endif -#ifdef CONFIG_PPC_BOOK3S_64 - DEFINE(PGD_TABLE_SIZE, (sizeof(pgd_t) << max(RADIX_PGD_INDEX_SIZE, H_PGD_INDEX_SIZE))); -#else - DEFINE(PGD_TABLE_SIZE, PGD_TABLE_SIZE); -#endif - DEFINE(PTE_SIZE, sizeof(pte_t)); - #ifdef CONFIG_KVM OFFSET(VCPU_HOST_STACK, kvm_vcpu, arch.host_stack); OFFSET(VCPU_HOST_PID, kvm_vcpu, arch.host_pid); @@ -453,7 +380,7 @@ int main(void) OFFSET(VCPU_SPRG2, kvm_vcpu, arch.shregs.sprg2); OFFSET(VCPU_SPRG3, kvm_vcpu, arch.shregs.sprg3); #endif -#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING +#ifdef CONFIG_KVM_BOOK3S_HV_P8_TIMING OFFSET(VCPU_TB_RMENTRY, kvm_vcpu, arch.rm_entry); OFFSET(VCPU_TB_RMINTR, kvm_vcpu, arch.rm_intr); OFFSET(VCPU_TB_RMEXIT, kvm_vcpu, arch.rm_exit); @@ -492,23 +419,18 @@ int main(void) /* book3s */ #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE - OFFSET(KVM_TLB_SETS, kvm, arch.tlb_sets); OFFSET(KVM_SDR1, kvm, arch.sdr1); OFFSET(KVM_HOST_LPID, kvm, arch.host_lpid); OFFSET(KVM_HOST_LPCR, kvm, arch.host_lpcr); OFFSET(KVM_HOST_SDR1, kvm, arch.host_sdr1); - OFFSET(KVM_NEED_FLUSH, kvm, arch.need_tlb_flush.bits); OFFSET(KVM_ENABLED_HCALLS, kvm, arch.enabled_hcalls); OFFSET(KVM_VRMA_SLB_V, kvm, arch.vrma_slb_v); - OFFSET(KVM_RADIX, kvm, arch.radix); - OFFSET(KVM_FWNMI, kvm, arch.fwnmi_enabled); OFFSET(KVM_SECURE_GUEST, kvm, arch.secure_guest); OFFSET(VCPU_DSISR, kvm_vcpu, arch.shregs.dsisr); OFFSET(VCPU_DAR, kvm_vcpu, arch.shregs.dar); OFFSET(VCPU_VPA, kvm_vcpu, arch.vpa.pinned_addr); OFFSET(VCPU_VPA_DIRTY, kvm_vcpu, arch.vpa.dirty); OFFSET(VCPU_HEIR, kvm_vcpu, arch.emul_inst); - OFFSET(VCPU_NESTED, kvm_vcpu, arch.nested); OFFSET(VCPU_CPU, kvm_vcpu, cpu); OFFSET(VCPU_THREAD_CPU, kvm_vcpu, arch.thread_cpu); #endif @@ -523,20 +445,18 @@ int main(void) OFFSET(VCPU_CTRL, kvm_vcpu, arch.ctrl); OFFSET(VCPU_DABR, kvm_vcpu, arch.dabr); OFFSET(VCPU_DABRX, kvm_vcpu, arch.dabrx); - OFFSET(VCPU_DAWR, kvm_vcpu, arch.dawr); - OFFSET(VCPU_DAWRX, kvm_vcpu, arch.dawrx); + OFFSET(VCPU_DAWR0, kvm_vcpu, arch.dawr0); + OFFSET(VCPU_DAWRX0, kvm_vcpu, arch.dawrx0); OFFSET(VCPU_CIABR, kvm_vcpu, arch.ciabr); OFFSET(VCPU_HFLAGS, kvm_vcpu, arch.hflags); - OFFSET(VCPU_DEC, kvm_vcpu, arch.dec); OFFSET(VCPU_DEC_EXPIRES, kvm_vcpu, arch.dec_expires); OFFSET(VCPU_PENDING_EXC, kvm_vcpu, arch.pending_exceptions); OFFSET(VCPU_CEDED, kvm_vcpu, arch.ceded); OFFSET(VCPU_PRODDED, kvm_vcpu, arch.prodded); - OFFSET(VCPU_IRQ_PENDING, kvm_vcpu, arch.irq_pending); - OFFSET(VCPU_DBELL_REQ, kvm_vcpu, arch.doorbell_request); OFFSET(VCPU_MMCR, kvm_vcpu, arch.mmcr); + OFFSET(VCPU_MMCRA, kvm_vcpu, arch.mmcra); + OFFSET(VCPU_MMCRS, kvm_vcpu, arch.mmcrs); OFFSET(VCPU_PMC, kvm_vcpu, arch.pmc); - OFFSET(VCPU_SPMC, kvm_vcpu, arch.spmc); OFFSET(VCPU_SIAR, kvm_vcpu, arch.siar); OFFSET(VCPU_SDAR, kvm_vcpu, arch.sdar); OFFSET(VCPU_SIER, kvm_vcpu, arch.sier); @@ -545,7 +465,6 @@ int main(void) OFFSET(VCPU_SLB_NR, kvm_vcpu, arch.slb_nr); OFFSET(VCPU_FAULT_DSISR, kvm_vcpu, arch.fault_dsisr); OFFSET(VCPU_FAULT_DAR, kvm_vcpu, arch.fault_dar); - OFFSET(VCPU_FAULT_GPA, kvm_vcpu, arch.fault_gpa); OFFSET(VCPU_INTR_MSR, kvm_vcpu, arch.intr_msr); OFFSET(VCPU_LAST_INST, kvm_vcpu, arch.last_inst); OFFSET(VCPU_TRAP, kvm_vcpu, arch.trap); @@ -561,8 +480,6 @@ int main(void) OFFSET(VCPU_TCSCR, kvm_vcpu, arch.tcscr); OFFSET(VCPU_ACOP, kvm_vcpu, arch.acop); OFFSET(VCPU_WORT, kvm_vcpu, arch.wort); - OFFSET(VCPU_TID, kvm_vcpu, arch.tid); - OFFSET(VCPU_PSSCR, kvm_vcpu, arch.psscr); OFFSET(VCPU_HFSCR, kvm_vcpu, arch.hfscr); OFFSET(VCORE_ENTRY_EXIT, kvmppc_vcore, entry_exit_map); OFFSET(VCORE_IN_GUEST, kvmppc_vcore, in_guest); @@ -657,13 +574,8 @@ int main(void) HSTATE_FIELD(HSTATE_HWTHREAD_STATE, hwthread_state); HSTATE_FIELD(HSTATE_KVM_VCPU, kvm_vcpu); HSTATE_FIELD(HSTATE_KVM_VCORE, kvm_vcore); - HSTATE_FIELD(HSTATE_XICS_PHYS, xics_phys); - HSTATE_FIELD(HSTATE_XIVE_TIMA_PHYS, xive_tima_phys); - HSTATE_FIELD(HSTATE_XIVE_TIMA_VIRT, xive_tima_virt); - HSTATE_FIELD(HSTATE_SAVED_XIRR, saved_xirr); HSTATE_FIELD(HSTATE_HOST_IPI, host_ipi); HSTATE_FIELD(HSTATE_PTID, ptid); - HSTATE_FIELD(HSTATE_TID, tid); HSTATE_FIELD(HSTATE_FAKE_SUSPEND, fake_suspend); HSTATE_FIELD(HSTATE_MMCR0, host_mmcr[0]); HSTATE_FIELD(HSTATE_MMCR1, host_mmcr[1]); @@ -690,8 +602,6 @@ int main(void) OFFSET(KVM_SPLIT_LDBAR, kvm_split_mode, ldbar); OFFSET(KVM_SPLIT_DO_NAP, kvm_split_mode, do_nap); OFFSET(KVM_SPLIT_NAPPED, kvm_split_mode, napped); - OFFSET(KVM_SPLIT_DO_SET, kvm_split_mode, do_set); - OFFSET(KVM_SPLIT_DO_RESTORE, kvm_split_mode, do_restore); #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ #ifdef CONFIG_PPC_BOOK3S_64 @@ -728,7 +638,7 @@ int main(void) DEFINE(PGD_T_LOG2, PGD_T_LOG2); DEFINE(PTE_T_LOG2, PTE_T_LOG2); #endif -#ifdef CONFIG_PPC_FSL_BOOK3E +#ifdef CONFIG_PPC_E500 DEFINE(TLBCAM_SIZE, sizeof(struct tlbcam)); OFFSET(TLBCAM_MAS0, tlbcam, MAS0); OFFSET(TLBCAM_MAS1, tlbcam, MAS1); @@ -749,17 +659,6 @@ int main(void) OFFSET(VCPU_HOST_MAS6, kvm_vcpu, arch.host_mas6); #endif -#ifdef CONFIG_KVM_XICS - DEFINE(VCPU_XIVE_SAVED_STATE, offsetof(struct kvm_vcpu, - arch.xive_saved_state)); - DEFINE(VCPU_XIVE_CAM_WORD, offsetof(struct kvm_vcpu, - arch.xive_cam_word)); - DEFINE(VCPU_XIVE_PUSHED, offsetof(struct kvm_vcpu, arch.xive_pushed)); - DEFINE(VCPU_XIVE_ESC_ON, offsetof(struct kvm_vcpu, arch.xive_esc_on)); - DEFINE(VCPU_XIVE_ESC_RADDR, offsetof(struct kvm_vcpu, arch.xive_esc_raddr)); - DEFINE(VCPU_XIVE_ESC_VADDR, offsetof(struct kvm_vcpu, arch.xive_esc_vaddr)); -#endif - #ifdef CONFIG_KVM_EXIT_TIMING OFFSET(VCPU_TIMING_EXIT_TBU, kvm_vcpu, arch.timing_exit.tv32.tbu); OFFSET(VCPU_TIMING_EXIT_TBL, kvm_vcpu, arch.timing_exit.tv32.tbl); @@ -768,11 +667,14 @@ int main(void) #endif DEFINE(PPC_DBELL_SERVER, PPC_DBELL_SERVER); - DEFINE(PPC_DBELL_MSGTYPE, PPC_DBELL_MSGTYPE); #ifdef CONFIG_PPC_8xx DEFINE(VIRT_IMMR_BASE, (u64)__fix_to_virt(FIX_IMMR_BASE)); #endif +#ifdef CONFIG_XMON + DEFINE(BPT_SIZE, BPT_SIZE); +#endif + return 0; } diff --git a/arch/powerpc/kernel/audit.c b/arch/powerpc/kernel/audit.c index a2dddd7f3d09..92298d6a3a37 100644 --- a/arch/powerpc/kernel/audit.c +++ b/arch/powerpc/kernel/audit.c @@ -4,6 +4,8 @@ #include <linux/audit.h> #include <asm/unistd.h> +#include "audit_32.h" + static unsigned dir_class[] = { #include <asm-generic/audit_dir_write.h> ~0U @@ -41,21 +43,22 @@ int audit_classify_arch(int arch) int audit_classify_syscall(int abi, unsigned syscall) { #ifdef CONFIG_PPC64 - extern int ppc32_classify_syscall(unsigned); if (abi == AUDIT_ARCH_PPC) return ppc32_classify_syscall(syscall); #endif switch(syscall) { case __NR_open: - return 2; + return AUDITSC_OPEN; case __NR_openat: - return 3; + return AUDITSC_OPENAT; case __NR_socketcall: - return 4; + return AUDITSC_SOCKETCALL; case __NR_execve: - return 5; + return AUDITSC_EXECVE; + case __NR_openat2: + return AUDITSC_OPENAT2; default: - return 0; + return AUDITSC_NATIVE; } } diff --git a/arch/powerpc/kernel/audit_32.h b/arch/powerpc/kernel/audit_32.h new file mode 100644 index 000000000000..c6c79c3041ab --- /dev/null +++ b/arch/powerpc/kernel/audit_32.h @@ -0,0 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef __AUDIT_32_H__ +#define __AUDIT_32_H__ + +extern int ppc32_classify_syscall(unsigned); + +#endif diff --git a/arch/powerpc/kernel/btext.c b/arch/powerpc/kernel/btext.c index 6dfceaa820e4..7f63f1cdc6c3 100644 --- a/arch/powerpc/kernel/btext.c +++ b/arch/powerpc/kernel/btext.c @@ -8,14 +8,15 @@ #include <linux/string.h> #include <linux/init.h> #include <linux/export.h> +#include <linux/font.h> #include <linux/memblock.h> +#include <linux/pgtable.h> +#include <linux/of.h> #include <asm/sections.h> -#include <asm/prom.h> #include <asm/btext.h> #include <asm/page.h> #include <asm/mmu.h> -#include <asm/pgtable.h> #include <asm/io.h> #include <asm/processor.h> #include <asm/udbg.h> @@ -26,7 +27,7 @@ static void scrollscreen(void); #endif -#define __force_data __attribute__((__section__(".data"))) +#define __force_data __section(".data") static int g_loc_X __force_data; static int g_loc_Y __force_data; @@ -41,12 +42,7 @@ static unsigned char *logicalDisplayBase __force_data; unsigned long disp_BAT[2] __initdata = {0, 0}; -#define cmapsz (16*256) - -static unsigned char vga_font[cmapsz]; - -int boot_text_mapped __force_data = 0; -int force_printk_to_btext = 0; +static int boot_text_mapped __force_data; extern void rmci_on(void); extern void rmci_off(void); @@ -74,7 +70,7 @@ static inline void rmci_maybe_off(void) * the display during identify_machine() and MMU_Init() * * The display is mapped to virtual address 0xD0000000, rather - * than 1:1, because some some CHRP machines put the frame buffer + * than 1:1, because some CHRP machines put the frame buffer * in the region starting at 0xC0000000 (PAGE_OFFSET). * This mapping is temporary and will disappear as soon as the * setup done by MMU_Init() is applied. @@ -95,19 +91,10 @@ void __init btext_prepare_BAT(void) boot_text_mapped = 0; return; } - if (PVR_VER(mfspr(SPRN_PVR)) != 1) { - /* 603, 604, G3, G4, ... */ - lowbits = addr & ~0xFF000000UL; - addr &= 0xFF000000UL; - disp_BAT[0] = vaddr | (BL_16M<<2) | 2; - disp_BAT[1] = addr | (_PAGE_NO_CACHE | _PAGE_GUARDED | BPP_RW); - } else { - /* 601 */ - lowbits = addr & ~0xFF800000UL; - addr &= 0xFF800000UL; - disp_BAT[0] = vaddr | (_PAGE_NO_CACHE | PP_RWXX) | 4; - disp_BAT[1] = addr | BL_8M | 0x40; - } + lowbits = addr & ~0xFF000000UL; + addr &= 0xFF000000UL; + disp_BAT[0] = vaddr | (BL_16M<<2) | 2; + disp_BAT[1] = addr | (_PAGE_NO_CACHE | _PAGE_GUARDED | BPP_RW); logicalDisplayBase = (void *) (vaddr + lowbits); } #endif @@ -170,7 +157,7 @@ void btext_map(void) boot_text_mapped = 1; } -static int btext_initialize(struct device_node *np) +static int __init btext_initialize(struct device_node *np) { unsigned int width, height, depth, pitch; unsigned long address = 0; @@ -245,13 +232,15 @@ int __init btext_find_display(int allow_nonstdout) return rc; for_each_node_by_type(np, "display") { - if (of_get_property(np, "linux,opened", NULL)) { + if (of_property_read_bool(np, "linux,opened")) { printk("trying %pOF ...\n", np); rc = btext_initialize(np); printk("result: %d\n", rc); } - if (rc == 0) + if (rc == 0) { + of_node_put(np); break; + } } return rc; } @@ -299,7 +288,7 @@ void btext_update_display(unsigned long phys, int width, int height, } EXPORT_SYMBOL(btext_update_display); -void btext_clearscreen(void) +void __init btext_clearscreen(void) { unsigned int *base = (unsigned int *)calc_base(0, 0); unsigned long width = ((dispDeviceRect[2] - dispDeviceRect[0]) * @@ -317,7 +306,7 @@ void btext_clearscreen(void) rmci_maybe_off(); } -void btext_flushscreen(void) +void __init btext_flushscreen(void) { unsigned int *base = (unsigned int *)calc_base(0, 0); unsigned long width = ((dispDeviceRect[2] - dispDeviceRect[0]) * @@ -336,7 +325,7 @@ void btext_flushscreen(void) __asm__ __volatile__ ("sync" ::: "memory"); } -void btext_flushline(void) +void __init btext_flushline(void) { unsigned int *base = (unsigned int *)calc_base(0, g_loc_Y << 4); unsigned long width = ((dispDeviceRect[2] - dispDeviceRect[0]) * @@ -415,7 +404,7 @@ static unsigned int expand_bits_16[4] = { }; -static void draw_byte_32(unsigned char *font, unsigned int *base, int rb) +static void draw_byte_32(const unsigned char *font, unsigned int *base, int rb) { int l, bits; int fg = 0xFFFFFFFFUL; @@ -436,7 +425,7 @@ static void draw_byte_32(unsigned char *font, unsigned int *base, int rb) } } -static inline void draw_byte_16(unsigned char *font, unsigned int *base, int rb) +static inline void draw_byte_16(const unsigned char *font, unsigned int *base, int rb) { int l, bits; int fg = 0xFFFFFFFFUL; @@ -454,7 +443,7 @@ static inline void draw_byte_16(unsigned char *font, unsigned int *base, int rb) } } -static inline void draw_byte_8(unsigned char *font, unsigned int *base, int rb) +static inline void draw_byte_8(const unsigned char *font, unsigned int *base, int rb) { int l, bits; int fg = 0x0F0F0F0FUL; @@ -473,7 +462,8 @@ static inline void draw_byte_8(unsigned char *font, unsigned int *base, int rb) static noinline void draw_byte(unsigned char c, long locX, long locY) { unsigned char *base = calc_base(locX << 3, locY << 4); - unsigned char *font = &vga_font[((unsigned int)c) * 16]; + unsigned int font_index = c * 16; + const unsigned char *font = font_sun_8x16.data + font_index; int rb = dispDeviceRowBytes; rmci_maybe_on(); @@ -551,7 +541,7 @@ void btext_drawstring(const char *c) btext_drawchar(*c++); } -void btext_drawtext(const char *c, unsigned int len) +void __init btext_drawtext(const char *c, unsigned int len) { if (!boot_text_mapped) return; @@ -559,7 +549,7 @@ void btext_drawtext(const char *c, unsigned int len) btext_drawchar(*c++); } -void btext_drawhex(unsigned long v) +void __init btext_drawhex(unsigned long v) { if (!boot_text_mapped) return; @@ -591,349 +581,3 @@ void __init udbg_init_btext(void) */ udbg_putc = btext_drawchar; } - -static unsigned char vga_font[cmapsz] = { -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7e, 0x81, 0xa5, 0x81, 0x81, 0xbd, -0x99, 0x81, 0x81, 0x7e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7e, 0xff, -0xdb, 0xff, 0xff, 0xc3, 0xe7, 0xff, 0xff, 0x7e, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x00, 0x00, 0x6c, 0xfe, 0xfe, 0xfe, 0xfe, 0x7c, 0x38, 0x10, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x38, 0x7c, 0xfe, -0x7c, 0x38, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, -0x3c, 0x3c, 0xe7, 0xe7, 0xe7, 0x18, 0x18, 0x3c, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x00, 0x18, 0x3c, 0x7e, 0xff, 0xff, 0x7e, 0x18, 0x18, 0x3c, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x3c, -0x3c, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, -0xff, 0xff, 0xe7, 0xc3, 0xc3, 0xe7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, -0x00, 0x00, 0x00, 0x00, 0x00, 0x3c, 0x66, 0x42, 0x42, 0x66, 0x3c, 0x00, -0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xc3, 0x99, 0xbd, -0xbd, 0x99, 0xc3, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x1e, 0x0e, -0x1a, 0x32, 0x78, 0xcc, 0xcc, 0xcc, 0xcc, 0x78, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x3c, 0x66, 0x66, 0x66, 0x66, 0x3c, 0x18, 0x7e, 0x18, 0x18, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3f, 0x33, 0x3f, 0x30, 0x30, 0x30, -0x30, 0x70, 0xf0, 0xe0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7f, 0x63, -0x7f, 0x63, 0x63, 0x63, 0x63, 0x67, 0xe7, 0xe6, 0xc0, 0x00, 0x00, 0x00, -0x00, 0x00, 0x00, 0x18, 0x18, 0xdb, 0x3c, 0xe7, 0x3c, 0xdb, 0x18, 0x18, -0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfe, 0xf8, -0xf0, 0xe0, 0xc0, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x06, 0x0e, -0x1e, 0x3e, 0xfe, 0x3e, 0x1e, 0x0e, 0x06, 0x02, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x18, 0x3c, 0x7e, 0x18, 0x18, 0x18, 0x7e, 0x3c, 0x18, 0x00, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, -0x66, 0x00, 0x66, 0x66, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7f, 0xdb, -0xdb, 0xdb, 0x7b, 0x1b, 0x1b, 0x1b, 0x1b, 0x1b, 0x00, 0x00, 0x00, 0x00, -0x00, 0x7c, 0xc6, 0x60, 0x38, 0x6c, 0xc6, 0xc6, 0x6c, 0x38, 0x0c, 0xc6, -0x7c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -0xfe, 0xfe, 0xfe, 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x3c, -0x7e, 0x18, 0x18, 0x18, 0x7e, 0x3c, 0x18, 0x7e, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x18, 0x3c, 0x7e, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, -0x18, 0x7e, 0x3c, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -0x00, 0x18, 0x0c, 0xfe, 0x0c, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x60, 0xfe, 0x60, 0x30, 0x00, 0x00, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc0, 0xc0, -0xc0, 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -0x00, 0x24, 0x66, 0xff, 0x66, 0x24, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x00, 0x00, 0x10, 0x38, 0x38, 0x7c, 0x7c, 0xfe, 0xfe, 0x00, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0xfe, 0x7c, 0x7c, -0x38, 0x38, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x18, 0x3c, 0x3c, 0x3c, 0x18, 0x18, 0x18, 0x00, 0x18, 0x18, -0x00, 0x00, 0x00, 0x00, 0x00, 0x66, 0x66, 0x66, 0x24, 0x00, 0x00, 0x00, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x6c, -0x6c, 0xfe, 0x6c, 0x6c, 0x6c, 0xfe, 0x6c, 0x6c, 0x00, 0x00, 0x00, 0x00, -0x18, 0x18, 0x7c, 0xc6, 0xc2, 0xc0, 0x7c, 0x06, 0x06, 0x86, 0xc6, 0x7c, -0x18, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2, 0xc6, 0x0c, 0x18, -0x30, 0x60, 0xc6, 0x86, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x38, 0x6c, -0x6c, 0x38, 0x76, 0xdc, 0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00, -0x00, 0x30, 0x30, 0x30, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0c, 0x18, 0x30, 0x30, 0x30, 0x30, -0x30, 0x30, 0x18, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x18, -0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x18, 0x30, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x00, 0x00, 0x00, 0x66, 0x3c, 0xff, 0x3c, 0x66, 0x00, 0x00, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x7e, -0x18, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x18, 0x30, 0x00, 0x00, 0x00, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7e, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -0x02, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, 0x80, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x7c, 0xc6, 0xc6, 0xce, 0xde, 0xf6, 0xe6, 0xc6, 0xc6, 0x7c, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x38, 0x78, 0x18, 0x18, 0x18, -0x18, 0x18, 0x18, 0x7e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7c, 0xc6, -0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, 0xc6, 0xfe, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x7c, 0xc6, 0x06, 0x06, 0x3c, 0x06, 0x06, 0x06, 0xc6, 0x7c, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0c, 0x1c, 0x3c, 0x6c, 0xcc, 0xfe, -0x0c, 0x0c, 0x0c, 0x1e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0xc0, -0xc0, 0xc0, 0xfc, 0x06, 0x06, 0x06, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x38, 0x60, 0xc0, 0xc0, 0xfc, 0xc6, 0xc6, 0xc6, 0xc6, 0x7c, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0xc6, 0x06, 0x06, 0x0c, 0x18, -0x30, 0x30, 0x30, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7c, 0xc6, -0xc6, 0xc6, 0x7c, 0xc6, 0xc6, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0x7e, 0x06, 0x06, 0x06, 0x0c, 0x78, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x00, 0x00, -0x00, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -0x18, 0x18, 0x00, 0x00, 0x00, 0x18, 0x18, 0x30, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x00, 0x06, 0x0c, 0x18, 0x30, 0x60, 0x30, 0x18, 0x0c, 0x06, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7e, 0x00, 0x00, -0x7e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x60, -0x30, 0x18, 0x0c, 0x06, 0x0c, 0x18, 0x30, 0x60, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x7c, 0xc6, 0xc6, 0x0c, 0x18, 0x18, 0x18, 0x00, 0x18, 0x18, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0xde, 0xde, -0xde, 0xdc, 0xc0, 0x7c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x38, -0x6c, 0xc6, 0xc6, 0xfe, 0xc6, 0xc6, 0xc6, 0xc6, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0xfc, 0x66, 0x66, 0x66, 0x7c, 0x66, 0x66, 0x66, 0x66, 0xfc, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3c, 0x66, 0xc2, 0xc0, 0xc0, 0xc0, -0xc0, 0xc2, 0x66, 0x3c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf8, 0x6c, -0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x6c, 0xf8, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0xfe, 0x66, 0x62, 0x68, 0x78, 0x68, 0x60, 0x62, 0x66, 0xfe, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0x66, 0x62, 0x68, 0x78, 0x68, -0x60, 0x60, 0x60, 0xf0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3c, 0x66, -0xc2, 0xc0, 0xc0, 0xde, 0xc6, 0xc6, 0x66, 0x3a, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0xc6, 0xc6, 0xc6, 0xc6, 0xfe, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3c, 0x18, 0x18, 0x18, 0x18, 0x18, -0x18, 0x18, 0x18, 0x3c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1e, 0x0c, -0x0c, 0x0c, 0x0c, 0x0c, 0xcc, 0xcc, 0xcc, 0x78, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0xe6, 0x66, 0x66, 0x6c, 0x78, 0x78, 0x6c, 0x66, 0x66, 0xe6, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x60, 0x60, 0x60, 0x60, 0x60, -0x60, 0x62, 0x66, 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc3, 0xe7, -0xff, 0xff, 0xdb, 0xc3, 0xc3, 0xc3, 0xc3, 0xc3, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0xc6, 0xe6, 0xf6, 0xfe, 0xde, 0xce, 0xc6, 0xc6, 0xc6, 0xc6, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, -0xc6, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, 0x66, -0x66, 0x66, 0x7c, 0x60, 0x60, 0x60, 0x60, 0xf0, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xd6, 0xde, 0x7c, -0x0c, 0x0e, 0x00, 0x00, 0x00, 0x00, 0xfc, 0x66, 0x66, 0x66, 0x7c, 0x6c, -0x66, 0x66, 0x66, 0xe6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7c, 0xc6, -0xc6, 0x60, 0x38, 0x0c, 0x06, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0xff, 0xdb, 0x99, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x3c, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, -0xc6, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc3, 0xc3, -0xc3, 0xc3, 0xc3, 0xc3, 0xc3, 0x66, 0x3c, 0x18, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0xc3, 0xc3, 0xc3, 0xc3, 0xc3, 0xdb, 0xdb, 0xff, 0x66, 0x66, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc3, 0xc3, 0x66, 0x3c, 0x18, 0x18, -0x3c, 0x66, 0xc3, 0xc3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc3, 0xc3, -0xc3, 0x66, 0x3c, 0x18, 0x18, 0x18, 0x18, 0x3c, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0xff, 0xc3, 0x86, 0x0c, 0x18, 0x30, 0x60, 0xc1, 0xc3, 0xff, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3c, 0x30, 0x30, 0x30, 0x30, 0x30, -0x30, 0x30, 0x30, 0x3c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, -0xc0, 0xe0, 0x70, 0x38, 0x1c, 0x0e, 0x06, 0x02, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x3c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x3c, -0x00, 0x00, 0x00, 0x00, 0x10, 0x38, 0x6c, 0xc6, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00, -0x30, 0x30, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x78, 0x0c, 0x7c, -0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xe0, 0x60, -0x60, 0x78, 0x6c, 0x66, 0x66, 0x66, 0x66, 0x7c, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x00, 0x00, 0x00, 0x7c, 0xc6, 0xc0, 0xc0, 0xc0, 0xc6, 0x7c, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1c, 0x0c, 0x0c, 0x3c, 0x6c, 0xcc, -0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -0x00, 0x7c, 0xc6, 0xfe, 0xc0, 0xc0, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x38, 0x6c, 0x64, 0x60, 0xf0, 0x60, 0x60, 0x60, 0x60, 0xf0, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x76, 0xcc, 0xcc, -0xcc, 0xcc, 0xcc, 0x7c, 0x0c, 0xcc, 0x78, 0x00, 0x00, 0x00, 0xe0, 0x60, -0x60, 0x6c, 0x76, 0x66, 0x66, 0x66, 0x66, 0xe6, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x18, 0x18, 0x00, 0x38, 0x18, 0x18, 0x18, 0x18, 0x18, 0x3c, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x06, 0x00, 0x0e, 0x06, 0x06, -0x06, 0x06, 0x06, 0x06, 0x66, 0x66, 0x3c, 0x00, 0x00, 0x00, 0xe0, 0x60, -0x60, 0x66, 0x6c, 0x78, 0x78, 0x6c, 0x66, 0xe6, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x38, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x3c, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xe6, 0xff, 0xdb, -0xdb, 0xdb, 0xdb, 0xdb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -0x00, 0xdc, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x00, 0x00, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x7c, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xdc, 0x66, 0x66, -0x66, 0x66, 0x66, 0x7c, 0x60, 0x60, 0xf0, 0x00, 0x00, 0x00, 0x00, 0x00, -0x00, 0x76, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0x7c, 0x0c, 0x0c, 0x1e, 0x00, -0x00, 0x00, 0x00, 0x00, 0x00, 0xdc, 0x76, 0x66, 0x60, 0x60, 0x60, 0xf0, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7c, 0xc6, 0x60, -0x38, 0x0c, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x30, -0x30, 0xfc, 0x30, 0x30, 0x30, 0x30, 0x36, 0x1c, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x00, 0x00, 0x00, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0x76, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc3, 0xc3, 0xc3, -0xc3, 0x66, 0x3c, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -0x00, 0xc3, 0xc3, 0xc3, 0xdb, 0xdb, 0xff, 0x66, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x00, 0x00, 0x00, 0xc3, 0x66, 0x3c, 0x18, 0x3c, 0x66, 0xc3, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc6, 0xc6, 0xc6, -0xc6, 0xc6, 0xc6, 0x7e, 0x06, 0x0c, 0xf8, 0x00, 0x00, 0x00, 0x00, 0x00, -0x00, 0xfe, 0xcc, 0x18, 0x30, 0x60, 0xc6, 0xfe, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x0e, 0x18, 0x18, 0x18, 0x70, 0x18, 0x18, 0x18, 0x18, 0x0e, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x18, 0x18, 0x00, 0x18, -0x18, 0x18, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x70, 0x18, -0x18, 0x18, 0x0e, 0x18, 0x18, 0x18, 0x18, 0x70, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x76, 0xdc, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x38, 0x6c, 0xc6, -0xc6, 0xc6, 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3c, 0x66, -0xc2, 0xc0, 0xc0, 0xc0, 0xc2, 0x66, 0x3c, 0x0c, 0x06, 0x7c, 0x00, 0x00, -0x00, 0x00, 0xcc, 0x00, 0x00, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0x76, -0x00, 0x00, 0x00, 0x00, 0x00, 0x0c, 0x18, 0x30, 0x00, 0x7c, 0xc6, 0xfe, -0xc0, 0xc0, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x38, 0x6c, -0x00, 0x78, 0x0c, 0x7c, 0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0xcc, 0x00, 0x00, 0x78, 0x0c, 0x7c, 0xcc, 0xcc, 0xcc, 0x76, -0x00, 0x00, 0x00, 0x00, 0x00, 0x60, 0x30, 0x18, 0x00, 0x78, 0x0c, 0x7c, -0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00, 0x00, 0x38, 0x6c, 0x38, -0x00, 0x78, 0x0c, 0x7c, 0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x00, 0x00, 0x3c, 0x66, 0x60, 0x60, 0x66, 0x3c, 0x0c, 0x06, -0x3c, 0x00, 0x00, 0x00, 0x00, 0x10, 0x38, 0x6c, 0x00, 0x7c, 0xc6, 0xfe, -0xc0, 0xc0, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc6, 0x00, -0x00, 0x7c, 0xc6, 0xfe, 0xc0, 0xc0, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00, -0x00, 0x60, 0x30, 0x18, 0x00, 0x7c, 0xc6, 0xfe, 0xc0, 0xc0, 0xc6, 0x7c, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x66, 0x00, 0x00, 0x38, 0x18, 0x18, -0x18, 0x18, 0x18, 0x3c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x3c, 0x66, -0x00, 0x38, 0x18, 0x18, 0x18, 0x18, 0x18, 0x3c, 0x00, 0x00, 0x00, 0x00, -0x00, 0x60, 0x30, 0x18, 0x00, 0x38, 0x18, 0x18, 0x18, 0x18, 0x18, 0x3c, -0x00, 0x00, 0x00, 0x00, 0x00, 0xc6, 0x00, 0x10, 0x38, 0x6c, 0xc6, 0xc6, -0xfe, 0xc6, 0xc6, 0xc6, 0x00, 0x00, 0x00, 0x00, 0x38, 0x6c, 0x38, 0x00, -0x38, 0x6c, 0xc6, 0xc6, 0xfe, 0xc6, 0xc6, 0xc6, 0x00, 0x00, 0x00, 0x00, -0x18, 0x30, 0x60, 0x00, 0xfe, 0x66, 0x60, 0x7c, 0x60, 0x60, 0x66, 0xfe, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x6e, 0x3b, 0x1b, -0x7e, 0xd8, 0xdc, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3e, 0x6c, -0xcc, 0xcc, 0xfe, 0xcc, 0xcc, 0xcc, 0xcc, 0xce, 0x00, 0x00, 0x00, 0x00, -0x00, 0x10, 0x38, 0x6c, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x7c, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc6, 0x00, 0x00, 0x7c, 0xc6, 0xc6, -0xc6, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x60, 0x30, 0x18, -0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00, -0x00, 0x30, 0x78, 0xcc, 0x00, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0x76, -0x00, 0x00, 0x00, 0x00, 0x00, 0x60, 0x30, 0x18, 0x00, 0xcc, 0xcc, 0xcc, -0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc6, 0x00, -0x00, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x7e, 0x06, 0x0c, 0x78, 0x00, -0x00, 0xc6, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x7c, -0x00, 0x00, 0x00, 0x00, 0x00, 0xc6, 0x00, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, -0xc6, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x7e, -0xc3, 0xc0, 0xc0, 0xc0, 0xc3, 0x7e, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00, -0x00, 0x38, 0x6c, 0x64, 0x60, 0xf0, 0x60, 0x60, 0x60, 0x60, 0xe6, 0xfc, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc3, 0x66, 0x3c, 0x18, 0xff, 0x18, -0xff, 0x18, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, 0x66, 0x66, -0x7c, 0x62, 0x66, 0x6f, 0x66, 0x66, 0x66, 0xf3, 0x00, 0x00, 0x00, 0x00, -0x00, 0x0e, 0x1b, 0x18, 0x18, 0x18, 0x7e, 0x18, 0x18, 0x18, 0x18, 0x18, -0xd8, 0x70, 0x00, 0x00, 0x00, 0x18, 0x30, 0x60, 0x00, 0x78, 0x0c, 0x7c, -0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0c, 0x18, 0x30, -0x00, 0x38, 0x18, 0x18, 0x18, 0x18, 0x18, 0x3c, 0x00, 0x00, 0x00, 0x00, -0x00, 0x18, 0x30, 0x60, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x7c, -0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x30, 0x60, 0x00, 0xcc, 0xcc, 0xcc, -0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x76, 0xdc, -0x00, 0xdc, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x00, 0x00, 0x00, 0x00, -0x76, 0xdc, 0x00, 0xc6, 0xe6, 0xf6, 0xfe, 0xde, 0xce, 0xc6, 0xc6, 0xc6, -0x00, 0x00, 0x00, 0x00, 0x00, 0x3c, 0x6c, 0x6c, 0x3e, 0x00, 0x7e, 0x00, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x38, 0x6c, 0x6c, -0x38, 0x00, 0x7c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x30, 0x30, 0x00, 0x30, 0x30, 0x60, 0xc0, 0xc6, 0xc6, 0x7c, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0xc0, -0xc0, 0xc0, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0xfe, 0x06, 0x06, 0x06, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, -0x00, 0xc0, 0xc0, 0xc2, 0xc6, 0xcc, 0x18, 0x30, 0x60, 0xce, 0x9b, 0x06, -0x0c, 0x1f, 0x00, 0x00, 0x00, 0xc0, 0xc0, 0xc2, 0xc6, 0xcc, 0x18, 0x30, -0x66, 0xce, 0x96, 0x3e, 0x06, 0x06, 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, -0x00, 0x18, 0x18, 0x18, 0x3c, 0x3c, 0x3c, 0x18, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x00, 0x00, 0x00, 0x36, 0x6c, 0xd8, 0x6c, 0x36, 0x00, 0x00, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xd8, 0x6c, 0x36, -0x6c, 0xd8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x44, 0x11, 0x44, -0x11, 0x44, 0x11, 0x44, 0x11, 0x44, 0x11, 0x44, 0x11, 0x44, 0x11, 0x44, -0x55, 0xaa, 0x55, 0xaa, 0x55, 0xaa, 0x55, 0xaa, 0x55, 0xaa, 0x55, 0xaa, -0x55, 0xaa, 0x55, 0xaa, 0xdd, 0x77, 0xdd, 0x77, 0xdd, 0x77, 0xdd, 0x77, -0xdd, 0x77, 0xdd, 0x77, 0xdd, 0x77, 0xdd, 0x77, 0x18, 0x18, 0x18, 0x18, -0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, -0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0xf8, 0x18, 0x18, 0x18, 0x18, -0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0xf8, 0x18, 0xf8, -0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x36, 0x36, 0x36, 0x36, -0x36, 0x36, 0x36, 0xf6, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0x36, 0x36, 0x36, 0x36, -0x36, 0x36, 0x36, 0x36, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf8, 0x18, 0xf8, -0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x36, 0x36, 0x36, 0x36, -0x36, 0xf6, 0x06, 0xf6, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, -0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, -0x36, 0x36, 0x36, 0x36, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0x06, 0xf6, -0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, -0x36, 0xf6, 0x06, 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0xfe, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x18, 0x18, 0x18, 0xf8, 0x18, 0xf8, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x00, 0xf8, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, -0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x1f, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0xff, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x00, 0xff, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, -0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x1f, 0x18, 0x18, 0x18, 0x18, -0x18, 0x18, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x18, 0x18, -0x18, 0x18, 0x18, 0xff, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, -0x18, 0x18, 0x18, 0x18, 0x18, 0x1f, 0x18, 0x1f, 0x18, 0x18, 0x18, 0x18, -0x18, 0x18, 0x18, 0x18, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x37, -0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, -0x36, 0x37, 0x30, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x00, 0x00, 0x00, 0x3f, 0x30, 0x37, 0x36, 0x36, 0x36, 0x36, -0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0xf7, 0x00, 0xff, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -0x00, 0xff, 0x00, 0xf7, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, -0x36, 0x36, 0x36, 0x36, 0x36, 0x37, 0x30, 0x37, 0x36, 0x36, 0x36, 0x36, -0x36, 0x36, 0x36, 0x36, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x00, 0xff, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x36, 0x36, 0x36, 0x36, -0x36, 0xf7, 0x00, 0xf7, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, -0x18, 0x18, 0x18, 0x18, 0x18, 0xff, 0x00, 0xff, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x00, 0x00, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0xff, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -0x00, 0xff, 0x00, 0xff, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x36, 0x36, 0x36, 0x36, -0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x3f, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x18, 0x18, -0x18, 0x1f, 0x18, 0x1f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x00, 0x00, 0x00, 0x1f, 0x18, 0x1f, 0x18, 0x18, 0x18, 0x18, -0x18, 0x18, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3f, -0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, -0x36, 0x36, 0x36, 0xff, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, -0x18, 0x18, 0x18, 0x18, 0x18, 0xff, 0x18, 0xff, 0x18, 0x18, 0x18, 0x18, -0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0xf8, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x00, 0x1f, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, -0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, -0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, -0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xf0, 0xf0, 0xf0, 0xf0, -0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, -0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, -0x0f, 0x0f, 0x0f, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -0x00, 0x76, 0xdc, 0xd8, 0xd8, 0xd8, 0xdc, 0x76, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x78, 0xcc, 0xcc, 0xcc, 0xd8, 0xcc, 0xc6, 0xc6, 0xc6, 0xcc, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0xc6, 0xc6, 0xc0, 0xc0, 0xc0, -0xc0, 0xc0, 0xc0, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -0xfe, 0x6c, 0x6c, 0x6c, 0x6c, 0x6c, 0x6c, 0x6c, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x00, 0xfe, 0xc6, 0x60, 0x30, 0x18, 0x30, 0x60, 0xc6, 0xfe, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7e, 0xd8, 0xd8, -0xd8, 0xd8, 0xd8, 0x70, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -0x66, 0x66, 0x66, 0x66, 0x66, 0x7c, 0x60, 0x60, 0xc0, 0x00, 0x00, 0x00, -0x00, 0x00, 0x00, 0x00, 0x76, 0xdc, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7e, 0x18, 0x3c, 0x66, 0x66, -0x66, 0x3c, 0x18, 0x7e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x38, -0x6c, 0xc6, 0xc6, 0xfe, 0xc6, 0xc6, 0x6c, 0x38, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x38, 0x6c, 0xc6, 0xc6, 0xc6, 0x6c, 0x6c, 0x6c, 0x6c, 0xee, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1e, 0x30, 0x18, 0x0c, 0x3e, 0x66, -0x66, 0x66, 0x66, 0x3c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -0x00, 0x7e, 0xdb, 0xdb, 0xdb, 0x7e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x00, 0x03, 0x06, 0x7e, 0xdb, 0xdb, 0xf3, 0x7e, 0x60, 0xc0, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1c, 0x30, 0x60, 0x60, 0x7c, 0x60, -0x60, 0x60, 0x30, 0x1c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7c, -0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x00, 0x00, 0xfe, 0x00, 0x00, 0xfe, 0x00, 0x00, 0xfe, 0x00, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x7e, 0x18, -0x18, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, -0x18, 0x0c, 0x06, 0x0c, 0x18, 0x30, 0x00, 0x7e, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x00, 0x0c, 0x18, 0x30, 0x60, 0x30, 0x18, 0x0c, 0x00, 0x7e, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x1b, 0x1b, 0x1b, 0x18, 0x18, -0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, -0x18, 0x18, 0x18, 0x18, 0xd8, 0xd8, 0xd8, 0x70, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x00, 0x7e, 0x00, 0x18, 0x18, 0x00, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x76, 0xdc, 0x00, -0x76, 0xdc, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x38, 0x6c, 0x6c, -0x38, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x00, 0x00, 0x00, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0f, 0x0c, 0x0c, -0x0c, 0x0c, 0x0c, 0xec, 0x6c, 0x6c, 0x3c, 0x1c, 0x00, 0x00, 0x00, 0x00, -0x00, 0xd8, 0x6c, 0x6c, 0x6c, 0x6c, 0x6c, 0x00, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x00, 0x00, 0x00, 0x70, 0xd8, 0x30, 0x60, 0xc8, 0xf8, 0x00, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -0x7c, 0x7c, 0x7c, 0x7c, 0x7c, 0x7c, 0x7c, 0x00, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -0x00, 0x00, 0x00, 0x00, -}; - diff --git a/arch/powerpc/kernel/cacheinfo.c b/arch/powerpc/kernel/cacheinfo.c index 470336277c67..f502337dd37d 100644 --- a/arch/powerpc/kernel/cacheinfo.c +++ b/arch/powerpc/kernel/cacheinfo.c @@ -7,6 +7,8 @@ * Author: Nathan Lynch */ +#define pr_fmt(fmt) "cacheinfo: " fmt + #include <linux/cpu.h> #include <linux/cpumask.h> #include <linux/kernel.h> @@ -16,7 +18,6 @@ #include <linux/of.h> #include <linux/percpu.h> #include <linux/slab.h> -#include <asm/prom.h> #include <asm/cputhreads.h> #include <asm/smp.h> @@ -118,6 +119,7 @@ struct cache { struct cpumask shared_cpu_map; /* online CPUs using this cache */ int type; /* split cache disambiguation */ int level; /* level not explicit in device tree */ + int group_id; /* id of the group of threads that share this cache */ struct list_head list; /* global list of cache objects */ struct cache *next_local; /* next cache of >= level */ }; @@ -140,22 +142,24 @@ static const char *cache_type_string(const struct cache *cache) } static void cache_init(struct cache *cache, int type, int level, - struct device_node *ofnode) + struct device_node *ofnode, int group_id) { cache->type = type; cache->level = level; cache->ofnode = of_node_get(ofnode); + cache->group_id = group_id; INIT_LIST_HEAD(&cache->list); list_add(&cache->list, &cache_list); } -static struct cache *new_cache(int type, int level, struct device_node *ofnode) +static struct cache *new_cache(int type, int level, + struct device_node *ofnode, int group_id) { struct cache *cache; cache = kzalloc(sizeof(*cache), GFP_KERNEL); if (cache) - cache_init(cache, type, level, ofnode); + cache_init(cache, type, level, ofnode, group_id); return cache; } @@ -166,7 +170,7 @@ static void release_cache_debugcheck(struct cache *cache) list_for_each_entry(iter, &cache_list, list) WARN_ONCE(iter->next_local == cache, - "cache for %pOF(%s) refers to cache for %pOF(%s)\n", + "cache for %pOFP(%s) refers to cache for %pOFP(%s)\n", iter->ofnode, cache_type_string(iter), cache->ofnode, @@ -178,7 +182,7 @@ static void release_cache(struct cache *cache) if (!cache) return; - pr_debug("freeing L%d %s cache for %pOF\n", cache->level, + pr_debug("freeing L%d %s cache for %pOFP\n", cache->level, cache_type_string(cache), cache->ofnode); release_cache_debugcheck(cache); @@ -193,7 +197,7 @@ static void cache_cpu_set(struct cache *cache, int cpu) while (next) { WARN_ONCE(cpumask_test_cpu(cpu, &next->shared_cpu_map), - "CPU %i already accounted in %pOF(%s)\n", + "CPU %i already accounted in %pOFP(%s)\n", cpu, next->ofnode, cache_type_string(next)); cpumask_set_cpu(cpu, &next->shared_cpu_map); @@ -307,20 +311,24 @@ static struct cache *cache_find_first_sibling(struct cache *cache) return cache; list_for_each_entry(iter, &cache_list, list) - if (iter->ofnode == cache->ofnode && iter->next_local == cache) + if (iter->ofnode == cache->ofnode && + iter->group_id == cache->group_id && + iter->next_local == cache) return iter; return cache; } -/* return the first cache on a local list matching node */ -static struct cache *cache_lookup_by_node(const struct device_node *node) +/* return the first cache on a local list matching node and thread-group id */ +static struct cache *cache_lookup_by_node_group(const struct device_node *node, + int group_id) { struct cache *cache = NULL; struct cache *iter; list_for_each_entry(iter, &cache_list, list) { - if (iter->ofnode != node) + if (iter->ofnode != node || + iter->group_id != group_id) continue; cache = cache_find_first_sibling(iter); break; @@ -350,23 +358,24 @@ static int cache_is_unified_d(const struct device_node *np) CACHE_TYPE_UNIFIED_D : CACHE_TYPE_UNIFIED; } -static struct cache *cache_do_one_devnode_unified(struct device_node *node, int level) +static struct cache *cache_do_one_devnode_unified(struct device_node *node, int group_id, + int level) { - pr_debug("creating L%d ucache for %pOF\n", level, node); + pr_debug("creating L%d ucache for %pOFP\n", level, node); - return new_cache(cache_is_unified_d(node), level, node); + return new_cache(cache_is_unified_d(node), level, node, group_id); } -static struct cache *cache_do_one_devnode_split(struct device_node *node, +static struct cache *cache_do_one_devnode_split(struct device_node *node, int group_id, int level) { struct cache *dcache, *icache; - pr_debug("creating L%d dcache and icache for %pOF\n", level, + pr_debug("creating L%d dcache and icache for %pOFP\n", level, node); - dcache = new_cache(CACHE_TYPE_DATA, level, node); - icache = new_cache(CACHE_TYPE_INSTRUCTION, level, node); + dcache = new_cache(CACHE_TYPE_DATA, level, node, group_id); + icache = new_cache(CACHE_TYPE_INSTRUCTION, level, node, group_id); if (!dcache || !icache) goto err; @@ -380,31 +389,32 @@ err: return NULL; } -static struct cache *cache_do_one_devnode(struct device_node *node, int level) +static struct cache *cache_do_one_devnode(struct device_node *node, int group_id, int level) { struct cache *cache; if (cache_node_is_unified(node)) - cache = cache_do_one_devnode_unified(node, level); + cache = cache_do_one_devnode_unified(node, group_id, level); else - cache = cache_do_one_devnode_split(node, level); + cache = cache_do_one_devnode_split(node, group_id, level); return cache; } static struct cache *cache_lookup_or_instantiate(struct device_node *node, + int group_id, int level) { struct cache *cache; - cache = cache_lookup_by_node(node); + cache = cache_lookup_by_node_group(node, group_id); WARN_ONCE(cache && cache->level != level, "cache level mismatch on lookup (got %d, expected %d)\n", cache->level, level); if (!cache) - cache = cache_do_one_devnode(node, level); + cache = cache_do_one_devnode(node, group_id, level); return cache; } @@ -418,15 +428,53 @@ static void link_cache_lists(struct cache *smaller, struct cache *bigger) } smaller->next_local = bigger; + + /* + * The cache->next_local list sorts by level ascending: + * L1d -> L1i -> L2 -> L3 ... + */ + WARN_ONCE((smaller->level == 1 && bigger->level > 2) || + (smaller->level > 1 && bigger->level != smaller->level + 1), + "linking L%i cache %pOFP to L%i cache %pOFP; skipped a level?\n", + smaller->level, smaller->ofnode, bigger->level, bigger->ofnode); } static void do_subsidiary_caches_debugcheck(struct cache *cache) { - WARN_ON_ONCE(cache->level != 1); - WARN_ON_ONCE(!of_node_is_type(cache->ofnode, "cpu")); + WARN_ONCE(cache->level != 1, + "instantiating cache chain from L%d %s cache for " + "%pOFP instead of an L1\n", cache->level, + cache_type_string(cache), cache->ofnode); + WARN_ONCE(!of_node_is_type(cache->ofnode, "cpu"), + "instantiating cache chain from node %pOFP of type '%s' " + "instead of a cpu node\n", cache->ofnode, + of_node_get_device_type(cache->ofnode)); } -static void do_subsidiary_caches(struct cache *cache) +/* + * If sub-groups of threads in a core containing @cpu_id share the + * L@level-cache (information obtained via "ibm,thread-groups" + * device-tree property), then we identify the group by the first + * thread-sibling in the group. We define this to be the group-id. + * + * In the absence of any thread-group information for L@level-cache, + * this function returns -1. + */ +static int get_group_id(unsigned int cpu_id, int level) +{ + if (has_big_cores && level == 1) + return cpumask_first(per_cpu(thread_group_l1_cache_map, + cpu_id)); + else if (thread_group_shares_l2 && level == 2) + return cpumask_first(per_cpu(thread_group_l2_cache_map, + cpu_id)); + else if (thread_group_shares_l3 && level == 3) + return cpumask_first(per_cpu(thread_group_l3_cache_map, + cpu_id)); + return -1; +} + +static void do_subsidiary_caches(struct cache *cache, unsigned int cpu_id) { struct device_node *subcache_node; int level = cache->level; @@ -435,9 +483,11 @@ static void do_subsidiary_caches(struct cache *cache) while ((subcache_node = of_find_next_cache_node(cache->ofnode))) { struct cache *subcache; + int group_id; level++; - subcache = cache_lookup_or_instantiate(subcache_node, level); + group_id = get_group_id(cpu_id, level); + subcache = cache_lookup_or_instantiate(subcache_node, group_id, level); of_node_put(subcache_node); if (!subcache) break; @@ -451,6 +501,7 @@ static struct cache *cache_chain_instantiate(unsigned int cpu_id) { struct device_node *cpu_node; struct cache *cpu_cache = NULL; + int group_id; pr_debug("creating cache object(s) for CPU %i\n", cpu_id); @@ -459,11 +510,13 @@ static struct cache *cache_chain_instantiate(unsigned int cpu_id) if (!cpu_node) goto out; - cpu_cache = cache_lookup_or_instantiate(cpu_node, 1); + group_id = get_group_id(cpu_id, 1); + + cpu_cache = cache_lookup_or_instantiate(cpu_node, group_id, 1); if (!cpu_cache) goto out; - do_subsidiary_caches(cpu_cache); + do_subsidiary_caches(cpu_cache, cpu_id); cache_cpu_set(cpu_cache, cpu_id); out: @@ -624,66 +677,49 @@ static ssize_t level_show(struct kobject *k, struct kobj_attribute *attr, char * static struct kobj_attribute cache_level_attr = __ATTR(level, 0444, level_show, NULL); -static unsigned int index_dir_to_cpu(struct cache_index_dir *index) -{ - struct kobject *index_dir_kobj = &index->kobj; - struct kobject *cache_dir_kobj = index_dir_kobj->parent; - struct kobject *cpu_dev_kobj = cache_dir_kobj->parent; - struct device *dev = kobj_to_dev(cpu_dev_kobj); - - return dev->id; -} - -/* - * On big-core systems, each core has two groups of CPUs each of which - * has its own L1-cache. The thread-siblings which share l1-cache with - * @cpu can be obtained via cpu_smallcore_mask(). - */ -static const struct cpumask *get_big_core_shared_cpu_map(int cpu, struct cache *cache) -{ - if (cache->level == 1) - return cpu_smallcore_mask(cpu); - - return &cache->shared_cpu_map; -} - -static ssize_t shared_cpu_map_show(struct kobject *k, struct kobj_attribute *attr, char *buf) +static ssize_t +show_shared_cpumap(struct kobject *k, struct kobj_attribute *attr, char *buf, bool list) { struct cache_index_dir *index; struct cache *cache; const struct cpumask *mask; - int ret, cpu; index = kobj_to_cache_index_dir(k); cache = index->cache; - if (has_big_cores) { - cpu = index_dir_to_cpu(index); - mask = get_big_core_shared_cpu_map(cpu, cache); - } else { - mask = &cache->shared_cpu_map; - } + mask = &cache->shared_cpu_map; - ret = scnprintf(buf, PAGE_SIZE - 1, "%*pb\n", - cpumask_pr_args(mask)); - buf[ret++] = '\n'; - buf[ret] = '\0'; - return ret; + return cpumap_print_to_pagebuf(list, buf, mask); +} + +static ssize_t shared_cpu_map_show(struct kobject *k, struct kobj_attribute *attr, char *buf) +{ + return show_shared_cpumap(k, attr, buf, false); +} + +static ssize_t shared_cpu_list_show(struct kobject *k, struct kobj_attribute *attr, char *buf) +{ + return show_shared_cpumap(k, attr, buf, true); } static struct kobj_attribute cache_shared_cpu_map_attr = __ATTR(shared_cpu_map, 0444, shared_cpu_map_show, NULL); +static struct kobj_attribute cache_shared_cpu_list_attr = + __ATTR(shared_cpu_list, 0444, shared_cpu_list_show, NULL); + /* Attributes which should always be created -- the kobject/sysfs core - * does this automatically via kobj_type->default_attrs. This is the + * does this automatically via kobj_type->default_groups. This is the * minimum data required to uniquely identify a cache. */ static struct attribute *cache_index_default_attrs[] = { &cache_type_attr.attr, &cache_level_attr.attr, &cache_shared_cpu_map_attr.attr, + &cache_shared_cpu_list_attr.attr, NULL, }; +ATTRIBUTE_GROUPS(cache_index_default); /* Attributes which should be created if the cache device node has the * right properties -- see cacheinfo_create_index_opt_attrs @@ -702,7 +738,7 @@ static const struct sysfs_ops cache_index_ops = { static struct kobj_type cache_index_type = { .release = cache_index_release, .sysfs_ops = &cache_index_ops, - .default_attrs = cache_index_default_attrs, + .default_groups = cache_index_default_groups, }; static void cacheinfo_create_index_opt_attrs(struct cache_index_dir *dir) @@ -733,13 +769,13 @@ static void cacheinfo_create_index_opt_attrs(struct cache_index_dir *dir) rc = attr->show(&dir->kobj, attr, buf); if (rc <= 0) { pr_debug("not creating %s attribute for " - "%pOF(%s) (rc = %zd)\n", + "%pOFP(%s) (rc = %zd)\n", attr->attr.name, cache->ofnode, cache_type, rc); continue; } if (sysfs_create_file(&dir->kobj, &attr->attr)) - pr_debug("could not create %s attribute for %pOF(%s)\n", + pr_debug("could not create %s attribute for %pOFP(%s)\n", attr->attr.name, cache->ofnode, cache_type); } @@ -810,13 +846,15 @@ static struct cache *cache_lookup_by_cpu(unsigned int cpu_id) { struct device_node *cpu_node; struct cache *cache; + int group_id; cpu_node = of_get_cpu_node(cpu_id, NULL); WARN_ONCE(!cpu_node, "no OF node found for CPU %i\n", cpu_id); if (!cpu_node) return NULL; - cache = cache_lookup_by_node(cpu_node); + group_id = get_group_id(cpu_id, 1); + cache = cache_lookup_by_node_group(cpu_node, group_id); of_node_put(cpu_node); return cache; @@ -855,7 +893,7 @@ static void cache_cpu_clear(struct cache *cache, int cpu) struct cache *next = cache->next_local; WARN_ONCE(!cpumask_test_cpu(cpu, &cache->shared_cpu_map), - "CPU %i not accounted in %pOF(%s)\n", + "CPU %i not accounted in %pOFP(%s)\n", cpu, cache->ofnode, cache_type_string(cache)); diff --git a/arch/powerpc/kernel/compat_audit.c b/arch/powerpc/kernel/compat_audit.c index 55c6ccda0a85..57b38c592b9f 100644 --- a/arch/powerpc/kernel/compat_audit.c +++ b/arch/powerpc/kernel/compat_audit.c @@ -1,7 +1,10 @@ // SPDX-License-Identifier: GPL-2.0 #undef __powerpc64__ +#include <linux/audit_arch.h> #include <asm/unistd.h> +#include "audit_32.h" + unsigned ppc32_dir_class[] = { #include <asm-generic/audit_dir_write.h> ~0U @@ -31,14 +34,16 @@ int ppc32_classify_syscall(unsigned syscall) { switch(syscall) { case __NR_open: - return 2; + return AUDITSC_OPEN; case __NR_openat: - return 3; + return AUDITSC_OPENAT; case __NR_socketcall: - return 4; + return AUDITSC_SOCKETCALL; case __NR_execve: - return 5; + return AUDITSC_EXECVE; + case __NR_openat2: + return AUDITSC_OPENAT2; default: - return 1; + return AUDITSC_COMPAT; } } diff --git a/arch/powerpc/kernel/cpu_setup_6xx.S b/arch/powerpc/kernel/cpu_setup_6xx.S index f6517f67265a..f29ce3dd6140 100644 --- a/arch/powerpc/kernel/cpu_setup_6xx.S +++ b/arch/powerpc/kernel/cpu_setup_6xx.S @@ -4,6 +4,8 @@ * Copyright (C) 2003 Benjamin Herrenschmidt (benh@kernel.crashing.org) */ +#include <linux/linkage.h> + #include <asm/processor.h> #include <asm/page.h> #include <asm/cputable.h> @@ -81,7 +83,7 @@ _GLOBAL(__setup_cpu_745x) blr /* Enable caches for 603's, 604, 750 & 7400 */ -setup_common_caches: +SYM_FUNC_START_LOCAL(setup_common_caches) mfspr r11,SPRN_HID0 andi. r0,r11,HID0_DCE ori r11,r11,HID0_ICE|HID0_DCE @@ -95,11 +97,12 @@ setup_common_caches: sync isync blr +SYM_FUNC_END(setup_common_caches) /* 604, 604e, 604ev, ... * Enable superscalar execution & branch history table */ -setup_604_hid0: +SYM_FUNC_START_LOCAL(setup_604_hid0) mfspr r11,SPRN_HID0 ori r11,r11,HID0_SIED|HID0_BHTE ori r8,r11,HID0_BTCD @@ -110,6 +113,7 @@ setup_604_hid0: sync isync blr +SYM_FUNC_END(setup_604_hid0) /* 7400 <= rev 2.7 and 7410 rev = 1.0 suffer from some * erratas we work around here. @@ -125,13 +129,14 @@ setup_604_hid0: * needed once we have applied workaround #5 (though it's * not set by Apple's firmware at least). */ -setup_7400_workarounds: +SYM_FUNC_START_LOCAL(setup_7400_workarounds) mfpvr r3 rlwinm r3,r3,0,20,31 cmpwi 0,r3,0x0207 ble 1f blr -setup_7410_workarounds: +SYM_FUNC_END(setup_7400_workarounds) +SYM_FUNC_START_LOCAL(setup_7410_workarounds) mfpvr r3 rlwinm r3,r3,0,20,31 cmpwi 0,r3,0x0100 @@ -151,6 +156,7 @@ setup_7410_workarounds: sync isync blr +SYM_FUNC_END(setup_7410_workarounds) /* 740/750/7400/7410 * Enable Store Gathering (SGE), Address Broadcast (ABE), @@ -158,7 +164,7 @@ setup_7410_workarounds: * Dynamic Power Management (DPM), Speculative (SPD) * Clear Instruction cache throttling (ICTC) */ -setup_750_7400_hid0: +SYM_FUNC_START_LOCAL(setup_750_7400_hid0) mfspr r11,SPRN_HID0 ori r11,r11,HID0_SGE | HID0_ABE | HID0_BHTE | HID0_BTIC oris r11,r11,HID0_DPM@h @@ -177,12 +183,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_NO_DPM) sync isync blr +SYM_FUNC_END(setup_750_7400_hid0) /* 750cx specific * Looks like we have to disable NAP feature for some PLL settings... * (waiting for confirmation) */ -setup_750cx: +SYM_FUNC_START_LOCAL(setup_750cx) mfspr r10, SPRN_HID1 rlwinm r10,r10,4,28,31 cmpwi cr0,r10,7 @@ -196,11 +203,13 @@ setup_750cx: andc r6,r6,r7 stw r6,CPU_SPEC_FEATURES(r4) blr +SYM_FUNC_END(setup_750cx) /* 750fx specific */ -setup_750fx: +SYM_FUNC_START_LOCAL(setup_750fx) blr +SYM_FUNC_END(setup_750fx) /* MPC 745x * Enable Store Gathering (SGE), Branch Folding (FOLD) @@ -212,7 +221,7 @@ setup_750fx: * Clear Instruction cache throttling (ICTC) * Enable L2 HW prefetch */ -setup_745x_specifics: +SYM_FUNC_START_LOCAL(setup_745x_specifics) /* We check for the presence of an L3 cache setup by * the firmware. If any, we disable NAP capability as * it's known to be bogus on rev 2.1 and earlier @@ -270,6 +279,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_NO_DPM) sync isync blr +SYM_FUNC_END(setup_745x_specifics) /* * Initialize the FPU registers. This is needed to work around an errata @@ -288,6 +298,7 @@ _GLOBAL(__init_fpu_registers) mtmsr r10 isync blr +_ASM_NOKPROBE_SYMBOL(__init_fpu_registers) /* Definitions for the table use to save CPU states */ @@ -483,4 +494,5 @@ _GLOBAL(__restore_cpu_setup) 1: mtcr r7 blr +_ASM_NOKPROBE_SYMBOL(__restore_cpu_setup) diff --git a/arch/powerpc/kernel/cpu_setup_fsl_booke.S b/arch/powerpc/kernel/cpu_setup_e500.S index 1d308780e0d3..077cfccc3461 100644 --- a/arch/powerpc/kernel/cpu_setup_fsl_booke.S +++ b/arch/powerpc/kernel/cpu_setup_e500.S @@ -8,11 +8,13 @@ * Benjamin Herrenschmidt <benh@kernel.crashing.org> */ +#include <linux/linkage.h> + #include <asm/page.h> #include <asm/processor.h> #include <asm/cputable.h> #include <asm/ppc_asm.h> -#include <asm/nohash/mmu-book3e.h> +#include <asm/nohash/mmu-e500.h> #include <asm/asm-offsets.h> #include <asm/mpc85xx.h> @@ -108,16 +110,7 @@ _GLOBAL(__setup_cpu_e6500) #endif /* CONFIG_PPC_E500MC */ #ifdef CONFIG_PPC32 -#ifdef CONFIG_E200 -_GLOBAL(__setup_cpu_e200) - /* enable dedicated debug exception handling resources (Debug APU) */ - mfspr r3,SPRN_HID0 - ori r3,r3,HID0_DAPUEN@l - mtspr SPRN_HID0,r3 - b __setup_e200_ivors -#endif /* CONFIG_E200 */ - -#ifdef CONFIG_E500 +#ifdef CONFIG_PPC_E500 #ifndef CONFIG_PPC_E500MC _GLOBAL(__setup_cpu_e500v1) _GLOBAL(__setup_cpu_e500v2) @@ -165,7 +158,7 @@ _GLOBAL(__setup_cpu_e5500) mtlr r5 blr #endif /* CONFIG_PPC_E500MC */ -#endif /* CONFIG_E500 */ +#endif /* CONFIG_PPC_E500 */ #endif /* CONFIG_PPC32 */ #ifdef CONFIG_PPC_BOOK3E_64 @@ -283,7 +276,7 @@ _GLOBAL(flush_dcache_L1) blr -has_L2_cache: +SYM_FUNC_START_LOCAL(has_L2_cache) /* skip L2 cache on P2040/P2040E as they have no L2 cache */ mfspr r3, SPRN_SVR /* shift right by 8 bits and clear E bit of SVR */ @@ -299,9 +292,10 @@ has_L2_cache: 1: li r3, 0 blr +SYM_FUNC_END(has_L2_cache) /* flush backside L2 cache */ -flush_backside_L2_cache: +SYM_FUNC_START_LOCAL(flush_backside_L2_cache) mflr r10 bl has_L2_cache mtlr r10 @@ -322,6 +316,7 @@ flush_backside_L2_cache: bne 1b 2: blr +SYM_FUNC_END(flush_backside_L2_cache) _GLOBAL(cpu_down_flush_e500v2) mflr r0 diff --git a/arch/powerpc/kernel/cpu_setup_power.S b/arch/powerpc/kernel/cpu_setup_power.S deleted file mode 100644 index a460298c7ddb..000000000000 --- a/arch/powerpc/kernel/cpu_setup_power.S +++ /dev/null @@ -1,219 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * This file contains low level CPU setup functions. - * Copyright (C) 2003 Benjamin Herrenschmidt (benh@kernel.crashing.org) - */ - -#include <asm/processor.h> -#include <asm/page.h> -#include <asm/cputable.h> -#include <asm/ppc_asm.h> -#include <asm/asm-offsets.h> -#include <asm/cache.h> -#include <asm/book3s/64/mmu-hash.h> - -/* Entry: r3 = crap, r4 = ptr to cputable entry - * - * Note that we can be called twice for pseudo-PVRs - */ -_GLOBAL(__setup_cpu_power7) - mflr r11 - bl __init_hvmode_206 - mtlr r11 - beqlr - li r0,0 - mtspr SPRN_LPID,r0 - LOAD_REG_IMMEDIATE(r0, PCR_MASK) - mtspr SPRN_PCR,r0 - mfspr r3,SPRN_LPCR - li r4,(LPCR_LPES1 >> LPCR_LPES_SH) - bl __init_LPCR_ISA206 - mtlr r11 - blr - -_GLOBAL(__restore_cpu_power7) - mflr r11 - mfmsr r3 - rldicl. r0,r3,4,63 - beqlr - li r0,0 - mtspr SPRN_LPID,r0 - LOAD_REG_IMMEDIATE(r0, PCR_MASK) - mtspr SPRN_PCR,r0 - mfspr r3,SPRN_LPCR - li r4,(LPCR_LPES1 >> LPCR_LPES_SH) - bl __init_LPCR_ISA206 - mtlr r11 - blr - -_GLOBAL(__setup_cpu_power8) - mflr r11 - bl __init_FSCR - bl __init_PMU - bl __init_PMU_ISA207 - bl __init_hvmode_206 - mtlr r11 - beqlr - li r0,0 - mtspr SPRN_LPID,r0 - LOAD_REG_IMMEDIATE(r0, PCR_MASK) - mtspr SPRN_PCR,r0 - mfspr r3,SPRN_LPCR - ori r3, r3, LPCR_PECEDH - li r4,0 /* LPES = 0 */ - bl __init_LPCR_ISA206 - bl __init_HFSCR - bl __init_PMU_HV - bl __init_PMU_HV_ISA207 - mtlr r11 - blr - -_GLOBAL(__restore_cpu_power8) - mflr r11 - bl __init_FSCR - bl __init_PMU - bl __init_PMU_ISA207 - mfmsr r3 - rldicl. r0,r3,4,63 - mtlr r11 - beqlr - li r0,0 - mtspr SPRN_LPID,r0 - LOAD_REG_IMMEDIATE(r0, PCR_MASK) - mtspr SPRN_PCR,r0 - mfspr r3,SPRN_LPCR - ori r3, r3, LPCR_PECEDH - li r4,0 /* LPES = 0 */ - bl __init_LPCR_ISA206 - bl __init_HFSCR - bl __init_PMU_HV - bl __init_PMU_HV_ISA207 - mtlr r11 - blr - -_GLOBAL(__setup_cpu_power9) - mflr r11 - bl __init_FSCR - bl __init_PMU - bl __init_hvmode_206 - mtlr r11 - beqlr - li r0,0 - mtspr SPRN_PSSCR,r0 - mtspr SPRN_LPID,r0 - mtspr SPRN_PID,r0 - LOAD_REG_IMMEDIATE(r0, PCR_MASK) - mtspr SPRN_PCR,r0 - mfspr r3,SPRN_LPCR - LOAD_REG_IMMEDIATE(r4, LPCR_PECEDH | LPCR_PECE_HVEE | LPCR_HVICE | LPCR_HEIC) - or r3, r3, r4 - LOAD_REG_IMMEDIATE(r4, LPCR_UPRT | LPCR_HR) - andc r3, r3, r4 - li r4,0 /* LPES = 0 */ - bl __init_LPCR_ISA300 - bl __init_HFSCR - bl __init_PMU_HV - mtlr r11 - blr - -_GLOBAL(__restore_cpu_power9) - mflr r11 - bl __init_FSCR - bl __init_PMU - mfmsr r3 - rldicl. r0,r3,4,63 - mtlr r11 - beqlr - li r0,0 - mtspr SPRN_PSSCR,r0 - mtspr SPRN_LPID,r0 - mtspr SPRN_PID,r0 - LOAD_REG_IMMEDIATE(r0, PCR_MASK) - mtspr SPRN_PCR,r0 - mfspr r3,SPRN_LPCR - LOAD_REG_IMMEDIATE(r4, LPCR_PECEDH | LPCR_PECE_HVEE | LPCR_HVICE | LPCR_HEIC) - or r3, r3, r4 - LOAD_REG_IMMEDIATE(r4, LPCR_UPRT | LPCR_HR) - andc r3, r3, r4 - li r4,0 /* LPES = 0 */ - bl __init_LPCR_ISA300 - bl __init_HFSCR - bl __init_PMU_HV - mtlr r11 - blr - -__init_hvmode_206: - /* Disable CPU_FTR_HVMODE and exit if MSR:HV is not set */ - mfmsr r3 - rldicl. r0,r3,4,63 - bnelr - ld r5,CPU_SPEC_FEATURES(r4) - LOAD_REG_IMMEDIATE(r6,CPU_FTR_HVMODE | CPU_FTR_P9_TM_HV_ASSIST) - andc r5,r5,r6 - std r5,CPU_SPEC_FEATURES(r4) - blr - -__init_LPCR_ISA206: - /* Setup a sane LPCR: - * Called with initial LPCR in R3 and desired LPES 2-bit value in R4 - * - * LPES = 0b01 (HSRR0/1 used for 0x500) - * PECE = 0b111 - * DPFD = 4 - * HDICE = 0 - * VC = 0b100 (VPM0=1, VPM1=0, ISL=0) - * VRMASD = 0b10000 (L=1, LP=00) - * - * Other bits untouched for now - */ - li r5,0x10 - rldimi r3,r5, LPCR_VRMASD_SH, 64-LPCR_VRMASD_SH-5 - - /* POWER9 has no VRMASD */ -__init_LPCR_ISA300: - rldimi r3,r4, LPCR_LPES_SH, 64-LPCR_LPES_SH-2 - ori r3,r3,(LPCR_PECE0|LPCR_PECE1|LPCR_PECE2) - li r5,4 - rldimi r3,r5, LPCR_DPFD_SH, 64-LPCR_DPFD_SH-3 - clrrdi r3,r3,1 /* clear HDICE */ - li r5,4 - rldimi r3,r5, LPCR_VC_SH, 0 - mtspr SPRN_LPCR,r3 - isync - blr - -__init_FSCR: - mfspr r3,SPRN_FSCR - ori r3,r3,FSCR_TAR|FSCR_DSCR|FSCR_EBB - mtspr SPRN_FSCR,r3 - blr - -__init_HFSCR: - mfspr r3,SPRN_HFSCR - ori r3,r3,HFSCR_TAR|HFSCR_TM|HFSCR_BHRB|HFSCR_PM|\ - HFSCR_DSCR|HFSCR_VECVSX|HFSCR_FP|HFSCR_EBB|HFSCR_MSGP - mtspr SPRN_HFSCR,r3 - blr - -__init_PMU_HV: - li r5,0 - mtspr SPRN_MMCRC,r5 - blr - -__init_PMU_HV_ISA207: - li r5,0 - mtspr SPRN_MMCRH,r5 - blr - -__init_PMU: - li r5,0 - mtspr SPRN_MMCRA,r5 - mtspr SPRN_MMCR0,r5 - mtspr SPRN_MMCR1,r5 - mtspr SPRN_MMCR2,r5 - blr - -__init_PMU_ISA207: - li r5,0 - mtspr SPRN_MMCRS,r5 - blr diff --git a/arch/powerpc/kernel/cpu_setup_power.c b/arch/powerpc/kernel/cpu_setup_power.c new file mode 100644 index 000000000000..98bd4e6c1770 --- /dev/null +++ b/arch/powerpc/kernel/cpu_setup_power.c @@ -0,0 +1,288 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright 2020, Jordan Niethe, IBM Corporation. + * + * This file contains low level CPU setup functions. + * Originally written in assembly by Benjamin Herrenschmidt & various other + * authors. + */ + +#include <asm/reg.h> +#include <asm/synch.h> +#include <linux/bitops.h> +#include <asm/cputable.h> +#include <asm/cpu_setup.h> + +/* Disable CPU_FTR_HVMODE and return false if MSR:HV is not set */ +static bool init_hvmode_206(struct cpu_spec *t) +{ + u64 msr; + + msr = mfmsr(); + if (msr & MSR_HV) + return true; + + t->cpu_features &= ~(CPU_FTR_HVMODE | CPU_FTR_P9_TM_HV_ASSIST); + return false; +} + +static void init_LPCR_ISA300(u64 lpcr, u64 lpes) +{ + /* POWER9 has no VRMASD */ + lpcr |= (lpes << LPCR_LPES_SH) & LPCR_LPES; + lpcr |= LPCR_PECE0|LPCR_PECE1|LPCR_PECE2; + lpcr |= (4ull << LPCR_DPFD_SH) & LPCR_DPFD; + lpcr &= ~LPCR_HDICE; /* clear HDICE */ + lpcr |= (4ull << LPCR_VC_SH); + mtspr(SPRN_LPCR, lpcr); + isync(); +} + +/* + * Setup a sane LPCR: + * Called with initial LPCR and desired LPES 2-bit value + * + * LPES = 0b01 (HSRR0/1 used for 0x500) + * PECE = 0b111 + * DPFD = 4 + * HDICE = 0 + * VC = 0b100 (VPM0=1, VPM1=0, ISL=0) + * VRMASD = 0b10000 (L=1, LP=00) + * + * Other bits untouched for now + */ +static void init_LPCR_ISA206(u64 lpcr, u64 lpes) +{ + lpcr |= (0x10ull << LPCR_VRMASD_SH) & LPCR_VRMASD; + init_LPCR_ISA300(lpcr, lpes); +} + +static void init_FSCR(void) +{ + u64 fscr; + + fscr = mfspr(SPRN_FSCR); + fscr |= FSCR_TAR|FSCR_EBB; + mtspr(SPRN_FSCR, fscr); +} + +static void init_FSCR_power9(void) +{ + u64 fscr; + + fscr = mfspr(SPRN_FSCR); + fscr |= FSCR_SCV; + mtspr(SPRN_FSCR, fscr); + init_FSCR(); +} + +static void init_FSCR_power10(void) +{ + u64 fscr; + + fscr = mfspr(SPRN_FSCR); + fscr |= FSCR_PREFIX; + mtspr(SPRN_FSCR, fscr); + init_FSCR_power9(); +} + +static void init_HFSCR(void) +{ + u64 hfscr; + + hfscr = mfspr(SPRN_HFSCR); + hfscr |= HFSCR_TAR|HFSCR_TM|HFSCR_BHRB|HFSCR_PM|HFSCR_DSCR|\ + HFSCR_VECVSX|HFSCR_FP|HFSCR_EBB|HFSCR_MSGP; + mtspr(SPRN_HFSCR, hfscr); +} + +static void init_PMU_HV(void) +{ + mtspr(SPRN_MMCRC, 0); +} + +static void init_PMU_HV_ISA207(void) +{ + mtspr(SPRN_MMCRH, 0); +} + +static void init_PMU(void) +{ + mtspr(SPRN_MMCRA, 0); + mtspr(SPRN_MMCR0, MMCR0_FC); + mtspr(SPRN_MMCR1, 0); + mtspr(SPRN_MMCR2, 0); +} + +static void init_PMU_ISA207(void) +{ + mtspr(SPRN_MMCRS, 0); +} + +static void init_PMU_ISA31(void) +{ + mtspr(SPRN_MMCR3, 0); + mtspr(SPRN_MMCRA, MMCRA_BHRB_DISABLE); + mtspr(SPRN_MMCR0, MMCR0_FC | MMCR0_PMCCEXT); +} + +static void init_DEXCR(void) +{ + mtspr(SPRN_DEXCR, DEXCR_INIT); + mtspr(SPRN_HASHKEYR, 0); +} + +/* + * Note that we can be called twice of pseudo-PVRs. + * The parameter offset is not used. + */ + +void __setup_cpu_power7(unsigned long offset, struct cpu_spec *t) +{ + if (!init_hvmode_206(t)) + return; + + mtspr(SPRN_LPID, 0); + mtspr(SPRN_AMOR, ~0); + mtspr(SPRN_PCR, PCR_MASK); + init_LPCR_ISA206(mfspr(SPRN_LPCR), LPCR_LPES1 >> LPCR_LPES_SH); +} + +void __restore_cpu_power7(void) +{ + u64 msr; + + msr = mfmsr(); + if (!(msr & MSR_HV)) + return; + + mtspr(SPRN_LPID, 0); + mtspr(SPRN_AMOR, ~0); + mtspr(SPRN_PCR, PCR_MASK); + init_LPCR_ISA206(mfspr(SPRN_LPCR), LPCR_LPES1 >> LPCR_LPES_SH); +} + +void __setup_cpu_power8(unsigned long offset, struct cpu_spec *t) +{ + init_FSCR(); + init_PMU(); + init_PMU_ISA207(); + + if (!init_hvmode_206(t)) + return; + + mtspr(SPRN_LPID, 0); + mtspr(SPRN_AMOR, ~0); + mtspr(SPRN_PCR, PCR_MASK); + init_LPCR_ISA206(mfspr(SPRN_LPCR) | LPCR_PECEDH, 0); /* LPES = 0 */ + init_HFSCR(); + init_PMU_HV(); + init_PMU_HV_ISA207(); +} + +void __restore_cpu_power8(void) +{ + u64 msr; + + init_FSCR(); + init_PMU(); + init_PMU_ISA207(); + + msr = mfmsr(); + if (!(msr & MSR_HV)) + return; + + mtspr(SPRN_LPID, 0); + mtspr(SPRN_AMOR, ~0); + mtspr(SPRN_PCR, PCR_MASK); + init_LPCR_ISA206(mfspr(SPRN_LPCR) | LPCR_PECEDH, 0); /* LPES = 0 */ + init_HFSCR(); + init_PMU_HV(); + init_PMU_HV_ISA207(); +} + +void __setup_cpu_power9(unsigned long offset, struct cpu_spec *t) +{ + init_FSCR_power9(); + init_PMU(); + + if (!init_hvmode_206(t)) + return; + + mtspr(SPRN_PSSCR, 0); + mtspr(SPRN_LPID, 0); + mtspr(SPRN_PID, 0); + mtspr(SPRN_AMOR, ~0); + mtspr(SPRN_PCR, PCR_MASK); + init_LPCR_ISA300((mfspr(SPRN_LPCR) | LPCR_PECEDH | LPCR_PECE_HVEE |\ + LPCR_HVICE | LPCR_HEIC) & ~(LPCR_UPRT | LPCR_HR), 0); + init_HFSCR(); + init_PMU_HV(); +} + +void __restore_cpu_power9(void) +{ + u64 msr; + + init_FSCR_power9(); + init_PMU(); + + msr = mfmsr(); + if (!(msr & MSR_HV)) + return; + + mtspr(SPRN_PSSCR, 0); + mtspr(SPRN_LPID, 0); + mtspr(SPRN_PID, 0); + mtspr(SPRN_AMOR, ~0); + mtspr(SPRN_PCR, PCR_MASK); + init_LPCR_ISA300((mfspr(SPRN_LPCR) | LPCR_PECEDH | LPCR_PECE_HVEE |\ + LPCR_HVICE | LPCR_HEIC) & ~(LPCR_UPRT | LPCR_HR), 0); + init_HFSCR(); + init_PMU_HV(); +} + +void __setup_cpu_power10(unsigned long offset, struct cpu_spec *t) +{ + init_FSCR_power10(); + init_PMU(); + init_PMU_ISA31(); + init_DEXCR(); + + if (!init_hvmode_206(t)) + return; + + mtspr(SPRN_PSSCR, 0); + mtspr(SPRN_LPID, 0); + mtspr(SPRN_PID, 0); + mtspr(SPRN_AMOR, ~0); + mtspr(SPRN_PCR, PCR_MASK); + init_LPCR_ISA300((mfspr(SPRN_LPCR) | LPCR_PECEDH | LPCR_PECE_HVEE |\ + LPCR_HVICE | LPCR_HEIC) & ~(LPCR_UPRT | LPCR_HR), 0); + init_HFSCR(); + init_PMU_HV(); +} + +void __restore_cpu_power10(void) +{ + u64 msr; + + init_FSCR_power10(); + init_PMU(); + init_PMU_ISA31(); + init_DEXCR(); + + msr = mfmsr(); + if (!(msr & MSR_HV)) + return; + + mtspr(SPRN_PSSCR, 0); + mtspr(SPRN_LPID, 0); + mtspr(SPRN_PID, 0); + mtspr(SPRN_AMOR, ~0); + mtspr(SPRN_PCR, PCR_MASK); + init_LPCR_ISA300((mfspr(SPRN_LPCR) | LPCR_PECEDH | LPCR_PECE_HVEE |\ + LPCR_HVICE | LPCR_HEIC) & ~(LPCR_UPRT | LPCR_HR), 0); + init_HFSCR(); + init_PMU_HV(); +} diff --git a/arch/powerpc/kernel/cpu_specs.h b/arch/powerpc/kernel/cpu_specs.h new file mode 100644 index 000000000000..85ded3f77204 --- /dev/null +++ b/arch/powerpc/kernel/cpu_specs.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifdef CONFIG_40x +#include "cpu_specs_40x.h" +#endif + +#ifdef CONFIG_PPC_47x +#include "cpu_specs_47x.h" +#elif defined(CONFIG_44x) +#include "cpu_specs_44x.h" +#endif + +#ifdef CONFIG_PPC_8xx +#include "cpu_specs_8xx.h" +#endif + +#ifdef CONFIG_PPC_E500MC +#include "cpu_specs_e500mc.h" +#elif defined(CONFIG_PPC_85xx) +#include "cpu_specs_85xx.h" +#endif + +#ifdef CONFIG_PPC_BOOK3S_32 +#include "cpu_specs_book3s_32.h" +#endif + +#ifdef CONFIG_PPC_BOOK3S_64 +#include "cpu_specs_book3s_64.h" +#endif diff --git a/arch/powerpc/kernel/cpu_specs_40x.h b/arch/powerpc/kernel/cpu_specs_40x.h new file mode 100644 index 000000000000..a1362a75b8c8 --- /dev/null +++ b/arch/powerpc/kernel/cpu_specs_40x.h @@ -0,0 +1,280 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2001 Ben. Herrenschmidt (benh@kernel.crashing.org) + */ + +static struct cpu_spec cpu_specs[] __initdata = { + { /* STB 04xxx */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x41810000, + .cpu_name = "STB04xxx", + .cpu_features = CPU_FTRS_40X, + .cpu_user_features = PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU | + PPC_FEATURE_HAS_4xxMAC, + .mmu_features = MMU_FTR_TYPE_40x, + .icache_bsize = 32, + .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc405", + }, + { /* NP405L */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x41610000, + .cpu_name = "NP405L", + .cpu_features = CPU_FTRS_40X, + .cpu_user_features = PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU | + PPC_FEATURE_HAS_4xxMAC, + .mmu_features = MMU_FTR_TYPE_40x, + .icache_bsize = 32, + .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc405", + }, + { /* NP4GS3 */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x40B10000, + .cpu_name = "NP4GS3", + .cpu_features = CPU_FTRS_40X, + .cpu_user_features = PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU | + PPC_FEATURE_HAS_4xxMAC, + .mmu_features = MMU_FTR_TYPE_40x, + .icache_bsize = 32, + .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc405", + }, + { /* NP405H */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x41410000, + .cpu_name = "NP405H", + .cpu_features = CPU_FTRS_40X, + .cpu_user_features = PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU | + PPC_FEATURE_HAS_4xxMAC, + .mmu_features = MMU_FTR_TYPE_40x, + .icache_bsize = 32, + .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc405", + }, + { /* 405GPr */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x50910000, + .cpu_name = "405GPr", + .cpu_features = CPU_FTRS_40X, + .cpu_user_features = PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU | + PPC_FEATURE_HAS_4xxMAC, + .mmu_features = MMU_FTR_TYPE_40x, + .icache_bsize = 32, + .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc405", + }, + { /* STBx25xx */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x51510000, + .cpu_name = "STBx25xx", + .cpu_features = CPU_FTRS_40X, + .cpu_user_features = PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU | + PPC_FEATURE_HAS_4xxMAC, + .mmu_features = MMU_FTR_TYPE_40x, + .icache_bsize = 32, + .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc405", + }, + { /* 405LP */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x41F10000, + .cpu_name = "405LP", + .cpu_features = CPU_FTRS_40X, + .cpu_user_features = PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU, + .mmu_features = MMU_FTR_TYPE_40x, + .icache_bsize = 32, + .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc405", + }, + { /* 405EP */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x51210000, + .cpu_name = "405EP", + .cpu_features = CPU_FTRS_40X, + .cpu_user_features = PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU | + PPC_FEATURE_HAS_4xxMAC, + .mmu_features = MMU_FTR_TYPE_40x, + .icache_bsize = 32, + .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc405", + }, + { /* 405EX Rev. A/B with Security */ + .pvr_mask = 0xffff000f, + .pvr_value = 0x12910007, + .cpu_name = "405EX Rev. A/B", + .cpu_features = CPU_FTRS_40X, + .cpu_user_features = PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU | + PPC_FEATURE_HAS_4xxMAC, + .mmu_features = MMU_FTR_TYPE_40x, + .icache_bsize = 32, + .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc405", + }, + { /* 405EX Rev. C without Security */ + .pvr_mask = 0xffff000f, + .pvr_value = 0x1291000d, + .cpu_name = "405EX Rev. C", + .cpu_features = CPU_FTRS_40X, + .cpu_user_features = PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU | + PPC_FEATURE_HAS_4xxMAC, + .mmu_features = MMU_FTR_TYPE_40x, + .icache_bsize = 32, + .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc405", + }, + { /* 405EX Rev. C with Security */ + .pvr_mask = 0xffff000f, + .pvr_value = 0x1291000f, + .cpu_name = "405EX Rev. C", + .cpu_features = CPU_FTRS_40X, + .cpu_user_features = PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU | + PPC_FEATURE_HAS_4xxMAC, + .mmu_features = MMU_FTR_TYPE_40x, + .icache_bsize = 32, + .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc405", + }, + { /* 405EX Rev. D without Security */ + .pvr_mask = 0xffff000f, + .pvr_value = 0x12910003, + .cpu_name = "405EX Rev. D", + .cpu_features = CPU_FTRS_40X, + .cpu_user_features = PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU | + PPC_FEATURE_HAS_4xxMAC, + .mmu_features = MMU_FTR_TYPE_40x, + .icache_bsize = 32, + .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc405", + }, + { /* 405EX Rev. D with Security */ + .pvr_mask = 0xffff000f, + .pvr_value = 0x12910005, + .cpu_name = "405EX Rev. D", + .cpu_features = CPU_FTRS_40X, + .cpu_user_features = PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU | + PPC_FEATURE_HAS_4xxMAC, + .mmu_features = MMU_FTR_TYPE_40x, + .icache_bsize = 32, + .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc405", + }, + { /* 405EXr Rev. A/B without Security */ + .pvr_mask = 0xffff000f, + .pvr_value = 0x12910001, + .cpu_name = "405EXr Rev. A/B", + .cpu_features = CPU_FTRS_40X, + .cpu_user_features = PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU | + PPC_FEATURE_HAS_4xxMAC, + .mmu_features = MMU_FTR_TYPE_40x, + .icache_bsize = 32, + .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc405", + }, + { /* 405EXr Rev. C without Security */ + .pvr_mask = 0xffff000f, + .pvr_value = 0x12910009, + .cpu_name = "405EXr Rev. C", + .cpu_features = CPU_FTRS_40X, + .cpu_user_features = PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU | + PPC_FEATURE_HAS_4xxMAC, + .mmu_features = MMU_FTR_TYPE_40x, + .icache_bsize = 32, + .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc405", + }, + { /* 405EXr Rev. C with Security */ + .pvr_mask = 0xffff000f, + .pvr_value = 0x1291000b, + .cpu_name = "405EXr Rev. C", + .cpu_features = CPU_FTRS_40X, + .cpu_user_features = PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU | + PPC_FEATURE_HAS_4xxMAC, + .mmu_features = MMU_FTR_TYPE_40x, + .icache_bsize = 32, + .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc405", + }, + { /* 405EXr Rev. D without Security */ + .pvr_mask = 0xffff000f, + .pvr_value = 0x12910000, + .cpu_name = "405EXr Rev. D", + .cpu_features = CPU_FTRS_40X, + .cpu_user_features = PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU | + PPC_FEATURE_HAS_4xxMAC, + .mmu_features = MMU_FTR_TYPE_40x, + .icache_bsize = 32, + .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc405", + }, + { /* 405EXr Rev. D with Security */ + .pvr_mask = 0xffff000f, + .pvr_value = 0x12910002, + .cpu_name = "405EXr Rev. D", + .cpu_features = CPU_FTRS_40X, + .cpu_user_features = PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU | + PPC_FEATURE_HAS_4xxMAC, + .mmu_features = MMU_FTR_TYPE_40x, + .icache_bsize = 32, + .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc405", + }, + { + /* 405EZ */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x41510000, + .cpu_name = "405EZ", + .cpu_features = CPU_FTRS_40X, + .cpu_user_features = PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU | + PPC_FEATURE_HAS_4xxMAC, + .mmu_features = MMU_FTR_TYPE_40x, + .icache_bsize = 32, + .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc405", + }, + { /* APM8018X */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x7ff11432, + .cpu_name = "APM8018X", + .cpu_features = CPU_FTRS_40X, + .cpu_user_features = PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU | + PPC_FEATURE_HAS_4xxMAC, + .mmu_features = MMU_FTR_TYPE_40x, + .icache_bsize = 32, + .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc405", + }, + { /* default match */ + .pvr_mask = 0x00000000, + .pvr_value = 0x00000000, + .cpu_name = "(generic 40x PPC)", + .cpu_features = CPU_FTRS_40X, + .cpu_user_features = PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU | + PPC_FEATURE_HAS_4xxMAC, + .mmu_features = MMU_FTR_TYPE_40x, + .icache_bsize = 32, + .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc405", + } +}; diff --git a/arch/powerpc/kernel/cpu_specs_44x.h b/arch/powerpc/kernel/cpu_specs_44x.h new file mode 100644 index 000000000000..69c4cdc0cdee --- /dev/null +++ b/arch/powerpc/kernel/cpu_specs_44x.h @@ -0,0 +1,304 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2001 Ben. Herrenschmidt (benh@kernel.crashing.org) + */ + +#define COMMON_USER_BOOKE (PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU | \ + PPC_FEATURE_BOOKE) + +static struct cpu_spec cpu_specs[] __initdata = { + { + .pvr_mask = 0xf0000fff, + .pvr_value = 0x40000850, + .cpu_name = "440GR Rev. A", + .cpu_features = CPU_FTRS_44X, + .cpu_user_features = COMMON_USER_BOOKE, + .mmu_features = MMU_FTR_TYPE_44x, + .icache_bsize = 32, + .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc440", + }, + { /* Use logical PVR for 440EP (logical pvr = pvr | 0x8) */ + .pvr_mask = 0xf0000fff, + .pvr_value = 0x40000858, + .cpu_name = "440EP Rev. A", + .cpu_features = CPU_FTRS_44X, + .cpu_user_features = COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU, + .mmu_features = MMU_FTR_TYPE_44x, + .icache_bsize = 32, + .dcache_bsize = 32, + .cpu_setup = __setup_cpu_440ep, + .machine_check = machine_check_4xx, + .platform = "ppc440", + }, + { + .pvr_mask = 0xf0000fff, + .pvr_value = 0x400008d3, + .cpu_name = "440GR Rev. B", + .cpu_features = CPU_FTRS_44X, + .cpu_user_features = COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU, + .mmu_features = MMU_FTR_TYPE_44x, + .icache_bsize = 32, + .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc440", + }, + { /* Matches both physical and logical PVR for 440EP (logical pvr = pvr | 0x8) */ + .pvr_mask = 0xf0000ff7, + .pvr_value = 0x400008d4, + .cpu_name = "440EP Rev. C", + .cpu_features = CPU_FTRS_44X, + .cpu_user_features = COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU, + .mmu_features = MMU_FTR_TYPE_44x, + .icache_bsize = 32, + .dcache_bsize = 32, + .cpu_setup = __setup_cpu_440ep, + .machine_check = machine_check_4xx, + .platform = "ppc440", + }, + { /* Use logical PVR for 440EP (logical pvr = pvr | 0x8) */ + .pvr_mask = 0xf0000fff, + .pvr_value = 0x400008db, + .cpu_name = "440EP Rev. B", + .cpu_features = CPU_FTRS_44X, + .cpu_user_features = COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU, + .mmu_features = MMU_FTR_TYPE_44x, + .icache_bsize = 32, + .dcache_bsize = 32, + .cpu_setup = __setup_cpu_440ep, + .machine_check = machine_check_4xx, + .platform = "ppc440", + }, + { /* 440GRX */ + .pvr_mask = 0xf0000ffb, + .pvr_value = 0x200008D0, + .cpu_name = "440GRX", + .cpu_features = CPU_FTRS_44X, + .cpu_user_features = COMMON_USER_BOOKE, + .mmu_features = MMU_FTR_TYPE_44x, + .icache_bsize = 32, + .dcache_bsize = 32, + .cpu_setup = __setup_cpu_440grx, + .machine_check = machine_check_440A, + .platform = "ppc440", + }, + { /* Use logical PVR for 440EPx (logical pvr = pvr | 0x8) */ + .pvr_mask = 0xf0000ffb, + .pvr_value = 0x200008D8, + .cpu_name = "440EPX", + .cpu_features = CPU_FTRS_44X, + .cpu_user_features = COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU, + .mmu_features = MMU_FTR_TYPE_44x, + .icache_bsize = 32, + .dcache_bsize = 32, + .cpu_setup = __setup_cpu_440epx, + .machine_check = machine_check_440A, + .platform = "ppc440", + }, + { /* 440GP Rev. B */ + .pvr_mask = 0xf0000fff, + .pvr_value = 0x40000440, + .cpu_name = "440GP Rev. B", + .cpu_features = CPU_FTRS_44X, + .cpu_user_features = COMMON_USER_BOOKE, + .mmu_features = MMU_FTR_TYPE_44x, + .icache_bsize = 32, + .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc440gp", + }, + { /* 440GP Rev. C */ + .pvr_mask = 0xf0000fff, + .pvr_value = 0x40000481, + .cpu_name = "440GP Rev. C", + .cpu_features = CPU_FTRS_44X, + .cpu_user_features = COMMON_USER_BOOKE, + .mmu_features = MMU_FTR_TYPE_44x, + .icache_bsize = 32, + .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc440gp", + }, + { /* 440GX Rev. A */ + .pvr_mask = 0xf0000fff, + .pvr_value = 0x50000850, + .cpu_name = "440GX Rev. A", + .cpu_features = CPU_FTRS_44X, + .cpu_user_features = COMMON_USER_BOOKE, + .mmu_features = MMU_FTR_TYPE_44x, + .icache_bsize = 32, + .dcache_bsize = 32, + .cpu_setup = __setup_cpu_440gx, + .machine_check = machine_check_440A, + .platform = "ppc440", + }, + { /* 440GX Rev. B */ + .pvr_mask = 0xf0000fff, + .pvr_value = 0x50000851, + .cpu_name = "440GX Rev. B", + .cpu_features = CPU_FTRS_44X, + .cpu_user_features = COMMON_USER_BOOKE, + .mmu_features = MMU_FTR_TYPE_44x, + .icache_bsize = 32, + .dcache_bsize = 32, + .cpu_setup = __setup_cpu_440gx, + .machine_check = machine_check_440A, + .platform = "ppc440", + }, + { /* 440GX Rev. C */ + .pvr_mask = 0xf0000fff, + .pvr_value = 0x50000892, + .cpu_name = "440GX Rev. C", + .cpu_features = CPU_FTRS_44X, + .cpu_user_features = COMMON_USER_BOOKE, + .mmu_features = MMU_FTR_TYPE_44x, + .icache_bsize = 32, + .dcache_bsize = 32, + .cpu_setup = __setup_cpu_440gx, + .machine_check = machine_check_440A, + .platform = "ppc440", + }, + { /* 440GX Rev. F */ + .pvr_mask = 0xf0000fff, + .pvr_value = 0x50000894, + .cpu_name = "440GX Rev. F", + .cpu_features = CPU_FTRS_44X, + .cpu_user_features = COMMON_USER_BOOKE, + .mmu_features = MMU_FTR_TYPE_44x, + .icache_bsize = 32, + .dcache_bsize = 32, + .cpu_setup = __setup_cpu_440gx, + .machine_check = machine_check_440A, + .platform = "ppc440", + }, + { /* 440SP Rev. A */ + .pvr_mask = 0xfff00fff, + .pvr_value = 0x53200891, + .cpu_name = "440SP Rev. A", + .cpu_features = CPU_FTRS_44X, + .cpu_user_features = COMMON_USER_BOOKE, + .mmu_features = MMU_FTR_TYPE_44x, + .icache_bsize = 32, + .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc440", + }, + { /* 440SPe Rev. A */ + .pvr_mask = 0xfff00fff, + .pvr_value = 0x53400890, + .cpu_name = "440SPe Rev. A", + .cpu_features = CPU_FTRS_44X, + .cpu_user_features = COMMON_USER_BOOKE, + .mmu_features = MMU_FTR_TYPE_44x, + .icache_bsize = 32, + .dcache_bsize = 32, + .cpu_setup = __setup_cpu_440spe, + .machine_check = machine_check_440A, + .platform = "ppc440", + }, + { /* 440SPe Rev. B */ + .pvr_mask = 0xfff00fff, + .pvr_value = 0x53400891, + .cpu_name = "440SPe Rev. B", + .cpu_features = CPU_FTRS_44X, + .cpu_user_features = COMMON_USER_BOOKE, + .mmu_features = MMU_FTR_TYPE_44x, + .icache_bsize = 32, + .dcache_bsize = 32, + .cpu_setup = __setup_cpu_440spe, + .machine_check = machine_check_440A, + .platform = "ppc440", + }, + { /* 460EX */ + .pvr_mask = 0xffff0006, + .pvr_value = 0x13020002, + .cpu_name = "460EX", + .cpu_features = CPU_FTRS_440x6, + .cpu_user_features = COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU, + .mmu_features = MMU_FTR_TYPE_44x, + .icache_bsize = 32, + .dcache_bsize = 32, + .cpu_setup = __setup_cpu_460ex, + .machine_check = machine_check_440A, + .platform = "ppc440", + }, + { /* 460EX Rev B */ + .pvr_mask = 0xffff0007, + .pvr_value = 0x13020004, + .cpu_name = "460EX Rev. B", + .cpu_features = CPU_FTRS_440x6, + .cpu_user_features = COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU, + .mmu_features = MMU_FTR_TYPE_44x, + .icache_bsize = 32, + .dcache_bsize = 32, + .cpu_setup = __setup_cpu_460ex, + .machine_check = machine_check_440A, + .platform = "ppc440", + }, + { /* 460GT */ + .pvr_mask = 0xffff0006, + .pvr_value = 0x13020000, + .cpu_name = "460GT", + .cpu_features = CPU_FTRS_440x6, + .cpu_user_features = COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU, + .mmu_features = MMU_FTR_TYPE_44x, + .icache_bsize = 32, + .dcache_bsize = 32, + .cpu_setup = __setup_cpu_460gt, + .machine_check = machine_check_440A, + .platform = "ppc440", + }, + { /* 460GT Rev B */ + .pvr_mask = 0xffff0007, + .pvr_value = 0x13020005, + .cpu_name = "460GT Rev. B", + .cpu_features = CPU_FTRS_440x6, + .cpu_user_features = COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU, + .mmu_features = MMU_FTR_TYPE_44x, + .icache_bsize = 32, + .dcache_bsize = 32, + .cpu_setup = __setup_cpu_460gt, + .machine_check = machine_check_440A, + .platform = "ppc440", + }, + { /* 460SX */ + .pvr_mask = 0xffffff00, + .pvr_value = 0x13541800, + .cpu_name = "460SX", + .cpu_features = CPU_FTRS_44X, + .cpu_user_features = COMMON_USER_BOOKE, + .mmu_features = MMU_FTR_TYPE_44x, + .icache_bsize = 32, + .dcache_bsize = 32, + .cpu_setup = __setup_cpu_460sx, + .machine_check = machine_check_440A, + .platform = "ppc440", + }, + { /* 464 in APM821xx */ + .pvr_mask = 0xfffffff0, + .pvr_value = 0x12C41C80, + .cpu_name = "APM821XX", + .cpu_features = CPU_FTRS_44X, + .cpu_user_features = COMMON_USER_BOOKE | + PPC_FEATURE_HAS_FPU, + .mmu_features = MMU_FTR_TYPE_44x, + .icache_bsize = 32, + .dcache_bsize = 32, + .cpu_setup = __setup_cpu_apm821xx, + .machine_check = machine_check_440A, + .platform = "ppc440", + }, + { /* default match */ + .pvr_mask = 0x00000000, + .pvr_value = 0x00000000, + .cpu_name = "(generic 44x PPC)", + .cpu_features = CPU_FTRS_44X, + .cpu_user_features = COMMON_USER_BOOKE, + .mmu_features = MMU_FTR_TYPE_44x, + .icache_bsize = 32, + .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc440", + } +}; diff --git a/arch/powerpc/kernel/cpu_specs_47x.h b/arch/powerpc/kernel/cpu_specs_47x.h new file mode 100644 index 000000000000..3143cd504a51 --- /dev/null +++ b/arch/powerpc/kernel/cpu_specs_47x.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2001 Ben. Herrenschmidt (benh@kernel.crashing.org) + */ + +#define COMMON_USER_BOOKE (PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU | \ + PPC_FEATURE_BOOKE) + +static struct cpu_spec cpu_specs[] __initdata = { + { /* 476 DD2 core */ + .pvr_mask = 0xffffffff, + .pvr_value = 0x11a52080, + .cpu_name = "476", + .cpu_features = CPU_FTRS_47X | CPU_FTR_476_DD2, + .cpu_user_features = COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU, + .mmu_features = MMU_FTR_TYPE_47x | MMU_FTR_USE_TLBIVAX_BCAST | + MMU_FTR_LOCK_BCAST_INVAL, + .icache_bsize = 32, + .dcache_bsize = 128, + .machine_check = machine_check_47x, + .platform = "ppc470", + }, + { /* 476fpe */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x7ff50000, + .cpu_name = "476fpe", + .cpu_features = CPU_FTRS_47X | CPU_FTR_476_DD2, + .cpu_user_features = COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU, + .mmu_features = MMU_FTR_TYPE_47x | MMU_FTR_USE_TLBIVAX_BCAST | + MMU_FTR_LOCK_BCAST_INVAL, + .icache_bsize = 32, + .dcache_bsize = 128, + .machine_check = machine_check_47x, + .platform = "ppc470", + }, + { /* 476 iss */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x00050000, + .cpu_name = "476", + .cpu_features = CPU_FTRS_47X, + .cpu_user_features = COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU, + .mmu_features = MMU_FTR_TYPE_47x | MMU_FTR_USE_TLBIVAX_BCAST | + MMU_FTR_LOCK_BCAST_INVAL, + .icache_bsize = 32, + .dcache_bsize = 128, + .machine_check = machine_check_47x, + .platform = "ppc470", + }, + { /* 476 others */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x11a50000, + .cpu_name = "476", + .cpu_features = CPU_FTRS_47X, + .cpu_user_features = COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU, + .mmu_features = MMU_FTR_TYPE_47x | MMU_FTR_USE_TLBIVAX_BCAST | + MMU_FTR_LOCK_BCAST_INVAL, + .icache_bsize = 32, + .dcache_bsize = 128, + .machine_check = machine_check_47x, + .platform = "ppc470", + }, + { /* default match */ + .pvr_mask = 0x00000000, + .pvr_value = 0x00000000, + .cpu_name = "(generic 47x PPC)", + .cpu_features = CPU_FTRS_47X, + .cpu_user_features = COMMON_USER_BOOKE, + .mmu_features = MMU_FTR_TYPE_47x, + .icache_bsize = 32, + .dcache_bsize = 128, + .machine_check = machine_check_47x, + .platform = "ppc470", + } +}; diff --git a/arch/powerpc/kernel/cpu_specs_85xx.h b/arch/powerpc/kernel/cpu_specs_85xx.h new file mode 100644 index 000000000000..aaae202c1a89 --- /dev/null +++ b/arch/powerpc/kernel/cpu_specs_85xx.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2001 Ben. Herrenschmidt (benh@kernel.crashing.org) + */ + +#define COMMON_USER_BOOKE (PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU | \ + PPC_FEATURE_BOOKE) + +static struct cpu_spec cpu_specs[] __initdata = { + { /* e500 */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x80200000, + .cpu_name = "e500", + .cpu_features = CPU_FTRS_E500, + .cpu_user_features = COMMON_USER_BOOKE | PPC_FEATURE_HAS_SPE_COMP | + PPC_FEATURE_HAS_EFP_SINGLE_COMP, + .cpu_user_features2 = PPC_FEATURE2_ISEL, + .mmu_features = MMU_FTR_TYPE_FSL_E, + .icache_bsize = 32, + .dcache_bsize = 32, + .num_pmcs = 4, + .cpu_setup = __setup_cpu_e500v1, + .machine_check = machine_check_e500, + .platform = "ppc8540", + }, + { /* e500v2 */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x80210000, + .cpu_name = "e500v2", + .cpu_features = CPU_FTRS_E500_2, + .cpu_user_features = COMMON_USER_BOOKE | PPC_FEATURE_HAS_SPE_COMP | + PPC_FEATURE_HAS_EFP_SINGLE_COMP | + PPC_FEATURE_HAS_EFP_DOUBLE_COMP, + .cpu_user_features2 = PPC_FEATURE2_ISEL, + .mmu_features = MMU_FTR_TYPE_FSL_E | MMU_FTR_BIG_PHYS, + .icache_bsize = 32, + .dcache_bsize = 32, + .num_pmcs = 4, + .cpu_setup = __setup_cpu_e500v2, + .machine_check = machine_check_e500, + .platform = "ppc8548", + .cpu_down_flush = cpu_down_flush_e500v2, + }, + { /* default match */ + .pvr_mask = 0x00000000, + .pvr_value = 0x00000000, + .cpu_name = "(generic E500 PPC)", + .cpu_features = CPU_FTRS_E500, + .cpu_user_features = COMMON_USER_BOOKE | PPC_FEATURE_HAS_SPE_COMP | + PPC_FEATURE_HAS_EFP_SINGLE_COMP, + .mmu_features = MMU_FTR_TYPE_FSL_E, + .icache_bsize = 32, + .dcache_bsize = 32, + .machine_check = machine_check_e500, + .platform = "powerpc", + } +}; diff --git a/arch/powerpc/kernel/cpu_specs_8xx.h b/arch/powerpc/kernel/cpu_specs_8xx.h new file mode 100644 index 000000000000..93ddbc202ba3 --- /dev/null +++ b/arch/powerpc/kernel/cpu_specs_8xx.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2001 Ben. Herrenschmidt (benh@kernel.crashing.org) + */ + +static struct cpu_spec cpu_specs[] __initdata = { + { /* 8xx */ + .pvr_mask = 0xffff0000, + .pvr_value = PVR_8xx, + .cpu_name = "8xx", + /* + * CPU_FTR_MAYBE_CAN_DOZE is possible, + * if the 8xx code is there.... + */ + .cpu_features = CPU_FTRS_8XX, + .cpu_user_features = PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU, + .mmu_features = MMU_FTR_TYPE_8xx, + .icache_bsize = 16, + .dcache_bsize = 16, + .machine_check = machine_check_8xx, + .platform = "ppc823", + }, +}; diff --git a/arch/powerpc/kernel/cpu_specs_book3s_32.h b/arch/powerpc/kernel/cpu_specs_book3s_32.h new file mode 100644 index 000000000000..3714634d194a --- /dev/null +++ b/arch/powerpc/kernel/cpu_specs_book3s_32.h @@ -0,0 +1,605 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2001 Ben. Herrenschmidt (benh@kernel.crashing.org) + */ + +#define COMMON_USER (PPC_FEATURE_32 | PPC_FEATURE_HAS_FPU | \ + PPC_FEATURE_HAS_MMU) + +static struct cpu_spec cpu_specs[] __initdata = { +#ifdef CONFIG_PPC_BOOK3S_603 + { /* 603 */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x00030000, + .cpu_name = "603", + .cpu_features = CPU_FTRS_603, + .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, + .mmu_features = 0, + .icache_bsize = 32, + .dcache_bsize = 32, + .cpu_setup = __setup_cpu_603, + .machine_check = machine_check_generic, + .platform = "ppc603", + }, + { /* 603e */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x00060000, + .cpu_name = "603e", + .cpu_features = CPU_FTRS_603, + .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, + .mmu_features = 0, + .icache_bsize = 32, + .dcache_bsize = 32, + .cpu_setup = __setup_cpu_603, + .machine_check = machine_check_generic, + .platform = "ppc603", + }, + { /* 603ev */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x00070000, + .cpu_name = "603ev", + .cpu_features = CPU_FTRS_603, + .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, + .mmu_features = 0, + .icache_bsize = 32, + .dcache_bsize = 32, + .cpu_setup = __setup_cpu_603, + .machine_check = machine_check_generic, + .platform = "ppc603", + }, + { /* 82xx (8240, 8245, 8260 are all 603e cores) */ + .pvr_mask = 0x7fff0000, + .pvr_value = 0x00810000, + .cpu_name = "82xx", + .cpu_features = CPU_FTRS_82XX, + .cpu_user_features = COMMON_USER, + .mmu_features = 0, + .icache_bsize = 32, + .dcache_bsize = 32, + .cpu_setup = __setup_cpu_603, + .machine_check = machine_check_generic, + .platform = "ppc603", + }, + { /* All G2_LE (603e core, plus some) have the same pvr */ + .pvr_mask = 0x7fff0000, + .pvr_value = 0x00820000, + .cpu_name = "G2_LE", + .cpu_features = CPU_FTRS_G2_LE, + .cpu_user_features = COMMON_USER, + .mmu_features = MMU_FTR_USE_HIGH_BATS, + .icache_bsize = 32, + .dcache_bsize = 32, + .cpu_setup = __setup_cpu_603, + .machine_check = machine_check_generic, + .platform = "ppc603", + }, +#ifdef CONFIG_PPC_83xx + { /* e300c1 (a 603e core, plus some) on 83xx */ + .pvr_mask = 0x7fff0000, + .pvr_value = 0x00830000, + .cpu_name = "e300c1", + .cpu_features = CPU_FTRS_E300, + .cpu_user_features = COMMON_USER, + .mmu_features = MMU_FTR_USE_HIGH_BATS, + .icache_bsize = 32, + .dcache_bsize = 32, + .cpu_setup = __setup_cpu_603, + .machine_check = machine_check_83xx, + .platform = "ppc603", + }, + { /* e300c2 (an e300c1 core, plus some, minus FPU) on 83xx */ + .pvr_mask = 0x7fff0000, + .pvr_value = 0x00840000, + .cpu_name = "e300c2", + .cpu_features = CPU_FTRS_E300C2, + .cpu_user_features = PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU, + .mmu_features = MMU_FTR_USE_HIGH_BATS | MMU_FTR_NEED_DTLB_SW_LRU, + .icache_bsize = 32, + .dcache_bsize = 32, + .cpu_setup = __setup_cpu_603, + .machine_check = machine_check_83xx, + .platform = "ppc603", + }, + { /* e300c3 (e300c1, plus one IU, half cache size) on 83xx */ + .pvr_mask = 0x7fff0000, + .pvr_value = 0x00850000, + .cpu_name = "e300c3", + .cpu_features = CPU_FTRS_E300, + .cpu_user_features = COMMON_USER, + .mmu_features = MMU_FTR_USE_HIGH_BATS | MMU_FTR_NEED_DTLB_SW_LRU, + .icache_bsize = 32, + .dcache_bsize = 32, + .cpu_setup = __setup_cpu_603, + .machine_check = machine_check_83xx, + .num_pmcs = 4, + .platform = "ppc603", + }, + { /* e300c4 (e300c1, plus one IU) */ + .pvr_mask = 0x7fff0000, + .pvr_value = 0x00860000, + .cpu_name = "e300c4", + .cpu_features = CPU_FTRS_E300, + .cpu_user_features = COMMON_USER, + .mmu_features = MMU_FTR_USE_HIGH_BATS | MMU_FTR_NEED_DTLB_SW_LRU, + .icache_bsize = 32, + .dcache_bsize = 32, + .cpu_setup = __setup_cpu_603, + .machine_check = machine_check_83xx, + .num_pmcs = 4, + .platform = "ppc603", + }, +#endif +#endif /* CONFIG_PPC_BOOK3S_603 */ +#ifdef CONFIG_PPC_BOOK3S_604 + { /* 604 */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x00040000, + .cpu_name = "604", + .cpu_features = CPU_FTRS_604, + .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE, + .icache_bsize = 32, + .dcache_bsize = 32, + .num_pmcs = 2, + .cpu_setup = __setup_cpu_604, + .machine_check = machine_check_generic, + .platform = "ppc604", + }, + { /* 604e */ + .pvr_mask = 0xfffff000, + .pvr_value = 0x00090000, + .cpu_name = "604e", + .cpu_features = CPU_FTRS_604, + .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE, + .icache_bsize = 32, + .dcache_bsize = 32, + .num_pmcs = 4, + .cpu_setup = __setup_cpu_604, + .machine_check = machine_check_generic, + .platform = "ppc604", + }, + { /* 604r */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x00090000, + .cpu_name = "604r", + .cpu_features = CPU_FTRS_604, + .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE, + .icache_bsize = 32, + .dcache_bsize = 32, + .num_pmcs = 4, + .cpu_setup = __setup_cpu_604, + .machine_check = machine_check_generic, + .platform = "ppc604", + }, + { /* 604ev */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x000a0000, + .cpu_name = "604ev", + .cpu_features = CPU_FTRS_604, + .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE, + .icache_bsize = 32, + .dcache_bsize = 32, + .num_pmcs = 4, + .cpu_setup = __setup_cpu_604, + .machine_check = machine_check_generic, + .platform = "ppc604", + }, + { /* 740/750 (0x4202, don't support TAU ?) */ + .pvr_mask = 0xffffffff, + .pvr_value = 0x00084202, + .cpu_name = "740/750", + .cpu_features = CPU_FTRS_740_NOTAU, + .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE, + .icache_bsize = 32, + .dcache_bsize = 32, + .num_pmcs = 4, + .cpu_setup = __setup_cpu_750, + .machine_check = machine_check_generic, + .platform = "ppc750", + }, + { /* 750CX (80100 and 8010x?) */ + .pvr_mask = 0xfffffff0, + .pvr_value = 0x00080100, + .cpu_name = "750CX", + .cpu_features = CPU_FTRS_750, + .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE, + .icache_bsize = 32, + .dcache_bsize = 32, + .num_pmcs = 4, + .cpu_setup = __setup_cpu_750cx, + .machine_check = machine_check_generic, + .platform = "ppc750", + }, + { /* 750CX (82201 and 82202) */ + .pvr_mask = 0xfffffff0, + .pvr_value = 0x00082200, + .cpu_name = "750CX", + .cpu_features = CPU_FTRS_750, + .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE, + .icache_bsize = 32, + .dcache_bsize = 32, + .num_pmcs = 4, + .pmc_type = PPC_PMC_IBM, + .cpu_setup = __setup_cpu_750cx, + .machine_check = machine_check_generic, + .platform = "ppc750", + }, + { /* 750CXe (82214) */ + .pvr_mask = 0xfffffff0, + .pvr_value = 0x00082210, + .cpu_name = "750CXe", + .cpu_features = CPU_FTRS_750, + .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE, + .icache_bsize = 32, + .dcache_bsize = 32, + .num_pmcs = 4, + .pmc_type = PPC_PMC_IBM, + .cpu_setup = __setup_cpu_750cx, + .machine_check = machine_check_generic, + .platform = "ppc750", + }, + { /* 750CXe "Gekko" (83214) */ + .pvr_mask = 0xffffffff, + .pvr_value = 0x00083214, + .cpu_name = "750CXe", + .cpu_features = CPU_FTRS_750, + .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE, + .icache_bsize = 32, + .dcache_bsize = 32, + .num_pmcs = 4, + .pmc_type = PPC_PMC_IBM, + .cpu_setup = __setup_cpu_750cx, + .machine_check = machine_check_generic, + .platform = "ppc750", + }, + { /* 750CL (and "Broadway") */ + .pvr_mask = 0xfffff0e0, + .pvr_value = 0x00087000, + .cpu_name = "750CL", + .cpu_features = CPU_FTRS_750CL, + .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE | MMU_FTR_USE_HIGH_BATS, + .icache_bsize = 32, + .dcache_bsize = 32, + .num_pmcs = 4, + .pmc_type = PPC_PMC_IBM, + .cpu_setup = __setup_cpu_750, + .machine_check = machine_check_generic, + .platform = "ppc750", + }, + { /* 745/755 */ + .pvr_mask = 0xfffff000, + .pvr_value = 0x00083000, + .cpu_name = "745/755", + .cpu_features = CPU_FTRS_750, + .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE, + .icache_bsize = 32, + .dcache_bsize = 32, + .num_pmcs = 4, + .pmc_type = PPC_PMC_IBM, + .cpu_setup = __setup_cpu_750, + .machine_check = machine_check_generic, + .platform = "ppc750", + }, + { /* 750FX rev 1.x */ + .pvr_mask = 0xffffff00, + .pvr_value = 0x70000100, + .cpu_name = "750FX", + .cpu_features = CPU_FTRS_750FX1, + .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE, + .icache_bsize = 32, + .dcache_bsize = 32, + .num_pmcs = 4, + .pmc_type = PPC_PMC_IBM, + .cpu_setup = __setup_cpu_750, + .machine_check = machine_check_generic, + .platform = "ppc750", + }, + { /* 750FX rev 2.0 must disable HID0[DPM] */ + .pvr_mask = 0xffffffff, + .pvr_value = 0x70000200, + .cpu_name = "750FX", + .cpu_features = CPU_FTRS_750FX2, + .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE, + .icache_bsize = 32, + .dcache_bsize = 32, + .num_pmcs = 4, + .pmc_type = PPC_PMC_IBM, + .cpu_setup = __setup_cpu_750, + .machine_check = machine_check_generic, + .platform = "ppc750", + }, + { /* 750FX (All revs except 2.0) */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x70000000, + .cpu_name = "750FX", + .cpu_features = CPU_FTRS_750FX, + .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE | MMU_FTR_USE_HIGH_BATS, + .icache_bsize = 32, + .dcache_bsize = 32, + .num_pmcs = 4, + .pmc_type = PPC_PMC_IBM, + .cpu_setup = __setup_cpu_750fx, + .machine_check = machine_check_generic, + .platform = "ppc750", + }, + { /* 750GX */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x70020000, + .cpu_name = "750GX", + .cpu_features = CPU_FTRS_750GX, + .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE | MMU_FTR_USE_HIGH_BATS, + .icache_bsize = 32, + .dcache_bsize = 32, + .num_pmcs = 4, + .pmc_type = PPC_PMC_IBM, + .cpu_setup = __setup_cpu_750fx, + .machine_check = machine_check_generic, + .platform = "ppc750", + }, + { /* 740/750 (L2CR bit need fixup for 740) */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x00080000, + .cpu_name = "740/750", + .cpu_features = CPU_FTRS_740, + .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE, + .icache_bsize = 32, + .dcache_bsize = 32, + .num_pmcs = 4, + .pmc_type = PPC_PMC_IBM, + .cpu_setup = __setup_cpu_750, + .machine_check = machine_check_generic, + .platform = "ppc750", + }, + { /* 7400 rev 1.1 ? (no TAU) */ + .pvr_mask = 0xffffffff, + .pvr_value = 0x000c1101, + .cpu_name = "7400 (1.1)", + .cpu_features = CPU_FTRS_7400_NOTAU, + .cpu_user_features = COMMON_USER | PPC_FEATURE_HAS_ALTIVEC_COMP | + PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE, + .icache_bsize = 32, + .dcache_bsize = 32, + .num_pmcs = 4, + .pmc_type = PPC_PMC_G4, + .cpu_setup = __setup_cpu_7400, + .machine_check = machine_check_generic, + .platform = "ppc7400", + }, + { /* 7400 */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x000c0000, + .cpu_name = "7400", + .cpu_features = CPU_FTRS_7400, + .cpu_user_features = COMMON_USER | PPC_FEATURE_HAS_ALTIVEC_COMP | + PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE, + .icache_bsize = 32, + .dcache_bsize = 32, + .num_pmcs = 4, + .pmc_type = PPC_PMC_G4, + .cpu_setup = __setup_cpu_7400, + .machine_check = machine_check_generic, + .platform = "ppc7400", + }, + { /* 7410 */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x800c0000, + .cpu_name = "7410", + .cpu_features = CPU_FTRS_7400, + .cpu_user_features = COMMON_USER | PPC_FEATURE_HAS_ALTIVEC_COMP | + PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE, + .icache_bsize = 32, + .dcache_bsize = 32, + .num_pmcs = 4, + .pmc_type = PPC_PMC_G4, + .cpu_setup = __setup_cpu_7410, + .machine_check = machine_check_generic, + .platform = "ppc7400", + }, + { /* 7450 2.0 - no doze/nap */ + .pvr_mask = 0xffffffff, + .pvr_value = 0x80000200, + .cpu_name = "7450", + .cpu_features = CPU_FTRS_7450_20, + .cpu_user_features = COMMON_USER | PPC_FEATURE_HAS_ALTIVEC_COMP | + PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE, + .icache_bsize = 32, + .dcache_bsize = 32, + .num_pmcs = 6, + .pmc_type = PPC_PMC_G4, + .cpu_setup = __setup_cpu_745x, + .machine_check = machine_check_generic, + .platform = "ppc7450", + }, + { /* 7450 2.1 */ + .pvr_mask = 0xffffffff, + .pvr_value = 0x80000201, + .cpu_name = "7450", + .cpu_features = CPU_FTRS_7450_21, + .cpu_user_features = COMMON_USER | PPC_FEATURE_HAS_ALTIVEC_COMP | + PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE, + .icache_bsize = 32, + .dcache_bsize = 32, + .num_pmcs = 6, + .pmc_type = PPC_PMC_G4, + .cpu_setup = __setup_cpu_745x, + .machine_check = machine_check_generic, + .platform = "ppc7450", + }, + { /* 7450 2.3 and newer */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x80000000, + .cpu_name = "7450", + .cpu_features = CPU_FTRS_7450_23, + .cpu_user_features = COMMON_USER | PPC_FEATURE_HAS_ALTIVEC_COMP | + PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE, + .icache_bsize = 32, + .dcache_bsize = 32, + .num_pmcs = 6, + .pmc_type = PPC_PMC_G4, + .cpu_setup = __setup_cpu_745x, + .machine_check = machine_check_generic, + .platform = "ppc7450", + }, + { /* 7455 rev 1.x */ + .pvr_mask = 0xffffff00, + .pvr_value = 0x80010100, + .cpu_name = "7455", + .cpu_features = CPU_FTRS_7455_1, + .cpu_user_features = COMMON_USER | PPC_FEATURE_HAS_ALTIVEC_COMP | + PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE | MMU_FTR_USE_HIGH_BATS, + .icache_bsize = 32, + .dcache_bsize = 32, + .num_pmcs = 6, + .pmc_type = PPC_PMC_G4, + .cpu_setup = __setup_cpu_745x, + .machine_check = machine_check_generic, + .platform = "ppc7450", + }, + { /* 7455 rev 2.0 */ + .pvr_mask = 0xffffffff, + .pvr_value = 0x80010200, + .cpu_name = "7455", + .cpu_features = CPU_FTRS_7455_20, + .cpu_user_features = COMMON_USER | PPC_FEATURE_HAS_ALTIVEC_COMP | + PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE | MMU_FTR_USE_HIGH_BATS, + .icache_bsize = 32, + .dcache_bsize = 32, + .num_pmcs = 6, + .pmc_type = PPC_PMC_G4, + .cpu_setup = __setup_cpu_745x, + .machine_check = machine_check_generic, + .platform = "ppc7450", + }, + { /* 7455 others */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x80010000, + .cpu_name = "7455", + .cpu_features = CPU_FTRS_7455, + .cpu_user_features = COMMON_USER | PPC_FEATURE_HAS_ALTIVEC_COMP | + PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE | MMU_FTR_USE_HIGH_BATS, + .icache_bsize = 32, + .dcache_bsize = 32, + .num_pmcs = 6, + .pmc_type = PPC_PMC_G4, + .cpu_setup = __setup_cpu_745x, + .machine_check = machine_check_generic, + .platform = "ppc7450", + }, + { /* 7447/7457 Rev 1.0 */ + .pvr_mask = 0xffffffff, + .pvr_value = 0x80020100, + .cpu_name = "7447/7457", + .cpu_features = CPU_FTRS_7447_10, + .cpu_user_features = COMMON_USER | PPC_FEATURE_HAS_ALTIVEC_COMP | + PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE | MMU_FTR_USE_HIGH_BATS, + .icache_bsize = 32, + .dcache_bsize = 32, + .num_pmcs = 6, + .pmc_type = PPC_PMC_G4, + .cpu_setup = __setup_cpu_745x, + .machine_check = machine_check_generic, + .platform = "ppc7450", + }, + { /* 7447/7457 Rev 1.1 */ + .pvr_mask = 0xffffffff, + .pvr_value = 0x80020101, + .cpu_name = "7447/7457", + .cpu_features = CPU_FTRS_7447_10, + .cpu_user_features = COMMON_USER | PPC_FEATURE_HAS_ALTIVEC_COMP | + PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE | MMU_FTR_USE_HIGH_BATS, + .icache_bsize = 32, + .dcache_bsize = 32, + .num_pmcs = 6, + .pmc_type = PPC_PMC_G4, + .cpu_setup = __setup_cpu_745x, + .machine_check = machine_check_generic, + .platform = "ppc7450", + }, + { /* 7447/7457 Rev 1.2 and later */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x80020000, + .cpu_name = "7447/7457", + .cpu_features = CPU_FTRS_7447, + .cpu_user_features = COMMON_USER | PPC_FEATURE_HAS_ALTIVEC_COMP | + PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE | MMU_FTR_USE_HIGH_BATS, + .icache_bsize = 32, + .dcache_bsize = 32, + .num_pmcs = 6, + .pmc_type = PPC_PMC_G4, + .cpu_setup = __setup_cpu_745x, + .machine_check = machine_check_generic, + .platform = "ppc7450", + }, + { /* 7447A */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x80030000, + .cpu_name = "7447A", + .cpu_features = CPU_FTRS_7447A, + .cpu_user_features = COMMON_USER | PPC_FEATURE_HAS_ALTIVEC_COMP | + PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE | MMU_FTR_USE_HIGH_BATS, + .icache_bsize = 32, + .dcache_bsize = 32, + .num_pmcs = 6, + .pmc_type = PPC_PMC_G4, + .cpu_setup = __setup_cpu_745x, + .machine_check = machine_check_generic, + .platform = "ppc7450", + }, + { /* 7448 */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x80040000, + .cpu_name = "7448", + .cpu_features = CPU_FTRS_7448, + .cpu_user_features = COMMON_USER | PPC_FEATURE_HAS_ALTIVEC_COMP | + PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE | MMU_FTR_USE_HIGH_BATS, + .icache_bsize = 32, + .dcache_bsize = 32, + .num_pmcs = 6, + .pmc_type = PPC_PMC_G4, + .cpu_setup = __setup_cpu_745x, + .machine_check = machine_check_generic, + .platform = "ppc7450", + }, + { /* default match, we assume split I/D cache & TB (non-601)... */ + .pvr_mask = 0x00000000, + .pvr_value = 0x00000000, + .cpu_name = "(generic PPC)", + .cpu_features = CPU_FTRS_CLASSIC32, + .cpu_user_features = COMMON_USER, + .mmu_features = MMU_FTR_HPTE_TABLE, + .icache_bsize = 32, + .dcache_bsize = 32, + .machine_check = machine_check_generic, + .platform = "ppc603", + }, +#endif /* CONFIG_PPC_BOOK3S_604 */ +}; diff --git a/arch/powerpc/kernel/cpu_specs_book3s_64.h b/arch/powerpc/kernel/cpu_specs_book3s_64.h new file mode 100644 index 000000000000..3ff9757df4c0 --- /dev/null +++ b/arch/powerpc/kernel/cpu_specs_book3s_64.h @@ -0,0 +1,496 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2001 Ben. Herrenschmidt (benh@kernel.crashing.org) + * + * Modifications for ppc64: + * Copyright (C) 2003 Dave Engebretsen <engebret@us.ibm.com> + */ + +/* NOTE: + * Unlike ppc32, ppc64 will only call cpu_setup() for the boot CPU, it's + * the responsibility of the appropriate CPU save/restore functions to + * eventually copy these settings over. Those save/restore aren't yet + * part of the cputable though. That has to be fixed for both ppc32 + * and ppc64 + */ +#define COMMON_USER_PPC64 (PPC_FEATURE_32 | PPC_FEATURE_HAS_FPU | \ + PPC_FEATURE_HAS_MMU | PPC_FEATURE_64) +#define COMMON_USER_POWER4 (COMMON_USER_PPC64 | PPC_FEATURE_POWER4) +#define COMMON_USER_POWER5 (COMMON_USER_PPC64 | PPC_FEATURE_POWER5 |\ + PPC_FEATURE_SMT | PPC_FEATURE_ICACHE_SNOOP) +#define COMMON_USER_POWER5_PLUS (COMMON_USER_PPC64 | PPC_FEATURE_POWER5_PLUS|\ + PPC_FEATURE_SMT | PPC_FEATURE_ICACHE_SNOOP) +#define COMMON_USER_POWER6 (COMMON_USER_PPC64 | PPC_FEATURE_ARCH_2_05 |\ + PPC_FEATURE_SMT | PPC_FEATURE_ICACHE_SNOOP | \ + PPC_FEATURE_TRUE_LE | \ + PPC_FEATURE_PSERIES_PERFMON_COMPAT) +#define COMMON_USER_POWER7 (COMMON_USER_PPC64 | PPC_FEATURE_ARCH_2_06 |\ + PPC_FEATURE_SMT | PPC_FEATURE_ICACHE_SNOOP | \ + PPC_FEATURE_TRUE_LE | \ + PPC_FEATURE_PSERIES_PERFMON_COMPAT) +#define COMMON_USER2_POWER7 (PPC_FEATURE2_DSCR) +#define COMMON_USER_POWER8 (COMMON_USER_PPC64 | PPC_FEATURE_ARCH_2_06 |\ + PPC_FEATURE_SMT | PPC_FEATURE_ICACHE_SNOOP | \ + PPC_FEATURE_TRUE_LE | \ + PPC_FEATURE_PSERIES_PERFMON_COMPAT) +#define COMMON_USER2_POWER8 (PPC_FEATURE2_ARCH_2_07 | \ + PPC_FEATURE2_HTM_COMP | \ + PPC_FEATURE2_HTM_NOSC_COMP | \ + PPC_FEATURE2_DSCR | \ + PPC_FEATURE2_ISEL | PPC_FEATURE2_TAR | \ + PPC_FEATURE2_VEC_CRYPTO) +#define COMMON_USER_PA6T (COMMON_USER_PPC64 | PPC_FEATURE_PA6T |\ + PPC_FEATURE_TRUE_LE | \ + PPC_FEATURE_HAS_ALTIVEC_COMP) +#define COMMON_USER_POWER9 COMMON_USER_POWER8 +#define COMMON_USER2_POWER9 (COMMON_USER2_POWER8 | \ + PPC_FEATURE2_ARCH_3_00 | \ + PPC_FEATURE2_HAS_IEEE128 | \ + PPC_FEATURE2_DARN | \ + PPC_FEATURE2_SCV) +#define COMMON_USER_POWER10 COMMON_USER_POWER9 +#define COMMON_USER2_POWER10 (PPC_FEATURE2_ARCH_3_1 | \ + PPC_FEATURE2_MMA | \ + PPC_FEATURE2_ARCH_3_00 | \ + PPC_FEATURE2_HAS_IEEE128 | \ + PPC_FEATURE2_DARN | \ + PPC_FEATURE2_SCV | \ + PPC_FEATURE2_ARCH_2_07 | \ + PPC_FEATURE2_DSCR | \ + PPC_FEATURE2_ISEL | PPC_FEATURE2_TAR | \ + PPC_FEATURE2_VEC_CRYPTO) + +static struct cpu_spec cpu_specs[] __initdata = { + { /* PPC970 */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x00390000, + .cpu_name = "PPC970", + .cpu_features = CPU_FTRS_PPC970, + .cpu_user_features = COMMON_USER_POWER4 | PPC_FEATURE_HAS_ALTIVEC_COMP, + .mmu_features = MMU_FTRS_PPC970, + .icache_bsize = 128, + .dcache_bsize = 128, + .num_pmcs = 8, + .pmc_type = PPC_PMC_IBM, + .cpu_setup = __setup_cpu_ppc970, + .cpu_restore = __restore_cpu_ppc970, + .platform = "ppc970", + }, + { /* PPC970FX */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x003c0000, + .cpu_name = "PPC970FX", + .cpu_features = CPU_FTRS_PPC970, + .cpu_user_features = COMMON_USER_POWER4 | PPC_FEATURE_HAS_ALTIVEC_COMP, + .mmu_features = MMU_FTRS_PPC970, + .icache_bsize = 128, + .dcache_bsize = 128, + .num_pmcs = 8, + .pmc_type = PPC_PMC_IBM, + .cpu_setup = __setup_cpu_ppc970, + .cpu_restore = __restore_cpu_ppc970, + .platform = "ppc970", + }, + { /* PPC970MP DD1.0 - no DEEPNAP, use regular 970 init */ + .pvr_mask = 0xffffffff, + .pvr_value = 0x00440100, + .cpu_name = "PPC970MP", + .cpu_features = CPU_FTRS_PPC970, + .cpu_user_features = COMMON_USER_POWER4 | PPC_FEATURE_HAS_ALTIVEC_COMP, + .mmu_features = MMU_FTRS_PPC970, + .icache_bsize = 128, + .dcache_bsize = 128, + .num_pmcs = 8, + .pmc_type = PPC_PMC_IBM, + .cpu_setup = __setup_cpu_ppc970, + .cpu_restore = __restore_cpu_ppc970, + .platform = "ppc970", + }, + { /* PPC970MP */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x00440000, + .cpu_name = "PPC970MP", + .cpu_features = CPU_FTRS_PPC970, + .cpu_user_features = COMMON_USER_POWER4 | PPC_FEATURE_HAS_ALTIVEC_COMP, + .mmu_features = MMU_FTRS_PPC970, + .icache_bsize = 128, + .dcache_bsize = 128, + .num_pmcs = 8, + .pmc_type = PPC_PMC_IBM, + .cpu_setup = __setup_cpu_ppc970MP, + .cpu_restore = __restore_cpu_ppc970, + .platform = "ppc970", + }, + { /* PPC970GX */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x00450000, + .cpu_name = "PPC970GX", + .cpu_features = CPU_FTRS_PPC970, + .cpu_user_features = COMMON_USER_POWER4 | PPC_FEATURE_HAS_ALTIVEC_COMP, + .mmu_features = MMU_FTRS_PPC970, + .icache_bsize = 128, + .dcache_bsize = 128, + .num_pmcs = 8, + .pmc_type = PPC_PMC_IBM, + .cpu_setup = __setup_cpu_ppc970, + .platform = "ppc970", + }, + { /* Power5 GR */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x003a0000, + .cpu_name = "POWER5 (gr)", + .cpu_features = CPU_FTRS_POWER5, + .cpu_user_features = COMMON_USER_POWER5, + .mmu_features = MMU_FTRS_POWER5, + .icache_bsize = 128, + .dcache_bsize = 128, + .num_pmcs = 6, + .pmc_type = PPC_PMC_IBM, + .platform = "power5", + }, + { /* Power5++ */ + .pvr_mask = 0xffffff00, + .pvr_value = 0x003b0300, + .cpu_name = "POWER5+ (gs)", + .cpu_features = CPU_FTRS_POWER5, + .cpu_user_features = COMMON_USER_POWER5_PLUS, + .mmu_features = MMU_FTRS_POWER5, + .icache_bsize = 128, + .dcache_bsize = 128, + .num_pmcs = 6, + .platform = "power5+", + }, + { /* Power5 GS */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x003b0000, + .cpu_name = "POWER5+ (gs)", + .cpu_features = CPU_FTRS_POWER5, + .cpu_user_features = COMMON_USER_POWER5_PLUS, + .mmu_features = MMU_FTRS_POWER5, + .icache_bsize = 128, + .dcache_bsize = 128, + .num_pmcs = 6, + .pmc_type = PPC_PMC_IBM, + .platform = "power5+", + }, + { /* POWER6 in P5+ mode; 2.04-compliant processor */ + .pvr_mask = 0xffffffff, + .pvr_value = 0x0f000001, + .cpu_name = "POWER5+", + .cpu_features = CPU_FTRS_POWER5, + .cpu_user_features = COMMON_USER_POWER5_PLUS, + .mmu_features = MMU_FTRS_POWER5, + .icache_bsize = 128, + .dcache_bsize = 128, + .platform = "power5+", + }, + { /* Power6 */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x003e0000, + .cpu_name = "POWER6 (raw)", + .cpu_features = CPU_FTRS_POWER6, + .cpu_user_features = COMMON_USER_POWER6 | PPC_FEATURE_POWER6_EXT, + .mmu_features = MMU_FTRS_POWER6, + .icache_bsize = 128, + .dcache_bsize = 128, + .num_pmcs = 6, + .pmc_type = PPC_PMC_IBM, + .platform = "power6x", + }, + { /* 2.05-compliant processor, i.e. Power6 "architected" mode */ + .pvr_mask = 0xffffffff, + .pvr_value = 0x0f000002, + .cpu_name = "POWER6 (architected)", + .cpu_features = CPU_FTRS_POWER6, + .cpu_user_features = COMMON_USER_POWER6, + .mmu_features = MMU_FTRS_POWER6, + .icache_bsize = 128, + .dcache_bsize = 128, + .platform = "power6", + }, + { /* 2.06-compliant processor, i.e. Power7 "architected" mode */ + .pvr_mask = 0xffffffff, + .pvr_value = 0x0f000003, + .cpu_name = "POWER7 (architected)", + .cpu_features = CPU_FTRS_POWER7, + .cpu_user_features = COMMON_USER_POWER7, + .cpu_user_features2 = COMMON_USER2_POWER7, + .mmu_features = MMU_FTRS_POWER7, + .icache_bsize = 128, + .dcache_bsize = 128, + .cpu_setup = __setup_cpu_power7, + .cpu_restore = __restore_cpu_power7, + .machine_check_early = __machine_check_early_realmode_p7, + .platform = "power7", + }, + { /* 2.07-compliant processor, i.e. Power8 "architected" mode */ + .pvr_mask = 0xffffffff, + .pvr_value = 0x0f000004, + .cpu_name = "POWER8 (architected)", + .cpu_features = CPU_FTRS_POWER8, + .cpu_user_features = COMMON_USER_POWER8, + .cpu_user_features2 = COMMON_USER2_POWER8, + .mmu_features = MMU_FTRS_POWER8, + .icache_bsize = 128, + .dcache_bsize = 128, + .cpu_setup = __setup_cpu_power8, + .cpu_restore = __restore_cpu_power8, + .machine_check_early = __machine_check_early_realmode_p8, + .platform = "power8", + }, + { /* 2.07-compliant processor, HeXin C2000 processor */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x00660000, + .cpu_name = "HX-C2000", + .cpu_features = CPU_FTRS_POWER8, + .cpu_user_features = COMMON_USER_POWER8, + .cpu_user_features2 = COMMON_USER2_POWER8, + .mmu_features = MMU_FTRS_POWER8, + .icache_bsize = 128, + .dcache_bsize = 128, + .cpu_setup = __setup_cpu_power8, + .cpu_restore = __restore_cpu_power8, + .machine_check_early = __machine_check_early_realmode_p8, + .platform = "power8", + }, + { /* 3.00-compliant processor, i.e. Power9 "architected" mode */ + .pvr_mask = 0xffffffff, + .pvr_value = 0x0f000005, + .cpu_name = "POWER9 (architected)", + .cpu_features = CPU_FTRS_POWER9, + .cpu_user_features = COMMON_USER_POWER9, + .cpu_user_features2 = COMMON_USER2_POWER9, + .mmu_features = MMU_FTRS_POWER9, + .icache_bsize = 128, + .dcache_bsize = 128, + .cpu_setup = __setup_cpu_power9, + .cpu_restore = __restore_cpu_power9, + .platform = "power9", + }, + { /* 3.1-compliant processor, i.e. Power10 "architected" mode */ + .pvr_mask = 0xffffffff, + .pvr_value = 0x0f000006, + .cpu_name = "POWER10 (architected)", + .cpu_features = CPU_FTRS_POWER10, + .cpu_user_features = COMMON_USER_POWER10, + .cpu_user_features2 = COMMON_USER2_POWER10, + .mmu_features = MMU_FTRS_POWER10, + .icache_bsize = 128, + .dcache_bsize = 128, + .cpu_setup = __setup_cpu_power10, + .cpu_restore = __restore_cpu_power10, + .platform = "power10", + }, + { /* Power7 */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x003f0000, + .cpu_name = "POWER7 (raw)", + .cpu_features = CPU_FTRS_POWER7, + .cpu_user_features = COMMON_USER_POWER7, + .cpu_user_features2 = COMMON_USER2_POWER7, + .mmu_features = MMU_FTRS_POWER7, + .icache_bsize = 128, + .dcache_bsize = 128, + .num_pmcs = 6, + .pmc_type = PPC_PMC_IBM, + .cpu_setup = __setup_cpu_power7, + .cpu_restore = __restore_cpu_power7, + .machine_check_early = __machine_check_early_realmode_p7, + .platform = "power7", + }, + { /* Power7+ */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x004A0000, + .cpu_name = "POWER7+ (raw)", + .cpu_features = CPU_FTRS_POWER7, + .cpu_user_features = COMMON_USER_POWER7, + .cpu_user_features2 = COMMON_USER2_POWER7, + .mmu_features = MMU_FTRS_POWER7, + .icache_bsize = 128, + .dcache_bsize = 128, + .num_pmcs = 6, + .pmc_type = PPC_PMC_IBM, + .cpu_setup = __setup_cpu_power7, + .cpu_restore = __restore_cpu_power7, + .machine_check_early = __machine_check_early_realmode_p7, + .platform = "power7+", + }, + { /* Power8E */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x004b0000, + .cpu_name = "POWER8E (raw)", + .cpu_features = CPU_FTRS_POWER8E, + .cpu_user_features = COMMON_USER_POWER8, + .cpu_user_features2 = COMMON_USER2_POWER8, + .mmu_features = MMU_FTRS_POWER8, + .icache_bsize = 128, + .dcache_bsize = 128, + .num_pmcs = 6, + .pmc_type = PPC_PMC_IBM, + .cpu_setup = __setup_cpu_power8, + .cpu_restore = __restore_cpu_power8, + .machine_check_early = __machine_check_early_realmode_p8, + .platform = "power8", + }, + { /* Power8NVL */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x004c0000, + .cpu_name = "POWER8NVL (raw)", + .cpu_features = CPU_FTRS_POWER8, + .cpu_user_features = COMMON_USER_POWER8, + .cpu_user_features2 = COMMON_USER2_POWER8, + .mmu_features = MMU_FTRS_POWER8, + .icache_bsize = 128, + .dcache_bsize = 128, + .num_pmcs = 6, + .pmc_type = PPC_PMC_IBM, + .cpu_setup = __setup_cpu_power8, + .cpu_restore = __restore_cpu_power8, + .machine_check_early = __machine_check_early_realmode_p8, + .platform = "power8", + }, + { /* Power8 */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x004d0000, + .cpu_name = "POWER8 (raw)", + .cpu_features = CPU_FTRS_POWER8, + .cpu_user_features = COMMON_USER_POWER8, + .cpu_user_features2 = COMMON_USER2_POWER8, + .mmu_features = MMU_FTRS_POWER8, + .icache_bsize = 128, + .dcache_bsize = 128, + .num_pmcs = 6, + .pmc_type = PPC_PMC_IBM, + .cpu_setup = __setup_cpu_power8, + .cpu_restore = __restore_cpu_power8, + .machine_check_early = __machine_check_early_realmode_p8, + .platform = "power8", + }, + { /* Power9 DD2.0 */ + .pvr_mask = 0xffffefff, + .pvr_value = 0x004e0200, + .cpu_name = "POWER9 (raw)", + .cpu_features = CPU_FTRS_POWER9_DD2_0, + .cpu_user_features = COMMON_USER_POWER9, + .cpu_user_features2 = COMMON_USER2_POWER9, + .mmu_features = MMU_FTRS_POWER9, + .icache_bsize = 128, + .dcache_bsize = 128, + .num_pmcs = 6, + .pmc_type = PPC_PMC_IBM, + .cpu_setup = __setup_cpu_power9, + .cpu_restore = __restore_cpu_power9, + .machine_check_early = __machine_check_early_realmode_p9, + .platform = "power9", + }, + { /* Power9 DD 2.1 */ + .pvr_mask = 0xffffefff, + .pvr_value = 0x004e0201, + .cpu_name = "POWER9 (raw)", + .cpu_features = CPU_FTRS_POWER9_DD2_1, + .cpu_user_features = COMMON_USER_POWER9, + .cpu_user_features2 = COMMON_USER2_POWER9, + .mmu_features = MMU_FTRS_POWER9, + .icache_bsize = 128, + .dcache_bsize = 128, + .num_pmcs = 6, + .pmc_type = PPC_PMC_IBM, + .cpu_setup = __setup_cpu_power9, + .cpu_restore = __restore_cpu_power9, + .machine_check_early = __machine_check_early_realmode_p9, + .platform = "power9", + }, + { /* Power9 DD2.2 */ + .pvr_mask = 0xffffefff, + .pvr_value = 0x004e0202, + .cpu_name = "POWER9 (raw)", + .cpu_features = CPU_FTRS_POWER9_DD2_2, + .cpu_user_features = COMMON_USER_POWER9, + .cpu_user_features2 = COMMON_USER2_POWER9, + .mmu_features = MMU_FTRS_POWER9, + .icache_bsize = 128, + .dcache_bsize = 128, + .num_pmcs = 6, + .pmc_type = PPC_PMC_IBM, + .cpu_setup = __setup_cpu_power9, + .cpu_restore = __restore_cpu_power9, + .machine_check_early = __machine_check_early_realmode_p9, + .platform = "power9", + }, + { /* Power9 DD2.3 or later */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x004e0000, + .cpu_name = "POWER9 (raw)", + .cpu_features = CPU_FTRS_POWER9_DD2_3, + .cpu_user_features = COMMON_USER_POWER9, + .cpu_user_features2 = COMMON_USER2_POWER9, + .mmu_features = MMU_FTRS_POWER9, + .icache_bsize = 128, + .dcache_bsize = 128, + .num_pmcs = 6, + .pmc_type = PPC_PMC_IBM, + .cpu_setup = __setup_cpu_power9, + .cpu_restore = __restore_cpu_power9, + .machine_check_early = __machine_check_early_realmode_p9, + .platform = "power9", + }, + { /* Power10 */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x00800000, + .cpu_name = "POWER10 (raw)", + .cpu_features = CPU_FTRS_POWER10, + .cpu_user_features = COMMON_USER_POWER10, + .cpu_user_features2 = COMMON_USER2_POWER10, + .mmu_features = MMU_FTRS_POWER10, + .icache_bsize = 128, + .dcache_bsize = 128, + .num_pmcs = 6, + .pmc_type = PPC_PMC_IBM, + .cpu_setup = __setup_cpu_power10, + .cpu_restore = __restore_cpu_power10, + .machine_check_early = __machine_check_early_realmode_p10, + .platform = "power10", + }, + { /* Cell Broadband Engine */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x00700000, + .cpu_name = "Cell Broadband Engine", + .cpu_features = CPU_FTRS_CELL, + .cpu_user_features = COMMON_USER_PPC64 | PPC_FEATURE_CELL | + PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_SMT, + .mmu_features = MMU_FTRS_CELL, + .icache_bsize = 128, + .dcache_bsize = 128, + .num_pmcs = 4, + .pmc_type = PPC_PMC_IBM, + .platform = "ppc-cell-be", + }, + { /* PA Semi PA6T */ + .pvr_mask = 0x7fff0000, + .pvr_value = 0x00900000, + .cpu_name = "PA6T", + .cpu_features = CPU_FTRS_PA6T, + .cpu_user_features = COMMON_USER_PA6T, + .mmu_features = MMU_FTRS_PA6T, + .icache_bsize = 64, + .dcache_bsize = 64, + .num_pmcs = 6, + .pmc_type = PPC_PMC_PA6T, + .cpu_setup = __setup_cpu_pa6t, + .cpu_restore = __restore_cpu_pa6t, + .platform = "pa6t", + }, + { /* default match */ + .pvr_mask = 0x00000000, + .pvr_value = 0x00000000, + .cpu_name = "POWER5 (compatible)", + .cpu_features = CPU_FTRS_COMPATIBLE, + .cpu_user_features = COMMON_USER_PPC64, + .mmu_features = MMU_FTRS_POWER, + .icache_bsize = 128, + .dcache_bsize = 128, + .num_pmcs = 6, + .pmc_type = PPC_PMC_IBM, + .platform = "power5", + } +}; diff --git a/arch/powerpc/kernel/cpu_specs_e500mc.h b/arch/powerpc/kernel/cpu_specs_e500mc.h new file mode 100644 index 000000000000..ceb06b109f83 --- /dev/null +++ b/arch/powerpc/kernel/cpu_specs_e500mc.h @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2001 Ben. Herrenschmidt (benh@kernel.crashing.org) + * + * Modifications for ppc64: + * Copyright (C) 2003 Dave Engebretsen <engebret@us.ibm.com> + */ + +#ifdef CONFIG_PPC64 +#define COMMON_USER_BOOKE (PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU | \ + PPC_FEATURE_HAS_FPU | PPC_FEATURE_64) +#else +#define COMMON_USER_BOOKE (PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU | \ + PPC_FEATURE_BOOKE) +#endif + +static struct cpu_spec cpu_specs[] __initdata = { +#ifdef CONFIG_PPC32 + { /* e500mc */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x80230000, + .cpu_name = "e500mc", + .cpu_features = CPU_FTRS_E500MC, + .cpu_user_features = COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU, + .cpu_user_features2 = PPC_FEATURE2_ISEL, + .mmu_features = MMU_FTR_TYPE_FSL_E | MMU_FTR_BIG_PHYS | MMU_FTR_USE_TLBILX, + .icache_bsize = 64, + .dcache_bsize = 64, + .num_pmcs = 4, + .cpu_setup = __setup_cpu_e500mc, + .machine_check = machine_check_e500mc, + .platform = "ppce500mc", + .cpu_down_flush = cpu_down_flush_e500mc, + }, +#endif /* CONFIG_PPC32 */ + { /* e5500 */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x80240000, + .cpu_name = "e5500", + .cpu_features = CPU_FTRS_E5500, + .cpu_user_features = COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU, + .cpu_user_features2 = PPC_FEATURE2_ISEL, + .mmu_features = MMU_FTR_TYPE_FSL_E | MMU_FTR_BIG_PHYS | MMU_FTR_USE_TLBILX, + .icache_bsize = 64, + .dcache_bsize = 64, + .num_pmcs = 4, + .cpu_setup = __setup_cpu_e5500, +#ifndef CONFIG_PPC32 + .cpu_restore = __restore_cpu_e5500, +#endif + .machine_check = machine_check_e500mc, + .platform = "ppce5500", + .cpu_down_flush = cpu_down_flush_e5500, + }, + { /* e6500 */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x80400000, + .cpu_name = "e6500", + .cpu_features = CPU_FTRS_E6500, + .cpu_user_features = COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU | + PPC_FEATURE_HAS_ALTIVEC_COMP, + .cpu_user_features2 = PPC_FEATURE2_ISEL, + .mmu_features = MMU_FTR_TYPE_FSL_E | MMU_FTR_BIG_PHYS | MMU_FTR_USE_TLBILX, + .icache_bsize = 64, + .dcache_bsize = 64, + .num_pmcs = 6, + .cpu_setup = __setup_cpu_e6500, +#ifndef CONFIG_PPC32 + .cpu_restore = __restore_cpu_e6500, +#endif + .machine_check = machine_check_e500mc, + .platform = "ppce6500", + .cpu_down_flush = cpu_down_flush_e6500, + }, +}; diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c index e745abc5457a..6f6801da9dc1 100644 --- a/arch/powerpc/kernel/cputable.c +++ b/arch/powerpc/kernel/cputable.c @@ -12,2133 +12,23 @@ #include <linux/init.h> #include <linux/export.h> #include <linux/jump_label.h> +#include <linux/of.h> -#include <asm/oprofile_impl.h> #include <asm/cputable.h> -#include <asm/prom.h> /* for PTRRELOC on ARCH=ppc */ +#include <asm/mce.h> #include <asm/mmu.h> #include <asm/setup.h> +#include <asm/cpu_setup.h> -static struct cpu_spec the_cpu_spec __read_mostly; +static struct cpu_spec the_cpu_spec __ro_after_init; -struct cpu_spec* cur_cpu_spec __read_mostly = NULL; +struct cpu_spec *cur_cpu_spec __ro_after_init = NULL; EXPORT_SYMBOL(cur_cpu_spec); /* The platform string corresponding to the real PVR */ const char *powerpc_base_platform; -/* NOTE: - * Unlike ppc32, ppc64 will only call this once for the boot CPU, it's - * the responsibility of the appropriate CPU save/restore functions to - * eventually copy these settings over. Those save/restore aren't yet - * part of the cputable though. That has to be fixed for both ppc32 - * and ppc64 - */ -#ifdef CONFIG_PPC32 -extern void __setup_cpu_e200(unsigned long offset, struct cpu_spec* spec); -extern void __setup_cpu_e500v1(unsigned long offset, struct cpu_spec* spec); -extern void __setup_cpu_e500v2(unsigned long offset, struct cpu_spec* spec); -extern void __setup_cpu_e500mc(unsigned long offset, struct cpu_spec* spec); -extern void __setup_cpu_440ep(unsigned long offset, struct cpu_spec* spec); -extern void __setup_cpu_440epx(unsigned long offset, struct cpu_spec* spec); -extern void __setup_cpu_440gx(unsigned long offset, struct cpu_spec* spec); -extern void __setup_cpu_440grx(unsigned long offset, struct cpu_spec* spec); -extern void __setup_cpu_440spe(unsigned long offset, struct cpu_spec* spec); -extern void __setup_cpu_440x5(unsigned long offset, struct cpu_spec* spec); -extern void __setup_cpu_460ex(unsigned long offset, struct cpu_spec* spec); -extern void __setup_cpu_460gt(unsigned long offset, struct cpu_spec* spec); -extern void __setup_cpu_460sx(unsigned long offset, struct cpu_spec *spec); -extern void __setup_cpu_apm821xx(unsigned long offset, struct cpu_spec *spec); -extern void __setup_cpu_603(unsigned long offset, struct cpu_spec* spec); -extern void __setup_cpu_604(unsigned long offset, struct cpu_spec* spec); -extern void __setup_cpu_750(unsigned long offset, struct cpu_spec* spec); -extern void __setup_cpu_750cx(unsigned long offset, struct cpu_spec* spec); -extern void __setup_cpu_750fx(unsigned long offset, struct cpu_spec* spec); -extern void __setup_cpu_7400(unsigned long offset, struct cpu_spec* spec); -extern void __setup_cpu_7410(unsigned long offset, struct cpu_spec* spec); -extern void __setup_cpu_745x(unsigned long offset, struct cpu_spec* spec); -#endif /* CONFIG_PPC32 */ -#ifdef CONFIG_PPC64 -extern void __setup_cpu_ppc970(unsigned long offset, struct cpu_spec* spec); -extern void __setup_cpu_ppc970MP(unsigned long offset, struct cpu_spec* spec); -extern void __setup_cpu_pa6t(unsigned long offset, struct cpu_spec* spec); -extern void __restore_cpu_pa6t(void); -extern void __restore_cpu_ppc970(void); -extern void __setup_cpu_power7(unsigned long offset, struct cpu_spec* spec); -extern void __restore_cpu_power7(void); -extern void __setup_cpu_power8(unsigned long offset, struct cpu_spec* spec); -extern void __restore_cpu_power8(void); -extern void __setup_cpu_power9(unsigned long offset, struct cpu_spec* spec); -extern void __restore_cpu_power9(void); -extern long __machine_check_early_realmode_p7(struct pt_regs *regs); -extern long __machine_check_early_realmode_p8(struct pt_regs *regs); -extern long __machine_check_early_realmode_p9(struct pt_regs *regs); -#endif /* CONFIG_PPC64 */ -#if defined(CONFIG_E500) -extern void __setup_cpu_e5500(unsigned long offset, struct cpu_spec* spec); -extern void __setup_cpu_e6500(unsigned long offset, struct cpu_spec* spec); -extern void __restore_cpu_e5500(void); -extern void __restore_cpu_e6500(void); -#endif /* CONFIG_E500 */ - -/* This table only contains "desktop" CPUs, it need to be filled with embedded - * ones as well... - */ -#define COMMON_USER (PPC_FEATURE_32 | PPC_FEATURE_HAS_FPU | \ - PPC_FEATURE_HAS_MMU) -#define COMMON_USER_PPC64 (COMMON_USER | PPC_FEATURE_64) -#define COMMON_USER_POWER4 (COMMON_USER_PPC64 | PPC_FEATURE_POWER4) -#define COMMON_USER_POWER5 (COMMON_USER_PPC64 | PPC_FEATURE_POWER5 |\ - PPC_FEATURE_SMT | PPC_FEATURE_ICACHE_SNOOP) -#define COMMON_USER_POWER5_PLUS (COMMON_USER_PPC64 | PPC_FEATURE_POWER5_PLUS|\ - PPC_FEATURE_SMT | PPC_FEATURE_ICACHE_SNOOP) -#define COMMON_USER_POWER6 (COMMON_USER_PPC64 | PPC_FEATURE_ARCH_2_05 |\ - PPC_FEATURE_SMT | PPC_FEATURE_ICACHE_SNOOP | \ - PPC_FEATURE_TRUE_LE | \ - PPC_FEATURE_PSERIES_PERFMON_COMPAT) -#define COMMON_USER_POWER7 (COMMON_USER_PPC64 | PPC_FEATURE_ARCH_2_06 |\ - PPC_FEATURE_SMT | PPC_FEATURE_ICACHE_SNOOP | \ - PPC_FEATURE_TRUE_LE | \ - PPC_FEATURE_PSERIES_PERFMON_COMPAT) -#define COMMON_USER2_POWER7 (PPC_FEATURE2_DSCR) -#define COMMON_USER_POWER8 (COMMON_USER_PPC64 | PPC_FEATURE_ARCH_2_06 |\ - PPC_FEATURE_SMT | PPC_FEATURE_ICACHE_SNOOP | \ - PPC_FEATURE_TRUE_LE | \ - PPC_FEATURE_PSERIES_PERFMON_COMPAT) -#define COMMON_USER2_POWER8 (PPC_FEATURE2_ARCH_2_07 | \ - PPC_FEATURE2_HTM_COMP | \ - PPC_FEATURE2_HTM_NOSC_COMP | \ - PPC_FEATURE2_DSCR | \ - PPC_FEATURE2_ISEL | PPC_FEATURE2_TAR | \ - PPC_FEATURE2_VEC_CRYPTO) -#define COMMON_USER_PA6T (COMMON_USER_PPC64 | PPC_FEATURE_PA6T |\ - PPC_FEATURE_TRUE_LE | \ - PPC_FEATURE_HAS_ALTIVEC_COMP) -#define COMMON_USER_POWER9 COMMON_USER_POWER8 -#define COMMON_USER2_POWER9 (COMMON_USER2_POWER8 | \ - PPC_FEATURE2_ARCH_3_00 | \ - PPC_FEATURE2_HAS_IEEE128 | \ - PPC_FEATURE2_DARN ) - -#ifdef CONFIG_PPC_BOOK3E_64 -#define COMMON_USER_BOOKE (COMMON_USER_PPC64 | PPC_FEATURE_BOOKE) -#else -#define COMMON_USER_BOOKE (PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU | \ - PPC_FEATURE_BOOKE) -#endif - -static struct cpu_spec __initdata cpu_specs[] = { -#ifdef CONFIG_PPC_BOOK3S_64 - { /* PPC970 */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x00390000, - .cpu_name = "PPC970", - .cpu_features = CPU_FTRS_PPC970, - .cpu_user_features = COMMON_USER_POWER4 | - PPC_FEATURE_HAS_ALTIVEC_COMP, - .mmu_features = MMU_FTRS_PPC970, - .icache_bsize = 128, - .dcache_bsize = 128, - .num_pmcs = 8, - .pmc_type = PPC_PMC_IBM, - .cpu_setup = __setup_cpu_ppc970, - .cpu_restore = __restore_cpu_ppc970, - .oprofile_cpu_type = "ppc64/970", - .oprofile_type = PPC_OPROFILE_POWER4, - .platform = "ppc970", - }, - { /* PPC970FX */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x003c0000, - .cpu_name = "PPC970FX", - .cpu_features = CPU_FTRS_PPC970, - .cpu_user_features = COMMON_USER_POWER4 | - PPC_FEATURE_HAS_ALTIVEC_COMP, - .mmu_features = MMU_FTRS_PPC970, - .icache_bsize = 128, - .dcache_bsize = 128, - .num_pmcs = 8, - .pmc_type = PPC_PMC_IBM, - .cpu_setup = __setup_cpu_ppc970, - .cpu_restore = __restore_cpu_ppc970, - .oprofile_cpu_type = "ppc64/970", - .oprofile_type = PPC_OPROFILE_POWER4, - .platform = "ppc970", - }, - { /* PPC970MP DD1.0 - no DEEPNAP, use regular 970 init */ - .pvr_mask = 0xffffffff, - .pvr_value = 0x00440100, - .cpu_name = "PPC970MP", - .cpu_features = CPU_FTRS_PPC970, - .cpu_user_features = COMMON_USER_POWER4 | - PPC_FEATURE_HAS_ALTIVEC_COMP, - .mmu_features = MMU_FTRS_PPC970, - .icache_bsize = 128, - .dcache_bsize = 128, - .num_pmcs = 8, - .pmc_type = PPC_PMC_IBM, - .cpu_setup = __setup_cpu_ppc970, - .cpu_restore = __restore_cpu_ppc970, - .oprofile_cpu_type = "ppc64/970MP", - .oprofile_type = PPC_OPROFILE_POWER4, - .platform = "ppc970", - }, - { /* PPC970MP */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x00440000, - .cpu_name = "PPC970MP", - .cpu_features = CPU_FTRS_PPC970, - .cpu_user_features = COMMON_USER_POWER4 | - PPC_FEATURE_HAS_ALTIVEC_COMP, - .mmu_features = MMU_FTRS_PPC970, - .icache_bsize = 128, - .dcache_bsize = 128, - .num_pmcs = 8, - .pmc_type = PPC_PMC_IBM, - .cpu_setup = __setup_cpu_ppc970MP, - .cpu_restore = __restore_cpu_ppc970, - .oprofile_cpu_type = "ppc64/970MP", - .oprofile_type = PPC_OPROFILE_POWER4, - .platform = "ppc970", - }, - { /* PPC970GX */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x00450000, - .cpu_name = "PPC970GX", - .cpu_features = CPU_FTRS_PPC970, - .cpu_user_features = COMMON_USER_POWER4 | - PPC_FEATURE_HAS_ALTIVEC_COMP, - .mmu_features = MMU_FTRS_PPC970, - .icache_bsize = 128, - .dcache_bsize = 128, - .num_pmcs = 8, - .pmc_type = PPC_PMC_IBM, - .cpu_setup = __setup_cpu_ppc970, - .oprofile_cpu_type = "ppc64/970", - .oprofile_type = PPC_OPROFILE_POWER4, - .platform = "ppc970", - }, - { /* Power5 GR */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x003a0000, - .cpu_name = "POWER5 (gr)", - .cpu_features = CPU_FTRS_POWER5, - .cpu_user_features = COMMON_USER_POWER5, - .mmu_features = MMU_FTRS_POWER5, - .icache_bsize = 128, - .dcache_bsize = 128, - .num_pmcs = 6, - .pmc_type = PPC_PMC_IBM, - .oprofile_cpu_type = "ppc64/power5", - .oprofile_type = PPC_OPROFILE_POWER4, - /* SIHV / SIPR bits are implemented on POWER4+ (GQ) - * and above but only works on POWER5 and above - */ - .oprofile_mmcra_sihv = MMCRA_SIHV, - .oprofile_mmcra_sipr = MMCRA_SIPR, - .platform = "power5", - }, - { /* Power5++ */ - .pvr_mask = 0xffffff00, - .pvr_value = 0x003b0300, - .cpu_name = "POWER5+ (gs)", - .cpu_features = CPU_FTRS_POWER5, - .cpu_user_features = COMMON_USER_POWER5_PLUS, - .mmu_features = MMU_FTRS_POWER5, - .icache_bsize = 128, - .dcache_bsize = 128, - .num_pmcs = 6, - .oprofile_cpu_type = "ppc64/power5++", - .oprofile_type = PPC_OPROFILE_POWER4, - .oprofile_mmcra_sihv = MMCRA_SIHV, - .oprofile_mmcra_sipr = MMCRA_SIPR, - .platform = "power5+", - }, - { /* Power5 GS */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x003b0000, - .cpu_name = "POWER5+ (gs)", - .cpu_features = CPU_FTRS_POWER5, - .cpu_user_features = COMMON_USER_POWER5_PLUS, - .mmu_features = MMU_FTRS_POWER5, - .icache_bsize = 128, - .dcache_bsize = 128, - .num_pmcs = 6, - .pmc_type = PPC_PMC_IBM, - .oprofile_cpu_type = "ppc64/power5+", - .oprofile_type = PPC_OPROFILE_POWER4, - .oprofile_mmcra_sihv = MMCRA_SIHV, - .oprofile_mmcra_sipr = MMCRA_SIPR, - .platform = "power5+", - }, - { /* POWER6 in P5+ mode; 2.04-compliant processor */ - .pvr_mask = 0xffffffff, - .pvr_value = 0x0f000001, - .cpu_name = "POWER5+", - .cpu_features = CPU_FTRS_POWER5, - .cpu_user_features = COMMON_USER_POWER5_PLUS, - .mmu_features = MMU_FTRS_POWER5, - .icache_bsize = 128, - .dcache_bsize = 128, - .oprofile_cpu_type = "ppc64/ibm-compat-v1", - .oprofile_type = PPC_OPROFILE_POWER4, - .platform = "power5+", - }, - { /* Power6 */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x003e0000, - .cpu_name = "POWER6 (raw)", - .cpu_features = CPU_FTRS_POWER6, - .cpu_user_features = COMMON_USER_POWER6 | - PPC_FEATURE_POWER6_EXT, - .mmu_features = MMU_FTRS_POWER6, - .icache_bsize = 128, - .dcache_bsize = 128, - .num_pmcs = 6, - .pmc_type = PPC_PMC_IBM, - .oprofile_cpu_type = "ppc64/power6", - .oprofile_type = PPC_OPROFILE_POWER4, - .oprofile_mmcra_sihv = POWER6_MMCRA_SIHV, - .oprofile_mmcra_sipr = POWER6_MMCRA_SIPR, - .oprofile_mmcra_clear = POWER6_MMCRA_THRM | - POWER6_MMCRA_OTHER, - .platform = "power6x", - }, - { /* 2.05-compliant processor, i.e. Power6 "architected" mode */ - .pvr_mask = 0xffffffff, - .pvr_value = 0x0f000002, - .cpu_name = "POWER6 (architected)", - .cpu_features = CPU_FTRS_POWER6, - .cpu_user_features = COMMON_USER_POWER6, - .mmu_features = MMU_FTRS_POWER6, - .icache_bsize = 128, - .dcache_bsize = 128, - .oprofile_cpu_type = "ppc64/ibm-compat-v1", - .oprofile_type = PPC_OPROFILE_POWER4, - .platform = "power6", - }, - { /* 2.06-compliant processor, i.e. Power7 "architected" mode */ - .pvr_mask = 0xffffffff, - .pvr_value = 0x0f000003, - .cpu_name = "POWER7 (architected)", - .cpu_features = CPU_FTRS_POWER7, - .cpu_user_features = COMMON_USER_POWER7, - .cpu_user_features2 = COMMON_USER2_POWER7, - .mmu_features = MMU_FTRS_POWER7, - .icache_bsize = 128, - .dcache_bsize = 128, - .oprofile_type = PPC_OPROFILE_POWER4, - .oprofile_cpu_type = "ppc64/ibm-compat-v1", - .cpu_setup = __setup_cpu_power7, - .cpu_restore = __restore_cpu_power7, - .machine_check_early = __machine_check_early_realmode_p7, - .platform = "power7", - }, - { /* 2.07-compliant processor, i.e. Power8 "architected" mode */ - .pvr_mask = 0xffffffff, - .pvr_value = 0x0f000004, - .cpu_name = "POWER8 (architected)", - .cpu_features = CPU_FTRS_POWER8, - .cpu_user_features = COMMON_USER_POWER8, - .cpu_user_features2 = COMMON_USER2_POWER8, - .mmu_features = MMU_FTRS_POWER8, - .icache_bsize = 128, - .dcache_bsize = 128, - .oprofile_type = PPC_OPROFILE_INVALID, - .oprofile_cpu_type = "ppc64/ibm-compat-v1", - .cpu_setup = __setup_cpu_power8, - .cpu_restore = __restore_cpu_power8, - .machine_check_early = __machine_check_early_realmode_p8, - .platform = "power8", - }, - { /* 3.00-compliant processor, i.e. Power9 "architected" mode */ - .pvr_mask = 0xffffffff, - .pvr_value = 0x0f000005, - .cpu_name = "POWER9 (architected)", - .cpu_features = CPU_FTRS_POWER9, - .cpu_user_features = COMMON_USER_POWER9, - .cpu_user_features2 = COMMON_USER2_POWER9, - .mmu_features = MMU_FTRS_POWER9, - .icache_bsize = 128, - .dcache_bsize = 128, - .oprofile_type = PPC_OPROFILE_INVALID, - .oprofile_cpu_type = "ppc64/ibm-compat-v1", - .cpu_setup = __setup_cpu_power9, - .cpu_restore = __restore_cpu_power9, - .platform = "power9", - }, - { /* Power7 */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x003f0000, - .cpu_name = "POWER7 (raw)", - .cpu_features = CPU_FTRS_POWER7, - .cpu_user_features = COMMON_USER_POWER7, - .cpu_user_features2 = COMMON_USER2_POWER7, - .mmu_features = MMU_FTRS_POWER7, - .icache_bsize = 128, - .dcache_bsize = 128, - .num_pmcs = 6, - .pmc_type = PPC_PMC_IBM, - .oprofile_cpu_type = "ppc64/power7", - .oprofile_type = PPC_OPROFILE_POWER4, - .cpu_setup = __setup_cpu_power7, - .cpu_restore = __restore_cpu_power7, - .machine_check_early = __machine_check_early_realmode_p7, - .platform = "power7", - }, - { /* Power7+ */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x004A0000, - .cpu_name = "POWER7+ (raw)", - .cpu_features = CPU_FTRS_POWER7, - .cpu_user_features = COMMON_USER_POWER7, - .cpu_user_features2 = COMMON_USER2_POWER7, - .mmu_features = MMU_FTRS_POWER7, - .icache_bsize = 128, - .dcache_bsize = 128, - .num_pmcs = 6, - .pmc_type = PPC_PMC_IBM, - .oprofile_cpu_type = "ppc64/power7", - .oprofile_type = PPC_OPROFILE_POWER4, - .cpu_setup = __setup_cpu_power7, - .cpu_restore = __restore_cpu_power7, - .machine_check_early = __machine_check_early_realmode_p7, - .platform = "power7+", - }, - { /* Power8E */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x004b0000, - .cpu_name = "POWER8E (raw)", - .cpu_features = CPU_FTRS_POWER8E, - .cpu_user_features = COMMON_USER_POWER8, - .cpu_user_features2 = COMMON_USER2_POWER8, - .mmu_features = MMU_FTRS_POWER8, - .icache_bsize = 128, - .dcache_bsize = 128, - .num_pmcs = 6, - .pmc_type = PPC_PMC_IBM, - .oprofile_cpu_type = "ppc64/power8", - .oprofile_type = PPC_OPROFILE_INVALID, - .cpu_setup = __setup_cpu_power8, - .cpu_restore = __restore_cpu_power8, - .machine_check_early = __machine_check_early_realmode_p8, - .platform = "power8", - }, - { /* Power8NVL */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x004c0000, - .cpu_name = "POWER8NVL (raw)", - .cpu_features = CPU_FTRS_POWER8, - .cpu_user_features = COMMON_USER_POWER8, - .cpu_user_features2 = COMMON_USER2_POWER8, - .mmu_features = MMU_FTRS_POWER8, - .icache_bsize = 128, - .dcache_bsize = 128, - .num_pmcs = 6, - .pmc_type = PPC_PMC_IBM, - .oprofile_cpu_type = "ppc64/power8", - .oprofile_type = PPC_OPROFILE_INVALID, - .cpu_setup = __setup_cpu_power8, - .cpu_restore = __restore_cpu_power8, - .machine_check_early = __machine_check_early_realmode_p8, - .platform = "power8", - }, - { /* Power8 */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x004d0000, - .cpu_name = "POWER8 (raw)", - .cpu_features = CPU_FTRS_POWER8, - .cpu_user_features = COMMON_USER_POWER8, - .cpu_user_features2 = COMMON_USER2_POWER8, - .mmu_features = MMU_FTRS_POWER8, - .icache_bsize = 128, - .dcache_bsize = 128, - .num_pmcs = 6, - .pmc_type = PPC_PMC_IBM, - .oprofile_cpu_type = "ppc64/power8", - .oprofile_type = PPC_OPROFILE_INVALID, - .cpu_setup = __setup_cpu_power8, - .cpu_restore = __restore_cpu_power8, - .machine_check_early = __machine_check_early_realmode_p8, - .platform = "power8", - }, - { /* Power9 DD2.0 */ - .pvr_mask = 0xffffefff, - .pvr_value = 0x004e0200, - .cpu_name = "POWER9 (raw)", - .cpu_features = CPU_FTRS_POWER9_DD2_0, - .cpu_user_features = COMMON_USER_POWER9, - .cpu_user_features2 = COMMON_USER2_POWER9, - .mmu_features = MMU_FTRS_POWER9, - .icache_bsize = 128, - .dcache_bsize = 128, - .num_pmcs = 6, - .pmc_type = PPC_PMC_IBM, - .oprofile_cpu_type = "ppc64/power9", - .oprofile_type = PPC_OPROFILE_INVALID, - .cpu_setup = __setup_cpu_power9, - .cpu_restore = __restore_cpu_power9, - .machine_check_early = __machine_check_early_realmode_p9, - .platform = "power9", - }, - { /* Power9 DD 2.1 */ - .pvr_mask = 0xffffefff, - .pvr_value = 0x004e0201, - .cpu_name = "POWER9 (raw)", - .cpu_features = CPU_FTRS_POWER9_DD2_1, - .cpu_user_features = COMMON_USER_POWER9, - .cpu_user_features2 = COMMON_USER2_POWER9, - .mmu_features = MMU_FTRS_POWER9, - .icache_bsize = 128, - .dcache_bsize = 128, - .num_pmcs = 6, - .pmc_type = PPC_PMC_IBM, - .oprofile_cpu_type = "ppc64/power9", - .oprofile_type = PPC_OPROFILE_INVALID, - .cpu_setup = __setup_cpu_power9, - .cpu_restore = __restore_cpu_power9, - .machine_check_early = __machine_check_early_realmode_p9, - .platform = "power9", - }, - { /* Power9 DD2.2 or later */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x004e0000, - .cpu_name = "POWER9 (raw)", - .cpu_features = CPU_FTRS_POWER9_DD2_2, - .cpu_user_features = COMMON_USER_POWER9, - .cpu_user_features2 = COMMON_USER2_POWER9, - .mmu_features = MMU_FTRS_POWER9, - .icache_bsize = 128, - .dcache_bsize = 128, - .num_pmcs = 6, - .pmc_type = PPC_PMC_IBM, - .oprofile_cpu_type = "ppc64/power9", - .oprofile_type = PPC_OPROFILE_INVALID, - .cpu_setup = __setup_cpu_power9, - .cpu_restore = __restore_cpu_power9, - .machine_check_early = __machine_check_early_realmode_p9, - .platform = "power9", - }, - { /* Cell Broadband Engine */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x00700000, - .cpu_name = "Cell Broadband Engine", - .cpu_features = CPU_FTRS_CELL, - .cpu_user_features = COMMON_USER_PPC64 | - PPC_FEATURE_CELL | PPC_FEATURE_HAS_ALTIVEC_COMP | - PPC_FEATURE_SMT, - .mmu_features = MMU_FTRS_CELL, - .icache_bsize = 128, - .dcache_bsize = 128, - .num_pmcs = 4, - .pmc_type = PPC_PMC_IBM, - .oprofile_cpu_type = "ppc64/cell-be", - .oprofile_type = PPC_OPROFILE_CELL, - .platform = "ppc-cell-be", - }, - { /* PA Semi PA6T */ - .pvr_mask = 0x7fff0000, - .pvr_value = 0x00900000, - .cpu_name = "PA6T", - .cpu_features = CPU_FTRS_PA6T, - .cpu_user_features = COMMON_USER_PA6T, - .mmu_features = MMU_FTRS_PA6T, - .icache_bsize = 64, - .dcache_bsize = 64, - .num_pmcs = 6, - .pmc_type = PPC_PMC_PA6T, - .cpu_setup = __setup_cpu_pa6t, - .cpu_restore = __restore_cpu_pa6t, - .oprofile_cpu_type = "ppc64/pa6t", - .oprofile_type = PPC_OPROFILE_PA6T, - .platform = "pa6t", - }, - { /* default match */ - .pvr_mask = 0x00000000, - .pvr_value = 0x00000000, - .cpu_name = "POWER5 (compatible)", - .cpu_features = CPU_FTRS_COMPATIBLE, - .cpu_user_features = COMMON_USER_PPC64, - .mmu_features = MMU_FTRS_POWER, - .icache_bsize = 128, - .dcache_bsize = 128, - .num_pmcs = 6, - .pmc_type = PPC_PMC_IBM, - .platform = "power5", - } -#endif /* CONFIG_PPC_BOOK3S_64 */ - -#ifdef CONFIG_PPC32 -#ifdef CONFIG_PPC_BOOK3S_601 - { /* 601 */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x00010000, - .cpu_name = "601", - .cpu_features = CPU_FTRS_PPC601, - .cpu_user_features = COMMON_USER | PPC_FEATURE_601_INSTR | - PPC_FEATURE_UNIFIED_CACHE | PPC_FEATURE_NO_TB, - .mmu_features = MMU_FTR_HPTE_TABLE, - .icache_bsize = 32, - .dcache_bsize = 32, - .machine_check = machine_check_generic, - .platform = "ppc601", - }, -#endif /* CONFIG_PPC_BOOK3S_601 */ -#ifdef CONFIG_PPC_BOOK3S_6xx - { /* 603 */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x00030000, - .cpu_name = "603", - .cpu_features = CPU_FTRS_603, - .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, - .mmu_features = 0, - .icache_bsize = 32, - .dcache_bsize = 32, - .cpu_setup = __setup_cpu_603, - .machine_check = machine_check_generic, - .platform = "ppc603", - }, - { /* 603e */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x00060000, - .cpu_name = "603e", - .cpu_features = CPU_FTRS_603, - .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, - .mmu_features = 0, - .icache_bsize = 32, - .dcache_bsize = 32, - .cpu_setup = __setup_cpu_603, - .machine_check = machine_check_generic, - .platform = "ppc603", - }, - { /* 603ev */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x00070000, - .cpu_name = "603ev", - .cpu_features = CPU_FTRS_603, - .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, - .mmu_features = 0, - .icache_bsize = 32, - .dcache_bsize = 32, - .cpu_setup = __setup_cpu_603, - .machine_check = machine_check_generic, - .platform = "ppc603", - }, - { /* 604 */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x00040000, - .cpu_name = "604", - .cpu_features = CPU_FTRS_604, - .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, - .mmu_features = MMU_FTR_HPTE_TABLE, - .icache_bsize = 32, - .dcache_bsize = 32, - .num_pmcs = 2, - .cpu_setup = __setup_cpu_604, - .machine_check = machine_check_generic, - .platform = "ppc604", - }, - { /* 604e */ - .pvr_mask = 0xfffff000, - .pvr_value = 0x00090000, - .cpu_name = "604e", - .cpu_features = CPU_FTRS_604, - .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, - .mmu_features = MMU_FTR_HPTE_TABLE, - .icache_bsize = 32, - .dcache_bsize = 32, - .num_pmcs = 4, - .cpu_setup = __setup_cpu_604, - .machine_check = machine_check_generic, - .platform = "ppc604", - }, - { /* 604r */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x00090000, - .cpu_name = "604r", - .cpu_features = CPU_FTRS_604, - .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, - .mmu_features = MMU_FTR_HPTE_TABLE, - .icache_bsize = 32, - .dcache_bsize = 32, - .num_pmcs = 4, - .cpu_setup = __setup_cpu_604, - .machine_check = machine_check_generic, - .platform = "ppc604", - }, - { /* 604ev */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x000a0000, - .cpu_name = "604ev", - .cpu_features = CPU_FTRS_604, - .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, - .mmu_features = MMU_FTR_HPTE_TABLE, - .icache_bsize = 32, - .dcache_bsize = 32, - .num_pmcs = 4, - .cpu_setup = __setup_cpu_604, - .machine_check = machine_check_generic, - .platform = "ppc604", - }, - { /* 740/750 (0x4202, don't support TAU ?) */ - .pvr_mask = 0xffffffff, - .pvr_value = 0x00084202, - .cpu_name = "740/750", - .cpu_features = CPU_FTRS_740_NOTAU, - .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, - .mmu_features = MMU_FTR_HPTE_TABLE, - .icache_bsize = 32, - .dcache_bsize = 32, - .num_pmcs = 4, - .cpu_setup = __setup_cpu_750, - .machine_check = machine_check_generic, - .platform = "ppc750", - }, - { /* 750CX (80100 and 8010x?) */ - .pvr_mask = 0xfffffff0, - .pvr_value = 0x00080100, - .cpu_name = "750CX", - .cpu_features = CPU_FTRS_750, - .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, - .mmu_features = MMU_FTR_HPTE_TABLE, - .icache_bsize = 32, - .dcache_bsize = 32, - .num_pmcs = 4, - .cpu_setup = __setup_cpu_750cx, - .machine_check = machine_check_generic, - .platform = "ppc750", - }, - { /* 750CX (82201 and 82202) */ - .pvr_mask = 0xfffffff0, - .pvr_value = 0x00082200, - .cpu_name = "750CX", - .cpu_features = CPU_FTRS_750, - .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, - .mmu_features = MMU_FTR_HPTE_TABLE, - .icache_bsize = 32, - .dcache_bsize = 32, - .num_pmcs = 4, - .pmc_type = PPC_PMC_IBM, - .cpu_setup = __setup_cpu_750cx, - .machine_check = machine_check_generic, - .platform = "ppc750", - }, - { /* 750CXe (82214) */ - .pvr_mask = 0xfffffff0, - .pvr_value = 0x00082210, - .cpu_name = "750CXe", - .cpu_features = CPU_FTRS_750, - .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, - .mmu_features = MMU_FTR_HPTE_TABLE, - .icache_bsize = 32, - .dcache_bsize = 32, - .num_pmcs = 4, - .pmc_type = PPC_PMC_IBM, - .cpu_setup = __setup_cpu_750cx, - .machine_check = machine_check_generic, - .platform = "ppc750", - }, - { /* 750CXe "Gekko" (83214) */ - .pvr_mask = 0xffffffff, - .pvr_value = 0x00083214, - .cpu_name = "750CXe", - .cpu_features = CPU_FTRS_750, - .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, - .mmu_features = MMU_FTR_HPTE_TABLE, - .icache_bsize = 32, - .dcache_bsize = 32, - .num_pmcs = 4, - .pmc_type = PPC_PMC_IBM, - .cpu_setup = __setup_cpu_750cx, - .machine_check = machine_check_generic, - .platform = "ppc750", - }, - { /* 750CL (and "Broadway") */ - .pvr_mask = 0xfffff0e0, - .pvr_value = 0x00087000, - .cpu_name = "750CL", - .cpu_features = CPU_FTRS_750CL, - .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, - .mmu_features = MMU_FTR_HPTE_TABLE | MMU_FTR_USE_HIGH_BATS, - .icache_bsize = 32, - .dcache_bsize = 32, - .num_pmcs = 4, - .pmc_type = PPC_PMC_IBM, - .cpu_setup = __setup_cpu_750, - .machine_check = machine_check_generic, - .platform = "ppc750", - .oprofile_cpu_type = "ppc/750", - .oprofile_type = PPC_OPROFILE_G4, - }, - { /* 745/755 */ - .pvr_mask = 0xfffff000, - .pvr_value = 0x00083000, - .cpu_name = "745/755", - .cpu_features = CPU_FTRS_750, - .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, - .mmu_features = MMU_FTR_HPTE_TABLE, - .icache_bsize = 32, - .dcache_bsize = 32, - .num_pmcs = 4, - .pmc_type = PPC_PMC_IBM, - .cpu_setup = __setup_cpu_750, - .machine_check = machine_check_generic, - .platform = "ppc750", - }, - { /* 750FX rev 1.x */ - .pvr_mask = 0xffffff00, - .pvr_value = 0x70000100, - .cpu_name = "750FX", - .cpu_features = CPU_FTRS_750FX1, - .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, - .mmu_features = MMU_FTR_HPTE_TABLE, - .icache_bsize = 32, - .dcache_bsize = 32, - .num_pmcs = 4, - .pmc_type = PPC_PMC_IBM, - .cpu_setup = __setup_cpu_750, - .machine_check = machine_check_generic, - .platform = "ppc750", - .oprofile_cpu_type = "ppc/750", - .oprofile_type = PPC_OPROFILE_G4, - }, - { /* 750FX rev 2.0 must disable HID0[DPM] */ - .pvr_mask = 0xffffffff, - .pvr_value = 0x70000200, - .cpu_name = "750FX", - .cpu_features = CPU_FTRS_750FX2, - .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, - .mmu_features = MMU_FTR_HPTE_TABLE, - .icache_bsize = 32, - .dcache_bsize = 32, - .num_pmcs = 4, - .pmc_type = PPC_PMC_IBM, - .cpu_setup = __setup_cpu_750, - .machine_check = machine_check_generic, - .platform = "ppc750", - .oprofile_cpu_type = "ppc/750", - .oprofile_type = PPC_OPROFILE_G4, - }, - { /* 750FX (All revs except 2.0) */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x70000000, - .cpu_name = "750FX", - .cpu_features = CPU_FTRS_750FX, - .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, - .mmu_features = MMU_FTR_HPTE_TABLE | MMU_FTR_USE_HIGH_BATS, - .icache_bsize = 32, - .dcache_bsize = 32, - .num_pmcs = 4, - .pmc_type = PPC_PMC_IBM, - .cpu_setup = __setup_cpu_750fx, - .machine_check = machine_check_generic, - .platform = "ppc750", - .oprofile_cpu_type = "ppc/750", - .oprofile_type = PPC_OPROFILE_G4, - }, - { /* 750GX */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x70020000, - .cpu_name = "750GX", - .cpu_features = CPU_FTRS_750GX, - .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, - .mmu_features = MMU_FTR_HPTE_TABLE | MMU_FTR_USE_HIGH_BATS, - .icache_bsize = 32, - .dcache_bsize = 32, - .num_pmcs = 4, - .pmc_type = PPC_PMC_IBM, - .cpu_setup = __setup_cpu_750fx, - .machine_check = machine_check_generic, - .platform = "ppc750", - .oprofile_cpu_type = "ppc/750", - .oprofile_type = PPC_OPROFILE_G4, - }, - { /* 740/750 (L2CR bit need fixup for 740) */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x00080000, - .cpu_name = "740/750", - .cpu_features = CPU_FTRS_740, - .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, - .mmu_features = MMU_FTR_HPTE_TABLE, - .icache_bsize = 32, - .dcache_bsize = 32, - .num_pmcs = 4, - .pmc_type = PPC_PMC_IBM, - .cpu_setup = __setup_cpu_750, - .machine_check = machine_check_generic, - .platform = "ppc750", - }, - { /* 7400 rev 1.1 ? (no TAU) */ - .pvr_mask = 0xffffffff, - .pvr_value = 0x000c1101, - .cpu_name = "7400 (1.1)", - .cpu_features = CPU_FTRS_7400_NOTAU, - .cpu_user_features = COMMON_USER | - PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE, - .mmu_features = MMU_FTR_HPTE_TABLE, - .icache_bsize = 32, - .dcache_bsize = 32, - .num_pmcs = 4, - .pmc_type = PPC_PMC_G4, - .cpu_setup = __setup_cpu_7400, - .machine_check = machine_check_generic, - .platform = "ppc7400", - }, - { /* 7400 */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x000c0000, - .cpu_name = "7400", - .cpu_features = CPU_FTRS_7400, - .cpu_user_features = COMMON_USER | - PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE, - .mmu_features = MMU_FTR_HPTE_TABLE, - .icache_bsize = 32, - .dcache_bsize = 32, - .num_pmcs = 4, - .pmc_type = PPC_PMC_G4, - .cpu_setup = __setup_cpu_7400, - .machine_check = machine_check_generic, - .platform = "ppc7400", - }, - { /* 7410 */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x800c0000, - .cpu_name = "7410", - .cpu_features = CPU_FTRS_7400, - .cpu_user_features = COMMON_USER | - PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE, - .mmu_features = MMU_FTR_HPTE_TABLE, - .icache_bsize = 32, - .dcache_bsize = 32, - .num_pmcs = 4, - .pmc_type = PPC_PMC_G4, - .cpu_setup = __setup_cpu_7410, - .machine_check = machine_check_generic, - .platform = "ppc7400", - }, - { /* 7450 2.0 - no doze/nap */ - .pvr_mask = 0xffffffff, - .pvr_value = 0x80000200, - .cpu_name = "7450", - .cpu_features = CPU_FTRS_7450_20, - .cpu_user_features = COMMON_USER | - PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE, - .mmu_features = MMU_FTR_HPTE_TABLE, - .icache_bsize = 32, - .dcache_bsize = 32, - .num_pmcs = 6, - .pmc_type = PPC_PMC_G4, - .cpu_setup = __setup_cpu_745x, - .oprofile_cpu_type = "ppc/7450", - .oprofile_type = PPC_OPROFILE_G4, - .machine_check = machine_check_generic, - .platform = "ppc7450", - }, - { /* 7450 2.1 */ - .pvr_mask = 0xffffffff, - .pvr_value = 0x80000201, - .cpu_name = "7450", - .cpu_features = CPU_FTRS_7450_21, - .cpu_user_features = COMMON_USER | - PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE, - .mmu_features = MMU_FTR_HPTE_TABLE, - .icache_bsize = 32, - .dcache_bsize = 32, - .num_pmcs = 6, - .pmc_type = PPC_PMC_G4, - .cpu_setup = __setup_cpu_745x, - .oprofile_cpu_type = "ppc/7450", - .oprofile_type = PPC_OPROFILE_G4, - .machine_check = machine_check_generic, - .platform = "ppc7450", - }, - { /* 7450 2.3 and newer */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x80000000, - .cpu_name = "7450", - .cpu_features = CPU_FTRS_7450_23, - .cpu_user_features = COMMON_USER | - PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE, - .mmu_features = MMU_FTR_HPTE_TABLE, - .icache_bsize = 32, - .dcache_bsize = 32, - .num_pmcs = 6, - .pmc_type = PPC_PMC_G4, - .cpu_setup = __setup_cpu_745x, - .oprofile_cpu_type = "ppc/7450", - .oprofile_type = PPC_OPROFILE_G4, - .machine_check = machine_check_generic, - .platform = "ppc7450", - }, - { /* 7455 rev 1.x */ - .pvr_mask = 0xffffff00, - .pvr_value = 0x80010100, - .cpu_name = "7455", - .cpu_features = CPU_FTRS_7455_1, - .cpu_user_features = COMMON_USER | - PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE, - .mmu_features = MMU_FTR_HPTE_TABLE | MMU_FTR_USE_HIGH_BATS, - .icache_bsize = 32, - .dcache_bsize = 32, - .num_pmcs = 6, - .pmc_type = PPC_PMC_G4, - .cpu_setup = __setup_cpu_745x, - .oprofile_cpu_type = "ppc/7450", - .oprofile_type = PPC_OPROFILE_G4, - .machine_check = machine_check_generic, - .platform = "ppc7450", - }, - { /* 7455 rev 2.0 */ - .pvr_mask = 0xffffffff, - .pvr_value = 0x80010200, - .cpu_name = "7455", - .cpu_features = CPU_FTRS_7455_20, - .cpu_user_features = COMMON_USER | - PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE, - .mmu_features = MMU_FTR_HPTE_TABLE | MMU_FTR_USE_HIGH_BATS, - .icache_bsize = 32, - .dcache_bsize = 32, - .num_pmcs = 6, - .pmc_type = PPC_PMC_G4, - .cpu_setup = __setup_cpu_745x, - .oprofile_cpu_type = "ppc/7450", - .oprofile_type = PPC_OPROFILE_G4, - .machine_check = machine_check_generic, - .platform = "ppc7450", - }, - { /* 7455 others */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x80010000, - .cpu_name = "7455", - .cpu_features = CPU_FTRS_7455, - .cpu_user_features = COMMON_USER | - PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE, - .mmu_features = MMU_FTR_HPTE_TABLE | MMU_FTR_USE_HIGH_BATS, - .icache_bsize = 32, - .dcache_bsize = 32, - .num_pmcs = 6, - .pmc_type = PPC_PMC_G4, - .cpu_setup = __setup_cpu_745x, - .oprofile_cpu_type = "ppc/7450", - .oprofile_type = PPC_OPROFILE_G4, - .machine_check = machine_check_generic, - .platform = "ppc7450", - }, - { /* 7447/7457 Rev 1.0 */ - .pvr_mask = 0xffffffff, - .pvr_value = 0x80020100, - .cpu_name = "7447/7457", - .cpu_features = CPU_FTRS_7447_10, - .cpu_user_features = COMMON_USER | - PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE, - .mmu_features = MMU_FTR_HPTE_TABLE | MMU_FTR_USE_HIGH_BATS, - .icache_bsize = 32, - .dcache_bsize = 32, - .num_pmcs = 6, - .pmc_type = PPC_PMC_G4, - .cpu_setup = __setup_cpu_745x, - .oprofile_cpu_type = "ppc/7450", - .oprofile_type = PPC_OPROFILE_G4, - .machine_check = machine_check_generic, - .platform = "ppc7450", - }, - { /* 7447/7457 Rev 1.1 */ - .pvr_mask = 0xffffffff, - .pvr_value = 0x80020101, - .cpu_name = "7447/7457", - .cpu_features = CPU_FTRS_7447_10, - .cpu_user_features = COMMON_USER | - PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE, - .mmu_features = MMU_FTR_HPTE_TABLE | MMU_FTR_USE_HIGH_BATS, - .icache_bsize = 32, - .dcache_bsize = 32, - .num_pmcs = 6, - .pmc_type = PPC_PMC_G4, - .cpu_setup = __setup_cpu_745x, - .oprofile_cpu_type = "ppc/7450", - .oprofile_type = PPC_OPROFILE_G4, - .machine_check = machine_check_generic, - .platform = "ppc7450", - }, - { /* 7447/7457 Rev 1.2 and later */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x80020000, - .cpu_name = "7447/7457", - .cpu_features = CPU_FTRS_7447, - .cpu_user_features = COMMON_USER | PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE, - .mmu_features = MMU_FTR_HPTE_TABLE | MMU_FTR_USE_HIGH_BATS, - .icache_bsize = 32, - .dcache_bsize = 32, - .num_pmcs = 6, - .pmc_type = PPC_PMC_G4, - .cpu_setup = __setup_cpu_745x, - .oprofile_cpu_type = "ppc/7450", - .oprofile_type = PPC_OPROFILE_G4, - .machine_check = machine_check_generic, - .platform = "ppc7450", - }, - { /* 7447A */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x80030000, - .cpu_name = "7447A", - .cpu_features = CPU_FTRS_7447A, - .cpu_user_features = COMMON_USER | - PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE, - .mmu_features = MMU_FTR_HPTE_TABLE | MMU_FTR_USE_HIGH_BATS, - .icache_bsize = 32, - .dcache_bsize = 32, - .num_pmcs = 6, - .pmc_type = PPC_PMC_G4, - .cpu_setup = __setup_cpu_745x, - .oprofile_cpu_type = "ppc/7450", - .oprofile_type = PPC_OPROFILE_G4, - .machine_check = machine_check_generic, - .platform = "ppc7450", - }, - { /* 7448 */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x80040000, - .cpu_name = "7448", - .cpu_features = CPU_FTRS_7448, - .cpu_user_features = COMMON_USER | - PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE, - .mmu_features = MMU_FTR_HPTE_TABLE | MMU_FTR_USE_HIGH_BATS, - .icache_bsize = 32, - .dcache_bsize = 32, - .num_pmcs = 6, - .pmc_type = PPC_PMC_G4, - .cpu_setup = __setup_cpu_745x, - .oprofile_cpu_type = "ppc/7450", - .oprofile_type = PPC_OPROFILE_G4, - .machine_check = machine_check_generic, - .platform = "ppc7450", - }, - { /* 82xx (8240, 8245, 8260 are all 603e cores) */ - .pvr_mask = 0x7fff0000, - .pvr_value = 0x00810000, - .cpu_name = "82xx", - .cpu_features = CPU_FTRS_82XX, - .cpu_user_features = COMMON_USER, - .mmu_features = 0, - .icache_bsize = 32, - .dcache_bsize = 32, - .cpu_setup = __setup_cpu_603, - .machine_check = machine_check_generic, - .platform = "ppc603", - }, - { /* All G2_LE (603e core, plus some) have the same pvr */ - .pvr_mask = 0x7fff0000, - .pvr_value = 0x00820000, - .cpu_name = "G2_LE", - .cpu_features = CPU_FTRS_G2_LE, - .cpu_user_features = COMMON_USER, - .mmu_features = MMU_FTR_USE_HIGH_BATS, - .icache_bsize = 32, - .dcache_bsize = 32, - .cpu_setup = __setup_cpu_603, - .machine_check = machine_check_generic, - .platform = "ppc603", - }, -#ifdef CONFIG_PPC_83xx - { /* e300c1 (a 603e core, plus some) on 83xx */ - .pvr_mask = 0x7fff0000, - .pvr_value = 0x00830000, - .cpu_name = "e300c1", - .cpu_features = CPU_FTRS_E300, - .cpu_user_features = COMMON_USER, - .mmu_features = MMU_FTR_USE_HIGH_BATS, - .icache_bsize = 32, - .dcache_bsize = 32, - .cpu_setup = __setup_cpu_603, - .machine_check = machine_check_83xx, - .platform = "ppc603", - }, - { /* e300c2 (an e300c1 core, plus some, minus FPU) on 83xx */ - .pvr_mask = 0x7fff0000, - .pvr_value = 0x00840000, - .cpu_name = "e300c2", - .cpu_features = CPU_FTRS_E300C2, - .cpu_user_features = PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU, - .mmu_features = MMU_FTR_USE_HIGH_BATS | - MMU_FTR_NEED_DTLB_SW_LRU, - .icache_bsize = 32, - .dcache_bsize = 32, - .cpu_setup = __setup_cpu_603, - .machine_check = machine_check_83xx, - .platform = "ppc603", - }, - { /* e300c3 (e300c1, plus one IU, half cache size) on 83xx */ - .pvr_mask = 0x7fff0000, - .pvr_value = 0x00850000, - .cpu_name = "e300c3", - .cpu_features = CPU_FTRS_E300, - .cpu_user_features = COMMON_USER, - .mmu_features = MMU_FTR_USE_HIGH_BATS | - MMU_FTR_NEED_DTLB_SW_LRU, - .icache_bsize = 32, - .dcache_bsize = 32, - .cpu_setup = __setup_cpu_603, - .machine_check = machine_check_83xx, - .num_pmcs = 4, - .oprofile_cpu_type = "ppc/e300", - .oprofile_type = PPC_OPROFILE_FSL_EMB, - .platform = "ppc603", - }, - { /* e300c4 (e300c1, plus one IU) */ - .pvr_mask = 0x7fff0000, - .pvr_value = 0x00860000, - .cpu_name = "e300c4", - .cpu_features = CPU_FTRS_E300, - .cpu_user_features = COMMON_USER, - .mmu_features = MMU_FTR_USE_HIGH_BATS | - MMU_FTR_NEED_DTLB_SW_LRU, - .icache_bsize = 32, - .dcache_bsize = 32, - .cpu_setup = __setup_cpu_603, - .machine_check = machine_check_83xx, - .num_pmcs = 4, - .oprofile_cpu_type = "ppc/e300", - .oprofile_type = PPC_OPROFILE_FSL_EMB, - .platform = "ppc603", - }, -#endif - { /* default match, we assume split I/D cache & TB (non-601)... */ - .pvr_mask = 0x00000000, - .pvr_value = 0x00000000, - .cpu_name = "(generic PPC)", - .cpu_features = CPU_FTRS_CLASSIC32, - .cpu_user_features = COMMON_USER, - .mmu_features = MMU_FTR_HPTE_TABLE, - .icache_bsize = 32, - .dcache_bsize = 32, - .machine_check = machine_check_generic, - .platform = "ppc603", - }, -#endif /* CONFIG_PPC_BOOK3S_6xx */ -#ifdef CONFIG_PPC_8xx - { /* 8xx */ - .pvr_mask = 0xffff0000, - .pvr_value = PVR_8xx, - .cpu_name = "8xx", - /* CPU_FTR_MAYBE_CAN_DOZE is possible, - * if the 8xx code is there.... */ - .cpu_features = CPU_FTRS_8XX, - .cpu_user_features = PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU, - .mmu_features = MMU_FTR_TYPE_8xx, - .icache_bsize = 16, - .dcache_bsize = 16, - .machine_check = machine_check_8xx, - .platform = "ppc823", - }, -#endif /* CONFIG_PPC_8xx */ -#ifdef CONFIG_40x - { /* 403GC */ - .pvr_mask = 0xffffff00, - .pvr_value = 0x00200200, - .cpu_name = "403GC", - .cpu_features = CPU_FTRS_40X, - .cpu_user_features = PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU, - .mmu_features = MMU_FTR_TYPE_40x, - .icache_bsize = 16, - .dcache_bsize = 16, - .machine_check = machine_check_4xx, - .platform = "ppc403", - }, - { /* 403GCX */ - .pvr_mask = 0xffffff00, - .pvr_value = 0x00201400, - .cpu_name = "403GCX", - .cpu_features = CPU_FTRS_40X, - .cpu_user_features = PPC_FEATURE_32 | - PPC_FEATURE_HAS_MMU | PPC_FEATURE_NO_TB, - .mmu_features = MMU_FTR_TYPE_40x, - .icache_bsize = 16, - .dcache_bsize = 16, - .machine_check = machine_check_4xx, - .platform = "ppc403", - }, - { /* 403G ?? */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x00200000, - .cpu_name = "403G ??", - .cpu_features = CPU_FTRS_40X, - .cpu_user_features = PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU, - .mmu_features = MMU_FTR_TYPE_40x, - .icache_bsize = 16, - .dcache_bsize = 16, - .machine_check = machine_check_4xx, - .platform = "ppc403", - }, - { /* 405GP */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x40110000, - .cpu_name = "405GP", - .cpu_features = CPU_FTRS_40X, - .cpu_user_features = PPC_FEATURE_32 | - PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC, - .mmu_features = MMU_FTR_TYPE_40x, - .icache_bsize = 32, - .dcache_bsize = 32, - .machine_check = machine_check_4xx, - .platform = "ppc405", - }, - { /* STB 03xxx */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x40130000, - .cpu_name = "STB03xxx", - .cpu_features = CPU_FTRS_40X, - .cpu_user_features = PPC_FEATURE_32 | - PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC, - .mmu_features = MMU_FTR_TYPE_40x, - .icache_bsize = 32, - .dcache_bsize = 32, - .machine_check = machine_check_4xx, - .platform = "ppc405", - }, - { /* STB 04xxx */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x41810000, - .cpu_name = "STB04xxx", - .cpu_features = CPU_FTRS_40X, - .cpu_user_features = PPC_FEATURE_32 | - PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC, - .mmu_features = MMU_FTR_TYPE_40x, - .icache_bsize = 32, - .dcache_bsize = 32, - .machine_check = machine_check_4xx, - .platform = "ppc405", - }, - { /* NP405L */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x41610000, - .cpu_name = "NP405L", - .cpu_features = CPU_FTRS_40X, - .cpu_user_features = PPC_FEATURE_32 | - PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC, - .mmu_features = MMU_FTR_TYPE_40x, - .icache_bsize = 32, - .dcache_bsize = 32, - .machine_check = machine_check_4xx, - .platform = "ppc405", - }, - { /* NP4GS3 */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x40B10000, - .cpu_name = "NP4GS3", - .cpu_features = CPU_FTRS_40X, - .cpu_user_features = PPC_FEATURE_32 | - PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC, - .mmu_features = MMU_FTR_TYPE_40x, - .icache_bsize = 32, - .dcache_bsize = 32, - .machine_check = machine_check_4xx, - .platform = "ppc405", - }, - { /* NP405H */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x41410000, - .cpu_name = "NP405H", - .cpu_features = CPU_FTRS_40X, - .cpu_user_features = PPC_FEATURE_32 | - PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC, - .mmu_features = MMU_FTR_TYPE_40x, - .icache_bsize = 32, - .dcache_bsize = 32, - .machine_check = machine_check_4xx, - .platform = "ppc405", - }, - { /* 405GPr */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x50910000, - .cpu_name = "405GPr", - .cpu_features = CPU_FTRS_40X, - .cpu_user_features = PPC_FEATURE_32 | - PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC, - .mmu_features = MMU_FTR_TYPE_40x, - .icache_bsize = 32, - .dcache_bsize = 32, - .machine_check = machine_check_4xx, - .platform = "ppc405", - }, - { /* STBx25xx */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x51510000, - .cpu_name = "STBx25xx", - .cpu_features = CPU_FTRS_40X, - .cpu_user_features = PPC_FEATURE_32 | - PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC, - .mmu_features = MMU_FTR_TYPE_40x, - .icache_bsize = 32, - .dcache_bsize = 32, - .machine_check = machine_check_4xx, - .platform = "ppc405", - }, - { /* 405LP */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x41F10000, - .cpu_name = "405LP", - .cpu_features = CPU_FTRS_40X, - .cpu_user_features = PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU, - .mmu_features = MMU_FTR_TYPE_40x, - .icache_bsize = 32, - .dcache_bsize = 32, - .machine_check = machine_check_4xx, - .platform = "ppc405", - }, - { /* Xilinx Virtex-II Pro */ - .pvr_mask = 0xfffff000, - .pvr_value = 0x20010000, - .cpu_name = "Virtex-II Pro", - .cpu_features = CPU_FTRS_40X, - .cpu_user_features = PPC_FEATURE_32 | - PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC, - .mmu_features = MMU_FTR_TYPE_40x, - .icache_bsize = 32, - .dcache_bsize = 32, - .machine_check = machine_check_4xx, - .platform = "ppc405", - }, - { /* Xilinx Virtex-4 FX */ - .pvr_mask = 0xfffff000, - .pvr_value = 0x20011000, - .cpu_name = "Virtex-4 FX", - .cpu_features = CPU_FTRS_40X, - .cpu_user_features = PPC_FEATURE_32 | - PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC, - .mmu_features = MMU_FTR_TYPE_40x, - .icache_bsize = 32, - .dcache_bsize = 32, - .machine_check = machine_check_4xx, - .platform = "ppc405", - }, - { /* 405EP */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x51210000, - .cpu_name = "405EP", - .cpu_features = CPU_FTRS_40X, - .cpu_user_features = PPC_FEATURE_32 | - PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC, - .mmu_features = MMU_FTR_TYPE_40x, - .icache_bsize = 32, - .dcache_bsize = 32, - .machine_check = machine_check_4xx, - .platform = "ppc405", - }, - { /* 405EX Rev. A/B with Security */ - .pvr_mask = 0xffff000f, - .pvr_value = 0x12910007, - .cpu_name = "405EX Rev. A/B", - .cpu_features = CPU_FTRS_40X, - .cpu_user_features = PPC_FEATURE_32 | - PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC, - .mmu_features = MMU_FTR_TYPE_40x, - .icache_bsize = 32, - .dcache_bsize = 32, - .machine_check = machine_check_4xx, - .platform = "ppc405", - }, - { /* 405EX Rev. C without Security */ - .pvr_mask = 0xffff000f, - .pvr_value = 0x1291000d, - .cpu_name = "405EX Rev. C", - .cpu_features = CPU_FTRS_40X, - .cpu_user_features = PPC_FEATURE_32 | - PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC, - .mmu_features = MMU_FTR_TYPE_40x, - .icache_bsize = 32, - .dcache_bsize = 32, - .machine_check = machine_check_4xx, - .platform = "ppc405", - }, - { /* 405EX Rev. C with Security */ - .pvr_mask = 0xffff000f, - .pvr_value = 0x1291000f, - .cpu_name = "405EX Rev. C", - .cpu_features = CPU_FTRS_40X, - .cpu_user_features = PPC_FEATURE_32 | - PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC, - .mmu_features = MMU_FTR_TYPE_40x, - .icache_bsize = 32, - .dcache_bsize = 32, - .machine_check = machine_check_4xx, - .platform = "ppc405", - }, - { /* 405EX Rev. D without Security */ - .pvr_mask = 0xffff000f, - .pvr_value = 0x12910003, - .cpu_name = "405EX Rev. D", - .cpu_features = CPU_FTRS_40X, - .cpu_user_features = PPC_FEATURE_32 | - PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC, - .mmu_features = MMU_FTR_TYPE_40x, - .icache_bsize = 32, - .dcache_bsize = 32, - .machine_check = machine_check_4xx, - .platform = "ppc405", - }, - { /* 405EX Rev. D with Security */ - .pvr_mask = 0xffff000f, - .pvr_value = 0x12910005, - .cpu_name = "405EX Rev. D", - .cpu_features = CPU_FTRS_40X, - .cpu_user_features = PPC_FEATURE_32 | - PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC, - .mmu_features = MMU_FTR_TYPE_40x, - .icache_bsize = 32, - .dcache_bsize = 32, - .machine_check = machine_check_4xx, - .platform = "ppc405", - }, - { /* 405EXr Rev. A/B without Security */ - .pvr_mask = 0xffff000f, - .pvr_value = 0x12910001, - .cpu_name = "405EXr Rev. A/B", - .cpu_features = CPU_FTRS_40X, - .cpu_user_features = PPC_FEATURE_32 | - PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC, - .mmu_features = MMU_FTR_TYPE_40x, - .icache_bsize = 32, - .dcache_bsize = 32, - .machine_check = machine_check_4xx, - .platform = "ppc405", - }, - { /* 405EXr Rev. C without Security */ - .pvr_mask = 0xffff000f, - .pvr_value = 0x12910009, - .cpu_name = "405EXr Rev. C", - .cpu_features = CPU_FTRS_40X, - .cpu_user_features = PPC_FEATURE_32 | - PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC, - .mmu_features = MMU_FTR_TYPE_40x, - .icache_bsize = 32, - .dcache_bsize = 32, - .machine_check = machine_check_4xx, - .platform = "ppc405", - }, - { /* 405EXr Rev. C with Security */ - .pvr_mask = 0xffff000f, - .pvr_value = 0x1291000b, - .cpu_name = "405EXr Rev. C", - .cpu_features = CPU_FTRS_40X, - .cpu_user_features = PPC_FEATURE_32 | - PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC, - .mmu_features = MMU_FTR_TYPE_40x, - .icache_bsize = 32, - .dcache_bsize = 32, - .machine_check = machine_check_4xx, - .platform = "ppc405", - }, - { /* 405EXr Rev. D without Security */ - .pvr_mask = 0xffff000f, - .pvr_value = 0x12910000, - .cpu_name = "405EXr Rev. D", - .cpu_features = CPU_FTRS_40X, - .cpu_user_features = PPC_FEATURE_32 | - PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC, - .mmu_features = MMU_FTR_TYPE_40x, - .icache_bsize = 32, - .dcache_bsize = 32, - .machine_check = machine_check_4xx, - .platform = "ppc405", - }, - { /* 405EXr Rev. D with Security */ - .pvr_mask = 0xffff000f, - .pvr_value = 0x12910002, - .cpu_name = "405EXr Rev. D", - .cpu_features = CPU_FTRS_40X, - .cpu_user_features = PPC_FEATURE_32 | - PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC, - .mmu_features = MMU_FTR_TYPE_40x, - .icache_bsize = 32, - .dcache_bsize = 32, - .machine_check = machine_check_4xx, - .platform = "ppc405", - }, - { - /* 405EZ */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x41510000, - .cpu_name = "405EZ", - .cpu_features = CPU_FTRS_40X, - .cpu_user_features = PPC_FEATURE_32 | - PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC, - .mmu_features = MMU_FTR_TYPE_40x, - .icache_bsize = 32, - .dcache_bsize = 32, - .machine_check = machine_check_4xx, - .platform = "ppc405", - }, - { /* APM8018X */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x7ff11432, - .cpu_name = "APM8018X", - .cpu_features = CPU_FTRS_40X, - .cpu_user_features = PPC_FEATURE_32 | - PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC, - .mmu_features = MMU_FTR_TYPE_40x, - .icache_bsize = 32, - .dcache_bsize = 32, - .machine_check = machine_check_4xx, - .platform = "ppc405", - }, - { /* default match */ - .pvr_mask = 0x00000000, - .pvr_value = 0x00000000, - .cpu_name = "(generic 40x PPC)", - .cpu_features = CPU_FTRS_40X, - .cpu_user_features = PPC_FEATURE_32 | - PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC, - .mmu_features = MMU_FTR_TYPE_40x, - .icache_bsize = 32, - .dcache_bsize = 32, - .machine_check = machine_check_4xx, - .platform = "ppc405", - } - -#endif /* CONFIG_40x */ -#ifdef CONFIG_44x - { - .pvr_mask = 0xf0000fff, - .pvr_value = 0x40000850, - .cpu_name = "440GR Rev. A", - .cpu_features = CPU_FTRS_44X, - .cpu_user_features = COMMON_USER_BOOKE, - .mmu_features = MMU_FTR_TYPE_44x, - .icache_bsize = 32, - .dcache_bsize = 32, - .machine_check = machine_check_4xx, - .platform = "ppc440", - }, - { /* Use logical PVR for 440EP (logical pvr = pvr | 0x8) */ - .pvr_mask = 0xf0000fff, - .pvr_value = 0x40000858, - .cpu_name = "440EP Rev. A", - .cpu_features = CPU_FTRS_44X, - .cpu_user_features = COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU, - .mmu_features = MMU_FTR_TYPE_44x, - .icache_bsize = 32, - .dcache_bsize = 32, - .cpu_setup = __setup_cpu_440ep, - .machine_check = machine_check_4xx, - .platform = "ppc440", - }, - { - .pvr_mask = 0xf0000fff, - .pvr_value = 0x400008d3, - .cpu_name = "440GR Rev. B", - .cpu_features = CPU_FTRS_44X, - .cpu_user_features = COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU, - .mmu_features = MMU_FTR_TYPE_44x, - .icache_bsize = 32, - .dcache_bsize = 32, - .machine_check = machine_check_4xx, - .platform = "ppc440", - }, - { /* Matches both physical and logical PVR for 440EP (logical pvr = pvr | 0x8) */ - .pvr_mask = 0xf0000ff7, - .pvr_value = 0x400008d4, - .cpu_name = "440EP Rev. C", - .cpu_features = CPU_FTRS_44X, - .cpu_user_features = COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU, - .mmu_features = MMU_FTR_TYPE_44x, - .icache_bsize = 32, - .dcache_bsize = 32, - .cpu_setup = __setup_cpu_440ep, - .machine_check = machine_check_4xx, - .platform = "ppc440", - }, - { /* Use logical PVR for 440EP (logical pvr = pvr | 0x8) */ - .pvr_mask = 0xf0000fff, - .pvr_value = 0x400008db, - .cpu_name = "440EP Rev. B", - .cpu_features = CPU_FTRS_44X, - .cpu_user_features = COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU, - .mmu_features = MMU_FTR_TYPE_44x, - .icache_bsize = 32, - .dcache_bsize = 32, - .cpu_setup = __setup_cpu_440ep, - .machine_check = machine_check_4xx, - .platform = "ppc440", - }, - { /* 440GRX */ - .pvr_mask = 0xf0000ffb, - .pvr_value = 0x200008D0, - .cpu_name = "440GRX", - .cpu_features = CPU_FTRS_44X, - .cpu_user_features = COMMON_USER_BOOKE, - .mmu_features = MMU_FTR_TYPE_44x, - .icache_bsize = 32, - .dcache_bsize = 32, - .cpu_setup = __setup_cpu_440grx, - .machine_check = machine_check_440A, - .platform = "ppc440", - }, - { /* Use logical PVR for 440EPx (logical pvr = pvr | 0x8) */ - .pvr_mask = 0xf0000ffb, - .pvr_value = 0x200008D8, - .cpu_name = "440EPX", - .cpu_features = CPU_FTRS_44X, - .cpu_user_features = COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU, - .mmu_features = MMU_FTR_TYPE_44x, - .icache_bsize = 32, - .dcache_bsize = 32, - .cpu_setup = __setup_cpu_440epx, - .machine_check = machine_check_440A, - .platform = "ppc440", - }, - { /* 440GP Rev. B */ - .pvr_mask = 0xf0000fff, - .pvr_value = 0x40000440, - .cpu_name = "440GP Rev. B", - .cpu_features = CPU_FTRS_44X, - .cpu_user_features = COMMON_USER_BOOKE, - .mmu_features = MMU_FTR_TYPE_44x, - .icache_bsize = 32, - .dcache_bsize = 32, - .machine_check = machine_check_4xx, - .platform = "ppc440gp", - }, - { /* 440GP Rev. C */ - .pvr_mask = 0xf0000fff, - .pvr_value = 0x40000481, - .cpu_name = "440GP Rev. C", - .cpu_features = CPU_FTRS_44X, - .cpu_user_features = COMMON_USER_BOOKE, - .mmu_features = MMU_FTR_TYPE_44x, - .icache_bsize = 32, - .dcache_bsize = 32, - .machine_check = machine_check_4xx, - .platform = "ppc440gp", - }, - { /* 440GX Rev. A */ - .pvr_mask = 0xf0000fff, - .pvr_value = 0x50000850, - .cpu_name = "440GX Rev. A", - .cpu_features = CPU_FTRS_44X, - .cpu_user_features = COMMON_USER_BOOKE, - .mmu_features = MMU_FTR_TYPE_44x, - .icache_bsize = 32, - .dcache_bsize = 32, - .cpu_setup = __setup_cpu_440gx, - .machine_check = machine_check_440A, - .platform = "ppc440", - }, - { /* 440GX Rev. B */ - .pvr_mask = 0xf0000fff, - .pvr_value = 0x50000851, - .cpu_name = "440GX Rev. B", - .cpu_features = CPU_FTRS_44X, - .cpu_user_features = COMMON_USER_BOOKE, - .mmu_features = MMU_FTR_TYPE_44x, - .icache_bsize = 32, - .dcache_bsize = 32, - .cpu_setup = __setup_cpu_440gx, - .machine_check = machine_check_440A, - .platform = "ppc440", - }, - { /* 440GX Rev. C */ - .pvr_mask = 0xf0000fff, - .pvr_value = 0x50000892, - .cpu_name = "440GX Rev. C", - .cpu_features = CPU_FTRS_44X, - .cpu_user_features = COMMON_USER_BOOKE, - .mmu_features = MMU_FTR_TYPE_44x, - .icache_bsize = 32, - .dcache_bsize = 32, - .cpu_setup = __setup_cpu_440gx, - .machine_check = machine_check_440A, - .platform = "ppc440", - }, - { /* 440GX Rev. F */ - .pvr_mask = 0xf0000fff, - .pvr_value = 0x50000894, - .cpu_name = "440GX Rev. F", - .cpu_features = CPU_FTRS_44X, - .cpu_user_features = COMMON_USER_BOOKE, - .mmu_features = MMU_FTR_TYPE_44x, - .icache_bsize = 32, - .dcache_bsize = 32, - .cpu_setup = __setup_cpu_440gx, - .machine_check = machine_check_440A, - .platform = "ppc440", - }, - { /* 440SP Rev. A */ - .pvr_mask = 0xfff00fff, - .pvr_value = 0x53200891, - .cpu_name = "440SP Rev. A", - .cpu_features = CPU_FTRS_44X, - .cpu_user_features = COMMON_USER_BOOKE, - .mmu_features = MMU_FTR_TYPE_44x, - .icache_bsize = 32, - .dcache_bsize = 32, - .machine_check = machine_check_4xx, - .platform = "ppc440", - }, - { /* 440SPe Rev. A */ - .pvr_mask = 0xfff00fff, - .pvr_value = 0x53400890, - .cpu_name = "440SPe Rev. A", - .cpu_features = CPU_FTRS_44X, - .cpu_user_features = COMMON_USER_BOOKE, - .mmu_features = MMU_FTR_TYPE_44x, - .icache_bsize = 32, - .dcache_bsize = 32, - .cpu_setup = __setup_cpu_440spe, - .machine_check = machine_check_440A, - .platform = "ppc440", - }, - { /* 440SPe Rev. B */ - .pvr_mask = 0xfff00fff, - .pvr_value = 0x53400891, - .cpu_name = "440SPe Rev. B", - .cpu_features = CPU_FTRS_44X, - .cpu_user_features = COMMON_USER_BOOKE, - .mmu_features = MMU_FTR_TYPE_44x, - .icache_bsize = 32, - .dcache_bsize = 32, - .cpu_setup = __setup_cpu_440spe, - .machine_check = machine_check_440A, - .platform = "ppc440", - }, - { /* 440 in Xilinx Virtex-5 FXT */ - .pvr_mask = 0xfffffff0, - .pvr_value = 0x7ff21910, - .cpu_name = "440 in Virtex-5 FXT", - .cpu_features = CPU_FTRS_44X, - .cpu_user_features = COMMON_USER_BOOKE, - .mmu_features = MMU_FTR_TYPE_44x, - .icache_bsize = 32, - .dcache_bsize = 32, - .cpu_setup = __setup_cpu_440x5, - .machine_check = machine_check_440A, - .platform = "ppc440", - }, - { /* 460EX */ - .pvr_mask = 0xffff0006, - .pvr_value = 0x13020002, - .cpu_name = "460EX", - .cpu_features = CPU_FTRS_440x6, - .cpu_user_features = COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU, - .mmu_features = MMU_FTR_TYPE_44x, - .icache_bsize = 32, - .dcache_bsize = 32, - .cpu_setup = __setup_cpu_460ex, - .machine_check = machine_check_440A, - .platform = "ppc440", - }, - { /* 460EX Rev B */ - .pvr_mask = 0xffff0007, - .pvr_value = 0x13020004, - .cpu_name = "460EX Rev. B", - .cpu_features = CPU_FTRS_440x6, - .cpu_user_features = COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU, - .mmu_features = MMU_FTR_TYPE_44x, - .icache_bsize = 32, - .dcache_bsize = 32, - .cpu_setup = __setup_cpu_460ex, - .machine_check = machine_check_440A, - .platform = "ppc440", - }, - { /* 460GT */ - .pvr_mask = 0xffff0006, - .pvr_value = 0x13020000, - .cpu_name = "460GT", - .cpu_features = CPU_FTRS_440x6, - .cpu_user_features = COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU, - .mmu_features = MMU_FTR_TYPE_44x, - .icache_bsize = 32, - .dcache_bsize = 32, - .cpu_setup = __setup_cpu_460gt, - .machine_check = machine_check_440A, - .platform = "ppc440", - }, - { /* 460GT Rev B */ - .pvr_mask = 0xffff0007, - .pvr_value = 0x13020005, - .cpu_name = "460GT Rev. B", - .cpu_features = CPU_FTRS_440x6, - .cpu_user_features = COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU, - .mmu_features = MMU_FTR_TYPE_44x, - .icache_bsize = 32, - .dcache_bsize = 32, - .cpu_setup = __setup_cpu_460gt, - .machine_check = machine_check_440A, - .platform = "ppc440", - }, - { /* 460SX */ - .pvr_mask = 0xffffff00, - .pvr_value = 0x13541800, - .cpu_name = "460SX", - .cpu_features = CPU_FTRS_44X, - .cpu_user_features = COMMON_USER_BOOKE, - .mmu_features = MMU_FTR_TYPE_44x, - .icache_bsize = 32, - .dcache_bsize = 32, - .cpu_setup = __setup_cpu_460sx, - .machine_check = machine_check_440A, - .platform = "ppc440", - }, - { /* 464 in APM821xx */ - .pvr_mask = 0xfffffff0, - .pvr_value = 0x12C41C80, - .cpu_name = "APM821XX", - .cpu_features = CPU_FTRS_44X, - .cpu_user_features = COMMON_USER_BOOKE | - PPC_FEATURE_HAS_FPU, - .mmu_features = MMU_FTR_TYPE_44x, - .icache_bsize = 32, - .dcache_bsize = 32, - .cpu_setup = __setup_cpu_apm821xx, - .machine_check = machine_check_440A, - .platform = "ppc440", - }, -#ifdef CONFIG_PPC_47x - { /* 476 DD2 core */ - .pvr_mask = 0xffffffff, - .pvr_value = 0x11a52080, - .cpu_name = "476", - .cpu_features = CPU_FTRS_47X | CPU_FTR_476_DD2, - .cpu_user_features = COMMON_USER_BOOKE | - PPC_FEATURE_HAS_FPU, - .mmu_features = MMU_FTR_TYPE_47x | - MMU_FTR_USE_TLBIVAX_BCAST | MMU_FTR_LOCK_BCAST_INVAL, - .icache_bsize = 32, - .dcache_bsize = 128, - .machine_check = machine_check_47x, - .platform = "ppc470", - }, - { /* 476fpe */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x7ff50000, - .cpu_name = "476fpe", - .cpu_features = CPU_FTRS_47X | CPU_FTR_476_DD2, - .cpu_user_features = COMMON_USER_BOOKE | - PPC_FEATURE_HAS_FPU, - .mmu_features = MMU_FTR_TYPE_47x | - MMU_FTR_USE_TLBIVAX_BCAST | MMU_FTR_LOCK_BCAST_INVAL, - .icache_bsize = 32, - .dcache_bsize = 128, - .machine_check = machine_check_47x, - .platform = "ppc470", - }, - { /* 476 iss */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x00050000, - .cpu_name = "476", - .cpu_features = CPU_FTRS_47X, - .cpu_user_features = COMMON_USER_BOOKE | - PPC_FEATURE_HAS_FPU, - .mmu_features = MMU_FTR_TYPE_47x | - MMU_FTR_USE_TLBIVAX_BCAST | MMU_FTR_LOCK_BCAST_INVAL, - .icache_bsize = 32, - .dcache_bsize = 128, - .machine_check = machine_check_47x, - .platform = "ppc470", - }, - { /* 476 others */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x11a50000, - .cpu_name = "476", - .cpu_features = CPU_FTRS_47X, - .cpu_user_features = COMMON_USER_BOOKE | - PPC_FEATURE_HAS_FPU, - .mmu_features = MMU_FTR_TYPE_47x | - MMU_FTR_USE_TLBIVAX_BCAST | MMU_FTR_LOCK_BCAST_INVAL, - .icache_bsize = 32, - .dcache_bsize = 128, - .machine_check = machine_check_47x, - .platform = "ppc470", - }, -#endif /* CONFIG_PPC_47x */ - { /* default match */ - .pvr_mask = 0x00000000, - .pvr_value = 0x00000000, - .cpu_name = "(generic 44x PPC)", - .cpu_features = CPU_FTRS_44X, - .cpu_user_features = COMMON_USER_BOOKE, - .mmu_features = MMU_FTR_TYPE_44x, - .icache_bsize = 32, - .dcache_bsize = 32, - .machine_check = machine_check_4xx, - .platform = "ppc440", - } -#endif /* CONFIG_44x */ -#ifdef CONFIG_E200 - { /* e200z5 */ - .pvr_mask = 0xfff00000, - .pvr_value = 0x81000000, - .cpu_name = "e200z5", - /* xxx - galak: add CPU_FTR_MAYBE_CAN_DOZE */ - .cpu_features = CPU_FTRS_E200, - .cpu_user_features = COMMON_USER_BOOKE | - PPC_FEATURE_HAS_EFP_SINGLE | - PPC_FEATURE_UNIFIED_CACHE, - .mmu_features = MMU_FTR_TYPE_FSL_E, - .dcache_bsize = 32, - .machine_check = machine_check_e200, - .platform = "ppc5554", - }, - { /* e200z6 */ - .pvr_mask = 0xfff00000, - .pvr_value = 0x81100000, - .cpu_name = "e200z6", - /* xxx - galak: add CPU_FTR_MAYBE_CAN_DOZE */ - .cpu_features = CPU_FTRS_E200, - .cpu_user_features = COMMON_USER_BOOKE | - PPC_FEATURE_HAS_SPE_COMP | - PPC_FEATURE_HAS_EFP_SINGLE_COMP | - PPC_FEATURE_UNIFIED_CACHE, - .mmu_features = MMU_FTR_TYPE_FSL_E, - .dcache_bsize = 32, - .machine_check = machine_check_e200, - .platform = "ppc5554", - }, - { /* default match */ - .pvr_mask = 0x00000000, - .pvr_value = 0x00000000, - .cpu_name = "(generic E200 PPC)", - .cpu_features = CPU_FTRS_E200, - .cpu_user_features = COMMON_USER_BOOKE | - PPC_FEATURE_HAS_EFP_SINGLE | - PPC_FEATURE_UNIFIED_CACHE, - .mmu_features = MMU_FTR_TYPE_FSL_E, - .dcache_bsize = 32, - .cpu_setup = __setup_cpu_e200, - .machine_check = machine_check_e200, - .platform = "ppc5554", - } -#endif /* CONFIG_E200 */ -#endif /* CONFIG_PPC32 */ -#ifdef CONFIG_E500 -#ifdef CONFIG_PPC32 -#ifndef CONFIG_PPC_E500MC - { /* e500 */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x80200000, - .cpu_name = "e500", - .cpu_features = CPU_FTRS_E500, - .cpu_user_features = COMMON_USER_BOOKE | - PPC_FEATURE_HAS_SPE_COMP | - PPC_FEATURE_HAS_EFP_SINGLE_COMP, - .cpu_user_features2 = PPC_FEATURE2_ISEL, - .mmu_features = MMU_FTR_TYPE_FSL_E, - .icache_bsize = 32, - .dcache_bsize = 32, - .num_pmcs = 4, - .oprofile_cpu_type = "ppc/e500", - .oprofile_type = PPC_OPROFILE_FSL_EMB, - .cpu_setup = __setup_cpu_e500v1, - .machine_check = machine_check_e500, - .platform = "ppc8540", - }, - { /* e500v2 */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x80210000, - .cpu_name = "e500v2", - .cpu_features = CPU_FTRS_E500_2, - .cpu_user_features = COMMON_USER_BOOKE | - PPC_FEATURE_HAS_SPE_COMP | - PPC_FEATURE_HAS_EFP_SINGLE_COMP | - PPC_FEATURE_HAS_EFP_DOUBLE_COMP, - .cpu_user_features2 = PPC_FEATURE2_ISEL, - .mmu_features = MMU_FTR_TYPE_FSL_E | MMU_FTR_BIG_PHYS, - .icache_bsize = 32, - .dcache_bsize = 32, - .num_pmcs = 4, - .oprofile_cpu_type = "ppc/e500", - .oprofile_type = PPC_OPROFILE_FSL_EMB, - .cpu_setup = __setup_cpu_e500v2, - .machine_check = machine_check_e500, - .platform = "ppc8548", - .cpu_down_flush = cpu_down_flush_e500v2, - }, -#else - { /* e500mc */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x80230000, - .cpu_name = "e500mc", - .cpu_features = CPU_FTRS_E500MC, - .cpu_user_features = COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU, - .cpu_user_features2 = PPC_FEATURE2_ISEL, - .mmu_features = MMU_FTR_TYPE_FSL_E | MMU_FTR_BIG_PHYS | - MMU_FTR_USE_TLBILX, - .icache_bsize = 64, - .dcache_bsize = 64, - .num_pmcs = 4, - .oprofile_cpu_type = "ppc/e500mc", - .oprofile_type = PPC_OPROFILE_FSL_EMB, - .cpu_setup = __setup_cpu_e500mc, - .machine_check = machine_check_e500mc, - .platform = "ppce500mc", - .cpu_down_flush = cpu_down_flush_e500mc, - }, -#endif /* CONFIG_PPC_E500MC */ -#endif /* CONFIG_PPC32 */ -#ifdef CONFIG_PPC_E500MC - { /* e5500 */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x80240000, - .cpu_name = "e5500", - .cpu_features = CPU_FTRS_E5500, - .cpu_user_features = COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU, - .cpu_user_features2 = PPC_FEATURE2_ISEL, - .mmu_features = MMU_FTR_TYPE_FSL_E | MMU_FTR_BIG_PHYS | - MMU_FTR_USE_TLBILX, - .icache_bsize = 64, - .dcache_bsize = 64, - .num_pmcs = 4, - .oprofile_cpu_type = "ppc/e500mc", - .oprofile_type = PPC_OPROFILE_FSL_EMB, - .cpu_setup = __setup_cpu_e5500, -#ifndef CONFIG_PPC32 - .cpu_restore = __restore_cpu_e5500, -#endif - .machine_check = machine_check_e500mc, - .platform = "ppce5500", - .cpu_down_flush = cpu_down_flush_e5500, - }, - { /* e6500 */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x80400000, - .cpu_name = "e6500", - .cpu_features = CPU_FTRS_E6500, - .cpu_user_features = COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU | - PPC_FEATURE_HAS_ALTIVEC_COMP, - .cpu_user_features2 = PPC_FEATURE2_ISEL, - .mmu_features = MMU_FTR_TYPE_FSL_E | MMU_FTR_BIG_PHYS | - MMU_FTR_USE_TLBILX, - .icache_bsize = 64, - .dcache_bsize = 64, - .num_pmcs = 6, - .oprofile_cpu_type = "ppc/e6500", - .oprofile_type = PPC_OPROFILE_FSL_EMB, - .cpu_setup = __setup_cpu_e6500, -#ifndef CONFIG_PPC32 - .cpu_restore = __restore_cpu_e6500, -#endif - .machine_check = machine_check_e500mc, - .platform = "ppce6500", - .cpu_down_flush = cpu_down_flush_e6500, - }, -#endif /* CONFIG_PPC_E500MC */ -#ifdef CONFIG_PPC32 - { /* default match */ - .pvr_mask = 0x00000000, - .pvr_value = 0x00000000, - .cpu_name = "(generic E500 PPC)", - .cpu_features = CPU_FTRS_E500, - .cpu_user_features = COMMON_USER_BOOKE | - PPC_FEATURE_HAS_SPE_COMP | - PPC_FEATURE_HAS_EFP_SINGLE_COMP, - .mmu_features = MMU_FTR_TYPE_FSL_E, - .icache_bsize = 32, - .dcache_bsize = 32, - .machine_check = machine_check_e500, - .platform = "powerpc", - } -#endif /* CONFIG_PPC32 */ -#endif /* CONFIG_E500 */ -}; +#include "cpu_specs.h" void __init set_cur_cpu_spec(struct cpu_spec *s) { @@ -2177,30 +67,18 @@ static struct cpu_spec * __init setup_cpu_spec(unsigned long offset, if (old.num_pmcs && !s->num_pmcs) { t->num_pmcs = old.num_pmcs; t->pmc_type = old.pmc_type; - t->oprofile_type = old.oprofile_type; - t->oprofile_mmcra_sihv = old.oprofile_mmcra_sihv; - t->oprofile_mmcra_sipr = old.oprofile_mmcra_sipr; - t->oprofile_mmcra_clear = old.oprofile_mmcra_clear; /* - * If we have passed through this logic once before and - * have pulled the default case because the real PVR was - * not found inside cpu_specs[], then we are possibly - * running in compatibility mode. In that case, let the - * oprofiler know which set of compatibility counters to - * pull from by making sure the oprofile_cpu_type string - * is set to that of compatibility mode. If the - * oprofile_cpu_type already has a value, then we are - * possibly overriding a real PVR with a logical one, - * and, in that case, keep the current value for - * oprofile_cpu_type. + * Let's ensure that the + * fix for the PMAO bug is enabled on compatibility mode. */ - if (old.oprofile_cpu_type != NULL) { - t->oprofile_cpu_type = old.oprofile_cpu_type; - t->oprofile_type = old.oprofile_type; - } + t->cpu_features |= old.cpu_features & CPU_FTR_PMAO_BUG; } + /* Set kuap ON at startup, will be disabled later if cmdline has 'nosmap' */ + if (IS_ENABLED(CONFIG_PPC_KUAP) && IS_ENABLED(CONFIG_PPC32)) + t->mmu_features |= MMU_FTR_KUAP; + *PTRRELOC(&cur_cpu_spec) = &the_cpu_spec; /* @@ -2230,6 +108,8 @@ struct cpu_spec * __init identify_cpu(unsigned long offset, unsigned int pvr) struct cpu_spec *s = cpu_specs; int i; + BUILD_BUG_ON(!ARRAY_SIZE(cpu_specs)); + s = PTRRELOC(s); for (i = 0; i < ARRAY_SIZE(cpu_specs); i++,s++) { @@ -2286,7 +166,7 @@ void __init cpu_feature_keys_init(void) struct static_key_true mmu_feature_keys[NUM_MMU_FTR_KEYS] = { [0 ... NUM_MMU_FTR_KEYS - 1] = STATIC_KEY_TRUE_INIT }; -EXPORT_SYMBOL_GPL(mmu_feature_keys); +EXPORT_SYMBOL(mmu_feature_keys); void __init mmu_feature_keys_init(void) { diff --git a/arch/powerpc/kernel/crash_dump.c b/arch/powerpc/kernel/crash_dump.c index 05745ddbd229..2086fa6cdc25 100644 --- a/arch/powerpc/kernel/crash_dump.c +++ b/arch/powerpc/kernel/crash_dump.c @@ -12,12 +12,14 @@ #include <linux/crash_dump.h> #include <linux/io.h> #include <linux/memblock.h> +#include <linux/of.h> #include <asm/code-patching.h> #include <asm/kdump.h> -#include <asm/prom.h> #include <asm/firmware.h> -#include <linux/uaccess.h> +#include <linux/uio.h> #include <asm/rtas.h> +#include <asm/inst.h> +#include <asm/fadump.h> #ifdef DEBUG #include <asm/udbg.h> @@ -34,7 +36,7 @@ void __init reserve_kdump_trampoline(void) static void __init create_trampoline(unsigned long addr) { - unsigned int *p = (unsigned int *)addr; + u32 *p = (u32 *)addr; /* The maximum range of a single instruction branch, is the current * instruction's address + (32 MB - 4) bytes. For the trampoline we @@ -44,8 +46,8 @@ static void __init create_trampoline(unsigned long addr) * branch to "addr" we jump to ("addr" + 32 MB). Although it requires * two instructions it doesn't require any registers. */ - patch_instruction(p, PPC_INST_NOP); - patch_branch(++p, addr + PHYSICAL_START, 0); + patch_instruction(p, ppc_inst(PPC_RAW_NOP())); + patch_branch(p + 1, addr + PHYSICAL_START, 0); } void __init setup_kdump_trampoline(void) @@ -67,33 +69,8 @@ void __init setup_kdump_trampoline(void) } #endif /* CONFIG_NONSTATIC_KERNEL */ -static size_t copy_oldmem_vaddr(void *vaddr, char *buf, size_t csize, - unsigned long offset, int userbuf) -{ - if (userbuf) { - if (copy_to_user((char __user *)buf, (vaddr + offset), csize)) - return -EFAULT; - } else - memcpy(buf, (vaddr + offset), csize); - - return csize; -} - -/** - * copy_oldmem_page - copy one page from "oldmem" - * @pfn: page frame number to be copied - * @buf: target memory address for the copy; this can be in kernel address - * space or user address space (see @userbuf) - * @csize: number of bytes to copy - * @offset: offset in bytes into the page (based on pfn) to begin the copy - * @userbuf: if set, @buf is in user address space, use copy_to_user(), - * otherwise @buf is in kernel address space, use memcpy(). - * - * Copy a page from "oldmem". For this page, there is no pte mapped - * in the current kernel. We stitch up a pte, similar to kmap_atomic. - */ -ssize_t copy_oldmem_page(unsigned long pfn, char *buf, - size_t csize, unsigned long offset, int userbuf) +ssize_t copy_oldmem_page(struct iov_iter *iter, unsigned long pfn, + size_t csize, unsigned long offset) { void *vaddr; phys_addr_t paddr; @@ -106,16 +83,27 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, if (memblock_is_region_memory(paddr, csize)) { vaddr = __va(paddr); - csize = copy_oldmem_vaddr(vaddr, buf, csize, offset, userbuf); + csize = copy_to_iter(vaddr + offset, csize, iter); } else { vaddr = ioremap_cache(paddr, PAGE_SIZE); - csize = copy_oldmem_vaddr(vaddr, buf, csize, offset, userbuf); + csize = copy_to_iter(vaddr + offset, csize, iter); iounmap(vaddr); } return csize; } +/* + * Return true only when kexec based kernel dump capturing method is used. + * This ensures all restritions applied for kdump case are not automatically + * applied for fadump case. + */ +bool is_kdump_kernel(void) +{ + return !is_fadump_active() && elfcorehdr_addr != ELFCORE_ADDR_MAX; +} +EXPORT_SYMBOL_GPL(is_kdump_kernel); + #ifdef CONFIG_PPC_RTAS /* * The crashkernel region will almost always overlap the RTAS region, so diff --git a/arch/powerpc/kernel/dawr.c b/arch/powerpc/kernel/dawr.c index cc14aa6c4a1b..909a05cd2809 100644 --- a/arch/powerpc/kernel/dawr.c +++ b/arch/powerpc/kernel/dawr.c @@ -9,14 +9,14 @@ #include <linux/export.h> #include <linux/fs.h> #include <linux/debugfs.h> -#include <asm/debugfs.h> #include <asm/machdep.h> #include <asm/hvcall.h> +#include <asm/firmware.h> bool dawr_force_enable; EXPORT_SYMBOL_GPL(dawr_force_enable); -int set_dawr(struct arch_hw_breakpoint *brk) +int set_dawr(int nr, struct arch_hw_breakpoint *brk) { unsigned long dawr, dawrx, mrd; @@ -28,7 +28,7 @@ int set_dawr(struct arch_hw_breakpoint *brk) dawrx |= (brk->type & (HW_BRK_TYPE_PRIV_ALL)) >> 3; /* * DAWR length is stored in field MDR bits 48:53. Matches range in - * doublewords (64 bits) baised by -1 eg. 0b000000=1DW and + * doublewords (64 bits) biased by -1 eg. 0b000000=1DW and * 0b111111=64DW. * brk->hw_len is in bytes. * This aligns up to double word size, shifts and does the bias. @@ -37,17 +37,26 @@ int set_dawr(struct arch_hw_breakpoint *brk) dawrx |= (mrd & 0x3f) << (63 - 53); if (ppc_md.set_dawr) - return ppc_md.set_dawr(dawr, dawrx); - - mtspr(SPRN_DAWR, dawr); - mtspr(SPRN_DAWRX, dawrx); + return ppc_md.set_dawr(nr, dawr, dawrx); + + if (nr == 0) { + mtspr(SPRN_DAWR0, dawr); + mtspr(SPRN_DAWRX0, dawrx); + } else { + mtspr(SPRN_DAWR1, dawr); + mtspr(SPRN_DAWRX1, dawrx); + } return 0; } -static void set_dawr_cb(void *info) +static void disable_dawrs_cb(void *info) { - set_dawr(info); + struct arch_hw_breakpoint null_brk = {0}; + int i; + + for (i = 0; i < nr_wp_slots(); i++) + set_dawr(i, &null_brk); } static ssize_t dawr_write_file_bool(struct file *file, @@ -60,7 +69,7 @@ static ssize_t dawr_write_file_bool(struct file *file, /* Send error to user if they hypervisor won't allow us to write DAWR */ if (!dawr_force_enable && firmware_has_feature(FW_FEATURE_LPAR) && - set_dawr(&null_brk) != H_SUCCESS) + set_dawr(0, &null_brk) != H_SUCCESS) return -ENODEV; rc = debugfs_write_file_bool(file, user_buf, count, ppos); @@ -69,7 +78,7 @@ static ssize_t dawr_write_file_bool(struct file *file, /* If we are clearing, make sure all CPUs have the DAWR cleared */ if (!dawr_force_enable) - smp_call_function(set_dawr_cb, &null_brk, 0); + smp_call_function(disable_dawrs_cb, NULL, 0); return rc; } @@ -92,7 +101,7 @@ static int __init dawr_force_setup(void) if (PVR_VER(mfspr(SPRN_PVR)) == PVR_POWER9) { /* Turn DAWR off by default, but allow admin to turn it on */ debugfs_create_file_unsafe("dawr_enable_dangerous", 0600, - powerpc_debugfs_root, + arch_debugfs_dir, &dawr_force_enable, &dawr_enable_fops); } diff --git a/arch/powerpc/kernel/dbell.c b/arch/powerpc/kernel/dbell.c index f17ff1200eaa..5712dd846263 100644 --- a/arch/powerpc/kernel/dbell.c +++ b/arch/powerpc/kernel/dbell.c @@ -12,77 +12,23 @@ #include <linux/hardirq.h> #include <asm/dbell.h> +#include <asm/interrupt.h> #include <asm/irq_regs.h> #include <asm/kvm_ppc.h> #include <asm/trace.h> #ifdef CONFIG_SMP -/* - * Doorbells must only be used if CPU_FTR_DBELL is available. - * msgsnd is used in HV, and msgsndp is used in !HV. - * - * These should be used by platform code that is aware of restrictions. - * Other arch code should use ->cause_ipi. - * - * doorbell_global_ipi() sends a dbell to any target CPU. - * Must be used only by architectures that address msgsnd target - * by PIR/get_hard_smp_processor_id. - */ -void doorbell_global_ipi(int cpu) -{ - u32 tag = get_hard_smp_processor_id(cpu); - - kvmppc_set_host_ipi(cpu); - /* Order previous accesses vs. msgsnd, which is treated as a store */ - ppc_msgsnd_sync(); - ppc_msgsnd(PPC_DBELL_MSGTYPE, 0, tag); -} - -/* - * doorbell_core_ipi() sends a dbell to a target CPU in the same core. - * Must be used only by architectures that address msgsnd target - * by TIR/cpu_thread_in_core. - */ -void doorbell_core_ipi(int cpu) -{ - u32 tag = cpu_thread_in_core(cpu); - - kvmppc_set_host_ipi(cpu); - /* Order previous accesses vs. msgsnd, which is treated as a store */ - ppc_msgsnd_sync(); - ppc_msgsnd(PPC_DBELL_MSGTYPE, 0, tag); -} - -/* - * Attempt to cause a core doorbell if destination is on the same core. - * Returns 1 on success, 0 on failure. - */ -int doorbell_try_core_ipi(int cpu) -{ - int this_cpu = get_cpu(); - int ret = 0; - - if (cpumask_test_cpu(cpu, cpu_sibling_mask(this_cpu))) { - doorbell_core_ipi(cpu); - ret = 1; - } - - put_cpu(); - - return ret; -} - -void doorbell_exception(struct pt_regs *regs) +DEFINE_INTERRUPT_HANDLER_ASYNC(doorbell_exception) { struct pt_regs *old_regs = set_irq_regs(regs); - irq_enter(); trace_doorbell_entry(regs); ppc_msgsync(); - may_hard_irq_enable(); + if (should_hard_irq_enable(regs)) + do_hard_irq_enable(); kvmppc_clear_host_ipi(smp_processor_id()); __this_cpu_inc(irq_stat.doorbell_irqs); @@ -90,13 +36,12 @@ void doorbell_exception(struct pt_regs *regs) smp_ipi_demux_relaxed(); /* already performed the barrier */ trace_doorbell_exit(regs); - irq_exit(); + set_irq_regs(old_regs); } #else /* CONFIG_SMP */ -void doorbell_exception(struct pt_regs *regs) +DEFINE_INTERRUPT_HANDLER_ASYNC(doorbell_exception) { printk(KERN_WARNING "Received doorbell on non-smp system\n"); } #endif /* CONFIG_SMP */ - diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c index e486d1d78de2..8920862ffd79 100644 --- a/arch/powerpc/kernel/dma-iommu.c +++ b/arch/powerpc/kernel/dma-iommu.c @@ -10,26 +10,66 @@ #include <linux/pci.h> #include <asm/iommu.h> -/* - * Generic iommu implementation - */ +#ifdef CONFIG_ARCH_HAS_DMA_MAP_DIRECT +#define can_map_direct(dev, addr) \ + ((dev)->bus_dma_limit >= phys_to_dma((dev), (addr))) -/* - * The coherent mask may be smaller than the real mask, check if we can - * really use a direct window. - */ -static inline bool dma_iommu_alloc_bypass(struct device *dev) +bool arch_dma_map_page_direct(struct device *dev, phys_addr_t addr) +{ + if (likely(!dev->bus_dma_limit)) + return false; + + return can_map_direct(dev, addr); +} + +#define is_direct_handle(dev, h) ((h) >= (dev)->archdata.dma_offset) + +bool arch_dma_unmap_page_direct(struct device *dev, dma_addr_t dma_handle) { - return dev->archdata.iommu_bypass && !iommu_fixed_is_weak && - dma_direct_supported(dev, dev->coherent_dma_mask); + if (likely(!dev->bus_dma_limit)) + return false; + + return is_direct_handle(dev, dma_handle); } -static inline bool dma_iommu_map_bypass(struct device *dev, - unsigned long attrs) +bool arch_dma_map_sg_direct(struct device *dev, struct scatterlist *sg, + int nents) { - return dev->archdata.iommu_bypass && - (!iommu_fixed_is_weak || (attrs & DMA_ATTR_WEAK_ORDERING)); + struct scatterlist *s; + int i; + + if (likely(!dev->bus_dma_limit)) + return false; + + for_each_sg(sg, s, nents, i) { + if (!can_map_direct(dev, sg_phys(s) + s->offset + s->length)) + return false; + } + + return true; +} + +bool arch_dma_unmap_sg_direct(struct device *dev, struct scatterlist *sg, + int nents) +{ + struct scatterlist *s; + int i; + + if (likely(!dev->bus_dma_limit)) + return false; + + for_each_sg(sg, s, nents, i) { + if (!is_direct_handle(dev, s->dma_address + s->length)) + return false; + } + + return true; } +#endif /* CONFIG_ARCH_HAS_DMA_MAP_DIRECT */ + +/* + * Generic iommu implementation + */ /* Allocates a contiguous real buffer and creates mappings over it. * Returns the virtual address of the buffer and sets dma_handle @@ -39,8 +79,6 @@ static void *dma_iommu_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag, unsigned long attrs) { - if (dma_iommu_alloc_bypass(dev)) - return dma_direct_alloc(dev, size, dma_handle, flag, attrs); return iommu_alloc_coherent(dev, get_iommu_table_base(dev), size, dma_handle, dev->coherent_dma_mask, flag, dev_to_node(dev)); @@ -50,11 +88,7 @@ static void dma_iommu_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle, unsigned long attrs) { - if (dma_iommu_alloc_bypass(dev)) - dma_direct_free(dev, size, vaddr, dma_handle, attrs); - else - iommu_free_coherent(get_iommu_table_base(dev), size, vaddr, - dma_handle); + iommu_free_coherent(get_iommu_table_base(dev), size, vaddr, dma_handle); } /* Creates TCEs for a user provided buffer. The user buffer must be @@ -67,9 +101,6 @@ static dma_addr_t dma_iommu_map_page(struct device *dev, struct page *page, enum dma_data_direction direction, unsigned long attrs) { - if (dma_iommu_map_bypass(dev, attrs)) - return dma_direct_map_page(dev, page, offset, size, direction, - attrs); return iommu_map_page(dev, get_iommu_table_base(dev), page, offset, size, dma_get_mask(dev), direction, attrs); } @@ -79,11 +110,8 @@ static void dma_iommu_unmap_page(struct device *dev, dma_addr_t dma_handle, size_t size, enum dma_data_direction direction, unsigned long attrs) { - if (!dma_iommu_map_bypass(dev, attrs)) - iommu_unmap_page(get_iommu_table_base(dev), dma_handle, size, - direction, attrs); - else - dma_direct_unmap_page(dev, dma_handle, size, direction, attrs); + iommu_unmap_page(get_iommu_table_base(dev), dma_handle, size, direction, + attrs); } @@ -91,8 +119,6 @@ static int dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist, int nelems, enum dma_data_direction direction, unsigned long attrs) { - if (dma_iommu_map_bypass(dev, attrs)) - return dma_direct_map_sg(dev, sglist, nelems, direction, attrs); return ppc_iommu_map_sg(dev, get_iommu_table_base(dev), sglist, nelems, dma_get_mask(dev), direction, attrs); } @@ -101,11 +127,8 @@ static void dma_iommu_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems, enum dma_data_direction direction, unsigned long attrs) { - if (!dma_iommu_map_bypass(dev, attrs)) - ppc_iommu_unmap_sg(get_iommu_table_base(dev), sglist, nelems, + ppc_iommu_unmap_sg(get_iommu_table_base(dev), sglist, nelems, direction, attrs); - else - dma_direct_unmap_sg(dev, sglist, nelems, direction, attrs); } static bool dma_iommu_bypass_supported(struct device *dev, u64 mask) @@ -113,21 +136,34 @@ static bool dma_iommu_bypass_supported(struct device *dev, u64 mask) struct pci_dev *pdev = to_pci_dev(dev); struct pci_controller *phb = pci_bus_to_host(pdev->bus); - return phb->controller_ops.iommu_bypass_supported && - phb->controller_ops.iommu_bypass_supported(pdev, mask); + if (iommu_fixed_is_weak || !phb->controller_ops.iommu_bypass_supported) + return false; + return phb->controller_ops.iommu_bypass_supported(pdev, mask); } /* We support DMA to/from any memory page via the iommu */ int dma_iommu_dma_supported(struct device *dev, u64 mask) { - struct iommu_table *tbl = get_iommu_table_base(dev); + struct iommu_table *tbl; if (dev_is_pci(dev) && dma_iommu_bypass_supported(dev, mask)) { - dev->archdata.iommu_bypass = true; - dev_dbg(dev, "iommu: 64-bit OK, using fixed ops\n"); + /* + * dma_iommu_bypass_supported() sets dma_max when there is + * 1:1 mapping but it is somehow limited. + * ibm,pmemory is one example. + */ + dev->dma_ops_bypass = dev->bus_dma_limit == 0; + if (!dev->dma_ops_bypass) + dev_warn(dev, + "iommu: 64-bit OK but direct DMA is limited by %llx\n", + dev->bus_dma_limit); + else + dev_dbg(dev, "iommu: 64-bit OK, using fixed ops\n"); return 1; } + tbl = get_iommu_table_base(dev); + if (!tbl) { dev_err(dev, "Warning: IOMMU dma not supported: mask 0x%08llx, table unavailable\n", mask); return 0; @@ -141,7 +177,7 @@ int dma_iommu_dma_supported(struct device *dev, u64 mask) } dev_dbg(dev, "iommu: not 64-bit, using default ops\n"); - dev->archdata.iommu_bypass = false; + dev->dma_ops_bypass = false; return 1; } @@ -150,50 +186,25 @@ u64 dma_iommu_get_required_mask(struct device *dev) struct iommu_table *tbl = get_iommu_table_base(dev); u64 mask; - if (!tbl) - return 0; - if (dev_is_pci(dev)) { u64 bypass_mask = dma_direct_get_required_mask(dev); - if (dma_iommu_bypass_supported(dev, bypass_mask)) + if (dma_iommu_dma_supported(dev, bypass_mask)) { + dev_info(dev, "%s: returning bypass mask 0x%llx\n", __func__, bypass_mask); return bypass_mask; + } } - mask = 1ULL < (fls_long(tbl->it_offset + tbl->it_size) - 1); + if (!tbl) + return 0; + + mask = 1ULL << (fls_long(tbl->it_offset + tbl->it_size) + + tbl->it_page_shift - 1); mask += mask - 1; return mask; } -static void dma_iommu_sync_for_cpu(struct device *dev, dma_addr_t addr, - size_t size, enum dma_data_direction dir) -{ - if (dma_iommu_alloc_bypass(dev)) - dma_direct_sync_single_for_cpu(dev, addr, size, dir); -} - -static void dma_iommu_sync_for_device(struct device *dev, dma_addr_t addr, - size_t sz, enum dma_data_direction dir) -{ - if (dma_iommu_alloc_bypass(dev)) - dma_direct_sync_single_for_device(dev, addr, sz, dir); -} - -extern void dma_iommu_sync_sg_for_cpu(struct device *dev, - struct scatterlist *sgl, int nents, enum dma_data_direction dir) -{ - if (dma_iommu_alloc_bypass(dev)) - dma_direct_sync_sg_for_cpu(dev, sgl, nents, dir); -} - -extern void dma_iommu_sync_sg_for_device(struct device *dev, - struct scatterlist *sgl, int nents, enum dma_data_direction dir) -{ - if (dma_iommu_alloc_bypass(dev)) - dma_direct_sync_sg_for_device(dev, sgl, nents, dir); -} - const struct dma_map_ops dma_iommu_ops = { .alloc = dma_iommu_alloc_coherent, .free = dma_iommu_free_coherent, @@ -203,10 +214,8 @@ const struct dma_map_ops dma_iommu_ops = { .map_page = dma_iommu_map_page, .unmap_page = dma_iommu_unmap_page, .get_required_mask = dma_iommu_get_required_mask, - .sync_single_for_cpu = dma_iommu_sync_for_cpu, - .sync_single_for_device = dma_iommu_sync_for_device, - .sync_sg_for_cpu = dma_iommu_sync_sg_for_cpu, - .sync_sg_for_device = dma_iommu_sync_sg_for_device, .mmap = dma_common_mmap, .get_sgtable = dma_common_get_sgtable, + .alloc_pages = dma_common_alloc_pages, + .free_pages = dma_common_free_pages, }; diff --git a/arch/powerpc/kernel/dma-mask.c b/arch/powerpc/kernel/dma-mask.c index ffbbbc432612..5b07ca7b73aa 100644 --- a/arch/powerpc/kernel/dma-mask.c +++ b/arch/powerpc/kernel/dma-mask.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/dma-mapping.h> +#include <linux/dma-map-ops.h> #include <linux/export.h> #include <asm/machdep.h> diff --git a/arch/powerpc/kernel/dma-swiotlb.c b/arch/powerpc/kernel/dma-swiotlb.c index fc7816126a40..ba256c37bcc0 100644 --- a/arch/powerpc/kernel/dma-swiotlb.c +++ b/arch/powerpc/kernel/dma-swiotlb.c @@ -10,6 +10,7 @@ #include <asm/swiotlb.h> unsigned int ppc_swiotlb_enable; +unsigned int ppc_swiotlb_flags; void __init swiotlb_detect_4g(void) { diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c index 180b3a5d1001..c3fb9fdf5bd7 100644 --- a/arch/powerpc/kernel/dt_cpu_ftrs.c +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c @@ -10,6 +10,7 @@ #include <linux/jump_label.h> #include <linux/libfdt.h> #include <linux/memblock.h> +#include <linux/of_fdt.h> #include <linux/printk.h> #include <linux/sched.h> #include <linux/string.h> @@ -17,15 +18,14 @@ #include <asm/cputable.h> #include <asm/dt_cpu_ftrs.h> +#include <asm/mce.h> #include <asm/mmu.h> -#include <asm/oprofile_impl.h> -#include <asm/prom.h> #include <asm/setup.h> /* Device-tree visible constants follow */ -#define ISA_V2_07B 2070 #define ISA_V3_0B 3000 +#define ISA_V3_1 3100 #define USABLE_PR (1U << 0) #define USABLE_OS (1U << 1) @@ -64,44 +64,25 @@ struct dt_cpu_feature { * Set up the base CPU */ -extern long __machine_check_early_realmode_p8(struct pt_regs *regs); -extern long __machine_check_early_realmode_p9(struct pt_regs *regs); - static int hv_mode; static struct { u64 lpcr; - u64 lpcr_clear; u64 hfscr; u64 fscr; + u64 pcr; } system_registers; static void (*init_pmu_registers)(void); static void __restore_cpu_cpufeatures(void) { - u64 lpcr; - - /* - * LPCR is restored by the power on engine already. It can be changed - * after early init e.g., by radix enable, and we have no unified API - * for saving and restoring such SPRs. - * - * This ->restore hook should really be removed from idle and register - * restore moved directly into the idle restore code, because this code - * doesn't know how idle is implemented or what it needs restored here. - * - * The best we can do to accommodate secondary boot and idle restore - * for now is "or" LPCR with existing. - */ - lpcr = mfspr(SPRN_LPCR); - lpcr |= system_registers.lpcr; - lpcr &= ~system_registers.lpcr_clear; - mtspr(SPRN_LPCR, lpcr); + mtspr(SPRN_LPCR, system_registers.lpcr); if (hv_mode) { mtspr(SPRN_LPID, 0); + mtspr(SPRN_AMOR, ~0); mtspr(SPRN_HFSCR, system_registers.hfscr); - mtspr(SPRN_PCR, PCR_MASK); + mtspr(SPRN_PCR, system_registers.pcr); } mtspr(SPRN_FSCR, system_registers.fscr); @@ -121,8 +102,6 @@ static struct cpu_spec __initdata base_cpu_spec = { .dcache_bsize = 32, /* cache info init. */ .num_pmcs = 0, .pmc_type = PPC_PMC_DEFAULT, - .oprofile_cpu_type = NULL, - .oprofile_type = PPC_OPROFILE_INVALID, .cpu_setup = NULL, .cpu_restore = __restore_cpu_cpufeatures, .machine_check_early = NULL, @@ -139,7 +118,6 @@ static void __init cpufeatures_setup_cpu(void) /* Initialize the base environment -- clear FSCR/HFSCR. */ hv_mode = !!(mfmsr() & MSR_HV); if (hv_mode) { - /* CPU_FTR_HVMODE is used early in PACA setup */ cur_cpu_spec->cpu_features |= CPU_FTR_HVMODE; mtspr(SPRN_HFSCR, 0); } @@ -238,6 +216,7 @@ static int __init feat_enable_hv(struct dt_cpu_feature *f) } mtspr(SPRN_LPID, 0); + mtspr(SPRN_AMOR, ~0); lpcr = mfspr(SPRN_LPCR); lpcr &= ~LPCR_LPES0; /* HV external interrupts */ @@ -275,13 +254,6 @@ static int __init feat_enable_idle_nap(struct dt_cpu_feature *f) return 1; } -static int __init feat_enable_align_dsisr(struct dt_cpu_feature *f) -{ - cur_cpu_spec->cpu_features &= ~CPU_FTR_NODSISRALIGN; - - return 1; -} - static int __init feat_enable_idle_stop(struct dt_cpu_feature *f) { u64 lpcr; @@ -300,6 +272,9 @@ static int __init feat_enable_mmu_hash(struct dt_cpu_feature *f) { u64 lpcr; + if (!IS_ENABLED(CONFIG_PPC_64S_HASH_MMU)) + return 0; + lpcr = mfspr(SPRN_LPCR); lpcr &= ~LPCR_ISL; @@ -319,7 +294,9 @@ static int __init feat_enable_mmu_hash_v3(struct dt_cpu_feature *f) { u64 lpcr; - system_registers.lpcr_clear |= (LPCR_ISL | LPCR_UPRT | LPCR_HR); + if (!IS_ENABLED(CONFIG_PPC_64S_HASH_MMU)) + return 0; + lpcr = mfspr(SPRN_LPCR); lpcr &= ~(LPCR_ISL | LPCR_UPRT | LPCR_HR); mtspr(SPRN_LPCR, lpcr); @@ -333,20 +310,29 @@ static int __init feat_enable_mmu_hash_v3(struct dt_cpu_feature *f) static int __init feat_enable_mmu_radix(struct dt_cpu_feature *f) { -#ifdef CONFIG_PPC_RADIX_MMU + if (!IS_ENABLED(CONFIG_PPC_RADIX_MMU)) + return 0; + + cur_cpu_spec->mmu_features |= MMU_FTR_KERNEL_RO; cur_cpu_spec->mmu_features |= MMU_FTR_TYPE_RADIX; - cur_cpu_spec->mmu_features |= MMU_FTRS_HASH_BASE; + cur_cpu_spec->mmu_features |= MMU_FTR_GTSE; cur_cpu_spec->cpu_user_features |= PPC_FEATURE_HAS_MMU; return 1; -#endif - return 0; } static int __init feat_enable_dscr(struct dt_cpu_feature *f) { u64 lpcr; + /* + * Linux relies on FSCR[DSCR] being clear, so that we can take the + * facility unavailable interrupt and track the task's usage of DSCR. + * See facility_unavailable_exception(). + * Clear the bit here so that feat_enable() doesn't set it. + */ + f->fscr_bit_nr = -1; + feat_enable(f); lpcr = mfspr(SPRN_LPCR); @@ -357,7 +343,7 @@ static int __init feat_enable_dscr(struct dt_cpu_feature *f) return 1; } -static void hfscr_pmu_enable(void) +static void __init hfscr_pmu_enable(void) { u64 hfscr = mfspr(SPRN_HFSCR); hfscr |= PPC_BIT(60); @@ -372,7 +358,7 @@ static void init_pmu_power8(void) } mtspr(SPRN_MMCRA, 0); - mtspr(SPRN_MMCR0, 0); + mtspr(SPRN_MMCR0, MMCR0_FC); mtspr(SPRN_MMCR1, 0); mtspr(SPRN_MMCR2, 0); mtspr(SPRN_MMCRS, 0); @@ -400,7 +386,6 @@ static int __init feat_enable_pmu_power8(struct dt_cpu_feature *f) cur_cpu_spec->num_pmcs = 6; cur_cpu_spec->pmc_type = PPC_PMC_IBM; - cur_cpu_spec->oprofile_cpu_type = "ppc64/power8"; return 1; } @@ -411,7 +396,7 @@ static void init_pmu_power9(void) mtspr(SPRN_MMCRC, 0); mtspr(SPRN_MMCRA, 0); - mtspr(SPRN_MMCR0, 0); + mtspr(SPRN_MMCR0, MMCR0_FC); mtspr(SPRN_MMCR1, 0); mtspr(SPRN_MMCR2, 0); } @@ -436,7 +421,39 @@ static int __init feat_enable_pmu_power9(struct dt_cpu_feature *f) cur_cpu_spec->num_pmcs = 6; cur_cpu_spec->pmc_type = PPC_PMC_IBM; - cur_cpu_spec->oprofile_cpu_type = "ppc64/power9"; + + return 1; +} + +static void init_pmu_power10(void) +{ + init_pmu_power9(); + + mtspr(SPRN_MMCR3, 0); + mtspr(SPRN_MMCRA, MMCRA_BHRB_DISABLE); + mtspr(SPRN_MMCR0, MMCR0_FC | MMCR0_PMCCEXT); +} + +static int __init feat_enable_pmu_power10(struct dt_cpu_feature *f) +{ + hfscr_pmu_enable(); + + init_pmu_power10(); + init_pmu_registers = init_pmu_power10; + + cur_cpu_spec->cpu_features |= CPU_FTR_MMCRA; + cur_cpu_spec->cpu_user_features |= PPC_FEATURE_PSERIES_PERFMON_COMPAT; + + cur_cpu_spec->num_pmcs = 6; + cur_cpu_spec->pmc_type = PPC_PMC_IBM; + + return 1; +} + +static int __init feat_enable_mce_power10(struct dt_cpu_feature *f) +{ + cur_cpu_spec->platform = "power10"; + cur_cpu_spec->machine_check_early = __machine_check_early_realmode_p10; return 1; } @@ -553,6 +570,18 @@ static int __init feat_enable_large_ci(struct dt_cpu_feature *f) return 1; } +static int __init feat_enable_mma(struct dt_cpu_feature *f) +{ + u64 pcr; + + feat_enable(f); + pcr = mfspr(SPRN_PCR); + pcr &= ~PCR_MMA_DIS; + mtspr(SPRN_PCR, pcr); + + return 1; +} + struct dt_cpu_feature_match { const char *name; int (*enable)(struct dt_cpu_feature *f); @@ -566,6 +595,7 @@ static struct dt_cpu_feature_match __initdata {"little-endian", feat_enable_le, CPU_FTR_REAL_LE}, {"smt", feat_enable_smt, 0}, {"interrupt-facilities", feat_enable, 0}, + {"system-call-vectored", feat_enable, 0}, {"timer-facilities", feat_enable, 0}, {"timer-facilities-v3", feat_enable, 0}, {"debug-facilities", feat_enable, 0}, @@ -588,7 +618,7 @@ static struct dt_cpu_feature_match __initdata {"tm-suspend-hypervisor-assist", feat_enable, CPU_FTR_P9_TM_HV_ASSIST}, {"tm-suspend-xer-so-bug", feat_enable, CPU_FTR_P9_TM_XER_SO_BUG}, {"idle-nap", feat_enable_idle_nap, 0}, - {"alignment-interrupt-dsisr", feat_enable_align_dsisr, 0}, + /* alignment-interrupt-dsisr ignored */ {"idle-stop", feat_enable_idle_stop, 0}, {"machine-check-power8", feat_enable_mce_power8, 0}, {"performance-monitor-power8", feat_enable_pmu_power8, 0}, @@ -617,7 +647,9 @@ static struct dt_cpu_feature_match __initdata {"group-start-register", feat_enable, 0}, {"pc-relative-addressing", feat_enable, 0}, {"machine-check-power9", feat_enable_mce_power9, 0}, + {"machine-check-power10", feat_enable_mce_power10, 0}, {"performance-monitor-power9", feat_enable_pmu_power9, 0}, + {"performance-monitor-power10", feat_enable_pmu_power10, 0}, {"event-based-branch-v3", feat_enable, 0}, {"random-number-generator", feat_enable, 0}, {"system-call-vectored", feat_disable, 0}, @@ -626,6 +658,9 @@ static struct dt_cpu_feature_match __initdata {"vector-binary128", feat_enable, 0}, {"vector-binary16", feat_enable, 0}, {"wait-v3", feat_enable, 0}, + {"prefix-instructions", feat_enable, 0}, + {"matrix-multiply-assist", feat_enable_mma, 0}, + {"debug-facilities-v31", feat_enable, CPU_FTR_DAWR1}, }; static bool __initdata using_dt_cpu_ftrs; @@ -651,10 +686,15 @@ static void __init cpufeatures_setup_start(u32 isa) { pr_info("setup for ISA %d\n", isa); - if (isa >= 3000) { + if (isa >= ISA_V3_0B) { cur_cpu_spec->cpu_features |= CPU_FTR_ARCH_300; cur_cpu_spec->cpu_user_features2 |= PPC_FEATURE2_ARCH_3_00; } + + if (isa >= ISA_V3_1) { + cur_cpu_spec->cpu_features |= CPU_FTR_ARCH_31; + cur_cpu_spec->cpu_user_features2 |= PPC_FEATURE2_ARCH_3_1; + } } static bool __init cpufeatures_process_feature(struct dt_cpu_feature *f) @@ -727,30 +767,33 @@ static __init void cpufeatures_cpu_quirks(void) /* * Not all quirks can be derived from the cpufeatures device tree. */ - if ((version & 0xffffefff) == 0x004e0200) - ; /* DD2.0 has no feature flag */ - else if ((version & 0xffffefff) == 0x004e0201) + if ((version & 0xffffefff) == 0x004e0200) { + /* DD2.0 has no feature flag */ + cur_cpu_spec->cpu_features |= CPU_FTR_P9_RADIX_PREFETCH_BUG; + cur_cpu_spec->cpu_features &= ~(CPU_FTR_DAWR); + } else if ((version & 0xffffefff) == 0x004e0201) { + cur_cpu_spec->cpu_features |= CPU_FTR_POWER9_DD2_1; + cur_cpu_spec->cpu_features |= CPU_FTR_P9_RADIX_PREFETCH_BUG; + cur_cpu_spec->cpu_features &= ~(CPU_FTR_DAWR); + } else if ((version & 0xffffefff) == 0x004e0202) { + cur_cpu_spec->cpu_features |= CPU_FTR_P9_TM_HV_ASSIST; + cur_cpu_spec->cpu_features |= CPU_FTR_P9_TM_XER_SO_BUG; cur_cpu_spec->cpu_features |= CPU_FTR_POWER9_DD2_1; - else if ((version & 0xffffefff) == 0x004e0202) { + cur_cpu_spec->cpu_features &= ~(CPU_FTR_DAWR); + } else if ((version & 0xffffefff) == 0x004e0203) { cur_cpu_spec->cpu_features |= CPU_FTR_P9_TM_HV_ASSIST; cur_cpu_spec->cpu_features |= CPU_FTR_P9_TM_XER_SO_BUG; cur_cpu_spec->cpu_features |= CPU_FTR_POWER9_DD2_1; - } else if ((version & 0xffff0000) == 0x004e0000) + } else if ((version & 0xffff0000) == 0x004e0000) { /* DD2.1 and up have DD2_1 */ cur_cpu_spec->cpu_features |= CPU_FTR_POWER9_DD2_1; + } if ((version & 0xffff0000) == 0x004e0000) { - cur_cpu_spec->cpu_features &= ~(CPU_FTR_DAWR); cur_cpu_spec->cpu_features |= CPU_FTR_P9_TIDR; } update_tlbie_feature_flag(version); - /* - * PKEY was not in the initial base or feature node - * specification, but it should become optional in the next - * cpu feature version sequence. - */ - cur_cpu_spec->cpu_features |= CPU_FTR_PKEY; } static void __init cpufeatures_setup_finished(void) @@ -768,6 +811,7 @@ static void __init cpufeatures_setup_finished(void) system_registers.lpcr = mfspr(SPRN_LPCR); system_registers.hfscr = mfspr(SPRN_HFSCR); system_registers.fscr = mfspr(SPRN_FSCR); + system_registers.pcr = mfspr(SPRN_PCR); pr_info("final cpu/mmu features = 0x%016lx 0x%08x\n", cur_cpu_spec->cpu_features, cur_cpu_spec->mmu_features); @@ -1055,14 +1099,14 @@ static int __init dt_cpu_ftrs_scan_callback(unsigned long node, const char prop = of_get_flat_dt_prop(node, "display-name", NULL); if (prop && strlen((char *)prop) != 0) { - strlcpy(dt_cpu_name, (char *)prop, sizeof(dt_cpu_name)); + strscpy(dt_cpu_name, (char *)prop, sizeof(dt_cpu_name)); cur_cpu_spec->cpu_name = dt_cpu_name; } cpufeatures_setup_finished(); - memblock_free(__pa(dt_cpu_features), - sizeof(struct dt_cpu_feature)*nr_dt_cpu_features); + memblock_free(dt_cpu_features, + sizeof(struct dt_cpu_feature) * nr_dt_cpu_features); return 0; } diff --git a/arch/powerpc/kernel/early_32.c b/arch/powerpc/kernel/early_32.c index ef2ad4945904..03f1135ef64f 100644 --- a/arch/powerpc/kernel/early_32.c +++ b/arch/powerpc/kernel/early_32.c @@ -8,7 +8,6 @@ #include <linux/kernel.h> #include <asm/setup.h> #include <asm/sections.h> -#include <asm/asm-prototypes.h> /* * We're called here very early in the boot. diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index bc8a551013be..ab316e155ea9 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -21,9 +21,9 @@ #include <linux/spinlock.h> #include <linux/export.h> #include <linux/of.h> +#include <linux/debugfs.h> #include <linux/atomic.h> -#include <asm/debugfs.h> #include <asm/eeh.h> #include <asm/eeh_event.h> #include <asm/io.h> @@ -167,39 +167,33 @@ void eeh_show_enabled(void) */ static size_t eeh_dump_dev_log(struct eeh_dev *edev, char *buf, size_t len) { - struct pci_dn *pdn = eeh_dev_to_pdn(edev); u32 cfg; int cap, i; int n = 0, l = 0; char buffer[128]; - if (!pdn) { - pr_warn("EEH: Note: No error log for absent device.\n"); - return 0; - } - n += scnprintf(buf+n, len-n, "%04x:%02x:%02x.%01x\n", - pdn->phb->global_number, pdn->busno, - PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn)); + edev->pe->phb->global_number, edev->bdfn >> 8, + PCI_SLOT(edev->bdfn), PCI_FUNC(edev->bdfn)); pr_warn("EEH: of node=%04x:%02x:%02x.%01x\n", - pdn->phb->global_number, pdn->busno, - PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn)); + edev->pe->phb->global_number, edev->bdfn >> 8, + PCI_SLOT(edev->bdfn), PCI_FUNC(edev->bdfn)); - eeh_ops->read_config(pdn, PCI_VENDOR_ID, 4, &cfg); + eeh_ops->read_config(edev, PCI_VENDOR_ID, 4, &cfg); n += scnprintf(buf+n, len-n, "dev/vend:%08x\n", cfg); pr_warn("EEH: PCI device/vendor: %08x\n", cfg); - eeh_ops->read_config(pdn, PCI_COMMAND, 4, &cfg); + eeh_ops->read_config(edev, PCI_COMMAND, 4, &cfg); n += scnprintf(buf+n, len-n, "cmd/stat:%x\n", cfg); pr_warn("EEH: PCI cmd/status register: %08x\n", cfg); /* Gather bridge-specific registers */ if (edev->mode & EEH_DEV_BRIDGE) { - eeh_ops->read_config(pdn, PCI_SEC_STATUS, 2, &cfg); + eeh_ops->read_config(edev, PCI_SEC_STATUS, 2, &cfg); n += scnprintf(buf+n, len-n, "sec stat:%x\n", cfg); pr_warn("EEH: Bridge secondary status: %04x\n", cfg); - eeh_ops->read_config(pdn, PCI_BRIDGE_CONTROL, 2, &cfg); + eeh_ops->read_config(edev, PCI_BRIDGE_CONTROL, 2, &cfg); n += scnprintf(buf+n, len-n, "brdg ctl:%x\n", cfg); pr_warn("EEH: Bridge control: %04x\n", cfg); } @@ -207,11 +201,11 @@ static size_t eeh_dump_dev_log(struct eeh_dev *edev, char *buf, size_t len) /* Dump out the PCI-X command and status regs */ cap = edev->pcix_cap; if (cap) { - eeh_ops->read_config(pdn, cap, 4, &cfg); + eeh_ops->read_config(edev, cap, 4, &cfg); n += scnprintf(buf+n, len-n, "pcix-cmd:%x\n", cfg); pr_warn("EEH: PCI-X cmd: %08x\n", cfg); - eeh_ops->read_config(pdn, cap+4, 4, &cfg); + eeh_ops->read_config(edev, cap+4, 4, &cfg); n += scnprintf(buf+n, len-n, "pcix-stat:%x\n", cfg); pr_warn("EEH: PCI-X status: %08x\n", cfg); } @@ -223,7 +217,7 @@ static size_t eeh_dump_dev_log(struct eeh_dev *edev, char *buf, size_t len) pr_warn("EEH: PCI-E capabilities and status follow:\n"); for (i=0; i<=8; i++) { - eeh_ops->read_config(pdn, cap+4*i, 4, &cfg); + eeh_ops->read_config(edev, cap+4*i, 4, &cfg); n += scnprintf(buf+n, len-n, "%02x:%x\n", 4*i, cfg); if ((i % 4) == 0) { @@ -250,7 +244,7 @@ static size_t eeh_dump_dev_log(struct eeh_dev *edev, char *buf, size_t len) pr_warn("EEH: PCI-E AER capability register set follows:\n"); for (i=0; i<=13; i++) { - eeh_ops->read_config(pdn, cap+4*i, 4, &cfg); + eeh_ops->read_config(edev, cap+4*i, 4, &cfg); n += scnprintf(buf+n, len-n, "%02x:%x\n", 4*i, cfg); if ((i % 4) == 0) { @@ -352,31 +346,7 @@ void eeh_slot_error_detail(struct eeh_pe *pe, int severity) */ static inline unsigned long eeh_token_to_phys(unsigned long token) { - pte_t *ptep; - unsigned long pa; - int hugepage_shift; - - /* - * We won't find hugepages here(this is iomem). Hence we are not - * worried about _PAGE_SPLITTING/collapse. Also we will not hit - * page table free, because of init_mm. - */ - ptep = find_init_mm_pte(token, &hugepage_shift); - if (!ptep) - return token; - - pa = pte_pfn(*ptep); - - /* On radix we can do hugepage mappings for io, so handle that */ - if (hugepage_shift) { - pa <<= hugepage_shift; - pa |= token & ((1ul << hugepage_shift) - 1); - } else { - pa <<= PAGE_SHIFT; - pa |= token & (PAGE_SIZE - 1); - } - - return pa; + return ppc_find_vmap_phys(token); } /* @@ -429,6 +399,14 @@ out: return ret; } +static inline const char *eeh_driver_name(struct pci_dev *pdev) +{ + if (pdev) + return dev_driver_string(&pdev->dev); + + return "<null>"; +} + /** * eeh_dev_check_failure - Check if all 1's data is due to EEH slot freeze * @edev: eeh device @@ -472,11 +450,6 @@ int eeh_dev_check_failure(struct eeh_dev *edev) return 0; } - if (!pe->addr && !pe->config_addr) { - eeh_stats.no_cfg_addr++; - return 0; - } - /* * On PowerNV platform, we might already have fenced PHB * there and we need take care of that firstly. @@ -503,7 +476,7 @@ int eeh_dev_check_failure(struct eeh_dev *edev) rc = 1; if (pe->state & EEH_PE_ISOLATED) { pe->check_count++; - if (pe->check_count % EEH_MAX_FAILS == 0) { + if (pe->check_count == EEH_MAX_FAILS) { dn = pci_device_to_OF_node(dev); if (dn) location = of_get_property(dn, "ibm,loc-code", @@ -624,6 +597,7 @@ EXPORT_SYMBOL(eeh_check_failure); /** * eeh_pci_enable - Enable MMIO or DMA transfers for this slot * @pe: EEH PE + * @function: EEH option * * This routine should be called to reenable frozen MMIO or DMA * so that it would work correctly again. It's useful while doing @@ -726,7 +700,6 @@ static void eeh_disable_and_save_dev_state(struct eeh_dev *edev, static void eeh_restore_dev_state(struct eeh_dev *edev, void *userdata) { - struct pci_dn *pdn = eeh_dev_to_pdn(edev); struct pci_dev *pdev = eeh_dev_to_pci_dev(edev); struct pci_dev *dev = userdata; @@ -734,73 +707,14 @@ static void eeh_restore_dev_state(struct eeh_dev *edev, void *userdata) return; /* Apply customization from firmware */ - if (pdn && eeh_ops->restore_config) - eeh_ops->restore_config(pdn); + if (eeh_ops->restore_config) + eeh_ops->restore_config(edev); /* The caller should restore state for the specified device */ if (pdev != dev) pci_restore_state(pdev); } -int eeh_restore_vf_config(struct pci_dn *pdn) -{ - struct eeh_dev *edev = pdn_to_eeh_dev(pdn); - u32 devctl, cmd, cap2, aer_capctl; - int old_mps; - - if (edev->pcie_cap) { - /* Restore MPS */ - old_mps = (ffs(pdn->mps) - 8) << 5; - eeh_ops->read_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL, - 2, &devctl); - devctl &= ~PCI_EXP_DEVCTL_PAYLOAD; - devctl |= old_mps; - eeh_ops->write_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL, - 2, devctl); - - /* Disable Completion Timeout if possible */ - eeh_ops->read_config(pdn, edev->pcie_cap + PCI_EXP_DEVCAP2, - 4, &cap2); - if (cap2 & PCI_EXP_DEVCAP2_COMP_TMOUT_DIS) { - eeh_ops->read_config(pdn, - edev->pcie_cap + PCI_EXP_DEVCTL2, - 4, &cap2); - cap2 |= PCI_EXP_DEVCTL2_COMP_TMOUT_DIS; - eeh_ops->write_config(pdn, - edev->pcie_cap + PCI_EXP_DEVCTL2, - 4, cap2); - } - } - - /* Enable SERR and parity checking */ - eeh_ops->read_config(pdn, PCI_COMMAND, 2, &cmd); - cmd |= (PCI_COMMAND_PARITY | PCI_COMMAND_SERR); - eeh_ops->write_config(pdn, PCI_COMMAND, 2, cmd); - - /* Enable report various errors */ - if (edev->pcie_cap) { - eeh_ops->read_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL, - 2, &devctl); - devctl &= ~PCI_EXP_DEVCTL_CERE; - devctl |= (PCI_EXP_DEVCTL_NFERE | - PCI_EXP_DEVCTL_FERE | - PCI_EXP_DEVCTL_URRE); - eeh_ops->write_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL, - 2, devctl); - } - - /* Enable ECRC generation and check */ - if (edev->pcie_cap && edev->aer_cap) { - eeh_ops->read_config(pdn, edev->aer_cap + PCI_ERR_CAP, - 4, &aer_capctl); - aer_capctl |= (PCI_ERR_CAP_ECRC_GENE | PCI_ERR_CAP_ECRC_CHKE); - eeh_ops->write_config(pdn, edev->aer_cap + PCI_ERR_CAP, - 4, aer_capctl); - } - - return 0; -} - /** * pcibios_set_pcie_reset_state - Set PCI-E reset state * @dev: pci device struct @@ -850,14 +764,14 @@ int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state stat default: eeh_pe_state_clear(pe, EEH_PE_ISOLATED | EEH_PE_CFG_BLOCKED, true); return -EINVAL; - }; + } return 0; } /** - * eeh_set_pe_freset - Check the required reset for the indicated device - * @data: EEH device + * eeh_set_dev_freset - Check the required reset for the indicated device + * @edev: EEH device * @flag: return value * * Each device might have its preferred reset type: fundamental or @@ -896,6 +810,7 @@ static void eeh_pe_refreeze_passed(struct eeh_pe *root) /** * eeh_pe_reset_full - Complete a full reset process on the indicated PE * @pe: EEH PE + * @include_passed: include passed-through devices? * * This function executes a full reset procedure on a PE, including setting * the appropriate flags, performing a fundamental or hot reset, and then @@ -977,15 +892,13 @@ int eeh_pe_reset_full(struct eeh_pe *pe, bool include_passed) */ void eeh_save_bars(struct eeh_dev *edev) { - struct pci_dn *pdn; int i; - pdn = eeh_dev_to_pdn(edev); - if (!pdn) + if (!edev) return; for (i = 0; i < 16; i++) - eeh_ops->read_config(pdn, i * 4, 4, &edev->config_space[i]); + eeh_ops->read_config(edev, i * 4, 4, &edev->config_space[i]); /* * For PCI bridges including root port, we need enable bus @@ -997,56 +910,6 @@ void eeh_save_bars(struct eeh_dev *edev) edev->config_space[1] |= PCI_COMMAND_MASTER; } -/** - * eeh_ops_register - Register platform dependent EEH operations - * @ops: platform dependent EEH operations - * - * Register the platform dependent EEH operation callback - * functions. The platform should call this function before - * any other EEH operations. - */ -int __init eeh_ops_register(struct eeh_ops *ops) -{ - if (!ops->name) { - pr_warn("%s: Invalid EEH ops name for %p\n", - __func__, ops); - return -EINVAL; - } - - if (eeh_ops && eeh_ops != ops) { - pr_warn("%s: EEH ops of platform %s already existing (%s)\n", - __func__, eeh_ops->name, ops->name); - return -EEXIST; - } - - eeh_ops = ops; - - return 0; -} - -/** - * eeh_ops_unregister - Unreigster platform dependent EEH operations - * @name: name of EEH platform operations - * - * Unregister the platform dependent EEH operation callback - * functions. - */ -int __exit eeh_ops_unregister(const char *name) -{ - if (!name || !strlen(name)) { - pr_warn("%s: Invalid EEH ops name\n", - __func__); - return -EINVAL; - } - - if (eeh_ops && !strcmp(eeh_ops->name, name)) { - eeh_ops = NULL; - return 0; - } - - return -EEXIST; -} - static int eeh_reboot_notifier(struct notifier_block *nb, unsigned long action, void *unused) { @@ -1058,45 +921,66 @@ static struct notifier_block eeh_reboot_nb = { .notifier_call = eeh_reboot_notifier, }; +static int eeh_device_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct device *dev = data; + + switch (action) { + /* + * Note: It's not possible to perform EEH device addition (i.e. + * {pseries,pnv}_pcibios_bus_add_device()) here because it depends on + * the device's resources, which have not yet been set up. + */ + case BUS_NOTIFY_DEL_DEVICE: + eeh_remove_device(to_pci_dev(dev)); + break; + default: + break; + } + return NOTIFY_DONE; +} + +static struct notifier_block eeh_device_nb = { + .notifier_call = eeh_device_notifier, +}; + /** - * eeh_init - EEH initialization + * eeh_init - System wide EEH initialization + * @ops: struct to trace EEH operation callback functions * - * Initialize EEH by trying to enable it for all of the adapters in the system. - * As a side effect we can determine here if eeh is supported at all. - * Note that we leave EEH on so failed config cycles won't cause a machine - * check. If a user turns off EEH for a particular adapter they are really - * telling Linux to ignore errors. Some hardware (e.g. POWER5) won't - * grant access to a slot if EEH isn't enabled, and so we always enable - * EEH for all slots/all devices. - * - * The eeh-force-off option disables EEH checking globally, for all slots. - * Even if force-off is set, the EEH hardware is still enabled, so that - * newer systems can boot. + * It's the platform's job to call this from an arch_initcall(). */ -static int eeh_init(void) +int eeh_init(struct eeh_ops *ops) { struct pci_controller *hose, *tmp; int ret = 0; + /* the platform should only initialise EEH once */ + if (WARN_ON(eeh_ops)) + return -EEXIST; + if (WARN_ON(!ops)) + return -ENOENT; + eeh_ops = ops; + /* Register reboot notifier */ ret = register_reboot_notifier(&eeh_reboot_nb); if (ret) { - pr_warn("%s: Failed to register notifier (%d)\n", + pr_warn("%s: Failed to register reboot notifier (%d)\n", __func__, ret); return ret; } - /* call platform initialization function */ - if (!eeh_ops) { - pr_warn("%s: Platform EEH operation not found\n", - __func__); - return -EEXIST; - } else if ((ret = eeh_ops->init())) + ret = bus_register_notifier(&pci_bus_type, &eeh_device_nb); + if (ret) { + pr_warn("%s: Failed to register bus notifier (%d)\n", + __func__, ret); return ret; + } /* Initialize PHB PEs */ list_for_each_entry_safe(hose, tmp, &hose_list, list_node) - eeh_dev_phb_init_dynamic(hose); + eeh_phb_pe_create(hose); eeh_addr_cache_init(); @@ -1104,94 +988,47 @@ static int eeh_init(void) return eeh_event_init(); } -core_initcall_sync(eeh_init); - -/** - * eeh_add_device_early - Enable EEH for the indicated device node - * @pdn: PCI device node for which to set up EEH - * - * This routine must be used to perform EEH initialization for PCI - * devices that were added after system boot (e.g. hotplug, dlpar). - * This routine must be called before any i/o is performed to the - * adapter (inluding any config-space i/o). - * Whether this actually enables EEH or not for this device depends - * on the CEC architecture, type of the device, on earlier boot - * command-line arguments & etc. - */ -void eeh_add_device_early(struct pci_dn *pdn) -{ - struct pci_controller *phb = pdn ? pdn->phb : NULL; - struct eeh_dev *edev = pdn_to_eeh_dev(pdn); - - if (!edev) - return; - - if (!eeh_has_flag(EEH_PROBE_MODE_DEVTREE)) - return; - - /* USB Bus children of PCI devices will not have BUID's */ - if (NULL == phb || - (eeh_has_flag(EEH_PROBE_MODE_DEVTREE) && 0 == phb->buid)) - return; - - eeh_ops->probe(pdn, NULL); -} - /** - * eeh_add_device_tree_early - Enable EEH for the indicated device - * @pdn: PCI device node - * - * This routine must be used to perform EEH initialization for the - * indicated PCI device that was added after system boot (e.g. - * hotplug, dlpar). - */ -void eeh_add_device_tree_early(struct pci_dn *pdn) -{ - struct pci_dn *n; - - if (!pdn) - return; - - list_for_each_entry(n, &pdn->child_list, list) - eeh_add_device_tree_early(n); - eeh_add_device_early(pdn); -} -EXPORT_SYMBOL_GPL(eeh_add_device_tree_early); - -/** - * eeh_add_device_late - Perform EEH initialization for the indicated pci device + * eeh_probe_device() - Perform EEH initialization for the indicated pci device * @dev: pci device for which to set up EEH * * This routine must be used to complete EEH initialization for PCI * devices that were added after system boot (e.g. hotplug, dlpar). */ -void eeh_add_device_late(struct pci_dev *dev) +void eeh_probe_device(struct pci_dev *dev) { - struct pci_dn *pdn; struct eeh_dev *edev; - if (!dev) + pr_debug("EEH: Adding device %s\n", pci_name(dev)); + + /* + * pci_dev_to_eeh_dev() can only work if eeh_probe_dev() was + * already called for this device. + */ + if (WARN_ON_ONCE(pci_dev_to_eeh_dev(dev))) { + pci_dbg(dev, "Already bound to an eeh_dev!\n"); return; + } - pdn = pci_get_pdn_by_devfn(dev->bus, dev->devfn); - edev = pdn_to_eeh_dev(pdn); - eeh_edev_dbg(edev, "Adding device\n"); - if (edev->pdev == dev) { - eeh_edev_dbg(edev, "Device already referenced!\n"); + edev = eeh_ops->probe(dev); + if (!edev) { + pr_debug("EEH: Adding device failed\n"); return; } /* - * The EEH cache might not be removed correctly because of - * unbalanced kref to the device during unplug time, which - * relies on pcibios_release_device(). So we have to remove - * that here explicitly. + * FIXME: We rely on pcibios_release_device() to remove the + * existing EEH state. The release function is only called if + * the pci_dev's refcount drops to zero so if something is + * keeping a ref to a device (e.g. a filesystem) we need to + * remove the old EEH state. + * + * FIXME: HEY MA, LOOK AT ME, NO LOCKING! */ - if (edev->pdev) { - eeh_rmv_from_parent_pe(edev); + if (edev->pdev && edev->pdev != dev) { + eeh_pe_tree_remove(edev); eeh_addr_cache_rmv_dev(edev->pdev); eeh_sysfs_remove_device(edev->pdev); - edev->mode &= ~EEH_DEV_SYSFS; /* * We definitely should have the PCI device removed @@ -1199,69 +1036,16 @@ void eeh_add_device_late(struct pci_dev *dev) * into error handler afterwards. */ edev->mode |= EEH_DEV_NO_HANDLER; - - edev->pdev = NULL; - dev->dev.archdata.edev = NULL; } - if (eeh_has_flag(EEH_PROBE_MODE_DEV)) - eeh_ops->probe(pdn, NULL); - + /* bind the pdev and the edev together */ edev->pdev = dev; dev->dev.archdata.edev = edev; - eeh_addr_cache_insert_dev(dev); + eeh_sysfs_add_device(dev); } /** - * eeh_add_device_tree_late - Perform EEH initialization for the indicated PCI bus - * @bus: PCI bus - * - * This routine must be used to perform EEH initialization for PCI - * devices which are attached to the indicated PCI bus. The PCI bus - * is added after system boot through hotplug or dlpar. - */ -void eeh_add_device_tree_late(struct pci_bus *bus) -{ - struct pci_dev *dev; - - if (eeh_has_flag(EEH_FORCE_DISABLED)) - return; - list_for_each_entry(dev, &bus->devices, bus_list) { - eeh_add_device_late(dev); - if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) { - struct pci_bus *subbus = dev->subordinate; - if (subbus) - eeh_add_device_tree_late(subbus); - } - } -} -EXPORT_SYMBOL_GPL(eeh_add_device_tree_late); - -/** - * eeh_add_sysfs_files - Add EEH sysfs files for the indicated PCI bus - * @bus: PCI bus - * - * This routine must be used to add EEH sysfs files for PCI - * devices which are attached to the indicated PCI bus. The PCI bus - * is added after system boot through hotplug or dlpar. - */ -void eeh_add_sysfs_files(struct pci_bus *bus) -{ - struct pci_dev *dev; - - list_for_each_entry(dev, &bus->devices, bus_list) { - eeh_sysfs_add_device(dev); - if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) { - struct pci_bus *subbus = dev->subordinate; - if (subbus) - eeh_add_sysfs_files(subbus); - } - } -} -EXPORT_SYMBOL_GPL(eeh_add_sysfs_files); - -/** * eeh_remove_device - Undo EEH setup for the indicated pci device * @dev: pci device to be removed * @@ -1296,17 +1080,11 @@ void eeh_remove_device(struct pci_dev *dev) edev->pdev = NULL; /* - * The flag "in_error" is used to trace EEH devices for VFs - * in error state or not. It's set in eeh_report_error(). If - * it's not set, eeh_report_{reset,resume}() won't be called - * for the VF EEH device. + * eeh_sysfs_remove_device() uses pci_dev_to_eeh_dev() so we need to + * remove the sysfs files before clearing dev.archdata.edev */ - edev->in_error = false; - dev->dev.archdata.edev = NULL; - if (!(edev->pe->state & EEH_PE_KEEP)) - eeh_rmv_from_parent_pe(edev); - else - edev->mode |= EEH_DEV_DISCONNECTED; + if (edev->mode & EEH_DEV_SYSFS) + eeh_sysfs_remove_device(dev); /* * We're removing from the PCI subsystem, that means @@ -1317,8 +1095,19 @@ void eeh_remove_device(struct pci_dev *dev) edev->mode |= EEH_DEV_NO_HANDLER; eeh_addr_cache_rmv_dev(dev); - eeh_sysfs_remove_device(dev); - edev->mode &= ~EEH_DEV_SYSFS; + + /* + * The flag "in_error" is used to trace EEH devices for VFs + * in error state or not. It's set in eeh_report_error(). If + * it's not set, eeh_report_{reset,resume}() won't be called + * for the VF EEH device. + */ + edev->in_error = false; + dev->dev.archdata.edev = NULL; + if (!(edev->pe->state & EEH_PE_KEEP)) + eeh_pe_tree_remove(edev); + else + edev->mode |= EEH_DEV_DISCONNECTED; } int eeh_unfreeze_pe(struct eeh_pe *pe) @@ -1540,7 +1329,7 @@ int eeh_pe_set_option(struct eeh_pe *pe, int option) /* * EEH functionality could possibly be disabled, just - * return error for the case. And the EEH functinality + * return error for the case. And the EEH functionality * isn't expected to be disabled on one specific PE. */ switch (option) { @@ -1664,6 +1453,7 @@ static int eeh_pe_reenable_devices(struct eeh_pe *pe, bool include_passed) * eeh_pe_reset - Issue PE reset according to specified type * @pe: EEH PE * @option: reset type + * @include_passed: include passed-through devices? * * The routine is called to reset the specified PE with the * indicated type, either fundamental reset or hot reset. @@ -1735,12 +1525,12 @@ EXPORT_SYMBOL_GPL(eeh_pe_configure); * eeh_pe_inject_err - Injecting the specified PCI error to the indicated PE * @pe: the indicated PE * @type: error type - * @function: error function + * @func: error function * @addr: address * @mask: address mask * * The routine is called to inject the specified PCI error, which - * is determined by @type and @function, to the indicated PE for + * is determined by @type and @func, to the indicated PE for * testing purpose. */ int eeh_pe_inject_err(struct eeh_pe *pe, int type, int func, @@ -1766,6 +1556,7 @@ int eeh_pe_inject_err(struct eeh_pe *pe, int type, int func, } EXPORT_SYMBOL_GPL(eeh_pe_inject_err); +#ifdef CONFIG_PROC_FS static int proc_eeh_show(struct seq_file *m, void *v) { if (!eeh_enabled()) { @@ -1792,8 +1583,38 @@ static int proc_eeh_show(struct seq_file *m, void *v) return 0; } +#endif /* CONFIG_PROC_FS */ #ifdef CONFIG_DEBUG_FS + + +static struct pci_dev *eeh_debug_lookup_pdev(struct file *filp, + const char __user *user_buf, + size_t count, loff_t *ppos) +{ + uint32_t domain, bus, dev, fn; + struct pci_dev *pdev; + char buf[20]; + int ret; + + memset(buf, 0, sizeof(buf)); + ret = simple_write_to_buffer(buf, sizeof(buf)-1, ppos, user_buf, count); + if (!ret) + return ERR_PTR(-EFAULT); + + ret = sscanf(buf, "%x:%x:%x.%x", &domain, &bus, &dev, &fn); + if (ret != 4) { + pr_err("%s: expected 4 args, got %d\n", __func__, ret); + return ERR_PTR(-EINVAL); + } + + pdev = pci_get_domain_bus_and_slot(domain, bus, (dev << 3) | fn); + if (!pdev) + return ERR_PTR(-ENODEV); + + return pdev; +} + static int eeh_enable_dbgfs_set(void *data, u64 val) { if (val) @@ -1850,7 +1671,7 @@ static ssize_t eeh_force_recover_write(struct file *filp, return -ENODEV; /* Retrieve PE */ - pe = eeh_pe_get(hose, pe_no, 0); + pe = eeh_pe_get(hose, pe_no); if (!pe) return -ENODEV; @@ -1886,26 +1707,13 @@ static ssize_t eeh_dev_check_write(struct file *filp, const char __user *user_buf, size_t count, loff_t *ppos) { - uint32_t domain, bus, dev, fn; struct pci_dev *pdev; struct eeh_dev *edev; - char buf[20]; int ret; - memset(buf, 0, sizeof(buf)); - ret = simple_write_to_buffer(buf, sizeof(buf)-1, ppos, user_buf, count); - if (!ret) - return -EFAULT; - - ret = sscanf(buf, "%x:%x:%x.%x", &domain, &bus, &dev, &fn); - if (ret != 4) { - pr_err("%s: expected 4 args, got %d\n", __func__, ret); - return -EINVAL; - } - - pdev = pci_get_domain_bus_and_slot(domain, bus, (dev << 3) | fn); - if (!pdev) - return -ENODEV; + pdev = eeh_debug_lookup_pdev(filp, user_buf, count, ppos); + if (IS_ERR(pdev)) + return PTR_ERR(pdev); edev = pci_dev_to_eeh_dev(pdev); if (!edev) { @@ -1915,8 +1723,8 @@ static ssize_t eeh_dev_check_write(struct file *filp, } ret = eeh_dev_check_failure(edev); - pci_info(pdev, "eeh_dev_check_failure(%04x:%02x:%02x.%01x) = %d\n", - domain, bus, dev, fn, ret); + pci_info(pdev, "eeh_dev_check_failure(%s) = %d\n", + pci_name(pdev), ret); pci_dev_put(pdev); @@ -1996,7 +1804,7 @@ static int eeh_debugfs_break_device(struct pci_dev *pdev) * PE freeze. Using the in_8() accessor skips the eeh detection hook * so the freeze hook so the EEH Detection machinery won't be * triggered here. This is to match the usual behaviour of EEH - * where the HW will asyncronously freeze a PE and it's up to + * where the HW will asynchronously freeze a PE and it's up to * the kernel to notice and deal with it. * * 3. Turn Memory space back on. This is more important for VFs @@ -2027,25 +1835,12 @@ static ssize_t eeh_dev_break_write(struct file *filp, const char __user *user_buf, size_t count, loff_t *ppos) { - uint32_t domain, bus, dev, fn; struct pci_dev *pdev; - char buf[20]; int ret; - memset(buf, 0, sizeof(buf)); - ret = simple_write_to_buffer(buf, sizeof(buf)-1, ppos, user_buf, count); - if (!ret) - return -EFAULT; - - ret = sscanf(buf, "%x:%x:%x.%x", &domain, &bus, &dev, &fn); - if (ret != 4) { - pr_err("%s: expected 4 args, got %d\n", __func__, ret); - return -EINVAL; - } - - pdev = pci_get_domain_bus_and_slot(domain, bus, (dev << 3) | fn); - if (!pdev) - return -ENODEV; + pdev = eeh_debug_lookup_pdev(filp, user_buf, count, ppos); + if (IS_ERR(pdev)) + return PTR_ERR(pdev); ret = eeh_debugfs_break_device(pdev); pci_dev_put(pdev); @@ -2063,6 +1858,53 @@ static const struct file_operations eeh_dev_break_fops = { .read = eeh_debugfs_dev_usage, }; +static ssize_t eeh_dev_can_recover(struct file *filp, + const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct pci_driver *drv; + struct pci_dev *pdev; + size_t ret; + + pdev = eeh_debug_lookup_pdev(filp, user_buf, count, ppos); + if (IS_ERR(pdev)) + return PTR_ERR(pdev); + + /* + * In order for error recovery to work the driver needs to implement + * .error_detected(), so it can quiesce IO to the device, and + * .slot_reset() so it can re-initialise the device after a reset. + * + * Ideally they'd implement .resume() too, but some drivers which + * we need to support (notably IPR) don't so I guess we can tolerate + * that. + * + * .mmio_enabled() is mostly there as a work-around for devices which + * take forever to re-init after a hot reset. Implementing that is + * strictly optional. + */ + drv = pci_dev_driver(pdev); + if (drv && + drv->err_handler && + drv->err_handler->error_detected && + drv->err_handler->slot_reset) { + ret = count; + } else { + ret = -EOPNOTSUPP; + } + + pci_dev_put(pdev); + + return ret; +} + +static const struct file_operations eeh_dev_can_recover_fops = { + .open = simple_open, + .llseek = no_llseek, + .write = eeh_dev_can_recover, + .read = eeh_debugfs_dev_usage, +}; + #endif static int __init eeh_init_proc(void) @@ -2071,22 +1913,25 @@ static int __init eeh_init_proc(void) proc_create_single("powerpc/eeh", 0, NULL, proc_eeh_show); #ifdef CONFIG_DEBUG_FS debugfs_create_file_unsafe("eeh_enable", 0600, - powerpc_debugfs_root, NULL, + arch_debugfs_dir, NULL, &eeh_enable_dbgfs_ops); debugfs_create_u32("eeh_max_freezes", 0600, - powerpc_debugfs_root, &eeh_max_freezes); + arch_debugfs_dir, &eeh_max_freezes); debugfs_create_bool("eeh_disable_recovery", 0600, - powerpc_debugfs_root, + arch_debugfs_dir, &eeh_debugfs_no_recover); debugfs_create_file_unsafe("eeh_dev_check", 0600, - powerpc_debugfs_root, NULL, + arch_debugfs_dir, NULL, &eeh_dev_check_fops); debugfs_create_file_unsafe("eeh_dev_break", 0600, - powerpc_debugfs_root, NULL, + arch_debugfs_dir, NULL, &eeh_dev_break_fops); debugfs_create_file_unsafe("eeh_force_recover", 0600, - powerpc_debugfs_root, NULL, + arch_debugfs_dir, NULL, &eeh_force_recover_fops); + debugfs_create_file_unsafe("eeh_dev_can_recover", 0600, + arch_debugfs_dir, NULL, + &eeh_dev_can_recover_fops); eeh_cache_debugfs_init(); #endif } diff --git a/arch/powerpc/kernel/eeh_cache.c b/arch/powerpc/kernel/eeh_cache.c index cf11277ebd02..2f9dbf8ad2ee 100644 --- a/arch/powerpc/kernel/eeh_cache.c +++ b/arch/powerpc/kernel/eeh_cache.c @@ -12,8 +12,8 @@ #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/atomic.h> +#include <linux/debugfs.h> #include <asm/pci-bridge.h> -#include <asm/debugfs.h> #include <asm/ppc-pci.h> @@ -159,18 +159,10 @@ eeh_addr_cache_insert(struct pci_dev *dev, resource_size_t alo, static void __eeh_addr_cache_insert_dev(struct pci_dev *dev) { - struct pci_dn *pdn; struct eeh_dev *edev; int i; - pdn = pci_get_pdn_by_devfn(dev->bus, dev->devfn); - if (!pdn) { - pr_warn("PCI: no pci dn found for dev=%s\n", - pci_name(dev)); - return; - } - - edev = pdn_to_eeh_dev(pdn); + edev = pci_dev_to_eeh_dev(dev); if (!edev) { pr_warn("PCI: no EEH dev found for %s\n", pci_name(dev)); @@ -272,8 +264,9 @@ static int eeh_addr_cache_show(struct seq_file *s, void *v) { struct pci_io_addr_range *piar; struct rb_node *n; + unsigned long flags; - spin_lock(&pci_io_addr_cache_root.piar_lock); + spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags); for (n = rb_first(&pci_io_addr_cache_root.rb_root); n; n = rb_next(n)) { piar = rb_entry(n, struct pci_io_addr_range, rb_node); @@ -281,15 +274,15 @@ static int eeh_addr_cache_show(struct seq_file *s, void *v) (piar->flags & IORESOURCE_IO) ? "i/o" : "mem", &piar->addr_lo, &piar->addr_hi, pci_name(piar->pcidev)); } - spin_unlock(&pci_io_addr_cache_root.piar_lock); + spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags); return 0; } DEFINE_SHOW_ATTRIBUTE(eeh_addr_cache); -void eeh_cache_debugfs_init(void) +void __init eeh_cache_debugfs_init(void) { debugfs_create_file_unsafe("eeh_address_cache", 0400, - powerpc_debugfs_root, NULL, + arch_debugfs_dir, NULL, &eeh_addr_cache_fops); } diff --git a/arch/powerpc/kernel/eeh_dev.c b/arch/powerpc/kernel/eeh_dev.c deleted file mode 100644 index 7370185c7a05..000000000000 --- a/arch/powerpc/kernel/eeh_dev.c +++ /dev/null @@ -1,67 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * The file intends to implement dynamic creation of EEH device, which will - * be bound with OF node and PCI device simutaneously. The EEH devices would - * be foundamental information for EEH core components to work proerly. Besides, - * We have to support multiple situations where dynamic creation of EEH device - * is required: - * - * 1) Before PCI emunation starts, we need create EEH devices according to the - * PCI sensitive OF nodes. - * 2) When PCI emunation is done, we need do the binding between PCI device and - * the associated EEH device. - * 3) DR (Dynamic Reconfiguration) would create PCI sensitive OF node. EEH device - * will be created while PCI sensitive OF node is detected from DR. - * 4) PCI hotplug needs redoing the binding between PCI device and EEH device. If - * PHB is newly inserted, we also need create EEH devices accordingly. - * - * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2012. - */ - -#include <linux/export.h> -#include <linux/gfp.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/pci.h> -#include <linux/string.h> - -#include <asm/pci-bridge.h> -#include <asm/ppc-pci.h> - -/** - * eeh_dev_init - Create EEH device according to OF node - * @pdn: PCI device node - * - * It will create EEH device according to the given OF node. The function - * might be called by PCI emunation, DR, PHB hotplug. - */ -struct eeh_dev *eeh_dev_init(struct pci_dn *pdn) -{ - struct eeh_dev *edev; - - /* Allocate EEH device */ - edev = kzalloc(sizeof(*edev), GFP_KERNEL); - if (!edev) - return NULL; - - /* Associate EEH device with OF node */ - pdn->edev = edev; - edev->pdn = pdn; - edev->bdfn = (pdn->busno << 8) | pdn->devfn; - edev->controller = pdn->phb; - - return edev; -} - -/** - * eeh_dev_phb_init_dynamic - Create EEH devices for devices included in PHB - * @phb: PHB - * - * Scan the PHB OF node and its child association, then create the - * EEH devices accordingly - */ -void eeh_dev_phb_init_dynamic(struct pci_controller *phb) -{ - /* EEH PE for PHB */ - eeh_phb_pe_create(phb); -} diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 3dd1a422fc29..48773d2d9be3 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -16,7 +16,6 @@ #include <asm/eeh_event.h> #include <asm/ppc-pci.h> #include <asm/pci-bridge.h> -#include <asm/prom.h> #include <asm/rtas.h> struct eeh_rmv_data { @@ -40,7 +39,7 @@ static int eeh_result_priority(enum pci_ers_result result) case PCI_ERS_RESULT_NEED_RESET: return 6; default: - WARN_ONCE(1, "Unknown pci_ers_result value: %d\n", (int)result); + WARN_ONCE(1, "Unknown pci_ers_result value: %d\n", result); return 0; } }; @@ -61,7 +60,7 @@ static const char *pci_ers_result_name(enum pci_ers_result result) case PCI_ERS_RESULT_NO_AER_DRIVER: return "no AER driver"; default: - WARN_ONCE(1, "Unknown result type: %d\n", (int)result); + WARN_ONCE(1, "Unknown result type: %d\n", result); return "unknown"; } }; @@ -104,13 +103,13 @@ static bool eeh_edev_actionable(struct eeh_dev *edev) */ static inline struct pci_driver *eeh_pcid_get(struct pci_dev *pdev) { - if (!pdev || !pdev->driver) + if (!pdev || !pdev->dev.driver) return NULL; - if (!try_module_get(pdev->driver->driver.owner)) + if (!try_module_get(pdev->dev.driver->owner)) return NULL; - return pdev->driver; + return to_pci_driver(pdev->dev.driver); } /** @@ -122,10 +121,10 @@ static inline struct pci_driver *eeh_pcid_get(struct pci_dev *pdev) */ static inline void eeh_pcid_put(struct pci_dev *pdev) { - if (!pdev || !pdev->driver) + if (!pdev || !pdev->dev.driver) return; - module_put(pdev->driver->driver.owner); + module_put(pdev->dev.driver->owner); } /** @@ -214,7 +213,7 @@ static void eeh_dev_save_state(struct eeh_dev *edev, void *userdata) pci_save_state(pdev); } -static void eeh_set_channel_state(struct eeh_pe *root, enum pci_channel_state s) +static void eeh_set_channel_state(struct eeh_pe *root, pci_channel_state_t s) { struct eeh_pe *pe; struct eeh_dev *edev, *tmp; @@ -425,8 +424,8 @@ static enum pci_ers_result eeh_report_resume(struct eeh_dev *edev, pci_uevent_ers(edev->pdev, PCI_ERS_RESULT_RECOVERED); #ifdef CONFIG_PCI_IOV - if (eeh_ops->notify_resume && eeh_dev_to_pdn(edev)) - eeh_ops->notify_resume(eeh_dev_to_pdn(edev)); + if (eeh_ops->notify_resume) + eeh_ops->notify_resume(edev); #endif return PCI_ERS_RESULT_NONE; } @@ -477,7 +476,7 @@ static void *eeh_add_virt_device(struct eeh_dev *edev) } #ifdef CONFIG_PCI_IOV - pci_iov_add_virtfn(edev->physfn, eeh_dev_to_pdn(edev)->vf_index); + pci_iov_add_virtfn(edev->physfn, edev->vf_index); #endif return NULL; } @@ -521,16 +520,8 @@ static void eeh_rmv_device(struct eeh_dev *edev, void *userdata) if (edev->physfn) { #ifdef CONFIG_PCI_IOV - struct pci_dn *pdn = eeh_dev_to_pdn(edev); - - pci_iov_remove_virtfn(edev->physfn, pdn->vf_index); + pci_iov_remove_virtfn(edev->physfn, edev->vf_index); edev->pdev = NULL; - - /* - * We have to set the VF PE number to invalid one, which is - * required to plug the VF successfully. - */ - pdn->pe_number = IODA_INVALID_PE; #endif if (rmv_data) list_add(&edev->rmv_entry, &rmv_data->removed_vf_list); @@ -550,7 +541,7 @@ static void *eeh_pe_detach_dev(struct eeh_pe *pe, void *userdata) continue; edev->mode &= ~(EEH_DEV_DISCONNECTED | EEH_DEV_IRQ_DISABLED); - eeh_rmv_from_parent_pe(edev); + eeh_pe_tree_remove(edev); } return NULL; @@ -759,7 +750,7 @@ static void eeh_pe_cleanup(struct eeh_pe *pe) * @pdev: pci_dev to check * * This function may return a false positive if we can't determine the slot's - * presence state. This might happen for for PCIe slots if the PE containing + * presence state. This might happen for PCIe slots if the PE containing * the upstream bridge is also frozen, or the bridge is part of the same PE * as the device. * @@ -913,18 +904,19 @@ void eeh_handle_normal_event(struct eeh_pe *pe) } #endif /* CONFIG_STACKTRACE */ + eeh_for_each_pe(pe, tmp_pe) + eeh_pe_for_each_dev(tmp_pe, edev, tmp) + edev->mode &= ~EEH_DEV_NO_HANDLER; + eeh_pe_update_time_stamp(pe); pe->freeze_count++; if (pe->freeze_count > eeh_max_freezes) { pr_err("EEH: PHB#%x-PE#%x has failed %d times in the last hour and has been permanently disabled.\n", pe->phb->global_number, pe->addr, pe->freeze_count); - result = PCI_ERS_RESULT_DISCONNECT; - } - eeh_for_each_pe(pe, tmp_pe) - eeh_pe_for_each_dev(tmp_pe, edev, tmp) - edev->mode &= ~EEH_DEV_NO_HANDLER; + goto recover_failed; + } /* Walk the various device drivers attached to this slot through * a reset sequence, giving each an opportunity to do what it needs @@ -936,39 +928,38 @@ void eeh_handle_normal_event(struct eeh_pe *pe) * the error. Override the result if necessary to have partially * hotplug for this case. */ - if (result != PCI_ERS_RESULT_DISCONNECT) { - pr_warn("EEH: This PCI device has failed %d times in the last hour and will be permanently disabled after %d failures.\n", - pe->freeze_count, eeh_max_freezes); - pr_info("EEH: Notify device drivers to shutdown\n"); - eeh_set_channel_state(pe, pci_channel_io_frozen); - eeh_set_irq_state(pe, false); - eeh_pe_report("error_detected(IO frozen)", pe, - eeh_report_error, &result); - if ((pe->type & EEH_PE_PHB) && - result != PCI_ERS_RESULT_NONE && - result != PCI_ERS_RESULT_NEED_RESET) - result = PCI_ERS_RESULT_NEED_RESET; - } + pr_warn("EEH: This PCI device has failed %d times in the last hour and will be permanently disabled after %d failures.\n", + pe->freeze_count, eeh_max_freezes); + pr_info("EEH: Notify device drivers to shutdown\n"); + eeh_set_channel_state(pe, pci_channel_io_frozen); + eeh_set_irq_state(pe, false); + eeh_pe_report("error_detected(IO frozen)", pe, + eeh_report_error, &result); + if (result == PCI_ERS_RESULT_DISCONNECT) + goto recover_failed; + + /* + * Error logged on a PHB are always fences which need a full + * PHB reset to clear so force that to happen. + */ + if ((pe->type & EEH_PE_PHB) && result != PCI_ERS_RESULT_NONE) + result = PCI_ERS_RESULT_NEED_RESET; /* Get the current PCI slot state. This can take a long time, * sometimes over 300 seconds for certain systems. */ - if (result != PCI_ERS_RESULT_DISCONNECT) { - rc = eeh_wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000); - if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) { - pr_warn("EEH: Permanent failure\n"); - result = PCI_ERS_RESULT_DISCONNECT; - } + rc = eeh_wait_state(pe, MAX_WAIT_FOR_RECOVERY * 1000); + if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) { + pr_warn("EEH: Permanent failure\n"); + goto recover_failed; } /* Since rtas may enable MMIO when posting the error log, * don't post the error log until after all dev drivers * have been informed. */ - if (result != PCI_ERS_RESULT_DISCONNECT) { - pr_info("EEH: Collect temporary log\n"); - eeh_slot_error_detail(pe, EEH_LOG_TEMP); - } + pr_info("EEH: Collect temporary log\n"); + eeh_slot_error_detail(pe, EEH_LOG_TEMP); /* If all device drivers were EEH-unaware, then shut * down all of the device drivers, and hope they @@ -978,9 +969,8 @@ void eeh_handle_normal_event(struct eeh_pe *pe) pr_info("EEH: Reset with hotplug activity\n"); rc = eeh_reset_device(pe, bus, NULL, false); if (rc) { - pr_warn("%s: Unable to reset, err=%d\n", - __func__, rc); - result = PCI_ERS_RESULT_DISCONNECT; + pr_warn("%s: Unable to reset, err=%d\n", __func__, rc); + goto recover_failed; } } @@ -988,10 +978,10 @@ void eeh_handle_normal_event(struct eeh_pe *pe) if (result == PCI_ERS_RESULT_CAN_RECOVER) { pr_info("EEH: Enable I/O for affected devices\n"); rc = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO); + if (rc < 0) + goto recover_failed; - if (rc < 0) { - result = PCI_ERS_RESULT_DISCONNECT; - } else if (rc) { + if (rc) { result = PCI_ERS_RESULT_NEED_RESET; } else { pr_info("EEH: Notify device drivers to resume I/O\n"); @@ -999,15 +989,13 @@ void eeh_handle_normal_event(struct eeh_pe *pe) eeh_report_mmio_enabled, &result); } } - - /* If all devices reported they can proceed, then re-enable DMA */ if (result == PCI_ERS_RESULT_CAN_RECOVER) { pr_info("EEH: Enabled DMA for affected devices\n"); rc = eeh_pci_enable(pe, EEH_OPT_THAW_DMA); + if (rc < 0) + goto recover_failed; - if (rc < 0) { - result = PCI_ERS_RESULT_DISCONNECT; - } else if (rc) { + if (rc) { result = PCI_ERS_RESULT_NEED_RESET; } else { /* @@ -1025,16 +1013,15 @@ void eeh_handle_normal_event(struct eeh_pe *pe) pr_info("EEH: Reset without hotplug activity\n"); rc = eeh_reset_device(pe, bus, &rmv_data, true); if (rc) { - pr_warn("%s: Cannot reset, err=%d\n", - __func__, rc); - result = PCI_ERS_RESULT_DISCONNECT; - } else { - result = PCI_ERS_RESULT_NONE; - eeh_set_channel_state(pe, pci_channel_io_normal); - eeh_set_irq_state(pe, true); - eeh_pe_report("slot_reset", pe, eeh_report_reset, - &result); + pr_warn("%s: Cannot reset, err=%d\n", __func__, rc); + goto recover_failed; } + + result = PCI_ERS_RESULT_NONE; + eeh_set_channel_state(pe, pci_channel_io_normal); + eeh_set_irq_state(pe, true); + eeh_pe_report("slot_reset", pe, eeh_report_reset, + &result); } if ((result == PCI_ERS_RESULT_RECOVERED) || @@ -1062,45 +1049,47 @@ void eeh_handle_normal_event(struct eeh_pe *pe) } pr_info("EEH: Recovery successful.\n"); - } else { - /* - * About 90% of all real-life EEH failures in the field - * are due to poorly seated PCI cards. Only 10% or so are - * due to actual, failed cards. - */ - pr_err("EEH: Unable to recover from failure from PHB#%x-PE#%x.\n" - "Please try reseating or replacing it\n", - pe->phb->global_number, pe->addr); + goto out; + } - eeh_slot_error_detail(pe, EEH_LOG_PERM); +recover_failed: + /* + * About 90% of all real-life EEH failures in the field + * are due to poorly seated PCI cards. Only 10% or so are + * due to actual, failed cards. + */ + pr_err("EEH: Unable to recover from failure from PHB#%x-PE#%x.\n" + "Please try reseating or replacing it\n", + pe->phb->global_number, pe->addr); - /* Notify all devices that they're about to go down. */ - eeh_set_channel_state(pe, pci_channel_io_perm_failure); - eeh_set_irq_state(pe, false); - eeh_pe_report("error_detected(permanent failure)", pe, - eeh_report_failure, NULL); + eeh_slot_error_detail(pe, EEH_LOG_PERM); - /* Mark the PE to be removed permanently */ - eeh_pe_state_mark(pe, EEH_PE_REMOVED); + /* Notify all devices that they're about to go down. */ + eeh_set_irq_state(pe, false); + eeh_pe_report("error_detected(permanent failure)", pe, + eeh_report_failure, NULL); + eeh_set_channel_state(pe, pci_channel_io_perm_failure); - /* - * Shut down the device drivers for good. We mark - * all removed devices correctly to avoid access - * the their PCI config any more. - */ - if (pe->type & EEH_PE_VF) { - eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL); - eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); - } else { - eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true); - eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); + /* Mark the PE to be removed permanently */ + eeh_pe_state_mark(pe, EEH_PE_REMOVED); - pci_lock_rescan_remove(); - pci_hp_remove_devices(bus); - pci_unlock_rescan_remove(); - /* The passed PE should no longer be used */ - return; - } + /* + * Shut down the device drivers for good. We mark + * all removed devices correctly to avoid access + * the their PCI config any more. + */ + if (pe->type & EEH_PE_VF) { + eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL); + eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); + } else { + eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true); + eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); + + pci_lock_rescan_remove(); + pci_hp_remove_devices(bus); + pci_unlock_rescan_remove(); + /* The passed PE should no longer be used */ + return; } out: @@ -1190,6 +1179,17 @@ void eeh_handle_special_event(void) eeh_pe_state_mark(pe, EEH_PE_RECOVERING); eeh_handle_normal_event(pe); } else { + eeh_for_each_pe(pe, tmp_pe) + eeh_pe_for_each_dev(tmp_pe, edev, tmp_edev) + edev->mode &= ~EEH_DEV_NO_HANDLER; + + /* Notify all devices to be down */ + eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true); + eeh_pe_report( + "error_detected(permanent failure)", pe, + eeh_report_failure, NULL); + eeh_set_channel_state(pe, pci_channel_io_perm_failure); + pci_lock_rescan_remove(); list_for_each_entry(hose, &hose_list, list_node) { phb_pe = eeh_phb_pe_get(hose); @@ -1198,16 +1198,6 @@ void eeh_handle_special_event(void) (phb_pe->state & EEH_PE_RECOVERING)) continue; - eeh_for_each_pe(pe, tmp_pe) - eeh_pe_for_each_dev(tmp_pe, edev, tmp_edev) - edev->mode &= ~EEH_DEV_NO_HANDLER; - - /* Notify all devices to be down */ - eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true); - eeh_set_channel_state(pe, pci_channel_io_perm_failure); - eeh_pe_report( - "error_detected(permanent failure)", pe, - eeh_report_failure, NULL); bus = eeh_pe_bus_get(phb_pe); if (!bus) { pr_err("%s: Cannot find PCI bus for " diff --git a/arch/powerpc/kernel/eeh_event.c b/arch/powerpc/kernel/eeh_event.c index a7a8dc182efb..c23a454af08a 100644 --- a/arch/powerpc/kernel/eeh_event.c +++ b/arch/powerpc/kernel/eeh_event.c @@ -143,7 +143,7 @@ int __eeh_send_failure_event(struct eeh_pe *pe) int eeh_send_failure_event(struct eeh_pe *pe) { /* - * If we've manually supressed recovery events via debugfs + * If we've manually suppressed recovery events via debugfs * then just drop it on the floor. */ if (eeh_debugfs_no_recover) { diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c index 177852e39a25..e0ce81279624 100644 --- a/arch/powerpc/kernel/eeh_pe.c +++ b/arch/powerpc/kernel/eeh_pe.c @@ -13,6 +13,7 @@ #include <linux/export.h> #include <linux/gfp.h> #include <linux/kernel.h> +#include <linux/of.h> #include <linux/pci.h> #include <linux/string.h> @@ -251,43 +252,21 @@ void eeh_pe_dev_traverse(struct eeh_pe *root, /** * __eeh_pe_get - Check the PE address - * @data: EEH PE - * @flag: EEH device * * For one particular PE, it can be identified by PE address * or tranditional BDF address. BDF address is composed of * Bus/Device/Function number. The extra data referred by flag * indicates which type of address should be used. */ -struct eeh_pe_get_flag { - int pe_no; - int config_addr; -}; - static void *__eeh_pe_get(struct eeh_pe *pe, void *flag) { - struct eeh_pe_get_flag *tmp = (struct eeh_pe_get_flag *) flag; + int *target_pe = flag; - /* Unexpected PHB PE */ + /* PHB PEs are special and should be ignored */ if (pe->type & EEH_PE_PHB) return NULL; - /* - * We prefer PE address. For most cases, we should - * have non-zero PE address - */ - if (eeh_has_flag(EEH_VALID_PE_ZERO)) { - if (tmp->pe_no == pe->addr) - return pe; - } else { - if (tmp->pe_no && - (tmp->pe_no == pe->addr)) - return pe; - } - - /* Try BDF address */ - if (tmp->config_addr && - (tmp->config_addr == pe->config_addr)) + if (*target_pe == pe->addr) return pe; return NULL; @@ -297,7 +276,6 @@ static void *__eeh_pe_get(struct eeh_pe *pe, void *flag) * eeh_pe_get - Search PE based on the given address * @phb: PCI controller * @pe_no: PE number - * @config_addr: Config address * * Search the corresponding PE based on the specified address which * is included in the eeh device. The function is used to check if @@ -306,75 +284,30 @@ static void *__eeh_pe_get(struct eeh_pe *pe, void *flag) * which is composed of PCI bus/device/function number, or unified * PE address. */ -struct eeh_pe *eeh_pe_get(struct pci_controller *phb, - int pe_no, int config_addr) +struct eeh_pe *eeh_pe_get(struct pci_controller *phb, int pe_no) { struct eeh_pe *root = eeh_phb_pe_get(phb); - struct eeh_pe_get_flag tmp = { pe_no, config_addr }; - struct eeh_pe *pe; - - pe = eeh_pe_traverse(root, __eeh_pe_get, &tmp); - return pe; + return eeh_pe_traverse(root, __eeh_pe_get, &pe_no); } /** - * eeh_pe_get_parent - Retrieve the parent PE + * eeh_pe_tree_insert - Add EEH device to parent PE * @edev: EEH device + * @new_pe_parent: PE to create additional PEs under * - * The whole PEs existing in the system are organized as hierarchy - * tree. The function is used to retrieve the parent PE according - * to the parent EEH device. - */ -static struct eeh_pe *eeh_pe_get_parent(struct eeh_dev *edev) -{ - struct eeh_dev *parent; - struct pci_dn *pdn = eeh_dev_to_pdn(edev); - - /* - * It might have the case for the indirect parent - * EEH device already having associated PE, but - * the direct parent EEH device doesn't have yet. - */ - if (edev->physfn) - pdn = pci_get_pdn(edev->physfn); - else - pdn = pdn ? pdn->parent : NULL; - while (pdn) { - /* We're poking out of PCI territory */ - parent = pdn_to_eeh_dev(pdn); - if (!parent) - return NULL; - - if (parent->pe) - return parent->pe; - - pdn = pdn->parent; - } - - return NULL; -} - -/** - * eeh_add_to_parent_pe - Add EEH device to parent PE - * @edev: EEH device + * Add EEH device to the PE in edev->pe_config_addr. If a PE already + * exists with that address then @edev is added to that PE. Otherwise + * a new PE is created and inserted into the PE tree as a child of + * @new_pe_parent. * - * Add EEH device to the parent PE. If the parent PE already - * exists, the PE type will be changed to EEH_PE_BUS. Otherwise, - * we have to create new PE to hold the EEH device and the new - * PE will be linked to its parent PE as well. + * If @new_pe_parent is NULL then the new PE will be inserted under + * directly under the PHB. */ -int eeh_add_to_parent_pe(struct eeh_dev *edev) +int eeh_pe_tree_insert(struct eeh_dev *edev, struct eeh_pe *new_pe_parent) { + struct pci_controller *hose = edev->controller; struct eeh_pe *pe, *parent; - struct pci_dn *pdn = eeh_dev_to_pdn(edev); - int config_addr = (pdn->busno << 8) | (pdn->devfn); - - /* Check if the PE number is valid */ - if (!eeh_has_flag(EEH_VALID_PE_ZERO) && !edev->pe_config_addr) { - eeh_edev_err(edev, "PE#0 is invalid for this PHB!\n"); - return -EINVAL; - } /* * Search the PE has been existing or not according @@ -382,7 +315,7 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) * PE should be composed of PCI bus and its subordinate * components. */ - pe = eeh_pe_get(pdn->phb, edev->pe_config_addr, config_addr); + pe = eeh_pe_get(hose, edev->pe_config_addr); if (pe) { if (pe->type & EEH_PE_INVALID) { list_add_tail(&edev->entry, &pe->edevs); @@ -399,8 +332,7 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) parent = parent->parent; } - eeh_edev_dbg(edev, - "Added to device PE (parent: PE#%x)\n", + eeh_edev_dbg(edev, "Added to existing PE (parent: PE#%x)\n", pe->parent->addr); } else { /* Mark the PE as type of PCI bus */ @@ -416,15 +348,15 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) /* Create a new EEH PE */ if (edev->physfn) - pe = eeh_pe_alloc(pdn->phb, EEH_PE_VF); + pe = eeh_pe_alloc(hose, EEH_PE_VF); else - pe = eeh_pe_alloc(pdn->phb, EEH_PE_DEVICE); + pe = eeh_pe_alloc(hose, EEH_PE_DEVICE); if (!pe) { pr_err("%s: out of memory!\n", __func__); return -ENOMEM; } - pe->addr = edev->pe_config_addr; - pe->config_addr = config_addr; + + pe->addr = edev->pe_config_addr; /* * Put the new EEH PE into hierarchy tree. If the parent @@ -432,34 +364,35 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) * to PHB directly. Otherwise, we have to associate the * PE with its parent. */ - parent = eeh_pe_get_parent(edev); - if (!parent) { - parent = eeh_phb_pe_get(pdn->phb); - if (!parent) { + if (!new_pe_parent) { + new_pe_parent = eeh_phb_pe_get(hose); + if (!new_pe_parent) { pr_err("%s: No PHB PE is found (PHB Domain=%d)\n", - __func__, pdn->phb->global_number); + __func__, hose->global_number); edev->pe = NULL; kfree(pe); return -EEXIST; } } - pe->parent = parent; + + /* link new PE into the tree */ + pe->parent = new_pe_parent; + list_add_tail(&pe->child, &new_pe_parent->child_list); /* * Put the newly created PE into the child list and * link the EEH device accordingly. */ - list_add_tail(&pe->child, &parent->child_list); list_add_tail(&edev->entry, &pe->edevs); edev->pe = pe; - eeh_edev_dbg(edev, "Added to device PE (parent: PE#%x)\n", - pe->parent->addr); + eeh_edev_dbg(edev, "Added to new (parent: PE#%x)\n", + new_pe_parent->addr); return 0; } /** - * eeh_rmv_from_parent_pe - Remove one EEH device from the associated PE + * eeh_pe_tree_remove - Remove one EEH device from the associated PE * @edev: EEH device * * The PE hierarchy tree might be changed when doing PCI hotplug. @@ -467,7 +400,7 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) * during EEH recovery. So we have to call the function remove the * corresponding PE accordingly if necessary. */ -int eeh_rmv_from_parent_pe(struct eeh_dev *edev) +int eeh_pe_tree_remove(struct eeh_dev *edev) { struct eeh_pe *pe, *parent, *child; bool keep, recover; @@ -698,7 +631,6 @@ void eeh_pe_state_clear(struct eeh_pe *root, int state, bool include_passed) */ static void eeh_bridge_check_link(struct eeh_dev *edev) { - struct pci_dn *pdn = eeh_dev_to_pdn(edev); int cap; uint32_t val; int timeout = 0; @@ -714,34 +646,33 @@ static void eeh_bridge_check_link(struct eeh_dev *edev) /* Check slot status */ cap = edev->pcie_cap; - eeh_ops->read_config(pdn, cap + PCI_EXP_SLTSTA, 2, &val); + eeh_ops->read_config(edev, cap + PCI_EXP_SLTSTA, 2, &val); if (!(val & PCI_EXP_SLTSTA_PDS)) { eeh_edev_dbg(edev, "No card in the slot (0x%04x) !\n", val); return; } /* Check power status if we have the capability */ - eeh_ops->read_config(pdn, cap + PCI_EXP_SLTCAP, 2, &val); + eeh_ops->read_config(edev, cap + PCI_EXP_SLTCAP, 2, &val); if (val & PCI_EXP_SLTCAP_PCP) { - eeh_ops->read_config(pdn, cap + PCI_EXP_SLTCTL, 2, &val); + eeh_ops->read_config(edev, cap + PCI_EXP_SLTCTL, 2, &val); if (val & PCI_EXP_SLTCTL_PCC) { eeh_edev_dbg(edev, "In power-off state, power it on ...\n"); val &= ~(PCI_EXP_SLTCTL_PCC | PCI_EXP_SLTCTL_PIC); val |= (0x0100 & PCI_EXP_SLTCTL_PIC); - eeh_ops->write_config(pdn, cap + PCI_EXP_SLTCTL, 2, val); + eeh_ops->write_config(edev, cap + PCI_EXP_SLTCTL, 2, val); msleep(2 * 1000); } } /* Enable link */ - eeh_ops->read_config(pdn, cap + PCI_EXP_LNKCTL, 2, &val); + eeh_ops->read_config(edev, cap + PCI_EXP_LNKCTL, 2, &val); val &= ~PCI_EXP_LNKCTL_LD; - eeh_ops->write_config(pdn, cap + PCI_EXP_LNKCTL, 2, val); + eeh_ops->write_config(edev, cap + PCI_EXP_LNKCTL, 2, val); /* Check link */ - eeh_ops->read_config(pdn, cap + PCI_EXP_LNKCAP, 4, &val); - if (!(val & PCI_EXP_LNKCAP_DLLLARC)) { - eeh_edev_dbg(edev, "No link reporting capability (0x%08x) \n", val); + if (!edev->pdev->link_active_reporting) { + eeh_edev_dbg(edev, "No link reporting capability\n"); msleep(1000); return; } @@ -752,7 +683,7 @@ static void eeh_bridge_check_link(struct eeh_dev *edev) msleep(20); timeout += 20; - eeh_ops->read_config(pdn, cap + PCI_EXP_LNKSTA, 2, &val); + eeh_ops->read_config(edev, cap + PCI_EXP_LNKSTA, 2, &val); if (val & PCI_EXP_LNKSTA_DLLLA) break; } @@ -769,7 +700,6 @@ static void eeh_bridge_check_link(struct eeh_dev *edev) static void eeh_restore_bridge_bars(struct eeh_dev *edev) { - struct pci_dn *pdn = eeh_dev_to_pdn(edev); int i; /* @@ -777,20 +707,20 @@ static void eeh_restore_bridge_bars(struct eeh_dev *edev) * Bus numbers and windows: 0x18 - 0x30 */ for (i = 4; i < 13; i++) - eeh_ops->write_config(pdn, i*4, 4, edev->config_space[i]); + eeh_ops->write_config(edev, i*4, 4, edev->config_space[i]); /* Rom: 0x38 */ - eeh_ops->write_config(pdn, 14*4, 4, edev->config_space[14]); + eeh_ops->write_config(edev, 14*4, 4, edev->config_space[14]); /* Cache line & Latency timer: 0xC 0xD */ - eeh_ops->write_config(pdn, PCI_CACHE_LINE_SIZE, 1, + eeh_ops->write_config(edev, PCI_CACHE_LINE_SIZE, 1, SAVED_BYTE(PCI_CACHE_LINE_SIZE)); - eeh_ops->write_config(pdn, PCI_LATENCY_TIMER, 1, - SAVED_BYTE(PCI_LATENCY_TIMER)); + eeh_ops->write_config(edev, PCI_LATENCY_TIMER, 1, + SAVED_BYTE(PCI_LATENCY_TIMER)); /* Max latency, min grant, interrupt ping and line: 0x3C */ - eeh_ops->write_config(pdn, 15*4, 4, edev->config_space[15]); + eeh_ops->write_config(edev, 15*4, 4, edev->config_space[15]); /* PCI Command: 0x4 */ - eeh_ops->write_config(pdn, PCI_COMMAND, 4, edev->config_space[1] | + eeh_ops->write_config(edev, PCI_COMMAND, 4, edev->config_space[1] | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER); /* Check the PCIe link is ready */ @@ -799,28 +729,27 @@ static void eeh_restore_bridge_bars(struct eeh_dev *edev) static void eeh_restore_device_bars(struct eeh_dev *edev) { - struct pci_dn *pdn = eeh_dev_to_pdn(edev); int i; u32 cmd; for (i = 4; i < 10; i++) - eeh_ops->write_config(pdn, i*4, 4, edev->config_space[i]); + eeh_ops->write_config(edev, i*4, 4, edev->config_space[i]); /* 12 == Expansion ROM Address */ - eeh_ops->write_config(pdn, 12*4, 4, edev->config_space[12]); + eeh_ops->write_config(edev, 12*4, 4, edev->config_space[12]); - eeh_ops->write_config(pdn, PCI_CACHE_LINE_SIZE, 1, + eeh_ops->write_config(edev, PCI_CACHE_LINE_SIZE, 1, SAVED_BYTE(PCI_CACHE_LINE_SIZE)); - eeh_ops->write_config(pdn, PCI_LATENCY_TIMER, 1, + eeh_ops->write_config(edev, PCI_LATENCY_TIMER, 1, SAVED_BYTE(PCI_LATENCY_TIMER)); /* max latency, min grant, interrupt pin and line */ - eeh_ops->write_config(pdn, 15*4, 4, edev->config_space[15]); + eeh_ops->write_config(edev, 15*4, 4, edev->config_space[15]); /* * Restore PERR & SERR bits, some devices require it, * don't touch the other command bits */ - eeh_ops->read_config(pdn, PCI_COMMAND, 4, &cmd); + eeh_ops->read_config(edev, PCI_COMMAND, 4, &cmd); if (edev->config_space[1] & PCI_COMMAND_PARITY) cmd |= PCI_COMMAND_PARITY; else @@ -829,7 +758,7 @@ static void eeh_restore_device_bars(struct eeh_dev *edev) cmd |= PCI_COMMAND_SERR; else cmd &= ~PCI_COMMAND_SERR; - eeh_ops->write_config(pdn, PCI_COMMAND, 4, cmd); + eeh_ops->write_config(edev, PCI_COMMAND, 4, cmd); } /** @@ -843,16 +772,14 @@ static void eeh_restore_device_bars(struct eeh_dev *edev) */ static void eeh_restore_one_device_bars(struct eeh_dev *edev, void *flag) { - struct pci_dn *pdn = eeh_dev_to_pdn(edev); - /* Do special restore for bridges */ if (edev->mode & EEH_DEV_BRIDGE) eeh_restore_bridge_bars(edev); else eeh_restore_device_bars(edev); - if (eeh_ops->restore_config && pdn) - eeh_ops->restore_config(pdn); + if (eeh_ops->restore_config) + eeh_ops->restore_config(edev); } /** diff --git a/arch/powerpc/kernel/eeh_sysfs.c b/arch/powerpc/kernel/eeh_sysfs.c index ab44d965a53c..706e1eb95efe 100644 --- a/arch/powerpc/kernel/eeh_sysfs.c +++ b/arch/powerpc/kernel/eeh_sysfs.c @@ -6,6 +6,7 @@ * * Send comments and feedback to Linas Vepstas <linas@austin.ibm.com> */ +#include <linux/of.h> #include <linux/pci.h> #include <linux/stat.h> #include <asm/ppc-pci.h> @@ -14,7 +15,7 @@ /** * EEH_SHOW_ATTR -- Create sysfs entry for eeh statistic * @_name: name of file in sysfs directory - * @_memb: name of member in struct pci_dn to access + * @_memb: name of member in struct eeh_dev to access * @_format: printf format for display * * All of the attributes look very similar, so just @@ -75,7 +76,7 @@ static ssize_t eeh_pe_state_store(struct device *dev, static DEVICE_ATTR_RW(eeh_pe_state); -#ifdef CONFIG_PCI_IOV +#if defined(CONFIG_PCI_IOV) && defined(CONFIG_PPC_PSERIES) static ssize_t eeh_notify_resume_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -86,7 +87,6 @@ static ssize_t eeh_notify_resume_show(struct device *dev, if (!edev || !edev->pe) return -ENODEV; - pdn = pci_get_pdn(pdev); return sprintf(buf, "%d\n", pdn->last_allow_rc); } @@ -100,7 +100,7 @@ static ssize_t eeh_notify_resume_store(struct device *dev, if (!edev || !edev->pe || !eeh_ops->notify_resume) return -ENODEV; - if (eeh_ops->notify_resume(pci_get_pdn(pdev))) + if (eeh_ops->notify_resume(edev)) return -EIO; return count; @@ -132,7 +132,7 @@ static void eeh_notify_resume_remove(struct pci_dev *pdev) #else static inline int eeh_notify_resume_add(struct pci_dev *pdev) { return 0; } static inline void eeh_notify_resume_remove(struct pci_dev *pdev) { } -#endif /* CONFIG_PCI_IOV */ +#endif /* CONFIG_PCI_IOV && CONFIG PPC_PSERIES*/ void eeh_sysfs_add_device(struct pci_dev *pdev) { @@ -160,22 +160,23 @@ void eeh_sysfs_remove_device(struct pci_dev *pdev) { struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev); + if (!edev) { + WARN_ON(eeh_enabled()); + return; + } + + edev->mode &= ~EEH_DEV_SYSFS; + /* * The parent directory might have been removed. We needn't * continue for that case. */ - if (!pdev->dev.kobj.sd) { - if (edev) - edev->mode &= ~EEH_DEV_SYSFS; + if (!pdev->dev.kobj.sd) return; - } device_remove_file(&pdev->dev, &dev_attr_eeh_mode); device_remove_file(&pdev->dev, &dev_attr_eeh_pe_config_addr); device_remove_file(&pdev->dev, &dev_attr_eeh_pe_state); eeh_notify_resume_remove(pdev); - - if (edev) - edev->mode &= ~EEH_DEV_SYSFS; } diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index d60908ea37fb..7eda33a24bb4 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -18,6 +18,8 @@ #include <linux/err.h> #include <linux/sys.h> #include <linux/threads.h> +#include <linux/linkage.h> + #include <asm/reg.h> #include <asm/page.h> #include <asm/mmu.h> @@ -27,253 +29,37 @@ #include <asm/asm-offsets.h> #include <asm/unistd.h> #include <asm/ptrace.h> -#include <asm/export.h> -#include <asm/asm-405.h> #include <asm/feature-fixups.h> #include <asm/barrier.h> #include <asm/kup.h> #include <asm/bug.h> +#include <asm/interrupt.h> #include "head_32.h" /* + * powerpc relies on return from interrupt/syscall being context synchronising + * (which rfi is) to support ARCH_HAS_MEMBARRIER_SYNC_CORE without additional + * synchronisation instructions. + */ + +/* * Align to 4k in order to ensure that all functions modyfing srr0/srr1 * fit into one page in order to not encounter a TLB miss between the * modification of srr0/srr1 and the associated rfi. */ .align 12 -#ifdef CONFIG_BOOKE - .globl mcheck_transfer_to_handler -mcheck_transfer_to_handler: - mfspr r0,SPRN_DSRR0 - stw r0,_DSRR0(r11) - mfspr r0,SPRN_DSRR1 - stw r0,_DSRR1(r11) - /* fall through */ - - .globl debug_transfer_to_handler -debug_transfer_to_handler: - mfspr r0,SPRN_CSRR0 - stw r0,_CSRR0(r11) - mfspr r0,SPRN_CSRR1 - stw r0,_CSRR1(r11) - /* fall through */ - - .globl crit_transfer_to_handler -crit_transfer_to_handler: -#ifdef CONFIG_PPC_BOOK3E_MMU - mfspr r0,SPRN_MAS0 - stw r0,MAS0(r11) - mfspr r0,SPRN_MAS1 - stw r0,MAS1(r11) - mfspr r0,SPRN_MAS2 - stw r0,MAS2(r11) - mfspr r0,SPRN_MAS3 - stw r0,MAS3(r11) - mfspr r0,SPRN_MAS6 - stw r0,MAS6(r11) -#ifdef CONFIG_PHYS_64BIT - mfspr r0,SPRN_MAS7 - stw r0,MAS7(r11) -#endif /* CONFIG_PHYS_64BIT */ -#endif /* CONFIG_PPC_BOOK3E_MMU */ -#ifdef CONFIG_44x - mfspr r0,SPRN_MMUCR - stw r0,MMUCR(r11) -#endif - mfspr r0,SPRN_SRR0 - stw r0,_SRR0(r11) - mfspr r0,SPRN_SRR1 - stw r0,_SRR1(r11) - - /* set the stack limit to the current stack */ - mfspr r8,SPRN_SPRG_THREAD - lwz r0,KSP_LIMIT(r8) - stw r0,SAVED_KSP_LIMIT(r11) - rlwinm r0,r1,0,0,(31 - THREAD_SHIFT) - stw r0,KSP_LIMIT(r8) - /* fall through */ -#endif - -#ifdef CONFIG_40x - .globl crit_transfer_to_handler -crit_transfer_to_handler: - lwz r0,crit_r10@l(0) - stw r0,GPR10(r11) - lwz r0,crit_r11@l(0) - stw r0,GPR11(r11) - mfspr r0,SPRN_SRR0 - stw r0,crit_srr0@l(0) - mfspr r0,SPRN_SRR1 - stw r0,crit_srr1@l(0) - - /* set the stack limit to the current stack */ - mfspr r8,SPRN_SPRG_THREAD - lwz r0,KSP_LIMIT(r8) - stw r0,saved_ksp_limit@l(0) - rlwinm r0,r1,0,0,(31 - THREAD_SHIFT) - stw r0,KSP_LIMIT(r8) - /* fall through */ -#endif - -/* - * This code finishes saving the registers to the exception frame - * and jumps to the appropriate handler for the exception, turning - * on address translation. - * Note that we rely on the caller having set cr0.eq iff the exception - * occurred in kernel mode (i.e. MSR:PR = 0). - */ - .globl transfer_to_handler_full -transfer_to_handler_full: - SAVE_NVGPRS(r11) - /* fall through */ - - .globl transfer_to_handler -transfer_to_handler: - stw r2,GPR2(r11) - stw r12,_NIP(r11) - stw r9,_MSR(r11) - andi. r2,r9,MSR_PR - mfctr r12 - mfspr r2,SPRN_XER - stw r12,_CTR(r11) - stw r2,_XER(r11) - mfspr r12,SPRN_SPRG_THREAD - beq 2f /* if from user, fix up THREAD.regs */ - addi r2, r12, -THREAD - addi r11,r1,STACK_FRAME_OVERHEAD - stw r11,PT_REGS(r12) -#if defined(CONFIG_40x) || defined(CONFIG_BOOKE) - /* Check to see if the dbcr0 register is set up to debug. Use the - internal debug mode bit to do this. */ - lwz r12,THREAD_DBCR0(r12) - andis. r12,r12,DBCR0_IDM@h -#endif - ACCOUNT_CPU_USER_ENTRY(r2, r11, r12) -#ifdef CONFIG_PPC_BOOK3S_32 - kuep_lock r11, r12 -#endif -#if defined(CONFIG_40x) || defined(CONFIG_BOOKE) - beq+ 3f - /* From user and task is ptraced - load up global dbcr0 */ - li r12,-1 /* clear all pending debug events */ - mtspr SPRN_DBSR,r12 - lis r11,global_dbcr0@ha - tophys(r11,r11) - addi r11,r11,global_dbcr0@l -#ifdef CONFIG_SMP - lwz r9,TASK_CPU(r2) - slwi r9,r9,3 - add r11,r11,r9 -#endif - lwz r12,0(r11) - mtspr SPRN_DBCR0,r12 - lwz r12,4(r11) - addi r12,r12,-1 - stw r12,4(r11) -#endif - - b 3f - -2: /* if from kernel, check interrupted DOZE/NAP mode and - * check for stack overflow - */ - kuap_save_and_lock r11, r12, r9, r2, r0 - addi r2, r12, -THREAD - lwz r9,KSP_LIMIT(r12) - cmplw r1,r9 /* if r1 <= ksp_limit */ - ble- stack_ovf /* then the kernel stack overflowed */ -5: -#if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_E500) +#if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_PPC_E500) + .globl prepare_transfer_to_handler +prepare_transfer_to_handler: + /* if from kernel, check interrupted DOZE/NAP mode */ lwz r12,TI_LOCAL_FLAGS(r2) mtcrf 0x01,r12 bt- 31-TLF_NAPPING,4f bt- 31-TLF_SLEEPING,7f -#endif /* CONFIG_PPC_BOOK3S_32 || CONFIG_E500 */ - .globl transfer_to_handler_cont -transfer_to_handler_cont: -3: - mflr r9 - tovirt(r2, r2) /* set r2 to current */ - lwz r11,0(r9) /* virtual address of handler */ - lwz r9,4(r9) /* where to go when done */ -#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS) - mtspr SPRN_NRI, r0 -#endif -#ifdef CONFIG_TRACE_IRQFLAGS - /* - * When tracing IRQ state (lockdep) we enable the MMU before we call - * the IRQ tracing functions as they might access vmalloc space or - * perform IOs for console output. - * - * To speed up the syscall path where interrupts stay on, let's check - * first if we are changing the MSR value at all. - */ - tophys(r12, r1) - lwz r12,_MSR(r12) - andi. r12,r12,MSR_EE - bne 1f - - /* MSR isn't changing, just transition directly */ -#endif - mtspr SPRN_SRR0,r11 - mtspr SPRN_SRR1,r10 - mtlr r9 - SYNC - RFI /* jump to handler, enable MMU */ - -#ifdef CONFIG_TRACE_IRQFLAGS -1: /* MSR is changing, re-enable MMU so we can notify lockdep. We need to - * keep interrupts disabled at this point otherwise we might risk - * taking an interrupt before we tell lockdep they are enabled. - */ - lis r12,reenable_mmu@h - ori r12,r12,reenable_mmu@l - LOAD_REG_IMMEDIATE(r0, MSR_KERNEL) - mtspr SPRN_SRR0,r12 - mtspr SPRN_SRR1,r0 - SYNC - RFI - -reenable_mmu: - /* - * We save a bunch of GPRs, - * r3 can be different from GPR3(r1) at this point, r9 and r11 - * contains the old MSR and handler address respectively, - * r4 & r5 can contain page fault arguments that need to be passed - * along as well. r12, CCR, CTR, XER etc... are left clobbered as - * they aren't useful past this point (aren't syscall arguments), - * the rest is restored from the exception frame. - */ - - stwu r1,-32(r1) - stw r9,8(r1) - stw r11,12(r1) - stw r3,16(r1) - stw r4,20(r1) - stw r5,24(r1) - - /* If we are disabling interrupts (normal case), simply log it with - * lockdep - */ -1: bl trace_hardirqs_off -2: lwz r5,24(r1) - lwz r4,20(r1) - lwz r3,16(r1) - lwz r11,12(r1) - lwz r9,8(r1) - addi r1,r1,32 - lwz r0,GPR0(r1) - lwz r6,GPR6(r1) - lwz r7,GPR7(r1) - lwz r8,GPR8(r1) - mtctr r11 - mtlr r9 - bctr /* jump to handler */ -#endif /* CONFIG_TRACE_IRQFLAGS */ + blr -#if defined (CONFIG_PPC_BOOK3S_32) || defined(CONFIG_E500) 4: rlwinm r12,r12,0,~_TLF_NAPPING stw r12,TI_LOCAL_FLAGS(r2) b power_save_ppc32_restore @@ -283,481 +69,162 @@ reenable_mmu: lwz r9,_MSR(r11) /* if sleeping, clear MSR.EE */ rlwinm r9,r9,0,~MSR_EE lwz r12,_LINK(r11) /* and return to address in LR */ - kuap_restore r11, r2, r3, r4, r5 + REST_GPR(2, r11) b fast_exception_return -#endif - -/* - * On kernel stack overflow, load up an initial stack pointer - * and call StackOverflow(regs), which should not return. - */ -stack_ovf: - /* sometimes we use a statically-allocated stack, which is OK. */ - lis r12,_end@h - ori r12,r12,_end@l - cmplw r1,r12 - ble 5b /* r1 <= &_end is OK */ - SAVE_NVGPRS(r11) - addi r3,r1,STACK_FRAME_OVERHEAD - lis r1,init_thread_union@ha - addi r1,r1,init_thread_union@l - addi r1,r1,THREAD_SIZE-STACK_FRAME_OVERHEAD - lis r9,StackOverflow@ha - addi r9,r9,StackOverflow@l - LOAD_REG_IMMEDIATE(r10,MSR_KERNEL) -#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS) - mtspr SPRN_NRI, r0 -#endif - mtspr SPRN_SRR0,r9 - mtspr SPRN_SRR1,r10 - SYNC - RFI - -#ifdef CONFIG_TRACE_IRQFLAGS -trace_syscall_entry_irq_off: - /* - * Syscall shouldn't happen while interrupts are disabled, - * so let's do a warning here. - */ -0: trap - EMIT_BUG_ENTRY 0b,__FILE__,__LINE__, BUGFLAG_WARNING - bl trace_hardirqs_on +_ASM_NOKPROBE_SYMBOL(prepare_transfer_to_handler) +#endif /* CONFIG_PPC_BOOK3S_32 || CONFIG_PPC_E500 */ - /* Now enable for real */ - LOAD_REG_IMMEDIATE(r10, MSR_KERNEL | MSR_EE) - mtmsr r10 +#if defined(CONFIG_PPC_KUEP) && defined(CONFIG_PPC_BOOK3S_32) +SYM_FUNC_START(__kuep_lock) + lwz r9, THREAD+THSR0(r2) + update_user_segments_by_4 r9, r10, r11, r12 + blr +SYM_FUNC_END(__kuep_lock) - REST_GPR(0, r1) - REST_4GPRS(3, r1) - REST_2GPRS(7, r1) - b DoSyscall -#endif /* CONFIG_TRACE_IRQFLAGS */ +SYM_FUNC_START_LOCAL(__kuep_unlock) + lwz r9, THREAD+THSR0(r2) + rlwinm r9,r9,0,~SR_NX + update_user_segments_by_4 r9, r10, r11, r12 + blr +SYM_FUNC_END(__kuep_unlock) + +.macro kuep_lock + bl __kuep_lock +.endm +.macro kuep_unlock + bl __kuep_unlock +.endm +#else +.macro kuep_lock +.endm +.macro kuep_unlock +.endm +#endif .globl transfer_to_syscall transfer_to_syscall: -#ifdef CONFIG_TRACE_IRQFLAGS - andi. r12,r9,MSR_EE - beq- trace_syscall_entry_irq_off -#endif /* CONFIG_TRACE_IRQFLAGS */ - -/* - * Handle a system call. - */ - .stabs "arch/powerpc/kernel/",N_SO,0,0,0f - .stabs "entry_32.S",N_SO,0,0,0f -0: - -_GLOBAL(DoSyscall) - stw r3,ORIG_GPR3(r1) - li r12,0 - stw r12,RESULT(r1) -#ifdef CONFIG_TRACE_IRQFLAGS - /* Make sure interrupts are enabled */ - mfmsr r11 - andi. r12,r11,MSR_EE - /* We came in with interrupts disabled, we WARN and mark them enabled - * for lockdep now */ -0: tweqi r12, 0 - EMIT_BUG_ENTRY 0b,__FILE__,__LINE__, BUGFLAG_WARNING -#endif /* CONFIG_TRACE_IRQFLAGS */ - lwz r11,TI_FLAGS(r2) - andi. r11,r11,_TIF_SYSCALL_DOTRACE - bne- syscall_dotrace -syscall_dotrace_cont: - cmplwi 0,r0,NR_syscalls - lis r10,sys_call_table@h - ori r10,r10,sys_call_table@l - slwi r0,r0,2 - bge- 66f + stw r3, ORIG_GPR3(r1) + stw r11, GPR1(r1) + stw r11, 0(r1) + mflr r12 + stw r12, _LINK(r1) +#ifdef CONFIG_BOOKE_OR_40x + rlwinm r9,r9,0,14,12 /* clear MSR_WE (necessary?) */ +#endif + lis r12,STACK_FRAME_REGS_MARKER@ha /* exception frame marker */ + SAVE_GPR(2, r1) + addi r12,r12,STACK_FRAME_REGS_MARKER@l + stw r9,_MSR(r1) + li r2, INTERRUPT_SYSCALL + stw r12,STACK_INT_FRAME_MARKER(r1) + stw r2,_TRAP(r1) + SAVE_GPR(0, r1) + SAVE_GPRS(3, 8, r1) + addi r2,r10,-THREAD + SAVE_NVGPRS(r1) + kuep_lock - barrier_nospec_asm - /* - * Prevent the load of the handler below (based on the user-passed - * system call number) being speculatively executed until the test - * against NR_syscalls and branch to .66f above has - * committed. - */ + /* Calling convention has r3 = regs, r4 = orig r0 */ + addi r3,r1,STACK_INT_FRAME_REGS + mr r4,r0 + bl system_call_exception - lwzx r10,r10,r0 /* Fetch system call handler [ptr] */ - mtlr r10 - addi r9,r1,STACK_FRAME_OVERHEAD - PPC440EP_ERR42 - blrl /* Call handler */ - .globl ret_from_syscall ret_from_syscall: -#ifdef CONFIG_DEBUG_RSEQ - /* Check whether the syscall is issued inside a restartable sequence */ - stw r3,GPR3(r1) - addi r3,r1,STACK_FRAME_OVERHEAD - bl rseq_syscall - lwz r3,GPR3(r1) -#endif - mr r6,r3 - /* disable interrupts so current_thread_info()->flags can't change */ - LOAD_REG_IMMEDIATE(r10,MSR_KERNEL) /* doesn't include MSR_EE */ - /* Note: We don't bother telling lockdep about it */ - SYNC - MTMSRD(r10) - lwz r9,TI_FLAGS(r2) - li r8,-MAX_ERRNO - andi. r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP|_TIF_USER_WORK_MASK|_TIF_PERSYSCALL_MASK) - bne- syscall_exit_work - cmplw 0,r3,r8 - blt+ syscall_exit_cont - lwz r11,_CCR(r1) /* Load CR */ - neg r3,r3 - oris r11,r11,0x1000 /* Set SO bit in CR */ - stw r11,_CCR(r1) -syscall_exit_cont: - lwz r8,_MSR(r1) -#ifdef CONFIG_TRACE_IRQFLAGS - /* If we are going to return from the syscall with interrupts - * off, we trace that here. It shouldn't normally happen. - */ - andi. r10,r8,MSR_EE - bne+ 1f - stw r3,GPR3(r1) - bl trace_hardirqs_off - lwz r3,GPR3(r1) -1: -#endif /* CONFIG_TRACE_IRQFLAGS */ -#if defined(CONFIG_4xx) || defined(CONFIG_BOOKE) - /* If the process has its own DBCR0 value, load it up. The internal - debug mode bit tells us that dbcr0 should be loaded. */ - lwz r0,THREAD+THREAD_DBCR0(r2) - andis. r10,r0,DBCR0_IDM@h - bnel- load_dbcr0 -#endif -#ifdef CONFIG_44x -BEGIN_MMU_FTR_SECTION + addi r4,r1,STACK_INT_FRAME_REGS + li r5,0 + bl syscall_exit_prepare +#ifdef CONFIG_PPC_47x lis r4,icache_44x_need_flush@ha lwz r5,icache_44x_need_flush@l(r4) cmplwi cr0,r5,0 - bne- 2f -1: -END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_47x) -#endif /* CONFIG_44x */ -BEGIN_FTR_SECTION - lwarx r7,0,r1 -END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX) - stwcx. r0,0,r1 /* to clear the reservation */ - ACCOUNT_CPU_USER_EXIT(r2, r5, r7) -#ifdef CONFIG_PPC_BOOK3S_32 - kuep_unlock r5, r7 -#endif - kuap_check r2, r4 + bne- .L44x_icache_flush +#endif /* CONFIG_PPC_47x */ +.L44x_icache_flush_return: + kuep_unlock lwz r4,_LINK(r1) lwz r5,_CCR(r1) mtlr r4 - mtcr r5 lwz r7,_NIP(r1) - lwz r2,GPR2(r1) - lwz r1,GPR1(r1) -#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS) - mtspr SPRN_NRI, r0 -#endif + lwz r8,_MSR(r1) + cmpwi r3,0 + REST_GPR(3, r1) +syscall_exit_finish: mtspr SPRN_SRR0,r7 mtspr SPRN_SRR1,r8 - SYNC - RFI + + bne 3f + mtcr r5 + +1: REST_GPR(2, r1) + REST_GPR(1, r1) + rfi +#ifdef CONFIG_40x + b . /* Prevent prefetch past rfi */ +#endif + +3: mtcr r5 + lwz r4,_CTR(r1) + lwz r5,_XER(r1) + REST_NVGPRS(r1) + mtctr r4 + mtxer r5 + REST_GPR(0, r1) + REST_GPRS(3, 12, r1) + b 1b + #ifdef CONFIG_44x -2: li r7,0 +.L44x_icache_flush: + li r7,0 iccci r0,r0 stw r7,icache_44x_need_flush@l(r4) - b 1b + b .L44x_icache_flush_return #endif /* CONFIG_44x */ -66: li r3,-ENOSYS - b ret_from_syscall - .globl ret_from_fork ret_from_fork: REST_NVGPRS(r1) bl schedule_tail - li r3,0 + li r3,0 /* fork() return value */ b ret_from_syscall - .globl ret_from_kernel_thread -ret_from_kernel_thread: - REST_NVGPRS(r1) + .globl ret_from_kernel_user_thread +ret_from_kernel_user_thread: bl schedule_tail - mtlr r14 + mtctr r14 mr r3,r15 PPC440EP_ERR42 - blrl + bctrl li r3,0 b ret_from_syscall -/* Traced system call support */ -syscall_dotrace: - SAVE_NVGPRS(r1) - li r0,0xc00 - stw r0,_TRAP(r1) - addi r3,r1,STACK_FRAME_OVERHEAD - bl do_syscall_trace_enter + .globl start_kernel_thread +start_kernel_thread: + bl schedule_tail + mtctr r14 + mr r3,r15 + PPC440EP_ERR42 + bctrl /* - * Restore argument registers possibly just changed. - * We use the return value of do_syscall_trace_enter - * for call number to look up in the table (r0). + * This must not return. We actually want to BUG here, not WARN, + * because BUG will exit the process which is what the kernel thread + * should have done, which may give some hope of continuing. */ - mr r0,r3 - lwz r3,GPR3(r1) - lwz r4,GPR4(r1) - lwz r5,GPR5(r1) - lwz r6,GPR6(r1) - lwz r7,GPR7(r1) - lwz r8,GPR8(r1) - REST_NVGPRS(r1) - - cmplwi r0,NR_syscalls - /* Return code is already in r3 thanks to do_syscall_trace_enter() */ - bge- ret_from_syscall - b syscall_dotrace_cont - -syscall_exit_work: - andi. r0,r9,_TIF_RESTOREALL - beq+ 0f - REST_NVGPRS(r1) - b 2f -0: cmplw 0,r3,r8 - blt+ 1f - andi. r0,r9,_TIF_NOERROR - bne- 1f - lwz r11,_CCR(r1) /* Load CR */ - neg r3,r3 - oris r11,r11,0x1000 /* Set SO bit in CR */ - stw r11,_CCR(r1) - -1: stw r6,RESULT(r1) /* Save result */ - stw r3,GPR3(r1) /* Update return value */ -2: andi. r0,r9,(_TIF_PERSYSCALL_MASK) - beq 4f - - /* Clear per-syscall TIF flags if any are set. */ - - li r11,_TIF_PERSYSCALL_MASK - addi r12,r2,TI_FLAGS -3: lwarx r8,0,r12 - andc r8,r8,r11 -#ifdef CONFIG_IBM405_ERR77 - dcbt 0,r12 -#endif - stwcx. r8,0,r12 - bne- 3b - -4: /* Anything which requires enabling interrupts? */ - andi. r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP) - beq ret_from_except - - /* Re-enable interrupts. There is no need to trace that with - * lockdep as we are supposed to have IRQs on at this point - */ - ori r10,r10,MSR_EE - SYNC - MTMSRD(r10) - - /* Save NVGPRS if they're not saved already */ - lwz r4,_TRAP(r1) - andi. r4,r4,1 - beq 5f - SAVE_NVGPRS(r1) - li r4,0xc00 - stw r4,_TRAP(r1) -5: - addi r3,r1,STACK_FRAME_OVERHEAD - bl do_syscall_trace_leave - b ret_from_except_full - -/* - * The fork/clone functions need to copy the full register set into - * the child process. Therefore we need to save all the nonvolatile - * registers (r13 - r31) before calling the C code. - */ - .globl ppc_fork -ppc_fork: - SAVE_NVGPRS(r1) - lwz r0,_TRAP(r1) - rlwinm r0,r0,0,0,30 /* clear LSB to indicate full */ - stw r0,_TRAP(r1) /* register set saved */ - b sys_fork - - .globl ppc_vfork -ppc_vfork: - SAVE_NVGPRS(r1) - lwz r0,_TRAP(r1) - rlwinm r0,r0,0,0,30 /* clear LSB to indicate full */ - stw r0,_TRAP(r1) /* register set saved */ - b sys_vfork - - .globl ppc_clone -ppc_clone: - SAVE_NVGPRS(r1) - lwz r0,_TRAP(r1) - rlwinm r0,r0,0,0,30 /* clear LSB to indicate full */ - stw r0,_TRAP(r1) /* register set saved */ - b sys_clone - - .globl ppc_clone3 -ppc_clone3: - SAVE_NVGPRS(r1) - lwz r0,_TRAP(r1) - rlwinm r0,r0,0,0,30 /* clear LSB to indicate full */ - stw r0,_TRAP(r1) /* register set saved */ - b sys_clone3 - - .globl ppc_swapcontext -ppc_swapcontext: - SAVE_NVGPRS(r1) - lwz r0,_TRAP(r1) - rlwinm r0,r0,0,0,30 /* clear LSB to indicate full */ - stw r0,_TRAP(r1) /* register set saved */ - b sys_swapcontext - -/* - * Top-level page fault handling. - * This is in assembler because if do_page_fault tells us that - * it is a bad kernel page fault, we want to save the non-volatile - * registers before calling bad_page_fault. - */ - .globl handle_page_fault -handle_page_fault: - stw r4,_DAR(r1) - addi r3,r1,STACK_FRAME_OVERHEAD -#ifdef CONFIG_PPC_BOOK3S_32 - andis. r0,r5,DSISR_DABRMATCH@h - bne- handle_dabr_fault -#endif - bl do_page_fault - cmpwi r3,0 - beq+ ret_from_except - SAVE_NVGPRS(r1) - lwz r0,_TRAP(r1) - clrrwi r0,r0,1 - stw r0,_TRAP(r1) - mr r5,r3 - addi r3,r1,STACK_FRAME_OVERHEAD - lwz r4,_DAR(r1) - bl bad_page_fault - b ret_from_except_full - -#ifdef CONFIG_PPC_BOOK3S_32 - /* We have a data breakpoint exception - handle it */ -handle_dabr_fault: - SAVE_NVGPRS(r1) - lwz r0,_TRAP(r1) - clrrwi r0,r0,1 - stw r0,_TRAP(r1) - bl do_break - b ret_from_except_full -#endif - -/* - * This routine switches between two different tasks. The process - * state of one is saved on its kernel stack. Then the state - * of the other is restored from its kernel stack. The memory - * management hardware is updated to the second process's state. - * Finally, we can return to the second process. - * On entry, r3 points to the THREAD for the current task, r4 - * points to the THREAD for the new task. - * - * This routine is always called with interrupts disabled. - * - * Note: there are two ways to get to the "going out" portion - * of this code; either by coming in via the entry (_switch) - * or via "fork" which must set up an environment equivalent - * to the "_switch" path. If you change this , you'll have to - * change the fork code also. - * - * The code which creates the new task context is in 'copy_thread' - * in arch/ppc/kernel/process.c - */ -_GLOBAL(_switch) - stwu r1,-INT_FRAME_SIZE(r1) - mflr r0 - stw r0,INT_FRAME_SIZE+4(r1) - /* r3-r12 are caller saved -- Cort */ - SAVE_NVGPRS(r1) - stw r0,_NIP(r1) /* Return to switch caller */ - mfmsr r11 - li r0,MSR_FP /* Disable floating-point */ -#ifdef CONFIG_ALTIVEC -BEGIN_FTR_SECTION - oris r0,r0,MSR_VEC@h /* Disable altivec */ - mfspr r12,SPRN_VRSAVE /* save vrsave register value */ - stw r12,THREAD+THREAD_VRSAVE(r2) -END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) -#endif /* CONFIG_ALTIVEC */ -#ifdef CONFIG_SPE -BEGIN_FTR_SECTION - oris r0,r0,MSR_SPE@h /* Disable SPE */ - mfspr r12,SPRN_SPEFSCR /* save spefscr register value */ - stw r12,THREAD+THREAD_SPEFSCR(r2) -END_FTR_SECTION_IFSET(CPU_FTR_SPE) -#endif /* CONFIG_SPE */ - and. r0,r0,r11 /* FP or altivec or SPE enabled? */ - beq+ 1f - andc r11,r11,r0 - MTMSRD(r11) - isync -1: stw r11,_MSR(r1) - mfcr r10 - stw r10,_CCR(r1) - stw r1,KSP(r3) /* Set old stack pointer */ - - kuap_check r2, r4 -#ifdef CONFIG_SMP - /* We need a sync somewhere here to make sure that if the - * previous task gets rescheduled on another CPU, it sees all - * stores it has performed on this one. - */ - sync -#endif /* CONFIG_SMP */ - - tophys(r0,r4) - mtspr SPRN_SPRG_THREAD,r0 /* Update current THREAD phys addr */ - lwz r1,KSP(r4) /* Load new stack pointer */ - - /* save the old current 'last' for return value */ - mr r3,r2 - addi r2,r4,-THREAD /* Update current */ - -#ifdef CONFIG_ALTIVEC -BEGIN_FTR_SECTION - lwz r0,THREAD+THREAD_VRSAVE(r2) - mtspr SPRN_VRSAVE,r0 /* if G4, restore VRSAVE reg */ -END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) -#endif /* CONFIG_ALTIVEC */ -#ifdef CONFIG_SPE -BEGIN_FTR_SECTION - lwz r0,THREAD+THREAD_SPEFSCR(r2) - mtspr SPRN_SPEFSCR,r0 /* restore SPEFSCR reg */ -END_FTR_SECTION_IFSET(CPU_FTR_SPE) -#endif /* CONFIG_SPE */ - - lwz r0,_CCR(r1) - mtcrf 0xFF,r0 - /* r3-r12 are destroyed -- Cort */ - REST_NVGPRS(r1) - - lwz r4,_NIP(r1) /* Return to _switch caller in new task */ - mtlr r4 - addi r1,r1,INT_FRAME_SIZE - blr +100: trap + EMIT_BUG_ENTRY 100b,__FILE__,__LINE__,0 .globl fast_exception_return fast_exception_return: #if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE)) andi. r10,r9,MSR_RI /* check for recoverable interrupt */ - beq 1f /* if not, we've got problems */ + beq 3f /* if not, we've got problems */ #endif -2: REST_4GPRS(3, r11) - lwz r10,_CCR(r11) - REST_GPR(1, r11) +2: lwz r10,_CCR(r11) + REST_GPRS(1, 6, r11) mtcr r10 lwz r10,_LINK(r11) mtlr r10 - /* Clear the exception_marker on the stack to avoid confusing stacktrace */ + /* Clear the exception marker on the stack to avoid confusing stacktrace */ li r10, 0 stw r10, 8(r11) REST_GPR(10, r11) @@ -768,279 +235,152 @@ fast_exception_return: mtspr SPRN_SRR0,r12 REST_GPR(9, r11) REST_GPR(12, r11) - lwz r11,GPR11(r11) - SYNC - RFI - -#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE)) -/* check if the exception happened in a restartable section */ -1: lis r3,exc_exit_restart_end@ha - addi r3,r3,exc_exit_restart_end@l - cmplw r12,r3 -#if CONFIG_PPC_BOOK3S_601 - bge 2b -#else - bge 3f -#endif - lis r4,exc_exit_restart@ha - addi r4,r4,exc_exit_restart@l - cmplw r12,r4 -#if CONFIG_PPC_BOOK3S_601 - blt 2b -#else - blt 3f + REST_GPR(11, r11) + rfi +#ifdef CONFIG_40x + b . /* Prevent prefetch past rfi */ #endif - lis r3,fee_restarts@ha - tophys(r3,r3) - lwz r5,fee_restarts@l(r3) - addi r5,r5,1 - stw r5,fee_restarts@l(r3) - mr r12,r4 /* restart at exc_exit_restart */ - b 2b - - .section .bss - .align 2 -fee_restarts: - .space 4 - .previous +_ASM_NOKPROBE_SYMBOL(fast_exception_return) /* aargh, a nonrecoverable interrupt, panic */ /* aargh, we don't know which trap this is */ -/* but the 601 doesn't implement the RI bit, so assume it's OK */ 3: li r10,-1 stw r10,_TRAP(r11) - addi r3,r1,STACK_FRAME_OVERHEAD - lis r10,MSR_KERNEL@h - ori r10,r10,MSR_KERNEL@l - bl transfer_to_handler_full - .long unrecoverable_exception - .long ret_from_except -#endif - - .globl ret_from_except_full -ret_from_except_full: - REST_NVGPRS(r1) - /* fall through */ - - .globl ret_from_except -ret_from_except: - /* Hard-disable interrupts so that current_thread_info()->flags - * can't change between when we test it and when we return - * from the interrupt. */ - /* Note: We don't bother telling lockdep about it */ - LOAD_REG_IMMEDIATE(r10,MSR_KERNEL) - SYNC /* Some chip revs have problems here... */ - MTMSRD(r10) /* disable interrupts */ - - lwz r3,_MSR(r1) /* Returning to user mode? */ - andi. r0,r3,MSR_PR - beq resume_kernel - -user_exc_return: /* r10 contains MSR_KERNEL here */ - /* Check current_thread_info()->flags */ - lwz r9,TI_FLAGS(r2) - andi. r0,r9,_TIF_USER_WORK_MASK - bne do_work - -restore_user: -#if defined(CONFIG_4xx) || defined(CONFIG_BOOKE) - /* Check whether this process has its own DBCR0 value. The internal - debug mode bit tells us that dbcr0 should be loaded. */ - lwz r0,THREAD+THREAD_DBCR0(r2) - andis. r10,r0,DBCR0_IDM@h - bnel- load_dbcr0 -#endif - ACCOUNT_CPU_USER_EXIT(r2, r10, r11) -#ifdef CONFIG_PPC_BOOK3S_32 - kuep_unlock r10, r11 -#endif + prepare_transfer_to_handler + bl unrecoverable_exception + trap /* should not get here */ + + .globl interrupt_return +interrupt_return: + lwz r4,_MSR(r1) + addi r3,r1,STACK_INT_FRAME_REGS + andi. r0,r4,MSR_PR + beq .Lkernel_interrupt_return + bl interrupt_exit_user_prepare + cmpwi r3,0 + kuep_unlock + bne- .Lrestore_nvgprs - b restore +.Lfast_user_interrupt_return: + lwz r11,_NIP(r1) + lwz r12,_MSR(r1) + mtspr SPRN_SRR0,r11 + mtspr SPRN_SRR1,r12 -/* N.B. the only way to get here is from the beq following ret_from_except. */ -resume_kernel: - /* check current_thread_info, _TIF_EMULATE_STACK_STORE */ - lwz r8,TI_FLAGS(r2) - andis. r0,r8,_TIF_EMULATE_STACK_STORE@h - beq+ 1f +BEGIN_FTR_SECTION + stwcx. r0,0,r1 /* to clear the reservation */ +FTR_SECTION_ELSE + lwarx r0,0,r1 +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS) - addi r8,r1,INT_FRAME_SIZE /* Get the kprobed function entry */ + lwz r3,_CCR(r1) + lwz r4,_LINK(r1) + lwz r5,_CTR(r1) + lwz r6,_XER(r1) + li r0,0 - lwz r3,GPR1(r1) - subi r3,r3,INT_FRAME_SIZE /* dst: Allocate a trampoline exception frame */ - mr r4,r1 /* src: current exception frame */ - mr r1,r3 /* Reroute the trampoline frame to r1 */ + /* + * Leaving a stale exception marker on the stack can confuse + * the reliable stack unwinder later on. Clear it. + */ + stw r0,8(r1) + REST_GPRS(7, 12, r1) - /* Copy from the original to the trampoline. */ - li r5,INT_FRAME_SIZE/4 /* size: INT_FRAME_SIZE */ - li r6,0 /* start offset: 0 */ + mtcr r3 + mtlr r4 mtctr r5 -2: lwzx r0,r6,r4 - stwx r0,r6,r3 - addi r6,r6,4 - bdnz 2b - - /* Do real store operation to complete stwu */ - lwz r5,GPR1(r1) - stw r8,0(r5) + mtspr SPRN_XER,r6 - /* Clear _TIF_EMULATE_STACK_STORE flag */ - lis r11,_TIF_EMULATE_STACK_STORE@h - addi r5,r2,TI_FLAGS -0: lwarx r8,0,r5 - andc r8,r8,r11 -#ifdef CONFIG_IBM405_ERR77 - dcbt 0,r5 -#endif - stwcx. r8,0,r5 - bne- 0b -1: - -#ifdef CONFIG_PREEMPT - /* check current_thread_info->preempt_count */ - lwz r0,TI_PREEMPT(r2) - cmpwi 0,r0,0 /* if non-zero, just restore regs and return */ - bne restore_kuap - andi. r8,r8,_TIF_NEED_RESCHED - beq+ restore_kuap - lwz r3,_MSR(r1) - andi. r0,r3,MSR_EE /* interrupts off? */ - beq restore_kuap /* don't schedule if so */ -#ifdef CONFIG_TRACE_IRQFLAGS - /* Lockdep thinks irqs are enabled, we need to call - * preempt_schedule_irq with IRQs off, so we inform lockdep - * now that we -did- turn them off already - */ - bl trace_hardirqs_off -#endif - bl preempt_schedule_irq -#ifdef CONFIG_TRACE_IRQFLAGS - /* And now, to properly rebalance the above, we tell lockdep they - * are being turned back on, which will happen when we return - */ - bl trace_hardirqs_on + REST_GPRS(2, 6, r1) + REST_GPR(0, r1) + REST_GPR(1, r1) + rfi +#ifdef CONFIG_40x + b . /* Prevent prefetch past rfi */ #endif -#endif /* CONFIG_PREEMPT */ -restore_kuap: - kuap_restore r1, r2, r9, r10, r0 - - /* interrupts are hard-disabled at this point */ -restore: -#ifdef CONFIG_44x -BEGIN_MMU_FTR_SECTION - b 1f -END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_47x) - lis r4,icache_44x_need_flush@ha - lwz r5,icache_44x_need_flush@l(r4) - cmplwi cr0,r5,0 - beq+ 1f - li r6,0 - iccci r0,r0 - stw r6,icache_44x_need_flush@l(r4) -1: -#endif /* CONFIG_44x */ - lwz r9,_MSR(r1) -#ifdef CONFIG_TRACE_IRQFLAGS - /* Lockdep doesn't know about the fact that IRQs are temporarily turned - * off in this assembly code while peeking at TI_FLAGS() and such. However - * we need to inform it if the exception turned interrupts off, and we - * are about to trun them back on. - */ - andi. r10,r9,MSR_EE - beq 1f - stwu r1,-32(r1) - mflr r0 - stw r0,4(r1) - bl trace_hardirqs_on - addi r1, r1, 32 - lwz r9,_MSR(r1) -1: -#endif /* CONFIG_TRACE_IRQFLAGS */ +.Lrestore_nvgprs: + REST_NVGPRS(r1) + b .Lfast_user_interrupt_return - lwz r0,GPR0(r1) - lwz r2,GPR2(r1) - REST_4GPRS(3, r1) - REST_2GPRS(7, r1) +.Lkernel_interrupt_return: + bl interrupt_exit_kernel_prepare - lwz r10,_XER(r1) - lwz r11,_CTR(r1) - mtspr SPRN_XER,r10 - mtctr r11 +.Lfast_kernel_interrupt_return: + cmpwi cr1,r3,0 + lwz r11,_NIP(r1) + lwz r12,_MSR(r1) + mtspr SPRN_SRR0,r11 + mtspr SPRN_SRR1,r12 - PPC405_ERR77(0,r1) BEGIN_FTR_SECTION - lwarx r11,0,r1 -END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX) - stwcx. r0,0,r1 /* to clear the reservation */ + stwcx. r0,0,r1 /* to clear the reservation */ +FTR_SECTION_ELSE + lwarx r0,0,r1 +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS) + + lwz r3,_LINK(r1) + lwz r4,_CTR(r1) + lwz r5,_XER(r1) + lwz r6,_CCR(r1) + li r0,0 -#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE)) - andi. r10,r9,MSR_RI /* check if this exception occurred */ - beql nonrecoverable /* at a bad place (MSR:RI = 0) */ + REST_GPRS(7, 12, r1) - lwz r10,_CCR(r1) - lwz r11,_LINK(r1) - mtcrf 0xFF,r10 - mtlr r11 + mtlr r3 + mtctr r4 + mtspr SPRN_XER,r5 - /* Clear the exception_marker on the stack to avoid confusing stacktrace */ - li r10, 0 - stw r10, 8(r1) /* - * Once we put values in SRR0 and SRR1, we are in a state - * where exceptions are not recoverable, since taking an - * exception will trash SRR0 and SRR1. Therefore we clear the - * MSR:RI bit to indicate this. If we do take an exception, - * we can't return to the point of the exception but we - * can restart the exception exit path at the label - * exc_exit_restart below. -- paulus + * Leaving a stale exception marker on the stack can confuse + * the reliable stack unwinder later on. Clear it. */ - LOAD_REG_IMMEDIATE(r10,MSR_KERNEL & ~MSR_RI) - SYNC - MTMSRD(r10) /* clear the RI bit */ - .globl exc_exit_restart -exc_exit_restart: - lwz r12,_NIP(r1) - mtspr SPRN_SRR0,r12 - mtspr SPRN_SRR1,r9 - REST_4GPRS(9, r1) - lwz r1,GPR1(r1) - .globl exc_exit_restart_end -exc_exit_restart_end: - SYNC - RFI + stw r0,8(r1) -#else /* !(CONFIG_4xx || CONFIG_BOOKE) */ - /* - * This is a bit different on 4xx/Book-E because it doesn't have - * the RI bit in the MSR. - * The TLB miss handler checks if we have interrupted - * the exception exit path and restarts it if so - * (well maybe one day it will... :). + REST_GPRS(2, 5, r1) + + bne- cr1,1f /* emulate stack store */ + mtcr r6 + REST_GPR(6, r1) + REST_GPR(0, r1) + REST_GPR(1, r1) + rfi +#ifdef CONFIG_40x + b . /* Prevent prefetch past rfi */ +#endif + +1: /* + * Emulate stack store with update. New r1 value was already calculated + * and updated in our interrupt regs by emulate_loadstore, but we can't + * store the previous value of r1 to the stack before re-loading our + * registers from it, otherwise they could be clobbered. Use + * SPRG Scratch0 as temporary storage to hold the store + * data, as interrupts are disabled here so it won't be clobbered. */ - lwz r11,_LINK(r1) - mtlr r11 - lwz r10,_CCR(r1) - mtcrf 0xff,r10 - /* Clear the exception_marker on the stack to avoid confusing stacktrace */ - li r10, 0 - stw r10, 8(r1) - REST_2GPRS(9, r1) - .globl exc_exit_restart -exc_exit_restart: - lwz r11,_NIP(r1) - lwz r12,_MSR(r1) -exc_exit_start: - mtspr SPRN_SRR0,r11 - mtspr SPRN_SRR1,r12 - REST_2GPRS(11, r1) - lwz r1,GPR1(r1) - .globl exc_exit_restart_end -exc_exit_restart_end: - PPC405_ERR77_SYNC + mtcr r6 +#ifdef CONFIG_BOOKE + mtspr SPRN_SPRG_WSCRATCH0, r9 +#else + mtspr SPRN_SPRG_SCRATCH0, r9 +#endif + addi r9,r1,INT_FRAME_SIZE /* get original r1 */ + REST_GPR(6, r1) + REST_GPR(0, r1) + REST_GPR(1, r1) + stw r9,0(r1) /* perform store component of stwu */ +#ifdef CONFIG_BOOKE + mfspr r9, SPRN_SPRG_RSCRATCH0 +#else + mfspr r9, SPRN_SPRG_SCRATCH0 +#endif rfi - b . /* prevent prefetch past rfi */ +#ifdef CONFIG_40x + b . /* Prevent prefetch past rfi */ +#endif +_ASM_NOKPROBE_SYMBOL(interrupt_return) + +#if defined(CONFIG_4xx) || defined(CONFIG_BOOKE) /* * Returning from a critical interrupt in user mode doesn't need @@ -1071,17 +411,13 @@ exc_exit_restart_end: REST_NVGPRS(r1); \ lwz r3,_MSR(r1); \ andi. r3,r3,MSR_PR; \ - LOAD_REG_IMMEDIATE(r10,MSR_KERNEL); \ - bne user_exc_return; \ - lwz r0,GPR0(r1); \ - lwz r2,GPR2(r1); \ - REST_4GPRS(3, r1); \ - REST_2GPRS(7, r1); \ + bne interrupt_return; \ + REST_GPR(0, r1); \ + REST_GPRS(2, 8, r1); \ lwz r10,_XER(r1); \ lwz r11,_CTR(r1); \ mtspr SPRN_XER,r10; \ mtctr r11; \ - PPC405_ERR77(0,r1); \ stwcx. r0,0,r1; /* to clear the reservation */ \ lwz r11,_LINK(r1); \ mtlr r11; \ @@ -1096,12 +432,8 @@ exc_exit_restart_end: lwz r12,_MSR(r1); \ mtspr exc_lvl_srr0,r11; \ mtspr exc_lvl_srr1,r12; \ - lwz r9,GPR9(r1); \ - lwz r12,GPR12(r1); \ - lwz r10,GPR10(r1); \ - lwz r11,GPR11(r1); \ - lwz r1,GPR1(r1); \ - PPC405_ERR77_SYNC; \ + REST_GPRS(9, 12, r1); \ + REST_GPR(1, r1); \ exc_lvl_rfi; \ b .; /* prevent prefetch past exc_lvl_rfi */ @@ -1111,7 +443,7 @@ exc_exit_restart_end: mtspr SPRN_##exc_lvl_srr0,r9; \ mtspr SPRN_##exc_lvl_srr1,r10; -#if defined(CONFIG_PPC_BOOK3E_MMU) +#if defined(CONFIG_PPC_E500) #ifdef CONFIG_PHYS_64BIT #define RESTORE_MAS7 \ lwz r11,MAS7(r1); \ @@ -1142,11 +474,6 @@ exc_exit_restart_end: #ifdef CONFIG_40x .globl ret_from_crit_exc ret_from_crit_exc: - mfspr r9,SPRN_SPRG_THREAD - lis r10,saved_ksp_limit@ha; - lwz r10,saved_ksp_limit@l(r10); - tovirt(r9,r9); - stw r10,KSP_LIMIT(r9) lis r9,crit_srr0@ha; lwz r9,crit_srr0@l(r9); lis r10,crit_srr1@ha; @@ -1154,214 +481,32 @@ ret_from_crit_exc: mtspr SPRN_SRR0,r9; mtspr SPRN_SRR1,r10; RET_FROM_EXC_LEVEL(SPRN_CSRR0, SPRN_CSRR1, PPC_RFCI) +_ASM_NOKPROBE_SYMBOL(ret_from_crit_exc) #endif /* CONFIG_40x */ #ifdef CONFIG_BOOKE .globl ret_from_crit_exc ret_from_crit_exc: - mfspr r9,SPRN_SPRG_THREAD - lwz r10,SAVED_KSP_LIMIT(r1) - stw r10,KSP_LIMIT(r9) RESTORE_xSRR(SRR0,SRR1); RESTORE_MMU_REGS; RET_FROM_EXC_LEVEL(SPRN_CSRR0, SPRN_CSRR1, PPC_RFCI) +_ASM_NOKPROBE_SYMBOL(ret_from_crit_exc) .globl ret_from_debug_exc ret_from_debug_exc: - mfspr r9,SPRN_SPRG_THREAD - lwz r10,SAVED_KSP_LIMIT(r1) - stw r10,KSP_LIMIT(r9) RESTORE_xSRR(SRR0,SRR1); RESTORE_xSRR(CSRR0,CSRR1); RESTORE_MMU_REGS; RET_FROM_EXC_LEVEL(SPRN_DSRR0, SPRN_DSRR1, PPC_RFDI) +_ASM_NOKPROBE_SYMBOL(ret_from_debug_exc) .globl ret_from_mcheck_exc ret_from_mcheck_exc: - mfspr r9,SPRN_SPRG_THREAD - lwz r10,SAVED_KSP_LIMIT(r1) - stw r10,KSP_LIMIT(r9) RESTORE_xSRR(SRR0,SRR1); RESTORE_xSRR(CSRR0,CSRR1); RESTORE_xSRR(DSRR0,DSRR1); RESTORE_MMU_REGS; RET_FROM_EXC_LEVEL(SPRN_MCSRR0, SPRN_MCSRR1, PPC_RFMCI) +_ASM_NOKPROBE_SYMBOL(ret_from_mcheck_exc) #endif /* CONFIG_BOOKE */ - -/* - * Load the DBCR0 value for a task that is being ptraced, - * having first saved away the global DBCR0. Note that r0 - * has the dbcr0 value to set upon entry to this. - */ -load_dbcr0: - mfmsr r10 /* first disable debug exceptions */ - rlwinm r10,r10,0,~MSR_DE - mtmsr r10 - isync - mfspr r10,SPRN_DBCR0 - lis r11,global_dbcr0@ha - addi r11,r11,global_dbcr0@l -#ifdef CONFIG_SMP - lwz r9,TASK_CPU(r2) - slwi r9,r9,3 - add r11,r11,r9 -#endif - stw r10,0(r11) - mtspr SPRN_DBCR0,r0 - lwz r10,4(r11) - addi r10,r10,1 - stw r10,4(r11) - li r11,-1 - mtspr SPRN_DBSR,r11 /* clear all pending debug events */ - blr - - .section .bss - .align 4 - .global global_dbcr0 -global_dbcr0: - .space 8*NR_CPUS - .previous #endif /* !(CONFIG_4xx || CONFIG_BOOKE) */ - -do_work: /* r10 contains MSR_KERNEL here */ - andi. r0,r9,_TIF_NEED_RESCHED - beq do_user_signal - -do_resched: /* r10 contains MSR_KERNEL here */ -#ifdef CONFIG_TRACE_IRQFLAGS - bl trace_hardirqs_on - mfmsr r10 -#endif - ori r10,r10,MSR_EE - SYNC - MTMSRD(r10) /* hard-enable interrupts */ - bl schedule -recheck: - /* Note: And we don't tell it we are disabling them again - * neither. Those disable/enable cycles used to peek at - * TI_FLAGS aren't advertised. - */ - LOAD_REG_IMMEDIATE(r10,MSR_KERNEL) - SYNC - MTMSRD(r10) /* disable interrupts */ - lwz r9,TI_FLAGS(r2) - andi. r0,r9,_TIF_NEED_RESCHED - bne- do_resched - andi. r0,r9,_TIF_USER_WORK_MASK - beq restore_user -do_user_signal: /* r10 contains MSR_KERNEL here */ - ori r10,r10,MSR_EE - SYNC - MTMSRD(r10) /* hard-enable interrupts */ - /* save r13-r31 in the exception frame, if not already done */ - lwz r3,_TRAP(r1) - andi. r0,r3,1 - beq 2f - SAVE_NVGPRS(r1) - rlwinm r3,r3,0,0,30 - stw r3,_TRAP(r1) -2: addi r3,r1,STACK_FRAME_OVERHEAD - mr r4,r9 - bl do_notify_resume - REST_NVGPRS(r1) - b recheck - -/* - * We come here when we are at the end of handling an exception - * that occurred at a place where taking an exception will lose - * state information, such as the contents of SRR0 and SRR1. - */ -nonrecoverable: - lis r10,exc_exit_restart_end@ha - addi r10,r10,exc_exit_restart_end@l - cmplw r12,r10 -#ifdef CONFIG_PPC_BOOK3S_601 - bgelr -#else - bge 3f -#endif - lis r11,exc_exit_restart@ha - addi r11,r11,exc_exit_restart@l - cmplw r12,r11 -#ifdef CONFIG_PPC_BOOK3S_601 - bltlr -#else - blt 3f -#endif - lis r10,ee_restarts@ha - lwz r12,ee_restarts@l(r10) - addi r12,r12,1 - stw r12,ee_restarts@l(r10) - mr r12,r11 /* restart at exc_exit_restart */ - blr -3: /* OK, we can't recover, kill this process */ - /* but the 601 doesn't implement the RI bit, so assume it's OK */ - lwz r3,_TRAP(r1) - andi. r0,r3,1 - beq 5f - SAVE_NVGPRS(r1) - rlwinm r3,r3,0,0,30 - stw r3,_TRAP(r1) -5: mfspr r2,SPRN_SPRG_THREAD - addi r2,r2,-THREAD - tovirt(r2,r2) /* set back r2 to current */ -4: addi r3,r1,STACK_FRAME_OVERHEAD - bl unrecoverable_exception - /* shouldn't return */ - b 4b - - .section .bss - .align 2 -ee_restarts: - .space 4 - .previous - -/* - * PROM code for specific machines follows. Put it - * here so it's easy to add arch-specific sections later. - * -- Cort - */ -#ifdef CONFIG_PPC_RTAS -/* - * On CHRP, the Run-Time Abstraction Services (RTAS) have to be - * called with the MMU off. - */ -_GLOBAL(enter_rtas) - stwu r1,-INT_FRAME_SIZE(r1) - mflr r0 - stw r0,INT_FRAME_SIZE+4(r1) - LOAD_REG_ADDR(r4, rtas) - lis r6,1f@ha /* physical return address for rtas */ - addi r6,r6,1f@l - tophys(r6,r6) - tophys(r7,r1) - lwz r8,RTASENTRY(r4) - lwz r4,RTASBASE(r4) - mfmsr r9 - stw r9,8(r1) - LOAD_REG_IMMEDIATE(r0,MSR_KERNEL) - SYNC /* disable interrupts so SRR0/1 */ - MTMSRD(r0) /* don't get trashed */ - li r9,MSR_KERNEL & ~(MSR_IR|MSR_DR) - mtlr r6 - stw r7, THREAD + RTAS_SP(r2) - mtspr SPRN_SRR0,r8 - mtspr SPRN_SRR1,r9 - RFI -1: tophys(r9,r1) - lwz r8,INT_FRAME_SIZE+4(r9) /* get return address */ - lwz r9,8(r9) /* original msr value */ - addi r1,r1,INT_FRAME_SIZE - li r0,0 - tophys(r7, r2) - stw r0, THREAD + RTAS_SP(r7) - mtspr SPRN_SRR0,r8 - mtspr SPRN_SRR1,r9 - RFI /* return to caller */ - - .globl machine_check_in_rtas -machine_check_in_rtas: - twi 31,0,0 - /* XXX load up BATs and panic */ - -#endif /* CONFIG_PPC_RTAS */ diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S deleted file mode 100644 index 3fd3ef352e3f..000000000000 --- a/arch/powerpc/kernel/entry_64.S +++ /dev/null @@ -1,1354 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * PowerPC version - * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) - * Rewritten by Cort Dougan (cort@cs.nmt.edu) for PReP - * Copyright (C) 1996 Cort Dougan <cort@cs.nmt.edu> - * Adapted for Power Macintosh by Paul Mackerras. - * Low-level exception handlers and MMU support - * rewritten by Paul Mackerras. - * Copyright (C) 1996 Paul Mackerras. - * MPC8xx modifications Copyright (C) 1997 Dan Malek (dmalek@jlc.net). - * - * This file contains the system call entry code, context switch - * code, and exception/interrupt return code for PowerPC. - */ - -#include <linux/errno.h> -#include <linux/err.h> -#include <asm/unistd.h> -#include <asm/processor.h> -#include <asm/page.h> -#include <asm/mmu.h> -#include <asm/thread_info.h> -#include <asm/code-patching-asm.h> -#include <asm/ppc_asm.h> -#include <asm/asm-offsets.h> -#include <asm/cputable.h> -#include <asm/firmware.h> -#include <asm/bug.h> -#include <asm/ptrace.h> -#include <asm/irqflags.h> -#include <asm/hw_irq.h> -#include <asm/context_tracking.h> -#include <asm/tm.h> -#include <asm/ppc-opcode.h> -#include <asm/barrier.h> -#include <asm/export.h> -#include <asm/asm-compat.h> -#ifdef CONFIG_PPC_BOOK3S -#include <asm/exception-64s.h> -#else -#include <asm/exception-64e.h> -#endif -#include <asm/feature-fixups.h> -#include <asm/kup.h> - -/* - * System calls. - */ - .section ".toc","aw" -SYS_CALL_TABLE: - .tc sys_call_table[TC],sys_call_table - -COMPAT_SYS_CALL_TABLE: - .tc compat_sys_call_table[TC],compat_sys_call_table - -/* This value is used to mark exception frames on the stack. */ -exception_marker: - .tc ID_EXC_MARKER[TC],STACK_FRAME_REGS_MARKER - - .section ".text" - .align 7 - - .globl system_call_common -system_call_common: -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM -BEGIN_FTR_SECTION - extrdi. r10, r12, 1, (63-MSR_TS_T_LG) /* transaction active? */ - bne .Ltabort_syscall -END_FTR_SECTION_IFSET(CPU_FTR_TM) -#endif - mr r10,r1 - ld r1,PACAKSAVE(r13) - std r10,0(r1) - std r11,_NIP(r1) - std r12,_MSR(r1) - std r0,GPR0(r1) - std r10,GPR1(r1) -#ifdef CONFIG_PPC_FSL_BOOK3E -START_BTB_FLUSH_SECTION - BTB_FLUSH(r10) -END_BTB_FLUSH_SECTION -#endif - ACCOUNT_CPU_USER_ENTRY(r13, r10, r11) - std r2,GPR2(r1) - std r3,GPR3(r1) - mfcr r2 - std r4,GPR4(r1) - std r5,GPR5(r1) - std r6,GPR6(r1) - std r7,GPR7(r1) - std r8,GPR8(r1) - li r11,0 - std r11,GPR9(r1) - std r11,GPR10(r1) - std r11,GPR11(r1) - std r11,GPR12(r1) - std r11,_XER(r1) - std r11,_CTR(r1) - std r9,GPR13(r1) - mflr r10 - /* - * This clears CR0.SO (bit 28), which is the error indication on - * return from this system call. - */ - rldimi r2,r11,28,(63-28) - li r11,0xc01 - std r10,_LINK(r1) - std r11,_TRAP(r1) - std r3,ORIG_GPR3(r1) - std r2,_CCR(r1) - ld r2,PACATOC(r13) - addi r9,r1,STACK_FRAME_OVERHEAD - ld r11,exception_marker@toc(r2) - std r11,-16(r9) /* "regshere" marker */ - - kuap_check_amr r10, r11 - -#if defined(CONFIG_VIRT_CPU_ACCOUNTING_NATIVE) && defined(CONFIG_PPC_SPLPAR) -BEGIN_FW_FTR_SECTION - /* see if there are any DTL entries to process */ - ld r10,PACALPPACAPTR(r13) /* get ptr to VPA */ - ld r11,PACA_DTL_RIDX(r13) /* get log read index */ - addi r10,r10,LPPACA_DTLIDX - LDX_BE r10,0,r10 /* get log write index */ - cmpd r11,r10 - beq+ 33f - bl accumulate_stolen_time - REST_GPR(0,r1) - REST_4GPRS(3,r1) - REST_2GPRS(7,r1) - addi r9,r1,STACK_FRAME_OVERHEAD -33: -END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR) -#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE && CONFIG_PPC_SPLPAR */ - - /* - * A syscall should always be called with interrupts enabled - * so we just unconditionally hard-enable here. When some kind - * of irq tracing is used, we additionally check that condition - * is correct - */ -#if defined(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG) && defined(CONFIG_BUG) - lbz r10,PACAIRQSOFTMASK(r13) -1: tdnei r10,IRQS_ENABLED - EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,BUGFLAG_WARNING -#endif - -#ifdef CONFIG_PPC_BOOK3E - wrteei 1 -#else - li r11,MSR_RI - ori r11,r11,MSR_EE - mtmsrd r11,1 -#endif /* CONFIG_PPC_BOOK3E */ - -system_call: /* label this so stack traces look sane */ - /* We do need to set SOFTE in the stack frame or the return - * from interrupt will be painful - */ - li r10,IRQS_ENABLED - std r10,SOFTE(r1) - - ld r11, PACA_THREAD_INFO(r13) - ld r10,TI_FLAGS(r11) - andi. r11,r10,_TIF_SYSCALL_DOTRACE - bne .Lsyscall_dotrace /* does not return */ - cmpldi 0,r0,NR_syscalls - bge- .Lsyscall_enosys - -.Lsyscall: -/* - * Need to vector to 32 Bit or default sys_call_table here, - * based on caller's run-mode / personality. - */ - ld r11,SYS_CALL_TABLE@toc(2) - andis. r10,r10,_TIF_32BIT@h - beq 15f - ld r11,COMPAT_SYS_CALL_TABLE@toc(2) - clrldi r3,r3,32 - clrldi r4,r4,32 - clrldi r5,r5,32 - clrldi r6,r6,32 - clrldi r7,r7,32 - clrldi r8,r8,32 -15: - slwi r0,r0,3 - - barrier_nospec_asm - /* - * Prevent the load of the handler below (based on the user-passed - * system call number) being speculatively executed until the test - * against NR_syscalls and branch to .Lsyscall_enosys above has - * committed. - */ - - ldx r12,r11,r0 /* Fetch system call handler [ptr] */ - mtctr r12 - bctrl /* Call handler */ - - /* syscall_exit can exit to kernel mode, via ret_from_kernel_thread */ -.Lsyscall_exit: - std r3,RESULT(r1) - -#ifdef CONFIG_DEBUG_RSEQ - /* Check whether the syscall is issued inside a restartable sequence */ - addi r3,r1,STACK_FRAME_OVERHEAD - bl rseq_syscall - ld r3,RESULT(r1) -#endif - - ld r12, PACA_THREAD_INFO(r13) - - ld r8,_MSR(r1) - -/* - * This is a few instructions into the actual syscall exit path (which actually - * starts at .Lsyscall_exit) to cater to kprobe blacklisting and to reduce the - * number of visible symbols for profiling purposes. - * - * We can probe from system_call until this point as MSR_RI is set. But once it - * is cleared below, we won't be able to take a trap. - * - * This is blacklisted from kprobes further below with _ASM_NOKPROBE_SYMBOL(). - */ -system_call_exit: - /* - * Disable interrupts so current_thread_info()->flags can't change, - * and so that we don't get interrupted after loading SRR0/1. - * - * Leave MSR_RI enabled for now, because with THREAD_INFO_IN_TASK we - * could fault on the load of the TI_FLAGS below. - */ -#ifdef CONFIG_PPC_BOOK3E - wrteei 0 -#else - li r11,MSR_RI - mtmsrd r11,1 -#endif /* CONFIG_PPC_BOOK3E */ - - ld r9,TI_FLAGS(r12) - li r11,-MAX_ERRNO - andi. r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP|_TIF_USER_WORK_MASK|_TIF_PERSYSCALL_MASK) - bne- .Lsyscall_exit_work - - andi. r0,r8,MSR_FP - beq 2f -#ifdef CONFIG_ALTIVEC - andis. r0,r8,MSR_VEC@h - bne 3f -#endif -2: addi r3,r1,STACK_FRAME_OVERHEAD - bl restore_math - ld r8,_MSR(r1) - ld r3,RESULT(r1) - li r11,-MAX_ERRNO - -3: cmpld r3,r11 - ld r5,_CCR(r1) - bge- .Lsyscall_error -.Lsyscall_error_cont: - ld r7,_NIP(r1) -BEGIN_FTR_SECTION - stdcx. r0,0,r1 /* to clear the reservation */ -END_FTR_SECTION_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS) - andi. r6,r8,MSR_PR - ld r4,_LINK(r1) - - kuap_check_amr r10, r11 - -#ifdef CONFIG_PPC_BOOK3S - /* - * Clear MSR_RI, MSR_EE is already and remains disabled. We could do - * this later, but testing shows that doing it here causes less slow - * down than doing it closer to the rfid. - */ - li r11,0 - mtmsrd r11,1 -#endif - - beq- 1f - ACCOUNT_CPU_USER_EXIT(r13, r11, r12) - -BEGIN_FTR_SECTION - HMT_MEDIUM_LOW -END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) - -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM - std r8, PACATMSCRATCH(r13) -#endif - - /* - * We don't need to restore AMR on the way back to userspace for KUAP. - * The value of AMR only matters while we're in the kernel. - */ - ld r13,GPR13(r1) /* only restore r13 if returning to usermode */ - ld r2,GPR2(r1) - ld r1,GPR1(r1) - mtlr r4 - mtcr r5 - mtspr SPRN_SRR0,r7 - mtspr SPRN_SRR1,r8 - RFI_TO_USER - b . /* prevent speculative execution */ - -1: /* exit to kernel */ - kuap_restore_amr r2 - - ld r2,GPR2(r1) - ld r1,GPR1(r1) - mtlr r4 - mtcr r5 - mtspr SPRN_SRR0,r7 - mtspr SPRN_SRR1,r8 - RFI_TO_KERNEL - b . /* prevent speculative execution */ - -.Lsyscall_error: - oris r5,r5,0x1000 /* Set SO bit in CR */ - neg r3,r3 - std r5,_CCR(r1) - b .Lsyscall_error_cont - -/* Traced system call support */ -.Lsyscall_dotrace: - bl save_nvgprs - addi r3,r1,STACK_FRAME_OVERHEAD - bl do_syscall_trace_enter - - /* - * We use the return value of do_syscall_trace_enter() as the syscall - * number. If the syscall was rejected for any reason do_syscall_trace_enter() - * returns an invalid syscall number and the test below against - * NR_syscalls will fail. - */ - mr r0,r3 - - /* Restore argument registers just clobbered and/or possibly changed. */ - ld r3,GPR3(r1) - ld r4,GPR4(r1) - ld r5,GPR5(r1) - ld r6,GPR6(r1) - ld r7,GPR7(r1) - ld r8,GPR8(r1) - - /* Repopulate r9 and r10 for the syscall path */ - addi r9,r1,STACK_FRAME_OVERHEAD - ld r10, PACA_THREAD_INFO(r13) - ld r10,TI_FLAGS(r10) - - cmpldi r0,NR_syscalls - blt+ .Lsyscall - - /* Return code is already in r3 thanks to do_syscall_trace_enter() */ - b .Lsyscall_exit - - -.Lsyscall_enosys: - li r3,-ENOSYS - b .Lsyscall_exit - -.Lsyscall_exit_work: - /* If TIF_RESTOREALL is set, don't scribble on either r3 or ccr. - If TIF_NOERROR is set, just save r3 as it is. */ - - andi. r0,r9,_TIF_RESTOREALL - beq+ 0f - REST_NVGPRS(r1) - b 2f -0: cmpld r3,r11 /* r11 is -MAX_ERRNO */ - blt+ 1f - andi. r0,r9,_TIF_NOERROR - bne- 1f - ld r5,_CCR(r1) - neg r3,r3 - oris r5,r5,0x1000 /* Set SO bit in CR */ - std r5,_CCR(r1) -1: std r3,GPR3(r1) -2: andi. r0,r9,(_TIF_PERSYSCALL_MASK) - beq 4f - - /* Clear per-syscall TIF flags if any are set. */ - - li r11,_TIF_PERSYSCALL_MASK - addi r12,r12,TI_FLAGS -3: ldarx r10,0,r12 - andc r10,r10,r11 - stdcx. r10,0,r12 - bne- 3b - subi r12,r12,TI_FLAGS - -4: /* Anything else left to do? */ -BEGIN_FTR_SECTION - lis r3,DEFAULT_PPR@highest /* Set default PPR */ - sldi r3,r3,32 /* bits 11-13 are used for ppr */ - std r3,_PPR(r1) -END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) - - andi. r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP) - beq ret_from_except_lite - - /* Re-enable interrupts */ -#ifdef CONFIG_PPC_BOOK3E - wrteei 1 -#else - li r10,MSR_RI - ori r10,r10,MSR_EE - mtmsrd r10,1 -#endif /* CONFIG_PPC_BOOK3E */ - - bl save_nvgprs - addi r3,r1,STACK_FRAME_OVERHEAD - bl do_syscall_trace_leave - b ret_from_except - -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM -.Ltabort_syscall: - /* Firstly we need to enable TM in the kernel */ - mfmsr r10 - li r9, 1 - rldimi r10, r9, MSR_TM_LG, 63-MSR_TM_LG - mtmsrd r10, 0 - - /* tabort, this dooms the transaction, nothing else */ - li r9, (TM_CAUSE_SYSCALL|TM_CAUSE_PERSISTENT) - TABORT(R9) - - /* - * Return directly to userspace. We have corrupted user register state, - * but userspace will never see that register state. Execution will - * resume after the tbegin of the aborted transaction with the - * checkpointed register state. - */ - li r9, MSR_RI - andc r10, r10, r9 - mtmsrd r10, 1 - mtspr SPRN_SRR0, r11 - mtspr SPRN_SRR1, r12 - RFI_TO_USER - b . /* prevent speculative execution */ -#endif -_ASM_NOKPROBE_SYMBOL(system_call_common); -_ASM_NOKPROBE_SYMBOL(system_call_exit); - -/* Save non-volatile GPRs, if not already saved. */ -_GLOBAL(save_nvgprs) - ld r11,_TRAP(r1) - andi. r0,r11,1 - beqlr- - SAVE_NVGPRS(r1) - clrrdi r0,r11,1 - std r0,_TRAP(r1) - blr -_ASM_NOKPROBE_SYMBOL(save_nvgprs); - - -/* - * The sigsuspend and rt_sigsuspend system calls can call do_signal - * and thus put the process into the stopped state where we might - * want to examine its user state with ptrace. Therefore we need - * to save all the nonvolatile registers (r14 - r31) before calling - * the C code. Similarly, fork, vfork and clone need the full - * register state on the stack so that it can be copied to the child. - */ - -_GLOBAL(ppc_fork) - bl save_nvgprs - bl sys_fork - b .Lsyscall_exit - -_GLOBAL(ppc_vfork) - bl save_nvgprs - bl sys_vfork - b .Lsyscall_exit - -_GLOBAL(ppc_clone) - bl save_nvgprs - bl sys_clone - b .Lsyscall_exit - -_GLOBAL(ppc_clone3) - bl save_nvgprs - bl sys_clone3 - b .Lsyscall_exit - -_GLOBAL(ppc32_swapcontext) - bl save_nvgprs - bl compat_sys_swapcontext - b .Lsyscall_exit - -_GLOBAL(ppc64_swapcontext) - bl save_nvgprs - bl sys_swapcontext - b .Lsyscall_exit - -_GLOBAL(ppc_switch_endian) - bl save_nvgprs - bl sys_switch_endian - b .Lsyscall_exit - -_GLOBAL(ret_from_fork) - bl schedule_tail - REST_NVGPRS(r1) - li r3,0 - b .Lsyscall_exit - -_GLOBAL(ret_from_kernel_thread) - bl schedule_tail - REST_NVGPRS(r1) - mtlr r14 - mr r3,r15 -#ifdef PPC64_ELF_ABI_v2 - mr r12,r14 -#endif - blrl - li r3,0 - b .Lsyscall_exit - -#ifdef CONFIG_PPC_BOOK3S_64 - -#define FLUSH_COUNT_CACHE \ -1: nop; \ - patch_site 1b, patch__call_flush_count_cache - - -#define BCCTR_FLUSH .long 0x4c400420 - -.macro nops number - .rept \number - nop - .endr -.endm - -.balign 32 -.global flush_count_cache -flush_count_cache: - /* Save LR into r9 */ - mflr r9 - - // Flush the link stack - .rept 64 - bl .+4 - .endr - b 1f - nops 6 - - .balign 32 - /* Restore LR */ -1: mtlr r9 - - // If we're just flushing the link stack, return here -3: nop - patch_site 3b patch__flush_link_stack_return - - li r9,0x7fff - mtctr r9 - - BCCTR_FLUSH - -2: nop - patch_site 2b patch__flush_count_cache_return - - nops 3 - - .rept 278 - .balign 32 - BCCTR_FLUSH - nops 7 - .endr - - blr -#else -#define FLUSH_COUNT_CACHE -#endif /* CONFIG_PPC_BOOK3S_64 */ - -/* - * This routine switches between two different tasks. The process - * state of one is saved on its kernel stack. Then the state - * of the other is restored from its kernel stack. The memory - * management hardware is updated to the second process's state. - * Finally, we can return to the second process, via ret_from_except. - * On entry, r3 points to the THREAD for the current task, r4 - * points to the THREAD for the new task. - * - * Note: there are two ways to get to the "going out" portion - * of this code; either by coming in via the entry (_switch) - * or via "fork" which must set up an environment equivalent - * to the "_switch" path. If you change this you'll have to change - * the fork code also. - * - * The code which creates the new task context is in 'copy_thread' - * in arch/powerpc/kernel/process.c - */ - .align 7 -_GLOBAL(_switch) - mflr r0 - std r0,16(r1) - stdu r1,-SWITCH_FRAME_SIZE(r1) - /* r3-r13 are caller saved -- Cort */ - SAVE_8GPRS(14, r1) - SAVE_10GPRS(22, r1) - std r0,_NIP(r1) /* Return to switch caller */ - mfcr r23 - std r23,_CCR(r1) - std r1,KSP(r3) /* Set old stack pointer */ - - kuap_check_amr r9, r10 - - FLUSH_COUNT_CACHE - - /* - * On SMP kernels, care must be taken because a task may be - * scheduled off CPUx and on to CPUy. Memory ordering must be - * considered. - * - * Cacheable stores on CPUx will be visible when the task is - * scheduled on CPUy by virtue of the core scheduler barriers - * (see "Notes on Program-Order guarantees on SMP systems." in - * kernel/sched/core.c). - * - * Uncacheable stores in the case of involuntary preemption must - * be taken care of. The smp_mb__before_spin_lock() in __schedule() - * is implemented as hwsync on powerpc, which orders MMIO too. So - * long as there is an hwsync in the context switch path, it will - * be executed on the source CPU after the task has performed - * all MMIO ops on that CPU, and on the destination CPU before the - * task performs any MMIO ops there. - */ - - /* - * The kernel context switch path must contain a spin_lock, - * which contains larx/stcx, which will clear any reservation - * of the task being switched. - */ -#ifdef CONFIG_PPC_BOOK3S -/* Cancel all explict user streams as they will have no use after context - * switch and will stop the HW from creating streams itself - */ - DCBT_BOOK3S_STOP_ALL_STREAM_IDS(r6) -#endif - - addi r6,r4,-THREAD /* Convert THREAD to 'current' */ - std r6,PACACURRENT(r13) /* Set new 'current' */ -#if defined(CONFIG_STACKPROTECTOR) - ld r6, TASK_CANARY(r6) - std r6, PACA_CANARY(r13) -#endif - - ld r8,KSP(r4) /* new stack pointer */ -#ifdef CONFIG_PPC_BOOK3S_64 -BEGIN_MMU_FTR_SECTION - b 2f -END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX) -BEGIN_FTR_SECTION - clrrdi r6,r8,28 /* get its ESID */ - clrrdi r9,r1,28 /* get current sp ESID */ -FTR_SECTION_ELSE - clrrdi r6,r8,40 /* get its 1T ESID */ - clrrdi r9,r1,40 /* get current sp 1T ESID */ -ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_1T_SEGMENT) - clrldi. r0,r6,2 /* is new ESID c00000000? */ - cmpd cr1,r6,r9 /* or is new ESID the same as current ESID? */ - cror eq,4*cr1+eq,eq - beq 2f /* if yes, don't slbie it */ - - /* Bolt in the new stack SLB entry */ - ld r7,KSP_VSID(r4) /* Get new stack's VSID */ - oris r0,r6,(SLB_ESID_V)@h - ori r0,r0,(SLB_NUM_BOLTED-1)@l -BEGIN_FTR_SECTION - li r9,MMU_SEGSIZE_1T /* insert B field */ - oris r6,r6,(MMU_SEGSIZE_1T << SLBIE_SSIZE_SHIFT)@h - rldimi r7,r9,SLB_VSID_SSIZE_SHIFT,0 -END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) - - /* Update the last bolted SLB. No write barriers are needed - * here, provided we only update the current CPU's SLB shadow - * buffer. - */ - ld r9,PACA_SLBSHADOWPTR(r13) - li r12,0 - std r12,SLBSHADOW_STACKESID(r9) /* Clear ESID */ - li r12,SLBSHADOW_STACKVSID - STDX_BE r7,r12,r9 /* Save VSID */ - li r12,SLBSHADOW_STACKESID - STDX_BE r0,r12,r9 /* Save ESID */ - - /* No need to check for MMU_FTR_NO_SLBIE_B here, since when - * we have 1TB segments, the only CPUs known to have the errata - * only support less than 1TB of system memory and we'll never - * actually hit this code path. - */ - - isync - slbie r6 -BEGIN_FTR_SECTION - slbie r6 /* Workaround POWER5 < DD2.1 issue */ -END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) - slbmte r7,r0 - isync -2: -#endif /* CONFIG_PPC_BOOK3S_64 */ - - clrrdi r7, r8, THREAD_SHIFT /* base of new stack */ - /* Note: this uses SWITCH_FRAME_SIZE rather than INT_FRAME_SIZE - because we don't need to leave the 288-byte ABI gap at the - top of the kernel stack. */ - addi r7,r7,THREAD_SIZE-SWITCH_FRAME_SIZE - - /* - * PMU interrupts in radix may come in here. They will use r1, not - * PACAKSAVE, so this stack switch will not cause a problem. They - * will store to the process stack, which may then be migrated to - * another CPU. However the rq lock release on this CPU paired with - * the rq lock acquire on the new CPU before the stack becomes - * active on the new CPU, will order those stores. - */ - mr r1,r8 /* start using new stack pointer */ - std r7,PACAKSAVE(r13) - - ld r6,_CCR(r1) - mtcrf 0xFF,r6 - - /* r3-r13 are destroyed -- Cort */ - REST_8GPRS(14, r1) - REST_10GPRS(22, r1) - - /* convert old thread to its task_struct for return value */ - addi r3,r3,-THREAD - ld r7,_NIP(r1) /* Return to _switch caller in new task */ - mtlr r7 - addi r1,r1,SWITCH_FRAME_SIZE - blr - - .align 7 -_GLOBAL(ret_from_except) - ld r11,_TRAP(r1) - andi. r0,r11,1 - bne ret_from_except_lite - REST_NVGPRS(r1) - -_GLOBAL(ret_from_except_lite) - /* - * Disable interrupts so that current_thread_info()->flags - * can't change between when we test it and when we return - * from the interrupt. - */ -#ifdef CONFIG_PPC_BOOK3E - wrteei 0 -#else - li r10,MSR_RI - mtmsrd r10,1 /* Update machine state */ -#endif /* CONFIG_PPC_BOOK3E */ - - ld r9, PACA_THREAD_INFO(r13) - ld r3,_MSR(r1) -#ifdef CONFIG_PPC_BOOK3E - ld r10,PACACURRENT(r13) -#endif /* CONFIG_PPC_BOOK3E */ - ld r4,TI_FLAGS(r9) - andi. r3,r3,MSR_PR - beq resume_kernel -#ifdef CONFIG_PPC_BOOK3E - lwz r3,(THREAD+THREAD_DBCR0)(r10) -#endif /* CONFIG_PPC_BOOK3E */ - - /* Check current_thread_info()->flags */ - andi. r0,r4,_TIF_USER_WORK_MASK - bne 1f -#ifdef CONFIG_PPC_BOOK3E - /* - * Check to see if the dbcr0 register is set up to debug. - * Use the internal debug mode bit to do this. - */ - andis. r0,r3,DBCR0_IDM@h - beq restore - mfmsr r0 - rlwinm r0,r0,0,~MSR_DE /* Clear MSR.DE */ - mtmsr r0 - mtspr SPRN_DBCR0,r3 - li r10, -1 - mtspr SPRN_DBSR,r10 - b restore -#else - addi r3,r1,STACK_FRAME_OVERHEAD - bl restore_math - b restore -#endif -1: andi. r0,r4,_TIF_NEED_RESCHED - beq 2f - bl restore_interrupts - SCHEDULE_USER - b ret_from_except_lite -2: -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM - andi. r0,r4,_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM - bne 3f /* only restore TM if nothing else to do */ - addi r3,r1,STACK_FRAME_OVERHEAD - bl restore_tm_state - b restore -3: -#endif - bl save_nvgprs - /* - * Use a non volatile GPR to save and restore our thread_info flags - * across the call to restore_interrupts. - */ - mr r30,r4 - bl restore_interrupts - mr r4,r30 - addi r3,r1,STACK_FRAME_OVERHEAD - bl do_notify_resume - b ret_from_except - -resume_kernel: - /* check current_thread_info, _TIF_EMULATE_STACK_STORE */ - andis. r8,r4,_TIF_EMULATE_STACK_STORE@h - beq+ 1f - - addi r8,r1,INT_FRAME_SIZE /* Get the kprobed function entry */ - - ld r3,GPR1(r1) - subi r3,r3,INT_FRAME_SIZE /* dst: Allocate a trampoline exception frame */ - mr r4,r1 /* src: current exception frame */ - mr r1,r3 /* Reroute the trampoline frame to r1 */ - - /* Copy from the original to the trampoline. */ - li r5,INT_FRAME_SIZE/8 /* size: INT_FRAME_SIZE */ - li r6,0 /* start offset: 0 */ - mtctr r5 -2: ldx r0,r6,r4 - stdx r0,r6,r3 - addi r6,r6,8 - bdnz 2b - - /* Do real store operation to complete stdu */ - ld r5,GPR1(r1) - std r8,0(r5) - - /* Clear _TIF_EMULATE_STACK_STORE flag */ - lis r11,_TIF_EMULATE_STACK_STORE@h - addi r5,r9,TI_FLAGS -0: ldarx r4,0,r5 - andc r4,r4,r11 - stdcx. r4,0,r5 - bne- 0b -1: - -#ifdef CONFIG_PREEMPT - /* Check if we need to preempt */ - andi. r0,r4,_TIF_NEED_RESCHED - beq+ restore - /* Check that preempt_count() == 0 and interrupts are enabled */ - lwz r8,TI_PREEMPT(r9) - cmpwi cr0,r8,0 - bne restore - ld r0,SOFTE(r1) - andi. r0,r0,IRQS_DISABLED - bne restore - - /* - * Here we are preempting the current task. We want to make - * sure we are soft-disabled first and reconcile irq state. - */ - RECONCILE_IRQ_STATE(r3,r4) - bl preempt_schedule_irq - - /* - * arch_local_irq_restore() from preempt_schedule_irq above may - * enable hard interrupt but we really should disable interrupts - * when we return from the interrupt, and so that we don't get - * interrupted after loading SRR0/1. - */ -#ifdef CONFIG_PPC_BOOK3E - wrteei 0 -#else - li r10,MSR_RI - mtmsrd r10,1 /* Update machine state */ -#endif /* CONFIG_PPC_BOOK3E */ -#endif /* CONFIG_PREEMPT */ - - .globl fast_exc_return_irq -fast_exc_return_irq: -restore: - /* - * This is the main kernel exit path. First we check if we - * are about to re-enable interrupts - */ - ld r5,SOFTE(r1) - lbz r6,PACAIRQSOFTMASK(r13) - andi. r5,r5,IRQS_DISABLED - bne .Lrestore_irq_off - - /* We are enabling, were we already enabled ? Yes, just return */ - andi. r6,r6,IRQS_DISABLED - beq cr0,.Ldo_restore - - /* - * We are about to soft-enable interrupts (we are hard disabled - * at this point). We check if there's anything that needs to - * be replayed first. - */ - lbz r0,PACAIRQHAPPENED(r13) - cmpwi cr0,r0,0 - bne- .Lrestore_check_irq_replay - - /* - * Get here when nothing happened while soft-disabled, just - * soft-enable and move-on. We will hard-enable as a side - * effect of rfi - */ -.Lrestore_no_replay: - TRACE_ENABLE_INTS - li r0,IRQS_ENABLED - stb r0,PACAIRQSOFTMASK(r13); - - /* - * Final return path. BookE is handled in a different file - */ -.Ldo_restore: -#ifdef CONFIG_PPC_BOOK3E - b exception_return_book3e -#else - /* - * Clear the reservation. If we know the CPU tracks the address of - * the reservation then we can potentially save some cycles and use - * a larx. On POWER6 and POWER7 this is significantly faster. - */ -BEGIN_FTR_SECTION - stdcx. r0,0,r1 /* to clear the reservation */ -FTR_SECTION_ELSE - ldarx r4,0,r1 -ALT_FTR_SECTION_END_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS) - - /* - * Some code path such as load_up_fpu or altivec return directly - * here. They run entirely hard disabled and do not alter the - * interrupt state. They also don't use lwarx/stwcx. and thus - * are known not to leave dangling reservations. - */ - .globl fast_exception_return -fast_exception_return: - ld r3,_MSR(r1) - ld r4,_CTR(r1) - ld r0,_LINK(r1) - mtctr r4 - mtlr r0 - ld r4,_XER(r1) - mtspr SPRN_XER,r4 - - kuap_check_amr r5, r6 - - REST_8GPRS(5, r1) - - andi. r0,r3,MSR_RI - beq- .Lunrecov_restore - - /* - * Clear RI before restoring r13. If we are returning to - * userspace and we take an exception after restoring r13, - * we end up corrupting the userspace r13 value. - */ - li r4,0 - mtmsrd r4,1 - -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM - /* TM debug */ - std r3, PACATMSCRATCH(r13) /* Stash returned-to MSR */ -#endif - /* - * r13 is our per cpu area, only restore it if we are returning to - * userspace the value stored in the stack frame may belong to - * another CPU. - */ - andi. r0,r3,MSR_PR - beq 1f -BEGIN_FTR_SECTION - /* Restore PPR */ - ld r2,_PPR(r1) - mtspr SPRN_PPR,r2 -END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) - ACCOUNT_CPU_USER_EXIT(r13, r2, r4) - REST_GPR(13, r1) - - /* - * We don't need to restore AMR on the way back to userspace for KUAP. - * The value of AMR only matters while we're in the kernel. - */ - mtspr SPRN_SRR1,r3 - - ld r2,_CCR(r1) - mtcrf 0xFF,r2 - ld r2,_NIP(r1) - mtspr SPRN_SRR0,r2 - - ld r0,GPR0(r1) - ld r2,GPR2(r1) - ld r3,GPR3(r1) - ld r4,GPR4(r1) - ld r1,GPR1(r1) - RFI_TO_USER - b . /* prevent speculative execution */ - -1: mtspr SPRN_SRR1,r3 - - ld r2,_CCR(r1) - mtcrf 0xFF,r2 - ld r2,_NIP(r1) - mtspr SPRN_SRR0,r2 - - /* - * Leaving a stale exception_marker on the stack can confuse - * the reliable stack unwinder later on. Clear it. - */ - li r2,0 - std r2,STACK_FRAME_OVERHEAD-16(r1) - - ld r0,GPR0(r1) - ld r2,GPR2(r1) - ld r3,GPR3(r1) - - kuap_restore_amr r4 - - ld r4,GPR4(r1) - ld r1,GPR1(r1) - RFI_TO_KERNEL - b . /* prevent speculative execution */ - -#endif /* CONFIG_PPC_BOOK3E */ - - /* - * We are returning to a context with interrupts soft disabled. - * - * However, we may also about to hard enable, so we need to - * make sure that in this case, we also clear PACA_IRQ_HARD_DIS - * or that bit can get out of sync and bad things will happen - */ -.Lrestore_irq_off: - ld r3,_MSR(r1) - lbz r7,PACAIRQHAPPENED(r13) - andi. r0,r3,MSR_EE - beq 1f - rlwinm r7,r7,0,~PACA_IRQ_HARD_DIS - stb r7,PACAIRQHAPPENED(r13) -1: -#if defined(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG) && defined(CONFIG_BUG) - /* The interrupt should not have soft enabled. */ - lbz r7,PACAIRQSOFTMASK(r13) -1: tdeqi r7,IRQS_ENABLED - EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,BUGFLAG_WARNING -#endif - b .Ldo_restore - - /* - * Something did happen, check if a re-emit is needed - * (this also clears paca->irq_happened) - */ -.Lrestore_check_irq_replay: - /* XXX: We could implement a fast path here where we check - * for irq_happened being just 0x01, in which case we can - * clear it and return. That means that we would potentially - * miss a decrementer having wrapped all the way around. - * - * Still, this might be useful for things like hash_page - */ - bl __check_irq_replay - cmpwi cr0,r3,0 - beq .Lrestore_no_replay - - /* - * We need to re-emit an interrupt. We do so by re-using our - * existing exception frame. We first change the trap value, - * but we need to ensure we preserve the low nibble of it - */ - ld r4,_TRAP(r1) - clrldi r4,r4,60 - or r4,r4,r3 - std r4,_TRAP(r1) - - /* - * PACA_IRQ_HARD_DIS won't always be set here, so set it now - * to reconcile the IRQ state. Tracing is already accounted for. - */ - lbz r4,PACAIRQHAPPENED(r13) - ori r4,r4,PACA_IRQ_HARD_DIS - stb r4,PACAIRQHAPPENED(r13) - - /* - * Then find the right handler and call it. Interrupts are - * still soft-disabled and we keep them that way. - */ - cmpwi cr0,r3,0x500 - bne 1f - addi r3,r1,STACK_FRAME_OVERHEAD; - bl do_IRQ - b ret_from_except -1: cmpwi cr0,r3,0xf00 - bne 1f - addi r3,r1,STACK_FRAME_OVERHEAD; - bl performance_monitor_exception - b ret_from_except -1: cmpwi cr0,r3,0xe60 - bne 1f - addi r3,r1,STACK_FRAME_OVERHEAD; - bl handle_hmi_exception - b ret_from_except -1: cmpwi cr0,r3,0x900 - bne 1f - addi r3,r1,STACK_FRAME_OVERHEAD; - bl timer_interrupt - b ret_from_except -#ifdef CONFIG_PPC_DOORBELL -1: -#ifdef CONFIG_PPC_BOOK3E - cmpwi cr0,r3,0x280 -#else - cmpwi cr0,r3,0xa00 -#endif /* CONFIG_PPC_BOOK3E */ - bne 1f - addi r3,r1,STACK_FRAME_OVERHEAD; - bl doorbell_exception -#endif /* CONFIG_PPC_DOORBELL */ -1: b ret_from_except /* What else to do here ? */ - -.Lunrecov_restore: - addi r3,r1,STACK_FRAME_OVERHEAD - bl unrecoverable_exception - b .Lunrecov_restore - -_ASM_NOKPROBE_SYMBOL(ret_from_except); -_ASM_NOKPROBE_SYMBOL(ret_from_except_lite); -_ASM_NOKPROBE_SYMBOL(resume_kernel); -_ASM_NOKPROBE_SYMBOL(fast_exc_return_irq); -_ASM_NOKPROBE_SYMBOL(restore); -_ASM_NOKPROBE_SYMBOL(fast_exception_return); - - -#ifdef CONFIG_PPC_RTAS -/* - * On CHRP, the Run-Time Abstraction Services (RTAS) have to be - * called with the MMU off. - * - * In addition, we need to be in 32b mode, at least for now. - * - * Note: r3 is an input parameter to rtas, so don't trash it... - */ -_GLOBAL(enter_rtas) - mflr r0 - std r0,16(r1) - stdu r1,-SWITCH_FRAME_SIZE(r1) /* Save SP and create stack space. */ - - /* Because RTAS is running in 32b mode, it clobbers the high order half - * of all registers that it saves. We therefore save those registers - * RTAS might touch to the stack. (r0, r3-r13 are caller saved) - */ - SAVE_GPR(2, r1) /* Save the TOC */ - SAVE_GPR(13, r1) /* Save paca */ - SAVE_8GPRS(14, r1) /* Save the non-volatiles */ - SAVE_10GPRS(22, r1) /* ditto */ - - mfcr r4 - std r4,_CCR(r1) - mfctr r5 - std r5,_CTR(r1) - mfspr r6,SPRN_XER - std r6,_XER(r1) - mfdar r7 - std r7,_DAR(r1) - mfdsisr r8 - std r8,_DSISR(r1) - - /* Temporary workaround to clear CR until RTAS can be modified to - * ignore all bits. - */ - li r0,0 - mtcr r0 - -#ifdef CONFIG_BUG - /* There is no way it is acceptable to get here with interrupts enabled, - * check it with the asm equivalent of WARN_ON - */ - lbz r0,PACAIRQSOFTMASK(r13) -1: tdeqi r0,IRQS_ENABLED - EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,BUGFLAG_WARNING -#endif - - /* Hard-disable interrupts */ - mfmsr r6 - rldicl r7,r6,48,1 - rotldi r7,r7,16 - mtmsrd r7,1 - - /* Unfortunately, the stack pointer and the MSR are also clobbered, - * so they are saved in the PACA which allows us to restore - * our original state after RTAS returns. - */ - std r1,PACAR1(r13) - std r6,PACASAVEDMSR(r13) - - /* Setup our real return addr */ - LOAD_REG_ADDR(r4,rtas_return_loc) - clrldi r4,r4,2 /* convert to realmode address */ - mtlr r4 - - li r0,0 - ori r0,r0,MSR_EE|MSR_SE|MSR_BE|MSR_RI - andc r0,r6,r0 - - li r9,1 - rldicr r9,r9,MSR_SF_LG,(63-MSR_SF_LG) - ori r9,r9,MSR_IR|MSR_DR|MSR_FE0|MSR_FE1|MSR_FP|MSR_RI|MSR_LE - andc r6,r0,r9 - -__enter_rtas: - sync /* disable interrupts so SRR0/1 */ - mtmsrd r0 /* don't get trashed */ - - LOAD_REG_ADDR(r4, rtas) - ld r5,RTASENTRY(r4) /* get the rtas->entry value */ - ld r4,RTASBASE(r4) /* get the rtas->base value */ - - mtspr SPRN_SRR0,r5 - mtspr SPRN_SRR1,r6 - RFI_TO_KERNEL - b . /* prevent speculative execution */ - -rtas_return_loc: - FIXUP_ENDIAN - - /* - * Clear RI and set SF before anything. - */ - mfmsr r6 - li r0,MSR_RI - andc r6,r6,r0 - sldi r0,r0,(MSR_SF_LG - MSR_RI_LG) - or r6,r6,r0 - sync - mtmsrd r6 - - /* relocation is off at this point */ - GET_PACA(r4) - clrldi r4,r4,2 /* convert to realmode address */ - - bcl 20,31,$+4 -0: mflr r3 - ld r3,(1f-0b)(r3) /* get &rtas_restore_regs */ - - ld r1,PACAR1(r4) /* Restore our SP */ - ld r4,PACASAVEDMSR(r4) /* Restore our MSR */ - - mtspr SPRN_SRR0,r3 - mtspr SPRN_SRR1,r4 - RFI_TO_KERNEL - b . /* prevent speculative execution */ -_ASM_NOKPROBE_SYMBOL(__enter_rtas) -_ASM_NOKPROBE_SYMBOL(rtas_return_loc) - - .align 3 -1: .8byte rtas_restore_regs - -rtas_restore_regs: - /* relocation is on at this point */ - REST_GPR(2, r1) /* Restore the TOC */ - REST_GPR(13, r1) /* Restore paca */ - REST_8GPRS(14, r1) /* Restore the non-volatiles */ - REST_10GPRS(22, r1) /* ditto */ - - GET_PACA(r13) - - ld r4,_CCR(r1) - mtcr r4 - ld r5,_CTR(r1) - mtctr r5 - ld r6,_XER(r1) - mtspr SPRN_XER,r6 - ld r7,_DAR(r1) - mtdar r7 - ld r8,_DSISR(r1) - mtdsisr r8 - - addi r1,r1,SWITCH_FRAME_SIZE /* Unstack our frame */ - ld r0,16(r1) /* get return address */ - - mtlr r0 - blr /* return to caller */ - -#endif /* CONFIG_PPC_RTAS */ - -_GLOBAL(enter_prom) - mflr r0 - std r0,16(r1) - stdu r1,-SWITCH_FRAME_SIZE(r1) /* Save SP and create stack space */ - - /* Because PROM is running in 32b mode, it clobbers the high order half - * of all registers that it saves. We therefore save those registers - * PROM might touch to the stack. (r0, r3-r13 are caller saved) - */ - SAVE_GPR(2, r1) - SAVE_GPR(13, r1) - SAVE_8GPRS(14, r1) - SAVE_10GPRS(22, r1) - mfcr r10 - mfmsr r11 - std r10,_CCR(r1) - std r11,_MSR(r1) - - /* Put PROM address in SRR0 */ - mtsrr0 r4 - - /* Setup our trampoline return addr in LR */ - bcl 20,31,$+4 -0: mflr r4 - addi r4,r4,(1f - 0b) - mtlr r4 - - /* Prepare a 32-bit mode big endian MSR - */ -#ifdef CONFIG_PPC_BOOK3E - rlwinm r11,r11,0,1,31 - mtsrr1 r11 - rfi -#else /* CONFIG_PPC_BOOK3E */ - LOAD_REG_IMMEDIATE(r12, MSR_SF | MSR_ISF | MSR_LE) - andc r11,r11,r12 - mtsrr1 r11 - RFI_TO_KERNEL -#endif /* CONFIG_PPC_BOOK3E */ - -1: /* Return from OF */ - FIXUP_ENDIAN - - /* Just make sure that r1 top 32 bits didn't get - * corrupt by OF - */ - rldicl r1,r1,0,32 - - /* Restore the MSR (back to 64 bits) */ - ld r0,_MSR(r1) - MTMSRD(r0) - isync - - /* Restore other registers */ - REST_GPR(2, r1) - REST_GPR(13, r1) - REST_8GPRS(14, r1) - REST_10GPRS(22, r1) - ld r4,_CCR(r1) - mtcr r4 - - addi r1,r1,SWITCH_FRAME_SIZE - ld r0,16(r1) - mtlr r0 - blr diff --git a/arch/powerpc/kernel/epapr_hcalls.S b/arch/powerpc/kernel/epapr_hcalls.S index 69a912550577..1a9b5ae8ccb2 100644 --- a/arch/powerpc/kernel/epapr_hcalls.S +++ b/arch/powerpc/kernel/epapr_hcalls.S @@ -3,6 +3,7 @@ * Copyright (C) 2012 Freescale Semiconductor, Inc. */ +#include <linux/export.h> #include <linux/threads.h> #include <asm/epapr_hcalls.h> #include <asm/reg.h> @@ -12,7 +13,6 @@ #include <asm/ppc_asm.h> #include <asm/asm-compat.h> #include <asm/asm-offsets.h> -#include <asm/export.h> #ifndef CONFIG_PPC64 /* epapr_ev_idle() was derived from e500_idle() */ @@ -21,7 +21,13 @@ _GLOBAL(epapr_ev_idle) ori r4, r4,_TLF_NAPPING /* so when we take an exception */ PPC_STL r4, TI_LOCAL_FLAGS(r2) /* it will return to our caller */ +#ifdef CONFIG_BOOKE_OR_40x wrteei 1 +#else + mfmsr r4 + ori r4, r4, MSR_EE + mtmsr r4 +#endif idle_loop: LOAD_REG_IMMEDIATE(r11, EV_HCALL_TOKEN(EV_IDLE)) diff --git a/arch/powerpc/kernel/epapr_paravirt.c b/arch/powerpc/kernel/epapr_paravirt.c index 9d32158ce36f..d4b8aff20815 100644 --- a/arch/powerpc/kernel/epapr_paravirt.c +++ b/arch/powerpc/kernel/epapr_paravirt.c @@ -11,6 +11,7 @@ #include <asm/cacheflush.h> #include <asm/code-patching.h> #include <asm/machdep.h> +#include <asm/inst.h> #if !defined(CONFIG_64BIT) || defined(CONFIG_PPC_BOOK3E_64) extern void epapr_ev_idle(void); @@ -36,7 +37,7 @@ static int __init early_init_dt_scan_epapr(unsigned long node, return -1; for (i = 0; i < (len / 4); i++) { - u32 inst = be32_to_cpu(insts[i]); + ppc_inst_t inst = ppc_inst(be32_to_cpu(insts[i])); patch_instruction(epapr_hypercall_start + i, inst); #if !defined(CONFIG_64BIT) || defined(CONFIG_PPC_BOOK3E_64) patch_instruction(epapr_ev_idle_start + i, inst); diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S index e4076e3c072d..dcf0591ad3c2 100644 --- a/arch/powerpc/kernel/exceptions-64e.S +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -5,6 +5,7 @@ * Copyright (C) 2007 Ben. Herrenschmidt (benh@kernel.crashing.org), IBM Corp. */ +#include <linux/linkage.h> #include <linux/threads.h> #include <asm/reg.h> #include <asm/page.h> @@ -13,7 +14,6 @@ #include <asm/cputable.h> #include <asm/setup.h> #include <asm/thread_info.h> -#include <asm/reg_a2.h> #include <asm/exception-64e.h> #include <asm/bug.h> #include <asm/irqflags.h> @@ -24,6 +24,11 @@ #include <asm/kvm_asm.h> #include <asm/kvm_booke_hv_asm.h> #include <asm/feature-fixups.h> +#include <asm/context_tracking.h> + +/* 64e interrupt returns always use SRR registers */ +#define fast_interrupt_return fast_interrupt_return_srr +#define interrupt_return interrupt_return_srr /* XXX This will ultimately add space for a special exception save * structure used to save things like SRR0/SRR1, SPRGs, MAS, etc... @@ -61,10 +66,7 @@ #define SPECIAL_EXC_LOAD(reg, name) \ ld reg, (SPECIAL_EXC_##name * 8 + SPECIAL_EXC_FRAME_OFFS)(r1) -special_reg_save: - lbz r9,PACAIRQHAPPENED(r13) - RECONCILE_IRQ_STATE(r3,r4) - +SYM_CODE_START_LOCAL(special_reg_save) /* * We only need (or have stack space) to save this stuff if * we interrupted the kernel. @@ -118,27 +120,25 @@ BEGIN_FTR_SECTION mtspr SPRN_MAS5,r10 mtspr SPRN_MAS8,r10 END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV) - SPECIAL_EXC_STORE(r9,IRQHAPPENED) - mfspr r10,SPRN_DEAR SPECIAL_EXC_STORE(r10,DEAR) mfspr r10,SPRN_ESR SPECIAL_EXC_STORE(r10,ESR) - lbz r10,PACAIRQSOFTMASK(r13) - SPECIAL_EXC_STORE(r10,SOFTE) ld r10,_NIP(r1) SPECIAL_EXC_STORE(r10,CSRR0) ld r10,_MSR(r1) SPECIAL_EXC_STORE(r10,CSRR1) blr +SYM_CODE_END(special_reg_save) -ret_from_level_except: +SYM_CODE_START_LOCAL(ret_from_level_except) ld r3,_MSR(r1) andi. r3,r3,MSR_PR beq 1f - b ret_from_except + REST_NVGPRS(r1) + b interrupt_return 1: LOAD_REG_ADDR(r11,extlb_level_exc) @@ -192,27 +192,6 @@ BEGIN_FTR_SECTION mtspr SPRN_MAS8,r10 END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV) - lbz r6,PACAIRQSOFTMASK(r13) - ld r5,SOFTE(r1) - - /* Interrupts had better not already be enabled... */ - tweqi r6,IRQS_ENABLED - - andi. r6,r5,IRQS_DISABLED - bne 1f - - TRACE_ENABLE_INTS - stb r5,PACAIRQSOFTMASK(r13) -1: - /* - * Restore PACAIRQHAPPENED rather than setting it based on - * the return MSR[EE], since we could have interrupted - * __check_irq_replay() or other inconsistent transitory - * states that must remain that way. - */ - SPECIAL_EXC_LOAD(r10,IRQHAPPENED) - stb r10,PACAIRQHAPPENED(r13) - SPECIAL_EXC_LOAD(r10,DEAR) mtspr SPRN_DEAR,r10 SPECIAL_EXC_LOAD(r10,ESR) @@ -220,8 +199,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV) stdcx. r0,0,r1 /* to clear the reservation */ - REST_4GPRS(2, r1) - REST_4GPRS(6, r1) + REST_GPRS(2, 9, r1) ld r10,_CTR(r1) ld r11,_XER(r1) @@ -229,6 +207,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV) mtxer r11 blr +SYM_CODE_END(ret_from_level_except) .macro ret_from_level srr0 srr1 paca_ex scratch bl ret_from_level_except @@ -239,17 +218,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV) mtlr r10 mtcr r11 - ld r10,GPR10(r1) - ld r11,GPR11(r1) - ld r12,GPR12(r1) + REST_GPRS(10, 12, r1) mtspr \scratch,r0 std r10,\paca_ex+EX_R10(r13); std r11,\paca_ex+EX_R11(r13); ld r10,_NIP(r1) ld r11,_MSR(r1) - ld r0,GPR0(r1) - ld r1,GPR1(r1) + REST_GPR(0, r1) + REST_GPR(1, r1) mtspr \srr0,r10 mtspr \srr1,r11 ld r10,\paca_ex+EX_R10(r13) @@ -257,13 +234,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV) mfspr r13,\scratch .endm -ret_from_crit_except: +SYM_CODE_START_LOCAL(ret_from_crit_except) ret_from_level SPRN_CSRR0 SPRN_CSRR1 PACA_EXCRIT SPRN_SPRG_CRIT_SCRATCH rfci +SYM_CODE_END(ret_from_crit_except) -ret_from_mc_except: +SYM_CODE_START_LOCAL(ret_from_mc_except) ret_from_level SPRN_MCSRR0 SPRN_MCSRR1 PACA_EXMC SPRN_SPRG_MC_SCRATCH rfmci +SYM_CODE_END(ret_from_mc_except) /* Exception prolog code for all exceptions */ #define EXCEPTION_PROLOG(n, intnum, type, addition) \ @@ -314,7 +293,6 @@ ret_from_mc_except: #define SPRN_MC_SRR0 SPRN_MCSRR0 #define SPRN_MC_SRR1 SPRN_MCSRR1 -#ifdef CONFIG_PPC_FSL_BOOK3E #define GEN_BTB_FLUSH \ START_BTB_FLUSH_SECTION \ beq 1f; \ @@ -330,13 +308,6 @@ ret_from_mc_except: #define DBG_BTB_FLUSH CRIT_BTB_FLUSH #define MC_BTB_FLUSH CRIT_BTB_FLUSH #define GDBELL_BTB_FLUSH GEN_BTB_FLUSH -#else -#define GEN_BTB_FLUSH -#define CRIT_BTB_FLUSH -#define DBG_BTB_FLUSH -#define MC_BTB_FLUSH -#define GDBELL_BTB_FLUSH -#endif #define NORMAL_EXCEPTION_PROLOG(n, intnum, addition) \ EXCEPTION_PROLOG(n, intnum, GEN, addition##_GEN(n)) @@ -366,6 +337,12 @@ ret_from_mc_except: andi. r10,r10,IRQS_DISABLED; /* yes -> go out of line */ \ bne masked_interrupt_book3e_##n +/* + * Additional regs must be re-loaded from paca before EXCEPTION_COMMON* is + * called, because that does SAVE_NVGPRS which must see the original register + * values, otherwise the scratch values might be restored when exiting the + * interrupt. + */ #define PROLOG_ADDITION_2REGS_GEN(n) \ std r14,PACA_EXGEN+EX_R14(r13); \ std r15,PACA_EXGEN+EX_R15(r13) @@ -385,46 +362,43 @@ ret_from_mc_except: std r14,PACA_EXMC+EX_R14(r13); \ std r15,PACA_EXMC+EX_R15(r13) - /* Core exception code for all exceptions except TLB misses. */ #define EXCEPTION_COMMON_LVL(n, scratch, excf) \ exc_##n##_common: \ - std r0,GPR0(r1); /* save r0 in stackframe */ \ - std r2,GPR2(r1); /* save r2 in stackframe */ \ - SAVE_4GPRS(3, r1); /* save r3 - r6 in stackframe */ \ - SAVE_2GPRS(7, r1); /* save r7, r8 in stackframe */ \ - std r9,GPR9(r1); /* save r9 in stackframe */ \ + SAVE_GPR(0, r1); /* save r0 in stackframe */ \ + SAVE_GPRS(2, 9, r1); /* save r2 - r9 in stackframe */ \ std r10,_NIP(r1); /* save SRR0 to stackframe */ \ std r11,_MSR(r1); /* save SRR1 to stackframe */ \ beq 2f; /* if from kernel mode */ \ - ACCOUNT_CPU_USER_ENTRY(r13,r10,r11);/* accounting (uses cr0+eq) */ \ 2: ld r3,excf+EX_R10(r13); /* get back r10 */ \ ld r4,excf+EX_R11(r13); /* get back r11 */ \ mfspr r5,scratch; /* get back r13 */ \ - std r12,GPR12(r1); /* save r12 in stackframe */ \ - ld r2,PACATOC(r13); /* get kernel TOC into r2 */ \ + SAVE_GPR(12, r1); /* save r12 in stackframe */ \ + LOAD_PACA_TOC(); /* get kernel TOC into r2 */ \ mflr r6; /* save LR in stackframe */ \ mfctr r7; /* save CTR in stackframe */ \ mfspr r8,SPRN_XER; /* save XER in stackframe */ \ ld r9,excf+EX_R1(r13); /* load orig r1 back from PACA */ \ lwz r10,excf+EX_CR(r13); /* load orig CR back from PACA */ \ lbz r11,PACAIRQSOFTMASK(r13); /* get current IRQ softe */ \ - ld r12,exception_marker@toc(r2); \ - li r0,0; \ + LOAD_REG_IMMEDIATE(r12, STACK_FRAME_REGS_MARKER); \ + ZEROIZE_GPR(0); \ std r3,GPR10(r1); /* save r10 to stackframe */ \ std r4,GPR11(r1); /* save r11 to stackframe */ \ std r5,GPR13(r1); /* save it to stackframe */ \ std r6,_LINK(r1); \ std r7,_CTR(r1); \ std r8,_XER(r1); \ - li r3,(n)+1; /* indicate partial regs in trap */ \ + li r3,(n); /* regs.trap vector */ \ std r9,0(r1); /* store stack frame back link */ \ std r10,_CCR(r1); /* store orig CR in stackframe */ \ std r9,GPR1(r1); /* store stack frame back link */ \ std r11,SOFTE(r1); /* and save it to stackframe */ \ - std r12,STACK_FRAME_OVERHEAD-16(r1); /* mark the frame */ \ + std r12,STACK_INT_FRAME_MARKER(r1); /* mark the frame */ \ std r3,_TRAP(r1); /* set trap number */ \ - std r0,RESULT(r1); /* clear regs->result */ + std r0,RESULT(r1); /* clear regs->result */ \ + SAVE_NVGPRS(r1); \ + SANITIZE_NVGPRS(); /* minimise speculation influence */ #define EXCEPTION_COMMON(n) \ EXCEPTION_COMMON_LVL(n, SPRN_SPRG_GEN_SCRATCH, PACA_EXGEN) @@ -435,28 +409,6 @@ exc_##n##_common: \ #define EXCEPTION_COMMON_DBG(n) \ EXCEPTION_COMMON_LVL(n, SPRN_SPRG_DBG_SCRATCH, PACA_EXDBG) -/* - * This is meant for exceptions that don't immediately hard-enable. We - * set a bit in paca->irq_happened to ensure that a subsequent call to - * arch_local_irq_restore() will properly hard-enable and avoid the - * fast-path, and then reconcile irq state. - */ -#define INTS_DISABLE RECONCILE_IRQ_STATE(r3,r4) - -/* - * This is called by exceptions that don't use INTS_DISABLE (that did not - * touch irq indicators in the PACA). This will restore MSR:EE to it's - * previous value - * - * XXX In the long run, we may want to open-code it in order to separate the - * load from the wrtee, thus limiting the latency caused by the dependency - * but at this point, I'll favor code clarity until we have a near to final - * implementation - */ -#define INTS_RESTORE_HARD \ - ld r11,_MSR(r1); \ - wrtee r11; - /* XXX FIXME: Restore r14/r15 when necessary */ #define BAD_STACK_TRAMPOLINE(n) \ exc_##n##_bad_stack: \ @@ -505,18 +457,11 @@ exc_##n##_bad_stack: \ START_EXCEPTION(label); \ NORMAL_EXCEPTION_PROLOG(trapnum, intnum, PROLOG_ADDITION_MASKABLE)\ EXCEPTION_COMMON(trapnum) \ - INTS_DISABLE; \ ack(r8); \ CHECK_NAPPING(); \ - addi r3,r1,STACK_FRAME_OVERHEAD; \ + addi r3,r1,STACK_INT_FRAME_REGS; \ bl hdlr; \ - b ret_from_except_lite; - -/* This value is used to mark exception frames on the stack. */ - .section ".toc","aw" -exception_marker: - .tc ID_EXC_MARKER[TC],STACK_FRAME_REGS_MARKER - + b interrupt_return /* * And here we have the exception vectors ! @@ -561,11 +506,10 @@ __end_interrupts: CRIT_EXCEPTION_PROLOG(0x100, BOOKE_INTERRUPT_CRITICAL, PROLOG_ADDITION_NONE) EXCEPTION_COMMON_CRIT(0x100) - bl save_nvgprs bl special_reg_save CHECK_NAPPING(); - addi r3,r1,STACK_FRAME_OVERHEAD - bl unknown_exception + addi r3,r1,STACK_INT_FRAME_REGS + bl unknown_nmi_exception b ret_from_crit_except /* Machine Check Interrupt */ @@ -573,10 +517,9 @@ __end_interrupts: MC_EXCEPTION_PROLOG(0x000, BOOKE_INTERRUPT_MACHINE_CHECK, PROLOG_ADDITION_NONE) EXCEPTION_COMMON_MC(0x000) - bl save_nvgprs bl special_reg_save CHECK_NAPPING(); - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS bl machine_check_exception b ret_from_mc_except @@ -586,8 +529,11 @@ __end_interrupts: PROLOG_ADDITION_2REGS) mfspr r14,SPRN_DEAR mfspr r15,SPRN_ESR + std r14,_DEAR(r1) + std r15,_ESR(r1) + ld r14,PACA_EXGEN+EX_R14(r13) + ld r15,PACA_EXGEN+EX_R15(r13) EXCEPTION_COMMON(0x300) - INTS_DISABLE b storage_fault_common /* Instruction Storage Interrupt */ @@ -596,8 +542,11 @@ __end_interrupts: PROLOG_ADDITION_2REGS) li r15,0 mr r14,r10 + std r14,_DEAR(r1) + std r15,_ESR(r1) + ld r14,PACA_EXGEN+EX_R14(r13) + ld r15,PACA_EXGEN+EX_R15(r13) EXCEPTION_COMMON(0x400) - INTS_DISABLE b storage_fault_common /* External Input Interrupt */ @@ -610,6 +559,10 @@ __end_interrupts: PROLOG_ADDITION_2REGS) mfspr r14,SPRN_DEAR mfspr r15,SPRN_ESR + std r14,_DEAR(r1) + std r15,_ESR(r1) + ld r14,PACA_EXGEN+EX_R14(r13) + ld r15,PACA_EXGEN+EX_R15(r13) EXCEPTION_COMMON(0x600) b alignment_more /* no room, go out of line */ @@ -618,14 +571,13 @@ __end_interrupts: NORMAL_EXCEPTION_PROLOG(0x700, BOOKE_INTERRUPT_PROGRAM, PROLOG_ADDITION_1REG) mfspr r14,SPRN_ESR - EXCEPTION_COMMON(0x700) - INTS_DISABLE - std r14,_DSISR(r1) - addi r3,r1,STACK_FRAME_OVERHEAD + std r14,_ESR(r1) ld r14,PACA_EXGEN+EX_R14(r13) - bl save_nvgprs + EXCEPTION_COMMON(0x700) + addi r3,r1,STACK_INT_FRAME_REGS bl program_check_exception - b ret_from_except + REST_NVGPRS(r1) + b interrupt_return /* Floating Point Unavailable Interrupt */ START_EXCEPTION(fp_unavailable); @@ -637,12 +589,10 @@ __end_interrupts: andi. r0,r12,MSR_PR; beq- 1f bl load_up_fpu - b fast_exception_return -1: INTS_DISABLE - bl save_nvgprs - addi r3,r1,STACK_FRAME_OVERHEAD + b fast_interrupt_return +1: addi r3,r1,STACK_INT_FRAME_REGS bl kernel_fp_unavailable_exception - b ret_from_except + b interrupt_return /* Altivec Unavailable Interrupt */ START_EXCEPTION(altivec_unavailable); @@ -656,15 +606,13 @@ BEGIN_FTR_SECTION andi. r0,r12,MSR_PR; beq- 1f bl load_up_altivec - b fast_exception_return + b fast_interrupt_return 1: END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) #endif - INTS_DISABLE - bl save_nvgprs - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS bl altivec_unavailable_exception - b ret_from_except + b interrupt_return /* AltiVec Assist */ START_EXCEPTION(altivec_assist); @@ -672,17 +620,16 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) BOOKE_INTERRUPT_ALTIVEC_ASSIST, PROLOG_ADDITION_NONE) EXCEPTION_COMMON(0x220) - INTS_DISABLE - bl save_nvgprs - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS #ifdef CONFIG_ALTIVEC BEGIN_FTR_SECTION bl altivec_assist_exception END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) + REST_NVGPRS(r1) #else bl unknown_exception #endif - b ret_from_except + b interrupt_return /* Decrementer Interrupt */ @@ -698,14 +645,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) CRIT_EXCEPTION_PROLOG(0x9f0, BOOKE_INTERRUPT_WATCHDOG, PROLOG_ADDITION_NONE) EXCEPTION_COMMON_CRIT(0x9f0) - bl save_nvgprs bl special_reg_save CHECK_NAPPING(); - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS #ifdef CONFIG_BOOKE_WDT bl WatchdogException #else - bl unknown_exception + bl unknown_nmi_exception #endif b ret_from_crit_except @@ -722,11 +668,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) NORMAL_EXCEPTION_PROLOG(0xf20, BOOKE_INTERRUPT_AP_UNAVAIL, PROLOG_ADDITION_NONE) EXCEPTION_COMMON(0xf20) - INTS_DISABLE - bl save_nvgprs - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS bl unknown_exception - b ret_from_except + b interrupt_return /* Debug exception as a critical interrupt*/ START_EXCEPTION(debug_crit); @@ -747,9 +691,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) beq+ 1f #ifdef CONFIG_RELOCATABLE - ld r15,PACATOC(r13) - ld r14,interrupt_base_book3e@got(r15) - ld r15,__end_interrupts@got(r15) + __LOAD_PACA_TOC(r15) + LOAD_REG_ADDR_ALTTOC(r14, r15, interrupt_base_book3e) + LOAD_REG_ADDR_ALTTOC(r15, r15, __end_interrupts) cmpld cr0,r10,r14 cmpld cr1,r10,r15 #else @@ -787,15 +731,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) * normal exception */ mfspr r14,SPRN_DBSR - EXCEPTION_COMMON_CRIT(0xd00) std r14,_DSISR(r1) - addi r3,r1,STACK_FRAME_OVERHEAD - mr r4,r14 ld r14,PACA_EXCRIT+EX_R14(r13) ld r15,PACA_EXCRIT+EX_R15(r13) - bl save_nvgprs + EXCEPTION_COMMON_CRIT(0xd00) + addi r3,r1,STACK_INT_FRAME_REGS bl DebugException - b ret_from_except + REST_NVGPRS(r1) + b interrupt_return kernel_dbg_exc: b . /* NYI */ @@ -819,9 +762,9 @@ kernel_dbg_exc: beq+ 1f #ifdef CONFIG_RELOCATABLE - ld r15,PACATOC(r13) - ld r14,interrupt_base_book3e@got(r15) - ld r15,__end_interrupts@got(r15) + __LOAD_PACA_TOC(r15) + LOAD_REG_ADDR_ALTTOC(r14, r15, interrupt_base_book3e) + LOAD_REG_ADDR_ALTTOC(r15, r15, __end_interrupts) cmpld cr0,r10,r14 cmpld cr1,r10,r15 #else @@ -859,26 +802,30 @@ kernel_dbg_exc: * normal exception */ mfspr r14,SPRN_DBSR - EXCEPTION_COMMON_DBG(0xd08) - INTS_DISABLE std r14,_DSISR(r1) - addi r3,r1,STACK_FRAME_OVERHEAD - mr r4,r14 ld r14,PACA_EXDBG+EX_R14(r13) ld r15,PACA_EXDBG+EX_R15(r13) - bl save_nvgprs + EXCEPTION_COMMON_DBG(0xd08) + addi r3,r1,STACK_INT_FRAME_REGS bl DebugException - b ret_from_except + REST_NVGPRS(r1) + b interrupt_return START_EXCEPTION(perfmon); NORMAL_EXCEPTION_PROLOG(0x260, BOOKE_INTERRUPT_PERFORMANCE_MONITOR, PROLOG_ADDITION_NONE) EXCEPTION_COMMON(0x260) - INTS_DISABLE CHECK_NAPPING() - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS + /* + * XXX: Returning from performance_monitor_exception taken as a + * soft-NMI (Linux irqs disabled) may be risky to use interrupt_return + * and could cause bugs in return or elsewhere. That case should just + * restore registers and return. There is a workaround for one known + * problem in interrupt_exit_kernel_prepare(). + */ bl performance_monitor_exception - b ret_from_except_lite + b interrupt_return /* Doorbell interrupt */ MASKABLE_EXCEPTION(0x280, BOOKE_INTERRUPT_DOORBELL, @@ -889,11 +836,10 @@ kernel_dbg_exc: CRIT_EXCEPTION_PROLOG(0x2a0, BOOKE_INTERRUPT_DOORBELL_CRITICAL, PROLOG_ADDITION_NONE) EXCEPTION_COMMON_CRIT(0x2a0) - bl save_nvgprs bl special_reg_save CHECK_NAPPING(); - addi r3,r1,STACK_FRAME_OVERHEAD - bl unknown_exception + addi r3,r1,STACK_INT_FRAME_REGS + bl unknown_nmi_exception b ret_from_crit_except /* @@ -904,22 +850,19 @@ kernel_dbg_exc: GDBELL_EXCEPTION_PROLOG(0x2c0, BOOKE_INTERRUPT_GUEST_DBELL, PROLOG_ADDITION_NONE) EXCEPTION_COMMON(0x2c0) - addi r3,r1,STACK_FRAME_OVERHEAD - bl save_nvgprs - INTS_RESTORE_HARD + addi r3,r1,STACK_INT_FRAME_REGS bl unknown_exception - b ret_from_except + b interrupt_return /* Guest Doorbell critical Interrupt */ START_EXCEPTION(guest_doorbell_crit); CRIT_EXCEPTION_PROLOG(0x2e0, BOOKE_INTERRUPT_GUEST_DBELL_CRIT, PROLOG_ADDITION_NONE) EXCEPTION_COMMON_CRIT(0x2e0) - bl save_nvgprs bl special_reg_save CHECK_NAPPING(); - addi r3,r1,STACK_FRAME_OVERHEAD - bl unknown_exception + addi r3,r1,STACK_INT_FRAME_REGS + bl unknown_nmi_exception b ret_from_crit_except /* Hypervisor call */ @@ -927,33 +870,55 @@ kernel_dbg_exc: NORMAL_EXCEPTION_PROLOG(0x310, BOOKE_INTERRUPT_HV_SYSCALL, PROLOG_ADDITION_NONE) EXCEPTION_COMMON(0x310) - addi r3,r1,STACK_FRAME_OVERHEAD - bl save_nvgprs - INTS_RESTORE_HARD + addi r3,r1,STACK_INT_FRAME_REGS bl unknown_exception - b ret_from_except + b interrupt_return /* Embedded Hypervisor priviledged */ START_EXCEPTION(ehpriv); NORMAL_EXCEPTION_PROLOG(0x320, BOOKE_INTERRUPT_HV_PRIV, PROLOG_ADDITION_NONE) EXCEPTION_COMMON(0x320) - addi r3,r1,STACK_FRAME_OVERHEAD - bl save_nvgprs - INTS_RESTORE_HARD + addi r3,r1,STACK_INT_FRAME_REGS bl unknown_exception - b ret_from_except + b interrupt_return /* LRAT Error interrupt */ START_EXCEPTION(lrat_error); NORMAL_EXCEPTION_PROLOG(0x340, BOOKE_INTERRUPT_LRAT_ERROR, PROLOG_ADDITION_NONE) EXCEPTION_COMMON(0x340) - addi r3,r1,STACK_FRAME_OVERHEAD - bl save_nvgprs - INTS_RESTORE_HARD + addi r3,r1,STACK_INT_FRAME_REGS bl unknown_exception - b ret_from_except + b interrupt_return + +.macro SEARCH_RESTART_TABLE +#ifdef CONFIG_RELOCATABLE + __LOAD_PACA_TOC(r11) + LOAD_REG_ADDR_ALTTOC(r14, r11, __start___restart_table) + LOAD_REG_ADDR_ALTTOC(r15, r11, __stop___restart_table) +#else + LOAD_REG_IMMEDIATE_SYM(r14, r11, __start___restart_table) + LOAD_REG_IMMEDIATE_SYM(r15, r11, __stop___restart_table) +#endif +300: + cmpd r14,r15 + beq 302f + ld r11,0(r14) + cmpld r10,r11 + blt 301f + ld r11,8(r14) + cmpld r10,r11 + bge 301f + ld r11,16(r14) + b 303f +301: + addi r14,r14,24 + b 300b +302: + li r11,0 +303: +.endm /* * An interrupt came in while soft-disabled; We mark paca->irq_happened @@ -963,6 +928,9 @@ kernel_dbg_exc: */ .macro masked_interrupt_book3e paca_irq full_mask + std r14,PACA_EXGEN+EX_R14(r13) + std r15,PACA_EXGEN+EX_R15(r13) + lbz r10,PACAIRQHAPPENED(r13) .if \full_mask == 1 ori r10,r10,\paca_irq | PACA_IRQ_HARD_DIS @@ -972,22 +940,29 @@ kernel_dbg_exc: stb r10,PACAIRQHAPPENED(r13) .if \full_mask == 1 - rldicl r10,r11,48,1 /* clear MSR_EE */ - rotldi r11,r10,16 + xori r11,r11,MSR_EE /* clear MSR_EE */ mtspr SPRN_SRR1,r11 .endif + mfspr r10,SPRN_SRR0 + SEARCH_RESTART_TABLE + cmpdi r11,0 + beq 1f + mtspr SPRN_SRR0,r11 /* return to restart address */ +1: + lwz r11,PACA_EXGEN+EX_CR(r13) mtcr r11 ld r10,PACA_EXGEN+EX_R10(r13) ld r11,PACA_EXGEN+EX_R11(r13) + ld r14,PACA_EXGEN+EX_R14(r13) + ld r15,PACA_EXGEN+EX_R15(r13) mfspr r13,SPRN_SPRG_GEN_SCRATCH rfi b . .endm masked_interrupt_book3e_0x500: - // XXX When adding support for EPR, use PACA_IRQ_EE_EDGE masked_interrupt_book3e PACA_IRQ_EE 1 masked_interrupt_book3e_0x900: @@ -1003,126 +978,26 @@ masked_interrupt_book3e_0x2c0: masked_interrupt_book3e PACA_IRQ_DBELL 0 /* - * Called from arch_local_irq_enable when an interrupt needs - * to be resent. r3 contains either 0x500,0x900,0x260 or 0x280 - * to indicate the kind of interrupt. MSR:EE is already off. - * We generate a stackframe like if a real interrupt had happened. - * - * Note: While MSR:EE is off, we need to make sure that _MSR - * in the generated frame has EE set to 1 or the exception - * handler will not properly re-enable them. - */ -_GLOBAL(__replay_interrupt) - /* We are going to jump to the exception common code which - * will retrieve various register values from the PACA which - * we don't give a damn about. - */ - mflr r10 - mfmsr r11 - mfcr r4 - mtspr SPRN_SPRG_GEN_SCRATCH,r13; - std r1,PACA_EXGEN+EX_R1(r13); - stw r4,PACA_EXGEN+EX_CR(r13); - ori r11,r11,MSR_EE - subi r1,r1,INT_FRAME_SIZE; - cmpwi cr0,r3,0x500 - beq exc_0x500_common - cmpwi cr0,r3,0x900 - beq exc_0x900_common - cmpwi cr0,r3,0x280 - beq exc_0x280_common - blr - - -/* * This is called from 0x300 and 0x400 handlers after the prologs with * r14 and r15 containing the fault address and error code, with the * original values stashed away in the PACA */ -storage_fault_common: - std r14,_DAR(r1) - std r15,_DSISR(r1) - addi r3,r1,STACK_FRAME_OVERHEAD - mr r4,r14 - mr r5,r15 - ld r14,PACA_EXGEN+EX_R14(r13) - ld r15,PACA_EXGEN+EX_R15(r13) +SYM_CODE_START_LOCAL(storage_fault_common) + addi r3,r1,STACK_INT_FRAME_REGS bl do_page_fault - cmpdi r3,0 - bne- 1f - b ret_from_except_lite -1: bl save_nvgprs - mr r5,r3 - addi r3,r1,STACK_FRAME_OVERHEAD - ld r4,_DAR(r1) - bl bad_page_fault - b ret_from_except + b interrupt_return +SYM_CODE_END(storage_fault_common) /* * Alignment exception doesn't fit entirely in the 0x100 bytes so it * continues here. */ -alignment_more: - std r14,_DAR(r1) - std r15,_DSISR(r1) - addi r3,r1,STACK_FRAME_OVERHEAD - ld r14,PACA_EXGEN+EX_R14(r13) - ld r15,PACA_EXGEN+EX_R15(r13) - bl save_nvgprs - INTS_RESTORE_HARD +SYM_CODE_START_LOCAL(alignment_more) + addi r3,r1,STACK_INT_FRAME_REGS bl alignment_exception - b ret_from_except - -/* - * We branch here from entry_64.S for the last stage of the exception - * return code path. MSR:EE is expected to be off at that point - */ -_GLOBAL(exception_return_book3e) - b 1f - -/* This is the return from load_up_fpu fast path which could do with - * less GPR restores in fact, but for now we have a single return path - */ - .globl fast_exception_return -fast_exception_return: - wrteei 0 -1: mr r0,r13 - ld r10,_MSR(r1) - REST_4GPRS(2, r1) - andi. r6,r10,MSR_PR - REST_2GPRS(6, r1) - beq 1f - ACCOUNT_CPU_USER_EXIT(r13, r10, r11) - ld r0,GPR13(r1) - -1: stdcx. r0,0,r1 /* to clear the reservation */ - - ld r8,_CCR(r1) - ld r9,_LINK(r1) - ld r10,_CTR(r1) - ld r11,_XER(r1) - mtcr r8 - mtlr r9 - mtctr r10 - mtxer r11 - REST_2GPRS(8, r1) - ld r10,GPR10(r1) - ld r11,GPR11(r1) - ld r12,GPR12(r1) - mtspr SPRN_SPRG_GEN_SCRATCH,r0 - - std r10,PACA_EXGEN+EX_R10(r13); - std r11,PACA_EXGEN+EX_R11(r13); - ld r10,_NIP(r1) - ld r11,_MSR(r1) - ld r0,GPR0(r1) - ld r1,GPR1(r1) - mtspr SPRN_SRR0,r10 - mtspr SPRN_SRR1,r11 - ld r10,PACA_EXGEN+EX_R10(r13) - ld r11,PACA_EXGEN+EX_R11(r13) - mfspr r13,SPRN_SPRG_GEN_SCRATCH - rfi + REST_NVGPRS(r1) + b interrupt_return +SYM_CODE_END(alignment_more) /* * Trampolines used when spotting a bad kernel stack pointer in @@ -1161,8 +1036,7 @@ BAD_STACK_TRAMPOLINE(0xe00) BAD_STACK_TRAMPOLINE(0xf00) BAD_STACK_TRAMPOLINE(0xf20) - .globl bad_stack_book3e -bad_stack_book3e: +_GLOBAL(bad_stack_book3e) /* XXX: Needs to make SPRN_SPRG_GEN depend on exception type */ mfspr r10,SPRN_SRR0; /* read SRR0 before touching stack */ ld r1,PACAEMERGSP(r13) @@ -1175,19 +1049,16 @@ bad_stack_book3e: std r11,_CCR(r1) mfspr r10,SPRN_DEAR mfspr r11,SPRN_ESR - std r10,_DAR(r1) - std r11,_DSISR(r1) - std r0,GPR0(r1); /* save r0 in stackframe */ \ - std r2,GPR2(r1); /* save r2 in stackframe */ \ - SAVE_4GPRS(3, r1); /* save r3 - r6 in stackframe */ \ - SAVE_2GPRS(7, r1); /* save r7, r8 in stackframe */ \ - std r9,GPR9(r1); /* save r9 in stackframe */ \ + std r10,_DEAR(r1) + std r11,_ESR(r1) + SAVE_GPR(0, r1); /* save r0 in stackframe */ \ + SAVE_GPRS(2, 9, r1); /* save r2 - r9 in stackframe */ \ ld r3,PACA_EXGEN+EX_R10(r13);/* get back r10 */ \ ld r4,PACA_EXGEN+EX_R11(r13);/* get back r11 */ \ mfspr r5,SPRN_SPRG_GEN_SCRATCH;/* get back r13 XXX can be wrong */ \ std r3,GPR10(r1); /* save r10 to stackframe */ \ std r4,GPR11(r1); /* save r11 to stackframe */ \ - std r12,GPR12(r1); /* save r12 in stackframe */ \ + SAVE_GPR(12, r1); /* save r12 in stackframe */ \ std r5,GPR13(r1); /* save it to stackframe */ \ mflr r10 mfctr r11 @@ -1195,16 +1066,15 @@ bad_stack_book3e: std r10,_LINK(r1) std r11,_CTR(r1) std r12,_XER(r1) - SAVE_10GPRS(14,r1) - SAVE_8GPRS(24,r1) + SAVE_NVGPRS(r1) lhz r12,PACA_TRAP_SAVE(r13) std r12,_TRAP(r1) addi r11,r1,INT_FRAME_SIZE std r11,0(r1) - li r12,0 + ZEROIZE_GPR(12) std r12,0(r11) - ld r2,PACATOC(r13) -1: addi r3,r1,STACK_FRAME_OVERHEAD + LOAD_PACA_TOC() +1: addi r3,r1,STACK_INT_FRAME_REGS bl kernel_bad_stack b 1b @@ -1245,7 +1115,7 @@ found_iprot: * r3 = MAS0_TLBSEL (for the iprot array) * r4 = SPRN_TLBnCFG */ - bl invstr /* Find our address */ + bcl 20,31,$+4 /* Find our address */ invstr: mflr r6 /* Make it accessible */ mfmsr r7 rlwinm r5,r7,27,31,31 /* extract MSR[IS] */ @@ -1314,7 +1184,7 @@ skpinv: addi r6,r6,1 /* Increment */ mfmsr r6 xori r6,r6,MSR_IS mtspr SPRN_SRR1,r6 - bl 1f /* Find our address */ + bcl 20,31,$+4 /* Find our address */ 1: mflr r6 addi r6,r6,(2f - 1b) mtspr SPRN_SRR0,r6 @@ -1374,7 +1244,7 @@ skpinv: addi r6,r6,1 /* Increment */ * r4 = MAS0 w/TLBSEL & ESEL for the temp mapping */ /* Now we branch the new virtual address mapped by this entry */ - bl 1f /* Find our address */ + bcl 20,31,$+4 /* Find our address */ 1: mflr r6 addi r6,r6,(2f - 1b) tovirt(r6,r6) @@ -1420,8 +1290,7 @@ have_hes: * ever takes any parameters, the SCOM code must also be updated to * provide them. */ - .globl a2_tlbinit_code_start -a2_tlbinit_code_start: +_GLOBAL(a2_tlbinit_code_start) ori r11,r3,MAS0_WQ_ALLWAYS oris r11,r11,MAS0_ESEL(3)@h /* Use way 3: workaround A2 erratum 376 */ @@ -1443,7 +1312,12 @@ a2_tlbinit_code_start: a2_tlbinit_after_linear_map: /* Now we branch the new virtual address mapped by this entry */ +#ifdef CONFIG_RELOCATABLE + __LOAD_PACA_TOC(r5) + LOAD_REG_ADDR_ALTTOC(r3, r5, 1f) +#else LOAD_REG_IMMEDIATE_SYM(r3, r5, 1f) +#endif mtctr r3 bctr @@ -1609,8 +1483,7 @@ _GLOBAL(book3e_secondary_thread_init) mflr r28 b 3b - .globl init_core_book3e -init_core_book3e: +_GLOBAL(init_core_book3e) /* Establish the interrupt vector base */ tovirt(r2,r2) LOAD_REG_ADDR(r3, interrupt_base_book3e) @@ -1618,7 +1491,7 @@ init_core_book3e: sync blr -init_thread_book3e: +SYM_CODE_START_LOCAL(init_thread_book3e) lis r3,(SPRN_EPCR_ICM | SPRN_EPCR_GICM)@h mtspr SPRN_EPCR,r3 @@ -1632,6 +1505,7 @@ init_thread_book3e: mtspr SPRN_TSR,r3 blr +SYM_CODE_END(init_thread_book3e) _GLOBAL(__setup_base_ivors) SET_IVOR(0, 0x020) /* Critical Input */ diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 46508b148e16..eaf2f167c342 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -13,6 +13,7 @@ * */ +#include <linux/linkage.h> #include <asm/hw_irq.h> #include <asm/exception-64s.h> #include <asm/ptrace.h> @@ -21,28 +22,6 @@ #include <asm/feature-fixups.h> #include <asm/kup.h> -/* PACA save area offsets (exgen, exmc, etc) */ -#define EX_R9 0 -#define EX_R10 8 -#define EX_R11 16 -#define EX_R12 24 -#define EX_R13 32 -#define EX_DAR 40 -#define EX_DSISR 48 -#define EX_CCR 52 -#define EX_CFAR 56 -#define EX_PPR 64 -#if defined(CONFIG_RELOCATABLE) -#define EX_CTR 72 -.if EX_SIZE != 10 - .error "EX_SIZE is wrong" -.endif -#else -.if EX_SIZE != 9 - .error "EX_SIZE is wrong" -.endif -#endif - /* * Following are fixed section helper macros. * @@ -50,7 +29,6 @@ * EXC_VIRT_BEGIN/END - virt (AIL), unrelocated exception vectors * TRAMP_REAL_BEGIN - real, unrelocated helpers (virt may call these) * TRAMP_VIRT_BEGIN - virt, unreloc helpers (in practice, real can use) - * TRAMP_KVM_BEGIN - KVM handlers, these are put into real, unrelocated * EXC_COMMON - After switching to virtual, relocated mode. */ @@ -71,7 +49,7 @@ .balign IFETCH_ALIGN_BYTES; \ .global name; \ _ASM_NOKPROBE_SYMBOL(name); \ - DEFINE_FIXED_SYMBOL(name); \ + DEFINE_FIXED_SYMBOL(name, text); \ name: #define TRAMP_REAL_BEGIN(name) \ @@ -80,13 +58,6 @@ name: #define TRAMP_VIRT_BEGIN(name) \ FIXED_SECTION_ENTRY_BEGIN(virt_trampolines, name) -#ifdef CONFIG_KVM_BOOK3S_64_HANDLER -#define TRAMP_KVM_BEGIN(name) \ - TRAMP_VIRT_BEGIN(name) -#else -#define TRAMP_KVM_BEGIN(name) -#endif - #define EXC_REAL_NONE(start, size) \ FIXED_SECTION_ENTRY_BEGIN_LOCATION(real_vectors, exc_real_##start##_##unused, start, size); \ FIXED_SECTION_ENTRY_END_LOCATION(real_vectors, exc_real_##start##_##unused, start, size) @@ -106,282 +77,161 @@ name: ld reg,PACAKBASE(r13); /* get high part of &label */ \ ori reg,reg,FIXED_SYMBOL_ABS_ADDR(label) -#define __LOAD_HANDLER(reg, label) \ +#define __LOAD_HANDLER(reg, label, section) \ ld reg,PACAKBASE(r13); \ - ori reg,reg,(ABS_ADDR(label))@l + ori reg,reg,(ABS_ADDR(label, section))@l /* * Branches from unrelocated code (e.g., interrupts) to labels outside * head-y require >64K offsets. */ -#define __LOAD_FAR_HANDLER(reg, label) \ +#define __LOAD_FAR_HANDLER(reg, label, section) \ ld reg,PACAKBASE(r13); \ - ori reg,reg,(ABS_ADDR(label))@l; \ - addis reg,reg,(ABS_ADDR(label))@h - -/* Exception register prefixes */ -#define EXC_HV_OR_STD 2 /* depends on HVMODE */ -#define EXC_HV 1 -#define EXC_STD 0 - -#if defined(CONFIG_RELOCATABLE) -/* - * If we support interrupts with relocation on AND we're a relocatable kernel, - * we need to use CTR to get to the 2nd level handler. So, save/restore it - * when required. - */ -#define SAVE_CTR(reg, area) mfctr reg ; std reg,area+EX_CTR(r13) -#define GET_CTR(reg, area) ld reg,area+EX_CTR(r13) -#define RESTORE_CTR(reg, area) ld reg,area+EX_CTR(r13) ; mtctr reg -#else -/* ...else CTR is unused and in register. */ -#define SAVE_CTR(reg, area) -#define GET_CTR(reg, area) mfctr reg -#define RESTORE_CTR(reg, area) -#endif - -/* - * PPR save/restore macros used in exceptions-64s.S - * Used for P7 or later processors - */ -#define SAVE_PPR(area, ra) \ -BEGIN_FTR_SECTION_NESTED(940) \ - ld ra,area+EX_PPR(r13); /* Read PPR from paca */ \ - std ra,_PPR(r1); \ -END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,940) - -#define RESTORE_PPR_PACA(area, ra) \ -BEGIN_FTR_SECTION_NESTED(941) \ - ld ra,area+EX_PPR(r13); \ - mtspr SPRN_PPR,ra; \ -END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,941) - -/* - * Get an SPR into a register if the CPU has the given feature - */ -#define OPT_GET_SPR(ra, spr, ftr) \ -BEGIN_FTR_SECTION_NESTED(943) \ - mfspr ra,spr; \ -END_FTR_SECTION_NESTED(ftr,ftr,943) - -/* - * Set an SPR from a register if the CPU has the given feature - */ -#define OPT_SET_SPR(ra, spr, ftr) \ -BEGIN_FTR_SECTION_NESTED(943) \ - mtspr spr,ra; \ -END_FTR_SECTION_NESTED(ftr,ftr,943) + ori reg,reg,(ABS_ADDR(label, section))@l; \ + addis reg,reg,(ABS_ADDR(label, section))@h /* - * Save a register to the PACA if the CPU has the given feature + * Interrupt code generation macros */ -#define OPT_SAVE_REG_TO_PACA(offset, ra, ftr) \ -BEGIN_FTR_SECTION_NESTED(943) \ - std ra,offset(r13); \ -END_FTR_SECTION_NESTED(ftr,ftr,943) +#define IVEC .L_IVEC_\name\() /* Interrupt vector address */ +#define IHSRR .L_IHSRR_\name\() /* Sets SRR or HSRR registers */ +#define IHSRR_IF_HVMODE .L_IHSRR_IF_HVMODE_\name\() /* HSRR if HV else SRR */ +#define IAREA .L_IAREA_\name\() /* PACA save area */ +#define IVIRT .L_IVIRT_\name\() /* Has virt mode entry point */ +#define IISIDE .L_IISIDE_\name\() /* Uses SRR0/1 not DAR/DSISR */ +#define ICFAR .L_ICFAR_\name\() /* Uses CFAR */ +#define ICFAR_IF_HVMODE .L_ICFAR_IF_HVMODE_\name\() /* Uses CFAR if HV */ +#define IDAR .L_IDAR_\name\() /* Uses DAR (or SRR0) */ +#define IDSISR .L_IDSISR_\name\() /* Uses DSISR (or SRR1) */ +#define IBRANCH_TO_COMMON .L_IBRANCH_TO_COMMON_\name\() /* ENTRY branch to common */ +#define IREALMODE_COMMON .L_IREALMODE_COMMON_\name\() /* Common runs in realmode */ +#define IMASK .L_IMASK_\name\() /* IRQ soft-mask bit */ +#define IKVM_REAL .L_IKVM_REAL_\name\() /* Real entry tests KVM */ +#define __IKVM_REAL(name) .L_IKVM_REAL_ ## name +#define IKVM_VIRT .L_IKVM_VIRT_\name\() /* Virt entry tests KVM */ +#define ISTACK .L_ISTACK_\name\() /* Set regular kernel stack */ +#define __ISTACK(name) .L_ISTACK_ ## name +#define IKUAP .L_IKUAP_\name\() /* Do KUAP lock */ +#define IMSR_R12 .L_IMSR_R12_\name\() /* Assumes MSR saved to r12 */ + +#define INT_DEFINE_BEGIN(n) \ +.macro int_define_ ## n name + +#define INT_DEFINE_END(n) \ +.endm ; \ +int_define_ ## n n ; \ +do_define_int n + +.macro do_define_int name + .ifndef IVEC + .error "IVEC not defined" + .endif + .ifndef IHSRR + IHSRR=0 + .endif + .ifndef IHSRR_IF_HVMODE + IHSRR_IF_HVMODE=0 + .endif + .ifndef IAREA + IAREA=PACA_EXGEN + .endif + .ifndef IVIRT + IVIRT=1 + .endif + .ifndef IISIDE + IISIDE=0 + .endif + .ifndef ICFAR + ICFAR=1 + .endif + .ifndef ICFAR_IF_HVMODE + ICFAR_IF_HVMODE=0 + .endif + .ifndef IDAR + IDAR=0 + .endif + .ifndef IDSISR + IDSISR=0 + .endif + .ifndef IBRANCH_TO_COMMON + IBRANCH_TO_COMMON=1 + .endif + .ifndef IREALMODE_COMMON + IREALMODE_COMMON=0 + .else + .if ! IBRANCH_TO_COMMON + .error "IREALMODE_COMMON=1 but IBRANCH_TO_COMMON=0" + .endif + .endif + .ifndef IMASK + IMASK=0 + .endif + .ifndef IKVM_REAL + IKVM_REAL=0 + .endif + .ifndef IKVM_VIRT + IKVM_VIRT=0 + .endif + .ifndef ISTACK + ISTACK=1 + .endif + .ifndef IKUAP + IKUAP=1 + .endif + .ifndef IMSR_R12 + IMSR_R12=0 + .endif +.endm /* - * Branch to label using its 0xC000 address. This results in instruction - * address suitable for MSR[IR]=0 or 1, which allows relocation to be turned - * on using mtmsr rather than rfid. + * All interrupts which set HSRR registers, as well as SRESET and MCE and + * syscall when invoked with "sc 1" switch to MSR[HV]=1 (HVMODE) to be taken, + * so they all generally need to test whether they were taken in guest context. + * + * Note: SRESET and MCE may also be sent to the guest by the hypervisor, and be + * taken with MSR[HV]=0. + * + * Interrupts which set SRR registers (with the above exceptions) do not + * elevate to MSR[HV]=1 mode, though most can be taken when running with + * MSR[HV]=1 (e.g., bare metal kernel and userspace). So these interrupts do + * not need to test whether a guest is running because they get delivered to + * the guest directly, including nested HV KVM guests. * - * This could set the 0xc bits for !RELOCATABLE as an immediate, rather than - * load KBASE for a slight optimisation. + * The exception is PR KVM, where the guest runs with MSR[PR]=1 and the host + * runs with MSR[HV]=0, so the host takes all interrupts on behalf of the + * guest. PR KVM runs with LPCR[AIL]=0 which causes interrupts to always be + * delivered to the real-mode entry point, therefore such interrupts only test + * KVM in their real mode handlers, and only when PR KVM is possible. + * + * Interrupts that are taken in MSR[HV]=0 and escalate to MSR[HV]=1 are always + * delivered in real-mode when the MMU is in hash mode because the MMU + * registers are not set appropriately to translate host addresses. In nested + * radix mode these can be delivered in virt-mode as the host translations are + * used implicitly (see: effective LPID, effective PID). */ -#define BRANCH_TO_C000(reg, label) \ - __LOAD_FAR_HANDLER(reg, label); \ - mtctr reg; \ - bctr -.macro INT_KVM_HANDLER name, vec, hsrr, area, skip - TRAMP_KVM_BEGIN(\name\()_kvm) - KVM_HANDLER \vec, \hsrr, \area, \skip -.endm - -#ifdef CONFIG_KVM_BOOK3S_64_HANDLER -#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE /* - * If hv is possible, interrupts come into to the hv version - * of the kvmppc_interrupt code, which then jumps to the PR handler, - * kvmppc_interrupt_pr, if the guest is a PR guest. + * If an interrupt is taken while a guest is running, it is immediately routed + * to KVM to handle. */ -#define kvmppc_interrupt kvmppc_interrupt_hv -#else -#define kvmppc_interrupt kvmppc_interrupt_pr -#endif -.macro KVMTEST name, hsrr, n +.macro KVMTEST name handler +#ifdef CONFIG_KVM_BOOK3S_64_HANDLER lbz r10,HSTATE_IN_GUEST(r13) cmpwi r10,0 - bne \name\()_kvm -.endm - -.macro KVM_HANDLER vec, hsrr, area, skip - .if \skip - cmpwi r10,KVM_GUEST_MODE_SKIP - beq 89f - .else -BEGIN_FTR_SECTION_NESTED(947) - ld r10,\area+EX_CFAR(r13) - std r10,HSTATE_CFAR(r13) -END_FTR_SECTION_NESTED(CPU_FTR_CFAR,CPU_FTR_CFAR,947) - .endif - -BEGIN_FTR_SECTION_NESTED(948) - ld r10,\area+EX_PPR(r13) - std r10,HSTATE_PPR(r13) -END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) - ld r10,\area+EX_R10(r13) - std r12,HSTATE_SCRATCH0(r13) - sldi r12,r9,32 /* HSRR variants have the 0x2 bit added to their trap number */ - .if \hsrr == EXC_HV_OR_STD - BEGIN_FTR_SECTION - ori r12,r12,(\vec + 0x2) - FTR_SECTION_ELSE - ori r12,r12,(\vec) - ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) - .elseif \hsrr - ori r12,r12,(\vec + 0x2) - .else - ori r12,r12,(\vec) - .endif - -#ifdef CONFIG_RELOCATABLE - /* - * KVM requires __LOAD_FAR_HANDLER beause kvmppc_interrupt lives - * outside the head section. CONFIG_RELOCATABLE KVM expects CTR - * to be saved in HSTATE_SCRATCH1. - */ - mfctr r9 - std r9,HSTATE_SCRATCH1(r13) - __LOAD_FAR_HANDLER(r9, kvmppc_interrupt) - mtctr r9 - ld r9,\area+EX_R9(r13) - bctr -#else - ld r9,\area+EX_R9(r13) - b kvmppc_interrupt -#endif - - - .if \skip -89: mtocrf 0x80,r9 - ld r9,\area+EX_R9(r13) - ld r10,\area+EX_R10(r13) - .if \hsrr == EXC_HV_OR_STD - BEGIN_FTR_SECTION - b kvmppc_skip_Hinterrupt - FTR_SECTION_ELSE - b kvmppc_skip_interrupt - ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) - .elseif \hsrr - b kvmppc_skip_Hinterrupt - .else - b kvmppc_skip_interrupt - .endif - .endif -.endm - -#else -.macro KVMTEST name, hsrr, n -.endm -.macro KVM_HANDLER name, vec, hsrr, area, skip -.endm -#endif - -.macro INT_SAVE_SRR_AND_JUMP label, hsrr, set_ri - ld r10,PACAKMSR(r13) /* get MSR value for kernel */ - .if ! \set_ri - xori r10,r10,MSR_RI /* Clear MSR_RI */ - .endif - .if \hsrr == EXC_HV_OR_STD - BEGIN_FTR_SECTION - mfspr r11,SPRN_HSRR0 /* save HSRR0 */ - mfspr r12,SPRN_HSRR1 /* and HSRR1 */ - mtspr SPRN_HSRR1,r10 - FTR_SECTION_ELSE - mfspr r11,SPRN_SRR0 /* save SRR0 */ - mfspr r12,SPRN_SRR1 /* and SRR1 */ - mtspr SPRN_SRR1,r10 - ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) - .elseif \hsrr - mfspr r11,SPRN_HSRR0 /* save HSRR0 */ - mfspr r12,SPRN_HSRR1 /* and HSRR1 */ - mtspr SPRN_HSRR1,r10 - .else - mfspr r11,SPRN_SRR0 /* save SRR0 */ - mfspr r12,SPRN_SRR1 /* and SRR1 */ - mtspr SPRN_SRR1,r10 - .endif - LOAD_HANDLER(r10, \label\()) - .if \hsrr == EXC_HV_OR_STD - BEGIN_FTR_SECTION - mtspr SPRN_HSRR0,r10 - HRFI_TO_KERNEL - FTR_SECTION_ELSE - mtspr SPRN_SRR0,r10 - RFI_TO_KERNEL - ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) - .elseif \hsrr - mtspr SPRN_HSRR0,r10 - HRFI_TO_KERNEL - .else - mtspr SPRN_SRR0,r10 - RFI_TO_KERNEL - .endif - b . /* prevent speculative execution */ -.endm - -/* INT_SAVE_SRR_AND_JUMP works for real or virt, this is faster but virt only */ -.macro INT_VIRT_SAVE_SRR_AND_JUMP label, hsrr -#ifdef CONFIG_RELOCATABLE - .if \hsrr == EXC_HV_OR_STD + .if IHSRR_IF_HVMODE BEGIN_FTR_SECTION - mfspr r11,SPRN_HSRR0 /* save HSRR0 */ + li r10,(IVEC + 0x2) FTR_SECTION_ELSE - mfspr r11,SPRN_SRR0 /* save SRR0 */ + li r10,(IVEC) ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) - .elseif \hsrr - mfspr r11,SPRN_HSRR0 /* save HSRR0 */ + .elseif IHSRR + li r10,(IVEC + 0x2) .else - mfspr r11,SPRN_SRR0 /* save SRR0 */ + li r10,(IVEC) .endif - LOAD_HANDLER(r12, \label\()) - mtctr r12 - .if \hsrr == EXC_HV_OR_STD - BEGIN_FTR_SECTION - mfspr r12,SPRN_HSRR1 /* and HSRR1 */ - FTR_SECTION_ELSE - mfspr r12,SPRN_SRR1 /* and HSRR1 */ - ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) - .elseif \hsrr - mfspr r12,SPRN_HSRR1 /* and HSRR1 */ - .else - mfspr r12,SPRN_SRR1 /* and HSRR1 */ - .endif - li r10,MSR_RI - mtmsrd r10,1 /* Set RI (EE=0) */ - bctr -#else - .if \hsrr == EXC_HV_OR_STD - BEGIN_FTR_SECTION - mfspr r11,SPRN_HSRR0 /* save HSRR0 */ - mfspr r12,SPRN_HSRR1 /* and HSRR1 */ - FTR_SECTION_ELSE - mfspr r11,SPRN_SRR0 /* save SRR0 */ - mfspr r12,SPRN_SRR1 /* and SRR1 */ - ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) - .elseif \hsrr - mfspr r11,SPRN_HSRR0 /* save HSRR0 */ - mfspr r12,SPRN_HSRR1 /* and HSRR1 */ - .else - mfspr r11,SPRN_SRR0 /* save SRR0 */ - mfspr r12,SPRN_SRR1 /* and SRR1 */ - .endif - li r10,MSR_RI - mtmsrd r10,1 /* Set RI (EE=0) */ - b \label + bne \handler #endif .endm @@ -405,14 +255,53 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) * - Fall through and continue executing in real, unrelocated mode. * This is done if early=2. */ -.macro INT_HANDLER name, vec, ool=0, early=0, virt=0, hsrr=0, area=PACA_EXGEN, ri=1, dar=0, dsisr=0, bitmask=0, kvm=0 + +.macro GEN_BRANCH_TO_COMMON name, virt + .if IREALMODE_COMMON + LOAD_HANDLER(r10, \name\()_common) + mtctr r10 + bctr + .else + .if \virt +#ifndef CONFIG_RELOCATABLE + b \name\()_common_virt +#else + LOAD_HANDLER(r10, \name\()_common_virt) + mtctr r10 + bctr +#endif + .else + LOAD_HANDLER(r10, \name\()_common_real) + mtctr r10 + bctr + .endif + .endif +.endm + +.macro GEN_INT_ENTRY name, virt, ool=0 SET_SCRATCH0(r13) /* save r13 */ GET_PACA(r13) - std r9,\area\()+EX_R9(r13) /* save r9 */ - OPT_GET_SPR(r9, SPRN_PPR, CPU_FTR_HAS_PPR) + std r9,IAREA+EX_R9(r13) /* save r9 */ +BEGIN_FTR_SECTION + mfspr r9,SPRN_PPR +END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) HMT_MEDIUM - std r10,\area\()+EX_R10(r13) /* save r10 - r12 */ - OPT_GET_SPR(r10, SPRN_CFAR, CPU_FTR_CFAR) + std r10,IAREA+EX_R10(r13) /* save r10 */ + .if ICFAR +BEGIN_FTR_SECTION + mfspr r10,SPRN_CFAR +END_FTR_SECTION_IFSET(CPU_FTR_CFAR) + .elseif ICFAR_IF_HVMODE +BEGIN_FTR_SECTION + BEGIN_FTR_SECTION_NESTED(69) + mfspr r10,SPRN_CFAR + END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 69) +FTR_SECTION_ELSE + BEGIN_FTR_SECTION_NESTED(69) + li r10,0 + END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 69) +ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) + .endif .if \ool .if !\virt b tramp_real_\name @@ -425,47 +314,20 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) .endif .endif - OPT_SAVE_REG_TO_PACA(\area\()+EX_PPR, r9, CPU_FTR_HAS_PPR) - OPT_SAVE_REG_TO_PACA(\area\()+EX_CFAR, r10, CPU_FTR_CFAR) +BEGIN_FTR_SECTION + std r9,IAREA+EX_PPR(r13) +END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) + .if ICFAR || ICFAR_IF_HVMODE +BEGIN_FTR_SECTION + std r10,IAREA+EX_CFAR(r13) +END_FTR_SECTION_IFSET(CPU_FTR_CFAR) + .endif INTERRUPT_TO_KERNEL - SAVE_CTR(r10, \area\()) + mfctr r10 + std r10,IAREA+EX_CTR(r13) mfcr r9 - .if \kvm - KVMTEST \name \hsrr \vec - .endif - .if \bitmask - lbz r10,PACAIRQSOFTMASK(r13) - andi. r10,r10,\bitmask - /* Associate vector numbers with bits in paca->irq_happened */ - .if \vec == 0x500 || \vec == 0xea0 - li r10,PACA_IRQ_EE - .elseif \vec == 0x900 - li r10,PACA_IRQ_DEC - .elseif \vec == 0xa00 || \vec == 0xe80 - li r10,PACA_IRQ_DBELL - .elseif \vec == 0xe60 - li r10,PACA_IRQ_HMI - .elseif \vec == 0xf00 - li r10,PACA_IRQ_PMI - .else - .abort "Bad maskable vector" - .endif - - .if \hsrr == EXC_HV_OR_STD - BEGIN_FTR_SECTION - bne masked_Hinterrupt - FTR_SECTION_ELSE - bne masked_interrupt - ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) - .elseif \hsrr - bne masked_Hinterrupt - .else - bne masked_interrupt - .endif - .endif - - std r11,\area\()+EX_R11(r13) - std r12,\area\()+EX_R12(r13) + std r11,IAREA+EX_R11(r13) /* save r11 - r12 */ + std r12,IAREA+EX_R12(r13) /* * DAR/DSISR, SCRATCH0 must be read before setting MSR[RI], @@ -473,51 +335,165 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) * not recoverable if they are live. */ GET_SCRATCH0(r10) - std r10,\area\()+EX_R13(r13) - .if \dar - .if \hsrr + std r10,IAREA+EX_R13(r13) + .if IDAR && !IISIDE + .if IHSRR mfspr r10,SPRN_HDAR .else mfspr r10,SPRN_DAR .endif - std r10,\area\()+EX_DAR(r13) + std r10,IAREA+EX_DAR(r13) .endif - .if \dsisr - .if \hsrr + .if IDSISR && !IISIDE + .if IHSRR mfspr r10,SPRN_HDSISR .else mfspr r10,SPRN_DSISR .endif - stw r10,\area\()+EX_DSISR(r13) + stw r10,IAREA+EX_DSISR(r13) .endif - .if \early == 2 - /* nothing more */ - .elseif \early - mfctr r10 /* save ctr, even for !RELOCATABLE */ - BRANCH_TO_C000(r11, \name\()_early_common) - .elseif !\virt - INT_SAVE_SRR_AND_JUMP \name\()_common, \hsrr, \ri + .if IHSRR_IF_HVMODE + BEGIN_FTR_SECTION + mfspr r11,SPRN_HSRR0 /* save HSRR0 */ + mfspr r12,SPRN_HSRR1 /* and HSRR1 */ + FTR_SECTION_ELSE + mfspr r11,SPRN_SRR0 /* save SRR0 */ + mfspr r12,SPRN_SRR1 /* and SRR1 */ + ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) + .elseif IHSRR + mfspr r11,SPRN_HSRR0 /* save HSRR0 */ + mfspr r12,SPRN_HSRR1 /* and HSRR1 */ .else - INT_VIRT_SAVE_SRR_AND_JUMP \name\()_common, \hsrr + mfspr r11,SPRN_SRR0 /* save SRR0 */ + mfspr r12,SPRN_SRR1 /* and SRR1 */ + .endif + + .if IBRANCH_TO_COMMON + GEN_BRANCH_TO_COMMON \name \virt .endif + .if \ool .popsection .endif .endm /* - * On entry r13 points to the paca, r9-r13 are saved in the paca, - * r9 contains the saved CR, r11 and r12 contain the saved SRR0 and - * SRR1, and relocation is on. + * __GEN_COMMON_ENTRY is required to receive the branch from interrupt + * entry, except in the case of the real-mode handlers which require + * __GEN_REALMODE_COMMON_ENTRY. * - * If stack=0, then the stack is already set in r1, and r1 is saved in r10. - * PPR save and CPU accounting is not done for the !stack case (XXX why not?) + * This switches to virtual mode and sets MSR[RI]. */ -.macro INT_COMMON vec, area, stack, kuap, reconcile, dar, dsisr - .if \stack +.macro __GEN_COMMON_ENTRY name +DEFINE_FIXED_SYMBOL(\name\()_common_real, text) +\name\()_common_real: + .if IKVM_REAL + KVMTEST \name kvm_interrupt + .endif + + ld r10,PACAKMSR(r13) /* get MSR value for kernel */ + /* MSR[RI] is clear iff using SRR regs */ + .if IHSRR_IF_HVMODE + BEGIN_FTR_SECTION + xori r10,r10,MSR_RI + END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE) + .elseif ! IHSRR + xori r10,r10,MSR_RI + .endif + mtmsrd r10 + + .if IVIRT + .if IKVM_VIRT + b 1f /* skip the virt test coming from real */ + .endif + + .balign IFETCH_ALIGN_BYTES +DEFINE_FIXED_SYMBOL(\name\()_common_virt, text) +\name\()_common_virt: + .if IKVM_VIRT + KVMTEST \name kvm_interrupt +1: + .endif + .endif /* IVIRT */ +.endm + +/* + * Don't switch to virt mode. Used for early MCE and HMI handlers that + * want to run in real mode. + */ +.macro __GEN_REALMODE_COMMON_ENTRY name +DEFINE_FIXED_SYMBOL(\name\()_common_real, text) +\name\()_common_real: + .if IKVM_REAL + KVMTEST \name kvm_interrupt + .endif +.endm + +.macro __GEN_COMMON_BODY name + .if IMASK + .if ! ISTACK + .error "No support for masked interrupt to use custom stack" + .endif + + /* If coming from user, skip soft-mask tests. */ + andi. r10,r12,MSR_PR + bne 3f + + /* + * Kernel code running below __end_soft_masked may be + * implicitly soft-masked if it is within the regions + * in the soft mask table. + */ + LOAD_HANDLER(r10, __end_soft_masked) + cmpld r11,r10 + bge+ 1f + + /* SEARCH_SOFT_MASK_TABLE clobbers r9,r10,r12 */ + mtctr r12 + stw r9,PACA_EXGEN+EX_CCR(r13) + SEARCH_SOFT_MASK_TABLE + cmpdi r12,0 + mfctr r12 /* Restore r12 to SRR1 */ + lwz r9,PACA_EXGEN+EX_CCR(r13) + beq 1f /* Not in soft-mask table */ + li r10,IMASK + b 2f /* In soft-mask table, always mask */ + + /* Test the soft mask state against our interrupt's bit */ +1: lbz r10,PACAIRQSOFTMASK(r13) +2: andi. r10,r10,IMASK + /* Associate vector numbers with bits in paca->irq_happened */ + .if IVEC == 0x500 || IVEC == 0xea0 + li r10,PACA_IRQ_EE + .elseif IVEC == 0x900 + li r10,PACA_IRQ_DEC + .elseif IVEC == 0xa00 || IVEC == 0xe80 + li r10,PACA_IRQ_DBELL + .elseif IVEC == 0xe60 + li r10,PACA_IRQ_HMI + .elseif IVEC == 0xf00 + li r10,PACA_IRQ_PMI + .else + .abort "Bad maskable vector" + .endif + + .if IHSRR_IF_HVMODE + BEGIN_FTR_SECTION + bne masked_Hinterrupt + FTR_SECTION_ELSE + bne masked_interrupt + ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) + .elseif IHSRR + bne masked_Hinterrupt + .else + bne masked_interrupt + .endif + .endif + + .if ISTACK andi. r10,r12,MSR_PR /* See if coming from user */ - mr r10,r1 /* Save r1 */ +3: mr r10,r1 /* Save r1 */ subi r1,r1,INT_FRAME_SIZE /* alloc frame on kernel stack */ beq- 100f ld r1,PACAKSAVE(r13) /* kernel stack to use */ @@ -531,96 +507,192 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) std r10,0(r1) /* make stack chain pointer */ std r0,GPR0(r1) /* save r0 in stackframe */ std r10,GPR1(r1) /* save r1 in stackframe */ + SANITIZE_GPR(0) - .if \stack - .if \kuap + /* Mark our [H]SRRs valid for return */ + li r10,1 + .if IHSRR_IF_HVMODE + BEGIN_FTR_SECTION + stb r10,PACAHSRR_VALID(r13) + FTR_SECTION_ELSE + stb r10,PACASRR_VALID(r13) + ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) + .elseif IHSRR + stb r10,PACAHSRR_VALID(r13) + .else + stb r10,PACASRR_VALID(r13) + .endif + + .if ISTACK + .if IKUAP kuap_save_amr_and_lock r9, r10, cr1, cr0 .endif beq 101f /* if from kernel mode */ - ACCOUNT_CPU_USER_ENTRY(r13, r9, r10) - SAVE_PPR(\area, r9) +BEGIN_FTR_SECTION + ld r9,IAREA+EX_PPR(r13) /* Read PPR from paca */ + std r9,_PPR(r1) +END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) 101: .else - .if \kuap + .if IKUAP kuap_save_amr_and_lock r9, r10, cr1 .endif .endif /* Save original regs values from save area to stack frame. */ - ld r9,\area+EX_R9(r13) /* move r9, r10 to stackframe */ - ld r10,\area+EX_R10(r13) + ld r9,IAREA+EX_R9(r13) /* move r9, r10 to stackframe */ + ld r10,IAREA+EX_R10(r13) std r9,GPR9(r1) std r10,GPR10(r1) - ld r9,\area+EX_R11(r13) /* move r11 - r13 to stackframe */ - ld r10,\area+EX_R12(r13) - ld r11,\area+EX_R13(r13) + ld r9,IAREA+EX_R11(r13) /* move r11 - r13 to stackframe */ + ld r10,IAREA+EX_R12(r13) + ld r11,IAREA+EX_R13(r13) std r9,GPR11(r1) std r10,GPR12(r1) std r11,GPR13(r1) - .if \dar - .if \dar == 2 + .if !IMSR_R12 + SANITIZE_GPRS(9, 12) + .else + SANITIZE_GPRS(9, 11) + .endif + + SAVE_NVGPRS(r1) + SANITIZE_NVGPRS() + + .if IDAR + .if IISIDE ld r10,_NIP(r1) .else - ld r10,\area+EX_DAR(r13) + ld r10,IAREA+EX_DAR(r13) .endif std r10,_DAR(r1) .endif - .if \dsisr - .if \dsisr == 2 + + .if IDSISR + .if IISIDE ld r10,_MSR(r1) lis r11,DSISR_SRR1_MATCH_64S@h and r10,r10,r11 .else - lwz r10,\area+EX_DSISR(r13) + lwz r10,IAREA+EX_DSISR(r13) .endif std r10,_DSISR(r1) .endif -BEGIN_FTR_SECTION_NESTED(66) - ld r10,\area+EX_CFAR(r13) + +BEGIN_FTR_SECTION + .if ICFAR || ICFAR_IF_HVMODE + ld r10,IAREA+EX_CFAR(r13) + .else + li r10,0 + .endif std r10,ORIG_GPR3(r1) -END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66) - GET_CTR(r10, \area) +END_FTR_SECTION_IFSET(CPU_FTR_CFAR) + ld r10,IAREA+EX_CTR(r13) std r10,_CTR(r1) - std r2,GPR2(r1) /* save r2 in stackframe */ - SAVE_4GPRS(3, r1) /* save r3 - r6 in stackframe */ - SAVE_2GPRS(7, r1) /* save r7, r8 in stackframe */ + SAVE_GPRS(2, 8, r1) /* save r2 - r8 in stackframe */ + SANITIZE_GPRS(2, 8) mflr r9 /* Get LR, later save to stack */ - ld r2,PACATOC(r13) /* get kernel TOC into r2 */ + LOAD_PACA_TOC() /* get kernel TOC into r2 */ std r9,_LINK(r1) lbz r10,PACAIRQSOFTMASK(r13) mfspr r11,SPRN_XER /* save XER in stackframe */ std r10,SOFTE(r1) std r11,_XER(r1) - li r9,(\vec)+1 + li r9,IVEC std r9,_TRAP(r1) /* set trap number */ li r10,0 - ld r11,exception_marker@toc(r2) + LOAD_REG_IMMEDIATE(r11, STACK_FRAME_REGS_MARKER) std r10,RESULT(r1) /* clear regs->result */ - std r11,STACK_FRAME_OVERHEAD-16(r1) /* mark the frame */ + std r11,STACK_INT_FRAME_MARKER(r1) /* mark the frame */ +.endm - .if \stack - ACCOUNT_STOLEN_TIME - .endif +/* + * On entry r13 points to the paca, r9-r13 are saved in the paca, + * r9 contains the saved CR, r11 and r12 contain the saved SRR0 and + * SRR1, and relocation is on. + * + * If stack=0, then the stack is already set in r1, and r1 is saved in r10. + * PPR save and CPU accounting is not done for the !stack case (XXX why not?) + */ +.macro GEN_COMMON name + __GEN_COMMON_ENTRY \name + __GEN_COMMON_BODY \name +.endm - .if \reconcile - RECONCILE_IRQ_STATE(r10, r11) - .endif +.macro SEARCH_RESTART_TABLE +#ifdef CONFIG_RELOCATABLE + mr r12,r2 + LOAD_PACA_TOC() + LOAD_REG_ADDR(r9, __start___restart_table) + LOAD_REG_ADDR(r10, __stop___restart_table) + mr r2,r12 +#else + LOAD_REG_IMMEDIATE_SYM(r9, r12, __start___restart_table) + LOAD_REG_IMMEDIATE_SYM(r10, r12, __stop___restart_table) +#endif +300: + cmpd r9,r10 + beq 302f + ld r12,0(r9) + cmpld r11,r12 + blt 301f + ld r12,8(r9) + cmpld r11,r12 + bge 301f + ld r12,16(r9) + b 303f +301: + addi r9,r9,24 + b 300b +302: + li r12,0 +303: +.endm + +.macro SEARCH_SOFT_MASK_TABLE +#ifdef CONFIG_RELOCATABLE + mr r12,r2 + LOAD_PACA_TOC() + LOAD_REG_ADDR(r9, __start___soft_mask_table) + LOAD_REG_ADDR(r10, __stop___soft_mask_table) + mr r2,r12 +#else + LOAD_REG_IMMEDIATE_SYM(r9, r12, __start___soft_mask_table) + LOAD_REG_IMMEDIATE_SYM(r10, r12, __stop___soft_mask_table) +#endif +300: + cmpd r9,r10 + beq 302f + ld r12,0(r9) + cmpld r11,r12 + blt 301f + ld r12,8(r9) + cmpld r11,r12 + bge 301f + li r12,1 + b 303f +301: + addi r9,r9,16 + b 300b +302: + li r12,0 +303: .endm /* * Restore all registers including H/SRR0/1 saved in a stack frame of a * standard exception. */ -.macro EXCEPTION_RESTORE_REGS hsrr +.macro EXCEPTION_RESTORE_REGS hsrr=0 /* Move original SRR0 and SRR1 into the respective regs */ ld r9,_MSR(r1) - .if \hsrr == EXC_HV_OR_STD - .error "EXC_HV_OR_STD Not implemented for EXCEPTION_RESTORE_REGS" - .endif + li r10,0 .if \hsrr mtspr SPRN_HSRR1,r9 + stb r10,PACAHSRR_VALID(r13) .else mtspr SPRN_SRR1,r9 + stb r10,PACASRR_VALID(r13) .endif ld r9,_NIP(r1) .if \hsrr @@ -636,61 +708,77 @@ END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66) mtlr r9 ld r9,_CCR(r1) mtcr r9 - REST_8GPRS(2, r1) - REST_4GPRS(10, r1) + SANITIZE_RESTORE_NVGPRS() + REST_GPRS(2, 13, r1) REST_GPR(0, r1) /* restore original r1. */ ld r1,GPR1(r1) .endm -#define RUNLATCH_ON \ -BEGIN_FTR_SECTION \ - ld r3, PACA_THREAD_INFO(r13); \ - ld r4,TI_LOCAL_FLAGS(r3); \ - andi. r0,r4,_TLF_RUNLATCH; \ - beql ppc64_runlatch_on_trampoline; \ -END_FTR_SECTION_IFSET(CPU_FTR_CTRL) - /* - * When the idle code in power4_idle puts the CPU into NAP mode, - * it has to do so in a loop, and relies on the external interrupt - * and decrementer interrupt entry code to get it out of the loop. - * It sets the _TLF_NAPPING bit in current_thread_info()->local_flags - * to signal that it is in the loop and needs help to get out. + * EARLY_BOOT_FIXUP - Fix real-mode interrupt with wrong endian in early boot. + * + * There's a short window during boot where although the kernel is running + * little endian, any exceptions will cause the CPU to switch back to big + * endian. For example a WARN() boils down to a trap instruction, which will + * cause a program check, and we end up here but with the CPU in big endian + * mode. The first instruction of the program check handler (in GEN_INT_ENTRY + * below) is an mtsprg, which when executed in the wrong endian is an lhzu with + * a ~3GB displacement from r3. The content of r3 is random, so that is a load + * from some random location, and depending on the system can easily lead to a + * checkstop, or an infinitely recursive page fault. + * + * So to handle that case we have a trampoline here that can detect we are in + * the wrong endian and flip us back to the correct endian. We can't flip + * MSR[LE] using mtmsr, so we have to use rfid. That requires backing up SRR0/1 + * as well as a GPR. To do that we use SPRG0/2/3, as SPRG1 is already used for + * the paca. SPRG3 is user readable, but this trampoline is only active very + * early in boot, and SPRG3 will be reinitialised in vdso_getcpu_init() before + * userspace starts. */ -#ifdef CONFIG_PPC_970_NAP -#define FINISH_NAP \ -BEGIN_FTR_SECTION \ - ld r11, PACA_THREAD_INFO(r13); \ - ld r9,TI_LOCAL_FLAGS(r11); \ - andi. r10,r9,_TLF_NAPPING; \ - bnel power4_fixup_nap; \ -END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP) -#else -#define FINISH_NAP +.macro EARLY_BOOT_FIXUP +BEGIN_FTR_SECTION +#ifdef CONFIG_CPU_LITTLE_ENDIAN + tdi 0,0,0x48 // Trap never, or in reverse endian: b . + 8 + b 2f // Skip trampoline if endian is correct + .long 0xa643707d // mtsprg 0, r11 Backup r11 + .long 0xa6027a7d // mfsrr0 r11 + .long 0xa643727d // mtsprg 2, r11 Backup SRR0 in SPRG2 + .long 0xa6027b7d // mfsrr1 r11 + .long 0xa643737d // mtsprg 3, r11 Backup SRR1 in SPRG3 + .long 0xa600607d // mfmsr r11 + .long 0x01006b69 // xori r11, r11, 1 Invert MSR[LE] + .long 0xa6037b7d // mtsrr1 r11 + /* + * This is 'li r11,1f' where 1f is the absolute address of that + * label, byteswapped into the SI field of the instruction. + */ + .long 0x00006039 | \ + ((ABS_ADDR(1f, real_vectors) & 0x00ff) << 24) | \ + ((ABS_ADDR(1f, real_vectors) & 0xff00) << 8) + .long 0xa6037a7d // mtsrr0 r11 + .long 0x2400004c // rfid +1: + mfsprg r11, 3 + mtsrr1 r11 // Restore SRR1 + mfsprg r11, 2 + mtsrr0 r11 // Restore SRR0 + mfsprg r11, 0 // Restore r11 +2: #endif - -#define EXC_COMMON(name, realvec, hdlr) \ - EXC_COMMON_BEGIN(name); \ - INT_COMMON realvec, PACA_EXGEN, 1, 1, 1, 0, 0 ; \ - bl save_nvgprs; \ - addi r3,r1,STACK_FRAME_OVERHEAD; \ - bl hdlr; \ - b ret_from_except - -/* - * Like EXC_COMMON, but for exceptions that can occur in the idle task and - * therefore need the special idle handling (finish nap and runlatch) - */ -#define EXC_COMMON_ASYNC(name, realvec, hdlr) \ - EXC_COMMON_BEGIN(name); \ - INT_COMMON realvec, PACA_EXGEN, 1, 1, 1, 0, 0 ; \ - FINISH_NAP; \ - RUNLATCH_ON; \ - addi r3,r1,STACK_FRAME_OVERHEAD; \ - bl hdlr; \ - b ret_from_except_lite - + /* + * program check could hit at any time, and pseries can not block + * MSR[ME] in early boot. So check if there is anything useful in r13 + * yet, and spin forever if not. + */ + mtsprg 0, r11 + mfcr r11 + cmpdi r13, 0 + beq . + mtcr r11 + mfsprg r11, 0 +END_FTR_SECTION(0, 1) // nop out after boot +.endm /* * There are a few constraints to be concerned with. @@ -716,6 +804,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP) * guarantee they will be delivered virtually. Some conditions (see the ISA) * cause exceptions to be delivered in real mode. * + * The scv instructions are a special case. They get a 0x3000 offset applied. + * scv exceptions have unique reentrancy properties, see below. + * * It's impossible to receive interrupts below 0x300 via AIL. * * KVM: None of the virtual exceptions are from the guest. Anything that @@ -725,8 +816,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP) * We layout physical memory as follows: * 0x0000 - 0x00ff : Secondary processor spin code * 0x0100 - 0x18ff : Real mode pSeries interrupt vectors - * 0x1900 - 0x3fff : Real mode trampolines - * 0x4000 - 0x58ff : Relon (IR=1,DR=1) mode pSeries interrupt vectors + * 0x1900 - 0x2fff : Real mode trampolines + * 0x3000 - 0x58ff : Relon (IR=1,DR=1) mode pSeries interrupt vectors * 0x5900 - 0x6fff : Relon mode trampolines * 0x7000 - 0x7fff : FWNMI data area * 0x8000 - .... : Common interrupt handlers, remaining early @@ -737,8 +828,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP) * vectors there. */ OPEN_FIXED_SECTION(real_vectors, 0x0100, 0x1900) -OPEN_FIXED_SECTION(real_trampolines, 0x1900, 0x4000) -OPEN_FIXED_SECTION(virt_vectors, 0x4000, 0x5900) +OPEN_FIXED_SECTION(real_trampolines, 0x1900, 0x3000) +OPEN_FIXED_SECTION(virt_vectors, 0x3000, 0x5900) OPEN_FIXED_SECTION(virt_trampolines, 0x5900, 0x7000) #ifdef CONFIG_PPC_POWERNV @@ -774,10 +865,131 @@ USE_FIXED_SECTION(real_vectors) .globl __start_interrupts __start_interrupts: +/** + * Interrupt 0x3000 - System Call Vectored Interrupt (syscall). + * This is a synchronous interrupt invoked with the "scv" instruction. The + * system call does not alter the HV bit, so it is directed to the OS. + * + * Handling: + * scv instructions enter the kernel without changing EE, RI, ME, or HV. + * In particular, this means we can take a maskable interrupt at any point + * in the scv handler, which is unlike any other interrupt. This is solved + * by treating the instruction addresses in the handler as being soft-masked, + * by adding a SOFT_MASK_TABLE entry for them. + * + * AIL-0 mode scv exceptions go to 0x17000-0x17fff, but we set AIL-3 and + * ensure scv is never executed with relocation off, which means AIL-0 + * should never happen. + * + * Before leaving the following inside-__end_soft_masked text, at least of the + * following must be true: + * - MSR[PR]=1 (i.e., return to userspace) + * - MSR_EE|MSR_RI is clear (no reentrant exceptions) + * - Standard kernel environment is set up (stack, paca, etc) + * + * KVM: + * These interrupts do not elevate HV 0->1, so HV is not involved. PR KVM + * ensures that FSCR[SCV] is disabled whenever it has to force AIL off. + * + * Call convention: + * + * syscall register convention is in Documentation/arch/powerpc/syscall64-abi.rst + */ +EXC_VIRT_BEGIN(system_call_vectored, 0x3000, 0x1000) + /* SCV 0 */ + mr r9,r13 + GET_PACA(r13) + mflr r11 + mfctr r12 + li r10,IRQS_ALL_DISABLED + stb r10,PACAIRQSOFTMASK(r13) +#ifdef CONFIG_RELOCATABLE + b system_call_vectored_tramp +#else + b system_call_vectored_common +#endif + nop + + /* SCV 1 - 127 */ + .rept 127 + mr r9,r13 + GET_PACA(r13) + mflr r11 + mfctr r12 + li r10,IRQS_ALL_DISABLED + stb r10,PACAIRQSOFTMASK(r13) + li r0,-1 /* cause failure */ +#ifdef CONFIG_RELOCATABLE + b system_call_vectored_sigill_tramp +#else + b system_call_vectored_sigill +#endif + .endr +EXC_VIRT_END(system_call_vectored, 0x3000, 0x1000) + +// Treat scv vectors as soft-masked, see comment above. +// Use absolute values rather than labels here, so they don't get relocated, +// because this code runs unrelocated. +SOFT_MASK_TABLE(0xc000000000003000, 0xc000000000004000) + +#ifdef CONFIG_RELOCATABLE +TRAMP_VIRT_BEGIN(system_call_vectored_tramp) + __LOAD_HANDLER(r10, system_call_vectored_common, virt_trampolines) + mtctr r10 + bctr + +TRAMP_VIRT_BEGIN(system_call_vectored_sigill_tramp) + __LOAD_HANDLER(r10, system_call_vectored_sigill, virt_trampolines) + mtctr r10 + bctr +#endif + + /* No virt vectors corresponding with 0x0..0x100 */ EXC_VIRT_NONE(0x4000, 0x100) +/** + * Interrupt 0x100 - System Reset Interrupt (SRESET aka NMI). + * This is a non-maskable, asynchronous interrupt always taken in real-mode. + * It is caused by: + * - Wake from power-saving state, on powernv. + * - An NMI from another CPU, triggered by firmware or hypercall. + * - As crash/debug signal injected from BMC, firmware or hypervisor. + * + * Handling: + * Power-save wakeup is the only performance critical path, so this is + * determined quickly as possible first. In this case volatile registers + * can be discarded and SPRs like CFAR don't need to be read. + * + * If not a powersave wakeup, then it's run as a regular interrupt, however + * it uses its own stack and PACA save area to preserve the regular kernel + * environment for debugging. + * + * This interrupt is not maskable, so triggering it when MSR[RI] is clear, + * or SCRATCH0 is in use, etc. may cause a crash. It's also not entirely + * correct to switch to virtual mode to run the regular interrupt handler + * because it might be interrupted when the MMU is in a bad state (e.g., SLB + * is clear). + * + * FWNMI: + * PAPR specifies a "fwnmi" facility which sends the sreset to a different + * entry point with a different register set up. Some hypervisors will + * send the sreset to 0x100 in the guest if it is not fwnmi capable. + * + * KVM: + * Unlike most SRR interrupts, this may be taken by the host while executing + * in a guest, so a KVM test is required. KVM will pull the CPU out of guest + * mode and then raise the sreset. + */ +INT_DEFINE_BEGIN(system_reset) + IVEC=0x100 + IAREA=PACA_EXNMI + IVIRT=0 /* no virt entry point */ + ISTACK=0 + IKVM_REAL=1 +INT_DEFINE_END(system_reset) + EXC_REAL_BEGIN(system_reset, 0x100, 0x100) #ifdef CONFIG_PPC_P7_NAP /* @@ -815,11 +1027,8 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) #endif - INT_HANDLER system_reset, 0x100, area=PACA_EXNMI, ri=0, kvm=1 + GEN_INT_ENTRY system_reset, virt=0 /* - * MSR_RI is not enabled, because PACA_EXNMI and nmi stack is - * being used, so a nested NMI exception would corrupt it. - * * In theory, we should not enable relocation here if it was disabled * in SRR1, because the MMU may not be configured to support it (e.g., * SLB may have been cleared). In practice, there should only be a few @@ -828,14 +1037,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) */ EXC_REAL_END(system_reset, 0x100, 0x100) EXC_VIRT_NONE(0x4100, 0x100) -INT_KVM_HANDLER system_reset 0x100, EXC_STD, PACA_EXNMI, 0 #ifdef CONFIG_PPC_P7_NAP TRAMP_REAL_BEGIN(system_reset_idle_wake) /* We are waking up from idle, so may clobber any volatile register */ cmpwi cr1,r5,2 bltlr cr1 /* no state loss, return to idle caller with r3=SRR1 */ - BRANCH_TO_C000(r12, DOTSYM(idle_return_gpr_loss)) + __LOAD_FAR_HANDLER(r12, DOTSYM(idle_return_gpr_loss), real_trampolines) + mtctr r12 + bctr #endif #ifdef CONFIG_PPC_PSERIES @@ -843,45 +1053,29 @@ TRAMP_REAL_BEGIN(system_reset_idle_wake) * Vectors for the FWNMI option. Share common code. */ TRAMP_REAL_BEGIN(system_reset_fwnmi) - /* See comment at system_reset exception, don't turn on RI */ - INT_HANDLER system_reset, 0x100, area=PACA_EXNMI, ri=0 + GEN_INT_ENTRY system_reset, virt=0 #endif /* CONFIG_PPC_PSERIES */ EXC_COMMON_BEGIN(system_reset_common) + __GEN_COMMON_ENTRY system_reset /* - * Increment paca->in_nmi then enable MSR_RI. SLB or MCE will be able - * to recover, but nested NMI will notice in_nmi and not recover - * because of the use of the NMI stack. in_nmi reentrancy is tested in - * system_reset_exception. + * Increment paca->in_nmi. When the interrupt entry wrapper later + * enable MSR_RI, then SLB or MCE will be able to recover, but a nested + * NMI will notice in_nmi and not recover because of the use of the NMI + * stack. in_nmi reentrancy is tested in system_reset_exception. */ lhz r10,PACA_IN_NMI(r13) addi r10,r10,1 sth r10,PACA_IN_NMI(r13) - li r10,MSR_RI - mtmsrd r10,1 mr r10,r1 ld r1,PACA_NMI_EMERG_SP(r13) subi r1,r1,INT_FRAME_SIZE - INT_COMMON 0x100, PACA_EXNMI, 0, 1, 0, 0, 0 - bl save_nvgprs - /* - * Set IRQS_ALL_DISABLED unconditionally so arch_irqs_disabled does - * the right thing. We do not want to reconcile because that goes - * through irq tracing which we don't want in NMI. - * - * Save PACAIRQHAPPENED because some code will do a hard disable - * (e.g., xmon). So we want to restore this back to where it was - * when we return. DAR is unused in the stack, so save it there. - */ - li r10,IRQS_ALL_DISABLED - stb r10,PACAIRQSOFTMASK(r13) - lbz r10,PACAIRQHAPPENED(r13) - std r10,_DAR(r1) + __GEN_COMMON_BODY system_reset - addi r3,r1,STACK_FRAME_OVERHEAD - bl system_reset_exception + addi r3,r1,STACK_INT_FRAME_REGS + bl CFUNC(system_reset_exception) /* Clear MSR_RI before setting SRR0 and SRR1. */ li r9,0 @@ -894,36 +1088,86 @@ EXC_COMMON_BEGIN(system_reset_common) subi r10,r10,1 sth r10,PACA_IN_NMI(r13) - /* - * Restore soft mask settings. - */ - ld r10,_DAR(r1) - stb r10,PACAIRQHAPPENED(r13) - ld r10,SOFTE(r1) - stb r10,PACAIRQSOFTMASK(r13) - - EXCEPTION_RESTORE_REGS EXC_STD + kuap_kernel_restore r9, r10 + EXCEPTION_RESTORE_REGS RFI_TO_USER_OR_KERNEL +/** + * Interrupt 0x200 - Machine Check Interrupt (MCE). + * This is a non-maskable interrupt always taken in real-mode. It can be + * synchronous or asynchronous, caused by hardware or software, and it may be + * taken in a power-saving state. + * + * Handling: + * Similarly to system reset, this uses its own stack and PACA save area, + * the difference is re-entrancy is allowed on the machine check stack. + * + * machine_check_early is run in real mode, and carefully decodes the + * machine check and tries to handle it (e.g., flush the SLB if there was an + * error detected there), determines if it was recoverable and logs the + * event. + * + * This early code does not "reconcile" irq soft-mask state like SRESET or + * regular interrupts do, so irqs_disabled() among other things may not work + * properly (irq disable/enable already doesn't work because irq tracing can + * not work in real mode). + * + * Then, depending on the execution context when the interrupt is taken, there + * are 3 main actions: + * - Executing in kernel mode. The event is queued with irq_work, which means + * it is handled when it is next safe to do so (i.e., the kernel has enabled + * interrupts), which could be immediately when the interrupt returns. This + * avoids nasty issues like switching to virtual mode when the MMU is in a + * bad state, or when executing OPAL code. (SRESET is exposed to such issues, + * but it has different priorities). Check to see if the CPU was in power + * save, and return via the wake up code if it was. + * + * - Executing in user mode. machine_check_exception is run like a normal + * interrupt handler, which processes the data generated by the early handler. + * + * - Executing in guest mode. The interrupt is run with its KVM test, and + * branches to KVM to deal with. KVM may queue the event for the host + * to report later. + * + * This interrupt is not maskable, so if it triggers when MSR[RI] is clear, + * or SCRATCH0 is in use, it may cause a crash. + * + * KVM: + * See SRESET. + */ +INT_DEFINE_BEGIN(machine_check_early) + IVEC=0x200 + IAREA=PACA_EXMC + IVIRT=0 /* no virt entry point */ + IREALMODE_COMMON=1 + ISTACK=0 + IDAR=1 + IDSISR=1 + IKUAP=0 /* We don't touch AMR here, we never go to virtual mode */ +INT_DEFINE_END(machine_check_early) + +INT_DEFINE_BEGIN(machine_check) + IVEC=0x200 + IAREA=PACA_EXMC + IVIRT=0 /* no virt entry point */ + IDAR=1 + IDSISR=1 + IKVM_REAL=1 +INT_DEFINE_END(machine_check) + EXC_REAL_BEGIN(machine_check, 0x200, 0x100) - INT_HANDLER machine_check, 0x200, early=1, area=PACA_EXMC, dar=1, dsisr=1 - /* - * MSR_RI is not enabled, because PACA_EXMC is being used, so a - * nested machine check corrupts it. machine_check_common enables - * MSR_RI. - */ + EARLY_BOOT_FIXUP + GEN_INT_ENTRY machine_check_early, virt=0 EXC_REAL_END(machine_check, 0x200, 0x100) EXC_VIRT_NONE(0x4200, 0x100) #ifdef CONFIG_PPC_PSERIES TRAMP_REAL_BEGIN(machine_check_fwnmi) /* See comment at machine_check exception, don't turn on RI */ - INT_HANDLER machine_check, 0x200, early=1, area=PACA_EXMC, dar=1, dsisr=1 + GEN_INT_ENTRY machine_check_early, virt=0 #endif -INT_KVM_HANDLER machine_check 0x200, EXC_STD, PACA_EXMC, 1 - #define MACHINE_CHECK_HANDLER_WINDUP \ /* Clear MSR_RI before setting SRR0 and SRR1. */\ li r9,0; \ @@ -932,12 +1176,10 @@ INT_KVM_HANDLER machine_check 0x200, EXC_STD, PACA_EXMC, 1 lhz r12,PACA_IN_MCE(r13); \ subi r12,r12,1; \ sth r12,PACA_IN_MCE(r13); \ - EXCEPTION_RESTORE_REGS EXC_STD + EXCEPTION_RESTORE_REGS EXC_COMMON_BEGIN(machine_check_early_common) - mtctr r10 /* Restore ctr */ - mfspr r11,SPRN_SRR0 - mfspr r12,SPRN_SRR1 + __GEN_REALMODE_COMMON_ENTRY machine_check_early /* * Switch to mc_emergency stack and handle re-entrancy (we limit @@ -974,18 +1216,16 @@ EXC_COMMON_BEGIN(machine_check_early_common) bgt cr1,unrecoverable_mce /* Check if we hit limit of 4 */ subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */ - /* We don't touch AMR here, we never go to virtual mode */ - INT_COMMON 0x200, PACA_EXMC, 0, 0, 0, 1, 1 + __GEN_COMMON_BODY machine_check_early BEGIN_FTR_SECTION bl enable_machine_check END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) - li r10,MSR_RI - mtmsrd r10,1 - - bl save_nvgprs - addi r3,r1,STACK_FRAME_OVERHEAD - bl machine_check_early + addi r3,r1,STACK_INT_FRAME_REGS +BEGIN_FTR_SECTION + bl CFUNC(machine_check_early_boot) +END_FTR_SECTION(0, 1) // nop out after boot + bl CFUNC(machine_check_early) std r3,RESULT(r1) /* Save result */ ld r12,_MSR(r1) @@ -1009,7 +1249,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) /* * Check if we are coming from guest. If yes, then run the normal * exception handler which will take the - * machine_check_kvm->kvmppc_interrupt branch to deliver the MC event + * machine_check_kvm->kvm_interrupt branch to deliver the MC event * to guest. */ lbz r11,HSTATE_IN_GUEST(r13) @@ -1046,7 +1286,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) * Queue up the MCE event so that we can log it later, while * returning from kernel or opal call. */ - bl machine_check_queue_event + bl CFUNC(machine_check_queue_event) MACHINE_CHECK_HANDLER_WINDUP RFI_TO_KERNEL @@ -1063,23 +1303,18 @@ BEGIN_FTR_SECTION mtspr SPRN_CFAR,r10 END_FTR_SECTION_IFSET(CPU_FTR_CFAR) MACHINE_CHECK_HANDLER_WINDUP - /* See comment at machine_check exception, don't turn on RI */ - INT_HANDLER machine_check, 0x200, area=PACA_EXMC, ri=0, dar=1, dsisr=1, kvm=1 + GEN_INT_ENTRY machine_check, virt=0 EXC_COMMON_BEGIN(machine_check_common) /* * Machine check is different because we use a different * save area: PACA_EXMC instead of PACA_EXGEN. */ - INT_COMMON 0x200, PACA_EXMC, 1, 1, 1, 1, 1 - FINISH_NAP - /* Enable MSR_RI when finished with PACA_EXMC */ - li r10,MSR_RI - mtmsrd r10,1 - bl save_nvgprs - addi r3,r1,STACK_FRAME_OVERHEAD - bl machine_check_exception - b ret_from_except + GEN_COMMON machine_check + addi r3,r1,STACK_INT_FRAME_REGS + bl CFUNC(machine_check_exception_async) + b interrupt_return_srr + #ifdef CONFIG_PPC_P7_NAP /* @@ -1087,20 +1322,22 @@ EXC_COMMON_BEGIN(machine_check_common) * done. Queue the event then call the idle code to do the wake up. */ EXC_COMMON_BEGIN(machine_check_idle_common) - bl machine_check_queue_event + bl CFUNC(machine_check_queue_event) /* - * We have not used any non-volatile GPRs here, and as a rule - * most exception code including machine check does not. - * Therefore PACA_NAPSTATELOST does not need to be set. Idle - * wakeup will restore volatile registers. + * GPR-loss wakeups are relatively straightforward, because the + * idle sleep code has saved all non-volatile registers on its + * own stack, and r1 in PACAR1. * - * Load the original SRR1 into r3 for pnv_powersave_wakeup_mce. + * For no-loss wakeups the r1 and lr registers used by the + * early machine check handler have to be restored first. r2 is + * the kernel TOC, so no need to restore it. * * Then decrement MCE nesting after finishing with the stack. */ ld r3,_MSR(r1) ld r4,_LINK(r1) + ld r1,GPR1(r1) lhz r11,PACA_IN_MCE(r13) subi r11,r11,1 @@ -1109,7 +1346,7 @@ EXC_COMMON_BEGIN(machine_check_idle_common) mtlr r4 rlwinm r10,r3,47-31,30,31 cmpwi cr1,r10,2 - bltlr cr1 /* no state loss, return to idle caller */ + bltlr cr1 /* no state loss, return to idle caller with r3=SRR1 */ b idle_return_gpr_loss #endif @@ -1124,168 +1361,355 @@ EXC_COMMON_BEGIN(unrecoverable_mce) BEGIN_FTR_SECTION li r10,0 /* clear MSR_RI */ mtmsrd r10,1 - bl disable_machine_check + bl CFUNC(disable_machine_check) END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) ld r10,PACAKMSR(r13) li r3,MSR_ME andc r10,r10,r3 mtmsrd r10 - /* Invoke machine_check_exception to print MCE event and panic. */ - addi r3,r1,STACK_FRAME_OVERHEAD - bl machine_check_exception + lhz r12,PACA_IN_MCE(r13) + subi r12,r12,1 + sth r12,PACA_IN_MCE(r13) + + /* + * Invoke machine_check_exception to print MCE event and panic. + * This is the NMI version of the handler because we are called from + * the early handler which is a true NMI. + */ + addi r3,r1,STACK_INT_FRAME_REGS + bl CFUNC(machine_check_exception) /* * We will not reach here. Even if we did, there is no way out. * Call unrecoverable_exception and die. */ - addi r3,r1,STACK_FRAME_OVERHEAD - bl unrecoverable_exception + addi r3,r1,STACK_INT_FRAME_REGS + bl CFUNC(unrecoverable_exception) b . +/** + * Interrupt 0x300 - Data Storage Interrupt (DSI). + * This is a synchronous interrupt generated due to a data access exception, + * e.g., a load orstore which does not have a valid page table entry with + * permissions. DAWR matches also fault here, as do RC updates, and minor misc + * errors e.g., copy/paste, AMO, certain invalid CI accesses, etc. + * + * Handling: + * - Hash MMU + * Go to do_hash_fault, which attempts to fill the HPT from an entry in the + * Linux page table. Hash faults can hit in kernel mode in a fairly + * arbitrary state (e.g., interrupts disabled, locks held) when accessing + * "non-bolted" regions, e.g., vmalloc space. However these should always be + * backed by Linux page table entries. + * + * If no entry is found the Linux page fault handler is invoked (by + * do_hash_fault). Linux page faults can happen in kernel mode due to user + * copy operations of course. + * + * KVM: The KVM HDSI handler may perform a load with MSR[DR]=1 in guest + * MMU context, which may cause a DSI in the host, which must go to the + * KVM handler. MSR[IR] is not enabled, so the real-mode handler will + * always be used regardless of AIL setting. + * + * - Radix MMU + * The hardware loads from the Linux page table directly, so a fault goes + * immediately to Linux page fault. + * + * Conditions like DAWR match are handled on the way in to Linux page fault. + */ +INT_DEFINE_BEGIN(data_access) + IVEC=0x300 + IDAR=1 + IDSISR=1 + IKVM_REAL=1 +INT_DEFINE_END(data_access) + EXC_REAL_BEGIN(data_access, 0x300, 0x80) - INT_HANDLER data_access, 0x300, ool=1, dar=1, dsisr=1, kvm=1 + GEN_INT_ENTRY data_access, virt=0 EXC_REAL_END(data_access, 0x300, 0x80) EXC_VIRT_BEGIN(data_access, 0x4300, 0x80) - INT_HANDLER data_access, 0x300, virt=1, dar=1, dsisr=1 + GEN_INT_ENTRY data_access, virt=1 EXC_VIRT_END(data_access, 0x4300, 0x80) -INT_KVM_HANDLER data_access, 0x300, EXC_STD, PACA_EXGEN, 1 EXC_COMMON_BEGIN(data_access_common) - /* - * Here r13 points to the paca, r9 contains the saved CR, - * SRR0 and SRR1 are saved in r11 and r12, - * r9 - r13 are saved in paca->exgen. - * EX_DAR and EX_DSISR have saved DAR/DSISR - */ - INT_COMMON 0x300, PACA_EXGEN, 1, 1, 1, 1, 1 - ld r4,_DAR(r1) - ld r5,_DSISR(r1) + GEN_COMMON data_access + ld r4,_DSISR(r1) + addi r3,r1,STACK_INT_FRAME_REGS + andis. r0,r4,DSISR_DABRMATCH@h + bne- 1f +#ifdef CONFIG_PPC_64S_HASH_MMU BEGIN_MMU_FTR_SECTION - ld r6,_MSR(r1) - li r3,0x300 - b do_hash_page /* Try to handle as hpte fault */ + bl CFUNC(do_hash_fault) MMU_FTR_SECTION_ELSE - b handle_page_fault + bl CFUNC(do_page_fault) ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) +#else + bl CFUNC(do_page_fault) +#endif + b interrupt_return_srr +1: bl CFUNC(do_break) + /* + * do_break() may have changed the NV GPRS while handling a breakpoint. + * If so, we need to restore them with their updated values. + */ + HANDLER_RESTORE_NVGPRS() + b interrupt_return_srr + + +/** + * Interrupt 0x380 - Data Segment Interrupt (DSLB). + * This is a synchronous interrupt in response to an MMU fault missing SLB + * entry for HPT, or an address outside RPT translation range. + * + * Handling: + * - HPT: + * This refills the SLB, or reports an access fault similarly to a bad page + * fault. When coming from user-mode, the SLB handler may access any kernel + * data, though it may itself take a DSLB. When coming from kernel mode, + * recursive faults must be avoided so access is restricted to the kernel + * image text/data, kernel stack, and any data allocated below + * ppc64_bolted_size (first segment). The kernel handler must avoid stomping + * on user-handler data structures. + * + * KVM: Same as 0x300, DSLB must test for KVM guest. + */ +INT_DEFINE_BEGIN(data_access_slb) + IVEC=0x380 + IDAR=1 + IKVM_REAL=1 +INT_DEFINE_END(data_access_slb) EXC_REAL_BEGIN(data_access_slb, 0x380, 0x80) - INT_HANDLER data_access_slb, 0x380, ool=1, area=PACA_EXSLB, dar=1, kvm=1 + GEN_INT_ENTRY data_access_slb, virt=0 EXC_REAL_END(data_access_slb, 0x380, 0x80) EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x80) - INT_HANDLER data_access_slb, 0x380, virt=1, area=PACA_EXSLB, dar=1 + GEN_INT_ENTRY data_access_slb, virt=1 EXC_VIRT_END(data_access_slb, 0x4380, 0x80) -INT_KVM_HANDLER data_access_slb, 0x380, EXC_STD, PACA_EXSLB, 1 EXC_COMMON_BEGIN(data_access_slb_common) - INT_COMMON 0x380, PACA_EXSLB, 1, 1, 0, 1, 0 - ld r4,_DAR(r1) - addi r3,r1,STACK_FRAME_OVERHEAD + GEN_COMMON data_access_slb +#ifdef CONFIG_PPC_64S_HASH_MMU BEGIN_MMU_FTR_SECTION /* HPT case, do SLB fault */ - bl do_slb_fault + addi r3,r1,STACK_INT_FRAME_REGS + bl CFUNC(do_slb_fault) cmpdi r3,0 bne- 1f - b fast_exception_return + b fast_interrupt_return_srr 1: /* Error case */ MMU_FTR_SECTION_ELSE /* Radix case, access is outside page table range */ li r3,-EFAULT ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) +#else + li r3,-EFAULT +#endif std r3,RESULT(r1) - bl save_nvgprs - RECONCILE_IRQ_STATE(r10, r11) - ld r4,_DAR(r1) - ld r5,RESULT(r1) - addi r3,r1,STACK_FRAME_OVERHEAD - bl do_bad_slb_fault - b ret_from_except + addi r3,r1,STACK_INT_FRAME_REGS + bl CFUNC(do_bad_segment_interrupt) + b interrupt_return_srr +/** + * Interrupt 0x400 - Instruction Storage Interrupt (ISI). + * This is a synchronous interrupt in response to an MMU fault due to an + * instruction fetch. + * + * Handling: + * Similar to DSI, though in response to fetch. The faulting address is found + * in SRR0 (rather than DAR), and status in SRR1 (rather than DSISR). + */ +INT_DEFINE_BEGIN(instruction_access) + IVEC=0x400 + IISIDE=1 + IDAR=1 + IDSISR=1 +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE + IKVM_REAL=1 +#endif +INT_DEFINE_END(instruction_access) + EXC_REAL_BEGIN(instruction_access, 0x400, 0x80) - INT_HANDLER instruction_access, 0x400, kvm=1 + GEN_INT_ENTRY instruction_access, virt=0 EXC_REAL_END(instruction_access, 0x400, 0x80) EXC_VIRT_BEGIN(instruction_access, 0x4400, 0x80) - INT_HANDLER instruction_access, 0x400, virt=1 + GEN_INT_ENTRY instruction_access, virt=1 EXC_VIRT_END(instruction_access, 0x4400, 0x80) -INT_KVM_HANDLER instruction_access, 0x400, EXC_STD, PACA_EXGEN, 0 EXC_COMMON_BEGIN(instruction_access_common) - INT_COMMON 0x400, PACA_EXGEN, 1, 1, 1, 2, 2 - ld r4,_DAR(r1) - ld r5,_DSISR(r1) + GEN_COMMON instruction_access + addi r3,r1,STACK_INT_FRAME_REGS +#ifdef CONFIG_PPC_64S_HASH_MMU BEGIN_MMU_FTR_SECTION - ld r6,_MSR(r1) - li r3,0x400 - b do_hash_page /* Try to handle as hpte fault */ + bl CFUNC(do_hash_fault) MMU_FTR_SECTION_ELSE - b handle_page_fault + bl CFUNC(do_page_fault) ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) +#else + bl CFUNC(do_page_fault) +#endif + b interrupt_return_srr + +/** + * Interrupt 0x480 - Instruction Segment Interrupt (ISLB). + * This is a synchronous interrupt in response to an MMU fault due to an + * instruction fetch. + * + * Handling: + * Similar to DSLB, though in response to fetch. The faulting address is found + * in SRR0 (rather than DAR). + */ +INT_DEFINE_BEGIN(instruction_access_slb) + IVEC=0x480 + IISIDE=1 + IDAR=1 +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE + IKVM_REAL=1 +#endif +INT_DEFINE_END(instruction_access_slb) EXC_REAL_BEGIN(instruction_access_slb, 0x480, 0x80) - INT_HANDLER instruction_access_slb, 0x480, area=PACA_EXSLB, kvm=1 + GEN_INT_ENTRY instruction_access_slb, virt=0 EXC_REAL_END(instruction_access_slb, 0x480, 0x80) EXC_VIRT_BEGIN(instruction_access_slb, 0x4480, 0x80) - INT_HANDLER instruction_access_slb, 0x480, virt=1, area=PACA_EXSLB + GEN_INT_ENTRY instruction_access_slb, virt=1 EXC_VIRT_END(instruction_access_slb, 0x4480, 0x80) -INT_KVM_HANDLER instruction_access_slb, 0x480, EXC_STD, PACA_EXSLB, 0 EXC_COMMON_BEGIN(instruction_access_slb_common) - INT_COMMON 0x480, PACA_EXSLB, 1, 1, 0, 2, 0 - ld r4,_DAR(r1) - addi r3,r1,STACK_FRAME_OVERHEAD + GEN_COMMON instruction_access_slb +#ifdef CONFIG_PPC_64S_HASH_MMU BEGIN_MMU_FTR_SECTION /* HPT case, do SLB fault */ - bl do_slb_fault + addi r3,r1,STACK_INT_FRAME_REGS + bl CFUNC(do_slb_fault) cmpdi r3,0 bne- 1f - b fast_exception_return + b fast_interrupt_return_srr 1: /* Error case */ MMU_FTR_SECTION_ELSE /* Radix case, access is outside page table range */ li r3,-EFAULT ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) +#else + li r3,-EFAULT +#endif std r3,RESULT(r1) - bl save_nvgprs - RECONCILE_IRQ_STATE(r10, r11) - ld r4,_DAR(r1) - ld r5,RESULT(r1) - addi r3,r1,STACK_FRAME_OVERHEAD - bl do_bad_slb_fault - b ret_from_except + addi r3,r1,STACK_INT_FRAME_REGS + bl CFUNC(do_bad_segment_interrupt) + b interrupt_return_srr + + +/** + * Interrupt 0x500 - External Interrupt. + * This is an asynchronous maskable interrupt in response to an "external + * exception" from the interrupt controller or hypervisor (e.g., device + * interrupt). It is maskable in hardware by clearing MSR[EE], and + * soft-maskable with IRQS_DISABLED mask (i.e., local_irq_disable()). + * + * When running in HV mode, Linux sets up the LPCR[LPES] bit such that + * interrupts are delivered with HSRR registers, guests use SRRs, which + * reqiures IHSRR_IF_HVMODE. + * + * On bare metal POWER9 and later, Linux sets the LPCR[HVICE] bit such that + * external interrupts are delivered as Hypervisor Virtualization Interrupts + * rather than External Interrupts. + * + * Handling: + * This calls into Linux IRQ handler. NVGPRs are not saved to reduce overhead, + * because registers at the time of the interrupt are not so important as it is + * asynchronous. + * + * If soft masked, the masked handler will note the pending interrupt for + * replay, and clear MSR[EE] in the interrupted context. + * + * CFAR is not required because this is an asynchronous interrupt that in + * general won't have much bearing on the state of the CPU, with the possible + * exception of crash/debug IPIs, but those are generally moving to use SRESET + * IPIs. Unless this is an HV interrupt and KVM HV is possible, in which case + * it may be exiting the guest and need CFAR to be saved. + */ +INT_DEFINE_BEGIN(hardware_interrupt) + IVEC=0x500 + IHSRR_IF_HVMODE=1 + IMASK=IRQS_DISABLED + IKVM_REAL=1 + IKVM_VIRT=1 + ICFAR=0 +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + ICFAR_IF_HVMODE=1 +#endif +INT_DEFINE_END(hardware_interrupt) EXC_REAL_BEGIN(hardware_interrupt, 0x500, 0x100) - INT_HANDLER hardware_interrupt, 0x500, hsrr=EXC_HV_OR_STD, bitmask=IRQS_DISABLED, kvm=1 + GEN_INT_ENTRY hardware_interrupt, virt=0 EXC_REAL_END(hardware_interrupt, 0x500, 0x100) EXC_VIRT_BEGIN(hardware_interrupt, 0x4500, 0x100) - INT_HANDLER hardware_interrupt, 0x500, virt=1, hsrr=EXC_HV_OR_STD, bitmask=IRQS_DISABLED, kvm=1 + GEN_INT_ENTRY hardware_interrupt, virt=1 EXC_VIRT_END(hardware_interrupt, 0x4500, 0x100) -INT_KVM_HANDLER hardware_interrupt, 0x500, EXC_HV_OR_STD, PACA_EXGEN, 0 -EXC_COMMON_ASYNC(hardware_interrupt_common, 0x500, do_IRQ) +EXC_COMMON_BEGIN(hardware_interrupt_common) + GEN_COMMON hardware_interrupt + addi r3,r1,STACK_INT_FRAME_REGS + bl CFUNC(do_IRQ) + BEGIN_FTR_SECTION + b interrupt_return_hsrr + FTR_SECTION_ELSE + b interrupt_return_srr + ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) +/** + * Interrupt 0x600 - Alignment Interrupt + * This is a synchronous interrupt in response to data alignment fault. + */ +INT_DEFINE_BEGIN(alignment) + IVEC=0x600 + IDAR=1 + IDSISR=1 +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE + IKVM_REAL=1 +#endif +INT_DEFINE_END(alignment) + EXC_REAL_BEGIN(alignment, 0x600, 0x100) - INT_HANDLER alignment, 0x600, dar=1, dsisr=1, kvm=1 + GEN_INT_ENTRY alignment, virt=0 EXC_REAL_END(alignment, 0x600, 0x100) EXC_VIRT_BEGIN(alignment, 0x4600, 0x100) - INT_HANDLER alignment, 0x600, virt=1, dar=1, dsisr=1 + GEN_INT_ENTRY alignment, virt=1 EXC_VIRT_END(alignment, 0x4600, 0x100) -INT_KVM_HANDLER alignment, 0x600, EXC_STD, PACA_EXGEN, 0 EXC_COMMON_BEGIN(alignment_common) - INT_COMMON 0x600, PACA_EXGEN, 1, 1, 1, 1, 1 - bl save_nvgprs - addi r3,r1,STACK_FRAME_OVERHEAD - bl alignment_exception - b ret_from_except + GEN_COMMON alignment + addi r3,r1,STACK_INT_FRAME_REGS + bl CFUNC(alignment_exception) + HANDLER_RESTORE_NVGPRS() /* instruction emulation may change GPRs */ + b interrupt_return_srr +/** + * Interrupt 0x700 - Program Interrupt (program check). + * This is a synchronous interrupt in response to various instruction faults: + * traps, privilege errors, TM errors, floating point exceptions. + * + * Handling: + * This interrupt may use the "emergency stack" in some cases when being taken + * from kernel context, which complicates handling. + */ +INT_DEFINE_BEGIN(program_check) + IVEC=0x700 +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE + IKVM_REAL=1 +#endif +INT_DEFINE_END(program_check) + EXC_REAL_BEGIN(program_check, 0x700, 0x100) - INT_HANDLER program_check, 0x700, kvm=1 + EARLY_BOOT_FIXUP + GEN_INT_ENTRY program_check, virt=0 EXC_REAL_END(program_check, 0x700, 0x100) EXC_VIRT_BEGIN(program_check, 0x4700, 0x100) - INT_HANDLER program_check, 0x700, virt=1 + GEN_INT_ENTRY program_check, virt=1 EXC_VIRT_END(program_check, 0x4700, 0x100) -INT_KVM_HANDLER program_check, 0x700, EXC_STD, PACA_EXGEN, 0 EXC_COMMON_BEGIN(program_check_common) + __GEN_COMMON_ENTRY program_check + /* * It's possible to receive a TM Bad Thing type program check with * userspace register values (in particular r1), but with SRR1 reporting @@ -1296,45 +1720,64 @@ EXC_COMMON_BEGIN(program_check_common) */ andi. r10,r12,MSR_PR - bne 2f /* If userspace, go normal path */ + bne .Lnormal_stack /* If userspace, go normal path */ andis. r10,r12,(SRR1_PROGTM)@h - bne 1f /* If TM, emergency */ + bne .Lemergency_stack /* If TM, emergency */ cmpdi r1,-INT_FRAME_SIZE /* check if r1 is in userspace */ - blt 2f /* normal path if not */ + blt .Lnormal_stack /* normal path if not */ /* Use the emergency stack */ -1: andi. r10,r12,MSR_PR /* Set CR0 correctly for label */ +.Lemergency_stack: + andi. r10,r12,MSR_PR /* Set CR0 correctly for label */ /* 3 in EXCEPTION_PROLOG_COMMON */ mr r10,r1 /* Save r1 */ ld r1,PACAEMERGSP(r13) /* Use emergency stack */ subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */ - INT_COMMON 0x700, PACA_EXGEN, 0, 1, 1, 0, 0 - b 3f -2: - INT_COMMON 0x700, PACA_EXGEN, 1, 1, 1, 0, 0 -3: - bl save_nvgprs - addi r3,r1,STACK_FRAME_OVERHEAD - bl program_check_exception - b ret_from_except + __ISTACK(program_check)=0 + __GEN_COMMON_BODY program_check + b .Ldo_program_check + +.Lnormal_stack: + __ISTACK(program_check)=1 + __GEN_COMMON_BODY program_check + +.Ldo_program_check: + addi r3,r1,STACK_INT_FRAME_REGS + bl CFUNC(program_check_exception) + HANDLER_RESTORE_NVGPRS() /* instruction emulation may change GPRs */ + b interrupt_return_srr +/* + * Interrupt 0x800 - Floating-Point Unavailable Interrupt. + * This is a synchronous interrupt in response to executing an fp instruction + * with MSR[FP]=0. + * + * Handling: + * This will load FP registers and enable the FP bit if coming from userspace, + * otherwise report a bad kernel use of FP. + */ +INT_DEFINE_BEGIN(fp_unavailable) + IVEC=0x800 +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE + IKVM_REAL=1 +#endif + IMSR_R12=1 +INT_DEFINE_END(fp_unavailable) + EXC_REAL_BEGIN(fp_unavailable, 0x800, 0x100) - INT_HANDLER fp_unavailable, 0x800, kvm=1 + GEN_INT_ENTRY fp_unavailable, virt=0 EXC_REAL_END(fp_unavailable, 0x800, 0x100) EXC_VIRT_BEGIN(fp_unavailable, 0x4800, 0x100) - INT_HANDLER fp_unavailable, 0x800, virt=1 + GEN_INT_ENTRY fp_unavailable, virt=1 EXC_VIRT_END(fp_unavailable, 0x4800, 0x100) -INT_KVM_HANDLER fp_unavailable, 0x800, EXC_STD, PACA_EXGEN, 0 EXC_COMMON_BEGIN(fp_unavailable_common) - INT_COMMON 0x800, PACA_EXGEN, 1, 1, 0, 0, 0 + GEN_COMMON fp_unavailable bne 1f /* if from user, just load it up */ - bl save_nvgprs - RECONCILE_IRQ_STATE(r10, r11) - addi r3,r1,STACK_FRAME_OVERHEAD - bl kernel_fp_unavailable_exception + addi r3,r1,STACK_INT_FRAME_REGS + bl CFUNC(kernel_fp_unavailable_exception) 0: trap EMIT_BUG_ENTRY 0b, __FILE__, __LINE__, 0 1: @@ -1347,89 +1790,183 @@ BEGIN_FTR_SECTION bne- 2f END_FTR_SECTION_IFSET(CPU_FTR_TM) #endif - bl load_up_fpu - b fast_exception_return + bl CFUNC(load_up_fpu) + b fast_interrupt_return_srr #ifdef CONFIG_PPC_TRANSACTIONAL_MEM 2: /* User process was in a transaction */ - bl save_nvgprs - RECONCILE_IRQ_STATE(r10, r11) - addi r3,r1,STACK_FRAME_OVERHEAD - bl fp_unavailable_tm - b ret_from_except + addi r3,r1,STACK_INT_FRAME_REGS + bl CFUNC(fp_unavailable_tm) + b interrupt_return_srr #endif +/** + * Interrupt 0x900 - Decrementer Interrupt. + * This is an asynchronous interrupt in response to a decrementer exception + * (e.g., DEC has wrapped below zero). It is maskable in hardware by clearing + * MSR[EE], and soft-maskable with IRQS_DISABLED mask (i.e., + * local_irq_disable()). + * + * Handling: + * This calls into Linux timer handler. NVGPRs are not saved (see 0x500). + * + * If soft masked, the masked handler will note the pending interrupt for + * replay, and bump the decrementer to a high value, leaving MSR[EE] enabled + * in the interrupted context. + * If PPC_WATCHDOG is configured, the soft masked handler will actually set + * things back up to run soft_nmi_interrupt as a regular interrupt handler + * on the emergency stack. + * + * CFAR is not required because this is asynchronous (see hardware_interrupt). + * A watchdog interrupt may like to have CFAR, but usually the interesting + * branch is long gone by that point (e.g., infinite loop). + */ +INT_DEFINE_BEGIN(decrementer) + IVEC=0x900 + IMASK=IRQS_DISABLED +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE + IKVM_REAL=1 +#endif + ICFAR=0 +INT_DEFINE_END(decrementer) + EXC_REAL_BEGIN(decrementer, 0x900, 0x80) - INT_HANDLER decrementer, 0x900, ool=1, bitmask=IRQS_DISABLED, kvm=1 + GEN_INT_ENTRY decrementer, virt=0 EXC_REAL_END(decrementer, 0x900, 0x80) EXC_VIRT_BEGIN(decrementer, 0x4900, 0x80) - INT_HANDLER decrementer, 0x900, virt=1, bitmask=IRQS_DISABLED + GEN_INT_ENTRY decrementer, virt=1 EXC_VIRT_END(decrementer, 0x4900, 0x80) -INT_KVM_HANDLER decrementer, 0x900, EXC_STD, PACA_EXGEN, 0 -EXC_COMMON_ASYNC(decrementer_common, 0x900, timer_interrupt) +EXC_COMMON_BEGIN(decrementer_common) + GEN_COMMON decrementer + addi r3,r1,STACK_INT_FRAME_REGS + bl CFUNC(timer_interrupt) + b interrupt_return_srr +/** + * Interrupt 0x980 - Hypervisor Decrementer Interrupt. + * This is an asynchronous interrupt, similar to 0x900 but for the HDEC + * register. + * + * Handling: + * Linux does not use this outside KVM where it's used to keep a host timer + * while the guest is given control of DEC. It should normally be caught by + * the KVM test and routed there. + */ +INT_DEFINE_BEGIN(hdecrementer) + IVEC=0x980 + IHSRR=1 + ISTACK=0 + IKVM_REAL=1 + IKVM_VIRT=1 +INT_DEFINE_END(hdecrementer) + EXC_REAL_BEGIN(hdecrementer, 0x980, 0x80) - INT_HANDLER hdecrementer, 0x980, hsrr=EXC_HV, kvm=1 + GEN_INT_ENTRY hdecrementer, virt=0 EXC_REAL_END(hdecrementer, 0x980, 0x80) EXC_VIRT_BEGIN(hdecrementer, 0x4980, 0x80) - INT_HANDLER hdecrementer, 0x980, virt=1, hsrr=EXC_HV, kvm=1 + GEN_INT_ENTRY hdecrementer, virt=1 EXC_VIRT_END(hdecrementer, 0x4980, 0x80) -INT_KVM_HANDLER hdecrementer, 0x980, EXC_HV, PACA_EXGEN, 0 -EXC_COMMON(hdecrementer_common, 0x980, hdec_interrupt) +EXC_COMMON_BEGIN(hdecrementer_common) + __GEN_COMMON_ENTRY hdecrementer + /* + * Hypervisor decrementer interrupts not caught by the KVM test + * shouldn't occur but are sometimes left pending on exit from a KVM + * guest. We don't need to do anything to clear them, as they are + * edge-triggered. + * + * Be careful to avoid touching the kernel stack. + */ + li r10,0 + stb r10,PACAHSRR_VALID(r13) + ld r10,PACA_EXGEN+EX_CTR(r13) + mtctr r10 + mtcrf 0x80,r9 + ld r9,PACA_EXGEN+EX_R9(r13) + ld r10,PACA_EXGEN+EX_R10(r13) + ld r11,PACA_EXGEN+EX_R11(r13) + ld r12,PACA_EXGEN+EX_R12(r13) + ld r13,PACA_EXGEN+EX_R13(r13) + HRFI_TO_KERNEL +/** + * Interrupt 0xa00 - Directed Privileged Doorbell Interrupt. + * This is an asynchronous interrupt in response to a msgsndp doorbell. + * It is maskable in hardware by clearing MSR[EE], and soft-maskable with + * IRQS_DISABLED mask (i.e., local_irq_disable()). + * + * Handling: + * Guests may use this for IPIs between threads in a core if the + * hypervisor supports it. NVGPRS are not saved (see 0x500). + * + * If soft masked, the masked handler will note the pending interrupt for + * replay, leaving MSR[EE] enabled in the interrupted context because the + * doorbells are edge triggered. + * + * CFAR is not required, similarly to hardware_interrupt. + */ +INT_DEFINE_BEGIN(doorbell_super) + IVEC=0xa00 + IMASK=IRQS_DISABLED +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE + IKVM_REAL=1 +#endif + ICFAR=0 +INT_DEFINE_END(doorbell_super) + EXC_REAL_BEGIN(doorbell_super, 0xa00, 0x100) - INT_HANDLER doorbell_super, 0xa00, bitmask=IRQS_DISABLED, kvm=1 + GEN_INT_ENTRY doorbell_super, virt=0 EXC_REAL_END(doorbell_super, 0xa00, 0x100) EXC_VIRT_BEGIN(doorbell_super, 0x4a00, 0x100) - INT_HANDLER doorbell_super, 0xa00, virt=1, bitmask=IRQS_DISABLED + GEN_INT_ENTRY doorbell_super, virt=1 EXC_VIRT_END(doorbell_super, 0x4a00, 0x100) -INT_KVM_HANDLER doorbell_super, 0xa00, EXC_STD, PACA_EXGEN, 0 +EXC_COMMON_BEGIN(doorbell_super_common) + GEN_COMMON doorbell_super + addi r3,r1,STACK_INT_FRAME_REGS #ifdef CONFIG_PPC_DOORBELL -EXC_COMMON_ASYNC(doorbell_super_common, 0xa00, doorbell_exception) + bl CFUNC(doorbell_exception) #else -EXC_COMMON_ASYNC(doorbell_super_common, 0xa00, unknown_exception) + bl CFUNC(unknown_async_exception) #endif + b interrupt_return_srr EXC_REAL_NONE(0xb00, 0x100) EXC_VIRT_NONE(0x4b00, 0x100) -/* - * system call / hypercall (0xc00, 0x4c00) - * - * The system call exception is invoked with "sc 0" and does not alter HV bit. - * - * The hypercall is invoked with "sc 1" and sets HV=1. +/** + * Interrupt 0xc00 - System Call Interrupt (syscall, hcall). + * This is a synchronous interrupt invoked with the "sc" instruction. The + * system call is invoked with "sc 0" and does not alter the HV bit, so it + * is directed to the currently running OS. The hypercall is invoked with + * "sc 1" and it sets HV=1, so it elevates to hypervisor. * * In HPT, sc 1 always goes to 0xc00 real mode. In RADIX, sc 1 can go to * 0x4c00 virtual mode. * + * Handling: + * If the KVM test fires then it was due to a hypercall and is accordingly + * routed to KVM. Otherwise this executes a normal Linux system call. + * * Call convention: * - * syscall register convention is in Documentation/powerpc/syscall64-abi.rst - * - * For hypercalls, the register convention is as follows: - * r0 volatile - * r1-2 nonvolatile - * r3 volatile parameter and return value for status - * r4-r10 volatile input and output value - * r11 volatile hypercall number and output value - * r12 volatile input and output value - * r13-r31 nonvolatile - * LR nonvolatile - * CTR volatile - * XER volatile - * CR0-1 CR5-7 volatile - * CR2-4 nonvolatile - * Other registers nonvolatile + * syscall and hypercalls register conventions are documented in + * Documentation/arch/powerpc/syscall64-abi.rst and + * Documentation/arch/powerpc/papr_hcalls.rst respectively. * * The intersection of volatile registers that don't contain possible * inputs is: cr0, xer, ctr. We may use these as scratch regs upon entry * without saving, though xer is not a good idea to use, as hardware may * interpret some bits so it may be costly to change them. */ +INT_DEFINE_BEGIN(system_call) + IVEC=0xc00 + IKVM_REAL=1 + IKVM_VIRT=1 + ICFAR=0 +INT_DEFINE_END(system_call) + .macro SYSTEM_CALL virt #ifdef CONFIG_KVM_BOOK3S_64_HANDLER /* @@ -1444,7 +1981,7 @@ EXC_VIRT_NONE(0x4b00, 0x100) GET_PACA(r13) std r10,PACA_EXGEN+EX_R10(r13) INTERRUPT_TO_KERNEL - KVMTEST system_call EXC_STD 0xc00 /* uses r10, branch to system_call_kvm */ + KVMTEST system_call kvm_hcall /* uses r10, branch to kvm_hcall */ mfctr r9 #else mr r9,r13 @@ -1466,17 +2003,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) HMT_MEDIUM .if ! \virt - __LOAD_HANDLER(r10, system_call_common) - mtspr SPRN_SRR0,r10 - ld r10,PACAKMSR(r13) - mtspr SPRN_SRR1,r10 - RFI_TO_KERNEL - b . /* prevent speculative execution */ + __LOAD_HANDLER(r10, system_call_common_real, real_vectors) + mtctr r10 + bctr .else - li r10,MSR_RI - mtmsrd r10,1 /* Set RI (EE=0) */ #ifdef CONFIG_RELOCATABLE - __LOAD_HANDLER(r10, system_call_common) + __LOAD_HANDLER(r10, system_call_common, virt_vectors) mtctr r10 bctr #else @@ -1503,108 +2035,213 @@ EXC_VIRT_BEGIN(system_call, 0x4c00, 0x100) EXC_VIRT_END(system_call, 0x4c00, 0x100) #ifdef CONFIG_KVM_BOOK3S_64_HANDLER - /* - * This is a hcall, so register convention is as above, with these - * differences: - * r13 = PACA - * ctr = orig r13 - * orig r10 saved in PACA - */ -TRAMP_KVM_BEGIN(system_call_kvm) +TRAMP_REAL_BEGIN(kvm_hcall) + std r9,PACA_EXGEN+EX_R9(r13) + std r11,PACA_EXGEN+EX_R11(r13) + std r12,PACA_EXGEN+EX_R12(r13) + mfcr r9 + mfctr r10 + std r10,PACA_EXGEN+EX_R13(r13) + li r10,0 + std r10,PACA_EXGEN+EX_CFAR(r13) + std r10,PACA_EXGEN+EX_CTR(r13) /* * Save the PPR (on systems that support it) before changing to * HMT_MEDIUM. That allows the KVM code to save that value into the * guest state (it is the guest's PPR value). */ - OPT_GET_SPR(r10, SPRN_PPR, CPU_FTR_HAS_PPR) +BEGIN_FTR_SECTION + mfspr r10,SPRN_PPR + std r10,PACA_EXGEN+EX_PPR(r13) +END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) + HMT_MEDIUM - OPT_SAVE_REG_TO_PACA(PACA_EXGEN+EX_PPR, r10, CPU_FTR_HAS_PPR) - mfctr r10 - SET_SCRATCH0(r10) - std r9,PACA_EXGEN+EX_R9(r13) - mfcr r9 - KVM_HANDLER 0xc00, EXC_STD, PACA_EXGEN, 0 + +#ifdef CONFIG_RELOCATABLE + /* + * Requires __LOAD_FAR_HANDLER beause kvmppc_hcall lives + * outside the head section. + */ + __LOAD_FAR_HANDLER(r10, kvmppc_hcall, real_trampolines) + mtctr r10 + bctr +#else + b kvmppc_hcall +#endif #endif +/** + * Interrupt 0xd00 - Trace Interrupt. + * This is a synchronous interrupt in response to instruction step or + * breakpoint faults. + */ +INT_DEFINE_BEGIN(single_step) + IVEC=0xd00 +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE + IKVM_REAL=1 +#endif +INT_DEFINE_END(single_step) EXC_REAL_BEGIN(single_step, 0xd00, 0x100) - INT_HANDLER single_step, 0xd00, kvm=1 + GEN_INT_ENTRY single_step, virt=0 EXC_REAL_END(single_step, 0xd00, 0x100) EXC_VIRT_BEGIN(single_step, 0x4d00, 0x100) - INT_HANDLER single_step, 0xd00, virt=1 + GEN_INT_ENTRY single_step, virt=1 EXC_VIRT_END(single_step, 0x4d00, 0x100) -INT_KVM_HANDLER single_step, 0xd00, EXC_STD, PACA_EXGEN, 0 -EXC_COMMON(single_step_common, 0xd00, single_step_exception) +EXC_COMMON_BEGIN(single_step_common) + GEN_COMMON single_step + addi r3,r1,STACK_INT_FRAME_REGS + bl CFUNC(single_step_exception) + b interrupt_return_srr +/** + * Interrupt 0xe00 - Hypervisor Data Storage Interrupt (HDSI). + * This is a synchronous interrupt in response to an MMU fault caused by a + * guest data access. + * + * Handling: + * This should always get routed to KVM. In radix MMU mode, this is caused + * by a guest nested radix access that can't be performed due to the + * partition scope page table. In hash mode, this can be caused by guests + * running with translation disabled (virtual real mode) or with VPM enabled. + * KVM will update the page table structures or disallow the access. + */ +INT_DEFINE_BEGIN(h_data_storage) + IVEC=0xe00 + IHSRR=1 + IDAR=1 + IDSISR=1 + IKVM_REAL=1 + IKVM_VIRT=1 +INT_DEFINE_END(h_data_storage) + EXC_REAL_BEGIN(h_data_storage, 0xe00, 0x20) - INT_HANDLER h_data_storage, 0xe00, ool=1, hsrr=EXC_HV, dar=1, dsisr=1, kvm=1 + GEN_INT_ENTRY h_data_storage, virt=0, ool=1 EXC_REAL_END(h_data_storage, 0xe00, 0x20) EXC_VIRT_BEGIN(h_data_storage, 0x4e00, 0x20) - INT_HANDLER h_data_storage, 0xe00, ool=1, virt=1, hsrr=EXC_HV, dar=1, dsisr=1, kvm=1 + GEN_INT_ENTRY h_data_storage, virt=1, ool=1 EXC_VIRT_END(h_data_storage, 0x4e00, 0x20) -INT_KVM_HANDLER h_data_storage, 0xe00, EXC_HV, PACA_EXGEN, 1 EXC_COMMON_BEGIN(h_data_storage_common) - INT_COMMON 0xe00, PACA_EXGEN, 1, 1, 1, 1, 1 - bl save_nvgprs - addi r3,r1,STACK_FRAME_OVERHEAD + GEN_COMMON h_data_storage + addi r3,r1,STACK_INT_FRAME_REGS BEGIN_MMU_FTR_SECTION - ld r4,_DAR(r1) - li r5,SIGSEGV - bl bad_page_fault + bl CFUNC(do_bad_page_fault_segv) MMU_FTR_SECTION_ELSE - bl unknown_exception + bl CFUNC(unknown_exception) ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_TYPE_RADIX) - b ret_from_except + b interrupt_return_hsrr +/** + * Interrupt 0xe20 - Hypervisor Instruction Storage Interrupt (HISI). + * This is a synchronous interrupt in response to an MMU fault caused by a + * guest instruction fetch, similar to HDSI. + */ +INT_DEFINE_BEGIN(h_instr_storage) + IVEC=0xe20 + IHSRR=1 + IKVM_REAL=1 + IKVM_VIRT=1 +INT_DEFINE_END(h_instr_storage) + EXC_REAL_BEGIN(h_instr_storage, 0xe20, 0x20) - INT_HANDLER h_instr_storage, 0xe20, ool=1, hsrr=EXC_HV, kvm=1 + GEN_INT_ENTRY h_instr_storage, virt=0, ool=1 EXC_REAL_END(h_instr_storage, 0xe20, 0x20) EXC_VIRT_BEGIN(h_instr_storage, 0x4e20, 0x20) - INT_HANDLER h_instr_storage, 0xe20, ool=1, virt=1, hsrr=EXC_HV, kvm=1 + GEN_INT_ENTRY h_instr_storage, virt=1, ool=1 EXC_VIRT_END(h_instr_storage, 0x4e20, 0x20) -INT_KVM_HANDLER h_instr_storage, 0xe20, EXC_HV, PACA_EXGEN, 0 -EXC_COMMON(h_instr_storage_common, 0xe20, unknown_exception) +EXC_COMMON_BEGIN(h_instr_storage_common) + GEN_COMMON h_instr_storage + addi r3,r1,STACK_INT_FRAME_REGS + bl CFUNC(unknown_exception) + b interrupt_return_hsrr +/** + * Interrupt 0xe40 - Hypervisor Emulation Assistance Interrupt. + */ +INT_DEFINE_BEGIN(emulation_assist) + IVEC=0xe40 + IHSRR=1 + IKVM_REAL=1 + IKVM_VIRT=1 +INT_DEFINE_END(emulation_assist) + EXC_REAL_BEGIN(emulation_assist, 0xe40, 0x20) - INT_HANDLER emulation_assist, 0xe40, ool=1, hsrr=EXC_HV, kvm=1 + GEN_INT_ENTRY emulation_assist, virt=0, ool=1 EXC_REAL_END(emulation_assist, 0xe40, 0x20) EXC_VIRT_BEGIN(emulation_assist, 0x4e40, 0x20) - INT_HANDLER emulation_assist, 0xe40, ool=1, virt=1, hsrr=EXC_HV, kvm=1 + GEN_INT_ENTRY emulation_assist, virt=1, ool=1 EXC_VIRT_END(emulation_assist, 0x4e40, 0x20) -INT_KVM_HANDLER emulation_assist, 0xe40, EXC_HV, PACA_EXGEN, 0 -EXC_COMMON(emulation_assist_common, 0xe40, emulation_assist_interrupt) - - -/* - * hmi_exception trampoline is a special case. It jumps to hmi_exception_early - * first, and then eventaully from there to the trampoline to get into virtual - * mode. +EXC_COMMON_BEGIN(emulation_assist_common) + GEN_COMMON emulation_assist + addi r3,r1,STACK_INT_FRAME_REGS + bl CFUNC(emulation_assist_interrupt) + HANDLER_RESTORE_NVGPRS() /* instruction emulation may change GPRs */ + b interrupt_return_hsrr + + +/** + * Interrupt 0xe60 - Hypervisor Maintenance Interrupt (HMI). + * This is an asynchronous interrupt caused by a Hypervisor Maintenance + * Exception. It is always taken in real mode but uses HSRR registers + * unlike SRESET and MCE. + * + * It is maskable in hardware by clearing MSR[EE], and partially soft-maskable + * with IRQS_DISABLED mask (i.e., local_irq_disable()). + * + * Handling: + * This is a special case, this is handled similarly to machine checks, with an + * initial real mode handler that is not soft-masked, which attempts to fix the + * problem. Then a regular handler which is soft-maskable and reports the + * problem. + * + * The emergency stack is used for the early real mode handler. + * + * XXX: unclear why MCE and HMI schemes could not be made common, e.g., + * either use soft-masking for the MCE, or use irq_work for the HMI. + * + * KVM: + * Unlike MCE, this calls into KVM without calling the real mode handler + * first. */ +INT_DEFINE_BEGIN(hmi_exception_early) + IVEC=0xe60 + IHSRR=1 + IREALMODE_COMMON=1 + ISTACK=0 + IKUAP=0 /* We don't touch AMR here, we never go to virtual mode */ + IKVM_REAL=1 +INT_DEFINE_END(hmi_exception_early) + +INT_DEFINE_BEGIN(hmi_exception) + IVEC=0xe60 + IHSRR=1 + IMASK=IRQS_DISABLED + IKVM_REAL=1 +INT_DEFINE_END(hmi_exception) + EXC_REAL_BEGIN(hmi_exception, 0xe60, 0x20) - INT_HANDLER hmi_exception, 0xe60, ool=1, early=1, hsrr=EXC_HV, ri=0, kvm=1 + GEN_INT_ENTRY hmi_exception_early, virt=0, ool=1 EXC_REAL_END(hmi_exception, 0xe60, 0x20) EXC_VIRT_NONE(0x4e60, 0x20) -INT_KVM_HANDLER hmi_exception, 0xe60, EXC_HV, PACA_EXGEN, 0 + EXC_COMMON_BEGIN(hmi_exception_early_common) - mtctr r10 /* Restore ctr */ - mfspr r11,SPRN_HSRR0 /* Save HSRR0 */ - mfspr r12,SPRN_HSRR1 /* Save HSRR1 */ + __GEN_REALMODE_COMMON_ENTRY hmi_exception_early + mr r10,r1 /* Save r1 */ ld r1,PACAEMERGSP(r13) /* Use emergency stack for realmode */ subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */ - /* We don't touch AMR here, we never go to virtual mode */ - INT_COMMON 0xe60, PACA_EXGEN, 0, 0, 0, 0, 0 + __GEN_COMMON_BODY hmi_exception_early - addi r3,r1,STACK_FRAME_OVERHEAD - bl hmi_exception_realmode + addi r3,r1,STACK_INT_FRAME_REGS + bl CFUNC(hmi_exception_realmode) cmpdi cr0,r3,0 bne 1f - EXCEPTION_RESTORE_REGS EXC_HV + EXCEPTION_RESTORE_REGS hsrr=1 HRFI_TO_USER_OR_KERNEL 1: @@ -1612,41 +2249,84 @@ EXC_COMMON_BEGIN(hmi_exception_early_common) * Go to virtual mode and pull the HMI event information from * firmware. */ - EXCEPTION_RESTORE_REGS EXC_HV - INT_HANDLER hmi_exception, 0xe60, hsrr=EXC_HV, bitmask=IRQS_DISABLED, kvm=1 + EXCEPTION_RESTORE_REGS hsrr=1 + GEN_INT_ENTRY hmi_exception, virt=0 EXC_COMMON_BEGIN(hmi_exception_common) - INT_COMMON 0xe60, PACA_EXGEN, 1, 1, 1, 0, 0 - FINISH_NAP - RUNLATCH_ON - bl save_nvgprs - addi r3,r1,STACK_FRAME_OVERHEAD - bl handle_hmi_exception - b ret_from_except + GEN_COMMON hmi_exception + addi r3,r1,STACK_INT_FRAME_REGS + bl CFUNC(handle_hmi_exception) + b interrupt_return_hsrr + +/** + * Interrupt 0xe80 - Directed Hypervisor Doorbell Interrupt. + * This is an asynchronous interrupt in response to a msgsnd doorbell. + * Similar to the 0xa00 doorbell but for host rather than guest. + * + * CFAR is not required (similar to doorbell_interrupt), unless KVM HV + * is enabled, in which case it may be a guest exit. Most PowerNV kernels + * include KVM support so it would be nice if this could be dynamically + * patched out if KVM was not currently running any guests. + */ +INT_DEFINE_BEGIN(h_doorbell) + IVEC=0xe80 + IHSRR=1 + IMASK=IRQS_DISABLED + IKVM_REAL=1 + IKVM_VIRT=1 +#ifndef CONFIG_KVM_BOOK3S_HV_POSSIBLE + ICFAR=0 +#endif +INT_DEFINE_END(h_doorbell) EXC_REAL_BEGIN(h_doorbell, 0xe80, 0x20) - INT_HANDLER h_doorbell, 0xe80, ool=1, hsrr=EXC_HV, bitmask=IRQS_DISABLED, kvm=1 + GEN_INT_ENTRY h_doorbell, virt=0, ool=1 EXC_REAL_END(h_doorbell, 0xe80, 0x20) EXC_VIRT_BEGIN(h_doorbell, 0x4e80, 0x20) - INT_HANDLER h_doorbell, 0xe80, ool=1, virt=1, hsrr=EXC_HV, bitmask=IRQS_DISABLED, kvm=1 + GEN_INT_ENTRY h_doorbell, virt=1, ool=1 EXC_VIRT_END(h_doorbell, 0x4e80, 0x20) -INT_KVM_HANDLER h_doorbell, 0xe80, EXC_HV, PACA_EXGEN, 0 +EXC_COMMON_BEGIN(h_doorbell_common) + GEN_COMMON h_doorbell + addi r3,r1,STACK_INT_FRAME_REGS #ifdef CONFIG_PPC_DOORBELL -EXC_COMMON_ASYNC(h_doorbell_common, 0xe80, doorbell_exception) + bl CFUNC(doorbell_exception) #else -EXC_COMMON_ASYNC(h_doorbell_common, 0xe80, unknown_exception) + bl CFUNC(unknown_async_exception) #endif + b interrupt_return_hsrr + +/** + * Interrupt 0xea0 - Hypervisor Virtualization Interrupt. + * This is an asynchronous interrupt in response to an "external exception". + * Similar to 0x500 but for host only. + * + * Like h_doorbell, CFAR is only required for KVM HV because this can be + * a guest exit. + */ +INT_DEFINE_BEGIN(h_virt_irq) + IVEC=0xea0 + IHSRR=1 + IMASK=IRQS_DISABLED + IKVM_REAL=1 + IKVM_VIRT=1 +#ifndef CONFIG_KVM_BOOK3S_HV_POSSIBLE + ICFAR=0 +#endif +INT_DEFINE_END(h_virt_irq) EXC_REAL_BEGIN(h_virt_irq, 0xea0, 0x20) - INT_HANDLER h_virt_irq, 0xea0, ool=1, hsrr=EXC_HV, bitmask=IRQS_DISABLED, kvm=1 + GEN_INT_ENTRY h_virt_irq, virt=0, ool=1 EXC_REAL_END(h_virt_irq, 0xea0, 0x20) EXC_VIRT_BEGIN(h_virt_irq, 0x4ea0, 0x20) - INT_HANDLER h_virt_irq, 0xea0, ool=1, virt=1, hsrr=EXC_HV, bitmask=IRQS_DISABLED, kvm=1 + GEN_INT_ENTRY h_virt_irq, virt=1, ool=1 EXC_VIRT_END(h_virt_irq, 0x4ea0, 0x20) -INT_KVM_HANDLER h_virt_irq, 0xea0, EXC_HV, PACA_EXGEN, 0 -EXC_COMMON_ASYNC(h_virt_irq_common, 0xea0, do_IRQ) +EXC_COMMON_BEGIN(h_virt_irq_common) + GEN_COMMON h_virt_irq + addi r3,r1,STACK_INT_FRAME_REGS + bl CFUNC(do_IRQ) + b interrupt_return_hsrr EXC_REAL_NONE(0xec0, 0x20) @@ -1655,25 +2335,80 @@ EXC_REAL_NONE(0xee0, 0x20) EXC_VIRT_NONE(0x4ee0, 0x20) +/* + * Interrupt 0xf00 - Performance Monitor Interrupt (PMI, PMU). + * This is an asynchronous interrupt in response to a PMU exception. + * It is maskable in hardware by clearing MSR[EE], and soft-maskable with + * IRQS_PMI_DISABLED mask (NOTE: NOT local_irq_disable()). + * + * Handling: + * This calls into the perf subsystem. + * + * Like the watchdog soft-nmi, it appears an NMI interrupt to Linux, in that it + * runs under local_irq_disable. However it may be soft-masked in + * powerpc-specific code. + * + * If soft masked, the masked handler will note the pending interrupt for + * replay, and clear MSR[EE] in the interrupted context. + * + * CFAR is not used by perf interrupts so not required. + */ +INT_DEFINE_BEGIN(performance_monitor) + IVEC=0xf00 + IMASK=IRQS_PMI_DISABLED +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE + IKVM_REAL=1 +#endif + ICFAR=0 +INT_DEFINE_END(performance_monitor) + EXC_REAL_BEGIN(performance_monitor, 0xf00, 0x20) - INT_HANDLER performance_monitor, 0xf00, ool=1, bitmask=IRQS_PMI_DISABLED, kvm=1 + GEN_INT_ENTRY performance_monitor, virt=0, ool=1 EXC_REAL_END(performance_monitor, 0xf00, 0x20) EXC_VIRT_BEGIN(performance_monitor, 0x4f00, 0x20) - INT_HANDLER performance_monitor, 0xf00, ool=1, virt=1, bitmask=IRQS_PMI_DISABLED + GEN_INT_ENTRY performance_monitor, virt=1, ool=1 EXC_VIRT_END(performance_monitor, 0x4f00, 0x20) -INT_KVM_HANDLER performance_monitor, 0xf00, EXC_STD, PACA_EXGEN, 0 -EXC_COMMON_ASYNC(performance_monitor_common, 0xf00, performance_monitor_exception) +EXC_COMMON_BEGIN(performance_monitor_common) + GEN_COMMON performance_monitor + addi r3,r1,STACK_INT_FRAME_REGS + lbz r4,PACAIRQSOFTMASK(r13) + cmpdi r4,IRQS_ENABLED + bne 1f + bl CFUNC(performance_monitor_exception_async) + b interrupt_return_srr +1: + bl CFUNC(performance_monitor_exception_nmi) + /* Clear MSR_RI before setting SRR0 and SRR1. */ + li r9,0 + mtmsrd r9,1 + + kuap_kernel_restore r9, r10 + EXCEPTION_RESTORE_REGS hsrr=0 + RFI_TO_KERNEL + +/** + * Interrupt 0xf20 - Vector Unavailable Interrupt. + * This is a synchronous interrupt in response to + * executing a vector (or altivec) instruction with MSR[VEC]=0. + * Similar to FP unavailable. + */ +INT_DEFINE_BEGIN(altivec_unavailable) + IVEC=0xf20 +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE + IKVM_REAL=1 +#endif + IMSR_R12=1 +INT_DEFINE_END(altivec_unavailable) EXC_REAL_BEGIN(altivec_unavailable, 0xf20, 0x20) - INT_HANDLER altivec_unavailable, 0xf20, ool=1, kvm=1 + GEN_INT_ENTRY altivec_unavailable, virt=0, ool=1 EXC_REAL_END(altivec_unavailable, 0xf20, 0x20) EXC_VIRT_BEGIN(altivec_unavailable, 0x4f20, 0x20) - INT_HANDLER altivec_unavailable, 0xf20, ool=1, virt=1 + GEN_INT_ENTRY altivec_unavailable, virt=1, ool=1 EXC_VIRT_END(altivec_unavailable, 0x4f20, 0x20) -INT_KVM_HANDLER altivec_unavailable, 0xf20, EXC_STD, PACA_EXGEN, 0 EXC_COMMON_BEGIN(altivec_unavailable_common) - INT_COMMON 0xf20, PACA_EXGEN, 1, 1, 0, 0, 0 + GEN_COMMON altivec_unavailable #ifdef CONFIG_ALTIVEC BEGIN_FTR_SECTION beq 1f @@ -1686,35 +2421,44 @@ BEGIN_FTR_SECTION bne- 2f END_FTR_SECTION_NESTED(CPU_FTR_TM, CPU_FTR_TM, 69) #endif - bl load_up_altivec - b fast_exception_return + bl CFUNC(load_up_altivec) + b fast_interrupt_return_srr #ifdef CONFIG_PPC_TRANSACTIONAL_MEM 2: /* User process was in a transaction */ - bl save_nvgprs - RECONCILE_IRQ_STATE(r10, r11) - addi r3,r1,STACK_FRAME_OVERHEAD - bl altivec_unavailable_tm - b ret_from_except + addi r3,r1,STACK_INT_FRAME_REGS + bl CFUNC(altivec_unavailable_tm) + b interrupt_return_srr #endif 1: END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) #endif - bl save_nvgprs - RECONCILE_IRQ_STATE(r10, r11) - addi r3,r1,STACK_FRAME_OVERHEAD - bl altivec_unavailable_exception - b ret_from_except + addi r3,r1,STACK_INT_FRAME_REGS + bl CFUNC(altivec_unavailable_exception) + b interrupt_return_srr + +/** + * Interrupt 0xf40 - VSX Unavailable Interrupt. + * This is a synchronous interrupt in response to + * executing a VSX instruction with MSR[VSX]=0. + * Similar to FP unavailable. + */ +INT_DEFINE_BEGIN(vsx_unavailable) + IVEC=0xf40 +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE + IKVM_REAL=1 +#endif + IMSR_R12=1 +INT_DEFINE_END(vsx_unavailable) EXC_REAL_BEGIN(vsx_unavailable, 0xf40, 0x20) - INT_HANDLER vsx_unavailable, 0xf40, ool=1, kvm=1 + GEN_INT_ENTRY vsx_unavailable, virt=0, ool=1 EXC_REAL_END(vsx_unavailable, 0xf40, 0x20) EXC_VIRT_BEGIN(vsx_unavailable, 0x4f40, 0x20) - INT_HANDLER vsx_unavailable, 0xf40, ool=1, virt=1 + GEN_INT_ENTRY vsx_unavailable, virt=1, ool=1 EXC_VIRT_END(vsx_unavailable, 0x4f40, 0x20) -INT_KVM_HANDLER vsx_unavailable, 0xf40, EXC_STD, PACA_EXGEN, 0 EXC_COMMON_BEGIN(vsx_unavailable_common) - INT_COMMON 0xf40, PACA_EXGEN, 1, 1, 0, 0, 0 + GEN_COMMON vsx_unavailable #ifdef CONFIG_VSX BEGIN_FTR_SECTION beq 1f @@ -1730,40 +2474,73 @@ BEGIN_FTR_SECTION b load_up_vsx #ifdef CONFIG_PPC_TRANSACTIONAL_MEM 2: /* User process was in a transaction */ - bl save_nvgprs - RECONCILE_IRQ_STATE(r10, r11) - addi r3,r1,STACK_FRAME_OVERHEAD - bl vsx_unavailable_tm - b ret_from_except + addi r3,r1,STACK_INT_FRAME_REGS + bl CFUNC(vsx_unavailable_tm) + b interrupt_return_srr #endif 1: END_FTR_SECTION_IFSET(CPU_FTR_VSX) #endif - bl save_nvgprs - RECONCILE_IRQ_STATE(r10, r11) - addi r3,r1,STACK_FRAME_OVERHEAD - bl vsx_unavailable_exception - b ret_from_except + addi r3,r1,STACK_INT_FRAME_REGS + bl CFUNC(vsx_unavailable_exception) + b interrupt_return_srr + +/** + * Interrupt 0xf60 - Facility Unavailable Interrupt. + * This is a synchronous interrupt in response to + * executing an instruction without access to the facility that can be + * resolved by the OS (e.g., FSCR, MSR). + * Similar to FP unavailable. + */ +INT_DEFINE_BEGIN(facility_unavailable) + IVEC=0xf60 +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE + IKVM_REAL=1 +#endif +INT_DEFINE_END(facility_unavailable) EXC_REAL_BEGIN(facility_unavailable, 0xf60, 0x20) - INT_HANDLER facility_unavailable, 0xf60, ool=1, kvm=1 + GEN_INT_ENTRY facility_unavailable, virt=0, ool=1 EXC_REAL_END(facility_unavailable, 0xf60, 0x20) EXC_VIRT_BEGIN(facility_unavailable, 0x4f60, 0x20) - INT_HANDLER facility_unavailable, 0xf60, ool=1, virt=1 + GEN_INT_ENTRY facility_unavailable, virt=1, ool=1 EXC_VIRT_END(facility_unavailable, 0x4f60, 0x20) -INT_KVM_HANDLER facility_unavailable, 0xf60, EXC_STD, PACA_EXGEN, 0 -EXC_COMMON(facility_unavailable_common, 0xf60, facility_unavailable_exception) - +EXC_COMMON_BEGIN(facility_unavailable_common) + GEN_COMMON facility_unavailable + addi r3,r1,STACK_INT_FRAME_REGS + bl CFUNC(facility_unavailable_exception) + HANDLER_RESTORE_NVGPRS() /* instruction emulation may change GPRs */ + b interrupt_return_srr + + +/** + * Interrupt 0xf60 - Hypervisor Facility Unavailable Interrupt. + * This is a synchronous interrupt in response to + * executing an instruction without access to the facility that can only + * be resolved in HV mode (e.g., HFSCR). + * Similar to FP unavailable. + */ +INT_DEFINE_BEGIN(h_facility_unavailable) + IVEC=0xf80 + IHSRR=1 + IKVM_REAL=1 + IKVM_VIRT=1 +INT_DEFINE_END(h_facility_unavailable) EXC_REAL_BEGIN(h_facility_unavailable, 0xf80, 0x20) - INT_HANDLER h_facility_unavailable, 0xf80, ool=1, hsrr=EXC_HV, kvm=1 + GEN_INT_ENTRY h_facility_unavailable, virt=0, ool=1 EXC_REAL_END(h_facility_unavailable, 0xf80, 0x20) EXC_VIRT_BEGIN(h_facility_unavailable, 0x4f80, 0x20) - INT_HANDLER h_facility_unavailable, 0xf80, ool=1, virt=1, hsrr=EXC_HV, kvm=1 + GEN_INT_ENTRY h_facility_unavailable, virt=1, ool=1 EXC_VIRT_END(h_facility_unavailable, 0x4f80, 0x20) -INT_KVM_HANDLER h_facility_unavailable, 0xf80, EXC_HV, PACA_EXGEN, 0 -EXC_COMMON(h_facility_unavailable_common, 0xf80, facility_unavailable_exception) +EXC_COMMON_BEGIN(h_facility_unavailable_common) + GEN_COMMON h_facility_unavailable + addi r3,r1,STACK_INT_FRAME_REGS + bl CFUNC(facility_unavailable_exception) + /* XXX Shouldn't be necessary in practice */ + HANDLER_RESTORE_NVGPRS() + b interrupt_return_hsrr EXC_REAL_NONE(0xfa0, 0x20) @@ -1779,56 +2556,94 @@ EXC_REAL_NONE(0x1100, 0x100) EXC_VIRT_NONE(0x5100, 0x100) #ifdef CONFIG_CBE_RAS +INT_DEFINE_BEGIN(cbe_system_error) + IVEC=0x1200 + IHSRR=1 +INT_DEFINE_END(cbe_system_error) + EXC_REAL_BEGIN(cbe_system_error, 0x1200, 0x100) - INT_HANDLER cbe_system_error, 0x1200, ool=1, hsrr=EXC_HV, kvm=1 + GEN_INT_ENTRY cbe_system_error, virt=0 EXC_REAL_END(cbe_system_error, 0x1200, 0x100) EXC_VIRT_NONE(0x5200, 0x100) -INT_KVM_HANDLER cbe_system_error, 0x1200, EXC_HV, PACA_EXGEN, 1 -EXC_COMMON(cbe_system_error_common, 0x1200, cbe_system_error_exception) +EXC_COMMON_BEGIN(cbe_system_error_common) + GEN_COMMON cbe_system_error + addi r3,r1,STACK_INT_FRAME_REGS + bl CFUNC(cbe_system_error_exception) + b interrupt_return_hsrr + #else /* CONFIG_CBE_RAS */ EXC_REAL_NONE(0x1200, 0x100) EXC_VIRT_NONE(0x5200, 0x100) #endif +/** + * Interrupt 0x1300 - Instruction Address Breakpoint Interrupt. + * This has been removed from the ISA before 2.01, which is the earliest + * 64-bit BookS ISA supported, however the G5 / 970 implements this + * interrupt with a non-architected feature available through the support + * processor interface. + */ +INT_DEFINE_BEGIN(instruction_breakpoint) + IVEC=0x1300 +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE + IKVM_REAL=1 +#endif +INT_DEFINE_END(instruction_breakpoint) EXC_REAL_BEGIN(instruction_breakpoint, 0x1300, 0x100) - INT_HANDLER instruction_breakpoint, 0x1300, kvm=1 + GEN_INT_ENTRY instruction_breakpoint, virt=0 EXC_REAL_END(instruction_breakpoint, 0x1300, 0x100) EXC_VIRT_BEGIN(instruction_breakpoint, 0x5300, 0x100) - INT_HANDLER instruction_breakpoint, 0x1300, virt=1 + GEN_INT_ENTRY instruction_breakpoint, virt=1 EXC_VIRT_END(instruction_breakpoint, 0x5300, 0x100) -INT_KVM_HANDLER instruction_breakpoint, 0x1300, EXC_STD, PACA_EXGEN, 1 -EXC_COMMON(instruction_breakpoint_common, 0x1300, instruction_breakpoint_exception) +EXC_COMMON_BEGIN(instruction_breakpoint_common) + GEN_COMMON instruction_breakpoint + addi r3,r1,STACK_INT_FRAME_REGS + bl CFUNC(instruction_breakpoint_exception) + b interrupt_return_srr EXC_REAL_NONE(0x1400, 0x100) EXC_VIRT_NONE(0x5400, 0x100) -EXC_REAL_BEGIN(denorm_exception_hv, 0x1500, 0x100) - INT_HANDLER denorm_exception_hv, 0x1500, early=2, hsrr=EXC_HV +/** + * Interrupt 0x1500 - Soft Patch Interrupt + * + * Handling: + * This is an implementation specific interrupt which can be used for a + * range of exceptions. + * + * This interrupt handler is unique in that it runs the denormal assist + * code even for guests (and even in guest context) without going to KVM, + * for speed. POWER9 does not raise denorm exceptions, so this special case + * could be phased out in future to reduce special cases. + */ +INT_DEFINE_BEGIN(denorm_exception) + IVEC=0x1500 + IHSRR=1 + IBRANCH_TO_COMMON=0 + IKVM_REAL=1 +INT_DEFINE_END(denorm_exception) + +EXC_REAL_BEGIN(denorm_exception, 0x1500, 0x100) + GEN_INT_ENTRY denorm_exception, virt=0 #ifdef CONFIG_PPC_DENORMALISATION - mfspr r10,SPRN_HSRR1 - andis. r10,r10,(HSRR1_DENORM)@h /* denorm? */ + andis. r10,r12,(HSRR1_DENORM)@h /* denorm? */ bne+ denorm_assist #endif - KVMTEST denorm_exception_hv, EXC_HV 0x1500 - INT_SAVE_SRR_AND_JUMP denorm_common, EXC_HV, 1 -EXC_REAL_END(denorm_exception_hv, 0x1500, 0x100) - + GEN_BRANCH_TO_COMMON denorm_exception, virt=0 +EXC_REAL_END(denorm_exception, 0x1500, 0x100) #ifdef CONFIG_PPC_DENORMALISATION EXC_VIRT_BEGIN(denorm_exception, 0x5500, 0x100) - INT_HANDLER denorm_exception, 0x1500, 0, 2, 1, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 0 - mfspr r10,SPRN_HSRR1 - andis. r10,r10,(HSRR1_DENORM)@h /* denorm? */ + GEN_INT_ENTRY denorm_exception, virt=1 + andis. r10,r12,(HSRR1_DENORM)@h /* denorm? */ bne+ denorm_assist - INT_VIRT_SAVE_SRR_AND_JUMP denorm_common, EXC_HV + GEN_BRANCH_TO_COMMON denorm_exception, virt=1 EXC_VIRT_END(denorm_exception, 0x5500, 0x100) #else EXC_VIRT_NONE(0x5500, 0x100) #endif -INT_KVM_HANDLER denorm_exception_hv, 0x1500, EXC_HV, PACA_EXGEN, 0 - #ifdef CONFIG_PPC_DENORMALISATION TRAMP_REAL_BEGIN(denorm_assist) BEGIN_FTR_SECTION @@ -1885,11 +2700,16 @@ denorm_done: mtspr SPRN_HSRR0,r11 mtcrf 0x80,r9 ld r9,PACA_EXGEN+EX_R9(r13) - RESTORE_PPR_PACA(PACA_EXGEN, r10) +BEGIN_FTR_SECTION + ld r10,PACA_EXGEN+EX_PPR(r13) + mtspr SPRN_PPR,r10 +END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) BEGIN_FTR_SECTION ld r10,PACA_EXGEN+EX_CFAR(r13) mtspr SPRN_CFAR,r10 END_FTR_SECTION_IFSET(CPU_FTR_CFAR) + li r10,0 + stb r10,PACAHSRR_VALID(r13) ld r10,PACA_EXGEN+EX_R10(r13) ld r11,PACA_EXGEN+EX_R11(r13) ld r12,PACA_EXGEN+EX_R12(r13) @@ -1898,43 +2718,76 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR) b . #endif -EXC_COMMON(denorm_common, 0x1500, unknown_exception) +EXC_COMMON_BEGIN(denorm_exception_common) + GEN_COMMON denorm_exception + addi r3,r1,STACK_INT_FRAME_REGS + bl CFUNC(unknown_exception) + b interrupt_return_hsrr #ifdef CONFIG_CBE_RAS +INT_DEFINE_BEGIN(cbe_maintenance) + IVEC=0x1600 + IHSRR=1 +INT_DEFINE_END(cbe_maintenance) + EXC_REAL_BEGIN(cbe_maintenance, 0x1600, 0x100) - INT_HANDLER cbe_maintenance, 0x1600, ool=1, hsrr=EXC_HV, kvm=1 + GEN_INT_ENTRY cbe_maintenance, virt=0 EXC_REAL_END(cbe_maintenance, 0x1600, 0x100) EXC_VIRT_NONE(0x5600, 0x100) -INT_KVM_HANDLER cbe_maintenance, 0x1600, EXC_HV, PACA_EXGEN, 1 -EXC_COMMON(cbe_maintenance_common, 0x1600, cbe_maintenance_exception) +EXC_COMMON_BEGIN(cbe_maintenance_common) + GEN_COMMON cbe_maintenance + addi r3,r1,STACK_INT_FRAME_REGS + bl CFUNC(cbe_maintenance_exception) + b interrupt_return_hsrr + #else /* CONFIG_CBE_RAS */ EXC_REAL_NONE(0x1600, 0x100) EXC_VIRT_NONE(0x5600, 0x100) #endif +INT_DEFINE_BEGIN(altivec_assist) + IVEC=0x1700 +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE + IKVM_REAL=1 +#endif +INT_DEFINE_END(altivec_assist) + EXC_REAL_BEGIN(altivec_assist, 0x1700, 0x100) - INT_HANDLER altivec_assist, 0x1700, kvm=1 + GEN_INT_ENTRY altivec_assist, virt=0 EXC_REAL_END(altivec_assist, 0x1700, 0x100) EXC_VIRT_BEGIN(altivec_assist, 0x5700, 0x100) - INT_HANDLER altivec_assist, 0x1700, virt=1 + GEN_INT_ENTRY altivec_assist, virt=1 EXC_VIRT_END(altivec_assist, 0x5700, 0x100) -INT_KVM_HANDLER altivec_assist, 0x1700, EXC_STD, PACA_EXGEN, 0 +EXC_COMMON_BEGIN(altivec_assist_common) + GEN_COMMON altivec_assist + addi r3,r1,STACK_INT_FRAME_REGS #ifdef CONFIG_ALTIVEC -EXC_COMMON(altivec_assist_common, 0x1700, altivec_assist_exception) + bl CFUNC(altivec_assist_exception) + HANDLER_RESTORE_NVGPRS() /* instruction emulation may change GPRs */ #else -EXC_COMMON(altivec_assist_common, 0x1700, unknown_exception) + bl CFUNC(unknown_exception) #endif + b interrupt_return_srr #ifdef CONFIG_CBE_RAS +INT_DEFINE_BEGIN(cbe_thermal) + IVEC=0x1800 + IHSRR=1 +INT_DEFINE_END(cbe_thermal) + EXC_REAL_BEGIN(cbe_thermal, 0x1800, 0x100) - INT_HANDLER cbe_thermal, 0x1800, ool=1, hsrr=EXC_HV, kvm=1 + GEN_INT_ENTRY cbe_thermal, virt=0 EXC_REAL_END(cbe_thermal, 0x1800, 0x100) EXC_VIRT_NONE(0x5800, 0x100) -INT_KVM_HANDLER cbe_thermal, 0x1800, EXC_HV, PACA_EXGEN, 1 -EXC_COMMON(cbe_thermal_common, 0x1800, cbe_thermal_exception) +EXC_COMMON_BEGIN(cbe_thermal_common) + GEN_COMMON cbe_thermal + addi r3,r1,STACK_INT_FRAME_REGS + bl CFUNC(cbe_thermal_exception) + b interrupt_return_hsrr + #else /* CONFIG_CBE_RAS */ EXC_REAL_NONE(0x1800, 0x100) EXC_VIRT_NONE(0x5800, 0x100) @@ -1943,14 +2796,11 @@ EXC_VIRT_NONE(0x5800, 0x100) #ifdef CONFIG_PPC_WATCHDOG -#define MASKED_DEC_HANDLER_LABEL 3f - -#define MASKED_DEC_HANDLER(_H) \ -3: /* soft-nmi */ \ - std r12,PACA_EXGEN+EX_R12(r13); \ - GET_SCRATCH0(r10); \ - std r10,PACA_EXGEN+EX_R13(r13); \ - INT_SAVE_SRR_AND_JUMP soft_nmi_common, _H, 1 +INT_DEFINE_BEGIN(soft_nmi) + IVEC=0x900 + ISTACK=0 + ICFAR=0 +INT_DEFINE_END(soft_nmi) /* * Branch to soft_nmi_interrupt using the emergency stack. The emergency @@ -1965,20 +2815,25 @@ EXC_COMMON_BEGIN(soft_nmi_common) mr r10,r1 ld r1,PACAEMERGSP(r13) subi r1,r1,INT_FRAME_SIZE - INT_COMMON 0x900, PACA_EXGEN, 0, 1, 1, 0, 0 - bl save_nvgprs - addi r3,r1,STACK_FRAME_OVERHEAD - bl soft_nmi_interrupt - b ret_from_except - -#else /* CONFIG_PPC_WATCHDOG */ -#define MASKED_DEC_HANDLER_LABEL 2f /* normal return */ -#define MASKED_DEC_HANDLER(_H) + __GEN_COMMON_BODY soft_nmi + + addi r3,r1,STACK_INT_FRAME_REGS + bl CFUNC(soft_nmi_interrupt) + + /* Clear MSR_RI before setting SRR0 and SRR1. */ + li r9,0 + mtmsrd r9,1 + + kuap_kernel_restore r9, r10 + + EXCEPTION_RESTORE_REGS hsrr=0 + RFI_TO_KERNEL + #endif /* CONFIG_PPC_WATCHDOG */ /* * An interrupt came in while soft-disabled. We set paca->irq_happened, then: - * - If it was a decrementer interrupt, we bump the dec to max and and return. + * - If it was a decrementer interrupt, we bump the dec to max and return. * - If it was a doorbell we return immediately since doorbells are edge * triggered and won't automatically refire. * - If it was a HMI we return immediately since we handled it in realmode @@ -1986,49 +2841,89 @@ EXC_COMMON_BEGIN(soft_nmi_common) * - Else it is one of PACA_IRQ_MUST_HARD_MASK, so hard disable and return. * This is called with r10 containing the value to OR to the paca field. */ -.macro MASKED_INTERRUPT hsrr +.macro MASKED_INTERRUPT hsrr=0 .if \hsrr masked_Hinterrupt: .else masked_interrupt: .endif - std r11,PACA_EXGEN+EX_R11(r13) - lbz r11,PACAIRQHAPPENED(r13) - or r11,r11,r10 - stb r11,PACAIRQHAPPENED(r13) + stw r9,PACA_EXGEN+EX_CCR(r13) +#ifdef CONFIG_PPC_IRQ_SOFT_MASK_DEBUG + /* + * Ensure there was no previous MUST_HARD_MASK interrupt or + * HARD_DIS setting. If this does fire, the interrupt is still + * masked and MSR[EE] will be cleared on return, so no need to + * panic, but somebody probably enabled MSR[EE] under + * PACA_IRQ_HARD_DIS, mtmsr(mfmsr() | MSR_x) being a common + * cause. + */ + lbz r9,PACAIRQHAPPENED(r13) + andi. r9,r9,(PACA_IRQ_MUST_HARD_MASK|PACA_IRQ_HARD_DIS) +0: tdnei r9,0 + EMIT_WARN_ENTRY 0b,__FILE__,__LINE__,(BUGFLAG_WARNING | BUGFLAG_ONCE) +#endif + lbz r9,PACAIRQHAPPENED(r13) + or r9,r9,r10 + stb r9,PACAIRQHAPPENED(r13) + + .if ! \hsrr cmpwi r10,PACA_IRQ_DEC bne 1f - lis r10,0x7fff - ori r10,r10,0xffff - mtspr SPRN_DEC,r10 - b MASKED_DEC_HANDLER_LABEL + LOAD_REG_IMMEDIATE(r9, 0x7fffffff) + mtspr SPRN_DEC,r9 +#ifdef CONFIG_PPC_WATCHDOG + lwz r9,PACA_EXGEN+EX_CCR(r13) + b soft_nmi_common +#else + b 2f +#endif + .endif + 1: andi. r10,r10,PACA_IRQ_MUST_HARD_MASK beq 2f + xori r12,r12,MSR_EE /* clear MSR_EE */ .if \hsrr - mfspr r10,SPRN_HSRR1 - xori r10,r10,MSR_EE /* clear MSR_EE */ - mtspr SPRN_HSRR1,r10 + mtspr SPRN_HSRR1,r12 .else - mfspr r10,SPRN_SRR1 - xori r10,r10,MSR_EE /* clear MSR_EE */ - mtspr SPRN_SRR1,r10 + mtspr SPRN_SRR1,r12 .endif - ori r11,r11,PACA_IRQ_HARD_DIS - stb r11,PACAIRQHAPPENED(r13) + ori r9,r9,PACA_IRQ_HARD_DIS + stb r9,PACAIRQHAPPENED(r13) 2: /* done */ + li r9,0 + .if \hsrr + stb r9,PACAHSRR_VALID(r13) + .else + stb r9,PACASRR_VALID(r13) + .endif + + SEARCH_RESTART_TABLE + cmpdi r12,0 + beq 3f + .if \hsrr + mtspr SPRN_HSRR0,r12 + .else + mtspr SPRN_SRR0,r12 + .endif +3: + + ld r9,PACA_EXGEN+EX_CTR(r13) + mtctr r9 + lwz r9,PACA_EXGEN+EX_CCR(r13) mtcrf 0x80,r9 std r1,PACAR1(r13) ld r9,PACA_EXGEN+EX_R9(r13) ld r10,PACA_EXGEN+EX_R10(r13) ld r11,PACA_EXGEN+EX_R11(r13) - /* returns to kernel where r13 must be set up, so don't restore it */ + ld r12,PACA_EXGEN+EX_R12(r13) + ld r13,PACA_EXGEN+EX_R13(r13) + /* May return to masked low address where r13 is not set up */ .if \hsrr HRFI_TO_KERNEL .else RFI_TO_KERNEL .endif b . - MASKED_DEC_HANDLER(\hsrr\()) .endm TRAMP_REAL_BEGIN(stf_barrier_fallback) @@ -2044,15 +2939,8 @@ TRAMP_REAL_BEGIN(stf_barrier_fallback) .endr blr -TRAMP_REAL_BEGIN(rfi_flush_fallback) - SET_SCRATCH0(r13); - GET_PACA(r13); - std r1,PACA_EXRFI+EX_R12(r13) - ld r1,PACAKSAVE(r13) - std r9,PACA_EXRFI+EX_R9(r13) - std r10,PACA_EXRFI+EX_R10(r13) - std r11,PACA_EXRFI+EX_R11(r13) - mfctr r9 +/* Clobbers r10, r11, ctr */ +.macro L1D_DISPLACEMENT_FLUSH ld r10,PACA_RFI_FLUSH_FALLBACK_AREA(r13) ld r11,PACA_L1D_FLUSH_SIZE(r13) srdi r11,r11,(7 + 3) /* 128 byte lines, unrolled 8x */ @@ -2063,7 +2951,7 @@ TRAMP_REAL_BEGIN(rfi_flush_fallback) sync /* - * The load adresses are at staggered offsets within cachelines, + * The load addresses are at staggered offsets within cachelines, * which suits some pipelines better (on others it should not * hurt). */ @@ -2078,7 +2966,49 @@ TRAMP_REAL_BEGIN(rfi_flush_fallback) ld r11,(0x80 + 8)*7(r10) addi r10,r10,0x80*8 bdnz 1b +.endm +TRAMP_REAL_BEGIN(entry_flush_fallback) + std r9,PACA_EXRFI+EX_R9(r13) + std r10,PACA_EXRFI+EX_R10(r13) + std r11,PACA_EXRFI+EX_R11(r13) + mfctr r9 + L1D_DISPLACEMENT_FLUSH + mtctr r9 + ld r9,PACA_EXRFI+EX_R9(r13) + ld r10,PACA_EXRFI+EX_R10(r13) + ld r11,PACA_EXRFI+EX_R11(r13) + blr + +/* + * The SCV entry flush happens with interrupts enabled, so it must disable + * to prevent EXRFI being clobbered by NMIs (e.g., soft_nmi_common). r10 + * (containing LR) does not need to be preserved here because scv entry + * puts 0 in the pt_regs, CTR can be clobbered for the same reason. + */ +TRAMP_REAL_BEGIN(scv_entry_flush_fallback) + li r10,0 + mtmsrd r10,1 + lbz r10,PACAIRQHAPPENED(r13) + ori r10,r10,PACA_IRQ_HARD_DIS + stb r10,PACAIRQHAPPENED(r13) + std r11,PACA_EXRFI+EX_R11(r13) + L1D_DISPLACEMENT_FLUSH + ld r11,PACA_EXRFI+EX_R11(r13) + li r10,MSR_RI + mtmsrd r10,1 + blr + +TRAMP_REAL_BEGIN(rfi_flush_fallback) + SET_SCRATCH0(r13); + GET_PACA(r13); + std r1,PACA_EXRFI+EX_R12(r13) + ld r1,PACAKSAVE(r13) + std r9,PACA_EXRFI+EX_R9(r13) + std r10,PACA_EXRFI+EX_R10(r13) + std r11,PACA_EXRFI+EX_R11(r13) + mfctr r9 + L1D_DISPLACEMENT_FLUSH mtctr r9 ld r9,PACA_EXRFI+EX_R9(r13) ld r10,PACA_EXRFI+EX_R10(r13) @@ -2096,6 +3026,22 @@ TRAMP_REAL_BEGIN(hrfi_flush_fallback) std r10,PACA_EXRFI+EX_R10(r13) std r11,PACA_EXRFI+EX_R11(r13) mfctr r9 + L1D_DISPLACEMENT_FLUSH + mtctr r9 + ld r9,PACA_EXRFI+EX_R9(r13) + ld r10,PACA_EXRFI+EX_R10(r13) + ld r11,PACA_EXRFI+EX_R11(r13) + ld r1,PACA_EXRFI+EX_R12(r13) + GET_SCRATCH0(r13); + hrfid + +TRAMP_REAL_BEGIN(rfscv_flush_fallback) + /* system call volatile */ + mr r7,r13 + GET_PACA(r13); + mr r8,r1 + ld r1,PACAKSAVE(r13) + mfctr r9 ld r10,PACA_RFI_FLUSH_FALLBACK_AREA(r13) ld r11,PACA_L1D_FLUSH_SIZE(r13) srdi r11,r11,(7 + 3) /* 128 byte lines, unrolled 8x */ @@ -2123,79 +3069,45 @@ TRAMP_REAL_BEGIN(hrfi_flush_fallback) bdnz 1b mtctr r9 - ld r9,PACA_EXRFI+EX_R9(r13) - ld r10,PACA_EXRFI+EX_R10(r13) - ld r11,PACA_EXRFI+EX_R11(r13) - ld r1,PACA_EXRFI+EX_R12(r13) - GET_SCRATCH0(r13); - hrfid + li r9,0 + li r10,0 + li r11,0 + mr r1,r8 + mr r13,r7 + RFSCV -/* - * Real mode exceptions actually use this too, but alternate - * instruction code patches (which end up in the common .text area) - * cannot reach these if they are put there. - */ -USE_FIXED_SECTION(virt_trampolines) - MASKED_INTERRUPT EXC_STD - MASKED_INTERRUPT EXC_HV +USE_TEXT_SECTION() #ifdef CONFIG_KVM_BOOK3S_64_HANDLER -TRAMP_REAL_BEGIN(kvmppc_skip_interrupt) +kvm_interrupt: /* - * Here all GPRs are unchanged from when the interrupt happened - * except for r13, which is saved in SPRG_SCRATCH0. + * The conditional branch in KVMTEST can't reach all the way, + * make a stub. */ - mfspr r13, SPRN_SRR0 - addi r13, r13, 4 - mtspr SPRN_SRR0, r13 - GET_SCRATCH0(r13) - RFI_TO_KERNEL - b . - -TRAMP_REAL_BEGIN(kvmppc_skip_Hinterrupt) - /* - * Here all GPRs are unchanged from when the interrupt happened - * except for r13, which is saved in SPRG_SCRATCH0. - */ - mfspr r13, SPRN_HSRR0 - addi r13, r13, 4 - mtspr SPRN_HSRR0, r13 - GET_SCRATCH0(r13) - HRFI_TO_KERNEL - b . + b kvmppc_interrupt #endif -/* - * Ensure that any handlers that get invoked from the exception prologs - * above are below the first 64KB (0x10000) of the kernel image because - * the prologs assemble the addresses of these handlers using the - * LOAD_HANDLER macro, which uses an ori instruction. - */ - -/*** Common interrupt handlers ***/ - - - /* - * Relocation-on interrupts: A subset of the interrupts can be delivered - * with IR=1/DR=1, if AIL==2 and MSR.HV won't be changed by delivering - * it. Addresses are the same as the original interrupt addresses, but - * offset by 0xc000000000004000. - * It's impossible to receive interrupts below 0x300 via this mechanism. - * KVM: None of these traps are from the guest ; anything that escalated - * to HV=1 from HV=0 is delivered via real mode handlers. - */ +_GLOBAL(do_uaccess_flush) + UACCESS_FLUSH_FIXUP_SECTION + nop + nop + nop + blr + L1D_DISPLACEMENT_FLUSH + blr +_ASM_NOKPROBE_SYMBOL(do_uaccess_flush) +EXPORT_SYMBOL(do_uaccess_flush) - /* - * This uses the standard macro, since the original 0x300 vector - * only has extra guff for STAB-based processors -- which never - * come here. - */ -EXC_COMMON_BEGIN(ppc64_runlatch_on_trampoline) - b __ppc64_runlatch_on +MASKED_INTERRUPT +MASKED_INTERRUPT hsrr=1 USE_FIXED_SECTION(virt_trampolines) /* + * All code below __end_soft_masked is treated as soft-masked. If + * any code runs here with MSR[EE]=1, it must then cope with pending + * soft interrupt being raised (i.e., by ensuring it is replayed). + * * The __end_interrupts marker must be past the out-of-line (OOL) * handlers, so that they are copied to real address 0x100 when running * a relocatable kernel. This ensures they can be reached from the short @@ -2205,16 +3117,7 @@ USE_FIXED_SECTION(virt_trampolines) .align 7 .globl __end_interrupts __end_interrupts: -DEFINE_FIXED_SYMBOL(__end_interrupts) - -#ifdef CONFIG_PPC_970_NAP -EXC_COMMON_BEGIN(power4_fixup_nap) - andc r9,r9,r10 - std r9,TI_LOCAL_FLAGS(r11) - ld r10,_LINK(r1) /* make idle task do the */ - std r10,_NIP(r1) /* equivalent of a blr */ - blr -#endif +DEFINE_FIXED_SYMBOL(__end_interrupts, virt_trampolines) CLOSE_FIXED_SECTION(real_vectors); CLOSE_FIXED_SECTION(real_trampolines); @@ -2224,7 +3127,7 @@ CLOSE_FIXED_SECTION(virt_trampolines); USE_TEXT_SECTION() /* MSR[RI] should be clear because this uses SRR[01] */ -enable_machine_check: +_GLOBAL(enable_machine_check) mflr r0 bcl 20,31,$+4 0: mflr r3 @@ -2238,7 +3141,7 @@ enable_machine_check: blr /* MSR[RI] should be clear because this uses SRR[01] */ -disable_machine_check: +SYM_FUNC_START_LOCAL(disable_machine_check) mflr r0 bcl 20,31,$+4 0: mflr r3 @@ -2251,161 +3154,4 @@ disable_machine_check: RFI_TO_KERNEL 1: mtlr r0 blr - -/* - * Hash table stuff - */ - .balign IFETCH_ALIGN_BYTES -do_hash_page: -#ifdef CONFIG_PPC_BOOK3S_64 - lis r0,(DSISR_BAD_FAULT_64S | DSISR_DABRMATCH | DSISR_KEYFAULT)@h - ori r0,r0,DSISR_BAD_FAULT_64S@l - and. r0,r5,r0 /* weird error? */ - bne- handle_page_fault /* if not, try to insert a HPTE */ - ld r11, PACA_THREAD_INFO(r13) - lwz r0,TI_PREEMPT(r11) /* If we're in an "NMI" */ - andis. r0,r0,NMI_MASK@h /* (i.e. an irq when soft-disabled) */ - bne 77f /* then don't call hash_page now */ - - /* - * r3 contains the trap number - * r4 contains the faulting address - * r5 contains dsisr - * r6 msr - * - * at return r3 = 0 for success, 1 for page fault, negative for error - */ - bl __hash_page /* build HPTE if possible */ - cmpdi r3,0 /* see if __hash_page succeeded */ - - /* Success */ - beq fast_exc_return_irq /* Return from exception on success */ - - /* Error */ - blt- 13f - - /* Reload DAR/DSISR into r4/r5 for the DABR check below */ - ld r4,_DAR(r1) - ld r5,_DSISR(r1) -#endif /* CONFIG_PPC_BOOK3S_64 */ - -/* Here we have a page fault that hash_page can't handle. */ -handle_page_fault: -11: andis. r0,r5,DSISR_DABRMATCH@h - bne- handle_dabr_fault - addi r3,r1,STACK_FRAME_OVERHEAD - bl do_page_fault - cmpdi r3,0 - beq+ ret_from_except_lite - bl save_nvgprs - mr r5,r3 - addi r3,r1,STACK_FRAME_OVERHEAD - ld r4,_DAR(r1) - bl bad_page_fault - b ret_from_except - -/* We have a data breakpoint exception - handle it */ -handle_dabr_fault: - bl save_nvgprs - ld r4,_DAR(r1) - ld r5,_DSISR(r1) - addi r3,r1,STACK_FRAME_OVERHEAD - bl do_break - /* - * do_break() may have changed the NV GPRS while handling a breakpoint. - * If so, we need to restore them with their updated values. Don't use - * ret_from_except_lite here. - */ - b ret_from_except - - -#ifdef CONFIG_PPC_BOOK3S_64 -/* We have a page fault that hash_page could handle but HV refused - * the PTE insertion - */ -13: bl save_nvgprs - mr r5,r3 - addi r3,r1,STACK_FRAME_OVERHEAD - ld r4,_DAR(r1) - bl low_hash_fault - b ret_from_except -#endif - -/* - * We come here as a result of a DSI at a point where we don't want - * to call hash_page, such as when we are accessing memory (possibly - * user memory) inside a PMU interrupt that occurred while interrupts - * were soft-disabled. We want to invoke the exception handler for - * the access, or panic if there isn't a handler. - */ -77: bl save_nvgprs - addi r3,r1,STACK_FRAME_OVERHEAD - li r5,SIGSEGV - bl bad_page_fault - b ret_from_except - -/* - * When doorbell is triggered from system reset wakeup, the message is - * not cleared, so it would fire again when EE is enabled. - * - * When coming from local_irq_enable, there may be the same problem if - * we were hard disabled. - * - * Execute msgclr to clear pending exceptions before handling it. - */ -h_doorbell_common_msgclr: - LOAD_REG_IMMEDIATE(r3, PPC_DBELL_MSGTYPE << (63-36)) - PPC_MSGCLR(3) - b h_doorbell_common - -doorbell_super_common_msgclr: - LOAD_REG_IMMEDIATE(r3, PPC_DBELL_MSGTYPE << (63-36)) - PPC_MSGCLRP(3) - b doorbell_super_common - -/* - * Called from arch_local_irq_enable when an interrupt needs - * to be resent. r3 contains 0x500, 0x900, 0xa00 or 0xe80 to indicate - * which kind of interrupt. MSR:EE is already off. We generate a - * stackframe like if a real interrupt had happened. - * - * Note: While MSR:EE is off, we need to make sure that _MSR - * in the generated frame has EE set to 1 or the exception - * handler will not properly re-enable them. - * - * Note that we don't specify LR as the NIP (return address) for - * the interrupt because that would unbalance the return branch - * predictor. - */ -_GLOBAL(__replay_interrupt) - /* We are going to jump to the exception common code which - * will retrieve various register values from the PACA which - * we don't give a damn about, so we don't bother storing them. - */ - mfmsr r12 - LOAD_REG_ADDR(r11, replay_interrupt_return) - mfcr r9 - ori r12,r12,MSR_EE - cmpwi r3,0x900 - beq decrementer_common - cmpwi r3,0x500 -BEGIN_FTR_SECTION - beq h_virt_irq_common -FTR_SECTION_ELSE - beq hardware_interrupt_common -ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_300) - cmpwi r3,0xf00 - beq performance_monitor_common -BEGIN_FTR_SECTION - cmpwi r3,0xa00 - beq h_doorbell_common_msgclr - cmpwi r3,0xe60 - beq hmi_exception_common -FTR_SECTION_ELSE - cmpwi r3,0xa00 - beq doorbell_super_common_msgclr -ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE) -replay_interrupt_return: - blr - -_ASM_NOKPROBE_SYMBOL(__replay_interrupt) +SYM_FUNC_END(disable_machine_check) diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index ff0114aeba9b..d14eda1e8589 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -24,22 +24,45 @@ #include <linux/slab.h> #include <linux/cma.h> #include <linux/hugetlb.h> +#include <linux/debugfs.h> +#include <linux/of.h> +#include <linux/of_fdt.h> -#include <asm/debugfs.h> #include <asm/page.h> -#include <asm/prom.h> #include <asm/fadump.h> #include <asm/fadump-internal.h> #include <asm/setup.h> +#include <asm/interrupt.h> + +/* + * The CPU who acquired the lock to trigger the fadump crash should + * wait for other CPUs to enter. + * + * The timeout is in milliseconds. + */ +#define CRASH_TIMEOUT 500 static struct fw_dump fw_dump; static void __init fadump_reserve_crash_area(u64 base); #ifndef CONFIG_PRESERVE_FA_DUMP + +static struct kobject *fadump_kobj; + +static atomic_t cpus_in_fadump; static DEFINE_MUTEX(fadump_mutex); -struct fadump_mrange_info crash_mrange_info = { "crash", NULL, 0, 0, 0 }; -struct fadump_mrange_info reserved_mrange_info = { "reserved", NULL, 0, 0, 0 }; + +static struct fadump_mrange_info crash_mrange_info = { "crash", NULL, 0, 0, 0, false }; + +#define RESERVED_RNGS_SZ 16384 /* 16K - 128 entries */ +#define RESERVED_RNGS_CNT (RESERVED_RNGS_SZ / \ + sizeof(struct fadump_memory_range)) +static struct fadump_memory_range rngs[RESERVED_RNGS_CNT]; +static struct fadump_mrange_info +reserved_mrange_info = { "reserved", rngs, RESERVED_RNGS_SZ, 0, RESERVED_RNGS_CNT, true }; + +static void __init early_init_dt_scan_reserved_ranges(unsigned long node); #ifdef CONFIG_CMA static struct cma *fadump_cma; @@ -51,13 +74,13 @@ static struct cma *fadump_cma; * The total size of fadump reserved memory covers for boot memory size * + cpu data size + hpte size and metadata. * Initialize only the area equivalent to boot memory size for CMA use. - * The reamining portion of fadump reserved memory will be not given - * to CMA and pages for thoes will stay reserved. boot memory size is + * The remaining portion of fadump reserved memory will be not given + * to CMA and pages for those will stay reserved. boot memory size is * aligned per CMA requirement to satisy cma_init_reserved_mem() call. * But for some reason even if it fails we still have the memory reservation * with us and we can still continue doing fadump. */ -int __init fadump_cma_init(void) +static int __init fadump_cma_init(void) { unsigned long long base, size; int rc; @@ -91,6 +114,12 @@ int __init fadump_cma_init(void) } /* + * If CMA activation fails, keep the pages reserved, instead of + * exposing them to buddy allocator. Same as 'fadump=nocma' case. + */ + cma_reserve_pages_on_error(fadump_cma); + + /* * So we now have successfully initialized cma area for fadump. */ pr_info("Initialized 0x%lx bytes cma area at %ldMB from 0x%lx " @@ -108,6 +137,11 @@ static int __init fadump_cma_init(void) { return 1; } int __init early_init_dt_scan_fw_dump(unsigned long node, const char *uname, int depth, void *data) { + if (depth == 0) { + early_init_dt_scan_reserved_ranges(node); + return 0; + } + if (depth != 1) return 0; @@ -164,13 +198,13 @@ int is_fadump_active(void) */ static bool is_fadump_mem_area_contiguous(u64 d_start, u64 d_end) { - struct memblock_region *reg; + phys_addr_t reg_start, reg_end; bool ret = false; - u64 start, end; + u64 i, start, end; - for_each_memblock(memory, reg) { - start = max_t(u64, d_start, reg->base); - end = min_t(u64, d_end, (reg->base + reg->size)); + for_each_mem_range(i, ®_start, ®_end) { + start = max_t(u64, d_start, reg_start); + end = min_t(u64, d_end, reg_end); if (d_start < end) { /* Memory hole from d_start to start */ if (start > d_start) @@ -224,7 +258,7 @@ bool is_fadump_reserved_mem_contiguous(void) } /* Print firmware assisted dump configurations for debugging purpose. */ -static void fadump_show_config(void) +static void __init fadump_show_config(void) { int i; @@ -265,7 +299,7 @@ static void fadump_show_config(void) * that is required for a kernel to boot successfully. * */ -static inline u64 fadump_calculate_reserve_size(void) +static __init u64 fadump_calculate_reserve_size(void) { u64 base, size, bootmem_min; int ret; @@ -279,7 +313,7 @@ static inline u64 fadump_calculate_reserve_size(void) * memory at a predefined offset. */ ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), - &size, &base); + &size, &base, NULL, NULL); if (ret == 0 && size > 0) { unsigned long max_size; @@ -326,12 +360,17 @@ static inline u64 fadump_calculate_reserve_size(void) * Calculate the total memory size required to be reserved for * firmware-assisted dump registration. */ -static unsigned long get_fadump_area_size(void) +static unsigned long __init get_fadump_area_size(void) { unsigned long size = 0; size += fw_dump.cpu_state_data_size; size += fw_dump.hpte_region_size; + /* + * Account for pagesize alignment of boot memory area destination address. + * This faciliates in mmap reading of first kernel's memory. + */ + size = PAGE_ALIGN(size); size += fw_dump.boot_memory_size; size += sizeof(struct fadump_crash_info_header); size += sizeof(struct elfhdr); /* ELF core header.*/ @@ -395,44 +434,106 @@ static int __init add_boot_mem_regions(unsigned long mstart, static int __init fadump_get_boot_mem_regions(void) { - unsigned long base, size, cur_size, hole_size, last_end; + unsigned long size, cur_size, hole_size, last_end; unsigned long mem_size = fw_dump.boot_memory_size; - struct memblock_region *reg; + phys_addr_t reg_start, reg_end; int ret = 1; + u64 i; fw_dump.boot_mem_regs_cnt = 0; last_end = 0; hole_size = 0; cur_size = 0; - for_each_memblock(memory, reg) { - base = reg->base; - size = reg->size; - hole_size += (base - last_end); + for_each_mem_range(i, ®_start, ®_end) { + size = reg_end - reg_start; + hole_size += (reg_start - last_end); if ((cur_size + size) >= mem_size) { size = (mem_size - cur_size); - ret = add_boot_mem_regions(base, size); + ret = add_boot_mem_regions(reg_start, size); break; } mem_size -= size; cur_size += size; - ret = add_boot_mem_regions(base, size); + ret = add_boot_mem_regions(reg_start, size); if (!ret) break; - last_end = base + size; + last_end = reg_end; } fw_dump.boot_mem_top = PAGE_ALIGN(fw_dump.boot_memory_size + hole_size); return ret; } +/* + * Returns true, if the given range overlaps with reserved memory ranges + * starting at idx. Also, updates idx to index of overlapping memory range + * with the given memory range. + * False, otherwise. + */ +static bool __init overlaps_reserved_ranges(u64 base, u64 end, int *idx) +{ + bool ret = false; + int i; + + for (i = *idx; i < reserved_mrange_info.mem_range_cnt; i++) { + u64 rbase = reserved_mrange_info.mem_ranges[i].base; + u64 rend = rbase + reserved_mrange_info.mem_ranges[i].size; + + if (end <= rbase) + break; + + if ((end > rbase) && (base < rend)) { + *idx = i; + ret = true; + break; + } + } + + return ret; +} + +/* + * Locate a suitable memory area to reserve memory for FADump. While at it, + * lookup reserved-ranges & avoid overlap with them, as they are used by F/W. + */ +static u64 __init fadump_locate_reserve_mem(u64 base, u64 size) +{ + struct fadump_memory_range *mrngs; + phys_addr_t mstart, mend; + int idx = 0; + u64 i, ret = 0; + + mrngs = reserved_mrange_info.mem_ranges; + for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, + &mstart, &mend, NULL) { + pr_debug("%llu) mstart: %llx, mend: %llx, base: %llx\n", + i, mstart, mend, base); + + if (mstart > base) + base = PAGE_ALIGN(mstart); + + while ((mend > base) && ((mend - base) >= size)) { + if (!overlaps_reserved_ranges(base, base+size, &idx)) { + ret = base; + goto out; + } + + base = mrngs[idx].base + mrngs[idx].size; + base = PAGE_ALIGN(base); + } + } + +out: + return ret; +} + int __init fadump_reserve_mem(void) { - u64 base, size, mem_boundary, bootmem_min, align = PAGE_SIZE; - bool is_memblock_bottom_up = memblock_bottom_up(); + u64 base, size, mem_boundary, bootmem_min; int ret = 1; if (!fw_dump.fadump_enabled) @@ -453,9 +554,9 @@ int __init fadump_reserve_mem(void) PAGE_ALIGN(fadump_calculate_reserve_size()); #ifdef CONFIG_CMA if (!fw_dump.nocma) { - align = FADUMP_CMA_ALIGNMENT; fw_dump.boot_memory_size = - ALIGN(fw_dump.boot_memory_size, align); + ALIGN(fw_dump.boot_memory_size, + CMA_MIN_ALIGNMENT_BYTES); } #endif @@ -523,13 +624,9 @@ int __init fadump_reserve_mem(void) * Reserve memory at an offset closer to bottom of the RAM to * minimize the impact of memory hot-remove operation. */ - memblock_set_bottom_up(true); - base = memblock_find_in_range(base, mem_boundary, size, align); + base = fadump_locate_reserve_mem(base, size); - /* Restore the previous allocation mode */ - memblock_set_bottom_up(is_memblock_bottom_up); - - if (!base) { + if (!base || (base + size > mem_boundary)) { pr_err("Failed to find memory chunk for reservation!\n"); goto error_out; } @@ -557,6 +654,7 @@ int __init fadump_reserve_mem(void) return ret; error_out: fw_dump.fadump_enabled = 0; + fw_dump.reserve_dump_area_size = 0; return 0; } @@ -594,8 +692,11 @@ early_param("fadump_reserve_mem", early_fadump_reserve_mem); void crash_fadump(struct pt_regs *regs, const char *str) { + unsigned int msecs; struct fadump_crash_info_header *fdh = NULL; int old_cpu, this_cpu; + /* Do not include first CPU */ + unsigned int ncpus = num_online_cpus() - 1; if (!should_fadump_crash()) return; @@ -611,6 +712,8 @@ void crash_fadump(struct pt_regs *regs, const char *str) old_cpu = cmpxchg(&crashing_cpu, -1, this_cpu); if (old_cpu != -1) { + atomic_inc(&cpus_in_fadump); + /* * We can't loop here indefinitely. Wait as long as fadump * is in force. If we race with fadump un-registration this @@ -632,12 +735,22 @@ void crash_fadump(struct pt_regs *regs, const char *str) else ppc_save_regs(&fdh->regs); - fdh->online_mask = *cpu_online_mask; + fdh->cpu_mask = *cpu_online_mask; + + /* + * If we came in via system reset, wait a while for the secondary + * CPUs to enter. + */ + if (TRAP(&(fdh->regs)) == INTERRUPT_SYSTEM_RESET) { + msecs = CRASH_TIMEOUT; + while ((atomic_read(&cpus_in_fadump) < ncpus) && (--msecs > 0)) + mdelay(1); + } fw_dump.ops->fadump_trigger(fdh, str); } -u32 *fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs) +u32 *__init fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs) { struct elf_prstatus prstatus; @@ -646,18 +759,16 @@ u32 *fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs) * FIXME: How do i get PID? Do I really need it? * prstatus.pr_pid = ???? */ - elf_core_copy_kernel_regs(&prstatus.pr_reg, regs); + elf_core_copy_regs(&prstatus.pr_reg, regs); buf = append_elf_note(buf, CRASH_CORE_NOTE_NAME, NT_PRSTATUS, &prstatus, sizeof(prstatus)); return buf; } -void fadump_update_elfcore_header(char *bufp) +void __init fadump_update_elfcore_header(char *bufp) { - struct elfhdr *elf; struct elf_phdr *phdr; - elf = (struct elfhdr *)bufp; bufp += sizeof(struct elfhdr); /* First note is a place holder for cpu notes info. */ @@ -672,7 +783,7 @@ void fadump_update_elfcore_header(char *bufp) return; } -static void *fadump_alloc_buffer(unsigned long size) +static void *__init fadump_alloc_buffer(unsigned long size) { unsigned long count, i; struct page *page; @@ -694,7 +805,7 @@ static void fadump_free_buffer(unsigned long vaddr, unsigned long size) free_reserved_area((void *)vaddr, (void *)(vaddr + size), -1, NULL); } -s32 fadump_setup_cpu_notes_buf(u32 num_cpus) +s32 __init fadump_setup_cpu_notes_buf(u32 num_cpus) { /* Allocate buffer to hold cpu crash notes. */ fw_dump.cpu_notes_buf_size = num_cpus * sizeof(note_buf_t); @@ -726,10 +837,14 @@ void fadump_free_cpu_notes_buf(void) static void fadump_free_mem_ranges(struct fadump_mrange_info *mrange_info) { + if (mrange_info->is_static) { + mrange_info->mem_range_cnt = 0; + return; + } + kfree(mrange_info->mem_ranges); - mrange_info->mem_ranges = NULL; - mrange_info->mem_ranges_sz = 0; - mrange_info->max_mem_ranges = 0; + memset((void *)((u64)mrange_info + RNG_NAME_SZ), 0, + (sizeof(struct fadump_mrange_info) - RNG_NAME_SZ)); } /* @@ -759,7 +874,6 @@ static int fadump_alloc_mem_ranges(struct fadump_mrange_info *mrange_info) sizeof(struct fadump_memory_range)); return 0; } - static inline int fadump_add_mem_range(struct fadump_mrange_info *mrange_info, u64 base, u64 end) { @@ -778,7 +892,12 @@ static inline int fadump_add_mem_range(struct fadump_mrange_info *mrange_info, start = mem_ranges[mrange_info->mem_range_cnt - 1].base; size = mem_ranges[mrange_info->mem_range_cnt - 1].size; - if ((start + size) == base) + /* + * Boot memory area needs separate PT_LOAD segment(s) as it + * is moved to a different location at the time of crash. + * So, fold only if the region is not boot memory area. + */ + if ((start + size) == base && start >= fw_dump.boot_mem_top) is_adjacent = true; } if (!is_adjacent) { @@ -786,6 +905,12 @@ static inline int fadump_add_mem_range(struct fadump_mrange_info *mrange_info, if (mrange_info->mem_range_cnt == mrange_info->max_mem_ranges) { int ret; + if (mrange_info->is_static) { + pr_err("Reached array size limit for %s memory ranges\n", + mrange_info->name); + return -ENOSPC; + } + ret = fadump_alloc_mem_ranges(mrange_info); if (ret) return ret; @@ -854,11 +979,14 @@ static int fadump_init_elfcore_header(char *bufp) elf->e_entry = 0; elf->e_phoff = sizeof(struct elfhdr); elf->e_shoff = 0; -#if defined(_CALL_ELF) - elf->e_flags = _CALL_ELF; -#else - elf->e_flags = 0; -#endif + + if (IS_ENABLED(CONFIG_PPC64_ELF_ABI_V2)) + elf->e_flags = 2; + else if (IS_ENABLED(CONFIG_PPC64_ELF_ABI_V1)) + elf->e_flags = 1; + else + elf->e_flags = 0; + elf->e_ehsize = sizeof(struct elfhdr); elf->e_phentsize = sizeof(struct elf_phdr); elf->e_phnum = 0; @@ -875,9 +1003,8 @@ static int fadump_init_elfcore_header(char *bufp) */ static int fadump_setup_crash_memory_ranges(void) { - struct memblock_region *reg; - u64 start, end; - int i, ret; + u64 i, start, end; + int ret; pr_debug("Setup crash memory ranges.\n"); crash_mrange_info.mem_range_cnt = 0; @@ -895,10 +1022,7 @@ static int fadump_setup_crash_memory_ranges(void) return ret; } - for_each_memblock(memory, reg) { - start = (u64)reg->base; - end = start + (u64)reg->size; - + for_each_mem_range(i, &start, &end) { /* * skip the memory chunk that is already added * (0 through boot_memory_top). @@ -1054,6 +1178,11 @@ static unsigned long init_fadump_header(unsigned long addr) fdh->elfcorehdr_addr = addr; /* We will set the crashing cpu id in crash_fadump() during crash. */ fdh->crashing_cpu = FADUMP_CPU_UNKNOWN; + /* + * When LPAR is terminated by PYHP, ensure all possible CPUs' + * register data is processed while exporting the vmcore. + */ + fdh->cpu_mask = *cpu_possible_mask; return addr; } @@ -1132,14 +1261,17 @@ static void fadump_free_reserved_memory(unsigned long start_pfn, */ static void fadump_release_reserved_area(u64 start, u64 end) { + unsigned long reg_spfn, reg_epfn; u64 tstart, tend, spfn, epfn; - struct memblock_region *reg; + int i; spfn = PHYS_PFN(start); epfn = PHYS_PFN(end); - for_each_memblock(memory, reg) { - tstart = max_t(u64, spfn, memblock_region_memory_base_pfn(reg)); - tend = min_t(u64, epfn, memblock_region_memory_end_pfn(reg)); + + for_each_mem_pfn_range(i, MAX_NUMNODES, ®_spfn, ®_epfn, NULL) { + tstart = max_t(u64, spfn, reg_spfn); + tend = min_t(u64, epfn, reg_epfn); + if (tstart < tend) { fadump_free_reserved_memory(tstart, tend); @@ -1158,7 +1290,6 @@ static void fadump_release_reserved_area(u64 start, u64 end) static void sort_and_merge_mem_ranges(struct fadump_mrange_info *mrange_info) { struct fadump_memory_range *mem_ranges; - struct fadump_memory_range tmp_range; u64 base, size; int i, j, idx; @@ -1173,11 +1304,8 @@ static void sort_and_merge_mem_ranges(struct fadump_mrange_info *mrange_info) if (mem_ranges[idx].base > mem_ranges[j].base) idx = j; } - if (idx != i) { - tmp_range = mem_ranges[idx]; - mem_ranges[idx] = mem_ranges[i]; - mem_ranges[i] = tmp_range; - } + if (idx != i) + swap(mem_ranges[idx], mem_ranges[i]); } /* Merge adjacent reserved ranges */ @@ -1202,20 +1330,19 @@ static void sort_and_merge_mem_ranges(struct fadump_mrange_info *mrange_info) * Scan reserved-ranges to consider them while reserving/releasing * memory for FADump. */ -static inline int fadump_scan_reserved_mem_ranges(void) +static void __init early_init_dt_scan_reserved_ranges(unsigned long node) { - struct device_node *root; const __be32 *prop; int len, ret = -1; unsigned long i; - root = of_find_node_by_path("/"); - if (!root) - return ret; + /* reserved-ranges already scanned */ + if (reserved_mrange_info.mem_range_cnt != 0) + return; - prop = of_get_property(root, "reserved-ranges", &len); + prop = of_get_flat_dt_prop(node, "reserved-ranges", &len); if (!prop) - return ret; + return; /* * Each reserved range is an (address,size) pair, 2 cells each, @@ -1237,7 +1364,8 @@ static inline int fadump_scan_reserved_mem_ranges(void) } } - return ret; + /* Compact reserved ranges */ + sort_and_merge_mem_ranges(&reserved_mrange_info); } /* @@ -1251,32 +1379,21 @@ static void fadump_release_memory(u64 begin, u64 end) u64 ra_start, ra_end, tstart; int i, ret; - fadump_scan_reserved_mem_ranges(); - ra_start = fw_dump.reserve_dump_area_start; ra_end = ra_start + fw_dump.reserve_dump_area_size; /* - * Add reserved dump area to reserved ranges list - * and exclude all these ranges while releasing memory. + * If reserved ranges array limit is hit, overwrite the last reserved + * memory range with reserved dump area to ensure it is excluded from + * the memory being released (reused for next FADump registration). */ - ret = fadump_add_mem_range(&reserved_mrange_info, ra_start, ra_end); - if (ret != 0) { - /* - * Not enough memory to setup reserved ranges but the system is - * running shortage of memory. So, release all the memory except - * Reserved dump area (reused for next fadump registration). - */ - if (begin < ra_end && end > ra_start) { - if (begin < ra_start) - fadump_release_reserved_area(begin, ra_start); - if (end > ra_end) - fadump_release_reserved_area(ra_end, end); - } else - fadump_release_reserved_area(begin, end); + if (reserved_mrange_info.mem_range_cnt == + reserved_mrange_info.max_mem_ranges) + reserved_mrange_info.mem_range_cnt--; + ret = fadump_add_mem_range(&reserved_mrange_info, ra_start, ra_end); + if (ret != 0) return; - } /* Get the reserved ranges list in order first. */ sort_and_merge_mem_ranges(&reserved_mrange_info); @@ -1323,9 +1440,9 @@ static void fadump_invalidate_release_mem(void) fw_dump.ops->fadump_init_mem_struct(&fw_dump); } -static ssize_t fadump_release_memory_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) +static ssize_t release_mem_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) { int input = -1; @@ -1350,23 +1467,40 @@ static ssize_t fadump_release_memory_store(struct kobject *kobj, return count; } -static ssize_t fadump_enabled_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) +/* Release the reserved memory and disable the FADump */ +static void __init unregister_fadump(void) +{ + fadump_cleanup(); + fadump_release_memory(fw_dump.reserve_dump_area_start, + fw_dump.reserve_dump_area_size); + fw_dump.fadump_enabled = 0; + kobject_put(fadump_kobj); +} + +static ssize_t enabled_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) { return sprintf(buf, "%d\n", fw_dump.fadump_enabled); } -static ssize_t fadump_register_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) +static ssize_t mem_reserved_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%ld\n", fw_dump.reserve_dump_area_size); +} + +static ssize_t registered_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) { return sprintf(buf, "%d\n", fw_dump.dump_registered); } -static ssize_t fadump_register_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) +static ssize_t registered_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) { int ret = 0; int input = -1; @@ -1418,45 +1552,82 @@ static int fadump_region_show(struct seq_file *m, void *private) return 0; } -static struct kobj_attribute fadump_release_attr = __ATTR(fadump_release_mem, - 0200, NULL, - fadump_release_memory_store); -static struct kobj_attribute fadump_attr = __ATTR(fadump_enabled, - 0444, fadump_enabled_show, - NULL); -static struct kobj_attribute fadump_register_attr = __ATTR(fadump_registered, - 0644, fadump_register_show, - fadump_register_store); +static struct kobj_attribute release_attr = __ATTR_WO(release_mem); +static struct kobj_attribute enable_attr = __ATTR_RO(enabled); +static struct kobj_attribute register_attr = __ATTR_RW(registered); +static struct kobj_attribute mem_reserved_attr = __ATTR_RO(mem_reserved); + +static struct attribute *fadump_attrs[] = { + &enable_attr.attr, + ®ister_attr.attr, + &mem_reserved_attr.attr, + NULL, +}; + +ATTRIBUTE_GROUPS(fadump); DEFINE_SHOW_ATTRIBUTE(fadump_region); -static void fadump_init_files(void) +static void __init fadump_init_files(void) { - struct dentry *debugfs_file; int rc = 0; - rc = sysfs_create_file(kernel_kobj, &fadump_attr.attr); - if (rc) - printk(KERN_ERR "fadump: unable to create sysfs file" - " fadump_enabled (%d)\n", rc); + fadump_kobj = kobject_create_and_add("fadump", kernel_kobj); + if (!fadump_kobj) { + pr_err("failed to create fadump kobject\n"); + return; + } + + debugfs_create_file("fadump_region", 0444, arch_debugfs_dir, NULL, + &fadump_region_fops); - rc = sysfs_create_file(kernel_kobj, &fadump_register_attr.attr); - if (rc) - printk(KERN_ERR "fadump: unable to create sysfs file" - " fadump_registered (%d)\n", rc); + if (fw_dump.dump_active) { + rc = sysfs_create_file(fadump_kobj, &release_attr.attr); + if (rc) + pr_err("unable to create release_mem sysfs file (%d)\n", + rc); + } - debugfs_file = debugfs_create_file("fadump_region", 0444, - powerpc_debugfs_root, NULL, - &fadump_region_fops); - if (!debugfs_file) - printk(KERN_ERR "fadump: unable to create debugfs file" - " fadump_region\n"); + rc = sysfs_create_groups(fadump_kobj, fadump_groups); + if (rc) { + pr_err("sysfs group creation failed (%d), unregistering FADump", + rc); + unregister_fadump(); + return; + } + + /* + * The FADump sysfs are moved from kernel_kobj to fadump_kobj need to + * create symlink at old location to maintain backward compatibility. + * + * - fadump_enabled -> fadump/enabled + * - fadump_registered -> fadump/registered + * - fadump_release_mem -> fadump/release_mem + */ + rc = compat_only_sysfs_link_entry_to_kobj(kernel_kobj, fadump_kobj, + "enabled", "fadump_enabled"); + if (rc) { + pr_err("unable to create fadump_enabled symlink (%d)", rc); + return; + } + + rc = compat_only_sysfs_link_entry_to_kobj(kernel_kobj, fadump_kobj, + "registered", + "fadump_registered"); + if (rc) { + pr_err("unable to create fadump_registered symlink (%d)", rc); + sysfs_remove_link(kernel_kobj, "fadump_enabled"); + return; + } if (fw_dump.dump_active) { - rc = sysfs_create_file(kernel_kobj, &fadump_release_attr.attr); + rc = compat_only_sysfs_link_entry_to_kobj(kernel_kobj, + fadump_kobj, + "release_mem", + "fadump_release_mem"); if (rc) - printk(KERN_ERR "fadump: unable to create sysfs file" - " fadump_release_mem (%d)\n", rc); + pr_err("unable to create fadump_release_mem symlink (%d)", + rc); } return; } @@ -1487,13 +1658,28 @@ int __init setup_fadump(void) if (fw_dump.ops->fadump_process(&fw_dump) < 0) fadump_invalidate_release_mem(); } - /* Initialize the kernel dump memory structure for FAD registration. */ - else if (fw_dump.reserve_dump_area_size) + /* Initialize the kernel dump memory structure and register with f/w */ + else if (fw_dump.reserve_dump_area_size) { fw_dump.ops->fadump_init_mem_struct(&fw_dump); + register_fadump(); + } + + /* + * In case of panic, fadump is triggered via ppc_panic_event() + * panic notifier. Setting crash_kexec_post_notifiers to 'true' + * lets panic() function take crash friendly path before panic + * notifiers are invoked. + */ + crash_kexec_post_notifiers = true; return 1; } -subsys_initcall(setup_fadump); +/* + * Use subsys_initcall_sync() here because there is dependency with + * crash_save_vmcoreinfo_init(), which must run first to ensure vmcoreinfo initialization + * is done before registering with f/w. + */ +subsys_initcall_sync(setup_fadump); #else /* !CONFIG_PRESERVE_FA_DUMP */ /* Scan the Firmware Assisted dump configuration details. */ @@ -1531,12 +1717,10 @@ int __init fadump_reserve_mem(void) /* Preserve everything above the base address */ static void __init fadump_reserve_crash_area(u64 base) { - struct memblock_region *reg; - u64 mstart, msize; + u64 i, mstart, mend, msize; - for_each_memblock(memory, reg) { - mstart = reg->base; - msize = reg->size; + for_each_mem_range(i, &mstart, &mend) { + msize = mend - mstart; if ((mstart + msize) < base) continue; diff --git a/arch/powerpc/kernel/firmware.c b/arch/powerpc/kernel/firmware.c index cc4a5e3f51f1..8987eee33dc8 100644 --- a/arch/powerpc/kernel/firmware.c +++ b/arch/powerpc/kernel/firmware.c @@ -11,8 +11,33 @@ #include <linux/export.h> #include <linux/cache.h> +#include <linux/of.h> #include <asm/firmware.h> +#include <asm/kvm_guest.h> +#ifdef CONFIG_PPC64 unsigned long powerpc_firmware_features __read_mostly; EXPORT_SYMBOL_GPL(powerpc_firmware_features); +#endif + +#if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_KVM_GUEST) +DEFINE_STATIC_KEY_FALSE(kvm_guest); +EXPORT_SYMBOL_GPL(kvm_guest); + +int __init check_kvm_guest(void) +{ + struct device_node *hyper_node; + + hyper_node = of_find_node_by_path("/hypervisor"); + if (!hyper_node) + return 0; + + if (of_device_is_compatible(hyper_node, "linux,kvm")) + static_branch_enable(&kvm_guest); + + of_node_put(hyper_node); + return 0; +} +core_initcall(check_kvm_guest); // before kvm_guest_init() +#endif diff --git a/arch/powerpc/kernel/fpu.S b/arch/powerpc/kernel/fpu.S index 0bb991ddd264..2f8f3f93cbb6 100644 --- a/arch/powerpc/kernel/fpu.S +++ b/arch/powerpc/kernel/fpu.S @@ -9,21 +9,29 @@ * Copyright (C) 1997 Dan Malek (dmalek@jlc.net). */ +#include <linux/export.h> #include <asm/reg.h> #include <asm/page.h> #include <asm/mmu.h> -#include <asm/pgtable.h> #include <asm/cputable.h> #include <asm/cache.h> #include <asm/thread_info.h> #include <asm/ppc_asm.h> #include <asm/asm-offsets.h> #include <asm/ptrace.h> -#include <asm/export.h> #include <asm/asm-compat.h> #include <asm/feature-fixups.h> #ifdef CONFIG_VSX +#define __REST_1FPVSR(n,c,base) \ +BEGIN_FTR_SECTION \ + b 2f; \ +END_FTR_SECTION_IFSET(CPU_FTR_VSX); \ + REST_FPR(n,base); \ + b 3f; \ +2: REST_VSR(n,c,base); \ +3: + #define __REST_32FPVSRS(n,c,base) \ BEGIN_FTR_SECTION \ b 2f; \ @@ -42,9 +50,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX); \ 2: SAVE_32VSRS(n,c,base); \ 3: #else +#define __REST_1FPVSR(n,b,base) REST_FPR(n, base) #define __REST_32FPVSRS(n,b,base) REST_32FPRS(n, base) #define __SAVE_32FPVSRS(n,b,base) SAVE_32FPRS(n, base) #endif +#define REST_1FPVSR(n,c,base) __REST_1FPVSR(n,__REG_##c,__REG_##base) #define REST_32FPVSRS(n,c,base) __REST_32FPVSRS(n,__REG_##c,__REG_##base) #define SAVE_32FPVSRS(n,c,base) __SAVE_32FPVSRS(n,__REG_##c,__REG_##base) @@ -68,6 +78,7 @@ _GLOBAL(store_fp_state) SAVE_32FPVSRS(0, R4, R3) mffs fr0 stfd fr0,FPSTATE_FPSCR(r3) + REST_1FPVSR(0, R4, R3) blr EXPORT_SYMBOL(store_fp_state) @@ -82,18 +93,22 @@ EXPORT_SYMBOL(store_fp_state) */ _GLOBAL(load_up_fpu) mfmsr r5 +#ifdef CONFIG_PPC_BOOK3S_64 + /* interrupt doesn't set MSR[RI] and HPT can fault on current access */ + ori r5,r5,MSR_FP|MSR_RI +#else ori r5,r5,MSR_FP +#endif #ifdef CONFIG_VSX BEGIN_FTR_SECTION oris r5,r5,MSR_VSX@h END_FTR_SECTION_IFSET(CPU_FTR_VSX) #endif - SYNC MTMSRD(r5) /* enable use of fpu now */ isync /* enable use of FP after return */ #ifdef CONFIG_PPC32 - mfspr r5,SPRN_SPRG_THREAD /* current task's THREAD (phys) */ + addi r5,r2,THREAD lwz r4,THREAD_FPEXC_MODE(r5) ori r9,r9,MSR_FP /* enable FP for current */ or r9,r9,r4 @@ -104,10 +119,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX) ori r12,r12,MSR_FP or r12,r12,r4 std r12,_MSR(r1) +#ifdef CONFIG_PPC_BOOK3S_64 + li r4,0 + stb r4,PACASRR_VALID(r13) #endif - /* Don't care if r4 overflows, this is desired behaviour */ - lbz r4,THREAD_LOAD_FP(r5) - addi r4,r4,1 +#endif + li r4,1 stb r4,THREAD_LOAD_FP(r5) addi r10,r5,THREAD_FPSTATE lfd fr0,FPSTATE_FPSCR(r10) @@ -116,6 +133,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX) /* restore registers and return */ /* we haven't used ctr or xer or lr */ blr +_ASM_NOKPROBE_SYMBOL(load_up_fpu) /* * save_fpu(tsk) @@ -132,19 +150,5 @@ _GLOBAL(save_fpu) 2: SAVE_32FPVSRS(0, R4, R6) mffs fr0 stfd fr0,FPSTATE_FPSCR(r6) - blr - -/* - * These are used in the alignment trap handler when emulating - * single-precision loads and stores. - */ - -_GLOBAL(cvt_fd) - lfs 0,0(r3) - stfd 0,0(r4) - blr - -_GLOBAL(cvt_df) - lfd 0,0(r3) - stfs 0,0(r4) + REST_1FPVSR(0, R4, R6) blr diff --git a/arch/powerpc/kernel/head_32.h b/arch/powerpc/kernel/head_32.h index 8abc7783dbe5..f8e2911478a7 100644 --- a/arch/powerpc/kernel/head_32.h +++ b/arch/powerpc/kernel/head_32.h @@ -10,135 +10,161 @@ * We assume sprg3 has the physical address of the current * task's thread_struct. */ +.macro EXCEPTION_PROLOG trapno name handle_dar_dsisr=0 + EXCEPTION_PROLOG_0 handle_dar_dsisr=\handle_dar_dsisr + EXCEPTION_PROLOG_1 + EXCEPTION_PROLOG_2 \trapno \name handle_dar_dsisr=\handle_dar_dsisr +.endm -.macro EXCEPTION_PROLOG +.macro EXCEPTION_PROLOG_0 handle_dar_dsisr=0 mtspr SPRN_SPRG_SCRATCH0,r10 mtspr SPRN_SPRG_SCRATCH1,r11 + mfspr r10, SPRN_SPRG_THREAD + .if \handle_dar_dsisr +#ifdef CONFIG_40x + mfspr r11, SPRN_DEAR +#else + mfspr r11, SPRN_DAR +#endif + stw r11, DAR(r10) +#ifdef CONFIG_40x + mfspr r11, SPRN_ESR +#else + mfspr r11, SPRN_DSISR +#endif + stw r11, DSISR(r10) + .endif + mfspr r11, SPRN_SRR0 + stw r11, SRR0(r10) + mfspr r11, SPRN_SRR1 /* check whether user or kernel */ + stw r11, SRR1(r10) mfcr r10 - EXCEPTION_PROLOG_1 - EXCEPTION_PROLOG_2 + andi. r11, r11, MSR_PR .endm .macro EXCEPTION_PROLOG_1 - mfspr r11,SPRN_SRR1 /* check whether user or kernel */ - andi. r11,r11,MSR_PR - tophys(r11,r1) /* use tophys(r1) if kernel */ + mtspr SPRN_SPRG_SCRATCH2,r1 + subi r1, r1, INT_FRAME_SIZE /* use r1 if kernel */ beq 1f - mfspr r11,SPRN_SPRG_THREAD - lwz r11,TASK_STACK-THREAD(r11) - addi r11,r11,THREAD_SIZE - tophys(r11,r11) -1: subi r11,r11,INT_FRAME_SIZE /* alloc exc. frame */ + mfspr r1,SPRN_SPRG_THREAD + lwz r1,TASK_STACK-THREAD(r1) + addi r1, r1, THREAD_SIZE - INT_FRAME_SIZE +1: +#ifdef CONFIG_VMAP_STACK + mtcrf 0x3f, r1 + bt 32 - THREAD_ALIGN_SHIFT, vmap_stack_overflow +#endif .endm -.macro EXCEPTION_PROLOG_2 +.macro EXCEPTION_PROLOG_2 trapno name handle_dar_dsisr=0 +#ifdef CONFIG_PPC_8xx + .if \handle_dar_dsisr + li r11, RPN_PATTERN + mtspr SPRN_DAR, r11 /* Tag DAR, to be used in DTLB Error */ + .endif +#endif + LOAD_REG_IMMEDIATE(r11, MSR_KERNEL & ~MSR_RI) /* re-enable MMU */ + mtspr SPRN_SRR1, r11 + lis r11, 1f@h + ori r11, r11, 1f@l + mtspr SPRN_SRR0, r11 + mfspr r11, SPRN_SPRG_SCRATCH2 + rfi + + .text +\name\()_virt: +1: + stw r11,GPR1(r1) + stw r11,0(r1) + mr r11, r1 stw r10,_CCR(r11) /* save registers */ stw r12,GPR12(r11) stw r9,GPR9(r11) mfspr r10,SPRN_SPRG_SCRATCH0 - stw r10,GPR10(r11) mfspr r12,SPRN_SPRG_SCRATCH1 + stw r10,GPR10(r11) stw r12,GPR11(r11) mflr r10 stw r10,_LINK(r11) - mfspr r12,SPRN_SRR0 - mfspr r9,SPRN_SRR1 - stw r1,GPR1(r11) - stw r1,0(r11) - tovirt(r1,r11) /* set new kernel sp */ + mfspr r12, SPRN_SPRG_THREAD + tovirt(r12, r12) + .if \handle_dar_dsisr + lwz r10, DAR(r12) + stw r10, _DAR(r11) + lwz r10, DSISR(r12) + stw r10, _DSISR(r11) + .endif + lwz r9, SRR1(r12) + lwz r12, SRR0(r12) #ifdef CONFIG_40x rlwinm r9,r9,0,14,12 /* clear MSR_WE (necessary?) */ +#elif defined(CONFIG_PPC_8xx) + mtspr SPRN_EID, r2 /* Set MSR_RI */ #else - li r10,MSR_KERNEL & ~(MSR_IR|MSR_DR) /* can take exceptions */ - MTMSRD(r10) /* (except for mach check in rtas) */ + li r10, MSR_KERNEL /* can take exceptions */ + mtmsr r10 /* (except for mach check in rtas) */ #endif - stw r0,GPR0(r11) - lis r10,STACK_FRAME_REGS_MARKER@ha /* exception frame marker */ - addi r10,r10,STACK_FRAME_REGS_MARKER@l - stw r10,8(r11) - SAVE_4GPRS(3, r11) - SAVE_2GPRS(7, r11) + COMMON_EXCEPTION_PROLOG_END \trapno +_ASM_NOKPROBE_SYMBOL(\name\()_virt) .endm -.macro SYSCALL_ENTRY trapno - mfspr r12,SPRN_SPRG_THREAD - mfcr r10 - lwz r11,TASK_STACK-THREAD(r12) - mflr r9 - addi r11,r11,THREAD_SIZE - INT_FRAME_SIZE - rlwinm r10,r10,0,4,2 /* Clear SO bit in CR */ - tophys(r11,r11) - stw r10,_CCR(r11) /* save registers */ - mfspr r10,SPRN_SRR0 - stw r9,_LINK(r11) - mfspr r9,SPRN_SRR1 - stw r1,GPR1(r11) - stw r1,0(r11) - tovirt(r1,r11) /* set new kernel sp */ - stw r10,_NIP(r11) -#ifdef CONFIG_40x - rlwinm r9,r9,0,14,12 /* clear MSR_WE (necessary?) */ -#else - LOAD_REG_IMMEDIATE(r10, MSR_KERNEL & ~(MSR_IR|MSR_DR)) /* can take exceptions */ - MTMSRD(r10) /* (except for mach check in rtas) */ -#endif +.macro COMMON_EXCEPTION_PROLOG_END trapno + stw r0,GPR0(r1) lis r10,STACK_FRAME_REGS_MARKER@ha /* exception frame marker */ - stw r2,GPR2(r11) addi r10,r10,STACK_FRAME_REGS_MARKER@l - stw r9,_MSR(r11) - li r2, \trapno + 1 - stw r10,8(r11) - stw r2,_TRAP(r11) - SAVE_GPR(0, r11) - SAVE_4GPRS(3, r11) - SAVE_2GPRS(7, r11) - addi r11,r1,STACK_FRAME_OVERHEAD - addi r2,r12,-THREAD - stw r11,PT_REGS(r12) -#if defined(CONFIG_40x) - /* Check to see if the dbcr0 register is set up to debug. Use the - internal debug mode bit to do this. */ - lwz r12,THREAD_DBCR0(r12) - andis. r12,r12,DBCR0_IDM@h -#endif - ACCOUNT_CPU_USER_ENTRY(r2, r11, r12) -#if defined(CONFIG_40x) - beq+ 3f - /* From user and task is ptraced - load up global dbcr0 */ - li r12,-1 /* clear all pending debug events */ - mtspr SPRN_DBSR,r12 - lis r11,global_dbcr0@ha - tophys(r11,r11) - addi r11,r11,global_dbcr0@l - lwz r12,0(r11) - mtspr SPRN_DBCR0,r12 - lwz r12,4(r11) - addi r12,r12,-1 - stw r12,4(r11) -#endif + stw r10,STACK_INT_FRAME_MARKER(r1) + li r10, \trapno + stw r10,_TRAP(r1) + SAVE_GPRS(3, 8, r1) + SAVE_NVGPRS(r1) + stw r2,GPR2(r1) + stw r12,_NIP(r1) + stw r9,_MSR(r1) + mfctr r10 + mfspr r2,SPRN_SPRG_THREAD + stw r10,_CTR(r1) + tovirt(r2, r2) + mfspr r10,SPRN_XER + addi r2, r2, -THREAD + stw r10,_XER(r1) + addi r3,r1,STACK_INT_FRAME_REGS +.endm -3: - tovirt(r2, r2) /* set r2 to current */ - lis r11, transfer_to_syscall@h - ori r11, r11, transfer_to_syscall@l -#ifdef CONFIG_TRACE_IRQFLAGS - /* - * If MSR is changing we need to keep interrupts disabled at this point - * otherwise we might risk taking an interrupt before we tell lockdep - * they are enabled. - */ - LOAD_REG_IMMEDIATE(r10, MSR_KERNEL) - rlwimi r10, r9, 0, MSR_EE -#else - LOAD_REG_IMMEDIATE(r10, MSR_KERNEL | MSR_EE) +.macro prepare_transfer_to_handler +#ifdef CONFIG_PPC_BOOK3S_32 + andi. r12,r9,MSR_PR + bne 777f + bl prepare_transfer_to_handler +#ifdef CONFIG_PPC_KUEP + b 778f +777: + bl __kuep_lock +778: #endif -#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS) - mtspr SPRN_NRI, r0 +777: #endif - mtspr SPRN_SRR1,r10 - mtspr SPRN_SRR0,r11 - SYNC - RFI /* jump to handler, enable MMU */ +.endm + +.macro SYSCALL_ENTRY trapno + mfspr r9, SPRN_SRR1 + mfspr r12, SPRN_SRR0 + LOAD_REG_IMMEDIATE(r11, MSR_KERNEL) /* can take exceptions */ + lis r10, 1f@h + ori r10, r10, 1f@l + mtspr SPRN_SRR1, r11 + mtspr SPRN_SRR0, r10 + mfspr r10,SPRN_SPRG_THREAD + mr r11, r1 + lwz r1,TASK_STACK-THREAD(r10) + tovirt(r10, r10) + addi r1, r1, THREAD_SIZE - INT_FRAME_SIZE + rfi +1: + stw r12,_NIP(r1) + mfcr r12 + rlwinm r12,r12,0,4,2 /* Clear SO bit in CR */ + stw r12,_CCR(r1) + b transfer_to_syscall /* jump to handler */ .endm /* @@ -154,37 +180,43 @@ */ #ifdef CONFIG_PPC_BOOK3S #define START_EXCEPTION(n, label) \ + __HEAD; \ . = n; \ DO_KVM n; \ label: #else #define START_EXCEPTION(n, label) \ + __HEAD; \ . = n; \ label: #endif -#define EXCEPTION(n, label, hdlr, xfer) \ +#define EXCEPTION(n, label, hdlr) \ START_EXCEPTION(n, label) \ - EXCEPTION_PROLOG; \ - addi r3,r1,STACK_FRAME_OVERHEAD; \ - xfer(n, hdlr) - -#define EXC_XFER_TEMPLATE(hdlr, trap, msr, tfer, ret) \ - li r10,trap; \ - stw r10,_TRAP(r11); \ - LOAD_REG_IMMEDIATE(r10, msr); \ - bl tfer; \ - .long hdlr; \ - .long ret - -#define EXC_XFER_STD(n, hdlr) \ - EXC_XFER_TEMPLATE(hdlr, n, MSR_KERNEL, transfer_to_handler_full, \ - ret_from_except_full) - -#define EXC_XFER_LITE(n, hdlr) \ - EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL, transfer_to_handler, \ - ret_from_except) + EXCEPTION_PROLOG n label; \ + prepare_transfer_to_handler; \ + bl hdlr; \ + b interrupt_return + +.macro vmap_stack_overflow_exception + __HEAD +vmap_stack_overflow: +#ifdef CONFIG_SMP + mfspr r1, SPRN_SPRG_THREAD + lwz r1, TASK_CPU - THREAD(r1) + slwi r1, r1, 3 + addis r1, r1, emergency_ctx-PAGE_OFFSET@ha +#else + lis r1, emergency_ctx-PAGE_OFFSET@ha +#endif + lwz r1, emergency_ctx-PAGE_OFFSET@l(r1) + addi r1, r1, THREAD_SIZE - INT_FRAME_SIZE + EXCEPTION_PROLOG_2 0 vmap_stack_overflow + prepare_transfer_to_handler + bl stack_overflow_exception + b interrupt_return +.endm #endif /* __HEAD_32_H__ */ diff --git a/arch/powerpc/kernel/head_40x.S b/arch/powerpc/kernel/head_40x.S index 585ea1976550..9fc90410b385 100644 --- a/arch/powerpc/kernel/head_40x.S +++ b/arch/powerpc/kernel/head_40x.S @@ -26,17 +26,18 @@ */ #include <linux/init.h> +#include <linux/pgtable.h> +#include <linux/sizes.h> +#include <linux/linkage.h> + #include <asm/processor.h> #include <asm/page.h> #include <asm/mmu.h> -#include <asm/pgtable.h> #include <asm/cputable.h> #include <asm/thread_info.h> #include <asm/ppc_asm.h> #include <asm/asm-offsets.h> #include <asm/ptrace.h> -#include <asm/export.h> -#include <asm/asm-405.h> #include "head_32.h" @@ -53,8 +54,8 @@ * This is all going to change RSN when we add bi_recs....... -- Dan */ __HEAD -_ENTRY(_stext); -_ENTRY(_start); +_GLOBAL(_stext); +_GLOBAL(_start); mr r31,r3 /* save device tree ptr */ @@ -73,7 +74,6 @@ turn_on_mmu: lis r0,start_here@h ori r0,r0,start_here@l mtspr SPRN_SRR0,r0 - SYNC rfi /* enables MMU */ b . /* prevent prefetch past rfi */ @@ -83,15 +83,19 @@ turn_on_mmu: */ . = 0xc0 crit_save: -_ENTRY(crit_r10) +_GLOBAL(crit_r10) + .space 4 +_GLOBAL(crit_r11) .space 4 -_ENTRY(crit_r11) +_GLOBAL(crit_srr0) .space 4 -_ENTRY(crit_srr0) +_GLOBAL(crit_srr1) .space 4 -_ENTRY(crit_srr1) +_GLOBAL(crit_r1) .space 4 -_ENTRY(saved_ksp_limit) +_GLOBAL(crit_dear) + .space 4 +_GLOBAL(crit_esr) .space 4 /* @@ -102,42 +106,62 @@ _ENTRY(saved_ksp_limit) * Instead we use a couple of words of memory at low physical addresses. * This is OK since we don't support SMP on these processors. */ -#define CRITICAL_EXCEPTION_PROLOG \ - stw r10,crit_r10@l(0); /* save two registers to work with */\ - stw r11,crit_r11@l(0); \ - mfcr r10; /* save CR in r10 for now */\ - mfspr r11,SPRN_SRR3; /* check whether user or kernel */\ - andi. r11,r11,MSR_PR; \ - lis r11,critirq_ctx@ha; \ - tophys(r11,r11); \ - lwz r11,critirq_ctx@l(r11); \ - beq 1f; \ - /* COMING FROM USER MODE */ \ - mfspr r11,SPRN_SPRG_THREAD; /* if from user, start at top of */\ - lwz r11,TASK_STACK-THREAD(r11); /* this thread's kernel stack */\ -1: addi r11,r11,THREAD_SIZE-INT_FRAME_SIZE; /* Alloc an excpt frm */\ - tophys(r11,r11); \ - stw r10,_CCR(r11); /* save various registers */\ - stw r12,GPR12(r11); \ - stw r9,GPR9(r11); \ - mflr r10; \ - stw r10,_LINK(r11); \ - mfspr r12,SPRN_DEAR; /* save DEAR and ESR in the frame */\ - stw r12,_DEAR(r11); /* since they may have had stuff */\ - mfspr r9,SPRN_ESR; /* in them at the point where the */\ - stw r9,_ESR(r11); /* exception was taken */\ - mfspr r12,SPRN_SRR2; \ - stw r1,GPR1(r11); \ - mfspr r9,SPRN_SRR3; \ - stw r1,0(r11); \ - tovirt(r1,r11); \ - rlwinm r9,r9,0,14,12; /* clear MSR_WE (necessary?) */\ - stw r0,GPR0(r11); \ - lis r10, STACK_FRAME_REGS_MARKER@ha; /* exception frame marker */\ - addi r10, r10, STACK_FRAME_REGS_MARKER@l; \ - stw r10, 8(r11); \ - SAVE_4GPRS(3, r11); \ - SAVE_2GPRS(7, r11) +.macro CRITICAL_EXCEPTION_PROLOG trapno name + stw r10,crit_r10@l(0) /* save two registers to work with */ + stw r11,crit_r11@l(0) + mfspr r10,SPRN_SRR0 + mfspr r11,SPRN_SRR1 + stw r10,crit_srr0@l(0) + stw r11,crit_srr1@l(0) + mfspr r10,SPRN_DEAR + mfspr r11,SPRN_ESR + stw r10,crit_dear@l(0) + stw r11,crit_esr@l(0) + mfcr r10 /* save CR in r10 for now */ + mfspr r11,SPRN_SRR3 /* check whether user or kernel */ + andi. r11,r11,MSR_PR + lis r11,(critirq_ctx-PAGE_OFFSET)@ha + lwz r11,(critirq_ctx-PAGE_OFFSET)@l(r11) + beq 1f + /* COMING FROM USER MODE */ + mfspr r11,SPRN_SPRG_THREAD /* if from user, start at top of */ + lwz r11,TASK_STACK-THREAD(r11) /* this thread's kernel stack */ +1: stw r1,crit_r1@l(0) + addi r1,r11,THREAD_SIZE-INT_FRAME_SIZE /* Alloc an excpt frm */ + LOAD_REG_IMMEDIATE(r11, MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)) /* re-enable MMU */ + mtspr SPRN_SRR1, r11 + lis r11, 1f@h + ori r11, r11, 1f@l + mtspr SPRN_SRR0, r11 + rfi + + .text +1: +\name\()_virt: + lwz r11,crit_r1@l(0) + stw r11,GPR1(r1) + stw r11,0(r1) + mr r11,r1 + stw r10,_CCR(r11) /* save various registers */ + stw r12,GPR12(r11) + stw r9,GPR9(r11) + mflr r10 + stw r10,_LINK(r11) + lis r9,PAGE_OFFSET@ha + lwz r10,crit_r10@l(r9) + lwz r12,crit_r11@l(r9) + stw r10,GPR10(r11) + stw r12,GPR11(r11) + lwz r12,crit_dear@l(r9) + lwz r9,crit_esr@l(r9) + stw r12,_DEAR(r11) /* since they may have had stuff */ + stw r9,_ESR(r11) /* exception was taken */ + mfspr r12,SPRN_SRR2 + mfspr r9,SPRN_SRR3 + rlwinm r9,r9,0,14,12 /* clear MSR_WE (necessary?) */ + COMMON_EXCEPTION_PROLOG_END \trapno + 2 +_ASM_NOKPROBE_SYMBOL(\name\()_virt) +.endm /* * State at this point: @@ -157,10 +181,10 @@ _ENTRY(saved_ksp_limit) */ #define CRITICAL_EXCEPTION(n, label, hdlr) \ START_EXCEPTION(n, label); \ - CRITICAL_EXCEPTION_PROLOG; \ - addi r3,r1,STACK_FRAME_OVERHEAD; \ - EXC_XFER_TEMPLATE(hdlr, n+2, (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), \ - crit_transfer_to_handler, ret_from_crit_exc) + CRITICAL_EXCEPTION_PROLOG n label; \ + prepare_transfer_to_handler; \ + bl hdlr; \ + b ret_from_crit_exc /* * 0x0100 - Critical Interrupt Exception @@ -176,190 +200,71 @@ _ENTRY(saved_ksp_limit) * 0x0300 - Data Storage Exception * This happens for just a few reasons. U0 set (but we don't do that), * or zone protection fault (user violation, write to protected page). - * If this is just an update of modified status, we do that quickly - * and exit. Otherwise, we call heavywight functions to do the work. + * The other Data TLB exceptions bail out to this point + * if they can't resolve the lightweight TLB fault. */ START_EXCEPTION(0x0300, DataStorage) - mtspr SPRN_SPRG_SCRATCH0, r10 /* Save some working registers */ - mtspr SPRN_SPRG_SCRATCH1, r11 -#ifdef CONFIG_403GCX - stw r12, 0(r0) - stw r9, 4(r0) - mfcr r11 - mfspr r12, SPRN_PID - stw r11, 8(r0) - stw r12, 12(r0) -#else - mtspr SPRN_SPRG_SCRATCH3, r12 - mtspr SPRN_SPRG_SCRATCH4, r9 - mfcr r11 - mfspr r12, SPRN_PID - mtspr SPRN_SPRG_SCRATCH6, r11 - mtspr SPRN_SPRG_SCRATCH5, r12 -#endif - - /* First, check if it was a zone fault (which means a user - * tried to access a kernel or read-protected page - always - * a SEGV). All other faults here must be stores, so no - * need to check ESR_DST as well. */ - mfspr r10, SPRN_ESR - andis. r10, r10, ESR_DIZ@h - bne 2f - - mfspr r10, SPRN_DEAR /* Get faulting address */ - - /* If we are faulting a kernel address, we have to use the - * kernel page tables. - */ - lis r11, PAGE_OFFSET@h - cmplw r10, r11 - blt+ 3f - lis r11, swapper_pg_dir@h - ori r11, r11, swapper_pg_dir@l - li r9, 0 - mtspr SPRN_PID, r9 /* TLB will have 0 TID */ - b 4f - - /* Get the PGD for the current thread. - */ -3: - mfspr r11,SPRN_SPRG_THREAD - lwz r11,PGDIR(r11) -4: - tophys(r11, r11) - rlwimi r11, r10, 12, 20, 29 /* Create L1 (pgdir/pmd) address */ - lwz r11, 0(r11) /* Get L1 entry */ - rlwinm. r12, r11, 0, 0, 19 /* Extract L2 (pte) base address */ - beq 2f /* Bail if no table */ - - rlwimi r12, r10, 22, 20, 29 /* Compute PTE address */ - lwz r11, 0(r12) /* Get Linux PTE */ - - andi. r9, r11, _PAGE_RW /* Is it writeable? */ - beq 2f /* Bail if not */ - - /* Update 'changed'. - */ - ori r11, r11, _PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_HWWRITE - stw r11, 0(r12) /* Update Linux page table */ - - /* Most of the Linux PTE is ready to load into the TLB LO. - * We set ZSEL, where only the LS-bit determines user access. - * We set execute, because we don't have the granularity to - * properly set this at the page level (Linux problem). - * If shared is set, we cause a zero PID->TID load. - * Many of these bits are software only. Bits we don't set - * here we (properly should) assume have the appropriate value. - */ - li r12, 0x0ce2 - andc r11, r11, r12 /* Make sure 20, 21 are zero */ - - /* find the TLB index that caused the fault. It has to be here. - */ - tlbsx r9, 0, r10 - - tlbwe r11, r9, TLB_DATA /* Load TLB LO */ - - /* Done...restore registers and get out of here. - */ -#ifdef CONFIG_403GCX - lwz r12, 12(r0) - lwz r11, 8(r0) - mtspr SPRN_PID, r12 - mtcr r11 - lwz r9, 4(r0) - lwz r12, 0(r0) -#else - mfspr r12, SPRN_SPRG_SCRATCH5 - mfspr r11, SPRN_SPRG_SCRATCH6 - mtspr SPRN_PID, r12 - mtcr r11 - mfspr r9, SPRN_SPRG_SCRATCH4 - mfspr r12, SPRN_SPRG_SCRATCH3 -#endif - mfspr r11, SPRN_SPRG_SCRATCH1 - mfspr r10, SPRN_SPRG_SCRATCH0 - PPC405_ERR77_SYNC - rfi /* Should sync shadow TLBs */ - b . /* prevent prefetch past rfi */ - -2: - /* The bailout. Restore registers to pre-exception conditions - * and call the heavyweights to help us out. - */ -#ifdef CONFIG_403GCX - lwz r12, 12(r0) - lwz r11, 8(r0) - mtspr SPRN_PID, r12 - mtcr r11 - lwz r9, 4(r0) - lwz r12, 0(r0) -#else - mfspr r12, SPRN_SPRG_SCRATCH5 - mfspr r11, SPRN_SPRG_SCRATCH6 - mtspr SPRN_PID, r12 - mtcr r11 - mfspr r9, SPRN_SPRG_SCRATCH4 - mfspr r12, SPRN_SPRG_SCRATCH3 -#endif - mfspr r11, SPRN_SPRG_SCRATCH1 - mfspr r10, SPRN_SPRG_SCRATCH0 - b DataAccess + EXCEPTION_PROLOG 0x300 DataStorage handle_dar_dsisr=1 + prepare_transfer_to_handler + bl do_page_fault + b interrupt_return /* * 0x0400 - Instruction Storage Exception * This is caused by a fetch from non-execute or guarded pages. */ START_EXCEPTION(0x0400, InstructionAccess) - EXCEPTION_PROLOG - mr r4,r12 /* Pass SRR0 as arg2 */ - li r5,0 /* Pass zero as arg3 */ - EXC_XFER_LITE(0x400, handle_page_fault) + EXCEPTION_PROLOG 0x400 InstructionAccess + li r5,0 + stw r5, _ESR(r11) /* Zero ESR */ + stw r12, _DEAR(r11) /* SRR0 as DEAR */ + prepare_transfer_to_handler + bl do_page_fault + b interrupt_return /* 0x0500 - External Interrupt Exception */ - EXCEPTION(0x0500, HardwareInterrupt, do_IRQ, EXC_XFER_LITE) + EXCEPTION(0x0500, HardwareInterrupt, do_IRQ) /* 0x0600 - Alignment Exception */ START_EXCEPTION(0x0600, Alignment) - EXCEPTION_PROLOG - mfspr r4,SPRN_DEAR /* Grab the DEAR and save it */ - stw r4,_DEAR(r11) - addi r3,r1,STACK_FRAME_OVERHEAD - EXC_XFER_STD(0x600, alignment_exception) + EXCEPTION_PROLOG 0x600 Alignment handle_dar_dsisr=1 + prepare_transfer_to_handler + bl alignment_exception + REST_NVGPRS(r1) + b interrupt_return /* 0x0700 - Program Exception */ START_EXCEPTION(0x0700, ProgramCheck) - EXCEPTION_PROLOG - mfspr r4,SPRN_ESR /* Grab the ESR and save it */ - stw r4,_ESR(r11) - addi r3,r1,STACK_FRAME_OVERHEAD - EXC_XFER_STD(0x700, program_check_exception) + EXCEPTION_PROLOG 0x700 ProgramCheck handle_dar_dsisr=1 + prepare_transfer_to_handler + bl program_check_exception + REST_NVGPRS(r1) + b interrupt_return - EXCEPTION(0x0800, Trap_08, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x0900, Trap_09, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x0A00, Trap_0A, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x0B00, Trap_0B, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x0800, Trap_08, unknown_exception) + EXCEPTION(0x0900, Trap_09, unknown_exception) + EXCEPTION(0x0A00, Trap_0A, unknown_exception) + EXCEPTION(0x0B00, Trap_0B, unknown_exception) /* 0x0C00 - System Call Exception */ START_EXCEPTION(0x0C00, SystemCall) SYSCALL_ENTRY 0xc00 +/* Trap_0D is commented out to get more space for system call exception */ - EXCEPTION(0x0D00, Trap_0D, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x0E00, Trap_0E, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x0F00, Trap_0F, unknown_exception, EXC_XFER_STD) +/* EXCEPTION(0x0D00, Trap_0D, unknown_exception) */ + EXCEPTION(0x0E00, Trap_0E, unknown_exception) + EXCEPTION(0x0F00, Trap_0F, unknown_exception) /* 0x1000 - Programmable Interval Timer (PIT) Exception */ - . = 0x1000 + START_EXCEPTION(0x1000, DecrementerTrap) b Decrementer -/* 0x1010 - Fixed Interval Timer (FIT) Exception -*/ - . = 0x1010 +/* 0x1010 - Fixed Interval Timer (FIT) Exception */ + START_EXCEPTION(0x1010, FITExceptionTrap) b FITException -/* 0x1020 - Watchdog Timer (WDT) Exception -*/ - . = 0x1020 +/* 0x1020 - Watchdog Timer (WDT) Exception */ + START_EXCEPTION(0x1020, WDTExceptionTrap) b WDTException /* 0x1100 - Data TLB Miss Exception @@ -368,23 +273,13 @@ _ENTRY(saved_ksp_limit) * load TLB entries from the page table if they exist. */ START_EXCEPTION(0x1100, DTLBMiss) - mtspr SPRN_SPRG_SCRATCH0, r10 /* Save some working registers */ - mtspr SPRN_SPRG_SCRATCH1, r11 -#ifdef CONFIG_403GCX - stw r12, 0(r0) - stw r9, 4(r0) - mfcr r11 - mfspr r12, SPRN_PID - stw r11, 8(r0) - stw r12, 12(r0) -#else + mtspr SPRN_SPRG_SCRATCH5, r10 /* Save some working registers */ + mtspr SPRN_SPRG_SCRATCH6, r11 mtspr SPRN_SPRG_SCRATCH3, r12 mtspr SPRN_SPRG_SCRATCH4, r9 - mfcr r11 - mfspr r12, SPRN_PID - mtspr SPRN_SPRG_SCRATCH6, r11 - mtspr SPRN_SPRG_SCRATCH5, r12 -#endif + mfcr r12 + mfspr r9, SPRN_PID + rlwimi r12, r9, 0, 0xff mfspr r10, SPRN_DEAR /* Get faulting address */ /* If we are faulting a kernel address, we have to use the @@ -404,31 +299,37 @@ _ENTRY(saved_ksp_limit) 3: mfspr r11,SPRN_SPRG_THREAD lwz r11,PGDIR(r11) +#ifdef CONFIG_PPC_KUAP + rlwinm. r9, r9, 0, 0xff + beq 5f /* Kuap fault */ +#endif 4: tophys(r11, r11) rlwimi r11, r10, 12, 20, 29 /* Create L1 (pgdir/pmd) address */ - lwz r12, 0(r11) /* Get L1 entry */ - andi. r9, r12, _PMD_PRESENT /* Check if it points to a PTE page */ + lwz r11, 0(r11) /* Get L1 entry */ + andi. r9, r11, _PMD_PRESENT /* Check if it points to a PTE page */ beq 2f /* Bail if no table */ - rlwimi r12, r10, 22, 20, 29 /* Compute PTE address */ - lwz r11, 0(r12) /* Get Linux PTE */ - andi. r9, r11, _PAGE_PRESENT - beq 5f + rlwimi r11, r10, 22, 20, 29 /* Compute PTE address */ + lwz r11, 0(r11) /* Get Linux PTE */ + li r9, _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_READ + andc. r9, r9, r11 /* Check permission */ + bne 5f - ori r11, r11, _PAGE_ACCESSED - stw r11, 0(r12) + rlwinm r9, r11, 1, _PAGE_WRITE /* dirty => w */ + and r9, r9, r11 /* hwwrite = dirty & w */ + rlwimi r11, r9, 0, _PAGE_WRITE /* replace w by hwwrite */ /* Create TLB tag. This is the faulting address plus a static * set of bits. These are size, valid, E, U0. */ - li r12, 0x00c0 - rlwimi r10, r12, 0, 20, 31 + li r9, 0x00c0 + rlwimi r10, r9, 0, 20, 31 b finish_tlb_load 2: /* Check for possible large-page pmd entry */ - rlwinm. r9, r12, 2, 22, 24 + rlwinm. r9, r11, 2, 22, 24 beq 5f /* Create TLB tag. This is the faulting address, plus a static @@ -436,7 +337,6 @@ _ENTRY(saved_ksp_limit) */ ori r9, r9, 0x40 rlwimi r10, r9, 0, 20, 31 - mr r11, r12 b finish_tlb_load @@ -444,47 +344,26 @@ _ENTRY(saved_ksp_limit) /* The bailout. Restore registers to pre-exception conditions * and call the heavyweights to help us out. */ -#ifdef CONFIG_403GCX - lwz r12, 12(r0) - lwz r11, 8(r0) - mtspr SPRN_PID, r12 - mtcr r11 - lwz r9, 4(r0) - lwz r12, 0(r0) -#else - mfspr r12, SPRN_SPRG_SCRATCH5 - mfspr r11, SPRN_SPRG_SCRATCH6 mtspr SPRN_PID, r12 - mtcr r11 + mtcrf 0x80, r12 mfspr r9, SPRN_SPRG_SCRATCH4 mfspr r12, SPRN_SPRG_SCRATCH3 -#endif - mfspr r11, SPRN_SPRG_SCRATCH1 - mfspr r10, SPRN_SPRG_SCRATCH0 - b DataAccess + mfspr r11, SPRN_SPRG_SCRATCH6 + mfspr r10, SPRN_SPRG_SCRATCH5 + b DataStorage /* 0x1200 - Instruction TLB Miss Exception * Nearly the same as above, except we get our information from different * registers and bailout to a different point. */ START_EXCEPTION(0x1200, ITLBMiss) - mtspr SPRN_SPRG_SCRATCH0, r10 /* Save some working registers */ - mtspr SPRN_SPRG_SCRATCH1, r11 -#ifdef CONFIG_403GCX - stw r12, 0(r0) - stw r9, 4(r0) - mfcr r11 - mfspr r12, SPRN_PID - stw r11, 8(r0) - stw r12, 12(r0) -#else + mtspr SPRN_SPRG_SCRATCH5, r10 /* Save some working registers */ + mtspr SPRN_SPRG_SCRATCH6, r11 mtspr SPRN_SPRG_SCRATCH3, r12 mtspr SPRN_SPRG_SCRATCH4, r9 - mfcr r11 - mfspr r12, SPRN_PID - mtspr SPRN_SPRG_SCRATCH6, r11 - mtspr SPRN_SPRG_SCRATCH5, r12 -#endif + mfcr r12 + mfspr r9, SPRN_PID + rlwimi r12, r9, 0, 0xff mfspr r10, SPRN_SRR0 /* Get faulting address */ /* If we are faulting a kernel address, we have to use the @@ -504,31 +383,37 @@ _ENTRY(saved_ksp_limit) 3: mfspr r11,SPRN_SPRG_THREAD lwz r11,PGDIR(r11) +#ifdef CONFIG_PPC_KUAP + rlwinm. r9, r9, 0, 0xff + beq 5f /* Kuap fault */ +#endif 4: tophys(r11, r11) rlwimi r11, r10, 12, 20, 29 /* Create L1 (pgdir/pmd) address */ - lwz r12, 0(r11) /* Get L1 entry */ - andi. r9, r12, _PMD_PRESENT /* Check if it points to a PTE page */ + lwz r11, 0(r11) /* Get L1 entry */ + andi. r9, r11, _PMD_PRESENT /* Check if it points to a PTE page */ beq 2f /* Bail if no table */ - rlwimi r12, r10, 22, 20, 29 /* Compute PTE address */ - lwz r11, 0(r12) /* Get Linux PTE */ - andi. r9, r11, _PAGE_PRESENT - beq 5f + rlwimi r11, r10, 22, 20, 29 /* Compute PTE address */ + lwz r11, 0(r11) /* Get Linux PTE */ + li r9, _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_EXEC + andc. r9, r9, r11 /* Check permission */ + bne 5f - ori r11, r11, _PAGE_ACCESSED - stw r11, 0(r12) + rlwinm r9, r11, 1, _PAGE_WRITE /* dirty => w */ + and r9, r9, r11 /* hwwrite = dirty & w */ + rlwimi r11, r9, 0, _PAGE_WRITE /* replace w by hwwrite */ /* Create TLB tag. This is the faulting address plus a static * set of bits. These are size, valid, E, U0. */ - li r12, 0x00c0 - rlwimi r10, r12, 0, 20, 31 + li r9, 0x00c0 + rlwimi r10, r9, 0, 20, 31 b finish_tlb_load 2: /* Check for possible large-page pmd entry */ - rlwinm. r9, r12, 2, 22, 24 + rlwinm. r9, r11, 2, 22, 24 beq 5f /* Create TLB tag. This is the faulting address, plus a static @@ -536,7 +421,6 @@ _ENTRY(saved_ksp_limit) */ ori r9, r9, 0x40 rlwimi r10, r9, 0, 20, 31 - mr r11, r12 b finish_tlb_load @@ -544,44 +428,27 @@ _ENTRY(saved_ksp_limit) /* The bailout. Restore registers to pre-exception conditions * and call the heavyweights to help us out. */ -#ifdef CONFIG_403GCX - lwz r12, 12(r0) - lwz r11, 8(r0) - mtspr SPRN_PID, r12 - mtcr r11 - lwz r9, 4(r0) - lwz r12, 0(r0) -#else - mfspr r12, SPRN_SPRG_SCRATCH5 - mfspr r11, SPRN_SPRG_SCRATCH6 mtspr SPRN_PID, r12 - mtcr r11 + mtcrf 0x80, r12 mfspr r9, SPRN_SPRG_SCRATCH4 mfspr r12, SPRN_SPRG_SCRATCH3 -#endif - mfspr r11, SPRN_SPRG_SCRATCH1 - mfspr r10, SPRN_SPRG_SCRATCH0 + mfspr r11, SPRN_SPRG_SCRATCH6 + mfspr r10, SPRN_SPRG_SCRATCH5 b InstructionAccess - EXCEPTION(0x1300, Trap_13, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1400, Trap_14, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1500, Trap_15, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1600, Trap_16, unknown_exception, EXC_XFER_STD) -#ifdef CONFIG_IBM405_ERR51 - /* 405GP errata 51 */ - START_EXCEPTION(0x1700, Trap_17) - b DTLBMiss -#else - EXCEPTION(0x1700, Trap_17, unknown_exception, EXC_XFER_STD) -#endif - EXCEPTION(0x1800, Trap_18, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1900, Trap_19, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1A00, Trap_1A, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1B00, Trap_1B, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1C00, Trap_1C, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1D00, Trap_1D, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1E00, Trap_1E, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1F00, Trap_1F, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1300, Trap_13, unknown_exception) + EXCEPTION(0x1400, Trap_14, unknown_exception) + EXCEPTION(0x1500, Trap_15, unknown_exception) + EXCEPTION(0x1600, Trap_16, unknown_exception) + EXCEPTION(0x1700, Trap_17, unknown_exception) + EXCEPTION(0x1800, Trap_18, unknown_exception) + EXCEPTION(0x1900, Trap_19, unknown_exception) + EXCEPTION(0x1A00, Trap_1A, unknown_exception) + EXCEPTION(0x1B00, Trap_1B, unknown_exception) + EXCEPTION(0x1C00, Trap_1C, unknown_exception) + EXCEPTION(0x1D00, Trap_1D, unknown_exception) + EXCEPTION(0x1E00, Trap_1E, unknown_exception) + EXCEPTION(0x1F00, Trap_1F, unknown_exception) /* Check for a single step debug exception while in an exception * handler before state has been saved. This is to catch the case @@ -598,7 +465,7 @@ _ENTRY(saved_ksp_limit) */ /* 0x2000 - Debug Exception */ START_EXCEPTION(0x2000, DebugTrap) - CRITICAL_EXCEPTION_PROLOG + CRITICAL_EXCEPTION_PROLOG 0x2000 DebugTrap /* * If this is a single step or branch-taken exception in an @@ -634,49 +501,41 @@ _ENTRY(saved_ksp_limit) lwz r12,GPR12(r11) lwz r10,crit_r10@l(0) lwz r11,crit_r11@l(0) - PPC405_ERR77_SYNC rfci b . /* continue normal handling for a critical exception... */ 2: mfspr r4,SPRN_DBSR - addi r3,r1,STACK_FRAME_OVERHEAD - EXC_XFER_TEMPLATE(DebugException, 0x2002, \ - (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), \ - crit_transfer_to_handler, ret_from_crit_exc) + stw r4,_ESR(r11) /* DebugException takes DBSR in _ESR */ + prepare_transfer_to_handler + bl DebugException + b ret_from_crit_exc /* Programmable Interval Timer (PIT) Exception. (from 0x1000) */ + __HEAD Decrementer: - EXCEPTION_PROLOG + EXCEPTION_PROLOG 0x1000 Decrementer lis r0,TSR_PIS@h mtspr SPRN_TSR,r0 /* Clear the PIT exception */ - addi r3,r1,STACK_FRAME_OVERHEAD - EXC_XFER_LITE(0x1000, timer_interrupt) + prepare_transfer_to_handler + bl timer_interrupt + b interrupt_return /* Fixed Interval Timer (FIT) Exception. (from 0x1010) */ + __HEAD FITException: - EXCEPTION_PROLOG - addi r3,r1,STACK_FRAME_OVERHEAD; - EXC_XFER_STD(0x1010, unknown_exception) + EXCEPTION_PROLOG 0x1010 FITException + prepare_transfer_to_handler + bl unknown_exception + b interrupt_return /* Watchdog Timer (WDT) Exception. (from 0x1020) */ + __HEAD WDTException: - CRITICAL_EXCEPTION_PROLOG; - addi r3,r1,STACK_FRAME_OVERHEAD; - EXC_XFER_TEMPLATE(WatchdogException, 0x1020+2, - (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), - crit_transfer_to_handler, ret_from_crit_exc) - -/* - * The other Data TLB exceptions bail out to this point - * if they can't resolve the lightweight TLB fault. - */ -DataAccess: - EXCEPTION_PROLOG - mfspr r5,SPRN_ESR /* Grab the ESR, save it, pass arg3 */ - stw r5,_ESR(r11) - mfspr r4,SPRN_DEAR /* Grab the DEAR, save it, pass arg2 */ - EXC_XFER_LITE(0x300, handle_page_fault) + CRITICAL_EXCEPTION_PROLOG 0x1020 WDTException + prepare_transfer_to_handler + bl WatchdogException + b ret_from_crit_exc /* Other PowerPC processors, namely those derived from the 6xx-series * have vectors from 0x2100 through 0x2F00 defined, but marked as reserved. @@ -684,12 +543,13 @@ DataAccess: * reserved. */ + __HEAD /* Damn, I came up one instruction too many to fit into the * exception space :-). Both the instruction and data TLB * miss get to this point to load the TLB. * r10 - TLB_TAG value * r11 - Linux PTE - * r12, r9 - available to use + * r9 - available to use * PID - loaded with proper value when we get here * Upon exit, we reload everything and RFI. * Actually, it will fit now, but oh well.....a common place @@ -698,45 +558,32 @@ DataAccess: tlb_4xx_index: .long 0 finish_tlb_load: - /* load the next available TLB index. - */ - lwz r9, tlb_4xx_index@l(0) - addi r9, r9, 1 - andi. r9, r9, (PPC40X_TLB_SIZE-1) - stw r9, tlb_4xx_index@l(0) - -6: /* * Clear out the software-only bits in the PTE to generate the * TLB_DATA value. These are the bottom 2 bits of the RPM, the - * top 3 bits of the zone field, and M. + * 4 bits of the zone field, and M. */ - li r12, 0x0ce2 - andc r11, r11, r12 + li r9, 0x0cf2 + andc r11, r11, r9 + rlwimi r11, r10, 8, 24, 27 /* Copy 4 upper address bit into zone */ + + /* load the next available TLB index. */ + lwz r9, tlb_4xx_index@l(0) + addi r9, r9, 1 + andi. r9, r9, PPC40X_TLB_SIZE - 1 + stw r9, tlb_4xx_index@l(0) tlbwe r11, r9, TLB_DATA /* Load TLB LO */ tlbwe r10, r9, TLB_TAG /* Load TLB HI */ /* Done...restore registers and get out of here. */ -#ifdef CONFIG_403GCX - lwz r12, 12(r0) - lwz r11, 8(r0) - mtspr SPRN_PID, r12 - mtcr r11 - lwz r9, 4(r0) - lwz r12, 0(r0) -#else - mfspr r12, SPRN_SPRG_SCRATCH5 - mfspr r11, SPRN_SPRG_SCRATCH6 mtspr SPRN_PID, r12 - mtcr r11 + mtcrf 0x80, r12 mfspr r9, SPRN_SPRG_SCRATCH4 mfspr r12, SPRN_SPRG_SCRATCH3 -#endif - mfspr r11, SPRN_SPRG_SCRATCH1 - mfspr r10, SPRN_SPRG_SCRATCH0 - PPC405_ERR77_SYNC + mfspr r11, SPRN_SPRG_SCRATCH6 + mfspr r10, SPRN_SPRG_SCRATCH5 rfi /* Should sync shadow TLBs */ b . /* prevent prefetch past rfi */ @@ -757,7 +604,7 @@ start_here: lis r1,init_thread_union@ha addi r1,r1,init_thread_union@l li r0,0 - stwu r0,THREAD_SIZE-STACK_FRAME_OVERHEAD(r1) + stwu r0,THREAD_SIZE-STACK_FRAME_MIN_SIZE(r1) bl early_init /* We have to do this with MMU on */ @@ -799,7 +646,7 @@ start_here: ori r6, r6, swapper_pg_dir@l lis r5, abatron_pteptrs@h ori r5, r5, abatron_pteptrs@l - stw r5, 0xf0(r0) /* Must match your Abatron config file */ + stw r5, 0xf0(0) /* Must match your Abatron config file */ tophys(r5,r5) stw r6, 0(r5) @@ -814,10 +661,10 @@ start_here: b . /* prevent prefetch past rfi */ /* Set up the initial MMU state so we can do the first level of - * kernel initialization. This maps the first 16 MBytes of memory 1:1 + * kernel initialization. This maps the first 32 MBytes of memory 1:1 * virtual to physical and more importantly sets the cache mode. */ -initial_mmu: +SYM_FUNC_START_LOCAL(initial_mmu) tlbia /* Invalidate all TLB entries */ isync @@ -851,6 +698,12 @@ initial_mmu: tlbwe r4,r0,TLB_DATA /* Load the data portion of the entry */ tlbwe r3,r0,TLB_TAG /* Load the tag portion of the entry */ + li r0,62 /* TLB slot 62 */ + addis r4,r4,SZ_16M@h + addis r3,r3,SZ_16M@h + tlbwe r4,r0,TLB_DATA /* Load the data portion of the entry */ + tlbwe r3,r0,TLB_TAG /* Load the tag portion of the entry */ + isync /* Establish the exception vector base @@ -860,44 +713,9 @@ initial_mmu: mtspr SPRN_EVPR,r0 blr +SYM_FUNC_END(initial_mmu) _GLOBAL(abort) mfspr r13,SPRN_DBCR0 oris r13,r13,DBCR0_RST_SYSTEM@h mtspr SPRN_DBCR0,r13 - -_GLOBAL(set_context) - -#ifdef CONFIG_BDI_SWITCH - /* Context switch the PTE pointer for the Abatron BDI2000. - * The PGDIR is the second parameter. - */ - lis r5, abatron_pteptrs@ha - stw r4, abatron_pteptrs@l + 0x4(r5) -#endif - sync - mtspr SPRN_PID,r3 - isync /* Need an isync to flush shadow */ - /* TLBs after changing PID */ - blr - -/* We put a few things here that have to be page-aligned. This stuff - * goes at the beginning of the data segment, which is page-aligned. - */ - .data - .align 12 - .globl sdata -sdata: - .globl empty_zero_page -empty_zero_page: - .space 4096 -EXPORT_SYMBOL(empty_zero_page) - .globl swapper_pg_dir -swapper_pg_dir: - .space PGD_TABLE_SIZE - -/* Room for two PTE pointers, usually the kernel and current user pointers - * to their respective root page table. - */ -abatron_pteptrs: - .space 8 diff --git a/arch/powerpc/kernel/head_44x.S b/arch/powerpc/kernel/head_44x.S index 51dd01a27314..25642e802ed3 100644 --- a/arch/powerpc/kernel/head_44x.S +++ b/arch/powerpc/kernel/head_44x.S @@ -25,17 +25,16 @@ */ #include <linux/init.h> +#include <linux/pgtable.h> #include <asm/processor.h> #include <asm/page.h> #include <asm/mmu.h> -#include <asm/pgtable.h> #include <asm/cputable.h> #include <asm/thread_info.h> #include <asm/ppc_asm.h> #include <asm/asm-offsets.h> #include <asm/ptrace.h> #include <asm/synch.h> -#include <asm/export.h> #include <asm/code-patching-asm.h> #include "head_booke.h" @@ -52,8 +51,8 @@ * */ __HEAD -_ENTRY(_stext); -_ENTRY(_start); +_GLOBAL(_stext); +_GLOBAL(_start); /* * Reserve a word at a fixed location to store the address * of abatron_pteptrs @@ -70,7 +69,7 @@ _ENTRY(_start); * address. * r21 will be loaded with the physical runtime address of _stext */ - bl 0f /* Get our runtime address */ + bcl 20,31,$+4 /* Get our runtime address */ 0: mflr r21 /* Make it accessible */ addis r21,r21,(_stext - 0b)@ha addi r21,r21,(_stext - 0b)@l /* Get our current runtime base */ @@ -109,7 +108,7 @@ _ENTRY(_start); lis r1,init_thread_union@h ori r1,r1,init_thread_union@l li r0,0 - stwu r0,THREAD_SIZE-STACK_FRAME_OVERHEAD(r1) + stwu r0,THREAD_SIZE-STACK_FRAME_MIN_SIZE(r1) bl early_init @@ -263,8 +262,7 @@ interrupt_base: INSTRUCTION_STORAGE_EXCEPTION /* External Input Interrupt */ - EXCEPTION(0x0500, BOOKE_INTERRUPT_EXTERNAL, ExternalInput, \ - do_IRQ, EXC_XFER_LITE) + EXCEPTION(0x0500, BOOKE_INTERRUPT_EXTERNAL, ExternalInput, do_IRQ) /* Alignment Interrupt */ ALIGNMENT_EXCEPTION @@ -277,7 +275,7 @@ interrupt_base: FP_UNAVAILABLE_EXCEPTION #else EXCEPTION(0x2010, BOOKE_INTERRUPT_FP_UNAVAIL, \ - FloatingPointUnavailable, unknown_exception, EXC_XFER_STD) + FloatingPointUnavailable, unknown_exception) #endif /* System Call Interrupt */ START_EXCEPTION(SystemCall) @@ -285,15 +283,14 @@ interrupt_base: /* Auxiliary Processor Unavailable Interrupt */ EXCEPTION(0x2020, BOOKE_INTERRUPT_AP_UNAVAIL, \ - AuxillaryProcessorUnavailable, unknown_exception, EXC_XFER_STD) + AuxillaryProcessorUnavailable, unknown_exception) /* Decrementer Interrupt */ DECREMENTER_EXCEPTION /* Fixed Internal Timer Interrupt */ /* TODO: Add FIT support */ - EXCEPTION(0x1010, BOOKE_INTERRUPT_FIT, FixedIntervalTimer, \ - unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1010, BOOKE_INTERRUPT_FIT, FixedIntervalTimer, unknown_exception) /* Watchdog Timer Interrupt */ /* TODO: Add watchdog support */ @@ -317,8 +314,8 @@ interrupt_base: * kernel page tables. */ lis r11, PAGE_OFFSET@h - cmplw r10, r11 - blt+ 3f + cmplw cr7, r10, r11 + blt+ cr7, 3f lis r11, swapper_pg_dir@h ori r11, r11, swapper_pg_dir@l @@ -336,12 +333,16 @@ interrupt_base: mfspr r12,SPRN_MMUCR mfspr r13,SPRN_PID /* Get PID */ rlwimi r12,r13,0,24,31 /* Set TID */ +#ifdef CONFIG_PPC_KUAP + cmpwi r13,0 + beq 2f /* KUAP Fault */ +#endif 4: mtspr SPRN_MMUCR,r12 /* Mask of required permission bits. Note that while we - * do copy ESR:ST to _PAGE_RW position as trying to write + * do copy ESR:ST to _PAGE_WRITE position as trying to write * to an RO page is pretty common, we don't do it with * _PAGE_DIRTY. We could do it, but it's a fairly rare * event so I'd rather take the overhead when it happens @@ -354,7 +355,7 @@ interrupt_base: * place or can we save a couple of instructions here ? */ mfspr r12,SPRN_ESR - li r13,_PAGE_PRESENT|_PAGE_ACCESSED + li r13,_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_READ rlwimi r13,r12,10,30,30 /* Load the PTE */ @@ -376,7 +377,7 @@ interrupt_base: /* Load the next available TLB index */ lwz r13,tlb_44x_index@l(r10) - bne 2f /* Bail if permission mismach */ + bne 2f /* Bail if permission mismatch */ /* Increment, rollover, and store TLB index */ addi r13,r13,1 @@ -427,8 +428,8 @@ interrupt_base: * kernel page tables. */ lis r11, PAGE_OFFSET@h - cmplw r10, r11 - blt+ 3f + cmplw cr7, r10, r11 + blt+ cr7, 3f lis r11, swapper_pg_dir@h ori r11, r11, swapper_pg_dir@l @@ -446,6 +447,10 @@ interrupt_base: mfspr r12,SPRN_MMUCR mfspr r13,SPRN_PID /* Get PID */ rlwimi r12,r13,0,24,31 /* Set TID */ +#ifdef CONFIG_PPC_KUAP + cmpwi r13,0 + beq 2f /* KUAP Fault */ +#endif 4: mtspr SPRN_MMUCR,r12 @@ -471,7 +476,7 @@ interrupt_base: /* Load the next available TLB index */ lwz r13,tlb_44x_index@l(r10) - bne 2f /* Bail if permission mismach */ + bne 2f /* Bail if permission mismatch */ /* Increment, rollover, and store TLB index */ addi r13,r13,1 @@ -510,6 +515,7 @@ interrupt_base: * r11 - PTE high word value * r12 - PTE low word value * r13 - TLB index + * cr7 - Result of comparison with PAGE_OFFSET * MMUCR - loaded with proper value when we get here * Upon exit, we reload everything and RFI. */ @@ -528,12 +534,12 @@ finish_tlb_load_44x: tlbwe r10,r13,PPC44x_TLB_PAGEID /* Write PAGEID */ /* And WS 2 */ - li r10,0xf85 /* Mask to apply from PTE */ - rlwimi r10,r12,29,30,30 /* DIRTY -> SW position */ + li r10,0xf84 /* Mask to apply from PTE */ + rlwimi r10,r12,29,30,31 /* DIRTY,READ -> SW,SR position */ and r11,r12,r10 /* Mask PTE bits to keep */ - andi. r10,r12,_PAGE_USER /* User page ? */ - beq 1f /* nope, leave U bits empty */ + bge cr7,1f /* User page ? no, leave U bits empty */ rlwimi r11,r11,3,26,28 /* yes, copy S bits to U */ + rlwinm r11,r11,0,~PPC44x_TLB_SX /* Clear SX if User page */ 1: tlbwe r11,r13,PPC44x_TLB_ATTRIB /* Write ATTRIB */ /* Done...restore registers and get out of here. @@ -562,8 +568,8 @@ finish_tlb_load_44x: * kernel page tables. */ lis r11,PAGE_OFFSET@h - cmplw cr0,r10,r11 - blt+ 3f + cmplw cr7,r10,r11 + blt+ cr7,3f lis r11,swapper_pg_dir@h ori r11,r11, swapper_pg_dir@l li r12,0 /* MMUCR = 0 */ @@ -573,10 +579,14 @@ finish_tlb_load_44x: 3: mfspr r11,SPRN_SPRG3 lwz r11,PGDIR(r11) mfspr r12,SPRN_PID /* Get PID */ +#ifdef CONFIG_PPC_KUAP + cmpwi r12,0 + beq 2f /* KUAP Fault */ +#endif 4: mtspr SPRN_MMUCR,r12 /* Set MMUCR */ /* Mask of required permission bits. Note that while we - * do copy ESR:ST to _PAGE_RW position as trying to write + * do copy ESR:ST to _PAGE_WRITE position as trying to write * to an RO page is pretty common, we don't do it with * _PAGE_DIRTY. We could do it, but it's a fairly rare * event so I'd rather take the overhead when it happens @@ -589,7 +599,7 @@ finish_tlb_load_44x: * place or can we save a couple of instructions here ? */ mfspr r12,SPRN_ESR - li r13,_PAGE_PRESENT|_PAGE_ACCESSED + li r13,_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_READ rlwimi r13,r12,10,30,30 /* Load the PTE */ @@ -659,8 +669,8 @@ finish_tlb_load_44x: * kernel page tables. */ lis r11,PAGE_OFFSET@h - cmplw cr0,r10,r11 - blt+ 3f + cmplw cr7,r10,r11 + blt+ cr7,3f lis r11,swapper_pg_dir@h ori r11,r11, swapper_pg_dir@l li r12,0 /* MMUCR = 0 */ @@ -670,6 +680,10 @@ finish_tlb_load_44x: 3: mfspr r11,SPRN_SPRG_THREAD lwz r11,PGDIR(r11) mfspr r12,SPRN_PID /* Get PID */ +#ifdef CONFIG_PPC_KUAP + cmpwi r12,0 + beq 2f /* KUAP Fault */ +#endif 4: mtspr SPRN_MMUCR,r12 /* Set MMUCR */ /* Make up the required permissions */ @@ -730,6 +744,7 @@ finish_tlb_load_44x: * r11 - PTE high word value * r12 - PTE low word value * r13 - free to use + * cr7 - Result of comparison with PAGE_OFFSET * MMUCR - loaded with proper value when we get here * Upon exit, we reload everything and RFI. */ @@ -739,12 +754,12 @@ finish_tlb_load_47x: tlbwe r11,r13,1 /* And make up word 2 */ - li r10,0xf85 /* Mask to apply from PTE */ - rlwimi r10,r12,29,30,30 /* DIRTY -> SW position */ + li r10,0xf84 /* Mask to apply from PTE */ + rlwimi r10,r12,29,30,31 /* DIRTY,READ -> SW,SR position */ and r11,r12,r10 /* Mask PTE bits to keep */ - andi. r10,r12,_PAGE_USER /* User page ? */ - beq 1f /* nope, leave U bits empty */ + bge cr7,1f /* User page ? no, leave U bits empty */ rlwimi r11,r11,3,26,28 /* yes, copy S bits to U */ + rlwinm r11,r11,0,~PPC47x_TLB2_SX /* Clear SX if User page */ 1: tlbwe r11,r13,2 /* Done...restore registers and get out of here. @@ -782,20 +797,6 @@ _GLOBAL(__fixup_440A_mcheck) sync blr -_GLOBAL(set_context) - -#ifdef CONFIG_BDI_SWITCH - /* Context switch the PTE pointer for the Abatron BDI2000. - * The PGDIR is the second parameter. - */ - lis r5, abatron_pteptrs@h - ori r5, r5, abatron_pteptrs@l - stw r4, 0x4(r5) -#endif - mtspr SPRN_PID,r3 - isync /* Force context change */ - blr - /* * Init CPU state. This is called at boot time or for secondary CPUs * to setup initial TLB entries, setup IVORs, etc... @@ -861,7 +862,7 @@ _GLOBAL(init_cpu_state) wmmucr: mtspr SPRN_MMUCR,r3 /* Put MMUCR */ sync - bl invstr /* Find our address */ + bcl 20,31,$+4 /* Find our address */ invstr: mflr r5 /* Make it accessible */ tlbsx r23,0,r5 /* Find entry we are in */ li r4,0 /* Start at TLB entry 0 */ @@ -1010,7 +1011,7 @@ _GLOBAL(start_secondary_47x) */ lis r1,temp_boot_stack@h ori r1,r1,temp_boot_stack@l - addi r1,r1,1024-STACK_FRAME_OVERHEAD + addi r1,r1,1024-STACK_FRAME_MIN_SIZE li r0,0 stw r0,0(r1) bl mmu_init_secondary @@ -1023,7 +1024,7 @@ _GLOBAL(start_secondary_47x) lwz r1,TASK_STACK(r2) /* Current stack pointer */ - addi r1,r1,THREAD_SIZE-STACK_FRAME_OVERHEAD + addi r1,r1,THREAD_SIZE-STACK_FRAME_MIN_SIZE li r0,0 stw r0,0(r1) @@ -1053,7 +1054,7 @@ head_start_47x: sync /* Find the entry we are running from */ - bl 1f + bcl 20,31,$+4 1: mflr r23 tlbsx r23,0,r23 tlbre r24,r23,0 @@ -1241,34 +1242,8 @@ head_start_common: isync blr -/* - * We put a few things here that have to be page-aligned. This stuff - * goes at the beginning of the data segment, which is page-aligned. - */ - .data - .align PAGE_SHIFT - .globl sdata -sdata: - .globl empty_zero_page -empty_zero_page: - .space PAGE_SIZE -EXPORT_SYMBOL(empty_zero_page) - -/* - * To support >32-bit physical addresses, we use an 8KB pgdir. - */ - .globl swapper_pg_dir -swapper_pg_dir: - .space PGD_TABLE_SIZE - -/* - * Room for two PTE pointers, usually the kernel and current user pointers - * to their respective root page table. - */ -abatron_pteptrs: - .space 8 - #ifdef CONFIG_SMP + .data .align 12 temp_boot_stack: .space 1024 diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index ad79fddb974d..4690c219bfa4 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -18,6 +18,7 @@ * variants. */ +#include <linux/linkage.h> #include <linux/threads.h> #include <linux/init.h> #include <asm/reg.h> @@ -39,8 +40,12 @@ #include <asm/hw_irq.h> #include <asm/cputhreads.h> #include <asm/ppc-opcode.h> -#include <asm/export.h> #include <asm/feature-fixups.h> +#ifdef CONFIG_PPC_BOOK3S +#include <asm/exception-64s.h> +#else +#include <asm/exception-64e.h> +#endif /* The physical memory is laid out such that the secondary processor * spin code sits at 0x0000...0x00ff. On server, the vectors follow @@ -70,6 +75,13 @@ * 2. The kernel is entered at __start */ +/* + * boot_from_prom and prom_init run at the physical address. Everything + * after prom and kexec entry run at the virtual address (PAGE_OFFSET). + * Secondaries run at the virtual address from generic_secondary_common_init + * onward. + */ + OPEN_FIXED_SECTION(first_256B, 0x0, 0x100) USE_FIXED_SECTION(first_256B) /* @@ -106,7 +118,7 @@ __secondary_hold_acknowledge: #ifdef CONFIG_RELOCATABLE /* This flag is set to 1 by a loader if the kernel should run * at the loaded address instead of the linked address. This - * is used by kexec-tools to keep the the kdump kernel in the + * is used by kexec-tools to keep the kdump kernel in the * crash_kernel region. The loader is responsible for * observing the alignment requirement. */ @@ -121,7 +133,7 @@ __secondary_hold_acknowledge: . = 0x5c .globl __run_at_load __run_at_load: -DEFINE_FIXED_SYMBOL(__run_at_load) +DEFINE_FIXED_SYMBOL(__run_at_load, first_256B) .long RUN_AT_LOAD_DEFAULT #endif @@ -138,7 +150,7 @@ DEFINE_FIXED_SYMBOL(__run_at_load) .globl __secondary_hold __secondary_hold: FIXUP_ENDIAN -#ifndef CONFIG_PPC_BOOK3E +#ifndef CONFIG_PPC_BOOK3E_64 mfmsr r24 ori r24,r24,MSR_RI mtmsrd r24 /* RI on */ @@ -151,20 +163,16 @@ __secondary_hold: /* Tell the master cpu we're here */ /* Relocation is off & we are located at an address less */ /* than 0x100, so only need to grab low order offset. */ - std r24,(ABS_ADDR(__secondary_hold_acknowledge))(0) + std r24,(ABS_ADDR(__secondary_hold_acknowledge, first_256B))(0) sync - li r26,0 -#ifdef CONFIG_PPC_BOOK3E - tovirt(r26,r26) -#endif /* All secondary cpus wait here until told to start. */ -100: ld r12,(ABS_ADDR(__secondary_hold_spinloop))(r26) +100: ld r12,(ABS_ADDR(__secondary_hold_spinloop, first_256B))(0) cmpdi 0,r12,0 beq 100b #if defined(CONFIG_SMP) || defined(CONFIG_KEXEC_CORE) -#ifdef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_BOOK3E_64 tovirt(r12,r12) #endif mtctr r12 @@ -173,7 +181,7 @@ __secondary_hold: * it may be the case that other platforms have r4 right to * begin with, this gives us some safety in case it is not */ -#ifdef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_BOOK3E_64 mr r4,r25 #else li r4,0 @@ -187,12 +195,6 @@ __secondary_hold: #endif CLOSE_FIXED_SECTION(first_256B) -/* This value is used to mark exception frames on the stack. */ - .section ".toc","aw" -exception_marker: - .tc ID_72656773_68657265[TC],0x7265677368657265 - .previous - /* * On server, we include the exception vectors code here as it * relies on absolute addressing which is only possible within @@ -206,7 +208,9 @@ OPEN_TEXT_SECTION(0x100) USE_TEXT_SECTION() -#ifdef CONFIG_PPC_BOOK3E +#include "interrupt_64.S" + +#ifdef CONFIG_PPC_BOOK3E_64 /* * The booting_thread_hwid holds the thread id we want to boot in cpu * hotplug case. It is set by cpu hotplug code, and is invalid by default. @@ -300,25 +304,20 @@ _GLOBAL(fsl_secondary_thread_init) rlwimi r3, r3, 30, 2, 30 mtspr SPRN_PIR, r3 1: -#endif - -_GLOBAL(generic_secondary_thread_init) mr r24,r3 /* turn on 64-bit mode */ bl enable_64b_mode - /* get a valid TOC pointer, wherever we're mapped at */ - bl relative_toc - tovirt(r2,r2) - -#ifdef CONFIG_PPC_BOOK3E /* Book3E initialization */ mr r3,r24 bl book3e_secondary_thread_init -#endif + bl relative_toc + b generic_secondary_common_init +#endif /* CONFIG_PPC_BOOK3E_64 */ + /* * On pSeries and most other platforms, secondary processors spin * in the following code. @@ -330,22 +329,24 @@ _GLOBAL(generic_secondary_thread_init) */ _GLOBAL(generic_secondary_smp_init) FIXUP_ENDIAN + + li r13,0 + + /* Poison TOC */ + li r2,-1 + mr r24,r3 mr r25,r4 /* turn on 64-bit mode */ bl enable_64b_mode - /* get a valid TOC pointer, wherever we're mapped at */ - bl relative_toc - tovirt(r2,r2) - -#ifdef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_BOOK3E_64 /* Book3E initialization */ mr r3,r24 mr r4,r25 bl book3e_secondary_core_init - + /* Now NIA and r2 are relocated to PAGE_OFFSET if not already */ /* * After common core init has finished, check if the current thread is the * one we wanted to boot. If not, start the specified thread and stop the @@ -373,8 +374,7 @@ _GLOBAL(generic_secondary_smp_init) beq 20f /* start the specified thread */ - LOAD_REG_ADDR(r5, fsl_secondary_thread_init) - ld r4, 0(r5) + LOAD_REG_ADDR(r5, DOTSYM(fsl_secondary_thread_init)) bl book3e_start_thread /* stop the current thread */ @@ -383,6 +383,16 @@ _GLOBAL(generic_secondary_smp_init) 10: b 10b 20: +#else + /* Now the MMU is off, can branch to our PAGE_OFFSET address */ + bcl 20,31,$+4 +1: mflr r11 + addi r11,r11,(2f - 1b) + tovirt(r11, r11) + mtctr r11 + bctr +2: + bl relative_toc #endif generic_secondary_common_init: @@ -395,8 +405,12 @@ generic_secondary_common_init: #else LOAD_REG_ADDR(r8, paca_ptrs) /* Load paca_ptrs pointe */ ld r8,0(r8) /* Get base vaddr of array */ +#if (NR_CPUS == 1) || defined(CONFIG_FORCE_NR_CPUS) + LOAD_REG_IMMEDIATE(r7, NR_CPUS) +#else LOAD_REG_ADDR(r7, nr_cpu_ids) /* Load nr_cpu_ids address */ lwz r7,0(r7) /* also the max paca allocated */ +#endif li r5,0 /* logical cpu id */ 1: sldi r9,r5,3 /* get paca_ptrs[] index from cpu id */ @@ -412,7 +426,7 @@ generic_secondary_common_init: b kexec_wait /* next kernel might do better */ 2: SET_PACA(r13) -#ifdef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_BOOK3E_64 addi r12,r13,PACA_EXTLB /* and TLB exc frame in another */ mtspr SPRN_SPRG_TLB_EXFRAME,r12 #endif @@ -420,13 +434,17 @@ generic_secondary_common_init: /* From now on, r24 is expected to be logical cpuid */ mr r24,r5 + /* Create a temp kernel stack for use before relocation is on. */ + ld r1,PACAEMERGSP(r13) + subi r1,r1,STACK_FRAME_MIN_SIZE + /* See if we need to call a cpu state restore handler */ LOAD_REG_ADDR(r23, cur_cpu_spec) ld r23,0(r23) ld r12,CPU_SPEC_RESTORE(r23) cmpdi 0,r12,0 beq 3f -#ifdef PPC64_ELF_ABI_v1 +#ifdef CONFIG_PPC64_ELF_ABI_V1 ld r12,0(r12) #endif mtctr r12 @@ -448,10 +466,6 @@ generic_secondary_common_init: sync /* order paca.run and cur_cpu_spec */ isync /* In case code patching happened */ - /* Create a temp kernel stack for use before relocation is on. */ - ld r1,PACAEMERGSP(r13) - subi r1,r1,STACK_FRAME_OVERHEAD - b __secondary_start #endif /* SMP */ @@ -460,7 +474,7 @@ generic_secondary_common_init: * Assumes we're mapped EA == RA if the MMU is on. */ #ifdef CONFIG_PPC_BOOK3S -__mmu_off: +SYM_FUNC_START_LOCAL(__mmu_off) mfmsr r3 andi. r0,r3,MSR_IR|MSR_DR beqlr @@ -471,8 +485,34 @@ __mmu_off: sync rfid b . /* prevent speculative execution */ -#endif +SYM_FUNC_END(__mmu_off) +SYM_FUNC_START_LOCAL(start_initialization_book3s) + mflr r25 + + /* Setup some critical 970 SPRs before switching MMU off */ + mfspr r0,SPRN_PVR + srwi r0,r0,16 + cmpwi r0,0x39 /* 970 */ + beq 1f + cmpwi r0,0x3c /* 970FX */ + beq 1f + cmpwi r0,0x44 /* 970MP */ + beq 1f + cmpwi r0,0x45 /* 970GX */ + bne 2f +1: bl __cpu_preinit_ppc970 +2: + + /* Switch off MMU if not already off */ + bl __mmu_off + + /* Now the MMU is off, can return to our PAGE_OFFSET address */ + tovirt(r25,r25) + mtlr r25 + blr +SYM_FUNC_END(start_initialization_book3s) +#endif /* * Here is our main kernel entry point. We support currently 2 kind of entries @@ -489,14 +529,11 @@ __start_initialization_multiplatform: /* Make sure we are running in 64 bits mode */ bl enable_64b_mode - /* Get TOC pointer (current runtime address) */ - bl relative_toc + /* Zero r13 (paca) so early program check / mce don't use it */ + li r13,0 - /* find out where we are now */ - bcl 20,31,$+4 -0: mflr r26 /* r26 = runtime addr here */ - addis r26,r26,(_stext - 0b)@ha - addi r26,r26,(_stext - 0b)@l /* current runtime base addr */ + /* Poison TOC */ + li r2,-1 /* * Are we booted from a PROM Of-type client-interface ? @@ -514,31 +551,41 @@ __start_initialization_multiplatform: mr r29,r9 #endif -#ifdef CONFIG_PPC_BOOK3E + /* Get TOC pointer (current runtime address) */ + bl relative_toc + + /* These functions return to the virtual (PAGE_OFFSET) address */ +#ifdef CONFIG_PPC_BOOK3E_64 bl start_initialization_book3e - b __after_prom_start #else - /* Setup some critical 970 SPRs before switching MMU off */ - mfspr r0,SPRN_PVR - srwi r0,r0,16 - cmpwi r0,0x39 /* 970 */ - beq 1f - cmpwi r0,0x3c /* 970FX */ - beq 1f - cmpwi r0,0x44 /* 970MP */ - beq 1f - cmpwi r0,0x45 /* 970GX */ - bne 2f -1: bl __cpu_preinit_ppc970 -2: + bl start_initialization_book3s +#endif /* CONFIG_PPC_BOOK3E_64 */ + + /* Get TOC pointer, virtual */ + bl relative_toc + + /* find out where we are now */ + + /* OPAL doesn't pass base address in r4, have to derive it. */ + bcl 20,31,$+4 +0: mflr r26 /* r26 = runtime addr here */ + addis r26,r26,(_stext - 0b)@ha + addi r26,r26,(_stext - 0b)@l /* current runtime base addr */ - /* Switch off MMU if not already off */ - bl __mmu_off b __after_prom_start -#endif /* CONFIG_PPC_BOOK3E */ +__REF __boot_from_prom: #ifdef CONFIG_PPC_OF_BOOT_TRAMPOLINE + /* Get TOC pointer, non-virtual */ + bl relative_toc + + /* find out where we are now */ + bcl 20,31,$+4 +0: mflr r26 /* r26 = runtime addr here */ + addis r26,r26,(_stext - 0b)@ha + addi r26,r26,(_stext - 0b)@l /* current runtime base addr */ + /* Save parameters */ mr r31,r3 mr r30,r4 @@ -568,31 +615,25 @@ __boot_from_prom: /* Do all of the interaction with OF client interface */ mr r8,r26 - bl prom_init + bl CFUNC(prom_init) #endif /* #CONFIG_PPC_OF_BOOT_TRAMPOLINE */ /* We never return. We also hit that trap if trying to boot * from OF while CONFIG_PPC_OF_BOOT_TRAMPOLINE isn't selected */ trap + .previous __after_prom_start: #ifdef CONFIG_RELOCATABLE /* process relocations for the final address of the kernel */ - lis r25,PAGE_OFFSET@highest /* compute virtual base of kernel */ - sldi r25,r25,32 -#if defined(CONFIG_PPC_BOOK3E) - tovirt(r26,r26) /* on booke, we already run at PAGE_OFFSET */ -#endif lwz r7,(FIXED_SYMBOL_ABS_ADDR(__run_at_load))(r26) -#if defined(CONFIG_PPC_BOOK3E) - tophys(r26,r26) -#endif cmplwi cr0,r7,1 /* flagged to stay where we are ? */ - bne 1f - add r25,r25,r26 + mr r25,r26 /* then use current kernel base */ + beq 1f + LOAD_REG_IMMEDIATE(r25, PAGE_OFFSET) /* else use static kernel base */ 1: mr r3,r25 bl relocate -#if defined(CONFIG_PPC_BOOK3E) +#if defined(CONFIG_PPC_BOOK3E_64) /* IVPR needs to be set after relocation. */ bl init_core_book3e #endif @@ -605,14 +646,8 @@ __after_prom_start: * * Note: This process overwrites the OF exception vectors. */ - li r3,0 /* target addr */ -#ifdef CONFIG_PPC_BOOK3E - tovirt(r3,r3) /* on booke, we already run at PAGE_OFFSET */ -#endif + LOAD_REG_IMMEDIATE(r3, PAGE_OFFSET) mr. r4,r26 /* In some cases the loader may */ -#if defined(CONFIG_PPC_BOOK3E) - tovirt(r4,r4) -#endif beq 9f /* have already put us at zero */ li r6,0x100 /* Start offset, the first 0x100 */ /* bytes were copied earlier. */ @@ -623,14 +658,11 @@ __after_prom_start: * variable __run_at_load, if it is set the kernel is treated as relocatable * kernel, otherwise it will be moved to PHYSICAL_START */ -#if defined(CONFIG_PPC_BOOK3E) - tovirt(r26,r26) /* on booke, we already run at PAGE_OFFSET */ -#endif lwz r7,(FIXED_SYMBOL_ABS_ADDR(__run_at_load))(r26) cmplwi cr0,r7,1 bne 3f -#ifdef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_BOOK3E_64 LOAD_REG_ADDR(r5, __end_interrupts) LOAD_REG_ADDR(r11, _stext) sub r5,r5,r11 @@ -642,15 +674,15 @@ __after_prom_start: 3: #endif /* # bytes of memory to copy */ - lis r5,(ABS_ADDR(copy_to_here))@ha - addi r5,r5,(ABS_ADDR(copy_to_here))@l + lis r5,(ABS_ADDR(copy_to_here, text))@ha + addi r5,r5,(ABS_ADDR(copy_to_here, text))@l bl copy_and_flush /* copy the first n bytes */ /* this includes the code being */ /* executed here. */ /* Jump to the copy of this code that we just made */ - addis r8,r3,(ABS_ADDR(4f))@ha - addi r12,r8,(ABS_ADDR(4f))@l + addis r8,r3,(ABS_ADDR(4f, text))@ha + addi r12,r8,(ABS_ADDR(4f, text))@l mtctr r12 bctr @@ -662,8 +694,8 @@ p_end: .8byte _end - copy_to_here * Now copy the rest of the kernel up to _end, add * _end - copy_to_here to the copy limit and run again. */ - addis r8,r26,(ABS_ADDR(p_end))@ha - ld r8,(ABS_ADDR(p_end))@l(r8) + addis r8,r26,(ABS_ADDR(p_end, text))@ha + ld r8,(ABS_ADDR(p_end, text))@l(r8) add r5,r5,r8 5: bl copy_and_flush /* copy the rest */ @@ -705,6 +737,8 @@ _GLOBAL(copy_and_flush) isync blr +_ASM_NOKPROBE_SYMBOL(copy_and_flush); /* Called in real mode */ + .align 8 copy_to_here: @@ -742,9 +776,15 @@ _GLOBAL(pmac_secondary_start) sync slbia - /* get TOC pointer (real address) */ + /* Branch to our PAGE_OFFSET address */ + bcl 20,31,$+4 +1: mflr r11 + addi r11,r11,(2f - 1b) + tovirt(r11, r11) + mtctr r11 + bctr +2: bl relative_toc - tovirt(r2,r2) /* Copy some CPU settings from CPU 0 */ bl __restore_cpu_ppc970 @@ -771,7 +811,7 @@ _GLOBAL(pmac_secondary_start) /* Create a temp kernel stack for use before relocation is on. */ ld r1,PACAEMERGSP(r13) - subi r1,r1,STACK_FRAME_OVERHEAD + subi r1,r1,STACK_FRAME_MIN_SIZE b __secondary_start @@ -803,7 +843,7 @@ __secondary_start: * can turn it on below. This is a call to C, which is OK, we're still * running on the emergency stack. */ - bl early_setup_secondary + bl CFUNC(early_setup_secondary) /* * The primary has initialized our kernel stack for us in the paca, grab @@ -830,7 +870,7 @@ __secondary_start: mtspr SPRN_SRR0,r3 mtspr SPRN_SRR1,r4 - RFI + RFI_TO_KERNEL b . /* prevent speculative execution */ /* @@ -839,10 +879,10 @@ __secondary_start: * before going into C code. */ start_secondary_prolog: - ld r2,PACATOC(r13) + LOAD_PACA_TOC() li r3,0 std r3,0(r1) /* Zero the stack frame pointer */ - bl start_secondary + bl CFUNC(start_secondary) b . /* * Reset stack pointer and call start_secondary @@ -853,26 +893,26 @@ _GLOBAL(start_secondary_resume) ld r1,PACAKSAVE(r13) /* Reload kernel stack pointer */ li r3,0 std r3,0(r1) /* Zero the stack frame pointer */ - bl start_secondary + bl CFUNC(start_secondary) b . #endif /* * This subroutine clobbers r11 and r12 */ -enable_64b_mode: +SYM_FUNC_START_LOCAL(enable_64b_mode) mfmsr r11 /* grab the current MSR */ -#ifdef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_BOOK3E_64 oris r11,r11,0x8000 /* CM bit set, we'll set ICM later */ mtmsr r11 -#else /* CONFIG_PPC_BOOK3E */ - li r12,(MSR_64BIT | MSR_ISF)@highest - sldi r12,r12,48 +#else /* CONFIG_PPC_BOOK3E_64 */ + LOAD_REG_IMMEDIATE(r12, MSR_64BIT) or r11,r11,r12 mtmsrd r11 isync #endif blr +SYM_FUNC_END(enable_64b_mode) /* * This puts the TOC pointer into r2, offset by 0x8000 (as expected @@ -883,10 +923,15 @@ enable_64b_mode: * TOC in -mcmodel=medium mode. After we relocate to 0 but before * the MMU is on we need our TOC to be a virtual address otherwise * these pointers will be real addresses which may get stored and - * accessed later with the MMU on. We use tovirt() at the call - * sites to handle this. + * accessed later with the MMU on. We branch to the virtual address + * while still in real mode then call relative_toc again to handle + * this. */ _GLOBAL(relative_toc) +#ifdef CONFIG_PPC_KERNEL_PCREL + tdnei r2,-1 + blr +#else mflr r0 bcl 20,31,$+4 0: mflr r11 @@ -896,16 +941,16 @@ _GLOBAL(relative_toc) blr .balign 8 -p_toc: .8byte __toc_start + 0x8000 - 0b +p_toc: .8byte .TOC. - 0b +#endif /* * This is where the main kernel code starts. */ __REF start_here_multiplatform: - /* set up the TOC */ - bl relative_toc - tovirt(r2,r2) + /* Adjust TOC for moved kernel. Could adjust when moving it instead. */ + bl relative_toc /* Clear out the BSS. It may have been done in prom_init, * already but that's irrelevant since prom_init will soon @@ -932,7 +977,7 @@ start_here_multiplatform: std r29,8(r11); #endif -#ifndef CONFIG_PPC_BOOK3E +#ifndef CONFIG_PPC_BOOK3E_64 mfmsr r6 ori r6,r6,MSR_RI mtmsrd r6 /* RI on */ @@ -945,25 +990,21 @@ start_here_multiplatform: std r0,0(r4) #endif - /* The following gets the stack set up with the regs */ - /* pointing to the real addr of the kernel stack. This is */ - /* all done to support the C function call below which sets */ - /* up the htab. This is done because we have relocated the */ - /* kernel but are still running in real mode. */ - - LOAD_REG_ADDR(r3,init_thread_union) - /* set up a stack pointer */ + LOAD_REG_ADDR(r3,init_thread_union) LOAD_REG_IMMEDIATE(r1,THREAD_SIZE) add r1,r3,r1 li r0,0 - stdu r0,-STACK_FRAME_OVERHEAD(r1) + stdu r0,-STACK_FRAME_MIN_SIZE(r1) /* * Do very early kernel initializations, including initial hash table * and SLB setup before we turn on relocation. */ +#ifdef CONFIG_KASAN + bl CFUNC(kasan_early_init) +#endif /* Restore parameters passed from prom_init/kexec */ mr r3,r31 LOAD_REG_ADDR(r12, DOTSYM(early_setup)) @@ -974,10 +1015,9 @@ start_here_multiplatform: ld r4,PACAKMSR(r13) mtspr SPRN_SRR0,r3 mtspr SPRN_SRR1,r4 - RFI + RFI_TO_KERNEL b . /* prevent speculative execution */ - .previous /* This is where all platforms converge execution */ start_here_common: @@ -985,7 +1025,7 @@ start_here_common: std r1,PACAKSAVE(r13) /* Load the TOC (virtual address) */ - ld r2,PACATOC(r13) + LOAD_PACA_TOC() /* Mark interrupts soft and hard disabled (they might be enabled * in the PACA when doing hotplug) @@ -996,28 +1036,9 @@ start_here_common: stb r0,PACAIRQHAPPENED(r13) /* Generic kernel entry */ - bl start_kernel + bl CFUNC(start_kernel) /* Not reached */ - trap +0: trap EMIT_BUG_ENTRY 0b, __FILE__, __LINE__, 0 - -/* - * We put a few things here that have to be page-aligned. - * This stuff goes at the beginning of the bss, which is page-aligned. - */ - .section ".bss" -/* - * pgd dir should be aligned to PGD_TABLE_SIZE which is 64K. - * We will need to find a better way to fix this - */ - .align 16 - - .globl swapper_pg_dir -swapper_pg_dir: - .space PGD_TABLE_SIZE - - .globl empty_zero_page -empty_zero_page: - .space PAGE_SIZE -EXPORT_SYMBOL(empty_zero_page) + .previous diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_85xx.S index 6f7a3a7162c5..39724ff5ae1f 100644 --- a/arch/powerpc/kernel/head_fsl_booke.S +++ b/arch/powerpc/kernel/head_85xx.S @@ -28,17 +28,18 @@ #include <linux/init.h> #include <linux/threads.h> +#include <linux/pgtable.h> +#include <linux/linkage.h> + #include <asm/processor.h> #include <asm/page.h> #include <asm/mmu.h> -#include <asm/pgtable.h> #include <asm/cputable.h> #include <asm/thread_info.h> #include <asm/ppc_asm.h> #include <asm/asm-offsets.h> #include <asm/cache.h> #include <asm/ptrace.h> -#include <asm/export.h> #include <asm/feature-fixups.h> #include "head_booke.h" @@ -54,8 +55,8 @@ * */ __HEAD -_ENTRY(_stext); -_ENTRY(_start); +_GLOBAL(_stext); +_GLOBAL(_start); /* * Reserve a word at a fixed location to store the address * of abatron_pteptrs @@ -79,7 +80,7 @@ _ENTRY(_start); mr r23,r3 mr r25,r4 - bl 0f + bcl 20,31,$+4 0: mflr r8 addis r3,r8,(is_second_reloc - 0b)@ha lwz r19,(is_second_reloc - 0b)@l(r3) @@ -113,7 +114,7 @@ _ENTRY(_start); 1: /* - * We have the runtime (virutal) address of our base. + * We have the runtime (virtual) address of our base. * We calculate our shift of offset from a 64M page. * We could map the 64M page we belong to at PAGE_OFFSET and * get going from there. @@ -129,7 +130,7 @@ _ENTRY(_start); /* * For the second relocation, we already set the right tlb entries - * for the kernel space, so skip the code in fsl_booke_entry_mapping.S + * for the kernel space, so skip the code in 85xx_entry_mapping.S */ cmpwi r19,1 beq set_ivor @@ -154,12 +155,12 @@ _ENTRY(_start); * if needed */ -_ENTRY(__early_start) +_GLOBAL(__early_start) LOAD_REG_ADDR_PIC(r20, kernstart_virt_addr) lwz r20,0(r20) #define ENTRY_MAPPING_BOOT_SETUP -#include "fsl_booke_entry_mapping.S" +#include "85xx_entry_mapping.S" #undef ENTRY_MAPPING_BOOT_SETUP set_ivor: @@ -187,9 +188,6 @@ set_ivor: /* Setup the defaults for TLB entries */ li r2,(MAS4_TSIZED(BOOK3E_PAGESZ_4K))@l -#ifdef CONFIG_E200 - oris r2,r2,MAS4_TLBSELD(1)@h -#endif mtspr SPRN_MAS4, r2 #if !defined(CONFIG_BDI_SWITCH) @@ -232,7 +230,7 @@ set_ivor: lis r1,init_thread_union@h ori r1,r1,init_thread_union@l li r0,0 - stwu r0,THREAD_SIZE-STACK_FRAME_OVERHEAD(r1) + stwu r0,THREAD_SIZE-STACK_FRAME_MIN_SIZE(r1) #ifdef CONFIG_SMP stw r24, TASK_CPU(r2) @@ -362,32 +360,30 @@ interrupt_base: CRITICAL_EXCEPTION(0x0100, CRITICAL, CriticalInput, unknown_exception) /* Machine Check Interrupt */ -#ifdef CONFIG_E200 - /* no RFMCI, MCSRRs on E200 */ - CRITICAL_EXCEPTION(0x0200, MACHINE_CHECK, MachineCheck, \ - machine_check_exception) -#else MCHECK_EXCEPTION(0x0200, MachineCheck, machine_check_exception) -#endif /* Data Storage Interrupt */ START_EXCEPTION(DataStorage) - NORMAL_EXCEPTION_PROLOG(DATA_STORAGE) - mfspr r5,SPRN_ESR /* Grab the ESR, save it, pass arg3 */ + NORMAL_EXCEPTION_PROLOG(0x300, DATA_STORAGE) + mfspr r5,SPRN_ESR /* Grab the ESR, save it */ stw r5,_ESR(r11) - mfspr r4,SPRN_DEAR /* Grab the DEAR, save it, pass arg2 */ + mfspr r4,SPRN_DEAR /* Grab the DEAR, save it */ + stw r4, _DEAR(r11) andis. r10,r5,(ESR_ILK|ESR_DLK)@h bne 1f - EXC_XFER_LITE(0x0300, handle_page_fault) + prepare_transfer_to_handler + bl do_page_fault + b interrupt_return 1: - addi r3,r1,STACK_FRAME_OVERHEAD - EXC_XFER_LITE(0x0300, CacheLockingException) + prepare_transfer_to_handler + bl CacheLockingException + b interrupt_return /* Instruction Storage Interrupt */ INSTRUCTION_STORAGE_EXCEPTION /* External Input Interrupt */ - EXCEPTION(0x0500, EXTERNAL, ExternalInput, do_IRQ, EXC_XFER_LITE) + EXCEPTION(0x0500, EXTERNAL, ExternalInput, do_IRQ) /* Alignment Interrupt */ ALIGNMENT_EXCEPTION @@ -399,14 +395,7 @@ interrupt_base: #ifdef CONFIG_PPC_FPU FP_UNAVAILABLE_EXCEPTION #else -#ifdef CONFIG_E200 - /* E200 treats 'normal' floating point instructions as FP Unavail exception */ - EXCEPTION(0x0800, FP_UNAVAIL, FloatingPointUnavailable, \ - program_check_exception, EXC_XFER_STD) -#else - EXCEPTION(0x0800, FP_UNAVAIL, FloatingPointUnavailable, \ - unknown_exception, EXC_XFER_STD) -#endif + EXCEPTION(0x0800, FP_UNAVAIL, FloatingPointUnavailable, emulation_assist_interrupt) #endif /* System Call Interrupt */ @@ -414,16 +403,14 @@ interrupt_base: SYSCALL_ENTRY 0xc00 BOOKE_INTERRUPT_SYSCALL SPRN_SRR1 /* Auxiliary Processor Unavailable Interrupt */ - EXCEPTION(0x2900, AP_UNAVAIL, AuxillaryProcessorUnavailable, \ - unknown_exception, EXC_XFER_STD) + EXCEPTION(0x2900, AP_UNAVAIL, AuxillaryProcessorUnavailable, unknown_exception) /* Decrementer Interrupt */ DECREMENTER_EXCEPTION /* Fixed Internal Timer Interrupt */ /* TODO: Add FIT support */ - EXCEPTION(0x3100, FIT, FixedIntervalTimer, \ - unknown_exception, EXC_XFER_STD) + EXCEPTION(0x3100, FIT, FixedIntervalTimer, unknown_exception) /* Watchdog Timer Interrupt */ #ifdef CONFIG_BOOKE_WDT @@ -476,9 +463,15 @@ END_BTB_FLUSH_SECTION mfspr r11,SPRN_SPRG_THREAD lwz r11,PGDIR(r11) +#ifdef CONFIG_PPC_KUAP + mfspr r12, SPRN_MAS1 + rlwinm. r12,r12,0,0x3fff0000 + beq 2f /* KUAP fault */ +#endif + 4: /* Mask of required permission bits. Note that while we - * do copy ESR:ST to _PAGE_RW position as trying to write + * do copy ESR:ST to _PAGE_WRITE position as trying to write * to an RO page is pretty common, we don't do it with * _PAGE_DIRTY. We could do it, but it's a fairly rare * event so I'd rather take the overhead when it happens @@ -492,10 +485,10 @@ END_BTB_FLUSH_SECTION */ mfspr r12,SPRN_ESR #ifdef CONFIG_PTE_64BIT - li r13,_PAGE_PRESENT + li r13,_PAGE_PRESENT|_PAGE_BAP_SR oris r13,r13,_PAGE_ACCESSED@h #else - li r13,_PAGE_PRESENT|_PAGE_ACCESSED + li r13,_PAGE_PRESENT|_PAGE_READ|_PAGE_ACCESSED #endif rlwimi r13,r12,11,29,29 @@ -511,7 +504,7 @@ END_BTB_FLUSH_SECTION #endif #endif - bne 2f /* Bail if permission/valid mismach */ + bne 2f /* Bail if permission/valid mismatch */ /* Jump to common tlb load */ b finish_tlb_load @@ -585,6 +578,12 @@ END_BTB_FLUSH_SECTION mfspr r11,SPRN_SPRG_THREAD lwz r11,PGDIR(r11) +#ifdef CONFIG_PPC_KUAP + mfspr r12, SPRN_MAS1 + rlwinm. r12,r12,0,0x3fff0000 + beq 2f /* KUAP fault */ +#endif + /* Make up the required permissions for user code */ #ifdef CONFIG_PTE_64BIT li r13,_PAGE_PRESENT | _PAGE_BAP_UX @@ -606,7 +605,7 @@ END_BTB_FLUSH_SECTION #endif #endif - bne 2f /* Bail if permission mismach */ + bne 2f /* Bail if permission mismatch */ /* Jump to common TLB load point */ b finish_tlb_load @@ -624,42 +623,48 @@ END_BTB_FLUSH_SECTION mfspr r10, SPRN_SPRG_RSCRATCH0 b InstructionStorage -/* Define SPE handlers for e200 and e500v2 */ +/* Define SPE handlers for e500v2 */ #ifdef CONFIG_SPE /* SPE Unavailable */ START_EXCEPTION(SPEUnavailable) - NORMAL_EXCEPTION_PROLOG(SPE_UNAVAIL) + NORMAL_EXCEPTION_PROLOG(0x2010, SPE_UNAVAIL) beq 1f bl load_up_spe b fast_exception_return -1: addi r3,r1,STACK_FRAME_OVERHEAD - EXC_XFER_LITE(0x2010, KernelSPE) +1: prepare_transfer_to_handler + bl KernelSPE + b interrupt_return #elif defined(CONFIG_SPE_POSSIBLE) - EXCEPTION(0x2020, SPE_UNAVAIL, SPEUnavailable, \ - unknown_exception, EXC_XFER_STD) + EXCEPTION(0x2020, SPE_UNAVAIL, SPEUnavailable, unknown_exception) #endif /* CONFIG_SPE_POSSIBLE */ /* SPE Floating Point Data */ #ifdef CONFIG_SPE - EXCEPTION(0x2030, SPE_FP_DATA, SPEFloatingPointData, - SPEFloatingPointException, EXC_XFER_STD) + START_EXCEPTION(SPEFloatingPointData) + NORMAL_EXCEPTION_PROLOG(0x2030, SPE_FP_DATA) + prepare_transfer_to_handler + bl SPEFloatingPointException + REST_NVGPRS(r1) + b interrupt_return /* SPE Floating Point Round */ - EXCEPTION(0x2050, SPE_FP_ROUND, SPEFloatingPointRound, \ - SPEFloatingPointRoundException, EXC_XFER_STD) + START_EXCEPTION(SPEFloatingPointRound) + NORMAL_EXCEPTION_PROLOG(0x2050, SPE_FP_ROUND) + prepare_transfer_to_handler + bl SPEFloatingPointRoundException + REST_NVGPRS(r1) + b interrupt_return #elif defined(CONFIG_SPE_POSSIBLE) - EXCEPTION(0x2040, SPE_FP_DATA, SPEFloatingPointData, - unknown_exception, EXC_XFER_STD) - EXCEPTION(0x2050, SPE_FP_ROUND, SPEFloatingPointRound, \ - unknown_exception, EXC_XFER_STD) + EXCEPTION(0x2040, SPE_FP_DATA, SPEFloatingPointData, unknown_exception) + EXCEPTION(0x2050, SPE_FP_ROUND, SPEFloatingPointRound, unknown_exception) #endif /* CONFIG_SPE_POSSIBLE */ /* Performance Monitor */ EXCEPTION(0x2060, PERFORMANCE_MONITOR, PerformanceMonitor, \ - performance_monitor_exception, EXC_XFER_STD) + performance_monitor_exception) - EXCEPTION(0x2070, DOORBELL, Doorbell, doorbell_exception, EXC_XFER_STD) + EXCEPTION(0x2070, DOORBELL, Doorbell, doorbell_exception) CRITICAL_EXCEPTION(0x2080, DOORBELL_CRITICAL, \ CriticalDoorbell, unknown_exception) @@ -674,10 +679,10 @@ END_BTB_FLUSH_SECTION unknown_exception) /* Hypercall */ - EXCEPTION(0, HV_SYSCALL, Hypercall, unknown_exception, EXC_XFER_STD) + EXCEPTION(0, HV_SYSCALL, Hypercall, unknown_exception) /* Embedded Hypervisor Privilege */ - EXCEPTION(0, HV_PRIV, Ehvpriv, unknown_exception, EXC_XFER_STD) + EXCEPTION(0, HV_PRIV, Ehvpriv, unknown_exception) interrupt_end: @@ -778,14 +783,15 @@ BEGIN_MMU_FTR_SECTION mtspr SPRN_MAS7, r10 END_MMU_FTR_SECTION_IFSET(MMU_FTR_BIG_PHYS) #else - li r10, (_PAGE_EXEC | _PAGE_PRESENT) + li r10, (_PAGE_EXEC | _PAGE_READ) mr r13, r11 rlwimi r10, r11, 31, 29, 29 /* extract _PAGE_DIRTY into SW */ and r12, r11, r10 - andi. r10, r11, _PAGE_USER /* Test for _PAGE_USER */ + mcrf cr0, cr5 /* Test for user page */ slwi r10, r12, 1 or r10, r10, r12 - iseleq r12, r12, r10 + rlwinm r10, r10, 0, ~_PAGE_EXEC /* Clear SX on user pages */ + isellt r12, r10, r12 rlwimi r13, r12, 0, 20, 31 /* Get RPN from PTE, merge w/ perms */ mtspr SPRN_MAS3, r13 #endif @@ -806,31 +812,6 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_BIG_PHYS) #endif 3: mtspr SPRN_MAS2, r12 -#ifdef CONFIG_E200 - /* Round robin TLB1 entries assignment */ - mfspr r12, SPRN_MAS0 - - /* Extract TLB1CFG(NENTRY) */ - mfspr r11, SPRN_TLB1CFG - andi. r11, r11, 0xfff - - /* Extract MAS0(NV) */ - andi. r13, r12, 0xfff - addi r13, r13, 1 - cmpw 0, r13, r11 - addi r12, r12, 1 - - /* check if we need to wrap */ - blt 7f - - /* wrap back to first free tlbcam entry */ - lis r13, tlbcam_index@ha - lwz r13, tlbcam_index@l(r13) - rlwimi r12, r13, 0, 20, 31 -7: - mtspr SPRN_MAS0,r12 -#endif /* CONFIG_E200 */ - tlb_write_entry: tlbwe @@ -882,7 +863,7 @@ _GLOBAL(load_up_spe) * SPE unavailable trap from kernel - print a message, but let * the task use SPE in the kernel until it returns to user mode. */ -KernelSPE: +SYM_FUNC_START_LOCAL(KernelSPE) lwz r3,_MSR(r1) oris r3,r3,MSR_SPE@h stw r3,_MSR(r1) /* enable use of SPE after return */ @@ -891,21 +872,22 @@ KernelSPE: ori r3,r3,87f@l mr r4,r2 /* current */ lwz r5,_NIP(r1) - bl printk + bl _printk #endif - b ret_from_except + b interrupt_return #ifdef CONFIG_PRINTK 87: .string "SPE used in kernel (task=%p, pc=%x) \n" #endif .align 4,0 +SYM_FUNC_END(KernelSPE) #endif /* CONFIG_SPE */ /* * Translate the effec addr in r3 to phys addr. The phys addr will be put * into r3(higher 32bit) and r4(lower 32bit) */ -get_phys_addr: +SYM_FUNC_START_LOCAL(get_phys_addr) mfmsr r8 mfspr r9,SPRN_PID rlwinm r9,r9,16,0x3fff0000 /* turn PID into MAS6[SPID] */ @@ -927,27 +909,13 @@ get_phys_addr: mfspr r3,SPRN_MAS7 #endif blr +SYM_FUNC_END(get_phys_addr) /* * Global functions */ -#ifdef CONFIG_E200 -/* Adjust or setup IVORs for e200 */ -_GLOBAL(__setup_e200_ivors) - li r3,DebugDebug@l - mtspr SPRN_IVOR15,r3 - li r3,SPEUnavailable@l - mtspr SPRN_IVOR32,r3 - li r3,SPEFloatingPointData@l - mtspr SPRN_IVOR33,r3 - li r3,SPEFloatingPointRound@l - mtspr SPRN_IVOR34,r3 - sync - blr -#endif - -#ifdef CONFIG_E500 +#ifdef CONFIG_PPC_E500 #ifndef CONFIG_PPC_E500MC /* Adjust or setup IVORs for e500v1/v2 */ _GLOBAL(__setup_e500_ivors) @@ -990,7 +958,7 @@ _GLOBAL(__setup_ehv_ivors) sync blr #endif /* CONFIG_PPC_E500MC */ -#endif /* CONFIG_E500 */ +#endif /* CONFIG_PPC_E500 */ #ifdef CONFIG_SPE /* @@ -1007,10 +975,10 @@ _GLOBAL(__giveup_spe) li r4,THREAD_ACC evstddx evr6, r4, r3 /* save off accumulator */ beq 1f - lwz r4,_MSR-STACK_FRAME_OVERHEAD(r5) + lwz r4,_MSR-STACK_INT_FRAME_REGS(r5) lis r3,MSR_SPE@h andc r4,r4,r3 /* disable SPE for previous task */ - stw r4,_MSR-STACK_FRAME_OVERHEAD(r5) + stw r4,_MSR-STACK_INT_FRAME_REGS(r5) 1: blr #endif /* CONFIG_SPE */ @@ -1033,20 +1001,6 @@ _GLOBAL(abort) mtspr SPRN_DBCR0,r13 isync -_GLOBAL(set_context) - -#ifdef CONFIG_BDI_SWITCH - /* Context switch the PTE pointer for the Abatron BDI2000. - * The PGDIR is the second parameter. - */ - lis r5, abatron_pteptrs@h - ori r5, r5, abatron_pteptrs@l - stw r4, 0x4(r5) -#endif - mtspr SPRN_PID,r3 - isync /* Force context change */ - blr - #ifdef CONFIG_SMP /* When we get here, r24 needs to hold the CPU # */ .globl __secondary_start @@ -1093,7 +1047,7 @@ __secondary_start: lwz r1,TASK_STACK(r2) /* stack */ - addi r1,r1,THREAD_SIZE-STACK_FRAME_OVERHEAD + addi r1,r1,THREAD_SIZE-STACK_FRAME_MIN_SIZE li r0,0 stw r0,0(r1) @@ -1194,7 +1148,7 @@ _GLOBAL(switch_to_as1) bne 1b /* Get the tlb entry used by the current running code */ - bl 0f + bcl 20,31,$+4 0: mflr r4 tlbsx 0,r4 @@ -1228,7 +1182,7 @@ _GLOBAL(switch_to_as1) _GLOBAL(restore_to_as0) mflr r0 - bl 0f + bcl 20,31,$+4 0: mflr r9 addi r9,r9,1f - 0b @@ -1274,26 +1228,3 @@ _GLOBAL(restore_to_as0) */ 3: mr r3,r5 bl _start - -/* - * We put a few things here that have to be page-aligned. This stuff - * goes at the beginning of the data segment, which is page-aligned. - */ - .data - .align 12 - .globl sdata -sdata: - .globl empty_zero_page -empty_zero_page: - .space 4096 -EXPORT_SYMBOL(empty_zero_page) - .globl swapper_pg_dir -swapper_pg_dir: - .space PGD_TABLE_SIZE - -/* - * Room for two PTE pointers, usually the kernel and current user pointers - * to their respective root page table. - */ -abatron_pteptrs: - .space 8 diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index 19f583e18402..647b0b445e89 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -16,35 +16,21 @@ #include <linux/init.h> #include <linux/magic.h> +#include <linux/pgtable.h> +#include <linux/sizes.h> +#include <linux/linkage.h> + #include <asm/processor.h> #include <asm/page.h> #include <asm/mmu.h> #include <asm/cache.h> -#include <asm/pgtable.h> #include <asm/cputable.h> #include <asm/thread_info.h> #include <asm/ppc_asm.h> #include <asm/asm-offsets.h> #include <asm/ptrace.h> -#include <asm/export.h> #include <asm/code-patching-asm.h> - -#include "head_32.h" - -#if CONFIG_TASK_SIZE <= 0x80000000 && CONFIG_PAGE_OFFSET >= 0x80000000 -/* By simply checking Address >= 0x80000000, we know if its a kernel address */ -#define SIMPLE_KERNEL_ADDRESS 1 -#endif - -/* - * We need an ITLB miss handler for kernel addresses if: - * - Either we have modules - * - Or we have not pinned the first 8M - */ -#if defined(CONFIG_MODULES) || !defined(CONFIG_PIN_TLB_TEXT) || \ - defined(CONFIG_DEBUG_PAGEALLOC) -#define ITLB_MISS_KERNEL 1 -#endif +#include <asm/interrupt.h> /* * Value for the bits that have fixed value in RPN entries. @@ -52,12 +38,24 @@ */ #define RPN_PATTERN 0x00f0 +#include "head_32.h" + +.macro compare_to_kernel_boundary scratch, addr +#if CONFIG_TASK_SIZE <= 0x80000000 && CONFIG_PAGE_OFFSET >= 0x80000000 +/* By simply checking Address >= 0x80000000, we know if its a kernel address */ + not. \scratch, \addr +#else + rlwinm \scratch, \addr, 16, 0xfff8 + cmpli cr0, \scratch, PAGE_OFFSET@h +#endif +.endm + #define PAGE_SHIFT_512K 19 #define PAGE_SHIFT_8M 23 __HEAD -_ENTRY(_stext); -_ENTRY(_start); +_GLOBAL(_stext); +_GLOBAL(_start); /* MPC8xx * This port was done on an MBX board with an 860. Right now I only @@ -122,89 +120,54 @@ instruction_counter: #endif /* System reset */ - EXCEPTION(0x100, Reset, system_reset_exception, EXC_XFER_STD) + EXCEPTION(INTERRUPT_SYSTEM_RESET, Reset, system_reset_exception) /* Machine check */ - . = 0x200 -MachineCheck: - EXCEPTION_PROLOG - mfspr r4,SPRN_DAR - stw r4,_DAR(r11) - li r5,RPN_PATTERN - mtspr SPRN_DAR,r5 /* Tag DAR, to be used in DTLB Error */ - mfspr r5,SPRN_DSISR - stw r5,_DSISR(r11) - addi r3,r1,STACK_FRAME_OVERHEAD - EXC_XFER_STD(0x200, machine_check_exception) - -/* Data access exception. - * This is "never generated" by the MPC8xx. - */ - . = 0x300 -DataAccess: - -/* Instruction access exception. - * This is "never generated" by the MPC8xx. - */ - . = 0x400 -InstructionAccess: + START_EXCEPTION(INTERRUPT_MACHINE_CHECK, MachineCheck) + EXCEPTION_PROLOG INTERRUPT_MACHINE_CHECK MachineCheck handle_dar_dsisr=1 + prepare_transfer_to_handler + bl machine_check_exception + b interrupt_return /* External interrupt */ - EXCEPTION(0x500, HardwareInterrupt, do_IRQ, EXC_XFER_LITE) + EXCEPTION(INTERRUPT_EXTERNAL, HardwareInterrupt, do_IRQ) /* Alignment exception */ - . = 0x600 -Alignment: - EXCEPTION_PROLOG - mfspr r4,SPRN_DAR - stw r4,_DAR(r11) - li r5,RPN_PATTERN - mtspr SPRN_DAR,r5 /* Tag DAR, to be used in DTLB Error */ - mfspr r5,SPRN_DSISR - stw r5,_DSISR(r11) - addi r3,r1,STACK_FRAME_OVERHEAD - EXC_XFER_STD(0x600, alignment_exception) + START_EXCEPTION(INTERRUPT_ALIGNMENT, Alignment) + EXCEPTION_PROLOG INTERRUPT_ALIGNMENT Alignment handle_dar_dsisr=1 + prepare_transfer_to_handler + bl alignment_exception + REST_NVGPRS(r1) + b interrupt_return /* Program check exception */ - EXCEPTION(0x700, ProgramCheck, program_check_exception, EXC_XFER_STD) - -/* No FPU on MPC8xx. This exception is not supposed to happen. -*/ - EXCEPTION(0x800, FPUnavailable, unknown_exception, EXC_XFER_STD) + START_EXCEPTION(INTERRUPT_PROGRAM, ProgramCheck) + EXCEPTION_PROLOG INTERRUPT_PROGRAM ProgramCheck + prepare_transfer_to_handler + bl program_check_exception + REST_NVGPRS(r1) + b interrupt_return /* Decrementer */ - EXCEPTION(0x900, Decrementer, timer_interrupt, EXC_XFER_LITE) - - EXCEPTION(0xa00, Trap_0a, unknown_exception, EXC_XFER_STD) - EXCEPTION(0xb00, Trap_0b, unknown_exception, EXC_XFER_STD) + EXCEPTION(INTERRUPT_DECREMENTER, Decrementer, timer_interrupt) /* System call */ - . = 0xc00 -SystemCall: - SYSCALL_ENTRY 0xc00 + START_EXCEPTION(INTERRUPT_SYSCALL, SystemCall) + SYSCALL_ENTRY INTERRUPT_SYSCALL /* Single step - not used on 601 */ - EXCEPTION(0xd00, SingleStep, single_step_exception, EXC_XFER_STD) - EXCEPTION(0xe00, Trap_0e, unknown_exception, EXC_XFER_STD) - EXCEPTION(0xf00, Trap_0f, unknown_exception, EXC_XFER_STD) + EXCEPTION(INTERRUPT_TRACE, SingleStep, single_step_exception) /* On the MPC8xx, this is a software emulation interrupt. It occurs * for all unimplemented and illegal instructions. */ - EXCEPTION(0x1000, SoftEmu, program_check_exception, EXC_XFER_STD) - -/* Called from DataStoreTLBMiss when perf TLB misses events are activated */ -#ifdef CONFIG_PERF_EVENTS - patch_site 0f, patch__dtlbmiss_perf -0: lwz r10, (dtlb_miss_counter - PAGE_OFFSET)@l(0) - addi r10, r10, 1 - stw r10, (dtlb_miss_counter - PAGE_OFFSET)@l(0) - mfspr r10, SPRN_SPRG_SCRATCH0 - mfspr r11, SPRN_SPRG_SCRATCH1 - rfi -#endif + START_EXCEPTION(INTERRUPT_SOFT_EMU_8xx, SoftEmu) + EXCEPTION_PROLOG INTERRUPT_SOFT_EMU_8xx SoftEmu + prepare_transfer_to_handler + bl emulation_assist_interrupt + REST_NVGPRS(r1) + b interrupt_return - . = 0x1100 /* * For the MPC8xx, this is a software tablewalk to load the instruction * TLB. The task switch loads the M_TWB register with the pointer to the first @@ -217,88 +180,58 @@ SystemCall: */ #ifdef CONFIG_8xx_CPU15 -#define INVALIDATE_ADJACENT_PAGES_CPU15(addr) \ - addi addr, addr, PAGE_SIZE; \ - tlbie addr; \ - addi addr, addr, -(PAGE_SIZE << 1); \ - tlbie addr; \ - addi addr, addr, PAGE_SIZE +#define INVALIDATE_ADJACENT_PAGES_CPU15(addr, tmp) \ + addi tmp, addr, PAGE_SIZE; \ + tlbie tmp; \ + addi tmp, addr, -PAGE_SIZE; \ + tlbie tmp #else -#define INVALIDATE_ADJACENT_PAGES_CPU15(addr) +#define INVALIDATE_ADJACENT_PAGES_CPU15(addr, tmp) #endif -InstructionTLBMiss: - mtspr SPRN_SPRG_SCRATCH0, r10 -#if defined(ITLB_MISS_KERNEL) || defined(CONFIG_SWAP) - mtspr SPRN_SPRG_SCRATCH1, r11 -#endif + START_EXCEPTION(INTERRUPT_INST_TLB_MISS_8xx, InstructionTLBMiss) + mtspr SPRN_SPRG_SCRATCH2, r10 + mtspr SPRN_M_TW, r11 /* If we are faulting a kernel address, we have to use the * kernel page tables. */ mfspr r10, SPRN_SRR0 /* Get effective address of fault */ - INVALIDATE_ADJACENT_PAGES_CPU15(r10) + INVALIDATE_ADJACENT_PAGES_CPU15(r10, r11) mtspr SPRN_MD_EPN, r10 - /* Only modules will cause ITLB Misses as we always - * pin the first 8MB of kernel memory */ -#ifdef ITLB_MISS_KERNEL +#ifdef CONFIG_MODULES mfcr r11 -#if defined(SIMPLE_KERNEL_ADDRESS) && defined(CONFIG_PIN_TLB_TEXT) - cmpi cr0, r10, 0 /* Address >= 0x80000000 */ -#else - rlwinm r10, r10, 16, 0xfff8 - cmpli cr0, r10, PAGE_OFFSET@h -#ifndef CONFIG_PIN_TLB_TEXT - /* It is assumed that kernel code fits into the first 32M */ -0: cmpli cr7, r10, (PAGE_OFFSET + 0x2000000)@h - patch_site 0b, patch__itlbmiss_linmem_top -#endif -#endif + compare_to_kernel_boundary r10, r10 #endif mfspr r10, SPRN_M_TWB /* Get level 1 table */ -#ifdef ITLB_MISS_KERNEL -#if defined(SIMPLE_KERNEL_ADDRESS) && defined(CONFIG_PIN_TLB_TEXT) - bge+ 3f -#else +#ifdef CONFIG_MODULES blt+ 3f -#endif -#ifndef CONFIG_PIN_TLB_TEXT - blt cr7, ITLBMissLinear -#endif rlwinm r10, r10, 0, 20, 31 oris r10, r10, (swapper_pg_dir - PAGE_OFFSET)@ha 3: + mtcr r11 #endif - lwz r10, (swapper_pg_dir-PAGE_OFFSET)@l(r10) /* Get level 1 entry */ - mtspr SPRN_MI_TWC, r10 /* Set segment attributes */ - - mtspr SPRN_MD_TWC, r10 + lwz r11, (swapper_pg_dir-PAGE_OFFSET)@l(r10) /* Get level 1 entry */ + mtspr SPRN_MD_TWC, r11 mfspr r10, SPRN_MD_TWC lwz r10, 0(r10) /* Get the pte */ -#ifdef ITLB_MISS_KERNEL - mtcr r11 -#endif -#ifdef CONFIG_SWAP - rlwinm r11, r10, 32-5, _PAGE_PRESENT - and r11, r11, r10 - rlwimi r10, r11, 0, _PAGE_PRESENT -#endif + rlwimi r11, r10, 0, _PAGE_GUARDED | _PAGE_ACCESSED + rlwimi r11, r10, 32 - 9, _PMD_PAGE_512K + mtspr SPRN_MI_TWC, r11 /* The Linux PTE won't go exactly into the MMU TLB. * Software indicator bits 20 and 23 must be clear. * Software indicator bits 22, 24, 25, 26, and 27 must be * set. All other Linux PTE bits control the behavior * of the MMU. */ - rlwimi r10, r10, 0, 0x0f00 /* Clear bits 20-23 */ + rlwinm r10, r10, 0, ~0x0f00 /* Clear bits 20-23 */ rlwimi r10, r10, 4, 0x0400 /* Copy _PAGE_EXEC into bit 21 */ ori r10, r10, RPN_PATTERN | 0x200 /* Set 22 and 24-27 */ mtspr SPRN_MI_RPN, r10 /* Update TLB entry */ /* Restore registers */ -0: mfspr r10, SPRN_SPRG_SCRATCH0 -#if defined(ITLB_MISS_KERNEL) || defined(CONFIG_SWAP) - mfspr r11, SPRN_SPRG_SCRATCH1 -#endif +0: mfspr r10, SPRN_SPRG_SCRATCH2 + mfspr r11, SPRN_M_TW rfi patch_site 0b, patch__itlbmiss_exit_1 @@ -307,65 +240,23 @@ InstructionTLBMiss: 0: lwz r10, (itlb_miss_counter - PAGE_OFFSET)@l(0) addi r10, r10, 1 stw r10, (itlb_miss_counter - PAGE_OFFSET)@l(0) - mfspr r10, SPRN_SPRG_SCRATCH0 -#if defined(ITLB_MISS_KERNEL) || defined(CONFIG_SWAP) - mfspr r11, SPRN_SPRG_SCRATCH1 -#endif + mfspr r10, SPRN_SPRG_SCRATCH2 + mfspr r11, SPRN_M_TW rfi #endif -#ifndef CONFIG_PIN_TLB_TEXT -ITLBMissLinear: - mtcr r11 -#if defined(CONFIG_STRICT_KERNEL_RWX) && CONFIG_ETEXT_SHIFT < 23 - patch_site 0f, patch__itlbmiss_linmem_top8 - - mfspr r10, SPRN_SRR0 -0: subis r11, r10, (PAGE_OFFSET - 0x80000000)@ha - rlwinm r11, r11, 4, MI_PS8MEG ^ MI_PS512K - ori r11, r11, MI_PS512K | MI_SVALID - rlwinm r10, r10, 0, 0x0ff80000 /* 8xx supports max 256Mb RAM */ -#else - /* Set 8M byte page and mark it valid */ - li r11, MI_PS8MEG | MI_SVALID - rlwinm r10, r10, 20, 0x0f800000 /* 8xx supports max 256Mb RAM */ -#endif - mtspr SPRN_MI_TWC, r11 - ori r10, r10, 0xf0 | MI_SPS16K | _PAGE_SH | _PAGE_DIRTY | \ - _PAGE_PRESENT - mtspr SPRN_MI_RPN, r10 /* Update TLB entry */ - -0: mfspr r10, SPRN_SPRG_SCRATCH0 - mfspr r11, SPRN_SPRG_SCRATCH1 - rfi - patch_site 0b, patch__itlbmiss_exit_2 -#endif - - . = 0x1200 -DataStoreTLBMiss: - mtspr SPRN_SPRG_SCRATCH0, r10 - mtspr SPRN_SPRG_SCRATCH1, r11 + START_EXCEPTION(INTERRUPT_DATA_TLB_MISS_8xx, DataStoreTLBMiss) + mtspr SPRN_SPRG_SCRATCH2, r10 + mtspr SPRN_M_TW, r11 mfcr r11 /* If we are faulting a kernel address, we have to use the * kernel page tables. */ mfspr r10, SPRN_MD_EPN - rlwinm r10, r10, 16, 0xfff8 - cmpli cr0, r10, PAGE_OFFSET@h -#ifndef CONFIG_PIN_TLB_IMMR - cmpli cr6, r10, VIRT_IMMR_BASE@h -#endif -0: cmpli cr7, r10, (PAGE_OFFSET + 0x2000000)@h - patch_site 0b, patch__dtlbmiss_linmem_top - + compare_to_kernel_boundary r10, r10 mfspr r10, SPRN_M_TWB /* Get level 1 table */ blt+ 3f -#ifndef CONFIG_PIN_TLB_IMMR -0: beq- cr6, DTLBMissIMMR - patch_site 0b, patch__dtlbmiss_immr_jmp -#endif - blt cr7, DTLBMissLinear rlwinm r10, r10, 0, 20, 31 oris r10, r10, (swapper_pg_dir - PAGE_OFFSET)@ha 3: @@ -376,29 +267,16 @@ DataStoreTLBMiss: mfspr r10, SPRN_MD_TWC lwz r10, 0(r10) /* Get the pte */ - /* Insert the Guarded flag into the TWC from the Linux PTE. + /* Insert Guarded and Accessed flags into the TWC from the Linux PTE. * It is bit 27 of both the Linux PTE and the TWC (at least * I got that right :-). It will be better when we can put * this into the Linux pgd/pmd and load it in the operation * above. */ - rlwimi r11, r10, 0, _PAGE_GUARDED + rlwimi r11, r10, 0, _PAGE_GUARDED | _PAGE_ACCESSED + rlwimi r11, r10, 32 - 9, _PMD_PAGE_512K mtspr SPRN_MD_TWC, r11 - /* Both _PAGE_ACCESSED and _PAGE_PRESENT has to be set. - * We also need to know if the insn is a load/store, so: - * Clear _PAGE_PRESENT and load that which will - * trap into DTLB Error with store bit set accordinly. - */ - /* PRESENT=0x1, ACCESSED=0x20 - * r11 = ((r10 & PRESENT) & ((r10 & ACCESSED) >> 5)); - * r10 = (r10 & ~PRESENT) | r11; - */ -#ifdef CONFIG_SWAP - rlwinm r11, r10, 32-5, _PAGE_PRESENT - and r11, r11, r10 - rlwimi r10, r11, 0, _PAGE_PRESENT -#endif /* The Linux PTE won't go exactly into the MMU TLB. * Software indicator bits 24, 25, 26, and 27 must be * set. All other Linux PTE bits control the behavior @@ -407,153 +285,97 @@ DataStoreTLBMiss: li r11, RPN_PATTERN rlwimi r10, r11, 0, 24, 27 /* Set 24-27 */ mtspr SPRN_MD_RPN, r10 /* Update TLB entry */ + mtspr SPRN_DAR, r11 /* Tag DAR */ /* Restore registers */ - mtspr SPRN_DAR, r11 /* Tag DAR */ -0: mfspr r10, SPRN_SPRG_SCRATCH0 - mfspr r11, SPRN_SPRG_SCRATCH1 +0: mfspr r10, SPRN_SPRG_SCRATCH2 + mfspr r11, SPRN_M_TW rfi patch_site 0b, patch__dtlbmiss_exit_1 -DTLBMissIMMR: - mtcr r11 - /* Set 512k byte guarded page and mark it valid */ - li r10, MD_PS512K | MD_GUARDED | MD_SVALID - mtspr SPRN_MD_TWC, r10 - mfspr r10, SPRN_IMMR /* Get current IMMR */ - rlwinm r10, r10, 0, 0xfff80000 /* Get 512 kbytes boundary */ - ori r10, r10, 0xf0 | MD_SPS16K | _PAGE_SH | _PAGE_DIRTY | \ - _PAGE_PRESENT | _PAGE_NO_CACHE - mtspr SPRN_MD_RPN, r10 /* Update TLB entry */ - - li r11, RPN_PATTERN - mtspr SPRN_DAR, r11 /* Tag DAR */ - -0: mfspr r10, SPRN_SPRG_SCRATCH0 - mfspr r11, SPRN_SPRG_SCRATCH1 +#ifdef CONFIG_PERF_EVENTS + patch_site 0f, patch__dtlbmiss_perf +0: lwz r10, (dtlb_miss_counter - PAGE_OFFSET)@l(0) + addi r10, r10, 1 + stw r10, (dtlb_miss_counter - PAGE_OFFSET)@l(0) + mfspr r10, SPRN_SPRG_SCRATCH2 + mfspr r11, SPRN_M_TW rfi - patch_site 0b, patch__dtlbmiss_exit_2 - -DTLBMissLinear: - mtcr r11 - rlwinm r10, r10, 20, 0x0f800000 /* 8xx supports max 256Mb RAM */ -#if defined(CONFIG_STRICT_KERNEL_RWX) && CONFIG_DATA_SHIFT < 23 - patch_site 0f, patch__dtlbmiss_romem_top8 - -0: subis r11, r10, (PAGE_OFFSET - 0x80000000)@ha - rlwinm r11, r11, 0, 0xff800000 - neg r10, r11 - or r11, r11, r10 - rlwinm r11, r11, 4, MI_PS8MEG ^ MI_PS512K - ori r11, r11, MI_PS512K | MI_SVALID - mfspr r10, SPRN_MD_EPN - rlwinm r10, r10, 0, 0x0ff80000 /* 8xx supports max 256Mb RAM */ -#else - /* Set 8M byte page and mark it valid */ - li r11, MD_PS8MEG | MD_SVALID #endif - mtspr SPRN_MD_TWC, r11 -#ifdef CONFIG_STRICT_KERNEL_RWX - patch_site 0f, patch__dtlbmiss_romem_top - -0: subis r11, r10, 0 - rlwimi r10, r11, 11, _PAGE_RO -#endif - ori r10, r10, 0xf0 | MD_SPS16K | _PAGE_SH | _PAGE_DIRTY | \ - _PAGE_PRESENT - mtspr SPRN_MD_RPN, r10 /* Update TLB entry */ - - li r11, RPN_PATTERN - mtspr SPRN_DAR, r11 /* Tag DAR */ - -0: mfspr r10, SPRN_SPRG_SCRATCH0 - mfspr r11, SPRN_SPRG_SCRATCH1 - rfi - patch_site 0b, patch__dtlbmiss_exit_3 /* This is an instruction TLB error on the MPC8xx. This could be due * to many reasons, such as executing guarded memory or illegal instruction * addresses. There is nothing to do but handle a big time error fault. */ - . = 0x1300 -InstructionTLBError: - EXCEPTION_PROLOG - mr r4,r12 + START_EXCEPTION(INTERRUPT_INST_TLB_ERROR_8xx, InstructionTLBError) + /* 0x400 is InstructionAccess exception, needed by bad_page_fault() */ + EXCEPTION_PROLOG INTERRUPT_INST_STORAGE InstructionTLBError andis. r5,r9,DSISR_SRR1_MATCH_32S@h /* Filter relevant SRR1 bits */ andis. r10,r9,SRR1_ISI_NOPT@h beq+ .Litlbie - tlbie r4 - /* 0x400 is InstructionAccess exception, needed by bad_page_fault() */ + tlbie r12 .Litlbie: - EXC_XFER_LITE(0x400, handle_page_fault) + stw r12, _DAR(r11) + stw r5, _DSISR(r11) + prepare_transfer_to_handler + bl do_page_fault + b interrupt_return /* This is the data TLB error on the MPC8xx. This could be due to * many reasons, including a dirty update to a pte. We bail out to * a higher level function that can handle it. */ - . = 0x1400 -DataTLBError: - mtspr SPRN_SPRG_SCRATCH0, r10 - mtspr SPRN_SPRG_SCRATCH1, r11 - mfcr r10 - + START_EXCEPTION(INTERRUPT_DATA_TLB_ERROR_8xx, DataTLBError) + EXCEPTION_PROLOG_0 handle_dar_dsisr=1 mfspr r11, SPRN_DAR - cmpwi cr0, r11, RPN_PATTERN - beq- FixupDAR /* must be a buggy dcbX, icbi insn. */ + cmpwi cr1, r11, RPN_PATTERN + beq- cr1, FixupDAR /* must be a buggy dcbX, icbi insn. */ DARFixed:/* Return from dcbx instruction bug workaround */ EXCEPTION_PROLOG_1 - EXCEPTION_PROLOG_2 - mfspr r5,SPRN_DSISR - stw r5,_DSISR(r11) - mfspr r4,SPRN_DAR + /* 0x300 is DataAccess exception, needed by bad_page_fault() */ + EXCEPTION_PROLOG_2 INTERRUPT_DATA_STORAGE DataTLBError handle_dar_dsisr=1 + lwz r4, _DAR(r11) + lwz r5, _DSISR(r11) andis. r10,r5,DSISR_NOHPTE@h beq+ .Ldtlbie tlbie r4 .Ldtlbie: - li r10,RPN_PATTERN - mtspr SPRN_DAR,r10 /* Tag DAR, to be used in DTLB Error */ - /* 0x300 is DataAccess exception, needed by bad_page_fault() */ - EXC_XFER_LITE(0x300, handle_page_fault) + prepare_transfer_to_handler + bl do_page_fault + b interrupt_return - EXCEPTION(0x1500, Trap_15, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1600, Trap_16, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1700, Trap_17, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1800, Trap_18, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1900, Trap_19, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1a00, Trap_1a, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1b00, Trap_1b, unknown_exception, EXC_XFER_STD) +#ifdef CONFIG_VMAP_STACK + vmap_stack_overflow_exception +#endif /* On the MPC8xx, these next four traps are used for development * support of breakpoints and such. Someday I will get around to * using them. */ - . = 0x1c00 -DataBreakpoint: - mtspr SPRN_SPRG_SCRATCH0, r10 - mtspr SPRN_SPRG_SCRATCH1, r11 - mfcr r10 + START_EXCEPTION(INTERRUPT_DATA_BREAKPOINT_8xx, DataBreakpoint) + EXCEPTION_PROLOG_0 handle_dar_dsisr=1 mfspr r11, SPRN_SRR0 - cmplwi cr0, r11, (.Ldtlbie - PAGE_OFFSET)@l + cmplwi cr1, r11, (.Ldtlbie - PAGE_OFFSET)@l cmplwi cr7, r11, (.Litlbie - PAGE_OFFSET)@l - beq- cr0, 11f - beq- cr7, 11f - EXCEPTION_PROLOG_1 - EXCEPTION_PROLOG_2 - addi r3,r1,STACK_FRAME_OVERHEAD - mfspr r4,SPRN_BAR - stw r4,_DAR(r11) - mfspr r5,SPRN_DSISR - EXC_XFER_STD(0x1c00, do_break) -11: + cror 4*cr1+eq, 4*cr1+eq, 4*cr7+eq + bne cr1, 1f mtcr r10 mfspr r10, SPRN_SPRG_SCRATCH0 mfspr r11, SPRN_SPRG_SCRATCH1 rfi +1: EXCEPTION_PROLOG_1 + EXCEPTION_PROLOG_2 INTERRUPT_DATA_BREAKPOINT_8xx DataBreakpoint handle_dar_dsisr=1 + mfspr r4,SPRN_BAR + stw r4,_DAR(r11) + prepare_transfer_to_handler + bl do_break + REST_NVGPRS(r1) + b interrupt_return + #ifdef CONFIG_PERF_EVENTS - . = 0x1d00 -InstructionBreakpoint: + START_EXCEPTION(INTERRUPT_INST_BREAKPOINT_8xx, InstructionBreakpoint) mtspr SPRN_SPRG_SCRATCH0, r10 lwz r10, (instruction_counter - PAGE_OFFSET)@l(0) addi r10, r10, -1 @@ -564,11 +386,12 @@ InstructionBreakpoint: mfspr r10, SPRN_SPRG_SCRATCH0 rfi #else - EXCEPTION(0x1d00, Trap_1d, unknown_exception, EXC_XFER_STD) + EXCEPTION(INTERRUPT_INST_BREAKPOINT_8xx, Trap_1d, unknown_exception) #endif - EXCEPTION(0x1e00, Trap_1e, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1f00, Trap_1f, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1e00, Trap_1e, unknown_exception) + EXCEPTION(0x1f00, Trap_1f, unknown_exception) + __HEAD . = 0x2000 /* This is the procedure to calculate the data EA for buggy dcbx,dcbi instructions @@ -581,28 +404,22 @@ FixupDAR:/* Entry point for dcbx workaround. */ mfspr r10, SPRN_SRR0 mtspr SPRN_MD_EPN, r10 rlwinm r11, r10, 16, 0xfff8 - cmpli cr0, r11, PAGE_OFFSET@h + cmpli cr1, r11, PAGE_OFFSET@h mfspr r11, SPRN_M_TWB /* Get level 1 table */ - blt+ 3f - rlwinm r11, r10, 16, 0xfff8 - -0: cmpli cr7, r11, (PAGE_OFFSET + 0x1800000)@h - patch_site 0b, patch__fixupdar_linmem_top + blt+ cr1, 3f /* create physical page address from effective address */ tophys(r11, r10) - blt- cr7, 201f mfspr r11, SPRN_M_TWB /* Get level 1 table */ rlwinm r11, r11, 0, 20, 31 oris r11, r11, (swapper_pg_dir - PAGE_OFFSET)@ha 3: lwz r11, (swapper_pg_dir-PAGE_OFFSET)@l(r11) /* Get the level 1 entry */ mtspr SPRN_MD_TWC, r11 - mtcr r11 + mtcrf 0x01, r11 mfspr r11, SPRN_MD_TWC lwz r11, 0(r11) /* Get the pte */ bt 28,200f /* bit 28 = Large page (8M) */ - bt 29,202f /* bit 29 = Large page (8M or 512K) */ /* concat physical page address(r11) and page offset(r10) */ rlwimi r11, r10, 0, 32 - PAGE_SHIFT, 31 201: lwz r11,0(r11) @@ -611,16 +428,16 @@ FixupDAR:/* Entry point for dcbx workaround. */ * no need to include them here */ xoris r10, r11, 0x7c00 /* check if major OP code is 31 */ rlwinm r10, r10, 0, 21, 5 - cmpwi cr0, r10, 2028 /* Is dcbz? */ - beq+ 142f - cmpwi cr0, r10, 940 /* Is dcbi? */ - beq+ 142f - cmpwi cr0, r10, 108 /* Is dcbst? */ - beq+ 144f /* Fix up store bit! */ - cmpwi cr0, r10, 172 /* Is dcbf? */ - beq+ 142f - cmpwi cr0, r10, 1964 /* Is icbi? */ - beq+ 142f + cmpwi cr1, r10, 2028 /* Is dcbz? */ + beq+ cr1, 142f + cmpwi cr1, r10, 940 /* Is dcbi? */ + beq+ cr1, 142f + cmpwi cr1, r10, 108 /* Is dcbst? */ + beq+ cr1, 144f /* Fix up store bit! */ + cmpwi cr1, r10, 172 /* Is dcbf? */ + beq+ cr1, 142f + cmpwi cr1, r10, 1964 /* Is icbi? */ + beq+ cr1, 142f 141: mfspr r10,SPRN_M_TW b DARFixed /* Nope, go back to normal TLB processing */ @@ -629,11 +446,6 @@ FixupDAR:/* Entry point for dcbx workaround. */ rlwimi r11, r10, 0, 32 - PAGE_SHIFT_8M, 31 b 201b -202: - /* concat physical page address(r11) and page offset(r10) */ - rlwimi r11, r10, 0, 32 - PAGE_SHIFT_512K, 31 - b 201b - 144: mfspr r10, SPRN_DSISR rlwinm r10, r10,0,7,5 /* Clear store bit for buggy dcbst insn */ mtspr SPRN_DSISR, r10 @@ -679,8 +491,9 @@ FixupDAR:/* Entry point for dcbx workaround. */ add r10, r10, r30 ;b 151f add r10, r10, r31 151: - rlwinm. r11,r11,19,24,28 /* offset into jump table for reg RA */ - beq 152f /* if reg RA is zero, don't add it */ + rlwinm r11,r11,19,24,28 /* offset into jump table for reg RA */ + cmpwi cr1, r11, 0 + beq cr1, 152f /* if reg RA is zero, don't add it */ addi r11, r11, 150b@l /* add start of table */ mtctr r11 /* load ctr with jump address */ rlwinm r11,r11,0,16,10 /* make sure we don't execute this more than once */ @@ -688,7 +501,10 @@ FixupDAR:/* Entry point for dcbx workaround. */ 152: mfdar r11 mtctr r11 /* restore ctr reg from DAR */ - mtdar r10 /* save fault EA to DAR */ + mfspr r11, SPRN_SPRG_THREAD + stw r10, DAR(r11) + mfspr r10, SPRN_DSISR + stw r10, DSISR(r11) mfspr r10,SPRN_M_TW b DARFixed /* Go back to normal TLB handling */ @@ -722,7 +538,7 @@ start_here: ori r0, r0, STACK_END_MAGIC@l stw r0, 0(r1) li r0,0 - stwu r0,THREAD_SIZE-STACK_FRAME_OVERHEAD(r1) + stwu r0,THREAD_SIZE-STACK_FRAME_MIN_SIZE(r1) lis r6, swapper_pg_dir@ha tophys(r6,r6) @@ -760,6 +576,27 @@ start_here: rfi /* Load up the kernel context */ 2: +#ifdef CONFIG_PIN_TLB_IMMR + lis r0, MD_TWAM@h + oris r0, r0, 0x1f00 + mtspr SPRN_MD_CTR, r0 + LOAD_REG_IMMEDIATE(r0, VIRT_IMMR_BASE | MD_EVALID) + tlbie r0 + mtspr SPRN_MD_EPN, r0 + LOAD_REG_IMMEDIATE(r0, MD_SVALID | MD_PS512K | MD_GUARDED) + mtspr SPRN_MD_TWC, r0 + mfspr r0, SPRN_IMMR + rlwinm r0, r0, 0, 0xfff80000 + ori r0, r0, 0xf0 | _PAGE_DIRTY | _PAGE_SPS | _PAGE_SH | \ + _PAGE_NO_CACHE | _PAGE_PRESENT + mtspr SPRN_MD_RPN, r0 + lis r0, (MD_TWAM | MD_RSV4I)@h + mtspr SPRN_MD_CTR, r0 +#endif +#if !defined(CONFIG_PIN_TLB_DATA) && !defined(CONFIG_PIN_TLB_IMMR) + lis r0, MD_TWAM@h + mtspr SPRN_MD_CTR, r0 +#endif tlbia /* Clear all TLB entries */ sync /* wait for tlbia/tlbie to finish */ @@ -789,20 +626,13 @@ start_here: * 24 Mbytes of data, and the 512k IMMR space. Anything not covered by * these mappings is mapped by page tables. */ -initial_mmu: +SYM_FUNC_START_LOCAL(initial_mmu) li r8, 0 mtspr SPRN_MI_CTR, r8 /* remove PINNED ITLB entries */ - lis r10, MD_RESETVAL@h -#ifndef CONFIG_8xx_COPYBACK - oris r10, r10, MD_WTDEF@h -#endif + lis r10, MD_TWAM@h mtspr SPRN_MD_CTR, r10 /* remove PINNED DTLB entries */ tlbia /* Invalidate all TLB entries */ -#ifdef CONFIG_PIN_TLB_DATA - oris r10, r10, MD_RSV4I@h - mtspr SPRN_MD_CTR, r10 /* Set data TLB control */ -#endif lis r8, MI_APG_INIT@h /* Set protection modes */ ori r8, r8, MI_APG_INIT@l @@ -811,55 +641,32 @@ initial_mmu: ori r8, r8, MD_APG_INIT@l mtspr SPRN_MD_AP, r8 - /* Map a 512k page for the IMMR to get the processor - * internal registers (among other things). - */ -#ifdef CONFIG_PIN_TLB_IMMR - oris r10, r10, MD_RSV4I@h - ori r10, r10, 0x1c00 - mtspr SPRN_MD_CTR, r10 - - mfspr r9, 638 /* Get current IMMR */ - andis. r9, r9, 0xfff8 /* Get 512 kbytes boundary */ - - lis r8, VIRT_IMMR_BASE@h /* Create vaddr for TLB */ - ori r8, r8, MD_EVALID /* Mark it valid */ - mtspr SPRN_MD_EPN, r8 - li r8, MD_PS512K | MD_GUARDED /* Set 512k byte page */ - ori r8, r8, MD_SVALID /* Make it valid */ - mtspr SPRN_MD_TWC, r8 - mr r8, r9 /* Create paddr for TLB */ - ori r8, r8, MI_BOOTINIT|0x2 /* Inhibit cache -- Cort */ - mtspr SPRN_MD_RPN, r8 -#endif - - /* Now map the lower RAM (up to 32 Mbytes) into the ITLB. */ -#ifdef CONFIG_PIN_TLB_TEXT + /* Map the lower RAM (up to 32 Mbytes) into the ITLB and DTLB */ lis r8, MI_RSV4I@h ori r8, r8, 0x1c00 -#endif + oris r12, r10, MD_RSV4I@h + ori r12, r12, 0x1c00 li r9, 4 /* up to 4 pages of 8M */ mtctr r9 lis r9, KERNELBASE@h /* Create vaddr for TLB */ - li r10, MI_PS8MEG | MI_SVALID /* Set 8M byte page */ + li r10, MI_PS8MEG | _PMD_ACCESSED | MI_SVALID li r11, MI_BOOTINIT /* Create RPN for address 0 */ - lis r12, _einittext@h - ori r12, r12, _einittext@l 1: -#ifdef CONFIG_PIN_TLB_TEXT mtspr SPRN_MI_CTR, r8 /* Set instruction MMU control */ addi r8, r8, 0x100 -#endif - ori r0, r9, MI_EVALID /* Mark it valid */ mtspr SPRN_MI_EPN, r0 mtspr SPRN_MI_TWC, r10 mtspr SPRN_MI_RPN, r11 /* Store TLB entry */ + mtspr SPRN_MD_CTR, r12 + addi r12, r12, 0x100 + mtspr SPRN_MD_EPN, r0 + mtspr SPRN_MD_TWC, r10 + mtspr SPRN_MD_RPN, r11 addis r9, r9, 0x80 addis r11, r11, 0x80 - cmpl cr0, r9, r12 - bdnzf gt, 1b + bdnz 1b /* Since the cache is enabled according to the information we * just loaded into the TLB, invalidate and enable the caches here. @@ -870,17 +677,7 @@ initial_mmu: mtspr SPRN_DC_CST, r8 lis r8, IDC_ENABLE@h mtspr SPRN_IC_CST, r8 -#ifdef CONFIG_8xx_COPYBACK - mtspr SPRN_DC_CST, r8 -#else - /* For a debug option, I left this here to easily enable - * the write through cache mode - */ - lis r8, DC_SFWT@h - mtspr SPRN_DC_CST, r8 - lis r8, IDC_ENABLE@h mtspr SPRN_DC_CST, r8 -#endif /* Disable debug mode entry on breakpoints */ mfspr r8, SPRN_DER #ifdef CONFIG_PERF_EVENTS @@ -890,29 +687,105 @@ initial_mmu: #endif mtspr SPRN_DER, r8 blr - - -/* - * We put a few things here that have to be page-aligned. - * This stuff goes at the beginning of the data segment, - * which is page-aligned. - */ - .data - .globl sdata -sdata: - .globl empty_zero_page - .align PAGE_SHIFT -empty_zero_page: - .space PAGE_SIZE -EXPORT_SYMBOL(empty_zero_page) - - .globl swapper_pg_dir -swapper_pg_dir: - .space PGD_TABLE_SIZE - -/* Room for two PTE table poiners, usually the kernel and current user - * pointer to their respective root page table (pgdir). - */ - .globl abatron_pteptrs -abatron_pteptrs: - .space 8 +SYM_FUNC_END(initial_mmu) + +_GLOBAL(mmu_pin_tlb) + lis r9, (1f - PAGE_OFFSET)@h + ori r9, r9, (1f - PAGE_OFFSET)@l + mfmsr r10 + mflr r11 + li r12, MSR_KERNEL & ~(MSR_IR | MSR_DR | MSR_RI) + rlwinm r0, r10, 0, ~MSR_RI + rlwinm r0, r0, 0, ~MSR_EE + mtmsr r0 + isync + .align 4 + mtspr SPRN_SRR0, r9 + mtspr SPRN_SRR1, r12 + rfi +1: + li r5, 0 + lis r6, MD_TWAM@h + mtspr SPRN_MI_CTR, r5 + mtspr SPRN_MD_CTR, r6 + tlbia + + LOAD_REG_IMMEDIATE(r5, 28 << 8) + LOAD_REG_IMMEDIATE(r6, PAGE_OFFSET) + LOAD_REG_IMMEDIATE(r7, MI_SVALID | MI_PS8MEG | _PMD_ACCESSED) + LOAD_REG_IMMEDIATE(r8, 0xf0 | _PAGE_RO | _PAGE_SPS | _PAGE_SH | _PAGE_PRESENT) + LOAD_REG_ADDR(r9, _sinittext) + li r0, 4 + mtctr r0 + +2: ori r0, r6, MI_EVALID + mtspr SPRN_MI_CTR, r5 + mtspr SPRN_MI_EPN, r0 + mtspr SPRN_MI_TWC, r7 + mtspr SPRN_MI_RPN, r8 + addi r5, r5, 0x100 + addis r6, r6, SZ_8M@h + addis r8, r8, SZ_8M@h + cmplw r6, r9 + bdnzt lt, 2b + lis r0, MI_RSV4I@h + mtspr SPRN_MI_CTR, r0 + + LOAD_REG_IMMEDIATE(r5, 28 << 8 | MD_TWAM) +#ifdef CONFIG_PIN_TLB_DATA + LOAD_REG_IMMEDIATE(r6, PAGE_OFFSET) + LOAD_REG_IMMEDIATE(r7, MI_SVALID | MI_PS8MEG | _PMD_ACCESSED) + li r8, 0 +#ifdef CONFIG_PIN_TLB_IMMR + li r0, 3 +#else + li r0, 4 +#endif + mtctr r0 + cmpwi r4, 0 + beq 4f + LOAD_REG_ADDR(r9, _sinittext) + +2: ori r0, r6, MD_EVALID + ori r12, r8, 0xf0 | _PAGE_RO | _PAGE_SPS | _PAGE_SH | _PAGE_PRESENT + mtspr SPRN_MD_CTR, r5 + mtspr SPRN_MD_EPN, r0 + mtspr SPRN_MD_TWC, r7 + mtspr SPRN_MD_RPN, r12 + addi r5, r5, 0x100 + addis r6, r6, SZ_8M@h + addis r8, r8, SZ_8M@h + cmplw r6, r9 + bdnzt lt, 2b +4: +2: ori r0, r6, MD_EVALID + ori r12, r8, 0xf0 | _PAGE_DIRTY | _PAGE_SPS | _PAGE_SH | _PAGE_PRESENT + mtspr SPRN_MD_CTR, r5 + mtspr SPRN_MD_EPN, r0 + mtspr SPRN_MD_TWC, r7 + mtspr SPRN_MD_RPN, r12 + addi r5, r5, 0x100 + addis r6, r6, SZ_8M@h + addis r8, r8, SZ_8M@h + cmplw r6, r3 + bdnzt lt, 2b +#endif +#ifdef CONFIG_PIN_TLB_IMMR + LOAD_REG_IMMEDIATE(r0, VIRT_IMMR_BASE | MD_EVALID) + LOAD_REG_IMMEDIATE(r7, MD_SVALID | MD_PS512K | MD_GUARDED | _PMD_ACCESSED) + mfspr r8, SPRN_IMMR + rlwinm r8, r8, 0, 0xfff80000 + ori r8, r8, 0xf0 | _PAGE_DIRTY | _PAGE_SPS | _PAGE_SH | \ + _PAGE_NO_CACHE | _PAGE_PRESENT + mtspr SPRN_MD_CTR, r5 + mtspr SPRN_MD_EPN, r0 + mtspr SPRN_MD_TWC, r7 + mtspr SPRN_MD_RPN, r8 +#endif +#if defined(CONFIG_PIN_TLB_IMMR) || defined(CONFIG_PIN_TLB_DATA) + lis r0, (MD_RSV4I | MD_TWAM)@h + mtspr SPRN_MD_CTR, r0 +#endif + mtspr SPRN_SRR1, r10 + mtspr SPRN_SRR0, r11 + rfi diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_book3s_32.S index 4a24f8f026c7..c1d89764dd22 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_book3s_32.S @@ -17,10 +17,12 @@ */ #include <linux/init.h> +#include <linux/pgtable.h> +#include <linux/linkage.h> + #include <asm/reg.h> #include <asm/page.h> #include <asm/mmu.h> -#include <asm/pgtable.h> #include <asm/cputable.h> #include <asm/cache.h> #include <asm/thread_info.h> @@ -29,21 +31,11 @@ #include <asm/ptrace.h> #include <asm/bug.h> #include <asm/kvm_book3s_asm.h> -#include <asm/export.h> #include <asm/feature-fixups.h> +#include <asm/interrupt.h> #include "head_32.h" -/* 601 only have IBAT */ -#ifdef CONFIG_PPC_BOOK3S_601 -#define LOAD_BAT(n, reg, RA, RB) \ - li RA,0; \ - mtspr SPRN_IBAT##n##U,RA; \ - lwz RA,(n*16)+0(reg); \ - lwz RB,(n*16)+4(reg); \ - mtspr SPRN_IBAT##n##U,RA; \ - mtspr SPRN_IBAT##n##L,RB -#else #define LOAD_BAT(n, reg, RA, RB) \ /* see the comment for clear_bats() -- Cort */ \ li RA,0; \ @@ -57,19 +49,15 @@ lwz RB,(n*16)+12(reg); \ mtspr SPRN_DBAT##n##U,RA; \ mtspr SPRN_DBAT##n##L,RB -#endif __HEAD - .stabs "arch/powerpc/kernel/",N_SO,0,0,0f - .stabs "head_32.S",N_SO,0,0,0f -0: -_ENTRY(_stext); +_GLOBAL(_stext); /* * _start is defined this way because the XCOFF loader in the OpenFirmware * on the powermac expects the entry point to be a procedure descriptor. */ -_ENTRY(_start); +_GLOBAL(_start); /* * These are here for legacy reasons, the kernel used to * need to look like a coff function entry for the pmac @@ -166,9 +154,8 @@ __after_mmu_off: bl initial_bats bl load_segment_registers -#ifdef CONFIG_KASAN + bl reloc_offset bl early_hash_table -#endif #if defined(CONFIG_BOOTX_TEXT) bl setup_disp_bat #endif @@ -185,10 +172,8 @@ __after_mmu_off: bl reloc_offset li r24,0 /* cpu# */ bl call_setup_cpu /* Call setup_cpu for this CPU */ -#ifdef CONFIG_PPC_BOOK3S_32 bl reloc_offset bl init_idle_6xx -#endif /* CONFIG_PPC_BOOK3S_32 */ /* @@ -219,8 +204,7 @@ turn_on_mmu: lis r0,start_here@h ori r0,r0,start_here@l mtspr SPRN_SRR0,r0 - SYNC - RFI /* enables MMU */ + rfi /* enables MMU */ /* * We need __secondary_hold as a place to hold the other cpus on @@ -253,8 +237,8 @@ __secondary_hold_acknowledge: /* System reset */ /* core99 pmac starts the seconary here by changing the vector, and - putting it back to what it was (unknown_exception) when done. */ - EXCEPTION(0x100, Reset, unknown_exception, EXC_XFER_STD) + putting it back to what it was (unknown_async_exception) when done. */ + EXCEPTION(INTERRUPT_SYSTEM_RESET, Reset, unknown_async_exception) /* Machine check */ /* @@ -270,89 +254,111 @@ __secondary_hold_acknowledge: * pointer when we take an exception from supervisor mode.) * -- paulus. */ - . = 0x200 - DO_KVM 0x200 - mtspr SPRN_SPRG_SCRATCH0,r10 - mtspr SPRN_SPRG_SCRATCH1,r11 - mfcr r10 + START_EXCEPTION(INTERRUPT_MACHINE_CHECK, MachineCheck) + EXCEPTION_PROLOG_0 #ifdef CONFIG_PPC_CHRP - mfspr r11, SPRN_SPRG_THREAD - lwz r11, RTAS_SP(r11) - cmpwi cr1, r11, 0 + mtspr SPRN_SPRG_SCRATCH2,r1 + mfspr r1, SPRN_SPRG_THREAD + lwz r1, RTAS_SP(r1) + cmpwi cr1, r1, 0 bne cr1, 7f + mfspr r1, SPRN_SPRG_SCRATCH2 #endif /* CONFIG_PPC_CHRP */ EXCEPTION_PROLOG_1 -7: EXCEPTION_PROLOG_2 - addi r3,r1,STACK_FRAME_OVERHEAD +7: EXCEPTION_PROLOG_2 0x200 MachineCheck #ifdef CONFIG_PPC_CHRP - bne cr1,1f -#endif - EXC_XFER_STD(0x200, machine_check_exception) -#ifdef CONFIG_PPC_CHRP -1: b machine_check_in_rtas + beq cr1, 1f + twi 31, 0, 0 #endif +1: prepare_transfer_to_handler + bl machine_check_exception + b interrupt_return /* Data access exception. */ - . = 0x300 - DO_KVM 0x300 -DataAccess: - EXCEPTION_PROLOG - mfspr r10,SPRN_DSISR - stw r10,_DSISR(r11) -#ifdef CONFIG_PPC_KUAP - andis. r0,r10,(DSISR_BAD_FAULT_32S | DSISR_DABRMATCH | DSISR_PROTFAULT)@h -#else - andis. r0,r10,(DSISR_BAD_FAULT_32S|DSISR_DABRMATCH)@h -#endif - bne 1f /* if not, try to put a PTE */ - mfspr r4,SPRN_DAR /* into the hash table */ - rlwinm r3,r10,32-15,21,21 /* DSISR_STORE -> _PAGE_RW */ + START_EXCEPTION(INTERRUPT_DATA_STORAGE, DataAccess) +#ifdef CONFIG_PPC_BOOK3S_604 BEGIN_MMU_FTR_SECTION - bl hash_page -END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE) -1: lwz r5,_DSISR(r11) /* get DSISR value */ - mfspr r4,SPRN_DAR - EXC_XFER_LITE(0x300, handle_page_fault) + mtspr SPRN_SPRG_SCRATCH2,r10 + mfspr r10, SPRN_SPRG_THREAD + stw r11, THR11(r10) + mfspr r10, SPRN_DSISR + mfcr r11 + andis. r10, r10, (DSISR_BAD_FAULT_32S | DSISR_DABRMATCH)@h + mfspr r10, SPRN_SPRG_THREAD + beq hash_page_dsi +.Lhash_page_dsi_cont: + mtcr r11 + lwz r11, THR11(r10) + mfspr r10, SPRN_SPRG_SCRATCH2 +MMU_FTR_SECTION_ELSE + b 1f +ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_HPTE_TABLE) +#endif +1: EXCEPTION_PROLOG_0 handle_dar_dsisr=1 + EXCEPTION_PROLOG_1 + EXCEPTION_PROLOG_2 INTERRUPT_DATA_STORAGE DataAccess handle_dar_dsisr=1 + prepare_transfer_to_handler + lwz r5, _DSISR(r1) + andis. r0, r5, DSISR_DABRMATCH@h + bne- 1f + bl do_page_fault + b interrupt_return +1: bl do_break + REST_NVGPRS(r1) + b interrupt_return /* Instruction access exception. */ - . = 0x400 - DO_KVM 0x400 -InstructionAccess: - EXCEPTION_PROLOG - andis. r0,r9,SRR1_ISI_NOPT@h /* no pte found? */ - beq 1f /* if so, try to put a PTE */ - li r3,0 /* into the hash table */ - mr r4,r12 /* SRR0 is fault address */ + START_EXCEPTION(INTERRUPT_INST_STORAGE, InstructionAccess) + mtspr SPRN_SPRG_SCRATCH0,r10 + mtspr SPRN_SPRG_SCRATCH1,r11 + mfspr r10, SPRN_SPRG_THREAD + mfspr r11, SPRN_SRR0 + stw r11, SRR0(r10) + mfspr r11, SPRN_SRR1 /* check whether user or kernel */ + stw r11, SRR1(r10) + mfcr r10 +#ifdef CONFIG_PPC_BOOK3S_604 BEGIN_MMU_FTR_SECTION - bl hash_page + andis. r11, r11, SRR1_ISI_NOPT@h /* no pte found? */ + bne hash_page_isi +.Lhash_page_isi_cont: + mfspr r11, SPRN_SRR1 /* check whether user or kernel */ END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE) -1: mr r4,r12 +#endif + andi. r11, r11, MSR_PR + + EXCEPTION_PROLOG_1 + EXCEPTION_PROLOG_2 INTERRUPT_INST_STORAGE InstructionAccess andis. r5,r9,DSISR_SRR1_MATCH_32S@h /* Filter relevant SRR1 bits */ - EXC_XFER_LITE(0x400, handle_page_fault) + stw r5, _DSISR(r11) + stw r12, _DAR(r11) + prepare_transfer_to_handler + bl do_page_fault + b interrupt_return /* External interrupt */ - EXCEPTION(0x500, HardwareInterrupt, do_IRQ, EXC_XFER_LITE) + EXCEPTION(INTERRUPT_EXTERNAL, HardwareInterrupt, do_IRQ) /* Alignment exception */ - . = 0x600 - DO_KVM 0x600 -Alignment: - EXCEPTION_PROLOG - mfspr r4,SPRN_DAR - stw r4,_DAR(r11) - mfspr r5,SPRN_DSISR - stw r5,_DSISR(r11) - addi r3,r1,STACK_FRAME_OVERHEAD - EXC_XFER_STD(0x600, alignment_exception) + START_EXCEPTION(INTERRUPT_ALIGNMENT, Alignment) + EXCEPTION_PROLOG INTERRUPT_ALIGNMENT Alignment handle_dar_dsisr=1 + prepare_transfer_to_handler + bl alignment_exception + REST_NVGPRS(r1) + b interrupt_return /* Program check exception */ - EXCEPTION(0x700, ProgramCheck, program_check_exception, EXC_XFER_STD) + START_EXCEPTION(INTERRUPT_PROGRAM, ProgramCheck) + EXCEPTION_PROLOG INTERRUPT_PROGRAM ProgramCheck + prepare_transfer_to_handler + bl program_check_exception + REST_NVGPRS(r1) + b interrupt_return /* Floating-point unavailable */ - . = 0x800 - DO_KVM 0x800 -FPUnavailable: + START_EXCEPTION(0x800, FPUnavailable) +#ifdef CONFIG_PPC_FPU BEGIN_FTR_SECTION /* * Certain Freescale cores don't have a FPU and treat fp instructions @@ -360,28 +366,29 @@ BEGIN_FTR_SECTION */ b ProgramCheck END_FTR_SECTION_IFSET(CPU_FTR_FPU_UNAVAILABLE) - EXCEPTION_PROLOG + EXCEPTION_PROLOG INTERRUPT_FP_UNAVAIL FPUnavailable beq 1f bl load_up_fpu /* if from user, just load it up */ b fast_exception_return -1: addi r3,r1,STACK_FRAME_OVERHEAD - EXC_XFER_LITE(0x800, kernel_fp_unavailable_exception) +1: prepare_transfer_to_handler + bl kernel_fp_unavailable_exception + b interrupt_return +#else + b ProgramCheck +#endif /* Decrementer */ - EXCEPTION(0x900, Decrementer, timer_interrupt, EXC_XFER_LITE) + EXCEPTION(INTERRUPT_DECREMENTER, Decrementer, timer_interrupt) - EXCEPTION(0xa00, Trap_0a, unknown_exception, EXC_XFER_STD) - EXCEPTION(0xb00, Trap_0b, unknown_exception, EXC_XFER_STD) + EXCEPTION(0xa00, Trap_0a, unknown_exception) + EXCEPTION(0xb00, Trap_0b, unknown_exception) /* System call */ - . = 0xc00 - DO_KVM 0xc00 -SystemCall: - SYSCALL_ENTRY 0xc00 + START_EXCEPTION(INTERRUPT_SYSCALL, SystemCall) + SYSCALL_ENTRY INTERRUPT_SYSCALL -/* Single step - not used on 601 */ - EXCEPTION(0xd00, SingleStep, single_step_exception, EXC_XFER_STD) - EXCEPTION(0xe00, Trap_0e, unknown_exception, EXC_XFER_STD) + EXCEPTION(INTERRUPT_TRACE, SingleStep, single_step_exception) + EXCEPTION(0xe00, Trap_0e, unknown_exception) /* * The Altivec unavailable trap is at 0x0f20. Foo. @@ -391,41 +398,39 @@ SystemCall: * non-altivec kernel running on a machine with altivec just * by executing an altivec instruction. */ - . = 0xf00 - DO_KVM 0xf00 + START_EXCEPTION(INTERRUPT_PERFMON, PerformanceMonitorTrap) b PerformanceMonitor - . = 0xf20 - DO_KVM 0xf20 + START_EXCEPTION(INTERRUPT_ALTIVEC_UNAVAIL, AltiVecUnavailableTrap) b AltiVecUnavailable + __HEAD /* * Handle TLB miss for instruction on 603/603e. * Note: we get an alternate set of r0 - r3 to use automatically. */ - . = 0x1000 + . = INTERRUPT_INST_TLB_MISS_603 InstructionTLBMiss: /* - * r0: scratch + * r0: userspace flag (later scratch) * r1: linux style pte ( later becomes ppc hardware pte ) * r2: ptr to linux-style pte - * r3: scratch + * r3: fault address */ /* Get PTE (linux-style) and check access */ mfspr r3,SPRN_IMISS -#if defined(CONFIG_MODULES) || defined(CONFIG_DEBUG_PAGEALLOC) - lis r1,PAGE_OFFSET@h /* check if kernel address */ +#ifdef CONFIG_MODULES + lis r1, TASK_SIZE@h /* check if kernel address */ cmplw 0,r1,r3 #endif - mfspr r2, SPRN_SPRG_PGDIR -#ifdef CONFIG_SWAP + mfspr r2, SPRN_SDR1 li r1,_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_EXEC -#else - li r1,_PAGE_PRESENT | _PAGE_EXEC -#endif -#if defined(CONFIG_MODULES) || defined(CONFIG_DEBUG_PAGEALLOC) - bge- 112f + rlwinm r2, r2, 28, 0xfffff000 +#ifdef CONFIG_MODULES + li r0, 3 + bgt- 112f lis r2, (swapper_pg_dir - PAGE_OFFSET)@ha /* if kernel address, use */ + li r0, 0 addi r2, r2, (swapper_pg_dir - PAGE_OFFSET)@l /* kernel page table */ #endif 112: rlwimi r2,r3,12,20,29 /* insert top 10 bits of address */ @@ -433,13 +438,15 @@ InstructionTLBMiss: rlwinm. r2,r2,0,0,19 /* extract address of pte page */ beq- InstructionAddressInvalid /* return if no mapping */ rlwimi r2,r3,22,20,29 /* insert next 10 bits of address */ - lwz r0,0(r2) /* get linux-style pte */ - andc. r1,r1,r0 /* check access & ~permission */ + lwz r2,0(r2) /* get linux-style pte */ + andc. r1,r1,r2 /* check access & ~permission */ bne- InstructionAddressInvalid /* return if access not permitted */ /* Convert linux-style PTE to low word of PPC-style PTE */ - rlwimi r0,r0,32-2,31,31 /* _PAGE_USER -> PP lsb */ +#ifdef CONFIG_MODULES + rlwimi r2, r0, 0, 31, 31 /* userspace ? -> PP lsb */ +#endif ori r1, r1, 0xe06 /* clear out reserved bits */ - andc r1, r0, r1 /* PP = user? 1 : 0 */ + andc r1, r2, r1 /* PP = user? 1 : 0 */ BEGIN_FTR_SECTION rlwinm r1,r1,0,~_PAGE_COHERENT /* clear M (coherence not required) */ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) @@ -471,51 +478,45 @@ InstructionAddressInvalid: /* * Handle TLB miss for DATA Load operation on 603/603e */ - . = 0x1100 + . = INTERRUPT_DATA_LOAD_TLB_MISS_603 DataLoadTLBMiss: /* - * r0: scratch + * r0: userspace flag (later scratch) * r1: linux style pte ( later becomes ppc hardware pte ) * r2: ptr to linux-style pte - * r3: scratch + * r3: fault address */ /* Get PTE (linux-style) and check access */ mfspr r3,SPRN_DMISS - lis r1,PAGE_OFFSET@h /* check if kernel address */ + lis r1, TASK_SIZE@h /* check if kernel address */ cmplw 0,r1,r3 - mfspr r2, SPRN_SPRG_PGDIR -#ifdef CONFIG_SWAP - li r1, _PAGE_PRESENT | _PAGE_ACCESSED -#else - li r1, _PAGE_PRESENT -#endif - bge- 112f + mfspr r2, SPRN_SDR1 + li r1, _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_READ + rlwinm r2, r2, 28, 0xfffff000 + li r0, 3 + bgt- 112f lis r2, (swapper_pg_dir - PAGE_OFFSET)@ha /* if kernel address, use */ + li r0, 0 addi r2, r2, (swapper_pg_dir - PAGE_OFFSET)@l /* kernel page table */ 112: rlwimi r2,r3,12,20,29 /* insert top 10 bits of address */ lwz r2,0(r2) /* get pmd entry */ rlwinm. r2,r2,0,0,19 /* extract address of pte page */ beq- DataAddressInvalid /* return if no mapping */ rlwimi r2,r3,22,20,29 /* insert next 10 bits of address */ - lwz r0,0(r2) /* get linux-style pte */ - andc. r1,r1,r0 /* check access & ~permission */ + lwz r2,0(r2) /* get linux-style pte */ + andc. r1,r1,r2 /* check access & ~permission */ bne- DataAddressInvalid /* return if access not permitted */ - /* - * NOTE! We are assuming this is not an SMP system, otherwise - * we would need to update the pte atomically with lwarx/stwcx. - */ /* Convert linux-style PTE to low word of PPC-style PTE */ - rlwinm r1,r0,32-9,30,30 /* _PAGE_RW -> PP msb */ - rlwimi r0,r0,32-1,30,30 /* _PAGE_USER -> PP msb */ - rlwimi r0,r0,32-1,31,31 /* _PAGE_USER -> PP lsb */ + rlwinm r1,r2,32-9,30,30 /* _PAGE_WRITE -> PP msb */ + rlwimi r2,r0,0,30,31 /* userspace ? -> PP */ + rlwimi r1,r2,32-3,24,24 /* _PAGE_WRITE -> _PAGE_DIRTY */ + xori r1,r1,_PAGE_DIRTY /* clear dirty when not rw */ ori r1,r1,0xe04 /* clear out reserved bits */ - andc r1,r0,r1 /* PP = user? rw? 1: 3: 0 */ + andc r1,r2,r1 /* PP = user? rw? 1: 3: 0 */ BEGIN_FTR_SECTION rlwinm r1,r1,0,~_PAGE_COHERENT /* clear M (coherence not required) */ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) mtspr SPRN_RPA,r1 - mfspr r2,SPRN_SRR1 /* Need to restore CR0 */ - mtcrf 0x80,r2 BEGIN_MMU_FTR_SECTION li r0,1 mfspr r1,SPRN_SPRG_603_LRU @@ -527,9 +528,15 @@ BEGIN_MMU_FTR_SECTION mfspr r2,SPRN_SRR1 rlwimi r2,r0,31-14,14,14 mtspr SPRN_SRR1,r2 -END_MMU_FTR_SECTION_IFSET(MMU_FTR_NEED_DTLB_SW_LRU) + mtcrf 0x80,r2 + tlbld r3 + rfi +MMU_FTR_SECTION_ELSE + mfspr r2,SPRN_SRR1 /* Need to restore CR0 */ + mtcrf 0x80,r2 tlbld r3 rfi +ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_NEED_DTLB_SW_LRU) DataAddressInvalid: mfspr r3,SPRN_SRR1 rlwinm r1,r3,9,6,6 /* Get load/store bit */ @@ -551,43 +558,38 @@ DataAddressInvalid: /* * Handle TLB miss for DATA Store on 603/603e */ - . = 0x1200 + . = INTERRUPT_DATA_STORE_TLB_MISS_603 DataStoreTLBMiss: /* - * r0: scratch + * r0: userspace flag (later scratch) * r1: linux style pte ( later becomes ppc hardware pte ) * r2: ptr to linux-style pte - * r3: scratch + * r3: fault address */ /* Get PTE (linux-style) and check access */ mfspr r3,SPRN_DMISS - lis r1,PAGE_OFFSET@h /* check if kernel address */ + lis r1, TASK_SIZE@h /* check if kernel address */ cmplw 0,r1,r3 - mfspr r2, SPRN_SPRG_PGDIR -#ifdef CONFIG_SWAP + mfspr r2, SPRN_SDR1 li r1, _PAGE_RW | _PAGE_DIRTY | _PAGE_PRESENT | _PAGE_ACCESSED -#else - li r1, _PAGE_RW | _PAGE_DIRTY | _PAGE_PRESENT -#endif - bge- 112f + rlwinm r2, r2, 28, 0xfffff000 + li r0, 3 + bgt- 112f lis r2, (swapper_pg_dir - PAGE_OFFSET)@ha /* if kernel address, use */ + li r0, 0 addi r2, r2, (swapper_pg_dir - PAGE_OFFSET)@l /* kernel page table */ 112: rlwimi r2,r3,12,20,29 /* insert top 10 bits of address */ lwz r2,0(r2) /* get pmd entry */ rlwinm. r2,r2,0,0,19 /* extract address of pte page */ beq- DataAddressInvalid /* return if no mapping */ rlwimi r2,r3,22,20,29 /* insert next 10 bits of address */ - lwz r0,0(r2) /* get linux-style pte */ - andc. r1,r1,r0 /* check access & ~permission */ + lwz r2,0(r2) /* get linux-style pte */ + andc. r1,r1,r2 /* check access & ~permission */ bne- DataAddressInvalid /* return if access not permitted */ - /* - * NOTE! We are assuming this is not an SMP system, otherwise - * we would need to update the pte atomically with lwarx/stwcx. - */ /* Convert linux-style PTE to low word of PPC-style PTE */ - rlwimi r0,r0,32-2,31,31 /* _PAGE_USER -> PP lsb */ + rlwimi r2,r0,0,31,31 /* userspace ? -> PP lsb */ li r1,0xe06 /* clear out reserved bits & PP msb */ - andc r1,r0,r1 /* PP = user? 1: 0 */ + andc r1,r2,r1 /* PP = user? 1: 0 */ BEGIN_FTR_SECTION rlwinm r1,r1,0,~_PAGE_COHERENT /* clear M (coherence not required) */ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) @@ -605,70 +607,162 @@ BEGIN_MMU_FTR_SECTION mfspr r2,SPRN_SRR1 rlwimi r2,r0,31-14,14,14 mtspr SPRN_SRR1,r2 -END_MMU_FTR_SECTION_IFSET(MMU_FTR_NEED_DTLB_SW_LRU) + mtcrf 0x80,r2 + tlbld r3 + rfi +MMU_FTR_SECTION_ELSE + mfspr r2,SPRN_SRR1 /* Need to restore CR0 */ + mtcrf 0x80,r2 tlbld r3 rfi +ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_NEED_DTLB_SW_LRU) #ifndef CONFIG_ALTIVEC #define altivec_assist_exception unknown_exception #endif - EXCEPTION(0x1300, Trap_13, instruction_breakpoint_exception, EXC_XFER_STD) - EXCEPTION(0x1400, SMI, SMIException, EXC_XFER_STD) - EXCEPTION(0x1500, Trap_15, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1600, Trap_16, altivec_assist_exception, EXC_XFER_STD) - EXCEPTION(0x1700, Trap_17, TAUException, EXC_XFER_STD) - EXCEPTION(0x1800, Trap_18, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1900, Trap_19, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1a00, Trap_1a, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1b00, Trap_1b, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1c00, Trap_1c, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1d00, Trap_1d, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1e00, Trap_1e, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1f00, Trap_1f, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x2000, RunMode, RunModeException, EXC_XFER_STD) - EXCEPTION(0x2100, Trap_21, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x2200, Trap_22, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x2300, Trap_23, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x2400, Trap_24, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x2500, Trap_25, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x2600, Trap_26, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x2700, Trap_27, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x2800, Trap_28, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x2900, Trap_29, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x2a00, Trap_2a, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x2b00, Trap_2b, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x2c00, Trap_2c, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x2d00, Trap_2d, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x2e00, Trap_2e, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x2f00, Trap_2f, unknown_exception, EXC_XFER_STD) +#ifndef CONFIG_TAU_INT +#define TAUException unknown_async_exception +#endif + + EXCEPTION(0x1300, Trap_13, instruction_breakpoint_exception) + EXCEPTION(0x1400, SMI, SMIException) + EXCEPTION(0x1500, Trap_15, unknown_exception) + EXCEPTION(0x1600, Trap_16, altivec_assist_exception) + EXCEPTION(0x1700, Trap_17, TAUException) + EXCEPTION(0x1800, Trap_18, unknown_exception) + EXCEPTION(0x1900, Trap_19, unknown_exception) + EXCEPTION(0x1a00, Trap_1a, unknown_exception) + EXCEPTION(0x1b00, Trap_1b, unknown_exception) + EXCEPTION(0x1c00, Trap_1c, unknown_exception) + EXCEPTION(0x1d00, Trap_1d, unknown_exception) + EXCEPTION(0x1e00, Trap_1e, unknown_exception) + EXCEPTION(0x1f00, Trap_1f, unknown_exception) + EXCEPTION(0x2000, RunMode, RunModeException) + EXCEPTION(0x2100, Trap_21, unknown_exception) + EXCEPTION(0x2200, Trap_22, unknown_exception) + EXCEPTION(0x2300, Trap_23, unknown_exception) + EXCEPTION(0x2400, Trap_24, unknown_exception) + EXCEPTION(0x2500, Trap_25, unknown_exception) + EXCEPTION(0x2600, Trap_26, unknown_exception) + EXCEPTION(0x2700, Trap_27, unknown_exception) + EXCEPTION(0x2800, Trap_28, unknown_exception) + EXCEPTION(0x2900, Trap_29, unknown_exception) + EXCEPTION(0x2a00, Trap_2a, unknown_exception) + EXCEPTION(0x2b00, Trap_2b, unknown_exception) + EXCEPTION(0x2c00, Trap_2c, unknown_exception) + EXCEPTION(0x2d00, Trap_2d, unknown_exception) + EXCEPTION(0x2e00, Trap_2e, unknown_exception) + EXCEPTION(0x2f00, Trap_2f, unknown_exception) + __HEAD . = 0x3000 +#ifdef CONFIG_PPC_BOOK3S_604 +.macro save_regs_thread thread + stw r0, THR0(\thread) + stw r3, THR3(\thread) + stw r4, THR4(\thread) + stw r5, THR5(\thread) + stw r6, THR6(\thread) + stw r8, THR8(\thread) + stw r9, THR9(\thread) + mflr r0 + stw r0, THLR(\thread) + mfctr r0 + stw r0, THCTR(\thread) +.endm + +.macro restore_regs_thread thread + lwz r0, THLR(\thread) + mtlr r0 + lwz r0, THCTR(\thread) + mtctr r0 + lwz r0, THR0(\thread) + lwz r3, THR3(\thread) + lwz r4, THR4(\thread) + lwz r5, THR5(\thread) + lwz r6, THR6(\thread) + lwz r8, THR8(\thread) + lwz r9, THR9(\thread) +.endm + +hash_page_dsi: + save_regs_thread r10 + mfdsisr r3 + mfdar r4 + mfsrr0 r5 + mfsrr1 r9 + rlwinm r3, r3, 32 - 15, _PAGE_WRITE /* DSISR_STORE -> _PAGE_WRITE */ + ori r3, r3, _PAGE_PRESENT | _PAGE_READ + bl hash_page + mfspr r10, SPRN_SPRG_THREAD + restore_regs_thread r10 + b .Lhash_page_dsi_cont + +hash_page_isi: + mr r11, r10 + mfspr r10, SPRN_SPRG_THREAD + save_regs_thread r10 + li r3, _PAGE_PRESENT | _PAGE_EXEC + lwz r4, SRR0(r10) + lwz r9, SRR1(r10) + bl hash_page + mfspr r10, SPRN_SPRG_THREAD + restore_regs_thread r10 + mr r10, r11 + b .Lhash_page_isi_cont + + .globl fast_hash_page_return +fast_hash_page_return: + andis. r10, r9, SRR1_ISI_NOPT@h /* Set on ISI, cleared on DSI */ + mfspr r10, SPRN_SPRG_THREAD + restore_regs_thread r10 + bne 1f + + /* DSI */ + mtcr r11 + lwz r11, THR11(r10) + mfspr r10, SPRN_SPRG_SCRATCH2 + rfi + +1: /* ISI */ + mtcr r11 + mfspr r11, SPRN_SPRG_SCRATCH1 + mfspr r10, SPRN_SPRG_SCRATCH0 + rfi +#endif /* CONFIG_PPC_BOOK3S_604 */ + +#ifdef CONFIG_VMAP_STACK + vmap_stack_overflow_exception +#endif + + __HEAD AltiVecUnavailable: - EXCEPTION_PROLOG + EXCEPTION_PROLOG 0xf20 AltiVecUnavailable #ifdef CONFIG_ALTIVEC beq 1f bl load_up_altivec /* if from user, just load it up */ b fast_exception_return #endif /* CONFIG_ALTIVEC */ -1: addi r3,r1,STACK_FRAME_OVERHEAD - EXC_XFER_LITE(0xf20, altivec_unavailable_exception) +1: prepare_transfer_to_handler + bl altivec_unavailable_exception + b interrupt_return + __HEAD PerformanceMonitor: - EXCEPTION_PROLOG - addi r3,r1,STACK_FRAME_OVERHEAD - EXC_XFER_STD(0xf00, performance_monitor_exception) + EXCEPTION_PROLOG 0xf00 PerformanceMonitor + prepare_transfer_to_handler + bl performance_monitor_exception + b interrupt_return + __HEAD /* * This code is jumped to from the startup code to copy * the kernel image to physical address PHYSICAL_START. */ relocate_kernel: - addis r9,r26,klimit@ha /* fetch klimit */ - lwz r25,klimit@l(r9) - addis r25,r25,-KERNELBASE@h lis r3,PHYSICAL_START@h /* Destination base address */ li r6,0 /* Destination offset */ li r5,0x4000 /* # bytes of memory to copy */ @@ -676,7 +770,8 @@ relocate_kernel: addi r0,r3,4f@l /* jump to the address of 4f */ mtctr r0 /* in copy and do the rest. */ bctr /* jump to the copy */ -4: mr r5,r25 +4: lis r5,_end-KERNELBASE@h + ori r5,r5,_end-KERNELBASE@l bl copy_and_flush /* copy the rest */ b turn_on_mmu @@ -686,7 +781,7 @@ relocate_kernel: * r3 = dest addr, r4 = source addr, r5 = copy limit, r6 = start offset * on exit, r3, r4, r5 are unchanged, r6 is updated to be >= r5. */ -_ENTRY(copy_and_flush) +_GLOBAL(copy_and_flush) addi r5,r5,-4 addi r6,r6,-4 4: li r0,L1_CACHE_BYTES/4 @@ -729,7 +824,6 @@ __secondary_start_pmac_0: set to map the 0xf0000000 - 0xffffffff region */ mfmsr r0 rlwinm r0,r0,0,28,26 /* clear DR (0x10) */ - SYNC mtmsr r0 isync @@ -741,10 +835,8 @@ __secondary_start: lis r3,-KERNELBASE@h mr r4,r24 bl call_setup_cpu /* Call setup_cpu for this CPU */ -#ifdef CONFIG_PPC_BOOK3S_32 lis r3,-KERNELBASE@h bl init_idle_6xx -#endif /* CONFIG_PPC_BOOK3S_32 */ /* get current's stack and current */ lis r2,secondary_current@ha @@ -754,7 +846,7 @@ __secondary_start: lwz r1,TASK_STACK(r1) /* stack */ - addi r1,r1,THREAD_SIZE-STACK_FRAME_OVERHEAD + addi r1,r1,THREAD_SIZE-STACK_FRAME_MIN_SIZE li r0,0 tophys(r3,r1) stw r0,0(r3) @@ -767,9 +859,12 @@ __secondary_start: tophys(r4,r2) addi r4,r4,THREAD /* phys address of our thread_struct */ mtspr SPRN_SPRG_THREAD,r4 +BEGIN_MMU_FTR_SECTION lis r4, (swapper_pg_dir - PAGE_OFFSET)@h ori r4, r4, (swapper_pg_dir - PAGE_OFFSET)@l - mtspr SPRN_SPRG_PGDIR, r4 + rlwinm r4, r4, 4, 0xffff01ff + mtspr SPRN_SDR1, r4 +END_MMU_FTR_SECTION_IFCLR(MMU_FTR_HPTE_TABLE) /* enable MMU and jump to start_secondary */ li r4,MSR_KERNEL @@ -777,8 +872,7 @@ __secondary_start: ori r3,r3,start_secondary@l mtspr SPRN_SRR0,r3 mtspr SPRN_SRR1,r4 - SYNC - RFI + rfi #endif /* CONFIG_SMP */ #ifdef CONFIG_KVM_BOOK3S_HANDLER @@ -786,22 +880,10 @@ __secondary_start: #endif /* - * Those generic dummy functions are kept for CPUs not - * included in CONFIG_PPC_BOOK3S_32 - */ -#if !defined(CONFIG_PPC_BOOK3S_32) -_ENTRY(__save_cpu_setup) - blr -_ENTRY(__restore_cpu_setup) - blr -#endif /* !defined(CONFIG_PPC_BOOK3S_32) */ - -/* * Load stuff into the MMU. Intended to be called with * IR=0 and DR=0. */ -#ifdef CONFIG_KASAN -early_hash_table: +SYM_FUNC_START_LOCAL(early_hash_table) sync /* Force all PTE updates to finish */ isync tlbia /* Clear all TLB entries */ @@ -812,22 +894,23 @@ early_hash_table: ori r6, r6, 3 /* 256kB table */ mtspr SPRN_SDR1, r6 blr -#endif +SYM_FUNC_END(early_hash_table) -load_up_mmu: +SYM_FUNC_START_LOCAL(load_up_mmu) sync /* Force all PTE updates to finish */ isync tlbia /* Clear all TLB entries */ sync /* wait for tlbia/tlbie to finish */ TLBSYNC /* ... on all CPUs */ +BEGIN_MMU_FTR_SECTION /* Load the SDR1 register (hash table base & size) */ lis r6,_SDR1@ha tophys(r6,r6) lwz r6,_SDR1@l(r6) mtspr SPRN_SDR1,r6 +END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE) -/* Load the BAT registers with the values set up by MMU_init. - MMU_init takes care of whether we're on a 601 or not. */ +/* Load the BAT registers with the values set up by MMU_init. */ lis r3,BATS@ha addi r3,r3,BATS@l tophys(r3,r3) @@ -842,16 +925,15 @@ BEGIN_MMU_FTR_SECTION LOAD_BAT(7,r3,r4,r5) END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_HIGH_BATS) blr +SYM_FUNC_END(load_up_mmu) -load_segment_registers: +_GLOBAL(load_segment_registers) li r0, NUM_USER_SEGMENTS /* load up user segment register values */ mtctr r0 /* for context 0 */ - li r3, 0 /* Kp = 0, Ks = 0, VSID = 0 */ #ifdef CONFIG_PPC_KUEP - oris r3, r3, SR_NX@h /* Set Nx */ -#endif -#ifdef CONFIG_PPC_KUAP - oris r3, r3, SR_KS@h /* Set Ks */ + lis r3, SR_NX@h /* Kp = 0, Ks = 0, VSID = 0 */ +#else + li r3, 0 /* Kp = 0, Ks = 0, VSID = 0 */ #endif li r4, 0 3: mtsrin r3, r4 @@ -881,15 +963,18 @@ start_here: tophys(r4,r2) addi r4,r4,THREAD /* init task's THREAD */ mtspr SPRN_SPRG_THREAD,r4 +BEGIN_MMU_FTR_SECTION lis r4, (swapper_pg_dir - PAGE_OFFSET)@h ori r4, r4, (swapper_pg_dir - PAGE_OFFSET)@l - mtspr SPRN_SPRG_PGDIR, r4 + rlwinm r4, r4, 4, 0xffff01ff + mtspr SPRN_SDR1, r4 +END_MMU_FTR_SECTION_IFCLR(MMU_FTR_HPTE_TABLE) /* stack */ lis r1,init_thread_union@ha addi r1,r1,init_thread_union@l li r0,0 - stwu r0,THREAD_SIZE-STACK_FRAME_OVERHEAD(r1) + stwu r0,THREAD_SIZE-STACK_FRAME_MIN_SIZE(r1) /* * Do early platform-specific initialization, * and set up the MMU. @@ -902,11 +987,7 @@ start_here: bl machine_init bl __save_cpu_setup bl MMU_init -#ifdef CONFIG_KASAN -BEGIN_MMU_FTR_SECTION bl MMU_init_hw_patch -END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE) -#endif /* * Go back to running unmapped so we can load up new values @@ -917,10 +998,11 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE) ori r4,r4,2f@l tophys(r4,r4) li r3,MSR_KERNEL & ~(MSR_IR|MSR_DR) + + .align 4 mtspr SPRN_SRR0,r4 mtspr SPRN_SRR1,r3 - SYNC - RFI + rfi /* Load up the kernel context */ 2: bl load_up_mmu @@ -931,7 +1013,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE) */ lis r5, abatron_pteptrs@h ori r5, r5, abatron_pteptrs@l - stw r5, 0xf0(r0) /* This much match your Abatron config */ + stw r5, 0xf0(0) /* This much match your Abatron config */ lis r6, swapper_pg_dir@h ori r6, r6, swapper_pg_dir@l tophys(r5, r5) @@ -944,54 +1026,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE) ori r3,r3,start_kernel@l mtspr SPRN_SRR0,r3 mtspr SPRN_SRR1,r4 - SYNC - RFI - -/* - * void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next); - * - * Set up the segment registers for a new context. - */ -_ENTRY(switch_mmu_context) - lwz r3,MMCONTEXTID(r4) - cmpwi cr0,r3,0 - blt- 4f - mulli r3,r3,897 /* multiply context by skew factor */ - rlwinm r3,r3,4,8,27 /* VSID = (context & 0xfffff) << 4 */ -#ifdef CONFIG_PPC_KUEP - oris r3, r3, SR_NX@h /* Set Nx */ -#endif -#ifdef CONFIG_PPC_KUAP - oris r3, r3, SR_KS@h /* Set Ks */ -#endif - li r0,NUM_USER_SEGMENTS - mtctr r0 - - lwz r4, MM_PGD(r4) -#ifdef CONFIG_BDI_SWITCH - /* Context switch the PTE pointer for the Abatron BDI2000. - * The PGDIR is passed as second argument. - */ - lis r5, abatron_pteptrs@ha - stw r4, abatron_pteptrs@l + 0x4(r5) -#endif - tophys(r4, r4) - mtspr SPRN_SPRG_PGDIR, r4 - li r4,0 - isync -3: - mtsrin r3,r4 - addi r3,r3,0x111 /* next VSID */ - rlwinm r3,r3,0,8,3 /* clear out any overflow from VSID field */ - addis r4,r4,0x1000 /* address of next segment */ - bdnz 3b - sync - isync - blr -4: trap - EMIT_BUG_ENTRY 4b,__FILE__,__LINE__,0 - blr -EXPORT_SYMBOL(switch_mmu_context) + rfi /* * An undocumented "feature" of 604e requires that the v bit @@ -1001,10 +1036,9 @@ EXPORT_SYMBOL(switch_mmu_context) * this makes sure it's done. * -- Cort */ -clear_bats: +SYM_FUNC_START_LOCAL(clear_bats) li r10,0 -#ifndef CONFIG_PPC_BOOK3S_601 mtspr SPRN_DBAT0U,r10 mtspr SPRN_DBAT0L,r10 mtspr SPRN_DBAT1U,r10 @@ -1013,7 +1047,6 @@ clear_bats: mtspr SPRN_DBAT2L,r10 mtspr SPRN_DBAT3U,r10 mtspr SPRN_DBAT3L,r10 -#endif mtspr SPRN_IBAT0U,r10 mtspr SPRN_IBAT0L,r10 mtspr SPRN_IBAT1U,r10 @@ -1047,8 +1080,9 @@ BEGIN_MMU_FTR_SECTION mtspr SPRN_IBAT7L,r10 END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_HIGH_BATS) blr +SYM_FUNC_END(clear_bats) -_ENTRY(update_bats) +_GLOBAL(update_bats) lis r4, 1f@h ori r4, r4, 1f@l tophys(r4, r4) @@ -1058,10 +1092,11 @@ _ENTRY(update_bats) rlwinm r0, r6, 0, ~MSR_RI rlwinm r0, r0, 0, ~MSR_EE mtmsr r0 + + .align 4 mtspr SPRN_SRR0, r4 mtspr SPRN_SRR1, r3 - SYNC - RFI + rfi 1: bl clear_bats lis r3, BATS@ha addi r3, r3, BATS@l @@ -1080,48 +1115,34 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_HIGH_BATS) mtmsr r3 mtspr SPRN_SRR0, r7 mtspr SPRN_SRR1, r6 - SYNC - RFI + rfi -flush_tlbs: +SYM_FUNC_START_LOCAL(flush_tlbs) lis r10, 0x40 1: addic. r10, r10, -0x1000 tlbie r10 bgt 1b sync blr +SYM_FUNC_END(flush_tlbs) -mmu_off: +SYM_FUNC_START_LOCAL(mmu_off) addi r4, r3, __after_mmu_off - _start mfmsr r3 andi. r0,r3,MSR_DR|MSR_IR /* MMU enabled? */ beqlr andc r3,r3,r0 + + .align 4 mtspr SPRN_SRR0,r4 mtspr SPRN_SRR1,r3 sync - RFI + rfi +SYM_FUNC_END(mmu_off) -/* - * On 601, we use 3 BATs to map up to 24M of RAM at _PAGE_OFFSET - * (we keep one for debugging) and on others, we use one 256M BAT. - */ -initial_bats: +/* We use one BAT to map up to 256M of RAM at _PAGE_OFFSET */ +SYM_FUNC_START_LOCAL(initial_bats) lis r11,PAGE_OFFSET@h -#ifdef CONFIG_PPC_BOOK3S_601 - ori r11,r11,4 /* set up BAT registers for 601 */ - li r8,0x7f /* valid, block length = 8MB */ - mtspr SPRN_IBAT0U,r11 /* N.B. 601 has valid bit in */ - mtspr SPRN_IBAT0L,r8 /* lower BAT register */ - addis r11,r11,0x800000@h - addis r8,r8,0x800000@h - mtspr SPRN_IBAT1U,r11 - mtspr SPRN_IBAT1L,r8 - addis r11,r11,0x800000@h - addis r8,r8,0x800000@h - mtspr SPRN_IBAT2U,r11 - mtspr SPRN_IBAT2L,r8 -#else tophys(r8,r11) #ifdef CONFIG_SMP ori r8,r8,0x12 /* R/W access, M=1 */ @@ -1130,16 +1151,16 @@ initial_bats: #endif /* CONFIG_SMP */ ori r11,r11,BL_256M<<2|0x2 /* set up BAT registers for 604 */ - mtspr SPRN_DBAT0L,r8 /* N.B. 6xx (not 601) have valid */ + mtspr SPRN_DBAT0L,r8 /* N.B. 6xx have valid */ mtspr SPRN_DBAT0U,r11 /* bit in upper BAT register */ mtspr SPRN_IBAT0L,r8 mtspr SPRN_IBAT0U,r11 -#endif isync blr +SYM_FUNC_END(initial_bats) #ifdef CONFIG_BOOTX_TEXT -setup_disp_bat: +SYM_FUNC_START_LOCAL(setup_disp_bat) /* * setup the display bat prepared for us in prom.c */ @@ -1152,18 +1173,14 @@ setup_disp_bat: beqlr lwz r11,0(r8) lwz r8,4(r8) -#ifndef CONFIG_PPC_BOOK3S_601 mtspr SPRN_DBAT3L,r8 mtspr SPRN_DBAT3U,r11 -#else - mtspr SPRN_IBAT3L,r8 - mtspr SPRN_IBAT3U,r11 -#endif blr +SYM_FUNC_END(setup_disp_bat) #endif /* CONFIG_BOOTX_TEXT */ #ifdef CONFIG_PPC_EARLY_DEBUG_CPM -setup_cpm_bat: +SYM_FUNC_START_LOCAL(setup_cpm_bat) lis r8, 0xf000 ori r8, r8, 0x002a mtspr SPRN_DBAT1L, r8 @@ -1173,10 +1190,11 @@ setup_cpm_bat: mtspr SPRN_DBAT1U, r11 blr +SYM_FUNC_END(setup_cpm_bat) #endif #ifdef CONFIG_PPC_EARLY_DEBUG_USBGECKO -setup_usbgecko_bat: +SYM_FUNC_START_LOCAL(setup_usbgecko_bat) /* prepare a BAT for early io */ #if defined(CONFIG_GAMECUBE) lis r8, 0x0c00 @@ -1195,63 +1213,7 @@ setup_usbgecko_bat: mtspr SPRN_DBAT1L, r8 mtspr SPRN_DBAT1U, r11 blr +SYM_FUNC_END(setup_usbgecko_bat) #endif -#ifdef CONFIG_8260 -/* Jump into the system reset for the rom. - * We first disable the MMU, and then jump to the ROM reset address. - * - * r3 is the board info structure, r4 is the location for starting. - * I use this for building a small kernel that can load other kernels, - * rather than trying to write or rely on a rom monitor that can tftp load. - */ - .globl m8260_gorom -m8260_gorom: - mfmsr r0 - rlwinm r0,r0,0,17,15 /* clear MSR_EE in r0 */ - sync - mtmsr r0 - sync - mfspr r11, SPRN_HID0 - lis r10, 0 - ori r10,r10,HID0_ICE|HID0_DCE - andc r11, r11, r10 - mtspr SPRN_HID0, r11 - isync - li r5, MSR_ME|MSR_RI - lis r6,2f@h - addis r6,r6,-KERNELBASE@h - ori r6,r6,2f@l - mtspr SPRN_SRR0,r6 - mtspr SPRN_SRR1,r5 - isync - sync - rfi -2: - mtlr r4 - blr -#endif - - -/* - * We put a few things here that have to be page-aligned. - * This stuff goes at the beginning of the data segment, - * which is page-aligned. - */ .data - .globl sdata -sdata: - .globl empty_zero_page -empty_zero_page: - .space 4096 -EXPORT_SYMBOL(empty_zero_page) - - .globl swapper_pg_dir -swapper_pg_dir: - .space PGD_TABLE_SIZE - -/* Room for two PTE pointers, usually the kernel and current user pointers - * to their respective root page table. - */ -abatron_pteptrs: - .space 8 diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h index 2ae635df9026..b6b5b01a173c 100644 --- a/arch/powerpc/kernel/head_booke.h +++ b/arch/powerpc/kernel/head_booke.h @@ -5,6 +5,7 @@ #include <asm/ptrace.h> /* for STACK_FRAME_REGS_MARKER */ #include <asm/kvm_asm.h> #include <asm/kvm_booke_hv_asm.h> +#include <asm/thread_info.h> /* for THREAD_SHIFT */ #ifdef __ASSEMBLY__ @@ -34,7 +35,7 @@ */ #define THREAD_NORMSAVE(offset) (THREAD_NORMSAVES + (offset * 4)) -#ifdef CONFIG_PPC_FSL_BOOK3E +#ifdef CONFIG_PPC_E500 #define BOOKE_CLEAR_BTB(reg) \ START_BTB_FLUSH_SECTION \ BTB_FLUSH(reg) \ @@ -44,7 +45,7 @@ END_BTB_FLUSH_SECTION #endif -#define NORMAL_EXCEPTION_PROLOG(intno) \ +#define NORMAL_EXCEPTION_PROLOG(trapno, intno) \ mtspr SPRN_SPRG_WSCRATCH0, r10; /* save one register */ \ mfspr r10, SPRN_SPRG_THREAD; \ stw r11, THREAD_NORMSAVE(0)(r10); \ @@ -53,6 +54,8 @@ END_BTB_FLUSH_SECTION mfspr r11, SPRN_SRR1; \ DO_KVM BOOKE_INTERRUPT_##intno SPRN_SRR1; \ andi. r11, r11, MSR_PR; /* check whether user or kernel */\ + LOAD_REG_IMMEDIATE(r11, MSR_KERNEL); \ + mtmsr r11; \ mr r11, r1; \ beq 1f; \ BOOKE_CLEAR_BTB(r11) \ @@ -76,12 +79,38 @@ END_BTB_FLUSH_SECTION stw r1, 0(r11); \ mr r1, r11; \ rlwinm r9,r9,0,14,12; /* clear MSR_WE (necessary?) */\ - stw r0,GPR0(r11); \ - lis r10, STACK_FRAME_REGS_MARKER@ha;/* exception frame marker */ \ - addi r10, r10, STACK_FRAME_REGS_MARKER@l; \ - stw r10, 8(r11); \ - SAVE_4GPRS(3, r11); \ - SAVE_2GPRS(7, r11) + COMMON_EXCEPTION_PROLOG_END trapno + +.macro COMMON_EXCEPTION_PROLOG_END trapno + stw r0,GPR0(r1) + lis r10, STACK_FRAME_REGS_MARKER@ha /* exception frame marker */ + addi r10, r10, STACK_FRAME_REGS_MARKER@l + stw r10, STACK_INT_FRAME_MARKER(r1) + li r10, \trapno + stw r10,_TRAP(r1) + SAVE_GPRS(3, 8, r1) + SAVE_NVGPRS(r1) + stw r2,GPR2(r1) + stw r12,_NIP(r1) + stw r9,_MSR(r1) + mfctr r10 + mfspr r2,SPRN_SPRG_THREAD + stw r10,_CTR(r1) + tovirt(r2, r2) + mfspr r10,SPRN_XER + addi r2, r2, -THREAD + stw r10,_XER(r1) + addi r3,r1,STACK_INT_FRAME_REGS +.endm + +.macro prepare_transfer_to_handler +#ifdef CONFIG_PPC_E500 + andi. r12,r9,MSR_PR + bne 777f + bl prepare_transfer_to_handler +777: +#endif +.endm .macro SYSCALL_ENTRY trapno intno srr1 mfspr r10, SPRN_SPRG_THREAD @@ -99,83 +128,21 @@ BEGIN_FTR_SECTION mr r12, r13 lwz r13, THREAD_NORMSAVE(2)(r10) FTR_SECTION_ELSE -#endif mfcr r12 -#ifdef CONFIG_KVM_BOOKE_HV ALT_FTR_SECTION_END_IFSET(CPU_FTR_EMB_HV) +#else + mfcr r12 #endif + mfspr r9, SPRN_SRR1 BOOKE_CLEAR_BTB(r11) - lwz r11, TASK_STACK - THREAD(r10) + mr r11, r1 + lwz r1, TASK_STACK - THREAD(r10) rlwinm r12,r12,0,4,2 /* Clear SO bit in CR */ - ALLOC_STACK_FRAME(r11, THREAD_SIZE - INT_FRAME_SIZE) - stw r12, _CCR(r11) /* save various registers */ - mflr r12 - stw r12,_LINK(r11) + ALLOC_STACK_FRAME(r1, THREAD_SIZE - INT_FRAME_SIZE) + stw r12, _CCR(r1) mfspr r12,SPRN_SRR0 - stw r1, GPR1(r11) - mfspr r9,SPRN_SRR1 - stw r1, 0(r11) - mr r1, r11 - stw r12,_NIP(r11) - rlwinm r9,r9,0,14,12 /* clear MSR_WE (necessary?) */ - lis r12, STACK_FRAME_REGS_MARKER@ha /* exception frame marker */ - stw r2,GPR2(r11) - addi r12, r12, STACK_FRAME_REGS_MARKER@l - stw r9,_MSR(r11) - li r2, \trapno + 1 - stw r12, 8(r11) - stw r2,_TRAP(r11) - SAVE_GPR(0, r11) - SAVE_4GPRS(3, r11) - SAVE_2GPRS(7, r11) - - addi r11,r1,STACK_FRAME_OVERHEAD - addi r2,r10,-THREAD - stw r11,PT_REGS(r10) - /* Check to see if the dbcr0 register is set up to debug. Use the - internal debug mode bit to do this. */ - lwz r12,THREAD_DBCR0(r10) - andis. r12,r12,DBCR0_IDM@h - ACCOUNT_CPU_USER_ENTRY(r2, r11, r12) - beq+ 3f - /* From user and task is ptraced - load up global dbcr0 */ - li r12,-1 /* clear all pending debug events */ - mtspr SPRN_DBSR,r12 - lis r11,global_dbcr0@ha - tophys(r11,r11) - addi r11,r11,global_dbcr0@l -#ifdef CONFIG_SMP - lwz r10, TASK_CPU(r2) - slwi r10, r10, 3 - add r11, r11, r10 -#endif - lwz r12,0(r11) - mtspr SPRN_DBCR0,r12 - lwz r12,4(r11) - addi r12,r12,-1 - stw r12,4(r11) - -3: - tovirt(r2, r2) /* set r2 to current */ - lis r11, transfer_to_syscall@h - ori r11, r11, transfer_to_syscall@l -#ifdef CONFIG_TRACE_IRQFLAGS - /* - * If MSR is changing we need to keep interrupts disabled at this point - * otherwise we might risk taking an interrupt before we tell lockdep - * they are enabled. - */ - lis r10, MSR_KERNEL@h - ori r10, r10, MSR_KERNEL@l - rlwimi r10, r9, 0, MSR_EE -#else - lis r10, (MSR_KERNEL | MSR_EE)@h - ori r10, r10, (MSR_KERNEL | MSR_EE)@l -#endif - mtspr SPRN_SRR1,r10 - mtspr SPRN_SRR0,r11 - SYNC - RFI /* jump to handler, enable MMU */ + stw r12,_NIP(r1) + b transfer_to_syscall /* jump to handler */ .endm /* To handle the additional exception priority levels on 40x and Book-E @@ -183,7 +150,6 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_EMB_HV) * * On 40x critical is the only additional level * On 44x/e500 we have critical and machine check - * On e200 we have critical and debug (machine check occurs via critical) * * Additionally we reserve a SPRG for each priority level so we can free up a * GPR to use as the base for indirect access to the exception stacks. This @@ -199,23 +165,21 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_EMB_HV) #define MC_STACK_BASE mcheckirq_ctx #define CRIT_STACK_BASE critirq_ctx -/* only on e500mc/e200 */ +/* only on e500mc */ #define DBG_STACK_BASE dbgirq_ctx -#define EXC_LVL_FRAME_OVERHEAD (THREAD_SIZE - INT_FRAME_SIZE - EXC_LVL_SIZE) - #ifdef CONFIG_SMP #define BOOKE_LOAD_EXC_LEVEL_STACK(level) \ mfspr r8,SPRN_PIR; \ slwi r8,r8,2; \ addis r8,r8,level##_STACK_BASE@ha; \ lwz r8,level##_STACK_BASE@l(r8); \ - addi r8,r8,EXC_LVL_FRAME_OVERHEAD; + addi r8,r8,THREAD_SIZE - INT_FRAME_SIZE; #else #define BOOKE_LOAD_EXC_LEVEL_STACK(level) \ lis r8,level##_STACK_BASE@ha; \ lwz r8,level##_STACK_BASE@l(r8); \ - addi r8,r8,EXC_LVL_FRAME_OVERHEAD; + addi r8,r8,THREAD_SIZE - INT_FRAME_SIZE; #endif /* @@ -226,7 +190,7 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_EMB_HV) * registers as the normal prolog above. Instead we use a portion of the * critical/machine check exception stack at low physical addresses. */ -#define EXC_LEVEL_EXCEPTION_PROLOG(exc_level, intno, exc_level_srr0, exc_level_srr1) \ +#define EXC_LEVEL_EXCEPTION_PROLOG(exc_level, trapno, intno, exc_level_srr0, exc_level_srr1) \ mtspr SPRN_SPRG_WSCRATCH_##exc_level,r8; \ BOOKE_LOAD_EXC_LEVEL_STACK(exc_level);/* r8 points to the exc_level stack*/ \ stw r9,GPR9(r8); /* save various registers */\ @@ -238,9 +202,11 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_EMB_HV) DO_KVM BOOKE_INTERRUPT_##intno exc_level_srr1; \ BOOKE_CLEAR_BTB(r10) \ andi. r11,r11,MSR_PR; \ + LOAD_REG_IMMEDIATE(r11, MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)); \ + mtmsr r11; \ mfspr r11,SPRN_SPRG_THREAD; /* if from user, start at top of */\ lwz r11, TASK_STACK - THREAD(r11); /* this thread's kernel stack */\ - addi r11,r11,EXC_LVL_FRAME_OVERHEAD; /* allocate stack frame */\ + addi r11,r11,THREAD_SIZE - INT_FRAME_SIZE; /* allocate stack frame */\ beq 1f; \ /* COMING FROM USER MODE */ \ stw r9,_CCR(r11); /* save CR */\ @@ -267,16 +233,44 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_EMB_HV) stw r1,0(r11); \ mr r1,r11; \ rlwinm r9,r9,0,14,12; /* clear MSR_WE (necessary?) */\ - stw r0,GPR0(r11); \ - SAVE_4GPRS(3, r11); \ - SAVE_2GPRS(7, r11) - -#define CRITICAL_EXCEPTION_PROLOG(intno) \ - EXC_LEVEL_EXCEPTION_PROLOG(CRIT, intno, SPRN_CSRR0, SPRN_CSRR1) -#define DEBUG_EXCEPTION_PROLOG \ - EXC_LEVEL_EXCEPTION_PROLOG(DBG, DEBUG, SPRN_DSRR0, SPRN_DSRR1) -#define MCHECK_EXCEPTION_PROLOG \ - EXC_LEVEL_EXCEPTION_PROLOG(MC, MACHINE_CHECK, \ + COMMON_EXCEPTION_PROLOG_END trapno + +#define SAVE_xSRR(xSRR) \ + mfspr r0,SPRN_##xSRR##0; \ + stw r0,_##xSRR##0(r1); \ + mfspr r0,SPRN_##xSRR##1; \ + stw r0,_##xSRR##1(r1) + + +.macro SAVE_MMU_REGS +#ifdef CONFIG_PPC_E500 + mfspr r0,SPRN_MAS0 + stw r0,MAS0(r1) + mfspr r0,SPRN_MAS1 + stw r0,MAS1(r1) + mfspr r0,SPRN_MAS2 + stw r0,MAS2(r1) + mfspr r0,SPRN_MAS3 + stw r0,MAS3(r1) + mfspr r0,SPRN_MAS6 + stw r0,MAS6(r1) +#ifdef CONFIG_PHYS_64BIT + mfspr r0,SPRN_MAS7 + stw r0,MAS7(r1) +#endif /* CONFIG_PHYS_64BIT */ +#endif /* CONFIG_PPC_E500 */ +#ifdef CONFIG_44x + mfspr r0,SPRN_MMUCR + stw r0,MMUCR(r1) +#endif +.endm + +#define CRITICAL_EXCEPTION_PROLOG(trapno, intno) \ + EXC_LEVEL_EXCEPTION_PROLOG(CRIT, trapno+2, intno, SPRN_CSRR0, SPRN_CSRR1) +#define DEBUG_EXCEPTION_PROLOG(trapno) \ + EXC_LEVEL_EXCEPTION_PROLOG(DBG, trapno+8, DEBUG, SPRN_DSRR0, SPRN_DSRR1) +#define MCHECK_EXCEPTION_PROLOG(trapno) \ + EXC_LEVEL_EXCEPTION_PROLOG(MC, trapno+4, MACHINE_CHECK, \ SPRN_MCSRR0, SPRN_MCSRR1) /* @@ -303,44 +297,34 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_EMB_HV) .align 5; \ label: -#define EXCEPTION(n, intno, label, hdlr, xfer) \ +#define EXCEPTION(n, intno, label, hdlr) \ START_EXCEPTION(label); \ - NORMAL_EXCEPTION_PROLOG(intno); \ - addi r3,r1,STACK_FRAME_OVERHEAD; \ - xfer(n, hdlr) + NORMAL_EXCEPTION_PROLOG(n, intno); \ + prepare_transfer_to_handler; \ + bl hdlr; \ + b interrupt_return #define CRITICAL_EXCEPTION(n, intno, label, hdlr) \ START_EXCEPTION(label); \ - CRITICAL_EXCEPTION_PROLOG(intno); \ - addi r3,r1,STACK_FRAME_OVERHEAD; \ - EXC_XFER_TEMPLATE(hdlr, n+2, (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), \ - crit_transfer_to_handler, ret_from_crit_exc) + CRITICAL_EXCEPTION_PROLOG(n, intno); \ + SAVE_MMU_REGS; \ + SAVE_xSRR(SRR); \ + prepare_transfer_to_handler; \ + bl hdlr; \ + b ret_from_crit_exc #define MCHECK_EXCEPTION(n, label, hdlr) \ START_EXCEPTION(label); \ - MCHECK_EXCEPTION_PROLOG; \ + MCHECK_EXCEPTION_PROLOG(n); \ mfspr r5,SPRN_ESR; \ stw r5,_ESR(r11); \ - addi r3,r1,STACK_FRAME_OVERHEAD; \ - EXC_XFER_TEMPLATE(hdlr, n+4, (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), \ - mcheck_transfer_to_handler, ret_from_mcheck_exc) - -#define EXC_XFER_TEMPLATE(hdlr, trap, msr, tfer, ret) \ - li r10,trap; \ - stw r10,_TRAP(r11); \ - lis r10,msr@h; \ - ori r10,r10,msr@l; \ - bl tfer; \ - .long hdlr; \ - .long ret - -#define EXC_XFER_STD(n, hdlr) \ - EXC_XFER_TEMPLATE(hdlr, n, MSR_KERNEL, transfer_to_handler_full, \ - ret_from_except_full) - -#define EXC_XFER_LITE(n, hdlr) \ - EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL, transfer_to_handler, \ - ret_from_except) + SAVE_xSRR(DSRR); \ + SAVE_xSRR(CSRR); \ + SAVE_MMU_REGS; \ + SAVE_xSRR(SRR); \ + prepare_transfer_to_handler; \ + bl hdlr; \ + b ret_from_mcheck_exc /* Check for a single step debug exception while in an exception * handler before state has been saved. This is to catch the case @@ -357,7 +341,7 @@ label: */ #define DEBUG_DEBUG_EXCEPTION \ START_EXCEPTION(DebugDebug); \ - DEBUG_EXCEPTION_PROLOG; \ + DEBUG_EXCEPTION_PROLOG(2000); \ \ /* \ * If there is a single step or branch-taken exception in an \ @@ -405,12 +389,17 @@ label: \ /* continue normal handling for a debug exception... */ \ 2: mfspr r4,SPRN_DBSR; \ - addi r3,r1,STACK_FRAME_OVERHEAD; \ - EXC_XFER_TEMPLATE(DebugException, 0x2008, (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), debug_transfer_to_handler, ret_from_debug_exc) + stw r4,_ESR(r11); /* DebugException takes DBSR in _ESR */\ + SAVE_xSRR(CSRR); \ + SAVE_MMU_REGS; \ + SAVE_xSRR(SRR); \ + prepare_transfer_to_handler; \ + bl DebugException; \ + b ret_from_debug_exc #define DEBUG_CRIT_EXCEPTION \ START_EXCEPTION(DebugCrit); \ - CRITICAL_EXCEPTION_PROLOG(DEBUG); \ + CRITICAL_EXCEPTION_PROLOG(2000,DEBUG); \ \ /* \ * If there is a single step or branch-taken exception in an \ @@ -458,78 +447,81 @@ label: \ /* continue normal handling for a critical exception... */ \ 2: mfspr r4,SPRN_DBSR; \ - addi r3,r1,STACK_FRAME_OVERHEAD; \ - EXC_XFER_TEMPLATE(DebugException, 0x2002, (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), crit_transfer_to_handler, ret_from_crit_exc) + stw r4,_ESR(r11); /* DebugException takes DBSR in _ESR */\ + SAVE_MMU_REGS; \ + SAVE_xSRR(SRR); \ + prepare_transfer_to_handler; \ + bl DebugException; \ + b ret_from_crit_exc #define DATA_STORAGE_EXCEPTION \ START_EXCEPTION(DataStorage) \ - NORMAL_EXCEPTION_PROLOG(DATA_STORAGE); \ + NORMAL_EXCEPTION_PROLOG(0x300, DATA_STORAGE); \ mfspr r5,SPRN_ESR; /* Grab the ESR and save it */ \ stw r5,_ESR(r11); \ mfspr r4,SPRN_DEAR; /* Grab the DEAR */ \ - EXC_XFER_LITE(0x0300, handle_page_fault) + stw r4, _DEAR(r11); \ + prepare_transfer_to_handler; \ + bl do_page_fault; \ + b interrupt_return +/* + * Instruction TLB Error interrupt handlers may call InstructionStorage + * directly without clearing ESR, so the ESR at this point may be left over + * from a prior interrupt. + * + * In any case, do_page_fault for BOOK3E does not use ESR and always expects + * dsisr to be 0. ESR_DST from a prior store in particular would confuse fault + * handling. + */ #define INSTRUCTION_STORAGE_EXCEPTION \ START_EXCEPTION(InstructionStorage) \ - NORMAL_EXCEPTION_PROLOG(INST_STORAGE); \ - mfspr r5,SPRN_ESR; /* Grab the ESR and save it */ \ + NORMAL_EXCEPTION_PROLOG(0x400, INST_STORAGE); \ + li r5,0; /* Store 0 in regs->esr (dsisr) */ \ stw r5,_ESR(r11); \ - mr r4,r12; /* Pass SRR0 as arg2 */ \ - li r5,0; /* Pass zero as arg3 */ \ - EXC_XFER_LITE(0x0400, handle_page_fault) + stw r12, _DEAR(r11); /* Set regs->dear (dar) to SRR0 */ \ + prepare_transfer_to_handler; \ + bl do_page_fault; \ + b interrupt_return #define ALIGNMENT_EXCEPTION \ START_EXCEPTION(Alignment) \ - NORMAL_EXCEPTION_PROLOG(ALIGNMENT); \ + NORMAL_EXCEPTION_PROLOG(0x600, ALIGNMENT); \ mfspr r4,SPRN_DEAR; /* Grab the DEAR and save it */ \ stw r4,_DEAR(r11); \ - addi r3,r1,STACK_FRAME_OVERHEAD; \ - EXC_XFER_STD(0x0600, alignment_exception) + prepare_transfer_to_handler; \ + bl alignment_exception; \ + REST_NVGPRS(r1); \ + b interrupt_return #define PROGRAM_EXCEPTION \ START_EXCEPTION(Program) \ - NORMAL_EXCEPTION_PROLOG(PROGRAM); \ + NORMAL_EXCEPTION_PROLOG(0x700, PROGRAM); \ mfspr r4,SPRN_ESR; /* Grab the ESR and save it */ \ stw r4,_ESR(r11); \ - addi r3,r1,STACK_FRAME_OVERHEAD; \ - EXC_XFER_STD(0x0700, program_check_exception) + prepare_transfer_to_handler; \ + bl program_check_exception; \ + REST_NVGPRS(r1); \ + b interrupt_return #define DECREMENTER_EXCEPTION \ START_EXCEPTION(Decrementer) \ - NORMAL_EXCEPTION_PROLOG(DECREMENTER); \ + NORMAL_EXCEPTION_PROLOG(0x900, DECREMENTER); \ lis r0,TSR_DIS@h; /* Setup the DEC interrupt mask */ \ mtspr SPRN_TSR,r0; /* Clear the DEC interrupt */ \ - addi r3,r1,STACK_FRAME_OVERHEAD; \ - EXC_XFER_LITE(0x0900, timer_interrupt) + prepare_transfer_to_handler; \ + bl timer_interrupt; \ + b interrupt_return #define FP_UNAVAILABLE_EXCEPTION \ START_EXCEPTION(FloatingPointUnavailable) \ - NORMAL_EXCEPTION_PROLOG(FP_UNAVAIL); \ + NORMAL_EXCEPTION_PROLOG(0x800, FP_UNAVAIL); \ beq 1f; \ bl load_up_fpu; /* if from user, just load it up */ \ b fast_exception_return; \ -1: addi r3,r1,STACK_FRAME_OVERHEAD; \ - EXC_XFER_STD(0x800, kernel_fp_unavailable_exception) - -#else /* __ASSEMBLY__ */ -struct exception_regs { - unsigned long mas0; - unsigned long mas1; - unsigned long mas2; - unsigned long mas3; - unsigned long mas6; - unsigned long mas7; - unsigned long srr0; - unsigned long srr1; - unsigned long csrr0; - unsigned long csrr1; - unsigned long dsrr0; - unsigned long dsrr1; - unsigned long saved_ksp_limit; -}; - -/* ensure this structure is always sized to a multiple of the stack alignment */ -#define STACK_EXC_LVL_FRAME_SIZE _ALIGN_UP(sizeof (struct exception_regs), 16) +1: prepare_transfer_to_handler; \ + bl kernel_fp_unavailable_exception; \ + b interrupt_return #endif /* __ASSEMBLY__ */ #endif /* __HEAD_BOOKE_H__ */ diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c index 58ce3d37c2a3..a1318ce18d0e 100644 --- a/arch/powerpc/kernel/hw_breakpoint.c +++ b/arch/powerpc/kernel/hw_breakpoint.c @@ -15,6 +15,7 @@ #include <linux/kernel.h> #include <linux/sched.h> #include <linux/smp.h> +#include <linux/spinlock.h> #include <linux/debugfs.h> #include <linux/init.h> @@ -22,15 +23,15 @@ #include <asm/processor.h> #include <asm/sstep.h> #include <asm/debug.h> -#include <asm/debugfs.h> #include <asm/hvcall.h> +#include <asm/inst.h> #include <linux/uaccess.h> /* * Stores the breakpoints currently in use on each breakpoint address * register for every cpu */ -static DEFINE_PER_CPU(struct perf_event *, bp_per_reg); +static DEFINE_PER_CPU(struct perf_event *, bp_per_reg[HBP_NUM_MAX]); /* * Returns total number of data or instruction breakpoints available. @@ -38,10 +39,11 @@ static DEFINE_PER_CPU(struct perf_event *, bp_per_reg); int hw_breakpoint_slots(int type) { if (type == TYPE_DATA) - return HBP_NUM; + return nr_wp_slots(); return 0; /* no instruction breakpoints available */ } + /* * Install a perf counter breakpoint. * @@ -54,16 +56,26 @@ int hw_breakpoint_slots(int type) int arch_install_hw_breakpoint(struct perf_event *bp) { struct arch_hw_breakpoint *info = counter_arch_bp(bp); - struct perf_event **slot = this_cpu_ptr(&bp_per_reg); + struct perf_event **slot; + int i; + + for (i = 0; i < nr_wp_slots(); i++) { + slot = this_cpu_ptr(&bp_per_reg[i]); + if (!*slot) { + *slot = bp; + break; + } + } - *slot = bp; + if (WARN_ONCE(i == nr_wp_slots(), "Can't find any breakpoint slot")) + return -EBUSY; /* * Do not install DABR values if the instruction must be single-stepped. * If so, DABR will be populated in single_step_dabr_instruction(). */ - if (current->thread.last_hit_ubp != bp) - __set_breakpoint(info); + if (!info->perf_single_step) + __set_breakpoint(i, info); return 0; } @@ -79,31 +91,27 @@ int arch_install_hw_breakpoint(struct perf_event *bp) */ void arch_uninstall_hw_breakpoint(struct perf_event *bp) { - struct perf_event **slot = this_cpu_ptr(&bp_per_reg); + struct arch_hw_breakpoint null_brk = {0}; + struct perf_event **slot; + int i; + + for (i = 0; i < nr_wp_slots(); i++) { + slot = this_cpu_ptr(&bp_per_reg[i]); + if (*slot == bp) { + *slot = NULL; + break; + } + } - if (*slot != bp) { - WARN_ONCE(1, "Can't find the breakpoint"); + if (WARN_ONCE(i == nr_wp_slots(), "Can't find any breakpoint slot")) return; - } - *slot = NULL; - hw_breakpoint_disable(); + __set_breakpoint(i, &null_brk); } -/* - * Perform cleanup of arch-specific counters during unregistration - * of the perf-event - */ -void arch_unregister_hw_breakpoint(struct perf_event *bp) +static bool is_ptrace_bp(struct perf_event *bp) { - /* - * If the breakpoint is unregistered between a hw_breakpoint_handler() - * and the single_step_dabr_instruction(), then cleanup the breakpoint - * restoration variables to prevent dangling pointers. - * FIXME, this should not be using bp->ctx at all! Sayeth peterz. - */ - if (bp->ctx && bp->ctx->task && bp->ctx->task != ((void *)-1L)) - bp->ctx->task->thread.last_hit_ubp = NULL; + return bp->overflow_handler == ptrace_triggered; } /* @@ -140,10 +148,10 @@ int arch_bp_generic_fields(int type, int *gen_bp_type) * <---8 bytes---> * * In this case, we should configure hw as: - * start_addr = address & ~HW_BREAKPOINT_ALIGN + * start_addr = address & ~(HW_BREAKPOINT_SIZE - 1) * len = 16 bytes * - * @start_addr and @end_addr are inclusive. + * @start_addr is inclusive but @end_addr is exclusive. */ static int hw_breakpoint_validate_len(struct arch_hw_breakpoint *hw) { @@ -151,15 +159,19 @@ static int hw_breakpoint_validate_len(struct arch_hw_breakpoint *hw) u16 hw_len; unsigned long start_addr, end_addr; - start_addr = hw->address & ~HW_BREAKPOINT_ALIGN; - end_addr = (hw->address + hw->len - 1) | HW_BREAKPOINT_ALIGN; - hw_len = end_addr - start_addr + 1; + start_addr = ALIGN_DOWN(hw->address, HW_BREAKPOINT_SIZE); + end_addr = ALIGN(hw->address + hw->len, HW_BREAKPOINT_SIZE); + hw_len = end_addr - start_addr; if (dawr_enabled()) { max_len = DAWR_MAX_LEN; - /* DAWR region can't cross 512 bytes boundary */ - if ((start_addr >> 9) != (end_addr >> 9)) + /* DAWR region can't cross 512 bytes boundary on p10 predecessors */ + if (!cpu_has_feature(CPU_FTR_ARCH_31) && + (ALIGN_DOWN(start_addr, SZ_512) != ALIGN_DOWN(end_addr - 1, SZ_512))) return -EINVAL; + } else if (IS_ENABLED(CONFIG_PPC_8xx)) { + /* 8xx can setup a range without limitation */ + max_len = U16_MAX; } if (hw_len > max_len) @@ -208,94 +220,173 @@ int hw_breakpoint_arch_parse(struct perf_event *bp, * Restores the breakpoint on the debug registers. * Invoke this function if it is known that the execution context is * about to change to cause loss of MSR_SE settings. + * + * The perf watchpoint will simply re-trigger once the thread is started again, + * and the watchpoint handler will set up MSR_SE and perf_single_step as + * needed. */ void thread_change_pc(struct task_struct *tsk, struct pt_regs *regs) { struct arch_hw_breakpoint *info; + int i; - if (likely(!tsk->thread.last_hit_ubp)) - return; + preempt_disable(); + + for (i = 0; i < nr_wp_slots(); i++) { + struct perf_event *bp = __this_cpu_read(bp_per_reg[i]); + + if (unlikely(bp && counter_arch_bp(bp)->perf_single_step)) + goto reset; + } + goto out; + +reset: + regs_set_return_msr(regs, regs->msr & ~MSR_SE); + for (i = 0; i < nr_wp_slots(); i++) { + info = counter_arch_bp(__this_cpu_read(bp_per_reg[i])); + __set_breakpoint(i, info); + info->perf_single_step = false; + } - info = counter_arch_bp(tsk->thread.last_hit_ubp); - regs->msr &= ~MSR_SE; - __set_breakpoint(info); - tsk->thread.last_hit_ubp = NULL; +out: + preempt_enable(); } -static bool dar_within_range(unsigned long dar, struct arch_hw_breakpoint *info) +static bool is_larx_stcx_instr(int type) { - return ((info->address <= dar) && (dar - info->address < info->len)); + return type == LARX || type == STCX; } -static bool -dar_range_overlaps(unsigned long dar, int size, struct arch_hw_breakpoint *info) +static bool is_octword_vsx_instr(int type, int size) { - return ((dar <= info->address + info->len - 1) && - (dar + size - 1 >= info->address)); + return ((type == LOAD_VSX || type == STORE_VSX) && size == 32); } /* - * Handle debug exception notifications. + * We've failed in reliably handling the hw-breakpoint. Unregister + * it and throw a warning message to let the user know about it. */ -static bool stepping_handler(struct pt_regs *regs, struct perf_event *bp, - struct arch_hw_breakpoint *info) +static void handler_error(struct perf_event *bp) { - unsigned int instr = 0; - int ret, type, size; - struct instruction_op op; - unsigned long addr = info->address; - - if (__get_user_inatomic(instr, (unsigned int *)regs->nip)) - goto fail; - - ret = analyse_instr(&op, regs, instr); - type = GETTYPE(op.type); - size = GETSIZE(op.type); - - if (!ret && (type == LARX || type == STCX)) { - printk_ratelimited("Breakpoint hit on instruction that can't be emulated." - " Breakpoint at 0x%lx will be disabled.\n", addr); - goto disable; - } + WARN(1, "Unable to handle hardware breakpoint. Breakpoint at 0x%lx will be disabled.", + counter_arch_bp(bp)->address); + perf_event_disable_inatomic(bp); +} - /* - * If it's extraneous event, we still need to emulate/single- - * step the instruction, but we don't generate an event. - */ - if (size && !dar_range_overlaps(regs->dar, size, info)) - info->type |= HW_BRK_TYPE_EXTRANEOUS_IRQ; +static void larx_stcx_err(struct perf_event *bp) +{ + printk_ratelimited("Breakpoint hit on instruction that can't be emulated. Breakpoint at 0x%lx will be disabled.\n", + counter_arch_bp(bp)->address); + perf_event_disable_inatomic(bp); +} + +static bool stepping_handler(struct pt_regs *regs, struct perf_event **bp, + int *hit, ppc_inst_t instr) +{ + int i; + int stepped; /* Do not emulate user-space instructions, instead single-step them */ if (user_mode(regs)) { - current->thread.last_hit_ubp = bp; - regs->msr |= MSR_SE; + for (i = 0; i < nr_wp_slots(); i++) { + if (!hit[i]) + continue; + + counter_arch_bp(bp[i])->perf_single_step = true; + bp[i] = NULL; + } + regs_set_return_msr(regs, regs->msr | MSR_SE); return false; } - if (!emulate_step(regs, instr)) - goto fail; - + stepped = emulate_step(regs, instr); + if (!stepped) { + for (i = 0; i < nr_wp_slots(); i++) { + if (!hit[i]) + continue; + handler_error(bp[i]); + bp[i] = NULL; + } + return false; + } return true; +} + +static void handle_p10dd1_spurious_exception(struct perf_event **bp, + int *hit, unsigned long ea) +{ + int i; + unsigned long hw_end_addr; -fail: /* - * We've failed in reliably handling the hw-breakpoint. Unregister - * it and throw a warning message to let the user know about it. + * Handle spurious exception only when any bp_per_reg is set. + * Otherwise this might be created by xmon and not actually a + * spurious exception. */ - WARN(1, "Unable to handle hardware breakpoint. Breakpoint at " - "0x%lx will be disabled.", addr); + for (i = 0; i < nr_wp_slots(); i++) { + struct arch_hw_breakpoint *info; -disable: - perf_event_disable_inatomic(bp); - return false; + if (!bp[i]) + continue; + + info = counter_arch_bp(bp[i]); + + hw_end_addr = ALIGN(info->address + info->len, HW_BREAKPOINT_SIZE); + + /* + * Ending address of DAWR range is less than starting + * address of op. + */ + if ((hw_end_addr - 1) >= ea) + continue; + + /* + * Those addresses need to be in the same or in two + * consecutive 512B blocks; + */ + if (((hw_end_addr - 1) >> 10) != (ea >> 10)) + continue; + + /* + * 'op address + 64B' generates an address that has a + * carry into bit 52 (crosses 2K boundary). + */ + if ((ea & 0x800) == ((ea + 64) & 0x800)) + continue; + + break; + } + + if (i == nr_wp_slots()) + return; + + for (i = 0; i < nr_wp_slots(); i++) { + if (bp[i]) { + hit[i] = 1; + counter_arch_bp(bp[i])->type |= HW_BRK_TYPE_EXTRANEOUS_IRQ; + } + } } +/* + * Handle a DABR or DAWR exception. + * + * Called in atomic context. + */ int hw_breakpoint_handler(struct die_args *args) { + bool err = false; int rc = NOTIFY_STOP; - struct perf_event *bp; + struct perf_event *bp[HBP_NUM_MAX] = { NULL }; struct pt_regs *regs = args->regs; - struct arch_hw_breakpoint *info; + int i; + int hit[HBP_NUM_MAX] = {0}; + int nr_hit = 0; + bool ptrace_bp = false; + ppc_inst_t instr = ppc_inst(0); + int type = 0; + int size = 0; + unsigned long ea = 0; /* Disable breakpoints during exception handling */ hw_breakpoint_disable(); @@ -308,12 +399,48 @@ int hw_breakpoint_handler(struct die_args *args) */ rcu_read_lock(); - bp = __this_cpu_read(bp_per_reg); - if (!bp) { - rc = NOTIFY_DONE; - goto out; + if (!IS_ENABLED(CONFIG_PPC_8xx)) + wp_get_instr_detail(regs, &instr, &type, &size, &ea); + + for (i = 0; i < nr_wp_slots(); i++) { + struct arch_hw_breakpoint *info; + + bp[i] = __this_cpu_read(bp_per_reg[i]); + if (!bp[i]) + continue; + + info = counter_arch_bp(bp[i]); + info->type &= ~HW_BRK_TYPE_EXTRANEOUS_IRQ; + + if (wp_check_constraints(regs, instr, ea, type, size, info)) { + if (!IS_ENABLED(CONFIG_PPC_8xx) && + ppc_inst_equal(instr, ppc_inst(0))) { + handler_error(bp[i]); + bp[i] = NULL; + err = 1; + continue; + } + + if (is_ptrace_bp(bp[i])) + ptrace_bp = true; + hit[i] = 1; + nr_hit++; + } + } + + if (err) + goto reset; + + if (!nr_hit) { + /* Workaround for Power10 DD1 */ + if (!IS_ENABLED(CONFIG_PPC_8xx) && mfspr(SPRN_PVR) == 0x800100 && + is_octword_vsx_instr(type, size)) { + handle_p10dd1_spurious_exception(bp, hit, ea); + } else { + rc = NOTIFY_DONE; + goto out; + } } - info = counter_arch_bp(bp); /* * Return early after invoking user-callback function without restoring @@ -321,29 +448,50 @@ int hw_breakpoint_handler(struct die_args *args) * one-shot mode. The ptrace-ed process will receive the SIGTRAP signal * generated in do_dabr(). */ - if (bp->overflow_handler == ptrace_triggered) { - perf_bp_event(bp, regs); + if (ptrace_bp) { + for (i = 0; i < nr_wp_slots(); i++) { + if (!hit[i] || !is_ptrace_bp(bp[i])) + continue; + perf_bp_event(bp[i], regs); + bp[i] = NULL; + } rc = NOTIFY_DONE; - goto out; + goto reset; } - info->type &= ~HW_BRK_TYPE_EXTRANEOUS_IRQ; - if (IS_ENABLED(CONFIG_PPC_8xx)) { - if (!dar_within_range(regs->dar, info)) - info->type |= HW_BRK_TYPE_EXTRANEOUS_IRQ; - } else { - if (!stepping_handler(regs, bp, info)) - goto out; + if (!IS_ENABLED(CONFIG_PPC_8xx)) { + if (is_larx_stcx_instr(type)) { + for (i = 0; i < nr_wp_slots(); i++) { + if (!hit[i]) + continue; + larx_stcx_err(bp[i]); + bp[i] = NULL; + } + goto reset; + } + + if (!stepping_handler(regs, bp, hit, instr)) + goto reset; } /* * As a policy, the callback is invoked in a 'trigger-after-execute' * fashion */ - if (!(info->type & HW_BRK_TYPE_EXTRANEOUS_IRQ)) - perf_bp_event(bp, regs); + for (i = 0; i < nr_wp_slots(); i++) { + if (!hit[i]) + continue; + if (!(counter_arch_bp(bp[i])->type & HW_BRK_TYPE_EXTRANEOUS_IRQ)) + perf_bp_event(bp[i], regs); + } + +reset: + for (i = 0; i < nr_wp_slots(); i++) { + if (!bp[i]) + continue; + __set_breakpoint(i, counter_arch_bp(bp[i])); + } - __set_breakpoint(info); out: rcu_read_unlock(); return rc; @@ -352,38 +500,51 @@ NOKPROBE_SYMBOL(hw_breakpoint_handler); /* * Handle single-step exceptions following a DABR hit. + * + * Called in atomic context. */ static int single_step_dabr_instruction(struct die_args *args) { struct pt_regs *regs = args->regs; - struct perf_event *bp = NULL; - struct arch_hw_breakpoint *info; + bool found = false; - bp = current->thread.last_hit_ubp; /* * Check if we are single-stepping as a result of a * previous HW Breakpoint exception */ - if (!bp) - return NOTIFY_DONE; + for (int i = 0; i < nr_wp_slots(); i++) { + struct perf_event *bp; + struct arch_hw_breakpoint *info; - info = counter_arch_bp(bp); + bp = __this_cpu_read(bp_per_reg[i]); - /* - * We shall invoke the user-defined callback function in the single - * stepping handler to confirm to 'trigger-after-execute' semantics - */ - if (!(info->type & HW_BRK_TYPE_EXTRANEOUS_IRQ)) - perf_bp_event(bp, regs); + if (!bp) + continue; + + info = counter_arch_bp(bp); + + if (!info->perf_single_step) + continue; - __set_breakpoint(info); - current->thread.last_hit_ubp = NULL; + found = true; + + /* + * We shall invoke the user-defined callback function in the + * single stepping handler to confirm to 'trigger-after-execute' + * semantics + */ + if (!(info->type & HW_BRK_TYPE_EXTRANEOUS_IRQ)) + perf_bp_event(bp, regs); + + info->perf_single_step = false; + __set_breakpoint(i, counter_arch_bp(bp)); + } /* * If the process was being single-stepped by ptrace, let the * other single-step actions occur (e.g. generate SIGTRAP). */ - if (test_thread_flag(TIF_SINGLESTEP)) + if (!found || test_thread_flag(TIF_SINGLESTEP)) return NOTIFY_DONE; return NOTIFY_STOP; @@ -392,6 +553,8 @@ NOKPROBE_SYMBOL(single_step_dabr_instruction); /* * Handle debug exception notifications. + * + * Called in atomic context. */ int hw_breakpoint_exceptions_notify( struct notifier_block *unused, unsigned long val, void *data) @@ -416,13 +579,32 @@ NOKPROBE_SYMBOL(hw_breakpoint_exceptions_notify); */ void flush_ptrace_hw_breakpoint(struct task_struct *tsk) { + int i; struct thread_struct *t = &tsk->thread; - unregister_hw_breakpoint(t->ptrace_bps[0]); - t->ptrace_bps[0] = NULL; + for (i = 0; i < nr_wp_slots(); i++) { + unregister_hw_breakpoint(t->ptrace_bps[i]); + t->ptrace_bps[i] = NULL; + } } void hw_breakpoint_pmu_read(struct perf_event *bp) { /* TODO */ } + +void ptrace_triggered(struct perf_event *bp, + struct perf_sample_data *data, struct pt_regs *regs) +{ + struct perf_event_attr attr; + + /* + * Disable the breakpoint request here since ptrace has defined a + * one-shot behaviour for breakpoint exceptions in PPC64. + * The SIGTRAP signal is generated automatically for us in do_dabr(). + * We don't have to do anything about that here + */ + attr = bp->attr; + attr.disabled = true; + modify_user_hw_breakpoint(bp, &attr); +} diff --git a/arch/powerpc/kernel/hw_breakpoint_constraints.c b/arch/powerpc/kernel/hw_breakpoint_constraints.c new file mode 100644 index 000000000000..9e51801c4915 --- /dev/null +++ b/arch/powerpc/kernel/hw_breakpoint_constraints.c @@ -0,0 +1,158 @@ +// SPDX-License-Identifier: GPL-2.0+ +#include <linux/kernel.h> +#include <linux/uaccess.h> +#include <linux/sched.h> +#include <asm/hw_breakpoint.h> +#include <asm/sstep.h> +#include <asm/cache.h> + +static bool dar_in_user_range(unsigned long dar, struct arch_hw_breakpoint *info) +{ + return ((info->address <= dar) && (dar - info->address < info->len)); +} + +static bool ea_user_range_overlaps(unsigned long ea, int size, + struct arch_hw_breakpoint *info) +{ + return ((ea < info->address + info->len) && + (ea + size > info->address)); +} + +static bool dar_in_hw_range(unsigned long dar, struct arch_hw_breakpoint *info) +{ + unsigned long hw_start_addr, hw_end_addr; + + hw_start_addr = ALIGN_DOWN(info->address, HW_BREAKPOINT_SIZE); + hw_end_addr = ALIGN(info->address + info->len, HW_BREAKPOINT_SIZE); + + return ((hw_start_addr <= dar) && (hw_end_addr > dar)); +} + +static bool ea_hw_range_overlaps(unsigned long ea, int size, + struct arch_hw_breakpoint *info) +{ + unsigned long hw_start_addr, hw_end_addr; + unsigned long align_size = HW_BREAKPOINT_SIZE; + + /* + * On p10 predecessors, quadword is handle differently then + * other instructions. + */ + if (!cpu_has_feature(CPU_FTR_ARCH_31) && size == 16) + align_size = HW_BREAKPOINT_SIZE_QUADWORD; + + hw_start_addr = ALIGN_DOWN(info->address, align_size); + hw_end_addr = ALIGN(info->address + info->len, align_size); + + return ((ea < hw_end_addr) && (ea + size > hw_start_addr)); +} + +/* + * If hw has multiple DAWR registers, we also need to check all + * dawrx constraint bits to confirm this is _really_ a valid event. + * If type is UNKNOWN, but privilege level matches, consider it as + * a positive match. + */ +static bool check_dawrx_constraints(struct pt_regs *regs, int type, + struct arch_hw_breakpoint *info) +{ + if (OP_IS_LOAD(type) && !(info->type & HW_BRK_TYPE_READ)) + return false; + + /* + * The Cache Management instructions other than dcbz never + * cause a match. i.e. if type is CACHEOP, the instruction + * is dcbz, and dcbz is treated as Store. + */ + if ((OP_IS_STORE(type) || type == CACHEOP) && !(info->type & HW_BRK_TYPE_WRITE)) + return false; + + if (is_kernel_addr(regs->nip) && !(info->type & HW_BRK_TYPE_KERNEL)) + return false; + + if (user_mode(regs) && !(info->type & HW_BRK_TYPE_USER)) + return false; + + return true; +} + +/* + * Return true if the event is valid wrt dawr configuration, + * including extraneous exception. Otherwise return false. + */ +bool wp_check_constraints(struct pt_regs *regs, ppc_inst_t instr, + unsigned long ea, int type, int size, + struct arch_hw_breakpoint *info) +{ + bool in_user_range = dar_in_user_range(regs->dar, info); + bool dawrx_constraints; + + /* + * 8xx supports only one breakpoint and thus we can + * unconditionally return true. + */ + if (IS_ENABLED(CONFIG_PPC_8xx)) { + if (!in_user_range) + info->type |= HW_BRK_TYPE_EXTRANEOUS_IRQ; + return true; + } + + if (unlikely(ppc_inst_equal(instr, ppc_inst(0)))) { + if (cpu_has_feature(CPU_FTR_ARCH_31) && + !dar_in_hw_range(regs->dar, info)) + return false; + + return true; + } + + dawrx_constraints = check_dawrx_constraints(regs, type, info); + + if (type == UNKNOWN) { + if (cpu_has_feature(CPU_FTR_ARCH_31) && + !dar_in_hw_range(regs->dar, info)) + return false; + + return dawrx_constraints; + } + + if (ea_user_range_overlaps(ea, size, info)) + return dawrx_constraints; + + if (ea_hw_range_overlaps(ea, size, info)) { + if (dawrx_constraints) { + info->type |= HW_BRK_TYPE_EXTRANEOUS_IRQ; + return true; + } + } + return false; +} + +void wp_get_instr_detail(struct pt_regs *regs, ppc_inst_t *instr, + int *type, int *size, unsigned long *ea) +{ + struct instruction_op op; + int err; + + pagefault_disable(); + err = __get_user_instr(*instr, (void __user *)regs->nip); + pagefault_enable(); + + if (err) + return; + + analyse_instr(&op, regs, *instr); + *type = GETTYPE(op.type); + *ea = op.ea; + + if (!(regs->msr & MSR_64BIT)) + *ea &= 0xffffffffUL; + + + *size = GETSIZE(op.type); + if (*type == CACHEOP) { + *size = l1_dcache_bytes(); + *ea &= ~(*size - 1); + } else if (*type == LOAD_VMX || *type == STORE_VMX) { + *ea &= ~(*size - 1); + } +} diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c index a36fd053c3db..30b56c67fa61 100644 --- a/arch/powerpc/kernel/idle.c +++ b/arch/powerpc/kernel/idle.c @@ -37,18 +37,10 @@ static int __init powersave_off(char *arg) { ppc_md.power_save = NULL; cpuidle_disable = IDLE_POWERSAVE_OFF; - return 0; + return 1; } __setup("powersave=off", powersave_off); -#ifdef CONFIG_HOTPLUG_CPU -void arch_cpu_idle_dead(void) -{ - sched_preempt_enable_no_resched(); - cpu_die(); -} -#endif - void arch_cpu_idle(void) { ppc64_runlatch_off(); @@ -59,10 +51,9 @@ void arch_cpu_idle(void) * Some power_save functions return with * interrupts enabled, some don't. */ - if (irqs_disabled()) - local_irq_enable(); + if (!irqs_disabled()) + raw_local_irq_disable(); } else { - local_irq_enable(); /* * Go into low thread priority and possibly * low power mode. @@ -77,6 +68,31 @@ void arch_cpu_idle(void) int powersave_nap; +#ifdef CONFIG_PPC_970_NAP +void power4_idle(void) +{ + if (!cpu_has_feature(CPU_FTR_CAN_NAP)) + return; + + if (!powersave_nap) + return; + + if (!prep_irq_for_idle()) + return; + + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + asm volatile(PPC_DSSALL " ; sync" ::: "memory"); + + power4_idle_nap(); + + /* + * power4_idle_nap returns with interrupts enabled (soft and hard). + * to our caller with interrupts enabled (soft and hard). Our caller + * can cope with either interrupts disabled or enabled upon return. + */ +} +#endif + #ifdef CONFIG_SYSCTL /* * Register the sysctl to set/clear powersave_nap. @@ -89,21 +105,12 @@ static struct ctl_table powersave_nap_ctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - {} -}; -static struct ctl_table powersave_nap_sysctl_root[] = { - { - .procname = "kernel", - .mode = 0555, - .child = powersave_nap_ctl_table, - }, - {} }; static int __init register_powersave_nap_sysctl(void) { - register_sysctl_table(powersave_nap_sysctl_root); + register_sysctl("kernel", powersave_nap_ctl_table); return 0; } diff --git a/arch/powerpc/kernel/idle_book3e.S b/arch/powerpc/kernel/idle_64e.S index cc008de58b05..0fc680e03dee 100644 --- a/arch/powerpc/kernel/idle_book3e.S +++ b/arch/powerpc/kernel/idle_64e.S @@ -2,7 +2,7 @@ /* * Copyright 2010 IBM Corp, Benjamin Herrenschmidt <benh@kernel.crashing.org> * - * Generic idle routine for Book3E processors + * Generic idle routine for 64 bits e500 processors */ #include <linux/threads.h> @@ -16,8 +16,6 @@ #include <asm/hw_irq.h> /* 64-bit version only for now */ -#ifdef CONFIG_PPC64 - .macro BOOK3E_IDLE name loop _GLOBAL(\name) /* Save LR for later */ @@ -77,7 +75,7 @@ _GLOBAL(\name) .macro BOOK3E_IDLE_LOOP 1: - PPC_WAIT(0) + PPC_WAIT_v203 b 1b .endm @@ -98,6 +96,4 @@ epapr_ev_idle_start: BOOK3E_IDLE epapr_ev_idle EPAPR_EV_IDLE_LOOP -BOOK3E_IDLE book3e_idle BOOK3E_IDLE_LOOP - -#endif /* CONFIG_PPC64 */ +BOOK3E_IDLE e500_idle BOOK3E_IDLE_LOOP diff --git a/arch/powerpc/kernel/idle_6xx.S b/arch/powerpc/kernel/idle_6xx.S index 0ffdd18b9f26..3c097356366b 100644 --- a/arch/powerpc/kernel/idle_6xx.S +++ b/arch/powerpc/kernel/idle_6xx.S @@ -129,7 +129,7 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFCLR(CPU_FTR_NO_DPM) mtspr SPRN_HID0,r4 BEGIN_FTR_SECTION - DSSALL + PPC_DSSALL sync END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) lwz r8,TI_LOCAL_FLAGS(r2) /* set napping bit */ @@ -145,9 +145,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) /* * Return from NAP/DOZE mode, restore some CPU specific registers, - * we are called with DR/IR still off and r2 containing physical - * address of current. R11 points to the exception frame (physical - * address). We have to preserve r10. + * R11 points to the exception frame. We have to preserve r10. */ _GLOBAL(power_save_ppc32_restore) lwz r9,_LINK(r11) /* interrupted in ppc6xx_idle: */ @@ -166,7 +164,7 @@ BEGIN_FTR_SECTION mfspr r9,SPRN_HID0 andis. r9,r9,HID0_NAP@h beq 1f - addis r9,r11,(nap_save_msscr0-KERNELBASE)@ha + addis r9, r11, nap_save_msscr0@ha lwz r9,nap_save_msscr0@l(r9) mtspr SPRN_MSSCR0, r9 sync @@ -174,11 +172,12 @@ BEGIN_FTR_SECTION 1: END_FTR_SECTION_IFSET(CPU_FTR_NAP_DISABLE_L2_PR) BEGIN_FTR_SECTION - addis r9,r11,(nap_save_hid1-KERNELBASE)@ha + addis r9, r11, nap_save_hid1@ha lwz r9,nap_save_hid1@l(r9) mtspr SPRN_HID1, r9 END_FTR_SECTION_IFSET(CPU_FTR_DUAL_PLL_750FX) - b transfer_to_handler_cont + blr +_ASM_NOKPROBE_SYMBOL(power_save_ppc32_restore) .data diff --git a/arch/powerpc/kernel/idle_e500.S b/arch/powerpc/kernel/idle_85xx.S index 308f499e146c..9e1bc4502c50 100644 --- a/arch/powerpc/kernel/idle_e500.S +++ b/arch/powerpc/kernel/idle_85xx.S @@ -74,19 +74,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP) /* * Return from NAP/DOZE mode, restore some CPU specific registers, - * r2 containing physical address of current. - * r11 points to the exception frame (physical address). + * r2 containing address of current. + * r11 points to the exception frame. * We have to preserve r10. */ _GLOBAL(power_save_ppc32_restore) lwz r9,_LINK(r11) /* interrupted in e500_idle */ stw r9,_NIP(r11) /* make it do a blr */ - -#ifdef CONFIG_SMP - lwz r11,TASK_CPU(r2) /* get cpu number * 4 */ - slwi r11,r11,2 -#else - li r11,0 -#endif - - b transfer_to_handler_cont + blr +_ASM_NOKPROBE_SYMBOL(power_save_ppc32_restore) diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index d32751994a62..3d97fb833834 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -15,7 +15,9 @@ #include <asm/asm-offsets.h> #include <asm/ppc-opcode.h> #include <asm/cpuidle.h> +#include <asm/thread_info.h> /* TLF_NAPPING */ +#ifdef CONFIG_PPC_P7_NAP /* * Desired PSSCR in r3 * @@ -50,28 +52,32 @@ _GLOBAL(isa300_idle_stop_mayloss) std r1,PACAR1(r13) mflr r4 mfcr r5 - /* use stack red zone rather than a new frame for saving regs */ - std r2,-8*0(r1) - std r14,-8*1(r1) - std r15,-8*2(r1) - std r16,-8*3(r1) - std r17,-8*4(r1) - std r18,-8*5(r1) - std r19,-8*6(r1) - std r20,-8*7(r1) - std r21,-8*8(r1) - std r22,-8*9(r1) - std r23,-8*10(r1) - std r24,-8*11(r1) - std r25,-8*12(r1) - std r26,-8*13(r1) - std r27,-8*14(r1) - std r28,-8*15(r1) - std r29,-8*16(r1) - std r30,-8*17(r1) - std r31,-8*18(r1) - std r4,-8*19(r1) - std r5,-8*20(r1) + /* + * Use the stack red zone rather than a new frame for saving regs since + * in the case of no GPR loss the wakeup code branches directly back to + * the caller without deallocating the stack frame first. + */ + std r2,-8*1(r1) + std r14,-8*2(r1) + std r15,-8*3(r1) + std r16,-8*4(r1) + std r17,-8*5(r1) + std r18,-8*6(r1) + std r19,-8*7(r1) + std r20,-8*8(r1) + std r21,-8*9(r1) + std r22,-8*10(r1) + std r23,-8*11(r1) + std r24,-8*12(r1) + std r25,-8*13(r1) + std r26,-8*14(r1) + std r27,-8*15(r1) + std r28,-8*16(r1) + std r29,-8*17(r1) + std r30,-8*18(r1) + std r31,-8*19(r1) + std r4,-8*20(r1) + std r5,-8*21(r1) /* 168 bytes */ PPC_STOP b . /* catch bugs */ @@ -87,8 +93,8 @@ _GLOBAL(isa300_idle_stop_mayloss) */ _GLOBAL(idle_return_gpr_loss) ld r1,PACAR1(r13) - ld r4,-8*19(r1) - ld r5,-8*20(r1) + ld r4,-8*20(r1) + ld r5,-8*21(r1) mtlr r4 mtcr r5 /* @@ -96,38 +102,40 @@ _GLOBAL(idle_return_gpr_loss) * from PACATOC. This could be avoided for that less common case * if KVM saved its r2. */ - ld r2,-8*0(r1) - ld r14,-8*1(r1) - ld r15,-8*2(r1) - ld r16,-8*3(r1) - ld r17,-8*4(r1) - ld r18,-8*5(r1) - ld r19,-8*6(r1) - ld r20,-8*7(r1) - ld r21,-8*8(r1) - ld r22,-8*9(r1) - ld r23,-8*10(r1) - ld r24,-8*11(r1) - ld r25,-8*12(r1) - ld r26,-8*13(r1) - ld r27,-8*14(r1) - ld r28,-8*15(r1) - ld r29,-8*16(r1) - ld r30,-8*17(r1) - ld r31,-8*18(r1) + ld r2,-8*1(r1) + ld r14,-8*2(r1) + ld r15,-8*3(r1) + ld r16,-8*4(r1) + ld r17,-8*5(r1) + ld r18,-8*6(r1) + ld r19,-8*7(r1) + ld r20,-8*8(r1) + ld r21,-8*9(r1) + ld r22,-8*10(r1) + ld r23,-8*11(r1) + ld r24,-8*12(r1) + ld r25,-8*13(r1) + ld r26,-8*14(r1) + ld r27,-8*15(r1) + ld r28,-8*16(r1) + ld r29,-8*17(r1) + ld r30,-8*18(r1) + ld r31,-8*19(r1) blr /* * This is the sequence required to execute idle instructions, as * specified in ISA v2.07 (and earlier). MSR[IR] and MSR[DR] must be 0. - * - * The 0(r1) slot is used to save r2 in isa206, so use that here. + * We have to store a GPR somewhere, ptesync, then reload it, and create + * a false dependency on the result of the load. It doesn't matter which + * GPR we store, or where we store it. We have already stored r2 to the + * stack at -8(r1) in isa206_idle_insn_mayloss, so use that. */ #define IDLE_STATE_ENTER_SEQ_NORET(IDLE_INST) \ /* Magic NAP/SLEEP/WINKLE mode enter sequence */ \ - std r2,0(r1); \ + std r2,-8(r1); \ ptesync; \ - ld r2,0(r1); \ + ld r2,-8(r1); \ 236: cmpd cr0,r2,r2; \ bne 236b; \ IDLE_INST; \ @@ -152,28 +160,32 @@ _GLOBAL(isa206_idle_insn_mayloss) std r1,PACAR1(r13) mflr r4 mfcr r5 - /* use stack red zone rather than a new frame for saving regs */ - std r2,-8*0(r1) - std r14,-8*1(r1) - std r15,-8*2(r1) - std r16,-8*3(r1) - std r17,-8*4(r1) - std r18,-8*5(r1) - std r19,-8*6(r1) - std r20,-8*7(r1) - std r21,-8*8(r1) - std r22,-8*9(r1) - std r23,-8*10(r1) - std r24,-8*11(r1) - std r25,-8*12(r1) - std r26,-8*13(r1) - std r27,-8*14(r1) - std r28,-8*15(r1) - std r29,-8*16(r1) - std r30,-8*17(r1) - std r31,-8*18(r1) - std r4,-8*19(r1) - std r5,-8*20(r1) + /* + * Use the stack red zone rather than a new frame for saving regs since + * in the case of no GPR loss the wakeup code branches directly back to + * the caller without deallocating the stack frame first. + */ + std r2,-8*1(r1) + std r14,-8*2(r1) + std r15,-8*3(r1) + std r16,-8*4(r1) + std r17,-8*5(r1) + std r18,-8*6(r1) + std r19,-8*7(r1) + std r20,-8*8(r1) + std r21,-8*9(r1) + std r22,-8*10(r1) + std r23,-8*11(r1) + std r24,-8*12(r1) + std r25,-8*13(r1) + std r26,-8*14(r1) + std r27,-8*15(r1) + std r28,-8*16(r1) + std r29,-8*17(r1) + std r30,-8*18(r1) + std r31,-8*19(r1) + std r4,-8*20(r1) + std r5,-8*21(r1) cmpwi r3,PNV_THREAD_NAP bne 1f IDLE_STATE_ENTER_SEQ_NORET(PPC_NAP) @@ -181,4 +193,26 @@ _GLOBAL(isa206_idle_insn_mayloss) bne 2f IDLE_STATE_ENTER_SEQ_NORET(PPC_SLEEP) 2: IDLE_STATE_ENTER_SEQ_NORET(PPC_WINKLE) +#endif +#ifdef CONFIG_PPC_970_NAP +_GLOBAL(power4_idle_nap) + LOAD_REG_IMMEDIATE(r7, MSR_KERNEL|MSR_EE|MSR_POW) + ld r9,PACA_THREAD_INFO(r13) + ld r8,TI_LOCAL_FLAGS(r9) + ori r8,r8,_TLF_NAPPING + std r8,TI_LOCAL_FLAGS(r9) + /* + * NAPPING bit is set, from this point onward power4_fixup_nap + * will cause exceptions to return to power4_idle_nap_return. + */ +1: sync + isync + mtmsrd r7 + isync + b 1b + + .globl power4_idle_nap_return +power4_idle_nap_return: + blr +#endif diff --git a/arch/powerpc/kernel/idle_power4.S b/arch/powerpc/kernel/idle_power4.S deleted file mode 100644 index 33c625329078..000000000000 --- a/arch/powerpc/kernel/idle_power4.S +++ /dev/null @@ -1,83 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * This file contains the power_save function for 970-family CPUs. - */ - -#include <linux/threads.h> -#include <asm/processor.h> -#include <asm/page.h> -#include <asm/cputable.h> -#include <asm/thread_info.h> -#include <asm/ppc_asm.h> -#include <asm/asm-offsets.h> -#include <asm/irqflags.h> -#include <asm/hw_irq.h> -#include <asm/feature-fixups.h> - -#undef DEBUG - - .text - -_GLOBAL(power4_idle) -BEGIN_FTR_SECTION - blr -END_FTR_SECTION_IFCLR(CPU_FTR_CAN_NAP) - /* Now check if user or arch enabled NAP mode */ - LOAD_REG_ADDRBASE(r3,powersave_nap) - lwz r4,ADDROFF(powersave_nap)(r3) - cmpwi 0,r4,0 - beqlr - - /* This sequence is similar to prep_irq_for_idle() */ - - /* Hard disable interrupts */ - mfmsr r7 - rldicl r0,r7,48,1 - rotldi r0,r0,16 - mtmsrd r0,1 - - /* Check if something happened while soft-disabled */ - lbz r0,PACAIRQHAPPENED(r13) - cmpwi cr0,r0,0 - bne- 2f - - /* - * Soft-enable interrupts. This will make power4_fixup_nap return - * to our caller with interrupts enabled (soft and hard). The caller - * can cope with either interrupts disabled or enabled upon return. - */ -#ifdef CONFIG_TRACE_IRQFLAGS - /* Tell the tracer interrupts are on, because idle responds to them. */ - mflr r0 - std r0,16(r1) - stdu r1,-128(r1) - bl trace_hardirqs_on - addi r1,r1,128 - ld r0,16(r1) - mtlr r0 - mfmsr r7 -#endif /* CONFIG_TRACE_IRQFLAGS */ - - li r0,IRQS_ENABLED - stb r0,PACAIRQSOFTMASK(r13) /* we'll hard-enable shortly */ -BEGIN_FTR_SECTION - DSSALL - sync -END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) - ld r9, PACA_THREAD_INFO(r13) - ld r8,TI_LOCAL_FLAGS(r9) /* set napping bit */ - ori r8,r8,_TLF_NAPPING /* so when we take an exception */ - std r8,TI_LOCAL_FLAGS(r9) /* it will return to our caller */ - ori r7,r7,MSR_EE - oris r7,r7,MSR_POW@h -1: sync - isync - mtmsrd r7 - isync - b 1b - -2: /* Return if an interrupt had happened while soft disabled */ - /* Set the HARD_DIS flag because interrupts are now hard disabled */ - ori r0,r0,PACA_IRQ_HARD_DIS - stb r0,PACAIRQHAPPENED(r13) - blr diff --git a/arch/powerpc/kernel/ima_arch.c b/arch/powerpc/kernel/ima_arch.c index e34116255ced..b7029beed847 100644 --- a/arch/powerpc/kernel/ima_arch.c +++ b/arch/powerpc/kernel/ima_arch.c @@ -19,13 +19,13 @@ bool arch_ima_get_secureboot(void) * to be stored as an xattr or as an appended signature. * * To avoid duplicate signature verification as much as possible, the IMA - * policy rule for module appraisal is added only if CONFIG_MODULE_SIG_FORCE + * policy rule for module appraisal is added only if CONFIG_MODULE_SIG * is not enabled. */ static const char *const secure_rules[] = { - "appraise func=KEXEC_KERNEL_CHECK appraise_flag=check_blacklist appraise_type=imasig|modsig", -#ifndef CONFIG_MODULE_SIG_FORCE - "appraise func=MODULE_CHECK appraise_flag=check_blacklist appraise_type=imasig|modsig", + "appraise func=KEXEC_KERNEL_CHECK appraise_type=imasig|modsig", +#ifndef CONFIG_MODULE_SIG + "appraise func=MODULE_CHECK appraise_type=imasig|modsig", #endif NULL }; @@ -49,9 +49,9 @@ static const char *const trusted_rules[] = { static const char *const secure_and_trusted_rules[] = { "measure func=KEXEC_KERNEL_CHECK template=ima-modsig", "measure func=MODULE_CHECK template=ima-modsig", - "appraise func=KEXEC_KERNEL_CHECK appraise_flag=check_blacklist appraise_type=imasig|modsig", -#ifndef CONFIG_MODULE_SIG_FORCE - "appraise func=MODULE_CHECK appraise_flag=check_blacklist appraise_type=imasig|modsig", + "appraise func=KEXEC_KERNEL_CHECK appraise_type=imasig|modsig", +#ifndef CONFIG_MODULE_SIG + "appraise func=MODULE_CHECK appraise_type=imasig|modsig", #endif NULL }; diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c new file mode 100644 index 000000000000..eca293794a1e --- /dev/null +++ b/arch/powerpc/kernel/interrupt.c @@ -0,0 +1,505 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/context_tracking.h> +#include <linux/err.h> +#include <linux/compat.h> +#include <linux/rseq.h> +#include <linux/sched/debug.h> /* for show_regs */ + +#include <asm/kup.h> +#include <asm/cputime.h> +#include <asm/hw_irq.h> +#include <asm/interrupt.h> +#include <asm/kprobes.h> +#include <asm/paca.h> +#include <asm/ptrace.h> +#include <asm/reg.h> +#include <asm/signal.h> +#include <asm/switch_to.h> +#include <asm/syscall.h> +#include <asm/time.h> +#include <asm/tm.h> +#include <asm/unistd.h> + +#if defined(CONFIG_PPC_ADV_DEBUG_REGS) && defined(CONFIG_PPC32) +unsigned long global_dbcr0[NR_CPUS]; +#endif + +#ifdef CONFIG_PPC_BOOK3S_64 +DEFINE_STATIC_KEY_FALSE(interrupt_exit_not_reentrant); +static inline bool exit_must_hard_disable(void) +{ + return static_branch_unlikely(&interrupt_exit_not_reentrant); +} +#else +static inline bool exit_must_hard_disable(void) +{ + return true; +} +#endif + +/* + * local irqs must be disabled. Returns false if the caller must re-enable + * them, check for new work, and try again. + * + * This should be called with local irqs disabled, but if they were previously + * enabled when the interrupt handler returns (indicating a process-context / + * synchronous interrupt) then irqs_enabled should be true. + * + * restartable is true then EE/RI can be left on because interrupts are handled + * with a restart sequence. + */ +static notrace __always_inline bool prep_irq_for_enabled_exit(bool restartable) +{ + bool must_hard_disable = (exit_must_hard_disable() || !restartable); + + /* This must be done with RI=1 because tracing may touch vmaps */ + trace_hardirqs_on(); + + if (must_hard_disable) + __hard_EE_RI_disable(); + +#ifdef CONFIG_PPC64 + /* This pattern matches prep_irq_for_idle */ + if (unlikely(lazy_irq_pending_nocheck())) { + if (must_hard_disable) { + local_paca->irq_happened |= PACA_IRQ_HARD_DIS; + __hard_RI_enable(); + } + trace_hardirqs_off(); + + return false; + } +#endif + return true; +} + +static notrace void booke_load_dbcr0(void) +{ +#ifdef CONFIG_PPC_ADV_DEBUG_REGS + unsigned long dbcr0 = current->thread.debug.dbcr0; + + if (likely(!(dbcr0 & DBCR0_IDM))) + return; + + /* + * Check to see if the dbcr0 register is set up to debug. + * Use the internal debug mode bit to do this. + */ + mtmsr(mfmsr() & ~MSR_DE); + if (IS_ENABLED(CONFIG_PPC32)) { + isync(); + global_dbcr0[smp_processor_id()] = mfspr(SPRN_DBCR0); + } + mtspr(SPRN_DBCR0, dbcr0); + mtspr(SPRN_DBSR, -1); +#endif +} + +static notrace void check_return_regs_valid(struct pt_regs *regs) +{ +#ifdef CONFIG_PPC_BOOK3S_64 + unsigned long trap, srr0, srr1; + static bool warned; + u8 *validp; + char *h; + + if (trap_is_scv(regs)) + return; + + trap = TRAP(regs); + // EE in HV mode sets HSRRs like 0xea0 + if (cpu_has_feature(CPU_FTR_HVMODE) && trap == INTERRUPT_EXTERNAL) + trap = 0xea0; + + switch (trap) { + case 0x980: + case INTERRUPT_H_DATA_STORAGE: + case 0xe20: + case 0xe40: + case INTERRUPT_HMI: + case 0xe80: + case 0xea0: + case INTERRUPT_H_FAC_UNAVAIL: + case 0x1200: + case 0x1500: + case 0x1600: + case 0x1800: + validp = &local_paca->hsrr_valid; + if (!READ_ONCE(*validp)) + return; + + srr0 = mfspr(SPRN_HSRR0); + srr1 = mfspr(SPRN_HSRR1); + h = "H"; + + break; + default: + validp = &local_paca->srr_valid; + if (!READ_ONCE(*validp)) + return; + + srr0 = mfspr(SPRN_SRR0); + srr1 = mfspr(SPRN_SRR1); + h = ""; + break; + } + + if (srr0 == regs->nip && srr1 == regs->msr) + return; + + /* + * A NMI / soft-NMI interrupt may have come in after we found + * srr_valid and before the SRRs are loaded. The interrupt then + * comes in and clobbers SRRs and clears srr_valid. Then we load + * the SRRs here and test them above and find they don't match. + * + * Test validity again after that, to catch such false positives. + * + * This test in general will have some window for false negatives + * and may not catch and fix all such cases if an NMI comes in + * later and clobbers SRRs without clearing srr_valid, but hopefully + * such things will get caught most of the time, statistically + * enough to be able to get a warning out. + */ + if (!READ_ONCE(*validp)) + return; + + if (!data_race(warned)) { + data_race(warned = true); + printk("%sSRR0 was: %lx should be: %lx\n", h, srr0, regs->nip); + printk("%sSRR1 was: %lx should be: %lx\n", h, srr1, regs->msr); + show_regs(regs); + } + + WRITE_ONCE(*validp, 0); /* fixup */ +#endif +} + +static notrace unsigned long +interrupt_exit_user_prepare_main(unsigned long ret, struct pt_regs *regs) +{ + unsigned long ti_flags; + +again: + ti_flags = read_thread_flags(); + while (unlikely(ti_flags & (_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM))) { + local_irq_enable(); + if (ti_flags & _TIF_NEED_RESCHED) { + schedule(); + } else { + /* + * SIGPENDING must restore signal handler function + * argument GPRs, and some non-volatiles (e.g., r1). + * Restore all for now. This could be made lighter. + */ + if (ti_flags & _TIF_SIGPENDING) + ret |= _TIF_RESTOREALL; + do_notify_resume(regs, ti_flags); + } + local_irq_disable(); + ti_flags = read_thread_flags(); + } + + if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && IS_ENABLED(CONFIG_PPC_FPU)) { + if (IS_ENABLED(CONFIG_PPC_TRANSACTIONAL_MEM) && + unlikely((ti_flags & _TIF_RESTORE_TM))) { + restore_tm_state(regs); + } else { + unsigned long mathflags = MSR_FP; + + if (cpu_has_feature(CPU_FTR_VSX)) + mathflags |= MSR_VEC | MSR_VSX; + else if (cpu_has_feature(CPU_FTR_ALTIVEC)) + mathflags |= MSR_VEC; + + /* + * If userspace MSR has all available FP bits set, + * then they are live and no need to restore. If not, + * it means the regs were given up and restore_math + * may decide to restore them (to avoid taking an FP + * fault). + */ + if ((regs->msr & mathflags) != mathflags) + restore_math(regs); + } + } + + check_return_regs_valid(regs); + + user_enter_irqoff(); + if (!prep_irq_for_enabled_exit(true)) { + user_exit_irqoff(); + local_irq_enable(); + local_irq_disable(); + goto again; + } + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + local_paca->tm_scratch = regs->msr; +#endif + + booke_load_dbcr0(); + + account_cpu_user_exit(); + + /* Restore user access locks last */ + kuap_user_restore(regs); + + return ret; +} + +/* + * This should be called after a syscall returns, with r3 the return value + * from the syscall. If this function returns non-zero, the system call + * exit assembly should additionally load all GPR registers and CTR and XER + * from the interrupt frame. + * + * The function graph tracer can not trace the return side of this function, + * because RI=0 and soft mask state is "unreconciled", so it is marked notrace. + */ +notrace unsigned long syscall_exit_prepare(unsigned long r3, + struct pt_regs *regs, + long scv) +{ + unsigned long ti_flags; + unsigned long ret = 0; + bool is_not_scv = !IS_ENABLED(CONFIG_PPC_BOOK3S_64) || !scv; + + CT_WARN_ON(ct_state() == CONTEXT_USER); + + kuap_assert_locked(); + + regs->result = r3; + + /* Check whether the syscall is issued inside a restartable sequence */ + rseq_syscall(regs); + + ti_flags = read_thread_flags(); + + if (unlikely(r3 >= (unsigned long)-MAX_ERRNO) && is_not_scv) { + if (likely(!(ti_flags & (_TIF_NOERROR | _TIF_RESTOREALL)))) { + r3 = -r3; + regs->ccr |= 0x10000000; /* Set SO bit in CR */ + } + } + + if (unlikely(ti_flags & _TIF_PERSYSCALL_MASK)) { + if (ti_flags & _TIF_RESTOREALL) + ret = _TIF_RESTOREALL; + else + regs->gpr[3] = r3; + clear_bits(_TIF_PERSYSCALL_MASK, ¤t_thread_info()->flags); + } else { + regs->gpr[3] = r3; + } + + if (unlikely(ti_flags & _TIF_SYSCALL_DOTRACE)) { + do_syscall_trace_leave(regs); + ret |= _TIF_RESTOREALL; + } + + local_irq_disable(); + ret = interrupt_exit_user_prepare_main(ret, regs); + +#ifdef CONFIG_PPC64 + regs->exit_result = ret; +#endif + + return ret; +} + +#ifdef CONFIG_PPC64 +notrace unsigned long syscall_exit_restart(unsigned long r3, struct pt_regs *regs) +{ + /* + * This is called when detecting a soft-pending interrupt as well as + * an alternate-return interrupt. So we can't just have the alternate + * return path clear SRR1[MSR] and set PACA_IRQ_HARD_DIS (unless + * the soft-pending case were to fix things up as well). RI might be + * disabled, in which case it gets re-enabled by __hard_irq_disable(). + */ + __hard_irq_disable(); + local_paca->irq_happened |= PACA_IRQ_HARD_DIS; + +#ifdef CONFIG_PPC_BOOK3S_64 + set_kuap(AMR_KUAP_BLOCKED); +#endif + + trace_hardirqs_off(); + user_exit_irqoff(); + account_cpu_user_entry(); + + BUG_ON(!user_mode(regs)); + + regs->exit_result = interrupt_exit_user_prepare_main(regs->exit_result, regs); + + return regs->exit_result; +} +#endif + +notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs) +{ + unsigned long ret; + + BUG_ON(regs_is_unrecoverable(regs)); + BUG_ON(arch_irq_disabled_regs(regs)); + CT_WARN_ON(ct_state() == CONTEXT_USER); + + /* + * We don't need to restore AMR on the way back to userspace for KUAP. + * AMR can only have been unlocked if we interrupted the kernel. + */ + kuap_assert_locked(); + + local_irq_disable(); + + ret = interrupt_exit_user_prepare_main(0, regs); + +#ifdef CONFIG_PPC64 + regs->exit_result = ret; +#endif + + return ret; +} + +void preempt_schedule_irq(void); + +notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs) +{ + unsigned long ret = 0; + unsigned long kuap; + bool stack_store = read_thread_flags() & _TIF_EMULATE_STACK_STORE; + + if (regs_is_unrecoverable(regs)) + unrecoverable_exception(regs); + /* + * CT_WARN_ON comes here via program_check_exception, so avoid + * recursion. + * + * Skip the assertion on PMIs on 64e to work around a problem caused + * by NMI PMIs incorrectly taking this interrupt return path, it's + * possible for this to hit after interrupt exit to user switches + * context to user. See also the comment in the performance monitor + * handler in exceptions-64e.S + */ + if (!IS_ENABLED(CONFIG_PPC_BOOK3E_64) && + TRAP(regs) != INTERRUPT_PROGRAM && + TRAP(regs) != INTERRUPT_PERFMON) + CT_WARN_ON(ct_state() == CONTEXT_USER); + + kuap = kuap_get_and_assert_locked(); + + local_irq_disable(); + + if (!arch_irq_disabled_regs(regs)) { + /* Returning to a kernel context with local irqs enabled. */ + WARN_ON_ONCE(!(regs->msr & MSR_EE)); +again: + if (IS_ENABLED(CONFIG_PREEMPT)) { + /* Return to preemptible kernel context */ + if (unlikely(read_thread_flags() & _TIF_NEED_RESCHED)) { + if (preempt_count() == 0) + preempt_schedule_irq(); + } + } + + check_return_regs_valid(regs); + + /* + * Stack store exit can't be restarted because the interrupt + * stack frame might have been clobbered. + */ + if (!prep_irq_for_enabled_exit(unlikely(stack_store))) { + /* + * Replay pending soft-masked interrupts now. Don't + * just local_irq_enabe(); local_irq_disable(); because + * if we are returning from an asynchronous interrupt + * here, another one might hit after irqs are enabled, + * and it would exit via this same path allowing + * another to fire, and so on unbounded. + */ + hard_irq_disable(); + replay_soft_interrupts(); + /* Took an interrupt, may have more exit work to do. */ + goto again; + } +#ifdef CONFIG_PPC64 + /* + * An interrupt may clear MSR[EE] and set this concurrently, + * but it will be marked pending and the exit will be retried. + * This leaves a racy window where MSR[EE]=0 and HARD_DIS is + * clear, until interrupt_exit_kernel_restart() calls + * hard_irq_disable(), which will set HARD_DIS again. + */ + local_paca->irq_happened &= ~PACA_IRQ_HARD_DIS; + + } else { + check_return_regs_valid(regs); + + if (unlikely(stack_store)) + __hard_EE_RI_disable(); +#endif /* CONFIG_PPC64 */ + } + + if (unlikely(stack_store)) { + clear_bits(_TIF_EMULATE_STACK_STORE, ¤t_thread_info()->flags); + ret = 1; + } + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + local_paca->tm_scratch = regs->msr; +#endif + + /* + * 64s does not want to mfspr(SPRN_AMR) here, because this comes after + * mtmsr, which would cause Read-After-Write stalls. Hence, take the + * AMR value from the check above. + */ + kuap_kernel_restore(regs, kuap); + + return ret; +} + +#ifdef CONFIG_PPC64 +notrace unsigned long interrupt_exit_user_restart(struct pt_regs *regs) +{ + __hard_irq_disable(); + local_paca->irq_happened |= PACA_IRQ_HARD_DIS; + +#ifdef CONFIG_PPC_BOOK3S_64 + set_kuap(AMR_KUAP_BLOCKED); +#endif + + trace_hardirqs_off(); + user_exit_irqoff(); + account_cpu_user_entry(); + + BUG_ON(!user_mode(regs)); + + regs->exit_result |= interrupt_exit_user_prepare(regs); + + return regs->exit_result; +} + +/* + * No real need to return a value here because the stack store case does not + * get restarted. + */ +notrace unsigned long interrupt_exit_kernel_restart(struct pt_regs *regs) +{ + __hard_irq_disable(); + local_paca->irq_happened |= PACA_IRQ_HARD_DIS; + +#ifdef CONFIG_PPC_BOOK3S_64 + set_kuap(AMR_KUAP_BLOCKED); +#endif + + if (regs->softe == IRQS_ENABLED) + trace_hardirqs_off(); + + BUG_ON(user_mode(regs)); + + return interrupt_exit_kernel_prepare(regs); +} +#endif diff --git a/arch/powerpc/kernel/interrupt_64.S b/arch/powerpc/kernel/interrupt_64.S new file mode 100644 index 000000000000..bd863702d812 --- /dev/null +++ b/arch/powerpc/kernel/interrupt_64.S @@ -0,0 +1,772 @@ +#include <asm/asm-offsets.h> +#include <asm/bug.h> +#ifdef CONFIG_PPC_BOOK3S +#include <asm/exception-64s.h> +#else +#include <asm/exception-64e.h> +#endif +#include <asm/feature-fixups.h> +#include <asm/head-64.h> +#include <asm/hw_irq.h> +#include <asm/kup.h> +#include <asm/mmu.h> +#include <asm/ppc_asm.h> +#include <asm/ptrace.h> + + .align 7 + +.macro DEBUG_SRR_VALID srr +#ifdef CONFIG_PPC_RFI_SRR_DEBUG + .ifc \srr,srr + mfspr r11,SPRN_SRR0 + ld r12,_NIP(r1) + clrrdi r11,r11,2 + clrrdi r12,r12,2 +100: tdne r11,r12 + EMIT_WARN_ENTRY 100b,__FILE__,__LINE__,(BUGFLAG_WARNING | BUGFLAG_ONCE) + mfspr r11,SPRN_SRR1 + ld r12,_MSR(r1) +100: tdne r11,r12 + EMIT_WARN_ENTRY 100b,__FILE__,__LINE__,(BUGFLAG_WARNING | BUGFLAG_ONCE) + .else + mfspr r11,SPRN_HSRR0 + ld r12,_NIP(r1) + clrrdi r11,r11,2 + clrrdi r12,r12,2 +100: tdne r11,r12 + EMIT_WARN_ENTRY 100b,__FILE__,__LINE__,(BUGFLAG_WARNING | BUGFLAG_ONCE) + mfspr r11,SPRN_HSRR1 + ld r12,_MSR(r1) +100: tdne r11,r12 + EMIT_WARN_ENTRY 100b,__FILE__,__LINE__,(BUGFLAG_WARNING | BUGFLAG_ONCE) + .endif +#endif +.endm + +#ifdef CONFIG_PPC_BOOK3S +.macro system_call_vectored name trapnr + .globl system_call_vectored_\name +system_call_vectored_\name: +_ASM_NOKPROBE_SYMBOL(system_call_vectored_\name) + SCV_INTERRUPT_TO_KERNEL + mr r10,r1 + ld r1,PACAKSAVE(r13) + std r10,0(r1) + std r11,_NIP(r1) + std r12,_MSR(r1) + std r0,GPR0(r1) + std r10,GPR1(r1) + std r2,GPR2(r1) + LOAD_PACA_TOC() + mfcr r12 + li r11,0 + /* Save syscall parameters in r3-r8 */ + SAVE_GPRS(3, 8, r1) + /* Zero r9-r12, this should only be required when restoring all GPRs */ + std r11,GPR9(r1) + std r11,GPR10(r1) + std r11,GPR11(r1) + std r11,GPR12(r1) + std r9,GPR13(r1) + SAVE_NVGPRS(r1) + std r11,_XER(r1) + std r11,_LINK(r1) + std r11,_CTR(r1) + + li r11,\trapnr + std r11,_TRAP(r1) + std r12,_CCR(r1) + std r3,ORIG_GPR3(r1) + LOAD_REG_IMMEDIATE(r11, STACK_FRAME_REGS_MARKER) + std r11,STACK_INT_FRAME_MARKER(r1) /* "regs" marker */ + /* Calling convention has r3 = regs, r4 = orig r0 */ + addi r3,r1,STACK_INT_FRAME_REGS + mr r4,r0 + +BEGIN_FTR_SECTION + HMT_MEDIUM +END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) + + /* + * scv enters with MSR[EE]=1 and is immediately considered soft-masked. + * The entry vector already sets PACAIRQSOFTMASK to IRQS_ALL_DISABLED, + * and interrupts may be masked and pending already. + * system_call_exception() will call trace_hardirqs_off() which means + * interrupts could already have been blocked before trace_hardirqs_off, + * but this is the best we can do. + */ + + /* + * Zero user registers to prevent influencing speculative execution + * state of kernel code. + */ + SANITIZE_SYSCALL_GPRS() + bl CFUNC(system_call_exception) + +.Lsyscall_vectored_\name\()_exit: + addi r4,r1,STACK_INT_FRAME_REGS + li r5,1 /* scv */ + bl CFUNC(syscall_exit_prepare) + std r1,PACA_EXIT_SAVE_R1(r13) /* save r1 for restart */ +.Lsyscall_vectored_\name\()_rst_start: + lbz r11,PACAIRQHAPPENED(r13) + andi. r11,r11,(~PACA_IRQ_HARD_DIS)@l + bne- syscall_vectored_\name\()_restart + li r11,IRQS_ENABLED + stb r11,PACAIRQSOFTMASK(r13) + li r11,0 + stb r11,PACAIRQHAPPENED(r13) # clear out possible HARD_DIS + + ld r2,_CCR(r1) + ld r4,_NIP(r1) + ld r5,_MSR(r1) + +BEGIN_FTR_SECTION + stdcx. r0,0,r1 /* to clear the reservation */ +END_FTR_SECTION_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS) + +BEGIN_FTR_SECTION + HMT_MEDIUM_LOW +END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) + + SANITIZE_RESTORE_NVGPRS() + cmpdi r3,0 + bne .Lsyscall_vectored_\name\()_restore_regs + + /* rfscv returns with LR->NIA and CTR->MSR */ + mtlr r4 + mtctr r5 + + /* Could zero these as per ABI, but we may consider a stricter ABI + * which preserves these if libc implementations can benefit, so + * restore them for now until further measurement is done. */ + REST_GPR(0, r1) + REST_GPRS(4, 8, r1) + /* Zero volatile regs that may contain sensitive kernel data */ + ZEROIZE_GPRS(9, 12) + mtspr SPRN_XER,r0 + + /* + * We don't need to restore AMR on the way back to userspace for KUAP. + * The value of AMR only matters while we're in the kernel. + */ + mtcr r2 + REST_GPRS(2, 3, r1) + REST_GPR(13, r1) + REST_GPR(1, r1) + RFSCV_TO_USER + b . /* prevent speculative execution */ + +.Lsyscall_vectored_\name\()_restore_regs: + mtspr SPRN_SRR0,r4 + mtspr SPRN_SRR1,r5 + + ld r3,_CTR(r1) + ld r4,_LINK(r1) + ld r5,_XER(r1) + + HANDLER_RESTORE_NVGPRS() + REST_GPR(0, r1) + mtcr r2 + mtctr r3 + mtlr r4 + mtspr SPRN_XER,r5 + REST_GPRS(2, 13, r1) + REST_GPR(1, r1) + RFI_TO_USER +.Lsyscall_vectored_\name\()_rst_end: + +syscall_vectored_\name\()_restart: +_ASM_NOKPROBE_SYMBOL(syscall_vectored_\name\()_restart) + GET_PACA(r13) + ld r1,PACA_EXIT_SAVE_R1(r13) + LOAD_PACA_TOC() + ld r3,RESULT(r1) + addi r4,r1,STACK_INT_FRAME_REGS + li r11,IRQS_ALL_DISABLED + stb r11,PACAIRQSOFTMASK(r13) + bl CFUNC(syscall_exit_restart) + std r1,PACA_EXIT_SAVE_R1(r13) /* save r1 for restart */ + b .Lsyscall_vectored_\name\()_rst_start +1: + +SOFT_MASK_TABLE(.Lsyscall_vectored_\name\()_rst_start, 1b) +RESTART_TABLE(.Lsyscall_vectored_\name\()_rst_start, .Lsyscall_vectored_\name\()_rst_end, syscall_vectored_\name\()_restart) + +.endm + +system_call_vectored common 0x3000 + +/* + * We instantiate another entry copy for the SIGILL variant, with TRAP=0x7ff0 + * which is tested by system_call_exception when r0 is -1 (as set by vector + * entry code). + */ +system_call_vectored sigill 0x7ff0 + +#endif /* CONFIG_PPC_BOOK3S */ + + .balign IFETCH_ALIGN_BYTES + .globl system_call_common_real +system_call_common_real: +_ASM_NOKPROBE_SYMBOL(system_call_common_real) + ld r10,PACAKMSR(r13) /* get MSR value for kernel */ + mtmsrd r10 + + .balign IFETCH_ALIGN_BYTES + .globl system_call_common +system_call_common: +_ASM_NOKPROBE_SYMBOL(system_call_common) + mr r10,r1 + ld r1,PACAKSAVE(r13) + std r10,0(r1) + std r11,_NIP(r1) + std r12,_MSR(r1) + std r0,GPR0(r1) + std r10,GPR1(r1) + std r2,GPR2(r1) +#ifdef CONFIG_PPC_E500 +START_BTB_FLUSH_SECTION + BTB_FLUSH(r10) +END_BTB_FLUSH_SECTION +#endif + LOAD_PACA_TOC() + mfcr r12 + li r11,0 + /* Save syscall parameters in r3-r8 */ + SAVE_GPRS(3, 8, r1) + /* Zero r9-r12, this should only be required when restoring all GPRs */ + std r11,GPR9(r1) + std r11,GPR10(r1) + std r11,GPR11(r1) + std r11,GPR12(r1) + std r9,GPR13(r1) + SAVE_NVGPRS(r1) + std r11,_XER(r1) + std r11,_CTR(r1) + mflr r10 + + /* + * This clears CR0.SO (bit 28), which is the error indication on + * return from this system call. + */ + rldimi r12,r11,28,(63-28) + li r11,0xc00 + std r10,_LINK(r1) + std r11,_TRAP(r1) + std r12,_CCR(r1) + std r3,ORIG_GPR3(r1) + LOAD_REG_IMMEDIATE(r11, STACK_FRAME_REGS_MARKER) + std r11,STACK_INT_FRAME_MARKER(r1) /* "regs" marker */ + /* Calling convention has r3 = regs, r4 = orig r0 */ + addi r3,r1,STACK_INT_FRAME_REGS + mr r4,r0 + +#ifdef CONFIG_PPC_BOOK3S + li r11,1 + stb r11,PACASRR_VALID(r13) +#endif + + /* + * We always enter kernel from userspace with irq soft-mask enabled and + * nothing pending. system_call_exception() will call + * trace_hardirqs_off(). + */ + li r11,IRQS_ALL_DISABLED + stb r11,PACAIRQSOFTMASK(r13) +#ifdef CONFIG_PPC_BOOK3S + li r12,-1 /* Set MSR_EE and MSR_RI */ + mtmsrd r12,1 +#else + wrteei 1 +#endif + + /* + * Zero user registers to prevent influencing speculative execution + * state of kernel code. + */ + SANITIZE_SYSCALL_GPRS() + bl CFUNC(system_call_exception) + +.Lsyscall_exit: + addi r4,r1,STACK_INT_FRAME_REGS + li r5,0 /* !scv */ + bl CFUNC(syscall_exit_prepare) + std r1,PACA_EXIT_SAVE_R1(r13) /* save r1 for restart */ +#ifdef CONFIG_PPC_BOOK3S +.Lsyscall_rst_start: + lbz r11,PACAIRQHAPPENED(r13) + andi. r11,r11,(~PACA_IRQ_HARD_DIS)@l + bne- syscall_restart +#endif + li r11,IRQS_ENABLED + stb r11,PACAIRQSOFTMASK(r13) + li r11,0 + stb r11,PACAIRQHAPPENED(r13) # clear out possible HARD_DIS + + ld r2,_CCR(r1) + ld r6,_LINK(r1) + mtlr r6 + +#ifdef CONFIG_PPC_BOOK3S + lbz r4,PACASRR_VALID(r13) + cmpdi r4,0 + bne 1f + li r4,0 + stb r4,PACASRR_VALID(r13) +#endif + ld r4,_NIP(r1) + ld r5,_MSR(r1) + mtspr SPRN_SRR0,r4 + mtspr SPRN_SRR1,r5 +1: + DEBUG_SRR_VALID srr + +BEGIN_FTR_SECTION + stdcx. r0,0,r1 /* to clear the reservation */ +END_FTR_SECTION_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS) + + SANITIZE_RESTORE_NVGPRS() + cmpdi r3,0 + bne .Lsyscall_restore_regs + /* Zero volatile regs that may contain sensitive kernel data */ + ZEROIZE_GPR(0) + ZEROIZE_GPRS(4, 12) + mtctr r0 + mtspr SPRN_XER,r0 +.Lsyscall_restore_regs_cont: + +BEGIN_FTR_SECTION + HMT_MEDIUM_LOW +END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) + + /* + * We don't need to restore AMR on the way back to userspace for KUAP. + * The value of AMR only matters while we're in the kernel. + */ + mtcr r2 + REST_GPRS(2, 3, r1) + REST_GPR(13, r1) + REST_GPR(1, r1) + RFI_TO_USER + b . /* prevent speculative execution */ + +.Lsyscall_restore_regs: + ld r3,_CTR(r1) + ld r4,_XER(r1) + HANDLER_RESTORE_NVGPRS() + mtctr r3 + mtspr SPRN_XER,r4 + REST_GPR(0, r1) + REST_GPRS(4, 12, r1) + b .Lsyscall_restore_regs_cont +.Lsyscall_rst_end: + +#ifdef CONFIG_PPC_BOOK3S +syscall_restart: +_ASM_NOKPROBE_SYMBOL(syscall_restart) + GET_PACA(r13) + ld r1,PACA_EXIT_SAVE_R1(r13) + LOAD_PACA_TOC() + ld r3,RESULT(r1) + addi r4,r1,STACK_INT_FRAME_REGS + li r11,IRQS_ALL_DISABLED + stb r11,PACAIRQSOFTMASK(r13) + bl CFUNC(syscall_exit_restart) + std r1,PACA_EXIT_SAVE_R1(r13) /* save r1 for restart */ + b .Lsyscall_rst_start +1: + +SOFT_MASK_TABLE(.Lsyscall_rst_start, 1b) +RESTART_TABLE(.Lsyscall_rst_start, .Lsyscall_rst_end, syscall_restart) +#endif + + /* + * If MSR EE/RI was never enabled, IRQs not reconciled, NVGPRs not + * touched, no exit work created, then this can be used. + */ + .balign IFETCH_ALIGN_BYTES + .globl fast_interrupt_return_srr +fast_interrupt_return_srr: +_ASM_NOKPROBE_SYMBOL(fast_interrupt_return_srr) + kuap_check_amr r3, r4 + ld r5,_MSR(r1) + andi. r0,r5,MSR_PR +#ifdef CONFIG_PPC_BOOK3S + beq 1f + kuap_user_restore r3, r4 + b .Lfast_user_interrupt_return_srr +1: kuap_kernel_restore r3, r4 + andi. r0,r5,MSR_RI + li r3,0 /* 0 return value, no EMULATE_STACK_STORE */ + bne+ .Lfast_kernel_interrupt_return_srr + addi r3,r1,STACK_INT_FRAME_REGS + bl CFUNC(unrecoverable_exception) + b . /* should not get here */ +#else + bne .Lfast_user_interrupt_return_srr + b .Lfast_kernel_interrupt_return_srr +#endif + +.macro interrupt_return_macro srr + .balign IFETCH_ALIGN_BYTES + .globl interrupt_return_\srr +interrupt_return_\srr\(): +_ASM_NOKPROBE_SYMBOL(interrupt_return_\srr\()) + ld r4,_MSR(r1) + andi. r0,r4,MSR_PR + beq interrupt_return_\srr\()_kernel +interrupt_return_\srr\()_user: /* make backtraces match the _kernel variant */ +_ASM_NOKPROBE_SYMBOL(interrupt_return_\srr\()_user) + addi r3,r1,STACK_INT_FRAME_REGS + bl CFUNC(interrupt_exit_user_prepare) +#ifndef CONFIG_INTERRUPT_SANITIZE_REGISTERS + cmpdi r3,0 + bne- .Lrestore_nvgprs_\srr +.Lrestore_nvgprs_\srr\()_cont: +#endif + std r1,PACA_EXIT_SAVE_R1(r13) /* save r1 for restart */ +#ifdef CONFIG_PPC_BOOK3S +.Linterrupt_return_\srr\()_user_rst_start: + lbz r11,PACAIRQHAPPENED(r13) + andi. r11,r11,(~PACA_IRQ_HARD_DIS)@l + bne- interrupt_return_\srr\()_user_restart +#endif + li r11,IRQS_ENABLED + stb r11,PACAIRQSOFTMASK(r13) + li r11,0 + stb r11,PACAIRQHAPPENED(r13) # clear out possible HARD_DIS + +.Lfast_user_interrupt_return_\srr\(): + SANITIZE_RESTORE_NVGPRS() +#ifdef CONFIG_PPC_BOOK3S + .ifc \srr,srr + lbz r4,PACASRR_VALID(r13) + .else + lbz r4,PACAHSRR_VALID(r13) + .endif + cmpdi r4,0 + li r4,0 + bne 1f +#endif + ld r11,_NIP(r1) + ld r12,_MSR(r1) + .ifc \srr,srr + mtspr SPRN_SRR0,r11 + mtspr SPRN_SRR1,r12 +1: +#ifdef CONFIG_PPC_BOOK3S + stb r4,PACASRR_VALID(r13) +#endif + .else + mtspr SPRN_HSRR0,r11 + mtspr SPRN_HSRR1,r12 +1: +#ifdef CONFIG_PPC_BOOK3S + stb r4,PACAHSRR_VALID(r13) +#endif + .endif + DEBUG_SRR_VALID \srr + +#ifdef CONFIG_PPC_IRQ_SOFT_MASK_DEBUG + lbz r4,PACAIRQSOFTMASK(r13) + tdnei r4,IRQS_ENABLED +#endif + +BEGIN_FTR_SECTION + ld r10,_PPR(r1) + mtspr SPRN_PPR,r10 +END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) + +BEGIN_FTR_SECTION + stdcx. r0,0,r1 /* to clear the reservation */ +FTR_SECTION_ELSE + ldarx r0,0,r1 +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS) + + ld r3,_CCR(r1) + ld r4,_LINK(r1) + ld r5,_CTR(r1) + ld r6,_XER(r1) + li r0,0 + + REST_GPRS(7, 13, r1) + + mtcr r3 + mtlr r4 + mtctr r5 + mtspr SPRN_XER,r6 + + REST_GPRS(2, 6, r1) + REST_GPR(0, r1) + REST_GPR(1, r1) + .ifc \srr,srr + RFI_TO_USER + .else + HRFI_TO_USER + .endif + b . /* prevent speculative execution */ +.Linterrupt_return_\srr\()_user_rst_end: + +#ifndef CONFIG_INTERRUPT_SANITIZE_REGISTERS +.Lrestore_nvgprs_\srr\(): + REST_NVGPRS(r1) + b .Lrestore_nvgprs_\srr\()_cont +#endif + +#ifdef CONFIG_PPC_BOOK3S +interrupt_return_\srr\()_user_restart: +_ASM_NOKPROBE_SYMBOL(interrupt_return_\srr\()_user_restart) + GET_PACA(r13) + ld r1,PACA_EXIT_SAVE_R1(r13) + LOAD_PACA_TOC() + addi r3,r1,STACK_INT_FRAME_REGS + li r11,IRQS_ALL_DISABLED + stb r11,PACAIRQSOFTMASK(r13) + bl CFUNC(interrupt_exit_user_restart) + std r1,PACA_EXIT_SAVE_R1(r13) /* save r1 for restart */ + b .Linterrupt_return_\srr\()_user_rst_start +1: + +SOFT_MASK_TABLE(.Linterrupt_return_\srr\()_user_rst_start, 1b) +RESTART_TABLE(.Linterrupt_return_\srr\()_user_rst_start, .Linterrupt_return_\srr\()_user_rst_end, interrupt_return_\srr\()_user_restart) +#endif + + .balign IFETCH_ALIGN_BYTES +interrupt_return_\srr\()_kernel: +_ASM_NOKPROBE_SYMBOL(interrupt_return_\srr\()_kernel) + addi r3,r1,STACK_INT_FRAME_REGS + bl CFUNC(interrupt_exit_kernel_prepare) + + std r1,PACA_EXIT_SAVE_R1(r13) /* save r1 for restart */ +.Linterrupt_return_\srr\()_kernel_rst_start: + ld r11,SOFTE(r1) + cmpwi r11,IRQS_ENABLED + stb r11,PACAIRQSOFTMASK(r13) + beq .Linterrupt_return_\srr\()_soft_enabled + + /* + * Returning to soft-disabled context. + * Check if a MUST_HARD_MASK interrupt has become pending, in which + * case we need to disable MSR[EE] in the return context. + * + * The MSR[EE] check catches among other things the short incoherency + * in hard_irq_disable() between clearing MSR[EE] and setting + * PACA_IRQ_HARD_DIS. + */ + ld r12,_MSR(r1) + andi. r10,r12,MSR_EE + beq .Lfast_kernel_interrupt_return_\srr\() // EE already disabled + lbz r11,PACAIRQHAPPENED(r13) + andi. r10,r11,PACA_IRQ_MUST_HARD_MASK + bne 1f // HARD_MASK is pending + // No HARD_MASK pending, clear possible HARD_DIS set by interrupt + andi. r11,r11,(~PACA_IRQ_HARD_DIS)@l + stb r11,PACAIRQHAPPENED(r13) + b .Lfast_kernel_interrupt_return_\srr\() + + +1: /* Must clear MSR_EE from _MSR */ +#ifdef CONFIG_PPC_BOOK3S + li r10,0 + /* Clear valid before changing _MSR */ + .ifc \srr,srr + stb r10,PACASRR_VALID(r13) + .else + stb r10,PACAHSRR_VALID(r13) + .endif +#endif + xori r12,r12,MSR_EE + std r12,_MSR(r1) + b .Lfast_kernel_interrupt_return_\srr\() + +.Linterrupt_return_\srr\()_soft_enabled: + /* + * In the soft-enabled case, need to double-check that we have no + * pending interrupts that might have come in before we reached the + * restart section of code, and restart the exit so those can be + * handled. + * + * If there are none, it is be possible that the interrupt still + * has PACA_IRQ_HARD_DIS set, which needs to be cleared for the + * interrupted context. This clear will not clobber a new pending + * interrupt coming in, because we're in the restart section, so + * such would return to the restart location. + */ +#ifdef CONFIG_PPC_BOOK3S + lbz r11,PACAIRQHAPPENED(r13) + andi. r11,r11,(~PACA_IRQ_HARD_DIS)@l + bne- interrupt_return_\srr\()_kernel_restart +#endif + li r11,0 + stb r11,PACAIRQHAPPENED(r13) // clear the possible HARD_DIS + +.Lfast_kernel_interrupt_return_\srr\(): + SANITIZE_RESTORE_NVGPRS() + cmpdi cr1,r3,0 +#ifdef CONFIG_PPC_BOOK3S + .ifc \srr,srr + lbz r4,PACASRR_VALID(r13) + .else + lbz r4,PACAHSRR_VALID(r13) + .endif + cmpdi r4,0 + li r4,0 + bne 1f +#endif + ld r11,_NIP(r1) + ld r12,_MSR(r1) + .ifc \srr,srr + mtspr SPRN_SRR0,r11 + mtspr SPRN_SRR1,r12 +1: +#ifdef CONFIG_PPC_BOOK3S + stb r4,PACASRR_VALID(r13) +#endif + .else + mtspr SPRN_HSRR0,r11 + mtspr SPRN_HSRR1,r12 +1: +#ifdef CONFIG_PPC_BOOK3S + stb r4,PACAHSRR_VALID(r13) +#endif + .endif + DEBUG_SRR_VALID \srr + +BEGIN_FTR_SECTION + stdcx. r0,0,r1 /* to clear the reservation */ +FTR_SECTION_ELSE + ldarx r0,0,r1 +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS) + + ld r3,_LINK(r1) + ld r4,_CTR(r1) + ld r5,_XER(r1) + ld r6,_CCR(r1) + li r0,0 + + REST_GPRS(7, 12, r1) + + mtlr r3 + mtctr r4 + mtspr SPRN_XER,r5 + + /* + * Leaving a stale STACK_FRAME_REGS_MARKER on the stack can confuse + * the reliable stack unwinder later on. Clear it. + */ + std r0,STACK_INT_FRAME_MARKER(r1) + + REST_GPRS(2, 5, r1) + + bne- cr1,1f /* emulate stack store */ + mtcr r6 + REST_GPR(6, r1) + REST_GPR(0, r1) + REST_GPR(1, r1) + .ifc \srr,srr + RFI_TO_KERNEL + .else + HRFI_TO_KERNEL + .endif + b . /* prevent speculative execution */ + +1: /* + * Emulate stack store with update. New r1 value was already calculated + * and updated in our interrupt regs by emulate_loadstore, but we can't + * store the previous value of r1 to the stack before re-loading our + * registers from it, otherwise they could be clobbered. Use + * PACA_EXGEN as temporary storage to hold the store data, as + * interrupts are disabled here so it won't be clobbered. + */ + mtcr r6 + std r9,PACA_EXGEN+0(r13) + addi r9,r1,INT_FRAME_SIZE /* get original r1 */ + REST_GPR(6, r1) + REST_GPR(0, r1) + REST_GPR(1, r1) + std r9,0(r1) /* perform store component of stdu */ + ld r9,PACA_EXGEN+0(r13) + + .ifc \srr,srr + RFI_TO_KERNEL + .else + HRFI_TO_KERNEL + .endif + b . /* prevent speculative execution */ +.Linterrupt_return_\srr\()_kernel_rst_end: + +#ifdef CONFIG_PPC_BOOK3S +interrupt_return_\srr\()_kernel_restart: +_ASM_NOKPROBE_SYMBOL(interrupt_return_\srr\()_kernel_restart) + GET_PACA(r13) + ld r1,PACA_EXIT_SAVE_R1(r13) + LOAD_PACA_TOC() + addi r3,r1,STACK_INT_FRAME_REGS + li r11,IRQS_ALL_DISABLED + stb r11,PACAIRQSOFTMASK(r13) + bl CFUNC(interrupt_exit_kernel_restart) + std r1,PACA_EXIT_SAVE_R1(r13) /* save r1 for restart */ + b .Linterrupt_return_\srr\()_kernel_rst_start +1: + +SOFT_MASK_TABLE(.Linterrupt_return_\srr\()_kernel_rst_start, 1b) +RESTART_TABLE(.Linterrupt_return_\srr\()_kernel_rst_start, .Linterrupt_return_\srr\()_kernel_rst_end, interrupt_return_\srr\()_kernel_restart) +#endif + +.endm + +interrupt_return_macro srr +#ifdef CONFIG_PPC_BOOK3S +interrupt_return_macro hsrr + + .globl __end_soft_masked +__end_soft_masked: +DEFINE_FIXED_SYMBOL(__end_soft_masked, text) +#endif /* CONFIG_PPC_BOOK3S */ + +#ifdef CONFIG_PPC_BOOK3S +_GLOBAL(ret_from_fork_scv) + bl CFUNC(schedule_tail) + HANDLER_RESTORE_NVGPRS() + li r3,0 /* fork() return value */ + b .Lsyscall_vectored_common_exit +#endif + +_GLOBAL(ret_from_fork) + bl CFUNC(schedule_tail) + HANDLER_RESTORE_NVGPRS() + li r3,0 /* fork() return value */ + b .Lsyscall_exit + +_GLOBAL(ret_from_kernel_user_thread) + bl CFUNC(schedule_tail) + mtctr r14 + mr r3,r15 +#ifdef CONFIG_PPC64_ELF_ABI_V2 + mr r12,r14 +#endif + bctrl + li r3,0 + /* + * It does not matter whether this returns via the scv or sc path + * because it returns as execve() and therefore has no calling ABI + * (i.e., it sets registers according to the exec()ed entry point). + */ + b .Lsyscall_exit + +_GLOBAL(start_kernel_thread) + bl CFUNC(schedule_tail) + mtctr r14 + mr r3,r15 +#ifdef CONFIG_PPC64_ELF_ABI_V2 + mr r12,r14 +#endif + bctrl + /* + * This must not return. We actually want to BUG here, not WARN, + * because BUG will exit the process which is what the kernel thread + * should have done, which may give some hope of continuing. + */ +100: trap + EMIT_BUG_ENTRY 100b,__FILE__,__LINE__,0 diff --git a/arch/powerpc/kernel/io-workarounds.c b/arch/powerpc/kernel/io-workarounds.c index 0276bc8c8969..c877f074d174 100644 --- a/arch/powerpc/kernel/io-workarounds.c +++ b/arch/powerpc/kernel/io-workarounds.c @@ -10,10 +10,10 @@ #include <linux/kernel.h> #include <linux/sched/mm.h> /* for init_mm */ +#include <linux/pgtable.h> #include <asm/io.h> #include <asm/machdep.h> -#include <asm/pgtable.h> #include <asm/ppc-pci.h> #include <asm/io-workarounds.h> #include <asm/pte-walk.h> @@ -55,7 +55,6 @@ static struct iowa_bus *iowa_pci_find(unsigned long vaddr, unsigned long paddr) #ifdef CONFIG_PPC_INDIRECT_MMIO struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr) { - unsigned hugepage_shift; struct iowa_bus *bus; int token; @@ -65,22 +64,13 @@ struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr) bus = &iowa_busses[token - 1]; else { unsigned long vaddr, paddr; - pte_t *ptep; vaddr = (unsigned long)PCI_FIX_ADDR(addr); if (vaddr < PHB_IO_BASE || vaddr >= PHB_IO_END) return NULL; - /* - * We won't find huge pages here (iomem). Also can't hit - * a page table free due to init_mm - */ - ptep = find_init_mm_pte(vaddr, &hugepage_shift); - if (ptep == NULL) - paddr = 0; - else { - WARN_ON(hugepage_shift); - paddr = pte_pfn(*ptep) << PAGE_SHIFT; - } + + paddr = ppc_find_vmap_phys(vaddr); + bus = iowa_pci_find(vaddr, paddr); if (bus == NULL) diff --git a/arch/powerpc/kernel/io.c b/arch/powerpc/kernel/io.c index 2f29b7d432de..6af535905984 100644 --- a/arch/powerpc/kernel/io.c +++ b/arch/powerpc/kernel/io.c @@ -33,7 +33,7 @@ void _insb(const volatile u8 __iomem *port, void *buf, long count) return; asm volatile("sync"); do { - tmp = *port; + tmp = *(const volatile u8 __force *)port; eieio(); *tbuf++ = tmp; } while (--count != 0); @@ -49,7 +49,7 @@ void _outsb(volatile u8 __iomem *port, const void *buf, long count) return; asm volatile("sync"); do { - *port = *tbuf++; + *(volatile u8 __force *)port = *tbuf++; } while (--count != 0); asm volatile("sync"); } @@ -64,7 +64,7 @@ void _insw_ns(const volatile u16 __iomem *port, void *buf, long count) return; asm volatile("sync"); do { - tmp = *port; + tmp = *(const volatile u16 __force *)port; eieio(); *tbuf++ = tmp; } while (--count != 0); @@ -80,7 +80,7 @@ void _outsw_ns(volatile u16 __iomem *port, const void *buf, long count) return; asm volatile("sync"); do { - *port = *tbuf++; + *(volatile u16 __force *)port = *tbuf++; } while (--count != 0); asm volatile("sync"); } @@ -95,7 +95,7 @@ void _insl_ns(const volatile u32 __iomem *port, void *buf, long count) return; asm volatile("sync"); do { - tmp = *port; + tmp = *(const volatile u32 __force *)port; eieio(); *tbuf++ = tmp; } while (--count != 0); @@ -111,7 +111,7 @@ void _outsl_ns(volatile u32 __iomem *port, const void *buf, long count) return; asm volatile("sync"); do { - *port = *tbuf++; + *(volatile u32 __force *)port = *tbuf++; } while (--count != 0); asm volatile("sync"); } diff --git a/arch/powerpc/kernel/iomap.c b/arch/powerpc/kernel/iomap.c index 5ac84efc6ede..72862a4d3a5d 100644 --- a/arch/powerpc/kernel/iomap.c +++ b/arch/powerpc/kernel/iomap.c @@ -11,177 +11,11 @@ #include <asm/pci-bridge.h> #include <asm/isa-bridge.h> -/* - * Here comes the ppc64 implementation of the IOMAP - * interfaces. - */ -unsigned int ioread8(void __iomem *addr) -{ - return readb(addr); -} -unsigned int ioread16(void __iomem *addr) -{ - return readw(addr); -} -unsigned int ioread16be(void __iomem *addr) -{ - return readw_be(addr); -} -unsigned int ioread32(void __iomem *addr) -{ - return readl(addr); -} -unsigned int ioread32be(void __iomem *addr) -{ - return readl_be(addr); -} -EXPORT_SYMBOL(ioread8); -EXPORT_SYMBOL(ioread16); -EXPORT_SYMBOL(ioread16be); -EXPORT_SYMBOL(ioread32); -EXPORT_SYMBOL(ioread32be); -#ifdef __powerpc64__ -u64 ioread64(void __iomem *addr) -{ - return readq(addr); -} -u64 ioread64_lo_hi(void __iomem *addr) -{ - return readq(addr); -} -u64 ioread64_hi_lo(void __iomem *addr) -{ - return readq(addr); -} -u64 ioread64be(void __iomem *addr) -{ - return readq_be(addr); -} -u64 ioread64be_lo_hi(void __iomem *addr) -{ - return readq_be(addr); -} -u64 ioread64be_hi_lo(void __iomem *addr) -{ - return readq_be(addr); -} -EXPORT_SYMBOL(ioread64); -EXPORT_SYMBOL(ioread64_lo_hi); -EXPORT_SYMBOL(ioread64_hi_lo); -EXPORT_SYMBOL(ioread64be); -EXPORT_SYMBOL(ioread64be_lo_hi); -EXPORT_SYMBOL(ioread64be_hi_lo); -#endif /* __powerpc64__ */ - -void iowrite8(u8 val, void __iomem *addr) -{ - writeb(val, addr); -} -void iowrite16(u16 val, void __iomem *addr) -{ - writew(val, addr); -} -void iowrite16be(u16 val, void __iomem *addr) -{ - writew_be(val, addr); -} -void iowrite32(u32 val, void __iomem *addr) -{ - writel(val, addr); -} -void iowrite32be(u32 val, void __iomem *addr) -{ - writel_be(val, addr); -} -EXPORT_SYMBOL(iowrite8); -EXPORT_SYMBOL(iowrite16); -EXPORT_SYMBOL(iowrite16be); -EXPORT_SYMBOL(iowrite32); -EXPORT_SYMBOL(iowrite32be); -#ifdef __powerpc64__ -void iowrite64(u64 val, void __iomem *addr) -{ - writeq(val, addr); -} -void iowrite64_lo_hi(u64 val, void __iomem *addr) -{ - writeq(val, addr); -} -void iowrite64_hi_lo(u64 val, void __iomem *addr) -{ - writeq(val, addr); -} -void iowrite64be(u64 val, void __iomem *addr) -{ - writeq_be(val, addr); -} -void iowrite64be_lo_hi(u64 val, void __iomem *addr) -{ - writeq_be(val, addr); -} -void iowrite64be_hi_lo(u64 val, void __iomem *addr) -{ - writeq_be(val, addr); -} -EXPORT_SYMBOL(iowrite64); -EXPORT_SYMBOL(iowrite64_lo_hi); -EXPORT_SYMBOL(iowrite64_hi_lo); -EXPORT_SYMBOL(iowrite64be); -EXPORT_SYMBOL(iowrite64be_lo_hi); -EXPORT_SYMBOL(iowrite64be_hi_lo); -#endif /* __powerpc64__ */ - -/* - * These are the "repeat read/write" functions. Note the - * non-CPU byte order. We do things in "IO byteorder" - * here. - * - * FIXME! We could make these do EEH handling if we really - * wanted. Not clear if we do. - */ -void ioread8_rep(void __iomem *addr, void *dst, unsigned long count) -{ - readsb(addr, dst, count); -} -void ioread16_rep(void __iomem *addr, void *dst, unsigned long count) -{ - readsw(addr, dst, count); -} -void ioread32_rep(void __iomem *addr, void *dst, unsigned long count) -{ - readsl(addr, dst, count); -} -EXPORT_SYMBOL(ioread8_rep); -EXPORT_SYMBOL(ioread16_rep); -EXPORT_SYMBOL(ioread32_rep); - -void iowrite8_rep(void __iomem *addr, const void *src, unsigned long count) -{ - writesb(addr, src, count); -} -void iowrite16_rep(void __iomem *addr, const void *src, unsigned long count) -{ - writesw(addr, src, count); -} -void iowrite32_rep(void __iomem *addr, const void *src, unsigned long count) -{ - writesl(addr, src, count); -} -EXPORT_SYMBOL(iowrite8_rep); -EXPORT_SYMBOL(iowrite16_rep); -EXPORT_SYMBOL(iowrite32_rep); - void __iomem *ioport_map(unsigned long port, unsigned int len) { return (void __iomem *) (port + _IO_BASE); } - -void ioport_unmap(void __iomem *addr) -{ - /* Nothing to do */ -} EXPORT_SYMBOL(ioport_map); -EXPORT_SYMBOL(ioport_unmap); #ifdef CONFIG_PCI void pci_iounmap(struct pci_dev *dev, void __iomem *addr) diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 9704f3f76e63..ebe259bdd462 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -25,8 +25,8 @@ #include <linux/pci.h> #include <linux/iommu.h> #include <linux/sched.h> +#include <linux/debugfs.h> #include <asm/io.h> -#include <asm/prom.h> #include <asm/iommu.h> #include <asm/pci-bridge.h> #include <asm/machdep.h> @@ -35,9 +35,48 @@ #include <asm/vio.h> #include <asm/tce.h> #include <asm/mmu_context.h> +#include <asm/ppc-pci.h> #define DBG(...) +#ifdef CONFIG_IOMMU_DEBUGFS +static int iommu_debugfs_weight_get(void *data, u64 *val) +{ + struct iommu_table *tbl = data; + *val = bitmap_weight(tbl->it_map, tbl->it_size); + return 0; +} +DEFINE_DEBUGFS_ATTRIBUTE(iommu_debugfs_fops_weight, iommu_debugfs_weight_get, NULL, "%llu\n"); + +static void iommu_debugfs_add(struct iommu_table *tbl) +{ + char name[10]; + struct dentry *liobn_entry; + + sprintf(name, "%08lx", tbl->it_index); + liobn_entry = debugfs_create_dir(name, iommu_debugfs_dir); + + debugfs_create_file_unsafe("weight", 0400, liobn_entry, tbl, &iommu_debugfs_fops_weight); + debugfs_create_ulong("it_size", 0400, liobn_entry, &tbl->it_size); + debugfs_create_ulong("it_page_shift", 0400, liobn_entry, &tbl->it_page_shift); + debugfs_create_ulong("it_reserved_start", 0400, liobn_entry, &tbl->it_reserved_start); + debugfs_create_ulong("it_reserved_end", 0400, liobn_entry, &tbl->it_reserved_end); + debugfs_create_ulong("it_indirect_levels", 0400, liobn_entry, &tbl->it_indirect_levels); + debugfs_create_ulong("it_level_size", 0400, liobn_entry, &tbl->it_level_size); +} + +static void iommu_debugfs_del(struct iommu_table *tbl) +{ + char name[10]; + + sprintf(name, "%08lx", tbl->it_index); + debugfs_lookup_and_remove(name, iommu_debugfs_dir); +} +#else +static void iommu_debugfs_add(struct iommu_table *tbl){} +static void iommu_debugfs_del(struct iommu_table *tbl){} +#endif + static int novmerge; static void __iommu_free(struct iommu_table *, dma_addr_t, unsigned int); @@ -133,17 +172,28 @@ static int fail_iommu_bus_notify(struct notifier_block *nb, return 0; } -static struct notifier_block fail_iommu_bus_notifier = { +/* + * PCI and VIO buses need separate notifier_block structs, since they're linked + * list nodes. Sharing a notifier_block would mean that any notifiers later + * registered for PCI buses would also get called by VIO buses and vice versa. + */ +static struct notifier_block fail_iommu_pci_bus_notifier = { + .notifier_call = fail_iommu_bus_notify +}; + +#ifdef CONFIG_IBMVIO +static struct notifier_block fail_iommu_vio_bus_notifier = { .notifier_call = fail_iommu_bus_notify }; +#endif static int __init fail_iommu_setup(void) { #ifdef CONFIG_PCI - bus_register_notifier(&pci_bus_type, &fail_iommu_bus_notifier); + bus_register_notifier(&pci_bus_type, &fail_iommu_pci_bus_notifier); #endif #ifdef CONFIG_IBMVIO - bus_register_notifier(&vio_bus_type, &fail_iommu_bus_notifier); + bus_register_notifier(&vio_bus_type, &fail_iommu_vio_bus_notifier); #endif return 0; @@ -172,7 +222,6 @@ static unsigned long iommu_range_alloc(struct device *dev, int largealloc = npages > 15; int pass = 0; unsigned long align_mask; - unsigned long boundary_size; unsigned long flags; unsigned int pool_nr; struct iommu_pool *pool; @@ -236,15 +285,9 @@ again: } } - if (dev) - boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, - 1 << tbl->it_page_shift); - else - boundary_size = ALIGN(1UL << 32, 1 << tbl->it_page_shift); - /* 4GB boundary for iseries_hv_alloc and iseries_hv_map */ - n = iommu_area_alloc(tbl->it_map, limit, start, npages, tbl->it_offset, - boundary_size >> tbl->it_page_shift, align_mask); + dma_get_seg_boundary_nr_pages(dev, tbl->it_page_shift), + align_mask); if (n == -1) { if (likely(pass == 0)) { /* First try the pool from the start */ @@ -262,6 +305,15 @@ again: pass++; goto again; + } else if (pass == tbl->nr_pools + 1) { + /* Last resort: try largepool */ + spin_unlock(&pool->lock); + pool = &tbl->large_pool; + spin_lock(&pool->lock); + pool->hint = pool->start; + pass++; + goto again; + } else { /* Give up */ spin_unlock_irqrestore(&(pool->lock), flags); @@ -430,7 +482,7 @@ int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl, BUG_ON(direction == DMA_NONE); if ((nelems == 0) || !tbl) - return 0; + return -EINVAL; outs = s = segstart = &sglist[0]; outcount = 1; @@ -477,7 +529,7 @@ int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl, /* Convert entry to a dma_addr_t */ entry += tbl->it_offset; dma_addr = entry << tbl->it_page_shift; - dma_addr |= (s->offset & ~IOMMU_PAGE_MASK(tbl)); + dma_addr |= (vaddr & ~IOMMU_PAGE_MASK(tbl)); DBG(" - %lu pages, entry: %lx, dma_addr: %lx\n", npages, entry, dma_addr); @@ -532,7 +584,6 @@ int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl, */ if (outcount < incount) { outs = sg_next(outs); - outs->dma_address = DMA_MAPPING_ERROR; outs->dma_length = 0; } @@ -550,13 +601,12 @@ int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl, npages = iommu_num_pages(s->dma_address, s->dma_length, IOMMU_PAGE_SIZE(tbl)); __iommu_free(tbl, vaddr, npages); - s->dma_address = DMA_MAPPING_ERROR; s->dma_length = 0; } if (s == outs) break; } - return 0; + return -EIO; } @@ -647,32 +697,24 @@ static void iommu_table_reserve_pages(struct iommu_table *tbl, if (tbl->it_offset == 0) set_bit(0, tbl->it_map); - tbl->it_reserved_start = res_start; - tbl->it_reserved_end = res_end; + if (res_start < tbl->it_offset) + res_start = tbl->it_offset; - /* Check if res_start..res_end isn't empty and overlaps the table */ - if (res_start && res_end && - (tbl->it_offset + tbl->it_size < res_start || - res_end < tbl->it_offset)) - return; + if (res_end > (tbl->it_offset + tbl->it_size)) + res_end = tbl->it_offset + tbl->it_size; - for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i) - set_bit(i - tbl->it_offset, tbl->it_map); -} - -static void iommu_table_release_pages(struct iommu_table *tbl) -{ - int i; + /* Check if res_start..res_end is a valid range in the table */ + if (res_start >= res_end) { + tbl->it_reserved_start = tbl->it_offset; + tbl->it_reserved_end = tbl->it_offset; + return; + } - /* - * In case we have reserved the first bit, we should not emit - * the warning below. - */ - if (tbl->it_offset == 0) - clear_bit(0, tbl->it_map); + tbl->it_reserved_start = res_start; + tbl->it_reserved_end = res_end; for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i) - clear_bit(i - tbl->it_offset, tbl->it_map); + set_bit(i - tbl->it_offset, tbl->it_map); } /* @@ -684,7 +726,6 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid, { unsigned long sz; static int welcomed = 0; - struct page *page; unsigned int i; struct iommu_pool *p; @@ -693,11 +734,11 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid, /* number of bytes needed for the bitmap */ sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long); - page = alloc_pages_node(nid, GFP_KERNEL, get_order(sz)); - if (!page) - panic("iommu_init_table: Can't allocate %ld bytes\n", sz); - tbl->it_map = page_address(page); - memset(tbl->it_map, 0, sz); + tbl->it_map = vzalloc_node(sz, nid); + if (!tbl->it_map) { + pr_err("%s: Can't allocate %ld bytes\n", __func__, sz); + return NULL; + } iommu_table_reserve_pages(tbl, res_start, res_end); @@ -732,13 +773,34 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid, welcomed = 1; } + iommu_debugfs_add(tbl); + return tbl; } +bool iommu_table_in_use(struct iommu_table *tbl) +{ + unsigned long start = 0, end; + + /* ignore reserved bit0 */ + if (tbl->it_offset == 0) + start = 1; + + /* Simple case with no reserved MMIO32 region */ + if (!tbl->it_reserved_start && !tbl->it_reserved_end) + return find_next_bit(tbl->it_map, tbl->it_size, start) != tbl->it_size; + + end = tbl->it_reserved_start - tbl->it_offset; + if (find_next_bit(tbl->it_map, end, start) != end) + return true; + + start = tbl->it_reserved_end - tbl->it_offset; + end = tbl->it_size; + return find_next_bit(tbl->it_map, end, start) != end; +} + static void iommu_table_free(struct kref *kref) { - unsigned long bitmap_sz; - unsigned int order; struct iommu_table *tbl; tbl = container_of(kref, struct iommu_table, it_kref); @@ -751,18 +813,14 @@ static void iommu_table_free(struct kref *kref) return; } - iommu_table_release_pages(tbl); + iommu_debugfs_del(tbl); /* verify that table contains no entries */ - if (!bitmap_empty(tbl->it_map, tbl->it_size)) + if (iommu_table_in_use(tbl)) pr_warn("%s: Unexpected TCEs\n", __func__); - /* calculate bitmap size in bytes */ - bitmap_sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long); - /* free bitmap */ - order = get_order(bitmap_sz); - free_pages((unsigned long) tbl->it_map, order); + vfree(tbl->it_map); /* free table */ kfree(tbl); @@ -858,6 +916,7 @@ void *iommu_alloc_coherent(struct device *dev, struct iommu_table *tbl, unsigned int order; unsigned int nio_pages, io_order; struct page *page; + int tcesize = (1 << tbl->it_page_shift); size = PAGE_ALIGN(size); order = get_order(size); @@ -884,7 +943,8 @@ void *iommu_alloc_coherent(struct device *dev, struct iommu_table *tbl, memset(ret, 0, size); /* Set up tces to cover the allocated range */ - nio_pages = size >> tbl->it_page_shift; + nio_pages = IOMMU_PAGE_ALIGN(size, tbl) >> tbl->it_page_shift; + io_order = get_iommu_order(size, tbl); mapping = iommu_alloc(dev, tbl, ret, nio_pages, DMA_BIDIRECTIONAL, mask >> tbl->it_page_shift, io_order, 0); @@ -892,7 +952,8 @@ void *iommu_alloc_coherent(struct device *dev, struct iommu_table *tbl, free_pages((unsigned long)ret, order); return NULL; } - *dma_handle = mapping; + + *dma_handle = mapping | ((u64)ret & (tcesize - 1)); return ret; } @@ -903,7 +964,7 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size, unsigned int nio_pages; size = PAGE_ALIGN(size); - nio_pages = size >> tbl->it_page_shift; + nio_pages = IOMMU_PAGE_ALIGN(size, tbl) >> tbl->it_page_shift; iommu_free(tbl, dma_handle, nio_pages); size = PAGE_ALIGN(size); free_pages((unsigned long)vaddr, get_order(size)); @@ -1013,15 +1074,15 @@ int iommu_tce_check_gpa(unsigned long page_shift, unsigned long gpa) } EXPORT_SYMBOL_GPL(iommu_tce_check_gpa); -extern long iommu_tce_xchg_no_kill(struct mm_struct *mm, - struct iommu_table *tbl, - unsigned long entry, unsigned long *hpa, - enum dma_data_direction *direction) +long iommu_tce_xchg_no_kill(struct mm_struct *mm, + struct iommu_table *tbl, + unsigned long entry, unsigned long *hpa, + enum dma_data_direction *direction) { long ret; unsigned long size = 0; - ret = tbl->it_ops->xchg_no_kill(tbl, entry, hpa, direction, false); + ret = tbl->it_ops->xchg_no_kill(tbl, entry, hpa, direction); if (!ret && ((*direction == DMA_FROM_DEVICE) || (*direction == DMA_BIDIRECTIONAL)) && !mm_iommu_is_devmem(mm, *hpa, tbl->it_page_shift, @@ -1036,11 +1097,12 @@ void iommu_tce_kill(struct iommu_table *tbl, unsigned long entry, unsigned long pages) { if (tbl->it_ops->tce_kill) - tbl->it_ops->tce_kill(tbl, entry, pages, false); + tbl->it_ops->tce_kill(tbl, entry, pages); } EXPORT_SYMBOL_GPL(iommu_tce_kill); -int iommu_take_ownership(struct iommu_table *tbl) +#if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) +static int iommu_take_ownership(struct iommu_table *tbl) { unsigned long flags, i, sz = (tbl->it_size + 7) >> 3; int ret = 0; @@ -1057,16 +1119,11 @@ int iommu_take_ownership(struct iommu_table *tbl) spin_lock_irqsave(&tbl->large_pool.lock, flags); for (i = 0; i < tbl->nr_pools; i++) - spin_lock(&tbl->pools[i].lock); - - iommu_table_release_pages(tbl); + spin_lock_nest_lock(&tbl->pools[i].lock, &tbl->large_pool.lock); - if (!bitmap_empty(tbl->it_map, tbl->it_size)) { + if (iommu_table_in_use(tbl)) { pr_err("iommu_tce: it_map is not empty"); ret = -EBUSY; - /* Undo iommu_table_release_pages, i.e. restore bit#0, etc */ - iommu_table_reserve_pages(tbl, tbl->it_reserved_start, - tbl->it_reserved_end); } else { memset(tbl->it_map, 0xff, sz); } @@ -1077,15 +1134,14 @@ int iommu_take_ownership(struct iommu_table *tbl) return ret; } -EXPORT_SYMBOL_GPL(iommu_take_ownership); -void iommu_release_ownership(struct iommu_table *tbl) +static void iommu_release_ownership(struct iommu_table *tbl) { unsigned long flags, i, sz = (tbl->it_size + 7) >> 3; spin_lock_irqsave(&tbl->large_pool.lock, flags); for (i = 0; i < tbl->nr_pools; i++) - spin_lock(&tbl->pools[i].lock); + spin_lock_nest_lock(&tbl->pools[i].lock, &tbl->large_pool.lock); memset(tbl->it_map, 0, sz); @@ -1096,7 +1152,7 @@ void iommu_release_ownership(struct iommu_table *tbl) spin_unlock(&tbl->pools[i].lock); spin_unlock_irqrestore(&tbl->large_pool.lock, flags); } -EXPORT_SYMBOL_GPL(iommu_release_ownership); +#endif int iommu_add_device(struct iommu_table_group *table_group, struct device *dev) { @@ -1117,25 +1173,240 @@ int iommu_add_device(struct iommu_table_group *table_group, struct device *dev) pr_debug("%s: Adding %s to iommu group %d\n", __func__, dev_name(dev), iommu_group_id(table_group->group)); - - return iommu_group_add_device(table_group->group, dev); + /* + * This is still not adding devices via the IOMMU bus notifier because + * of pcibios_init() from arch/powerpc/kernel/pci_64.c which calls + * pcibios_scan_phb() first (and this guy adds devices and triggers + * the notifier) and only then it calls pci_bus_add_devices() which + * configures DMA for buses which also creates PEs and IOMMU groups. + */ + return iommu_probe_device(dev); } EXPORT_SYMBOL_GPL(iommu_add_device); -void iommu_del_device(struct device *dev) +#if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) +/* + * A simple iommu_table_group_ops which only allows reusing the existing + * iommu_table. This handles VFIO for POWER7 or the nested KVM. + * The ops does not allow creating windows and only allows reusing the existing + * one if it matches table_group->tce32_start/tce32_size/page_shift. + */ +static unsigned long spapr_tce_get_table_size(__u32 page_shift, + __u64 window_size, __u32 levels) +{ + unsigned long size; + + if (levels > 1) + return ~0U; + size = window_size >> (page_shift - 3); + return size; +} + +static long spapr_tce_create_table(struct iommu_table_group *table_group, int num, + __u32 page_shift, __u64 window_size, __u32 levels, + struct iommu_table **ptbl) { + struct iommu_table *tbl = table_group->tables[0]; + + if (num > 0) + return -EPERM; + + if (tbl->it_page_shift != page_shift || + tbl->it_size != (window_size >> page_shift) || + tbl->it_indirect_levels != levels - 1) + return -EINVAL; + + *ptbl = iommu_tce_table_get(tbl); + return 0; +} + +static long spapr_tce_set_window(struct iommu_table_group *table_group, + int num, struct iommu_table *tbl) +{ + return tbl == table_group->tables[num] ? 0 : -EPERM; +} + +static long spapr_tce_unset_window(struct iommu_table_group *table_group, int num) +{ + return 0; +} + +static long spapr_tce_take_ownership(struct iommu_table_group *table_group) +{ + int i, j, rc = 0; + + for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { + struct iommu_table *tbl = table_group->tables[i]; + + if (!tbl || !tbl->it_map) + continue; + + rc = iommu_take_ownership(tbl); + if (!rc) + continue; + + for (j = 0; j < i; ++j) + iommu_release_ownership(table_group->tables[j]); + return rc; + } + return 0; +} + +static void spapr_tce_release_ownership(struct iommu_table_group *table_group) +{ + int i; + + for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { + struct iommu_table *tbl = table_group->tables[i]; + + if (!tbl) + continue; + + iommu_table_clear(tbl); + if (tbl->it_map) + iommu_release_ownership(tbl); + } +} + +struct iommu_table_group_ops spapr_tce_table_group_ops = { + .get_table_size = spapr_tce_get_table_size, + .create_table = spapr_tce_create_table, + .set_window = spapr_tce_set_window, + .unset_window = spapr_tce_unset_window, + .take_ownership = spapr_tce_take_ownership, + .release_ownership = spapr_tce_release_ownership, +}; + +/* + * A simple iommu_ops to allow less cruft in generic VFIO code. + */ +static int +spapr_tce_platform_iommu_attach_dev(struct iommu_domain *platform_domain, + struct device *dev) +{ + struct iommu_domain *domain = iommu_get_domain_for_dev(dev); + struct iommu_group *grp = iommu_group_get(dev); + struct iommu_table_group *table_group; + int ret = -EINVAL; + + /* At first attach the ownership is already set */ + if (!domain) + return 0; + + if (!grp) + return -ENODEV; + + table_group = iommu_group_get_iommudata(grp); + ret = table_group->ops->take_ownership(table_group); + iommu_group_put(grp); + + return ret; +} + +static const struct iommu_domain_ops spapr_tce_platform_domain_ops = { + .attach_dev = spapr_tce_platform_iommu_attach_dev, +}; + +static struct iommu_domain spapr_tce_platform_domain = { + .type = IOMMU_DOMAIN_PLATFORM, + .ops = &spapr_tce_platform_domain_ops, +}; + +static struct iommu_domain spapr_tce_blocked_domain = { + .type = IOMMU_DOMAIN_BLOCKED, /* - * Some devices might not have IOMMU table and group - * and we needn't detach them from the associated - * IOMMU groups + * FIXME: SPAPR mixes blocked and platform behaviors, the blocked domain + * also sets the dma_api ops */ - if (!device_iommu_mapped(dev)) { - pr_debug("iommu_tce: skipping device %s with no tbl\n", - dev_name(dev)); - return; + .ops = &spapr_tce_platform_domain_ops, +}; + +static bool spapr_tce_iommu_capable(struct device *dev, enum iommu_cap cap) +{ + switch (cap) { + case IOMMU_CAP_CACHE_COHERENCY: + return true; + default: + break; } - iommu_group_remove_device(dev); + return false; } -EXPORT_SYMBOL_GPL(iommu_del_device); + +static struct iommu_device *spapr_tce_iommu_probe_device(struct device *dev) +{ + struct pci_dev *pdev; + struct pci_controller *hose; + + if (!dev_is_pci(dev)) + return ERR_PTR(-EPERM); + + pdev = to_pci_dev(dev); + hose = pdev->bus->sysdata; + + return &hose->iommu; +} + +static void spapr_tce_iommu_release_device(struct device *dev) +{ +} + +static struct iommu_group *spapr_tce_iommu_device_group(struct device *dev) +{ + struct pci_controller *hose; + struct pci_dev *pdev; + + pdev = to_pci_dev(dev); + hose = pdev->bus->sysdata; + + if (!hose->controller_ops.device_group) + return ERR_PTR(-ENOENT); + + return hose->controller_ops.device_group(hose, pdev); +} + +static const struct iommu_ops spapr_tce_iommu_ops = { + .default_domain = &spapr_tce_platform_domain, + .blocked_domain = &spapr_tce_blocked_domain, + .capable = spapr_tce_iommu_capable, + .probe_device = spapr_tce_iommu_probe_device, + .release_device = spapr_tce_iommu_release_device, + .device_group = spapr_tce_iommu_device_group, +}; + +static struct attribute *spapr_tce_iommu_attrs[] = { + NULL, +}; + +static struct attribute_group spapr_tce_iommu_group = { + .name = "spapr-tce-iommu", + .attrs = spapr_tce_iommu_attrs, +}; + +static const struct attribute_group *spapr_tce_iommu_groups[] = { + &spapr_tce_iommu_group, + NULL, +}; + +/* + * This registers IOMMU devices of PHBs. This needs to happen + * after core_initcall(iommu_init) + postcore_initcall(pci_driver_init) and + * before subsys_initcall(iommu_subsys_init). + */ +static int __init spapr_tce_setup_phb_iommus_initcall(void) +{ + struct pci_controller *hose; + + list_for_each_entry(hose, &hose_list, list_node) { + iommu_device_sysfs_add(&hose->iommu, hose->parent, + spapr_tce_iommu_groups, "iommu-phb%04x", + hose->global_number); + iommu_device_register(&hose->iommu, &spapr_tce_iommu_ops, + hose->parent); + } + return 0; +} +postcore_initcall_sync(spapr_tce_setup_phb_iommus_initcall); +#endif + #endif /* CONFIG_IOMMU_API */ diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 5645bc9cbc09..6f7d4edaa0bc 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -50,26 +50,23 @@ #include <linux/debugfs.h> #include <linux/of.h> #include <linux/of_irq.h> +#include <linux/vmalloc.h> +#include <linux/pgtable.h> +#include <linux/static_call.h> #include <linux/uaccess.h> +#include <asm/interrupt.h> #include <asm/io.h> -#include <asm/pgtable.h> #include <asm/irq.h> #include <asm/cache.h> -#include <asm/prom.h> #include <asm/ptrace.h> #include <asm/machdep.h> #include <asm/udbg.h> #include <asm/smp.h> -#include <asm/livepatch.h> -#include <asm/asm-prototypes.h> #include <asm/hw_irq.h> +#include <asm/softirq_stack.h> +#include <asm/ppc_asm.h> -#ifdef CONFIG_PPC64 -#include <asm/paca.h> -#include <asm/firmware.h> -#include <asm/lv1call.h> -#endif #define CREATE_TRACE_POINTS #include <asm/trace.h> #include <asm/cpu_has_feature.h> @@ -86,417 +83,6 @@ u32 tau_interrupts(unsigned long cpu); #endif #endif /* CONFIG_PPC32 */ -#ifdef CONFIG_PPC64 - -int distribute_irqs = 1; - -static inline notrace unsigned long get_irq_happened(void) -{ - unsigned long happened; - - __asm__ __volatile__("lbz %0,%1(13)" - : "=r" (happened) : "i" (offsetof(struct paca_struct, irq_happened))); - - return happened; -} - -static inline notrace int decrementer_check_overflow(void) -{ - u64 now = get_tb_or_rtc(); - u64 *next_tb = this_cpu_ptr(&decrementers_next_tb); - - return now >= *next_tb; -} - -/* This is called whenever we are re-enabling interrupts - * and returns either 0 (nothing to do) or 500/900/280/a00/e80 if - * there's an EE, DEC or DBELL to generate. - * - * This is called in two contexts: From arch_local_irq_restore() - * before soft-enabling interrupts, and from the exception exit - * path when returning from an interrupt from a soft-disabled to - * a soft enabled context. In both case we have interrupts hard - * disabled. - * - * We take care of only clearing the bits we handled in the - * PACA irq_happened field since we can only re-emit one at a - * time and we don't want to "lose" one. - */ -notrace unsigned int __check_irq_replay(void) -{ - /* - * We use local_paca rather than get_paca() to avoid all - * the debug_smp_processor_id() business in this low level - * function - */ - unsigned char happened = local_paca->irq_happened; - - /* - * We are responding to the next interrupt, so interrupt-off - * latencies should be reset here. - */ - trace_hardirqs_on(); - trace_hardirqs_off(); - - /* - * We are always hard disabled here, but PACA_IRQ_HARD_DIS may - * not be set, which means interrupts have only just been hard - * disabled as part of the local_irq_restore or interrupt return - * code. In that case, skip the decrementr check becaus it's - * expensive to read the TB. - * - * HARD_DIS then gets cleared here, but it's reconciled later. - * Either local_irq_disable will replay the interrupt and that - * will reconcile state like other hard interrupts. Or interrupt - * retur will replay the interrupt and in that case it sets - * PACA_IRQ_HARD_DIS by hand (see comments in entry_64.S). - */ - if (happened & PACA_IRQ_HARD_DIS) { - local_paca->irq_happened &= ~PACA_IRQ_HARD_DIS; - - /* - * We may have missed a decrementer interrupt if hard disabled. - * Check the decrementer register in case we had a rollover - * while hard disabled. - */ - if (!(happened & PACA_IRQ_DEC)) { - if (decrementer_check_overflow()) { - local_paca->irq_happened |= PACA_IRQ_DEC; - happened |= PACA_IRQ_DEC; - } - } - } - - /* - * Force the delivery of pending soft-disabled interrupts on PS3. - * Any HV call will have this side effect. - */ - if (firmware_has_feature(FW_FEATURE_PS3_LV1)) { - u64 tmp, tmp2; - lv1_get_version_info(&tmp, &tmp2); - } - - /* - * Check if an hypervisor Maintenance interrupt happened. - * This is a higher priority interrupt than the others, so - * replay it first. - */ - if (happened & PACA_IRQ_HMI) { - local_paca->irq_happened &= ~PACA_IRQ_HMI; - return 0xe60; - } - - if (happened & PACA_IRQ_DEC) { - local_paca->irq_happened &= ~PACA_IRQ_DEC; - return 0x900; - } - - if (happened & PACA_IRQ_PMI) { - local_paca->irq_happened &= ~PACA_IRQ_PMI; - return 0xf00; - } - - if (happened & PACA_IRQ_EE) { - local_paca->irq_happened &= ~PACA_IRQ_EE; - return 0x500; - } - -#ifdef CONFIG_PPC_BOOK3E - /* - * Check if an EPR external interrupt happened this bit is typically - * set if we need to handle another "edge" interrupt from within the - * MPIC "EPR" handler. - */ - if (happened & PACA_IRQ_EE_EDGE) { - local_paca->irq_happened &= ~PACA_IRQ_EE_EDGE; - return 0x500; - } - - if (happened & PACA_IRQ_DBELL) { - local_paca->irq_happened &= ~PACA_IRQ_DBELL; - return 0x280; - } -#else - if (happened & PACA_IRQ_DBELL) { - local_paca->irq_happened &= ~PACA_IRQ_DBELL; - return 0xa00; - } -#endif /* CONFIG_PPC_BOOK3E */ - - /* There should be nothing left ! */ - BUG_ON(local_paca->irq_happened != 0); - - return 0; -} - -notrace void arch_local_irq_restore(unsigned long mask) -{ - unsigned char irq_happened; - unsigned int replay; - - /* Write the new soft-enabled value */ - irq_soft_mask_set(mask); - if (mask) - return; - - /* - * From this point onward, we can take interrupts, preempt, - * etc... unless we got hard-disabled. We check if an event - * happened. If none happened, we know we can just return. - * - * We may have preempted before the check below, in which case - * we are checking the "new" CPU instead of the old one. This - * is only a problem if an event happened on the "old" CPU. - * - * External interrupt events will have caused interrupts to - * be hard-disabled, so there is no problem, we - * cannot have preempted. - */ - irq_happened = get_irq_happened(); - if (!irq_happened) { -#ifdef CONFIG_PPC_IRQ_SOFT_MASK_DEBUG - WARN_ON_ONCE(!(mfmsr() & MSR_EE)); -#endif - return; - } - - /* - * We need to hard disable to get a trusted value from - * __check_irq_replay(). We also need to soft-disable - * again to avoid warnings in there due to the use of - * per-cpu variables. - */ - if (!(irq_happened & PACA_IRQ_HARD_DIS)) { -#ifdef CONFIG_PPC_IRQ_SOFT_MASK_DEBUG - WARN_ON_ONCE(!(mfmsr() & MSR_EE)); -#endif - __hard_irq_disable(); -#ifdef CONFIG_PPC_IRQ_SOFT_MASK_DEBUG - } else { - /* - * We should already be hard disabled here. We had bugs - * where that wasn't the case so let's dbl check it and - * warn if we are wrong. Only do that when IRQ tracing - * is enabled as mfmsr() can be costly. - */ - if (WARN_ON_ONCE(mfmsr() & MSR_EE)) - __hard_irq_disable(); -#endif - } - - irq_soft_mask_set(IRQS_ALL_DISABLED); - trace_hardirqs_off(); - - /* - * Check if anything needs to be re-emitted. We haven't - * soft-enabled yet to avoid warnings in decrementer_check_overflow - * accessing per-cpu variables - */ - replay = __check_irq_replay(); - - /* We can soft-enable now */ - trace_hardirqs_on(); - irq_soft_mask_set(IRQS_ENABLED); - - /* - * And replay if we have to. This will return with interrupts - * hard-enabled. - */ - if (replay) { - __replay_interrupt(replay); - return; - } - - /* Finally, let's ensure we are hard enabled */ - __hard_irq_enable(); -} -EXPORT_SYMBOL(arch_local_irq_restore); - -/* - * This is specifically called by assembly code to re-enable interrupts - * if they are currently disabled. This is typically called before - * schedule() or do_signal() when returning to userspace. We do it - * in C to avoid the burden of dealing with lockdep etc... - * - * NOTE: This is called with interrupts hard disabled but not marked - * as such in paca->irq_happened, so we need to resync this. - */ -void notrace restore_interrupts(void) -{ - if (irqs_disabled()) { - local_paca->irq_happened |= PACA_IRQ_HARD_DIS; - local_irq_enable(); - } else - __hard_irq_enable(); -} - -/* - * This is a helper to use when about to go into idle low-power - * when the latter has the side effect of re-enabling interrupts - * (such as calling H_CEDE under pHyp). - * - * You call this function with interrupts soft-disabled (this is - * already the case when ppc_md.power_save is called). The function - * will return whether to enter power save or just return. - * - * In the former case, it will have notified lockdep of interrupts - * being re-enabled and generally sanitized the lazy irq state, - * and in the latter case it will leave with interrupts hard - * disabled and marked as such, so the local_irq_enable() call - * in arch_cpu_idle() will properly re-enable everything. - */ -bool prep_irq_for_idle(void) -{ - /* - * First we need to hard disable to ensure no interrupt - * occurs before we effectively enter the low power state - */ - __hard_irq_disable(); - local_paca->irq_happened |= PACA_IRQ_HARD_DIS; - - /* - * If anything happened while we were soft-disabled, - * we return now and do not enter the low power state. - */ - if (lazy_irq_pending()) - return false; - - /* Tell lockdep we are about to re-enable */ - trace_hardirqs_on(); - - /* - * Mark interrupts as soft-enabled and clear the - * PACA_IRQ_HARD_DIS from the pending mask since we - * are about to hard enable as well as a side effect - * of entering the low power state. - */ - local_paca->irq_happened &= ~PACA_IRQ_HARD_DIS; - irq_soft_mask_set(IRQS_ENABLED); - - /* Tell the caller to enter the low power state */ - return true; -} - -#ifdef CONFIG_PPC_BOOK3S -/* - * This is for idle sequences that return with IRQs off, but the - * idle state itself wakes on interrupt. Tell the irq tracer that - * IRQs are enabled for the duration of idle so it does not get long - * off times. Must be paired with fini_irq_for_idle_irqsoff. - */ -bool prep_irq_for_idle_irqsoff(void) -{ - WARN_ON(!irqs_disabled()); - - /* - * First we need to hard disable to ensure no interrupt - * occurs before we effectively enter the low power state - */ - __hard_irq_disable(); - local_paca->irq_happened |= PACA_IRQ_HARD_DIS; - - /* - * If anything happened while we were soft-disabled, - * we return now and do not enter the low power state. - */ - if (lazy_irq_pending()) - return false; - - /* Tell lockdep we are about to re-enable */ - trace_hardirqs_on(); - - return true; -} - -/* - * Take the SRR1 wakeup reason, index into this table to find the - * appropriate irq_happened bit. - * - * Sytem reset exceptions taken in idle state also come through here, - * but they are NMI interrupts so do not need to wait for IRQs to be - * restored, and should be taken as early as practical. These are marked - * with 0xff in the table. The Power ISA specifies 0100b as the system - * reset interrupt reason. - */ -#define IRQ_SYSTEM_RESET 0xff - -static const u8 srr1_to_lazyirq[0x10] = { - 0, 0, 0, - PACA_IRQ_DBELL, - IRQ_SYSTEM_RESET, - PACA_IRQ_DBELL, - PACA_IRQ_DEC, - 0, - PACA_IRQ_EE, - PACA_IRQ_EE, - PACA_IRQ_HMI, - 0, 0, 0, 0, 0 }; - -void replay_system_reset(void) -{ - struct pt_regs regs; - - ppc_save_regs(®s); - regs.trap = 0x100; - get_paca()->in_nmi = 1; - system_reset_exception(®s); - get_paca()->in_nmi = 0; -} -EXPORT_SYMBOL_GPL(replay_system_reset); - -void irq_set_pending_from_srr1(unsigned long srr1) -{ - unsigned int idx = (srr1 & SRR1_WAKEMASK_P8) >> 18; - u8 reason = srr1_to_lazyirq[idx]; - - /* - * Take the system reset now, which is immediately after registers - * are restored from idle. It's an NMI, so interrupts need not be - * re-enabled before it is taken. - */ - if (unlikely(reason == IRQ_SYSTEM_RESET)) { - replay_system_reset(); - return; - } - - /* - * The 0 index (SRR1[42:45]=b0000) must always evaluate to 0, - * so this can be called unconditionally with the SRR1 wake - * reason as returned by the idle code, which uses 0 to mean no - * interrupt. - * - * If a future CPU was to designate this as an interrupt reason, - * then a new index for no interrupt must be assigned. - */ - local_paca->irq_happened |= reason; -} -#endif /* CONFIG_PPC_BOOK3S */ - -/* - * Force a replay of the external interrupt handler on this CPU. - */ -void force_external_irq_replay(void) -{ - /* - * This must only be called with interrupts soft-disabled, - * the replay will happen when re-enabling. - */ - WARN_ON(!arch_irqs_disabled()); - - /* - * Interrupts must always be hard disabled before irq_happened is - * modified (to prevent lost update in case of interrupt between - * load and store). - */ - __hard_irq_disable(); - local_paca->irq_happened |= PACA_IRQ_HARD_DIS; - - /* Indicate in the PACA that we have an interrupt to replay */ - local_paca->irq_happened |= PACA_IRQ_EE; -} - -#endif /* CONFIG_PPC64 */ - int arch_show_interrupts(struct seq_file *p, int prec) { int j; @@ -540,13 +126,14 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_printf(p, "%10u ", per_cpu(irq_stat, j).mce_exceptions); seq_printf(p, " Machine check exceptions\n"); +#ifdef CONFIG_PPC_BOOK3S_64 if (cpu_has_feature(CPU_FTR_HVMODE)) { seq_printf(p, "%*s: ", prec, "HMI"); for_each_online_cpu(j) - seq_printf(p, "%10u ", - per_cpu(irq_stat, j).hmi_exceptions); + seq_printf(p, "%10u ", paca_ptrs[j]->hmi_irqs); seq_printf(p, " Hypervisor Maintenance Interrupts\n"); } +#endif seq_printf(p, "%*s: ", prec, "NMI"); for_each_online_cpu(j) @@ -584,7 +171,9 @@ u64 arch_irq_stat_cpu(unsigned int cpu) sum += per_cpu(irq_stat, cpu).mce_exceptions; sum += per_cpu(irq_stat, cpu).spurious_irqs; sum += per_cpu(irq_stat, cpu).timer_irqs_others; - sum += per_cpu(irq_stat, cpu).hmi_exceptions; +#ifdef CONFIG_PPC_BOOK3S_64 + sum += paca_ptrs[cpu]->hmi_irqs; +#endif sum += per_cpu(irq_stat, cpu).sreset_irqs; #ifdef CONFIG_PPC_WATCHDOG sum += per_cpu(irq_stat, cpu).soft_nmi_irqs; @@ -596,40 +185,65 @@ u64 arch_irq_stat_cpu(unsigned int cpu) return sum; } -static inline void check_stack_overflow(void) +static inline void check_stack_overflow(unsigned long sp) { -#ifdef CONFIG_DEBUG_STACKOVERFLOW - long sp; + if (!IS_ENABLED(CONFIG_DEBUG_STACKOVERFLOW)) + return; - sp = current_stack_pointer() & (THREAD_SIZE-1); + sp &= THREAD_SIZE - 1; - /* check for stack overflow: is there less than 2KB free? */ - if (unlikely(sp < 2048)) { + /* check for stack overflow: is there less than 1/4th free? */ + if (unlikely(sp < THREAD_SIZE / 4)) { pr_err("do_IRQ: stack overflow: %ld\n", sp); dump_stack(); } +} + +#ifdef CONFIG_SOFTIRQ_ON_OWN_STACK +static __always_inline void call_do_softirq(const void *sp) +{ + /* Temporarily switch r1 to sp, call __do_softirq() then restore r1. */ + asm volatile ( + PPC_STLU " %%r1, %[offset](%[sp]) ;" + "mr %%r1, %[sp] ;" +#ifdef CONFIG_PPC_KERNEL_PCREL + "bl %[callee]@notoc ;" +#else + "bl %[callee] ;" #endif + PPC_LL " %%r1, 0(%%r1) ;" + : // Outputs + : // Inputs + [sp] "b" (sp), [offset] "i" (THREAD_SIZE - STACK_FRAME_MIN_SIZE), + [callee] "i" (__do_softirq) + : // Clobbers + "lr", "xer", "ctr", "memory", "cr0", "cr1", "cr5", "cr6", + "cr7", "r0", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", + "r11", "r12" + ); } +#endif -void __do_irq(struct pt_regs *regs) +DEFINE_STATIC_CALL_RET0(ppc_get_irq, *ppc_md.get_irq); + +static void __do_irq(struct pt_regs *regs, unsigned long oldsp) { unsigned int irq; - irq_enter(); - trace_irq_entry(regs); - check_stack_overflow(); + check_stack_overflow(oldsp); /* * Query the platform PIC for the interrupt & ack it. * * This will typically lower the interrupt line to the CPU */ - irq = ppc_md.get_irq(); + irq = static_call(ppc_get_irq)(); /* We can hard enable interrupts now to allow perf interrupts */ - may_hard_irq_enable(); + if (should_hard_irq_enable(regs)) + do_hard_irq_enable(); /* And finally process it */ if (unlikely(!irq)) @@ -638,39 +252,89 @@ void __do_irq(struct pt_regs *regs) generic_handle_irq(irq); trace_irq_exit(regs); +} - irq_exit(); +static __always_inline void call_do_irq(struct pt_regs *regs, void *sp) +{ + register unsigned long r3 asm("r3") = (unsigned long)regs; + + /* Temporarily switch r1 to sp, call __do_irq() then restore r1. */ + asm volatile ( + PPC_STLU " %%r1, %[offset](%[sp]) ;" + "mr %%r4, %%r1 ;" + "mr %%r1, %[sp] ;" +#ifdef CONFIG_PPC_KERNEL_PCREL + "bl %[callee]@notoc ;" +#else + "bl %[callee] ;" +#endif + PPC_LL " %%r1, 0(%%r1) ;" + : // Outputs + "+r" (r3) + : // Inputs + [sp] "b" (sp), [offset] "i" (THREAD_SIZE - STACK_FRAME_MIN_SIZE), + [callee] "i" (__do_irq) + : // Clobbers + "lr", "xer", "ctr", "memory", "cr0", "cr1", "cr5", "cr6", + "cr7", "r0", "r4", "r5", "r6", "r7", "r8", "r9", "r10", + "r11", "r12" + ); } -void do_IRQ(struct pt_regs *regs) +void __do_IRQ(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); void *cursp, *irqsp, *sirqsp; /* Switch to the irq stack to handle this */ - cursp = (void *)(current_stack_pointer() & ~(THREAD_SIZE - 1)); + cursp = (void *)(current_stack_pointer & ~(THREAD_SIZE - 1)); irqsp = hardirq_ctx[raw_smp_processor_id()]; sirqsp = softirq_ctx[raw_smp_processor_id()]; - /* Already there ? */ - if (unlikely(cursp == irqsp || cursp == sirqsp)) { - __do_irq(regs); - set_irq_regs(old_regs); - return; - } - /* Switch stack and call */ - call_do_irq(regs, irqsp); + /* Already there ? If not switch stack and call */ + if (unlikely(cursp == irqsp || cursp == sirqsp)) + __do_irq(regs, current_stack_pointer); + else + call_do_irq(regs, irqsp); set_irq_regs(old_regs); } +DEFINE_INTERRUPT_HANDLER_ASYNC(do_IRQ) +{ + __do_IRQ(regs); +} + +static void *__init alloc_vm_stack(void) +{ + return __vmalloc_node(THREAD_SIZE, THREAD_ALIGN, THREADINFO_GFP, + NUMA_NO_NODE, (void *)_RET_IP_); +} + +static void __init vmap_irqstack_init(void) +{ + int i; + + for_each_possible_cpu(i) { + softirq_ctx[i] = alloc_vm_stack(); + hardirq_ctx[i] = alloc_vm_stack(); + } +} + + void __init init_IRQ(void) { + if (IS_ENABLED(CONFIG_VMAP_STACK)) + vmap_irqstack_init(); + if (ppc_md.init_IRQ) ppc_md.init_IRQ(); + + if (!WARN_ON(!ppc_md.get_irq)) + static_call_update(ppc_get_irq, ppc_md.get_irq); } -#if defined(CONFIG_BOOKE) || defined(CONFIG_40x) +#ifdef CONFIG_BOOKE_OR_40x void *critirq_ctx[NR_CPUS] __read_mostly; void *dbgirq_ctx[NR_CPUS] __read_mostly; void *mcheckirq_ctx[NR_CPUS] __read_mostly; @@ -679,10 +343,12 @@ void *mcheckirq_ctx[NR_CPUS] __read_mostly; void *softirq_ctx[NR_CPUS] __read_mostly; void *hardirq_ctx[NR_CPUS] __read_mostly; +#ifdef CONFIG_SOFTIRQ_ON_OWN_STACK void do_softirq_own_stack(void) { call_do_softirq(softirq_ctx[smp_processor_id()]); } +#endif irq_hw_number_t virq_to_hw(unsigned int virq) { @@ -726,13 +392,3 @@ int irq_choose_cpu(const struct cpumask *mask) return hard_smp_processor_id(); } #endif - -#ifdef CONFIG_PPC64 -static int __init setup_noirqdistrib(char *str) -{ - distribute_irqs = 0; - return 1; -} - -__setup("noirqdistrib", setup_noirqdistrib); -#endif /* CONFIG_PPC64 */ diff --git a/arch/powerpc/kernel/irq_64.c b/arch/powerpc/kernel/irq_64.c new file mode 100644 index 000000000000..938e66829eae --- /dev/null +++ b/arch/powerpc/kernel/irq_64.c @@ -0,0 +1,522 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Derived from arch/i386/kernel/irq.c + * Copyright (C) 1992 Linus Torvalds + * Adapted from arch/i386 by Gary Thomas + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * Updated and modified by Cort Dougan <cort@fsmlabs.com> + * Copyright (C) 1996-2001 Cort Dougan + * Adapted for Power Macintosh by Paul Mackerras + * Copyright (C) 1996 Paul Mackerras (paulus@cs.anu.edu.au) + * + * This file contains the code used by various IRQ handling routines: + * asking for different IRQ's should be done through these routines + * instead of just grabbing them. Thus setups with different IRQ numbers + * shouldn't result in any weird surprises, and installing new handlers + * should be easier. + */ + +#undef DEBUG + +#include <linux/export.h> +#include <linux/threads.h> +#include <linux/kernel_stat.h> +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/ptrace.h> +#include <linux/ioport.h> +#include <linux/interrupt.h> +#include <linux/timex.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/delay.h> +#include <linux/irq.h> +#include <linux/seq_file.h> +#include <linux/cpumask.h> +#include <linux/profile.h> +#include <linux/bitops.h> +#include <linux/list.h> +#include <linux/radix-tree.h> +#include <linux/mutex.h> +#include <linux/pci.h> +#include <linux/debugfs.h> +#include <linux/of.h> +#include <linux/of_irq.h> +#include <linux/vmalloc.h> +#include <linux/pgtable.h> +#include <linux/static_call.h> + +#include <linux/uaccess.h> +#include <asm/interrupt.h> +#include <asm/io.h> +#include <asm/irq.h> +#include <asm/cache.h> +#include <asm/ptrace.h> +#include <asm/machdep.h> +#include <asm/udbg.h> +#include <asm/smp.h> +#include <asm/hw_irq.h> +#include <asm/softirq_stack.h> +#include <asm/ppc_asm.h> + +#include <asm/paca.h> +#include <asm/firmware.h> +#include <asm/lv1call.h> +#include <asm/dbell.h> +#include <asm/trace.h> +#include <asm/cpu_has_feature.h> + +int distribute_irqs = 1; + +static inline void next_interrupt(struct pt_regs *regs) +{ + if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) { + WARN_ON(!(local_paca->irq_happened & PACA_IRQ_HARD_DIS)); + WARN_ON(irq_soft_mask_return() != IRQS_ALL_DISABLED); + } + + /* + * We are responding to the next interrupt, so interrupt-off + * latencies should be reset here. + */ + lockdep_hardirq_exit(); + trace_hardirqs_on(); + trace_hardirqs_off(); + lockdep_hardirq_enter(); +} + +static inline bool irq_happened_test_and_clear(u8 irq) +{ + if (local_paca->irq_happened & irq) { + local_paca->irq_happened &= ~irq; + return true; + } + return false; +} + +static __no_kcsan void __replay_soft_interrupts(void) +{ + struct pt_regs regs; + + /* + * We use local_paca rather than get_paca() to avoid all the + * debug_smp_processor_id() business in this low level function. + */ + + if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) { + WARN_ON_ONCE(mfmsr() & MSR_EE); + WARN_ON(!(local_paca->irq_happened & PACA_IRQ_HARD_DIS)); + WARN_ON(local_paca->irq_happened & PACA_IRQ_REPLAYING); + } + + /* + * PACA_IRQ_REPLAYING prevents interrupt handlers from enabling + * MSR[EE] to get PMIs, which can result in more IRQs becoming + * pending. + */ + local_paca->irq_happened |= PACA_IRQ_REPLAYING; + + ppc_save_regs(®s); + regs.softe = IRQS_ENABLED; + regs.msr |= MSR_EE; + + /* + * Force the delivery of pending soft-disabled interrupts on PS3. + * Any HV call will have this side effect. + */ + if (firmware_has_feature(FW_FEATURE_PS3_LV1)) { + u64 tmp, tmp2; + lv1_get_version_info(&tmp, &tmp2); + } + + /* + * Check if an hypervisor Maintenance interrupt happened. + * This is a higher priority interrupt than the others, so + * replay it first. + */ + if (IS_ENABLED(CONFIG_PPC_BOOK3S) && + irq_happened_test_and_clear(PACA_IRQ_HMI)) { + regs.trap = INTERRUPT_HMI; + handle_hmi_exception(®s); + next_interrupt(®s); + } + + if (irq_happened_test_and_clear(PACA_IRQ_DEC)) { + regs.trap = INTERRUPT_DECREMENTER; + timer_interrupt(®s); + next_interrupt(®s); + } + + if (irq_happened_test_and_clear(PACA_IRQ_EE)) { + regs.trap = INTERRUPT_EXTERNAL; + do_IRQ(®s); + next_interrupt(®s); + } + + if (IS_ENABLED(CONFIG_PPC_DOORBELL) && + irq_happened_test_and_clear(PACA_IRQ_DBELL)) { + regs.trap = INTERRUPT_DOORBELL; + doorbell_exception(®s); + next_interrupt(®s); + } + + /* Book3E does not support soft-masking PMI interrupts */ + if (IS_ENABLED(CONFIG_PPC_BOOK3S) && + irq_happened_test_and_clear(PACA_IRQ_PMI)) { + regs.trap = INTERRUPT_PERFMON; + performance_monitor_exception(®s); + next_interrupt(®s); + } + + local_paca->irq_happened &= ~PACA_IRQ_REPLAYING; +} + +__no_kcsan void replay_soft_interrupts(void) +{ + irq_enter(); /* See comment in arch_local_irq_restore */ + __replay_soft_interrupts(); + irq_exit(); +} + +#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_PPC_KUAP) +static inline __no_kcsan void replay_soft_interrupts_irqrestore(void) +{ + unsigned long kuap_state = get_kuap(); + + /* + * Check if anything calls local_irq_enable/restore() when KUAP is + * disabled (user access enabled). We handle that case here by saving + * and re-locking AMR but we shouldn't get here in the first place, + * hence the warning. + */ + kuap_assert_locked(); + + if (kuap_state != AMR_KUAP_BLOCKED) + set_kuap(AMR_KUAP_BLOCKED); + + __replay_soft_interrupts(); + + if (kuap_state != AMR_KUAP_BLOCKED) + set_kuap(kuap_state); +} +#else +#define replay_soft_interrupts_irqrestore() __replay_soft_interrupts() +#endif + +notrace __no_kcsan void arch_local_irq_restore(unsigned long mask) +{ + unsigned char irq_happened; + + /* Write the new soft-enabled value if it is a disable */ + if (mask) { + irq_soft_mask_set(mask); + return; + } + + if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) { + WARN_ON_ONCE(in_nmi()); + WARN_ON_ONCE(in_hardirq()); + WARN_ON_ONCE(local_paca->irq_happened & PACA_IRQ_REPLAYING); + } + +again: + /* + * After the stb, interrupts are unmasked and there are no interrupts + * pending replay. The restart sequence makes this atomic with + * respect to soft-masked interrupts. If this was just a simple code + * sequence, a soft-masked interrupt could become pending right after + * the comparison and before the stb. + * + * This allows interrupts to be unmasked without hard disabling, and + * also without new hard interrupts coming in ahead of pending ones. + */ + asm_volatile_goto( +"1: \n" +" lbz 9,%0(13) \n" +" cmpwi 9,0 \n" +" bne %l[happened] \n" +" stb 9,%1(13) \n" +"2: \n" + RESTART_TABLE(1b, 2b, 1b) + : : "i" (offsetof(struct paca_struct, irq_happened)), + "i" (offsetof(struct paca_struct, irq_soft_mask)) + : "cr0", "r9" + : happened); + + if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) + WARN_ON_ONCE(!(mfmsr() & MSR_EE)); + + /* + * If we came here from the replay below, we might have a preempt + * pending (due to preempt_enable_no_resched()). Have to check now. + */ + preempt_check_resched(); + + return; + +happened: + irq_happened = READ_ONCE(local_paca->irq_happened); + if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) + WARN_ON_ONCE(!irq_happened); + + if (irq_happened == PACA_IRQ_HARD_DIS) { + if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) + WARN_ON_ONCE(mfmsr() & MSR_EE); + irq_soft_mask_set(IRQS_ENABLED); + local_paca->irq_happened = 0; + __hard_irq_enable(); + preempt_check_resched(); + return; + } + + /* Have interrupts to replay, need to hard disable first */ + if (!(irq_happened & PACA_IRQ_HARD_DIS)) { + if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) { + if (!(mfmsr() & MSR_EE)) { + /* + * An interrupt could have come in and cleared + * MSR[EE] and set IRQ_HARD_DIS, so check + * IRQ_HARD_DIS again and warn if it is still + * clear. + */ + irq_happened = READ_ONCE(local_paca->irq_happened); + WARN_ON_ONCE(!(irq_happened & PACA_IRQ_HARD_DIS)); + } + } + __hard_irq_disable(); + local_paca->irq_happened |= PACA_IRQ_HARD_DIS; + } else { + if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) { + if (WARN_ON_ONCE(mfmsr() & MSR_EE)) + __hard_irq_disable(); + } + } + + /* + * Disable preempt here, so that the below preempt_enable will + * perform resched if required (a replayed interrupt may set + * need_resched). + */ + preempt_disable(); + irq_soft_mask_set(IRQS_ALL_DISABLED); + trace_hardirqs_off(); + + /* + * Now enter interrupt context. The interrupt handlers themselves + * also call irq_enter/exit (which is okay, they can nest). But call + * it here now to hold off softirqs until the below irq_exit(). If + * we allowed replayed handlers to run softirqs, that enables irqs, + * which must replay interrupts, which recurses in here and makes + * things more complicated. The recursion is limited to 2, and it can + * be made to work, but it's complicated. + * + * local_bh_disable can not be used here because interrupts taken in + * idle are not in the right context (RCU, tick, etc) to run softirqs + * so irq_enter must be called. + */ + irq_enter(); + + replay_soft_interrupts_irqrestore(); + + irq_exit(); + + if (unlikely(local_paca->irq_happened != PACA_IRQ_HARD_DIS)) { + /* + * The softirq processing in irq_exit() may enable interrupts + * temporarily, which can result in MSR[EE] being enabled and + * more irqs becoming pending. Go around again if that happens. + */ + trace_hardirqs_on(); + preempt_enable_no_resched(); + goto again; + } + + trace_hardirqs_on(); + irq_soft_mask_set(IRQS_ENABLED); + local_paca->irq_happened = 0; + __hard_irq_enable(); + preempt_enable(); +} +EXPORT_SYMBOL(arch_local_irq_restore); + +/* + * This is a helper to use when about to go into idle low-power + * when the latter has the side effect of re-enabling interrupts + * (such as calling H_CEDE under pHyp). + * + * You call this function with interrupts soft-disabled (this is + * already the case when ppc_md.power_save is called). The function + * will return whether to enter power save or just return. + * + * In the former case, it will have generally sanitized the lazy irq + * state, and in the latter case it will leave with interrupts hard + * disabled and marked as such, so the local_irq_enable() call + * in arch_cpu_idle() will properly re-enable everything. + */ +__cpuidle bool prep_irq_for_idle(void) +{ + /* + * First we need to hard disable to ensure no interrupt + * occurs before we effectively enter the low power state + */ + __hard_irq_disable(); + local_paca->irq_happened |= PACA_IRQ_HARD_DIS; + + /* + * If anything happened while we were soft-disabled, + * we return now and do not enter the low power state. + */ + if (lazy_irq_pending()) + return false; + + /* + * Mark interrupts as soft-enabled and clear the + * PACA_IRQ_HARD_DIS from the pending mask since we + * are about to hard enable as well as a side effect + * of entering the low power state. + */ + local_paca->irq_happened &= ~PACA_IRQ_HARD_DIS; + irq_soft_mask_set(IRQS_ENABLED); + + /* Tell the caller to enter the low power state */ + return true; +} + +#ifdef CONFIG_PPC_BOOK3S +/* + * This is for idle sequences that return with IRQs off, but the + * idle state itself wakes on interrupt. Tell the irq tracer that + * IRQs are enabled for the duration of idle so it does not get long + * off times. Must be paired with fini_irq_for_idle_irqsoff. + */ +bool prep_irq_for_idle_irqsoff(void) +{ + WARN_ON(!irqs_disabled()); + + /* + * First we need to hard disable to ensure no interrupt + * occurs before we effectively enter the low power state + */ + __hard_irq_disable(); + local_paca->irq_happened |= PACA_IRQ_HARD_DIS; + + /* + * If anything happened while we were soft-disabled, + * we return now and do not enter the low power state. + */ + if (lazy_irq_pending()) + return false; + + /* Tell lockdep we are about to re-enable */ + trace_hardirqs_on(); + + return true; +} + +/* + * Take the SRR1 wakeup reason, index into this table to find the + * appropriate irq_happened bit. + * + * Sytem reset exceptions taken in idle state also come through here, + * but they are NMI interrupts so do not need to wait for IRQs to be + * restored, and should be taken as early as practical. These are marked + * with 0xff in the table. The Power ISA specifies 0100b as the system + * reset interrupt reason. + */ +#define IRQ_SYSTEM_RESET 0xff + +static const u8 srr1_to_lazyirq[0x10] = { + 0, 0, 0, + PACA_IRQ_DBELL, + IRQ_SYSTEM_RESET, + PACA_IRQ_DBELL, + PACA_IRQ_DEC, + 0, + PACA_IRQ_EE, + PACA_IRQ_EE, + PACA_IRQ_HMI, + 0, 0, 0, 0, 0 }; + +void replay_system_reset(void) +{ + struct pt_regs regs; + + ppc_save_regs(®s); + regs.trap = 0x100; + get_paca()->in_nmi = 1; + system_reset_exception(®s); + get_paca()->in_nmi = 0; +} +EXPORT_SYMBOL_GPL(replay_system_reset); + +void irq_set_pending_from_srr1(unsigned long srr1) +{ + unsigned int idx = (srr1 & SRR1_WAKEMASK_P8) >> 18; + u8 reason = srr1_to_lazyirq[idx]; + + /* + * Take the system reset now, which is immediately after registers + * are restored from idle. It's an NMI, so interrupts need not be + * re-enabled before it is taken. + */ + if (unlikely(reason == IRQ_SYSTEM_RESET)) { + replay_system_reset(); + return; + } + + if (reason == PACA_IRQ_DBELL) { + /* + * When doorbell triggers a system reset wakeup, the message + * is not cleared, so if the doorbell interrupt is replayed + * and the IPI handled, the doorbell interrupt would still + * fire when EE is enabled. + * + * To avoid taking the superfluous doorbell interrupt, + * execute a msgclr here before the interrupt is replayed. + */ + ppc_msgclr(PPC_DBELL_MSGTYPE); + } + + /* + * The 0 index (SRR1[42:45]=b0000) must always evaluate to 0, + * so this can be called unconditionally with the SRR1 wake + * reason as returned by the idle code, which uses 0 to mean no + * interrupt. + * + * If a future CPU was to designate this as an interrupt reason, + * then a new index for no interrupt must be assigned. + */ + local_paca->irq_happened |= reason; +} +#endif /* CONFIG_PPC_BOOK3S */ + +/* + * Force a replay of the external interrupt handler on this CPU. + */ +void force_external_irq_replay(void) +{ + /* + * This must only be called with interrupts soft-disabled, + * the replay will happen when re-enabling. + */ + WARN_ON(!arch_irqs_disabled()); + + /* + * Interrupts must always be hard disabled before irq_happened is + * modified (to prevent lost update in case of interrupt between + * load and store). + */ + __hard_irq_disable(); + local_paca->irq_happened |= PACA_IRQ_HARD_DIS; + + /* Indicate in the PACA that we have an interrupt to replay */ + local_paca->irq_happened |= PACA_IRQ_EE; +} + +static int __init setup_noirqdistrib(char *str) +{ + distribute_irqs = 0; + return 1; +} + +__setup("noirqdistrib", setup_noirqdistrib); diff --git a/arch/powerpc/kernel/isa-bridge.c b/arch/powerpc/kernel/isa-bridge.c index 773671b512df..48e0eaf1ad61 100644 --- a/arch/powerpc/kernel/isa-bridge.c +++ b/arch/powerpc/kernel/isa-bridge.c @@ -18,10 +18,11 @@ #include <linux/init.h> #include <linux/mm.h> #include <linux/notifier.h> +#include <linux/of_address.h> +#include <linux/vmalloc.h> #include <asm/processor.h> #include <asm/io.h> -#include <asm/prom.h> #include <asm/pci-bridge.h> #include <asm/machdep.h> #include <asm/ppc-pci.h> @@ -38,82 +39,66 @@ EXPORT_SYMBOL_GPL(isa_bridge_pcidev); #define ISA_SPACE_MASK 0x1 #define ISA_SPACE_IO 0x1 -static void pci_process_ISA_OF_ranges(struct device_node *isa_node, - unsigned long phb_io_base_phys) +static void remap_isa_base(phys_addr_t pa, unsigned long size) +{ + WARN_ON_ONCE(ISA_IO_BASE & ~PAGE_MASK); + WARN_ON_ONCE(pa & ~PAGE_MASK); + WARN_ON_ONCE(size & ~PAGE_MASK); + + if (slab_is_available()) { + if (ioremap_page_range(ISA_IO_BASE, ISA_IO_BASE + size, pa, + pgprot_noncached(PAGE_KERNEL))) + vunmap_range(ISA_IO_BASE, ISA_IO_BASE + size); + } else { + early_ioremap_range(ISA_IO_BASE, pa, size, + pgprot_noncached(PAGE_KERNEL)); + } +} + +static int process_ISA_OF_ranges(struct device_node *isa_node, + unsigned long phb_io_base_phys) { - /* We should get some saner parsing here and remove these structs */ - struct pci_address { - u32 a_hi; - u32 a_mid; - u32 a_lo; - }; - - struct isa_address { - u32 a_hi; - u32 a_lo; - }; - - struct isa_range { - struct isa_address isa_addr; - struct pci_address pci_addr; - unsigned int size; - }; - - const struct isa_range *range; - unsigned long pci_addr; - unsigned int isa_addr; unsigned int size; - int rlen = 0; + struct of_range_parser parser; + struct of_range range; - range = of_get_property(isa_node, "ranges", &rlen); - if (range == NULL || (rlen < sizeof(struct isa_range))) + if (of_range_parser_init(&parser, isa_node)) goto inval_range; - /* From "ISA Binding to 1275" - * The ranges property is laid out as an array of elements, - * each of which comprises: - * cells 0 - 1: an ISA address - * cells 2 - 4: a PCI address - * (size depending on dev->n_addr_cells) - * cell 5: the size of the range - */ - if ((range->isa_addr.a_hi & ISA_SPACE_MASK) != ISA_SPACE_IO) { - range++; - rlen -= sizeof(struct isa_range); - if (rlen < sizeof(struct isa_range)) - goto inval_range; - } - if ((range->isa_addr.a_hi & ISA_SPACE_MASK) != ISA_SPACE_IO) - goto inval_range; + for_each_of_range(&parser, &range) { + if ((range.flags & ISA_SPACE_MASK) != ISA_SPACE_IO) + continue; - isa_addr = range->isa_addr.a_lo; - pci_addr = (unsigned long) range->pci_addr.a_mid << 32 | - range->pci_addr.a_lo; + if (range.cpu_addr == OF_BAD_ADDR) { + pr_err("ISA: Bad CPU mapping: %s\n", __func__); + return -EINVAL; + } - /* Assume these are both zero. Note: We could fix that and - * do a proper parsing instead ... oh well, that will do for - * now as nobody uses fancy mappings for ISA bridges - */ - if ((pci_addr != 0) || (isa_addr != 0)) { - printk(KERN_ERR "unexpected isa to pci mapping: %s\n", - __func__); - return; - } + /* We need page alignment */ + if ((range.bus_addr & ~PAGE_MASK) || (range.cpu_addr & ~PAGE_MASK)) { + pr_warn("ISA: bridge %pOF has non aligned IO range\n", isa_node); + return -EINVAL; + } - /* Align size and make sure it's cropped to 64K */ - size = PAGE_ALIGN(range->size); - if (size > 0x10000) - size = 0x10000; + /* Align size and make sure it's cropped to 64K */ + size = PAGE_ALIGN(range.size); + if (size > 0x10000) + size = 0x10000; - __ioremap_at(phb_io_base_phys, (void *)ISA_IO_BASE, - size, pgprot_noncached(PAGE_KERNEL)); - return; + if (!phb_io_base_phys) + phb_io_base_phys = range.cpu_addr; + + remap_isa_base(phb_io_base_phys, size); + return 0; + } inval_range: - printk(KERN_ERR "no ISA IO ranges or unexpected isa range, " - "mapping 64k\n"); - __ioremap_at(phb_io_base_phys, (void *)ISA_IO_BASE, - 0x10000, pgprot_noncached(PAGE_KERNEL)); + if (phb_io_base_phys) { + pr_err("no ISA IO ranges or unexpected isa range, mapping 64k\n"); + remap_isa_base(phb_io_base_phys, 0x10000); + return 0; + } + return -EINVAL; } @@ -155,7 +140,7 @@ void __init isa_bridge_find_early(struct pci_controller *hose) isa_bridge_devnode = np; /* Now parse the "ranges" property and setup the ISA mapping */ - pci_process_ISA_OF_ranges(np, hose->io_base_phys); + process_ISA_OF_ranges(np, hose->io_base_phys); /* Set the global ISA io base to indicate we have an ISA bridge */ isa_io_base = ISA_IO_BASE; @@ -171,75 +156,15 @@ void __init isa_bridge_find_early(struct pci_controller *hose) */ void __init isa_bridge_init_non_pci(struct device_node *np) { - const __be32 *ranges, *pbasep = NULL; - int rlen, i, rs; - u32 na, ns, pna; - u64 cbase, pbase, size = 0; + int ret; /* If we already have an ISA bridge, bail off */ if (isa_bridge_devnode != NULL) return; - pna = of_n_addr_cells(np); - if (of_property_read_u32(np, "#address-cells", &na) || - of_property_read_u32(np, "#size-cells", &ns)) { - pr_warn("ISA: Non-PCI bridge %pOF is missing address format\n", - np); - return; - } - - /* Check it's a supported address format */ - if (na != 2 || ns != 1) { - pr_warn("ISA: Non-PCI bridge %pOF has unsupported address format\n", - np); - return; - } - rs = na + ns + pna; - - /* Grab the ranges property */ - ranges = of_get_property(np, "ranges", &rlen); - if (ranges == NULL || rlen < rs) { - pr_warn("ISA: Non-PCI bridge %pOF has absent or invalid ranges\n", - np); + ret = process_ISA_OF_ranges(np, 0); + if (ret) return; - } - - /* Parse it. We are only looking for IO space */ - for (i = 0; (i + rs - 1) < rlen; i += rs) { - if (be32_to_cpup(ranges + i) != 1) - continue; - cbase = be32_to_cpup(ranges + i + 1); - size = of_read_number(ranges + i + na + pna, ns); - pbasep = ranges + i + na; - break; - } - - /* Got something ? */ - if (!size || !pbasep) { - pr_warn("ISA: Non-PCI bridge %pOF has no usable IO range\n", - np); - return; - } - - /* Align size and make sure it's cropped to 64K */ - size = PAGE_ALIGN(size); - if (size > 0x10000) - size = 0x10000; - - /* Map pbase */ - pbase = of_translate_address(np, pbasep); - if (pbase == OF_BAD_ADDR) { - pr_warn("ISA: Non-PCI bridge %pOF failed to translate IO base\n", - np); - return; - } - - /* We need page alignment */ - if ((cbase & ~PAGE_MASK) || (pbase & ~PAGE_MASK)) { - pr_warn("ISA: Non-PCI bridge %pOF has non aligned IO range\n", - np); - return; - } /* Got it */ isa_bridge_devnode = np; @@ -248,8 +173,6 @@ void __init isa_bridge_init_non_pci(struct device_node *np) * and map it */ isa_io_base = ISA_IO_BASE; - __ioremap_at(pbase, (void *)ISA_IO_BASE, - size, pgprot_noncached(PAGE_KERNEL)); pr_debug("ISA: Non-PCI bridge is %pOF\n", np); } @@ -268,7 +191,7 @@ static void isa_bridge_find_late(struct pci_dev *pdev, isa_bridge_pcidev = pdev; /* Now parse the "ranges" property and setup the ISA mapping */ - pci_process_ISA_OF_ranges(devnode, hose->io_base_phys); + process_ISA_OF_ranges(devnode, hose->io_base_phys); /* Set the global ISA io base to indicate we have an ISA bridge */ isa_io_base = ISA_IO_BASE; @@ -297,7 +220,7 @@ static void isa_bridge_remove(void) isa_bridge_pcidev = NULL; /* Unmap the ISA area */ - __iounmap_at((void *)ISA_IO_BASE, 0x10000); + vunmap_range(ISA_IO_BASE, ISA_IO_BASE + 0x10000); } /** diff --git a/arch/powerpc/kernel/jump_label.c b/arch/powerpc/kernel/jump_label.c index ca37702bde97..5277cf582c16 100644 --- a/arch/powerpc/kernel/jump_label.c +++ b/arch/powerpc/kernel/jump_label.c @@ -6,14 +6,15 @@ #include <linux/kernel.h> #include <linux/jump_label.h> #include <asm/code-patching.h> +#include <asm/inst.h> void arch_jump_label_transform(struct jump_entry *entry, enum jump_label_type type) { - u32 *addr = (u32 *)(unsigned long)entry->code; + u32 *addr = (u32 *)jump_entry_code(entry); if (type == JUMP_LABEL_JMP) - patch_branch(addr, entry->target, 0); + patch_branch(addr, jump_entry_target(entry), 0); else - patch_instruction(addr, PPC_INST_NOP); + patch_instruction(addr, ppc_inst(PPC_RAW_NOP())); } diff --git a/arch/powerpc/kernel/kdebugfs.c b/arch/powerpc/kernel/kdebugfs.c new file mode 100644 index 000000000000..36d3124d5a8b --- /dev/null +++ b/arch/powerpc/kernel/kdebugfs.c @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/debugfs.h> +#include <linux/export.h> +#include <linux/init.h> + +struct dentry *arch_debugfs_dir; +EXPORT_SYMBOL(arch_debugfs_dir); + +static int __init arch_kdebugfs_init(void) +{ + arch_debugfs_dir = debugfs_create_dir("powerpc", NULL); + return 0; +} +arch_initcall(arch_kdebugfs_init); diff --git a/arch/powerpc/kernel/kgdb.c b/arch/powerpc/kernel/kgdb.c index 7dd55eb1259d..ebe4d1645ca1 100644 --- a/arch/powerpc/kernel/kgdb.c +++ b/arch/powerpc/kernel/kgdb.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * PowerPC backend to the KGDB stub. * @@ -8,10 +9,6 @@ * PPC32 support restored by Vitaly Wool <vwool@ru.mvista.com> and * Sergei Shtylyov <sshtylyov@ru.mvista.com> * Copyright (C) 2007-2008 Wind River Systems, Inc. - * - * This file is licensed under the terms of the GNU General Public License - * version 2. This program as licensed "as is" without any warranty of any - * kind, whether express or implied. */ #include <linux/kernel.h> @@ -26,6 +23,7 @@ #include <asm/debug.h> #include <asm/code-patching.h> #include <linux/slab.h> +#include <asm/inst.h> /* * This table contains the mapping between PowerPC hardware trap types, and @@ -47,9 +45,9 @@ static struct hard_trap_info { 0x0800, 0x08 /* SIGFPE */ }, /* fp unavailable */ { 0x0900, 0x0e /* SIGALRM */ }, /* decrementer */ { 0x0c00, 0x14 /* SIGCHLD */ }, /* system call */ -#if defined(CONFIG_40x) || defined(CONFIG_BOOKE) +#ifdef CONFIG_BOOKE_OR_40x { 0x2002, 0x05 /* SIGTRAP */ }, /* debug */ -#if defined(CONFIG_FSL_BOOKE) +#if defined(CONFIG_PPC_85xx) { 0x2010, 0x08 /* SIGFPE */ }, /* spe unavailable */ { 0x2020, 0x08 /* SIGFPE */ }, /* spe unavailable */ { 0x2030, 0x08 /* SIGFPE */ }, /* spe fp data */ @@ -59,14 +57,14 @@ static struct hard_trap_info { 0x2900, 0x08 /* SIGFPE */ }, /* apu unavailable */ { 0x3100, 0x0e /* SIGALRM */ }, /* fixed interval timer */ { 0x3200, 0x02 /* SIGINT */ }, /* watchdog */ -#else /* ! CONFIG_FSL_BOOKE */ +#else /* ! CONFIG_PPC_85xx */ { 0x1000, 0x0e /* SIGALRM */ }, /* prog interval timer */ { 0x1010, 0x0e /* SIGALRM */ }, /* fixed interval timer */ { 0x1020, 0x02 /* SIGINT */ }, /* watchdog */ { 0x2010, 0x08 /* SIGFPE */ }, /* fp unavailable */ { 0x2020, 0x08 /* SIGFPE */ }, /* ap unavailable */ #endif -#else /* ! (defined(CONFIG_40x) || defined(CONFIG_BOOKE)) */ +#else /* !CONFIG_BOOKE_OR_40x */ { 0x0d00, 0x05 /* SIGTRAP */ }, /* single-step */ #if defined(CONFIG_PPC_8xx) { 0x1000, 0x04 /* SIGILL */ }, /* software emulation */ @@ -146,7 +144,7 @@ static int kgdb_handle_breakpoint(struct pt_regs *regs) return 0; if (*(u32 *)regs->nip == BREAK_INSTR) - regs->nip += BREAK_INSTR_SIZE; + regs_add_return_ip(regs, BREAK_INSTR_SIZE); return 1; } @@ -193,7 +191,7 @@ static int kgdb_break_match(struct pt_regs *regs) void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) { struct pt_regs *regs = (struct pt_regs *)(p->thread.ksp + - STACK_FRAME_OVERHEAD); + STACK_INT_FRAME_REGS); unsigned long *ptr = gdb_regs; int reg; @@ -210,7 +208,7 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) for (reg = 14; reg < 32; reg++) PACK64(ptr, regs->gpr[reg]); -#ifdef CONFIG_FSL_BOOKE +#ifdef CONFIG_PPC_85xx #ifdef CONFIG_SPE for (reg = 0; reg < 32; reg++) PACK64(ptr, p->thread.evr[reg]); @@ -236,7 +234,7 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) #define GDB_SIZEOF_REG sizeof(unsigned long) #define GDB_SIZEOF_REG_U32 sizeof(u32) -#ifdef CONFIG_FSL_BOOKE +#ifdef CONFIG_PPC_85xx #define GDB_SIZEOF_FLOAT_REG sizeof(unsigned long) #else #define GDB_SIZEOF_FLOAT_REG sizeof(u64) @@ -331,7 +329,7 @@ char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs) if (regno >= 32 && regno < 64) { /* FP registers 32 -> 63 */ -#if defined(CONFIG_FSL_BOOKE) && defined(CONFIG_SPE) +#if defined(CONFIG_PPC_85xx) && defined(CONFIG_SPE) if (current) memcpy(mem, ¤t->thread.evr[regno-32], dbg_reg_def[regno].size); @@ -357,7 +355,7 @@ int dbg_set_reg(int regno, void *mem, struct pt_regs *regs) if (regno >= 32 && regno < 64) { /* FP registers 32 -> 63 */ -#if defined(CONFIG_FSL_BOOKE) && defined(CONFIG_SPE) +#if defined(CONFIG_PPC_85xx) && defined(CONFIG_SPE) memcpy(¤t->thread.evr[regno-32], mem, dbg_reg_def[regno].size); #else @@ -371,11 +369,11 @@ int dbg_set_reg(int regno, void *mem, struct pt_regs *regs) void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long pc) { - regs->nip = pc; + regs_set_return_ip(regs, pc); } /* - * This function does PowerPC specific procesing for interfacing to gdb. + * This function does PowerPC specific processing for interfacing to gdb. */ int kgdb_arch_handle_exception(int vector, int signo, int err_code, char *remcom_in_buffer, char *remcom_out_buffer, @@ -393,7 +391,7 @@ int kgdb_arch_handle_exception(int vector, int signo, int err_code, case 'c': /* handle the optional parameter */ if (kgdb_hex2long(&ptr, &addr)) - linux_regs->nip = addr; + regs_set_return_ip(linux_regs, addr); atomic_set(&kgdb_cpu_doing_single_step, -1); /* set the trace bit if we're stepping */ @@ -401,9 +399,9 @@ int kgdb_arch_handle_exception(int vector, int signo, int err_code, #ifdef CONFIG_PPC_ADV_DEBUG_REGS mtspr(SPRN_DBCR0, mfspr(SPRN_DBCR0) | DBCR0_IC | DBCR0_IDM); - linux_regs->msr |= MSR_DE; + regs_set_return_msr(linux_regs, linux_regs->msr | MSR_DE); #else - linux_regs->msr |= MSR_SE; + regs_set_return_msr(linux_regs, linux_regs->msr | MSR_SE); #endif atomic_set(&kgdb_cpu_doing_single_step, raw_smp_processor_id()); @@ -416,19 +414,18 @@ int kgdb_arch_handle_exception(int vector, int signo, int err_code, int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) { + u32 instr, *addr = (u32 *)bpt->bpt_addr; int err; - unsigned int instr; - unsigned int *addr = (unsigned int *)bpt->bpt_addr; - err = probe_kernel_address(addr, instr); + err = get_kernel_nofault(instr, addr); if (err) return err; - err = patch_instruction(addr, BREAK_INSTR); + err = patch_instruction(addr, ppc_inst(BREAK_INSTR)); if (err) return -EFAULT; - *(unsigned int *)bpt->saved_instr = instr; + *(u32 *)bpt->saved_instr = instr; return 0; } @@ -437,9 +434,9 @@ int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt) { int err; unsigned int instr = *(unsigned int *)bpt->saved_instr; - unsigned int *addr = (unsigned int *)bpt->bpt_addr; + u32 *addr = (u32 *)bpt->bpt_addr; - err = patch_instruction(addr, instr); + err = patch_instruction(addr, ppc_inst(instr)); if (err) return -EFAULT; diff --git a/arch/powerpc/kernel/kprobes-ftrace.c b/arch/powerpc/kernel/kprobes-ftrace.c index 972cb28174b2..072ebe7f290b 100644 --- a/arch/powerpc/kernel/kprobes-ftrace.c +++ b/arch/powerpc/kernel/kprobes-ftrace.c @@ -14,14 +14,21 @@ /* Ftrace callback handler for kprobes */ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip, - struct ftrace_ops *ops, struct pt_regs *regs) + struct ftrace_ops *ops, struct ftrace_regs *fregs) { struct kprobe *p; struct kprobe_ctlblk *kcb; + struct pt_regs *regs; + int bit; + bit = ftrace_test_recursion_trylock(nip, parent_nip); + if (bit < 0) + return; + + regs = ftrace_get_regs(fregs); p = get_kprobe((kprobe_opcode_t *)nip); if (unlikely(!p) || kprobe_disabled(p)) - return; + goto out; kcb = get_kprobe_ctlblk(); if (kprobe_running()) { @@ -31,7 +38,7 @@ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip, * On powerpc, NIP is *before* this instruction for the * pre handler */ - regs->nip -= MCOUNT_INSN_SIZE; + regs_add_return_ip(regs, -MCOUNT_INSN_SIZE); __this_cpu_write(current_kprobe, p); kcb->kprobe_status = KPROBE_HIT_ACTIVE; @@ -40,7 +47,7 @@ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip, * Emulate singlestep (and also recover regs->nip) * as if there is a nop */ - regs->nip += MCOUNT_INSN_SIZE; + regs_add_return_ip(regs, MCOUNT_INSN_SIZE); if (unlikely(p->post_handler)) { kcb->kprobe_status = KPROBE_HIT_SSDONE; p->post_handler(p, regs, 0); @@ -52,6 +59,8 @@ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip, */ __this_cpu_write(current_kprobe, NULL); } +out: + ftrace_test_recursion_unlock(bit); } NOKPROBE_SYMBOL(kprobe_ftrace_handler); diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index 2d27ec4feee4..b20ee72e873a 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -19,10 +19,13 @@ #include <linux/extable.h> #include <linux/kdebug.h> #include <linux/slab.h> +#include <linux/moduleloader.h> +#include <linux/set_memory.h> #include <asm/code-patching.h> #include <asm/cacheflush.h> #include <asm/sstep.h> #include <asm/sections.h> +#include <asm/inst.h> #include <linux/uaccess.h> DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; @@ -42,7 +45,7 @@ kprobe_opcode_t *kprobe_lookup_name(const char *name, unsigned int offset) { kprobe_opcode_t *addr = NULL; -#ifdef PPC64_ELF_ABI_v2 +#ifdef CONFIG_PPC64_ELF_ABI_V2 /* PPC64 ABIv2 needs local entry point */ addr = (kprobe_opcode_t *)kallsyms_lookup_name(name); if (addr && !offset) { @@ -60,7 +63,7 @@ kprobe_opcode_t *kprobe_lookup_name(const char *name, unsigned int offset) #endif addr = (kprobe_opcode_t *)ppc_function_entry(addr); } -#elif defined(PPC64_ELF_ABI_v1) +#elif defined(CONFIG_PPC64_ELF_ABI_V1) /* * 64bit powerpc ABIv1 uses function descriptors: * - Check for the dot variant of the symbol first. @@ -102,16 +105,67 @@ kprobe_opcode_t *kprobe_lookup_name(const char *name, unsigned int offset) return addr; } +static bool arch_kprobe_on_func_entry(unsigned long offset) +{ +#ifdef CONFIG_PPC64_ELF_ABI_V2 +#ifdef CONFIG_KPROBES_ON_FTRACE + return offset <= 16; +#else + return offset <= 8; +#endif +#else + return !offset; +#endif +} + +/* XXX try and fold the magic of kprobe_lookup_name() in this */ +kprobe_opcode_t *arch_adjust_kprobe_addr(unsigned long addr, unsigned long offset, + bool *on_func_entry) +{ + *on_func_entry = arch_kprobe_on_func_entry(offset); + return (kprobe_opcode_t *)(addr + offset); +} + +void *alloc_insn_page(void) +{ + void *page; + + page = module_alloc(PAGE_SIZE); + if (!page) + return NULL; + + if (strict_module_rwx_enabled()) + set_memory_rox((unsigned long)page, 1); + + return page; +} + int arch_prepare_kprobe(struct kprobe *p) { int ret = 0; - kprobe_opcode_t insn = *p->addr; + struct kprobe *prev; + ppc_inst_t insn = ppc_inst_read(p->addr); if ((unsigned long)p->addr & 0x03) { printk("Attempt to register kprobe at an unaligned address\n"); ret = -EINVAL; - } else if (IS_MTMSRD(insn) || IS_RFID(insn) || IS_RFI(insn)) { - printk("Cannot register a kprobe on rfi/rfid or mtmsr[d]\n"); + } else if (!can_single_step(ppc_inst_val(insn))) { + printk("Cannot register a kprobe on instructions that can't be single stepped\n"); + ret = -EINVAL; + } else if ((unsigned long)p->addr & ~PAGE_MASK && + ppc_inst_prefixed(ppc_inst_read(p->addr - 1))) { + printk("Cannot register a kprobe on the second word of prefixed instruction\n"); + ret = -EINVAL; + } + prev = get_kprobe(p->addr - 1); + + /* + * When prev is a ftrace-based kprobe, we don't have an insn, and it + * doesn't probe for prefixed instruction. + */ + if (prev && !kprobe_ftrace(prev) && + ppc_inst_prefixed(ppc_inst_read(prev->ainsn.insn))) { + printk("Cannot register a kprobe on the second word of prefixed instruction\n"); ret = -EINVAL; } @@ -124,11 +178,8 @@ int arch_prepare_kprobe(struct kprobe *p) } if (!ret) { - memcpy(p->ainsn.insn, p->addr, - MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); - p->opcode = *p->addr; - flush_icache_range((unsigned long)p->ainsn.insn, - (unsigned long)p->ainsn.insn + sizeof(kprobe_opcode_t)); + patch_instruction(p->ainsn.insn, insn); + p->opcode = ppc_inst_val(insn); } p->ainsn.boostable = 0; @@ -138,13 +189,13 @@ NOKPROBE_SYMBOL(arch_prepare_kprobe); void arch_arm_kprobe(struct kprobe *p) { - patch_instruction(p->addr, BREAKPOINT_INSTRUCTION); + WARN_ON_ONCE(patch_instruction(p->addr, ppc_inst(BREAKPOINT_INSTRUCTION))); } NOKPROBE_SYMBOL(arch_arm_kprobe); void arch_disarm_kprobe(struct kprobe *p) { - patch_instruction(p->addr, p->opcode); + WARN_ON_ONCE(patch_instruction(p->addr, ppc_inst(p->opcode))); } NOKPROBE_SYMBOL(arch_disarm_kprobe); @@ -167,7 +218,7 @@ static nokprobe_inline void prepare_singlestep(struct kprobe *p, struct pt_regs * variant as values in regs could play a part in * if the trap is taken or not */ - regs->nip = (unsigned long)p->ainsn.insn; + regs_set_return_ip(regs, (unsigned long)p->ainsn.insn); } static nokprobe_inline void save_previous_kprobe(struct kprobe_ctlblk *kcb) @@ -191,32 +242,20 @@ static nokprobe_inline void set_current_kprobe(struct kprobe *p, struct pt_regs kcb->kprobe_saved_msr = regs->msr; } -bool arch_kprobe_on_func_entry(unsigned long offset) -{ -#ifdef PPC64_ELF_ABI_v2 -#ifdef CONFIG_KPROBES_ON_FTRACE - return offset <= 16; -#else - return offset <= 8; -#endif -#else - return !offset; -#endif -} - void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) { ri->ret_addr = (kprobe_opcode_t *)regs->link; + ri->fp = NULL; /* Replace the return addr with trampoline addr */ - regs->link = (unsigned long)kretprobe_trampoline; + regs->link = (unsigned long)__kretprobe_trampoline; } NOKPROBE_SYMBOL(arch_prepare_kretprobe); static int try_to_emulate(struct kprobe *p, struct pt_regs *regs) { int ret; - unsigned int insn = *p->ainsn.insn; + ppc_inst_t insn = ppc_inst_read(p->ainsn.insn); /* regs->nip is also adjusted if emulate_step returns 1 */ ret = emulate_step(regs, insn); @@ -233,7 +272,7 @@ static int try_to_emulate(struct kprobe *p, struct pt_regs *regs) * So, we should never get here... but, its still * good to catch them, just in case... */ - printk("Can't step on instruction %x\n", insn); + printk("Can't step on instruction %08lx\n", ppc_inst_as_ulong(insn)); BUG(); } else { /* @@ -264,6 +303,10 @@ int kprobe_handler(struct pt_regs *regs) if (user_mode(regs)) return 0; + if (!IS_ENABLED(CONFIG_BOOKE) && + (!(regs->msr & MSR_IR) || !(regs->msr & MSR_DR))) + return 0; + /* * We don't want to be preempted for the entire * duration of kprobe processing @@ -271,64 +314,20 @@ int kprobe_handler(struct pt_regs *regs) preempt_disable(); kcb = get_kprobe_ctlblk(); - /* Check we're not actually recursing */ - if (kprobe_running()) { - p = get_kprobe(addr); - if (p) { - kprobe_opcode_t insn = *p->ainsn.insn; - if (kcb->kprobe_status == KPROBE_HIT_SS && - is_trap(insn)) { - /* Turn off 'trace' bits */ - regs->msr &= ~MSR_SINGLESTEP; - regs->msr |= kcb->kprobe_saved_msr; - goto no_kprobe; - } - /* We have reentered the kprobe_handler(), since - * another probe was hit while within the handler. - * We here save the original kprobes variables and - * just single step on the instruction of the new probe - * without calling any user handlers. - */ - save_previous_kprobe(kcb); - set_current_kprobe(p, regs, kcb); - kprobes_inc_nmissed_count(p); - kcb->kprobe_status = KPROBE_REENTER; - if (p->ainsn.boostable >= 0) { - ret = try_to_emulate(p, regs); - - if (ret > 0) { - restore_previous_kprobe(kcb); - preempt_enable_no_resched(); - return 1; - } - } - prepare_singlestep(p, regs); - return 1; - } else if (*addr != BREAKPOINT_INSTRUCTION) { - /* If trap variant, then it belongs not to us */ - kprobe_opcode_t cur_insn = *addr; - - if (is_trap(cur_insn)) - goto no_kprobe; - /* The breakpoint instruction was removed by - * another cpu right after we hit, no further - * handling of this interrupt is appropriate - */ - ret = 1; - } - goto no_kprobe; - } - p = get_kprobe(addr); if (!p) { - if (*addr != BREAKPOINT_INSTRUCTION) { + unsigned int instr; + + if (get_kernel_nofault(instr, addr)) + goto no_kprobe; + + if (instr != BREAKPOINT_INSTRUCTION) { /* * PowerPC has multiple variants of the "trap" * instruction. If the current instruction is a * trap variant, it could belong to someone else */ - kprobe_opcode_t cur_insn = *addr; - if (is_trap(cur_insn)) + if (is_trap(instr)) goto no_kprobe; /* * The breakpoint instruction was removed right @@ -343,12 +342,46 @@ int kprobe_handler(struct pt_regs *regs) goto no_kprobe; } + /* Check we're not actually recursing */ + if (kprobe_running()) { + kprobe_opcode_t insn = *p->ainsn.insn; + if (kcb->kprobe_status == KPROBE_HIT_SS && is_trap(insn)) { + /* Turn off 'trace' bits */ + regs_set_return_msr(regs, + (regs->msr & ~MSR_SINGLESTEP) | + kcb->kprobe_saved_msr); + goto no_kprobe; + } + + /* + * We have reentered the kprobe_handler(), since another probe + * was hit while within the handler. We here save the original + * kprobes variables and just single step on the instruction of + * the new probe without calling any user handlers. + */ + save_previous_kprobe(kcb); + set_current_kprobe(p, regs, kcb); + kprobes_inc_nmissed_count(p); + kcb->kprobe_status = KPROBE_REENTER; + if (p->ainsn.boostable >= 0) { + ret = try_to_emulate(p, regs); + + if (ret > 0) { + restore_previous_kprobe(kcb); + preempt_enable(); + return 1; + } + } + prepare_singlestep(p, regs); + return 1; + } + kcb->kprobe_status = KPROBE_HIT_ACTIVE; set_current_kprobe(p, regs, kcb); if (p->pre_handler && p->pre_handler(p, regs)) { /* handler changed execution path, so skip ss setup */ reset_current_kprobe(); - preempt_enable_no_resched(); + preempt_enable(); return 1; } @@ -361,7 +394,7 @@ int kprobe_handler(struct pt_regs *regs) kcb->kprobe_status = KPROBE_HIT_SSDONE; reset_current_kprobe(); - preempt_enable_no_resched(); + preempt_enable(); return 1; } } @@ -370,7 +403,7 @@ int kprobe_handler(struct pt_regs *regs) return 1; no_kprobe: - preempt_enable_no_resched(); + preempt_enable(); return ret; } NOKPROBE_SYMBOL(kprobe_handler); @@ -381,62 +414,21 @@ NOKPROBE_SYMBOL(kprobe_handler); * - When the probed function returns, this probe * causes the handlers to fire */ -asm(".global kretprobe_trampoline\n" - ".type kretprobe_trampoline, @function\n" - "kretprobe_trampoline:\n" +asm(".global __kretprobe_trampoline\n" + ".type __kretprobe_trampoline, @function\n" + "__kretprobe_trampoline:\n" "nop\n" "blr\n" - ".size kretprobe_trampoline, .-kretprobe_trampoline\n"); + ".size __kretprobe_trampoline, .-__kretprobe_trampoline\n"); /* * Called when the probe at kretprobe trampoline is hit */ static int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) { - struct kretprobe_instance *ri = NULL; - struct hlist_head *head, empty_rp; - struct hlist_node *tmp; - unsigned long flags, orig_ret_address = 0; - unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline; - - INIT_HLIST_HEAD(&empty_rp); - kretprobe_hash_lock(current, &head, &flags); - - /* - * It is possible to have multiple instances associated with a given - * task either because an multiple functions in the call path - * have a return probe installed on them, and/or more than one return - * return probe was registered for a target function. - * - * We can handle this because: - * - instances are always inserted at the head of the list - * - when multiple return probes are registered for the same - * function, the first instance's ret_addr will point to the - * real return address, and all the rest will point to - * kretprobe_trampoline - */ - hlist_for_each_entry_safe(ri, tmp, head, hlist) { - if (ri->task != current) - /* another task is sharing our hash bucket */ - continue; - - if (ri->rp && ri->rp->handler) - ri->rp->handler(ri, regs); - - orig_ret_address = (unsigned long)ri->ret_addr; - recycle_rp_inst(ri, &empty_rp); - - if (orig_ret_address != trampoline_address) - /* - * This is the real return address. Any other - * instances associated with this task are for - * other calls deeper on the call stack - */ - break; - } - - kretprobe_assert(ri, orig_ret_address, trampoline_address); + unsigned long orig_ret_address; + orig_ret_address = __kretprobe_trampoline_handler(regs, NULL); /* * We get here through one of two paths: * 1. by taking a trap -> kprobe_handler() -> here @@ -446,22 +438,15 @@ static int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) * as it is used to determine the return address from the trap. * For (2), since nip is not honoured with optprobes, we instead setup * the link register properly so that the subsequent 'blr' in - * kretprobe_trampoline jumps back to the right instruction. + * __kretprobe_trampoline jumps back to the right instruction. * * For nip, we should set the address to the previous instruction since * we end up emulating it in kprobe_handler(), which increments the nip * again. */ - regs->nip = orig_ret_address - 4; + regs_set_return_ip(regs, orig_ret_address - 4); regs->link = orig_ret_address; - kretprobe_hash_unlock(current, &flags); - - hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) { - hlist_del(&ri->hlist); - kfree(ri); - } - return 0; } NOKPROBE_SYMBOL(trampoline_probe_handler); @@ -476,14 +461,16 @@ NOKPROBE_SYMBOL(trampoline_probe_handler); */ int kprobe_post_handler(struct pt_regs *regs) { + int len; struct kprobe *cur = kprobe_running(); struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); if (!cur || user_mode(regs)) return 0; + len = ppc_inst_len(ppc_inst_read(cur->ainsn.insn)); /* make sure we got here for instruction we have a kprobe on */ - if (((unsigned long)cur->ainsn.insn + 4) != regs->nip) + if (((unsigned long)cur->ainsn.insn + len) != regs->nip) return 0; if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) { @@ -492,8 +479,8 @@ int kprobe_post_handler(struct pt_regs *regs) } /* Adjust nip to after the single-stepped instruction */ - regs->nip = (unsigned long)cur->addr + 4; - regs->msr |= kcb->kprobe_saved_msr; + regs_set_return_ip(regs, (unsigned long)cur->addr + len); + regs_set_return_msr(regs, regs->msr | kcb->kprobe_saved_msr); /*Restore back the original saved kprobes variables and continue. */ if (kcb->kprobe_status == KPROBE_REENTER) { @@ -502,7 +489,7 @@ int kprobe_post_handler(struct pt_regs *regs) } reset_current_kprobe(); out: - preempt_enable_no_resched(); + preempt_enable(); /* * if somebody else is singlestepping across a probe point, msr @@ -532,40 +519,25 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr) * and allow the page fault handler to continue as a * normal page fault. */ - regs->nip = (unsigned long)cur->addr; - regs->msr &= ~MSR_SINGLESTEP; /* Turn off 'trace' bits */ - regs->msr |= kcb->kprobe_saved_msr; + regs_set_return_ip(regs, (unsigned long)cur->addr); + /* Turn off 'trace' bits */ + regs_set_return_msr(regs, + (regs->msr & ~MSR_SINGLESTEP) | + kcb->kprobe_saved_msr); if (kcb->kprobe_status == KPROBE_REENTER) restore_previous_kprobe(kcb); else reset_current_kprobe(); - preempt_enable_no_resched(); + preempt_enable(); break; case KPROBE_HIT_ACTIVE: case KPROBE_HIT_SSDONE: /* - * We increment the nmissed count for accounting, - * we can also use npre/npostfault count for accounting - * these specific fault cases. - */ - kprobes_inc_nmissed_count(cur); - - /* - * We come here because instructions in the pre/post - * handler caused the page_fault, this could happen - * if handler tries to access user space by - * copy_from_user(), get_user() etc. Let the - * user-specified handler try to fix it first. - */ - if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr)) - return 1; - - /* * In case the user-specified fault handler returned * zero, try to fix up. */ if ((entry = search_exception_tables(regs->nip)) != NULL) { - regs->nip = extable_fixup(entry); + regs_set_return_ip(regs, extable_fixup(entry)); return 1; } @@ -581,19 +553,8 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr) } NOKPROBE_SYMBOL(kprobe_fault_handler); -unsigned long arch_deref_entry_point(void *entry) -{ -#ifdef PPC64_ELF_ABI_v1 - if (!kernel_text_address((unsigned long)entry)) - return ppc_global_function_entry(entry); - else -#endif - return (unsigned long)entry; -} -NOKPROBE_SYMBOL(arch_deref_entry_point); - static struct kprobe trampoline_p = { - .addr = (kprobe_opcode_t *) &kretprobe_trampoline, + .addr = (kprobe_opcode_t *) &__kretprobe_trampoline, .pre_handler = trampoline_probe_handler }; @@ -604,7 +565,7 @@ int __init arch_init_kprobes(void) int arch_trampoline_kprobe(struct kprobe *p) { - if (p->addr == (kprobe_opcode_t *)&kretprobe_trampoline) + if (p->addr == (kprobe_opcode_t *)&__kretprobe_trampoline) return 1; return 0; diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c index 617eba82531c..5b3c093611ba 100644 --- a/arch/powerpc/kernel/kvm.c +++ b/arch/powerpc/kernel/kvm.c @@ -455,7 +455,7 @@ static void __init kvm_check_ins(u32 *inst, u32 features) kvm_patch_ins_lwz(inst, magic_var(dsisr), inst_rt); break; -#ifdef CONFIG_PPC_BOOK3E_MMU +#ifdef CONFIG_PPC_E500 case KVM_INST_MFSPR(SPRN_MAS0): if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) kvm_patch_ins_lwz(inst, magic_var(mas0), inst_rt); @@ -484,7 +484,7 @@ static void __init kvm_check_ins(u32 *inst, u32 features) if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) kvm_patch_ins_lwz(inst, magic_var(mas7_3), inst_rt); break; -#endif /* CONFIG_PPC_BOOK3E_MMU */ +#endif /* CONFIG_PPC_E500 */ case KVM_INST_MFSPR(SPRN_SPRG4): #ifdef CONFIG_BOOKE @@ -557,7 +557,7 @@ static void __init kvm_check_ins(u32 *inst, u32 features) case KVM_INST_MTSPR(SPRN_DSISR): kvm_patch_ins_stw(inst, magic_var(dsisr), inst_rt); break; -#ifdef CONFIG_PPC_BOOK3E_MMU +#ifdef CONFIG_PPC_E500 case KVM_INST_MTSPR(SPRN_MAS0): if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) kvm_patch_ins_stw(inst, magic_var(mas0), inst_rt); @@ -586,7 +586,7 @@ static void __init kvm_check_ins(u32 *inst, u32 features) if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) kvm_patch_ins_stw(inst, magic_var(mas7_3), inst_rt); break; -#endif /* CONFIG_PPC_BOOK3E_MMU */ +#endif /* CONFIG_PPC_E500 */ case KVM_INST_MTSPR(SPRN_SPRG4): if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) @@ -669,7 +669,8 @@ static void __init kvm_use_magic_page(void) on_each_cpu(kvm_map_magic_page, &features, 1); /* Quick self-test to see if the mapping works */ - if (!fault_in_pages_readable((const char *)KVM_MAGIC_PAGE, sizeof(u32))) { + if (fault_in_readable((const char __user *)KVM_MAGIC_PAGE, + sizeof(u32))) { kvm_patching_worked = false; return; } diff --git a/arch/powerpc/kernel/l2cr_6xx.S b/arch/powerpc/kernel/l2cr_6xx.S index 2020d255585f..f2e03ed423d0 100644 --- a/arch/powerpc/kernel/l2cr_6xx.S +++ b/arch/powerpc/kernel/l2cr_6xx.S @@ -96,7 +96,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_L2CR) /* Stop DST streams */ BEGIN_FTR_SECTION - DSSALL + PPC_DSSALL sync END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) @@ -256,7 +256,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_SPEC7450) sync /* Restore MSR (restores EE and DR bits to original state) */ - SYNC mtmsr r7 isync @@ -293,7 +292,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_L3CR) isync /* Stop DST streams */ - DSSALL + PPC_DSSALL sync /* Get the current enable bit of the L3CR into r4 */ @@ -377,7 +376,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_L3CR) 1: bdnz 1b /* Restore MSR (restores EE and DR bits to original state) */ -4: SYNC +4: mtmsr r7 isync blr @@ -402,7 +401,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_L3CR) _GLOBAL(__flush_disable_L1) /* Stop pending alitvec streams and memory accesses */ BEGIN_FTR_SECTION - DSSALL + PPC_DSSALL END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) sync @@ -455,5 +454,6 @@ _GLOBAL(__inval_enable_L1) sync blr +_ASM_NOKPROBE_SYMBOL(__inval_enable_L1) diff --git a/arch/powerpc/kernel/legacy_serial.c b/arch/powerpc/kernel/legacy_serial.c index 7cea5978f21f..1da2f6e7d2a1 100644 --- a/arch/powerpc/kernel/legacy_serial.c +++ b/arch/powerpc/kernel/legacy_serial.c @@ -5,16 +5,17 @@ #include <linux/serial_core.h> #include <linux/console.h> #include <linux/pci.h> +#include <linux/of.h> #include <linux/of_address.h> -#include <linux/of_device.h> +#include <linux/of_irq.h> #include <linux/serial_reg.h> #include <asm/io.h> #include <asm/mmu.h> -#include <asm/prom.h> #include <asm/serial.h> #include <asm/udbg.h> #include <asm/pci-bridge.h> #include <asm/ppc-pci.h> +#include <asm/early_ioremap.h> #undef DEBUG @@ -34,6 +35,7 @@ static struct legacy_serial_info { unsigned int clock; int irq_check_parent; phys_addr_t taddr; + void __iomem *early_addr; } legacy_serial_infos[MAX_LEGACY_SERIAL_PORTS]; static const struct of_device_id legacy_serial_parents[] __initconst = { @@ -169,15 +171,15 @@ static int __init add_legacy_soc_port(struct device_node *np, /* We only support ports that have a clock frequency properly * encoded in the device-tree. */ - if (of_get_property(np, "clock-frequency", NULL) == NULL) + if (!of_property_present(np, "clock-frequency")) return -1; /* if reg-offset don't try to use it */ - if ((of_get_property(np, "reg-offset", NULL) != NULL)) + if (of_property_present(np, "reg-offset")) return -1; /* if rtas uses this device, don't try to use it as well */ - if (of_get_property(np, "used-by-rtas", NULL) != NULL) + if (of_property_read_bool(np, "used-by-rtas")) return -1; /* Get the address */ @@ -235,7 +237,7 @@ static int __init add_legacy_isa_port(struct device_node *np, * Note: Don't even try on P8 lpc, we know it's not directly mapped */ if (!of_device_is_compatible(isa_brg, "ibm,power8-lpc") || - of_get_property(isa_brg, "ranges", NULL)) { + of_property_present(isa_brg, "ranges")) { taddr = of_translate_address(np, reg); if (taddr == OF_BAD_ADDR) taddr = 0; @@ -266,7 +268,7 @@ static int __init add_legacy_pci_port(struct device_node *np, * compatible UARTs on PCI need all sort of quirks (port offsets * etc...) that this code doesn't know about */ - if (of_get_property(np, "clock-frequency", NULL) == NULL) + if (!of_property_present(np, "clock-frequency")) return -1; /* Get the PCI address. Assume BAR 0 */ @@ -325,17 +327,16 @@ static void __init setup_legacy_serial_console(int console) { struct legacy_serial_info *info = &legacy_serial_infos[console]; struct plat_serial8250_port *port = &legacy_serial_ports[console]; - void __iomem *addr; unsigned int stride; stride = 1 << port->regshift; /* Check if a translated MMIO address has been found */ if (info->taddr) { - addr = ioremap(info->taddr, 0x1000); - if (addr == NULL) + info->early_addr = early_ioremap(info->taddr, 0x1000); + if (info->early_addr == NULL) return; - udbg_uart_init_mmio(addr, stride); + udbg_uart_init_mmio(info->early_addr, stride); } else { /* Check if it's PIO and we support untranslated PIO */ if (port->iotype == UPIO_PORT && isa_io_special) @@ -353,6 +354,33 @@ static void __init setup_legacy_serial_console(int console) udbg_uart_setup(info->speed, info->clock); } +static int __init ioremap_legacy_serial_console(void) +{ + struct plat_serial8250_port *port; + struct legacy_serial_info *info; + void __iomem *vaddr; + + if (legacy_serial_console < 0) + return 0; + + info = &legacy_serial_infos[legacy_serial_console]; + port = &legacy_serial_ports[legacy_serial_console]; + + if (!info->early_addr) + return 0; + + vaddr = ioremap(info->taddr, 0x1000); + if (WARN_ON(!vaddr)) + return -ENOMEM; + + udbg_uart_init_mmio(vaddr, 1 << port->regshift); + early_iounmap(info->early_addr, 0x1000); + info->early_addr = NULL; + + return 0; +} +early_initcall(ioremap_legacy_serial_console); + /* * This is called very early, as part of setup_system() or eventually * setup_arch(), basically before anything else in this file. This function @@ -443,6 +471,8 @@ void __init find_legacy_serial_ports(void) } #endif + of_node_put(stdout); + DBG("legacy_serial_console = %d\n", legacy_serial_console); if (legacy_serial_console >= 0) setup_legacy_serial_console(legacy_serial_console); @@ -478,10 +508,16 @@ static void __init fixup_port_irq(int index, port->irq = virq; -#ifdef CONFIG_SERIAL_8250_FSL - if (of_device_is_compatible(np, "fsl,ns16550")) - port->handle_irq = fsl8250_handle_irq; -#endif + if (IS_ENABLED(CONFIG_SERIAL_8250) && + of_device_is_compatible(np, "fsl,ns16550")) { + if (IS_REACHABLE(CONFIG_SERIAL_8250_FSL)) { + port->handle_irq = fsl8250_handle_irq; + port->has_sysrq = IS_ENABLED(CONFIG_SERIAL_8250_CONSOLE); + } else { + pr_warn_once("Not activating Freescale specific workaround for device %pOFP\n", + np); + } + } } static void __init fixup_port_pio(int index, diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 34c1001e9e8b..219f28637a3e 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -15,37 +15,36 @@ #include <linux/percpu.h> #include <linux/export.h> #include <linux/irq_work.h> +#include <linux/extable.h> +#include <linux/ftrace.h> +#include <linux/memblock.h> +#include <linux/of.h> +#include <asm/interrupt.h> #include <asm/machdep.h> #include <asm/mce.h> #include <asm/nmi.h> -static DEFINE_PER_CPU(int, mce_nest_count); -static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event); +#include "setup.h" -/* Queue for delayed MCE events. */ -static DEFINE_PER_CPU(int, mce_queue_count); -static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event_queue); - -/* Queue for delayed MCE UE events. */ -static DEFINE_PER_CPU(int, mce_ue_count); -static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], - mce_ue_event_queue); - -static void machine_check_process_queued_event(struct irq_work *work); -static void machine_check_ue_irq_work(struct irq_work *work); static void machine_check_ue_event(struct machine_check_event *evt); static void machine_process_ue_event(struct work_struct *work); -static struct irq_work mce_event_process_work = { - .func = machine_check_process_queued_event, -}; +static DECLARE_WORK(mce_ue_event_work, machine_process_ue_event); -static struct irq_work mce_ue_event_irq_work = { - .func = machine_check_ue_irq_work, -}; +static BLOCKING_NOTIFIER_HEAD(mce_notifier_list); -DECLARE_WORK(mce_ue_event_work, machine_process_ue_event); +int mce_register_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&mce_notifier_list, nb); +} +EXPORT_SYMBOL_GPL(mce_register_notifier); + +int mce_unregister_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&mce_notifier_list, nb); +} +EXPORT_SYMBOL_GPL(mce_unregister_notifier); static void mce_set_error_info(struct machine_check_event *mce, struct mce_error_info *mce_err) @@ -79,6 +78,13 @@ static void mce_set_error_info(struct machine_check_event *mce, } } +void mce_irq_work_queue(void) +{ + /* Raise decrementer interrupt */ + arch_irq_work_raise(); + set_mce_pending_irq_work(); +} + /* * Decode and save high level MCE information into per cpu buffer which * is an array of machine_check_event structure. @@ -87,9 +93,10 @@ void save_mce_event(struct pt_regs *regs, long handled, struct mce_error_info *mce_err, uint64_t nip, uint64_t addr, uint64_t phys_addr) { - int index = __this_cpu_inc_return(mce_nest_count) - 1; - struct machine_check_event *mce = this_cpu_ptr(&mce_event[index]); + int index = local_paca->mce_info->mce_nest_count++; + struct machine_check_event *mce; + mce = &local_paca->mce_info->mce_event[index]; /* * Return if we don't have enough space to log mce event. * mce_nest_count may go beyond MAX_MC_EVT but that's ok, @@ -121,6 +128,15 @@ void save_mce_event(struct pt_regs *regs, long handled, * Populate the mce error_type and type-specific error_type. */ mce_set_error_info(mce, mce_err); + if (mce->error_type == MCE_ERROR_TYPE_UE) + mce->u.ue_error.ignore_event = mce_err->ignore_event; + + /* + * Raise irq work, So that we don't miss to log the error for + * unrecoverable errors. + */ + if (mce->disposition == MCE_DISPOSITION_NOT_RECOVERED) + mce_irq_work_queue(); if (!addr) return; @@ -149,7 +165,6 @@ void save_mce_event(struct pt_regs *regs, long handled, if (phys_addr != ULONG_MAX) { mce->u.ue_error.physical_address_provided = true; mce->u.ue_error.physical_address = phys_addr; - mce->u.ue_error.ignore_event = mce_err->ignore_event; machine_check_ue_event(mce); } } @@ -175,7 +190,7 @@ void save_mce_event(struct pt_regs *regs, long handled, */ int get_mce_event(struct machine_check_event *mce, bool release) { - int index = __this_cpu_read(mce_nest_count) - 1; + int index = local_paca->mce_info->mce_nest_count - 1; struct machine_check_event *mc_evt; int ret = 0; @@ -185,7 +200,7 @@ int get_mce_event(struct machine_check_event *mce, bool release) /* Check if we have MCE info to process. */ if (index < MAX_MC_EVT) { - mc_evt = this_cpu_ptr(&mce_event[index]); + mc_evt = &local_paca->mce_info->mce_event[index]; /* Copy the event structure and release the original */ if (mce) *mce = *mc_evt; @@ -195,7 +210,7 @@ int get_mce_event(struct machine_check_event *mce, bool release) } /* Decrement the count to free the slot. */ if (release) - __this_cpu_dec(mce_nest_count); + local_paca->mce_info->mce_nest_count--; return ret; } @@ -205,7 +220,7 @@ void release_mce_event(void) get_mce_event(NULL, true); } -static void machine_check_ue_irq_work(struct irq_work *work) +static void machine_check_ue_work(void) { schedule_work(&mce_ue_event_work); } @@ -217,16 +232,14 @@ static void machine_check_ue_event(struct machine_check_event *evt) { int index; - index = __this_cpu_inc_return(mce_ue_count) - 1; + index = local_paca->mce_info->mce_ue_count++; /* If queue is full, just return for now. */ if (index >= MAX_MC_EVT) { - __this_cpu_dec(mce_ue_count); + local_paca->mce_info->mce_ue_count--; return; } - memcpy(this_cpu_ptr(&mce_ue_event_queue[index]), evt, sizeof(*evt)); - - /* Queue work to process this event later. */ - irq_work_queue(&mce_ue_event_irq_work); + memcpy(&local_paca->mce_info->mce_ue_event_queue[index], + evt, sizeof(*evt)); } /* @@ -240,17 +253,30 @@ void machine_check_queue_event(void) if (!get_mce_event(&evt, MCE_EVENT_RELEASE)) return; - index = __this_cpu_inc_return(mce_queue_count) - 1; + index = local_paca->mce_info->mce_queue_count++; /* If queue is full, just return for now. */ if (index >= MAX_MC_EVT) { - __this_cpu_dec(mce_queue_count); + local_paca->mce_info->mce_queue_count--; return; } - memcpy(this_cpu_ptr(&mce_event_queue[index]), &evt, sizeof(evt)); + memcpy(&local_paca->mce_info->mce_event_queue[index], + &evt, sizeof(evt)); + + mce_irq_work_queue(); +} + +void mce_common_process_ue(struct pt_regs *regs, + struct mce_error_info *mce_err) +{ + const struct exception_table_entry *entry; - /* Queue irq work to process this event later. */ - irq_work_queue(&mce_event_process_work); + entry = search_kernel_exception_table(regs->nip); + if (entry) { + mce_err->ignore_event = true; + regs_set_return_ip(regs, extable_fixup(entry)); + } } + /* * process pending MCE event from the mce event queue. This function will be * called during syscall exit. @@ -260,9 +286,10 @@ static void machine_process_ue_event(struct work_struct *work) int index; struct machine_check_event *evt; - while (__this_cpu_read(mce_ue_count) > 0) { - index = __this_cpu_read(mce_ue_count) - 1; - evt = this_cpu_ptr(&mce_ue_event_queue[index]); + while (local_paca->mce_info->mce_ue_count > 0) { + index = local_paca->mce_info->mce_ue_count - 1; + evt = &local_paca->mce_info->mce_ue_event_queue[index]; + blocking_notifier_call_chain(&mce_notifier_list, 0, evt); #ifdef CONFIG_MEMORY_FAILURE /* * This should probably queued elsewhere, but @@ -274,7 +301,7 @@ static void machine_process_ue_event(struct work_struct *work) */ if (evt->error_type == MCE_ERROR_TYPE_UE) { if (evt->u.ue_error.ignore_event) { - __this_cpu_dec(mce_ue_count); + local_paca->mce_info->mce_ue_count--; continue; } @@ -290,14 +317,14 @@ static void machine_process_ue_event(struct work_struct *work) "was generated\n"); } #endif - __this_cpu_dec(mce_ue_count); + local_paca->mce_info->mce_ue_count--; } } /* * process pending MCE event from the mce event queue. This function will be * called during syscall exit. */ -static void machine_check_process_queued_event(struct irq_work *work) +static void machine_check_process_queued_event(void) { int index; struct machine_check_event *evt; @@ -308,17 +335,38 @@ static void machine_check_process_queued_event(struct irq_work *work) * For now just print it to console. * TODO: log this error event to FSP or nvram. */ - while (__this_cpu_read(mce_queue_count) > 0) { - index = __this_cpu_read(mce_queue_count) - 1; - evt = this_cpu_ptr(&mce_event_queue[index]); + while (local_paca->mce_info->mce_queue_count > 0) { + index = local_paca->mce_info->mce_queue_count - 1; + evt = &local_paca->mce_info->mce_event_queue[index]; if (evt->error_type == MCE_ERROR_TYPE_UE && evt->u.ue_error.ignore_event) { - __this_cpu_dec(mce_queue_count); + local_paca->mce_info->mce_queue_count--; continue; } machine_check_print_event_info(evt, false, false); - __this_cpu_dec(mce_queue_count); + local_paca->mce_info->mce_queue_count--; + } +} + +void set_mce_pending_irq_work(void) +{ + local_paca->mce_pending_irq_work = 1; +} + +void clear_mce_pending_irq_work(void) +{ + local_paca->mce_pending_irq_work = 0; +} + +void mce_run_irq_context_handlers(void) +{ + if (unlikely(local_paca->mce_pending_irq_work)) { + if (ppc_md.machine_check_log_err) + ppc_md.machine_check_log_err(); + machine_check_process_queued_event(); + machine_check_ue_work(); + clear_mce_pending_irq_work(); } } @@ -355,18 +403,19 @@ void machine_check_print_event_info(struct machine_check_event *evt, static const char *mc_user_types[] = { "Indeterminate", "tlbie(l) invalid", + "scv invalid", }; static const char *mc_ra_types[] = { "Indeterminate", "Instruction fetch (bad)", - "Instruction fetch (foreign)", + "Instruction fetch (foreign/control memory)", "Page table walk ifetch (bad)", - "Page table walk ifetch (foreign)", + "Page table walk ifetch (foreign/control memory)", "Load (bad)", "Store (bad)", "Page table walk Load/Store (bad)", - "Page table walk Load/Store (foreign)", - "Load/Store (foreign)", + "Page table walk Load/Store (foreign/control memory)", + "Load/Store (foreign/control memory)", }; static const char *mc_link_types[] = { "Indeterminate", @@ -524,7 +573,7 @@ void machine_check_print_event_info(struct machine_check_event *evt, } printk("%sMCE: CPU%d: machine check (%s) %s %s %s %s[%s]\n", - level, evt->cpu, sevstr, in_guest ? "Guest" : "Host", + level, evt->cpu, sevstr, in_guest ? "Guest" : "", err_type, subtype, dar_str, evt->disposition == MCE_DISPOSITION_RECOVERED ? "Recovered" : "Not recovered"); @@ -544,9 +593,9 @@ void machine_check_print_event_info(struct machine_check_event *evt, mc_error_class[evt->error_class] : "Unknown"; printk("%sMCE: CPU%d: %s\n", level, evt->cpu, subtype); -#ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC_64S_HASH_MMU /* Display faulty slb contents for SLB errors. */ - if (evt->error_type == MCE_ERROR_TYPE_SLB) + if (evt->error_type == MCE_ERROR_TYPE_SLB && !in_guest) slb_dump_contents(local_paca->mce_faulty_slbs); #endif } @@ -557,7 +606,7 @@ EXPORT_SYMBOL_GPL(machine_check_print_event_info); * * regs->nip and regs->msr contains srr0 and ssr1. */ -long machine_check_early(struct pt_regs *regs) +DEFINE_INTERRUPT_HANDLER_NMI(machine_check_early) { long handled = 0; @@ -568,6 +617,7 @@ long machine_check_early(struct pt_regs *regs) */ if (ppc_md.machine_check_early) handled = ppc_md.machine_check_early(regs); + return handled; } @@ -679,11 +729,11 @@ long hmi_handle_debugtrig(struct pt_regs *regs) /* * Return values: */ -long hmi_exception_realmode(struct pt_regs *regs) +DEFINE_INTERRUPT_HANDLER_NMI(hmi_exception_realmode) { int ret; - __this_cpu_inc(irq_stat.hmi_exceptions); + local_paca->hmi_irqs++; ret = hmi_handle_debugtrig(regs); if (ret >= 0) @@ -698,3 +748,24 @@ long hmi_exception_realmode(struct pt_regs *regs) return 1; } + +void __init mce_init(void) +{ + struct mce_info *mce_info; + u64 limit; + int i; + + limit = min(ppc64_bolted_size(), ppc64_rma_size); + for_each_possible_cpu(i) { + mce_info = memblock_alloc_try_nid(sizeof(*mce_info), + __alignof__(*mce_info), + MEMBLOCK_LOW_LIMIT, + limit, early_cpu_to_node(i)); + if (!mce_info) + goto err; + paca_ptrs[i]->mce_info = mce_info; + } + return; +err: + panic("Failed to allocate memory for MCE event data\n"); +} diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c index 1cbf7f1a4e3d..71e8f2a92e36 100644 --- a/arch/powerpc/kernel/mce_power.c +++ b/arch/powerpc/kernel/mce_power.c @@ -12,14 +12,15 @@ #include <linux/types.h> #include <linux/ptrace.h> #include <linux/extable.h> +#include <linux/pgtable.h> #include <asm/mmu.h> #include <asm/mce.h> #include <asm/machdep.h> -#include <asm/pgtable.h> #include <asm/pte-walk.h> #include <asm/sstep.h> #include <asm/exception-64s.h> #include <asm/extable.h> +#include <asm/inst.h> /* * Convert an address related to an mm to a PFN. NOTE: we are in real @@ -27,7 +28,7 @@ */ unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr) { - pte_t *ptep; + pte_t *ptep, pte; unsigned int shift; unsigned long pfn, flags; struct mm_struct *mm; @@ -39,42 +40,52 @@ unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr) local_irq_save(flags); ptep = __find_linux_pte(mm->pgd, addr, NULL, &shift); + if (!ptep) { + pfn = ULONG_MAX; + goto out; + } + pte = READ_ONCE(*ptep); - if (!ptep || pte_special(*ptep)) { + if (!pte_present(pte) || pte_special(pte)) { pfn = ULONG_MAX; goto out; } if (shift <= PAGE_SHIFT) - pfn = pte_pfn(*ptep); + pfn = pte_pfn(pte); else { unsigned long rpnmask = (1ul << shift) - PAGE_SIZE; - pfn = pte_pfn(__pte(pte_val(*ptep) | (addr & rpnmask))); + pfn = pte_pfn(__pte(pte_val(pte) | (addr & rpnmask))); } - out: local_irq_restore(flags); return pfn; } -/* flush SLBs and reload */ -#ifdef CONFIG_PPC_BOOK3S_64 -void flush_and_reload_slb(void) +static bool mce_in_guest(void) { - /* Invalidate all SLBs */ - slb_flush_all_realmode(); - #ifdef CONFIG_KVM_BOOK3S_HANDLER /* - * If machine check is hit when in guest or in transition, we will - * only flush the SLBs and continue. + * If machine check is hit when in guest context or low level KVM + * code, avoid looking up any translations or making any attempts + * to recover, just record the event and pass to KVM. */ if (get_paca()->kvm_hstate.in_guest) - return; + return true; #endif + return false; +} + +/* flush SLBs and reload */ +#ifdef CONFIG_PPC_64S_HASH_MMU +void flush_and_reload_slb(void) +{ if (early_radix_enabled()) return; + /* Invalidate all SLBs */ + slb_flush_all_realmode(); + /* * This probably shouldn't happen, but it may be possible it's * called in early boot before SLB shadows are allocated. @@ -86,9 +97,9 @@ void flush_and_reload_slb(void) } #endif -static void flush_erat(void) +void flush_erat(void) { -#ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC_64S_HASH_MMU if (!early_cpu_has_feature(CPU_FTR_ARCH_300)) { flush_and_reload_slb(); return; @@ -103,7 +114,7 @@ static void flush_erat(void) static int mce_flush(int what) { -#ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC_64S_HASH_MMU if (what == MCE_FLUSH_SLB) { flush_and_reload_slb(); return 1; @@ -238,6 +249,45 @@ static const struct mce_ierror_table mce_p9_ierror_table[] = { MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0, 0, 0, 0, 0, 0, 0 } }; +static const struct mce_ierror_table mce_p10_ierror_table[] = { +{ 0x00000000081c0000, 0x0000000000040000, true, + MCE_ERROR_TYPE_UE, MCE_UE_ERROR_IFETCH, MCE_ECLASS_HARDWARE, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, +{ 0x00000000081c0000, 0x0000000000080000, true, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, MCE_ECLASS_HARD_INDETERMINATE, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, +{ 0x00000000081c0000, 0x00000000000c0000, true, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, MCE_ECLASS_SOFT_INDETERMINATE, + MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, +{ 0x00000000081c0000, 0x0000000000100000, true, + MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT, MCE_ECLASS_SOFT_INDETERMINATE, + MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, +{ 0x00000000081c0000, 0x0000000000140000, true, + MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT, MCE_ECLASS_SOFT_INDETERMINATE, + MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, +{ 0x00000000081c0000, 0x0000000000180000, true, + MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH, MCE_ECLASS_HARDWARE, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, +{ 0x00000000081c0000, 0x00000000001c0000, true, + MCE_ERROR_TYPE_RA, MCE_RA_ERROR_IFETCH_FOREIGN, MCE_ECLASS_SOFTWARE, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, +{ 0x00000000081c0000, 0x0000000008080000, true, + MCE_ERROR_TYPE_USER,MCE_USER_ERROR_SCV, MCE_ECLASS_SOFTWARE, + MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, +{ 0x00000000081c0000, 0x00000000080c0000, true, + MCE_ERROR_TYPE_RA, MCE_RA_ERROR_IFETCH, MCE_ECLASS_SOFTWARE, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, +{ 0x00000000081c0000, 0x0000000008100000, true, + MCE_ERROR_TYPE_RA, MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH, MCE_ECLASS_SOFTWARE, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, +{ 0x00000000081c0000, 0x0000000008140000, false, + MCE_ERROR_TYPE_RA, MCE_RA_ERROR_STORE, MCE_ECLASS_HARDWARE, + MCE_INITIATOR_CPU, MCE_SEV_FATAL, false }, /* ASYNC is fatal */ +{ 0x00000000081c0000, 0x00000000081c0000, true, MCE_ECLASS_HARDWARE, + MCE_ERROR_TYPE_RA, MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH_FOREIGN, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, +{ 0, 0, 0, 0, 0, 0, 0 } }; + struct mce_derror_table { unsigned long dsisr_value; bool dar_valid; /* dar is a valid indicator of faulting address */ @@ -356,6 +406,46 @@ static const struct mce_derror_table mce_p9_derror_table[] = { MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0, false, 0, 0, 0, 0, 0 } }; +static const struct mce_derror_table mce_p10_derror_table[] = { +{ 0x00008000, false, + MCE_ERROR_TYPE_UE, MCE_UE_ERROR_LOAD_STORE, MCE_ECLASS_HARDWARE, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, +{ 0x00004000, true, + MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE, + MCE_ECLASS_HARDWARE, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, +{ 0x00000800, true, + MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT, MCE_ECLASS_SOFT_INDETERMINATE, + MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, +{ 0x00000400, true, + MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT, MCE_ECLASS_SOFT_INDETERMINATE, + MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, +{ 0x00000200, false, + MCE_ERROR_TYPE_USER, MCE_USER_ERROR_TLBIE, MCE_ECLASS_SOFTWARE, + MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, +{ 0x00000080, true, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, /* Before PARITY */ + MCE_ECLASS_SOFT_INDETERMINATE, + MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, +{ 0x00000100, true, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, MCE_ECLASS_HARD_INDETERMINATE, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, +{ 0x00000040, true, + MCE_ERROR_TYPE_RA, MCE_RA_ERROR_LOAD, MCE_ECLASS_HARDWARE, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, +{ 0x00000020, false, + MCE_ERROR_TYPE_RA, MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE, + MCE_ECLASS_HARDWARE, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, +{ 0x00000010, false, + MCE_ERROR_TYPE_RA, MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE_FOREIGN, + MCE_ECLASS_HARDWARE, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, +{ 0x00000008, false, + MCE_ERROR_TYPE_RA, MCE_RA_ERROR_LOAD_STORE_FOREIGN, MCE_ECLASS_HARDWARE, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, +{ 0, false, 0, 0, 0, 0, 0 } }; + static int mce_find_instr_ea_and_phys(struct pt_regs *regs, uint64_t *addr, uint64_t *phys_addr) { @@ -365,7 +455,7 @@ static int mce_find_instr_ea_and_phys(struct pt_regs *regs, uint64_t *addr, * in real-mode is tricky and can lead to recursive * faults */ - int instr; + ppc_inst_t instr; unsigned long pfn, instr_addr; struct instruction_op op; struct pt_regs tmp = *regs; @@ -373,7 +463,7 @@ static int mce_find_instr_ea_and_phys(struct pt_regs *regs, uint64_t *addr, pfn = addr_to_pfn(regs, regs->nip); if (pfn != ULONG_MAX) { instr_addr = (pfn << PAGE_SHIFT) + (regs->nip & ~PAGE_MASK); - instr = *(unsigned int *)(instr_addr); + instr = ppc_inst_read((u32 *)instr_addr); if (!analyse_instr(&op, &tmp, instr)) { pfn = addr_to_pfn(regs, op.ea); *addr = op.ea; @@ -391,12 +481,11 @@ static int mce_find_instr_ea_and_phys(struct pt_regs *regs, uint64_t *addr, return -1; } -static int mce_handle_ierror(struct pt_regs *regs, +static int mce_handle_ierror(struct pt_regs *regs, unsigned long srr1, const struct mce_ierror_table table[], struct mce_error_info *mce_err, uint64_t *addr, uint64_t *phys_addr) { - uint64_t srr1 = regs->msr; int handled = 0; int i; @@ -406,19 +495,23 @@ static int mce_handle_ierror(struct pt_regs *regs, if ((srr1 & table[i].srr1_mask) != table[i].srr1_value) continue; - /* attempt to correct the error */ - switch (table[i].error_type) { - case MCE_ERROR_TYPE_SLB: - if (local_paca->in_mce == 1) - slb_save_contents(local_paca->mce_faulty_slbs); - handled = mce_flush(MCE_FLUSH_SLB); - break; - case MCE_ERROR_TYPE_ERAT: - handled = mce_flush(MCE_FLUSH_ERAT); - break; - case MCE_ERROR_TYPE_TLB: - handled = mce_flush(MCE_FLUSH_TLB); - break; + if (!mce_in_guest()) { + /* attempt to correct the error */ + switch (table[i].error_type) { + case MCE_ERROR_TYPE_SLB: +#ifdef CONFIG_PPC_64S_HASH_MMU + if (local_paca->in_mce == 1) + slb_save_contents(local_paca->mce_faulty_slbs); +#endif + handled = mce_flush(MCE_FLUSH_SLB); + break; + case MCE_ERROR_TYPE_ERAT: + handled = mce_flush(MCE_FLUSH_ERAT); + break; + case MCE_ERROR_TYPE_TLB: + handled = mce_flush(MCE_FLUSH_TLB); + break; + } } /* now fill in mce_error_info */ @@ -450,7 +543,7 @@ static int mce_handle_ierror(struct pt_regs *regs, mce_err->sync_error = table[i].sync_error; mce_err->severity = table[i].severity; mce_err->initiator = table[i].initiator; - if (table[i].nip_valid) { + if (table[i].nip_valid && !mce_in_guest()) { *addr = regs->nip; if (mce_err->sync_error && table[i].error_type == MCE_ERROR_TYPE_UE) { @@ -493,22 +586,26 @@ static int mce_handle_derror(struct pt_regs *regs, if (!(dsisr & table[i].dsisr_value)) continue; - /* attempt to correct the error */ - switch (table[i].error_type) { - case MCE_ERROR_TYPE_SLB: - if (local_paca->in_mce == 1) - slb_save_contents(local_paca->mce_faulty_slbs); - if (mce_flush(MCE_FLUSH_SLB)) - handled = 1; - break; - case MCE_ERROR_TYPE_ERAT: - if (mce_flush(MCE_FLUSH_ERAT)) - handled = 1; - break; - case MCE_ERROR_TYPE_TLB: - if (mce_flush(MCE_FLUSH_TLB)) - handled = 1; - break; + if (!mce_in_guest()) { + /* attempt to correct the error */ + switch (table[i].error_type) { + case MCE_ERROR_TYPE_SLB: +#ifdef CONFIG_PPC_64S_HASH_MMU + if (local_paca->in_mce == 1) + slb_save_contents(local_paca->mce_faulty_slbs); +#endif + if (mce_flush(MCE_FLUSH_SLB)) + handled = 1; + break; + case MCE_ERROR_TYPE_ERAT: + if (mce_flush(MCE_FLUSH_ERAT)) + handled = 1; + break; + case MCE_ERROR_TYPE_TLB: + if (mce_flush(MCE_FLUSH_TLB)) + handled = 1; + break; + } } /* @@ -550,7 +647,7 @@ static int mce_handle_derror(struct pt_regs *regs, mce_err->initiator = table[i].initiator; if (table[i].dar_valid) *addr = regs->dar; - else if (mce_err->sync_error && + else if (mce_err->sync_error && !mce_in_guest() && table[i].error_type == MCE_ERROR_TYPE_UE) { /* * We do a maximum of 4 nested MCE calls, see @@ -578,15 +675,12 @@ static int mce_handle_derror(struct pt_regs *regs, static long mce_handle_ue_error(struct pt_regs *regs, struct mce_error_info *mce_err) { - long handled = 0; - const struct exception_table_entry *entry; + if (mce_in_guest()) + return 0; - entry = search_kernel_exception_table(regs->nip); - if (entry) { - mce_err->ignore_event = true; - regs->nip = extable_fixup(entry); + mce_common_process_ue(regs, mce_err); + if (mce_err->ignore_event) return 1; - } /* * On specific SCOM read via MMIO we may get a machine check @@ -597,25 +691,26 @@ static long mce_handle_ue_error(struct pt_regs *regs, if (ppc_md.mce_check_early_recovery) { if (ppc_md.mce_check_early_recovery(regs)) - handled = 1; + return 1; } - return handled; + + return 0; } static long mce_handle_error(struct pt_regs *regs, + unsigned long srr1, const struct mce_derror_table dtable[], const struct mce_ierror_table itable[]) { struct mce_error_info mce_err = { 0 }; uint64_t addr, phys_addr = ULONG_MAX; - uint64_t srr1 = regs->msr; long handled; if (SRR1_MC_LOADSTORE(srr1)) handled = mce_handle_derror(regs, dtable, &mce_err, &addr, &phys_addr); else - handled = mce_handle_ierror(regs, itable, &mce_err, &addr, + handled = mce_handle_ierror(regs, srr1, itable, &mce_err, &addr, &phys_addr); if (!handled && mce_err.error_type == MCE_ERROR_TYPE_UE) @@ -631,16 +726,20 @@ long __machine_check_early_realmode_p7(struct pt_regs *regs) /* P7 DD1 leaves top bits of DSISR undefined */ regs->dsisr &= 0x0000ffff; - return mce_handle_error(regs, mce_p7_derror_table, mce_p7_ierror_table); + return mce_handle_error(regs, regs->msr, + mce_p7_derror_table, mce_p7_ierror_table); } long __machine_check_early_realmode_p8(struct pt_regs *regs) { - return mce_handle_error(regs, mce_p8_derror_table, mce_p8_ierror_table); + return mce_handle_error(regs, regs->msr, + mce_p8_derror_table, mce_p8_ierror_table); } long __machine_check_early_realmode_p9(struct pt_regs *regs) { + unsigned long srr1 = regs->msr; + /* * On POWER9 DD2.1 and below, it's possible to get a machine check * caused by a paste instruction where only DSISR bit 25 is set. This @@ -654,5 +753,39 @@ long __machine_check_early_realmode_p9(struct pt_regs *regs) if (SRR1_MC_LOADSTORE(regs->msr) && regs->dsisr == 0x02000000) return 1; - return mce_handle_error(regs, mce_p9_derror_table, mce_p9_ierror_table); + /* + * Async machine check due to bad real address from store or foreign + * link time out comes with the load/store bit (PPC bit 42) set in + * SRR1, but the cause comes in SRR1 not DSISR. Clear bit 42 so we're + * directed to the ierror table so it will find the cause (which + * describes it correctly as a store error). + */ + if (SRR1_MC_LOADSTORE(srr1) && + ((srr1 & 0x081c0000) == 0x08140000 || + (srr1 & 0x081c0000) == 0x08180000)) { + srr1 &= ~PPC_BIT(42); + } + + return mce_handle_error(regs, srr1, + mce_p9_derror_table, mce_p9_ierror_table); +} + +long __machine_check_early_realmode_p10(struct pt_regs *regs) +{ + unsigned long srr1 = regs->msr; + + /* + * Async machine check due to bad real address from store comes with + * the load/store bit (PPC bit 42) set in SRR1, but the cause comes in + * SRR1 not DSISR. Clear bit 42 so we're directed to the ierror table + * so it will find the cause (which describes it correctly as a store + * error). + */ + if (SRR1_MC_LOADSTORE(srr1) && + (srr1 & 0x081c0000) == 0x08140000) { + srr1 &= ~PPC_BIT(42); + } + + return mce_handle_error(regs, srr1, + mce_p10_derror_table, mce_p10_ierror_table); } diff --git a/arch/powerpc/kernel/misc.S b/arch/powerpc/kernel/misc.S index 974f65f79a8e..29e1440d14cc 100644 --- a/arch/powerpc/kernel/misc.S +++ b/arch/powerpc/kernel/misc.S @@ -10,11 +10,11 @@ * * setjmp/longjmp code by Paul Mackerras. */ +#include <linux/export.h> #include <asm/ppc_asm.h> #include <asm/unistd.h> #include <asm/asm-compat.h> #include <asm/asm-offsets.h> -#include <asm/export.h> .text @@ -29,13 +29,15 @@ _GLOBAL(reloc_offset) li r3, 0 _GLOBAL(add_reloc_offset) mflr r0 - bl 1f + bcl 20,31,$+4 1: mflr r5 PPC_LL r4,(2f-1b)(r5) subf r5,r4,r5 add r3,r3,r5 mtlr r0 blr +_ASM_NOKPROBE_SYMBOL(reloc_offset) +_ASM_NOKPROBE_SYMBOL(add_reloc_offset) .align 3 2: PPC_LONG 1b @@ -110,7 +112,7 @@ _GLOBAL(longjmp) li r3, 1 blr -_GLOBAL(current_stack_pointer) +_GLOBAL(current_stack_frame) PPC_LL r3,0(r1) blr -EXPORT_SYMBOL(current_stack_pointer) +EXPORT_SYMBOL(current_stack_frame) diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S index d80212be8698..2eabb15687a6 100644 --- a/arch/powerpc/kernel/misc_32.S +++ b/arch/powerpc/kernel/misc_32.S @@ -8,6 +8,7 @@ * */ +#include <linux/export.h> #include <linux/sys.h> #include <asm/unistd.h> #include <asm/errno.h> @@ -22,51 +23,11 @@ #include <asm/processor.h> #include <asm/bug.h> #include <asm/ptrace.h> -#include <asm/export.h> #include <asm/feature-fixups.h> .text /* - * We store the saved ksp_limit in the unused part - * of the STACK_FRAME_OVERHEAD - */ -_GLOBAL(call_do_softirq) - mflr r0 - stw r0,4(r1) - lwz r10,THREAD+KSP_LIMIT(r2) - stw r3, THREAD+KSP_LIMIT(r2) - stwu r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r3) - mr r1,r3 - stw r10,8(r1) - bl __do_softirq - lwz r10,8(r1) - lwz r1,0(r1) - lwz r0,4(r1) - stw r10,THREAD+KSP_LIMIT(r2) - mtlr r0 - blr - -/* - * void call_do_irq(struct pt_regs *regs, void *sp); - */ -_GLOBAL(call_do_irq) - mflr r0 - stw r0,4(r1) - lwz r10,THREAD+KSP_LIMIT(r2) - stw r4, THREAD+KSP_LIMIT(r2) - stwu r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r4) - mr r1,r4 - stw r10,8(r1) - bl __do_irq - lwz r10,8(r1) - lwz r1,0(r1) - lwz r0,4(r1) - stw r10,THREAD+KSP_LIMIT(r2) - mtlr r0 - blr - -/* * This returns the high 64 bits of the product of two 64-bit numbers. */ _GLOBAL(mulhdu) @@ -106,7 +67,7 @@ _GLOBAL(reloc_got2) srwi. r8,r8,2 beqlr mtctr r8 - bl 1f + bcl 20,31,$+4 1: mflr r0 lis r4,1b@ha addi r4,r4,1b@l @@ -215,19 +176,6 @@ _GLOBAL(low_choose_7447a_dfs) #endif /* CONFIG_CPU_FREQ_PMAC && CONFIG_PPC_BOOK3S_32 */ -/* - * complement mask on the msr then "or" some values on. - * _nmask_and_or_msr(nmask, value_to_or) - */ -_GLOBAL(_nmask_and_or_msr) - mfmsr r0 /* Get current msr */ - andc r0,r0,r3 /* And off the bits set in r3 (first parm) */ - or r0,r0,r4 /* Or on the bits in r4 (second parm) */ - SYNC /* Some chip revs have problems here... */ - mtmsr r0 /* Update machine state */ - isync - blr /* Done */ - #ifdef CONFIG_40x /* @@ -246,6 +194,7 @@ _GLOBAL(real_readb) sync isync blr +_ASM_NOKPROBE_SYMBOL(real_readb) /* * Do an IO access in real mode @@ -263,53 +212,10 @@ _GLOBAL(real_writeb) sync isync blr +_ASM_NOKPROBE_SYMBOL(real_writeb) #endif /* CONFIG_40x */ - -/* - * Flush instruction cache. - * This is a no-op on the 601. - */ -#ifndef CONFIG_PPC_8xx -_GLOBAL(flush_instruction_cache) -#if defined(CONFIG_4xx) -#ifdef CONFIG_403GCX - li r3, 512 - mtctr r3 - lis r4, KERNELBASE@h -1: iccci 0, r4 - addi r4, r4, 16 - bdnz 1b -#else - lis r3, KERNELBASE@h - iccci 0,r3 -#endif -#elif defined(CONFIG_FSL_BOOKE) -#ifdef CONFIG_E200 - mfspr r3,SPRN_L1CSR0 - ori r3,r3,L1CSR0_CFI|L1CSR0_CLFC - /* msync; isync recommended here */ - mtspr SPRN_L1CSR0,r3 - isync - blr -#endif - mfspr r3,SPRN_L1CSR1 - ori r3,r3,L1CSR1_ICFI|L1CSR1_ICLFR - mtspr SPRN_L1CSR1,r3 -#elif defined(CONFIG_PPC_BOOK3S_601) - blr /* for 601, do nothing */ -#else - /* 603/604 processor - use invalidate-all bit in HID0 */ - mfspr r3,SPRN_HID0 - ori r3,r3,HID0_ICFI - mtspr SPRN_HID0,r3 -#endif /* CONFIG_4xx */ - isync - blr -EXPORT_SYMBOL(flush_instruction_cache) -#endif /* CONFIG_PPC_8xx */ - /* * Copy a whole page. We use the dcbz instruction on the destination * to reduce memory traffic (it eliminates the unnecessary reads of @@ -331,7 +237,7 @@ _GLOBAL(copy_page) addi r3,r3,-4 0: twnei r5, 0 /* WARN if r3 is not cache aligned */ - EMIT_BUG_ENTRY 0b,__FILE__,__LINE__, BUGFLAG_WARNING + EMIT_WARN_ENTRY 0b,__FILE__,__LINE__, BUGFLAG_WARNING addi r4,r4,-4 @@ -476,15 +382,9 @@ EXPORT_SYMBOL(__bswapdi2) _GLOBAL(start_secondary_resume) /* Reset stack */ rlwinm r1, r1, 0, 0, 31 - THREAD_SHIFT - addi r1,r1,THREAD_SIZE-STACK_FRAME_OVERHEAD + addi r1,r1,THREAD_SIZE-STACK_FRAME_MIN_SIZE li r3,0 stw r3,0(r1) /* Zero the stack frame pointer */ bl start_secondary b . #endif /* CONFIG_SMP */ - -/* - * This routine is just here to keep GCC happy - sigh... - */ -_GLOBAL(__main) - blr diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S index 1864605eca29..1a8cdafd68e8 100644 --- a/arch/powerpc/kernel/misc_64.S +++ b/arch/powerpc/kernel/misc_64.S @@ -9,6 +9,8 @@ * PPC64 updates by Dave Engebretsen (engebret@us.ibm.com) */ +#include <linux/export.h> +#include <linux/linkage.h> #include <linux/sys.h> #include <asm/unistd.h> #include <asm/errno.h> @@ -22,33 +24,10 @@ #include <asm/kexec.h> #include <asm/ptrace.h> #include <asm/mmu.h> -#include <asm/export.h> #include <asm/feature-fixups.h> .text -_GLOBAL(call_do_softirq) - mflr r0 - std r0,16(r1) - stdu r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r3) - mr r1,r3 - bl __do_softirq - ld r1,0(r1) - ld r0,16(r1) - mtlr r0 - blr - -_GLOBAL(call_do_irq) - mflr r0 - std r0,16(r1) - stdu r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r4) - mr r1,r4 - bl __do_irq - ld r1,0(r1) - ld r0,16(r1) - mtlr r0 - blr - _GLOBAL(__bswapdi2) EXPORT_SYMBOL(__bswapdi2) srdi r8,r3,32 @@ -277,7 +256,7 @@ _GLOBAL(scom970_write) * Physical (hardware) cpu id should be in r3. */ _GLOBAL(kexec_wait) - bl 1f + bcl 20,31,$+4 1: mflr r5 addi r5,r5,kexec_flag-1b @@ -308,7 +287,7 @@ kexec_flag: #ifdef CONFIG_KEXEC_CORE -#ifdef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_BOOK3E_64 /* * BOOK3E has no real MMU mode, so we have to setup the initial TLB * for a core to identity map v:0 to p:0. This current implementation @@ -365,7 +344,6 @@ _GLOBAL(kexec_smp_wait) li r4,KEXEC_STATE_REAL_MODE stb r4,PACAKEXECSTATE(r13) - SYNC b kexec_wait @@ -376,8 +354,8 @@ _GLOBAL(kexec_smp_wait) * * don't overwrite r3 here, it is live for kexec_wait above. */ -real_mode: /* assume normal blr return */ -#ifdef CONFIG_PPC_BOOK3E +SYM_FUNC_START_LOCAL(real_mode) /* assume normal blr return */ +#ifdef CONFIG_PPC_BOOK3E_64 /* Create an identity mapping. */ b kexec_create_tlb #else @@ -393,6 +371,7 @@ real_mode: /* assume normal blr return */ mtspr SPRN_SRR0,r11 rfid #endif +SYM_FUNC_END(real_mode) /* * kexec_sequence(newstack, start, image, control, clear_all(), @@ -407,26 +386,12 @@ _GLOBAL(kexec_sequence) std r0,16(r1) /* switch stacks to newstack -- &kexec_stack.stack */ - stdu r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r3) + stdu r1,THREAD_SIZE-STACK_FRAME_MIN_SIZE(r3) mr r1,r3 li r0,0 std r0,16(r1) -BEGIN_FTR_SECTION - /* - * This is the best time to turn AMR/IAMR off. - * key 0 is used in radix for supervisor<->user - * protection, but on hash key 0 is reserved - * ideally we want to enter with a clean state. - * NOTE, we rely on r0 being 0 from above. - */ - mtspr SPRN_IAMR,r0 -BEGIN_FTR_SECTION_NESTED(42) - mtspr SPRN_AMOR,r0 -END_FTR_SECTION_NESTED_IFSET(CPU_FTR_HVMODE, 42) -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) - /* save regs for local vars on new stack. * yes, we won't go back, but ... */ @@ -438,7 +403,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) std r26,-48(r1) std r25,-56(r1) - stdu r1,-STACK_FRAME_OVERHEAD-64(r1) + stdu r1,-STACK_FRAME_MIN_SIZE-64(r1) /* save args into preserved regs */ mr r31,r3 /* newstack (both) */ @@ -450,7 +415,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) lhz r25,PACAHWCPUID(r13) /* get our phys cpu from paca */ /* disable interrupts, we are overwriting kernel data next */ -#ifdef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_BOOK3E_64 wrteei 0 #else mfmsr r3 @@ -467,7 +432,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) 1: /* copy dest pages, flush whole dest image */ mr r3,r29 - bl kexec_copy_flush /* (image) */ + bl CFUNC(kexec_copy_flush) /* (image) */ /* turn off mmu now if not done earlier */ cmpdi r26,0 @@ -491,7 +456,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) beq 1f /* clear out hardware hash page table and tlb */ -#ifdef PPC64_ELF_ABI_v1 +#ifdef CONFIG_PPC64_ELF_ABI_V1 ld r12,0(r27) /* deref function descriptor */ #else mr r12,r27 diff --git a/arch/powerpc/kernel/module.c b/arch/powerpc/kernel/module.c index df649acb5631..f6d6ae0a1692 100644 --- a/arch/powerpc/kernel/module.c +++ b/arch/powerpc/kernel/module.c @@ -8,12 +8,14 @@ #include <linux/moduleloader.h> #include <linux/err.h> #include <linux/vmalloc.h> +#include <linux/mm.h> #include <linux/bug.h> #include <asm/module.h> #include <linux/uaccess.h> #include <asm/firmware.h> #include <linux/sort.h> #include <asm/setup.h> +#include <asm/sections.h> static LIST_HEAD(module_bug_list); @@ -62,13 +64,13 @@ int module_finalize(const Elf_Ehdr *hdr, (void *)sect->sh_addr + sect->sh_size); #endif /* CONFIG_PPC64 */ -#ifdef PPC64_ELF_ABI_v1 +#ifdef CONFIG_PPC64_ELF_ABI_V1 sect = find_section(hdr, sechdrs, ".opd"); if (sect != NULL) { me->arch.start_opd = sect->sh_addr; me->arch.end_opd = sect->sh_addr + sect->sh_size; } -#endif /* PPC64_ELF_ABI_v1 */ +#endif /* CONFIG_PPC64_ELF_ABI_V1 */ #ifdef CONFIG_PPC_BARRIER_NOSPEC sect = find_section(hdr, sechdrs, "__spec_barrier_fixup"); @@ -86,3 +88,40 @@ int module_finalize(const Elf_Ehdr *hdr, return 0; } + +static __always_inline void * +__module_alloc(unsigned long size, unsigned long start, unsigned long end, bool nowarn) +{ + pgprot_t prot = strict_module_rwx_enabled() ? PAGE_KERNEL : PAGE_KERNEL_EXEC; + gfp_t gfp = GFP_KERNEL | (nowarn ? __GFP_NOWARN : 0); + + /* + * Don't do huge page allocations for modules yet until more testing + * is done. STRICT_MODULE_RWX may require extra work to support this + * too. + */ + return __vmalloc_node_range(size, 1, start, end, gfp, prot, + VM_FLUSH_RESET_PERMS, + NUMA_NO_NODE, __builtin_return_address(0)); +} + +void *module_alloc(unsigned long size) +{ +#ifdef MODULES_VADDR + unsigned long limit = (unsigned long)_etext - SZ_32M; + void *ptr = NULL; + + BUILD_BUG_ON(TASK_SIZE > MODULES_VADDR); + + /* First try within 32M limit from _etext to avoid branch trampolines */ + if (MODULES_VADDR < PAGE_OFFSET && MODULES_END > limit) + ptr = __module_alloc(size, limit, MODULES_END, true); + + if (!ptr) + ptr = __module_alloc(size, MODULES_VADDR, MODULES_END, false); + + return ptr; +#else + return __module_alloc(size, VMALLOC_START, VMALLOC_END, false); +#endif +} diff --git a/arch/powerpc/kernel/module.lds b/arch/powerpc/kernel/module.lds deleted file mode 100644 index cea5dc124be4..000000000000 --- a/arch/powerpc/kernel/module.lds +++ /dev/null @@ -1,8 +0,0 @@ -/* Force alignment of .toc section. */ -SECTIONS -{ - .toc 0 : ALIGN(256) - { - *(.got .toc) - } -} diff --git a/arch/powerpc/kernel/module_32.c b/arch/powerpc/kernel/module_32.c index d7134c614c16..816a63fd71fb 100644 --- a/arch/powerpc/kernel/module_32.c +++ b/arch/powerpc/kernel/module_32.c @@ -18,6 +18,7 @@ #include <linux/bug.h> #include <linux/sort.h> #include <asm/setup.h> +#include <asm/code-patching.h> /* Count how many different relocations (different symbol, different addend) */ @@ -67,21 +68,6 @@ static int relacmp(const void *_x, const void *_y) return 0; } -static void relaswap(void *_x, void *_y, int size) -{ - uint32_t *x, *y, tmp; - int i; - - y = (uint32_t *)_x; - x = (uint32_t *)_y; - - for (i = 0; i < sizeof(Elf32_Rela) / sizeof(uint32_t); i++) { - tmp = x[i]; - x[i] = y[i]; - y[i] = tmp; - } -} - /* Get the potential trampolines size required of the init and non-init sections */ static unsigned long get_plt_size(const Elf32_Ehdr *hdr, @@ -113,12 +99,12 @@ static unsigned long get_plt_size(const Elf32_Ehdr *hdr, /* Sort the relocation information based on a symbol and * addend key. This is a stable O(n*log n) complexity - * alogrithm but it will reduce the complexity of + * algorithm but it will reduce the complexity of * count_relocs() to linear complexity O(n) */ sort((void *)hdr + sechdrs[i].sh_offset, sechdrs[i].sh_size / sizeof(Elf32_Rela), - sizeof(Elf32_Rela), relacmp, relaswap); + sizeof(Elf32_Rela), relacmp, NULL); ret += count_relocs((void *)hdr + sechdrs[i].sh_offset, @@ -160,10 +146,9 @@ int module_frob_arch_sections(Elf32_Ehdr *hdr, static inline int entry_matches(struct ppc_plt_entry *entry, Elf32_Addr val) { - if (entry->jump[0] != (PPC_INST_ADDIS | __PPC_RT(R12) | PPC_HA(val))) + if (entry->jump[0] != PPC_RAW_LIS(_R12, PPC_HA(val))) return 0; - if (entry->jump[1] != (PPC_INST_ADDI | __PPC_RT(R12) | __PPC_RA(R12) | - PPC_LO(val))) + if (entry->jump[1] != PPC_RAW_ADDI(_R12, _R12, PPC_LO(val))) return 0; return 1; } @@ -178,8 +163,7 @@ static uint32_t do_plt_call(void *location, pr_debug("Doing plt for call to 0x%x at 0x%x\n", val, (unsigned int)location); /* Init, or core PLT? */ - if (location >= mod->core_layout.base - && location < mod->core_layout.base + mod->core_layout.size) + if (within_module_core((unsigned long)location, mod)) entry = (void *)sechdrs[mod->arch.core_plt_section].sh_addr; else entry = (void *)sechdrs[mod->arch.init_plt_section].sh_addr; @@ -190,21 +174,25 @@ static uint32_t do_plt_call(void *location, entry++; } - /* - * lis r12, sym@ha - * addi r12, r12, sym@l - * mtctr r12 - * bctr - */ - entry->jump[0] = PPC_INST_ADDIS | __PPC_RT(R12) | PPC_HA(val); - entry->jump[1] = PPC_INST_ADDI | __PPC_RT(R12) | __PPC_RA(R12) | PPC_LO(val); - entry->jump[2] = PPC_INST_MTCTR | __PPC_RS(R12); - entry->jump[3] = PPC_INST_BCTR; + if (patch_instruction(&entry->jump[0], ppc_inst(PPC_RAW_LIS(_R12, PPC_HA(val))))) + return 0; + if (patch_instruction(&entry->jump[1], ppc_inst(PPC_RAW_ADDI(_R12, _R12, PPC_LO(val))))) + return 0; + if (patch_instruction(&entry->jump[2], ppc_inst(PPC_RAW_MTCTR(_R12)))) + return 0; + if (patch_instruction(&entry->jump[3], ppc_inst(PPC_RAW_BCTR()))) + return 0; pr_debug("Initialized plt for 0x%x at %p\n", val, entry); return (uint32_t)entry; } +static int patch_location_16(uint32_t *loc, u16 value) +{ + loc = PTR_ALIGN_DOWN(loc, sizeof(u32)); + return patch_instruction(loc, ppc_inst((*loc & 0xffff0000) | value)); +} + int apply_relocate_add(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex, @@ -238,44 +226,46 @@ int apply_relocate_add(Elf32_Shdr *sechdrs, case R_PPC_ADDR16_LO: /* Low half of the symbol */ - *(uint16_t *)location = value; + if (patch_location_16(location, PPC_LO(value))) + return -EFAULT; break; case R_PPC_ADDR16_HI: /* Higher half of the symbol */ - *(uint16_t *)location = (value >> 16); + if (patch_location_16(location, PPC_HI(value))) + return -EFAULT; break; case R_PPC_ADDR16_HA: - /* Sign-adjusted lower 16 bits: PPC ELF ABI says: - (((x >> 16) + ((x & 0x8000) ? 1 : 0))) & 0xFFFF. - This is the same, only sane. - */ - *(uint16_t *)location = (value + 0x8000) >> 16; + if (patch_location_16(location, PPC_HA(value))) + return -EFAULT; break; case R_PPC_REL24: if ((int)(value - (uint32_t)location) < -0x02000000 - || (int)(value - (uint32_t)location) >= 0x02000000) + || (int)(value - (uint32_t)location) >= 0x02000000) { value = do_plt_call(location, value, sechdrs, module); + if (!value) + return -EFAULT; + } /* Only replace bits 2 through 26 */ pr_debug("REL24 value = %08X. location = %08X\n", value, (uint32_t)location); pr_debug("Location before: %08X.\n", *(uint32_t *)location); - *(uint32_t *)location - = (*(uint32_t *)location & ~0x03fffffc) - | ((value - (uint32_t)location) - & 0x03fffffc); + value = (*(uint32_t *)location & ~PPC_LI_MASK) | + PPC_LI(value - (uint32_t)location); + + if (patch_instruction(location, ppc_inst(value))) + return -EFAULT; + pr_debug("Location after: %08X.\n", *(uint32_t *)location); pr_debug("ie. jump to %08X+%08X = %08X\n", - *(uint32_t *)location & 0x03fffffc, - (uint32_t)location, - (*(uint32_t *)location & 0x03fffffc) - + (uint32_t)location); + *(uint32_t *)PPC_LI((uint32_t)location), (uint32_t)location, + (*(uint32_t *)PPC_LI((uint32_t)location)) + (uint32_t)location); break; case R_PPC_REL32: @@ -295,14 +285,56 @@ int apply_relocate_add(Elf32_Shdr *sechdrs, } #ifdef CONFIG_DYNAMIC_FTRACE +notrace int module_trampoline_target(struct module *mod, unsigned long addr, + unsigned long *target) +{ + ppc_inst_t jmp[4]; + + /* Find where the trampoline jumps to */ + if (copy_inst_from_kernel_nofault(jmp, (void *)addr)) + return -EFAULT; + if (__copy_inst_from_kernel_nofault(jmp + 1, (void *)addr + 4)) + return -EFAULT; + if (__copy_inst_from_kernel_nofault(jmp + 2, (void *)addr + 8)) + return -EFAULT; + if (__copy_inst_from_kernel_nofault(jmp + 3, (void *)addr + 12)) + return -EFAULT; + + /* verify that this is what we expect it to be */ + if ((ppc_inst_val(jmp[0]) & 0xffff0000) != PPC_RAW_LIS(_R12, 0)) + return -EINVAL; + if ((ppc_inst_val(jmp[1]) & 0xffff0000) != PPC_RAW_ADDI(_R12, _R12, 0)) + return -EINVAL; + if (ppc_inst_val(jmp[2]) != PPC_RAW_MTCTR(_R12)) + return -EINVAL; + if (ppc_inst_val(jmp[3]) != PPC_RAW_BCTR()) + return -EINVAL; + + addr = (ppc_inst_val(jmp[1]) & 0xffff) | ((ppc_inst_val(jmp[0]) & 0xffff) << 16); + if (addr & 0x8000) + addr -= 0x10000; + + *target = addr; + + return 0; +} + int module_finalize_ftrace(struct module *module, const Elf_Shdr *sechdrs) { - module->arch.tramp = do_plt_call(module->core_layout.base, + module->arch.tramp = do_plt_call(module->mem[MOD_TEXT].base, (unsigned long)ftrace_caller, sechdrs, module); if (!module->arch.tramp) return -ENOENT; +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS + module->arch.tramp_regs = do_plt_call(module->mem[MOD_TEXT].base, + (unsigned long)ftrace_regs_caller, + sechdrs, module); + if (!module->arch.tramp_regs) + return -ENOENT; +#endif + return 0; } #endif diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c index 007606a48fd9..7112adc597a8 100644 --- a/arch/powerpc/kernel/module_64.c +++ b/arch/powerpc/kernel/module_64.c @@ -14,12 +14,14 @@ #include <linux/ftrace.h> #include <linux/bug.h> #include <linux/uaccess.h> +#include <linux/kernel.h> #include <asm/module.h> #include <asm/firmware.h> #include <asm/code-patching.h> #include <linux/sort.h> #include <asm/setup.h> #include <asm/sections.h> +#include <asm/inst.h> /* FIXME: We don't do .init separately. To do this, we'd need to have a separate r2 value in the init and core section, and stub between @@ -29,22 +31,25 @@ this, and makes other things simpler. Anton? --RR. */ -#ifdef PPC64_ELF_ABI_v2 +bool module_elf_check_arch(Elf_Ehdr *hdr) +{ + unsigned long abi_level = hdr->e_flags & 0x3; + + if (IS_ENABLED(CONFIG_PPC64_ELF_ABI_V2)) + return abi_level == 2; + else + return abi_level < 2; +} -/* An address is simply the address of the function. */ -typedef unsigned long func_desc_t; +#ifdef CONFIG_PPC64_ELF_ABI_V2 static func_desc_t func_desc(unsigned long addr) { - return addr; -} -static unsigned long func_addr(unsigned long addr) -{ - return addr; -} -static unsigned long stub_func_addr(func_desc_t func) -{ - return func; + func_desc_t desc = { + .addr = addr, + }; + + return desc; } /* PowerPC64 specific values for the Elf64_Sym st_other field. */ @@ -62,20 +67,9 @@ static unsigned int local_entry_offset(const Elf64_Sym *sym) } #else -/* An address is address of the OPD entry, which contains address of fn. */ -typedef struct ppc64_opd_entry func_desc_t; - static func_desc_t func_desc(unsigned long addr) { - return *(struct ppc64_opd_entry *)addr; -} -static unsigned long func_addr(unsigned long addr) -{ - return func_desc(addr).funcaddr; -} -static unsigned long stub_func_addr(func_desc_t func) -{ - return func.funcaddr; + return *(struct func_desc *)addr; } static unsigned int local_entry_offset(const Elf64_Sym *sym) { @@ -92,97 +86,80 @@ void *dereference_module_function_descriptor(struct module *mod, void *ptr) } #endif +static unsigned long func_addr(unsigned long addr) +{ + return func_desc(addr).addr; +} + +static unsigned long stub_func_addr(func_desc_t func) +{ + return func.addr; +} + #define STUB_MAGIC 0x73747562 /* stub */ /* Like PPC32, we need little trampolines to do > 24-bit jumps (into the kernel itself). But on PPC64, these need to be used for every jump, actually, to reset r2 (TOC+0x8000). */ -struct ppc64_stub_entry -{ - /* 28 byte jump instruction sequence (7 instructions). We only - * need 6 instructions on ABIv2 but we always allocate 7 so - * so we don't have to modify the trampoline load instruction. */ +struct ppc64_stub_entry { + /* + * 28 byte jump instruction sequence (7 instructions) that can + * hold ppc64_stub_insns or stub_insns. Must be 8-byte aligned + * with PCREL kernels that use prefix instructions in the stub. + */ u32 jump[7]; /* Used by ftrace to identify stubs */ u32 magic; /* Data for the above code */ func_desc_t funcdata; +} __aligned(8); + +struct ppc64_got_entry { + u64 addr; }; /* * PPC64 uses 24 bit jumps, but we need to jump into other modules or * the kernel which may be further. So we jump to a stub. * - * For ELFv1 we need to use this to set up the new r2 value (aka TOC - * pointer). For ELFv2 it's the callee's responsibility to set up the - * new r2, but for both we need to save the old r2. + * Target address and TOC are loaded from function descriptor in the + * ppc64_stub_entry. * - * We could simply patch the new r2 value and function pointer into - * the stub, but it's significantly shorter to put these values at the - * end of the stub code, and patch the stub address (32-bits relative - * to the TOC ptr, r2) into the stub. + * r12 is used to generate the target address, which is required for the + * ELFv2 global entry point calling convention. * - * addis r11,r2, <high> - * addi r11,r11, <low> - * std r2,R2_STACK_OFFSET(r1) - * ld r12,32(r11) - * ld r2,40(r11) - * mtctr r12 - * bctr + * TOC handling: + * - PCREL does not have a TOC. + * - ELFv2 non-PCREL just has to save r2, the callee is responsible for + * setting its own TOC pointer at the global entry address. + * - ELFv1 must load the new TOC pointer from the function descriptor. */ static u32 ppc64_stub_insns[] = { - PPC_INST_ADDIS | __PPC_RT(R11) | __PPC_RA(R2), - PPC_INST_ADDI | __PPC_RT(R11) | __PPC_RA(R11), +#ifdef CONFIG_PPC_KERNEL_PCREL + /* pld r12,addr */ + PPC_PREFIX_8LS | __PPC_PRFX_R(1), + PPC_INST_PLD | ___PPC_RT(_R12), +#else + PPC_RAW_ADDIS(_R11, _R2, 0), + PPC_RAW_ADDI(_R11, _R11, 0), /* Save current r2 value in magic place on the stack. */ - PPC_INST_STD | __PPC_RS(R2) | __PPC_RA(R1) | R2_STACK_OFFSET, - PPC_INST_LD | __PPC_RT(R12) | __PPC_RA(R11) | 32, -#ifdef PPC64_ELF_ABI_v1 + PPC_RAW_STD(_R2, _R1, R2_STACK_OFFSET), + PPC_RAW_LD(_R12, _R11, 32), +#ifdef CONFIG_PPC64_ELF_ABI_V1 /* Set up new r2 from function descriptor */ - PPC_INST_LD | __PPC_RT(R2) | __PPC_RA(R11) | 40, + PPC_RAW_LD(_R2, _R11, 40), #endif - PPC_INST_MTCTR | __PPC_RS(R12), - PPC_INST_BCTR, -}; - -#ifdef CONFIG_DYNAMIC_FTRACE -int module_trampoline_target(struct module *mod, unsigned long addr, - unsigned long *target) -{ - struct ppc64_stub_entry *stub; - func_desc_t funcdata; - u32 magic; - - if (!within_module_core(addr, mod)) { - pr_err("%s: stub %lx not in module %s\n", __func__, addr, mod->name); - return -EFAULT; - } - - stub = (struct ppc64_stub_entry *)addr; - - if (probe_kernel_read(&magic, &stub->magic, sizeof(magic))) { - pr_err("%s: fault reading magic for stub %lx for %s\n", __func__, addr, mod->name); - return -EFAULT; - } - - if (magic != STUB_MAGIC) { - pr_err("%s: bad magic for stub %lx for %s\n", __func__, addr, mod->name); - return -EFAULT; - } - - if (probe_kernel_read(&funcdata, &stub->funcdata, sizeof(funcdata))) { - pr_err("%s: fault reading funcdata for stub %lx for %s\n", __func__, addr, mod->name); - return -EFAULT; - } - - *target = stub_func_addr(funcdata); - - return 0; -} #endif + PPC_RAW_MTCTR(_R12), + PPC_RAW_BCTR(), +}; -/* Count how many different 24-bit relocations (different symbol, - different addend) */ -static unsigned int count_relocs(const Elf64_Rela *rela, unsigned int num) +/* + * Count how many different r_type relocations (different symbol, + * different addend). + */ +static unsigned int count_relocs(const Elf64_Rela *rela, unsigned int num, + unsigned long r_type) { unsigned int i, r_info, r_addend, _count_relocs; @@ -191,8 +168,8 @@ static unsigned int count_relocs(const Elf64_Rela *rela, unsigned int num) r_info = 0; r_addend = 0; for (i = 0; i < num; i++) - /* Only count 24-bit relocs, others don't need stubs */ - if (ELF64_R_TYPE(rela[i].r_info) == R_PPC_REL24 && + /* Only count r_type relocs, others don't need stubs */ + if (ELF64_R_TYPE(rela[i].r_info) == r_type && (r_info != ELF64_R_SYM(rela[i].r_info) || r_addend != rela[i].r_addend)) { _count_relocs++; @@ -226,26 +203,11 @@ static int relacmp(const void *_x, const void *_y) return 0; } -static void relaswap(void *_x, void *_y, int size) -{ - uint64_t *x, *y, tmp; - int i; - - y = (uint64_t *)_x; - x = (uint64_t *)_y; - - for (i = 0; i < sizeof(Elf64_Rela) / sizeof(uint64_t); i++) { - tmp = x[i]; - x[i] = y[i]; - y[i] = tmp; - } -} - /* Get size of potential trampolines required. */ static unsigned long get_stubs_size(const Elf64_Ehdr *hdr, const Elf64_Shdr *sechdrs) { - /* One extra reloc so it's always 0-funcaddr terminated */ + /* One extra reloc so it's always 0-addr terminated */ unsigned long relocs = 1; unsigned i; @@ -259,16 +221,23 @@ static unsigned long get_stubs_size(const Elf64_Ehdr *hdr, /* Sort the relocation information based on a symbol and * addend key. This is a stable O(n*log n) complexity - * alogrithm but it will reduce the complexity of + * algorithm but it will reduce the complexity of * count_relocs() to linear complexity O(n) */ sort((void *)sechdrs[i].sh_addr, sechdrs[i].sh_size / sizeof(Elf64_Rela), - sizeof(Elf64_Rela), relacmp, relaswap); + sizeof(Elf64_Rela), relacmp, NULL); relocs += count_relocs((void *)sechdrs[i].sh_addr, sechdrs[i].sh_size - / sizeof(Elf64_Rela)); + / sizeof(Elf64_Rela), + R_PPC_REL24); +#ifdef CONFIG_PPC_KERNEL_PCREL + relocs += count_relocs((void *)sechdrs[i].sh_addr, + sechdrs[i].sh_size + / sizeof(Elf64_Rela), + R_PPC64_REL24_NOTOC); +#endif } } @@ -285,6 +254,95 @@ static unsigned long get_stubs_size(const Elf64_Ehdr *hdr, return relocs * sizeof(struct ppc64_stub_entry); } +#ifdef CONFIG_PPC_KERNEL_PCREL +static int count_pcpu_relocs(const Elf64_Shdr *sechdrs, + const Elf64_Rela *rela, unsigned int num, + unsigned int symindex, unsigned int pcpu) +{ + unsigned int i, r_info, r_addend, _count_relocs; + + _count_relocs = 0; + r_info = 0; + r_addend = 0; + + for (i = 0; i < num; i++) { + Elf64_Sym *sym; + + /* This is the symbol it is referring to */ + sym = (Elf64_Sym *)sechdrs[symindex].sh_addr + + ELF64_R_SYM(rela[i].r_info); + + if (sym->st_shndx == pcpu && + (r_info != ELF64_R_SYM(rela[i].r_info) || + r_addend != rela[i].r_addend)) { + _count_relocs++; + r_info = ELF64_R_SYM(rela[i].r_info); + r_addend = rela[i].r_addend; + } + } + + return _count_relocs; +} + +/* Get size of potential GOT required. */ +static unsigned long get_got_size(const Elf64_Ehdr *hdr, + const Elf64_Shdr *sechdrs, + struct module *me) +{ + /* One extra reloc so it's always 0-addr terminated */ + unsigned long relocs = 1; + unsigned int i, symindex = 0; + + for (i = 1; i < hdr->e_shnum; i++) { + if (sechdrs[i].sh_type == SHT_SYMTAB) { + symindex = i; + break; + } + } + WARN_ON_ONCE(!symindex); + + /* Every relocated section... */ + for (i = 1; i < hdr->e_shnum; i++) { + if (sechdrs[i].sh_type == SHT_RELA) { + pr_debug("Found relocations in section %u\n", i); + pr_debug("Ptr: %p. Number: %llu\n", (void *)sechdrs[i].sh_addr, + sechdrs[i].sh_size / sizeof(Elf64_Rela)); + + /* + * Sort the relocation information based on a symbol and + * addend key. This is a stable O(n*log n) complexity + * algorithm but it will reduce the complexity of + * count_relocs() to linear complexity O(n) + */ + sort((void *)sechdrs[i].sh_addr, + sechdrs[i].sh_size / sizeof(Elf64_Rela), + sizeof(Elf64_Rela), relacmp, NULL); + + relocs += count_relocs((void *)sechdrs[i].sh_addr, + sechdrs[i].sh_size + / sizeof(Elf64_Rela), + R_PPC64_GOT_PCREL34); + + /* + * Percpu data access typically gets linked with + * REL34 relocations, but the percpu section gets + * moved at load time and requires that to be + * converted to GOT linkage. + */ + if (IS_ENABLED(CONFIG_SMP) && symindex) + relocs += count_pcpu_relocs(sechdrs, + (void *)sechdrs[i].sh_addr, + sechdrs[i].sh_size + / sizeof(Elf64_Rela), + symindex, me->arch.pcpu_section); + } + } + + pr_debug("Looks like a total of %lu GOT entries, max\n", relocs); + return relocs * sizeof(struct ppc64_got_entry); +} +#else /* CONFIG_PPC_KERNEL_PCREL */ + /* Still needed for ELFv2, for .TOC. */ static void dedotify_versions(struct modversion_info *vers, unsigned long size) @@ -334,6 +392,13 @@ static Elf64_Sym *find_dot_toc(Elf64_Shdr *sechdrs, } return NULL; } +#endif /* CONFIG_PPC_KERNEL_PCREL */ + +bool module_init_section(const char *name) +{ + /* We don't handle .init for the moment: always return false. */ + return false; +} int module_frob_arch_sections(Elf64_Ehdr *hdr, Elf64_Shdr *sechdrs, @@ -344,9 +409,17 @@ int module_frob_arch_sections(Elf64_Ehdr *hdr, /* Find .toc and .stubs sections, symtab and strtab */ for (i = 1; i < hdr->e_shnum; i++) { - char *p; if (strcmp(secstrings + sechdrs[i].sh_name, ".stubs") == 0) me->arch.stubs_section = i; +#ifdef CONFIG_PPC_KERNEL_PCREL + else if (strcmp(secstrings + sechdrs[i].sh_name, ".data..percpu") == 0) + me->arch.pcpu_section = i; + else if (strcmp(secstrings + sechdrs[i].sh_name, ".mygot") == 0) { + me->arch.got_section = i; + if (sechdrs[i].sh_addralign < 8) + sechdrs[i].sh_addralign = 8; + } +#else else if (strcmp(secstrings + sechdrs[i].sh_name, ".toc") == 0) { me->arch.toc_section = i; if (sechdrs[i].sh_addralign < 8) @@ -356,15 +429,12 @@ int module_frob_arch_sections(Elf64_Ehdr *hdr, dedotify_versions((void *)hdr + sechdrs[i].sh_offset, sechdrs[i].sh_size); - /* We don't handle .init for the moment: rename to _init */ - while ((p = strstr(secstrings + sechdrs[i].sh_name, ".init"))) - p[0] = '_'; - if (sechdrs[i].sh_type == SHT_SYMTAB) dedotify((void *)hdr + sechdrs[i].sh_offset, sechdrs[i].sh_size / sizeof(Elf64_Sym), (void *)hdr + sechdrs[sechdrs[i].sh_link].sh_offset); +#endif } if (!me->arch.stubs_section) { @@ -372,18 +442,135 @@ int module_frob_arch_sections(Elf64_Ehdr *hdr, return -ENOEXEC; } +#ifdef CONFIG_PPC_KERNEL_PCREL + if (!me->arch.got_section) { + pr_err("%s: doesn't contain .mygot.\n", me->name); + return -ENOEXEC; + } + + /* Override the got size */ + sechdrs[me->arch.got_section].sh_size = get_got_size(hdr, sechdrs, me); +#else /* If we don't have a .toc, just use .stubs. We need to set r2 to some reasonable value in case the module calls out to other functions via a stub, or if a function pointer escapes the module by some means. */ if (!me->arch.toc_section) me->arch.toc_section = me->arch.stubs_section; +#endif /* Override the stubs size */ sechdrs[me->arch.stubs_section].sh_size = get_stubs_size(hdr, sechdrs); + + return 0; +} + +#if defined(CONFIG_MPROFILE_KERNEL) || defined(CONFIG_ARCH_USING_PATCHABLE_FUNCTION_ENTRY) + +static u32 stub_insns[] = { +#ifdef CONFIG_PPC_KERNEL_PCREL + PPC_RAW_LD(_R12, _R13, offsetof(struct paca_struct, kernelbase)), + PPC_RAW_NOP(), /* align the prefix insn */ + /* paddi r12,r12,addr */ + PPC_PREFIX_MLS | __PPC_PRFX_R(0), + PPC_INST_PADDI | ___PPC_RT(_R12) | ___PPC_RA(_R12), + PPC_RAW_MTCTR(_R12), + PPC_RAW_BCTR(), +#else + PPC_RAW_LD(_R12, _R13, offsetof(struct paca_struct, kernel_toc)), + PPC_RAW_ADDIS(_R12, _R12, 0), + PPC_RAW_ADDI(_R12, _R12, 0), + PPC_RAW_MTCTR(_R12), + PPC_RAW_BCTR(), +#endif +}; + +/* + * For mprofile-kernel we use a special stub for ftrace_caller() because we + * can't rely on r2 containing this module's TOC when we enter the stub. + * + * That can happen if the function calling us didn't need to use the toc. In + * that case it won't have setup r2, and the r2 value will be either the + * kernel's toc, or possibly another modules toc. + * + * To deal with that this stub uses the kernel toc, which is always accessible + * via the paca (in r13). The target (ftrace_caller()) is responsible for + * saving and restoring the toc before returning. + */ +static inline int create_ftrace_stub(struct ppc64_stub_entry *entry, + unsigned long addr, + struct module *me) +{ + long reladdr; + + if ((unsigned long)entry->jump % 8 != 0) { + pr_err("%s: Address of stub entry is not 8-byte aligned\n", me->name); + return 0; + } + + BUILD_BUG_ON(sizeof(stub_insns) > sizeof(entry->jump)); + memcpy(entry->jump, stub_insns, sizeof(stub_insns)); + + if (IS_ENABLED(CONFIG_PPC_KERNEL_PCREL)) { + /* Stub uses address relative to kernel base (from the paca) */ + reladdr = addr - local_paca->kernelbase; + if (reladdr > 0x1FFFFFFFFL || reladdr < -0x200000000L) { + pr_err("%s: Address of %ps out of range of 34-bit relative address.\n", + me->name, (void *)addr); + return 0; + } + + entry->jump[2] |= IMM_H18(reladdr); + entry->jump[3] |= IMM_L(reladdr); + } else { + /* Stub uses address relative to kernel toc (from the paca) */ + reladdr = addr - kernel_toc_addr(); + if (reladdr > 0x7FFFFFFF || reladdr < -(0x80000000L)) { + pr_err("%s: Address of %ps out of range of kernel_toc.\n", + me->name, (void *)addr); + return 0; + } + + entry->jump[1] |= PPC_HA(reladdr); + entry->jump[2] |= PPC_LO(reladdr); + } + + /* Even though we don't use funcdata in the stub, it's needed elsewhere. */ + entry->funcdata = func_desc(addr); + entry->magic = STUB_MAGIC; + + return 1; +} + +static bool is_mprofile_ftrace_call(const char *name) +{ + if (!strcmp("_mcount", name)) + return true; +#ifdef CONFIG_DYNAMIC_FTRACE + if (!strcmp("ftrace_caller", name)) + return true; +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS + if (!strcmp("ftrace_regs_caller", name)) + return true; +#endif +#endif + + return false; +} +#else +static inline int create_ftrace_stub(struct ppc64_stub_entry *entry, + unsigned long addr, + struct module *me) +{ return 0; } +static bool is_mprofile_ftrace_call(const char *name) +{ + return false; +} +#endif + /* * r2 is the TOC pointer: it actually points 0x8000 into the TOC (this gives the * value maximum span in an instruction which uses a signed offset). Round down @@ -392,32 +579,85 @@ int module_frob_arch_sections(Elf64_Ehdr *hdr, */ static inline unsigned long my_r2(const Elf64_Shdr *sechdrs, struct module *me) { +#ifndef CONFIG_PPC_KERNEL_PCREL return (sechdrs[me->arch.toc_section].sh_addr & ~0xfful) + 0x8000; +#else + return -1; +#endif } /* Patch stub to reference function and correct r2 value. */ static inline int create_stub(const Elf64_Shdr *sechdrs, struct ppc64_stub_entry *entry, unsigned long addr, - struct module *me) + struct module *me, + const char *name) { long reladdr; + func_desc_t desc; + int i; - memcpy(entry->jump, ppc64_stub_insns, sizeof(ppc64_stub_insns)); + if (is_mprofile_ftrace_call(name)) + return create_ftrace_stub(entry, addr, me); - /* Stub uses address relative to r2. */ - reladdr = (unsigned long)entry - my_r2(sechdrs, me); - if (reladdr > 0x7FFFFFFF || reladdr < -(0x80000000L)) { - pr_err("%s: Address %p of stub out of range of %p.\n", - me->name, (void *)reladdr, (void *)my_r2); + if ((unsigned long)entry->jump % 8 != 0) { + pr_err("%s: Address of stub entry is not 8-byte aligned\n", me->name); return 0; } - pr_debug("Stub %p get data from reladdr %li\n", entry, reladdr); - entry->jump[0] |= PPC_HA(reladdr); - entry->jump[1] |= PPC_LO(reladdr); - entry->funcdata = func_desc(addr); - entry->magic = STUB_MAGIC; + BUILD_BUG_ON(sizeof(ppc64_stub_insns) > sizeof(entry->jump)); + for (i = 0; i < ARRAY_SIZE(ppc64_stub_insns); i++) { + if (patch_instruction(&entry->jump[i], + ppc_inst(ppc64_stub_insns[i]))) + return 0; + } + + if (IS_ENABLED(CONFIG_PPC_KERNEL_PCREL)) { + /* Stub uses address relative to itself! */ + reladdr = 0 + offsetof(struct ppc64_stub_entry, funcdata); + BUILD_BUG_ON(reladdr != 32); + if (reladdr > 0x1FFFFFFFFL || reladdr < -0x200000000L) { + pr_err("%s: Address of %p out of range of 34-bit relative address.\n", + me->name, (void *)reladdr); + return 0; + } + pr_debug("Stub %p get data from reladdr %li\n", entry, reladdr); + + /* May not even need this if we're relative to 0 */ + if (patch_instruction(&entry->jump[0], + ppc_inst_prefix(entry->jump[0] | IMM_H18(reladdr), + entry->jump[1] | IMM_L(reladdr)))) + return 0; + + } else { + /* Stub uses address relative to r2. */ + reladdr = (unsigned long)entry - my_r2(sechdrs, me); + if (reladdr > 0x7FFFFFFF || reladdr < -(0x80000000L)) { + pr_err("%s: Address %p of stub out of range of %p.\n", + me->name, (void *)reladdr, (void *)my_r2); + return 0; + } + pr_debug("Stub %p get data from reladdr %li\n", entry, reladdr); + + if (patch_instruction(&entry->jump[0], + ppc_inst(entry->jump[0] | PPC_HA(reladdr)))) + return 0; + + if (patch_instruction(&entry->jump[1], + ppc_inst(entry->jump[1] | PPC_LO(reladdr)))) + return 0; + } + + // func_desc_t is 8 bytes if ABIv2, else 16 bytes + desc = func_desc(addr); + for (i = 0; i < sizeof(func_desc_t) / sizeof(u32); i++) { + if (patch_instruction(((u32 *)&entry->funcdata) + i, + ppc_inst(((u32 *)(&desc))[i]))) + return 0; + } + + if (patch_instruction(&entry->magic, ppc_inst(STUB_MAGIC))) + return 0; return 1; } @@ -426,7 +666,8 @@ static inline int create_stub(const Elf64_Shdr *sechdrs, stub to set up the TOC ptr (r2) for the function. */ static unsigned long stub_for_addr(const Elf64_Shdr *sechdrs, unsigned long addr, - struct module *me) + struct module *me, + const char *name) { struct ppc64_stub_entry *stubs; unsigned int i, num_stubs; @@ -443,52 +684,40 @@ static unsigned long stub_for_addr(const Elf64_Shdr *sechdrs, return (unsigned long)&stubs[i]; } - if (!create_stub(sechdrs, &stubs[i], addr, me)) + if (!create_stub(sechdrs, &stubs[i], addr, me, name)) return 0; return (unsigned long)&stubs[i]; } -#ifdef CONFIG_MPROFILE_KERNEL -static bool is_mprofile_mcount_callsite(const char *name, u32 *instruction) +#ifdef CONFIG_PPC_KERNEL_PCREL +/* Create GOT to load the location described in this ptr */ +static unsigned long got_for_addr(const Elf64_Shdr *sechdrs, + unsigned long addr, + struct module *me, + const char *name) { - if (strcmp("_mcount", name)) - return false; + struct ppc64_got_entry *got; + unsigned int i, num_got; - /* - * Check if this is one of the -mprofile-kernel sequences. - */ - if (instruction[-1] == PPC_INST_STD_LR && - instruction[-2] == PPC_INST_MFLR) - return true; + if (!IS_ENABLED(CONFIG_PPC_KERNEL_PCREL)) + return addr; - if (instruction[-1] == PPC_INST_MFLR) - return true; + num_got = sechdrs[me->arch.got_section].sh_size / sizeof(*got); - return false; -} + /* Find this stub, or if that fails, the next avail. entry */ + got = (void *)sechdrs[me->arch.got_section].sh_addr; + for (i = 0; got[i].addr; i++) { + if (WARN_ON(i >= num_got)) + return 0; -/* - * In case of _mcount calls, do not save the current callee's TOC (in r2) into - * the original caller's stack frame. If we did we would clobber the saved TOC - * value of the original caller. - */ -static void squash_toc_save_inst(const char *name, unsigned long addr) -{ - struct ppc64_stub_entry *stub = (struct ppc64_stub_entry *)addr; + if (got[i].addr == addr) + return (unsigned long)&got[i]; + } - /* Only for calls to _mcount */ - if (strcmp("_mcount", name) != 0) - return; + got[i].addr = addr; - stub->jump[2] = PPC_INST_NOP; -} -#else -static void squash_toc_save_inst(const char *name, unsigned long addr) { } - -static bool is_mprofile_mcount_callsite(const char *name, u32 *instruction) -{ - return false; + return (unsigned long)&got[i]; } #endif @@ -497,26 +726,39 @@ static bool is_mprofile_mcount_callsite(const char *name, u32 *instruction) static int restore_r2(const char *name, u32 *instruction, struct module *me) { u32 *prev_insn = instruction - 1; + u32 insn_val = *instruction; - if (is_mprofile_mcount_callsite(name, prev_insn)) - return 1; + if (IS_ENABLED(CONFIG_PPC_KERNEL_PCREL)) + return 0; + + if (is_mprofile_ftrace_call(name)) + return 0; /* * Make sure the branch isn't a sibling call. Sibling calls aren't * "link" branches and they don't return, so they don't need the r2 * restore afterwards. */ - if (!instr_is_relative_link_branch(*prev_insn)) - return 1; + if (!instr_is_relative_link_branch(ppc_inst(*prev_insn))) + return 0; - if (*instruction != PPC_INST_NOP) { - pr_err("%s: Expected nop after call, got %08x at %pS\n", - me->name, *instruction, instruction); + /* + * For livepatch, the restore r2 instruction might have already been + * written previously, if the referenced symbol is in a previously + * unloaded module which is now being loaded again. In that case, skip + * the warning and the instruction write. + */ + if (insn_val == PPC_INST_LD_TOC) return 0; + + if (insn_val != PPC_RAW_NOP()) { + pr_err("%s: Expected nop after call, got %08x at %pS\n", + me->name, insn_val, instruction); + return -ENOEXEC; } + /* ld r2,R2_STACK_OFFSET(r1) */ - *instruction = PPC_INST_LD_TOC; - return 1; + return patch_instruction(instruction, ppc_inst(PPC_INST_LD_TOC)); } int apply_relocate_add(Elf64_Shdr *sechdrs, @@ -534,6 +776,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, pr_debug("Applying ADD relocate section %u to %u\n", relsec, sechdrs[relsec].sh_info); +#ifndef CONFIG_PPC_KERNEL_PCREL /* First time we're called, we can fix up .TOC. */ if (!me->arch.toc_fixed) { sym = find_dot_toc(sechdrs, strtab, symindex); @@ -543,7 +786,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, sym->st_value = my_r2(sechdrs, me); me->arch.toc_fixed = true; } - +#endif for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rela); i++) { /* This is where to make the change */ location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr @@ -571,6 +814,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, *(unsigned long *)location = value; break; +#ifndef CONFIG_PPC_KERNEL_PCREL case R_PPC64_TOC: *(unsigned long *)location = my_r2(sechdrs, me); break; @@ -630,20 +874,24 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, = (*((uint16_t *) location) & ~0xffff) | (value & 0xffff); break; +#endif case R_PPC_REL24: +#ifdef CONFIG_PPC_KERNEL_PCREL + /* PCREL still generates REL24 for mcount */ + case R_PPC64_REL24_NOTOC: +#endif /* FIXME: Handle weak symbols here --RR */ if (sym->st_shndx == SHN_UNDEF || sym->st_shndx == SHN_LIVEPATCH) { /* External: go via stub */ - value = stub_for_addr(sechdrs, value, me); + value = stub_for_addr(sechdrs, value, me, + strtab + sym->st_name); if (!value) return -ENOENT; - if (!restore_r2(strtab + sym->st_name, - (u32 *)location + 1, me)) + if (restore_r2(strtab + sym->st_name, + (u32 *)location + 1, me)) return -ENOEXEC; - - squash_toc_save_inst(strtab + sym->st_name, value); } else value += local_entry_offset(sym); @@ -656,9 +904,11 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, } /* Only replace bits 2 through 26 */ - *(uint32_t *)location - = (*(uint32_t *)location & ~0x03fffffc) - | (value & 0x03fffffc); + value = (*(uint32_t *)location & ~PPC_LI_MASK) | PPC_LI(value); + + if (patch_instruction((u32 *)location, ppc_inst(value))) + return -EFAULT; + break; case R_PPC64_REL64: @@ -678,6 +928,47 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, *(u32 *)location = value; break; +#ifdef CONFIG_PPC_KERNEL_PCREL + case R_PPC64_PCREL34: { + unsigned long absvalue = value; + + /* Convert value to relative */ + value -= (unsigned long)location; + + if (value + 0x200000000 > 0x3ffffffff) { + if (sym->st_shndx != me->arch.pcpu_section) { + pr_err("%s: REL34 %li out of range!\n", + me->name, (long)value); + return -ENOEXEC; + } + + /* + * per-cpu section is special cased because + * it is moved during loading, so has to be + * converted to use GOT. + */ + value = got_for_addr(sechdrs, absvalue, me, + strtab + sym->st_name); + if (!value) + return -ENOENT; + value -= (unsigned long)location; + + /* Turn pla into pld */ + if (patch_instruction((u32 *)location, + ppc_inst_prefix((*(u32 *)location & ~0x02000000), + (*((u32 *)location + 1) & ~0xf8000000) | 0xe4000000))) + return -EFAULT; + } + + if (patch_instruction((u32 *)location, + ppc_inst_prefix((*(u32 *)location & ~0x3ffff) | IMM_H18(value), + (*((u32 *)location + 1) & ~0xffff) | IMM_L(value)))) + return -EFAULT; + + break; + } + +#else case R_PPC64_TOCSAVE: /* * Marker reloc indicates we don't have to save r2. @@ -685,8 +976,12 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, * it. */ break; +#endif case R_PPC64_ENTRY: + if (IS_ENABLED(CONFIG_PPC_KERNEL_PCREL)) + break; + /* * Optimize ELFv2 large code model entry point if * the TOC is within 2GB range of current location. @@ -699,21 +994,17 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, * ld r2, ...(r12) * add r2, r2, r12 */ - if ((((uint32_t *)location)[0] & ~0xfffc) != - (PPC_INST_LD | __PPC_RT(R2) | __PPC_RA(R12))) + if ((((uint32_t *)location)[0] & ~0xfffc) != PPC_RAW_LD(_R2, _R12, 0)) break; - if (((uint32_t *)location)[1] != - (PPC_INST_ADD | __PPC_RT(R2) | __PPC_RA(R2) | __PPC_RB(R12))) + if (((uint32_t *)location)[1] != PPC_RAW_ADD(_R2, _R2, _R12)) break; /* * If found, replace it with: * addis r2, r12, (.TOC.-func)@ha * addi r2, r2, (.TOC.-func)@l */ - ((uint32_t *)location)[0] = PPC_INST_ADDIS | __PPC_RT(R2) | - __PPC_RA(R12) | PPC_HA(value); - ((uint32_t *)location)[1] = PPC_INST_ADDI | __PPC_RT(R2) | - __PPC_RA(R2) | PPC_LO(value); + ((uint32_t *)location)[0] = PPC_RAW_ADDIS(_R2, _R12, PPC_HA(value)); + ((uint32_t *)location)[1] = PPC_RAW_ADDI(_R2, _R2, PPC_LO(value)); break; case R_PPC64_REL16_HA: @@ -733,6 +1024,20 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, | (value & 0xffff); break; +#ifdef CONFIG_PPC_KERNEL_PCREL + case R_PPC64_GOT_PCREL34: + value = got_for_addr(sechdrs, value, me, + strtab + sym->st_name); + if (!value) + return -ENOENT; + value -= (unsigned long)location; + ((uint32_t *)location)[0] = (((uint32_t *)location)[0] & ~0x3ffff) | + ((value >> 16) & 0x3ffff); + ((uint32_t *)location)[1] = (((uint32_t *)location)[1] & ~0xffff) | + (value & 0xffff); + break; +#endif + default: pr_err("%s: Unknown ADD relocation: %lu\n", me->name, @@ -745,89 +1050,53 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, } #ifdef CONFIG_DYNAMIC_FTRACE - -#ifdef CONFIG_MPROFILE_KERNEL - -#define PACATOC offsetof(struct paca_struct, kernel_toc) - -/* - * For mprofile-kernel we use a special stub for ftrace_caller() because we - * can't rely on r2 containing this module's TOC when we enter the stub. - * - * That can happen if the function calling us didn't need to use the toc. In - * that case it won't have setup r2, and the r2 value will be either the - * kernel's toc, or possibly another modules toc. - * - * To deal with that this stub uses the kernel toc, which is always accessible - * via the paca (in r13). The target (ftrace_caller()) is responsible for - * saving and restoring the toc before returning. - */ -static unsigned long create_ftrace_stub(const Elf64_Shdr *sechdrs, - struct module *me, unsigned long addr) +int module_trampoline_target(struct module *mod, unsigned long addr, + unsigned long *target) { - struct ppc64_stub_entry *entry; - unsigned int i, num_stubs; - /* - * ld r12,PACATOC(r13) - * addis r12,r12,<high> - * addi r12,r12,<low> - * mtctr r12 - * bctr - */ - static u32 stub_insns[] = { - PPC_INST_LD | __PPC_RT(R12) | __PPC_RA(R13) | PACATOC, - PPC_INST_ADDIS | __PPC_RT(R12) | __PPC_RA(R12), - PPC_INST_ADDI | __PPC_RT(R12) | __PPC_RA(R12), - PPC_INST_MTCTR | __PPC_RS(R12), - PPC_INST_BCTR, - }; - long reladdr; + struct ppc64_stub_entry *stub; + func_desc_t funcdata; + u32 magic; - num_stubs = sechdrs[me->arch.stubs_section].sh_size / sizeof(*entry); + if (!within_module_core(addr, mod)) { + pr_err("%s: stub %lx not in module %s\n", __func__, addr, mod->name); + return -EFAULT; + } - /* Find the next available stub entry */ - entry = (void *)sechdrs[me->arch.stubs_section].sh_addr; - for (i = 0; i < num_stubs && stub_func_addr(entry->funcdata); i++, entry++); + stub = (struct ppc64_stub_entry *)addr; - if (i >= num_stubs) { - pr_err("%s: Unable to find a free slot for ftrace stub.\n", me->name); - return 0; + if (copy_from_kernel_nofault(&magic, &stub->magic, + sizeof(magic))) { + pr_err("%s: fault reading magic for stub %lx for %s\n", __func__, addr, mod->name); + return -EFAULT; } - memcpy(entry->jump, stub_insns, sizeof(stub_insns)); - - /* Stub uses address relative to kernel toc (from the paca) */ - reladdr = addr - kernel_toc_addr(); - if (reladdr > 0x7FFFFFFF || reladdr < -(0x80000000L)) { - pr_err("%s: Address of %ps out of range of kernel_toc.\n", - me->name, (void *)addr); - return 0; + if (magic != STUB_MAGIC) { + pr_err("%s: bad magic for stub %lx for %s\n", __func__, addr, mod->name); + return -EFAULT; } - entry->jump[1] |= PPC_HA(reladdr); - entry->jump[2] |= PPC_LO(reladdr); + if (copy_from_kernel_nofault(&funcdata, &stub->funcdata, + sizeof(funcdata))) { + pr_err("%s: fault reading funcdata for stub %lx for %s\n", __func__, addr, mod->name); + return -EFAULT; + } - /* Eventhough we don't use funcdata in the stub, it's needed elsewhere. */ - entry->funcdata = func_desc(addr); - entry->magic = STUB_MAGIC; + *target = stub_func_addr(funcdata); - return (unsigned long)entry; -} -#else -static unsigned long create_ftrace_stub(const Elf64_Shdr *sechdrs, - struct module *me, unsigned long addr) -{ - return stub_for_addr(sechdrs, addr, me); + return 0; } -#endif int module_finalize_ftrace(struct module *mod, const Elf_Shdr *sechdrs) { - mod->arch.tramp = create_ftrace_stub(sechdrs, mod, - (unsigned long)ftrace_caller); + mod->arch.tramp = stub_for_addr(sechdrs, + (unsigned long)ftrace_caller, + mod, + "ftrace_caller"); #ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS - mod->arch.tramp_regs = create_ftrace_stub(sechdrs, mod, - (unsigned long)ftrace_regs_caller); + mod->arch.tramp_regs = stub_for_addr(sechdrs, + (unsigned long)ftrace_regs_caller, + mod, + "ftrace_regs_caller"); if (!mod->arch.tramp_regs) return -ENOENT; #endif diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c index fb4f61096613..e385d3164648 100644 --- a/arch/powerpc/kernel/nvram_64.c +++ b/arch/powerpc/kernel/nvram_64.c @@ -19,9 +19,9 @@ #include <linux/pstore.h> #include <linux/zlib.h> #include <linux/uaccess.h> +#include <linux/of.h> #include <asm/nvram.h> #include <asm/rtas.h> -#include <asm/prom.h> #include <asm/machdep.h> #undef DEBUG_NVRAM @@ -540,7 +540,7 @@ static struct pstore_info nvram_pstore_info = { .write = nvram_pstore_write, }; -static int nvram_pstore_init(void) +static int __init nvram_pstore_init(void) { int rc = 0; @@ -562,7 +562,7 @@ static int nvram_pstore_init(void) return rc; } #else -static int nvram_pstore_init(void) +static int __init nvram_pstore_init(void) { return -1; } @@ -647,6 +647,7 @@ static void oops_to_nvram(struct kmsg_dumper *dumper, { struct oops_log_info *oops_hdr = (struct oops_log_info *)oops_buf; static unsigned int oops_count = 0; + static struct kmsg_dump_iter iter; static bool panicking = false; static DEFINE_SPINLOCK(lock); unsigned long flags; @@ -655,9 +656,7 @@ static void oops_to_nvram(struct kmsg_dumper *dumper, int rc = -1; switch (reason) { - case KMSG_DUMP_RESTART: - case KMSG_DUMP_HALT: - case KMSG_DUMP_POWEROFF: + case KMSG_DUMP_SHUTDOWN: /* These are almost always orderly shutdowns. */ return; case KMSG_DUMP_OOPS: @@ -683,13 +682,14 @@ static void oops_to_nvram(struct kmsg_dumper *dumper, return; if (big_oops_buf) { - kmsg_dump_get_buffer(dumper, false, + kmsg_dump_rewind(&iter); + kmsg_dump_get_buffer(&iter, false, big_oops_buf, big_oops_buf_sz, &text_len); rc = zip_oops(text_len); } if (rc != 0) { - kmsg_dump_rewind(dumper); - kmsg_dump_get_buffer(dumper, false, + kmsg_dump_rewind(&iter); + kmsg_dump_get_buffer(&iter, false, oops_data, oops_data_sz, &text_len); err_type = ERR_TYPE_KERNEL_PANIC; oops_hdr->version = cpu_to_be16(OOPS_HDR_VERSION); @@ -755,7 +755,7 @@ static unsigned char __init nvram_checksum(struct nvram_header *p) * Per the criteria passed via nvram_remove_partition(), should this * partition be removed? 1=remove, 0=keep */ -static int nvram_can_remove_partition(struct nvram_partition *part, +static int __init nvram_can_remove_partition(struct nvram_partition *part, const char *name, int sig, const char *exceptions[]) { if (part->header.signature != sig) @@ -854,8 +854,8 @@ loff_t __init nvram_create_partition(const char *name, int sig, BUILD_BUG_ON(NVRAM_BLOCK_LEN != 16); /* Convert sizes from bytes to blocks */ - req_size = _ALIGN_UP(req_size, NVRAM_BLOCK_LEN) / NVRAM_BLOCK_LEN; - min_size = _ALIGN_UP(min_size, NVRAM_BLOCK_LEN) / NVRAM_BLOCK_LEN; + req_size = ALIGN(req_size, NVRAM_BLOCK_LEN) / NVRAM_BLOCK_LEN; + min_size = ALIGN(min_size, NVRAM_BLOCK_LEN) / NVRAM_BLOCK_LEN; /* If no minimum size specified, make it the same as the * requested size diff --git a/arch/powerpc/kernel/of_platform.c b/arch/powerpc/kernel/of_platform.c index 427fc22f72b6..adc76fa58d1e 100644 --- a/arch/powerpc/kernel/of_platform.c +++ b/arch/powerpc/kernel/of_platform.c @@ -13,9 +13,7 @@ #include <linux/export.h> #include <linux/mod_devicetable.h> #include <linux/pci.h> -#include <linux/of.h> -#include <linux/of_device.h> -#include <linux/of_platform.h> +#include <linux/platform_device.h> #include <linux/atomic.h> #include <asm/errno.h> @@ -62,12 +60,8 @@ static int of_pci_phb_probe(struct platform_device *dev) /* Init pci_dn data structures */ pci_devs_phb_init_dynamic(phb); - /* Create EEH devices for the PHB */ - eeh_dev_phb_init_dynamic(phb); - - /* Register devices with EEH */ - if (dev->dev.of_node->child) - eeh_add_device_tree_early(PCI_DN(dev->dev.of_node)); + /* Create EEH PE for the PHB */ + eeh_phb_pe_create(phb); /* Scan the bus */ pcibios_scan_phb(phb); @@ -80,15 +74,9 @@ static int of_pci_phb_probe(struct platform_device *dev) */ pcibios_claim_one_bus(phb->bus); - /* Finish EEH setup */ - eeh_add_device_tree_late(phb->bus); - /* Add probed PCI devices to the device model */ pci_bus_add_devices(phb->bus); - /* sysfs files should only be added after devices are added */ - eeh_add_sysfs_files(phb->bus); - return 0; } diff --git a/arch/powerpc/kernel/optprobes.c b/arch/powerpc/kernel/optprobes.c index 024f7aad1952..004fae2044a3 100644 --- a/arch/powerpc/kernel/optprobes.c +++ b/arch/powerpc/kernel/optprobes.c @@ -16,25 +16,18 @@ #include <asm/code-patching.h> #include <asm/sstep.h> #include <asm/ppc-opcode.h> +#include <asm/inst.h> -#define TMPL_CALL_HDLR_IDX \ - (optprobe_template_call_handler - optprobe_template_entry) -#define TMPL_EMULATE_IDX \ - (optprobe_template_call_emulate - optprobe_template_entry) -#define TMPL_RET_IDX \ - (optprobe_template_ret - optprobe_template_entry) -#define TMPL_OP_IDX \ - (optprobe_template_op_address - optprobe_template_entry) -#define TMPL_INSN_IDX \ - (optprobe_template_insn - optprobe_template_entry) -#define TMPL_END_IDX \ - (optprobe_template_end - optprobe_template_entry) - -DEFINE_INSN_CACHE_OPS(ppc_optinsn); +#define TMPL_CALL_HDLR_IDX (optprobe_template_call_handler - optprobe_template_entry) +#define TMPL_EMULATE_IDX (optprobe_template_call_emulate - optprobe_template_entry) +#define TMPL_RET_IDX (optprobe_template_ret - optprobe_template_entry) +#define TMPL_OP_IDX (optprobe_template_op_address - optprobe_template_entry) +#define TMPL_INSN_IDX (optprobe_template_insn - optprobe_template_entry) +#define TMPL_END_IDX (optprobe_template_end - optprobe_template_entry) static bool insn_page_in_use; -static void *__ppc_alloc_insn_page(void) +void *alloc_optinsn_page(void) { if (insn_page_in_use) return NULL; @@ -42,20 +35,11 @@ static void *__ppc_alloc_insn_page(void) return &optinsn_slot; } -static void __ppc_free_insn_page(void *page __maybe_unused) +void free_optinsn_page(void *page) { insn_page_in_use = false; } -struct kprobe_insn_cache kprobe_ppc_optinsn_slots = { - .mutex = __MUTEX_INITIALIZER(kprobe_ppc_optinsn_slots.mutex), - .pages = LIST_HEAD_INIT(kprobe_ppc_optinsn_slots.pages), - /* insn_size initialized later */ - .alloc = __ppc_alloc_insn_page, - .free = __ppc_free_insn_page, - .nr_garbage = 0, -}; - /* * Check if we can optimize this probe. Returns NIP post-emulation if this can * be optimized and 0 otherwise. @@ -65,14 +49,15 @@ static unsigned long can_optimize(struct kprobe *p) struct pt_regs regs; struct instruction_op op; unsigned long nip = 0; + unsigned long addr = (unsigned long)p->addr; /* * kprobe placed for kretprobe during boot time * has a 'nop' instruction, which can be emulated. * So further checks can be skipped. */ - if (p->addr == (kprobe_opcode_t *)&kretprobe_trampoline) - return (unsigned long)p->addr + sizeof(kprobe_opcode_t); + if (p->addr == (kprobe_opcode_t *)&__kretprobe_trampoline) + return addr + sizeof(kprobe_opcode_t); /* * We only support optimizing kernel addresses, but not @@ -80,11 +65,11 @@ static unsigned long can_optimize(struct kprobe *p) * * FIXME: Optimize kprobes placed in module addresses. */ - if (!is_kernel_addr((unsigned long)p->addr)) + if (!is_kernel_addr(addr)) return 0; memset(®s, 0, sizeof(struct pt_regs)); - regs.nip = (unsigned long)p->addr; + regs.nip = addr; regs.trap = 0x0; regs.msr = MSR_KERNEL; @@ -99,8 +84,8 @@ static unsigned long can_optimize(struct kprobe *p) * Ensure that the instruction is not a conditional branch, * and that can be emulated. */ - if (!is_conditional_branch(*p->ainsn.insn) && - analyse_instr(&op, ®s, *p->ainsn.insn) == 1) { + if (!is_conditional_branch(ppc_inst_read(p->ainsn.insn)) && + analyse_instr(&op, ®s, ppc_inst_read(p->ainsn.insn)) == 1) { emulate_update_regs(®s, &op); nip = regs.nip; } @@ -121,88 +106,66 @@ static void optimized_callback(struct optimized_kprobe *op, kprobes_inc_nmissed_count(&op->kp); } else { __this_cpu_write(current_kprobe, &op->kp); - regs->nip = (unsigned long)op->kp.addr; + regs_set_return_ip(regs, (unsigned long)op->kp.addr); get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE; opt_pre_handler(&op->kp, regs); __this_cpu_write(current_kprobe, NULL); } - preempt_enable_no_resched(); + preempt_enable(); } NOKPROBE_SYMBOL(optimized_callback); void arch_remove_optimized_kprobe(struct optimized_kprobe *op) { if (op->optinsn.insn) { - free_ppc_optinsn_slot(op->optinsn.insn, 1); + free_optinsn_slot(op->optinsn.insn, 1); op->optinsn.insn = NULL; } } -/* - * emulate_step() requires insn to be emulated as - * second parameter. Load register 'r4' with the - * instruction. - */ -void patch_imm32_load_insns(unsigned int val, kprobe_opcode_t *addr) +static void patch_imm32_load_insns(unsigned long val, int reg, kprobe_opcode_t *addr) { - /* addis r4,0,(insn)@h */ - patch_instruction(addr, PPC_INST_ADDIS | ___PPC_RT(4) | - ((val >> 16) & 0xffff)); - addr++; - - /* ori r4,r4,(insn)@l */ - patch_instruction(addr, PPC_INST_ORI | ___PPC_RA(4) | - ___PPC_RS(4) | (val & 0xffff)); + patch_instruction(addr++, ppc_inst(PPC_RAW_LIS(reg, PPC_HI(val)))); + patch_instruction(addr, ppc_inst(PPC_RAW_ORI(reg, reg, PPC_LO(val)))); } /* * Generate instructions to load provided immediate 64-bit value - * to register 'r3' and patch these instructions at 'addr'. + * to register 'reg' and patch these instructions at 'addr'. */ -void patch_imm64_load_insns(unsigned long val, kprobe_opcode_t *addr) +static void patch_imm64_load_insns(unsigned long long val, int reg, kprobe_opcode_t *addr) { - /* lis r3,(op)@highest */ - patch_instruction(addr, PPC_INST_ADDIS | ___PPC_RT(3) | - ((val >> 48) & 0xffff)); - addr++; - - /* ori r3,r3,(op)@higher */ - patch_instruction(addr, PPC_INST_ORI | ___PPC_RA(3) | - ___PPC_RS(3) | ((val >> 32) & 0xffff)); - addr++; - - /* rldicr r3,r3,32,31 */ - patch_instruction(addr, PPC_INST_RLDICR | ___PPC_RA(3) | - ___PPC_RS(3) | __PPC_SH64(32) | __PPC_ME64(31)); - addr++; - - /* oris r3,r3,(op)@h */ - patch_instruction(addr, PPC_INST_ORIS | ___PPC_RA(3) | - ___PPC_RS(3) | ((val >> 16) & 0xffff)); - addr++; - - /* ori r3,r3,(op)@l */ - patch_instruction(addr, PPC_INST_ORI | ___PPC_RA(3) | - ___PPC_RS(3) | (val & 0xffff)); + patch_instruction(addr++, ppc_inst(PPC_RAW_LIS(reg, PPC_HIGHEST(val)))); + patch_instruction(addr++, ppc_inst(PPC_RAW_ORI(reg, reg, PPC_HIGHER(val)))); + patch_instruction(addr++, ppc_inst(PPC_RAW_SLDI(reg, reg, 32))); + patch_instruction(addr++, ppc_inst(PPC_RAW_ORIS(reg, reg, PPC_HI(val)))); + patch_instruction(addr, ppc_inst(PPC_RAW_ORI(reg, reg, PPC_LO(val)))); +} + +static void patch_imm_load_insns(unsigned long val, int reg, kprobe_opcode_t *addr) +{ + if (IS_ENABLED(CONFIG_PPC64)) + patch_imm64_load_insns(val, reg, addr); + else + patch_imm32_load_insns(val, reg, addr); } int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p) { - kprobe_opcode_t *buff, branch_op_callback, branch_emulate_step; - kprobe_opcode_t *op_callback_addr, *emulate_step_addr; + ppc_inst_t branch_op_callback, branch_emulate_step, temp; + unsigned long op_callback_addr, emulate_step_addr; + kprobe_opcode_t *buff; long b_offset; unsigned long nip, size; int rc, i; - kprobe_ppc_optinsn_slots.insn_size = MAX_OPTINSN_SIZE; - nip = can_optimize(p); if (!nip) return -EILSEQ; /* Allocate instruction slot for detour buffer */ - buff = get_ppc_optinsn_slot(); + buff = get_optinsn_slot(); if (!buff) return -ENOMEM; @@ -220,8 +183,7 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p) goto error; /* Check if the return address is also within 32MB range */ - b_offset = (unsigned long)(buff + TMPL_RET_IDX) - - (unsigned long)nip; + b_offset = (unsigned long)(buff + TMPL_RET_IDX) - nip; if (!is_offset_in_branch_range(b_offset)) goto error; @@ -230,7 +192,7 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p) size = (TMPL_END_IDX * sizeof(kprobe_opcode_t)) / sizeof(int); pr_devel("Copying template to %p, size %lu\n", buff, size); for (i = 0; i < size; i++) { - rc = patch_instruction(buff + i, *(optprobe_template_entry + i)); + rc = patch_instruction(buff + i, ppc_inst(*(optprobe_template_entry + i))); if (rc < 0) goto error; } @@ -239,27 +201,25 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p) * Fixup the template with instructions to: * 1. load the address of the actual probepoint */ - patch_imm64_load_insns((unsigned long)op, buff + TMPL_OP_IDX); + patch_imm_load_insns((unsigned long)op, 3, buff + TMPL_OP_IDX); /* * 2. branch to optimized_callback() and emulate_step() */ - op_callback_addr = (kprobe_opcode_t *)ppc_kallsyms_lookup_name("optimized_callback"); - emulate_step_addr = (kprobe_opcode_t *)ppc_kallsyms_lookup_name("emulate_step"); + op_callback_addr = ppc_kallsyms_lookup_name("optimized_callback"); + emulate_step_addr = ppc_kallsyms_lookup_name("emulate_step"); if (!op_callback_addr || !emulate_step_addr) { WARN(1, "Unable to lookup optimized_callback()/emulate_step()\n"); goto error; } - branch_op_callback = create_branch((unsigned int *)buff + TMPL_CALL_HDLR_IDX, - (unsigned long)op_callback_addr, - BRANCH_SET_LINK); + rc = create_branch(&branch_op_callback, buff + TMPL_CALL_HDLR_IDX, + op_callback_addr, BRANCH_SET_LINK); - branch_emulate_step = create_branch((unsigned int *)buff + TMPL_EMULATE_IDX, - (unsigned long)emulate_step_addr, - BRANCH_SET_LINK); + rc |= create_branch(&branch_emulate_step, buff + TMPL_EMULATE_IDX, + emulate_step_addr, BRANCH_SET_LINK); - if (!branch_op_callback || !branch_emulate_step) + if (rc) goto error; patch_instruction(buff + TMPL_CALL_HDLR_IDX, branch_op_callback); @@ -268,22 +228,22 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p) /* * 3. load instruction to be emulated into relevant register, and */ - patch_imm32_load_insns(*p->ainsn.insn, buff + TMPL_INSN_IDX); + temp = ppc_inst_read(p->ainsn.insn); + patch_imm_load_insns(ppc_inst_as_ulong(temp), 4, buff + TMPL_INSN_IDX); /* * 4. branch back from trampoline */ - patch_branch(buff + TMPL_RET_IDX, (unsigned long)nip, 0); + patch_branch(buff + TMPL_RET_IDX, nip, 0); - flush_icache_range((unsigned long)buff, - (unsigned long)(&buff[TMPL_END_IDX])); + flush_icache_range((unsigned long)buff, (unsigned long)(&buff[TMPL_END_IDX])); op->optinsn.insn = buff; return 0; error: - free_ppc_optinsn_slot(buff, 0); + free_optinsn_slot(buff, 0); return -ERANGE; } @@ -305,6 +265,7 @@ int arch_check_optimized_kprobe(struct optimized_kprobe *op) void arch_optimize_kprobes(struct list_head *oplist) { + ppc_inst_t instr; struct optimized_kprobe *op; struct optimized_kprobe *tmp; @@ -313,11 +274,9 @@ void arch_optimize_kprobes(struct list_head *oplist) * Backup instructions which will be replaced * by jump address */ - memcpy(op->optinsn.copied_insn, op->kp.addr, - RELATIVEJUMP_SIZE); - patch_instruction(op->kp.addr, - create_branch((unsigned int *)op->kp.addr, - (unsigned long)op->optinsn.insn, 0)); + memcpy(op->optinsn.copied_insn, op->kp.addr, RELATIVEJUMP_SIZE); + create_branch(&instr, op->kp.addr, (unsigned long)op->optinsn.insn, 0); + patch_instruction(op->kp.addr, instr); list_del_init(&op->list); } } @@ -327,8 +286,7 @@ void arch_unoptimize_kprobe(struct optimized_kprobe *op) arch_arm_kprobe(&op->kp); } -void arch_unoptimize_kprobes(struct list_head *oplist, - struct list_head *done_list) +void arch_unoptimize_kprobes(struct list_head *oplist, struct list_head *done_list) { struct optimized_kprobe *op; struct optimized_kprobe *tmp; @@ -339,9 +297,8 @@ void arch_unoptimize_kprobes(struct list_head *oplist, } } -int arch_within_optimized_kprobe(struct optimized_kprobe *op, - unsigned long addr) +int arch_within_optimized_kprobe(struct optimized_kprobe *op, kprobe_opcode_t *addr) { - return ((unsigned long)op->kp.addr <= addr && - (unsigned long)op->kp.addr + RELATIVEJUMP_SIZE > addr); + return (op->kp.addr <= addr && + op->kp.addr + (RELATIVEJUMP_SIZE / sizeof(kprobe_opcode_t)) > addr); } diff --git a/arch/powerpc/kernel/optprobes_head.S b/arch/powerpc/kernel/optprobes_head.S index cf383520843f..35932f45fb4e 100644 --- a/arch/powerpc/kernel/optprobes_head.S +++ b/arch/powerpc/kernel/optprobes_head.S @@ -9,6 +9,16 @@ #include <asm/ptrace.h> #include <asm/asm-offsets.h> +#ifdef CONFIG_PPC64 +#define SAVE_30GPRS(base) SAVE_GPRS(2, 31, base) +#define REST_30GPRS(base) REST_GPRS(2, 31, base) +#define TEMPLATE_FOR_IMM_LOAD_INSNS nop; nop; nop; nop; nop +#else +#define SAVE_30GPRS(base) stmw r2, GPR2(base) +#define REST_30GPRS(base) lmw r2, GPR2(base) +#define TEMPLATE_FOR_IMM_LOAD_INSNS nop; nop; nop +#endif + #define OPT_SLOT_SIZE 65536 .balign 4 @@ -30,39 +40,41 @@ optinsn_slot: .global optprobe_template_entry optprobe_template_entry: /* Create an in-memory pt_regs */ - stdu r1,-INT_FRAME_SIZE(r1) + PPC_STLU r1,-INT_FRAME_SIZE(r1) SAVE_GPR(0,r1) /* Save the previous SP into stack */ addi r0,r1,INT_FRAME_SIZE - std r0,GPR1(r1) - SAVE_10GPRS(2,r1) - SAVE_10GPRS(12,r1) - SAVE_10GPRS(22,r1) + PPC_STL r0,GPR1(r1) + SAVE_30GPRS(r1) /* Save SPRS */ mfmsr r5 - std r5,_MSR(r1) + PPC_STL r5,_MSR(r1) li r5,0x700 - std r5,_TRAP(r1) + PPC_STL r5,_TRAP(r1) li r5,0 - std r5,ORIG_GPR3(r1) - std r5,RESULT(r1) + PPC_STL r5,ORIG_GPR3(r1) + PPC_STL r5,RESULT(r1) mfctr r5 - std r5,_CTR(r1) + PPC_STL r5,_CTR(r1) mflr r5 - std r5,_LINK(r1) + PPC_STL r5,_LINK(r1) mfspr r5,SPRN_XER - std r5,_XER(r1) + PPC_STL r5,_XER(r1) mfcr r5 - std r5,_CCR(r1) + PPC_STL r5,_CCR(r1) +#ifdef CONFIG_PPC64 lbz r5,PACAIRQSOFTMASK(r13) std r5,SOFTE(r1) +#endif /* * We may get here from a module, so load the kernel TOC in r2. * The original TOC gets restored when pt_regs is restored * further below. */ - ld r2,PACATOC(r13) +#ifdef CONFIG_PPC64 + LOAD_PACA_TOC() +#endif .global optprobe_template_op_address optprobe_template_op_address: @@ -70,13 +82,10 @@ optprobe_template_op_address: * Parameters to optimized_callback(): * 1. optimized_kprobe structure in r3 */ - nop - nop - nop - nop - nop + TEMPLATE_FOR_IMM_LOAD_INSNS + /* 2. pt_regs pointer in r4 */ - addi r4,r1,STACK_FRAME_OVERHEAD + addi r4,r1,STACK_INT_FRAME_REGS .global optprobe_template_call_handler optprobe_template_call_handler: @@ -87,13 +96,12 @@ optprobe_template_call_handler: * Parameters for instruction emulation: * 1. Pass SP in register r3. */ - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS .global optprobe_template_insn optprobe_template_insn: /* 2, Pass instruction to be emulated in r4 */ - nop - nop + TEMPLATE_FOR_IMM_LOAD_INSNS .global optprobe_template_call_emulate optprobe_template_call_emulate: @@ -104,20 +112,18 @@ optprobe_template_call_emulate: * All done. * Now, restore the registers... */ - ld r5,_MSR(r1) + PPC_LL r5,_MSR(r1) mtmsr r5 - ld r5,_CTR(r1) + PPC_LL r5,_CTR(r1) mtctr r5 - ld r5,_LINK(r1) + PPC_LL r5,_LINK(r1) mtlr r5 - ld r5,_XER(r1) + PPC_LL r5,_XER(r1) mtxer r5 - ld r5,_CCR(r1) + PPC_LL r5,_CCR(r1) mtcr r5 REST_GPR(0,r1) - REST_10GPRS(2,r1) - REST_10GPRS(12,r1) - REST_10GPRS(22,r1) + REST_30GPRS(r1) /* Restore the previous SP */ addi r1,r1,INT_FRAME_SIZE diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c index 949eceb254d8..7502066c3c53 100644 --- a/arch/powerpc/kernel/paca.c +++ b/arch/powerpc/kernel/paca.c @@ -8,11 +8,11 @@ #include <linux/memblock.h> #include <linux/sched/task.h> #include <linux/numa.h> +#include <linux/pgtable.h> #include <asm/lppaca.h> #include <asm/paca.h> #include <asm/sections.h> -#include <asm/pgtable.h> #include <asm/kexec.h> #include <asm/svm.h> #include <asm/ultravisor.h> @@ -56,8 +56,8 @@ static void *__init alloc_paca_data(unsigned long size, unsigned long align, #define LPPACA_SIZE 0x400 -static void *__init alloc_shared_lppaca(unsigned long size, unsigned long align, - unsigned long limit, int cpu) +static void *__init alloc_shared_lppaca(unsigned long size, unsigned long limit, + int cpu) { size_t shared_lppaca_total_size = PAGE_ALIGN(nr_cpu_ids * LPPACA_SIZE); static unsigned long shared_lppaca_size; @@ -67,6 +67,13 @@ static void *__init alloc_shared_lppaca(unsigned long size, unsigned long align, if (!shared_lppaca) { memblock_set_bottom_up(true); + /* + * See Documentation/arch/powerpc/ultravisor.rst for more details. + * + * UV/HV data sharing is in PAGE_SIZE granularity. In order to + * minimize the number of pages shared, align the allocation to + * PAGE_SIZE. + */ shared_lppaca = memblock_alloc_try_nid(shared_lppaca_total_size, PAGE_SIZE, MEMBLOCK_LOW_LIMIT, @@ -86,7 +93,7 @@ static void *__init alloc_shared_lppaca(unsigned long size, unsigned long align, * This is very early in boot, so no harm done if the kernel crashes at * this point. */ - BUG_ON(shared_lppaca_size >= shared_lppaca_total_size); + BUG_ON(shared_lppaca_size > shared_lppaca_total_size); return ptr; } @@ -121,7 +128,7 @@ static struct lppaca * __init new_lppaca(int cpu, unsigned long limit) return NULL; if (is_secure_guest()) - lp = alloc_shared_lppaca(LPPACA_SIZE, 0x400, limit, cpu); + lp = alloc_shared_lppaca(LPPACA_SIZE, limit, cpu); else lp = alloc_paca_data(LPPACA_SIZE, 0x400, limit, cpu); @@ -131,8 +138,7 @@ static struct lppaca * __init new_lppaca(int cpu, unsigned long limit) } #endif /* CONFIG_PPC_PSERIES */ -#ifdef CONFIG_PPC_BOOK3S_64 - +#ifdef CONFIG_PPC_64S_HASH_MMU /* * 3 persistent SLBs are allocated here. The buffer will be zero * initially, hence will all be invaild until we actually write them. @@ -161,8 +167,7 @@ static struct slb_shadow * __init new_slb_shadow(int cpu, unsigned long limit) return s; } - -#endif /* CONFIG_PPC_BOOK3S_64 */ +#endif /* CONFIG_PPC_64S_HASH_MMU */ /* The Paca is an array with one entry per processor. Each contains an * lppaca, which contains the information shared between the @@ -181,12 +186,14 @@ void __init initialise_paca(struct paca_struct *new_paca, int cpu) #ifdef CONFIG_PPC_PSERIES new_paca->lppaca_ptr = NULL; #endif -#ifdef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_BOOK3E_64 new_paca->kernel_pgd = swapper_pg_dir; #endif new_paca->lock_token = 0x8000; new_paca->paca_index = cpu; +#ifndef CONFIG_PPC_KERNEL_PCREL new_paca->kernel_toc = kernel_toc_addr(); +#endif new_paca->kernelbase = (unsigned long) _stext; /* Only set MSR:IR/DR when MMU is initialized */ new_paca->kernel_msr = MSR_KERNEL & ~(MSR_IR | MSR_DR); @@ -194,11 +201,11 @@ void __init initialise_paca(struct paca_struct *new_paca, int cpu) new_paca->kexec_state = KEXEC_STATE_NONE; new_paca->__current = &init_task; new_paca->data_offset = 0xfeeeeeeeeeeeeeeeULL; -#ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC_64S_HASH_MMU new_paca->slb_shadow_ptr = NULL; #endif -#ifdef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_BOOK3E_64 /* For now -- if we have threads this will be adjusted later */ new_paca->tcd_ptr = &new_paca->tcd; #endif @@ -210,15 +217,19 @@ void setup_paca(struct paca_struct *new_paca) /* Setup r13 */ local_paca = new_paca; -#ifdef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_BOOK3E_64 /* On Book3E, initialize the TLB miss exception frames */ mtspr(SPRN_SPRG_TLB_EXFRAME, local_paca->extlb); #else - /* In HV mode, we setup both HPACA and PACA to avoid problems + /* + * In HV mode, we setup both HPACA and PACA to avoid problems * if we do a GET_PACA() before the feature fixups have been - * applied + * applied. + * + * Normally you should test against CPU_FTR_HVMODE, but CPU features + * are not yet set up when we first reach here. */ - if (early_cpu_has_feature(CPU_FTR_HVMODE)) + if (mfmsr() & MSR_HV) mtspr(SPRN_SPRG_HPACA, local_paca); #endif mtspr(SPRN_SPRG_PACA, local_paca); @@ -267,7 +278,7 @@ void __init allocate_paca(int cpu) #ifdef CONFIG_PPC_PSERIES paca->lppaca_ptr = new_lppaca(cpu, limit); #endif -#ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC_64S_HASH_MMU paca->slb_shadow_ptr = new_slb_shadow(cpu, limit); #endif paca_struct_size += sizeof(struct paca_struct); @@ -279,17 +290,17 @@ void __init free_unused_pacas(void) new_ptrs_size = sizeof(struct paca_struct *) * nr_cpu_ids; if (new_ptrs_size < paca_ptrs_size) - memblock_free(__pa(paca_ptrs) + new_ptrs_size, - paca_ptrs_size - new_ptrs_size); + memblock_phys_free(__pa(paca_ptrs) + new_ptrs_size, + paca_ptrs_size - new_ptrs_size); paca_nr_cpu_ids = nr_cpu_ids; paca_ptrs_size = new_ptrs_size; -#ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC_64S_HASH_MMU if (early_radix_enabled()) { /* Ugly fixup, see new_slb_shadow() */ - memblock_free(__pa(paca_ptrs[boot_cpuid]->slb_shadow_ptr), - sizeof(struct slb_shadow)); + memblock_phys_free(__pa(paca_ptrs[boot_cpuid]->slb_shadow_ptr), + sizeof(struct slb_shadow)); paca_ptrs[boot_cpuid]->slb_shadow_ptr = NULL; } #endif @@ -298,24 +309,15 @@ void __init free_unused_pacas(void) paca_ptrs_size + paca_struct_size, nr_cpu_ids); } +#ifdef CONFIG_PPC_64S_HASH_MMU void copy_mm_to_paca(struct mm_struct *mm) { -#ifdef CONFIG_PPC_BOOK3S mm_context_t *context = &mm->context; - get_paca()->mm_ctx_id = context->id; -#ifdef CONFIG_PPC_MM_SLICES VM_BUG_ON(!mm_ctx_slb_addr_limit(context)); - get_paca()->mm_ctx_slb_addr_limit = mm_ctx_slb_addr_limit(context); memcpy(&get_paca()->mm_ctx_low_slices_psize, mm_ctx_low_slices(context), LOW_SLICE_ARRAY_SZ); memcpy(&get_paca()->mm_ctx_high_slices_psize, mm_ctx_high_slices(context), TASK_SLICE_ARRAY_SZ(context)); -#else /* CONFIG_PPC_MM_SLICES */ - get_paca()->mm_ctx_user_psize = context->user_psize; - get_paca()->mm_ctx_sllp = context->sllp; -#endif -#else /* !CONFIG_PPC_BOOK3S */ - return; -#endif } +#endif /* CONFIG_PPC_64S_HASH_MMU */ diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index 1c448cf25506..d95a48eff412 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -29,19 +29,21 @@ #include <linux/slab.h> #include <linux/vgaarb.h> #include <linux/numa.h> +#include <linux/msi.h> +#include <linux/irqdomain.h> #include <asm/processor.h> #include <asm/io.h> -#include <asm/prom.h> #include <asm/pci-bridge.h> #include <asm/byteorder.h> #include <asm/machdep.h> #include <asm/ppc-pci.h> #include <asm/eeh.h> +#include <asm/setup.h> #include "../../../drivers/pci/pci.h" -/* hose_spinlock protects accesses to the the phb_bitmap. */ +/* hose_spinlock protects accesses to the phb_bitmap. */ static DEFINE_SPINLOCK(hose_spinlock); LIST_HEAD(hose_list); @@ -61,28 +63,40 @@ EXPORT_SYMBOL(isa_mem_base); static const struct dma_map_ops *pci_dma_ops; -void set_pci_dma_ops(const struct dma_map_ops *dma_ops) +void __init set_pci_dma_ops(const struct dma_map_ops *dma_ops) { pci_dma_ops = dma_ops; } -/* - * This function should run under locking protection, specifically - * hose_spinlock. - */ static int get_phb_number(struct device_node *dn) { int ret, phb_id = -1; - u32 prop_32; u64 prop; /* * Try fixed PHB numbering first, by checking archs and reading - * the respective device-tree properties. Firstly, try powernv by - * reading "ibm,opal-phbid", only present in OPAL environment. + * the respective device-tree properties. Firstly, try reading + * standard "linux,pci-domain", then try reading "ibm,opal-phbid" + * (only present in powernv OPAL environment), then try device-tree + * alias and as the last try to use lower bits of "reg" property. */ - ret = of_property_read_u64(dn, "ibm,opal-phbid", &prop); + ret = of_get_pci_domain_nr(dn); + if (ret >= 0) { + prop = ret; + ret = 0; + } + if (ret) + ret = of_property_read_u64(dn, "ibm,opal-phbid", &prop); + if (ret) { + ret = of_alias_get_id(dn, "pci"); + if (ret >= 0) { + prop = ret; + ret = 0; + } + } + if (ret) { + u32 prop_32; ret = of_property_read_u32_index(dn, "reg", 1, &prop_32); prop = prop_32; } @@ -90,18 +104,20 @@ static int get_phb_number(struct device_node *dn) if (!ret) phb_id = (int)(prop & (MAX_PHBS - 1)); + spin_lock(&hose_spinlock); + /* We need to be sure to not use the same PHB number twice. */ if ((phb_id >= 0) && !test_and_set_bit(phb_id, phb_bitmap)) - return phb_id; + goto out_unlock; - /* - * If not pseries nor powernv, or if fixed PHB numbering tried to add - * the same PHB number twice, then fallback to dynamic PHB numbering. - */ + /* If everything fails then fallback to dynamic PHB numbering. */ phb_id = find_first_zero_bit(phb_bitmap, MAX_PHBS); BUG_ON(phb_id >= MAX_PHBS); set_bit(phb_id, phb_bitmap); +out_unlock: + spin_unlock(&hose_spinlock); + return phb_id; } @@ -109,14 +125,17 @@ struct pci_controller *pcibios_alloc_controller(struct device_node *dev) { struct pci_controller *phb; - phb = zalloc_maybe_bootmem(sizeof(struct pci_controller), GFP_KERNEL); + phb = kzalloc(sizeof(struct pci_controller), GFP_KERNEL); if (phb == NULL) return NULL; - spin_lock(&hose_spinlock); + phb->global_number = get_phb_number(dev); + + spin_lock(&hose_spinlock); list_add_tail(&phb->list_node, &hose_list); spin_unlock(&hose_spinlock); - phb->dn = dev; + + phb->dn = of_node_get(dev); phb->is_dynamic = slab_is_available(); #ifdef CONFIG_PPC64 if (dev) { @@ -139,7 +158,7 @@ void pcibios_free_controller(struct pci_controller *phb) /* Clear bit of phb_bitmap to allow reuse of this PHB number. */ if (phb->global_number < MAX_PHBS) clear_bit(phb->global_number, phb_bitmap); - + of_node_put(phb->dn); list_del(&phb->list_node); spin_unlock(&hose_spinlock); @@ -261,12 +280,6 @@ int pcibios_sriov_disable(struct pci_dev *pdev) #endif /* CONFIG_PCI_IOV */ -void pcibios_bus_add_device(struct pci_dev *pdev) -{ - if (ppc_md.pcibios_bus_add_device) - ppc_md.pcibios_bus_add_device(pdev); -} - static resource_size_t pcibios_io_size(const struct pci_controller *hose) { #ifdef CONFIG_PPC64 @@ -359,6 +372,55 @@ struct pci_controller *pci_find_controller_for_domain(int domain_nr) return NULL; } +struct pci_intx_virq { + int virq; + struct kref kref; + struct list_head list_node; +}; + +static LIST_HEAD(intx_list); +static DEFINE_MUTEX(intx_mutex); + +static void ppc_pci_intx_release(struct kref *kref) +{ + struct pci_intx_virq *vi = container_of(kref, struct pci_intx_virq, kref); + + list_del(&vi->list_node); + irq_dispose_mapping(vi->virq); + kfree(vi); +} + +static int ppc_pci_unmap_irq_line(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct pci_dev *pdev = to_pci_dev(data); + + if (action == BUS_NOTIFY_DEL_DEVICE) { + struct pci_intx_virq *vi; + + mutex_lock(&intx_mutex); + list_for_each_entry(vi, &intx_list, list_node) { + if (vi->virq == pdev->irq) { + kref_put(&vi->kref, ppc_pci_intx_release); + break; + } + } + mutex_unlock(&intx_mutex); + } + + return NOTIFY_DONE; +} + +static struct notifier_block ppc_pci_unmap_irq_notifier = { + .notifier_call = ppc_pci_unmap_irq_line, +}; + +static int ppc_pci_register_irq_notifier(void) +{ + return bus_register_notifier(&pci_bus_type, &ppc_pci_unmap_irq_notifier); +} +arch_initcall(ppc_pci_register_irq_notifier); + /* * Reads the interrupt pin to determine if interrupt is use by card. * If the interrupt is used, then gets the interrupt line from the @@ -367,6 +429,12 @@ struct pci_controller *pci_find_controller_for_domain(int domain_nr) static int pci_read_irq_line(struct pci_dev *pci_dev) { int virq; + struct pci_intx_virq *vi, *vitmp; + + /* Preallocate vi as rewind is complex if this fails after mapping */ + vi = kzalloc(sizeof(struct pci_intx_virq), GFP_KERNEL); + if (!vi) + return -1; pr_debug("PCI: Try to map irq for %s...\n", pci_name(pci_dev)); @@ -383,12 +451,12 @@ static int pci_read_irq_line(struct pci_dev *pci_dev) * function. */ if (pci_read_config_byte(pci_dev, PCI_INTERRUPT_PIN, &pin)) - return -1; + goto error_exit; if (pin == 0) - return -1; + goto error_exit; if (pci_read_config_byte(pci_dev, PCI_INTERRUPT_LINE, &line) || line == 0xff || line == 0) { - return -1; + goto error_exit; } pr_debug(" No map ! Using line %d (pin %d) from PCI config\n", line, pin); @@ -400,14 +468,33 @@ static int pci_read_irq_line(struct pci_dev *pci_dev) if (!virq) { pr_debug(" Failed to map !\n"); - return -1; + goto error_exit; } pr_debug(" Mapped to linux irq %d\n", virq); pci_dev->irq = virq; + mutex_lock(&intx_mutex); + list_for_each_entry(vitmp, &intx_list, list_node) { + if (vitmp->virq == virq) { + kref_get(&vitmp->kref); + kfree(vi); + vi = NULL; + break; + } + } + if (vi) { + vi->virq = virq; + kref_init(&vi->kref); + list_add_tail(&vi->list_node, &intx_list); + } + mutex_unlock(&intx_mutex); + return 0; +error_exit: + kfree(vi); + return -1; } /* @@ -434,8 +521,7 @@ int pci_iobar_pfn(struct pci_dev *pdev, int bar, struct vm_area_struct *vma) * PCI device, it tries to find the PCI device first and calls the * above routine */ -pgprot_t pci_phys_mem_access_prot(struct file *file, - unsigned long pfn, +pgprot_t pci_phys_mem_access_prot(unsigned long pfn, unsigned long size, pgprot_t prot) { @@ -734,7 +820,7 @@ void pci_process_bridge_OF_ranges(struct pci_controller *hose, " MEM 0x%016llx..0x%016llx -> 0x%016llx %s\n", range.cpu_addr, range.cpu_addr + range.size - 1, range.pci_addr, - (range.pci_space & 0x40000000) ? + (range.flags & IORESOURCE_PREFETCH) ? "Prefetch" : ""); /* We support only 3 memory ranges */ @@ -793,6 +879,7 @@ int pcibios_root_bridge_prepare(struct pci_host_bridge *bridge) static void pcibios_fixup_resources(struct pci_dev *dev) { struct pci_controller *hose = pci_bus_to_host(dev->bus); + struct resource *res; int i; if (!hose) { @@ -804,9 +891,9 @@ static void pcibios_fixup_resources(struct pci_dev *dev) if (dev->is_virtfn) return; - for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) { - struct resource *res = dev->resource + i; + pci_dev_for_each_resource(dev, res, i) { struct pci_bus_region reg; + if (!res->flags) continue; @@ -964,7 +1051,7 @@ void pcibios_setup_bus_self(struct pci_bus *bus) phb->controller_ops.dma_bus_setup(bus); } -static void pcibios_setup_device(struct pci_dev *dev) +void pcibios_bus_add_device(struct pci_dev *dev) { struct pci_controller *phb; /* Fixup NUMA node as it may not be setup yet by the generic @@ -985,43 +1072,26 @@ static void pcibios_setup_device(struct pci_dev *dev) pci_read_irq_line(dev); if (ppc_md.pci_irq_fixup) ppc_md.pci_irq_fixup(dev); + + if (ppc_md.pcibios_bus_add_device) + ppc_md.pcibios_bus_add_device(dev); } -int pcibios_add_device(struct pci_dev *dev) +int pcibios_device_add(struct pci_dev *dev) { - /* - * We can only call pcibios_setup_device() after bus setup is complete, - * since some of the platform specific DMA setup code depends on it. - */ - if (dev->bus->is_added) - pcibios_setup_device(dev); + struct irq_domain *d; #ifdef CONFIG_PCI_IOV if (ppc_md.pcibios_fixup_sriov) ppc_md.pcibios_fixup_sriov(dev); #endif /* CONFIG_PCI_IOV */ + d = dev_get_msi_domain(&dev->bus->dev); + if (d) + dev_set_msi_domain(&dev->dev, d); return 0; } -void pcibios_setup_bus_devices(struct pci_bus *bus) -{ - struct pci_dev *dev; - - pr_debug("PCI: Fixup bus devices %d (%s)\n", - bus->number, bus->self ? pci_name(bus->self) : "PHB"); - - list_for_each_entry(dev, &bus->devices, bus_list) { - /* Cardbus can call us to add new devices to a bus, so ignore - * those who are already fully discovered - */ - if (pci_dev_is_added(dev)) - continue; - - pcibios_setup_device(dev); - } -} - void pcibios_set_master(struct pci_dev *dev) { /* No special bus mastering setup handling */ @@ -1035,21 +1105,11 @@ void pcibios_fixup_bus(struct pci_bus *bus) */ pci_read_bridge_bases(bus); - /* Now fixup the bus bus */ + /* Now fixup the bus */ pcibios_setup_bus_self(bus); - - /* Now fixup devices on that bus */ - pcibios_setup_bus_devices(bus); } EXPORT_SYMBOL(pcibios_fixup_bus); -void pci_fixup_cardbus(struct pci_bus *bus) -{ - /* Now fixup devices on that bus */ - pcibios_setup_bus_devices(bus); -} - - static int skip_isa_ioresource_align(struct pci_dev *dev) { if (pci_has_flag(PCI_CAN_SKIP_ISA_ALIGN) && @@ -1392,11 +1452,10 @@ void pcibios_claim_one_bus(struct pci_bus *bus) struct pci_bus *child_bus; list_for_each_entry(dev, &bus->devices, bus_list) { + struct resource *r; int i; - for (i = 0; i < PCI_NUM_RESOURCES; i++) { - struct resource *r = &dev->resource[i]; - + pci_dev_for_each_resource(dev, r, i) { if (r->parent || !r->start || !r->flags) continue; @@ -1437,14 +1496,8 @@ void pcibios_finish_adding_to_bus(struct pci_bus *bus) pci_assign_unassigned_bus_resources(bus); } - /* Fixup EEH */ - eeh_add_device_tree_late(bus); - /* Add new devices to global lists. Register in proc, sysfs. */ pci_bus_add_devices(bus); - - /* sysfs files should only be added after devices are added */ - eeh_add_sysfs_files(bus); } EXPORT_SYMBOL_GPL(pcibios_finish_adding_to_bus); @@ -1651,21 +1704,32 @@ EXPORT_SYMBOL_GPL(pcibios_scan_phb); static void fixup_hide_host_resource_fsl(struct pci_dev *dev) { - int i, class = dev->class >> 8; - /* When configured as agent, programing interface = 1 */ + int class = dev->class >> 8; + /* When configured as agent, programming interface = 1 */ int prog_if = dev->class & 0xf; + struct resource *r; if ((class == PCI_CLASS_PROCESSOR_POWERPC || class == PCI_CLASS_BRIDGE_OTHER) && (dev->hdr_type == PCI_HEADER_TYPE_NORMAL) && (prog_if == 0) && (dev->bus->parent == NULL)) { - for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) { - dev->resource[i].start = 0; - dev->resource[i].end = 0; - dev->resource[i].flags = 0; + pci_dev_for_each_resource(dev, r) { + r->start = 0; + r->end = 0; + r->flags = 0; } } } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_MOTOROLA, PCI_ANY_ID, fixup_hide_host_resource_fsl); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_FREESCALE, PCI_ANY_ID, fixup_hide_host_resource_fsl); + + +static int __init discover_phbs(void) +{ + if (ppc_md.discover_phbs) + ppc_md.discover_phbs(); + + return 0; +} +core_initcall(discover_phbs); diff --git a/arch/powerpc/kernel/pci-hotplug.c b/arch/powerpc/kernel/pci-hotplug.c index fc62c4bc47b1..0fe251c6ac2c 100644 --- a/arch/powerpc/kernel/pci-hotplug.c +++ b/arch/powerpc/kernel/pci-hotplug.c @@ -12,6 +12,7 @@ #include <linux/pci.h> #include <linux/export.h> +#include <linux/of.h> #include <asm/pci-bridge.h> #include <asm/ppc-pci.h> #include <asm/firmware.h> @@ -57,8 +58,6 @@ void pcibios_release_device(struct pci_dev *dev) struct pci_controller *phb = pci_bus_to_host(dev->bus); struct pci_dn *pdn = pci_get_pdn(dev); - eeh_remove_device(dev); - if (phb->controller_ops.release_device) phb->controller_ops.release_device(dev); @@ -112,8 +111,6 @@ void pci_hp_add_devices(struct pci_bus *bus) struct pci_controller *phb; struct device_node *dn = pci_bus_to_OF_node(bus); - eeh_add_device_tree_early(PCI_DN(dn)); - phb = pci_bus_to_host(bus); mode = PCI_PROBE_NORMAL; @@ -134,7 +131,6 @@ void pci_hp_add_devices(struct pci_bus *bus) */ slotno = PCI_SLOT(PCI_DN(dn->child)->devfn); pci_scan_slot(bus, PCI_DEVFN(slotno, 0)); - pcibios_setup_bus_devices(bus); max = bus->busn_res.start; /* * Scan bridges that are already configured. We don't touch diff --git a/arch/powerpc/kernel/pci_32.c b/arch/powerpc/kernel/pci_32.c index b49e1060a3bf..ce0c8623e563 100644 --- a/arch/powerpc/kernel/pci_32.c +++ b/arch/powerpc/kernel/pci_32.c @@ -21,7 +21,6 @@ #include <asm/processor.h> #include <asm/io.h> -#include <asm/prom.h> #include <asm/sections.h> #include <asm/pci-bridge.h> #include <asm/ppc-pci.h> @@ -37,18 +36,13 @@ int pcibios_assign_bus_offset = 1; EXPORT_SYMBOL(isa_io_base); EXPORT_SYMBOL(pci_dram_offset); -void pcibios_make_OF_bus_map(void); - static void fixup_cpc710_pci64(struct pci_dev* dev); -static u8* pci_to_OF_bus_map; /* By default, we don't re-assign bus numbers. We do this only on * some pmacs */ static int pci_assign_all_buses; -static int pci_bus_count; - /* This will remain NULL for now, until isa-bridge.c is made common * to both 32-bit and 64-bit. */ @@ -68,6 +62,11 @@ fixup_cpc710_pci64(struct pci_dev* dev) } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_IBM, PCI_DEVICE_ID_IBM_CPC710_PCI64, fixup_cpc710_pci64); +#ifdef CONFIG_PPC_PCI_OF_BUS_MAP + +static u8* pci_to_OF_bus_map; +static int pci_bus_count; + /* * Functions below are used on OpenFirmware machines. */ @@ -109,7 +108,7 @@ make_one_node_map(struct device_node* node, u8 pci_bus) } } -void +static void __init pcibios_make_OF_bus_map(void) { int i; @@ -153,14 +152,18 @@ pcibios_make_OF_bus_map(void) } #endif } +#endif // CONFIG_PPC_PCI_OF_BUS_MAP +#ifdef CONFIG_PPC_PMAC /* * Returns the PCI device matching a given OF node */ int pci_device_from_OF_node(struct device_node *node, u8 *bus, u8 *devfn) { +#ifdef CONFIG_PPC_PCI_OF_BUS_MAP struct pci_dev *dev = NULL; +#endif const __be32 *reg; int size; @@ -175,6 +178,9 @@ int pci_device_from_OF_node(struct device_node *node, u8 *bus, u8 *devfn) *bus = (be32_to_cpup(®[0]) >> 16) & 0xff; *devfn = (be32_to_cpup(®[0]) >> 8) & 0xff; +#ifndef CONFIG_PPC_PCI_OF_BUS_MAP + return 0; +#else /* Ok, here we need some tweak. If we have already renumbered * all busses, we can't rely on the OF bus number any more. * the pci_to_OF_bus_map is not enough as several PCI busses @@ -192,9 +198,12 @@ int pci_device_from_OF_node(struct device_node *node, u8 *bus, u8 *devfn) } return -ENODEV; +#endif // CONFIG_PPC_PCI_OF_BUS_MAP } EXPORT_SYMBOL(pci_device_from_OF_node); +#endif +#ifdef CONFIG_PPC_PCI_OF_BUS_MAP /* We create the "pci-OF-bus-map" property now so it appears in the * /proc device tree */ @@ -219,6 +228,7 @@ pci_create_OF_bus_map(void) of_node_put(dn); } } +#endif // CONFIG_PPC_PCI_OF_BUS_MAP void pcibios_setup_phb_io_space(struct pci_controller *hose) { @@ -234,23 +244,41 @@ void pcibios_setup_phb_io_space(struct pci_controller *hose) static int __init pcibios_init(void) { struct pci_controller *hose, *tmp; +#ifndef CONFIG_PPC_PCI_BUS_NUM_DOMAIN_DEPENDENT int next_busno = 0; +#endif printk(KERN_INFO "PCI: Probing PCI hardware\n"); +#ifdef CONFIG_PPC_PCI_BUS_NUM_DOMAIN_DEPENDENT + /* + * Enable PCI domains in /proc when PCI bus numbers are not unique + * across all PCI domains to prevent conflicts. And keep PCI domain 0 + * backward compatible in /proc for video cards. + */ + pci_add_flags(PCI_ENABLE_PROC_DOMAINS | PCI_COMPAT_DOMAIN_0); +#endif + if (pci_has_flag(PCI_REASSIGN_ALL_BUS)) pci_assign_all_buses = 1; /* Scan all of the recorded PCI controllers. */ list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { +#ifndef CONFIG_PPC_PCI_BUS_NUM_DOMAIN_DEPENDENT if (pci_assign_all_buses) hose->first_busno = next_busno; +#endif hose->last_busno = 0xff; pcibios_scan_phb(hose); pci_bus_add_devices(hose->bus); +#ifndef CONFIG_PPC_PCI_BUS_NUM_DOMAIN_DEPENDENT if (pci_assign_all_buses || next_busno <= hose->last_busno) next_busno = hose->last_busno + pcibios_assign_bus_offset; +#endif } + +#if defined(CONFIG_PPC_PMAC) || defined(CONFIG_PPC_CHRP) +#ifdef CONFIG_PPC_PCI_OF_BUS_MAP pci_bus_count = next_busno; /* OpenFirmware based machines need a map of OF bus @@ -259,6 +287,8 @@ static int __init pcibios_init(void) */ if (pci_assign_all_buses) pcibios_make_OF_bus_map(); +#endif +#endif /* Call common code to handle resource allocation */ pcibios_resource_survey(); diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c index f83d1f69b1dd..e27342ef128b 100644 --- a/arch/powerpc/kernel/pci_64.c +++ b/arch/powerpc/kernel/pci_64.c @@ -19,10 +19,10 @@ #include <linux/syscalls.h> #include <linux/irq.h> #include <linux/vmalloc.h> +#include <linux/of.h> #include <asm/processor.h> #include <asm/io.h> -#include <asm/prom.h> #include <asm/pci-bridge.h> #include <asm/byteorder.h> #include <asm/machdep.h> @@ -73,7 +73,7 @@ static int __init pcibios_init(void) return 0; } -subsys_initcall(pcibios_init); +subsys_initcall_sync(pcibios_init); int pcibios_unmap_io_space(struct pci_bus *bus) { @@ -100,7 +100,7 @@ int pcibios_unmap_io_space(struct pci_bus *bus) pci_name(bus->self)); #ifdef CONFIG_PPC_BOOK3S_64 - __flush_hash_table_range(&init_mm, res->start + _IO_BASE, + __flush_hash_table_range(res->start + _IO_BASE, res->end + _IO_BASE + 1); #endif return 0; @@ -109,29 +109,53 @@ int pcibios_unmap_io_space(struct pci_bus *bus) /* Get the host bridge */ hose = pci_bus_to_host(bus); - /* Check if we have IOs allocated */ - if (hose->io_base_alloc == NULL) - return 0; - pr_debug("IO unmapping for PHB %pOF\n", hose->dn); pr_debug(" alloc=0x%p\n", hose->io_base_alloc); - /* This is a PHB, we fully unmap the IO area */ - vunmap(hose->io_base_alloc); - + iounmap(hose->io_base_alloc); return 0; } EXPORT_SYMBOL_GPL(pcibios_unmap_io_space); -static int pcibios_map_phb_io_space(struct pci_controller *hose) +void __iomem *ioremap_phb(phys_addr_t paddr, unsigned long size) { struct vm_struct *area; + unsigned long addr; + + WARN_ON_ONCE(paddr & ~PAGE_MASK); + WARN_ON_ONCE(size & ~PAGE_MASK); + + /* + * Let's allocate some IO space for that guy. We don't pass VM_IOREMAP + * because we don't care about alignment tricks that the core does in + * that case. Maybe we should due to stupid card with incomplete + * address decoding but I'd rather not deal with those outside of the + * reserved 64K legacy region. + */ + area = __get_vm_area_caller(size, VM_IOREMAP, PHB_IO_BASE, PHB_IO_END, + __builtin_return_address(0)); + if (!area) + return NULL; + + addr = (unsigned long)area->addr; + if (ioremap_page_range(addr, addr + size, paddr, + pgprot_noncached(PAGE_KERNEL))) { + vunmap_range(addr, addr + size); + return NULL; + } + + return (void __iomem *)addr; +} +EXPORT_SYMBOL_GPL(ioremap_phb); + +static int pcibios_map_phb_io_space(struct pci_controller *hose) +{ unsigned long phys_page; unsigned long size_page; unsigned long io_virt_offset; - phys_page = _ALIGN_DOWN(hose->io_base_phys, PAGE_SIZE); - size_page = _ALIGN_UP(hose->pci_io_size, PAGE_SIZE); + phys_page = ALIGN_DOWN(hose->io_base_phys, PAGE_SIZE); + size_page = ALIGN(hose->pci_io_size, PAGE_SIZE); /* Make sure IO area address is clear */ hose->io_base_alloc = NULL; @@ -146,12 +170,11 @@ static int pcibios_map_phb_io_space(struct pci_controller *hose) * with incomplete address decoding but I'd rather not deal with * those outside of the reserved 64K legacy region. */ - area = __get_vm_area(size_page, 0, PHB_IO_BASE, PHB_IO_END); - if (area == NULL) + hose->io_base_alloc = ioremap_phb(phys_page, size_page); + if (!hose->io_base_alloc) return -ENOMEM; - hose->io_base_alloc = area->addr; - hose->io_base_virt = (void __iomem *)(area->addr + - hose->io_base_phys - phys_page); + hose->io_base_virt = hose->io_base_alloc + + hose->io_base_phys - phys_page; pr_debug("IO mapping for PHB %pOF\n", hose->dn); pr_debug(" phys=0x%016llx, virt=0x%p (alloc=0x%p)\n", @@ -159,11 +182,6 @@ static int pcibios_map_phb_io_space(struct pci_controller *hose) pr_debug(" size=0x%016llx (alloc=0x%016lx)\n", hose->pci_io_size, size_page); - /* Establish the mapping */ - if (__ioremap_at(phys_page, area->addr, size_page, - pgprot_noncached(PAGE_KERNEL)) == NULL) - return -ENOMEM; - /* Fixup hose IO resource */ io_virt_offset = pcibios_io_space_offset(hose); hose->io_resource.start += io_virt_offset; @@ -267,3 +285,14 @@ int pcibus_to_node(struct pci_bus *bus) } EXPORT_SYMBOL(pcibus_to_node); #endif + +#ifdef CONFIG_PPC_PMAC +int pci_device_from_OF_node(struct device_node *np, u8 *bus, u8 *devfn) +{ + if (!PCI_DN(np)) + return -ENODEV; + *bus = PCI_DN(np)->busno; + *devfn = PCI_DN(np)->devfn; + return 0; +} +#endif diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c index 9524009ca1ae..38561d6a2079 100644 --- a/arch/powerpc/kernel/pci_dn.c +++ b/arch/powerpc/kernel/pci_dn.c @@ -12,9 +12,9 @@ #include <linux/export.h> #include <linux/init.h> #include <linux/gfp.h> +#include <linux/of.h> #include <asm/io.h> -#include <asm/prom.h> #include <asm/pci-bridge.h> #include <asm/ppc-pci.h> #include <asm/firmware.h> @@ -124,9 +124,28 @@ struct pci_dn *pci_get_pdn(struct pci_dev *pdev) return NULL; } +#ifdef CONFIG_EEH +static struct eeh_dev *eeh_dev_init(struct pci_dn *pdn) +{ + struct eeh_dev *edev; + + /* Allocate EEH device */ + edev = kzalloc(sizeof(*edev), GFP_KERNEL); + if (!edev) + return NULL; + + /* Associate EEH device with OF node */ + pdn->edev = edev; + edev->pdn = pdn; + edev->bdfn = (pdn->busno << 8) | pdn->devfn; + edev->controller = pdn->phb; + + return edev; +} +#endif /* CONFIG_EEH */ + #ifdef CONFIG_PCI_IOV -static struct pci_dn *add_one_dev_pci_data(struct pci_dn *parent, - int vf_index, +static struct pci_dn *add_one_sriov_vf_pdn(struct pci_dn *parent, int busno, int devfn) { struct pci_dn *pdn; @@ -143,7 +162,6 @@ static struct pci_dn *add_one_dev_pci_data(struct pci_dn *parent, pdn->parent = parent; pdn->busno = busno; pdn->devfn = devfn; - pdn->vf_index = vf_index; pdn->pe_number = IODA_INVALID_PE; INIT_LIST_HEAD(&pdn->child_list); INIT_LIST_HEAD(&pdn->list); @@ -151,17 +169,15 @@ static struct pci_dn *add_one_dev_pci_data(struct pci_dn *parent, return pdn; } -#endif -struct pci_dn *add_dev_pci_data(struct pci_dev *pdev) +struct pci_dn *add_sriov_vf_pdns(struct pci_dev *pdev) { -#ifdef CONFIG_PCI_IOV struct pci_dn *parent, *pdn; int i; /* Only support IOV for now */ - if (!pdev->is_physfn) - return pci_get_pdn(pdev); + if (WARN_ON(!pdev->is_physfn)) + return NULL; /* Check if VFs have been populated */ pdn = pci_get_pdn(pdev); @@ -176,7 +192,7 @@ struct pci_dn *add_dev_pci_data(struct pci_dev *pdev) for (i = 0; i < pci_sriov_get_totalvfs(pdev); i++) { struct eeh_dev *edev __maybe_unused; - pdn = add_one_dev_pci_data(parent, i, + pdn = add_one_sriov_vf_pdn(parent, pci_iov_virtfn_bus(pdev, i), pci_iov_virtfn_devfn(pdev, i)); if (!pdn) { @@ -189,34 +205,23 @@ struct pci_dn *add_dev_pci_data(struct pci_dev *pdev) /* Create the EEH device for the VF */ edev = eeh_dev_init(pdn); BUG_ON(!edev); + + /* FIXME: these should probably be populated by the EEH probe */ edev->physfn = pdev; + edev->vf_index = i; #endif /* CONFIG_EEH */ } -#endif /* CONFIG_PCI_IOV */ - return pci_get_pdn(pdev); } -void remove_dev_pci_data(struct pci_dev *pdev) +void remove_sriov_vf_pdns(struct pci_dev *pdev) { -#ifdef CONFIG_PCI_IOV struct pci_dn *parent; struct pci_dn *pdn, *tmp; int i; - /* - * VF and VF PE are created/released dynamically, so we need to - * bind/unbind them. Otherwise the VF and VF PE would be mismatched - * when re-enabling SR-IOV. - */ - if (pdev->is_virtfn) { - pdn = pci_get_pdn(pdev); - pdn->pe_number = IODA_INVALID_PE; - return; - } - /* Only support IOV PF for now */ - if (!pdev->is_physfn) + if (WARN_ON(!pdev->is_physfn)) return; /* Check if VFs have been populated */ @@ -244,9 +249,22 @@ void remove_dev_pci_data(struct pci_dev *pdev) continue; #ifdef CONFIG_EEH - /* Release EEH device for the VF */ + /* + * Release EEH state for this VF. The PCI core + * has already torn down the pci_dev for this VF, but + * we're responsible to removing the eeh_dev since it + * has the same lifetime as the pci_dn that spawned it. + */ edev = pdn_to_eeh_dev(pdn); if (edev) { + /* + * We allocate pci_dn's for the totalvfs count, + * but only the vfs that were activated + * have a configured PE. + */ + if (edev->pe) + eeh_pe_tree_remove(edev); + pdn->edev = NULL; kfree(edev); } @@ -258,8 +276,8 @@ void remove_dev_pci_data(struct pci_dev *pdev) kfree(pdn); } } -#endif /* CONFIG_PCI_IOV */ } +#endif /* CONFIG_PCI_IOV */ struct pci_dn *pci_add_device_node_info(struct pci_controller *hose, struct device_node *dn) @@ -312,6 +330,7 @@ struct pci_dn *pci_add_device_node_info(struct pci_controller *hose, INIT_LIST_HEAD(&pdn->list); parent = of_get_parent(dn); pdn->parent = parent ? PCI_DN(parent) : NULL; + of_node_put(parent); if (pdn->parent) list_add_tail(&pdn->list, &pdn->parent->child_list); @@ -425,46 +444,6 @@ void *pci_traverse_device_nodes(struct device_node *start, } EXPORT_SYMBOL_GPL(pci_traverse_device_nodes); -static struct pci_dn *pci_dn_next_one(struct pci_dn *root, - struct pci_dn *pdn) -{ - struct list_head *next = pdn->child_list.next; - - if (next != &pdn->child_list) - return list_entry(next, struct pci_dn, list); - - while (1) { - if (pdn == root) - return NULL; - - next = pdn->list.next; - if (next != &pdn->parent->child_list) - break; - - pdn = pdn->parent; - } - - return list_entry(next, struct pci_dn, list); -} - -void *traverse_pci_dn(struct pci_dn *root, - void *(*fn)(struct pci_dn *, void *), - void *data) -{ - struct pci_dn *pdn = root; - void *ret; - - /* Only scan the child nodes */ - for (pdn = pci_dn_next_one(root, pdn); pdn; - pdn = pci_dn_next_one(root, pdn)) { - ret = fn(pdn, data); - if (ret) - return ret; - } - - return NULL; -} - static void *add_pdn(struct device_node *dn, void *data) { struct pci_controller *hose = data; @@ -503,28 +482,6 @@ void pci_devs_phb_init_dynamic(struct pci_controller *phb) pci_traverse_device_nodes(dn, add_pdn, phb); } -/** - * pci_devs_phb_init - Initialize phbs and pci devs under them. - * - * This routine walks over all phb's (pci-host bridges) on the - * system, and sets up assorted pci-related structures - * (including pci info in the device node structs) for each - * pci device found underneath. This routine runs once, - * early in the boot sequence. - */ -static int __init pci_devs_phb_init(void) -{ - struct pci_controller *phb, *tmp; - - /* This must be done first so the device nodes have valid pci info! */ - list_for_each_entry_safe(phb, tmp, &hose_list, list_node) - pci_devs_phb_init_dynamic(phb); - - return 0; -} - -core_initcall(pci_devs_phb_init); - static void pci_dev_pdn_setup(struct pci_dev *pdev) { struct pci_dn *pdn; diff --git a/arch/powerpc/kernel/pci_of_scan.c b/arch/powerpc/kernel/pci_of_scan.c index f91d7e94872e..756043dd06e9 100644 --- a/arch/powerpc/kernel/pci_of_scan.c +++ b/arch/powerpc/kernel/pci_of_scan.c @@ -13,8 +13,8 @@ #include <linux/pci.h> #include <linux/export.h> +#include <linux/of.h> #include <asm/pci-bridge.h> -#include <asm/prom.h> /** * get_int_prop - Decode a u32 from a device tree property @@ -244,7 +244,7 @@ EXPORT_SYMBOL(of_create_pci_dev); * @dev: pci_dev structure for the bridge * * of_scan_bus() calls this routine for each PCI bridge that it finds, and - * this routine in turn call of_scan_bus() recusively to scan for more child + * this routine in turn call of_scan_bus() recursively to scan for more child * devices. */ void of_scan_pci_bridge(struct pci_dev *dev) @@ -414,7 +414,6 @@ static void __of_scan_bus(struct device_node *node, struct pci_bus *bus, */ if (!rescan_existing) pcibios_setup_bus_self(bus); - pcibios_setup_bus_devices(bus); /* Now scan child busses */ for_each_pci_bridge(dev, bus) diff --git a/arch/powerpc/kernel/pmc.c b/arch/powerpc/kernel/pmc.c index 15414c8a2837..9fabb4d9235e 100644 --- a/arch/powerpc/kernel/pmc.c +++ b/arch/powerpc/kernel/pmc.c @@ -74,7 +74,7 @@ void release_pmc_hardware(void) } EXPORT_SYMBOL_GPL(release_pmc_hardware); -#ifdef CONFIG_PPC64 +#ifdef CONFIG_PPC_BOOK3S_64 void power4_enable_pmcs(void) { unsigned long hid0; diff --git a/arch/powerpc/kernel/ppc32.h b/arch/powerpc/kernel/ppc32.h deleted file mode 100644 index 2346f8c7ff2e..000000000000 --- a/arch/powerpc/kernel/ppc32.h +++ /dev/null @@ -1,60 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -#ifndef _PPC64_PPC32_H -#define _PPC64_PPC32_H - -#include <linux/compat.h> -#include <asm/siginfo.h> -#include <asm/signal.h> - -/* - * Data types and macros for providing 32b PowerPC support. - */ - -/* These are here to support 32-bit syscalls on a 64-bit kernel. */ - -struct pt_regs32 { - unsigned int gpr[32]; - unsigned int nip; - unsigned int msr; - unsigned int orig_gpr3; /* Used for restarting system calls */ - unsigned int ctr; - unsigned int link; - unsigned int xer; - unsigned int ccr; - unsigned int mq; /* 601 only (not used at present) */ - unsigned int trap; /* Reason for being here */ - unsigned int dar; /* Fault registers */ - unsigned int dsisr; - unsigned int result; /* Result of a system call */ -}; - -struct sigcontext32 { - unsigned int _unused[4]; - int signal; - compat_uptr_t handler; - unsigned int oldmask; - compat_uptr_t regs; /* 4 byte pointer to the pt_regs32 structure. */ -}; - -struct mcontext32 { - elf_gregset_t32 mc_gregs; - elf_fpregset_t mc_fregs; - unsigned int mc_pad[2]; - elf_vrregset_t32 mc_vregs __attribute__((__aligned__(16))); - elf_vsrreghalf_t32 mc_vsregs __attribute__((__aligned__(16))); -}; - -struct ucontext32 { - unsigned int uc_flags; - unsigned int uc_link; - compat_stack_t uc_stack; - int uc_pad[7]; - compat_uptr_t uc_regs; /* points to uc_mcontext field */ - compat_sigset_t uc_sigmask; /* mask last for extensibility */ - /* glibc has 1024-bit signal masks, ours are 64-bit */ - int uc_maskext[30]; - int uc_pad2[3]; - struct mcontext32 uc_mcontext; -}; - -#endif /* _PPC64_PPC32_H */ diff --git a/arch/powerpc/kernel/ppc_save_regs.S b/arch/powerpc/kernel/ppc_save_regs.S index f3bd0bbf2ae8..a9b9c32d0c1f 100644 --- a/arch/powerpc/kernel/ppc_save_regs.S +++ b/arch/powerpc/kernel/ppc_save_regs.S @@ -21,56 +21,33 @@ * different ABIs, though). */ _GLOBAL(ppc_save_regs) - PPC_STL r0,0*SZL(r3) + /* This allows stack frame accessor macros and offsets to be used */ + subi r3,r3,STACK_INT_FRAME_REGS + PPC_STL r0,GPR0(r3) #ifdef CONFIG_PPC32 - stmw r2, 2*SZL(r3) + stmw r2,GPR2(r3) #else - PPC_STL r2,2*SZL(r3) - PPC_STL r3,3*SZL(r3) - PPC_STL r4,4*SZL(r3) - PPC_STL r5,5*SZL(r3) - PPC_STL r6,6*SZL(r3) - PPC_STL r7,7*SZL(r3) - PPC_STL r8,8*SZL(r3) - PPC_STL r9,9*SZL(r3) - PPC_STL r10,10*SZL(r3) - PPC_STL r11,11*SZL(r3) - PPC_STL r12,12*SZL(r3) - PPC_STL r13,13*SZL(r3) - PPC_STL r14,14*SZL(r3) - PPC_STL r15,15*SZL(r3) - PPC_STL r16,16*SZL(r3) - PPC_STL r17,17*SZL(r3) - PPC_STL r18,18*SZL(r3) - PPC_STL r19,19*SZL(r3) - PPC_STL r20,20*SZL(r3) - PPC_STL r21,21*SZL(r3) - PPC_STL r22,22*SZL(r3) - PPC_STL r23,23*SZL(r3) - PPC_STL r24,24*SZL(r3) - PPC_STL r25,25*SZL(r3) - PPC_STL r26,26*SZL(r3) - PPC_STL r27,27*SZL(r3) - PPC_STL r28,28*SZL(r3) - PPC_STL r29,29*SZL(r3) - PPC_STL r30,30*SZL(r3) - PPC_STL r31,31*SZL(r3) + SAVE_GPRS(2, 31, r3) + lbz r0,PACAIRQSOFTMASK(r13) + PPC_STL r0,SOFTE(r3) #endif - /* go up one stack frame for SP */ - PPC_LL r4,0(r1) - PPC_STL r4,1*SZL(r3) + /* store current SP */ + PPC_STL r1,GPR1(r3) /* get caller's LR */ + PPC_LL r4,0(r1) PPC_LL r0,LRSAVE(r4) - PPC_STL r0,_NIP-STACK_FRAME_OVERHEAD(r3) - PPC_STL r0,_LINK-STACK_FRAME_OVERHEAD(r3) + PPC_STL r0,_LINK(r3) + mflr r0 + PPC_STL r0,_NIP(r3) mfmsr r0 - PPC_STL r0,_MSR-STACK_FRAME_OVERHEAD(r3) + PPC_STL r0,_MSR(r3) mfctr r0 - PPC_STL r0,_CTR-STACK_FRAME_OVERHEAD(r3) + PPC_STL r0,_CTR(r3) mfxer r0 - PPC_STL r0,_XER-STACK_FRAME_OVERHEAD(r3) + PPC_STL r0,_XER(r3) mfcr r0 - PPC_STL r0,_CCR-STACK_FRAME_OVERHEAD(r3) + PPC_STL r0,_CCR(r3) li r0,0 - PPC_STL r0,_TRAP-STACK_FRAME_OVERHEAD(r3) + PPC_STL r0,_TRAP(r3) + PPC_STL r0,ORIG_GPR3(r3) blr diff --git a/arch/powerpc/kernel/proc_powerpc.c b/arch/powerpc/kernel/proc_powerpc.c index be3758d54e59..b109cd7b5d01 100644 --- a/arch/powerpc/kernel/proc_powerpc.c +++ b/arch/powerpc/kernel/proc_powerpc.c @@ -7,12 +7,12 @@ #include <linux/mm.h> #include <linux/proc_fs.h> #include <linux/kernel.h> +#include <linux/of.h> #include <asm/machdep.h> #include <asm/vdso_datapage.h> #include <asm/rtas.h> #include <linux/uaccess.h> -#include <asm/prom.h> #ifdef CONFIG_PPC64 @@ -25,7 +25,7 @@ static ssize_t page_map_read( struct file *file, char __user *buf, size_t nbytes loff_t *ppos) { return simple_read_from_buffer(buf, nbytes, ppos, - PDE_DATA(file_inode(file)), PAGE_SIZE); + pde_data(file_inode(file)), PAGE_SIZE); } static int page_map_mmap( struct file *file, struct vm_area_struct *vma ) @@ -34,15 +34,15 @@ static int page_map_mmap( struct file *file, struct vm_area_struct *vma ) return -EINVAL; remap_pfn_range(vma, vma->vm_start, - __pa(PDE_DATA(file_inode(file))) >> PAGE_SHIFT, + __pa(pde_data(file_inode(file))) >> PAGE_SHIFT, PAGE_SIZE, vma->vm_page_prot); return 0; } -static const struct file_operations page_map_fops = { - .llseek = page_map_seek, - .read = page_map_read, - .mmap = page_map_mmap +static const struct proc_ops page_map_proc_ops = { + .proc_lseek = page_map_seek, + .proc_read = page_map_read, + .proc_mmap = page_map_mmap, }; @@ -51,7 +51,7 @@ static int __init proc_ppc64_init(void) struct proc_dir_entry *pde; pde = proc_create_data("powerpc/systemcfg", S_IFREG | 0444, NULL, - &page_map_fops, vdso_data); + &page_map_proc_ops, vdso_data); if (!pde) return 1; proc_set_size(pde, PAGE_SIZE); diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 4df94b6e2f32..9452a54d356c 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -34,18 +34,15 @@ #include <linux/ftrace.h> #include <linux/kernel_stat.h> #include <linux/personality.h> -#include <linux/random.h> #include <linux/hw_breakpoint.h> #include <linux/uaccess.h> -#include <linux/elf-randomize.h> #include <linux/pkeys.h> #include <linux/seq_buf.h> -#include <asm/pgtable.h> +#include <asm/interrupt.h> #include <asm/io.h> #include <asm/processor.h> #include <asm/mmu.h> -#include <asm/prom.h> #include <asm/machdep.h> #include <asm/time.h> #include <asm/runlatch.h> @@ -96,7 +93,8 @@ static void check_if_tm_restore_required(struct task_struct *tsk) if (tsk == current && tsk->thread.regs && MSR_TM_ACTIVE(tsk->thread.regs->msr) && !test_thread_flag(TIF_RESTORE_TM)) { - tsk->thread.ckpt_regs.msr = tsk->thread.regs->msr; + regs_set_return_msr(&tsk->thread.ckpt_regs, + tsk->thread.regs->msr); set_thread_flag(TIF_RESTORE_TM); } } @@ -125,13 +123,11 @@ unsigned long notrace msr_check_and_set(unsigned long bits) newmsr = oldmsr | bits; -#ifdef CONFIG_VSX if (cpu_has_feature(CPU_FTR_VSX) && (bits & MSR_FP)) newmsr |= MSR_VSX; -#endif if (oldmsr != newmsr) - mtmsr_isync(newmsr); + newmsr = mtmsr_isync_irqsafe(newmsr); return newmsr; } @@ -145,13 +141,11 @@ void notrace __msr_check_and_clear(unsigned long bits) newmsr = oldmsr & ~bits; -#ifdef CONFIG_VSX if (cpu_has_feature(CPU_FTR_VSX) && (bits & MSR_FP)) newmsr &= ~MSR_VSX; -#endif if (oldmsr != newmsr) - mtmsr_isync(newmsr); + mtmsr_isync_irqsafe(newmsr); } EXPORT_SYMBOL(__msr_check_and_clear); @@ -163,11 +157,9 @@ static void __giveup_fpu(struct task_struct *tsk) save_fpu(tsk); msr = tsk->thread.regs->msr; msr &= ~(MSR_FP|MSR_FE0|MSR_FE1); -#ifdef CONFIG_VSX if (cpu_has_feature(CPU_FTR_VSX)) msr &= ~MSR_VSX; -#endif - tsk->thread.regs->msr = msr; + regs_set_return_msr(tsk->thread.regs, msr); } void giveup_fpu(struct task_struct *tsk) @@ -236,23 +228,11 @@ void enable_kernel_fp(void) } } EXPORT_SYMBOL(enable_kernel_fp); - -static int restore_fp(struct task_struct *tsk) -{ - if (tsk->thread.load_fp) { - load_fp_state(¤t->thread.fp_state); - current->thread.load_fp++; - return 1; - } - return 0; -} #else -static int restore_fp(struct task_struct *tsk) { return 0; } +static inline void __giveup_fpu(struct task_struct *tsk) { } #endif /* CONFIG_PPC_FPU */ #ifdef CONFIG_ALTIVEC -#define loadvec(thr) ((thr).load_vec) - static void __giveup_altivec(struct task_struct *tsk) { unsigned long msr; @@ -260,11 +240,9 @@ static void __giveup_altivec(struct task_struct *tsk) save_altivec(tsk); msr = tsk->thread.regs->msr; msr &= ~MSR_VEC; -#ifdef CONFIG_VSX if (cpu_has_feature(CPU_FTR_VSX)) msr &= ~MSR_VSX; -#endif - tsk->thread.regs->msr = msr; + regs_set_return_msr(tsk->thread.regs, msr); } void giveup_altivec(struct task_struct *tsk) @@ -318,21 +296,6 @@ void flush_altivec_to_thread(struct task_struct *tsk) } } EXPORT_SYMBOL_GPL(flush_altivec_to_thread); - -static int restore_altivec(struct task_struct *tsk) -{ - if (cpu_has_feature(CPU_FTR_ALTIVEC) && (tsk->thread.load_vec)) { - load_vr_state(&tsk->thread.vr_state); - tsk->thread.used_vr = 1; - tsk->thread.load_vec++; - - return 1; - } - return 0; -} -#else -#define loadvec(thr) 0 -static inline int restore_altivec(struct task_struct *tsk) { return 0; } #endif /* CONFIG_ALTIVEC */ #ifdef CONFIG_VSX @@ -341,7 +304,7 @@ static void __giveup_vsx(struct task_struct *tsk) unsigned long msr = tsk->thread.regs->msr; /* - * We should never be ssetting MSR_VSX without also setting + * We should never be setting MSR_VSX without also setting * MSR_FP and MSR_VEC */ WARN_ON((msr & MSR_VSX) && !((msr & MSR_FP) && (msr & MSR_VEC))); @@ -400,18 +363,6 @@ void flush_vsx_to_thread(struct task_struct *tsk) } } EXPORT_SYMBOL_GPL(flush_vsx_to_thread); - -static int restore_vsx(struct task_struct *tsk) -{ - if (cpu_has_feature(CPU_FTR_VSX)) { - tsk->thread.used_vsr = 1; - return 1; - } - - return 0; -} -#else -static inline int restore_vsx(struct task_struct *tsk) { return 0; } #endif /* CONFIG_VSX */ #ifdef CONFIG_SPE @@ -456,21 +407,14 @@ static unsigned long msr_all_available; static int __init init_msr_all_available(void) { -#ifdef CONFIG_PPC_FPU - msr_all_available |= MSR_FP; -#endif -#ifdef CONFIG_ALTIVEC + if (IS_ENABLED(CONFIG_PPC_FPU)) + msr_all_available |= MSR_FP; if (cpu_has_feature(CPU_FTR_ALTIVEC)) msr_all_available |= MSR_VEC; -#endif -#ifdef CONFIG_VSX if (cpu_has_feature(CPU_FTR_VSX)) msr_all_available |= MSR_VSX; -#endif -#ifdef CONFIG_SPE if (cpu_has_feature(CPU_FTR_SPE)) msr_all_available |= MSR_SPE; -#endif return 0; } @@ -494,23 +438,72 @@ void giveup_all(struct task_struct *tsk) WARN_ON((usermsr & MSR_VSX) && !((usermsr & MSR_FP) && (usermsr & MSR_VEC))); -#ifdef CONFIG_PPC_FPU if (usermsr & MSR_FP) __giveup_fpu(tsk); -#endif -#ifdef CONFIG_ALTIVEC if (usermsr & MSR_VEC) __giveup_altivec(tsk); -#endif -#ifdef CONFIG_SPE if (usermsr & MSR_SPE) __giveup_spe(tsk); -#endif msr_check_and_clear(msr_all_available); } EXPORT_SYMBOL(giveup_all); +#ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC_FPU +static bool should_restore_fp(void) +{ + if (current->thread.load_fp) { + current->thread.load_fp++; + return true; + } + return false; +} + +static void do_restore_fp(void) +{ + load_fp_state(¤t->thread.fp_state); +} +#else +static bool should_restore_fp(void) { return false; } +static void do_restore_fp(void) { } +#endif /* CONFIG_PPC_FPU */ + +#ifdef CONFIG_ALTIVEC +static bool should_restore_altivec(void) +{ + if (cpu_has_feature(CPU_FTR_ALTIVEC) && (current->thread.load_vec)) { + current->thread.load_vec++; + return true; + } + return false; +} + +static void do_restore_altivec(void) +{ + load_vr_state(¤t->thread.vr_state); + current->thread.used_vr = 1; +} +#else +static bool should_restore_altivec(void) { return false; } +static void do_restore_altivec(void) { } +#endif /* CONFIG_ALTIVEC */ + +static bool should_restore_vsx(void) +{ + if (cpu_has_feature(CPU_FTR_VSX)) + return true; + return false; +} +#ifdef CONFIG_VSX +static void do_restore_vsx(void) +{ + current->thread.used_vsr = 1; +} +#else +static void do_restore_vsx(void) { } +#endif /* CONFIG_VSX */ + /* * The exception exit path calls restore_math() with interrupts hard disabled * but the soft irq state not "reconciled". ftrace code that calls @@ -524,33 +517,50 @@ EXPORT_SYMBOL(giveup_all); void notrace restore_math(struct pt_regs *regs) { unsigned long msr; - - if (!MSR_TM_ACTIVE(regs->msr) && - !current->thread.load_fp && !loadvec(current->thread)) - return; + unsigned long new_msr = 0; msr = regs->msr; - msr_check_and_set(msr_all_available); /* - * Only reload if the bit is not set in the user MSR, the bit BEING set - * indicates that the registers are hot + * new_msr tracks the facilities that are to be restored. Only reload + * if the bit is not set in the user MSR (if it is set, the registers + * are live for the user thread). */ - if ((!(msr & MSR_FP)) && restore_fp(current)) - msr |= MSR_FP | current->thread.fpexc_mode; + if ((!(msr & MSR_FP)) && should_restore_fp()) + new_msr |= MSR_FP; - if ((!(msr & MSR_VEC)) && restore_altivec(current)) - msr |= MSR_VEC; + if ((!(msr & MSR_VEC)) && should_restore_altivec()) + new_msr |= MSR_VEC; - if ((msr & (MSR_FP | MSR_VEC)) == (MSR_FP | MSR_VEC) && - restore_vsx(current)) { - msr |= MSR_VSX; + if ((!(msr & MSR_VSX)) && should_restore_vsx()) { + if (((msr | new_msr) & (MSR_FP | MSR_VEC)) == (MSR_FP | MSR_VEC)) + new_msr |= MSR_VSX; } - msr_check_and_clear(msr_all_available); + if (new_msr) { + unsigned long fpexc_mode = 0; - regs->msr = msr; + msr_check_and_set(new_msr); + + if (new_msr & MSR_FP) { + do_restore_fp(); + + // This also covers VSX, because VSX implies FP + fpexc_mode = current->thread.fpexc_mode; + } + + if (new_msr & MSR_VEC) + do_restore_altivec(); + + if (new_msr & MSR_VSX) + do_restore_vsx(); + + msr_check_and_clear(new_msr); + + regs_set_return_msr(regs, regs->msr | new_msr | fpexc_mode); + } } +#endif /* CONFIG_PPC_BOOK3S_64 */ static void save_all(struct task_struct *tsk) { @@ -578,7 +588,6 @@ static void save_all(struct task_struct *tsk) __giveup_spe(tsk); msr_check_and_clear(msr_all_available); - thread_pkey_regs_save(&tsk->thread); } void flush_all_to_thread(struct task_struct *tsk) @@ -611,26 +620,70 @@ void do_send_trap(struct pt_regs *regs, unsigned long address, (void __user *)address); } #else /* !CONFIG_PPC_ADV_DEBUG_REGS */ -void do_break (struct pt_regs *regs, unsigned long address, - unsigned long error_code) + +static void do_break_handler(struct pt_regs *regs) +{ + struct arch_hw_breakpoint null_brk = {0}; + struct arch_hw_breakpoint *info; + ppc_inst_t instr = ppc_inst(0); + int type = 0; + int size = 0; + unsigned long ea; + int i; + + /* + * If underneath hw supports only one watchpoint, we know it + * caused exception. 8xx also falls into this category. + */ + if (nr_wp_slots() == 1) { + __set_breakpoint(0, &null_brk); + current->thread.hw_brk[0] = null_brk; + current->thread.hw_brk[0].flags |= HW_BRK_FLAG_DISABLED; + return; + } + + /* Otherwise find out which DAWR caused exception and disable it. */ + wp_get_instr_detail(regs, &instr, &type, &size, &ea); + + for (i = 0; i < nr_wp_slots(); i++) { + info = ¤t->thread.hw_brk[i]; + if (!info->address) + continue; + + if (wp_check_constraints(regs, instr, ea, type, size, info)) { + __set_breakpoint(i, &null_brk); + current->thread.hw_brk[i] = null_brk; + current->thread.hw_brk[i].flags |= HW_BRK_FLAG_DISABLED; + } + } +} + +DEFINE_INTERRUPT_HANDLER(do_break) { current->thread.trap_nr = TRAP_HWBKPT; - if (notify_die(DIE_DABR_MATCH, "dabr_match", regs, error_code, + if (notify_die(DIE_DABR_MATCH, "dabr_match", regs, regs->dsisr, 11, SIGSEGV) == NOTIFY_STOP) return; if (debugger_break_match(regs)) return; - /* Clear the breakpoint */ - hw_breakpoint_disable(); + /* + * We reach here only when watchpoint exception is generated by ptrace + * event (or hw is buggy!). Now if CONFIG_HAVE_HW_BREAKPOINT is set, + * watchpoint is already handled by hw_breakpoint_handler() so we don't + * have to do anything. But when CONFIG_HAVE_HW_BREAKPOINT is not set, + * we need to manually handle the watchpoint here. + */ + if (!IS_ENABLED(CONFIG_HAVE_HW_BREAKPOINT)) + do_break_handler(regs); /* Deliver the signal to userspace */ - force_sig_fault(SIGTRAP, TRAP_HWBKPT, (void __user *)address); + force_sig_fault(SIGTRAP, TRAP_HWBKPT, (void __user *)regs->dar); } #endif /* CONFIG_PPC_ADV_DEBUG_REGS */ -static DEFINE_PER_CPU(struct arch_hw_breakpoint, current_brk); +static DEFINE_PER_CPU(struct arch_hw_breakpoint, current_brk[HBP_NUM_MAX]); #ifdef CONFIG_PPC_ADV_DEBUG_REGS /* @@ -704,70 +757,52 @@ void switch_booke_debug_regs(struct debug_reg *new_debug) EXPORT_SYMBOL_GPL(switch_booke_debug_regs); #else /* !CONFIG_PPC_ADV_DEBUG_REGS */ #ifndef CONFIG_HAVE_HW_BREAKPOINT -static void set_breakpoint(struct arch_hw_breakpoint *brk) +static void set_breakpoint(int i, struct arch_hw_breakpoint *brk) { preempt_disable(); - __set_breakpoint(brk); + __set_breakpoint(i, brk); preempt_enable(); } static void set_debug_reg_defaults(struct thread_struct *thread) { - thread->hw_brk.address = 0; - thread->hw_brk.type = 0; - thread->hw_brk.len = 0; - thread->hw_brk.hw_len = 0; - if (ppc_breakpoint_available()) - set_breakpoint(&thread->hw_brk); -} -#endif /* !CONFIG_HAVE_HW_BREAKPOINT */ -#endif /* CONFIG_PPC_ADV_DEBUG_REGS */ + int i; + struct arch_hw_breakpoint null_brk = {0}; -#ifdef CONFIG_PPC_ADV_DEBUG_REGS -static inline int __set_dabr(unsigned long dabr, unsigned long dabrx) -{ - mtspr(SPRN_DAC1, dabr); -#ifdef CONFIG_PPC_47x - isync(); -#endif - return 0; + for (i = 0; i < nr_wp_slots(); i++) { + thread->hw_brk[i] = null_brk; + if (ppc_breakpoint_available()) + set_breakpoint(i, &thread->hw_brk[i]); + } } -#elif defined(CONFIG_PPC_BOOK3S) -static inline int __set_dabr(unsigned long dabr, unsigned long dabrx) + +static inline bool hw_brk_match(struct arch_hw_breakpoint *a, + struct arch_hw_breakpoint *b) { - mtspr(SPRN_DABR, dabr); - if (cpu_has_feature(CPU_FTR_DABRX)) - mtspr(SPRN_DABRX, dabrx); - return 0; + if (a->address != b->address) + return false; + if (a->type != b->type) + return false; + if (a->len != b->len) + return false; + /* no need to check hw_len. it's calculated from address and len */ + return true; } -#elif defined(CONFIG_PPC_8xx) -static inline int __set_dabr(unsigned long dabr, unsigned long dabrx) -{ - unsigned long addr = dabr & ~HW_BRK_TYPE_DABR; - unsigned long lctrl1 = 0x90000000; /* compare type: equal on E & F */ - unsigned long lctrl2 = 0x8e000002; /* watchpoint 1 on cmp E | F */ - if ((dabr & HW_BRK_TYPE_RDWR) == HW_BRK_TYPE_READ) - lctrl1 |= 0xa0000; - else if ((dabr & HW_BRK_TYPE_RDWR) == HW_BRK_TYPE_WRITE) - lctrl1 |= 0xf0000; - else if ((dabr & HW_BRK_TYPE_RDWR) == 0) - lctrl2 = 0; +static void switch_hw_breakpoint(struct task_struct *new) +{ + int i; - mtspr(SPRN_LCTRL2, 0); - mtspr(SPRN_CMPE, addr); - mtspr(SPRN_CMPF, addr + 4); - mtspr(SPRN_LCTRL1, lctrl1); - mtspr(SPRN_LCTRL2, lctrl2); + for (i = 0; i < nr_wp_slots(); i++) { + if (likely(hw_brk_match(this_cpu_ptr(¤t_brk[i]), + &new->thread.hw_brk[i]))) + continue; - return 0; -} -#else -static inline int __set_dabr(unsigned long dabr, unsigned long dabrx) -{ - return -EINVAL; + __set_breakpoint(i, &new->thread.hw_brk[i]); + } } -#endif +#endif /* !CONFIG_HAVE_HW_BREAKPOINT */ +#endif /* CONFIG_PPC_ADV_DEBUG_REGS */ static inline int set_dabr(struct arch_hw_breakpoint *brk) { @@ -779,16 +814,61 @@ static inline int set_dabr(struct arch_hw_breakpoint *brk) if (ppc_md.set_dabr) return ppc_md.set_dabr(dabr, dabrx); - return __set_dabr(dabr, dabrx); + if (IS_ENABLED(CONFIG_PPC_ADV_DEBUG_REGS)) { + mtspr(SPRN_DAC1, dabr); + if (IS_ENABLED(CONFIG_PPC_47x)) + isync(); + return 0; + } else if (IS_ENABLED(CONFIG_PPC_BOOK3S)) { + mtspr(SPRN_DABR, dabr); + if (cpu_has_feature(CPU_FTR_DABRX)) + mtspr(SPRN_DABRX, dabrx); + return 0; + } else { + return -EINVAL; + } } -void __set_breakpoint(struct arch_hw_breakpoint *brk) +static inline int set_breakpoint_8xx(struct arch_hw_breakpoint *brk) { - memcpy(this_cpu_ptr(¤t_brk), brk, sizeof(*brk)); + unsigned long lctrl1 = LCTRL1_CTE_GT | LCTRL1_CTF_LT | LCTRL1_CRWE_RW | + LCTRL1_CRWF_RW; + unsigned long lctrl2 = LCTRL2_LW0EN | LCTRL2_LW0LADC | LCTRL2_SLW0EN; + unsigned long start_addr = ALIGN_DOWN(brk->address, HW_BREAKPOINT_SIZE); + unsigned long end_addr = ALIGN(brk->address + brk->len, HW_BREAKPOINT_SIZE); + if (start_addr == 0) + lctrl2 |= LCTRL2_LW0LA_F; + else if (end_addr == 0) + lctrl2 |= LCTRL2_LW0LA_E; + else + lctrl2 |= LCTRL2_LW0LA_EandF; + + mtspr(SPRN_LCTRL2, 0); + + if ((brk->type & HW_BRK_TYPE_RDWR) == 0) + return 0; + + if ((brk->type & HW_BRK_TYPE_RDWR) == HW_BRK_TYPE_READ) + lctrl1 |= LCTRL1_CRWE_RO | LCTRL1_CRWF_RO; + if ((brk->type & HW_BRK_TYPE_RDWR) == HW_BRK_TYPE_WRITE) + lctrl1 |= LCTRL1_CRWE_WO | LCTRL1_CRWF_WO; + + mtspr(SPRN_CMPE, start_addr - 1); + mtspr(SPRN_CMPF, end_addr); + mtspr(SPRN_LCTRL1, lctrl1); + mtspr(SPRN_LCTRL2, lctrl2); + + return 0; +} + +static void set_hw_breakpoint(int nr, struct arch_hw_breakpoint *brk) +{ if (dawr_enabled()) // Power8 or later - set_dawr(brk); + set_dawr(nr, brk); + else if (IS_ENABLED(CONFIG_PPC_8xx)) + set_breakpoint_8xx(brk); else if (!cpu_has_feature(CPU_FTR_ARCH_207S)) // Power7 or earlier set_dabr(brk); @@ -797,6 +877,12 @@ void __set_breakpoint(struct arch_hw_breakpoint *brk) WARN_ON_ONCE(1); } +void __set_breakpoint(int nr, struct arch_hw_breakpoint *brk) +{ + memcpy(this_cpu_ptr(¤t_brk[nr]), brk, sizeof(*brk)); + set_hw_breakpoint(nr, brk); +} + /* Check if we have DAWR or DABR hardware */ bool ppc_breakpoint_available(void) { @@ -809,17 +895,32 @@ bool ppc_breakpoint_available(void) } EXPORT_SYMBOL_GPL(ppc_breakpoint_available); -static inline bool hw_brk_match(struct arch_hw_breakpoint *a, - struct arch_hw_breakpoint *b) +/* Disable the breakpoint in hardware without touching current_brk[] */ +void suspend_breakpoints(void) { - if (a->address != b->address) - return false; - if (a->type != b->type) - return false; - if (a->len != b->len) - return false; - /* no need to check hw_len. it's calculated from address and len */ - return true; + struct arch_hw_breakpoint brk = {0}; + int i; + + if (!ppc_breakpoint_available()) + return; + + for (i = 0; i < nr_wp_slots(); i++) + set_hw_breakpoint(i, &brk); +} + +/* + * Re-enable breakpoints suspended by suspend_breakpoints() in hardware + * from current_brk[] + */ +void restore_breakpoints(void) +{ + int i; + + if (!ppc_breakpoint_available()) + return; + + for (i = 0; i < nr_wp_slots(); i++) + set_hw_breakpoint(i, this_cpu_ptr(¤t_brk[i])); } #ifdef CONFIG_PPC_TRANSACTIONAL_MEM @@ -1043,12 +1144,13 @@ void restore_tm_state(struct pt_regs *regs) #endif restore_math(regs); - regs->msr |= msr_diff; + regs_set_return_msr(regs, regs->msr | msr_diff); } -#else +#else /* !CONFIG_PPC_TRANSACTIONAL_MEM */ #define tm_recheckpoint_new_task(new) #define __switch_to_tm(prev, new) +void tm_reclaim_current(uint8_t cause) {} #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ static inline void save_sprs(struct thread_struct *t) @@ -1057,6 +1159,10 @@ static inline void save_sprs(struct thread_struct *t) if (cpu_has_feature(CPU_FTR_ALTIVEC)) t->vrsave = mfspr(SPRN_VRSAVE); #endif +#ifdef CONFIG_SPE + if (cpu_has_feature(CPU_FTR_SPE)) + t->spefscr = mfspr(SPRN_SPEFSCR); +#endif #ifdef CONFIG_PPC_BOOK3S_64 if (cpu_has_feature(CPU_FTR_DSCR)) t->dscr = mfspr(SPRN_DSCR); @@ -1076,10 +1182,45 @@ static inline void save_sprs(struct thread_struct *t) */ t->tar = mfspr(SPRN_TAR); } + + if (cpu_has_feature(CPU_FTR_DEXCR_NPHIE)) + t->hashkeyr = mfspr(SPRN_HASHKEYR); +#endif +} + +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE +void kvmppc_save_user_regs(void) +{ + unsigned long usermsr; + + if (!current->thread.regs) + return; + + usermsr = current->thread.regs->msr; + + /* Caller has enabled FP/VEC/VSX/TM in MSR */ + if (usermsr & MSR_FP) + __giveup_fpu(current); + if (usermsr & MSR_VEC) + __giveup_altivec(current); + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + if (usermsr & MSR_TM) { + current->thread.tm_tfhar = mfspr(SPRN_TFHAR); + current->thread.tm_tfiar = mfspr(SPRN_TFIAR); + current->thread.tm_texasr = mfspr(SPRN_TEXASR); + current->thread.regs->msr &= ~MSR_TM; + } #endif +} +EXPORT_SYMBOL_GPL(kvmppc_save_user_regs); - thread_pkey_regs_save(t); +void kvmppc_save_current_sprs(void) +{ + save_sprs(¤t->thread); } +EXPORT_SYMBOL_GPL(kvmppc_save_current_sprs); +#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ static inline void restore_sprs(struct thread_struct *old_thread, struct thread_struct *new_thread) @@ -1089,6 +1230,11 @@ static inline void restore_sprs(struct thread_struct *old_thread, old_thread->vrsave != new_thread->vrsave) mtspr(SPRN_VRSAVE, new_thread->vrsave); #endif +#ifdef CONFIG_SPE + if (cpu_has_feature(CPU_FTR_SPE) && + old_thread->spefscr != new_thread->spefscr) + mtspr(SPRN_SPEFSCR, new_thread->spefscr); +#endif #ifdef CONFIG_PPC_BOOK3S_64 if (cpu_has_feature(CPU_FTR_DSCR)) { u64 dscr = get_paca()->dscr_default; @@ -1117,9 +1263,12 @@ static inline void restore_sprs(struct thread_struct *old_thread, if (cpu_has_feature(CPU_FTR_P9_TIDR) && old_thread->tidr != new_thread->tidr) mtspr(SPRN_TIDR, new_thread->tidr); + + if (cpu_has_feature(CPU_FTR_DEXCR_NPHIE) && + old_thread->hashkeyr != new_thread->hashkeyr) + mtspr(SPRN_HASHKEYR, new_thread->hashkeyr); #endif - thread_pkey_regs_restore(new_thread, old_thread); } struct task_struct *__switch_to(struct task_struct *prev, @@ -1127,7 +1276,7 @@ struct task_struct *__switch_to(struct task_struct *prev, { struct thread_struct *new_thread, *old_thread; struct task_struct *last; -#ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC_64S_HASH_MMU struct ppc64_tlb_batch *batch; #endif @@ -1136,7 +1285,7 @@ struct task_struct *__switch_to(struct task_struct *prev, WARN_ON(!irqs_disabled()); -#ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC_64S_HASH_MMU batch = this_cpu_ptr(&ppc64_tlb_batch); if (batch->active) { current_thread_info()->local_flags |= _TLF_LAZY_MMU; @@ -1144,6 +1293,19 @@ struct task_struct *__switch_to(struct task_struct *prev, __flush_tlb_pending(batch); batch->active = 0; } + + /* + * On POWER9 the copy-paste buffer can only paste into + * foreign real addresses, so unprivileged processes can not + * see the data or use it in any way unless they have + * foreign real mappings. If the new process has the foreign + * real address mappings, we must issue a cp_abort to clear + * any state and prevent snooping, corruption or a covert + * channel. ISA v3.1 supports paste into local memory. + */ + if (new->mm && (cpu_has_feature(CPU_FTR_ARCH_31) || + atomic_read(&new->mm->context.vas_windows))) + asm volatile(PPC_CP_ABORT); #endif /* CONFIG_PPC_BOOK3S_64 */ #ifdef CONFIG_PPC_ADV_DEBUG_REGS @@ -1154,8 +1316,7 @@ struct task_struct *__switch_to(struct task_struct *prev, * schedule DABR */ #ifndef CONFIG_HAVE_HW_BREAKPOINT - if (unlikely(!hw_brk_match(this_cpu_ptr(¤t_brk), &new->thread.hw_brk))) - __set_breakpoint(&new->thread.hw_brk); + switch_hw_breakpoint(new); #endif /* CONFIG_HAVE_HW_BREAKPOINT */ #endif @@ -1180,37 +1341,50 @@ struct task_struct *__switch_to(struct task_struct *prev, } /* - * Call restore_sprs() before calling _switch(). If we move it after - * _switch() then we miss out on calling it for new tasks. The reason - * for this is we manually create a stack frame for new tasks that - * directly returns through ret_from_fork() or + * Call restore_sprs() and set_return_regs_changed() before calling + * _switch(). If we move it after _switch() then we miss out on calling + * it for new tasks. The reason for this is we manually create a stack + * frame for new tasks that directly returns through ret_from_fork() or * ret_from_kernel_thread(). See copy_thread() for details. */ restore_sprs(old_thread, new_thread); + set_return_regs_changed(); /* _switch changes stack (and regs) */ + + if (!IS_ENABLED(CONFIG_PPC_BOOK3S_64)) + kuap_assert_locked(); + last = _switch(old_thread, new_thread); + /* + * Nothing after _switch will be run for newly created tasks, + * because they switch directly to ret_from_fork/ret_from_kernel_thread + * etc. Code added here should have a comment explaining why that is + * okay. + */ + #ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC_64S_HASH_MMU + /* + * This applies to a process that was context switched while inside + * arch_enter_lazy_mmu_mode(), to re-activate the batch that was + * deactivated above, before _switch(). This will never be the case + * for new tasks. + */ if (current_thread_info()->local_flags & _TLF_LAZY_MMU) { current_thread_info()->local_flags &= ~_TLF_LAZY_MMU; batch = this_cpu_ptr(&ppc64_tlb_batch); batch->active = 1; } +#endif - if (current->thread.regs) { + /* + * Math facilities are masked out of the child MSR in copy_thread. + * A new task does not need to restore_math because it will + * demand fault them. + */ + if (current->thread.regs) restore_math(current->thread.regs); - - /* - * The copy-paste buffer can only store into foreign real - * addresses, so unprivileged processes can not see the - * data or use it in any way unless they have foreign real - * mappings. If the new process has the foreign real address - * mappings, we must issue a cp_abort to clear any state and - * prevent snooping, corruption or a covert channel. - */ - if (current->thread.used_vas) - asm volatile(PPC_CP_ABORT); - } #endif /* CONFIG_PPC_BOOK3S_64 */ return last; @@ -1221,29 +1395,27 @@ struct task_struct *__switch_to(struct task_struct *prev, static void show_instructions(struct pt_regs *regs) { int i; + unsigned long nip = regs->nip; unsigned long pc = regs->nip - (NR_INSN_TO_PRINT * 3 / 4 * sizeof(int)); - printk("Instruction dump:"); + printk("Code: "); + + /* + * If we were executing with the MMU off for instructions, adjust pc + * rather than printing XXXXXXXX. + */ + if (!IS_ENABLED(CONFIG_BOOKE) && !(regs->msr & MSR_IR)) { + pc = (unsigned long)phys_to_virt(pc); + nip = (unsigned long)phys_to_virt(regs->nip); + } for (i = 0; i < NR_INSN_TO_PRINT; i++) { int instr; - if (!(i % 8)) - pr_cont("\n"); - -#if !defined(CONFIG_BOOKE) - /* If executing with the IMMU off, adjust pc rather - * than print XXXXXXXX. - */ - if (!(regs->msr & MSR_IR)) - pc = (unsigned long)phys_to_virt(pc); -#endif - - if (!__kernel_text_address(pc) || - probe_kernel_address((const void *)pc, instr)) { + if (get_kernel_nofault(instr, (const void *)pc)) { pr_cont("XXXXXXXX "); } else { - if (regs->nip == pc) + if (nip == pc) pr_cont("<%08x> ", instr); else pr_cont("%08x ", instr); @@ -1264,16 +1436,6 @@ void show_user_instructions(struct pt_regs *regs) pc = regs->nip - (NR_INSN_TO_PRINT * 3 / 4 * sizeof(int)); - /* - * Make sure the NIP points at userspace, not kernel text/data or - * elsewhere. - */ - if (!__access_ok(pc, NR_INSN_TO_PRINT * sizeof(int), USER_DS)) { - pr_info("%s[%d]: Bad NIP, not dumping instructions.\n", - current->comm, current->pid); - return; - } - seq_buf_init(&s, buf, sizeof(buf)); while (n) { @@ -1284,7 +1446,8 @@ void show_user_instructions(struct pt_regs *regs) for (i = 0; i < 8 && n; i++, n--, pc += sizeof(int)) { int instr; - if (probe_kernel_address((const void *)pc, instr)) { + if (copy_from_user_nofault(&instr, (void __user *)pc, + sizeof(instr))) { seq_buf_printf(&s, "XXXXXXXX "); continue; } @@ -1381,19 +1544,15 @@ static void print_msr_bits(unsigned long val) #ifdef CONFIG_PPC64 #define REG "%016lx" #define REGS_PER_LINE 4 -#define LAST_VOLATILE 13 #else #define REG "%08lx" #define REGS_PER_LINE 8 -#define LAST_VOLATILE 12 #endif -void show_regs(struct pt_regs * regs) +static void __show_regs(struct pt_regs *regs) { int i, trap; - show_regs_print_info(KERN_DEFAULT); - printk("NIP: "REG" LR: "REG" CTR: "REG"\n", regs->nip, regs->link, regs->ctr); printk("REGS: %px TRAP: %04lx %s (%s)\n", @@ -1402,14 +1561,17 @@ void show_regs(struct pt_regs * regs) print_msr_bits(regs->msr); pr_cont(" CR: %08lx XER: %08lx\n", regs->ccr, regs->xer); trap = TRAP(regs); - if ((TRAP(regs) != 0xc00) && cpu_has_feature(CPU_FTR_CFAR)) + if (!trap_is_syscall(regs) && cpu_has_feature(CPU_FTR_CFAR)) pr_cont("CFAR: "REG" ", regs->orig_gpr3); - if (trap == 0x200 || trap == 0x300 || trap == 0x600) -#if defined(CONFIG_4xx) || defined(CONFIG_BOOKE) - pr_cont("DEAR: "REG" ESR: "REG" ", regs->dar, regs->dsisr); -#else - pr_cont("DAR: "REG" DSISR: %08lx ", regs->dar, regs->dsisr); -#endif + if (trap == INTERRUPT_MACHINE_CHECK || + trap == INTERRUPT_DATA_STORAGE || + trap == INTERRUPT_ALIGNMENT) { + if (IS_ENABLED(CONFIG_4xx) || IS_ENABLED(CONFIG_BOOKE)) + pr_cont("DEAR: "REG" ESR: "REG" ", regs->dear, regs->esr); + else + pr_cont("DAR: "REG" DSISR: %08lx ", regs->dar, regs->dsisr); + } + #ifdef CONFIG_PPC64 pr_cont("IRQMASK: %lx ", regs->softe); #endif @@ -1422,19 +1584,23 @@ void show_regs(struct pt_regs * regs) if ((i % REGS_PER_LINE) == 0) pr_cont("\nGPR%02d: ", i); pr_cont(REG " ", regs->gpr[i]); - if (i == LAST_VOLATILE && !FULL_REGS(regs)) - break; } pr_cont("\n"); -#ifdef CONFIG_KALLSYMS /* * Lookup NIP late so we have the best change of getting the * above info out without failing */ - printk("NIP ["REG"] %pS\n", regs->nip, (void *)regs->nip); - printk("LR ["REG"] %pS\n", regs->link, (void *)regs->link); -#endif - show_stack(current, (unsigned long *) regs->gpr[1]); + if (IS_ENABLED(CONFIG_KALLSYMS)) { + printk("NIP ["REG"] %pS\n", regs->nip, (void *)regs->nip); + printk("LR ["REG"] %pS\n", regs->link, (void *)regs->link); + } +} + +void show_regs(struct pt_regs *regs) +{ + show_regs_print_info(KERN_DEFAULT); + __show_regs(regs); + show_stack(current, (unsigned long *) regs->gpr[1], KERN_DEFAULT); if (!user_mode(regs)) show_instructions(regs); } @@ -1448,38 +1614,30 @@ void flush_thread(void) #endif /* CONFIG_HAVE_HW_BREAKPOINT */ } -#ifdef CONFIG_PPC_BOOK3S_64 void arch_setup_new_exec(void) { - if (radix_enabled()) - return; - hash__setup_new_exec(); -} -#endif -int set_thread_uses_vas(void) -{ #ifdef CONFIG_PPC_BOOK3S_64 - if (!cpu_has_feature(CPU_FTR_ARCH_300)) - return -EINVAL; - - current->thread.used_vas = 1; - + if (!radix_enabled()) + hash__setup_new_exec(); +#endif /* - * Even a process that has no foreign real address mapping can use - * an unpaired COPY instruction (to no real effect). Issue CP_ABORT - * to clear any pending COPY and prevent a covert channel. - * - * __switch_to() will issue CP_ABORT on future context switches. + * If we exec out of a kernel thread then thread.regs will not be + * set. Do it now. */ - asm volatile(PPC_CP_ABORT); + if (!current->thread.regs) { + struct pt_regs *regs = task_stack_page(current) + THREAD_SIZE; + current->thread.regs = regs - 1; + } -#endif /* CONFIG_PPC_BOOK3S_64 */ - return 0; +#ifdef CONFIG_PPC_MEM_KEYS + current->thread.regs->amr = default_amr; + current->thread.regs->iamr = default_iamr; +#endif } #ifdef CONFIG_PPC64 -/** +/* * Assign a TIDR (thread ID) for task @t and set it in the thread * structure. For now, we only support setting TIDR for 'current' task. * @@ -1532,11 +1690,6 @@ EXPORT_SYMBOL_GPL(set_thread_tidr); #endif /* CONFIG_PPC64 */ -void -release_thread(struct task_struct *t) -{ -} - /* * this gets called so that we can store coprocessor state into memory and * copy the current task into the new thread. @@ -1565,7 +1718,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) static void setup_ksp_vsid(struct task_struct *p, unsigned long sp) { -#ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC_64S_HASH_MMU unsigned long sp_vsid; unsigned long llp = mmu_psize_defs[mmu_linear_psize].sllp; @@ -1590,59 +1743,85 @@ static void setup_ksp_vsid(struct task_struct *p, unsigned long sp) /* * Copy architecture-specific thread state */ -int copy_thread_tls(unsigned long clone_flags, unsigned long usp, - unsigned long kthread_arg, struct task_struct *p, - unsigned long tls) +int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { - struct pt_regs *childregs, *kregs; + struct pt_regs *kregs; /* Switch frame regs */ extern void ret_from_fork(void); - extern void ret_from_kernel_thread(void); + extern void ret_from_fork_scv(void); + extern void ret_from_kernel_user_thread(void); + extern void start_kernel_thread(void); void (*f)(void); unsigned long sp = (unsigned long)task_stack_page(p) + THREAD_SIZE; - struct thread_info *ti = task_thread_info(p); +#ifdef CONFIG_HAVE_HW_BREAKPOINT + int i; +#endif klp_init_thread_info(p); - /* Copy registers */ - sp -= sizeof(struct pt_regs); - childregs = (struct pt_regs *) sp; if (unlikely(p->flags & PF_KTHREAD)) { /* kernel thread */ - memset(childregs, 0, sizeof(struct pt_regs)); - childregs->gpr[1] = sp + sizeof(struct pt_regs); - /* function */ - if (usp) - childregs->gpr[14] = ppc_function_entry((void *)usp); -#ifdef CONFIG_PPC64 - clear_tsk_thread_flag(p, TIF_32BIT); - childregs->softe = IRQS_ENABLED; -#endif - childregs->gpr[15] = kthread_arg; + + /* Create initial minimum stack frame. */ + sp -= STACK_FRAME_MIN_SIZE; + ((unsigned long *)sp)[0] = 0; + + f = start_kernel_thread; p->thread.regs = NULL; /* no user register state */ - ti->flags |= _TIF_RESTOREALL; - f = ret_from_kernel_thread; + clear_tsk_compat_task(p); } else { /* user thread */ - struct pt_regs *regs = current_pt_regs(); - CHECK_FULL_REGS(regs); - *childregs = *regs; - if (usp) - childregs->gpr[1] = usp; - p->thread.regs = childregs; - childregs->gpr[3] = 0; /* Result from fork() */ - if (clone_flags & CLONE_SETTLS) { + struct pt_regs *childregs; + + /* Create initial user return stack frame. */ + sp -= STACK_USER_INT_FRAME_SIZE; + *(unsigned long *)(sp + STACK_INT_FRAME_MARKER) = STACK_FRAME_REGS_MARKER; + + childregs = (struct pt_regs *)(sp + STACK_INT_FRAME_REGS); + + if (unlikely(args->fn)) { + /* + * A user space thread, but it first runs a kernel + * thread, and then returns as though it had called + * execve rather than fork, so user regs will be + * filled in (e.g., by kernel_execve()). + */ + ((unsigned long *)sp)[0] = 0; + memset(childregs, 0, sizeof(struct pt_regs)); #ifdef CONFIG_PPC64 - if (!is_32bit_task()) - childregs->gpr[13] = tls; - else + childregs->softe = IRQS_ENABLED; #endif - childregs->gpr[2] = tls; + f = ret_from_kernel_user_thread; + } else { + struct pt_regs *regs = current_pt_regs(); + unsigned long clone_flags = args->flags; + unsigned long usp = args->stack; + + /* Copy registers */ + *childregs = *regs; + if (usp) + childregs->gpr[1] = usp; + ((unsigned long *)sp)[0] = childregs->gpr[1]; +#ifdef CONFIG_PPC_IRQ_SOFT_MASK_DEBUG + WARN_ON_ONCE(childregs->softe != IRQS_ENABLED); +#endif + if (clone_flags & CLONE_SETTLS) { + unsigned long tls = args->tls; + + if (!is_32bit_task()) + childregs->gpr[13] = tls; + else + childregs->gpr[2] = tls; + } + + if (trap_is_scv(regs)) + f = ret_from_fork_scv; + else + f = ret_from_fork; } - f = ret_from_fork; + childregs->msr &= ~(MSR_FP|MSR_VEC|MSR_VSX); + p->thread.regs = childregs; } - childregs->msr &= ~(MSR_FP|MSR_VEC|MSR_VSX); - sp -= STACK_FRAME_OVERHEAD; /* * The way this works is that at some point in the future @@ -1652,22 +1831,39 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long usp, * do some house keeping and then return from the fork or clone * system call, using the stack frame created above. */ - ((unsigned long *)sp)[0] = 0; - sp -= sizeof(struct pt_regs); - kregs = (struct pt_regs *) sp; - sp -= STACK_FRAME_OVERHEAD; + ((unsigned long *)sp)[STACK_FRAME_LR_SAVE] = (unsigned long)f; + sp -= STACK_SWITCH_FRAME_SIZE; + ((unsigned long *)sp)[0] = sp + STACK_SWITCH_FRAME_SIZE; + kregs = (struct pt_regs *)(sp + STACK_SWITCH_FRAME_REGS); + kregs->nip = ppc_function_entry(f); + if (unlikely(args->fn)) { + /* + * Put kthread fn, arg parameters in non-volatile GPRs in the + * switch frame so they are loaded by _switch before it returns + * to ret_from_kernel_thread. + */ + kregs->gpr[14] = ppc_function_entry((void *)args->fn); + kregs->gpr[15] = (unsigned long)args->fn_arg; + } p->thread.ksp = sp; -#ifdef CONFIG_PPC32 - p->thread.ksp_limit = (unsigned long)end_of_stack(p); -#endif + #ifdef CONFIG_HAVE_HW_BREAKPOINT - p->thread.ptrace_bps[0] = NULL; + for (i = 0; i < nr_wp_slots(); i++) + p->thread.ptrace_bps[i] = NULL; #endif +#ifdef CONFIG_PPC_FPU_REGS p->thread.fp_save_area = NULL; +#endif #ifdef CONFIG_ALTIVEC p->thread.vr_save_area = NULL; #endif +#if defined(CONFIG_PPC_BOOK3S_32) && defined(CONFIG_PPC_KUAP) + p->thread.kuap = KUAP_NONE; +#endif +#if defined(CONFIG_BOOKE_OR_40x) && defined(CONFIG_PPC_KUAP) + p->thread.pid = MMU_NO_CONTEXT; +#endif setup_ksp_vsid(p, sp); @@ -1676,12 +1872,13 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long usp, p->thread.dscr_inherit = current->thread.dscr_inherit; p->thread.dscr = mfspr(SPRN_DSCR); } - if (cpu_has_feature(CPU_FTR_HAS_PPR)) - childregs->ppr = DEFAULT_PPR; p->thread.tidr = 0; #endif - kregs->nip = ppc_function_entry(f); +#ifdef CONFIG_PPC_BOOK3S_64 + if (cpu_has_feature(CPU_FTR_DEXCR_NPHIE)) + p->thread.hashkeyr = current->thread.hashkeyr; +#endif return 0; } @@ -1695,20 +1892,9 @@ void start_thread(struct pt_regs *regs, unsigned long start, unsigned long sp) #ifdef CONFIG_PPC64 unsigned long load_addr = regs->gpr[2]; /* saved by ELF_PLAT_INIT */ -#ifdef CONFIG_PPC_BOOK3S_64 - if (!radix_enabled()) + if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && !radix_enabled()) preload_new_slb_context(start, sp); #endif -#endif - - /* - * If we exec out of a kernel thread then thread.regs will not be - * set. Do it now. - */ - if (!current->thread.regs) { - struct pt_regs *regs = task_stack_page(current) + THREAD_SIZE; - current->thread.regs = regs - 1; - } #ifdef CONFIG_PPC_TRANSACTIONAL_MEM /* @@ -1720,20 +1906,13 @@ void start_thread(struct pt_regs *regs, unsigned long start, unsigned long sp) tm_reclaim_current(0); #endif - memset(regs->gpr, 0, sizeof(regs->gpr)); + memset(®s->gpr[1], 0, sizeof(regs->gpr) - sizeof(regs->gpr[0])); regs->ctr = 0; regs->link = 0; regs->xer = 0; regs->ccr = 0; regs->gpr[1] = sp; - /* - * We have just cleared all the nonvolatile GPRs, so make - * FULL_REGS(regs) return true. This is necessary to allow - * ptrace to examine the thread immediately after exec. - */ - regs->trap &= ~1UL; - #ifdef CONFIG_PPC32 regs->mq = 0; regs->nip = start; @@ -1778,21 +1957,24 @@ void start_thread(struct pt_regs *regs, unsigned long start, unsigned long sp) } regs->gpr[2] = toc; } - regs->nip = entry; - regs->msr = MSR_USER64; + regs_set_return_ip(regs, entry); + regs_set_return_msr(regs, MSR_USER64); } else { - regs->nip = start; regs->gpr[2] = 0; - regs->msr = MSR_USER32; + regs_set_return_ip(regs, start); + regs_set_return_msr(regs, MSR_USER32); } + #endif #ifdef CONFIG_VSX current->thread.used_vsr = 0; #endif current->thread.load_slb = 0; current->thread.load_fp = 0; +#ifdef CONFIG_PPC_FPU_REGS memset(¤t->thread.fp_state, 0, sizeof(current->thread.fp_state)); current->thread.fp_save_area = NULL; +#endif #ifdef CONFIG_ALTIVEC memset(¤t->thread.vr_state, 0, sizeof(current->thread.vr_state)); current->thread.vr_state.vscr.u[3] = 0x00010000; /* Java mode disabled */ @@ -1813,8 +1995,12 @@ void start_thread(struct pt_regs *regs, unsigned long start, unsigned long sp) current->thread.tm_tfiar = 0; current->thread.load_tm = 0; #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ - - thread_pkey_regs_init(¤t->thread); +#ifdef CONFIG_PPC_BOOK3S_64 + if (cpu_has_feature(CPU_FTR_DEXCR_NPHIE)) { + current->thread.hashkeyr = get_random_long(); + mtspr(SPRN_HASHKEYR, current->thread.hashkeyr); + } +#endif /* CONFIG_PPC_BOOK3S_64 */ } EXPORT_SYMBOL(start_thread); @@ -1830,7 +2016,6 @@ int set_fpexc_mode(struct task_struct *tsk, unsigned int val) * fpexc_mode. fpexc_mode is also used for setting FP exception * mode (asyn, precise, disabled) for 'Classic' FP. */ if (val & PR_FP_EXC_SW_ENABLE) { -#ifdef CONFIG_SPE if (cpu_has_feature(CPU_FTR_SPE)) { /* * When the sticky exception bits are set @@ -1844,16 +2029,15 @@ int set_fpexc_mode(struct task_struct *tsk, unsigned int val) * anyway to restore the prctl settings from * the saved environment. */ +#ifdef CONFIG_SPE tsk->thread.spefscr_last = mfspr(SPRN_SPEFSCR); tsk->thread.fpexc_mode = val & (PR_FP_EXC_SW_ENABLE | PR_FP_ALL_EXCEPT); +#endif return 0; } else { return -EINVAL; } -#else - return -EINVAL; -#endif } /* on a CONFIG_SPE this does not hurt us. The bits that @@ -1864,18 +2048,18 @@ int set_fpexc_mode(struct task_struct *tsk, unsigned int val) if (val > PR_FP_EXC_PRECISE) return -EINVAL; tsk->thread.fpexc_mode = __pack_fe01(val); - if (regs != NULL && (regs->msr & MSR_FP) != 0) - regs->msr = (regs->msr & ~(MSR_FE0|MSR_FE1)) - | tsk->thread.fpexc_mode; + if (regs != NULL && (regs->msr & MSR_FP) != 0) { + regs_set_return_msr(regs, (regs->msr & ~(MSR_FE0|MSR_FE1)) + | tsk->thread.fpexc_mode); + } return 0; } int get_fpexc_mode(struct task_struct *tsk, unsigned long adr) { - unsigned int val; + unsigned int val = 0; - if (tsk->thread.fpexc_mode & PR_FP_EXC_SW_ENABLE) -#ifdef CONFIG_SPE + if (tsk->thread.fpexc_mode & PR_FP_EXC_SW_ENABLE) { if (cpu_has_feature(CPU_FTR_SPE)) { /* * When the sticky exception bits are set @@ -1889,15 +2073,15 @@ int get_fpexc_mode(struct task_struct *tsk, unsigned long adr) * anyway to restore the prctl settings from * the saved environment. */ +#ifdef CONFIG_SPE tsk->thread.spefscr_last = mfspr(SPRN_SPEFSCR); val = tsk->thread.fpexc_mode; +#endif } else return -EINVAL; -#else - return -EINVAL; -#endif - else + } else { val = __unpack_fe01(tsk->thread.fpexc_mode); + } return put_user(val, (unsigned int __user *) adr); } @@ -1913,9 +2097,9 @@ int set_endian(struct task_struct *tsk, unsigned int val) return -EINVAL; if (val == PR_ENDIAN_BIG) - regs->msr &= ~MSR_LE; + regs_set_return_msr(regs, regs->msr & ~MSR_LE); else if (val == PR_ENDIAN_LITTLE || val == PR_ENDIAN_PPC_LITTLE) - regs->msr |= MSR_LE; + regs_set_return_msr(regs, regs->msr | MSR_LE); else return -EINVAL; @@ -1962,6 +2146,9 @@ static inline int valid_irq_stack(unsigned long sp, struct task_struct *p, unsigned long stack_page; unsigned long cpu = task_cpu(p); + if (!hardirq_ctx[cpu] || !softirq_ctx[cpu]) + return 0; + stack_page = (unsigned long)hardirq_ctx[cpu]; if (sp >= stack_page && sp <= stack_page + THREAD_SIZE - nbytes) return 1; @@ -1973,8 +2160,48 @@ static inline int valid_irq_stack(unsigned long sp, struct task_struct *p, return 0; } -int validate_sp(unsigned long sp, struct task_struct *p, - unsigned long nbytes) +static inline int valid_emergency_stack(unsigned long sp, struct task_struct *p, + unsigned long nbytes) +{ +#ifdef CONFIG_PPC64 + unsigned long stack_page; + unsigned long cpu = task_cpu(p); + + if (!paca_ptrs) + return 0; + + if (!paca_ptrs[cpu]->emergency_sp) + return 0; + +# ifdef CONFIG_PPC_BOOK3S_64 + if (!paca_ptrs[cpu]->nmi_emergency_sp || !paca_ptrs[cpu]->mc_emergency_sp) + return 0; +#endif + + stack_page = (unsigned long)paca_ptrs[cpu]->emergency_sp - THREAD_SIZE; + if (sp >= stack_page && sp <= stack_page + THREAD_SIZE - nbytes) + return 1; + +# ifdef CONFIG_PPC_BOOK3S_64 + stack_page = (unsigned long)paca_ptrs[cpu]->nmi_emergency_sp - THREAD_SIZE; + if (sp >= stack_page && sp <= stack_page + THREAD_SIZE - nbytes) + return 1; + + stack_page = (unsigned long)paca_ptrs[cpu]->mc_emergency_sp - THREAD_SIZE; + if (sp >= stack_page && sp <= stack_page + THREAD_SIZE - nbytes) + return 1; +# endif +#endif + + return 0; +} + +/* + * validate the stack frame of a particular minimum size, used for when we are + * looking at a certain object in the stack beyond the minimum. + */ +int validate_sp_size(unsigned long sp, struct task_struct *p, + unsigned long nbytes) { unsigned long stack_page = (unsigned long)task_stack_page(p); @@ -1984,30 +2211,32 @@ int validate_sp(unsigned long sp, struct task_struct *p, if (sp >= stack_page && sp <= stack_page + THREAD_SIZE - nbytes) return 1; - return valid_irq_stack(sp, p, nbytes); + if (valid_irq_stack(sp, p, nbytes)) + return 1; + + return valid_emergency_stack(sp, p, nbytes); } -EXPORT_SYMBOL(validate_sp); +int validate_sp(unsigned long sp, struct task_struct *p) +{ + return validate_sp_size(sp, p, STACK_FRAME_MIN_SIZE); +} -static unsigned long __get_wchan(struct task_struct *p) +static unsigned long ___get_wchan(struct task_struct *p) { unsigned long ip, sp; int count = 0; - if (!p || p == current || p->state == TASK_RUNNING) - return 0; - sp = p->thread.ksp; - if (!validate_sp(sp, p, STACK_FRAME_OVERHEAD)) + if (!validate_sp(sp, p)) return 0; do { - sp = *(unsigned long *)sp; - if (!validate_sp(sp, p, STACK_FRAME_OVERHEAD) || - p->state == TASK_RUNNING) + sp = READ_ONCE_NOCHECK(*(unsigned long *)sp); + if (!validate_sp(sp, p) || task_is_running(p)) return 0; if (count > 0) { - ip = ((unsigned long *)sp)[STACK_FRAME_LR_SAVE]; + ip = READ_ONCE_NOCHECK(((unsigned long *)sp)[STACK_FRAME_LR_SAVE]); if (!in_sched_functions(ip)) return ip; } @@ -2015,31 +2244,47 @@ static unsigned long __get_wchan(struct task_struct *p) return 0; } -unsigned long get_wchan(struct task_struct *p) +unsigned long __get_wchan(struct task_struct *p) { unsigned long ret; if (!try_get_task_stack(p)) return 0; - ret = __get_wchan(p); + ret = ___get_wchan(p); put_task_stack(p); return ret; } +static bool empty_user_regs(struct pt_regs *regs, struct task_struct *tsk) +{ + unsigned long stack_page; + + // A non-empty pt_regs should never have a zero MSR or TRAP value. + if (regs->msr || regs->trap) + return false; + + // Check it sits at the very base of the stack + stack_page = (unsigned long)task_stack_page(tsk); + if ((unsigned long)(regs + 1) != stack_page + THREAD_SIZE) + return false; + + return true; +} + static int kstack_depth_to_print = CONFIG_PRINT_STACK_DEPTH; -void show_stack(struct task_struct *tsk, unsigned long *stack) +void __no_sanitize_address show_stack(struct task_struct *tsk, + unsigned long *stack, + const char *loglvl) { unsigned long sp, ip, lr, newsp; int count = 0; int firstframe = 1; -#ifdef CONFIG_FUNCTION_GRAPH_TRACER unsigned long ret_addr; int ftrace_idx = 0; -#endif if (tsk == NULL) tsk = current; @@ -2050,28 +2295,27 @@ void show_stack(struct task_struct *tsk, unsigned long *stack) sp = (unsigned long) stack; if (sp == 0) { if (tsk == current) - sp = current_stack_pointer(); + sp = current_stack_frame(); else sp = tsk->thread.ksp; } lr = 0; - printk("Call Trace:\n"); + printk("%sCall Trace:\n", loglvl); do { - if (!validate_sp(sp, tsk, STACK_FRAME_OVERHEAD)) + if (!validate_sp(sp, tsk)) break; stack = (unsigned long *) sp; newsp = stack[0]; ip = stack[STACK_FRAME_LR_SAVE]; if (!firstframe || ip != lr) { - printk("["REG"] ["REG"] %pS", sp, ip, (void *)ip); -#ifdef CONFIG_FUNCTION_GRAPH_TRACER + printk("%s["REG"] ["REG"] %pS", + loglvl, sp, ip, (void *)ip); ret_addr = ftrace_graph_ret_addr(current, &ftrace_idx, ip, stack); if (ret_addr != ip) pr_cont(" (%pS)", (void *)ret_addr); -#endif if (firstframe) pr_cont(" (unreliable)"); pr_cont("\n"); @@ -2080,15 +2324,28 @@ void show_stack(struct task_struct *tsk, unsigned long *stack) /* * See if this is an exception frame. - * We look for the "regshere" marker in the current frame. + * We look for the "regs" marker in the current frame. + * + * STACK_SWITCH_FRAME_SIZE being the smallest frame that + * could hold a pt_regs, if that does not fit then it can't + * have regs. */ - if (validate_sp(sp, tsk, STACK_INT_FRAME_SIZE) - && stack[STACK_FRAME_MARKER] == STACK_FRAME_REGS_MARKER) { + if (validate_sp_size(sp, tsk, STACK_SWITCH_FRAME_SIZE) + && stack[STACK_INT_FRAME_MARKER_LONGS] == STACK_FRAME_REGS_MARKER) { struct pt_regs *regs = (struct pt_regs *) - (sp + STACK_FRAME_OVERHEAD); + (sp + STACK_INT_FRAME_REGS); + lr = regs->link; - printk("--- interrupt: %lx at %pS\n LR = %pS\n", - regs->trap, (void *)regs->nip, (void *)lr); + printk("%s--- interrupt: %lx at %pS\n", + loglvl, regs->trap, (void *)regs->nip); + + // Detect the case of an empty pt_regs at the very base + // of the stack and suppress showing it in full. + if (!empty_user_regs(regs, tsk)) { + __show_regs(regs); + printk("%s--- interrupt: %lx\n", loglvl, regs->trap); + } + firstframe = 1; } @@ -2148,46 +2405,6 @@ void notrace __ppc64_runlatch_off(void) unsigned long arch_align_stack(unsigned long sp) { if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) - sp -= get_random_int() & ~PAGE_MASK; + sp -= get_random_u32_below(PAGE_SIZE); return sp & ~0xf; } - -static inline unsigned long brk_rnd(void) -{ - unsigned long rnd = 0; - - /* 8MB for 32bit, 1GB for 64bit */ - if (is_32bit_task()) - rnd = (get_random_long() % (1UL<<(23-PAGE_SHIFT))); - else - rnd = (get_random_long() % (1UL<<(30-PAGE_SHIFT))); - - return rnd << PAGE_SHIFT; -} - -unsigned long arch_randomize_brk(struct mm_struct *mm) -{ - unsigned long base = mm->brk; - unsigned long ret; - -#ifdef CONFIG_PPC_BOOK3S_64 - /* - * If we are using 1TB segments and we are allowed to randomise - * the heap, we can put it above 1TB so it is backed by a 1TB - * segment. Otherwise the heap will be in the bottom 1TB - * which always uses 256MB segments and this may result in a - * performance penalty. We don't need to worry about radix. For - * radix, mmu_highuser_ssize remains unchanged from 256MB. - */ - if (!is_32bit_task() && (mmu_highuser_ssize == MMU_SEGSIZE_1T)) - base = max_t(unsigned long, mm->brk, 1UL << SID_SHIFT_1T); -#endif - - ret = PAGE_ALIGN(base + brk_rnd()); - - if (ret < mm->brk) - return mm->brk; - - return ret; -} - diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 6620f37abe73..0b5878c3125b 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -11,7 +11,6 @@ #undef DEBUG -#include <stdarg.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/init.h> @@ -30,8 +29,9 @@ #include <linux/of_fdt.h> #include <linux/libfdt.h> #include <linux/cpu.h> +#include <linux/pgtable.h> +#include <linux/seq_buf.h> -#include <asm/prom.h> #include <asm/rtas.h> #include <asm/page.h> #include <asm/processor.h> @@ -41,12 +41,11 @@ #include <asm/smp.h> #include <asm/mmu.h> #include <asm/paca.h> -#include <asm/pgtable.h> #include <asm/powernv.h> #include <asm/iommu.h> #include <asm/btext.h> #include <asm/sections.h> -#include <asm/machdep.h> +#include <asm/setup.h> #include <asm/pci-bridge.h> #include <asm/kexec.h> #include <asm/opal.h> @@ -56,6 +55,8 @@ #include <asm/dt_cpu_ftrs.h> #include <asm/drmem.h> #include <asm/ultravisor.h> +#include <asm/prom.h> +#include <asm/plpks.h> #include <mm/mmu_decl.h> @@ -65,11 +66,14 @@ #define DBG(fmt...) #endif +int *chip_id_lookup_table; + #ifdef CONFIG_PPC64 int __initdata iommu_is_off; int __initdata iommu_force_on; unsigned long tce_alloc_start, tce_alloc_end; u64 ppc64_rma_size; +unsigned int boot_cpu_node_count __ro_after_init; #endif static phys_addr_t first_memblock_size; static int __initdata boot_cpu_count; @@ -96,8 +100,8 @@ static inline int overlaps_initrd(unsigned long start, unsigned long size) if (!initrd_start) return 0; - return (start + size) > _ALIGN_DOWN(initrd_start, PAGE_SIZE) && - start <= _ALIGN_UP(initrd_end, PAGE_SIZE); + return (start + size) > ALIGN_DOWN(initrd_start, PAGE_SIZE) && + start <= ALIGN(initrd_end, PAGE_SIZE); #else return 0; #endif @@ -136,7 +140,7 @@ static void __init move_device_tree(void) } /* - * ibm,pa-features is a per-cpu property that contains a string of + * ibm,pa/pi-features is a per-cpu property that contains a string of * attribute descriptors, each of which has a 2 byte header plus up * to 254 bytes worth of processor attribute bits. First header * byte specifies the number of bytes following the header. @@ -148,24 +152,25 @@ static void __init move_device_tree(void) * is supported/not supported. Note that the bit numbers are * big-endian to match the definition in PAPR. */ -static struct ibm_pa_feature { +struct ibm_feature { unsigned long cpu_features; /* CPU_FTR_xxx bit */ unsigned long mmu_features; /* MMU_FTR_xxx bit */ unsigned int cpu_user_ftrs; /* PPC_FEATURE_xxx bit */ unsigned int cpu_user_ftrs2; /* PPC_FEATURE2_xxx bit */ - unsigned char pabyte; /* byte number in ibm,pa-features */ + unsigned char pabyte; /* byte number in ibm,pa/pi-features */ unsigned char pabit; /* bit number (big-endian) */ unsigned char invert; /* if 1, pa bit set => clear feature */ -} ibm_pa_features[] __initdata = { +}; + +static struct ibm_feature ibm_pa_features[] __initdata = { { .pabyte = 0, .pabit = 0, .cpu_user_ftrs = PPC_FEATURE_HAS_MMU }, { .pabyte = 0, .pabit = 1, .cpu_user_ftrs = PPC_FEATURE_HAS_FPU }, { .pabyte = 0, .pabit = 3, .cpu_features = CPU_FTR_CTRL }, { .pabyte = 0, .pabit = 6, .cpu_features = CPU_FTR_NOEXECUTE }, { .pabyte = 1, .pabit = 2, .mmu_features = MMU_FTR_CI_LARGE_PAGE }, #ifdef CONFIG_PPC_RADIX_MMU - { .pabyte = 40, .pabit = 0, .mmu_features = MMU_FTR_TYPE_RADIX }, + { .pabyte = 40, .pabit = 0, .mmu_features = MMU_FTR_TYPE_RADIX | MMU_FTR_GTSE }, #endif - { .pabyte = 1, .pabit = 1, .invert = 1, .cpu_features = CPU_FTR_NODSISRALIGN }, { .pabyte = 5, .pabit = 0, .cpu_features = CPU_FTR_REAL_LE, .cpu_user_ftrs = PPC_FEATURE_TRUE_LE }, /* @@ -175,11 +180,24 @@ static struct ibm_pa_feature { */ { .pabyte = 22, .pabit = 0, .cpu_features = CPU_FTR_TM_COMP, .cpu_user_ftrs2 = PPC_FEATURE2_HTM_COMP | PPC_FEATURE2_HTM_NOSC_COMP }, + + { .pabyte = 64, .pabit = 0, .cpu_features = CPU_FTR_DAWR1 }, + { .pabyte = 68, .pabit = 5, .cpu_features = CPU_FTR_DEXCR_NPHIE }, +}; + +/* + * ibm,pi-features property provides the support of processor specific + * options not described in ibm,pa-features. Right now use byte 0, bit 3 + * which indicates the occurrence of DSI interrupt when the paste operation + * on the suspended NX window. + */ +static struct ibm_feature ibm_pi_features[] __initdata = { + { .pabyte = 0, .pabit = 3, .mmu_features = MMU_FTR_NX_DSI }, }; static void __init scan_features(unsigned long node, const unsigned char *ftrs, unsigned long tablelen, - struct ibm_pa_feature *fp, + struct ibm_feature *fp, unsigned long ft_size) { unsigned long i, len, bit; @@ -216,20 +234,21 @@ static void __init scan_features(unsigned long node, const unsigned char *ftrs, } } -static void __init check_cpu_pa_features(unsigned long node) +static void __init check_cpu_features(unsigned long node, char *name, + struct ibm_feature *fp, + unsigned long size) { const unsigned char *pa_ftrs; int tablelen; - pa_ftrs = of_get_flat_dt_prop(node, "ibm,pa-features", &tablelen); + pa_ftrs = of_get_flat_dt_prop(node, name, &tablelen); if (pa_ftrs == NULL) return; - scan_features(node, pa_ftrs, tablelen, - ibm_pa_features, ARRAY_SIZE(ibm_pa_features)); + scan_features(node, pa_ftrs, tablelen, fp, size); } -#ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC_64S_HASH_MMU static void __init init_mmu_slb_size(unsigned long node) { const __be32 *slb_size_ptr; @@ -266,7 +285,7 @@ static struct feature_property { }; #if defined(CONFIG_44x) && defined(CONFIG_PPC_FPU) -static inline void identical_pvr_fixup(unsigned long node) +static __init void identical_pvr_fixup(unsigned long node) { unsigned int pvr; const char *model = of_get_flat_dt_prop(node, "model", NULL); @@ -319,6 +338,9 @@ static int __init early_init_dt_scan_cpus(unsigned long node, if (type == NULL || strcmp(type, "cpu") != 0) return 0; + if (IS_ENABLED(CONFIG_PPC64)) + boot_cpu_node_count++; + /* Get physical cpuid */ intserv = of_get_flat_dt_prop(node, "ibm,ppc-interrupt-server#s", &len); if (!intserv) @@ -350,6 +372,9 @@ static int __init early_init_dt_scan_cpus(unsigned long node, be32_to_cpu(intserv[found_thread])); boot_cpuid = found; + if (IS_ENABLED(CONFIG_PPC64)) + boot_cpu_hwid = be32_to_cpu(intserv[found_thread]); + /* * PAPR defines "logical" PVR values for cpus that * meet various levels of the architecture: @@ -371,11 +396,16 @@ static int __init early_init_dt_scan_cpus(unsigned long node, */ if (!dt_cpu_ftrs_in_use()) { prop = of_get_flat_dt_prop(node, "cpu-version", NULL); - if (prop && (be32_to_cpup(prop) & 0xff000000) == 0x0f000000) + if (prop && (be32_to_cpup(prop) & 0xff000000) == 0x0f000000) { identify_cpu(0, be32_to_cpup(prop)); + seq_buf_printf(&ppc_hw_desc, "0x%04x ", be32_to_cpup(prop)); + } check_cpu_feature_properties(node); - check_cpu_pa_features(node); + check_cpu_features(node, "ibm,pa-features", ibm_pa_features, + ARRAY_SIZE(ibm_pa_features)); + check_cpu_features(node, "ibm,pi-features", ibm_pi_features, + ARRAY_SIZE(ibm_pi_features)); } identical_pvr_fixup(node); @@ -386,9 +416,7 @@ static int __init early_init_dt_scan_cpus(unsigned long node, cur_cpu_spec->cpu_features &= ~CPU_FTR_SMT; else if (!dt_cpu_ftrs_in_use()) cur_cpu_spec->cpu_features |= CPU_FTR_SMT; - allocate_paca(boot_cpuid); #endif - set_hard_smp_processor_id(found, be32_to_cpu(intserv[found_thread])); return 0; } @@ -400,7 +428,7 @@ static int __init early_init_dt_scan_chosen_ppc(unsigned long node, const unsigned long *lprop; /* All these set by kernel, so no need to convert endian */ /* Use common scan routine to determine if this is the chosen node */ - if (early_init_dt_scan_chosen(node, uname, depth, data) == 0) + if (early_init_dt_scan_chosen(data) < 0) return 0; #ifdef CONFIG_PPC64 @@ -445,7 +473,7 @@ static int __init early_init_dt_scan_chosen_ppc(unsigned long node, */ #ifdef CONFIG_SPARSEMEM -static bool validate_mem_limit(u64 base, u64 *size) +static bool __init validate_mem_limit(u64 base, u64 *size) { u64 max_mem = 1UL << (MAX_PHYSMEM_BITS); @@ -456,7 +484,7 @@ static bool validate_mem_limit(u64 base, u64 *size) return true; } #else -static bool validate_mem_limit(u64 base, u64 *size) +static bool __init validate_mem_limit(u64 base, u64 *size) { return true; } @@ -468,8 +496,9 @@ static bool validate_mem_limit(u64 base, u64 *size) * This contains a list of memory blocks along with NUMA affinity * information. */ -static void __init early_init_drmem_lmb(struct drmem_lmb *lmb, - const __be32 **usm) +static int __init early_init_drmem_lmb(struct drmem_lmb *lmb, + const __be32 **usm, + void *data) { u64 base, size; int is_kexec_kdump = 0, rngs; @@ -484,7 +513,7 @@ static void __init early_init_drmem_lmb(struct drmem_lmb *lmb, */ if ((lmb->flags & DRCONF_MEM_RESERVED) || !(lmb->flags & DRCONF_MEM_ASSIGNED)) - return; + return 0; if (*usm) is_kexec_kdump = 1; @@ -499,7 +528,7 @@ static void __init early_init_drmem_lmb(struct drmem_lmb *lmb, */ rngs = dt_mem_next_cell(dt_root_size_cells, usm); if (!rngs) /* there are no (base, size) duple */ - return; + return 0; } do { @@ -515,26 +544,32 @@ static void __init early_init_drmem_lmb(struct drmem_lmb *lmb, size = 0x80000000ul - base; } + if (!validate_mem_limit(base, &size)) + continue; + DBG("Adding: %llx -> %llx\n", base, size); - if (validate_mem_limit(base, &size)) - memblock_add(base, size); + memblock_add(base, size); + + if (lmb->flags & DRCONF_MEM_HOTREMOVABLE) + memblock_mark_hotplug(base, size); } while (--rngs); + + return 0; } #endif /* CONFIG_PPC_PSERIES */ -static int __init early_init_dt_scan_memory_ppc(unsigned long node, - const char *uname, - int depth, void *data) +static int __init early_init_dt_scan_memory_ppc(void) { #ifdef CONFIG_PPC_PSERIES - if (depth == 1 && - strcmp(uname, "ibm,dynamic-reconfiguration-memory") == 0) { - walk_drmem_lmbs_early(node, early_init_drmem_lmb); - return 0; - } + const void *fdt = initial_boot_params; + int node = fdt_path_offset(fdt, "/ibm,dynamic-reconfiguration-memory"); + + if (node > 0) + walk_drmem_lmbs_early(node, NULL, early_init_drmem_lmb); + #endif - - return early_init_dt_scan_memory(node, uname, depth, data); + + return early_init_dt_scan_memory(); } /* @@ -623,13 +658,15 @@ static void __init early_reserve_mem(void) #ifdef CONFIG_BLK_DEV_INITRD /* Then reserve the initrd, if any */ if (initrd_start && (initrd_end > initrd_start)) { - memblock_reserve(_ALIGN_DOWN(__pa(initrd_start), PAGE_SIZE), - _ALIGN_UP(initrd_end, PAGE_SIZE) - - _ALIGN_DOWN(initrd_start, PAGE_SIZE)); + memblock_reserve(ALIGN_DOWN(__pa(initrd_start), PAGE_SIZE), + ALIGN(initrd_end, PAGE_SIZE) - + ALIGN_DOWN(initrd_start, PAGE_SIZE)); } #endif /* CONFIG_BLK_DEV_INITRD */ -#ifdef CONFIG_PPC32 + if (!IS_ENABLED(CONFIG_PPC32)) + return; + /* * Handle the case where we might be booting from an old kexec * image that setup the mem_rsvmap as pairs of 32-bit values @@ -650,7 +687,6 @@ static void __init early_reserve_mem(void) } return; } -#endif } #ifdef CONFIG_PPC_TRANSACTIONAL_MEM @@ -685,6 +721,40 @@ static void __init tm_init(void) static void tm_init(void) { } #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ +static int __init +early_init_dt_scan_model(unsigned long node, const char *uname, + int depth, void *data) +{ + const char *prop; + + if (depth != 0) + return 0; + + prop = of_get_flat_dt_prop(node, "model", NULL); + if (prop) + seq_buf_printf(&ppc_hw_desc, "%s ", prop); + + /* break now */ + return 1; +} + +#ifdef CONFIG_PPC64 +static void __init save_fscr_to_task(void) +{ + /* + * Ensure the init_task (pid 0, aka swapper) uses the value of FSCR we + * have configured via the device tree features or via __init_FSCR(). + * That value will then be propagated to pid 1 (init) and all future + * processes. + */ + if (early_cpu_has_feature(CPU_FTR_ARCH_207S)) + init_task.thread.fscr = mfspr(SPRN_FSCR); +} +#else +static inline void save_fscr_to_task(void) {} +#endif + + void __init early_init_devtree(void *params) { phys_addr_t limit; @@ -695,6 +765,8 @@ void __init early_init_devtree(void *params) if (!early_init_dt_verify(params)) panic("BUG: Failed verifying flat device tree, bad version?"); + of_scan_flat_dt(early_init_dt_scan_model, NULL); + #ifdef CONFIG_PPC_RTAS /* Some machines might need RTAS info for debugging, grab it now. */ of_scan_flat_dt(early_init_dt_scan_rtas, NULL); @@ -720,9 +792,16 @@ void __init early_init_devtree(void *params) of_scan_flat_dt(early_init_dt_scan_chosen_ppc, boot_command_line); /* Scan memory nodes and rebuild MEMBLOCKs */ - of_scan_flat_dt(early_init_dt_scan_root, NULL); - of_scan_flat_dt(early_init_dt_scan_memory_ppc, NULL); + early_init_dt_scan_root(); + early_init_dt_scan_memory_ppc(); + /* + * As generic code authors expect to be able to use static keys + * in early_param() handlers, we initialize the static keys just + * before parsing early params (it's fine to call jump_label_init() + * more than once). + */ + jump_label_init(); parse_early_param(); /* make sure we've parsed cmdline for mem= before this */ @@ -730,7 +809,7 @@ void __init early_init_devtree(void *params) first_memblock_size = min_t(u64, first_memblock_size, memory_limit); setup_initial_memory_limit(memstart_addr, first_memblock_size); /* Reserve MEMBLOCK regions used by kernel, initrd, dt, etc... */ - memblock_reserve(PHYSICAL_START, __pa(klimit) - PHYSICAL_START); + memblock_reserve(PHYSICAL_START, __pa(_end) - PHYSICAL_START); /* If relocatable, reserve first 32k for interrupt vectors etc. */ if (PHYSICAL_START > MEMORY_START) memblock_reserve(MEMORY_START, 0x8000); @@ -749,6 +828,11 @@ void __init early_init_devtree(void *params) limit = ALIGN(memory_limit ?: memblock_phys_mem_size(), PAGE_SIZE); memblock_enforce_memory_limit(limit); +#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_PPC_4K_PAGES) + if (!early_radix_enabled()) + memblock_cap_memory_range(0, 1UL << (H_MAX_PHYSMEM_BITS)); +#endif + memblock_allow_resize(); memblock_dump_all(); @@ -758,12 +842,13 @@ void __init early_init_devtree(void *params) * FIXME .. and the initrd too? */ move_device_tree(); - allocate_paca_ptrs(); - DBG("Scanning CPUs ...\n"); dt_cpu_ftrs_scan(); + // We can now add the CPU name & PVR to the hardware description + seq_buf_printf(&ppc_hw_desc, "%s 0x%04lx ", cur_cpu_spec->cpu_name, mfspr(SPRN_PVR)); + /* Retrieve CPU related informations from the flat tree * (altivec support, boot CPU ID, ...) */ @@ -773,6 +858,8 @@ void __init early_init_devtree(void *params) BUG(); } + save_fscr_to_task(); + #if defined(CONFIG_SMP) && defined(CONFIG_PPC64) /* We'll later wait for secondaries to check in; there are * NCPUS-1 non-boot CPUs :-) @@ -791,12 +878,20 @@ void __init early_init_devtree(void *params) /* Now try to figure out if we are running on LPAR and so on */ pseries_probe_fw_features(); + /* + * Initialize pkey features and default AMR/IAMR values + */ + pkey_early_init_devtree(); + #ifdef CONFIG_PPC_PS3 /* Identify PS3 firmware */ if (of_flat_dt_is_compatible(of_get_flat_dt_root(), "sony,ps3")) powerpc_firmware_features |= FW_FEATURE_PS3_POSSIBLE; #endif + /* If kexec left a PLPKS password in the DT, get it and clear it */ + plpks_early_init_devtree(); + tm_init(); DBG(" <- early_init_devtree()\n"); @@ -817,8 +912,8 @@ void __init early_get_first_memblock_info(void *params, phys_addr_t *size) * mess the memblock. */ add_mem_to_memblock = 0; - of_scan_flat_dt(early_init_dt_scan_root, NULL); - of_scan_flat_dt(early_init_dt_scan_memory_ppc, NULL); + early_init_dt_scan_root(); + early_init_dt_scan_memory_ppc(); add_mem_to_memblock = 1; if (size) @@ -876,13 +971,22 @@ EXPORT_SYMBOL(of_get_ibm_chip_id); int cpu_to_chip_id(int cpu) { struct device_node *np; + int ret = -1, idx; + + idx = cpu / threads_per_core; + if (chip_id_lookup_table && chip_id_lookup_table[idx] != -1) + return chip_id_lookup_table[idx]; np = of_get_cpu_node(cpu, NULL); - if (!np) - return -1; + if (np) { + ret = of_get_ibm_chip_id(np); + of_node_put(np); + + if (chip_id_lookup_table) + chip_id_lookup_table[idx] = ret; + } - of_node_put(np); - return of_get_ibm_chip_id(np); + return ret; } EXPORT_SYMBOL(cpu_to_chip_id); diff --git a/arch/powerpc/kernel/prom_entry_64.S b/arch/powerpc/kernel/prom_entry_64.S new file mode 100644 index 000000000000..f1b8793d28c6 --- /dev/null +++ b/arch/powerpc/kernel/prom_entry_64.S @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * PowerPC version + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * Rewritten by Cort Dougan (cort@cs.nmt.edu) for PReP + * Copyright (C) 1996 Cort Dougan <cort@cs.nmt.edu> + * Adapted for Power Macintosh by Paul Mackerras. + * Low-level exception handlers and MMU support + * rewritten by Paul Mackerras. + * Copyright (C) 1996 Paul Mackerras. + * MPC8xx modifications Copyright (C) 1997 Dan Malek (dmalek@jlc.net). + * + * This file contains the 64-bit prom entry code. + */ +#include <asm/asm-offsets.h> +#ifdef CONFIG_PPC_BOOK3S +#include <asm/exception-64s.h> +#else +#include <asm/exception-64e.h> +#endif +#include <asm/ppc_asm.h> + +.section ".text","ax",@progbits + +_GLOBAL(enter_prom) + mflr r0 + std r0,16(r1) + stdu r1,-SWITCH_FRAME_SIZE(r1) /* Save SP and create stack space */ + + /* Because PROM is running in 32b mode, it clobbers the high order half + * of all registers that it saves. We therefore save those registers + * PROM might touch to the stack. (r0, r3-r13 are caller saved) + */ + SAVE_GPR(2, r1) + SAVE_GPR(13, r1) + SAVE_NVGPRS(r1) + mfcr r10 + mfmsr r11 + std r10,_CCR(r1) + std r11,_MSR(r1) + + /* Put PROM address in SRR0 */ + mtsrr0 r4 + + /* Setup our trampoline return addr in LR */ + bcl 20,31,$+4 +0: mflr r4 + addi r4,r4,(1f - 0b) + mtlr r4 + + /* Prepare a 32-bit mode big endian MSR + */ +#ifdef CONFIG_PPC_BOOK3E_64 + rlwinm r11,r11,0,1,31 + mtsrr1 r11 + rfi +#else /* CONFIG_PPC_BOOK3E_64 */ + LOAD_REG_IMMEDIATE(r12, MSR_SF | MSR_LE) + andc r11,r11,r12 + mtsrr1 r11 + RFI_TO_KERNEL +#endif /* CONFIG_PPC_BOOK3E_64 */ + +1: /* Return from OF */ + FIXUP_ENDIAN + + /* Just make sure that r1 top 32 bits didn't get + * corrupt by OF + */ + rldicl r1,r1,0,32 + + /* Restore the MSR (back to 64 bits) */ + ld r0,_MSR(r1) + MTMSRD(r0) + isync + + /* Restore other registers */ + REST_GPR(2, r1) + REST_GPR(13, r1) + REST_NVGPRS(r1) + ld r4,_CCR(r1) + mtcr r4 + + addi r1,r1,SWITCH_FRAME_SIZE + ld r0,16(r1) + mtlr r0 + blr diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index 577345382b23..e67effdba85c 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -14,7 +14,7 @@ /* we cannot use FORTIFY as it brings in new symbols */ #define __NO_FORTIFY -#include <stdarg.h> +#include <linux/stdarg.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/init.h> @@ -26,26 +26,30 @@ #include <linux/delay.h> #include <linux/initrd.h> #include <linux/bitops.h> +#include <linux/pgtable.h> +#include <linux/printk.h> +#include <linux/of.h> +#include <linux/of_fdt.h> #include <asm/prom.h> #include <asm/rtas.h> #include <asm/page.h> #include <asm/processor.h> +#include <asm/interrupt.h> #include <asm/irq.h> #include <asm/io.h> #include <asm/smp.h> #include <asm/mmu.h> -#include <asm/pgtable.h> #include <asm/iommu.h> #include <asm/btext.h> #include <asm/sections.h> -#include <asm/machdep.h> +#include <asm/setup.h> #include <asm/asm-prototypes.h> #include <asm/ultravisor-api.h> #include <linux/linux_logo.h> /* All of prom_init bss lives here */ -#define __prombss __section(.bss.prominit) +#define __prombss __section(".bss.prominit") /* * Eventually bump that one up @@ -92,12 +96,6 @@ static int of_workarounds __prombss; #define OF_WA_CLAIM 1 /* do phys/virt claim separately, then map */ #define OF_WA_LONGTRAIL 2 /* work around longtrail bugs */ -#define PROM_BUG() do { \ - prom_printf("kernel BUG at %s line 0x%x!\n", \ - __FILE__, __LINE__); \ - __builtin_trap(); \ -} while (0) - #ifdef DEBUG_PROM #define prom_debug(x...) prom_printf(x) #else @@ -169,6 +167,7 @@ static unsigned long __prombss prom_tce_alloc_end; #ifdef CONFIG_PPC_PSERIES static bool __prombss prom_radix_disable; +static bool __prombss prom_radix_gtse_disable; static bool __prombss prom_xive_disable; #endif @@ -241,13 +240,31 @@ static int __init prom_strcmp(const char *cs, const char *ct) return 0; } -static char __init *prom_strcpy(char *dest, const char *src) +static ssize_t __init prom_strscpy_pad(char *dest, const char *src, size_t n) { - char *tmp = dest; + ssize_t rc; + size_t i; - while ((*dest++ = *src++) != '\0') - /* nothing */; - return tmp; + if (n == 0 || n > INT_MAX) + return -E2BIG; + + // Copy up to n bytes + for (i = 0; i < n && src[i] != '\0'; i++) + dest[i] = src[i]; + + rc = i; + + // If we copied all n then we have run out of space for the nul + if (rc == n) { + // Rewind by one character to ensure nul termination + i--; + rc = -E2BIG; + } + + for (; i < n; i++) + dest[i] = '\0'; + + return rc; } static int __init prom_strncmp(const char *cs, const char *ct, size_t count) @@ -354,6 +371,7 @@ static int __init prom_strtobool(const char *s, bool *res) default: break; } + break; default: break; } @@ -650,7 +668,7 @@ static inline int __init prom_getproplen(phandle node, const char *pname) return call_prom("getproplen", 2, 1, node, ADDR(pname)); } -static void add_string(char **str, const char *q) +static void __init add_string(char **str, const char *q) { char *p = *str; @@ -660,7 +678,7 @@ static void add_string(char **str, const char *q) *str = p; } -static char *tohex(unsigned int x) +static char *__init tohex(unsigned int x) { static const char digits[] __initconst = "0123456789abcdef"; static char result[9] __prombss; @@ -699,29 +717,28 @@ static int __init prom_setprop(phandle node, const char *nodename, } /* We can't use the standard versions because of relocation headaches. */ -#define isxdigit(c) (('0' <= (c) && (c) <= '9') \ - || ('a' <= (c) && (c) <= 'f') \ - || ('A' <= (c) && (c) <= 'F')) +#define prom_isxdigit(c) \ + (('0' <= (c) && (c) <= '9') || ('a' <= (c) && (c) <= 'f') || ('A' <= (c) && (c) <= 'F')) -#define isdigit(c) ('0' <= (c) && (c) <= '9') -#define islower(c) ('a' <= (c) && (c) <= 'z') -#define toupper(c) (islower(c) ? ((c) - 'a' + 'A') : (c)) +#define prom_isdigit(c) ('0' <= (c) && (c) <= '9') +#define prom_islower(c) ('a' <= (c) && (c) <= 'z') +#define prom_toupper(c) (prom_islower(c) ? ((c) - 'a' + 'A') : (c)) -static unsigned long prom_strtoul(const char *cp, const char **endp) +static unsigned long __init prom_strtoul(const char *cp, const char **endp) { unsigned long result = 0, base = 10, value; if (*cp == '0') { base = 8; cp++; - if (toupper(*cp) == 'X') { + if (prom_toupper(*cp) == 'X') { cp++; base = 16; } } - while (isxdigit(*cp) && - (value = isdigit(*cp) ? *cp - '0' : toupper(*cp) - 'A' + 10) < base) { + while (prom_isxdigit(*cp) && + (value = prom_isdigit(*cp) ? *cp - '0' : prom_toupper(*cp) - 'A' + 10) < base) { result = result * base + value; cp++; } @@ -732,7 +749,7 @@ static unsigned long prom_strtoul(const char *cp, const char **endp) return result; } -static unsigned long prom_memparse(const char *ptr, const char **retptr) +static unsigned long __init prom_memparse(const char *ptr, const char **retptr) { unsigned long ret = prom_strtoul(ptr, retptr); int shift = 0; @@ -823,6 +840,12 @@ static void __init early_cmdline_parse(void) if (prom_radix_disable) prom_debug("Radix disabled from cmdline\n"); + opt = prom_strstr(prom_cmd_line, "radix_hcall_invalidate=on"); + if (opt) { + prom_radix_gtse_disable = true; + prom_debug("Radix GTSE disabled from cmdline\n"); + } + opt = prom_strstr(prom_cmd_line, "xive=off"); if (opt) { prom_xive_disable = true; @@ -919,8 +942,12 @@ struct option_vector6 { u8 os_name; } __packed; +struct option_vector7 { + u8 os_id[256]; +} __packed; + struct ibm_arch_vec { - struct { u32 mask, val; } pvrs[12]; + struct { __be32 mask, val; } pvrs[14]; u8 num_vectors; @@ -941,6 +968,9 @@ struct ibm_arch_vec { u8 vec6_len; struct option_vector6 vec6; + + u8 vec7_len; + struct option_vector7 vec7; } __packed; static const struct ibm_arch_vec ibm_architecture_vec_template __initconst = { @@ -974,6 +1004,14 @@ static const struct ibm_arch_vec ibm_architecture_vec_template __initconst = { .val = cpu_to_be32(0x004e0000), }, { + .mask = cpu_to_be32(0xffff0000), /* POWER10 */ + .val = cpu_to_be32(0x00800000), + }, + { + .mask = cpu_to_be32(0xffffffff), /* all 3.1-compliant */ + .val = cpu_to_be32(0x0f000006), + }, + { .mask = cpu_to_be32(0xffffffff), /* all 3.00-compliant */ .val = cpu_to_be32(0x0f000005), }, @@ -1002,7 +1040,7 @@ static const struct ibm_arch_vec ibm_architecture_vec_template __initconst = { .byte1 = 0, .arch_versions = OV1_PPC_2_00 | OV1_PPC_2_01 | OV1_PPC_2_02 | OV1_PPC_2_03 | OV1_PPC_2_04 | OV1_PPC_2_05 | OV1_PPC_2_06 | OV1_PPC_2_07, - .arch_versions3 = OV1_PPC_3_00, + .arch_versions3 = OV1_PPC_3_00 | OV1_PPC_3_1, }, .vec2_len = VECTOR_LENGTH(sizeof(struct option_vector2)), @@ -1054,7 +1092,8 @@ static const struct ibm_arch_vec ibm_architecture_vec_template __initconst = { #else 0, #endif - .associativity = OV5_FEAT(OV5_TYPE1_AFFINITY) | OV5_FEAT(OV5_PRRN), + .associativity = OV5_FEAT(OV5_FORM1_AFFINITY) | OV5_FEAT(OV5_PRRN) | + OV5_FEAT(OV5_FORM2_AFFINITY), .bin_opts = OV5_FEAT(OV5_RESIZE_HPT) | OV5_FEAT(OV5_HP_EVT), .micro_checkpoint = 0, .reserved0 = 0, @@ -1079,6 +1118,9 @@ static const struct ibm_arch_vec ibm_architecture_vec_template __initconst = { .secondary_pteg = 0, .os_name = OV6_LINUX, }, + + /* option vector 7: OS Identification */ + .vec7_len = VECTOR_LENGTH(sizeof(struct option_vector7)), }; static struct ibm_arch_vec __prombss ibm_architecture_vec ____cacheline_aligned; @@ -1277,10 +1319,8 @@ static void __init prom_parse_platform_support(u8 index, u8 val, prom_parse_mmu_model(val & OV5_FEAT(OV5_MMU_SUPPORT), support); break; case OV5_INDX(OV5_RADIX_GTSE): /* Radix Extensions */ - if (val & OV5_FEAT(OV5_RADIX_GTSE)) { - prom_debug("Radix - GTSE supported\n"); - support->radix_gtse = true; - } + if (val & OV5_FEAT(OV5_RADIX_GTSE)) + support->radix_gtse = !prom_radix_gtse_disable; break; case OV5_INDX(OV5_XIVE_SUPPORT): /* Interrupt mode */ prom_parse_xive_model(val & OV5_FEAT(OV5_XIVE_SUPPORT), @@ -1309,6 +1349,8 @@ static void __init prom_check_platform_support(void) memcpy(&ibm_architecture_vec, &ibm_architecture_vec_template, sizeof(ibm_architecture_vec)); + prom_strscpy_pad(ibm_architecture_vec.vec7.os_id, linux_banner, 256); + if (prop_len > 1) { int i; u8 vec[8]; @@ -1317,23 +1359,22 @@ static void __init prom_check_platform_support(void) if (prop_len > sizeof(vec)) prom_printf("WARNING: ibm,arch-vec-5-platform-support longer than expected (len: %d)\n", prop_len); - prom_getprop(prom.chosen, "ibm,arch-vec-5-platform-support", - &vec, sizeof(vec)); - for (i = 0; i < sizeof(vec); i += 2) { - prom_debug("%d: index = 0x%x val = 0x%x\n", i / 2 - , vec[i] - , vec[i + 1]); - prom_parse_platform_support(vec[i], vec[i + 1], - &supported); + prom_getprop(prom.chosen, "ibm,arch-vec-5-platform-support", &vec, sizeof(vec)); + for (i = 0; i < prop_len; i += 2) { + prom_debug("%d: index = 0x%x val = 0x%x\n", i / 2, vec[i], vec[i + 1]); + prom_parse_platform_support(vec[i], vec[i + 1], &supported); } } - if (supported.radix_mmu && supported.radix_gtse && - IS_ENABLED(CONFIG_PPC_RADIX_MMU)) { - /* Radix preferred - but we require GTSE for now */ - prom_debug("Asking for radix with GTSE\n"); + if (supported.radix_mmu && IS_ENABLED(CONFIG_PPC_RADIX_MMU)) { + /* Radix preferred - Check if GTSE is also supported */ + prom_debug("Asking for radix\n"); ibm_architecture_vec.vec5.mmu = OV5_FEAT(OV5_MMU_RADIX); - ibm_architecture_vec.vec5.radix_ext = OV5_FEAT(OV5_RADIX_GTSE); + if (supported.radix_gtse) + ibm_architecture_vec.vec5.radix_ext = + OV5_FEAT(OV5_RADIX_GTSE); + else + prom_debug("Radix GTSE isn't supported\n"); } else if (supported.hash_mmu) { /* Default to hash mmu (if we can) */ prom_debug("Asking for hash\n"); @@ -1449,18 +1490,18 @@ static unsigned long __init alloc_up(unsigned long size, unsigned long align) unsigned long addr = 0; if (align) - base = _ALIGN_UP(base, align); + base = ALIGN(base, align); prom_debug("%s(%lx, %lx)\n", __func__, size, align); if (ram_top == 0) prom_panic("alloc_up() called with mem not initialized\n"); if (align) - base = _ALIGN_UP(alloc_bottom, align); + base = ALIGN(alloc_bottom, align); else base = alloc_bottom; for(; (base + size) <= alloc_top; - base = _ALIGN_UP(base + 0x100000, align)) { + base = ALIGN(base + 0x100000, align)) { prom_debug(" trying: 0x%lx\n\r", base); addr = (unsigned long)prom_claim(base, size, 0); if (addr != PROM_ERROR && addr != 0) @@ -1500,7 +1541,7 @@ static unsigned long __init alloc_down(unsigned long size, unsigned long align, if (highmem) { /* Carve out storage for the TCE table. */ - addr = _ALIGN_DOWN(alloc_top_high - size, align); + addr = ALIGN_DOWN(alloc_top_high - size, align); if (addr <= alloc_bottom) return 0; /* Will we bump into the RMO ? If yes, check out that we @@ -1518,9 +1559,9 @@ static unsigned long __init alloc_down(unsigned long size, unsigned long align, goto bail; } - base = _ALIGN_DOWN(alloc_top - size, align); + base = ALIGN_DOWN(alloc_top - size, align); for (; base > alloc_bottom; - base = _ALIGN_DOWN(base - 0x100000, align)) { + base = ALIGN_DOWN(base - 0x100000, align)) { prom_debug(" trying: 0x%lx\n\r", base); addr = (unsigned long)prom_claim(base, size, 0); if (addr != PROM_ERROR && addr != 0) @@ -1586,8 +1627,8 @@ static void __init reserve_mem(u64 base, u64 size) * have our terminator with "size" set to 0 since we are * dumb and just copy this entire array to the boot params */ - base = _ALIGN_DOWN(base, PAGE_SIZE); - top = _ALIGN_UP(top, PAGE_SIZE); + base = ALIGN_DOWN(base, PAGE_SIZE); + top = ALIGN(top, PAGE_SIZE); size = top - base; if (cnt >= (MEM_RESERVE_MAP_SIZE - 1)) @@ -1741,7 +1782,7 @@ static void __init prom_close_stdin(void) } #ifdef CONFIG_PPC_SVM -static int prom_rtas_hcall(uint64_t args) +static int __init prom_rtas_hcall(uint64_t args) { register uint64_t arg1 asm("r3") = H_RTAS; register uint64_t arg2 asm("r4") = args; @@ -1749,6 +1790,8 @@ static int prom_rtas_hcall(uint64_t args) asm volatile("sc 1\n" : "=r" (arg1) : "r" (arg1), "r" (arg2) :); + srr_regs_clobbered(); + return arg1; } @@ -1773,6 +1816,9 @@ static void __init prom_rtas_os_term(char *str) if (token == 0) prom_panic("Could not get token for ibm,os-term\n"); os_term_args.token = cpu_to_be32(token); + os_term_args.nargs = cpu_to_be32(1); + os_term_args.nret = cpu_to_be32(1); + os_term_args.args[0] = cpu_to_be32(__pa(str)); prom_rtas_hcall((uint64_t)&os_term_args); } #endif /* CONFIG_PPC_SVM */ @@ -2250,7 +2296,7 @@ static void __init prom_init_stdout(void) static int __init prom_find_machine_type(void) { - char compat[256]; + static char compat[256] __prombss; int len, i = 0; #ifdef CONFIG_PPC64 phandle rtas; @@ -2403,10 +2449,19 @@ static void __init prom_check_displays(void) u32 width, height, pitch, addr; prom_printf("Setting btext !\n"); - prom_getprop(node, "width", &width, 4); - prom_getprop(node, "height", &height, 4); - prom_getprop(node, "linebytes", &pitch, 4); - prom_getprop(node, "address", &addr, 4); + + if (prom_getprop(node, "width", &width, 4) == PROM_ERROR) + return; + + if (prom_getprop(node, "height", &height, 4) == PROM_ERROR) + return; + + if (prom_getprop(node, "linebytes", &pitch, 4) == PROM_ERROR) + return; + + if (prom_getprop(node, "address", &addr, 4) == PROM_ERROR) + return; + prom_printf("W=%d H=%d LB=%d addr=0x%x\n", width, height, pitch, addr); btext_setup_display(width, height, 8, pitch, addr); @@ -2423,7 +2478,7 @@ static void __init *make_room(unsigned long *mem_start, unsigned long *mem_end, { void *ret; - *mem_start = _ALIGN(*mem_start, align); + *mem_start = ALIGN(*mem_start, align); while ((*mem_start + needed) > *mem_end) { unsigned long room, chunk; @@ -2559,7 +2614,7 @@ static void __init scan_dt_build_struct(phandle node, unsigned long *mem_start, *lp++ = *p; } *lp = 0; - *mem_start = _ALIGN((unsigned long)lp + 1, 4); + *mem_start = ALIGN((unsigned long)lp + 1, 4); } /* get it again for debugging */ @@ -2605,7 +2660,7 @@ static void __init scan_dt_build_struct(phandle node, unsigned long *mem_start, /* push property content */ valp = make_room(mem_start, mem_end, l, 4); call_prom("getprop", 4, 1, node, pname, valp, l); - *mem_start = _ALIGN(*mem_start, 4); + *mem_start = ALIGN(*mem_start, 4); if (!prom_strcmp(pname, "phandle")) has_phandle = 1; @@ -2664,7 +2719,7 @@ static void __init flatten_device_tree(void) prom_panic ("couldn't get device tree root\n"); /* Build header and make room for mem rsv map */ - mem_start = _ALIGN(mem_start, 4); + mem_start = ALIGN(mem_start, 4); hdr = make_room(&mem_start, &mem_end, sizeof(struct boot_param_header), 4); dt_header_start = (unsigned long)hdr; @@ -2677,7 +2732,7 @@ static void __init flatten_device_tree(void) /* Add "phandle" in there, we'll need it */ namep = make_room(&mem_start, &mem_end, 16, 1); - prom_strcpy(namep, "phandle"); + prom_strscpy_pad(namep, "phandle", sizeof("phandle")); mem_start = (unsigned long)namep + prom_strlen(namep) + 1; /* Build string array */ @@ -2932,7 +2987,7 @@ static void __init fixup_device_tree_efika_add_phy(void) /* Check if the phy-handle property exists - bail if it does */ rv = prom_getprop(node, "phy-handle", prop, sizeof(prop)); - if (!rv) + if (rv <= 0) return; /* @@ -2958,7 +3013,7 @@ static void __init fixup_device_tree_efika_add_phy(void) " 0x3 encode-int encode+" " s\" interrupts\" property" " finish-device"); - }; + } /* Check for a PHY device node - if missing then create one and * give it's phandle to the ethernet node */ @@ -3185,59 +3240,11 @@ static void __init prom_check_initrd(unsigned long r3, unsigned long r4) #endif /* CONFIG_BLK_DEV_INITRD */ } -#ifdef CONFIG_PPC64 -#ifdef CONFIG_RELOCATABLE -static void reloc_toc(void) -{ -} - -static void unreloc_toc(void) -{ -} -#else -static void __reloc_toc(unsigned long offset, unsigned long nr_entries) -{ - unsigned long i; - unsigned long *toc_entry; - - /* Get the start of the TOC by using r2 directly. */ - asm volatile("addi %0,2,-0x8000" : "=b" (toc_entry)); - - for (i = 0; i < nr_entries; i++) { - *toc_entry = *toc_entry + offset; - toc_entry++; - } -} - -static void reloc_toc(void) -{ - unsigned long offset = reloc_offset(); - unsigned long nr_entries = - (__prom_init_toc_end - __prom_init_toc_start) / sizeof(long); - - __reloc_toc(offset, nr_entries); - - mb(); -} - -static void unreloc_toc(void) -{ - unsigned long offset = reloc_offset(); - unsigned long nr_entries = - (__prom_init_toc_end - __prom_init_toc_start) / sizeof(long); - - mb(); - - __reloc_toc(-offset, nr_entries); -} -#endif -#endif - #ifdef CONFIG_PPC_SVM /* * Perform the Enter Secure Mode ultracall. */ -static int enter_secure_mode(unsigned long kbase, unsigned long fdt) +static int __init enter_secure_mode(unsigned long kbase, unsigned long fdt) { register unsigned long r3 asm("r3") = UV_ESM; register unsigned long r4 asm("r4") = kbase; @@ -3251,7 +3258,7 @@ static int enter_secure_mode(unsigned long kbase, unsigned long fdt) /* * Call the Ultravisor to transfer us to secure memory if we have an ESM blob. */ -static void setup_secure_guest(unsigned long kbase, unsigned long fdt) +static void __init setup_secure_guest(unsigned long kbase, unsigned long fdt) { int ret; @@ -3266,14 +3273,12 @@ static void setup_secure_guest(unsigned long kbase, unsigned long fdt) * relocated it so the check will fail. Restore the original image by * relocating it back to the kernel virtual base address. */ - if (IS_ENABLED(CONFIG_RELOCATABLE)) - relocate(KERNELBASE); + relocate(KERNELBASE); ret = enter_secure_mode(kbase, fdt); /* Relocate the kernel again. */ - if (IS_ENABLED(CONFIG_RELOCATABLE)) - relocate(kbase); + relocate(kbase); if (ret != U_SUCCESS) { prom_printf("Returned %d from switching to secure mode.\n", ret); @@ -3281,7 +3286,7 @@ static void setup_secure_guest(unsigned long kbase, unsigned long fdt) } } #else -static void setup_secure_guest(unsigned long kbase, unsigned long fdt) +static void __init setup_secure_guest(unsigned long kbase, unsigned long fdt) { } #endif /* CONFIG_PPC_SVM */ @@ -3301,8 +3306,6 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, #ifdef CONFIG_PPC32 unsigned long offset = reloc_offset(); reloc_got2(offset); -#else - reloc_toc(); #endif /* @@ -3409,7 +3412,7 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, * * PowerMacs use a different mechanism to spin CPUs * - * (This must be done after instanciating RTAS) + * (This must be done after instantiating RTAS) */ if (of_platform != PLATFORM_POWERMAC) prom_hold_cpus(); @@ -3474,14 +3477,11 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, */ hdr = dt_header_start; - /* Don't print anything after quiesce under OPAL, it crashes OFW */ prom_printf("Booting Linux via __start() @ 0x%lx ...\n", kbase); prom_debug("->dt_header_start=0x%lx\n", hdr); #ifdef CONFIG_PPC32 reloc_got2(-offset); -#else - unreloc_toc(); #endif /* Move to secure memory if we're supposed to be secure guests. */ diff --git a/arch/powerpc/kernel/prom_init_check.sh b/arch/powerpc/kernel/prom_init_check.sh index b183ab9c5107..69623b9045d5 100644 --- a/arch/powerpc/kernel/prom_init_check.sh +++ b/arch/powerpc/kernel/prom_init_check.sh @@ -13,8 +13,13 @@ # If you really need to reference something from prom_init.o add # it to the list below: -grep "^CONFIG_KASAN=y$" .config >/dev/null -if [ $? -eq 0 ] +has_renamed_memintrinsics() +{ + grep -q "^CONFIG_KASAN=y$" ${KCONFIG_CONFIG} && \ + ! grep -q "^CONFIG_CC_HAS_KASAN_MEMINTRINSIC_PREFIX=y" ${KCONFIG_CONFIG} +} + +if has_renamed_memintrinsics then MEM_FUNCS="__memcpy __memset" else @@ -26,8 +31,7 @@ _end enter_prom $MEM_FUNCS reloc_offset __secondary_hold __secondary_hold_acknowledge __secondary_hold_spinloop __start logo_linux_clut224 btext_prepare_BAT reloc_got2 kernstart_addr memstart_addr linux_banner _stext -__prom_init_toc_start __prom_init_toc_end btext_setup_display TOC. -relocate" +btext_setup_display TOC. relocate" NM="$1" OBJ="$2" @@ -52,11 +56,10 @@ do # a leading . on the name, so strip it off here. UNDEF="${UNDEF#.}" - if [ $KBUILD_VERBOSE ]; then - if [ $KBUILD_VERBOSE -ne 0 ]; then - echo "Checking prom_init.o symbol '$UNDEF'" - fi - fi + case "$KBUILD_VERBOSE" in + *1*) + echo "Checking prom_init.o symbol '$UNDEF'" ;; + esac OK=0 for WHITE in $WHITELIST diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c deleted file mode 100644 index 25c0424e8868..000000000000 --- a/arch/powerpc/kernel/ptrace.c +++ /dev/null @@ -1,3468 +0,0 @@ -/* - * PowerPC version - * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) - * - * Derived from "arch/m68k/kernel/ptrace.c" - * Copyright (C) 1994 by Hamish Macdonald - * Taken from linux/kernel/ptrace.c and modified for M680x0. - * linux/kernel/ptrace.c is by Ross Biro 1/23/92, edited by Linus Torvalds - * - * Modified by Cort Dougan (cort@hq.fsmlabs.com) - * and Paul Mackerras (paulus@samba.org). - * - * This file is subject to the terms and conditions of the GNU General - * Public License. See the file README.legal in the main directory of - * this archive for more details. - */ - -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/mm.h> -#include <linux/smp.h> -#include <linux/errno.h> -#include <linux/ptrace.h> -#include <linux/regset.h> -#include <linux/tracehook.h> -#include <linux/elf.h> -#include <linux/user.h> -#include <linux/security.h> -#include <linux/signal.h> -#include <linux/seccomp.h> -#include <linux/audit.h> -#include <trace/syscall.h> -#include <linux/hw_breakpoint.h> -#include <linux/perf_event.h> -#include <linux/context_tracking.h> -#include <linux/nospec.h> - -#include <linux/uaccess.h> -#include <linux/pkeys.h> -#include <asm/page.h> -#include <asm/pgtable.h> -#include <asm/switch_to.h> -#include <asm/tm.h> -#include <asm/asm-prototypes.h> -#include <asm/debug.h> -#include <asm/hw_breakpoint.h> - -#define CREATE_TRACE_POINTS -#include <trace/events/syscalls.h> - -/* - * The parameter save area on the stack is used to store arguments being passed - * to callee function and is located at fixed offset from stack pointer. - */ -#ifdef CONFIG_PPC32 -#define PARAMETER_SAVE_AREA_OFFSET 24 /* bytes */ -#else /* CONFIG_PPC32 */ -#define PARAMETER_SAVE_AREA_OFFSET 48 /* bytes */ -#endif - -struct pt_regs_offset { - const char *name; - int offset; -}; - -#define STR(s) #s /* convert to string */ -#define REG_OFFSET_NAME(r) {.name = #r, .offset = offsetof(struct pt_regs, r)} -#define GPR_OFFSET_NAME(num) \ - {.name = STR(r##num), .offset = offsetof(struct pt_regs, gpr[num])}, \ - {.name = STR(gpr##num), .offset = offsetof(struct pt_regs, gpr[num])} -#define REG_OFFSET_END {.name = NULL, .offset = 0} - -#define TVSO(f) (offsetof(struct thread_vr_state, f)) -#define TFSO(f) (offsetof(struct thread_fp_state, f)) -#define TSO(f) (offsetof(struct thread_struct, f)) - -static const struct pt_regs_offset regoffset_table[] = { - GPR_OFFSET_NAME(0), - GPR_OFFSET_NAME(1), - GPR_OFFSET_NAME(2), - GPR_OFFSET_NAME(3), - GPR_OFFSET_NAME(4), - GPR_OFFSET_NAME(5), - GPR_OFFSET_NAME(6), - GPR_OFFSET_NAME(7), - GPR_OFFSET_NAME(8), - GPR_OFFSET_NAME(9), - GPR_OFFSET_NAME(10), - GPR_OFFSET_NAME(11), - GPR_OFFSET_NAME(12), - GPR_OFFSET_NAME(13), - GPR_OFFSET_NAME(14), - GPR_OFFSET_NAME(15), - GPR_OFFSET_NAME(16), - GPR_OFFSET_NAME(17), - GPR_OFFSET_NAME(18), - GPR_OFFSET_NAME(19), - GPR_OFFSET_NAME(20), - GPR_OFFSET_NAME(21), - GPR_OFFSET_NAME(22), - GPR_OFFSET_NAME(23), - GPR_OFFSET_NAME(24), - GPR_OFFSET_NAME(25), - GPR_OFFSET_NAME(26), - GPR_OFFSET_NAME(27), - GPR_OFFSET_NAME(28), - GPR_OFFSET_NAME(29), - GPR_OFFSET_NAME(30), - GPR_OFFSET_NAME(31), - REG_OFFSET_NAME(nip), - REG_OFFSET_NAME(msr), - REG_OFFSET_NAME(ctr), - REG_OFFSET_NAME(link), - REG_OFFSET_NAME(xer), - REG_OFFSET_NAME(ccr), -#ifdef CONFIG_PPC64 - REG_OFFSET_NAME(softe), -#else - REG_OFFSET_NAME(mq), -#endif - REG_OFFSET_NAME(trap), - REG_OFFSET_NAME(dar), - REG_OFFSET_NAME(dsisr), - REG_OFFSET_END, -}; - -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM -static void flush_tmregs_to_thread(struct task_struct *tsk) -{ - /* - * If task is not current, it will have been flushed already to - * it's thread_struct during __switch_to(). - * - * A reclaim flushes ALL the state or if not in TM save TM SPRs - * in the appropriate thread structures from live. - */ - - if ((!cpu_has_feature(CPU_FTR_TM)) || (tsk != current)) - return; - - if (MSR_TM_SUSPENDED(mfmsr())) { - tm_reclaim_current(TM_CAUSE_SIGNAL); - } else { - tm_enable(); - tm_save_sprs(&(tsk->thread)); - } -} -#else -static inline void flush_tmregs_to_thread(struct task_struct *tsk) { } -#endif - -/** - * regs_query_register_offset() - query register offset from its name - * @name: the name of a register - * - * regs_query_register_offset() returns the offset of a register in struct - * pt_regs from its name. If the name is invalid, this returns -EINVAL; - */ -int regs_query_register_offset(const char *name) -{ - const struct pt_regs_offset *roff; - for (roff = regoffset_table; roff->name != NULL; roff++) - if (!strcmp(roff->name, name)) - return roff->offset; - return -EINVAL; -} - -/** - * regs_query_register_name() - query register name from its offset - * @offset: the offset of a register in struct pt_regs. - * - * regs_query_register_name() returns the name of a register from its - * offset in struct pt_regs. If the @offset is invalid, this returns NULL; - */ -const char *regs_query_register_name(unsigned int offset) -{ - const struct pt_regs_offset *roff; - for (roff = regoffset_table; roff->name != NULL; roff++) - if (roff->offset == offset) - return roff->name; - return NULL; -} - -/* - * does not yet catch signals sent when the child dies. - * in exit.c or in signal.c. - */ - -/* - * Set of msr bits that gdb can change on behalf of a process. - */ -#ifdef CONFIG_PPC_ADV_DEBUG_REGS -#define MSR_DEBUGCHANGE 0 -#else -#define MSR_DEBUGCHANGE (MSR_SE | MSR_BE) -#endif - -/* - * Max register writeable via put_reg - */ -#ifdef CONFIG_PPC32 -#define PT_MAX_PUT_REG PT_MQ -#else -#define PT_MAX_PUT_REG PT_CCR -#endif - -static unsigned long get_user_msr(struct task_struct *task) -{ - return task->thread.regs->msr | task->thread.fpexc_mode; -} - -static int set_user_msr(struct task_struct *task, unsigned long msr) -{ - task->thread.regs->msr &= ~MSR_DEBUGCHANGE; - task->thread.regs->msr |= msr & MSR_DEBUGCHANGE; - return 0; -} - -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM -static unsigned long get_user_ckpt_msr(struct task_struct *task) -{ - return task->thread.ckpt_regs.msr | task->thread.fpexc_mode; -} - -static int set_user_ckpt_msr(struct task_struct *task, unsigned long msr) -{ - task->thread.ckpt_regs.msr &= ~MSR_DEBUGCHANGE; - task->thread.ckpt_regs.msr |= msr & MSR_DEBUGCHANGE; - return 0; -} - -static int set_user_ckpt_trap(struct task_struct *task, unsigned long trap) -{ - task->thread.ckpt_regs.trap = trap & 0xfff0; - return 0; -} -#endif - -#ifdef CONFIG_PPC64 -static int get_user_dscr(struct task_struct *task, unsigned long *data) -{ - *data = task->thread.dscr; - return 0; -} - -static int set_user_dscr(struct task_struct *task, unsigned long dscr) -{ - task->thread.dscr = dscr; - task->thread.dscr_inherit = 1; - return 0; -} -#else -static int get_user_dscr(struct task_struct *task, unsigned long *data) -{ - return -EIO; -} - -static int set_user_dscr(struct task_struct *task, unsigned long dscr) -{ - return -EIO; -} -#endif - -/* - * We prevent mucking around with the reserved area of trap - * which are used internally by the kernel. - */ -static int set_user_trap(struct task_struct *task, unsigned long trap) -{ - task->thread.regs->trap = trap & 0xfff0; - return 0; -} - -/* - * Get contents of register REGNO in task TASK. - */ -int ptrace_get_reg(struct task_struct *task, int regno, unsigned long *data) -{ - unsigned int regs_max; - - if ((task->thread.regs == NULL) || !data) - return -EIO; - - if (regno == PT_MSR) { - *data = get_user_msr(task); - return 0; - } - - if (regno == PT_DSCR) - return get_user_dscr(task, data); - -#ifdef CONFIG_PPC64 - /* - * softe copies paca->irq_soft_mask variable state. Since irq_soft_mask is - * no more used as a flag, lets force usr to alway see the softe value as 1 - * which means interrupts are not soft disabled. - */ - if (regno == PT_SOFTE) { - *data = 1; - return 0; - } -#endif - - regs_max = sizeof(struct user_pt_regs) / sizeof(unsigned long); - if (regno < regs_max) { - regno = array_index_nospec(regno, regs_max); - *data = ((unsigned long *)task->thread.regs)[regno]; - return 0; - } - - return -EIO; -} - -/* - * Write contents of register REGNO in task TASK. - */ -int ptrace_put_reg(struct task_struct *task, int regno, unsigned long data) -{ - if (task->thread.regs == NULL) - return -EIO; - - if (regno == PT_MSR) - return set_user_msr(task, data); - if (regno == PT_TRAP) - return set_user_trap(task, data); - if (regno == PT_DSCR) - return set_user_dscr(task, data); - - if (regno <= PT_MAX_PUT_REG) { - regno = array_index_nospec(regno, PT_MAX_PUT_REG + 1); - ((unsigned long *)task->thread.regs)[regno] = data; - return 0; - } - return -EIO; -} - -static int gpr_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) -{ - int i, ret; - - if (target->thread.regs == NULL) - return -EIO; - - if (!FULL_REGS(target->thread.regs)) { - /* We have a partial register set. Fill 14-31 with bogus values */ - for (i = 14; i < 32; i++) - target->thread.regs->gpr[i] = NV_REG_POISON; - } - - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, - target->thread.regs, - 0, offsetof(struct pt_regs, msr)); - if (!ret) { - unsigned long msr = get_user_msr(target); - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &msr, - offsetof(struct pt_regs, msr), - offsetof(struct pt_regs, msr) + - sizeof(msr)); - } - - BUILD_BUG_ON(offsetof(struct pt_regs, orig_gpr3) != - offsetof(struct pt_regs, msr) + sizeof(long)); - - if (!ret) - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.regs->orig_gpr3, - offsetof(struct pt_regs, orig_gpr3), - sizeof(struct user_pt_regs)); - if (!ret) - ret = user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf, - sizeof(struct user_pt_regs), -1); - - return ret; -} - -static int gpr_set(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) -{ - unsigned long reg; - int ret; - - if (target->thread.regs == NULL) - return -EIO; - - CHECK_FULL_REGS(target->thread.regs); - - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - target->thread.regs, - 0, PT_MSR * sizeof(reg)); - - if (!ret && count > 0) { - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, ®, - PT_MSR * sizeof(reg), - (PT_MSR + 1) * sizeof(reg)); - if (!ret) - ret = set_user_msr(target, reg); - } - - BUILD_BUG_ON(offsetof(struct pt_regs, orig_gpr3) != - offsetof(struct pt_regs, msr) + sizeof(long)); - - if (!ret) - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.regs->orig_gpr3, - PT_ORIG_R3 * sizeof(reg), - (PT_MAX_PUT_REG + 1) * sizeof(reg)); - - if (PT_MAX_PUT_REG + 1 < PT_TRAP && !ret) - ret = user_regset_copyin_ignore( - &pos, &count, &kbuf, &ubuf, - (PT_MAX_PUT_REG + 1) * sizeof(reg), - PT_TRAP * sizeof(reg)); - - if (!ret && count > 0) { - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, ®, - PT_TRAP * sizeof(reg), - (PT_TRAP + 1) * sizeof(reg)); - if (!ret) - ret = set_user_trap(target, reg); - } - - if (!ret) - ret = user_regset_copyin_ignore( - &pos, &count, &kbuf, &ubuf, - (PT_TRAP + 1) * sizeof(reg), -1); - - return ret; -} - -/* - * Regardless of transactions, 'fp_state' holds the current running - * value of all FPR registers and 'ckfp_state' holds the last checkpointed - * value of all FPR registers for the current transaction. - * - * Userspace interface buffer layout: - * - * struct data { - * u64 fpr[32]; - * u64 fpscr; - * }; - */ -static int fpr_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) -{ -#ifdef CONFIG_VSX - u64 buf[33]; - int i; - - flush_fp_to_thread(target); - - /* copy to local buffer then write that out */ - for (i = 0; i < 32 ; i++) - buf[i] = target->thread.TS_FPR(i); - buf[32] = target->thread.fp_state.fpscr; - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, buf, 0, -1); -#else - BUILD_BUG_ON(offsetof(struct thread_fp_state, fpscr) != - offsetof(struct thread_fp_state, fpr[32])); - - flush_fp_to_thread(target); - - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.fp_state, 0, -1); -#endif -} - -/* - * Regardless of transactions, 'fp_state' holds the current running - * value of all FPR registers and 'ckfp_state' holds the last checkpointed - * value of all FPR registers for the current transaction. - * - * Userspace interface buffer layout: - * - * struct data { - * u64 fpr[32]; - * u64 fpscr; - * }; - * - */ -static int fpr_set(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) -{ -#ifdef CONFIG_VSX - u64 buf[33]; - int i; - - flush_fp_to_thread(target); - - for (i = 0; i < 32 ; i++) - buf[i] = target->thread.TS_FPR(i); - buf[32] = target->thread.fp_state.fpscr; - - /* copy to local buffer then write that out */ - i = user_regset_copyin(&pos, &count, &kbuf, &ubuf, buf, 0, -1); - if (i) - return i; - - for (i = 0; i < 32 ; i++) - target->thread.TS_FPR(i) = buf[i]; - target->thread.fp_state.fpscr = buf[32]; - return 0; -#else - BUILD_BUG_ON(offsetof(struct thread_fp_state, fpscr) != - offsetof(struct thread_fp_state, fpr[32])); - - flush_fp_to_thread(target); - - return user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.fp_state, 0, -1); -#endif -} - -#ifdef CONFIG_ALTIVEC -/* - * Get/set all the altivec registers vr0..vr31, vscr, vrsave, in one go. - * The transfer totals 34 quadword. Quadwords 0-31 contain the - * corresponding vector registers. Quadword 32 contains the vscr as the - * last word (offset 12) within that quadword. Quadword 33 contains the - * vrsave as the first word (offset 0) within the quadword. - * - * This definition of the VMX state is compatible with the current PPC32 - * ptrace interface. This allows signal handling and ptrace to use the - * same structures. This also simplifies the implementation of a bi-arch - * (combined (32- and 64-bit) gdb. - */ - -static int vr_active(struct task_struct *target, - const struct user_regset *regset) -{ - flush_altivec_to_thread(target); - return target->thread.used_vr ? regset->n : 0; -} - -/* - * Regardless of transactions, 'vr_state' holds the current running - * value of all the VMX registers and 'ckvr_state' holds the last - * checkpointed value of all the VMX registers for the current - * transaction to fall back on in case it aborts. - * - * Userspace interface buffer layout: - * - * struct data { - * vector128 vr[32]; - * vector128 vscr; - * vector128 vrsave; - * }; - */ -static int vr_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) -{ - int ret; - - flush_altivec_to_thread(target); - - BUILD_BUG_ON(offsetof(struct thread_vr_state, vscr) != - offsetof(struct thread_vr_state, vr[32])); - - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.vr_state, 0, - 33 * sizeof(vector128)); - if (!ret) { - /* - * Copy out only the low-order word of vrsave. - */ - int start, end; - union { - elf_vrreg_t reg; - u32 word; - } vrsave; - memset(&vrsave, 0, sizeof(vrsave)); - - vrsave.word = target->thread.vrsave; - - start = 33 * sizeof(vector128); - end = start + sizeof(vrsave); - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &vrsave, - start, end); - } - - return ret; -} - -/* - * Regardless of transactions, 'vr_state' holds the current running - * value of all the VMX registers and 'ckvr_state' holds the last - * checkpointed value of all the VMX registers for the current - * transaction to fall back on in case it aborts. - * - * Userspace interface buffer layout: - * - * struct data { - * vector128 vr[32]; - * vector128 vscr; - * vector128 vrsave; - * }; - */ -static int vr_set(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) -{ - int ret; - - flush_altivec_to_thread(target); - - BUILD_BUG_ON(offsetof(struct thread_vr_state, vscr) != - offsetof(struct thread_vr_state, vr[32])); - - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.vr_state, 0, - 33 * sizeof(vector128)); - if (!ret && count > 0) { - /* - * We use only the first word of vrsave. - */ - int start, end; - union { - elf_vrreg_t reg; - u32 word; - } vrsave; - memset(&vrsave, 0, sizeof(vrsave)); - - vrsave.word = target->thread.vrsave; - - start = 33 * sizeof(vector128); - end = start + sizeof(vrsave); - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &vrsave, - start, end); - if (!ret) - target->thread.vrsave = vrsave.word; - } - - return ret; -} -#endif /* CONFIG_ALTIVEC */ - -#ifdef CONFIG_VSX -/* - * Currently to set and and get all the vsx state, you need to call - * the fp and VMX calls as well. This only get/sets the lower 32 - * 128bit VSX registers. - */ - -static int vsr_active(struct task_struct *target, - const struct user_regset *regset) -{ - flush_vsx_to_thread(target); - return target->thread.used_vsr ? regset->n : 0; -} - -/* - * Regardless of transactions, 'fp_state' holds the current running - * value of all FPR registers and 'ckfp_state' holds the last - * checkpointed value of all FPR registers for the current - * transaction. - * - * Userspace interface buffer layout: - * - * struct data { - * u64 vsx[32]; - * }; - */ -static int vsr_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) -{ - u64 buf[32]; - int ret, i; - - flush_tmregs_to_thread(target); - flush_fp_to_thread(target); - flush_altivec_to_thread(target); - flush_vsx_to_thread(target); - - for (i = 0; i < 32 ; i++) - buf[i] = target->thread.fp_state.fpr[i][TS_VSRLOWOFFSET]; - - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, - buf, 0, 32 * sizeof(double)); - - return ret; -} - -/* - * Regardless of transactions, 'fp_state' holds the current running - * value of all FPR registers and 'ckfp_state' holds the last - * checkpointed value of all FPR registers for the current - * transaction. - * - * Userspace interface buffer layout: - * - * struct data { - * u64 vsx[32]; - * }; - */ -static int vsr_set(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) -{ - u64 buf[32]; - int ret,i; - - flush_tmregs_to_thread(target); - flush_fp_to_thread(target); - flush_altivec_to_thread(target); - flush_vsx_to_thread(target); - - for (i = 0; i < 32 ; i++) - buf[i] = target->thread.fp_state.fpr[i][TS_VSRLOWOFFSET]; - - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - buf, 0, 32 * sizeof(double)); - if (!ret) - for (i = 0; i < 32 ; i++) - target->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = buf[i]; - - return ret; -} -#endif /* CONFIG_VSX */ - -#ifdef CONFIG_SPE - -/* - * For get_evrregs/set_evrregs functions 'data' has the following layout: - * - * struct { - * u32 evr[32]; - * u64 acc; - * u32 spefscr; - * } - */ - -static int evr_active(struct task_struct *target, - const struct user_regset *regset) -{ - flush_spe_to_thread(target); - return target->thread.used_spe ? regset->n : 0; -} - -static int evr_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) -{ - int ret; - - flush_spe_to_thread(target); - - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.evr, - 0, sizeof(target->thread.evr)); - - BUILD_BUG_ON(offsetof(struct thread_struct, acc) + sizeof(u64) != - offsetof(struct thread_struct, spefscr)); - - if (!ret) - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.acc, - sizeof(target->thread.evr), -1); - - return ret; -} - -static int evr_set(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) -{ - int ret; - - flush_spe_to_thread(target); - - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.evr, - 0, sizeof(target->thread.evr)); - - BUILD_BUG_ON(offsetof(struct thread_struct, acc) + sizeof(u64) != - offsetof(struct thread_struct, spefscr)); - - if (!ret) - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.acc, - sizeof(target->thread.evr), -1); - - return ret; -} -#endif /* CONFIG_SPE */ - -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM -/** - * tm_cgpr_active - get active number of registers in CGPR - * @target: The target task. - * @regset: The user regset structure. - * - * This function checks for the active number of available - * regisers in transaction checkpointed GPR category. - */ -static int tm_cgpr_active(struct task_struct *target, - const struct user_regset *regset) -{ - if (!cpu_has_feature(CPU_FTR_TM)) - return -ENODEV; - - if (!MSR_TM_ACTIVE(target->thread.regs->msr)) - return 0; - - return regset->n; -} - -/** - * tm_cgpr_get - get CGPR registers - * @target: The target task. - * @regset: The user regset structure. - * @pos: The buffer position. - * @count: Number of bytes to copy. - * @kbuf: Kernel buffer to copy from. - * @ubuf: User buffer to copy into. - * - * This function gets transaction checkpointed GPR registers. - * - * When the transaction is active, 'ckpt_regs' holds all the checkpointed - * GPR register values for the current transaction to fall back on if it - * aborts in between. This function gets those checkpointed GPR registers. - * The userspace interface buffer layout is as follows. - * - * struct data { - * struct pt_regs ckpt_regs; - * }; - */ -static int tm_cgpr_get(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) -{ - int ret; - - if (!cpu_has_feature(CPU_FTR_TM)) - return -ENODEV; - - if (!MSR_TM_ACTIVE(target->thread.regs->msr)) - return -ENODATA; - - flush_tmregs_to_thread(target); - flush_fp_to_thread(target); - flush_altivec_to_thread(target); - - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.ckpt_regs, - 0, offsetof(struct pt_regs, msr)); - if (!ret) { - unsigned long msr = get_user_ckpt_msr(target); - - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &msr, - offsetof(struct pt_regs, msr), - offsetof(struct pt_regs, msr) + - sizeof(msr)); - } - - BUILD_BUG_ON(offsetof(struct pt_regs, orig_gpr3) != - offsetof(struct pt_regs, msr) + sizeof(long)); - - if (!ret) - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.ckpt_regs.orig_gpr3, - offsetof(struct pt_regs, orig_gpr3), - sizeof(struct user_pt_regs)); - if (!ret) - ret = user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf, - sizeof(struct user_pt_regs), -1); - - return ret; -} - -/* - * tm_cgpr_set - set the CGPR registers - * @target: The target task. - * @regset: The user regset structure. - * @pos: The buffer position. - * @count: Number of bytes to copy. - * @kbuf: Kernel buffer to copy into. - * @ubuf: User buffer to copy from. - * - * This function sets in transaction checkpointed GPR registers. - * - * When the transaction is active, 'ckpt_regs' holds the checkpointed - * GPR register values for the current transaction to fall back on if it - * aborts in between. This function sets those checkpointed GPR registers. - * The userspace interface buffer layout is as follows. - * - * struct data { - * struct pt_regs ckpt_regs; - * }; - */ -static int tm_cgpr_set(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) -{ - unsigned long reg; - int ret; - - if (!cpu_has_feature(CPU_FTR_TM)) - return -ENODEV; - - if (!MSR_TM_ACTIVE(target->thread.regs->msr)) - return -ENODATA; - - flush_tmregs_to_thread(target); - flush_fp_to_thread(target); - flush_altivec_to_thread(target); - - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.ckpt_regs, - 0, PT_MSR * sizeof(reg)); - - if (!ret && count > 0) { - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, ®, - PT_MSR * sizeof(reg), - (PT_MSR + 1) * sizeof(reg)); - if (!ret) - ret = set_user_ckpt_msr(target, reg); - } - - BUILD_BUG_ON(offsetof(struct pt_regs, orig_gpr3) != - offsetof(struct pt_regs, msr) + sizeof(long)); - - if (!ret) - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.ckpt_regs.orig_gpr3, - PT_ORIG_R3 * sizeof(reg), - (PT_MAX_PUT_REG + 1) * sizeof(reg)); - - if (PT_MAX_PUT_REG + 1 < PT_TRAP && !ret) - ret = user_regset_copyin_ignore( - &pos, &count, &kbuf, &ubuf, - (PT_MAX_PUT_REG + 1) * sizeof(reg), - PT_TRAP * sizeof(reg)); - - if (!ret && count > 0) { - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, ®, - PT_TRAP * sizeof(reg), - (PT_TRAP + 1) * sizeof(reg)); - if (!ret) - ret = set_user_ckpt_trap(target, reg); - } - - if (!ret) - ret = user_regset_copyin_ignore( - &pos, &count, &kbuf, &ubuf, - (PT_TRAP + 1) * sizeof(reg), -1); - - return ret; -} - -/** - * tm_cfpr_active - get active number of registers in CFPR - * @target: The target task. - * @regset: The user regset structure. - * - * This function checks for the active number of available - * regisers in transaction checkpointed FPR category. - */ -static int tm_cfpr_active(struct task_struct *target, - const struct user_regset *regset) -{ - if (!cpu_has_feature(CPU_FTR_TM)) - return -ENODEV; - - if (!MSR_TM_ACTIVE(target->thread.regs->msr)) - return 0; - - return regset->n; -} - -/** - * tm_cfpr_get - get CFPR registers - * @target: The target task. - * @regset: The user regset structure. - * @pos: The buffer position. - * @count: Number of bytes to copy. - * @kbuf: Kernel buffer to copy from. - * @ubuf: User buffer to copy into. - * - * This function gets in transaction checkpointed FPR registers. - * - * When the transaction is active 'ckfp_state' holds the checkpointed - * values for the current transaction to fall back on if it aborts - * in between. This function gets those checkpointed FPR registers. - * The userspace interface buffer layout is as follows. - * - * struct data { - * u64 fpr[32]; - * u64 fpscr; - *}; - */ -static int tm_cfpr_get(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) -{ - u64 buf[33]; - int i; - - if (!cpu_has_feature(CPU_FTR_TM)) - return -ENODEV; - - if (!MSR_TM_ACTIVE(target->thread.regs->msr)) - return -ENODATA; - - flush_tmregs_to_thread(target); - flush_fp_to_thread(target); - flush_altivec_to_thread(target); - - /* copy to local buffer then write that out */ - for (i = 0; i < 32 ; i++) - buf[i] = target->thread.TS_CKFPR(i); - buf[32] = target->thread.ckfp_state.fpscr; - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, buf, 0, -1); -} - -/** - * tm_cfpr_set - set CFPR registers - * @target: The target task. - * @regset: The user regset structure. - * @pos: The buffer position. - * @count: Number of bytes to copy. - * @kbuf: Kernel buffer to copy into. - * @ubuf: User buffer to copy from. - * - * This function sets in transaction checkpointed FPR registers. - * - * When the transaction is active 'ckfp_state' holds the checkpointed - * FPR register values for the current transaction to fall back on - * if it aborts in between. This function sets these checkpointed - * FPR registers. The userspace interface buffer layout is as follows. - * - * struct data { - * u64 fpr[32]; - * u64 fpscr; - *}; - */ -static int tm_cfpr_set(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) -{ - u64 buf[33]; - int i; - - if (!cpu_has_feature(CPU_FTR_TM)) - return -ENODEV; - - if (!MSR_TM_ACTIVE(target->thread.regs->msr)) - return -ENODATA; - - flush_tmregs_to_thread(target); - flush_fp_to_thread(target); - flush_altivec_to_thread(target); - - for (i = 0; i < 32; i++) - buf[i] = target->thread.TS_CKFPR(i); - buf[32] = target->thread.ckfp_state.fpscr; - - /* copy to local buffer then write that out */ - i = user_regset_copyin(&pos, &count, &kbuf, &ubuf, buf, 0, -1); - if (i) - return i; - for (i = 0; i < 32 ; i++) - target->thread.TS_CKFPR(i) = buf[i]; - target->thread.ckfp_state.fpscr = buf[32]; - return 0; -} - -/** - * tm_cvmx_active - get active number of registers in CVMX - * @target: The target task. - * @regset: The user regset structure. - * - * This function checks for the active number of available - * regisers in checkpointed VMX category. - */ -static int tm_cvmx_active(struct task_struct *target, - const struct user_regset *regset) -{ - if (!cpu_has_feature(CPU_FTR_TM)) - return -ENODEV; - - if (!MSR_TM_ACTIVE(target->thread.regs->msr)) - return 0; - - return regset->n; -} - -/** - * tm_cvmx_get - get CMVX registers - * @target: The target task. - * @regset: The user regset structure. - * @pos: The buffer position. - * @count: Number of bytes to copy. - * @kbuf: Kernel buffer to copy from. - * @ubuf: User buffer to copy into. - * - * This function gets in transaction checkpointed VMX registers. - * - * When the transaction is active 'ckvr_state' and 'ckvrsave' hold - * the checkpointed values for the current transaction to fall - * back on if it aborts in between. The userspace interface buffer - * layout is as follows. - * - * struct data { - * vector128 vr[32]; - * vector128 vscr; - * vector128 vrsave; - *}; - */ -static int tm_cvmx_get(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) -{ - int ret; - - BUILD_BUG_ON(TVSO(vscr) != TVSO(vr[32])); - - if (!cpu_has_feature(CPU_FTR_TM)) - return -ENODEV; - - if (!MSR_TM_ACTIVE(target->thread.regs->msr)) - return -ENODATA; - - /* Flush the state */ - flush_tmregs_to_thread(target); - flush_fp_to_thread(target); - flush_altivec_to_thread(target); - - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.ckvr_state, 0, - 33 * sizeof(vector128)); - if (!ret) { - /* - * Copy out only the low-order word of vrsave. - */ - union { - elf_vrreg_t reg; - u32 word; - } vrsave; - memset(&vrsave, 0, sizeof(vrsave)); - vrsave.word = target->thread.ckvrsave; - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &vrsave, - 33 * sizeof(vector128), -1); - } - - return ret; -} - -/** - * tm_cvmx_set - set CMVX registers - * @target: The target task. - * @regset: The user regset structure. - * @pos: The buffer position. - * @count: Number of bytes to copy. - * @kbuf: Kernel buffer to copy into. - * @ubuf: User buffer to copy from. - * - * This function sets in transaction checkpointed VMX registers. - * - * When the transaction is active 'ckvr_state' and 'ckvrsave' hold - * the checkpointed values for the current transaction to fall - * back on if it aborts in between. The userspace interface buffer - * layout is as follows. - * - * struct data { - * vector128 vr[32]; - * vector128 vscr; - * vector128 vrsave; - *}; - */ -static int tm_cvmx_set(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) -{ - int ret; - - BUILD_BUG_ON(TVSO(vscr) != TVSO(vr[32])); - - if (!cpu_has_feature(CPU_FTR_TM)) - return -ENODEV; - - if (!MSR_TM_ACTIVE(target->thread.regs->msr)) - return -ENODATA; - - flush_tmregs_to_thread(target); - flush_fp_to_thread(target); - flush_altivec_to_thread(target); - - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.ckvr_state, 0, - 33 * sizeof(vector128)); - if (!ret && count > 0) { - /* - * We use only the low-order word of vrsave. - */ - union { - elf_vrreg_t reg; - u32 word; - } vrsave; - memset(&vrsave, 0, sizeof(vrsave)); - vrsave.word = target->thread.ckvrsave; - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &vrsave, - 33 * sizeof(vector128), -1); - if (!ret) - target->thread.ckvrsave = vrsave.word; - } - - return ret; -} - -/** - * tm_cvsx_active - get active number of registers in CVSX - * @target: The target task. - * @regset: The user regset structure. - * - * This function checks for the active number of available - * regisers in transaction checkpointed VSX category. - */ -static int tm_cvsx_active(struct task_struct *target, - const struct user_regset *regset) -{ - if (!cpu_has_feature(CPU_FTR_TM)) - return -ENODEV; - - if (!MSR_TM_ACTIVE(target->thread.regs->msr)) - return 0; - - flush_vsx_to_thread(target); - return target->thread.used_vsr ? regset->n : 0; -} - -/** - * tm_cvsx_get - get CVSX registers - * @target: The target task. - * @regset: The user regset structure. - * @pos: The buffer position. - * @count: Number of bytes to copy. - * @kbuf: Kernel buffer to copy from. - * @ubuf: User buffer to copy into. - * - * This function gets in transaction checkpointed VSX registers. - * - * When the transaction is active 'ckfp_state' holds the checkpointed - * values for the current transaction to fall back on if it aborts - * in between. This function gets those checkpointed VSX registers. - * The userspace interface buffer layout is as follows. - * - * struct data { - * u64 vsx[32]; - *}; - */ -static int tm_cvsx_get(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) -{ - u64 buf[32]; - int ret, i; - - if (!cpu_has_feature(CPU_FTR_TM)) - return -ENODEV; - - if (!MSR_TM_ACTIVE(target->thread.regs->msr)) - return -ENODATA; - - /* Flush the state */ - flush_tmregs_to_thread(target); - flush_fp_to_thread(target); - flush_altivec_to_thread(target); - flush_vsx_to_thread(target); - - for (i = 0; i < 32 ; i++) - buf[i] = target->thread.ckfp_state.fpr[i][TS_VSRLOWOFFSET]; - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, - buf, 0, 32 * sizeof(double)); - - return ret; -} - -/** - * tm_cvsx_set - set CFPR registers - * @target: The target task. - * @regset: The user regset structure. - * @pos: The buffer position. - * @count: Number of bytes to copy. - * @kbuf: Kernel buffer to copy into. - * @ubuf: User buffer to copy from. - * - * This function sets in transaction checkpointed VSX registers. - * - * When the transaction is active 'ckfp_state' holds the checkpointed - * VSX register values for the current transaction to fall back on - * if it aborts in between. This function sets these checkpointed - * FPR registers. The userspace interface buffer layout is as follows. - * - * struct data { - * u64 vsx[32]; - *}; - */ -static int tm_cvsx_set(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) -{ - u64 buf[32]; - int ret, i; - - if (!cpu_has_feature(CPU_FTR_TM)) - return -ENODEV; - - if (!MSR_TM_ACTIVE(target->thread.regs->msr)) - return -ENODATA; - - /* Flush the state */ - flush_tmregs_to_thread(target); - flush_fp_to_thread(target); - flush_altivec_to_thread(target); - flush_vsx_to_thread(target); - - for (i = 0; i < 32 ; i++) - buf[i] = target->thread.ckfp_state.fpr[i][TS_VSRLOWOFFSET]; - - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - buf, 0, 32 * sizeof(double)); - if (!ret) - for (i = 0; i < 32 ; i++) - target->thread.ckfp_state.fpr[i][TS_VSRLOWOFFSET] = buf[i]; - - return ret; -} - -/** - * tm_spr_active - get active number of registers in TM SPR - * @target: The target task. - * @regset: The user regset structure. - * - * This function checks the active number of available - * regisers in the transactional memory SPR category. - */ -static int tm_spr_active(struct task_struct *target, - const struct user_regset *regset) -{ - if (!cpu_has_feature(CPU_FTR_TM)) - return -ENODEV; - - return regset->n; -} - -/** - * tm_spr_get - get the TM related SPR registers - * @target: The target task. - * @regset: The user regset structure. - * @pos: The buffer position. - * @count: Number of bytes to copy. - * @kbuf: Kernel buffer to copy from. - * @ubuf: User buffer to copy into. - * - * This function gets transactional memory related SPR registers. - * The userspace interface buffer layout is as follows. - * - * struct { - * u64 tm_tfhar; - * u64 tm_texasr; - * u64 tm_tfiar; - * }; - */ -static int tm_spr_get(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) -{ - int ret; - - /* Build tests */ - BUILD_BUG_ON(TSO(tm_tfhar) + sizeof(u64) != TSO(tm_texasr)); - BUILD_BUG_ON(TSO(tm_texasr) + sizeof(u64) != TSO(tm_tfiar)); - BUILD_BUG_ON(TSO(tm_tfiar) + sizeof(u64) != TSO(ckpt_regs)); - - if (!cpu_has_feature(CPU_FTR_TM)) - return -ENODEV; - - /* Flush the states */ - flush_tmregs_to_thread(target); - flush_fp_to_thread(target); - flush_altivec_to_thread(target); - - /* TFHAR register */ - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.tm_tfhar, 0, sizeof(u64)); - - /* TEXASR register */ - if (!ret) - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.tm_texasr, sizeof(u64), - 2 * sizeof(u64)); - - /* TFIAR register */ - if (!ret) - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.tm_tfiar, - 2 * sizeof(u64), 3 * sizeof(u64)); - return ret; -} - -/** - * tm_spr_set - set the TM related SPR registers - * @target: The target task. - * @regset: The user regset structure. - * @pos: The buffer position. - * @count: Number of bytes to copy. - * @kbuf: Kernel buffer to copy into. - * @ubuf: User buffer to copy from. - * - * This function sets transactional memory related SPR registers. - * The userspace interface buffer layout is as follows. - * - * struct { - * u64 tm_tfhar; - * u64 tm_texasr; - * u64 tm_tfiar; - * }; - */ -static int tm_spr_set(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) -{ - int ret; - - /* Build tests */ - BUILD_BUG_ON(TSO(tm_tfhar) + sizeof(u64) != TSO(tm_texasr)); - BUILD_BUG_ON(TSO(tm_texasr) + sizeof(u64) != TSO(tm_tfiar)); - BUILD_BUG_ON(TSO(tm_tfiar) + sizeof(u64) != TSO(ckpt_regs)); - - if (!cpu_has_feature(CPU_FTR_TM)) - return -ENODEV; - - /* Flush the states */ - flush_tmregs_to_thread(target); - flush_fp_to_thread(target); - flush_altivec_to_thread(target); - - /* TFHAR register */ - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.tm_tfhar, 0, sizeof(u64)); - - /* TEXASR register */ - if (!ret) - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.tm_texasr, sizeof(u64), - 2 * sizeof(u64)); - - /* TFIAR register */ - if (!ret) - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.tm_tfiar, - 2 * sizeof(u64), 3 * sizeof(u64)); - return ret; -} - -static int tm_tar_active(struct task_struct *target, - const struct user_regset *regset) -{ - if (!cpu_has_feature(CPU_FTR_TM)) - return -ENODEV; - - if (MSR_TM_ACTIVE(target->thread.regs->msr)) - return regset->n; - - return 0; -} - -static int tm_tar_get(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) -{ - int ret; - - if (!cpu_has_feature(CPU_FTR_TM)) - return -ENODEV; - - if (!MSR_TM_ACTIVE(target->thread.regs->msr)) - return -ENODATA; - - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.tm_tar, 0, sizeof(u64)); - return ret; -} - -static int tm_tar_set(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) -{ - int ret; - - if (!cpu_has_feature(CPU_FTR_TM)) - return -ENODEV; - - if (!MSR_TM_ACTIVE(target->thread.regs->msr)) - return -ENODATA; - - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.tm_tar, 0, sizeof(u64)); - return ret; -} - -static int tm_ppr_active(struct task_struct *target, - const struct user_regset *regset) -{ - if (!cpu_has_feature(CPU_FTR_TM)) - return -ENODEV; - - if (MSR_TM_ACTIVE(target->thread.regs->msr)) - return regset->n; - - return 0; -} - - -static int tm_ppr_get(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) -{ - int ret; - - if (!cpu_has_feature(CPU_FTR_TM)) - return -ENODEV; - - if (!MSR_TM_ACTIVE(target->thread.regs->msr)) - return -ENODATA; - - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.tm_ppr, 0, sizeof(u64)); - return ret; -} - -static int tm_ppr_set(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) -{ - int ret; - - if (!cpu_has_feature(CPU_FTR_TM)) - return -ENODEV; - - if (!MSR_TM_ACTIVE(target->thread.regs->msr)) - return -ENODATA; - - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.tm_ppr, 0, sizeof(u64)); - return ret; -} - -static int tm_dscr_active(struct task_struct *target, - const struct user_regset *regset) -{ - if (!cpu_has_feature(CPU_FTR_TM)) - return -ENODEV; - - if (MSR_TM_ACTIVE(target->thread.regs->msr)) - return regset->n; - - return 0; -} - -static int tm_dscr_get(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) -{ - int ret; - - if (!cpu_has_feature(CPU_FTR_TM)) - return -ENODEV; - - if (!MSR_TM_ACTIVE(target->thread.regs->msr)) - return -ENODATA; - - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.tm_dscr, 0, sizeof(u64)); - return ret; -} - -static int tm_dscr_set(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) -{ - int ret; - - if (!cpu_has_feature(CPU_FTR_TM)) - return -ENODEV; - - if (!MSR_TM_ACTIVE(target->thread.regs->msr)) - return -ENODATA; - - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.tm_dscr, 0, sizeof(u64)); - return ret; -} -#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ - -#ifdef CONFIG_PPC64 -static int ppr_get(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) -{ - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.regs->ppr, 0, sizeof(u64)); -} - -static int ppr_set(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) -{ - return user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.regs->ppr, 0, sizeof(u64)); -} - -static int dscr_get(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) -{ - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.dscr, 0, sizeof(u64)); -} -static int dscr_set(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) -{ - return user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.dscr, 0, sizeof(u64)); -} -#endif -#ifdef CONFIG_PPC_BOOK3S_64 -static int tar_get(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) -{ - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.tar, 0, sizeof(u64)); -} -static int tar_set(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) -{ - return user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.tar, 0, sizeof(u64)); -} - -static int ebb_active(struct task_struct *target, - const struct user_regset *regset) -{ - if (!cpu_has_feature(CPU_FTR_ARCH_207S)) - return -ENODEV; - - if (target->thread.used_ebb) - return regset->n; - - return 0; -} - -static int ebb_get(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) -{ - /* Build tests */ - BUILD_BUG_ON(TSO(ebbrr) + sizeof(unsigned long) != TSO(ebbhr)); - BUILD_BUG_ON(TSO(ebbhr) + sizeof(unsigned long) != TSO(bescr)); - - if (!cpu_has_feature(CPU_FTR_ARCH_207S)) - return -ENODEV; - - if (!target->thread.used_ebb) - return -ENODATA; - - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.ebbrr, 0, 3 * sizeof(unsigned long)); -} - -static int ebb_set(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) -{ - int ret = 0; - - /* Build tests */ - BUILD_BUG_ON(TSO(ebbrr) + sizeof(unsigned long) != TSO(ebbhr)); - BUILD_BUG_ON(TSO(ebbhr) + sizeof(unsigned long) != TSO(bescr)); - - if (!cpu_has_feature(CPU_FTR_ARCH_207S)) - return -ENODEV; - - if (target->thread.used_ebb) - return -ENODATA; - - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.ebbrr, 0, sizeof(unsigned long)); - - if (!ret) - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.ebbhr, sizeof(unsigned long), - 2 * sizeof(unsigned long)); - - if (!ret) - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.bescr, - 2 * sizeof(unsigned long), 3 * sizeof(unsigned long)); - - return ret; -} -static int pmu_active(struct task_struct *target, - const struct user_regset *regset) -{ - if (!cpu_has_feature(CPU_FTR_ARCH_207S)) - return -ENODEV; - - return regset->n; -} - -static int pmu_get(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) -{ - /* Build tests */ - BUILD_BUG_ON(TSO(siar) + sizeof(unsigned long) != TSO(sdar)); - BUILD_BUG_ON(TSO(sdar) + sizeof(unsigned long) != TSO(sier)); - BUILD_BUG_ON(TSO(sier) + sizeof(unsigned long) != TSO(mmcr2)); - BUILD_BUG_ON(TSO(mmcr2) + sizeof(unsigned long) != TSO(mmcr0)); - - if (!cpu_has_feature(CPU_FTR_ARCH_207S)) - return -ENODEV; - - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.siar, 0, - 5 * sizeof(unsigned long)); -} - -static int pmu_set(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) -{ - int ret = 0; - - /* Build tests */ - BUILD_BUG_ON(TSO(siar) + sizeof(unsigned long) != TSO(sdar)); - BUILD_BUG_ON(TSO(sdar) + sizeof(unsigned long) != TSO(sier)); - BUILD_BUG_ON(TSO(sier) + sizeof(unsigned long) != TSO(mmcr2)); - BUILD_BUG_ON(TSO(mmcr2) + sizeof(unsigned long) != TSO(mmcr0)); - - if (!cpu_has_feature(CPU_FTR_ARCH_207S)) - return -ENODEV; - - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.siar, 0, - sizeof(unsigned long)); - - if (!ret) - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.sdar, sizeof(unsigned long), - 2 * sizeof(unsigned long)); - - if (!ret) - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.sier, 2 * sizeof(unsigned long), - 3 * sizeof(unsigned long)); - - if (!ret) - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.mmcr2, 3 * sizeof(unsigned long), - 4 * sizeof(unsigned long)); - - if (!ret) - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.mmcr0, 4 * sizeof(unsigned long), - 5 * sizeof(unsigned long)); - return ret; -} -#endif - -#ifdef CONFIG_PPC_MEM_KEYS -static int pkey_active(struct task_struct *target, - const struct user_regset *regset) -{ - if (!arch_pkeys_enabled()) - return -ENODEV; - - return regset->n; -} - -static int pkey_get(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) -{ - BUILD_BUG_ON(TSO(amr) + sizeof(unsigned long) != TSO(iamr)); - BUILD_BUG_ON(TSO(iamr) + sizeof(unsigned long) != TSO(uamor)); - - if (!arch_pkeys_enabled()) - return -ENODEV; - - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.amr, 0, - ELF_NPKEY * sizeof(unsigned long)); -} - -static int pkey_set(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) -{ - u64 new_amr; - int ret; - - if (!arch_pkeys_enabled()) - return -ENODEV; - - /* Only the AMR can be set from userspace */ - if (pos != 0 || count != sizeof(new_amr)) - return -EINVAL; - - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &new_amr, 0, sizeof(new_amr)); - if (ret) - return ret; - - /* UAMOR determines which bits of the AMR can be set from userspace. */ - target->thread.amr = (new_amr & target->thread.uamor) | - (target->thread.amr & ~target->thread.uamor); - - return 0; -} -#endif /* CONFIG_PPC_MEM_KEYS */ - -/* - * These are our native regset flavors. - */ -enum powerpc_regset { - REGSET_GPR, - REGSET_FPR, -#ifdef CONFIG_ALTIVEC - REGSET_VMX, -#endif -#ifdef CONFIG_VSX - REGSET_VSX, -#endif -#ifdef CONFIG_SPE - REGSET_SPE, -#endif -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM - REGSET_TM_CGPR, /* TM checkpointed GPR registers */ - REGSET_TM_CFPR, /* TM checkpointed FPR registers */ - REGSET_TM_CVMX, /* TM checkpointed VMX registers */ - REGSET_TM_CVSX, /* TM checkpointed VSX registers */ - REGSET_TM_SPR, /* TM specific SPR registers */ - REGSET_TM_CTAR, /* TM checkpointed TAR register */ - REGSET_TM_CPPR, /* TM checkpointed PPR register */ - REGSET_TM_CDSCR, /* TM checkpointed DSCR register */ -#endif -#ifdef CONFIG_PPC64 - REGSET_PPR, /* PPR register */ - REGSET_DSCR, /* DSCR register */ -#endif -#ifdef CONFIG_PPC_BOOK3S_64 - REGSET_TAR, /* TAR register */ - REGSET_EBB, /* EBB registers */ - REGSET_PMR, /* Performance Monitor Registers */ -#endif -#ifdef CONFIG_PPC_MEM_KEYS - REGSET_PKEY, /* AMR register */ -#endif -}; - -static const struct user_regset native_regsets[] = { - [REGSET_GPR] = { - .core_note_type = NT_PRSTATUS, .n = ELF_NGREG, - .size = sizeof(long), .align = sizeof(long), - .get = gpr_get, .set = gpr_set - }, - [REGSET_FPR] = { - .core_note_type = NT_PRFPREG, .n = ELF_NFPREG, - .size = sizeof(double), .align = sizeof(double), - .get = fpr_get, .set = fpr_set - }, -#ifdef CONFIG_ALTIVEC - [REGSET_VMX] = { - .core_note_type = NT_PPC_VMX, .n = 34, - .size = sizeof(vector128), .align = sizeof(vector128), - .active = vr_active, .get = vr_get, .set = vr_set - }, -#endif -#ifdef CONFIG_VSX - [REGSET_VSX] = { - .core_note_type = NT_PPC_VSX, .n = 32, - .size = sizeof(double), .align = sizeof(double), - .active = vsr_active, .get = vsr_get, .set = vsr_set - }, -#endif -#ifdef CONFIG_SPE - [REGSET_SPE] = { - .core_note_type = NT_PPC_SPE, .n = 35, - .size = sizeof(u32), .align = sizeof(u32), - .active = evr_active, .get = evr_get, .set = evr_set - }, -#endif -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM - [REGSET_TM_CGPR] = { - .core_note_type = NT_PPC_TM_CGPR, .n = ELF_NGREG, - .size = sizeof(long), .align = sizeof(long), - .active = tm_cgpr_active, .get = tm_cgpr_get, .set = tm_cgpr_set - }, - [REGSET_TM_CFPR] = { - .core_note_type = NT_PPC_TM_CFPR, .n = ELF_NFPREG, - .size = sizeof(double), .align = sizeof(double), - .active = tm_cfpr_active, .get = tm_cfpr_get, .set = tm_cfpr_set - }, - [REGSET_TM_CVMX] = { - .core_note_type = NT_PPC_TM_CVMX, .n = ELF_NVMX, - .size = sizeof(vector128), .align = sizeof(vector128), - .active = tm_cvmx_active, .get = tm_cvmx_get, .set = tm_cvmx_set - }, - [REGSET_TM_CVSX] = { - .core_note_type = NT_PPC_TM_CVSX, .n = ELF_NVSX, - .size = sizeof(double), .align = sizeof(double), - .active = tm_cvsx_active, .get = tm_cvsx_get, .set = tm_cvsx_set - }, - [REGSET_TM_SPR] = { - .core_note_type = NT_PPC_TM_SPR, .n = ELF_NTMSPRREG, - .size = sizeof(u64), .align = sizeof(u64), - .active = tm_spr_active, .get = tm_spr_get, .set = tm_spr_set - }, - [REGSET_TM_CTAR] = { - .core_note_type = NT_PPC_TM_CTAR, .n = 1, - .size = sizeof(u64), .align = sizeof(u64), - .active = tm_tar_active, .get = tm_tar_get, .set = tm_tar_set - }, - [REGSET_TM_CPPR] = { - .core_note_type = NT_PPC_TM_CPPR, .n = 1, - .size = sizeof(u64), .align = sizeof(u64), - .active = tm_ppr_active, .get = tm_ppr_get, .set = tm_ppr_set - }, - [REGSET_TM_CDSCR] = { - .core_note_type = NT_PPC_TM_CDSCR, .n = 1, - .size = sizeof(u64), .align = sizeof(u64), - .active = tm_dscr_active, .get = tm_dscr_get, .set = tm_dscr_set - }, -#endif -#ifdef CONFIG_PPC64 - [REGSET_PPR] = { - .core_note_type = NT_PPC_PPR, .n = 1, - .size = sizeof(u64), .align = sizeof(u64), - .get = ppr_get, .set = ppr_set - }, - [REGSET_DSCR] = { - .core_note_type = NT_PPC_DSCR, .n = 1, - .size = sizeof(u64), .align = sizeof(u64), - .get = dscr_get, .set = dscr_set - }, -#endif -#ifdef CONFIG_PPC_BOOK3S_64 - [REGSET_TAR] = { - .core_note_type = NT_PPC_TAR, .n = 1, - .size = sizeof(u64), .align = sizeof(u64), - .get = tar_get, .set = tar_set - }, - [REGSET_EBB] = { - .core_note_type = NT_PPC_EBB, .n = ELF_NEBB, - .size = sizeof(u64), .align = sizeof(u64), - .active = ebb_active, .get = ebb_get, .set = ebb_set - }, - [REGSET_PMR] = { - .core_note_type = NT_PPC_PMU, .n = ELF_NPMU, - .size = sizeof(u64), .align = sizeof(u64), - .active = pmu_active, .get = pmu_get, .set = pmu_set - }, -#endif -#ifdef CONFIG_PPC_MEM_KEYS - [REGSET_PKEY] = { - .core_note_type = NT_PPC_PKEY, .n = ELF_NPKEY, - .size = sizeof(u64), .align = sizeof(u64), - .active = pkey_active, .get = pkey_get, .set = pkey_set - }, -#endif -}; - -static const struct user_regset_view user_ppc_native_view = { - .name = UTS_MACHINE, .e_machine = ELF_ARCH, .ei_osabi = ELF_OSABI, - .regsets = native_regsets, .n = ARRAY_SIZE(native_regsets) -}; - -#ifdef CONFIG_PPC64 -#include <linux/compat.h> - -static int gpr32_get_common(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf, - unsigned long *regs) -{ - compat_ulong_t *k = kbuf; - compat_ulong_t __user *u = ubuf; - compat_ulong_t reg; - - pos /= sizeof(reg); - count /= sizeof(reg); - - if (kbuf) - for (; count > 0 && pos < PT_MSR; --count) - *k++ = regs[pos++]; - else - for (; count > 0 && pos < PT_MSR; --count) - if (__put_user((compat_ulong_t) regs[pos++], u++)) - return -EFAULT; - - if (count > 0 && pos == PT_MSR) { - reg = get_user_msr(target); - if (kbuf) - *k++ = reg; - else if (__put_user(reg, u++)) - return -EFAULT; - ++pos; - --count; - } - - if (kbuf) - for (; count > 0 && pos < PT_REGS_COUNT; --count) - *k++ = regs[pos++]; - else - for (; count > 0 && pos < PT_REGS_COUNT; --count) - if (__put_user((compat_ulong_t) regs[pos++], u++)) - return -EFAULT; - - kbuf = k; - ubuf = u; - pos *= sizeof(reg); - count *= sizeof(reg); - return user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf, - PT_REGS_COUNT * sizeof(reg), -1); -} - -static int gpr32_set_common(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf, - unsigned long *regs) -{ - const compat_ulong_t *k = kbuf; - const compat_ulong_t __user *u = ubuf; - compat_ulong_t reg; - - pos /= sizeof(reg); - count /= sizeof(reg); - - if (kbuf) - for (; count > 0 && pos < PT_MSR; --count) - regs[pos++] = *k++; - else - for (; count > 0 && pos < PT_MSR; --count) { - if (__get_user(reg, u++)) - return -EFAULT; - regs[pos++] = reg; - } - - - if (count > 0 && pos == PT_MSR) { - if (kbuf) - reg = *k++; - else if (__get_user(reg, u++)) - return -EFAULT; - set_user_msr(target, reg); - ++pos; - --count; - } - - if (kbuf) { - for (; count > 0 && pos <= PT_MAX_PUT_REG; --count) - regs[pos++] = *k++; - for (; count > 0 && pos < PT_TRAP; --count, ++pos) - ++k; - } else { - for (; count > 0 && pos <= PT_MAX_PUT_REG; --count) { - if (__get_user(reg, u++)) - return -EFAULT; - regs[pos++] = reg; - } - for (; count > 0 && pos < PT_TRAP; --count, ++pos) - if (__get_user(reg, u++)) - return -EFAULT; - } - - if (count > 0 && pos == PT_TRAP) { - if (kbuf) - reg = *k++; - else if (__get_user(reg, u++)) - return -EFAULT; - set_user_trap(target, reg); - ++pos; - --count; - } - - kbuf = k; - ubuf = u; - pos *= sizeof(reg); - count *= sizeof(reg); - return user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, - (PT_TRAP + 1) * sizeof(reg), -1); -} - -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM -static int tm_cgpr32_get(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) -{ - return gpr32_get_common(target, regset, pos, count, kbuf, ubuf, - &target->thread.ckpt_regs.gpr[0]); -} - -static int tm_cgpr32_set(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) -{ - return gpr32_set_common(target, regset, pos, count, kbuf, ubuf, - &target->thread.ckpt_regs.gpr[0]); -} -#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ - -static int gpr32_get(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) -{ - int i; - - if (target->thread.regs == NULL) - return -EIO; - - if (!FULL_REGS(target->thread.regs)) { - /* - * We have a partial register set. - * Fill 14-31 with bogus values. - */ - for (i = 14; i < 32; i++) - target->thread.regs->gpr[i] = NV_REG_POISON; - } - return gpr32_get_common(target, regset, pos, count, kbuf, ubuf, - &target->thread.regs->gpr[0]); -} - -static int gpr32_set(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) -{ - if (target->thread.regs == NULL) - return -EIO; - - CHECK_FULL_REGS(target->thread.regs); - return gpr32_set_common(target, regset, pos, count, kbuf, ubuf, - &target->thread.regs->gpr[0]); -} - -/* - * These are the regset flavors matching the CONFIG_PPC32 native set. - */ -static const struct user_regset compat_regsets[] = { - [REGSET_GPR] = { - .core_note_type = NT_PRSTATUS, .n = ELF_NGREG, - .size = sizeof(compat_long_t), .align = sizeof(compat_long_t), - .get = gpr32_get, .set = gpr32_set - }, - [REGSET_FPR] = { - .core_note_type = NT_PRFPREG, .n = ELF_NFPREG, - .size = sizeof(double), .align = sizeof(double), - .get = fpr_get, .set = fpr_set - }, -#ifdef CONFIG_ALTIVEC - [REGSET_VMX] = { - .core_note_type = NT_PPC_VMX, .n = 34, - .size = sizeof(vector128), .align = sizeof(vector128), - .active = vr_active, .get = vr_get, .set = vr_set - }, -#endif -#ifdef CONFIG_SPE - [REGSET_SPE] = { - .core_note_type = NT_PPC_SPE, .n = 35, - .size = sizeof(u32), .align = sizeof(u32), - .active = evr_active, .get = evr_get, .set = evr_set - }, -#endif -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM - [REGSET_TM_CGPR] = { - .core_note_type = NT_PPC_TM_CGPR, .n = ELF_NGREG, - .size = sizeof(long), .align = sizeof(long), - .active = tm_cgpr_active, - .get = tm_cgpr32_get, .set = tm_cgpr32_set - }, - [REGSET_TM_CFPR] = { - .core_note_type = NT_PPC_TM_CFPR, .n = ELF_NFPREG, - .size = sizeof(double), .align = sizeof(double), - .active = tm_cfpr_active, .get = tm_cfpr_get, .set = tm_cfpr_set - }, - [REGSET_TM_CVMX] = { - .core_note_type = NT_PPC_TM_CVMX, .n = ELF_NVMX, - .size = sizeof(vector128), .align = sizeof(vector128), - .active = tm_cvmx_active, .get = tm_cvmx_get, .set = tm_cvmx_set - }, - [REGSET_TM_CVSX] = { - .core_note_type = NT_PPC_TM_CVSX, .n = ELF_NVSX, - .size = sizeof(double), .align = sizeof(double), - .active = tm_cvsx_active, .get = tm_cvsx_get, .set = tm_cvsx_set - }, - [REGSET_TM_SPR] = { - .core_note_type = NT_PPC_TM_SPR, .n = ELF_NTMSPRREG, - .size = sizeof(u64), .align = sizeof(u64), - .active = tm_spr_active, .get = tm_spr_get, .set = tm_spr_set - }, - [REGSET_TM_CTAR] = { - .core_note_type = NT_PPC_TM_CTAR, .n = 1, - .size = sizeof(u64), .align = sizeof(u64), - .active = tm_tar_active, .get = tm_tar_get, .set = tm_tar_set - }, - [REGSET_TM_CPPR] = { - .core_note_type = NT_PPC_TM_CPPR, .n = 1, - .size = sizeof(u64), .align = sizeof(u64), - .active = tm_ppr_active, .get = tm_ppr_get, .set = tm_ppr_set - }, - [REGSET_TM_CDSCR] = { - .core_note_type = NT_PPC_TM_CDSCR, .n = 1, - .size = sizeof(u64), .align = sizeof(u64), - .active = tm_dscr_active, .get = tm_dscr_get, .set = tm_dscr_set - }, -#endif -#ifdef CONFIG_PPC64 - [REGSET_PPR] = { - .core_note_type = NT_PPC_PPR, .n = 1, - .size = sizeof(u64), .align = sizeof(u64), - .get = ppr_get, .set = ppr_set - }, - [REGSET_DSCR] = { - .core_note_type = NT_PPC_DSCR, .n = 1, - .size = sizeof(u64), .align = sizeof(u64), - .get = dscr_get, .set = dscr_set - }, -#endif -#ifdef CONFIG_PPC_BOOK3S_64 - [REGSET_TAR] = { - .core_note_type = NT_PPC_TAR, .n = 1, - .size = sizeof(u64), .align = sizeof(u64), - .get = tar_get, .set = tar_set - }, - [REGSET_EBB] = { - .core_note_type = NT_PPC_EBB, .n = ELF_NEBB, - .size = sizeof(u64), .align = sizeof(u64), - .active = ebb_active, .get = ebb_get, .set = ebb_set - }, -#endif -}; - -static const struct user_regset_view user_ppc_compat_view = { - .name = "ppc", .e_machine = EM_PPC, .ei_osabi = ELF_OSABI, - .regsets = compat_regsets, .n = ARRAY_SIZE(compat_regsets) -}; -#endif /* CONFIG_PPC64 */ - -const struct user_regset_view *task_user_regset_view(struct task_struct *task) -{ -#ifdef CONFIG_PPC64 - if (test_tsk_thread_flag(task, TIF_32BIT)) - return &user_ppc_compat_view; -#endif - return &user_ppc_native_view; -} - - -void user_enable_single_step(struct task_struct *task) -{ - struct pt_regs *regs = task->thread.regs; - - if (regs != NULL) { -#ifdef CONFIG_PPC_ADV_DEBUG_REGS - task->thread.debug.dbcr0 &= ~DBCR0_BT; - task->thread.debug.dbcr0 |= DBCR0_IDM | DBCR0_IC; - regs->msr |= MSR_DE; -#else - regs->msr &= ~MSR_BE; - regs->msr |= MSR_SE; -#endif - } - set_tsk_thread_flag(task, TIF_SINGLESTEP); -} - -void user_enable_block_step(struct task_struct *task) -{ - struct pt_regs *regs = task->thread.regs; - - if (regs != NULL) { -#ifdef CONFIG_PPC_ADV_DEBUG_REGS - task->thread.debug.dbcr0 &= ~DBCR0_IC; - task->thread.debug.dbcr0 = DBCR0_IDM | DBCR0_BT; - regs->msr |= MSR_DE; -#else - regs->msr &= ~MSR_SE; - regs->msr |= MSR_BE; -#endif - } - set_tsk_thread_flag(task, TIF_SINGLESTEP); -} - -void user_disable_single_step(struct task_struct *task) -{ - struct pt_regs *regs = task->thread.regs; - - if (regs != NULL) { -#ifdef CONFIG_PPC_ADV_DEBUG_REGS - /* - * The logic to disable single stepping should be as - * simple as turning off the Instruction Complete flag. - * And, after doing so, if all debug flags are off, turn - * off DBCR0(IDM) and MSR(DE) .... Torez - */ - task->thread.debug.dbcr0 &= ~(DBCR0_IC|DBCR0_BT); - /* - * Test to see if any of the DBCR_ACTIVE_EVENTS bits are set. - */ - if (!DBCR_ACTIVE_EVENTS(task->thread.debug.dbcr0, - task->thread.debug.dbcr1)) { - /* - * All debug events were off..... - */ - task->thread.debug.dbcr0 &= ~DBCR0_IDM; - regs->msr &= ~MSR_DE; - } -#else - regs->msr &= ~(MSR_SE | MSR_BE); -#endif - } - clear_tsk_thread_flag(task, TIF_SINGLESTEP); -} - -#ifdef CONFIG_HAVE_HW_BREAKPOINT -void ptrace_triggered(struct perf_event *bp, - struct perf_sample_data *data, struct pt_regs *regs) -{ - struct perf_event_attr attr; - - /* - * Disable the breakpoint request here since ptrace has defined a - * one-shot behaviour for breakpoint exceptions in PPC64. - * The SIGTRAP signal is generated automatically for us in do_dabr(). - * We don't have to do anything about that here - */ - attr = bp->attr; - attr.disabled = true; - modify_user_hw_breakpoint(bp, &attr); -} -#endif /* CONFIG_HAVE_HW_BREAKPOINT */ - -static int ptrace_set_debugreg(struct task_struct *task, unsigned long addr, - unsigned long data) -{ -#ifdef CONFIG_HAVE_HW_BREAKPOINT - int ret; - struct thread_struct *thread = &(task->thread); - struct perf_event *bp; - struct perf_event_attr attr; -#endif /* CONFIG_HAVE_HW_BREAKPOINT */ -#ifndef CONFIG_PPC_ADV_DEBUG_REGS - bool set_bp = true; - struct arch_hw_breakpoint hw_brk; -#endif - - /* For ppc64 we support one DABR and no IABR's at the moment (ppc64). - * For embedded processors we support one DAC and no IAC's at the - * moment. - */ - if (addr > 0) - return -EINVAL; - - /* The bottom 3 bits in dabr are flags */ - if ((data & ~0x7UL) >= TASK_SIZE) - return -EIO; - -#ifndef CONFIG_PPC_ADV_DEBUG_REGS - /* For processors using DABR (i.e. 970), the bottom 3 bits are flags. - * It was assumed, on previous implementations, that 3 bits were - * passed together with the data address, fitting the design of the - * DABR register, as follows: - * - * bit 0: Read flag - * bit 1: Write flag - * bit 2: Breakpoint translation - * - * Thus, we use them here as so. - */ - - /* Ensure breakpoint translation bit is set */ - if (data && !(data & HW_BRK_TYPE_TRANSLATE)) - return -EIO; - hw_brk.address = data & (~HW_BRK_TYPE_DABR); - hw_brk.type = (data & HW_BRK_TYPE_DABR) | HW_BRK_TYPE_PRIV_ALL; - hw_brk.len = DABR_MAX_LEN; - hw_brk.hw_len = DABR_MAX_LEN; - set_bp = (data) && (hw_brk.type & HW_BRK_TYPE_RDWR); -#ifdef CONFIG_HAVE_HW_BREAKPOINT - bp = thread->ptrace_bps[0]; - if (!set_bp) { - if (bp) { - unregister_hw_breakpoint(bp); - thread->ptrace_bps[0] = NULL; - } - return 0; - } - if (bp) { - attr = bp->attr; - attr.bp_addr = hw_brk.address; - attr.bp_len = DABR_MAX_LEN; - arch_bp_generic_fields(hw_brk.type, &attr.bp_type); - - /* Enable breakpoint */ - attr.disabled = false; - - ret = modify_user_hw_breakpoint(bp, &attr); - if (ret) { - return ret; - } - thread->ptrace_bps[0] = bp; - thread->hw_brk = hw_brk; - return 0; - } - - /* Create a new breakpoint request if one doesn't exist already */ - hw_breakpoint_init(&attr); - attr.bp_addr = hw_brk.address; - attr.bp_len = DABR_MAX_LEN; - arch_bp_generic_fields(hw_brk.type, - &attr.bp_type); - - thread->ptrace_bps[0] = bp = register_user_hw_breakpoint(&attr, - ptrace_triggered, NULL, task); - if (IS_ERR(bp)) { - thread->ptrace_bps[0] = NULL; - return PTR_ERR(bp); - } - -#else /* !CONFIG_HAVE_HW_BREAKPOINT */ - if (set_bp && (!ppc_breakpoint_available())) - return -ENODEV; -#endif /* CONFIG_HAVE_HW_BREAKPOINT */ - task->thread.hw_brk = hw_brk; -#else /* CONFIG_PPC_ADV_DEBUG_REGS */ - /* As described above, it was assumed 3 bits were passed with the data - * address, but we will assume only the mode bits will be passed - * as to not cause alignment restrictions for DAC-based processors. - */ - - /* DAC's hold the whole address without any mode flags */ - task->thread.debug.dac1 = data & ~0x3UL; - - if (task->thread.debug.dac1 == 0) { - dbcr_dac(task) &= ~(DBCR_DAC1R | DBCR_DAC1W); - if (!DBCR_ACTIVE_EVENTS(task->thread.debug.dbcr0, - task->thread.debug.dbcr1)) { - task->thread.regs->msr &= ~MSR_DE; - task->thread.debug.dbcr0 &= ~DBCR0_IDM; - } - return 0; - } - - /* Read or Write bits must be set */ - - if (!(data & 0x3UL)) - return -EINVAL; - - /* Set the Internal Debugging flag (IDM bit 1) for the DBCR0 - register */ - task->thread.debug.dbcr0 |= DBCR0_IDM; - - /* Check for write and read flags and set DBCR0 - accordingly */ - dbcr_dac(task) &= ~(DBCR_DAC1R|DBCR_DAC1W); - if (data & 0x1UL) - dbcr_dac(task) |= DBCR_DAC1R; - if (data & 0x2UL) - dbcr_dac(task) |= DBCR_DAC1W; - task->thread.regs->msr |= MSR_DE; -#endif /* CONFIG_PPC_ADV_DEBUG_REGS */ - return 0; -} - -/* - * Called by kernel/ptrace.c when detaching.. - * - * Make sure single step bits etc are not set. - */ -void ptrace_disable(struct task_struct *child) -{ - /* make sure the single step bit is not set. */ - user_disable_single_step(child); -} - -#ifdef CONFIG_PPC_ADV_DEBUG_REGS -static long set_instruction_bp(struct task_struct *child, - struct ppc_hw_breakpoint *bp_info) -{ - int slot; - int slot1_in_use = ((child->thread.debug.dbcr0 & DBCR0_IAC1) != 0); - int slot2_in_use = ((child->thread.debug.dbcr0 & DBCR0_IAC2) != 0); - int slot3_in_use = ((child->thread.debug.dbcr0 & DBCR0_IAC3) != 0); - int slot4_in_use = ((child->thread.debug.dbcr0 & DBCR0_IAC4) != 0); - - if (dbcr_iac_range(child) & DBCR_IAC12MODE) - slot2_in_use = 1; - if (dbcr_iac_range(child) & DBCR_IAC34MODE) - slot4_in_use = 1; - - if (bp_info->addr >= TASK_SIZE) - return -EIO; - - if (bp_info->addr_mode != PPC_BREAKPOINT_MODE_EXACT) { - - /* Make sure range is valid. */ - if (bp_info->addr2 >= TASK_SIZE) - return -EIO; - - /* We need a pair of IAC regsisters */ - if ((!slot1_in_use) && (!slot2_in_use)) { - slot = 1; - child->thread.debug.iac1 = bp_info->addr; - child->thread.debug.iac2 = bp_info->addr2; - child->thread.debug.dbcr0 |= DBCR0_IAC1; - if (bp_info->addr_mode == - PPC_BREAKPOINT_MODE_RANGE_EXCLUSIVE) - dbcr_iac_range(child) |= DBCR_IAC12X; - else - dbcr_iac_range(child) |= DBCR_IAC12I; -#if CONFIG_PPC_ADV_DEBUG_IACS > 2 - } else if ((!slot3_in_use) && (!slot4_in_use)) { - slot = 3; - child->thread.debug.iac3 = bp_info->addr; - child->thread.debug.iac4 = bp_info->addr2; - child->thread.debug.dbcr0 |= DBCR0_IAC3; - if (bp_info->addr_mode == - PPC_BREAKPOINT_MODE_RANGE_EXCLUSIVE) - dbcr_iac_range(child) |= DBCR_IAC34X; - else - dbcr_iac_range(child) |= DBCR_IAC34I; -#endif - } else - return -ENOSPC; - } else { - /* We only need one. If possible leave a pair free in - * case a range is needed later - */ - if (!slot1_in_use) { - /* - * Don't use iac1 if iac1-iac2 are free and either - * iac3 or iac4 (but not both) are free - */ - if (slot2_in_use || (slot3_in_use == slot4_in_use)) { - slot = 1; - child->thread.debug.iac1 = bp_info->addr; - child->thread.debug.dbcr0 |= DBCR0_IAC1; - goto out; - } - } - if (!slot2_in_use) { - slot = 2; - child->thread.debug.iac2 = bp_info->addr; - child->thread.debug.dbcr0 |= DBCR0_IAC2; -#if CONFIG_PPC_ADV_DEBUG_IACS > 2 - } else if (!slot3_in_use) { - slot = 3; - child->thread.debug.iac3 = bp_info->addr; - child->thread.debug.dbcr0 |= DBCR0_IAC3; - } else if (!slot4_in_use) { - slot = 4; - child->thread.debug.iac4 = bp_info->addr; - child->thread.debug.dbcr0 |= DBCR0_IAC4; -#endif - } else - return -ENOSPC; - } -out: - child->thread.debug.dbcr0 |= DBCR0_IDM; - child->thread.regs->msr |= MSR_DE; - - return slot; -} - -static int del_instruction_bp(struct task_struct *child, int slot) -{ - switch (slot) { - case 1: - if ((child->thread.debug.dbcr0 & DBCR0_IAC1) == 0) - return -ENOENT; - - if (dbcr_iac_range(child) & DBCR_IAC12MODE) { - /* address range - clear slots 1 & 2 */ - child->thread.debug.iac2 = 0; - dbcr_iac_range(child) &= ~DBCR_IAC12MODE; - } - child->thread.debug.iac1 = 0; - child->thread.debug.dbcr0 &= ~DBCR0_IAC1; - break; - case 2: - if ((child->thread.debug.dbcr0 & DBCR0_IAC2) == 0) - return -ENOENT; - - if (dbcr_iac_range(child) & DBCR_IAC12MODE) - /* used in a range */ - return -EINVAL; - child->thread.debug.iac2 = 0; - child->thread.debug.dbcr0 &= ~DBCR0_IAC2; - break; -#if CONFIG_PPC_ADV_DEBUG_IACS > 2 - case 3: - if ((child->thread.debug.dbcr0 & DBCR0_IAC3) == 0) - return -ENOENT; - - if (dbcr_iac_range(child) & DBCR_IAC34MODE) { - /* address range - clear slots 3 & 4 */ - child->thread.debug.iac4 = 0; - dbcr_iac_range(child) &= ~DBCR_IAC34MODE; - } - child->thread.debug.iac3 = 0; - child->thread.debug.dbcr0 &= ~DBCR0_IAC3; - break; - case 4: - if ((child->thread.debug.dbcr0 & DBCR0_IAC4) == 0) - return -ENOENT; - - if (dbcr_iac_range(child) & DBCR_IAC34MODE) - /* Used in a range */ - return -EINVAL; - child->thread.debug.iac4 = 0; - child->thread.debug.dbcr0 &= ~DBCR0_IAC4; - break; -#endif - default: - return -EINVAL; - } - return 0; -} - -static int set_dac(struct task_struct *child, struct ppc_hw_breakpoint *bp_info) -{ - int byte_enable = - (bp_info->condition_mode >> PPC_BREAKPOINT_CONDITION_BE_SHIFT) - & 0xf; - int condition_mode = - bp_info->condition_mode & PPC_BREAKPOINT_CONDITION_MODE; - int slot; - - if (byte_enable && (condition_mode == 0)) - return -EINVAL; - - if (bp_info->addr >= TASK_SIZE) - return -EIO; - - if ((dbcr_dac(child) & (DBCR_DAC1R | DBCR_DAC1W)) == 0) { - slot = 1; - if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_READ) - dbcr_dac(child) |= DBCR_DAC1R; - if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_WRITE) - dbcr_dac(child) |= DBCR_DAC1W; - child->thread.debug.dac1 = (unsigned long)bp_info->addr; -#if CONFIG_PPC_ADV_DEBUG_DVCS > 0 - if (byte_enable) { - child->thread.debug.dvc1 = - (unsigned long)bp_info->condition_value; - child->thread.debug.dbcr2 |= - ((byte_enable << DBCR2_DVC1BE_SHIFT) | - (condition_mode << DBCR2_DVC1M_SHIFT)); - } -#endif -#ifdef CONFIG_PPC_ADV_DEBUG_DAC_RANGE - } else if (child->thread.debug.dbcr2 & DBCR2_DAC12MODE) { - /* Both dac1 and dac2 are part of a range */ - return -ENOSPC; -#endif - } else if ((dbcr_dac(child) & (DBCR_DAC2R | DBCR_DAC2W)) == 0) { - slot = 2; - if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_READ) - dbcr_dac(child) |= DBCR_DAC2R; - if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_WRITE) - dbcr_dac(child) |= DBCR_DAC2W; - child->thread.debug.dac2 = (unsigned long)bp_info->addr; -#if CONFIG_PPC_ADV_DEBUG_DVCS > 0 - if (byte_enable) { - child->thread.debug.dvc2 = - (unsigned long)bp_info->condition_value; - child->thread.debug.dbcr2 |= - ((byte_enable << DBCR2_DVC2BE_SHIFT) | - (condition_mode << DBCR2_DVC2M_SHIFT)); - } -#endif - } else - return -ENOSPC; - child->thread.debug.dbcr0 |= DBCR0_IDM; - child->thread.regs->msr |= MSR_DE; - - return slot + 4; -} - -static int del_dac(struct task_struct *child, int slot) -{ - if (slot == 1) { - if ((dbcr_dac(child) & (DBCR_DAC1R | DBCR_DAC1W)) == 0) - return -ENOENT; - - child->thread.debug.dac1 = 0; - dbcr_dac(child) &= ~(DBCR_DAC1R | DBCR_DAC1W); -#ifdef CONFIG_PPC_ADV_DEBUG_DAC_RANGE - if (child->thread.debug.dbcr2 & DBCR2_DAC12MODE) { - child->thread.debug.dac2 = 0; - child->thread.debug.dbcr2 &= ~DBCR2_DAC12MODE; - } - child->thread.debug.dbcr2 &= ~(DBCR2_DVC1M | DBCR2_DVC1BE); -#endif -#if CONFIG_PPC_ADV_DEBUG_DVCS > 0 - child->thread.debug.dvc1 = 0; -#endif - } else if (slot == 2) { - if ((dbcr_dac(child) & (DBCR_DAC2R | DBCR_DAC2W)) == 0) - return -ENOENT; - -#ifdef CONFIG_PPC_ADV_DEBUG_DAC_RANGE - if (child->thread.debug.dbcr2 & DBCR2_DAC12MODE) - /* Part of a range */ - return -EINVAL; - child->thread.debug.dbcr2 &= ~(DBCR2_DVC2M | DBCR2_DVC2BE); -#endif -#if CONFIG_PPC_ADV_DEBUG_DVCS > 0 - child->thread.debug.dvc2 = 0; -#endif - child->thread.debug.dac2 = 0; - dbcr_dac(child) &= ~(DBCR_DAC2R | DBCR_DAC2W); - } else - return -EINVAL; - - return 0; -} -#endif /* CONFIG_PPC_ADV_DEBUG_REGS */ - -#ifdef CONFIG_PPC_ADV_DEBUG_DAC_RANGE -static int set_dac_range(struct task_struct *child, - struct ppc_hw_breakpoint *bp_info) -{ - int mode = bp_info->addr_mode & PPC_BREAKPOINT_MODE_MASK; - - /* We don't allow range watchpoints to be used with DVC */ - if (bp_info->condition_mode) - return -EINVAL; - - /* - * Best effort to verify the address range. The user/supervisor bits - * prevent trapping in kernel space, but let's fail on an obvious bad - * range. The simple test on the mask is not fool-proof, and any - * exclusive range will spill over into kernel space. - */ - if (bp_info->addr >= TASK_SIZE) - return -EIO; - if (mode == PPC_BREAKPOINT_MODE_MASK) { - /* - * dac2 is a bitmask. Don't allow a mask that makes a - * kernel space address from a valid dac1 value - */ - if (~((unsigned long)bp_info->addr2) >= TASK_SIZE) - return -EIO; - } else { - /* - * For range breakpoints, addr2 must also be a valid address - */ - if (bp_info->addr2 >= TASK_SIZE) - return -EIO; - } - - if (child->thread.debug.dbcr0 & - (DBCR0_DAC1R | DBCR0_DAC1W | DBCR0_DAC2R | DBCR0_DAC2W)) - return -ENOSPC; - - if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_READ) - child->thread.debug.dbcr0 |= (DBCR0_DAC1R | DBCR0_IDM); - if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_WRITE) - child->thread.debug.dbcr0 |= (DBCR0_DAC1W | DBCR0_IDM); - child->thread.debug.dac1 = bp_info->addr; - child->thread.debug.dac2 = bp_info->addr2; - if (mode == PPC_BREAKPOINT_MODE_RANGE_INCLUSIVE) - child->thread.debug.dbcr2 |= DBCR2_DAC12M; - else if (mode == PPC_BREAKPOINT_MODE_RANGE_EXCLUSIVE) - child->thread.debug.dbcr2 |= DBCR2_DAC12MX; - else /* PPC_BREAKPOINT_MODE_MASK */ - child->thread.debug.dbcr2 |= DBCR2_DAC12MM; - child->thread.regs->msr |= MSR_DE; - - return 5; -} -#endif /* CONFIG_PPC_ADV_DEBUG_DAC_RANGE */ - -static long ppc_set_hwdebug(struct task_struct *child, - struct ppc_hw_breakpoint *bp_info) -{ -#ifdef CONFIG_HAVE_HW_BREAKPOINT - int len = 0; - struct thread_struct *thread = &(child->thread); - struct perf_event *bp; - struct perf_event_attr attr; -#endif /* CONFIG_HAVE_HW_BREAKPOINT */ -#ifndef CONFIG_PPC_ADV_DEBUG_REGS - struct arch_hw_breakpoint brk; -#endif - - if (bp_info->version != 1) - return -ENOTSUPP; -#ifdef CONFIG_PPC_ADV_DEBUG_REGS - /* - * Check for invalid flags and combinations - */ - if ((bp_info->trigger_type == 0) || - (bp_info->trigger_type & ~(PPC_BREAKPOINT_TRIGGER_EXECUTE | - PPC_BREAKPOINT_TRIGGER_RW)) || - (bp_info->addr_mode & ~PPC_BREAKPOINT_MODE_MASK) || - (bp_info->condition_mode & - ~(PPC_BREAKPOINT_CONDITION_MODE | - PPC_BREAKPOINT_CONDITION_BE_ALL))) - return -EINVAL; -#if CONFIG_PPC_ADV_DEBUG_DVCS == 0 - if (bp_info->condition_mode != PPC_BREAKPOINT_CONDITION_NONE) - return -EINVAL; -#endif - - if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_EXECUTE) { - if ((bp_info->trigger_type != PPC_BREAKPOINT_TRIGGER_EXECUTE) || - (bp_info->condition_mode != PPC_BREAKPOINT_CONDITION_NONE)) - return -EINVAL; - return set_instruction_bp(child, bp_info); - } - if (bp_info->addr_mode == PPC_BREAKPOINT_MODE_EXACT) - return set_dac(child, bp_info); - -#ifdef CONFIG_PPC_ADV_DEBUG_DAC_RANGE - return set_dac_range(child, bp_info); -#else - return -EINVAL; -#endif -#else /* !CONFIG_PPC_ADV_DEBUG_DVCS */ - /* - * We only support one data breakpoint - */ - if ((bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_RW) == 0 || - (bp_info->trigger_type & ~PPC_BREAKPOINT_TRIGGER_RW) != 0 || - bp_info->condition_mode != PPC_BREAKPOINT_CONDITION_NONE) - return -EINVAL; - - if ((unsigned long)bp_info->addr >= TASK_SIZE) - return -EIO; - - brk.address = bp_info->addr & ~HW_BREAKPOINT_ALIGN; - brk.type = HW_BRK_TYPE_TRANSLATE; - brk.len = DABR_MAX_LEN; - if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_READ) - brk.type |= HW_BRK_TYPE_READ; - if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_WRITE) - brk.type |= HW_BRK_TYPE_WRITE; -#ifdef CONFIG_HAVE_HW_BREAKPOINT - if (bp_info->addr_mode == PPC_BREAKPOINT_MODE_RANGE_INCLUSIVE) - len = bp_info->addr2 - bp_info->addr; - else if (bp_info->addr_mode == PPC_BREAKPOINT_MODE_EXACT) - len = 1; - else - return -EINVAL; - bp = thread->ptrace_bps[0]; - if (bp) - return -ENOSPC; - - /* Create a new breakpoint request if one doesn't exist already */ - hw_breakpoint_init(&attr); - attr.bp_addr = (unsigned long)bp_info->addr; - attr.bp_len = len; - arch_bp_generic_fields(brk.type, &attr.bp_type); - - thread->ptrace_bps[0] = bp = register_user_hw_breakpoint(&attr, - ptrace_triggered, NULL, child); - if (IS_ERR(bp)) { - thread->ptrace_bps[0] = NULL; - return PTR_ERR(bp); - } - - return 1; -#endif /* CONFIG_HAVE_HW_BREAKPOINT */ - - if (bp_info->addr_mode != PPC_BREAKPOINT_MODE_EXACT) - return -EINVAL; - - if (child->thread.hw_brk.address) - return -ENOSPC; - - if (!ppc_breakpoint_available()) - return -ENODEV; - - child->thread.hw_brk = brk; - - return 1; -#endif /* !CONFIG_PPC_ADV_DEBUG_DVCS */ -} - -static long ppc_del_hwdebug(struct task_struct *child, long data) -{ -#ifdef CONFIG_HAVE_HW_BREAKPOINT - int ret = 0; - struct thread_struct *thread = &(child->thread); - struct perf_event *bp; -#endif /* CONFIG_HAVE_HW_BREAKPOINT */ -#ifdef CONFIG_PPC_ADV_DEBUG_REGS - int rc; - - if (data <= 4) - rc = del_instruction_bp(child, (int)data); - else - rc = del_dac(child, (int)data - 4); - - if (!rc) { - if (!DBCR_ACTIVE_EVENTS(child->thread.debug.dbcr0, - child->thread.debug.dbcr1)) { - child->thread.debug.dbcr0 &= ~DBCR0_IDM; - child->thread.regs->msr &= ~MSR_DE; - } - } - return rc; -#else - if (data != 1) - return -EINVAL; - -#ifdef CONFIG_HAVE_HW_BREAKPOINT - bp = thread->ptrace_bps[0]; - if (bp) { - unregister_hw_breakpoint(bp); - thread->ptrace_bps[0] = NULL; - } else - ret = -ENOENT; - return ret; -#else /* CONFIG_HAVE_HW_BREAKPOINT */ - if (child->thread.hw_brk.address == 0) - return -ENOENT; - - child->thread.hw_brk.address = 0; - child->thread.hw_brk.type = 0; -#endif /* CONFIG_HAVE_HW_BREAKPOINT */ - - return 0; -#endif -} - -long arch_ptrace(struct task_struct *child, long request, - unsigned long addr, unsigned long data) -{ - int ret = -EPERM; - void __user *datavp = (void __user *) data; - unsigned long __user *datalp = datavp; - - switch (request) { - /* read the word at location addr in the USER area. */ - case PTRACE_PEEKUSR: { - unsigned long index, tmp; - - ret = -EIO; - /* convert to index and check */ -#ifdef CONFIG_PPC32 - index = addr >> 2; - if ((addr & 3) || (index > PT_FPSCR) - || (child->thread.regs == NULL)) -#else - index = addr >> 3; - if ((addr & 7) || (index > PT_FPSCR)) -#endif - break; - - CHECK_FULL_REGS(child->thread.regs); - if (index < PT_FPR0) { - ret = ptrace_get_reg(child, (int) index, &tmp); - if (ret) - break; - } else { - unsigned int fpidx = index - PT_FPR0; - - flush_fp_to_thread(child); - if (fpidx < (PT_FPSCR - PT_FPR0)) - memcpy(&tmp, &child->thread.TS_FPR(fpidx), - sizeof(long)); - else - tmp = child->thread.fp_state.fpscr; - } - ret = put_user(tmp, datalp); - break; - } - - /* write the word at location addr in the USER area */ - case PTRACE_POKEUSR: { - unsigned long index; - - ret = -EIO; - /* convert to index and check */ -#ifdef CONFIG_PPC32 - index = addr >> 2; - if ((addr & 3) || (index > PT_FPSCR) - || (child->thread.regs == NULL)) -#else - index = addr >> 3; - if ((addr & 7) || (index > PT_FPSCR)) -#endif - break; - - CHECK_FULL_REGS(child->thread.regs); - if (index < PT_FPR0) { - ret = ptrace_put_reg(child, index, data); - } else { - unsigned int fpidx = index - PT_FPR0; - - flush_fp_to_thread(child); - if (fpidx < (PT_FPSCR - PT_FPR0)) - memcpy(&child->thread.TS_FPR(fpidx), &data, - sizeof(long)); - else - child->thread.fp_state.fpscr = data; - ret = 0; - } - break; - } - - case PPC_PTRACE_GETHWDBGINFO: { - struct ppc_debug_info dbginfo; - - dbginfo.version = 1; -#ifdef CONFIG_PPC_ADV_DEBUG_REGS - dbginfo.num_instruction_bps = CONFIG_PPC_ADV_DEBUG_IACS; - dbginfo.num_data_bps = CONFIG_PPC_ADV_DEBUG_DACS; - dbginfo.num_condition_regs = CONFIG_PPC_ADV_DEBUG_DVCS; - dbginfo.data_bp_alignment = 4; - dbginfo.sizeof_condition = 4; - dbginfo.features = PPC_DEBUG_FEATURE_INSN_BP_RANGE | - PPC_DEBUG_FEATURE_INSN_BP_MASK; -#ifdef CONFIG_PPC_ADV_DEBUG_DAC_RANGE - dbginfo.features |= - PPC_DEBUG_FEATURE_DATA_BP_RANGE | - PPC_DEBUG_FEATURE_DATA_BP_MASK; -#endif -#else /* !CONFIG_PPC_ADV_DEBUG_REGS */ - dbginfo.num_instruction_bps = 0; - if (ppc_breakpoint_available()) - dbginfo.num_data_bps = 1; - else - dbginfo.num_data_bps = 0; - dbginfo.num_condition_regs = 0; -#ifdef CONFIG_PPC64 - dbginfo.data_bp_alignment = 8; -#else - dbginfo.data_bp_alignment = 4; -#endif - dbginfo.sizeof_condition = 0; -#ifdef CONFIG_HAVE_HW_BREAKPOINT - dbginfo.features = PPC_DEBUG_FEATURE_DATA_BP_RANGE; - if (dawr_enabled()) - dbginfo.features |= PPC_DEBUG_FEATURE_DATA_BP_DAWR; -#else - dbginfo.features = 0; -#endif /* CONFIG_HAVE_HW_BREAKPOINT */ -#endif /* CONFIG_PPC_ADV_DEBUG_REGS */ - - if (copy_to_user(datavp, &dbginfo, - sizeof(struct ppc_debug_info))) - return -EFAULT; - return 0; - } - - case PPC_PTRACE_SETHWDEBUG: { - struct ppc_hw_breakpoint bp_info; - - if (copy_from_user(&bp_info, datavp, - sizeof(struct ppc_hw_breakpoint))) - return -EFAULT; - return ppc_set_hwdebug(child, &bp_info); - } - - case PPC_PTRACE_DELHWDEBUG: { - ret = ppc_del_hwdebug(child, data); - break; - } - - case PTRACE_GET_DEBUGREG: { -#ifndef CONFIG_PPC_ADV_DEBUG_REGS - unsigned long dabr_fake; -#endif - ret = -EINVAL; - /* We only support one DABR and no IABRS at the moment */ - if (addr > 0) - break; -#ifdef CONFIG_PPC_ADV_DEBUG_REGS - ret = put_user(child->thread.debug.dac1, datalp); -#else - dabr_fake = ((child->thread.hw_brk.address & (~HW_BRK_TYPE_DABR)) | - (child->thread.hw_brk.type & HW_BRK_TYPE_DABR)); - ret = put_user(dabr_fake, datalp); -#endif - break; - } - - case PTRACE_SET_DEBUGREG: - ret = ptrace_set_debugreg(child, addr, data); - break; - -#ifdef CONFIG_PPC64 - case PTRACE_GETREGS64: -#endif - case PTRACE_GETREGS: /* Get all pt_regs from the child. */ - return copy_regset_to_user(child, &user_ppc_native_view, - REGSET_GPR, - 0, sizeof(struct user_pt_regs), - datavp); - -#ifdef CONFIG_PPC64 - case PTRACE_SETREGS64: -#endif - case PTRACE_SETREGS: /* Set all gp regs in the child. */ - return copy_regset_from_user(child, &user_ppc_native_view, - REGSET_GPR, - 0, sizeof(struct user_pt_regs), - datavp); - - case PTRACE_GETFPREGS: /* Get the child FPU state (FPR0...31 + FPSCR) */ - return copy_regset_to_user(child, &user_ppc_native_view, - REGSET_FPR, - 0, sizeof(elf_fpregset_t), - datavp); - - case PTRACE_SETFPREGS: /* Set the child FPU state (FPR0...31 + FPSCR) */ - return copy_regset_from_user(child, &user_ppc_native_view, - REGSET_FPR, - 0, sizeof(elf_fpregset_t), - datavp); - -#ifdef CONFIG_ALTIVEC - case PTRACE_GETVRREGS: - return copy_regset_to_user(child, &user_ppc_native_view, - REGSET_VMX, - 0, (33 * sizeof(vector128) + - sizeof(u32)), - datavp); - - case PTRACE_SETVRREGS: - return copy_regset_from_user(child, &user_ppc_native_view, - REGSET_VMX, - 0, (33 * sizeof(vector128) + - sizeof(u32)), - datavp); -#endif -#ifdef CONFIG_VSX - case PTRACE_GETVSRREGS: - return copy_regset_to_user(child, &user_ppc_native_view, - REGSET_VSX, - 0, 32 * sizeof(double), - datavp); - - case PTRACE_SETVSRREGS: - return copy_regset_from_user(child, &user_ppc_native_view, - REGSET_VSX, - 0, 32 * sizeof(double), - datavp); -#endif -#ifdef CONFIG_SPE - case PTRACE_GETEVRREGS: - /* Get the child spe register state. */ - return copy_regset_to_user(child, &user_ppc_native_view, - REGSET_SPE, 0, 35 * sizeof(u32), - datavp); - - case PTRACE_SETEVRREGS: - /* Set the child spe register state. */ - return copy_regset_from_user(child, &user_ppc_native_view, - REGSET_SPE, 0, 35 * sizeof(u32), - datavp); -#endif - - default: - ret = ptrace_request(child, request, addr, data); - break; - } - return ret; -} - -#ifdef CONFIG_SECCOMP -static int do_seccomp(struct pt_regs *regs) -{ - if (!test_thread_flag(TIF_SECCOMP)) - return 0; - - /* - * The ABI we present to seccomp tracers is that r3 contains - * the syscall return value and orig_gpr3 contains the first - * syscall parameter. This is different to the ptrace ABI where - * both r3 and orig_gpr3 contain the first syscall parameter. - */ - regs->gpr[3] = -ENOSYS; - - /* - * We use the __ version here because we have already checked - * TIF_SECCOMP. If this fails, there is nothing left to do, we - * have already loaded -ENOSYS into r3, or seccomp has put - * something else in r3 (via SECCOMP_RET_ERRNO/TRACE). - */ - if (__secure_computing(NULL)) - return -1; - - /* - * The syscall was allowed by seccomp, restore the register - * state to what audit expects. - * Note that we use orig_gpr3, which means a seccomp tracer can - * modify the first syscall parameter (in orig_gpr3) and also - * allow the syscall to proceed. - */ - regs->gpr[3] = regs->orig_gpr3; - - return 0; -} -#else -static inline int do_seccomp(struct pt_regs *regs) { return 0; } -#endif /* CONFIG_SECCOMP */ - -/** - * do_syscall_trace_enter() - Do syscall tracing on kernel entry. - * @regs: the pt_regs of the task to trace (current) - * - * Performs various types of tracing on syscall entry. This includes seccomp, - * ptrace, syscall tracepoints and audit. - * - * The pt_regs are potentially visible to userspace via ptrace, so their - * contents is ABI. - * - * One or more of the tracers may modify the contents of pt_regs, in particular - * to modify arguments or even the syscall number itself. - * - * It's also possible that a tracer can choose to reject the system call. In - * that case this function will return an illegal syscall number, and will put - * an appropriate return value in regs->r3. - * - * Return: the (possibly changed) syscall number. - */ -long do_syscall_trace_enter(struct pt_regs *regs) -{ - u32 flags; - - user_exit(); - - flags = READ_ONCE(current_thread_info()->flags) & - (_TIF_SYSCALL_EMU | _TIF_SYSCALL_TRACE); - - if (flags) { - int rc = tracehook_report_syscall_entry(regs); - - if (unlikely(flags & _TIF_SYSCALL_EMU)) { - /* - * A nonzero return code from - * tracehook_report_syscall_entry() tells us to prevent - * the syscall execution, but we are not going to - * execute it anyway. - * - * Returning -1 will skip the syscall execution. We want - * to avoid clobbering any registers, so we don't goto - * the skip label below. - */ - return -1; - } - - if (rc) { - /* - * The tracer decided to abort the syscall. Note that - * the tracer may also just change regs->gpr[0] to an - * invalid syscall number, that is handled below on the - * exit path. - */ - goto skip; - } - } - - /* Run seccomp after ptrace; allow it to set gpr[3]. */ - if (do_seccomp(regs)) - return -1; - - /* Avoid trace and audit when syscall is invalid. */ - if (regs->gpr[0] >= NR_syscalls) - goto skip; - - if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) - trace_sys_enter(regs, regs->gpr[0]); - -#ifdef CONFIG_PPC64 - if (!is_32bit_task()) - audit_syscall_entry(regs->gpr[0], regs->gpr[3], regs->gpr[4], - regs->gpr[5], regs->gpr[6]); - else -#endif - audit_syscall_entry(regs->gpr[0], - regs->gpr[3] & 0xffffffff, - regs->gpr[4] & 0xffffffff, - regs->gpr[5] & 0xffffffff, - regs->gpr[6] & 0xffffffff); - - /* Return the possibly modified but valid syscall number */ - return regs->gpr[0]; - -skip: - /* - * If we are aborting explicitly, or if the syscall number is - * now invalid, set the return value to -ENOSYS. - */ - regs->gpr[3] = -ENOSYS; - return -1; -} - -void do_syscall_trace_leave(struct pt_regs *regs) -{ - int step; - - audit_syscall_exit(regs); - - if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) - trace_sys_exit(regs, regs->result); - - step = test_thread_flag(TIF_SINGLESTEP); - if (step || test_thread_flag(TIF_SYSCALL_TRACE)) - tracehook_report_syscall_exit(regs, step); - - user_enter(); -} - -void __init pt_regs_check(void); - -/* - * Dummy function, its purpose is to break the build if struct pt_regs and - * struct user_pt_regs don't match. - */ -void __init pt_regs_check(void) -{ - BUILD_BUG_ON(offsetof(struct pt_regs, gpr) != - offsetof(struct user_pt_regs, gpr)); - BUILD_BUG_ON(offsetof(struct pt_regs, nip) != - offsetof(struct user_pt_regs, nip)); - BUILD_BUG_ON(offsetof(struct pt_regs, msr) != - offsetof(struct user_pt_regs, msr)); - BUILD_BUG_ON(offsetof(struct pt_regs, msr) != - offsetof(struct user_pt_regs, msr)); - BUILD_BUG_ON(offsetof(struct pt_regs, orig_gpr3) != - offsetof(struct user_pt_regs, orig_gpr3)); - BUILD_BUG_ON(offsetof(struct pt_regs, ctr) != - offsetof(struct user_pt_regs, ctr)); - BUILD_BUG_ON(offsetof(struct pt_regs, link) != - offsetof(struct user_pt_regs, link)); - BUILD_BUG_ON(offsetof(struct pt_regs, xer) != - offsetof(struct user_pt_regs, xer)); - BUILD_BUG_ON(offsetof(struct pt_regs, ccr) != - offsetof(struct user_pt_regs, ccr)); -#ifdef __powerpc64__ - BUILD_BUG_ON(offsetof(struct pt_regs, softe) != - offsetof(struct user_pt_regs, softe)); -#else - BUILD_BUG_ON(offsetof(struct pt_regs, mq) != - offsetof(struct user_pt_regs, mq)); -#endif - BUILD_BUG_ON(offsetof(struct pt_regs, trap) != - offsetof(struct user_pt_regs, trap)); - BUILD_BUG_ON(offsetof(struct pt_regs, dar) != - offsetof(struct user_pt_regs, dar)); - BUILD_BUG_ON(offsetof(struct pt_regs, dsisr) != - offsetof(struct user_pt_regs, dsisr)); - BUILD_BUG_ON(offsetof(struct pt_regs, result) != - offsetof(struct user_pt_regs, result)); - - BUILD_BUG_ON(sizeof(struct user_pt_regs) > sizeof(struct pt_regs)); - - // Now check that the pt_regs offsets match the uapi #defines - #define CHECK_REG(_pt, _reg) \ - BUILD_BUG_ON(_pt != (offsetof(struct user_pt_regs, _reg) / \ - sizeof(unsigned long))); - - CHECK_REG(PT_R0, gpr[0]); - CHECK_REG(PT_R1, gpr[1]); - CHECK_REG(PT_R2, gpr[2]); - CHECK_REG(PT_R3, gpr[3]); - CHECK_REG(PT_R4, gpr[4]); - CHECK_REG(PT_R5, gpr[5]); - CHECK_REG(PT_R6, gpr[6]); - CHECK_REG(PT_R7, gpr[7]); - CHECK_REG(PT_R8, gpr[8]); - CHECK_REG(PT_R9, gpr[9]); - CHECK_REG(PT_R10, gpr[10]); - CHECK_REG(PT_R11, gpr[11]); - CHECK_REG(PT_R12, gpr[12]); - CHECK_REG(PT_R13, gpr[13]); - CHECK_REG(PT_R14, gpr[14]); - CHECK_REG(PT_R15, gpr[15]); - CHECK_REG(PT_R16, gpr[16]); - CHECK_REG(PT_R17, gpr[17]); - CHECK_REG(PT_R18, gpr[18]); - CHECK_REG(PT_R19, gpr[19]); - CHECK_REG(PT_R20, gpr[20]); - CHECK_REG(PT_R21, gpr[21]); - CHECK_REG(PT_R22, gpr[22]); - CHECK_REG(PT_R23, gpr[23]); - CHECK_REG(PT_R24, gpr[24]); - CHECK_REG(PT_R25, gpr[25]); - CHECK_REG(PT_R26, gpr[26]); - CHECK_REG(PT_R27, gpr[27]); - CHECK_REG(PT_R28, gpr[28]); - CHECK_REG(PT_R29, gpr[29]); - CHECK_REG(PT_R30, gpr[30]); - CHECK_REG(PT_R31, gpr[31]); - CHECK_REG(PT_NIP, nip); - CHECK_REG(PT_MSR, msr); - CHECK_REG(PT_ORIG_R3, orig_gpr3); - CHECK_REG(PT_CTR, ctr); - CHECK_REG(PT_LNK, link); - CHECK_REG(PT_XER, xer); - CHECK_REG(PT_CCR, ccr); -#ifdef CONFIG_PPC64 - CHECK_REG(PT_SOFTE, softe); -#else - CHECK_REG(PT_MQ, mq); -#endif - CHECK_REG(PT_TRAP, trap); - CHECK_REG(PT_DAR, dar); - CHECK_REG(PT_DSISR, dsisr); - CHECK_REG(PT_RESULT, result); - #undef CHECK_REG - - BUILD_BUG_ON(PT_REGS_COUNT != sizeof(struct user_pt_regs) / sizeof(unsigned long)); - - /* - * PT_DSCR isn't a real reg, but it's important that it doesn't overlap the - * real registers. - */ - BUILD_BUG_ON(PT_DSCR < sizeof(struct user_pt_regs) / sizeof(unsigned long)); -} diff --git a/arch/powerpc/kernel/ptrace/Makefile b/arch/powerpc/kernel/ptrace/Makefile new file mode 100644 index 000000000000..77abd1a5a508 --- /dev/null +++ b/arch/powerpc/kernel/ptrace/Makefile @@ -0,0 +1,21 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for the linux kernel. +# + +CFLAGS_ptrace-view.o += -DUTS_MACHINE='"$(UTS_MACHINE)"' + +obj-y += ptrace.o ptrace-view.o +obj-y += ptrace-fpu.o +obj-$(CONFIG_COMPAT) += ptrace32.o +obj-$(CONFIG_VSX) += ptrace-vsx.o +ifneq ($(CONFIG_VSX),y) +obj-y += ptrace-novsx.o +endif +obj-$(CONFIG_ALTIVEC) += ptrace-altivec.o +obj-$(CONFIG_SPE) += ptrace-spe.o +obj-$(CONFIG_PPC_TRANSACTIONAL_MEM) += ptrace-tm.o +obj-$(CONFIG_PPC_ADV_DEBUG_REGS) += ptrace-adv.o +ifneq ($(CONFIG_PPC_ADV_DEBUG_REGS),y) +obj-y += ptrace-noadv.o +endif diff --git a/arch/powerpc/kernel/ptrace/ptrace-adv.c b/arch/powerpc/kernel/ptrace/ptrace-adv.c new file mode 100644 index 000000000000..399f5d94a3df --- /dev/null +++ b/arch/powerpc/kernel/ptrace/ptrace-adv.c @@ -0,0 +1,494 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/regset.h> +#include <linux/hw_breakpoint.h> + +#include "ptrace-decl.h" + +void user_enable_single_step(struct task_struct *task) +{ + struct pt_regs *regs = task->thread.regs; + + if (regs != NULL) { + task->thread.debug.dbcr0 &= ~DBCR0_BT; + task->thread.debug.dbcr0 |= DBCR0_IDM | DBCR0_IC; + regs_set_return_msr(regs, regs->msr | MSR_DE); + } + set_tsk_thread_flag(task, TIF_SINGLESTEP); +} + +void user_enable_block_step(struct task_struct *task) +{ + struct pt_regs *regs = task->thread.regs; + + if (regs != NULL) { + task->thread.debug.dbcr0 &= ~DBCR0_IC; + task->thread.debug.dbcr0 = DBCR0_IDM | DBCR0_BT; + regs_set_return_msr(regs, regs->msr | MSR_DE); + } + set_tsk_thread_flag(task, TIF_SINGLESTEP); +} + +void user_disable_single_step(struct task_struct *task) +{ + struct pt_regs *regs = task->thread.regs; + + if (regs != NULL) { + /* + * The logic to disable single stepping should be as + * simple as turning off the Instruction Complete flag. + * And, after doing so, if all debug flags are off, turn + * off DBCR0(IDM) and MSR(DE) .... Torez + */ + task->thread.debug.dbcr0 &= ~(DBCR0_IC | DBCR0_BT); + /* + * Test to see if any of the DBCR_ACTIVE_EVENTS bits are set. + */ + if (!DBCR_ACTIVE_EVENTS(task->thread.debug.dbcr0, + task->thread.debug.dbcr1)) { + /* + * All debug events were off..... + */ + task->thread.debug.dbcr0 &= ~DBCR0_IDM; + regs_set_return_msr(regs, regs->msr & ~MSR_DE); + } + } + clear_tsk_thread_flag(task, TIF_SINGLESTEP); +} + +void ppc_gethwdinfo(struct ppc_debug_info *dbginfo) +{ + dbginfo->version = 1; + dbginfo->num_instruction_bps = CONFIG_PPC_ADV_DEBUG_IACS; + dbginfo->num_data_bps = CONFIG_PPC_ADV_DEBUG_DACS; + dbginfo->num_condition_regs = CONFIG_PPC_ADV_DEBUG_DVCS; + dbginfo->data_bp_alignment = 4; + dbginfo->sizeof_condition = 4; + dbginfo->features = PPC_DEBUG_FEATURE_INSN_BP_RANGE | + PPC_DEBUG_FEATURE_INSN_BP_MASK; + if (IS_ENABLED(CONFIG_PPC_ADV_DEBUG_DAC_RANGE)) + dbginfo->features |= PPC_DEBUG_FEATURE_DATA_BP_RANGE | + PPC_DEBUG_FEATURE_DATA_BP_MASK; +} + +int ptrace_get_debugreg(struct task_struct *child, unsigned long addr, + unsigned long __user *datalp) +{ + /* We only support one DABR and no IABRS at the moment */ + if (addr > 0) + return -EINVAL; + return put_user(child->thread.debug.dac1, datalp); +} + +int ptrace_set_debugreg(struct task_struct *task, unsigned long addr, unsigned long data) +{ + struct pt_regs *regs = task->thread.regs; +#ifdef CONFIG_HAVE_HW_BREAKPOINT + int ret; + struct thread_struct *thread = &task->thread; + struct perf_event *bp; + struct perf_event_attr attr; +#endif /* CONFIG_HAVE_HW_BREAKPOINT */ + + /* For ppc64 we support one DABR and no IABR's at the moment (ppc64). + * For embedded processors we support one DAC and no IAC's at the + * moment. + */ + if (addr > 0) + return -EINVAL; + + /* The bottom 3 bits in dabr are flags */ + if ((data & ~0x7UL) >= TASK_SIZE) + return -EIO; + + /* As described above, it was assumed 3 bits were passed with the data + * address, but we will assume only the mode bits will be passed + * as to not cause alignment restrictions for DAC-based processors. + */ + + /* DAC's hold the whole address without any mode flags */ + task->thread.debug.dac1 = data & ~0x3UL; + + if (task->thread.debug.dac1 == 0) { + dbcr_dac(task) &= ~(DBCR_DAC1R | DBCR_DAC1W); + if (!DBCR_ACTIVE_EVENTS(task->thread.debug.dbcr0, + task->thread.debug.dbcr1)) { + regs_set_return_msr(regs, regs->msr & ~MSR_DE); + task->thread.debug.dbcr0 &= ~DBCR0_IDM; + } + return 0; + } + + /* Read or Write bits must be set */ + + if (!(data & 0x3UL)) + return -EINVAL; + + /* Set the Internal Debugging flag (IDM bit 1) for the DBCR0 register */ + task->thread.debug.dbcr0 |= DBCR0_IDM; + + /* Check for write and read flags and set DBCR0 accordingly */ + dbcr_dac(task) &= ~(DBCR_DAC1R | DBCR_DAC1W); + if (data & 0x1UL) + dbcr_dac(task) |= DBCR_DAC1R; + if (data & 0x2UL) + dbcr_dac(task) |= DBCR_DAC1W; + regs_set_return_msr(regs, regs->msr | MSR_DE); + return 0; +} + +static long set_instruction_bp(struct task_struct *child, + struct ppc_hw_breakpoint *bp_info) +{ + int slot; + int slot1_in_use = ((child->thread.debug.dbcr0 & DBCR0_IAC1) != 0); + int slot2_in_use = ((child->thread.debug.dbcr0 & DBCR0_IAC2) != 0); + int slot3_in_use = ((child->thread.debug.dbcr0 & DBCR0_IAC3) != 0); + int slot4_in_use = ((child->thread.debug.dbcr0 & DBCR0_IAC4) != 0); + + if (dbcr_iac_range(child) & DBCR_IAC12MODE) + slot2_in_use = 1; + if (dbcr_iac_range(child) & DBCR_IAC34MODE) + slot4_in_use = 1; + + if (bp_info->addr >= TASK_SIZE) + return -EIO; + + if (bp_info->addr_mode != PPC_BREAKPOINT_MODE_EXACT) { + /* Make sure range is valid. */ + if (bp_info->addr2 >= TASK_SIZE) + return -EIO; + + /* We need a pair of IAC regsisters */ + if (!slot1_in_use && !slot2_in_use) { + slot = 1; + child->thread.debug.iac1 = bp_info->addr; + child->thread.debug.iac2 = bp_info->addr2; + child->thread.debug.dbcr0 |= DBCR0_IAC1; + if (bp_info->addr_mode == + PPC_BREAKPOINT_MODE_RANGE_EXCLUSIVE) + dbcr_iac_range(child) |= DBCR_IAC12X; + else + dbcr_iac_range(child) |= DBCR_IAC12I; +#if CONFIG_PPC_ADV_DEBUG_IACS > 2 + } else if ((!slot3_in_use) && (!slot4_in_use)) { + slot = 3; + child->thread.debug.iac3 = bp_info->addr; + child->thread.debug.iac4 = bp_info->addr2; + child->thread.debug.dbcr0 |= DBCR0_IAC3; + if (bp_info->addr_mode == + PPC_BREAKPOINT_MODE_RANGE_EXCLUSIVE) + dbcr_iac_range(child) |= DBCR_IAC34X; + else + dbcr_iac_range(child) |= DBCR_IAC34I; +#endif + } else { + return -ENOSPC; + } + } else { + /* We only need one. If possible leave a pair free in + * case a range is needed later + */ + if (!slot1_in_use) { + /* + * Don't use iac1 if iac1-iac2 are free and either + * iac3 or iac4 (but not both) are free + */ + if (slot2_in_use || slot3_in_use == slot4_in_use) { + slot = 1; + child->thread.debug.iac1 = bp_info->addr; + child->thread.debug.dbcr0 |= DBCR0_IAC1; + goto out; + } + } + if (!slot2_in_use) { + slot = 2; + child->thread.debug.iac2 = bp_info->addr; + child->thread.debug.dbcr0 |= DBCR0_IAC2; +#if CONFIG_PPC_ADV_DEBUG_IACS > 2 + } else if (!slot3_in_use) { + slot = 3; + child->thread.debug.iac3 = bp_info->addr; + child->thread.debug.dbcr0 |= DBCR0_IAC3; + } else if (!slot4_in_use) { + slot = 4; + child->thread.debug.iac4 = bp_info->addr; + child->thread.debug.dbcr0 |= DBCR0_IAC4; +#endif + } else { + return -ENOSPC; + } + } +out: + child->thread.debug.dbcr0 |= DBCR0_IDM; + regs_set_return_msr(child->thread.regs, child->thread.regs->msr | MSR_DE); + + return slot; +} + +static int del_instruction_bp(struct task_struct *child, int slot) +{ + switch (slot) { + case 1: + if ((child->thread.debug.dbcr0 & DBCR0_IAC1) == 0) + return -ENOENT; + + if (dbcr_iac_range(child) & DBCR_IAC12MODE) { + /* address range - clear slots 1 & 2 */ + child->thread.debug.iac2 = 0; + dbcr_iac_range(child) &= ~DBCR_IAC12MODE; + } + child->thread.debug.iac1 = 0; + child->thread.debug.dbcr0 &= ~DBCR0_IAC1; + break; + case 2: + if ((child->thread.debug.dbcr0 & DBCR0_IAC2) == 0) + return -ENOENT; + + if (dbcr_iac_range(child) & DBCR_IAC12MODE) + /* used in a range */ + return -EINVAL; + child->thread.debug.iac2 = 0; + child->thread.debug.dbcr0 &= ~DBCR0_IAC2; + break; +#if CONFIG_PPC_ADV_DEBUG_IACS > 2 + case 3: + if ((child->thread.debug.dbcr0 & DBCR0_IAC3) == 0) + return -ENOENT; + + if (dbcr_iac_range(child) & DBCR_IAC34MODE) { + /* address range - clear slots 3 & 4 */ + child->thread.debug.iac4 = 0; + dbcr_iac_range(child) &= ~DBCR_IAC34MODE; + } + child->thread.debug.iac3 = 0; + child->thread.debug.dbcr0 &= ~DBCR0_IAC3; + break; + case 4: + if ((child->thread.debug.dbcr0 & DBCR0_IAC4) == 0) + return -ENOENT; + + if (dbcr_iac_range(child) & DBCR_IAC34MODE) + /* Used in a range */ + return -EINVAL; + child->thread.debug.iac4 = 0; + child->thread.debug.dbcr0 &= ~DBCR0_IAC4; + break; +#endif + default: + return -EINVAL; + } + return 0; +} + +static int set_dac(struct task_struct *child, struct ppc_hw_breakpoint *bp_info) +{ + int byte_enable = + (bp_info->condition_mode >> PPC_BREAKPOINT_CONDITION_BE_SHIFT) + & 0xf; + int condition_mode = + bp_info->condition_mode & PPC_BREAKPOINT_CONDITION_MODE; + int slot; + + if (byte_enable && condition_mode == 0) + return -EINVAL; + + if (bp_info->addr >= TASK_SIZE) + return -EIO; + + if ((dbcr_dac(child) & (DBCR_DAC1R | DBCR_DAC1W)) == 0) { + slot = 1; + if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_READ) + dbcr_dac(child) |= DBCR_DAC1R; + if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_WRITE) + dbcr_dac(child) |= DBCR_DAC1W; + child->thread.debug.dac1 = (unsigned long)bp_info->addr; +#if CONFIG_PPC_ADV_DEBUG_DVCS > 0 + if (byte_enable) { + child->thread.debug.dvc1 = + (unsigned long)bp_info->condition_value; + child->thread.debug.dbcr2 |= + ((byte_enable << DBCR2_DVC1BE_SHIFT) | + (condition_mode << DBCR2_DVC1M_SHIFT)); + } +#endif +#ifdef CONFIG_PPC_ADV_DEBUG_DAC_RANGE + } else if (child->thread.debug.dbcr2 & DBCR2_DAC12MODE) { + /* Both dac1 and dac2 are part of a range */ + return -ENOSPC; +#endif + } else if ((dbcr_dac(child) & (DBCR_DAC2R | DBCR_DAC2W)) == 0) { + slot = 2; + if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_READ) + dbcr_dac(child) |= DBCR_DAC2R; + if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_WRITE) + dbcr_dac(child) |= DBCR_DAC2W; + child->thread.debug.dac2 = (unsigned long)bp_info->addr; +#if CONFIG_PPC_ADV_DEBUG_DVCS > 0 + if (byte_enable) { + child->thread.debug.dvc2 = + (unsigned long)bp_info->condition_value; + child->thread.debug.dbcr2 |= + ((byte_enable << DBCR2_DVC2BE_SHIFT) | + (condition_mode << DBCR2_DVC2M_SHIFT)); + } +#endif + } else { + return -ENOSPC; + } + child->thread.debug.dbcr0 |= DBCR0_IDM; + regs_set_return_msr(child->thread.regs, child->thread.regs->msr | MSR_DE); + + return slot + 4; +} + +static int del_dac(struct task_struct *child, int slot) +{ + if (slot == 1) { + if ((dbcr_dac(child) & (DBCR_DAC1R | DBCR_DAC1W)) == 0) + return -ENOENT; + + child->thread.debug.dac1 = 0; + dbcr_dac(child) &= ~(DBCR_DAC1R | DBCR_DAC1W); +#ifdef CONFIG_PPC_ADV_DEBUG_DAC_RANGE + if (child->thread.debug.dbcr2 & DBCR2_DAC12MODE) { + child->thread.debug.dac2 = 0; + child->thread.debug.dbcr2 &= ~DBCR2_DAC12MODE; + } + child->thread.debug.dbcr2 &= ~(DBCR2_DVC1M | DBCR2_DVC1BE); +#endif +#if CONFIG_PPC_ADV_DEBUG_DVCS > 0 + child->thread.debug.dvc1 = 0; +#endif + } else if (slot == 2) { + if ((dbcr_dac(child) & (DBCR_DAC2R | DBCR_DAC2W)) == 0) + return -ENOENT; + +#ifdef CONFIG_PPC_ADV_DEBUG_DAC_RANGE + if (child->thread.debug.dbcr2 & DBCR2_DAC12MODE) + /* Part of a range */ + return -EINVAL; + child->thread.debug.dbcr2 &= ~(DBCR2_DVC2M | DBCR2_DVC2BE); +#endif +#if CONFIG_PPC_ADV_DEBUG_DVCS > 0 + child->thread.debug.dvc2 = 0; +#endif + child->thread.debug.dac2 = 0; + dbcr_dac(child) &= ~(DBCR_DAC2R | DBCR_DAC2W); + } else { + return -EINVAL; + } + + return 0; +} + +#ifdef CONFIG_PPC_ADV_DEBUG_DAC_RANGE +static int set_dac_range(struct task_struct *child, + struct ppc_hw_breakpoint *bp_info) +{ + int mode = bp_info->addr_mode & PPC_BREAKPOINT_MODE_MASK; + + /* We don't allow range watchpoints to be used with DVC */ + if (bp_info->condition_mode) + return -EINVAL; + + /* + * Best effort to verify the address range. The user/supervisor bits + * prevent trapping in kernel space, but let's fail on an obvious bad + * range. The simple test on the mask is not fool-proof, and any + * exclusive range will spill over into kernel space. + */ + if (bp_info->addr >= TASK_SIZE) + return -EIO; + if (mode == PPC_BREAKPOINT_MODE_MASK) { + /* + * dac2 is a bitmask. Don't allow a mask that makes a + * kernel space address from a valid dac1 value + */ + if (~((unsigned long)bp_info->addr2) >= TASK_SIZE) + return -EIO; + } else { + /* + * For range breakpoints, addr2 must also be a valid address + */ + if (bp_info->addr2 >= TASK_SIZE) + return -EIO; + } + + if (child->thread.debug.dbcr0 & + (DBCR0_DAC1R | DBCR0_DAC1W | DBCR0_DAC2R | DBCR0_DAC2W)) + return -ENOSPC; + + if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_READ) + child->thread.debug.dbcr0 |= (DBCR0_DAC1R | DBCR0_IDM); + if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_WRITE) + child->thread.debug.dbcr0 |= (DBCR0_DAC1W | DBCR0_IDM); + child->thread.debug.dac1 = bp_info->addr; + child->thread.debug.dac2 = bp_info->addr2; + if (mode == PPC_BREAKPOINT_MODE_RANGE_INCLUSIVE) + child->thread.debug.dbcr2 |= DBCR2_DAC12M; + else if (mode == PPC_BREAKPOINT_MODE_RANGE_EXCLUSIVE) + child->thread.debug.dbcr2 |= DBCR2_DAC12MX; + else /* PPC_BREAKPOINT_MODE_MASK */ + child->thread.debug.dbcr2 |= DBCR2_DAC12MM; + regs_set_return_msr(child->thread.regs, child->thread.regs->msr | MSR_DE); + + return 5; +} +#endif /* CONFIG_PPC_ADV_DEBUG_DAC_RANGE */ + +long ppc_set_hwdebug(struct task_struct *child, struct ppc_hw_breakpoint *bp_info) +{ + if (bp_info->version != 1) + return -ENOTSUPP; + /* + * Check for invalid flags and combinations + */ + if (bp_info->trigger_type == 0 || + (bp_info->trigger_type & ~(PPC_BREAKPOINT_TRIGGER_EXECUTE | + PPC_BREAKPOINT_TRIGGER_RW)) || + (bp_info->addr_mode & ~PPC_BREAKPOINT_MODE_MASK) || + (bp_info->condition_mode & + ~(PPC_BREAKPOINT_CONDITION_MODE | + PPC_BREAKPOINT_CONDITION_BE_ALL))) + return -EINVAL; +#if CONFIG_PPC_ADV_DEBUG_DVCS == 0 + if (bp_info->condition_mode != PPC_BREAKPOINT_CONDITION_NONE) + return -EINVAL; +#endif + + if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_EXECUTE) { + if (bp_info->trigger_type != PPC_BREAKPOINT_TRIGGER_EXECUTE || + bp_info->condition_mode != PPC_BREAKPOINT_CONDITION_NONE) + return -EINVAL; + return set_instruction_bp(child, bp_info); + } + if (bp_info->addr_mode == PPC_BREAKPOINT_MODE_EXACT) + return set_dac(child, bp_info); + +#ifdef CONFIG_PPC_ADV_DEBUG_DAC_RANGE + return set_dac_range(child, bp_info); +#else + return -EINVAL; +#endif +} + +long ppc_del_hwdebug(struct task_struct *child, long data) +{ + int rc; + + if (data <= 4) + rc = del_instruction_bp(child, (int)data); + else + rc = del_dac(child, (int)data - 4); + + if (!rc) { + if (!DBCR_ACTIVE_EVENTS(child->thread.debug.dbcr0, + child->thread.debug.dbcr1)) { + child->thread.debug.dbcr0 &= ~DBCR0_IDM; + regs_set_return_msr(child->thread.regs, + child->thread.regs->msr & ~MSR_DE); + } + } + return rc; +} diff --git a/arch/powerpc/kernel/ptrace/ptrace-altivec.c b/arch/powerpc/kernel/ptrace/ptrace-altivec.c new file mode 100644 index 000000000000..0d9bc4bd4972 --- /dev/null +++ b/arch/powerpc/kernel/ptrace/ptrace-altivec.c @@ -0,0 +1,115 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/regset.h> +#include <linux/elf.h> + +#include <asm/switch_to.h> + +#include "ptrace-decl.h" + +/* + * Get/set all the altivec registers vr0..vr31, vscr, vrsave, in one go. + * The transfer totals 34 quadword. Quadwords 0-31 contain the + * corresponding vector registers. Quadword 32 contains the vscr as the + * last word (offset 12) within that quadword. Quadword 33 contains the + * vrsave as the first word (offset 0) within the quadword. + * + * This definition of the VMX state is compatible with the current PPC32 + * ptrace interface. This allows signal handling and ptrace to use the + * same structures. This also simplifies the implementation of a bi-arch + * (combined (32- and 64-bit) gdb. + */ + +int vr_active(struct task_struct *target, const struct user_regset *regset) +{ + flush_altivec_to_thread(target); + return target->thread.used_vr ? regset->n : 0; +} + +/* + * Regardless of transactions, 'vr_state' holds the current running + * value of all the VMX registers and 'ckvr_state' holds the last + * checkpointed value of all the VMX registers for the current + * transaction to fall back on in case it aborts. + * + * Userspace interface buffer layout: + * + * struct data { + * vector128 vr[32]; + * vector128 vscr; + * vector128 vrsave; + * }; + */ +int vr_get(struct task_struct *target, const struct user_regset *regset, + struct membuf to) +{ + union { + elf_vrreg_t reg; + u32 word; + } vrsave; + + flush_altivec_to_thread(target); + + BUILD_BUG_ON(offsetof(struct thread_vr_state, vscr) != + offsetof(struct thread_vr_state, vr[32])); + + membuf_write(&to, &target->thread.vr_state, 33 * sizeof(vector128)); + /* + * Copy out only the low-order word of vrsave. + */ + memset(&vrsave, 0, sizeof(vrsave)); + vrsave.word = target->thread.vrsave; + return membuf_write(&to, &vrsave, sizeof(vrsave)); +} + +/* + * Regardless of transactions, 'vr_state' holds the current running + * value of all the VMX registers and 'ckvr_state' holds the last + * checkpointed value of all the VMX registers for the current + * transaction to fall back on in case it aborts. + * + * Userspace interface buffer layout: + * + * struct data { + * vector128 vr[32]; + * vector128 vscr; + * vector128 vrsave; + * }; + */ +int vr_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret; + + flush_altivec_to_thread(target); + + BUILD_BUG_ON(offsetof(struct thread_vr_state, vscr) != + offsetof(struct thread_vr_state, vr[32])); + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.vr_state, 0, + 33 * sizeof(vector128)); + if (!ret && count > 0) { + /* + * We use only the first word of vrsave. + */ + int start, end; + union { + elf_vrreg_t reg; + u32 word; + } vrsave; + memset(&vrsave, 0, sizeof(vrsave)); + + vrsave.word = target->thread.vrsave; + + start = 33 * sizeof(vector128); + end = start + sizeof(vrsave); + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &vrsave, + start, end); + if (!ret) + target->thread.vrsave = vrsave.word; + } + + return ret; +} diff --git a/arch/powerpc/kernel/ptrace/ptrace-decl.h b/arch/powerpc/kernel/ptrace/ptrace-decl.h new file mode 100644 index 000000000000..4171a5727197 --- /dev/null +++ b/arch/powerpc/kernel/ptrace/ptrace-decl.h @@ -0,0 +1,183 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#include <linux/regset.h> + +/* + * Set of msr bits that gdb can change on behalf of a process. + */ +#ifdef CONFIG_PPC_ADV_DEBUG_REGS +#define MSR_DEBUGCHANGE 0 +#else +#define MSR_DEBUGCHANGE (MSR_SE | MSR_BE) +#endif + +/* + * Max register writeable via put_reg + */ +#ifdef CONFIG_PPC32 +#define PT_MAX_PUT_REG PT_MQ +#else +#define PT_MAX_PUT_REG PT_CCR +#endif + +#define TVSO(f) (offsetof(struct thread_vr_state, f)) +#define TFSO(f) (offsetof(struct thread_fp_state, f)) +#define TSO(f) (offsetof(struct thread_struct, f)) + +/* + * These are our native regset flavors. + */ +enum powerpc_regset { + REGSET_GPR, + REGSET_FPR, +#ifdef CONFIG_ALTIVEC + REGSET_VMX, +#endif +#ifdef CONFIG_VSX + REGSET_VSX, +#endif +#ifdef CONFIG_SPE + REGSET_SPE, +#endif +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + REGSET_TM_CGPR, /* TM checkpointed GPR registers */ + REGSET_TM_CFPR, /* TM checkpointed FPR registers */ + REGSET_TM_CVMX, /* TM checkpointed VMX registers */ + REGSET_TM_CVSX, /* TM checkpointed VSX registers */ + REGSET_TM_SPR, /* TM specific SPR registers */ + REGSET_TM_CTAR, /* TM checkpointed TAR register */ + REGSET_TM_CPPR, /* TM checkpointed PPR register */ + REGSET_TM_CDSCR, /* TM checkpointed DSCR register */ +#endif +#ifdef CONFIG_PPC64 + REGSET_PPR, /* PPR register */ + REGSET_DSCR, /* DSCR register */ +#endif +#ifdef CONFIG_PPC_BOOK3S_64 + REGSET_TAR, /* TAR register */ + REGSET_EBB, /* EBB registers */ + REGSET_PMR, /* Performance Monitor Registers */ + REGSET_DEXCR, /* DEXCR registers */ +#ifdef CONFIG_CHECKPOINT_RESTORE + REGSET_HASHKEYR, /* HASHKEYR register */ +#endif +#endif +#ifdef CONFIG_PPC_MEM_KEYS + REGSET_PKEY, /* AMR register */ +#endif +}; + +/* ptrace-(no)vsx */ + +user_regset_get2_fn fpr_get; +int fpr_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf); + +/* ptrace-vsx */ + +int vsr_active(struct task_struct *target, const struct user_regset *regset); +user_regset_get2_fn vsr_get; +int vsr_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf); + +/* ptrace-altivec */ + +int vr_active(struct task_struct *target, const struct user_regset *regset); +user_regset_get2_fn vr_get; +int vr_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf); + +/* ptrace-spe */ + +int evr_active(struct task_struct *target, const struct user_regset *regset); +user_regset_get2_fn evr_get; +int evr_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf); + +/* ptrace */ + +int gpr32_get_common(struct task_struct *target, + const struct user_regset *regset, + struct membuf to, + unsigned long *regs); +int gpr32_set_common(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf, + unsigned long *regs); + +/* ptrace-tm */ + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +void flush_tmregs_to_thread(struct task_struct *tsk); +#else +static inline void flush_tmregs_to_thread(struct task_struct *tsk) { } +#endif + +int tm_cgpr_active(struct task_struct *target, const struct user_regset *regset); +user_regset_get2_fn tm_cgpr_get; +int tm_cgpr_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf); +int tm_cfpr_active(struct task_struct *target, const struct user_regset *regset); +user_regset_get2_fn tm_cfpr_get; +int tm_cfpr_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf); +int tm_cvmx_active(struct task_struct *target, const struct user_regset *regset); +user_regset_get2_fn tm_cvmx_get; +int tm_cvmx_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf); +int tm_cvsx_active(struct task_struct *target, const struct user_regset *regset); +user_regset_get2_fn tm_cvsx_get; +int tm_cvsx_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf); +int tm_spr_active(struct task_struct *target, const struct user_regset *regset); +user_regset_get2_fn tm_spr_get; +int tm_spr_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf); +int tm_tar_active(struct task_struct *target, const struct user_regset *regset); +user_regset_get2_fn tm_tar_get; +int tm_tar_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf); +int tm_ppr_active(struct task_struct *target, const struct user_regset *regset); +user_regset_get2_fn tm_ppr_get; +int tm_ppr_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf); +int tm_dscr_active(struct task_struct *target, const struct user_regset *regset); +user_regset_get2_fn tm_dscr_get; +int tm_dscr_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf); +user_regset_get2_fn tm_cgpr32_get; +int tm_cgpr32_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf); + +/* ptrace-view */ + +int ptrace_get_reg(struct task_struct *task, int regno, unsigned long *data); +int ptrace_put_reg(struct task_struct *task, int regno, unsigned long data); + +extern const struct user_regset_view user_ppc_native_view; + +/* ptrace-fpu */ +int ptrace_get_fpr(struct task_struct *child, int index, unsigned long *data); +int ptrace_put_fpr(struct task_struct *child, int index, unsigned long data); + +/* ptrace-(no)adv */ +void ppc_gethwdinfo(struct ppc_debug_info *dbginfo); +int ptrace_get_debugreg(struct task_struct *child, unsigned long addr, + unsigned long __user *datalp); +int ptrace_set_debugreg(struct task_struct *task, unsigned long addr, unsigned long data); +long ppc_set_hwdebug(struct task_struct *child, struct ppc_hw_breakpoint *bp_info); +long ppc_del_hwdebug(struct task_struct *child, long data); diff --git a/arch/powerpc/kernel/ptrace/ptrace-fpu.c b/arch/powerpc/kernel/ptrace/ptrace-fpu.c new file mode 100644 index 000000000000..09c49632bfe5 --- /dev/null +++ b/arch/powerpc/kernel/ptrace/ptrace-fpu.c @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/regset.h> + +#include <asm/switch_to.h> + +#include "ptrace-decl.h" + +int ptrace_get_fpr(struct task_struct *child, int index, unsigned long *data) +{ +#ifdef CONFIG_PPC_FPU_REGS + unsigned int fpidx = index - PT_FPR0; +#endif + + if (index > PT_FPSCR) + return -EIO; + +#ifdef CONFIG_PPC_FPU_REGS + flush_fp_to_thread(child); + if (fpidx < (PT_FPSCR - PT_FPR0)) { + if (IS_ENABLED(CONFIG_PPC32)) + // On 32-bit the index we are passed refers to 32-bit words + *data = ((u32 *)child->thread.fp_state.fpr)[fpidx]; + else + memcpy(data, &child->thread.TS_FPR(fpidx), sizeof(long)); + } else + *data = child->thread.fp_state.fpscr; +#else + *data = 0; +#endif + + return 0; +} + +int ptrace_put_fpr(struct task_struct *child, int index, unsigned long data) +{ +#ifdef CONFIG_PPC_FPU_REGS + unsigned int fpidx = index - PT_FPR0; +#endif + + if (index > PT_FPSCR) + return -EIO; + +#ifdef CONFIG_PPC_FPU_REGS + flush_fp_to_thread(child); + if (fpidx < (PT_FPSCR - PT_FPR0)) { + if (IS_ENABLED(CONFIG_PPC32)) + // On 32-bit the index we are passed refers to 32-bit words + ((u32 *)child->thread.fp_state.fpr)[fpidx] = data; + else + memcpy(&child->thread.TS_FPR(fpidx), &data, sizeof(long)); + } else + child->thread.fp_state.fpscr = data; +#endif + + return 0; +} + diff --git a/arch/powerpc/kernel/ptrace/ptrace-noadv.c b/arch/powerpc/kernel/ptrace/ptrace-noadv.c new file mode 100644 index 000000000000..a5dd7d2e2c9e --- /dev/null +++ b/arch/powerpc/kernel/ptrace/ptrace-noadv.c @@ -0,0 +1,298 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/regset.h> +#include <linux/hw_breakpoint.h> + +#include <asm/debug.h> + +#include "ptrace-decl.h" + +void user_enable_single_step(struct task_struct *task) +{ + struct pt_regs *regs = task->thread.regs; + + if (regs != NULL) + regs_set_return_msr(regs, (regs->msr & ~MSR_BE) | MSR_SE); + set_tsk_thread_flag(task, TIF_SINGLESTEP); +} + +void user_enable_block_step(struct task_struct *task) +{ + struct pt_regs *regs = task->thread.regs; + + if (regs != NULL) + regs_set_return_msr(regs, (regs->msr & ~MSR_SE) | MSR_BE); + set_tsk_thread_flag(task, TIF_SINGLESTEP); +} + +void user_disable_single_step(struct task_struct *task) +{ + struct pt_regs *regs = task->thread.regs; + + if (regs != NULL) + regs_set_return_msr(regs, regs->msr & ~(MSR_SE | MSR_BE)); + + clear_tsk_thread_flag(task, TIF_SINGLESTEP); +} + +void ppc_gethwdinfo(struct ppc_debug_info *dbginfo) +{ + dbginfo->version = 1; + dbginfo->num_instruction_bps = 0; + if (ppc_breakpoint_available()) + dbginfo->num_data_bps = nr_wp_slots(); + else + dbginfo->num_data_bps = 0; + dbginfo->num_condition_regs = 0; + dbginfo->data_bp_alignment = sizeof(long); + dbginfo->sizeof_condition = 0; + if (IS_ENABLED(CONFIG_HAVE_HW_BREAKPOINT)) { + dbginfo->features = PPC_DEBUG_FEATURE_DATA_BP_RANGE; + if (dawr_enabled()) + dbginfo->features |= PPC_DEBUG_FEATURE_DATA_BP_DAWR; + } else { + dbginfo->features = 0; + } + if (cpu_has_feature(CPU_FTR_ARCH_31)) + dbginfo->features |= PPC_DEBUG_FEATURE_DATA_BP_ARCH_31; +} + +int ptrace_get_debugreg(struct task_struct *child, unsigned long addr, + unsigned long __user *datalp) +{ + unsigned long dabr_fake; + + /* We only support one DABR and no IABRS at the moment */ + if (addr > 0) + return -EINVAL; + dabr_fake = ((child->thread.hw_brk[0].address & (~HW_BRK_TYPE_DABR)) | + (child->thread.hw_brk[0].type & HW_BRK_TYPE_DABR)); + return put_user(dabr_fake, datalp); +} + +/* + * ptrace_set_debugreg() fakes DABR and DABR is only one. So even if + * internal hw supports more than one watchpoint, we support only one + * watchpoint with this interface. + */ +int ptrace_set_debugreg(struct task_struct *task, unsigned long addr, unsigned long data) +{ +#ifdef CONFIG_HAVE_HW_BREAKPOINT + int ret; + struct thread_struct *thread = &task->thread; + struct perf_event *bp; + struct perf_event_attr attr; +#endif /* CONFIG_HAVE_HW_BREAKPOINT */ + bool set_bp = true; + struct arch_hw_breakpoint hw_brk; + + /* For ppc64 we support one DABR and no IABR's at the moment (ppc64). + * For embedded processors we support one DAC and no IAC's at the + * moment. + */ + if (addr > 0) + return -EINVAL; + + /* The bottom 3 bits in dabr are flags */ + if ((data & ~0x7UL) >= TASK_SIZE) + return -EIO; + + /* For processors using DABR (i.e. 970), the bottom 3 bits are flags. + * It was assumed, on previous implementations, that 3 bits were + * passed together with the data address, fitting the design of the + * DABR register, as follows: + * + * bit 0: Read flag + * bit 1: Write flag + * bit 2: Breakpoint translation + * + * Thus, we use them here as so. + */ + + /* Ensure breakpoint translation bit is set */ + if (data && !(data & HW_BRK_TYPE_TRANSLATE)) + return -EIO; + hw_brk.address = data & (~HW_BRK_TYPE_DABR); + hw_brk.type = (data & HW_BRK_TYPE_DABR) | HW_BRK_TYPE_PRIV_ALL; + hw_brk.len = DABR_MAX_LEN; + hw_brk.hw_len = DABR_MAX_LEN; + set_bp = (data) && (hw_brk.type & HW_BRK_TYPE_RDWR); +#ifdef CONFIG_HAVE_HW_BREAKPOINT + bp = thread->ptrace_bps[0]; + if (!set_bp) { + if (bp) { + unregister_hw_breakpoint(bp); + thread->ptrace_bps[0] = NULL; + } + return 0; + } + if (bp) { + attr = bp->attr; + attr.bp_addr = hw_brk.address; + attr.bp_len = DABR_MAX_LEN; + arch_bp_generic_fields(hw_brk.type, &attr.bp_type); + + /* Enable breakpoint */ + attr.disabled = false; + + ret = modify_user_hw_breakpoint(bp, &attr); + if (ret) + return ret; + + thread->ptrace_bps[0] = bp; + thread->hw_brk[0] = hw_brk; + return 0; + } + + /* Create a new breakpoint request if one doesn't exist already */ + hw_breakpoint_init(&attr); + attr.bp_addr = hw_brk.address; + attr.bp_len = DABR_MAX_LEN; + arch_bp_generic_fields(hw_brk.type, + &attr.bp_type); + + thread->ptrace_bps[0] = bp = register_user_hw_breakpoint(&attr, + ptrace_triggered, NULL, task); + if (IS_ERR(bp)) { + thread->ptrace_bps[0] = NULL; + return PTR_ERR(bp); + } + +#else /* !CONFIG_HAVE_HW_BREAKPOINT */ + if (set_bp && (!ppc_breakpoint_available())) + return -ENODEV; +#endif /* CONFIG_HAVE_HW_BREAKPOINT */ + task->thread.hw_brk[0] = hw_brk; + return 0; +} + +#ifdef CONFIG_HAVE_HW_BREAKPOINT +static int find_empty_ptrace_bp(struct thread_struct *thread) +{ + int i; + + for (i = 0; i < nr_wp_slots(); i++) { + if (!thread->ptrace_bps[i]) + return i; + } + return -1; +} +#endif + +static int find_empty_hw_brk(struct thread_struct *thread) +{ + int i; + + for (i = 0; i < nr_wp_slots(); i++) { + if (!thread->hw_brk[i].address) + return i; + } + return -1; +} + +long ppc_set_hwdebug(struct task_struct *child, struct ppc_hw_breakpoint *bp_info) +{ + int i; +#ifdef CONFIG_HAVE_HW_BREAKPOINT + int len = 0; + struct thread_struct *thread = &child->thread; + struct perf_event *bp; + struct perf_event_attr attr; +#endif /* CONFIG_HAVE_HW_BREAKPOINT */ + struct arch_hw_breakpoint brk; + + if (bp_info->version != 1) + return -ENOTSUPP; + /* + * We only support one data breakpoint + */ + if ((bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_RW) == 0 || + (bp_info->trigger_type & ~PPC_BREAKPOINT_TRIGGER_RW) != 0 || + bp_info->condition_mode != PPC_BREAKPOINT_CONDITION_NONE) + return -EINVAL; + + if ((unsigned long)bp_info->addr >= TASK_SIZE) + return -EIO; + + brk.address = ALIGN_DOWN(bp_info->addr, HW_BREAKPOINT_SIZE); + brk.type = HW_BRK_TYPE_TRANSLATE | HW_BRK_TYPE_PRIV_ALL; + brk.len = DABR_MAX_LEN; + brk.hw_len = DABR_MAX_LEN; + if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_READ) + brk.type |= HW_BRK_TYPE_READ; + if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_WRITE) + brk.type |= HW_BRK_TYPE_WRITE; +#ifdef CONFIG_HAVE_HW_BREAKPOINT + if (bp_info->addr_mode == PPC_BREAKPOINT_MODE_RANGE_INCLUSIVE) + len = bp_info->addr2 - bp_info->addr; + else if (bp_info->addr_mode == PPC_BREAKPOINT_MODE_EXACT) + len = 1; + else + return -EINVAL; + + i = find_empty_ptrace_bp(thread); + if (i < 0) + return -ENOSPC; + + /* Create a new breakpoint request if one doesn't exist already */ + hw_breakpoint_init(&attr); + attr.bp_addr = (unsigned long)bp_info->addr; + attr.bp_len = len; + arch_bp_generic_fields(brk.type, &attr.bp_type); + + bp = register_user_hw_breakpoint(&attr, ptrace_triggered, NULL, child); + thread->ptrace_bps[i] = bp; + if (IS_ERR(bp)) { + thread->ptrace_bps[i] = NULL; + return PTR_ERR(bp); + } + + return i + 1; +#endif /* CONFIG_HAVE_HW_BREAKPOINT */ + + if (bp_info->addr_mode != PPC_BREAKPOINT_MODE_EXACT) + return -EINVAL; + + i = find_empty_hw_brk(&child->thread); + if (i < 0) + return -ENOSPC; + + if (!ppc_breakpoint_available()) + return -ENODEV; + + child->thread.hw_brk[i] = brk; + + return i + 1; +} + +long ppc_del_hwdebug(struct task_struct *child, long data) +{ +#ifdef CONFIG_HAVE_HW_BREAKPOINT + int ret = 0; + struct thread_struct *thread = &child->thread; + struct perf_event *bp; +#endif /* CONFIG_HAVE_HW_BREAKPOINT */ + if (data < 1 || data > nr_wp_slots()) + return -EINVAL; + +#ifdef CONFIG_HAVE_HW_BREAKPOINT + bp = thread->ptrace_bps[data - 1]; + if (bp) { + unregister_hw_breakpoint(bp); + thread->ptrace_bps[data - 1] = NULL; + } else { + ret = -ENOENT; + } + return ret; +#else /* CONFIG_HAVE_HW_BREAKPOINT */ + if (!(child->thread.hw_brk[data - 1].flags & HW_BRK_FLAG_DISABLED) && + child->thread.hw_brk[data - 1].address == 0) + return -ENOENT; + + child->thread.hw_brk[data - 1].address = 0; + child->thread.hw_brk[data - 1].type = 0; + child->thread.hw_brk[data - 1].flags = 0; +#endif /* CONFIG_HAVE_HW_BREAKPOINT */ + + return 0; +} diff --git a/arch/powerpc/kernel/ptrace/ptrace-novsx.c b/arch/powerpc/kernel/ptrace/ptrace-novsx.c new file mode 100644 index 000000000000..7433f3db979a --- /dev/null +++ b/arch/powerpc/kernel/ptrace/ptrace-novsx.c @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/regset.h> + +#include <asm/switch_to.h> + +#include "ptrace-decl.h" + +/* + * Regardless of transactions, 'fp_state' holds the current running + * value of all FPR registers and 'ckfp_state' holds the last checkpointed + * value of all FPR registers for the current transaction. + * + * Userspace interface buffer layout: + * + * struct data { + * u64 fpr[32]; + * u64 fpscr; + * }; + */ +int fpr_get(struct task_struct *target, const struct user_regset *regset, + struct membuf to) +{ +#ifdef CONFIG_PPC_FPU_REGS + BUILD_BUG_ON(offsetof(struct thread_fp_state, fpscr) != + offsetof(struct thread_fp_state, fpr[32])); + + flush_fp_to_thread(target); + + return membuf_write(&to, &target->thread.fp_state, 33 * sizeof(u64)); +#else + return membuf_write(&to, &empty_zero_page, 33 * sizeof(u64)); +#endif +} + +/* + * Regardless of transactions, 'fp_state' holds the current running + * value of all FPR registers and 'ckfp_state' holds the last checkpointed + * value of all FPR registers for the current transaction. + * + * Userspace interface buffer layout: + * + * struct data { + * u64 fpr[32]; + * u64 fpscr; + * }; + * + */ +int fpr_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ +#ifdef CONFIG_PPC_FPU_REGS + BUILD_BUG_ON(offsetof(struct thread_fp_state, fpscr) != + offsetof(struct thread_fp_state, fpr[32])); + + flush_fp_to_thread(target); + + return user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.fp_state, 0, -1); +#else + return 0; +#endif +} diff --git a/arch/powerpc/kernel/ptrace/ptrace-spe.c b/arch/powerpc/kernel/ptrace/ptrace-spe.c new file mode 100644 index 000000000000..47034d069045 --- /dev/null +++ b/arch/powerpc/kernel/ptrace/ptrace-spe.c @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/regset.h> + +#include <asm/switch_to.h> + +#include "ptrace-decl.h" + +/* + * For get_evrregs/set_evrregs functions 'data' has the following layout: + * + * struct { + * u32 evr[32]; + * u64 acc; + * u32 spefscr; + * } + */ + +int evr_active(struct task_struct *target, const struct user_regset *regset) +{ + flush_spe_to_thread(target); + return target->thread.used_spe ? regset->n : 0; +} + +int evr_get(struct task_struct *target, const struct user_regset *regset, + struct membuf to) +{ + flush_spe_to_thread(target); + + membuf_write(&to, &target->thread.evr, sizeof(target->thread.evr)); + + BUILD_BUG_ON(offsetof(struct thread_struct, acc) + sizeof(u64) != + offsetof(struct thread_struct, spefscr)); + + return membuf_write(&to, &target->thread.acc, + sizeof(u64) + sizeof(u32)); +} + +int evr_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret; + + flush_spe_to_thread(target); + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.evr, + 0, sizeof(target->thread.evr)); + + BUILD_BUG_ON(offsetof(struct thread_struct, acc) + sizeof(u64) != + offsetof(struct thread_struct, spefscr)); + + if (!ret) + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.acc, + sizeof(target->thread.evr), -1); + + return ret; +} diff --git a/arch/powerpc/kernel/ptrace/ptrace-tm.c b/arch/powerpc/kernel/ptrace/ptrace-tm.c new file mode 100644 index 000000000000..210ea834e603 --- /dev/null +++ b/arch/powerpc/kernel/ptrace/ptrace-tm.c @@ -0,0 +1,788 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/regset.h> + +#include <asm/switch_to.h> +#include <asm/tm.h> +#include <asm/asm-prototypes.h> + +#include "ptrace-decl.h" + +void flush_tmregs_to_thread(struct task_struct *tsk) +{ + /* + * If task is not current, it will have been flushed already to + * it's thread_struct during __switch_to(). + * + * A reclaim flushes ALL the state or if not in TM save TM SPRs + * in the appropriate thread structures from live. + */ + + if (!cpu_has_feature(CPU_FTR_TM) || tsk != current) + return; + + if (MSR_TM_SUSPENDED(mfmsr())) { + tm_reclaim_current(TM_CAUSE_SIGNAL); + } else { + tm_enable(); + tm_save_sprs(&tsk->thread); + } +} + +static unsigned long get_user_ckpt_msr(struct task_struct *task) +{ + return task->thread.ckpt_regs.msr | task->thread.fpexc_mode; +} + +static int set_user_ckpt_msr(struct task_struct *task, unsigned long msr) +{ + task->thread.ckpt_regs.msr &= ~MSR_DEBUGCHANGE; + task->thread.ckpt_regs.msr |= msr & MSR_DEBUGCHANGE; + return 0; +} + +static int set_user_ckpt_trap(struct task_struct *task, unsigned long trap) +{ + set_trap(&task->thread.ckpt_regs, trap); + return 0; +} + +/** + * tm_cgpr_active - get active number of registers in CGPR + * @target: The target task. + * @regset: The user regset structure. + * + * This function checks for the active number of available + * regisers in transaction checkpointed GPR category. + */ +int tm_cgpr_active(struct task_struct *target, const struct user_regset *regset) +{ + if (!cpu_has_feature(CPU_FTR_TM)) + return -ENODEV; + + if (!MSR_TM_ACTIVE(target->thread.regs->msr)) + return 0; + + return regset->n; +} + +/** + * tm_cgpr_get - get CGPR registers + * @target: The target task. + * @regset: The user regset structure. + * @to: Destination of copy. + * + * This function gets transaction checkpointed GPR registers. + * + * When the transaction is active, 'ckpt_regs' holds all the checkpointed + * GPR register values for the current transaction to fall back on if it + * aborts in between. This function gets those checkpointed GPR registers. + * The userspace interface buffer layout is as follows. + * + * struct data { + * struct pt_regs ckpt_regs; + * }; + */ +int tm_cgpr_get(struct task_struct *target, const struct user_regset *regset, + struct membuf to) +{ + struct membuf to_msr = membuf_at(&to, offsetof(struct pt_regs, msr)); +#ifdef CONFIG_PPC64 + struct membuf to_softe = membuf_at(&to, offsetof(struct pt_regs, softe)); +#endif + + if (!cpu_has_feature(CPU_FTR_TM)) + return -ENODEV; + + if (!MSR_TM_ACTIVE(target->thread.regs->msr)) + return -ENODATA; + + flush_tmregs_to_thread(target); + flush_fp_to_thread(target); + flush_altivec_to_thread(target); + + membuf_write(&to, &target->thread.ckpt_regs, sizeof(struct user_pt_regs)); + + membuf_store(&to_msr, get_user_ckpt_msr(target)); +#ifdef CONFIG_PPC64 + membuf_store(&to_softe, 0x1ul); +#endif + return membuf_zero(&to, ELF_NGREG * sizeof(unsigned long) - + sizeof(struct user_pt_regs)); +} + +/* + * tm_cgpr_set - set the CGPR registers + * @target: The target task. + * @regset: The user regset structure. + * @pos: The buffer position. + * @count: Number of bytes to copy. + * @kbuf: Kernel buffer to copy into. + * @ubuf: User buffer to copy from. + * + * This function sets in transaction checkpointed GPR registers. + * + * When the transaction is active, 'ckpt_regs' holds the checkpointed + * GPR register values for the current transaction to fall back on if it + * aborts in between. This function sets those checkpointed GPR registers. + * The userspace interface buffer layout is as follows. + * + * struct data { + * struct pt_regs ckpt_regs; + * }; + */ +int tm_cgpr_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + unsigned long reg; + int ret; + + if (!cpu_has_feature(CPU_FTR_TM)) + return -ENODEV; + + if (!MSR_TM_ACTIVE(target->thread.regs->msr)) + return -ENODATA; + + flush_tmregs_to_thread(target); + flush_fp_to_thread(target); + flush_altivec_to_thread(target); + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.ckpt_regs, + 0, PT_MSR * sizeof(reg)); + + if (!ret && count > 0) { + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, ®, + PT_MSR * sizeof(reg), + (PT_MSR + 1) * sizeof(reg)); + if (!ret) + ret = set_user_ckpt_msr(target, reg); + } + + BUILD_BUG_ON(offsetof(struct pt_regs, orig_gpr3) != + offsetof(struct pt_regs, msr) + sizeof(long)); + + if (!ret) + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.ckpt_regs.orig_gpr3, + PT_ORIG_R3 * sizeof(reg), + (PT_MAX_PUT_REG + 1) * sizeof(reg)); + + if (PT_MAX_PUT_REG + 1 < PT_TRAP && !ret) + user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, + (PT_MAX_PUT_REG + 1) * sizeof(reg), + PT_TRAP * sizeof(reg)); + + if (!ret && count > 0) { + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, ®, + PT_TRAP * sizeof(reg), + (PT_TRAP + 1) * sizeof(reg)); + if (!ret) + ret = set_user_ckpt_trap(target, reg); + } + + if (!ret) + user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, + (PT_TRAP + 1) * sizeof(reg), -1); + + return ret; +} + +/** + * tm_cfpr_active - get active number of registers in CFPR + * @target: The target task. + * @regset: The user regset structure. + * + * This function checks for the active number of available + * regisers in transaction checkpointed FPR category. + */ +int tm_cfpr_active(struct task_struct *target, const struct user_regset *regset) +{ + if (!cpu_has_feature(CPU_FTR_TM)) + return -ENODEV; + + if (!MSR_TM_ACTIVE(target->thread.regs->msr)) + return 0; + + return regset->n; +} + +/** + * tm_cfpr_get - get CFPR registers + * @target: The target task. + * @regset: The user regset structure. + * @to: Destination of copy. + * + * This function gets in transaction checkpointed FPR registers. + * + * When the transaction is active 'ckfp_state' holds the checkpointed + * values for the current transaction to fall back on if it aborts + * in between. This function gets those checkpointed FPR registers. + * The userspace interface buffer layout is as follows. + * + * struct data { + * u64 fpr[32]; + * u64 fpscr; + *}; + */ +int tm_cfpr_get(struct task_struct *target, const struct user_regset *regset, + struct membuf to) +{ + u64 buf[33]; + int i; + + if (!cpu_has_feature(CPU_FTR_TM)) + return -ENODEV; + + if (!MSR_TM_ACTIVE(target->thread.regs->msr)) + return -ENODATA; + + flush_tmregs_to_thread(target); + flush_fp_to_thread(target); + flush_altivec_to_thread(target); + + /* copy to local buffer then write that out */ + for (i = 0; i < 32 ; i++) + buf[i] = target->thread.TS_CKFPR(i); + buf[32] = target->thread.ckfp_state.fpscr; + return membuf_write(&to, buf, sizeof(buf)); +} + +/** + * tm_cfpr_set - set CFPR registers + * @target: The target task. + * @regset: The user regset structure. + * @pos: The buffer position. + * @count: Number of bytes to copy. + * @kbuf: Kernel buffer to copy into. + * @ubuf: User buffer to copy from. + * + * This function sets in transaction checkpointed FPR registers. + * + * When the transaction is active 'ckfp_state' holds the checkpointed + * FPR register values for the current transaction to fall back on + * if it aborts in between. This function sets these checkpointed + * FPR registers. The userspace interface buffer layout is as follows. + * + * struct data { + * u64 fpr[32]; + * u64 fpscr; + *}; + */ +int tm_cfpr_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + u64 buf[33]; + int i; + + if (!cpu_has_feature(CPU_FTR_TM)) + return -ENODEV; + + if (!MSR_TM_ACTIVE(target->thread.regs->msr)) + return -ENODATA; + + flush_tmregs_to_thread(target); + flush_fp_to_thread(target); + flush_altivec_to_thread(target); + + for (i = 0; i < 32; i++) + buf[i] = target->thread.TS_CKFPR(i); + buf[32] = target->thread.ckfp_state.fpscr; + + /* copy to local buffer then write that out */ + i = user_regset_copyin(&pos, &count, &kbuf, &ubuf, buf, 0, -1); + if (i) + return i; + for (i = 0; i < 32 ; i++) + target->thread.TS_CKFPR(i) = buf[i]; + target->thread.ckfp_state.fpscr = buf[32]; + return 0; +} + +/** + * tm_cvmx_active - get active number of registers in CVMX + * @target: The target task. + * @regset: The user regset structure. + * + * This function checks for the active number of available + * regisers in checkpointed VMX category. + */ +int tm_cvmx_active(struct task_struct *target, const struct user_regset *regset) +{ + if (!cpu_has_feature(CPU_FTR_TM)) + return -ENODEV; + + if (!MSR_TM_ACTIVE(target->thread.regs->msr)) + return 0; + + return regset->n; +} + +/** + * tm_cvmx_get - get CMVX registers + * @target: The target task. + * @regset: The user regset structure. + * @to: Destination of copy. + * + * This function gets in transaction checkpointed VMX registers. + * + * When the transaction is active 'ckvr_state' and 'ckvrsave' hold + * the checkpointed values for the current transaction to fall + * back on if it aborts in between. The userspace interface buffer + * layout is as follows. + * + * struct data { + * vector128 vr[32]; + * vector128 vscr; + * vector128 vrsave; + *}; + */ +int tm_cvmx_get(struct task_struct *target, const struct user_regset *regset, + struct membuf to) +{ + union { + elf_vrreg_t reg; + u32 word; + } vrsave; + BUILD_BUG_ON(TVSO(vscr) != TVSO(vr[32])); + + if (!cpu_has_feature(CPU_FTR_TM)) + return -ENODEV; + + if (!MSR_TM_ACTIVE(target->thread.regs->msr)) + return -ENODATA; + + /* Flush the state */ + flush_tmregs_to_thread(target); + flush_fp_to_thread(target); + flush_altivec_to_thread(target); + + membuf_write(&to, &target->thread.ckvr_state, 33 * sizeof(vector128)); + /* + * Copy out only the low-order word of vrsave. + */ + memset(&vrsave, 0, sizeof(vrsave)); + vrsave.word = target->thread.ckvrsave; + return membuf_write(&to, &vrsave, sizeof(vrsave)); +} + +/** + * tm_cvmx_set - set CMVX registers + * @target: The target task. + * @regset: The user regset structure. + * @pos: The buffer position. + * @count: Number of bytes to copy. + * @kbuf: Kernel buffer to copy into. + * @ubuf: User buffer to copy from. + * + * This function sets in transaction checkpointed VMX registers. + * + * When the transaction is active 'ckvr_state' and 'ckvrsave' hold + * the checkpointed values for the current transaction to fall + * back on if it aborts in between. The userspace interface buffer + * layout is as follows. + * + * struct data { + * vector128 vr[32]; + * vector128 vscr; + * vector128 vrsave; + *}; + */ +int tm_cvmx_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret; + + BUILD_BUG_ON(TVSO(vscr) != TVSO(vr[32])); + + if (!cpu_has_feature(CPU_FTR_TM)) + return -ENODEV; + + if (!MSR_TM_ACTIVE(target->thread.regs->msr)) + return -ENODATA; + + flush_tmregs_to_thread(target); + flush_fp_to_thread(target); + flush_altivec_to_thread(target); + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &target->thread.ckvr_state, + 0, 33 * sizeof(vector128)); + if (!ret && count > 0) { + /* + * We use only the low-order word of vrsave. + */ + union { + elf_vrreg_t reg; + u32 word; + } vrsave; + memset(&vrsave, 0, sizeof(vrsave)); + vrsave.word = target->thread.ckvrsave; + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &vrsave, + 33 * sizeof(vector128), -1); + if (!ret) + target->thread.ckvrsave = vrsave.word; + } + + return ret; +} + +/** + * tm_cvsx_active - get active number of registers in CVSX + * @target: The target task. + * @regset: The user regset structure. + * + * This function checks for the active number of available + * regisers in transaction checkpointed VSX category. + */ +int tm_cvsx_active(struct task_struct *target, const struct user_regset *regset) +{ + if (!cpu_has_feature(CPU_FTR_TM)) + return -ENODEV; + + if (!MSR_TM_ACTIVE(target->thread.regs->msr)) + return 0; + + flush_vsx_to_thread(target); + return target->thread.used_vsr ? regset->n : 0; +} + +/** + * tm_cvsx_get - get CVSX registers + * @target: The target task. + * @regset: The user regset structure. + * @to: Destination of copy. + * + * This function gets in transaction checkpointed VSX registers. + * + * When the transaction is active 'ckfp_state' holds the checkpointed + * values for the current transaction to fall back on if it aborts + * in between. This function gets those checkpointed VSX registers. + * The userspace interface buffer layout is as follows. + * + * struct data { + * u64 vsx[32]; + *}; + */ +int tm_cvsx_get(struct task_struct *target, const struct user_regset *regset, + struct membuf to) +{ + u64 buf[32]; + int i; + + if (!cpu_has_feature(CPU_FTR_TM)) + return -ENODEV; + + if (!MSR_TM_ACTIVE(target->thread.regs->msr)) + return -ENODATA; + + /* Flush the state */ + flush_tmregs_to_thread(target); + flush_fp_to_thread(target); + flush_altivec_to_thread(target); + flush_vsx_to_thread(target); + + for (i = 0; i < 32 ; i++) + buf[i] = target->thread.ckfp_state.fpr[i][TS_VSRLOWOFFSET]; + return membuf_write(&to, buf, 32 * sizeof(double)); +} + +/** + * tm_cvsx_set - set CFPR registers + * @target: The target task. + * @regset: The user regset structure. + * @pos: The buffer position. + * @count: Number of bytes to copy. + * @kbuf: Kernel buffer to copy into. + * @ubuf: User buffer to copy from. + * + * This function sets in transaction checkpointed VSX registers. + * + * When the transaction is active 'ckfp_state' holds the checkpointed + * VSX register values for the current transaction to fall back on + * if it aborts in between. This function sets these checkpointed + * FPR registers. The userspace interface buffer layout is as follows. + * + * struct data { + * u64 vsx[32]; + *}; + */ +int tm_cvsx_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + u64 buf[32]; + int ret, i; + + if (!cpu_has_feature(CPU_FTR_TM)) + return -ENODEV; + + if (!MSR_TM_ACTIVE(target->thread.regs->msr)) + return -ENODATA; + + /* Flush the state */ + flush_tmregs_to_thread(target); + flush_fp_to_thread(target); + flush_altivec_to_thread(target); + flush_vsx_to_thread(target); + + for (i = 0; i < 32 ; i++) + buf[i] = target->thread.ckfp_state.fpr[i][TS_VSRLOWOFFSET]; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + buf, 0, 32 * sizeof(double)); + if (!ret) + for (i = 0; i < 32 ; i++) + target->thread.ckfp_state.fpr[i][TS_VSRLOWOFFSET] = buf[i]; + + return ret; +} + +/** + * tm_spr_active - get active number of registers in TM SPR + * @target: The target task. + * @regset: The user regset structure. + * + * This function checks the active number of available + * regisers in the transactional memory SPR category. + */ +int tm_spr_active(struct task_struct *target, const struct user_regset *regset) +{ + if (!cpu_has_feature(CPU_FTR_TM)) + return -ENODEV; + + return regset->n; +} + +/** + * tm_spr_get - get the TM related SPR registers + * @target: The target task. + * @regset: The user regset structure. + * @to: Destination of copy. + * + * This function gets transactional memory related SPR registers. + * The userspace interface buffer layout is as follows. + * + * struct { + * u64 tm_tfhar; + * u64 tm_texasr; + * u64 tm_tfiar; + * }; + */ +int tm_spr_get(struct task_struct *target, const struct user_regset *regset, + struct membuf to) +{ + /* Build tests */ + BUILD_BUG_ON(TSO(tm_tfhar) + sizeof(u64) != TSO(tm_texasr)); + BUILD_BUG_ON(TSO(tm_texasr) + sizeof(u64) != TSO(tm_tfiar)); + BUILD_BUG_ON(TSO(tm_tfiar) + sizeof(u64) != TSO(ckpt_regs)); + + if (!cpu_has_feature(CPU_FTR_TM)) + return -ENODEV; + + /* Flush the states */ + flush_tmregs_to_thread(target); + flush_fp_to_thread(target); + flush_altivec_to_thread(target); + + /* TFHAR register */ + membuf_write(&to, &target->thread.tm_tfhar, sizeof(u64)); + /* TEXASR register */ + membuf_write(&to, &target->thread.tm_texasr, sizeof(u64)); + /* TFIAR register */ + return membuf_write(&to, &target->thread.tm_tfiar, sizeof(u64)); +} + +/** + * tm_spr_set - set the TM related SPR registers + * @target: The target task. + * @regset: The user regset structure. + * @pos: The buffer position. + * @count: Number of bytes to copy. + * @kbuf: Kernel buffer to copy into. + * @ubuf: User buffer to copy from. + * + * This function sets transactional memory related SPR registers. + * The userspace interface buffer layout is as follows. + * + * struct { + * u64 tm_tfhar; + * u64 tm_texasr; + * u64 tm_tfiar; + * }; + */ +int tm_spr_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret; + + /* Build tests */ + BUILD_BUG_ON(TSO(tm_tfhar) + sizeof(u64) != TSO(tm_texasr)); + BUILD_BUG_ON(TSO(tm_texasr) + sizeof(u64) != TSO(tm_tfiar)); + BUILD_BUG_ON(TSO(tm_tfiar) + sizeof(u64) != TSO(ckpt_regs)); + + if (!cpu_has_feature(CPU_FTR_TM)) + return -ENODEV; + + /* Flush the states */ + flush_tmregs_to_thread(target); + flush_fp_to_thread(target); + flush_altivec_to_thread(target); + + /* TFHAR register */ + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.tm_tfhar, 0, sizeof(u64)); + + /* TEXASR register */ + if (!ret) + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.tm_texasr, sizeof(u64), + 2 * sizeof(u64)); + + /* TFIAR register */ + if (!ret) + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.tm_tfiar, + 2 * sizeof(u64), 3 * sizeof(u64)); + return ret; +} + +int tm_tar_active(struct task_struct *target, const struct user_regset *regset) +{ + if (!cpu_has_feature(CPU_FTR_TM)) + return -ENODEV; + + if (MSR_TM_ACTIVE(target->thread.regs->msr)) + return regset->n; + + return 0; +} + +int tm_tar_get(struct task_struct *target, const struct user_regset *regset, + struct membuf to) +{ + if (!cpu_has_feature(CPU_FTR_TM)) + return -ENODEV; + + if (!MSR_TM_ACTIVE(target->thread.regs->msr)) + return -ENODATA; + + return membuf_write(&to, &target->thread.tm_tar, sizeof(u64)); +} + +int tm_tar_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret; + + if (!cpu_has_feature(CPU_FTR_TM)) + return -ENODEV; + + if (!MSR_TM_ACTIVE(target->thread.regs->msr)) + return -ENODATA; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.tm_tar, 0, sizeof(u64)); + return ret; +} + +int tm_ppr_active(struct task_struct *target, const struct user_regset *regset) +{ + if (!cpu_has_feature(CPU_FTR_TM)) + return -ENODEV; + + if (MSR_TM_ACTIVE(target->thread.regs->msr)) + return regset->n; + + return 0; +} + + +int tm_ppr_get(struct task_struct *target, const struct user_regset *regset, + struct membuf to) +{ + if (!cpu_has_feature(CPU_FTR_TM)) + return -ENODEV; + + if (!MSR_TM_ACTIVE(target->thread.regs->msr)) + return -ENODATA; + + return membuf_write(&to, &target->thread.tm_ppr, sizeof(u64)); +} + +int tm_ppr_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret; + + if (!cpu_has_feature(CPU_FTR_TM)) + return -ENODEV; + + if (!MSR_TM_ACTIVE(target->thread.regs->msr)) + return -ENODATA; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.tm_ppr, 0, sizeof(u64)); + return ret; +} + +int tm_dscr_active(struct task_struct *target, const struct user_regset *regset) +{ + if (!cpu_has_feature(CPU_FTR_TM)) + return -ENODEV; + + if (MSR_TM_ACTIVE(target->thread.regs->msr)) + return regset->n; + + return 0; +} + +int tm_dscr_get(struct task_struct *target, const struct user_regset *regset, + struct membuf to) +{ + if (!cpu_has_feature(CPU_FTR_TM)) + return -ENODEV; + + if (!MSR_TM_ACTIVE(target->thread.regs->msr)) + return -ENODATA; + + return membuf_write(&to, &target->thread.tm_dscr, sizeof(u64)); +} + +int tm_dscr_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret; + + if (!cpu_has_feature(CPU_FTR_TM)) + return -ENODEV; + + if (!MSR_TM_ACTIVE(target->thread.regs->msr)) + return -ENODATA; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.tm_dscr, 0, sizeof(u64)); + return ret; +} + +int tm_cgpr32_get(struct task_struct *target, const struct user_regset *regset, + struct membuf to) +{ + gpr32_get_common(target, regset, to, + &target->thread.ckpt_regs.gpr[0]); + return membuf_zero(&to, ELF_NGREG * sizeof(u32)); +} + +int tm_cgpr32_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + return gpr32_set_common(target, regset, pos, count, kbuf, ubuf, + &target->thread.ckpt_regs.gpr[0]); +} diff --git a/arch/powerpc/kernel/ptrace/ptrace-view.c b/arch/powerpc/kernel/ptrace/ptrace-view.c new file mode 100644 index 000000000000..584cf5c3df50 --- /dev/null +++ b/arch/powerpc/kernel/ptrace/ptrace-view.c @@ -0,0 +1,953 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/regset.h> +#include <linux/elf.h> +#include <linux/nospec.h> +#include <linux/pkeys.h> + +#include "ptrace-decl.h" + +struct pt_regs_offset { + const char *name; + int offset; +}; + +#define STR(s) #s /* convert to string */ +#define REG_OFFSET_NAME(r) {.name = #r, .offset = offsetof(struct pt_regs, r)} +#define GPR_OFFSET_NAME(num) \ + {.name = STR(r##num), .offset = offsetof(struct pt_regs, gpr[num])}, \ + {.name = STR(gpr##num), .offset = offsetof(struct pt_regs, gpr[num])} +#define REG_OFFSET_END {.name = NULL, .offset = 0} + +static const struct pt_regs_offset regoffset_table[] = { + GPR_OFFSET_NAME(0), + GPR_OFFSET_NAME(1), + GPR_OFFSET_NAME(2), + GPR_OFFSET_NAME(3), + GPR_OFFSET_NAME(4), + GPR_OFFSET_NAME(5), + GPR_OFFSET_NAME(6), + GPR_OFFSET_NAME(7), + GPR_OFFSET_NAME(8), + GPR_OFFSET_NAME(9), + GPR_OFFSET_NAME(10), + GPR_OFFSET_NAME(11), + GPR_OFFSET_NAME(12), + GPR_OFFSET_NAME(13), + GPR_OFFSET_NAME(14), + GPR_OFFSET_NAME(15), + GPR_OFFSET_NAME(16), + GPR_OFFSET_NAME(17), + GPR_OFFSET_NAME(18), + GPR_OFFSET_NAME(19), + GPR_OFFSET_NAME(20), + GPR_OFFSET_NAME(21), + GPR_OFFSET_NAME(22), + GPR_OFFSET_NAME(23), + GPR_OFFSET_NAME(24), + GPR_OFFSET_NAME(25), + GPR_OFFSET_NAME(26), + GPR_OFFSET_NAME(27), + GPR_OFFSET_NAME(28), + GPR_OFFSET_NAME(29), + GPR_OFFSET_NAME(30), + GPR_OFFSET_NAME(31), + REG_OFFSET_NAME(nip), + REG_OFFSET_NAME(msr), + REG_OFFSET_NAME(ctr), + REG_OFFSET_NAME(link), + REG_OFFSET_NAME(xer), + REG_OFFSET_NAME(ccr), +#ifdef CONFIG_PPC64 + REG_OFFSET_NAME(softe), +#else + REG_OFFSET_NAME(mq), +#endif + REG_OFFSET_NAME(trap), + REG_OFFSET_NAME(dar), + REG_OFFSET_NAME(dsisr), + REG_OFFSET_END, +}; + +/** + * regs_query_register_offset() - query register offset from its name + * @name: the name of a register + * + * regs_query_register_offset() returns the offset of a register in struct + * pt_regs from its name. If the name is invalid, this returns -EINVAL; + */ +int regs_query_register_offset(const char *name) +{ + const struct pt_regs_offset *roff; + for (roff = regoffset_table; roff->name != NULL; roff++) + if (!strcmp(roff->name, name)) + return roff->offset; + return -EINVAL; +} + +/** + * regs_query_register_name() - query register name from its offset + * @offset: the offset of a register in struct pt_regs. + * + * regs_query_register_name() returns the name of a register from its + * offset in struct pt_regs. If the @offset is invalid, this returns NULL; + */ +const char *regs_query_register_name(unsigned int offset) +{ + const struct pt_regs_offset *roff; + for (roff = regoffset_table; roff->name != NULL; roff++) + if (roff->offset == offset) + return roff->name; + return NULL; +} + +/* + * does not yet catch signals sent when the child dies. + * in exit.c or in signal.c. + */ + +static unsigned long get_user_msr(struct task_struct *task) +{ + return task->thread.regs->msr | task->thread.fpexc_mode; +} + +static __always_inline int set_user_msr(struct task_struct *task, unsigned long msr) +{ + unsigned long newmsr = (task->thread.regs->msr & ~MSR_DEBUGCHANGE) | + (msr & MSR_DEBUGCHANGE); + regs_set_return_msr(task->thread.regs, newmsr); + return 0; +} + +#ifdef CONFIG_PPC64 +static int get_user_dscr(struct task_struct *task, unsigned long *data) +{ + *data = task->thread.dscr; + return 0; +} + +static int set_user_dscr(struct task_struct *task, unsigned long dscr) +{ + task->thread.dscr = dscr; + task->thread.dscr_inherit = 1; + return 0; +} +#else +static int get_user_dscr(struct task_struct *task, unsigned long *data) +{ + return -EIO; +} + +static int set_user_dscr(struct task_struct *task, unsigned long dscr) +{ + return -EIO; +} +#endif + +/* + * We prevent mucking around with the reserved area of trap + * which are used internally by the kernel. + */ +static __always_inline int set_user_trap(struct task_struct *task, unsigned long trap) +{ + set_trap(task->thread.regs, trap); + return 0; +} + +/* + * Get contents of register REGNO in task TASK. + */ +int ptrace_get_reg(struct task_struct *task, int regno, unsigned long *data) +{ + unsigned int regs_max; + + if (task->thread.regs == NULL || !data) + return -EIO; + + if (regno == PT_MSR) { + *data = get_user_msr(task); + return 0; + } + + if (regno == PT_DSCR) + return get_user_dscr(task, data); + + /* + * softe copies paca->irq_soft_mask variable state. Since irq_soft_mask is + * no more used as a flag, lets force usr to always see the softe value as 1 + * which means interrupts are not soft disabled. + */ + if (IS_ENABLED(CONFIG_PPC64) && regno == PT_SOFTE) { + *data = 1; + return 0; + } + + regs_max = sizeof(struct user_pt_regs) / sizeof(unsigned long); + if (regno < regs_max) { + regno = array_index_nospec(regno, regs_max); + *data = ((unsigned long *)task->thread.regs)[regno]; + return 0; + } + + return -EIO; +} + +/* + * Write contents of register REGNO in task TASK. + */ +int ptrace_put_reg(struct task_struct *task, int regno, unsigned long data) +{ + if (task->thread.regs == NULL) + return -EIO; + + if (regno == PT_MSR) + return set_user_msr(task, data); + if (regno == PT_TRAP) + return set_user_trap(task, data); + if (regno == PT_DSCR) + return set_user_dscr(task, data); + + if (regno <= PT_MAX_PUT_REG) { + regno = array_index_nospec(regno, PT_MAX_PUT_REG + 1); + ((unsigned long *)task->thread.regs)[regno] = data; + return 0; + } + return -EIO; +} + +static int gpr_get(struct task_struct *target, const struct user_regset *regset, + struct membuf to) +{ + struct membuf to_msr = membuf_at(&to, offsetof(struct pt_regs, msr)); +#ifdef CONFIG_PPC64 + struct membuf to_softe = membuf_at(&to, offsetof(struct pt_regs, softe)); +#endif + if (target->thread.regs == NULL) + return -EIO; + + membuf_write(&to, target->thread.regs, sizeof(struct user_pt_regs)); + + membuf_store(&to_msr, get_user_msr(target)); +#ifdef CONFIG_PPC64 + membuf_store(&to_softe, 0x1ul); +#endif + return membuf_zero(&to, ELF_NGREG * sizeof(unsigned long) - + sizeof(struct user_pt_regs)); +} + +static int gpr_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, const void *kbuf, + const void __user *ubuf) +{ + unsigned long reg; + int ret; + + if (target->thread.regs == NULL) + return -EIO; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + target->thread.regs, + 0, PT_MSR * sizeof(reg)); + + if (!ret && count > 0) { + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, ®, + PT_MSR * sizeof(reg), + (PT_MSR + 1) * sizeof(reg)); + if (!ret) + ret = set_user_msr(target, reg); + } + + BUILD_BUG_ON(offsetof(struct pt_regs, orig_gpr3) != + offsetof(struct pt_regs, msr) + sizeof(long)); + + if (!ret) + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.regs->orig_gpr3, + PT_ORIG_R3 * sizeof(reg), + (PT_MAX_PUT_REG + 1) * sizeof(reg)); + + if (PT_MAX_PUT_REG + 1 < PT_TRAP && !ret) + user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, + (PT_MAX_PUT_REG + 1) * sizeof(reg), + PT_TRAP * sizeof(reg)); + + if (!ret && count > 0) { + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, ®, + PT_TRAP * sizeof(reg), + (PT_TRAP + 1) * sizeof(reg)); + if (!ret) + ret = set_user_trap(target, reg); + } + + if (!ret) + user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, + (PT_TRAP + 1) * sizeof(reg), -1); + + return ret; +} + +#ifdef CONFIG_PPC64 +static int ppr_get(struct task_struct *target, const struct user_regset *regset, + struct membuf to) +{ + if (!target->thread.regs) + return -EINVAL; + + return membuf_write(&to, &target->thread.regs->ppr, sizeof(u64)); +} + +static int ppr_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, const void *kbuf, + const void __user *ubuf) +{ + if (!target->thread.regs) + return -EINVAL; + + return user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.regs->ppr, 0, sizeof(u64)); +} + +static int dscr_get(struct task_struct *target, const struct user_regset *regset, + struct membuf to) +{ + return membuf_write(&to, &target->thread.dscr, sizeof(u64)); +} +static int dscr_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, const void *kbuf, + const void __user *ubuf) +{ + return user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.dscr, 0, sizeof(u64)); +} +#endif +#ifdef CONFIG_PPC_BOOK3S_64 +static int tar_get(struct task_struct *target, const struct user_regset *regset, + struct membuf to) +{ + return membuf_write(&to, &target->thread.tar, sizeof(u64)); +} +static int tar_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, const void *kbuf, + const void __user *ubuf) +{ + return user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.tar, 0, sizeof(u64)); +} + +static int ebb_active(struct task_struct *target, const struct user_regset *regset) +{ + if (!cpu_has_feature(CPU_FTR_ARCH_207S)) + return -ENODEV; + + if (target->thread.used_ebb) + return regset->n; + + return 0; +} + +static int ebb_get(struct task_struct *target, const struct user_regset *regset, + struct membuf to) +{ + /* Build tests */ + BUILD_BUG_ON(TSO(ebbrr) + sizeof(unsigned long) != TSO(ebbhr)); + BUILD_BUG_ON(TSO(ebbhr) + sizeof(unsigned long) != TSO(bescr)); + + if (!cpu_has_feature(CPU_FTR_ARCH_207S)) + return -ENODEV; + + if (!target->thread.used_ebb) + return -ENODATA; + + return membuf_write(&to, &target->thread.ebbrr, 3 * sizeof(unsigned long)); +} + +static int ebb_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, const void *kbuf, + const void __user *ubuf) +{ + int ret = 0; + + /* Build tests */ + BUILD_BUG_ON(TSO(ebbrr) + sizeof(unsigned long) != TSO(ebbhr)); + BUILD_BUG_ON(TSO(ebbhr) + sizeof(unsigned long) != TSO(bescr)); + + if (!cpu_has_feature(CPU_FTR_ARCH_207S)) + return -ENODEV; + + if (target->thread.used_ebb) + return -ENODATA; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &target->thread.ebbrr, + 0, sizeof(unsigned long)); + + if (!ret) + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.ebbhr, sizeof(unsigned long), + 2 * sizeof(unsigned long)); + + if (!ret) + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.bescr, 2 * sizeof(unsigned long), + 3 * sizeof(unsigned long)); + + return ret; +} +static int pmu_active(struct task_struct *target, const struct user_regset *regset) +{ + if (!cpu_has_feature(CPU_FTR_ARCH_207S)) + return -ENODEV; + + return regset->n; +} + +static int pmu_get(struct task_struct *target, const struct user_regset *regset, + struct membuf to) +{ + /* Build tests */ + BUILD_BUG_ON(TSO(siar) + sizeof(unsigned long) != TSO(sdar)); + BUILD_BUG_ON(TSO(sdar) + sizeof(unsigned long) != TSO(sier)); + BUILD_BUG_ON(TSO(sier) + sizeof(unsigned long) != TSO(mmcr2)); + BUILD_BUG_ON(TSO(mmcr2) + sizeof(unsigned long) != TSO(mmcr0)); + + if (!cpu_has_feature(CPU_FTR_ARCH_207S)) + return -ENODEV; + + return membuf_write(&to, &target->thread.siar, 5 * sizeof(unsigned long)); +} + +static int pmu_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, const void *kbuf, + const void __user *ubuf) +{ + int ret = 0; + + /* Build tests */ + BUILD_BUG_ON(TSO(siar) + sizeof(unsigned long) != TSO(sdar)); + BUILD_BUG_ON(TSO(sdar) + sizeof(unsigned long) != TSO(sier)); + BUILD_BUG_ON(TSO(sier) + sizeof(unsigned long) != TSO(mmcr2)); + BUILD_BUG_ON(TSO(mmcr2) + sizeof(unsigned long) != TSO(mmcr0)); + + if (!cpu_has_feature(CPU_FTR_ARCH_207S)) + return -ENODEV; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &target->thread.siar, + 0, sizeof(unsigned long)); + + if (!ret) + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.sdar, sizeof(unsigned long), + 2 * sizeof(unsigned long)); + + if (!ret) + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.sier, 2 * sizeof(unsigned long), + 3 * sizeof(unsigned long)); + + if (!ret) + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.mmcr2, 3 * sizeof(unsigned long), + 4 * sizeof(unsigned long)); + + if (!ret) + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.mmcr0, 4 * sizeof(unsigned long), + 5 * sizeof(unsigned long)); + return ret; +} + +static int dexcr_active(struct task_struct *target, const struct user_regset *regset) +{ + if (!cpu_has_feature(CPU_FTR_ARCH_31)) + return -ENODEV; + + return regset->n; +} + +static int dexcr_get(struct task_struct *target, const struct user_regset *regset, + struct membuf to) +{ + if (!cpu_has_feature(CPU_FTR_ARCH_31)) + return -ENODEV; + + /* + * The DEXCR is currently static across all CPUs, so we don't + * store the target's value anywhere, but the static value + * will also be correct. + */ + membuf_store(&to, (u64)lower_32_bits(DEXCR_INIT)); + + /* + * Technically the HDEXCR is per-cpu, but a hypervisor can't reasonably + * change it between CPUs of the same guest. + */ + return membuf_store(&to, (u64)lower_32_bits(mfspr(SPRN_HDEXCR_RO))); +} + +#ifdef CONFIG_CHECKPOINT_RESTORE +static int hashkeyr_active(struct task_struct *target, const struct user_regset *regset) +{ + if (!cpu_has_feature(CPU_FTR_ARCH_31)) + return -ENODEV; + + return regset->n; +} + +static int hashkeyr_get(struct task_struct *target, const struct user_regset *regset, + struct membuf to) +{ + if (!cpu_has_feature(CPU_FTR_ARCH_31)) + return -ENODEV; + + return membuf_store(&to, target->thread.hashkeyr); +} + +static int hashkeyr_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, const void *kbuf, + const void __user *ubuf) +{ + if (!cpu_has_feature(CPU_FTR_ARCH_31)) + return -ENODEV; + + return user_regset_copyin(&pos, &count, &kbuf, &ubuf, &target->thread.hashkeyr, + 0, sizeof(unsigned long)); +} +#endif /* CONFIG_CHECKPOINT_RESTORE */ +#endif /* CONFIG_PPC_BOOK3S_64 */ + +#ifdef CONFIG_PPC_MEM_KEYS +static int pkey_active(struct task_struct *target, const struct user_regset *regset) +{ + if (!arch_pkeys_enabled()) + return -ENODEV; + + return regset->n; +} + +static int pkey_get(struct task_struct *target, const struct user_regset *regset, + struct membuf to) +{ + + if (!arch_pkeys_enabled()) + return -ENODEV; + + membuf_store(&to, target->thread.regs->amr); + membuf_store(&to, target->thread.regs->iamr); + return membuf_store(&to, default_uamor); +} + +static int pkey_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, const void *kbuf, + const void __user *ubuf) +{ + u64 new_amr; + int ret; + + if (!arch_pkeys_enabled()) + return -ENODEV; + + /* Only the AMR can be set from userspace */ + if (pos != 0 || count != sizeof(new_amr)) + return -EINVAL; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &new_amr, 0, sizeof(new_amr)); + if (ret) + return ret; + + /* + * UAMOR determines which bits of the AMR can be set from userspace. + * UAMOR value 0b11 indicates that the AMR value can be modified + * from userspace. If the kernel is using a specific key, we avoid + * userspace modifying the AMR value for that key by masking them + * via UAMOR 0b00. + * + * Pick the AMR values for the keys that kernel is using. This + * will be indicated by the ~default_uamor bits. + */ + target->thread.regs->amr = (new_amr & default_uamor) | + (target->thread.regs->amr & ~default_uamor); + + return 0; +} +#endif /* CONFIG_PPC_MEM_KEYS */ + +static const struct user_regset native_regsets[] = { + [REGSET_GPR] = { + .core_note_type = NT_PRSTATUS, .n = ELF_NGREG, + .size = sizeof(long), .align = sizeof(long), + .regset_get = gpr_get, .set = gpr_set + }, + [REGSET_FPR] = { + .core_note_type = NT_PRFPREG, .n = ELF_NFPREG, + .size = sizeof(double), .align = sizeof(double), + .regset_get = fpr_get, .set = fpr_set + }, +#ifdef CONFIG_ALTIVEC + [REGSET_VMX] = { + .core_note_type = NT_PPC_VMX, .n = 34, + .size = sizeof(vector128), .align = sizeof(vector128), + .active = vr_active, .regset_get = vr_get, .set = vr_set + }, +#endif +#ifdef CONFIG_VSX + [REGSET_VSX] = { + .core_note_type = NT_PPC_VSX, .n = 32, + .size = sizeof(double), .align = sizeof(double), + .active = vsr_active, .regset_get = vsr_get, .set = vsr_set + }, +#endif +#ifdef CONFIG_SPE + [REGSET_SPE] = { + .core_note_type = NT_PPC_SPE, .n = 35, + .size = sizeof(u32), .align = sizeof(u32), + .active = evr_active, .regset_get = evr_get, .set = evr_set + }, +#endif +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + [REGSET_TM_CGPR] = { + .core_note_type = NT_PPC_TM_CGPR, .n = ELF_NGREG, + .size = sizeof(long), .align = sizeof(long), + .active = tm_cgpr_active, .regset_get = tm_cgpr_get, .set = tm_cgpr_set + }, + [REGSET_TM_CFPR] = { + .core_note_type = NT_PPC_TM_CFPR, .n = ELF_NFPREG, + .size = sizeof(double), .align = sizeof(double), + .active = tm_cfpr_active, .regset_get = tm_cfpr_get, .set = tm_cfpr_set + }, + [REGSET_TM_CVMX] = { + .core_note_type = NT_PPC_TM_CVMX, .n = ELF_NVMX, + .size = sizeof(vector128), .align = sizeof(vector128), + .active = tm_cvmx_active, .regset_get = tm_cvmx_get, .set = tm_cvmx_set + }, + [REGSET_TM_CVSX] = { + .core_note_type = NT_PPC_TM_CVSX, .n = ELF_NVSX, + .size = sizeof(double), .align = sizeof(double), + .active = tm_cvsx_active, .regset_get = tm_cvsx_get, .set = tm_cvsx_set + }, + [REGSET_TM_SPR] = { + .core_note_type = NT_PPC_TM_SPR, .n = ELF_NTMSPRREG, + .size = sizeof(u64), .align = sizeof(u64), + .active = tm_spr_active, .regset_get = tm_spr_get, .set = tm_spr_set + }, + [REGSET_TM_CTAR] = { + .core_note_type = NT_PPC_TM_CTAR, .n = 1, + .size = sizeof(u64), .align = sizeof(u64), + .active = tm_tar_active, .regset_get = tm_tar_get, .set = tm_tar_set + }, + [REGSET_TM_CPPR] = { + .core_note_type = NT_PPC_TM_CPPR, .n = 1, + .size = sizeof(u64), .align = sizeof(u64), + .active = tm_ppr_active, .regset_get = tm_ppr_get, .set = tm_ppr_set + }, + [REGSET_TM_CDSCR] = { + .core_note_type = NT_PPC_TM_CDSCR, .n = 1, + .size = sizeof(u64), .align = sizeof(u64), + .active = tm_dscr_active, .regset_get = tm_dscr_get, .set = tm_dscr_set + }, +#endif +#ifdef CONFIG_PPC64 + [REGSET_PPR] = { + .core_note_type = NT_PPC_PPR, .n = 1, + .size = sizeof(u64), .align = sizeof(u64), + .regset_get = ppr_get, .set = ppr_set + }, + [REGSET_DSCR] = { + .core_note_type = NT_PPC_DSCR, .n = 1, + .size = sizeof(u64), .align = sizeof(u64), + .regset_get = dscr_get, .set = dscr_set + }, +#endif +#ifdef CONFIG_PPC_BOOK3S_64 + [REGSET_TAR] = { + .core_note_type = NT_PPC_TAR, .n = 1, + .size = sizeof(u64), .align = sizeof(u64), + .regset_get = tar_get, .set = tar_set + }, + [REGSET_EBB] = { + .core_note_type = NT_PPC_EBB, .n = ELF_NEBB, + .size = sizeof(u64), .align = sizeof(u64), + .active = ebb_active, .regset_get = ebb_get, .set = ebb_set + }, + [REGSET_PMR] = { + .core_note_type = NT_PPC_PMU, .n = ELF_NPMU, + .size = sizeof(u64), .align = sizeof(u64), + .active = pmu_active, .regset_get = pmu_get, .set = pmu_set + }, + [REGSET_DEXCR] = { + .core_note_type = NT_PPC_DEXCR, .n = ELF_NDEXCR, + .size = sizeof(u64), .align = sizeof(u64), + .active = dexcr_active, .regset_get = dexcr_get + }, +#ifdef CONFIG_CHECKPOINT_RESTORE + [REGSET_HASHKEYR] = { + .core_note_type = NT_PPC_HASHKEYR, .n = ELF_NHASHKEYR, + .size = sizeof(u64), .align = sizeof(u64), + .active = hashkeyr_active, .regset_get = hashkeyr_get, .set = hashkeyr_set + }, +#endif +#endif +#ifdef CONFIG_PPC_MEM_KEYS + [REGSET_PKEY] = { + .core_note_type = NT_PPC_PKEY, .n = ELF_NPKEY, + .size = sizeof(u64), .align = sizeof(u64), + .active = pkey_active, .regset_get = pkey_get, .set = pkey_set + }, +#endif +}; + +const struct user_regset_view user_ppc_native_view = { + .name = UTS_MACHINE, .e_machine = ELF_ARCH, .ei_osabi = ELF_OSABI, + .regsets = native_regsets, .n = ARRAY_SIZE(native_regsets) +}; + +#include <linux/compat.h> + +int gpr32_get_common(struct task_struct *target, + const struct user_regset *regset, + struct membuf to, unsigned long *regs) +{ + int i; + + for (i = 0; i < PT_MSR; i++) + membuf_store(&to, (u32)regs[i]); + membuf_store(&to, (u32)get_user_msr(target)); + for (i++ ; i < PT_REGS_COUNT; i++) + membuf_store(&to, (u32)regs[i]); + return membuf_zero(&to, (ELF_NGREG - PT_REGS_COUNT) * sizeof(u32)); +} + +static int gpr32_set_common_kernel(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, unsigned long *regs) +{ + const compat_ulong_t *k = kbuf; + + pos /= sizeof(compat_ulong_t); + count /= sizeof(compat_ulong_t); + + for (; count > 0 && pos < PT_MSR; --count) + regs[pos++] = *k++; + + if (count > 0 && pos == PT_MSR) { + set_user_msr(target, *k++); + ++pos; + --count; + } + + for (; count > 0 && pos <= PT_MAX_PUT_REG; --count) + regs[pos++] = *k++; + for (; count > 0 && pos < PT_TRAP; --count, ++pos) + ++k; + + if (count > 0 && pos == PT_TRAP) { + set_user_trap(target, *k++); + ++pos; + --count; + } + + kbuf = k; + pos *= sizeof(compat_ulong_t); + count *= sizeof(compat_ulong_t); + user_regset_copyin_ignore(&pos, &count, &kbuf, NULL, + (PT_TRAP + 1) * sizeof(compat_ulong_t), -1); + return 0; +} + +static int gpr32_set_common_user(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void __user *ubuf, unsigned long *regs) +{ + const compat_ulong_t __user *u = ubuf; + const void *kbuf = NULL; + compat_ulong_t reg; + + if (!user_read_access_begin(u, count)) + return -EFAULT; + + pos /= sizeof(reg); + count /= sizeof(reg); + + for (; count > 0 && pos < PT_MSR; --count) { + unsafe_get_user(reg, u++, Efault); + regs[pos++] = reg; + } + + if (count > 0 && pos == PT_MSR) { + unsafe_get_user(reg, u++, Efault); + set_user_msr(target, reg); + ++pos; + --count; + } + + for (; count > 0 && pos <= PT_MAX_PUT_REG; --count) { + unsafe_get_user(reg, u++, Efault); + regs[pos++] = reg; + } + for (; count > 0 && pos < PT_TRAP; --count, ++pos) + unsafe_get_user(reg, u++, Efault); + + if (count > 0 && pos == PT_TRAP) { + unsafe_get_user(reg, u++, Efault); + set_user_trap(target, reg); + ++pos; + --count; + } + user_read_access_end(); + + ubuf = u; + pos *= sizeof(reg); + count *= sizeof(reg); + user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, + (PT_TRAP + 1) * sizeof(reg), -1); + return 0; + +Efault: + user_read_access_end(); + return -EFAULT; +} + +int gpr32_set_common(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf, + unsigned long *regs) +{ + if (kbuf) + return gpr32_set_common_kernel(target, regset, pos, count, kbuf, regs); + else + return gpr32_set_common_user(target, regset, pos, count, ubuf, regs); +} + +static int gpr32_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + if (target->thread.regs == NULL) + return -EIO; + + return gpr32_get_common(target, regset, to, + &target->thread.regs->gpr[0]); +} + +static int gpr32_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + if (target->thread.regs == NULL) + return -EIO; + + return gpr32_set_common(target, regset, pos, count, kbuf, ubuf, + &target->thread.regs->gpr[0]); +} + +/* + * These are the regset flavors matching the CONFIG_PPC32 native set. + */ +static const struct user_regset compat_regsets[] = { + [REGSET_GPR] = { + .core_note_type = NT_PRSTATUS, .n = ELF_NGREG, + .size = sizeof(compat_long_t), .align = sizeof(compat_long_t), + .regset_get = gpr32_get, .set = gpr32_set + }, + [REGSET_FPR] = { + .core_note_type = NT_PRFPREG, .n = ELF_NFPREG, + .size = sizeof(double), .align = sizeof(double), + .regset_get = fpr_get, .set = fpr_set + }, +#ifdef CONFIG_ALTIVEC + [REGSET_VMX] = { + .core_note_type = NT_PPC_VMX, .n = 34, + .size = sizeof(vector128), .align = sizeof(vector128), + .active = vr_active, .regset_get = vr_get, .set = vr_set + }, +#endif +#ifdef CONFIG_SPE + [REGSET_SPE] = { + .core_note_type = NT_PPC_SPE, .n = 35, + .size = sizeof(u32), .align = sizeof(u32), + .active = evr_active, .regset_get = evr_get, .set = evr_set + }, +#endif +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + [REGSET_TM_CGPR] = { + .core_note_type = NT_PPC_TM_CGPR, .n = ELF_NGREG, + .size = sizeof(long), .align = sizeof(long), + .active = tm_cgpr_active, + .regset_get = tm_cgpr32_get, .set = tm_cgpr32_set + }, + [REGSET_TM_CFPR] = { + .core_note_type = NT_PPC_TM_CFPR, .n = ELF_NFPREG, + .size = sizeof(double), .align = sizeof(double), + .active = tm_cfpr_active, .regset_get = tm_cfpr_get, .set = tm_cfpr_set + }, + [REGSET_TM_CVMX] = { + .core_note_type = NT_PPC_TM_CVMX, .n = ELF_NVMX, + .size = sizeof(vector128), .align = sizeof(vector128), + .active = tm_cvmx_active, .regset_get = tm_cvmx_get, .set = tm_cvmx_set + }, + [REGSET_TM_CVSX] = { + .core_note_type = NT_PPC_TM_CVSX, .n = ELF_NVSX, + .size = sizeof(double), .align = sizeof(double), + .active = tm_cvsx_active, .regset_get = tm_cvsx_get, .set = tm_cvsx_set + }, + [REGSET_TM_SPR] = { + .core_note_type = NT_PPC_TM_SPR, .n = ELF_NTMSPRREG, + .size = sizeof(u64), .align = sizeof(u64), + .active = tm_spr_active, .regset_get = tm_spr_get, .set = tm_spr_set + }, + [REGSET_TM_CTAR] = { + .core_note_type = NT_PPC_TM_CTAR, .n = 1, + .size = sizeof(u64), .align = sizeof(u64), + .active = tm_tar_active, .regset_get = tm_tar_get, .set = tm_tar_set + }, + [REGSET_TM_CPPR] = { + .core_note_type = NT_PPC_TM_CPPR, .n = 1, + .size = sizeof(u64), .align = sizeof(u64), + .active = tm_ppr_active, .regset_get = tm_ppr_get, .set = tm_ppr_set + }, + [REGSET_TM_CDSCR] = { + .core_note_type = NT_PPC_TM_CDSCR, .n = 1, + .size = sizeof(u64), .align = sizeof(u64), + .active = tm_dscr_active, .regset_get = tm_dscr_get, .set = tm_dscr_set + }, +#endif +#ifdef CONFIG_PPC64 + [REGSET_PPR] = { + .core_note_type = NT_PPC_PPR, .n = 1, + .size = sizeof(u64), .align = sizeof(u64), + .regset_get = ppr_get, .set = ppr_set + }, + [REGSET_DSCR] = { + .core_note_type = NT_PPC_DSCR, .n = 1, + .size = sizeof(u64), .align = sizeof(u64), + .regset_get = dscr_get, .set = dscr_set + }, +#endif +#ifdef CONFIG_PPC_BOOK3S_64 + [REGSET_TAR] = { + .core_note_type = NT_PPC_TAR, .n = 1, + .size = sizeof(u64), .align = sizeof(u64), + .regset_get = tar_get, .set = tar_set + }, + [REGSET_EBB] = { + .core_note_type = NT_PPC_EBB, .n = ELF_NEBB, + .size = sizeof(u64), .align = sizeof(u64), + .active = ebb_active, .regset_get = ebb_get, .set = ebb_set + }, +#endif +}; + +static const struct user_regset_view user_ppc_compat_view = { + .name = "ppc", .e_machine = EM_PPC, .ei_osabi = ELF_OSABI, + .regsets = compat_regsets, .n = ARRAY_SIZE(compat_regsets) +}; + +const struct user_regset_view *task_user_regset_view(struct task_struct *task) +{ + if (IS_ENABLED(CONFIG_COMPAT) && is_tsk_32bit_task(task)) + return &user_ppc_compat_view; + return &user_ppc_native_view; +} diff --git a/arch/powerpc/kernel/ptrace/ptrace-vsx.c b/arch/powerpc/kernel/ptrace/ptrace-vsx.c new file mode 100644 index 000000000000..7df08004c47d --- /dev/null +++ b/arch/powerpc/kernel/ptrace/ptrace-vsx.c @@ -0,0 +1,148 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/regset.h> + +#include <asm/switch_to.h> + +#include "ptrace-decl.h" + +/* + * Regardless of transactions, 'fp_state' holds the current running + * value of all FPR registers and 'ckfp_state' holds the last checkpointed + * value of all FPR registers for the current transaction. + * + * Userspace interface buffer layout: + * + * struct data { + * u64 fpr[32]; + * u64 fpscr; + * }; + */ +int fpr_get(struct task_struct *target, const struct user_regset *regset, + struct membuf to) +{ + u64 buf[33]; + int i; + + flush_fp_to_thread(target); + + /* copy to local buffer then write that out */ + for (i = 0; i < 32 ; i++) + buf[i] = target->thread.TS_FPR(i); + buf[32] = target->thread.fp_state.fpscr; + return membuf_write(&to, buf, 33 * sizeof(u64)); +} + +/* + * Regardless of transactions, 'fp_state' holds the current running + * value of all FPR registers and 'ckfp_state' holds the last checkpointed + * value of all FPR registers for the current transaction. + * + * Userspace interface buffer layout: + * + * struct data { + * u64 fpr[32]; + * u64 fpscr; + * }; + * + */ +int fpr_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + u64 buf[33]; + int i; + + flush_fp_to_thread(target); + + for (i = 0; i < 32 ; i++) + buf[i] = target->thread.TS_FPR(i); + buf[32] = target->thread.fp_state.fpscr; + + /* copy to local buffer then write that out */ + i = user_regset_copyin(&pos, &count, &kbuf, &ubuf, buf, 0, -1); + if (i) + return i; + + for (i = 0; i < 32 ; i++) + target->thread.TS_FPR(i) = buf[i]; + target->thread.fp_state.fpscr = buf[32]; + return 0; +} + +/* + * Currently to set and get all the vsx state, you need to call + * the fp and VMX calls as well. This only get/sets the lower 32 + * 128bit VSX registers. + */ + +int vsr_active(struct task_struct *target, const struct user_regset *regset) +{ + flush_vsx_to_thread(target); + return target->thread.used_vsr ? regset->n : 0; +} + +/* + * Regardless of transactions, 'fp_state' holds the current running + * value of all FPR registers and 'ckfp_state' holds the last + * checkpointed value of all FPR registers for the current + * transaction. + * + * Userspace interface buffer layout: + * + * struct data { + * u64 vsx[32]; + * }; + */ +int vsr_get(struct task_struct *target, const struct user_regset *regset, + struct membuf to) +{ + u64 buf[32]; + int i; + + flush_tmregs_to_thread(target); + flush_fp_to_thread(target); + flush_altivec_to_thread(target); + flush_vsx_to_thread(target); + + for (i = 0; i < 32 ; i++) + buf[i] = target->thread.fp_state.fpr[i][TS_VSRLOWOFFSET]; + + return membuf_write(&to, buf, 32 * sizeof(double)); +} + +/* + * Regardless of transactions, 'fp_state' holds the current running + * value of all FPR registers and 'ckfp_state' holds the last + * checkpointed value of all FPR registers for the current + * transaction. + * + * Userspace interface buffer layout: + * + * struct data { + * u64 vsx[32]; + * }; + */ +int vsr_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + u64 buf[32]; + int ret, i; + + flush_tmregs_to_thread(target); + flush_fp_to_thread(target); + flush_altivec_to_thread(target); + flush_vsx_to_thread(target); + + for (i = 0; i < 32 ; i++) + buf[i] = target->thread.fp_state.fpr[i][TS_VSRLOWOFFSET]; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + buf, 0, 32 * sizeof(double)); + if (!ret) + for (i = 0; i < 32 ; i++) + target->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = buf[i]; + + return ret; +} diff --git a/arch/powerpc/kernel/ptrace/ptrace.c b/arch/powerpc/kernel/ptrace/ptrace.c new file mode 100644 index 000000000000..727ed4a14545 --- /dev/null +++ b/arch/powerpc/kernel/ptrace/ptrace.c @@ -0,0 +1,447 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * PowerPC version + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Derived from "arch/m68k/kernel/ptrace.c" + * Copyright (C) 1994 by Hamish Macdonald + * Taken from linux/kernel/ptrace.c and modified for M680x0. + * linux/kernel/ptrace.c is by Ross Biro 1/23/92, edited by Linus Torvalds + * + * Modified by Cort Dougan (cort@hq.fsmlabs.com) + * and Paul Mackerras (paulus@samba.org). + */ + +#include <linux/regset.h> +#include <linux/ptrace.h> +#include <linux/audit.h> +#include <linux/context_tracking.h> +#include <linux/syscalls.h> + +#include <asm/switch_to.h> +#include <asm/debug.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/syscalls.h> + +#include "ptrace-decl.h" + +/* + * Called by kernel/ptrace.c when detaching.. + * + * Make sure single step bits etc are not set. + */ +void ptrace_disable(struct task_struct *child) +{ + /* make sure the single step bit is not set. */ + user_disable_single_step(child); +} + +long arch_ptrace(struct task_struct *child, long request, + unsigned long addr, unsigned long data) +{ + int ret = -EPERM; + void __user *datavp = (void __user *) data; + unsigned long __user *datalp = datavp; + + switch (request) { + /* read the word at location addr in the USER area. */ + case PTRACE_PEEKUSR: { + unsigned long index, tmp; + + ret = -EIO; + /* convert to index and check */ + index = addr / sizeof(long); + if ((addr & (sizeof(long) - 1)) || !child->thread.regs) + break; + + if (index < PT_FPR0) + ret = ptrace_get_reg(child, (int) index, &tmp); + else + ret = ptrace_get_fpr(child, index, &tmp); + + if (ret) + break; + ret = put_user(tmp, datalp); + break; + } + + /* write the word at location addr in the USER area */ + case PTRACE_POKEUSR: { + unsigned long index; + + ret = -EIO; + /* convert to index and check */ + index = addr / sizeof(long); + if ((addr & (sizeof(long) - 1)) || !child->thread.regs) + break; + + if (index < PT_FPR0) + ret = ptrace_put_reg(child, index, data); + else + ret = ptrace_put_fpr(child, index, data); + break; + } + + case PPC_PTRACE_GETHWDBGINFO: { + struct ppc_debug_info dbginfo; + + ppc_gethwdinfo(&dbginfo); + + if (copy_to_user(datavp, &dbginfo, + sizeof(struct ppc_debug_info))) + return -EFAULT; + return 0; + } + + case PPC_PTRACE_SETHWDEBUG: { + struct ppc_hw_breakpoint bp_info; + + if (copy_from_user(&bp_info, datavp, + sizeof(struct ppc_hw_breakpoint))) + return -EFAULT; + return ppc_set_hwdebug(child, &bp_info); + } + + case PPC_PTRACE_DELHWDEBUG: { + ret = ppc_del_hwdebug(child, data); + break; + } + + case PTRACE_GET_DEBUGREG: + ret = ptrace_get_debugreg(child, addr, datalp); + break; + + case PTRACE_SET_DEBUGREG: + ret = ptrace_set_debugreg(child, addr, data); + break; + +#ifdef CONFIG_PPC64 + case PTRACE_GETREGS64: +#endif + case PTRACE_GETREGS: /* Get all pt_regs from the child. */ + return copy_regset_to_user(child, &user_ppc_native_view, + REGSET_GPR, + 0, sizeof(struct user_pt_regs), + datavp); + +#ifdef CONFIG_PPC64 + case PTRACE_SETREGS64: +#endif + case PTRACE_SETREGS: /* Set all gp regs in the child. */ + return copy_regset_from_user(child, &user_ppc_native_view, + REGSET_GPR, + 0, sizeof(struct user_pt_regs), + datavp); + + case PTRACE_GETFPREGS: /* Get the child FPU state (FPR0...31 + FPSCR) */ + return copy_regset_to_user(child, &user_ppc_native_view, + REGSET_FPR, + 0, sizeof(elf_fpregset_t), + datavp); + + case PTRACE_SETFPREGS: /* Set the child FPU state (FPR0...31 + FPSCR) */ + return copy_regset_from_user(child, &user_ppc_native_view, + REGSET_FPR, + 0, sizeof(elf_fpregset_t), + datavp); + +#ifdef CONFIG_ALTIVEC + case PTRACE_GETVRREGS: + return copy_regset_to_user(child, &user_ppc_native_view, + REGSET_VMX, + 0, (33 * sizeof(vector128) + + sizeof(u32)), + datavp); + + case PTRACE_SETVRREGS: + return copy_regset_from_user(child, &user_ppc_native_view, + REGSET_VMX, + 0, (33 * sizeof(vector128) + + sizeof(u32)), + datavp); +#endif +#ifdef CONFIG_VSX + case PTRACE_GETVSRREGS: + return copy_regset_to_user(child, &user_ppc_native_view, + REGSET_VSX, + 0, 32 * sizeof(double), + datavp); + + case PTRACE_SETVSRREGS: + return copy_regset_from_user(child, &user_ppc_native_view, + REGSET_VSX, + 0, 32 * sizeof(double), + datavp); +#endif +#ifdef CONFIG_SPE + case PTRACE_GETEVRREGS: + /* Get the child spe register state. */ + return copy_regset_to_user(child, &user_ppc_native_view, + REGSET_SPE, 0, 35 * sizeof(u32), + datavp); + + case PTRACE_SETEVRREGS: + /* Set the child spe register state. */ + return copy_regset_from_user(child, &user_ppc_native_view, + REGSET_SPE, 0, 35 * sizeof(u32), + datavp); +#endif + + default: + ret = ptrace_request(child, request, addr, data); + break; + } + return ret; +} + +#ifdef CONFIG_SECCOMP +static int do_seccomp(struct pt_regs *regs) +{ + if (!test_thread_flag(TIF_SECCOMP)) + return 0; + + /* + * The ABI we present to seccomp tracers is that r3 contains + * the syscall return value and orig_gpr3 contains the first + * syscall parameter. This is different to the ptrace ABI where + * both r3 and orig_gpr3 contain the first syscall parameter. + */ + regs->gpr[3] = -ENOSYS; + + /* + * We use the __ version here because we have already checked + * TIF_SECCOMP. If this fails, there is nothing left to do, we + * have already loaded -ENOSYS into r3, or seccomp has put + * something else in r3 (via SECCOMP_RET_ERRNO/TRACE). + */ + if (__secure_computing(NULL)) + return -1; + + /* + * The syscall was allowed by seccomp, restore the register + * state to what audit expects. + * Note that we use orig_gpr3, which means a seccomp tracer can + * modify the first syscall parameter (in orig_gpr3) and also + * allow the syscall to proceed. + */ + regs->gpr[3] = regs->orig_gpr3; + + return 0; +} +#else +static inline int do_seccomp(struct pt_regs *regs) { return 0; } +#endif /* CONFIG_SECCOMP */ + +/** + * do_syscall_trace_enter() - Do syscall tracing on kernel entry. + * @regs: the pt_regs of the task to trace (current) + * + * Performs various types of tracing on syscall entry. This includes seccomp, + * ptrace, syscall tracepoints and audit. + * + * The pt_regs are potentially visible to userspace via ptrace, so their + * contents is ABI. + * + * One or more of the tracers may modify the contents of pt_regs, in particular + * to modify arguments or even the syscall number itself. + * + * It's also possible that a tracer can choose to reject the system call. In + * that case this function will return an illegal syscall number, and will put + * an appropriate return value in regs->r3. + * + * Return: the (possibly changed) syscall number. + */ +long do_syscall_trace_enter(struct pt_regs *regs) +{ + u32 flags; + + flags = read_thread_flags() & (_TIF_SYSCALL_EMU | _TIF_SYSCALL_TRACE); + + if (flags) { + int rc = ptrace_report_syscall_entry(regs); + + if (unlikely(flags & _TIF_SYSCALL_EMU)) { + /* + * A nonzero return code from + * ptrace_report_syscall_entry() tells us to prevent + * the syscall execution, but we are not going to + * execute it anyway. + * + * Returning -1 will skip the syscall execution. We want + * to avoid clobbering any registers, so we don't goto + * the skip label below. + */ + return -1; + } + + if (rc) { + /* + * The tracer decided to abort the syscall. Note that + * the tracer may also just change regs->gpr[0] to an + * invalid syscall number, that is handled below on the + * exit path. + */ + goto skip; + } + } + + /* Run seccomp after ptrace; allow it to set gpr[3]. */ + if (do_seccomp(regs)) + return -1; + + /* Avoid trace and audit when syscall is invalid. */ + if (regs->gpr[0] >= NR_syscalls) + goto skip; + + if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) + trace_sys_enter(regs, regs->gpr[0]); + + if (!is_32bit_task()) + audit_syscall_entry(regs->gpr[0], regs->gpr[3], regs->gpr[4], + regs->gpr[5], regs->gpr[6]); + else + audit_syscall_entry(regs->gpr[0], + regs->gpr[3] & 0xffffffff, + regs->gpr[4] & 0xffffffff, + regs->gpr[5] & 0xffffffff, + regs->gpr[6] & 0xffffffff); + + /* Return the possibly modified but valid syscall number */ + return regs->gpr[0]; + +skip: + /* + * If we are aborting explicitly, or if the syscall number is + * now invalid, set the return value to -ENOSYS. + */ + regs->gpr[3] = -ENOSYS; + return -1; +} + +void do_syscall_trace_leave(struct pt_regs *regs) +{ + int step; + + audit_syscall_exit(regs); + + if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) + trace_sys_exit(regs, regs->result); + + step = test_thread_flag(TIF_SINGLESTEP); + if (step || test_thread_flag(TIF_SYSCALL_TRACE)) + ptrace_report_syscall_exit(regs, step); +} + +void __init pt_regs_check(void); + +/* + * Dummy function, its purpose is to break the build if struct pt_regs and + * struct user_pt_regs don't match. + */ +void __init pt_regs_check(void) +{ + BUILD_BUG_ON(offsetof(struct pt_regs, gpr) != + offsetof(struct user_pt_regs, gpr)); + BUILD_BUG_ON(offsetof(struct pt_regs, nip) != + offsetof(struct user_pt_regs, nip)); + BUILD_BUG_ON(offsetof(struct pt_regs, msr) != + offsetof(struct user_pt_regs, msr)); + BUILD_BUG_ON(offsetof(struct pt_regs, orig_gpr3) != + offsetof(struct user_pt_regs, orig_gpr3)); + BUILD_BUG_ON(offsetof(struct pt_regs, ctr) != + offsetof(struct user_pt_regs, ctr)); + BUILD_BUG_ON(offsetof(struct pt_regs, link) != + offsetof(struct user_pt_regs, link)); + BUILD_BUG_ON(offsetof(struct pt_regs, xer) != + offsetof(struct user_pt_regs, xer)); + BUILD_BUG_ON(offsetof(struct pt_regs, ccr) != + offsetof(struct user_pt_regs, ccr)); +#ifdef __powerpc64__ + BUILD_BUG_ON(offsetof(struct pt_regs, softe) != + offsetof(struct user_pt_regs, softe)); +#else + BUILD_BUG_ON(offsetof(struct pt_regs, mq) != + offsetof(struct user_pt_regs, mq)); +#endif + BUILD_BUG_ON(offsetof(struct pt_regs, trap) != + offsetof(struct user_pt_regs, trap)); + BUILD_BUG_ON(offsetof(struct pt_regs, dar) != + offsetof(struct user_pt_regs, dar)); + BUILD_BUG_ON(offsetof(struct pt_regs, dear) != + offsetof(struct user_pt_regs, dar)); + BUILD_BUG_ON(offsetof(struct pt_regs, dsisr) != + offsetof(struct user_pt_regs, dsisr)); + BUILD_BUG_ON(offsetof(struct pt_regs, esr) != + offsetof(struct user_pt_regs, dsisr)); + BUILD_BUG_ON(offsetof(struct pt_regs, result) != + offsetof(struct user_pt_regs, result)); + + BUILD_BUG_ON(sizeof(struct user_pt_regs) > sizeof(struct pt_regs)); + + // Now check that the pt_regs offsets match the uapi #defines + #define CHECK_REG(_pt, _reg) \ + BUILD_BUG_ON(_pt != (offsetof(struct user_pt_regs, _reg) / \ + sizeof(unsigned long))); + + CHECK_REG(PT_R0, gpr[0]); + CHECK_REG(PT_R1, gpr[1]); + CHECK_REG(PT_R2, gpr[2]); + CHECK_REG(PT_R3, gpr[3]); + CHECK_REG(PT_R4, gpr[4]); + CHECK_REG(PT_R5, gpr[5]); + CHECK_REG(PT_R6, gpr[6]); + CHECK_REG(PT_R7, gpr[7]); + CHECK_REG(PT_R8, gpr[8]); + CHECK_REG(PT_R9, gpr[9]); + CHECK_REG(PT_R10, gpr[10]); + CHECK_REG(PT_R11, gpr[11]); + CHECK_REG(PT_R12, gpr[12]); + CHECK_REG(PT_R13, gpr[13]); + CHECK_REG(PT_R14, gpr[14]); + CHECK_REG(PT_R15, gpr[15]); + CHECK_REG(PT_R16, gpr[16]); + CHECK_REG(PT_R17, gpr[17]); + CHECK_REG(PT_R18, gpr[18]); + CHECK_REG(PT_R19, gpr[19]); + CHECK_REG(PT_R20, gpr[20]); + CHECK_REG(PT_R21, gpr[21]); + CHECK_REG(PT_R22, gpr[22]); + CHECK_REG(PT_R23, gpr[23]); + CHECK_REG(PT_R24, gpr[24]); + CHECK_REG(PT_R25, gpr[25]); + CHECK_REG(PT_R26, gpr[26]); + CHECK_REG(PT_R27, gpr[27]); + CHECK_REG(PT_R28, gpr[28]); + CHECK_REG(PT_R29, gpr[29]); + CHECK_REG(PT_R30, gpr[30]); + CHECK_REG(PT_R31, gpr[31]); + CHECK_REG(PT_NIP, nip); + CHECK_REG(PT_MSR, msr); + CHECK_REG(PT_ORIG_R3, orig_gpr3); + CHECK_REG(PT_CTR, ctr); + CHECK_REG(PT_LNK, link); + CHECK_REG(PT_XER, xer); + CHECK_REG(PT_CCR, ccr); +#ifdef CONFIG_PPC64 + CHECK_REG(PT_SOFTE, softe); +#else + CHECK_REG(PT_MQ, mq); +#endif + CHECK_REG(PT_TRAP, trap); + CHECK_REG(PT_DAR, dar); + CHECK_REG(PT_DSISR, dsisr); + CHECK_REG(PT_RESULT, result); + #undef CHECK_REG + + BUILD_BUG_ON(PT_REGS_COUNT != sizeof(struct user_pt_regs) / sizeof(unsigned long)); + + /* + * PT_DSCR isn't a real reg, but it's important that it doesn't overlap the + * real registers. + */ + BUILD_BUG_ON(PT_DSCR < sizeof(struct user_pt_regs) / sizeof(unsigned long)); + + // ptrace_get/put_fpr() rely on PPC32 and VSX being incompatible + BUILD_BUG_ON(IS_ENABLED(CONFIG_PPC32) && IS_ENABLED(CONFIG_VSX)); +} diff --git a/arch/powerpc/kernel/ptrace32.c b/arch/powerpc/kernel/ptrace/ptrace32.c index f37eb53de1a1..19c224808982 100644 --- a/arch/powerpc/kernel/ptrace32.c +++ b/arch/powerpc/kernel/ptrace/ptrace32.c @@ -17,23 +17,14 @@ * this archive for more details. */ -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/mm.h> -#include <linux/smp.h> -#include <linux/errno.h> #include <linux/ptrace.h> #include <linux/regset.h> -#include <linux/user.h> -#include <linux/security.h> -#include <linux/signal.h> #include <linux/compat.h> -#include <linux/uaccess.h> -#include <asm/page.h> -#include <asm/pgtable.h> #include <asm/switch_to.h> +#include "ptrace-decl.h" + /* * does not yet catch signals sent when the child dies. * in exit.c or in signal.c. @@ -92,7 +83,6 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, if ((addr & 3) || (index > PT_FPSCR32)) break; - CHECK_FULL_REGS(child->thread.regs); if (index < PT_FPR0) { ret = ptrace_get_reg(child, index, &tmp); if (ret) @@ -142,7 +132,6 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, if ((addr & 3) || numReg > PT_FPSCR) break; - CHECK_FULL_REGS(child->thread.regs); if (numReg >= PT_FPR0) { flush_fp_to_thread(child); /* get 64 bit FPR */ @@ -196,7 +185,6 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, if ((addr & 3) || (index > PT_FPSCR32)) break; - CHECK_FULL_REGS(child->thread.regs); if (index < PT_FPR0) { ret = ptrace_put_reg(child, index, data); } else { @@ -235,7 +223,6 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, */ if ((addr & 3) || (numReg > PT_FPSCR)) break; - CHECK_FULL_REGS(child->thread.regs); if (numReg < PT_FPR0) { unsigned long freg; ret = ptrace_get_reg(child, numReg, &freg); @@ -270,8 +257,8 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, ret = put_user(child->thread.debug.dac1, (u32 __user *)data); #else dabr_fake = ( - (child->thread.hw_brk.address & (~HW_BRK_TYPE_DABR)) | - (child->thread.hw_brk.type & HW_BRK_TYPE_DABR)); + (child->thread.hw_brk[0].address & (~HW_BRK_TYPE_DABR)) | + (child->thread.hw_brk[0].type & HW_BRK_TYPE_DABR)); ret = put_user(dabr_fake, (u32 __user *)data); #endif break; diff --git a/arch/powerpc/kernel/reloc_32.S b/arch/powerpc/kernel/reloc_32.S index 10e96f3e22fe..0508c14b4c28 100644 --- a/arch/powerpc/kernel/reloc_32.S +++ b/arch/powerpc/kernel/reloc_32.S @@ -30,7 +30,7 @@ R_PPC_RELATIVE = 22 _GLOBAL(relocate) mflr r0 /* Save our LR */ - bl 0f /* Find our current runtime address */ + bcl 20,31,$+4 /* Find our current runtime address */ 0: mflr r12 /* Make it accessible */ mtlr r0 diff --git a/arch/powerpc/kernel/reloc_64.S b/arch/powerpc/kernel/reloc_64.S index 02d4719bf43a..efd52f2e7033 100644 --- a/arch/powerpc/kernel/reloc_64.S +++ b/arch/powerpc/kernel/reloc_64.S @@ -8,8 +8,10 @@ #include <asm/ppc_asm.h> RELA = 7 -RELACOUNT = 0x6ffffff9 +RELASZ = 8 +RELAENT = 9 R_PPC64_RELATIVE = 22 +R_PPC64_UADDR64 = 43 /* * r3 = desired final address of kernel @@ -25,29 +27,38 @@ _GLOBAL(relocate) add r9,r9,r12 /* r9 has runtime addr of .rela.dyn section */ ld r10,(p_st - 0b)(r12) add r10,r10,r12 /* r10 has runtime addr of _stext */ + ld r4,(p_sym - 0b)(r12) + add r4,r4,r12 /* r4 has runtime addr of .dynsym */ /* - * Scan the dynamic section for the RELA and RELACOUNT entries. + * Scan the dynamic section for the RELA, RELASZ and RELAENT entries. */ li r7,0 li r8,0 -1: ld r6,0(r11) /* get tag */ +.Ltags: + ld r6,0(r11) /* get tag */ cmpdi r6,0 - beq 4f /* end of list */ + beq .Lend_of_list /* end of list */ cmpdi r6,RELA bne 2f ld r7,8(r11) /* get RELA pointer in r7 */ - b 3f -2: addis r6,r6,(-RELACOUNT)@ha - cmpdi r6,RELACOUNT@l + b 4f +2: cmpdi r6,RELASZ bne 3f - ld r8,8(r11) /* get RELACOUNT value in r8 */ -3: addi r11,r11,16 - b 1b -4: cmpdi r7,0 /* check we have both RELA and RELACOUNT */ + ld r8,8(r11) /* get RELASZ value in r8 */ + b 4f +3: cmpdi r6,RELAENT + bne 4f + ld r12,8(r11) /* get RELAENT value in r12 */ +4: addi r11,r11,16 + b .Ltags +.Lend_of_list: + cmpdi r7,0 /* check we have RELA, RELASZ, RELAENT */ cmpdi cr1,r8,0 - beq 6f - beq cr1,6f + beq .Lout + beq cr1,.Lout + cmpdi r12,0 + beq .Lout /* * Work out linktime address of _stext and hence the @@ -62,23 +73,39 @@ _GLOBAL(relocate) /* * Run through the list of relocations and process the - * R_PPC64_RELATIVE ones. + * R_PPC64_RELATIVE and R_PPC64_UADDR64 ones. */ + divd r8,r8,r12 /* RELASZ / RELAENT */ mtctr r8 -5: ld r0,8(9) /* ELF64_R_TYPE(reloc->r_info) */ +.Lrels: ld r0,8(r9) /* ELF64_R_TYPE(reloc->r_info) */ cmpdi r0,R_PPC64_RELATIVE - bne 6f + bne .Luaddr64 ld r6,0(r9) /* reloc->r_offset */ ld r0,16(r9) /* reloc->r_addend */ + b .Lstore +.Luaddr64: + srdi r5,r0,32 /* ELF64_R_SYM(reloc->r_info) */ + clrldi r0,r0,32 + cmpdi r0,R_PPC64_UADDR64 + bne .Lnext + ld r6,0(r9) + ld r0,16(r9) + mulli r5,r5,24 /* 24 == sizeof(elf64_sym) */ + add r5,r5,r4 /* elf64_sym[ELF64_R_SYM] */ + ld r5,8(r5) + add r0,r0,r5 +.Lstore: add r0,r0,r3 stdx r0,r7,r6 - addi r9,r9,24 - bdnz 5b - -6: blr +.Lnext: + add r9,r9,r12 + bdnz .Lrels +.Lout: + blr .balign 8 p_dyn: .8byte __dynamic_start - 0b p_rela: .8byte __rela_dyn_start - 0b +p_sym: .8byte __dynamic_symtab - 0b p_st: .8byte _stext - 0b diff --git a/arch/powerpc/kernel/rtas-proc.c b/arch/powerpc/kernel/rtas-proc.c index 487dcd8da4de..f38df72e64b8 100644 --- a/arch/powerpc/kernel/rtas-proc.c +++ b/arch/powerpc/kernel/rtas-proc.c @@ -24,11 +24,11 @@ #include <linux/seq_file.h> #include <linux/bitops.h> #include <linux/rtc.h> +#include <linux/of.h> #include <linux/uaccess.h> #include <asm/processor.h> #include <asm/io.h> -#include <asm/prom.h> #include <asm/rtas.h> #include <asm/machdep.h> /* for ppc_md */ #include <asm/time.h> @@ -159,12 +159,12 @@ static int poweron_open(struct inode *inode, struct file *file) return single_open(file, ppc_rtas_poweron_show, NULL); } -static const struct file_operations ppc_rtas_poweron_operations = { - .open = poweron_open, - .read = seq_read, - .llseek = seq_lseek, - .write = ppc_rtas_poweron_write, - .release = single_release, +static const struct proc_ops ppc_rtas_poweron_proc_ops = { + .proc_open = poweron_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_write = ppc_rtas_poweron_write, + .proc_release = single_release, }; static int progress_open(struct inode *inode, struct file *file) @@ -172,12 +172,12 @@ static int progress_open(struct inode *inode, struct file *file) return single_open(file, ppc_rtas_progress_show, NULL); } -static const struct file_operations ppc_rtas_progress_operations = { - .open = progress_open, - .read = seq_read, - .llseek = seq_lseek, - .write = ppc_rtas_progress_write, - .release = single_release, +static const struct proc_ops ppc_rtas_progress_proc_ops = { + .proc_open = progress_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_write = ppc_rtas_progress_write, + .proc_release = single_release, }; static int clock_open(struct inode *inode, struct file *file) @@ -185,12 +185,12 @@ static int clock_open(struct inode *inode, struct file *file) return single_open(file, ppc_rtas_clock_show, NULL); } -static const struct file_operations ppc_rtas_clock_operations = { - .open = clock_open, - .read = seq_read, - .llseek = seq_lseek, - .write = ppc_rtas_clock_write, - .release = single_release, +static const struct proc_ops ppc_rtas_clock_proc_ops = { + .proc_open = clock_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_write = ppc_rtas_clock_write, + .proc_release = single_release, }; static int tone_freq_open(struct inode *inode, struct file *file) @@ -198,12 +198,12 @@ static int tone_freq_open(struct inode *inode, struct file *file) return single_open(file, ppc_rtas_tone_freq_show, NULL); } -static const struct file_operations ppc_rtas_tone_freq_operations = { - .open = tone_freq_open, - .read = seq_read, - .llseek = seq_lseek, - .write = ppc_rtas_tone_freq_write, - .release = single_release, +static const struct proc_ops ppc_rtas_tone_freq_proc_ops = { + .proc_open = tone_freq_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_write = ppc_rtas_tone_freq_write, + .proc_release = single_release, }; static int tone_volume_open(struct inode *inode, struct file *file) @@ -211,12 +211,12 @@ static int tone_volume_open(struct inode *inode, struct file *file) return single_open(file, ppc_rtas_tone_volume_show, NULL); } -static const struct file_operations ppc_rtas_tone_volume_operations = { - .open = tone_volume_open, - .read = seq_read, - .llseek = seq_lseek, - .write = ppc_rtas_tone_volume_write, - .release = single_release, +static const struct proc_ops ppc_rtas_tone_volume_proc_ops = { + .proc_open = tone_volume_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_write = ppc_rtas_tone_volume_write, + .proc_release = single_release, }; static int ppc_rtas_find_all_sensors(void); @@ -238,17 +238,17 @@ static int __init proc_rtas_init(void) return -ENODEV; proc_create("powerpc/rtas/progress", 0644, NULL, - &ppc_rtas_progress_operations); + &ppc_rtas_progress_proc_ops); proc_create("powerpc/rtas/clock", 0644, NULL, - &ppc_rtas_clock_operations); + &ppc_rtas_clock_proc_ops); proc_create("powerpc/rtas/poweron", 0644, NULL, - &ppc_rtas_poweron_operations); + &ppc_rtas_poweron_proc_ops); proc_create_single("powerpc/rtas/sensors", 0444, NULL, ppc_rtas_sensors_show); proc_create("powerpc/rtas/frequency", 0644, NULL, - &ppc_rtas_tone_freq_operations); + &ppc_rtas_tone_freq_proc_ops); proc_create("powerpc/rtas/volume", 0644, NULL, - &ppc_rtas_tone_volume_operations); + &ppc_rtas_tone_volume_proc_ops); proc_create_single("powerpc/rtas/rmo_buffer", 0400, NULL, ppc_rtas_rmo_buf_show); return 0; @@ -259,7 +259,6 @@ __initcall(proc_rtas_init); static int parse_number(const char __user *p, size_t count, u64 *val) { char buf[40]; - char *end; if (count > 39) return -EINVAL; @@ -269,11 +268,7 @@ static int parse_number(const char __user *p, size_t count, u64 *val) buf[count] = 0; - *val = simple_strtoull(buf, &end, 10); - if (*end && *end != '\n') - return -EINVAL; - - return 0; + return kstrtoull(buf, 10, val); } /* ****************************************************************** */ @@ -292,9 +287,9 @@ static ssize_t ppc_rtas_poweron_write(struct file *file, rtc_time64_to_tm(nowtime, &tm); - error = rtas_call(rtas_token("set-time-for-power-on"), 7, 1, NULL, - tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday, - tm.tm_hour, tm.tm_min, tm.tm_sec, 0 /* nano */); + error = rtas_call(rtas_function_token(RTAS_FN_SET_TIME_FOR_POWER_ON), 7, 1, NULL, + tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday, + tm.tm_hour, tm.tm_min, tm.tm_sec, 0 /* nano */); if (error) printk(KERN_WARNING "error: setting poweron time returned: %s\n", ppc_rtas_process_error(error)); @@ -355,9 +350,9 @@ static ssize_t ppc_rtas_clock_write(struct file *file, return error; rtc_time64_to_tm(nowtime, &tm); - error = rtas_call(rtas_token("set-time-of-day"), 7, 1, NULL, - tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday, - tm.tm_hour, tm.tm_min, tm.tm_sec, 0); + error = rtas_call(rtas_function_token(RTAS_FN_SET_TIME_OF_DAY), 7, 1, NULL, + tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday, + tm.tm_hour, tm.tm_min, tm.tm_sec, 0); if (error) printk(KERN_WARNING "error: setting the clock returned: %s\n", ppc_rtas_process_error(error)); @@ -367,7 +362,7 @@ static ssize_t ppc_rtas_clock_write(struct file *file, static int ppc_rtas_clock_show(struct seq_file *m, void *v) { int ret[8]; - int error = rtas_call(rtas_token("get-time-of-day"), 0, 8, ret); + int error = rtas_call(rtas_function_token(RTAS_FN_GET_TIME_OF_DAY), 0, 8, ret); if (error) { printk(KERN_WARNING "error: reading the clock returned: %s\n", @@ -390,7 +385,7 @@ static int ppc_rtas_sensors_show(struct seq_file *m, void *v) { int i,j; int state, error; - int get_sensor_state = rtas_token("get-sensor-state"); + int get_sensor_state = rtas_function_token(RTAS_FN_GET_SENSOR_STATE); seq_printf(m, "RTAS (RunTime Abstraction Services) Sensor Information\n"); seq_printf(m, "Sensor\t\tValue\t\tCondition\tLocation\n"); @@ -713,8 +708,8 @@ static ssize_t ppc_rtas_tone_freq_write(struct file *file, return error; rtas_tone_frequency = freq; /* save it for later */ - error = rtas_call(rtas_token("set-indicator"), 3, 1, NULL, - TONE_FREQUENCY, 0, freq); + error = rtas_call(rtas_function_token(RTAS_FN_SET_INDICATOR), 3, 1, NULL, + TONE_FREQUENCY, 0, freq); if (error) printk(KERN_WARNING "error: setting tone frequency returned: %s\n", ppc_rtas_process_error(error)); @@ -741,8 +736,8 @@ static ssize_t ppc_rtas_tone_volume_write(struct file *file, volume = 100; rtas_tone_volume = volume; /* save it for later */ - error = rtas_call(rtas_token("set-indicator"), 3, 1, NULL, - TONE_VOLUME, 0, volume); + error = rtas_call(rtas_function_token(RTAS_FN_SET_INDICATOR), 3, 1, NULL, + TONE_VOLUME, 0, volume); if (error) printk(KERN_WARNING "error: setting tone volume returned: %s\n", ppc_rtas_process_error(error)); @@ -755,11 +750,20 @@ static int ppc_rtas_tone_volume_show(struct seq_file *m, void *v) return 0; } -#define RMO_READ_BUF_MAX 30 - -/* RTAS Userspace access */ +/** + * ppc_rtas_rmo_buf_show() - Describe RTAS-addressable region for user space. + * @m: seq_file output target. + * @v: Unused. + * + * Base + size description of a range of RTAS-addressable memory set + * aside for user space to use as work area(s) for certain RTAS + * functions. User space accesses this region via /dev/mem. Apart from + * security policies, the kernel does not arbitrate or serialize + * access to this region, and user space must ensure that concurrent + * users do not interfere with each other. + */ static int ppc_rtas_rmo_buf_show(struct seq_file *m, void *v) { - seq_printf(m, "%016lx %x\n", rtas_rmo_buf, RTAS_RMOBUF_MAX); + seq_printf(m, "%016lx %x\n", rtas_rmo_buf, RTAS_USER_REGION_SIZE); return 0; } diff --git a/arch/powerpc/kernel/rtas-rtc.c b/arch/powerpc/kernel/rtas-rtc.c index a28239b8b0c0..6996214532bd 100644 --- a/arch/powerpc/kernel/rtas-rtc.c +++ b/arch/powerpc/kernel/rtas-rtc.c @@ -6,13 +6,12 @@ #include <linux/rtc.h> #include <linux/delay.h> #include <linux/ratelimit.h> -#include <asm/prom.h> #include <asm/rtas.h> #include <asm/time.h> #define MAX_RTC_WAIT 5000 /* 5 sec */ -#define RTAS_CLOCK_BUSY (-2) + time64_t __init rtas_get_boot_time(void) { int ret[8]; @@ -22,7 +21,7 @@ time64_t __init rtas_get_boot_time(void) max_wait_tb = get_tb() + tb_ticks_per_usec * 1000 * MAX_RTC_WAIT; do { - error = rtas_call(rtas_token("get-time-of-day"), 0, 8, ret); + error = rtas_call(rtas_function_token(RTAS_FN_GET_TIME_OF_DAY), 0, 8, ret); wait_time = rtas_busy_delay_time(error); if (wait_time) { @@ -54,7 +53,7 @@ void rtas_get_rtc_time(struct rtc_time *rtc_tm) max_wait_tb = get_tb() + tb_ticks_per_usec * 1000 * MAX_RTC_WAIT; do { - error = rtas_call(rtas_token("get-time-of-day"), 0, 8, ret); + error = rtas_call(rtas_function_token(RTAS_FN_GET_TIME_OF_DAY), 0, 8, ret); wait_time = rtas_busy_delay_time(error); if (wait_time) { @@ -91,7 +90,7 @@ int rtas_set_rtc_time(struct rtc_time *tm) max_wait_tb = get_tb() + tb_ticks_per_usec * 1000 * MAX_RTC_WAIT; do { - error = rtas_call(rtas_token("set-time-of-day"), 7, 1, NULL, + error = rtas_call(rtas_function_token(RTAS_FN_SET_TIME_OF_DAY), 7, 1, NULL, tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday, tm->tm_hour, tm->tm_min, tm->tm_sec, 0); diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index c5fa251b8950..7e793b503e29 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -7,86 +7,745 @@ * Copyright (C) 2001 IBM. */ -#include <stdarg.h> -#include <linux/kernel.h> -#include <linux/types.h> -#include <linux/spinlock.h> -#include <linux/export.h> -#include <linux/init.h> +#define pr_fmt(fmt) "rtas: " fmt + +#include <linux/bsearch.h> #include <linux/capability.h> #include <linux/delay.h> -#include <linux/cpu.h> -#include <linux/sched.h> -#include <linux/smp.h> -#include <linux/completion.h> -#include <linux/cpumask.h> +#include <linux/export.h> +#include <linux/init.h> +#include <linux/kconfig.h> +#include <linux/kernel.h> +#include <linux/lockdep.h> #include <linux/memblock.h> -#include <linux/slab.h> +#include <linux/mutex.h> +#include <linux/of.h> +#include <linux/of_fdt.h> #include <linux/reboot.h> +#include <linux/sched.h> +#include <linux/security.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/stdarg.h> #include <linux/syscalls.h> +#include <linux/types.h> +#include <linux/uaccess.h> +#include <linux/xarray.h> -#include <asm/prom.h> -#include <asm/rtas.h> -#include <asm/hvcall.h> -#include <asm/machdep.h> +#include <asm/delay.h> #include <asm/firmware.h> +#include <asm/interrupt.h> +#include <asm/machdep.h> +#include <asm/mmu.h> #include <asm/page.h> -#include <asm/param.h> -#include <asm/delay.h> -#include <linux/uaccess.h> -#include <asm/udbg.h> -#include <asm/syscalls.h> -#include <asm/smp.h> -#include <linux/atomic.h> +#include <asm/rtas-work-area.h> +#include <asm/rtas.h> #include <asm/time.h> -#include <asm/mmu.h> -#include <asm/topology.h> +#include <asm/trace.h> +#include <asm/udbg.h> -/* This is here deliberately so it's only used in this file */ -void enter_rtas(unsigned long); +struct rtas_filter { + /* Indexes into the args buffer, -1 if not used */ + const int buf_idx1; + const int size_idx1; + const int buf_idx2; + const int size_idx2; + /* + * Assumed buffer size per the spec if the function does not + * have a size parameter, e.g. ibm,errinjct. 0 if unused. + */ + const int fixed_size; +}; -struct rtas_t rtas = { - .lock = __ARCH_SPIN_LOCK_UNLOCKED +/** + * struct rtas_function - Descriptor for RTAS functions. + * + * @token: Value of @name if it exists under the /rtas node. + * @name: Function name. + * @filter: If non-NULL, invoking this function via the rtas syscall is + * generally allowed, and @filter describes constraints on the + * arguments. See also @banned_for_syscall_on_le. + * @banned_for_syscall_on_le: Set when call via sys_rtas is generally allowed + * but specifically restricted on ppc64le. Such + * functions are believed to have no users on + * ppc64le, and we want to keep it that way. It does + * not make sense for this to be set when @filter + * is NULL. + * @lock: Pointer to an optional dedicated per-function mutex. This + * should be set for functions that require multiple calls in + * sequence to complete a single operation, and such sequences + * will disrupt each other if allowed to interleave. Users of + * this function are required to hold the associated lock for + * the duration of the call sequence. Add an explanatory + * comment to the function table entry if setting this member. + */ +struct rtas_function { + s32 token; + const bool banned_for_syscall_on_le:1; + const char * const name; + const struct rtas_filter *filter; + struct mutex *lock; }; -EXPORT_SYMBOL(rtas); -DEFINE_SPINLOCK(rtas_data_buf_lock); -EXPORT_SYMBOL(rtas_data_buf_lock); +/* + * Per-function locks for sequence-based RTAS functions. + */ +static DEFINE_MUTEX(rtas_ibm_activate_firmware_lock); +static DEFINE_MUTEX(rtas_ibm_get_dynamic_sensor_state_lock); +static DEFINE_MUTEX(rtas_ibm_get_indices_lock); +static DEFINE_MUTEX(rtas_ibm_lpar_perftools_lock); +static DEFINE_MUTEX(rtas_ibm_physical_attestation_lock); +static DEFINE_MUTEX(rtas_ibm_set_dynamic_indicator_lock); +DEFINE_MUTEX(rtas_ibm_get_vpd_lock); + +static struct rtas_function rtas_function_table[] __ro_after_init = { + [RTAS_FNIDX__CHECK_EXCEPTION] = { + .name = "check-exception", + }, + [RTAS_FNIDX__DISPLAY_CHARACTER] = { + .name = "display-character", + .filter = &(const struct rtas_filter) { + .buf_idx1 = -1, .size_idx1 = -1, + .buf_idx2 = -1, .size_idx2 = -1, + }, + }, + [RTAS_FNIDX__EVENT_SCAN] = { + .name = "event-scan", + }, + [RTAS_FNIDX__FREEZE_TIME_BASE] = { + .name = "freeze-time-base", + }, + [RTAS_FNIDX__GET_POWER_LEVEL] = { + .name = "get-power-level", + .filter = &(const struct rtas_filter) { + .buf_idx1 = -1, .size_idx1 = -1, + .buf_idx2 = -1, .size_idx2 = -1, + }, + }, + [RTAS_FNIDX__GET_SENSOR_STATE] = { + .name = "get-sensor-state", + .filter = &(const struct rtas_filter) { + .buf_idx1 = -1, .size_idx1 = -1, + .buf_idx2 = -1, .size_idx2 = -1, + }, + }, + [RTAS_FNIDX__GET_TERM_CHAR] = { + .name = "get-term-char", + }, + [RTAS_FNIDX__GET_TIME_OF_DAY] = { + .name = "get-time-of-day", + .filter = &(const struct rtas_filter) { + .buf_idx1 = -1, .size_idx1 = -1, + .buf_idx2 = -1, .size_idx2 = -1, + }, + }, + [RTAS_FNIDX__IBM_ACTIVATE_FIRMWARE] = { + .name = "ibm,activate-firmware", + .filter = &(const struct rtas_filter) { + .buf_idx1 = -1, .size_idx1 = -1, + .buf_idx2 = -1, .size_idx2 = -1, + }, + /* + * PAPR+ as of v2.13 doesn't explicitly impose any + * restriction, but this typically requires multiple + * calls before success, and there's no reason to + * allow sequences to interleave. + */ + .lock = &rtas_ibm_activate_firmware_lock, + }, + [RTAS_FNIDX__IBM_CBE_START_PTCAL] = { + .name = "ibm,cbe-start-ptcal", + }, + [RTAS_FNIDX__IBM_CBE_STOP_PTCAL] = { + .name = "ibm,cbe-stop-ptcal", + }, + [RTAS_FNIDX__IBM_CHANGE_MSI] = { + .name = "ibm,change-msi", + }, + [RTAS_FNIDX__IBM_CLOSE_ERRINJCT] = { + .name = "ibm,close-errinjct", + .filter = &(const struct rtas_filter) { + .buf_idx1 = -1, .size_idx1 = -1, + .buf_idx2 = -1, .size_idx2 = -1, + }, + }, + [RTAS_FNIDX__IBM_CONFIGURE_BRIDGE] = { + .name = "ibm,configure-bridge", + }, + [RTAS_FNIDX__IBM_CONFIGURE_CONNECTOR] = { + .name = "ibm,configure-connector", + .filter = &(const struct rtas_filter) { + .buf_idx1 = 0, .size_idx1 = -1, + .buf_idx2 = 1, .size_idx2 = -1, + .fixed_size = 4096, + }, + }, + [RTAS_FNIDX__IBM_CONFIGURE_KERNEL_DUMP] = { + .name = "ibm,configure-kernel-dump", + }, + [RTAS_FNIDX__IBM_CONFIGURE_PE] = { + .name = "ibm,configure-pe", + }, + [RTAS_FNIDX__IBM_CREATE_PE_DMA_WINDOW] = { + .name = "ibm,create-pe-dma-window", + }, + [RTAS_FNIDX__IBM_DISPLAY_MESSAGE] = { + .name = "ibm,display-message", + .filter = &(const struct rtas_filter) { + .buf_idx1 = 0, .size_idx1 = -1, + .buf_idx2 = -1, .size_idx2 = -1, + }, + }, + [RTAS_FNIDX__IBM_ERRINJCT] = { + .name = "ibm,errinjct", + .filter = &(const struct rtas_filter) { + .buf_idx1 = 2, .size_idx1 = -1, + .buf_idx2 = -1, .size_idx2 = -1, + .fixed_size = 1024, + }, + }, + [RTAS_FNIDX__IBM_EXTI2C] = { + .name = "ibm,exti2c", + }, + [RTAS_FNIDX__IBM_GET_CONFIG_ADDR_INFO] = { + .name = "ibm,get-config-addr-info", + }, + [RTAS_FNIDX__IBM_GET_CONFIG_ADDR_INFO2] = { + .name = "ibm,get-config-addr-info2", + .filter = &(const struct rtas_filter) { + .buf_idx1 = -1, .size_idx1 = -1, + .buf_idx2 = -1, .size_idx2 = -1, + }, + }, + [RTAS_FNIDX__IBM_GET_DYNAMIC_SENSOR_STATE] = { + .name = "ibm,get-dynamic-sensor-state", + .filter = &(const struct rtas_filter) { + .buf_idx1 = 1, .size_idx1 = -1, + .buf_idx2 = -1, .size_idx2 = -1, + }, + /* + * PAPR+ v2.13 R1–7.3.19–3 is explicit that the OS + * must not call ibm,get-dynamic-sensor-state with + * different inputs until a non-retry status has been + * returned. + */ + .lock = &rtas_ibm_get_dynamic_sensor_state_lock, + }, + [RTAS_FNIDX__IBM_GET_INDICES] = { + .name = "ibm,get-indices", + .filter = &(const struct rtas_filter) { + .buf_idx1 = 2, .size_idx1 = 3, + .buf_idx2 = -1, .size_idx2 = -1, + }, + /* + * PAPR+ v2.13 R1–7.3.17–2 says that the OS must not + * interleave ibm,get-indices call sequences with + * different inputs. + */ + .lock = &rtas_ibm_get_indices_lock, + }, + [RTAS_FNIDX__IBM_GET_RIO_TOPOLOGY] = { + .name = "ibm,get-rio-topology", + }, + [RTAS_FNIDX__IBM_GET_SYSTEM_PARAMETER] = { + .name = "ibm,get-system-parameter", + .filter = &(const struct rtas_filter) { + .buf_idx1 = 1, .size_idx1 = 2, + .buf_idx2 = -1, .size_idx2 = -1, + }, + }, + [RTAS_FNIDX__IBM_GET_VPD] = { + .name = "ibm,get-vpd", + .filter = &(const struct rtas_filter) { + .buf_idx1 = 0, .size_idx1 = -1, + .buf_idx2 = 1, .size_idx2 = 2, + }, + /* + * PAPR+ v2.13 R1–7.3.20–4 indicates that sequences + * should not be allowed to interleave. + */ + .lock = &rtas_ibm_get_vpd_lock, + }, + [RTAS_FNIDX__IBM_GET_XIVE] = { + .name = "ibm,get-xive", + }, + [RTAS_FNIDX__IBM_INT_OFF] = { + .name = "ibm,int-off", + }, + [RTAS_FNIDX__IBM_INT_ON] = { + .name = "ibm,int-on", + }, + [RTAS_FNIDX__IBM_IO_QUIESCE_ACK] = { + .name = "ibm,io-quiesce-ack", + }, + [RTAS_FNIDX__IBM_LPAR_PERFTOOLS] = { + .name = "ibm,lpar-perftools", + .filter = &(const struct rtas_filter) { + .buf_idx1 = 2, .size_idx1 = 3, + .buf_idx2 = -1, .size_idx2 = -1, + }, + /* + * PAPR+ v2.13 R1–7.3.26–6 says the OS should allow + * only one call sequence in progress at a time. + */ + .lock = &rtas_ibm_lpar_perftools_lock, + }, + [RTAS_FNIDX__IBM_MANAGE_FLASH_IMAGE] = { + .name = "ibm,manage-flash-image", + }, + [RTAS_FNIDX__IBM_MANAGE_STORAGE_PRESERVATION] = { + .name = "ibm,manage-storage-preservation", + }, + [RTAS_FNIDX__IBM_NMI_INTERLOCK] = { + .name = "ibm,nmi-interlock", + }, + [RTAS_FNIDX__IBM_NMI_REGISTER] = { + .name = "ibm,nmi-register", + }, + [RTAS_FNIDX__IBM_OPEN_ERRINJCT] = { + .name = "ibm,open-errinjct", + .filter = &(const struct rtas_filter) { + .buf_idx1 = -1, .size_idx1 = -1, + .buf_idx2 = -1, .size_idx2 = -1, + }, + }, + [RTAS_FNIDX__IBM_OPEN_SRIOV_ALLOW_UNFREEZE] = { + .name = "ibm,open-sriov-allow-unfreeze", + }, + [RTAS_FNIDX__IBM_OPEN_SRIOV_MAP_PE_NUMBER] = { + .name = "ibm,open-sriov-map-pe-number", + }, + [RTAS_FNIDX__IBM_OS_TERM] = { + .name = "ibm,os-term", + }, + [RTAS_FNIDX__IBM_PARTNER_CONTROL] = { + .name = "ibm,partner-control", + }, + [RTAS_FNIDX__IBM_PHYSICAL_ATTESTATION] = { + .name = "ibm,physical-attestation", + .filter = &(const struct rtas_filter) { + .buf_idx1 = 0, .size_idx1 = 1, + .buf_idx2 = -1, .size_idx2 = -1, + }, + /* + * This follows a sequence-based pattern similar to + * ibm,get-vpd et al. Since PAPR+ restricts + * interleaving call sequences for other functions of + * this style, assume the restriction applies here, + * even though it's not explicit in the spec. + */ + .lock = &rtas_ibm_physical_attestation_lock, + }, + [RTAS_FNIDX__IBM_PLATFORM_DUMP] = { + .name = "ibm,platform-dump", + .filter = &(const struct rtas_filter) { + .buf_idx1 = 4, .size_idx1 = 5, + .buf_idx2 = -1, .size_idx2 = -1, + }, + /* + * PAPR+ v2.13 7.3.3.4.1 indicates that concurrent + * sequences of ibm,platform-dump are allowed if they + * are operating on different dump tags. So leave the + * lock pointer unset for now. This may need + * reconsideration if kernel-internal users appear. + */ + }, + [RTAS_FNIDX__IBM_POWER_OFF_UPS] = { + .name = "ibm,power-off-ups", + }, + [RTAS_FNIDX__IBM_QUERY_INTERRUPT_SOURCE_NUMBER] = { + .name = "ibm,query-interrupt-source-number", + }, + [RTAS_FNIDX__IBM_QUERY_PE_DMA_WINDOW] = { + .name = "ibm,query-pe-dma-window", + }, + [RTAS_FNIDX__IBM_READ_PCI_CONFIG] = { + .name = "ibm,read-pci-config", + }, + [RTAS_FNIDX__IBM_READ_SLOT_RESET_STATE] = { + .name = "ibm,read-slot-reset-state", + .filter = &(const struct rtas_filter) { + .buf_idx1 = -1, .size_idx1 = -1, + .buf_idx2 = -1, .size_idx2 = -1, + }, + }, + [RTAS_FNIDX__IBM_READ_SLOT_RESET_STATE2] = { + .name = "ibm,read-slot-reset-state2", + }, + [RTAS_FNIDX__IBM_REMOVE_PE_DMA_WINDOW] = { + .name = "ibm,remove-pe-dma-window", + }, + [RTAS_FNIDX__IBM_RESET_PE_DMA_WINDOWS] = { + .name = "ibm,reset-pe-dma-windows", + }, + [RTAS_FNIDX__IBM_SCAN_LOG_DUMP] = { + .name = "ibm,scan-log-dump", + .filter = &(const struct rtas_filter) { + .buf_idx1 = 0, .size_idx1 = 1, + .buf_idx2 = -1, .size_idx2 = -1, + }, + }, + [RTAS_FNIDX__IBM_SET_DYNAMIC_INDICATOR] = { + .name = "ibm,set-dynamic-indicator", + .filter = &(const struct rtas_filter) { + .buf_idx1 = 2, .size_idx1 = -1, + .buf_idx2 = -1, .size_idx2 = -1, + }, + /* + * PAPR+ v2.13 R1–7.3.18–3 says the OS must not call + * this function with different inputs until a + * non-retry status has been returned. + */ + .lock = &rtas_ibm_set_dynamic_indicator_lock, + }, + [RTAS_FNIDX__IBM_SET_EEH_OPTION] = { + .name = "ibm,set-eeh-option", + .filter = &(const struct rtas_filter) { + .buf_idx1 = -1, .size_idx1 = -1, + .buf_idx2 = -1, .size_idx2 = -1, + }, + }, + [RTAS_FNIDX__IBM_SET_SLOT_RESET] = { + .name = "ibm,set-slot-reset", + }, + [RTAS_FNIDX__IBM_SET_SYSTEM_PARAMETER] = { + .name = "ibm,set-system-parameter", + .filter = &(const struct rtas_filter) { + .buf_idx1 = 1, .size_idx1 = -1, + .buf_idx2 = -1, .size_idx2 = -1, + }, + }, + [RTAS_FNIDX__IBM_SET_XIVE] = { + .name = "ibm,set-xive", + }, + [RTAS_FNIDX__IBM_SLOT_ERROR_DETAIL] = { + .name = "ibm,slot-error-detail", + }, + [RTAS_FNIDX__IBM_SUSPEND_ME] = { + .name = "ibm,suspend-me", + .banned_for_syscall_on_le = true, + .filter = &(const struct rtas_filter) { + .buf_idx1 = -1, .size_idx1 = -1, + .buf_idx2 = -1, .size_idx2 = -1, + }, + }, + [RTAS_FNIDX__IBM_TUNE_DMA_PARMS] = { + .name = "ibm,tune-dma-parms", + }, + [RTAS_FNIDX__IBM_UPDATE_FLASH_64_AND_REBOOT] = { + .name = "ibm,update-flash-64-and-reboot", + }, + [RTAS_FNIDX__IBM_UPDATE_NODES] = { + .name = "ibm,update-nodes", + .banned_for_syscall_on_le = true, + .filter = &(const struct rtas_filter) { + .buf_idx1 = 0, .size_idx1 = -1, + .buf_idx2 = -1, .size_idx2 = -1, + .fixed_size = 4096, + }, + }, + [RTAS_FNIDX__IBM_UPDATE_PROPERTIES] = { + .name = "ibm,update-properties", + .banned_for_syscall_on_le = true, + .filter = &(const struct rtas_filter) { + .buf_idx1 = 0, .size_idx1 = -1, + .buf_idx2 = -1, .size_idx2 = -1, + .fixed_size = 4096, + }, + }, + [RTAS_FNIDX__IBM_VALIDATE_FLASH_IMAGE] = { + .name = "ibm,validate-flash-image", + }, + [RTAS_FNIDX__IBM_WRITE_PCI_CONFIG] = { + .name = "ibm,write-pci-config", + }, + [RTAS_FNIDX__NVRAM_FETCH] = { + .name = "nvram-fetch", + }, + [RTAS_FNIDX__NVRAM_STORE] = { + .name = "nvram-store", + }, + [RTAS_FNIDX__POWER_OFF] = { + .name = "power-off", + }, + [RTAS_FNIDX__PUT_TERM_CHAR] = { + .name = "put-term-char", + }, + [RTAS_FNIDX__QUERY_CPU_STOPPED_STATE] = { + .name = "query-cpu-stopped-state", + }, + [RTAS_FNIDX__READ_PCI_CONFIG] = { + .name = "read-pci-config", + }, + [RTAS_FNIDX__RTAS_LAST_ERROR] = { + .name = "rtas-last-error", + }, + [RTAS_FNIDX__SET_INDICATOR] = { + .name = "set-indicator", + .filter = &(const struct rtas_filter) { + .buf_idx1 = -1, .size_idx1 = -1, + .buf_idx2 = -1, .size_idx2 = -1, + }, + }, + [RTAS_FNIDX__SET_POWER_LEVEL] = { + .name = "set-power-level", + .filter = &(const struct rtas_filter) { + .buf_idx1 = -1, .size_idx1 = -1, + .buf_idx2 = -1, .size_idx2 = -1, + }, + }, + [RTAS_FNIDX__SET_TIME_FOR_POWER_ON] = { + .name = "set-time-for-power-on", + .filter = &(const struct rtas_filter) { + .buf_idx1 = -1, .size_idx1 = -1, + .buf_idx2 = -1, .size_idx2 = -1, + }, + }, + [RTAS_FNIDX__SET_TIME_OF_DAY] = { + .name = "set-time-of-day", + .filter = &(const struct rtas_filter) { + .buf_idx1 = -1, .size_idx1 = -1, + .buf_idx2 = -1, .size_idx2 = -1, + }, + }, + [RTAS_FNIDX__START_CPU] = { + .name = "start-cpu", + }, + [RTAS_FNIDX__STOP_SELF] = { + .name = "stop-self", + }, + [RTAS_FNIDX__SYSTEM_REBOOT] = { + .name = "system-reboot", + }, + [RTAS_FNIDX__THAW_TIME_BASE] = { + .name = "thaw-time-base", + }, + [RTAS_FNIDX__WRITE_PCI_CONFIG] = { + .name = "write-pci-config", + }, +}; -char rtas_data_buf[RTAS_DATA_BUF_SIZE] __cacheline_aligned; -EXPORT_SYMBOL(rtas_data_buf); +#define for_each_rtas_function(funcp) \ + for (funcp = &rtas_function_table[0]; \ + funcp < &rtas_function_table[ARRAY_SIZE(rtas_function_table)]; \ + ++funcp) -unsigned long rtas_rmo_buf; +/* + * Nearly all RTAS calls need to be serialized. All uses of the + * default rtas_args block must hold rtas_lock. + * + * Exceptions to the RTAS serialization requirement (e.g. stop-self) + * must use a separate rtas_args structure. + */ +static DEFINE_RAW_SPINLOCK(rtas_lock); +static struct rtas_args rtas_args; + +/** + * rtas_function_token() - RTAS function token lookup. + * @handle: Function handle, e.g. RTAS_FN_EVENT_SCAN. + * + * Context: Any context. + * Return: the token value for the function if implemented by this platform, + * otherwise RTAS_UNKNOWN_SERVICE. + */ +s32 rtas_function_token(const rtas_fn_handle_t handle) +{ + const size_t index = handle.index; + const bool out_of_bounds = index >= ARRAY_SIZE(rtas_function_table); + + if (WARN_ONCE(out_of_bounds, "invalid function index %zu", index)) + return RTAS_UNKNOWN_SERVICE; + /* + * Various drivers attempt token lookups on non-RTAS + * platforms. + */ + if (!rtas.dev) + return RTAS_UNKNOWN_SERVICE; + + return rtas_function_table[index].token; +} +EXPORT_SYMBOL_GPL(rtas_function_token); + +static int rtas_function_cmp(const void *a, const void *b) +{ + const struct rtas_function *f1 = a; + const struct rtas_function *f2 = b; + + return strcmp(f1->name, f2->name); +} /* - * If non-NULL, this gets called when the kernel terminates. - * This is done like this so rtas_flash can be a module. + * Boot-time initialization of the function table needs the lookup to + * return a non-const-qualified object. Use rtas_name_to_function() + * in all other contexts. */ -void (*rtas_flash_term_hook)(int); -EXPORT_SYMBOL(rtas_flash_term_hook); +static struct rtas_function *__rtas_name_to_function(const char *name) +{ + const struct rtas_function key = { + .name = name, + }; + struct rtas_function *found; + + found = bsearch(&key, rtas_function_table, ARRAY_SIZE(rtas_function_table), + sizeof(rtas_function_table[0]), rtas_function_cmp); -/* RTAS use home made raw locking instead of spin_lock_irqsave - * because those can be called from within really nasty contexts - * such as having the timebase stopped which would lockup with - * normal locks and spinlock debugging enabled + return found; +} + +static const struct rtas_function *rtas_name_to_function(const char *name) +{ + return __rtas_name_to_function(name); +} + +static DEFINE_XARRAY(rtas_token_to_function_xarray); + +static int __init rtas_token_to_function_xarray_init(void) +{ + const struct rtas_function *func; + int err = 0; + + for_each_rtas_function(func) { + const s32 token = func->token; + + if (token == RTAS_UNKNOWN_SERVICE) + continue; + + err = xa_err(xa_store(&rtas_token_to_function_xarray, + token, (void *)func, GFP_KERNEL)); + if (err) + break; + } + + return err; +} +arch_initcall(rtas_token_to_function_xarray_init); + +/* + * For use by sys_rtas(), where the token value is provided by user + * space and we don't want to warn on failed lookups. */ -static unsigned long lock_rtas(void) +static const struct rtas_function *rtas_token_to_function_untrusted(s32 token) { - unsigned long flags; + return xa_load(&rtas_token_to_function_xarray, token); +} + +/* + * Reverse lookup for deriving the function descriptor from a + * known-good token value in contexts where the former is not already + * available. @token must be valid, e.g. derived from the result of a + * prior lookup against the function table. + */ +static const struct rtas_function *rtas_token_to_function(s32 token) +{ + const struct rtas_function *func; + + if (WARN_ONCE(token < 0, "invalid token %d", token)) + return NULL; + + func = rtas_token_to_function_untrusted(token); + if (func) + return func; + /* + * Fall back to linear scan in case the reverse mapping hasn't + * been initialized yet. + */ + if (xa_empty(&rtas_token_to_function_xarray)) { + for_each_rtas_function(func) { + if (func->token == token) + return func; + } + } - local_irq_save(flags); - preempt_disable(); - arch_spin_lock(&rtas.lock); - return flags; + WARN_ONCE(true, "unexpected failed lookup for token %d", token); + return NULL; } -static void unlock_rtas(unsigned long flags) +/* This is here deliberately so it's only used in this file */ +void enter_rtas(unsigned long); + +static void __do_enter_rtas(struct rtas_args *args) { - arch_spin_unlock(&rtas.lock); - local_irq_restore(flags); - preempt_enable(); + enter_rtas(__pa(args)); + srr_regs_clobbered(); /* rtas uses SRRs, invalidate */ +} + +static void __do_enter_rtas_trace(struct rtas_args *args) +{ + const struct rtas_function *func = rtas_token_to_function(be32_to_cpu(args->token)); + + /* + * If there is a per-function lock, it must be held by the + * caller. + */ + if (func->lock) + lockdep_assert_held(func->lock); + + if (args == &rtas_args) + lockdep_assert_held(&rtas_lock); + + trace_rtas_input(args, func->name); + trace_rtas_ll_entry(args); + + __do_enter_rtas(args); + + trace_rtas_ll_exit(args); + trace_rtas_output(args, func->name); } +static void do_enter_rtas(struct rtas_args *args) +{ + const unsigned long msr = mfmsr(); + /* + * Situations where we want to skip any active tracepoints for + * safety reasons: + * + * 1. The last code executed on an offline CPU as it stops, + * i.e. we're about to call stop-self. The tracepoints' + * function name lookup uses xarray, which uses RCU, which + * isn't valid to call on an offline CPU. Any events + * emitted on an offline CPU will be discarded anyway. + * + * 2. In real mode, as when invoking ibm,nmi-interlock from + * the pseries MCE handler. We cannot count on trace + * buffers or the entries in rtas_token_to_function_xarray + * to be contained in the RMO. + */ + const unsigned long mask = MSR_IR | MSR_DR; + const bool can_trace = likely(cpu_online(raw_smp_processor_id()) && + (msr & mask) == mask); + /* + * Make sure MSR[RI] is currently enabled as it will be forced later + * in enter_rtas. + */ + BUG_ON(!(msr & MSR_RI)); + + BUG_ON(!irqs_disabled()); + + hard_irq_disable(); /* Ensure MSR[EE] is disabled on PPC64 */ + + if (can_trace) + __do_enter_rtas_trace(args); + else + __do_enter_rtas(args); +} + +struct rtas_t rtas; + +DEFINE_SPINLOCK(rtas_data_buf_lock); +EXPORT_SYMBOL_GPL(rtas_data_buf_lock); + +char rtas_data_buf[RTAS_DATA_BUF_SIZE] __aligned(SZ_4K); +EXPORT_SYMBOL_GPL(rtas_data_buf); + +unsigned long rtas_rmo_buf; + +/* + * If non-NULL, this gets called when the kernel terminates. + * This is done like this so rtas_flash can be a module. + */ +void (*rtas_flash_term_hook)(int); +EXPORT_SYMBOL_GPL(rtas_flash_term_hook); + /* * call_rtas_display_status and call_rtas_display_status_delay * are designed only for very early low-level debugging, which @@ -94,14 +753,14 @@ static void unlock_rtas(unsigned long flags) */ static void call_rtas_display_status(unsigned char c) { - unsigned long s; + unsigned long flags; if (!rtas.base) return; - s = lock_rtas(); - rtas_call_unlocked(&rtas.args, 10, 1, 1, NULL, c); - unlock_rtas(s); + raw_spin_lock_irqsave(&rtas_lock, flags); + rtas_call_unlocked(&rtas_args, 10, 1, 1, NULL, c); + raw_spin_unlock_irqrestore(&rtas_lock, flags); } static void call_rtas_display_status_delay(char c) @@ -109,7 +768,7 @@ static void call_rtas_display_status_delay(char c) static int pending_newline = 0; /* did last write end with unprinted newline? */ static int width = 16; - if (c == '\n') { + if (c == '\n') { while (width-- > 0) call_rtas_display_status(' '); width = 16; @@ -119,7 +778,7 @@ static void call_rtas_display_status_delay(char c) if (pending_newline) { call_rtas_display_status('\r'); call_rtas_display_status('\n'); - } + } pending_newline = 0; if (width--) { call_rtas_display_status(c); @@ -225,8 +884,8 @@ void rtas_progress(char *s, unsigned short hex) "ibm,display-truncation-length", NULL); of_node_put(root); } - display_character = rtas_token("display-character"); - set_indicator = rtas_token("set-indicator"); + display_character = rtas_function_token(RTAS_FN_DISPLAY_CHARACTER); + set_indicator = rtas_function_token(RTAS_FN_SET_INDICATOR); } if (display_character == RTAS_UNKNOWN_SERVICE) { @@ -259,7 +918,7 @@ void rtas_progress(char *s, unsigned short hex) else rtas_call(display_character, 1, 1, NULL, '\r'); } - + if (row_width) width = row_width[current_line]; else @@ -279,9 +938,9 @@ void rtas_progress(char *s, unsigned short hex) spin_unlock(&progress_lock); return; } - + /* RTAS wants CR-LF, not just LF */ - + if (*os == '\n') { rtas_call(display_character, 1, 1, NULL, '\r'); rtas_call(display_character, 1, 1, NULL, '\n'); @@ -291,7 +950,7 @@ void rtas_progress(char *s, unsigned short hex) */ rtas_call(display_character, 1, 1, NULL, *os); } - + if (row_width) width = row_width[current_line]; else @@ -300,36 +959,49 @@ void rtas_progress(char *s, unsigned short hex) width--; rtas_call(display_character, 1, 1, NULL, *os); } - + os++; - + /* if we overwrite the screen length */ if (width <= 0) while ((*os != 0) && (*os != '\n') && (*os != '\r')) os++; } - + spin_unlock(&progress_lock); } -EXPORT_SYMBOL(rtas_progress); /* needed by rtas_flash module */ +EXPORT_SYMBOL_GPL(rtas_progress); /* needed by rtas_flash module */ int rtas_token(const char *service) { + const struct rtas_function *func; const __be32 *tokp; + if (rtas.dev == NULL) return RTAS_UNKNOWN_SERVICE; + + func = rtas_name_to_function(service); + if (func) + return func->token; + /* + * The caller is looking up a name that is not known to be an + * RTAS function. Either it's a function that needs to be + * added to the table, or they're misusing rtas_token() to + * access non-function properties of the /rtas node. Warn and + * fall back to the legacy behavior. + */ + WARN_ONCE(1, "unknown function `%s`, should it be added to rtas_function_table?\n", + service); + tokp = of_get_property(rtas.dev, service, NULL); return tokp ? be32_to_cpu(*tokp) : RTAS_UNKNOWN_SERVICE; } -EXPORT_SYMBOL(rtas_token); - -int rtas_service_present(const char *service) -{ - return rtas_token(service) != RTAS_UNKNOWN_SERVICE; -} -EXPORT_SYMBOL(rtas_service_present); +EXPORT_SYMBOL_GPL(rtas_token); #ifdef CONFIG_RTAS_ERROR_LOGGING + +static u32 rtas_error_log_max __ro_after_init = RTAS_ERROR_LOG_MAX; + /* * Return the firmware-specified size of the error log buffer * for all rtas calls that require an error buffer argument. @@ -337,56 +1009,66 @@ EXPORT_SYMBOL(rtas_service_present); */ int rtas_get_error_log_max(void) { - static int rtas_error_log_max; - if (rtas_error_log_max) - return rtas_error_log_max; - - rtas_error_log_max = rtas_token ("rtas-error-log-max"); - if ((rtas_error_log_max == RTAS_UNKNOWN_SERVICE) || - (rtas_error_log_max > RTAS_ERROR_LOG_MAX)) { - printk (KERN_WARNING "RTAS: bad log buffer size %d\n", - rtas_error_log_max); - rtas_error_log_max = RTAS_ERROR_LOG_MAX; - } return rtas_error_log_max; } -EXPORT_SYMBOL(rtas_get_error_log_max); + +static void __init init_error_log_max(void) +{ + static const char propname[] __initconst = "rtas-error-log-max"; + u32 max; + + if (of_property_read_u32(rtas.dev, propname, &max)) { + pr_warn("%s not found, using default of %u\n", + propname, RTAS_ERROR_LOG_MAX); + max = RTAS_ERROR_LOG_MAX; + } + + if (max > RTAS_ERROR_LOG_MAX) { + pr_warn("%s = %u, clamping max error log size to %u\n", + propname, max, RTAS_ERROR_LOG_MAX); + max = RTAS_ERROR_LOG_MAX; + } + + rtas_error_log_max = max; +} static char rtas_err_buf[RTAS_ERROR_LOG_MAX]; -static int rtas_last_error_token; /** Return a copy of the detailed error text associated with the * most recent failed call to rtas. Because the error text * might go stale if there are any other intervening rtas calls, * this routine must be called atomically with whatever produced - * the error (i.e. with rtas.lock still held from the previous call). + * the error (i.e. with rtas_lock still held from the previous call). */ static char *__fetch_rtas_last_error(char *altbuf) { + const s32 token = rtas_function_token(RTAS_FN_RTAS_LAST_ERROR); struct rtas_args err_args, save_args; u32 bufsz; char *buf = NULL; - if (rtas_last_error_token == -1) + lockdep_assert_held(&rtas_lock); + + if (token == -1) return NULL; bufsz = rtas_get_error_log_max(); - err_args.token = cpu_to_be32(rtas_last_error_token); + err_args.token = cpu_to_be32(token); err_args.nargs = cpu_to_be32(2); err_args.nret = cpu_to_be32(1); err_args.args[0] = cpu_to_be32(__pa(rtas_err_buf)); err_args.args[1] = cpu_to_be32(bufsz); err_args.args[2] = 0; - save_args = rtas.args; - rtas.args = err_args; + save_args = rtas_args; + rtas_args = err_args; - enter_rtas(__pa(&rtas.args)); + do_enter_rtas(&rtas_args); - err_args = rtas.args; - rtas.args = save_args; + err_args = rtas_args; + rtas_args = save_args; /* Log the error in the unlikely case that there was one. */ if (unlikely(err_args.args[2] == 0)) { @@ -398,7 +1080,7 @@ static char *__fetch_rtas_last_error(char *altbuf) buf = kmalloc(RTAS_ERROR_LOG_MAX, GFP_ATOMIC); } if (buf) - memcpy(buf, rtas_err_buf, RTAS_ERROR_LOG_MAX); + memmove(buf, rtas_err_buf, RTAS_ERROR_LOG_MAX); } return buf; @@ -409,6 +1091,7 @@ static char *__fetch_rtas_last_error(char *altbuf) #else /* CONFIG_RTAS_ERROR_LOGGING */ #define __fetch_rtas_last_error(x) NULL #define get_errorlog_buffer() NULL +static void __init init_error_log_max(void) {} #endif @@ -429,9 +1112,26 @@ va_rtas_call_unlocked(struct rtas_args *args, int token, int nargs, int nret, for (i = 0; i < nret; ++i) args->rets[i] = 0; - enter_rtas(__pa(args)); + do_enter_rtas(args); } +/** + * rtas_call_unlocked() - Invoke an RTAS firmware function without synchronization. + * @args: RTAS parameter block to be used for the call, must obey RTAS addressing + * constraints. + * @token: Identifies the function being invoked. + * @nargs: Number of input parameters. Does not include token. + * @nret: Number of output parameters, including the call status. + * @....: List of @nargs input parameters. + * + * Invokes the RTAS function indicated by @token, which the caller + * should obtain via rtas_function_token(). + * + * This function is similar to rtas_call(), but must be used with a + * limited set of RTAS calls specifically exempted from the general + * requirement that only one RTAS call may be in progress at any + * time. Examples include stop-self and ibm,nmi-interlock. + */ void rtas_call_unlocked(struct rtas_args *args, int token, int nargs, int nret, ...) { va_list list; @@ -441,38 +1141,120 @@ void rtas_call_unlocked(struct rtas_args *args, int token, int nargs, int nret, va_end(list); } +static bool token_is_restricted_errinjct(s32 token) +{ + return token == rtas_function_token(RTAS_FN_IBM_OPEN_ERRINJCT) || + token == rtas_function_token(RTAS_FN_IBM_ERRINJCT); +} + +/** + * rtas_call() - Invoke an RTAS firmware function. + * @token: Identifies the function being invoked. + * @nargs: Number of input parameters. Does not include token. + * @nret: Number of output parameters, including the call status. + * @outputs: Array of @nret output words. + * @....: List of @nargs input parameters. + * + * Invokes the RTAS function indicated by @token, which the caller + * should obtain via rtas_function_token(). + * + * The @nargs and @nret arguments must match the number of input and + * output parameters specified for the RTAS function. + * + * rtas_call() returns RTAS status codes, not conventional Linux errno + * values. Callers must translate any failure to an appropriate errno + * in syscall context. Most callers of RTAS functions that can return + * -2 or 990x should use rtas_busy_delay() to correctly handle those + * statuses before calling again. + * + * The return value descriptions are adapted from 7.2.8 [RTAS] Return + * Codes of the PAPR and CHRP specifications. + * + * Context: Process context preferably, interrupt context if + * necessary. Acquires an internal spinlock and may perform + * GFP_ATOMIC slab allocation in error path. Unsafe for NMI + * context. + * Return: + * * 0 - RTAS function call succeeded. + * * -1 - RTAS function encountered a hardware or + * platform error, or the token is invalid, + * or the function is restricted by kernel policy. + * * -2 - Specs say "A necessary hardware device was busy, + * and the requested function could not be + * performed. The operation should be retried at + * a later time." This is misleading, at least with + * respect to current RTAS implementations. What it + * usually means in practice is that the function + * could not be completed while meeting RTAS's + * deadline for returning control to the OS (250us + * for PAPR/PowerVM, typically), but the call may be + * immediately reattempted to resume work on it. + * * -3 - Parameter error. + * * -7 - Unexpected state change. + * * 9000...9899 - Vendor-specific success codes. + * * 9900...9905 - Advisory extended delay. Caller should try + * again after ~10^x ms has elapsed, where x is + * the last digit of the status [0-5]. Again going + * beyond the PAPR text, 990x on PowerVM indicates + * contention for RTAS-internal resources. Other + * RTAS call sequences in progress should be + * allowed to complete before reattempting the + * call. + * * -9000 - Multi-level isolation error. + * * -9999...-9004 - Vendor-specific error codes. + * * Additional negative values - Function-specific error. + * * Additional positive values - Function-specific success. + */ int rtas_call(int token, int nargs, int nret, int *outputs, ...) { + struct pin_cookie cookie; va_list list; int i; - unsigned long s; - struct rtas_args *rtas_args; + unsigned long flags; + struct rtas_args *args; char *buff_copy = NULL; int ret; if (!rtas.entry || token == RTAS_UNKNOWN_SERVICE) return -1; - s = lock_rtas(); + if (token_is_restricted_errinjct(token)) { + /* + * It would be nicer to not discard the error value + * from security_locked_down(), but callers expect an + * RTAS status, not an errno. + */ + if (security_locked_down(LOCKDOWN_RTAS_ERROR_INJECTION)) + return -1; + } + + if ((mfmsr() & (MSR_IR|MSR_DR)) != (MSR_IR|MSR_DR)) { + WARN_ON_ONCE(1); + return -1; + } + + raw_spin_lock_irqsave(&rtas_lock, flags); + cookie = lockdep_pin_lock(&rtas_lock); /* We use the global rtas args buffer */ - rtas_args = &rtas.args; + args = &rtas_args; va_start(list, outputs); - va_rtas_call_unlocked(rtas_args, token, nargs, nret, list); + va_rtas_call_unlocked(args, token, nargs, nret, list); va_end(list); /* A -1 return code indicates that the last command couldn't be completed due to a hardware error. */ - if (be32_to_cpu(rtas_args->rets[0]) == -1) + if (be32_to_cpu(args->rets[0]) == -1) buff_copy = __fetch_rtas_last_error(NULL); if (nret > 1 && outputs != NULL) for (i = 0; i < nret-1; ++i) - outputs[i] = be32_to_cpu(rtas_args->rets[i+1]); - ret = (nret > 0)? be32_to_cpu(rtas_args->rets[0]): 0; + outputs[i] = be32_to_cpu(args->rets[i + 1]); + ret = (nret > 0) ? be32_to_cpu(args->rets[0]) : 0; - unlock_rtas(s); + lockdep_unpin_lock(&rtas_lock, cookie); + raw_spin_unlock_irqrestore(&rtas_lock, flags); if (buff_copy) { log_error(buff_copy, ERR_TYPE_RTAS_LOG, 0); @@ -481,10 +1263,27 @@ int rtas_call(int token, int nargs, int nret, int *outputs, ...) } return ret; } -EXPORT_SYMBOL(rtas_call); +EXPORT_SYMBOL_GPL(rtas_call); -/* For RTAS_BUSY (-2), delay for 1 millisecond. For an extended busy status - * code of 990n, perform the hinted delay of 10^n (last digit) milliseconds. +/** + * rtas_busy_delay_time() - From an RTAS status value, calculate the + * suggested delay time in milliseconds. + * + * @status: a value returned from rtas_call() or similar APIs which return + * the status of a RTAS function call. + * + * Context: Any context. + * + * Return: + * * 100000 - If @status is 9905. + * * 10000 - If @status is 9904. + * * 1000 - If @status is 9903. + * * 100 - If @status is 9902. + * * 10 - If @status is 9901. + * * 1 - If @status is either 9900 or -2. This is "wrong" for -2, but + * some callers depend on this behavior, and the worst outcome + * is that they will delay for longer than necessary. + * * 0 - If @status is not a busy or extended delay value. */ unsigned int rtas_busy_delay_time(int status) { @@ -502,54 +1301,160 @@ unsigned int rtas_busy_delay_time(int status) return ms; } -EXPORT_SYMBOL(rtas_busy_delay_time); -/* For an RTAS busy status code, perform the hinted delay. */ -unsigned int rtas_busy_delay(int status) +/* + * Early boot fallback for rtas_busy_delay(). + */ +static bool __init rtas_busy_delay_early(int status) +{ + static size_t successive_ext_delays __initdata; + bool retry; + + switch (status) { + case RTAS_EXTENDED_DELAY_MIN...RTAS_EXTENDED_DELAY_MAX: + /* + * In the unlikely case that we receive an extended + * delay status in early boot, the OS is probably not + * the cause, and there's nothing we can do to clear + * the condition. Best we can do is delay for a bit + * and hope it's transient. Lie to the caller if it + * seems like we're stuck in a retry loop. + */ + mdelay(1); + retry = true; + successive_ext_delays += 1; + if (successive_ext_delays > 1000) { + pr_err("too many extended delays, giving up\n"); + dump_stack(); + retry = false; + successive_ext_delays = 0; + } + break; + case RTAS_BUSY: + retry = true; + successive_ext_delays = 0; + break; + default: + retry = false; + successive_ext_delays = 0; + break; + } + + return retry; +} + +/** + * rtas_busy_delay() - helper for RTAS busy and extended delay statuses + * + * @status: a value returned from rtas_call() or similar APIs which return + * the status of a RTAS function call. + * + * Context: Process context. May sleep or schedule. + * + * Return: + * * true - @status is RTAS_BUSY or an extended delay hint. The + * caller may assume that the CPU has been yielded if necessary, + * and that an appropriate delay for @status has elapsed. + * Generally the caller should reattempt the RTAS call which + * yielded @status. + * + * * false - @status is not @RTAS_BUSY nor an extended delay hint. The + * caller is responsible for handling @status. + */ +bool __ref rtas_busy_delay(int status) { unsigned int ms; + bool ret; - might_sleep(); - ms = rtas_busy_delay_time(status); - if (ms && need_resched()) - msleep(ms); + /* + * Can't do timed sleeps before timekeeping is up. + */ + if (system_state < SYSTEM_SCHEDULING) + return rtas_busy_delay_early(status); - return ms; + switch (status) { + case RTAS_EXTENDED_DELAY_MIN...RTAS_EXTENDED_DELAY_MAX: + ret = true; + ms = rtas_busy_delay_time(status); + /* + * The extended delay hint can be as high as 100 seconds. + * Surely any function returning such a status is either + * buggy or isn't going to be significantly slowed by us + * polling at 1HZ. Clamp the sleep time to one second. + */ + ms = clamp(ms, 1U, 1000U); + /* + * The delay hint is an order-of-magnitude suggestion, not + * a minimum. It is fine, possibly even advantageous, for + * us to pause for less time than hinted. For small values, + * use usleep_range() to ensure we don't sleep much longer + * than actually needed. + * + * See Documentation/timers/timers-howto.rst for + * explanation of the threshold used here. In effect we use + * usleep_range() for 9900 and 9901, msleep() for + * 9902-9905. + */ + if (ms <= 20) + usleep_range(ms * 100, ms * 1000); + else + msleep(ms); + break; + case RTAS_BUSY: + ret = true; + /* + * We should call again immediately if there's no other + * work to do. + */ + cond_resched(); + break; + default: + ret = false; + /* + * Not a busy or extended delay status; the caller should + * handle @status itself. Ensure we warn on misuses in + * atomic context regardless. + */ + might_sleep(); + break; + } + + return ret; } -EXPORT_SYMBOL(rtas_busy_delay); +EXPORT_SYMBOL_GPL(rtas_busy_delay); -static int rtas_error_rc(int rtas_rc) +int rtas_error_rc(int rtas_rc) { int rc; switch (rtas_rc) { - case -1: /* Hardware Error */ - rc = -EIO; - break; - case -3: /* Bad indicator/domain/etc */ - rc = -EINVAL; - break; - case -9000: /* Isolation error */ - rc = -EFAULT; - break; - case -9001: /* Outstanding TCE/PTE */ - rc = -EEXIST; - break; - case -9002: /* No usable slot */ - rc = -ENODEV; - break; - default: - printk(KERN_ERR "%s: unexpected RTAS error %d\n", - __func__, rtas_rc); - rc = -ERANGE; - break; + case RTAS_HARDWARE_ERROR: /* Hardware Error */ + rc = -EIO; + break; + case RTAS_INVALID_PARAMETER: /* Bad indicator/domain/etc */ + rc = -EINVAL; + break; + case -9000: /* Isolation error */ + rc = -EFAULT; + break; + case -9001: /* Outstanding TCE/PTE */ + rc = -EEXIST; + break; + case -9002: /* No usable slot */ + rc = -ENODEV; + break; + default: + pr_err("%s: unexpected error %d\n", __func__, rtas_rc); + rc = -ERANGE; + break; } return rc; } +EXPORT_SYMBOL_GPL(rtas_error_rc); int rtas_get_power_level(int powerdomain, int *level) { - int token = rtas_token("get-power-level"); + int token = rtas_function_token(RTAS_FN_GET_POWER_LEVEL); int rc; if (token == RTAS_UNKNOWN_SERVICE) @@ -562,11 +1467,11 @@ int rtas_get_power_level(int powerdomain, int *level) return rtas_error_rc(rc); return rc; } -EXPORT_SYMBOL(rtas_get_power_level); +EXPORT_SYMBOL_GPL(rtas_get_power_level); int rtas_set_power_level(int powerdomain, int level, int *setlevel) { - int token = rtas_token("set-power-level"); + int token = rtas_function_token(RTAS_FN_SET_POWER_LEVEL); int rc; if (token == RTAS_UNKNOWN_SERVICE) @@ -580,11 +1485,11 @@ int rtas_set_power_level(int powerdomain, int level, int *setlevel) return rtas_error_rc(rc); return rc; } -EXPORT_SYMBOL(rtas_set_power_level); +EXPORT_SYMBOL_GPL(rtas_set_power_level); int rtas_get_sensor(int sensor, int index, int *state) { - int token = rtas_token("get-sensor-state"); + int token = rtas_function_token(RTAS_FN_GET_SENSOR_STATE); int rc; if (token == RTAS_UNKNOWN_SERVICE) @@ -598,11 +1503,11 @@ int rtas_get_sensor(int sensor, int index, int *state) return rtas_error_rc(rc); return rc; } -EXPORT_SYMBOL(rtas_get_sensor); +EXPORT_SYMBOL_GPL(rtas_get_sensor); int rtas_get_sensor_fast(int sensor, int index, int *state) { - int token = rtas_token("get-sensor-state"); + int token = rtas_function_token(RTAS_FN_GET_SENSOR_STATE); int rc; if (token == RTAS_UNKNOWN_SERVICE) @@ -641,11 +1546,10 @@ bool rtas_indicator_present(int token, int *maxindex) return false; } -EXPORT_SYMBOL(rtas_indicator_present); int rtas_set_indicator(int indicator, int index, int new_value) { - int token = rtas_token("set-indicator"); + int token = rtas_function_token(RTAS_FN_SET_INDICATOR); int rc; if (token == RTAS_UNKNOWN_SERVICE) @@ -659,15 +1563,15 @@ int rtas_set_indicator(int indicator, int index, int new_value) return rtas_error_rc(rc); return rc; } -EXPORT_SYMBOL(rtas_set_indicator); +EXPORT_SYMBOL_GPL(rtas_set_indicator); /* * Ignoring RTAS extended delay */ int rtas_set_indicator_fast(int indicator, int index, int new_value) { + int token = rtas_function_token(RTAS_FN_SET_INDICATOR); int rc; - int token = rtas_token("set-indicator"); if (token == RTAS_UNKNOWN_SERVICE) return -ENOENT; @@ -683,12 +1587,70 @@ int rtas_set_indicator_fast(int indicator, int index, int new_value) return rc; } +/** + * rtas_ibm_suspend_me() - Call ibm,suspend-me to suspend the LPAR. + * + * @fw_status: RTAS call status will be placed here if not NULL. + * + * rtas_ibm_suspend_me() should be called only on a CPU which has + * received H_CONTINUE from the H_JOIN hcall. All other active CPUs + * should be waiting to return from H_JOIN. + * + * rtas_ibm_suspend_me() may suspend execution of the OS + * indefinitely. Callers should take appropriate measures upon return, such as + * resetting watchdog facilities. + * + * Callers may choose to retry this call if @fw_status is + * %RTAS_THREADS_ACTIVE. + * + * Return: + * 0 - The partition has resumed from suspend, possibly after + * migration to a different host. + * -ECANCELED - The operation was aborted. + * -EAGAIN - There were other CPUs not in H_JOIN at the time of the call. + * -EBUSY - Some other condition prevented the suspend from succeeding. + * -EIO - Hardware/platform error. + */ +int rtas_ibm_suspend_me(int *fw_status) +{ + int token = rtas_function_token(RTAS_FN_IBM_SUSPEND_ME); + int fwrc; + int ret; + + fwrc = rtas_call(token, 0, 1, NULL); + + switch (fwrc) { + case 0: + ret = 0; + break; + case RTAS_SUSPEND_ABORTED: + ret = -ECANCELED; + break; + case RTAS_THREADS_ACTIVE: + ret = -EAGAIN; + break; + case RTAS_NOT_SUSPENDABLE: + case RTAS_OUTSTANDING_COPROC: + ret = -EBUSY; + break; + case -1: + default: + ret = -EIO; + break; + } + + if (fw_status) + *fw_status = fwrc; + + return ret; +} + void __noreturn rtas_restart(char *cmd) { if (rtas_flash_term_hook) rtas_flash_term_hook(SYS_RESTART); - printk("RTAS system-reboot returned %d\n", - rtas_call(rtas_token("system-reboot"), 0, 1, NULL)); + pr_emerg("system-reboot returned %d\n", + rtas_call(rtas_function_token(RTAS_FN_SYSTEM_REBOOT), 0, 1, NULL)); for (;;); } @@ -697,8 +1659,8 @@ void rtas_power_off(void) if (rtas_flash_term_hook) rtas_flash_term_hook(SYS_POWER_OFF); /* allow power on only with power button press */ - printk("RTAS power-off returned %d\n", - rtas_call(rtas_token("power-off"), 2, 1, NULL, -1, -1)); + pr_emerg("power-off returned %d\n", + rtas_call(rtas_function_token(RTAS_FN_POWER_OFF), 2, 1, NULL, -1, -1)); for (;;); } @@ -707,16 +1669,19 @@ void __noreturn rtas_halt(void) if (rtas_flash_term_hook) rtas_flash_term_hook(SYS_HALT); /* allow power on only with power button press */ - printk("RTAS power-off returned %d\n", - rtas_call(rtas_token("power-off"), 2, 1, NULL, -1, -1)); + pr_emerg("power-off returned %d\n", + rtas_call(rtas_function_token(RTAS_FN_POWER_OFF), 2, 1, NULL, -1, -1)); for (;;); } /* Must be in the RMO region, so we place it here */ static char rtas_os_term_buf[2048]; +static bool ibm_extended_os_term; void rtas_os_term(char *str) { + s32 token = rtas_function_token(RTAS_FN_IBM_OS_TERM); + static struct rtas_args args; int status; /* @@ -725,311 +1690,69 @@ void rtas_os_term(char *str) * this property may terminate the partition which we want to avoid * since it interferes with panic_timeout. */ - if (RTAS_UNKNOWN_SERVICE == rtas_token("ibm,os-term") || - RTAS_UNKNOWN_SERVICE == rtas_token("ibm,extended-os-term")) + + if (token == RTAS_UNKNOWN_SERVICE || !ibm_extended_os_term) return; snprintf(rtas_os_term_buf, 2048, "OS panic: %s", str); + /* + * Keep calling as long as RTAS returns a "try again" status, + * but don't use rtas_busy_delay(), which potentially + * schedules. + */ do { - status = rtas_call(rtas_token("ibm,os-term"), 1, 1, NULL, - __pa(rtas_os_term_buf)); - } while (rtas_busy_delay(status)); + rtas_call_unlocked(&args, token, 1, 1, NULL, __pa(rtas_os_term_buf)); + status = be32_to_cpu(args.rets[0]); + } while (rtas_busy_delay_time(status)); if (status != 0) - printk(KERN_EMERG "ibm,os-term call failed %d\n", status); + pr_emerg("ibm,os-term call failed %d\n", status); } -static int ibm_suspend_me_token = RTAS_UNKNOWN_SERVICE; -#ifdef CONFIG_PPC_PSERIES -static int __rtas_suspend_last_cpu(struct rtas_suspend_me_data *data, int wake_when_done) -{ - u16 slb_size = mmu_slb_size; - int rc = H_MULTI_THREADS_ACTIVE; - int cpu; - - slb_set_size(SLB_MIN_SIZE); - printk(KERN_DEBUG "calling ibm,suspend-me on cpu %i\n", smp_processor_id()); - - while (rc == H_MULTI_THREADS_ACTIVE && !atomic_read(&data->done) && - !atomic_read(&data->error)) - rc = rtas_call(data->token, 0, 1, NULL); - - if (rc || atomic_read(&data->error)) { - printk(KERN_DEBUG "ibm,suspend-me returned %d\n", rc); - slb_set_size(slb_size); - } - - if (atomic_read(&data->error)) - rc = atomic_read(&data->error); - - atomic_set(&data->error, rc); - pSeries_coalesce_init(); - - if (wake_when_done) { - atomic_set(&data->done, 1); - - for_each_online_cpu(cpu) - plpar_hcall_norets(H_PROD, get_hard_smp_processor_id(cpu)); - } - - if (atomic_dec_return(&data->working) == 0) - complete(data->complete); - - return rc; -} - -int rtas_suspend_last_cpu(struct rtas_suspend_me_data *data) -{ - atomic_inc(&data->working); - return __rtas_suspend_last_cpu(data, 0); -} - -static int __rtas_suspend_cpu(struct rtas_suspend_me_data *data, int wake_when_done) -{ - long rc = H_SUCCESS; - unsigned long msr_save; - int cpu; - - atomic_inc(&data->working); - - /* really need to ensure MSR.EE is off for H_JOIN */ - msr_save = mfmsr(); - mtmsr(msr_save & ~(MSR_EE)); - - while (rc == H_SUCCESS && !atomic_read(&data->done) && !atomic_read(&data->error)) - rc = plpar_hcall_norets(H_JOIN); - - mtmsr(msr_save); - - if (rc == H_SUCCESS) { - /* This cpu was prodded and the suspend is complete. */ - goto out; - } else if (rc == H_CONTINUE) { - /* All other cpus are in H_JOIN, this cpu does - * the suspend. - */ - return __rtas_suspend_last_cpu(data, wake_when_done); - } else { - printk(KERN_ERR "H_JOIN on cpu %i failed with rc = %ld\n", - smp_processor_id(), rc); - atomic_set(&data->error, rc); - } - - if (wake_when_done) { - atomic_set(&data->done, 1); - - /* This cpu did the suspend or got an error; in either case, - * we need to prod all other other cpus out of join state. - * Extra prods are harmless. - */ - for_each_online_cpu(cpu) - plpar_hcall_norets(H_PROD, get_hard_smp_processor_id(cpu)); - } -out: - if (atomic_dec_return(&data->working) == 0) - complete(data->complete); - return rc; -} - -int rtas_suspend_cpu(struct rtas_suspend_me_data *data) -{ - return __rtas_suspend_cpu(data, 0); -} - -static void rtas_percpu_suspend_me(void *info) -{ - __rtas_suspend_cpu((struct rtas_suspend_me_data *)info, 1); -} - -enum rtas_cpu_state { - DOWN, - UP, -}; - -#ifndef CONFIG_SMP -static int rtas_cpu_state_change_mask(enum rtas_cpu_state state, - cpumask_var_t cpus) -{ - if (!cpumask_empty(cpus)) { - cpumask_clear(cpus); - return -EINVAL; - } else - return 0; -} -#else -/* On return cpumask will be altered to indicate CPUs changed. - * CPUs with states changed will be set in the mask, - * CPUs with status unchanged will be unset in the mask. */ -static int rtas_cpu_state_change_mask(enum rtas_cpu_state state, - cpumask_var_t cpus) -{ - int cpu; - int cpuret = 0; - int ret = 0; - - if (cpumask_empty(cpus)) - return 0; - - for_each_cpu(cpu, cpus) { - struct device *dev = get_cpu_device(cpu); - - switch (state) { - case DOWN: - cpuret = device_offline(dev); - break; - case UP: - cpuret = device_online(dev); - break; - } - if (cpuret < 0) { - pr_debug("%s: cpu_%s for cpu#%d returned %d.\n", - __func__, - ((state == UP) ? "up" : "down"), - cpu, cpuret); - if (!ret) - ret = cpuret; - if (state == UP) { - /* clear bits for unchanged cpus, return */ - cpumask_shift_right(cpus, cpus, cpu); - cpumask_shift_left(cpus, cpus, cpu); - break; - } else { - /* clear bit for unchanged cpu, continue */ - cpumask_clear_cpu(cpu, cpus); - } - } - cond_resched(); - } - - return ret; -} -#endif - -int rtas_online_cpus_mask(cpumask_var_t cpus) -{ - int ret; - - ret = rtas_cpu_state_change_mask(UP, cpus); - - if (ret) { - cpumask_var_t tmp_mask; - - if (!alloc_cpumask_var(&tmp_mask, GFP_KERNEL)) - return ret; - - /* Use tmp_mask to preserve cpus mask from first failure */ - cpumask_copy(tmp_mask, cpus); - rtas_offline_cpus_mask(tmp_mask); - free_cpumask_var(tmp_mask); - } - - return ret; -} - -int rtas_offline_cpus_mask(cpumask_var_t cpus) -{ - return rtas_cpu_state_change_mask(DOWN, cpus); -} - -int rtas_ibm_suspend_me(u64 handle) +/** + * rtas_activate_firmware() - Activate a new version of firmware. + * + * Context: This function may sleep. + * + * Activate a new version of partition firmware. The OS must call this + * after resuming from a partition hibernation or migration in order + * to maintain the ability to perform live firmware updates. It's not + * catastrophic for this method to be absent or to fail; just log the + * condition in that case. + */ +void rtas_activate_firmware(void) { - long state; - long rc; - unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; - struct rtas_suspend_me_data data; - DECLARE_COMPLETION_ONSTACK(done); - cpumask_var_t offline_mask; - int cpuret; - - if (!rtas_service_present("ibm,suspend-me")) - return -ENOSYS; - - /* Make sure the state is valid */ - rc = plpar_hcall(H_VASI_STATE, retbuf, handle); - - state = retbuf[0]; - - if (rc) { - printk(KERN_ERR "rtas_ibm_suspend_me: vasi_state returned %ld\n",rc); - return rc; - } else if (state == H_VASI_ENABLED) { - return -EAGAIN; - } else if (state != H_VASI_SUSPENDING) { - printk(KERN_ERR "rtas_ibm_suspend_me: vasi_state returned state %ld\n", - state); - return -EIO; - } - - if (!alloc_cpumask_var(&offline_mask, GFP_KERNEL)) - return -ENOMEM; - - atomic_set(&data.working, 0); - atomic_set(&data.done, 0); - atomic_set(&data.error, 0); - data.token = rtas_token("ibm,suspend-me"); - data.complete = &done; - - lock_device_hotplug(); + int token = rtas_function_token(RTAS_FN_IBM_ACTIVATE_FIRMWARE); + int fwrc; - /* All present CPUs must be online */ - cpumask_andnot(offline_mask, cpu_present_mask, cpu_online_mask); - cpuret = rtas_online_cpus_mask(offline_mask); - if (cpuret) { - pr_err("%s: Could not bring present CPUs online.\n", __func__); - atomic_set(&data.error, cpuret); - goto out; - } - - cpu_hotplug_disable(); - - /* Check if we raced with a CPU-Offline Operation */ - if (!cpumask_equal(cpu_present_mask, cpu_online_mask)) { - pr_info("%s: Raced against a concurrent CPU-Offline\n", __func__); - atomic_set(&data.error, -EAGAIN); - goto out_hotplug_enable; + if (token == RTAS_UNKNOWN_SERVICE) { + pr_notice("ibm,activate-firmware method unavailable\n"); + return; } - /* Call function on all CPUs. One of us will make the - * rtas call - */ - on_each_cpu(rtas_percpu_suspend_me, &data, 0); - - wait_for_completion(&done); - - if (atomic_read(&data.error) != 0) - printk(KERN_ERR "Error doing global join\n"); + mutex_lock(&rtas_ibm_activate_firmware_lock); -out_hotplug_enable: - cpu_hotplug_enable(); + do { + fwrc = rtas_call(token, 0, 1, NULL); + } while (rtas_busy_delay(fwrc)); - /* Take down CPUs not online prior to suspend */ - cpuret = rtas_offline_cpus_mask(offline_mask); - if (cpuret) - pr_warn("%s: Could not restore CPUs to offline state.\n", - __func__); + mutex_unlock(&rtas_ibm_activate_firmware_lock); -out: - unlock_device_hotplug(); - free_cpumask_var(offline_mask); - return atomic_read(&data.error); -} -#else /* CONFIG_PPC_PSERIES */ -int rtas_ibm_suspend_me(u64 handle) -{ - return -ENOSYS; + if (fwrc) + pr_err("ibm,activate-firmware failed (%i)\n", fwrc); } -#endif /** - * Find a specific pseries error log in an RTAS extended event log. + * get_pseries_errorlog() - Find a specific pseries error log in an RTAS + * extended event log. * @log: RTAS error/event log * @section_id: two character section identifier * - * Returns a pointer to the specified errorlog or NULL if not found. + * Return: A pointer to the specified errorlog or NULL if not found. */ -struct pseries_errorlog *get_pseries_errorlog(struct rtas_error_log *log, - uint16_t section_id) +noinstr struct pseries_errorlog *get_pseries_errorlog(struct rtas_error_log *log, + uint16_t section_id) { struct rtas_ext_event_log_v6 *ext_log = (struct rtas_ext_event_log_v6 *)log->buffer; @@ -1058,9 +1781,113 @@ struct pseries_errorlog *get_pseries_errorlog(struct rtas_error_log *log, return NULL; } +/* + * The sys_rtas syscall, as originally designed, allows root to pass + * arbitrary physical addresses to RTAS calls. A number of RTAS calls + * can be abused to write to arbitrary memory and do other things that + * are potentially harmful to system integrity, and thus should only + * be used inside the kernel and not exposed to userspace. + * + * All known legitimate users of the sys_rtas syscall will only ever + * pass addresses that fall within the RMO buffer, and use a known + * subset of RTAS calls. + * + * Accordingly, we filter RTAS requests to check that the call is + * permitted, and that provided pointers fall within the RMO buffer. + * If a function is allowed to be invoked via the syscall, then its + * entry in the rtas_functions table points to a rtas_filter that + * describes its constraints, with the indexes of the parameters which + * are expected to contain addresses and sizes of buffers allocated + * inside the RMO buffer. + */ + +static bool in_rmo_buf(u32 base, u32 end) +{ + return base >= rtas_rmo_buf && + base < (rtas_rmo_buf + RTAS_USER_REGION_SIZE) && + base <= end && + end >= rtas_rmo_buf && + end < (rtas_rmo_buf + RTAS_USER_REGION_SIZE); +} + +static bool block_rtas_call(const struct rtas_function *func, int nargs, + struct rtas_args *args) +{ + const struct rtas_filter *f; + const bool is_platform_dump = + func == &rtas_function_table[RTAS_FNIDX__IBM_PLATFORM_DUMP]; + const bool is_config_conn = + func == &rtas_function_table[RTAS_FNIDX__IBM_CONFIGURE_CONNECTOR]; + u32 base, size, end; + + /* + * Only functions with filters attached are allowed. + */ + f = func->filter; + if (!f) + goto err; + /* + * And some functions aren't allowed on LE. + */ + if (IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN) && func->banned_for_syscall_on_le) + goto err; + + if (f->buf_idx1 != -1) { + base = be32_to_cpu(args->args[f->buf_idx1]); + if (f->size_idx1 != -1) + size = be32_to_cpu(args->args[f->size_idx1]); + else if (f->fixed_size) + size = f->fixed_size; + else + size = 1; + + end = base + size - 1; + + /* + * Special case for ibm,platform-dump - NULL buffer + * address is used to indicate end of dump processing + */ + if (is_platform_dump && base == 0) + return false; + + if (!in_rmo_buf(base, end)) + goto err; + } + + if (f->buf_idx2 != -1) { + base = be32_to_cpu(args->args[f->buf_idx2]); + if (f->size_idx2 != -1) + size = be32_to_cpu(args->args[f->size_idx2]); + else if (f->fixed_size) + size = f->fixed_size; + else + size = 1; + end = base + size - 1; + + /* + * Special case for ibm,configure-connector where the + * address can be 0 + */ + if (is_config_conn && base == 0) + return false; + + if (!in_rmo_buf(base, end)) + goto err; + } + + return false; +err: + pr_err_ratelimited("sys_rtas: RTAS call blocked - exploit attempt?\n"); + pr_err_ratelimited("sys_rtas: %s nargs=%d (called by %s)\n", + func->name, nargs, current->comm); + return true; +} + /* We assume to be passed big endian arguments */ SYSCALL_DEFINE1(rtas, struct rtas_args __user *, uargs) { + const struct rtas_function *func; + struct pin_cookie cookie; struct rtas_args args; unsigned long flags; char *buff_copy, *errbuf = NULL; @@ -1089,14 +1916,30 @@ SYSCALL_DEFINE1(rtas, struct rtas_args __user *, uargs) nargs * sizeof(rtas_arg_t)) != 0) return -EFAULT; - if (token == RTAS_UNKNOWN_SERVICE) + /* + * If this token doesn't correspond to a function the kernel + * understands, you're not allowed to call it. + */ + func = rtas_token_to_function_untrusted(token); + if (!func) return -EINVAL; args.rets = &args.args[nargs]; memset(args.rets, 0, nret * sizeof(rtas_arg_t)); + if (block_rtas_call(func, nargs, &args)) + return -EINVAL; + + if (token_is_restricted_errinjct(token)) { + int err; + + err = security_locked_down(LOCKDOWN_RTAS_ERROR_INJECTION); + if (err) + return err; + } + /* Need to handle ibm,suspend_me call specially */ - if (token == ibm_suspend_me_token) { + if (token == rtas_function_token(RTAS_FN_IBM_SUSPEND_ME)) { /* * rtas_ibm_suspend_me assumes the streamid handle is in cpu @@ -1105,7 +1948,7 @@ SYSCALL_DEFINE1(rtas, struct rtas_args __user *, uargs) int rc = 0; u64 handle = ((u64)be32_to_cpu(args.args[0]) << 32) | be32_to_cpu(args.args[1]); - rc = rtas_ibm_suspend_me(handle); + rc = rtas_syscall_dispatch_ibm_suspend_me(handle); if (rc == -EAGAIN) args.rets[0] = cpu_to_be32(RTAS_NOT_SUSPENDABLE); else if (rc == -EIO) @@ -1117,18 +1960,32 @@ SYSCALL_DEFINE1(rtas, struct rtas_args __user *, uargs) buff_copy = get_errorlog_buffer(); - flags = lock_rtas(); + /* + * If this function has a mutex assigned to it, we must + * acquire it to avoid interleaving with any kernel-based uses + * of the same function. Kernel-based sequences acquire the + * appropriate mutex explicitly. + */ + if (func->lock) + mutex_lock(func->lock); + + raw_spin_lock_irqsave(&rtas_lock, flags); + cookie = lockdep_pin_lock(&rtas_lock); - rtas.args = args; - enter_rtas(__pa(&rtas.args)); - args = rtas.args; + rtas_args = args; + do_enter_rtas(&rtas_args); + args = rtas_args; /* A -1 return code indicates that the last command couldn't be completed due to a hardware error. */ if (be32_to_cpu(args.rets[0]) == -1) errbuf = __fetch_rtas_last_error(buff_copy); - unlock_rtas(flags); + lockdep_unpin_lock(&rtas_lock, cookie); + raw_spin_unlock_irqrestore(&rtas_lock, flags); + + if (func->lock) + mutex_unlock(func->lock); if (buff_copy) { if (errbuf) @@ -1146,6 +2003,54 @@ SYSCALL_DEFINE1(rtas, struct rtas_args __user *, uargs) return 0; } +static void __init rtas_function_table_init(void) +{ + struct property *prop; + + for (size_t i = 0; i < ARRAY_SIZE(rtas_function_table); ++i) { + struct rtas_function *curr = &rtas_function_table[i]; + struct rtas_function *prior; + int cmp; + + curr->token = RTAS_UNKNOWN_SERVICE; + + if (i == 0) + continue; + /* + * Ensure table is sorted correctly for binary search + * on function names. + */ + prior = &rtas_function_table[i - 1]; + + cmp = strcmp(prior->name, curr->name); + if (cmp < 0) + continue; + + if (cmp == 0) { + pr_err("'%s' has duplicate function table entries\n", + curr->name); + } else { + pr_err("function table unsorted: '%s' wrongly precedes '%s'\n", + prior->name, curr->name); + } + } + + for_each_property_of_node(rtas.dev, prop) { + struct rtas_function *func; + + if (prop->length != sizeof(u32)) + continue; + + func = __rtas_name_to_function(prop->name); + if (!func) + continue; + + func->token = be32_to_cpup((__be32 *)prop->value); + + pr_debug("function %s has token %u\n", func->name, func->token); + } +} + /* * Call early during boot, before mem init, to retrieve the RTAS * information from the device-tree and allocate the RMO buffer for userland @@ -1177,24 +2082,31 @@ void __init rtas_initialize(void) no_entry = of_property_read_u32(rtas.dev, "linux,rtas-entry", &entry); rtas.entry = no_entry ? rtas.base : entry; + init_error_log_max(); + + /* Must be called before any function token lookups */ + rtas_function_table_init(); + + /* + * Discover this now to avoid a device tree lookup in the + * panic path. + */ + ibm_extended_os_term = of_property_read_bool(rtas.dev, "ibm,extended-os-term"); + /* If RTAS was found, allocate the RMO buffer for it and look for * the stop-self token if any */ #ifdef CONFIG_PPC64 - if (firmware_has_feature(FW_FEATURE_LPAR)) { + if (firmware_has_feature(FW_FEATURE_LPAR)) rtas_region = min(ppc64_rma_size, RTAS_INSTANTIATE_MAX); - ibm_suspend_me_token = rtas_token("ibm,suspend-me"); - } #endif - rtas_rmo_buf = memblock_phys_alloc_range(RTAS_RMOBUF_MAX, PAGE_SIZE, + rtas_rmo_buf = memblock_phys_alloc_range(RTAS_USER_REGION_SIZE, PAGE_SIZE, 0, rtas_region); if (!rtas_rmo_buf) panic("ERROR: RTAS: Failed to allocate %lx bytes below %pa\n", PAGE_SIZE, &rtas_region); -#ifdef CONFIG_RTAS_ERROR_LOGGING - rtas_last_error_token = rtas_token("rtas-last-error"); -#endif + rtas_work_area_reserve_arena(rtas_region); } int __init early_init_dt_scan_rtas(unsigned long node, @@ -1209,6 +2121,12 @@ int __init early_init_dt_scan_rtas(unsigned long node, entryp = of_get_flat_dt_prop(node, "linux,rtas-entry", NULL); sizep = of_get_flat_dt_prop(node, "rtas-size", NULL); +#ifdef CONFIG_PPC64 + /* need this feature to decide the crashkernel offset */ + if (of_get_flat_dt_prop(node, "ibm,hypertas-functions", NULL)) + powerpc_firmware_features |= FW_FEATURE_LPAR; +#endif + if (basep && entryp && sizep) { rtas.base = *basep; rtas.entry = *entryp; @@ -1234,23 +2152,22 @@ int __init early_init_dt_scan_rtas(unsigned long node, return 1; } -static arch_spinlock_t timebase_lock; +static DEFINE_RAW_SPINLOCK(timebase_lock); static u64 timebase = 0; void rtas_give_timebase(void) { unsigned long flags; - local_irq_save(flags); + raw_spin_lock_irqsave(&timebase_lock, flags); hard_irq_disable(); - arch_spin_lock(&timebase_lock); - rtas_call(rtas_token("freeze-time-base"), 0, 1, NULL); + rtas_call(rtas_function_token(RTAS_FN_FREEZE_TIME_BASE), 0, 1, NULL); timebase = get_tb(); - arch_spin_unlock(&timebase_lock); + raw_spin_unlock(&timebase_lock); while (timebase) barrier(); - rtas_call(rtas_token("thaw-time-base"), 0, 1, NULL); + rtas_call(rtas_function_token(RTAS_FN_THAW_TIME_BASE), 0, 1, NULL); local_irq_restore(flags); } @@ -1258,8 +2175,8 @@ void rtas_take_timebase(void) { while (!timebase) barrier(); - arch_spin_lock(&timebase_lock); + raw_spin_lock(&timebase_lock); set_tb(timebase >> 32, timebase & 0xffffffff); timebase = 0; - arch_spin_unlock(&timebase_lock); + raw_spin_unlock(&timebase_lock); } diff --git a/arch/powerpc/kernel/rtas_entry.S b/arch/powerpc/kernel/rtas_entry.S new file mode 100644 index 000000000000..6ce95ddadbcd --- /dev/null +++ b/arch/powerpc/kernel/rtas_entry.S @@ -0,0 +1,176 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#include <asm/asm-offsets.h> +#include <asm/bug.h> +#include <asm/page.h> +#include <asm/ppc_asm.h> + +/* + * RTAS is called with MSR IR, DR, EE disabled, and LR in the return address. + * + * Note: r3 is an input parameter to rtas, so don't trash it... + */ + +#ifdef CONFIG_PPC32 +_GLOBAL(enter_rtas) + stwu r1,-INT_FRAME_SIZE(r1) + mflr r0 + stw r0,INT_FRAME_SIZE+4(r1) + LOAD_REG_ADDR(r4, rtas) + lis r6,1f@ha /* physical return address for rtas */ + addi r6,r6,1f@l + tophys(r6,r6) + lwz r8,RTASENTRY(r4) + lwz r4,RTASBASE(r4) + mfmsr r9 + stw r9,8(r1) + li r9,MSR_KERNEL & ~(MSR_IR|MSR_DR) + mtlr r6 + stw r1, THREAD + RTAS_SP(r2) + mtspr SPRN_SRR0,r8 + mtspr SPRN_SRR1,r9 + rfi +1: + lis r8, 1f@h + ori r8, r8, 1f@l + LOAD_REG_IMMEDIATE(r9,MSR_KERNEL) + mtspr SPRN_SRR0,r8 + mtspr SPRN_SRR1,r9 + rfi /* Reactivate MMU translation */ +1: + lwz r8,INT_FRAME_SIZE+4(r1) /* get return address */ + lwz r9,8(r1) /* original msr value */ + addi r1,r1,INT_FRAME_SIZE + li r0,0 + stw r0, THREAD + RTAS_SP(r2) + mtlr r8 + mtmsr r9 + blr /* return to caller */ +_ASM_NOKPROBE_SYMBOL(enter_rtas) + +#else /* CONFIG_PPC32 */ +#include <asm/exception-64s.h> + +/* + * 32-bit rtas on 64-bit machines has the additional problem that RTAS may + * not preserve the upper parts of registers it uses. + */ +_GLOBAL(enter_rtas) + mflr r0 + std r0,16(r1) + stdu r1,-SWITCH_FRAME_SIZE(r1) /* Save SP and create stack space. */ + + /* Because RTAS is running in 32b mode, it clobbers the high order half + * of all registers that it saves. We therefore save those registers + * RTAS might touch to the stack. (r0, r3-r12 are caller saved) + */ + SAVE_GPR(2, r1) /* Save the TOC */ + SAVE_NVGPRS(r1) /* Save the non-volatiles */ + + mfcr r4 + std r4,_CCR(r1) + mfctr r5 + std r5,_CTR(r1) + mfspr r6,SPRN_XER + std r6,_XER(r1) + mfdar r7 + std r7,_DAR(r1) + mfdsisr r8 + std r8,_DSISR(r1) + + /* Temporary workaround to clear CR until RTAS can be modified to + * ignore all bits. + */ + li r0,0 + mtcr r0 + + mfmsr r6 + + /* Unfortunately, the stack pointer and the MSR are also clobbered, + * so they are saved in the PACA which allows us to restore + * our original state after RTAS returns. + */ + std r1,PACAR1(r13) + std r6,PACASAVEDMSR(r13) + + /* Setup our real return addr */ + LOAD_REG_ADDR(r4,rtas_return_loc) + clrldi r4,r4,2 /* convert to realmode address */ + mtlr r4 + +__enter_rtas: + LOAD_REG_ADDR(r4, rtas) + ld r5,RTASENTRY(r4) /* get the rtas->entry value */ + ld r4,RTASBASE(r4) /* get the rtas->base value */ + + /* + * RTAS runs in 32-bit big endian real mode, but leave MSR[RI] on as we + * may hit NMI (SRESET or MCE) while in RTAS. RTAS should disable RI in + * its critical regions (as specified in PAPR+ section 7.2.1). MSR[S] + * is not impacted by RFI_TO_KERNEL (only urfid can unset it). So if + * MSR[S] is set, it will remain when entering RTAS. + * If we're in HV mode, RTAS must also run in HV mode, so extract MSR_HV + * from the saved MSR value and insert into the value RTAS will use. + */ + extrdi r0, r6, 1, 63 - MSR_HV_LG + LOAD_REG_IMMEDIATE(r6, MSR_ME | MSR_RI) + insrdi r6, r0, 1, 63 - MSR_HV_LG + + li r0,0 + mtmsrd r0,1 /* disable RI before using SRR0/1 */ + + mtspr SPRN_SRR0,r5 + mtspr SPRN_SRR1,r6 + RFI_TO_KERNEL + b . /* prevent speculative execution */ +rtas_return_loc: + FIXUP_ENDIAN + + /* Set SF before anything. */ + LOAD_REG_IMMEDIATE(r6, MSR_KERNEL & ~(MSR_IR|MSR_DR)) + mtmsrd r6 + + /* relocation is off at this point */ + GET_PACA(r13) + + bcl 20,31,$+4 +0: mflr r3 + ld r3,(1f-0b)(r3) /* get &rtas_restore_regs */ + + ld r1,PACAR1(r13) /* Restore our SP */ + ld r4,PACASAVEDMSR(r13) /* Restore our MSR */ + + mtspr SPRN_SRR0,r3 + mtspr SPRN_SRR1,r4 + RFI_TO_KERNEL + b . /* prevent speculative execution */ +_ASM_NOKPROBE_SYMBOL(enter_rtas) +_ASM_NOKPROBE_SYMBOL(__enter_rtas) +_ASM_NOKPROBE_SYMBOL(rtas_return_loc) + + .align 3 +1: .8byte rtas_restore_regs + +rtas_restore_regs: + /* relocation is on at this point */ + REST_GPR(2, r1) /* Restore the TOC */ + REST_NVGPRS(r1) /* Restore the non-volatiles */ + + ld r4,_CCR(r1) + mtcr r4 + ld r5,_CTR(r1) + mtctr r5 + ld r6,_XER(r1) + mtspr SPRN_XER,r6 + ld r7,_DAR(r1) + mtdar r7 + ld r8,_DSISR(r1) + mtdsisr r8 + + addi r1,r1,SWITCH_FRAME_SIZE /* Unstack our frame */ + ld r0,16(r1) /* get return address */ + + mtlr r0 + blr /* return to caller */ + +#endif /* CONFIG_PPC32 */ diff --git a/arch/powerpc/kernel/rtas_flash.c b/arch/powerpc/kernel/rtas_flash.c index 84f794782c62..359577ec1680 100644 --- a/arch/powerpc/kernel/rtas_flash.c +++ b/arch/powerpc/kernel/rtas_flash.c @@ -120,7 +120,7 @@ static struct kmem_cache *flash_block_cache = NULL; /* * Local copy of the flash block list. * - * The rtas_firmware_flash_list varable will be + * The rtas_firmware_flash_list variable will be * set once the data is fully read. * * For convenience as we build the list we use virtual addrs, @@ -376,7 +376,7 @@ static void manage_flash(struct rtas_manage_flash_t *args_buf, unsigned int op) s32 rc; do { - rc = rtas_call(rtas_token("ibm,manage-flash-image"), 1, 1, + rc = rtas_call(rtas_function_token(RTAS_FN_IBM_MANAGE_FLASH_IMAGE), 1, 1, NULL, op); } while (rtas_busy_delay(rc)); @@ -444,7 +444,7 @@ error: */ static void validate_flash(struct rtas_validate_flash_t *args_buf) { - int token = rtas_token("ibm,validate-flash-image"); + int token = rtas_function_token(RTAS_FN_IBM_VALIDATE_FLASH_IMAGE); int update_results; s32 rc; @@ -570,7 +570,7 @@ static void rtas_flash_firmware(int reboot_type) return; } - update_token = rtas_token("ibm,update-flash-64-and-reboot"); + update_token = rtas_function_token(RTAS_FN_IBM_UPDATE_FLASH_64_AND_REBOOT); if (update_token == RTAS_UNKNOWN_SERVICE) { printk(KERN_ALERT "FLASH: ibm,update-flash-64-and-reboot " "is not available -- not a service partition?\n"); @@ -653,46 +653,46 @@ static void rtas_flash_firmware(int reboot_type) */ struct rtas_flash_file { const char *filename; - const char *rtas_call_name; + const rtas_fn_handle_t handle; int *status; - const struct file_operations fops; + const struct proc_ops ops; }; static const struct rtas_flash_file rtas_flash_files[] = { { .filename = "powerpc/rtas/" FIRMWARE_FLASH_NAME, - .rtas_call_name = "ibm,update-flash-64-and-reboot", + .handle = RTAS_FN_IBM_UPDATE_FLASH_64_AND_REBOOT, .status = &rtas_update_flash_data.status, - .fops.read = rtas_flash_read_msg, - .fops.write = rtas_flash_write, - .fops.release = rtas_flash_release, - .fops.llseek = default_llseek, + .ops.proc_read = rtas_flash_read_msg, + .ops.proc_write = rtas_flash_write, + .ops.proc_release = rtas_flash_release, + .ops.proc_lseek = default_llseek, }, { .filename = "powerpc/rtas/" FIRMWARE_UPDATE_NAME, - .rtas_call_name = "ibm,update-flash-64-and-reboot", + .handle = RTAS_FN_IBM_UPDATE_FLASH_64_AND_REBOOT, .status = &rtas_update_flash_data.status, - .fops.read = rtas_flash_read_num, - .fops.write = rtas_flash_write, - .fops.release = rtas_flash_release, - .fops.llseek = default_llseek, + .ops.proc_read = rtas_flash_read_num, + .ops.proc_write = rtas_flash_write, + .ops.proc_release = rtas_flash_release, + .ops.proc_lseek = default_llseek, }, { .filename = "powerpc/rtas/" VALIDATE_FLASH_NAME, - .rtas_call_name = "ibm,validate-flash-image", + .handle = RTAS_FN_IBM_VALIDATE_FLASH_IMAGE, .status = &rtas_validate_flash_data.status, - .fops.read = validate_flash_read, - .fops.write = validate_flash_write, - .fops.release = validate_flash_release, - .fops.llseek = default_llseek, + .ops.proc_read = validate_flash_read, + .ops.proc_write = validate_flash_write, + .ops.proc_release = validate_flash_release, + .ops.proc_lseek = default_llseek, }, { .filename = "powerpc/rtas/" MANAGE_FLASH_NAME, - .rtas_call_name = "ibm,manage-flash-image", + .handle = RTAS_FN_IBM_MANAGE_FLASH_IMAGE, .status = &rtas_manage_flash_data.status, - .fops.read = manage_flash_read, - .fops.write = manage_flash_write, - .fops.llseek = default_llseek, + .ops.proc_read = manage_flash_read, + .ops.proc_write = manage_flash_write, + .ops.proc_lseek = default_llseek, } }; @@ -700,8 +700,7 @@ static int __init rtas_flash_init(void) { int i; - if (rtas_token("ibm,update-flash-64-and-reboot") == - RTAS_UNKNOWN_SERVICE) { + if (rtas_function_token(RTAS_FN_IBM_UPDATE_FLASH_64_AND_REBOOT) == RTAS_UNKNOWN_SERVICE) { pr_info("rtas_flash: no firmware flash support\n"); return -EINVAL; } @@ -710,9 +709,9 @@ static int __init rtas_flash_init(void) if (!rtas_validate_flash_data.buf) return -ENOMEM; - flash_block_cache = kmem_cache_create("rtas_flash_cache", - RTAS_BLK_SIZE, RTAS_BLK_SIZE, 0, - NULL); + flash_block_cache = kmem_cache_create_usercopy("rtas_flash_cache", + RTAS_BLK_SIZE, RTAS_BLK_SIZE, + 0, 0, RTAS_BLK_SIZE, NULL); if (!flash_block_cache) { printk(KERN_ERR "%s: failed to create block cache\n", __func__); @@ -723,14 +722,14 @@ static int __init rtas_flash_init(void) const struct rtas_flash_file *f = &rtas_flash_files[i]; int token; - if (!proc_create(f->filename, 0600, NULL, &f->fops)) + if (!proc_create(f->filename, 0600, NULL, &f->ops)) goto enomem; /* * This code assumes that the status int is the first member of the * struct */ - token = rtas_token(f->rtas_call_name); + token = rtas_function_token(f->handle); if (token == RTAS_UNKNOWN_SERVICE) *f->status = FLASH_AUTH; else diff --git a/arch/powerpc/kernel/rtas_pci.c b/arch/powerpc/kernel/rtas_pci.c index ae5e43eaca48..fccf96e897f6 100644 --- a/arch/powerpc/kernel/rtas_pci.c +++ b/arch/powerpc/kernel/rtas_pci.c @@ -13,11 +13,12 @@ #include <linux/pci.h> #include <linux/string.h> #include <linux/init.h> +#include <linux/pgtable.h> +#include <linux/of_address.h> +#include <linux/of_fdt.h> #include <asm/io.h> -#include <asm/pgtable.h> #include <asm/irq.h> -#include <asm/prom.h> #include <asm/machdep.h> #include <asm/pci-bridge.h> #include <asm/iommu.h> @@ -42,7 +43,7 @@ static inline int config_access_valid(struct pci_dn *dn, int where) return 0; } -int rtas_read_config(struct pci_dn *pdn, int where, int size, u32 *val) +int rtas_pci_dn_read_config(struct pci_dn *pdn, int where, int size, u32 *val) { int returnval = -1; unsigned long buid, addr; @@ -86,7 +87,7 @@ static int rtas_pci_read_config(struct pci_bus *bus, pdn = pci_get_pdn_by_devfn(bus, devfn); /* Validity of pdn is checked in here */ - ret = rtas_read_config(pdn, where, size, val); + ret = rtas_pci_dn_read_config(pdn, where, size, val); if (*val == EEH_IO_ERROR_VALUE(size) && eeh_dev_check_failure(pdn_to_eeh_dev(pdn))) return PCIBIOS_DEVICE_NOT_FOUND; @@ -94,7 +95,7 @@ static int rtas_pci_read_config(struct pci_bus *bus, return ret; } -int rtas_write_config(struct pci_dn *pdn, int where, int size, u32 val) +int rtas_pci_dn_write_config(struct pci_dn *pdn, int where, int size, u32 val) { unsigned long buid, addr; int ret; @@ -133,7 +134,7 @@ static int rtas_pci_write_config(struct pci_bus *bus, pdn = pci_get_pdn_by_devfn(bus, devfn); /* Validity of pdn is checked in here. */ - return rtas_write_config(pdn, where, size, val); + return rtas_pci_dn_write_config(pdn, where, size, val); } static struct pci_ops rtas_pci_ops = { @@ -190,10 +191,10 @@ static void python_countermeasures(struct device_node *dev) void __init init_pci_config_tokens(void) { - read_pci_config = rtas_token("read-pci-config"); - write_pci_config = rtas_token("write-pci-config"); - ibm_read_pci_config = rtas_token("ibm,read-pci-config"); - ibm_write_pci_config = rtas_token("ibm,write-pci-config"); + read_pci_config = rtas_function_token(RTAS_FN_READ_PCI_CONFIG); + write_pci_config = rtas_function_token(RTAS_FN_WRITE_PCI_CONFIG); + ibm_read_pci_config = rtas_function_token(RTAS_FN_IBM_READ_PCI_CONFIG); + ibm_write_pci_config = rtas_function_token(RTAS_FN_IBM_WRITE_PCI_CONFIG); } unsigned long get_phb_buid(struct device_node *phb) diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c index 8d02e047f96a..9bba469239fc 100644 --- a/arch/powerpc/kernel/rtasd.c +++ b/arch/powerpc/kernel/rtasd.c @@ -9,6 +9,7 @@ #include <linux/errno.h> #include <linux/sched.h> #include <linux/kernel.h> +#include <linux/of.h> #include <linux/poll.h> #include <linux/proc_fs.h> #include <linux/init.h> @@ -22,7 +23,6 @@ #include <linux/uaccess.h> #include <asm/io.h> #include <asm/rtas.h> -#include <asm/prom.h> #include <asm/nvram.h> #include <linux/atomic.h> #include <asm/machdep.h> @@ -273,37 +273,15 @@ void pSeries_log_error(char *buf, unsigned int err_type, int fatal) } } -#ifdef CONFIG_PPC_PSERIES -static void handle_prrn_event(s32 scope) -{ - /* - * For PRRN, we must pass the negative of the scope value in - * the RTAS event. - */ - pseries_devicetree_update(-scope); - numa_update_cpu_topology(false); -} - static void handle_rtas_event(const struct rtas_error_log *log) { - if (rtas_error_type(log) != RTAS_TYPE_PRRN || !prrn_is_enabled()) + if (!machine_is(pseries)) return; - /* For PRRN Events the extended log length is used to denote - * the scope for calling rtas update-nodes. - */ - handle_prrn_event(rtas_error_extended_log_length(log)); -} - -#else - -static void handle_rtas_event(const struct rtas_error_log *log) -{ - return; + if (rtas_error_type(log) == RTAS_TYPE_PRRN) + pr_info_ratelimited("Platform resource reassignment ignored.\n"); } -#endif - static int rtas_log_open(struct inode * inode, struct file * file) { return 0; @@ -385,12 +363,12 @@ static __poll_t rtas_log_poll(struct file *file, poll_table * wait) return 0; } -static const struct file_operations proc_rtas_log_operations = { - .read = rtas_log_read, - .poll = rtas_log_poll, - .open = rtas_log_open, - .release = rtas_log_release, - .llseek = noop_llseek, +static const struct proc_ops rtas_log_proc_ops = { + .proc_read = rtas_log_read, + .proc_poll = rtas_log_poll, + .proc_open = rtas_log_open, + .proc_release = rtas_log_release, + .proc_lseek = noop_llseek, }; static int enable_surveillance(int timeout) @@ -451,7 +429,7 @@ static void rtas_event_scan(struct work_struct *w) do_event_scan(); - get_online_cpus(); + cpus_read_lock(); /* raw_ OK because just using CPU as starting point. */ cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask); @@ -473,11 +451,11 @@ static void rtas_event_scan(struct work_struct *w) schedule_delayed_work_on(cpu, &event_scan_work, __round_jiffies_relative(event_scan_delay, cpu)); - put_online_cpus(); + cpus_read_unlock(); } #ifdef CONFIG_PPC64 -static void retrieve_nvram_error_log(void) +static void __init retrieve_nvram_error_log(void) { unsigned int err_type ; int rc ; @@ -495,12 +473,12 @@ static void retrieve_nvram_error_log(void) } } #else /* CONFIG_PPC64 */ -static void retrieve_nvram_error_log(void) +static void __init retrieve_nvram_error_log(void) { } #endif /* CONFIG_PPC64 */ -static void start_event_scan(void) +static void __init start_event_scan(void) { printk(KERN_DEBUG "RTAS daemon started\n"); pr_debug("rtasd: will sleep for %d milliseconds\n", @@ -522,18 +500,20 @@ EXPORT_SYMBOL_GPL(rtas_cancel_event_scan); static int __init rtas_event_scan_init(void) { + int err; + if (!machine_is(pseries) && !machine_is(chrp)) return 0; /* No RTAS */ - event_scan = rtas_token("event-scan"); + event_scan = rtas_function_token(RTAS_FN_EVENT_SCAN); if (event_scan == RTAS_UNKNOWN_SERVICE) { printk(KERN_INFO "rtasd: No event-scan on system\n"); return -ENODEV; } - rtas_event_scan_rate = rtas_token("rtas-event-scan-rate"); - if (rtas_event_scan_rate == RTAS_UNKNOWN_SERVICE) { + err = of_property_read_u32(rtas.dev, "rtas-event-scan-rate", &rtas_event_scan_rate); + if (err) { printk(KERN_ERR "rtasd: no rtas-event-scan-rate on system\n"); return -ENODEV; } @@ -572,7 +552,7 @@ static int __init rtas_init(void) return -ENODEV; entry = proc_create("powerpc/rtas/error_log", 0400, NULL, - &proc_rtas_log_operations); + &rtas_log_proc_ops); if (!entry) printk(KERN_ERR "Failed to create error_log proc entry\n"); diff --git a/arch/powerpc/kernel/secure_boot.c b/arch/powerpc/kernel/secure_boot.c index 4b982324d368..f9af305d9579 100644 --- a/arch/powerpc/kernel/secure_boot.c +++ b/arch/powerpc/kernel/secure_boot.c @@ -23,12 +23,19 @@ bool is_ppc_secureboot_enabled(void) { struct device_node *node; bool enabled = false; + u32 secureboot; node = get_ppc_fw_sb_node(); enabled = of_property_read_bool(node, "os-secureboot-enforcing"); - of_node_put(node); + if (enabled) + goto out; + + if (!of_property_read_u32(of_root, "ibm,secure-boot", &secureboot)) + enabled = (secureboot > 1); + +out: pr_info("Secure boot mode %s\n", enabled ? "enabled" : "disabled"); return enabled; @@ -38,12 +45,19 @@ bool is_ppc_trustedboot_enabled(void) { struct device_node *node; bool enabled = false; + u32 trustedboot; node = get_ppc_fw_sb_node(); enabled = of_property_read_bool(node, "trusted-enabled"); - of_node_put(node); + if (enabled) + goto out; + + if (!of_property_read_u32(of_root, "ibm,trusted-boot", &trustedboot)) + enabled = (trustedboot > 0); + +out: pr_info("Trusted boot mode %s\n", enabled ? "enabled" : "disabled"); return enabled; diff --git a/arch/powerpc/kernel/security.c b/arch/powerpc/kernel/security.c index bd70f5be1c27..4856e1a5161c 100644 --- a/arch/powerpc/kernel/security.c +++ b/arch/powerpc/kernel/security.c @@ -7,29 +7,35 @@ #include <linux/cpu.h> #include <linux/kernel.h> #include <linux/device.h> +#include <linux/memblock.h> +#include <linux/nospec.h> +#include <linux/prctl.h> #include <linux/seq_buf.h> +#include <linux/debugfs.h> #include <asm/asm-prototypes.h> #include <asm/code-patching.h> -#include <asm/debugfs.h> #include <asm/security_features.h> +#include <asm/sections.h> #include <asm/setup.h> +#include <asm/inst.h> +#include "setup.h" u64 powerpc_security_features __read_mostly = SEC_FTR_DEFAULT; -enum count_cache_flush_type { - COUNT_CACHE_FLUSH_NONE = 0x1, - COUNT_CACHE_FLUSH_SW = 0x2, - COUNT_CACHE_FLUSH_HW = 0x4, +enum branch_cache_flush_type { + BRANCH_CACHE_FLUSH_NONE = 0x1, + BRANCH_CACHE_FLUSH_SW = 0x2, + BRANCH_CACHE_FLUSH_HW = 0x4, }; -static enum count_cache_flush_type count_cache_flush_type = COUNT_CACHE_FLUSH_NONE; -static bool link_stack_flush_enabled; +static enum branch_cache_flush_type count_cache_flush_type = BRANCH_CACHE_FLUSH_NONE; +static enum branch_cache_flush_type link_stack_flush_type = BRANCH_CACHE_FLUSH_NONE; bool barrier_nospec_enabled; static bool no_nospec; static bool btb_flush_enabled; -#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_BOOK3S_64) +#if defined(CONFIG_PPC_E500) || defined(CONFIG_PPC_BOOK3S_64) static bool no_spectrev2; #endif @@ -39,7 +45,7 @@ static void enable_barrier_nospec(bool enable) do_barrier_nospec_fixups(enable); } -void setup_barrier_nospec(void) +void __init setup_barrier_nospec(void) { bool enable; @@ -101,7 +107,7 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_barrier_nospec, barrier_nospec_get, static __init int barrier_nospec_debugfs_init(void) { debugfs_create_file_unsafe("barrier_nospec", 0600, - powerpc_debugfs_root, NULL, + arch_debugfs_dir, NULL, &fops_barrier_nospec); return 0; } @@ -109,14 +115,14 @@ device_initcall(barrier_nospec_debugfs_init); static __init int security_feature_debugfs_init(void) { - debugfs_create_x64("security_features", 0400, powerpc_debugfs_root, + debugfs_create_x64("security_features", 0400, arch_debugfs_dir, &powerpc_security_features); return 0; } device_initcall(security_feature_debugfs_init); #endif /* CONFIG_DEBUG_FS */ -#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_BOOK3S_64) +#if defined(CONFIG_PPC_E500) || defined(CONFIG_PPC_BOOK3S_64) static int __init handle_nospectre_v2(char *p) { no_spectrev2 = true; @@ -124,17 +130,17 @@ static int __init handle_nospectre_v2(char *p) return 0; } early_param("nospectre_v2", handle_nospectre_v2); -#endif /* CONFIG_PPC_FSL_BOOK3E || CONFIG_PPC_BOOK3S_64 */ +#endif /* CONFIG_PPC_E500 || CONFIG_PPC_BOOK3S_64 */ -#ifdef CONFIG_PPC_FSL_BOOK3E -void setup_spectre_v2(void) +#ifdef CONFIG_PPC_E500 +void __init setup_spectre_v2(void) { if (no_spectrev2 || cpu_mitigations_off()) do_btb_flush_fixups(); else btb_flush_enabled = true; } -#endif /* CONFIG_PPC_FSL_BOOK3E */ +#endif /* CONFIG_PPC_E500 */ #ifdef CONFIG_PPC_BOOK3S_64 ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf) @@ -216,24 +222,25 @@ ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, c if (ccd) seq_buf_printf(&s, "Indirect branch cache disabled"); - if (link_stack_flush_enabled) - seq_buf_printf(&s, ", Software link stack flush"); - - } else if (count_cache_flush_type != COUNT_CACHE_FLUSH_NONE) { + } else if (count_cache_flush_type != BRANCH_CACHE_FLUSH_NONE) { seq_buf_printf(&s, "Mitigation: Software count cache flush"); - if (count_cache_flush_type == COUNT_CACHE_FLUSH_HW) + if (count_cache_flush_type == BRANCH_CACHE_FLUSH_HW) seq_buf_printf(&s, " (hardware accelerated)"); - if (link_stack_flush_enabled) - seq_buf_printf(&s, ", Software link stack flush"); - } else if (btb_flush_enabled) { seq_buf_printf(&s, "Mitigation: Branch predictor state flush"); } else { seq_buf_printf(&s, "Vulnerable"); } + if (bcs || ccd || count_cache_flush_type != BRANCH_CACHE_FLUSH_NONE) { + if (link_stack_flush_type != BRANCH_CACHE_FLUSH_NONE) + seq_buf_printf(&s, ", Software link stack flush"); + if (link_stack_flush_type == BRANCH_CACHE_FLUSH_HW) + seq_buf_printf(&s, " (hardware accelerated)"); + } + seq_buf_printf(&s, "\n"); return s.len; @@ -246,7 +253,7 @@ ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, c static enum stf_barrier_type stf_enabled_flush_types; static bool no_stf_barrier; -bool stf_barrier; +static bool stf_barrier; static int __init handle_no_stf_barrier(char *p) { @@ -257,6 +264,11 @@ static int __init handle_no_stf_barrier(char *p) early_param("no_stf_barrier", handle_no_stf_barrier); +enum stf_barrier_type stf_barrier_type_get(void) +{ + return stf_enabled_flush_types; +} + /* This is the generic flag used by other architectures */ static int __init handle_ssbd(char *p) { @@ -294,9 +306,7 @@ static void stf_barrier_enable(bool enable) void setup_stf_barrier(void) { enum stf_barrier_type type; - bool enable, hv; - - hv = cpu_has_feature(CPU_FTR_HVMODE); + bool enable; /* Default to fallback in case fw-features are not available */ if (cpu_has_feature(CPU_FTR_ARCH_300)) @@ -309,8 +319,7 @@ void setup_stf_barrier(void) type = STF_BARRIER_NONE; enable = security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) && - (security_ftr_enabled(SEC_FTR_L1D_FLUSH_PR) || - (security_ftr_enabled(SEC_FTR_L1D_FLUSH_HV) && hv)); + security_ftr_enabled(SEC_FTR_STF_BARRIER); if (type == STF_BARRIER_FALLBACK) { pr_info("stf-barrier: fallback barrier available\n"); @@ -353,6 +362,41 @@ ssize_t cpu_show_spec_store_bypass(struct device *dev, struct device_attribute * return sprintf(buf, "Vulnerable\n"); } +static int ssb_prctl_get(struct task_struct *task) +{ + /* + * The STF_BARRIER feature is on by default, so if it's off that means + * firmware has explicitly said the CPU is not vulnerable via either + * the hypercall or device tree. + */ + if (!security_ftr_enabled(SEC_FTR_STF_BARRIER)) + return PR_SPEC_NOT_AFFECTED; + + /* + * If the system's CPU has no known barrier (see setup_stf_barrier()) + * then assume that the CPU is not vulnerable. + */ + if (stf_enabled_flush_types == STF_BARRIER_NONE) + return PR_SPEC_NOT_AFFECTED; + + /* + * Otherwise the CPU is vulnerable. The barrier is not a global or + * per-process mitigation, so the only value that can be reported here + * is PR_SPEC_ENABLE, which appears as "vulnerable" in /proc. + */ + return PR_SPEC_ENABLE; +} + +int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which) +{ + switch (which) { + case PR_SPEC_STORE_BYPASS: + return ssb_prctl_get(task); + default: + return -ENODEV; + } +} + #ifdef CONFIG_DEBUG_FS static int stf_barrier_set(void *data, u64 val) { @@ -383,65 +427,103 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_stf_barrier, stf_barrier_get, stf_barrier_set, static __init int stf_barrier_debugfs_init(void) { - debugfs_create_file_unsafe("stf_barrier", 0600, powerpc_debugfs_root, + debugfs_create_file_unsafe("stf_barrier", 0600, arch_debugfs_dir, NULL, &fops_stf_barrier); return 0; } device_initcall(stf_barrier_debugfs_init); #endif /* CONFIG_DEBUG_FS */ -static void no_count_cache_flush(void) +static void update_branch_cache_flush(void) { - count_cache_flush_type = COUNT_CACHE_FLUSH_NONE; - pr_info("count-cache-flush: software flush disabled.\n"); -} + u32 *site, __maybe_unused *site2; -static void toggle_count_cache_flush(bool enable) -{ - if (!security_ftr_enabled(SEC_FTR_FLUSH_COUNT_CACHE) && - !security_ftr_enabled(SEC_FTR_FLUSH_LINK_STACK)) - enable = false; - - if (!enable) { - patch_instruction_site(&patch__call_flush_count_cache, PPC_INST_NOP); #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE - patch_instruction_site(&patch__call_kvm_flush_link_stack, PPC_INST_NOP); -#endif - pr_info("link-stack-flush: software flush disabled.\n"); - link_stack_flush_enabled = false; - no_count_cache_flush(); - return; + site = &patch__call_kvm_flush_link_stack; + site2 = &patch__call_kvm_flush_link_stack_p9; + // This controls the branch from guest_exit_cont to kvm_flush_link_stack + if (link_stack_flush_type == BRANCH_CACHE_FLUSH_NONE) { + patch_instruction_site(site, ppc_inst(PPC_RAW_NOP())); + patch_instruction_site(site2, ppc_inst(PPC_RAW_NOP())); + } else { + // Could use HW flush, but that could also flush count cache + patch_branch_site(site, (u64)&kvm_flush_link_stack, BRANCH_SET_LINK); + patch_branch_site(site2, (u64)&kvm_flush_link_stack, BRANCH_SET_LINK); } +#endif - // This enables the branch from _switch to flush_count_cache - patch_branch_site(&patch__call_flush_count_cache, - (u64)&flush_count_cache, BRANCH_SET_LINK); + // Patch out the bcctr first, then nop the rest + site = &patch__call_flush_branch_caches3; + patch_instruction_site(site, ppc_inst(PPC_RAW_NOP())); + site = &patch__call_flush_branch_caches2; + patch_instruction_site(site, ppc_inst(PPC_RAW_NOP())); + site = &patch__call_flush_branch_caches1; + patch_instruction_site(site, ppc_inst(PPC_RAW_NOP())); + + // This controls the branch from _switch to flush_branch_caches + if (count_cache_flush_type == BRANCH_CACHE_FLUSH_NONE && + link_stack_flush_type == BRANCH_CACHE_FLUSH_NONE) { + // Nothing to be done + + } else if (count_cache_flush_type == BRANCH_CACHE_FLUSH_HW && + link_stack_flush_type == BRANCH_CACHE_FLUSH_HW) { + // Patch in the bcctr last + site = &patch__call_flush_branch_caches1; + patch_instruction_site(site, ppc_inst(0x39207fff)); // li r9,0x7fff + site = &patch__call_flush_branch_caches2; + patch_instruction_site(site, ppc_inst(0x7d2903a6)); // mtctr r9 + site = &patch__call_flush_branch_caches3; + patch_instruction_site(site, ppc_inst(PPC_INST_BCCTR_FLUSH)); -#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE - // This enables the branch from guest_exit_cont to kvm_flush_link_stack - patch_branch_site(&patch__call_kvm_flush_link_stack, - (u64)&kvm_flush_link_stack, BRANCH_SET_LINK); -#endif + } else { + patch_branch_site(site, (u64)&flush_branch_caches, BRANCH_SET_LINK); - pr_info("link-stack-flush: software flush enabled.\n"); - link_stack_flush_enabled = true; + // If we just need to flush the link stack, early return + if (count_cache_flush_type == BRANCH_CACHE_FLUSH_NONE) { + patch_instruction_site(&patch__flush_link_stack_return, + ppc_inst(PPC_RAW_BLR())); - // If we just need to flush the link stack, patch an early return - if (!security_ftr_enabled(SEC_FTR_FLUSH_COUNT_CACHE)) { - patch_instruction_site(&patch__flush_link_stack_return, PPC_INST_BLR); - no_count_cache_flush(); - return; + // If we have flush instruction, early return + } else if (count_cache_flush_type == BRANCH_CACHE_FLUSH_HW) { + patch_instruction_site(&patch__flush_count_cache_return, + ppc_inst(PPC_RAW_BLR())); + } } +} - if (!security_ftr_enabled(SEC_FTR_BCCTR_FLUSH_ASSIST)) { - count_cache_flush_type = COUNT_CACHE_FLUSH_SW; - pr_info("count-cache-flush: full software flush sequence enabled.\n"); - return; +static void toggle_branch_cache_flush(bool enable) +{ + if (!enable || !security_ftr_enabled(SEC_FTR_FLUSH_COUNT_CACHE)) { + if (count_cache_flush_type != BRANCH_CACHE_FLUSH_NONE) + count_cache_flush_type = BRANCH_CACHE_FLUSH_NONE; + + pr_info("count-cache-flush: flush disabled.\n"); + } else { + if (security_ftr_enabled(SEC_FTR_BCCTR_FLUSH_ASSIST)) { + count_cache_flush_type = BRANCH_CACHE_FLUSH_HW; + pr_info("count-cache-flush: hardware flush enabled.\n"); + } else { + count_cache_flush_type = BRANCH_CACHE_FLUSH_SW; + pr_info("count-cache-flush: software flush enabled.\n"); + } } - patch_instruction_site(&patch__flush_count_cache_return, PPC_INST_BLR); - count_cache_flush_type = COUNT_CACHE_FLUSH_HW; - pr_info("count-cache-flush: hardware assisted flush sequence enabled\n"); + if (!enable || !security_ftr_enabled(SEC_FTR_FLUSH_LINK_STACK)) { + if (link_stack_flush_type != BRANCH_CACHE_FLUSH_NONE) + link_stack_flush_type = BRANCH_CACHE_FLUSH_NONE; + + pr_info("link-stack-flush: flush disabled.\n"); + } else { + if (security_ftr_enabled(SEC_FTR_BCCTR_LINK_FLUSH_ASSIST)) { + link_stack_flush_type = BRANCH_CACHE_FLUSH_HW; + pr_info("link-stack-flush: hardware flush enabled.\n"); + } else { + link_stack_flush_type = BRANCH_CACHE_FLUSH_SW; + pr_info("link-stack-flush: software flush enabled.\n"); + } + } + + update_branch_cache_flush(); } void setup_count_cache_flush(void) @@ -465,7 +547,179 @@ void setup_count_cache_flush(void) security_ftr_enabled(SEC_FTR_FLUSH_COUNT_CACHE)) security_ftr_set(SEC_FTR_FLUSH_LINK_STACK); - toggle_count_cache_flush(enable); + toggle_branch_cache_flush(enable); +} + +static enum l1d_flush_type enabled_flush_types; +static void *l1d_flush_fallback_area; +static bool no_rfi_flush; +static bool no_entry_flush; +static bool no_uaccess_flush; +bool rfi_flush; +static bool entry_flush; +static bool uaccess_flush; +DEFINE_STATIC_KEY_FALSE(uaccess_flush_key); +EXPORT_SYMBOL(uaccess_flush_key); + +static int __init handle_no_rfi_flush(char *p) +{ + pr_info("rfi-flush: disabled on command line."); + no_rfi_flush = true; + return 0; +} +early_param("no_rfi_flush", handle_no_rfi_flush); + +static int __init handle_no_entry_flush(char *p) +{ + pr_info("entry-flush: disabled on command line."); + no_entry_flush = true; + return 0; +} +early_param("no_entry_flush", handle_no_entry_flush); + +static int __init handle_no_uaccess_flush(char *p) +{ + pr_info("uaccess-flush: disabled on command line."); + no_uaccess_flush = true; + return 0; +} +early_param("no_uaccess_flush", handle_no_uaccess_flush); + +/* + * The RFI flush is not KPTI, but because users will see doco that says to use + * nopti we hijack that option here to also disable the RFI flush. + */ +static int __init handle_no_pti(char *p) +{ + pr_info("rfi-flush: disabling due to 'nopti' on command line.\n"); + handle_no_rfi_flush(NULL); + return 0; +} +early_param("nopti", handle_no_pti); + +static void do_nothing(void *unused) +{ + /* + * We don't need to do the flush explicitly, just enter+exit kernel is + * sufficient, the RFI exit handlers will do the right thing. + */ +} + +void rfi_flush_enable(bool enable) +{ + if (enable) { + do_rfi_flush_fixups(enabled_flush_types); + on_each_cpu(do_nothing, NULL, 1); + } else + do_rfi_flush_fixups(L1D_FLUSH_NONE); + + rfi_flush = enable; +} + +static void entry_flush_enable(bool enable) +{ + if (enable) { + do_entry_flush_fixups(enabled_flush_types); + on_each_cpu(do_nothing, NULL, 1); + } else { + do_entry_flush_fixups(L1D_FLUSH_NONE); + } + + entry_flush = enable; +} + +static void uaccess_flush_enable(bool enable) +{ + if (enable) { + do_uaccess_flush_fixups(enabled_flush_types); + static_branch_enable(&uaccess_flush_key); + on_each_cpu(do_nothing, NULL, 1); + } else { + static_branch_disable(&uaccess_flush_key); + do_uaccess_flush_fixups(L1D_FLUSH_NONE); + } + + uaccess_flush = enable; +} + +static void __ref init_fallback_flush(void) +{ + u64 l1d_size, limit; + int cpu; + + /* Only allocate the fallback flush area once (at boot time). */ + if (l1d_flush_fallback_area) + return; + + l1d_size = ppc64_caches.l1d.size; + + /* + * If there is no d-cache-size property in the device tree, l1d_size + * could be zero. That leads to the loop in the asm wrapping around to + * 2^64-1, and then walking off the end of the fallback area and + * eventually causing a page fault which is fatal. Just default to + * something vaguely sane. + */ + if (!l1d_size) + l1d_size = (64 * 1024); + + limit = min(ppc64_bolted_size(), ppc64_rma_size); + + /* + * Align to L1d size, and size it at 2x L1d size, to catch possible + * hardware prefetch runoff. We don't have a recipe for load patterns to + * reliably avoid the prefetcher. + */ + l1d_flush_fallback_area = memblock_alloc_try_nid(l1d_size * 2, + l1d_size, MEMBLOCK_LOW_LIMIT, + limit, NUMA_NO_NODE); + if (!l1d_flush_fallback_area) + panic("%s: Failed to allocate %llu bytes align=0x%llx max_addr=%pa\n", + __func__, l1d_size * 2, l1d_size, &limit); + + + for_each_possible_cpu(cpu) { + struct paca_struct *paca = paca_ptrs[cpu]; + paca->rfi_flush_fallback_area = l1d_flush_fallback_area; + paca->l1d_flush_size = l1d_size; + } +} + +void setup_rfi_flush(enum l1d_flush_type types, bool enable) +{ + if (types & L1D_FLUSH_FALLBACK) { + pr_info("rfi-flush: fallback displacement flush available\n"); + init_fallback_flush(); + } + + if (types & L1D_FLUSH_ORI) + pr_info("rfi-flush: ori type flush available\n"); + + if (types & L1D_FLUSH_MTTRIG) + pr_info("rfi-flush: mttrig type flush available\n"); + + enabled_flush_types = types; + + if (!cpu_mitigations_off() && !no_rfi_flush) + rfi_flush_enable(enable); +} + +void setup_entry_flush(bool enable) +{ + if (cpu_mitigations_off()) + return; + + if (!no_entry_flush) + entry_flush_enable(enable); +} + +void setup_uaccess_flush(bool enable) +{ + if (cpu_mitigations_off()) + return; + + if (!no_uaccess_flush) + uaccess_flush_enable(enable); } #ifdef CONFIG_DEBUG_FS @@ -480,14 +734,24 @@ static int count_cache_flush_set(void *data, u64 val) else return -EINVAL; - toggle_count_cache_flush(enable); + toggle_branch_cache_flush(enable); return 0; } static int count_cache_flush_get(void *data, u64 *val) { - if (count_cache_flush_type == COUNT_CACHE_FLUSH_NONE) + if (count_cache_flush_type == BRANCH_CACHE_FLUSH_NONE) + *val = 0; + else + *val = 1; + + return 0; +} + +static int link_stack_flush_get(void *data, u64 *val) +{ + if (link_stack_flush_type == BRANCH_CACHE_FLUSH_NONE) *val = 0; else *val = 1; @@ -497,14 +761,106 @@ static int count_cache_flush_get(void *data, u64 *val) DEFINE_DEBUGFS_ATTRIBUTE(fops_count_cache_flush, count_cache_flush_get, count_cache_flush_set, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_link_stack_flush, link_stack_flush_get, + count_cache_flush_set, "%llu\n"); static __init int count_cache_flush_debugfs_init(void) { debugfs_create_file_unsafe("count_cache_flush", 0600, - powerpc_debugfs_root, NULL, + arch_debugfs_dir, NULL, &fops_count_cache_flush); + debugfs_create_file_unsafe("link_stack_flush", 0600, + arch_debugfs_dir, NULL, + &fops_link_stack_flush); return 0; } device_initcall(count_cache_flush_debugfs_init); + +static int rfi_flush_set(void *data, u64 val) +{ + bool enable; + + if (val == 1) + enable = true; + else if (val == 0) + enable = false; + else + return -EINVAL; + + /* Only do anything if we're changing state */ + if (enable != rfi_flush) + rfi_flush_enable(enable); + + return 0; +} + +static int rfi_flush_get(void *data, u64 *val) +{ + *val = rfi_flush ? 1 : 0; + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fops_rfi_flush, rfi_flush_get, rfi_flush_set, "%llu\n"); + +static int entry_flush_set(void *data, u64 val) +{ + bool enable; + + if (val == 1) + enable = true; + else if (val == 0) + enable = false; + else + return -EINVAL; + + /* Only do anything if we're changing state */ + if (enable != entry_flush) + entry_flush_enable(enable); + + return 0; +} + +static int entry_flush_get(void *data, u64 *val) +{ + *val = entry_flush ? 1 : 0; + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fops_entry_flush, entry_flush_get, entry_flush_set, "%llu\n"); + +static int uaccess_flush_set(void *data, u64 val) +{ + bool enable; + + if (val == 1) + enable = true; + else if (val == 0) + enable = false; + else + return -EINVAL; + + /* Only do anything if we're changing state */ + if (enable != uaccess_flush) + uaccess_flush_enable(enable); + + return 0; +} + +static int uaccess_flush_get(void *data, u64 *val) +{ + *val = uaccess_flush ? 1 : 0; + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fops_uaccess_flush, uaccess_flush_get, uaccess_flush_set, "%llu\n"); + +static __init int rfi_flush_debugfs_init(void) +{ + debugfs_create_file("rfi_flush", 0600, arch_debugfs_dir, NULL, &fops_rfi_flush); + debugfs_create_file("entry_flush", 0600, arch_debugfs_dir, NULL, &fops_entry_flush); + debugfs_create_file("uaccess_flush", 0600, arch_debugfs_dir, NULL, &fops_uaccess_flush); + return 0; +} +device_initcall(rfi_flush_debugfs_init); #endif /* CONFIG_DEBUG_FS */ #endif /* CONFIG_PPC_BOOK3S_64 */ diff --git a/arch/powerpc/kernel/secvar-ops.c b/arch/powerpc/kernel/secvar-ops.c index 6a29777d6a2d..19172a2804f0 100644 --- a/arch/powerpc/kernel/secvar-ops.c +++ b/arch/powerpc/kernel/secvar-ops.c @@ -8,10 +8,16 @@ #include <linux/cache.h> #include <asm/secvar.h> +#include <asm/bug.h> -const struct secvar_operations *secvar_ops __ro_after_init; +const struct secvar_operations *secvar_ops __ro_after_init = NULL; -void set_secvar_ops(const struct secvar_operations *ops) +int set_secvar_ops(const struct secvar_operations *ops) { + if (WARN_ON_ONCE(secvar_ops)) + return -EBUSY; + secvar_ops = ops; + + return 0; } diff --git a/arch/powerpc/kernel/secvar-sysfs.c b/arch/powerpc/kernel/secvar-sysfs.c index a0a78aba2083..eb3c053f323f 100644 --- a/arch/powerpc/kernel/secvar-sysfs.c +++ b/arch/powerpc/kernel/secvar-sysfs.c @@ -21,53 +21,48 @@ static struct kset *secvar_kset; static ssize_t format_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - ssize_t rc = 0; - struct device_node *node; - const char *format; + char tmp[32]; + ssize_t len = secvar_ops->format(tmp, sizeof(tmp)); - node = of_find_compatible_node(NULL, NULL, "ibm,secvar-backend"); - if (!of_device_is_available(node)) - return -ENODEV; - - rc = of_property_read_string(node, "format", &format); - if (rc) - return rc; + if (len > 0) + return sysfs_emit(buf, "%s\n", tmp); + else if (len < 0) + pr_err("Error %zd reading format string\n", len); + else + pr_err("Got empty format string from backend\n"); - rc = sprintf(buf, "%s\n", format); - - of_node_put(node); - - return rc; + return -EIO; } static ssize_t size_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - uint64_t dsize; + u64 dsize; int rc; rc = secvar_ops->get(kobj->name, strlen(kobj->name) + 1, NULL, &dsize); if (rc) { - pr_err("Error retrieving %s variable size %d\n", kobj->name, - rc); + if (rc != -ENOENT) + pr_err("Error retrieving %s variable size %d\n", kobj->name, rc); return rc; } - return sprintf(buf, "%llu\n", dsize); + return sysfs_emit(buf, "%llu\n", dsize); } static ssize_t data_read(struct file *filep, struct kobject *kobj, struct bin_attribute *attr, char *buf, loff_t off, size_t count) { - uint64_t dsize; char *data; + u64 dsize; int rc; rc = secvar_ops->get(kobj->name, strlen(kobj->name) + 1, NULL, &dsize); if (rc) { - pr_err("Error getting %s variable size %d\n", kobj->name, rc); + if (rc != -ENOENT) + pr_err("Error getting %s variable size %d\n", kobj->name, rc); return rc; } pr_debug("dsize is %llu\n", dsize); @@ -138,34 +133,58 @@ static struct kobj_type secvar_ktype = { static int update_kobj_size(void) { - struct device_node *node; u64 varsize; - int rc = 0; + int rc = secvar_ops->max_size(&varsize); - node = of_find_compatible_node(NULL, NULL, "ibm,secvar-backend"); - if (!of_device_is_available(node)) { - rc = -ENODEV; - goto out; - } - - rc = of_property_read_u64(node, "max-var-size", &varsize); if (rc) - goto out; + return rc; data_attr.size = varsize; update_attr.size = varsize; -out: - of_node_put(node); + return 0; +} - return rc; +static int secvar_sysfs_config(struct kobject *kobj) +{ + struct attribute_group config_group = { + .name = "config", + .attrs = (struct attribute **)secvar_ops->config_attrs, + }; + + if (secvar_ops->config_attrs) + return sysfs_create_group(kobj, &config_group); + + return 0; +} + +static int add_var(const char *name) +{ + struct kobject *kobj; + int rc; + + kobj = kzalloc(sizeof(*kobj), GFP_KERNEL); + if (!kobj) + return -ENOMEM; + + kobject_init(kobj, &secvar_ktype); + + rc = kobject_add(kobj, &secvar_kset->kobj, "%s", name); + if (rc) { + pr_warn("kobject_add error %d for attribute: %s\n", rc, + name); + kobject_put(kobj); + return rc; + } + + kobject_uevent(kobj, KOBJ_ADD); + return 0; } static int secvar_sysfs_load(void) { + u64 namesize = 0; char *name; - uint64_t namesize = 0; - struct kobject *kobj; int rc; name = kzalloc(NAME_MAX_SIZE, GFP_KERNEL); @@ -176,73 +195,99 @@ static int secvar_sysfs_load(void) rc = secvar_ops->get_next(name, &namesize, NAME_MAX_SIZE); if (rc) { if (rc != -ENOENT) - pr_err("error getting secvar from firmware %d\n", - rc); - break; - } + pr_err("error getting secvar from firmware %d\n", rc); + else + rc = 0; - kobj = kzalloc(sizeof(*kobj), GFP_KERNEL); - if (!kobj) { - rc = -ENOMEM; break; } - kobject_init(kobj, &secvar_ktype); - - rc = kobject_add(kobj, &secvar_kset->kobj, "%s", name); - if (rc) { - pr_warn("kobject_add error %d for attribute: %s\n", rc, - name); - kobject_put(kobj); - kobj = NULL; - } - - if (kobj) - kobject_uevent(kobj, KOBJ_ADD); - + rc = add_var(name); } while (!rc); kfree(name); return rc; } +static int secvar_sysfs_load_static(void) +{ + const char * const *name_ptr = secvar_ops->var_names; + int rc; + + while (*name_ptr) { + rc = add_var(*name_ptr); + if (rc) + return rc; + name_ptr++; + } + + return 0; +} + static int secvar_sysfs_init(void) { + u64 max_size; int rc; if (!secvar_ops) { - pr_warn("secvar: failed to retrieve secvar operations.\n"); + pr_warn("Failed to retrieve secvar operations\n"); return -ENODEV; } secvar_kobj = kobject_create_and_add("secvar", firmware_kobj); if (!secvar_kobj) { - pr_err("secvar: Failed to create firmware kobj\n"); + pr_err("Failed to create firmware kobj\n"); return -ENOMEM; } rc = sysfs_create_file(secvar_kobj, &format_attr.attr); if (rc) { - kobject_put(secvar_kobj); - return -ENOMEM; + pr_err("Failed to create format object\n"); + rc = -ENOMEM; + goto err; } secvar_kset = kset_create_and_add("vars", NULL, secvar_kobj); if (!secvar_kset) { - pr_err("secvar: sysfs kobject registration failed.\n"); - kobject_put(secvar_kobj); - return -ENOMEM; + pr_err("sysfs kobject registration failed\n"); + rc = -ENOMEM; + goto err; } rc = update_kobj_size(); if (rc) { pr_err("Cannot read the size of the attribute\n"); - return rc; + goto err; + } + + rc = secvar_sysfs_config(secvar_kobj); + if (rc) { + pr_err("Failed to create config directory\n"); + goto err; } - secvar_sysfs_load(); + if (secvar_ops->get_next) + rc = secvar_sysfs_load(); + else + rc = secvar_sysfs_load_static(); + + if (rc) { + pr_err("Failed to create variable attributes\n"); + goto err; + } + + // Due to sysfs limitations, we will only ever get a write buffer of + // up to 1 page in size. Print a warning if this is potentially going + // to cause problems, so that the user is aware. + secvar_ops->max_size(&max_size); + if (max_size > PAGE_SIZE) + pr_warn_ratelimited("PAGE_SIZE (%lu) is smaller than maximum object size (%llu), writes are limited to PAGE_SIZE\n", + PAGE_SIZE, max_size); return 0; +err: + kobject_put(secvar_kobj); + return rc; } late_initcall(secvar_sysfs_init); diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 488f1eecc0de..9b142b9d5187 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -9,6 +9,7 @@ #undef DEBUG #include <linux/export.h> +#include <linux/panic_notifier.h> #include <linux/string.h> #include <linux/sched.h> #include <linux/init.h> @@ -17,27 +18,27 @@ #include <linux/delay.h> #include <linux/initrd.h> #include <linux/platform_device.h> +#include <linux/printk.h> #include <linux/seq_file.h> #include <linux/ioport.h> #include <linux/console.h> -#include <linux/screen_info.h> #include <linux/root_dev.h> -#include <linux/notifier.h> #include <linux/cpu.h> #include <linux/unistd.h> +#include <linux/seq_buf.h> #include <linux/serial.h> #include <linux/serial_8250.h> #include <linux/percpu.h> #include <linux/memblock.h> -#include <linux/of_platform.h> +#include <linux/of.h> +#include <linux/of_fdt.h> +#include <linux/of_irq.h> #include <linux/hugetlb.h> -#include <asm/debugfs.h> +#include <linux/pgtable.h> #include <asm/io.h> #include <asm/paca.h> -#include <asm/prom.h> #include <asm/processor.h> #include <asm/vdso_datapage.h> -#include <asm/pgtable.h> #include <asm/smp.h> #include <asm/elf.h> #include <asm/machdep.h> @@ -57,6 +58,7 @@ #include <asm/xmon.h> #include <asm/cputhreads.h> #include <mm/mmu_decl.h> +#include <asm/archrandom.h> #include <asm/fadump.h> #include <asm/udbg.h> #include <asm/hugetlb.h> @@ -64,11 +66,11 @@ #include <asm/mmu_context.h> #include <asm/cpu_has_feature.h> #include <asm/kasan.h> +#include <asm/mce.h> #include "setup.h" #ifdef DEBUG -#include <asm/udbg.h> #define DBG(fmt...) udbg_printf(fmt) #else #define DBG(fmt...) @@ -84,31 +86,16 @@ EXPORT_SYMBOL(machine_id); int boot_cpuid = -1; EXPORT_SYMBOL_GPL(boot_cpuid); +#ifdef CONFIG_PPC64 +int boot_cpu_hwid = -1; +#endif + /* * These are used in binfmt_elf.c to put aux entries on the stack * for each elf executable being started. */ int dcache_bsize; int icache_bsize; -int ucache_bsize; - - -unsigned long klimit = (unsigned long) _end; - -/* - * This still seems to be needed... -- paulus - */ -struct screen_info screen_info = { - .orig_x = 0, - .orig_y = 25, - .orig_video_cols = 80, - .orig_video_lines = 25, - .orig_video_isVGA = 1, - .orig_video_points = 16 -}; -#if defined(CONFIG_FB_VGA16_MODULE) -EXPORT_SYMBOL(screen_info); -#endif /* Variables required to store legacy IO irq routing */ int of_i8042_kbd_irq; @@ -165,9 +152,7 @@ void machine_restart(char *cmd) void machine_power_off(void) { machine_shutdown(); - if (pm_power_off) - pm_power_off(); - + do_kernel_power_off(); smp_send_stop(); machine_hang(); } @@ -177,6 +162,14 @@ EXPORT_SYMBOL_GPL(machine_power_off); void (*pm_power_off)(void); EXPORT_SYMBOL_GPL(pm_power_off); +size_t __must_check arch_get_random_seed_longs(unsigned long *v, size_t max_longs) +{ + if (max_longs && ppc_md.get_random_seed && ppc_md.get_random_seed(v)) + return 1; + return 0; +} +EXPORT_SYMBOL(arch_get_random_seed_longs); + void machine_halt(void) { machine_shutdown(); @@ -239,18 +232,17 @@ static int show_cpuinfo(struct seq_file *m, void *v) maj = (pvr >> 8) & 0xFF; min = pvr & 0xFF; - seq_printf(m, "processor\t: %lu\n", cpu_id); - seq_printf(m, "cpu\t\t: "); + seq_printf(m, "processor\t: %lu\ncpu\t\t: ", cpu_id); if (cur_cpu_spec->pvr_mask && cur_cpu_spec->cpu_name) - seq_printf(m, "%s", cur_cpu_spec->cpu_name); + seq_puts(m, cur_cpu_spec->cpu_name); else seq_printf(m, "unknown (%08x)", pvr); if (cpu_has_feature(CPU_FTR_ALTIVEC)) - seq_printf(m, ", altivec supported"); + seq_puts(m, ", altivec supported"); - seq_printf(m, "\n"); + seq_putc(m, '\n'); #ifdef CONFIG_TAU if (cpu_has_feature(CPU_FTR_TAU)) { @@ -283,11 +275,8 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "clock\t\t: %lu.%06luMHz\n", proc_freq / 1000000, proc_freq % 1000000); - if (ppc_md.show_percpuinfo != NULL) - ppc_md.show_percpuinfo(m, cpu_id); - /* If we are a Freescale core do a simple check so - * we dont have to keep adding cases in the future */ + * we don't have to keep adding cases in the future */ if (PVR_VER(pvr) & 0x8000) { switch (PVR_VER(pvr)) { case 0x8000: /* 7441/7450/7451, Voyager */ @@ -306,15 +295,12 @@ static int show_cpuinfo(struct seq_file *m, void *v) } } else { switch (PVR_VER(pvr)) { - case 0x0020: /* 403 family */ - maj = PVR_MAJ(pvr) + 1; - min = PVR_MIN(pvr); - break; case 0x1008: /* 740P/750P ?? */ maj = ((pvr >> 8) & 0xFF) - 1; min = pvr & 0xFF; break; case 0x004e: /* POWER9 bits 12-15 give chip type */ + case 0x0080: /* POWER10 bit 12 gives SMT8/4 */ maj = (pvr >> 8) & 0x0F; min = pvr & 0xFF; break; @@ -332,7 +318,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "bogomips\t: %lu.%02lu\n", loops_per_jiffy / (500000 / HZ), (loops_per_jiffy / (5000 / HZ)) % 100); - seq_printf(m, "\n"); + seq_putc(m, '\n'); /* If this is the last cpu, print the summary */ if (cpumask_next(cpu_id, cpu_online_mask) >= nr_cpu_ids) @@ -467,8 +453,8 @@ void __init smp_setup_cpu_maps(void) intserv = of_get_property(dn, "ibm,ppc-interrupt-server#s", &len); if (intserv) { - DBG(" ibm,ppc-interrupt-server#s -> %d threads\n", - nthreads); + DBG(" ibm,ppc-interrupt-server#s -> %lu threads\n", + (len / sizeof(int))); } else { DBG(" no ibm,ppc-interrupt-server#s -> 1 thread\n"); intserv = of_get_property(dn, "reg", &len); @@ -593,7 +579,15 @@ static __init int add_pcspkr(void) device_initcall(add_pcspkr); #endif /* CONFIG_PCSPKR_PLATFORM */ -void probe_machine(void) +static char ppc_hw_desc_buf[128] __initdata; + +struct seq_buf ppc_hw_desc __initdata = { + .buffer = ppc_hw_desc_buf, + .size = sizeof(ppc_hw_desc_buf), + .len = 0, +}; + +static __init void probe_machine(void) { extern struct machdep_calls __machine_desc_start; extern struct machdep_calls __machine_desc_end; @@ -619,13 +613,14 @@ void probe_machine(void) for (machine_id = &__machine_desc_start; machine_id < &__machine_desc_end; machine_id++) { - DBG(" %s ...", machine_id->name); + DBG(" %s ...\n", machine_id->name); + if (machine_id->compatible && !of_machine_is_compatible(machine_id->compatible)) + continue; memcpy(&ppc_md, machine_id, sizeof(struct machdep_calls)); - if (ppc_md.probe()) { - DBG(" match !\n"); - break; - } - DBG("\n"); + if (ppc_md.probe && !ppc_md.probe()) + continue; + DBG(" %s match !\n", machine_id->name); + break; } /* What can we do if we didn't find ? */ if (machine_id >= &__machine_desc_end) { @@ -633,7 +628,13 @@ void probe_machine(void) for (;;); } - printk(KERN_INFO "Using %s machine description\n", ppc_md.name); + // Append the machine name to other info we've gathered + seq_buf_puts(&ppc_hw_desc, ppc_md.name); + + // Set the generic hardware description shown in oopses + dump_stack_set_arch_desc(ppc_hw_desc.buffer); + + pr_info("Hardware name: %s\n", ppc_hw_desc.buffer); } /* Match a class of boards, not a specific device configuration. */ @@ -691,8 +692,25 @@ int check_legacy_ioport(unsigned long base_port) } EXPORT_SYMBOL(check_legacy_ioport); -static int ppc_panic_event(struct notifier_block *this, - unsigned long event, void *ptr) +/* + * Panic notifiers setup + * + * We have 3 notifiers for powerpc, each one from a different "nature": + * + * - ppc_panic_fadump_handler() is a hypervisor notifier, which hard-disables + * IRQs and deal with the Firmware-Assisted dump, when it is configured; + * should run early in the panic path. + * + * - dump_kernel_offset() is an informative notifier, just showing the KASLR + * offset if we have RANDOMIZE_BASE set. + * + * - ppc_panic_platform_handler() is a low-level handler that's registered + * only if the platform wishes to perform final actions in the panic path, + * hence it should run late and might not even return. Currently, only + * pseries and ps3 platforms register callbacks. + */ +static int ppc_panic_fadump_handler(struct notifier_block *this, + unsigned long event, void *ptr) { /* * panic does a local_irq_disable, but we really @@ -702,45 +720,63 @@ static int ppc_panic_event(struct notifier_block *this, /* * If firmware-assisted dump has been registered then trigger - * firmware-assisted dump and let firmware handle everything else. + * its callback and let the firmware handles everything else. */ crash_fadump(NULL, ptr); - if (ppc_md.panic) - ppc_md.panic(ptr); /* May not return */ + return NOTIFY_DONE; } -static struct notifier_block ppc_panic_block = { - .notifier_call = ppc_panic_event, - .priority = INT_MIN /* may not return; must be done last */ -}; - -/* - * Dump out kernel offset information on panic. - */ static int dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p) { pr_emerg("Kernel Offset: 0x%lx from 0x%lx\n", kaslr_offset(), KERNELBASE); - return 0; + return NOTIFY_DONE; } +static int ppc_panic_platform_handler(struct notifier_block *this, + unsigned long event, void *ptr) +{ + /* + * This handler is only registered if we have a panic callback + * on ppc_md, hence NULL check is not needed. + * Also, it may not return, so it runs really late on panic path. + */ + ppc_md.panic(ptr); + + return NOTIFY_DONE; +} + +static struct notifier_block ppc_fadump_block = { + .notifier_call = ppc_panic_fadump_handler, + .priority = INT_MAX, /* run early, to notify the firmware ASAP */ +}; + static struct notifier_block kernel_offset_notifier = { - .notifier_call = dump_kernel_offset + .notifier_call = dump_kernel_offset, +}; + +static struct notifier_block ppc_panic_block = { + .notifier_call = ppc_panic_platform_handler, + .priority = INT_MIN, /* may not return; must be done last */ }; void __init setup_panic(void) { + /* Hard-disables IRQs + deal with FW-assisted dump (fadump) */ + atomic_notifier_chain_register(&panic_notifier_list, + &ppc_fadump_block); + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && kaslr_offset() > 0) atomic_notifier_chain_register(&panic_notifier_list, &kernel_offset_notifier); - /* PPC64 always does a hard irq disable in its panic handler */ - if (!IS_ENABLED(CONFIG_PPC64) && !ppc_md.panic) - return; - atomic_notifier_chain_register(&panic_notifier_list, &ppc_panic_block); + /* Low-level platform-specific routines that should run on panic */ + if (ppc_md.panic) + atomic_notifier_chain_register(&panic_notifier_list, + &ppc_panic_block); } #ifdef CONFIG_CHECK_CACHE_COHERENCY @@ -780,19 +816,6 @@ static int __init check_cache_coherency(void) late_initcall(check_cache_coherency); #endif /* CONFIG_CHECK_CACHE_COHERENCY */ -#ifdef CONFIG_DEBUG_FS -struct dentry *powerpc_debugfs_root; -EXPORT_SYMBOL(powerpc_debugfs_root); - -static int powerpc_debugfs_init(void) -{ - powerpc_debugfs_root = debugfs_create_dir("powerpc", NULL); - - return powerpc_debugfs_root == NULL; -} -arch_initcall(powerpc_debugfs_init); -#endif - void ppc_printk_progress(char *s, unsigned short hex) { pr_info("%s\n", s); @@ -806,8 +829,6 @@ static __init void print_system_info(void) pr_info("dcache_bsize = 0x%x\n", dcache_bsize); pr_info("icache_bsize = 0x%x\n", icache_bsize); - if (ucache_bsize != 0) - pr_info("ucache_bsize = 0x%x\n", ucache_bsize); pr_info("cpu_features = 0x%016lx\n", cur_cpu_spec->cpu_features); pr_info(" possible = 0x%016lx\n", @@ -837,7 +858,7 @@ static __init void print_system_info(void) } #ifdef CONFIG_SMP -static void smp_setup_pacas(void) +static void __init smp_setup_pacas(void) { int cpu; @@ -848,7 +869,7 @@ static void smp_setup_pacas(void) set_hard_smp_processor_id(cpu, cpu_to_phys_id[cpu]); } - memblock_free(__pa(cpu_to_phys_id), nr_cpu_ids * sizeof(u32)); + memblock_free(cpu_to_phys_id, nr_cpu_ids * sizeof(u32)); cpu_to_phys_id = NULL; } #endif @@ -910,6 +931,8 @@ void __init setup_arch(char **cmdline_p) /* Parse memory topology */ mem_topology_setup(); + /* Set max_mapnr before paging_init() */ + set_max_mapnr(max_pfn); /* * Release secondary cpus out of their spinloops at 0x60 now that @@ -923,34 +946,37 @@ void __init setup_arch(char **cmdline_p) /* On BookE, setup per-core TLB data structures. */ setup_tlb_core_data(); - - smp_release_cpus(); #endif /* Print various info about the machine that has been gathered so far. */ print_system_info(); - /* Reserve large chunks of memory for use by CMA for KVM. */ - kvm_cma_reserve(); - klp_init_thread_info(&init_task); - init_mm.start_code = (unsigned long)_stext; - init_mm.end_code = (unsigned long) _etext; - init_mm.end_data = (unsigned long) _edata; - init_mm.brk = klimit; - + setup_initial_init_mm(_stext, _etext, _edata, _end); + /* sched_init() does the mmgrab(&init_mm) for the primary CPU */ + VM_WARN_ON(cpumask_test_cpu(smp_processor_id(), mm_cpumask(&init_mm))); + cpumask_set_cpu(smp_processor_id(), mm_cpumask(&init_mm)); + inc_mm_active_cpus(&init_mm); mm_iommu_init(&init_mm); + irqstack_early_init(); exc_lvl_early_init(); emergency_stack_init(); + mce_init(); + smp_release_cpus(); + initmem_init(); - early_memtest(min_low_pfn << PAGE_SHIFT, max_low_pfn << PAGE_SHIFT); + /* + * Reserve large chunks of memory for use by CMA for KVM and hugetlb. These must + * be called after initmem_init(), so that pageblock_order is initialised. + */ + kvm_cma_reserve(); + gigantic_hugetlb_cma_reserve(); - if (IS_ENABLED(CONFIG_DUMMY_CONSOLE)) - conswitchp = &dummy_con; + early_memtest(min_low_pfn << PAGE_SHIFT, max_low_pfn << PAGE_SHIFT); if (ppc_md.setup_arch) ppc_md.setup_arch(); diff --git a/arch/powerpc/kernel/setup.h b/arch/powerpc/kernel/setup.h index c82577c4b15d..7912bb50a7cb 100644 --- a/arch/powerpc/kernel/setup.h +++ b/arch/powerpc/kernel/setup.h @@ -14,31 +14,31 @@ void irqstack_early_init(void); #ifdef CONFIG_PPC32 void setup_power_save(void); #else -static inline void setup_power_save(void) { }; +static inline void setup_power_save(void) { } #endif #if defined(CONFIG_PPC64) && defined(CONFIG_SMP) void check_smt_enabled(void); #else -static inline void check_smt_enabled(void) { }; +static inline void check_smt_enabled(void) { } #endif -#if defined(CONFIG_PPC_BOOK3E) && defined(CONFIG_SMP) +#if defined(CONFIG_PPC_BOOK3E_64) && defined(CONFIG_SMP) void setup_tlb_core_data(void); #else -static inline void setup_tlb_core_data(void) { }; +static inline void setup_tlb_core_data(void) { } #endif -#if defined(CONFIG_PPC_BOOK3E) || defined(CONFIG_BOOKE) || defined(CONFIG_40x) +#ifdef CONFIG_BOOKE_OR_40x void exc_lvl_early_init(void); #else -static inline void exc_lvl_early_init(void) { }; +static inline void exc_lvl_early_init(void) { } #endif -#ifdef CONFIG_PPC64 +#if defined(CONFIG_PPC64) || defined(CONFIG_VMAP_STACK) void emergency_stack_init(void); #else -static inline void emergency_stack_init(void) { }; +static inline void emergency_stack_init(void) { } #endif #ifdef CONFIG_PPC64 @@ -55,7 +55,7 @@ extern unsigned long spr_default_dscr; #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE void kvm_cma_reserve(void); #else -static inline void kvm_cma_reserve(void) { }; +static inline void kvm_cma_reserve(void) { } #endif #ifdef CONFIG_TAU diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c index dcffe927f5b9..b761cc1a403c 100644 --- a/arch/powerpc/kernel/setup_32.c +++ b/arch/powerpc/kernel/setup_32.c @@ -19,11 +19,12 @@ #include <linux/memblock.h> #include <linux/export.h> #include <linux/nvram.h> +#include <linux/pgtable.h> +#include <linux/of_fdt.h> +#include <linux/irq.h> #include <asm/io.h> -#include <asm/prom.h> #include <asm/processor.h> -#include <asm/pgtable.h> #include <asm/setup.h> #include <asm/smp.h> #include <asm/elf.h> @@ -58,7 +59,6 @@ EXPORT_SYMBOL_GPL(boot_cpuid_phys); int smp_hw_index[NR_CPUS]; EXPORT_SYMBOL(smp_hw_index); -unsigned long ISA_DMA_THRESHOLD; unsigned int DMA_MODE_READ; unsigned int DMA_MODE_WRITE; @@ -75,20 +75,20 @@ EXPORT_SYMBOL(DMA_MODE_WRITE); */ notrace void __init machine_init(u64 dt_ptr) { - unsigned int *addr = (unsigned int *)patch_site_addr(&patch__memset_nocache); - unsigned long insn; + u32 *addr = (u32 *)patch_site_addr(&patch__memset_nocache); + ppc_inst_t insn; /* Configure static keys first, now that we're relocated. */ setup_feature_keys(); - early_ioremap_setup(); + early_ioremap_init(); /* Enable early debugging if any specified (see udbg.h) */ udbg_early_init(); - patch_instruction_site(&patch__memcpy_nocache, PPC_INST_NOP); + patch_instruction_site(&patch__memcpy_nocache, ppc_inst(PPC_RAW_NOP())); - insn = create_cond_branch(addr, branch_target(addr), 0x820000); + create_cond_branch(&insn, addr, branch_target(addr), 0x820000); patch_instruction(addr, insn); /* replace b by bne cr0 */ /* Do some early initialization based on the flat device tree */ @@ -140,7 +140,7 @@ arch_initcall(ppc_init); static void *__init alloc_stack(void) { - void *ptr = memblock_alloc(THREAD_SIZE, THREAD_SIZE); + void *ptr = memblock_alloc(THREAD_SIZE, THREAD_ALIGN); if (!ptr) panic("cannot allocate %d bytes for stack at %pS\n", @@ -153,6 +153,9 @@ void __init irqstack_early_init(void) { unsigned int i; + if (IS_ENABLED(CONFIG_VMAP_STACK)) + return; + /* interrupt stacks must be in lowmem, we get that for free on ppc32 * as the memblock is limited to lowmem by default */ for_each_possible_cpu(i) { @@ -161,7 +164,19 @@ void __init irqstack_early_init(void) } } -#if defined(CONFIG_BOOKE) || defined(CONFIG_40x) +#ifdef CONFIG_VMAP_STACK +void *emergency_ctx[NR_CPUS] __ro_after_init = {[0] = &init_stack}; + +void __init emergency_stack_init(void) +{ + unsigned int i; + + for_each_possible_cpu(i) + emergency_ctx[i] = alloc_stack(); +} +#endif + +#ifdef CONFIG_BOOKE_OR_40x void __init exc_lvl_early_init(void) { unsigned int i, hw_cpu; @@ -192,7 +207,7 @@ void __init setup_power_save(void) ppc_md.power_save = ppc6xx_idle; #endif -#ifdef CONFIG_E500 +#ifdef CONFIG_PPC_E500 if (cpu_has_feature(CPU_FTR_CAN_DOZE) || cpu_has_feature(CPU_FTR_CAN_NAP)) ppc_md.power_save = e500_idle; @@ -208,7 +223,4 @@ __init void initialize_cache_info(void) */ dcache_bsize = cur_cpu_spec->dcache_bsize; icache_bsize = cur_cpu_spec->icache_bsize; - ucache_bsize = 0; - if (IS_ENABLED(CONFIG_PPC_BOOK3S_601) || IS_ENABLED(CONFIG_E200)) - ucache_bsize = icache_bsize = dcache_bsize; } diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 6104917a282d..2f19d5e94485 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -30,13 +30,15 @@ #include <linux/lockdep.h> #include <linux/memory.h> #include <linux/nmi.h> +#include <linux/pgtable.h> +#include <linux/of.h> +#include <linux/of_fdt.h> -#include <asm/debugfs.h> +#include <asm/asm-prototypes.h> +#include <asm/kvm_guest.h> #include <asm/io.h> #include <asm/kdump.h> -#include <asm/prom.h> #include <asm/processor.h> -#include <asm/pgtable.h> #include <asm/smp.h> #include <asm/elf.h> #include <asm/machdep.h> @@ -59,13 +61,14 @@ #include <asm/udbg.h> #include <asm/kexec.h> #include <asm/code-patching.h> -#include <asm/livepatch.h> +#include <asm/ftrace.h> #include <asm/opal.h> #include <asm/cputhreads.h> #include <asm/hw_irq.h> #include <asm/feature-fixups.h> #include <asm/kup.h> #include <asm/early_ioremap.h> +#include <asm/pgalloc.h> #include "setup.h" @@ -84,7 +87,7 @@ struct ppc64_caches ppc64_caches = { }; EXPORT_SYMBOL_GPL(ppc64_caches); -#if defined(CONFIG_PPC_BOOK3E) && defined(CONFIG_SMP) +#if defined(CONFIG_PPC_BOOK3E_64) && defined(CONFIG_SMP) void __init setup_tlb_core_data(void) { int cpu; @@ -111,7 +114,6 @@ void __init setup_tlb_core_data(void) * Should we panic instead? */ WARN_ONCE(smt_enabled_at_boot >= 2 && - !mmu_has_feature(MMU_FTR_USE_TLBRSRV) && book3e_htw_mode != PPC_HTW_E6500, "%s: unsupported MMU configuration\n", __func__); } @@ -175,14 +177,26 @@ early_param("smt-enabled", early_smt_enabled); #endif /* CONFIG_SMP */ /** Fix up paca fields required for the boot cpu */ -static void __init fixup_boot_paca(void) +static void __init fixup_boot_paca(struct paca_struct *boot_paca) { /* The boot cpu is started */ - get_paca()->cpu_start = 1; + boot_paca->cpu_start = 1; +#ifdef CONFIG_PPC_BOOK3S_64 + /* + * Give the early boot machine check stack somewhere to use, use + * half of the init stack. This is a bit hacky but there should not be + * deep stack usage in early init so shouldn't overflow it or overwrite + * things. + */ + boot_paca->mc_emergency_sp = (void *)&init_thread_union + + (THREAD_SIZE/2); +#endif /* Allow percpu accesses to work until we setup percpu data */ - get_paca()->data_offset = 0; - /* Mark interrupts disabled in PACA */ - irq_soft_mask_set(IRQS_DISABLED); + boot_paca->data_offset = 0; + /* Mark interrupts soft and hard disabled in PACA */ + boot_paca->irq_soft_mask = IRQS_DISABLED; + boot_paca->irq_happened = PACA_IRQ_HARD_DIS; + WARN_ON(mfmsr() & MSR_EE); } static void __init configure_exceptions(void) @@ -195,8 +209,39 @@ static void __init configure_exceptions(void) /* Under a PAPR hypervisor, we need hypercalls */ if (firmware_has_feature(FW_FEATURE_SET_MODE)) { + /* + * - PR KVM does not support AIL mode interrupts in the host + * while a PR guest is running. + * + * - SCV system call interrupt vectors are only implemented for + * AIL mode interrupts. + * + * - On pseries, AIL mode can only be enabled and disabled + * system-wide so when a PR VM is created on a pseries host, + * all CPUs of the host are set to AIL=0 mode. + * + * - Therefore host CPUs must not execute scv while a PR VM + * exists. + * + * - SCV support can not be disabled dynamically because the + * feature is advertised to host userspace. Disabling the + * facility and emulating it would be possible but is not + * implemented. + * + * - So SCV support is blanket disabled if PR KVM could possibly + * run. That is, PR support compiled in, booting on pseries + * with hash MMU. + */ + if (IS_ENABLED(CONFIG_KVM_BOOK3S_PR_POSSIBLE) && !radix_enabled()) { + init_task.thread.fscr &= ~FSCR_SCV; + cur_cpu_spec->cpu_user_features2 &= ~PPC_FEATURE2_SCV; + } + /* Enable AIL if possible */ - pseries_enable_reloc_on_exc(); + if (!pseries_enable_reloc_on_exc()) { + init_task.thread.fscr &= ~FSCR_SCV; + cur_cpu_spec->cpu_user_features2 &= ~PPC_FEATURE2_SCV; + } /* * Tell the hypervisor that we want our exceptions to @@ -227,10 +272,23 @@ static void cpu_ready_for_interrupts(void) * If we are not in hypervisor mode the job is done once for * the whole partition in configure_exceptions(). */ - if (cpu_has_feature(CPU_FTR_HVMODE) && - cpu_has_feature(CPU_FTR_ARCH_207S)) { + if (cpu_has_feature(CPU_FTR_HVMODE)) { unsigned long lpcr = mfspr(SPRN_LPCR); - mtspr(SPRN_LPCR, lpcr | LPCR_AIL_3); + unsigned long new_lpcr = lpcr; + + if (cpu_has_feature(CPU_FTR_ARCH_31)) { + /* P10 DD1 does not have HAIL */ + if (pvr_version_is(PVR_POWER10) && + (mfspr(SPRN_PVR) & 0xf00) == 0x100) + new_lpcr |= LPCR_AIL_3; + else + new_lpcr |= LPCR_HAIL; + } else if (cpu_has_feature(CPU_FTR_ARCH_207S)) { + new_lpcr |= LPCR_AIL_3; + } + + if (new_lpcr != lpcr) + mtspr(SPRN_LPCR, new_lpcr); } /* @@ -254,7 +312,7 @@ static void cpu_ready_for_interrupts(void) unsigned long spr_default_dscr = 0; -void __init record_spr_defaults(void) +static void __init record_spr_defaults(void) { if (early_cpu_has_feature(CPU_FTR_DSCR)) spr_default_dscr = mfspr(SPRN_DSCR); @@ -285,18 +343,40 @@ void __init early_setup(unsigned long dt_ptr) /* -------- printk is _NOT_ safe to use here ! ------- */ + /* + * Assume we're on cpu 0 for now. + * + * We need to load a PACA very early for a few reasons. + * + * The stack protector canary is stored in the paca, so as soon as we + * call any stack protected code we need r13 pointing somewhere valid. + * + * If we are using kcov it will call in_task() in its instrumentation, + * which relies on the current task from the PACA. + * + * dt_cpu_ftrs_init() calls into generic OF/fdt code, as well as + * printk(), which can trigger both stack protector and kcov. + * + * percpu variables and spin locks also use the paca. + * + * So set up a temporary paca. It will be replaced below once we know + * what CPU we are on. + */ + initialise_paca(&boot_paca, 0); + fixup_boot_paca(&boot_paca); + WARN_ON(local_paca); + setup_paca(&boot_paca); /* install the paca into registers */ + + /* -------- printk is now safe to use ------- */ + + if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && (mfmsr() & MSR_HV)) + enable_machine_check(); + /* Try new device tree based feature discovery ... */ if (!dt_cpu_ftrs_init(__va(dt_ptr))) /* Otherwise use the old style CPU table */ identify_cpu(0, mfspr(SPRN_PVR)); - /* Assume we're on cpu 0 for now. Don't write to the paca yet! */ - initialise_paca(&boot_paca, 0); - setup_paca(&boot_paca); - fixup_boot_paca(); - - /* -------- printk is now safe to use ------- */ - /* Enable early debugging if any specified (see udbg.h) */ udbg_early_init(); @@ -305,17 +385,21 @@ void __init early_setup(unsigned long dt_ptr) /* * Do early initialization using the flattened device * tree, such as retrieving the physical memory map or - * calculating/retrieving the hash table size. + * calculating/retrieving the hash table size, discover + * boot_cpuid and boot_cpu_hwid. */ early_init_devtree(__va(dt_ptr)); - /* Now we know the logical id of our boot cpu, setup the paca. */ - if (boot_cpuid != 0) { - /* Poison paca_ptrs[0] again if it's not the boot cpu */ - memset(&paca_ptrs[0], 0x88, sizeof(paca_ptrs[0])); - } - setup_paca(paca_ptrs[boot_cpuid]); - fixup_boot_paca(); + allocate_paca_ptrs(); + allocate_paca(boot_cpuid); + set_hard_smp_processor_id(boot_cpuid, boot_cpu_hwid); + fixup_boot_paca(paca_ptrs[boot_cpuid]); + setup_paca(paca_ptrs[boot_cpuid]); /* install the paca into registers */ + // smp_processor_id() now reports boot_cpuid + +#ifdef CONFIG_SMP + task_thread_info(current)->cpu = boot_cpuid; // fix task_cpu(current) +#endif /* * Configure exception handlers. This include setting up trampolines @@ -333,11 +417,11 @@ void __init early_setup(unsigned long dt_ptr) apply_feature_fixups(); setup_feature_keys(); - early_ioremap_setup(); - /* Initialize the hash table or TLB handling */ early_init_mmu(); + early_ioremap_setup(); + /* * After firmware and early platform setup code has set things up, * we note the SPR values for configurable control/performance @@ -396,7 +480,7 @@ void early_setup_secondary(void) #endif /* CONFIG_SMP */ -void panic_smp_self_stop(void) +void __noreturn panic_smp_self_stop(void) { hard_irq_disable(); spin_begin(); @@ -463,7 +547,7 @@ void smp_release_cpus(void) * routines and/or provided to userland */ -static void init_cache_info(struct ppc_cache_info *info, u32 size, u32 lsize, +static void __init init_cache_info(struct ppc_cache_info *info, u32 size, u32 lsize, u32 bsize, u32 sets) { info->size = size; @@ -516,6 +600,8 @@ static bool __init parse_cache_info(struct device_node *np, lsizep = of_get_property(np, propnames[3], NULL); if (bsizep == NULL) bsizep = lsizep; + if (lsizep == NULL) + lsizep = bsizep; if (lsizep != NULL) lsize = be32_to_cpu(*lsizep); if (bsizep != NULL) @@ -608,7 +694,7 @@ void __init initialize_cache_info(void) */ __init u64 ppc64_bolted_size(void) { -#ifdef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_BOOK3E_64 /* Freescale BookE bolts the entire linear mapping */ /* XXX: BookE ppc64_rma_limit setup seems to disagree? */ if (early_mmu_has_feature(MMU_FTR_TYPE_FSL_E)) @@ -633,7 +719,7 @@ static void *__init alloc_stack(unsigned long limit, int cpu) BUILD_BUG_ON(STACK_INT_FRAME_SIZE % 16); - ptr = memblock_alloc_try_nid(THREAD_SIZE, THREAD_SIZE, + ptr = memblock_alloc_try_nid(THREAD_SIZE, THREAD_ALIGN, MEMBLOCK_LOW_LIMIT, limit, early_cpu_to_node(cpu)); if (!ptr) @@ -658,7 +744,7 @@ void __init irqstack_early_init(void) } } -#ifdef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_BOOK3E_64 void __init exc_lvl_early_init(void) { unsigned int i; @@ -691,7 +777,7 @@ void __init exc_lvl_early_init(void) */ void __init emergency_stack_init(void) { - u64 limit; + u64 limit, mce_limit; unsigned int i; /* @@ -708,7 +794,16 @@ void __init emergency_stack_init(void) * initialized in kernel/irq.c. These are initialized here in order * to have emergency stacks available as early as possible. */ - limit = min(ppc64_bolted_size(), ppc64_rma_size); + limit = mce_limit = min(ppc64_bolted_size(), ppc64_rma_size); + + /* + * Machine check on pseries calls rtas, but can't use the static + * rtas_args due to a machine check hitting while the lock is held. + * rtas args have to be under 4GB, so the machine check stack is + * limited to 4GB so args can be put on stack. + */ + if (firmware_has_feature(FW_FEATURE_LPAR) && mce_limit > SZ_4G) + mce_limit = SZ_4G; for_each_possible_cpu(i) { paca_ptrs[i]->emergency_sp = alloc_stack(limit, i) + THREAD_SIZE; @@ -718,27 +813,12 @@ void __init emergency_stack_init(void) paca_ptrs[i]->nmi_emergency_sp = alloc_stack(limit, i) + THREAD_SIZE; /* emergency stack for machine check exception handling. */ - paca_ptrs[i]->mc_emergency_sp = alloc_stack(limit, i) + THREAD_SIZE; + paca_ptrs[i]->mc_emergency_sp = alloc_stack(mce_limit, i) + THREAD_SIZE; #endif } } #ifdef CONFIG_SMP -#define PCPU_DYN_SIZE () - -static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align) -{ - return memblock_alloc_try_nid(size, align, __pa(MAX_DMA_ADDRESS), - MEMBLOCK_ALLOC_ACCESSIBLE, - early_cpu_to_node(cpu)); - -} - -static void __init pcpu_fc_free(void *ptr, size_t size) -{ - memblock_free(__pa(ptr), size); -} - static int pcpu_cpu_distance(unsigned int from, unsigned int to) { if (early_cpu_to_node(from) == early_cpu_to_node(to)) @@ -747,6 +827,11 @@ static int pcpu_cpu_distance(unsigned int from, unsigned int to) return REMOTE_DISTANCE; } +static __init int pcpu_cpu_to_node(int cpu) +{ + return early_cpu_to_node(cpu); +} + unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; EXPORT_SYMBOL(__per_cpu_offset); @@ -756,20 +841,38 @@ void __init setup_per_cpu_areas(void) size_t atom_size; unsigned long delta; unsigned int cpu; - int rc; + int rc = -EINVAL; /* - * Linear mapping is one of 4K, 1M and 16M. For 4K, no need - * to group units. For larger mappings, use 1M atom which - * should be large enough to contain a number of units. + * BookE and BookS radix are historical values and should be revisited. */ - if (mmu_linear_psize == MMU_PAGE_4K) + if (IS_ENABLED(CONFIG_PPC_BOOK3E_64)) { + atom_size = SZ_1M; + } else if (radix_enabled()) { atom_size = PAGE_SIZE; - else - atom_size = 1 << 20; + } else if (IS_ENABLED(CONFIG_PPC_64S_HASH_MMU)) { + /* + * Linear mapping is one of 4K, 1M and 16M. For 4K, no need + * to group units. For larger mappings, use 1M atom which + * should be large enough to contain a number of units. + */ + if (mmu_linear_psize == MMU_PAGE_4K) + atom_size = PAGE_SIZE; + else + atom_size = SZ_1M; + } + + if (pcpu_chosen_fc != PCPU_FC_PAGE) { + rc = pcpu_embed_first_chunk(0, dyn_size, atom_size, pcpu_cpu_distance, + pcpu_cpu_to_node); + if (rc) + pr_warn("PERCPU: %s allocator failed (%d), " + "falling back to page size\n", + pcpu_fc_names[pcpu_chosen_fc], rc); + } - rc = pcpu_embed_first_chunk(0, dyn_size, atom_size, pcpu_cpu_distance, - pcpu_fc_alloc, pcpu_fc_free); + if (rc < 0) + rc = pcpu_page_first_chunk(0, pcpu_cpu_to_node); if (rc < 0) panic("cannot initialize percpu area (err=%d)", rc); @@ -781,7 +884,7 @@ void __init setup_per_cpu_areas(void) } #endif -#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE +#ifdef CONFIG_MEMORY_HOTPLUG unsigned long memory_block_size_bytes(void) { if (ppc_md.memory_block_size) @@ -808,161 +911,22 @@ u64 hw_nmi_get_sample_period(int watchdog_thresh) * disable it by default. Book3S has a soft-nmi hardlockup detector based * on the decrementer interrupt, so it does not suffer from this problem. * - * It is likely to get false positives in VM guests, so disable it there - * by default too. + * It is likely to get false positives in KVM guests, so disable it there + * by default too. PowerVM will not stop or arbitrarily oversubscribe + * CPUs, but give a minimum regular allotment even with SPLPAR, so enable + * the detector for non-KVM guests, assume PowerVM. */ static int __init disable_hardlockup_detector(void) { #ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF hardlockup_detector_disable(); #else - if (firmware_has_feature(FW_FEATURE_LPAR)) - hardlockup_detector_disable(); + if (firmware_has_feature(FW_FEATURE_LPAR)) { + if (is_kvm_guest()) + hardlockup_detector_disable(); + } #endif return 0; } early_initcall(disable_hardlockup_detector); - -#ifdef CONFIG_PPC_BOOK3S_64 -static enum l1d_flush_type enabled_flush_types; -static void *l1d_flush_fallback_area; -static bool no_rfi_flush; -bool rfi_flush; - -static int __init handle_no_rfi_flush(char *p) -{ - pr_info("rfi-flush: disabled on command line."); - no_rfi_flush = true; - return 0; -} -early_param("no_rfi_flush", handle_no_rfi_flush); - -/* - * The RFI flush is not KPTI, but because users will see doco that says to use - * nopti we hijack that option here to also disable the RFI flush. - */ -static int __init handle_no_pti(char *p) -{ - pr_info("rfi-flush: disabling due to 'nopti' on command line.\n"); - handle_no_rfi_flush(NULL); - return 0; -} -early_param("nopti", handle_no_pti); - -static void do_nothing(void *unused) -{ - /* - * We don't need to do the flush explicitly, just enter+exit kernel is - * sufficient, the RFI exit handlers will do the right thing. - */ -} - -void rfi_flush_enable(bool enable) -{ - if (enable) { - do_rfi_flush_fixups(enabled_flush_types); - on_each_cpu(do_nothing, NULL, 1); - } else - do_rfi_flush_fixups(L1D_FLUSH_NONE); - - rfi_flush = enable; -} - -static void __ref init_fallback_flush(void) -{ - u64 l1d_size, limit; - int cpu; - - /* Only allocate the fallback flush area once (at boot time). */ - if (l1d_flush_fallback_area) - return; - - l1d_size = ppc64_caches.l1d.size; - - /* - * If there is no d-cache-size property in the device tree, l1d_size - * could be zero. That leads to the loop in the asm wrapping around to - * 2^64-1, and then walking off the end of the fallback area and - * eventually causing a page fault which is fatal. Just default to - * something vaguely sane. - */ - if (!l1d_size) - l1d_size = (64 * 1024); - - limit = min(ppc64_bolted_size(), ppc64_rma_size); - - /* - * Align to L1d size, and size it at 2x L1d size, to catch possible - * hardware prefetch runoff. We don't have a recipe for load patterns to - * reliably avoid the prefetcher. - */ - l1d_flush_fallback_area = memblock_alloc_try_nid(l1d_size * 2, - l1d_size, MEMBLOCK_LOW_LIMIT, - limit, NUMA_NO_NODE); - if (!l1d_flush_fallback_area) - panic("%s: Failed to allocate %llu bytes align=0x%llx max_addr=%pa\n", - __func__, l1d_size * 2, l1d_size, &limit); - - - for_each_possible_cpu(cpu) { - struct paca_struct *paca = paca_ptrs[cpu]; - paca->rfi_flush_fallback_area = l1d_flush_fallback_area; - paca->l1d_flush_size = l1d_size; - } -} - -void setup_rfi_flush(enum l1d_flush_type types, bool enable) -{ - if (types & L1D_FLUSH_FALLBACK) { - pr_info("rfi-flush: fallback displacement flush available\n"); - init_fallback_flush(); - } - - if (types & L1D_FLUSH_ORI) - pr_info("rfi-flush: ori type flush available\n"); - - if (types & L1D_FLUSH_MTTRIG) - pr_info("rfi-flush: mttrig type flush available\n"); - - enabled_flush_types = types; - - if (!no_rfi_flush && !cpu_mitigations_off()) - rfi_flush_enable(enable); -} - -#ifdef CONFIG_DEBUG_FS -static int rfi_flush_set(void *data, u64 val) -{ - bool enable; - - if (val == 1) - enable = true; - else if (val == 0) - enable = false; - else - return -EINVAL; - - /* Only do anything if we're changing state */ - if (enable != rfi_flush) - rfi_flush_enable(enable); - - return 0; -} - -static int rfi_flush_get(void *data, u64 *val) -{ - *val = rfi_flush ? 1 : 0; - return 0; -} - -DEFINE_SIMPLE_ATTRIBUTE(fops_rfi_flush, rfi_flush_get, rfi_flush_set, "%llu\n"); - -static __init int rfi_flush_debugfs_init(void) -{ - debugfs_create_file("rfi_flush", 0600, powerpc_debugfs_root, NULL, &fops_rfi_flush); - return 0; -} -device_initcall(rfi_flush_debugfs_init); -#endif -#endif /* CONFIG_PPC_BOOK3S_64 */ diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c index e6c30cee6abf..aa17e62f3754 100644 --- a/arch/powerpc/kernel/signal.c +++ b/arch/powerpc/kernel/signal.c @@ -1,15 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Common signal handling code for both 32 and 64 bits * * Copyright (c) 2007 Benjamin Herrenschmidt, IBM Corporation * Extracted from signal_32.c and signal_64.c - * - * This file is subject to the terms and conditions of the GNU General - * Public License. See the file README.legal in the main directory of - * this archive for more details. */ -#include <linux/tracehook.h> +#include <linux/resume_user_mode.h> #include <linux/signal.h> #include <linux/uprobes.h> #include <linux/key.h> @@ -18,35 +15,163 @@ #include <linux/syscalls.h> #include <asm/hw_breakpoint.h> #include <linux/uaccess.h> +#include <asm/switch_to.h> #include <asm/unistd.h> #include <asm/debug.h> #include <asm/tm.h> #include "signal.h" +#ifdef CONFIG_VSX +unsigned long copy_fpr_to_user(void __user *to, + struct task_struct *task) +{ + u64 buf[ELF_NFPREG]; + int i; + + /* save FPR copy to local buffer then write to the thread_struct */ + for (i = 0; i < (ELF_NFPREG - 1) ; i++) + buf[i] = task->thread.TS_FPR(i); + buf[i] = task->thread.fp_state.fpscr; + return __copy_to_user(to, buf, ELF_NFPREG * sizeof(double)); +} + +unsigned long copy_fpr_from_user(struct task_struct *task, + void __user *from) +{ + u64 buf[ELF_NFPREG]; + int i; + + if (__copy_from_user(buf, from, ELF_NFPREG * sizeof(double))) + return 1; + for (i = 0; i < (ELF_NFPREG - 1) ; i++) + task->thread.TS_FPR(i) = buf[i]; + task->thread.fp_state.fpscr = buf[i]; + + return 0; +} + +unsigned long copy_vsx_to_user(void __user *to, + struct task_struct *task) +{ + u64 buf[ELF_NVSRHALFREG]; + int i; + + /* save FPR copy to local buffer then write to the thread_struct */ + for (i = 0; i < ELF_NVSRHALFREG; i++) + buf[i] = task->thread.fp_state.fpr[i][TS_VSRLOWOFFSET]; + return __copy_to_user(to, buf, ELF_NVSRHALFREG * sizeof(double)); +} + +unsigned long copy_vsx_from_user(struct task_struct *task, + void __user *from) +{ + u64 buf[ELF_NVSRHALFREG]; + int i; + + if (__copy_from_user(buf, from, ELF_NVSRHALFREG * sizeof(double))) + return 1; + for (i = 0; i < ELF_NVSRHALFREG ; i++) + task->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = buf[i]; + return 0; +} + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +unsigned long copy_ckfpr_to_user(void __user *to, + struct task_struct *task) +{ + u64 buf[ELF_NFPREG]; + int i; + + /* save FPR copy to local buffer then write to the thread_struct */ + for (i = 0; i < (ELF_NFPREG - 1) ; i++) + buf[i] = task->thread.TS_CKFPR(i); + buf[i] = task->thread.ckfp_state.fpscr; + return __copy_to_user(to, buf, ELF_NFPREG * sizeof(double)); +} + +unsigned long copy_ckfpr_from_user(struct task_struct *task, + void __user *from) +{ + u64 buf[ELF_NFPREG]; + int i; + + if (__copy_from_user(buf, from, ELF_NFPREG * sizeof(double))) + return 1; + for (i = 0; i < (ELF_NFPREG - 1) ; i++) + task->thread.TS_CKFPR(i) = buf[i]; + task->thread.ckfp_state.fpscr = buf[i]; + + return 0; +} + +unsigned long copy_ckvsx_to_user(void __user *to, + struct task_struct *task) +{ + u64 buf[ELF_NVSRHALFREG]; + int i; + + /* save FPR copy to local buffer then write to the thread_struct */ + for (i = 0; i < ELF_NVSRHALFREG; i++) + buf[i] = task->thread.ckfp_state.fpr[i][TS_VSRLOWOFFSET]; + return __copy_to_user(to, buf, ELF_NVSRHALFREG * sizeof(double)); +} + +unsigned long copy_ckvsx_from_user(struct task_struct *task, + void __user *from) +{ + u64 buf[ELF_NVSRHALFREG]; + int i; + + if (__copy_from_user(buf, from, ELF_NVSRHALFREG * sizeof(double))) + return 1; + for (i = 0; i < ELF_NVSRHALFREG ; i++) + task->thread.ckfp_state.fpr[i][TS_VSRLOWOFFSET] = buf[i]; + return 0; +} +#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ +#endif + /* Log an error when sending an unhandled signal to a process. Controlled * through debug.exception-trace sysctl. */ int show_unhandled_signals = 1; +unsigned long get_min_sigframe_size(void) +{ + if (IS_ENABLED(CONFIG_PPC64)) + return get_min_sigframe_size_64(); + else + return get_min_sigframe_size_32(); +} + +#ifdef CONFIG_COMPAT +unsigned long get_min_sigframe_size_compat(void) +{ + return get_min_sigframe_size_32(); +} +#endif + /* * Allocate space for the signal frame */ -void __user *get_sigframe(struct ksignal *ksig, unsigned long sp, - size_t frame_size, int is_32) +static unsigned long get_tm_stackpointer(struct task_struct *tsk); + +void __user *get_sigframe(struct ksignal *ksig, struct task_struct *tsk, + size_t frame_size, int is_32) { unsigned long oldsp, newsp; + unsigned long sp = get_tm_stackpointer(tsk); /* Default to using normal stack */ - oldsp = get_clean_sp(sp, is_32); + if (is_32) + oldsp = sp & 0x0ffffffffUL; + else + oldsp = sp; oldsp = sigsp(oldsp, ksig); newsp = (oldsp - frame_size) & ~0xFUL; - /* Check access */ - if (!access_ok((void __user *)newsp, oldsp - newsp)) - return NULL; - return (void __user *)newsp; } @@ -57,12 +182,21 @@ static void check_syscall_restart(struct pt_regs *regs, struct k_sigaction *ka, int restart = 1; /* syscall ? */ - if (TRAP(regs) != 0x0C00) + if (!trap_is_syscall(regs)) + return; + + if (trap_norestart(regs)) return; /* error signalled ? */ - if (!(regs->ccr & 0x10000000)) + if (trap_is_scv(regs)) { + /* 32-bit compat mode sign extend? */ + if (!IS_ERR_VALUE(ret)) + return; + ret = -ret; + } else if (!(regs->ccr & 0x10000000)) { return; + } switch (ret) { case ERESTART_RESTARTBLOCK: @@ -92,12 +226,17 @@ static void check_syscall_restart(struct pt_regs *regs, struct k_sigaction *ka, regs->gpr[0] = __NR_restart_syscall; else regs->gpr[3] = regs->orig_gpr3; - regs->nip -= 4; + regs_add_return_ip(regs, -4); regs->result = 0; } else { - regs->result = -EINTR; - regs->gpr[3] = EINTR; - regs->ccr |= 0x10000000; + if (trap_is_scv(regs)) { + regs->result = -EINTR; + regs->gpr[3] = -EINTR; + } else { + regs->result = -EINTR; + regs->gpr[3] = EINTR; + regs->ccr |= 0x10000000; + } } } @@ -106,7 +245,6 @@ static void do_signal(struct task_struct *tsk) sigset_t *oldset = sigmask_to_save(); struct ksignal ksig = { .sig = 0 }; int ret; - int is32 = is_32bit_task(); BUG_ON(tsk != current); @@ -118,25 +256,30 @@ static void do_signal(struct task_struct *tsk) if (ksig.sig <= 0) { /* No signal to deliver -- put the saved sigmask back */ restore_saved_sigmask(); - tsk->thread.regs->trap = 0; + set_trap_norestart(tsk->thread.regs); return; /* no signals delivered */ } -#ifndef CONFIG_PPC_ADV_DEBUG_REGS /* * Reenable the DABR before delivering the signal to * user space. The DABR will have been cleared if it * triggered inside the kernel. */ - if (tsk->thread.hw_brk.address && tsk->thread.hw_brk.type) - __set_breakpoint(&tsk->thread.hw_brk); -#endif + if (!IS_ENABLED(CONFIG_PPC_ADV_DEBUG_REGS)) { + int i; + + for (i = 0; i < nr_wp_slots(); i++) { + if (tsk->thread.hw_brk[i].address && tsk->thread.hw_brk[i].type) + __set_breakpoint(i, &tsk->thread.hw_brk[i]); + } + } + /* Re-enable the breakpoints for the signal stack */ thread_change_pc(tsk, tsk->thread.regs); rseq_signal_deliver(&ksig, tsk->thread.regs); - if (is32) { + if (is_32bit_task()) { if (ksig.ka.sa.sa_flags & SA_SIGINFO) ret = handle_rt_signal32(&ksig, oldset, tsk); else @@ -145,38 +288,28 @@ static void do_signal(struct task_struct *tsk) ret = handle_rt_signal64(&ksig, oldset, tsk); } - tsk->thread.regs->trap = 0; + set_trap_norestart(tsk->thread.regs); signal_setup_done(ret, &ksig, test_thread_flag(TIF_SINGLESTEP)); } void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags) { - user_exit(); - - /* Check valid addr_limit, TIF check is done there */ - addr_limit_user_check(); - if (thread_info_flags & _TIF_UPROBE) uprobe_notify_resume(regs); if (thread_info_flags & _TIF_PATCH_PENDING) klp_update_patch_state(current); - if (thread_info_flags & _TIF_SIGPENDING) { + if (thread_info_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) { BUG_ON(regs != current->thread.regs); do_signal(current); } - if (thread_info_flags & _TIF_NOTIFY_RESUME) { - clear_thread_flag(TIF_NOTIFY_RESUME); - tracehook_notify_resume(regs); - rseq_handle_notify_resume(NULL, regs); - } - - user_enter(); + if (thread_info_flags & _TIF_NOTIFY_RESUME) + resume_user_mode_work(regs); } -unsigned long get_tm_stackpointer(struct task_struct *tsk) +static unsigned long get_tm_stackpointer(struct task_struct *tsk) { /* When in an active transaction that takes a signal, we need to be * careful with the stack. It's possible that the stack has moved back @@ -199,15 +332,39 @@ unsigned long get_tm_stackpointer(struct task_struct *tsk) * For signals taken in non-TM or suspended mode, we use the * normal/non-checkpointed stack pointer. */ + struct pt_regs *regs = tsk->thread.regs; + unsigned long ret = regs->gpr[1]; #ifdef CONFIG_PPC_TRANSACTIONAL_MEM BUG_ON(tsk != current); - if (MSR_TM_ACTIVE(tsk->thread.regs->msr)) { + if (MSR_TM_ACTIVE(regs->msr)) { + preempt_disable(); tm_reclaim_current(TM_CAUSE_SIGNAL); - if (MSR_TM_TRANSACTIONAL(tsk->thread.regs->msr)) - return tsk->thread.ckpt_regs.gpr[1]; + if (MSR_TM_TRANSACTIONAL(regs->msr)) + ret = tsk->thread.ckpt_regs.gpr[1]; + + /* + * If we treclaim, we must clear the current thread's TM bits + * before re-enabling preemption. Otherwise we might be + * preempted and have the live MSR[TS] changed behind our back + * (tm_recheckpoint_new_task() would recheckpoint). Besides, we + * enter the signal handler in non-transactional state. + */ + regs_set_return_msr(regs, regs->msr & ~MSR_TS_MASK); + preempt_enable(); } #endif - return tsk->thread.regs->gpr[1]; + return ret; +} + +static const char fm32[] = KERN_INFO "%s[%d]: bad frame in %s: %p nip %08lx lr %08lx\n"; +static const char fm64[] = KERN_INFO "%s[%d]: bad frame in %s: %p nip %016lx lr %016lx\n"; + +void signal_fault(struct task_struct *tsk, struct pt_regs *regs, + const char *where, void __user *ptr) +{ + if (show_unhandled_signals) + printk_ratelimited(regs->msr & MSR_64BIT ? fm64 : fm32, tsk->comm, + task_pid_nr(tsk), where, ptr, regs->nip, regs->link); } diff --git a/arch/powerpc/kernel/signal.h b/arch/powerpc/kernel/signal.h index 800433685888..58ecea1cdc27 100644 --- a/arch/powerpc/kernel/signal.h +++ b/arch/powerpc/kernel/signal.h @@ -1,19 +1,14 @@ -/* +/* SPDX-License-Identifier: GPL-2.0-or-later + * * Copyright (c) 2007 Benjamin Herrenschmidt, IBM Corporation * Extracted from signal_32.c and signal_64.c - * - * This file is subject to the terms and conditions of the GNU General - * Public License. See the file README.legal in the main directory of - * this archive for more details. */ #ifndef _POWERPC_ARCH_SIGNAL_H #define _POWERPC_ARCH_SIGNAL_H -extern void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags); - -extern void __user *get_sigframe(struct ksignal *ksig, unsigned long sp, - size_t frame_size, int is_32); +void __user *get_sigframe(struct ksignal *ksig, struct task_struct *tsk, + size_t frame_size, int is_32); extern int handle_signal32(struct ksignal *ksig, sigset_t *oldset, struct task_struct *tsk); @@ -21,15 +16,20 @@ extern int handle_signal32(struct ksignal *ksig, sigset_t *oldset, extern int handle_rt_signal32(struct ksignal *ksig, sigset_t *oldset, struct task_struct *tsk); -extern unsigned long copy_fpr_to_user(void __user *to, - struct task_struct *task); -extern unsigned long copy_ckfpr_to_user(void __user *to, - struct task_struct *task); -extern unsigned long copy_fpr_from_user(struct task_struct *task, - void __user *from); -extern unsigned long copy_ckfpr_from_user(struct task_struct *task, - void __user *from); -extern unsigned long get_tm_stackpointer(struct task_struct *tsk); +static inline int __get_user_sigset(sigset_t *dst, const sigset_t __user *src) +{ + BUILD_BUG_ON(sizeof(sigset_t) != sizeof(u64)); + + return __get_user(dst->sig[0], (u64 __user *)&src->sig[0]); +} +#define unsafe_get_user_sigset(dst, src, label) do { \ + sigset_t *__dst = dst; \ + const sigset_t __user *__src = src; \ + int i; \ + \ + for (i = 0; i < _NSIG_WORDS; i++) \ + unsafe_get_user(__dst->sig[i], &__src->sig[i], label); \ +} while (0) #ifdef CONFIG_VSX extern unsigned long copy_vsx_to_user(void __user *to, @@ -40,6 +40,150 @@ extern unsigned long copy_vsx_from_user(struct task_struct *task, void __user *from); extern unsigned long copy_ckvsx_from_user(struct task_struct *task, void __user *from); +unsigned long copy_fpr_to_user(void __user *to, struct task_struct *task); +unsigned long copy_ckfpr_to_user(void __user *to, struct task_struct *task); +unsigned long copy_fpr_from_user(struct task_struct *task, void __user *from); +unsigned long copy_ckfpr_from_user(struct task_struct *task, void __user *from); + +#define unsafe_copy_fpr_to_user(to, task, label) do { \ + struct task_struct *__t = task; \ + u64 __user *buf = (u64 __user *)to; \ + int i; \ + \ + for (i = 0; i < ELF_NFPREG - 1 ; i++) \ + unsafe_put_user(__t->thread.TS_FPR(i), &buf[i], label); \ + unsafe_put_user(__t->thread.fp_state.fpscr, &buf[i], label); \ +} while (0) + +#define unsafe_copy_vsx_to_user(to, task, label) do { \ + struct task_struct *__t = task; \ + u64 __user *buf = (u64 __user *)to; \ + int i; \ + \ + for (i = 0; i < ELF_NVSRHALFREG ; i++) \ + unsafe_put_user(__t->thread.fp_state.fpr[i][TS_VSRLOWOFFSET], \ + &buf[i], label);\ +} while (0) + +#define unsafe_copy_fpr_from_user(task, from, label) do { \ + struct task_struct *__t = task; \ + u64 __user *buf = (u64 __user *)from; \ + int i; \ + \ + for (i = 0; i < ELF_NFPREG - 1; i++) \ + unsafe_get_user(__t->thread.TS_FPR(i), &buf[i], label); \ + unsafe_get_user(__t->thread.fp_state.fpscr, &buf[i], label); \ +} while (0) + +#define unsafe_copy_vsx_from_user(task, from, label) do { \ + struct task_struct *__t = task; \ + u64 __user *buf = (u64 __user *)from; \ + int i; \ + \ + for (i = 0; i < ELF_NVSRHALFREG ; i++) \ + unsafe_get_user(__t->thread.fp_state.fpr[i][TS_VSRLOWOFFSET], \ + &buf[i], label); \ +} while (0) + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +#define unsafe_copy_ckfpr_to_user(to, task, label) do { \ + struct task_struct *__t = task; \ + u64 __user *buf = (u64 __user *)to; \ + int i; \ + \ + for (i = 0; i < ELF_NFPREG - 1 ; i++) \ + unsafe_put_user(__t->thread.TS_CKFPR(i), &buf[i], label);\ + unsafe_put_user(__t->thread.ckfp_state.fpscr, &buf[i], label); \ +} while (0) + +#define unsafe_copy_ckvsx_to_user(to, task, label) do { \ + struct task_struct *__t = task; \ + u64 __user *buf = (u64 __user *)to; \ + int i; \ + \ + for (i = 0; i < ELF_NVSRHALFREG ; i++) \ + unsafe_put_user(__t->thread.ckfp_state.fpr[i][TS_VSRLOWOFFSET], \ + &buf[i], label);\ +} while (0) + +#define unsafe_copy_ckfpr_from_user(task, from, label) do { \ + struct task_struct *__t = task; \ + u64 __user *buf = (u64 __user *)from; \ + int i; \ + \ + for (i = 0; i < ELF_NFPREG - 1 ; i++) \ + unsafe_get_user(__t->thread.TS_CKFPR(i), &buf[i], label);\ + unsafe_get_user(__t->thread.ckfp_state.fpscr, &buf[i], failed); \ +} while (0) + +#define unsafe_copy_ckvsx_from_user(task, from, label) do { \ + struct task_struct *__t = task; \ + u64 __user *buf = (u64 __user *)from; \ + int i; \ + \ + for (i = 0; i < ELF_NVSRHALFREG ; i++) \ + unsafe_get_user(__t->thread.ckfp_state.fpr[i][TS_VSRLOWOFFSET], \ + &buf[i], label); \ +} while (0) +#endif +#elif defined(CONFIG_PPC_FPU_REGS) + +#define unsafe_copy_fpr_to_user(to, task, label) \ + unsafe_copy_to_user(to, (task)->thread.fp_state.fpr, \ + ELF_NFPREG * sizeof(double), label) + +#define unsafe_copy_fpr_from_user(task, from, label) \ + unsafe_copy_from_user((task)->thread.fp_state.fpr, from, \ + ELF_NFPREG * sizeof(double), label) + +static inline unsigned long +copy_fpr_to_user(void __user *to, struct task_struct *task) +{ + return __copy_to_user(to, task->thread.fp_state.fpr, + ELF_NFPREG * sizeof(double)); +} + +static inline unsigned long +copy_fpr_from_user(struct task_struct *task, void __user *from) +{ + return __copy_from_user(task->thread.fp_state.fpr, from, + ELF_NFPREG * sizeof(double)); +} + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +#define unsafe_copy_ckfpr_to_user(to, task, label) \ + unsafe_copy_to_user(to, (task)->thread.ckfp_state.fpr, \ + ELF_NFPREG * sizeof(double), label) + +inline unsigned long copy_ckfpr_to_user(void __user *to, struct task_struct *task) +{ + return __copy_to_user(to, task->thread.ckfp_state.fpr, + ELF_NFPREG * sizeof(double)); +} + +static inline unsigned long +copy_ckfpr_from_user(struct task_struct *task, void __user *from) +{ + return __copy_from_user(task->thread.ckfp_state.fpr, from, + ELF_NFPREG * sizeof(double)); +} +#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ +#else +#define unsafe_copy_fpr_to_user(to, task, label) do { if (0) goto label;} while (0) + +#define unsafe_copy_fpr_from_user(task, from, label) do { if (0) goto label;} while (0) + +static inline unsigned long +copy_fpr_to_user(void __user *to, struct task_struct *task) +{ + return 0; +} + +static inline unsigned long +copy_fpr_from_user(struct task_struct *task, void __user *from) +{ + return 0; +} #endif #ifdef CONFIG_PPC64 @@ -49,9 +193,6 @@ extern int handle_rt_signal64(struct ksignal *ksig, sigset_t *set, #else /* CONFIG_PPC64 */ -extern long sys_rt_sigreturn(void); -extern long sys_sigreturn(void); - static inline int handle_rt_signal64(struct ksignal *ksig, sigset_t *set, struct task_struct *tsk) { @@ -60,4 +201,7 @@ static inline int handle_rt_signal64(struct ksignal *ksig, sigset_t *set, #endif /* !defined(CONFIG_PPC64) */ +void signal_fault(struct task_struct *tsk, struct pt_regs *regs, + const char *where, void __user *ptr); + #endif /* _POWERPC_ARCH_SIGNAL_H */ diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c index 98600b276f76..7a718ed32b27 100644 --- a/arch/powerpc/kernel/signal_32.c +++ b/arch/powerpc/kernel/signal_32.c @@ -43,11 +43,10 @@ #include <asm/tm.h> #include <asm/asm-prototypes.h> #ifdef CONFIG_PPC64 -#include "ppc32.h" +#include <asm/syscalls_32.h> #include <asm/unistd.h> #else #include <asm/ucontext.h> -#include <asm/pgtable.h> #endif #include "signal.h" @@ -59,8 +58,6 @@ #define mcontext mcontext32 #define ucontext ucontext32 -#define __save_altstack __compat_save_altstack - /* * Userspace code may pass a ucontext which doesn't include VSX added * at the end. We need to check for this case. @@ -85,47 +82,35 @@ * Functions for flipping sigsets (thanks to brain dead generic * implementation that makes things simple for little endian only) */ -static inline int put_sigset_t(compat_sigset_t __user *uset, sigset_t *set) -{ - return put_compat_sigset(uset, set, sizeof(*uset)); -} - -static inline int get_sigset_t(sigset_t *set, - const compat_sigset_t __user *uset) -{ - return get_compat_sigset(set, uset); -} +#define unsafe_put_sigset_t unsafe_put_compat_sigset +#define unsafe_get_sigset_t unsafe_get_compat_sigset #define to_user_ptr(p) ptr_to_compat(p) #define from_user_ptr(p) compat_ptr(p) -static inline int save_general_regs(struct pt_regs *regs, - struct mcontext __user *frame) +static __always_inline int +__unsafe_save_general_regs(struct pt_regs *regs, struct mcontext __user *frame) { elf_greg_t64 *gregs = (elf_greg_t64 *)regs; - int i; - /* Force usr to alway see softe as 1 (interrupts enabled) */ - elf_greg_t64 softe = 0x1; - - WARN_ON(!FULL_REGS(regs)); + int val, i; for (i = 0; i <= PT_RESULT; i ++) { - if (i == 14 && !FULL_REGS(regs)) - i = 32; - if ( i == PT_SOFTE) { - if(__put_user((unsigned int)softe, &frame->mc_gregs[i])) - return -EFAULT; - else - continue; - } - if (__put_user((unsigned int)gregs[i], &frame->mc_gregs[i])) - return -EFAULT; + /* Force usr to alway see softe as 1 (interrupts enabled) */ + if (i == PT_SOFTE) + val = 1; + else + val = gregs[i]; + + unsafe_put_user(val, &frame->mc_gregs[i], failed); } return 0; + +failed: + return 1; } -static inline int restore_general_regs(struct pt_regs *regs, - struct mcontext __user *sr) +static __always_inline int +__unsafe_restore_general_regs(struct pt_regs *regs, struct mcontext __user *sr) { elf_greg_t64 *gregs = (elf_greg_t64 *)regs; int i; @@ -133,51 +118,67 @@ static inline int restore_general_regs(struct pt_regs *regs, for (i = 0; i <= PT_RESULT; i++) { if ((i == PT_MSR) || (i == PT_SOFTE)) continue; - if (__get_user(gregs[i], &sr->mc_gregs[i])) - return -EFAULT; + unsafe_get_user(gregs[i], &sr->mc_gregs[i], failed); } return 0; + +failed: + return 1; } #else /* CONFIG_PPC64 */ #define GP_REGS_SIZE min(sizeof(elf_gregset_t), sizeof(struct pt_regs)) -static inline int put_sigset_t(sigset_t __user *uset, sigset_t *set) -{ - return copy_to_user(uset, set, sizeof(*uset)); -} +#define unsafe_put_sigset_t(uset, set, label) do { \ + sigset_t __user *__us = uset ; \ + const sigset_t *__s = set; \ + \ + unsafe_copy_to_user(__us, __s, sizeof(*__us), label); \ +} while (0) -static inline int get_sigset_t(sigset_t *set, const sigset_t __user *uset) -{ - return copy_from_user(set, uset, sizeof(*uset)); -} +#define unsafe_get_sigset_t unsafe_get_user_sigset #define to_user_ptr(p) ((unsigned long)(p)) #define from_user_ptr(p) ((void __user *)(p)) -static inline int save_general_regs(struct pt_regs *regs, - struct mcontext __user *frame) +static __always_inline int +__unsafe_save_general_regs(struct pt_regs *regs, struct mcontext __user *frame) { - WARN_ON(!FULL_REGS(regs)); - return __copy_to_user(&frame->mc_gregs, regs, GP_REGS_SIZE); + unsafe_copy_to_user(&frame->mc_gregs, regs, GP_REGS_SIZE, failed); + return 0; + +failed: + return 1; } -static inline int restore_general_regs(struct pt_regs *regs, - struct mcontext __user *sr) +static __always_inline +int __unsafe_restore_general_regs(struct pt_regs *regs, struct mcontext __user *sr) { /* copy up to but not including MSR */ - if (__copy_from_user(regs, &sr->mc_gregs, - PT_MSR * sizeof(elf_greg_t))) - return -EFAULT; + unsafe_copy_from_user(regs, &sr->mc_gregs, PT_MSR * sizeof(elf_greg_t), failed); + /* copy from orig_r3 (the word after the MSR) up to the end */ - if (__copy_from_user(®s->orig_gpr3, &sr->mc_gregs[PT_ORIG_R3], - GP_REGS_SIZE - PT_ORIG_R3 * sizeof(elf_greg_t))) - return -EFAULT; + unsafe_copy_from_user(®s->orig_gpr3, &sr->mc_gregs[PT_ORIG_R3], + GP_REGS_SIZE - PT_ORIG_R3 * sizeof(elf_greg_t), failed); + return 0; + +failed: + return 1; } #endif +#define unsafe_save_general_regs(regs, frame, label) do { \ + if (__unsafe_save_general_regs(regs, frame)) \ + goto label; \ +} while (0) + +#define unsafe_restore_general_regs(regs, frame, label) do { \ + if (__unsafe_restore_general_regs(regs, frame)) \ + goto label; \ +} while (0) + /* * When we have signals to deliver, we set up on the * user stack, going down from the original stack pointer: @@ -204,9 +205,6 @@ struct sigframe { int abigap[56]; }; -/* We use the mc_pad field for the signal return trampoline. */ -#define tramp mc_pad - /* * When we have rt signals to deliver, we set up on the * user stack, going down from the original stack pointer: @@ -235,171 +233,51 @@ struct rt_sigframe { int abigap[56]; }; -#ifdef CONFIG_VSX -unsigned long copy_fpr_to_user(void __user *to, - struct task_struct *task) +unsigned long get_min_sigframe_size_32(void) { - u64 buf[ELF_NFPREG]; - int i; - - /* save FPR copy to local buffer then write to the thread_struct */ - for (i = 0; i < (ELF_NFPREG - 1) ; i++) - buf[i] = task->thread.TS_FPR(i); - buf[i] = task->thread.fp_state.fpscr; - return __copy_to_user(to, buf, ELF_NFPREG * sizeof(double)); + return max(sizeof(struct rt_sigframe) + __SIGNAL_FRAMESIZE + 16, + sizeof(struct sigframe) + __SIGNAL_FRAMESIZE); } -unsigned long copy_fpr_from_user(struct task_struct *task, - void __user *from) -{ - u64 buf[ELF_NFPREG]; - int i; - - if (__copy_from_user(buf, from, ELF_NFPREG * sizeof(double))) - return 1; - for (i = 0; i < (ELF_NFPREG - 1) ; i++) - task->thread.TS_FPR(i) = buf[i]; - task->thread.fp_state.fpscr = buf[i]; - - return 0; -} - -unsigned long copy_vsx_to_user(void __user *to, - struct task_struct *task) -{ - u64 buf[ELF_NVSRHALFREG]; - int i; - - /* save FPR copy to local buffer then write to the thread_struct */ - for (i = 0; i < ELF_NVSRHALFREG; i++) - buf[i] = task->thread.fp_state.fpr[i][TS_VSRLOWOFFSET]; - return __copy_to_user(to, buf, ELF_NVSRHALFREG * sizeof(double)); -} - -unsigned long copy_vsx_from_user(struct task_struct *task, - void __user *from) -{ - u64 buf[ELF_NVSRHALFREG]; - int i; - - if (__copy_from_user(buf, from, ELF_NVSRHALFREG * sizeof(double))) - return 1; - for (i = 0; i < ELF_NVSRHALFREG ; i++) - task->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = buf[i]; - return 0; -} - -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM -unsigned long copy_ckfpr_to_user(void __user *to, - struct task_struct *task) -{ - u64 buf[ELF_NFPREG]; - int i; - - /* save FPR copy to local buffer then write to the thread_struct */ - for (i = 0; i < (ELF_NFPREG - 1) ; i++) - buf[i] = task->thread.TS_CKFPR(i); - buf[i] = task->thread.ckfp_state.fpscr; - return __copy_to_user(to, buf, ELF_NFPREG * sizeof(double)); -} - -unsigned long copy_ckfpr_from_user(struct task_struct *task, - void __user *from) -{ - u64 buf[ELF_NFPREG]; - int i; - - if (__copy_from_user(buf, from, ELF_NFPREG * sizeof(double))) - return 1; - for (i = 0; i < (ELF_NFPREG - 1) ; i++) - task->thread.TS_CKFPR(i) = buf[i]; - task->thread.ckfp_state.fpscr = buf[i]; - - return 0; -} - -unsigned long copy_ckvsx_to_user(void __user *to, - struct task_struct *task) -{ - u64 buf[ELF_NVSRHALFREG]; - int i; - - /* save FPR copy to local buffer then write to the thread_struct */ - for (i = 0; i < ELF_NVSRHALFREG; i++) - buf[i] = task->thread.ckfp_state.fpr[i][TS_VSRLOWOFFSET]; - return __copy_to_user(to, buf, ELF_NVSRHALFREG * sizeof(double)); -} - -unsigned long copy_ckvsx_from_user(struct task_struct *task, - void __user *from) -{ - u64 buf[ELF_NVSRHALFREG]; - int i; - - if (__copy_from_user(buf, from, ELF_NVSRHALFREG * sizeof(double))) - return 1; - for (i = 0; i < ELF_NVSRHALFREG ; i++) - task->thread.ckfp_state.fpr[i][TS_VSRLOWOFFSET] = buf[i]; - return 0; -} -#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ -#else -inline unsigned long copy_fpr_to_user(void __user *to, - struct task_struct *task) -{ - return __copy_to_user(to, task->thread.fp_state.fpr, - ELF_NFPREG * sizeof(double)); -} - -inline unsigned long copy_fpr_from_user(struct task_struct *task, - void __user *from) -{ - return __copy_from_user(task->thread.fp_state.fpr, from, - ELF_NFPREG * sizeof(double)); -} - -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM -inline unsigned long copy_ckfpr_to_user(void __user *to, - struct task_struct *task) -{ - return __copy_to_user(to, task->thread.ckfp_state.fpr, - ELF_NFPREG * sizeof(double)); -} - -inline unsigned long copy_ckfpr_from_user(struct task_struct *task, - void __user *from) -{ - return __copy_from_user(task->thread.ckfp_state.fpr, from, - ELF_NFPREG * sizeof(double)); -} -#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ -#endif - /* * Save the current user registers on the user stack. * We only save the altivec/spe registers if the process has used * altivec/spe instructions at some point. */ -static int save_user_regs(struct pt_regs *regs, struct mcontext __user *frame, - struct mcontext __user *tm_frame, int sigret, - int ctx_has_vsx_region) +static void prepare_save_user_regs(int ctx_has_vsx_region) { - unsigned long msr = regs->msr; - /* Make sure floating point registers are stored in regs */ flush_fp_to_thread(current); +#ifdef CONFIG_ALTIVEC + if (current->thread.used_vr) + flush_altivec_to_thread(current); + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + current->thread.vrsave = mfspr(SPRN_VRSAVE); +#endif +#ifdef CONFIG_VSX + if (current->thread.used_vsr && ctx_has_vsx_region) + flush_vsx_to_thread(current); +#endif +#ifdef CONFIG_SPE + if (current->thread.used_spe) + flush_spe_to_thread(current); +#endif +} + +static __always_inline int +__unsafe_save_user_regs(struct pt_regs *regs, struct mcontext __user *frame, + struct mcontext __user *tm_frame, int ctx_has_vsx_region) +{ + unsigned long msr = regs->msr; /* save general registers */ - if (save_general_regs(regs, frame)) - return 1; + unsafe_save_general_regs(regs, frame, failed); #ifdef CONFIG_ALTIVEC /* save altivec registers */ if (current->thread.used_vr) { - flush_altivec_to_thread(current); - if (__copy_to_user(&frame->mc_vregs, ¤t->thread.vr_state, - ELF_NVRREG * sizeof(vector128))) - return 1; + unsafe_copy_to_user(&frame->mc_vregs, ¤t->thread.vr_state, + ELF_NVRREG * sizeof(vector128), failed); /* set MSR_VEC in the saved MSR value to indicate that frame->mc_vregs contains valid data */ msr |= MSR_VEC; @@ -412,13 +290,10 @@ static int save_user_regs(struct pt_regs *regs, struct mcontext __user *frame, * most significant bits of that same vector. --BenH * Note that the current VRSAVE value is in the SPR at this point. */ - if (cpu_has_feature(CPU_FTR_ALTIVEC)) - current->thread.vrsave = mfspr(SPRN_VRSAVE); - if (__put_user(current->thread.vrsave, (u32 __user *)&frame->mc_vregs[32])) - return 1; + unsafe_put_user(current->thread.vrsave, (u32 __user *)&frame->mc_vregs[32], + failed); #endif /* CONFIG_ALTIVEC */ - if (copy_fpr_to_user(&frame->mc_fregs, current)) - return 1; + unsafe_copy_fpr_to_user(&frame->mc_fregs, current, failed); /* * Clear the MSR VSX bit to indicate there is no valid state attached @@ -433,19 +308,15 @@ static int save_user_regs(struct pt_regs *regs, struct mcontext __user *frame, * contains valid data */ if (current->thread.used_vsr && ctx_has_vsx_region) { - flush_vsx_to_thread(current); - if (copy_vsx_to_user(&frame->mc_vsregs, current)) - return 1; + unsafe_copy_vsx_to_user(&frame->mc_vsregs, current, failed); msr |= MSR_VSX; } #endif /* CONFIG_VSX */ #ifdef CONFIG_SPE /* save spe registers */ if (current->thread.used_spe) { - flush_spe_to_thread(current); - if (__copy_to_user(&frame->mc_vregs, current->thread.evr, - ELF_NEVRREG * sizeof(u32))) - return 1; + unsafe_copy_to_user(&frame->mc_vregs, current->thread.evr, + ELF_NEVRREG * sizeof(u32), failed); /* set MSR_SPE in the saved MSR value to indicate that frame->mc_vregs contains valid data */ msr |= MSR_SPE; @@ -453,30 +324,29 @@ static int save_user_regs(struct pt_regs *regs, struct mcontext __user *frame, /* else assert((regs->msr & MSR_SPE) == 0) */ /* We always copy to/from spefscr */ - if (__put_user(current->thread.spefscr, (u32 __user *)&frame->mc_vregs + ELF_NEVRREG)) - return 1; + unsafe_put_user(current->thread.spefscr, + (u32 __user *)&frame->mc_vregs + ELF_NEVRREG, failed); #endif /* CONFIG_SPE */ - if (__put_user(msr, &frame->mc_gregs[PT_MSR])) - return 1; + unsafe_put_user(msr, &frame->mc_gregs[PT_MSR], failed); + /* We need to write 0 the MSR top 32 bits in the tm frame so that we * can check it on the restore to see if TM is active */ - if (tm_frame && __put_user(0, &tm_frame->mc_gregs[PT_MSR])) - return 1; - - if (sigret) { - /* Set up the sigreturn trampoline: li 0,sigret; sc */ - if (__put_user(PPC_INST_ADDI + sigret, &frame->tramp[0]) - || __put_user(PPC_INST_SC, &frame->tramp[1])) - return 1; - flush_icache_range((unsigned long) &frame->tramp[0], - (unsigned long) &frame->tramp[2]); - } + if (tm_frame) + unsafe_put_user(0, &tm_frame->mc_gregs[PT_MSR], failed); return 0; + +failed: + return 1; } +#define unsafe_save_user_regs(regs, frame, tm_frame, has_vsx, label) do { \ + if (__unsafe_save_user_regs(regs, frame, tm_frame, has_vsx)) \ + goto label; \ +} while (0) + #ifdef CONFIG_PPC_TRANSACTIONAL_MEM /* * Save the current user registers on the user stack. @@ -485,27 +355,23 @@ static int save_user_regs(struct pt_regs *regs, struct mcontext __user *frame, * We also save the transactional registers to a second ucontext in the * frame. * - * See save_user_regs() and signal_64.c:setup_tm_sigcontexts(). + * See __unsafe_save_user_regs() and signal_64.c:setup_tm_sigcontexts(). */ -static int save_tm_user_regs(struct pt_regs *regs, - struct mcontext __user *frame, - struct mcontext __user *tm_frame, int sigret) +static void prepare_save_tm_user_regs(void) { - unsigned long msr = regs->msr; - WARN_ON(tm_suspend_disabled); - /* Remove TM bits from thread's MSR. The MSR in the sigcontext - * just indicates to userland that we were doing a transaction, but we - * don't want to return in transactional state. This also ensures - * that flush_fp_to_thread won't set TIF_RESTORE_TM again. - */ - regs->msr &= ~MSR_TS_MASK; + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + current->thread.ckvrsave = mfspr(SPRN_VRSAVE); +} +static __always_inline int +save_tm_user_regs_unsafe(struct pt_regs *regs, struct mcontext __user *frame, + struct mcontext __user *tm_frame, unsigned long msr) +{ /* Save both sets of general registers */ - if (save_general_regs(¤t->thread.ckpt_regs, frame) - || save_general_regs(regs, tm_frame)) - return 1; + unsafe_save_general_regs(¤t->thread.ckpt_regs, frame, failed); + unsafe_save_general_regs(regs, tm_frame, failed); /* Stash the top half of the 64bit MSR into the 32bit MSR word * of the transactional mcontext. This way we have a backward-compatible @@ -513,26 +379,20 @@ static int save_tm_user_regs(struct pt_regs *regs, * also look at what type of transaction (T or S) was active at the * time of the signal. */ - if (__put_user((msr >> 32), &tm_frame->mc_gregs[PT_MSR])) - return 1; + unsafe_put_user((msr >> 32), &tm_frame->mc_gregs[PT_MSR], failed); -#ifdef CONFIG_ALTIVEC /* save altivec registers */ if (current->thread.used_vr) { - if (__copy_to_user(&frame->mc_vregs, ¤t->thread.ckvr_state, - ELF_NVRREG * sizeof(vector128))) - return 1; - if (msr & MSR_VEC) { - if (__copy_to_user(&tm_frame->mc_vregs, - ¤t->thread.vr_state, - ELF_NVRREG * sizeof(vector128))) - return 1; - } else { - if (__copy_to_user(&tm_frame->mc_vregs, - ¤t->thread.ckvr_state, - ELF_NVRREG * sizeof(vector128))) - return 1; - } + unsafe_copy_to_user(&frame->mc_vregs, ¤t->thread.ckvr_state, + ELF_NVRREG * sizeof(vector128), failed); + if (msr & MSR_VEC) + unsafe_copy_to_user(&tm_frame->mc_vregs, + ¤t->thread.vr_state, + ELF_NVRREG * sizeof(vector128), failed); + else + unsafe_copy_to_user(&tm_frame->mc_vregs, + ¤t->thread.ckvr_state, + ELF_NVRREG * sizeof(vector128), failed); /* set MSR_VEC in the saved MSR value to indicate that * frame->mc_vregs contains valid data @@ -545,33 +405,21 @@ static int save_tm_user_regs(struct pt_regs *regs, * significant bits of a vector, we "cheat" and stuff VRSAVE in the * most significant bits of that same vector. --BenH */ - if (cpu_has_feature(CPU_FTR_ALTIVEC)) - current->thread.ckvrsave = mfspr(SPRN_VRSAVE); - if (__put_user(current->thread.ckvrsave, - (u32 __user *)&frame->mc_vregs[32])) - return 1; - if (msr & MSR_VEC) { - if (__put_user(current->thread.vrsave, - (u32 __user *)&tm_frame->mc_vregs[32])) - return 1; - } else { - if (__put_user(current->thread.ckvrsave, - (u32 __user *)&tm_frame->mc_vregs[32])) - return 1; - } -#endif /* CONFIG_ALTIVEC */ + unsafe_put_user(current->thread.ckvrsave, + (u32 __user *)&frame->mc_vregs[32], failed); + if (msr & MSR_VEC) + unsafe_put_user(current->thread.vrsave, + (u32 __user *)&tm_frame->mc_vregs[32], failed); + else + unsafe_put_user(current->thread.ckvrsave, + (u32 __user *)&tm_frame->mc_vregs[32], failed); - if (copy_ckfpr_to_user(&frame->mc_fregs, current)) - return 1; - if (msr & MSR_FP) { - if (copy_fpr_to_user(&tm_frame->mc_fregs, current)) - return 1; - } else { - if (copy_ckfpr_to_user(&tm_frame->mc_fregs, current)) - return 1; - } + unsafe_copy_ckfpr_to_user(&frame->mc_fregs, current, failed); + if (msr & MSR_FP) + unsafe_copy_fpr_to_user(&tm_frame->mc_fregs, current, failed); + else + unsafe_copy_ckfpr_to_user(&tm_frame->mc_fregs, current, failed); -#ifdef CONFIG_VSX /* * Copy VSR 0-31 upper half from thread_struct to local * buffer, then write that to userspace. Also set MSR_VSX in @@ -579,54 +427,38 @@ static int save_tm_user_regs(struct pt_regs *regs, * contains valid data */ if (current->thread.used_vsr) { - if (copy_ckvsx_to_user(&frame->mc_vsregs, current)) - return 1; - if (msr & MSR_VSX) { - if (copy_vsx_to_user(&tm_frame->mc_vsregs, - current)) - return 1; - } else { - if (copy_ckvsx_to_user(&tm_frame->mc_vsregs, current)) - return 1; - } + unsafe_copy_ckvsx_to_user(&frame->mc_vsregs, current, failed); + if (msr & MSR_VSX) + unsafe_copy_vsx_to_user(&tm_frame->mc_vsregs, current, failed); + else + unsafe_copy_ckvsx_to_user(&tm_frame->mc_vsregs, current, failed); msr |= MSR_VSX; } -#endif /* CONFIG_VSX */ -#ifdef CONFIG_SPE - /* SPE regs are not checkpointed with TM, so this section is - * simply the same as in save_user_regs(). - */ - if (current->thread.used_spe) { - flush_spe_to_thread(current); - if (__copy_to_user(&frame->mc_vregs, current->thread.evr, - ELF_NEVRREG * sizeof(u32))) - return 1; - /* set MSR_SPE in the saved MSR value to indicate that - * frame->mc_vregs contains valid data */ - msr |= MSR_SPE; - } - /* We always copy to/from spefscr */ - if (__put_user(current->thread.spefscr, (u32 __user *)&frame->mc_vregs + ELF_NEVRREG)) - return 1; -#endif /* CONFIG_SPE */ + unsafe_put_user(msr, &frame->mc_gregs[PT_MSR], failed); - if (__put_user(msr, &frame->mc_gregs[PT_MSR])) - return 1; - if (sigret) { - /* Set up the sigreturn trampoline: li 0,sigret; sc */ - if (__put_user(PPC_INST_ADDI + sigret, &frame->tramp[0]) - || __put_user(PPC_INST_SC, &frame->tramp[1])) - return 1; - flush_icache_range((unsigned long) &frame->tramp[0], - (unsigned long) &frame->tramp[2]); - } + return 0; + +failed: + return 1; +} +#else +static void prepare_save_tm_user_regs(void) { } +static __always_inline int +save_tm_user_regs_unsafe(struct pt_regs *regs, struct mcontext __user *frame, + struct mcontext __user *tm_frame, unsigned long msr) +{ return 0; } #endif +#define unsafe_save_tm_user_regs(regs, frame, tm_frame, msr, label) do { \ + if (save_tm_user_regs_unsafe(regs, frame, tm_frame, msr)) \ + goto label; \ +} while (0) + /* * Restore the current user register values from the user stack, * (except for MSR). @@ -634,69 +466,64 @@ static int save_tm_user_regs(struct pt_regs *regs, static long restore_user_regs(struct pt_regs *regs, struct mcontext __user *sr, int sig) { - long err; unsigned int save_r2 = 0; unsigned long msr; #ifdef CONFIG_VSX int i; #endif + if (!user_read_access_begin(sr, sizeof(*sr))) + return 1; /* * restore general registers but not including MSR or SOFTE. Also * take care of keeping r2 (TLS) intact if not a signal */ if (!sig) save_r2 = (unsigned int)regs->gpr[2]; - err = restore_general_regs(regs, sr); - regs->trap = 0; - err |= __get_user(msr, &sr->mc_gregs[PT_MSR]); + unsafe_restore_general_regs(regs, sr, failed); + set_trap_norestart(regs); + unsafe_get_user(msr, &sr->mc_gregs[PT_MSR], failed); if (!sig) regs->gpr[2] = (unsigned long) save_r2; - if (err) - return 1; /* if doing signal return, restore the previous little-endian mode */ if (sig) - regs->msr = (regs->msr & ~MSR_LE) | (msr & MSR_LE); + regs_set_return_msr(regs, (regs->msr & ~MSR_LE) | (msr & MSR_LE)); #ifdef CONFIG_ALTIVEC /* * Force the process to reload the altivec registers from * current->thread when it next does altivec instructions */ - regs->msr &= ~MSR_VEC; + regs_set_return_msr(regs, regs->msr & ~MSR_VEC); if (msr & MSR_VEC) { /* restore altivec registers from the stack */ - if (__copy_from_user(¤t->thread.vr_state, &sr->mc_vregs, - sizeof(sr->mc_vregs))) - return 1; + unsafe_copy_from_user(¤t->thread.vr_state, &sr->mc_vregs, + sizeof(sr->mc_vregs), failed); current->thread.used_vr = true; } else if (current->thread.used_vr) memset(¤t->thread.vr_state, 0, ELF_NVRREG * sizeof(vector128)); /* Always get VRSAVE back */ - if (__get_user(current->thread.vrsave, (u32 __user *)&sr->mc_vregs[32])) - return 1; + unsafe_get_user(current->thread.vrsave, (u32 __user *)&sr->mc_vregs[32], failed); if (cpu_has_feature(CPU_FTR_ALTIVEC)) mtspr(SPRN_VRSAVE, current->thread.vrsave); #endif /* CONFIG_ALTIVEC */ - if (copy_fpr_from_user(current, &sr->mc_fregs)) - return 1; + unsafe_copy_fpr_from_user(current, &sr->mc_fregs, failed); #ifdef CONFIG_VSX /* * Force the process to reload the VSX registers from * current->thread when it next does VSX instruction. */ - regs->msr &= ~MSR_VSX; + regs_set_return_msr(regs, regs->msr & ~MSR_VSX); if (msr & MSR_VSX) { /* * Restore altivec registers from the stack to a local * buffer, then write this out to the thread_struct */ - if (copy_vsx_from_user(current, &sr->mc_vsregs)) - return 1; + unsafe_copy_vsx_from_user(current, &sr->mc_vsregs, failed); current->thread.used_vsr = true; } else if (current->thread.used_vsr) for (i = 0; i < 32 ; i++) @@ -706,27 +533,34 @@ static long restore_user_regs(struct pt_regs *regs, * force the process to reload the FP registers from * current->thread when it next does FP instructions */ - regs->msr &= ~(MSR_FP | MSR_FE0 | MSR_FE1); + regs_set_return_msr(regs, regs->msr & ~(MSR_FP | MSR_FE0 | MSR_FE1)); #ifdef CONFIG_SPE - /* force the process to reload the spe registers from - current->thread when it next does spe instructions */ - regs->msr &= ~MSR_SPE; + /* + * Force the process to reload the spe registers from + * current->thread when it next does spe instructions. + * Since this is user ABI, we must enforce the sizing. + */ + BUILD_BUG_ON(sizeof(current->thread.spe) != ELF_NEVRREG * sizeof(u32)); + regs_set_return_msr(regs, regs->msr & ~MSR_SPE); if (msr & MSR_SPE) { /* restore spe registers from the stack */ - if (__copy_from_user(current->thread.evr, &sr->mc_vregs, - ELF_NEVRREG * sizeof(u32))) - return 1; + unsafe_copy_from_user(¤t->thread.spe, &sr->mc_vregs, + sizeof(current->thread.spe), failed); current->thread.used_spe = true; } else if (current->thread.used_spe) - memset(current->thread.evr, 0, ELF_NEVRREG * sizeof(u32)); + memset(¤t->thread.spe, 0, sizeof(current->thread.spe)); /* Always get SPEFSCR back */ - if (__get_user(current->thread.spefscr, (u32 __user *)&sr->mc_vregs + ELF_NEVRREG)) - return 1; + unsafe_get_user(current->thread.spefscr, (u32 __user *)&sr->mc_vregs + ELF_NEVRREG, failed); #endif /* CONFIG_SPE */ + user_read_access_end(); return 0; + +failed: + user_read_access_end(); + return 1; } #ifdef CONFIG_PPC_TRANSACTIONAL_MEM @@ -739,11 +573,8 @@ static long restore_tm_user_regs(struct pt_regs *regs, struct mcontext __user *sr, struct mcontext __user *tm_sr) { - long err; unsigned long msr, msr_hi; -#ifdef CONFIG_VSX int i; -#endif if (tm_suspend_disabled) return 1; @@ -754,28 +585,21 @@ static long restore_tm_user_regs(struct pt_regs *regs, * TFHAR is restored from the checkpointed NIP; TEXASR and TFIAR * were set by the signal delivery. */ - err = restore_general_regs(regs, tm_sr); - err |= restore_general_regs(¤t->thread.ckpt_regs, sr); - - err |= __get_user(current->thread.tm_tfhar, &sr->mc_gregs[PT_NIP]); - - err |= __get_user(msr, &sr->mc_gregs[PT_MSR]); - if (err) + if (!user_read_access_begin(sr, sizeof(*sr))) return 1; + unsafe_restore_general_regs(¤t->thread.ckpt_regs, sr, failed); + unsafe_get_user(current->thread.tm_tfhar, &sr->mc_gregs[PT_NIP], failed); + unsafe_get_user(msr, &sr->mc_gregs[PT_MSR], failed); + /* Restore the previous little-endian mode */ - regs->msr = (regs->msr & ~MSR_LE) | (msr & MSR_LE); + regs_set_return_msr(regs, (regs->msr & ~MSR_LE) | (msr & MSR_LE)); -#ifdef CONFIG_ALTIVEC - regs->msr &= ~MSR_VEC; + regs_set_return_msr(regs, regs->msr & ~MSR_VEC); if (msr & MSR_VEC) { /* restore altivec registers from the stack */ - if (__copy_from_user(¤t->thread.ckvr_state, &sr->mc_vregs, - sizeof(sr->mc_vregs)) || - __copy_from_user(¤t->thread.vr_state, - &tm_sr->mc_vregs, - sizeof(sr->mc_vregs))) - return 1; + unsafe_copy_from_user(¤t->thread.ckvr_state, &sr->mc_vregs, + sizeof(sr->mc_vregs), failed); current->thread.used_vr = true; } else if (current->thread.used_vr) { memset(¤t->thread.vr_state, 0, @@ -785,62 +609,62 @@ static long restore_tm_user_regs(struct pt_regs *regs, } /* Always get VRSAVE back */ - if (__get_user(current->thread.ckvrsave, - (u32 __user *)&sr->mc_vregs[32]) || - __get_user(current->thread.vrsave, - (u32 __user *)&tm_sr->mc_vregs[32])) - return 1; + unsafe_get_user(current->thread.ckvrsave, + (u32 __user *)&sr->mc_vregs[32], failed); if (cpu_has_feature(CPU_FTR_ALTIVEC)) mtspr(SPRN_VRSAVE, current->thread.ckvrsave); -#endif /* CONFIG_ALTIVEC */ - regs->msr &= ~(MSR_FP | MSR_FE0 | MSR_FE1); + regs_set_return_msr(regs, regs->msr & ~(MSR_FP | MSR_FE0 | MSR_FE1)); - if (copy_fpr_from_user(current, &sr->mc_fregs) || - copy_ckfpr_from_user(current, &tm_sr->mc_fregs)) - return 1; + unsafe_copy_fpr_from_user(current, &sr->mc_fregs, failed); -#ifdef CONFIG_VSX - regs->msr &= ~MSR_VSX; + regs_set_return_msr(regs, regs->msr & ~MSR_VSX); if (msr & MSR_VSX) { /* * Restore altivec registers from the stack to a local * buffer, then write this out to the thread_struct */ - if (copy_vsx_from_user(current, &tm_sr->mc_vsregs) || - copy_ckvsx_from_user(current, &sr->mc_vsregs)) - return 1; + unsafe_copy_ckvsx_from_user(current, &sr->mc_vsregs, failed); current->thread.used_vsr = true; } else if (current->thread.used_vsr) for (i = 0; i < 32 ; i++) { current->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = 0; current->thread.ckfp_state.fpr[i][TS_VSRLOWOFFSET] = 0; } -#endif /* CONFIG_VSX */ -#ifdef CONFIG_SPE - /* SPE regs are not checkpointed with TM, so this section is - * simply the same as in restore_user_regs(). - */ - regs->msr &= ~MSR_SPE; - if (msr & MSR_SPE) { - if (__copy_from_user(current->thread.evr, &sr->mc_vregs, - ELF_NEVRREG * sizeof(u32))) - return 1; - current->thread.used_spe = true; - } else if (current->thread.used_spe) - memset(current->thread.evr, 0, ELF_NEVRREG * sizeof(u32)); + user_read_access_end(); - /* Always get SPEFSCR back */ - if (__get_user(current->thread.spefscr, (u32 __user *)&sr->mc_vregs - + ELF_NEVRREG)) + if (!user_read_access_begin(tm_sr, sizeof(*tm_sr))) return 1; -#endif /* CONFIG_SPE */ + + unsafe_restore_general_regs(regs, tm_sr, failed); + + /* restore altivec registers from the stack */ + if (msr & MSR_VEC) + unsafe_copy_from_user(¤t->thread.vr_state, &tm_sr->mc_vregs, + sizeof(sr->mc_vregs), failed); + + /* Always get VRSAVE back */ + unsafe_get_user(current->thread.vrsave, + (u32 __user *)&tm_sr->mc_vregs[32], failed); + + unsafe_copy_ckfpr_from_user(current, &tm_sr->mc_fregs, failed); + + if (msr & MSR_VSX) { + /* + * Restore altivec registers from the stack to a local + * buffer, then write this out to the thread_struct + */ + unsafe_copy_vsx_from_user(current, &tm_sr->mc_vsregs, failed); + current->thread.used_vsr = true; + } /* Get the top half of the MSR from the user context */ - if (__get_user(msr_hi, &tm_sr->mc_gregs[PT_MSR])) - return 1; + unsafe_get_user(msr_hi, &tm_sr->mc_gregs[PT_MSR], failed); msr_hi <<= 32; + + user_read_access_end(); + /* If TM bits are set to the reserved value, it's an invalid context */ if (MSR_TM_RESV(msr_hi)) return 1; @@ -861,7 +685,7 @@ static long restore_tm_user_regs(struct pt_regs *regs, * * Pull in the MSR TM bits from the user context */ - regs->msr = (regs->msr & ~MSR_TS_MASK) | (msr_hi & MSR_TS_MASK); + regs_set_return_msr(regs, (regs->msr & ~MSR_TS_MASK) | (msr_hi & MSR_TS_MASK)); /* Now, recheckpoint. This loads up all of the checkpointed (older) * registers, including FP and V[S]Rs. After recheckpointing, the * transactional versions should be loaded. @@ -876,18 +700,26 @@ static long restore_tm_user_regs(struct pt_regs *regs, msr_check_and_set(msr & (MSR_FP | MSR_VEC)); if (msr & MSR_FP) { load_fp_state(¤t->thread.fp_state); - regs->msr |= (MSR_FP | current->thread.fpexc_mode); + regs_set_return_msr(regs, regs->msr | (MSR_FP | current->thread.fpexc_mode)); } -#ifdef CONFIG_ALTIVEC if (msr & MSR_VEC) { load_vr_state(¤t->thread.vr_state); - regs->msr |= MSR_VEC; + regs_set_return_msr(regs, regs->msr | MSR_VEC); } -#endif preempt_enable(); return 0; + +failed: + user_read_access_end(); + return 1; +} +#else +static long restore_tm_user_regs(struct pt_regs *regs, struct mcontext __user *sr, + struct mcontext __user *tm_sr) +{ + return 0; } #endif @@ -904,92 +736,183 @@ static long restore_tm_user_regs(struct pt_regs *regs, int handle_rt_signal32(struct ksignal *ksig, sigset_t *oldset, struct task_struct *tsk) { - struct rt_sigframe __user *rt_sf; - struct mcontext __user *frame; - struct mcontext __user *tm_frame = NULL; - void __user *addr; + struct rt_sigframe __user *frame; + struct mcontext __user *mctx; + struct mcontext __user *tm_mctx = NULL; unsigned long newsp = 0; - int sigret; unsigned long tramp; struct pt_regs *regs = tsk->thread.regs; - - BUG_ON(tsk != current); + /* Save the thread's msr before get_tm_stackpointer() changes it */ + unsigned long msr = regs->msr; /* Set up Signal Frame */ - /* Put a Real Time Context onto stack */ - rt_sf = get_sigframe(ksig, get_tm_stackpointer(tsk), sizeof(*rt_sf), 1); - addr = rt_sf; - if (unlikely(rt_sf == NULL)) + frame = get_sigframe(ksig, tsk, sizeof(*frame), 1); + mctx = &frame->uc.uc_mcontext; +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + tm_mctx = &frame->uc_transact.uc_mcontext; +#endif + if (MSR_TM_ACTIVE(msr)) + prepare_save_tm_user_regs(); + else + prepare_save_user_regs(1); + + if (!user_access_begin(frame, sizeof(*frame))) goto badframe; /* Put the siginfo & fill in most of the ucontext */ - if (copy_siginfo_to_user(&rt_sf->info, &ksig->info) - || __put_user(0, &rt_sf->uc.uc_flags) - || __save_altstack(&rt_sf->uc.uc_stack, regs->gpr[1]) - || __put_user(to_user_ptr(&rt_sf->uc.uc_mcontext), - &rt_sf->uc.uc_regs) - || put_sigset_t(&rt_sf->uc.uc_sigmask, oldset)) - goto badframe; + unsafe_put_user(0, &frame->uc.uc_flags, failed); +#ifdef CONFIG_PPC64 + unsafe_compat_save_altstack(&frame->uc.uc_stack, regs->gpr[1], failed); +#else + unsafe_save_altstack(&frame->uc.uc_stack, regs->gpr[1], failed); +#endif + unsafe_put_user(to_user_ptr(&frame->uc.uc_mcontext), &frame->uc.uc_regs, failed); + + if (MSR_TM_ACTIVE(msr)) { +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + unsafe_put_user((unsigned long)&frame->uc_transact, + &frame->uc.uc_link, failed); + unsafe_put_user((unsigned long)tm_mctx, + &frame->uc_transact.uc_regs, failed); +#endif + unsafe_save_tm_user_regs(regs, mctx, tm_mctx, msr, failed); + } else { + unsafe_put_user(0, &frame->uc.uc_link, failed); + unsafe_save_user_regs(regs, mctx, tm_mctx, 1, failed); + } /* Save user registers on the stack */ - frame = &rt_sf->uc.uc_mcontext; - addr = frame; - if (vdso32_rt_sigtramp && tsk->mm->context.vdso_base) { - sigret = 0; - tramp = tsk->mm->context.vdso_base + vdso32_rt_sigtramp; + if (tsk->mm->context.vdso) { + tramp = VDSO32_SYMBOL(tsk->mm->context.vdso, sigtramp_rt32); } else { - sigret = __NR_rt_sigreturn; - tramp = (unsigned long) frame->tramp; + tramp = (unsigned long)mctx->mc_pad; + unsafe_put_user(PPC_RAW_LI(_R0, __NR_rt_sigreturn), &mctx->mc_pad[0], failed); + unsafe_put_user(PPC_RAW_SC(), &mctx->mc_pad[1], failed); + asm("dcbst %y0; sync; icbi %y0; sync" :: "Z" (mctx->mc_pad[0])); } + unsafe_put_sigset_t(&frame->uc.uc_sigmask, oldset, failed); + + user_access_end(); + + if (copy_siginfo_to_user(&frame->info, &ksig->info)) + goto badframe; + regs->link = tramp; + +#ifdef CONFIG_PPC_FPU_REGS + tsk->thread.fp_state.fpscr = 0; /* turn off all fp exceptions */ +#endif + + /* create a stack frame for the caller of the handler */ + newsp = ((unsigned long)frame) - (__SIGNAL_FRAMESIZE + 16); + if (put_user(regs->gpr[1], (u32 __user *)newsp)) + goto badframe; + + /* Fill registers for signal handler */ + regs->gpr[1] = newsp; + regs->gpr[3] = ksig->sig; + regs->gpr[4] = (unsigned long)&frame->info; + regs->gpr[5] = (unsigned long)&frame->uc; + regs->gpr[6] = (unsigned long)frame; + regs_set_return_ip(regs, (unsigned long) ksig->ka.sa.sa_handler); + /* enter the signal handler in native-endian mode */ + regs_set_return_msr(regs, (regs->msr & ~MSR_LE) | (MSR_KERNEL & MSR_LE)); + + return 0; + +failed: + user_access_end(); + +badframe: + signal_fault(tsk, regs, "handle_rt_signal32", frame); + + return 1; +} + +/* + * OK, we're invoking a handler + */ +int handle_signal32(struct ksignal *ksig, sigset_t *oldset, + struct task_struct *tsk) +{ + struct sigcontext __user *sc; + struct sigframe __user *frame; + struct mcontext __user *mctx; + struct mcontext __user *tm_mctx = NULL; + unsigned long newsp = 0; + unsigned long tramp; + struct pt_regs *regs = tsk->thread.regs; + /* Save the thread's msr before get_tm_stackpointer() changes it */ + unsigned long msr = regs->msr; + + /* Set up Signal Frame */ + frame = get_sigframe(ksig, tsk, sizeof(*frame), 1); + mctx = &frame->mctx; #ifdef CONFIG_PPC_TRANSACTIONAL_MEM - tm_frame = &rt_sf->uc_transact.uc_mcontext; - if (MSR_TM_ACTIVE(regs->msr)) { - if (__put_user((unsigned long)&rt_sf->uc_transact, - &rt_sf->uc.uc_link) || - __put_user((unsigned long)tm_frame, - &rt_sf->uc_transact.uc_regs)) - goto badframe; - if (save_tm_user_regs(regs, frame, tm_frame, sigret)) - goto badframe; - } + tm_mctx = &frame->mctx_transact; +#endif + if (MSR_TM_ACTIVE(msr)) + prepare_save_tm_user_regs(); else + prepare_save_user_regs(1); + + if (!user_access_begin(frame, sizeof(*frame))) + goto badframe; + sc = (struct sigcontext __user *) &frame->sctx; + +#if _NSIG != 64 +#error "Please adjust handle_signal()" #endif - { - if (__put_user(0, &rt_sf->uc.uc_link)) - goto badframe; - if (save_user_regs(regs, frame, tm_frame, sigret, 1)) - goto badframe; + unsafe_put_user(to_user_ptr(ksig->ka.sa.sa_handler), &sc->handler, failed); + unsafe_put_user(oldset->sig[0], &sc->oldmask, failed); +#ifdef CONFIG_PPC64 + unsafe_put_user((oldset->sig[0] >> 32), &sc->_unused[3], failed); +#else + unsafe_put_user(oldset->sig[1], &sc->_unused[3], failed); +#endif + unsafe_put_user(to_user_ptr(mctx), &sc->regs, failed); + unsafe_put_user(ksig->sig, &sc->signal, failed); + + if (MSR_TM_ACTIVE(msr)) + unsafe_save_tm_user_regs(regs, mctx, tm_mctx, msr, failed); + else + unsafe_save_user_regs(regs, mctx, tm_mctx, 1, failed); + + if (tsk->mm->context.vdso) { + tramp = VDSO32_SYMBOL(tsk->mm->context.vdso, sigtramp32); + } else { + tramp = (unsigned long)mctx->mc_pad; + unsafe_put_user(PPC_RAW_LI(_R0, __NR_sigreturn), &mctx->mc_pad[0], failed); + unsafe_put_user(PPC_RAW_SC(), &mctx->mc_pad[1], failed); + asm("dcbst %y0; sync; icbi %y0; sync" :: "Z" (mctx->mc_pad[0])); } + user_access_end(); + regs->link = tramp; +#ifdef CONFIG_PPC_FPU_REGS tsk->thread.fp_state.fpscr = 0; /* turn off all fp exceptions */ +#endif /* create a stack frame for the caller of the handler */ - newsp = ((unsigned long)rt_sf) - (__SIGNAL_FRAMESIZE + 16); - addr = (void __user *)regs->gpr[1]; + newsp = ((unsigned long)frame) - __SIGNAL_FRAMESIZE; if (put_user(regs->gpr[1], (u32 __user *)newsp)) goto badframe; - /* Fill registers for signal handler */ regs->gpr[1] = newsp; regs->gpr[3] = ksig->sig; - regs->gpr[4] = (unsigned long) &rt_sf->info; - regs->gpr[5] = (unsigned long) &rt_sf->uc; - regs->gpr[6] = (unsigned long) rt_sf; - regs->nip = (unsigned long) ksig->ka.sa.sa_handler; + regs->gpr[4] = (unsigned long) sc; + regs_set_return_ip(regs, (unsigned long) ksig->ka.sa.sa_handler); /* enter the signal handler in native-endian mode */ - regs->msr &= ~MSR_LE; - regs->msr |= (MSR_KERNEL & MSR_LE); + regs_set_return_msr(regs, (regs->msr & ~MSR_LE) | (MSR_KERNEL & MSR_LE)); + return 0; +failed: + user_access_end(); + badframe: - if (show_unhandled_signals) - printk_ratelimited(KERN_INFO - "%s[%d]: bad frame in handle_rt_signal32: " - "%p nip %08lx lr %08lx\n", - tsk->comm, tsk->pid, - addr, regs->nip, regs->link); + signal_fault(tsk, regs, "handle_signal32", frame); return 1; } @@ -999,28 +922,31 @@ static int do_setcontext(struct ucontext __user *ucp, struct pt_regs *regs, int sigset_t set; struct mcontext __user *mcp; - if (get_sigset_t(&set, &ucp->uc_sigmask)) + if (!user_read_access_begin(ucp, sizeof(*ucp))) return -EFAULT; + + unsafe_get_sigset_t(&set, &ucp->uc_sigmask, failed); #ifdef CONFIG_PPC64 { u32 cmcp; - if (__get_user(cmcp, &ucp->uc_regs)) - return -EFAULT; + unsafe_get_user(cmcp, &ucp->uc_regs, failed); mcp = (struct mcontext __user *)(u64)cmcp; - /* no need to check access_ok(mcp), since mcp < 4GB */ } #else - if (__get_user(mcp, &ucp->uc_regs)) - return -EFAULT; - if (!access_ok(mcp, sizeof(*mcp))) - return -EFAULT; + unsafe_get_user(mcp, &ucp->uc_regs, failed); #endif + user_read_access_end(); + set_current_blocked(&set); if (restore_user_regs(regs, mcp, sig)) return -EFAULT; return 0; + +failed: + user_read_access_end(); + return -EFAULT; } #ifdef CONFIG_PPC_TRANSACTIONAL_MEM @@ -1034,11 +960,15 @@ static int do_setcontext_tm(struct ucontext __user *ucp, u32 cmcp; u32 tm_cmcp; - if (get_sigset_t(&set, &ucp->uc_sigmask)) + if (!user_read_access_begin(ucp, sizeof(*ucp))) return -EFAULT; - if (__get_user(cmcp, &ucp->uc_regs) || - __get_user(tm_cmcp, &tm_ucp->uc_regs)) + unsafe_get_sigset_t(&set, &ucp->uc_sigmask, failed); + unsafe_get_user(cmcp, &ucp->uc_regs, failed); + + user_read_access_end(); + + if (__get_user(tm_cmcp, &tm_ucp->uc_regs)) return -EFAULT; mcp = (struct mcontext __user *)(u64)cmcp; tm_mcp = (struct mcontext __user *)(u64)tm_cmcp; @@ -1049,6 +979,10 @@ static int do_setcontext_tm(struct ucontext __user *ucp, return -EFAULT; return 0; + +failed: + user_read_access_end(); + return -EFAULT; } #endif @@ -1116,16 +1050,18 @@ SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx, */ mctx = (struct mcontext __user *) ((unsigned long) &old_ctx->uc_mcontext & ~0xfUL); - if (!access_ok(old_ctx, ctx_size) - || save_user_regs(regs, mctx, NULL, 0, ctx_has_vsx_region) - || put_sigset_t(&old_ctx->uc_sigmask, ¤t->blocked) - || __put_user(to_user_ptr(mctx), &old_ctx->uc_regs)) + prepare_save_user_regs(ctx_has_vsx_region); + if (!user_write_access_begin(old_ctx, ctx_size)) return -EFAULT; + unsafe_save_user_regs(regs, mctx, NULL, ctx_has_vsx_region, failed); + unsafe_put_sigset_t(&old_ctx->uc_sigmask, ¤t->blocked, failed); + unsafe_put_user(to_user_ptr(mctx), &old_ctx->uc_regs, failed); + user_write_access_end(); } if (new_ctx == NULL) return 0; if (!access_ok(new_ctx, ctx_size) || - fault_in_pages_readable((u8 __user *)new_ctx, ctx_size)) + fault_in_readable((char __user *)new_ctx, ctx_size)) return -EFAULT; /* @@ -1139,11 +1075,17 @@ SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx, * or if another thread unmaps the region containing the context. * We kill the task with a SIGSEGV in this situation. */ - if (do_setcontext(new_ctx, regs, 0)) - do_exit(SIGSEGV); + if (do_setcontext(new_ctx, regs, 0)) { + force_exit_sig(SIGSEGV); + return -EFAULT; + } set_thread_flag(TIF_RESTOREALL); return 0; + +failed: + user_write_access_end(); + return -EFAULT; } #ifdef CONFIG_PPC64 @@ -1215,7 +1157,7 @@ SYSCALL_DEFINE0(rt_sigreturn) * set, and recheckpoint was not called. This avoid * hitting a TM Bad thing at RFID */ - regs->msr &= ~MSR_TS_MASK; + regs_set_return_msr(regs, regs->msr & ~MSR_TS_MASK); } /* Fall through, for non-TM restore */ #endif @@ -1241,12 +1183,7 @@ SYSCALL_DEFINE0(rt_sigreturn) return 0; bad: - if (show_unhandled_signals) - printk_ratelimited(KERN_INFO - "%s[%d]: bad frame in sys_rt_sigreturn: " - "%p nip %08lx lr %08lx\n", - current->comm, current->pid, - rt_sf, regs->nip, regs->link); + signal_fault(current, regs, "sys_rt_sigreturn", rt_sf); force_sig(SIGSEGV); return 0; @@ -1309,13 +1246,13 @@ SYSCALL_DEFINE3(debug_setcontext, struct ucontext __user *, ctx, affect the contents of these registers. After this point, failure is a problem, anyway, and it's very unlikely unless the user is really doing something wrong. */ - regs->msr = new_msr; + regs_set_return_msr(regs, new_msr); #ifdef CONFIG_PPC_ADV_DEBUG_REGS current->thread.debug.dbcr0 = new_dbcr0; #endif if (!access_ok(ctx, sizeof(*ctx)) || - fault_in_pages_readable((u8 __user *)ctx, sizeof(*ctx))) + fault_in_readable((char __user *)ctx, sizeof(*ctx))) return -EFAULT; /* @@ -1330,12 +1267,7 @@ SYSCALL_DEFINE3(debug_setcontext, struct ucontext __user *, ctx, * We kill the task with a SIGSEGV in this situation. */ if (do_setcontext(ctx, regs, 1)) { - if (show_unhandled_signals) - printk_ratelimited(KERN_INFO "%s[%d]: bad frame in " - "sys_debug_setcontext: %p nip %08lx " - "lr %08lx\n", - current->comm, current->pid, - ctx, regs->nip, regs->link); + signal_fault(current, regs, "sys_debug_setcontext", ctx); force_sig(SIGSEGV); goto out; @@ -1357,92 +1289,6 @@ SYSCALL_DEFINE3(debug_setcontext, struct ucontext __user *, ctx, #endif /* - * OK, we're invoking a handler - */ -int handle_signal32(struct ksignal *ksig, sigset_t *oldset, - struct task_struct *tsk) -{ - struct sigcontext __user *sc; - struct sigframe __user *frame; - struct mcontext __user *tm_mctx = NULL; - unsigned long newsp = 0; - int sigret; - unsigned long tramp; - struct pt_regs *regs = tsk->thread.regs; - - BUG_ON(tsk != current); - - /* Set up Signal Frame */ - frame = get_sigframe(ksig, get_tm_stackpointer(tsk), sizeof(*frame), 1); - if (unlikely(frame == NULL)) - goto badframe; - sc = (struct sigcontext __user *) &frame->sctx; - -#if _NSIG != 64 -#error "Please adjust handle_signal()" -#endif - if (__put_user(to_user_ptr(ksig->ka.sa.sa_handler), &sc->handler) - || __put_user(oldset->sig[0], &sc->oldmask) -#ifdef CONFIG_PPC64 - || __put_user((oldset->sig[0] >> 32), &sc->_unused[3]) -#else - || __put_user(oldset->sig[1], &sc->_unused[3]) -#endif - || __put_user(to_user_ptr(&frame->mctx), &sc->regs) - || __put_user(ksig->sig, &sc->signal)) - goto badframe; - - if (vdso32_sigtramp && tsk->mm->context.vdso_base) { - sigret = 0; - tramp = tsk->mm->context.vdso_base + vdso32_sigtramp; - } else { - sigret = __NR_sigreturn; - tramp = (unsigned long) frame->mctx.tramp; - } - -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM - tm_mctx = &frame->mctx_transact; - if (MSR_TM_ACTIVE(regs->msr)) { - if (save_tm_user_regs(regs, &frame->mctx, &frame->mctx_transact, - sigret)) - goto badframe; - } - else -#endif - { - if (save_user_regs(regs, &frame->mctx, tm_mctx, sigret, 1)) - goto badframe; - } - - regs->link = tramp; - - tsk->thread.fp_state.fpscr = 0; /* turn off all fp exceptions */ - - /* create a stack frame for the caller of the handler */ - newsp = ((unsigned long)frame) - __SIGNAL_FRAMESIZE; - if (put_user(regs->gpr[1], (u32 __user *)newsp)) - goto badframe; - - regs->gpr[1] = newsp; - regs->gpr[3] = ksig->sig; - regs->gpr[4] = (unsigned long) sc; - regs->nip = (unsigned long) (unsigned long)ksig->ka.sa.sa_handler; - /* enter the signal handler in big-endian mode */ - regs->msr &= ~MSR_LE; - return 0; - -badframe: - if (show_unhandled_signals) - printk_ratelimited(KERN_INFO - "%s[%d]: bad frame in handle_signal32: " - "%p nip %08lx lr %08lx\n", - tsk->comm, tsk->pid, - frame, regs->nip, regs->link); - - return 1; -} - -/* * Do a signal return; undo the signal stack. */ #ifdef CONFIG_PPC64 @@ -1456,19 +1302,16 @@ SYSCALL_DEFINE0(sigreturn) struct sigcontext __user *sc; struct sigcontext sigctx; struct mcontext __user *sr; - void __user *addr; sigset_t set; -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM - struct mcontext __user *mcp, *tm_mcp; - unsigned long msr_hi; -#endif + struct mcontext __user *mcp; + struct mcontext __user *tm_mcp = NULL; + unsigned long long msr_hi = 0; /* Always make any pending restarted system calls return -EINTR */ current->restart_block.fn = do_no_restart_syscall; sf = (struct sigframe __user *)(regs->gpr[1] + __SIGNAL_FRAMESIZE); sc = &sf->sctx; - addr = sc; if (copy_from_user(&sigctx, sc, sizeof(sigctx))) goto badframe; @@ -1484,36 +1327,32 @@ SYSCALL_DEFINE0(sigreturn) #endif set_current_blocked(&set); -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM mcp = (struct mcontext __user *)&sf->mctx; +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM tm_mcp = (struct mcontext __user *)&sf->mctx_transact; if (__get_user(msr_hi, &tm_mcp->mc_gregs[PT_MSR])) goto badframe; +#endif if (MSR_TM_ACTIVE(msr_hi<<32)) { if (!cpu_has_feature(CPU_FTR_TM)) goto badframe; if (restore_tm_user_regs(regs, mcp, tm_mcp)) goto badframe; - } else -#endif - { + } else { sr = (struct mcontext __user *)from_user_ptr(sigctx.regs); - addr = sr; - if (!access_ok(sr, sizeof(*sr)) - || restore_user_regs(regs, sr, 1)) - goto badframe; + if (restore_user_regs(regs, sr, 1)) { + signal_fault(current, regs, "sys_sigreturn", sr); + + force_sig(SIGSEGV); + return 0; + } } set_thread_flag(TIF_RESTOREALL); return 0; badframe: - if (show_unhandled_signals) - printk_ratelimited(KERN_INFO - "%s[%d]: bad frame in sys_sigreturn: " - "%p nip %08lx lr %08lx\n", - current->comm, current->pid, - addr, regs->nip, regs->link); + signal_fault(current, regs, "sys_sigreturn", sc); force_sig(SIGSEGV); return 0; diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c index 117515564ec7..86bb5bb4c143 100644 --- a/arch/powerpc/kernel/signal_64.c +++ b/arch/powerpc/kernel/signal_64.c @@ -21,11 +21,11 @@ #include <linux/ptrace.h> #include <linux/ratelimit.h> #include <linux/syscalls.h> +#include <linux/pagemap.h> #include <asm/sigcontext.h> #include <asm/ucontext.h> #include <linux/uaccess.h> -#include <asm/pgtable.h> #include <asm/unistd.h> #include <asm/cacheflush.h> #include <asm/syscalls.h> @@ -40,8 +40,8 @@ #define GP_REGS_SIZE min(sizeof(elf_gregset_t), sizeof(struct pt_regs)) #define FP_REGS_SIZE sizeof(elf_fpregset_t) -#define TRAMP_TRACEBACK 3 -#define TRAMP_SIZE 6 +#define TRAMP_TRACEBACK 4 +#define TRAMP_SIZE 7 /* * When we have signals to deliver, we set up on the user stack, @@ -66,10 +66,10 @@ struct rt_sigframe { char abigap[USER_REDZONE_SIZE]; } __attribute__ ((aligned (16))); -static const char fmt32[] = KERN_INFO \ - "%s[%d]: bad frame in %s: %08lx nip %08lx lr %08lx\n"; -static const char fmt64[] = KERN_INFO \ - "%s[%d]: bad frame in %s: %016lx nip %016lx lr %016lx\n"; +unsigned long get_min_sigframe_size_64(void) +{ + return sizeof(struct rt_sigframe) + __SIGNAL_FRAMESIZE; +} /* * This computes a quad word aligned pointer inside the vmx_reserve array @@ -84,13 +84,36 @@ static elf_vrreg_t __user *sigcontext_vmx_regs(struct sigcontext __user *sc) } #endif +static void prepare_setup_sigcontext(struct task_struct *tsk) +{ +#ifdef CONFIG_ALTIVEC + /* save altivec registers */ + if (tsk->thread.used_vr) + flush_altivec_to_thread(tsk); + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + tsk->thread.vrsave = mfspr(SPRN_VRSAVE); +#endif /* CONFIG_ALTIVEC */ + + flush_fp_to_thread(tsk); + +#ifdef CONFIG_VSX + if (tsk->thread.used_vsr) + flush_vsx_to_thread(tsk); +#endif /* CONFIG_VSX */ +} + /* * Set up the sigcontext for the signal frame. */ -static long setup_sigcontext(struct sigcontext __user *sc, - struct task_struct *tsk, int signr, sigset_t *set, - unsigned long handler, int ctx_has_vsx_region) +#define unsafe_setup_sigcontext(sc, tsk, signr, set, handler, ctx_has_vsx_region, label)\ +do { \ + if (__unsafe_setup_sigcontext(sc, tsk, signr, set, handler, ctx_has_vsx_region))\ + goto label; \ +} while (0) +static long notrace __unsafe_setup_sigcontext(struct sigcontext __user *sc, + struct task_struct *tsk, int signr, sigset_t *set, + unsigned long handler, int ctx_has_vsx_region) { /* When CONFIG_ALTIVEC is set, we _always_ setup v_regs even if the * process never used altivec yet (MSR_VEC is zero in pt_regs of @@ -102,25 +125,22 @@ static long setup_sigcontext(struct sigcontext __user *sc, */ #ifdef CONFIG_ALTIVEC elf_vrreg_t __user *v_regs = sigcontext_vmx_regs(sc); - unsigned long vrsave; #endif struct pt_regs *regs = tsk->thread.regs; unsigned long msr = regs->msr; - long err = 0; - /* Force usr to alway see softe as 1 (interrupts enabled) */ + /* Force usr to always see softe as 1 (interrupts enabled) */ unsigned long softe = 0x1; BUG_ON(tsk != current); #ifdef CONFIG_ALTIVEC - err |= __put_user(v_regs, &sc->v_regs); + unsafe_put_user(v_regs, &sc->v_regs, efault_out); /* save altivec registers */ if (tsk->thread.used_vr) { - flush_altivec_to_thread(tsk); /* Copy 33 vec registers (vr0..31 and vscr) to the stack */ - err |= __copy_to_user(v_regs, &tsk->thread.vr_state, - 33 * sizeof(vector128)); + unsafe_copy_to_user(v_regs, &tsk->thread.vr_state, + 33 * sizeof(vector128), efault_out); /* set MSR_VEC in the MSR value in the frame to indicate that sc->v_reg) * contains valid data. */ @@ -129,19 +149,12 @@ static long setup_sigcontext(struct sigcontext __user *sc, /* We always copy to/from vrsave, it's 0 if we don't have or don't * use altivec. */ - vrsave = 0; - if (cpu_has_feature(CPU_FTR_ALTIVEC)) { - vrsave = mfspr(SPRN_VRSAVE); - tsk->thread.vrsave = vrsave; - } - - err |= __put_user(vrsave, (u32 __user *)&v_regs[33]); + unsafe_put_user(tsk->thread.vrsave, (u32 __user *)&v_regs[33], efault_out); #else /* CONFIG_ALTIVEC */ - err |= __put_user(0, &sc->v_regs); + unsafe_put_user(0, &sc->v_regs, efault_out); #endif /* CONFIG_ALTIVEC */ - flush_fp_to_thread(tsk); /* copy fpr regs and fpscr */ - err |= copy_fpr_to_user(&sc->fp_regs, tsk); + unsafe_copy_fpr_to_user(&sc->fp_regs, tsk, efault_out); /* * Clear the MSR VSX bit to indicate there is no valid state attached @@ -155,26 +168,27 @@ static long setup_sigcontext(struct sigcontext __user *sc, * VMX data. */ if (tsk->thread.used_vsr && ctx_has_vsx_region) { - flush_vsx_to_thread(tsk); v_regs += ELF_NVRREG; - err |= copy_vsx_to_user(v_regs, tsk); + unsafe_copy_vsx_to_user(v_regs, tsk, efault_out); /* set MSR_VSX in the MSR value in the frame to * indicate that sc->vs_reg) contains valid data. */ msr |= MSR_VSX; } #endif /* CONFIG_VSX */ - err |= __put_user(&sc->gp_regs, &sc->regs); - WARN_ON(!FULL_REGS(regs)); - err |= __copy_to_user(&sc->gp_regs, regs, GP_REGS_SIZE); - err |= __put_user(msr, &sc->gp_regs[PT_MSR]); - err |= __put_user(softe, &sc->gp_regs[PT_SOFTE]); - err |= __put_user(signr, &sc->signal); - err |= __put_user(handler, &sc->handler); + unsafe_put_user(&sc->gp_regs, &sc->regs, efault_out); + unsafe_copy_to_user(&sc->gp_regs, regs, GP_REGS_SIZE, efault_out); + unsafe_put_user(msr, &sc->gp_regs[PT_MSR], efault_out); + unsafe_put_user(softe, &sc->gp_regs[PT_SOFTE], efault_out); + unsafe_put_user(signr, &sc->signal, efault_out); + unsafe_put_user(handler, &sc->handler, efault_out); if (set != NULL) - err |= __put_user(set->sig[0], &sc->oldmask); + unsafe_put_user(set->sig[0], &sc->oldmask, efault_out); - return err; + return 0; + +efault_out: + return -EFAULT; } #ifdef CONFIG_PPC_TRANSACTIONAL_MEM @@ -192,7 +206,8 @@ static long setup_sigcontext(struct sigcontext __user *sc, static long setup_tm_sigcontexts(struct sigcontext __user *sc, struct sigcontext __user *tm_sc, struct task_struct *tsk, - int signr, sigset_t *set, unsigned long handler) + int signr, sigset_t *set, unsigned long handler, + unsigned long msr) { /* When CONFIG_ALTIVEC is set, we _always_ setup v_regs even if the * process never used altivec yet (MSR_VEC is zero in pt_regs of @@ -207,12 +222,11 @@ static long setup_tm_sigcontexts(struct sigcontext __user *sc, elf_vrreg_t __user *tm_v_regs = sigcontext_vmx_regs(tm_sc); #endif struct pt_regs *regs = tsk->thread.regs; - unsigned long msr = tsk->thread.regs->msr; long err = 0; BUG_ON(tsk != current); - BUG_ON(!MSR_TM_ACTIVE(regs->msr)); + BUG_ON(!MSR_TM_ACTIVE(msr)); WARN_ON(tm_suspend_disabled); @@ -222,13 +236,6 @@ static long setup_tm_sigcontexts(struct sigcontext __user *sc, */ msr |= tsk->thread.ckpt_regs.msr & (MSR_FP | MSR_VEC | MSR_VSX); - /* Remove TM bits from thread's MSR. The MSR in the sigcontext - * just indicates to userland that we were doing a transaction, but we - * don't want to return in transactional state. This also ensures - * that flush_fp_to_thread won't set TIF_RESTORE_TM again. - */ - regs->msr &= ~MSR_TS_MASK; - #ifdef CONFIG_ALTIVEC err |= __put_user(v_regs, &sc->v_regs); err |= __put_user(tm_v_regs, &tm_sc->v_regs); @@ -306,7 +313,6 @@ static long setup_tm_sigcontexts(struct sigcontext __user *sc, err |= __put_user(&sc->gp_regs, &sc->regs); err |= __put_user(&tm_sc->gp_regs, &tm_sc->regs); - WARN_ON(!FULL_REGS(regs)); err |= __copy_to_user(&tm_sc->gp_regs, regs, GP_REGS_SIZE); err |= __copy_to_user(&sc->gp_regs, &tsk->thread.ckpt_regs, GP_REGS_SIZE); @@ -324,14 +330,16 @@ static long setup_tm_sigcontexts(struct sigcontext __user *sc, /* * Restore the sigcontext from the signal frame. */ - -static long restore_sigcontext(struct task_struct *tsk, sigset_t *set, int sig, - struct sigcontext __user *sc) +#define unsafe_restore_sigcontext(tsk, set, sig, sc, label) do { \ + if (__unsafe_restore_sigcontext(tsk, set, sig, sc)) \ + goto label; \ +} while (0) +static long notrace __unsafe_restore_sigcontext(struct task_struct *tsk, sigset_t *set, + int sig, struct sigcontext __user *sc) { #ifdef CONFIG_ALTIVEC elf_vrreg_t __user *v_regs; #endif - unsigned long err = 0; unsigned long save_r13 = 0; unsigned long msr; struct pt_regs *regs = tsk->thread.regs; @@ -346,59 +354,60 @@ static long restore_sigcontext(struct task_struct *tsk, sigset_t *set, int sig, save_r13 = regs->gpr[13]; /* copy the GPRs */ - err |= __copy_from_user(regs->gpr, sc->gp_regs, sizeof(regs->gpr)); - err |= __get_user(regs->nip, &sc->gp_regs[PT_NIP]); + unsafe_copy_from_user(regs->gpr, sc->gp_regs, sizeof(regs->gpr), efault_out); + unsafe_get_user(regs->nip, &sc->gp_regs[PT_NIP], efault_out); /* get MSR separately, transfer the LE bit if doing signal return */ - err |= __get_user(msr, &sc->gp_regs[PT_MSR]); + unsafe_get_user(msr, &sc->gp_regs[PT_MSR], efault_out); if (sig) - regs->msr = (regs->msr & ~MSR_LE) | (msr & MSR_LE); - err |= __get_user(regs->orig_gpr3, &sc->gp_regs[PT_ORIG_R3]); - err |= __get_user(regs->ctr, &sc->gp_regs[PT_CTR]); - err |= __get_user(regs->link, &sc->gp_regs[PT_LNK]); - err |= __get_user(regs->xer, &sc->gp_regs[PT_XER]); - err |= __get_user(regs->ccr, &sc->gp_regs[PT_CCR]); - /* skip SOFTE */ - regs->trap = 0; - err |= __get_user(regs->dar, &sc->gp_regs[PT_DAR]); - err |= __get_user(regs->dsisr, &sc->gp_regs[PT_DSISR]); - err |= __get_user(regs->result, &sc->gp_regs[PT_RESULT]); + regs_set_return_msr(regs, (regs->msr & ~MSR_LE) | (msr & MSR_LE)); + unsafe_get_user(regs->orig_gpr3, &sc->gp_regs[PT_ORIG_R3], efault_out); + unsafe_get_user(regs->ctr, &sc->gp_regs[PT_CTR], efault_out); + unsafe_get_user(regs->link, &sc->gp_regs[PT_LNK], efault_out); + unsafe_get_user(regs->xer, &sc->gp_regs[PT_XER], efault_out); + unsafe_get_user(regs->ccr, &sc->gp_regs[PT_CCR], efault_out); + /* Don't allow userspace to set SOFTE */ + set_trap_norestart(regs); + unsafe_get_user(regs->dar, &sc->gp_regs[PT_DAR], efault_out); + unsafe_get_user(regs->dsisr, &sc->gp_regs[PT_DSISR], efault_out); + unsafe_get_user(regs->result, &sc->gp_regs[PT_RESULT], efault_out); if (!sig) regs->gpr[13] = save_r13; if (set != NULL) - err |= __get_user(set->sig[0], &sc->oldmask); + unsafe_get_user(set->sig[0], &sc->oldmask, efault_out); /* - * Force reload of FP/VEC. - * This has to be done before copying stuff into tsk->thread.fpr/vr - * for the reasons explained in the previous comment. + * Force reload of FP/VEC/VSX so userspace sees any changes. + * Clear these bits from the user process' MSR before copying into the + * thread struct. If we are rescheduled or preempted and another task + * uses FP/VEC/VSX, and this process has the MSR bits set, then the + * context switch code will save the current CPU state into the + * thread_struct - possibly overwriting the data we are updating here. */ - regs->msr &= ~(MSR_FP | MSR_FE0 | MSR_FE1 | MSR_VEC | MSR_VSX); + regs_set_return_msr(regs, regs->msr & ~(MSR_FP | MSR_FE0 | MSR_FE1 | MSR_VEC | MSR_VSX)); #ifdef CONFIG_ALTIVEC - err |= __get_user(v_regs, &sc->v_regs); - if (err) - return err; + unsafe_get_user(v_regs, &sc->v_regs, efault_out); if (v_regs && !access_ok(v_regs, 34 * sizeof(vector128))) return -EFAULT; /* Copy 33 vec registers (vr0..31 and vscr) from the stack */ if (v_regs != NULL && (msr & MSR_VEC) != 0) { - err |= __copy_from_user(&tsk->thread.vr_state, v_regs, - 33 * sizeof(vector128)); + unsafe_copy_from_user(&tsk->thread.vr_state, v_regs, + 33 * sizeof(vector128), efault_out); tsk->thread.used_vr = true; } else if (tsk->thread.used_vr) { memset(&tsk->thread.vr_state, 0, 33 * sizeof(vector128)); } /* Always get VRSAVE back */ if (v_regs != NULL) - err |= __get_user(tsk->thread.vrsave, (u32 __user *)&v_regs[33]); + unsafe_get_user(tsk->thread.vrsave, (u32 __user *)&v_regs[33], efault_out); else tsk->thread.vrsave = 0; if (cpu_has_feature(CPU_FTR_ALTIVEC)) mtspr(SPRN_VRSAVE, tsk->thread.vrsave); #endif /* CONFIG_ALTIVEC */ /* restore floating point */ - err |= copy_fpr_from_user(tsk, &sc->fp_regs); + unsafe_copy_fpr_from_user(tsk, &sc->fp_regs, efault_out); #ifdef CONFIG_VSX /* * Get additional VSX data. Update v_regs to point after the @@ -407,14 +416,17 @@ static long restore_sigcontext(struct task_struct *tsk, sigset_t *set, int sig, */ v_regs += ELF_NVRREG; if ((msr & MSR_VSX) != 0) { - err |= copy_vsx_from_user(tsk, v_regs); + unsafe_copy_vsx_from_user(tsk, v_regs, efault_out); tsk->thread.used_vsr = true; } else { for (i = 0; i < 32 ; i++) tsk->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = 0; } #endif - return err; + return 0; + +efault_out: + return -EFAULT; } #ifdef CONFIG_PPC_TRANSACTIONAL_MEM @@ -464,7 +476,7 @@ static long restore_tm_sigcontexts(struct task_struct *tsk, return -EINVAL; /* pull in MSR LE from user context */ - regs->msr = (regs->msr & ~MSR_LE) | (msr & MSR_LE); + regs_set_return_msr(regs, (regs->msr & ~MSR_LE) | (msr & MSR_LE)); /* The following non-GPR non-FPR non-VR state is also checkpointed: */ err |= __get_user(regs->ctr, &tm_sc->gp_regs[PT_CTR]); @@ -479,9 +491,9 @@ static long restore_tm_sigcontexts(struct task_struct *tsk, &sc->gp_regs[PT_XER]); err |= __get_user(tsk->thread.ckpt_regs.ccr, &sc->gp_regs[PT_CCR]); - + /* Don't allow userspace to set SOFTE */ + set_trap_norestart(regs); /* These regs are not checkpointed; they can go in 'regs'. */ - err |= __get_user(regs->trap, &sc->gp_regs[PT_TRAP]); err |= __get_user(regs->dar, &sc->gp_regs[PT_DAR]); err |= __get_user(regs->dsisr, &sc->gp_regs[PT_DSISR]); err |= __get_user(regs->result, &sc->gp_regs[PT_RESULT]); @@ -491,7 +503,7 @@ static long restore_tm_sigcontexts(struct task_struct *tsk, * This has to be done before copying stuff into tsk->thread.fpr/vr * for the reasons explained in the previous comment. */ - regs->msr &= ~(MSR_FP | MSR_FE0 | MSR_FE1 | MSR_VEC | MSR_VSX); + regs_set_return_msr(regs, regs->msr & ~(MSR_FP | MSR_FE0 | MSR_FE1 | MSR_VEC | MSR_VSX)); #ifdef CONFIG_ALTIVEC err |= __get_user(v_regs, &sc->v_regs); @@ -561,7 +573,7 @@ static long restore_tm_sigcontexts(struct task_struct *tsk, preempt_disable(); /* pull in MSR TS bits from user context */ - regs->msr |= msr & MSR_TS_MASK; + regs_set_return_msr(regs, regs->msr | (msr & MSR_TS_MASK)); /* * Ensure that TM is enabled in regs->msr before we leave the signal @@ -579,7 +591,7 @@ static long restore_tm_sigcontexts(struct task_struct *tsk, * to be de-scheduled with MSR[TS] set but without calling * tm_recheckpoint(). This can cause a bug. */ - regs->msr |= MSR_TM; + regs_set_return_msr(regs, regs->msr | MSR_TM); /* This loads the checkpointed FP/VEC state, if used */ tm_recheckpoint(&tsk->thread); @@ -587,17 +599,23 @@ static long restore_tm_sigcontexts(struct task_struct *tsk, msr_check_and_set(msr & (MSR_FP | MSR_VEC)); if (msr & MSR_FP) { load_fp_state(&tsk->thread.fp_state); - regs->msr |= (MSR_FP | tsk->thread.fpexc_mode); + regs_set_return_msr(regs, regs->msr | (MSR_FP | tsk->thread.fpexc_mode)); } if (msr & MSR_VEC) { load_vr_state(&tsk->thread.vr_state); - regs->msr |= MSR_VEC; + regs_set_return_msr(regs, regs->msr | MSR_VEC); } preempt_enable(); return err; } +#else /* !CONFIG_PPC_TRANSACTIONAL_MEM */ +static long restore_tm_sigcontexts(struct task_struct *tsk, struct sigcontext __user *sc, + struct sigcontext __user *tm_sc) +{ + return -EINVAL; +} #endif /* @@ -608,13 +626,12 @@ static long setup_trampoline(unsigned int syscall, unsigned int __user *tramp) int i; long err = 0; - /* addi r1, r1, __SIGNAL_FRAMESIZE # Pop the dummy stackframe */ - err |= __put_user(PPC_INST_ADDI | __PPC_RT(R1) | __PPC_RA(R1) | - (__SIGNAL_FRAMESIZE & 0xffff), &tramp[0]); - /* li r0, __NR_[rt_]sigreturn| */ - err |= __put_user(PPC_INST_ADDI | (syscall & 0xffff), &tramp[1]); - /* sc */ - err |= __put_user(PPC_INST_SC, &tramp[2]); + /* Call the handler and pop the dummy stackframe*/ + err |= __put_user(PPC_RAW_BCTRL(), &tramp[0]); + err |= __put_user(PPC_RAW_ADDI(_R1, _R1, __SIGNAL_FRAMESIZE), &tramp[1]); + + err |= __put_user(PPC_RAW_LI(_R0, syscall), &tramp[2]); + err |= __put_user(PPC_RAW_SC(), &tramp[3]); /* Minimal traceback info */ for (i=TRAMP_TRACEBACK; i < TRAMP_SIZE ;i++) @@ -640,7 +657,6 @@ static long setup_trampoline(unsigned int syscall, unsigned int __user *tramp) SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx, struct ucontext __user *, new_ctx, long, ctx_size) { - unsigned char tmp; sigset_t set; unsigned long new_msr = 0; int ctx_has_vsx_region = 0; @@ -666,18 +682,21 @@ SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx, ctx_has_vsx_region = 1; if (old_ctx != NULL) { - if (!access_ok(old_ctx, ctx_size) - || setup_sigcontext(&old_ctx->uc_mcontext, current, 0, NULL, 0, - ctx_has_vsx_region) - || __copy_to_user(&old_ctx->uc_sigmask, - ¤t->blocked, sizeof(sigset_t))) + prepare_setup_sigcontext(current); + if (!user_write_access_begin(old_ctx, ctx_size)) return -EFAULT; + + unsafe_setup_sigcontext(&old_ctx->uc_mcontext, current, 0, NULL, + 0, ctx_has_vsx_region, efault_out); + unsafe_copy_to_user(&old_ctx->uc_sigmask, ¤t->blocked, + sizeof(sigset_t), efault_out); + + user_write_access_end(); } if (new_ctx == NULL) return 0; - if (!access_ok(new_ctx, ctx_size) - || __get_user(tmp, (u8 __user *) new_ctx) - || __get_user(tmp, (u8 __user *) new_ctx + ctx_size - 1)) + if (!access_ok(new_ctx, ctx_size) || + fault_in_readable((char __user *)new_ctx, ctx_size)) return -EFAULT; /* @@ -692,15 +711,29 @@ SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx, * We kill the task with a SIGSEGV in this situation. */ - if (__copy_from_user(&set, &new_ctx->uc_sigmask, sizeof(set))) - do_exit(SIGSEGV); + if (__get_user_sigset(&set, &new_ctx->uc_sigmask)) { + force_exit_sig(SIGSEGV); + return -EFAULT; + } set_current_blocked(&set); - if (restore_sigcontext(current, NULL, 0, &new_ctx->uc_mcontext)) - do_exit(SIGSEGV); + + if (!user_read_access_begin(new_ctx, ctx_size)) + return -EFAULT; + if (__unsafe_restore_sigcontext(current, NULL, 0, &new_ctx->uc_mcontext)) { + user_read_access_end(); + force_exit_sig(SIGSEGV); + return -EFAULT; + } + user_read_access_end(); /* This returns like rt_sigreturn */ set_thread_flag(TIF_RESTOREALL); + return 0; + +efault_out: + user_write_access_end(); + return -EFAULT; } @@ -713,9 +746,7 @@ SYSCALL_DEFINE0(rt_sigreturn) struct pt_regs *regs = current_pt_regs(); struct ucontext __user *uc = (struct ucontext __user *)regs->gpr[1]; sigset_t set; -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM unsigned long msr; -#endif /* Always make any pending restarted system calls return -EINTR */ current->restart_block.fn = do_no_restart_syscall; @@ -723,52 +754,54 @@ SYSCALL_DEFINE0(rt_sigreturn) if (!access_ok(uc, sizeof(*uc))) goto badframe; - if (__copy_from_user(&set, &uc->uc_sigmask, sizeof(set))) + if (__get_user_sigset(&set, &uc->uc_sigmask)) goto badframe; set_current_blocked(&set); -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM - /* - * If there is a transactional state then throw it away. - * The purpose of a sigreturn is to destroy all traces of the - * signal frame, this includes any transactional state created - * within in. We only check for suspended as we can never be - * active in the kernel, we are active, there is nothing better to - * do than go ahead and Bad Thing later. - * The cause is not important as there will never be a - * recheckpoint so it's not user visible. - */ - if (MSR_TM_SUSPENDED(mfmsr())) - tm_reclaim_current(0); + if (IS_ENABLED(CONFIG_PPC_TRANSACTIONAL_MEM)) { + /* + * If there is a transactional state then throw it away. + * The purpose of a sigreturn is to destroy all traces of the + * signal frame, this includes any transactional state created + * within in. We only check for suspended as we can never be + * active in the kernel, we are active, there is nothing better to + * do than go ahead and Bad Thing later. + * The cause is not important as there will never be a + * recheckpoint so it's not user visible. + */ + if (MSR_TM_SUSPENDED(mfmsr())) + tm_reclaim_current(0); - /* - * Disable MSR[TS] bit also, so, if there is an exception in the - * code below (as a page fault in copy_ckvsx_to_user()), it does - * not recheckpoint this task if there was a context switch inside - * the exception. - * - * A major page fault can indirectly call schedule(). A reschedule - * process in the middle of an exception can have a side effect - * (Changing the CPU MSR[TS] state), since schedule() is called - * with the CPU MSR[TS] disable and returns with MSR[TS]=Suspended - * (switch_to() calls tm_recheckpoint() for the 'new' process). In - * this case, the process continues to be the same in the CPU, but - * the CPU state just changed. - * - * This can cause a TM Bad Thing, since the MSR in the stack will - * have the MSR[TS]=0, and this is what will be used to RFID. - * - * Clearing MSR[TS] state here will avoid a recheckpoint if there - * is any process reschedule in kernel space. The MSR[TS] state - * does not need to be saved also, since it will be replaced with - * the MSR[TS] that came from user context later, at - * restore_tm_sigcontexts. - */ - regs->msr &= ~MSR_TS_MASK; + /* + * Disable MSR[TS] bit also, so, if there is an exception in the + * code below (as a page fault in copy_ckvsx_to_user()), it does + * not recheckpoint this task if there was a context switch inside + * the exception. + * + * A major page fault can indirectly call schedule(). A reschedule + * process in the middle of an exception can have a side effect + * (Changing the CPU MSR[TS] state), since schedule() is called + * with the CPU MSR[TS] disable and returns with MSR[TS]=Suspended + * (switch_to() calls tm_recheckpoint() for the 'new' process). In + * this case, the process continues to be the same in the CPU, but + * the CPU state just changed. + * + * This can cause a TM Bad Thing, since the MSR in the stack will + * have the MSR[TS]=0, and this is what will be used to RFID. + * + * Clearing MSR[TS] state here will avoid a recheckpoint if there + * is any process reschedule in kernel space. The MSR[TS] state + * does not need to be saved also, since it will be replaced with + * the MSR[TS] that came from user context later, at + * restore_tm_sigcontexts. + */ + regs_set_return_msr(regs, regs->msr & ~MSR_TS_MASK); - if (__get_user(msr, &uc->uc_mcontext.gp_regs[PT_MSR])) - goto badframe; - if (MSR_TM_ACTIVE(msr)) { + if (__get_user(msr, &uc->uc_mcontext.gp_regs[PT_MSR])) + goto badframe; + } + + if (IS_ENABLED(CONFIG_PPC_TRANSACTIONAL_MEM) && MSR_TM_ACTIVE(msr)) { /* We recheckpoint on return. */ struct ucontext __user *uc_transact; @@ -781,9 +814,7 @@ SYSCALL_DEFINE0(rt_sigreturn) if (restore_tm_sigcontexts(current, &uc->uc_mcontext, &uc_transact->uc_mcontext)) goto badframe; - } else -#endif - { + } else { /* * Fall through, for non-TM restore * @@ -796,22 +827,28 @@ SYSCALL_DEFINE0(rt_sigreturn) * MSR[TS] set, but without CPU in the proper state, * causing a TM bad thing. */ - current->thread.regs->msr &= ~MSR_TS_MASK; - if (restore_sigcontext(current, NULL, 1, &uc->uc_mcontext)) + regs_set_return_msr(current->thread.regs, + current->thread.regs->msr & ~MSR_TS_MASK); + if (!user_read_access_begin(&uc->uc_mcontext, sizeof(uc->uc_mcontext))) goto badframe; + + unsafe_restore_sigcontext(current, NULL, 1, &uc->uc_mcontext, + badframe_block); + + user_read_access_end(); } if (restore_altstack(&uc->uc_stack)) goto badframe; set_thread_flag(TIF_RESTOREALL); + return 0; +badframe_block: + user_read_access_end(); badframe: - if (show_unhandled_signals) - printk_ratelimited(regs->msr & MSR_64BIT ? fmt64 : fmt32, - current->comm, current->pid, "rt_sigreturn", - (long)uc, regs->nip, regs->link); + signal_fault(current, regs, "rt_sigreturn", uc); force_sig(SIGSEGV); return 0; @@ -824,55 +861,73 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set, unsigned long newsp = 0; long err = 0; struct pt_regs *regs = tsk->thread.regs; + /* Save the thread's msr before get_tm_stackpointer() changes it */ + unsigned long msr = regs->msr; - BUG_ON(tsk != current); + frame = get_sigframe(ksig, tsk, sizeof(*frame), 0); - frame = get_sigframe(ksig, get_tm_stackpointer(tsk), sizeof(*frame), 0); - if (unlikely(frame == NULL)) - goto badframe; + /* + * This only applies when calling unsafe_setup_sigcontext() and must be + * called before opening the uaccess window. + */ + if (!MSR_TM_ACTIVE(msr)) + prepare_setup_sigcontext(tsk); - err |= __put_user(&frame->info, &frame->pinfo); - err |= __put_user(&frame->uc, &frame->puc); - err |= copy_siginfo_to_user(&frame->info, &ksig->info); - if (err) + if (!user_write_access_begin(frame, sizeof(*frame))) goto badframe; + unsafe_put_user(&frame->info, &frame->pinfo, badframe_block); + unsafe_put_user(&frame->uc, &frame->puc, badframe_block); + /* Create the ucontext. */ - err |= __put_user(0, &frame->uc.uc_flags); - err |= __save_altstack(&frame->uc.uc_stack, regs->gpr[1]); + unsafe_put_user(0, &frame->uc.uc_flags, badframe_block); + unsafe_save_altstack(&frame->uc.uc_stack, regs->gpr[1], badframe_block); + + if (MSR_TM_ACTIVE(msr)) { #ifdef CONFIG_PPC_TRANSACTIONAL_MEM - if (MSR_TM_ACTIVE(regs->msr)) { /* The ucontext_t passed to userland points to the second * ucontext_t (for transactional state) with its uc_link ptr. */ - err |= __put_user(&frame->uc_transact, &frame->uc.uc_link); + unsafe_put_user(&frame->uc_transact, &frame->uc.uc_link, badframe_block); + + user_write_access_end(); + err |= setup_tm_sigcontexts(&frame->uc.uc_mcontext, &frame->uc_transact.uc_mcontext, tsk, ksig->sig, NULL, - (unsigned long)ksig->ka.sa.sa_handler); - } else + (unsigned long)ksig->ka.sa.sa_handler, + msr); + + if (!user_write_access_begin(&frame->uc.uc_sigmask, + sizeof(frame->uc.uc_sigmask))) + goto badframe; + #endif - { - err |= __put_user(0, &frame->uc.uc_link); - err |= setup_sigcontext(&frame->uc.uc_mcontext, tsk, ksig->sig, + } else { + unsafe_put_user(0, &frame->uc.uc_link, badframe_block); + unsafe_setup_sigcontext(&frame->uc.uc_mcontext, tsk, ksig->sig, NULL, (unsigned long)ksig->ka.sa.sa_handler, - 1); + 1, badframe_block); } - err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); - if (err) + + unsafe_copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set), badframe_block); + user_write_access_end(); + + /* Save the siginfo outside of the unsafe block. */ + if (copy_siginfo_to_user(&frame->info, &ksig->info)) goto badframe; /* Make sure signal handler doesn't get spurious FP exceptions */ tsk->thread.fp_state.fpscr = 0; /* Set up to return from userspace. */ - if (vdso64_rt_sigtramp && tsk->mm->context.vdso_base) { - regs->link = tsk->mm->context.vdso_base + vdso64_rt_sigtramp; + if (tsk->mm->context.vdso) { + regs_set_return_ip(regs, VDSO64_SYMBOL(tsk->mm->context.vdso, sigtramp_rt64)); } else { err |= setup_trampoline(__NR_rt_sigreturn, &frame->tramp[0]); if (err) goto badframe; - regs->link = (unsigned long) &frame->tramp[0]; + regs_set_return_ip(regs, (unsigned long) &frame->tramp[0]); } /* Allocate a dummy caller frame for the signal handler. */ @@ -881,30 +936,29 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set, /* Set up "regs" so we "return" to the signal handler. */ if (is_elf2_task()) { - regs->nip = (unsigned long) ksig->ka.sa.sa_handler; - regs->gpr[12] = regs->nip; + regs->ctr = (unsigned long) ksig->ka.sa.sa_handler; + regs->gpr[12] = regs->ctr; } else { /* Handler is *really* a pointer to the function descriptor for * the signal routine. The first entry in the function * descriptor is the entry address of signal and the second * entry is the TOC value we need to use. */ - func_descr_t __user *funct_desc_ptr = - (func_descr_t __user *) ksig->ka.sa.sa_handler; + struct func_desc __user *ptr = + (struct func_desc __user *)ksig->ka.sa.sa_handler; - err |= get_user(regs->nip, &funct_desc_ptr->entry); - err |= get_user(regs->gpr[2], &funct_desc_ptr->toc); + err |= get_user(regs->ctr, &ptr->addr); + err |= get_user(regs->gpr[2], &ptr->toc); } /* enter the signal handler in native-endian mode */ - regs->msr &= ~MSR_LE; - regs->msr |= (MSR_KERNEL & MSR_LE); + regs_set_return_msr(regs, (regs->msr & ~MSR_LE) | (MSR_KERNEL & MSR_LE)); regs->gpr[1] = newsp; regs->gpr[3] = ksig->sig; regs->result = 0; if (ksig->ka.sa.sa_flags & SA_SIGINFO) { - err |= get_user(regs->gpr[4], (unsigned long __user *)&frame->pinfo); - err |= get_user(regs->gpr[5], (unsigned long __user *)&frame->puc); + regs->gpr[4] = (unsigned long)&frame->info; + regs->gpr[5] = (unsigned long)&frame->uc; regs->gpr[6] = (unsigned long) frame; } else { regs->gpr[4] = (unsigned long)&frame->uc.uc_mcontext; @@ -914,11 +968,10 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set, return 0; +badframe_block: + user_write_access_end(); badframe: - if (show_unhandled_signals) - printk_ratelimited(regs->msr & MSR_64BIT ? fmt64 : fmt32, - tsk->comm, tsk->pid, "setup_rt_frame", - (long)frame, regs->nip, regs->link); + signal_fault(current, regs, "handle_rt_signal64", frame); return 1; } diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index ea6adbf6a221..693334c20d07 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -33,6 +33,9 @@ #include <linux/processor.h> #include <linux/random.h> #include <linux/stackprotector.h> +#include <linux/pgtable.h> +#include <linux/clockchips.h> +#include <linux/kexec.h> #include <asm/ptrace.h> #include <linux/atomic.h> @@ -41,11 +44,10 @@ #include <asm/kvm_ppc.h> #include <asm/dbell.h> #include <asm/page.h> -#include <asm/pgtable.h> -#include <asm/prom.h> #include <asm/smp.h> #include <asm/time.h> #include <asm/machdep.h> +#include <asm/mmu_context.h> #include <asm/cputhreads.h> #include <asm/cputable.h> #include <asm/mpic.h> @@ -55,10 +57,12 @@ #endif #include <asm/vdso.h> #include <asm/debug.h> -#include <asm/kexec.h> -#include <asm/asm-prototypes.h> #include <asm/cpu_has_feature.h> #include <asm/ftrace.h> +#include <asm/kup.h> +#include <asm/fadump.h> + +#include <trace/events/ipi.h> #ifdef DEBUG #include <asm/udbg.h> @@ -73,12 +77,16 @@ static DEFINE_PER_CPU(int, cpu_state) = { 0 }; #endif struct task_struct *secondary_current; -bool has_big_cores; +bool has_big_cores __ro_after_init; +bool coregroup_enabled __ro_after_init; +bool thread_group_shares_l2 __ro_after_init; +bool thread_group_shares_l3 __ro_after_init; DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map); DEFINE_PER_CPU(cpumask_var_t, cpu_smallcore_map); DEFINE_PER_CPU(cpumask_var_t, cpu_l2_cache_map); DEFINE_PER_CPU(cpumask_var_t, cpu_core_map); +static DEFINE_PER_CPU(cpumask_var_t, cpu_coregroup_map); EXPORT_PER_CPU_SYMBOL(cpu_sibling_map); EXPORT_PER_CPU_SYMBOL(cpu_l2_cache_map); @@ -87,6 +95,7 @@ EXPORT_SYMBOL_GPL(has_big_cores); #define MAX_THREAD_LIST_SIZE 8 #define THREAD_GROUP_SHARE_L1 1 +#define THREAD_GROUP_SHARE_L2_L3 2 struct thread_groups { unsigned int property; unsigned int nr_groups; @@ -94,11 +103,33 @@ struct thread_groups { unsigned int thread_list[MAX_THREAD_LIST_SIZE]; }; +/* Maximum number of properties that groups of threads within a core can share */ +#define MAX_THREAD_GROUP_PROPERTIES 2 + +struct thread_groups_list { + unsigned int nr_properties; + struct thread_groups property_tgs[MAX_THREAD_GROUP_PROPERTIES]; +}; + +static struct thread_groups_list tgl[NR_CPUS] __initdata; /* - * On big-cores system, cpu_l1_cache_map for each CPU corresponds to + * On big-cores system, thread_group_l1_cache_map for each CPU corresponds to * the set its siblings that share the L1-cache. */ -DEFINE_PER_CPU(cpumask_var_t, cpu_l1_cache_map); +DEFINE_PER_CPU(cpumask_var_t, thread_group_l1_cache_map); + +/* + * On some big-cores system, thread_group_l2_cache_map for each CPU + * corresponds to the set its siblings within the core that share the + * L2-cache. + */ +DEFINE_PER_CPU(cpumask_var_t, thread_group_l2_cache_map); + +/* + * On P10, thread_group_l3_cache_map for each CPU is equal to the + * thread_group_l2_cache_map + */ +DEFINE_PER_CPU(cpumask_var_t, thread_group_l3_cache_map); /* SMP operations for this machine */ struct smp_ops_t *smp_ops; @@ -252,7 +283,7 @@ void smp_muxed_ipi_set_message(int cpu, int msg) * Order previous accesses before accesses in the IPI handler. */ smp_mb(); - message[msg] = 1; + WRITE_ONCE(message[msg], 1); } void smp_muxed_ipi_message_pass(int cpu, int msg) @@ -311,7 +342,7 @@ irqreturn_t smp_ipi_demux_relaxed(void) if (all & IPI_MESSAGE(PPC_MSG_NMI_IPI)) nmi_ipi_action(0, NULL); #endif - } while (info->messages); + } while (READ_ONCE(info->messages)); return IRQ_HANDLED; } @@ -327,12 +358,12 @@ static inline void do_message_pass(int cpu, int msg) #endif } -void smp_send_reschedule(int cpu) +void arch_smp_send_reschedule(int cpu) { if (likely(smp_ops)) do_message_pass(cpu, PPC_MSG_RESCHEDULE); } -EXPORT_SYMBOL_GPL(smp_send_reschedule); +EXPORT_SYMBOL_GPL(arch_smp_send_reschedule); void arch_send_call_function_single_ipi(int cpu) { @@ -374,32 +405,32 @@ static struct cpumask nmi_ipi_pending_mask; static bool nmi_ipi_busy = false; static void (*nmi_ipi_function)(struct pt_regs *) = NULL; -static void nmi_ipi_lock_start(unsigned long *flags) +noinstr static void nmi_ipi_lock_start(unsigned long *flags) { raw_local_irq_save(*flags); hard_irq_disable(); - while (atomic_cmpxchg(&__nmi_ipi_lock, 0, 1) == 1) { + while (raw_atomic_cmpxchg(&__nmi_ipi_lock, 0, 1) == 1) { raw_local_irq_restore(*flags); - spin_until_cond(atomic_read(&__nmi_ipi_lock) == 0); + spin_until_cond(raw_atomic_read(&__nmi_ipi_lock) == 0); raw_local_irq_save(*flags); hard_irq_disable(); } } -static void nmi_ipi_lock(void) +noinstr static void nmi_ipi_lock(void) { - while (atomic_cmpxchg(&__nmi_ipi_lock, 0, 1) == 1) - spin_until_cond(atomic_read(&__nmi_ipi_lock) == 0); + while (raw_atomic_cmpxchg(&__nmi_ipi_lock, 0, 1) == 1) + spin_until_cond(raw_atomic_read(&__nmi_ipi_lock) == 0); } -static void nmi_ipi_unlock(void) +noinstr static void nmi_ipi_unlock(void) { smp_mb(); - WARN_ON(atomic_read(&__nmi_ipi_lock) != 1); - atomic_set(&__nmi_ipi_lock, 0); + WARN_ON(raw_atomic_read(&__nmi_ipi_lock) != 1); + raw_atomic_set(&__nmi_ipi_lock, 0); } -static void nmi_ipi_unlock_end(unsigned long *flags) +noinstr static void nmi_ipi_unlock_end(unsigned long *flags) { nmi_ipi_unlock(); raw_local_irq_restore(*flags); @@ -408,7 +439,7 @@ static void nmi_ipi_unlock_end(unsigned long *flags) /* * Platform NMI handler calls this to ack */ -int smp_handle_nmi_ipi(struct pt_regs *regs) +noinstr int smp_handle_nmi_ipi(struct pt_regs *regs) { void (*fn)(struct pt_regs *) = NULL; unsigned long flags; @@ -546,7 +577,7 @@ void tick_broadcast(const struct cpumask *mask) #endif #ifdef CONFIG_DEBUGGER -void debugger_ipi_callback(struct pt_regs *regs) +static void debugger_ipi_callback(struct pt_regs *regs) { debugger_ipi(regs); } @@ -582,12 +613,42 @@ void crash_send_ipi(void (*crash_ipi_callback)(struct pt_regs *)) } #endif +void crash_smp_send_stop(void) +{ + static bool stopped = false; + + /* + * In case of fadump, register data for all CPUs is captured by f/w + * on ibm,os-term rtas call. Skip IPI callbacks to other CPUs before + * this rtas call to avoid tricky post processing of those CPUs' + * backtraces. + */ + if (should_fadump_crash()) + return; + + if (stopped) + return; + + stopped = true; + +#ifdef CONFIG_KEXEC_CORE + if (kexec_crash_image) { + crash_kexec_prepare(); + return; + } +#endif + + smp_send_stop(); +} + #ifdef CONFIG_NMI_IPI static void nmi_stop_this_cpu(struct pt_regs *regs) { /* * IRQs are already hard disabled by the smp_handle_nmi_ipi. */ + set_cpu_online(smp_processor_id(), false); + spin_begin(); while (1) spin_cpu_relax(); @@ -603,6 +664,15 @@ void smp_send_stop(void) static void stop_this_cpu(void *dummy) { hard_irq_disable(); + + /* + * Offlining CPUs in stop_this_cpu can result in scheduler warnings, + * (see commit de6e5d38417e), but printk_safe_flush_on_panic() wants + * to know other CPUs are offline before it breaks locks to flush + * printk buffers, in case we panic()ed while holding the lock. + */ + set_cpu_online(smp_processor_id(), false); + spin_begin(); while (1) spin_cpu_relax(); @@ -627,12 +697,12 @@ void smp_send_stop(void) } #endif /* CONFIG_NMI_IPI */ -struct task_struct *current_set[NR_CPUS]; +static struct task_struct *current_set[NR_CPUS]; static void smp_store_cpu_info(int id) { per_cpu(cpu_pvr, id) = mfspr(SPRN_PVR); -#ifdef CONFIG_PPC_FSL_BOOK3E +#ifdef CONFIG_PPC_E500 per_cpu(next_tlbcam_idx, id) = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) - 1; #endif @@ -659,83 +729,124 @@ static void set_cpus_unrelated(int i, int j, #endif /* + * Extends set_cpus_related. Instead of setting one CPU at a time in + * dstmask, set srcmask at oneshot. dstmask should be super set of srcmask. + */ +static void or_cpumasks_related(int i, int j, struct cpumask *(*srcmask)(int), + struct cpumask *(*dstmask)(int)) +{ + struct cpumask *mask; + int k; + + mask = srcmask(j); + for_each_cpu(k, srcmask(i)) + cpumask_or(dstmask(k), dstmask(k), mask); + + if (i == j) + return; + + mask = srcmask(i); + for_each_cpu(k, srcmask(j)) + cpumask_or(dstmask(k), dstmask(k), mask); +} + +/* * parse_thread_groups: Parses the "ibm,thread-groups" device tree * property for the CPU device node @dn and stores - * the parsed output in the thread_groups - * structure @tg if the ibm,thread-groups[0] - * matches @property. + * the parsed output in the thread_groups_list + * structure @tglp. * * @dn: The device node of the CPU device. - * @tg: Pointer to a thread group structure into which the parsed + * @tglp: Pointer to a thread group list structure into which the parsed * output of "ibm,thread-groups" is stored. - * @property: The property of the thread-group that the caller is - * interested in. * * ibm,thread-groups[0..N-1] array defines which group of threads in * the CPU-device node can be grouped together based on the property. * - * ibm,thread-groups[0] tells us the property based on which the + * This array can represent thread groupings for multiple properties. + * + * ibm,thread-groups[i + 0] tells us the property based on which the * threads are being grouped together. If this value is 1, it implies - * that the threads in the same group share L1, translation cache. + * that the threads in the same group share L1, translation cache. If + * the value is 2, it implies that the threads in the same group share + * the same L2 cache. * - * ibm,thread-groups[1] tells us how many such thread groups exist. + * ibm,thread-groups[i+1] tells us how many such thread groups exist for the + * property ibm,thread-groups[i] * - * ibm,thread-groups[2] tells us the number of threads in each such + * ibm,thread-groups[i+2] tells us the number of threads in each such * group. + * Suppose k = (ibm,thread-groups[i+1] * ibm,thread-groups[i+2]), then, * - * ibm,thread-groups[3..N-1] is the list of threads identified by + * ibm,thread-groups[i+3..i+k+2] (is the list of threads identified by * "ibm,ppc-interrupt-server#s" arranged as per their membership in * the grouping. * - * Example: If ibm,thread-groups = [1,2,4,5,6,7,8,9,10,11,12] it - * implies that there are 2 groups of 4 threads each, where each group - * of threads share L1, translation cache. + * Example: + * If "ibm,thread-groups" = [1,2,4,8,10,12,14,9,11,13,15,2,2,4,8,10,12,14,9,11,13,15] + * This can be decomposed up into two consecutive arrays: + * a) [1,2,4,8,10,12,14,9,11,13,15] + * b) [2,2,4,8,10,12,14,9,11,13,15] * - * The "ibm,ppc-interrupt-server#s" of the first group is {5,6,7,8} - * and the "ibm,ppc-interrupt-server#s" of the second group is {9, 10, - * 11, 12} structure + * where in, + * + * a) provides information of Property "1" being shared by "2" groups, + * each with "4" threads each. The "ibm,ppc-interrupt-server#s" of + * the first group is {8,10,12,14} and the + * "ibm,ppc-interrupt-server#s" of the second group is + * {9,11,13,15}. Property "1" is indicative of the thread in the + * group sharing L1 cache, translation cache and Instruction Data + * flow. + * + * b) provides information of Property "2" being shared by "2" groups, + * each group with "4" threads. The "ibm,ppc-interrupt-server#s" of + * the first group is {8,10,12,14} and the + * "ibm,ppc-interrupt-server#s" of the second group is + * {9,11,13,15}. Property "2" indicates that the threads in each + * group share the L2-cache. * * Returns 0 on success, -EINVAL if the property does not exist, * -ENODATA if property does not have a value, and -EOVERFLOW if the * property data isn't large enough. */ static int parse_thread_groups(struct device_node *dn, - struct thread_groups *tg, - unsigned int property) + struct thread_groups_list *tglp) { - int i; - u32 thread_group_array[3 + MAX_THREAD_LIST_SIZE]; - u32 *thread_list; + unsigned int property_idx = 0; + u32 *thread_group_array; size_t total_threads; - int ret; + int ret = 0, count; + u32 *thread_list; + int i = 0; + count = of_property_count_u32_elems(dn, "ibm,thread-groups"); + thread_group_array = kcalloc(count, sizeof(u32), GFP_KERNEL); ret = of_property_read_u32_array(dn, "ibm,thread-groups", - thread_group_array, 3); + thread_group_array, count); if (ret) - return ret; - - tg->property = thread_group_array[0]; - tg->nr_groups = thread_group_array[1]; - tg->threads_per_group = thread_group_array[2]; - if (tg->property != property || - tg->nr_groups < 1 || - tg->threads_per_group < 1) - return -ENODATA; + goto out_free; - total_threads = tg->nr_groups * tg->threads_per_group; + while (i < count && property_idx < MAX_THREAD_GROUP_PROPERTIES) { + int j; + struct thread_groups *tg = &tglp->property_tgs[property_idx++]; - ret = of_property_read_u32_array(dn, "ibm,thread-groups", - thread_group_array, - 3 + total_threads); - if (ret) - return ret; + tg->property = thread_group_array[i]; + tg->nr_groups = thread_group_array[i + 1]; + tg->threads_per_group = thread_group_array[i + 2]; + total_threads = tg->nr_groups * tg->threads_per_group; - thread_list = &thread_group_array[3]; + thread_list = &thread_group_array[i + 3]; - for (i = 0 ; i < total_threads; i++) - tg->thread_list[i] = thread_list[i]; + for (j = 0; j < total_threads; j++) + tg->thread_list[j] = thread_list[j]; + i = i + 3 + total_threads; + } - return 0; + tglp->nr_properties = property_idx; + +out_free: + kfree(thread_group_array); + return ret; } /* @@ -746,7 +857,7 @@ static int parse_thread_groups(struct device_node *dn, * @tg : The thread-group structure of the CPU node which @cpu belongs * to. * - * Returns the index to tg->thread_list that points to the the start + * Returns the index to tg->thread_list that points to the start * of the thread_group that @cpu belongs to. * * Returns -1 if cpu doesn't belong to any of the groups pointed to by @@ -771,59 +882,188 @@ static int get_cpu_thread_group_start(int cpu, struct thread_groups *tg) return -1; } -static int init_cpu_l1_cache_map(int cpu) - +static struct thread_groups *__init get_thread_groups(int cpu, + int group_property, + int *err) { struct device_node *dn = of_get_cpu_node(cpu, NULL); - struct thread_groups tg = {.property = 0, - .nr_groups = 0, - .threads_per_group = 0}; - int first_thread = cpu_first_thread_sibling(cpu); - int i, cpu_group_start = -1, err = 0; + struct thread_groups_list *cpu_tgl = &tgl[cpu]; + struct thread_groups *tg = NULL; + int i; + *err = 0; - if (!dn) - return -ENODATA; + if (!dn) { + *err = -ENODATA; + return NULL; + } - err = parse_thread_groups(dn, &tg, THREAD_GROUP_SHARE_L1); - if (err) - goto out; + if (!cpu_tgl->nr_properties) { + *err = parse_thread_groups(dn, cpu_tgl); + if (*err) + goto out; + } - zalloc_cpumask_var_node(&per_cpu(cpu_l1_cache_map, cpu), - GFP_KERNEL, - cpu_to_node(cpu)); + for (i = 0; i < cpu_tgl->nr_properties; i++) { + if (cpu_tgl->property_tgs[i].property == group_property) { + tg = &cpu_tgl->property_tgs[i]; + break; + } + } - cpu_group_start = get_cpu_thread_group_start(cpu, &tg); + if (!tg) + *err = -EINVAL; +out: + of_node_put(dn); + return tg; +} - if (unlikely(cpu_group_start == -1)) { - WARN_ON_ONCE(1); - err = -ENODATA; - goto out; - } +static int __init update_mask_from_threadgroup(cpumask_var_t *mask, struct thread_groups *tg, + int cpu, int cpu_group_start) +{ + int first_thread = cpu_first_thread_sibling(cpu); + int i; + + zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cpu)); for (i = first_thread; i < first_thread + threads_per_core; i++) { - int i_group_start = get_cpu_thread_group_start(i, &tg); + int i_group_start = get_cpu_thread_group_start(i, tg); if (unlikely(i_group_start == -1)) { WARN_ON_ONCE(1); - err = -ENODATA; - goto out; + return -ENODATA; } if (i_group_start == cpu_group_start) - cpumask_set_cpu(i, per_cpu(cpu_l1_cache_map, cpu)); + cpumask_set_cpu(i, *mask); } -out: - of_node_put(dn); - return err; + return 0; +} + +static int __init init_thread_group_cache_map(int cpu, int cache_property) + +{ + int cpu_group_start = -1, err = 0; + struct thread_groups *tg = NULL; + cpumask_var_t *mask = NULL; + + if (cache_property != THREAD_GROUP_SHARE_L1 && + cache_property != THREAD_GROUP_SHARE_L2_L3) + return -EINVAL; + + tg = get_thread_groups(cpu, cache_property, &err); + + if (!tg) + return err; + + cpu_group_start = get_cpu_thread_group_start(cpu, tg); + + if (unlikely(cpu_group_start == -1)) { + WARN_ON_ONCE(1); + return -ENODATA; + } + + if (cache_property == THREAD_GROUP_SHARE_L1) { + mask = &per_cpu(thread_group_l1_cache_map, cpu); + update_mask_from_threadgroup(mask, tg, cpu, cpu_group_start); + } + else if (cache_property == THREAD_GROUP_SHARE_L2_L3) { + mask = &per_cpu(thread_group_l2_cache_map, cpu); + update_mask_from_threadgroup(mask, tg, cpu, cpu_group_start); + mask = &per_cpu(thread_group_l3_cache_map, cpu); + update_mask_from_threadgroup(mask, tg, cpu, cpu_group_start); + } + + + return 0; +} + +static bool shared_caches __ro_after_init; + +#ifdef CONFIG_SCHED_SMT +/* cpumask of CPUs with asymmetric SMT dependency */ +static int powerpc_smt_flags(void) +{ + int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES; + + if (cpu_has_feature(CPU_FTR_ASYM_SMT)) { + printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n"); + flags |= SD_ASYM_PACKING; + } + return flags; +} +#endif + +/* + * On shared processor LPARs scheduled on a big core (which has two or more + * independent thread groups per core), prefer lower numbered CPUs, so + * that workload consolidates to lesser number of cores. + */ +static __ro_after_init DEFINE_STATIC_KEY_FALSE(splpar_asym_pack); + +/* + * P9 has a slightly odd architecture where pairs of cores share an L2 cache. + * This topology makes it *much* cheaper to migrate tasks between adjacent cores + * since the migrated task remains cache hot. We want to take advantage of this + * at the scheduler level so an extra topology level is required. + */ +static int powerpc_shared_cache_flags(void) +{ + if (static_branch_unlikely(&splpar_asym_pack)) + return SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING; + + return SD_SHARE_PKG_RESOURCES; +} + +static int powerpc_shared_proc_flags(void) +{ + if (static_branch_unlikely(&splpar_asym_pack)) + return SD_ASYM_PACKING; + + return 0; +} + +/* + * We can't just pass cpu_l2_cache_mask() directly because + * returns a non-const pointer and the compiler barfs on that. + */ +static const struct cpumask *shared_cache_mask(int cpu) +{ + return per_cpu(cpu_l2_cache_map, cpu); +} + +#ifdef CONFIG_SCHED_SMT +static const struct cpumask *smallcore_smt_mask(int cpu) +{ + return cpu_smallcore_mask(cpu); +} +#endif + +static struct cpumask *cpu_coregroup_mask(int cpu) +{ + return per_cpu(cpu_coregroup_map, cpu); +} + +static bool has_coregroup_support(void) +{ + /* Coregroup identification not available on shared systems */ + if (is_shared_processor()) + return 0; + + return coregroup_enabled; +} + +static const struct cpumask *cpu_mc_mask(int cpu) +{ + return cpu_coregroup_mask(cpu); } -static int init_big_cores(void) +static int __init init_big_cores(void) { int cpu; for_each_possible_cpu(cpu) { - int err = init_cpu_l1_cache_map(cpu); + int err = init_thread_group_cache_map(cpu, THREAD_GROUP_SHARE_L1); if (err) return err; @@ -834,17 +1074,29 @@ static int init_big_cores(void) } has_big_cores = true; + + for_each_possible_cpu(cpu) { + int err = init_thread_group_cache_map(cpu, THREAD_GROUP_SHARE_L2_L3); + + if (err) + return err; + } + + thread_group_shares_l2 = true; + thread_group_shares_l3 = true; + pr_debug("L2/L3 cache only shared by the threads in the small core\n"); + return 0; } void __init smp_prepare_cpus(unsigned int max_cpus) { - unsigned int cpu; + unsigned int cpu, num_threads; DBG("smp_prepare_cpus\n"); /* - * setup_cpu may need to be called on the boot cpu. We havent + * setup_cpu may need to be called on the boot cpu. We haven't * spun any cpus up but lets be paranoid. */ BUG_ON(boot_cpuid != smp_processor_id()); @@ -860,6 +1112,11 @@ void __init smp_prepare_cpus(unsigned int max_cpus) GFP_KERNEL, cpu_to_node(cpu)); zalloc_cpumask_var_node(&per_cpu(cpu_core_map, cpu), GFP_KERNEL, cpu_to_node(cpu)); + if (has_coregroup_support()) + zalloc_cpumask_var_node(&per_cpu(cpu_coregroup_map, cpu), + GFP_KERNEL, cpu_to_node(cpu)); + +#ifdef CONFIG_NUMA /* * numa_node_id() works after this. */ @@ -868,6 +1125,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus) set_cpu_numa_mem(cpu, local_memory_node(numa_cpu_lookup_table[cpu])); } +#endif } /* Init the cpumasks so the boot CPU is related to itself */ @@ -875,14 +1133,37 @@ void __init smp_prepare_cpus(unsigned int max_cpus) cpumask_set_cpu(boot_cpuid, cpu_l2_cache_mask(boot_cpuid)); cpumask_set_cpu(boot_cpuid, cpu_core_mask(boot_cpuid)); + if (has_coregroup_support()) + cpumask_set_cpu(boot_cpuid, cpu_coregroup_mask(boot_cpuid)); + init_big_cores(); if (has_big_cores) { cpumask_set_cpu(boot_cpuid, cpu_smallcore_mask(boot_cpuid)); } + if (cpu_to_chip_id(boot_cpuid) != -1) { + int idx = DIV_ROUND_UP(num_possible_cpus(), threads_per_core); + + /* + * All threads of a core will all belong to the same core, + * chip_id_lookup_table will have one entry per core. + * Assumption: if boot_cpuid doesn't have a chip-id, then no + * other CPUs, will also not have chip-id. + */ + chip_id_lookup_table = kcalloc(idx, sizeof(int), GFP_KERNEL); + if (chip_id_lookup_table) + memset(chip_id_lookup_table, -1, sizeof(int) * idx); + } + if (smp_ops && smp_ops->probe) smp_ops->probe(); + + // Initalise the generic SMT topology support + num_threads = 1; + if (smt_enabled_at_boot) + num_threads = smt_enabled_at_boot; + cpu_smt_set_num_threads(num_threads, threads_per_core); } void smp_prepare_boot_cpu(void) @@ -980,15 +1261,20 @@ static void cpu_idle_thread_init(unsigned int cpu, struct task_struct *idle) #ifdef CONFIG_PPC64 paca_ptrs[cpu]->__current = idle; paca_ptrs[cpu]->kstack = (unsigned long)task_stack_page(idle) + - THREAD_SIZE - STACK_FRAME_OVERHEAD; + THREAD_SIZE - STACK_FRAME_MIN_SIZE; #endif - idle->cpu = cpu; + task_thread_info(idle)->cpu = cpu; secondary_current = current_set[cpu] = idle; } int __cpu_up(unsigned int cpu, struct task_struct *tidle) { - int rc, c; + const unsigned long boot_spin_ms = 5 * MSEC_PER_SEC; + const bool booting = system_state < SYSTEM_RUNNING; + const unsigned long hp_spin_ms = 1; + unsigned long deadline; + int rc; + const unsigned long spin_wait_ms = booting ? boot_spin_ms : hp_spin_ms; /* * Don't allow secondary threads to come online if inhibited @@ -1033,22 +1319,23 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle) } /* - * wait to see if the cpu made a callin (is actually up). - * use this value that I found through experimentation. - * -- Cort + * At boot time, simply spin on the callin word until the + * deadline passes. + * + * At run time, spin for an optimistic amount of time to avoid + * sleeping in the common case. */ - if (system_state < SYSTEM_RUNNING) - for (c = 50000; c && !cpu_callin_map[cpu]; c--) - udelay(100); -#ifdef CONFIG_HOTPLUG_CPU - else - /* - * CPUs can take much longer to come up in the - * hotplug case. Wait five seconds. - */ - for (c = 5000; c && !cpu_callin_map[cpu]; c--) - msleep(1); -#endif + deadline = jiffies + msecs_to_jiffies(spin_wait_ms); + spin_until_cond(cpu_callin_map[cpu] || time_is_before_jiffies(deadline)); + + if (!cpu_callin_map[cpu] && system_state >= SYSTEM_RUNNING) { + const unsigned long sleep_interval_us = 10 * USEC_PER_MSEC; + const unsigned long sleep_wait_ms = 100 * MSEC_PER_SEC; + + deadline = jiffies + msecs_to_jiffies(sleep_wait_ms); + while (!cpu_callin_map[cpu] && time_is_after_jiffies(deadline)) + fsleep(sleep_interval_us); + } if (!cpu_callin_map[cpu]) { printk(KERN_ERR "Processor %u is stuck.\n", cpu); @@ -1072,18 +1359,13 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle) int cpu_to_core_id(int cpu) { struct device_node *np; - const __be32 *reg; int id = -1; np = of_get_cpu_node(cpu, NULL); if (!np) goto out; - reg = of_get_property(np, "reg", NULL); - if (!reg) - goto out; - - id = be32_to_cpup(reg); + id = of_get_cpu_hwid(np, 0); out: of_node_put(np); return id; @@ -1125,26 +1407,68 @@ static struct device_node *cpu_to_l2cache(int cpu) return cache; } -static bool update_mask_by_l2(int cpu, struct cpumask *(*mask_fn)(int)) +static bool update_mask_by_l2(int cpu, cpumask_var_t *mask) { + struct cpumask *(*submask_fn)(int) = cpu_sibling_mask; struct device_node *l2_cache, *np; int i; + if (has_big_cores) + submask_fn = cpu_smallcore_mask; + + /* + * If the threads in a thread-group share L2 cache, then the + * L2-mask can be obtained from thread_group_l2_cache_map. + */ + if (thread_group_shares_l2) { + cpumask_set_cpu(cpu, cpu_l2_cache_mask(cpu)); + + for_each_cpu(i, per_cpu(thread_group_l2_cache_map, cpu)) { + if (cpu_online(i)) + set_cpus_related(i, cpu, cpu_l2_cache_mask); + } + + /* Verify that L1-cache siblings are a subset of L2 cache-siblings */ + if (!cpumask_equal(submask_fn(cpu), cpu_l2_cache_mask(cpu)) && + !cpumask_subset(submask_fn(cpu), cpu_l2_cache_mask(cpu))) { + pr_warn_once("CPU %d : Inconsistent L1 and L2 cache siblings\n", + cpu); + } + + return true; + } + l2_cache = cpu_to_l2cache(cpu); - if (!l2_cache) + if (!l2_cache || !*mask) { + /* Assume only core siblings share cache with this CPU */ + for_each_cpu(i, cpu_sibling_mask(cpu)) + set_cpus_related(cpu, i, cpu_l2_cache_mask); + return false; + } + + cpumask_and(*mask, cpu_online_mask, cpu_cpu_mask(cpu)); - for_each_cpu(i, cpu_online_mask) { + /* Update l2-cache mask with all the CPUs that are part of submask */ + or_cpumasks_related(cpu, cpu, submask_fn, cpu_l2_cache_mask); + + /* Skip all CPUs already part of current CPU l2-cache mask */ + cpumask_andnot(*mask, *mask, cpu_l2_cache_mask(cpu)); + + for_each_cpu(i, *mask) { /* * when updating the marks the current CPU has not been marked * online, but we need to update the cache masks */ np = cpu_to_l2cache(i); - if (!np) - continue; - if (np == l2_cache) - set_cpus_related(cpu, i, mask_fn); + /* Skip all CPUs already part of current CPU l2-cache */ + if (np == l2_cache) { + or_cpumasks_related(cpu, i, submask_fn, cpu_l2_cache_mask); + cpumask_andnot(*mask, *mask, submask_fn(i)); + } else { + cpumask_andnot(*mask, *mask, cpu_l2_cache_mask(i)); + } of_node_put(np); } @@ -1156,89 +1480,159 @@ static bool update_mask_by_l2(int cpu, struct cpumask *(*mask_fn)(int)) #ifdef CONFIG_HOTPLUG_CPU static void remove_cpu_from_masks(int cpu) { + struct cpumask *(*mask_fn)(int) = cpu_sibling_mask; int i; - /* NB: cpu_core_mask is a superset of the others */ - for_each_cpu(i, cpu_core_mask(cpu)) { - set_cpus_unrelated(cpu, i, cpu_core_mask); + unmap_cpu_from_node(cpu); + + if (shared_caches) + mask_fn = cpu_l2_cache_mask; + + for_each_cpu(i, mask_fn(cpu)) { set_cpus_unrelated(cpu, i, cpu_l2_cache_mask); set_cpus_unrelated(cpu, i, cpu_sibling_mask); if (has_big_cores) set_cpus_unrelated(cpu, i, cpu_smallcore_mask); } + + for_each_cpu(i, cpu_core_mask(cpu)) + set_cpus_unrelated(cpu, i, cpu_core_mask); + + if (has_coregroup_support()) { + for_each_cpu(i, cpu_coregroup_mask(cpu)) + set_cpus_unrelated(cpu, i, cpu_coregroup_mask); + } } #endif static inline void add_cpu_to_smallcore_masks(int cpu) { - struct cpumask *this_l1_cache_map = per_cpu(cpu_l1_cache_map, cpu); - int i, first_thread = cpu_first_thread_sibling(cpu); + int i; if (!has_big_cores) return; cpumask_set_cpu(cpu, cpu_smallcore_mask(cpu)); - for (i = first_thread; i < first_thread + threads_per_core; i++) { - if (cpu_online(i) && cpumask_test_cpu(i, this_l1_cache_map)) + for_each_cpu(i, per_cpu(thread_group_l1_cache_map, cpu)) { + if (cpu_online(i)) set_cpus_related(i, cpu, cpu_smallcore_mask); } } +static void update_coregroup_mask(int cpu, cpumask_var_t *mask) +{ + struct cpumask *(*submask_fn)(int) = cpu_sibling_mask; + int coregroup_id = cpu_to_coregroup_id(cpu); + int i; + + if (shared_caches) + submask_fn = cpu_l2_cache_mask; + + if (!*mask) { + /* Assume only siblings are part of this CPU's coregroup */ + for_each_cpu(i, submask_fn(cpu)) + set_cpus_related(cpu, i, cpu_coregroup_mask); + + return; + } + + cpumask_and(*mask, cpu_online_mask, cpu_cpu_mask(cpu)); + + /* Update coregroup mask with all the CPUs that are part of submask */ + or_cpumasks_related(cpu, cpu, submask_fn, cpu_coregroup_mask); + + /* Skip all CPUs already part of coregroup mask */ + cpumask_andnot(*mask, *mask, cpu_coregroup_mask(cpu)); + + for_each_cpu(i, *mask) { + /* Skip all CPUs not part of this coregroup */ + if (coregroup_id == cpu_to_coregroup_id(i)) { + or_cpumasks_related(cpu, i, submask_fn, cpu_coregroup_mask); + cpumask_andnot(*mask, *mask, submask_fn(i)); + } else { + cpumask_andnot(*mask, *mask, cpu_coregroup_mask(i)); + } + } +} + static void add_cpu_to_masks(int cpu) { + struct cpumask *(*submask_fn)(int) = cpu_sibling_mask; int first_thread = cpu_first_thread_sibling(cpu); - int chipid = cpu_to_chip_id(cpu); + cpumask_var_t mask; + int chip_id = -1; + bool ret; int i; /* * This CPU will not be in the online mask yet so we need to manually * add it to it's own thread sibling mask. */ + map_cpu_to_node(cpu, cpu_to_node(cpu)); cpumask_set_cpu(cpu, cpu_sibling_mask(cpu)); + cpumask_set_cpu(cpu, cpu_core_mask(cpu)); for (i = first_thread; i < first_thread + threads_per_core; i++) if (cpu_online(i)) set_cpus_related(i, cpu, cpu_sibling_mask); add_cpu_to_smallcore_masks(cpu); - /* - * Copy the thread sibling mask into the cache sibling mask - * and mark any CPUs that share an L2 with this CPU. - */ - for_each_cpu(i, cpu_sibling_mask(cpu)) - set_cpus_related(cpu, i, cpu_l2_cache_mask); - update_mask_by_l2(cpu, cpu_l2_cache_mask); - /* - * Copy the cache sibling mask into core sibling mask and mark - * any CPUs on the same chip as this CPU. - */ - for_each_cpu(i, cpu_l2_cache_mask(cpu)) - set_cpus_related(cpu, i, cpu_core_mask); + /* In CPU-hotplug path, hence use GFP_ATOMIC */ + ret = alloc_cpumask_var_node(&mask, GFP_ATOMIC, cpu_to_node(cpu)); + update_mask_by_l2(cpu, &mask); - if (chipid == -1) - return; + if (has_coregroup_support()) + update_coregroup_mask(cpu, &mask); - for_each_cpu(i, cpu_online_mask) - if (cpu_to_chip_id(i) == chipid) - set_cpus_related(cpu, i, cpu_core_mask); -} + if (chip_id_lookup_table && ret) + chip_id = cpu_to_chip_id(cpu); + + if (shared_caches) + submask_fn = cpu_l2_cache_mask; + + /* Update core_mask with all the CPUs that are part of submask */ + or_cpumasks_related(cpu, cpu, submask_fn, cpu_core_mask); -static bool shared_caches; + /* Skip all CPUs already part of current CPU core mask */ + cpumask_andnot(mask, cpu_online_mask, cpu_core_mask(cpu)); + + /* If chip_id is -1; limit the cpu_core_mask to within PKG */ + if (chip_id == -1) + cpumask_and(mask, mask, cpu_cpu_mask(cpu)); + + for_each_cpu(i, mask) { + if (chip_id == cpu_to_chip_id(i)) { + or_cpumasks_related(cpu, i, submask_fn, cpu_core_mask); + cpumask_andnot(mask, mask, submask_fn(i)); + } else { + cpumask_andnot(mask, mask, cpu_core_mask(i)); + } + } + + free_cpumask_var(mask); +} /* Activate a secondary processor. */ +__no_stack_protector void start_secondary(void *unused) { - unsigned int cpu = smp_processor_id(); - struct cpumask *(*sibling_mask)(int) = cpu_sibling_mask; + unsigned int cpu = raw_smp_processor_id(); + + /* PPC64 calls setup_kup() in early_setup_secondary() */ + if (IS_ENABLED(CONFIG_PPC32)) + setup_kup(); - mmgrab(&init_mm); + mmgrab_lazy_tlb(&init_mm); current->active_mm = &init_mm; + VM_WARN_ON(cpumask_test_cpu(smp_processor_id(), mm_cpumask(&init_mm))); + cpumask_set_cpu(cpu, mm_cpumask(&init_mm)); + inc_mm_active_cpus(&init_mm); smp_store_cpu_info(cpu); set_dec(tb_ticks_per_jiffy); - preempt_disable(); + rcutree_report_cpu_starting(cpu); cpu_callin_map[cpu] = 1; if (smp_ops->setup_cpu) @@ -1254,20 +1648,26 @@ void start_secondary(void *unused) vdso_getcpu_init(); #endif + set_numa_node(numa_cpu_lookup_table[cpu]); + set_numa_mem(local_memory_node(numa_cpu_lookup_table[cpu])); + /* Update topology CPU masks */ add_cpu_to_masks(cpu); - if (has_big_cores) - sibling_mask = cpu_smallcore_mask; /* * Check for any shared caches. Note that this must be done on a * per-core basis because one core in the pair might be disabled. */ - if (!cpumask_equal(cpu_l2_cache_mask(cpu), sibling_mask(cpu))) - shared_caches = true; + if (!shared_caches) { + struct cpumask *(*sibling_mask)(int) = cpu_sibling_mask; + struct cpumask *mask = cpu_l2_cache_mask(cpu); - set_numa_node(numa_cpu_lookup_table[cpu]); - set_numa_mem(local_memory_node(numa_cpu_lookup_table[cpu])); + if (has_big_cores) + sibling_mask = cpu_smallcore_mask; + + if (cpumask_weight(mask) > cpumask_weight(sibling_mask(cpu))) + shared_caches = true; + } smp_wmb(); notify_cpu_starting(cpu); @@ -1285,68 +1685,46 @@ void start_secondary(void *unused) BUG(); } -int setup_profiling_timer(unsigned int multiplier) -{ - return 0; -} +static struct sched_domain_topology_level powerpc_topology[6]; -#ifdef CONFIG_SCHED_SMT -/* cpumask of CPUs with asymetric SMT dependancy */ -static int powerpc_smt_flags(void) +static void __init build_sched_topology(void) { - int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES; + int i = 0; - if (cpu_has_feature(CPU_FTR_ASYM_SMT)) { - printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n"); - flags |= SD_ASYM_PACKING; - } - return flags; -} -#endif + if (is_shared_processor() && has_big_cores) + static_branch_enable(&splpar_asym_pack); -static struct sched_domain_topology_level powerpc_topology[] = { #ifdef CONFIG_SCHED_SMT - { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) }, + if (has_big_cores) { + pr_info("Big cores detected but using small core scheduling\n"); + powerpc_topology[i++] = (struct sched_domain_topology_level){ + smallcore_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) + }; + } else { + powerpc_topology[i++] = (struct sched_domain_topology_level){ + cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) + }; + } #endif - { cpu_cpu_mask, SD_INIT_NAME(DIE) }, - { NULL, }, -}; - -/* - * P9 has a slightly odd architecture where pairs of cores share an L2 cache. - * This topology makes it *much* cheaper to migrate tasks between adjacent cores - * since the migrated task remains cache hot. We want to take advantage of this - * at the scheduler level so an extra topology level is required. - */ -static int powerpc_shared_cache_flags(void) -{ - return SD_SHARE_PKG_RESOURCES; -} + if (shared_caches) { + powerpc_topology[i++] = (struct sched_domain_topology_level){ + shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) + }; + } + if (has_coregroup_support()) { + powerpc_topology[i++] = (struct sched_domain_topology_level){ + cpu_mc_mask, powerpc_shared_proc_flags, SD_INIT_NAME(MC) + }; + } + powerpc_topology[i++] = (struct sched_domain_topology_level){ + cpu_cpu_mask, powerpc_shared_proc_flags, SD_INIT_NAME(PKG) + }; -/* - * We can't just pass cpu_l2_cache_mask() directly because - * returns a non-const pointer and the compiler barfs on that. - */ -static const struct cpumask *shared_cache_mask(int cpu) -{ - return cpu_l2_cache_mask(cpu); -} + /* There must be one trailing NULL entry left. */ + BUG_ON(i >= ARRAY_SIZE(powerpc_topology) - 1); -#ifdef CONFIG_SCHED_SMT -static const struct cpumask *smallcore_smt_mask(int cpu) -{ - return cpu_smallcore_mask(cpu); + set_sched_topology(powerpc_topology); } -#endif - -static struct sched_domain_topology_level power9_topology[] = { -#ifdef CONFIG_SCHED_SMT - { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) }, -#endif - { shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) }, - { cpu_cpu_mask, SD_INIT_NAME(DIE) }, - { NULL, }, -}; void __init smp_cpus_done(unsigned int max_cpus) { @@ -1359,31 +1737,21 @@ void __init smp_cpus_done(unsigned int max_cpus) if (smp_ops && smp_ops->bringup_done) smp_ops->bringup_done(); - /* - * On a shared LPAR, associativity needs to be requested. - * Hence, get numa topology before dumping cpu topology - */ - shared_proc_topology_init(); dump_numa_cpu_topology(); + build_sched_topology(); +} -#ifdef CONFIG_SCHED_SMT - if (has_big_cores) { - pr_info("Using small cores at SMT level\n"); - power9_topology[0].mask = smallcore_smt_mask; - powerpc_topology[0].mask = smallcore_smt_mask; - } -#endif - /* - * If any CPU detects that it's sharing a cache with another CPU then - * use the deeper topology that is aware of this sharing. - */ - if (shared_caches) { - pr_info("Using shared cache scheduler topology\n"); - set_sched_topology(power9_topology); - } else { - pr_info("Using standard scheduler topology\n"); - set_sched_topology(powerpc_topology); - } +/* + * For asym packing, by default lower numbered CPU has higher priority. + * On shared processors, pack to lower numbered core. However avoid moving + * between thread_groups within the same core. + */ +int arch_asym_cpu_priority(int cpu) +{ + if (static_branch_unlikely(&splpar_asym_pack)) + return -cpu / threads_per_core; + + return -cpu; } #ifdef CONFIG_HOTPLUG_CPU @@ -1409,11 +1777,19 @@ int __cpu_disable(void) void __cpu_die(unsigned int cpu) { + /* + * This could perhaps be a generic call in idlea_task_dead(), but + * that requires testing from all archs, so first put it here to + */ + VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(&init_mm))); + dec_mm_active_cpus(&init_mm); + cpumask_clear_cpu(cpu, mm_cpumask(&init_mm)); + if (smp_ops->cpu_die) smp_ops->cpu_die(cpu); } -void cpu_die(void) +void __noreturn arch_cpu_idle_dead(void) { /* * Disable on the down path. This will be re-enabled by @@ -1421,8 +1797,8 @@ void cpu_die(void) */ this_cpu_disable_ftrace(); - if (ppc_md.cpu_die) - ppc_md.cpu_die(); + if (smp_ops->cpu_offline_self) + smp_ops->cpu_offline_self(); /* If we return, we re-enter start_secondary */ start_secondary_resume(); diff --git a/arch/powerpc/kernel/stacktrace.c b/arch/powerpc/kernel/stacktrace.c index e2a46cfed5fd..e6a958a5da27 100644 --- a/arch/powerpc/kernel/stacktrace.c +++ b/arch/powerpc/kernel/stacktrace.c @@ -8,6 +8,7 @@ * Copyright 2018 Nick Piggin, Michael Ellerman, IBM Corp. */ +#include <linux/delay.h> #include <linux/export.h> #include <linux/kallsyms.h> #include <linux/module.h> @@ -23,117 +24,66 @@ #include <asm/paca.h> -/* - * Save stack-backtrace addresses into a stack_trace buffer. - */ -static void save_context_stack(struct stack_trace *trace, unsigned long sp, - struct task_struct *tsk, int savesched) +void __no_sanitize_address arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie, + struct task_struct *task, struct pt_regs *regs) { + unsigned long sp; + + if (regs && !consume_entry(cookie, regs->nip)) + return; + + if (regs) + sp = regs->gpr[1]; + else if (task == current) + sp = current_stack_frame(); + else + sp = task->thread.ksp; + for (;;) { unsigned long *stack = (unsigned long *) sp; unsigned long newsp, ip; - if (!validate_sp(sp, tsk, STACK_FRAME_OVERHEAD)) + if (!validate_sp(sp, task)) return; newsp = stack[0]; ip = stack[STACK_FRAME_LR_SAVE]; - if (savesched || !in_sched_functions(ip)) { - if (!trace->skip) - trace->entries[trace->nr_entries++] = ip; - else - trace->skip--; - } - - if (trace->nr_entries >= trace->max_entries) + if (!consume_entry(cookie, ip)) return; sp = newsp; } } -void save_stack_trace(struct stack_trace *trace) -{ - unsigned long sp; - - sp = current_stack_pointer(); - - save_context_stack(trace, sp, current, 1); -} -EXPORT_SYMBOL_GPL(save_stack_trace); - -void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) -{ - unsigned long sp; - - if (!try_get_task_stack(tsk)) - return; - - if (tsk == current) - sp = current_stack_pointer(); - else - sp = tsk->thread.ksp; - - save_context_stack(trace, sp, tsk, 0); - - put_task_stack(tsk); -} -EXPORT_SYMBOL_GPL(save_stack_trace_tsk); - -void -save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) -{ - save_context_stack(trace, regs->gpr[1], current, 0); -} -EXPORT_SYMBOL_GPL(save_stack_trace_regs); - -#ifdef CONFIG_HAVE_RELIABLE_STACKTRACE /* * This function returns an error if it detects any unreliable features of the * stack. Otherwise it guarantees that the stack trace is reliable. * * If the task is not 'current', the caller *must* ensure the task is inactive. */ -static int __save_stack_trace_tsk_reliable(struct task_struct *tsk, - struct stack_trace *trace) +int __no_sanitize_address arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, + void *cookie, struct task_struct *task) { unsigned long sp; unsigned long newsp; - unsigned long stack_page = (unsigned long)task_stack_page(tsk); + unsigned long stack_page = (unsigned long)task_stack_page(task); unsigned long stack_end; int graph_idx = 0; bool firstframe; stack_end = stack_page + THREAD_SIZE; - if (!is_idle_task(tsk)) { - /* - * For user tasks, this is the SP value loaded on - * kernel entry, see "PACAKSAVE(r13)" in _switch() and - * system_call_common()/EXCEPTION_PROLOG_COMMON(). - * - * Likewise for non-swapper kernel threads, - * this also happens to be the top of the stack - * as setup by copy_thread(). - * - * Note that stack backlinks are not properly setup by - * copy_thread() and thus, a forked task() will have - * an unreliable stack trace until it's been - * _switch()'ed to for the first time. - */ - stack_end -= STACK_FRAME_OVERHEAD + sizeof(struct pt_regs); - } else { - /* - * idle tasks have a custom stack layout, - * c.f. cpu_idle_thread_init(). - */ - stack_end -= STACK_FRAME_OVERHEAD; - } - if (tsk == current) - sp = current_stack_pointer(); + // See copy_thread() for details. + if (task->flags & PF_KTHREAD) + stack_end -= STACK_FRAME_MIN_SIZE; else - sp = tsk->thread.ksp; + stack_end -= STACK_USER_INT_FRAME_SIZE; + + if (task == current) + sp = current_stack_frame(); + else + sp = task->thread.ksp; if (sp < stack_page + sizeof(struct thread_struct) || sp > stack_end - STACK_FRAME_MIN_SIZE) { @@ -169,7 +119,7 @@ static int __save_stack_trace_tsk_reliable(struct task_struct *tsk, /* Mark stacktraces with exception frames as unreliable. */ if (sp <= stack_end - STACK_INT_FRAME_SIZE && - stack[STACK_FRAME_MARKER] == STACK_FRAME_REGS_MARKER) { + stack[STACK_INT_FRAME_MARKER_LONGS] == STACK_FRAME_REGS_MARKER) { return -EINVAL; } @@ -182,46 +132,22 @@ static int __save_stack_trace_tsk_reliable(struct task_struct *tsk, * FIXME: IMHO these tests do not belong in * arch-dependent code, they are generic. */ - ip = ftrace_graph_ret_addr(tsk, &graph_idx, ip, stack); + ip = ftrace_graph_ret_addr(task, &graph_idx, ip, stack); #ifdef CONFIG_KPROBES /* * Mark stacktraces with kretprobed functions on them * as unreliable. */ - if (ip == (unsigned long)kretprobe_trampoline) + if (ip == (unsigned long)__kretprobe_trampoline) return -EINVAL; #endif - if (trace->nr_entries >= trace->max_entries) - return -E2BIG; - if (!trace->skip) - trace->entries[trace->nr_entries++] = ip; - else - trace->skip--; + if (!consume_entry(cookie, ip)) + return -EINVAL; } return 0; } -int save_stack_trace_tsk_reliable(struct task_struct *tsk, - struct stack_trace *trace) -{ - int ret; - - /* - * If the task doesn't have a stack (e.g., a zombie), the stack is - * "reliably" empty. - */ - if (!try_get_task_stack(tsk)) - return 0; - - ret = __save_stack_trace_tsk_reliable(tsk, trace); - - put_task_stack(tsk); - - return ret; -} -#endif /* CONFIG_HAVE_RELIABLE_STACKTRACE */ - #if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_NMI_IPI) static void handle_backtrace_ipi(struct pt_regs *regs) { @@ -230,17 +156,31 @@ static void handle_backtrace_ipi(struct pt_regs *regs) static void raise_backtrace_ipi(cpumask_t *mask) { + struct paca_struct *p; unsigned int cpu; + u64 delay_us; for_each_cpu(cpu, mask) { - if (cpu == smp_processor_id()) + if (cpu == smp_processor_id()) { handle_backtrace_ipi(NULL); - else - smp_send_safe_nmi_ipi(cpu, handle_backtrace_ipi, 5 * USEC_PER_SEC); - } + continue; + } - for_each_cpu(cpu, mask) { - struct paca_struct *p = paca_ptrs[cpu]; + delay_us = 5 * USEC_PER_SEC; + + if (smp_send_safe_nmi_ipi(cpu, handle_backtrace_ipi, delay_us)) { + // Now wait up to 5s for the other CPU to do its backtrace + while (cpumask_test_cpu(cpu, mask) && delay_us) { + udelay(1); + delay_us--; + } + + // Other CPU cleared itself from the mask + if (delay_us) + continue; + } + + p = paca_ptrs[cpu]; cpumask_clear_cpu(cpu, mask); @@ -260,12 +200,12 @@ static void raise_backtrace_ipi(cpumask_t *mask) pr_cont(" current pointer corrupt? (%px)\n", p->__current); pr_warn("Back trace of paca->saved_r1 (0x%016llx) (possibly stale):\n", p->saved_r1); - show_stack(p->__current, (unsigned long *)p->saved_r1); + show_stack(p->__current, (unsigned long *)p->saved_r1, KERN_WARNING); } } -void arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self) +void arch_trigger_cpumask_backtrace(const cpumask_t *mask, int exclude_cpu) { - nmi_trigger_cpumask_backtrace(mask, exclude_self, raise_backtrace_ipi); + nmi_trigger_cpumask_backtrace(mask, exclude_cpu, raise_backtrace_ipi); } #endif /* defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_NMI_IPI) */ diff --git a/arch/powerpc/kernel/static_call.c b/arch/powerpc/kernel/static_call.c new file mode 100644 index 000000000000..863a7aa24650 --- /dev/null +++ b/arch/powerpc/kernel/static_call.c @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/memory.h> +#include <linux/static_call.h> + +#include <asm/code-patching.h> + +void arch_static_call_transform(void *site, void *tramp, void *func, bool tail) +{ + int err; + bool is_ret0 = (func == __static_call_return0); + unsigned long target = (unsigned long)(is_ret0 ? tramp + PPC_SCT_RET0 : func); + bool is_short = is_offset_in_branch_range((long)target - (long)tramp); + + if (!tramp) + return; + + mutex_lock(&text_mutex); + + if (func && !is_short) { + err = patch_instruction(tramp + PPC_SCT_DATA, ppc_inst(target)); + if (err) + goto out; + } + + if (!func) + err = patch_instruction(tramp, ppc_inst(PPC_RAW_BLR())); + else if (is_short) + err = patch_branch(tramp, target, 0); + else + err = patch_instruction(tramp, ppc_inst(PPC_RAW_NOP())); +out: + mutex_unlock(&text_mutex); + + if (err) + panic("%s: patching failed %pS at %pS\n", __func__, func, tramp); +} +EXPORT_SYMBOL_GPL(arch_static_call_transform); diff --git a/arch/powerpc/kernel/switch.S b/arch/powerpc/kernel/switch.S new file mode 100644 index 000000000000..608c0ce7cec6 --- /dev/null +++ b/arch/powerpc/kernel/switch.S @@ -0,0 +1,258 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#include <linux/objtool.h> +#include <asm/asm-offsets.h> +#include <asm/code-patching-asm.h> +#include <asm/mmu.h> +#include <asm/ppc_asm.h> +#include <asm/kup.h> +#include <asm/thread_info.h> + +.section ".text","ax",@progbits + +#ifdef CONFIG_PPC_BOOK3S_64 +/* + * Cancel all explict user streams as they will have no use after context + * switch and will stop the HW from creating streams itself + */ +#define STOP_STREAMS \ + DCBT_BOOK3S_STOP_ALL_STREAM_IDS(r6) + +#define FLUSH_COUNT_CACHE \ +1: nop; \ + patch_site 1b, patch__call_flush_branch_caches1; \ +1: nop; \ + patch_site 1b, patch__call_flush_branch_caches2; \ +1: nop; \ + patch_site 1b, patch__call_flush_branch_caches3 + +.macro nops number + .rept \number + nop + .endr +.endm + +.balign 32 +.global flush_branch_caches +flush_branch_caches: + /* Save LR into r9 */ + mflr r9 + + // Flush the link stack + .rept 64 + ANNOTATE_INTRA_FUNCTION_CALL + bl .+4 + .endr + b 1f + nops 6 + + .balign 32 + /* Restore LR */ +1: mtlr r9 + + // If we're just flushing the link stack, return here +3: nop + patch_site 3b patch__flush_link_stack_return + + li r9,0x7fff + mtctr r9 + + PPC_BCCTR_FLUSH + +2: nop + patch_site 2b patch__flush_count_cache_return + + nops 3 + + .rept 278 + .balign 32 + PPC_BCCTR_FLUSH + nops 7 + .endr + + blr + +#ifdef CONFIG_PPC_64S_HASH_MMU +.balign 32 +/* + * New stack pointer in r8, old stack pointer in r1, must not clobber r3 + */ +pin_stack_slb: +BEGIN_FTR_SECTION + clrrdi r6,r8,28 /* get its ESID */ + clrrdi r9,r1,28 /* get current sp ESID */ +FTR_SECTION_ELSE + clrrdi r6,r8,40 /* get its 1T ESID */ + clrrdi r9,r1,40 /* get current sp 1T ESID */ +ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_1T_SEGMENT) + clrldi. r0,r6,2 /* is new ESID c00000000? */ + cmpd cr1,r6,r9 /* or is new ESID the same as current ESID? */ + cror eq,4*cr1+eq,eq + beq 2f /* if yes, don't slbie it */ + + /* Bolt in the new stack SLB entry */ + ld r7,KSP_VSID(r4) /* Get new stack's VSID */ + oris r0,r6,(SLB_ESID_V)@h + ori r0,r0,(SLB_NUM_BOLTED-1)@l +BEGIN_FTR_SECTION + li r9,MMU_SEGSIZE_1T /* insert B field */ + oris r6,r6,(MMU_SEGSIZE_1T << SLBIE_SSIZE_SHIFT)@h + rldimi r7,r9,SLB_VSID_SSIZE_SHIFT,0 +END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) + + /* Update the last bolted SLB. No write barriers are needed + * here, provided we only update the current CPU's SLB shadow + * buffer. + */ + ld r9,PACA_SLBSHADOWPTR(r13) + li r12,0 + std r12,SLBSHADOW_STACKESID(r9) /* Clear ESID */ + li r12,SLBSHADOW_STACKVSID + STDX_BE r7,r12,r9 /* Save VSID */ + li r12,SLBSHADOW_STACKESID + STDX_BE r0,r12,r9 /* Save ESID */ + + /* No need to check for MMU_FTR_NO_SLBIE_B here, since when + * we have 1TB segments, the only CPUs known to have the errata + * only support less than 1TB of system memory and we'll never + * actually hit this code path. + */ + + isync + slbie r6 +BEGIN_FTR_SECTION + slbie r6 /* Workaround POWER5 < DD2.1 issue */ +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) + slbmte r7,r0 + isync +2: blr + .size pin_stack_slb,.-pin_stack_slb +#endif /* CONFIG_PPC_64S_HASH_MMU */ + +#else +#define STOP_STREAMS +#define FLUSH_COUNT_CACHE +#endif /* CONFIG_PPC_BOOK3S_64 */ + +/* + * do_switch_32/64 have the same calling convention as _switch, i.e., r3,r4 + * are prev and next thread_struct *, and returns prev task_struct * in r3. + + * This switches the stack, current, and does other task switch housekeeping. + */ +.macro do_switch_32 + tophys(r0,r4) + mtspr SPRN_SPRG_THREAD,r0 /* Update current THREAD phys addr */ + lwz r1,KSP(r4) /* Load new stack pointer */ + + /* save the old current 'last' for return value */ + mr r3,r2 + addi r2,r4,-THREAD /* Update current */ +.endm + +.macro do_switch_64 + ld r8,KSP(r4) /* Load new stack pointer */ + + kuap_check_amr r9, r10 + + FLUSH_COUNT_CACHE /* Clobbers r9, ctr */ + + STOP_STREAMS /* Clobbers r6 */ + + addi r3,r3,-THREAD /* old thread -> task_struct for return value */ + addi r6,r4,-THREAD /* new thread -> task_struct */ + std r6,PACACURRENT(r13) /* Set new task_struct to 'current' */ +#if defined(CONFIG_STACKPROTECTOR) + ld r6, TASK_CANARY(r6) + std r6, PACA_CANARY(r13) +#endif + /* Set new PACAKSAVE */ + clrrdi r7,r8,THREAD_SHIFT /* base of new stack */ + addi r7,r7,THREAD_SIZE-SWITCH_FRAME_SIZE + std r7,PACAKSAVE(r13) + +#ifdef CONFIG_PPC_64S_HASH_MMU +BEGIN_MMU_FTR_SECTION + bl pin_stack_slb +END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) +#endif + /* + * PMU interrupts in radix may come in here. They will use r1, not + * PACAKSAVE, so this stack switch will not cause a problem. They + * will store to the process stack, which may then be migrated to + * another CPU. However the rq lock release on this CPU paired with + * the rq lock acquire on the new CPU before the stack becomes + * active on the new CPU, will order those stores. + */ + mr r1,r8 /* start using new stack pointer */ +.endm + +/* + * This routine switches between two different tasks. The process + * state of one is saved on its kernel stack. Then the state + * of the other is restored from its kernel stack. The memory + * management hardware is updated to the second process's state. + * Finally, we can return to the second process. + * On entry, r3 points to the THREAD for the current task, r4 + * points to the THREAD for the new task. + * + * This routine is always called with interrupts disabled. + * + * Note: there are two ways to get to the "going out" portion + * of this code; either by coming in via the entry (_switch) + * or via "fork" which must set up an environment equivalent + * to the "_switch" path. If you change this , you'll have to + * change the fork code also. + * + * The code which creates the new task context is in 'copy_thread' + * in arch/ppc/kernel/process.c + * + * Note: this uses SWITCH_FRAME_SIZE rather than USER_INT_FRAME_SIZE + * because we don't need to leave the redzone ABI gap at the top of + * the kernel stack. + */ +_GLOBAL(_switch) + PPC_CREATE_STACK_FRAME(SWITCH_FRAME_SIZE) + PPC_STL r1,KSP(r3) /* Set old stack pointer */ + SAVE_NVGPRS(r1) /* volatiles are caller-saved -- Cort */ + PPC_STL r0,_NIP(r1) /* Return to switch caller */ + mfcr r0 + stw r0,_CCR(r1) + + /* + * On SMP kernels, care must be taken because a task may be + * scheduled off CPUx and on to CPUy. Memory ordering must be + * considered. + * + * Cacheable stores on CPUx will be visible when the task is + * scheduled on CPUy by virtue of the core scheduler barriers + * (see "Notes on Program-Order guarantees on SMP systems." in + * kernel/sched/core.c). + * + * Uncacheable stores in the case of involuntary preemption must + * be taken care of. The smp_mb__after_spinlock() in __schedule() + * is implemented as hwsync on powerpc, which orders MMIO too. So + * long as there is an hwsync in the context switch path, it will + * be executed on the source CPU after the task has performed + * all MMIO ops on that CPU, and on the destination CPU before the + * task performs any MMIO ops there. + */ + + /* + * The kernel context switch path must contain a spin_lock, + * which contains larx/stcx, which will clear any reservation + * of the task being switched. + */ + +#ifdef CONFIG_PPC32 + do_switch_32 +#else + do_switch_64 +#endif + + lwz r0,_CCR(r1) + mtcrf 0xFF,r0 + REST_NVGPRS(r1) /* volatiles are destroyed -- Cort */ + PPC_LL r0,_NIP(r1) /* Return to _switch caller in new task */ + mtlr r0 + addi r1,r1,SWITCH_FRAME_SIZE + blr diff --git a/arch/powerpc/kernel/swsusp_32.S b/arch/powerpc/kernel/swsusp_32.S index cbdf86228eaa..ffb79326483c 100644 --- a/arch/powerpc/kernel/swsusp_32.S +++ b/arch/powerpc/kernel/swsusp_32.S @@ -1,5 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include <linux/threads.h> +#include <linux/linkage.h> + #include <asm/processor.h> #include <asm/page.h> #include <asm/cputable.h> @@ -181,7 +183,7 @@ _GLOBAL(swsusp_arch_resume) #ifdef CONFIG_ALTIVEC /* Stop pending alitvec streams and memory accesses */ BEGIN_FTR_SECTION - DSSALL + PPC_DSSALL END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) #endif sync @@ -395,15 +397,18 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_HIGH_BATS) li r3,0 blr +_ASM_NOKPROBE_SYMBOL(swsusp_arch_resume) /* FIXME:This construct is actually not useful since we don't shut * down the instruction MMU, we could just flip back MSR-DR on. */ -turn_on_mmu: +SYM_FUNC_START_LOCAL(turn_on_mmu) mflr r4 mtsrr0 r4 mtsrr1 r3 sync isync rfi +_ASM_NOKPROBE_SYMBOL(turn_on_mmu) +SYM_FUNC_END(turn_on_mmu) diff --git a/arch/powerpc/kernel/swsusp_64.c b/arch/powerpc/kernel/swsusp_64.c index aeea97ad85cf..50fa8fc9ef95 100644 --- a/arch/powerpc/kernel/swsusp_64.c +++ b/arch/powerpc/kernel/swsusp_64.c @@ -11,14 +11,11 @@ #include <linux/interrupt.h> #include <linux/nmi.h> +void do_after_copyback(void); + void do_after_copyback(void) { iommu_restore(); touch_softlockup_watchdog(); mb(); } - -void _iommu_save(void) -{ - iommu_save(); -} diff --git a/arch/powerpc/kernel/swsusp_booke.S b/arch/powerpc/kernel/swsusp_85xx.S index 88cfdbd530f1..88cfdbd530f1 100644 --- a/arch/powerpc/kernel/swsusp_booke.S +++ b/arch/powerpc/kernel/swsusp_85xx.S diff --git a/arch/powerpc/kernel/swsusp_asm64.S b/arch/powerpc/kernel/swsusp_asm64.S index 6d3189830dd3..f645652c2654 100644 --- a/arch/powerpc/kernel/swsusp_asm64.S +++ b/arch/powerpc/kernel/swsusp_asm64.S @@ -76,16 +76,10 @@ swsusp_save_area: .space SL_SIZE - .section ".toc","aw" -swsusp_save_area_ptr: - .tc swsusp_save_area[TC],swsusp_save_area -restore_pblist_ptr: - .tc restore_pblist[TC],restore_pblist - .section .text .align 5 _GLOBAL(swsusp_arch_suspend) - ld r11,swsusp_save_area_ptr@toc(r2) + LOAD_REG_ADDR(r11, swsusp_save_area) SAVE_SPECIAL(LR) SAVE_REGISTER(r1) SAVE_SPECIAL(CR) @@ -128,11 +122,10 @@ END_FW_FTR_SECTION_IFCLR(FW_FEATURE_LPAR) * stack pointer on the stack like a real stackframe */ addi r1,r1,-128 - bl _iommu_save bl swsusp_save /* restore LR */ - ld r11,swsusp_save_area_ptr@toc(r2) + LOAD_REG_ADDR(r11, swsusp_save_area) RESTORE_SPECIAL(LR) addi r1,r1,128 @@ -142,11 +135,11 @@ END_FW_FTR_SECTION_IFCLR(FW_FEATURE_LPAR) _GLOBAL(swsusp_arch_resume) /* Stop pending alitvec streams and memory accesses */ BEGIN_FTR_SECTION - DSSALL + PPC_DSSALL END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) sync - ld r12,restore_pblist_ptr@toc(r2) + LOAD_REG_ADDR(r11, restore_pblist) ld r12,0(r12) cmpdi r12,0 @@ -188,7 +181,7 @@ nothing_to_copy: tlbia #endif - ld r11,swsusp_save_area_ptr@toc(r2) + LOAD_REG_ADDR(r11, swsusp_save_area) RESTORE_SPECIAL(CR) @@ -266,7 +259,7 @@ END_FW_FTR_SECTION_IFCLR(FW_FEATURE_LPAR) bl do_after_copyback addi r1,r1,128 - ld r11,swsusp_save_area_ptr@toc(r2) + LOAD_REG_ADDR(r11, swsusp_save_area) RESTORE_SPECIAL(LR) li r3, 0 diff --git a/arch/powerpc/kernel/sys_ppc32.c b/arch/powerpc/kernel/sys_ppc32.c index d36c6391eaf5..d451a8229223 100644 --- a/arch/powerpc/kernel/sys_ppc32.c +++ b/arch/powerpc/kernel/sys_ppc32.c @@ -1,13 +1,23 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* - * sys_ppc32.c: Conversion between 32bit and 64bit native syscalls. + * sys_ppc32.c: 32-bit system calls with complex calling conventions. * * Copyright (C) 2001 IBM * Copyright (C) 1997,1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz) * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) * - * These routines maintain argument size conversion between 32bit and 64bit - * environment. + * 32-bit system calls with 64-bit arguments pass those in register pairs. + * This must be specially dealt with on 64-bit kernels. The compat_arg_u64_dual + * in generic compat syscalls is not always usable because the register + * pairing is constrained depending on preceding arguments. + * + * An analogous problem exists on 32-bit kernels with ARCH_HAS_SYSCALL_WRAPPER, + * the defined system call functions take the pt_regs as an argument, and there + * is a mapping macro which maps registers to arguments + * (SC_POWERPC_REGS_TO_ARGS) which also does not deal with these 64-bit + * arguments. + * + * This file contains these system calls. */ #include <linux/kernel.h> @@ -25,7 +35,6 @@ #include <linux/poll.h> #include <linux/personality.h> #include <linux/stat.h> -#include <linux/mman.h> #include <linux/in.h> #include <linux/syscalls.h> #include <linux/unistd.h> @@ -48,68 +57,79 @@ #include <asm/syscalls.h> #include <asm/switch_to.h> -unsigned long compat_sys_mmap2(unsigned long addr, size_t len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - /* This should remain 12 even if PAGE_SIZE changes */ - return sys_mmap(addr, len, prot, flags, fd, pgoff << 12); -} - -/* - * long long munging: - * The 32 bit ABI passes long longs in an odd even register pair. - */ - -compat_ssize_t compat_sys_pread64(unsigned int fd, char __user *ubuf, compat_size_t count, - u32 reg6, u32 poshi, u32 poslo) -{ - return ksys_pread64(fd, ubuf, count, ((loff_t)poshi << 32) | poslo); -} +#ifdef CONFIG_PPC32 +#define PPC32_SYSCALL_DEFINE4 SYSCALL_DEFINE4 +#define PPC32_SYSCALL_DEFINE5 SYSCALL_DEFINE5 +#define PPC32_SYSCALL_DEFINE6 SYSCALL_DEFINE6 +#else +#define PPC32_SYSCALL_DEFINE4 COMPAT_SYSCALL_DEFINE4 +#define PPC32_SYSCALL_DEFINE5 COMPAT_SYSCALL_DEFINE5 +#define PPC32_SYSCALL_DEFINE6 COMPAT_SYSCALL_DEFINE6 +#endif -compat_ssize_t compat_sys_pwrite64(unsigned int fd, const char __user *ubuf, compat_size_t count, - u32 reg6, u32 poshi, u32 poslo) +PPC32_SYSCALL_DEFINE6(ppc_pread64, + unsigned int, fd, + char __user *, ubuf, compat_size_t, count, + u32, reg6, u32, pos1, u32, pos2) { - return ksys_pwrite64(fd, ubuf, count, ((loff_t)poshi << 32) | poslo); + return ksys_pread64(fd, ubuf, count, merge_64(pos1, pos2)); } -compat_ssize_t compat_sys_readahead(int fd, u32 r4, u32 offhi, u32 offlo, u32 count) +PPC32_SYSCALL_DEFINE6(ppc_pwrite64, + unsigned int, fd, + const char __user *, ubuf, compat_size_t, count, + u32, reg6, u32, pos1, u32, pos2) { - return ksys_readahead(fd, ((loff_t)offhi << 32) | offlo, count); + return ksys_pwrite64(fd, ubuf, count, merge_64(pos1, pos2)); } -asmlinkage int compat_sys_truncate64(const char __user * path, u32 reg4, - unsigned long high, unsigned long low) +PPC32_SYSCALL_DEFINE5(ppc_readahead, + int, fd, u32, r4, + u32, offset1, u32, offset2, u32, count) { - return ksys_truncate(path, (high << 32) | low); + return ksys_readahead(fd, merge_64(offset1, offset2), count); } -asmlinkage long compat_sys_fallocate(int fd, int mode, u32 offhi, u32 offlo, - u32 lenhi, u32 lenlo) +PPC32_SYSCALL_DEFINE4(ppc_truncate64, + const char __user *, path, u32, reg4, + unsigned long, len1, unsigned long, len2) { - return ksys_fallocate(fd, mode, ((loff_t)offhi << 32) | offlo, - ((loff_t)lenhi << 32) | lenlo); + return ksys_truncate(path, merge_64(len1, len2)); } -asmlinkage int compat_sys_ftruncate64(unsigned int fd, u32 reg4, unsigned long high, - unsigned long low) +PPC32_SYSCALL_DEFINE4(ppc_ftruncate64, + unsigned int, fd, u32, reg4, + unsigned long, len1, unsigned long, len2) { - return ksys_ftruncate(fd, (high << 32) | low); + return ksys_ftruncate(fd, merge_64(len1, len2)); } -long ppc32_fadvise64(int fd, u32 unused, u32 offset_high, u32 offset_low, - size_t len, int advice) +PPC32_SYSCALL_DEFINE6(ppc32_fadvise64, + int, fd, u32, unused, u32, offset1, u32, offset2, + size_t, len, int, advice) { - return ksys_fadvise64_64(fd, (u64)offset_high << 32 | offset_low, len, + return ksys_fadvise64_64(fd, merge_64(offset1, offset2), len, advice); } -asmlinkage long compat_sys_sync_file_range2(int fd, unsigned int flags, - unsigned offset_hi, unsigned offset_lo, - unsigned nbytes_hi, unsigned nbytes_lo) +PPC32_SYSCALL_DEFINE6(ppc_sync_file_range2, + int, fd, unsigned int, flags, + unsigned int, offset1, unsigned int, offset2, + unsigned int, nbytes1, unsigned int, nbytes2) { - loff_t offset = ((loff_t)offset_hi << 32) | offset_lo; - loff_t nbytes = ((loff_t)nbytes_hi << 32) | nbytes_lo; + loff_t offset = merge_64(offset1, offset2); + loff_t nbytes = merge_64(nbytes1, nbytes2); return ksys_sync_file_range(fd, offset, nbytes, flags); } + +#ifdef CONFIG_PPC32 +SYSCALL_DEFINE6(ppc_fallocate, + int, fd, int, mode, + u32, offset1, u32, offset2, u32, len1, u32, len2) +{ + return ksys_fallocate(fd, mode, + merge_64(offset1, offset2), + merge_64(len1, len2)); +} +#endif diff --git a/arch/powerpc/kernel/syscall.c b/arch/powerpc/kernel/syscall.c new file mode 100644 index 000000000000..77fedb190c93 --- /dev/null +++ b/arch/powerpc/kernel/syscall.c @@ -0,0 +1,189 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/compat.h> +#include <linux/context_tracking.h> +#include <linux/randomize_kstack.h> + +#include <asm/interrupt.h> +#include <asm/kup.h> +#include <asm/syscall.h> +#include <asm/time.h> +#include <asm/tm.h> +#include <asm/unistd.h> + + +/* Has to run notrace because it is entered not completely "reconciled" */ +notrace long system_call_exception(struct pt_regs *regs, unsigned long r0) +{ + long ret; + syscall_fn f; + + kuap_lock(); + + add_random_kstack_offset(); + + if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) + BUG_ON(irq_soft_mask_return() != IRQS_ALL_DISABLED); + + trace_hardirqs_off(); /* finish reconciling */ + + CT_WARN_ON(ct_state() == CONTEXT_KERNEL); + user_exit_irqoff(); + + BUG_ON(regs_is_unrecoverable(regs)); + BUG_ON(!(regs->msr & MSR_PR)); + BUG_ON(arch_irq_disabled_regs(regs)); + +#ifdef CONFIG_PPC_PKEY + if (mmu_has_feature(MMU_FTR_PKEY)) { + unsigned long amr, iamr; + bool flush_needed = false; + /* + * When entering from userspace we mostly have the AMR/IAMR + * different from kernel default values. Hence don't compare. + */ + amr = mfspr(SPRN_AMR); + iamr = mfspr(SPRN_IAMR); + regs->amr = amr; + regs->iamr = iamr; + if (mmu_has_feature(MMU_FTR_KUAP)) { + mtspr(SPRN_AMR, AMR_KUAP_BLOCKED); + flush_needed = true; + } + if (mmu_has_feature(MMU_FTR_BOOK3S_KUEP)) { + mtspr(SPRN_IAMR, AMR_KUEP_BLOCKED); + flush_needed = true; + } + if (flush_needed) + isync(); + } else +#endif + kuap_assert_locked(); + + booke_restore_dbcr0(); + + account_cpu_user_entry(); + + account_stolen_time(); + + /* + * This is not required for the syscall exit path, but makes the + * stack frame look nicer. If this was initialised in the first stack + * frame, or if the unwinder was taught the first stack frame always + * returns to user with IRQS_ENABLED, this store could be avoided! + */ + irq_soft_mask_regs_set_state(regs, IRQS_ENABLED); + + /* + * If system call is called with TM active, set _TIF_RESTOREALL to + * prevent RFSCV being used to return to userspace, because POWER9 + * TM implementation has problems with this instruction returning to + * transactional state. Final register values are not relevant because + * the transaction will be aborted upon return anyway. Or in the case + * of unsupported_scv SIGILL fault, the return state does not much + * matter because it's an edge case. + */ + if (IS_ENABLED(CONFIG_PPC_TRANSACTIONAL_MEM) && + unlikely(MSR_TM_TRANSACTIONAL(regs->msr))) + set_bits(_TIF_RESTOREALL, ¤t_thread_info()->flags); + + /* + * If the system call was made with a transaction active, doom it and + * return without performing the system call. Unless it was an + * unsupported scv vector, in which case it's treated like an illegal + * instruction. + */ +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + if (unlikely(MSR_TM_TRANSACTIONAL(regs->msr)) && + !trap_is_unsupported_scv(regs)) { + /* Enable TM in the kernel, and disable EE (for scv) */ + hard_irq_disable(); + mtmsr(mfmsr() | MSR_TM); + + /* tabort, this dooms the transaction, nothing else */ + asm volatile(".long 0x7c00071d | ((%0) << 16)" + :: "r"(TM_CAUSE_SYSCALL|TM_CAUSE_PERSISTENT)); + + /* + * Userspace will never see the return value. Execution will + * resume after the tbegin. of the aborted transaction with the + * checkpointed register state. A context switch could occur + * or signal delivered to the process before resuming the + * doomed transaction context, but that should all be handled + * as expected. + */ + return -ENOSYS; + } +#endif // CONFIG_PPC_TRANSACTIONAL_MEM + + local_irq_enable(); + + if (unlikely(read_thread_flags() & _TIF_SYSCALL_DOTRACE)) { + if (unlikely(trap_is_unsupported_scv(regs))) { + /* Unsupported scv vector */ + _exception(SIGILL, regs, ILL_ILLOPC, regs->nip); + return regs->gpr[3]; + } + /* + * We use the return value of do_syscall_trace_enter() as the + * syscall number. If the syscall was rejected for any reason + * do_syscall_trace_enter() returns an invalid syscall number + * and the test against NR_syscalls will fail and the return + * value to be used is in regs->gpr[3]. + */ + r0 = do_syscall_trace_enter(regs); + if (unlikely(r0 >= NR_syscalls)) + return regs->gpr[3]; + + } else if (unlikely(r0 >= NR_syscalls)) { + if (unlikely(trap_is_unsupported_scv(regs))) { + /* Unsupported scv vector */ + _exception(SIGILL, regs, ILL_ILLOPC, regs->nip); + return regs->gpr[3]; + } + return -ENOSYS; + } + + /* May be faster to do array_index_nospec? */ + barrier_nospec(); + +#ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER + // No COMPAT if we have SYSCALL_WRAPPER, see Kconfig + f = (void *)sys_call_table[r0]; + ret = f(regs); +#else + if (unlikely(is_compat_task())) { + unsigned long r3, r4, r5, r6, r7, r8; + + f = (void *)compat_sys_call_table[r0]; + + r3 = regs->gpr[3] & 0x00000000ffffffffULL; + r4 = regs->gpr[4] & 0x00000000ffffffffULL; + r5 = regs->gpr[5] & 0x00000000ffffffffULL; + r6 = regs->gpr[6] & 0x00000000ffffffffULL; + r7 = regs->gpr[7] & 0x00000000ffffffffULL; + r8 = regs->gpr[8] & 0x00000000ffffffffULL; + + ret = f(r3, r4, r5, r6, r7, r8); + } else { + f = (void *)sys_call_table[r0]; + + ret = f(regs->gpr[3], regs->gpr[4], regs->gpr[5], + regs->gpr[6], regs->gpr[7], regs->gpr[8]); + } +#endif + + /* + * Ultimately, this value will get limited by KSTACK_OFFSET_MAX(), + * so the maximum stack offset is 1k bytes (10 bits). + * + * The actual entropy will be further reduced by the compiler when + * applying stack alignment constraints: the powerpc architecture + * may have two kinds of stack alignment (16-bytes and 8-bytes). + * + * So the resulting 6 or 7 bits of entropy is seen in SP[9:4] or SP[9:3]. + */ + choose_random_kstack_offset(mftb()); + + return ret; +} diff --git a/arch/powerpc/kernel/syscalls.c b/arch/powerpc/kernel/syscalls.c index 078608ec2e92..68ebb23a5af4 100644 --- a/arch/powerpc/kernel/syscalls.c +++ b/arch/powerpc/kernel/syscalls.c @@ -35,26 +35,18 @@ #include <asm/syscalls.h> #include <asm/time.h> #include <asm/unistd.h> -#include <asm/asm-prototypes.h> -static inline long do_mmap2(unsigned long addr, size_t len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long off, int shift) +static long do_mmap2(unsigned long addr, size_t len, + unsigned long prot, unsigned long flags, + unsigned long fd, unsigned long off, int shift) { - long ret = -EINVAL; - if (!arch_validate_prot(prot, addr)) - goto out; + return -EINVAL; - if (shift) { - if (off & ((1 << shift) - 1)) - goto out; - off >>= shift; - } + if (!IS_ALIGNED(off, 1 << shift)) + return -EINVAL; - ret = ksys_mmap_pgoff(addr, len, prot, flags, fd, off); -out: - return ret; + return ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> shift); } SYSCALL_DEFINE6(mmap2, unsigned long, addr, size_t, len, @@ -64,6 +56,16 @@ SYSCALL_DEFINE6(mmap2, unsigned long, addr, size_t, len, return do_mmap2(addr, len, prot, flags, fd, pgoff, PAGE_SHIFT-12); } +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE6(mmap2, + unsigned long, addr, size_t, len, + unsigned long, prot, unsigned long, flags, + unsigned long, fd, unsigned long, off_4k) +{ + return do_mmap2(addr, len, prot, flags, fd, off_4k, PAGE_SHIFT-12); +} +#endif + SYSCALL_DEFINE6(mmap, unsigned long, addr, size_t, len, unsigned long, prot, unsigned long, flags, unsigned long, fd, off_t, offset) @@ -71,58 +73,47 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, size_t, len, return do_mmap2(addr, len, prot, flags, fd, offset, PAGE_SHIFT); } -#ifdef CONFIG_PPC32 -/* - * Due to some executables calling the wrong select we sometimes - * get wrong args. This determines how the args are being passed - * (a single ptr to them all args passed) then calls - * sys_select() with the appropriate args. -- Cort - */ -int -ppc_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct __kernel_old_timeval __user *tvp) -{ - if ( (unsigned long)n >= 4096 ) - { - unsigned long __user *buffer = (unsigned long __user *)n; - if (!access_ok(buffer, 5*sizeof(unsigned long)) - || __get_user(n, buffer) - || __get_user(inp, ((fd_set __user * __user *)(buffer+1))) - || __get_user(outp, ((fd_set __user * __user *)(buffer+2))) - || __get_user(exp, ((fd_set __user * __user *)(buffer+3))) - || __get_user(tvp, ((struct __kernel_old_timeval __user * __user *)(buffer+4)))) - return -EFAULT; - } - return sys_select(n, inp, outp, exp, tvp); -} -#endif - #ifdef CONFIG_PPC64 -long ppc64_personality(unsigned long personality) +static long do_ppc64_personality(unsigned long personality) { long ret; if (personality(current->personality) == PER_LINUX32 && personality(personality) == PER_LINUX) personality = (personality & ~PER_MASK) | PER_LINUX32; - ret = sys_personality(personality); + ret = ksys_personality(personality); if (personality(ret) == PER_LINUX32) ret = (ret & ~PER_MASK) | PER_LINUX; return ret; } -#endif -long ppc_fadvise64_64(int fd, int advice, u32 offset_high, u32 offset_low, - u32 len_high, u32 len_low) +SYSCALL_DEFINE1(ppc64_personality, unsigned long, personality) +{ + return do_ppc64_personality(personality); +} + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE1(ppc64_personality, unsigned long, personality) +{ + return do_ppc64_personality(personality); +} +#endif /* CONFIG_COMPAT */ +#endif /* CONFIG_PPC64 */ + +SYSCALL_DEFINE6(ppc_fadvise64_64, + int, fd, int, advice, u32, offset_high, u32, offset_low, + u32, len_high, u32, len_low) { - return ksys_fadvise64_64(fd, (u64)offset_high << 32 | offset_low, - (u64)len_high << 32 | len_low, advice); + return ksys_fadvise64_64(fd, merge_64(offset_high, offset_low), + merge_64(len_high, len_low), advice); } SYSCALL_DEFINE0(switch_endian) { struct thread_info *ti; - current->thread.regs->msr ^= MSR_LE; + regs_set_return_msr(current->thread.regs, + current->thread.regs->msr ^ MSR_LE); /* * Set TIF_RESTOREALL so that r3 isn't clobbered on return to diff --git a/arch/powerpc/kernel/syscalls/Makefile b/arch/powerpc/kernel/syscalls/Makefile index 27b48954808d..9d7bd81510b8 100644 --- a/arch/powerpc/kernel/syscalls/Makefile +++ b/arch/powerpc/kernel/syscalls/Makefile @@ -2,62 +2,47 @@ kapi := arch/$(SRCARCH)/include/generated/asm uapi := arch/$(SRCARCH)/include/generated/uapi/asm -_dummy := $(shell [ -d '$(uapi)' ] || mkdir -p '$(uapi)') \ - $(shell [ -d '$(kapi)' ] || mkdir -p '$(kapi)') +$(shell mkdir -p $(uapi) $(kapi)) -syscall := $(srctree)/$(src)/syscall.tbl -syshdr := $(srctree)/$(src)/syscallhdr.sh -systbl := $(srctree)/$(src)/syscalltbl.sh +syscall := $(src)/syscall.tbl +syshdr := $(srctree)/scripts/syscallhdr.sh +systbl := $(srctree)/scripts/syscalltbl.sh quiet_cmd_syshdr = SYSHDR $@ - cmd_syshdr = $(CONFIG_SHELL) '$(syshdr)' '$<' '$@' \ - '$(syshdr_abis_$(basetarget))' \ - '$(syshdr_pfx_$(basetarget))' \ - '$(syshdr_offset_$(basetarget))' + cmd_syshdr = $(CONFIG_SHELL) $(syshdr) --emit-nr --abis $(abis) $< $@ quiet_cmd_systbl = SYSTBL $@ - cmd_systbl = $(CONFIG_SHELL) '$(systbl)' '$<' '$@' \ - '$(systbl_abis_$(basetarget))' \ - '$(systbl_abi_$(basetarget))' \ - '$(systbl_offset_$(basetarget))' + cmd_systbl = $(CONFIG_SHELL) $(systbl) --abis $(abis) $< $@ -syshdr_abis_unistd_32 := common,nospu,32 -$(uapi)/unistd_32.h: $(syscall) $(syshdr) +$(uapi)/unistd_32.h: abis := common,nospu,32 +$(uapi)/unistd_32.h: $(syscall) $(syshdr) FORCE $(call if_changed,syshdr) -syshdr_abis_unistd_64 := common,nospu,64 -$(uapi)/unistd_64.h: $(syscall) $(syshdr) +$(uapi)/unistd_64.h: abis := common,nospu,64 +$(uapi)/unistd_64.h: $(syscall) $(syshdr) FORCE $(call if_changed,syshdr) -systbl_abis_syscall_table_32 := common,nospu,32 -systbl_abi_syscall_table_32 := 32 -$(kapi)/syscall_table_32.h: $(syscall) $(systbl) +$(kapi)/syscall_table_32.h: abis := common,nospu,32 +$(kapi)/syscall_table_32.h: $(syscall) $(systbl) FORCE $(call if_changed,systbl) -systbl_abis_syscall_table_64 := common,nospu,64 -systbl_abi_syscall_table_64 := 64 -$(kapi)/syscall_table_64.h: $(syscall) $(systbl) +$(kapi)/syscall_table_64.h: abis := common,nospu,64 +$(kapi)/syscall_table_64.h: $(syscall) $(systbl) FORCE $(call if_changed,systbl) -systbl_abis_syscall_table_c32 := common,nospu,32 -systbl_abi_syscall_table_c32 := c32 -$(kapi)/syscall_table_c32.h: $(syscall) $(systbl) - $(call if_changed,systbl) - -systbl_abis_syscall_table_spu := common,spu -systbl_abi_syscall_table_spu := spu -$(kapi)/syscall_table_spu.h: $(syscall) $(systbl) +$(kapi)/syscall_table_spu.h: abis := common,spu +$(kapi)/syscall_table_spu.h: $(syscall) $(systbl) FORCE $(call if_changed,systbl) uapisyshdr-y += unistd_32.h unistd_64.h kapisyshdr-y += syscall_table_32.h \ syscall_table_64.h \ - syscall_table_c32.h \ syscall_table_spu.h -targets += $(uapisyshdr-y) $(kapisyshdr-y) +uapisyshdr-y := $(addprefix $(uapi)/, $(uapisyshdr-y)) +kapisyshdr-y := $(addprefix $(kapi)/, $(kapisyshdr-y)) +targets += $(addprefix ../../../../, $(uapisyshdr-y) $(kapisyshdr-y)) PHONY += all -all: $(addprefix $(uapi)/,$(uapisyshdr-y)) -all: $(addprefix $(kapi)/,$(kapisyshdr-y)) +all: $(uapisyshdr-y) $(kapisyshdr-y) @: diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index 43f736ed47f2..17173b82ca21 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -9,7 +9,7 @@ # 0 nospu restart_syscall sys_restart_syscall 1 nospu exit sys_exit -2 nospu fork ppc_fork +2 nospu fork sys_fork 3 common read sys_read 4 common write sys_write 5 common open sys_open compat_sys_open @@ -32,7 +32,7 @@ 18 spu oldstat sys_ni_syscall 19 common lseek sys_lseek compat_sys_lseek 20 common getpid sys_getpid -21 nospu mount sys_mount compat_sys_mount +21 nospu mount sys_mount 22 32 umount sys_oldumount 22 64 umount sys_ni_syscall 22 spu umount sys_ni_syscall @@ -110,7 +110,7 @@ 79 common settimeofday sys_settimeofday compat_sys_settimeofday 80 common getgroups sys_getgroups 81 common setgroups sys_setgroups -82 32 select ppc_select sys_ni_syscall +82 32 select sys_old_select compat_sys_old_select 82 64 select sys_ni_syscall 82 spu select sys_ni_syscall 83 common symlink sys_symlink @@ -158,7 +158,7 @@ 119 32 sigreturn sys_sigreturn compat_sys_sigreturn 119 64 sigreturn sys_ni_syscall 119 spu sigreturn sys_ni_syscall -120 nospu clone ppc_clone +120 nospu clone sys_clone 121 common setdomainname sys_setdomainname 122 common uname sys_newuname 123 common modify_ldt sys_ni_syscall @@ -176,11 +176,11 @@ 131 nospu quotactl sys_quotactl 132 common getpgid sys_getpgid 133 common fchdir sys_fchdir -134 common bdflush sys_bdflush +134 common bdflush sys_ni_syscall 135 common sysfs sys_sysfs -136 32 personality sys_personality ppc64_personality -136 64 personality ppc64_personality -136 spu personality ppc64_personality +136 32 personality sys_personality compat_sys_ppc64_personality +136 64 personality sys_ppc64_personality +136 spu personality sys_ppc64_personality 137 common afs_syscall sys_ni_syscall 138 common setfsuid sys_setfsuid 139 common setfsgid sys_setfsgid @@ -189,11 +189,11 @@ 142 common _newselect sys_select compat_sys_select 143 common flock sys_flock 144 common msync sys_msync -145 common readv sys_readv compat_sys_readv -146 common writev sys_writev compat_sys_writev +145 common readv sys_readv +146 common writev sys_writev 147 common getsid sys_getsid 148 common fdatasync sys_fdatasync -149 nospu _sysctl sys_sysctl compat_sys_sysctl +149 nospu _sysctl sys_ni_syscall 150 common mlock sys_mlock 151 common munlock sys_munlock 152 common mlockall sys_mlockall @@ -228,8 +228,10 @@ 176 64 rt_sigtimedwait sys_rt_sigtimedwait 177 nospu rt_sigqueueinfo sys_rt_sigqueueinfo compat_sys_rt_sigqueueinfo 178 nospu rt_sigsuspend sys_rt_sigsuspend compat_sys_rt_sigsuspend -179 common pread64 sys_pread64 compat_sys_pread64 -180 common pwrite64 sys_pwrite64 compat_sys_pwrite64 +179 32 pread64 sys_ppc_pread64 compat_sys_ppc_pread64 +179 64 pread64 sys_pread64 +180 32 pwrite64 sys_ppc_pwrite64 compat_sys_ppc_pwrite64 +180 64 pwrite64 sys_pwrite64 181 common chown sys_chown 182 common getcwd sys_getcwd 183 common capget sys_capget @@ -240,12 +242,13 @@ 186 spu sendfile sys_sendfile64 187 common getpmsg sys_ni_syscall 188 common putpmsg sys_ni_syscall -189 nospu vfork ppc_vfork +189 nospu vfork sys_vfork 190 common ugetrlimit sys_getrlimit compat_sys_getrlimit -191 common readahead sys_readahead compat_sys_readahead +191 32 readahead sys_ppc_readahead compat_sys_ppc_readahead +191 64 readahead sys_readahead 192 32 mmap2 sys_mmap2 compat_sys_mmap2 -193 32 truncate64 sys_truncate64 compat_sys_truncate64 -194 32 ftruncate64 sys_ftruncate64 compat_sys_ftruncate64 +193 32 truncate64 sys_ppc_truncate64 compat_sys_ppc_truncate64 +194 32 ftruncate64 sys_ppc_ftruncate64 compat_sys_ppc_ftruncate64 195 32 stat64 sys_stat64 196 32 lstat64 sys_lstat64 197 32 fstat64 sys_fstat64 @@ -288,9 +291,10 @@ 230 common io_submit sys_io_submit compat_sys_io_submit 231 common io_cancel sys_io_cancel 232 nospu set_tid_address sys_set_tid_address -233 common fadvise64 sys_fadvise64 ppc32_fadvise64 +233 32 fadvise64 sys_ppc32_fadvise64 compat_sys_ppc32_fadvise64 +233 64 fadvise64 sys_fadvise64 234 nospu exit_group sys_exit_group -235 nospu lookup_dcookie sys_lookup_dcookie compat_sys_lookup_dcookie +235 nospu lookup_dcookie sys_ni_syscall 236 common epoll_create sys_epoll_create 237 common epoll_ctl sys_epoll_ctl 238 common epoll_wait sys_epoll_wait @@ -316,26 +320,24 @@ 248 32 clock_nanosleep sys_clock_nanosleep_time32 248 64 clock_nanosleep sys_clock_nanosleep 248 spu clock_nanosleep sys_clock_nanosleep -249 32 swapcontext ppc_swapcontext ppc32_swapcontext -249 64 swapcontext ppc64_swapcontext -249 spu swapcontext sys_ni_syscall +249 nospu swapcontext sys_swapcontext compat_sys_swapcontext 250 common tgkill sys_tgkill 251 32 utimes sys_utimes_time32 251 64 utimes sys_utimes 251 spu utimes sys_utimes 252 common statfs64 sys_statfs64 compat_sys_statfs64 253 common fstatfs64 sys_fstatfs64 compat_sys_fstatfs64 -254 32 fadvise64_64 ppc_fadvise64_64 +254 32 fadvise64_64 sys_ppc_fadvise64_64 254 spu fadvise64_64 sys_ni_syscall 255 common rtas sys_rtas 256 32 sys_debug_setcontext sys_debug_setcontext sys_ni_syscall 256 64 sys_debug_setcontext sys_ni_syscall 256 spu sys_debug_setcontext sys_ni_syscall # 257 reserved for vserver -258 nospu migrate_pages sys_migrate_pages compat_sys_migrate_pages -259 nospu mbind sys_mbind compat_sys_mbind -260 nospu get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy -261 nospu set_mempolicy sys_set_mempolicy compat_sys_set_mempolicy +258 nospu migrate_pages sys_migrate_pages +259 nospu mbind sys_mbind +260 nospu get_mempolicy sys_get_mempolicy +261 nospu set_mempolicy sys_set_mempolicy 262 nospu mq_open sys_mq_open compat_sys_mq_open 263 nospu mq_unlink sys_mq_unlink 264 32 mq_timedsend sys_mq_timedsend_time32 @@ -363,7 +365,7 @@ 282 common unshare sys_unshare 283 common splice sys_splice 284 common tee sys_tee -285 common vmsplice sys_vmsplice compat_sys_vmsplice +285 common vmsplice sys_vmsplice 286 common openat sys_openat compat_sys_openat 287 common mkdirat sys_mkdirat 288 common mknodat sys_mknodat @@ -383,7 +385,7 @@ 298 common faccessat sys_faccessat 299 common get_robust_list sys_get_robust_list compat_sys_get_robust_list 300 common set_robust_list sys_set_robust_list compat_sys_set_robust_list -301 common move_pages sys_move_pages compat_sys_move_pages +301 common move_pages sys_move_pages 302 common getcpu sys_getcpu 303 nospu epoll_pwait sys_epoll_pwait compat_sys_epoll_pwait 304 32 utimensat sys_utimensat_time32 @@ -392,8 +394,11 @@ 305 common signalfd sys_signalfd compat_sys_signalfd 306 common timerfd_create sys_timerfd_create 307 common eventfd sys_eventfd -308 common sync_file_range2 sys_sync_file_range2 compat_sys_sync_file_range2 -309 nospu fallocate sys_fallocate compat_sys_fallocate +308 32 sync_file_range2 sys_ppc_sync_file_range2 compat_sys_ppc_sync_file_range2 +308 64 sync_file_range2 sys_sync_file_range2 +308 spu sync_file_range2 sys_sync_file_range2 +309 32 fallocate sys_ppc_fallocate compat_sys_fallocate +309 64 fallocate sys_fallocate 310 nospu subpage_prot sys_subpage_prot 311 32 timerfd_settime sys_timerfd_settime32 311 64 timerfd_settime sys_timerfd_settime @@ -427,8 +432,8 @@ 336 common recv sys_recv compat_sys_recv 337 common recvfrom sys_recvfrom compat_sys_recvfrom 338 common shutdown sys_shutdown -339 common setsockopt sys_setsockopt compat_sys_setsockopt -340 common getsockopt sys_getsockopt compat_sys_getsockopt +339 common setsockopt sys_setsockopt sys_setsockopt +340 common getsockopt sys_getsockopt sys_getsockopt 341 common sendmsg sys_sendmsg compat_sys_sendmsg 342 common recvmsg sys_recvmsg compat_sys_recvmsg 343 32 recvmmsg sys_recvmmsg_time32 compat_sys_recvmmsg_time32 @@ -443,8 +448,8 @@ 348 common syncfs sys_syncfs 349 common sendmmsg sys_sendmmsg compat_sys_sendmmsg 350 common setns sys_setns -351 nospu process_vm_readv sys_process_vm_readv compat_sys_process_vm_readv -352 nospu process_vm_writev sys_process_vm_writev compat_sys_process_vm_writev +351 nospu process_vm_readv sys_process_vm_readv +352 nospu process_vm_writev sys_process_vm_writev 353 nospu finit_module sys_finit_module 354 nospu kcmp sys_kcmp 355 common sched_setattr sys_sched_setattr @@ -456,7 +461,7 @@ 361 common bpf sys_bpf 362 nospu execveat sys_execveat compat_sys_execveat 363 32 switch_endian sys_ni_syscall -363 64 switch_endian ppc_switch_endian +363 64 switch_endian sys_switch_endian 363 spu switch_endian sys_ni_syscall 364 common userfaultfd sys_userfaultfd 365 common membarrier sys_membarrier @@ -516,4 +521,30 @@ 432 common fsmount sys_fsmount 433 common fspick sys_fspick 434 common pidfd_open sys_pidfd_open -435 nospu clone3 ppc_clone3 +435 nospu clone3 sys_clone3 +436 common close_range sys_close_range +437 common openat2 sys_openat2 +438 common pidfd_getfd sys_pidfd_getfd +439 common faccessat2 sys_faccessat2 +440 common process_madvise sys_process_madvise +441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 +442 common mount_setattr sys_mount_setattr +443 common quotactl_fd sys_quotactl_fd +444 common landlock_create_ruleset sys_landlock_create_ruleset +445 common landlock_add_rule sys_landlock_add_rule +446 common landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease +449 common futex_waitv sys_futex_waitv +450 nospu set_mempolicy_home_node sys_set_mempolicy_home_node +451 common cachestat sys_cachestat +452 common fchmodat2 sys_fchmodat2 +453 common map_shadow_stack sys_ni_syscall +454 common futex_wake sys_futex_wake +455 common futex_wait sys_futex_wait +456 common futex_requeue sys_futex_requeue +457 common statmount sys_statmount +458 common listmount sys_listmount +459 common lsm_get_self_attr sys_lsm_get_self_attr +460 common lsm_set_self_attr sys_lsm_set_self_attr +461 common lsm_list_modules sys_lsm_list_modules diff --git a/arch/powerpc/kernel/syscalls/syscallhdr.sh b/arch/powerpc/kernel/syscalls/syscallhdr.sh deleted file mode 100644 index c0a9a32937f1..000000000000 --- a/arch/powerpc/kernel/syscalls/syscallhdr.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 - -in="$1" -out="$2" -my_abis=`echo "($3)" | tr ',' '|'` -prefix="$4" -offset="$5" - -fileguard=_UAPI_ASM_POWERPC_`basename "$out" | sed \ - -e 'y/abcdefghijklmnopqrstuvwxyz/ABCDEFGHIJKLMNOPQRSTUVWXYZ/' \ - -e 's/[^A-Z0-9_]/_/g' -e 's/__/_/g'` -grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | ( - printf "#ifndef %s\n" "${fileguard}" - printf "#define %s\n" "${fileguard}" - printf "\n" - - nxt=0 - while read nr abi name entry compat ; do - if [ -z "$offset" ]; then - printf "#define __NR_%s%s\t%s\n" \ - "${prefix}" "${name}" "${nr}" - else - printf "#define __NR_%s%s\t(%s + %s)\n" \ - "${prefix}" "${name}" "${offset}" "${nr}" - fi - nxt=$((nr+1)) - done - - printf "\n" - printf "#ifdef __KERNEL__\n" - printf "#define __NR_syscalls\t%s\n" "${nxt}" - printf "#endif\n" - printf "\n" - printf "#endif /* %s */" "${fileguard}" - printf "\n" -) > "$out" diff --git a/arch/powerpc/kernel/syscalls/syscalltbl.sh b/arch/powerpc/kernel/syscalls/syscalltbl.sh deleted file mode 100644 index f7393a7b18aa..000000000000 --- a/arch/powerpc/kernel/syscalls/syscalltbl.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 - -in="$1" -out="$2" -my_abis=`echo "($3)" | tr ',' '|'` -my_abi="$4" -offset="$5" - -emit() { - t_nxt="$1" - t_nr="$2" - t_entry="$3" - - while [ $t_nxt -lt $t_nr ]; do - printf "__SYSCALL(%s,sys_ni_syscall)\n" "${t_nxt}" - t_nxt=$((t_nxt+1)) - done - printf "__SYSCALL(%s,%s)\n" "${t_nxt}" "${t_entry}" -} - -grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | ( - nxt=0 - if [ -z "$offset" ]; then - offset=0 - fi - - while read nr abi name entry compat ; do - if [ "$my_abi" = "c32" ] && [ ! -z "$compat" ]; then - emit $((nxt+offset)) $((nr+offset)) $compat - else - emit $((nxt+offset)) $((nr+offset)) $entry - fi - nxt=$((nr+1)) - done -) > "$out" diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c index 80a676da11cb..0f39a6b84132 100644 --- a/arch/powerpc/kernel/sysfs.c +++ b/arch/powerpc/kernel/sysfs.c @@ -9,16 +9,17 @@ #include <linux/nodemask.h> #include <linux/cpumask.h> #include <linux/notifier.h> +#include <linux/of.h> #include <asm/current.h> #include <asm/processor.h> #include <asm/cputable.h> #include <asm/hvcall.h> -#include <asm/prom.h> #include <asm/machdep.h> #include <asm/smp.h> #include <asm/pmc.h> #include <asm/firmware.h> +#include <asm/idle.h> #include <asm/svm.h> #include "cacheinfo.h" @@ -31,29 +32,27 @@ static DEFINE_PER_CPU(struct cpu, cpu_devices); -/* - * SMT snooze delay stuff, 64-bit only for now - */ - #ifdef CONFIG_PPC64 -/* Time in microseconds we delay before sleeping in the idle loop */ -static DEFINE_PER_CPU(long, smt_snooze_delay) = { 100 }; +/* + * Snooze delay has not been hooked up since 3fa8cad82b94 ("powerpc/pseries/cpuidle: + * smt-snooze-delay cleanup.") and has been broken even longer. As was foretold in + * 2014: + * + * "ppc64_util currently utilises it. Once we fix ppc64_util, propose to clean + * up the kernel code." + * + * powerpc-utils stopped using it as of 1.3.8. At some point in the future this + * code should be removed. + */ static ssize_t store_smt_snooze_delay(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - struct cpu *cpu = container_of(dev, struct cpu, dev); - ssize_t ret; - long snooze; - - ret = sscanf(buf, "%ld", &snooze); - if (ret != 1) - return -EINVAL; - - per_cpu(smt_snooze_delay, cpu->dev.id) = snooze; + pr_warn_once("%s (%d) stored to unsupported smt_snooze_delay, which has no effect.\n", + current->comm, current->pid); return count; } @@ -61,9 +60,9 @@ static ssize_t show_smt_snooze_delay(struct device *dev, struct device_attribute *attr, char *buf) { - struct cpu *cpu = container_of(dev, struct cpu, dev); - - return sprintf(buf, "%ld\n", per_cpu(smt_snooze_delay, cpu->dev.id)); + pr_warn_once("%s (%d) read from unsupported smt_snooze_delay\n", + current->comm, current->pid); + return sprintf(buf, "100\n"); } static DEVICE_ATTR(smt_snooze_delay, 0644, show_smt_snooze_delay, @@ -71,23 +70,170 @@ static DEVICE_ATTR(smt_snooze_delay, 0644, show_smt_snooze_delay, static int __init setup_smt_snooze_delay(char *str) { - unsigned int cpu; - long snooze; - if (!cpu_has_feature(CPU_FTR_SMT)) return 1; - snooze = simple_strtol(str, NULL, 10); - for_each_possible_cpu(cpu) - per_cpu(smt_snooze_delay, cpu) = snooze; - + pr_warn("smt-snooze-delay command line option has no effect\n"); return 1; } __setup("smt-snooze-delay=", setup_smt_snooze_delay); #endif /* CONFIG_PPC64 */ -#ifdef CONFIG_PPC_FSL_BOOK3E +#define __SYSFS_SPRSETUP_READ_WRITE(NAME, ADDRESS, EXTRA) \ +static void read_##NAME(void *val) \ +{ \ + *(unsigned long *)val = mfspr(ADDRESS); \ +} \ +static void write_##NAME(void *val) \ +{ \ + EXTRA; \ + mtspr(ADDRESS, *(unsigned long *)val); \ +} + +#define __SYSFS_SPRSETUP_SHOW_STORE(NAME) \ +static ssize_t show_##NAME(struct device *dev, \ + struct device_attribute *attr, \ + char *buf) \ +{ \ + struct cpu *cpu = container_of(dev, struct cpu, dev); \ + unsigned long val; \ + smp_call_function_single(cpu->dev.id, read_##NAME, &val, 1); \ + return sprintf(buf, "%lx\n", val); \ +} \ +static ssize_t __used \ + store_##NAME(struct device *dev, struct device_attribute *attr, \ + const char *buf, size_t count) \ +{ \ + struct cpu *cpu = container_of(dev, struct cpu, dev); \ + unsigned long val; \ + int ret = sscanf(buf, "%lx", &val); \ + if (ret != 1) \ + return -EINVAL; \ + smp_call_function_single(cpu->dev.id, write_##NAME, &val, 1); \ + return count; \ +} + +#define SYSFS_PMCSETUP(NAME, ADDRESS) \ + __SYSFS_SPRSETUP_READ_WRITE(NAME, ADDRESS, ppc_enable_pmcs()) \ + __SYSFS_SPRSETUP_SHOW_STORE(NAME) +#define SYSFS_SPRSETUP(NAME, ADDRESS) \ + __SYSFS_SPRSETUP_READ_WRITE(NAME, ADDRESS, ) \ + __SYSFS_SPRSETUP_SHOW_STORE(NAME) + +#define SYSFS_SPRSETUP_SHOW_STORE(NAME) \ + __SYSFS_SPRSETUP_SHOW_STORE(NAME) + +#ifdef CONFIG_PPC64 + +/* + * This is the system wide DSCR register default value. Any + * change to this default value through the sysfs interface + * will update all per cpu DSCR default values across the + * system stored in their respective PACA structures. + */ +static unsigned long dscr_default; + +/** + * read_dscr() - Fetch the cpu specific DSCR default + * @val: Returned cpu specific DSCR default value + * + * This function returns the per cpu DSCR default value + * for any cpu which is contained in it's PACA structure. + */ +static void read_dscr(void *val) +{ + *(unsigned long *)val = get_paca()->dscr_default; +} + + +/** + * write_dscr() - Update the cpu specific DSCR default + * @val: New cpu specific DSCR default value to update + * + * This function updates the per cpu DSCR default value + * for any cpu which is contained in it's PACA structure. + */ +static void write_dscr(void *val) +{ + get_paca()->dscr_default = *(unsigned long *)val; + if (!current->thread.dscr_inherit) { + current->thread.dscr = *(unsigned long *)val; + mtspr(SPRN_DSCR, *(unsigned long *)val); + } +} + +SYSFS_SPRSETUP_SHOW_STORE(dscr); +static DEVICE_ATTR(dscr, 0600, show_dscr, store_dscr); + +static void add_write_permission_dev_attr(struct device_attribute *attr) +{ + attr->attr.mode |= 0200; +} + +/** + * show_dscr_default() - Fetch the system wide DSCR default + * @dev: Device structure + * @attr: Device attribute structure + * @buf: Interface buffer + * + * This function returns the system wide DSCR default value. + */ +static ssize_t show_dscr_default(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%lx\n", dscr_default); +} + +/** + * store_dscr_default() - Update the system wide DSCR default + * @dev: Device structure + * @attr: Device attribute structure + * @buf: Interface buffer + * @count: Size of the update + * + * This function updates the system wide DSCR default value. + */ +static ssize_t __used store_dscr_default(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) +{ + unsigned long val; + int ret = 0; + + ret = sscanf(buf, "%lx", &val); + if (ret != 1) + return -EINVAL; + dscr_default = val; + + on_each_cpu(write_dscr, &val, 1); + + return count; +} + +static DEVICE_ATTR(dscr_default, 0600, + show_dscr_default, store_dscr_default); + +static void __init sysfs_create_dscr_default(void) +{ + if (cpu_has_feature(CPU_FTR_DSCR)) { + struct device *dev_root; + int cpu; + + dscr_default = spr_default_dscr; + for_each_possible_cpu(cpu) + paca_ptrs[cpu]->dscr_default = dscr_default; + + dev_root = bus_get_dev_root(&cpu_subsys); + if (dev_root) { + device_create_file(dev_root, &dev_attr_dscr_default); + put_device(dev_root); + } + } +} +#endif /* CONFIG_PPC64 */ + +#ifdef CONFIG_PPC_E500 #define MAX_BIT 63 static u64 pw20_wt; @@ -407,84 +553,35 @@ void ppc_enable_pmcs(void) } EXPORT_SYMBOL(ppc_enable_pmcs); -#define __SYSFS_SPRSETUP_READ_WRITE(NAME, ADDRESS, EXTRA) \ -static void read_##NAME(void *val) \ -{ \ - *(unsigned long *)val = mfspr(ADDRESS); \ -} \ -static void write_##NAME(void *val) \ -{ \ - EXTRA; \ - mtspr(ADDRESS, *(unsigned long *)val); \ -} -#define __SYSFS_SPRSETUP_SHOW_STORE(NAME) \ -static ssize_t show_##NAME(struct device *dev, \ - struct device_attribute *attr, \ - char *buf) \ -{ \ - struct cpu *cpu = container_of(dev, struct cpu, dev); \ - unsigned long val; \ - smp_call_function_single(cpu->dev.id, read_##NAME, &val, 1); \ - return sprintf(buf, "%lx\n", val); \ -} \ -static ssize_t __used \ - store_##NAME(struct device *dev, struct device_attribute *attr, \ - const char *buf, size_t count) \ -{ \ - struct cpu *cpu = container_of(dev, struct cpu, dev); \ - unsigned long val; \ - int ret = sscanf(buf, "%lx", &val); \ - if (ret != 1) \ - return -EINVAL; \ - smp_call_function_single(cpu->dev.id, write_##NAME, &val, 1); \ - return count; \ -} - -#define SYSFS_PMCSETUP(NAME, ADDRESS) \ - __SYSFS_SPRSETUP_READ_WRITE(NAME, ADDRESS, ppc_enable_pmcs()) \ - __SYSFS_SPRSETUP_SHOW_STORE(NAME) -#define SYSFS_SPRSETUP(NAME, ADDRESS) \ - __SYSFS_SPRSETUP_READ_WRITE(NAME, ADDRESS, ) \ - __SYSFS_SPRSETUP_SHOW_STORE(NAME) - -#define SYSFS_SPRSETUP_SHOW_STORE(NAME) \ - __SYSFS_SPRSETUP_SHOW_STORE(NAME) /* Let's define all possible registers, we'll only hook up the ones * that are implemented on the current processor */ -#if defined(CONFIG_PPC64) +#ifdef CONFIG_PMU_SYSFS +#if defined(CONFIG_PPC64) || defined(CONFIG_PPC_BOOK3S_32) #define HAS_PPC_PMC_CLASSIC 1 #define HAS_PPC_PMC_IBM 1 -#define HAS_PPC_PMC_PA6T 1 -#elif defined(CONFIG_PPC_BOOK3S_32) -#define HAS_PPC_PMC_CLASSIC 1 -#define HAS_PPC_PMC_IBM 1 -#define HAS_PPC_PMC_G4 1 #endif +#ifdef CONFIG_PPC64 +#define HAS_PPC_PMC_PA6T 1 +#define HAS_PPC_PMC56 1 +#endif -#ifdef HAS_PPC_PMC_CLASSIC -SYSFS_PMCSETUP(mmcr0, SPRN_MMCR0); -SYSFS_PMCSETUP(mmcr1, SPRN_MMCR1); -SYSFS_PMCSETUP(pmc1, SPRN_PMC1); -SYSFS_PMCSETUP(pmc2, SPRN_PMC2); -SYSFS_PMCSETUP(pmc3, SPRN_PMC3); -SYSFS_PMCSETUP(pmc4, SPRN_PMC4); -SYSFS_PMCSETUP(pmc5, SPRN_PMC5); -SYSFS_PMCSETUP(pmc6, SPRN_PMC6); - -#ifdef HAS_PPC_PMC_G4 -SYSFS_PMCSETUP(mmcr2, SPRN_MMCR2); +#ifdef CONFIG_PPC_BOOK3S_32 +#define HAS_PPC_PMC_G4 1 #endif +#endif /* CONFIG_PMU_SYSFS */ +#if defined(CONFIG_PPC64) && defined(CONFIG_DEBUG_MISC) +#define HAS_PPC_PA6T +#endif +/* + * SPRs which are not related to PMU. + */ #ifdef CONFIG_PPC64 -SYSFS_PMCSETUP(pmc7, SPRN_PMC7); -SYSFS_PMCSETUP(pmc8, SPRN_PMC8); - -SYSFS_PMCSETUP(mmcra, SPRN_MMCRA); SYSFS_SPRSETUP(purr, SPRN_PURR); SYSFS_SPRSETUP(spurr, SPRN_SPURR); SYSFS_SPRSETUP(pir, SPRN_PIR); @@ -495,115 +592,40 @@ SYSFS_SPRSETUP(tscr, SPRN_TSCR); enable write when needed with a separate function. Lets be conservative and default to pseries. */ -static DEVICE_ATTR(mmcra, 0600, show_mmcra, store_mmcra); static DEVICE_ATTR(spurr, 0400, show_spurr, NULL); static DEVICE_ATTR(purr, 0400, show_purr, store_purr); static DEVICE_ATTR(pir, 0400, show_pir, NULL); static DEVICE_ATTR(tscr, 0600, show_tscr, store_tscr); +#endif /* CONFIG_PPC64 */ -/* - * This is the system wide DSCR register default value. Any - * change to this default value through the sysfs interface - * will update all per cpu DSCR default values across the - * system stored in their respective PACA structures. - */ -static unsigned long dscr_default; - -/** - * read_dscr() - Fetch the cpu specific DSCR default - * @val: Returned cpu specific DSCR default value - * - * This function returns the per cpu DSCR default value - * for any cpu which is contained in it's PACA structure. - */ -static void read_dscr(void *val) -{ - *(unsigned long *)val = get_paca()->dscr_default; -} - - -/** - * write_dscr() - Update the cpu specific DSCR default - * @val: New cpu specific DSCR default value to update - * - * This function updates the per cpu DSCR default value - * for any cpu which is contained in it's PACA structure. - */ -static void write_dscr(void *val) -{ - get_paca()->dscr_default = *(unsigned long *)val; - if (!current->thread.dscr_inherit) { - current->thread.dscr = *(unsigned long *)val; - mtspr(SPRN_DSCR, *(unsigned long *)val); - } -} - -SYSFS_SPRSETUP_SHOW_STORE(dscr); -static DEVICE_ATTR(dscr, 0600, show_dscr, store_dscr); - -static void add_write_permission_dev_attr(struct device_attribute *attr) -{ - attr->attr.mode |= 0200; -} - -/** - * show_dscr_default() - Fetch the system wide DSCR default - * @dev: Device structure - * @attr: Device attribute structure - * @buf: Interface buffer - * - * This function returns the system wide DSCR default value. - */ -static ssize_t show_dscr_default(struct device *dev, - struct device_attribute *attr, char *buf) -{ - return sprintf(buf, "%lx\n", dscr_default); -} - -/** - * store_dscr_default() - Update the system wide DSCR default - * @dev: Device structure - * @attr: Device attribute structure - * @buf: Interface buffer - * @count: Size of the update - * - * This function updates the system wide DSCR default value. - */ -static ssize_t __used store_dscr_default(struct device *dev, - struct device_attribute *attr, const char *buf, - size_t count) -{ - unsigned long val; - int ret = 0; - - ret = sscanf(buf, "%lx", &val); - if (ret != 1) - return -EINVAL; - dscr_default = val; +#ifdef HAS_PPC_PMC_CLASSIC +SYSFS_PMCSETUP(mmcr0, SPRN_MMCR0); +SYSFS_PMCSETUP(mmcr1, SPRN_MMCR1); +SYSFS_PMCSETUP(pmc1, SPRN_PMC1); +SYSFS_PMCSETUP(pmc2, SPRN_PMC2); +SYSFS_PMCSETUP(pmc3, SPRN_PMC3); +SYSFS_PMCSETUP(pmc4, SPRN_PMC4); +SYSFS_PMCSETUP(pmc5, SPRN_PMC5); +SYSFS_PMCSETUP(pmc6, SPRN_PMC6); +#endif - on_each_cpu(write_dscr, &val, 1); +#ifdef HAS_PPC_PMC_G4 +SYSFS_PMCSETUP(mmcr2, SPRN_MMCR2); +#endif - return count; -} +#ifdef HAS_PPC_PMC56 +SYSFS_PMCSETUP(pmc7, SPRN_PMC7); +SYSFS_PMCSETUP(pmc8, SPRN_PMC8); -static DEVICE_ATTR(dscr_default, 0600, - show_dscr_default, store_dscr_default); +SYSFS_PMCSETUP(mmcra, SPRN_MMCRA); +SYSFS_PMCSETUP(mmcr3, SPRN_MMCR3); -static void sysfs_create_dscr_default(void) -{ - if (cpu_has_feature(CPU_FTR_DSCR)) { - int err = 0; - int cpu; +static DEVICE_ATTR(mmcra, 0600, show_mmcra, store_mmcra); +static DEVICE_ATTR(mmcr3, 0600, show_mmcr3, store_mmcr3); +#endif /* HAS_PPC_PMC56 */ - dscr_default = spr_default_dscr; - for_each_possible_cpu(cpu) - paca_ptrs[cpu]->dscr_default = dscr_default; - err = device_create_file(cpu_subsys.dev_root, &dev_attr_dscr_default); - } -} -#endif /* CONFIG_PPC64 */ #ifdef HAS_PPC_PMC_PA6T SYSFS_PMCSETUP(pa6t_pmc0, SPRN_PA6T_PMC0); @@ -612,7 +634,9 @@ SYSFS_PMCSETUP(pa6t_pmc2, SPRN_PA6T_PMC2); SYSFS_PMCSETUP(pa6t_pmc3, SPRN_PA6T_PMC3); SYSFS_PMCSETUP(pa6t_pmc4, SPRN_PA6T_PMC4); SYSFS_PMCSETUP(pa6t_pmc5, SPRN_PA6T_PMC5); -#ifdef CONFIG_DEBUG_MISC +#endif + +#ifdef HAS_PPC_PA6T SYSFS_SPRSETUP(hid0, SPRN_HID0); SYSFS_SPRSETUP(hid1, SPRN_HID1); SYSFS_SPRSETUP(hid4, SPRN_HID4); @@ -641,15 +665,14 @@ SYSFS_SPRSETUP(tsr0, SPRN_PA6T_TSR0); SYSFS_SPRSETUP(tsr1, SPRN_PA6T_TSR1); SYSFS_SPRSETUP(tsr2, SPRN_PA6T_TSR2); SYSFS_SPRSETUP(tsr3, SPRN_PA6T_TSR3); -#endif /* CONFIG_DEBUG_MISC */ -#endif /* HAS_PPC_PMC_PA6T */ +#endif /* HAS_PPC_PA6T */ #ifdef HAS_PPC_PMC_IBM static struct device_attribute ibm_common_attrs[] = { __ATTR(mmcr0, 0600, show_mmcr0, store_mmcr0), __ATTR(mmcr1, 0600, show_mmcr1, store_mmcr1), }; -#endif /* HAS_PPC_PMC_G4 */ +#endif /* HAS_PPC_PMC_IBM */ #ifdef HAS_PPC_PMC_G4 static struct device_attribute g4_common_attrs[] = { @@ -659,6 +682,7 @@ static struct device_attribute g4_common_attrs[] = { }; #endif /* HAS_PPC_PMC_G4 */ +#ifdef HAS_PPC_PMC_CLASSIC static struct device_attribute classic_pmc_attrs[] = { __ATTR(pmc1, 0600, show_pmc1, store_pmc1), __ATTR(pmc2, 0600, show_pmc2, store_pmc2), @@ -666,14 +690,16 @@ static struct device_attribute classic_pmc_attrs[] = { __ATTR(pmc4, 0600, show_pmc4, store_pmc4), __ATTR(pmc5, 0600, show_pmc5, store_pmc5), __ATTR(pmc6, 0600, show_pmc6, store_pmc6), -#ifdef CONFIG_PPC64 +#ifdef HAS_PPC_PMC56 __ATTR(pmc7, 0600, show_pmc7, store_pmc7), __ATTR(pmc8, 0600, show_pmc8, store_pmc8), #endif }; +#endif -#ifdef HAS_PPC_PMC_PA6T +#if defined(HAS_PPC_PMC_PA6T) || defined(HAS_PPC_PA6T) static struct device_attribute pa6t_attrs[] = { +#ifdef HAS_PPC_PMC_PA6T __ATTR(mmcr0, 0600, show_mmcr0, store_mmcr0), __ATTR(mmcr1, 0600, show_mmcr1, store_mmcr1), __ATTR(pmc0, 0600, show_pa6t_pmc0, store_pa6t_pmc0), @@ -682,7 +708,8 @@ static struct device_attribute pa6t_attrs[] = { __ATTR(pmc3, 0600, show_pa6t_pmc3, store_pa6t_pmc3), __ATTR(pmc4, 0600, show_pa6t_pmc4, store_pa6t_pmc4), __ATTR(pmc5, 0600, show_pa6t_pmc5, store_pa6t_pmc5), -#ifdef CONFIG_DEBUG_MISC +#endif +#ifdef HAS_PPC_PA6T __ATTR(hid0, 0600, show_hid0, store_hid0), __ATTR(hid1, 0600, show_hid1, store_hid1), __ATTR(hid4, 0600, show_hid4, store_hid4), @@ -711,10 +738,9 @@ static struct device_attribute pa6t_attrs[] = { __ATTR(tsr1, 0600, show_tsr1, store_tsr1), __ATTR(tsr2, 0600, show_tsr2, store_tsr2), __ATTR(tsr3, 0600, show_tsr3, store_tsr3), -#endif /* CONFIG_DEBUG_MISC */ +#endif /* HAS_PPC_PA6T */ }; -#endif /* HAS_PPC_PMC_PA6T */ -#endif /* HAS_PPC_PMC_CLASSIC */ +#endif #ifdef CONFIG_PPC_SVM static ssize_t show_svm(struct device *dev, struct device_attribute *attr, char *buf) @@ -723,16 +749,89 @@ static ssize_t show_svm(struct device *dev, struct device_attribute *attr, char } static DEVICE_ATTR(svm, 0444, show_svm, NULL); -static void create_svm_file(void) +static void __init create_svm_file(void) { - device_create_file(cpu_subsys.dev_root, &dev_attr_svm); + struct device *dev_root = bus_get_dev_root(&cpu_subsys); + + if (dev_root) { + device_create_file(dev_root, &dev_attr_svm); + put_device(dev_root); + } } #else -static void create_svm_file(void) +static void __init create_svm_file(void) { } #endif /* CONFIG_PPC_SVM */ +#ifdef CONFIG_PPC_PSERIES +static void read_idle_purr(void *val) +{ + u64 *ret = val; + + *ret = read_this_idle_purr(); +} + +static ssize_t idle_purr_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cpu *cpu = container_of(dev, struct cpu, dev); + u64 val; + + smp_call_function_single(cpu->dev.id, read_idle_purr, &val, 1); + return sprintf(buf, "%llx\n", val); +} +static DEVICE_ATTR(idle_purr, 0400, idle_purr_show, NULL); + +static void create_idle_purr_file(struct device *s) +{ + if (firmware_has_feature(FW_FEATURE_LPAR)) + device_create_file(s, &dev_attr_idle_purr); +} + +static void remove_idle_purr_file(struct device *s) +{ + if (firmware_has_feature(FW_FEATURE_LPAR)) + device_remove_file(s, &dev_attr_idle_purr); +} + +static void read_idle_spurr(void *val) +{ + u64 *ret = val; + + *ret = read_this_idle_spurr(); +} + +static ssize_t idle_spurr_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cpu *cpu = container_of(dev, struct cpu, dev); + u64 val; + + smp_call_function_single(cpu->dev.id, read_idle_spurr, &val, 1); + return sprintf(buf, "%llx\n", val); +} +static DEVICE_ATTR(idle_spurr, 0400, idle_spurr_show, NULL); + +static void create_idle_spurr_file(struct device *s) +{ + if (firmware_has_feature(FW_FEATURE_LPAR)) + device_create_file(s, &dev_attr_idle_spurr); +} + +static void remove_idle_spurr_file(struct device *s) +{ + if (firmware_has_feature(FW_FEATURE_LPAR)) + device_remove_file(s, &dev_attr_idle_spurr); +} + +#else /* CONFIG_PPC_PSERIES */ +#define create_idle_purr_file(s) +#define remove_idle_purr_file(s) +#define create_idle_spurr_file(s) +#define remove_idle_spurr_file(s) +#endif /* CONFIG_PPC_PSERIES */ + static int register_cpu_online(unsigned int cpu) { struct cpu *c = &per_cpu(cpu_devices, cpu); @@ -754,25 +853,25 @@ static int register_cpu_online(unsigned int cpu) #ifdef HAS_PPC_PMC_IBM case PPC_PMC_IBM: attrs = ibm_common_attrs; - nattrs = sizeof(ibm_common_attrs) / sizeof(struct device_attribute); + nattrs = ARRAY_SIZE(ibm_common_attrs); pmc_attrs = classic_pmc_attrs; break; #endif /* HAS_PPC_PMC_IBM */ #ifdef HAS_PPC_PMC_G4 case PPC_PMC_G4: attrs = g4_common_attrs; - nattrs = sizeof(g4_common_attrs) / sizeof(struct device_attribute); + nattrs = ARRAY_SIZE(g4_common_attrs); pmc_attrs = classic_pmc_attrs; break; #endif /* HAS_PPC_PMC_G4 */ -#ifdef HAS_PPC_PMC_PA6T +#if defined(HAS_PPC_PMC_PA6T) || defined(HAS_PPC_PA6T) case PPC_PMC_PA6T: /* PA Semi starts counting at PMC0 */ attrs = pa6t_attrs; - nattrs = sizeof(pa6t_attrs) / sizeof(struct device_attribute); + nattrs = ARRAY_SIZE(pa6t_attrs); pmc_attrs = NULL; break; -#endif /* HAS_PPC_PMC_PA6T */ +#endif default: attrs = NULL; nattrs = 0; @@ -787,17 +886,25 @@ static int register_cpu_online(unsigned int cpu) device_create_file(s, &pmc_attrs[i]); #ifdef CONFIG_PPC64 +#ifdef CONFIG_PMU_SYSFS if (cpu_has_feature(CPU_FTR_MMCRA)) device_create_file(s, &dev_attr_mmcra); + if (cpu_has_feature(CPU_FTR_ARCH_31)) + device_create_file(s, &dev_attr_mmcr3); +#endif /* CONFIG_PMU_SYSFS */ + if (cpu_has_feature(CPU_FTR_PURR)) { if (!firmware_has_feature(FW_FEATURE_LPAR)) add_write_permission_dev_attr(&dev_attr_purr); device_create_file(s, &dev_attr_purr); + create_idle_purr_file(s); } - if (cpu_has_feature(CPU_FTR_SPURR)) + if (cpu_has_feature(CPU_FTR_SPURR)) { device_create_file(s, &dev_attr_spurr); + create_idle_spurr_file(s); + } if (cpu_has_feature(CPU_FTR_DSCR)) device_create_file(s, &dev_attr_dscr); @@ -810,7 +917,7 @@ static int register_cpu_online(unsigned int cpu) device_create_file(s, &dev_attr_tscr); #endif /* CONFIG_PPC64 */ -#ifdef CONFIG_PPC_FSL_BOOK3E +#ifdef CONFIG_PPC_E500 if (PVR_VER(cur_cpu_spec->pvr_value) == PVR_VER_E6500) { device_create_file(s, &dev_attr_pw20_state); device_create_file(s, &dev_attr_pw20_wait_time); @@ -831,7 +938,8 @@ static int unregister_cpu_online(unsigned int cpu) struct device_attribute *attrs, *pmc_attrs; int i, nattrs; - BUG_ON(!c->hotpluggable); + if (WARN_RATELIMIT(!c->hotpluggable, "cpu %d can't be offlined\n", cpu)) + return -EBUSY; #ifdef CONFIG_PPC64 if (cpu_has_feature(CPU_FTR_SMT)) @@ -843,25 +951,25 @@ static int unregister_cpu_online(unsigned int cpu) #ifdef HAS_PPC_PMC_IBM case PPC_PMC_IBM: attrs = ibm_common_attrs; - nattrs = sizeof(ibm_common_attrs) / sizeof(struct device_attribute); + nattrs = ARRAY_SIZE(ibm_common_attrs); pmc_attrs = classic_pmc_attrs; break; #endif /* HAS_PPC_PMC_IBM */ #ifdef HAS_PPC_PMC_G4 case PPC_PMC_G4: attrs = g4_common_attrs; - nattrs = sizeof(g4_common_attrs) / sizeof(struct device_attribute); + nattrs = ARRAY_SIZE(g4_common_attrs); pmc_attrs = classic_pmc_attrs; break; #endif /* HAS_PPC_PMC_G4 */ -#ifdef HAS_PPC_PMC_PA6T +#if defined(HAS_PPC_PMC_PA6T) || defined(HAS_PPC_PA6T) case PPC_PMC_PA6T: /* PA Semi starts counting at PMC0 */ attrs = pa6t_attrs; - nattrs = sizeof(pa6t_attrs) / sizeof(struct device_attribute); + nattrs = ARRAY_SIZE(pa6t_attrs); pmc_attrs = NULL; break; -#endif /* HAS_PPC_PMC_PA6T */ +#endif default: attrs = NULL; nattrs = 0; @@ -876,14 +984,23 @@ static int unregister_cpu_online(unsigned int cpu) device_remove_file(s, &pmc_attrs[i]); #ifdef CONFIG_PPC64 +#ifdef CONFIG_PMU_SYSFS if (cpu_has_feature(CPU_FTR_MMCRA)) device_remove_file(s, &dev_attr_mmcra); - if (cpu_has_feature(CPU_FTR_PURR)) + if (cpu_has_feature(CPU_FTR_ARCH_31)) + device_remove_file(s, &dev_attr_mmcr3); +#endif /* CONFIG_PMU_SYSFS */ + + if (cpu_has_feature(CPU_FTR_PURR)) { device_remove_file(s, &dev_attr_purr); + remove_idle_purr_file(s); + } - if (cpu_has_feature(CPU_FTR_SPURR)) + if (cpu_has_feature(CPU_FTR_SPURR)) { device_remove_file(s, &dev_attr_spurr); + remove_idle_spurr_file(s); + } if (cpu_has_feature(CPU_FTR_DSCR)) device_remove_file(s, &dev_attr_dscr); @@ -896,7 +1013,7 @@ static int unregister_cpu_online(unsigned int cpu) device_remove_file(s, &dev_attr_tscr); #endif /* CONFIG_PPC64 */ -#ifdef CONFIG_PPC_FSL_BOOK3E +#ifdef CONFIG_PPC_E500 if (PVR_VER(cur_cpu_spec->pvr_value) == PVR_VER_E6500) { device_remove_file(s, &dev_attr_pw20_state); device_remove_file(s, &dev_attr_pw20_wait_time); @@ -1003,14 +1120,6 @@ EXPORT_SYMBOL_GPL(cpu_remove_dev_attr_group); /* NUMA stuff */ #ifdef CONFIG_NUMA -static void register_nodes(void) -{ - int i; - - for (i = 0; i < MAX_NUMNODES; i++) - register_one_node(i); -} - int sysfs_add_device_to_node(struct device *dev, int nid) { struct node *node = node_devices[nid]; @@ -1025,13 +1134,6 @@ void sysfs_remove_device_from_node(struct device *dev, int nid) sysfs_remove_link(&node->dev.kobj, kobject_name(&dev->kobj)); } EXPORT_SYMBOL_GPL(sysfs_remove_device_from_node); - -#else -static void register_nodes(void) -{ - return; -} - #endif /* Only valid if CPU is present. */ @@ -1048,11 +1150,10 @@ static int __init topology_init(void) { int cpu, r; - register_nodes(); - for_each_possible_cpu(cpu) { struct cpu *c = &per_cpu(cpu_devices, cpu); +#ifdef CONFIG_HOTPLUG_CPU /* * For now, we just see if the system supports making * the RTAS calls for CPU hotplug. But, there may be a @@ -1060,8 +1161,9 @@ static int __init topology_init(void) * CPU. For instance, the boot cpu might never be valid * for hotplugging. */ - if (ppc_md.cpu_die) + if (smp_ops && smp_ops->cpu_offline_self) c->hotpluggable = 1; +#endif if (cpu_online(cpu) || c->hotpluggable) { register_cpu(c, cpu); diff --git a/arch/powerpc/kernel/systbl.S b/arch/powerpc/kernel/systbl.S deleted file mode 100644 index 5b905a2f4e4d..000000000000 --- a/arch/powerpc/kernel/systbl.S +++ /dev/null @@ -1,40 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * This file contains the table of syscall-handling functions. - * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) - * - * Largely rewritten by Cort Dougan (cort@cs.nmt.edu) - * and Paul Mackerras. - * - * Adapted for iSeries by Mike Corrigan (mikejc@us.ibm.com) - * PPC64 updates by Dave Engebretsen (engebret@us.ibm.com) - */ - -#include <asm/ppc_asm.h> - -.section .rodata,"a" - -#ifdef CONFIG_PPC64 - .p2align 3 -#endif - -.globl sys_call_table -sys_call_table: -#ifdef CONFIG_PPC64 -#define __SYSCALL(nr, entry) .8byte DOTSYM(entry) -#include <asm/syscall_table_64.h> -#undef __SYSCALL -#else -#define __SYSCALL(nr, entry) .long entry -#include <asm/syscall_table_32.h> -#undef __SYSCALL -#endif - -#ifdef CONFIG_COMPAT -.globl compat_sys_call_table -compat_sys_call_table: -#define compat_sys_sigsuspend sys_sigsuspend -#define __SYSCALL(nr, entry) .8byte DOTSYM(entry) -#include <asm/syscall_table_c32.h> -#undef __SYSCALL -#endif diff --git a/arch/powerpc/kernel/systbl.c b/arch/powerpc/kernel/systbl.c new file mode 100644 index 000000000000..4305f2a2162f --- /dev/null +++ b/arch/powerpc/kernel/systbl.c @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * This file contains the table of syscall-handling functions. + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Largely rewritten by Cort Dougan (cort@cs.nmt.edu) + * and Paul Mackerras. + * + * Adapted for iSeries by Mike Corrigan (mikejc@us.ibm.com) + * PPC64 updates by Dave Engebretsen (engebret@us.ibm.com) + */ + +#include <linux/syscalls.h> +#include <linux/compat.h> +#include <asm/unistd.h> +#include <asm/syscalls.h> + +#undef __SYSCALL_WITH_COMPAT +#define __SYSCALL_WITH_COMPAT(nr, entry, compat) __SYSCALL(nr, entry) + +#undef __SYSCALL +#ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER +#define __SYSCALL(nr, entry) [nr] = entry, +#else +/* + * Coerce syscall handlers with arbitrary parameters to common type + * requires cast to void* to avoid -Wcast-function-type. + */ +#define __SYSCALL(nr, entry) [nr] = (void *) entry, +#endif + +const syscall_fn sys_call_table[] = { +#ifdef CONFIG_PPC64 +#include <asm/syscall_table_64.h> +#else +#include <asm/syscall_table_32.h> +#endif +}; + +#ifdef CONFIG_COMPAT +#undef __SYSCALL_WITH_COMPAT +#define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, compat) +const syscall_fn compat_sys_call_table[] = { +#include <asm/syscall_table_32.h> +}; +#endif /* CONFIG_COMPAT */ diff --git a/arch/powerpc/kernel/systbl_chk.sh b/arch/powerpc/kernel/systbl_chk.sh deleted file mode 100644 index c7ac3ed657c4..000000000000 --- a/arch/powerpc/kernel/systbl_chk.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0-or-later -# -# Just process the CPP output from systbl_chk.c and complain -# if anything is out of order. -# -# Copyright © 2008 IBM Corporation -# - -awk 'BEGIN { num = -1; } # Ignore the beginning of the file - /^#/ { next; } - /^[ \t]*$/ { next; } - /^START_TABLE/ { num = 0; next; } - /^END_TABLE/ { - if (num != $2) { - printf "Error: NR_syscalls (%s) is not one more than the last syscall (%s)\n", - $2, num - 1; - exit(1); - } - num = -1; # Ignore the rest of the file - } - { - if (num == -1) next; - if (($1 != -1) && ($1 != num)) { - printf "Error: Syscall %s out of order (expected %s)\n", - $1, num; - exit(1); - }; - num++; - }' "$1" diff --git a/arch/powerpc/kernel/tau_6xx.c b/arch/powerpc/kernel/tau_6xx.c index e2ab8a111b69..cba6dd15de3b 100644 --- a/arch/powerpc/kernel/tau_6xx.c +++ b/arch/powerpc/kernel/tau_6xx.c @@ -13,21 +13,22 @@ */ #include <linux/errno.h> -#include <linux/jiffies.h> #include <linux/kernel.h> #include <linux/param.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/init.h> +#include <linux/delay.h> +#include <linux/workqueue.h> +#include <asm/interrupt.h> #include <asm/io.h> #include <asm/reg.h> #include <asm/nvram.h> #include <asm/cache.h> #include <asm/8xx_immap.h> #include <asm/machdep.h> -#include <asm/asm-prototypes.h> #include "setup.h" @@ -39,9 +40,7 @@ static struct tau_temp unsigned char grew; } tau[NR_CPUS]; -struct timer_list tau_timer; - -#undef DEBUG +static bool tau_int_enable; /* TODO: put these in a /proc interface, with some sanity checks, and maybe * dynamic adjustment to minimize # of interrupts */ @@ -50,72 +49,49 @@ struct timer_list tau_timer; #define step_size 2 /* step size when temp goes out of range */ #define window_expand 1 /* expand the window by this much */ /* configurable values for shrinking the window */ -#define shrink_timer 2*HZ /* period between shrinking the window */ +#define shrink_timer 2000 /* period between shrinking the window */ #define min_window 2 /* minimum window size, degrees C */ static void set_thresholds(unsigned long cpu) { -#ifdef CONFIG_TAU_INT - /* - * setup THRM1, - * threshold, valid bit, enable interrupts, interrupt when below threshold - */ - mtspr(SPRN_THRM1, THRM1_THRES(tau[cpu].low) | THRM1_V | THRM1_TIE | THRM1_TID); + u32 maybe_tie = tau_int_enable ? THRM1_TIE : 0; - /* setup THRM2, - * threshold, valid bit, enable interrupts, interrupt when above threshold - */ - mtspr (SPRN_THRM2, THRM1_THRES(tau[cpu].high) | THRM1_V | THRM1_TIE); -#else - /* same thing but don't enable interrupts */ - mtspr(SPRN_THRM1, THRM1_THRES(tau[cpu].low) | THRM1_V | THRM1_TID); - mtspr(SPRN_THRM2, THRM1_THRES(tau[cpu].high) | THRM1_V); -#endif + /* setup THRM1, threshold, valid bit, interrupt when below threshold */ + mtspr(SPRN_THRM1, THRM1_THRES(tau[cpu].low) | THRM1_V | maybe_tie | THRM1_TID); + + /* setup THRM2, threshold, valid bit, interrupt when above threshold */ + mtspr(SPRN_THRM2, THRM1_THRES(tau[cpu].high) | THRM1_V | maybe_tie); } static void TAUupdate(int cpu) { - unsigned thrm; - -#ifdef DEBUG - printk("TAUupdate "); -#endif + u32 thrm; + u32 bits = THRM1_TIV | THRM1_TIN | THRM1_V; /* if both thresholds are crossed, the step_sizes cancel out * and the window winds up getting expanded twice. */ - if((thrm = mfspr(SPRN_THRM1)) & THRM1_TIV){ /* is valid? */ - if(thrm & THRM1_TIN){ /* crossed low threshold */ - if (tau[cpu].low >= step_size){ - tau[cpu].low -= step_size; - tau[cpu].high -= (step_size - window_expand); - } - tau[cpu].grew = 1; -#ifdef DEBUG - printk("low threshold crossed "); -#endif + thrm = mfspr(SPRN_THRM1); + if ((thrm & bits) == bits) { + mtspr(SPRN_THRM1, 0); + + if (tau[cpu].low >= step_size) { + tau[cpu].low -= step_size; + tau[cpu].high -= (step_size - window_expand); } + tau[cpu].grew = 1; + pr_debug("%s: low threshold crossed\n", __func__); } - if((thrm = mfspr(SPRN_THRM2)) & THRM1_TIV){ /* is valid? */ - if(thrm & THRM1_TIN){ /* crossed high threshold */ - if (tau[cpu].high <= 127-step_size){ - tau[cpu].low += (step_size - window_expand); - tau[cpu].high += step_size; - } - tau[cpu].grew = 1; -#ifdef DEBUG - printk("high threshold crossed "); -#endif + thrm = mfspr(SPRN_THRM2); + if ((thrm & bits) == bits) { + mtspr(SPRN_THRM2, 0); + + if (tau[cpu].high <= 127 - step_size) { + tau[cpu].low += (step_size - window_expand); + tau[cpu].high += step_size; } + tau[cpu].grew = 1; + pr_debug("%s: high threshold crossed\n", __func__); } - -#ifdef DEBUG - printk("grew = %d\n", tau[cpu].grew); -#endif - -#ifndef CONFIG_TAU_INT /* tau_timeout will do this if not using interrupts */ - set_thresholds(cpu); -#endif - } #ifdef CONFIG_TAU_INT @@ -124,33 +100,29 @@ static void TAUupdate(int cpu) * with interrupts disabled */ -void TAUException(struct pt_regs * regs) +DEFINE_INTERRUPT_HANDLER_ASYNC(TAUException) { int cpu = smp_processor_id(); - irq_enter(); tau[cpu].interrupts++; TAUupdate(cpu); - - irq_exit(); } #endif /* CONFIG_TAU_INT */ static void tau_timeout(void * info) { int cpu; - unsigned long flags; int size; int shrink; - /* disabling interrupts *should* be okay */ - local_irq_save(flags); cpu = smp_processor_id(); -#ifndef CONFIG_TAU_INT - TAUupdate(cpu); -#endif + if (!tau_int_enable) + TAUupdate(cpu); + + /* Stop thermal sensor comparisons and interrupts */ + mtspr(SPRN_THRM3, 0); size = tau[cpu].high - tau[cpu].low; if (size > min_window && ! tau[cpu].grew) { @@ -173,32 +145,26 @@ static void tau_timeout(void * info) set_thresholds(cpu); - /* - * Do the enable every time, since otherwise a bunch of (relatively) - * complex sleep code needs to be added. One mtspr every time - * tau_timeout is called is probably not a big deal. - * - * Enable thermal sensor and set up sample interval timer - * need 20 us to do the compare.. until a nice 'cpu_speed' function - * call is implemented, just assume a 500 mhz clock. It doesn't really - * matter if we take too long for a compare since it's all interrupt - * driven anyway. - * - * use a extra long time.. (60 us @ 500 mhz) + /* Restart thermal sensor comparisons and interrupts. + * The "PowerPC 740 and PowerPC 750 Microprocessor Datasheet" + * recommends that "the maximum value be set in THRM3 under all + * conditions." */ - mtspr(SPRN_THRM3, THRM3_SITV(500*60) | THRM3_E); - - local_irq_restore(flags); + mtspr(SPRN_THRM3, THRM3_SITV(0x1fff) | THRM3_E); } -static void tau_timeout_smp(struct timer_list *unused) -{ +static struct workqueue_struct *tau_workq; - /* schedule ourselves to be run again */ - mod_timer(&tau_timer, jiffies + shrink_timer) ; +static void tau_work_func(struct work_struct *work) +{ + msleep(shrink_timer); on_each_cpu(tau_timeout, NULL, 0); + /* schedule ourselves to be run again */ + queue_work(tau_workq, work); } +static DECLARE_WORK(tau_work, tau_work_func); + /* * setup the TAU * @@ -231,21 +197,19 @@ static int __init TAU_init(void) return 1; } + tau_int_enable = IS_ENABLED(CONFIG_TAU_INT) && + !strcmp(cur_cpu_spec->platform, "ppc750"); - /* first, set up the window shrinking timer */ - timer_setup(&tau_timer, tau_timeout_smp, 0); - tau_timer.expires = jiffies + shrink_timer; - add_timer(&tau_timer); + tau_workq = alloc_ordered_workqueue("tau", 0); + if (!tau_workq) + return -ENOMEM; on_each_cpu(TAU_init_smp, NULL, 0); - printk("Thermal assist unit "); -#ifdef CONFIG_TAU_INT - printk("using interrupts, "); -#else - printk("using timers, "); -#endif - printk("shrink_timer: %d jiffies\n", shrink_timer); + queue_work(tau_workq, &tau_work); + + pr_info("Thermal assist unit using %s, shrink_timer: %d ms\n", + tau_int_enable ? "interrupts" : "workqueue", shrink_timer); tau_initialized = 1; return 0; diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 1168e8b37e30..df20cf201f74 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -31,6 +31,7 @@ #include <linux/export.h> #include <linux/sched.h> #include <linux/sched/clock.h> +#include <linux/sched/cputime.h> #include <linux/kernel.h> #include <linux/param.h> #include <linux/string.h> @@ -50,40 +51,32 @@ #include <linux/irq.h> #include <linux/delay.h> #include <linux/irq_work.h> -#include <linux/clk-provider.h> +#include <linux/of_clk.h> #include <linux/suspend.h> -#include <linux/sched/cputime.h> #include <linux/processor.h> -#include <asm/trace.h> +#include <linux/mc146818rtc.h> +#include <linux/platform_device.h> +#include <asm/trace.h> +#include <asm/interrupt.h> #include <asm/io.h> #include <asm/nvram.h> #include <asm/cache.h> #include <asm/machdep.h> #include <linux/uaccess.h> #include <asm/time.h> -#include <asm/prom.h> #include <asm/irq.h> #include <asm/div64.h> #include <asm/smp.h> #include <asm/vdso_datapage.h> #include <asm/firmware.h> -#include <asm/asm-prototypes.h> +#include <asm/mce.h> /* powerpc clocksource/clockevent code */ #include <linux/clockchips.h> #include <linux/timekeeper_internal.h> -static u64 rtc_read(struct clocksource *); -static struct clocksource clocksource_rtc = { - .name = "rtc", - .rating = 400, - .flags = CLOCK_SOURCE_IS_CONTINUOUS, - .mask = CLOCKSOURCE_MASK(64), - .read = rtc_read, -}; - static u64 timebase_read(struct clocksource *); static struct clocksource clocksource_timebase = { .name = "timebase", @@ -91,10 +84,12 @@ static struct clocksource clocksource_timebase = { .flags = CLOCK_SOURCE_IS_CONTINUOUS, .mask = CLOCKSOURCE_MASK(64), .read = timebase_read, + .vdso_clock_mode = VDSO_CLOCKMODE_ARCHTIMER, }; #define DECREMENTER_DEFAULT_MAX 0x7FFFFFFF u64 decrementer_max = DECREMENTER_DEFAULT_MAX; +EXPORT_SYMBOL_GPL(decrementer_max); /* for KVM HDEC */ static int decrementer_set_next_event(unsigned long evt, struct clock_event_device *dev); @@ -113,7 +108,13 @@ struct clock_event_device decrementer_clockevent = { }; EXPORT_SYMBOL(decrementer_clockevent); -DEFINE_PER_CPU(u64, decrementers_next_tb); +/* + * This always puts next_tb beyond now, so the clock event will never fire + * with the usual comparison, no need for a separate test for stopped. + */ +#define DEC_CLOCKEVENT_STOPPED ~0ULL +DEFINE_PER_CPU(u64, decrementers_next_tb) = DEC_CLOCKEVENT_STOPPED; +EXPORT_SYMBOL_GPL(decrementers_next_tb); static DEFINE_PER_CPU(struct clock_event_device, decrementers); #define XSEC_PER_SEC (1024*1024) @@ -129,7 +130,7 @@ unsigned long tb_ticks_per_jiffy; unsigned long tb_ticks_per_usec = 100; /* sane default */ EXPORT_SYMBOL(tb_ticks_per_usec); unsigned long tb_ticks_per_sec; -EXPORT_SYMBOL(tb_ticks_per_sec); /* for cputime_t conversions */ +EXPORT_SYMBOL(tb_ticks_per_sec); /* for cputime conversions */ DEFINE_SPINLOCK(rtc_lock); EXPORT_SYMBOL_GPL(rtc_lock); @@ -150,25 +151,6 @@ bool tb_invalid; #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE /* - * Factor for converting from cputime_t (timebase ticks) to - * microseconds. This is stored as 0.64 fixed-point binary fraction. - */ -u64 __cputime_usec_factor; -EXPORT_SYMBOL(__cputime_usec_factor); - -#ifdef CONFIG_PPC_SPLPAR -void (*dtl_consumer)(struct dtl_entry *, u64); -#endif - -static void calc_cputime_factors(void) -{ - struct div_result res; - - div128_by_32(1000000, 0, tb_ticks_per_sec, &res); - __cputime_usec_factor = res.result_low; -} - -/* * Read the SPURR on systems that have it, otherwise the PURR, * or if that doesn't exist return the timebase value passed in. */ @@ -181,99 +163,6 @@ static inline unsigned long read_spurr(unsigned long tb) return tb; } -#ifdef CONFIG_PPC_SPLPAR - -/* - * Scan the dispatch trace log and count up the stolen time. - * Should be called with interrupts disabled. - */ -static u64 scan_dispatch_log(u64 stop_tb) -{ - u64 i = local_paca->dtl_ridx; - struct dtl_entry *dtl = local_paca->dtl_curr; - struct dtl_entry *dtl_end = local_paca->dispatch_log_end; - struct lppaca *vpa = local_paca->lppaca_ptr; - u64 tb_delta; - u64 stolen = 0; - u64 dtb; - - if (!dtl) - return 0; - - if (i == be64_to_cpu(vpa->dtl_idx)) - return 0; - while (i < be64_to_cpu(vpa->dtl_idx)) { - dtb = be64_to_cpu(dtl->timebase); - tb_delta = be32_to_cpu(dtl->enqueue_to_dispatch_time) + - be32_to_cpu(dtl->ready_to_enqueue_time); - barrier(); - if (i + N_DISPATCH_LOG < be64_to_cpu(vpa->dtl_idx)) { - /* buffer has overflowed */ - i = be64_to_cpu(vpa->dtl_idx) - N_DISPATCH_LOG; - dtl = local_paca->dispatch_log + (i % N_DISPATCH_LOG); - continue; - } - if (dtb > stop_tb) - break; - if (dtl_consumer) - dtl_consumer(dtl, i); - stolen += tb_delta; - ++i; - ++dtl; - if (dtl == dtl_end) - dtl = local_paca->dispatch_log; - } - local_paca->dtl_ridx = i; - local_paca->dtl_curr = dtl; - return stolen; -} - -/* - * Accumulate stolen time by scanning the dispatch trace log. - * Called on entry from user mode. - */ -void notrace accumulate_stolen_time(void) -{ - u64 sst, ust; - unsigned long save_irq_soft_mask = irq_soft_mask_return(); - struct cpu_accounting_data *acct = &local_paca->accounting; - - /* We are called early in the exception entry, before - * soft/hard_enabled are sync'ed to the expected state - * for the exception. We are hard disabled but the PACA - * needs to reflect that so various debug stuff doesn't - * complain - */ - irq_soft_mask_set(IRQS_DISABLED); - - sst = scan_dispatch_log(acct->starttime_user); - ust = scan_dispatch_log(acct->starttime); - acct->stime -= sst; - acct->utime -= ust; - acct->steal_time += ust + sst; - - irq_soft_mask_set(save_irq_soft_mask); -} - -static inline u64 calculate_stolen_time(u64 stop_tb) -{ - if (!firmware_has_feature(FW_FEATURE_SPLPAR)) - return 0; - - if (get_paca()->dtl_ridx != be64_to_cpu(get_lppaca()->dtl_idx)) - return scan_dispatch_log(stop_tb); - - return 0; -} - -#else /* CONFIG_PPC_SPLPAR */ -static inline u64 calculate_stolen_time(u64 stop_tb) -{ - return 0; -} - -#endif /* CONFIG_PPC_SPLPAR */ - /* * Account time for a transition between system, hard irq * or soft irq state. @@ -318,12 +207,11 @@ static unsigned long vtime_delta_scaled(struct cpu_accounting_data *acct, return stime_scaled; } -static unsigned long vtime_delta(struct task_struct *tsk, +static unsigned long vtime_delta(struct cpu_accounting_data *acct, unsigned long *stime_scaled, unsigned long *steal_time) { unsigned long now, stime; - struct cpu_accounting_data *acct = get_accounting(tsk); WARN_ON_ONCE(!irqs_disabled()); @@ -333,34 +221,39 @@ static unsigned long vtime_delta(struct task_struct *tsk, *stime_scaled = vtime_delta_scaled(acct, now, stime); - *steal_time = calculate_stolen_time(now); + if (IS_ENABLED(CONFIG_PPC_SPLPAR) && + firmware_has_feature(FW_FEATURE_SPLPAR)) + *steal_time = pseries_calculate_stolen_time(now); + else + *steal_time = 0; return stime; } +static void vtime_delta_kernel(struct cpu_accounting_data *acct, + unsigned long *stime, unsigned long *stime_scaled) +{ + unsigned long steal_time; + + *stime = vtime_delta(acct, stime_scaled, &steal_time); + *stime -= min(*stime, steal_time); + acct->steal_time += steal_time; +} + void vtime_account_kernel(struct task_struct *tsk) { - unsigned long stime, stime_scaled, steal_time; struct cpu_accounting_data *acct = get_accounting(tsk); + unsigned long stime, stime_scaled; - stime = vtime_delta(tsk, &stime_scaled, &steal_time); - - stime -= min(stime, steal_time); - acct->steal_time += steal_time; + vtime_delta_kernel(acct, &stime, &stime_scaled); - if ((tsk->flags & PF_VCPU) && !irq_count()) { + if (tsk->flags & PF_VCPU) { acct->gtime += stime; #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME acct->utime_scaled += stime_scaled; #endif } else { - if (hardirq_count()) - acct->hardirq_time += stime; - else if (in_serving_softirq()) - acct->softirq_time += stime; - else - acct->stime += stime; - + acct->stime += stime; #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME acct->stime_scaled += stime_scaled; #endif @@ -373,10 +266,34 @@ void vtime_account_idle(struct task_struct *tsk) unsigned long stime, stime_scaled, steal_time; struct cpu_accounting_data *acct = get_accounting(tsk); - stime = vtime_delta(tsk, &stime_scaled, &steal_time); + stime = vtime_delta(acct, &stime_scaled, &steal_time); acct->idle_time += stime + steal_time; } +static void vtime_account_irq_field(struct cpu_accounting_data *acct, + unsigned long *field) +{ + unsigned long stime, stime_scaled; + + vtime_delta_kernel(acct, &stime, &stime_scaled); + *field += stime; +#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME + acct->stime_scaled += stime_scaled; +#endif +} + +void vtime_account_softirq(struct task_struct *tsk) +{ + struct cpu_accounting_data *acct = get_accounting(tsk); + vtime_account_irq_field(acct, &acct->softirq_time); +} + +void vtime_account_hardirq(struct task_struct *tsk) +{ + struct cpu_accounting_data *acct = get_accounting(tsk); + vtime_account_irq_field(acct, &acct->hardirq_time); +} + static void vtime_flush_scaled(struct task_struct *tsk, struct cpu_accounting_data *acct) { @@ -437,27 +354,14 @@ void vtime_flush(struct task_struct *tsk) acct->hardirq_time = 0; acct->softirq_time = 0; } +#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ -#else /* ! CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ -#define calc_cputime_factors() -#endif - -void __delay(unsigned long loops) +void __no_kcsan __delay(unsigned long loops) { unsigned long start; - int diff; spin_begin(); - if (__USE_RTC()) { - start = get_rtcl(); - do { - /* the RTCL register wraps at 1000000000 */ - diff = get_rtcl() - start; - if (diff < 0) - diff += 1000000000; - spin_cpu_relax(); - } while (diff < loops); - } else if (tb_invalid) { + if (tb_invalid) { /* * TB is in error state and isn't ticking anymore. * HMI handler was unable to recover from TB error. @@ -465,15 +369,15 @@ void __delay(unsigned long loops) */ spin_cpu_relax(); } else { - start = get_tbl(); - while (get_tbl() - start < loops) + start = mftb(); + while (mftb() - start < loops) spin_cpu_relax(); } spin_end(); } EXPORT_SYMBOL(__delay); -void udelay(unsigned long usecs) +void __no_kcsan udelay(unsigned long usecs) { __delay(tb_ticks_per_usec * usecs); } @@ -522,35 +426,6 @@ static inline void clear_irq_work_pending(void) "i" (offsetof(struct paca_struct, irq_work_pending))); } -void arch_irq_work_raise(void) -{ - preempt_disable(); - set_irq_work_pending_flag(); - /* - * Non-nmi code running with interrupts disabled will replay - * irq_happened before it re-enables interrupts, so setthe - * decrementer there instead of causing a hardware exception - * which would immediately hit the masked interrupt handler - * and have the net effect of setting the decrementer in - * irq_happened. - * - * NMI interrupts can not check this when they return, so the - * decrementer hardware exception is raised, which will fire - * when interrupts are next enabled. - * - * BookE does not support this yet, it must audit all NMI - * interrupt handlers to ensure they call nmi_enter() so this - * check would be correct. - */ - if (IS_ENABLED(CONFIG_BOOKE) || !irqs_disabled() || in_nmi()) { - set_dec(1); - } else { - hard_irq_disable(); - local_paca->irq_happened |= PACA_IRQ_DEC; - } - preempt_enable(); -} - #else /* 32-bit */ DEFINE_PER_CPU(u8, irq_work_pending); @@ -559,95 +434,133 @@ DEFINE_PER_CPU(u8, irq_work_pending); #define test_irq_work_pending() __this_cpu_read(irq_work_pending) #define clear_irq_work_pending() __this_cpu_write(irq_work_pending, 0) +#endif /* 32 vs 64 bit */ + void arch_irq_work_raise(void) { + /* + * 64-bit code that uses irq soft-mask can just cause an immediate + * interrupt here that gets soft masked, if this is called under + * local_irq_disable(). It might be possible to prevent that happening + * by noticing interrupts are disabled and setting decrementer pending + * to be replayed when irqs are enabled. The problem there is that + * tracing can call irq_work_raise, including in code that does low + * level manipulations of irq soft-mask state (e.g., trace_hardirqs_on) + * which could get tangled up if we're messing with the same state + * here. + */ preempt_disable(); set_irq_work_pending_flag(); set_dec(1); preempt_enable(); } -#endif /* 32 vs 64 bit */ +static void set_dec_or_work(u64 val) +{ + set_dec(val); + /* We may have raced with new irq work */ + if (unlikely(test_irq_work_pending())) + set_dec(1); +} #else /* CONFIG_IRQ_WORK */ #define test_irq_work_pending() 0 #define clear_irq_work_pending() +static void set_dec_or_work(u64 val) +{ + set_dec(val); +} #endif /* CONFIG_IRQ_WORK */ +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE +void timer_rearm_host_dec(u64 now) +{ + u64 *next_tb = this_cpu_ptr(&decrementers_next_tb); + + WARN_ON_ONCE(!arch_irqs_disabled()); + WARN_ON_ONCE(mfmsr() & MSR_EE); + + if (now >= *next_tb) { + local_paca->irq_happened |= PACA_IRQ_DEC; + } else { + now = *next_tb - now; + if (now > decrementer_max) + now = decrementer_max; + set_dec_or_work(now); + } +} +EXPORT_SYMBOL_GPL(timer_rearm_host_dec); +#endif + /* * timer_interrupt - gets called when the decrementer overflows, * with interrupts disabled. */ -void timer_interrupt(struct pt_regs *regs) +DEFINE_INTERRUPT_HANDLER_ASYNC(timer_interrupt) { struct clock_event_device *evt = this_cpu_ptr(&decrementers); u64 *next_tb = this_cpu_ptr(&decrementers_next_tb); struct pt_regs *old_regs; u64 now; - /* Some implementations of hotplug will get timer interrupts while - * offline, just ignore these and we also need to set - * decrementers_next_tb as MAX to make sure __check_irq_replay - * don't replay timer interrupt when return, otherwise we'll trap - * here infinitely :( + /* + * Some implementations of hotplug will get timer interrupts while + * offline, just ignore these. */ if (unlikely(!cpu_online(smp_processor_id()))) { - *next_tb = ~(u64)0; set_dec(decrementer_max); return; } - /* Ensure a positive value is written to the decrementer, or else - * some CPUs will continue to take decrementer exceptions. When the - * PPC_WATCHDOG (decrementer based) is configured, keep this at most - * 31 bits, which is about 4 seconds on most systems, which gives - * the watchdog a chance of catching timer interrupt hard lockups. - */ - if (IS_ENABLED(CONFIG_PPC_WATCHDOG)) - set_dec(0x7fffffff); - else - set_dec(decrementer_max); - - /* Conditionally hard-enable interrupts now that the DEC has been - * bumped to its maximum value - */ - may_hard_irq_enable(); + /* Conditionally hard-enable interrupts. */ + if (should_hard_irq_enable(regs)) { + /* + * Ensure a positive value is written to the decrementer, or + * else some CPUs will continue to take decrementer exceptions. + * When the PPC_WATCHDOG (decrementer based) is configured, + * keep this at most 31 bits, which is about 4 seconds on most + * systems, which gives the watchdog a chance of catching timer + * interrupt hard lockups. + */ + if (IS_ENABLED(CONFIG_PPC_WATCHDOG)) + set_dec(0x7fffffff); + else + set_dec(decrementer_max); + do_hard_irq_enable(); + } #if defined(CONFIG_PPC32) && defined(CONFIG_PPC_PMAC) if (atomic_read(&ppc_n_lost_interrupts) != 0) - do_IRQ(regs); + __do_IRQ(regs); #endif old_regs = set_irq_regs(regs); - irq_enter(); + trace_timer_interrupt_entry(regs); if (test_irq_work_pending()) { clear_irq_work_pending(); + mce_run_irq_context_handlers(); irq_work_run(); } - now = get_tb_or_rtc(); + now = get_tb(); if (now >= *next_tb) { - *next_tb = ~(u64)0; - if (evt->event_handler) - evt->event_handler(evt); + evt->event_handler(evt); __this_cpu_inc(irq_stat.timer_irqs_event); } else { now = *next_tb - now; - if (now <= decrementer_max) - set_dec(now); - /* We may have raced with new irq work */ - if (test_irq_work_pending()) - set_dec(1); + if (now > decrementer_max) + now = decrementer_max; + set_dec_or_work(now); __this_cpu_inc(irq_stat.timer_irqs_others); } trace_timer_interrupt_exit(regs); - irq_exit(); + set_irq_regs(old_regs); } EXPORT_SYMBOL(timer_interrupt); @@ -655,26 +568,18 @@ EXPORT_SYMBOL(timer_interrupt); #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST void timer_broadcast_interrupt(void) { - u64 *next_tb = this_cpu_ptr(&decrementers_next_tb); - - *next_tb = ~(u64)0; tick_receive_broadcast(); __this_cpu_inc(irq_stat.broadcast_irqs_event); } #endif -/* - * Hypervisor decrementer interrupts shouldn't occur but are sometimes - * left pending on exit from a KVM guest. We don't need to do anything - * to clear them, as they are edge-triggered. - */ -void hdec_interrupt(struct pt_regs *regs) -{ -} - #ifdef CONFIG_SUSPEND -static void generic_suspend_disable_irqs(void) +/* Overrides the weak version in kernel/power/main.c */ +void arch_suspend_disable_irqs(void) { + if (ppc_md.suspend_disable_irqs) + ppc_md.suspend_disable_irqs(); + /* Disable the decrementer, so that it doesn't interfere * with suspending. */ @@ -684,23 +589,11 @@ static void generic_suspend_disable_irqs(void) set_dec(decrementer_max); } -static void generic_suspend_enable_irqs(void) -{ - local_irq_enable(); -} - -/* Overrides the weak version in kernel/power/main.c */ -void arch_suspend_disable_irqs(void) -{ - if (ppc_md.suspend_disable_irqs) - ppc_md.suspend_disable_irqs(); - generic_suspend_disable_irqs(); -} - /* Overrides the weak version in kernel/power/main.c */ void arch_suspend_enable_irqs(void) { - generic_suspend_enable_irqs(); + local_irq_enable(); + if (ppc_md.suspend_enable_irqs) ppc_md.suspend_enable_irqs(); } @@ -721,8 +614,6 @@ EXPORT_SYMBOL_GPL(tb_to_ns); */ notrace unsigned long long sched_clock(void) { - if (__USE_RTC()) - return get_rtc(); return mulhdu(get_tb() - boot_tb, tb_to_ns_scale) << tb_to_ns_shift; } @@ -782,7 +673,7 @@ static int __init get_freq(char *name, int cells, unsigned long *val) static void start_cpu_decrementer(void) { -#if defined(CONFIG_BOOKE) || defined(CONFIG_40x) +#ifdef CONFIG_BOOKE_OR_40x unsigned int tcr; /* Clear any pending timer interrupts */ @@ -838,7 +729,7 @@ static void __read_persistent_clock(struct timespec64 *ts) static int first = 1; ts->tv_nsec = 0; - /* XXX this is a litle fragile but will work okay in the short term */ + /* XXX this is a little fragile but will work okay in the short term */ if (first) { first = 0; if (ppc_md.time_init) @@ -872,113 +763,14 @@ void read_persistent_clock64(struct timespec64 *ts) } /* clocksource code */ -static notrace u64 rtc_read(struct clocksource *cs) -{ - return (u64)get_rtc(); -} - static notrace u64 timebase_read(struct clocksource *cs) { return (u64)get_tb(); } - -void update_vsyscall(struct timekeeper *tk) -{ - struct timespec64 xt; - struct clocksource *clock = tk->tkr_mono.clock; - u32 mult = tk->tkr_mono.mult; - u32 shift = tk->tkr_mono.shift; - u64 cycle_last = tk->tkr_mono.cycle_last; - u64 new_tb_to_xs, new_stamp_xsec; - u64 frac_sec; - - if (clock != &clocksource_timebase) - return; - - xt.tv_sec = tk->xtime_sec; - xt.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); - - /* Make userspace gettimeofday spin until we're done. */ - ++vdso_data->tb_update_count; - smp_mb(); - - /* - * This computes ((2^20 / 1e9) * mult) >> shift as a - * 0.64 fixed-point fraction. - * The computation in the else clause below won't overflow - * (as long as the timebase frequency is >= 1.049 MHz) - * but loses precision because we lose the low bits of the constant - * in the shift. Note that 19342813113834067 ~= 2^(20+64) / 1e9. - * For a shift of 24 the error is about 0.5e-9, or about 0.5ns - * over a second. (Shift values are usually 22, 23 or 24.) - * For high frequency clocks such as the 512MHz timebase clock - * on POWER[6789], the mult value is small (e.g. 32768000) - * and so we can shift the constant by 16 initially - * (295147905179 ~= 2^(20+64-16) / 1e9) and then do the - * remaining shifts after the multiplication, which gives a - * more accurate result (e.g. with mult = 32768000, shift = 24, - * the error is only about 1.2e-12, or 0.7ns over 10 minutes). - */ - if (mult <= 62500000 && clock->shift >= 16) - new_tb_to_xs = ((u64) mult * 295147905179ULL) >> (clock->shift - 16); - else - new_tb_to_xs = (u64) mult * (19342813113834067ULL >> clock->shift); - - /* - * Compute the fractional second in units of 2^-32 seconds. - * The fractional second is tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift - * in nanoseconds, so multiplying that by 2^32 / 1e9 gives - * it in units of 2^-32 seconds. - * We assume shift <= 32 because clocks_calc_mult_shift() - * generates shift values in the range 0 - 32. - */ - frac_sec = tk->tkr_mono.xtime_nsec << (32 - shift); - do_div(frac_sec, NSEC_PER_SEC); - - /* - * Work out new stamp_xsec value for any legacy users of systemcfg. - * stamp_xsec is in units of 2^-20 seconds. - */ - new_stamp_xsec = frac_sec >> 12; - new_stamp_xsec += tk->xtime_sec * XSEC_PER_SEC; - - /* - * tb_update_count is used to allow the userspace gettimeofday code - * to assure itself that it sees a consistent view of the tb_to_xs and - * stamp_xsec variables. It reads the tb_update_count, then reads - * tb_to_xs and stamp_xsec and then reads tb_update_count again. If - * the two values of tb_update_count match and are even then the - * tb_to_xs and stamp_xsec values are consistent. If not, then it - * loops back and reads them again until this criteria is met. - */ - vdso_data->tb_orig_stamp = cycle_last; - vdso_data->stamp_xsec = new_stamp_xsec; - vdso_data->tb_to_xs = new_tb_to_xs; - vdso_data->wtom_clock_sec = tk->wall_to_monotonic.tv_sec; - vdso_data->wtom_clock_nsec = tk->wall_to_monotonic.tv_nsec; - vdso_data->stamp_xtime_sec = xt.tv_sec; - vdso_data->stamp_xtime_nsec = xt.tv_nsec; - vdso_data->stamp_sec_fraction = frac_sec; - vdso_data->hrtimer_res = hrtimer_resolution; - smp_wmb(); - ++(vdso_data->tb_update_count); -} - -void update_vsyscall_tz(void) -{ - vdso_data->tz_minuteswest = sys_tz.tz_minuteswest; - vdso_data->tz_dsttime = sys_tz.tz_dsttime; -} - static void __init clocksource_init(void) { - struct clocksource *clock; - - if (__USE_RTC()) - clock = &clocksource_rtc; - else - clock = &clocksource_timebase; + struct clocksource *clock = &clocksource_timebase; if (clocksource_register_hz(clock, tb_ticks_per_sec)) { printk(KERN_ERR "clocksource: %s is already registered\n", @@ -993,19 +785,17 @@ static void __init clocksource_init(void) static int decrementer_set_next_event(unsigned long evt, struct clock_event_device *dev) { - __this_cpu_write(decrementers_next_tb, get_tb_or_rtc() + evt); - set_dec(evt); - - /* We may have raced with new irq work */ - if (test_irq_work_pending()) - set_dec(1); + __this_cpu_write(decrementers_next_tb, get_tb() + evt); + set_dec_or_work(evt); return 0; } static int decrementer_shutdown(struct clock_event_device *dev) { - decrementer_set_next_event(decrementer_max, dev); + __this_cpu_write(decrementers_next_tb, DEC_CLOCKEVENT_STOPPED); + set_dec_or_work(decrementer_max); + return 0; } @@ -1084,7 +874,7 @@ void secondary_cpu_time_init(void) */ start_cpu_decrementer(); - /* FIME: Should make unrelatred change to move snapshot_timebase + /* FIME: Should make unrelated change to move snapshot_timebase * call here ! */ register_decrementer_clockevent(smp_processor_id()); } @@ -1096,22 +886,20 @@ void __init time_init(void) u64 scale; unsigned shift; - if (__USE_RTC()) { - /* 601 processor: dec counts down by 128 every 128ns */ - ppc_tb_freq = 1000000000; - } else { - /* Normal PowerPC with timebase register */ + /* Normal PowerPC with timebase register */ + if (ppc_md.calibrate_decr) ppc_md.calibrate_decr(); - printk(KERN_DEBUG "time_init: decrementer frequency = %lu.%.6lu MHz\n", - ppc_tb_freq / 1000000, ppc_tb_freq % 1000000); - printk(KERN_DEBUG "time_init: processor frequency = %lu.%.6lu MHz\n", - ppc_proc_freq / 1000000, ppc_proc_freq % 1000000); - } + else + generic_calibrate_decr(); + + printk(KERN_DEBUG "time_init: decrementer frequency = %lu.%.6lu MHz\n", + ppc_tb_freq / 1000000, ppc_tb_freq % 1000000); + printk(KERN_DEBUG "time_init: processor frequency = %lu.%.6lu MHz\n", + ppc_proc_freq / 1000000, ppc_proc_freq % 1000000); tb_ticks_per_jiffy = ppc_tb_freq / HZ; tb_ticks_per_sec = ppc_tb_freq; tb_ticks_per_usec = ppc_tb_freq / 1000000; - calc_cputime_factors(); /* * Compute scale factor for sched_clock. @@ -1132,7 +920,7 @@ void __init time_init(void) tb_to_ns_scale = scale; tb_to_ns_shift = shift; /* Save the current timebase to pretty up CONFIG_PRINTK_TIME */ - boot_tb = get_tb_or_rtc(); + boot_tb = get_tb(); /* If platform provided a timezone (pmac), we correct the time */ if (timezone_offset) { @@ -1140,7 +928,6 @@ void __init time_init(void) sys_tz.tz_dsttime = 0; } - vdso_data->tb_update_count = 0; vdso_data->tb_ticks_per_sec = tb_ticks_per_sec; /* initialise and enable the large decrementer (if we have one) */ @@ -1158,9 +945,8 @@ void __init time_init(void) init_decrementer_clockevent(); tick_setup_hrtimer_broadcast(); -#ifdef CONFIG_COMMON_CLK of_clk_init(NULL); -#endif + enable_sched_clock_irqtime(); } /* diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S index 6ba0fdd1e7f8..a9cd6507163a 100644 --- a/arch/powerpc/kernel/tm.S +++ b/arch/powerpc/kernel/tm.S @@ -6,13 +6,13 @@ * Copyright 2012 Matt Evans & Michael Neuling, IBM Corporation. */ +#include <linux/export.h> #include <asm/asm-offsets.h> #include <asm/ppc_asm.h> #include <asm/ppc-opcode.h> #include <asm/ptrace.h> #include <asm/reg.h> #include <asm/bug.h> -#include <asm/export.h> #include <asm/feature-fixups.h> #ifdef CONFIG_VSX @@ -117,11 +117,18 @@ _GLOBAL(tm_reclaim) std r2, STK_GOT(r1) stdu r1, -TM_FRAME_SIZE(r1) - /* We've a struct pt_regs at [r1+STACK_FRAME_OVERHEAD]. */ + /* We've a struct pt_regs at [r1+STACK_INT_FRAME_REGS]. */ std r3, STK_PARAM(R3)(r1) SAVE_NVGPRS(r1) + /* + * Save kernel live AMR since it will be clobbered by treclaim + * but can be used elsewhere later in kernel space. + */ + mfspr r3, SPRN_AMR + std r3, TM_FRAME_L1(r1) + /* We need to setup MSR for VSX register save instructions. */ mfmsr r14 mr r15, r14 @@ -215,15 +222,12 @@ _GLOBAL(tm_reclaim) * Make r7 look like an exception frame so that we can use the neat * GPRx(n) macros. r7 is NOT a pt_regs ptr! */ - subi r7, r7, STACK_FRAME_OVERHEAD + subi r7, r7, STACK_INT_FRAME_REGS /* Sync the userland GPRs 2-12, 14-31 to thread->regs: */ SAVE_GPR(0, r7) /* user r0 */ - SAVE_GPR(2, r7) /* user r2 */ - SAVE_4GPRS(3, r7) /* user r3-r6 */ - SAVE_GPR(8, r7) /* user r8 */ - SAVE_GPR(9, r7) /* user r9 */ - SAVE_GPR(10, r7) /* user r10 */ + SAVE_GPRS(2, 6, r7) /* user r2-r6 */ + SAVE_GPRS(8, 10, r7) /* user r8-r10 */ ld r3, GPR1(r1) /* user r1 */ ld r4, GPR7(r1) /* user r7 */ ld r5, GPR11(r1) /* user r11 */ @@ -245,7 +249,7 @@ _GLOBAL(tm_reclaim) * but is used in signal return to 'wind back' to the abort handler. */ - /* ******************** CR,LR,CCR,MSR ********** */ + /* ***************** CTR, LR, CR, XER ********** */ mfctr r3 mflr r4 mfcr r5 @@ -256,7 +260,6 @@ _GLOBAL(tm_reclaim) std r5, _CCR(r7) std r6, _XER(r7) - /* ******************** TAR, DSCR ********** */ mfspr r3, SPRN_TAR mfspr r4, SPRN_DSCR @@ -264,6 +267,10 @@ _GLOBAL(tm_reclaim) std r3, THREAD_TM_TAR(r12) std r4, THREAD_TM_DSCR(r12) + /* ******************** AMR **************** */ + mfspr r3, SPRN_AMR + std r3, THREAD_TM_AMR(r12) + /* * MSR and flags: We don't change CRs, and we don't need to alter MSR. */ @@ -308,7 +315,9 @@ _GLOBAL(tm_reclaim) std r3, THREAD_TM_TFHAR(r12) std r4, THREAD_TM_TFIAR(r12) - /* AMR is checkpointed too, but is unsupported by Linux. */ + /* Restore kernel live AMR */ + ld r8, TM_FRAME_L1(r1) + mtspr SPRN_AMR, r8 /* Restore original MSR/IRQ state & clear TM mode */ ld r14, TM_FRAME_L0(r1) /* Orig MSR */ @@ -350,11 +359,18 @@ _GLOBAL(__tm_recheckpoint) stdu r1, -TM_FRAME_SIZE(r1) /* - * We've a struct pt_regs at [r1+STACK_FRAME_OVERHEAD]. + * We've a struct pt_regs at [r1+STACK_INT_FRAME_REGS]. * This is used for backing up the NVGPRs: */ SAVE_NVGPRS(r1) + /* + * Save kernel live AMR since it will be clobbered for trechkpt + * but can be used elsewhere later in kernel space. + */ + mfspr r8, SPRN_AMR + std r8, TM_FRAME_L0(r1) + /* Load complete register state from ts_ckpt* registers */ addi r7, r3, PT_CKPT_REGS /* Thread's ckpt_regs */ @@ -363,7 +379,7 @@ _GLOBAL(__tm_recheckpoint) * Make r7 look like an exception frame so that we can use the neat * GPRx(n) macros. r7 is now NOT a pt_regs ptr! */ - subi r7, r7, STACK_FRAME_OVERHEAD + subi r7, r7, STACK_INT_FRAME_REGS /* We need to setup MSR for FP/VMX/VSX register save instructions. */ mfmsr r6 @@ -404,7 +420,7 @@ _GLOBAL(__tm_recheckpoint) restore_gprs: - /* ******************** CR,LR,CCR,MSR ********** */ + /* ****************** CTR, LR, XER ************* */ ld r4, _CTR(r7) ld r5, _LINK(r7) ld r8, _XER(r7) @@ -417,17 +433,18 @@ restore_gprs: ld r4, THREAD_TM_TAR(r3) mtspr SPRN_TAR, r4 + /* ******************** AMR ******************** */ + ld r4, THREAD_TM_AMR(r3) + mtspr SPRN_AMR, r4 + /* Load up the PPR and DSCR in GPRs only at this stage */ ld r5, THREAD_TM_DSCR(r3) ld r6, THREAD_TM_PPR(r3) REST_GPR(0, r7) /* GPR0 */ - REST_2GPRS(2, r7) /* GPR2-3 */ - REST_GPR(4, r7) /* GPR4 */ - REST_4GPRS(8, r7) /* GPR8-11 */ - REST_2GPRS(12, r7) /* GPR12-13 */ - - REST_NVGPRS(r7) /* GPR14-31 */ + REST_GPRS(2, 4, r7) /* GPR2-4 */ + REST_GPRS(8, 12, r7) /* GPR8-12 */ + REST_GPRS(14, 31, r7) /* GPR14-31 */ /* Load up PPR and DSCR here so we don't run with user values for long */ mtspr SPRN_DSCR, r5 @@ -463,18 +480,24 @@ restore_gprs: REST_GPR(6, r7) /* - * Store r1 and r5 on the stack so that we can access them after we - * clear MSR RI. + * Store user r1 and r5 and r13 on the stack (in the unused save + * areas / compiler reserved areas), so that we can access them after + * we clear MSR RI. */ REST_GPR(5, r7) std r5, -8(r1) - ld r5, GPR1(r7) + ld r5, GPR13(r7) std r5, -16(r1) + ld r5, GPR1(r7) + std r5, -24(r1) REST_GPR(7, r7) - /* Clear MSR RI since we are about to use SCRATCH0. EE is already off */ + /* Stash the stack pointer away for use after recheckpoint */ + std r1, PACAR1(r13) + + /* Clear MSR RI since we are about to clobber r13. EE is already off */ li r5, 0 mtmsrd r5, 1 @@ -485,9 +508,9 @@ restore_gprs: * until we turn MSR RI back on. */ - SET_SCRATCH0(r1) ld r5, -8(r1) - ld r1, -16(r1) + ld r13, -16(r1) + ld r1, -24(r1) /* Commit register state as checkpointed state: */ TRECHKPT @@ -503,12 +526,16 @@ restore_gprs: */ GET_PACA(r13) - GET_SCRATCH0(r1) + ld r1, PACAR1(r13) - /* R1 is restored, so we are recoverable again. EE is still off */ + /* R13, R1 is restored, so we are recoverable again. EE is still off */ li r4, MSR_RI mtmsrd r4, 1 + /* Restore kernel live AMR */ + ld r8, TM_FRAME_L0(r1) + mtspr SPRN_AMR, r8 + REST_NVGPRS(r1) addi r1, r1, TM_FRAME_SIZE diff --git a/arch/powerpc/kernel/trace/Makefile b/arch/powerpc/kernel/trace/Makefile index 858503775c58..125f4ca588b9 100644 --- a/arch/powerpc/kernel/trace/Makefile +++ b/arch/powerpc/kernel/trace/Makefile @@ -6,18 +6,15 @@ ifdef CONFIG_FUNCTION_TRACER # do not trace tracer code CFLAGS_REMOVE_ftrace.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_ftrace_64_pg.o = $(CC_FLAGS_FTRACE) endif -obj32-$(CONFIG_FUNCTION_TRACER) += ftrace_32.o -obj64-$(CONFIG_FUNCTION_TRACER) += ftrace_64.o +obj32-$(CONFIG_FUNCTION_TRACER) += ftrace.o ftrace_entry.o ifdef CONFIG_MPROFILE_KERNEL -obj64-$(CONFIG_FUNCTION_TRACER) += ftrace_64_mprofile.o +obj64-$(CONFIG_FUNCTION_TRACER) += ftrace.o ftrace_entry.o else -obj64-$(CONFIG_FUNCTION_TRACER) += ftrace_64_pg.o +obj64-$(CONFIG_FUNCTION_TRACER) += ftrace_64_pg.o ftrace_64_pg_entry.o endif -obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o -obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o -obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o obj-$(CONFIG_TRACING) += trace_clock.o obj-$(CONFIG_PPC64) += $(obj64-y) @@ -26,4 +23,9 @@ obj-$(CONFIG_PPC32) += $(obj32-y) # Disable GCOV, KCOV & sanitizers in odd or sensitive code GCOV_PROFILE_ftrace.o := n KCOV_INSTRUMENT_ftrace.o := n +KCSAN_SANITIZE_ftrace.o := n UBSAN_SANITIZE_ftrace.o := n +GCOV_PROFILE_ftrace_64_pg.o := n +KCOV_INSTRUMENT_ftrace_64_pg.o := n +KCSAN_SANITIZE_ftrace_64_pg.o := n +UBSAN_SANITIZE_ftrace_64_pg.o := n diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c index 7ea0ca044b65..82010629cf88 100644 --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -22,834 +22,285 @@ #include <linux/init.h> #include <linux/list.h> -#include <asm/asm-prototypes.h> #include <asm/cacheflush.h> #include <asm/code-patching.h> #include <asm/ftrace.h> #include <asm/syscall.h> +#include <asm/inst.h> - -#ifdef CONFIG_DYNAMIC_FTRACE - -/* - * We generally only have a single long_branch tramp and at most 2 or 3 plt - * tramps generated. But, we don't use the plt tramps currently. We also allot - * 2 tramps after .text and .init.text. So, we only end up with around 3 usable - * tramps in total. Set aside 8 just to be sure. - */ -#define NUM_FTRACE_TRAMPS 8 +#define NUM_FTRACE_TRAMPS 2 static unsigned long ftrace_tramps[NUM_FTRACE_TRAMPS]; -static unsigned int -ftrace_call_replace(unsigned long ip, unsigned long addr, int link) +static ppc_inst_t ftrace_create_branch_inst(unsigned long ip, unsigned long addr, int link) { - unsigned int op; - - addr = ppc_function_entry((void *)addr); + ppc_inst_t op; - /* if (link) set op to 'bl' else 'b' */ - op = create_branch((unsigned int *)ip, addr, link ? 1 : 0); + WARN_ON(!is_offset_in_branch_range(addr - ip)); + create_branch(&op, (u32 *)ip, addr, link ? BRANCH_SET_LINK : 0); return op; } -static int -ftrace_modify_code(unsigned long ip, unsigned int old, unsigned int new) +static inline int ftrace_read_inst(unsigned long ip, ppc_inst_t *op) { - unsigned int replaced; - - /* - * Note: - * We are paranoid about modifying text, as if a bug was to happen, it - * could cause us to read or write to someplace that could cause harm. - * Carefully read and modify the code with probe_kernel_*(), and make - * sure what we read is what we expected it to be before modifying it. - */ - - /* read the text we want to modify */ - if (probe_kernel_read(&replaced, (void *)ip, MCOUNT_INSN_SIZE)) + if (copy_inst_from_kernel_nofault(op, (void *)ip)) { + pr_err("0x%lx: fetching instruction failed\n", ip); return -EFAULT; - - /* Make sure it is what we expect it to be */ - if (replaced != old) { - pr_err("%p: replaced (%#x) != old (%#x)", - (void *)ip, replaced, old); - return -EINVAL; } - /* replace the text with the new text */ - if (patch_instruction((unsigned int *)ip, new)) - return -EPERM; - return 0; } -/* - * Helper functions that are the same for both PPC64 and PPC32. - */ -static int test_24bit_addr(unsigned long ip, unsigned long addr) -{ - addr = ppc_function_entry((void *)addr); - - /* use the create_branch to verify that this offset can be branched */ - return create_branch((unsigned int *)ip, addr, 0); -} - -static int is_bl_op(unsigned int op) -{ - return (op & 0xfc000003) == 0x48000001; -} - -static int is_b_op(unsigned int op) -{ - return (op & 0xfc000003) == 0x48000000; -} - -static unsigned long find_bl_target(unsigned long ip, unsigned int op) +static inline int ftrace_validate_inst(unsigned long ip, ppc_inst_t inst) { - int offset; + ppc_inst_t op; + int ret; - offset = (op & 0x03fffffc); - /* make it signed */ - if (offset & 0x02000000) - offset |= 0xfe000000; + ret = ftrace_read_inst(ip, &op); + if (!ret && !ppc_inst_equal(op, inst)) { + pr_err("0x%lx: expected (%08lx) != found (%08lx)\n", + ip, ppc_inst_as_ulong(inst), ppc_inst_as_ulong(op)); + ret = -EINVAL; + } - return ip + (long)offset; + return ret; } -#ifdef CONFIG_MODULES -#ifdef CONFIG_PPC64 -static int -__ftrace_make_nop(struct module *mod, - struct dyn_ftrace *rec, unsigned long addr) +static inline int ftrace_modify_code(unsigned long ip, ppc_inst_t old, ppc_inst_t new) { - unsigned long entry, ptr, tramp; - unsigned long ip = rec->ip; - unsigned int op, pop; - - /* read where this goes */ - if (probe_kernel_read(&op, (void *)ip, sizeof(int))) { - pr_err("Fetching opcode failed.\n"); - return -EFAULT; - } - - /* Make sure that that this is still a 24bit jump */ - if (!is_bl_op(op)) { - pr_err("Not expected bl: opcode is %x\n", op); - return -EINVAL; - } - - /* lets find where the pointer goes */ - tramp = find_bl_target(ip, op); + int ret = ftrace_validate_inst(ip, old); - pr_devel("ip:%lx jumps to %lx", ip, tramp); + if (!ret) + ret = patch_instruction((u32 *)ip, new); - if (module_trampoline_target(mod, tramp, &ptr)) { - pr_err("Failed to get trampoline target\n"); - return -EFAULT; - } - - pr_devel("trampoline target %lx", ptr); - - entry = ppc_global_function_entry((void *)addr); - /* This should match what was called */ - if (ptr != entry) { - pr_err("addr %lx does not match expected %lx\n", ptr, entry); - return -EINVAL; - } - -#ifdef CONFIG_MPROFILE_KERNEL - /* When using -mkernel_profile there is no load to jump over */ - pop = PPC_INST_NOP; - - if (probe_kernel_read(&op, (void *)(ip - 4), 4)) { - pr_err("Fetching instruction at %lx failed.\n", ip - 4); - return -EFAULT; - } - - /* We expect either a mflr r0, or a std r0, LRSAVE(r1) */ - if (op != PPC_INST_MFLR && op != PPC_INST_STD_LR) { - pr_err("Unexpected instruction %08x around bl _mcount\n", op); - return -EINVAL; - } -#else - /* - * Our original call site looks like: - * - * bl <tramp> - * ld r2,XX(r1) - * - * Milton Miller pointed out that we can not simply nop the branch. - * If a task was preempted when calling a trace function, the nops - * will remove the way to restore the TOC in r2 and the r2 TOC will - * get corrupted. - * - * Use a b +8 to jump over the load. - */ - - pop = PPC_INST_BRANCH | 8; /* b +8 */ - - /* - * Check what is in the next instruction. We can see ld r2,40(r1), but - * on first pass after boot we will see mflr r0. - */ - if (probe_kernel_read(&op, (void *)(ip+4), MCOUNT_INSN_SIZE)) { - pr_err("Fetching op failed.\n"); - return -EFAULT; - } - - if (op != PPC_INST_LD_TOC) { - pr_err("Expected %08x found %08x\n", PPC_INST_LD_TOC, op); - return -EINVAL; - } -#endif /* CONFIG_MPROFILE_KERNEL */ - - if (patch_instruction((unsigned int *)ip, pop)) { - pr_err("Patching NOP failed.\n"); - return -EPERM; - } - - return 0; + return ret; } -#else /* !PPC64 */ -static int -__ftrace_make_nop(struct module *mod, - struct dyn_ftrace *rec, unsigned long addr) +static int is_bl_op(ppc_inst_t op) { - unsigned int op; - unsigned int jmp[4]; - unsigned long ip = rec->ip; - unsigned long tramp; - - if (probe_kernel_read(&op, (void *)ip, MCOUNT_INSN_SIZE)) - return -EFAULT; - - /* Make sure that that this is still a 24bit jump */ - if (!is_bl_op(op)) { - pr_err("Not expected bl: opcode is %x\n", op); - return -EINVAL; - } - - /* lets find where the pointer goes */ - tramp = find_bl_target(ip, op); - - /* - * On PPC32 the trampoline looks like: - * 0x3d, 0x80, 0x00, 0x00 lis r12,sym@ha - * 0x39, 0x8c, 0x00, 0x00 addi r12,r12,sym@l - * 0x7d, 0x89, 0x03, 0xa6 mtctr r12 - * 0x4e, 0x80, 0x04, 0x20 bctr - */ - - pr_devel("ip:%lx jumps to %lx", ip, tramp); - - /* Find where the trampoline jumps to */ - if (probe_kernel_read(jmp, (void *)tramp, sizeof(jmp))) { - pr_err("Failed to read %lx\n", tramp); - return -EFAULT; - } - - pr_devel(" %08x %08x ", jmp[0], jmp[1]); - - /* verify that this is what we expect it to be */ - if (((jmp[0] & 0xffff0000) != 0x3d800000) || - ((jmp[1] & 0xffff0000) != 0x398c0000) || - (jmp[2] != 0x7d8903a6) || - (jmp[3] != 0x4e800420)) { - pr_err("Not a trampoline\n"); - return -EINVAL; - } - - tramp = (jmp[1] & 0xffff) | - ((jmp[0] & 0xffff) << 16); - if (tramp & 0x8000) - tramp -= 0x10000; - - pr_devel(" %lx ", tramp); - - if (tramp != addr) { - pr_err("Trampoline location %08lx does not match addr\n", - tramp); - return -EINVAL; - } - - op = PPC_INST_NOP; - - if (patch_instruction((unsigned int *)ip, op)) - return -EPERM; - - return 0; + return (ppc_inst_val(op) & ~PPC_LI_MASK) == PPC_RAW_BL(0); } -#endif /* PPC64 */ -#endif /* CONFIG_MODULES */ static unsigned long find_ftrace_tramp(unsigned long ip) { int i; - /* - * We have the compiler generated long_branch tramps at the end - * and we prefer those - */ - for (i = NUM_FTRACE_TRAMPS - 1; i >= 0; i--) + for (i = 0; i < NUM_FTRACE_TRAMPS; i++) if (!ftrace_tramps[i]) continue; - else if (create_branch((void *)ip, ftrace_tramps[i], 0)) + else if (is_offset_in_branch_range(ftrace_tramps[i] - ip)) return ftrace_tramps[i]; return 0; } -static int add_ftrace_tramp(unsigned long tramp) -{ - int i; - - for (i = 0; i < NUM_FTRACE_TRAMPS; i++) - if (!ftrace_tramps[i]) { - ftrace_tramps[i] = tramp; - return 0; - } - - return -1; -} - -/* - * If this is a compiler generated long_branch trampoline (essentially, a - * trampoline that has a branch to _mcount()), we re-write the branch to - * instead go to ftrace_[regs_]caller() and note down the location of this - * trampoline. - */ -static int setup_mcount_compiler_tramp(unsigned long tramp) +static int ftrace_get_call_inst(struct dyn_ftrace *rec, unsigned long addr, ppc_inst_t *call_inst) { - int i, op; - unsigned long ptr; - static unsigned long ftrace_plt_tramps[NUM_FTRACE_TRAMPS]; - - /* Is this a known long jump tramp? */ - for (i = 0; i < NUM_FTRACE_TRAMPS; i++) - if (!ftrace_tramps[i]) - break; - else if (ftrace_tramps[i] == tramp) - return 0; - - /* Is this a known plt tramp? */ - for (i = 0; i < NUM_FTRACE_TRAMPS; i++) - if (!ftrace_plt_tramps[i]) - break; - else if (ftrace_plt_tramps[i] == tramp) - return -1; - - /* New trampoline -- read where this goes */ - if (probe_kernel_read(&op, (void *)tramp, sizeof(int))) { - pr_debug("Fetching opcode failed.\n"); - return -1; - } - - /* Is this a 24 bit branch? */ - if (!is_b_op(op)) { - pr_debug("Trampoline is not a long branch tramp.\n"); - return -1; - } - - /* lets find where the pointer goes */ - ptr = find_bl_target(tramp, op); - - if (ptr != ppc_global_function_entry((void *)_mcount)) { - pr_debug("Trampoline target %p is not _mcount\n", (void *)ptr); - return -1; - } + unsigned long ip = rec->ip; + unsigned long stub; - /* Let's re-write the tramp to go to ftrace_[regs_]caller */ -#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS - ptr = ppc_global_function_entry((void *)ftrace_regs_caller); -#else - ptr = ppc_global_function_entry((void *)ftrace_caller); + if (is_offset_in_branch_range(addr - ip)) { + /* Within range */ + stub = addr; +#ifdef CONFIG_MODULES + } else if (rec->arch.mod) { + /* Module code would be going to one of the module stubs */ + stub = (addr == (unsigned long)ftrace_caller ? rec->arch.mod->arch.tramp : + rec->arch.mod->arch.tramp_regs); #endif - if (!create_branch((void *)tramp, ptr, 0)) { - pr_debug("%ps is not reachable from existing mcount tramp\n", - (void *)ptr); - return -1; - } - - if (patch_branch((unsigned int *)tramp, ptr, 0)) { - pr_debug("REL24 out of range!\n"); - return -1; - } - - if (add_ftrace_tramp(tramp)) { - pr_debug("No tramp locations left\n"); - return -1; - } - - return 0; -} - -static int __ftrace_make_nop_kernel(struct dyn_ftrace *rec, unsigned long addr) -{ - unsigned long tramp, ip = rec->ip; - unsigned int op; - - /* Read where this goes */ - if (probe_kernel_read(&op, (void *)ip, sizeof(int))) { - pr_err("Fetching opcode failed.\n"); - return -EFAULT; - } - - /* Make sure that that this is still a 24bit jump */ - if (!is_bl_op(op)) { - pr_err("Not expected bl: opcode is %x\n", op); - return -EINVAL; - } - - /* Let's find where the pointer goes */ - tramp = find_bl_target(ip, op); - - pr_devel("ip:%lx jumps to %lx", ip, tramp); - - if (setup_mcount_compiler_tramp(tramp)) { - /* Are other trampolines reachable? */ - if (!find_ftrace_tramp(ip)) { - pr_err("No ftrace trampolines reachable from %ps\n", - (void *)ip); + } else if (core_kernel_text(ip)) { + /* We would be branching to one of our ftrace stubs */ + stub = find_ftrace_tramp(ip); + if (!stub) { + pr_err("0x%lx: No ftrace stubs reachable\n", ip); return -EINVAL; } + } else { + return -EINVAL; } - if (patch_instruction((unsigned int *)ip, PPC_INST_NOP)) { - pr_err("Patching NOP failed.\n"); - return -EPERM; - } - + *call_inst = ftrace_create_branch_inst(ip, stub, 1); return 0; } -int ftrace_make_nop(struct module *mod, - struct dyn_ftrace *rec, unsigned long addr) +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS +int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, unsigned long addr) { - unsigned long ip = rec->ip; - unsigned int old, new; - - /* - * If the calling address is more that 24 bits away, - * then we had to use a trampoline to make the call. - * Otherwise just update the call site. - */ - if (test_24bit_addr(ip, addr)) { - /* within range */ - old = ftrace_call_replace(ip, addr, 1); - new = PPC_INST_NOP; - return ftrace_modify_code(ip, old, new); - } else if (core_kernel_text(ip)) - return __ftrace_make_nop_kernel(rec, addr); - -#ifdef CONFIG_MODULES - /* - * Out of range jumps are called from modules. - * We should either already have a pointer to the module - * or it has been passed in. - */ - if (!rec->arch.mod) { - if (!mod) { - pr_err("No module loaded addr=%lx\n", addr); - return -EFAULT; - } - rec->arch.mod = mod; - } else if (mod) { - if (mod != rec->arch.mod) { - pr_err("Record mod %p not equal to passed in mod %p\n", - rec->arch.mod, mod); - return -EINVAL; - } - /* nothing to do if mod == rec->arch.mod */ - } else - mod = rec->arch.mod; - - return __ftrace_make_nop(mod, rec, addr); -#else - /* We should not get here without modules */ + /* This should never be called since we override ftrace_replace_code() */ + WARN_ON(1); return -EINVAL; -#endif /* CONFIG_MODULES */ -} - -#ifdef CONFIG_MODULES -#ifdef CONFIG_PPC64 -/* - * Examine the existing instructions for __ftrace_make_call. - * They should effectively be a NOP, and follow formal constraints, - * depending on the ABI. Return false if they don't. - */ -#ifndef CONFIG_MPROFILE_KERNEL -static int -expected_nop_sequence(void *ip, unsigned int op0, unsigned int op1) -{ - /* - * We expect to see: - * - * b +8 - * ld r2,XX(r1) - * - * The load offset is different depending on the ABI. For simplicity - * just mask it out when doing the compare. - */ - if ((op0 != 0x48000008) || ((op1 & 0xffff0000) != 0xe8410000)) - return 0; - return 1; -} -#else -static int -expected_nop_sequence(void *ip, unsigned int op0, unsigned int op1) -{ - /* look for patched "NOP" on ppc64 with -mprofile-kernel */ - if (op0 != PPC_INST_NOP) - return 0; - return 1; } #endif -static int -__ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) +int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) { - unsigned int op[2]; - void *ip = (void *)rec->ip; - unsigned long entry, ptr, tramp; - struct module *mod = rec->arch.mod; - - /* read where this goes */ - if (probe_kernel_read(op, ip, sizeof(op))) - return -EFAULT; - - if (!expected_nop_sequence(ip, op[0], op[1])) { - pr_err("Unexpected call sequence at %p: %x %x\n", - ip, op[0], op[1]); - return -EINVAL; - } - - /* If we never set up ftrace trampoline(s), then bail */ -#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS - if (!mod->arch.tramp || !mod->arch.tramp_regs) { -#else - if (!mod->arch.tramp) { -#endif - pr_err("No ftrace trampoline\n"); - return -EINVAL; - } - -#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS - if (rec->flags & FTRACE_FL_REGS) - tramp = mod->arch.tramp_regs; - else -#endif - tramp = mod->arch.tramp; - - if (module_trampoline_target(mod, tramp, &ptr)) { - pr_err("Failed to get trampoline target\n"); - return -EFAULT; - } - - pr_devel("trampoline target %lx", ptr); - - entry = ppc_global_function_entry((void *)addr); - /* This should match what was called */ - if (ptr != entry) { - pr_err("addr %lx does not match expected %lx\n", ptr, entry); - return -EINVAL; - } + ppc_inst_t old, new; + int ret; - /* Ensure branch is within 24 bits */ - if (!create_branch(ip, tramp, BRANCH_SET_LINK)) { - pr_err("Branch out of range\n"); + /* This can only ever be called during module load */ + if (WARN_ON(!IS_ENABLED(CONFIG_MODULES) || core_kernel_text(rec->ip))) return -EINVAL; - } - if (patch_branch(ip, tramp, BRANCH_SET_LINK)) { - pr_err("REL24 out of range!\n"); - return -EINVAL; - } + old = ppc_inst(PPC_RAW_NOP()); + ret = ftrace_get_call_inst(rec, addr, &new); + if (ret) + return ret; - return 0; + return ftrace_modify_code(rec->ip, old, new); } -#else /* !CONFIG_PPC64: */ -static int -__ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) +int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr) { - unsigned int op; - unsigned long ip = rec->ip; - - /* read where this goes */ - if (probe_kernel_read(&op, (void *)ip, MCOUNT_INSN_SIZE)) - return -EFAULT; - - /* It should be pointing to a nop */ - if (op != PPC_INST_NOP) { - pr_err("Expected NOP but have %x\n", op); - return -EINVAL; - } - - /* If we never set up a trampoline to ftrace_caller, then bail */ - if (!rec->arch.mod->arch.tramp) { - pr_err("No ftrace trampoline\n"); - return -EINVAL; - } - - /* create the branch to the trampoline */ - op = create_branch((unsigned int *)ip, - rec->arch.mod->arch.tramp, BRANCH_SET_LINK); - if (!op) { - pr_err("REL24 out of range!\n"); - return -EINVAL; - } - - pr_devel("write to %lx\n", rec->ip); - - if (patch_instruction((unsigned int *)ip, op)) - return -EPERM; - - return 0; + /* + * This should never be called since we override ftrace_replace_code(), + * as well as ftrace_init_nop() + */ + WARN_ON(1); + return -EINVAL; } -#endif /* CONFIG_PPC64 */ -#endif /* CONFIG_MODULES */ -static int __ftrace_make_call_kernel(struct dyn_ftrace *rec, unsigned long addr) +void ftrace_replace_code(int enable) { - unsigned int op; - void *ip = (void *)rec->ip; - unsigned long tramp, entry, ptr; + ppc_inst_t old, new, call_inst, new_call_inst; + ppc_inst_t nop_inst = ppc_inst(PPC_RAW_NOP()); + unsigned long ip, new_addr, addr; + struct ftrace_rec_iter *iter; + struct dyn_ftrace *rec; + int ret = 0, update; + + for_ftrace_rec_iter(iter) { + rec = ftrace_rec_iter_record(iter); + ip = rec->ip; + + if (rec->flags & FTRACE_FL_DISABLED && !(rec->flags & FTRACE_FL_ENABLED)) + continue; - /* Make sure we're being asked to patch branch to a known ftrace addr */ - entry = ppc_global_function_entry((void *)ftrace_caller); - ptr = ppc_global_function_entry((void *)addr); + addr = ftrace_get_addr_curr(rec); + new_addr = ftrace_get_addr_new(rec); + update = ftrace_update_record(rec, enable); - if (ptr != entry) { -#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS - entry = ppc_global_function_entry((void *)ftrace_regs_caller); - if (ptr != entry) { -#endif - pr_err("Unknown ftrace addr to patch: %ps\n", (void *)ptr); - return -EINVAL; -#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS + switch (update) { + case FTRACE_UPDATE_IGNORE: + default: + continue; + case FTRACE_UPDATE_MODIFY_CALL: + ret = ftrace_get_call_inst(rec, new_addr, &new_call_inst); + ret |= ftrace_get_call_inst(rec, addr, &call_inst); + old = call_inst; + new = new_call_inst; + break; + case FTRACE_UPDATE_MAKE_NOP: + ret = ftrace_get_call_inst(rec, addr, &call_inst); + old = call_inst; + new = nop_inst; + break; + case FTRACE_UPDATE_MAKE_CALL: + ret = ftrace_get_call_inst(rec, new_addr, &call_inst); + old = nop_inst; + new = call_inst; + break; } -#endif - } - - /* Make sure we have a nop */ - if (probe_kernel_read(&op, ip, sizeof(op))) { - pr_err("Unable to read ftrace location %p\n", ip); - return -EFAULT; - } - - if (op != PPC_INST_NOP) { - pr_err("Unexpected call sequence at %p: %x\n", ip, op); - return -EINVAL; - } - - tramp = find_ftrace_tramp((unsigned long)ip); - if (!tramp) { - pr_err("No ftrace trampolines reachable from %ps\n", ip); - return -EINVAL; - } - if (patch_branch(ip, tramp, BRANCH_SET_LINK)) { - pr_err("Error patching branch to ftrace tramp!\n"); - return -EINVAL; + if (!ret) + ret = ftrace_modify_code(ip, old, new); + if (ret) + goto out; } - return 0; -} - -int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) -{ - unsigned long ip = rec->ip; - unsigned int old, new; - - /* - * If the calling address is more that 24 bits away, - * then we had to use a trampoline to make the call. - * Otherwise just update the call site. - */ - if (test_24bit_addr(ip, addr)) { - /* within range */ - old = PPC_INST_NOP; - new = ftrace_call_replace(ip, addr, 1); - return ftrace_modify_code(ip, old, new); - } else if (core_kernel_text(ip)) - return __ftrace_make_call_kernel(rec, addr); - -#ifdef CONFIG_MODULES - /* - * Out of range jumps are called from modules. - * Being that we are converting from nop, it had better - * already have a module defined. - */ - if (!rec->arch.mod) { - pr_err("No module loaded\n"); - return -EINVAL; - } - - return __ftrace_make_call(rec, addr); -#else - /* We should not get here without modules */ - return -EINVAL; -#endif /* CONFIG_MODULES */ +out: + if (ret) + ftrace_bug(ret, rec); + return; } -#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS -#ifdef CONFIG_MODULES -static int -__ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, - unsigned long addr) +int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec) { - unsigned int op; - unsigned long ip = rec->ip; - unsigned long entry, ptr, tramp; - struct module *mod = rec->arch.mod; - - /* If we never set up ftrace trampolines, then bail */ - if (!mod->arch.tramp || !mod->arch.tramp_regs) { - pr_err("No ftrace trampoline\n"); - return -EINVAL; - } - - /* read where this goes */ - if (probe_kernel_read(&op, (void *)ip, sizeof(int))) { - pr_err("Fetching opcode failed.\n"); - return -EFAULT; - } - - /* Make sure that that this is still a 24bit jump */ - if (!is_bl_op(op)) { - pr_err("Not expected bl: opcode is %x\n", op); + unsigned long addr, ip = rec->ip; + ppc_inst_t old, new; + int ret = 0; + + /* Verify instructions surrounding the ftrace location */ + if (IS_ENABLED(CONFIG_ARCH_USING_PATCHABLE_FUNCTION_ENTRY)) { + /* Expect nops */ + ret = ftrace_validate_inst(ip - 4, ppc_inst(PPC_RAW_NOP())); + if (!ret) + ret = ftrace_validate_inst(ip, ppc_inst(PPC_RAW_NOP())); + } else if (IS_ENABLED(CONFIG_PPC32)) { + /* Expected sequence: 'mflr r0', 'stw r0,4(r1)', 'bl _mcount' */ + ret = ftrace_validate_inst(ip - 8, ppc_inst(PPC_RAW_MFLR(_R0))); + if (!ret) + ret = ftrace_validate_inst(ip - 4, ppc_inst(PPC_RAW_STW(_R0, _R1, 4))); + } else if (IS_ENABLED(CONFIG_MPROFILE_KERNEL)) { + /* Expected sequence: 'mflr r0', ['std r0,16(r1)'], 'bl _mcount' */ + ret = ftrace_read_inst(ip - 4, &old); + if (!ret && !ppc_inst_equal(old, ppc_inst(PPC_RAW_MFLR(_R0)))) { + ret = ftrace_validate_inst(ip - 8, ppc_inst(PPC_RAW_MFLR(_R0))); + ret |= ftrace_validate_inst(ip - 4, ppc_inst(PPC_RAW_STD(_R0, _R1, 16))); + } + } else { return -EINVAL; } - /* lets find where the pointer goes */ - tramp = find_bl_target(ip, op); - entry = ppc_global_function_entry((void *)old_addr); + if (ret) + return ret; - pr_devel("ip:%lx jumps to %lx", ip, tramp); - - if (tramp != entry) { - /* old_addr is not within range, so we must have used a trampoline */ - if (module_trampoline_target(mod, tramp, &ptr)) { - pr_err("Failed to get trampoline target\n"); + if (!core_kernel_text(ip)) { + if (!mod) { + pr_err("0x%lx: No module provided for non-kernel address\n", ip); return -EFAULT; } - - pr_devel("trampoline target %lx", ptr); - - /* This should match what was called */ - if (ptr != entry) { - pr_err("addr %lx does not match expected %lx\n", ptr, entry); - return -EINVAL; - } - } - - /* The new target may be within range */ - if (test_24bit_addr(ip, addr)) { - /* within range */ - if (patch_branch((unsigned int *)ip, addr, BRANCH_SET_LINK)) { - pr_err("REL24 out of range!\n"); - return -EINVAL; - } - - return 0; - } - - if (rec->flags & FTRACE_FL_REGS) - tramp = mod->arch.tramp_regs; - else - tramp = mod->arch.tramp; - - if (module_trampoline_target(mod, tramp, &ptr)) { - pr_err("Failed to get trampoline target\n"); - return -EFAULT; - } - - pr_devel("trampoline target %lx", ptr); - - entry = ppc_global_function_entry((void *)addr); - /* This should match what was called */ - if (ptr != entry) { - pr_err("addr %lx does not match expected %lx\n", ptr, entry); - return -EINVAL; - } - - /* Ensure branch is within 24 bits */ - if (!create_branch((unsigned int *)ip, tramp, BRANCH_SET_LINK)) { - pr_err("Branch out of range\n"); - return -EINVAL; - } - - if (patch_branch((unsigned int *)ip, tramp, BRANCH_SET_LINK)) { - pr_err("REL24 out of range!\n"); - return -EINVAL; + rec->arch.mod = mod; } - return 0; -} -#endif - -int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, - unsigned long addr) -{ - unsigned long ip = rec->ip; - unsigned int old, new; - - /* - * If the calling address is more that 24 bits away, - * then we had to use a trampoline to make the call. - * Otherwise just update the call site. - */ - if (test_24bit_addr(ip, addr) && test_24bit_addr(ip, old_addr)) { - /* within range */ - old = ftrace_call_replace(ip, old_addr, 1); - new = ftrace_call_replace(ip, addr, 1); - return ftrace_modify_code(ip, old, new); - } else if (core_kernel_text(ip)) { + /* Nop-out the ftrace location */ + new = ppc_inst(PPC_RAW_NOP()); + addr = MCOUNT_ADDR; + if (IS_ENABLED(CONFIG_ARCH_USING_PATCHABLE_FUNCTION_ENTRY)) { + /* we instead patch-in the 'mflr r0' */ + old = ppc_inst(PPC_RAW_NOP()); + new = ppc_inst(PPC_RAW_MFLR(_R0)); + ret = ftrace_modify_code(ip - 4, old, new); + } else if (is_offset_in_branch_range(addr - ip)) { + /* Within range */ + old = ftrace_create_branch_inst(ip, addr, 1); + ret = ftrace_modify_code(ip, old, new); + } else if (core_kernel_text(ip) || (IS_ENABLED(CONFIG_MODULES) && mod)) { /* - * We always patch out of range locations to go to the regs - * variant, so there is nothing to do here + * We would be branching to a linker-generated stub, or to the module _mcount + * stub. Let's just confirm we have a 'bl' here. */ - return 0; - } - -#ifdef CONFIG_MODULES - /* - * Out of range jumps are called from modules. - */ - if (!rec->arch.mod) { - pr_err("No module loaded\n"); + ret = ftrace_read_inst(ip, &old); + if (ret) + return ret; + if (!is_bl_op(old)) { + pr_err("0x%lx: expected (bl) != found (%08lx)\n", ip, ppc_inst_as_ulong(old)); + return -EINVAL; + } + ret = patch_instruction((u32 *)ip, new); + } else { return -EINVAL; } - return __ftrace_modify_call(rec, old_addr, addr); -#else - /* We should not get here without modules */ - return -EINVAL; -#endif /* CONFIG_MODULES */ + return ret; } -#endif int ftrace_update_ftrace_func(ftrace_func_t func) { unsigned long ip = (unsigned long)(&ftrace_call); - unsigned int old, new; + ppc_inst_t old, new; int ret; - old = *(unsigned int *)&ftrace_call; - new = ftrace_call_replace(ip, (unsigned long)func, 1); + old = ppc_inst_read((u32 *)&ftrace_call); + new = ftrace_create_branch_inst(ip, ppc_function_entry(func), 1); ret = ftrace_modify_code(ip, old, new); -#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS /* Also update the regs callback function */ - if (!ret) { + if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_REGS) && !ret) { ip = (unsigned long)(&ftrace_regs_call); - old = *(unsigned int *)&ftrace_regs_call; - new = ftrace_call_replace(ip, (unsigned long)func, 1); + old = ppc_inst_read((u32 *)&ftrace_regs_call); + new = ftrace_create_branch_inst(ip, ppc_function_entry(func), 1); ret = ftrace_modify_code(ip, old, new); } -#endif return ret; } @@ -863,91 +314,103 @@ void arch_ftrace_update_code(int command) ftrace_modify_all_code(command); } -#ifdef CONFIG_PPC64 -#define PACATOC offsetof(struct paca_struct, kernel_toc) - -extern unsigned int ftrace_tramp_text[], ftrace_tramp_init[]; - -int __init ftrace_dyn_arch_init(void) +void ftrace_free_init_tramp(void) { int i; - unsigned int *tramp[] = { ftrace_tramp_text, ftrace_tramp_init }; - u32 stub_insns[] = { - 0xe98d0000 | PACATOC, /* ld r12,PACATOC(r13) */ - 0x3d8c0000, /* addis r12,r12,<high> */ - 0x398c0000, /* addi r12,r12,<low> */ - 0x7d8903a6, /* mtctr r12 */ - 0x4e800420, /* bctr */ - }; -#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS - unsigned long addr = ppc_global_function_entry((void *)ftrace_regs_caller); -#else - unsigned long addr = ppc_global_function_entry((void *)ftrace_caller); -#endif - long reladdr = addr - kernel_toc_addr(); - if (reladdr > 0x7FFFFFFF || reladdr < -(0x80000000L)) { - pr_err("Address of %ps out of range of kernel_toc.\n", - (void *)addr); - return -1; - } + for (i = 0; i < NUM_FTRACE_TRAMPS && ftrace_tramps[i]; i++) + if (ftrace_tramps[i] == (unsigned long)ftrace_tramp_init) { + ftrace_tramps[i] = 0; + return; + } +} - for (i = 0; i < 2; i++) { - memcpy(tramp[i], stub_insns, sizeof(stub_insns)); - tramp[i][1] |= PPC_HA(reladdr); - tramp[i][2] |= PPC_LO(reladdr); - add_ftrace_tramp((unsigned long)tramp[i]); - } +static void __init add_ftrace_tramp(unsigned long tramp) +{ + int i; - return 0; + for (i = 0; i < NUM_FTRACE_TRAMPS; i++) + if (!ftrace_tramps[i]) { + ftrace_tramps[i] = tramp; + return; + } } -#else + int __init ftrace_dyn_arch_init(void) { - return 0; -} + unsigned int *tramp[] = { ftrace_tramp_text, ftrace_tramp_init }; + unsigned long addr = FTRACE_REGS_ADDR; + long reladdr; + int i; + u32 stub_insns[] = { +#ifdef CONFIG_PPC_KERNEL_PCREL + /* pla r12,addr */ + PPC_PREFIX_MLS | __PPC_PRFX_R(1), + PPC_INST_PADDI | ___PPC_RT(_R12), + PPC_RAW_MTCTR(_R12), + PPC_RAW_BCTR() +#elif defined(CONFIG_PPC64) + PPC_RAW_LD(_R12, _R13, offsetof(struct paca_struct, kernel_toc)), + PPC_RAW_ADDIS(_R12, _R12, 0), + PPC_RAW_ADDI(_R12, _R12, 0), + PPC_RAW_MTCTR(_R12), + PPC_RAW_BCTR() +#else + PPC_RAW_LIS(_R12, 0), + PPC_RAW_ADDI(_R12, _R12, 0), + PPC_RAW_MTCTR(_R12), + PPC_RAW_BCTR() #endif -#endif /* CONFIG_DYNAMIC_FTRACE */ - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - -extern void ftrace_graph_call(void); -extern void ftrace_graph_stub(void); + }; -int ftrace_enable_ftrace_graph_caller(void) -{ - unsigned long ip = (unsigned long)(&ftrace_graph_call); - unsigned long addr = (unsigned long)(&ftrace_graph_caller); - unsigned long stub = (unsigned long)(&ftrace_graph_stub); - unsigned int old, new; + if (IS_ENABLED(CONFIG_PPC_KERNEL_PCREL)) { + for (i = 0; i < 2; i++) { + reladdr = addr - (unsigned long)tramp[i]; - old = ftrace_call_replace(ip, stub, 0); - new = ftrace_call_replace(ip, addr, 0); + if (reladdr >= (long)SZ_8G || reladdr < -(long)SZ_8G) { + pr_err("Address of %ps out of range of pcrel address.\n", + (void *)addr); + return -1; + } - return ftrace_modify_code(ip, old, new); -} + memcpy(tramp[i], stub_insns, sizeof(stub_insns)); + tramp[i][0] |= IMM_H18(reladdr); + tramp[i][1] |= IMM_L(reladdr); + add_ftrace_tramp((unsigned long)tramp[i]); + } + } else if (IS_ENABLED(CONFIG_PPC64)) { + reladdr = addr - kernel_toc_addr(); -int ftrace_disable_ftrace_graph_caller(void) -{ - unsigned long ip = (unsigned long)(&ftrace_graph_call); - unsigned long addr = (unsigned long)(&ftrace_graph_caller); - unsigned long stub = (unsigned long)(&ftrace_graph_stub); - unsigned int old, new; + if (reladdr >= (long)SZ_2G || reladdr < -(long long)SZ_2G) { + pr_err("Address of %ps out of range of kernel_toc.\n", + (void *)addr); + return -1; + } - old = ftrace_call_replace(ip, addr, 0); - new = ftrace_call_replace(ip, stub, 0); + for (i = 0; i < 2; i++) { + memcpy(tramp[i], stub_insns, sizeof(stub_insns)); + tramp[i][1] |= PPC_HA(reladdr); + tramp[i][2] |= PPC_LO(reladdr); + add_ftrace_tramp((unsigned long)tramp[i]); + } + } else { + for (i = 0; i < 2; i++) { + memcpy(tramp[i], stub_insns, sizeof(stub_insns)); + tramp[i][0] |= PPC_HA(addr); + tramp[i][1] |= PPC_LO(addr); + add_ftrace_tramp((unsigned long)tramp[i]); + } + } - return ftrace_modify_code(ip, old, new); + return 0; } -/* - * Hook the return address and push it in the stack of return addrs - * in current thread info. Return the address we want to divert to. - */ -unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip, - unsigned long sp) +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs) { - unsigned long return_hooker; + unsigned long sp = fregs->regs.gpr[1]; + int bit; if (unlikely(ftrace_graph_is_dead())) goto out; @@ -955,21 +418,15 @@ unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip, if (unlikely(atomic_read(¤t->tracing_graph_pause))) goto out; - return_hooker = ppc_function_entry(return_to_handler); + bit = ftrace_test_recursion_trylock(ip, parent_ip); + if (bit < 0) + goto out; + + if (!function_graph_enter(parent_ip, ip, 0, (unsigned long *)sp)) + parent_ip = ppc_function_entry(return_to_handler); - if (!function_graph_enter(parent, ip, 0, (unsigned long *)sp)) - parent = return_hooker; + ftrace_test_recursion_unlock(bit); out: - return parent; + fregs->regs.link = parent_ip; } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ - -#ifdef PPC64_ELF_ABI_v1 -char *arch_ftrace_match_adjust(char *str, const char *search) -{ - if (str[0] == '.' && search[0] != '.') - return str + 1; - else - return str; -} -#endif /* PPC64_ELF_ABI_v1 */ diff --git a/arch/powerpc/kernel/trace/ftrace_32.S b/arch/powerpc/kernel/trace/ftrace_32.S deleted file mode 100644 index e023ae59c429..000000000000 --- a/arch/powerpc/kernel/trace/ftrace_32.S +++ /dev/null @@ -1,95 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Split from entry_32.S - */ - -#include <linux/magic.h> -#include <asm/reg.h> -#include <asm/ppc_asm.h> -#include <asm/asm-offsets.h> -#include <asm/ftrace.h> -#include <asm/export.h> - -_GLOBAL(mcount) -_GLOBAL(_mcount) - /* - * It is required that _mcount on PPC32 must preserve the - * link register. But we have r0 to play with. We use r0 - * to push the return address back to the caller of mcount - * into the ctr register, restore the link register and - * then jump back using the ctr register. - */ - mflr r0 - mtctr r0 - lwz r0, 4(r1) - mtlr r0 - bctr - -_GLOBAL(ftrace_caller) - MCOUNT_SAVE_FRAME - /* r3 ends up with link register */ - subi r3, r3, MCOUNT_INSN_SIZE -.globl ftrace_call -ftrace_call: - bl ftrace_stub - nop -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -.globl ftrace_graph_call -ftrace_graph_call: - b ftrace_graph_stub -_GLOBAL(ftrace_graph_stub) -#endif - MCOUNT_RESTORE_FRAME - /* old link register ends up in ctr reg */ - bctr - -EXPORT_SYMBOL(_mcount) - -_GLOBAL(ftrace_stub) - blr - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -_GLOBAL(ftrace_graph_caller) - addi r5, r1, 48 - /* load r4 with local address */ - lwz r4, 44(r1) - subi r4, r4, MCOUNT_INSN_SIZE - - /* Grab the LR out of the caller stack frame */ - lwz r3,52(r1) - - bl prepare_ftrace_return - nop - - /* - * prepare_ftrace_return gives us the address we divert to. - * Change the LR in the callers stack frame to this. - */ - stw r3,52(r1) - - MCOUNT_RESTORE_FRAME - /* old link register ends up in ctr reg */ - bctr - -_GLOBAL(return_to_handler) - /* need to save return values */ - stwu r1, -32(r1) - stw r3, 20(r1) - stw r4, 16(r1) - stw r31, 12(r1) - mr r31, r1 - - bl ftrace_return_to_handler - nop - - /* return value has real return address */ - mtlr r3 - - lwz r3, 20(r1) - lwz r4, 16(r1) - lwz r31,12(r1) - lwz r1, 0(r1) - - /* Jump back to real return address */ - blr -#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/arch/powerpc/kernel/trace/ftrace_64.S b/arch/powerpc/kernel/trace/ftrace_64.S deleted file mode 100644 index 25e5b9e47c06..000000000000 --- a/arch/powerpc/kernel/trace/ftrace_64.S +++ /dev/null @@ -1,64 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Split from entry_64.S - */ - -#include <linux/magic.h> -#include <asm/ppc_asm.h> -#include <asm/asm-offsets.h> -#include <asm/ftrace.h> -#include <asm/ppc-opcode.h> -#include <asm/export.h> - -.pushsection ".tramp.ftrace.text","aw",@progbits; -.globl ftrace_tramp_text -ftrace_tramp_text: - .space 64 -.popsection - -.pushsection ".tramp.ftrace.init","aw",@progbits; -.globl ftrace_tramp_init -ftrace_tramp_init: - .space 64 -.popsection - -_GLOBAL(mcount) -_GLOBAL(_mcount) -EXPORT_SYMBOL(_mcount) - mflr r12 - mtctr r12 - mtlr r0 - bctr - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -_GLOBAL(return_to_handler) - /* need to save return values */ - std r4, -32(r1) - std r3, -24(r1) - /* save TOC */ - std r2, -16(r1) - std r31, -8(r1) - mr r31, r1 - stdu r1, -112(r1) - - /* - * We might be called from a module. - * Switch to our TOC to run inside the core kernel. - */ - ld r2, PACATOC(r13) - - bl ftrace_return_to_handler - nop - - /* return value has real return address */ - mtlr r3 - - ld r1, 0(r1) - ld r4, -32(r1) - ld r3, -24(r1) - ld r2, -16(r1) - ld r31, -8(r1) - - /* Jump back to real return address */ - blr -#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/arch/powerpc/kernel/trace/ftrace_64_pg.c b/arch/powerpc/kernel/trace/ftrace_64_pg.c new file mode 100644 index 000000000000..7b85c3b460a3 --- /dev/null +++ b/arch/powerpc/kernel/trace/ftrace_64_pg.c @@ -0,0 +1,846 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Code for replacing ftrace calls with jumps. + * + * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com> + * + * Thanks goes out to P.A. Semi, Inc for supplying me with a PPC64 box. + * + * Added function graph tracer code, taken from x86 that was written + * by Frederic Weisbecker, and ported to PPC by Steven Rostedt. + * + */ + +#define pr_fmt(fmt) "ftrace-powerpc: " fmt + +#include <linux/spinlock.h> +#include <linux/hardirq.h> +#include <linux/uaccess.h> +#include <linux/module.h> +#include <linux/ftrace.h> +#include <linux/percpu.h> +#include <linux/init.h> +#include <linux/list.h> + +#include <asm/cacheflush.h> +#include <asm/code-patching.h> +#include <asm/ftrace.h> +#include <asm/syscall.h> +#include <asm/inst.h> + +/* + * We generally only have a single long_branch tramp and at most 2 or 3 plt + * tramps generated. But, we don't use the plt tramps currently. We also allot + * 2 tramps after .text and .init.text. So, we only end up with around 3 usable + * tramps in total. Set aside 8 just to be sure. + */ +#define NUM_FTRACE_TRAMPS 8 +static unsigned long ftrace_tramps[NUM_FTRACE_TRAMPS]; + +static ppc_inst_t +ftrace_call_replace(unsigned long ip, unsigned long addr, int link) +{ + ppc_inst_t op; + + addr = ppc_function_entry((void *)addr); + + /* if (link) set op to 'bl' else 'b' */ + create_branch(&op, (u32 *)ip, addr, link ? BRANCH_SET_LINK : 0); + + return op; +} + +static inline int +ftrace_modify_code(unsigned long ip, ppc_inst_t old, ppc_inst_t new) +{ + ppc_inst_t replaced; + + /* + * Note: + * We are paranoid about modifying text, as if a bug was to happen, it + * could cause us to read or write to someplace that could cause harm. + * Carefully read and modify the code with probe_kernel_*(), and make + * sure what we read is what we expected it to be before modifying it. + */ + + /* read the text we want to modify */ + if (copy_inst_from_kernel_nofault(&replaced, (void *)ip)) + return -EFAULT; + + /* Make sure it is what we expect it to be */ + if (!ppc_inst_equal(replaced, old)) { + pr_err("%p: replaced (%08lx) != old (%08lx)", (void *)ip, + ppc_inst_as_ulong(replaced), ppc_inst_as_ulong(old)); + return -EINVAL; + } + + /* replace the text with the new text */ + return patch_instruction((u32 *)ip, new); +} + +/* + * Helper functions that are the same for both PPC64 and PPC32. + */ +static int test_24bit_addr(unsigned long ip, unsigned long addr) +{ + addr = ppc_function_entry((void *)addr); + + return is_offset_in_branch_range(addr - ip); +} + +static int is_bl_op(ppc_inst_t op) +{ + return (ppc_inst_val(op) & ~PPC_LI_MASK) == PPC_RAW_BL(0); +} + +static int is_b_op(ppc_inst_t op) +{ + return (ppc_inst_val(op) & ~PPC_LI_MASK) == PPC_RAW_BRANCH(0); +} + +static unsigned long find_bl_target(unsigned long ip, ppc_inst_t op) +{ + int offset; + + offset = PPC_LI(ppc_inst_val(op)); + /* make it signed */ + if (offset & 0x02000000) + offset |= 0xfe000000; + + return ip + (long)offset; +} + +#ifdef CONFIG_MODULES +static int +__ftrace_make_nop(struct module *mod, + struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned long entry, ptr, tramp; + unsigned long ip = rec->ip; + ppc_inst_t op, pop; + + /* read where this goes */ + if (copy_inst_from_kernel_nofault(&op, (void *)ip)) { + pr_err("Fetching opcode failed.\n"); + return -EFAULT; + } + + /* Make sure that this is still a 24bit jump */ + if (!is_bl_op(op)) { + pr_err("Not expected bl: opcode is %08lx\n", ppc_inst_as_ulong(op)); + return -EINVAL; + } + + /* lets find where the pointer goes */ + tramp = find_bl_target(ip, op); + + pr_devel("ip:%lx jumps to %lx", ip, tramp); + + if (module_trampoline_target(mod, tramp, &ptr)) { + pr_err("Failed to get trampoline target\n"); + return -EFAULT; + } + + pr_devel("trampoline target %lx", ptr); + + entry = ppc_global_function_entry((void *)addr); + /* This should match what was called */ + if (ptr != entry) { + pr_err("addr %lx does not match expected %lx\n", ptr, entry); + return -EINVAL; + } + + if (IS_ENABLED(CONFIG_MPROFILE_KERNEL)) { + if (copy_inst_from_kernel_nofault(&op, (void *)(ip - 4))) { + pr_err("Fetching instruction at %lx failed.\n", ip - 4); + return -EFAULT; + } + + /* We expect either a mflr r0, or a std r0, LRSAVE(r1) */ + if (!ppc_inst_equal(op, ppc_inst(PPC_RAW_MFLR(_R0))) && + !ppc_inst_equal(op, ppc_inst(PPC_INST_STD_LR))) { + pr_err("Unexpected instruction %08lx around bl _mcount\n", + ppc_inst_as_ulong(op)); + return -EINVAL; + } + } else if (IS_ENABLED(CONFIG_PPC64)) { + /* + * Check what is in the next instruction. We can see ld r2,40(r1), but + * on first pass after boot we will see mflr r0. + */ + if (copy_inst_from_kernel_nofault(&op, (void *)(ip + 4))) { + pr_err("Fetching op failed.\n"); + return -EFAULT; + } + + if (!ppc_inst_equal(op, ppc_inst(PPC_INST_LD_TOC))) { + pr_err("Expected %08lx found %08lx\n", PPC_INST_LD_TOC, + ppc_inst_as_ulong(op)); + return -EINVAL; + } + } + + /* + * When using -mprofile-kernel or PPC32 there is no load to jump over. + * + * Otherwise our original call site looks like: + * + * bl <tramp> + * ld r2,XX(r1) + * + * Milton Miller pointed out that we can not simply nop the branch. + * If a task was preempted when calling a trace function, the nops + * will remove the way to restore the TOC in r2 and the r2 TOC will + * get corrupted. + * + * Use a b +8 to jump over the load. + */ + if (IS_ENABLED(CONFIG_MPROFILE_KERNEL) || IS_ENABLED(CONFIG_PPC32)) + pop = ppc_inst(PPC_RAW_NOP()); + else + pop = ppc_inst(PPC_RAW_BRANCH(8)); /* b +8 */ + + if (patch_instruction((u32 *)ip, pop)) { + pr_err("Patching NOP failed.\n"); + return -EPERM; + } + + return 0; +} +#else +static int __ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr) +{ + return 0; +} +#endif /* CONFIG_MODULES */ + +static unsigned long find_ftrace_tramp(unsigned long ip) +{ + int i; + + /* + * We have the compiler generated long_branch tramps at the end + * and we prefer those + */ + for (i = NUM_FTRACE_TRAMPS - 1; i >= 0; i--) + if (!ftrace_tramps[i]) + continue; + else if (is_offset_in_branch_range(ftrace_tramps[i] - ip)) + return ftrace_tramps[i]; + + return 0; +} + +static int add_ftrace_tramp(unsigned long tramp) +{ + int i; + + for (i = 0; i < NUM_FTRACE_TRAMPS; i++) + if (!ftrace_tramps[i]) { + ftrace_tramps[i] = tramp; + return 0; + } + + return -1; +} + +/* + * If this is a compiler generated long_branch trampoline (essentially, a + * trampoline that has a branch to _mcount()), we re-write the branch to + * instead go to ftrace_[regs_]caller() and note down the location of this + * trampoline. + */ +static int setup_mcount_compiler_tramp(unsigned long tramp) +{ + int i; + ppc_inst_t op; + unsigned long ptr; + + /* Is this a known long jump tramp? */ + for (i = 0; i < NUM_FTRACE_TRAMPS; i++) + if (ftrace_tramps[i] == tramp) + return 0; + + /* New trampoline -- read where this goes */ + if (copy_inst_from_kernel_nofault(&op, (void *)tramp)) { + pr_debug("Fetching opcode failed.\n"); + return -1; + } + + /* Is this a 24 bit branch? */ + if (!is_b_op(op)) { + pr_debug("Trampoline is not a long branch tramp.\n"); + return -1; + } + + /* lets find where the pointer goes */ + ptr = find_bl_target(tramp, op); + + if (ptr != ppc_global_function_entry((void *)_mcount)) { + pr_debug("Trampoline target %p is not _mcount\n", (void *)ptr); + return -1; + } + + /* Let's re-write the tramp to go to ftrace_[regs_]caller */ + if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_REGS)) + ptr = ppc_global_function_entry((void *)ftrace_regs_caller); + else + ptr = ppc_global_function_entry((void *)ftrace_caller); + + if (patch_branch((u32 *)tramp, ptr, 0)) { + pr_debug("REL24 out of range!\n"); + return -1; + } + + if (add_ftrace_tramp(tramp)) { + pr_debug("No tramp locations left\n"); + return -1; + } + + return 0; +} + +static int __ftrace_make_nop_kernel(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned long tramp, ip = rec->ip; + ppc_inst_t op; + + /* Read where this goes */ + if (copy_inst_from_kernel_nofault(&op, (void *)ip)) { + pr_err("Fetching opcode failed.\n"); + return -EFAULT; + } + + /* Make sure that this is still a 24bit jump */ + if (!is_bl_op(op)) { + pr_err("Not expected bl: opcode is %08lx\n", ppc_inst_as_ulong(op)); + return -EINVAL; + } + + /* Let's find where the pointer goes */ + tramp = find_bl_target(ip, op); + + pr_devel("ip:%lx jumps to %lx", ip, tramp); + + if (setup_mcount_compiler_tramp(tramp)) { + /* Are other trampolines reachable? */ + if (!find_ftrace_tramp(ip)) { + pr_err("No ftrace trampolines reachable from %ps\n", + (void *)ip); + return -EINVAL; + } + } + + if (patch_instruction((u32 *)ip, ppc_inst(PPC_RAW_NOP()))) { + pr_err("Patching NOP failed.\n"); + return -EPERM; + } + + return 0; +} + +int ftrace_make_nop(struct module *mod, + struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned long ip = rec->ip; + ppc_inst_t old, new; + + /* + * If the calling address is more that 24 bits away, + * then we had to use a trampoline to make the call. + * Otherwise just update the call site. + */ + if (test_24bit_addr(ip, addr)) { + /* within range */ + old = ftrace_call_replace(ip, addr, 1); + new = ppc_inst(PPC_RAW_NOP()); + return ftrace_modify_code(ip, old, new); + } else if (core_kernel_text(ip)) { + return __ftrace_make_nop_kernel(rec, addr); + } else if (!IS_ENABLED(CONFIG_MODULES)) { + return -EINVAL; + } + + /* + * Out of range jumps are called from modules. + * We should either already have a pointer to the module + * or it has been passed in. + */ + if (!rec->arch.mod) { + if (!mod) { + pr_err("No module loaded addr=%lx\n", addr); + return -EFAULT; + } + rec->arch.mod = mod; + } else if (mod) { + if (mod != rec->arch.mod) { + pr_err("Record mod %p not equal to passed in mod %p\n", + rec->arch.mod, mod); + return -EINVAL; + } + /* nothing to do if mod == rec->arch.mod */ + } else + mod = rec->arch.mod; + + return __ftrace_make_nop(mod, rec, addr); +} + +#ifdef CONFIG_MODULES +/* + * Examine the existing instructions for __ftrace_make_call. + * They should effectively be a NOP, and follow formal constraints, + * depending on the ABI. Return false if they don't. + */ +static bool expected_nop_sequence(void *ip, ppc_inst_t op0, ppc_inst_t op1) +{ + if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_REGS)) + return ppc_inst_equal(op0, ppc_inst(PPC_RAW_NOP())); + else + return ppc_inst_equal(op0, ppc_inst(PPC_RAW_BRANCH(8))) && + ppc_inst_equal(op1, ppc_inst(PPC_INST_LD_TOC)); +} + +static int +__ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) +{ + ppc_inst_t op[2]; + void *ip = (void *)rec->ip; + unsigned long entry, ptr, tramp; + struct module *mod = rec->arch.mod; + + /* read where this goes */ + if (copy_inst_from_kernel_nofault(op, ip)) + return -EFAULT; + + if (!IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_REGS) && + copy_inst_from_kernel_nofault(op + 1, ip + 4)) + return -EFAULT; + + if (!expected_nop_sequence(ip, op[0], op[1])) { + pr_err("Unexpected call sequence at %p: %08lx %08lx\n", ip, + ppc_inst_as_ulong(op[0]), ppc_inst_as_ulong(op[1])); + return -EINVAL; + } + + /* If we never set up ftrace trampoline(s), then bail */ + if (!mod->arch.tramp || + (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_REGS) && !mod->arch.tramp_regs)) { + pr_err("No ftrace trampoline\n"); + return -EINVAL; + } + + if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_REGS) && rec->flags & FTRACE_FL_REGS) + tramp = mod->arch.tramp_regs; + else + tramp = mod->arch.tramp; + + if (module_trampoline_target(mod, tramp, &ptr)) { + pr_err("Failed to get trampoline target\n"); + return -EFAULT; + } + + pr_devel("trampoline target %lx", ptr); + + entry = ppc_global_function_entry((void *)addr); + /* This should match what was called */ + if (ptr != entry) { + pr_err("addr %lx does not match expected %lx\n", ptr, entry); + return -EINVAL; + } + + if (patch_branch(ip, tramp, BRANCH_SET_LINK)) { + pr_err("REL24 out of range!\n"); + return -EINVAL; + } + + return 0; +} +#else +static int __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) +{ + return 0; +} +#endif /* CONFIG_MODULES */ + +static int __ftrace_make_call_kernel(struct dyn_ftrace *rec, unsigned long addr) +{ + ppc_inst_t op; + void *ip = (void *)rec->ip; + unsigned long tramp, entry, ptr; + + /* Make sure we're being asked to patch branch to a known ftrace addr */ + entry = ppc_global_function_entry((void *)ftrace_caller); + ptr = ppc_global_function_entry((void *)addr); + + if (ptr != entry && IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_REGS)) + entry = ppc_global_function_entry((void *)ftrace_regs_caller); + + if (ptr != entry) { + pr_err("Unknown ftrace addr to patch: %ps\n", (void *)ptr); + return -EINVAL; + } + + /* Make sure we have a nop */ + if (copy_inst_from_kernel_nofault(&op, ip)) { + pr_err("Unable to read ftrace location %p\n", ip); + return -EFAULT; + } + + if (!ppc_inst_equal(op, ppc_inst(PPC_RAW_NOP()))) { + pr_err("Unexpected call sequence at %p: %08lx\n", + ip, ppc_inst_as_ulong(op)); + return -EINVAL; + } + + tramp = find_ftrace_tramp((unsigned long)ip); + if (!tramp) { + pr_err("No ftrace trampolines reachable from %ps\n", ip); + return -EINVAL; + } + + if (patch_branch(ip, tramp, BRANCH_SET_LINK)) { + pr_err("Error patching branch to ftrace tramp!\n"); + return -EINVAL; + } + + return 0; +} + +int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned long ip = rec->ip; + ppc_inst_t old, new; + + /* + * If the calling address is more that 24 bits away, + * then we had to use a trampoline to make the call. + * Otherwise just update the call site. + */ + if (test_24bit_addr(ip, addr)) { + /* within range */ + old = ppc_inst(PPC_RAW_NOP()); + new = ftrace_call_replace(ip, addr, 1); + return ftrace_modify_code(ip, old, new); + } else if (core_kernel_text(ip)) { + return __ftrace_make_call_kernel(rec, addr); + } else if (!IS_ENABLED(CONFIG_MODULES)) { + /* We should not get here without modules */ + return -EINVAL; + } + + /* + * Out of range jumps are called from modules. + * Being that we are converting from nop, it had better + * already have a module defined. + */ + if (!rec->arch.mod) { + pr_err("No module loaded\n"); + return -EINVAL; + } + + return __ftrace_make_call(rec, addr); +} + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS +#ifdef CONFIG_MODULES +static int +__ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, + unsigned long addr) +{ + ppc_inst_t op; + unsigned long ip = rec->ip; + unsigned long entry, ptr, tramp; + struct module *mod = rec->arch.mod; + + /* If we never set up ftrace trampolines, then bail */ + if (!mod->arch.tramp || !mod->arch.tramp_regs) { + pr_err("No ftrace trampoline\n"); + return -EINVAL; + } + + /* read where this goes */ + if (copy_inst_from_kernel_nofault(&op, (void *)ip)) { + pr_err("Fetching opcode failed.\n"); + return -EFAULT; + } + + /* Make sure that this is still a 24bit jump */ + if (!is_bl_op(op)) { + pr_err("Not expected bl: opcode is %08lx\n", ppc_inst_as_ulong(op)); + return -EINVAL; + } + + /* lets find where the pointer goes */ + tramp = find_bl_target(ip, op); + entry = ppc_global_function_entry((void *)old_addr); + + pr_devel("ip:%lx jumps to %lx", ip, tramp); + + if (tramp != entry) { + /* old_addr is not within range, so we must have used a trampoline */ + if (module_trampoline_target(mod, tramp, &ptr)) { + pr_err("Failed to get trampoline target\n"); + return -EFAULT; + } + + pr_devel("trampoline target %lx", ptr); + + /* This should match what was called */ + if (ptr != entry) { + pr_err("addr %lx does not match expected %lx\n", ptr, entry); + return -EINVAL; + } + } + + /* The new target may be within range */ + if (test_24bit_addr(ip, addr)) { + /* within range */ + if (patch_branch((u32 *)ip, addr, BRANCH_SET_LINK)) { + pr_err("REL24 out of range!\n"); + return -EINVAL; + } + + return 0; + } + + if (rec->flags & FTRACE_FL_REGS) + tramp = mod->arch.tramp_regs; + else + tramp = mod->arch.tramp; + + if (module_trampoline_target(mod, tramp, &ptr)) { + pr_err("Failed to get trampoline target\n"); + return -EFAULT; + } + + pr_devel("trampoline target %lx", ptr); + + entry = ppc_global_function_entry((void *)addr); + /* This should match what was called */ + if (ptr != entry) { + pr_err("addr %lx does not match expected %lx\n", ptr, entry); + return -EINVAL; + } + + if (patch_branch((u32 *)ip, tramp, BRANCH_SET_LINK)) { + pr_err("REL24 out of range!\n"); + return -EINVAL; + } + + return 0; +} +#else +static int __ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, unsigned long addr) +{ + return 0; +} +#endif + +int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, + unsigned long addr) +{ + unsigned long ip = rec->ip; + ppc_inst_t old, new; + + /* + * If the calling address is more that 24 bits away, + * then we had to use a trampoline to make the call. + * Otherwise just update the call site. + */ + if (test_24bit_addr(ip, addr) && test_24bit_addr(ip, old_addr)) { + /* within range */ + old = ftrace_call_replace(ip, old_addr, 1); + new = ftrace_call_replace(ip, addr, 1); + return ftrace_modify_code(ip, old, new); + } else if (core_kernel_text(ip)) { + /* + * We always patch out of range locations to go to the regs + * variant, so there is nothing to do here + */ + return 0; + } else if (!IS_ENABLED(CONFIG_MODULES)) { + /* We should not get here without modules */ + return -EINVAL; + } + + /* + * Out of range jumps are called from modules. + */ + if (!rec->arch.mod) { + pr_err("No module loaded\n"); + return -EINVAL; + } + + return __ftrace_modify_call(rec, old_addr, addr); +} +#endif + +int ftrace_update_ftrace_func(ftrace_func_t func) +{ + unsigned long ip = (unsigned long)(&ftrace_call); + ppc_inst_t old, new; + int ret; + + old = ppc_inst_read((u32 *)&ftrace_call); + new = ftrace_call_replace(ip, (unsigned long)func, 1); + ret = ftrace_modify_code(ip, old, new); + + /* Also update the regs callback function */ + if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_REGS) && !ret) { + ip = (unsigned long)(&ftrace_regs_call); + old = ppc_inst_read((u32 *)&ftrace_regs_call); + new = ftrace_call_replace(ip, (unsigned long)func, 1); + ret = ftrace_modify_code(ip, old, new); + } + + return ret; +} + +/* + * Use the default ftrace_modify_all_code, but without + * stop_machine(). + */ +void arch_ftrace_update_code(int command) +{ + ftrace_modify_all_code(command); +} + +#ifdef CONFIG_PPC64 +#define PACATOC offsetof(struct paca_struct, kernel_toc) + +extern unsigned int ftrace_tramp_text[], ftrace_tramp_init[]; + +void ftrace_free_init_tramp(void) +{ + int i; + + for (i = 0; i < NUM_FTRACE_TRAMPS && ftrace_tramps[i]; i++) + if (ftrace_tramps[i] == (unsigned long)ftrace_tramp_init) { + ftrace_tramps[i] = 0; + return; + } +} + +int __init ftrace_dyn_arch_init(void) +{ + int i; + unsigned int *tramp[] = { ftrace_tramp_text, ftrace_tramp_init }; + u32 stub_insns[] = { + PPC_RAW_LD(_R12, _R13, PACATOC), + PPC_RAW_ADDIS(_R12, _R12, 0), + PPC_RAW_ADDI(_R12, _R12, 0), + PPC_RAW_MTCTR(_R12), + PPC_RAW_BCTR() + }; + unsigned long addr; + long reladdr; + + if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_REGS)) + addr = ppc_global_function_entry((void *)ftrace_regs_caller); + else + addr = ppc_global_function_entry((void *)ftrace_caller); + + reladdr = addr - kernel_toc_addr(); + + if (reladdr >= SZ_2G || reladdr < -(long)SZ_2G) { + pr_err("Address of %ps out of range of kernel_toc.\n", + (void *)addr); + return -1; + } + + for (i = 0; i < 2; i++) { + memcpy(tramp[i], stub_insns, sizeof(stub_insns)); + tramp[i][1] |= PPC_HA(reladdr); + tramp[i][2] |= PPC_LO(reladdr); + add_ftrace_tramp((unsigned long)tramp[i]); + } + + return 0; +} +#endif + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + +extern void ftrace_graph_call(void); +extern void ftrace_graph_stub(void); + +static int ftrace_modify_ftrace_graph_caller(bool enable) +{ + unsigned long ip = (unsigned long)(&ftrace_graph_call); + unsigned long addr = (unsigned long)(&ftrace_graph_caller); + unsigned long stub = (unsigned long)(&ftrace_graph_stub); + ppc_inst_t old, new; + + if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_ARGS)) + return 0; + + old = ftrace_call_replace(ip, enable ? stub : addr, 0); + new = ftrace_call_replace(ip, enable ? addr : stub, 0); + + return ftrace_modify_code(ip, old, new); +} + +int ftrace_enable_ftrace_graph_caller(void) +{ + return ftrace_modify_ftrace_graph_caller(true); +} + +int ftrace_disable_ftrace_graph_caller(void) +{ + return ftrace_modify_ftrace_graph_caller(false); +} + +/* + * Hook the return address and push it in the stack of return addrs + * in current thread info. Return the address we want to divert to. + */ +static unsigned long +__prepare_ftrace_return(unsigned long parent, unsigned long ip, unsigned long sp) +{ + unsigned long return_hooker; + int bit; + + if (unlikely(ftrace_graph_is_dead())) + goto out; + + if (unlikely(atomic_read(¤t->tracing_graph_pause))) + goto out; + + bit = ftrace_test_recursion_trylock(ip, parent); + if (bit < 0) + goto out; + + return_hooker = ppc_function_entry(return_to_handler); + + if (!function_graph_enter(parent, ip, 0, (unsigned long *)sp)) + parent = return_hooker; + + ftrace_test_recursion_unlock(bit); +out: + return parent; +} + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS +void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs) +{ + fregs->regs.link = __prepare_ftrace_return(parent_ip, ip, fregs->regs.gpr[1]); +} +#else +unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip, + unsigned long sp) +{ + return __prepare_ftrace_return(parent, ip, sp); +} +#endif +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + +#ifdef CONFIG_PPC64_ELF_ABI_V1 +char *arch_ftrace_match_adjust(char *str, const char *search) +{ + if (str[0] == '.' && search[0] != '.') + return str + 1; + else + return str; +} +#endif /* CONFIG_PPC64_ELF_ABI_V1 */ diff --git a/arch/powerpc/kernel/trace/ftrace_64_pg.S b/arch/powerpc/kernel/trace/ftrace_64_pg_entry.S index 6708e24db0ab..a8a7f28404c8 100644 --- a/arch/powerpc/kernel/trace/ftrace_64_pg.S +++ b/arch/powerpc/kernel/trace/ftrace_64_pg_entry.S @@ -3,12 +3,12 @@ * Split from ftrace_64.S */ +#include <linux/export.h> #include <linux/magic.h> #include <asm/ppc_asm.h> #include <asm/asm-offsets.h> #include <asm/ftrace.h> #include <asm/ppc-opcode.h> -#include <asm/export.h> _GLOBAL_TOC(ftrace_caller) lbz r3, PACA_FTRACE_ENABLED(r13) @@ -65,3 +65,68 @@ _GLOBAL(ftrace_graph_caller) addi r1, r1, 112 blr #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + +.pushsection ".tramp.ftrace.text","aw",@progbits; +.globl ftrace_tramp_text +ftrace_tramp_text: + .space 32 +.popsection + +.pushsection ".tramp.ftrace.init","aw",@progbits; +.globl ftrace_tramp_init +ftrace_tramp_init: + .space 32 +.popsection + +_GLOBAL(mcount) +_GLOBAL(_mcount) +EXPORT_SYMBOL(_mcount) + mflr r12 + mtctr r12 + mtlr r0 + bctr + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +_GLOBAL(return_to_handler) + /* need to save return values */ +#ifdef CONFIG_PPC64 + std r4, -32(r1) + std r3, -24(r1) + /* save TOC */ + std r2, -16(r1) + std r31, -8(r1) + mr r31, r1 + stdu r1, -112(r1) + + /* + * We might be called from a module. + * Switch to our TOC to run inside the core kernel. + */ + LOAD_PACA_TOC() +#else + stwu r1, -16(r1) + stw r3, 8(r1) + stw r4, 12(r1) +#endif + + bl ftrace_return_to_handler + nop + + /* return value has real return address */ + mtlr r3 + +#ifdef CONFIG_PPC64 + ld r1, 0(r1) + ld r4, -32(r1) + ld r3, -24(r1) + ld r2, -16(r1) + ld r31, -8(r1) +#else + lwz r3, 8(r1) + lwz r4, 12(r1) + addi r1, r1, 16 +#endif + + /* Jump back to real return address */ + blr +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S b/arch/powerpc/kernel/trace/ftrace_entry.S index f9fd5f743eba..76dbe9fd2c0f 100644 --- a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S +++ b/arch/powerpc/kernel/trace/ftrace_entry.S @@ -3,12 +3,12 @@ * Split from ftrace_64.S */ +#include <linux/export.h> #include <linux/magic.h> #include <asm/ppc_asm.h> #include <asm/asm-offsets.h> #include <asm/ftrace.h> #include <asm/ppc-opcode.h> -#include <asm/export.h> #include <asm/thread_info.h> #include <asm/bug.h> #include <asm/ptrace.h> @@ -32,53 +32,74 @@ * Our job is to save the register state into a struct pt_regs (on the stack) * and then arrange for the ftrace function to be called. */ -_GLOBAL(ftrace_regs_caller) - /* Save the original return address in A's stack frame */ - std r0,LRSAVE(r1) +.macro ftrace_regs_entry allregs + /* Create a minimal stack frame for representing B */ + PPC_STLU r1, -STACK_FRAME_MIN_SIZE(r1) /* Create our stack frame + pt_regs */ - stdu r1,-SWITCH_FRAME_SIZE(r1) + PPC_STLU r1,-SWITCH_FRAME_SIZE(r1) /* Save all gprs to pt_regs */ SAVE_GPR(0, r1) - SAVE_10GPRS(2, r1) + SAVE_GPRS(3, 10, r1) +#ifdef CONFIG_PPC64 + /* Save the original return address in A's stack frame */ + std r0, LRSAVE+SWITCH_FRAME_SIZE+STACK_FRAME_MIN_SIZE(r1) /* Ok to continue? */ lbz r3, PACA_FTRACE_ENABLED(r13) cmpdi r3, 0 beq ftrace_no_trace +#endif - SAVE_10GPRS(12, r1) - SAVE_10GPRS(22, r1) + .if \allregs == 1 + SAVE_GPR(2, r1) + SAVE_GPRS(11, 31, r1) + .else +#ifdef CONFIG_LIVEPATCH_64 + SAVE_GPR(14, r1) +#endif + .endif /* Save previous stack pointer (r1) */ - addi r8, r1, SWITCH_FRAME_SIZE - std r8, GPR1(r1) + addi r8, r1, SWITCH_FRAME_SIZE+STACK_FRAME_MIN_SIZE + PPC_STL r8, GPR1(r1) + .if \allregs == 1 /* Load special regs for save below */ mfmsr r8 mfctr r9 mfxer r10 mfcr r11 + .else + /* Clear MSR to flag as ftrace_caller versus frace_regs_caller */ + li r8, 0 + .endif /* Get the _mcount() call site out of LR */ mflr r7 /* Save it as pt_regs->nip */ - std r7, _NIP(r1) + PPC_STL r7, _NIP(r1) + /* Also save it in B's stackframe header for proper unwind */ + PPC_STL r7, LRSAVE+SWITCH_FRAME_SIZE(r1) /* Save the read LR in pt_regs->link */ - std r0, _LINK(r1) + PPC_STL r0, _LINK(r1) +#ifdef CONFIG_PPC64 /* Save callee's TOC in the ABI compliant location */ - std r2, 24(r1) - ld r2,PACATOC(r13) /* get kernel TOC in r2 */ - - addis r3,r2,function_trace_op@toc@ha - addi r3,r3,function_trace_op@toc@l + std r2, STK_GOT(r1) + LOAD_PACA_TOC() /* get kernel TOC in r2 */ + LOAD_REG_ADDR(r3, function_trace_op) ld r5,0(r3) +#else + lis r3,function_trace_op@ha + lwz r5,function_trace_op@l(r3) +#endif -#ifdef CONFIG_LIVEPATCH - mr r14,r7 /* remember old NIP */ +#ifdef CONFIG_LIVEPATCH_64 + mr r14, r7 /* remember old NIP */ #endif + /* Calculate ip from nip-4 into r3 for call below */ subi r3, r7, MCOUNT_INSN_SIZE @@ -86,130 +107,85 @@ _GLOBAL(ftrace_regs_caller) mr r4, r0 /* Save special regs */ - std r8, _MSR(r1) - std r9, _CTR(r1) - std r10, _XER(r1) - std r11, _CCR(r1) + PPC_STL r8, _MSR(r1) + .if \allregs == 1 + PPC_STL r9, _CTR(r1) + PPC_STL r10, _XER(r1) + PPC_STL r11, _CCR(r1) + .endif /* Load &pt_regs in r6 for call below */ - addi r6, r1 ,STACK_FRAME_OVERHEAD - - /* ftrace_call(r3, r4, r5, r6) */ -.globl ftrace_regs_call -ftrace_regs_call: - bl ftrace_stub - nop + addi r6, r1, STACK_INT_FRAME_REGS +.endm +.macro ftrace_regs_exit allregs /* Load ctr with the possibly modified NIP */ - ld r3, _NIP(r1) + PPC_LL r3, _NIP(r1) mtctr r3 -#ifdef CONFIG_LIVEPATCH + +#ifdef CONFIG_LIVEPATCH_64 cmpd r14, r3 /* has NIP been altered? */ #endif /* Restore gprs */ - REST_GPR(0,r1) - REST_10GPRS(2,r1) - REST_10GPRS(12,r1) - REST_10GPRS(22,r1) + .if \allregs == 1 + REST_GPRS(2, 31, r1) + .else + REST_GPRS(3, 10, r1) +#ifdef CONFIG_LIVEPATCH_64 + REST_GPR(14, r1) +#endif + .endif /* Restore possibly modified LR */ - ld r0, _LINK(r1) + PPC_LL r0, _LINK(r1) mtlr r0 +#ifdef CONFIG_PPC64 /* Restore callee's TOC */ - ld r2, 24(r1) + ld r2, STK_GOT(r1) +#endif /* Pop our stack frame */ - addi r1, r1, SWITCH_FRAME_SIZE + addi r1, r1, SWITCH_FRAME_SIZE+STACK_FRAME_MIN_SIZE -#ifdef CONFIG_LIVEPATCH +#ifdef CONFIG_LIVEPATCH_64 /* Based on the cmpd above, if the NIP was altered handle livepatch */ bne- livepatch_handler #endif + bctr /* jump after _mcount site */ +.endm -ftrace_caller_common: -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -.globl ftrace_graph_call -ftrace_graph_call: - b ftrace_graph_stub -_GLOBAL(ftrace_graph_stub) -#endif +_GLOBAL(ftrace_regs_caller) + ftrace_regs_entry 1 + /* ftrace_call(r3, r4, r5, r6) */ +.globl ftrace_regs_call +ftrace_regs_call: + bl ftrace_stub + ftrace_regs_exit 1 - bctr /* jump after _mcount site */ +_GLOBAL(ftrace_caller) + ftrace_regs_entry 0 + /* ftrace_call(r3, r4, r5, r6) */ +.globl ftrace_call +ftrace_call: + bl ftrace_stub + ftrace_regs_exit 0 _GLOBAL(ftrace_stub) blr +#ifdef CONFIG_PPC64 ftrace_no_trace: mflr r3 mtctr r3 REST_GPR(3, r1) - addi r1, r1, SWITCH_FRAME_SIZE + addi r1, r1, SWITCH_FRAME_SIZE+STACK_FRAME_MIN_SIZE mtlr r0 bctr +#endif -_GLOBAL(ftrace_caller) - /* Save the original return address in A's stack frame */ - std r0, LRSAVE(r1) - - /* Create our stack frame + pt_regs */ - stdu r1, -SWITCH_FRAME_SIZE(r1) - - /* Save all gprs to pt_regs */ - SAVE_8GPRS(3, r1) - - lbz r3, PACA_FTRACE_ENABLED(r13) - cmpdi r3, 0 - beq ftrace_no_trace - - /* Get the _mcount() call site out of LR */ - mflr r7 - std r7, _NIP(r1) - - /* Save callee's TOC in the ABI compliant location */ - std r2, 24(r1) - ld r2, PACATOC(r13) /* get kernel TOC in r2 */ - - addis r3, r2, function_trace_op@toc@ha - addi r3, r3, function_trace_op@toc@l - ld r5, 0(r3) - - /* Calculate ip from nip-4 into r3 for call below */ - subi r3, r7, MCOUNT_INSN_SIZE - - /* Put the original return address in r4 as parent_ip */ - mr r4, r0 - - /* Set pt_regs to NULL */ - li r6, 0 - - /* ftrace_call(r3, r4, r5, r6) */ -.globl ftrace_call -ftrace_call: - bl ftrace_stub - nop - - ld r3, _NIP(r1) - mtctr r3 - - /* Restore gprs */ - REST_8GPRS(3,r1) - - /* Restore callee's TOC */ - ld r2, 24(r1) - - /* Pop our stack frame */ - addi r1, r1, SWITCH_FRAME_SIZE - - /* Reload original LR */ - ld r0, LRSAVE(r1) - mtlr r0 - - /* Handle function_graph or go back */ - b ftrace_caller_common - -#ifdef CONFIG_LIVEPATCH +#ifdef CONFIG_LIVEPATCH_64 /* * This function runs in the mcount context, between two functions. As * such it can only clobber registers which are volatile and used in @@ -277,54 +253,69 @@ livepatch_handler: blr #endif /* CONFIG_LIVEPATCH */ +#ifndef CONFIG_ARCH_USING_PATCHABLE_FUNCTION_ENTRY +_GLOBAL(mcount) +_GLOBAL(_mcount) +EXPORT_SYMBOL(_mcount) + mflr r12 + mtctr r12 + mtlr r0 + bctr +#endif + #ifdef CONFIG_FUNCTION_GRAPH_TRACER -_GLOBAL(ftrace_graph_caller) +_GLOBAL(return_to_handler) + /* need to save return values */ +#ifdef CONFIG_PPC64 + std r4, -32(r1) + std r3, -24(r1) + /* save TOC */ + std r2, -16(r1) + std r31, -8(r1) + mr r31, r1 stdu r1, -112(r1) - /* with -mprofile-kernel, parameter regs are still alive at _mcount */ - std r10, 104(r1) - std r9, 96(r1) - std r8, 88(r1) - std r7, 80(r1) - std r6, 72(r1) - std r5, 64(r1) - std r4, 56(r1) - std r3, 48(r1) - - /* Save callee's TOC in the ABI compliant location */ - std r2, 24(r1) - ld r2, PACATOC(r13) /* get kernel TOC in r2 */ - addi r5, r1, 112 - mfctr r4 /* ftrace_caller has moved local addr here */ - std r4, 40(r1) - mflr r3 /* ftrace_caller has restored LR from stack */ - subi r4, r4, MCOUNT_INSN_SIZE + /* + * We might be called from a module. + * Switch to our TOC to run inside the core kernel. + */ + LOAD_PACA_TOC() +#else + stwu r1, -16(r1) + stw r3, 8(r1) + stw r4, 12(r1) +#endif - bl prepare_ftrace_return + bl ftrace_return_to_handler nop - /* - * prepare_ftrace_return gives us the address we divert to. - * Change the LR to this. - */ + /* return value has real return address */ mtlr r3 - ld r0, 40(r1) - mtctr r0 - ld r10, 104(r1) - ld r9, 96(r1) - ld r8, 88(r1) - ld r7, 80(r1) - ld r6, 72(r1) - ld r5, 64(r1) - ld r4, 56(r1) - ld r3, 48(r1) - - /* Restore callee's TOC */ - ld r2, 24(r1) +#ifdef CONFIG_PPC64 + ld r1, 0(r1) + ld r4, -32(r1) + ld r3, -24(r1) + ld r2, -16(r1) + ld r31, -8(r1) +#else + lwz r3, 8(r1) + lwz r4, 12(r1) + addi r1, r1, 16 +#endif - addi r1, r1, 112 - mflr r0 - std r0, LRSAVE(r1) - bctr + /* Jump back to real return address */ + blr #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + +.pushsection ".tramp.ftrace.text","aw",@progbits; +.globl ftrace_tramp_text +ftrace_tramp_text: + .space 32 +.popsection + +.pushsection ".tramp.ftrace.init","aw",@progbits; +.globl ftrace_tramp_init +ftrace_tramp_init: + .space 32 +.popsection diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 014ff0701f24..11e062b47d3f 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -37,11 +37,11 @@ #include <linux/smp.h> #include <linux/console.h> #include <linux/kmsg_dump.h> +#include <linux/debugfs.h> #include <asm/emulated_ops.h> -#include <asm/pgtable.h> #include <linux/uaccess.h> -#include <asm/debugfs.h> +#include <asm/interrupt.h> #include <asm/io.h> #include <asm/machdep.h> #include <asm/rtas.h> @@ -53,7 +53,6 @@ #ifdef CONFIG_PPC64 #include <asm/firmware.h> #include <asm/processor.h> -#include <asm/tm.h> #endif #include <asm/kexec.h> #include <asm/ppc-opcode.h> @@ -68,6 +67,8 @@ #include <asm/kprobes.h> #include <asm/stacktrace.h> #include <asm/nmi.h> +#include <asm/disassemble.h> +#include <asm/udbg.h> #if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC_CORE) int (*__debugger)(struct pt_regs *regs) __read_mostly; @@ -156,7 +157,7 @@ static int die_owner = -1; static unsigned int die_nest_count; static int die_counter; -extern void panic_flush_kmsg_start(void) +void panic_flush_kmsg_start(void) { /* * These are mostly taken from kernel/panic.c, but tries to do @@ -169,9 +170,8 @@ extern void panic_flush_kmsg_start(void) bust_spinlocks(1); } -extern void panic_flush_kmsg_end(void) +void panic_flush_kmsg_end(void) { - printk_safe_flush_on_panic(); kmsg_dump(KMSG_DUMP_PANIC); bust_spinlocks(0); debug_locks_off(); @@ -222,7 +222,7 @@ static void oops_end(unsigned long flags, struct pt_regs *regs, /* * system_reset_excption handles debugger, crash dump, panic, for 0x100 */ - if (TRAP(regs) == 0x100) + if (TRAP(regs) == INTERRUPT_SYSTEM_RESET) return; crash_fadump(regs, "die oops"); @@ -246,7 +246,7 @@ static void oops_end(unsigned long flags, struct pt_regs *regs, if (panic_on_oops) panic("Fatal exception"); - do_exit(signr); + make_task_dead(signr); } NOKPROBE_SYMBOL(oops_end); @@ -290,7 +290,7 @@ void die(const char *str, struct pt_regs *regs, long err) /* * system_reset_excption handles debugger, crash dump, panic, for 0x100 */ - if (TRAP(regs) != 0x100) { + if (TRAP(regs) != INTERRUPT_SYSTEM_RESET) { if (debugger(regs)) return; } @@ -341,19 +341,19 @@ static bool exception_common(int signr, struct pt_regs *regs, int code, return false; } - show_signal_msg(signr, regs, code, addr); + /* + * Must not enable interrupts even for user-mode exception, because + * this can be called from machine check, which may be a NMI or IRQ + * which don't like interrupts being enabled. Could check for + * in_hardirq || in_nmi perhaps, but there doesn't seem to be a good + * reason why _exception() should enable irqs for an exception handler, + * the handlers themselves do that directly. + */ - if (arch_irqs_disabled() && !arch_irq_disabled_regs(regs)) - local_irq_enable(); + show_signal_msg(signr, regs, code, addr); current->thread.trap_nr = code; - /* - * Save all the pkey registers AMR/IAMR/UAMOR. Eg: Core dumps need - * to capture the content, if the task gets killed. - */ - thread_pkey_regs_save(¤t->thread); - return true; } @@ -394,7 +394,7 @@ void _exception(int signr, struct pt_regs *regs, int code, unsigned long addr) * Builds that do not support KVM could take this second option to increase * the recoverability of NMIs. */ -void hv_nmi_check_nonrecoverable(struct pt_regs *regs) +noinstr void hv_nmi_check_nonrecoverable(struct pt_regs *regs) { #ifdef CONFIG_PPC_POWERNV unsigned long kbase = (unsigned long)_stext; @@ -411,7 +411,7 @@ void hv_nmi_check_nonrecoverable(struct pt_regs *regs) * Now test if the interrupt has hit a range that may be using * HSPRG1 without having RI=0 (i.e., an HSRR interrupt). The * problem ranges all run un-relocated. Test real and virt modes - * at the same time by droping the high bit of the nip (virt mode + * at the same time by dropping the high bit of the nip (virt mode * entry points still have the +0x4000 offset). */ nip &= ~0xc000000000000000ULL; @@ -435,23 +435,16 @@ void hv_nmi_check_nonrecoverable(struct pt_regs *regs) nonrecoverable: regs->msr &= ~MSR_RI; + local_paca->hsrr_valid = 0; + local_paca->srr_valid = 0; #endif } - -void system_reset_exception(struct pt_regs *regs) +DEFINE_INTERRUPT_HANDLER_NMI(system_reset_exception) { unsigned long hsrr0, hsrr1; - bool nested = in_nmi(); bool saved_hsrrs = false; /* - * Avoid crashes in case of nested NMI exceptions. Recoverability - * is determined by RI and in_nmi - */ - if (!nested) - nmi_enter(); - - /* * System reset can interrupt code where HSRRs are live and MSR[RI]=1. * The system reset interrupt itself may clobber HSRRs (e.g., to call * OPAL), so save them here and restore them before returning. @@ -510,21 +503,23 @@ out: #ifdef CONFIG_PPC_BOOK3S_64 BUG_ON(get_paca()->in_nmi == 0); if (get_paca()->in_nmi > 1) - nmi_panic(regs, "Unrecoverable nested System Reset"); + die("Unrecoverable nested System Reset", regs, SIGABRT); #endif /* Must die if the interrupt is not recoverable */ - if (!(regs->msr & MSR_RI)) - nmi_panic(regs, "Unrecoverable System Reset"); + if (regs_is_unrecoverable(regs)) { + /* For the reason explained in die_mce, nmi_exit before die */ + nmi_exit(); + die("Unrecoverable System Reset", regs, SIGABRT); + } if (saved_hsrrs) { mtspr(SPRN_HSRR0, hsrr0); mtspr(SPRN_HSRR1, hsrr1); } - if (!nested) - nmi_exit(); - /* What should we do here? We could issue a shutdown or hard reset. */ + + return 0; } /* @@ -532,9 +527,6 @@ out: * Check if the NIP corresponds to the address of a sync * instruction for which there is an entry in the exception * table. - * Note that the 601 only takes a machine check on TEA - * (transfer error ack) signal assertion, and does not - * set any of the top 16 bits of SRR1. * -- paulus. */ static inline int check_io_access(struct pt_regs *regs) @@ -554,11 +546,11 @@ static inline int check_io_access(struct pt_regs *regs) * For the debug message, we look at the preceding * load or store. */ - if (*nip == PPC_INST_NOP) + if (*nip == PPC_RAW_NOP()) nip -= 2; - else if (*nip == PPC_INST_ISYNC) + else if (*nip == PPC_RAW_ISYNC()) --nip; - if (*nip == PPC_INST_SYNC || (*nip >> 26) == OP_TRAP) { + if (*nip == PPC_RAW_SYNC() || get_op(*nip) == OP_TRAP) { unsigned int rb; --nip; @@ -566,8 +558,8 @@ static inline int check_io_access(struct pt_regs *regs) printk(KERN_DEBUG "%s bad port %lx at %p\n", (*nip & 0x100)? "OUT to": "IN from", regs->gpr[rb] - _IO_BASE, nip); - regs->msr |= MSR_RI; - regs->nip = extable_fixup(entry); + regs_set_recoverable(regs); + regs_set_return_ip(regs, extable_fixup(entry)); return 1; } } @@ -578,11 +570,13 @@ static inline int check_io_access(struct pt_regs *regs) #ifdef CONFIG_PPC_ADV_DEBUG_REGS /* On 4xx, the reason for the machine check or program exception is in the ESR. */ -#define get_reason(regs) ((regs)->dsisr) +#define get_reason(regs) ((regs)->esr) #define REASON_FP ESR_FP #define REASON_ILLEGAL (ESR_PIL | ESR_PUO) #define REASON_PRIVILEGED ESR_PPR #define REASON_TRAP ESR_PTR +#define REASON_PREFIXED 0 +#define REASON_BOUNDARY 0 /* single-step stuff */ #define single_stepping(regs) (current->thread.debug.dbcr0 & DBCR0_IC) @@ -597,13 +591,17 @@ static inline int check_io_access(struct pt_regs *regs) #define REASON_ILLEGAL SRR1_PROGILL #define REASON_PRIVILEGED SRR1_PROGPRIV #define REASON_TRAP SRR1_PROGTRAP +#define REASON_PREFIXED SRR1_PREFIXED +#define REASON_BOUNDARY SRR1_BOUNDARY #define single_stepping(regs) ((regs)->msr & MSR_SE) -#define clear_single_step(regs) ((regs)->msr &= ~MSR_SE) -#define clear_br_trace(regs) ((regs)->msr &= ~MSR_BE) +#define clear_single_step(regs) (regs_set_return_msr((regs), (regs)->msr & ~MSR_SE)) +#define clear_br_trace(regs) (regs_set_return_msr((regs), (regs)->msr & ~MSR_BE)) #endif -#if defined(CONFIG_E500) +#define inst_length(reason) (((reason) & REASON_PREFIXED) ? 8 : 4) + +#if defined(CONFIG_PPC_E500) int machine_check_e500mc(struct pt_regs *regs) { unsigned long mcsr = mfspr(SPRN_MCSR); @@ -757,31 +755,6 @@ int machine_check_generic(struct pt_regs *regs) { return 0; } -#elif defined(CONFIG_E200) -int machine_check_e200(struct pt_regs *regs) -{ - unsigned long reason = mfspr(SPRN_MCSR); - - printk("Machine check in kernel mode.\n"); - printk("Caused by (from MCSR=%lx): ", reason); - - if (reason & MCSR_MCP) - pr_cont("Machine Check Signal\n"); - if (reason & MCSR_CP_PERR) - pr_cont("Cache Push Parity Error\n"); - if (reason & MCSR_CPERR) - pr_cont("Cache Parity Error\n"); - if (reason & MCSR_EXCP_ERR) - pr_cont("ISI, ITLB, or Bus Error on first instruction fetch for an exception handler\n"); - if (reason & MCSR_BUS_IRERR) - pr_cont("Bus - Read Bus Error on instruction fetch\n"); - if (reason & MCSR_BUS_DRERR) - pr_cont("Bus - Read Bus Error on data load\n"); - if (reason & MCSR_BUS_WRERR) - pr_cont("Bus - Write Bus Error on buffered store or cache line push\n"); - - return 0; -} #elif defined(CONFIG_PPC32) int machine_check_generic(struct pt_regs *regs) { @@ -793,7 +766,6 @@ int machine_check_generic(struct pt_regs *regs) case 0x80000: pr_cont("Machine check signal\n"); break; - case 0: /* for 601 */ case 0x40000: case 0x140000: /* 7450 MSS error and TEA */ pr_cont("Transfer error ack signal\n"); @@ -820,12 +792,31 @@ int machine_check_generic(struct pt_regs *regs) } #endif /* everything else */ -void machine_check_exception(struct pt_regs *regs) +void die_mce(const char *str, struct pt_regs *regs, long err) +{ + /* + * The machine check wants to kill the interrupted context, + * but make_task_dead() checks for in_interrupt() and panics + * in that case, so exit the irq/nmi before calling die. + */ + if (in_nmi()) + nmi_exit(); + else + irq_exit(); + die(str, regs, err); +} + +/* + * BOOK3S_64 does not usually call this handler as a non-maskable interrupt + * (it uses its own early real-mode handler to handle the MCE proper + * and then raises irq_work to call this handler when interrupts are + * enabled). The only time when this is not true is if the early handler + * is unrecoverable, then it does call this directly to try to get a + * message out. + */ +static void __machine_check_exception(struct pt_regs *regs) { int recover = 0; - bool nested = in_nmi(); - if (!nested) - nmi_enter(); __this_cpu_inc(irq_stat.mce_exceptions); @@ -851,23 +842,41 @@ void machine_check_exception(struct pt_regs *regs) if (check_io_access(regs)) goto bail; - if (!nested) - nmi_exit(); - - die("Machine check", regs, SIGBUS); + die_mce("Machine check", regs, SIGBUS); +bail: /* Must die if the interrupt is not recoverable */ - if (!(regs->msr & MSR_RI)) - nmi_panic(regs, "Unrecoverable Machine check"); + if (regs_is_unrecoverable(regs)) + die_mce("Unrecoverable Machine check", regs, SIGBUS); +} - return; +#ifdef CONFIG_PPC_BOOK3S_64 +DEFINE_INTERRUPT_HANDLER_RAW(machine_check_early_boot) +{ + udbg_printf("Machine check (early boot)\n"); + udbg_printf("SRR0=0x%016lx SRR1=0x%016lx\n", regs->nip, regs->msr); + udbg_printf(" DAR=0x%016lx DSISR=0x%08lx\n", regs->dar, regs->dsisr); + udbg_printf(" LR=0x%016lx R1=0x%08lx\n", regs->link, regs->gpr[1]); + udbg_printf("------\n"); + die("Machine check (early boot)", regs, SIGBUS); + for (;;) + ; + return 0; +} -bail: - if (!nested) - nmi_exit(); +DEFINE_INTERRUPT_HANDLER_ASYNC(machine_check_exception_async) +{ + __machine_check_exception(regs); +} +#endif +DEFINE_INTERRUPT_HANDLER_NMI(machine_check_exception) +{ + __machine_check_exception(regs); + + return 0; } -void SMIException(struct pt_regs *regs) +DEFINE_INTERRUPT_HANDLER(SMIException) /* async? */ { die("System Management Interrupt", regs, SIGABRT); } @@ -877,11 +886,11 @@ static void p9_hmi_special_emu(struct pt_regs *regs) { unsigned int ra, rb, t, i, sel, instr, rc; const void __user *addr; - u8 vbuf[16], *vdst; + u8 vbuf[16] __aligned(16), *vdst; unsigned long ea, msr, msr_mask; bool swap; - if (__get_user_inatomic(instr, (unsigned int __user *)regs->nip)) + if (__get_user(instr, (unsigned int __user *)regs->nip)) return; /* @@ -1049,16 +1058,15 @@ static void p9_hmi_special_emu(struct pt_regs *regs) #endif /* !__LITTLE_ENDIAN__ */ /* Go to next instruction */ - regs->nip += 4; + regs_add_return_ip(regs, 4); } #endif /* CONFIG_VSX */ -void handle_hmi_exception(struct pt_regs *regs) +DEFINE_INTERRUPT_HANDLER_ASYNC(handle_hmi_exception) { struct pt_regs *old_regs; old_regs = set_irq_regs(regs); - irq_enter(); #ifdef CONFIG_VSX /* Real mode flagged P9 special emu is needed */ @@ -1078,46 +1086,52 @@ void handle_hmi_exception(struct pt_regs *regs) if (ppc_md.handle_hmi_exception) ppc_md.handle_hmi_exception(regs); - irq_exit(); set_irq_regs(old_regs); } -void unknown_exception(struct pt_regs *regs) +DEFINE_INTERRUPT_HANDLER(unknown_exception) { - enum ctx_state prev_state = exception_enter(); - printk("Bad trap at PC: %lx, SR: %lx, vector=%lx\n", regs->nip, regs->msr, regs->trap); _exception(SIGTRAP, regs, TRAP_UNK, 0); +} + +DEFINE_INTERRUPT_HANDLER_ASYNC(unknown_async_exception) +{ + printk("Bad trap at PC: %lx, SR: %lx, vector=%lx\n", + regs->nip, regs->msr, regs->trap); - exception_exit(prev_state); + _exception(SIGTRAP, regs, TRAP_UNK, 0); } -void instruction_breakpoint_exception(struct pt_regs *regs) +DEFINE_INTERRUPT_HANDLER_NMI(unknown_nmi_exception) { - enum ctx_state prev_state = exception_enter(); + printk("Bad trap at PC: %lx, SR: %lx, vector=%lx\n", + regs->nip, regs->msr, regs->trap); + _exception(SIGTRAP, regs, TRAP_UNK, 0); + + return 0; +} + +DEFINE_INTERRUPT_HANDLER(instruction_breakpoint_exception) +{ if (notify_die(DIE_IABR_MATCH, "iabr_match", regs, 5, 5, SIGTRAP) == NOTIFY_STOP) - goto bail; + return; if (debugger_iabr_match(regs)) - goto bail; + return; _exception(SIGTRAP, regs, TRAP_BRKPT, regs->nip); - -bail: - exception_exit(prev_state); } -void RunModeException(struct pt_regs *regs) +DEFINE_INTERRUPT_HANDLER(RunModeException) { _exception(SIGTRAP, regs, TRAP_UNK, 0); } -void single_step_exception(struct pt_regs *regs) +static void __single_step_exception(struct pt_regs *regs) { - enum ctx_state prev_state = exception_enter(); - clear_single_step(regs); clear_br_trace(regs); @@ -1126,16 +1140,17 @@ void single_step_exception(struct pt_regs *regs) if (notify_die(DIE_SSTEP, "single_step", regs, 5, 5, SIGTRAP) == NOTIFY_STOP) - goto bail; + return; if (debugger_sstep(regs)) - goto bail; + return; _exception(SIGTRAP, regs, TRAP_TRACE, regs->nip); +} -bail: - exception_exit(prev_state); +DEFINE_INTERRUPT_HANDLER(single_step_exception) +{ + __single_step_exception(regs); } -NOKPROBE_SYMBOL(single_step_exception); /* * After we have successfully emulated an instruction, we have to @@ -1143,12 +1158,13 @@ NOKPROBE_SYMBOL(single_step_exception); * pretend we got a single-step exception. This was pointed out * by Kumar Gala. -- paulus */ -static void emulate_single_step(struct pt_regs *regs) +void emulate_single_step(struct pt_regs *regs) { if (single_stepping(regs)) - single_step_exception(regs); + __single_step_exception(regs); } +#ifdef CONFIG_PPC_FPU_REGS static inline int __parse_fpscr(unsigned long fpscr) { int ret = FPE_FLTUNK; @@ -1175,6 +1191,7 @@ static inline int __parse_fpscr(unsigned long fpscr) return ret; } +#endif static void parse_fpe(struct pt_regs *regs) { @@ -1182,7 +1199,9 @@ static void parse_fpe(struct pt_regs *regs) flush_fp_to_thread(current); +#ifdef CONFIG_PPC_FPU_REGS code = __parse_fpscr(current->thread.fp_state.fpscr); +#endif _exception(SIGFPE, regs, code, regs->nip); } @@ -1333,7 +1352,6 @@ static int emulate_instruction(struct pt_regs *regs) if (!user_mode(regs)) return -EINVAL; - CHECK_FULL_REGS(regs); if (get_user(instword, (u32 __user *)(regs->nip))) return -EFAULT; @@ -1421,16 +1439,17 @@ static int emulate_instruction(struct pt_regs *regs) return -EINVAL; } +#ifdef CONFIG_GENERIC_BUG int is_valid_bugaddr(unsigned long addr) { return is_kernel_addr(addr); } +#endif #ifdef CONFIG_MATH_EMULATION static int emulate_math(struct pt_regs *regs) { int ret; - extern int do_mathemu(struct pt_regs *regs); ret = do_mathemu(regs); if (ret >= 0) @@ -1457,9 +1476,8 @@ static int emulate_math(struct pt_regs *regs) static inline int emulate_math(struct pt_regs *regs) { return -1; } #endif -void program_check_exception(struct pt_regs *regs) +static void do_program_check(struct pt_regs *regs) { - enum ctx_state prev_state = exception_enter(); unsigned int reason = get_reason(regs); /* We can now get here via a FP Unavailable exception if the core @@ -1468,22 +1486,22 @@ void program_check_exception(struct pt_regs *regs) if (reason & REASON_FP) { /* IEEE FP exception */ parse_fpe(regs); - goto bail; + return; } if (reason & REASON_TRAP) { unsigned long bugaddr; /* Debugger is first in line to stop recursive faults in * rcu_lock, notify_die, or atomic_notifier_call_chain */ if (debugger_bpt(regs)) - goto bail; + return; if (kprobe_handler(regs)) - goto bail; + return; /* trap exception */ if (notify_die(DIE_BPT, "breakpoint", regs, 5, 5, SIGTRAP) == NOTIFY_STOP) - goto bail; + return; bugaddr = regs->nip; /* @@ -1494,11 +1512,15 @@ void program_check_exception(struct pt_regs *regs) if (!(regs->msr & MSR_PR) && /* not user-mode */ report_bug(bugaddr, regs) == BUG_TRAP_TYPE_WARN) { - regs->nip += 4; - goto bail; + regs_add_return_ip(regs, 4); + return; + } + + /* User mode considers other cases after enabling IRQs */ + if (!user_mode(regs)) { + _exception(SIGTRAP, regs, TRAP_BRKPT, regs->nip); + return; } - _exception(SIGTRAP, regs, TRAP_BRKPT, regs->nip); - goto bail; } #ifdef CONFIG_PPC_TRANSACTIONAL_MEM if (reason & REASON_TM) { @@ -1519,7 +1541,7 @@ void program_check_exception(struct pt_regs *regs) */ if (user_mode(regs)) { _exception(SIGILL, regs, ILL_ILLOPN, regs->nip); - goto bail; + return; } else { printk(KERN_EMERG "Unexpected TM Bad Thing exception " "at %lx (msr 0x%lx) tm_scratch=%llx\n", @@ -1531,17 +1553,43 @@ void program_check_exception(struct pt_regs *regs) /* * If we took the program check in the kernel skip down to sending a - * SIGILL. The subsequent cases all relate to emulating instructions - * which we should only do for userspace. We also do not want to enable - * interrupts for kernel faults because that might lead to further - * faults, and loose the context of the original exception. + * SIGILL. The subsequent cases all relate to user space, such as + * emulating instructions which we should only do for user space. We + * also do not want to enable interrupts for kernel faults because that + * might lead to further faults, and loose the context of the original + * exception. */ if (!user_mode(regs)) goto sigill; - /* We restore the interrupt state now */ - if (!arch_irq_disabled_regs(regs)) - local_irq_enable(); + interrupt_cond_local_irq_enable(regs); + + /* + * (reason & REASON_TRAP) is mostly handled before enabling IRQs, + * except get_user_instr() can sleep so we cannot reliably inspect the + * current instruction in that context. Now that we know we are + * handling a user space trap and can sleep, we can check if the trap + * was a hashchk failure. + */ + if (reason & REASON_TRAP) { + if (cpu_has_feature(CPU_FTR_DEXCR_NPHIE)) { + ppc_inst_t insn; + + if (get_user_instr(insn, (void __user *)regs->nip)) { + _exception(SIGSEGV, regs, SEGV_MAPERR, regs->nip); + return; + } + + if (ppc_inst_primary_opcode(insn) == 31 && + get_xop(ppc_inst_val(insn)) == OP_31_XOP_HASHCHK) { + _exception(SIGILL, regs, ILL_ILLOPN, regs->nip); + return; + } + } + + _exception(SIGTRAP, regs, TRAP_BRKPT, regs->nip); + return; + } /* (reason & REASON_ILLEGAL) would be the obvious thing here, * but there seems to be a hardware bug on the 405GP (RevD) @@ -1552,18 +1600,18 @@ void program_check_exception(struct pt_regs *regs) * pattern to occurrences etc. -dgibson 31/Mar/2003 */ if (!emulate_math(regs)) - goto bail; + return; /* Try to emulate it if we should. */ if (reason & (REASON_ILLEGAL | REASON_PRIVILEGED)) { switch (emulate_instruction(regs)) { case 0: - regs->nip += 4; + regs_add_return_ip(regs, 4); emulate_single_step(regs); - goto bail; + return; case -EFAULT: _exception(SIGSEGV, regs, SEGV_MAPERR, regs->nip); - goto bail; + return; } } @@ -1573,42 +1621,49 @@ sigill: else _exception(SIGILL, regs, ILL_ILLOPC, regs->nip); -bail: - exception_exit(prev_state); } -NOKPROBE_SYMBOL(program_check_exception); + +DEFINE_INTERRUPT_HANDLER(program_check_exception) +{ + do_program_check(regs); +} /* * This occurs when running in hypervisor mode on POWER6 or later * and an illegal instruction is encountered. */ -void emulation_assist_interrupt(struct pt_regs *regs) +DEFINE_INTERRUPT_HANDLER(emulation_assist_interrupt) { - regs->msr |= REASON_ILLEGAL; - program_check_exception(regs); + regs_set_return_msr(regs, regs->msr | REASON_ILLEGAL); + do_program_check(regs); } -NOKPROBE_SYMBOL(emulation_assist_interrupt); -void alignment_exception(struct pt_regs *regs) +DEFINE_INTERRUPT_HANDLER(alignment_exception) { - enum ctx_state prev_state = exception_enter(); int sig, code, fixed = 0; + unsigned long reason; + + interrupt_cond_local_irq_enable(regs); - /* We restore the interrupt state now */ - if (!arch_irq_disabled_regs(regs)) - local_irq_enable(); + reason = get_reason(regs); + if (reason & REASON_BOUNDARY) { + sig = SIGBUS; + code = BUS_ADRALN; + goto bad; + } if (tm_abort_check(regs, TM_CAUSE_ALIGNMENT | TM_CAUSE_PERSISTENT)) - goto bail; + return; /* we don't implement logging of alignment exceptions */ if (!(current->thread.align_ctl & PR_UNALIGN_SIGBUS)) fixed = fix_alignment(regs); if (fixed == 1) { - regs->nip += 4; /* skip over emulated instruction */ + /* skip over emulated instruction */ + regs_add_return_ip(regs, inst_length(reason)); emulate_single_step(regs); - goto bail; + return; } /* Operand address was bad */ @@ -1619,55 +1674,40 @@ void alignment_exception(struct pt_regs *regs) sig = SIGBUS; code = BUS_ADRALN; } +bad: if (user_mode(regs)) _exception(sig, regs, code, regs->dar); else - bad_page_fault(regs, regs->dar, sig); - -bail: - exception_exit(prev_state); + bad_page_fault(regs, sig); } -void StackOverflow(struct pt_regs *regs) +DEFINE_INTERRUPT_HANDLER(stack_overflow_exception) { - pr_crit("Kernel stack overflow in process %s[%d], r1=%lx\n", - current->comm, task_pid_nr(current), regs->gpr[1]); - debugger(regs); - show_regs(regs); - panic("kernel stack overflow"); + die("Kernel stack overflow", regs, SIGSEGV); } -void kernel_fp_unavailable_exception(struct pt_regs *regs) +DEFINE_INTERRUPT_HANDLER(kernel_fp_unavailable_exception) { - enum ctx_state prev_state = exception_enter(); - printk(KERN_EMERG "Unrecoverable FP Unavailable Exception " "%lx at %lx\n", regs->trap, regs->nip); die("Unrecoverable FP Unavailable Exception", regs, SIGABRT); - - exception_exit(prev_state); } -void altivec_unavailable_exception(struct pt_regs *regs) +DEFINE_INTERRUPT_HANDLER(altivec_unavailable_exception) { - enum ctx_state prev_state = exception_enter(); - if (user_mode(regs)) { /* A user program has executed an altivec instruction, but this kernel doesn't support altivec. */ _exception(SIGILL, regs, ILL_ILLOPC, regs->nip); - goto bail; + return; } printk(KERN_EMERG "Unrecoverable VMX/Altivec Unavailable Exception " "%lx at %lx\n", regs->trap, regs->nip); die("Unrecoverable VMX/Altivec Unavailable Exception", regs, SIGABRT); - -bail: - exception_exit(prev_state); } -void vsx_unavailable_exception(struct pt_regs *regs) +DEFINE_INTERRUPT_HANDLER(vsx_unavailable_exception) { if (user_mode(regs)) { /* A user program has executed an vsx instruction, @@ -1681,13 +1721,13 @@ void vsx_unavailable_exception(struct pt_regs *regs) die("Unrecoverable VSX Unavailable Exception", regs, SIGABRT); } -#ifdef CONFIG_PPC64 +#ifdef CONFIG_PPC_BOOK3S_64 static void tm_unavailable(struct pt_regs *regs) { #ifdef CONFIG_PPC_TRANSACTIONAL_MEM if (user_mode(regs)) { current->thread.load_tm++; - regs->msr |= MSR_TM; + regs_set_return_msr(regs, regs->msr | MSR_TM); tm_enable(); tm_restore_sprs(¤t->thread); return; @@ -1698,7 +1738,7 @@ static void tm_unavailable(struct pt_regs *regs) die("Unrecoverable TM Unavailable Exception", regs, SIGABRT); } -void facility_unavailable_exception(struct pt_regs *regs) +DEFINE_INTERRUPT_HANDLER(facility_unavailable_exception) { static char *facility_strings[] = { [FSCR_FP_LG] = "FPU", @@ -1711,6 +1751,7 @@ void facility_unavailable_exception(struct pt_regs *regs) [FSCR_TAR_LG] = "TAR", [FSCR_MSGP_LG] = "MSGP", [FSCR_SCV_LG] = "SCV", + [FSCR_PREFIX_LG] = "PREFIX", }; char *facility = "unknown"; u64 value; @@ -1718,7 +1759,7 @@ void facility_unavailable_exception(struct pt_regs *regs) u8 status; bool hv; - hv = (TRAP(regs) == 0xf80); + hv = (TRAP(regs) == INTERRUPT_H_FAC_UNAVAIL); if (hv) value = mfspr(SPRN_HFSCR); else @@ -1737,9 +1778,7 @@ void facility_unavailable_exception(struct pt_regs *regs) die("Unexpected facility unavailable exception", regs, SIGABRT); } - /* We restore the interrupt state now */ - if (!arch_irq_disabled_regs(regs)) - local_irq_enable(); + interrupt_cond_local_irq_enable(regs); if (status == FSCR_DSCR_LG) { /* @@ -1780,7 +1819,7 @@ void facility_unavailable_exception(struct pt_regs *regs) pr_err("DSCR based mfspr emulation failed\n"); return; } - regs->nip += 4; + regs_add_return_ip(regs, 4); emulate_single_step(regs); } return; @@ -1817,7 +1856,7 @@ out: #ifdef CONFIG_PPC_TRANSACTIONAL_MEM -void fp_unavailable_tm(struct pt_regs *regs) +DEFINE_INTERRUPT_HANDLER(fp_unavailable_tm) { /* Note: This does not handle any kind of FP laziness. */ @@ -1850,7 +1889,7 @@ void fp_unavailable_tm(struct pt_regs *regs) tm_recheckpoint(¤t->thread); } -void altivec_unavailable_tm(struct pt_regs *regs) +DEFINE_INTERRUPT_HANDLER(altivec_unavailable_tm) { /* See the comments in fp_unavailable_tm(). This function operates * the same way. @@ -1865,7 +1904,7 @@ void altivec_unavailable_tm(struct pt_regs *regs) current->thread.used_vr = 1; } -void vsx_unavailable_tm(struct pt_regs *regs) +DEFINE_INTERRUPT_HANDLER(vsx_unavailable_tm) { /* See the comments in fp_unavailable_tm(). This works similarly, * though we're loading both FP and VEC registers in here. @@ -1890,11 +1929,40 @@ void vsx_unavailable_tm(struct pt_regs *regs) } #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ -void performance_monitor_exception(struct pt_regs *regs) +#ifdef CONFIG_PPC64 +DECLARE_INTERRUPT_HANDLER_NMI(performance_monitor_exception_nmi); +DEFINE_INTERRUPT_HANDLER_NMI(performance_monitor_exception_nmi) { __this_cpu_inc(irq_stat.pmu_irqs); perf_irq(regs); + + return 0; +} +#endif + +DECLARE_INTERRUPT_HANDLER_ASYNC(performance_monitor_exception_async); +DEFINE_INTERRUPT_HANDLER_ASYNC(performance_monitor_exception_async) +{ + __this_cpu_inc(irq_stat.pmu_irqs); + + perf_irq(regs); +} + +DEFINE_INTERRUPT_HANDLER_RAW(performance_monitor_exception) +{ + /* + * On 64-bit, if perf interrupts hit in a local_irq_disable + * (soft-masked) region, we consider them as NMIs. This is required to + * prevent hash faults on user addresses when reading callchains (and + * looks better from an irq tracing perspective). + */ + if (IS_ENABLED(CONFIG_PPC64) && unlikely(arch_irq_disabled_regs(regs))) + performance_monitor_exception_nmi(regs); + else + performance_monitor_exception_async(regs); + + return 0; } #ifdef CONFIG_PPC_ADV_DEBUG_REGS @@ -1948,7 +2016,7 @@ static void handle_debug(struct pt_regs *regs, unsigned long debug_status) */ if (DBCR_ACTIVE_EVENTS(current->thread.debug.dbcr0, current->thread.debug.dbcr1)) - regs->msr |= MSR_DE; + regs_set_return_msr(regs, regs->msr | MSR_DE); else /* Make sure the IDM flag is off */ current->thread.debug.dbcr0 &= ~DBCR0_IDM; @@ -1957,8 +2025,10 @@ static void handle_debug(struct pt_regs *regs, unsigned long debug_status) mtspr(SPRN_DBCR0, current->thread.debug.dbcr0); } -void DebugException(struct pt_regs *regs, unsigned long debug_status) +DEFINE_INTERRUPT_HANDLER(DebugException) { + unsigned long debug_status = regs->dsisr; + current->thread.debug.dbsr = debug_status; /* Hack alert: On BookE, Branch Taken stops on the branch itself, while @@ -1967,7 +2037,7 @@ void DebugException(struct pt_regs *regs, unsigned long debug_status) * instead of stopping here when hitting a BT */ if (debug_status & DBSR_BT) { - regs->msr &= ~MSR_DE; + regs_set_return_msr(regs, regs->msr & ~MSR_DE); /* Disable BT */ mtspr(SPRN_DBCR0, mfspr(SPRN_DBCR0) & ~DBCR0_BT); @@ -1978,7 +2048,7 @@ void DebugException(struct pt_regs *regs, unsigned long debug_status) if (user_mode(regs)) { current->thread.debug.dbcr0 &= ~DBCR0_BT; current->thread.debug.dbcr0 |= DBCR0_IDM | DBCR0_IC; - regs->msr |= MSR_DE; + regs_set_return_msr(regs, regs->msr | MSR_DE); return; } @@ -1992,7 +2062,7 @@ void DebugException(struct pt_regs *regs, unsigned long debug_status) if (debugger_sstep(regs)) return; } else if (debug_status & DBSR_IC) { /* Instruction complete */ - regs->msr &= ~MSR_DE; + regs_set_return_msr(regs, regs->msr & ~MSR_DE); /* Disable instruction completion */ mtspr(SPRN_DBCR0, mfspr(SPRN_DBCR0) & ~DBCR0_IC); @@ -2014,7 +2084,7 @@ void DebugException(struct pt_regs *regs, unsigned long debug_status) current->thread.debug.dbcr0 &= ~DBCR0_IC; if (DBCR_ACTIVE_EVENTS(current->thread.debug.dbcr0, current->thread.debug.dbcr1)) - regs->msr |= MSR_DE; + regs_set_return_msr(regs, regs->msr | MSR_DE); else /* Make sure the IDM bit is off */ current->thread.debug.dbcr0 &= ~DBCR0_IDM; @@ -2024,19 +2094,10 @@ void DebugException(struct pt_regs *regs, unsigned long debug_status) } else handle_debug(regs, debug_status); } -NOKPROBE_SYMBOL(DebugException); #endif /* CONFIG_PPC_ADV_DEBUG_REGS */ -#if !defined(CONFIG_TAU_INT) -void TAUException(struct pt_regs *regs) -{ - printk("TAU trap at PC: %lx, MSR: %lx, vector=%lx %s\n", - regs->nip, regs->msr, regs->trap, print_tainted()); -} -#endif /* CONFIG_INT_TAU */ - #ifdef CONFIG_ALTIVEC -void altivec_assist_exception(struct pt_regs *regs) +DEFINE_INTERRUPT_HANDLER(altivec_assist_exception) { int err; @@ -2051,7 +2112,7 @@ void altivec_assist_exception(struct pt_regs *regs) PPC_WARN_EMULATED(altivec, regs); err = emulate_altivec(regs); if (err == 0) { - regs->nip += 4; /* skip emulated instruction */ + regs_add_return_ip(regs, 4); /* skip emulated instruction */ emulate_single_step(regs); return; } @@ -2069,10 +2130,11 @@ void altivec_assist_exception(struct pt_regs *regs) } #endif /* CONFIG_ALTIVEC */ -#ifdef CONFIG_FSL_BOOKE -void CacheLockingException(struct pt_regs *regs, unsigned long address, - unsigned long error_code) +#ifdef CONFIG_PPC_85xx +DEFINE_INTERRUPT_HANDLER(CacheLockingException) { + unsigned long error_code = regs->dsisr; + /* We treat cache locking instructions from the user * as priv ops, in the future we could try to do * something smarter @@ -2081,20 +2143,17 @@ void CacheLockingException(struct pt_regs *regs, unsigned long address, _exception(SIGILL, regs, ILL_PRVOPC, regs->nip); return; } -#endif /* CONFIG_FSL_BOOKE */ +#endif /* CONFIG_PPC_85xx */ #ifdef CONFIG_SPE -void SPEFloatingPointException(struct pt_regs *regs) +DEFINE_INTERRUPT_HANDLER(SPEFloatingPointException) { - extern int do_spe_mathemu(struct pt_regs *regs); unsigned long spefscr; int fpexc_mode; int code = FPE_FLTUNK; int err; - /* We restore the interrupt state now */ - if (!arch_irq_disabled_regs(regs)) - local_irq_enable(); + interrupt_cond_local_irq_enable(regs); flush_spe_to_thread(current); @@ -2117,7 +2176,7 @@ void SPEFloatingPointException(struct pt_regs *regs) err = do_spe_mathemu(regs); if (err == 0) { - regs->nip += 4; /* skip emulated instruction */ + regs_add_return_ip(regs, 4); /* skip emulated instruction */ emulate_single_step(regs); return; } @@ -2136,24 +2195,21 @@ void SPEFloatingPointException(struct pt_regs *regs) return; } -void SPEFloatingPointRoundException(struct pt_regs *regs) +DEFINE_INTERRUPT_HANDLER(SPEFloatingPointRoundException) { - extern int speround_handler(struct pt_regs *regs); int err; - /* We restore the interrupt state now */ - if (!arch_irq_disabled_regs(regs)) - local_irq_enable(); + interrupt_cond_local_irq_enable(regs); preempt_disable(); if (regs->msr & MSR_SPE) giveup_spe(current); preempt_enable(); - regs->nip -= 4; + regs_add_return_ip(regs, -4); err = speround_handler(regs); if (err == 0) { - regs->nip += 4; /* skip emulated instruction */ + regs_add_return_ip(regs, 4); /* skip emulated instruction */ emulate_single_step(regs); return; } @@ -2178,30 +2234,22 @@ void SPEFloatingPointRoundException(struct pt_regs *regs) * in the MSR is 0. This indicates that SRR0/1 are live, and that * we therefore lost state by taking this exception. */ -void unrecoverable_exception(struct pt_regs *regs) +void __noreturn unrecoverable_exception(struct pt_regs *regs) { pr_emerg("Unrecoverable exception %lx at %lx (msr=%lx)\n", regs->trap, regs->nip, regs->msr); die("Unrecoverable exception", regs, SIGABRT); + /* die() should not return */ + for (;;) + ; } -NOKPROBE_SYMBOL(unrecoverable_exception); #if defined(CONFIG_BOOKE_WDT) || defined(CONFIG_40x) -/* - * Default handler for a Watchdog exception, - * spins until a reboot occurs - */ -void __attribute__ ((weak)) WatchdogHandler(struct pt_regs *regs) -{ - /* Generic WatchdogHandler, implement your own */ - mtspr(SPRN_TCR, mfspr(SPRN_TCR)&(~TCR_WIE)); - return; -} - -void WatchdogException(struct pt_regs *regs) +DEFINE_INTERRUPT_HANDLER_NMI(WatchdogException) { printk (KERN_EMERG "PowerPC Book-E Watchdog Exception\n"); - WatchdogHandler(regs); + mtspr(SPRN_TCR, mfspr(SPRN_TCR) & ~TCR_WIE); + return 0; } #endif @@ -2209,18 +2257,12 @@ void WatchdogException(struct pt_regs *regs) * We enter here if we discover during exception entry that we are * running in supervisor mode with a userspace value in the stack pointer. */ -void kernel_bad_stack(struct pt_regs *regs) +DEFINE_INTERRUPT_HANDLER(kernel_bad_stack) { printk(KERN_EMERG "Bad kernel stack pointer %lx at %lx\n", regs->gpr[1], regs->nip); die("Bad kernel stack pointer", regs, SIGABRT); } -NOKPROBE_SYMBOL(kernel_bad_stack); - -void __init trap_init(void) -{ -} - #ifdef CONFIG_PPC_EMULATED_STATS @@ -2269,35 +2311,20 @@ void ppc_warn_emulated_print(const char *type) static int __init ppc_warn_emulated_init(void) { - struct dentry *dir, *d; + struct dentry *dir; unsigned int i; struct ppc_emulated_entry *entries = (void *)&ppc_emulated; - if (!powerpc_debugfs_root) - return -ENODEV; - dir = debugfs_create_dir("emulated_instructions", - powerpc_debugfs_root); - if (!dir) - return -ENOMEM; + arch_debugfs_dir); - d = debugfs_create_u32("do_warn", 0644, dir, - &ppc_warn_emulated); - if (!d) - goto fail; + debugfs_create_u32("do_warn", 0644, dir, &ppc_warn_emulated); - for (i = 0; i < sizeof(ppc_emulated)/sizeof(*entries); i++) { - d = debugfs_create_u32(entries[i].name, 0644, dir, - (u32 *)&entries[i].val.counter); - if (!d) - goto fail; - } + for (i = 0; i < sizeof(ppc_emulated)/sizeof(*entries); i++) + debugfs_create_u32(entries[i].name, 0644, dir, + (u32 *)&entries[i].val.counter); return 0; - -fail: - debugfs_remove_recursive(dir); - return -ENOMEM; } device_initcall(ppc_warn_emulated_init); diff --git a/arch/powerpc/kernel/ucall.S b/arch/powerpc/kernel/ucall.S index 07296bc39166..80a1f9a4300a 100644 --- a/arch/powerpc/kernel/ucall.S +++ b/arch/powerpc/kernel/ucall.S @@ -5,8 +5,8 @@ * Copyright 2019, IBM Corporation. * */ +#include <linux/export.h> #include <asm/ppc_asm.h> -#include <asm/export.h> _GLOBAL(ucall_norets) EXPORT_SYMBOL_GPL(ucall_norets) diff --git a/arch/powerpc/kernel/udbg.c b/arch/powerpc/kernel/udbg.c index 01595e8cafe7..92b3fc258d11 100644 --- a/arch/powerpc/kernel/udbg.c +++ b/arch/powerpc/kernel/udbg.c @@ -5,7 +5,7 @@ * c 2001 PPC 64 Team, IBM Corp */ -#include <stdarg.h> +#include <linux/stdarg.h> #include <linux/types.h> #include <linux/sched.h> #include <linux/console.h> @@ -67,6 +67,8 @@ void __init udbg_early_init(void) udbg_init_debug_opal_raw(); #elif defined(CONFIG_PPC_EARLY_DEBUG_OPAL_HVSI) udbg_init_debug_opal_hvsi(); +#elif defined(CONFIG_PPC_EARLY_DEBUG_16550) + udbg_init_debug_16550(); #endif #ifdef CONFIG_PPC_EARLY_DEBUG diff --git a/arch/powerpc/kernel/udbg_16550.c b/arch/powerpc/kernel/udbg_16550.c index 9356b60d6030..a0467e528b70 100644 --- a/arch/powerpc/kernel/udbg_16550.c +++ b/arch/powerpc/kernel/udbg_16550.c @@ -7,7 +7,7 @@ #include <linux/types.h> #include <asm/udbg.h> #include <asm/io.h> -#include <asm/reg_a2.h> +#include <asm/early_ioremap.h> extern u8 real_readb(volatile u8 __iomem *addr); extern void real_writeb(u8 data, volatile u8 __iomem *addr); @@ -84,7 +84,7 @@ static int udbg_uart_getc(void) return udbg_uart_in(UART_RBR); } -static void udbg_use_uart(void) +static void __init udbg_use_uart(void) { udbg_putc = udbg_uart_putc; udbg_flush = udbg_uart_flush; @@ -92,7 +92,7 @@ static void udbg_use_uart(void) udbg_getc_poll = udbg_uart_getc_poll; } -void udbg_uart_setup(unsigned int speed, unsigned int clock) +void __init udbg_uart_setup(unsigned int speed, unsigned int clock) { unsigned int dll, base_bauds; @@ -121,7 +121,7 @@ void udbg_uart_setup(unsigned int speed, unsigned int clock) udbg_uart_out(UART_FCR, 0x7); } -unsigned int udbg_probe_uart_speed(unsigned int clock) +unsigned int __init udbg_probe_uart_speed(unsigned int clock) { unsigned int dll, dlm, divisor, prescaler, speed; u8 old_lcr; @@ -172,7 +172,7 @@ static void udbg_uart_out_pio(unsigned int reg, u8 data) outb(data, udbg_uart.pio_base + (reg * udbg_uart_stride)); } -void udbg_uart_init_pio(unsigned long port, unsigned int stride) +void __init udbg_uart_init_pio(unsigned long port, unsigned int stride) { if (!port) return; @@ -194,7 +194,7 @@ static void udbg_uart_out_mmio(unsigned int reg, u8 data) } -void udbg_uart_init_mmio(void __iomem *addr, unsigned int stride) +void __init udbg_uart_init_mmio(void __iomem *addr, unsigned int stride) { if (!addr) return; @@ -296,3 +296,35 @@ void __init udbg_init_40x_realmode(void) } #endif /* CONFIG_PPC_EARLY_DEBUG_40x */ + +#ifdef CONFIG_PPC_EARLY_DEBUG_16550 + +static void __iomem *udbg_uart_early_addr; + +void __init udbg_init_debug_16550(void) +{ + udbg_uart_early_addr = early_ioremap(CONFIG_PPC_EARLY_DEBUG_16550_PHYSADDR, 0x1000); + udbg_uart_init_mmio(udbg_uart_early_addr, CONFIG_PPC_EARLY_DEBUG_16550_STRIDE); +} + +static int __init udbg_init_debug_16550_ioremap(void) +{ + void __iomem *addr; + + if (!udbg_uart_early_addr) + return 0; + + addr = ioremap(CONFIG_PPC_EARLY_DEBUG_16550_PHYSADDR, 0x1000); + if (WARN_ON(!addr)) + return -ENOMEM; + + udbg_uart_init_mmio(addr, CONFIG_PPC_EARLY_DEBUG_16550_STRIDE); + early_iounmap(udbg_uart_early_addr, 0x1000); + udbg_uart_early_addr = NULL; + + return 0; +} + +early_initcall(udbg_init_debug_16550_ioremap); + +#endif /* CONFIG_PPC_EARLY_DEBUG_16550 */ diff --git a/arch/powerpc/kernel/uprobes.c b/arch/powerpc/kernel/uprobes.c index 1cfef0e5fec5..95a41ae9dfa7 100644 --- a/arch/powerpc/kernel/uprobes.c +++ b/arch/powerpc/kernel/uprobes.c @@ -14,6 +14,7 @@ #include <linux/kdebug.h> #include <asm/sstep.h> +#include <asm/inst.h> #define UPROBE_TRAP_NR UINT_MAX @@ -40,6 +41,18 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, if (addr & 0x03) return -EINVAL; + if (cpu_has_feature(CPU_FTR_ARCH_31) && + ppc_inst_prefixed(ppc_inst_read(auprobe->insn)) && + (addr & 0x3f) == 60) { + pr_info_ratelimited("Cannot register a uprobe on 64 byte unaligned prefixed instruction\n"); + return -EINVAL; + } + + if (!can_single_step(ppc_inst_val(ppc_inst_read(auprobe->insn)))) { + pr_info_ratelimited("Cannot register a uprobe on instructions that can't be single stepped\n"); + return -ENOTSUPP; + } + return 0; } @@ -54,7 +67,7 @@ int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) autask->saved_trap_nr = current->thread.trap_nr; current->thread.trap_nr = UPROBE_TRAP_NR; - regs->nip = current->utask->xol_vaddr; + regs_set_return_ip(regs, current->utask->xol_vaddr); user_enable_single_step(current); return 0; @@ -111,7 +124,7 @@ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) * support doesn't exist and have to fix-up the next instruction * to be executed. */ - regs->nip = utask->vaddr + MAX_UINSN_BYTES; + regs_set_return_ip(regs, (unsigned long)ppc_inst_next((void *)utask->vaddr, auprobe->insn)); user_disable_single_step(current); return 0; @@ -140,6 +153,7 @@ int arch_uprobe_exception_notify(struct notifier_block *self, case DIE_SSTEP: if (uprobe_post_sstep_notifier(regs)) return NOTIFY_STOP; + break; default: break; } @@ -173,7 +187,7 @@ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) * emulate_step() returns 1 if the insn was successfully emulated. * For all other cases, we need to single-step in hardware. */ - ret = emulate_step(regs, auprobe->insn); + ret = emulate_step(regs, ppc_inst_read(auprobe->insn)); if (ret > 0) return true; diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index eae9ddaecbcf..7a2ff9010f17 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -17,12 +17,14 @@ #include <linux/elf.h> #include <linux/security.h> #include <linux/memblock.h> +#include <linux/syscalls.h> +#include <linux/time_namespace.h> +#include <vdso/datapage.h> -#include <asm/pgtable.h> +#include <asm/syscall.h> #include <asm/processor.h> #include <asm/mmu.h> #include <asm/mmu_context.h> -#include <asm/prom.h> #include <asm/machdep.h> #include <asm/cputable.h> #include <asm/sections.h> @@ -31,39 +33,13 @@ #include <asm/vdso_datapage.h> #include <asm/setup.h> -#undef DEBUG - -#ifdef DEBUG -#define DBG(fmt...) printk(fmt) -#else -#define DBG(fmt...) -#endif - -/* Max supported size for symbol names */ -#define MAX_SYMNAME 64 - /* The alignment of the vDSO */ #define VDSO_ALIGNMENT (1 << 16) -static unsigned int vdso32_pages; -static void *vdso32_kbase; -static struct page **vdso32_pagelist; -unsigned long vdso32_sigtramp; -unsigned long vdso32_rt_sigtramp; - -#ifdef CONFIG_VDSO32 extern char vdso32_start, vdso32_end; -#endif - -#ifdef CONFIG_PPC64 extern char vdso64_start, vdso64_end; -static void *vdso64_kbase = &vdso64_start; -static unsigned int vdso64_pages; -static struct page **vdso64_pagelist; -unsigned long vdso64_rt_sigtramp; -#endif /* CONFIG_PPC64 */ -static int vdso_ready; +long sys_ni_syscall(void); /* * The vdso data page (aka. systemcfg for old ppc64 fans) is here. @@ -71,116 +47,152 @@ static int vdso_ready; * with it, it will become dynamically allocated */ static union { - struct vdso_data data; + struct vdso_arch_data data; u8 page[PAGE_SIZE]; } vdso_data_store __page_aligned_data; -struct vdso_data *vdso_data = &vdso_data_store.data; +struct vdso_arch_data *vdso_data = &vdso_data_store.data; + +enum vvar_pages { + VVAR_DATA_PAGE_OFFSET, + VVAR_TIMENS_PAGE_OFFSET, + VVAR_NR_PAGES, +}; + +static int vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma, + unsigned long text_size) +{ + unsigned long new_size = new_vma->vm_end - new_vma->vm_start; + + if (new_size != text_size) + return -EINVAL; -/* Format of the patch table */ -struct vdso_patch_def + current->mm->context.vdso = (void __user *)new_vma->vm_start; + + return 0; +} + +static int vdso32_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) { - unsigned long ftr_mask, ftr_value; - const char *gen_name; - const char *fix_name; + return vdso_mremap(sm, new_vma, &vdso32_end - &vdso32_start); +} + +static int vdso64_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) +{ + return vdso_mremap(sm, new_vma, &vdso64_end - &vdso64_start); +} + +static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, + struct vm_area_struct *vma, struct vm_fault *vmf); + +static struct vm_special_mapping vvar_spec __ro_after_init = { + .name = "[vvar]", + .fault = vvar_fault, }; -/* Table of functions to patch based on the CPU type/revision - * - * Currently, we only change sync_dicache to do nothing on processors - * with a coherent icache - */ -static struct vdso_patch_def vdso_patches[] = { - { - CPU_FTR_COHERENT_ICACHE, CPU_FTR_COHERENT_ICACHE, - "__kernel_sync_dicache", "__kernel_sync_dicache_p5" - }, +static struct vm_special_mapping vdso32_spec __ro_after_init = { + .name = "[vdso]", + .mremap = vdso32_mremap, }; +static struct vm_special_mapping vdso64_spec __ro_after_init = { + .name = "[vdso]", + .mremap = vdso64_mremap, +}; + +#ifdef CONFIG_TIME_NS +struct vdso_data *arch_get_vdso_data(void *vvar_page) +{ + return ((struct vdso_arch_data *)vvar_page)->data; +} + /* - * Some infos carried around for each of them during parsing at - * boot time. + * The vvar mapping contains data for a specific time namespace, so when a task + * changes namespace we must unmap its vvar data for the old namespace. + * Subsequent faults will map in data for the new namespace. + * + * For more details see timens_setup_vdso_data(). */ -struct lib32_elfinfo +int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) { - Elf32_Ehdr *hdr; /* ptr to ELF */ - Elf32_Sym *dynsym; /* ptr to .dynsym section */ - unsigned long dynsymsize; /* size of .dynsym section */ - char *dynstr; /* ptr to .dynstr section */ - unsigned long text; /* offset of .text section in .so */ -}; + struct mm_struct *mm = task->mm; + VMA_ITERATOR(vmi, mm, 0); + struct vm_area_struct *vma; + + mmap_read_lock(mm); + for_each_vma(vmi, vma) { + if (vma_is_special_mapping(vma, &vvar_spec)) + zap_vma_pages(vma); + } + mmap_read_unlock(mm); -struct lib64_elfinfo + return 0; +} +#endif + +static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, + struct vm_area_struct *vma, struct vm_fault *vmf) { - Elf64_Ehdr *hdr; - Elf64_Sym *dynsym; - unsigned long dynsymsize; - char *dynstr; - unsigned long text; -}; + struct page *timens_page = find_timens_vvar_page(vma); + unsigned long pfn; + + switch (vmf->pgoff) { + case VVAR_DATA_PAGE_OFFSET: + if (timens_page) + pfn = page_to_pfn(timens_page); + else + pfn = virt_to_pfn(vdso_data); + break; +#ifdef CONFIG_TIME_NS + case VVAR_TIMENS_PAGE_OFFSET: + /* + * If a task belongs to a time namespace then a namespace + * specific VVAR is mapped with the VVAR_DATA_PAGE_OFFSET and + * the real VVAR page is mapped with the VVAR_TIMENS_PAGE_OFFSET + * offset. + * See also the comment near timens_setup_vdso_data(). + */ + if (!timens_page) + return VM_FAULT_SIGBUS; + pfn = virt_to_pfn(vdso_data); + break; +#endif /* CONFIG_TIME_NS */ + default: + return VM_FAULT_SIGBUS; + } + return vmf_insert_pfn(vma, vmf->address, pfn); +} /* * This is called from binfmt_elf, we create the special vma for the * vDSO and insert it into the mm struct tree */ -int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) +static int __arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { + unsigned long vdso_size, vdso_base, mappings_size; + struct vm_special_mapping *vdso_spec; + unsigned long vvar_size = VVAR_NR_PAGES * PAGE_SIZE; struct mm_struct *mm = current->mm; - struct page **vdso_pagelist; - unsigned long vdso_pages; - unsigned long vdso_base; - int rc; - - if (!vdso_ready) - return 0; + struct vm_area_struct *vma; -#ifdef CONFIG_PPC64 if (is_32bit_task()) { - vdso_pagelist = vdso32_pagelist; - vdso_pages = vdso32_pages; - vdso_base = VDSO32_MBASE; + vdso_spec = &vdso32_spec; + vdso_size = &vdso32_end - &vdso32_start; } else { - vdso_pagelist = vdso64_pagelist; - vdso_pages = vdso64_pages; - /* - * On 64bit we don't have a preferred map address. This - * allows get_unmapped_area to find an area near other mmaps - * and most likely share a SLB entry. - */ - vdso_base = 0; + vdso_spec = &vdso64_spec; + vdso_size = &vdso64_end - &vdso64_start; } -#else - vdso_pagelist = vdso32_pagelist; - vdso_pages = vdso32_pages; - vdso_base = VDSO32_MBASE; -#endif - current->mm->context.vdso_base = 0; - - /* vDSO has a problem and was disabled, just don't "enable" it for the - * process - */ - if (vdso_pages == 0) - return 0; - /* Add a page to the vdso size for the data page */ - vdso_pages ++; + mappings_size = vdso_size + vvar_size; + mappings_size += (VDSO_ALIGNMENT - 1) & PAGE_MASK; /* - * pick a base address for the vDSO in process space. We try to put it - * at vdso_base which is the "natural" base for it, but we might fail - * and end up putting it elsewhere. + * Pick a base address for the vDSO in process space. * Add enough to the size so that the result can be aligned. */ - if (down_write_killable(&mm->mmap_sem)) - return -EINTR; - vdso_base = get_unmapped_area(NULL, vdso_base, - (vdso_pages << PAGE_SHIFT) + - ((VDSO_ALIGNMENT - 1) & PAGE_MASK), - 0, 0); - if (IS_ERR_VALUE(vdso_base)) { - rc = vdso_base; - goto fail_mmapsem; - } + vdso_base = get_unmapped_area(NULL, 0, mappings_size, 0, 0); + if (IS_ERR_VALUE(vdso_base)) + return vdso_base; /* Add required alignment. */ vdso_base = ALIGN(vdso_base, VDSO_ALIGNMENT); @@ -188,9 +200,15 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) /* * Put vDSO base into mm struct. We need to do this before calling * install_special_mapping or the perf counter mmap tracking code - * will fail to recognise it as a vDSO (since arch_vma_name fails). + * will fail to recognise it as a vDSO. */ - current->mm->context.vdso_base = vdso_base; + mm->context.vdso = (void __user *)vdso_base + vvar_size; + + vma = _install_special_mapping(mm, vdso_base, vvar_size, + VM_READ | VM_MAYREAD | VM_IO | + VM_DONTDUMP | VM_PFNMAP, &vvar_spec); + if (IS_ERR(vma)) + return PTR_ERR(vma); /* * our vma flags don't have VM_WRITE so by default, the process isn't @@ -202,439 +220,57 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) * It's fine to use that for setting breakpoints in the vDSO code * pages though. */ - rc = install_special_mapping(mm, vdso_base, vdso_pages << PAGE_SHIFT, - VM_READ|VM_EXEC| - VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, - vdso_pagelist); - if (rc) { - current->mm->context.vdso_base = 0; - goto fail_mmapsem; - } - - up_write(&mm->mmap_sem); - return 0; - - fail_mmapsem: - up_write(&mm->mmap_sem); - return rc; -} - -const char *arch_vma_name(struct vm_area_struct *vma) -{ - if (vma->vm_mm && vma->vm_start == vma->vm_mm->context.vdso_base) - return "[vdso]"; - return NULL; -} - - - -#ifdef CONFIG_VDSO32 -static void * __init find_section32(Elf32_Ehdr *ehdr, const char *secname, - unsigned long *size) -{ - Elf32_Shdr *sechdrs; - unsigned int i; - char *secnames; - - /* Grab section headers and strings so we can tell who is who */ - sechdrs = (void *)ehdr + ehdr->e_shoff; - secnames = (void *)ehdr + sechdrs[ehdr->e_shstrndx].sh_offset; - - /* Find the section they want */ - for (i = 1; i < ehdr->e_shnum; i++) { - if (strcmp(secnames+sechdrs[i].sh_name, secname) == 0) { - if (size) - *size = sechdrs[i].sh_size; - return (void *)ehdr + sechdrs[i].sh_offset; - } - } - *size = 0; - return NULL; -} - -static Elf32_Sym * __init find_symbol32(struct lib32_elfinfo *lib, - const char *symname) -{ - unsigned int i; - char name[MAX_SYMNAME], *c; - - for (i = 0; i < (lib->dynsymsize / sizeof(Elf32_Sym)); i++) { - if (lib->dynsym[i].st_name == 0) - continue; - strlcpy(name, lib->dynstr + lib->dynsym[i].st_name, - MAX_SYMNAME); - c = strchr(name, '@'); - if (c) - *c = 0; - if (strcmp(symname, name) == 0) - return &lib->dynsym[i]; - } - return NULL; -} - -/* Note that we assume the section is .text and the symbol is relative to - * the library base - */ -static unsigned long __init find_function32(struct lib32_elfinfo *lib, - const char *symname) -{ - Elf32_Sym *sym = find_symbol32(lib, symname); - - if (sym == NULL) { - printk(KERN_WARNING "vDSO32: function %s not found !\n", - symname); - return 0; - } - return sym->st_value - VDSO32_LBASE; -} - -static int __init vdso_do_func_patch32(struct lib32_elfinfo *v32, - struct lib64_elfinfo *v64, - const char *orig, const char *fix) -{ - Elf32_Sym *sym32_gen, *sym32_fix; - - sym32_gen = find_symbol32(v32, orig); - if (sym32_gen == NULL) { - printk(KERN_ERR "vDSO32: Can't find symbol %s !\n", orig); - return -1; - } - if (fix == NULL) { - sym32_gen->st_name = 0; - return 0; - } - sym32_fix = find_symbol32(v32, fix); - if (sym32_fix == NULL) { - printk(KERN_ERR "vDSO32: Can't find symbol %s !\n", fix); - return -1; - } - sym32_gen->st_value = sym32_fix->st_value; - sym32_gen->st_size = sym32_fix->st_size; - sym32_gen->st_info = sym32_fix->st_info; - sym32_gen->st_other = sym32_fix->st_other; - sym32_gen->st_shndx = sym32_fix->st_shndx; + vma = _install_special_mapping(mm, vdso_base + vvar_size, vdso_size, + VM_READ | VM_EXEC | VM_MAYREAD | + VM_MAYWRITE | VM_MAYEXEC, vdso_spec); + if (IS_ERR(vma)) + do_munmap(mm, vdso_base, vvar_size, NULL); - return 0; -} -#else /* !CONFIG_VDSO32 */ -static unsigned long __init find_function32(struct lib32_elfinfo *lib, - const char *symname) -{ - return 0; -} - -static int __init vdso_do_func_patch32(struct lib32_elfinfo *v32, - struct lib64_elfinfo *v64, - const char *orig, const char *fix) -{ - return 0; -} -#endif /* CONFIG_VDSO32 */ - - -#ifdef CONFIG_PPC64 - -static void * __init find_section64(Elf64_Ehdr *ehdr, const char *secname, - unsigned long *size) -{ - Elf64_Shdr *sechdrs; - unsigned int i; - char *secnames; - - /* Grab section headers and strings so we can tell who is who */ - sechdrs = (void *)ehdr + ehdr->e_shoff; - secnames = (void *)ehdr + sechdrs[ehdr->e_shstrndx].sh_offset; - - /* Find the section they want */ - for (i = 1; i < ehdr->e_shnum; i++) { - if (strcmp(secnames+sechdrs[i].sh_name, secname) == 0) { - if (size) - *size = sechdrs[i].sh_size; - return (void *)ehdr + sechdrs[i].sh_offset; - } - } - if (size) - *size = 0; - return NULL; -} - -static Elf64_Sym * __init find_symbol64(struct lib64_elfinfo *lib, - const char *symname) -{ - unsigned int i; - char name[MAX_SYMNAME], *c; - - for (i = 0; i < (lib->dynsymsize / sizeof(Elf64_Sym)); i++) { - if (lib->dynsym[i].st_name == 0) - continue; - strlcpy(name, lib->dynstr + lib->dynsym[i].st_name, - MAX_SYMNAME); - c = strchr(name, '@'); - if (c) - *c = 0; - if (strcmp(symname, name) == 0) - return &lib->dynsym[i]; - } - return NULL; + return PTR_ERR_OR_ZERO(vma); } -/* Note that we assume the section is .text and the symbol is relative to - * the library base - */ -static unsigned long __init find_function64(struct lib64_elfinfo *lib, - const char *symname) -{ - Elf64_Sym *sym = find_symbol64(lib, symname); - - if (sym == NULL) { - printk(KERN_WARNING "vDSO64: function %s not found !\n", - symname); - return 0; - } -#ifdef VDS64_HAS_DESCRIPTORS - return *((u64 *)(vdso64_kbase + sym->st_value - VDSO64_LBASE)) - - VDSO64_LBASE; -#else - return sym->st_value - VDSO64_LBASE; -#endif -} - -static int __init vdso_do_func_patch64(struct lib32_elfinfo *v32, - struct lib64_elfinfo *v64, - const char *orig, const char *fix) -{ - Elf64_Sym *sym64_gen, *sym64_fix; - - sym64_gen = find_symbol64(v64, orig); - if (sym64_gen == NULL) { - printk(KERN_ERR "vDSO64: Can't find symbol %s !\n", orig); - return -1; - } - if (fix == NULL) { - sym64_gen->st_name = 0; - return 0; - } - sym64_fix = find_symbol64(v64, fix); - if (sym64_fix == NULL) { - printk(KERN_ERR "vDSO64: Can't find symbol %s !\n", fix); - return -1; - } - sym64_gen->st_value = sym64_fix->st_value; - sym64_gen->st_size = sym64_fix->st_size; - sym64_gen->st_info = sym64_fix->st_info; - sym64_gen->st_other = sym64_fix->st_other; - sym64_gen->st_shndx = sym64_fix->st_shndx; - - return 0; -} - -#endif /* CONFIG_PPC64 */ - - -static __init int vdso_do_find_sections(struct lib32_elfinfo *v32, - struct lib64_elfinfo *v64) -{ - void *sect; - - /* - * Locate symbol tables & text section - */ - -#ifdef CONFIG_VDSO32 - v32->dynsym = find_section32(v32->hdr, ".dynsym", &v32->dynsymsize); - v32->dynstr = find_section32(v32->hdr, ".dynstr", NULL); - if (v32->dynsym == NULL || v32->dynstr == NULL) { - printk(KERN_ERR "vDSO32: required symbol section not found\n"); - return -1; - } - sect = find_section32(v32->hdr, ".text", NULL); - if (sect == NULL) { - printk(KERN_ERR "vDSO32: the .text section was not found\n"); - return -1; - } - v32->text = sect - vdso32_kbase; -#endif - -#ifdef CONFIG_PPC64 - v64->dynsym = find_section64(v64->hdr, ".dynsym", &v64->dynsymsize); - v64->dynstr = find_section64(v64->hdr, ".dynstr", NULL); - if (v64->dynsym == NULL || v64->dynstr == NULL) { - printk(KERN_ERR "vDSO64: required symbol section not found\n"); - return -1; - } - sect = find_section64(v64->hdr, ".text", NULL); - if (sect == NULL) { - printk(KERN_ERR "vDSO64: the .text section was not found\n"); - return -1; - } - v64->text = sect - vdso64_kbase; -#endif /* CONFIG_PPC64 */ - - return 0; -} - -static __init void vdso_setup_trampolines(struct lib32_elfinfo *v32, - struct lib64_elfinfo *v64) +int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { - /* - * Find signal trampolines - */ + struct mm_struct *mm = current->mm; + int rc; -#ifdef CONFIG_PPC64 - vdso64_rt_sigtramp = find_function64(v64, "__kernel_sigtramp_rt64"); -#endif - vdso32_sigtramp = find_function32(v32, "__kernel_sigtramp32"); - vdso32_rt_sigtramp = find_function32(v32, "__kernel_sigtramp_rt32"); -} + mm->context.vdso = NULL; -static __init int vdso_fixup_datapage(struct lib32_elfinfo *v32, - struct lib64_elfinfo *v64) -{ -#ifdef CONFIG_VDSO32 - Elf32_Sym *sym32; -#endif -#ifdef CONFIG_PPC64 - Elf64_Sym *sym64; - - sym64 = find_symbol64(v64, "__kernel_datapage_offset"); - if (sym64 == NULL) { - printk(KERN_ERR "vDSO64: Can't find symbol " - "__kernel_datapage_offset !\n"); - return -1; - } - *((int *)(vdso64_kbase + sym64->st_value - VDSO64_LBASE)) = - (vdso64_pages << PAGE_SHIFT) - - (sym64->st_value - VDSO64_LBASE); -#endif /* CONFIG_PPC64 */ + if (mmap_write_lock_killable(mm)) + return -EINTR; -#ifdef CONFIG_VDSO32 - sym32 = find_symbol32(v32, "__kernel_datapage_offset"); - if (sym32 == NULL) { - printk(KERN_ERR "vDSO32: Can't find symbol " - "__kernel_datapage_offset !\n"); - return -1; - } - *((int *)(vdso32_kbase + (sym32->st_value - VDSO32_LBASE))) = - (vdso32_pages << PAGE_SHIFT) - - (sym32->st_value - VDSO32_LBASE); -#endif + rc = __arch_setup_additional_pages(bprm, uses_interp); + if (rc) + mm->context.vdso = NULL; - return 0; + mmap_write_unlock(mm); + return rc; } +#define VDSO_DO_FIXUPS(type, value, bits, sec) do { \ + void *__start = (void *)VDSO##bits##_SYMBOL(&vdso##bits##_start, sec##_start); \ + void *__end = (void *)VDSO##bits##_SYMBOL(&vdso##bits##_start, sec##_end); \ + \ + do_##type##_fixups((value), __start, __end); \ +} while (0) -static __init int vdso_fixup_features(struct lib32_elfinfo *v32, - struct lib64_elfinfo *v64) +static void __init vdso_fixup_features(void) { - unsigned long size; - void *start; - #ifdef CONFIG_PPC64 - start = find_section64(v64->hdr, "__ftr_fixup", &size); - if (start) - do_feature_fixups(cur_cpu_spec->cpu_features, - start, start + size); - - start = find_section64(v64->hdr, "__mmu_ftr_fixup", &size); - if (start) - do_feature_fixups(cur_cpu_spec->mmu_features, - start, start + size); - - start = find_section64(v64->hdr, "__fw_ftr_fixup", &size); - if (start) - do_feature_fixups(powerpc_firmware_features, - start, start + size); - - start = find_section64(v64->hdr, "__lwsync_fixup", &size); - if (start) - do_lwsync_fixups(cur_cpu_spec->cpu_features, - start, start + size); + VDSO_DO_FIXUPS(feature, cur_cpu_spec->cpu_features, 64, ftr_fixup); + VDSO_DO_FIXUPS(feature, cur_cpu_spec->mmu_features, 64, mmu_ftr_fixup); + VDSO_DO_FIXUPS(feature, powerpc_firmware_features, 64, fw_ftr_fixup); + VDSO_DO_FIXUPS(lwsync, cur_cpu_spec->cpu_features, 64, lwsync_fixup); #endif /* CONFIG_PPC64 */ #ifdef CONFIG_VDSO32 - start = find_section32(v32->hdr, "__ftr_fixup", &size); - if (start) - do_feature_fixups(cur_cpu_spec->cpu_features, - start, start + size); - - start = find_section32(v32->hdr, "__mmu_ftr_fixup", &size); - if (start) - do_feature_fixups(cur_cpu_spec->mmu_features, - start, start + size); - + VDSO_DO_FIXUPS(feature, cur_cpu_spec->cpu_features, 32, ftr_fixup); + VDSO_DO_FIXUPS(feature, cur_cpu_spec->mmu_features, 32, mmu_ftr_fixup); #ifdef CONFIG_PPC64 - start = find_section32(v32->hdr, "__fw_ftr_fixup", &size); - if (start) - do_feature_fixups(powerpc_firmware_features, - start, start + size); + VDSO_DO_FIXUPS(feature, powerpc_firmware_features, 32, fw_ftr_fixup); #endif /* CONFIG_PPC64 */ - - start = find_section32(v32->hdr, "__lwsync_fixup", &size); - if (start) - do_lwsync_fixups(cur_cpu_spec->cpu_features, - start, start + size); + VDSO_DO_FIXUPS(lwsync, cur_cpu_spec->cpu_features, 32, lwsync_fixup); #endif - - return 0; -} - -static __init int vdso_fixup_alt_funcs(struct lib32_elfinfo *v32, - struct lib64_elfinfo *v64) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(vdso_patches); i++) { - struct vdso_patch_def *patch = &vdso_patches[i]; - int match = (cur_cpu_spec->cpu_features & patch->ftr_mask) - == patch->ftr_value; - if (!match) - continue; - - DBG("replacing %s with %s...\n", patch->gen_name, - patch->fix_name ? "NONE" : patch->fix_name); - - /* - * Patch the 32 bits and 64 bits symbols. Note that we do not - * patch the "." symbol on 64 bits. - * It would be easy to do, but doesn't seem to be necessary, - * patching the OPD symbol is enough. - */ - vdso_do_func_patch32(v32, v64, patch->gen_name, - patch->fix_name); -#ifdef CONFIG_PPC64 - vdso_do_func_patch64(v32, v64, patch->gen_name, - patch->fix_name); -#endif /* CONFIG_PPC64 */ - } - - return 0; -} - - -static __init int vdso_setup(void) -{ - struct lib32_elfinfo v32; - struct lib64_elfinfo v64; - - v32.hdr = vdso32_kbase; -#ifdef CONFIG_PPC64 - v64.hdr = vdso64_kbase; -#endif - if (vdso_do_find_sections(&v32, &v64)) - return -1; - - if (vdso_fixup_datapage(&v32, &v64)) - return -1; - - if (vdso_fixup_features(&v32, &v64)) - return -1; - - if (vdso_fixup_alt_funcs(&v32, &v64)) - return -1; - - vdso_setup_trampolines(&v32, &v64); - - return 0; } /* @@ -644,26 +280,13 @@ static __init int vdso_setup(void) static void __init vdso_setup_syscall_map(void) { unsigned int i; - extern unsigned long *sys_call_table; -#ifdef CONFIG_PPC64 - extern unsigned long *compat_sys_call_table; -#endif - extern unsigned long sys_ni_syscall; - for (i = 0; i < NR_syscalls; i++) { -#ifdef CONFIG_PPC64 - if (sys_call_table[i] != sys_ni_syscall) - vdso_data->syscall_map_64[i >> 5] |= - 0x80000000UL >> (i & 0x1f); - if (compat_sys_call_table[i] != sys_ni_syscall) - vdso_data->syscall_map_32[i >> 5] |= - 0x80000000UL >> (i & 0x1f); -#else /* CONFIG_PPC64 */ - if (sys_call_table[i] != sys_ni_syscall) - vdso_data->syscall_map_32[i >> 5] |= - 0x80000000UL >> (i & 0x1f); -#endif /* CONFIG_PPC64 */ + if (sys_call_table[i] != (void *)&sys_ni_syscall) + vdso_data->syscall_map[i >> 5] |= 0x80000000UL >> (i & 0x1f); + if (IS_ENABLED(CONFIG_COMPAT) && + compat_sys_call_table[i] != (void *)&sys_ni_syscall) + vdso_data->compat_syscall_map[i >> 5] |= 0x80000000UL >> (i & 0x1f); } } @@ -682,7 +305,7 @@ int vdso_getcpu_init(void) node = cpu_to_node(cpu); WARN_ON_ONCE(node > 0xffff); - val = (cpu & 0xfff) | ((node & 0xffff) << 16); + val = (cpu & 0xffff) | ((node & 0xffff) << 16); mtspr(SPRN_SPRG_VDSO_WRITE, val); get_paca()->sprg_vdso = val; @@ -694,10 +317,24 @@ int vdso_getcpu_init(void) early_initcall(vdso_getcpu_init); #endif -static int __init vdso_init(void) +static struct page ** __init vdso_setup_pages(void *start, void *end) { int i; + struct page **pagelist; + int pages = (end - start) >> PAGE_SHIFT; + + pagelist = kcalloc(pages + 1, sizeof(struct page *), GFP_KERNEL); + if (!pagelist) + panic("%s: Cannot allocate page list for VDSO", __func__); + + for (i = 0; i < pages; i++) + pagelist[i] = virt_to_page(start + i * PAGE_SIZE); + return pagelist; +} + +static int __init vdso_init(void) +{ #ifdef CONFIG_PPC64 /* * Fill up the "systemcfg" stuff for backward compatibility @@ -722,80 +359,19 @@ static int __init vdso_init(void) vdso_data->icache_block_size = ppc64_caches.l1i.block_size; vdso_data->dcache_log_block_size = ppc64_caches.l1d.log_block_size; vdso_data->icache_log_block_size = ppc64_caches.l1i.log_block_size; - - /* - * Calculate the size of the 64 bits vDSO - */ - vdso64_pages = (&vdso64_end - &vdso64_start) >> PAGE_SHIFT; - DBG("vdso64_kbase: %p, 0x%x pages\n", vdso64_kbase, vdso64_pages); -#else - vdso_data->dcache_block_size = L1_CACHE_BYTES; - vdso_data->dcache_log_block_size = L1_CACHE_SHIFT; - vdso_data->icache_block_size = L1_CACHE_BYTES; - vdso_data->icache_log_block_size = L1_CACHE_SHIFT; #endif /* CONFIG_PPC64 */ - -#ifdef CONFIG_VDSO32 - vdso32_kbase = &vdso32_start; - - /* - * Calculate the size of the 32 bits vDSO - */ - vdso32_pages = (&vdso32_end - &vdso32_start) >> PAGE_SHIFT; - DBG("vdso32_kbase: %p, 0x%x pages\n", vdso32_kbase, vdso32_pages); -#endif - - - /* - * Setup the syscall map in the vDOS - */ vdso_setup_syscall_map(); - /* - * Initialize the vDSO images in memory, that is do necessary - * fixups of vDSO symbols, locate trampolines, etc... - */ - if (vdso_setup()) { - printk(KERN_ERR "vDSO setup failure, not enabled !\n"); - vdso32_pages = 0; -#ifdef CONFIG_PPC64 - vdso64_pages = 0; -#endif - return 0; - } - -#ifdef CONFIG_VDSO32 - /* Make sure pages are in the correct state */ - vdso32_pagelist = kcalloc(vdso32_pages + 2, sizeof(struct page *), - GFP_KERNEL); - BUG_ON(vdso32_pagelist == NULL); - for (i = 0; i < vdso32_pages; i++) { - struct page *pg = virt_to_page(vdso32_kbase + i*PAGE_SIZE); - get_page(pg); - vdso32_pagelist[i] = pg; - } - vdso32_pagelist[i++] = virt_to_page(vdso_data); - vdso32_pagelist[i] = NULL; -#endif + vdso_fixup_features(); -#ifdef CONFIG_PPC64 - vdso64_pagelist = kcalloc(vdso64_pages + 2, sizeof(struct page *), - GFP_KERNEL); - BUG_ON(vdso64_pagelist == NULL); - for (i = 0; i < vdso64_pages; i++) { - struct page *pg = virt_to_page(vdso64_kbase + i*PAGE_SIZE); - get_page(pg); - vdso64_pagelist[i] = pg; - } - vdso64_pagelist[i++] = virt_to_page(vdso_data); - vdso64_pagelist[i] = NULL; -#endif /* CONFIG_PPC64 */ + if (IS_ENABLED(CONFIG_VDSO32)) + vdso32_spec.pages = vdso_setup_pages(&vdso32_start, &vdso32_end); - get_page(virt_to_page(vdso_data)); + if (IS_ENABLED(CONFIG_PPC64)) + vdso64_spec.pages = vdso_setup_pages(&vdso64_start, &vdso64_end); smp_wmb(); - vdso_ready = 1; return 0; } diff --git a/arch/powerpc/kernel/vdso/.gitignore b/arch/powerpc/kernel/vdso/.gitignore new file mode 100644 index 000000000000..dd9bdd67758b --- /dev/null +++ b/arch/powerpc/kernel/vdso/.gitignore @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0-only +vdso32.lds +vdso32.so.dbg +vdso64.lds +vdso64.so.dbg diff --git a/arch/powerpc/kernel/vdso/Makefile b/arch/powerpc/kernel/vdso/Makefile new file mode 100644 index 000000000000..1b93655c2857 --- /dev/null +++ b/arch/powerpc/kernel/vdso/Makefile @@ -0,0 +1,118 @@ +# SPDX-License-Identifier: GPL-2.0 + +# List of files in the vdso, has to be asm only for now + +# Include the generic Makefile to check the built vdso. +include $(srctree)/lib/vdso/Makefile + +obj-vdso32 = sigtramp32-32.o gettimeofday-32.o datapage-32.o cacheflush-32.o note-32.o getcpu-32.o +obj-vdso64 = sigtramp64-64.o gettimeofday-64.o datapage-64.o cacheflush-64.o note-64.o getcpu-64.o + +ifneq ($(c-gettimeofday-y),) + CFLAGS_vgettimeofday-32.o += -include $(c-gettimeofday-y) + CFLAGS_vgettimeofday-32.o += $(DISABLE_LATENT_ENTROPY_PLUGIN) + CFLAGS_vgettimeofday-32.o += $(call cc-option, -fno-stack-protector) + CFLAGS_vgettimeofday-32.o += -DDISABLE_BRANCH_PROFILING + CFLAGS_vgettimeofday-32.o += -ffreestanding -fasynchronous-unwind-tables + CFLAGS_REMOVE_vgettimeofday-32.o = $(CC_FLAGS_FTRACE) + CFLAGS_REMOVE_vgettimeofday-32.o += -mcmodel=medium -mabi=elfv1 -mabi=elfv2 -mcall-aixdesc + # This flag is supported by clang for 64-bit but not 32-bit so it will cause + # an unused command line flag warning for this file. + ifdef CONFIG_CC_IS_CLANG + CFLAGS_REMOVE_vgettimeofday-32.o += -fno-stack-clash-protection + endif + CFLAGS_vgettimeofday-64.o += -include $(c-gettimeofday-y) + CFLAGS_vgettimeofday-64.o += $(DISABLE_LATENT_ENTROPY_PLUGIN) + CFLAGS_vgettimeofday-64.o += $(call cc-option, -fno-stack-protector) + CFLAGS_vgettimeofday-64.o += -DDISABLE_BRANCH_PROFILING + CFLAGS_vgettimeofday-64.o += -ffreestanding -fasynchronous-unwind-tables + CFLAGS_REMOVE_vgettimeofday-64.o = $(CC_FLAGS_FTRACE) +# Go prior to 1.16.x assumes r30 is not clobbered by any VDSO code. That used to be true +# by accident when the VDSO was hand-written asm code, but may not be now that the VDSO is +# compiler generated. To avoid breaking Go tell GCC not to use r30. Impact on code +# generation is minimal, it will just use r29 instead. + CFLAGS_vgettimeofday-64.o += $(call cc-option, -ffixed-r30) +endif + +# Build rules + +ifdef CROSS32_COMPILE + VDSOCC := $(CROSS32_COMPILE)gcc +else + VDSOCC := $(CC) +endif + +targets := $(obj-vdso32) vdso32.so.dbg vgettimeofday-32.o +obj-vdso32 := $(addprefix $(obj)/, $(obj-vdso32)) +targets += $(obj-vdso64) vdso64.so.dbg vgettimeofday-64.o +obj-vdso64 := $(addprefix $(obj)/, $(obj-vdso64)) + +GCOV_PROFILE := n +KCOV_INSTRUMENT := n +UBSAN_SANITIZE := n +KASAN_SANITIZE := n +KCSAN_SANITIZE := n + +ccflags-y := -fno-common -fno-builtin +ldflags-y := -Wl,--hash-style=both -nostdlib -shared -z noexecstack $(CLANG_FLAGS) +ldflags-$(CONFIG_LD_IS_LLD) += $(call cc-option,--ld-path=$(LD),-fuse-ld=lld) +ldflags-$(CONFIG_LD_ORPHAN_WARN) += -Wl,--orphan-handling=$(CONFIG_LD_ORPHAN_WARN_LEVEL) + +# Filter flags that clang will warn are unused for linking +ldflags-y += $(filter-out $(CC_AUTO_VAR_INIT_ZERO_ENABLER) $(CC_FLAGS_FTRACE) -Wa$(comma)%, $(KBUILD_CFLAGS)) + +CC32FLAGS := -m32 +LD32FLAGS := -Wl,-soname=linux-vdso32.so.1 +AS32FLAGS := -D__VDSO32__ + +LD64FLAGS := -Wl,-soname=linux-vdso64.so.1 +AS64FLAGS := -D__VDSO64__ + +targets += vdso32.lds +CPPFLAGS_vdso32.lds += -P -C -Upowerpc +targets += vdso64.lds +CPPFLAGS_vdso64.lds += -P -C + +# link rule for the .so file, .lds has to be first +$(obj)/vdso32.so.dbg: $(src)/vdso32.lds $(obj-vdso32) $(obj)/vgettimeofday-32.o FORCE + $(call if_changed,vdso32ld_and_check) +$(obj)/vdso64.so.dbg: $(src)/vdso64.lds $(obj-vdso64) $(obj)/vgettimeofday-64.o FORCE + $(call if_changed,vdso64ld_and_check) + +# assembly rules for the .S files +$(obj-vdso32): %-32.o: %.S FORCE + $(call if_changed_dep,vdso32as) +$(obj)/vgettimeofday-32.o: %-32.o: %.c FORCE + $(call if_changed_dep,vdso32cc) +$(obj-vdso64): %-64.o: %.S FORCE + $(call if_changed_dep,vdso64as) +$(obj)/vgettimeofday-64.o: %-64.o: %.c FORCE + $(call if_changed_dep,cc_o_c) + +# Generate VDSO offsets using helper script +gen-vdso32sym := $(srctree)/$(src)/gen_vdso32_offsets.sh +quiet_cmd_vdso32sym = VDSO32SYM $@ + cmd_vdso32sym = $(NM) $< | $(gen-vdso32sym) | LC_ALL=C sort > $@ +gen-vdso64sym := $(srctree)/$(src)/gen_vdso64_offsets.sh +quiet_cmd_vdso64sym = VDSO64SYM $@ + cmd_vdso64sym = $(NM) $< | $(gen-vdso64sym) | LC_ALL=C sort > $@ + +include/generated/vdso32-offsets.h: $(obj)/vdso32.so.dbg FORCE + $(call if_changed,vdso32sym) +include/generated/vdso64-offsets.h: $(obj)/vdso64.so.dbg FORCE + $(call if_changed,vdso64sym) + +# actual build commands +quiet_cmd_vdso32ld_and_check = VDSO32L $@ + cmd_vdso32ld_and_check = $(VDSOCC) $(ldflags-y) $(CC32FLAGS) $(LD32FLAGS) -o $@ -Wl,-T$(filter %.lds,$^) $(filter %.o,$^); $(cmd_vdso_check) +quiet_cmd_vdso32as = VDSO32A $@ + cmd_vdso32as = $(VDSOCC) $(a_flags) $(CC32FLAGS) $(AS32FLAGS) -c -o $@ $< +quiet_cmd_vdso32cc = VDSO32C $@ + cmd_vdso32cc = $(VDSOCC) $(c_flags) $(CC32FLAGS) -c -o $@ $< + +quiet_cmd_vdso64ld_and_check = VDSO64L $@ + cmd_vdso64ld_and_check = $(VDSOCC) $(ldflags-y) $(LD64FLAGS) -o $@ -Wl,-T$(filter %.lds,$^) $(filter %.o,$^); $(cmd_vdso_check) +quiet_cmd_vdso64as = VDSO64A $@ + cmd_vdso64as = $(VDSOCC) $(a_flags) $(AS64FLAGS) -c -o $@ $< + +OBJECT_FILES_NON_STANDARD := y diff --git a/arch/powerpc/kernel/vdso64/cacheflush.S b/arch/powerpc/kernel/vdso/cacheflush.S index 526f5ba2593e..0085ae464dac 100644 --- a/arch/powerpc/kernel/vdso64/cacheflush.S +++ b/arch/powerpc/kernel/vdso/cacheflush.S @@ -8,7 +8,9 @@ #include <asm/processor.h> #include <asm/ppc_asm.h> #include <asm/vdso.h> +#include <asm/vdso_datapage.h> #include <asm/asm-offsets.h> +#include <asm/cache.h> .text @@ -22,59 +24,76 @@ */ V_FUNCTION_BEGIN(__kernel_sync_dicache) .cfi_startproc +BEGIN_FTR_SECTION + b 3f +END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE) +#ifdef CONFIG_PPC64 mflr r12 .cfi_register lr,r12 - mr r11,r3 - bl V_LOCAL_FUNC(__get_datapage) + get_datapage r10 mtlr r12 - mr r10,r3 + .cfi_restore lr +#endif +#ifdef CONFIG_PPC64 lwz r7,CFG_DCACHE_BLOCKSZ(r10) addi r5,r7,-1 - andc r6,r11,r5 /* round low to line bdy */ +#else + li r5, L1_CACHE_BYTES - 1 +#endif + andc r6,r3,r5 /* round low to line bdy */ subf r8,r6,r4 /* compute length */ add r8,r8,r5 /* ensure we get enough */ +#ifdef CONFIG_PPC64 lwz r9,CFG_DCACHE_LOGBLOCKSZ(r10) - srd. r8,r8,r9 /* compute line count */ + PPC_SRL. r8,r8,r9 /* compute line count */ +#else + srwi. r8, r8, L1_CACHE_SHIFT + mr r7, r6 +#endif crclr cr0*4+so beqlr /* nothing to do? */ mtctr r8 1: dcbst 0,r6 +#ifdef CONFIG_PPC64 add r6,r6,r7 +#else + addi r6, r6, L1_CACHE_BYTES +#endif bdnz 1b sync /* Now invalidate the instruction cache */ +#ifdef CONFIG_PPC64 lwz r7,CFG_ICACHE_BLOCKSZ(r10) addi r5,r7,-1 - andc r6,r11,r5 /* round low to line bdy */ + andc r6,r3,r5 /* round low to line bdy */ subf r8,r6,r4 /* compute length */ add r8,r8,r5 lwz r9,CFG_ICACHE_LOGBLOCKSZ(r10) - srd. r8,r8,r9 /* compute line count */ + PPC_SRL. r8,r8,r9 /* compute line count */ crclr cr0*4+so beqlr /* nothing to do? */ +#endif mtctr r8 +#ifdef CONFIG_PPC64 2: icbi 0,r6 add r6,r6,r7 +#else +2: icbi 0, r7 + addi r7, r7, L1_CACHE_BYTES +#endif bdnz 2b isync li r3,0 blr - .cfi_endproc -V_FUNCTION_END(__kernel_sync_dicache) - - -/* - * POWER5 version of __kernel_sync_dicache - */ -V_FUNCTION_BEGIN(__kernel_sync_dicache_p5) - .cfi_startproc +3: crclr cr0*4+so sync + icbi 0,r1 isync li r3,0 blr .cfi_endproc -V_FUNCTION_END(__kernel_sync_dicache_p5) +V_FUNCTION_END(__kernel_sync_dicache) diff --git a/arch/powerpc/kernel/vdso32/datapage.S b/arch/powerpc/kernel/vdso/datapage.S index 6c7401bd284e..db8e167f0166 100644 --- a/arch/powerpc/kernel/vdso32/datapage.S +++ b/arch/powerpc/kernel/vdso/datapage.S @@ -10,34 +10,9 @@ #include <asm/asm-offsets.h> #include <asm/unistd.h> #include <asm/vdso.h> +#include <asm/vdso_datapage.h> .text - .global __kernel_datapage_offset; -__kernel_datapage_offset: - .long 0 - -V_FUNCTION_BEGIN(__get_datapage) - .cfi_startproc - /* We don't want that exposed or overridable as we want other objects - * to be able to bl directly to here - */ - .protected __get_datapage - .hidden __get_datapage - - mflr r0 - .cfi_register lr,r0 - - bcl 20,31,data_page_branch -data_page_branch: - mflr r3 - mtlr r0 - addi r3, r3, __kernel_datapage_offset-data_page_branch - lwz r0,0(r3) - .cfi_restore lr - add r3,r0,r3 - blr - .cfi_endproc -V_FUNCTION_END(__get_datapage) /* * void *__kernel_get_syscall_map(unsigned int *syscall_count) ; @@ -52,15 +27,18 @@ V_FUNCTION_BEGIN(__kernel_get_syscall_map) .cfi_startproc mflr r12 .cfi_register lr,r12 - mr r4,r3 - bl __get_datapage@local + mr. r4,r3 + get_datapage r3 mtlr r12 +#ifdef __powerpc64__ + addi r3,r3,CFG_SYSCALL_MAP64 +#else addi r3,r3,CFG_SYSCALL_MAP32 - cmpli cr0,r4,0 +#endif + crclr cr0*4+so beqlr li r0,NR_syscalls stw r0,0(r4) - crclr cr0*4+so blr .cfi_endproc V_FUNCTION_END(__kernel_get_syscall_map) @@ -70,17 +48,17 @@ V_FUNCTION_END(__kernel_get_syscall_map) * * returns the timebase frequency in HZ */ -#ifndef CONFIG_PPC_BOOK3S_601 V_FUNCTION_BEGIN(__kernel_get_tbfreq) .cfi_startproc mflr r12 .cfi_register lr,r12 - bl __get_datapage@local + get_datapage r3 +#ifndef __powerpc64__ lwz r4,(CFG_TB_TICKS_PER_SEC + 4)(r3) - lwz r3,CFG_TB_TICKS_PER_SEC(r3) +#endif + PPC_LL r3,CFG_TB_TICKS_PER_SEC(r3) mtlr r12 crclr cr0*4+so blr .cfi_endproc V_FUNCTION_END(__kernel_get_tbfreq) -#endif diff --git a/arch/powerpc/kernel/vdso/gen_vdso32_offsets.sh b/arch/powerpc/kernel/vdso/gen_vdso32_offsets.sh new file mode 100755 index 000000000000..c7b54a5dcd3e --- /dev/null +++ b/arch/powerpc/kernel/vdso/gen_vdso32_offsets.sh @@ -0,0 +1,16 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 + +# +# Match symbols in the DSO that look like VDSO_*; produce a header file +# of constant offsets into the shared object. +# +# Doing this inside the Makefile will break the $(filter-out) function, +# causing Kbuild to rebuild the vdso-offsets header file every time. +# +# Author: Will Deacon <will.deacon@arm.com +# + +LC_ALL=C +sed -n -e 's/^00*/0/' -e \ +'s/^\([0-9a-fA-F]*\) . VDSO_\([a-zA-Z0-9_]*\)$/\#define vdso32_offset_\2\t0x\1/p' diff --git a/arch/powerpc/kernel/vdso/gen_vdso64_offsets.sh b/arch/powerpc/kernel/vdso/gen_vdso64_offsets.sh new file mode 100755 index 000000000000..4bf15ffd5933 --- /dev/null +++ b/arch/powerpc/kernel/vdso/gen_vdso64_offsets.sh @@ -0,0 +1,16 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 + +# +# Match symbols in the DSO that look like VDSO_*; produce a header file +# of constant offsets into the shared object. +# +# Doing this inside the Makefile will break the $(filter-out) function, +# causing Kbuild to rebuild the vdso-offsets header file every time. +# +# Author: Will Deacon <will.deacon@arm.com +# + +LC_ALL=C +sed -n -e 's/^00*/0/' -e \ +'s/^\([0-9a-fA-F]*\) . VDSO_\([a-zA-Z0-9_]*\)$/\#define vdso64_offset_\2\t0x\1/p' diff --git a/arch/powerpc/kernel/vdso32/getcpu.S b/arch/powerpc/kernel/vdso/getcpu.S index 63e914539e1a..8e08ccf19062 100644 --- a/arch/powerpc/kernel/vdso32/getcpu.S +++ b/arch/powerpc/kernel/vdso/getcpu.S @@ -15,19 +15,36 @@ * int __kernel_getcpu(unsigned *cpu, unsigned *node); * */ +#if defined(CONFIG_PPC64) V_FUNCTION_BEGIN(__kernel_getcpu) .cfi_startproc mfspr r5,SPRN_SPRG_VDSO_READ - cmpwi cr0,r3,0 - cmpwi cr1,r4,0 + PPC_LCMPI cr0,r3,0 + PPC_LCMPI cr1,r4,0 clrlwi r6,r5,16 rlwinm r7,r5,16,31-15,31-0 beq cr0,1f stw r6,0(r3) -1: beq cr1,2f - stw r7,0(r4) -2: crclr cr0*4+so +1: crclr cr0*4+so li r3,0 /* always success */ + beqlr cr1 + stw r7,0(r4) + blr + .cfi_endproc +V_FUNCTION_END(__kernel_getcpu) +#elif !defined(CONFIG_SMP) +V_FUNCTION_BEGIN(__kernel_getcpu) + .cfi_startproc + cmpwi cr0, r3, 0 + cmpwi cr1, r4, 0 + li r5, 0 + beq cr0, 1f + stw r5, 0(r3) +1: li r3, 0 /* always success */ + crclr cr0*4+so + beqlr cr1 + stw r5, 0(r4) blr .cfi_endproc V_FUNCTION_END(__kernel_getcpu) +#endif diff --git a/arch/powerpc/kernel/vdso/gettimeofday.S b/arch/powerpc/kernel/vdso/gettimeofday.S new file mode 100644 index 000000000000..48fc6658053a --- /dev/null +++ b/arch/powerpc/kernel/vdso/gettimeofday.S @@ -0,0 +1,133 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Userland implementation of gettimeofday() for processes + * for use in the vDSO + * + * Copyright (C) 2004 Benjamin Herrenschmuidt (benh@kernel.crashing.org, + * IBM Corp. + */ +#include <asm/processor.h> +#include <asm/ppc_asm.h> +#include <asm/vdso.h> +#include <asm/vdso_datapage.h> +#include <asm/asm-offsets.h> +#include <asm/unistd.h> + +/* + * The macro sets two stack frames, one for the caller and one for the callee + * because there are no requirement for the caller to set a stack frame when + * calling VDSO so it may have omitted to set one, especially on PPC64 + */ + +.macro cvdso_call funct call_time=0 + .cfi_startproc + PPC_STLU r1, -PPC_MIN_STKFRM(r1) + .cfi_adjust_cfa_offset PPC_MIN_STKFRM + mflr r0 + PPC_STLU r1, -PPC_MIN_STKFRM(r1) + .cfi_adjust_cfa_offset PPC_MIN_STKFRM + PPC_STL r0, PPC_MIN_STKFRM + PPC_LR_STKOFF(r1) + .cfi_rel_offset lr, PPC_MIN_STKFRM + PPC_LR_STKOFF +#ifdef __powerpc64__ + PPC_STL r2, PPC_MIN_STKFRM + STK_GOT(r1) + .cfi_rel_offset r2, PPC_MIN_STKFRM + STK_GOT +#endif + get_datapage r5 + .ifeq \call_time + addi r5, r5, VDSO_DATA_OFFSET + .else + addi r4, r5, VDSO_DATA_OFFSET + .endif +#ifdef __powerpc64__ + bl CFUNC(DOTSYM(\funct)) +#else + bl \funct +#endif + PPC_LL r0, PPC_MIN_STKFRM + PPC_LR_STKOFF(r1) +#ifdef __powerpc64__ + PPC_LL r2, PPC_MIN_STKFRM + STK_GOT(r1) + .cfi_restore r2 +#endif + .ifeq \call_time + cmpwi r3, 0 + .endif + mtlr r0 + addi r1, r1, 2 * PPC_MIN_STKFRM + .cfi_restore lr + .cfi_def_cfa_offset 0 + crclr so + .ifeq \call_time + beqlr+ + crset so + neg r3, r3 + .endif + blr + .cfi_endproc +.endm + + .text +/* + * Exact prototype of gettimeofday + * + * int __kernel_gettimeofday(struct timeval *tv, struct timezone *tz); + * + */ +V_FUNCTION_BEGIN(__kernel_gettimeofday) + cvdso_call __c_kernel_gettimeofday +V_FUNCTION_END(__kernel_gettimeofday) + +/* + * Exact prototype of clock_gettime() + * + * int __kernel_clock_gettime(clockid_t clock_id, struct timespec *tp); + * + */ +V_FUNCTION_BEGIN(__kernel_clock_gettime) + cvdso_call __c_kernel_clock_gettime +V_FUNCTION_END(__kernel_clock_gettime) + +/* + * Exact prototype of clock_gettime64() + * + * int __kernel_clock_gettime64(clockid_t clock_id, struct __timespec64 *ts); + * + */ +#ifndef __powerpc64__ +V_FUNCTION_BEGIN(__kernel_clock_gettime64) + cvdso_call __c_kernel_clock_gettime64 +V_FUNCTION_END(__kernel_clock_gettime64) +#endif + +/* + * Exact prototype of clock_getres() + * + * int __kernel_clock_getres(clockid_t clock_id, struct timespec *res); + * + */ +V_FUNCTION_BEGIN(__kernel_clock_getres) + cvdso_call __c_kernel_clock_getres +V_FUNCTION_END(__kernel_clock_getres) + + +/* + * Exact prototype of time() + * + * time_t time(time *t); + * + */ +V_FUNCTION_BEGIN(__kernel_time) + cvdso_call __c_kernel_time call_time=1 +V_FUNCTION_END(__kernel_time) + +/* Routines for restoring integer registers, called by the compiler. */ +/* Called with r11 pointing to the stack header word of the caller of the */ +/* function, just beyond the end of the integer restore area. */ +#ifndef __powerpc64__ +_GLOBAL(_restgpr_31_x) +_GLOBAL(_rest32gpr_31_x) + lwz r0,4(r11) + lwz r31,-4(r11) + mtlr r0 + mr r1,r11 + blr +#endif diff --git a/arch/powerpc/kernel/vdso32/note.S b/arch/powerpc/kernel/vdso/note.S index 227a7327399e..227a7327399e 100644 --- a/arch/powerpc/kernel/vdso32/note.S +++ b/arch/powerpc/kernel/vdso/note.S diff --git a/arch/powerpc/kernel/vdso32/sigtramp.S b/arch/powerpc/kernel/vdso/sigtramp32.S index 0bcc5e5fe789..0bcc5e5fe789 100644 --- a/arch/powerpc/kernel/vdso32/sigtramp.S +++ b/arch/powerpc/kernel/vdso/sigtramp32.S diff --git a/arch/powerpc/kernel/vdso64/sigtramp.S b/arch/powerpc/kernel/vdso/sigtramp64.S index a8cc0409d7d2..2d4067561293 100644 --- a/arch/powerpc/kernel/vdso64/sigtramp.S +++ b/arch/powerpc/kernel/vdso/sigtramp64.S @@ -6,6 +6,7 @@ * Copyright (C) 2004 Benjamin Herrenschmuidt (benh@kernel.crashing.org), IBM Corp. * Copyright (C) 2004 Alan Modra (amodra@au.ibm.com)), IBM Corp. */ +#include <asm/cache.h> /* IFETCH_ALIGN_BYTES */ #include <asm/processor.h> #include <asm/ppc_asm.h> #include <asm/unistd.h> @@ -14,21 +15,26 @@ .text -/* The nop here is a hack. The dwarf2 unwind routines subtract 1 from - the return address to get an address in the middle of the presumed - call instruction. Since we don't have a call here, we artificially - extend the range covered by the unwind info by padding before the - real start. */ - nop +/* + * __kernel_start_sigtramp_rt64 and __kernel_sigtramp_rt64 together + * are one function split in two parts. The kernel jumps to the former + * and the signal handler indirectly (by blr) returns to the latter. + * __kernel_sigtramp_rt64 needs to point to the return address so + * glibc can correctly identify the trampoline stack frame. + */ .balign 8 + .balign IFETCH_ALIGN_BYTES +V_FUNCTION_BEGIN(__kernel_start_sigtramp_rt64) +.Lsigrt_start: + bctrl /* call the handler */ +V_FUNCTION_END(__kernel_start_sigtramp_rt64) V_FUNCTION_BEGIN(__kernel_sigtramp_rt64) -.Lsigrt_start = . - 4 addi r1, r1, __SIGNAL_FRAMESIZE li r0,__NR_rt_sigreturn sc .Lsigrt_end: V_FUNCTION_END(__kernel_sigtramp_rt64) -/* The ".balign 8" above and the following zeros mimic the old stack +/* The .balign 8 above and the following zeros mimic the old stack trampoline layout. The last magic value is the ucontext pointer, chosen in such a way that older libgcc unwind code returns a zero for a sigcontext pointer. */ diff --git a/arch/powerpc/kernel/vdso32/vdso32.lds.S b/arch/powerpc/kernel/vdso/vdso32.lds.S index 00c025ba4a92..426e1ccc6971 100644 --- a/arch/powerpc/kernel/vdso32/vdso32.lds.S +++ b/arch/powerpc/kernel/vdso/vdso32.lds.S @@ -4,6 +4,8 @@ * library */ #include <asm/vdso.h> +#include <asm/page.h> +#include <asm-generic/vmlinux.lds.h> #ifdef __LITTLE_ENDIAN__ OUTPUT_FORMAT("elf32-powerpcle", "elf32-powerpcle", "elf32-powerpcle") @@ -11,11 +13,11 @@ OUTPUT_FORMAT("elf32-powerpcle", "elf32-powerpcle", "elf32-powerpcle") OUTPUT_FORMAT("elf32-powerpc", "elf32-powerpc", "elf32-powerpc") #endif OUTPUT_ARCH(powerpc:common) -ENTRY(_start) SECTIONS { - . = VDSO32_LBASE + SIZEOF_HEADERS; + PROVIDE(_vdso_datapage = . - 2 * PAGE_SIZE); + . = SIZEOF_HEADERS; .hash : { *(.hash) } :text .gnu.hash : { *(.gnu.hash) } @@ -36,17 +38,25 @@ SECTIONS PROVIDE(etext = .); . = ALIGN(8); + VDSO_ftr_fixup_start = .; __ftr_fixup : { *(__ftr_fixup) } + VDSO_ftr_fixup_end = .; . = ALIGN(8); + VDSO_mmu_ftr_fixup_start = .; __mmu_ftr_fixup : { *(__mmu_ftr_fixup) } + VDSO_mmu_ftr_fixup_end = .; . = ALIGN(8); + VDSO_lwsync_fixup_start = .; __lwsync_fixup : { *(__lwsync_fixup) } + VDSO_lwsync_fixup_end = .; #ifdef CONFIG_PPC64 . = ALIGN(8); + VDSO_fw_ftr_fixup_start = .; __fw_ftr_fixup : { *(__fw_ftr_fixup) } + VDSO_fw_ftr_fixup_end = .; #endif /* @@ -68,49 +78,16 @@ SECTIONS __end = .; PROVIDE(end = .); - /* - * Stabs debugging sections are here too. - */ - .stab 0 : { *(.stab) } - .stabstr 0 : { *(.stabstr) } - .stab.excl 0 : { *(.stab.excl) } - .stab.exclstr 0 : { *(.stab.exclstr) } - .stab.index 0 : { *(.stab.index) } - .stab.indexstr 0 : { *(.stab.indexstr) } - .comment 0 : { *(.comment) } - - /* - * DWARF debug sections. - * Symbols in the DWARF debugging sections are relative to the beginning - * of the section so we begin them at 0. - */ - /* DWARF 1 */ - .debug 0 : { *(.debug) } - .line 0 : { *(.line) } - /* GNU DWARF 1 extensions */ - .debug_srcinfo 0 : { *(.debug_srcinfo) } - .debug_sfnames 0 : { *(.debug_sfnames) } - /* DWARF 1.1 and DWARF 2 */ - .debug_aranges 0 : { *(.debug_aranges) } - .debug_pubnames 0 : { *(.debug_pubnames) } - /* DWARF 2 */ - .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) } - .debug_abbrev 0 : { *(.debug_abbrev) } - .debug_line 0 : { *(.debug_line) } - .debug_frame 0 : { *(.debug_frame) } - .debug_str 0 : { *(.debug_str) } - .debug_loc 0 : { *(.debug_loc) } - .debug_macinfo 0 : { *(.debug_macinfo) } - /* SGI/MIPS DWARF 2 extensions */ - .debug_weaknames 0 : { *(.debug_weaknames) } - .debug_funcnames 0 : { *(.debug_funcnames) } - .debug_typenames 0 : { *(.debug_typenames) } - .debug_varnames 0 : { *(.debug_varnames) } + DWARF_DEBUG + ELF_DETAILS /DISCARD/ : { *(.note.GNU-stack) + *(*.EMB.apuinfo) + *(.branch_lt) *(.data .data.* .gnu.linkonce.d.* .sdata*) *(.bss .sbss .dynbss .dynsbss) + *(.got1 .glink .iplt .rela*) } } @@ -138,27 +115,26 @@ VERSION { VDSO_VERSION_STRING { global: - /* - * Has to be there for the kernel to find - */ - __kernel_datapage_offset; - __kernel_get_syscall_map; -#ifndef CONFIG_PPC_BOOK3S_601 __kernel_gettimeofday; __kernel_clock_gettime; + __kernel_clock_gettime64; __kernel_clock_getres; __kernel_time; __kernel_get_tbfreq; -#endif __kernel_sync_dicache; - __kernel_sync_dicache_p5; __kernel_sigtramp32; __kernel_sigtramp_rt32; -#ifdef CONFIG_PPC64 +#if defined(CONFIG_PPC64) || !defined(CONFIG_SMP) __kernel_getcpu; #endif local: *; }; } + +/* + * Make the sigreturn code visible to the kernel. + */ +VDSO_sigtramp32 = __kernel_sigtramp32; +VDSO_sigtramp_rt32 = __kernel_sigtramp_rt32; diff --git a/arch/powerpc/kernel/vdso64/vdso64.lds.S b/arch/powerpc/kernel/vdso/vdso64.lds.S index 256fb9720298..bda6c8cdd459 100644 --- a/arch/powerpc/kernel/vdso64/vdso64.lds.S +++ b/arch/powerpc/kernel/vdso/vdso64.lds.S @@ -4,6 +4,8 @@ * library */ #include <asm/vdso.h> +#include <asm/page.h> +#include <asm-generic/vmlinux.lds.h> #ifdef __LITTLE_ENDIAN__ OUTPUT_FORMAT("elf64-powerpcle", "elf64-powerpcle", "elf64-powerpcle") @@ -11,11 +13,11 @@ OUTPUT_FORMAT("elf64-powerpcle", "elf64-powerpcle", "elf64-powerpcle") OUTPUT_FORMAT("elf64-powerpc", "elf64-powerpc", "elf64-powerpc") #endif OUTPUT_ARCH(powerpc:common64) -ENTRY(_start) SECTIONS { - . = VDSO64_LBASE + SIZEOF_HEADERS; + PROVIDE(_vdso_datapage = . - 2 * PAGE_SIZE); + . = SIZEOF_HEADERS; .hash : { *(.hash) } :text .gnu.hash : { *(.gnu.hash) } @@ -30,23 +32,31 @@ SECTIONS . = ALIGN(16); .text : { *(.text .stub .text.* .gnu.linkonce.t.* __ftr_alt_*) - *(.sfpr .glink) + *(.sfpr) } :text PROVIDE(__etext = .); PROVIDE(_etext = .); PROVIDE(etext = .); . = ALIGN(8); + VDSO_ftr_fixup_start = .; __ftr_fixup : { *(__ftr_fixup) } + VDSO_ftr_fixup_end = .; . = ALIGN(8); + VDSO_mmu_ftr_fixup_start = .; __mmu_ftr_fixup : { *(__mmu_ftr_fixup) } + VDSO_mmu_ftr_fixup_end = .; . = ALIGN(8); + VDSO_lwsync_fixup_start = .; __lwsync_fixup : { *(__lwsync_fixup) } + VDSO_lwsync_fixup_end = .; . = ALIGN(8); + VDSO_fw_ftr_fixup_start = .; __fw_ftr_fixup : { *(__fw_ftr_fixup) } + VDSO_fw_ftr_fixup_end = .; /* * Other stuff is appended to the text segment: @@ -61,56 +71,22 @@ SECTIONS .gcc_except_table : { *(.gcc_except_table) } .rela.dyn ALIGN(8) : { *(.rela.dyn) } - .opd ALIGN(8) : { KEEP (*(.opd)) } .got ALIGN(8) : { *(.got .toc) } _end = .; PROVIDE(end = .); - /* - * Stabs debugging sections are here too. - */ - .stab 0 : { *(.stab) } - .stabstr 0 : { *(.stabstr) } - .stab.excl 0 : { *(.stab.excl) } - .stab.exclstr 0 : { *(.stab.exclstr) } - .stab.index 0 : { *(.stab.index) } - .stab.indexstr 0 : { *(.stab.indexstr) } - .comment 0 : { *(.comment) } - - /* - * DWARF debug sections. - * Symbols in the DWARF debugging sections are relative to the beginning - * of the section so we begin them at 0. - */ - /* DWARF 1 */ - .debug 0 : { *(.debug) } - .line 0 : { *(.line) } - /* GNU DWARF 1 extensions */ - .debug_srcinfo 0 : { *(.debug_srcinfo) } - .debug_sfnames 0 : { *(.debug_sfnames) } - /* DWARF 1.1 and DWARF 2 */ - .debug_aranges 0 : { *(.debug_aranges) } - .debug_pubnames 0 : { *(.debug_pubnames) } - /* DWARF 2 */ - .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) } - .debug_abbrev 0 : { *(.debug_abbrev) } - .debug_line 0 : { *(.debug_line) } - .debug_frame 0 : { *(.debug_frame) } - .debug_str 0 : { *(.debug_str) } - .debug_loc 0 : { *(.debug_loc) } - .debug_macinfo 0 : { *(.debug_macinfo) } - /* SGI/MIPS DWARF 2 extensions */ - .debug_weaknames 0 : { *(.debug_weaknames) } - .debug_funcnames 0 : { *(.debug_funcnames) } - .debug_typenames 0 : { *(.debug_typenames) } - .debug_varnames 0 : { *(.debug_varnames) } + DWARF_DEBUG + ELF_DETAILS /DISCARD/ : { *(.note.GNU-stack) + *(*.EMB.apuinfo) *(.branch_lt) *(.data .data.* .gnu.linkonce.d.* .sdata*) *(.bss .sbss .dynbss .dynsbss) + *(.opd) + *(.glink .iplt .plt .rela*) } } @@ -138,18 +114,12 @@ VERSION { VDSO_VERSION_STRING { global: - /* - * Has to be there for the kernel to find - */ - __kernel_datapage_offset; - __kernel_get_syscall_map; __kernel_gettimeofday; __kernel_clock_gettime; __kernel_clock_getres; __kernel_get_tbfreq; __kernel_sync_dicache; - __kernel_sync_dicache_p5; __kernel_sigtramp_rt64; __kernel_getcpu; __kernel_time; @@ -157,3 +127,8 @@ VERSION local: *; }; } + +/* + * Make the sigreturn code visible to the kernel. + */ +VDSO_sigtramp_rt64 = __kernel_start_sigtramp_rt64; diff --git a/arch/powerpc/kernel/vdso/vgettimeofday.c b/arch/powerpc/kernel/vdso/vgettimeofday.c new file mode 100644 index 000000000000..55a287c9a736 --- /dev/null +++ b/arch/powerpc/kernel/vdso/vgettimeofday.c @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Powerpc userspace implementations of gettimeofday() and similar. + */ +#include <linux/time.h> +#include <linux/types.h> + +#ifdef __powerpc64__ +int __c_kernel_clock_gettime(clockid_t clock, struct __kernel_timespec *ts, + const struct vdso_data *vd) +{ + return __cvdso_clock_gettime_data(vd, clock, ts); +} + +int __c_kernel_clock_getres(clockid_t clock_id, struct __kernel_timespec *res, + const struct vdso_data *vd) +{ + return __cvdso_clock_getres_data(vd, clock_id, res); +} +#else +int __c_kernel_clock_gettime(clockid_t clock, struct old_timespec32 *ts, + const struct vdso_data *vd) +{ + return __cvdso_clock_gettime32_data(vd, clock, ts); +} + +int __c_kernel_clock_gettime64(clockid_t clock, struct __kernel_timespec *ts, + const struct vdso_data *vd) +{ + return __cvdso_clock_gettime_data(vd, clock, ts); +} + +int __c_kernel_clock_getres(clockid_t clock_id, struct old_timespec32 *res, + const struct vdso_data *vd) +{ + return __cvdso_clock_getres_time32_data(vd, clock_id, res); +} +#endif + +int __c_kernel_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz, + const struct vdso_data *vd) +{ + return __cvdso_gettimeofday_data(vd, tv, tz); +} + +__kernel_old_time_t __c_kernel_time(__kernel_old_time_t *time, const struct vdso_data *vd) +{ + return __cvdso_time_data(vd, time); +} diff --git a/arch/powerpc/kernel/vdso32/.gitignore b/arch/powerpc/kernel/vdso32/.gitignore deleted file mode 100644 index fea5809857a5..000000000000 --- a/arch/powerpc/kernel/vdso32/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -vdso32.lds -vdso32.so.dbg diff --git a/arch/powerpc/kernel/vdso32/Makefile b/arch/powerpc/kernel/vdso32/Makefile deleted file mode 100644 index 06f54d947057..000000000000 --- a/arch/powerpc/kernel/vdso32/Makefile +++ /dev/null @@ -1,67 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 - -# List of files in the vdso, has to be asm only for now - -obj-vdso32-$(CONFIG_PPC64) = getcpu.o -obj-vdso32 = sigtramp.o gettimeofday.o datapage.o cacheflush.o note.o \ - $(obj-vdso32-y) - -# Build rules - -ifdef CROSS32_COMPILE - VDSOCC := $(CROSS32_COMPILE)gcc -else - VDSOCC := $(CC) -endif - -CC32FLAGS := -ifdef CONFIG_PPC64 -CC32FLAGS += -m32 -endif - -targets := $(obj-vdso32) vdso32.so vdso32.so.dbg -obj-vdso32 := $(addprefix $(obj)/, $(obj-vdso32)) - -GCOV_PROFILE := n -KCOV_INSTRUMENT := n -UBSAN_SANITIZE := n - -ccflags-y := -shared -fno-common -fno-builtin -nostdlib \ - -Wl,-soname=linux-vdso32.so.1 -Wl,--hash-style=both -asflags-y := -D__VDSO32__ -s - -obj-y += vdso32_wrapper.o -extra-y += vdso32.lds -CPPFLAGS_vdso32.lds += -P -C -Upowerpc - -# Force dependency (incbin is bad) -$(obj)/vdso32_wrapper.o : $(obj)/vdso32.so - -# link rule for the .so file, .lds has to be first -$(obj)/vdso32.so.dbg: $(src)/vdso32.lds $(obj-vdso32) FORCE - $(call if_changed,vdso32ld) - -# strip rule for the .so file -$(obj)/%.so: OBJCOPYFLAGS := -S -$(obj)/%.so: $(obj)/%.so.dbg FORCE - $(call if_changed,objcopy) - -# assembly rules for the .S files -$(obj-vdso32): %.o: %.S FORCE - $(call if_changed_dep,vdso32as) - -# actual build commands -quiet_cmd_vdso32ld = VDSO32L $@ - cmd_vdso32ld = $(VDSOCC) $(c_flags) $(CC32FLAGS) -o $@ -Wl,-T$(filter %.lds,$^) $(filter %.o,$^) -quiet_cmd_vdso32as = VDSO32A $@ - cmd_vdso32as = $(VDSOCC) $(a_flags) $(CC32FLAGS) -c -o $@ $< - -# install commands for the unstripped file -quiet_cmd_vdso_install = INSTALL $@ - cmd_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/vdso/$@ - -vdso32.so: $(obj)/vdso32.so.dbg - @mkdir -p $(MODLIB)/vdso - $(call cmd,vdso_install) - -vdso_install: vdso32.so diff --git a/arch/powerpc/kernel/vdso32/cacheflush.S b/arch/powerpc/kernel/vdso32/cacheflush.S deleted file mode 100644 index 7f882e7b9f43..000000000000 --- a/arch/powerpc/kernel/vdso32/cacheflush.S +++ /dev/null @@ -1,81 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * vDSO provided cache flush routines - * - * Copyright (C) 2004 Benjamin Herrenschmuidt (benh@kernel.crashing.org), - * IBM Corp. - */ -#include <asm/processor.h> -#include <asm/ppc_asm.h> -#include <asm/vdso.h> -#include <asm/asm-offsets.h> - - .text - -/* - * Default "generic" version of __kernel_sync_dicache. - * - * void __kernel_sync_dicache(unsigned long start, unsigned long end) - * - * Flushes the data cache & invalidate the instruction cache for the - * provided range [start, end[ - */ -V_FUNCTION_BEGIN(__kernel_sync_dicache) - .cfi_startproc - mflr r12 - .cfi_register lr,r12 - mr r11,r3 - bl __get_datapage@local - mtlr r12 - mr r10,r3 - - lwz r7,CFG_DCACHE_BLOCKSZ(r10) - addi r5,r7,-1 - andc r6,r11,r5 /* round low to line bdy */ - subf r8,r6,r4 /* compute length */ - add r8,r8,r5 /* ensure we get enough */ - lwz r9,CFG_DCACHE_LOGBLOCKSZ(r10) - srw. r8,r8,r9 /* compute line count */ - crclr cr0*4+so - beqlr /* nothing to do? */ - mtctr r8 -1: dcbst 0,r6 - add r6,r6,r7 - bdnz 1b - sync - -/* Now invalidate the instruction cache */ - - lwz r7,CFG_ICACHE_BLOCKSZ(r10) - addi r5,r7,-1 - andc r6,r11,r5 /* round low to line bdy */ - subf r8,r6,r4 /* compute length */ - add r8,r8,r5 - lwz r9,CFG_ICACHE_LOGBLOCKSZ(r10) - srw. r8,r8,r9 /* compute line count */ - crclr cr0*4+so - beqlr /* nothing to do? */ - mtctr r8 -2: icbi 0,r6 - add r6,r6,r7 - bdnz 2b - isync - li r3,0 - blr - .cfi_endproc -V_FUNCTION_END(__kernel_sync_dicache) - - -/* - * POWER5 version of __kernel_sync_dicache - */ -V_FUNCTION_BEGIN(__kernel_sync_dicache_p5) - .cfi_startproc - crclr cr0*4+so - sync - isync - li r3,0 - blr - .cfi_endproc -V_FUNCTION_END(__kernel_sync_dicache_p5) - diff --git a/arch/powerpc/kernel/vdso32/gettimeofday.S b/arch/powerpc/kernel/vdso32/gettimeofday.S deleted file mode 100644 index 3306672f57a9..000000000000 --- a/arch/powerpc/kernel/vdso32/gettimeofday.S +++ /dev/null @@ -1,290 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Userland implementation of gettimeofday() for 32 bits processes in a - * ppc64 kernel for use in the vDSO - * - * Copyright (C) 2004 Benjamin Herrenschmuidt (benh@kernel.crashing.org, - * IBM Corp. - */ -#include <asm/processor.h> -#include <asm/ppc_asm.h> -#include <asm/vdso.h> -#include <asm/asm-offsets.h> -#include <asm/unistd.h> - -/* Offset for the low 32-bit part of a field of long type */ -#ifdef CONFIG_PPC64 -#define LOPART 4 -#else -#define LOPART 0 -#endif - - .text -/* - * Exact prototype of gettimeofday - * - * int __kernel_gettimeofday(struct timeval *tv, struct timezone *tz); - * - */ -V_FUNCTION_BEGIN(__kernel_gettimeofday) - .cfi_startproc - mflr r12 - .cfi_register lr,r12 - - mr r10,r3 /* r10 saves tv */ - mr r11,r4 /* r11 saves tz */ - bl __get_datapage@local /* get data page */ - mr r9, r3 /* datapage ptr in r9 */ - cmplwi r10,0 /* check if tv is NULL */ - beq 3f - lis r7,1000000@ha /* load up USEC_PER_SEC */ - addi r7,r7,1000000@l /* so we get microseconds in r4 */ - bl __do_get_tspec@local /* get sec/usec from tb & kernel */ - stw r3,TVAL32_TV_SEC(r10) - stw r4,TVAL32_TV_USEC(r10) - -3: cmplwi r11,0 /* check if tz is NULL */ - beq 1f - lwz r4,CFG_TZ_MINUTEWEST(r9)/* fill tz */ - lwz r5,CFG_TZ_DSTTIME(r9) - stw r4,TZONE_TZ_MINWEST(r11) - stw r5,TZONE_TZ_DSTTIME(r11) - -1: mtlr r12 - crclr cr0*4+so - li r3,0 - blr - .cfi_endproc -V_FUNCTION_END(__kernel_gettimeofday) - -/* - * Exact prototype of clock_gettime() - * - * int __kernel_clock_gettime(clockid_t clock_id, struct timespec *tp); - * - */ -V_FUNCTION_BEGIN(__kernel_clock_gettime) - .cfi_startproc - /* Check for supported clock IDs */ - cmpli cr0,r3,CLOCK_REALTIME - cmpli cr1,r3,CLOCK_MONOTONIC - cror cr0*4+eq,cr0*4+eq,cr1*4+eq - bne cr0,99f - - mflr r12 /* r12 saves lr */ - .cfi_register lr,r12 - mr r11,r4 /* r11 saves tp */ - bl __get_datapage@local /* get data page */ - mr r9,r3 /* datapage ptr in r9 */ - lis r7,NSEC_PER_SEC@h /* want nanoseconds */ - ori r7,r7,NSEC_PER_SEC@l -50: bl __do_get_tspec@local /* get sec/nsec from tb & kernel */ - bne cr1,80f /* not monotonic -> all done */ - - /* - * CLOCK_MONOTONIC - */ - - /* now we must fixup using wall to monotonic. We need to snapshot - * that value and do the counter trick again. Fortunately, we still - * have the counter value in r8 that was returned by __do_get_xsec. - * At this point, r3,r4 contain our sec/nsec values, r5 and r6 - * can be used, r7 contains NSEC_PER_SEC. - */ - - lwz r5,(WTOM_CLOCK_SEC+LOPART)(r9) - lwz r6,WTOM_CLOCK_NSEC(r9) - - /* We now have our offset in r5,r6. We create a fake dependency - * on that value and re-check the counter - */ - or r0,r6,r5 - xor r0,r0,r0 - add r9,r9,r0 - lwz r0,(CFG_TB_UPDATE_COUNT+LOPART)(r9) - cmpl cr0,r8,r0 /* check if updated */ - bne- 50b - - /* Calculate and store result. Note that this mimics the C code, - * which may cause funny results if nsec goes negative... is that - * possible at all ? - */ - add r3,r3,r5 - add r4,r4,r6 - cmpw cr0,r4,r7 - cmpwi cr1,r4,0 - blt 1f - subf r4,r7,r4 - addi r3,r3,1 -1: bge cr1,80f - addi r3,r3,-1 - add r4,r4,r7 - -80: stw r3,TSPC32_TV_SEC(r11) - stw r4,TSPC32_TV_NSEC(r11) - - mtlr r12 - crclr cr0*4+so - li r3,0 - blr - - /* - * syscall fallback - */ -99: - li r0,__NR_clock_gettime - .cfi_restore lr - sc - blr - .cfi_endproc -V_FUNCTION_END(__kernel_clock_gettime) - - -/* - * Exact prototype of clock_getres() - * - * int __kernel_clock_getres(clockid_t clock_id, struct timespec *res); - * - */ -V_FUNCTION_BEGIN(__kernel_clock_getres) - .cfi_startproc - /* Check for supported clock IDs */ - cmpwi cr0,r3,CLOCK_REALTIME - cmpwi cr1,r3,CLOCK_MONOTONIC - cror cr0*4+eq,cr0*4+eq,cr1*4+eq - bne cr0,99f - - mflr r12 - .cfi_register lr,r12 - bl __get_datapage@local /* get data page */ - lwz r5, CLOCK_HRTIMER_RES(r3) - mtlr r12 - li r3,0 - cmpli cr0,r4,0 - crclr cr0*4+so - beqlr - stw r3,TSPC32_TV_SEC(r4) - stw r5,TSPC32_TV_NSEC(r4) - blr - - /* - * syscall fallback - */ -99: - li r0,__NR_clock_getres - sc - blr - .cfi_endproc -V_FUNCTION_END(__kernel_clock_getres) - - -/* - * Exact prototype of time() - * - * time_t time(time *t); - * - */ -V_FUNCTION_BEGIN(__kernel_time) - .cfi_startproc - mflr r12 - .cfi_register lr,r12 - - mr r11,r3 /* r11 holds t */ - bl __get_datapage@local - mr r9, r3 /* datapage ptr in r9 */ - - lwz r3,STAMP_XTIME_SEC+LOPART(r9) - - cmplwi r11,0 /* check if t is NULL */ - beq 2f - stw r3,0(r11) /* store result at *t */ -2: mtlr r12 - crclr cr0*4+so - blr - .cfi_endproc -V_FUNCTION_END(__kernel_time) - -/* - * This is the core of clock_gettime() and gettimeofday(), - * it returns the current time in r3 (seconds) and r4. - * On entry, r7 gives the resolution of r4, either USEC_PER_SEC - * or NSEC_PER_SEC, giving r4 in microseconds or nanoseconds. - * It expects the datapage ptr in r9 and doesn't clobber it. - * It clobbers r0, r5 and r6. - * On return, r8 contains the counter value that can be reused. - * This clobbers cr0 but not any other cr field. - */ -__do_get_tspec: - .cfi_startproc - /* Check for update count & load values. We use the low - * order 32 bits of the update count - */ -1: lwz r8,(CFG_TB_UPDATE_COUNT+LOPART)(r9) - andi. r0,r8,1 /* pending update ? loop */ - bne- 1b - xor r0,r8,r8 /* create dependency */ - add r9,r9,r0 - - /* Load orig stamp (offset to TB) */ - lwz r5,CFG_TB_ORIG_STAMP(r9) - lwz r6,(CFG_TB_ORIG_STAMP+4)(r9) - - /* Get a stable TB value */ -2: MFTBU(r3) - MFTBL(r4) - MFTBU(r0) - cmplw cr0,r3,r0 - bne- 2b - - /* Subtract tb orig stamp and shift left 12 bits. - */ - subfc r4,r6,r4 - subfe r0,r5,r3 - slwi r0,r0,12 - rlwimi. r0,r4,12,20,31 - slwi r4,r4,12 - - /* - * Load scale factor & do multiplication. - * We only use the high 32 bits of the tb_to_xs value. - * Even with a 1GHz timebase clock, the high 32 bits of - * tb_to_xs will be at least 4 million, so the error from - * ignoring the low 32 bits will be no more than 0.25ppm. - * The error will just make the clock run very very slightly - * slow until the next time the kernel updates the VDSO data, - * at which point the clock will catch up to the kernel's value, - * so there is no long-term error accumulation. - */ - lwz r5,CFG_TB_TO_XS(r9) /* load values */ - mulhwu r4,r4,r5 - li r3,0 - - beq+ 4f /* skip high part computation if 0 */ - mulhwu r3,r0,r5 - mullw r5,r0,r5 - addc r4,r4,r5 - addze r3,r3 -4: - /* At this point, we have seconds since the xtime stamp - * as a 32.32 fixed-point number in r3 and r4. - * Load & add the xtime stamp. - */ - lwz r5,STAMP_XTIME_SEC+LOPART(r9) - lwz r6,STAMP_SEC_FRAC(r9) - addc r4,r4,r6 - adde r3,r3,r5 - - /* We create a fake dependency on the result in r3/r4 - * and re-check the counter - */ - or r6,r4,r3 - xor r0,r6,r6 - add r9,r9,r0 - lwz r0,(CFG_TB_UPDATE_COUNT+LOPART)(r9) - cmplw cr0,r8,r0 /* check if updated */ - bne- 1b - - mulhwu r4,r4,r7 /* convert to micro or nanoseconds */ - - blr - .cfi_endproc diff --git a/arch/powerpc/kernel/vdso32/vdso32_wrapper.S b/arch/powerpc/kernel/vdso32_wrapper.S index 3f5ef035b0a9..10f92f265d51 100644 --- a/arch/powerpc/kernel/vdso32/vdso32_wrapper.S +++ b/arch/powerpc/kernel/vdso32_wrapper.S @@ -7,7 +7,7 @@ .globl vdso32_start, vdso32_end .balign PAGE_SIZE vdso32_start: - .incbin "arch/powerpc/kernel/vdso32/vdso32.so.dbg" + .incbin "arch/powerpc/kernel/vdso/vdso32.so.dbg" .balign PAGE_SIZE vdso32_end: diff --git a/arch/powerpc/kernel/vdso64/.gitignore b/arch/powerpc/kernel/vdso64/.gitignore deleted file mode 100644 index 77a0b423642c..000000000000 --- a/arch/powerpc/kernel/vdso64/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -vdso64.lds -vdso64.so.dbg diff --git a/arch/powerpc/kernel/vdso64/Makefile b/arch/powerpc/kernel/vdso64/Makefile deleted file mode 100644 index 32ebb3522ea1..000000000000 --- a/arch/powerpc/kernel/vdso64/Makefile +++ /dev/null @@ -1,47 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# List of files in the vdso, has to be asm only for now - -obj-vdso64 = sigtramp.o gettimeofday.o datapage.o cacheflush.o note.o getcpu.o - -# Build rules - -targets := $(obj-vdso64) vdso64.so vdso64.so.dbg -obj-vdso64 := $(addprefix $(obj)/, $(obj-vdso64)) - -GCOV_PROFILE := n -KCOV_INSTRUMENT := n -UBSAN_SANITIZE := n - -ccflags-y := -shared -fno-common -fno-builtin -nostdlib \ - -Wl,-soname=linux-vdso64.so.1 -Wl,--hash-style=both -asflags-y := -D__VDSO64__ -s - -obj-y += vdso64_wrapper.o -extra-y += vdso64.lds -CPPFLAGS_vdso64.lds += -P -C -U$(ARCH) - -# Force dependency (incbin is bad) -$(obj)/vdso64_wrapper.o : $(obj)/vdso64.so - -# link rule for the .so file, .lds has to be first -$(obj)/vdso64.so.dbg: $(src)/vdso64.lds $(obj-vdso64) FORCE - $(call if_changed,vdso64ld) - -# strip rule for the .so file -$(obj)/%.so: OBJCOPYFLAGS := -S -$(obj)/%.so: $(obj)/%.so.dbg FORCE - $(call if_changed,objcopy) - -# actual build commands -quiet_cmd_vdso64ld = VDSO64L $@ - cmd_vdso64ld = $(CC) $(c_flags) -o $@ -Wl,-T$(filter %.lds,$^) $(filter %.o,$^) - -# install commands for the unstripped file -quiet_cmd_vdso_install = INSTALL $@ - cmd_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/vdso/$@ - -vdso64.so: $(obj)/vdso64.so.dbg - @mkdir -p $(MODLIB)/vdso - $(call cmd,vdso_install) - -vdso_install: vdso64.so diff --git a/arch/powerpc/kernel/vdso64/datapage.S b/arch/powerpc/kernel/vdso64/datapage.S deleted file mode 100644 index dc84f5ae3802..000000000000 --- a/arch/powerpc/kernel/vdso64/datapage.S +++ /dev/null @@ -1,84 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Access to the shared data page by the vDSO & syscall map - * - * Copyright (C) 2004 Benjamin Herrenschmuidt (benh@kernel.crashing.org), IBM Corp. - */ - -#include <asm/processor.h> -#include <asm/ppc_asm.h> -#include <asm/asm-offsets.h> -#include <asm/unistd.h> -#include <asm/vdso.h> - - .text -.global __kernel_datapage_offset; -__kernel_datapage_offset: - .long 0 - -V_FUNCTION_BEGIN(__get_datapage) - .cfi_startproc - /* We don't want that exposed or overridable as we want other objects - * to be able to bl directly to here - */ - .protected __get_datapage - .hidden __get_datapage - - mflr r0 - .cfi_register lr,r0 - - bcl 20,31,data_page_branch -data_page_branch: - mflr r3 - mtlr r0 - addi r3, r3, __kernel_datapage_offset-data_page_branch - lwz r0,0(r3) - .cfi_restore lr - add r3,r0,r3 - blr - .cfi_endproc -V_FUNCTION_END(__get_datapage) - -/* - * void *__kernel_get_syscall_map(unsigned int *syscall_count) ; - * - * returns a pointer to the syscall map. the map is agnostic to the - * size of "long", unlike kernel bitops, it stores bits from top to - * bottom so that memory actually contains a linear bitmap - * check for syscall N by testing bit (0x80000000 >> (N & 0x1f)) of - * 32 bits int at N >> 5. - */ -V_FUNCTION_BEGIN(__kernel_get_syscall_map) - .cfi_startproc - mflr r12 - .cfi_register lr,r12 - mr r4,r3 - bl V_LOCAL_FUNC(__get_datapage) - mtlr r12 - addi r3,r3,CFG_SYSCALL_MAP64 - cmpldi cr0,r4,0 - crclr cr0*4+so - beqlr - li r0,NR_syscalls - stw r0,0(r4) - blr - .cfi_endproc -V_FUNCTION_END(__kernel_get_syscall_map) - - -/* - * void unsigned long __kernel_get_tbfreq(void); - * - * returns the timebase frequency in HZ - */ -V_FUNCTION_BEGIN(__kernel_get_tbfreq) - .cfi_startproc - mflr r12 - .cfi_register lr,r12 - bl V_LOCAL_FUNC(__get_datapage) - ld r3,CFG_TB_TICKS_PER_SEC(r3) - mtlr r12 - crclr cr0*4+so - blr - .cfi_endproc -V_FUNCTION_END(__kernel_get_tbfreq) diff --git a/arch/powerpc/kernel/vdso64/getcpu.S b/arch/powerpc/kernel/vdso64/getcpu.S deleted file mode 100644 index 12bbf236cdc4..000000000000 --- a/arch/powerpc/kernel/vdso64/getcpu.S +++ /dev/null @@ -1,33 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * - * Copyright (C) IBM Corporation, 2012 - * - * Author: Anton Blanchard <anton@au.ibm.com> - */ -#include <asm/ppc_asm.h> -#include <asm/vdso.h> - - .text -/* - * Exact prototype of getcpu - * - * int __kernel_getcpu(unsigned *cpu, unsigned *node); - * - */ -V_FUNCTION_BEGIN(__kernel_getcpu) - .cfi_startproc - mfspr r5,SPRN_SPRG_VDSO_READ - cmpdi cr0,r3,0 - cmpdi cr1,r4,0 - clrlwi r6,r5,16 - rlwinm r7,r5,16,31-15,31-0 - beq cr0,1f - stw r6,0(r3) -1: beq cr1,2f - stw r7,0(r4) -2: crclr cr0*4+so - li r3,0 /* always success */ - blr - .cfi_endproc -V_FUNCTION_END(__kernel_getcpu) diff --git a/arch/powerpc/kernel/vdso64/gettimeofday.S b/arch/powerpc/kernel/vdso64/gettimeofday.S deleted file mode 100644 index 1c9a04703250..000000000000 --- a/arch/powerpc/kernel/vdso64/gettimeofday.S +++ /dev/null @@ -1,289 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Userland implementation of gettimeofday() for 64 bits processes in a - * ppc64 kernel for use in the vDSO - * - * Copyright (C) 2004 Benjamin Herrenschmuidt (benh@kernel.crashing.org), - * IBM Corp. - */ -#include <asm/processor.h> -#include <asm/ppc_asm.h> -#include <asm/vdso.h> -#include <asm/asm-offsets.h> -#include <asm/unistd.h> - - .text -/* - * Exact prototype of gettimeofday - * - * int __kernel_gettimeofday(struct timeval *tv, struct timezone *tz); - * - */ -V_FUNCTION_BEGIN(__kernel_gettimeofday) - .cfi_startproc - mflr r12 - .cfi_register lr,r12 - - mr r11,r3 /* r11 holds tv */ - mr r10,r4 /* r10 holds tz */ - bl V_LOCAL_FUNC(__get_datapage) /* get data page */ - cmpldi r11,0 /* check if tv is NULL */ - beq 2f - lis r7,1000000@ha /* load up USEC_PER_SEC */ - addi r7,r7,1000000@l - bl V_LOCAL_FUNC(__do_get_tspec) /* get sec/us from tb & kernel */ - std r4,TVAL64_TV_SEC(r11) /* store sec in tv */ - std r5,TVAL64_TV_USEC(r11) /* store usec in tv */ -2: cmpldi r10,0 /* check if tz is NULL */ - beq 1f - lwz r4,CFG_TZ_MINUTEWEST(r3)/* fill tz */ - lwz r5,CFG_TZ_DSTTIME(r3) - stw r4,TZONE_TZ_MINWEST(r10) - stw r5,TZONE_TZ_DSTTIME(r10) -1: mtlr r12 - crclr cr0*4+so - li r3,0 /* always success */ - blr - .cfi_endproc -V_FUNCTION_END(__kernel_gettimeofday) - - -/* - * Exact prototype of clock_gettime() - * - * int __kernel_clock_gettime(clockid_t clock_id, struct timespec *tp); - * - */ -V_FUNCTION_BEGIN(__kernel_clock_gettime) - .cfi_startproc - /* Check for supported clock IDs */ - cmpwi cr0,r3,CLOCK_REALTIME - cmpwi cr1,r3,CLOCK_MONOTONIC - cror cr0*4+eq,cr0*4+eq,cr1*4+eq - - cmpwi cr5,r3,CLOCK_REALTIME_COARSE - cmpwi cr6,r3,CLOCK_MONOTONIC_COARSE - cror cr5*4+eq,cr5*4+eq,cr6*4+eq - - cror cr0*4+eq,cr0*4+eq,cr5*4+eq - bne cr0,99f - - mflr r12 /* r12 saves lr */ - .cfi_register lr,r12 - mr r11,r4 /* r11 saves tp */ - bl V_LOCAL_FUNC(__get_datapage) /* get data page */ - lis r7,NSEC_PER_SEC@h /* want nanoseconds */ - ori r7,r7,NSEC_PER_SEC@l - beq cr5,70f -50: bl V_LOCAL_FUNC(__do_get_tspec) /* get time from tb & kernel */ - bne cr1,80f /* if not monotonic, all done */ - - /* - * CLOCK_MONOTONIC - */ - - /* now we must fixup using wall to monotonic. We need to snapshot - * that value and do the counter trick again. Fortunately, we still - * have the counter value in r8 that was returned by __do_get_tspec. - * At this point, r4,r5 contain our sec/nsec values. - */ - - ld r6,WTOM_CLOCK_SEC(r3) - lwa r9,WTOM_CLOCK_NSEC(r3) - - /* We now have our result in r6,r9. We create a fake dependency - * on that result and re-check the counter - */ - or r0,r6,r9 - xor r0,r0,r0 - add r3,r3,r0 - ld r0,CFG_TB_UPDATE_COUNT(r3) - cmpld cr0,r0,r8 /* check if updated */ - bne- 50b - b 78f - - /* - * For coarse clocks we get data directly from the vdso data page, so - * we don't need to call __do_get_tspec, but we still need to do the - * counter trick. - */ -70: ld r8,CFG_TB_UPDATE_COUNT(r3) - andi. r0,r8,1 /* pending update ? loop */ - bne- 70b - add r3,r3,r0 /* r0 is already 0 */ - - /* - * CLOCK_REALTIME_COARSE, below values are needed for MONOTONIC_COARSE - * too - */ - ld r4,STAMP_XTIME_SEC(r3) - ld r5,STAMP_XTIME_NSEC(r3) - bne cr6,75f - - /* CLOCK_MONOTONIC_COARSE */ - ld r6,WTOM_CLOCK_SEC(r3) - lwa r9,WTOM_CLOCK_NSEC(r3) - - /* check if counter has updated */ - or r0,r6,r9 -75: or r0,r0,r4 - or r0,r0,r5 - xor r0,r0,r0 - add r3,r3,r0 - ld r0,CFG_TB_UPDATE_COUNT(r3) - cmpld cr0,r0,r8 /* check if updated */ - bne- 70b - - /* Counter has not updated, so continue calculating proper values for - * sec and nsec if monotonic coarse, or just return with the proper - * values for realtime. - */ - bne cr6,80f - - /* Add wall->monotonic offset and check for overflow or underflow */ -78: add r4,r4,r6 - add r5,r5,r9 - cmpd cr0,r5,r7 - cmpdi cr1,r5,0 - blt 79f - subf r5,r7,r5 - addi r4,r4,1 -79: bge cr1,80f - addi r4,r4,-1 - add r5,r5,r7 - -80: std r4,TSPC64_TV_SEC(r11) - std r5,TSPC64_TV_NSEC(r11) - - mtlr r12 - crclr cr0*4+so - li r3,0 - blr - - /* - * syscall fallback - */ -99: - li r0,__NR_clock_gettime - .cfi_restore lr - sc - blr - .cfi_endproc -V_FUNCTION_END(__kernel_clock_gettime) - - -/* - * Exact prototype of clock_getres() - * - * int __kernel_clock_getres(clockid_t clock_id, struct timespec *res); - * - */ -V_FUNCTION_BEGIN(__kernel_clock_getres) - .cfi_startproc - /* Check for supported clock IDs */ - cmpwi cr0,r3,CLOCK_REALTIME - cmpwi cr1,r3,CLOCK_MONOTONIC - cror cr0*4+eq,cr0*4+eq,cr1*4+eq - bne cr0,99f - - mflr r12 - .cfi_register lr,r12 - bl V_LOCAL_FUNC(__get_datapage) - lwz r5, CLOCK_HRTIMER_RES(r3) - mtlr r12 - li r3,0 - cmpldi cr0,r4,0 - crclr cr0*4+so - beqlr - std r3,TSPC64_TV_SEC(r4) - std r5,TSPC64_TV_NSEC(r4) - blr - - /* - * syscall fallback - */ -99: - li r0,__NR_clock_getres - sc - blr - .cfi_endproc -V_FUNCTION_END(__kernel_clock_getres) - -/* - * Exact prototype of time() - * - * time_t time(time *t); - * - */ -V_FUNCTION_BEGIN(__kernel_time) - .cfi_startproc - mflr r12 - .cfi_register lr,r12 - - mr r11,r3 /* r11 holds t */ - bl V_LOCAL_FUNC(__get_datapage) - - ld r4,STAMP_XTIME_SEC(r3) - - cmpldi r11,0 /* check if t is NULL */ - beq 2f - std r4,0(r11) /* store result at *t */ -2: mtlr r12 - crclr cr0*4+so - mr r3,r4 - blr - .cfi_endproc -V_FUNCTION_END(__kernel_time) - - -/* - * This is the core of clock_gettime() and gettimeofday(), - * it returns the current time in r4 (seconds) and r5. - * On entry, r7 gives the resolution of r5, either USEC_PER_SEC - * or NSEC_PER_SEC, giving r5 in microseconds or nanoseconds. - * It expects the datapage ptr in r3 and doesn't clobber it. - * It clobbers r0, r6 and r9. - * On return, r8 contains the counter value that can be reused. - * This clobbers cr0 but not any other cr field. - */ -V_FUNCTION_BEGIN(__do_get_tspec) - .cfi_startproc - /* check for update count & load values */ -1: ld r8,CFG_TB_UPDATE_COUNT(r3) - andi. r0,r8,1 /* pending update ? loop */ - bne- 1b - xor r0,r8,r8 /* create dependency */ - add r3,r3,r0 - - /* Get TB & offset it. We use the MFTB macro which will generate - * workaround code for Cell. - */ - MFTB(r6) - ld r9,CFG_TB_ORIG_STAMP(r3) - subf r6,r9,r6 - - /* Scale result */ - ld r5,CFG_TB_TO_XS(r3) - sldi r6,r6,12 /* compute time since stamp_xtime */ - mulhdu r6,r6,r5 /* in units of 2^-32 seconds */ - - /* Add stamp since epoch */ - ld r4,STAMP_XTIME_SEC(r3) - lwz r5,STAMP_SEC_FRAC(r3) - or r0,r4,r5 - or r0,r0,r6 - xor r0,r0,r0 - add r3,r3,r0 - ld r0,CFG_TB_UPDATE_COUNT(r3) - cmpld r0,r8 /* check if updated */ - bne- 1b /* reload if so */ - - /* convert to seconds & nanoseconds and add to stamp */ - add r6,r6,r5 /* add on fractional seconds of xtime */ - mulhwu r5,r6,r7 /* compute micro or nanoseconds and */ - srdi r6,r6,32 /* seconds since stamp_xtime */ - clrldi r5,r5,32 - add r4,r4,r6 - blr - .cfi_endproc -V_FUNCTION_END(__do_get_tspec) diff --git a/arch/powerpc/kernel/vdso64/note.S b/arch/powerpc/kernel/vdso64/note.S deleted file mode 100644 index dc2a509f7e8a..000000000000 --- a/arch/powerpc/kernel/vdso64/note.S +++ /dev/null @@ -1 +0,0 @@ -#include "../vdso32/note.S" diff --git a/arch/powerpc/kernel/vdso64/vdso64_wrapper.S b/arch/powerpc/kernel/vdso64_wrapper.S index 1d56d81fe3b3..839d1a61411d 100644 --- a/arch/powerpc/kernel/vdso64/vdso64_wrapper.S +++ b/arch/powerpc/kernel/vdso64_wrapper.S @@ -7,7 +7,7 @@ .globl vdso64_start, vdso64_end .balign PAGE_SIZE vdso64_start: - .incbin "arch/powerpc/kernel/vdso64/vdso64.so.dbg" + .incbin "arch/powerpc/kernel/vdso/vdso64.so.dbg" .balign PAGE_SIZE vdso64_end: diff --git a/arch/powerpc/kernel/vecemu.c b/arch/powerpc/kernel/vecemu.c index 4acd3fb2b38e..fd9432875ebc 100644 --- a/arch/powerpc/kernel/vecemu.c +++ b/arch/powerpc/kernel/vecemu.c @@ -10,6 +10,7 @@ #include <asm/processor.h> #include <asm/switch_to.h> #include <linux/uaccess.h> +#include <asm/inst.h> /* Functions in vector.S */ extern void vaddfp(vector128 *dst, vector128 *a, vector128 *b); @@ -260,21 +261,24 @@ static unsigned int rfin(unsigned int x) int emulate_altivec(struct pt_regs *regs) { - unsigned int instr, i; + ppc_inst_t instr; + unsigned int i, word; unsigned int va, vb, vc, vd; vector128 *vrs; - if (get_user(instr, (unsigned int __user *) regs->nip)) + if (get_user_instr(instr, (void __user *)regs->nip)) return -EFAULT; - if ((instr >> 26) != 4) + + word = ppc_inst_val(instr); + if (ppc_inst_primary_opcode(instr) != 4) return -EINVAL; /* not an altivec instruction */ - vd = (instr >> 21) & 0x1f; - va = (instr >> 16) & 0x1f; - vb = (instr >> 11) & 0x1f; - vc = (instr >> 6) & 0x1f; + vd = (word >> 21) & 0x1f; + va = (word >> 16) & 0x1f; + vb = (word >> 11) & 0x1f; + vc = (word >> 6) & 0x1f; vrs = current->thread.vr_state.vr; - switch (instr & 0x3f) { + switch (word & 0x3f) { case 10: switch (vc) { case 0: /* vaddfp */ diff --git a/arch/powerpc/kernel/vector.S b/arch/powerpc/kernel/vector.S index 8eb867dbad5f..80b3f6e476b6 100644 --- a/arch/powerpc/kernel/vector.S +++ b/arch/powerpc/kernel/vector.S @@ -1,4 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/export.h> +#include <linux/linkage.h> #include <asm/processor.h> #include <asm/ppc_asm.h> #include <asm/reg.h> @@ -7,7 +9,6 @@ #include <asm/thread_info.h> #include <asm/page.h> #include <asm/ptrace.h> -#include <asm/export.h> #include <asm/asm-compat.h> /* @@ -32,6 +33,7 @@ _GLOBAL(store_vr_state) mfvscr v0 li r4, VRSTATE_VSCR stvx v0, r4, r3 + lvx v0, 0, r3 blr EXPORT_SYMBOL(store_vr_state) @@ -47,6 +49,10 @@ EXPORT_SYMBOL(store_vr_state) */ _GLOBAL(load_up_altivec) mfmsr r5 /* grab the current MSR */ +#ifdef CONFIG_PPC_BOOK3S_64 + /* interrupt doesn't set MSR[RI] and HPT can fault on current access */ + ori r5,r5,MSR_RI +#endif oris r5,r5,MSR_VEC@h MTMSRD(r5) /* enable use of AltiVec now */ isync @@ -65,20 +71,21 @@ _GLOBAL(load_up_altivec) 1: /* enable use of VMX after return */ #ifdef CONFIG_PPC32 - mfspr r5,SPRN_SPRG_THREAD /* current task's THREAD (phys) */ + addi r5,r2,THREAD oris r9,r9,MSR_VEC@h #else ld r4,PACACURRENT(r13) addi r5,r4,THREAD /* Get THREAD */ oris r12,r12,MSR_VEC@h std r12,_MSR(r1) +#ifdef CONFIG_PPC_BOOK3S_64 + li r4,0 + stb r4,PACASRR_VALID(r13) +#endif #endif - /* Don't care if r4 overflows, this is desired behaviour */ - lbz r4,THREAD_LOAD_VEC(r5) - addi r4,r4,1 + li r4,1 stb r4,THREAD_LOAD_VEC(r5) addi r6,r5,THREAD_VRSTATE - li r4,1 li r10,VRSTATE_VSCR stw r4,THREAD_USED_VR(r5) lvx v0,r10,r6 @@ -86,6 +93,7 @@ _GLOBAL(load_up_altivec) REST_32VRS(0,r4,r6) /* restore registers and return */ blr +_ASM_NOKPROBE_SYMBOL(load_up_altivec) /* * save_altivec(tsk) @@ -102,6 +110,7 @@ _GLOBAL(save_altivec) mfvscr v0 li r4,VRSTATE_VSCR stvx v0,r4,r7 + lvx v0,0,r7 blr #ifdef CONFIG_VSX @@ -124,6 +133,12 @@ _GLOBAL(load_up_vsx) andis. r5,r12,MSR_VEC@h beql+ load_up_altivec /* skip if already loaded */ +#ifdef CONFIG_PPC_BOOK3S_64 + /* interrupt doesn't set MSR[RI] and HPT can fault on current access */ + li r5,MSR_RI + mtmsrd r5,1 +#endif + ld r4,PACACURRENT(r13) addi r4,r4,THREAD /* Get THREAD */ li r6,1 @@ -131,7 +146,9 @@ _GLOBAL(load_up_vsx) /* enable use of VSX after return */ oris r12,r12,MSR_VSX@h std r12,_MSR(r1) - b fast_exception_return + li r4,0 + stb r4,PACASRR_VALID(r13) + b fast_interrupt_return_srr #endif /* CONFIG_VSX */ @@ -141,8 +158,8 @@ _GLOBAL(load_up_vsx) * usage of floating-point registers. These routines must be called * with preempt disabled. */ -#ifdef CONFIG_PPC32 .data +#ifdef CONFIG_PPC32 fpzero: .long 0 fpone: @@ -155,24 +172,29 @@ fphalf: lfs fr,name@l(r11) #else - .section ".toc","aw" fpzero: - .tc FD_0_0[TC],0 + .quad 0 fpone: - .tc FD_3ff00000_0[TC],0x3ff0000000000000 /* 1.0 */ + .quad 0x3ff0000000000000 /* 1.0 */ fphalf: - .tc FD_3fe00000_0[TC],0x3fe0000000000000 /* 0.5 */ + .quad 0x3fe0000000000000 /* 0.5 */ -#define LDCONST(fr, name) \ - lfd fr,name@toc(r2) +#ifdef CONFIG_PPC_KERNEL_PCREL +#define LDCONST(fr, name) \ + pla r11,name@pcrel; \ + lfd fr,0(r11) +#else +#define LDCONST(fr, name) \ + addis r11,r2,name@toc@ha; \ + lfd fr,name@toc@l(r11) +#endif #endif - .text /* * Internal routine to enable floating point and set FPSCR to 0. * Don't call it from C; it doesn't use the normal calling convention. */ -fpenable: +SYM_FUNC_START_LOCAL(fpenable) #ifdef CONFIG_PPC32 stwu r1,-64(r1) #else @@ -189,6 +211,7 @@ fpenable: mffs fr31 MTFSF_L(fr1) blr +SYM_FUNC_END(fpenable) fpdisable: mtlr r12 diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index 8834220036a5..1c5970df3233 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -8,6 +8,23 @@ #define BSS_FIRST_SECTIONS *(.bss.prominit) #define EMITS_PT_NOTE #define RO_EXCEPTION_TABLE_ALIGN 0 +#define RUNTIME_DISCARD_EXIT + +#define SOFT_MASK_TABLE(align) \ + . = ALIGN(align); \ + __soft_mask_table : AT(ADDR(__soft_mask_table) - LOAD_OFFSET) { \ + __start___soft_mask_table = .; \ + KEEP(*(__soft_mask_table)) \ + __stop___soft_mask_table = .; \ + } + +#define RESTART_TABLE(align) \ + . = ALIGN(align); \ + __restart_table : AT(ADDR(__restart_table) - LOAD_OFFSET) { \ + __start___restart_table = .; \ + KEEP(*(__restart_table)) \ + __stop___restart_table = .; \ + } #include <asm/page.h> #include <asm-generic/vmlinux.lds.h> @@ -15,7 +32,10 @@ #include <asm/thread_info.h> #define STRICT_ALIGN_SIZE (1 << CONFIG_DATA_SHIFT) -#define ETEXT_ALIGN_SIZE (1 << CONFIG_ETEXT_SHIFT) + +#if STRICT_ALIGN_SIZE < PAGE_SIZE +#error "CONFIG_DATA_SHIFT must be >= PAGE_SHIFT" +#endif ENTRY(_stext) @@ -52,7 +72,7 @@ SECTIONS .head.text : AT(ADDR(.head.text) - LOAD_OFFSET) { #ifdef CONFIG_PPC64 KEEP(*(.head.text.first_256B)); -#ifdef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_BOOK3E_64 #else KEEP(*(.head.text.real_vectors)); *(.head.text.real_trampolines); @@ -86,12 +106,10 @@ SECTIONS ALIGN_FUNCTION(); #endif /* careful! __ftr_alt_* sections need to be close to .text */ - *(.text.hot TEXT_MAIN .text.fixup .text.unlikely .fixup __ftr_alt_* .ref.text); -#ifdef CONFIG_PPC64 + *(.text.hot .text.hot.* TEXT_MAIN .text.fixup .text.unlikely .text.unlikely.* .fixup __ftr_alt_* .ref.text); *(.tramp.ftrace.text); -#endif + NOINSTR_TEXT SCHED_TEXT - CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT @@ -104,26 +122,76 @@ SECTIONS * included with the main text sections, so put it by itself. */ *(.sfpr); + *(.text.asan.* .text.tsan.*) MEM_KEEP(init.text) MEM_KEEP(exit.text) + } :text + + . = ALIGN(PAGE_SIZE); + _etext = .; + PROVIDE32 (etext = .); + + /* Read-only data */ + RO_DATA(PAGE_SIZE) #ifdef CONFIG_PPC32 + .sdata2 : AT(ADDR(.sdata2) - LOAD_OFFSET) { + *(.sdata2) + } +#endif + + .data.rel.ro : AT(ADDR(.data.rel.ro) - LOAD_OFFSET) { + *(.data.rel.ro .data.rel.ro.*) + } + + .branch_lt : AT(ADDR(.branch_lt) - LOAD_OFFSET) { + *(.branch_lt) + } + +#ifdef CONFIG_PPC32 + .got1 : AT(ADDR(.got1) - LOAD_OFFSET) { *(.got1) + } + .got2 : AT(ADDR(.got2) - LOAD_OFFSET) { __got2_start = .; *(.got2) __got2_end = .; -#endif /* CONFIG_PPC32 */ + } + .got : AT(ADDR(.got) - LOAD_OFFSET) { + *(.got) + *(.got.plt) + } + .plt : AT(ADDR(.plt) - LOAD_OFFSET) { + /* XXX: is .plt (and .got.plt) required? */ + *(.plt) + } - } :text +#else /* CONFIG_PPC32 */ +#ifndef CONFIG_PPC_KERNEL_PCREL + .toc1 : AT(ADDR(.toc1) - LOAD_OFFSET) { + *(.toc1) + } +#endif - . = ALIGN(ETEXT_ALIGN_SIZE); - _etext = .; - PROVIDE32 (etext = .); + .got : AT(ADDR(.got) - LOAD_OFFSET) ALIGN(256) { +#ifdef CONFIG_PPC_KERNEL_PCREL + *(.got) +#else + *(.got .toc) +#endif + } - /* Read-only data */ - RO_DATA(PAGE_SIZE) + SOFT_MASK_TABLE(8) + RESTART_TABLE(8) + +#ifdef CONFIG_PPC64_ELF_ABI_V1 + .opd : AT(ADDR(.opd) - LOAD_OFFSET) { + __start_opd = .; + KEEP(*(.opd)) + __end_opd = .; + } +#endif -#ifdef CONFIG_PPC64 . = ALIGN(8); __stf_entry_barrier_fixup : AT(ADDR(__stf_entry_barrier_fixup) - LOAD_OFFSET) { __start___stf_entry_barrier_fixup = .; @@ -132,6 +200,27 @@ SECTIONS } . = ALIGN(8); + __uaccess_flush_fixup : AT(ADDR(__uaccess_flush_fixup) - LOAD_OFFSET) { + __start___uaccess_flush_fixup = .; + *(__uaccess_flush_fixup) + __stop___uaccess_flush_fixup = .; + } + + . = ALIGN(8); + __entry_flush_fixup : AT(ADDR(__entry_flush_fixup) - LOAD_OFFSET) { + __start___entry_flush_fixup = .; + *(__entry_flush_fixup) + __stop___entry_flush_fixup = .; + } + + . = ALIGN(8); + __scv_entry_flush_fixup : AT(ADDR(__scv_entry_flush_fixup) - LOAD_OFFSET) { + __start___scv_entry_flush_fixup = .; + *(__scv_entry_flush_fixup) + __stop___scv_entry_flush_fixup = .; + } + + . = ALIGN(8); __stf_exit_barrier_fixup : AT(ADDR(__stf_exit_barrier_fixup) - LOAD_OFFSET) { __start___stf_exit_barrier_fixup = .; *(__stf_exit_barrier_fixup) @@ -144,7 +233,7 @@ SECTIONS *(__rfi_flush_fixup) __stop___rfi_flush_fixup = .; } -#endif /* CONFIG_PPC64 */ +#endif /* CONFIG_PPC32 */ #ifdef CONFIG_PPC_BARRIER_NOSPEC . = ALIGN(8); @@ -155,7 +244,7 @@ SECTIONS } #endif /* CONFIG_PPC_BARRIER_NOSPEC */ -#ifdef CONFIG_PPC_FSL_BOOK3E +#ifdef CONFIG_PPC_E500 . = ALIGN(8); __spec_btb_flush_fixup : AT(ADDR(__spec_btb_flush_fixup) - LOAD_OFFSET) { __start__btb_flush_fixup = .; @@ -164,19 +253,28 @@ SECTIONS } #endif + /* + * Various code relies on __init_begin being at the strict RWX boundary. + */ + . = ALIGN(STRICT_ALIGN_SIZE); + __srwx_boundary = .; + __end_rodata = .; + __init_begin = .; + /* * Init sections discarded at runtime */ - . = ALIGN(STRICT_ALIGN_SIZE); - __init_begin = .; - . = ALIGN(PAGE_SIZE); .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { _sinittext = .; INIT_TEXT + + /* + *.init.text might be RO so we must ensure this section ends on + * a page boundary. + */ + . = ALIGN(PAGE_SIZE); _einittext = .; -#ifdef CONFIG_PPC64 *(.tramp.ftrace.init); -#endif } :text /* .exit.text is discarded at runtime, not link time, @@ -186,21 +284,9 @@ SECTIONS EXIT_TEXT } - .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { - INIT_DATA - } - - .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { - INIT_SETUP(16) - } - - .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { - INIT_CALLS - } + . = ALIGN(PAGE_SIZE); - .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { - CON_INITCALL - } + INIT_DATA_SECTION(16) . = ALIGN(8); __ftr_fixup : AT(ADDR(__ftr_fixup) - LOAD_OFFSET) { @@ -228,9 +314,6 @@ SECTIONS __stop___fw_ftr_fixup = .; } #endif - .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { - INIT_RAM_FS - } PERCPU_SECTION(L1_CACHE_BYTES) @@ -244,9 +327,7 @@ SECTIONS . = ALIGN(8); .dynsym : AT(ADDR(.dynsym) - LOAD_OFFSET) { -#ifdef CONFIG_PPC32 __dynamic_symtab = .; -#endif *(.dynsym) } .dynstr : AT(ADDR(.dynstr) - LOAD_OFFSET) { *(.dynstr) } @@ -256,6 +337,7 @@ SECTIONS *(.dynamic) } .hash : AT(ADDR(.hash) - LOAD_OFFSET) { *(.hash) } + .gnu.hash : AT(ADDR(.gnu.hash) - LOAD_OFFSET) { *(.gnu.hash) } .interp : AT(ADDR(.interp) - LOAD_OFFSET) { *(.interp) } .rela.dyn : AT(ADDR(.rela.dyn) - LOAD_OFFSET) { @@ -281,49 +363,16 @@ SECTIONS . = ALIGN(PAGE_SIZE); _sdata = .; -#ifdef CONFIG_PPC32 .data : AT(ADDR(.data) - LOAD_OFFSET) { DATA_DATA -#ifdef CONFIG_UBSAN - *(.data..Lubsan_data*) - *(.data..Lubsan_type*) -#endif *(.data.rel*) +#ifdef CONFIG_PPC32 *(SDATA_MAIN) - *(.sdata2) - *(.got.plt) *(.got) - *(.plt) - *(.branch_lt) - } -#else - .data : AT(ADDR(.data) - LOAD_OFFSET) { - DATA_DATA - *(.data.rel*) - *(.toc1) - *(.branch_lt) - } - - .opd : AT(ADDR(.opd) - LOAD_OFFSET) { - __start_opd = .; - KEEP(*(.opd)) - __end_opd = .; - } - - . = ALIGN(256); - .got : AT(ADDR(.got) - LOAD_OFFSET) { - __toc_start = .; -#ifndef CONFIG_RELOCATABLE - __prom_init_toc_start = .; - arch/powerpc/kernel/prom_init.o*(.toc .got) - __prom_init_toc_end = .; #endif - *(.got) - *(.toc) } -#endif /* The initial task and kernel stack */ - INIT_TASK_DATA_SECTION(THREAD_SIZE) + INIT_TASK_DATA_SECTION(THREAD_ALIGN) .data..page_aligned : AT(ADDR(.data..page_aligned) - LOAD_OFFSET) { PAGE_ALIGNED_DATA(PAGE_SIZE) @@ -358,16 +407,18 @@ SECTIONS _end = . ; PROVIDE32 (end = .); - STABS_DEBUG - DWARF_DEBUG + ELF_DETAILS DISCARDS /DISCARD/ : { *(*.EMB.apuinfo) - *(.glink .iplt .plt .rela* .comment) + *(.glink .iplt .plt) *(.gnu.version*) *(.gnu.attributes) *(.eh_frame) +#ifndef CONFIG_RELOCATABLE + *(.rela*) +#endif } } diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c index af3c15a1d41e..8c464a5d8246 100644 --- a/arch/powerpc/kernel/watchdog.c +++ b/arch/powerpc/kernel/watchdog.c @@ -24,9 +24,12 @@ #include <linux/kdebug.h> #include <linux/sched/debug.h> #include <linux/delay.h> +#include <linux/processor.h> #include <linux/smp.h> +#include <asm/interrupt.h> #include <asm/paca.h> +#include <asm/nmi.h> /* * The powerpc watchdog ensures that each CPU is able to service timers. @@ -53,7 +56,7 @@ * solved by also having a SMP watchdog where all CPUs check all other * CPUs heartbeat. * - * The SMP checker can detect lockups on other CPUs. A gobal "pending" + * The SMP checker can detect lockups on other CPUs. A global "pending" * cpumask is kept, containing all CPUs which enable the watchdog. Each * CPU clears their pending bit in their heartbeat timer. When the bitmask * becomes empty, the last CPU to clear its pending bit updates a global @@ -82,10 +85,41 @@ static DEFINE_PER_CPU(u64, wd_timer_tb); /* SMP checker bits */ static unsigned long __wd_smp_lock; +static unsigned long __wd_reporting; +static unsigned long __wd_nmi_output; static cpumask_t wd_smp_cpus_pending; static cpumask_t wd_smp_cpus_stuck; static u64 wd_smp_last_reset_tb; +#ifdef CONFIG_PPC_PSERIES +static u64 wd_timeout_pct; +#endif + +/* + * Try to take the exclusive watchdog action / NMI IPI / printing lock. + * wd_smp_lock must be held. If this fails, we should return and wait + * for the watchdog to kick in again (or another CPU to trigger it). + * + * Importantly, if hardlockup_panic is set, wd_try_report failure should + * not delay the panic, because whichever other CPU is reporting will + * call panic. + */ +static bool wd_try_report(void) +{ + if (__wd_reporting) + return false; + __wd_reporting = 1; + return true; +} + +/* End printing after successful wd_try_report. wd_smp_lock not required. */ +static void wd_end_reporting(void) +{ + smp_mb(); /* End printing "critical section" */ + WARN_ON_ONCE(__wd_reporting == 0); + WRITE_ONCE(__wd_reporting, 0); +} + static inline void wd_smp_lock(unsigned long *flags) { /* @@ -125,108 +159,182 @@ static void wd_lockup_ipi(struct pt_regs *regs) else dump_stack(); + /* + * __wd_nmi_output must be set after we printk from NMI context. + * + * printk from NMI context defers printing to the console to irq_work. + * If that NMI was taken in some code that is hard-locked, then irqs + * are disabled so irq_work will never fire. That can result in the + * hard lockup messages being delayed (indefinitely, until something + * else kicks the console drivers). + * + * Setting __wd_nmi_output will cause another CPU to notice and kick + * the console drivers for us. + * + * xchg is not needed here (it could be a smp_mb and store), but xchg + * gives the memory ordering and atomicity required. + */ + xchg(&__wd_nmi_output, 1); + /* Do not panic from here because that can recurse into NMI IPI layer */ } -static void set_cpumask_stuck(const struct cpumask *cpumask, u64 tb) +static bool set_cpu_stuck(int cpu) { - cpumask_or(&wd_smp_cpus_stuck, &wd_smp_cpus_stuck, cpumask); - cpumask_andnot(&wd_smp_cpus_pending, &wd_smp_cpus_pending, cpumask); + cpumask_set_cpu(cpu, &wd_smp_cpus_stuck); + cpumask_clear_cpu(cpu, &wd_smp_cpus_pending); + /* + * See wd_smp_clear_cpu_pending() + */ + smp_mb(); if (cpumask_empty(&wd_smp_cpus_pending)) { - wd_smp_last_reset_tb = tb; + wd_smp_last_reset_tb = get_tb(); cpumask_andnot(&wd_smp_cpus_pending, &wd_cpus_enabled, &wd_smp_cpus_stuck); + return true; } -} -static void set_cpu_stuck(int cpu, u64 tb) -{ - set_cpumask_stuck(cpumask_of(cpu), tb); + return false; } -static void watchdog_smp_panic(int cpu, u64 tb) +static void watchdog_smp_panic(int cpu) { + static cpumask_t wd_smp_cpus_ipi; // protected by reporting unsigned long flags; + u64 tb, last_reset; int c; wd_smp_lock(&flags); /* Double check some things under lock */ - if ((s64)(tb - wd_smp_last_reset_tb) < (s64)wd_smp_panic_timeout_tb) + tb = get_tb(); + last_reset = wd_smp_last_reset_tb; + if ((s64)(tb - last_reset) < (s64)wd_smp_panic_timeout_tb) goto out; if (cpumask_test_cpu(cpu, &wd_smp_cpus_pending)) goto out; - if (cpumask_weight(&wd_smp_cpus_pending) == 0) + if (!wd_try_report()) + goto out; + for_each_online_cpu(c) { + if (!cpumask_test_cpu(c, &wd_smp_cpus_pending)) + continue; + if (c == cpu) + continue; // should not happen + + __cpumask_set_cpu(c, &wd_smp_cpus_ipi); + if (set_cpu_stuck(c)) + break; + } + if (cpumask_empty(&wd_smp_cpus_ipi)) { + wd_end_reporting(); goto out; + } + wd_smp_unlock(&flags); pr_emerg("CPU %d detected hard LOCKUP on other CPUs %*pbl\n", - cpu, cpumask_pr_args(&wd_smp_cpus_pending)); + cpu, cpumask_pr_args(&wd_smp_cpus_ipi)); pr_emerg("CPU %d TB:%lld, last SMP heartbeat TB:%lld (%lldms ago)\n", - cpu, tb, wd_smp_last_reset_tb, - tb_to_ns(tb - wd_smp_last_reset_tb) / 1000000); + cpu, tb, last_reset, tb_to_ns(tb - last_reset) / 1000000); if (!sysctl_hardlockup_all_cpu_backtrace) { /* * Try to trigger the stuck CPUs, unless we are going to * get a backtrace on all of them anyway. */ - for_each_cpu(c, &wd_smp_cpus_pending) { - if (c == cpu) - continue; + for_each_cpu(c, &wd_smp_cpus_ipi) { smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000); + __cpumask_clear_cpu(c, &wd_smp_cpus_ipi); } + } else { + trigger_allbutcpu_cpu_backtrace(cpu); + cpumask_clear(&wd_smp_cpus_ipi); } - /* Take the stuck CPUs out of the watch group */ - set_cpumask_stuck(&wd_smp_cpus_pending, tb); - - wd_smp_unlock(&flags); - - printk_safe_flush(); - /* - * printk_safe_flush() seems to require another print - * before anything actually goes out to console. - */ - if (sysctl_hardlockup_all_cpu_backtrace) - trigger_allbutself_cpu_backtrace(); - if (hardlockup_panic) nmi_panic(NULL, "Hard LOCKUP"); + wd_end_reporting(); + return; out: wd_smp_unlock(&flags); } -static void wd_smp_clear_cpu_pending(int cpu, u64 tb) +static void wd_smp_clear_cpu_pending(int cpu) { if (!cpumask_test_cpu(cpu, &wd_smp_cpus_pending)) { if (unlikely(cpumask_test_cpu(cpu, &wd_smp_cpus_stuck))) { struct pt_regs *regs = get_irq_regs(); unsigned long flags; - wd_smp_lock(&flags); - pr_emerg("CPU %d became unstuck TB:%lld\n", - cpu, tb); + cpu, get_tb()); print_irqtrace_events(current); if (regs) show_regs(regs); else dump_stack(); + wd_smp_lock(&flags); cpumask_clear_cpu(cpu, &wd_smp_cpus_stuck); wd_smp_unlock(&flags); + } else { + /* + * The last CPU to clear pending should have reset the + * watchdog so we generally should not find it empty + * here if our CPU was clear. However it could happen + * due to a rare race with another CPU taking the + * last CPU out of the mask concurrently. + * + * We can't add a warning for it. But just in case + * there is a problem with the watchdog that is causing + * the mask to not be reset, try to kick it along here. + */ + if (unlikely(cpumask_empty(&wd_smp_cpus_pending))) + goto none_pending; } return; } + + /* + * All other updates to wd_smp_cpus_pending are performed under + * wd_smp_lock. All of them are atomic except the case where the + * mask becomes empty and is reset. This will not happen here because + * cpu was tested to be in the bitmap (above), and a CPU only clears + * its own bit. _Except_ in the case where another CPU has detected a + * hard lockup on our CPU and takes us out of the pending mask. So in + * normal operation there will be no race here, no problem. + * + * In the lockup case, this atomic clear-bit vs a store that refills + * other bits in the accessed word wll not be a problem. The bit clear + * is atomic so it will not cause the store to get lost, and the store + * will never set this bit so it will not overwrite the bit clear. The + * only way for a stuck CPU to return to the pending bitmap is to + * become unstuck itself. + */ cpumask_clear_cpu(cpu, &wd_smp_cpus_pending); + + /* + * Order the store to clear pending with the load(s) to check all + * words in the pending mask to check they are all empty. This orders + * with the same barrier on another CPU. This prevents two CPUs + * clearing the last 2 pending bits, but neither seeing the other's + * store when checking if the mask is empty, and missing an empty + * mask, which ends with a false positive. + */ + smp_mb(); if (cpumask_empty(&wd_smp_cpus_pending)) { unsigned long flags; +none_pending: + /* + * Double check under lock because more than one CPU could see + * a clear mask with the lockless check after clearing their + * pending bits. + */ wd_smp_lock(&flags); if (cpumask_empty(&wd_smp_cpus_pending)) { - wd_smp_last_reset_tb = tb; + wd_smp_last_reset_tb = get_tb(); cpumask_andnot(&wd_smp_cpus_pending, &wd_cpus_enabled, &wd_smp_cpus_stuck); @@ -241,33 +349,60 @@ static void watchdog_timer_interrupt(int cpu) per_cpu(wd_timer_tb, cpu) = tb; - wd_smp_clear_cpu_pending(cpu, tb); + wd_smp_clear_cpu_pending(cpu); if ((s64)(tb - wd_smp_last_reset_tb) >= (s64)wd_smp_panic_timeout_tb) - watchdog_smp_panic(cpu, tb); + watchdog_smp_panic(cpu); + + if (__wd_nmi_output && xchg(&__wd_nmi_output, 0)) { + /* + * Something has called printk from NMI context. It might be + * stuck, so this triggers a flush that will get that + * printk output to the console. + * + * See wd_lockup_ipi. + */ + printk_trigger_flush(); + } } -void soft_nmi_interrupt(struct pt_regs *regs) +DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt) { unsigned long flags; int cpu = raw_smp_processor_id(); u64 tb; - if (!cpumask_test_cpu(cpu, &wd_cpus_enabled)) - return; + /* should only arrive from kernel, with irqs disabled */ + WARN_ON_ONCE(!arch_irq_disabled_regs(regs)); - nmi_enter(); + if (!cpumask_test_cpu(cpu, &wd_cpus_enabled)) + return 0; __this_cpu_inc(irq_stat.soft_nmi_irqs); tb = get_tb(); if (tb - per_cpu(wd_timer_tb, cpu) >= wd_panic_timeout_tb) { + /* + * Taking wd_smp_lock here means it is a soft-NMI lock, which + * means we can't take any regular or irqsafe spin locks while + * holding this lock. This is why timers can't printk while + * holding the lock. + */ wd_smp_lock(&flags); if (cpumask_test_cpu(cpu, &wd_smp_cpus_stuck)) { wd_smp_unlock(&flags); - goto out; + return 0; } - set_cpu_stuck(cpu, tb); + if (!wd_try_report()) { + wd_smp_unlock(&flags); + /* Couldn't report, try again in 100ms */ + mtspr(SPRN_DEC, 100 * tb_ticks_per_usec * 1000); + return 0; + } + + set_cpu_stuck(cpu); + + wd_smp_unlock(&flags); pr_emerg("CPU %d self-detected hard LOCKUP @ %pS\n", cpu, (void *)regs->nip); @@ -278,26 +413,32 @@ void soft_nmi_interrupt(struct pt_regs *regs) print_irqtrace_events(current); show_regs(regs); - wd_smp_unlock(&flags); + xchg(&__wd_nmi_output, 1); // see wd_lockup_ipi if (sysctl_hardlockup_all_cpu_backtrace) - trigger_allbutself_cpu_backtrace(); + trigger_allbutcpu_cpu_backtrace(cpu); if (hardlockup_panic) nmi_panic(regs, "Hard LOCKUP"); + + wd_end_reporting(); } + /* + * We are okay to change DEC in soft_nmi_interrupt because the masked + * handler has marked a DEC as pending, so the timer interrupt will be + * replayed as soon as local irqs are enabled again. + */ if (wd_panic_timeout_tb < 0x7fffffff) mtspr(SPRN_DEC, wd_panic_timeout_tb); -out: - nmi_exit(); + return 0; } static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) { int cpu = smp_processor_id(); - if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) + if (!(watchdog_enabled & WATCHDOG_HARDLOCKUP_ENABLED)) return HRTIMER_NORESTART; if (!cpumask_test_cpu(cpu, &watchdog_cpumask)) @@ -314,11 +455,15 @@ void arch_touch_nmi_watchdog(void) { unsigned long ticks = tb_ticks_per_usec * wd_timer_period_ms * 1000; int cpu = smp_processor_id(); - u64 tb = get_tb(); + u64 tb; + if (!cpumask_test_cpu(cpu, &watchdog_cpumask)) + return; + + tb = get_tb(); if (tb - per_cpu(wd_timer_tb, cpu) >= ticks) { per_cpu(wd_timer_tb, cpu) = tb; - wd_smp_clear_cpu_pending(cpu, tb); + wd_smp_clear_cpu_pending(cpu); } } EXPORT_SYMBOL(arch_touch_nmi_watchdog); @@ -334,7 +479,7 @@ static void start_watchdog(void *arg) return; } - if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) + if (!(watchdog_enabled & WATCHDOG_HARDLOCKUP_ENABLED)) return; if (!cpumask_test_cpu(cpu, &watchdog_cpumask)) @@ -376,7 +521,7 @@ static void stop_watchdog(void *arg) cpumask_clear_cpu(cpu, &wd_cpus_enabled); wd_smp_unlock(&flags); - wd_smp_clear_cpu_pending(cpu, get_tb()); + wd_smp_clear_cpu_pending(cpu); } static int stop_watchdog_on_cpu(unsigned int cpu) @@ -386,7 +531,13 @@ static int stop_watchdog_on_cpu(unsigned int cpu) static void watchdog_calc_timeouts(void) { - wd_panic_timeout_tb = watchdog_thresh * ppc_tb_freq; + u64 threshold = watchdog_thresh; + +#ifdef CONFIG_PPC_PSERIES + threshold += (READ_ONCE(wd_timeout_pct) * threshold) / 100; +#endif + + wd_panic_timeout_tb = threshold * ppc_tb_freq; /* Have the SMP detector trigger a bit later */ wd_smp_panic_timeout_tb = wd_panic_timeout_tb * 3 / 2; @@ -395,7 +546,7 @@ static void watchdog_calc_timeouts(void) wd_timer_period_ms = watchdog_thresh * 1000 * 2 / 5; } -void watchdog_nmi_stop(void) +void watchdog_hardlockup_stop(void) { int cpu; @@ -403,7 +554,7 @@ void watchdog_nmi_stop(void) stop_watchdog_on_cpu(cpu); } -void watchdog_nmi_start(void) +void watchdog_hardlockup_start(void) { int cpu; @@ -415,7 +566,7 @@ void watchdog_nmi_start(void) /* * Invoked from core watchdog init. */ -int __init watchdog_nmi_probe(void) +int __init watchdog_hardlockup_probe(void) { int err; @@ -429,3 +580,12 @@ int __init watchdog_nmi_probe(void) } return 0; } + +#ifdef CONFIG_PPC_PSERIES +void watchdog_hardlockup_set_timeout_pct(u64 pct) +{ + pr_info("Set the NMI watchdog timeout factor to %llu%%\n", pct); + WRITE_ONCE(wd_timeout_pct, pct); + lockup_detector_reconfigure(); +} +#endif |