diff options
Diffstat (limited to 'arch/powerpc')
47 files changed, 2848 insertions, 404 deletions
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 951e18f5335b..7c93c7eb0e56 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -132,6 +132,7 @@ config PPC select IRQ_PER_CPU select GENERIC_IRQ_SHOW select GENERIC_IRQ_SHOW_LEVEL + select IRQ_FORCED_THREADING select HAVE_RCU_TABLE_FREE if SMP select HAVE_SYSCALL_TRACEPOINTS select HAVE_BPF_JIT if (PPC64 && NET) diff --git a/arch/powerpc/configs/chroma_defconfig b/arch/powerpc/configs/chroma_defconfig new file mode 100644 index 000000000000..acf7fb280464 --- /dev/null +++ b/arch/powerpc/configs/chroma_defconfig @@ -0,0 +1,307 @@ +CONFIG_PPC64=y +CONFIG_PPC_BOOK3E_64=y +# CONFIG_VIRT_CPU_ACCOUNTING is not set +CONFIG_SMP=y +CONFIG_NR_CPUS=256 +CONFIG_EXPERIMENTAL=y +CONFIG_SYSVIPC=y +CONFIG_POSIX_MQUEUE=y +CONFIG_BSD_PROCESS_ACCT=y +CONFIG_TASKSTATS=y +CONFIG_TASK_DELAY_ACCT=y +CONFIG_TASK_XACCT=y +CONFIG_TASK_IO_ACCOUNTING=y +CONFIG_AUDIT=y +CONFIG_AUDITSYSCALL=y +CONFIG_IKCONFIG=y +CONFIG_IKCONFIG_PROC=y +CONFIG_LOG_BUF_SHIFT=19 +CONFIG_CGROUPS=y +CONFIG_CGROUP_DEVICE=y +CONFIG_CPUSETS=y +CONFIG_CGROUP_CPUACCT=y +CONFIG_RESOURCE_COUNTERS=y +CONFIG_CGROUP_MEM_RES_CTLR=y +CONFIG_CGROUP_MEM_RES_CTLR_SWAP=y +CONFIG_NAMESPACES=y +CONFIG_RELAY=y +CONFIG_BLK_DEV_INITRD=y +CONFIG_INITRAMFS_SOURCE="" +CONFIG_RD_BZIP2=y +CONFIG_RD_LZMA=y +CONFIG_INITRAMFS_COMPRESSION_GZIP=y +CONFIG_KALLSYMS_ALL=y +CONFIG_EMBEDDED=y +CONFIG_PERF_COUNTERS=y +CONFIG_PROFILING=y +CONFIG_OPROFILE=y +CONFIG_KPROBES=y +CONFIG_MODULES=y +CONFIG_MODULE_FORCE_LOAD=y +CONFIG_MODULE_UNLOAD=y +CONFIG_MODULE_FORCE_UNLOAD=y +CONFIG_MODVERSIONS=y +CONFIG_MODULE_SRCVERSION_ALL=y +CONFIG_SCOM_DEBUGFS=y +CONFIG_PPC_A2_DD2=y +CONFIG_KVM_GUEST=y +CONFIG_NO_HZ=y +CONFIG_HIGH_RES_TIMERS=y +CONFIG_HZ_100=y +# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set +CONFIG_BINFMT_MISC=y +CONFIG_NUMA=y +# CONFIG_MIGRATION is not set +CONFIG_PPC_64K_PAGES=y +CONFIG_SCHED_SMT=y +CONFIG_CMDLINE_BOOL=y +CONFIG_CMDLINE="" +# CONFIG_SECCOMP is not set +CONFIG_PCIEPORTBUS=y +# CONFIG_PCIEASPM is not set +CONFIG_PCI_MSI=y +CONFIG_PACKET=y +CONFIG_UNIX=y +CONFIG_XFRM_USER=m +CONFIG_XFRM_SUB_POLICY=y +CONFIG_XFRM_STATISTICS=y +CONFIG_NET_KEY=m +CONFIG_NET_KEY_MIGRATE=y +CONFIG_INET=y +CONFIG_IP_MULTICAST=y +CONFIG_IP_ADVANCED_ROUTER=y +CONFIG_IP_ROUTE_MULTIPATH=y +CONFIG_IP_ROUTE_VERBOSE=y +CONFIG_IP_PNP=y +CONFIG_IP_PNP_DHCP=y +CONFIG_IP_PNP_BOOTP=y +CONFIG_NET_IPIP=y +CONFIG_IP_MROUTE=y +CONFIG_IP_PIMSM_V1=y +CONFIG_IP_PIMSM_V2=y +CONFIG_SYN_COOKIES=y +CONFIG_INET_AH=m +CONFIG_INET_ESP=m +CONFIG_INET_IPCOMP=m +CONFIG_IPV6=y +CONFIG_IPV6_PRIVACY=y +CONFIG_IPV6_ROUTER_PREF=y +CONFIG_IPV6_ROUTE_INFO=y +CONFIG_IPV6_OPTIMISTIC_DAD=y +CONFIG_INET6_AH=y +CONFIG_INET6_ESP=y +CONFIG_INET6_IPCOMP=y +CONFIG_IPV6_MIP6=y +CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION=y +CONFIG_IPV6_TUNNEL=y +CONFIG_IPV6_MULTIPLE_TABLES=y +CONFIG_IPV6_SUBTREES=y +CONFIG_IPV6_MROUTE=y +CONFIG_IPV6_PIMSM_V2=y +CONFIG_NETFILTER=y +CONFIG_NF_CONNTRACK=m +CONFIG_NF_CONNTRACK_EVENTS=y +CONFIG_NF_CT_PROTO_UDPLITE=m +CONFIG_NF_CONNTRACK_FTP=m +CONFIG_NF_CONNTRACK_IRC=m +CONFIG_NF_CONNTRACK_TFTP=m +CONFIG_NF_CT_NETLINK=m +CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m +CONFIG_NETFILTER_XT_TARGET_CONNMARK=m +CONFIG_NETFILTER_XT_TARGET_MARK=m +CONFIG_NETFILTER_XT_TARGET_NFLOG=m +CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m +CONFIG_NETFILTER_XT_TARGET_TCPMSS=m +CONFIG_NETFILTER_XT_MATCH_COMMENT=m +CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m +CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m +CONFIG_NETFILTER_XT_MATCH_CONNMARK=m +CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m +CONFIG_NETFILTER_XT_MATCH_DCCP=m +CONFIG_NETFILTER_XT_MATCH_DSCP=m +CONFIG_NETFILTER_XT_MATCH_ESP=m +CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m +CONFIG_NETFILTER_XT_MATCH_HELPER=m +CONFIG_NETFILTER_XT_MATCH_IPRANGE=m +CONFIG_NETFILTER_XT_MATCH_LENGTH=m +CONFIG_NETFILTER_XT_MATCH_LIMIT=m +CONFIG_NETFILTER_XT_MATCH_MAC=m +CONFIG_NETFILTER_XT_MATCH_MARK=m +CONFIG_NETFILTER_XT_MATCH_MULTIPORT=m +CONFIG_NETFILTER_XT_MATCH_OWNER=m +CONFIG_NETFILTER_XT_MATCH_POLICY=m +CONFIG_NETFILTER_XT_MATCH_PKTTYPE=m +CONFIG_NETFILTER_XT_MATCH_QUOTA=m +CONFIG_NETFILTER_XT_MATCH_RATEEST=m +CONFIG_NETFILTER_XT_MATCH_REALM=m +CONFIG_NETFILTER_XT_MATCH_RECENT=m +CONFIG_NETFILTER_XT_MATCH_SCTP=m +CONFIG_NETFILTER_XT_MATCH_STATE=m +CONFIG_NETFILTER_XT_MATCH_STATISTIC=m +CONFIG_NETFILTER_XT_MATCH_STRING=m +CONFIG_NETFILTER_XT_MATCH_TCPMSS=m +CONFIG_NETFILTER_XT_MATCH_TIME=m +CONFIG_NETFILTER_XT_MATCH_U32=m +CONFIG_NF_CONNTRACK_IPV4=m +CONFIG_IP_NF_QUEUE=m +CONFIG_IP_NF_IPTABLES=m +CONFIG_IP_NF_MATCH_AH=m +CONFIG_IP_NF_MATCH_ECN=m +CONFIG_IP_NF_MATCH_TTL=m +CONFIG_IP_NF_FILTER=m +CONFIG_IP_NF_TARGET_REJECT=m +CONFIG_IP_NF_TARGET_LOG=m +CONFIG_IP_NF_TARGET_ULOG=m +CONFIG_NF_NAT=m +CONFIG_IP_NF_TARGET_MASQUERADE=m +CONFIG_IP_NF_TARGET_NETMAP=m +CONFIG_IP_NF_TARGET_REDIRECT=m +CONFIG_NET_TCPPROBE=y +# CONFIG_WIRELESS is not set +CONFIG_NET_9P=y +CONFIG_NET_9P_DEBUG=y +CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" +CONFIG_DEVTMPFS=y +CONFIG_MTD=y +CONFIG_MTD_CHAR=y +CONFIG_MTD_BLOCK=y +CONFIG_MTD_CFI=y +CONFIG_MTD_CFI_ADV_OPTIONS=y +CONFIG_MTD_CFI_LE_BYTE_SWAP=y +CONFIG_MTD_CFI_INTELEXT=y +CONFIG_MTD_CFI_AMDSTD=y +CONFIG_MTD_CFI_STAA=y +CONFIG_MTD_PHYSMAP_OF=y +CONFIG_PROC_DEVICETREE=y +CONFIG_BLK_DEV_LOOP=y +CONFIG_BLK_DEV_CRYPTOLOOP=y +CONFIG_BLK_DEV_NBD=m +CONFIG_BLK_DEV_RAM=y +CONFIG_BLK_DEV_RAM_SIZE=65536 +CONFIG_CDROM_PKTCDVD=y +CONFIG_MISC_DEVICES=y +CONFIG_BLK_DEV_SD=y +CONFIG_BLK_DEV_SR=y +CONFIG_BLK_DEV_SR_VENDOR=y +CONFIG_CHR_DEV_SG=y +CONFIG_SCSI_MULTI_LUN=y +CONFIG_SCSI_CONSTANTS=y +CONFIG_SCSI_SPI_ATTRS=y +CONFIG_SCSI_FC_ATTRS=y +CONFIG_SCSI_ISCSI_ATTRS=m +CONFIG_SCSI_SAS_ATTRS=m +CONFIG_SCSI_SRP_ATTRS=y +CONFIG_ATA=y +CONFIG_SATA_AHCI=y +CONFIG_SATA_SIL24=y +CONFIG_SATA_MV=y +CONFIG_SATA_SIL=y +CONFIG_PATA_CMD64X=y +CONFIG_PATA_MARVELL=y +CONFIG_PATA_SIL680=y +CONFIG_MD=y +CONFIG_BLK_DEV_MD=y +CONFIG_MD_LINEAR=y +CONFIG_BLK_DEV_DM=y +CONFIG_DM_CRYPT=y +CONFIG_DM_SNAPSHOT=y +CONFIG_DM_MIRROR=y +CONFIG_DM_ZERO=y +CONFIG_DM_UEVENT=y +CONFIG_NETDEVICES=y +CONFIG_TUN=y +CONFIG_E1000E=y +CONFIG_TIGON3=y +# CONFIG_WLAN is not set +# CONFIG_INPUT is not set +# CONFIG_SERIO is not set +# CONFIG_VT is not set +CONFIG_DEVPTS_MULTIPLE_INSTANCES=y +CONFIG_SERIAL_8250=y +CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_HW_RANDOM=y +CONFIG_RAW_DRIVER=y +CONFIG_MAX_RAW_DEVS=1024 +# CONFIG_HWMON is not set +# CONFIG_VGA_ARB is not set +# CONFIG_USB_SUPPORT is not set +CONFIG_EDAC=y +CONFIG_EDAC_MM_EDAC=y +CONFIG_RTC_CLASS=y +CONFIG_RTC_DRV_DS1511=y +CONFIG_RTC_DRV_DS1553=y +CONFIG_EXT2_FS=y +CONFIG_EXT2_FS_XATTR=y +CONFIG_EXT2_FS_POSIX_ACL=y +CONFIG_EXT2_FS_SECURITY=y +CONFIG_EXT2_FS_XIP=y +CONFIG_EXT3_FS=y +# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set +CONFIG_EXT3_FS_POSIX_ACL=y +CONFIG_EXT3_FS_SECURITY=y +CONFIG_EXT4_FS=y +# CONFIG_DNOTIFY is not set +CONFIG_FUSE_FS=y +CONFIG_ISO9660_FS=y +CONFIG_JOLIET=y +CONFIG_ZISOFS=y +CONFIG_UDF_FS=m +CONFIG_MSDOS_FS=y +CONFIG_VFAT_FS=y +CONFIG_PROC_KCORE=y +CONFIG_TMPFS=y +CONFIG_TMPFS_POSIX_ACL=y +CONFIG_CONFIGFS_FS=m +CONFIG_CRAMFS=y +CONFIG_NFS_FS=y +CONFIG_NFS_V3=y +CONFIG_NFS_V3_ACL=y +CONFIG_NFS_V4=y +CONFIG_NFS_V4_1=y +CONFIG_ROOT_NFS=y +CONFIG_CIFS=y +CONFIG_CIFS_WEAK_PW_HASH=y +CONFIG_CIFS_XATTR=y +CONFIG_CIFS_POSIX=y +CONFIG_NLS_CODEPAGE_437=y +CONFIG_NLS_ASCII=y +CONFIG_NLS_ISO8859_1=y +CONFIG_CRC_CCITT=m +CONFIG_CRC_T10DIF=y +CONFIG_LIBCRC32C=m +CONFIG_PRINTK_TIME=y +CONFIG_MAGIC_SYSRQ=y +CONFIG_STRIP_ASM_SYMS=y +CONFIG_DETECT_HUNG_TASK=y +# CONFIG_SCHED_DEBUG is not set +CONFIG_DEBUG_INFO=y +CONFIG_FTRACE_SYSCALLS=y +CONFIG_PPC_EMULATED_STATS=y +CONFIG_XMON=y +CONFIG_XMON_DEFAULT=y +CONFIG_VIRQ_DEBUG=y +CONFIG_PPC_EARLY_DEBUG=y +CONFIG_KEYS_DEBUG_PROC_KEYS=y +CONFIG_CRYPTO_NULL=m +CONFIG_CRYPTO_TEST=m +CONFIG_CRYPTO_CCM=m +CONFIG_CRYPTO_GCM=m +CONFIG_CRYPTO_PCBC=m +CONFIG_CRYPTO_MICHAEL_MIC=m +CONFIG_CRYPTO_SHA256=m +CONFIG_CRYPTO_SHA512=m +CONFIG_CRYPTO_TGR192=m +CONFIG_CRYPTO_WP512=m +CONFIG_CRYPTO_AES=m +CONFIG_CRYPTO_ANUBIS=m +CONFIG_CRYPTO_BLOWFISH=m +CONFIG_CRYPTO_CAST5=m +CONFIG_CRYPTO_CAST6=m +CONFIG_CRYPTO_KHAZAD=m +CONFIG_CRYPTO_SALSA20=m +CONFIG_CRYPTO_SERPENT=m +CONFIG_CRYPTO_TEA=m +CONFIG_CRYPTO_TWOFISH=m +CONFIG_CRYPTO_LZO=m +# CONFIG_CRYPTO_ANSI_CPRNG is not set +CONFIG_VIRTUALIZATION=y diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h index e30442c539ce..7044233124ba 100644 --- a/arch/powerpc/include/asm/cputable.h +++ b/arch/powerpc/include/asm/cputable.h @@ -437,7 +437,7 @@ extern const char *powerpc_base_platform; #define CPU_FTRS_COMPATIBLE (CPU_FTR_USE_TB | CPU_FTR_PPCAS_ARCH_V2) #define CPU_FTRS_A2 (CPU_FTR_USE_TB | CPU_FTR_SMT | CPU_FTR_DBELL | \ - CPU_FTR_NOEXECUTE | CPU_FTR_NODSISRALIGN) + CPU_FTR_NOEXECUTE | CPU_FTR_NODSISRALIGN | CPU_FTR_ICSWX) #ifdef __powerpc64__ #ifdef CONFIG_PPC_BOOK3E diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index b540d6fcedd6..bf37931d1ad6 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -213,6 +213,9 @@ struct machdep_calls { * allow assignment/enabling of the device. */ int (*pcibios_enable_device_hook)(struct pci_dev *); + /* Called after scan and before resource survey */ + void (*pcibios_fixup_phb)(struct pci_controller *hose); + /* Called to shutdown machine specific hardware not already controlled * by other drivers. */ diff --git a/arch/powerpc/include/asm/mmu-book3e.h b/arch/powerpc/include/asm/mmu-book3e.h index 0260ea5ec3c2..50210b9b0147 100644 --- a/arch/powerpc/include/asm/mmu-book3e.h +++ b/arch/powerpc/include/asm/mmu-book3e.h @@ -214,6 +214,10 @@ typedef struct { unsigned int id; unsigned int active; unsigned long vdso_base; +#ifdef CONFIG_PPC_ICSWX + struct spinlock *cop_lockp; /* guard cop related stuff */ + unsigned long acop; /* mask of enabled coprocessor types */ +#endif /* CONFIG_PPC_ICSWX */ #ifdef CONFIG_PPC_MM_SLICES u64 low_slices_psize; /* SLB page size encodings */ u64 high_slices_psize; /* 4 bits per slice for now */ diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index 56b879ab3a40..882b6aa6c857 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -153,8 +153,8 @@ struct pci_dn { int pci_ext_config_space; /* for pci devices */ -#ifdef CONFIG_EEH struct pci_dev *pcidev; /* back-pointer to the pci device */ +#ifdef CONFIG_EEH int class_code; /* pci device class */ int eeh_mode; /* See eeh.h for possible EEH_MODEs */ int eeh_config_addr; @@ -164,6 +164,10 @@ struct pci_dn { int eeh_false_positives; /* # times this device reported #ff's */ u32 config_space[16]; /* saved PCI config space */ #endif +#define IODA_INVALID_PE (-1) +#ifdef CONFIG_PPC_POWERNV + int pe_number; +#endif }; /* Get the pointer to a device_node's pci_dn */ diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 88b0bd925a8b..2e0e4110f7ae 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -170,6 +170,9 @@ extern int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addre #define pgprot_cached_wthru(prot) (__pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) | \ _PAGE_COHERENT | _PAGE_WRITETHRU)) +#define pgprot_cached_noncoherent(prot) \ + (__pgprot(pgprot_val(prot) & ~_PAGE_CACHE_CTL)) + #define pgprot_writecombine pgprot_noncached_wc struct file; diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h index 03c48e819c8e..500fe1dc43e6 100644 --- a/arch/powerpc/include/asm/reg_booke.h +++ b/arch/powerpc/include/asm/reg_booke.h @@ -187,6 +187,10 @@ #define SPRN_CSRR1 SPRN_SRR3 /* Critical Save and Restore Register 1 */ #endif +#ifdef CONFIG_PPC_ICSWX +#define SPRN_HACOP 0x15F /* Hypervisor Available Coprocessor Register */ +#endif + /* Bit definitions for CCR1. */ #define CCR1_DPC 0x00000100 /* Disable L1 I-Cache/D-Cache parity checking */ #define CCR1_TCS 0x00000080 /* Timer Clock Select */ diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h index 41f69ae79d4e..1646b76bd3d2 100644 --- a/arch/powerpc/include/asm/rtas.h +++ b/arch/powerpc/include/asm/rtas.h @@ -245,6 +245,12 @@ extern int early_init_dt_scan_rtas(unsigned long node, extern void pSeries_log_error(char *buf, unsigned int err_type, int fatal); +#ifdef CONFIG_PPC_RTAS_DAEMON +extern void rtas_cancel_event_scan(void); +#else +static inline void rtas_cancel_event_scan(void) { } +#endif + /* Error types logged. */ #define ERR_FLAG_ALREADY_LOGGED 0x0 #define ERR_FLAG_BOOT 0x1 /* log was pulled from NVRAM on boot */ diff --git a/arch/powerpc/include/asm/tce.h b/arch/powerpc/include/asm/tce.h index f663634cccc9..743f36b38e5d 100644 --- a/arch/powerpc/include/asm/tce.h +++ b/arch/powerpc/include/asm/tce.h @@ -26,10 +26,14 @@ /* * Tces come in two formats, one for the virtual bus and a different - * format for PCI + * format for PCI. PCI TCEs can have hardware or software maintianed + * coherency. */ -#define TCE_VB 0 -#define TCE_PCI 1 +#define TCE_VB 0 +#define TCE_PCI 1 +#define TCE_PCI_SWINV_CREATE 2 +#define TCE_PCI_SWINV_FREE 4 +#define TCE_PCI_SWINV_PAIR 8 /* TCE page size is 4096 bytes (1 << 12) */ diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h index fe6f7c2c9c68..7eb10fb96cd0 100644 --- a/arch/powerpc/include/asm/time.h +++ b/arch/powerpc/include/asm/time.h @@ -219,5 +219,7 @@ DECLARE_PER_CPU(struct cpu_usage, cpu_usage_array); extern void secondary_cpu_time_init(void); extern void iSeries_time_init_early(void); +DECLARE_PER_CPU(u64, decrementers_next_tb); + #endif /* __KERNEL__ */ #endif /* __POWERPC_TIME_H */ diff --git a/arch/powerpc/kernel/cpu_setup_a2.S b/arch/powerpc/kernel/cpu_setup_a2.S index 7f818feaa7a5..ebc62f42a237 100644 --- a/arch/powerpc/kernel/cpu_setup_a2.S +++ b/arch/powerpc/kernel/cpu_setup_a2.S @@ -41,11 +41,16 @@ _GLOBAL(__setup_cpu_a2) * core local but doing it always won't hurt */ -#ifdef CONFIG_PPC_WSP_COPRO +#ifdef CONFIG_PPC_ICSWX /* Make sure ACOP starts out as zero */ li r3,0 mtspr SPRN_ACOP,r3 + /* Skip the following if we are in Guest mode */ + mfmsr r3 + andis. r0,r3,MSR_GS@h + bne _icswx_skip_guest + /* Enable icswx instruction */ mfspr r3,SPRN_A2_CCR2 ori r3,r3,A2_CCR2_ENABLE_ICSWX @@ -54,7 +59,8 @@ _GLOBAL(__setup_cpu_a2) /* Unmask all CTs in HACOP */ li r3,-1 mtspr SPRN_HACOP,r3 -#endif /* CONFIG_PPC_WSP_COPRO */ +_icswx_skip_guest: +#endif /* CONFIG_PPC_ICSWX */ /* Enable doorbell */ mfspr r3,SPRN_A2_CCR2 diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 5c3c46948d94..2ff4f5e59620 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -115,6 +115,15 @@ static inline notrace void set_soft_enabled(unsigned long enable) : : "r" (enable), "i" (offsetof(struct paca_struct, soft_enabled))); } +static inline notrace void decrementer_check_overflow(void) +{ + u64 now = get_tb_or_rtc(); + u64 *next_tb = &__get_cpu_var(decrementers_next_tb); + + if (now >= *next_tb) + set_dec(1); +} + notrace void arch_local_irq_restore(unsigned long en) { /* @@ -164,16 +173,13 @@ notrace void arch_local_irq_restore(unsigned long en) */ local_paca->hard_enabled = en; -#ifndef CONFIG_BOOKE - /* On server, re-trigger the decrementer if it went negative since - * some processors only trigger on edge transitions of the sign bit. - * - * BookE has a level sensitive decrementer (latches in TSR) so we - * don't need that + /* + * Trigger the decrementer if we have a pending event. Some processors + * only trigger on edge transitions of the sign bit. We might also + * have disabled interrupts long enough that the decrementer wrapped + * to positive. */ - if ((int)mfspr(SPRN_DEC) < 0) - mtspr(SPRN_DEC, 1); -#endif /* CONFIG_BOOKE */ + decrementer_check_overflow(); /* * Force the delivery of pending soft-disabled interrupts on PS3. diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index 069aa75e7161..9bffc028f45a 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -921,18 +921,22 @@ static void __devinit pcibios_fixup_resources(struct pci_dev *dev) struct resource *res = dev->resource + i; if (!res->flags) continue; - /* On platforms that have PCI_PROBE_ONLY set, we don't - * consider 0 as an unassigned BAR value. It's technically - * a valid value, but linux doesn't like it... so when we can - * re-assign things, we do so, but if we can't, we keep it - * around and hope for the best... + + /* If we're going to re-assign everything, we mark all resources + * as unset (and 0-base them). In addition, we mark BARs starting + * at 0 as unset as well, except if PCI_PROBE_ONLY is also set + * since in that case, we don't want to re-assign anything */ - if (res->start == 0 && !pci_has_flag(PCI_PROBE_ONLY)) { - pr_debug("PCI:%s Resource %d %016llx-%016llx [%x] is unassigned\n", - pci_name(dev), i, - (unsigned long long)res->start, - (unsigned long long)res->end, - (unsigned int)res->flags); + if (pci_has_flag(PCI_REASSIGN_ALL_RSRC) || + (res->start == 0 && !pci_has_flag(PCI_PROBE_ONLY))) { + /* Only print message if not re-assigning */ + if (!pci_has_flag(PCI_REASSIGN_ALL_RSRC)) + pr_debug("PCI:%s Resource %d %016llx-%016llx [%x] " + "is unassigned\n", + pci_name(dev), i, + (unsigned long long)res->start, + (unsigned long long)res->end, + (unsigned int)res->flags); res->end -= res->start; res->start = 0; res->flags |= IORESOURCE_UNSET; @@ -1042,6 +1046,16 @@ static void __devinit pcibios_fixup_bridge(struct pci_bus *bus) if (i >= 3 && bus->self->transparent) continue; + /* If we are going to re-assign everything, mark the resource + * as unset and move it down to 0 + */ + if (pci_has_flag(PCI_REASSIGN_ALL_RSRC)) { + res->flags |= IORESOURCE_UNSET; + res->end -= res->start; + res->start = 0; + continue; + } + pr_debug("PCI:%s Bus rsrc %d %016llx-%016llx [%x] fixup...\n", pci_name(dev), i, (unsigned long long)res->start,\ @@ -1262,18 +1276,15 @@ void pcibios_allocate_bus_resources(struct pci_bus *bus) pci_bus_for_each_resource(bus, res, i) { if (!res || !res->flags || res->start > res->end || res->parent) continue; + + /* If the resource was left unset at this point, we clear it */ + if (res->flags & IORESOURCE_UNSET) + goto clear_resource; + if (bus->parent == NULL) pr = (res->flags & IORESOURCE_IO) ? &ioport_resource : &iomem_resource; else { - /* Don't bother with non-root busses when - * re-assigning all resources. We clear the - * resource flags as if they were colliding - * and as such ensure proper re-allocation - * later. - */ - if (pci_has_flag(PCI_REASSIGN_ALL_RSRC)) - goto clear_resource; pr = pci_find_parent_resource(bus->self, res); if (pr == res) { /* this happens when the generic PCI @@ -1304,9 +1315,9 @@ void pcibios_allocate_bus_resources(struct pci_bus *bus) if (reparent_resources(pr, res) == 0) continue; } - printk(KERN_WARNING "PCI: Cannot allocate resource region " - "%d of PCI bridge %d, will remap\n", i, bus->number); -clear_resource: + pr_warning("PCI: Cannot allocate resource region " + "%d of PCI bridge %d, will remap\n", i, bus->number); + clear_resource: res->start = res->end = 0; res->flags = 0; } @@ -1451,16 +1462,11 @@ void __init pcibios_resource_survey(void) { struct pci_bus *b; - /* Allocate and assign resources. If we re-assign everything, then - * we skip the allocate phase - */ + /* Allocate and assign resources */ list_for_each_entry(b, &pci_root_buses, node) pcibios_allocate_bus_resources(b); - - if (!pci_has_flag(PCI_REASSIGN_ALL_RSRC)) { - pcibios_allocate_resources(0); - pcibios_allocate_resources(1); - } + pcibios_allocate_resources(0); + pcibios_allocate_resources(1); /* Before we start assigning unassigned resource, we try to reserve * the low IO area and the VGA memory area if they intersect the @@ -1732,6 +1738,12 @@ void __devinit pcibios_scan_phb(struct pci_controller *hose) if (mode == PCI_PROBE_NORMAL) hose->last_busno = bus->subordinate = pci_scan_child_bus(bus); + /* Platform gets a chance to do some global fixups before + * we proceed to resource allocation + */ + if (ppc_md.pcibios_fixup_phb) + ppc_md.pcibios_fixup_phb(hose); + /* Configure PCI Express settings */ if (bus && !pci_has_flag(PCI_PROBE_ONLY)) { struct pci_bus *child; diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c index 4e69deb89b37..dd9e4a04bf79 100644 --- a/arch/powerpc/kernel/pci_dn.c +++ b/arch/powerpc/kernel/pci_dn.c @@ -50,6 +50,9 @@ void * __devinit update_dn_pci_info(struct device_node *dn, void *data) dn->data = pdn; pdn->node = dn; pdn->phb = phb; +#ifdef CONFIG_PPC_POWERNV + pdn->pe_number = IODA_INVALID_PE; +#endif regs = of_get_property(dn, "reg", NULL); if (regs) { /* First register entry is addr (00BBSS00) */ diff --git a/arch/powerpc/kernel/rtas_flash.c b/arch/powerpc/kernel/rtas_flash.c index e037c7494fd8..4174b4b23246 100644 --- a/arch/powerpc/kernel/rtas_flash.c +++ b/arch/powerpc/kernel/rtas_flash.c @@ -568,6 +568,12 @@ static void rtas_flash_firmware(int reboot_type) } /* + * Just before starting the firmware flash, cancel the event scan work + * to avoid any soft lockup issues. + */ + rtas_cancel_event_scan(); + + /* * NOTE: the "first" block must be under 4GB, so we create * an entry with no data blocks in the reserved buffer in * the kernel data segment. diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c index 481ef064c8f1..1045ff49cc6d 100644 --- a/arch/powerpc/kernel/rtasd.c +++ b/arch/powerpc/kernel/rtasd.c @@ -472,6 +472,13 @@ static void start_event_scan(void) &event_scan_work, event_scan_delay); } +/* Cancel the rtas event scan work */ +void rtas_cancel_event_scan(void) +{ + cancel_delayed_work_sync(&event_scan_work); +} +EXPORT_SYMBOL_GPL(rtas_cancel_event_scan); + static int __init rtas_init(void) { struct proc_dir_entry *entry; diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 6df70907d60a..f0abe92f63f2 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -187,7 +187,8 @@ int smp_request_message_ipi(int virq, int msg) return 1; } #endif - err = request_irq(virq, smp_ipi_action[msg], IRQF_PERCPU, + err = request_irq(virq, smp_ipi_action[msg], + IRQF_PERCPU | IRQF_NO_THREAD, smp_ipi_name[msg], 0); WARN(err < 0, "unable to request_irq %d for %s (rc %d)\n", virq, smp_ipi_name[msg], err); diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c index ce035c1905f0..f579be552094 100644 --- a/arch/powerpc/kernel/sysfs.c +++ b/arch/powerpc/kernel/sysfs.c @@ -177,11 +177,13 @@ SYSFS_PMCSETUP(mmcra, SPRN_MMCRA); SYSFS_PMCSETUP(purr, SPRN_PURR); SYSFS_PMCSETUP(spurr, SPRN_SPURR); SYSFS_PMCSETUP(dscr, SPRN_DSCR); +SYSFS_PMCSETUP(pir, SPRN_PIR); static SYSDEV_ATTR(mmcra, 0600, show_mmcra, store_mmcra); static SYSDEV_ATTR(spurr, 0600, show_spurr, NULL); static SYSDEV_ATTR(dscr, 0600, show_dscr, store_dscr); static SYSDEV_ATTR(purr, 0600, show_purr, store_purr); +static SYSDEV_ATTR(pir, 0400, show_pir, NULL); unsigned long dscr_default = 0; EXPORT_SYMBOL(dscr_default); @@ -392,6 +394,9 @@ static void __cpuinit register_cpu_online(unsigned int cpu) if (cpu_has_feature(CPU_FTR_DSCR)) sysdev_create_file(s, &attr_dscr); + + if (cpu_has_feature(CPU_FTR_PPCAS_ARCH_V2)) + sysdev_create_file(s, &attr_pir); #endif /* CONFIG_PPC64 */ cacheinfo_cpu_online(cpu); @@ -462,6 +467,9 @@ static void unregister_cpu_online(unsigned int cpu) if (cpu_has_feature(CPU_FTR_DSCR)) sysdev_remove_file(s, &attr_dscr); + + if (cpu_has_feature(CPU_FTR_PPCAS_ARCH_V2)) + sysdev_remove_file(s, &attr_pir); #endif /* CONFIG_PPC64 */ cacheinfo_cpu_offline(cpu); diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 522bb1dfc353..9754743db8b9 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -86,8 +86,6 @@ static struct clocksource clocksource_rtc = { .rating = 400, .flags = CLOCK_SOURCE_IS_CONTINUOUS, .mask = CLOCKSOURCE_MASK(64), - .shift = 22, - .mult = 0, /* To be filled in */ .read = rtc_read, }; @@ -97,8 +95,6 @@ static struct clocksource clocksource_timebase = { .rating = 400, .flags = CLOCK_SOURCE_IS_CONTINUOUS, .mask = CLOCKSOURCE_MASK(64), - .shift = 22, - .mult = 0, /* To be filled in */ .read = timebase_read, }; @@ -110,22 +106,16 @@ static void decrementer_set_mode(enum clock_event_mode mode, struct clock_event_device *dev); static struct clock_event_device decrementer_clockevent = { - .name = "decrementer", - .rating = 200, - .shift = 0, /* To be filled in */ - .mult = 0, /* To be filled in */ - .irq = 0, - .set_next_event = decrementer_set_next_event, - .set_mode = decrementer_set_mode, - .features = CLOCK_EVT_FEAT_ONESHOT, + .name = "decrementer", + .rating = 200, + .irq = 0, + .set_next_event = decrementer_set_next_event, + .set_mode = decrementer_set_mode, + .features = CLOCK_EVT_FEAT_ONESHOT, }; -struct decrementer_clock { - struct clock_event_device event; - u64 next_tb; -}; - -static DEFINE_PER_CPU(struct decrementer_clock, decrementers); +DEFINE_PER_CPU(u64, decrementers_next_tb); +static DEFINE_PER_CPU(struct clock_event_device, decrementers); #ifdef CONFIG_PPC_ISERIES static unsigned long __initdata iSeries_recal_titan; @@ -441,7 +431,7 @@ EXPORT_SYMBOL(profile_pc); /* * This function recalibrates the timebase based on the 49-bit time-of-day * value in the Titan chip. The Titan is much more accurate than the value - * returned by the service processor for the timebase frequency. + * returned by the service processor for the timebase frequency. */ static int __init iSeries_tb_recal(void) @@ -576,9 +566,8 @@ void arch_irq_work_raise(void) void timer_interrupt(struct pt_regs * regs) { struct pt_regs *old_regs; - struct decrementer_clock *decrementer = &__get_cpu_var(decrementers); - struct clock_event_device *evt = &decrementer->event; - u64 now; + u64 *next_tb = &__get_cpu_var(decrementers_next_tb); + struct clock_event_device *evt = &__get_cpu_var(decrementers); /* Ensure a positive value is written to the decrementer, or else * some CPUs will continue to take decrementer exceptions. @@ -613,16 +602,9 @@ void timer_interrupt(struct pt_regs * regs) get_lppaca()->int_dword.fields.decr_int = 0; #endif - now = get_tb_or_rtc(); - if (now >= decrementer->next_tb) { - decrementer->next_tb = ~(u64)0; - if (evt->event_handler) - evt->event_handler(evt); - } else { - now = decrementer->next_tb - now; - if (now <= DECREMENTER_MAX) - set_dec((int)now); - } + *next_tb = ~(u64)0; + if (evt->event_handler) + evt->event_handler(evt); #ifdef CONFIG_PPC_ISERIES if (firmware_has_feature(FW_FEATURE_ISERIES) && hvlpevent_is_pending()) @@ -650,9 +632,9 @@ static void generic_suspend_disable_irqs(void) * with suspending. */ - set_dec(0x7fffffff); + set_dec(DECREMENTER_MAX); local_irq_disable(); - set_dec(0x7fffffff); + set_dec(DECREMENTER_MAX); } static void generic_suspend_enable_irqs(void) @@ -824,9 +806,8 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, ++vdso_data->tb_update_count; smp_mb(); - /* XXX this assumes clock->shift == 22 */ - /* 4611686018 ~= 2^(20+64-22) / 1e9 */ - new_tb_to_xs = (u64) mult * 4611686018ULL; + /* 19342813113834067 ~= 2^(20+64) / 1e9 */ + new_tb_to_xs = (u64) mult * (19342813113834067ULL >> clock->shift); new_stamp_xsec = (u64) wall_time->tv_nsec * XSEC_PER_SEC; do_div(new_stamp_xsec, 1000000000); new_stamp_xsec += (u64) wall_time->tv_sec * XSEC_PER_SEC; @@ -877,9 +858,7 @@ static void __init clocksource_init(void) else clock = &clocksource_timebase; - clock->mult = clocksource_hz2mult(tb_ticks_per_sec, clock->shift); - - if (clocksource_register(clock)) { + if (clocksource_register_hz(clock, tb_ticks_per_sec)) { printk(KERN_ERR "clocksource: %s is already registered\n", clock->name); return; @@ -892,7 +871,7 @@ static void __init clocksource_init(void) static int decrementer_set_next_event(unsigned long evt, struct clock_event_device *dev) { - __get_cpu_var(decrementers).next_tb = get_tb_or_rtc() + evt; + __get_cpu_var(decrementers_next_tb) = get_tb_or_rtc() + evt; set_dec(evt); return 0; } @@ -904,34 +883,9 @@ static void decrementer_set_mode(enum clock_event_mode mode, decrementer_set_next_event(DECREMENTER_MAX, dev); } -static inline uint64_t div_sc64(unsigned long ticks, unsigned long nsec, - int shift) -{ - uint64_t tmp = ((uint64_t)ticks) << shift; - - do_div(tmp, nsec); - return tmp; -} - -static void __init setup_clockevent_multiplier(unsigned long hz) -{ - u64 mult, shift = 32; - - while (1) { - mult = div_sc64(hz, NSEC_PER_SEC, shift); - if (mult && (mult >> 32UL) == 0UL) - break; - - shift--; - } - - decrementer_clockevent.shift = shift; - decrementer_clockevent.mult = mult; -} - static void register_decrementer_clockevent(int cpu) { - struct clock_event_device *dec = &per_cpu(decrementers, cpu).event; + struct clock_event_device *dec = &per_cpu(decrementers, cpu); *dec = decrementer_clockevent; dec->cpumask = cpumask_of(cpu); @@ -946,7 +900,8 @@ static void __init init_decrementer_clockevent(void) { int cpu = smp_processor_id(); - setup_clockevent_multiplier(ppc_tb_freq); + clockevents_calc_mult_shift(&decrementer_clockevent, ppc_tb_freq, 4); + decrementer_clockevent.max_delta_ns = clockevent_delta2ns(DECREMENTER_MAX, &decrementer_clockevent); decrementer_clockevent.min_delta_ns = @@ -1014,10 +969,10 @@ void __init time_init(void) boot_tb = get_tb_or_rtc(); /* If platform provided a timezone (pmac), we correct the time */ - if (timezone_offset) { + if (timezone_offset) { sys_tz.tz_minuteswest = -timezone_offset / 60; sys_tz.tz_dsttime = 0; - } + } vdso_data->tb_update_count = 0; vdso_data->tb_ticks_per_sec = tb_ticks_per_sec; diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index 991ee813d2a8..3787b61f7d20 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -21,6 +21,8 @@ obj-$(CONFIG_PPC_STD_MMU_32) += ppc_mmu_32.o obj-$(CONFIG_PPC_STD_MMU) += hash_low_$(CONFIG_WORD_SIZE).o \ tlb_hash$(CONFIG_WORD_SIZE).o \ mmu_context_hash$(CONFIG_WORD_SIZE).o +obj-$(CONFIG_PPC_ICSWX) += icswx.o +obj-$(CONFIG_PPC_ICSWX_PID) += icswx_pid.o obj-$(CONFIG_40x) += 40x_mmu.o obj-$(CONFIG_44x) += 44x_mmu.o obj-$(CONFIG_PPC_FSL_BOOK3E) += fsl_booke_mmu.o diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 5efe8c96d37f..2f0d1b032a89 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -44,6 +44,8 @@ #include <asm/siginfo.h> #include <mm/mmu_decl.h> +#include "icswx.h" + #ifdef CONFIG_KPROBES static inline int notify_page_fault(struct pt_regs *regs) { @@ -143,6 +145,21 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, is_write = error_code & ESR_DST; #endif /* CONFIG_4xx || CONFIG_BOOKE */ +#ifdef CONFIG_PPC_ICSWX + /* + * we need to do this early because this "data storage + * interrupt" does not update the DAR/DEAR so we don't want to + * look at it + */ + if (error_code & ICSWX_DSI_UCT) { + int ret; + + ret = acop_handle_fault(regs, address, error_code); + if (ret) + return ret; + } +#endif + if (notify_page_fault(regs)) return 0; diff --git a/arch/powerpc/mm/icswx.c b/arch/powerpc/mm/icswx.c new file mode 100644 index 000000000000..5d9a59eaad93 --- /dev/null +++ b/arch/powerpc/mm/icswx.c @@ -0,0 +1,273 @@ +/* + * ICSWX and ACOP Management + * + * Copyright (C) 2011 Anton Blanchard, IBM Corp. <anton@samba.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/spinlock.h> +#include <linux/module.h> +#include <linux/uaccess.h> + +#include "icswx.h" + +/* + * The processor and its L2 cache cause the icswx instruction to + * generate a COP_REQ transaction on PowerBus. The transaction has no + * address, and the processor does not perform an MMU access to + * authenticate the transaction. The command portion of the PowerBus + * COP_REQ transaction includes the LPAR_ID (LPID) and the coprocessor + * Process ID (PID), which the coprocessor compares to the authorized + * LPID and PID held in the coprocessor, to determine if the process + * is authorized to generate the transaction. The data of the COP_REQ + * transaction is 128-byte or less in size and is placed in cacheable + * memory on a 128-byte cache line boundary. + * + * The task to use a coprocessor should use use_cop() to mark the use + * of the Coprocessor Type (CT) and context switching. On a server + * class processor, the PID register is used only for coprocessor + * management + * and so a coprocessor PID is allocated before + * executing icswx + * instruction. Drop_cop() is used to free the + * coprocessor PID. + * + * Example: + * Host Fabric Interface (HFI) is a PowerPC network coprocessor. + * Each HFI have multiple windows. Each HFI window serves as a + * network device sending to and receiving from HFI network. + * HFI immediate send function uses icswx instruction. The immediate + * send function allows small (single cache-line) packets be sent + * without using the regular HFI send FIFO and doorbell, which are + * much slower than immediate send. + * + * For each task intending to use HFI immediate send, the HFI driver + * calls use_cop() to obtain a coprocessor PID for the task. + * The HFI driver then allocate a free HFI window and save the + * coprocessor PID to the HFI window to allow the task to use the + * HFI window. + * + * The HFI driver repeatedly creates immediate send packets and + * issues icswx instruction to send data through the HFI window. + * The HFI compares the coprocessor PID in the CPU PID register + * to the PID held in the HFI window to determine if the transaction + * is allowed. + * + * When the task to release the HFI window, the HFI driver calls + * drop_cop() to release the coprocessor PID. + */ + +void switch_cop(struct mm_struct *next) +{ +#ifdef CONFIG_ICSWX_PID + mtspr(SPRN_PID, next->context.cop_pid); +#endif + mtspr(SPRN_ACOP, next->context.acop); +} + +/** + * Start using a coprocessor. + * @acop: mask of coprocessor to be used. + * @mm: The mm the coprocessor to associate with. Most likely current mm. + * + * Return a positive PID if successful. Negative errno otherwise. + * The returned PID will be fed to the coprocessor to determine if an + * icswx transaction is authenticated. + */ +int use_cop(unsigned long acop, struct mm_struct *mm) +{ + int ret; + + if (!cpu_has_feature(CPU_FTR_ICSWX)) + return -ENODEV; + + if (!mm || !acop) + return -EINVAL; + + /* The page_table_lock ensures mm_users won't change under us */ + spin_lock(&mm->page_table_lock); + spin_lock(mm->context.cop_lockp); + + ret = get_cop_pid(mm); + if (ret < 0) + goto out; + + /* update acop */ + mm->context.acop |= acop; + + sync_cop(mm); + + /* + * If this is a threaded process then there might be other threads + * running. We need to send an IPI to force them to pick up any + * change in PID and ACOP. + */ + if (atomic_read(&mm->mm_users) > 1) + smp_call_function(sync_cop, mm, 1); + +out: + spin_unlock(mm->context.cop_lockp); + spin_unlock(&mm->page_table_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(use_cop); + +/** + * Stop using a coprocessor. + * @acop: mask of coprocessor to be stopped. + * @mm: The mm the coprocessor associated with. + */ +void drop_cop(unsigned long acop, struct mm_struct *mm) +{ + int free_pid; + + if (!cpu_has_feature(CPU_FTR_ICSWX)) + return; + + if (WARN_ON_ONCE(!mm)) + return; + + /* The page_table_lock ensures mm_users won't change under us */ + spin_lock(&mm->page_table_lock); + spin_lock(mm->context.cop_lockp); + + mm->context.acop &= ~acop; + + free_pid = disable_cop_pid(mm); + sync_cop(mm); + + /* + * If this is a threaded process then there might be other threads + * running. We need to send an IPI to force them to pick up any + * change in PID and ACOP. + */ + if (atomic_read(&mm->mm_users) > 1) + smp_call_function(sync_cop, mm, 1); + + if (free_pid != COP_PID_NONE) + free_cop_pid(free_pid); + + spin_unlock(mm->context.cop_lockp); + spin_unlock(&mm->page_table_lock); +} +EXPORT_SYMBOL_GPL(drop_cop); + +static int acop_use_cop(int ct) +{ + /* todo */ + return -1; +} + +/* + * Get the instruction word at the NIP + */ +static u32 acop_get_inst(struct pt_regs *regs) +{ + u32 inst; + u32 __user *p; + + p = (u32 __user *)regs->nip; + if (!access_ok(VERIFY_READ, p, sizeof(*p))) + return 0; + + if (__get_user(inst, p)) + return 0; + + return inst; +} + +/** + * @regs: regsiters at time of interrupt + * @address: storage address + * @error_code: Fault code, usually the DSISR or ESR depending on + * processor type + * + * Return 0 if we are able to resolve the data storage fault that + * results from a CT miss in the ACOP register. + */ +int acop_handle_fault(struct pt_regs *regs, unsigned long address, + unsigned long error_code) +{ + int ct; + u32 inst = 0; + + if (!cpu_has_feature(CPU_FTR_ICSWX)) { + pr_info("No coprocessors available"); + _exception(SIGILL, regs, ILL_ILLOPN, address); + } + + if (!user_mode(regs)) { + /* this could happen if the HV denies the + * kernel access, for now we just die */ + die("ICSWX from kernel failed", regs, SIGSEGV); + } + + /* Some implementations leave us a hint for the CT */ + ct = ICSWX_GET_CT_HINT(error_code); + if (ct < 0) { + /* we have to peek at the instruction word to figure out CT */ + u32 ccw; + u32 rs; + + inst = acop_get_inst(regs); + if (inst == 0) + return -1; + + rs = (inst >> (31 - 10)) & 0x1f; + ccw = regs->gpr[rs]; + ct = (ccw >> 16) & 0x3f; + } + + if (!acop_use_cop(ct)) + return 0; + + /* at this point the CT is unknown to the system */ + pr_warn("%s[%d]: Coprocessor %d is unavailable", + current->comm, current->pid, ct); + + /* get inst if we don't already have it */ + if (inst == 0) { + inst = acop_get_inst(regs); + if (inst == 0) + return -1; + } + + /* Check if the instruction is the "record form" */ + if (inst & 1) { + /* + * the instruction is "record" form so we can reject + * using CR0 + */ + regs->ccr &= ~(0xful << 28); + regs->ccr |= ICSWX_RC_NOT_FOUND << 28; + + /* Move on to the next instruction */ + regs->nip += 4; + } else { + /* + * There is no architected mechanism to report a bad + * CT so we could either SIGILL or report nothing. + * Since the non-record version should only bu used + * for "hints" or "don't care" we should probably do + * nothing. However, I could see how some people + * might want an SIGILL so it here if you want it. + */ +#ifdef CONFIG_PPC_ICSWX_USE_SIGILL + _exception(SIGILL, regs, ILL_ILLOPN, address); +#else + regs->nip += 4; +#endif + } + + return 0; +} +EXPORT_SYMBOL_GPL(acop_handle_fault); diff --git a/arch/powerpc/mm/icswx.h b/arch/powerpc/mm/icswx.h new file mode 100644 index 000000000000..42176bd0884c --- /dev/null +++ b/arch/powerpc/mm/icswx.h @@ -0,0 +1,62 @@ +#ifndef _ARCH_POWERPC_MM_ICSWX_H_ +#define _ARCH_POWERPC_MM_ICSWX_H_ + +/* + * ICSWX and ACOP Management + * + * Copyright (C) 2011 Anton Blanchard, IBM Corp. <anton@samba.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <asm/mmu_context.h> + +/* also used to denote that PIDs are not used */ +#define COP_PID_NONE 0 + +static inline void sync_cop(void *arg) +{ + struct mm_struct *mm = arg; + + if (mm == current->active_mm) + switch_cop(current->active_mm); +} + +#ifdef CONFIG_PPC_ICSWX_PID +extern int get_cop_pid(struct mm_struct *mm); +extern int disable_cop_pid(struct mm_struct *mm); +extern void free_cop_pid(int free_pid); +#else +#define get_cop_pid(m) (COP_PID_NONE) +#define disable_cop_pid(m) (COP_PID_NONE) +#define free_cop_pid(p) +#endif + +/* + * These are implementation bits for architected registers. If this + * ever becomes architecture the should be moved to reg.h et. al. + */ +/* UCT is the same bit for Server and Embedded */ +#define ICSWX_DSI_UCT 0x00004000 /* Unavailable Coprocessor Type */ + +#ifdef CONFIG_PPC_BOOK3E +/* Embedded implementation gives us no hints as to what the CT is */ +#define ICSWX_GET_CT_HINT(x) (-1) +#else +/* Server implementation contains the CT value in the DSISR */ +#define ICSWX_DSISR_CTMASK 0x00003f00 +#define ICSWX_GET_CT_HINT(x) (((x) & ICSWX_DSISR_CTMASK) >> 8) +#endif + +#define ICSWX_RC_STARTED 0x8 /* The request has been started */ +#define ICSWX_RC_NOT_IDLE 0x4 /* No coprocessor found idle */ +#define ICSWX_RC_NOT_FOUND 0x2 /* No coprocessor found */ +#define ICSWX_RC_UNDEFINED 0x1 /* Reserved */ + +extern int acop_handle_fault(struct pt_regs *regs, unsigned long address, + unsigned long error_code); +#endif /* !_ARCH_POWERPC_MM_ICSWX_H_ */ diff --git a/arch/powerpc/mm/icswx_pid.c b/arch/powerpc/mm/icswx_pid.c new file mode 100644 index 000000000000..91e30eb7d054 --- /dev/null +++ b/arch/powerpc/mm/icswx_pid.c @@ -0,0 +1,87 @@ +/* + * ICSWX and ACOP/PID Management + * + * Copyright (C) 2011 Anton Blanchard, IBM Corp. <anton@samba.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/spinlock.h> +#include <linux/idr.h> +#include <linux/module.h> +#include "icswx.h" + +#define COP_PID_MIN (COP_PID_NONE + 1) +#define COP_PID_MAX (0xFFFF) + +static DEFINE_SPINLOCK(mmu_context_acop_lock); +static DEFINE_IDA(cop_ida); + +static int new_cop_pid(struct ida *ida, int min_id, int max_id, + spinlock_t *lock) +{ + int index; + int err; + +again: + if (!ida_pre_get(ida, GFP_KERNEL)) + return -ENOMEM; + + spin_lock(lock); + err = ida_get_new_above(ida, min_id, &index); + spin_unlock(lock); + + if (err == -EAGAIN) + goto again; + else if (err) + return err; + + if (index > max_id) { + spin_lock(lock); + ida_remove(ida, index); + spin_unlock(lock); + return -ENOMEM; + } + + return index; +} + +int get_cop_pid(struct mm_struct *mm) +{ + int pid; + + if (mm->context.cop_pid == COP_PID_NONE) { + pid = new_cop_pid(&cop_ida, COP_PID_MIN, COP_PID_MAX, + &mmu_context_acop_lock); + if (pid >= 0) + mm->context.cop_pid = pid; + } + return mm->context.cop_pid; +} + +int disable_cop_pid(struct mm_struct *mm) +{ + int free_pid = COP_PID_NONE; + + if ((!mm->context.acop) && (mm->context.cop_pid != COP_PID_NONE)) { + free_pid = mm->context.cop_pid; + mm->context.cop_pid = COP_PID_NONE; + } + return free_pid; +} + +void free_cop_pid(int free_pid) +{ + spin_lock(&mmu_context_acop_lock); + ida_remove(&cop_ida, free_pid); + spin_unlock(&mmu_context_acop_lock); +} diff --git a/arch/powerpc/mm/mmu_context_hash64.c b/arch/powerpc/mm/mmu_context_hash64.c index ca988a3d5fb2..40677aa0190e 100644 --- a/arch/powerpc/mm/mmu_context_hash64.c +++ b/arch/powerpc/mm/mmu_context_hash64.c @@ -24,200 +24,7 @@ #include <asm/mmu_context.h> -#ifdef CONFIG_PPC_ICSWX -/* - * The processor and its L2 cache cause the icswx instruction to - * generate a COP_REQ transaction on PowerBus. The transaction has - * no address, and the processor does not perform an MMU access - * to authenticate the transaction. The command portion of the - * PowerBus COP_REQ transaction includes the LPAR_ID (LPID) and - * the coprocessor Process ID (PID), which the coprocessor compares - * to the authorized LPID and PID held in the coprocessor, to determine - * if the process is authorized to generate the transaction. - * The data of the COP_REQ transaction is 128-byte or less and is - * placed in cacheable memory on a 128-byte cache line boundary. - * - * The task to use a coprocessor should use use_cop() to allocate - * a coprocessor PID before executing icswx instruction. use_cop() - * also enables the coprocessor context switching. Drop_cop() is - * used to free the coprocessor PID. - * - * Example: - * Host Fabric Interface (HFI) is a PowerPC network coprocessor. - * Each HFI have multiple windows. Each HFI window serves as a - * network device sending to and receiving from HFI network. - * HFI immediate send function uses icswx instruction. The immediate - * send function allows small (single cache-line) packets be sent - * without using the regular HFI send FIFO and doorbell, which are - * much slower than immediate send. - * - * For each task intending to use HFI immediate send, the HFI driver - * calls use_cop() to obtain a coprocessor PID for the task. - * The HFI driver then allocate a free HFI window and save the - * coprocessor PID to the HFI window to allow the task to use the - * HFI window. - * - * The HFI driver repeatedly creates immediate send packets and - * issues icswx instruction to send data through the HFI window. - * The HFI compares the coprocessor PID in the CPU PID register - * to the PID held in the HFI window to determine if the transaction - * is allowed. - * - * When the task to release the HFI window, the HFI driver calls - * drop_cop() to release the coprocessor PID. - */ - -#define COP_PID_NONE 0 -#define COP_PID_MIN (COP_PID_NONE + 1) -#define COP_PID_MAX (0xFFFF) - -static DEFINE_SPINLOCK(mmu_context_acop_lock); -static DEFINE_IDA(cop_ida); - -void switch_cop(struct mm_struct *next) -{ - mtspr(SPRN_PID, next->context.cop_pid); - mtspr(SPRN_ACOP, next->context.acop); -} - -static int new_cop_pid(struct ida *ida, int min_id, int max_id, - spinlock_t *lock) -{ - int index; - int err; - -again: - if (!ida_pre_get(ida, GFP_KERNEL)) - return -ENOMEM; - - spin_lock(lock); - err = ida_get_new_above(ida, min_id, &index); - spin_unlock(lock); - - if (err == -EAGAIN) - goto again; - else if (err) - return err; - - if (index > max_id) { - spin_lock(lock); - ida_remove(ida, index); - spin_unlock(lock); - return -ENOMEM; - } - - return index; -} - -static void sync_cop(void *arg) -{ - struct mm_struct *mm = arg; - - if (mm == current->active_mm) - switch_cop(current->active_mm); -} - -/** - * Start using a coprocessor. - * @acop: mask of coprocessor to be used. - * @mm: The mm the coprocessor to associate with. Most likely current mm. - * - * Return a positive PID if successful. Negative errno otherwise. - * The returned PID will be fed to the coprocessor to determine if an - * icswx transaction is authenticated. - */ -int use_cop(unsigned long acop, struct mm_struct *mm) -{ - int ret; - - if (!cpu_has_feature(CPU_FTR_ICSWX)) - return -ENODEV; - - if (!mm || !acop) - return -EINVAL; - - /* The page_table_lock ensures mm_users won't change under us */ - spin_lock(&mm->page_table_lock); - spin_lock(mm->context.cop_lockp); - - if (mm->context.cop_pid == COP_PID_NONE) { - ret = new_cop_pid(&cop_ida, COP_PID_MIN, COP_PID_MAX, - &mmu_context_acop_lock); - if (ret < 0) - goto out; - - mm->context.cop_pid = ret; - } - mm->context.acop |= acop; - - sync_cop(mm); - - /* - * If this is a threaded process then there might be other threads - * running. We need to send an IPI to force them to pick up any - * change in PID and ACOP. - */ - if (atomic_read(&mm->mm_users) > 1) - smp_call_function(sync_cop, mm, 1); - - ret = mm->context.cop_pid; - -out: - spin_unlock(mm->context.cop_lockp); - spin_unlock(&mm->page_table_lock); - - return ret; -} -EXPORT_SYMBOL_GPL(use_cop); - -/** - * Stop using a coprocessor. - * @acop: mask of coprocessor to be stopped. - * @mm: The mm the coprocessor associated with. - */ -void drop_cop(unsigned long acop, struct mm_struct *mm) -{ - int free_pid = COP_PID_NONE; - - if (!cpu_has_feature(CPU_FTR_ICSWX)) - return; - - if (WARN_ON_ONCE(!mm)) - return; - - /* The page_table_lock ensures mm_users won't change under us */ - spin_lock(&mm->page_table_lock); - spin_lock(mm->context.cop_lockp); - - mm->context.acop &= ~acop; - - if ((!mm->context.acop) && (mm->context.cop_pid != COP_PID_NONE)) { - free_pid = mm->context.cop_pid; - mm->context.cop_pid = COP_PID_NONE; - } - - sync_cop(mm); - - /* - * If this is a threaded process then there might be other threads - * running. We need to send an IPI to force them to pick up any - * change in PID and ACOP. - */ - if (atomic_read(&mm->mm_users) > 1) - smp_call_function(sync_cop, mm, 1); - - if (free_pid != COP_PID_NONE) { - spin_lock(&mmu_context_acop_lock); - ida_remove(&cop_ida, free_pid); - spin_unlock(&mmu_context_acop_lock); - } - - spin_unlock(mm->context.cop_lockp); - spin_unlock(&mm->page_table_lock); -} -EXPORT_SYMBOL_GPL(drop_cop); - -#endif /* CONFIG_PPC_ICSWX */ +#include "icswx.h" static DEFINE_SPINLOCK(mmu_context_lock); static DEFINE_IDA(mmu_context_ida); diff --git a/arch/powerpc/platforms/85xx/mpc85xx_cds.c b/arch/powerpc/platforms/85xx/mpc85xx_cds.c index f24926b00451..d993b66f1abf 100644 --- a/arch/powerpc/platforms/85xx/mpc85xx_cds.c +++ b/arch/powerpc/platforms/85xx/mpc85xx_cds.c @@ -179,7 +179,7 @@ static irqreturn_t mpc85xx_8259_cascade_action(int irq, void *dev_id) static struct irqaction mpc85xxcds_8259_irqaction = { .handler = mpc85xx_8259_cascade_action, - .flags = IRQF_SHARED, + .flags = IRQF_SHARED | IRQF_NO_THREAD, .name = "8259 cascade", }; #endif /* PPC_I8259 */ diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index fbecae0fbb49..836b44286b3e 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -236,7 +236,7 @@ config VSX config PPC_ICSWX bool "Support for PowerPC icswx coprocessor instruction" - depends on POWER4 + depends on POWER4 || PPC_A2 default n ---help--- @@ -252,6 +252,25 @@ config PPC_ICSWX If in doubt, say N here. +config PPC_ICSWX_PID + bool "icswx requires direct PID management" + depends on PPC_ICSWX && POWER4 + default y + ---help--- + The PID register in server is used explicitly for ICSWX. In + embedded systems PID managment is done by the system. + +config PPC_ICSWX_USE_SIGILL + bool "Should a bad CT cause a SIGILL?" + depends on PPC_ICSWX + default n + ---help--- + Should a bad CT used for "non-record form ICSWX" cause an + illegal intruction signal or should it be silent as + architected. + + If in doubt, say N here. + config SPE bool "SPE Support" depends on E200 || (E500 && !PPC_E500MC) diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c index 592c3d51b817..ae9fc7bc17d6 100644 --- a/arch/powerpc/platforms/cell/iommu.c +++ b/arch/powerpc/platforms/cell/iommu.c @@ -1037,6 +1037,8 @@ static int __init cell_iommu_fixed_mapping_init(void) /* The fixed mapping is only supported on axon machines */ np = of_find_node_by_name(NULL, "axon"); + of_node_put(np); + if (!np) { pr_debug("iommu: fixed mapping disabled, no axons found\n"); return -1; diff --git a/arch/powerpc/platforms/powermac/setup.c b/arch/powerpc/platforms/powermac/setup.c index 96580b189ec2..970ea1de4298 100644 --- a/arch/powerpc/platforms/powermac/setup.c +++ b/arch/powerpc/platforms/powermac/setup.c @@ -494,11 +494,15 @@ static int __init pmac_declare_of_platform_devices(void) return -1; np = of_find_node_by_name(NULL, "valkyrie"); - if (np) + if (np) { of_platform_device_create(np, "valkyrie", NULL); + of_node_put(np); + } np = of_find_node_by_name(NULL, "platinum"); - if (np) + if (np) { of_platform_device_create(np, "platinum", NULL); + of_node_put(np); + } np = of_find_node_by_type(NULL, "smu"); if (np) { of_platform_device_create(np, "smu", NULL); diff --git a/arch/powerpc/platforms/powermac/smp.c b/arch/powerpc/platforms/powermac/smp.c index 9b6a820bdd7d..e6f7378f4a49 100644 --- a/arch/powerpc/platforms/powermac/smp.c +++ b/arch/powerpc/platforms/powermac/smp.c @@ -200,7 +200,7 @@ static int psurge_secondary_ipi_init(void) if (psurge_secondary_virq) rc = request_irq(psurge_secondary_virq, psurge_ipi_intr, - IRQF_PERCPU, "IPI", NULL); + IRQF_PERCPU | IRQF_NO_THREAD, "IPI", NULL); if (rc) pr_err("Failed to setup secondary cpu IPI\n"); @@ -408,7 +408,7 @@ static int __init smp_psurge_kick_cpu(int nr) static struct irqaction psurge_irqaction = { .handler = psurge_ipi_intr, - .flags = IRQF_PERCPU, + .flags = IRQF_PERCPU | IRQF_NO_THREAD, .name = "primary IPI", }; diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile index 31853008b418..bcc3cb48a44e 100644 --- a/arch/powerpc/platforms/powernv/Makefile +++ b/arch/powerpc/platforms/powernv/Makefile @@ -2,4 +2,4 @@ obj-y += setup.o opal-takeover.o opal-wrappers.o opal.o obj-y += opal-rtc.o opal-nvram.o obj-$(CONFIG_SMP) += smp.o -obj-$(CONFIG_PCI) += pci.o pci-p5ioc2.o +obj-$(CONFIG_PCI) += pci.o pci-p5ioc2.o pci-ioda.o diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c new file mode 100644 index 000000000000..cf89f305c8b1 --- /dev/null +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -0,0 +1,1325 @@ +/* + * Support PCI/PCIe on PowerNV platforms + * + * Copyright 2011 Benjamin Herrenschmidt, IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define DEBUG + +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/delay.h> +#include <linux/string.h> +#include <linux/init.h> +#include <linux/bootmem.h> +#include <linux/irq.h> +#include <linux/io.h> +#include <linux/msi.h> + +#include <asm/sections.h> +#include <asm/io.h> +#include <asm/prom.h> +#include <asm/pci-bridge.h> +#include <asm/machdep.h> +#include <asm/ppc-pci.h> +#include <asm/opal.h> +#include <asm/iommu.h> +#include <asm/tce.h> +#include <asm/abs_addr.h> + +#include "powernv.h" +#include "pci.h" + +struct resource_wrap { + struct list_head link; + resource_size_t size; + resource_size_t align; + struct pci_dev *dev; /* Set if it's a device */ + struct pci_bus *bus; /* Set if it's a bridge */ +}; + +static int __pe_printk(const char *level, const struct pnv_ioda_pe *pe, + struct va_format *vaf) +{ + char pfix[32]; + + if (pe->pdev) + strlcpy(pfix, dev_name(&pe->pdev->dev), sizeof(pfix)); + else + sprintf(pfix, "%04x:%02x ", + pci_domain_nr(pe->pbus), pe->pbus->number); + return printk("pci %s%s: [PE# %.3d] %pV", level, pfix, pe->pe_number, vaf); +} + +#define define_pe_printk_level(func, kern_level) \ +static int func(const struct pnv_ioda_pe *pe, const char *fmt, ...) \ +{ \ + struct va_format vaf; \ + va_list args; \ + int r; \ + \ + va_start(args, fmt); \ + \ + vaf.fmt = fmt; \ + vaf.va = &args; \ + \ + r = __pe_printk(kern_level, pe, &vaf); \ + va_end(args); \ + \ + return r; \ +} \ + +define_pe_printk_level(pe_err, KERN_ERR); +define_pe_printk_level(pe_warn, KERN_WARNING); +define_pe_printk_level(pe_info, KERN_INFO); + + +/* Calculate resource usage & alignment requirement of a single + * device. This will also assign all resources within the device + * for a given type starting at 0 for the biggest one and then + * assigning in decreasing order of size. + */ +static void __devinit pnv_ioda_calc_dev(struct pci_dev *dev, unsigned int flags, + resource_size_t *size, + resource_size_t *align) +{ + resource_size_t start; + struct resource *r; + int i; + + pr_devel(" -> CDR %s\n", pci_name(dev)); + + *size = *align = 0; + + /* Clear the resources out and mark them all unset */ + for (i = 0; i <= PCI_ROM_RESOURCE; i++) { + r = &dev->resource[i]; + if (!(r->flags & flags)) + continue; + if (r->start) { + r->end -= r->start; + r->start = 0; + } + r->flags |= IORESOURCE_UNSET; + } + + /* We currently keep all memory resources together, we + * will handle prefetch & 64-bit separately in the future + * but for now we stick everybody in M32 + */ + start = 0; + for (;;) { + resource_size_t max_size = 0; + int max_no = -1; + + /* Find next biggest resource */ + for (i = 0; i <= PCI_ROM_RESOURCE; i++) { + r = &dev->resource[i]; + if (!(r->flags & IORESOURCE_UNSET) || + !(r->flags & flags)) + continue; + if (resource_size(r) > max_size) { + max_size = resource_size(r); + max_no = i; + } + } + if (max_no < 0) + break; + r = &dev->resource[max_no]; + if (max_size > *align) + *align = max_size; + *size += max_size; + r->start = start; + start += max_size; + r->end = r->start + max_size - 1; + r->flags &= ~IORESOURCE_UNSET; + pr_devel(" -> R%d %016llx..%016llx\n", + max_no, r->start, r->end); + } + pr_devel(" <- CDR %s size=%llx align=%llx\n", + pci_name(dev), *size, *align); +} + +/* Allocate a resource "wrap" for a given device or bridge and + * insert it at the right position in the sorted list + */ +static void __devinit pnv_ioda_add_wrap(struct list_head *list, + struct pci_bus *bus, + struct pci_dev *dev, + resource_size_t size, + resource_size_t align) +{ + struct resource_wrap *w1, *w = kzalloc(sizeof(*w), GFP_KERNEL); + + w->size = size; + w->align = align; + w->dev = dev; + w->bus = bus; + + list_for_each_entry(w1, list, link) { + if (w1->align < align) { + list_add_tail(&w->link, &w1->link); + return; + } + } + list_add_tail(&w->link, list); +} + +/* Offset device resources of a given type */ +static void __devinit pnv_ioda_offset_dev(struct pci_dev *dev, + unsigned int flags, + resource_size_t offset) +{ + struct resource *r; + int i; + + pr_devel(" -> ODR %s [%x] +%016llx\n", pci_name(dev), flags, offset); + + for (i = 0; i <= PCI_ROM_RESOURCE; i++) { + r = &dev->resource[i]; + if (r->flags & flags) { + dev->resource[i].start += offset; + dev->resource[i].end += offset; + } + } + + pr_devel(" <- ODR %s [%x] +%016llx\n", pci_name(dev), flags, offset); +} + +/* Offset bus resources (& all children) of a given type */ +static void __devinit pnv_ioda_offset_bus(struct pci_bus *bus, + unsigned int flags, + resource_size_t offset) +{ + struct resource *r; + struct pci_dev *dev; + struct pci_bus *cbus; + int i; + + pr_devel(" -> OBR %s [%x] +%016llx\n", + bus->self ? pci_name(bus->self) : "root", flags, offset); + + for (i = 0; i < 2; i++) { + r = bus->resource[i]; + if (r && (r->flags & flags)) { + bus->resource[i]->start += offset; + bus->resource[i]->end += offset; + } + } + list_for_each_entry(dev, &bus->devices, bus_list) + pnv_ioda_offset_dev(dev, flags, offset); + list_for_each_entry(cbus, &bus->children, node) + pnv_ioda_offset_bus(cbus, flags, offset); + + pr_devel(" <- OBR %s [%x]\n", + bus->self ? pci_name(bus->self) : "root", flags); +} + +/* This is the guts of our IODA resource allocation. This is called + * recursively for each bus in the system. It calculates all the + * necessary size and requirements for children and assign them + * resources such that: + * + * - Each function fits in it's own contiguous set of IO/M32 + * segment + * + * - All segments behind a P2P bridge are contiguous and obey + * alignment constraints of those bridges + */ +static void __devinit pnv_ioda_calc_bus(struct pci_bus *bus, unsigned int flags, + resource_size_t *size, + resource_size_t *align) +{ + struct pci_controller *hose = pci_bus_to_host(bus); + struct pnv_phb *phb = hose->private_data; + resource_size_t dev_size, dev_align, start; + resource_size_t min_align, min_balign; + struct pci_dev *cdev; + struct pci_bus *cbus; + struct list_head head; + struct resource_wrap *w; + unsigned int bres; + + *size = *align = 0; + + pr_devel("-> CBR %s [%x]\n", + bus->self ? pci_name(bus->self) : "root", flags); + + /* Calculate alignment requirements based on the type + * of resource we are working on + */ + if (flags & IORESOURCE_IO) { + bres = 0; + min_align = phb->ioda.io_segsize; + min_balign = 0x1000; + } else { + bres = 1; + min_align = phb->ioda.m32_segsize; + min_balign = 0x100000; + } + + /* Gather all our children resources ordered by alignment */ + INIT_LIST_HEAD(&head); + + /* - Busses */ + list_for_each_entry(cbus, &bus->children, node) { + pnv_ioda_calc_bus(cbus, flags, &dev_size, &dev_align); + pnv_ioda_add_wrap(&head, cbus, NULL, dev_size, dev_align); + } + + /* - Devices */ + list_for_each_entry(cdev, &bus->devices, bus_list) { + pnv_ioda_calc_dev(cdev, flags, &dev_size, &dev_align); + /* Align them to segment size */ + if (dev_align < min_align) + dev_align = min_align; + pnv_ioda_add_wrap(&head, NULL, cdev, dev_size, dev_align); + } + if (list_empty(&head)) + goto empty; + + /* Now we can do two things: assign offsets to them within that + * level and get our total alignment & size requirements. The + * assignment algorithm is going to be uber-trivial for now, we + * can try to be smarter later at filling out holes. + */ + start = bus->self ? 0 : bus->resource[bres]->start; + + /* Don't hand out IO 0 */ + if ((flags & IORESOURCE_IO) && !bus->self) + start += 0x1000; + + while(!list_empty(&head)) { + w = list_first_entry(&head, struct resource_wrap, link); + list_del(&w->link); + if (w->size) { + if (start) { + start = ALIGN(start, w->align); + if (w->dev) + pnv_ioda_offset_dev(w->dev,flags,start); + else if (w->bus) + pnv_ioda_offset_bus(w->bus,flags,start); + } + if (w->align > *align) + *align = w->align; + } + start += w->size; + kfree(w); + } + *size = start; + + /* Align and setup bridge resources */ + *align = max_t(resource_size_t, *align, + max_t(resource_size_t, min_align, min_balign)); + *size = ALIGN(*size, + max_t(resource_size_t, min_align, min_balign)); + empty: + /* Only setup P2P's, not the PHB itself */ + if (bus->self) { + WARN_ON(bus->resource[bres] == NULL); + bus->resource[bres]->start = 0; + bus->resource[bres]->flags = (*size) ? flags : 0; + bus->resource[bres]->end = (*size) ? (*size - 1) : 0; + + /* Clear prefetch bus resources for now */ + bus->resource[2]->flags = 0; + } + + pr_devel("<- CBR %s [%x] *size=%016llx *align=%016llx\n", + bus->self ? pci_name(bus->self) : "root", flags,*size,*align); +} + +static struct pci_dn *pnv_ioda_get_pdn(struct pci_dev *dev) +{ + struct device_node *np; + + np = pci_device_to_OF_node(dev); + if (!np) + return NULL; + return PCI_DN(np); +} + +static void __devinit pnv_ioda_setup_pe_segments(struct pci_dev *dev) +{ + struct pci_controller *hose = pci_bus_to_host(dev->bus); + struct pnv_phb *phb = hose->private_data; + struct pci_dn *pdn = pnv_ioda_get_pdn(dev); + unsigned int pe, i; + resource_size_t pos; + struct resource io_res; + struct resource m32_res; + struct pci_bus_region region; + int rc; + + /* Anything not referenced in the device-tree gets PE#0 */ + pe = pdn ? pdn->pe_number : 0; + + /* Calculate the device min/max */ + io_res.start = m32_res.start = (resource_size_t)-1; + io_res.end = m32_res.end = 0; + io_res.flags = IORESOURCE_IO; + m32_res.flags = IORESOURCE_MEM; + + for (i = 0; i <= PCI_ROM_RESOURCE; i++) { + struct resource *r = NULL; + if (dev->resource[i].flags & IORESOURCE_IO) + r = &io_res; + if (dev->resource[i].flags & IORESOURCE_MEM) + r = &m32_res; + if (!r) + continue; + if (dev->resource[i].start < r->start) + r->start = dev->resource[i].start; + if (dev->resource[i].end > r->end) + r->end = dev->resource[i].end; + } + + /* Setup IO segments */ + if (io_res.start < io_res.end) { + pcibios_resource_to_bus(dev, ®ion, &io_res); + pos = region.start; + i = pos / phb->ioda.io_segsize; + while(i < phb->ioda.total_pe && pos <= region.end) { + if (phb->ioda.io_segmap[i]) { + pr_err("%s: Trying to use IO seg #%d which is" + " already used by PE# %d\n", + pci_name(dev), i, + phb->ioda.io_segmap[i]); + /* XXX DO SOMETHING TO DISABLE DEVICE ? */ + break; + } + phb->ioda.io_segmap[i] = pe; + rc = opal_pci_map_pe_mmio_window(phb->opal_id, pe, + OPAL_IO_WINDOW_TYPE, + 0, i); + if (rc != OPAL_SUCCESS) { + pr_err("%s: OPAL error %d setting up mapping" + " for IO seg# %d\n", + pci_name(dev), rc, i); + /* XXX DO SOMETHING TO DISABLE DEVICE ? */ + break; + } + pos += phb->ioda.io_segsize; + i++; + }; + } + + /* Setup M32 segments */ + if (m32_res.start < m32_res.end) { + pcibios_resource_to_bus(dev, ®ion, &m32_res); + pos = region.start; + i = pos / phb->ioda.m32_segsize; + while(i < phb->ioda.total_pe && pos <= region.end) { + if (phb->ioda.m32_segmap[i]) { + pr_err("%s: Trying to use M32 seg #%d which is" + " already used by PE# %d\n", + pci_name(dev), i, + phb->ioda.m32_segmap[i]); + /* XXX DO SOMETHING TO DISABLE DEVICE ? */ + break; + } + phb->ioda.m32_segmap[i] = pe; + rc = opal_pci_map_pe_mmio_window(phb->opal_id, pe, + OPAL_M32_WINDOW_TYPE, + 0, i); + if (rc != OPAL_SUCCESS) { + pr_err("%s: OPAL error %d setting up mapping" + " for M32 seg# %d\n", + pci_name(dev), rc, i); + /* XXX DO SOMETHING TO DISABLE DEVICE ? */ + break; + } + pos += phb->ioda.m32_segsize; + i++; + } + } +} + +/* Check if a resource still fits in the total IO or M32 range + * for a given PHB + */ +static int __devinit pnv_ioda_resource_fit(struct pci_controller *hose, + struct resource *r) +{ + struct resource *bounds; + + if (r->flags & IORESOURCE_IO) + bounds = &hose->io_resource; + else if (r->flags & IORESOURCE_MEM) + bounds = &hose->mem_resources[0]; + else + return 1; + + if (r->start >= bounds->start && r->end <= bounds->end) + return 1; + r->flags = 0; + return 0; +} + +static void __devinit pnv_ioda_update_resources(struct pci_bus *bus) +{ + struct pci_controller *hose = pci_bus_to_host(bus); + struct pci_bus *cbus; + struct pci_dev *cdev; + unsigned int i; + u16 cmd; + + /* Clear all device enables */ + list_for_each_entry(cdev, &bus->devices, bus_list) { + pci_read_config_word(cdev, PCI_COMMAND, &cmd); + cmd &= ~(PCI_COMMAND_IO|PCI_COMMAND_MEMORY|PCI_COMMAND_MASTER); + pci_write_config_word(cdev, PCI_COMMAND, cmd); + } + + /* Check if bus resources fit in our IO or M32 range */ + for (i = 0; bus->self && (i < 2); i++) { + struct resource *r = bus->resource[i]; + if (r && !pnv_ioda_resource_fit(hose, r)) + pr_err("%s: Bus %d resource %d disabled, no room\n", + pci_name(bus->self), bus->number, i); + } + + /* Update self if it's not a PHB */ + if (bus->self) + pci_setup_bridge(bus); + + /* Update child devices */ + list_for_each_entry(cdev, &bus->devices, bus_list) { + /* Check if resource fits, if not, disabled it */ + for (i = 0; i <= PCI_ROM_RESOURCE; i++) { + struct resource *r = &cdev->resource[i]; + if (!pnv_ioda_resource_fit(hose, r)) + pr_err("%s: Resource %d disabled, no room\n", + pci_name(cdev), i); + } + + /* Assign segments */ + pnv_ioda_setup_pe_segments(cdev); + + /* Update HW BARs */ + for (i = 0; i <= PCI_ROM_RESOURCE; i++) + pci_update_resource(cdev, i); + } + + /* Update child busses */ + list_for_each_entry(cbus, &bus->children, node) + pnv_ioda_update_resources(cbus); +} + +static int __devinit pnv_ioda_alloc_pe(struct pnv_phb *phb) +{ + unsigned long pe; + + do { + pe = find_next_zero_bit(phb->ioda.pe_alloc, + phb->ioda.total_pe, 0); + if (pe >= phb->ioda.total_pe) + return IODA_INVALID_PE; + } while(test_and_set_bit(pe, phb->ioda.pe_alloc)); + + phb->ioda.pe_array[pe].pe_number = pe; + return pe; +} + +static void __devinit pnv_ioda_free_pe(struct pnv_phb *phb, int pe) +{ + WARN_ON(phb->ioda.pe_array[pe].pdev); + + memset(&phb->ioda.pe_array[pe], 0, sizeof(struct pnv_ioda_pe)); + clear_bit(pe, phb->ioda.pe_alloc); +} + +/* Currently those 2 are only used when MSIs are enabled, this will change + * but in the meantime, we need to protect them to avoid warnings + */ +#ifdef CONFIG_PCI_MSI +static struct pnv_ioda_pe * __devinit __pnv_ioda_get_one_pe(struct pci_dev *dev) +{ + struct pci_controller *hose = pci_bus_to_host(dev->bus); + struct pnv_phb *phb = hose->private_data; + struct pci_dn *pdn = pnv_ioda_get_pdn(dev); + + if (!pdn) + return NULL; + if (pdn->pe_number == IODA_INVALID_PE) + return NULL; + return &phb->ioda.pe_array[pdn->pe_number]; +} + +static struct pnv_ioda_pe * __devinit pnv_ioda_get_pe(struct pci_dev *dev) +{ + struct pnv_ioda_pe *pe = __pnv_ioda_get_one_pe(dev); + + while (!pe && dev->bus->self) { + dev = dev->bus->self; + pe = __pnv_ioda_get_one_pe(dev); + if (pe) + pe = pe->bus_pe; + } + return pe; +} +#endif /* CONFIG_PCI_MSI */ + +static int __devinit pnv_ioda_configure_pe(struct pnv_phb *phb, + struct pnv_ioda_pe *pe) +{ + struct pci_dev *parent; + uint8_t bcomp, dcomp, fcomp; + long rc, rid_end, rid; + + /* Bus validation ? */ + if (pe->pbus) { + int count; + + dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER; + fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER; + parent = pe->pbus->self; + count = pe->pbus->subordinate - pe->pbus->secondary + 1; + switch(count) { + case 1: bcomp = OpalPciBusAll; break; + case 2: bcomp = OpalPciBus7Bits; break; + case 4: bcomp = OpalPciBus6Bits; break; + case 8: bcomp = OpalPciBus5Bits; break; + case 16: bcomp = OpalPciBus4Bits; break; + case 32: bcomp = OpalPciBus3Bits; break; + default: + pr_err("%s: Number of subordinate busses %d" + " unsupported\n", + pci_name(pe->pbus->self), count); + /* Do an exact match only */ + bcomp = OpalPciBusAll; + } + rid_end = pe->rid + (count << 8); + } else { + parent = pe->pdev->bus->self; + bcomp = OpalPciBusAll; + dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER; + fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER; + rid_end = pe->rid + 1; + } + + /* Associate PE in PELT */ + rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid, + bcomp, dcomp, fcomp, OPAL_MAP_PE); + if (rc) { + pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc); + return -ENXIO; + } + opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number, + OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); + + /* Add to all parents PELT-V */ + while (parent) { + struct pci_dn *pdn = pnv_ioda_get_pdn(parent); + if (pdn && pdn->pe_number != IODA_INVALID_PE) { + rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number, + pe->pe_number, 1); + /* XXX What to do in case of error ? */ + } + parent = parent->bus->self; + } + /* Setup reverse map */ + for (rid = pe->rid; rid < rid_end; rid++) + phb->ioda.pe_rmap[rid] = pe->pe_number; + + /* Setup one MVTs on IODA1 */ + if (phb->type == PNV_PHB_IODA1) { + pe->mve_number = pe->pe_number; + rc = opal_pci_set_mve(phb->opal_id, pe->mve_number, + pe->pe_number); + if (rc) { + pe_err(pe, "OPAL error %ld setting up MVE %d\n", + rc, pe->mve_number); + pe->mve_number = -1; + } else { + rc = opal_pci_set_mve_enable(phb->opal_id, + pe->mve_number, 1); + if (rc) { + pe_err(pe, "OPAL error %ld enabling MVE %d\n", + rc, pe->mve_number); + pe->mve_number = -1; + } + } + } else if (phb->type == PNV_PHB_IODA2) + pe->mve_number = 0; + + return 0; +} + +static void __devinit pnv_ioda_link_pe_by_weight(struct pnv_phb *phb, + struct pnv_ioda_pe *pe) +{ + struct pnv_ioda_pe *lpe; + + list_for_each_entry(lpe, &phb->ioda.pe_list, link) { + if (lpe->dma_weight < pe->dma_weight) { + list_add_tail(&pe->link, &lpe->link); + return; + } + } + list_add_tail(&pe->link, &phb->ioda.pe_list); +} + +static unsigned int pnv_ioda_dma_weight(struct pci_dev *dev) +{ + /* This is quite simplistic. The "base" weight of a device + * is 10. 0 means no DMA is to be accounted for it. + */ + + /* If it's a bridge, no DMA */ + if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL) + return 0; + + /* Reduce the weight of slow USB controllers */ + if (dev->class == PCI_CLASS_SERIAL_USB_UHCI || + dev->class == PCI_CLASS_SERIAL_USB_OHCI || + dev->class == PCI_CLASS_SERIAL_USB_EHCI) + return 3; + + /* Increase the weight of RAID (includes Obsidian) */ + if ((dev->class >> 8) == PCI_CLASS_STORAGE_RAID) + return 15; + + /* Default */ + return 10; +} + +static struct pnv_ioda_pe * __devinit pnv_ioda_setup_dev_PE(struct pci_dev *dev) +{ + struct pci_controller *hose = pci_bus_to_host(dev->bus); + struct pnv_phb *phb = hose->private_data; + struct pci_dn *pdn = pnv_ioda_get_pdn(dev); + struct pnv_ioda_pe *pe; + int pe_num; + + if (!pdn) { + pr_err("%s: Device tree node not associated properly\n", + pci_name(dev)); + return NULL; + } + if (pdn->pe_number != IODA_INVALID_PE) + return NULL; + + /* PE#0 has been pre-set */ + if (dev->bus->number == 0) + pe_num = 0; + else + pe_num = pnv_ioda_alloc_pe(phb); + if (pe_num == IODA_INVALID_PE) { + pr_warning("%s: Not enough PE# available, disabling device\n", + pci_name(dev)); + return NULL; + } + + /* NOTE: We get only one ref to the pci_dev for the pdn, not for the + * pointer in the PE data structure, both should be destroyed at the + * same time. However, this needs to be looked at more closely again + * once we actually start removing things (Hotplug, SR-IOV, ...) + * + * At some point we want to remove the PDN completely anyways + */ + pe = &phb->ioda.pe_array[pe_num]; + pci_dev_get(dev); + pdn->pcidev = dev; + pdn->pe_number = pe_num; + pe->pdev = dev; + pe->pbus = NULL; + pe->tce32_seg = -1; + pe->mve_number = -1; + pe->rid = dev->bus->number << 8 | pdn->devfn; + + pe_info(pe, "Associated device to PE\n"); + + if (pnv_ioda_configure_pe(phb, pe)) { + /* XXX What do we do here ? */ + if (pe_num) + pnv_ioda_free_pe(phb, pe_num); + pdn->pe_number = IODA_INVALID_PE; + pe->pdev = NULL; + pci_dev_put(dev); + return NULL; + } + + /* Assign a DMA weight to the device */ + pe->dma_weight = pnv_ioda_dma_weight(dev); + if (pe->dma_weight != 0) { + phb->ioda.dma_weight += pe->dma_weight; + phb->ioda.dma_pe_count++; + } + + /* Link the PE */ + pnv_ioda_link_pe_by_weight(phb, pe); + + return pe; +} + +static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe) +{ + struct pci_dev *dev; + + list_for_each_entry(dev, &bus->devices, bus_list) { + struct pci_dn *pdn = pnv_ioda_get_pdn(dev); + + if (pdn == NULL) { + pr_warn("%s: No device node associated with device !\n", + pci_name(dev)); + continue; + } + pci_dev_get(dev); + pdn->pcidev = dev; + pdn->pe_number = pe->pe_number; + pe->dma_weight += pnv_ioda_dma_weight(dev); + if (dev->subordinate) + pnv_ioda_setup_same_PE(dev->subordinate, pe); + } +} + +static void __devinit pnv_ioda_setup_bus_PE(struct pci_dev *dev, + struct pnv_ioda_pe *ppe) +{ + struct pci_controller *hose = pci_bus_to_host(dev->bus); + struct pnv_phb *phb = hose->private_data; + struct pci_bus *bus = dev->subordinate; + struct pnv_ioda_pe *pe; + int pe_num; + + if (!bus) { + pr_warning("%s: Bridge without a subordinate bus !\n", + pci_name(dev)); + return; + } + pe_num = pnv_ioda_alloc_pe(phb); + if (pe_num == IODA_INVALID_PE) { + pr_warning("%s: Not enough PE# available, disabling bus\n", + pci_name(dev)); + return; + } + + pe = &phb->ioda.pe_array[pe_num]; + ppe->bus_pe = pe; + pe->pbus = bus; + pe->pdev = NULL; + pe->tce32_seg = -1; + pe->mve_number = -1; + pe->rid = bus->secondary << 8; + pe->dma_weight = 0; + + pe_info(pe, "Secondary busses %d..%d associated with PE\n", + bus->secondary, bus->subordinate); + + if (pnv_ioda_configure_pe(phb, pe)) { + /* XXX What do we do here ? */ + if (pe_num) + pnv_ioda_free_pe(phb, pe_num); + pe->pbus = NULL; + return; + } + + /* Associate it with all child devices */ + pnv_ioda_setup_same_PE(bus, pe); + + /* Account for one DMA PE if at least one DMA capable device exist + * below the bridge + */ + if (pe->dma_weight != 0) { + phb->ioda.dma_weight += pe->dma_weight; + phb->ioda.dma_pe_count++; + } + + /* Link the PE */ + pnv_ioda_link_pe_by_weight(phb, pe); +} + +static void __devinit pnv_ioda_setup_PEs(struct pci_bus *bus) +{ + struct pci_dev *dev; + struct pnv_ioda_pe *pe; + + list_for_each_entry(dev, &bus->devices, bus_list) { + pe = pnv_ioda_setup_dev_PE(dev); + if (pe == NULL) + continue; + /* Leaving the PCIe domain ... single PE# */ + if (dev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE) + pnv_ioda_setup_bus_PE(dev, pe); + else if (dev->subordinate) + pnv_ioda_setup_PEs(dev->subordinate); + } +} + +static void __devinit pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, + struct pci_dev *dev) +{ + /* We delay DMA setup after we have assigned all PE# */ +} + +static void __devinit pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, + struct pci_bus *bus) +{ + struct pci_dev *dev; + + list_for_each_entry(dev, &bus->devices, bus_list) { + set_iommu_table_base(&dev->dev, &pe->tce32_table); + if (dev->subordinate) + pnv_ioda_setup_bus_dma(pe, dev->subordinate); + } +} + +static void __devinit pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, + struct pnv_ioda_pe *pe, + unsigned int base, + unsigned int segs) +{ + + struct page *tce_mem = NULL; + const __be64 *swinvp; + struct iommu_table *tbl; + unsigned int i; + int64_t rc; + void *addr; + + /* 256M DMA window, 4K TCE pages, 8 bytes TCE */ +#define TCE32_TABLE_SIZE ((0x10000000 / 0x1000) * 8) + + /* XXX FIXME: Handle 64-bit only DMA devices */ + /* XXX FIXME: Provide 64-bit DMA facilities & non-4K TCE tables etc.. */ + /* XXX FIXME: Allocate multi-level tables on PHB3 */ + + /* We shouldn't already have a 32-bit DMA associated */ + if (WARN_ON(pe->tce32_seg >= 0)) + return; + + /* Grab a 32-bit TCE table */ + pe->tce32_seg = base; + pe_info(pe, " Setting up 32-bit TCE table at %08x..%08x\n", + (base << 28), ((base + segs) << 28) - 1); + + /* XXX Currently, we allocate one big contiguous table for the + * TCEs. We only really need one chunk per 256M of TCE space + * (ie per segment) but that's an optimization for later, it + * requires some added smarts with our get/put_tce implementation + */ + tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL, + get_order(TCE32_TABLE_SIZE * segs)); + if (!tce_mem) { + pe_err(pe, " Failed to allocate a 32-bit TCE memory\n"); + goto fail; + } + addr = page_address(tce_mem); + memset(addr, 0, TCE32_TABLE_SIZE * segs); + + /* Configure HW */ + for (i = 0; i < segs; i++) { + rc = opal_pci_map_pe_dma_window(phb->opal_id, + pe->pe_number, + base + i, 1, + __pa(addr) + TCE32_TABLE_SIZE * i, + TCE32_TABLE_SIZE, 0x1000); + if (rc) { + pe_err(pe, " Failed to configure 32-bit TCE table," + " err %ld\n", rc); + goto fail; + } + } + + /* Setup linux iommu table */ + tbl = &pe->tce32_table; + pnv_pci_setup_iommu_table(tbl, addr, TCE32_TABLE_SIZE * segs, + base << 28); + + /* OPAL variant of P7IOC SW invalidated TCEs */ + swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL); + if (swinvp) { + /* We need a couple more fields -- an address and a data + * to or. Since the bus is only printed out on table free + * errors, and on the first pass the data will be a relative + * bus number, print that out instead. + */ + tbl->it_busno = 0; + tbl->it_index = (unsigned long)ioremap(be64_to_cpup(swinvp), 8); + tbl->it_type = TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE + | TCE_PCI_SWINV_PAIR; + } + iommu_init_table(tbl, phb->hose->node); + + if (pe->pdev) + set_iommu_table_base(&pe->pdev->dev, tbl); + else + pnv_ioda_setup_bus_dma(pe, pe->pbus); + + return; + fail: + /* XXX Failure: Try to fallback to 64-bit only ? */ + if (pe->tce32_seg >= 0) + pe->tce32_seg = -1; + if (tce_mem) + __free_pages(tce_mem, get_order(TCE32_TABLE_SIZE * segs)); +} + +static void __devinit pnv_ioda_setup_dma(struct pnv_phb *phb) +{ + struct pci_controller *hose = phb->hose; + unsigned int residual, remaining, segs, tw, base; + struct pnv_ioda_pe *pe; + + /* If we have more PE# than segments available, hand out one + * per PE until we run out and let the rest fail. If not, + * then we assign at least one segment per PE, plus more based + * on the amount of devices under that PE + */ + if (phb->ioda.dma_pe_count > phb->ioda.tce32_count) + residual = 0; + else + residual = phb->ioda.tce32_count - + phb->ioda.dma_pe_count; + + pr_info("PCI: Domain %04x has %ld available 32-bit DMA segments\n", + hose->global_number, phb->ioda.tce32_count); + pr_info("PCI: %d PE# for a total weight of %d\n", + phb->ioda.dma_pe_count, phb->ioda.dma_weight); + + /* Walk our PE list and configure their DMA segments, hand them + * out one base segment plus any residual segments based on + * weight + */ + remaining = phb->ioda.tce32_count; + tw = phb->ioda.dma_weight; + base = 0; + list_for_each_entry(pe, &phb->ioda.pe_list, link) { + if (!pe->dma_weight) + continue; + if (!remaining) { + pe_warn(pe, "No DMA32 resources available\n"); + continue; + } + segs = 1; + if (residual) { + segs += ((pe->dma_weight * residual) + (tw / 2)) / tw; + if (segs > remaining) + segs = remaining; + } + pe_info(pe, "DMA weight %d, assigned %d DMA32 segments\n", + pe->dma_weight, segs); + pnv_pci_ioda_setup_dma_pe(phb, pe, base, segs); + remaining -= segs; + base += segs; + } +} + +#ifdef CONFIG_PCI_MSI +static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev, + unsigned int hwirq, unsigned int is_64, + struct msi_msg *msg) +{ + struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev); + unsigned int xive_num = hwirq - phb->msi_base; + uint64_t addr64; + uint32_t addr32, data; + int rc; + + /* No PE assigned ? bail out ... no MSI for you ! */ + if (pe == NULL) + return -ENXIO; + + /* Check if we have an MVE */ + if (pe->mve_number < 0) + return -ENXIO; + + /* Assign XIVE to PE */ + rc = opal_pci_set_xive_pe(phb->opal_id, pe->pe_number, xive_num); + if (rc) { + pr_warn("%s: OPAL error %d setting XIVE %d PE\n", + pci_name(dev), rc, xive_num); + return -EIO; + } + + if (is_64) { + rc = opal_get_msi_64(phb->opal_id, pe->mve_number, xive_num, 1, + &addr64, &data); + if (rc) { + pr_warn("%s: OPAL error %d getting 64-bit MSI data\n", + pci_name(dev), rc); + return -EIO; + } + msg->address_hi = addr64 >> 32; + msg->address_lo = addr64 & 0xfffffffful; + } else { + rc = opal_get_msi_32(phb->opal_id, pe->mve_number, xive_num, 1, + &addr32, &data); + if (rc) { + pr_warn("%s: OPAL error %d getting 32-bit MSI data\n", + pci_name(dev), rc); + return -EIO; + } + msg->address_hi = 0; + msg->address_lo = addr32; + } + msg->data = data; + + pr_devel("%s: %s-bit MSI on hwirq %x (xive #%d)," + " address=%x_%08x data=%x PE# %d\n", + pci_name(dev), is_64 ? "64" : "32", hwirq, xive_num, + msg->address_hi, msg->address_lo, data, pe->pe_number); + + return 0; +} + +static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) +{ + unsigned int bmap_size; + const __be32 *prop = of_get_property(phb->hose->dn, + "ibm,opal-msi-ranges", NULL); + if (!prop) { + /* BML Fallback */ + prop = of_get_property(phb->hose->dn, "msi-ranges", NULL); + } + if (!prop) + return; + + phb->msi_base = be32_to_cpup(prop); + phb->msi_count = be32_to_cpup(prop + 1); + bmap_size = BITS_TO_LONGS(phb->msi_count) * sizeof(unsigned long); + phb->msi_map = zalloc_maybe_bootmem(bmap_size, GFP_KERNEL); + if (!phb->msi_map) { + pr_err("PCI %d: Failed to allocate MSI bitmap !\n", + phb->hose->global_number); + return; + } + phb->msi_setup = pnv_pci_ioda_msi_setup; + phb->msi32_support = 1; + pr_info(" Allocated bitmap for %d MSIs (base IRQ 0x%x)\n", + phb->msi_count, phb->msi_base); +} +#else +static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) { } +#endif /* CONFIG_PCI_MSI */ + +/* This is the starting point of our IODA specific resource + * allocation process + */ +static void __devinit pnv_pci_ioda_fixup_phb(struct pci_controller *hose) +{ + resource_size_t size, align; + struct pci_bus *child; + + /* Associate PEs per functions */ + pnv_ioda_setup_PEs(hose->bus); + + /* Calculate all resources */ + pnv_ioda_calc_bus(hose->bus, IORESOURCE_IO, &size, &align); + pnv_ioda_calc_bus(hose->bus, IORESOURCE_MEM, &size, &align); + + /* Apply then to HW */ + pnv_ioda_update_resources(hose->bus); + + /* Setup DMA */ + pnv_ioda_setup_dma(hose->private_data); + + /* Configure PCI Express settings */ + list_for_each_entry(child, &hose->bus->children, node) { + struct pci_dev *self = child->self; + if (!self) + continue; + pcie_bus_configure_settings(child, self->pcie_mpss); + } +} + +/* Prevent enabling devices for which we couldn't properly + * assign a PE + */ +static int __devinit pnv_pci_enable_device_hook(struct pci_dev *dev) +{ + struct pci_dn *pdn = pnv_ioda_get_pdn(dev); + + if (!pdn || pdn->pe_number == IODA_INVALID_PE) + return -EINVAL; + return 0; +} + +static u32 pnv_ioda_bdfn_to_pe(struct pnv_phb *phb, struct pci_bus *bus, + u32 devfn) +{ + return phb->ioda.pe_rmap[(bus->number << 8) | devfn]; +} + +void __init pnv_pci_init_ioda1_phb(struct device_node *np) +{ + struct pci_controller *hose; + static int primary = 1; + struct pnv_phb *phb; + unsigned long size, m32map_off, iomap_off, pemap_off; + const u64 *prop64; + u64 phb_id; + void *aux; + long rc; + + pr_info(" Initializing IODA OPAL PHB %s\n", np->full_name); + + prop64 = of_get_property(np, "ibm,opal-phbid", NULL); + if (!prop64) { + pr_err(" Missing \"ibm,opal-phbid\" property !\n"); + return; + } + phb_id = be64_to_cpup(prop64); + pr_debug(" PHB-ID : 0x%016llx\n", phb_id); + + phb = alloc_bootmem(sizeof(struct pnv_phb)); + if (phb) { + memset(phb, 0, sizeof(struct pnv_phb)); + phb->hose = hose = pcibios_alloc_controller(np); + } + if (!phb || !phb->hose) { + pr_err("PCI: Failed to allocate PCI controller for %s\n", + np->full_name); + return; + } + + spin_lock_init(&phb->lock); + /* XXX Use device-tree */ + hose->first_busno = 0; + hose->last_busno = 0xff; + hose->private_data = phb; + phb->opal_id = phb_id; + phb->type = PNV_PHB_IODA1; + + /* We parse "ranges" now since we need to deduce the register base + * from the IO base + */ + pci_process_bridge_OF_ranges(phb->hose, np, primary); + primary = 0; + + /* Magic formula from Milton */ + phb->regs = of_iomap(np, 0); + if (phb->regs == NULL) + pr_err(" Failed to map registers !\n"); + + + /* XXX This is hack-a-thon. This needs to be changed so that: + * - we obtain stuff like PE# etc... from device-tree + * - we properly re-allocate M32 ourselves + * (the OFW one isn't very good) + */ + + /* Initialize more IODA stuff */ + phb->ioda.total_pe = 128; + + phb->ioda.m32_size = resource_size(&hose->mem_resources[0]); + /* OFW Has already off top 64k of M32 space (MSI space) */ + phb->ioda.m32_size += 0x10000; + + phb->ioda.m32_segsize = phb->ioda.m32_size / phb->ioda.total_pe; + phb->ioda.m32_pci_base = hose->mem_resources[0].start - + hose->pci_mem_offset; + phb->ioda.io_size = hose->pci_io_size; + phb->ioda.io_segsize = phb->ioda.io_size / phb->ioda.total_pe; + phb->ioda.io_pci_base = 0; /* XXX calculate this ? */ + + /* Allocate aux data & arrays */ + size = _ALIGN_UP(phb->ioda.total_pe / 8, sizeof(unsigned long)); + m32map_off = size; + size += phb->ioda.total_pe; + iomap_off = size; + size += phb->ioda.total_pe; + pemap_off = size; + size += phb->ioda.total_pe * sizeof(struct pnv_ioda_pe); + aux = alloc_bootmem(size); + memset(aux, 0, size); + phb->ioda.pe_alloc = aux; + phb->ioda.m32_segmap = aux + m32map_off; + phb->ioda.io_segmap = aux + iomap_off; + phb->ioda.pe_array = aux + pemap_off; + set_bit(0, phb->ioda.pe_alloc); + + INIT_LIST_HEAD(&phb->ioda.pe_list); + + /* Calculate how many 32-bit TCE segments we have */ + phb->ioda.tce32_count = phb->ioda.m32_pci_base >> 28; + + /* Clear unusable m64 */ + hose->mem_resources[1].flags = 0; + hose->mem_resources[1].start = 0; + hose->mem_resources[1].end = 0; + hose->mem_resources[2].flags = 0; + hose->mem_resources[2].start = 0; + hose->mem_resources[2].end = 0; + +#if 0 + rc = opal_pci_set_phb_mem_window(opal->phb_id, + window_type, + window_num, + starting_real_address, + starting_pci_address, + segment_size); +#endif + + pr_info(" %d PE's M32: 0x%x [segment=0x%x] IO: 0x%x [segment=0x%x]\n", + phb->ioda.total_pe, + phb->ioda.m32_size, phb->ioda.m32_segsize, + phb->ioda.io_size, phb->ioda.io_segsize); + + if (phb->regs) { + pr_devel(" BUID = 0x%016llx\n", in_be64(phb->regs + 0x100)); + pr_devel(" PHB2_CR = 0x%016llx\n", in_be64(phb->regs + 0x160)); + pr_devel(" IO_BAR = 0x%016llx\n", in_be64(phb->regs + 0x170)); + pr_devel(" IO_BAMR = 0x%016llx\n", in_be64(phb->regs + 0x178)); + pr_devel(" IO_SAR = 0x%016llx\n", in_be64(phb->regs + 0x180)); + pr_devel(" M32_BAR = 0x%016llx\n", in_be64(phb->regs + 0x190)); + pr_devel(" M32_BAMR = 0x%016llx\n", in_be64(phb->regs + 0x198)); + pr_devel(" M32_SAR = 0x%016llx\n", in_be64(phb->regs + 0x1a0)); + } + phb->hose->ops = &pnv_pci_ops; + + /* Setup RID -> PE mapping function */ + phb->bdfn_to_pe = pnv_ioda_bdfn_to_pe; + + /* Setup TCEs */ + phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup; + + /* Setup MSI support */ + pnv_pci_init_ioda_msis(phb); + + /* We set both probe_only and PCI_REASSIGN_ALL_RSRC. This is an + * odd combination which essentially means that we skip all resource + * fixups and assignments in the generic code, and do it all + * ourselves here + */ + pci_probe_only = 1; + ppc_md.pcibios_fixup_phb = pnv_pci_ioda_fixup_phb; + ppc_md.pcibios_enable_device_hook = pnv_pci_enable_device_hook; + pci_add_flags(PCI_REASSIGN_ALL_RSRC); + + /* Reset IODA tables to a clean state */ + rc = opal_pci_reset(phb_id, OPAL_PCI_IODA_RESET, OPAL_ASSERT_RESET); + if (rc) + pr_warning(" OPAL Error %ld performing IODA reset !\n", rc); + opal_pci_set_pe(phb_id, 0, 0, 7, 1, 1 , OPAL_MAP_PE); +} + +void __init pnv_pci_init_ioda_hub(struct device_node *np) +{ + struct device_node *phbn; + const u64 *prop64; + u64 hub_id; + + pr_info("Probing IODA IO-Hub %s\n", np->full_name); + + prop64 = of_get_property(np, "ibm,opal-hubid", NULL); + if (!prop64) { + pr_err(" Missing \"ibm,opal-hubid\" property !\n"); + return; + } + hub_id = be64_to_cpup(prop64); + pr_devel(" HUB-ID : 0x%016llx\n", hub_id); + + /* Count child PHBs */ + for_each_child_of_node(np, phbn) { + /* Look for IODA1 PHBs */ + if (of_device_is_compatible(phbn, "ibm,ioda-phb")) + pnv_pci_init_ioda1_phb(phbn); + } +} diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 85bb66d7f933..c0ed379498a0 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -257,12 +257,54 @@ struct pci_ops pnv_pci_ops = { .write = pnv_pci_write_config, }; + +static void pnv_tce_invalidate(struct iommu_table *tbl, + u64 *startp, u64 *endp) +{ + u64 __iomem *invalidate = (u64 __iomem *)tbl->it_index; + unsigned long start, end, inc; + + start = __pa(startp); + end = __pa(endp); + + + /* BML uses this case for p6/p7/galaxy2: Shift addr and put in node */ + if (tbl->it_busno) { + start <<= 12; + end <<= 12; + inc = 128 << 12; + start |= tbl->it_busno; + end |= tbl->it_busno; + } + /* p7ioc-style invalidation, 2 TCEs per write */ + else if (tbl->it_type & TCE_PCI_SWINV_PAIR) { + start |= (1ull << 63); + end |= (1ull << 63); + inc = 16; + } + /* Default (older HW) */ + else + inc = 128; + + end |= inc - 1; /* round up end to be different than start */ + + mb(); /* Ensure above stores are visible */ + while (start <= end) { + __raw_writeq(start, invalidate); + start += inc; + } + /* The iommu layer will do another mb() for us on build() and + * we don't care on free() + */ +} + + static int pnv_tce_build(struct iommu_table *tbl, long index, long npages, unsigned long uaddr, enum dma_data_direction direction, struct dma_attrs *attrs) { u64 proto_tce; - u64 *tcep; + u64 *tcep, *tces; u64 rpn; proto_tce = TCE_PCI_READ; // Read allowed @@ -270,25 +312,33 @@ static int pnv_tce_build(struct iommu_table *tbl, long index, long npages, if (direction != DMA_TO_DEVICE) proto_tce |= TCE_PCI_WRITE; - tcep = ((u64 *)tbl->it_base) + index; + tces = tcep = ((u64 *)tbl->it_base) + index - tbl->it_offset; + rpn = __pa(uaddr) >> TCE_SHIFT; - while (npages--) { - /* can't move this out since we might cross LMB boundary */ - rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT; - *tcep = proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT; + while (npages--) + *(tcep++) = proto_tce | (rpn++ << TCE_RPN_SHIFT); + + /* Some implementations won't cache invalid TCEs and thus may not + * need that flush. We'll probably turn it_type into a bit mask + * of flags if that becomes the case + */ + if (tbl->it_type & TCE_PCI_SWINV_CREATE) + pnv_tce_invalidate(tbl, tces, tcep - 1); - uaddr += TCE_PAGE_SIZE; - tcep++; - } return 0; } static void pnv_tce_free(struct iommu_table *tbl, long index, long npages) { - u64 *tcep = ((u64 *)tbl->it_base) + index; + u64 *tcep, *tces; + + tces = tcep = ((u64 *)tbl->it_base) + index - tbl->it_offset; while (npages--) *(tcep++) = 0; + + if (tbl->it_type & TCE_PCI_SWINV_FREE) + pnv_tce_invalidate(tbl, tces, tcep - 1); } void pnv_pci_setup_iommu_table(struct iommu_table *tbl, @@ -308,13 +358,14 @@ static struct iommu_table * __devinit pnv_pci_setup_bml_iommu(struct pci_controller *hose) { struct iommu_table *tbl; - const __be64 *basep; + const __be64 *basep, *swinvp; const __be32 *sizep; basep = of_get_property(hose->dn, "linux,tce-base", NULL); sizep = of_get_property(hose->dn, "linux,tce-size", NULL); if (basep == NULL || sizep == NULL) { - pr_err("PCI: %s has missing tce entries !\n", hose->dn->full_name); + pr_err("PCI: %s has missing tce entries !\n", + hose->dn->full_name); return NULL; } tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, hose->node); @@ -323,6 +374,15 @@ pnv_pci_setup_bml_iommu(struct pci_controller *hose) pnv_pci_setup_iommu_table(tbl, __va(be64_to_cpup(basep)), be32_to_cpup(sizep), 0); iommu_init_table(tbl, hose->node); + + /* Deal with SW invalidated TCEs when needed (BML way) */ + swinvp = of_get_property(hose->dn, "linux,tce-sw-invalidate-info", + NULL); + if (swinvp) { + tbl->it_busno = swinvp[1]; + tbl->it_index = (unsigned long)ioremap(swinvp[0], 8); + tbl->it_type = TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE; + } return tbl; } @@ -356,6 +416,13 @@ static void __devinit pnv_pci_dma_dev_setup(struct pci_dev *pdev) pnv_pci_dma_fallback_setup(hose, pdev); } +/* Fixup wrong class code in p7ioc root complex */ +static void __devinit pnv_p7ioc_rc_quirk(struct pci_dev *dev) +{ + dev->class = PCI_CLASS_BRIDGE_PCI << 8; +} +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_IBM, 0x3b9, pnv_p7ioc_rc_quirk); + static int pnv_pci_probe_mode(struct pci_bus *bus) { struct pci_controller *hose = pci_bus_to_host(bus); @@ -400,12 +467,24 @@ void __init pnv_pci_init(void) init_pci_config_tokens(); find_and_init_phbs(); #endif /* CONFIG_PPC_POWERNV_RTAS */ - } else { - /* OPAL is here, do our normal stuff */ + } + /* OPAL is here, do our normal stuff */ + else { + int found_ioda = 0; + + /* Look for IODA IO-Hubs. We don't support mixing IODA + * and p5ioc2 due to the need to change some global + * probing flags + */ + for_each_compatible_node(np, NULL, "ibm,ioda-hub") { + pnv_pci_init_ioda_hub(np); + found_ioda = 1; + } /* Look for p5ioc2 IO-Hubs */ - for_each_compatible_node(np, NULL, "ibm,p5ioc2") - pnv_pci_init_p5ioc2_hub(np); + if (!found_ioda) + for_each_compatible_node(np, NULL, "ibm,p5ioc2") + pnv_pci_init_p5ioc2_hub(np); } /* Setup the linkage between OF nodes and PHBs */ diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index d4dbc4950936..28ae4ca512c4 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -9,6 +9,50 @@ enum pnv_phb_type { PNV_PHB_IODA2, }; +/* Data associated with a PE, including IOMMU tracking etc.. */ +struct pnv_ioda_pe { + /* A PE can be associated with a single device or an + * entire bus (& children). In the former case, pdev + * is populated, in the later case, pbus is. + */ + struct pci_dev *pdev; + struct pci_bus *pbus; + + /* Effective RID (device RID for a device PE and base bus + * RID with devfn 0 for a bus PE) + */ + unsigned int rid; + + /* PE number */ + unsigned int pe_number; + + /* "Weight" assigned to the PE for the sake of DMA resource + * allocations + */ + unsigned int dma_weight; + + /* This is a PCI-E -> PCI-X bridge, this points to the + * corresponding bus PE + */ + struct pnv_ioda_pe *bus_pe; + + /* "Base" iommu table, ie, 4K TCEs, 32-bit DMA */ + int tce32_seg; + int tce32_segcount; + struct iommu_table tce32_table; + + /* XXX TODO: Add support for additional 64-bit iommus */ + + /* MSIs. MVE index is identical for for 32 and 64 bit MSI + * and -1 if not supported. (It's actually identical to the + * PE number) + */ + int mve_number; + + /* Link in list of PE#s */ + struct list_head link; +}; + struct pnv_phb { struct pci_controller *hose; enum pnv_phb_type type; @@ -34,6 +78,45 @@ struct pnv_phb { struct { struct iommu_table iommu_table; } p5ioc2; + + struct { + /* Global bridge info */ + unsigned int total_pe; + unsigned int m32_size; + unsigned int m32_segsize; + unsigned int m32_pci_base; + unsigned int io_size; + unsigned int io_segsize; + unsigned int io_pci_base; + + /* PE allocation bitmap */ + unsigned long *pe_alloc; + + /* M32 & IO segment maps */ + unsigned int *m32_segmap; + unsigned int *io_segmap; + struct pnv_ioda_pe *pe_array; + + /* Reverse map of PEs, will have to extend if + * we are to support more than 256 PEs, indexed + * bus { bus, devfn } + */ + unsigned char pe_rmap[0x10000]; + + /* 32-bit TCE tables allocation */ + unsigned long tce32_count; + + /* Total "weight" for the sake of DMA resources + * allocation + */ + unsigned int dma_weight; + unsigned int dma_pe_count; + + /* Sorted list of used PE's, sorted at + * boot for resource allocation purposes + */ + struct list_head pe_list; + } ioda; }; }; @@ -43,6 +126,7 @@ extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl, void *tce_mem, u64 tce_size, u64 dma_offset); extern void pnv_pci_init_p5ioc2_hub(struct device_node *np); +extern void pnv_pci_init_ioda_hub(struct device_node *np); #endif /* __POWERNV_PCI_H */ diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index b719d9709730..c442f2b1980f 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -52,13 +52,42 @@ #include "plpar_wrappers.h" +static void tce_invalidate_pSeries_sw(struct iommu_table *tbl, + u64 *startp, u64 *endp) +{ + u64 __iomem *invalidate = (u64 __iomem *)tbl->it_index; + unsigned long start, end, inc; + + start = __pa(startp); + end = __pa(endp); + inc = L1_CACHE_BYTES; /* invalidate a cacheline of TCEs at a time */ + + /* If this is non-zero, change the format. We shift the + * address and or in the magic from the device tree. */ + if (tbl->it_busno) { + start <<= 12; + end <<= 12; + inc <<= 12; + start |= tbl->it_busno; + end |= tbl->it_busno; + } + + end |= inc - 1; /* round up end to be different than start */ + + mb(); /* Make sure TCEs in memory are written */ + while (start <= end) { + out_be64(invalidate, start); + start += inc; + } +} + static int tce_build_pSeries(struct iommu_table *tbl, long index, long npages, unsigned long uaddr, enum dma_data_direction direction, struct dma_attrs *attrs) { u64 proto_tce; - u64 *tcep; + u64 *tcep, *tces; u64 rpn; proto_tce = TCE_PCI_READ; // Read allowed @@ -66,7 +95,7 @@ static int tce_build_pSeries(struct iommu_table *tbl, long index, if (direction != DMA_TO_DEVICE) proto_tce |= TCE_PCI_WRITE; - tcep = ((u64 *)tbl->it_base) + index; + tces = tcep = ((u64 *)tbl->it_base) + index; while (npages--) { /* can't move this out since we might cross MEMBLOCK boundary */ @@ -76,18 +105,24 @@ static int tce_build_pSeries(struct iommu_table *tbl, long index, uaddr += TCE_PAGE_SIZE; tcep++; } + + if (tbl->it_type == TCE_PCI_SWINV_CREATE) + tce_invalidate_pSeries_sw(tbl, tces, tcep - 1); return 0; } static void tce_free_pSeries(struct iommu_table *tbl, long index, long npages) { - u64 *tcep; + u64 *tcep, *tces; - tcep = ((u64 *)tbl->it_base) + index; + tces = tcep = ((u64 *)tbl->it_base) + index; while (npages--) *(tcep++) = 0; + + if (tbl->it_type == TCE_PCI_SWINV_FREE) + tce_invalidate_pSeries_sw(tbl, tces, tcep - 1); } static unsigned long tce_get_pseries(struct iommu_table *tbl, long index) @@ -425,7 +460,7 @@ static void iommu_table_setparms(struct pci_controller *phb, struct iommu_table *tbl) { struct device_node *node; - const unsigned long *basep; + const unsigned long *basep, *sw_inval; const u32 *sizep; node = phb->dn; @@ -462,6 +497,22 @@ static void iommu_table_setparms(struct pci_controller *phb, tbl->it_index = 0; tbl->it_blocksize = 16; tbl->it_type = TCE_PCI; + + sw_inval = of_get_property(node, "linux,tce-sw-invalidate-info", NULL); + if (sw_inval) { + /* + * This property contains information on how to + * invalidate the TCE entry. The first property is + * the base MMIO address used to invalidate entries. + * The second property tells us the format of the TCE + * invalidate (whether it needs to be shifted) and + * some magic routing info to add to our invalidate + * command. + */ + tbl->it_index = (unsigned long) ioremap(sw_inval[0], 8); + tbl->it_busno = sw_inval[1]; /* overload this with magic */ + tbl->it_type = TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE; + } } /* diff --git a/arch/powerpc/platforms/wsp/Kconfig b/arch/powerpc/platforms/wsp/Kconfig index bd560c786ed6..57d22a2f4ba9 100644 --- a/arch/powerpc/platforms/wsp/Kconfig +++ b/arch/powerpc/platforms/wsp/Kconfig @@ -1,20 +1,28 @@ config PPC_WSP bool select PPC_A2 + select GENERIC_TBSYNC + select PPC_ICSWX select PPC_SCOM select PPC_XICS select PPC_ICP_NATIVE select PCI select PPC_IO_WORKAROUNDS if PCI select PPC_INDIRECT_PIO if PCI + select PPC_WSP_COPRO default n menu "WSP platform selection" depends on PPC_BOOK3E_64 config PPC_PSR2 - bool "PSR-2 platform" - select GENERIC_TBSYNC + bool "PowerEN System Reference Platform 2" + select EPAPR_BOOT + select PPC_WSP + default y + +config PPC_CHROMA + bool "PowerEN PCIe Chroma Card" select EPAPR_BOOT select PPC_WSP default y diff --git a/arch/powerpc/platforms/wsp/Makefile b/arch/powerpc/platforms/wsp/Makefile index a1486b436f02..56817ac98fc9 100644 --- a/arch/powerpc/platforms/wsp/Makefile +++ b/arch/powerpc/platforms/wsp/Makefile @@ -1,8 +1,10 @@ ccflags-y += -mno-minimal-toc -obj-y += setup.o ics.o -obj-$(CONFIG_PPC_PSR2) += psr2.o opb_pic.o +obj-y += setup.o ics.o wsp.o +obj-$(CONFIG_PPC_PSR2) += psr2.o +obj-$(CONFIG_PPC_CHROMA) += chroma.o h8.o +obj-$(CONFIG_PPC_WSP) += opb_pic.o obj-$(CONFIG_PPC_WSP) += scom_wsp.o obj-$(CONFIG_SMP) += smp.o scom_smp.o obj-$(CONFIG_PCI) += wsp_pci.o -obj-$(CONFIG_PCI_MSI) += msi.o
\ No newline at end of file +obj-$(CONFIG_PCI_MSI) += msi.o diff --git a/arch/powerpc/platforms/wsp/chroma.c b/arch/powerpc/platforms/wsp/chroma.c new file mode 100644 index 000000000000..ca6fa26f6e63 --- /dev/null +++ b/arch/powerpc/platforms/wsp/chroma.c @@ -0,0 +1,56 @@ +/* + * Copyright 2008-2011, IBM Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/delay.h> +#include <linux/init.h> +#include <linux/irq.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/of.h> +#include <linux/smp.h> +#include <linux/time.h> + +#include <asm/machdep.h> +#include <asm/system.h> +#include <asm/udbg.h> + +#include "ics.h" +#include "wsp.h" + +void __init chroma_setup_arch(void) +{ + wsp_setup_arch(); + wsp_setup_h8(); + +} + +static int __init chroma_probe(void) +{ + unsigned long root = of_get_flat_dt_root(); + + if (!of_flat_dt_is_compatible(root, "ibm,wsp-chroma")) + return 0; + + return 1; +} + +define_machine(chroma_md) { + .name = "Chroma PCIe", + .probe = chroma_probe, + .setup_arch = chroma_setup_arch, + .restart = wsp_h8_restart, + .power_off = wsp_h8_power_off, + .halt = wsp_halt, + .calibrate_decr = generic_calibrate_decr, + .init_IRQ = wsp_setup_irq, + .progress = udbg_progress, + .power_save = book3e_idle, +}; + +machine_arch_initcall(chroma_md, wsp_probe_devices); diff --git a/arch/powerpc/platforms/wsp/h8.c b/arch/powerpc/platforms/wsp/h8.c new file mode 100644 index 000000000000..d18e6cc19df3 --- /dev/null +++ b/arch/powerpc/platforms/wsp/h8.c @@ -0,0 +1,134 @@ +/* + * Copyright 2008-2011, IBM Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/of.h> +#include <linux/io.h> + +#include "wsp.h" + +/* + * The UART connection to the H8 is over ttyS1 which is just a 16550. + * We assume that FW has it setup right and no one messes with it. + */ + + +static u8 __iomem *h8; + +#define RBR 0 /* Receiver Buffer Register */ +#define THR 0 /* Transmitter Holding Register */ +#define LSR 5 /* Line Status Register */ +#define LSR_DR 0x01 /* LSR value for Data-Ready */ +#define LSR_THRE 0x20 /* LSR value for Transmitter-Holding-Register-Empty */ +static void wsp_h8_putc(int c) +{ + u8 lsr; + + do { + lsr = readb(h8 + LSR); + } while ((lsr & LSR_THRE) != LSR_THRE); + writeb(c, h8 + THR); +} + +static int wsp_h8_getc(void) +{ + u8 lsr; + + do { + lsr = readb(h8 + LSR); + } while ((lsr & LSR_DR) != LSR_DR); + + return readb(h8 + RBR); +} + +static void wsp_h8_puts(const char *s, int sz) +{ + int i; + + for (i = 0; i < sz; i++) { + wsp_h8_putc(s[i]); + + /* no flow control so wait for echo */ + wsp_h8_getc(); + } + wsp_h8_putc('\r'); + wsp_h8_putc('\n'); +} + +static void wsp_h8_terminal_cmd(const char *cmd, int sz) +{ + hard_irq_disable(); + wsp_h8_puts(cmd, sz); + /* should never return, but just in case */ + for (;;) + continue; +} + + +void wsp_h8_restart(char *cmd) +{ + static const char restart[] = "warm-reset"; + + (void)cmd; + wsp_h8_terminal_cmd(restart, sizeof(restart) - 1); +} + +void wsp_h8_power_off(void) +{ + static const char off[] = "power-off"; + + wsp_h8_terminal_cmd(off, sizeof(off) - 1); +} + +static void __iomem *wsp_h8_getaddr(void) +{ + struct device_node *aliases; + struct device_node *uart; + struct property *path; + void __iomem *va = NULL; + + /* + * there is nothing in the devtree to tell us which is mapped + * to the H8, but se know it is the second serial port. + */ + + aliases = of_find_node_by_path("/aliases"); + if (aliases == NULL) + return NULL; + + path = of_find_property(aliases, "serial1", NULL); + if (path == NULL) + goto out; + + uart = of_find_node_by_path(path->value); + if (uart == NULL) + goto out; + + va = of_iomap(uart, 0); + + /* remove it so no one messes with it */ + of_detach_node(uart); + of_node_put(uart); + +out: + of_node_put(aliases); + + return va; +} + +void __init wsp_setup_h8(void) +{ + h8 = wsp_h8_getaddr(); + + /* Devtree change? lets hard map it anyway */ + if (h8 == NULL) { + pr_warn("UART to H8 could not be found"); + h8 = ioremap(0xffc0008000ULL, 0x100); + } +} diff --git a/arch/powerpc/platforms/wsp/opb_pic.c b/arch/powerpc/platforms/wsp/opb_pic.c index be05631a3c1c..19f353dfcd03 100644 --- a/arch/powerpc/platforms/wsp/opb_pic.c +++ b/arch/powerpc/platforms/wsp/opb_pic.c @@ -320,7 +320,8 @@ void __init opb_pic_init(void) } /* Attach opb interrupt handler to new virtual IRQ */ - rc = request_irq(virq, opb_irq_handler, 0, "OPB LS Cascade", opb); + rc = request_irq(virq, opb_irq_handler, IRQF_NO_THREAD, + "OPB LS Cascade", opb); if (rc) { printk("opb: request_irq failed: %d\n", rc); continue; diff --git a/arch/powerpc/platforms/wsp/psr2.c b/arch/powerpc/platforms/wsp/psr2.c index 166f2e4b4bee..0c1ae06d0be1 100644 --- a/arch/powerpc/platforms/wsp/psr2.c +++ b/arch/powerpc/platforms/wsp/psr2.c @@ -14,10 +14,10 @@ #include <linux/mm.h> #include <linux/of.h> #include <linux/smp.h> +#include <linux/time.h> #include <asm/machdep.h> #include <asm/system.h> -#include <asm/time.h> #include <asm/udbg.h> #include "ics.h" @@ -27,7 +27,8 @@ static void psr2_spin(void) { hard_irq_disable(); - for (;;) ; + for (;;) + continue; } static void psr2_restart(char *cmd) @@ -35,65 +36,32 @@ static void psr2_restart(char *cmd) psr2_spin(); } -static int psr2_probe_devices(void) -{ - struct device_node *np; - - /* Our RTC is a ds1500. It seems to be programatically compatible - * with the ds1511 for which we have a driver so let's use that - */ - np = of_find_compatible_node(NULL, NULL, "dallas,ds1500"); - if (np != NULL) { - struct resource res; - if (of_address_to_resource(np, 0, &res) == 0) - platform_device_register_simple("ds1511", 0, &res, 1); - } - return 0; -} -machine_arch_initcall(psr2_md, psr2_probe_devices); - -static void __init psr2_setup_arch(void) -{ - /* init to some ~sane value until calibrate_delay() runs */ - loops_per_jiffy = 50000000; - - scom_init_wsp(); - - /* Setup SMP callback */ -#ifdef CONFIG_SMP - a2_setup_smp(); -#endif -#ifdef CONFIG_PCI - wsp_setup_pci(); -#endif - -} - static int __init psr2_probe(void) { unsigned long root = of_get_flat_dt_root(); + if (of_flat_dt_is_compatible(root, "ibm,wsp-chroma")) { + /* chroma systems also claim they are psr2s */ + return 0; + } + if (!of_flat_dt_is_compatible(root, "ibm,psr2")) return 0; return 1; } -static void __init psr2_init_irq(void) -{ - wsp_init_irq(); - opb_pic_init(); -} - define_machine(psr2_md) { .name = "PSR2 A2", .probe = psr2_probe, - .setup_arch = psr2_setup_arch, + .setup_arch = wsp_setup_arch, .restart = psr2_restart, .power_off = psr2_spin, .halt = psr2_spin, .calibrate_decr = generic_calibrate_decr, - .init_IRQ = psr2_init_irq, + .init_IRQ = wsp_setup_irq, .progress = udbg_progress, .power_save = book3e_idle, }; + +machine_arch_initcall(psr2_md, wsp_probe_devices); diff --git a/arch/powerpc/platforms/wsp/wsp.c b/arch/powerpc/platforms/wsp/wsp.c new file mode 100644 index 000000000000..d25cc96c21b8 --- /dev/null +++ b/arch/powerpc/platforms/wsp/wsp.c @@ -0,0 +1,115 @@ +/* + * Copyright 2008-2011, IBM Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/of.h> +#include <linux/of_device.h> +#include <linux/smp.h> +#include <linux/delay.h> +#include <linux/time.h> + +#include <asm/scom.h> + +#include "wsp.h" +#include "ics.h" + +#define WSP_SOC_COMPATIBLE "ibm,wsp-soc" +#define PBIC_COMPATIBLE "ibm,wsp-pbic" +#define COPRO_COMPATIBLE "ibm,wsp-coprocessor" + +static int __init wsp_probe_buses(void) +{ + static __initdata struct of_device_id bus_ids[] = { + /* + * every node in between needs to be here or you won't + * find it + */ + { .compatible = WSP_SOC_COMPATIBLE, }, + { .compatible = PBIC_COMPATIBLE, }, + { .compatible = COPRO_COMPATIBLE, }, + {}, + }; + of_platform_bus_probe(NULL, bus_ids, NULL); + + return 0; +} + +void __init wsp_setup_arch(void) +{ + /* init to some ~sane value until calibrate_delay() runs */ + loops_per_jiffy = 50000000; + + scom_init_wsp(); + + /* Setup SMP callback */ +#ifdef CONFIG_SMP + a2_setup_smp(); +#endif +#ifdef CONFIG_PCI + wsp_setup_pci(); +#endif +} + +void __init wsp_setup_irq(void) +{ + wsp_init_irq(); + opb_pic_init(); +} + + +int __init wsp_probe_devices(void) +{ + struct device_node *np; + + /* Our RTC is a ds1500. It seems to be programatically compatible + * with the ds1511 for which we have a driver so let's use that + */ + np = of_find_compatible_node(NULL, NULL, "dallas,ds1500"); + if (np != NULL) { + struct resource res; + if (of_address_to_resource(np, 0, &res) == 0) + platform_device_register_simple("ds1511", 0, &res, 1); + } + + wsp_probe_buses(); + + return 0; +} + +void wsp_halt(void) +{ + u64 val; + scom_map_t m; + struct device_node *dn; + struct device_node *mine; + struct device_node *me; + + me = of_get_cpu_node(smp_processor_id(), NULL); + mine = scom_find_parent(me); + + /* This will halt all the A2s but not power off the chip */ + for_each_node_with_property(dn, "scom-controller") { + if (dn == mine) + continue; + m = scom_map(dn, 0, 1); + + /* read-modify-write it so the HW probe does not get + * confused */ + val = scom_read(m, 0); + val |= 1; + scom_write(m, 0, val); + scom_unmap(m); + } + m = scom_map(mine, 0, 1); + val = scom_read(m, 0); + val |= 1; + scom_write(m, 0, val); + /* should never return */ + scom_unmap(m); +} diff --git a/arch/powerpc/platforms/wsp/wsp.h b/arch/powerpc/platforms/wsp/wsp.h index 33479818f62a..10c1d1fff362 100644 --- a/arch/powerpc/platforms/wsp/wsp.h +++ b/arch/powerpc/platforms/wsp/wsp.h @@ -6,15 +6,25 @@ /* Devtree compatible strings for major devices */ #define PCIE_COMPATIBLE "ibm,wsp-pciex" +extern void wsp_setup_arch(void); +extern void wsp_setup_irq(void); +extern int wsp_probe_devices(void); +extern void wsp_halt(void); + extern void wsp_setup_pci(void); extern void scom_init_wsp(void); extern void a2_setup_smp(void); extern int a2_scom_startup_cpu(unsigned int lcpu, int thr_idx, struct device_node *np); -int smp_a2_cpu_bootable(unsigned int nr); -int __devinit smp_a2_kick_cpu(int nr); +extern int smp_a2_cpu_bootable(unsigned int nr); +extern int __devinit smp_a2_kick_cpu(int nr); + +extern void opb_pic_init(void); -void opb_pic_init(void); +/* chroma specific managment */ +extern void wsp_h8_restart(char *cmd); +extern void wsp_h8_power_off(void); +extern void __init wsp_setup_h8(void); #endif /* __WSP_H */ diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c index 8c7e8528e7c4..b3fa3d7d4ab6 100644 --- a/arch/powerpc/sysdev/mpic.c +++ b/arch/powerpc/sysdev/mpic.c @@ -901,7 +901,7 @@ int mpic_set_irq_type(struct irq_data *d, unsigned int flow_type) if (vold != vnew) mpic_irq_write(src, MPIC_INFO(IRQ_VECTOR_PRI), vnew); - return IRQ_SET_MASK_OK_NOCOPY;; + return IRQ_SET_MASK_OK_NOCOPY; } void mpic_set_vector(unsigned int virq, unsigned int vector) diff --git a/arch/powerpc/sysdev/xics/xics-common.c b/arch/powerpc/sysdev/xics/xics-common.c index 63762c672a03..d72eda6a4c05 100644 --- a/arch/powerpc/sysdev/xics/xics-common.c +++ b/arch/powerpc/sysdev/xics/xics-common.c @@ -137,7 +137,7 @@ static void xics_request_ipi(void) * IPIs are marked IRQF_PERCPU. The handler was set in map. */ BUG_ON(request_irq(ipi, icp_ops->ipi_action, - IRQF_PERCPU, "IPI", NULL)); + IRQF_PERCPU | IRQF_NO_THREAD, "IPI", NULL)); } int __init xics_smp_probe(void) diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 03a217ae3be0..cb95eea74d3d 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -228,13 +228,11 @@ Commands:\n\ t print backtrace\n\ x exit monitor and recover\n\ X exit monitor and dont recover\n" -#ifdef CONFIG_PPC64 +#if defined(CONFIG_PPC64) && !defined(CONFIG_PPC_BOOK3E) " u dump segment table or SLB\n" -#endif -#ifdef CONFIG_PPC_STD_MMU_32 +#elif defined(CONFIG_PPC_STD_MMU_32) " u dump segment registers\n" -#endif -#ifdef CONFIG_44x +#elif defined(CONFIG_44x) || defined(CONFIG_PPC_BOOK3E) " u dump TLB\n" #endif " ? help\n" @@ -340,7 +338,7 @@ int cpus_are_in_xmon(void) static inline int unrecoverable_excp(struct pt_regs *regs) { -#if defined(CONFIG_4xx) || defined(CONFIG_BOOK3E) +#if defined(CONFIG_4xx) || defined(CONFIG_PPC_BOOK3E) /* We have no MSR_RI bit on 4xx or Book3e, so we simply return false */ return 0; #else @@ -885,13 +883,11 @@ cmds(struct pt_regs *excp) case 'u': dump_segments(); break; -#endif -#ifdef CONFIG_4xx +#elif defined(CONFIG_4xx) case 'u': dump_tlb_44x(); break; -#endif -#ifdef CONFIG_PPC_BOOK3E +#elif defined(CONFIG_PPC_BOOK3E) case 'u': dump_tlb_book3e(); break; |