diff options
Diffstat (limited to 'arch/powerpc/platforms/pseries')
54 files changed, 8782 insertions, 2666 deletions
diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig index 595e9f8a6539..afc0f6a61337 100644 --- a/arch/powerpc/platforms/pseries/Kconfig +++ b/arch/powerpc/platforms/pseries/Kconfig @@ -7,6 +7,7 @@ config PPC_PSERIES select OF_DYNAMIC select FORCE_PCI select PCI_MSI + select GENERIC_ALLOCATOR select PPC_XICS select PPC_XIVE_SPAPR select PPC_ICP_NATIVE @@ -17,24 +18,37 @@ config PPC_PSERIES select PPC_RTAS_DAEMON select RTAS_ERROR_LOGGING select PPC_UDBG_16550 - select PPC_NATIVE select PPC_DOORBELL select HOTPLUG_CPU - select ARCH_RANDOM - select PPC_DOORBELL select FORCE_SMP select SWIOTLB + select ARCH_SUPPORTS_PER_VMA_LOCK default y +config PARAVIRT + bool + +config PARAVIRT_SPINLOCKS + bool + +config PARAVIRT_TIME_ACCOUNTING + select PARAVIRT + bool + config PPC_SPLPAR - depends on PPC_PSERIES bool "Support for shared-processor logical partitions" + depends on PPC_PSERIES + select PARAVIRT_SPINLOCKS if PPC_QUEUED_SPINLOCKS + select PARAVIRT_TIME_ACCOUNTING if VIRT_CPU_ACCOUNTING_GEN + default y help Enabling this option will make the kernel run more efficiently on logically-partitioned pSeries systems which use shared processors, that is, which share physical processors between two or more partitions. + Say Y if you are unsure. + config DTL bool "Dispatch Trace Log" depends on PPC_SPLPAR && DEBUG_FS @@ -55,10 +69,6 @@ config PSERIES_ENERGY Provides: /sys/devices/system/cpu/pseries_(de)activation_hint_list and /sys/devices/system/cpu/cpuN/pseries_(de)activation_hint -config SCANLOG - tristate "Scanlog dump interface" - depends on RTAS_PROC && PPC_PSERIES - config IO_EVENT_IRQ bool "IO Event Interrupt support" depends on PPC_PSERIES @@ -141,6 +151,25 @@ config IBMEBUS help Bus device driver for GX bus based adapters. +config PSERIES_PLPKS + depends on PPC_PSERIES + select NLS + bool + # PowerVM provides an isolated Platform Keystore (PKS) storage + # allocation for each LPAR with individually managed access + # controls to store sensitive information securely. It can be + # used to store asymmetric public keys or secrets as required + # by different usecases. + # + # This option is selected by in-kernel consumers that require + # access to the PKS. + +config PSERIES_PLPKS_SED + depends on PPC_PSERIES + bool + # This option is selected by in-kernel consumers that require + # access to the SED PKS keystore. + config PAPR_SCM depends on PPC_PSERIES && MEMORY_HOTPLUG && LIBNVDIMM tristate "Support for the PAPR Storage Class Memory interface" @@ -153,6 +182,7 @@ config PPC_SVM select SWIOTLB select ARCH_HAS_MEM_ENCRYPT select ARCH_HAS_FORCE_DMA_UNENCRYPTED + select ARCH_HAS_CC_PLATFORM help There are certain POWER platforms which support secure guests using the Protected Execution Facility, with the help of an Ultravisor diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile index a3c74a5cf20d..f936962a2946 100644 --- a/arch/powerpc/platforms/pseries/Makefile +++ b/arch/powerpc/platforms/pseries/Makefile @@ -3,12 +3,13 @@ ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) ccflags-$(CONFIG_PPC_PSERIES_DEBUG) += -DDEBUG obj-y := lpar.o hvCall.o nvram.o reconfig.o \ - of_helpers.o \ + of_helpers.o rtas-work-area.o papr-sysparm.o \ + papr-vpd.o \ setup.o iommu.o event_sources.o ras.o \ firmware.o power.o dlpar.o mobility.o rng.o \ - pci.o pci_dlpar.o eeh_pseries.o msi.o + pci.o pci_dlpar.o eeh_pseries.o msi.o \ + papr_platform_attributes.o dtl.o obj-$(CONFIG_SMP) += smp.o -obj-$(CONFIG_SCANLOG) += scanlog.o obj-$(CONFIG_KEXEC_CORE) += kexec.o obj-$(CONFIG_PSERIES_ENERGY) += pseries_energy.o @@ -19,7 +20,6 @@ obj-$(CONFIG_HVC_CONSOLE) += hvconsole.o obj-$(CONFIG_HVCS) += hvcserver.o obj-$(CONFIG_HCALL_STATS) += hvCall_inst.o obj-$(CONFIG_CMM) += cmm.o -obj-$(CONFIG_DTL) += dtl.o obj-$(CONFIG_IO_EVENT_IRQ) += io_event_irq.o obj-$(CONFIG_LPARCFG) += lparcfg.o obj-$(CONFIG_IBMVIO) += vio.o @@ -28,7 +28,14 @@ obj-$(CONFIG_PAPR_SCM) += papr_scm.o obj-$(CONFIG_PPC_SPLPAR) += vphn.o obj-$(CONFIG_PPC_SVM) += svm.o obj-$(CONFIG_FA_DUMP) += rtas-fadump.o - -ifdef CONFIG_PPC_PSERIES +obj-$(CONFIG_PSERIES_PLPKS) += plpks.o +obj-$(CONFIG_PPC_SECURE_BOOT) += plpks-secvar.o +obj-$(CONFIG_PSERIES_PLPKS_SED) += plpks_sed_ops.o obj-$(CONFIG_SUSPEND) += suspend.o -endif +obj-$(CONFIG_PPC_VAS) += vas.o vas-sysfs.o + +obj-$(CONFIG_ARCH_HAS_CC_PLATFORM) += cc_platform.o + +# nothing that operates in real mode is safe for KASAN +KASAN_SANITIZE_ras.o := n +KASAN_SANITIZE_kexec.o := n diff --git a/arch/powerpc/platforms/pseries/cc_platform.c b/arch/powerpc/platforms/pseries/cc_platform.c new file mode 100644 index 000000000000..e8021af83a19 --- /dev/null +++ b/arch/powerpc/platforms/pseries/cc_platform.c @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Confidential Computing Platform Capability checks + * + * Copyright (C) 2021 Advanced Micro Devices, Inc. + * + * Author: Tom Lendacky <thomas.lendacky@amd.com> + */ + +#include <linux/export.h> +#include <linux/cc_platform.h> + +#include <asm/machdep.h> +#include <asm/svm.h> + +bool cc_platform_has(enum cc_attr attr) +{ + switch (attr) { + case CC_ATTR_MEM_ENCRYPT: + return is_secure_guest(); + + default: + return false; + } +} +EXPORT_SYMBOL_GPL(cc_platform_has); diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c index 91571841df8a..5f4037c1d7fe 100644 --- a/arch/powerpc/platforms/pseries/cmm.c +++ b/arch/powerpc/platforms/pseries/cmm.c @@ -19,14 +19,10 @@ #include <linux/stringify.h> #include <linux/swap.h> #include <linux/device.h> -#include <linux/mount.h> -#include <linux/pseudo_fs.h> -#include <linux/magic.h> #include <linux/balloon_compaction.h> #include <asm/firmware.h> #include <asm/hvcall.h> #include <asm/mmu.h> -#include <asm/pgalloc.h> #include <linux/uaccess.h> #include <linux/memory.h> #include <asm/plpar_wrappers.h> @@ -476,8 +472,6 @@ static struct notifier_block cmm_reboot_nb = { static int cmm_memory_cb(struct notifier_block *self, unsigned long action, void *arg) { - int ret = 0; - switch (action) { case MEM_GOING_OFFLINE: mutex_lock(&hotplug_mutex); @@ -494,7 +488,7 @@ static int cmm_memory_cb(struct notifier_block *self, break; } - return notifier_from_errno(ret); + return NOTIFY_OK; } static struct notifier_block cmm_mem_nb = { @@ -503,19 +497,6 @@ static struct notifier_block cmm_mem_nb = { }; #ifdef CONFIG_BALLOON_COMPACTION -static struct vfsmount *balloon_mnt; - -static int cmm_init_fs_context(struct fs_context *fc) -{ - return init_pseudo(fc, PPC_CMM_MAGIC) ? 0 : -ENOMEM; -} - -static struct file_system_type balloon_fs = { - .name = "ppc-cmm", - .init_fs_context = cmm_init_fs_context, - .kill_sb = kill_anon_super, -}; - static int cmm_migratepage(struct balloon_dev_info *b_dev_info, struct page *newpage, struct page *page, enum migrate_mode mode) @@ -539,6 +520,16 @@ static int cmm_migratepage(struct balloon_dev_info *b_dev_info, /* balloon page list reference */ get_page(newpage); + /* + * When we migrate a page to a different zone, we have to fixup the + * count of both involved zones as we adjusted the managed page count + * when inflating. + */ + if (page_zone(page) != page_zone(newpage)) { + adjust_managed_page_count(page, 1); + adjust_managed_page_count(newpage, -1); + } + spin_lock_irqsave(&b_dev_info->pages_lock, flags); balloon_page_insert(b_dev_info, newpage); balloon_page_delete(page); @@ -557,47 +548,13 @@ static int cmm_migratepage(struct balloon_dev_info *b_dev_info, return MIGRATEPAGE_SUCCESS; } -static int cmm_balloon_compaction_init(void) +static void cmm_balloon_compaction_init(void) { - int rc; - balloon_devinfo_init(&b_dev_info); b_dev_info.migratepage = cmm_migratepage; - - balloon_mnt = kern_mount(&balloon_fs); - if (IS_ERR(balloon_mnt)) { - rc = PTR_ERR(balloon_mnt); - balloon_mnt = NULL; - return rc; - } - - b_dev_info.inode = alloc_anon_inode(balloon_mnt->mnt_sb); - if (IS_ERR(b_dev_info.inode)) { - rc = PTR_ERR(b_dev_info.inode); - b_dev_info.inode = NULL; - kern_unmount(balloon_mnt); - balloon_mnt = NULL; - return rc; - } - - b_dev_info.inode->i_mapping->a_ops = &balloon_aops; - return 0; -} -static void cmm_balloon_compaction_deinit(void) -{ - if (b_dev_info.inode) - iput(b_dev_info.inode); - b_dev_info.inode = NULL; - kern_unmount(balloon_mnt); - balloon_mnt = NULL; } #else /* CONFIG_BALLOON_COMPACTION */ -static int cmm_balloon_compaction_init(void) -{ - return 0; -} - -static void cmm_balloon_compaction_deinit(void) +static void cmm_balloon_compaction_init(void) { } #endif /* CONFIG_BALLOON_COMPACTION */ @@ -615,9 +572,7 @@ static int cmm_init(void) if (!firmware_has_feature(FW_FEATURE_CMO) && !simulate) return -EOPNOTSUPP; - rc = cmm_balloon_compaction_init(); - if (rc) - return rc; + cmm_balloon_compaction_init(); rc = register_oom_notifier(&cmm_oom_nb); if (rc < 0) @@ -651,7 +606,6 @@ out_reboot_notifier: out_oom_notifier: unregister_oom_notifier(&cmm_oom_nb); out_balloon_compaction: - cmm_balloon_compaction_deinit(); return rc; } @@ -670,7 +624,6 @@ static void cmm_exit(void) unregister_memory_notifier(&cmm_mem_nb); cmm_free_pages(atomic_long_read(&loaned_pages)); cmm_unregister_sysfs(&cmm_dev); - cmm_balloon_compaction_deinit(); } /** diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c index 16e86ba8aa20..47f8eabd1bee 100644 --- a/arch/powerpc/platforms/pseries/dlpar.c +++ b/arch/powerpc/platforms/pseries/dlpar.c @@ -19,10 +19,10 @@ #include "of_helpers.h" #include "pseries.h" -#include <asm/prom.h> #include <asm/machdep.h> #include <linux/uaccess.h> #include <asm/rtas.h> +#include <asm/rtas-work-area.h> static struct workqueue_struct *pseries_hp_wq; @@ -127,7 +127,6 @@ void dlpar_free_cc_nodes(struct device_node *dn) #define NEXT_PROPERTY 3 #define PREV_PARENT 4 #define MORE_MEMORY 5 -#define CALL_AGAIN -2 #define ERR_CFG_USE -9003 struct device_node *dlpar_configure_connector(__be32 drc_index, @@ -139,34 +138,27 @@ struct device_node *dlpar_configure_connector(__be32 drc_index, struct property *property; struct property *last_property = NULL; struct cc_workarea *ccwa; + struct rtas_work_area *work_area; char *data_buf; int cc_token; int rc = -1; - cc_token = rtas_token("ibm,configure-connector"); + cc_token = rtas_function_token(RTAS_FN_IBM_CONFIGURE_CONNECTOR); if (cc_token == RTAS_UNKNOWN_SERVICE) return NULL; - data_buf = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL); - if (!data_buf) - return NULL; + work_area = rtas_work_area_alloc(SZ_4K); + data_buf = rtas_work_area_raw_buf(work_area); ccwa = (struct cc_workarea *)&data_buf[0]; ccwa->drc_index = drc_index; ccwa->zero = 0; do { - /* Since we release the rtas_data_buf lock between configure - * connector calls we want to re-populate the rtas_data_buffer - * with the contents of the previous call. - */ - spin_lock(&rtas_data_buf_lock); - - memcpy(rtas_data_buf, data_buf, RTAS_DATA_BUF_SIZE); - rc = rtas_call(cc_token, 2, 1, NULL, rtas_data_buf, NULL); - memcpy(data_buf, rtas_data_buf, RTAS_DATA_BUF_SIZE); - - spin_unlock(&rtas_data_buf_lock); + do { + rc = rtas_call(cc_token, 2, 1, NULL, + rtas_work_area_phys(work_area), NULL); + } while (rtas_busy_delay(rc)); switch (rc) { case COMPLETE: @@ -216,9 +208,6 @@ struct device_node *dlpar_configure_connector(__be32 drc_index, last_dn = last_dn->parent; break; - case CALL_AGAIN: - break; - case MORE_MEMORY: case ERR_CFG_USE: default: @@ -229,7 +218,7 @@ struct device_node *dlpar_configure_connector(__be32 drc_index, } while (rc); cc_error: - kfree(data_buf); + rtas_work_area_free(work_area); if (rc) { if (first_dn) @@ -290,8 +279,7 @@ int dlpar_acquire_drc(u32 drc_index) { int dr_status, rc; - rc = rtas_call(rtas_token("get-sensor-state"), 2, 2, &dr_status, - DR_ENTITY_SENSE, drc_index); + rc = rtas_get_sensor(DR_ENTITY_SENSE, drc_index, &dr_status); if (rc || dr_status != DR_ENTITY_UNUSABLE) return -1; @@ -312,8 +300,7 @@ int dlpar_release_drc(u32 drc_index) { int dr_status, rc; - rc = rtas_call(rtas_token("get-sensor-state"), 2, 2, &dr_status, - DR_ENTITY_SENSE, drc_index); + rc = rtas_get_sensor(DR_ENTITY_SENSE, drc_index, &dr_status); if (rc || dr_status != DR_ENTITY_PRESENT) return -1; @@ -330,6 +317,19 @@ int dlpar_release_drc(u32 drc_index) return 0; } +int dlpar_unisolate_drc(u32 drc_index) +{ + int dr_status, rc; + + rc = rtas_get_sensor(DR_ENTITY_SENSE, drc_index, &dr_status); + if (rc || dr_status != DR_ENTITY_PRESENT) + return -1; + + rtas_set_indicator(ISOLATION_STATE, drc_index, UNISOLATE); + + return 0; +} + int handle_dlpar_errorlog(struct pseries_hp_errorlog *hp_elog) { int rc; @@ -379,7 +379,7 @@ static void pseries_hp_work_fn(struct work_struct *work) handle_dlpar_errorlog(hp_work->errlog); kfree(hp_work->errlog); - kfree((void *)work); + kfree(work); } void queue_hotplug_event(struct pseries_hp_errorlog *hp_errlog) @@ -512,7 +512,7 @@ static int dlpar_parse_id_type(char **cmd, struct pseries_hp_errorlog *hp_elog) return 0; } -static ssize_t dlpar_store(struct class *class, struct class_attribute *attr, +static ssize_t dlpar_store(const struct class *class, const struct class_attribute *attr, const char *buf, size_t count) { struct pseries_hp_errorlog hp_elog; @@ -521,11 +521,8 @@ static ssize_t dlpar_store(struct class *class, struct class_attribute *attr, int rc; args = argbuf = kstrdup(buf, GFP_KERNEL); - if (!argbuf) { - pr_info("Could not allocate resources for DLPAR operation\n"); - kfree(argbuf); + if (!argbuf) return -ENOMEM; - } /* * Parse out the request from the user, this will be in the form: @@ -554,7 +551,7 @@ dlpar_store_out: return rc ? rc : count; } -static ssize_t dlpar_show(struct class *class, struct class_attribute *attr, +static ssize_t dlpar_show(const struct class *class, const struct class_attribute *attr, char *buf) { return sprintf(buf, "%s\n", "memory,cpu"); @@ -567,8 +564,7 @@ int __init dlpar_workqueue_init(void) if (pseries_hp_wq) return 0; - pseries_hp_wq = alloc_workqueue("pseries hotplug workqueue", - WQ_UNBOUND, 1); + pseries_hp_wq = alloc_ordered_workqueue("pseries hotplug workqueue", 0); return pseries_hp_wq ? 0 : -ENOMEM; } diff --git a/arch/powerpc/platforms/pseries/dtl.c b/arch/powerpc/platforms/pseries/dtl.c index eab8aa293743..3f1cdccebc9c 100644 --- a/arch/powerpc/platforms/pseries/dtl.c +++ b/arch/powerpc/platforms/pseries/dtl.c @@ -11,12 +11,14 @@ #include <linux/spinlock.h> #include <asm/smp.h> #include <linux/uaccess.h> +#include <linux/debugfs.h> #include <asm/firmware.h> +#include <asm/dtl.h> #include <asm/lppaca.h> -#include <asm/debugfs.h> #include <asm/plpar_wrappers.h> #include <asm/machdep.h> +#ifdef CONFIG_DTL struct dtl { struct dtl_entry *buf; int cpu; @@ -36,6 +38,15 @@ static u8 dtl_event_mask = DTL_LOG_ALL; static int dtl_buf_entries = N_DISPATCH_LOG; #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE + +/* + * When CONFIG_VIRT_CPU_ACCOUNTING_NATIVE = y, the cpu accounting code controls + * reading from the dispatch trace log. If other code wants to consume + * DTL entries, it can set this pointer to a function that will get + * called once for each DTL entry that gets processed. + */ +static void (*dtl_consumer)(struct dtl_entry *entry, u64 index); + struct dtl_ring { u64 write_index; struct dtl_entry *write_ptr; @@ -337,7 +348,7 @@ static int dtl_init(void) /* set up common debugfs structure */ - dtl_dir = debugfs_create_dir("dtl", powerpc_debugfs_root); + dtl_dir = debugfs_create_dir("dtl", arch_debugfs_dir); debugfs_create_x8("dtl_event_mask", 0600, dtl_dir, &dtl_event_mask); debugfs_create_u32("dtl_buf_entries", 0400, dtl_dir, &dtl_buf_entries); @@ -354,3 +365,81 @@ static int dtl_init(void) return 0; } machine_arch_initcall(pseries, dtl_init); +#endif /* CONFIG_DTL */ + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE +/* + * Scan the dispatch trace log and count up the stolen time. + * Should be called with interrupts disabled. + */ +static notrace u64 scan_dispatch_log(u64 stop_tb) +{ + u64 i = local_paca->dtl_ridx; + struct dtl_entry *dtl = local_paca->dtl_curr; + struct dtl_entry *dtl_end = local_paca->dispatch_log_end; + struct lppaca *vpa = local_paca->lppaca_ptr; + u64 tb_delta; + u64 stolen = 0; + u64 dtb; + + if (!dtl) + return 0; + + if (i == be64_to_cpu(vpa->dtl_idx)) + return 0; + while (i < be64_to_cpu(vpa->dtl_idx)) { + dtb = be64_to_cpu(dtl->timebase); + tb_delta = be32_to_cpu(dtl->enqueue_to_dispatch_time) + + be32_to_cpu(dtl->ready_to_enqueue_time); + barrier(); + if (i + N_DISPATCH_LOG < be64_to_cpu(vpa->dtl_idx)) { + /* buffer has overflowed */ + i = be64_to_cpu(vpa->dtl_idx) - N_DISPATCH_LOG; + dtl = local_paca->dispatch_log + (i % N_DISPATCH_LOG); + continue; + } + if (dtb > stop_tb) + break; +#ifdef CONFIG_DTL + if (dtl_consumer) + dtl_consumer(dtl, i); +#endif + stolen += tb_delta; + ++i; + ++dtl; + if (dtl == dtl_end) + dtl = local_paca->dispatch_log; + } + local_paca->dtl_ridx = i; + local_paca->dtl_curr = dtl; + return stolen; +} + +/* + * Accumulate stolen time by scanning the dispatch trace log. + * Called on entry from user mode. + */ +void notrace pseries_accumulate_stolen_time(void) +{ + u64 sst, ust; + struct cpu_accounting_data *acct = &local_paca->accounting; + + sst = scan_dispatch_log(acct->starttime_user); + ust = scan_dispatch_log(acct->starttime); + acct->stime -= sst; + acct->utime -= ust; + acct->steal_time += ust + sst; +} + +u64 pseries_calculate_stolen_time(u64 stop_tb) +{ + if (!firmware_has_feature(FW_FEATURE_SPLPAR)) + return 0; + + if (get_paca()->dtl_ridx != be64_to_cpu(get_lppaca()->dtl_idx)) + return scan_dispatch_log(stop_tb); + + return 0; +} + +#endif diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c index 893ba3f562c4..b1ae0c0d1187 100644 --- a/arch/powerpc/platforms/pseries/eeh_pseries.c +++ b/arch/powerpc/platforms/pseries/eeh_pseries.c @@ -24,6 +24,7 @@ #include <linux/sched.h> #include <linux/seq_file.h> #include <linux/spinlock.h> +#include <linux/crash_dump.h> #include <asm/eeh.h> #include <asm/eeh_event.h> @@ -42,7 +43,9 @@ static int ibm_get_config_addr_info; static int ibm_get_config_addr_info2; static int ibm_configure_pe; -void pseries_pcibios_bus_add_device(struct pci_dev *pdev) +static void pseries_eeh_init_edev(struct pci_dn *pdn); + +static void pseries_pcibios_bus_add_device(struct pci_dev *pdev) { struct pci_dn *pdn = pci_get_pdn(pdev); @@ -52,8 +55,6 @@ void pseries_pcibios_bus_add_device(struct pci_dev *pdev) dev_dbg(&pdev->dev, "EEH: Setting up device\n"); #ifdef CONFIG_PCI_IOV if (pdev->is_virtfn) { - struct pci_dn *physfn_pdn; - pdn->device_id = pdev->device; pdn->vendor_id = pdev->vendor; pdn->class_code = pdev->class; @@ -63,95 +64,187 @@ void pseries_pcibios_bus_add_device(struct pci_dev *pdev) * completion from platform. */ pdn->last_allow_rc = 0; - physfn_pdn = pci_get_pdn(pdev->physfn); - pdn->pe_number = physfn_pdn->pe_num_map[pdn->vf_index]; } #endif - eeh_add_device_early(pdn); - eeh_add_device_late(pdev); + pseries_eeh_init_edev(pdn); #ifdef CONFIG_PCI_IOV if (pdev->is_virtfn) { + /* + * FIXME: This really should be handled by choosing the right + * parent PE in pseries_eeh_init_edev(). + */ + struct eeh_pe *physfn_pe = pci_dev_to_eeh_dev(pdev->physfn)->pe; struct eeh_dev *edev = pdn_to_eeh_dev(pdn); edev->pe_config_addr = (pdn->busno << 16) | (pdn->devfn << 8); - eeh_rmv_from_parent_pe(edev); /* Remove as it is adding to bus pe */ - eeh_add_to_parent_pe(edev); /* Add as VF PE type */ + eeh_pe_tree_remove(edev); /* Remove as it is adding to bus pe */ + eeh_pe_tree_insert(edev, physfn_pe); /* Add as VF PE type */ } #endif - eeh_sysfs_add_device(pdev); + eeh_probe_device(pdev); } -/* - * Buffer for reporting slot-error-detail rtas calls. Its here - * in BSS, and not dynamically alloced, so that it ends up in - * RMO where RTAS can access it. - */ -static unsigned char slot_errbuf[RTAS_ERROR_LOG_MAX]; -static DEFINE_SPINLOCK(slot_errbuf_lock); -static int eeh_error_buf_size; /** - * pseries_eeh_init - EEH platform dependent initialization + * pseries_eeh_get_pe_config_addr - Find the pe_config_addr for a device + * @pdn: pci_dn of the input device + * + * The EEH RTAS calls use a tuple consisting of: (buid_hi, buid_lo, + * pe_config_addr) as a handle to a given PE. This function finds the + * pe_config_addr based on the device's config addr. + * + * Keep in mind that the pe_config_addr *might* be numerically identical to the + * device's config addr, but the two are conceptually distinct. * - * EEH platform dependent initialization on pseries. + * Returns the pe_config_addr, or a negative error code. */ -static int pseries_eeh_init(void) +static int pseries_eeh_get_pe_config_addr(struct pci_dn *pdn) { - /* figure out EEH RTAS function call tokens */ - ibm_set_eeh_option = rtas_token("ibm,set-eeh-option"); - ibm_set_slot_reset = rtas_token("ibm,set-slot-reset"); - ibm_read_slot_reset_state2 = rtas_token("ibm,read-slot-reset-state2"); - ibm_read_slot_reset_state = rtas_token("ibm,read-slot-reset-state"); - ibm_slot_error_detail = rtas_token("ibm,slot-error-detail"); - ibm_get_config_addr_info2 = rtas_token("ibm,get-config-addr-info2"); - ibm_get_config_addr_info = rtas_token("ibm,get-config-addr-info"); - ibm_configure_pe = rtas_token("ibm,configure-pe"); + int config_addr = rtas_config_addr(pdn->busno, pdn->devfn, 0); + struct pci_controller *phb = pdn->phb; + int ret, rets[3]; - /* - * ibm,configure-pe and ibm,configure-bridge have the same semantics, - * however ibm,configure-pe can be faster. If we can't find - * ibm,configure-pe then fall back to using ibm,configure-bridge. - */ - if (ibm_configure_pe == RTAS_UNKNOWN_SERVICE) - ibm_configure_pe = rtas_token("ibm,configure-bridge"); + if (ibm_get_config_addr_info2 != RTAS_UNKNOWN_SERVICE) { + /* + * First of all, use function 1 to determine if this device is + * part of a PE or not. ret[0] being zero indicates it's not. + */ + ret = rtas_call(ibm_get_config_addr_info2, 4, 2, rets, + config_addr, BUID_HI(phb->buid), + BUID_LO(phb->buid), 1); + if (ret || (rets[0] == 0)) + return -ENOENT; + + /* Retrieve the associated PE config address with function 0 */ + ret = rtas_call(ibm_get_config_addr_info2, 4, 2, rets, + config_addr, BUID_HI(phb->buid), + BUID_LO(phb->buid), 0); + if (ret) { + pr_warn("%s: Failed to get address for PHB#%x-PE#%x\n", + __func__, phb->global_number, config_addr); + return -ENXIO; + } + + return rets[0]; + } + + if (ibm_get_config_addr_info != RTAS_UNKNOWN_SERVICE) { + ret = rtas_call(ibm_get_config_addr_info, 4, 2, rets, + config_addr, BUID_HI(phb->buid), + BUID_LO(phb->buid), 0); + if (ret) { + pr_warn("%s: Failed to get address for PHB#%x-PE#%x\n", + __func__, phb->global_number, config_addr); + return -ENXIO; + } + + return rets[0]; + } /* - * Necessary sanity check. We needn't check "get-config-addr-info" - * and its variant since the old firmware probably support address - * of domain/bus/slot/function for EEH RTAS operations. + * PAPR does describe a process for finding the pe_config_addr that was + * used before the ibm,get-config-addr-info calls were added. However, + * I haven't found *any* systems that don't have that RTAS call + * implemented. If you happen to find one that needs the old DT based + * process, patches are welcome! */ - if (ibm_set_eeh_option == RTAS_UNKNOWN_SERVICE || - ibm_set_slot_reset == RTAS_UNKNOWN_SERVICE || - (ibm_read_slot_reset_state2 == RTAS_UNKNOWN_SERVICE && - ibm_read_slot_reset_state == RTAS_UNKNOWN_SERVICE) || - ibm_slot_error_detail == RTAS_UNKNOWN_SERVICE || - ibm_configure_pe == RTAS_UNKNOWN_SERVICE) { - pr_info("EEH functionality not supported\n"); - return -EINVAL; - } + return -ENOENT; +} - /* Initialize error log lock and size */ - spin_lock_init(&slot_errbuf_lock); - eeh_error_buf_size = rtas_token("rtas-error-log-max"); - if (eeh_error_buf_size == RTAS_UNKNOWN_SERVICE) { - pr_info("%s: unknown EEH error log size\n", - __func__); - eeh_error_buf_size = 1024; - } else if (eeh_error_buf_size > RTAS_ERROR_LOG_MAX) { - pr_info("%s: EEH error log size %d exceeds the maximal %d\n", - __func__, eeh_error_buf_size, RTAS_ERROR_LOG_MAX); - eeh_error_buf_size = RTAS_ERROR_LOG_MAX; +/** + * pseries_eeh_phb_reset - Reset the specified PHB + * @phb: PCI controller + * @config_addr: the associated config address + * @option: reset option + * + * Reset the specified PHB/PE + */ +static int pseries_eeh_phb_reset(struct pci_controller *phb, int config_addr, int option) +{ + int ret; + + /* Reset PE through RTAS call */ + ret = rtas_call(ibm_set_slot_reset, 4, 1, NULL, + config_addr, BUID_HI(phb->buid), + BUID_LO(phb->buid), option); + + /* If fundamental-reset not supported, try hot-reset */ + if (option == EEH_RESET_FUNDAMENTAL && ret == -8) { + option = EEH_RESET_HOT; + ret = rtas_call(ibm_set_slot_reset, 4, 1, NULL, + config_addr, BUID_HI(phb->buid), + BUID_LO(phb->buid), option); } - /* Set EEH probe mode */ - eeh_add_flag(EEH_PROBE_MODE_DEVTREE | EEH_ENABLE_IO_FOR_LOG); + /* We need reset hold or settlement delay */ + if (option == EEH_RESET_FUNDAMENTAL || option == EEH_RESET_HOT) + msleep(EEH_PE_RST_HOLD_TIME); + else + msleep(EEH_PE_RST_SETTLE_TIME); - /* Set EEH machine dependent code */ - ppc_md.pcibios_bus_add_device = pseries_pcibios_bus_add_device; + return ret; +} - return 0; +/** + * pseries_eeh_phb_configure_bridge - Configure PCI bridges in the indicated PE + * @phb: PCI controller + * @config_addr: the associated config address + * + * The function will be called to reconfigure the bridges included + * in the specified PE so that the mulfunctional PE would be recovered + * again. + */ +static int pseries_eeh_phb_configure_bridge(struct pci_controller *phb, int config_addr) +{ + int ret; + /* Waiting 0.2s maximum before skipping configuration */ + int max_wait = 200; + + while (max_wait > 0) { + ret = rtas_call(ibm_configure_pe, 3, 1, NULL, + config_addr, BUID_HI(phb->buid), + BUID_LO(phb->buid)); + + if (!ret) + return ret; + if (ret < 0) + break; + + /* + * If RTAS returns a delay value that's above 100ms, cut it + * down to 100ms in case firmware made a mistake. For more + * on how these delay values work see rtas_busy_delay_time + */ + if (ret > RTAS_EXTENDED_DELAY_MIN+2 && + ret <= RTAS_EXTENDED_DELAY_MAX) + ret = RTAS_EXTENDED_DELAY_MIN+2; + + max_wait -= rtas_busy_delay_time(ret); + + if (max_wait < 0) + break; + + rtas_busy_delay(ret); + } + + pr_warn("%s: Unable to configure bridge PHB#%x-PE#%x (%d)\n", + __func__, phb->global_number, config_addr, ret); + /* PAPR defines -3 as "Parameter Error" for this function: */ + if (ret == -3) + return -EINVAL; + else + return -EIO; } +/* + * Buffer for reporting slot-error-detail rtas calls. Its here + * in BSS, and not dynamically alloced, so that it ends up in + * RMO where RTAS can access it. + */ +static unsigned char slot_errbuf[RTAS_ERROR_LOG_MAX]; +static DEFINE_SPINLOCK(slot_errbuf_lock); +static int eeh_error_buf_size; + static int pseries_eeh_cap_start(struct pci_dn *pdn) { u32 status; @@ -159,7 +252,7 @@ static int pseries_eeh_cap_start(struct pci_dn *pdn) if (!pdn) return 0; - rtas_read_config(pdn, PCI_STATUS, 2, &status); + rtas_pci_dn_read_config(pdn, PCI_STATUS, 2, &status); if (!(status & PCI_STATUS_CAP_LIST)) return 0; @@ -177,11 +270,11 @@ static int pseries_eeh_find_cap(struct pci_dn *pdn, int cap) return 0; while (cnt--) { - rtas_read_config(pdn, pos, 1, &pos); + rtas_pci_dn_read_config(pdn, pos, 1, &pos); if (pos < 0x40) break; pos &= ~3; - rtas_read_config(pdn, pos + PCI_CAP_LIST_ID, 1, &id); + rtas_pci_dn_read_config(pdn, pos + PCI_CAP_LIST_ID, 1, &id); if (id == 0xff) break; if (id == cap) @@ -201,7 +294,7 @@ static int pseries_eeh_find_ecap(struct pci_dn *pdn, int cap) if (!edev || !edev->pcie_cap) return 0; - if (rtas_read_config(pdn, pos, 4, &header) != PCIBIOS_SUCCESSFUL) + if (rtas_pci_dn_read_config(pdn, pos, 4, &header) != PCIBIOS_SUCCESSFUL) return 0; else if (!header) return 0; @@ -214,7 +307,7 @@ static int pseries_eeh_find_ecap(struct pci_dn *pdn, int cap) if (pos < 256) break; - if (rtas_read_config(pdn, pos, 4, &header) != PCIBIOS_SUCCESSFUL) + if (rtas_pci_dn_read_config(pdn, pos, 4, &header) != PCIBIOS_SUCCESSFUL) break; } @@ -222,34 +315,88 @@ static int pseries_eeh_find_ecap(struct pci_dn *pdn, int cap) } /** - * pseries_eeh_probe - EEH probe on the given device + * pseries_eeh_pe_get_parent - Retrieve the parent PE + * @edev: EEH device + * + * The whole PEs existing in the system are organized as hierarchy + * tree. The function is used to retrieve the parent PE according + * to the parent EEH device. + */ +static struct eeh_pe *pseries_eeh_pe_get_parent(struct eeh_dev *edev) +{ + struct eeh_dev *parent; + struct pci_dn *pdn = eeh_dev_to_pdn(edev); + + /* + * It might have the case for the indirect parent + * EEH device already having associated PE, but + * the direct parent EEH device doesn't have yet. + */ + if (edev->physfn) + pdn = pci_get_pdn(edev->physfn); + else + pdn = pdn ? pdn->parent : NULL; + while (pdn) { + /* We're poking out of PCI territory */ + parent = pdn_to_eeh_dev(pdn); + if (!parent) + return NULL; + + if (parent->pe) + return parent->pe; + + pdn = pdn->parent; + } + + return NULL; +} + +/** + * pseries_eeh_init_edev - initialise the eeh_dev and eeh_pe for a pci_dn + * * @pdn: PCI device node - * @data: Unused * - * When EEH module is installed during system boot, all PCI devices - * are checked one by one to see if it supports EEH. The function - * is introduced for the purpose. + * When we discover a new PCI device via the device-tree we create a + * corresponding pci_dn and we allocate, but don't initialise, an eeh_dev. + * This function takes care of the initialisation and inserts the eeh_dev + * into the correct eeh_pe. If no eeh_pe exists we'll allocate one. */ -static void *pseries_eeh_probe(struct pci_dn *pdn, void *data) +static void pseries_eeh_init_edev(struct pci_dn *pdn) { + struct eeh_pe pe, *parent; struct eeh_dev *edev; - struct eeh_pe pe; u32 pcie_flags; - int enable = 0; int ret; - /* Retrieve OF node and eeh device */ + if (WARN_ON_ONCE(!eeh_has_flag(EEH_PROBE_MODE_DEVTREE))) + return; + + /* + * Find the eeh_dev for this pdn. The storage for the eeh_dev was + * allocated at the same time as the pci_dn. + * + * XXX: We should probably re-visit that. + */ edev = pdn_to_eeh_dev(pdn); - if (!edev || edev->pe) - return NULL; + if (!edev) + return; + + /* + * If ->pe is set then we've already probed this device. We hit + * this path when a pci_dev is removed and rescanned while recovering + * a PE (i.e. for devices where the driver doesn't support error + * recovery). + */ + if (edev->pe) + return; /* Check class/vendor/device IDs */ if (!pdn->vendor_id || !pdn->device_id || !pdn->class_code) - return NULL; + return; /* Skip for PCI-ISA bridge */ if ((pdn->class_code >> 8) == PCI_CLASS_BRIDGE_ISA) - return NULL; + return; eeh_edev_dbg(edev, "Probing device\n"); @@ -258,16 +405,15 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data) * correctly reflects that current device is root port * or PCIe switch downstream port. */ - edev->class_code = pdn->class_code; edev->pcix_cap = pseries_eeh_find_cap(pdn, PCI_CAP_ID_PCIX); edev->pcie_cap = pseries_eeh_find_cap(pdn, PCI_CAP_ID_EXP); edev->aer_cap = pseries_eeh_find_ecap(pdn, PCI_EXT_CAP_ID_ERR); edev->mode &= 0xFFFFFF00; - if ((edev->class_code >> 8) == PCI_CLASS_BRIDGE_PCI) { + if ((pdn->class_code >> 8) == PCI_CLASS_BRIDGE_PCI) { edev->mode |= EEH_DEV_BRIDGE; if (edev->pcie_cap) { - rtas_read_config(pdn, edev->pcie_cap + PCI_EXP_FLAGS, - 2, &pcie_flags); + rtas_pci_dn_read_config(pdn, edev->pcie_cap + PCI_EXP_FLAGS, + 2, &pcie_flags); pcie_flags = (pcie_flags & PCI_EXP_FLAGS_TYPE) >> 4; if (pcie_flags == PCI_EXP_TYPE_ROOT_PORT) edev->mode |= EEH_DEV_ROOT_PORT; @@ -276,51 +422,83 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data) } } - /* Initialize the fake PE */ + /* first up, find the pe_config_addr for the PE containing the device */ + ret = pseries_eeh_get_pe_config_addr(pdn); + if (ret < 0) { + eeh_edev_dbg(edev, "Unable to find pe_config_addr\n"); + goto err; + } + + /* Try enable EEH on the fake PE */ memset(&pe, 0, sizeof(struct eeh_pe)); pe.phb = pdn->phb; - pe.config_addr = (pdn->busno << 16) | (pdn->devfn << 8); + pe.addr = ret; - /* Enable EEH on the device */ eeh_edev_dbg(edev, "Enabling EEH on device\n"); ret = eeh_ops->set_option(&pe, EEH_OPT_ENABLE); if (ret) { eeh_edev_dbg(edev, "EEH failed to enable on device (code %d)\n", ret); - } else { - /* Retrieve PE address */ - edev->pe_config_addr = eeh_ops->get_pe_addr(&pe); - pe.addr = edev->pe_config_addr; - - /* Some older systems (Power4) allow the ibm,set-eeh-option - * call to succeed even on nodes where EEH is not supported. - * Verify support explicitly. - */ - ret = eeh_ops->get_state(&pe, NULL); - if (ret > 0 && ret != EEH_STATE_NOT_SUPPORT) - enable = 1; - - if (enable) { - eeh_add_flag(EEH_ENABLED); - eeh_add_to_parent_pe(edev); - } else if (pdn->parent && pdn_to_eeh_dev(pdn->parent) && - (pdn_to_eeh_dev(pdn->parent))->pe) { - /* This device doesn't support EEH, but it may have an - * EEH parent, in which case we mark it as supported. - */ - edev->pe_config_addr = pdn_to_eeh_dev(pdn->parent)->pe_config_addr; - eeh_add_to_parent_pe(edev); - } - eeh_edev_dbg(edev, "EEH is %s on device (code %d)\n", - (enable ? "enabled" : "unsupported"), ret); + goto err; } - /* Save memory bars */ + edev->pe_config_addr = pe.addr; + + eeh_add_flag(EEH_ENABLED); + + parent = pseries_eeh_pe_get_parent(edev); + eeh_pe_tree_insert(edev, parent); eeh_save_bars(edev); + eeh_edev_dbg(edev, "EEH enabled for device"); - return NULL; + return; + +err: + eeh_edev_dbg(edev, "EEH is unsupported on device (code = %d)\n", ret); +} + +static struct eeh_dev *pseries_eeh_probe(struct pci_dev *pdev) +{ + struct eeh_dev *edev; + struct pci_dn *pdn; + + pdn = pci_get_pdn_by_devfn(pdev->bus, pdev->devfn); + if (!pdn) + return NULL; + + /* + * If the system supports EEH on this device then the eeh_dev was + * configured and inserted into a PE in pseries_eeh_init_edev() + */ + edev = pdn_to_eeh_dev(pdn); + if (!edev || !edev->pe) + return NULL; + + return edev; } /** + * pseries_eeh_init_edev_recursive - Enable EEH for the indicated device + * @pdn: PCI device node + * + * This routine must be used to perform EEH initialization for the + * indicated PCI device that was added after system boot (e.g. + * hotplug, dlpar). + */ +void pseries_eeh_init_edev_recursive(struct pci_dn *pdn) +{ + struct pci_dn *n; + + if (!pdn) + return; + + list_for_each_entry(n, &pdn->child_list, list) + pseries_eeh_init_edev_recursive(n); + + pseries_eeh_init_edev(pdn); +} +EXPORT_SYMBOL_GPL(pseries_eeh_init_edev_recursive); + +/** * pseries_eeh_set_option - Initialize EEH or MMIO/DMA reenable * @pe: EEH PE * @option: operation to be issued @@ -332,10 +510,9 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data) static int pseries_eeh_set_option(struct eeh_pe *pe, int option) { int ret = 0; - int config_addr; /* - * When we're enabling or disabling EEH functioality on + * When we're enabling or disabling EEH functionality on * the particular PE, the PE config address is possibly * unavailable. Therefore, we have to figure it out from * the FDT node. @@ -345,86 +522,23 @@ static int pseries_eeh_set_option(struct eeh_pe *pe, int option) case EEH_OPT_ENABLE: case EEH_OPT_THAW_MMIO: case EEH_OPT_THAW_DMA: - config_addr = pe->config_addr; - if (pe->addr) - config_addr = pe->addr; break; case EEH_OPT_FREEZE_PE: /* Not support */ return 0; default: - pr_err("%s: Invalid option %d\n", - __func__, option); + pr_err("%s: Invalid option %d\n", __func__, option); return -EINVAL; } ret = rtas_call(ibm_set_eeh_option, 4, 1, NULL, - config_addr, BUID_HI(pe->phb->buid), + pe->addr, BUID_HI(pe->phb->buid), BUID_LO(pe->phb->buid), option); return ret; } /** - * pseries_eeh_get_pe_addr - Retrieve PE address - * @pe: EEH PE - * - * Retrieve the assocated PE address. Actually, there're 2 RTAS - * function calls dedicated for the purpose. We need implement - * it through the new function and then the old one. Besides, - * you should make sure the config address is figured out from - * FDT node before calling the function. - * - * It's notable that zero'ed return value means invalid PE config - * address. - */ -static int pseries_eeh_get_pe_addr(struct eeh_pe *pe) -{ - int ret = 0; - int rets[3]; - - if (ibm_get_config_addr_info2 != RTAS_UNKNOWN_SERVICE) { - /* - * First of all, we need to make sure there has one PE - * associated with the device. Otherwise, PE address is - * meaningless. - */ - ret = rtas_call(ibm_get_config_addr_info2, 4, 2, rets, - pe->config_addr, BUID_HI(pe->phb->buid), - BUID_LO(pe->phb->buid), 1); - if (ret || (rets[0] == 0)) - return 0; - - /* Retrieve the associated PE config address */ - ret = rtas_call(ibm_get_config_addr_info2, 4, 2, rets, - pe->config_addr, BUID_HI(pe->phb->buid), - BUID_LO(pe->phb->buid), 0); - if (ret) { - pr_warn("%s: Failed to get address for PHB#%x-PE#%x\n", - __func__, pe->phb->global_number, pe->config_addr); - return 0; - } - - return rets[0]; - } - - if (ibm_get_config_addr_info != RTAS_UNKNOWN_SERVICE) { - ret = rtas_call(ibm_get_config_addr_info, 4, 2, rets, - pe->config_addr, BUID_HI(pe->phb->buid), - BUID_LO(pe->phb->buid), 0); - if (ret) { - pr_warn("%s: Failed to get address for PHB#%x-PE#%x\n", - __func__, pe->phb->global_number, pe->config_addr); - return 0; - } - - return rets[0]; - } - - return ret; -} - -/** * pseries_eeh_get_state - Retrieve PE state * @pe: EEH PE * @delay: suggested time to wait if state is unavailable @@ -439,25 +553,19 @@ static int pseries_eeh_get_pe_addr(struct eeh_pe *pe) */ static int pseries_eeh_get_state(struct eeh_pe *pe, int *delay) { - int config_addr; int ret; int rets[4]; int result; - /* Figure out PE config address if possible */ - config_addr = pe->config_addr; - if (pe->addr) - config_addr = pe->addr; - if (ibm_read_slot_reset_state2 != RTAS_UNKNOWN_SERVICE) { ret = rtas_call(ibm_read_slot_reset_state2, 3, 4, rets, - config_addr, BUID_HI(pe->phb->buid), + pe->addr, BUID_HI(pe->phb->buid), BUID_LO(pe->phb->buid)); } else if (ibm_read_slot_reset_state != RTAS_UNKNOWN_SERVICE) { /* Fake PE unavailable info */ rets[2] = 0; ret = rtas_call(ibm_read_slot_reset_state, 3, 3, rets, - config_addr, BUID_HI(pe->phb->buid), + pe->addr, BUID_HI(pe->phb->buid), BUID_LO(pe->phb->buid)); } else { return EEH_STATE_NOT_SUPPORT; @@ -511,36 +619,7 @@ static int pseries_eeh_get_state(struct eeh_pe *pe, int *delay) */ static int pseries_eeh_reset(struct eeh_pe *pe, int option) { - int config_addr; - int ret; - - /* Figure out PE address */ - config_addr = pe->config_addr; - if (pe->addr) - config_addr = pe->addr; - - /* Reset PE through RTAS call */ - ret = rtas_call(ibm_set_slot_reset, 4, 1, NULL, - config_addr, BUID_HI(pe->phb->buid), - BUID_LO(pe->phb->buid), option); - - /* If fundamental-reset not supported, try hot-reset */ - if (option == EEH_RESET_FUNDAMENTAL && - ret == -8) { - option = EEH_RESET_HOT; - ret = rtas_call(ibm_set_slot_reset, 4, 1, NULL, - config_addr, BUID_HI(pe->phb->buid), - BUID_LO(pe->phb->buid), option); - } - - /* We need reset hold or settlement delay */ - if (option == EEH_RESET_FUNDAMENTAL || - option == EEH_RESET_HOT) - msleep(EEH_PE_RST_HOLD_TIME); - else - msleep(EEH_PE_RST_SETTLE_TIME); - - return ret; + return pseries_eeh_phb_reset(pe->phb, pe->addr, option); } /** @@ -556,19 +635,13 @@ static int pseries_eeh_reset(struct eeh_pe *pe, int option) */ static int pseries_eeh_get_log(struct eeh_pe *pe, int severity, char *drv_log, unsigned long len) { - int config_addr; unsigned long flags; int ret; spin_lock_irqsave(&slot_errbuf_lock, flags); memset(slot_errbuf, 0, eeh_error_buf_size); - /* Figure out the PE address */ - config_addr = pe->config_addr; - if (pe->addr) - config_addr = pe->addr; - - ret = rtas_call(ibm_slot_error_detail, 8, 1, NULL, config_addr, + ret = rtas_call(ibm_slot_error_detail, 8, 1, NULL, pe->addr, BUID_HI(pe->phb->buid), BUID_LO(pe->phb->buid), virt_to_phys(drv_log), len, virt_to_phys(slot_errbuf), eeh_error_buf_size, @@ -584,110 +657,49 @@ static int pseries_eeh_get_log(struct eeh_pe *pe, int severity, char *drv_log, u * pseries_eeh_configure_bridge - Configure PCI bridges in the indicated PE * @pe: EEH PE * - * The function will be called to reconfigure the bridges included - * in the specified PE so that the mulfunctional PE would be recovered - * again. */ static int pseries_eeh_configure_bridge(struct eeh_pe *pe) { - int config_addr; - int ret; - /* Waiting 0.2s maximum before skipping configuration */ - int max_wait = 200; - - /* Figure out the PE address */ - config_addr = pe->config_addr; - if (pe->addr) - config_addr = pe->addr; - - while (max_wait > 0) { - ret = rtas_call(ibm_configure_pe, 3, 1, NULL, - config_addr, BUID_HI(pe->phb->buid), - BUID_LO(pe->phb->buid)); - - if (!ret) - return ret; - - /* - * If RTAS returns a delay value that's above 100ms, cut it - * down to 100ms in case firmware made a mistake. For more - * on how these delay values work see rtas_busy_delay_time - */ - if (ret > RTAS_EXTENDED_DELAY_MIN+2 && - ret <= RTAS_EXTENDED_DELAY_MAX) - ret = RTAS_EXTENDED_DELAY_MIN+2; - - max_wait -= rtas_busy_delay_time(ret); - - if (max_wait < 0) - break; - - rtas_busy_delay(ret); - } - - pr_warn("%s: Unable to configure bridge PHB#%x-PE#%x (%d)\n", - __func__, pe->phb->global_number, pe->addr, ret); - return ret; + return pseries_eeh_phb_configure_bridge(pe->phb, pe->addr); } /** * pseries_eeh_read_config - Read PCI config space - * @pdn: PCI device node - * @where: PCI address + * @edev: EEH device handle + * @where: PCI config space offset * @size: size to read * @val: return value * * Read config space from the speicifed device */ -static int pseries_eeh_read_config(struct pci_dn *pdn, int where, int size, u32 *val) +static int pseries_eeh_read_config(struct eeh_dev *edev, int where, int size, u32 *val) { - return rtas_read_config(pdn, where, size, val); + struct pci_dn *pdn = eeh_dev_to_pdn(edev); + + return rtas_pci_dn_read_config(pdn, where, size, val); } /** * pseries_eeh_write_config - Write PCI config space - * @pdn: PCI device node - * @where: PCI address + * @edev: EEH device handle + * @where: PCI config space offset * @size: size to write * @val: value to be written * * Write config space to the specified device */ -static int pseries_eeh_write_config(struct pci_dn *pdn, int where, int size, u32 val) +static int pseries_eeh_write_config(struct eeh_dev *edev, int where, int size, u32 val) { - return rtas_write_config(pdn, where, size, val); -} - -static int pseries_eeh_restore_config(struct pci_dn *pdn) -{ - struct eeh_dev *edev = pdn_to_eeh_dev(pdn); - s64 ret = 0; - - if (!edev) - return -EEXIST; - - /* - * FIXME: The MPS, error routing rules, timeout setting are worthy - * to be exported by firmware in extendible way. - */ - if (edev->physfn) - ret = eeh_restore_vf_config(pdn); - - if (ret) { - pr_warn("%s: Can't reinit PCI dev 0x%x (%lld)\n", - __func__, edev->pe_config_addr, ret); - return -EIO; - } + struct pci_dn *pdn = eeh_dev_to_pdn(edev); - return ret; + return rtas_pci_dn_write_config(pdn, where, size, val); } #ifdef CONFIG_PCI_IOV -int pseries_send_allow_unfreeze(struct pci_dn *pdn, - u16 *vf_pe_array, int cur_vfs) +static int pseries_send_allow_unfreeze(struct pci_dn *pdn, u16 *vf_pe_array, int cur_vfs) { int rc; - int ibm_allow_unfreeze = rtas_token("ibm,open-sriov-allow-unfreeze"); + int ibm_allow_unfreeze = rtas_function_token(RTAS_FN_IBM_OPEN_SRIOV_ALLOW_UNFREEZE); unsigned long buid, addr; addr = rtas_config_addr(pdn->busno, pdn->devfn, 0); @@ -709,8 +721,8 @@ int pseries_send_allow_unfreeze(struct pci_dn *pdn, static int pseries_call_allow_unfreeze(struct eeh_dev *edev) { + int cur_vfs = 0, rc = 0, vf_index, bus, devfn, vf_pe_num; struct pci_dn *pdn, *tmp, *parent, *physfn_pdn; - int cur_vfs = 0, rc = 0, vf_index, bus, devfn; u16 *vf_pe_array; vf_pe_array = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL); @@ -743,8 +755,10 @@ static int pseries_call_allow_unfreeze(struct eeh_dev *edev) } } else { pdn = pci_get_pdn(edev->pdev); - vf_pe_array[0] = cpu_to_be16(pdn->pe_number); physfn_pdn = pci_get_pdn(edev->physfn); + + vf_pe_num = physfn_pdn->pe_num_map[edev->vf_index]; + vf_pe_array[0] = cpu_to_be16(vf_pe_num); rc = pseries_send_allow_unfreeze(physfn_pdn, vf_pe_array, 1); pdn->last_allow_rc = rc; @@ -755,15 +769,12 @@ static int pseries_call_allow_unfreeze(struct eeh_dev *edev) return rc; } -static int pseries_notify_resume(struct pci_dn *pdn) +static int pseries_notify_resume(struct eeh_dev *edev) { - struct eeh_dev *edev = pdn_to_eeh_dev(pdn); - if (!edev) return -EEXIST; - if (rtas_token("ibm,open-sriov-allow-unfreeze") - == RTAS_UNKNOWN_SERVICE) + if (rtas_function_token(RTAS_FN_IBM_OPEN_SRIOV_ALLOW_UNFREEZE) == RTAS_UNKNOWN_SERVICE) return -EINVAL; if (edev->pdev->is_physfn || edev->pdev->is_virtfn) @@ -775,10 +786,8 @@ static int pseries_notify_resume(struct pci_dn *pdn) static struct eeh_ops pseries_eeh_ops = { .name = "pseries", - .init = pseries_eeh_init, .probe = pseries_eeh_probe, .set_option = pseries_eeh_set_option, - .get_pe_addr = pseries_eeh_get_pe_addr, .get_state = pseries_eeh_get_state, .reset = pseries_eeh_reset, .get_log = pseries_eeh_get_log, @@ -787,7 +796,7 @@ static struct eeh_ops pseries_eeh_ops = { .read_config = pseries_eeh_read_config, .write_config = pseries_eeh_write_config, .next_error = NULL, - .restore_config = pseries_eeh_restore_config, + .restore_config = NULL, /* NB: configure_bridge() does this */ #ifdef CONFIG_PCI_IOV .notify_resume = pseries_notify_resume #endif @@ -801,15 +810,78 @@ static struct eeh_ops pseries_eeh_ops = { */ static int __init eeh_pseries_init(void) { - int ret; + struct pci_controller *phb; + struct pci_dn *pdn; + int ret, config_addr; + + /* figure out EEH RTAS function call tokens */ + ibm_set_eeh_option = rtas_function_token(RTAS_FN_IBM_SET_EEH_OPTION); + ibm_set_slot_reset = rtas_function_token(RTAS_FN_IBM_SET_SLOT_RESET); + ibm_read_slot_reset_state2 = rtas_function_token(RTAS_FN_IBM_READ_SLOT_RESET_STATE2); + ibm_read_slot_reset_state = rtas_function_token(RTAS_FN_IBM_READ_SLOT_RESET_STATE); + ibm_slot_error_detail = rtas_function_token(RTAS_FN_IBM_SLOT_ERROR_DETAIL); + ibm_get_config_addr_info2 = rtas_function_token(RTAS_FN_IBM_GET_CONFIG_ADDR_INFO2); + ibm_get_config_addr_info = rtas_function_token(RTAS_FN_IBM_GET_CONFIG_ADDR_INFO); + ibm_configure_pe = rtas_function_token(RTAS_FN_IBM_CONFIGURE_PE); + + /* + * ibm,configure-pe and ibm,configure-bridge have the same semantics, + * however ibm,configure-pe can be faster. If we can't find + * ibm,configure-pe then fall back to using ibm,configure-bridge. + */ + if (ibm_configure_pe == RTAS_UNKNOWN_SERVICE) + ibm_configure_pe = rtas_function_token(RTAS_FN_IBM_CONFIGURE_BRIDGE); - ret = eeh_ops_register(&pseries_eeh_ops); + /* + * Necessary sanity check. We needn't check "get-config-addr-info" + * and its variant since the old firmware probably support address + * of domain/bus/slot/function for EEH RTAS operations. + */ + if (ibm_set_eeh_option == RTAS_UNKNOWN_SERVICE || + ibm_set_slot_reset == RTAS_UNKNOWN_SERVICE || + (ibm_read_slot_reset_state2 == RTAS_UNKNOWN_SERVICE && + ibm_read_slot_reset_state == RTAS_UNKNOWN_SERVICE) || + ibm_slot_error_detail == RTAS_UNKNOWN_SERVICE || + ibm_configure_pe == RTAS_UNKNOWN_SERVICE) { + pr_info("EEH functionality not supported\n"); + return -EINVAL; + } + + /* Initialize error log size */ + eeh_error_buf_size = rtas_get_error_log_max(); + + /* Set EEH probe mode */ + eeh_add_flag(EEH_PROBE_MODE_DEVTREE | EEH_ENABLE_IO_FOR_LOG); + + /* Set EEH machine dependent code */ + ppc_md.pcibios_bus_add_device = pseries_pcibios_bus_add_device; + + if (is_kdump_kernel() || reset_devices) { + pr_info("Issue PHB reset ...\n"); + list_for_each_entry(phb, &hose_list, list_node) { + // Skip if the slot is empty + if (list_empty(&PCI_DN(phb->dn)->child_list)) + continue; + + pdn = list_first_entry(&PCI_DN(phb->dn)->child_list, struct pci_dn, list); + config_addr = pseries_eeh_get_pe_config_addr(pdn); + + /* invalid PE config addr */ + if (config_addr < 0) + continue; + + pseries_eeh_phb_reset(phb, config_addr, EEH_RESET_FUNDAMENTAL); + pseries_eeh_phb_reset(phb, config_addr, EEH_RESET_DEACTIVATE); + pseries_eeh_phb_configure_bridge(phb, config_addr); + } + } + + ret = eeh_init(&pseries_eeh_ops); if (!ret) pr_info("EEH: pSeries platform initialized\n"); else pr_info("EEH: pSeries platform initialization failure (%d)\n", ret); - return ret; } -machine_early_initcall(pseries, eeh_pseries_init); +machine_arch_initcall(pseries, eeh_pseries_init); diff --git a/arch/powerpc/platforms/pseries/event_sources.c b/arch/powerpc/platforms/pseries/event_sources.c index be661e919c76..623dfe0d8e1c 100644 --- a/arch/powerpc/platforms/pseries/event_sources.c +++ b/arch/powerpc/platforms/pseries/event_sources.c @@ -8,7 +8,7 @@ #include "pseries.h" -void request_event_sources_irqs(struct device_node *np, +void __init request_event_sources_irqs(struct device_node *np, irq_handler_t handler, const char *name) { diff --git a/arch/powerpc/platforms/pseries/firmware.c b/arch/powerpc/platforms/pseries/firmware.c index d4a8f1702417..18447e5fa17d 100644 --- a/arch/powerpc/platforms/pseries/firmware.c +++ b/arch/powerpc/platforms/pseries/firmware.c @@ -22,6 +22,7 @@ #include <asm/firmware.h> #include <asm/prom.h> #include <asm/udbg.h> +#include <asm/svm.h> #include "pseries.h" @@ -55,7 +56,8 @@ hypertas_fw_features_table[] = { {FW_FEATURE_LLAN, "hcall-lLAN"}, {FW_FEATURE_BULK_REMOVE, "hcall-bulk"}, {FW_FEATURE_XDABR, "hcall-xdabr"}, - {FW_FEATURE_MULTITCE, "hcall-multi-tce"}, + {FW_FEATURE_PUT_TCE_IND | FW_FEATURE_STUFF_TCE, + "hcall-multi-tce"}, {FW_FEATURE_SPLPAR, "hcall-splpar"}, {FW_FEATURE_VPHN, "hcall-vphn"}, {FW_FEATURE_SET_MODE, "hcall-set-mode"}, @@ -63,6 +65,10 @@ hypertas_fw_features_table[] = { {FW_FEATURE_HPT_RESIZE, "hcall-hpt-resize"}, {FW_FEATURE_BLOCK_REMOVE, "hcall-block-remove"}, {FW_FEATURE_PAPR_SCM, "hcall-scm"}, + {FW_FEATURE_RPT_INVALIDATE, "hcall-rpt-invalidate"}, + {FW_FEATURE_ENERGY_SCALE_INFO, "hcall-energy-scale-info"}, + {FW_FEATURE_WATCHDOG, "hcall-watchdog"}, + {FW_FEATURE_PLPKS, "hcall-pks"}, }; /* Build up the firmware features bitmask using the contents of @@ -100,6 +106,12 @@ static void __init fw_hypertas_feature_init(const char *hypertas, } } + if (is_secure_guest() && + (powerpc_firmware_features & FW_FEATURE_PUT_TCE_IND)) { + powerpc_firmware_features &= ~FW_FEATURE_PUT_TCE_IND; + pr_debug("SVM: disabling PUT_TCE_IND firmware feature\n"); + } + pr_debug(" <- fw_hypertas_feature_init()\n"); } @@ -110,10 +122,11 @@ struct vec5_fw_feature { static __initdata struct vec5_fw_feature vec5_fw_features_table[] = { - {FW_FEATURE_TYPE1_AFFINITY, OV5_TYPE1_AFFINITY}, + {FW_FEATURE_FORM1_AFFINITY, OV5_FORM1_AFFINITY}, {FW_FEATURE_PRRN, OV5_PRRN}, {FW_FEATURE_DRMEM_V2, OV5_DRMEM_V2}, {FW_FEATURE_DRC_INFO, OV5_DRC_INFO}, + {FW_FEATURE_FORM2_AFFINITY, OV5_FORM2_AFFINITY}, }; static void __init fw_vec5_feature_init(const char *vec5, unsigned long len) diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c index 3e8cbfe7a80f..e62835a12d73 100644 --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -35,53 +35,15 @@ #include <asm/topology.h> #include "pseries.h" -#include "offline_states.h" /* This version can't take the spinlock, because it never returns */ static int rtas_stop_self_token = RTAS_UNKNOWN_SERVICE; -static DEFINE_PER_CPU(enum cpu_state_vals, preferred_offline_state) = - CPU_STATE_OFFLINE; -static DEFINE_PER_CPU(enum cpu_state_vals, current_state) = CPU_STATE_OFFLINE; - -static enum cpu_state_vals default_offline_state = CPU_STATE_OFFLINE; - -static bool cede_offline_enabled __read_mostly = true; - /* - * Enable/disable cede_offline when available. + * Record the CPU ids used on each nodes. + * Protected by cpu_add_remove_lock. */ -static int __init setup_cede_offline(char *str) -{ - return (kstrtobool(str, &cede_offline_enabled) == 0); -} - -__setup("cede_offline=", setup_cede_offline); - -enum cpu_state_vals get_cpu_current_state(int cpu) -{ - return per_cpu(current_state, cpu); -} - -void set_cpu_current_state(int cpu, enum cpu_state_vals state) -{ - per_cpu(current_state, cpu) = state; -} - -enum cpu_state_vals get_preferred_offline_state(int cpu) -{ - return per_cpu(preferred_offline_state, cpu); -} - -void set_preferred_offline_state(int cpu, enum cpu_state_vals state) -{ - per_cpu(preferred_offline_state, cpu) = state; -} - -void set_default_offline_state(int cpu) -{ - per_cpu(preferred_offline_state, cpu) = default_offline_state; -} +static cpumask_var_t node_recorded_ids_map[MAX_NUMNODES]; static void rtas_stop_self(void) { @@ -91,19 +53,14 @@ static void rtas_stop_self(void) BUG_ON(rtas_stop_self_token == RTAS_UNKNOWN_SERVICE); - printk("cpu %u (hwid %u) Ready to die...\n", - smp_processor_id(), hard_smp_processor_id()); - rtas_call_unlocked(&args, rtas_stop_self_token, 0, 1, NULL); panic("Alas, I survived.\n"); } -static void pseries_mach_cpu_die(void) +static void pseries_cpu_offline_self(void) { - unsigned int cpu = smp_processor_id(); unsigned int hwcpu = hard_smp_processor_id(); - u8 cede_latency_hint = 0; local_irq_disable(); idle_task_exit(); @@ -112,50 +69,8 @@ static void pseries_mach_cpu_die(void) else xics_teardown_cpu(); - if (get_preferred_offline_state(cpu) == CPU_STATE_INACTIVE) { - set_cpu_current_state(cpu, CPU_STATE_INACTIVE); - if (ppc_md.suspend_disable_cpu) - ppc_md.suspend_disable_cpu(); - - cede_latency_hint = 2; - - get_lppaca()->idle = 1; - if (!lppaca_shared_proc(get_lppaca())) - get_lppaca()->donate_dedicated_cpu = 1; - - while (get_preferred_offline_state(cpu) == CPU_STATE_INACTIVE) { - while (!prep_irq_for_idle()) { - local_irq_enable(); - local_irq_disable(); - } - - extended_cede_processor(cede_latency_hint); - } - - local_irq_disable(); - - if (!lppaca_shared_proc(get_lppaca())) - get_lppaca()->donate_dedicated_cpu = 0; - get_lppaca()->idle = 0; - - if (get_preferred_offline_state(cpu) == CPU_STATE_ONLINE) { - unregister_slb_shadow(hwcpu); - - hard_irq_disable(); - /* - * Call to start_secondary_resume() will not return. - * Kernel stack will be reset and start_secondary() - * will be called to continue the online operation. - */ - start_secondary_resume(); - } - } - - /* Requested state is CPU_STATE_OFFLINE at this point */ - WARN_ON(get_preferred_offline_state(cpu) != CPU_STATE_OFFLINE); - - set_cpu_current_state(cpu, CPU_STATE_OFFLINE); unregister_slb_shadow(hwcpu); + unregister_vpa(hwcpu); rtas_stop_self(); /* Should never get here... */ @@ -179,6 +94,9 @@ static int pseries_cpu_disable(void) xive_smp_disable_cpu(); else xics_migrate_irqs_away(); + + cleanup_cpu_mmu_context(); + return 0; } @@ -191,114 +109,180 @@ static int pseries_cpu_disable(void) * to self-destroy so that the cpu-offline thread can send the CPU_DEAD * notifications. * - * OTOH, pseries_mach_cpu_die() is called by the @cpu when it wants to + * OTOH, pseries_cpu_offline_self() is called by the @cpu when it wants to * self-destruct. */ static void pseries_cpu_die(unsigned int cpu) { - int tries; int cpu_status = 1; unsigned int pcpu = get_hard_smp_processor_id(cpu); + unsigned long timeout = jiffies + msecs_to_jiffies(120000); - if (get_preferred_offline_state(cpu) == CPU_STATE_INACTIVE) { - cpu_status = 1; - for (tries = 0; tries < 5000; tries++) { - if (get_cpu_current_state(cpu) == CPU_STATE_INACTIVE) { - cpu_status = 0; - break; - } - msleep(1); - } - } else if (get_preferred_offline_state(cpu) == CPU_STATE_OFFLINE) { + while (true) { + cpu_status = smp_query_cpu_stopped(pcpu); + if (cpu_status == QCSS_STOPPED || + cpu_status == QCSS_HARDWARE_ERROR) + break; - for (tries = 0; tries < 25; tries++) { - cpu_status = smp_query_cpu_stopped(pcpu); - if (cpu_status == QCSS_STOPPED || - cpu_status == QCSS_HARDWARE_ERROR) - break; - cpu_relax(); + if (time_after(jiffies, timeout)) { + pr_warn("CPU %i (hwid %i) didn't die after 120 seconds\n", + cpu, pcpu); + timeout = jiffies + msecs_to_jiffies(120000); } + + cond_resched(); } - if (cpu_status != 0) { - printk("Querying DEAD? cpu %i (%i) shows %i\n", - cpu, pcpu, cpu_status); + if (cpu_status == QCSS_HARDWARE_ERROR) { + pr_warn("CPU %i (hwid %i) reported error while dying\n", + cpu, pcpu); } - /* Isolation and deallocation are definitely done by - * drslot_chrp_cpu. If they were not they would be - * done here. Change isolate state to Isolate and - * change allocation-state to Unusable. - */ paca_ptrs[cpu]->cpu_start = 0; } +/** + * find_cpu_id_range - found a linear ranger of @nthreads free CPU ids. + * @nthreads : the number of threads (cpu ids) + * @assigned_node : the node it belongs to or NUMA_NO_NODE if free ids from any + * node can be peek. + * @cpu_mask: the returned CPU mask. + * + * Returns 0 on success. + */ +static int find_cpu_id_range(unsigned int nthreads, int assigned_node, + cpumask_var_t *cpu_mask) +{ + cpumask_var_t candidate_mask; + unsigned int cpu, node; + int rc = -ENOSPC; + + if (!zalloc_cpumask_var(&candidate_mask, GFP_KERNEL)) + return -ENOMEM; + + cpumask_clear(*cpu_mask); + for (cpu = 0; cpu < nthreads; cpu++) + cpumask_set_cpu(cpu, *cpu_mask); + + BUG_ON(!cpumask_subset(cpu_present_mask, cpu_possible_mask)); + + /* Get a bitmap of unoccupied slots. */ + cpumask_xor(candidate_mask, cpu_possible_mask, cpu_present_mask); + + if (assigned_node != NUMA_NO_NODE) { + /* + * Remove free ids previously assigned on the other nodes. We + * can walk only online nodes because once a node became online + * it is not turned offlined back. + */ + for_each_online_node(node) { + if (node == assigned_node) + continue; + cpumask_andnot(candidate_mask, candidate_mask, + node_recorded_ids_map[node]); + } + } + + if (cpumask_empty(candidate_mask)) + goto out; + + while (!cpumask_empty(*cpu_mask)) { + if (cpumask_subset(*cpu_mask, candidate_mask)) + /* Found a range where we can insert the new cpu(s) */ + break; + cpumask_shift_left(*cpu_mask, *cpu_mask, nthreads); + } + + if (!cpumask_empty(*cpu_mask)) + rc = 0; + +out: + free_cpumask_var(candidate_mask); + return rc; +} + /* * Update cpu_present_mask and paca(s) for a new cpu node. The wrinkle - * here is that a cpu device node may represent up to two logical cpus + * here is that a cpu device node may represent multiple logical cpus * in the SMT case. We must honor the assumption in other code that * the logical ids for sibling SMT threads x and y are adjacent, such * that x^1 == y and y^1 == x. */ static int pseries_add_processor(struct device_node *np) { - unsigned int cpu; - cpumask_var_t candidate_mask, tmp; - int err = -ENOSPC, len, nthreads, i; + int len, nthreads, node, cpu, assigned_node; + int rc = 0; + cpumask_var_t cpu_mask; const __be32 *intserv; intserv = of_get_property(np, "ibm,ppc-interrupt-server#s", &len); if (!intserv) return 0; - zalloc_cpumask_var(&candidate_mask, GFP_KERNEL); - zalloc_cpumask_var(&tmp, GFP_KERNEL); - nthreads = len / sizeof(u32); - for (i = 0; i < nthreads; i++) - cpumask_set_cpu(i, tmp); - cpu_maps_update_begin(); + if (!alloc_cpumask_var(&cpu_mask, GFP_KERNEL)) + return -ENOMEM; - BUG_ON(!cpumask_subset(cpu_present_mask, cpu_possible_mask)); + /* + * Fetch from the DT nodes read by dlpar_configure_connector() the NUMA + * node id the added CPU belongs to. + */ + node = of_node_to_nid(np); + if (node < 0 || !node_possible(node)) + node = first_online_node; - /* Get a bitmap of unoccupied slots. */ - cpumask_xor(candidate_mask, cpu_possible_mask, cpu_present_mask); - if (cpumask_empty(candidate_mask)) { - /* If we get here, it most likely means that NR_CPUS is - * less than the partition's max processors setting. + BUG_ON(node == NUMA_NO_NODE); + assigned_node = node; + + cpu_maps_update_begin(); + + rc = find_cpu_id_range(nthreads, node, &cpu_mask); + if (rc && nr_node_ids > 1) { + /* + * Try again, considering the free CPU ids from the other node. */ - printk(KERN_ERR "Cannot add cpu %pOF; this system configuration" - " supports %d logical cpus.\n", np, - num_possible_cpus()); - goto out_unlock; + node = NUMA_NO_NODE; + rc = find_cpu_id_range(nthreads, NUMA_NO_NODE, &cpu_mask); } - while (!cpumask_empty(tmp)) - if (cpumask_subset(tmp, candidate_mask)) - /* Found a range where we can insert the new cpu(s) */ - break; - else - cpumask_shift_left(tmp, tmp, nthreads); - - if (cpumask_empty(tmp)) { - printk(KERN_ERR "Unable to find space in cpu_present_mask for" - " processor %pOFn with %d thread(s)\n", np, - nthreads); - goto out_unlock; + if (rc) { + pr_err("Cannot add cpu %pOF; this system configuration" + " supports %d logical cpus.\n", np, num_possible_cpus()); + goto out; } - for_each_cpu(cpu, tmp) { + for_each_cpu(cpu, cpu_mask) { BUG_ON(cpu_present(cpu)); set_cpu_present(cpu, true); set_hard_smp_processor_id(cpu, be32_to_cpu(*intserv++)); } - err = 0; -out_unlock: + + /* Record the newly used CPU ids for the associate node. */ + cpumask_or(node_recorded_ids_map[assigned_node], + node_recorded_ids_map[assigned_node], cpu_mask); + + /* + * If node is set to NUMA_NO_NODE, CPU ids have be reused from + * another node, remove them from its mask. + */ + if (node == NUMA_NO_NODE) { + cpu = cpumask_first(cpu_mask); + pr_warn("Reusing free CPU ids %d-%d from another node\n", + cpu, cpu + nthreads - 1); + for_each_online_node(node) { + if (node == assigned_node) + continue; + cpumask_andnot(node_recorded_ids_map[node], + node_recorded_ids_map[node], + cpu_mask); + } + } + +out: cpu_maps_update_done(); - free_cpumask_var(candidate_mask); - free_cpumask_var(tmp); - return err; + free_cpumask_var(cpu_mask); + return rc; } /* @@ -359,28 +343,27 @@ static int dlpar_offline_cpu(struct device_node *dn) if (get_hard_smp_processor_id(cpu) != thread) continue; - if (get_cpu_current_state(cpu) == CPU_STATE_OFFLINE) + if (!cpu_online(cpu)) break; - if (get_cpu_current_state(cpu) == CPU_STATE_ONLINE) { - set_preferred_offline_state(cpu, - CPU_STATE_OFFLINE); - cpu_maps_update_done(); - timed_topology_update(1); - rc = device_offline(get_cpu_device(cpu)); - if (rc) - goto out; - cpu_maps_update_begin(); - break; - } - /* - * The cpu is in CPU_STATE_INACTIVE. - * Upgrade it's state to CPU_STATE_OFFLINE. + * device_offline() will return -EBUSY (via cpu_down()) if there + * is only one CPU left. Check it here to fail earlier and with a + * more informative error message, while also retaining the + * cpu_add_remove_lock to be sure that no CPUs are being + * online/offlined during this check. */ - set_preferred_offline_state(cpu, CPU_STATE_OFFLINE); - WARN_ON(plpar_hcall_norets(H_PROD, thread) != H_SUCCESS); - __cpu_die(cpu); + if (num_online_cpus() == 1) { + pr_warn("Unable to remove last online CPU %pOFn\n", dn); + rc = -EBUSY; + goto out_unlock; + } + + cpu_maps_update_done(); + rc = device_offline(get_cpu_device(cpu)); + if (rc) + goto out; + cpu_maps_update_begin(); break; } if (cpu == num_possible_cpus()) { @@ -388,6 +371,7 @@ static int dlpar_offline_cpu(struct device_node *dn) thread); } } +out_unlock: cpu_maps_update_done(); out: @@ -414,11 +398,16 @@ static int dlpar_online_cpu(struct device_node *dn) for_each_present_cpu(cpu) { if (get_hard_smp_processor_id(cpu) != thread) continue; - BUG_ON(get_cpu_current_state(cpu) - != CPU_STATE_OFFLINE); + + if (!topology_is_primary_thread(cpu)) { + if (cpu_smt_control != CPU_SMT_ENABLED) + break; + if (!topology_smt_thread_allowed(cpu)) + break; + } + cpu_maps_update_done(); - timed_topology_update(1); - find_and_online_cpu_nid(cpu); + find_and_update_cpu_nid(cpu); rc = device_online(get_cpu_device(cpu)); if (rc) { dlpar_offline_cpu(dn); @@ -512,7 +501,7 @@ static bool valid_cpu_drc_index(struct device_node *parent, u32 drc_index) bool found = false; int rc, index; - if (of_find_property(parent, "ibm,drc-info", NULL)) + if (of_property_present(parent, "ibm,drc-info")) return drc_info_valid_index(parent, drc_index); /* Note that the format of the ibm,drc-indexes array is @@ -536,6 +525,27 @@ static bool valid_cpu_drc_index(struct device_node *parent, u32 drc_index) return found; } +static int pseries_cpuhp_attach_nodes(struct device_node *dn) +{ + struct of_changeset cs; + int ret; + + /* + * This device node is unattached but may have siblings; open-code the + * traversal. + */ + for (of_changeset_init(&cs); dn != NULL; dn = dn->sibling) { + ret = of_changeset_attach_node(&cs, dn); + if (ret) + goto out; + } + + ret = of_changeset_apply(&cs); +out: + of_changeset_destroy(&cs); + return ret; +} + static ssize_t dlpar_cpu_add(u32 drc_index) { struct device_node *dn, *parent; @@ -578,7 +588,7 @@ static ssize_t dlpar_cpu_add(u32 drc_index) return -EINVAL; } - rc = dlpar_attach_node(dn, parent); + rc = pseries_cpuhp_attach_nodes(dn); /* Regardless we are done with parent now */ of_node_put(parent); @@ -595,6 +605,8 @@ static ssize_t dlpar_cpu_add(u32 drc_index) return saved_rc; } + update_numa_distance(dn); + rc = dlpar_online_cpu(dn); if (rc) { saved_rc = rc; @@ -613,6 +625,60 @@ static ssize_t dlpar_cpu_add(u32 drc_index) return rc; } +static unsigned int pseries_cpuhp_cache_use_count(const struct device_node *cachedn) +{ + unsigned int use_count = 0; + struct device_node *dn, *tn; + + WARN_ON(!of_node_is_type(cachedn, "cache")); + + for_each_of_cpu_node(dn) { + tn = of_find_next_cache_node(dn); + of_node_put(tn); + if (tn == cachedn) + use_count++; + } + + for_each_node_by_type(dn, "cache") { + tn = of_find_next_cache_node(dn); + of_node_put(tn); + if (tn == cachedn) + use_count++; + } + + return use_count; +} + +static int pseries_cpuhp_detach_nodes(struct device_node *cpudn) +{ + struct device_node *dn; + struct of_changeset cs; + int ret = 0; + + of_changeset_init(&cs); + ret = of_changeset_detach_node(&cs, cpudn); + if (ret) + goto out; + + dn = cpudn; + while ((dn = of_find_next_cache_node(dn))) { + if (pseries_cpuhp_cache_use_count(dn) > 1) { + of_node_put(dn); + break; + } + + ret = of_changeset_detach_node(&cs, dn); + of_node_put(dn); + if (ret) + goto out; + } + + ret = of_changeset_apply(&cs); +out: + of_changeset_destroy(&cs); + return ret; +} + static ssize_t dlpar_cpu_remove(struct device_node *dn, u32 drc_index) { int rc; @@ -634,7 +700,7 @@ static ssize_t dlpar_cpu_remove(struct device_node *dn, u32 drc_index) return rc; } - rc = dlpar_detach_node(dn); + rc = pseries_cpuhp_detach_nodes(dn); if (rc) { int saved_rc = rc; @@ -686,258 +752,32 @@ static int dlpar_cpu_remove_by_index(u32 drc_index) return rc; } -static int find_dlpar_cpus_to_remove(u32 *cpu_drcs, int cpus_to_remove) -{ - struct device_node *dn; - int cpus_found = 0; - int rc; - - /* We want to find cpus_to_remove + 1 CPUs to ensure we do not - * remove the last CPU. - */ - for_each_node_by_type(dn, "cpu") { - cpus_found++; - - if (cpus_found > cpus_to_remove) { - of_node_put(dn); - break; - } - - /* Note that cpus_found is always 1 ahead of the index - * into the cpu_drcs array, so we use cpus_found - 1 - */ - rc = of_property_read_u32(dn, "ibm,my-drc-index", - &cpu_drcs[cpus_found - 1]); - if (rc) { - pr_warn("Error occurred getting drc-index for %pOFn\n", - dn); - of_node_put(dn); - return -1; - } - } - - if (cpus_found < cpus_to_remove) { - pr_warn("Failed to find enough CPUs (%d of %d) to remove\n", - cpus_found, cpus_to_remove); - } else if (cpus_found == cpus_to_remove) { - pr_warn("Cannot remove all CPUs\n"); - } - - return cpus_found; -} - -static int dlpar_cpu_remove_by_count(u32 cpus_to_remove) -{ - u32 *cpu_drcs; - int cpus_found; - int cpus_removed = 0; - int i, rc; - - pr_debug("Attempting to hot-remove %d CPUs\n", cpus_to_remove); - - cpu_drcs = kcalloc(cpus_to_remove, sizeof(*cpu_drcs), GFP_KERNEL); - if (!cpu_drcs) - return -EINVAL; - - cpus_found = find_dlpar_cpus_to_remove(cpu_drcs, cpus_to_remove); - if (cpus_found <= cpus_to_remove) { - kfree(cpu_drcs); - return -EINVAL; - } - - for (i = 0; i < cpus_to_remove; i++) { - rc = dlpar_cpu_remove_by_index(cpu_drcs[i]); - if (rc) - break; - - cpus_removed++; - } - - if (cpus_removed != cpus_to_remove) { - pr_warn("CPU hot-remove failed, adding back removed CPUs\n"); - - for (i = 0; i < cpus_removed; i++) - dlpar_cpu_add(cpu_drcs[i]); - - rc = -EINVAL; - } else { - rc = 0; - } - - kfree(cpu_drcs); - return rc; -} - -static int find_drc_info_cpus_to_add(struct device_node *cpus, - struct property *info, - u32 *cpu_drcs, u32 cpus_to_add) -{ - struct of_drc_info drc; - const __be32 *value; - u32 count, drc_index; - int cpus_found = 0; - int i, j; - - if (!info) - return -1; - - value = of_prop_next_u32(info, NULL, &count); - if (value) - value++; - - for (i = 0; i < count; i++) { - of_read_drc_info_cell(&info, &value, &drc); - if (strncmp(drc.drc_type, "CPU", 3)) - break; - - drc_index = drc.drc_index_start; - for (j = 0; j < drc.num_sequential_elems; j++) { - if (dlpar_cpu_exists(cpus, drc_index)) - continue; - - cpu_drcs[cpus_found++] = drc_index; - - if (cpus_found == cpus_to_add) - return cpus_found; - - drc_index += drc.sequential_inc; - } - } - - return cpus_found; -} - -static int find_drc_index_cpus_to_add(struct device_node *cpus, - u32 *cpu_drcs, u32 cpus_to_add) -{ - int cpus_found = 0; - int index, rc; - u32 drc_index; - - /* Search the ibm,drc-indexes array for possible CPU drcs to - * add. Note that the format of the ibm,drc-indexes array is - * the number of entries in the array followed by the array - * of drc values so we start looking at index = 1. - */ - index = 1; - while (cpus_found < cpus_to_add) { - rc = of_property_read_u32_index(cpus, "ibm,drc-indexes", - index++, &drc_index); - - if (rc) - break; - - if (dlpar_cpu_exists(cpus, drc_index)) - continue; - - cpu_drcs[cpus_found++] = drc_index; - } - - return cpus_found; -} - -static int dlpar_cpu_add_by_count(u32 cpus_to_add) -{ - struct device_node *parent; - struct property *info; - u32 *cpu_drcs; - int cpus_added = 0; - int cpus_found; - int i, rc; - - pr_debug("Attempting to hot-add %d CPUs\n", cpus_to_add); - - cpu_drcs = kcalloc(cpus_to_add, sizeof(*cpu_drcs), GFP_KERNEL); - if (!cpu_drcs) - return -EINVAL; - - parent = of_find_node_by_path("/cpus"); - if (!parent) { - pr_warn("Could not find CPU root node in device tree\n"); - kfree(cpu_drcs); - return -1; - } - - info = of_find_property(parent, "ibm,drc-info", NULL); - if (info) - cpus_found = find_drc_info_cpus_to_add(parent, info, cpu_drcs, cpus_to_add); - else - cpus_found = find_drc_index_cpus_to_add(parent, cpu_drcs, cpus_to_add); - - of_node_put(parent); - - if (cpus_found < cpus_to_add) { - pr_warn("Failed to find enough CPUs (%d of %d) to add\n", - cpus_found, cpus_to_add); - kfree(cpu_drcs); - return -EINVAL; - } - - for (i = 0; i < cpus_to_add; i++) { - rc = dlpar_cpu_add(cpu_drcs[i]); - if (rc) - break; - - cpus_added++; - } - - if (cpus_added < cpus_to_add) { - pr_warn("CPU hot-add failed, removing any added CPUs\n"); - - for (i = 0; i < cpus_added; i++) - dlpar_cpu_remove_by_index(cpu_drcs[i]); - - rc = -EINVAL; - } else { - rc = 0; - } - - kfree(cpu_drcs); - return rc; -} - -int dlpar_cpu_readd(int cpu) -{ - struct device_node *dn; - struct device *dev; - u32 drc_index; - int rc; - - dev = get_cpu_device(cpu); - dn = dev->of_node; - - rc = of_property_read_u32(dn, "ibm,my-drc-index", &drc_index); - - rc = dlpar_cpu_remove_by_index(drc_index); - if (!rc) - rc = dlpar_cpu_add(drc_index); - - return rc; -} - int dlpar_cpu(struct pseries_hp_errorlog *hp_elog) { - u32 count, drc_index; + u32 drc_index; int rc; - count = hp_elog->_drc_u.drc_count; drc_index = hp_elog->_drc_u.drc_index; lock_device_hotplug(); switch (hp_elog->action) { case PSERIES_HP_ELOG_ACTION_REMOVE: - if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_COUNT) - rc = dlpar_cpu_remove_by_count(count); - else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_INDEX) + if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_INDEX) { rc = dlpar_cpu_remove_by_index(drc_index); + /* + * Setting the isolation state of an UNISOLATED/CONFIGURED + * device to UNISOLATE is a no-op, but the hypervisor can + * use it as a hint that the CPU removal failed. + */ + if (rc) + dlpar_unisolate_drc(drc_index); + } else rc = -EINVAL; break; case PSERIES_HP_ELOG_ACTION_ADD: - if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_COUNT) - rc = dlpar_cpu_add_by_count(count); - else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_INDEX) + if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_INDEX) rc = dlpar_cpu_add(drc_index); else rc = -EINVAL; @@ -1013,60 +853,49 @@ static struct notifier_block pseries_smp_nb = { .notifier_call = pseries_smp_notifier, }; -#define MAX_CEDE_LATENCY_LEVELS 4 -#define CEDE_LATENCY_PARAM_LENGTH 10 -#define CEDE_LATENCY_PARAM_MAX_LENGTH \ - (MAX_CEDE_LATENCY_LEVELS * CEDE_LATENCY_PARAM_LENGTH * sizeof(char)) -#define CEDE_LATENCY_TOKEN 45 - -static char cede_parameters[CEDE_LATENCY_PARAM_MAX_LENGTH]; - -static int parse_cede_parameters(void) -{ - memset(cede_parameters, 0, CEDE_LATENCY_PARAM_MAX_LENGTH); - return rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1, - NULL, - CEDE_LATENCY_TOKEN, - __pa(cede_parameters), - CEDE_LATENCY_PARAM_MAX_LENGTH); -} - -static int __init pseries_cpu_hotplug_init(void) +void __init pseries_cpu_hotplug_init(void) { - int cpu; int qcss_tok; -#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE - ppc_md.cpu_probe = dlpar_cpu_probe; - ppc_md.cpu_release = dlpar_cpu_release; -#endif /* CONFIG_ARCH_CPU_PROBE_RELEASE */ - - rtas_stop_self_token = rtas_token("stop-self"); - qcss_tok = rtas_token("query-cpu-stopped-state"); + rtas_stop_self_token = rtas_function_token(RTAS_FN_STOP_SELF); + qcss_tok = rtas_function_token(RTAS_FN_QUERY_CPU_STOPPED_STATE); if (rtas_stop_self_token == RTAS_UNKNOWN_SERVICE || qcss_tok == RTAS_UNKNOWN_SERVICE) { printk(KERN_INFO "CPU Hotplug not supported by firmware " "- disabling.\n"); - return 0; + return; } - ppc_md.cpu_die = pseries_mach_cpu_die; + smp_ops->cpu_offline_self = pseries_cpu_offline_self; smp_ops->cpu_disable = pseries_cpu_disable; smp_ops->cpu_die = pseries_cpu_die; +} + +static int __init pseries_dlpar_init(void) +{ + unsigned int node; + +#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE + ppc_md.cpu_probe = dlpar_cpu_probe; + ppc_md.cpu_release = dlpar_cpu_release; +#endif /* CONFIG_ARCH_CPU_PROBE_RELEASE */ /* Processors can be added/removed only on LPAR */ if (firmware_has_feature(FW_FEATURE_LPAR)) { - of_reconfig_notifier_register(&pseries_smp_nb); - cpu_maps_update_begin(); - if (cede_offline_enabled && parse_cede_parameters() == 0) { - default_offline_state = CPU_STATE_INACTIVE; - for_each_online_cpu(cpu) - set_default_offline_state(cpu); + for_each_node(node) { + if (!alloc_cpumask_var_node(&node_recorded_ids_map[node], + GFP_KERNEL, node)) + return -ENOMEM; + + /* Record ids of CPU added at boot time */ + cpumask_copy(node_recorded_ids_map[node], + cpumask_of_node(node)); } - cpu_maps_update_done(); + + of_reconfig_notifier_register(&pseries_smp_nb); } return 0; } -machine_arch_initcall(pseries, pseries_cpu_hotplug_init); +machine_arch_initcall(pseries, pseries_dlpar_init); diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index c126b94d1943..3fe3ddb30c04 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -16,57 +16,11 @@ #include <asm/firmware.h> #include <asm/machdep.h> -#include <asm/prom.h> #include <asm/sparsemem.h> #include <asm/fadump.h> #include <asm/drmem.h> #include "pseries.h" -static bool rtas_hp_event; - -unsigned long pseries_memory_block_size(void) -{ - struct device_node *np; - unsigned int memblock_size = MIN_MEMORY_BLOCK_SIZE; - struct resource r; - - np = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); - if (np) { - const __be64 *size; - - size = of_get_property(np, "ibm,lmb-size", NULL); - if (size) - memblock_size = be64_to_cpup(size); - of_node_put(np); - } else if (machine_is(pseries)) { - /* This fallback really only applies to pseries */ - unsigned int memzero_size = 0; - - np = of_find_node_by_path("/memory@0"); - if (np) { - if (!of_address_to_resource(np, 0, &r)) - memzero_size = resource_size(&r); - of_node_put(np); - } - - if (memzero_size) { - /* We now know the size of memory@0, use this to find - * the first memoryblock and get its size. - */ - char buf[64]; - - sprintf(buf, "/memory@%x", memzero_size); - np = of_find_node_by_path(buf); - if (np) { - if (!of_address_to_resource(np, 0, &r)) - memblock_size = resource_size(&r); - of_node_put(np); - } - } - } - return memblock_size; -} - static void dlpar_free_property(struct property *prop) { kfree(prop->name); @@ -101,7 +55,8 @@ static bool find_aa_index(struct device_node *dr_node, struct property *ala_prop, const u32 *lmb_assoc, u32 *aa_index) { - u32 *assoc_arrays, new_prop_size; + __be32 *assoc_arrays; + u32 new_prop_size; struct property *new_prop; int aa_arrays, aa_array_entries, aa_array_sz; int i, index; @@ -177,6 +132,8 @@ static int update_lmb_associativity_index(struct drmem_lmb *lmb) return -ENODEV; } + update_numa_distance(lmb_node); + dr_node = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); if (!dr_node) { dlpar_free_cc_nodes(lmb_node); @@ -208,13 +165,11 @@ static int update_lmb_associativity_index(struct drmem_lmb *lmb) static struct memory_block *lmb_to_memblock(struct drmem_lmb *lmb) { unsigned long section_nr; - struct mem_section *mem_sect; struct memory_block *mem_block; section_nr = pfn_to_section_nr(PFN_DOWN(lmb->base_addr)); - mem_sect = __nr_to_section(section_nr); - mem_block = find_memory_block(mem_sect); + mem_block = find_memory_block(section_nr); return mem_block; } @@ -223,7 +178,7 @@ static int get_lmb_range(u32 drc_index, int n_lmbs, struct drmem_lmb **end_lmb) { struct drmem_lmb *lmb, *start, *end; - struct drmem_lmb *last_lmb; + struct drmem_lmb *limit; start = NULL; for_each_drmem_lmb(lmb) { @@ -236,10 +191,10 @@ static int get_lmb_range(u32 drc_index, int n_lmbs, if (!start) return -EINVAL; - end = &start[n_lmbs - 1]; + end = &start[n_lmbs]; - last_lmb = &drmem_info->lmbs[drmem_info->n_lmbs - 1]; - if (end > last_lmb) + limit = &drmem_info->lmbs[drmem_info->n_lmbs]; + if (end > limit) return -EINVAL; *start_lmb = start; @@ -253,8 +208,10 @@ static int dlpar_change_lmb_state(struct drmem_lmb *lmb, bool online) int rc; mem_block = lmb_to_memblock(lmb); - if (!mem_block) + if (!mem_block) { + pr_err("Failed memory block lookup for LMB 0x%x\n", lmb->drc_index); return -EINVAL; + } if (online && mem_block->dev.offline) rc = device_online(&mem_block->dev); @@ -279,11 +236,11 @@ static int dlpar_offline_lmb(struct drmem_lmb *lmb) return dlpar_change_lmb_state(lmb, false); } -static int pseries_remove_memblock(unsigned long base, unsigned int memblock_size) +static int pseries_remove_memblock(unsigned long base, unsigned long memblock_size) { - unsigned long block_sz, start_pfn; + unsigned long start_pfn; int sections_per_block; - int i, nid; + int i; start_pfn = base >> PAGE_SHIFT; @@ -292,12 +249,10 @@ static int pseries_remove_memblock(unsigned long base, unsigned int memblock_siz if (!pfn_valid(start_pfn)) goto out; - block_sz = pseries_memory_block_size(); - sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE; - nid = memory_add_physaddr_to_nid(base); + sections_per_block = memory_block_size / MIN_MEMORY_BLOCK_SIZE; for (i = 0; i < sections_per_block; i++) { - __remove_memory(nid, base, MIN_MEMORY_BLOCK_SIZE); + __remove_memory(base, MIN_MEMORY_BLOCK_SIZE); base += MIN_MEMORY_BLOCK_SIZE; } @@ -310,10 +265,8 @@ out: static int pseries_remove_mem_node(struct device_node *np) { - const __be32 *regs; - unsigned long base; - unsigned int lmb_size; - int ret = -EINVAL; + int ret; + struct resource res; /* * Check to see if we are actually removing memory @@ -324,75 +277,59 @@ static int pseries_remove_mem_node(struct device_node *np) /* * Find the base address and size of the memblock */ - regs = of_get_property(np, "reg", NULL); - if (!regs) + ret = of_address_to_resource(np, 0, &res); + if (ret) return ret; - base = be64_to_cpu(*(unsigned long *)regs); - lmb_size = be32_to_cpu(regs[3]); - - pseries_remove_memblock(base, lmb_size); + pseries_remove_memblock(res.start, resource_size(&res)); return 0; } static bool lmb_is_removable(struct drmem_lmb *lmb) { - int i, scns_per_block; - bool rc = true; - unsigned long pfn, block_sz; - u64 phys_addr; - - if (!(lmb->flags & DRCONF_MEM_ASSIGNED)) + if ((lmb->flags & DRCONF_MEM_RESERVED) || + !(lmb->flags & DRCONF_MEM_ASSIGNED)) return false; - block_sz = memory_block_size_bytes(); - scns_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE; - phys_addr = lmb->base_addr; - #ifdef CONFIG_FA_DUMP /* * Don't hot-remove memory that falls in fadump boot memory area * and memory that is reserved for capturing old kernel memory. */ - if (is_fadump_memory_area(phys_addr, block_sz)) + if (is_fadump_memory_area(lmb->base_addr, memory_block_size_bytes())) return false; #endif - - for (i = 0; i < scns_per_block; i++) { - pfn = PFN_DOWN(phys_addr); - if (!pfn_present(pfn)) - continue; - - rc = rc && is_mem_section_removable(pfn, PAGES_PER_SECTION); - phys_addr += MIN_MEMORY_BLOCK_SIZE; - } - - return rc; + /* device_offline() will determine if we can actually remove this lmb */ + return true; } static int dlpar_add_lmb(struct drmem_lmb *); static int dlpar_remove_lmb(struct drmem_lmb *lmb) { - unsigned long block_sz; + struct memory_block *mem_block; int rc; if (!lmb_is_removable(lmb)) return -EINVAL; + mem_block = lmb_to_memblock(lmb); + if (mem_block == NULL) + return -EINVAL; + rc = dlpar_offline_lmb(lmb); - if (rc) + if (rc) { + put_device(&mem_block->dev); return rc; + } - block_sz = pseries_memory_block_size(); - - __remove_memory(lmb->nid, lmb->base_addr, block_sz); + __remove_memory(lmb->base_addr, memory_block_size); + put_device(&mem_block->dev); /* Update memory regions for memory remove */ - memblock_remove(lmb->base_addr, block_sz); + memblock_remove(lmb->base_addr, memory_block_size); invalidate_lmb_associativity_index(lmb); - lmb_clear_nid(lmb); lmb->flags &= ~DRCONF_MEM_ASSIGNED; return 0; @@ -401,7 +338,7 @@ static int dlpar_remove_lmb(struct drmem_lmb *lmb) static int dlpar_memory_remove_by_count(u32 lmbs_to_remove) { struct drmem_lmb *lmb; - int lmbs_removed = 0; + int lmbs_reserved = 0; int lmbs_available = 0; int rc; @@ -435,12 +372,12 @@ static int dlpar_memory_remove_by_count(u32 lmbs_to_remove) */ drmem_mark_lmb_reserved(lmb); - lmbs_removed++; - if (lmbs_removed == lmbs_to_remove) + lmbs_reserved++; + if (lmbs_reserved == lmbs_to_remove) break; } - if (lmbs_removed != lmbs_to_remove) { + if (lmbs_reserved != lmbs_to_remove) { pr_err("Memory hot-remove failed, adding LMB's back\n"); for_each_drmem_lmb(lmb) { @@ -453,6 +390,10 @@ static int dlpar_memory_remove_by_count(u32 lmbs_to_remove) lmb->drc_index); drmem_remove_lmb_reservation(lmb); + + lmbs_reserved--; + if (lmbs_reserved == 0) + break; } rc = -EINVAL; @@ -466,6 +407,10 @@ static int dlpar_memory_remove_by_count(u32 lmbs_to_remove) lmb->base_addr); drmem_remove_lmb_reservation(lmb); + + lmbs_reserved--; + if (lmbs_reserved == 0) + break; } rc = 0; } @@ -479,7 +424,7 @@ static int dlpar_memory_remove_by_index(u32 drc_index) int lmb_found; int rc; - pr_info("Attempting to hot-remove LMB, drc index %x\n", drc_index); + pr_debug("Attempting to hot-remove LMB, drc index %x\n", drc_index); lmb_found = 0; for_each_drmem_lmb(lmb) { @@ -493,56 +438,22 @@ static int dlpar_memory_remove_by_index(u32 drc_index) } } - if (!lmb_found) + if (!lmb_found) { + pr_debug("Failed to look up LMB for drc index %x\n", drc_index); rc = -EINVAL; - - if (rc) - pr_info("Failed to hot-remove memory at %llx\n", - lmb->base_addr); - else - pr_info("Memory at %llx was hot-removed\n", lmb->base_addr); - - return rc; -} - -static int dlpar_memory_readd_by_index(u32 drc_index) -{ - struct drmem_lmb *lmb; - int lmb_found; - int rc; - - pr_info("Attempting to update LMB, drc index %x\n", drc_index); - - lmb_found = 0; - for_each_drmem_lmb(lmb) { - if (lmb->drc_index == drc_index) { - lmb_found = 1; - rc = dlpar_remove_lmb(lmb); - if (!rc) { - rc = dlpar_add_lmb(lmb); - if (rc) - dlpar_release_drc(lmb->drc_index); - } - break; - } + } else if (rc) { + pr_debug("Failed to hot-remove memory at %llx\n", + lmb->base_addr); + } else { + pr_debug("Memory at %llx was hot-removed\n", lmb->base_addr); } - if (!lmb_found) - rc = -EINVAL; - - if (rc) - pr_info("Failed to update memory at %llx\n", - lmb->base_addr); - else - pr_info("Memory at %llx was updated\n", lmb->base_addr); - return rc; } static int dlpar_memory_remove_by_ic(u32 lmbs_to_remove, u32 drc_index) { struct drmem_lmb *lmb, *start_lmb, *end_lmb; - int lmbs_available = 0; int rc; pr_info("Attempting to hot-remove %u LMB(s) at %x\n", @@ -555,18 +466,29 @@ static int dlpar_memory_remove_by_ic(u32 lmbs_to_remove, u32 drc_index) if (rc) return -EINVAL; - /* Validate that there are enough LMBs to satisfy the request */ + /* + * Validate that all LMBs in range are not reserved. Note that it + * is ok if they are !ASSIGNED since our goal here is to remove the + * LMB range, regardless of whether some LMBs were already removed + * by any other reason. + * + * This is a contrast to what is done in remove_by_count() where we + * check for both RESERVED and !ASSIGNED (via lmb_is_removable()), + * because we want to remove a fixed amount of LMBs in that function. + */ for_each_drmem_lmb_in_range(lmb, start_lmb, end_lmb) { - if (lmb->flags & DRCONF_MEM_RESERVED) - break; - - lmbs_available++; + if (lmb->flags & DRCONF_MEM_RESERVED) { + pr_err("Memory at %llx (drc index %x) is reserved\n", + lmb->base_addr, lmb->drc_index); + return -EINVAL; + } } - if (lmbs_available < lmbs_to_remove) - return -EINVAL; - for_each_drmem_lmb_in_range(lmb, start_lmb, end_lmb) { + /* + * dlpar_remove_lmb() will error out if the LMB is already + * !ASSIGNED, but this case is a no-op for us. + */ if (!(lmb->flags & DRCONF_MEM_ASSIGNED)) continue; @@ -585,6 +507,13 @@ static int dlpar_memory_remove_by_ic(u32 lmbs_to_remove, u32 drc_index) if (!drmem_lmb_reserved(lmb)) continue; + /* + * Setting the isolation state of an UNISOLATED/CONFIGURED + * device to UNISOLATE is a no-op, but the hypervisor can + * use it as a hint that the LMB removal failed. + */ + dlpar_unisolate_drc(lmb->drc_index); + rc = dlpar_add_lmb(lmb); if (rc) pr_err("Failed to add LMB, drc index %x\n", @@ -611,7 +540,7 @@ static int dlpar_memory_remove_by_ic(u32 lmbs_to_remove, u32 drc_index) #else static inline int pseries_remove_memblock(unsigned long base, - unsigned int memblock_size) + unsigned long memblock_size) { return -EOPNOTSUPP; } @@ -619,10 +548,6 @@ static inline int pseries_remove_mem_node(struct device_node *np) { return 0; } -static inline int dlpar_memory_remove(struct pseries_hp_errorlog *hp_elog) -{ - return -EOPNOTSUPP; -} static int dlpar_remove_lmb(struct drmem_lmb *lmb) { return -EOPNOTSUPP; @@ -635,10 +560,6 @@ static int dlpar_memory_remove_by_index(u32 drc_index) { return -EOPNOTSUPP; } -static int dlpar_memory_readd_by_index(u32 drc_index) -{ - return -EOPNOTSUPP; -} static int dlpar_memory_remove_by_ic(u32 lmbs_to_remove, u32 drc_index) { @@ -649,7 +570,7 @@ static int dlpar_memory_remove_by_ic(u32 lmbs_to_remove, u32 drc_index) static int dlpar_add_lmb(struct drmem_lmb *lmb) { unsigned long block_sz; - int rc; + int nid, rc; if (lmb->flags & DRCONF_MEM_ASSIGNED) return -EINVAL; @@ -657,24 +578,30 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb) rc = update_lmb_associativity_index(lmb); if (rc) { dlpar_release_drc(lmb->drc_index); + pr_err("Failed to configure LMB 0x%x\n", lmb->drc_index); return rc; } - lmb_set_nid(lmb); block_sz = memory_block_size_bytes(); + /* Find the node id for this LMB. Fake one if necessary. */ + nid = of_drconf_to_nid_single(lmb); + if (nid < 0 || !node_possible(nid)) + nid = first_online_node; + /* Add the memory */ - rc = __add_memory(lmb->nid, lmb->base_addr, block_sz); + rc = __add_memory(nid, lmb->base_addr, block_sz, MHP_MEMMAP_ON_MEMORY); if (rc) { + pr_err("Failed to add LMB 0x%x to node %u", lmb->drc_index, nid); invalidate_lmb_associativity_index(lmb); return rc; } rc = dlpar_online_lmb(lmb); if (rc) { - __remove_memory(lmb->nid, lmb->base_addr, block_sz); + pr_err("Failed to online LMB 0x%x on node %u\n", lmb->drc_index, nid); + __remove_memory(lmb->base_addr, block_sz); invalidate_lmb_associativity_index(lmb); - lmb_clear_nid(lmb); } else { lmb->flags |= DRCONF_MEM_ASSIGNED; } @@ -686,7 +613,7 @@ static int dlpar_memory_add_by_count(u32 lmbs_to_add) { struct drmem_lmb *lmb; int lmbs_available = 0; - int lmbs_added = 0; + int lmbs_reserved = 0; int rc; pr_info("Attempting to hot-add %d LMB(s)\n", lmbs_to_add); @@ -696,6 +623,9 @@ static int dlpar_memory_add_by_count(u32 lmbs_to_add) /* Validate that there are enough LMBs to satisfy the request */ for_each_drmem_lmb(lmb) { + if (lmb->flags & DRCONF_MEM_RESERVED) + continue; + if (!(lmb->flags & DRCONF_MEM_ASSIGNED)) lmbs_available++; @@ -724,13 +654,12 @@ static int dlpar_memory_add_by_count(u32 lmbs_to_add) * requested LMBs cannot be added. */ drmem_mark_lmb_reserved(lmb); - - lmbs_added++; - if (lmbs_added == lmbs_to_add) + lmbs_reserved++; + if (lmbs_reserved == lmbs_to_add) break; } - if (lmbs_added != lmbs_to_add) { + if (lmbs_reserved != lmbs_to_add) { pr_err("Memory hot-add failed, removing any added LMBs\n"); for_each_drmem_lmb(lmb) { @@ -745,6 +674,10 @@ static int dlpar_memory_add_by_count(u32 lmbs_to_add) dlpar_release_drc(lmb->drc_index); drmem_remove_lmb_reservation(lmb); + lmbs_reserved--; + + if (lmbs_reserved == 0) + break; } rc = -EINVAL; } else { @@ -752,9 +685,13 @@ static int dlpar_memory_add_by_count(u32 lmbs_to_add) if (!drmem_lmb_reserved(lmb)) continue; - pr_info("Memory at %llx (drc index %x) was hot-added\n", - lmb->base_addr, lmb->drc_index); + pr_debug("Memory at %llx (drc index %x) was hot-added\n", + lmb->base_addr, lmb->drc_index); drmem_remove_lmb_reservation(lmb); + lmbs_reserved--; + + if (lmbs_reserved == 0) + break; } rc = 0; } @@ -799,7 +736,6 @@ static int dlpar_memory_add_by_index(u32 drc_index) static int dlpar_memory_add_by_ic(u32 lmbs_to_add, u32 drc_index) { struct drmem_lmb *lmb, *start_lmb, *end_lmb; - int lmbs_available = 0; int rc; pr_info("Attempting to hot-add %u LMB(s) at index %x\n", @@ -814,15 +750,14 @@ static int dlpar_memory_add_by_ic(u32 lmbs_to_add, u32 drc_index) /* Validate that the LMBs in this range are not reserved */ for_each_drmem_lmb_in_range(lmb, start_lmb, end_lmb) { - if (lmb->flags & DRCONF_MEM_RESERVED) - break; - - lmbs_available++; + /* Fail immediately if the whole range can't be hot-added */ + if (lmb->flags & DRCONF_MEM_RESERVED) { + pr_err("Memory at %llx (drc index %x) is reserved\n", + lmb->base_addr, lmb->drc_index); + return -EINVAL; + } } - if (lmbs_available < lmbs_to_add) - return -EINVAL; - for_each_drmem_lmb_in_range(lmb, start_lmb, end_lmb) { if (lmb->flags & DRCONF_MEM_ASSIGNED) continue; @@ -921,21 +856,14 @@ int dlpar_memory(struct pseries_hp_errorlog *hp_elog) } break; - case PSERIES_HP_ELOG_ACTION_READD: - drc_index = hp_elog->_drc_u.drc_index; - rc = dlpar_memory_readd_by_index(drc_index); - break; default: pr_err("Invalid action (%d) specified\n", hp_elog->action); rc = -EINVAL; break; } - if (!rc) { - rtas_hp_event = true; + if (!rc) rc = drmem_update_dt(); - rtas_hp_event = false; - } unlock_device_hotplug(); return rc; @@ -943,10 +871,8 @@ int dlpar_memory(struct pseries_hp_errorlog *hp_elog) static int pseries_add_mem_node(struct device_node *np) { - const __be32 *regs; - unsigned long base; - unsigned int lmb_size; - int ret = -EINVAL; + int ret; + struct resource res; /* * Check to see if we are actually adding memory @@ -957,74 +883,17 @@ static int pseries_add_mem_node(struct device_node *np) /* * Find the base and size of the memblock */ - regs = of_get_property(np, "reg", NULL); - if (!regs) + ret = of_address_to_resource(np, 0, &res); + if (ret) return ret; - base = be64_to_cpu(*(unsigned long *)regs); - lmb_size = be32_to_cpu(regs[3]); - /* * Update memory region to represent the memory add */ - ret = memblock_add(base, lmb_size); + ret = memblock_add(res.start, resource_size(&res)); return (ret < 0) ? -EINVAL : 0; } -static int pseries_update_drconf_memory(struct of_reconfig_data *pr) -{ - struct of_drconf_cell_v1 *new_drmem, *old_drmem; - unsigned long memblock_size; - u32 entries; - __be32 *p; - int i, rc = -EINVAL; - - if (rtas_hp_event) - return 0; - - memblock_size = pseries_memory_block_size(); - if (!memblock_size) - return -EINVAL; - - if (!pr->old_prop) - return 0; - - p = (__be32 *) pr->old_prop->value; - if (!p) - return -EINVAL; - - /* The first int of the property is the number of lmb's described - * by the property. This is followed by an array of of_drconf_cell - * entries. Get the number of entries and skip to the array of - * of_drconf_cell's. - */ - entries = be32_to_cpu(*p++); - old_drmem = (struct of_drconf_cell_v1 *)p; - - p = (__be32 *)pr->prop->value; - p++; - new_drmem = (struct of_drconf_cell_v1 *)p; - - for (i = 0; i < entries; i++) { - if ((be32_to_cpu(old_drmem[i].flags) & DRCONF_MEM_ASSIGNED) && - (!(be32_to_cpu(new_drmem[i].flags) & DRCONF_MEM_ASSIGNED))) { - rc = pseries_remove_memblock( - be64_to_cpu(old_drmem[i].base_addr), - memblock_size); - break; - } else if ((!(be32_to_cpu(old_drmem[i].flags) & - DRCONF_MEM_ASSIGNED)) && - (be32_to_cpu(new_drmem[i].flags) & - DRCONF_MEM_ASSIGNED)) { - rc = memblock_add(be64_to_cpu(old_drmem[i].base_addr), - memblock_size); - rc = (rc < 0) ? -EINVAL : 0; - break; - } - } - return rc; -} - static int pseries_memory_notifier(struct notifier_block *nb, unsigned long action, void *data) { @@ -1039,9 +908,9 @@ static int pseries_memory_notifier(struct notifier_block *nb, err = pseries_remove_mem_node(rd->dn); break; case OF_RECONFIG_UPDATE_PROPERTY: - if (!strcmp(rd->prop->name, "ibm,dynamic-memory")) - err = pseries_update_drconf_memory(rd); - break; + if (!strcmp(rd->dn->name, + "ibm,dynamic-reconfiguration-memory")) + drmem_update_lmbs(rd->prop); } return notifier_from_errno(err); } diff --git a/arch/powerpc/platforms/pseries/hvCall.S b/arch/powerpc/platforms/pseries/hvCall.S index 2136e42833af..2b0cac6fb61f 100644 --- a/arch/powerpc/platforms/pseries/hvCall.S +++ b/arch/powerpc/platforms/pseries/hvCall.S @@ -16,7 +16,7 @@ #ifdef CONFIG_TRACEPOINTS #ifndef CONFIG_JUMP_LABEL - .section ".toc","aw" + .data .globl hcall_tracepoint_refcount hcall_tracepoint_refcount: @@ -27,7 +27,9 @@ hcall_tracepoint_refcount: /* * precall must preserve all registers. use unused STK_PARAM() - * areas to save snapshots and opcode. + * areas to save snapshots and opcode. STK_PARAM() in the caller's + * frame will be available even on ELFv2 because these are all + * variadic functions. */ #define HCALL_INST_PRECALL(FIRST_REG) \ mflr r0; \ @@ -41,29 +43,29 @@ hcall_tracepoint_refcount: std r10,STK_PARAM(R10)(r1); \ std r0,16(r1); \ addi r4,r1,STK_PARAM(FIRST_REG); \ - stdu r1,-STACK_FRAME_OVERHEAD(r1); \ - bl __trace_hcall_entry; \ - ld r3,STACK_FRAME_OVERHEAD+STK_PARAM(R3)(r1); \ - ld r4,STACK_FRAME_OVERHEAD+STK_PARAM(R4)(r1); \ - ld r5,STACK_FRAME_OVERHEAD+STK_PARAM(R5)(r1); \ - ld r6,STACK_FRAME_OVERHEAD+STK_PARAM(R6)(r1); \ - ld r7,STACK_FRAME_OVERHEAD+STK_PARAM(R7)(r1); \ - ld r8,STACK_FRAME_OVERHEAD+STK_PARAM(R8)(r1); \ - ld r9,STACK_FRAME_OVERHEAD+STK_PARAM(R9)(r1); \ - ld r10,STACK_FRAME_OVERHEAD+STK_PARAM(R10)(r1) + stdu r1,-STACK_FRAME_MIN_SIZE(r1); \ + bl CFUNC(__trace_hcall_entry); \ + ld r3,STACK_FRAME_MIN_SIZE+STK_PARAM(R3)(r1); \ + ld r4,STACK_FRAME_MIN_SIZE+STK_PARAM(R4)(r1); \ + ld r5,STACK_FRAME_MIN_SIZE+STK_PARAM(R5)(r1); \ + ld r6,STACK_FRAME_MIN_SIZE+STK_PARAM(R6)(r1); \ + ld r7,STACK_FRAME_MIN_SIZE+STK_PARAM(R7)(r1); \ + ld r8,STACK_FRAME_MIN_SIZE+STK_PARAM(R8)(r1); \ + ld r9,STACK_FRAME_MIN_SIZE+STK_PARAM(R9)(r1); \ + ld r10,STACK_FRAME_MIN_SIZE+STK_PARAM(R10)(r1) /* * postcall is performed immediately before function return which * allows liberal use of volatile registers. */ #define __HCALL_INST_POSTCALL \ - ld r0,STACK_FRAME_OVERHEAD+STK_PARAM(R3)(r1); \ - std r3,STACK_FRAME_OVERHEAD+STK_PARAM(R3)(r1); \ + ld r0,STACK_FRAME_MIN_SIZE+STK_PARAM(R3)(r1); \ + std r3,STACK_FRAME_MIN_SIZE+STK_PARAM(R3)(r1); \ mr r4,r3; \ mr r3,r0; \ - bl __trace_hcall_exit; \ - ld r0,STACK_FRAME_OVERHEAD+16(r1); \ - addi r1,r1,STACK_FRAME_OVERHEAD; \ + bl CFUNC(__trace_hcall_exit); \ + ld r0,STACK_FRAME_MIN_SIZE+16(r1); \ + addi r1,r1,STACK_FRAME_MIN_SIZE; \ ld r3,STK_PARAM(R3)(r1); \ mtlr r0 @@ -88,8 +90,8 @@ hcall_tracepoint_refcount: BEGIN_FTR_SECTION; \ b 1f; \ END_FTR_SECTION(0, 1); \ - ld r12,hcall_tracepoint_refcount@toc(r2); \ - std r12,32(r1); \ + LOAD_REG_ADDR(r12, hcall_tracepoint_refcount) ; \ + ld r12,0(r12); \ cmpdi r12,0; \ bne- LABEL; \ 1: @@ -102,6 +104,20 @@ END_FTR_SECTION(0, 1); \ #define HCALL_BRANCH(LABEL) #endif +_GLOBAL_TOC(plpar_hcall_norets_notrace) + HMT_MEDIUM + + mfcr r0 + stw r0,8(r1) + HVSC /* invoke the hypervisor */ + + li r4,0 + stb r4,PACASRR_VALID(r13) + + lwz r0,8(r1) + mtcrf 0xff,r0 + blr /* return r3 = status */ + _GLOBAL_TOC(plpar_hcall_norets) HMT_MEDIUM @@ -110,6 +126,9 @@ _GLOBAL_TOC(plpar_hcall_norets) HCALL_BRANCH(plpar_hcall_norets_trace) HVSC /* invoke the hypervisor */ + li r4,0 + stb r4,PACASRR_VALID(r13) + lwz r0,8(r1) mtcrf 0xff,r0 blr /* return r3 = status */ @@ -119,6 +138,10 @@ plpar_hcall_norets_trace: HCALL_INST_PRECALL(R4) HVSC HCALL_INST_POSTCALL_NORETS + + li r4,0 + stb r4,PACASRR_VALID(r13) + lwz r0,8(r1) mtcrf 0xff,r0 blr @@ -149,6 +172,9 @@ _GLOBAL_TOC(plpar_hcall) std r6, 16(r12) std r7, 24(r12) + li r4,0 + stb r4,PACASRR_VALID(r13) + lwz r0,8(r1) mtcrf 0xff,r0 @@ -158,9 +184,6 @@ _GLOBAL_TOC(plpar_hcall) plpar_hcall_trace: HCALL_INST_PRECALL(R5) - std r4,STK_PARAM(R4)(r1) - mr r0,r4 - mr r4,r5 mr r5,r6 mr r6,r7 @@ -170,7 +193,7 @@ plpar_hcall_trace: HVSC - ld r12,STK_PARAM(R4)(r1) + ld r12,STACK_FRAME_MIN_SIZE+STK_PARAM(R4)(r1) std r4,0(r12) std r5,8(r12) std r6,16(r12) @@ -178,6 +201,9 @@ plpar_hcall_trace: HCALL_INST_POSTCALL(r12) + li r4,0 + stb r4,PACASRR_VALID(r13) + lwz r0,8(r1) mtcrf 0xff,r0 @@ -213,6 +239,9 @@ _GLOBAL(plpar_hcall_raw) std r6, 16(r12) std r7, 24(r12) + li r4,0 + stb r4,PACASRR_VALID(r13) + lwz r0,8(r1) mtcrf 0xff,r0 @@ -252,6 +281,9 @@ _GLOBAL_TOC(plpar_hcall9) std r11,56(r12) std r0, 64(r12) + li r4,0 + stb r4,PACASRR_VALID(r13) + lwz r0,8(r1) mtcrf 0xff,r0 @@ -261,23 +293,20 @@ _GLOBAL_TOC(plpar_hcall9) plpar_hcall9_trace: HCALL_INST_PRECALL(R5) - std r4,STK_PARAM(R4)(r1) - mr r0,r4 - mr r4,r5 mr r5,r6 mr r6,r7 mr r7,r8 mr r8,r9 mr r9,r10 - ld r10,STACK_FRAME_OVERHEAD+STK_PARAM(R11)(r1) - ld r11,STACK_FRAME_OVERHEAD+STK_PARAM(R12)(r1) - ld r12,STACK_FRAME_OVERHEAD+STK_PARAM(R13)(r1) + ld r10,STACK_FRAME_MIN_SIZE+STK_PARAM(R11)(r1) + ld r11,STACK_FRAME_MIN_SIZE+STK_PARAM(R12)(r1) + ld r12,STACK_FRAME_MIN_SIZE+STK_PARAM(R13)(r1) HVSC mr r0,r12 - ld r12,STACK_FRAME_OVERHEAD+STK_PARAM(R4)(r1) + ld r12,STACK_FRAME_MIN_SIZE+STK_PARAM(R4)(r1) std r4,0(r12) std r5,8(r12) std r6,16(r12) @@ -290,6 +319,9 @@ plpar_hcall9_trace: HCALL_INST_POSTCALL(r12) + li r4,0 + stb r4,PACASRR_VALID(r13) + lwz r0,8(r1) mtcrf 0xff,r0 @@ -329,6 +361,9 @@ _GLOBAL(plpar_hcall9_raw) std r11,56(r12) std r0, 64(r12) + li r4,0 + stb r4,PACASRR_VALID(r13) + lwz r0,8(r1) mtcrf 0xff,r0 diff --git a/arch/powerpc/platforms/pseries/hvCall_inst.c b/arch/powerpc/platforms/pseries/hvCall_inst.c index c40c62ec432e..3a50612a78db 100644 --- a/arch/powerpc/platforms/pseries/hvCall_inst.c +++ b/arch/powerpc/platforms/pseries/hvCall_inst.c @@ -26,7 +26,7 @@ struct hcall_stats { }; #define HCALL_STAT_ARRAY_SIZE ((MAX_HCALL_OPCODE >> 2) + 1) -DEFINE_PER_CPU(struct hcall_stats[HCALL_STAT_ARRAY_SIZE], hcall_stats); +static DEFINE_PER_CPU(struct hcall_stats[HCALL_STAT_ARRAY_SIZE], hcall_stats); /* * Routines for displaying the statistics in debugfs @@ -70,31 +70,14 @@ static int hc_show(struct seq_file *m, void *p) return 0; } -static const struct seq_operations hcall_inst_seq_ops = { +static const struct seq_operations hcall_inst_sops = { .start = hc_start, .next = hc_next, .stop = hc_stop, .show = hc_show }; -static int hcall_inst_seq_open(struct inode *inode, struct file *file) -{ - int rc; - struct seq_file *seq; - - rc = seq_open(file, &hcall_inst_seq_ops); - seq = file->private_data; - seq->private = file_inode(file)->i_private; - - return rc; -} - -static const struct file_operations hcall_inst_seq_fops = { - .open = hcall_inst_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; +DEFINE_SEQ_ATTRIBUTE(hcall_inst); #define HCALL_ROOT_DIR "hcall_inst" #define CPU_NAME_BUF_SIZE 32 @@ -149,7 +132,7 @@ static int __init hcall_inst_init(void) snprintf(cpu_name_buf, CPU_NAME_BUF_SIZE, "cpu%d", cpu); debugfs_create_file(cpu_name_buf, 0444, hcall_root, per_cpu(hcall_stats, cpu), - &hcall_inst_seq_fops); + &hcall_inst_fops); } return 0; diff --git a/arch/powerpc/platforms/pseries/hvconsole.c b/arch/powerpc/platforms/pseries/hvconsole.c index 1ac52963e08b..8803c947998e 100644 --- a/arch/powerpc/platforms/pseries/hvconsole.c +++ b/arch/powerpc/platforms/pseries/hvconsole.c @@ -25,7 +25,7 @@ * firmware. * @count: not used? */ -int hvc_get_chars(uint32_t vtermno, char *buf, int count) +ssize_t hvc_get_chars(uint32_t vtermno, u8 *buf, size_t count) { long ret; unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; @@ -52,7 +52,7 @@ EXPORT_SYMBOL(hvc_get_chars); * firmware. Must be at least 16 bytes, even if count is less than 16. * @count: Send this number of characters. */ -int hvc_put_chars(uint32_t vtermno, const char *buf, int count) +ssize_t hvc_put_chars(uint32_t vtermno, const u8 *buf, size_t count) { unsigned long *lbuf = (unsigned long *) buf; long ret; diff --git a/arch/powerpc/platforms/pseries/hvcserver.c b/arch/powerpc/platforms/pseries/hvcserver.c index 267139b13530..d48c9c7ce10f 100644 --- a/arch/powerpc/platforms/pseries/hvcserver.c +++ b/arch/powerpc/platforms/pseries/hvcserver.c @@ -45,7 +45,7 @@ static int hvcs_convert(long to_convert) case H_LONG_BUSY_ORDER_10_SEC: case H_LONG_BUSY_ORDER_100_SEC: return -EBUSY; - case H_FUNCTION: /* fall through */ + case H_FUNCTION: default: return -EPERM; } @@ -176,7 +176,7 @@ int hvcs_get_partner_info(uint32_t unit_address, struct list_head *head, = (unsigned int)last_p_partition_ID; /* copy the Null-term char too */ - strlcpy(&next_partner_info->location_code[0], + strscpy(&next_partner_info->location_code[0], (char *)&pi_buff[2], sizeof(next_partner_info->location_code)); diff --git a/arch/powerpc/platforms/pseries/ibmebus.c b/arch/powerpc/platforms/pseries/ibmebus.c index b91eb0929ed1..b401282727a4 100644 --- a/arch/powerpc/platforms/pseries/ibmebus.c +++ b/arch/powerpc/platforms/pseries/ibmebus.c @@ -40,19 +40,22 @@ #include <linux/export.h> #include <linux/console.h> #include <linux/kobject.h> -#include <linux/dma-mapping.h> +#include <linux/dma-map-ops.h> #include <linux/interrupt.h> +#include <linux/irqdomain.h> #include <linux/of.h> #include <linux/slab.h> #include <linux/stat.h> #include <linux/of_platform.h> +#include <linux/platform_device.h> #include <asm/ibmebus.h> +#include <asm/machdep.h> static struct device ibmebus_bus_device = { /* fake "parent" device */ .init_name = "ibmebus", }; -struct bus_type ibmebus_bus_type; +const struct bus_type ibmebus_bus_type; /* These devices will automatically be added to the bus during init */ static const struct of_device_id ibmebus_matches[] __initconst = { @@ -150,7 +153,11 @@ static const struct dma_map_ops ibmebus_dma_ops = { static int ibmebus_match_path(struct device *dev, const void *data) { struct device_node *dn = to_platform_device(dev)->dev.of_node; - return (of_find_node_by_path(data) == dn); + struct device_node *tn = of_find_node_by_path(data); + + of_node_put(tn); + + return (tn == dn); } static int ibmebus_match_node(struct device *dev, const void *data) @@ -261,7 +268,7 @@ static char *ibmebus_chomp(const char *in, size_t count) return out; } -static ssize_t probe_store(struct bus_type *bus, const char *buf, size_t count) +static ssize_t probe_store(const struct bus_type *bus, const char *buf, size_t count) { struct device_node *dn = NULL; struct device *dev; @@ -299,7 +306,7 @@ out: } static BUS_ATTR_WO(probe); -static ssize_t remove_store(struct bus_type *bus, const char *buf, size_t count) +static ssize_t remove_store(const struct bus_type *bus, const char *buf, size_t count) { struct device *dev; char *path; @@ -354,24 +361,23 @@ static int ibmebus_bus_device_probe(struct device *dev) if (!drv->probe) return error; - of_dev_get(of_dev); + get_device(dev); if (of_driver_match_device(dev, dev->driver)) error = drv->probe(of_dev); if (error) - of_dev_put(of_dev); + put_device(dev); return error; } -static int ibmebus_bus_device_remove(struct device *dev) +static void ibmebus_bus_device_remove(struct device *dev) { struct platform_device *of_dev = to_platform_device(dev); struct platform_driver *drv = to_platform_driver(dev->driver); if (dev->driver && drv->remove) drv->remove(of_dev); - return 0; } static void ibmebus_bus_device_shutdown(struct device *dev) @@ -421,9 +427,14 @@ static struct attribute *ibmebus_bus_device_attrs[] = { }; ATTRIBUTE_GROUPS(ibmebus_bus_device); -struct bus_type ibmebus_bus_type = { +static int ibmebus_bus_modalias(const struct device *dev, struct kobj_uevent_env *env) +{ + return of_device_uevent_modalias(dev, env); +} + +const struct bus_type ibmebus_bus_type = { .name = "ibmebus", - .uevent = of_device_uevent_modalias, + .uevent = ibmebus_bus_modalias, .bus_groups = ibmbus_bus_groups, .match = ibmebus_bus_bus_match, .probe = ibmebus_bus_device_probe, @@ -450,6 +461,7 @@ static int __init ibmebus_bus_init(void) if (err) { printk(KERN_WARNING "%s: device_register returned %i\n", __func__, err); + put_device(&ibmebus_bus_device); bus_unregister(&ibmebus_bus_type); return err; @@ -464,4 +476,4 @@ static int __init ibmebus_bus_init(void) return 0; } -postcore_initcall(ibmebus_bus_init); +machine_postcore_initcall(pseries, ibmebus_bus_init); diff --git a/arch/powerpc/platforms/pseries/io_event_irq.c b/arch/powerpc/platforms/pseries/io_event_irq.c index 7b74d4d34e9a..f411d4fe7b24 100644 --- a/arch/powerpc/platforms/pseries/io_event_irq.c +++ b/arch/powerpc/platforms/pseries/io_event_irq.c @@ -143,7 +143,7 @@ static int __init ioei_init(void) { struct device_node *np; - ioei_check_exception_token = rtas_token("check-exception"); + ioei_check_exception_token = rtas_function_token(RTAS_FN_CHECK_EXCEPTION); if (ioei_check_exception_token == RTAS_UNKNOWN_SERVICE) return -ENODEV; diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 6ba081dd61c9..e8c4129697b1 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -22,6 +22,7 @@ #include <linux/crash_dump.h> #include <linux/memory.h> #include <linux/of.h> +#include <linux/of_address.h> #include <linux/iommu.h> #include <linux/rculist.h> #include <asm/io.h> @@ -36,32 +37,53 @@ #include <asm/udbg.h> #include <asm/mmzone.h> #include <asm/plpar_wrappers.h> -#include <asm/svm.h> #include "pseries.h" -static struct iommu_table_group *iommu_pseries_alloc_group(int node) +enum { + DDW_QUERY_PE_DMA_WIN = 0, + DDW_CREATE_PE_DMA_WIN = 1, + DDW_REMOVE_PE_DMA_WIN = 2, + + DDW_APPLICABLE_SIZE +}; + +enum { + DDW_EXT_SIZE = 0, + DDW_EXT_RESET_DMA_WIN = 1, + DDW_EXT_QUERY_OUT_SIZE = 2 +}; + +static struct iommu_table *iommu_pseries_alloc_table(int node) { - struct iommu_table_group *table_group; struct iommu_table *tbl; - table_group = kzalloc_node(sizeof(struct iommu_table_group), GFP_KERNEL, - node); - if (!table_group) - return NULL; - tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, node); if (!tbl) - goto free_group; + return NULL; INIT_LIST_HEAD_RCU(&tbl->it_group_list); kref_init(&tbl->it_kref); + return tbl; +} + +static struct iommu_table_group *iommu_pseries_alloc_group(int node) +{ + struct iommu_table_group *table_group; - table_group->tables[0] = tbl; + table_group = kzalloc_node(sizeof(*table_group), GFP_KERNEL, node); + if (!table_group) + return NULL; + +#ifdef CONFIG_IOMMU_API + table_group->ops = &spapr_tce_table_group_ops; + table_group->pgsizes = SZ_4K; +#endif - return table_group; + table_group->tables[0] = iommu_pseries_alloc_table(node); + if (table_group->tables[0]) + return table_group; -free_group: kfree(table_group); return NULL; } @@ -69,19 +91,24 @@ free_group: static void iommu_pseries_free_group(struct iommu_table_group *table_group, const char *node_name) { - struct iommu_table *tbl; - if (!table_group) return; - tbl = table_group->tables[0]; #ifdef CONFIG_IOMMU_API if (table_group->group) { iommu_group_put(table_group->group); BUG_ON(table_group->group); } #endif - iommu_tce_table_put(tbl); + + /* Default DMA window table is at index 0, while DDW at 1. SR-IOV + * adapters only have table on index 1. + */ + if (table_group->tables[0]) + iommu_tce_table_put(table_group->tables[0]); + + if (table_group->tables[1]) + iommu_tce_table_put(table_group->tables[1]); kfree(table_group); } @@ -94,6 +121,8 @@ static int tce_build_pSeries(struct iommu_table *tbl, long index, u64 proto_tce; __be64 *tcep; u64 rpn; + const unsigned long tceshift = tbl->it_page_shift; + const unsigned long pagesize = IOMMU_PAGE_SIZE(tbl); proto_tce = TCE_PCI_READ; // Read allowed @@ -104,10 +133,10 @@ static int tce_build_pSeries(struct iommu_table *tbl, long index, while (npages--) { /* can't move this out since we might cross MEMBLOCK boundary */ - rpn = __pa(uaddr) >> TCE_SHIFT; - *tcep = cpu_to_be64(proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT); + rpn = __pa(uaddr) >> tceshift; + *tcep = cpu_to_be64(proto_tce | rpn << tceshift); - uaddr += TCE_PAGE_SIZE; + uaddr += pagesize; tcep++; } return 0; @@ -133,10 +162,10 @@ static unsigned long tce_get_pseries(struct iommu_table *tbl, long index) return be64_to_cpu(*tcep); } -static void tce_free_pSeriesLP(struct iommu_table*, long, long); +static void tce_free_pSeriesLP(unsigned long liobn, long, long, long); static void tce_freemulti_pSeriesLP(struct iommu_table*, long, long); -static int tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum, +static int tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift, long npages, unsigned long uaddr, enum dma_data_direction direction, unsigned long attrs) @@ -147,25 +176,25 @@ static int tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum, int ret = 0; long tcenum_start = tcenum, npages_start = npages; - rpn = __pa(uaddr) >> TCE_SHIFT; + rpn = __pa(uaddr) >> tceshift; proto_tce = TCE_PCI_READ; if (direction != DMA_TO_DEVICE) proto_tce |= TCE_PCI_WRITE; while (npages--) { - tce = proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT; - rc = plpar_tce_put((u64)tbl->it_index, (u64)tcenum << 12, tce); + tce = proto_tce | rpn << tceshift; + rc = plpar_tce_put((u64)liobn, (u64)tcenum << tceshift, tce); if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) { ret = (int)rc; - tce_free_pSeriesLP(tbl, tcenum_start, + tce_free_pSeriesLP(liobn, tcenum_start, tceshift, (npages_start - (npages + 1))); break; } if (rc && printk_ratelimit()) { printk("tce_build_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc); - printk("\tindex = 0x%llx\n", (u64)tbl->it_index); + printk("\tindex = 0x%llx\n", (u64)liobn); printk("\ttcenum = 0x%llx\n", (u64)tcenum); printk("\ttce val = 0x%llx\n", tce ); dump_stack(); @@ -192,9 +221,11 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long tcenum_start = tcenum, npages_start = npages; int ret = 0; unsigned long flags; + const unsigned long tceshift = tbl->it_page_shift; - if ((npages == 1) || !firmware_has_feature(FW_FEATURE_MULTITCE)) { - return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr, + if ((npages == 1) || !firmware_has_feature(FW_FEATURE_PUT_TCE_IND)) { + return tce_build_pSeriesLP(tbl->it_index, tcenum, + tceshift, npages, uaddr, direction, attrs); } @@ -210,13 +241,14 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, /* If allocation fails, fall back to the loop implementation */ if (!tcep) { local_irq_restore(flags); - return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr, - direction, attrs); + return tce_build_pSeriesLP(tbl->it_index, tcenum, + tceshift, + npages, uaddr, direction, attrs); } __this_cpu_write(tce_page, tcep); } - rpn = __pa(uaddr) >> TCE_SHIFT; + rpn = __pa(uaddr) >> tceshift; proto_tce = TCE_PCI_READ; if (direction != DMA_TO_DEVICE) proto_tce |= TCE_PCI_WRITE; @@ -227,15 +259,15 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, * Set up the page with TCE data, looping through and setting * the values. */ - limit = min_t(long, npages, 4096/TCE_ENTRY_SIZE); + limit = min_t(long, npages, 4096 / TCE_ENTRY_SIZE); for (l = 0; l < limit; l++) { - tcep[l] = cpu_to_be64(proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT); + tcep[l] = cpu_to_be64(proto_tce | rpn << tceshift); rpn++; } rc = plpar_tce_put_indirect((u64)tbl->it_index, - (u64)tcenum << 12, + (u64)tcenum << tceshift, (u64)__pa(tcep), limit); @@ -262,16 +294,17 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, return ret; } -static void tce_free_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages) +static void tce_free_pSeriesLP(unsigned long liobn, long tcenum, long tceshift, + long npages) { u64 rc; while (npages--) { - rc = plpar_tce_put((u64)tbl->it_index, (u64)tcenum << 12, 0); + rc = plpar_tce_put((u64)liobn, (u64)tcenum << tceshift, 0); if (rc && printk_ratelimit()) { printk("tce_free_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc); - printk("\tindex = 0x%llx\n", (u64)tbl->it_index); + printk("\tindex = 0x%llx\n", (u64)liobn); printk("\ttcenum = 0x%llx\n", (u64)tcenum); dump_stack(); } @@ -284,11 +317,22 @@ static void tce_free_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages static void tce_freemulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages) { u64 rc; + long rpages = npages; + unsigned long limit; - if (!firmware_has_feature(FW_FEATURE_MULTITCE)) - return tce_free_pSeriesLP(tbl, tcenum, npages); + if (!firmware_has_feature(FW_FEATURE_STUFF_TCE)) + return tce_free_pSeriesLP(tbl->it_index, tcenum, + tbl->it_page_shift, npages); - rc = plpar_tce_stuff((u64)tbl->it_index, (u64)tcenum << 12, 0, npages); + do { + limit = min_t(unsigned long, rpages, 512); + + rc = plpar_tce_stuff((u64)tbl->it_index, + (u64)tcenum << tbl->it_page_shift, 0, limit); + + rpages -= limit; + tcenum += limit; + } while (rpages > 0 && !rc); if (rc && printk_ratelimit()) { printk("tce_freemulti_pSeriesLP: plpar_tce_stuff failed\n"); @@ -304,7 +348,8 @@ static unsigned long tce_get_pSeriesLP(struct iommu_table *tbl, long tcenum) u64 rc; unsigned long tce_ret; - rc = plpar_tce_get((u64)tbl->it_index, (u64)tcenum << 12, &tce_ret); + rc = plpar_tce_get((u64)tbl->it_index, + (u64)tcenum << tbl->it_page_shift, &tce_ret); if (rc && printk_ratelimit()) { printk("tce_get_pSeriesLP: plpar_tce_get failed. rc=%lld\n", rc); @@ -324,16 +369,17 @@ struct dynamic_dma_window_prop { __be32 window_shift; /* ilog2(tce_window_size) */ }; -struct direct_window { +struct dma_win { struct device_node *device; const struct dynamic_dma_window_prop *prop; + bool direct; struct list_head list; }; /* Dynamic DMA Window support */ struct ddw_query_response { u32 windows_available; - u32 largest_available_block; + u64 largest_available_block; u32 page_size; u32 migration_capable; }; @@ -344,12 +390,11 @@ struct ddw_create_response { u32 addr_lo; }; -static LIST_HEAD(direct_window_list); +static LIST_HEAD(dma_win_list); /* prevents races between memory on/offline and window creation */ -static DEFINE_SPINLOCK(direct_window_list_lock); +static DEFINE_SPINLOCK(dma_win_list_lock); /* protects initializing window twice for same device */ -static DEFINE_MUTEX(direct_window_init_mutex); -#define DIRECT64_PROPNAME "linux,direct64-ddr-window-info" +static DEFINE_MUTEX(dma_win_init_mutex); static int tce_clearrange_multi_pSeriesLP(unsigned long start_pfn, unsigned long num_pfn, const void *arg) @@ -401,6 +446,19 @@ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, u64 rc = 0; long l, limit; + if (!firmware_has_feature(FW_FEATURE_PUT_TCE_IND)) { + unsigned long tceshift = be32_to_cpu(maprange->tce_shift); + unsigned long dmastart = (start_pfn << PAGE_SHIFT) + + be64_to_cpu(maprange->dma_base); + unsigned long tcenum = dmastart >> tceshift; + unsigned long npages = num_pfn << PAGE_SHIFT >> tceshift; + void *uaddr = __va(start_pfn << PAGE_SHIFT); + + return tce_build_pSeriesLP(be32_to_cpu(maprange->liobn), + tcenum, tceshift, npages, (unsigned long) uaddr, + DMA_BIDIRECTIONAL, 0); + } + local_irq_disable(); /* to protect tcep and the page behind it */ tcep = __this_cpu_read(tce_page); @@ -435,7 +493,7 @@ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, * Set up the page with TCE data, looping through and setting * the values. */ - limit = min_t(long, num_tce, 4096/TCE_ENTRY_SIZE); + limit = min_t(long, num_tce, 4096 / TCE_ENTRY_SIZE); dma_offset = next + be64_to_cpu(maprange->dma_base); for (l = 0; l < limit; l++) { @@ -463,6 +521,24 @@ static int tce_setrange_multi_pSeriesLP_walk(unsigned long start_pfn, return tce_setrange_multi_pSeriesLP(start_pfn, num_pfn, arg); } +static void iommu_table_setparms_common(struct iommu_table *tbl, unsigned long busno, + unsigned long liobn, unsigned long win_addr, + unsigned long window_size, unsigned long page_shift, + void *base, struct iommu_table_ops *table_ops) +{ + tbl->it_busno = busno; + tbl->it_index = liobn; + tbl->it_offset = win_addr >> page_shift; + tbl->it_size = window_size >> page_shift; + tbl->it_page_shift = page_shift; + tbl->it_base = (unsigned long)base; + tbl->it_blocksize = 16; + tbl->it_type = TCE_PCI; + tbl->it_ops = table_ops; +} + +struct iommu_table_ops iommu_table_pseries_ops; + static void iommu_table_setparms(struct pci_controller *phb, struct device_node *dn, struct iommu_table *tbl) @@ -471,8 +547,13 @@ static void iommu_table_setparms(struct pci_controller *phb, const unsigned long *basep; const u32 *sizep; - node = phb->dn; + /* Test if we are going over 2GB of DMA space */ + if (phb->dma_window_base_cur + phb->dma_window_size > SZ_2G) { + udbg_printf("PCI_DMA: Unexpected number of IOAs under this PHB.\n"); + panic("PCI_DMA: Unexpected number of IOAs under this PHB.\n"); + } + node = phb->dn; basep = of_get_property(node, "linux,tce-base", NULL); sizep = of_get_property(node, "linux,tce-size", NULL); if (basep == NULL || sizep == NULL) { @@ -481,59 +562,17 @@ static void iommu_table_setparms(struct pci_controller *phb, return; } - tbl->it_base = (unsigned long)__va(*basep); + iommu_table_setparms_common(tbl, phb->bus->number, 0, phb->dma_window_base_cur, + phb->dma_window_size, IOMMU_PAGE_SHIFT_4K, + __va(*basep), &iommu_table_pseries_ops); if (!is_kdump_kernel()) memset((void *)tbl->it_base, 0, *sizep); - tbl->it_busno = phb->bus->number; - tbl->it_page_shift = IOMMU_PAGE_SHIFT_4K; - - /* Units of tce entries */ - tbl->it_offset = phb->dma_window_base_cur >> tbl->it_page_shift; - - /* Test if we are going over 2GB of DMA space */ - if (phb->dma_window_base_cur + phb->dma_window_size > 0x80000000ul) { - udbg_printf("PCI_DMA: Unexpected number of IOAs under this PHB.\n"); - panic("PCI_DMA: Unexpected number of IOAs under this PHB.\n"); - } - phb->dma_window_base_cur += phb->dma_window_size; - - /* Set the tce table size - measured in entries */ - tbl->it_size = phb->dma_window_size >> tbl->it_page_shift; - - tbl->it_index = 0; - tbl->it_blocksize = 16; - tbl->it_type = TCE_PCI; } -/* - * iommu_table_setparms_lpar - * - * Function: On pSeries LPAR systems, return TCE table info, given a pci bus. - */ -static void iommu_table_setparms_lpar(struct pci_controller *phb, - struct device_node *dn, - struct iommu_table *tbl, - struct iommu_table_group *table_group, - const __be32 *dma_window) -{ - unsigned long offset, size; - - of_parse_dma_window(dn, dma_window, &tbl->it_index, &offset, &size); - - tbl->it_busno = phb->bus->number; - tbl->it_page_shift = IOMMU_PAGE_SHIFT_4K; - tbl->it_base = 0; - tbl->it_blocksize = 16; - tbl->it_type = TCE_PCI; - tbl->it_offset = offset >> tbl->it_page_shift; - tbl->it_size = size >> tbl->it_page_shift; - - table_group->tce32_start = offset; - table_group->tce32_size = size; -} +struct iommu_table_ops iommu_table_lpar_multi_ops; struct iommu_table_ops iommu_table_pseries_ops = { .set = tce_build_pSeries, @@ -609,8 +648,9 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus) tbl = pci->table_group->tables[0]; iommu_table_setparms(pci->phb, dn, tbl); - tbl->it_ops = &iommu_table_pseries_ops; - iommu_init_table(tbl, pci->phb->node, 0, 0); + + if (!iommu_init_table(tbl, pci->phb->node, 0, 0)) + panic("Failed to initialize iommu table"); /* Divide the rest (1.75GB) among the children */ pci->phb->dma_window_size = 0x80000000ul; @@ -622,8 +662,7 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus) #ifdef CONFIG_IOMMU_API static int tce_exchange_pseries(struct iommu_table *tbl, long index, unsigned - long *tce, enum dma_data_direction *direction, - bool realmode) + long *tce, enum dma_data_direction *direction) { long rc; unsigned long ioba = (unsigned long) index << tbl->it_page_shift; @@ -657,29 +696,97 @@ struct iommu_table_ops iommu_table_lpar_multi_ops = { .get = tce_get_pSeriesLP }; +/* + * Find nearest ibm,dma-window (default DMA window) or direct DMA window or + * dynamic 64bit DMA window, walking up the device tree. + */ +static struct device_node *pci_dma_find(struct device_node *dn, + struct dynamic_dma_window_prop *prop) +{ + const __be32 *default_prop = NULL; + const __be32 *ddw_prop = NULL; + struct device_node *rdn = NULL; + bool default_win = false, ddw_win = false; + + for ( ; dn && PCI_DN(dn); dn = dn->parent) { + default_prop = of_get_property(dn, "ibm,dma-window", NULL); + if (default_prop) { + rdn = dn; + default_win = true; + } + ddw_prop = of_get_property(dn, DIRECT64_PROPNAME, NULL); + if (ddw_prop) { + rdn = dn; + ddw_win = true; + break; + } + ddw_prop = of_get_property(dn, DMA64_PROPNAME, NULL); + if (ddw_prop) { + rdn = dn; + ddw_win = true; + break; + } + + /* At least found default window, which is the case for normal boot */ + if (default_win) + break; + } + + /* For PCI devices there will always be a DMA window, either on the device + * or parent bus + */ + WARN_ON(!(default_win | ddw_win)); + + /* caller doesn't want to get DMA window property */ + if (!prop) + return rdn; + + /* parse DMA window property. During normal system boot, only default + * DMA window is passed in OF. But, for kdump, a dedicated adapter might + * have both default and DDW in FDT. In this scenario, DDW takes precedence + * over default window. + */ + if (ddw_win) { + struct dynamic_dma_window_prop *p; + + p = (struct dynamic_dma_window_prop *)ddw_prop; + prop->liobn = p->liobn; + prop->dma_base = p->dma_base; + prop->tce_shift = p->tce_shift; + prop->window_shift = p->window_shift; + } else if (default_win) { + unsigned long offset, size, liobn; + + of_parse_dma_window(rdn, default_prop, &liobn, &offset, &size); + + prop->liobn = cpu_to_be32((u32)liobn); + prop->dma_base = cpu_to_be64(offset); + prop->tce_shift = cpu_to_be32(IOMMU_PAGE_SHIFT_4K); + prop->window_shift = cpu_to_be32(order_base_2(size)); + } + + return rdn; +} + static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus) { struct iommu_table *tbl; struct device_node *dn, *pdn; struct pci_dn *ppci; - const __be32 *dma_window = NULL; + struct dynamic_dma_window_prop prop; dn = pci_bus_to_OF_node(bus); pr_debug("pci_dma_bus_setup_pSeriesLP: setting up bus %pOF\n", dn); - /* Find nearest ibm,dma-window, walking up the device tree */ - for (pdn = dn; pdn != NULL; pdn = pdn->parent) { - dma_window = of_get_property(pdn, "ibm,dma-window", NULL); - if (dma_window != NULL) - break; - } + pdn = pci_dma_find(dn, &prop); - if (dma_window == NULL) { - pr_debug(" no ibm,dma-window property !\n"); - return; - } + /* In PPC architecture, there will always be DMA window on bus or one of the + * parent bus. During reboot, there will be ibm,dma-window property to + * define DMA window. For kdump, there will at least be default window or DDW + * or both. + */ ppci = PCI_DN(pdn); @@ -689,10 +796,24 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus) if (!ppci->table_group) { ppci->table_group = iommu_pseries_alloc_group(ppci->phb->node); tbl = ppci->table_group->tables[0]; - iommu_table_setparms_lpar(ppci->phb, pdn, tbl, - ppci->table_group, dma_window); - tbl->it_ops = &iommu_table_lpar_multi_ops; - iommu_init_table(tbl, ppci->phb->node, 0, 0); + + iommu_table_setparms_common(tbl, ppci->phb->bus->number, + be32_to_cpu(prop.liobn), + be64_to_cpu(prop.dma_base), + 1ULL << be32_to_cpu(prop.window_shift), + be32_to_cpu(prop.tce_shift), NULL, + &iommu_table_lpar_multi_ops); + + /* Only for normal boot with default window. Doesn't matter even + * if we set these with DDW which is 64bit during kdump, since + * these will not be used during kdump. + */ + ppci->table_group->tce32_start = be64_to_cpu(prop.dma_base); + ppci->table_group->tce32_size = 1 << be32_to_cpu(prop.window_shift); + + if (!iommu_init_table(tbl, ppci->phb->node, 0, 0)) + panic("Failed to initialize iommu table"); + iommu_register_group(ppci->table_group, pci_domain_nr(bus), 0); pr_debug(" created table: %p\n", ppci->table_group); @@ -720,8 +841,10 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev) PCI_DN(dn)->table_group = iommu_pseries_alloc_group(phb->node); tbl = PCI_DN(dn)->table_group->tables[0]; iommu_table_setparms(phb, dn, tbl); - tbl->it_ops = &iommu_table_pseries_ops; - iommu_init_table(tbl, phb->node, 0, 0); + + if (!iommu_init_table(tbl, phb->node, 0, 0)) + panic("Failed to initialize iommu table"); + set_iommu_table_base(&dev->dev, tbl); return; } @@ -753,28 +876,10 @@ static int __init disable_ddw_setup(char *str) early_param("disable_ddw", disable_ddw_setup); -static void remove_ddw(struct device_node *np, bool remove_prop) +static void clean_dma_window(struct device_node *np, struct dynamic_dma_window_prop *dwp) { - struct dynamic_dma_window_prop *dwp; - struct property *win64; - u32 ddw_avail[3]; - u64 liobn; - int ret = 0; - - ret = of_property_read_u32_array(np, "ibm,ddw-applicable", - &ddw_avail[0], 3); - - win64 = of_find_property(np, DIRECT64_PROPNAME, NULL); - if (!win64) - return; - - if (ret || win64->length < sizeof(*dwp)) - goto delprop; - - dwp = win64->value; - liobn = (u64)be32_to_cpu(dwp->liobn); + int ret; - /* clear the whole window, note the arg is in kernel pages */ ret = tce_clearrange_multi_pSeriesLP(0, 1ULL << (be32_to_cpu(dwp->window_shift) - PAGE_SHIFT), dwp); if (ret) @@ -783,86 +888,208 @@ static void remove_ddw(struct device_node *np, bool remove_prop) else pr_debug("%pOF successfully cleared tces in window.\n", np); +} + +/* + * Call only if DMA window is clean. + */ +static void __remove_dma_window(struct device_node *np, u32 *ddw_avail, u64 liobn) +{ + int ret; - ret = rtas_call(ddw_avail[2], 1, 1, NULL, liobn); + ret = rtas_call(ddw_avail[DDW_REMOVE_PE_DMA_WIN], 1, 1, NULL, liobn); if (ret) - pr_warn("%pOF: failed to remove direct window: rtas returned " + pr_warn("%pOF: failed to remove DMA window: rtas returned " "%d to ibm,remove-pe-dma-window(%x) %llx\n", - np, ret, ddw_avail[2], liobn); + np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn); else - pr_debug("%pOF: successfully removed direct window: rtas returned " + pr_debug("%pOF: successfully removed DMA window: rtas returned " "%d to ibm,remove-pe-dma-window(%x) %llx\n", - np, ret, ddw_avail[2], liobn); + np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn); +} + +static void remove_dma_window(struct device_node *np, u32 *ddw_avail, + struct property *win) +{ + struct dynamic_dma_window_prop *dwp; + u64 liobn; -delprop: - if (remove_prop) - ret = of_remove_property(np, win64); + dwp = win->value; + liobn = (u64)be32_to_cpu(dwp->liobn); + + clean_dma_window(np, dwp); + __remove_dma_window(np, ddw_avail, liobn); +} + +static int remove_ddw(struct device_node *np, bool remove_prop, const char *win_name) +{ + struct property *win; + u32 ddw_avail[DDW_APPLICABLE_SIZE]; + int ret = 0; + + win = of_find_property(np, win_name, NULL); + if (!win) + return -EINVAL; + + ret = of_property_read_u32_array(np, "ibm,ddw-applicable", + &ddw_avail[0], DDW_APPLICABLE_SIZE); + if (ret) + return 0; + + + if (win->length >= sizeof(struct dynamic_dma_window_prop)) + remove_dma_window(np, ddw_avail, win); + + if (!remove_prop) + return 0; + + ret = of_remove_property(np, win); if (ret) - pr_warn("%pOF: failed to remove direct window property: %d\n", + pr_warn("%pOF: failed to remove DMA window property: %d\n", np, ret); + return 0; } -static u64 find_existing_ddw(struct device_node *pdn) +static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr, int *window_shift, + bool *direct_mapping) { - struct direct_window *window; - const struct dynamic_dma_window_prop *direct64; - u64 dma_addr = 0; + struct dma_win *window; + const struct dynamic_dma_window_prop *dma64; + bool found = false; - spin_lock(&direct_window_list_lock); + spin_lock(&dma_win_list_lock); /* check if we already created a window and dupe that config if so */ - list_for_each_entry(window, &direct_window_list, list) { + list_for_each_entry(window, &dma_win_list, list) { if (window->device == pdn) { - direct64 = window->prop; - dma_addr = be64_to_cpu(direct64->dma_base); + dma64 = window->prop; + *dma_addr = be64_to_cpu(dma64->dma_base); + *window_shift = be32_to_cpu(dma64->window_shift); + *direct_mapping = window->direct; + found = true; break; } } - spin_unlock(&direct_window_list_lock); + spin_unlock(&dma_win_list_lock); - return dma_addr; + return found; } -static int find_existing_ddw_windows(void) +static struct dma_win *ddw_list_new_entry(struct device_node *pdn, + const struct dynamic_dma_window_prop *dma64) +{ + struct dma_win *window; + + window = kzalloc(sizeof(*window), GFP_KERNEL); + if (!window) + return NULL; + + window->device = pdn; + window->prop = dma64; + window->direct = false; + + return window; +} + +static void find_existing_ddw_windows_named(const char *name) { int len; struct device_node *pdn; - struct direct_window *window; - const struct dynamic_dma_window_prop *direct64; - - if (!firmware_has_feature(FW_FEATURE_LPAR)) - return 0; + struct dma_win *window; + const struct dynamic_dma_window_prop *dma64; - for_each_node_with_property(pdn, DIRECT64_PROPNAME) { - direct64 = of_get_property(pdn, DIRECT64_PROPNAME, &len); - if (!direct64) + for_each_node_with_property(pdn, name) { + dma64 = of_get_property(pdn, name, &len); + if (!dma64 || len < sizeof(*dma64)) { + remove_ddw(pdn, true, name); continue; + } - window = kzalloc(sizeof(*window), GFP_KERNEL); - if (!window || len < sizeof(struct dynamic_dma_window_prop)) { - kfree(window); - remove_ddw(pdn, true); - continue; + /* If at the time of system initialization, there are DDWs in OF, + * it means this is during kexec. DDW could be direct or dynamic. + * We will just mark DDWs as "dynamic" since this is kdump path, + * no need to worry about perforance. ddw_list_new_entry() will + * set window->direct = false. + */ + window = ddw_list_new_entry(pdn, dma64); + if (!window) { + of_node_put(pdn); + break; } - window->device = pdn; - window->prop = direct64; - spin_lock(&direct_window_list_lock); - list_add(&window->list, &direct_window_list); - spin_unlock(&direct_window_list_lock); + spin_lock(&dma_win_list_lock); + list_add(&window->list, &dma_win_list); + spin_unlock(&dma_win_list_lock); } +} + +static int find_existing_ddw_windows(void) +{ + if (!firmware_has_feature(FW_FEATURE_LPAR)) + return 0; + + find_existing_ddw_windows_named(DIRECT64_PROPNAME); + find_existing_ddw_windows_named(DMA64_PROPNAME); return 0; } machine_arch_initcall(pseries, find_existing_ddw_windows); +/** + * ddw_read_ext - Get the value of an DDW extension + * @np: device node from which the extension value is to be read. + * @extnum: index number of the extension. + * @value: pointer to return value, modified when extension is available. + * + * Checks if "ibm,ddw-extensions" exists for this node, and get the value + * on index 'extnum'. + * It can be used only to check if a property exists, passing value == NULL. + * + * Returns: + * 0 if extension successfully read + * -EINVAL if the "ibm,ddw-extensions" does not exist, + * -ENODATA if "ibm,ddw-extensions" does not have a value, and + * -EOVERFLOW if "ibm,ddw-extensions" does not contain this extension. + */ +static inline int ddw_read_ext(const struct device_node *np, int extnum, + u32 *value) +{ + static const char propname[] = "ibm,ddw-extensions"; + u32 count; + int ret; + + ret = of_property_read_u32_index(np, propname, DDW_EXT_SIZE, &count); + if (ret) + return ret; + + if (count < extnum) + return -EOVERFLOW; + + if (!value) + value = &count; + + return of_property_read_u32_index(np, propname, extnum, value); +} + static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail, - struct ddw_query_response *query) + struct ddw_query_response *query, + struct device_node *parent) { struct device_node *dn; struct pci_dn *pdn; - u32 cfg_addr; + u32 cfg_addr, ext_query, query_out[5]; u64 buid; - int ret; + int ret, out_sz; + + /* + * From LoPAR level 2.8, "ibm,ddw-extensions" index 3 can rule how many + * output parameters ibm,query-pe-dma-windows will have, ranging from + * 5 to 6. + */ + ret = ddw_read_ext(parent, DDW_EXT_QUERY_OUT_SIZE, &ext_query); + if (!ret && ext_query == 1) + out_sz = 6; + else + out_sz = 5; /* * Get the config address and phb buid of the PE window. @@ -875,11 +1102,30 @@ static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail, buid = pdn->phb->buid; cfg_addr = ((pdn->busno << 16) | (pdn->devfn << 8)); - ret = rtas_call(ddw_avail[0], 3, 5, (u32 *)query, - cfg_addr, BUID_HI(buid), BUID_LO(buid)); - dev_info(&dev->dev, "ibm,query-pe-dma-windows(%x) %x %x %x" - " returned %d\n", ddw_avail[0], cfg_addr, BUID_HI(buid), - BUID_LO(buid), ret); + ret = rtas_call(ddw_avail[DDW_QUERY_PE_DMA_WIN], 3, out_sz, query_out, + cfg_addr, BUID_HI(buid), BUID_LO(buid)); + + switch (out_sz) { + case 5: + query->windows_available = query_out[0]; + query->largest_available_block = query_out[1]; + query->page_size = query_out[2]; + query->migration_capable = query_out[3]; + break; + case 6: + query->windows_available = query_out[0]; + query->largest_available_block = ((u64)query_out[1] << 32) | + query_out[2]; + query->page_size = query_out[3]; + query->migration_capable = query_out[4]; + break; + } + + dev_info(&dev->dev, "ibm,query-pe-dma-windows(%x) %x %x %x returned %d, lb=%llx ps=%x wn=%d\n", + ddw_avail[DDW_QUERY_PE_DMA_WIN], cfg_addr, BUID_HI(buid), + BUID_LO(buid), ret, query->largest_available_block, + query->page_size, query->windows_available); + return ret; } @@ -906,15 +1152,16 @@ static int create_ddw(struct pci_dev *dev, const u32 *ddw_avail, do { /* extra outputs are LIOBN and dma-addr (hi, lo) */ - ret = rtas_call(ddw_avail[1], 5, 4, (u32 *)create, - cfg_addr, BUID_HI(buid), BUID_LO(buid), - page_shift, window_shift); + ret = rtas_call(ddw_avail[DDW_CREATE_PE_DMA_WIN], 5, 4, + (u32 *)create, cfg_addr, BUID_HI(buid), + BUID_LO(buid), page_shift, window_shift); } while (rtas_busy_delay(ret)); dev_info(&dev->dev, "ibm,create-pe-dma-window(%x) %x %x %x %x %x returned %d " - "(liobn = 0x%x starting addr = %x %x)\n", ddw_avail[1], - cfg_addr, BUID_HI(buid), BUID_LO(buid), page_shift, - window_shift, ret, create->liobn, create->addr_hi, create->addr_lo); + "(liobn = 0x%x starting addr = %x %x)\n", + ddw_avail[DDW_CREATE_PE_DMA_WIN], cfg_addr, BUID_HI(buid), + BUID_LO(buid), page_shift, window_shift, ret, create->liobn, + create->addr_hi, create->addr_lo); return ret; } @@ -928,30 +1175,107 @@ static LIST_HEAD(failed_ddw_pdn_list); static phys_addr_t ddw_memory_hotplug_max(void) { - phys_addr_t max_addr = memory_hotplug_max(); + resource_size_t max_addr = memory_hotplug_max(); struct device_node *memory; for_each_node_by_type(memory, "memory") { - unsigned long start, size; - int n_mem_addr_cells, n_mem_size_cells, len; - const __be32 *memcell_buf; + struct resource res; - memcell_buf = of_get_property(memory, "reg", &len); - if (!memcell_buf || len <= 0) + if (of_address_to_resource(memory, 0, &res)) continue; - n_mem_addr_cells = of_n_addr_cells(memory); - n_mem_size_cells = of_n_size_cells(memory); + max_addr = max_t(resource_size_t, max_addr, res.end + 1); + } + + return max_addr; +} + +/* + * Platforms supporting the DDW option starting with LoPAR level 2.7 implement + * ibm,ddw-extensions, which carries the rtas token for + * ibm,reset-pe-dma-windows. + * That rtas-call can be used to restore the default DMA window for the device. + */ +static void reset_dma_window(struct pci_dev *dev, struct device_node *par_dn) +{ + int ret; + u32 cfg_addr, reset_dma_win; + u64 buid; + struct device_node *dn; + struct pci_dn *pdn; + + ret = ddw_read_ext(par_dn, DDW_EXT_RESET_DMA_WIN, &reset_dma_win); + if (ret) + return; + + dn = pci_device_to_OF_node(dev); + pdn = PCI_DN(dn); + buid = pdn->phb->buid; + cfg_addr = (pdn->busno << 16) | (pdn->devfn << 8); + + ret = rtas_call(reset_dma_win, 3, 1, NULL, cfg_addr, BUID_HI(buid), + BUID_LO(buid)); + if (ret) + dev_info(&dev->dev, + "ibm,reset-pe-dma-windows(%x) %x %x %x returned %d ", + reset_dma_win, cfg_addr, BUID_HI(buid), BUID_LO(buid), + ret); +} + +/* Return largest page shift based on "IO Page Sizes" output of ibm,query-pe-dma-window. */ +static int iommu_get_page_shift(u32 query_page_size) +{ + /* Supported IO page-sizes according to LoPAR, note that 2M is out of order */ + const int shift[] = { + __builtin_ctzll(SZ_4K), __builtin_ctzll(SZ_64K), __builtin_ctzll(SZ_16M), + __builtin_ctzll(SZ_32M), __builtin_ctzll(SZ_64M), __builtin_ctzll(SZ_128M), + __builtin_ctzll(SZ_256M), __builtin_ctzll(SZ_16G), __builtin_ctzll(SZ_2M) + }; + + int i = ARRAY_SIZE(shift) - 1; + int ret = 0; - start = of_read_number(memcell_buf, n_mem_addr_cells); - memcell_buf += n_mem_addr_cells; - size = of_read_number(memcell_buf, n_mem_size_cells); - memcell_buf += n_mem_size_cells; + /* + * On LoPAR, ibm,query-pe-dma-window outputs "IO Page Sizes" using a bit field: + * - bit 31 means 4k pages are supported, + * - bit 30 means 64k pages are supported, and so on. + * Larger pagesizes map more memory with the same amount of TCEs, so start probing them. + */ + for (; i >= 0 ; i--) { + if (query_page_size & (1 << i)) + ret = max(ret, shift[i]); + } + + return ret; +} - max_addr = max_t(phys_addr_t, max_addr, start + size); +static struct property *ddw_property_create(const char *propname, u32 liobn, u64 dma_addr, + u32 page_shift, u32 window_shift) +{ + struct dynamic_dma_window_prop *ddwprop; + struct property *win64; + + win64 = kzalloc(sizeof(*win64), GFP_KERNEL); + if (!win64) + return NULL; + + win64->name = kstrdup(propname, GFP_KERNEL); + ddwprop = kzalloc(sizeof(*ddwprop), GFP_KERNEL); + win64->value = ddwprop; + win64->length = sizeof(*ddwprop); + if (!win64->name || !win64->value) { + kfree(win64->name); + kfree(win64->value); + kfree(win64); + return NULL; } - return max_addr; + ddwprop->liobn = cpu_to_be32(liobn); + ddwprop->dma_base = cpu_to_be64(dma_addr); + ddwprop->tce_shift = cpu_to_be32(page_shift); + ddwprop->window_shift = cpu_to_be32(window_shift); + + return win64; } /* @@ -963,26 +1287,34 @@ static phys_addr_t ddw_memory_hotplug_max(void) * pdn: the parent pe node with the ibm,dma_window property * Future: also check if we can remap the base window for our base page size * - * returns the dma offset for use by the direct mapped DMA code. + * returns true if can map all pages (direct mapping), false otherwise.. */ -static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) +static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn) { - int len, ret; + int len = 0, ret; + int max_ram_len = order_base_2(ddw_memory_hotplug_max()); struct ddw_query_response query; struct ddw_create_response create; int page_shift; - u64 dma_addr, max_addr; + u64 win_addr; + const char *win_name; struct device_node *dn; - u32 ddw_avail[3]; - struct direct_window *window; + u32 ddw_avail[DDW_APPLICABLE_SIZE]; + struct dma_win *window; struct property *win64; - struct dynamic_dma_window_prop *ddwprop; struct failed_ddw_pdn *fpdn; + bool default_win_removed = false, direct_mapping = false; + bool pmem_present; + struct pci_dn *pci = PCI_DN(pdn); + struct property *default_win = NULL; + + dn = of_find_node_by_type(NULL, "ibm,pmemory"); + pmem_present = dn != NULL; + of_node_put(dn); - mutex_lock(&direct_window_init_mutex); + mutex_lock(&dma_win_init_mutex); - dma_addr = find_existing_ddw(pdn); - if (dma_addr != 0) + if (find_existing_ddw(pdn, &dev->dev.archdata.dma_offset, &len, &direct_mapping)) goto out_unlock; /* @@ -1006,7 +1338,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) * the property is actually in the parent, not the PE */ ret = of_property_read_u32_array(pdn, "ibm,ddw-applicable", - &ddw_avail[0], 3); + &ddw_avail[0], DDW_APPLICABLE_SIZE); if (ret) goto out_failed; @@ -1017,108 +1349,198 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) * of page sizes: supported and supported for migrate-dma. */ dn = pci_device_to_OF_node(dev); - ret = query_ddw(dev, ddw_avail, &query); + ret = query_ddw(dev, ddw_avail, &query, pdn); if (ret != 0) goto out_failed; + /* + * If there is no window available, remove the default DMA window, + * if it's present. This will make all the resources available to the + * new DDW window. + * If anything fails after this, we need to restore it, so also check + * for extensions presence. + */ if (query.windows_available == 0) { - /* - * no additional windows are available for this device. - * We might be able to reallocate the existing window, - * trading in for a larger page size. - */ - dev_dbg(&dev->dev, "no free dynamic windows"); - goto out_failed; - } - if (query.page_size & 4) { - page_shift = 24; /* 16MB */ - } else if (query.page_size & 2) { - page_shift = 16; /* 64kB */ - } else if (query.page_size & 1) { - page_shift = 12; /* 4kB */ - } else { - dev_dbg(&dev->dev, "no supported direct page size in mask %x", - query.page_size); - goto out_failed; + int reset_win_ext; + + /* DDW + IOMMU on single window may fail if there is any allocation */ + if (iommu_table_in_use(pci->table_group->tables[0])) { + dev_warn(&dev->dev, "current IOMMU table in use, can't be replaced.\n"); + goto out_failed; + } + + default_win = of_find_property(pdn, "ibm,dma-window", NULL); + if (!default_win) + goto out_failed; + + reset_win_ext = ddw_read_ext(pdn, DDW_EXT_RESET_DMA_WIN, NULL); + if (reset_win_ext) + goto out_failed; + + remove_dma_window(pdn, ddw_avail, default_win); + default_win_removed = true; + + /* Query again, to check if the window is available */ + ret = query_ddw(dev, ddw_avail, &query, pdn); + if (ret != 0) + goto out_failed; + + if (query.windows_available == 0) { + /* no windows are available for this device. */ + dev_dbg(&dev->dev, "no free dynamic windows"); + goto out_failed; + } } - /* verify the window * number of ptes will map the partition */ - /* check largest block * page size > max memory hotplug addr */ - max_addr = ddw_memory_hotplug_max(); - if (query.largest_available_block < (max_addr >> page_shift)) { - dev_dbg(&dev->dev, "can't map partition max 0x%llx with %u " - "%llu-sized pages\n", max_addr, query.largest_available_block, - 1ULL << page_shift); + + page_shift = iommu_get_page_shift(query.page_size); + if (!page_shift) { + dev_dbg(&dev->dev, "no supported page size in mask %x", + query.page_size); goto out_failed; } - len = order_base_2(max_addr); - win64 = kzalloc(sizeof(struct property), GFP_KERNEL); - if (!win64) { - dev_info(&dev->dev, - "couldn't allocate property for 64bit dma window\n"); - goto out_failed; + + + /* + * The "ibm,pmemory" can appear anywhere in the address space. + * Assuming it is still backed by page structs, try MAX_PHYSMEM_BITS + * for the upper limit and fallback to max RAM otherwise but this + * disables device::dma_ops_bypass. + */ + len = max_ram_len; + if (pmem_present) { + if (query.largest_available_block >= + (1ULL << (MAX_PHYSMEM_BITS - page_shift))) + len = MAX_PHYSMEM_BITS; + else + dev_info(&dev->dev, "Skipping ibm,pmemory"); } - win64->name = kstrdup(DIRECT64_PROPNAME, GFP_KERNEL); - win64->value = ddwprop = kmalloc(sizeof(*ddwprop), GFP_KERNEL); - win64->length = sizeof(*ddwprop); - if (!win64->name || !win64->value) { - dev_info(&dev->dev, - "couldn't allocate property name and value\n"); - goto out_free_prop; + + /* check if the available block * number of ptes will map everything */ + if (query.largest_available_block < (1ULL << (len - page_shift))) { + dev_dbg(&dev->dev, + "can't map partition max 0x%llx with %llu %llu-sized pages\n", + 1ULL << len, + query.largest_available_block, + 1ULL << page_shift); + + len = order_base_2(query.largest_available_block << page_shift); + win_name = DMA64_PROPNAME; + } else { + direct_mapping = !default_win_removed || + (len == MAX_PHYSMEM_BITS) || + (!pmem_present && (len == max_ram_len)); + win_name = direct_mapping ? DIRECT64_PROPNAME : DMA64_PROPNAME; } ret = create_ddw(dev, ddw_avail, &create, page_shift, len); if (ret != 0) - goto out_free_prop; - - ddwprop->liobn = cpu_to_be32(create.liobn); - ddwprop->dma_base = cpu_to_be64(((u64)create.addr_hi << 32) | - create.addr_lo); - ddwprop->tce_shift = cpu_to_be32(page_shift); - ddwprop->window_shift = cpu_to_be32(len); + goto out_failed; dev_dbg(&dev->dev, "created tce table LIOBN 0x%x for %pOF\n", create.liobn, dn); - window = kzalloc(sizeof(*window), GFP_KERNEL); - if (!window) - goto out_clear_window; + win_addr = ((u64)create.addr_hi << 32) | create.addr_lo; + win64 = ddw_property_create(win_name, create.liobn, win_addr, page_shift, len); - ret = walk_system_ram_range(0, memblock_end_of_DRAM() >> PAGE_SHIFT, - win64->value, tce_setrange_multi_pSeriesLP_walk); - if (ret) { - dev_info(&dev->dev, "failed to map direct window for %pOF: %d\n", - dn, ret); - goto out_free_window; + if (!win64) { + dev_info(&dev->dev, + "couldn't allocate property, property name, or value\n"); + goto out_remove_win; } ret = of_add_property(pdn, win64); if (ret) { - dev_err(&dev->dev, "unable to add dma window property for %pOF: %d", - pdn, ret); - goto out_free_window; + dev_err(&dev->dev, "unable to add DMA window property for %pOF: %d", + pdn, ret); + goto out_free_prop; } - window->device = pdn; - window->prop = ddwprop; - spin_lock(&direct_window_list_lock); - list_add(&window->list, &direct_window_list); - spin_unlock(&direct_window_list_lock); + window = ddw_list_new_entry(pdn, win64->value); + if (!window) + goto out_del_prop; + + if (direct_mapping) { + window->direct = true; + + /* DDW maps the whole partition, so enable direct DMA mapping */ + ret = walk_system_ram_range(0, memblock_end_of_DRAM() >> PAGE_SHIFT, + win64->value, tce_setrange_multi_pSeriesLP_walk); + if (ret) { + dev_info(&dev->dev, "failed to map DMA window for %pOF: %d\n", + dn, ret); + + /* Make sure to clean DDW if any TCE was set*/ + clean_dma_window(pdn, win64->value); + goto out_del_list; + } + } else { + struct iommu_table *newtbl; + int i; + unsigned long start = 0, end = 0; + + window->direct = false; + + for (i = 0; i < ARRAY_SIZE(pci->phb->mem_resources); i++) { + const unsigned long mask = IORESOURCE_MEM_64 | IORESOURCE_MEM; + + /* Look for MMIO32 */ + if ((pci->phb->mem_resources[i].flags & mask) == IORESOURCE_MEM) { + start = pci->phb->mem_resources[i].start; + end = pci->phb->mem_resources[i].end; + break; + } + } + + /* New table for using DDW instead of the default DMA window */ + newtbl = iommu_pseries_alloc_table(pci->phb->node); + if (!newtbl) { + dev_dbg(&dev->dev, "couldn't create new IOMMU table\n"); + goto out_del_list; + } + + iommu_table_setparms_common(newtbl, pci->phb->bus->number, create.liobn, win_addr, + 1UL << len, page_shift, NULL, &iommu_table_lpar_multi_ops); + iommu_init_table(newtbl, pci->phb->node, start, end); + + pci->table_group->tables[1] = newtbl; - dma_addr = be64_to_cpu(ddwprop->dma_base); + set_iommu_table_base(&dev->dev, newtbl); + } + + if (default_win_removed) { + iommu_tce_table_put(pci->table_group->tables[0]); + pci->table_group->tables[0] = NULL; + + /* default_win is valid here because default_win_removed == true */ + of_remove_property(pdn, default_win); + dev_info(&dev->dev, "Removed default DMA window for %pOF\n", pdn); + } + + spin_lock(&dma_win_list_lock); + list_add(&window->list, &dma_win_list); + spin_unlock(&dma_win_list_lock); + + dev->dev.archdata.dma_offset = win_addr; goto out_unlock; -out_free_window: +out_del_list: kfree(window); -out_clear_window: - remove_ddw(pdn, true); +out_del_prop: + of_remove_property(pdn, win64); out_free_prop: kfree(win64->name); kfree(win64->value); kfree(win64); +out_remove_win: + /* DDW is clean, so it's ok to call this directly. */ + __remove_dma_window(pdn, ddw_avail, create.liobn); + out_failed: + if (default_win_removed) + reset_dma_window(dev, pdn); fpdn = kzalloc(sizeof(*fpdn), GFP_KERNEL); if (!fpdn) @@ -1127,16 +1549,25 @@ out_failed: list_add(&fpdn->list, &failed_ddw_pdn_list); out_unlock: - mutex_unlock(&direct_window_init_mutex); - return dma_addr; + mutex_unlock(&dma_win_init_mutex); + + /* + * If we have persistent memory and the window size is only as big + * as RAM, then we failed to create a window to cover persistent + * memory and need to set the DMA limit. + */ + if (pmem_present && direct_mapping && len == max_ram_len) + dev->dev.bus_dma_limit = dev->dev.archdata.dma_offset + (1ULL << len); + + return direct_mapping; } static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) { struct device_node *pdn, *dn; struct iommu_table *tbl; - const __be32 *dma_window = NULL; struct pci_dn *pci; + struct dynamic_dma_window_prop prop; pr_debug("pci_dma_dev_setup_pSeriesLP: %s\n", pci_name(dev)); @@ -1149,13 +1580,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) dn = pci_device_to_OF_node(dev); pr_debug(" node is %pOF\n", dn); - for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->table_group; - pdn = pdn->parent) { - dma_window = of_get_property(pdn, "ibm,dma-window", NULL); - if (dma_window) - break; - } - + pdn = pci_dma_find(dn, &prop); if (!pdn || !PCI_DN(pdn)) { printk(KERN_WARNING "pci_dma_dev_setup_pSeriesLP: " "no DMA window found for pci dev=%s dn=%pOF\n", @@ -1168,9 +1593,21 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) if (!pci->table_group) { pci->table_group = iommu_pseries_alloc_group(pci->phb->node); tbl = pci->table_group->tables[0]; - iommu_table_setparms_lpar(pci->phb, pdn, tbl, - pci->table_group, dma_window); - tbl->it_ops = &iommu_table_lpar_multi_ops; + + iommu_table_setparms_common(tbl, pci->phb->bus->number, + be32_to_cpu(prop.liobn), + be64_to_cpu(prop.dma_base), + 1ULL << be32_to_cpu(prop.window_shift), + be32_to_cpu(prop.tce_shift), NULL, + &iommu_table_lpar_multi_ops); + + /* Only for normal boot with default window. Doesn't matter even + * if we set these with DDW which is 64bit during kdump, since + * these will not be used during kdump. + */ + pci->table_group->tce32_start = be64_to_cpu(prop.dma_base); + pci->table_group->tce32_size = 1 << be32_to_cpu(prop.window_shift); + iommu_init_table(tbl, pci->phb->node, 0, 0); iommu_register_group(pci->table_group, pci_domain_nr(pci->phb->bus), 0); @@ -1186,7 +1623,6 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) static bool iommu_bypass_supported_pSeriesLP(struct pci_dev *pdev, u64 dma_mask) { struct device_node *dn = pci_device_to_OF_node(pdev), *pdn; - const __be32 *dma_window = NULL; /* only attempt to use a new window if 64-bit DMA is requested */ if (dma_mask < DMA_BIT_MASK(64)) @@ -1200,18 +1636,9 @@ static bool iommu_bypass_supported_pSeriesLP(struct pci_dev *pdev, u64 dma_mask) * search upwards in the tree until we either hit a dma-window * property, OR find a parent with a table already allocated. */ - for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->table_group; - pdn = pdn->parent) { - dma_window = of_get_property(pdn, "ibm,dma-window", NULL); - if (dma_window) - break; - } - - if (pdn && PCI_DN(pdn)) { - pdev->dev.archdata.dma_offset = enable_ddw(pdev, pdn); - if (pdev->dev.archdata.dma_offset) - return true; - } + pdn = pci_dma_find(dn, NULL); + if (pdn && PCI_DN(pdn)) + return enable_ddw(pdev, pdn); return false; } @@ -1219,29 +1646,33 @@ static bool iommu_bypass_supported_pSeriesLP(struct pci_dev *pdev, u64 dma_mask) static int iommu_mem_notifier(struct notifier_block *nb, unsigned long action, void *data) { - struct direct_window *window; + struct dma_win *window; struct memory_notify *arg = data; int ret = 0; switch (action) { case MEM_GOING_ONLINE: - spin_lock(&direct_window_list_lock); - list_for_each_entry(window, &direct_window_list, list) { - ret |= tce_setrange_multi_pSeriesLP(arg->start_pfn, - arg->nr_pages, window->prop); + spin_lock(&dma_win_list_lock); + list_for_each_entry(window, &dma_win_list, list) { + if (window->direct) { + ret |= tce_setrange_multi_pSeriesLP(arg->start_pfn, + arg->nr_pages, window->prop); + } /* XXX log error */ } - spin_unlock(&direct_window_list_lock); + spin_unlock(&dma_win_list_lock); break; case MEM_CANCEL_ONLINE: case MEM_OFFLINE: - spin_lock(&direct_window_list_lock); - list_for_each_entry(window, &direct_window_list, list) { - ret |= tce_clearrange_multi_pSeriesLP(arg->start_pfn, - arg->nr_pages, window->prop); + spin_lock(&dma_win_list_lock); + list_for_each_entry(window, &dma_win_list, list) { + if (window->direct) { + ret |= tce_clearrange_multi_pSeriesLP(arg->start_pfn, + arg->nr_pages, window->prop); + } /* XXX log error */ } - spin_unlock(&direct_window_list_lock); + spin_unlock(&dma_win_list_lock); break; default: break; @@ -1262,7 +1693,7 @@ static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long acti struct of_reconfig_data *rd = data; struct device_node *np = rd->dn; struct pci_dn *pci = PCI_DN(np); - struct direct_window *window; + struct dma_win *window; switch (action) { case OF_RECONFIG_DETACH_NODE: @@ -1273,20 +1704,22 @@ static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long acti * we have to remove the property when releasing * the device node. */ - remove_ddw(np, false); + if (remove_ddw(np, false, DIRECT64_PROPNAME)) + remove_ddw(np, false, DMA64_PROPNAME); + if (pci && pci->table_group) iommu_pseries_free_group(pci->table_group, np->full_name); - spin_lock(&direct_window_list_lock); - list_for_each_entry(window, &direct_window_list, list) { + spin_lock(&dma_win_list_lock); + list_for_each_entry(window, &dma_win_list, list) { if (window->device == np) { list_del(&window->list); kfree(window); break; } } - spin_unlock(&direct_window_list_lock); + spin_unlock(&dma_win_list_lock); break; default: err = NOTIFY_DONE; @@ -1300,7 +1733,7 @@ static struct notifier_block iommu_reconfig_nb = { }; /* These are called very early. */ -void iommu_init_early_pSeries(void) +void __init iommu_init_early_pSeries(void) { if (of_chosen && of_get_property(of_chosen, "linux,iommu-off", NULL)) return; @@ -1320,51 +1753,44 @@ void iommu_init_early_pSeries(void) of_reconfig_notifier_register(&iommu_reconfig_nb); register_memory_notifier(&iommu_mem_nb); - /* - * Secure guest memory is inacessible to devices so regular DMA isn't - * possible. - * - * In that case keep devices' dma_map_ops as NULL so that the generic - * DMA code path will use SWIOTLB to bounce buffers for DMA. - */ - if (!is_secure_guest()) - set_pci_dma_ops(&dma_iommu_ops); + set_pci_dma_ops(&dma_iommu_ops); } static int __init disable_multitce(char *str) { if (strcmp(str, "off") == 0 && firmware_has_feature(FW_FEATURE_LPAR) && - firmware_has_feature(FW_FEATURE_MULTITCE)) { + (firmware_has_feature(FW_FEATURE_PUT_TCE_IND) || + firmware_has_feature(FW_FEATURE_STUFF_TCE))) { printk(KERN_INFO "Disabling MULTITCE firmware feature\n"); - powerpc_firmware_features &= ~FW_FEATURE_MULTITCE; + powerpc_firmware_features &= + ~(FW_FEATURE_PUT_TCE_IND | FW_FEATURE_STUFF_TCE); } return 1; } __setup("multitce=", disable_multitce); -static int tce_iommu_bus_notifier(struct notifier_block *nb, - unsigned long action, void *data) +#ifdef CONFIG_SPAPR_TCE_IOMMU +struct iommu_group *pSeries_pci_device_group(struct pci_controller *hose, + struct pci_dev *pdev) { - struct device *dev = data; + struct device_node *pdn, *dn = pdev->dev.of_node; + struct iommu_group *grp; + struct pci_dn *pci; - switch (action) { - case BUS_NOTIFY_DEL_DEVICE: - iommu_del_device(dev); - return 0; - default: - return 0; - } -} + pdn = pci_dma_find(dn, NULL); + if (!pdn || !PCI_DN(pdn)) + return ERR_PTR(-ENODEV); -static struct notifier_block tce_iommu_bus_nb = { - .notifier_call = tce_iommu_bus_notifier, -}; + pci = PCI_DN(pdn); + if (!pci->table_group) + return ERR_PTR(-ENODEV); -static int __init tce_iommu_bus_notifier_init(void) -{ - bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb); - return 0; + grp = pci->table_group->group; + if (!grp) + return ERR_PTR(-ENODEV); + + return iommu_group_ref_get(grp); } -machine_subsys_initcall_sync(pseries, tce_iommu_bus_notifier_init); +#endif diff --git a/arch/powerpc/platforms/pseries/kexec.c b/arch/powerpc/platforms/pseries/kexec.c index 145fcfbc017f..096d09ed89f6 100644 --- a/arch/powerpc/platforms/pseries/kexec.c +++ b/arch/powerpc/platforms/pseries/kexec.c @@ -6,7 +6,7 @@ #include <linux/kernel.h> #include <linux/interrupt.h> -#include <asm/machdep.h> +#include <asm/setup.h> #include <asm/page.h> #include <asm/firmware.h> #include <asm/kexec.h> @@ -61,3 +61,11 @@ void pseries_kexec_cpu_down(int crash_shutdown, int secondary) } else xics_kexec_teardown_cpu(secondary); } + +void pseries_machine_kexec(struct kimage *image) +{ + if (firmware_has_feature(FW_FEATURE_SET_MODE)) + pseries_disable_reloc_on_exc(); + + default_machine_kexec(image); +} diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 60cb29ae4739..4e9916bb03d7 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -21,16 +21,18 @@ #include <linux/cpuhotplug.h> #include <linux/workqueue.h> #include <linux/proc_fs.h> +#include <linux/pgtable.h> +#include <linux/debugfs.h> + #include <asm/processor.h> #include <asm/mmu.h> #include <asm/page.h> -#include <asm/pgtable.h> -#include <asm/machdep.h> +#include <asm/setup.h> #include <asm/mmu_context.h> #include <asm/iommu.h> #include <asm/tlb.h> -#include <asm/prom.h> #include <asm/cputable.h> +#include <asm/papr-sysparm.h> #include <asm/udbg.h> #include <asm/smp.h> #include <asm/trace.h> @@ -38,8 +40,8 @@ #include <asm/plpar_wrappers.h> #include <asm/kexec.h> #include <asm/fadump.h> -#include <asm/asm-prototypes.h> -#include <asm/debugfs.h> +#include <asm/dtl.h> +#include <asm/vphn.h> #include "pseries.h" @@ -56,6 +58,7 @@ EXPORT_SYMBOL(plpar_hcall); EXPORT_SYMBOL(plpar_hcall9); EXPORT_SYMBOL(plpar_hcall_norets); +#ifdef CONFIG_PPC_64S_HASH_MMU /* * H_BLOCK_REMOVE supported block size for this page size in segment who's base * page size is that page size. @@ -64,6 +67,7 @@ EXPORT_SYMBOL(plpar_hcall_norets); * page size. */ static int hblkrm_size[MMU_PAGE_COUNT][MMU_PAGE_COUNT] __ro_after_init; +#endif /* * Due to the involved complexity, and that the current hypervisor is only @@ -188,9 +192,9 @@ static void free_dtl_buffers(unsigned long *time_limit) continue; kmem_cache_free(dtl_cache, pp->dispatch_log); pp->dtl_ridx = 0; - pp->dispatch_log = 0; - pp->dispatch_log_end = 0; - pp->dtl_curr = 0; + pp->dispatch_log = NULL; + pp->dispatch_log_end = NULL; + pp->dtl_curr = NULL; if (time_limit && time_after(jiffies, *time_limit)) { cond_resched(); @@ -219,7 +223,7 @@ static void destroy_cpu_associativity(void) { kfree(vcpu_associativity); kfree(pcpu_associativity); - vcpu_associativity = pcpu_associativity = 0; + vcpu_associativity = pcpu_associativity = NULL; } static __be32 *__get_cpu_associativity(int cpu, __be32 *cpu_assoc, int flag) @@ -260,7 +264,7 @@ static int cpu_relative_dispatch_distance(int last_disp_cpu, int cur_disp_cpu) if (!last_disp_cpu_assoc || !cur_disp_cpu_assoc) return -EIO; - return cpu_distance(last_disp_cpu_assoc, cur_disp_cpu_assoc); + return cpu_relative_distance(last_disp_cpu_assoc, cur_disp_cpu_assoc); } static int cpu_home_node_dispatch_distance(int disp_cpu) @@ -280,7 +284,7 @@ static int cpu_home_node_dispatch_distance(int disp_cpu) if (!disp_cpu_assoc || !vcpu_assoc) return -EIO; - return cpu_distance(disp_cpu_assoc, vcpu_assoc); + return cpu_relative_distance(disp_cpu_assoc, vcpu_assoc); } static void update_vcpu_disp_stat(int disp_cpu) @@ -522,8 +526,10 @@ static ssize_t vcpudispatch_stats_write(struct file *file, const char __user *p, if (cmd) { rc = init_cpu_associativity(); - if (rc) + if (rc) { + destroy_cpu_associativity(); goto out; + } for_each_possible_cpu(cpu) { disp = per_cpu_ptr(&vcpu_disp_data, cpu); @@ -582,12 +588,12 @@ static int vcpudispatch_stats_open(struct inode *inode, struct file *file) return single_open(file, vcpudispatch_stats_display, NULL); } -static const struct file_operations vcpudispatch_stats_proc_ops = { - .open = vcpudispatch_stats_open, - .read = seq_read, - .write = vcpudispatch_stats_write, - .llseek = seq_lseek, - .release = single_release, +static const struct proc_ops vcpudispatch_stats_proc_ops = { + .proc_open = vcpudispatch_stats_open, + .proc_read = seq_read, + .proc_write = vcpudispatch_stats_write, + .proc_lseek = seq_lseek, + .proc_release = single_release, }; static ssize_t vcpudispatch_stats_freq_write(struct file *file, @@ -626,17 +632,17 @@ static int vcpudispatch_stats_freq_open(struct inode *inode, struct file *file) return single_open(file, vcpudispatch_stats_freq_display, NULL); } -static const struct file_operations vcpudispatch_stats_freq_proc_ops = { - .open = vcpudispatch_stats_freq_open, - .read = seq_read, - .write = vcpudispatch_stats_freq_write, - .llseek = seq_lseek, - .release = single_release, +static const struct proc_ops vcpudispatch_stats_freq_proc_ops = { + .proc_open = vcpudispatch_stats_freq_open, + .proc_read = seq_read, + .proc_write = vcpudispatch_stats_freq_write, + .proc_lseek = seq_lseek, + .proc_release = single_release, }; static int __init vcpudispatch_stats_procfs_init(void) { - if (!lppaca_shared_proc(get_lppaca())) + if (!lppaca_shared_proc()) return 0; if (!proc_create("powerpc/vcpudispatch_stats", 0600, NULL, @@ -650,6 +656,21 @@ static int __init vcpudispatch_stats_procfs_init(void) } machine_device_initcall(pseries, vcpudispatch_stats_procfs_init); + +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING +u64 pseries_paravirt_steal_clock(int cpu) +{ + struct lppaca *lppaca = &lppaca_of(cpu); + + /* + * VPA steal time counters are reported at TB frequency. Hence do a + * conversion to ns before returning + */ + return tb_to_ns(be64_to_cpu(READ_ONCE(lppaca->enqueue_dispatch_tb)) + + be64_to_cpu(READ_ONCE(lppaca->ready_enqueue_tb))); +} +#endif + #endif /* CONFIG_PPC_SPLPAR */ void vpa_init(int cpu) @@ -679,7 +700,7 @@ void vpa_init(int cpu) return; } -#ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC_64S_HASH_MMU /* * PAPR says this feature is SLB-Buffer but firmware never * reports that. All SPLPAR support SLB shadow buffer. @@ -692,7 +713,7 @@ void vpa_init(int cpu) "cpu %d (hw %d) of area %lx failed with %ld\n", cpu, hwcpu, addr, ret); } -#endif /* CONFIG_PPC_BOOK3S_64 */ +#endif /* CONFIG_PPC_64S_HASH_MMU */ /* * Register dispatch trace log, if one has been allocated. @@ -702,6 +723,36 @@ void vpa_init(int cpu) #ifdef CONFIG_PPC_BOOK3S_64 +static int __init pseries_lpar_register_process_table(unsigned long base, + unsigned long page_size, unsigned long table_size) +{ + long rc; + unsigned long flags = 0; + + if (table_size) + flags |= PROC_TABLE_NEW; + if (radix_enabled()) { + flags |= PROC_TABLE_RADIX; + if (mmu_has_feature(MMU_FTR_GTSE)) + flags |= PROC_TABLE_GTSE; + } else + flags |= PROC_TABLE_HPT_SLB; + for (;;) { + rc = plpar_hcall_norets(H_REGISTER_PROC_TBL, flags, base, + page_size, table_size); + if (!H_IS_LONG_BUSY(rc)) + break; + mdelay(get_longbusy_msecs(rc)); + } + if (rc != H_SUCCESS) { + pr_err("Failed to register process table (rc=%ld)\n", rc); + BUG(); + } + return rc; +} + +#ifdef CONFIG_PPC_64S_HASH_MMU + static long pSeries_lpar_hpte_insert(unsigned long hpte_group, unsigned long vpn, unsigned long pa, unsigned long rflags, unsigned long vflags, @@ -792,7 +843,8 @@ static long pSeries_lpar_hpte_remove(unsigned long hpte_group) return -1; } -static void manual_hpte_clear_all(void) +/* Called during kexec sequence with MMU off */ +static notrace void manual_hpte_clear_all(void) { unsigned long size_bytes = 1UL << ppc64_pft_size; unsigned long hpte_count = size_bytes >> 4; @@ -825,7 +877,8 @@ static void manual_hpte_clear_all(void) } } -static int hcall_hpte_clear_all(void) +/* Called during kexec sequence with MMU off */ +static notrace int hcall_hpte_clear_all(void) { int rc; @@ -836,7 +889,8 @@ static int hcall_hpte_clear_all(void) return rc; } -static void pseries_hpte_clear_all(void) +/* Called during kexec sequence with MMU off */ +static notrace void pseries_hpte_clear_all(void) { int rc; @@ -878,7 +932,8 @@ static long pSeries_lpar_hpte_updatepp(unsigned long slot, want_v = hpte_encode_avpn(vpn, psize, ssize); - flags = (newpp & 7) | H_AVPN; + flags = (newpp & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO)) | H_AVPN; + flags |= (newpp & HPTE_R_KEY_HI) >> 48; if (mmu_has_feature(MMU_FTR_KERNEL_RO)) /* Move pp0 into bit 8 (IBM 55) */ flags |= (newpp & HPTE_R_PP0) >> 55; @@ -967,11 +1022,13 @@ static void pSeries_lpar_hpte_updateboltedpp(unsigned long newpp, slot = pSeries_lpar_hpte_find(vpn, psize, ssize); BUG_ON(slot == -1); - flags = newpp & 7; + flags = newpp & (HPTE_R_PP | HPTE_R_N); if (mmu_has_feature(MMU_FTR_KERNEL_RO)) /* Move pp0 into bit 8 (IBM 55) */ flags |= (newpp & HPTE_R_PP0) >> 55; + flags |= ((newpp & HPTE_R_KEY_HI) >> 48) | (newpp & HPTE_R_KEY_LO); + lpar_rc = plpar_pte_protect(flags, slot, 0); BUG_ON(lpar_rc != H_SUCCESS); @@ -1412,8 +1469,6 @@ static inline void __init check_lp_set_hblkrm(unsigned int lp, } } -#define SPLPAR_TLB_BIC_TOKEN 50 - /* * The size of the TLB Block Invalidate Characteristics is variable. But at the * maximum it will be the number of possible page sizes *2 + 10 bytes. @@ -1424,42 +1479,24 @@ static inline void __init check_lp_set_hblkrm(unsigned int lp, void __init pseries_lpar_read_hblkrm_characteristics(void) { - unsigned char local_buffer[SPLPAR_TLB_BIC_MAXLENGTH]; - int call_status, len, idx, bpsize; + static struct papr_sysparm_buf buf __initdata; + int len, idx, bpsize; if (!firmware_has_feature(FW_FEATURE_BLOCK_REMOVE)) return; - spin_lock(&rtas_data_buf_lock); - memset(rtas_data_buf, 0, RTAS_DATA_BUF_SIZE); - call_status = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1, - NULL, - SPLPAR_TLB_BIC_TOKEN, - __pa(rtas_data_buf), - RTAS_DATA_BUF_SIZE); - memcpy(local_buffer, rtas_data_buf, SPLPAR_TLB_BIC_MAXLENGTH); - local_buffer[SPLPAR_TLB_BIC_MAXLENGTH - 1] = '\0'; - spin_unlock(&rtas_data_buf_lock); - - if (call_status != 0) { - pr_warn("%s %s Error calling get-system-parameter (0x%x)\n", - __FILE__, __func__, call_status); + if (papr_sysparm_get(PAPR_SYSPARM_TLB_BLOCK_INVALIDATE_ATTRS, &buf)) return; - } - /* - * The first two (2) bytes of the data in the buffer are the length of - * the returned data, not counting these first two (2) bytes. - */ - len = be16_to_cpu(*((u16 *)local_buffer)) + 2; + len = be16_to_cpu(buf.len); if (len > SPLPAR_TLB_BIC_MAXLENGTH) { pr_warn("%s too large returned buffer %d", __func__, len); return; } - idx = 2; + idx = 0; while (idx < len) { - u8 block_shift = local_buffer[idx++]; + u8 block_shift = buf.val[idx++]; u32 block_size; unsigned int npsize; @@ -1468,9 +1505,9 @@ void __init pseries_lpar_read_hblkrm_characteristics(void) block_size = 1 << block_shift; - for (npsize = local_buffer[idx++]; + for (npsize = buf.val[idx++]; npsize > 0 && idx < len; npsize--) - check_lp_set_hblkrm((unsigned int) local_buffer[idx++], + check_lp_set_hblkrm((unsigned int)buf.val[idx++], block_size); } @@ -1620,7 +1657,7 @@ static int pseries_lpar_resize_hpt(unsigned long shift) } msleep(delay); rc = plpar_resize_hpt_prepare(0, shift); - }; + } switch (rc) { case H_SUCCESS: @@ -1664,32 +1701,6 @@ static int pseries_lpar_resize_hpt(unsigned long shift) return 0; } -static int pseries_lpar_register_process_table(unsigned long base, - unsigned long page_size, unsigned long table_size) -{ - long rc; - unsigned long flags = 0; - - if (table_size) - flags |= PROC_TABLE_NEW; - if (radix_enabled()) - flags |= PROC_TABLE_RADIX | PROC_TABLE_GTSE; - else - flags |= PROC_TABLE_HPT_SLB; - for (;;) { - rc = plpar_hcall_norets(H_REGISTER_PROC_TBL, flags, base, - page_size, table_size); - if (!H_IS_LONG_BUSY(rc)) - break; - mdelay(get_longbusy_msecs(rc)); - } - if (rc != H_SUCCESS) { - pr_err("Failed to register process table (rc=%ld)\n", rc); - BUG(); - } - return rc; -} - void __init hpte_init_pseries(void) { mmu_hash_ops.hpte_invalidate = pSeries_lpar_hpte_invalidate; @@ -1712,14 +1723,17 @@ void __init hpte_init_pseries(void) if (cpu_has_feature(CPU_FTR_ARCH_300)) pseries_lpar_register_process_table(0, 0, 0); } +#endif /* CONFIG_PPC_64S_HASH_MMU */ -void radix_init_pseries(void) +#ifdef CONFIG_PPC_RADIX_MMU +void __init radix_init_pseries(void) { pr_info("Using radix MMU under hypervisor\n"); pseries_lpar_register_process_table(__pa(process_tb), 0, PRTB_SIZE_SHIFT - 12); } +#endif #ifdef CONFIG_PPC_SMLPAR #define CMO_FREE_HINT_DEFAULT 1 @@ -1813,30 +1827,28 @@ void hcall_tracepoint_unregfunc(void) #endif /* - * Since the tracing code might execute hcalls we need to guard against - * recursion. One example of this are spinlocks calling H_YIELD on - * shared processor partitions. + * Keep track of hcall tracing depth and prevent recursion. Warn if any is + * detected because it may indicate a problem. This will not catch all + * problems with tracing code making hcalls, because the tracing might have + * been invoked from a non-hcall, so the first hcall could recurse into it + * without warning here, but this better than nothing. + * + * Hcalls with specific problems being traced should use the _notrace + * plpar_hcall variants. */ static DEFINE_PER_CPU(unsigned int, hcall_trace_depth); -void __trace_hcall_entry(unsigned long opcode, unsigned long *args) +notrace void __trace_hcall_entry(unsigned long opcode, unsigned long *args) { unsigned long flags; unsigned int *depth; - /* - * We cannot call tracepoints inside RCU idle regions which - * means we must not trace H_CEDE. - */ - if (opcode == H_CEDE) - return; - local_irq_save(flags); depth = this_cpu_ptr(&hcall_trace_depth); - if (*depth) + if (WARN_ON_ONCE(*depth)) goto out; (*depth)++; @@ -1848,19 +1860,16 @@ out: local_irq_restore(flags); } -void __trace_hcall_exit(long opcode, long retval, unsigned long *retbuf) +notrace void __trace_hcall_exit(long opcode, long retval, unsigned long *retbuf) { unsigned long flags; unsigned int *depth; - if (opcode == H_CEDE) - return; - local_irq_save(flags); depth = this_cpu_ptr(&hcall_trace_depth); - if (*depth) + if (*depth) /* Don't warn again on the way out */ goto out; (*depth)++; @@ -1917,7 +1926,8 @@ int h_get_mpp_x(struct hvcall_mpp_x_data *mpp_x_data) return rc; } -static unsigned long vsid_unscramble(unsigned long vsid, int ssize) +#ifdef CONFIG_PPC_64S_HASH_MMU +static unsigned long __init vsid_unscramble(unsigned long vsid, int ssize) { unsigned long protovsid; unsigned long va_bits = VA_BITS; @@ -1977,6 +1987,7 @@ static int __init reserve_vrma_context_id(void) return 0; } machine_device_initcall(pseries, reserve_vrma_context_id); +#endif #ifdef CONFIG_DEBUG_FS /* debugfs file interface for vpa data */ @@ -2005,7 +2016,7 @@ static int __init vpa_debugfs_init(void) if (!firmware_has_feature(FW_FEATURE_SPLPAR)) return 0; - vpa_dir = debugfs_create_dir("vpa", powerpc_debugfs_root); + vpa_dir = debugfs_create_dir("vpa", arch_debugfs_dir); /* set up the per-cpu vpa file*/ for_each_possible_cpu(i) { diff --git a/arch/powerpc/platforms/pseries/lparcfg.c b/arch/powerpc/platforms/pseries/lparcfg.c index e33e8bc4b69b..f73c4d1c26af 100644 --- a/arch/powerpc/platforms/pseries/lparcfg.c +++ b/arch/powerpc/platforms/pseries/lparcfg.c @@ -19,6 +19,7 @@ #include <linux/errno.h> #include <linux/proc_fs.h> #include <linux/init.h> +#include <asm/papr-sysparm.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/uaccess.h> @@ -28,7 +29,6 @@ #include <asm/firmware.h> #include <asm/rtas.h> #include <asm/time.h> -#include <asm/prom.h> #include <asm/vdso_datapage.h> #include <asm/vio.h> #include <asm/mmu.h> @@ -36,6 +36,7 @@ #include <asm/drmem.h> #include "pseries.h" +#include "vas.h" /* pseries_vas_dlpar_cpu() */ /* * This isn't a module but we expose that to userspace @@ -136,6 +137,39 @@ static unsigned int h_get_ppp(struct hvcall_ppp_data *ppp_data) return rc; } +static void show_gpci_data(struct seq_file *m) +{ + struct hv_gpci_request_buffer *buf; + unsigned int affinity_score; + long ret; + + buf = kmalloc(sizeof(*buf), GFP_KERNEL); + if (buf == NULL) + return; + + /* + * Show the local LPAR's affinity score. + * + * 0xB1 selects the Affinity_Domain_Info_By_Partition subcall. + * The score is at byte 0xB in the output buffer. + */ + memset(&buf->params, 0, sizeof(buf->params)); + buf->params.counter_request = cpu_to_be32(0xB1); + buf->params.starting_index = cpu_to_be32(-1); /* local LPAR */ + buf->params.counter_info_version_in = 0x5; /* v5+ for score */ + ret = plpar_hcall_norets(H_GET_PERF_COUNTER_INFO, virt_to_phys(buf), + sizeof(*buf)); + if (ret != H_SUCCESS) { + pr_debug("hcall failed: H_GET_PERF_COUNTER_INFO: %ld, %x\n", + ret, be32_to_cpu(buf->params.detail_rc)); + goto out; + } + affinity_score = buf->bytes[0xB]; + seq_printf(m, "partition_affinity_score=%u\n", affinity_score); +out: + kfree(buf); +} + static unsigned h_pic(unsigned long *pool_idle_time, unsigned long *num_procs) { @@ -172,7 +206,7 @@ static void parse_ppp_data(struct seq_file *m) ppp_data.active_system_procs); /* pool related entries are appropriate for shared configs */ - if (lppaca_shared_proc(get_lppaca())) { + if (lppaca_shared_proc()) { unsigned long pool_idle_time, pool_procs; seq_printf(m, "pool=%d\n", ppp_data.pool_num); @@ -278,7 +312,59 @@ static void parse_mpp_x_data(struct seq_file *m) seq_printf(m, "coalesce_pool_spurr=%ld\n", mpp_x_data.pool_spurr_cycles); } -#define SPLPAR_CHARACTERISTICS_TOKEN 20 +/* + * Read the lpar name using the RTAS ibm,get-system-parameter call. + * + * The name read through this call is updated if changes are made by the end + * user on the hypervisor side. + * + * Some hypervisor (like Qemu) may not provide this value. In that case, a non + * null value is returned. + */ +static int read_rtas_lpar_name(struct seq_file *m) +{ + struct papr_sysparm_buf *buf; + int err; + + buf = papr_sysparm_buf_alloc(); + if (!buf) + return -ENOMEM; + + err = papr_sysparm_get(PAPR_SYSPARM_LPAR_NAME, buf); + if (!err) + seq_printf(m, "partition_name=%s\n", buf->val); + + papr_sysparm_buf_free(buf); + return err; +} + +/* + * Read the LPAR name from the Device Tree. + * + * The value read in the DT is not updated if the end-user is touching the LPAR + * name on the hypervisor side. + */ +static int read_dt_lpar_name(struct seq_file *m) +{ + struct device_node *root = of_find_node_by_path("/"); + const char *name; + int ret; + + ret = of_property_read_string(root, "ibm,partition-name", &name); + of_node_put(root); + if (ret) + return -ENOENT; + + seq_printf(m, "partition_name=%s\n", name); + return 0; +} + +static void read_lpar_name(struct seq_file *m) +{ + if (read_rtas_lpar_name(m) && read_dt_lpar_name(m)) + pr_err_once("Error can't get the LPAR name"); +} + #define SPLPAR_MAXLENGTH 1026*(sizeof(char)) /* @@ -289,45 +375,25 @@ static void parse_mpp_x_data(struct seq_file *m) */ static void parse_system_parameter_string(struct seq_file *m) { - int call_status; + struct papr_sysparm_buf *buf; - unsigned char *local_buffer = kmalloc(SPLPAR_MAXLENGTH, GFP_KERNEL); - if (!local_buffer) { - printk(KERN_ERR "%s %s kmalloc failure at line %d\n", - __FILE__, __func__, __LINE__); + buf = papr_sysparm_buf_alloc(); + if (!buf) return; - } - spin_lock(&rtas_data_buf_lock); - memset(rtas_data_buf, 0, SPLPAR_MAXLENGTH); - call_status = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1, - NULL, - SPLPAR_CHARACTERISTICS_TOKEN, - __pa(rtas_data_buf), - RTAS_DATA_BUF_SIZE); - memcpy(local_buffer, rtas_data_buf, SPLPAR_MAXLENGTH); - local_buffer[SPLPAR_MAXLENGTH - 1] = '\0'; - spin_unlock(&rtas_data_buf_lock); - - if (call_status != 0) { - printk(KERN_INFO - "%s %s Error calling get-system-parameter (0x%x)\n", - __FILE__, __func__, call_status); + if (papr_sysparm_get(PAPR_SYSPARM_SHARED_PROC_LPAR_ATTRS, buf)) { + goto out_free; } else { + const char *local_buffer; int splpar_strlen; int idx, w_idx; char *workbuffer = kzalloc(SPLPAR_MAXLENGTH, GFP_KERNEL); - if (!workbuffer) { - printk(KERN_ERR "%s %s kmalloc failure at line %d\n", - __FILE__, __func__, __LINE__); - kfree(local_buffer); - return; - } -#ifdef LPARCFG_DEBUG - printk(KERN_INFO "success calling get-system-parameter\n"); -#endif - splpar_strlen = local_buffer[0] * 256 + local_buffer[1]; - local_buffer += 2; /* step over strlen value */ + + if (!workbuffer) + goto out_free; + + splpar_strlen = be16_to_cpu(buf->len); + local_buffer = buf->val; w_idx = 0; idx = 0; @@ -361,7 +427,8 @@ static void parse_system_parameter_string(struct seq_file *m) kfree(workbuffer); local_buffer -= 2; /* back up over strlen value */ } - kfree(local_buffer); +out_free: + papr_sysparm_buf_free(buf); } /* Return the number of processors in the system. @@ -435,10 +502,10 @@ static void maxmem_data(struct seq_file *m) { unsigned long maxmem = 0; - maxmem += drmem_info->n_lmbs * drmem_info->lmb_size; + maxmem += (unsigned long)drmem_info->n_lmbs * drmem_info->lmb_size; maxmem += hugetlb_total_pages() * PAGE_SIZE; - seq_printf(m, "MaxMem=%ld\n", maxmem); + seq_printf(m, "MaxMem=%lu\n", maxmem); } static int pseries_lparcfg_data(struct seq_file *m, void *v) @@ -463,6 +530,7 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v) if (firmware_has_feature(FW_FEATURE_SPLPAR)) { /* this call handles the ibm,get-system-parameter contents */ + read_lpar_name(m); parse_system_parameter_string(m); parse_ppp_data(m); parse_mpp_data(m); @@ -487,6 +555,8 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v) partition_active_processors * 100); } + show_gpci_data(m); + seq_printf(m, "partition_active_processors=%d\n", partition_active_processors); @@ -494,14 +564,17 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v) partition_potential_processors); seq_printf(m, "shared_processor_mode=%d\n", - lppaca_shared_proc(get_lppaca())); + lppaca_shared_proc()); -#ifdef CONFIG_PPC_BOOK3S_64 - seq_printf(m, "slb_size=%d\n", mmu_slb_size); +#ifdef CONFIG_PPC_64S_HASH_MMU + if (!radix_enabled()) + seq_printf(m, "slb_size=%d\n", mmu_slb_size); #endif parse_em_data(m); maxmem_data(m); + seq_printf(m, "security_flavor=%u\n", pseries_security_flavor); + return 0; } @@ -624,6 +697,16 @@ static ssize_t lparcfg_write(struct file *file, const char __user * buf, return -EINVAL; retval = update_ppp(new_entitled_ptr, NULL); + + if (retval == H_SUCCESS || retval == H_CONSTRAINED) { + /* + * The hypervisor assigns VAS resources based + * on entitled capacity for shared mode. + * Reconfig VAS windows based on DLPAR CPU events. + */ + if (pseries_vas_dlpar_cpu() != 0) + retval = H_HARDWARE; + } } else if (!strcmp(kbuf, "capacity_weight")) { char *endp; *new_weight_ptr = (u8) simple_strtoul(tmp, &endp, 10); @@ -698,12 +781,12 @@ static int lparcfg_open(struct inode *inode, struct file *file) return single_open(file, lparcfg_data, NULL); } -static const struct file_operations lparcfg_fops = { - .read = seq_read, - .write = lparcfg_write, - .open = lparcfg_open, - .release = single_release, - .llseek = seq_lseek, +static const struct proc_ops lparcfg_proc_ops = { + .proc_read = seq_read, + .proc_write = lparcfg_write, + .proc_open = lparcfg_open, + .proc_release = single_release, + .proc_lseek = seq_lseek, }; static int __init lparcfg_init(void) @@ -714,7 +797,7 @@ static int __init lparcfg_init(void) if (firmware_has_feature(FW_FEATURE_SPLPAR)) mode |= 0200; - if (!proc_create("powerpc/lparcfg", mode, NULL, &lparcfg_fops)) { + if (!proc_create("powerpc/lparcfg", mode, NULL, &lparcfg_proc_ops)) { printk(KERN_ERR "Failed to create powerpc/lparcfg\n"); return -EIO; } diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c index b571285f6c14..1798f0f14d58 100644 --- a/arch/powerpc/platforms/pseries/mobility.c +++ b/arch/powerpc/platforms/pseries/mobility.c @@ -6,12 +6,17 @@ * Copyright (C) 2010 IBM Corporation */ + +#define pr_fmt(fmt) "mobility: " fmt + #include <linux/cpu.h> #include <linux/kernel.h> #include <linux/kobject.h> +#include <linux/nmi.h> #include <linux/sched.h> #include <linux/smp.h> #include <linux/stat.h> +#include <linux/stop_machine.h> #include <linux/completion.h> #include <linux/device.h> #include <linux/delay.h> @@ -19,8 +24,10 @@ #include <linux/stringify.h> #include <asm/machdep.h> +#include <asm/nmi.h> #include <asm/rtas.h> #include "pseries.h" +#include "vas.h" /* vas_migration_handler() */ #include "../../kernel/cacheinfo.h" static struct kobject *mobility_kobj; @@ -42,6 +49,30 @@ struct update_props_workarea { #define MIGRATION_SCOPE (1) #define PRRN_SCOPE -2 +#ifdef CONFIG_PPC_WATCHDOG +static unsigned int nmi_wd_lpm_factor = 200; + +#ifdef CONFIG_SYSCTL +static struct ctl_table nmi_wd_lpm_factor_ctl_table[] = { + { + .procname = "nmi_wd_lpm_factor", + .data = &nmi_wd_lpm_factor, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + }, +}; + +static int __init register_nmi_wd_lpm_factor_sysctl(void) +{ + register_sysctl("kernel", nmi_wd_lpm_factor_ctl_table); + + return 0; +} +device_initcall(register_nmi_wd_lpm_factor_sysctl); +#endif /* CONFIG_SYSCTL */ +#endif /* CONFIG_PPC_WATCHDOG */ + static int mobility_rtas_call(int token, char *buf, s32 scope) { int rc; @@ -56,16 +87,31 @@ static int mobility_rtas_call(int token, char *buf, s32 scope) return rc; } -static int delete_dt_node(__be32 phandle) +static int delete_dt_node(struct device_node *dn) { - struct device_node *dn; + struct device_node *pdn; + bool is_platfac; - dn = of_find_node_by_phandle(be32_to_cpu(phandle)); - if (!dn) - return -ENOENT; + pdn = of_get_parent(dn); + is_platfac = of_node_is_type(dn, "ibm,platform-facilities") || + of_node_is_type(pdn, "ibm,platform-facilities"); + of_node_put(pdn); + + /* + * The drivers that bind to nodes in the platform-facilities + * hierarchy don't support node removal, and the removal directive + * from firmware is always followed by an add of an equivalent + * node. The capability (e.g. RNG, encryption, compression) + * represented by the node is never interrupted by the migration. + * So ignore changes to this part of the tree. + */ + if (is_platfac) { + pr_notice("ignoring remove operation for %pOFfp\n", dn); + return 0; + } + pr_debug("removing node %pOFfp\n", dn); dlpar_detach_node(dn); - of_node_put(dn); return 0; } @@ -122,6 +168,7 @@ static int update_dt_property(struct device_node *dn, struct property **prop, } if (!more) { + pr_debug("updating node %pOF property %s\n", dn, name); of_update_property(dn, new_prop); *prop = NULL; } @@ -129,10 +176,9 @@ static int update_dt_property(struct device_node *dn, struct property **prop, return 0; } -static int update_dt_node(__be32 phandle, s32 scope) +static int update_dt_node(struct device_node *dn, s32 scope) { struct update_props_workarea *upwa; - struct device_node *dn; struct property *prop = NULL; int i, rc, rtas_rc; char *prop_data; @@ -141,7 +187,7 @@ static int update_dt_node(__be32 phandle, s32 scope) u32 nprops; u32 vd; - update_properties_token = rtas_token("ibm,update-properties"); + update_properties_token = rtas_function_token(RTAS_FN_IBM_UPDATE_PROPERTIES); if (update_properties_token == RTAS_UNKNOWN_SERVICE) return -EINVAL; @@ -149,14 +195,8 @@ static int update_dt_node(__be32 phandle, s32 scope) if (!rtas_buf) return -ENOMEM; - dn = of_find_node_by_phandle(be32_to_cpu(phandle)); - if (!dn) { - kfree(rtas_buf); - return -ENOENT; - } - upwa = (struct update_props_workarea *)&rtas_buf[0]; - upwa->phandle = phandle; + upwa->phandle = cpu_to_be32(dn->phandle); do { rtas_rc = mobility_rtas_call(update_properties_token, rtas_buf, @@ -168,7 +208,7 @@ static int update_dt_node(__be32 phandle, s32 scope) nprops = be32_to_cpu(upwa->nprops); /* On the first call to ibm,update-properties for a node the - * the first property value descriptor contains an empty + * first property value descriptor contains an empty * property name, the property value length encoded as u32, * and the property value is the node path being updated. */ @@ -202,11 +242,12 @@ static int update_dt_node(__be32 phandle, s32 scope) rc = update_dt_property(dn, &prop, prop_name, vd, prop_data); if (rc) { - printk(KERN_ERR "Could not update %s" - " property\n", prop_name); + pr_err("updating %s property failed: %d\n", + prop_name, rc); } prop_data += vd; + break; } cond_resched(); @@ -215,68 +256,51 @@ static int update_dt_node(__be32 phandle, s32 scope) cond_resched(); } while (rtas_rc == 1); - of_node_put(dn); kfree(rtas_buf); return 0; } -static int add_dt_node(__be32 parent_phandle, __be32 drc_index) +static int add_dt_node(struct device_node *parent_dn, __be32 drc_index) { struct device_node *dn; - struct device_node *parent_dn; int rc; - parent_dn = of_find_node_by_phandle(be32_to_cpu(parent_phandle)); - if (!parent_dn) - return -ENOENT; - dn = dlpar_configure_connector(drc_index, parent_dn); - if (!dn) { - of_node_put(parent_dn); + if (!dn) return -ENOENT; + + /* + * Since delete_dt_node() ignores this node type, this is the + * necessary counterpart. We also know that a platform-facilities + * node returned from dlpar_configure_connector() has children + * attached, and dlpar_attach_node() only adds the parent, leaking + * the children. So ignore these on the add side for now. + */ + if (of_node_is_type(dn, "ibm,platform-facilities")) { + pr_notice("ignoring add operation for %pOF\n", dn); + dlpar_free_cc_nodes(dn); + return 0; } rc = dlpar_attach_node(dn, parent_dn); if (rc) dlpar_free_cc_nodes(dn); - of_node_put(parent_dn); - return rc; -} + pr_debug("added node %pOFfp\n", dn); -static void prrn_update_node(__be32 phandle) -{ - struct pseries_hp_errorlog hp_elog; - struct device_node *dn; - - /* - * If a node is found from a the given phandle, the phandle does not - * represent the drc index of an LMB and we can ignore. - */ - dn = of_find_node_by_phandle(be32_to_cpu(phandle)); - if (dn) { - of_node_put(dn); - return; - } - - hp_elog.resource = PSERIES_HP_ELOG_RESOURCE_MEM; - hp_elog.action = PSERIES_HP_ELOG_ACTION_READD; - hp_elog.id_type = PSERIES_HP_ELOG_ID_DRC_INDEX; - hp_elog._drc_u.drc_index = phandle; - - handle_dlpar_errorlog(&hp_elog); + return rc; } -int pseries_devicetree_update(s32 scope) +static int pseries_devicetree_update(s32 scope) { char *rtas_buf; __be32 *data; int update_nodes_token; int rc; - update_nodes_token = rtas_token("ibm,update-nodes"); + update_nodes_token = rtas_function_token(RTAS_FN_IBM_UPDATE_NODES); if (update_nodes_token == RTAS_UNKNOWN_SERVICE) - return -EINVAL; + return 0; rtas_buf = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL); if (!rtas_buf) @@ -296,26 +320,31 @@ int pseries_devicetree_update(s32 scope) data++; for (i = 0; i < node_count; i++) { + struct device_node *np; __be32 phandle = *data++; __be32 drc_index; + np = of_find_node_by_phandle(be32_to_cpu(phandle)); + if (!np) { + pr_warn("Failed lookup: phandle 0x%x for action 0x%x\n", + be32_to_cpu(phandle), action); + continue; + } + switch (action) { case DELETE_DT_NODE: - delete_dt_node(phandle); + delete_dt_node(np); break; case UPDATE_DT_NODE: - update_dt_node(phandle, scope); - - if (scope == PRRN_SCOPE) - prrn_update_node(phandle); - + update_dt_node(np, scope); break; case ADD_DT_NODE: drc_index = *data++; - add_dt_node(phandle, drc_index); + add_dt_node(np, drc_index); break; } + of_node_put(np); cond_resched(); } } @@ -330,21 +359,8 @@ int pseries_devicetree_update(s32 scope) void post_mobility_fixup(void) { int rc; - int activate_fw_token; - activate_fw_token = rtas_token("ibm,activate-firmware"); - if (activate_fw_token == RTAS_UNKNOWN_SERVICE) { - printk(KERN_ERR "Could not make post-mobility " - "activate-fw call.\n"); - return; - } - - do { - rc = rtas_call(activate_fw_token, 0, 1, NULL); - } while (rtas_busy_delay(rc)); - - if (rc) - printk(KERN_ERR "Post-mobility activate-fw failed: %d\n", rc); + rtas_activate_firmware(); /* * We don't want CPUs to go online/offline while the device @@ -361,21 +377,410 @@ void post_mobility_fixup(void) rc = pseries_devicetree_update(MIGRATION_SCOPE); if (rc) - printk(KERN_ERR "Post-mobility device tree update " - "failed: %d\n", rc); + pr_err("device tree update failed: %d\n", rc); cacheinfo_rebuild(); cpus_read_unlock(); - /* Possibly switch to a new RFI flush type */ - pseries_setup_rfi_flush(); + /* Possibly switch to a new L1 flush type */ + pseries_setup_security_mitigations(); + + /* Reinitialise system information for hv-24x7 */ + read_24x7_sys_info(); return; } -static ssize_t migration_store(struct class *class, - struct class_attribute *attr, const char *buf, +static int poll_vasi_state(u64 handle, unsigned long *res) +{ + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + long hvrc; + int ret; + + hvrc = plpar_hcall(H_VASI_STATE, retbuf, handle); + switch (hvrc) { + case H_SUCCESS: + ret = 0; + *res = retbuf[0]; + break; + case H_PARAMETER: + ret = -EINVAL; + break; + case H_FUNCTION: + ret = -EOPNOTSUPP; + break; + case H_HARDWARE: + default: + pr_err("unexpected H_VASI_STATE result %ld\n", hvrc); + ret = -EIO; + break; + } + return ret; +} + +static int wait_for_vasi_session_suspending(u64 handle) +{ + unsigned long state; + int ret; + + /* + * Wait for transition from H_VASI_ENABLED to + * H_VASI_SUSPENDING. Treat anything else as an error. + */ + while (true) { + ret = poll_vasi_state(handle, &state); + + if (ret != 0 || state == H_VASI_SUSPENDING) { + break; + } else if (state == H_VASI_ENABLED) { + ssleep(1); + } else { + pr_err("unexpected H_VASI_STATE result %lu\n", state); + ret = -EIO; + break; + } + } + + /* + * Proceed even if H_VASI_STATE is unavailable. If H_JOIN or + * ibm,suspend-me are also unimplemented, we'll recover then. + */ + if (ret == -EOPNOTSUPP) + ret = 0; + + return ret; +} + +static void wait_for_vasi_session_completed(u64 handle) +{ + unsigned long state = 0; + int ret; + + pr_info("waiting for memory transfer to complete...\n"); + + /* + * Wait for transition from H_VASI_RESUMED to H_VASI_COMPLETED. + */ + while (true) { + ret = poll_vasi_state(handle, &state); + + /* + * If the memory transfer is already complete and the migration + * has been cleaned up by the hypervisor, H_PARAMETER is return, + * which is translate in EINVAL by poll_vasi_state(). + */ + if (ret == -EINVAL || (!ret && state == H_VASI_COMPLETED)) { + pr_info("memory transfer completed.\n"); + break; + } + + if (ret) { + pr_err("H_VASI_STATE return error (%d)\n", ret); + break; + } + + if (state != H_VASI_RESUMED) { + pr_err("unexpected H_VASI_STATE result %lu\n", state); + break; + } + + msleep(500); + } +} + +static void prod_single(unsigned int target_cpu) +{ + long hvrc; + int hwid; + + hwid = get_hard_smp_processor_id(target_cpu); + hvrc = plpar_hcall_norets(H_PROD, hwid); + if (hvrc == H_SUCCESS) + return; + pr_err_ratelimited("H_PROD of CPU %u (hwid %d) error: %ld\n", + target_cpu, hwid, hvrc); +} + +static void prod_others(void) +{ + unsigned int cpu; + + for_each_online_cpu(cpu) { + if (cpu != smp_processor_id()) + prod_single(cpu); + } +} + +static u16 clamp_slb_size(void) +{ +#ifdef CONFIG_PPC_64S_HASH_MMU + u16 prev = mmu_slb_size; + + slb_set_size(SLB_MIN_SIZE); + + return prev; +#else + return 0; +#endif +} + +static int do_suspend(void) +{ + u16 saved_slb_size; + int status; + int ret; + + pr_info("calling ibm,suspend-me on CPU %i\n", smp_processor_id()); + + /* + * The destination processor model may have fewer SLB entries + * than the source. We reduce mmu_slb_size to a safe minimum + * before suspending in order to minimize the possibility of + * programming non-existent entries on the destination. If + * suspend fails, we restore it before returning. On success + * the OF reconfig path will update it from the new device + * tree after resuming on the destination. + */ + saved_slb_size = clamp_slb_size(); + + ret = rtas_ibm_suspend_me(&status); + if (ret != 0) { + pr_err("ibm,suspend-me error: %d\n", status); + slb_set_size(saved_slb_size); + } + + return ret; +} + +/** + * struct pseries_suspend_info - State shared between CPUs for join/suspend. + * @counter: Threads are to increment this upon resuming from suspend + * or if an error is received from H_JOIN. The thread which performs + * the first increment (i.e. sets it to 1) is responsible for + * waking the other threads. + * @done: False if join/suspend is in progress. True if the operation is + * complete (successful or not). + */ +struct pseries_suspend_info { + atomic_t counter; + bool done; +}; + +static int do_join(void *arg) +{ + struct pseries_suspend_info *info = arg; + atomic_t *counter = &info->counter; + long hvrc; + int ret; + +retry: + /* Must ensure MSR.EE off for H_JOIN. */ + hard_irq_disable(); + hvrc = plpar_hcall_norets(H_JOIN); + + switch (hvrc) { + case H_CONTINUE: + /* + * All other CPUs are offline or in H_JOIN. This CPU + * attempts the suspend. + */ + ret = do_suspend(); + break; + case H_SUCCESS: + /* + * The suspend is complete and this cpu has received a + * prod, or we've received a stray prod from unrelated + * code (e.g. paravirt spinlocks) and we need to join + * again. + * + * This barrier orders the return from H_JOIN above vs + * the load of info->done. It pairs with the barrier + * in the wakeup/prod path below. + */ + smp_mb(); + if (READ_ONCE(info->done) == false) { + pr_info_ratelimited("premature return from H_JOIN on CPU %i, retrying", + smp_processor_id()); + goto retry; + } + ret = 0; + break; + case H_BAD_MODE: + case H_HARDWARE: + default: + ret = -EIO; + pr_err_ratelimited("H_JOIN error %ld on CPU %i\n", + hvrc, smp_processor_id()); + break; + } + + if (atomic_inc_return(counter) == 1) { + pr_info("CPU %u waking all threads\n", smp_processor_id()); + WRITE_ONCE(info->done, true); + /* + * This barrier orders the store to info->done vs subsequent + * H_PRODs to wake the other CPUs. It pairs with the barrier + * in the H_SUCCESS case above. + */ + smp_mb(); + prod_others(); + } + /* + * Execution may have been suspended for several seconds, so reset + * the watchdogs. touch_nmi_watchdog() also touches the soft lockup + * watchdog. + */ + rcu_cpu_stall_reset(); + touch_nmi_watchdog(); + + return ret; +} + +/* + * Abort reason code byte 0. We use only the 'Migrating partition' value. + */ +enum vasi_aborting_entity { + ORCHESTRATOR = 1, + VSP_SOURCE = 2, + PARTITION_FIRMWARE = 3, + PLATFORM_FIRMWARE = 4, + VSP_TARGET = 5, + MIGRATING_PARTITION = 6, +}; + +static void pseries_cancel_migration(u64 handle, int err) +{ + u32 reason_code; + u32 detail; + u8 entity; + long hvrc; + + entity = MIGRATING_PARTITION; + detail = abs(err) & 0xffffff; + reason_code = (entity << 24) | detail; + + hvrc = plpar_hcall_norets(H_VASI_SIGNAL, handle, + H_VASI_SIGNAL_CANCEL, reason_code); + if (hvrc) + pr_err("H_VASI_SIGNAL error: %ld\n", hvrc); +} + +static int pseries_suspend(u64 handle) +{ + const unsigned int max_attempts = 5; + unsigned int retry_interval_ms = 1; + unsigned int attempt = 1; + int ret; + + while (true) { + struct pseries_suspend_info info; + unsigned long vasi_state; + int vasi_err; + + info = (struct pseries_suspend_info) { + .counter = ATOMIC_INIT(0), + .done = false, + }; + + ret = stop_machine(do_join, &info, cpu_online_mask); + if (ret == 0) + break; + /* + * Encountered an error. If the VASI stream is still + * in Suspending state, it's likely a transient + * condition related to some device in the partition + * and we can retry in the hope that the cause has + * cleared after some delay. + * + * A better design would allow drivers etc to prepare + * for the suspend and avoid conditions which prevent + * the suspend from succeeding. For now, we have this + * mitigation. + */ + pr_notice("Partition suspend attempt %u of %u error: %d\n", + attempt, max_attempts, ret); + + if (attempt == max_attempts) + break; + + vasi_err = poll_vasi_state(handle, &vasi_state); + if (vasi_err == 0) { + if (vasi_state != H_VASI_SUSPENDING) { + pr_notice("VASI state %lu after failed suspend\n", + vasi_state); + break; + } + } else if (vasi_err != -EOPNOTSUPP) { + pr_err("VASI state poll error: %d", vasi_err); + break; + } + + pr_notice("Will retry partition suspend after %u ms\n", + retry_interval_ms); + + msleep(retry_interval_ms); + retry_interval_ms *= 10; + attempt++; + } + + return ret; +} + +static int pseries_migrate_partition(u64 handle) +{ + int ret; + unsigned int factor = 0; + +#ifdef CONFIG_PPC_WATCHDOG + factor = nmi_wd_lpm_factor; +#endif + /* + * When the migration is initiated, the hypervisor changes VAS + * mappings to prepare before OS gets the notification and + * closes all VAS windows. NX generates continuous faults during + * this time and the user space can not differentiate these + * faults from the migration event. So reduce this time window + * by closing VAS windows at the beginning of this function. + */ + vas_migration_handler(VAS_SUSPEND); + + ret = wait_for_vasi_session_suspending(handle); + if (ret) + goto out; + + if (factor) + watchdog_hardlockup_set_timeout_pct(factor); + + ret = pseries_suspend(handle); + if (ret == 0) { + post_mobility_fixup(); + /* + * Wait until the memory transfer is complete, so that the user + * space process returns from the syscall after the transfer is + * complete. This allows the user hooks to be executed at the + * right time. + */ + wait_for_vasi_session_completed(handle); + } else + pseries_cancel_migration(handle, ret); + + if (factor) + watchdog_hardlockup_set_timeout_pct(0); + +out: + vas_migration_handler(VAS_RESUME); + + return ret; +} + +int rtas_syscall_dispatch_ibm_suspend_me(u64 handle) +{ + return pseries_migrate_partition(handle); +} + +static ssize_t migration_store(const struct class *class, + const struct class_attribute *attr, const char *buf, size_t count) { u64 streamid; @@ -385,21 +790,10 @@ static ssize_t migration_store(struct class *class, if (rc) return rc; - stop_topology_update(); - - do { - rc = rtas_ibm_suspend_me(streamid); - if (rc == -EAGAIN) - ssleep(1); - } while (rc == -EAGAIN); - + rc = pseries_migrate_partition(streamid); if (rc) return rc; - post_mobility_fixup(); - - start_topology_update(); - return count; } @@ -424,11 +818,11 @@ static int __init mobility_sysfs_init(void) rc = sysfs_create_file(mobility_kobj, &class_attr_migration.attr); if (rc) - pr_err("mobility: unable to create migration sysfs file (%d)\n", rc); + pr_err("unable to create migration sysfs file (%d)\n", rc); rc = sysfs_create_file(mobility_kobj, &class_attr_api_version.attr.attr); if (rc) - pr_err("mobility: unable to create api_version sysfs file (%d)\n", rc); + pr_err("unable to create api_version sysfs file (%d)\n", rc); return 0; } diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c index 133f6adcb39c..6dfb55b52d36 100644 --- a/arch/powerpc/platforms/pseries/msi.c +++ b/arch/powerpc/platforms/pseries/msi.c @@ -4,14 +4,17 @@ * Copyright 2006-2007 Michael Ellerman, IBM Corp. */ +#include <linux/crash_dump.h> #include <linux/device.h> #include <linux/irq.h> +#include <linux/irqdomain.h> #include <linux/msi.h> #include <asm/rtas.h> #include <asm/hw_irq.h> #include <asm/ppc-pci.h> #include <asm/machdep.h> +#include <asm/xive.h> #include "pseries.h" @@ -23,6 +26,7 @@ static int query_token, change_token; #define RTAS_CHANGE_MSI_FN 3 #define RTAS_CHANGE_MSIX_FN 4 #define RTAS_CHANGE_32MSI_FN 5 +#define RTAS_CHANGE_32MSIX_FN 6 /* RTAS Helpers */ @@ -38,7 +42,7 @@ static int rtas_change_msi(struct pci_dn *pdn, u32 func, u32 num_irqs) seq_num = 1; do { if (func == RTAS_CHANGE_MSI_FN || func == RTAS_CHANGE_MSIX_FN || - func == RTAS_CHANGE_32MSI_FN) + func == RTAS_CHANGE_32MSI_FN || func == RTAS_CHANGE_32MSIX_FN) rc = rtas_call(change_token, 6, 4, rtas_ret, addr, BUID_HI(buid), BUID_LO(buid), func, num_irqs, seq_num); @@ -109,21 +113,6 @@ static int rtas_query_irq_number(struct pci_dn *pdn, int offset) return rtas_ret[0]; } -static void rtas_teardown_msi_irqs(struct pci_dev *pdev) -{ - struct msi_desc *entry; - - for_each_pci_msi_entry(entry, pdev) { - if (!entry->irq) - continue; - - irq_set_msi_desc(entry->irq, NULL); - irq_dispose_mapping(entry->irq); - } - - rtas_disable_msi(pdev); -} - static int check_req(struct pci_dev *pdev, int nvec, char *prop_name) { struct device_node *dn; @@ -163,12 +152,12 @@ static int check_req_msix(struct pci_dev *pdev, int nvec) /* Quota calculation */ -static struct device_node *find_pe_total_msi(struct pci_dev *dev, int *total) +static struct device_node *__find_pe_total_msi(struct device_node *node, int *total) { struct device_node *dn; const __be32 *p; - dn = of_node_get(pci_device_to_OF_node(dev)); + dn = of_node_get(node); while (dn) { p = of_get_property(dn, "ibm,pe-total-#msi", NULL); if (p) { @@ -184,6 +173,11 @@ static struct device_node *find_pe_total_msi(struct pci_dev *dev, int *total) return NULL; } +static struct device_node *find_pe_total_msi(struct pci_dev *dev, int *total) +{ + return __find_pe_total_msi(pci_device_to_OF_node(dev), total); +} + static struct device_node *find_pe_dn(struct pci_dev *dev, int *total) { struct device_node *dn; @@ -329,27 +323,6 @@ out: return request; } -static int check_msix_entries(struct pci_dev *pdev) -{ - struct msi_desc *entry; - int expected; - - /* There's no way for us to express to firmware that we want - * a discontiguous, or non-zero based, range of MSI-X entries. - * So we must reject such requests. */ - - expected = 0; - for_each_pci_msi_entry(entry, pdev) { - if (entry->msi_attrib.entry_nr != expected) { - pr_debug("rtas_msi: bad MSI-X entries.\n"); - return -EINVAL; - } - expected++; - } - - return 0; -} - static void rtas_hack_32bit_msi_gen2(struct pci_dev *pdev) { u32 addr_hi, addr_lo; @@ -367,12 +340,11 @@ static void rtas_hack_32bit_msi_gen2(struct pci_dev *pdev) pci_write_config_dword(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_HI, 0); } -static int rtas_setup_msi_irqs(struct pci_dev *pdev, int nvec_in, int type) +static int rtas_prepare_msi_irqs(struct pci_dev *pdev, int nvec_in, int type, + msi_alloc_info_t *arg) { struct pci_dn *pdn; - int hwirq, virq, i, quota, rc; - struct msi_desc *entry; - struct msi_msg msg; + int quota, rc; int nvec = nvec_in; int use_32bit_msi_hack = 0; @@ -389,9 +361,6 @@ static int rtas_setup_msi_irqs(struct pci_dev *pdev, int nvec_in, int type) if (quota && quota < nvec) return quota; - if (type == PCI_CAP_ID_MSIX && check_msix_entries(pdev)) - return -EINVAL; - /* * Firmware currently refuse any non power of two allocation * so we round up if the quota will allow it. @@ -438,8 +407,12 @@ again: if (use_32bit_msi_hack && rc > 0) rtas_hack_32bit_msi_gen2(pdev); - } else - rc = rtas_change_msi(pdn, RTAS_CHANGE_MSIX_FN, nvec); + } else { + if (pdev->no_64bit_msi) + rc = rtas_change_msi(pdn, RTAS_CHANGE_32MSIX_FN, nvec); + else + rc = rtas_change_msi(pdn, RTAS_CHANGE_MSIX_FN, nvec); + } if (rc != nvec) { if (nvec != nvec_in) { @@ -450,32 +423,247 @@ again: return rc; } - i = 0; - for_each_pci_msi_entry(entry, pdev) { - hwirq = rtas_query_irq_number(pdn, i++); - if (hwirq < 0) { - pr_debug("rtas_msi: error (%d) getting hwirq\n", rc); - return hwirq; - } + return 0; +} - virq = irq_create_mapping(NULL, hwirq); +static int pseries_msi_ops_prepare(struct irq_domain *domain, struct device *dev, + int nvec, msi_alloc_info_t *arg) +{ + struct pci_dev *pdev = to_pci_dev(dev); + int type = pdev->msix_enabled ? PCI_CAP_ID_MSIX : PCI_CAP_ID_MSI; - if (!virq) { - pr_debug("rtas_msi: Failed mapping hwirq %d\n", hwirq); - return -ENOSPC; - } + return rtas_prepare_msi_irqs(pdev, nvec, type, arg); +} + +/* + * ->msi_free() is called before irq_domain_free_irqs_top() when the + * handler data is still available. Use that to clear the XIVE + * controller data. + */ +static void pseries_msi_ops_msi_free(struct irq_domain *domain, + struct msi_domain_info *info, + unsigned int irq) +{ + if (xive_enabled()) + xive_irq_free_data(irq); +} + +/* + * RTAS can not disable one MSI at a time. It's all or nothing. Do it + * at the end after all IRQs have been freed. + */ +static void pseries_msi_post_free(struct irq_domain *domain, struct device *dev) +{ + if (WARN_ON_ONCE(!dev_is_pci(dev))) + return; + + rtas_disable_msi(to_pci_dev(dev)); +} + +static struct msi_domain_ops pseries_pci_msi_domain_ops = { + .msi_prepare = pseries_msi_ops_prepare, + .msi_free = pseries_msi_ops_msi_free, + .msi_post_free = pseries_msi_post_free, +}; + +static void pseries_msi_shutdown(struct irq_data *d) +{ + d = d->parent_data; + if (d->chip->irq_shutdown) + d->chip->irq_shutdown(d); +} + +static void pseries_msi_mask(struct irq_data *d) +{ + pci_msi_mask_irq(d); + irq_chip_mask_parent(d); +} + +static void pseries_msi_unmask(struct irq_data *d) +{ + pci_msi_unmask_irq(d); + irq_chip_unmask_parent(d); +} + +static void pseries_msi_write_msg(struct irq_data *data, struct msi_msg *msg) +{ + struct msi_desc *entry = irq_data_get_msi_desc(data); + + /* + * Do not update the MSIx vector table. It's not strictly necessary + * because the table is initialized by the underlying hypervisor, PowerVM + * or QEMU/KVM. However, if the MSIx vector entry is cleared, any further + * activation will fail. This can happen in some drivers (eg. IPR) which + * deactivate an IRQ used for testing MSI support. + */ + entry->msg = *msg; +} + +static struct irq_chip pseries_pci_msi_irq_chip = { + .name = "pSeries-PCI-MSI", + .irq_shutdown = pseries_msi_shutdown, + .irq_mask = pseries_msi_mask, + .irq_unmask = pseries_msi_unmask, + .irq_eoi = irq_chip_eoi_parent, + .irq_write_msi_msg = pseries_msi_write_msg, +}; + + +/* + * Set MSI_FLAG_MSIX_CONTIGUOUS as there is no way to express to + * firmware to request a discontiguous or non-zero based range of + * MSI-X entries. Core code will reject such setup attempts. + */ +static struct msi_domain_info pseries_msi_domain_info = { + .flags = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | + MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX | + MSI_FLAG_MSIX_CONTIGUOUS), + .ops = &pseries_pci_msi_domain_ops, + .chip = &pseries_pci_msi_irq_chip, +}; + +static void pseries_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) +{ + __pci_read_msi_msg(irq_data_get_msi_desc(data), msg); +} + +static struct irq_chip pseries_msi_irq_chip = { + .name = "pSeries-MSI", + .irq_shutdown = pseries_msi_shutdown, + .irq_mask = irq_chip_mask_parent, + .irq_unmask = irq_chip_unmask_parent, + .irq_eoi = irq_chip_eoi_parent, + .irq_set_affinity = irq_chip_set_affinity_parent, + .irq_compose_msi_msg = pseries_msi_compose_msg, +}; + +static int pseries_irq_parent_domain_alloc(struct irq_domain *domain, unsigned int virq, + irq_hw_number_t hwirq) +{ + struct irq_fwspec parent_fwspec; + int ret; + + parent_fwspec.fwnode = domain->parent->fwnode; + parent_fwspec.param_count = 2; + parent_fwspec.param[0] = hwirq; + parent_fwspec.param[1] = IRQ_TYPE_EDGE_RISING; + + ret = irq_domain_alloc_irqs_parent(domain, virq, 1, &parent_fwspec); + if (ret) + return ret; + + return 0; +} + +static int pseries_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + struct pci_controller *phb = domain->host_data; + msi_alloc_info_t *info = arg; + struct msi_desc *desc = info->desc; + struct pci_dev *pdev = msi_desc_to_pci_dev(desc); + int hwirq; + int i, ret; + + hwirq = rtas_query_irq_number(pci_get_pdn(pdev), desc->msi_index); + if (hwirq < 0) { + dev_err(&pdev->dev, "Failed to query HW IRQ: %d\n", hwirq); + return hwirq; + } + + dev_dbg(&pdev->dev, "%s bridge %pOF %d/%x #%d\n", __func__, + phb->dn, virq, hwirq, nr_irqs); + + for (i = 0; i < nr_irqs; i++) { + ret = pseries_irq_parent_domain_alloc(domain, virq + i, hwirq + i); + if (ret) + goto out; + + irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i, + &pseries_msi_irq_chip, domain->host_data); + } + + return 0; + +out: + /* TODO: handle RTAS cleanup in ->msi_finish() ? */ + irq_domain_free_irqs_parent(domain, virq, i - 1); + return ret; +} + +static void pseries_irq_domain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) +{ + struct irq_data *d = irq_domain_get_irq_data(domain, virq); + struct pci_controller *phb = irq_data_get_irq_chip_data(d); + + pr_debug("%s bridge %pOF %d #%d\n", __func__, phb->dn, virq, nr_irqs); + + /* XIVE domain data is cleared through ->msi_free() */ +} - dev_dbg(&pdev->dev, "rtas_msi: allocated virq %d\n", virq); - irq_set_msi_desc(virq, entry); +static const struct irq_domain_ops pseries_irq_domain_ops = { + .alloc = pseries_irq_domain_alloc, + .free = pseries_irq_domain_free, +}; + +static int __pseries_msi_allocate_domains(struct pci_controller *phb, + unsigned int count) +{ + struct irq_domain *parent = irq_get_default_host(); + + phb->fwnode = irq_domain_alloc_named_id_fwnode("pSeries-MSI", + phb->global_number); + if (!phb->fwnode) + return -ENOMEM; + + phb->dev_domain = irq_domain_create_hierarchy(parent, 0, count, + phb->fwnode, + &pseries_irq_domain_ops, phb); + if (!phb->dev_domain) { + pr_err("PCI: failed to create IRQ domain bridge %pOF (domain %d)\n", + phb->dn, phb->global_number); + irq_domain_free_fwnode(phb->fwnode); + return -ENOMEM; + } - /* Read config space back so we can restore after reset */ - __pci_read_msi_msg(entry, &msg); - entry->msg = msg; + phb->msi_domain = pci_msi_create_irq_domain(of_node_to_fwnode(phb->dn), + &pseries_msi_domain_info, + phb->dev_domain); + if (!phb->msi_domain) { + pr_err("PCI: failed to create MSI IRQ domain bridge %pOF (domain %d)\n", + phb->dn, phb->global_number); + irq_domain_free_fwnode(phb->fwnode); + irq_domain_remove(phb->dev_domain); + return -ENOMEM; } return 0; } +int pseries_msi_allocate_domains(struct pci_controller *phb) +{ + int count; + + if (!__find_pe_total_msi(phb->dn, &count)) { + pr_err("PCI: failed to find MSIs for bridge %pOF (domain %d)\n", + phb->dn, phb->global_number); + return -ENOSPC; + } + + return __pseries_msi_allocate_domains(phb, count); +} + +void pseries_msi_free_domains(struct pci_controller *phb) +{ + if (phb->msi_domain) + irq_domain_remove(phb->msi_domain); + if (phb->dev_domain) + irq_domain_remove(phb->dev_domain); + if (phb->fwnode) + irq_domain_free_fwnode(phb->fwnode); +} + static void rtas_msi_pci_irq_fixup(struct pci_dev *pdev) { /* No LSI -> leave MSIs (if any) configured */ @@ -496,10 +684,8 @@ static void rtas_msi_pci_irq_fixup(struct pci_dev *pdev) static int rtas_msi_init(void) { - struct pci_controller *phb; - - query_token = rtas_token("ibm,query-interrupt-source-number"); - change_token = rtas_token("ibm,change-msi"); + query_token = rtas_function_token(RTAS_FN_IBM_QUERY_INTERRUPT_SOURCE_NUMBER); + change_token = rtas_function_token(RTAS_FN_IBM_CHANGE_MSI); if ((query_token == RTAS_UNKNOWN_SERVICE) || (change_token == RTAS_UNKNOWN_SERVICE)) { @@ -509,16 +695,6 @@ static int rtas_msi_init(void) pr_debug("rtas_msi: Registering RTAS MSI callbacks.\n"); - WARN_ON(pseries_pci_controller_ops.setup_msi_irqs); - pseries_pci_controller_ops.setup_msi_irqs = rtas_setup_msi_irqs; - pseries_pci_controller_ops.teardown_msi_irqs = rtas_teardown_msi_irqs; - - list_for_each_entry(phb, &hose_list, list_node) { - WARN_ON(phb->controller_ops.setup_msi_irqs); - phb->controller_ops.setup_msi_irqs = rtas_setup_msi_irqs; - phb->controller_ops.teardown_msi_irqs = rtas_teardown_msi_irqs; - } - WARN_ON(ppc_md.pci_irq_fixup); ppc_md.pci_irq_fixup = rtas_msi_pci_irq_fixup; diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c index 69db2eca367f..8130c37962c0 100644 --- a/arch/powerpc/platforms/pseries/nvram.c +++ b/arch/powerpc/platforms/pseries/nvram.c @@ -13,9 +13,9 @@ #include <linux/slab.h> #include <linux/ctype.h> #include <linux/uaccess.h> +#include <linux/of.h> #include <asm/nvram.h> #include <asm/rtas.h> -#include <asm/prom.h> #include <asm/machdep.h> /* Max bytes to read/write in one go */ @@ -227,8 +227,8 @@ int __init pSeries_nvram_init(void) nvram_size = be32_to_cpup(nbytes_p); - nvram_fetch = rtas_token("nvram-fetch"); - nvram_store = rtas_token("nvram-store"); + nvram_fetch = rtas_function_token(RTAS_FN_NVRAM_FETCH); + nvram_store = rtas_function_token(RTAS_FN_NVRAM_STORE); printk(KERN_INFO "PPC64 nvram contains %d bytes\n", nvram_size); of_node_put(nvram); diff --git a/arch/powerpc/platforms/pseries/of_helpers.c b/arch/powerpc/platforms/pseries/of_helpers.c index 66dfd8256712..23241c71ef37 100644 --- a/arch/powerpc/platforms/pseries/of_helpers.c +++ b/arch/powerpc/platforms/pseries/of_helpers.c @@ -88,7 +88,7 @@ int of_read_drc_info_cell(struct property **prop, const __be32 **curval, return -EINVAL; /* Should now know end of current entry */ - (*curval) = (void *)p2; + (*curval) = (void *)(++p2); data->last_drc_index = data->drc_index_start + ((data->num_sequential_elems - 1) * data->sequential_inc); diff --git a/arch/powerpc/platforms/pseries/offline_states.h b/arch/powerpc/platforms/pseries/offline_states.h deleted file mode 100644 index 51414aee2862..000000000000 --- a/arch/powerpc/platforms/pseries/offline_states.h +++ /dev/null @@ -1,38 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _OFFLINE_STATES_H_ -#define _OFFLINE_STATES_H_ - -/* Cpu offline states go here */ -enum cpu_state_vals { - CPU_STATE_OFFLINE, - CPU_STATE_INACTIVE, - CPU_STATE_ONLINE, - CPU_MAX_OFFLINE_STATES -}; - -#ifdef CONFIG_HOTPLUG_CPU -extern enum cpu_state_vals get_cpu_current_state(int cpu); -extern void set_cpu_current_state(int cpu, enum cpu_state_vals state); -extern void set_preferred_offline_state(int cpu, enum cpu_state_vals state); -extern void set_default_offline_state(int cpu); -#else -static inline enum cpu_state_vals get_cpu_current_state(int cpu) -{ - return CPU_STATE_ONLINE; -} - -static inline void set_cpu_current_state(int cpu, enum cpu_state_vals state) -{ -} - -static inline void set_preferred_offline_state(int cpu, enum cpu_state_vals state) -{ -} - -static inline void set_default_offline_state(int cpu) -{ -} -#endif - -extern enum cpu_state_vals get_preferred_offline_state(int cpu); -#endif diff --git a/arch/powerpc/platforms/pseries/papr-sysparm.c b/arch/powerpc/platforms/pseries/papr-sysparm.c new file mode 100644 index 000000000000..7063ce8884e4 --- /dev/null +++ b/arch/powerpc/platforms/pseries/papr-sysparm.c @@ -0,0 +1,352 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#define pr_fmt(fmt) "papr-sysparm: " fmt + +#include <linux/anon_inodes.h> +#include <linux/bug.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/miscdevice.h> +#include <linux/printk.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <asm/machdep.h> +#include <asm/papr-sysparm.h> +#include <asm/rtas-work-area.h> +#include <asm/rtas.h> + +struct papr_sysparm_buf *papr_sysparm_buf_alloc(void) +{ + struct papr_sysparm_buf *buf = kzalloc(sizeof(*buf), GFP_KERNEL); + + return buf; +} + +void papr_sysparm_buf_free(struct papr_sysparm_buf *buf) +{ + kfree(buf); +} + +static size_t papr_sysparm_buf_get_length(const struct papr_sysparm_buf *buf) +{ + return be16_to_cpu(buf->len); +} + +static void papr_sysparm_buf_set_length(struct papr_sysparm_buf *buf, size_t length) +{ + WARN_ONCE(length > sizeof(buf->val), + "bogus length %zu, clamping to safe value", length); + length = min(sizeof(buf->val), length); + buf->len = cpu_to_be16(length); +} + +/* + * For use on buffers returned from ibm,get-system-parameter before + * returning them to callers. Ensures the encoded length of valid data + * cannot overrun buf->val[]. + */ +static void papr_sysparm_buf_clamp_length(struct papr_sysparm_buf *buf) +{ + papr_sysparm_buf_set_length(buf, papr_sysparm_buf_get_length(buf)); +} + +/* + * Perform some basic diligence on the system parameter buffer before + * submitting it to RTAS. + */ +static bool papr_sysparm_buf_can_submit(const struct papr_sysparm_buf *buf) +{ + /* + * Firmware ought to reject buffer lengths that exceed the + * maximum specified in PAPR, but there's no reason for the + * kernel to allow them either. + */ + if (papr_sysparm_buf_get_length(buf) > sizeof(buf->val)) + return false; + + return true; +} + +/** + * papr_sysparm_get() - Retrieve the value of a PAPR system parameter. + * @param: PAPR system parameter token as described in + * 7.3.16 "System Parameters Option". + * @buf: A &struct papr_sysparm_buf as returned from papr_sysparm_buf_alloc(). + * + * Place the result of querying the specified parameter, if available, + * in @buf. The result includes a be16 length header followed by the + * value, which may be a string or binary data. See &struct papr_sysparm_buf. + * + * Since there is at least one parameter (60, OS Service Entitlement + * Status) where the results depend on the incoming contents of the + * work area, the caller-supplied buffer is copied unmodified into the + * work area before calling ibm,get-system-parameter. + * + * A defined parameter may not be implemented on a given system, and + * some implemented parameters may not be available to all partitions + * on a system. A parameter's disposition may change at any time due + * to system configuration changes or partition migration. + * + * Context: This function may sleep. + * + * Return: 0 on success, -errno otherwise. @buf is unmodified on error. + */ +int papr_sysparm_get(papr_sysparm_t param, struct papr_sysparm_buf *buf) +{ + const s32 token = rtas_function_token(RTAS_FN_IBM_GET_SYSTEM_PARAMETER); + struct rtas_work_area *work_area; + s32 fwrc; + int ret; + + might_sleep(); + + if (WARN_ON(!buf)) + return -EFAULT; + + if (token == RTAS_UNKNOWN_SERVICE) + return -ENOENT; + + if (!papr_sysparm_buf_can_submit(buf)) + return -EINVAL; + + work_area = rtas_work_area_alloc(sizeof(*buf)); + + memcpy(rtas_work_area_raw_buf(work_area), buf, sizeof(*buf)); + + do { + fwrc = rtas_call(token, 3, 1, NULL, param.token, + rtas_work_area_phys(work_area), + rtas_work_area_size(work_area)); + } while (rtas_busy_delay(fwrc)); + + switch (fwrc) { + case 0: + ret = 0; + memcpy(buf, rtas_work_area_raw_buf(work_area), sizeof(*buf)); + papr_sysparm_buf_clamp_length(buf); + break; + case -3: /* parameter not implemented */ + ret = -EOPNOTSUPP; + break; + case -9002: /* this partition not authorized to retrieve this parameter */ + ret = -EPERM; + break; + case -9999: /* "parameter error" e.g. the buffer is too small */ + ret = -EINVAL; + break; + default: + pr_err("unexpected ibm,get-system-parameter result %d\n", fwrc); + fallthrough; + case -1: /* Hardware/platform error */ + ret = -EIO; + break; + } + + rtas_work_area_free(work_area); + + return ret; +} + +int papr_sysparm_set(papr_sysparm_t param, const struct papr_sysparm_buf *buf) +{ + const s32 token = rtas_function_token(RTAS_FN_IBM_SET_SYSTEM_PARAMETER); + struct rtas_work_area *work_area; + s32 fwrc; + int ret; + + might_sleep(); + + if (WARN_ON(!buf)) + return -EFAULT; + + if (token == RTAS_UNKNOWN_SERVICE) + return -ENOENT; + + if (!papr_sysparm_buf_can_submit(buf)) + return -EINVAL; + + work_area = rtas_work_area_alloc(sizeof(*buf)); + + memcpy(rtas_work_area_raw_buf(work_area), buf, sizeof(*buf)); + + do { + fwrc = rtas_call(token, 2, 1, NULL, param.token, + rtas_work_area_phys(work_area)); + } while (rtas_busy_delay(fwrc)); + + switch (fwrc) { + case 0: + ret = 0; + break; + case -3: /* parameter not supported */ + ret = -EOPNOTSUPP; + break; + case -9002: /* this partition not authorized to modify this parameter */ + ret = -EPERM; + break; + case -9999: /* "parameter error" e.g. invalid input data */ + ret = -EINVAL; + break; + default: + pr_err("unexpected ibm,set-system-parameter result %d\n", fwrc); + fallthrough; + case -1: /* Hardware/platform error */ + ret = -EIO; + break; + } + + rtas_work_area_free(work_area); + + return ret; +} + +static struct papr_sysparm_buf * +papr_sysparm_buf_from_user(const struct papr_sysparm_io_block __user *user_iob) +{ + struct papr_sysparm_buf *kern_spbuf; + long err; + u16 len; + + /* + * The length of valid data that userspace claims to be in + * user_iob->data[]. + */ + if (get_user(len, &user_iob->length)) + return ERR_PTR(-EFAULT); + + static_assert(sizeof(user_iob->data) >= PAPR_SYSPARM_MAX_INPUT); + static_assert(sizeof(kern_spbuf->val) >= PAPR_SYSPARM_MAX_INPUT); + + if (len > PAPR_SYSPARM_MAX_INPUT) + return ERR_PTR(-EINVAL); + + kern_spbuf = papr_sysparm_buf_alloc(); + if (!kern_spbuf) + return ERR_PTR(-ENOMEM); + + papr_sysparm_buf_set_length(kern_spbuf, len); + + if (len > 0 && copy_from_user(kern_spbuf->val, user_iob->data, len)) { + err = -EFAULT; + goto free_sysparm_buf; + } + + return kern_spbuf; + +free_sysparm_buf: + papr_sysparm_buf_free(kern_spbuf); + return ERR_PTR(err); +} + +static int papr_sysparm_buf_to_user(const struct papr_sysparm_buf *kern_spbuf, + struct papr_sysparm_io_block __user *user_iob) +{ + u16 len_out = papr_sysparm_buf_get_length(kern_spbuf); + + if (put_user(len_out, &user_iob->length)) + return -EFAULT; + + static_assert(sizeof(user_iob->data) >= PAPR_SYSPARM_MAX_OUTPUT); + static_assert(sizeof(kern_spbuf->val) >= PAPR_SYSPARM_MAX_OUTPUT); + + if (copy_to_user(user_iob->data, kern_spbuf->val, PAPR_SYSPARM_MAX_OUTPUT)) + return -EFAULT; + + return 0; +} + +static long papr_sysparm_ioctl_get(struct papr_sysparm_io_block __user *user_iob) +{ + struct papr_sysparm_buf *kern_spbuf; + papr_sysparm_t param; + long ret; + + if (get_user(param.token, &user_iob->parameter)) + return -EFAULT; + + kern_spbuf = papr_sysparm_buf_from_user(user_iob); + if (IS_ERR(kern_spbuf)) + return PTR_ERR(kern_spbuf); + + ret = papr_sysparm_get(param, kern_spbuf); + if (ret) + goto free_sysparm_buf; + + ret = papr_sysparm_buf_to_user(kern_spbuf, user_iob); + if (ret) + goto free_sysparm_buf; + + ret = 0; + +free_sysparm_buf: + papr_sysparm_buf_free(kern_spbuf); + return ret; +} + + +static long papr_sysparm_ioctl_set(struct papr_sysparm_io_block __user *user_iob) +{ + struct papr_sysparm_buf *kern_spbuf; + papr_sysparm_t param; + long ret; + + if (get_user(param.token, &user_iob->parameter)) + return -EFAULT; + + kern_spbuf = papr_sysparm_buf_from_user(user_iob); + if (IS_ERR(kern_spbuf)) + return PTR_ERR(kern_spbuf); + + ret = papr_sysparm_set(param, kern_spbuf); + if (ret) + goto free_sysparm_buf; + + ret = 0; + +free_sysparm_buf: + papr_sysparm_buf_free(kern_spbuf); + return ret; +} + +static long papr_sysparm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) +{ + void __user *argp = (__force void __user *)arg; + long ret; + + switch (ioctl) { + case PAPR_SYSPARM_IOC_GET: + ret = papr_sysparm_ioctl_get(argp); + break; + case PAPR_SYSPARM_IOC_SET: + if (filp->f_mode & FMODE_WRITE) + ret = papr_sysparm_ioctl_set(argp); + else + ret = -EBADF; + break; + default: + ret = -ENOIOCTLCMD; + break; + } + return ret; +} + +static const struct file_operations papr_sysparm_ops = { + .unlocked_ioctl = papr_sysparm_ioctl, +}; + +static struct miscdevice papr_sysparm_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "papr-sysparm", + .fops = &papr_sysparm_ops, +}; + +static __init int papr_sysparm_init(void) +{ + if (!rtas_function_implemented(RTAS_FN_IBM_GET_SYSTEM_PARAMETER)) + return -ENODEV; + + return misc_register(&papr_sysparm_dev); +} +machine_device_initcall(pseries, papr_sysparm_init); diff --git a/arch/powerpc/platforms/pseries/papr-vpd.c b/arch/powerpc/platforms/pseries/papr-vpd.c new file mode 100644 index 000000000000..c29e85db5f35 --- /dev/null +++ b/arch/powerpc/platforms/pseries/papr-vpd.c @@ -0,0 +1,541 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#define pr_fmt(fmt) "papr-vpd: " fmt + +#include <linux/anon_inodes.h> +#include <linux/build_bug.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/lockdep.h> +#include <linux/kernel.h> +#include <linux/miscdevice.h> +#include <linux/signal.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/string_helpers.h> +#include <linux/uaccess.h> +#include <asm/machdep.h> +#include <asm/papr-vpd.h> +#include <asm/rtas-work-area.h> +#include <asm/rtas.h> +#include <uapi/asm/papr-vpd.h> + +/* + * Function-specific return values for ibm,get-vpd, derived from PAPR+ + * v2.13 7.3.20 "ibm,get-vpd RTAS Call". + */ +#define RTAS_IBM_GET_VPD_COMPLETE 0 /* All VPD has been retrieved. */ +#define RTAS_IBM_GET_VPD_MORE_DATA 1 /* More VPD is available. */ +#define RTAS_IBM_GET_VPD_START_OVER -4 /* VPD changed, restart call sequence. */ + +/** + * struct rtas_ibm_get_vpd_params - Parameters (in and out) for ibm,get-vpd. + * @loc_code: In: Caller-provided location code buffer. Must be RTAS-addressable. + * @work_area: In: Caller-provided work area buffer for results. + * @sequence: In: Sequence number. Out: Next sequence number. + * @written: Out: Bytes written by ibm,get-vpd to @work_area. + * @status: Out: RTAS call status. + */ +struct rtas_ibm_get_vpd_params { + const struct papr_location_code *loc_code; + struct rtas_work_area *work_area; + u32 sequence; + u32 written; + s32 status; +}; + +/** + * rtas_ibm_get_vpd() - Call ibm,get-vpd to fill a work area buffer. + * @params: See &struct rtas_ibm_get_vpd_params. + * + * Calls ibm,get-vpd until it errors or successfully deposits data + * into the supplied work area. Handles RTAS retry statuses. Maps RTAS + * error statuses to reasonable errno values. + * + * The caller is expected to invoke rtas_ibm_get_vpd() multiple times + * to retrieve all the VPD for the provided location code. Only one + * sequence should be in progress at any time; starting a new sequence + * will disrupt any sequence already in progress. Serialization of VPD + * retrieval sequences is the responsibility of the caller. + * + * The caller should inspect @params.status to determine whether more + * calls are needed to complete the sequence. + * + * Context: May sleep. + * Return: -ve on error, 0 otherwise. + */ +static int rtas_ibm_get_vpd(struct rtas_ibm_get_vpd_params *params) +{ + const struct papr_location_code *loc_code = params->loc_code; + struct rtas_work_area *work_area = params->work_area; + u32 rets[2]; + s32 fwrc; + int ret; + + lockdep_assert_held(&rtas_ibm_get_vpd_lock); + + do { + fwrc = rtas_call(rtas_function_token(RTAS_FN_IBM_GET_VPD), 4, 3, + rets, + __pa(loc_code), + rtas_work_area_phys(work_area), + rtas_work_area_size(work_area), + params->sequence); + } while (rtas_busy_delay(fwrc)); + + switch (fwrc) { + case RTAS_HARDWARE_ERROR: + ret = -EIO; + break; + case RTAS_INVALID_PARAMETER: + ret = -EINVAL; + break; + case RTAS_IBM_GET_VPD_START_OVER: + ret = -EAGAIN; + break; + case RTAS_IBM_GET_VPD_MORE_DATA: + params->sequence = rets[0]; + fallthrough; + case RTAS_IBM_GET_VPD_COMPLETE: + params->written = rets[1]; + /* + * Kernel or firmware bug, do not continue. + */ + if (WARN(params->written > rtas_work_area_size(work_area), + "possible write beyond end of work area")) + ret = -EFAULT; + else + ret = 0; + break; + default: + ret = -EIO; + pr_err_ratelimited("unexpected ibm,get-vpd status %d\n", fwrc); + break; + } + + params->status = fwrc; + return ret; +} + +/* + * Internal VPD "blob" APIs for accumulating ibm,get-vpd results into + * an immutable buffer to be attached to a file descriptor. + */ +struct vpd_blob { + const char *data; + size_t len; +}; + +static bool vpd_blob_has_data(const struct vpd_blob *blob) +{ + return blob->data && blob->len; +} + +static void vpd_blob_free(const struct vpd_blob *blob) +{ + if (blob) { + kvfree(blob->data); + kfree(blob); + } +} + +/** + * vpd_blob_extend() - Append data to a &struct vpd_blob. + * @blob: The blob to extend. + * @data: The new data to append to @blob. + * @len: The length of @data. + * + * Context: May sleep. + * Return: -ENOMEM on allocation failure, 0 otherwise. + */ +static int vpd_blob_extend(struct vpd_blob *blob, const char *data, size_t len) +{ + const size_t new_len = blob->len + len; + const size_t old_len = blob->len; + const char *old_ptr = blob->data; + char *new_ptr; + + new_ptr = old_ptr ? + kvrealloc(old_ptr, old_len, new_len, GFP_KERNEL_ACCOUNT) : + kvmalloc(len, GFP_KERNEL_ACCOUNT); + + if (!new_ptr) + return -ENOMEM; + + memcpy(&new_ptr[old_len], data, len); + blob->data = new_ptr; + blob->len = new_len; + return 0; +} + +/** + * vpd_blob_generate() - Construct a new &struct vpd_blob. + * @generator: Function that supplies the blob data. + * @arg: Context pointer supplied by caller, passed to @generator. + * + * The @generator callback is invoked until it returns NULL. @arg is + * passed to @generator in its first argument on each call. When + * @generator returns data, it should store the data length in its + * second argument. + * + * Context: May sleep. + * Return: A completely populated &struct vpd_blob, or NULL on error. + */ +static const struct vpd_blob * +vpd_blob_generate(const char * (*generator)(void *, size_t *), void *arg) +{ + struct vpd_blob *blob; + const char *buf; + size_t len; + int err = 0; + + blob = kzalloc(sizeof(*blob), GFP_KERNEL_ACCOUNT); + if (!blob) + return NULL; + + while (err == 0 && (buf = generator(arg, &len))) + err = vpd_blob_extend(blob, buf, len); + + if (err != 0 || !vpd_blob_has_data(blob)) + goto free_blob; + + return blob; +free_blob: + vpd_blob_free(blob); + return NULL; +} + +/* + * Internal VPD sequence APIs. A VPD sequence is a series of calls to + * ibm,get-vpd for a given location code. The sequence ends when an + * error is encountered or all VPD for the location code has been + * returned. + */ + +/** + * struct vpd_sequence - State for managing a VPD sequence. + * @error: Shall be zero as long as the sequence has not encountered an error, + * -ve errno otherwise. Use vpd_sequence_set_err() to update this. + * @params: Parameter block to pass to rtas_ibm_get_vpd(). + */ +struct vpd_sequence { + int error; + struct rtas_ibm_get_vpd_params params; +}; + +/** + * vpd_sequence_begin() - Begin a VPD retrieval sequence. + * @seq: Uninitialized sequence state. + * @loc_code: Location code that defines the scope of the VPD to return. + * + * Initializes @seq with the resources necessary to carry out a VPD + * sequence. Callers must pass @seq to vpd_sequence_end() regardless + * of whether the sequence succeeds. + * + * Context: May sleep. + */ +static void vpd_sequence_begin(struct vpd_sequence *seq, + const struct papr_location_code *loc_code) +{ + /* + * Use a static data structure for the location code passed to + * RTAS to ensure it's in the RMA and avoid a separate work + * area allocation. Guarded by the function lock. + */ + static struct papr_location_code static_loc_code; + + /* + * We could allocate the work area before acquiring the + * function lock, but that would allow concurrent requests to + * exhaust the limited work area pool for no benefit. So + * allocate the work area under the lock. + */ + mutex_lock(&rtas_ibm_get_vpd_lock); + static_loc_code = *loc_code; + *seq = (struct vpd_sequence) { + .params = { + .work_area = rtas_work_area_alloc(SZ_4K), + .loc_code = &static_loc_code, + .sequence = 1, + }, + }; +} + +/** + * vpd_sequence_end() - Finalize a VPD retrieval sequence. + * @seq: Sequence state. + * + * Releases resources obtained by vpd_sequence_begin(). + */ +static void vpd_sequence_end(struct vpd_sequence *seq) +{ + rtas_work_area_free(seq->params.work_area); + mutex_unlock(&rtas_ibm_get_vpd_lock); +} + +/** + * vpd_sequence_should_stop() - Determine whether a VPD retrieval sequence + * should continue. + * @seq: VPD sequence state. + * + * Examines the sequence error state and outputs of the last call to + * ibm,get-vpd to determine whether the sequence in progress should + * continue or stop. + * + * Return: True if the sequence has encountered an error or if all VPD for + * this sequence has been retrieved. False otherwise. + */ +static bool vpd_sequence_should_stop(const struct vpd_sequence *seq) +{ + bool done; + + if (seq->error) + return true; + + switch (seq->params.status) { + case 0: + if (seq->params.written == 0) + done = false; /* Initial state. */ + else + done = true; /* All data consumed. */ + break; + case 1: + done = false; /* More data available. */ + break; + default: + done = true; /* Error encountered. */ + break; + } + + return done; +} + +static int vpd_sequence_set_err(struct vpd_sequence *seq, int err) +{ + /* Preserve the first error recorded. */ + if (seq->error == 0) + seq->error = err; + + return seq->error; +} + +/* + * Generator function to be passed to vpd_blob_generate(). + */ +static const char *vpd_sequence_fill_work_area(void *arg, size_t *len) +{ + struct vpd_sequence *seq = arg; + struct rtas_ibm_get_vpd_params *p = &seq->params; + + if (vpd_sequence_should_stop(seq)) + return NULL; + if (vpd_sequence_set_err(seq, rtas_ibm_get_vpd(p))) + return NULL; + *len = p->written; + return rtas_work_area_raw_buf(p->work_area); +} + +/* + * Higher-level VPD retrieval code below. These functions use the + * vpd_blob_* and vpd_sequence_* APIs defined above to create fd-based + * VPD handles for consumption by user space. + */ + +/** + * papr_vpd_run_sequence() - Run a single VPD retrieval sequence. + * @loc_code: Location code that defines the scope of VPD to return. + * + * Context: May sleep. Holds a mutex and an RTAS work area for its + * duration. Typically performs multiple sleepable slab + * allocations. + * + * Return: A populated &struct vpd_blob on success. Encoded error + * pointer otherwise. + */ +static const struct vpd_blob *papr_vpd_run_sequence(const struct papr_location_code *loc_code) +{ + const struct vpd_blob *blob; + struct vpd_sequence seq; + + vpd_sequence_begin(&seq, loc_code); + blob = vpd_blob_generate(vpd_sequence_fill_work_area, &seq); + if (!blob) + vpd_sequence_set_err(&seq, -ENOMEM); + vpd_sequence_end(&seq); + + if (seq.error) { + vpd_blob_free(blob); + return ERR_PTR(seq.error); + } + + return blob; +} + +/** + * papr_vpd_retrieve() - Return the VPD for a location code. + * @loc_code: Location code that defines the scope of VPD to return. + * + * Run VPD sequences against @loc_code until a blob is successfully + * instantiated, or a hard error is encountered, or a fatal signal is + * pending. + * + * Context: May sleep. + * Return: A fully populated VPD blob when successful. Encoded error + * pointer otherwise. + */ +static const struct vpd_blob *papr_vpd_retrieve(const struct papr_location_code *loc_code) +{ + const struct vpd_blob *blob; + + /* + * EAGAIN means the sequence errored with a -4 (VPD changed) + * status from ibm,get-vpd, and we should attempt a new + * sequence. PAPR+ v2.13 R1–7.3.20–5 indicates that this + * should be a transient condition, not something that happens + * continuously. But we'll stop trying on a fatal signal. + */ + do { + blob = papr_vpd_run_sequence(loc_code); + if (!IS_ERR(blob)) /* Success. */ + break; + if (PTR_ERR(blob) != -EAGAIN) /* Hard error. */ + break; + pr_info_ratelimited("VPD changed during retrieval, retrying\n"); + cond_resched(); + } while (!fatal_signal_pending(current)); + + return blob; +} + +static ssize_t papr_vpd_handle_read(struct file *file, char __user *buf, size_t size, loff_t *off) +{ + const struct vpd_blob *blob = file->private_data; + + /* bug: we should not instantiate a handle without any data attached. */ + if (!vpd_blob_has_data(blob)) { + pr_err_once("handle without data\n"); + return -EIO; + } + + return simple_read_from_buffer(buf, size, off, blob->data, blob->len); +} + +static int papr_vpd_handle_release(struct inode *inode, struct file *file) +{ + const struct vpd_blob *blob = file->private_data; + + vpd_blob_free(blob); + + return 0; +} + +static loff_t papr_vpd_handle_seek(struct file *file, loff_t off, int whence) +{ + const struct vpd_blob *blob = file->private_data; + + return fixed_size_llseek(file, off, whence, blob->len); +} + + +static const struct file_operations papr_vpd_handle_ops = { + .read = papr_vpd_handle_read, + .llseek = papr_vpd_handle_seek, + .release = papr_vpd_handle_release, +}; + +/** + * papr_vpd_create_handle() - Create a fd-based handle for reading VPD. + * @ulc: Location code in user memory; defines the scope of the VPD to + * retrieve. + * + * Handler for PAPR_VPD_IOC_CREATE_HANDLE ioctl command. Validates + * @ulc and instantiates an immutable VPD "blob" for it. The blob is + * attached to a file descriptor for reading by user space. The memory + * backing the blob is freed when the file is released. + * + * The entire requested VPD is retrieved by this call and all + * necessary RTAS interactions are performed before returning the fd + * to user space. This keeps the read handler simple and ensures that + * the kernel can prevent interleaving of ibm,get-vpd call sequences. + * + * Return: The installed fd number if successful, -ve errno otherwise. + */ +static long papr_vpd_create_handle(struct papr_location_code __user *ulc) +{ + struct papr_location_code klc; + const struct vpd_blob *blob; + struct file *file; + long err; + int fd; + + if (copy_from_user(&klc, ulc, sizeof(klc))) + return -EFAULT; + + if (!string_is_terminated(klc.str, ARRAY_SIZE(klc.str))) + return -EINVAL; + + blob = papr_vpd_retrieve(&klc); + if (IS_ERR(blob)) + return PTR_ERR(blob); + + fd = get_unused_fd_flags(O_RDONLY | O_CLOEXEC); + if (fd < 0) { + err = fd; + goto free_blob; + } + + file = anon_inode_getfile("[papr-vpd]", &papr_vpd_handle_ops, + (void *)blob, O_RDONLY); + if (IS_ERR(file)) { + err = PTR_ERR(file); + goto put_fd; + } + + file->f_mode |= FMODE_LSEEK | FMODE_PREAD; + fd_install(fd, file); + return fd; +put_fd: + put_unused_fd(fd); +free_blob: + vpd_blob_free(blob); + return err; +} + +/* + * Top-level ioctl handler for /dev/papr-vpd. + */ +static long papr_vpd_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) +{ + void __user *argp = (__force void __user *)arg; + long ret; + + switch (ioctl) { + case PAPR_VPD_IOC_CREATE_HANDLE: + ret = papr_vpd_create_handle(argp); + break; + default: + ret = -ENOIOCTLCMD; + break; + } + return ret; +} + +static const struct file_operations papr_vpd_ops = { + .unlocked_ioctl = papr_vpd_dev_ioctl, +}; + +static struct miscdevice papr_vpd_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "papr-vpd", + .fops = &papr_vpd_ops, +}; + +static __init int papr_vpd_init(void) +{ + if (!rtas_function_implemented(RTAS_FN_IBM_GET_VPD)) + return -ENODEV; + + return misc_register(&papr_vpd_dev); +} +machine_device_initcall(pseries, papr_vpd_init); diff --git a/arch/powerpc/platforms/pseries/papr_platform_attributes.c b/arch/powerpc/platforms/pseries/papr_platform_attributes.c new file mode 100644 index 000000000000..eea2041b270b --- /dev/null +++ b/arch/powerpc/platforms/pseries/papr_platform_attributes.c @@ -0,0 +1,364 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Platform energy and frequency attributes driver + * + * This driver creates a sys file at /sys/firmware/papr/ which encapsulates a + * directory structure containing files in keyword - value pairs that specify + * energy and frequency configuration of the system. + * + * The format of exposing the sysfs information is as follows: + * /sys/firmware/papr/energy_scale_info/ + * |-- <id>/ + * |-- desc + * |-- value + * |-- value_desc (if exists) + * |-- <id>/ + * |-- desc + * |-- value + * |-- value_desc (if exists) + * + * Copyright 2022 IBM Corp. + */ + +#include <asm/hvcall.h> +#include <asm/machdep.h> +#include <asm/firmware.h> + +#include "pseries.h" + +/* + * Flag attributes to fetch either all or one attribute from the HCALL + * flag = BE(0) => fetch all attributes with firstAttributeId = 0 + * flag = BE(1) => fetch a single attribute with firstAttributeId = id + */ +#define ESI_FLAGS_ALL 0 +#define ESI_FLAGS_SINGLE (1ull << 63) + +#define KOBJ_MAX_ATTRS 3 + +#define ESI_HDR_SIZE sizeof(struct h_energy_scale_info_hdr) +#define ESI_ATTR_SIZE sizeof(struct energy_scale_attribute) +#define CURR_MAX_ESI_ATTRS 8 + +struct energy_scale_attribute { + __be64 id; + __be64 val; + u8 desc[64]; + u8 value_desc[64]; +} __packed; + +struct h_energy_scale_info_hdr { + __be64 num_attrs; + __be64 array_offset; + u8 data_header_version; +} __packed; + +struct papr_attr { + u64 id; + struct kobj_attribute kobj_attr; +}; + +struct papr_group { + struct attribute_group pg; + struct papr_attr pgattrs[KOBJ_MAX_ATTRS]; +}; + +static struct papr_group *papr_groups; +/* /sys/firmware/papr */ +static struct kobject *papr_kobj; +/* /sys/firmware/papr/energy_scale_info */ +static struct kobject *esi_kobj; + +/* + * Energy modes can change dynamically hence making a new hcall each time the + * information needs to be retrieved + */ +static int papr_get_attr(u64 id, struct energy_scale_attribute *esi) +{ + int esi_buf_size = ESI_HDR_SIZE + (CURR_MAX_ESI_ATTRS * ESI_ATTR_SIZE); + int ret, max_esi_attrs = CURR_MAX_ESI_ATTRS; + struct energy_scale_attribute *curr_esi; + struct h_energy_scale_info_hdr *hdr; + char *buf; + + buf = kmalloc(esi_buf_size, GFP_KERNEL); + if (buf == NULL) + return -ENOMEM; + +retry: + ret = plpar_hcall_norets(H_GET_ENERGY_SCALE_INFO, ESI_FLAGS_SINGLE, + id, virt_to_phys(buf), + esi_buf_size); + + /* + * If the hcall fails with not enough memory for either the + * header or data, attempt to allocate more + */ + if (ret == H_PARTIAL || ret == H_P4) { + char *temp_buf; + + max_esi_attrs += 4; + esi_buf_size = ESI_HDR_SIZE + (CURR_MAX_ESI_ATTRS * max_esi_attrs); + + temp_buf = krealloc(buf, esi_buf_size, GFP_KERNEL); + if (temp_buf) { + buf = temp_buf; + } else { + ret = -ENOMEM; + goto out_buf; + } + + goto retry; + } + + if (ret != H_SUCCESS) { + pr_warn("hcall failed: H_GET_ENERGY_SCALE_INFO"); + ret = -EIO; + goto out_buf; + } + + hdr = (struct h_energy_scale_info_hdr *) buf; + curr_esi = (struct energy_scale_attribute *) + (buf + be64_to_cpu(hdr->array_offset)); + + if (esi_buf_size < + be64_to_cpu(hdr->array_offset) + (be64_to_cpu(hdr->num_attrs) + * sizeof(struct energy_scale_attribute))) { + ret = -EIO; + goto out_buf; + } + + *esi = *curr_esi; + +out_buf: + kfree(buf); + + return ret; +} + +/* + * Extract and export the description of the energy scale attributes + */ +static ssize_t desc_show(struct kobject *kobj, + struct kobj_attribute *kobj_attr, + char *buf) +{ + struct papr_attr *pattr = container_of(kobj_attr, struct papr_attr, + kobj_attr); + struct energy_scale_attribute esi; + int ret; + + ret = papr_get_attr(pattr->id, &esi); + if (ret) + return ret; + + return sysfs_emit(buf, "%s\n", esi.desc); +} + +/* + * Extract and export the numeric value of the energy scale attributes + */ +static ssize_t val_show(struct kobject *kobj, + struct kobj_attribute *kobj_attr, + char *buf) +{ + struct papr_attr *pattr = container_of(kobj_attr, struct papr_attr, + kobj_attr); + struct energy_scale_attribute esi; + int ret; + + ret = papr_get_attr(pattr->id, &esi); + if (ret) + return ret; + + return sysfs_emit(buf, "%llu\n", be64_to_cpu(esi.val)); +} + +/* + * Extract and export the value description in string format of the energy + * scale attributes + */ +static ssize_t val_desc_show(struct kobject *kobj, + struct kobj_attribute *kobj_attr, + char *buf) +{ + struct papr_attr *pattr = container_of(kobj_attr, struct papr_attr, + kobj_attr); + struct energy_scale_attribute esi; + int ret; + + ret = papr_get_attr(pattr->id, &esi); + if (ret) + return ret; + + return sysfs_emit(buf, "%s\n", esi.value_desc); +} + +static struct papr_ops_info { + const char *attr_name; + ssize_t (*show)(struct kobject *kobj, struct kobj_attribute *kobj_attr, + char *buf); +} ops_info[KOBJ_MAX_ATTRS] = { + { "desc", desc_show }, + { "value", val_show }, + { "value_desc", val_desc_show }, +}; + +static void add_attr(u64 id, int index, struct papr_attr *attr) +{ + attr->id = id; + sysfs_attr_init(&attr->kobj_attr.attr); + attr->kobj_attr.attr.name = ops_info[index].attr_name; + attr->kobj_attr.attr.mode = 0444; + attr->kobj_attr.show = ops_info[index].show; +} + +static int add_attr_group(u64 id, struct papr_group *pg, bool show_val_desc) +{ + int i; + + for (i = 0; i < KOBJ_MAX_ATTRS; i++) { + if (!strcmp(ops_info[i].attr_name, "value_desc") && + !show_val_desc) { + continue; + } + add_attr(id, i, &pg->pgattrs[i]); + pg->pg.attrs[i] = &pg->pgattrs[i].kobj_attr.attr; + } + + return sysfs_create_group(esi_kobj, &pg->pg); +} + + +static int __init papr_init(void) +{ + int esi_buf_size = ESI_HDR_SIZE + (CURR_MAX_ESI_ATTRS * ESI_ATTR_SIZE); + int ret, idx, i, max_esi_attrs = CURR_MAX_ESI_ATTRS; + struct h_energy_scale_info_hdr *esi_hdr; + struct energy_scale_attribute *esi_attrs; + uint64_t num_attrs; + char *esi_buf; + + if (!firmware_has_feature(FW_FEATURE_LPAR) || + !firmware_has_feature(FW_FEATURE_ENERGY_SCALE_INFO)) { + return -ENXIO; + } + + esi_buf = kmalloc(esi_buf_size, GFP_KERNEL); + if (esi_buf == NULL) + return -ENOMEM; + /* + * hcall( + * uint64 H_GET_ENERGY_SCALE_INFO, // Get energy scale info + * uint64 flags, // Per the flag request + * uint64 firstAttributeId, // The attribute id + * uint64 bufferAddress, // Guest physical address of the output buffer + * uint64 bufferSize); // The size in bytes of the output buffer + */ +retry: + + ret = plpar_hcall_norets(H_GET_ENERGY_SCALE_INFO, ESI_FLAGS_ALL, 0, + virt_to_phys(esi_buf), esi_buf_size); + + /* + * If the hcall fails with not enough memory for either the + * header or data, attempt to allocate more + */ + if (ret == H_PARTIAL || ret == H_P4) { + char *temp_esi_buf; + + max_esi_attrs += 4; + esi_buf_size = ESI_HDR_SIZE + (CURR_MAX_ESI_ATTRS * max_esi_attrs); + + temp_esi_buf = krealloc(esi_buf, esi_buf_size, GFP_KERNEL); + if (temp_esi_buf) + esi_buf = temp_esi_buf; + else + return -ENOMEM; + + goto retry; + } + + if (ret != H_SUCCESS) { + pr_warn("hcall failed: H_GET_ENERGY_SCALE_INFO, ret: %d\n", ret); + goto out_free_esi_buf; + } + + esi_hdr = (struct h_energy_scale_info_hdr *) esi_buf; + num_attrs = be64_to_cpu(esi_hdr->num_attrs); + esi_attrs = (struct energy_scale_attribute *) + (esi_buf + be64_to_cpu(esi_hdr->array_offset)); + + if (esi_buf_size < + be64_to_cpu(esi_hdr->array_offset) + + (num_attrs * sizeof(struct energy_scale_attribute))) { + goto out_free_esi_buf; + } + + papr_groups = kcalloc(num_attrs, sizeof(*papr_groups), GFP_KERNEL); + if (!papr_groups) + goto out_free_esi_buf; + + papr_kobj = kobject_create_and_add("papr", firmware_kobj); + if (!papr_kobj) { + pr_warn("kobject_create_and_add papr failed\n"); + goto out_papr_groups; + } + + esi_kobj = kobject_create_and_add("energy_scale_info", papr_kobj); + if (!esi_kobj) { + pr_warn("kobject_create_and_add energy_scale_info failed\n"); + goto out_kobj; + } + + /* Allocate the groups before registering */ + for (idx = 0; idx < num_attrs; idx++) { + papr_groups[idx].pg.attrs = kcalloc(KOBJ_MAX_ATTRS + 1, + sizeof(*papr_groups[idx].pg.attrs), + GFP_KERNEL); + if (!papr_groups[idx].pg.attrs) + goto out_pgattrs; + + papr_groups[idx].pg.name = kasprintf(GFP_KERNEL, "%lld", + be64_to_cpu(esi_attrs[idx].id)); + if (papr_groups[idx].pg.name == NULL) + goto out_pgattrs; + } + + for (idx = 0; idx < num_attrs; idx++) { + bool show_val_desc = true; + + /* Do not add the value desc attr if it does not exist */ + if (strnlen(esi_attrs[idx].value_desc, + sizeof(esi_attrs[idx].value_desc)) == 0) + show_val_desc = false; + + if (add_attr_group(be64_to_cpu(esi_attrs[idx].id), + &papr_groups[idx], + show_val_desc)) { + pr_warn("Failed to create papr attribute group %s\n", + papr_groups[idx].pg.name); + idx = num_attrs; + goto out_pgattrs; + } + } + + kfree(esi_buf); + return 0; +out_pgattrs: + for (i = 0; i < idx ; i++) { + kfree(papr_groups[i].pg.attrs); + kfree(papr_groups[i].pg.name); + } + kobject_put(esi_kobj); +out_kobj: + kobject_put(papr_kobj); +out_papr_groups: + kfree(papr_groups); +out_free_esi_buf: + kfree(esi_buf); + + return -ENOMEM; +} + +machine_device_initcall(pseries, papr_init); diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c index c2ef320ba1bf..c233f9db039b 100644 --- a/arch/powerpc/platforms/pseries/papr_scm.c +++ b/arch/powerpc/platforms/pseries/papr_scm.c @@ -12,16 +12,81 @@ #include <linux/libnvdimm.h> #include <linux/platform_device.h> #include <linux/delay.h> +#include <linux/seq_buf.h> +#include <linux/nd.h> #include <asm/plpar_wrappers.h> +#include <asm/papr_pdsm.h> +#include <asm/mce.h> +#include <asm/unaligned.h> +#include <linux/perf_event.h> #define BIND_ANY_ADDR (~0ul) #define PAPR_SCM_DIMM_CMD_MASK \ ((1ul << ND_CMD_GET_CONFIG_SIZE) | \ (1ul << ND_CMD_GET_CONFIG_DATA) | \ - (1ul << ND_CMD_SET_CONFIG_DATA)) - + (1ul << ND_CMD_SET_CONFIG_DATA) | \ + (1ul << ND_CMD_CALL)) + +/* DIMM health bitmap indicators */ +/* SCM device is unable to persist memory contents */ +#define PAPR_PMEM_UNARMED (1ULL << (63 - 0)) +/* SCM device failed to persist memory contents */ +#define PAPR_PMEM_SHUTDOWN_DIRTY (1ULL << (63 - 1)) +/* SCM device contents are persisted from previous IPL */ +#define PAPR_PMEM_SHUTDOWN_CLEAN (1ULL << (63 - 2)) +/* SCM device contents are not persisted from previous IPL */ +#define PAPR_PMEM_EMPTY (1ULL << (63 - 3)) +/* SCM device memory life remaining is critically low */ +#define PAPR_PMEM_HEALTH_CRITICAL (1ULL << (63 - 4)) +/* SCM device will be garded off next IPL due to failure */ +#define PAPR_PMEM_HEALTH_FATAL (1ULL << (63 - 5)) +/* SCM contents cannot persist due to current platform health status */ +#define PAPR_PMEM_HEALTH_UNHEALTHY (1ULL << (63 - 6)) +/* SCM device is unable to persist memory contents in certain conditions */ +#define PAPR_PMEM_HEALTH_NON_CRITICAL (1ULL << (63 - 7)) +/* SCM device is encrypted */ +#define PAPR_PMEM_ENCRYPTED (1ULL << (63 - 8)) +/* SCM device has been scrubbed and locked */ +#define PAPR_PMEM_SCRUBBED_AND_LOCKED (1ULL << (63 - 9)) + +/* Bits status indicators for health bitmap indicating unarmed dimm */ +#define PAPR_PMEM_UNARMED_MASK (PAPR_PMEM_UNARMED | \ + PAPR_PMEM_HEALTH_UNHEALTHY) + +/* Bits status indicators for health bitmap indicating unflushed dimm */ +#define PAPR_PMEM_BAD_SHUTDOWN_MASK (PAPR_PMEM_SHUTDOWN_DIRTY) + +/* Bits status indicators for health bitmap indicating unrestored dimm */ +#define PAPR_PMEM_BAD_RESTORE_MASK (PAPR_PMEM_EMPTY) + +/* Bit status indicators for smart event notification */ +#define PAPR_PMEM_SMART_EVENT_MASK (PAPR_PMEM_HEALTH_CRITICAL | \ + PAPR_PMEM_HEALTH_FATAL | \ + PAPR_PMEM_HEALTH_UNHEALTHY) + +#define PAPR_SCM_PERF_STATS_EYECATCHER __stringify(SCMSTATS) +#define PAPR_SCM_PERF_STATS_VERSION 0x1 + +/* Struct holding a single performance metric */ +struct papr_scm_perf_stat { + u8 stat_id[8]; + __be64 stat_val; +} __packed; + +/* Struct exchanged between kernel and PHYP for fetching drc perf stats */ +struct papr_scm_perf_stats { + u8 eye_catcher[8]; + /* Should be PAPR_SCM_PERF_STATS_VERSION */ + __be32 stats_version; + /* Number of stats following */ + __be32 num_statistics; + /* zero or more performance matrics */ + struct papr_scm_perf_stat scm_statistic[]; +} __packed; + +/* private struct associated with each region */ struct papr_scm_priv { struct platform_device *pdev; struct device_node *dn; @@ -30,6 +95,7 @@ struct papr_scm_priv { uint64_t block_size; int metadata_size; bool is_volatile; + bool hcall_flush_required; uint64_t bound_addr; @@ -39,8 +105,62 @@ struct papr_scm_priv { struct resource res; struct nd_region *region; struct nd_interleave_set nd_set; + struct list_head region_list; + + /* Protect dimm health data from concurrent read/writes */ + struct mutex health_mutex; + + /* Last time the health information of the dimm was updated */ + unsigned long lasthealth_jiffies; + + /* Health information for the dimm */ + u64 health_bitmap; + + /* Holds the last known dirty shutdown counter value */ + u64 dirty_shutdown_counter; + + /* length of the stat buffer as expected by phyp */ + size_t stat_buffer_len; + + /* The bits which needs to be overridden */ + u64 health_bitmap_inject_mask; }; +static int papr_scm_pmem_flush(struct nd_region *nd_region, + struct bio *bio __maybe_unused) +{ + struct papr_scm_priv *p = nd_region_provider_data(nd_region); + unsigned long ret_buf[PLPAR_HCALL_BUFSIZE], token = 0; + long rc; + + dev_dbg(&p->pdev->dev, "flush drc 0x%x", p->drc_index); + + do { + rc = plpar_hcall(H_SCM_FLUSH, ret_buf, p->drc_index, token); + token = ret_buf[0]; + + /* Check if we are stalled for some time */ + if (H_IS_LONG_BUSY(rc)) { + msleep(get_longbusy_msecs(rc)); + rc = H_BUSY; + } else if (rc == H_BUSY) { + cond_resched(); + } + } while (rc == H_BUSY); + + if (rc) { + dev_err(&p->pdev->dev, "flush error: %ld", rc); + rc = -EIO; + } else { + dev_dbg(&p->pdev->dev, "flush drc 0x%x complete", p->drc_index); + } + + return rc; +} + +static LIST_HEAD(papr_nd_regions); +static DEFINE_MUTEX(papr_ndr_lock); + static int drc_pmem_bind(struct papr_scm_priv *p) { unsigned long ret[PLPAR_HCALL_BUFSIZE]; @@ -69,7 +189,8 @@ static int drc_pmem_bind(struct papr_scm_priv *p) return rc; p->bound_addr = saved; - dev_dbg(&p->pdev->dev, "bound drc 0x%x to %pR\n", p->drc_index, &p->res); + dev_dbg(&p->pdev->dev, "bound drc 0x%x to 0x%lx\n", + p->drc_index, (unsigned long)saved); return rc; } @@ -133,7 +254,7 @@ static int drc_pmem_query_n_bind(struct papr_scm_priv *p) goto err_out; p->bound_addr = start_addr; - dev_dbg(&p->pdev->dev, "bound drc 0x%x to %pR\n", p->drc_index, &p->res); + dev_dbg(&p->pdev->dev, "bound drc 0x%x to 0x%lx\n", p->drc_index, start_addr); return rc; err_out: @@ -143,6 +264,336 @@ err_out: return drc_pmem_bind(p); } +/* + * Query the Dimm performance stats from PHYP and copy them (if returned) to + * provided struct papr_scm_perf_stats instance 'stats' that can hold atleast + * (num_stats + header) bytes. + * - If buff_stats == NULL the return value is the size in bytes of the buffer + * needed to hold all supported performance-statistics. + * - If buff_stats != NULL and num_stats == 0 then we copy all known + * performance-statistics to 'buff_stat' and expect to be large enough to + * hold them. + * - if buff_stats != NULL and num_stats > 0 then copy the requested + * performance-statistics to buff_stats. + */ +static ssize_t drc_pmem_query_stats(struct papr_scm_priv *p, + struct papr_scm_perf_stats *buff_stats, + unsigned int num_stats) +{ + unsigned long ret[PLPAR_HCALL_BUFSIZE]; + size_t size; + s64 rc; + + /* Setup the out buffer */ + if (buff_stats) { + memcpy(buff_stats->eye_catcher, + PAPR_SCM_PERF_STATS_EYECATCHER, 8); + buff_stats->stats_version = + cpu_to_be32(PAPR_SCM_PERF_STATS_VERSION); + buff_stats->num_statistics = + cpu_to_be32(num_stats); + + /* + * Calculate the buffer size based on num-stats provided + * or use the prefetched max buffer length + */ + if (num_stats) + /* Calculate size from the num_stats */ + size = sizeof(struct papr_scm_perf_stats) + + num_stats * sizeof(struct papr_scm_perf_stat); + else + size = p->stat_buffer_len; + } else { + /* In case of no out buffer ignore the size */ + size = 0; + } + + /* Do the HCALL asking PHYP for info */ + rc = plpar_hcall(H_SCM_PERFORMANCE_STATS, ret, p->drc_index, + buff_stats ? virt_to_phys(buff_stats) : 0, + size); + + /* Check if the error was due to an unknown stat-id */ + if (rc == H_PARTIAL) { + dev_err(&p->pdev->dev, + "Unknown performance stats, Err:0x%016lX\n", ret[0]); + return -ENOENT; + } else if (rc == H_AUTHORITY) { + dev_info(&p->pdev->dev, + "Permission denied while accessing performance stats"); + return -EPERM; + } else if (rc == H_UNSUPPORTED) { + dev_dbg(&p->pdev->dev, "Performance stats unsupported\n"); + return -EOPNOTSUPP; + } else if (rc != H_SUCCESS) { + dev_err(&p->pdev->dev, + "Failed to query performance stats, Err:%lld\n", rc); + return -EIO; + + } else if (!size) { + /* Handle case where stat buffer size was requested */ + dev_dbg(&p->pdev->dev, + "Performance stats size %ld\n", ret[0]); + return ret[0]; + } + + /* Successfully fetched the requested stats from phyp */ + dev_dbg(&p->pdev->dev, + "Performance stats returned %d stats\n", + be32_to_cpu(buff_stats->num_statistics)); + return 0; +} + +#ifdef CONFIG_PERF_EVENTS +#define to_nvdimm_pmu(_pmu) container_of(_pmu, struct nvdimm_pmu, pmu) + +static const char * const nvdimm_events_map[] = { + [1] = "CtlResCt", + [2] = "CtlResTm", + [3] = "PonSecs ", + [4] = "MemLife ", + [5] = "CritRscU", + [6] = "HostLCnt", + [7] = "HostSCnt", + [8] = "HostSDur", + [9] = "HostLDur", + [10] = "MedRCnt ", + [11] = "MedWCnt ", + [12] = "MedRDur ", + [13] = "MedWDur ", + [14] = "CchRHCnt", + [15] = "CchWHCnt", + [16] = "FastWCnt", +}; + +static int papr_scm_pmu_get_value(struct perf_event *event, struct device *dev, u64 *count) +{ + struct papr_scm_perf_stat *stat; + struct papr_scm_perf_stats *stats; + struct papr_scm_priv *p = dev_get_drvdata(dev); + int rc, size; + + /* Invalid eventcode */ + if (event->attr.config == 0 || event->attr.config >= ARRAY_SIZE(nvdimm_events_map)) + return -EINVAL; + + /* Allocate request buffer enough to hold single performance stat */ + size = sizeof(struct papr_scm_perf_stats) + + sizeof(struct papr_scm_perf_stat); + + if (!p) + return -EINVAL; + + stats = kzalloc(size, GFP_KERNEL); + if (!stats) + return -ENOMEM; + + stat = &stats->scm_statistic[0]; + memcpy(&stat->stat_id, + nvdimm_events_map[event->attr.config], + sizeof(stat->stat_id)); + stat->stat_val = 0; + + rc = drc_pmem_query_stats(p, stats, 1); + if (rc < 0) { + kfree(stats); + return rc; + } + + *count = be64_to_cpu(stat->stat_val); + kfree(stats); + return 0; +} + +static int papr_scm_pmu_event_init(struct perf_event *event) +{ + struct nvdimm_pmu *nd_pmu = to_nvdimm_pmu(event->pmu); + struct papr_scm_priv *p; + + if (!nd_pmu) + return -EINVAL; + + /* test the event attr type for PMU enumeration */ + if (event->attr.type != event->pmu->type) + return -ENOENT; + + /* it does not support event sampling mode */ + if (is_sampling_event(event)) + return -EOPNOTSUPP; + + /* no branch sampling */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + + p = (struct papr_scm_priv *)nd_pmu->dev->driver_data; + if (!p) + return -EINVAL; + + /* Invalid eventcode */ + if (event->attr.config == 0 || event->attr.config > 16) + return -EINVAL; + + return 0; +} + +static int papr_scm_pmu_add(struct perf_event *event, int flags) +{ + u64 count; + int rc; + struct nvdimm_pmu *nd_pmu = to_nvdimm_pmu(event->pmu); + + if (!nd_pmu) + return -EINVAL; + + if (flags & PERF_EF_START) { + rc = papr_scm_pmu_get_value(event, nd_pmu->dev, &count); + if (rc) + return rc; + + local64_set(&event->hw.prev_count, count); + } + + return 0; +} + +static void papr_scm_pmu_read(struct perf_event *event) +{ + u64 prev, now; + int rc; + struct nvdimm_pmu *nd_pmu = to_nvdimm_pmu(event->pmu); + + if (!nd_pmu) + return; + + rc = papr_scm_pmu_get_value(event, nd_pmu->dev, &now); + if (rc) + return; + + prev = local64_xchg(&event->hw.prev_count, now); + local64_add(now - prev, &event->count); +} + +static void papr_scm_pmu_del(struct perf_event *event, int flags) +{ + papr_scm_pmu_read(event); +} + +static void papr_scm_pmu_register(struct papr_scm_priv *p) +{ + struct nvdimm_pmu *nd_pmu; + int rc, nodeid; + + nd_pmu = kzalloc(sizeof(*nd_pmu), GFP_KERNEL); + if (!nd_pmu) { + rc = -ENOMEM; + goto pmu_err_print; + } + + if (!p->stat_buffer_len) { + rc = -ENOENT; + goto pmu_check_events_err; + } + + nd_pmu->pmu.task_ctx_nr = perf_invalid_context; + nd_pmu->pmu.name = nvdimm_name(p->nvdimm); + nd_pmu->pmu.event_init = papr_scm_pmu_event_init; + nd_pmu->pmu.read = papr_scm_pmu_read; + nd_pmu->pmu.add = papr_scm_pmu_add; + nd_pmu->pmu.del = papr_scm_pmu_del; + + nd_pmu->pmu.capabilities = PERF_PMU_CAP_NO_INTERRUPT | + PERF_PMU_CAP_NO_EXCLUDE; + + /*updating the cpumask variable */ + nodeid = numa_map_to_online_node(dev_to_node(&p->pdev->dev)); + nd_pmu->arch_cpumask = *cpumask_of_node(nodeid); + + rc = register_nvdimm_pmu(nd_pmu, p->pdev); + if (rc) + goto pmu_check_events_err; + + /* + * Set archdata.priv value to nvdimm_pmu structure, to handle the + * unregistering of pmu device. + */ + p->pdev->archdata.priv = nd_pmu; + return; + +pmu_check_events_err: + kfree(nd_pmu); +pmu_err_print: + dev_info(&p->pdev->dev, "nvdimm pmu didn't register rc=%d\n", rc); +} + +#else +static void papr_scm_pmu_register(struct papr_scm_priv *p) { } +#endif + +/* + * Issue hcall to retrieve dimm health info and populate papr_scm_priv with the + * health information. + */ +static int __drc_pmem_query_health(struct papr_scm_priv *p) +{ + unsigned long ret[PLPAR_HCALL_BUFSIZE]; + u64 bitmap = 0; + long rc; + + /* issue the hcall */ + rc = plpar_hcall(H_SCM_HEALTH, ret, p->drc_index); + if (rc == H_SUCCESS) + bitmap = ret[0] & ret[1]; + else if (rc == H_FUNCTION) + dev_info_once(&p->pdev->dev, + "Hcall H_SCM_HEALTH not implemented, assuming empty health bitmap"); + else { + + dev_err(&p->pdev->dev, + "Failed to query health information, Err:%ld\n", rc); + return -ENXIO; + } + + p->lasthealth_jiffies = jiffies; + /* Allow injecting specific health bits via inject mask. */ + if (p->health_bitmap_inject_mask) + bitmap = (bitmap & ~p->health_bitmap_inject_mask) | + p->health_bitmap_inject_mask; + WRITE_ONCE(p->health_bitmap, bitmap); + dev_dbg(&p->pdev->dev, + "Queried dimm health info. Bitmap:0x%016lx Mask:0x%016lx\n", + ret[0], ret[1]); + + return 0; +} + +/* Min interval in seconds for assuming stable dimm health */ +#define MIN_HEALTH_QUERY_INTERVAL 60 + +/* Query cached health info and if needed call drc_pmem_query_health */ +static int drc_pmem_query_health(struct papr_scm_priv *p) +{ + unsigned long cache_timeout; + int rc; + + /* Protect concurrent modifications to papr_scm_priv */ + rc = mutex_lock_interruptible(&p->health_mutex); + if (rc) + return rc; + + /* Jiffies offset for which the health data is assumed to be same */ + cache_timeout = p->lasthealth_jiffies + + msecs_to_jiffies(MIN_HEALTH_QUERY_INTERVAL * 1000); + + /* Fetch new health info is its older than MIN_HEALTH_QUERY_INTERVAL */ + if (time_after(jiffies, cache_timeout)) + rc = __drc_pmem_query_health(p); + else + /* Assume cached health data is valid */ + rc = 0; + + mutex_unlock(&p->health_mutex); + return rc; +} static int papr_scm_meta_get(struct papr_scm_priv *p, struct nd_cmd_get_config_data_hdr *hdr) @@ -245,16 +696,368 @@ static int papr_scm_meta_set(struct papr_scm_priv *p, return 0; } -int papr_scm_ndctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm, - unsigned int cmd, void *buf, unsigned int buf_len, int *cmd_rc) +/* + * Do a sanity checks on the inputs args to dimm-control function and return + * '0' if valid. Validation of PDSM payloads happens later in + * papr_scm_service_pdsm. + */ +static int is_cmd_valid(struct nvdimm *nvdimm, unsigned int cmd, void *buf, + unsigned int buf_len) { - struct nd_cmd_get_config_size *get_size_hdr; + unsigned long cmd_mask = PAPR_SCM_DIMM_CMD_MASK; + struct nd_cmd_pkg *nd_cmd; struct papr_scm_priv *p; + enum papr_pdsm pdsm; /* Only dimm-specific calls are supported atm */ if (!nvdimm) return -EINVAL; + /* get the provider data from struct nvdimm */ + p = nvdimm_provider_data(nvdimm); + + if (!test_bit(cmd, &cmd_mask)) { + dev_dbg(&p->pdev->dev, "Unsupported cmd=%u\n", cmd); + return -EINVAL; + } + + /* For CMD_CALL verify pdsm request */ + if (cmd == ND_CMD_CALL) { + /* Verify the envelope and envelop size */ + if (!buf || + buf_len < (sizeof(struct nd_cmd_pkg) + ND_PDSM_HDR_SIZE)) { + dev_dbg(&p->pdev->dev, "Invalid pkg size=%u\n", + buf_len); + return -EINVAL; + } + + /* Verify that the nd_cmd_pkg.nd_family is correct */ + nd_cmd = (struct nd_cmd_pkg *)buf; + + if (nd_cmd->nd_family != NVDIMM_FAMILY_PAPR) { + dev_dbg(&p->pdev->dev, "Invalid pkg family=0x%llx\n", + nd_cmd->nd_family); + return -EINVAL; + } + + pdsm = (enum papr_pdsm)nd_cmd->nd_command; + + /* Verify if the pdsm command is valid */ + if (pdsm <= PAPR_PDSM_MIN || pdsm >= PAPR_PDSM_MAX) { + dev_dbg(&p->pdev->dev, "PDSM[0x%x]: Invalid PDSM\n", + pdsm); + return -EINVAL; + } + + /* Have enough space to hold returned 'nd_pkg_pdsm' header */ + if (nd_cmd->nd_size_out < ND_PDSM_HDR_SIZE) { + dev_dbg(&p->pdev->dev, "PDSM[0x%x]: Invalid payload\n", + pdsm); + return -EINVAL; + } + } + + /* Let the command be further processed */ + return 0; +} + +static int papr_pdsm_fuel_gauge(struct papr_scm_priv *p, + union nd_pdsm_payload *payload) +{ + int rc, size; + u64 statval; + struct papr_scm_perf_stat *stat; + struct papr_scm_perf_stats *stats; + + /* Silently fail if fetching performance metrics isn't supported */ + if (!p->stat_buffer_len) + return 0; + + /* Allocate request buffer enough to hold single performance stat */ + size = sizeof(struct papr_scm_perf_stats) + + sizeof(struct papr_scm_perf_stat); + + stats = kzalloc(size, GFP_KERNEL); + if (!stats) + return -ENOMEM; + + stat = &stats->scm_statistic[0]; + memcpy(&stat->stat_id, "MemLife ", sizeof(stat->stat_id)); + stat->stat_val = 0; + + /* Fetch the fuel gauge and populate it in payload */ + rc = drc_pmem_query_stats(p, stats, 1); + if (rc < 0) { + dev_dbg(&p->pdev->dev, "Err(%d) fetching fuel gauge\n", rc); + goto free_stats; + } + + statval = be64_to_cpu(stat->stat_val); + dev_dbg(&p->pdev->dev, + "Fetched fuel-gauge %llu", statval); + payload->health.extension_flags |= + PDSM_DIMM_HEALTH_RUN_GAUGE_VALID; + payload->health.dimm_fuel_gauge = statval; + + rc = sizeof(struct nd_papr_pdsm_health); + +free_stats: + kfree(stats); + return rc; +} + +/* Add the dirty-shutdown-counter value to the pdsm */ +static int papr_pdsm_dsc(struct papr_scm_priv *p, + union nd_pdsm_payload *payload) +{ + payload->health.extension_flags |= PDSM_DIMM_DSC_VALID; + payload->health.dimm_dsc = p->dirty_shutdown_counter; + + return sizeof(struct nd_papr_pdsm_health); +} + +/* Fetch the DIMM health info and populate it in provided package. */ +static int papr_pdsm_health(struct papr_scm_priv *p, + union nd_pdsm_payload *payload) +{ + int rc; + + /* Ensure dimm health mutex is taken preventing concurrent access */ + rc = mutex_lock_interruptible(&p->health_mutex); + if (rc) + goto out; + + /* Always fetch upto date dimm health data ignoring cached values */ + rc = __drc_pmem_query_health(p); + if (rc) { + mutex_unlock(&p->health_mutex); + goto out; + } + + /* update health struct with various flags derived from health bitmap */ + payload->health = (struct nd_papr_pdsm_health) { + .extension_flags = 0, + .dimm_unarmed = !!(p->health_bitmap & PAPR_PMEM_UNARMED_MASK), + .dimm_bad_shutdown = !!(p->health_bitmap & PAPR_PMEM_BAD_SHUTDOWN_MASK), + .dimm_bad_restore = !!(p->health_bitmap & PAPR_PMEM_BAD_RESTORE_MASK), + .dimm_scrubbed = !!(p->health_bitmap & PAPR_PMEM_SCRUBBED_AND_LOCKED), + .dimm_locked = !!(p->health_bitmap & PAPR_PMEM_SCRUBBED_AND_LOCKED), + .dimm_encrypted = !!(p->health_bitmap & PAPR_PMEM_ENCRYPTED), + .dimm_health = PAPR_PDSM_DIMM_HEALTHY, + }; + + /* Update field dimm_health based on health_bitmap flags */ + if (p->health_bitmap & PAPR_PMEM_HEALTH_FATAL) + payload->health.dimm_health = PAPR_PDSM_DIMM_FATAL; + else if (p->health_bitmap & PAPR_PMEM_HEALTH_CRITICAL) + payload->health.dimm_health = PAPR_PDSM_DIMM_CRITICAL; + else if (p->health_bitmap & PAPR_PMEM_HEALTH_UNHEALTHY) + payload->health.dimm_health = PAPR_PDSM_DIMM_UNHEALTHY; + + /* struct populated hence can release the mutex now */ + mutex_unlock(&p->health_mutex); + + /* Populate the fuel gauge meter in the payload */ + papr_pdsm_fuel_gauge(p, payload); + /* Populate the dirty-shutdown-counter field */ + papr_pdsm_dsc(p, payload); + + rc = sizeof(struct nd_papr_pdsm_health); + +out: + return rc; +} + +/* Inject a smart error Add the dirty-shutdown-counter value to the pdsm */ +static int papr_pdsm_smart_inject(struct papr_scm_priv *p, + union nd_pdsm_payload *payload) +{ + int rc; + u32 supported_flags = 0; + u64 inject_mask = 0, clear_mask = 0; + u64 mask; + + /* Check for individual smart error flags and update inject/clear masks */ + if (payload->smart_inject.flags & PDSM_SMART_INJECT_HEALTH_FATAL) { + supported_flags |= PDSM_SMART_INJECT_HEALTH_FATAL; + if (payload->smart_inject.fatal_enable) + inject_mask |= PAPR_PMEM_HEALTH_FATAL; + else + clear_mask |= PAPR_PMEM_HEALTH_FATAL; + } + + if (payload->smart_inject.flags & PDSM_SMART_INJECT_BAD_SHUTDOWN) { + supported_flags |= PDSM_SMART_INJECT_BAD_SHUTDOWN; + if (payload->smart_inject.unsafe_shutdown_enable) + inject_mask |= PAPR_PMEM_SHUTDOWN_DIRTY; + else + clear_mask |= PAPR_PMEM_SHUTDOWN_DIRTY; + } + + dev_dbg(&p->pdev->dev, "[Smart-inject] inject_mask=%#llx clear_mask=%#llx\n", + inject_mask, clear_mask); + + /* Prevent concurrent access to dimm health bitmap related members */ + rc = mutex_lock_interruptible(&p->health_mutex); + if (rc) + return rc; + + /* Use inject/clear masks to set health_bitmap_inject_mask */ + mask = READ_ONCE(p->health_bitmap_inject_mask); + mask = (mask & ~clear_mask) | inject_mask; + WRITE_ONCE(p->health_bitmap_inject_mask, mask); + + /* Invalidate cached health bitmap */ + p->lasthealth_jiffies = 0; + + mutex_unlock(&p->health_mutex); + + /* Return the supported flags back to userspace */ + payload->smart_inject.flags = supported_flags; + + return sizeof(struct nd_papr_pdsm_health); +} + +/* + * 'struct pdsm_cmd_desc' + * Identifies supported PDSMs' expected length of in/out payloads + * and pdsm service function. + * + * size_in : Size of input payload if any in the PDSM request. + * size_out : Size of output payload if any in the PDSM request. + * service : Service function for the PDSM request. Return semantics: + * rc < 0 : Error servicing PDSM and rc indicates the error. + * rc >=0 : Serviced successfully and 'rc' indicate number of + * bytes written to payload. + */ +struct pdsm_cmd_desc { + u32 size_in; + u32 size_out; + int (*service)(struct papr_scm_priv *dimm, + union nd_pdsm_payload *payload); +}; + +/* Holds all supported PDSMs' command descriptors */ +static const struct pdsm_cmd_desc __pdsm_cmd_descriptors[] = { + [PAPR_PDSM_MIN] = { + .size_in = 0, + .size_out = 0, + .service = NULL, + }, + /* New PDSM command descriptors to be added below */ + + [PAPR_PDSM_HEALTH] = { + .size_in = 0, + .size_out = sizeof(struct nd_papr_pdsm_health), + .service = papr_pdsm_health, + }, + + [PAPR_PDSM_SMART_INJECT] = { + .size_in = sizeof(struct nd_papr_pdsm_smart_inject), + .size_out = sizeof(struct nd_papr_pdsm_smart_inject), + .service = papr_pdsm_smart_inject, + }, + /* Empty */ + [PAPR_PDSM_MAX] = { + .size_in = 0, + .size_out = 0, + .service = NULL, + }, +}; + +/* Given a valid pdsm cmd return its command descriptor else return NULL */ +static inline const struct pdsm_cmd_desc *pdsm_cmd_desc(enum papr_pdsm cmd) +{ + if (cmd >= 0 || cmd < ARRAY_SIZE(__pdsm_cmd_descriptors)) + return &__pdsm_cmd_descriptors[cmd]; + + return NULL; +} + +/* + * For a given pdsm request call an appropriate service function. + * Returns errors if any while handling the pdsm command package. + */ +static int papr_scm_service_pdsm(struct papr_scm_priv *p, + struct nd_cmd_pkg *pkg) +{ + /* Get the PDSM header and PDSM command */ + struct nd_pkg_pdsm *pdsm_pkg = (struct nd_pkg_pdsm *)pkg->nd_payload; + enum papr_pdsm pdsm = (enum papr_pdsm)pkg->nd_command; + const struct pdsm_cmd_desc *pdsc; + int rc; + + /* Fetch corresponding pdsm descriptor for validation and servicing */ + pdsc = pdsm_cmd_desc(pdsm); + + /* Validate pdsm descriptor */ + /* Ensure that reserved fields are 0 */ + if (pdsm_pkg->reserved[0] || pdsm_pkg->reserved[1]) { + dev_dbg(&p->pdev->dev, "PDSM[0x%x]: Invalid reserved field\n", + pdsm); + return -EINVAL; + } + + /* If pdsm expects some input, then ensure that the size_in matches */ + if (pdsc->size_in && + pkg->nd_size_in != (pdsc->size_in + ND_PDSM_HDR_SIZE)) { + dev_dbg(&p->pdev->dev, "PDSM[0x%x]: Mismatched size_in=%d\n", + pdsm, pkg->nd_size_in); + return -EINVAL; + } + + /* If pdsm wants to return data, then ensure that size_out matches */ + if (pdsc->size_out && + pkg->nd_size_out != (pdsc->size_out + ND_PDSM_HDR_SIZE)) { + dev_dbg(&p->pdev->dev, "PDSM[0x%x]: Mismatched size_out=%d\n", + pdsm, pkg->nd_size_out); + return -EINVAL; + } + + /* Service the pdsm */ + if (pdsc->service) { + dev_dbg(&p->pdev->dev, "PDSM[0x%x]: Servicing..\n", pdsm); + + rc = pdsc->service(p, &pdsm_pkg->payload); + + if (rc < 0) { + /* error encountered while servicing pdsm */ + pdsm_pkg->cmd_status = rc; + pkg->nd_fw_size = ND_PDSM_HDR_SIZE; + } else { + /* pdsm serviced and 'rc' bytes written to payload */ + pdsm_pkg->cmd_status = 0; + pkg->nd_fw_size = ND_PDSM_HDR_SIZE + rc; + } + } else { + dev_dbg(&p->pdev->dev, "PDSM[0x%x]: Unsupported PDSM request\n", + pdsm); + pdsm_pkg->cmd_status = -ENOENT; + pkg->nd_fw_size = ND_PDSM_HDR_SIZE; + } + + return pdsm_pkg->cmd_status; +} + +static int papr_scm_ndctl(struct nvdimm_bus_descriptor *nd_desc, + struct nvdimm *nvdimm, unsigned int cmd, void *buf, + unsigned int buf_len, int *cmd_rc) +{ + struct nd_cmd_get_config_size *get_size_hdr; + struct nd_cmd_pkg *call_pkg = NULL; + struct papr_scm_priv *p; + int rc; + + rc = is_cmd_valid(nvdimm, cmd, buf, buf_len); + if (rc) { + pr_debug("Invalid cmd=0x%x. Err=%d\n", cmd, rc); + return rc; + } + + /* Use a local variable in case cmd_rc pointer is NULL */ + if (!cmd_rc) + cmd_rc = &rc; + p = nvdimm_provider_data(nvdimm); switch (cmd) { @@ -275,7 +1078,13 @@ int papr_scm_ndctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm, *cmd_rc = papr_scm_meta_set(p, buf); break; + case ND_CMD_CALL: + call_pkg = (struct nd_cmd_pkg *)buf; + *cmd_rc = papr_scm_service_pdsm(p, call_pkg); + break; + default: + dev_dbg(&p->pdev->dev, "Unknown command = %d\n", cmd); return -EINVAL; } @@ -284,24 +1093,147 @@ int papr_scm_ndctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm, return 0; } -static inline int papr_scm_node(int node) +static ssize_t health_bitmap_inject_show(struct device *dev, + struct device_attribute *attr, + char *buf) { - int min_dist = INT_MAX, dist; - int nid, min_node; + struct nvdimm *dimm = to_nvdimm(dev); + struct papr_scm_priv *p = nvdimm_provider_data(dimm); - if ((node == NUMA_NO_NODE) || node_online(node)) - return node; + return sprintf(buf, "%#llx\n", + READ_ONCE(p->health_bitmap_inject_mask)); +} - min_node = first_online_node; - for_each_online_node(nid) { - dist = node_distance(node, nid); - if (dist < min_dist) { - min_dist = dist; - min_node = nid; - } +static DEVICE_ATTR_ADMIN_RO(health_bitmap_inject); + +static ssize_t perf_stats_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int index; + ssize_t rc; + struct seq_buf s; + struct papr_scm_perf_stat *stat; + struct papr_scm_perf_stats *stats; + struct nvdimm *dimm = to_nvdimm(dev); + struct papr_scm_priv *p = nvdimm_provider_data(dimm); + + if (!p->stat_buffer_len) + return -ENOENT; + + /* Allocate the buffer for phyp where stats are written */ + stats = kzalloc(p->stat_buffer_len, GFP_KERNEL); + if (!stats) + return -ENOMEM; + + /* Ask phyp to return all dimm perf stats */ + rc = drc_pmem_query_stats(p, stats, 0); + if (rc) + goto free_stats; + /* + * Go through the returned output buffer and print stats and + * values. Since stat_id is essentially a char string of + * 8 bytes, simply use the string format specifier to print it. + */ + seq_buf_init(&s, buf, PAGE_SIZE); + for (index = 0, stat = stats->scm_statistic; + index < be32_to_cpu(stats->num_statistics); + ++index, ++stat) { + seq_buf_printf(&s, "%.8s = 0x%016llX\n", + stat->stat_id, + be64_to_cpu(stat->stat_val)); } - return min_node; + +free_stats: + kfree(stats); + return rc ? rc : (ssize_t)seq_buf_used(&s); +} +static DEVICE_ATTR_ADMIN_RO(perf_stats); + +static ssize_t flags_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvdimm *dimm = to_nvdimm(dev); + struct papr_scm_priv *p = nvdimm_provider_data(dimm); + struct seq_buf s; + u64 health; + int rc; + + rc = drc_pmem_query_health(p); + if (rc) + return rc; + + /* Copy health_bitmap locally, check masks & update out buffer */ + health = READ_ONCE(p->health_bitmap); + + seq_buf_init(&s, buf, PAGE_SIZE); + if (health & PAPR_PMEM_UNARMED_MASK) + seq_buf_printf(&s, "not_armed "); + + if (health & PAPR_PMEM_BAD_SHUTDOWN_MASK) + seq_buf_printf(&s, "flush_fail "); + + if (health & PAPR_PMEM_BAD_RESTORE_MASK) + seq_buf_printf(&s, "restore_fail "); + + if (health & PAPR_PMEM_ENCRYPTED) + seq_buf_printf(&s, "encrypted "); + + if (health & PAPR_PMEM_SMART_EVENT_MASK) + seq_buf_printf(&s, "smart_notify "); + + if (health & PAPR_PMEM_SCRUBBED_AND_LOCKED) + seq_buf_printf(&s, "scrubbed locked "); + + if (seq_buf_used(&s)) + seq_buf_printf(&s, "\n"); + + return seq_buf_used(&s); } +DEVICE_ATTR_RO(flags); + +static ssize_t dirty_shutdown_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvdimm *dimm = to_nvdimm(dev); + struct papr_scm_priv *p = nvdimm_provider_data(dimm); + + return sysfs_emit(buf, "%llu\n", p->dirty_shutdown_counter); +} +DEVICE_ATTR_RO(dirty_shutdown); + +static umode_t papr_nd_attribute_visible(struct kobject *kobj, + struct attribute *attr, int n) +{ + struct device *dev = kobj_to_dev(kobj); + struct nvdimm *nvdimm = to_nvdimm(dev); + struct papr_scm_priv *p = nvdimm_provider_data(nvdimm); + + /* For if perf-stats not available remove perf_stats sysfs */ + if (attr == &dev_attr_perf_stats.attr && p->stat_buffer_len == 0) + return 0; + + return attr->mode; +} + +/* papr_scm specific dimm attributes */ +static struct attribute *papr_nd_attributes[] = { + &dev_attr_flags.attr, + &dev_attr_perf_stats.attr, + &dev_attr_dirty_shutdown.attr, + &dev_attr_health_bitmap_inject.attr, + NULL, +}; + +static const struct attribute_group papr_nd_attribute_group = { + .name = "papr", + .is_visible = papr_nd_attribute_visible, + .attrs = papr_nd_attributes, +}; + +static const struct attribute_group *papr_nd_attr_groups[] = { + &papr_nd_attribute_group, + NULL, +}; static int papr_scm_nvdimm_init(struct papr_scm_priv *p) { @@ -316,20 +1248,33 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p) p->bus_desc.of_node = p->pdev->dev.of_node; p->bus_desc.provider_name = kstrdup(p->pdev->name, GFP_KERNEL); + /* Set the dimm command family mask to accept PDSMs */ + set_bit(NVDIMM_FAMILY_PAPR, &p->bus_desc.dimm_family_mask); + if (!p->bus_desc.provider_name) return -ENOMEM; p->bus = nvdimm_bus_register(NULL, &p->bus_desc); if (!p->bus) { dev_err(dev, "Error creating nvdimm bus %pOF\n", p->dn); + kfree(p->bus_desc.provider_name); return -ENXIO; } dimm_flags = 0; - set_bit(NDD_ALIASING, &dimm_flags); + set_bit(NDD_LABELING, &dimm_flags); - p->nvdimm = nvdimm_create(p->bus, p, NULL, dimm_flags, - PAPR_SCM_DIMM_CMD_MASK, 0, NULL); + /* + * Check if the nvdimm is unarmed. No locking needed as we are still + * initializing. Ignore error encountered if any. + */ + __drc_pmem_query_health(p); + + if (p->health_bitmap & PAPR_PMEM_UNARMED_MASK) + set_bit(NDD_UNARMED, &dimm_flags); + + p->nvdimm = nvdimm_create(p->bus, p, papr_nd_attr_groups, + dimm_flags, PAPR_SCM_DIMM_CMD_MASK, 0, NULL); if (!p->nvdimm) { dev_err(dev, "Error creating DIMM object for %pOF\n", p->dn); goto err; @@ -347,7 +1292,7 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p) memset(&ndr_desc, 0, sizeof(ndr_desc)); target_nid = dev_to_node(&p->pdev->dev); - online_nid = papr_scm_node(target_nid); + online_nid = numa_map_to_online_node(target_nid); ndr_desc.numa_node = online_nid; ndr_desc.target_node = target_nid; ndr_desc.res = &p->res; @@ -356,12 +1301,18 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p) ndr_desc.mapping = &mapping; ndr_desc.num_mappings = 1; ndr_desc.nd_set = &p->nd_set; - set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags); + + if (p->hcall_flush_required) { + set_bit(ND_REGION_ASYNC, &ndr_desc.flags); + ndr_desc.flush = papr_scm_pmem_flush; + } if (p->is_volatile) p->region = nvdimm_volatile_region_create(p->bus, &ndr_desc); - else + else { + set_bit(ND_REGION_PERSIST_MEMCTRL, &ndr_desc.flags); p->region = nvdimm_pmem_region_create(p->bus, &ndr_desc); + } if (!p->region) { dev_err(dev, "Error registering region %pR from %pOF\n", ndr_desc.res, p->dn); @@ -371,6 +1322,10 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p) dev_info(dev, "Region registered with target node %d and online node %d", target_nid, online_nid); + mutex_lock(&papr_ndr_lock); + list_add_tail(&p->region_list, &papr_nd_regions); + mutex_unlock(&papr_ndr_lock); + return 0; err: nvdimm_bus_unregister(p->bus); @@ -378,14 +1333,78 @@ err: nvdimm_bus_unregister(p->bus); return -ENXIO; } +static void papr_scm_add_badblock(struct nd_region *region, + struct nvdimm_bus *bus, u64 phys_addr) +{ + u64 aligned_addr = ALIGN_DOWN(phys_addr, L1_CACHE_BYTES); + + if (nvdimm_bus_add_badrange(bus, aligned_addr, L1_CACHE_BYTES)) { + pr_err("Bad block registration for 0x%llx failed\n", phys_addr); + return; + } + + pr_debug("Add memory range (0x%llx - 0x%llx) as bad range\n", + aligned_addr, aligned_addr + L1_CACHE_BYTES); + + nvdimm_region_notify(region, NVDIMM_REVALIDATE_POISON); +} + +static int handle_mce_ue(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct machine_check_event *evt = data; + struct papr_scm_priv *p; + u64 phys_addr; + bool found = false; + + if (evt->error_type != MCE_ERROR_TYPE_UE) + return NOTIFY_DONE; + + if (list_empty(&papr_nd_regions)) + return NOTIFY_DONE; + + /* + * The physical address obtained here is PAGE_SIZE aligned, so get the + * exact address from the effective address + */ + phys_addr = evt->u.ue_error.physical_address + + (evt->u.ue_error.effective_address & ~PAGE_MASK); + + if (!evt->u.ue_error.physical_address_provided || + !is_zone_device_page(pfn_to_page(phys_addr >> PAGE_SHIFT))) + return NOTIFY_DONE; + + /* mce notifier is called from a process context, so mutex is safe */ + mutex_lock(&papr_ndr_lock); + list_for_each_entry(p, &papr_nd_regions, region_list) { + if (phys_addr >= p->res.start && phys_addr <= p->res.end) { + found = true; + break; + } + } + + if (found) + papr_scm_add_badblock(p->region, p->bus, phys_addr); + + mutex_unlock(&papr_ndr_lock); + + return found ? NOTIFY_OK : NOTIFY_DONE; +} + +static struct notifier_block mce_ue_nb = { + .notifier_call = handle_mce_ue +}; + static int papr_scm_probe(struct platform_device *pdev) { struct device_node *dn = pdev->dev.of_node; u32 drc_index, metadata_size; u64 blocks, block_size; struct papr_scm_priv *p; + u8 uuid_raw[UUID_SIZE]; const char *uuid_str; - u64 uuid[2]; + ssize_t stat_size; + uuid_t uuid; int rc; /* check we have all the required DT properties */ @@ -409,11 +1428,21 @@ static int papr_scm_probe(struct platform_device *pdev) return -ENODEV; } + /* + * open firmware platform device create won't update the NUMA + * distance table. For PAPR SCM devices we use numa_map_to_online_node() + * to find the nearest online NUMA node and that requires correct + * distance table information. + */ + update_numa_distance(dn); p = kzalloc(sizeof(*p), GFP_KERNEL); if (!p) return -ENOMEM; + /* Initialize the dimm mutex */ + mutex_init(&p->health_mutex); + /* optional DT properties */ of_property_read_u32(dn, "ibm,metadata-size", &metadata_size); @@ -422,18 +1451,30 @@ static int papr_scm_probe(struct platform_device *pdev) p->block_size = block_size; p->blocks = blocks; p->is_volatile = !of_property_read_bool(dn, "ibm,cache-flush-required"); + p->hcall_flush_required = of_property_read_bool(dn, "ibm,hcall-flush-required"); + + if (of_property_read_u64(dn, "ibm,persistence-failed-count", + &p->dirty_shutdown_counter)) + p->dirty_shutdown_counter = 0; /* We just need to ensure that set cookies are unique across */ - uuid_parse(uuid_str, (uuid_t *) uuid); + uuid_parse(uuid_str, &uuid); + /* - * cookie1 and cookie2 are not really little endian - * we store a little endian representation of the - * uuid str so that we can compare this with the label - * area cookie irrespective of the endian config with which - * the kernel is built. + * The cookie1 and cookie2 are not really little endian. + * We store a raw buffer representation of the + * uuid string so that we can compare this with the label + * area cookie irrespective of the endian configuration + * with which the kernel is built. + * + * Historically we stored the cookie in the below format. + * for a uuid string 72511b67-0b3b-42fd-8d1d-5be3cae8bcaa + * cookie1 was 0xfd423b0b671b5172 + * cookie2 was 0xaabce8cae35b1d8d */ - p->nd_set.cookie1 = cpu_to_le64(uuid[0]); - p->nd_set.cookie2 = cpu_to_le64(uuid[1]); + export_uuid(uuid_raw, &uuid); + p->nd_set.cookie1 = get_unaligned_le64(&uuid_raw[0]); + p->nd_set.cookie2 = get_unaligned_le64(&uuid_raw[8]); /* might be zero */ p->metadata_size = metadata_size; @@ -458,11 +1499,20 @@ static int papr_scm_probe(struct platform_device *pdev) p->res.name = pdev->name; p->res.flags = IORESOURCE_MEM; + /* Try retrieving the stat buffer and see if its supported */ + stat_size = drc_pmem_query_stats(p, NULL, 0); + if (stat_size > 0) { + p->stat_buffer_len = stat_size; + dev_dbg(&p->pdev->dev, "Max perf-stat size %lu-bytes\n", + p->stat_buffer_len); + } + rc = papr_scm_nvdimm_init(p); if (rc) goto err2; platform_set_drvdata(pdev, p); + papr_scm_pmu_register(p); return 0; @@ -471,32 +1521,59 @@ err: kfree(p); return rc; } -static int papr_scm_remove(struct platform_device *pdev) +static void papr_scm_remove(struct platform_device *pdev) { struct papr_scm_priv *p = platform_get_drvdata(pdev); + mutex_lock(&papr_ndr_lock); + list_del(&p->region_list); + mutex_unlock(&papr_ndr_lock); + nvdimm_bus_unregister(p->bus); drc_pmem_unbind(p); - kfree(p); - return 0; + if (pdev->archdata.priv) + unregister_nvdimm_pmu(pdev->archdata.priv); + + pdev->archdata.priv = NULL; + kfree(p->bus_desc.provider_name); + kfree(p); } static const struct of_device_id papr_scm_match[] = { { .compatible = "ibm,pmemory" }, + { .compatible = "ibm,pmemory-v2" }, { }, }; static struct platform_driver papr_scm_driver = { .probe = papr_scm_probe, - .remove = papr_scm_remove, + .remove_new = papr_scm_remove, .driver = { .name = "papr_scm", .of_match_table = papr_scm_match, }, }; -module_platform_driver(papr_scm_driver); +static int __init papr_scm_init(void) +{ + int ret; + + ret = platform_driver_register(&papr_scm_driver); + if (!ret) + mce_register_notifier(&mce_ue_nb); + + return ret; +} +module_init(papr_scm_init); + +static void __exit papr_scm_exit(void) +{ + mce_unregister_notifier(&mce_ue_nb); + platform_driver_unregister(&papr_scm_driver); +} +module_exit(papr_scm_exit); + MODULE_DEVICE_TABLE(of, papr_scm_match); MODULE_LICENSE("GPL"); MODULE_AUTHOR("IBM Corporation"); diff --git a/arch/powerpc/platforms/pseries/pci.c b/arch/powerpc/platforms/pseries/pci.c index 722830978639..1772ae3d193d 100644 --- a/arch/powerpc/platforms/pseries/pci.c +++ b/arch/powerpc/platforms/pseries/pci.c @@ -14,7 +14,6 @@ #include <asm/eeh.h> #include <asm/pci-bridge.h> -#include <asm/prom.h> #include <asm/ppc-pci.h> #include <asm/pci.h> #include "pseries.h" @@ -55,14 +54,13 @@ struct pe_map_bar_entry { __be32 reserved; /* Reserved Space */ }; -int pseries_send_map_pe(struct pci_dev *pdev, - u16 num_vfs, - struct pe_map_bar_entry *vf_pe_array) +static int pseries_send_map_pe(struct pci_dev *pdev, u16 num_vfs, + struct pe_map_bar_entry *vf_pe_array) { struct pci_dn *pdn; int rc; unsigned long buid, addr; - int ibm_map_pes = rtas_token("ibm,open-sriov-map-pe-number"); + int ibm_map_pes = rtas_function_token(RTAS_FN_IBM_OPEN_SRIOV_MAP_PE_NUMBER); if (ibm_map_pes == RTAS_UNKNOWN_SERVICE) return -EINVAL; @@ -88,7 +86,7 @@ int pseries_send_map_pe(struct pci_dev *pdev, return rc; } -void pseries_set_pe_num(struct pci_dev *pdev, u16 vf_index, __be16 pe_num) +static void pseries_set_pe_num(struct pci_dev *pdev, u16 vf_index, __be16 pe_num) { struct pci_dn *pdn; @@ -102,7 +100,7 @@ void pseries_set_pe_num(struct pci_dev *pdev, u16 vf_index, __be16 pe_num) pdn->pe_num_map[vf_index]); } -int pseries_associate_pes(struct pci_dev *pdev, u16 num_vfs) +static int pseries_associate_pes(struct pci_dev *pdev, u16 num_vfs) { struct pci_dn *pdn; int i, rc, vf_index; @@ -146,7 +144,7 @@ int pseries_associate_pes(struct pci_dev *pdev, u16 num_vfs) return rc; } -int pseries_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs) +static int pseries_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs) { struct pci_dn *pdn; int rc; @@ -189,14 +187,14 @@ int pseries_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs) return rc; } -int pseries_pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs) +static int pseries_pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs) { /* Allocate PCI data */ - add_dev_pci_data(pdev); + add_sriov_vf_pdns(pdev); return pseries_pci_sriov_enable(pdev, num_vfs); } -int pseries_pcibios_sriov_disable(struct pci_dev *pdev) +static int pseries_pcibios_sriov_disable(struct pci_dev *pdev) { struct pci_dn *pdn; @@ -204,7 +202,7 @@ int pseries_pcibios_sriov_disable(struct pci_dev *pdev) /* Releasing pe_num_map */ kfree(pdn->pe_num_map); /* Release PCI data */ - remove_dev_pci_data(pdev); + remove_sriov_vf_pdns(pdev); pci_vf_drivers_autoprobe(pdev, true); return 0; } @@ -225,8 +223,6 @@ static void __init pSeries_request_regions(void) void __init pSeries_final_fixup(void) { - struct pci_controller *hose; - pSeries_request_regions(); eeh_show_enabled(); @@ -235,27 +231,6 @@ void __init pSeries_final_fixup(void) ppc_md.pcibios_sriov_enable = pseries_pcibios_sriov_enable; ppc_md.pcibios_sriov_disable = pseries_pcibios_sriov_disable; #endif - list_for_each_entry(hose, &hose_list, list_node) { - struct device_node *dn = hose->dn, *nvdn; - - while (1) { - dn = of_find_all_nodes(dn); - if (!dn) - break; - nvdn = of_parse_phandle(dn, "ibm,nvlink", 0); - if (!nvdn) - continue; - if (!of_device_is_compatible(nvdn, "ibm,npu-link")) - continue; - if (!of_device_is_compatible(nvdn->parent, - "ibm,power9-npu")) - continue; -#ifdef CONFIG_PPC_POWERNV - WARN_ON_ONCE(pnv_npu2_init(hose)); -#endif - break; - } - } } /* @@ -265,7 +240,7 @@ void __init pSeries_final_fixup(void) */ static void fixup_winbond_82c105(struct pci_dev* dev) { - int i; + struct resource *r; unsigned int reg; if (!machine_is(pseries)) @@ -276,20 +251,39 @@ static void fixup_winbond_82c105(struct pci_dev* dev) /* Enable LEGIRQ to use INTC instead of ISA interrupts */ pci_write_config_dword(dev, 0x40, reg | (1<<11)); - for (i = 0; i < DEVICE_COUNT_RESOURCE; ++i) { + pci_dev_for_each_resource(dev, r) { /* zap the 2nd function of the winbond chip */ - if (dev->resource[i].flags & IORESOURCE_IO - && dev->bus->number == 0 && dev->devfn == 0x81) - dev->resource[i].flags &= ~IORESOURCE_IO; - if (dev->resource[i].start == 0 && dev->resource[i].end) { - dev->resource[i].flags = 0; - dev->resource[i].end = 0; + if (dev->bus->number == 0 && dev->devfn == 0x81 && + r->flags & IORESOURCE_IO) + r->flags &= ~IORESOURCE_IO; + if (r->start == 0 && r->end) { + r->flags = 0; + r->end = 0; } } } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_WINBOND, PCI_DEVICE_ID_WINBOND_82C105, fixup_winbond_82c105); +static enum pci_bus_speed prop_to_pci_speed(u32 prop) +{ + switch (prop) { + case 0x01: + return PCIE_SPEED_2_5GT; + case 0x02: + return PCIE_SPEED_5_0GT; + case 0x04: + return PCIE_SPEED_8_0GT; + case 0x08: + return PCIE_SPEED_16_0GT; + case 0x10: + return PCIE_SPEED_32_0GT; + default: + pr_debug("Unexpected PCI link speed property value\n"); + return PCI_SPEED_UNKNOWN; + } +} + int pseries_root_bridge_prepare(struct pci_host_bridge *bridge) { struct device_node *dn, *pdn; @@ -322,35 +316,7 @@ int pseries_root_bridge_prepare(struct pci_host_bridge *bridge) return 0; } - switch (pcie_link_speed_stats[0]) { - case 0x01: - bus->max_bus_speed = PCIE_SPEED_2_5GT; - break; - case 0x02: - bus->max_bus_speed = PCIE_SPEED_5_0GT; - break; - case 0x04: - bus->max_bus_speed = PCIE_SPEED_8_0GT; - break; - default: - bus->max_bus_speed = PCI_SPEED_UNKNOWN; - break; - } - - switch (pcie_link_speed_stats[1]) { - case 0x01: - bus->cur_bus_speed = PCIE_SPEED_2_5GT; - break; - case 0x02: - bus->cur_bus_speed = PCIE_SPEED_5_0GT; - break; - case 0x04: - bus->cur_bus_speed = PCIE_SPEED_8_0GT; - break; - default: - bus->cur_bus_speed = PCI_SPEED_UNKNOWN; - break; - } - + bus->max_bus_speed = prop_to_pci_speed(pcie_link_speed_stats[0]); + bus->cur_bus_speed = prop_to_pci_speed(pcie_link_speed_stats[1]); return 0; } diff --git a/arch/powerpc/platforms/pseries/pci_dlpar.c b/arch/powerpc/platforms/pseries/pci_dlpar.c index 361986e4354e..4448386268d9 100644 --- a/arch/powerpc/platforms/pseries/pci_dlpar.c +++ b/arch/powerpc/platforms/pseries/pci_dlpar.c @@ -33,11 +33,15 @@ struct pci_controller *init_phb_dynamic(struct device_node *dn) pci_devs_phb_init_dynamic(phb); + pseries_msi_allocate_domains(phb); + + ppc_iommu_register_device(phb); + /* Create EEH devices for the PHB */ - eeh_dev_phb_init_dynamic(phb); + eeh_phb_pe_create(phb); if (dn->child) - eeh_add_device_tree_early(PCI_DN(dn)); + pseries_eeh_init_edev_recursive(PCI_DN(dn)); pcibios_scan_phb(phb); pcibios_finish_adding_to_bus(phb->bus); @@ -50,6 +54,7 @@ EXPORT_SYMBOL_GPL(init_phb_dynamic); int remove_phb_dynamic(struct pci_controller *phb) { struct pci_bus *b = phb->bus; + struct pci_host_bridge *host_bridge = to_pci_host_bridge(b->bridge); struct resource *res; int rc, i; @@ -73,10 +78,18 @@ int remove_phb_dynamic(struct pci_controller *phb) } } + ppc_iommu_unregister_device(phb); + + pseries_msi_free_domains(phb); + + /* Keep a reference so phb isn't freed yet */ + get_device(&host_bridge->dev); + /* Remove the PCI bus and unregister the bridge device from sysfs */ phb->bus = NULL; pci_remove_bus(b); - device_unregister(b->bridge); + host_bridge->bus = NULL; + device_unregister(&host_bridge->dev); /* Now release the IO resource */ if (res->flags & IORESOURCE_IO) @@ -95,6 +108,7 @@ int remove_phb_dynamic(struct pci_controller *phb) * the pcibios_free_controller_deferred() callback; * see pseries_root_bridge_prepare(). */ + put_device(&host_bridge->dev); return 0; } diff --git a/arch/powerpc/platforms/pseries/plpks-secvar.c b/arch/powerpc/platforms/pseries/plpks-secvar.c new file mode 100644 index 000000000000..257fd1f8bc19 --- /dev/null +++ b/arch/powerpc/platforms/pseries/plpks-secvar.c @@ -0,0 +1,217 @@ +// SPDX-License-Identifier: GPL-2.0-only + +// Secure variable implementation using the PowerVM LPAR Platform KeyStore (PLPKS) +// +// Copyright 2022, 2023 IBM Corporation +// Authors: Russell Currey +// Andrew Donnellan +// Nayna Jain + +#define pr_fmt(fmt) "secvar: "fmt + +#include <linux/printk.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/kobject.h> +#include <linux/nls.h> +#include <asm/machdep.h> +#include <asm/secvar.h> +#include <asm/plpks.h> + +// Config attributes for sysfs +#define PLPKS_CONFIG_ATTR(name, fmt, func) \ + static ssize_t name##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + char *buf) \ + { \ + return sysfs_emit(buf, fmt, func()); \ + } \ + static struct kobj_attribute attr_##name = __ATTR_RO(name) + +PLPKS_CONFIG_ATTR(version, "%u\n", plpks_get_version); +PLPKS_CONFIG_ATTR(max_object_size, "%u\n", plpks_get_maxobjectsize); +PLPKS_CONFIG_ATTR(total_size, "%u\n", plpks_get_totalsize); +PLPKS_CONFIG_ATTR(used_space, "%u\n", plpks_get_usedspace); +PLPKS_CONFIG_ATTR(supported_policies, "%08x\n", plpks_get_supportedpolicies); +PLPKS_CONFIG_ATTR(signed_update_algorithms, "%016llx\n", plpks_get_signedupdatealgorithms); + +static const struct attribute *config_attrs[] = { + &attr_version.attr, + &attr_max_object_size.attr, + &attr_total_size.attr, + &attr_used_space.attr, + &attr_supported_policies.attr, + &attr_signed_update_algorithms.attr, + NULL, +}; + +static u32 get_policy(const char *name) +{ + if ((strcmp(name, "db") == 0) || + (strcmp(name, "dbx") == 0) || + (strcmp(name, "grubdb") == 0) || + (strcmp(name, "grubdbx") == 0) || + (strcmp(name, "sbat") == 0)) + return (PLPKS_WORLDREADABLE | PLPKS_SIGNEDUPDATE); + else + return PLPKS_SIGNEDUPDATE; +} + +static const char * const plpks_var_names[] = { + "PK", + "KEK", + "db", + "dbx", + "grubdb", + "grubdbx", + "sbat", + "moduledb", + "trustedcadb", + NULL, +}; + +static int plpks_get_variable(const char *key, u64 key_len, u8 *data, + u64 *data_size) +{ + struct plpks_var var = {0}; + int rc = 0; + + // We subtract 1 from key_len because we don't need to include the + // null terminator at the end of the string + var.name = kcalloc(key_len - 1, sizeof(wchar_t), GFP_KERNEL); + if (!var.name) + return -ENOMEM; + rc = utf8s_to_utf16s(key, key_len - 1, UTF16_LITTLE_ENDIAN, (wchar_t *)var.name, + key_len - 1); + if (rc < 0) + goto err; + var.namelen = rc * 2; + + var.os = PLPKS_VAR_LINUX; + if (data) { + var.data = data; + var.datalen = *data_size; + } + rc = plpks_read_os_var(&var); + + if (rc) + goto err; + + *data_size = var.datalen; + +err: + kfree(var.name); + if (rc && rc != -ENOENT) { + pr_err("Failed to read variable '%s': %d\n", key, rc); + // Return -EIO since userspace probably doesn't care about the + // specific error + rc = -EIO; + } + return rc; +} + +static int plpks_set_variable(const char *key, u64 key_len, u8 *data, + u64 data_size) +{ + struct plpks_var var = {0}; + int rc = 0; + u64 flags; + + // Secure variables need to be prefixed with 8 bytes of flags. + // We only want to perform the write if we have at least one byte of data. + if (data_size <= sizeof(flags)) + return -EINVAL; + + // We subtract 1 from key_len because we don't need to include the + // null terminator at the end of the string + var.name = kcalloc(key_len - 1, sizeof(wchar_t), GFP_KERNEL); + if (!var.name) + return -ENOMEM; + rc = utf8s_to_utf16s(key, key_len - 1, UTF16_LITTLE_ENDIAN, (wchar_t *)var.name, + key_len - 1); + if (rc < 0) + goto err; + var.namelen = rc * 2; + + // Flags are contained in the first 8 bytes of the buffer, and are always big-endian + flags = be64_to_cpup((__be64 *)data); + + var.datalen = data_size - sizeof(flags); + var.data = data + sizeof(flags); + var.os = PLPKS_VAR_LINUX; + var.policy = get_policy(key); + + // Unlike in the read case, the plpks error code can be useful to + // userspace on write, so we return it rather than just -EIO + rc = plpks_signed_update_var(&var, flags); + +err: + kfree(var.name); + return rc; +} + +// PLPKS dynamic secure boot doesn't give us a format string in the same way OPAL does. +// Instead, report the format using the SB_VERSION variable in the keystore. +// The string is made up by us, and takes the form "ibm,plpks-sb-v<n>" (or "ibm,plpks-sb-unknown" +// if the SB_VERSION variable doesn't exist). Hypervisor defines the SB_VERSION variable as a +// "1 byte unsigned integer value". +static ssize_t plpks_secvar_format(char *buf, size_t bufsize) +{ + struct plpks_var var = {0}; + ssize_t ret; + u8 version; + + var.component = NULL; + // Only the signed variables have null bytes in their names, this one doesn't + var.name = "SB_VERSION"; + var.namelen = strlen(var.name); + var.datalen = 1; + var.data = &version; + + // Unlike the other vars, SB_VERSION is owned by firmware instead of the OS + ret = plpks_read_fw_var(&var); + if (ret) { + if (ret == -ENOENT) { + ret = snprintf(buf, bufsize, "ibm,plpks-sb-unknown"); + } else { + pr_err("Error %ld reading SB_VERSION from firmware\n", ret); + ret = -EIO; + } + goto err; + } + + ret = snprintf(buf, bufsize, "ibm,plpks-sb-v%hhu", version); +err: + return ret; +} + +static int plpks_max_size(u64 *max_size) +{ + // The max object size reported by the hypervisor is accurate for the + // object itself, but we use the first 8 bytes of data on write as the + // signed update flags, so the max size a user can write is larger. + *max_size = (u64)plpks_get_maxobjectsize() + sizeof(u64); + + return 0; +} + + +static const struct secvar_operations plpks_secvar_ops = { + .get = plpks_get_variable, + .set = plpks_set_variable, + .format = plpks_secvar_format, + .max_size = plpks_max_size, + .config_attrs = config_attrs, + .var_names = plpks_var_names, +}; + +static int plpks_secvar_init(void) +{ + if (!plpks_is_available()) + return -ENODEV; + + return set_secvar_ops(&plpks_secvar_ops); +} +machine_device_initcall(pseries, plpks_secvar_init); diff --git a/arch/powerpc/platforms/pseries/plpks.c b/arch/powerpc/platforms/pseries/plpks.c new file mode 100644 index 000000000000..febe18f251d0 --- /dev/null +++ b/arch/powerpc/platforms/pseries/plpks.c @@ -0,0 +1,711 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * POWER LPAR Platform KeyStore(PLPKS) + * Copyright (C) 2022 IBM Corporation + * Author: Nayna Jain <nayna@linux.ibm.com> + * + * Provides access to variables stored in Power LPAR Platform KeyStore(PLPKS). + */ + +#define pr_fmt(fmt) "plpks: " fmt + +#include <linux/delay.h> +#include <linux/errno.h> +#include <linux/io.h> +#include <linux/printk.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/of_fdt.h> +#include <linux/libfdt.h> +#include <linux/memblock.h> +#include <asm/hvcall.h> +#include <asm/machdep.h> +#include <asm/plpks.h> +#include <asm/firmware.h> + +static u8 *ospassword; +static u16 ospasswordlength; + +// Retrieved with H_PKS_GET_CONFIG +static u8 version; +static u16 objoverhead; +static u16 maxpwsize; +static u16 maxobjsize; +static s16 maxobjlabelsize; +static u32 totalsize; +static u32 usedspace; +static u32 supportedpolicies; +static u32 maxlargeobjectsize; +static u64 signedupdatealgorithms; + +struct plpks_auth { + u8 version; + u8 consumer; + __be64 rsvd0; + __be32 rsvd1; + __be16 passwordlength; + u8 password[]; +} __packed __aligned(16); + +struct label_attr { + u8 prefix[8]; + u8 version; + u8 os; + u8 length; + u8 reserved[5]; +}; + +struct label { + struct label_attr attr; + u8 name[PLPKS_MAX_NAME_SIZE]; + size_t size; +}; + +static int pseries_status_to_err(int rc) +{ + int err; + + switch (rc) { + case H_SUCCESS: + err = 0; + break; + case H_FUNCTION: + err = -ENXIO; + break; + case H_PARAMETER: + case H_P2: + case H_P3: + case H_P4: + case H_P5: + case H_P6: + err = -EINVAL; + break; + case H_NOT_FOUND: + err = -ENOENT; + break; + case H_BUSY: + case H_LONG_BUSY_ORDER_1_MSEC: + case H_LONG_BUSY_ORDER_10_MSEC: + case H_LONG_BUSY_ORDER_100_MSEC: + case H_LONG_BUSY_ORDER_1_SEC: + case H_LONG_BUSY_ORDER_10_SEC: + case H_LONG_BUSY_ORDER_100_SEC: + err = -EBUSY; + break; + case H_AUTHORITY: + err = -EPERM; + break; + case H_NO_MEM: + err = -ENOMEM; + break; + case H_RESOURCE: + err = -EEXIST; + break; + case H_TOO_BIG: + err = -EFBIG; + break; + case H_STATE: + err = -EIO; + break; + case H_R_STATE: + err = -EIO; + break; + case H_IN_USE: + err = -EEXIST; + break; + case H_ABORTED: + err = -EIO; + break; + default: + err = -EINVAL; + } + + pr_debug("Converted hypervisor code %d to Linux %d\n", rc, err); + + return err; +} + +static int plpks_gen_password(void) +{ + unsigned long retbuf[PLPAR_HCALL_BUFSIZE] = { 0 }; + u8 *password, consumer = PLPKS_OS_OWNER; + int rc; + + // If we booted from kexec, we could be reusing an existing password already + if (ospassword) { + pr_debug("Password of length %u already in use\n", ospasswordlength); + return 0; + } + + // The password must not cross a page boundary, so we align to the next power of 2 + password = kzalloc(roundup_pow_of_two(maxpwsize), GFP_KERNEL); + if (!password) + return -ENOMEM; + + rc = plpar_hcall(H_PKS_GEN_PASSWORD, retbuf, consumer, 0, + virt_to_phys(password), maxpwsize); + + if (!rc) { + ospasswordlength = maxpwsize; + ospassword = kzalloc(maxpwsize, GFP_KERNEL); + if (!ospassword) { + kfree_sensitive(password); + return -ENOMEM; + } + memcpy(ospassword, password, ospasswordlength); + } else { + if (rc == H_IN_USE) { + pr_warn("Password already set - authenticated operations will fail\n"); + rc = 0; + } else { + goto out; + } + } +out: + kfree_sensitive(password); + + return pseries_status_to_err(rc); +} + +static struct plpks_auth *construct_auth(u8 consumer) +{ + struct plpks_auth *auth; + + if (consumer > PLPKS_OS_OWNER) + return ERR_PTR(-EINVAL); + + // The auth structure must not cross a page boundary and must be + // 16 byte aligned. We align to the next largest power of 2 + auth = kzalloc(roundup_pow_of_two(struct_size(auth, password, maxpwsize)), GFP_KERNEL); + if (!auth) + return ERR_PTR(-ENOMEM); + + auth->version = 1; + auth->consumer = consumer; + + if (consumer == PLPKS_FW_OWNER || consumer == PLPKS_BOOTLOADER_OWNER) + return auth; + + memcpy(auth->password, ospassword, ospasswordlength); + + auth->passwordlength = cpu_to_be16(ospasswordlength); + + return auth; +} + +/* + * Label is combination of label attributes + name. + * Label attributes are used internally by kernel and not exposed to the user. + */ +static struct label *construct_label(char *component, u8 varos, u8 *name, + u16 namelen) +{ + struct label *label; + size_t slen = 0; + + if (!name || namelen > PLPKS_MAX_NAME_SIZE) + return ERR_PTR(-EINVAL); + + // Support NULL component for signed updates + if (component) { + slen = strlen(component); + if (slen > sizeof(label->attr.prefix)) + return ERR_PTR(-EINVAL); + } + + // The label structure must not cross a page boundary, so we align to the next power of 2 + label = kzalloc(roundup_pow_of_two(sizeof(*label)), GFP_KERNEL); + if (!label) + return ERR_PTR(-ENOMEM); + + if (component) + memcpy(&label->attr.prefix, component, slen); + + label->attr.version = PLPKS_LABEL_VERSION; + label->attr.os = varos; + label->attr.length = PLPKS_MAX_LABEL_ATTR_SIZE; + memcpy(&label->name, name, namelen); + + label->size = sizeof(struct label_attr) + namelen; + + return label; +} + +static int _plpks_get_config(void) +{ + unsigned long retbuf[PLPAR_HCALL_BUFSIZE] = { 0 }; + struct config { + u8 version; + u8 flags; + __be16 rsvd0; + __be16 objoverhead; + __be16 maxpwsize; + __be16 maxobjlabelsize; + __be16 maxobjsize; + __be32 totalsize; + __be32 usedspace; + __be32 supportedpolicies; + __be32 maxlargeobjectsize; + __be64 signedupdatealgorithms; + u8 rsvd1[476]; + } __packed * config; + size_t size; + int rc = 0; + + size = sizeof(*config); + + // Config struct must not cross a page boundary. So long as the struct + // size is a power of 2, this should be fine as alignment is guaranteed + config = kzalloc(size, GFP_KERNEL); + if (!config) { + rc = -ENOMEM; + goto err; + } + + rc = plpar_hcall(H_PKS_GET_CONFIG, retbuf, virt_to_phys(config), size); + + if (rc != H_SUCCESS) { + rc = pseries_status_to_err(rc); + goto err; + } + + version = config->version; + objoverhead = be16_to_cpu(config->objoverhead); + maxpwsize = be16_to_cpu(config->maxpwsize); + maxobjsize = be16_to_cpu(config->maxobjsize); + maxobjlabelsize = be16_to_cpu(config->maxobjlabelsize); + totalsize = be32_to_cpu(config->totalsize); + usedspace = be32_to_cpu(config->usedspace); + supportedpolicies = be32_to_cpu(config->supportedpolicies); + maxlargeobjectsize = be32_to_cpu(config->maxlargeobjectsize); + signedupdatealgorithms = be64_to_cpu(config->signedupdatealgorithms); + + // Validate that the numbers we get back match the requirements of the spec + if (maxpwsize < 32) { + pr_err("Invalid Max Password Size received from hypervisor (%d < 32)\n", maxpwsize); + rc = -EIO; + goto err; + } + + if (maxobjlabelsize < 255) { + pr_err("Invalid Max Object Label Size received from hypervisor (%d < 255)\n", + maxobjlabelsize); + rc = -EIO; + goto err; + } + + if (totalsize < 4096) { + pr_err("Invalid Total Size received from hypervisor (%d < 4096)\n", totalsize); + rc = -EIO; + goto err; + } + + if (version >= 3 && maxlargeobjectsize >= 65536 && maxobjsize != 0xFFFF) { + pr_err("Invalid Max Object Size (0x%x != 0xFFFF)\n", maxobjsize); + rc = -EIO; + goto err; + } + +err: + kfree(config); + return rc; +} + +u8 plpks_get_version(void) +{ + return version; +} + +u16 plpks_get_objoverhead(void) +{ + return objoverhead; +} + +u16 plpks_get_maxpwsize(void) +{ + return maxpwsize; +} + +u16 plpks_get_maxobjectsize(void) +{ + return maxobjsize; +} + +u16 plpks_get_maxobjectlabelsize(void) +{ + return maxobjlabelsize; +} + +u32 plpks_get_totalsize(void) +{ + return totalsize; +} + +u32 plpks_get_usedspace(void) +{ + // Unlike other config values, usedspace regularly changes as objects + // are updated, so we need to refresh. + int rc = _plpks_get_config(); + if (rc) { + pr_err("Couldn't get config, rc: %d\n", rc); + return 0; + } + return usedspace; +} + +u32 plpks_get_supportedpolicies(void) +{ + return supportedpolicies; +} + +u32 plpks_get_maxlargeobjectsize(void) +{ + return maxlargeobjectsize; +} + +u64 plpks_get_signedupdatealgorithms(void) +{ + return signedupdatealgorithms; +} + +u16 plpks_get_passwordlen(void) +{ + return ospasswordlength; +} + +bool plpks_is_available(void) +{ + int rc; + + if (!firmware_has_feature(FW_FEATURE_PLPKS)) + return false; + + rc = _plpks_get_config(); + if (rc) + return false; + + return true; +} + +static int plpks_confirm_object_flushed(struct label *label, + struct plpks_auth *auth) +{ + unsigned long retbuf[PLPAR_HCALL_BUFSIZE] = { 0 }; + bool timed_out = true; + u64 timeout = 0; + u8 status; + int rc; + + do { + rc = plpar_hcall(H_PKS_CONFIRM_OBJECT_FLUSHED, retbuf, + virt_to_phys(auth), virt_to_phys(label), + label->size); + + status = retbuf[0]; + if (rc) { + timed_out = false; + if (rc == H_NOT_FOUND && status == 1) + rc = 0; + break; + } + + if (!rc && status == 1) { + timed_out = false; + break; + } + + usleep_range(PLPKS_FLUSH_SLEEP, + PLPKS_FLUSH_SLEEP + PLPKS_FLUSH_SLEEP_RANGE); + timeout = timeout + PLPKS_FLUSH_SLEEP; + } while (timeout < PLPKS_MAX_TIMEOUT); + + if (timed_out) + return -ETIMEDOUT; + + return pseries_status_to_err(rc); +} + +int plpks_signed_update_var(struct plpks_var *var, u64 flags) +{ + unsigned long retbuf[PLPAR_HCALL9_BUFSIZE] = {0}; + int rc; + struct label *label; + struct plpks_auth *auth; + u64 continuetoken = 0; + u64 timeout = 0; + + if (!var->data || var->datalen <= 0 || var->namelen > PLPKS_MAX_NAME_SIZE) + return -EINVAL; + + if (!(var->policy & PLPKS_SIGNEDUPDATE)) + return -EINVAL; + + // Signed updates need the component to be NULL. + if (var->component) + return -EINVAL; + + auth = construct_auth(PLPKS_OS_OWNER); + if (IS_ERR(auth)) + return PTR_ERR(auth); + + label = construct_label(var->component, var->os, var->name, var->namelen); + if (IS_ERR(label)) { + rc = PTR_ERR(label); + goto out; + } + + do { + rc = plpar_hcall9(H_PKS_SIGNED_UPDATE, retbuf, + virt_to_phys(auth), virt_to_phys(label), + label->size, var->policy, flags, + virt_to_phys(var->data), var->datalen, + continuetoken); + + continuetoken = retbuf[0]; + if (pseries_status_to_err(rc) == -EBUSY) { + int delay_ms = get_longbusy_msecs(rc); + mdelay(delay_ms); + timeout += delay_ms; + } + rc = pseries_status_to_err(rc); + } while (rc == -EBUSY && timeout < PLPKS_MAX_TIMEOUT); + + if (!rc) + rc = plpks_confirm_object_flushed(label, auth); + + kfree(label); +out: + kfree(auth); + + return rc; +} + +int plpks_write_var(struct plpks_var var) +{ + unsigned long retbuf[PLPAR_HCALL_BUFSIZE] = { 0 }; + struct plpks_auth *auth; + struct label *label; + int rc; + + if (!var.component || !var.data || var.datalen <= 0 || + var.namelen > PLPKS_MAX_NAME_SIZE || var.datalen > PLPKS_MAX_DATA_SIZE) + return -EINVAL; + + if (var.policy & PLPKS_SIGNEDUPDATE) + return -EINVAL; + + auth = construct_auth(PLPKS_OS_OWNER); + if (IS_ERR(auth)) + return PTR_ERR(auth); + + label = construct_label(var.component, var.os, var.name, var.namelen); + if (IS_ERR(label)) { + rc = PTR_ERR(label); + goto out; + } + + rc = plpar_hcall(H_PKS_WRITE_OBJECT, retbuf, virt_to_phys(auth), + virt_to_phys(label), label->size, var.policy, + virt_to_phys(var.data), var.datalen); + + if (!rc) + rc = plpks_confirm_object_flushed(label, auth); + + rc = pseries_status_to_err(rc); + kfree(label); +out: + kfree(auth); + + return rc; +} + +int plpks_remove_var(char *component, u8 varos, struct plpks_var_name vname) +{ + unsigned long retbuf[PLPAR_HCALL_BUFSIZE] = { 0 }; + struct plpks_auth *auth; + struct label *label; + int rc; + + if (vname.namelen > PLPKS_MAX_NAME_SIZE) + return -EINVAL; + + auth = construct_auth(PLPKS_OS_OWNER); + if (IS_ERR(auth)) + return PTR_ERR(auth); + + label = construct_label(component, varos, vname.name, vname.namelen); + if (IS_ERR(label)) { + rc = PTR_ERR(label); + goto out; + } + + rc = plpar_hcall(H_PKS_REMOVE_OBJECT, retbuf, virt_to_phys(auth), + virt_to_phys(label), label->size); + + if (!rc) + rc = plpks_confirm_object_flushed(label, auth); + + rc = pseries_status_to_err(rc); + kfree(label); +out: + kfree(auth); + + return rc; +} + +static int plpks_read_var(u8 consumer, struct plpks_var *var) +{ + unsigned long retbuf[PLPAR_HCALL_BUFSIZE] = { 0 }; + struct plpks_auth *auth; + struct label *label = NULL; + u8 *output; + int rc; + + if (var->namelen > PLPKS_MAX_NAME_SIZE) + return -EINVAL; + + auth = construct_auth(consumer); + if (IS_ERR(auth)) + return PTR_ERR(auth); + + if (consumer == PLPKS_OS_OWNER) { + label = construct_label(var->component, var->os, var->name, + var->namelen); + if (IS_ERR(label)) { + rc = PTR_ERR(label); + goto out_free_auth; + } + } + + output = kzalloc(maxobjsize, GFP_KERNEL); + if (!output) { + rc = -ENOMEM; + goto out_free_label; + } + + if (consumer == PLPKS_OS_OWNER) + rc = plpar_hcall(H_PKS_READ_OBJECT, retbuf, virt_to_phys(auth), + virt_to_phys(label), label->size, virt_to_phys(output), + maxobjsize); + else + rc = plpar_hcall(H_PKS_READ_OBJECT, retbuf, virt_to_phys(auth), + virt_to_phys(var->name), var->namelen, virt_to_phys(output), + maxobjsize); + + + if (rc != H_SUCCESS) { + rc = pseries_status_to_err(rc); + goto out_free_output; + } + + if (!var->data || var->datalen > retbuf[0]) + var->datalen = retbuf[0]; + + var->policy = retbuf[1]; + + if (var->data) + memcpy(var->data, output, var->datalen); + + rc = 0; + +out_free_output: + kfree(output); +out_free_label: + kfree(label); +out_free_auth: + kfree(auth); + + return rc; +} + +int plpks_read_os_var(struct plpks_var *var) +{ + return plpks_read_var(PLPKS_OS_OWNER, var); +} + +int plpks_read_fw_var(struct plpks_var *var) +{ + return plpks_read_var(PLPKS_FW_OWNER, var); +} + +int plpks_read_bootloader_var(struct plpks_var *var) +{ + return plpks_read_var(PLPKS_BOOTLOADER_OWNER, var); +} + +int plpks_populate_fdt(void *fdt) +{ + int chosen_offset = fdt_path_offset(fdt, "/chosen"); + + if (chosen_offset < 0) { + pr_err("Can't find chosen node: %s\n", + fdt_strerror(chosen_offset)); + return chosen_offset; + } + + return fdt_setprop(fdt, chosen_offset, "ibm,plpks-pw", ospassword, ospasswordlength); +} + +// Once a password is registered with the hypervisor it cannot be cleared without +// rebooting the LPAR, so to keep using the PLPKS across kexec boots we need to +// recover the previous password from the FDT. +// +// There are a few challenges here. We don't want the password to be visible to +// users, so we need to clear it from the FDT. This has to be done in early boot. +// Clearing it from the FDT would make the FDT's checksum invalid, so we have to +// manually cause the checksum to be recalculated. +void __init plpks_early_init_devtree(void) +{ + void *fdt = initial_boot_params; + int chosen_node = fdt_path_offset(fdt, "/chosen"); + const u8 *password; + int len; + + if (chosen_node < 0) + return; + + password = fdt_getprop(fdt, chosen_node, "ibm,plpks-pw", &len); + if (len <= 0) { + pr_debug("Couldn't find ibm,plpks-pw node.\n"); + return; + } + + ospassword = memblock_alloc_raw(len, SMP_CACHE_BYTES); + if (!ospassword) { + pr_err("Error allocating memory for password.\n"); + goto out; + } + + memcpy(ospassword, password, len); + ospasswordlength = (u16)len; + +out: + fdt_nop_property(fdt, chosen_node, "ibm,plpks-pw"); + // Since we've cleared the password, we must update the FDT checksum + early_init_dt_verify(fdt); +} + +static __init int pseries_plpks_init(void) +{ + int rc; + + if (!firmware_has_feature(FW_FEATURE_PLPKS)) + return -ENODEV; + + rc = _plpks_get_config(); + + if (rc) { + pr_err("POWER LPAR Platform KeyStore is not supported or enabled\n"); + return rc; + } + + rc = plpks_gen_password(); + if (rc) + pr_err("Failed setting POWER LPAR Platform KeyStore Password\n"); + else + pr_info("POWER LPAR Platform KeyStore initialized successfully\n"); + + return rc; +} +machine_arch_initcall(pseries, pseries_plpks_init); diff --git a/arch/powerpc/platforms/pseries/plpks_sed_ops.c b/arch/powerpc/platforms/pseries/plpks_sed_ops.c new file mode 100644 index 000000000000..7c873c9589ef --- /dev/null +++ b/arch/powerpc/platforms/pseries/plpks_sed_ops.c @@ -0,0 +1,131 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * POWER Platform specific code for non-volatile SED key access + * Copyright (C) 2022 IBM Corporation + * + * Define operations for SED Opal to read/write keys + * from POWER LPAR Platform KeyStore(PLPKS). + * + * Self Encrypting Drives(SED) key storage using PLPKS + */ + +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/ioctl.h> +#include <linux/sed-opal-key.h> +#include <asm/plpks.h> + +static bool plpks_sed_initialized = false; +static bool plpks_sed_available = false; + +/* + * structure that contains all SED data + */ +struct plpks_sed_object_data { + u_char version; + u_char pad1[7]; + u_long authority; + u_long range; + u_int key_len; + u_char key[32]; +}; + +#define PLPKS_SED_OBJECT_DATA_V0 0 +#define PLPKS_SED_MANGLED_LABEL "/default/pri" +#define PLPKS_SED_COMPONENT "sed-opal" +#define PLPKS_SED_KEY "opal-boot-pin" + +/* + * authority is admin1 and range is global + */ +#define PLPKS_SED_AUTHORITY 0x0000000900010001 +#define PLPKS_SED_RANGE 0x0000080200000001 + +static void plpks_init_var(struct plpks_var *var, char *keyname) +{ + if (!plpks_sed_initialized) { + plpks_sed_initialized = true; + plpks_sed_available = plpks_is_available(); + if (!plpks_sed_available) + pr_err("SED: plpks not available\n"); + } + + var->name = keyname; + var->namelen = strlen(keyname); + if (strcmp(PLPKS_SED_KEY, keyname) == 0) { + var->name = PLPKS_SED_MANGLED_LABEL; + var->namelen = strlen(keyname); + } + var->policy = PLPKS_WORLDREADABLE; + var->os = PLPKS_VAR_COMMON; + var->data = NULL; + var->datalen = 0; + var->component = PLPKS_SED_COMPONENT; +} + +/* + * Read the SED Opal key from PLPKS given the label + */ +int sed_read_key(char *keyname, char *key, u_int *keylen) +{ + struct plpks_var var; + struct plpks_sed_object_data data; + int ret; + u_int len; + + plpks_init_var(&var, keyname); + + if (!plpks_sed_available) + return -EOPNOTSUPP; + + var.data = (u8 *)&data; + var.datalen = sizeof(data); + + ret = plpks_read_os_var(&var); + if (ret != 0) + return ret; + + len = min_t(u16, be32_to_cpu(data.key_len), var.datalen); + memcpy(key, data.key, len); + key[len] = '\0'; + *keylen = len; + + return 0; +} + +/* + * Write the SED Opal key to PLPKS given the label + */ +int sed_write_key(char *keyname, char *key, u_int keylen) +{ + struct plpks_var var; + struct plpks_sed_object_data data; + struct plpks_var_name vname; + + plpks_init_var(&var, keyname); + + if (!plpks_sed_available) + return -EOPNOTSUPP; + + var.datalen = sizeof(struct plpks_sed_object_data); + var.data = (u8 *)&data; + + /* initialize SED object */ + data.version = PLPKS_SED_OBJECT_DATA_V0; + data.authority = cpu_to_be64(PLPKS_SED_AUTHORITY); + data.range = cpu_to_be64(PLPKS_SED_RANGE); + memset(&data.pad1, '\0', sizeof(data.pad1)); + data.key_len = cpu_to_be32(keylen); + memcpy(data.key, (char *)key, keylen); + + /* + * Key update requires remove first. The return value + * is ignored since it's okay if the key doesn't exist. + */ + vname.namelen = var.namelen; + vname.name = var.name; + plpks_remove_var(var.component, var.os, vname); + + return plpks_write_var(var); +} diff --git a/arch/powerpc/platforms/pseries/pmem.c b/arch/powerpc/platforms/pseries/pmem.c index f860a897a9e0..3c290b9ed01b 100644 --- a/arch/powerpc/platforms/pseries/pmem.c +++ b/arch/powerpc/platforms/pseries/pmem.c @@ -15,7 +15,6 @@ #include <linux/of.h> #include <linux/of_platform.h> #include <linux/slab.h> -#include <asm/prom.h> #include <asm/rtas.h> #include <asm/firmware.h> #include <asm/machdep.h> @@ -24,7 +23,6 @@ #include <asm/topology.h> #include "pseries.h" -#include "offline_states.h" static struct device_node *pmem_node; @@ -140,13 +138,19 @@ int dlpar_hp_pmem(struct pseries_hp_errorlog *hp_elog) return rc; } -const struct of_device_id drc_pmem_match[] = { +static const struct of_device_id drc_pmem_match[] = { { .type = "ibm,persistent-memory", }, {} }; static int pseries_pmem_init(void) { + /* + * Only supported on POWER8 and above. + */ + if (!cpu_has_feature(CPU_FTR_ARCH_207S)) + return 0; + pmem_node = of_find_node_by_type(NULL, "ibm,persistent-memory"); if (!pmem_node) return 0; diff --git a/arch/powerpc/platforms/pseries/power.c b/arch/powerpc/platforms/pseries/power.c index ee343ec6ab94..3676cb297767 100644 --- a/arch/powerpc/platforms/pseries/power.c +++ b/arch/powerpc/platforms/pseries/power.c @@ -51,7 +51,7 @@ static struct attribute *g[] = { NULL, }; -static struct attribute_group attr_group = { +static const struct attribute_group attr_group = { .attrs = g, }; diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h index 13fa370a87e4..bba4ad192b0f 100644 --- a/arch/powerpc/platforms/pseries/pseries.h +++ b/arch/powerpc/platforms/pseries/pseries.h @@ -11,7 +11,7 @@ struct device_node; -extern void request_event_sources_irqs(struct device_node *np, +void __init request_event_sources_irqs(struct device_node *np, irq_handler_t handler, const char *name); #include <linux/of.h> @@ -21,6 +21,7 @@ struct pt_regs; extern int pSeries_system_reset_exception(struct pt_regs *regs); extern int pSeries_machine_check_exception(struct pt_regs *regs); extern long pseries_machine_check_realmode(struct pt_regs *regs); +void pSeries_machine_check_log_err(void); #ifdef CONFIG_SMP extern void smp_init_pseries(void); @@ -33,19 +34,17 @@ int smp_query_cpu_stopped(unsigned int pcpu); #define QCSS_HARDWARE_ERROR -1 #define QCSS_HARDWARE_BUSY -2 #else -static inline void smp_init_pseries(void) { }; +static inline void smp_init_pseries(void) { } #endif extern void pseries_kexec_cpu_down(int crash_shutdown, int secondary); +void pseries_machine_kexec(struct kimage *image); extern void pSeries_final_fixup(void); /* Poweron flag used for enabling auto ups restart */ extern unsigned long rtas_poweron_auto; -/* Provided by HVC VIO */ -extern void hvc_vio_init_early(void); - /* Dynamic logical Partitioning/Mobility */ extern void dlpar_free_cc_nodes(struct device_node *); extern void dlpar_free_cc_property(struct property *); @@ -55,6 +54,8 @@ extern int dlpar_attach_node(struct device_node *, struct device_node *); extern int dlpar_detach_node(struct device_node *); extern int dlpar_acquire_drc(u32 drc_index); extern int dlpar_release_drc(u32 drc_index); +extern int dlpar_unisolate_drc(u32 drc_index); +extern void post_mobility_fixup(void); void queue_hotplug_event(struct pseries_hp_errorlog *hp_errlog); int handle_dlpar_errorlog(struct pseries_hp_errorlog *hp_errlog); @@ -75,11 +76,13 @@ static inline int dlpar_hp_pmem(struct pseries_hp_errorlog *hp_elog) #ifdef CONFIG_HOTPLUG_CPU int dlpar_cpu(struct pseries_hp_errorlog *hp_elog); +void pseries_cpu_hotplug_init(void); #else static inline int dlpar_cpu(struct pseries_hp_errorlog *hp_elog) { return -EOPNOTSUPP; } +static inline void pseries_cpu_hotplug_init(void) { } #endif /* PCI root bridge prepare function override for pseries */ @@ -87,8 +90,8 @@ struct pci_host_bridge; int pseries_root_bridge_prepare(struct pci_host_bridge *bridge); extern struct pci_controller_ops pseries_pci_controller_ops; - -unsigned long pseries_memory_block_size(void); +int pseries_msi_allocate_domains(struct pci_controller *phb); +void pseries_msi_free_domains(struct pci_controller *phb); extern int CMO_PrPSP; extern int CMO_SecPSP; @@ -111,7 +114,19 @@ static inline unsigned long cmo_get_page_size(void) int dlpar_workqueue_init(void); -void pseries_setup_rfi_flush(void); +extern u32 pseries_security_flavor; +void pseries_setup_security_mitigations(void); + +#ifdef CONFIG_PPC_64S_HASH_MMU void pseries_lpar_read_hblkrm_characteristics(void); +#else +static inline void pseries_lpar_read_hblkrm_characteristics(void) { } +#endif + +void pseries_rng_init(void); +#ifdef CONFIG_SPAPR_TCE_IOMMU +struct iommu_group *pSeries_pci_device_group(struct pci_controller *hose, + struct pci_dev *pdev); +#endif #endif /* _PSERIES_PSERIES_H */ diff --git a/arch/powerpc/platforms/pseries/pseries_energy.c b/arch/powerpc/platforms/pseries/pseries_energy.c index 09e98d301db0..2c661b798235 100644 --- a/arch/powerpc/platforms/pseries/pseries_energy.c +++ b/arch/powerpc/platforms/pseries/pseries_energy.c @@ -300,20 +300,22 @@ static struct device_attribute attr_percpu_deactivate_hint = static int __init pseries_energy_init(void) { int cpu, err; - struct device *cpu_dev; + struct device *cpu_dev, *dev_root; if (!firmware_has_feature(FW_FEATURE_BEST_ENERGY)) return 0; /* H_BEST_ENERGY hcall not supported */ /* Create the sysfs files */ - err = device_create_file(cpu_subsys.dev_root, - &attr_cpu_activate_hint_list); - if (!err) - err = device_create_file(cpu_subsys.dev_root, - &attr_cpu_deactivate_hint_list); + dev_root = bus_get_dev_root(&cpu_subsys); + if (dev_root) { + err = device_create_file(dev_root, &attr_cpu_activate_hint_list); + if (!err) + err = device_create_file(dev_root, &attr_cpu_deactivate_hint_list); + put_device(dev_root); + if (err) + return err; + } - if (err) - return err; for_each_possible_cpu(cpu) { cpu_dev = get_cpu_device(cpu); err = device_create_file(cpu_dev, @@ -337,14 +339,18 @@ static int __init pseries_energy_init(void) static void __exit pseries_energy_cleanup(void) { int cpu; - struct device *cpu_dev; + struct device *cpu_dev, *dev_root; if (!sysfs_entries) return; /* Remove the sysfs files */ - device_remove_file(cpu_subsys.dev_root, &attr_cpu_activate_hint_list); - device_remove_file(cpu_subsys.dev_root, &attr_cpu_deactivate_hint_list); + dev_root = bus_get_dev_root(&cpu_subsys); + if (dev_root) { + device_remove_file(dev_root, &attr_cpu_activate_hint_list); + device_remove_file(dev_root, &attr_cpu_deactivate_hint_list); + put_device(dev_root); + } for_each_possible_cpu(cpu) { cpu_dev = get_cpu_device(cpu); diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index 1d7f973c647b..adafd593d9d3 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -23,11 +23,6 @@ static DEFINE_SPINLOCK(ras_log_buf_lock); static int ras_check_exception_token; -static void mce_process_errlog_event(struct irq_work *work); -static struct irq_work mce_errlog_process_work = { - .func = mce_process_errlog_event, -}; - #define EPOW_SENSOR_TOKEN 9 #define EPOW_SENSOR_INDEX 0 @@ -60,11 +55,17 @@ struct pseries_mc_errorlog { * XX 2: Reserved. * XXX 3: Type of UE error. * - * For error_type != MC_ERROR_TYPE_UE + * For error_type == MC_ERROR_TYPE_SLB/ERAT/TLB * XXXXXXXX * X 1: Effective address provided. * XXXXX 5: Reserved. * XX 2: Type of SLB/ERAT/TLB error. + * + * For error_type == MC_ERROR_TYPE_CTRL_MEM_ACCESS + * XXXXXXXX + * X 1: Error causing address provided. + * XXX 3: Type of error. + * XXXX 4: Reserved. */ u8 sub_err_type; u8 reserved_1[6]; @@ -80,6 +81,7 @@ struct pseries_mc_errorlog { #define MC_ERROR_TYPE_TLB 0x04 #define MC_ERROR_TYPE_D_CACHE 0x05 #define MC_ERROR_TYPE_I_CACHE 0x07 +#define MC_ERROR_TYPE_CTRL_MEM_ACCESS 0x08 /* RTAS pseries MCE error sub types */ #define MC_ERROR_UE_INDETERMINATE 0 @@ -90,6 +92,7 @@ struct pseries_mc_errorlog { #define UE_EFFECTIVE_ADDR_PROVIDED 0x40 #define UE_LOGICAL_ADDR_PROVIDED 0x20 +#define MC_EFFECTIVE_ADDR_PROVIDED 0x80 #define MC_ERROR_SLB_PARITY 0 #define MC_ERROR_SLB_MULTIHIT 1 @@ -103,6 +106,9 @@ struct pseries_mc_errorlog { #define MC_ERROR_TLB_MULTIHIT 2 #define MC_ERROR_TLB_INDETERMINATE 3 +#define MC_ERROR_CTRL_MEM_ACCESS_PTABLE_WALK 0 +#define MC_ERROR_CTRL_MEM_ACCESS_OP_ACCESS 1 + static inline u8 rtas_mc_error_sub_type(const struct pseries_mc_errorlog *mlog) { switch (mlog->error_type) { @@ -112,6 +118,8 @@ static inline u8 rtas_mc_error_sub_type(const struct pseries_mc_errorlog *mlog) case MC_ERROR_TYPE_ERAT: case MC_ERROR_TYPE_TLB: return (mlog->sub_err_type & 0x03); + case MC_ERROR_TYPE_CTRL_MEM_ACCESS: + return (mlog->sub_err_type & 0x70) >> 4; default: return 0; } @@ -122,7 +130,7 @@ static inline u8 rtas_mc_error_sub_type(const struct pseries_mc_errorlog *mlog) * devices or systems (e.g. hugepages) that have not been initialized at the * subsys stage. */ -int __init init_ras_hotplug_IRQ(void) +static int __init init_ras_hotplug_IRQ(void) { struct device_node *np; @@ -147,7 +155,7 @@ static int __init init_ras_IRQ(void) { struct device_node *np; - ras_check_exception_token = rtas_token("check-exception"); + ras_check_exception_token = rtas_function_token(RTAS_FN_CHECK_EXCEPTION); /* Internal Errors */ np = of_find_node_by_path("/event-sources/internal-errors"); @@ -184,7 +192,6 @@ static void handle_system_shutdown(char event_modifier) case EPOW_SHUTDOWN_ON_UPS: pr_emerg("Loss of system power detected. System is running on" " UPS/battery. Check RTAS error log for details\n"); - orderly_poweroff(true); break; case EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS: @@ -316,12 +323,10 @@ static irqreturn_t ras_hotplug_interrupt(int irq, void *dev_id) /* Handle environmental and power warning (EPOW) interrupts. */ static irqreturn_t ras_epow_interrupt(int irq, void *dev_id) { - int status; int state; int critical; - status = rtas_get_sensor_fast(EPOW_SENSOR_TOKEN, EPOW_SENSOR_INDEX, - &state); + rtas_get_sensor_fast(EPOW_SENSOR_TOKEN, EPOW_SENSOR_INDEX, &state); if (state > 3) critical = 1; /* Time Critical */ @@ -330,12 +335,9 @@ static irqreturn_t ras_epow_interrupt(int irq, void *dev_id) spin_lock(&ras_log_buf_lock); - status = rtas_call(ras_check_exception_token, 6, 1, NULL, - RTAS_VECTOR_EXTERNAL_INTERRUPT, - virq_to_hw(irq), - RTAS_EPOW_WARNING, - critical, __pa(&ras_log_buf), - rtas_get_error_log_max()); + rtas_call(ras_check_exception_token, 6, 1, NULL, RTAS_VECTOR_EXTERNAL_INTERRUPT, + virq_to_hw(irq), RTAS_EPOW_WARNING, critical, __pa(&ras_log_buf), + rtas_get_error_log_max()); log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, 0); @@ -395,16 +397,31 @@ static irqreturn_t ras_error_interrupt(int irq, void *dev_id) /* * Some versions of FWNMI place the buffer inside the 4kB page starting at * 0x7000. Other versions place it inside the rtas buffer. We check both. + * Minimum size of the buffer is 16 bytes. */ #define VALID_FWNMI_BUFFER(A) \ - ((((A) >= 0x7000) && ((A) < 0x7ff0)) || \ - (((A) >= rtas.base) && ((A) < (rtas.base + rtas.size - 16)))) + ((((A) >= 0x7000) && ((A) <= 0x8000 - 16)) || \ + (((A) >= rtas.base) && ((A) <= (rtas.base + rtas.size - 16)))) static inline struct rtas_error_log *fwnmi_get_errlog(void) { return (struct rtas_error_log *)local_paca->mce_data_buf; } +static __be64 *fwnmi_get_savep(struct pt_regs *regs) +{ + unsigned long savep_ra; + + /* Mask top two bits */ + savep_ra = regs->gpr[3] & ~(0x3UL << 62); + if (!VALID_FWNMI_BUFFER(savep_ra)) { + printk(KERN_ERR "FWNMI: corrupt r3 0x%016lx\n", regs->gpr[3]); + return NULL; + } + + return __va(savep_ra); +} + /* * Get the error information for errors coming through the * FWNMI vectors. The pt_regs' r3 will be updated to reflect @@ -422,19 +439,14 @@ static inline struct rtas_error_log *fwnmi_get_errlog(void) */ static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs) { - unsigned long *savep; struct rtas_error_log *h; + __be64 *savep; - /* Mask top two bits */ - regs->gpr[3] &= ~(0x3UL << 62); - - if (!VALID_FWNMI_BUFFER(regs->gpr[3])) { - printk(KERN_ERR "FWNMI: corrupt r3 0x%016lx\n", regs->gpr[3]); + savep = fwnmi_get_savep(regs); + if (!savep) return NULL; - } - savep = __va(regs->gpr[3]); - regs->gpr[3] = be64_to_cpu(savep[0]); /* restore original r3 */ + regs->gpr[3] = be64_to_cpu(savep[0]); /* restore original r3 */ h = (struct rtas_error_log *)&savep[1]; /* Use the per cpu buffer from paca to store rtas error log */ @@ -458,7 +470,15 @@ static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs) */ static void fwnmi_release_errinfo(void) { - int ret = rtas_call(rtas_token("ibm,nmi-interlock"), 0, 1, NULL); + struct rtas_args rtas_args; + int ret; + + /* + * On pseries, the machine check stack is limited to under 4GB, so + * args can be on-stack. + */ + rtas_call_unlocked(&rtas_args, ibm_nmi_interlock_token, 0, 1, NULL); + ret = be32_to_cpu(rtas_args.rets[0]); if (ret != 0) printk(KERN_ERR "FWNMI: nmi-interlock failed: %d\n", ret); } @@ -475,17 +495,27 @@ int pSeries_system_reset_exception(struct pt_regs *regs) if ((be64_to_cpu(regs->msr) & (MSR_LE|MSR_RI|MSR_DR|MSR_IR|MSR_ME|MSR_PR| MSR_ILE|MSR_HV|MSR_SF)) == (MSR_DR|MSR_SF)) { - regs->nip = be64_to_cpu((__be64)regs->nip); - regs->msr = 0; + regs_set_return_ip(regs, be64_to_cpu((__be64)regs->nip)); + regs_set_return_msr(regs, 0); } #endif if (fwnmi_active) { - struct rtas_error_log *errhdr = fwnmi_get_errinfo(regs); - if (errhdr) { - /* XXX Should look at FWNMI information */ - } - fwnmi_release_errinfo(); + __be64 *savep; + + /* + * Firmware (PowerVM and KVM) saves r3 to a save area like + * machine check, which is not exactly what PAPR (2.9) + * suggests but there is no way to detect otherwise, so this + * is the interface now. + * + * System resets do not save any error log or require an + * "ibm,nmi-interlock" rtas call to release. + */ + + savep = fwnmi_get_savep(regs); + if (savep) + regs->gpr[3] = be64_to_cpu(savep[0]); /* restore original r3 */ } if (smp_handle_nmi_ipi(regs)) @@ -494,18 +524,60 @@ int pSeries_system_reset_exception(struct pt_regs *regs) return 0; /* need to perform reset */ } +static int mce_handle_err_realmode(int disposition, u8 error_type) +{ +#ifdef CONFIG_PPC_BOOK3S_64 + if (disposition == RTAS_DISP_NOT_RECOVERED) { + switch (error_type) { + case MC_ERROR_TYPE_ERAT: + flush_erat(); + disposition = RTAS_DISP_FULLY_RECOVERED; + break; + case MC_ERROR_TYPE_SLB: +#ifdef CONFIG_PPC_64S_HASH_MMU + /* + * Store the old slb content in paca before flushing. + * Print this when we go to virtual mode. + * There are chances that we may hit MCE again if there + * is a parity error on the SLB entry we trying to read + * for saving. Hence limit the slb saving to single + * level of recursion. + */ + if (local_paca->in_mce == 1) + slb_save_contents(local_paca->mce_faulty_slbs); + flush_and_reload_slb(); + disposition = RTAS_DISP_FULLY_RECOVERED; +#endif + break; + default: + break; + } + } else if (disposition == RTAS_DISP_LIMITED_RECOVERY) { + /* Platform corrected itself but could be degraded */ + pr_err("MCE: limited recovery, system may be degraded\n"); + disposition = RTAS_DISP_FULLY_RECOVERED; + } +#endif + return disposition; +} -static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp) +static int mce_handle_err_virtmode(struct pt_regs *regs, + struct rtas_error_log *errp, + struct pseries_mc_errorlog *mce_log, + int disposition) { struct mce_error_info mce_err = { 0 }; - unsigned long eaddr = 0, paddr = 0; - struct pseries_errorlog *pseries_log; - struct pseries_mc_errorlog *mce_log; - int disposition = rtas_error_disposition(errp); int initiator = rtas_error_initiator(errp); int severity = rtas_error_severity(errp); + unsigned long eaddr = 0, paddr = 0; u8 error_type, err_sub_type; + if (!mce_log) + goto out; + + error_type = mce_log->error_type; + err_sub_type = rtas_mc_error_sub_type(mce_log); + if (initiator == RTAS_INITIATOR_UNKNOWN) mce_err.initiator = MCE_INITIATOR_UNKNOWN; else if (initiator == RTAS_INITIATOR_CPU) @@ -531,8 +603,6 @@ static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp) mce_err.severity = MCE_SEV_SEVERE; else if (severity == RTAS_SEVERITY_ERROR) mce_err.severity = MCE_SEV_SEVERE; - else if (severity == RTAS_SEVERITY_FATAL) - mce_err.severity = MCE_SEV_FATAL; else mce_err.severity = MCE_SEV_FATAL; @@ -544,20 +614,12 @@ static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp) mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN; mce_err.error_class = MCE_ECLASS_UNKNOWN; - if (!rtas_error_extended(errp)) - goto out; - - pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE); - if (pseries_log == NULL) - goto out; - - mce_log = (struct pseries_mc_errorlog *)pseries_log->data; - error_type = mce_log->error_type; - err_sub_type = rtas_mc_error_sub_type(mce_log); - - switch (mce_log->error_type) { + switch (error_type) { case MC_ERROR_TYPE_UE: mce_err.error_type = MCE_ERROR_TYPE_UE; + mce_common_process_ue(regs, &mce_err); + if (mce_err.ignore_event) + disposition = RTAS_DISP_FULLY_RECOVERED; switch (err_sub_type) { case MC_ERROR_UE_IFETCH: mce_err.u.ue_error_type = MCE_UE_ERROR_IFETCH; @@ -604,7 +666,7 @@ static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp) mce_err.u.slb_error_type = MCE_SLB_ERROR_INDETERMINATE; break; } - if (mce_log->sub_err_type & 0x80) + if (mce_log->sub_err_type & MC_EFFECTIVE_ADDR_PROVIDED) eaddr = be64_to_cpu(mce_log->effective_address); break; case MC_ERROR_TYPE_ERAT: @@ -621,7 +683,7 @@ static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp) mce_err.u.erat_error_type = MCE_ERAT_ERROR_INDETERMINATE; break; } - if (mce_log->sub_err_type & 0x80) + if (mce_log->sub_err_type & MC_EFFECTIVE_ADDR_PROVIDED) eaddr = be64_to_cpu(mce_log->effective_address); break; case MC_ERROR_TYPE_TLB: @@ -638,61 +700,69 @@ static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp) mce_err.u.tlb_error_type = MCE_TLB_ERROR_INDETERMINATE; break; } - if (mce_log->sub_err_type & 0x80) + if (mce_log->sub_err_type & MC_EFFECTIVE_ADDR_PROVIDED) eaddr = be64_to_cpu(mce_log->effective_address); break; case MC_ERROR_TYPE_D_CACHE: mce_err.error_type = MCE_ERROR_TYPE_DCACHE; break; case MC_ERROR_TYPE_I_CACHE: - mce_err.error_type = MCE_ERROR_TYPE_DCACHE; + mce_err.error_type = MCE_ERROR_TYPE_ICACHE; + break; + case MC_ERROR_TYPE_CTRL_MEM_ACCESS: + mce_err.error_type = MCE_ERROR_TYPE_RA; + switch (err_sub_type) { + case MC_ERROR_CTRL_MEM_ACCESS_PTABLE_WALK: + mce_err.u.ra_error_type = + MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE_FOREIGN; + break; + case MC_ERROR_CTRL_MEM_ACCESS_OP_ACCESS: + mce_err.u.ra_error_type = + MCE_RA_ERROR_LOAD_STORE_FOREIGN; + break; + } + if (mce_log->sub_err_type & MC_EFFECTIVE_ADDR_PROVIDED) + eaddr = be64_to_cpu(mce_log->effective_address); break; case MC_ERROR_TYPE_UNKNOWN: default: mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN; break; } - -#ifdef CONFIG_PPC_BOOK3S_64 - if (disposition == RTAS_DISP_NOT_RECOVERED) { - switch (error_type) { - case MC_ERROR_TYPE_SLB: - case MC_ERROR_TYPE_ERAT: - /* - * Store the old slb content in paca before flushing. - * Print this when we go to virtual mode. - * There are chances that we may hit MCE again if there - * is a parity error on the SLB entry we trying to read - * for saving. Hence limit the slb saving to single - * level of recursion. - */ - if (local_paca->in_mce == 1) - slb_save_contents(local_paca->mce_faulty_slbs); - flush_and_reload_slb(); - disposition = RTAS_DISP_FULLY_RECOVERED; - break; - default: - break; - } - } else if (disposition == RTAS_DISP_LIMITED_RECOVERY) { - /* Platform corrected itself but could be degraded */ - printk(KERN_ERR "MCE: limited recovery, system may " - "be degraded\n"); - disposition = RTAS_DISP_FULLY_RECOVERED; - } -#endif - out: save_mce_event(regs, disposition == RTAS_DISP_FULLY_RECOVERED, - &mce_err, regs->nip, eaddr, paddr); + &mce_err, regs->nip, eaddr, paddr); + return disposition; +} + +static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp) +{ + struct pseries_errorlog *pseries_log; + struct pseries_mc_errorlog *mce_log = NULL; + int disposition = rtas_error_disposition(errp); + u8 error_type; + + if (!rtas_error_extended(errp)) + goto out; + pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE); + if (!pseries_log) + goto out; + + mce_log = (struct pseries_mc_errorlog *)pseries_log->data; + error_type = mce_log->error_type; + + disposition = mce_handle_err_realmode(disposition, error_type); +out: + disposition = mce_handle_err_virtmode(regs, errp, mce_log, + disposition); return disposition; } /* * Process MCE rtas errlog event. */ -static void mce_process_errlog_event(struct irq_work *work) +void pSeries_machine_check_log_err(void) { struct rtas_error_log *err; @@ -713,7 +783,7 @@ static int recover_mce(struct pt_regs *regs, struct machine_check_event *evt) { int recovered = 0; - if (!(regs->msr & MSR_RI)) { + if (regs_is_unrecoverable(regs)) { /* If MSR_RI isn't set, we cannot recover */ pr_err("Machine check interrupt unrecoverable: MSR(RI=0)\n"); recovered = 0; @@ -749,7 +819,7 @@ static int recover_mce(struct pt_regs *regs, struct machine_check_event *evt) */ recovered = 0; } else { - die("Machine check", regs, SIGBUS); + die_mce("Machine check", regs, SIGBUS); recovered = 1; } } @@ -801,10 +871,8 @@ long pseries_machine_check_realmode(struct pt_regs *regs) * virtual mode. */ disposition = mce_handle_error(regs, errp); - fwnmi_release_errinfo(); - /* Queue irq work to log this rtas event later. */ - irq_work_queue(&mce_errlog_process_work); + fwnmi_release_errinfo(); if (disposition == RTAS_DISP_FULLY_RECOVERED) return 1; diff --git a/arch/powerpc/platforms/pseries/reconfig.c b/arch/powerpc/platforms/pseries/reconfig.c index 8a9c4fb95b8b..599bd2c78514 100644 --- a/arch/powerpc/platforms/pseries/reconfig.c +++ b/arch/powerpc/platforms/pseries/reconfig.c @@ -10,10 +10,10 @@ #include <linux/kernel.h> #include <linux/notifier.h> #include <linux/proc_fs.h> +#include <linux/security.h> #include <linux/slab.h> #include <linux/of.h> -#include <asm/prom.h> #include <asm/machdep.h> #include <linux/uaccess.h> #include <asm/mmu.h> @@ -362,6 +362,10 @@ static ssize_t ofdt_write(struct file *file, const char __user *buf, size_t coun char *kbuf; char *tmp; + rv = security_locked_down(LOCKDOWN_DEVICE_TREE); + if (rv) + return rv; + kbuf = memdup_user_nul(buf, count); if (IS_ERR(kbuf)) return PTR_ERR(kbuf); @@ -391,9 +395,9 @@ out: return rv ? rv : count; } -static const struct file_operations ofdt_fops = { - .write = ofdt_write, - .llseek = noop_llseek, +static const struct proc_ops ofdt_proc_ops = { + .proc_write = ofdt_write, + .proc_lseek = noop_llseek, }; /* create /proc/powerpc/ofdt write-only by root */ @@ -401,7 +405,7 @@ static int proc_ppc64_create_ofdt(void) { struct proc_dir_entry *ent; - ent = proc_create("powerpc/ofdt", 0200, NULL, &ofdt_fops); + ent = proc_create("powerpc/ofdt", 0200, NULL, &ofdt_proc_ops); if (ent) proc_set_size(ent, 0); diff --git a/arch/powerpc/platforms/pseries/rng.c b/arch/powerpc/platforms/pseries/rng.c index bbb97169bf63..6ddfdeaace9e 100644 --- a/arch/powerpc/platforms/pseries/rng.c +++ b/arch/powerpc/platforms/pseries/rng.c @@ -10,6 +10,7 @@ #include <asm/archrandom.h> #include <asm/machdep.h> #include <asm/plpar_wrappers.h> +#include "pseries.h" static int pseries_get_random_long(unsigned long *v) @@ -24,18 +25,13 @@ static int pseries_get_random_long(unsigned long *v) return 0; } -static __init int rng_init(void) +void __init pseries_rng_init(void) { struct device_node *dn; dn = of_find_compatible_node(NULL, NULL, "ibm,random"); if (!dn) - return -ENODEV; - - pr_info("Registering arch random hook.\n"); - + return; ppc_md.get_random_seed = pseries_get_random_long; - - return 0; + of_node_put(dn); } -machine_subsys_initcall(pseries, rng_init); diff --git a/arch/powerpc/platforms/pseries/rtas-fadump.c b/arch/powerpc/platforms/pseries/rtas-fadump.c index 70c3013fdd07..b5853e9fcc3c 100644 --- a/arch/powerpc/platforms/pseries/rtas-fadump.c +++ b/arch/powerpc/platforms/pseries/rtas-fadump.c @@ -13,9 +13,10 @@ #include <linux/delay.h> #include <linux/seq_file.h> #include <linux/crash_dump.h> +#include <linux/of.h> +#include <linux/of_fdt.h> #include <asm/page.h> -#include <asm/prom.h> #include <asm/rtas.h> #include <asm/fadump.h> #include <asm/fadump-internal.h> @@ -39,7 +40,7 @@ static void rtas_fadump_update_config(struct fw_dump *fadump_conf, * This function is called in the capture kernel to get configuration details * setup in the first kernel and passed to the f/w. */ -static void rtas_fadump_get_config(struct fw_dump *fadump_conf, +static void __init rtas_fadump_get_config(struct fw_dump *fadump_conf, const struct rtas_fadump_mem_struct *fdm) { fadump_conf->boot_mem_addr[0] = @@ -108,6 +109,12 @@ static u64 rtas_fadump_init_mem_struct(struct fw_dump *fadump_conf) fdm.hpte_region.destination_address = cpu_to_be64(addr); addr += fadump_conf->hpte_region_size; + /* + * Align boot memory area destination address to page boundary to + * be able to mmap read this area in the vmcore. + */ + addr = PAGE_ALIGN(addr); + /* RMA region section */ fdm.rmr_region.request_flag = cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG); fdm.rmr_region.source_data_type = @@ -247,7 +254,7 @@ static inline int rtas_fadump_gpr_index(u64 id) return i; } -void rtas_fadump_set_regval(struct pt_regs *regs, u64 reg_id, u64 reg_val) +static void __init rtas_fadump_set_regval(struct pt_regs *regs, u64 reg_id, u64 reg_val) { int i; @@ -272,7 +279,7 @@ void rtas_fadump_set_regval(struct pt_regs *regs, u64 reg_id, u64 reg_val) regs->dsisr = (unsigned long)reg_val; } -static struct rtas_fadump_reg_entry* +static struct rtas_fadump_reg_entry* __init rtas_fadump_read_regs(struct rtas_fadump_reg_entry *reg_entry, struct pt_regs *regs) { @@ -351,7 +358,7 @@ static int __init rtas_fadump_build_cpu_notes(struct fw_dump *fadump_conf) /* Lower 4 bytes of reg_value contains logical cpu id */ cpu = (be64_to_cpu(reg_entry->reg_value) & RTAS_FADUMP_CPU_ID_MASK); - if (fdh && !cpumask_test_cpu(cpu, &fdh->online_mask)) { + if (fdh && !cpumask_test_cpu(cpu, &fdh->cpu_mask)) { RTAS_FADUMP_SKIP_TO_NEXT_CPU(reg_entry); continue; } @@ -462,10 +469,10 @@ static void rtas_fadump_region_show(struct fw_dump *fadump_conf, be64_to_cpu(fdm_ptr->rmr_region.source_len), be64_to_cpu(fdm_ptr->rmr_region.bytes_dumped)); - /* Dump is active. Show reserved area start address. */ + /* Dump is active. Show preserved area start address. */ if (fdm_active) { - seq_printf(m, "\nMemory above %#016lx is reserved for saving crash dump\n", - fadump_conf->reserve_dump_area_start); + seq_printf(m, "\nMemory above %#016llx is reserved for saving crash dump\n", + fadump_conf->boot_mem_top); } } @@ -506,7 +513,7 @@ void __init rtas_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node) fadump_conf->fadump_supported = 1; /* Firmware supports 64-bit value for size, align it to pagesize. */ - fadump_conf->max_copy_size = _ALIGN_DOWN(U64_MAX, PAGE_SIZE); + fadump_conf->max_copy_size = ALIGN_DOWN(U64_MAX, PAGE_SIZE); /* * The 'ibm,kernel-dump' rtas node is present only if there is diff --git a/arch/powerpc/platforms/pseries/rtas-work-area.c b/arch/powerpc/platforms/pseries/rtas-work-area.c new file mode 100644 index 000000000000..7fe34bee84d8 --- /dev/null +++ b/arch/powerpc/platforms/pseries/rtas-work-area.c @@ -0,0 +1,210 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#define pr_fmt(fmt) "rtas-work-area: " fmt + +#include <linux/genalloc.h> +#include <linux/log2.h> +#include <linux/kernel.h> +#include <linux/memblock.h> +#include <linux/mempool.h> +#include <linux/minmax.h> +#include <linux/mutex.h> +#include <linux/numa.h> +#include <linux/sizes.h> +#include <linux/wait.h> + +#include <asm/machdep.h> +#include <asm/rtas-work-area.h> +#include <asm/rtas.h> + +enum { + /* + * Ensure the pool is page-aligned. + */ + RTAS_WORK_AREA_ARENA_ALIGN = PAGE_SIZE, + /* + * Don't let a single allocation claim the whole arena. + */ + RTAS_WORK_AREA_ARENA_SZ = RTAS_WORK_AREA_MAX_ALLOC_SZ * 2, + /* + * The smallest known work area size is for ibm,get-vpd's + * location code argument, which is limited to 79 characters + * plus 1 nul terminator. + * + * PAPR+ 7.3.20 ibm,get-vpd RTAS Call + * PAPR+ 12.3.2.4 Converged Location Code Rules - Length Restrictions + */ + RTAS_WORK_AREA_MIN_ALLOC_SZ = roundup_pow_of_two(80), +}; + +static struct { + struct gen_pool *gen_pool; + char *arena; + struct mutex mutex; /* serializes allocations */ + struct wait_queue_head wqh; + mempool_t descriptor_pool; + bool available; +} rwa_state = { + .mutex = __MUTEX_INITIALIZER(rwa_state.mutex), + .wqh = __WAIT_QUEUE_HEAD_INITIALIZER(rwa_state.wqh), +}; + +/* + * A single work area buffer and descriptor to serve requests early in + * boot before the allocator is fully initialized. We know 4KB is the + * most any boot time user needs (they all call ibm,get-system-parameter). + */ +static bool early_work_area_in_use __initdata; +static char early_work_area_buf[SZ_4K] __initdata __aligned(SZ_4K); +static struct rtas_work_area early_work_area __initdata = { + .buf = early_work_area_buf, + .size = sizeof(early_work_area_buf), +}; + + +static struct rtas_work_area * __init rtas_work_area_alloc_early(size_t size) +{ + WARN_ON(size > early_work_area.size); + WARN_ON(early_work_area_in_use); + early_work_area_in_use = true; + memset(early_work_area.buf, 0, early_work_area.size); + return &early_work_area; +} + +static void __init rtas_work_area_free_early(struct rtas_work_area *work_area) +{ + WARN_ON(work_area != &early_work_area); + WARN_ON(!early_work_area_in_use); + early_work_area_in_use = false; +} + +struct rtas_work_area * __ref __rtas_work_area_alloc(size_t size) +{ + struct rtas_work_area *area; + unsigned long addr; + + might_sleep(); + + /* + * The rtas_work_area_alloc() wrapper enforces this at build + * time. Requests that exceed the arena size will block + * indefinitely. + */ + WARN_ON(size > RTAS_WORK_AREA_MAX_ALLOC_SZ); + + if (!rwa_state.available) + return rtas_work_area_alloc_early(size); + /* + * To ensure FCFS behavior and prevent a high rate of smaller + * requests from starving larger ones, use the mutex to queue + * allocations. + */ + mutex_lock(&rwa_state.mutex); + wait_event(rwa_state.wqh, + (addr = gen_pool_alloc(rwa_state.gen_pool, size)) != 0); + mutex_unlock(&rwa_state.mutex); + + area = mempool_alloc(&rwa_state.descriptor_pool, GFP_KERNEL); + area->buf = (char *)addr; + area->size = size; + + return area; +} + +void __ref rtas_work_area_free(struct rtas_work_area *area) +{ + if (!rwa_state.available) { + rtas_work_area_free_early(area); + return; + } + + gen_pool_free(rwa_state.gen_pool, (unsigned long)area->buf, area->size); + mempool_free(area, &rwa_state.descriptor_pool); + wake_up(&rwa_state.wqh); +} + +/* + * Initialization of the work area allocator happens in two parts. To + * reliably reserve an arena that satisfies RTAS addressing + * requirements, we must perform a memblock allocation early, + * immmediately after RTAS instantiation. Then we have to wait until + * the slab allocator is up before setting up the descriptor mempool + * and adding the arena to a gen_pool. + */ +static __init int rtas_work_area_allocator_init(void) +{ + const unsigned int order = ilog2(RTAS_WORK_AREA_MIN_ALLOC_SZ); + const phys_addr_t pa_start = __pa(rwa_state.arena); + const phys_addr_t pa_end = pa_start + RTAS_WORK_AREA_ARENA_SZ - 1; + struct gen_pool *pool; + const int nid = NUMA_NO_NODE; + int err; + + err = -ENOMEM; + if (!rwa_state.arena) + goto err_out; + + pool = gen_pool_create(order, nid); + if (!pool) + goto err_out; + /* + * All RTAS functions that consume work areas are OK with + * natural alignment, when they have alignment requirements at + * all. + */ + gen_pool_set_algo(pool, gen_pool_first_fit_order_align, NULL); + + err = gen_pool_add(pool, (unsigned long)rwa_state.arena, + RTAS_WORK_AREA_ARENA_SZ, nid); + if (err) + goto err_destroy; + + err = mempool_init_kmalloc_pool(&rwa_state.descriptor_pool, 1, + sizeof(struct rtas_work_area)); + if (err) + goto err_destroy; + + rwa_state.gen_pool = pool; + rwa_state.available = true; + + pr_debug("arena [%pa-%pa] (%uK), min/max alloc sizes %u/%u\n", + &pa_start, &pa_end, + RTAS_WORK_AREA_ARENA_SZ / SZ_1K, + RTAS_WORK_AREA_MIN_ALLOC_SZ, + RTAS_WORK_AREA_MAX_ALLOC_SZ); + + return 0; + +err_destroy: + gen_pool_destroy(pool); +err_out: + return err; +} +machine_arch_initcall(pseries, rtas_work_area_allocator_init); + +/** + * rtas_work_area_reserve_arena() - Reserve memory suitable for RTAS work areas. + * @limit: Upper limit for memblock allocation. + */ +void __init rtas_work_area_reserve_arena(const phys_addr_t limit) +{ + const phys_addr_t align = RTAS_WORK_AREA_ARENA_ALIGN; + const phys_addr_t size = RTAS_WORK_AREA_ARENA_SZ; + const phys_addr_t min = MEMBLOCK_LOW_LIMIT; + const int nid = NUMA_NO_NODE; + + /* + * Too early for a machine_is(pseries) check. But PAPR + * effectively mandates that ibm,get-system-parameter is + * present: + * + * R1–7.3.16–1. All platforms must support the System + * Parameters option. + * + * So set up the arena if we find that, with a fallback to + * ibm,configure-connector, just in case. + */ + if (rtas_function_implemented(RTAS_FN_IBM_GET_SYSTEM_PARAMETER) || + rtas_function_implemented(RTAS_FN_IBM_CONFIGURE_CONNECTOR)) + rwa_state.arena = memblock_alloc_try_nid(size, align, min, limit, nid); +} diff --git a/arch/powerpc/platforms/pseries/scanlog.c b/arch/powerpc/platforms/pseries/scanlog.c deleted file mode 100644 index a0001280503c..000000000000 --- a/arch/powerpc/platforms/pseries/scanlog.c +++ /dev/null @@ -1,196 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * c 2001 PPC 64 Team, IBM Corp - * - * scan-log-data driver for PPC64 Todd Inglett <tinglett@vnet.ibm.com> - * - * When ppc64 hardware fails the service processor dumps internal state - * of the system. After a reboot the operating system can access a dump - * of this data using this driver. A dump exists if the device-tree - * /chosen/ibm,scan-log-data property exists. - * - * This driver exports /proc/powerpc/scan-log-dump which can be read. - * The driver supports only sequential reads. - * - * The driver looks at a write to the driver for the single word "reset". - * If given, the driver will reset the scanlog so the platform can free it. - */ - -#include <linux/module.h> -#include <linux/types.h> -#include <linux/errno.h> -#include <linux/proc_fs.h> -#include <linux/init.h> -#include <linux/delay.h> -#include <linux/slab.h> -#include <linux/uaccess.h> -#include <asm/rtas.h> -#include <asm/prom.h> - -#define MODULE_VERS "1.0" -#define MODULE_NAME "scanlog" - -/* Status returns from ibm,scan-log-dump */ -#define SCANLOG_COMPLETE 0 -#define SCANLOG_HWERROR -1 -#define SCANLOG_CONTINUE 1 - - -static unsigned int ibm_scan_log_dump; /* RTAS token */ -static unsigned int *scanlog_buffer; /* The data buffer */ - -static ssize_t scanlog_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) -{ - unsigned int *data = scanlog_buffer; - int status; - unsigned long len, off; - unsigned int wait_time; - - if (count > RTAS_DATA_BUF_SIZE) - count = RTAS_DATA_BUF_SIZE; - - if (count < 1024) { - /* This is the min supported by this RTAS call. Rather - * than do all the buffering we insist the user code handle - * larger reads. As long as cp works... :) - */ - printk(KERN_ERR "scanlog: cannot perform a small read (%ld)\n", count); - return -EINVAL; - } - - if (!access_ok(buf, count)) - return -EFAULT; - - for (;;) { - wait_time = 500; /* default wait if no data */ - spin_lock(&rtas_data_buf_lock); - memcpy(rtas_data_buf, data, RTAS_DATA_BUF_SIZE); - status = rtas_call(ibm_scan_log_dump, 2, 1, NULL, - (u32) __pa(rtas_data_buf), (u32) count); - memcpy(data, rtas_data_buf, RTAS_DATA_BUF_SIZE); - spin_unlock(&rtas_data_buf_lock); - - pr_debug("scanlog: status=%d, data[0]=%x, data[1]=%x, " \ - "data[2]=%x\n", status, data[0], data[1], data[2]); - switch (status) { - case SCANLOG_COMPLETE: - pr_debug("scanlog: hit eof\n"); - return 0; - case SCANLOG_HWERROR: - pr_debug("scanlog: hardware error reading data\n"); - return -EIO; - case SCANLOG_CONTINUE: - /* We may or may not have data yet */ - len = data[1]; - off = data[2]; - if (len > 0) { - if (copy_to_user(buf, ((char *)data)+off, len)) - return -EFAULT; - return len; - } - /* Break to sleep default time */ - break; - default: - /* Assume extended busy */ - wait_time = rtas_busy_delay_time(status); - if (!wait_time) { - printk(KERN_ERR "scanlog: unknown error " \ - "from rtas: %d\n", status); - return -EIO; - } - } - /* Apparently no data yet. Wait and try again. */ - msleep_interruptible(wait_time); - } - /*NOTREACHED*/ -} - -static ssize_t scanlog_write(struct file * file, const char __user * buf, - size_t count, loff_t *ppos) -{ - char stkbuf[20]; - int status; - - if (count > 19) count = 19; - if (copy_from_user (stkbuf, buf, count)) { - return -EFAULT; - } - stkbuf[count] = 0; - - if (buf) { - if (strncmp(stkbuf, "reset", 5) == 0) { - pr_debug("scanlog: reset scanlog\n"); - status = rtas_call(ibm_scan_log_dump, 2, 1, NULL, 0, 0); - pr_debug("scanlog: rtas returns %d\n", status); - } - } - return count; -} - -static int scanlog_open(struct inode * inode, struct file * file) -{ - unsigned int *data = scanlog_buffer; - - if (data[0] != 0) { - /* This imperfect test stops a second copy of the - * data (or a reset while data is being copied) - */ - return -EBUSY; - } - - data[0] = 0; /* re-init so we restart the scan */ - - return 0; -} - -static int scanlog_release(struct inode * inode, struct file * file) -{ - unsigned int *data = scanlog_buffer; - - data[0] = 0; - return 0; -} - -static const struct file_operations scanlog_fops = { - .owner = THIS_MODULE, - .read = scanlog_read, - .write = scanlog_write, - .open = scanlog_open, - .release = scanlog_release, - .llseek = noop_llseek, -}; - -static int __init scanlog_init(void) -{ - struct proc_dir_entry *ent; - int err = -ENOMEM; - - ibm_scan_log_dump = rtas_token("ibm,scan-log-dump"); - if (ibm_scan_log_dump == RTAS_UNKNOWN_SERVICE) - return -ENODEV; - - /* Ideally we could allocate a buffer < 4G */ - scanlog_buffer = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL); - if (!scanlog_buffer) - goto err; - - ent = proc_create("powerpc/rtas/scan-log-dump", 0400, NULL, - &scanlog_fops); - if (!ent) - goto err; - return 0; -err: - kfree(scanlog_buffer); - return err; -} - -static void __exit scanlog_cleanup(void) -{ - remove_proc_entry("powerpc/rtas/scan-log-dump", NULL); - kfree(scanlog_buffer); -} - -module_init(scanlog_init); -module_exit(scanlog_cleanup); -MODULE_LICENSE("GPL"); diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index 0a40201f315f..284a6fa04b0c 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -14,6 +14,7 @@ #include <linux/cpu.h> #include <linux/errno.h> +#include <linux/platform_device.h> #include <linux/sched.h> #include <linux/kernel.h> #include <linux/mm.h> @@ -36,15 +37,15 @@ #include <linux/seq_file.h> #include <linux/root_dev.h> #include <linux/of.h> +#include <linux/of_irq.h> #include <linux/of_pci.h> #include <linux/memblock.h> #include <linux/swiotlb.h> +#include <linux/seq_buf.h> #include <asm/mmu.h> #include <asm/processor.h> #include <asm/io.h> -#include <asm/pgtable.h> -#include <asm/prom.h> #include <asm/rtas.h> #include <asm/pci-bridge.h> #include <asm/iommu.h> @@ -56,6 +57,7 @@ #include <asm/pmc.h> #include <asm/xics.h> #include <asm/xive.h> +#include <asm/papr-sysparm.h> #include <asm/ppc-pci.h> #include <asm/i8259.h> #include <asm/udbg.h> @@ -68,11 +70,31 @@ #include <asm/isa-bridge.h> #include <asm/security_features.h> #include <asm/asm-const.h> +#include <asm/idle.h> #include <asm/swiotlb.h> #include <asm/svm.h> +#include <asm/dtl.h> +#include <asm/hvconsole.h> +#include <asm/setup.h> #include "pseries.h" -#include "../../../../drivers/pci/pci.h" + +DEFINE_STATIC_KEY_FALSE(shared_processor); +EXPORT_SYMBOL(shared_processor); + +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING +struct static_key paravirt_steal_enabled; +struct static_key paravirt_steal_rq_enabled; + +static bool steal_acc = true; +static int __init parse_no_stealacc(char *arg) +{ + steal_acc = false; + return 0; +} + +early_param("no-steal-acc", parse_no_stealacc); +#endif int CMO_PrPSP = -1; int CMO_SecPSP = -1; @@ -80,6 +102,8 @@ unsigned long CMO_PageSize = (ASM_CONST(1) << IOMMU_PAGE_SHIFT_4K); EXPORT_SYMBOL(CMO_PageSize); int fwnmi_active; /* TRUE if an FWNMI handler is present */ +int ibm_nmi_interlock_token; +u32 pseries_security_flavor; static void pSeries_show_cpuinfo(struct seq_file *m) { @@ -106,13 +130,18 @@ static void __init fwnmi_init(void) u8 *mce_data_buf; unsigned int i; int nr_cpus = num_possible_cpus(); -#ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC_64S_HASH_MMU struct slb_entry *slb_ptr; size_t size; #endif + int ibm_nmi_register_token; + + ibm_nmi_register_token = rtas_function_token(RTAS_FN_IBM_NMI_REGISTER); + if (ibm_nmi_register_token == RTAS_UNKNOWN_SERVICE) + return; - int ibm_nmi_register = rtas_token("ibm,nmi-register"); - if (ibm_nmi_register == RTAS_UNKNOWN_SERVICE) + ibm_nmi_interlock_token = rtas_function_token(RTAS_FN_IBM_NMI_INTERLOCK); + if (WARN_ON(ibm_nmi_interlock_token == RTAS_UNKNOWN_SERVICE)) return; /* If the kernel's not linked at zero we point the firmware at low @@ -120,8 +149,8 @@ static void __init fwnmi_init(void) system_reset_addr = __pa(system_reset_fwnmi) - PHYSICAL_START; machine_check_addr = __pa(machine_check_fwnmi) - PHYSICAL_START; - if (0 == rtas_call(ibm_nmi_register, 2, 1, NULL, system_reset_addr, - machine_check_addr)) + if (0 == rtas_call(ibm_nmi_register_token, 2, 1, NULL, + system_reset_addr, machine_check_addr)) fwnmi_active = 1; /* @@ -141,7 +170,7 @@ static void __init fwnmi_init(void) (RTAS_ERROR_LOG_MAX * i); } -#ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC_64S_HASH_MMU if (!radix_enabled()) { /* Allocate per cpu area to save old slb contents during MCE */ size = sizeof(struct slb_entry) * mmu_slb_size * nr_cpus; @@ -158,6 +187,18 @@ static void __init fwnmi_init(void) #endif } +/* + * Affix a device for the first timer to the platform bus if + * we have firmware support for the H_WATCHDOG hypercall. + */ +static __init int pseries_wdt_init(void) +{ + if (firmware_has_feature(FW_FEATURE_WATCHDOG)) + platform_device_register_simple("pseries-wdt", 0, NULL, 0); + return 0; +} +machine_subsys_initcall(pseries, pseries_wdt_init); + static void pseries_8259_cascade(struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); @@ -314,6 +355,9 @@ static int alloc_dispatch_log_kmem_cache(void) } machine_early_initcall(pseries, alloc_dispatch_log_kmem_cache); +DEFINE_PER_CPU(u64, idle_spurr_cycles); +DEFINE_PER_CPU(u64, idle_entry_purr_snap); +DEFINE_PER_CPU(u64, idle_entry_spurr_snap); static void pseries_lpar_idle(void) { /* @@ -325,7 +369,7 @@ static void pseries_lpar_idle(void) return; /* Indicate to hypervisor that we are idle. */ - get_lppaca()->idle = 1; + pseries_idle_prolog(); /* * Yield the processor to the hypervisor. We return if @@ -336,9 +380,17 @@ static void pseries_lpar_idle(void) */ cede_processor(); - get_lppaca()->idle = 0; + pseries_idle_epilog(); } +static bool pseries_reloc_on_exception_enabled; + +bool pseries_reloc_on_exception(void) +{ + return pseries_reloc_on_exception_enabled; +} +EXPORT_SYMBOL_GPL(pseries_reloc_on_exception); + /* * Enable relocation on during exceptions. This has partition wide scope and * may take a while to complete, if it takes longer than one second we will @@ -346,7 +398,7 @@ static void pseries_lpar_idle(void) * to ever be a problem in practice we can move this into a kernel thread to * finish off the process later in boot. */ -void pseries_enable_reloc_on_exc(void) +bool pseries_enable_reloc_on_exc(void) { long rc; unsigned int delay, total_delay = 0; @@ -357,11 +409,14 @@ void pseries_enable_reloc_on_exc(void) if (rc == H_P2) { pr_info("Relocation on exceptions not" " supported\n"); + return false; } else if (rc != H_SUCCESS) { pr_warn("Unable to enable relocation" " on exceptions: %ld\n", rc); + return false; } - break; + pseries_reloc_on_exception_enabled = true; + return true; } delay = get_longbusy_msecs(rc); @@ -370,7 +425,7 @@ void pseries_enable_reloc_on_exc(void) pr_warn("Warning: Giving up waiting to enable " "relocation on exceptions (%u msec)!\n", total_delay); - return; + return false; } mdelay(delay); @@ -388,22 +443,14 @@ void pseries_disable_reloc_on_exc(void) break; mdelay(get_longbusy_msecs(rc)); } - if (rc != H_SUCCESS) + if (rc == H_SUCCESS) + pseries_reloc_on_exception_enabled = false; + else pr_warn("Warning: Failed to disable relocation on exceptions: %ld\n", rc); } EXPORT_SYMBOL(pseries_disable_reloc_on_exc); -#ifdef CONFIG_KEXEC_CORE -static void pSeries_machine_kexec(struct kimage *image) -{ - if (firmware_has_feature(FW_FEATURE_SET_MODE)) - pseries_disable_reloc_on_exc(); - - default_machine_kexec(image); -} -#endif - #ifdef __LITTLE_ENDIAN__ void pseries_big_endian_exceptions(void) { @@ -431,7 +478,7 @@ void pseries_big_endian_exceptions(void) panic("Could not enable big endian exceptions"); } -void pseries_little_endian_exceptions(void) +void __init pseries_little_endian_exceptions(void) { long rc; @@ -448,7 +495,7 @@ void pseries_little_endian_exceptions(void) } #endif -static void __init find_and_init_phbs(void) +static void __init pSeries_discover_phbs(void) { struct device_node *node; struct pci_controller *phb; @@ -466,6 +513,11 @@ static void __init find_and_init_phbs(void) pci_process_bridge_OF_ranges(phb, node, 0); isa_bridge_find_early(phb); phb->controller_ops = pseries_pci_controller_ops; + + /* create pci_dn's for DT nodes under this PHB */ + pci_devs_phb_init_dynamic(phb); + + pseries_msi_allocate_domains(phb); } of_node_put(root); @@ -504,24 +556,46 @@ static void init_cpu_char_feature_flags(struct h_cpu_char_result *result) if (result->character & H_CPU_CHAR_BCCTR_FLUSH_ASSIST) security_ftr_set(SEC_FTR_BCCTR_FLUSH_ASSIST); + if (result->character & H_CPU_CHAR_BCCTR_LINK_FLUSH_ASSIST) + security_ftr_set(SEC_FTR_BCCTR_LINK_FLUSH_ASSIST); + if (result->behaviour & H_CPU_BEHAV_FLUSH_COUNT_CACHE) security_ftr_set(SEC_FTR_FLUSH_COUNT_CACHE); + if (result->behaviour & H_CPU_BEHAV_FLUSH_LINK_STACK) + security_ftr_set(SEC_FTR_FLUSH_LINK_STACK); + /* * The features below are enabled by default, so we instead look to see * if firmware has *disabled* them, and clear them if so. + * H_CPU_BEHAV_FAVOUR_SECURITY_H could be set only if + * H_CPU_BEHAV_FAVOUR_SECURITY is. */ - if (!(result->behaviour & H_CPU_BEHAV_FAVOUR_SECURITY)) + if (!(result->behaviour & H_CPU_BEHAV_FAVOUR_SECURITY)) { security_ftr_clear(SEC_FTR_FAVOUR_SECURITY); + pseries_security_flavor = 0; + } else if (result->behaviour & H_CPU_BEHAV_FAVOUR_SECURITY_H) + pseries_security_flavor = 1; + else + pseries_security_flavor = 2; if (!(result->behaviour & H_CPU_BEHAV_L1D_FLUSH_PR)) security_ftr_clear(SEC_FTR_L1D_FLUSH_PR); + if (result->behaviour & H_CPU_BEHAV_NO_L1D_FLUSH_ENTRY) + security_ftr_clear(SEC_FTR_L1D_FLUSH_ENTRY); + + if (result->behaviour & H_CPU_BEHAV_NO_L1D_FLUSH_UACCESS) + security_ftr_clear(SEC_FTR_L1D_FLUSH_UACCESS); + + if (result->behaviour & H_CPU_BEHAV_NO_STF_BARRIER) + security_ftr_clear(SEC_FTR_STF_BARRIER); + if (!(result->behaviour & H_CPU_BEHAV_BNDS_CHK_SPEC_BAR)) security_ftr_clear(SEC_FTR_BNDS_CHK_SPEC_BAR); } -void pseries_setup_rfi_flush(void) +void pseries_setup_security_mitigations(void) { struct h_cpu_char_result result; enum l1d_flush_type types; @@ -558,6 +632,16 @@ void pseries_setup_rfi_flush(void) setup_rfi_flush(types, enable); setup_count_cache_flush(); + + enable = security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) && + security_ftr_enabled(SEC_FTR_L1D_FLUSH_ENTRY); + setup_entry_flush(enable); + + enable = security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) && + security_ftr_enabled(SEC_FTR_L1D_FLUSH_UACCESS); + setup_uaccess_flush(enable); + + setup_stf_barrier(); } #ifdef CONFIG_PCI_IOV @@ -576,8 +660,8 @@ enum get_iov_fw_value_index { WDW_SIZE = 3 /* Get Window Size */ }; -resource_size_t pseries_get_iov_fw_value(struct pci_dev *dev, int resno, - enum get_iov_fw_value_index value) +static resource_size_t pseries_get_iov_fw_value(struct pci_dev *dev, int resno, + enum get_iov_fw_value_index value) { const int *indexes; struct device_node *dn = pci_device_to_OF_node(dev); @@ -594,7 +678,7 @@ resource_size_t pseries_get_iov_fw_value(struct pci_dev *dev, int resno, */ num_res = of_read_number(&indexes[NUM_RES_PROPERTY], 1); if (resno >= num_res) - return 0; /* or an errror */ + return 0; /* or an error */ i = START_OF_ENTRIES + NEXT_ENTRY * resno; switch (value) { @@ -612,7 +696,7 @@ resource_size_t pseries_get_iov_fw_value(struct pci_dev *dev, int resno, return ret; } -void of_pci_set_vf_bar_size(struct pci_dev *dev, const int *indexes) +static void of_pci_set_vf_bar_size(struct pci_dev *dev, const int *indexes) { struct resource *res; resource_size_t base, size; @@ -634,7 +718,7 @@ void of_pci_set_vf_bar_size(struct pci_dev *dev, const int *indexes) } } -void of_pci_parse_iov_addrs(struct pci_dev *dev, const int *indexes) +static void of_pci_parse_iov_addrs(struct pci_dev *dev, const int *indexes) { struct resource *res, *root, *conflict; resource_size_t base, size; @@ -696,9 +780,9 @@ static void pseries_pci_fixup_iov_resources(struct pci_dev *pdev) const int *indexes; struct device_node *dn = pci_device_to_OF_node(pdev); - if (!pdev->is_physfn || pci_dev_is_added(pdev)) + if (!pdev->is_physfn) return; - /*Firmware must support open sriov otherwise dont configure*/ + /*Firmware must support open sriov otherwise don't configure*/ indexes = of_get_property(dn, "ibm,open-sriov-vf-bar-info", NULL); if (indexes) of_pci_parse_iov_addrs(pdev, indexes); @@ -732,6 +816,13 @@ static void __init pSeries_setup_arch(void) /* Discover PIC type and setup ppc_md accordingly */ smp_init_pseries(); + // Setup CPU hotplug callbacks + pseries_cpu_hotplug_init(); + + if (radix_enabled() && !mmu_has_feature(MMU_FTR_GTSE)) + if (!firmware_has_feature(FW_FEATURE_RPT_INVALIDATE)) + panic("BUG: Radix support requires either GTSE or RPT_INVALIDATE\n"); + /* openpic global configuration register (64-bit format). */ /* openpic Interrupt Source Unit pointer (64-bit format). */ @@ -742,22 +833,32 @@ static void __init pSeries_setup_arch(void) fwnmi_init(); - pseries_setup_rfi_flush(); - setup_stf_barrier(); - pseries_lpar_read_hblkrm_characteristics(); + pseries_setup_security_mitigations(); + if (!radix_enabled()) + pseries_lpar_read_hblkrm_characteristics(); /* By default, only probe PCI (can be overridden by rtas_pci) */ pci_add_flags(PCI_PROBE_ONLY); /* Find and initialize PCI host bridges */ init_pci_config_tokens(); - find_and_init_phbs(); of_reconfig_notifier_register(&pci_dn_reconfig_nb); pSeries_nvram_init(); if (firmware_has_feature(FW_FEATURE_LPAR)) { vpa_init(boot_cpuid); + + if (lppaca_shared_proc()) { + static_branch_enable(&shared_processor); + pv_spinlocks_init(); +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING + static_key_slow_inc(¶virt_steal_enabled); + if (steal_acc) + static_key_slow_inc(¶virt_steal_rq_enabled); +#endif + } + ppc_md.power_save = pseries_lpar_idle; ppc_md.enable_pmcs = pseries_lpar_enable_pmcs; #ifdef CONFIG_PCI_IOV @@ -774,9 +875,7 @@ static void __init pSeries_setup_arch(void) } ppc_md.pcibios_root_bridge_prepare = pseries_root_bridge_prepare; - - if (swiotlb_force == SWIOTLB_FORCE) - ppc_swiotlb_enable = 1; + pseries_rng_init(); } static void pseries_panic(char *str) @@ -815,12 +914,15 @@ static int pseries_set_xdabr(unsigned long dabr, unsigned long dabrx) return plpar_hcall_norets(H_SET_XDABR, dabr, dabrx); } -static int pseries_set_dawr(unsigned long dawr, unsigned long dawrx) +static int pseries_set_dawr(int nr, unsigned long dawr, unsigned long dawrx) { /* PAPR says we can't set HYP */ dawrx &= ~DAWRX_HYP; - return plpar_set_watchpoint0(dawr, dawrx); + if (nr == 0) + return plpar_set_watchpoint0(dawr, dawrx); + else + return plpar_set_watchpoint1(dawr, dawrx); } #define CMO_CHARACTERISTICS_TOKEN 44 @@ -840,30 +942,23 @@ void pSeries_coalesce_init(void) * fw_cmo_feature_init - FW_FEATURE_CMO is not stored in ibm,hypertas-functions, * handle that here. (Stolen from parse_system_parameter_string) */ -static void pSeries_cmo_feature_init(void) +static void __init pSeries_cmo_feature_init(void) { + static struct papr_sysparm_buf buf __initdata; + static_assert(sizeof(buf.val) >= CMO_MAXLENGTH); char *ptr, *key, *value, *end; - int call_status; int page_order = IOMMU_PAGE_SHIFT_4K; pr_debug(" -> fw_cmo_feature_init()\n"); - spin_lock(&rtas_data_buf_lock); - memset(rtas_data_buf, 0, RTAS_DATA_BUF_SIZE); - call_status = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1, - NULL, - CMO_CHARACTERISTICS_TOKEN, - __pa(rtas_data_buf), - RTAS_DATA_BUF_SIZE); - - if (call_status != 0) { - spin_unlock(&rtas_data_buf_lock); + + if (papr_sysparm_get(PAPR_SYSPARM_COOP_MEM_OVERCOMMIT_ATTRS, &buf)) { pr_debug("CMO not available\n"); pr_debug(" <- fw_cmo_feature_init()\n"); return; } - end = rtas_data_buf + CMO_MAXLENGTH - 2; - ptr = rtas_data_buf + 2; /* step over strlen value */ + end = &buf.val[CMO_MAXLENGTH]; + ptr = &buf.val[0]; key = value = ptr; while (*ptr && (ptr <= end)) { @@ -909,10 +1004,38 @@ static void pSeries_cmo_feature_init(void) } else pr_debug("CMO not enabled, PrPSP=%d, SecPSP=%d\n", CMO_PrPSP, CMO_SecPSP); - spin_unlock(&rtas_data_buf_lock); pr_debug(" <- fw_cmo_feature_init()\n"); } +static void __init pseries_add_hw_description(void) +{ + struct device_node *dn; + const char *s; + + dn = of_find_node_by_path("/openprom"); + if (dn) { + if (of_property_read_string(dn, "model", &s) == 0) + seq_buf_printf(&ppc_hw_desc, "of:%s ", s); + + of_node_put(dn); + } + + dn = of_find_node_by_path("/hypervisor"); + if (dn) { + if (of_property_read_string(dn, "compatible", &s) == 0) + seq_buf_printf(&ppc_hw_desc, "hv:%s ", s); + + of_node_put(dn); + return; + } + + dn = of_find_node_by_path("/"); + if (of_property_read_bool(dn, "ibm,powervm-partition") || + of_property_read_bool(dn, "ibm,fw-net-version")) + seq_buf_printf(&ppc_hw_desc, "hv:phyp "); + of_node_put(dn); +} + /* * Early initialization. Relocation is on but do not reference unbolted pages */ @@ -920,6 +1043,8 @@ static void __init pseries_init(void) { pr_debug(" -> pseries_init()\n"); + pseries_add_hw_description(); + #ifdef CONFIG_HVC_CONSOLE if (firmware_has_feature(FW_FEATURE_LPAR)) hvc_vio_init_early(); @@ -950,14 +1075,14 @@ static void __init pseries_init(void) static void pseries_power_off(void) { int rc; - int rtas_poweroff_ups_token = rtas_token("ibm,power-off-ups"); + int rtas_poweroff_ups_token = rtas_function_token(RTAS_FN_IBM_POWER_OFF_UPS); if (rtas_flash_term_hook) rtas_flash_term_hook(SYS_POWER_OFF); if (rtas_poweron_auto == 0 || rtas_poweroff_ups_token == RTAS_UNKNOWN_SERVICE) { - rc = rtas_call(rtas_token("power-off"), 2, 1, NULL, -1, -1); + rc = rtas_call(rtas_function_token(RTAS_FN_POWER_OFF), 2, 1, NULL, -1, -1); printk(KERN_INFO "RTAS power-off returned %d\n", rc); } else { rc = rtas_call(rtas_poweroff_ups_token, 0, 1, NULL); @@ -968,7 +1093,11 @@ static void pseries_power_off(void) static int __init pSeries_probe(void) { - if (!of_node_is_type(of_root, "chrp")) + struct device_node *root = of_find_node_by_path("/"); + bool ret = of_node_is_type(root, "chrp"); + + of_node_put(root); + if (!ret) return 0; /* Cell blades firmware claims to be chrp while it's not. Until this @@ -995,8 +1124,18 @@ static int pSeries_pci_probe_mode(struct pci_bus *bus) return PCI_PROBE_NORMAL; } +#ifdef CONFIG_MEMORY_HOTPLUG +static unsigned long pseries_memory_block_size(void) +{ + return memory_block_size; +} +#endif + struct pci_controller_ops pseries_pci_controller_ops = { .probe_mode = pSeries_pci_probe_mode, +#ifdef CONFIG_SPAPR_TCE_IOMMU + .device_group = pSeries_pci_device_group, +#endif }; define_machine(pseries) { @@ -1006,6 +1145,7 @@ define_machine(pseries) { .init_IRQ = pseries_init_irq, .show_cpuinfo = pSeries_show_cpuinfo, .log_error = pSeries_log_error, + .discover_phbs = pSeries_discover_phbs, .pcibios_fixup = pSeries_final_fixup, .restart = rtas_restart, .halt = rtas_halt, @@ -1013,16 +1153,16 @@ define_machine(pseries) { .get_boot_time = rtas_get_boot_time, .get_rtc_time = rtas_get_rtc_time, .set_rtc_time = rtas_set_rtc_time, - .calibrate_decr = generic_calibrate_decr, .progress = rtas_progress, .system_reset_exception = pSeries_system_reset_exception, .machine_check_early = pseries_machine_check_realmode, .machine_check_exception = pSeries_machine_check_exception, + .machine_check_log_err = pSeries_machine_check_log_err, #ifdef CONFIG_KEXEC_CORE - .machine_kexec = pSeries_machine_kexec, + .machine_kexec = pseries_machine_kexec, .kexec_cpu_down = pseries_kexec_cpu_down, #endif -#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE +#ifdef CONFIG_MEMORY_HOTPLUG .memory_block_size = pseries_memory_block_size, #endif }; diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c index ad61e90032da..c597711ef20a 100644 --- a/arch/powerpc/platforms/pseries/smp.c +++ b/arch/powerpc/platforms/pseries/smp.c @@ -20,14 +20,13 @@ #include <linux/err.h> #include <linux/device.h> #include <linux/cpu.h> +#include <linux/pgtable.h> #include <asm/ptrace.h> #include <linux/atomic.h> #include <asm/irq.h> #include <asm/page.h> -#include <asm/pgtable.h> #include <asm/io.h> -#include <asm/prom.h> #include <asm/smp.h> #include <asm/paca.h> #include <asm/machdep.h> @@ -42,10 +41,9 @@ #include <asm/plpar_wrappers.h> #include <asm/code-patching.h> #include <asm/svm.h> +#include <asm/kvm_guest.h> #include "pseries.h" -#include "offline_states.h" - /* * The Primary thread of each non-boot processor was started from the OF client @@ -57,7 +55,7 @@ static cpumask_var_t of_spin_mask; int smp_query_cpu_stopped(unsigned int pcpu) { int cpu_status, status; - int qcss_tok = rtas_token("query-cpu-stopped-state"); + int qcss_tok = rtas_function_token(RTAS_FN_QUERY_CPU_STOPPED_STATE); if (qcss_tok == RTAS_UNKNOWN_SERVICE) { printk_once(KERN_INFO @@ -106,17 +104,11 @@ static inline int smp_startup_cpu(unsigned int lcpu) return 1; } - /* Fixup atomic count: it exited inside IRQ handler. */ - task_thread_info(paca_ptrs[lcpu]->__current)->preempt_count = 0; -#ifdef CONFIG_HOTPLUG_CPU - if (get_cpu_current_state(lcpu) == CPU_STATE_INACTIVE) - goto out; -#endif /* * If the RTAS start-cpu token does not exist then presume the * cpu is already spinning. */ - start_cpu = rtas_token("start-cpu"); + start_cpu = rtas_function_token(RTAS_FN_START_CPU); if (start_cpu == RTAS_UNKNOWN_SERVICE) return 1; @@ -126,9 +118,6 @@ static inline int smp_startup_cpu(unsigned int lcpu) return 0; } -#ifdef CONFIG_HOTPLUG_CPU -out: -#endif return 1; } @@ -143,10 +132,6 @@ static void smp_setup_cpu(int cpu) vpa_init(cpu); cpumask_clear_cpu(cpu, of_spin_mask); -#ifdef CONFIG_HOTPLUG_CPU - set_cpu_current_state(cpu, CPU_STATE_ONLINE); - set_default_offline_state(cpu); -#endif } static int smp_pSeries_kick_cpu(int nr) @@ -163,20 +148,6 @@ static int smp_pSeries_kick_cpu(int nr) * the processor will continue on to secondary_start */ paca_ptrs[nr]->cpu_start = 1; -#ifdef CONFIG_HOTPLUG_CPU - set_preferred_offline_state(nr, CPU_STATE_ONLINE); - - if (get_cpu_current_state(nr) == CPU_STATE_INACTIVE) { - long rc; - unsigned long hcpuid; - - hcpuid = get_hard_smp_processor_id(nr); - rc = plpar_hcall_norets(H_PROD, hcpuid); - if (rc != H_SUCCESS) - printk(KERN_ERR "Error: Prod to wake up processor %d " - "Ret= %ld\n", nr, rc); - } -#endif return 0; } @@ -188,13 +159,16 @@ static int pseries_smp_prepare_cpu(int cpu) return 0; } -static void smp_pseries_cause_ipi(int cpu) +/* Cause IPI as setup by the interrupt controller (xics or xive) */ +static void (*ic_cause_ipi)(int cpu) __ro_after_init; + +/* Use msgsndp doorbells target is a sibling, else use interrupt controller */ +static void dbell_or_ic_cause_ipi(int cpu) { - /* POWER9 should not use this handler */ if (doorbell_try_core_ipi(cpu)) return; - icp_ops->cause_ipi(cpu); + ic_cause_ipi(cpu); } static int pseries_cause_nmi_ipi(int cpu) @@ -218,26 +192,51 @@ static int pseries_cause_nmi_ipi(int cpu) return 0; } -static __init void pSeries_smp_probe_xics(void) -{ - xics_smp_probe(); - - if (cpu_has_feature(CPU_FTR_DBELL) && !is_secure_guest()) - smp_ops->cause_ipi = smp_pseries_cause_ipi; - else - smp_ops->cause_ipi = icp_ops->cause_ipi; -} - static __init void pSeries_smp_probe(void) { if (xive_enabled()) - /* - * Don't use P9 doorbells when XIVE is enabled. IPIs - * using MMIOs should be faster - */ xive_smp_probe(); else - pSeries_smp_probe_xics(); + xics_smp_probe(); + + /* No doorbell facility, must use the interrupt controller for IPIs */ + if (!cpu_has_feature(CPU_FTR_DBELL)) + return; + + /* Doorbells can only be used for IPIs between SMT siblings */ + if (!cpu_has_feature(CPU_FTR_SMT)) + return; + + check_kvm_guest(); + + if (is_kvm_guest()) { + /* + * KVM emulates doorbells by disabling FSCR[MSGP] so msgsndp + * faults to the hypervisor which then reads the instruction + * from guest memory, which tends to be slower than using XIVE. + */ + if (xive_enabled()) + return; + + /* + * XICS hcalls aren't as fast, so we can use msgsndp (which + * also helps exercise KVM emulation), however KVM can't + * emulate secure guests because it can't read the instruction + * out of their memory. + */ + if (is_secure_guest()) + return; + } + + /* + * Under PowerVM, FSCR[MSGP] is enabled as guest vCPU siblings are + * gang scheduled on the same physical core, so doorbells are always + * faster than the interrupt controller, and they can be used by + * secure guests. + */ + + ic_cause_ipi = smp_ops->cause_ipi; + smp_ops->cause_ipi = dbell_or_ic_cause_ipi; } static struct smp_ops_t pseries_smp_ops = { @@ -267,7 +266,7 @@ void __init smp_init_pseries(void) * We know prom_init will not have started them if RTAS supports * query-cpu-stopped-state. */ - if (rtas_token("query-cpu-stopped-state") == RTAS_UNKNOWN_SERVICE) { + if (rtas_function_token(RTAS_FN_QUERY_CPU_STOPPED_STATE) == RTAS_UNKNOWN_SERVICE) { if (cpu_has_feature(CPU_FTR_SMT)) { for_each_present_cpu(i) { if (cpu_thread_in_core(i) == 0) @@ -279,11 +278,5 @@ void __init smp_init_pseries(void) cpumask_clear_cpu(boot_cpuid, of_spin_mask); } - /* Non-lpar has additional take/give timebase */ - if (rtas_token("freeze-time-base") != RTAS_UNKNOWN_SERVICE) { - smp_ops->give_timebase = rtas_give_timebase; - smp_ops->take_timebase = rtas_take_timebase; - } - pr_debug(" <- smp_init_pSeries()\n"); } diff --git a/arch/powerpc/platforms/pseries/suspend.c b/arch/powerpc/platforms/pseries/suspend.c index 0a24a5a185f0..382003dfdb9a 100644 --- a/arch/powerpc/platforms/pseries/suspend.c +++ b/arch/powerpc/platforms/pseries/suspend.c @@ -13,13 +13,9 @@ #include <asm/mmu.h> #include <asm/rtas.h> #include <asm/topology.h> -#include "../../kernel/cacheinfo.h" +#include "pseries.h" -static u64 stream_id; static struct device suspend_dev; -static DECLARE_COMPLETION(suspend_work); -static struct rtas_suspend_me_data suspend_data; -static atomic_t suspending; /** * pseries_suspend_begin - First phase of hibernation @@ -29,7 +25,7 @@ static atomic_t suspending; * Return value: * 0 on success / other on failure **/ -static int pseries_suspend_begin(suspend_state_t state) +static int pseries_suspend_begin(u64 stream_id) { long vasi_state, rc; unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; @@ -49,41 +45,10 @@ static int pseries_suspend_begin(suspend_state_t state) vasi_state); return -EIO; } - - return 0; -} - -/** - * pseries_suspend_cpu - Suspend a single CPU - * - * Makes the H_JOIN call to suspend the CPU - * - **/ -static int pseries_suspend_cpu(void) -{ - if (atomic_read(&suspending)) - return rtas_suspend_cpu(&suspend_data); return 0; } /** - * pseries_suspend_enable_irqs - * - * Post suspend configuration updates - * - **/ -static void pseries_suspend_enable_irqs(void) -{ - /* - * Update configuration which can be modified based on device tree - * changes during resume. - */ - cacheinfo_cpu_offline(smp_processor_id()); - post_mobility_fixup(); - cacheinfo_cpu_online(smp_processor_id()); -} - -/** * pseries_suspend_enter - Final phase of hibernation * * Return value: @@ -91,28 +56,7 @@ static void pseries_suspend_enable_irqs(void) **/ static int pseries_suspend_enter(suspend_state_t state) { - int rc = rtas_suspend_last_cpu(&suspend_data); - - atomic_set(&suspending, 0); - atomic_set(&suspend_data.done, 1); - return rc; -} - -/** - * pseries_prepare_late - Prepare to suspend all other CPUs - * - * Return value: - * 0 on success / other on failure - **/ -static int pseries_prepare_late(void) -{ - atomic_set(&suspending, 1); - atomic_set(&suspend_data.working, 0); - atomic_set(&suspend_data.done, 0); - atomic_set(&suspend_data.error, 0); - suspend_data.complete = &suspend_work; - reinit_completion(&suspend_work); - return 0; + return rtas_ibm_suspend_me(NULL); } /** @@ -132,50 +76,29 @@ static ssize_t store_hibernate(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - cpumask_var_t offline_mask; + u64 stream_id; int rc; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!alloc_cpumask_var(&offline_mask, GFP_KERNEL)) - return -ENOMEM; - stream_id = simple_strtoul(buf, NULL, 16); do { - rc = pseries_suspend_begin(PM_SUSPEND_MEM); + rc = pseries_suspend_begin(stream_id); if (rc == -EAGAIN) ssleep(1); } while (rc == -EAGAIN); - if (!rc) { - /* All present CPUs must be online */ - cpumask_andnot(offline_mask, cpu_present_mask, - cpu_online_mask); - rc = rtas_online_cpus_mask(offline_mask); - if (rc) { - pr_err("%s: Could not bring present CPUs online.\n", - __func__); - goto out; - } - - stop_topology_update(); + if (!rc) rc = pm_suspend(PM_SUSPEND_MEM); - start_topology_update(); - /* Take down CPUs not online prior to suspend */ - if (!rtas_offline_cpus_mask(offline_mask)) - pr_warn("%s: Could not restore CPUs to offline " - "state.\n", __func__); + if (!rc) { + rc = count; + post_mobility_fixup(); } - stream_id = 0; - if (!rc) - rc = count; -out: - free_cpumask_var(offline_mask); return rc; } @@ -210,8 +133,6 @@ static struct bus_type suspend_subsys = { static const struct platform_suspend_ops pseries_suspend_ops = { .valid = suspend_valid_only_mem, - .begin = pseries_suspend_begin, - .prepare_late = pseries_prepare_late, .enter = pseries_suspend_enter, }; @@ -223,6 +144,7 @@ static const struct platform_suspend_ops pseries_suspend_ops = { **/ static int pseries_suspend_sysfs_register(struct device *dev) { + struct device *dev_root; int rc; if ((rc = subsys_system_register(&suspend_subsys, NULL))) @@ -231,8 +153,13 @@ static int pseries_suspend_sysfs_register(struct device *dev) dev->id = 0; dev->bus = &suspend_subsys; - if ((rc = device_create_file(suspend_subsys.dev_root, &dev_attr_hibernate))) - goto subsys_unregister; + dev_root = bus_get_dev_root(&suspend_subsys); + if (dev_root) { + rc = device_create_file(dev_root, &dev_attr_hibernate); + put_device(dev_root); + if (rc) + goto subsys_unregister; + } return 0; @@ -254,15 +181,9 @@ static int __init pseries_suspend_init(void) if (!firmware_has_feature(FW_FEATURE_LPAR)) return 0; - suspend_data.token = rtas_token("ibm,suspend-me"); - if (suspend_data.token == RTAS_UNKNOWN_SERVICE) - return 0; - if ((rc = pseries_suspend_sysfs_register(&suspend_dev))) return rc; - ppc_md.suspend_disable_cpu = pseries_suspend_cpu; - ppc_md.suspend_enable_irqs = pseries_suspend_enable_irqs; suspend_set_ops(&pseries_suspend_ops); return 0; } diff --git a/arch/powerpc/platforms/pseries/svm.c b/arch/powerpc/platforms/pseries/svm.c index 40c0637203d5..3b4045d508ec 100644 --- a/arch/powerpc/platforms/pseries/svm.c +++ b/arch/powerpc/platforms/pseries/svm.c @@ -7,10 +7,13 @@ */ #include <linux/mm.h> +#include <linux/memblock.h> +#include <linux/cc_platform.h> #include <asm/machdep.h> #include <asm/svm.h> #include <asm/swiotlb.h> #include <asm/ultravisor.h> +#include <asm/dtl.h> static int __init init_svm(void) { @@ -25,7 +28,7 @@ static int __init init_svm(void) * need to use the SWIOTLB buffer for DMA even if dma_capable() says * otherwise. */ - swiotlb_force = SWIOTLB_FORCE; + ppc_swiotlb_flags |= SWIOTLB_ANY | SWIOTLB_FORCE; /* Share the SWIOTLB buffer with the host. */ swiotlb_update_mem_attributes(); @@ -36,6 +39,9 @@ machine_early_initcall(pseries, init_svm); int set_memory_encrypted(unsigned long addr, int numpages) { + if (!cc_platform_has(CC_ATTR_MEM_ENCRYPT)) + return 0; + if (!PAGE_ALIGNED(addr)) return -EINVAL; @@ -46,6 +52,9 @@ int set_memory_encrypted(unsigned long addr, int numpages) int set_memory_decrypted(unsigned long addr, int numpages) { + if (!cc_platform_has(CC_ATTR_MEM_ENCRYPT)) + return 0; + if (!PAGE_ALIGNED(addr)) return -EINVAL; diff --git a/arch/powerpc/platforms/pseries/vas-sysfs.c b/arch/powerpc/platforms/pseries/vas-sysfs.c new file mode 100644 index 000000000000..f9f682724e77 --- /dev/null +++ b/arch/powerpc/platforms/pseries/vas-sysfs.c @@ -0,0 +1,281 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright 2022-23 IBM Corp. + */ + +#define pr_fmt(fmt) "vas: " fmt + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/miscdevice.h> +#include <linux/kobject.h> +#include <linux/slab.h> +#include <linux/mm.h> + +#include "vas.h" + +#ifdef CONFIG_SYSFS +static struct kobject *pseries_vas_kobj; +static struct kobject *gzip_caps_kobj; + +struct vas_caps_entry { + struct kobject kobj; + struct vas_cop_feat_caps *caps; +}; + +#define to_caps_entry(entry) container_of(entry, struct vas_caps_entry, kobj) + +/* + * This function is used to get the notification from the drmgr when + * QoS credits are changed. + */ +static ssize_t update_total_credits_store(struct vas_cop_feat_caps *caps, + const char *buf, size_t count) +{ + int err; + u16 creds; + + err = kstrtou16(buf, 0, &creds); + /* + * The user space interface from the management console + * notifies OS with the new QoS credits and then the + * hypervisor. So OS has to use this new credits value + * and reconfigure VAS windows (close or reopen depends + * on the credits available) instead of depending on VAS + * QoS capabilities from the hypervisor. + */ + if (!err) + err = vas_reconfig_capabilties(caps->win_type, creds); + + if (err) + return -EINVAL; + + pr_info("Set QoS total credits %u\n", creds); + + return count; +} + +#define sysfs_caps_entry_read(_name) \ +static ssize_t _name##_show(struct vas_cop_feat_caps *caps, char *buf) \ +{ \ + return sprintf(buf, "%d\n", atomic_read(&caps->_name)); \ +} + +struct vas_sysfs_entry { + struct attribute attr; + ssize_t (*show)(struct vas_cop_feat_caps *, char *); + ssize_t (*store)(struct vas_cop_feat_caps *, const char *, size_t); +}; + +#define VAS_ATTR_RO(_name) \ + sysfs_caps_entry_read(_name); \ + static struct vas_sysfs_entry _name##_attribute = __ATTR(_name, \ + 0444, _name##_show, NULL); + +/* + * Create sysfs interface: + * /sys/devices/virtual/misc/vas/vas0/gzip/default_capabilities + * This directory contains the following VAS GZIP capabilities + * for the default credit type. + * /sys/devices/virtual/misc/vas/vas0/gzip/default_capabilities/nr_total_credits + * Total number of default credits assigned to the LPAR which + * can be changed with DLPAR operation. + * /sys/devices/virtual/misc/vas/vas0/gzip/default_capabilities/nr_used_credits + * Number of credits used by the user space. One credit will + * be assigned for each window open. + * + * /sys/devices/virtual/misc/vas/vas0/gzip/qos_capabilities + * This directory contains the following VAS GZIP capabilities + * for the Quality of Service (QoS) credit type. + * /sys/devices/virtual/misc/vas/vas0/gzip/qos_capabilities/nr_total_credits + * Total number of QoS credits assigned to the LPAR. The user + * has to define this value using HMC interface. It can be + * changed dynamically by the user. + * /sys/devices/virtual/misc/vas/vas0/gzip/qos_capabilities/nr_used_credits + * Number of credits used by the user space. + * /sys/devices/virtual/misc/vas/vas0/gzip/qos_capabilities/update_total_credits + * Update total QoS credits dynamically + */ + +VAS_ATTR_RO(nr_total_credits); +VAS_ATTR_RO(nr_used_credits); + +static struct vas_sysfs_entry update_total_credits_attribute = + __ATTR(update_total_credits, 0200, NULL, update_total_credits_store); + +static struct attribute *vas_def_capab_attrs[] = { + &nr_total_credits_attribute.attr, + &nr_used_credits_attribute.attr, + NULL, +}; +ATTRIBUTE_GROUPS(vas_def_capab); + +static struct attribute *vas_qos_capab_attrs[] = { + &nr_total_credits_attribute.attr, + &nr_used_credits_attribute.attr, + &update_total_credits_attribute.attr, + NULL, +}; +ATTRIBUTE_GROUPS(vas_qos_capab); + +static ssize_t vas_type_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct vas_caps_entry *centry; + struct vas_cop_feat_caps *caps; + struct vas_sysfs_entry *entry; + + centry = to_caps_entry(kobj); + caps = centry->caps; + entry = container_of(attr, struct vas_sysfs_entry, attr); + + if (!entry->show) + return -EIO; + + return entry->show(caps, buf); +} + +static ssize_t vas_type_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + struct vas_caps_entry *centry; + struct vas_cop_feat_caps *caps; + struct vas_sysfs_entry *entry; + + centry = to_caps_entry(kobj); + caps = centry->caps; + entry = container_of(attr, struct vas_sysfs_entry, attr); + if (!entry->store) + return -EIO; + + return entry->store(caps, buf, count); +} + +static void vas_type_release(struct kobject *kobj) +{ + struct vas_caps_entry *centry = to_caps_entry(kobj); + kfree(centry); +} + +static const struct sysfs_ops vas_sysfs_ops = { + .show = vas_type_show, + .store = vas_type_store, +}; + +static struct kobj_type vas_def_attr_type = { + .release = vas_type_release, + .sysfs_ops = &vas_sysfs_ops, + .default_groups = vas_def_capab_groups, +}; + +static struct kobj_type vas_qos_attr_type = { + .release = vas_type_release, + .sysfs_ops = &vas_sysfs_ops, + .default_groups = vas_qos_capab_groups, +}; + +static char *vas_caps_kobj_name(struct vas_caps_entry *centry, + struct kobject **kobj) +{ + struct vas_cop_feat_caps *caps = centry->caps; + + if (caps->descriptor == VAS_GZIP_QOS_CAPABILITIES) { + kobject_init(¢ry->kobj, &vas_qos_attr_type); + *kobj = gzip_caps_kobj; + return "qos_capabilities"; + } else if (caps->descriptor == VAS_GZIP_DEFAULT_CAPABILITIES) { + kobject_init(¢ry->kobj, &vas_def_attr_type); + *kobj = gzip_caps_kobj; + return "default_capabilities"; + } else + return "Unknown"; +} + +/* + * Add feature specific capability dir entry. + * Ex: VDefGzip or VQosGzip + */ +int sysfs_add_vas_caps(struct vas_cop_feat_caps *caps) +{ + struct vas_caps_entry *centry; + struct kobject *kobj = NULL; + int ret = 0; + char *name; + + centry = kzalloc(sizeof(*centry), GFP_KERNEL); + if (!centry) + return -ENOMEM; + + centry->caps = caps; + name = vas_caps_kobj_name(centry, &kobj); + + if (kobj) { + ret = kobject_add(¢ry->kobj, kobj, "%s", name); + + if (ret) { + pr_err("VAS: sysfs kobject add / event failed %d\n", + ret); + kobject_put(¢ry->kobj); + } + } + + return ret; +} + +static struct miscdevice vas_miscdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "vas", +}; + +/* + * Add VAS and VasCaps (overall capabilities) dir entries. + */ +int __init sysfs_pseries_vas_init(struct vas_all_caps *vas_caps) +{ + int ret; + + ret = misc_register(&vas_miscdev); + if (ret < 0) { + pr_err("%s: register vas misc device failed\n", __func__); + return ret; + } + + /* + * The hypervisor does not expose multiple VAS instances, but can + * see multiple VAS instances on PowerNV. So create 'vas0' directory + * on pseries. + */ + pseries_vas_kobj = kobject_create_and_add("vas0", + &vas_miscdev.this_device->kobj); + if (!pseries_vas_kobj) { + misc_deregister(&vas_miscdev); + pr_err("Failed to create VAS sysfs entry\n"); + return -ENOMEM; + } + + if ((vas_caps->feat_type & VAS_GZIP_QOS_FEAT_BIT) || + (vas_caps->feat_type & VAS_GZIP_DEF_FEAT_BIT)) { + gzip_caps_kobj = kobject_create_and_add("gzip", + pseries_vas_kobj); + if (!gzip_caps_kobj) { + pr_err("Failed to create VAS GZIP capability entry\n"); + kobject_put(pseries_vas_kobj); + misc_deregister(&vas_miscdev); + return -ENOMEM; + } + } + + return 0; +} + +#else +int sysfs_add_vas_caps(struct vas_cop_feat_caps *caps) +{ + return 0; +} + +int __init sysfs_pseries_vas_init(struct vas_all_caps *vas_caps) +{ + return 0; +} +#endif diff --git a/arch/powerpc/platforms/pseries/vas.c b/arch/powerpc/platforms/pseries/vas.c new file mode 100644 index 000000000000..71d52a670d95 --- /dev/null +++ b/arch/powerpc/platforms/pseries/vas.c @@ -0,0 +1,1121 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright 2020-21 IBM Corp. + */ + +#define pr_fmt(fmt) "vas: " fmt + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/export.h> +#include <linux/types.h> +#include <linux/delay.h> +#include <linux/slab.h> +#include <linux/interrupt.h> +#include <linux/irqdomain.h> +#include <asm/machdep.h> +#include <asm/hvcall.h> +#include <asm/plpar_wrappers.h> +#include <asm/firmware.h> +#include <asm/vphn.h> +#include <asm/vas.h> +#include "vas.h" + +#define VAS_INVALID_WIN_ADDRESS 0xFFFFFFFFFFFFFFFFul +#define VAS_DEFAULT_DOMAIN_ID 0xFFFFFFFFFFFFFFFFul +/* The hypervisor allows one credit per window right now */ +#define DEF_WIN_CREDS 1 + +static struct vas_all_caps caps_all; +static bool copypaste_feat; +static struct hv_vas_cop_feat_caps hv_cop_caps; + +static struct vas_caps vascaps[VAS_MAX_FEAT_TYPE]; +static DEFINE_MUTEX(vas_pseries_mutex); +static bool migration_in_progress; + +static long hcall_return_busy_check(long rc) +{ + /* Check if we are stalled for some time */ + if (H_IS_LONG_BUSY(rc)) { + msleep(get_longbusy_msecs(rc)); + rc = H_BUSY; + } else if (rc == H_BUSY) { + cond_resched(); + } + + return rc; +} + +/* + * Allocate VAS window hcall + */ +static int h_allocate_vas_window(struct pseries_vas_window *win, u64 *domain, + u8 wintype, u16 credits) +{ + long retbuf[PLPAR_HCALL9_BUFSIZE] = {0}; + long rc; + + do { + rc = plpar_hcall9(H_ALLOCATE_VAS_WINDOW, retbuf, wintype, + credits, domain[0], domain[1], domain[2], + domain[3], domain[4], domain[5]); + + rc = hcall_return_busy_check(rc); + } while (rc == H_BUSY); + + if (rc == H_SUCCESS) { + if (win->win_addr == VAS_INVALID_WIN_ADDRESS) { + pr_err("H_ALLOCATE_VAS_WINDOW: COPY/PASTE is not supported\n"); + return -ENOTSUPP; + } + win->vas_win.winid = retbuf[0]; + win->win_addr = retbuf[1]; + win->complete_irq = retbuf[2]; + win->fault_irq = retbuf[3]; + return 0; + } + + pr_err("H_ALLOCATE_VAS_WINDOW error: %ld, wintype: %u, credits: %u\n", + rc, wintype, credits); + + return -EIO; +} + +/* + * Deallocate VAS window hcall. + */ +static int h_deallocate_vas_window(u64 winid) +{ + long rc; + + do { + rc = plpar_hcall_norets(H_DEALLOCATE_VAS_WINDOW, winid); + + rc = hcall_return_busy_check(rc); + } while (rc == H_BUSY); + + if (rc == H_SUCCESS) + return 0; + + pr_err("H_DEALLOCATE_VAS_WINDOW error: %ld, winid: %llu\n", + rc, winid); + return -EIO; +} + +/* + * Modify VAS window. + * After the window is opened with allocate window hcall, configure it + * with flags and LPAR PID before using. + */ +static int h_modify_vas_window(struct pseries_vas_window *win) +{ + long rc; + + /* + * AMR value is not supported in Linux VAS implementation. + * The hypervisor ignores it if 0 is passed. + */ + do { + rc = plpar_hcall_norets(H_MODIFY_VAS_WINDOW, + win->vas_win.winid, win->pid, 0, + VAS_MOD_WIN_FLAGS, 0); + + rc = hcall_return_busy_check(rc); + } while (rc == H_BUSY); + + if (rc == H_SUCCESS) + return 0; + + pr_err("H_MODIFY_VAS_WINDOW error: %ld, winid %u pid %u\n", + rc, win->vas_win.winid, win->pid); + return -EIO; +} + +/* + * This hcall is used to determine the capabilities from the hypervisor. + * @hcall: H_QUERY_VAS_CAPABILITIES or H_QUERY_NX_CAPABILITIES + * @query_type: If 0 is passed, the hypervisor returns the overall + * capabilities which provides all feature(s) that are + * available. Then query the hypervisor to get the + * corresponding capabilities for the specific feature. + * Example: H_QUERY_VAS_CAPABILITIES provides VAS GZIP QoS + * and VAS GZIP Default capabilities. + * H_QUERY_NX_CAPABILITIES provides NX GZIP + * capabilities. + * @result: Return buffer to save capabilities. + */ +int h_query_vas_capabilities(const u64 hcall, u8 query_type, u64 result) +{ + long rc; + + rc = plpar_hcall_norets(hcall, query_type, result); + + if (rc == H_SUCCESS) + return 0; + + /* H_FUNCTION means HV does not support VAS so don't print an error */ + if (rc != H_FUNCTION) { + pr_err("%s error %ld, query_type %u, result buffer 0x%llx\n", + (hcall == H_QUERY_VAS_CAPABILITIES) ? + "H_QUERY_VAS_CAPABILITIES" : + "H_QUERY_NX_CAPABILITIES", + rc, query_type, result); + } + + return -EIO; +} +EXPORT_SYMBOL_GPL(h_query_vas_capabilities); + +/* + * hcall to get fault CRB from the hypervisor. + */ +static int h_get_nx_fault(u32 winid, u64 buffer) +{ + long rc; + + rc = plpar_hcall_norets(H_GET_NX_FAULT, winid, buffer); + + if (rc == H_SUCCESS) + return 0; + + pr_err("H_GET_NX_FAULT error: %ld, winid %u, buffer 0x%llx\n", + rc, winid, buffer); + return -EIO; + +} + +/* + * Handle the fault interrupt. + * When the fault interrupt is received for each window, query the + * hypervisor to get the fault CRB on the specific fault. Then + * process the CRB by updating CSB or send signal if the user space + * CSB is invalid. + * Note: The hypervisor forwards an interrupt for each fault request. + * So one fault CRB to process for each H_GET_NX_FAULT hcall. + */ +static irqreturn_t pseries_vas_fault_thread_fn(int irq, void *data) +{ + struct pseries_vas_window *txwin = data; + struct coprocessor_request_block crb; + struct vas_user_win_ref *tsk_ref; + int rc; + + while (atomic_read(&txwin->pending_faults)) { + rc = h_get_nx_fault(txwin->vas_win.winid, (u64)virt_to_phys(&crb)); + if (!rc) { + tsk_ref = &txwin->vas_win.task_ref; + vas_dump_crb(&crb); + vas_update_csb(&crb, tsk_ref); + } + atomic_dec(&txwin->pending_faults); + } + + return IRQ_HANDLED; +} + +/* + * irq_default_primary_handler() can be used only with IRQF_ONESHOT + * which disables IRQ before executing the thread handler and enables + * it after. But this disabling interrupt sets the VAS IRQ OFF + * state in the hypervisor. If the NX generates fault interrupt + * during this window, the hypervisor will not deliver this + * interrupt to the LPAR. So use VAS specific IRQ handler instead + * of calling the default primary handler. + */ +static irqreturn_t pseries_vas_irq_handler(int irq, void *data) +{ + struct pseries_vas_window *txwin = data; + + /* + * The thread hanlder will process this interrupt if it is + * already running. + */ + atomic_inc(&txwin->pending_faults); + + return IRQ_WAKE_THREAD; +} + +/* + * Allocate window and setup IRQ mapping. + */ +static int allocate_setup_window(struct pseries_vas_window *txwin, + u64 *domain, u8 wintype) +{ + int rc; + + rc = h_allocate_vas_window(txwin, domain, wintype, DEF_WIN_CREDS); + if (rc) + return rc; + /* + * On PowerVM, the hypervisor setup and forwards the fault + * interrupt per window. So the IRQ setup and fault handling + * will be done for each open window separately. + */ + txwin->fault_virq = irq_create_mapping(NULL, txwin->fault_irq); + if (!txwin->fault_virq) { + pr_err("Failed irq mapping %d\n", txwin->fault_irq); + rc = -EINVAL; + goto out_win; + } + + txwin->name = kasprintf(GFP_KERNEL, "vas-win-%d", + txwin->vas_win.winid); + if (!txwin->name) { + rc = -ENOMEM; + goto out_irq; + } + + rc = request_threaded_irq(txwin->fault_virq, + pseries_vas_irq_handler, + pseries_vas_fault_thread_fn, 0, + txwin->name, txwin); + if (rc) { + pr_err("VAS-Window[%d]: Request IRQ(%u) failed with %d\n", + txwin->vas_win.winid, txwin->fault_virq, rc); + goto out_free; + } + + txwin->vas_win.wcreds_max = DEF_WIN_CREDS; + + return 0; +out_free: + kfree(txwin->name); +out_irq: + irq_dispose_mapping(txwin->fault_virq); +out_win: + h_deallocate_vas_window(txwin->vas_win.winid); + return rc; +} + +static inline void free_irq_setup(struct pseries_vas_window *txwin) +{ + free_irq(txwin->fault_virq, txwin); + kfree(txwin->name); + irq_dispose_mapping(txwin->fault_virq); +} + +static struct vas_window *vas_allocate_window(int vas_id, u64 flags, + enum vas_cop_type cop_type) +{ + long domain[PLPAR_HCALL9_BUFSIZE] = {VAS_DEFAULT_DOMAIN_ID}; + struct vas_cop_feat_caps *cop_feat_caps; + struct vas_caps *caps; + struct pseries_vas_window *txwin; + int rc; + + txwin = kzalloc(sizeof(*txwin), GFP_KERNEL); + if (!txwin) + return ERR_PTR(-ENOMEM); + + /* + * A VAS window can have many credits which means that many + * requests can be issued simultaneously. But the hypervisor + * restricts one credit per window. + * The hypervisor introduces 2 different types of credits: + * Default credit type (Uses normal priority FIFO): + * A limited number of credits are assigned to partitions + * based on processor entitlement. But these credits may be + * over-committed on a system depends on whether the CPUs + * are in shared or dedicated modes - that is, more requests + * may be issued across the system than NX can service at + * once which can result in paste command failure (RMA_busy). + * Then the process has to resend requests or fall-back to + * SW compression. + * Quality of Service (QoS) credit type (Uses high priority FIFO): + * To avoid NX HW contention, the system admins can assign + * QoS credits for each LPAR so that this partition is + * guaranteed access to NX resources. These credits are + * assigned to partitions via the HMC. + * Refer PAPR for more information. + * + * Allocate window with QoS credits if user requested. Otherwise + * default credits are used. + */ + if (flags & VAS_TX_WIN_FLAG_QOS_CREDIT) + caps = &vascaps[VAS_GZIP_QOS_FEAT_TYPE]; + else + caps = &vascaps[VAS_GZIP_DEF_FEAT_TYPE]; + + cop_feat_caps = &caps->caps; + + if (atomic_inc_return(&cop_feat_caps->nr_used_credits) > + atomic_read(&cop_feat_caps->nr_total_credits)) { + pr_err_ratelimited("Credits are not available to allocate window\n"); + rc = -EINVAL; + goto out; + } + + if (vas_id == -1) { + /* + * The user space is requesting to allocate a window on + * a VAS instance where the process is executing. + * On PowerVM, domain values are passed to the hypervisor + * to select VAS instance. Useful if the process is + * affinity to NUMA node. + * The hypervisor selects VAS instance if + * VAS_DEFAULT_DOMAIN_ID (-1) is passed for domain values. + * The h_allocate_vas_window hcall is defined to take a + * domain values as specified by h_home_node_associativity, + * So no unpacking needs to be done. + */ + rc = plpar_hcall9(H_HOME_NODE_ASSOCIATIVITY, domain, + VPHN_FLAG_VCPU, hard_smp_processor_id()); + if (rc != H_SUCCESS) { + pr_err("H_HOME_NODE_ASSOCIATIVITY error: %d\n", rc); + goto out; + } + } + + txwin->pid = mfspr(SPRN_PID); + + /* + * Allocate / Deallocate window hcalls and setup / free IRQs + * have to be protected with mutex. + * Open VAS window: Allocate window hcall and setup IRQ + * Close VAS window: Deallocate window hcall and free IRQ + * The hypervisor waits until all NX requests are + * completed before closing the window. So expects OS + * to handle NX faults, means IRQ can be freed only + * after the deallocate window hcall is returned. + * So once the window is closed with deallocate hcall before + * the IRQ is freed, it can be assigned to new allocate + * hcall with the same fault IRQ by the hypervisor. It can + * result in setup IRQ fail for the new window since the + * same fault IRQ is not freed by the OS before. + */ + mutex_lock(&vas_pseries_mutex); + if (migration_in_progress) { + rc = -EBUSY; + } else { + rc = allocate_setup_window(txwin, (u64 *)&domain[0], + cop_feat_caps->win_type); + if (!rc) + caps->nr_open_wins_progress++; + } + + mutex_unlock(&vas_pseries_mutex); + if (rc) + goto out; + + /* + * Modify window and it is ready to use. + */ + rc = h_modify_vas_window(txwin); + if (!rc) + rc = get_vas_user_win_ref(&txwin->vas_win.task_ref); + if (rc) + goto out_free; + + txwin->win_type = cop_feat_caps->win_type; + + /* + * The migration SUSPEND thread sets migration_in_progress and + * closes all open windows from the list. But the window is + * added to the list after open and modify HCALLs. So possible + * that migration_in_progress is set before modify HCALL which + * may cause some windows are still open when the hypervisor + * initiates the migration. + * So checks the migration_in_progress flag again and close all + * open windows. + * + * Possible to lose the acquired credit with DLPAR core + * removal after the window is opened. So if there are any + * closed windows (means with lost credits), do not give new + * window to user space. New windows will be opened only + * after the existing windows are reopened when credits are + * available. + */ + mutex_lock(&vas_pseries_mutex); + if (!caps->nr_close_wins && !migration_in_progress) { + list_add(&txwin->win_list, &caps->list); + caps->nr_open_windows++; + caps->nr_open_wins_progress--; + mutex_unlock(&vas_pseries_mutex); + vas_user_win_add_mm_context(&txwin->vas_win.task_ref); + return &txwin->vas_win; + } + mutex_unlock(&vas_pseries_mutex); + + put_vas_user_win_ref(&txwin->vas_win.task_ref); + rc = -EBUSY; + pr_err_ratelimited("No credit is available to allocate window\n"); + +out_free: + /* + * Window is not operational. Free IRQ before closing + * window so that do not have to hold mutex. + */ + free_irq_setup(txwin); + h_deallocate_vas_window(txwin->vas_win.winid); + /* + * Hold mutex and reduce nr_open_wins_progress counter. + */ + mutex_lock(&vas_pseries_mutex); + caps->nr_open_wins_progress--; + mutex_unlock(&vas_pseries_mutex); +out: + atomic_dec(&cop_feat_caps->nr_used_credits); + kfree(txwin); + return ERR_PTR(rc); +} + +static u64 vas_paste_address(struct vas_window *vwin) +{ + struct pseries_vas_window *win; + + win = container_of(vwin, struct pseries_vas_window, vas_win); + return win->win_addr; +} + +static int deallocate_free_window(struct pseries_vas_window *win) +{ + int rc = 0; + + /* + * The hypervisor waits for all requests including faults + * are processed before closing the window - Means all + * credits have to be returned. In the case of fault + * request, a credit is returned after OS issues + * H_GET_NX_FAULT hcall. + * So free IRQ after executing H_DEALLOCATE_VAS_WINDOW + * hcall. + */ + rc = h_deallocate_vas_window(win->vas_win.winid); + if (!rc) + free_irq_setup(win); + + return rc; +} + +static int vas_deallocate_window(struct vas_window *vwin) +{ + struct pseries_vas_window *win; + struct vas_cop_feat_caps *caps; + int rc = 0; + + if (!vwin) + return -EINVAL; + + win = container_of(vwin, struct pseries_vas_window, vas_win); + + /* Should not happen */ + if (win->win_type >= VAS_MAX_FEAT_TYPE) { + pr_err("Window (%u): Invalid window type %u\n", + vwin->winid, win->win_type); + return -EINVAL; + } + + caps = &vascaps[win->win_type].caps; + mutex_lock(&vas_pseries_mutex); + /* + * VAS window is already closed in the hypervisor when + * lost the credit or with migration. So just remove the entry + * from the list, remove task references and free vas_window + * struct. + */ + if (!(win->vas_win.status & VAS_WIN_NO_CRED_CLOSE) && + !(win->vas_win.status & VAS_WIN_MIGRATE_CLOSE)) { + rc = deallocate_free_window(win); + if (rc) { + mutex_unlock(&vas_pseries_mutex); + return rc; + } + } else + vascaps[win->win_type].nr_close_wins--; + + list_del(&win->win_list); + atomic_dec(&caps->nr_used_credits); + vascaps[win->win_type].nr_open_windows--; + mutex_unlock(&vas_pseries_mutex); + + mm_context_remove_vas_window(vwin->task_ref.mm); + put_vas_user_win_ref(&vwin->task_ref); + + kfree(win); + return 0; +} + +static const struct vas_user_win_ops vops_pseries = { + .open_win = vas_allocate_window, /* Open and configure window */ + .paste_addr = vas_paste_address, /* To do copy/paste */ + .close_win = vas_deallocate_window, /* Close window */ +}; + +/* + * Supporting only nx-gzip coprocessor type now, but this API code + * extended to other coprocessor types later. + */ +int vas_register_api_pseries(struct module *mod, enum vas_cop_type cop_type, + const char *name) +{ + if (!copypaste_feat) + return -ENOTSUPP; + + return vas_register_coproc_api(mod, cop_type, name, &vops_pseries); +} +EXPORT_SYMBOL_GPL(vas_register_api_pseries); + +void vas_unregister_api_pseries(void) +{ + vas_unregister_coproc_api(); +} +EXPORT_SYMBOL_GPL(vas_unregister_api_pseries); + +/* + * Get the specific capabilities based on the feature type. + * Right now supports GZIP default and GZIP QoS capabilities. + */ +static int __init get_vas_capabilities(u8 feat, enum vas_cop_feat_type type, + struct hv_vas_cop_feat_caps *hv_caps) +{ + struct vas_cop_feat_caps *caps; + struct vas_caps *vcaps; + int rc = 0; + + vcaps = &vascaps[type]; + memset(vcaps, 0, sizeof(*vcaps)); + INIT_LIST_HEAD(&vcaps->list); + + vcaps->feat = feat; + caps = &vcaps->caps; + + rc = h_query_vas_capabilities(H_QUERY_VAS_CAPABILITIES, feat, + (u64)virt_to_phys(hv_caps)); + if (rc) + return rc; + + caps->user_mode = hv_caps->user_mode; + if (!(caps->user_mode & VAS_COPY_PASTE_USER_MODE)) { + pr_err("User space COPY/PASTE is not supported\n"); + return -ENOTSUPP; + } + + caps->descriptor = be64_to_cpu(hv_caps->descriptor); + caps->win_type = hv_caps->win_type; + if (caps->win_type >= VAS_MAX_FEAT_TYPE) { + pr_err("Unsupported window type %u\n", caps->win_type); + return -EINVAL; + } + caps->max_lpar_creds = be16_to_cpu(hv_caps->max_lpar_creds); + caps->max_win_creds = be16_to_cpu(hv_caps->max_win_creds); + atomic_set(&caps->nr_total_credits, + be16_to_cpu(hv_caps->target_lpar_creds)); + if (feat == VAS_GZIP_DEF_FEAT) { + caps->def_lpar_creds = be16_to_cpu(hv_caps->def_lpar_creds); + + if (caps->max_win_creds < DEF_WIN_CREDS) { + pr_err("Window creds(%u) > max allowed window creds(%u)\n", + DEF_WIN_CREDS, caps->max_win_creds); + return -EINVAL; + } + } + + rc = sysfs_add_vas_caps(caps); + if (rc) + return rc; + + copypaste_feat = true; + + return 0; +} + +/* + * VAS windows can be closed due to lost credits when the core is + * removed. So reopen them if credits are available due to DLPAR + * core add and set the window active status. When NX sees the page + * fault on the unmapped paste address, the kernel handles the fault + * by setting the remapping to new paste address if the window is + * active. + */ +static int reconfig_open_windows(struct vas_caps *vcaps, int creds, + bool migrate) +{ + long domain[PLPAR_HCALL9_BUFSIZE] = {VAS_DEFAULT_DOMAIN_ID}; + struct vas_cop_feat_caps *caps = &vcaps->caps; + struct pseries_vas_window *win = NULL, *tmp; + int rc, mv_ents = 0; + int flag; + + /* + * Nothing to do if there are no closed windows. + */ + if (!vcaps->nr_close_wins) + return 0; + + /* + * For the core removal, the hypervisor reduces the credits + * assigned to the LPAR and the kernel closes VAS windows + * in the hypervisor depends on reduced credits. The kernel + * uses LIFO (the last windows that are opened will be closed + * first) and expects to open in the same order when credits + * are available. + * For example, 40 windows are closed when the LPAR lost 2 cores + * (dedicated). If 1 core is added, this LPAR can have 20 more + * credits. It means the kernel can reopen 20 windows. So move + * 20 entries in the VAS windows lost and reopen next 20 windows. + * For partition migration, reopen all windows that are closed + * during resume. + */ + if ((vcaps->nr_close_wins > creds) && !migrate) + mv_ents = vcaps->nr_close_wins - creds; + + list_for_each_entry_safe(win, tmp, &vcaps->list, win_list) { + if (!mv_ents) + break; + + mv_ents--; + } + + /* + * Open windows if they are closed only with migration or + * DLPAR (lost credit) before. + */ + if (migrate) + flag = VAS_WIN_MIGRATE_CLOSE; + else + flag = VAS_WIN_NO_CRED_CLOSE; + + list_for_each_entry_safe_from(win, tmp, &vcaps->list, win_list) { + /* + * This window is closed with DLPAR and migration events. + * So reopen the window with the last event. + * The user space is not suspended with the current + * migration notifier. So the user space can issue DLPAR + * CPU hotplug while migration in progress. In this case + * this window will be opened with the last event. + */ + if ((win->vas_win.status & VAS_WIN_NO_CRED_CLOSE) && + (win->vas_win.status & VAS_WIN_MIGRATE_CLOSE)) { + win->vas_win.status &= ~flag; + continue; + } + + /* + * Nothing to do on this window if it is not closed + * with this flag + */ + if (!(win->vas_win.status & flag)) + continue; + + rc = allocate_setup_window(win, (u64 *)&domain[0], + caps->win_type); + if (rc) + return rc; + + rc = h_modify_vas_window(win); + if (rc) + goto out; + + mutex_lock(&win->vas_win.task_ref.mmap_mutex); + /* + * Set window status to active + */ + win->vas_win.status &= ~flag; + mutex_unlock(&win->vas_win.task_ref.mmap_mutex); + win->win_type = caps->win_type; + if (!--vcaps->nr_close_wins) + break; + } + + return 0; +out: + /* + * Window modify HCALL failed. So close the window to the + * hypervisor and return. + */ + free_irq_setup(win); + h_deallocate_vas_window(win->vas_win.winid); + return rc; +} + +/* + * The hypervisor reduces the available credits if the LPAR lost core. It + * means the excessive windows should not be active and the user space + * should not be using these windows to send compression requests to NX. + * So the kernel closes the excessive windows and unmap the paste address + * such that the user space receives paste instruction failure. Then up to + * the user space to fall back to SW compression and manage with the + * existing windows. + */ +static int reconfig_close_windows(struct vas_caps *vcap, int excess_creds, + bool migrate) +{ + struct pseries_vas_window *win, *tmp; + struct vas_user_win_ref *task_ref; + struct vm_area_struct *vma; + int rc = 0, flag; + + if (migrate) + flag = VAS_WIN_MIGRATE_CLOSE; + else + flag = VAS_WIN_NO_CRED_CLOSE; + + list_for_each_entry_safe(win, tmp, &vcap->list, win_list) { + /* + * This window is already closed due to lost credit + * or for migration before. Go for next window. + * For migration, nothing to do since this window + * closed for DLPAR and will be reopened even on + * the destination system with other DLPAR operation. + */ + if ((win->vas_win.status & VAS_WIN_MIGRATE_CLOSE) || + (win->vas_win.status & VAS_WIN_NO_CRED_CLOSE)) { + win->vas_win.status |= flag; + continue; + } + + task_ref = &win->vas_win.task_ref; + /* + * VAS mmap (coproc_mmap()) and its fault handler + * (vas_mmap_fault()) are called after holding mmap lock. + * So hold mmap mutex after mmap_lock to avoid deadlock. + */ + mmap_write_lock(task_ref->mm); + mutex_lock(&task_ref->mmap_mutex); + vma = task_ref->vma; + /* + * Number of available credits are reduced, So select + * and close windows. + */ + win->vas_win.status |= flag; + + /* + * vma is set in the original mapping. But this mapping + * is done with mmap() after the window is opened with ioctl. + * so we may not see the original mapping if the core remove + * is done before the original mmap() and after the ioctl. + */ + if (vma) + zap_vma_pages(vma); + + mutex_unlock(&task_ref->mmap_mutex); + mmap_write_unlock(task_ref->mm); + /* + * Close VAS window in the hypervisor, but do not + * free vas_window struct since it may be reused + * when the credit is available later (DLPAR with + * adding cores). This struct will be used + * later when the process issued with close(FD). + */ + rc = deallocate_free_window(win); + /* + * This failure is from the hypervisor. + * No way to stop migration for these failures. + * So ignore error and continue closing other windows. + */ + if (rc && !migrate) + return rc; + + vcap->nr_close_wins++; + + /* + * For migration, do not depend on lpar_creds in case if + * mismatch with the hypervisor value (should not happen). + * So close all active windows in the list and will be + * reopened windows based on the new lpar_creds on the + * destination system during resume. + */ + if (!migrate && !--excess_creds) + break; + } + + return 0; +} + +/* + * Get new VAS capabilities when the core add/removal configuration + * changes. Reconfig window configurations based on the credits + * availability from this new capabilities. + */ +int vas_reconfig_capabilties(u8 type, int new_nr_creds) +{ + struct vas_cop_feat_caps *caps; + int old_nr_creds; + struct vas_caps *vcaps; + int rc = 0, nr_active_wins; + + if (type >= VAS_MAX_FEAT_TYPE) { + pr_err("Invalid credit type %d\n", type); + return -EINVAL; + } + + vcaps = &vascaps[type]; + caps = &vcaps->caps; + + mutex_lock(&vas_pseries_mutex); + + old_nr_creds = atomic_read(&caps->nr_total_credits); + + atomic_set(&caps->nr_total_credits, new_nr_creds); + /* + * The total number of available credits may be decreased or + * increased with DLPAR operation. Means some windows have to be + * closed / reopened. Hold the vas_pseries_mutex so that the + * user space can not open new windows. + */ + if (old_nr_creds < new_nr_creds) { + /* + * If the existing target credits is less than the new + * target, reopen windows if they are closed due to + * the previous DLPAR (core removal). + */ + rc = reconfig_open_windows(vcaps, new_nr_creds - old_nr_creds, + false); + } else { + /* + * # active windows is more than new LPAR available + * credits. So close the excessive windows. + * On pseries, each window will have 1 credit. + */ + nr_active_wins = vcaps->nr_open_windows - vcaps->nr_close_wins; + if (nr_active_wins > new_nr_creds) + rc = reconfig_close_windows(vcaps, + nr_active_wins - new_nr_creds, + false); + } + + mutex_unlock(&vas_pseries_mutex); + return rc; +} + +int pseries_vas_dlpar_cpu(void) +{ + int new_nr_creds, rc; + + /* + * NX-GZIP is not enabled. Nothing to do for DLPAR event + */ + if (!copypaste_feat) + return 0; + + + rc = h_query_vas_capabilities(H_QUERY_VAS_CAPABILITIES, + vascaps[VAS_GZIP_DEF_FEAT_TYPE].feat, + (u64)virt_to_phys(&hv_cop_caps)); + if (!rc) { + new_nr_creds = be16_to_cpu(hv_cop_caps.target_lpar_creds); + rc = vas_reconfig_capabilties(VAS_GZIP_DEF_FEAT_TYPE, new_nr_creds); + } + + if (rc) + pr_err("Failed reconfig VAS capabilities with DLPAR\n"); + + return rc; +} + +/* + * Total number of default credits available (target_credits) + * in LPAR depends on number of cores configured. It varies based on + * whether processors are in shared mode or dedicated mode. + * Get the notifier when CPU configuration is changed with DLPAR + * operation so that get the new target_credits (vas default capabilities) + * and then update the existing windows usage if needed. + */ +static int pseries_vas_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct of_reconfig_data *rd = data; + struct device_node *dn = rd->dn; + const __be32 *intserv = NULL; + int len; + + /* + * For shared CPU partition, the hypervisor assigns total credits + * based on entitled core capacity. So updating VAS windows will + * be called from lparcfg_write(). + */ + if (is_shared_processor()) + return NOTIFY_OK; + + if ((action == OF_RECONFIG_ATTACH_NODE) || + (action == OF_RECONFIG_DETACH_NODE)) + intserv = of_get_property(dn, "ibm,ppc-interrupt-server#s", + &len); + /* + * Processor config is not changed + */ + if (!intserv) + return NOTIFY_OK; + + return pseries_vas_dlpar_cpu(); +} + +static struct notifier_block pseries_vas_nb = { + .notifier_call = pseries_vas_notifier, +}; + +/* + * For LPM, all windows have to be closed on the source partition + * before migration and reopen them on the destination partition + * after migration. So closing windows during suspend and + * reopen them during resume. + */ +int vas_migration_handler(int action) +{ + struct vas_cop_feat_caps *caps; + int old_nr_creds, new_nr_creds = 0; + struct vas_caps *vcaps; + int i, rc = 0; + + pr_info("VAS migration event %d\n", action); + + /* + * NX-GZIP is not enabled. Nothing to do for migration. + */ + if (!copypaste_feat) + return rc; + + if (action == VAS_SUSPEND) + migration_in_progress = true; + else + migration_in_progress = false; + + for (i = 0; i < VAS_MAX_FEAT_TYPE; i++) { + vcaps = &vascaps[i]; + caps = &vcaps->caps; + old_nr_creds = atomic_read(&caps->nr_total_credits); + + rc = h_query_vas_capabilities(H_QUERY_VAS_CAPABILITIES, + vcaps->feat, + (u64)virt_to_phys(&hv_cop_caps)); + if (!rc) { + new_nr_creds = be16_to_cpu(hv_cop_caps.target_lpar_creds); + /* + * Should not happen. But incase print messages, close + * all windows in the list during suspend and reopen + * windows based on new lpar_creds on the destination + * system. + */ + if (old_nr_creds != new_nr_creds) { + pr_err("Target credits mismatch with the hypervisor\n"); + pr_err("state(%d): lpar creds: %d HV lpar creds: %d\n", + action, old_nr_creds, new_nr_creds); + pr_err("Used creds: %d, Active creds: %d\n", + atomic_read(&caps->nr_used_credits), + vcaps->nr_open_windows - vcaps->nr_close_wins); + } + } else { + pr_err("state(%d): Get VAS capabilities failed with %d\n", + action, rc); + /* + * We can not stop migration with the current lpm + * implementation. So continue closing all windows in + * the list (during suspend) and return without + * opening windows (during resume) if VAS capabilities + * HCALL failed. + */ + if (action == VAS_RESUME) + goto out; + } + + switch (action) { + case VAS_SUSPEND: + mutex_lock(&vas_pseries_mutex); + rc = reconfig_close_windows(vcaps, vcaps->nr_open_windows, + true); + /* + * Windows are included in the list after successful + * open. So wait for closing these in-progress open + * windows in vas_allocate_window() which will be + * done if the migration_in_progress is set. + */ + while (vcaps->nr_open_wins_progress) { + mutex_unlock(&vas_pseries_mutex); + msleep(10); + mutex_lock(&vas_pseries_mutex); + } + mutex_unlock(&vas_pseries_mutex); + break; + case VAS_RESUME: + mutex_lock(&vas_pseries_mutex); + atomic_set(&caps->nr_total_credits, new_nr_creds); + rc = reconfig_open_windows(vcaps, new_nr_creds, true); + mutex_unlock(&vas_pseries_mutex); + break; + default: + /* should not happen */ + pr_err("Invalid migration action %d\n", action); + rc = -EINVAL; + goto out; + } + + /* + * Ignore errors during suspend and return for resume. + */ + if (rc && (action == VAS_RESUME)) + goto out; + } + + pr_info("VAS migration event (%d) successful\n", action); + +out: + return rc; +} + +static int __init pseries_vas_init(void) +{ + struct hv_vas_all_caps *hv_caps; + int rc = 0; + + /* + * Linux supports user space COPY/PASTE only with Radix + */ + if (!radix_enabled()) { + copypaste_feat = false; + pr_err("API is supported only with radix page tables\n"); + return -ENOTSUPP; + } + + hv_caps = kmalloc(sizeof(*hv_caps), GFP_KERNEL); + if (!hv_caps) + return -ENOMEM; + /* + * Get VAS overall capabilities by passing 0 to feature type. + */ + rc = h_query_vas_capabilities(H_QUERY_VAS_CAPABILITIES, 0, + (u64)virt_to_phys(hv_caps)); + if (rc) + goto out; + + caps_all.descriptor = be64_to_cpu(hv_caps->descriptor); + caps_all.feat_type = be64_to_cpu(hv_caps->feat_type); + + sysfs_pseries_vas_init(&caps_all); + + /* + * QOS capabilities available + */ + if (caps_all.feat_type & VAS_GZIP_QOS_FEAT_BIT) { + rc = get_vas_capabilities(VAS_GZIP_QOS_FEAT, + VAS_GZIP_QOS_FEAT_TYPE, &hv_cop_caps); + + if (rc) + goto out; + } + /* + * Default capabilities available + */ + if (caps_all.feat_type & VAS_GZIP_DEF_FEAT_BIT) + rc = get_vas_capabilities(VAS_GZIP_DEF_FEAT, + VAS_GZIP_DEF_FEAT_TYPE, &hv_cop_caps); + + if (!rc && copypaste_feat) { + if (firmware_has_feature(FW_FEATURE_LPAR)) + of_reconfig_notifier_register(&pseries_vas_nb); + + pr_info("GZIP feature is available\n"); + } else { + /* + * Should not happen, but only when get default + * capabilities HCALL failed. So disable copy paste + * feature. + */ + copypaste_feat = false; + } + +out: + kfree(hv_caps); + return rc; +} +machine_device_initcall(pseries, pseries_vas_init); diff --git a/arch/powerpc/platforms/pseries/vas.h b/arch/powerpc/platforms/pseries/vas.h new file mode 100644 index 000000000000..45567cd13178 --- /dev/null +++ b/arch/powerpc/platforms/pseries/vas.h @@ -0,0 +1,157 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright 2020-21 IBM Corp. + */ + +#ifndef _VAS_H +#define _VAS_H +#include <asm/vas.h> +#include <linux/mutex.h> +#include <linux/stringify.h> + +/* + * VAS window modify flags + */ +#define VAS_MOD_WIN_CLOSE PPC_BIT(0) +#define VAS_MOD_WIN_JOBS_KILL PPC_BIT(1) +#define VAS_MOD_WIN_DR PPC_BIT(3) +#define VAS_MOD_WIN_PR PPC_BIT(4) +#define VAS_MOD_WIN_SF PPC_BIT(5) +#define VAS_MOD_WIN_TA PPC_BIT(6) +#define VAS_MOD_WIN_FLAGS (VAS_MOD_WIN_JOBS_KILL | VAS_MOD_WIN_DR | \ + VAS_MOD_WIN_PR | VAS_MOD_WIN_SF) + +#define VAS_WIN_ACTIVE 0x0 +#define VAS_WIN_CLOSED 0x1 +#define VAS_WIN_INACTIVE 0x2 /* Inactive due to HW failure */ +/* Process of being modified, deallocated, or quiesced */ +#define VAS_WIN_MOD_IN_PROCESS 0x3 + +#define VAS_COPY_PASTE_USER_MODE 0x00000001 +#define VAS_COP_OP_USER_MODE 0x00000010 + +#define VAS_GZIP_QOS_CAPABILITIES 0x56516F73477A6970 +#define VAS_GZIP_DEFAULT_CAPABILITIES 0x56446566477A6970 + +enum vas_migrate_action { + VAS_SUSPEND, + VAS_RESUME, +}; + +/* + * Co-processor feature - GZIP QoS windows or GZIP default windows + */ +enum vas_cop_feat_type { + VAS_GZIP_QOS_FEAT_TYPE, + VAS_GZIP_DEF_FEAT_TYPE, + VAS_MAX_FEAT_TYPE, +}; + +/* + * Use to get feature specific capabilities from the + * hypervisor. + */ +struct hv_vas_cop_feat_caps { + __be64 descriptor; + u8 win_type; /* Default or QoS type */ + u8 user_mode; + __be16 max_lpar_creds; + __be16 max_win_creds; + union { + __be16 reserved; + __be16 def_lpar_creds; /* Used for default capabilities */ + }; + __be16 target_lpar_creds; +} __packed __aligned(0x1000); + +/* + * Feature specific (QoS or default) capabilities. + */ +struct vas_cop_feat_caps { + u64 descriptor; + u8 win_type; /* Default or QoS type */ + u8 user_mode; /* User mode copy/paste or COP HCALL */ + u16 max_lpar_creds; /* Max credits available in LPAR */ + /* Max credits can be assigned per window */ + u16 max_win_creds; + union { + u16 reserved; /* Used for QoS credit type */ + u16 def_lpar_creds; /* Used for default credit type */ + }; + /* Total LPAR available credits. Can be different from max LPAR */ + /* credits due to DLPAR operation */ + atomic_t nr_total_credits; /* Total credits assigned to LPAR */ + atomic_t nr_used_credits; /* Used credits so far */ +}; + +/* + * Feature (QoS or Default) specific to store capabilities and + * the list of open windows. + */ +struct vas_caps { + struct vas_cop_feat_caps caps; + struct list_head list; /* List of open windows */ + int nr_open_wins_progress; /* Number of open windows in */ + /* progress. Used in migration */ + int nr_close_wins; /* closed windows in the hypervisor for DLPAR */ + int nr_open_windows; /* Number of successful open windows */ + u8 feat; /* Feature type */ +}; + +/* + * To get window information from the hypervisor. + */ +struct hv_vas_win_lpar { + __be16 version; + u8 win_type; + u8 status; + __be16 credits; /* No of credits assigned to this window */ + __be16 reserved; + __be32 pid; /* LPAR Process ID */ + __be32 tid; /* LPAR Thread ID */ + __be64 win_addr; /* Paste address */ + __be32 interrupt; /* Interrupt when NX request completes */ + __be32 fault; /* Interrupt when NX sees fault */ + /* Associativity Domain Identifiers as returned in */ + /* H_HOME_NODE_ASSOCIATIVITY */ + __be64 domain[6]; + __be64 win_util; /* Number of bytes processed */ +} __packed __aligned(0x1000); + +struct pseries_vas_window { + struct vas_window vas_win; + u64 win_addr; /* Physical paste address */ + u8 win_type; /* QoS or Default window */ + u32 complete_irq; /* Completion interrupt */ + u32 fault_irq; /* Fault interrupt */ + u64 domain[6]; /* Associativity domain Ids */ + /* this window is allocated */ + u64 util; + u32 pid; /* PID associated with this window */ + + /* List of windows opened which is used for LPM */ + struct list_head win_list; + u64 flags; + char *name; + int fault_virq; + atomic_t pending_faults; /* Number of pending faults */ +}; + +int sysfs_add_vas_caps(struct vas_cop_feat_caps *caps); +int vas_reconfig_capabilties(u8 type, int new_nr_creds); +int __init sysfs_pseries_vas_init(struct vas_all_caps *vas_caps); + +#ifdef CONFIG_PPC_VAS +int vas_migration_handler(int action); +int pseries_vas_dlpar_cpu(void); +#else +static inline int vas_migration_handler(int action) +{ + return 0; +} +static inline int pseries_vas_dlpar_cpu(void) +{ + return 0; +} +#endif +#endif /* _VAS_H */ diff --git a/arch/powerpc/platforms/pseries/vio.c b/arch/powerpc/platforms/pseries/vio.c index 79e2287991db..90ff85c879bf 100644 --- a/arch/powerpc/platforms/pseries/vio.c +++ b/arch/powerpc/platforms/pseries/vio.c @@ -20,8 +20,10 @@ #include <linux/console.h> #include <linux/export.h> #include <linux/mm.h> -#include <linux/dma-mapping.h> +#include <linux/dma-map-ops.h> #include <linux/kobject.h> +#include <linux/kexec.h> +#include <linux/of_irq.h> #include <asm/iommu.h> #include <asm/dma.h> @@ -31,6 +33,7 @@ #include <asm/tce.h> #include <asm/page.h> #include <asm/hvcall.h> +#include <asm/machdep.h> static struct vio_dev vio_bus_device = { /* fake "parent" device */ .name = "vio", @@ -558,7 +561,8 @@ static int vio_dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist, for_each_sg(sglist, sgl, nelems, count) alloc_size += roundup(sgl->length, IOMMU_PAGE_SIZE(tbl)); - if (vio_cmo_alloc(viodev, alloc_size)) + ret = vio_cmo_alloc(viodev, alloc_size); + if (ret) goto out_fail; ret = ppc_iommu_map_sg(dev, tbl, sglist, nelems, dma_get_mask(dev), direction, attrs); @@ -575,7 +579,7 @@ out_deallocate: vio_cmo_dealloc(viodev, alloc_size); out_fail: atomic_inc(&viodev->cmo.allocs_failed); - return 0; + return ret; } static void vio_dma_iommu_unmap_sg(struct device *dev, @@ -607,6 +611,8 @@ static const struct dma_map_ops vio_dma_mapping_ops = { .get_required_mask = dma_iommu_get_required_mask, .mmap = dma_common_mmap, .get_sgtable = dma_common_get_sgtable, + .alloc_pages = dma_common_alloc_pages, + .free_pages = dma_common_free_pages, }; /** @@ -985,22 +991,10 @@ static DEVICE_ATTR_RO(cmo_allocated); static DEVICE_ATTR_RW(cmo_desired); static DEVICE_ATTR_RW(cmo_allocs_failed); -static struct attribute *vio_cmo_dev_attrs[] = { - &dev_attr_name.attr, - &dev_attr_devspec.attr, - &dev_attr_modalias.attr, - &dev_attr_cmo_entitled.attr, - &dev_attr_cmo_allocated.attr, - &dev_attr_cmo_desired.attr, - &dev_attr_cmo_allocs_failed.attr, - NULL, -}; -ATTRIBUTE_GROUPS(vio_cmo_dev); - /* sysfs bus functions and data structures for CMO */ #define viobus_cmo_rd_attr(name) \ -static ssize_t cmo_bus_##name##_show(struct bus_type *bt, char *buf) \ +static ssize_t cmo_bus_##name##_show(const struct bus_type *bt, char *buf) \ { \ return sprintf(buf, "%lu\n", vio_cmo.name); \ } \ @@ -1009,7 +1003,7 @@ static struct bus_attribute bus_attr_cmo_bus_##name = \ #define viobus_cmo_pool_rd_attr(name, var) \ static ssize_t \ -cmo_##name##_##var##_show(struct bus_type *bt, char *buf) \ +cmo_##name##_##var##_show(const struct bus_type *bt, char *buf) \ { \ return sprintf(buf, "%lu\n", vio_cmo.name.var); \ } \ @@ -1024,12 +1018,12 @@ viobus_cmo_pool_rd_attr(reserve, size); viobus_cmo_pool_rd_attr(excess, size); viobus_cmo_pool_rd_attr(excess, free); -static ssize_t cmo_high_show(struct bus_type *bt, char *buf) +static ssize_t cmo_high_show(const struct bus_type *bt, char *buf) { return sprintf(buf, "%lu\n", vio_cmo.high); } -static ssize_t cmo_high_store(struct bus_type *bt, const char *buf, +static ssize_t cmo_high_store(const struct bus_type *bt, const char *buf, size_t count) { unsigned long flags; @@ -1056,11 +1050,7 @@ static struct attribute *vio_bus_attrs[] = { }; ATTRIBUTE_GROUPS(vio_bus); -static void vio_cmo_sysfs_init(void) -{ - vio_bus_type.dev_groups = vio_cmo_dev_groups; - vio_bus_type.bus_groups = vio_bus_groups; -} +static void __init vio_cmo_sysfs_init(void) { } #else /* CONFIG_PPC_SMLPAR */ int vio_cmo_entitlement_update(size_t new_entitlement) { return 0; } void vio_cmo_set_dev_desired(struct vio_dev *viodev, size_t desired) {} @@ -1068,7 +1058,7 @@ static int vio_cmo_bus_probe(struct vio_dev *viodev) { return 0; } static void vio_cmo_bus_remove(struct vio_dev *viodev) {} static void vio_cmo_set_dma_ops(struct vio_dev *viodev) {} static void vio_cmo_bus_init(void) {} -static void vio_cmo_sysfs_init(void) { } +static void __init vio_cmo_sysfs_init(void) { } #endif /* CONFIG_PPC_SMLPAR */ EXPORT_SYMBOL(vio_cmo_entitlement_update); EXPORT_SYMBOL(vio_cmo_set_dev_desired); @@ -1176,6 +1166,8 @@ static struct iommu_table *vio_build_iommu_table(struct vio_dev *dev) if (tbl == NULL) return NULL; + kref_init(&tbl->it_kref); + of_parse_dma_window(dev->dev.of_node, dma_window, &tbl->it_index, &offset, &size); @@ -1251,12 +1243,11 @@ static int vio_bus_probe(struct device *dev) } /* convert from struct device to struct vio_dev and pass to driver. */ -static int vio_bus_remove(struct device *dev) +static void vio_bus_remove(struct device *dev) { struct vio_dev *viodev = to_vio_dev(dev); struct vio_driver *viodrv = to_vio_driver(dev->driver); struct device *devptr; - int ret = 1; /* * Hold a reference to the device after the remove function is called @@ -1265,13 +1256,26 @@ static int vio_bus_remove(struct device *dev) devptr = get_device(dev); if (viodrv->remove) - ret = viodrv->remove(viodev); + viodrv->remove(viodev); - if (!ret && firmware_has_feature(FW_FEATURE_CMO)) + if (firmware_has_feature(FW_FEATURE_CMO)) vio_cmo_bus_remove(viodev); put_device(devptr); - return ret; +} + +static void vio_bus_shutdown(struct device *dev) +{ + struct vio_dev *viodev = to_vio_dev(dev); + struct vio_driver *viodrv; + + if (dev->driver) { + viodrv = to_vio_driver(dev->driver); + if (viodrv->shutdown) + viodrv->shutdown(viodev); + else if (kexec_in_progress) + vio_bus_remove(dev); + } } /** @@ -1281,6 +1285,10 @@ static int vio_bus_remove(struct device *dev) int __vio_register_driver(struct vio_driver *viodrv, struct module *owner, const char *mod_name) { + // vio_bus_type is only initialised for pseries + if (!machine_is(pseries)) + return -ENODEV; + pr_debug("%s: driver %s registering\n", __func__, viodrv->name); /* fill in 'struct driver' fields */ @@ -1357,7 +1365,7 @@ struct vio_dev *vio_register_device_node(struct device_node *of_node) } if (family == PFO) { - if (of_get_property(of_node, "interrupt-controller", NULL)) { + if (of_property_read_bool(of_node, "interrupt-controller")) { pr_debug("%s: Skipping the interrupt controller %pOFn.\n", __func__, of_node); return NULL; @@ -1416,7 +1424,7 @@ struct vio_dev *vio_register_device_node(struct device_node *of_node) viodev->dev.bus = &vio_bus_type; viodev->dev.release = vio_dev_release; - if (of_get_property(viodev->dev.of_node, "ibm,my-dma-window", NULL)) { + if (of_property_present(viodev->dev.of_node, "ibm,my-dma-window")) { if (firmware_has_feature(FW_FEATURE_CMO)) vio_cmo_set_dma_ops(viodev); else @@ -1456,7 +1464,7 @@ EXPORT_SYMBOL(vio_register_device_node); * Starting from the root node provide, register the device node for * each child beneath the root. */ -static void vio_bus_scan_register_devices(char *root_name) +static void __init vio_bus_scan_register_devices(char *root_name) { struct device_node *node_root, *node_child; @@ -1511,7 +1519,7 @@ static int __init vio_bus_init(void) return 0; } -postcore_initcall(vio_bus_init); +machine_postcore_initcall(pseries, vio_bus_init); static int __init vio_device_init(void) { @@ -1520,7 +1528,7 @@ static int __init vio_device_init(void) return 0; } -device_initcall(vio_device_init); +machine_device_initcall(pseries, vio_device_init); static ssize_t name_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -1560,14 +1568,6 @@ static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, } static DEVICE_ATTR_RO(modalias); -static struct attribute *vio_dev_attrs[] = { - &dev_attr_name.attr, - &dev_attr_devspec.attr, - &dev_attr_modalias.attr, - NULL, -}; -ATTRIBUTE_GROUPS(vio_dev); - void vio_unregister_device(struct vio_dev *viodev) { device_unregister(&viodev->dev); @@ -1585,10 +1585,10 @@ static int vio_bus_match(struct device *dev, struct device_driver *drv) return (ids != NULL) && (vio_match_device(ids, vio_dev) != NULL); } -static int vio_hotplug(struct device *dev, struct kobj_uevent_env *env) +static int vio_hotplug(const struct device *dev, struct kobj_uevent_env *env) { const struct vio_dev *vio_dev = to_vio_dev(dev); - struct device_node *dn; + const struct device_node *dn; const char *cp; dn = dev->of_node; @@ -1602,14 +1602,48 @@ static int vio_hotplug(struct device *dev, struct kobj_uevent_env *env) return 0; } -struct bus_type vio_bus_type = { +#ifdef CONFIG_PPC_SMLPAR +static struct attribute *vio_cmo_dev_attrs[] = { + &dev_attr_name.attr, + &dev_attr_devspec.attr, + &dev_attr_modalias.attr, + &dev_attr_cmo_entitled.attr, + &dev_attr_cmo_allocated.attr, + &dev_attr_cmo_desired.attr, + &dev_attr_cmo_allocs_failed.attr, + NULL, +}; +ATTRIBUTE_GROUPS(vio_cmo_dev); + +const struct bus_type vio_bus_type = { + .name = "vio", + .dev_groups = vio_cmo_dev_groups, + .bus_groups = vio_bus_groups, + .uevent = vio_hotplug, + .match = vio_bus_match, + .probe = vio_bus_probe, + .remove = vio_bus_remove, + .shutdown = vio_bus_shutdown, +}; +#else /* CONFIG_PPC_SMLPAR */ +static struct attribute *vio_dev_attrs[] = { + &dev_attr_name.attr, + &dev_attr_devspec.attr, + &dev_attr_modalias.attr, + NULL, +}; +ATTRIBUTE_GROUPS(vio_dev); + +const struct bus_type vio_bus_type = { .name = "vio", .dev_groups = vio_dev_groups, .uevent = vio_hotplug, .match = vio_bus_match, .probe = vio_bus_probe, .remove = vio_bus_remove, + .shutdown = vio_bus_shutdown, }; +#endif /* CONFIG_PPC_SMLPAR */ /** * vio_get_attribute: - get attribute for virtual device @@ -1626,7 +1660,6 @@ const void *vio_get_attribute(struct vio_dev *vdev, char *which, int *length) } EXPORT_SYMBOL(vio_get_attribute); -#ifdef CONFIG_PPC_PSERIES /* vio_find_name() - internal because only vio.c knows how we formatted the * kobject name */ @@ -1696,11 +1729,10 @@ int vio_disable_interrupts(struct vio_dev *dev) return rc; } EXPORT_SYMBOL(vio_disable_interrupts); -#endif /* CONFIG_PPC_PSERIES */ static int __init vio_init(void) { dma_debug_add_bus(&vio_bus_type); return 0; } -fs_initcall(vio_init); +machine_fs_initcall(pseries, vio_init); diff --git a/arch/powerpc/platforms/pseries/vphn.c b/arch/powerpc/platforms/pseries/vphn.c index 3f07bf6c670e..3f85ece3c872 100644 --- a/arch/powerpc/platforms/pseries/vphn.c +++ b/arch/powerpc/platforms/pseries/vphn.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <asm/byteorder.h> -#include <asm/lppaca.h> +#include <asm/vphn.h> /* * The associativity domain numbers are returned from the hypervisor as a @@ -82,7 +82,8 @@ long hcall_vphn(unsigned long cpu, u64 flags, __be32 *associativity) long retbuf[PLPAR_HCALL9_BUFSIZE] = {0}; rc = plpar_hcall9(H_HOME_NODE_ASSOCIATIVITY, retbuf, flags, cpu); - vphn_unpack_associativity(retbuf, associativity); + if (rc == H_SUCCESS) + vphn_unpack_associativity(retbuf, associativity); return rc; } |