Diffstat (limited to 'arch/powerpc/platforms/pseries'): 35 files changed, 3576 insertions, 676 deletions
diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig index afc0f6a61337..fa3c2fff082a 100644 --- a/arch/powerpc/platforms/pseries/Kconfig +++ b/arch/powerpc/platforms/pseries/Kconfig @@ -23,6 +23,7 @@ config PPC_PSERIES select FORCE_SMP select SWIOTLB select ARCH_SUPPORTS_PER_VMA_LOCK + select PPC_RADIX_BROADCAST_TLBIE if PPC_RADIX_MMU default y config PARAVIRT @@ -128,6 +129,15 @@ config CMM will be reused for other LPARs. The interface allows firmware to balance memory across many LPARs. +config HTMDUMP + tristate "PowerVM data dumper" + depends on PPC_PSERIES && DEBUG_FS + default m + help + Select this option, if you want to enable the kernel debugfs + interface to dump the Hardware Trace Macro (HTM) function data + in the LPAR. + config HV_PERF_CTRS bool "Hypervisor supplied PMU events (24x7 & GPCI)" default y @@ -140,6 +150,20 @@ config HV_PERF_CTRS If unsure, select Y. +config VPA_PMU + tristate "VPA PMU events" + depends on KVM_BOOK3S_64_HV && HV_PERF_CTRS + help + Enable access to the VPA PMU counters via perf. This enables + code that support measurement for KVM on PowerVM(KoP) feature. + PAPR hypervisor has introduced three new counters in the VPA area + of LPAR CPUs for KVM L2 guest observability. Two for context switches + from host to guest and vice versa, and one counter for getting + the total time spent inside the KVM guest. This config enables code + that access these software counters via perf. + + If unsure, Select N. + config IBMVIO depends on PPC_PSERIES bool diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile index f936962a2946..57222678bb3f 100644 --- a/arch/powerpc/platforms/pseries/Makefile +++ b/arch/powerpc/platforms/pseries/Makefile @@ -1,10 +1,10 @@ # SPDX-License-Identifier: GPL-2.0 -ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) ccflags-$(CONFIG_PPC_PSERIES_DEBUG) += -DDEBUG obj-y := lpar.o hvCall.o nvram.o reconfig.o \ of_helpers.o rtas-work-area.o papr-sysparm.o \ - papr-vpd.o \ + papr-rtas-common.o papr-vpd.o papr-indices.o \ + papr-platform-dump.o papr-phy-attest.o \ setup.o iommu.o event_sources.o ras.o \ firmware.o power.o dlpar.o mobility.o rng.o \ pci.o pci_dlpar.o eeh_pseries.o msi.o \ @@ -20,6 +20,7 @@ obj-$(CONFIG_HVC_CONSOLE) += hvconsole.o obj-$(CONFIG_HVCS) += hvcserver.o obj-$(CONFIG_HCALL_STATS) += hvCall_inst.o obj-$(CONFIG_CMM) += cmm.o +obj-$(CONFIG_HTMDUMP) += htmdump.o obj-$(CONFIG_IO_EVENT_IRQ) += io_event_irq.o obj-$(CONFIG_LPARCFG) += lparcfg.o obj-$(CONFIG_IBMVIO) += vio.o diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c index 47f8eabd1bee..213aa26dc8b3 100644 --- a/arch/powerpc/platforms/pseries/dlpar.c +++ b/arch/powerpc/platforms/pseries/dlpar.c @@ -23,6 +23,7 @@ #include <linux/uaccess.h> #include <asm/rtas.h> #include <asm/rtas-work-area.h> +#include <asm/prom.h> static struct workqueue_struct *pseries_hp_wq; @@ -250,11 +251,8 @@ int dlpar_detach_node(struct device_node *dn) struct device_node *child; int rc; - child = of_get_next_child(dn, NULL); - while (child) { + for_each_child_of_node(dn, child) dlpar_detach_node(child); - child = of_get_next_child(dn, child); - } rc = of_detach_node(dn); if (rc) @@ -264,6 +262,20 @@ int dlpar_detach_node(struct device_node *dn) return 0; } +static int dlpar_changeset_attach_cc_nodes(struct of_changeset *ocs, + struct device_node *dn) +{ + int rc; + + rc = of_changeset_attach_node(ocs, dn); + + if (!rc && dn->child) + rc = dlpar_changeset_attach_cc_nodes(ocs, 
dn->child); + if (!rc && dn->sibling) + rc = dlpar_changeset_attach_cc_nodes(ocs, dn->sibling); + + return rc; +} #define DR_ENTITY_SENSE 9003 #define DR_ENTITY_PRESENT 1 @@ -330,27 +342,206 @@ int dlpar_unisolate_drc(u32 drc_index) return 0; } -int handle_dlpar_errorlog(struct pseries_hp_errorlog *hp_elog) +static struct device_node * +get_device_node_with_drc_index(u32 index) +{ + struct device_node *np = NULL; + u32 node_index; + int rc; + + for_each_node_with_property(np, "ibm,my-drc-index") { + rc = of_property_read_u32(np, "ibm,my-drc-index", + &node_index); + if (rc) { + pr_err("%s: %pOF: of_property_read_u32 %s: %d\n", + __func__, np, "ibm,my-drc-index", rc); + of_node_put(np); + return NULL; + } + + if (index == node_index) + break; + } + + return np; +} + +static struct device_node * +get_device_node_with_drc_info(u32 index) +{ + struct device_node *np = NULL; + struct of_drc_info drc; + struct property *info; + const __be32 *value; + u32 node_index; + int i, j, count; + + for_each_node_with_property(np, "ibm,drc-info") { + info = of_find_property(np, "ibm,drc-info", NULL); + if (info == NULL) { + /* XXX can this happen? */ + of_node_put(np); + return NULL; + } + value = of_prop_next_u32(info, NULL, &count); + if (value == NULL) + continue; + value++; + for (i = 0; i < count; i++) { + if (of_read_drc_info_cell(&info, &value, &drc)) + break; + if (index > drc.last_drc_index) + continue; + node_index = drc.drc_index_start; + for (j = 0; j < drc.num_sequential_elems; j++) { + if (index == node_index) + return np; + node_index += drc.sequential_inc; + } + } + } + + return NULL; +} + +static int dlpar_hp_dt_add(u32 index) +{ + struct device_node *np, *nodes; + struct of_changeset ocs; + int rc; + + /* + * Do not add device node(s) if already exists in the + * device tree. + */ + np = get_device_node_with_drc_index(index); + if (np) { + pr_err("%s: Adding device node for index (%d), but " + "already exists in the device tree\n", + __func__, index); + rc = -EINVAL; + goto out; + } + + np = get_device_node_with_drc_info(index); + + if (!np) + return -EIO; + + /* Next, configure the connector. */ + nodes = dlpar_configure_connector(cpu_to_be32(index), np); + if (!nodes) { + rc = -EIO; + goto out; + } + + /* + * Add the new nodes from dlpar_configure_connector() onto + * the device-tree. + */ + of_changeset_init(&ocs); + rc = dlpar_changeset_attach_cc_nodes(&ocs, nodes); + + if (!rc) + rc = of_changeset_apply(&ocs); + else + dlpar_free_cc_nodes(nodes); + + of_changeset_destroy(&ocs); + +out: + of_node_put(np); + return rc; +} + +static int changeset_detach_node_recursive(struct of_changeset *ocs, + struct device_node *node) { + struct device_node *child; int rc; - /* pseries error logs are in BE format, convert to cpu type */ - switch (hp_elog->id_type) { - case PSERIES_HP_ELOG_ID_DRC_COUNT: - hp_elog->_drc_u.drc_count = - be32_to_cpu(hp_elog->_drc_u.drc_count); + for_each_child_of_node(node, child) { + rc = changeset_detach_node_recursive(ocs, child); + if (rc) { + of_node_put(child); + return rc; + } + } + + return of_changeset_detach_node(ocs, node); +} + +static int dlpar_hp_dt_remove(u32 drc_index) +{ + struct device_node *np; + struct of_changeset ocs; + u32 index; + int rc = 0; + + /* + * Prune all nodes with a matching index. 
+ */ + of_changeset_init(&ocs); + + for_each_node_with_property(np, "ibm,my-drc-index") { + rc = of_property_read_u32(np, "ibm,my-drc-index", &index); + if (rc) { + pr_err("%s: %pOF: of_property_read_u32 %s: %d\n", + __func__, np, "ibm,my-drc-index", rc); + of_node_put(np); + goto out; + } + + if (index == drc_index) { + rc = changeset_detach_node_recursive(&ocs, np); + if (rc) { + of_node_put(np); + goto out; + } + } + } + + rc = of_changeset_apply(&ocs); + +out: + of_changeset_destroy(&ocs); + return rc; +} + +static int dlpar_hp_dt(struct pseries_hp_errorlog *phpe) +{ + u32 drc_index; + int rc; + + if (phpe->id_type != PSERIES_HP_ELOG_ID_DRC_INDEX) + return -EINVAL; + + drc_index = be32_to_cpu(phpe->_drc_u.drc_index); + + lock_device_hotplug(); + + switch (phpe->action) { + case PSERIES_HP_ELOG_ACTION_ADD: + rc = dlpar_hp_dt_add(drc_index); break; - case PSERIES_HP_ELOG_ID_DRC_INDEX: - hp_elog->_drc_u.drc_index = - be32_to_cpu(hp_elog->_drc_u.drc_index); + case PSERIES_HP_ELOG_ACTION_REMOVE: + rc = dlpar_hp_dt_remove(drc_index); + break; + default: + pr_err("Invalid action (%d) specified\n", phpe->action); + rc = -EINVAL; break; - case PSERIES_HP_ELOG_ID_DRC_IC: - hp_elog->_drc_u.ic.count = - be32_to_cpu(hp_elog->_drc_u.ic.count); - hp_elog->_drc_u.ic.index = - be32_to_cpu(hp_elog->_drc_u.ic.index); } + unlock_device_hotplug(); + + return rc; +} + +int handle_dlpar_errorlog(struct pseries_hp_errorlog *hp_elog) +{ + int rc; + switch (hp_elog->resource) { case PSERIES_HP_ELOG_RESOURCE_MEM: rc = dlpar_memory(hp_elog); @@ -361,6 +552,9 @@ int handle_dlpar_errorlog(struct pseries_hp_errorlog *hp_elog) case PSERIES_HP_ELOG_RESOURCE_PMEM: rc = dlpar_hp_pmem(hp_elog); break; + case PSERIES_HP_ELOG_RESOURCE_DT: + rc = dlpar_hp_dt(hp_elog); + break; default: pr_warn_ratelimited("Invalid resource (%d) specified\n", @@ -413,6 +607,8 @@ static int dlpar_parse_resource(char **cmd, struct pseries_hp_errorlog *hp_elog) hp_elog->resource = PSERIES_HP_ELOG_RESOURCE_MEM; } else if (sysfs_streq(arg, "cpu")) { hp_elog->resource = PSERIES_HP_ELOG_RESOURCE_CPU; + } else if (sysfs_streq(arg, "dt")) { + hp_elog->resource = PSERIES_HP_ELOG_RESOURCE_DT; } else { pr_err("Invalid resource specified.\n"); return -EINVAL; @@ -554,7 +750,7 @@ dlpar_store_out: static ssize_t dlpar_show(const struct class *class, const struct class_attribute *attr, char *buf) { - return sprintf(buf, "%s\n", "memory,cpu"); + return sprintf(buf, "%s\n", "memory,cpu,dt"); } static CLASS_ATTR_RW(dlpar); diff --git a/arch/powerpc/platforms/pseries/dtl.c b/arch/powerpc/platforms/pseries/dtl.c index 3f1cdccebc9c..f293588b8c7b 100644 --- a/arch/powerpc/platforms/pseries/dtl.c +++ b/arch/powerpc/platforms/pseries/dtl.c @@ -191,7 +191,7 @@ static int dtl_enable(struct dtl *dtl) return -EBUSY; /* ensure there are no other conflicting dtl users */ - if (!read_trylock(&dtl_access_lock)) + if (!down_read_trylock(&dtl_access_lock)) return -EBUSY; n_entries = dtl_buf_entries; @@ -199,7 +199,7 @@ static int dtl_enable(struct dtl *dtl) if (!buf) { printk(KERN_WARNING "%s: buffer alloc failed for cpu %d\n", __func__, dtl->cpu); - read_unlock(&dtl_access_lock); + up_read(&dtl_access_lock); return -ENOMEM; } @@ -217,7 +217,7 @@ static int dtl_enable(struct dtl *dtl) spin_unlock(&dtl->lock); if (rc) { - read_unlock(&dtl_access_lock); + up_read(&dtl_access_lock); kmem_cache_free(dtl_cache, buf); } @@ -232,7 +232,7 @@ static void dtl_disable(struct dtl *dtl) dtl->buf = NULL; dtl->buf_entries = 0; spin_unlock(&dtl->lock); - read_unlock(&dtl_access_lock); + 
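(Illustrative aside, not part of the diff.) The PSERIES_HP_ELOG_RESOURCE_DT handling added above is driven through the same dlpar sysfs store that already handles "memory" and "cpu", so a device-tree add or remove is just a command string naming the new "dt" resource and a DRC index. A minimal userspace sketch, assuming the interface is exposed at /sys/kernel/dlpar as for the existing commands; the DRC index is a placeholder:

/*
 * Illustrative only -- not part of the kernel diff above.
 * Issues device-tree DLPAR requests through the dlpar sysfs store,
 * assumed here to live at /sys/kernel/dlpar.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int dlpar_request(const char *cmd)
{
	int fd = open("/sys/kernel/dlpar", O_WRONLY);
	ssize_t n;

	if (fd < 0) {
		perror("open /sys/kernel/dlpar");
		return -1;
	}
	n = write(fd, cmd, strlen(cmd));
	if (n < 0)
		perror("dlpar write");
	close(fd);
	return n < 0 ? -1 : 0;
}

int main(void)
{
	/* Placeholder DRC index; real values come from the DRC properties. */
	dlpar_request("dt add index 0x10000001");
	dlpar_request("dt remove index 0x10000001");
	return 0;
}

The command grammar ("<resource> <action> index <n>") is the one parsed by dlpar_store()/dlpar_parse_resource(); only the "dt" keyword and the PSERIES_HP_ELOG_RESOURCE_DT dispatch are new here.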
up_read(&dtl_access_lock); } /* file interface */ @@ -325,7 +325,6 @@ static const struct file_operations dtl_fops = { .open = dtl_file_open, .release = dtl_file_release, .read = dtl_file_read, - .llseek = no_llseek, }; static struct dentry *dtl_dir; diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c index b1ae0c0d1187..b12ef382fec7 100644 --- a/arch/powerpc/platforms/pseries/eeh_pseries.c +++ b/arch/powerpc/platforms/pseries/eeh_pseries.c @@ -580,8 +580,10 @@ static int pseries_eeh_get_state(struct eeh_pe *pe, int *delay) switch(rets[0]) { case 0: - result = EEH_STATE_MMIO_ACTIVE | - EEH_STATE_DMA_ACTIVE; + result = EEH_STATE_MMIO_ACTIVE | + EEH_STATE_DMA_ACTIVE | + EEH_STATE_MMIO_ENABLED | + EEH_STATE_DMA_ENABLED; break; case 1: result = EEH_STATE_RESET_ACTIVE | @@ -784,6 +786,43 @@ static int pseries_notify_resume(struct eeh_dev *edev) } #endif +/** + * pseries_eeh_err_inject - Inject specified error to the indicated PE + * @pe: the indicated PE + * @type: error type + * @func: specific error type + * @addr: address + * @mask: address mask + * The routine is called to inject specified error, which is + * determined by @type and @func, to the indicated PE + */ +static int pseries_eeh_err_inject(struct eeh_pe *pe, int type, int func, + unsigned long addr, unsigned long mask) +{ + struct eeh_dev *pdev; + + /* Check on PCI error type */ + if (type != EEH_ERR_TYPE_32 && type != EEH_ERR_TYPE_64) + return -EINVAL; + + switch (func) { + case EEH_ERR_FUNC_LD_MEM_ADDR: + case EEH_ERR_FUNC_LD_MEM_DATA: + case EEH_ERR_FUNC_ST_MEM_ADDR: + case EEH_ERR_FUNC_ST_MEM_DATA: + /* injects a MMIO error for all pdev's belonging to PE */ + pci_lock_rescan_remove(); + list_for_each_entry(pdev, &pe->edevs, entry) + eeh_pe_inject_mmio_error(pdev->pdev); + pci_unlock_rescan_remove(); + break; + default: + return -ERANGE; + } + + return 0; +} + static struct eeh_ops pseries_eeh_ops = { .name = "pseries", .probe = pseries_eeh_probe, @@ -792,7 +831,7 @@ static struct eeh_ops pseries_eeh_ops = { .reset = pseries_eeh_reset, .get_log = pseries_eeh_get_log, .configure_bridge = pseries_eeh_configure_bridge, - .err_inject = NULL, + .err_inject = pseries_eeh_err_inject, .read_config = pseries_eeh_read_config, .write_config = pseries_eeh_write_config, .next_error = NULL, diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c index e62835a12d73..bc6926dbf148 100644 --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -33,6 +33,7 @@ #include <asm/xive.h> #include <asm/plpar_wrappers.h> #include <asm/topology.h> +#include <asm/systemcfg.h> #include "pseries.h" @@ -83,7 +84,9 @@ static int pseries_cpu_disable(void) int cpu = smp_processor_id(); set_cpu_online(cpu, false); - vdso_data->processorCount--; +#ifdef CONFIG_PPC64_PROC_SYSTEMCFG + systemcfg->processorCount--; +#endif /*fix boot_cpuid here*/ if (cpu == boot_cpuid) @@ -757,7 +760,7 @@ int dlpar_cpu(struct pseries_hp_errorlog *hp_elog) u32 drc_index; int rc; - drc_index = hp_elog->_drc_u.drc_index; + drc_index = be32_to_cpu(hp_elog->_drc_u.drc_index); lock_device_hotplug(); diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index 3fe3ddb30c04..38dc4f7c9296 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -817,16 +817,16 @@ int dlpar_memory(struct pseries_hp_errorlog *hp_elog) case 
PSERIES_HP_ELOG_ACTION_ADD: switch (hp_elog->id_type) { case PSERIES_HP_ELOG_ID_DRC_COUNT: - count = hp_elog->_drc_u.drc_count; + count = be32_to_cpu(hp_elog->_drc_u.drc_count); rc = dlpar_memory_add_by_count(count); break; case PSERIES_HP_ELOG_ID_DRC_INDEX: - drc_index = hp_elog->_drc_u.drc_index; + drc_index = be32_to_cpu(hp_elog->_drc_u.drc_index); rc = dlpar_memory_add_by_index(drc_index); break; case PSERIES_HP_ELOG_ID_DRC_IC: - count = hp_elog->_drc_u.ic.count; - drc_index = hp_elog->_drc_u.ic.index; + count = be32_to_cpu(hp_elog->_drc_u.ic.count); + drc_index = be32_to_cpu(hp_elog->_drc_u.ic.index); rc = dlpar_memory_add_by_ic(count, drc_index); break; default: @@ -838,16 +838,16 @@ int dlpar_memory(struct pseries_hp_errorlog *hp_elog) case PSERIES_HP_ELOG_ACTION_REMOVE: switch (hp_elog->id_type) { case PSERIES_HP_ELOG_ID_DRC_COUNT: - count = hp_elog->_drc_u.drc_count; + count = be32_to_cpu(hp_elog->_drc_u.drc_count); rc = dlpar_memory_remove_by_count(count); break; case PSERIES_HP_ELOG_ID_DRC_INDEX: - drc_index = hp_elog->_drc_u.drc_index; + drc_index = be32_to_cpu(hp_elog->_drc_u.drc_index); rc = dlpar_memory_remove_by_index(drc_index); break; case PSERIES_HP_ELOG_ID_DRC_IC: - count = hp_elog->_drc_u.ic.count; - drc_index = hp_elog->_drc_u.ic.index; + count = be32_to_cpu(hp_elog->_drc_u.ic.count); + drc_index = be32_to_cpu(hp_elog->_drc_u.ic.index); rc = dlpar_memory_remove_by_ic(count, drc_index); break; default: diff --git a/arch/powerpc/platforms/pseries/htmdump.c b/arch/powerpc/platforms/pseries/htmdump.c new file mode 100644 index 000000000000..742ec52c9d4d --- /dev/null +++ b/arch/powerpc/platforms/pseries/htmdump.c @@ -0,0 +1,490 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) IBM Corporation, 2024 + */ + +#define pr_fmt(fmt) "htmdump: " fmt + +#include <linux/debugfs.h> +#include <linux/module.h> +#include <asm/io.h> +#include <asm/machdep.h> +#include <asm/plpar_wrappers.h> +#include <asm/kvm_guest.h> + +static void *htm_buf; +static void *htm_status_buf; +static void *htm_info_buf; +static void *htm_caps_buf; +static u32 nodeindex; +static u32 nodalchipindex; +static u32 coreindexonchip; +static u32 htmtype; +static u32 htmconfigure; +static u32 htmstart; +static u32 htmsetup; +static u64 htmflags; + +static struct dentry *htmdump_debugfs_dir; +#define HTM_ENABLE 1 +#define HTM_DISABLE 0 +#define HTM_NOWRAP 1 +#define HTM_WRAP 0 + +/* + * Check the return code for H_HTM hcall. + * Return non-zero value (1) if either H_PARTIAL or H_SUCCESS + * is returned. For other return codes: + * Return zero if H_NOT_AVAILABLE. + * Return -EBUSY if hcall return busy. + * Return -EINVAL if any parameter or operation is not valid. + * Return -EPERM if HTM Virtualization Engine Technology code + * is not applied. + * Return -EIO if the HTM state is not valid. + */ +static ssize_t htm_return_check(long rc) +{ + switch (rc) { + case H_SUCCESS: + /* H_PARTIAL for the case where all available data can't be + * returned due to buffer size constraint. + */ + case H_PARTIAL: + break; + /* H_NOT_AVAILABLE indicates reading from an offset outside the range, + * i.e. past end of file. 
+ */ + case H_NOT_AVAILABLE: + return 0; + case H_BUSY: + case H_LONG_BUSY_ORDER_1_MSEC: + case H_LONG_BUSY_ORDER_10_MSEC: + case H_LONG_BUSY_ORDER_100_MSEC: + case H_LONG_BUSY_ORDER_1_SEC: + case H_LONG_BUSY_ORDER_10_SEC: + case H_LONG_BUSY_ORDER_100_SEC: + return -EBUSY; + case H_PARAMETER: + case H_P2: + case H_P3: + case H_P4: + case H_P5: + case H_P6: + return -EINVAL; + case H_STATE: + return -EIO; + case H_AUTHORITY: + return -EPERM; + } + + /* + * Return 1 for H_SUCCESS/H_PARTIAL + */ + return 1; +} + +static ssize_t htmdump_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos) +{ + void *htm_buf = filp->private_data; + unsigned long page, read_size, available; + loff_t offset; + long rc, ret; + + page = ALIGN_DOWN(*ppos, PAGE_SIZE); + offset = (*ppos) % PAGE_SIZE; + + /* + * Invoke H_HTM call with: + * - operation as htm dump (H_HTM_OP_DUMP_DATA) + * - last three values are address, size and offset + */ + rc = htm_hcall_wrapper(htmflags, nodeindex, nodalchipindex, coreindexonchip, + htmtype, H_HTM_OP_DUMP_DATA, virt_to_phys(htm_buf), + PAGE_SIZE, page); + + ret = htm_return_check(rc); + if (ret <= 0) { + pr_debug("H_HTM hcall failed for op: H_HTM_OP_DUMP_DATA, returning %ld\n", ret); + return ret; + } + + available = PAGE_SIZE; + read_size = min(count, available); + *ppos += read_size; + return simple_read_from_buffer(ubuf, count, &offset, htm_buf, available); +} + +static const struct file_operations htmdump_fops = { + .llseek = NULL, + .read = htmdump_read, + .open = simple_open, +}; + +static int htmconfigure_set(void *data, u64 val) +{ + long rc, ret; + unsigned long param1 = -1, param2 = -1; + + /* + * value as 1 : configure HTM. + * value as 0 : deconfigure HTM. Return -EINVAL for + * other values. + */ + if (val == HTM_ENABLE) { + /* + * Invoke H_HTM call with: + * - operation as htm configure (H_HTM_OP_CONFIGURE) + * - If htmflags is set, param1 and param2 will be -1 + * which is an indicator to use default htm mode reg mask + * and htm mode reg value. + * - last three values are unused, hence set to zero + */ + if (!htmflags) { + param1 = 0; + param2 = 0; + } + + rc = htm_hcall_wrapper(htmflags, nodeindex, nodalchipindex, coreindexonchip, + htmtype, H_HTM_OP_CONFIGURE, param1, param2, 0); + } else if (val == HTM_DISABLE) { + /* + * Invoke H_HTM call with: + * - operation as htm deconfigure (H_HTM_OP_DECONFIGURE) + * - last three values are unused, hence set to zero + */ + rc = htm_hcall_wrapper(htmflags, nodeindex, nodalchipindex, coreindexonchip, + htmtype, H_HTM_OP_DECONFIGURE, 0, 0, 0); + } else + return -EINVAL; + + ret = htm_return_check(rc); + if (ret <= 0) { + pr_debug("H_HTM hcall failed, returning %ld\n", ret); + return ret; + } + + /* Set htmconfigure if operation succeeds */ + htmconfigure = val; + + return 0; +} + +static int htmconfigure_get(void *data, u64 *val) +{ + *val = htmconfigure; + return 0; +} + +static int htmstart_set(void *data, u64 val) +{ + long rc, ret; + + /* + * value as 1: start HTM + * value as 0: stop HTM + * Return -EINVAL for other values. 
+ */ + if (val == HTM_ENABLE) { + /* + * Invoke H_HTM call with: + * - operation as htm start (H_HTM_OP_START) + * - last three values are unused, hence set to zero + */ + rc = htm_hcall_wrapper(htmflags, nodeindex, nodalchipindex, coreindexonchip, + htmtype, H_HTM_OP_START, 0, 0, 0); + + } else if (val == HTM_DISABLE) { + /* + * Invoke H_HTM call with: + * - operation as htm stop (H_HTM_OP_STOP) + * - last three values are unused, hence set to zero + */ + rc = htm_hcall_wrapper(htmflags, nodeindex, nodalchipindex, coreindexonchip, + htmtype, H_HTM_OP_STOP, 0, 0, 0); + } else + return -EINVAL; + + ret = htm_return_check(rc); + if (ret <= 0) { + pr_debug("H_HTM hcall failed, returning %ld\n", ret); + return ret; + } + + /* Set htmstart if H_HTM_OP_START/H_HTM_OP_STOP operation succeeds */ + htmstart = val; + + return 0; +} + +static int htmstart_get(void *data, u64 *val) +{ + *val = htmstart; + return 0; +} + +static ssize_t htmstatus_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos) +{ + void *htm_status_buf = filp->private_data; + long rc, ret; + u64 *num_entries; + u64 to_copy; + int htmstatus_flag; + + /* + * Invoke H_HTM call with: + * - operation as htm status (H_HTM_OP_STATUS) + * - last three values as addr, size and offset + */ + rc = htm_hcall_wrapper(htmflags, nodeindex, nodalchipindex, coreindexonchip, + htmtype, H_HTM_OP_STATUS, virt_to_phys(htm_status_buf), + PAGE_SIZE, 0); + + ret = htm_return_check(rc); + if (ret <= 0) { + pr_debug("H_HTM hcall failed for op: H_HTM_OP_STATUS, returning %ld\n", ret); + return ret; + } + + /* + * HTM status buffer, start of buffer + 0x10 gives the + * number of HTM entries in the buffer. Each nest htm status + * entry is 0x6 bytes where each core htm status entry is + * 0x8 bytes. + * So total count to copy is: + * 32 bytes (for first 7 fields) + (number of HTM entries * entry size) + */ + num_entries = htm_status_buf + 0x10; + if (htmtype == 0x2) + htmstatus_flag = 0x8; + else + htmstatus_flag = 0x6; + to_copy = 32 + (be64_to_cpu(*num_entries) * htmstatus_flag); + return simple_read_from_buffer(ubuf, count, ppos, htm_status_buf, to_copy); +} + +static const struct file_operations htmstatus_fops = { + .llseek = NULL, + .read = htmstatus_read, + .open = simple_open, +}; + +static ssize_t htminfo_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos) +{ + void *htm_info_buf = filp->private_data; + long rc, ret; + u64 *num_entries; + u64 to_copy; + + /* + * Invoke H_HTM call with: + * - operation as htm status (H_HTM_OP_STATUS) + * - last three values as addr, size and offset + */ + rc = htm_hcall_wrapper(htmflags, nodeindex, nodalchipindex, coreindexonchip, + htmtype, H_HTM_OP_DUMP_SYSPROC_CONF, virt_to_phys(htm_info_buf), + PAGE_SIZE, 0); + + ret = htm_return_check(rc); + if (ret <= 0) { + pr_debug("H_HTM hcall failed for op: H_HTM_OP_DUMP_SYSPROC_CONF, returning %ld\n", ret); + return ret; + } + + /* + * HTM status buffer, start of buffer + 0x10 gives the + * number of HTM entries in the buffer. Each entry of processor + * is 16 bytes. 
+ * + * So total count to copy is: + * 32 bytes (for first 5 fields) + (number of HTM entries * entry size) + */ + num_entries = htm_info_buf + 0x10; + to_copy = 32 + (be64_to_cpu(*num_entries) * 16); + return simple_read_from_buffer(ubuf, count, ppos, htm_info_buf, to_copy); +} + +static ssize_t htmcaps_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos) +{ + void *htm_caps_buf = filp->private_data; + long rc, ret; + + /* + * Invoke H_HTM call with: + * - operation as htm capabilities (H_HTM_OP_CAPABILITIES) + * - last three values as addr, size (0x80 for Capabilities Output Buffer + * and zero + */ + rc = htm_hcall_wrapper(htmflags, nodeindex, nodalchipindex, coreindexonchip, + htmtype, H_HTM_OP_CAPABILITIES, virt_to_phys(htm_caps_buf), + 0x80, 0); + + ret = htm_return_check(rc); + if (ret <= 0) { + pr_debug("H_HTM hcall failed for op: H_HTM_OP_CAPABILITIES, returning %ld\n", ret); + return ret; + } + + return simple_read_from_buffer(ubuf, count, ppos, htm_caps_buf, 0x80); +} + +static const struct file_operations htminfo_fops = { + .llseek = NULL, + .read = htminfo_read, + .open = simple_open, +}; + +static const struct file_operations htmcaps_fops = { + .llseek = NULL, + .read = htmcaps_read, + .open = simple_open, +}; + +static int htmsetup_set(void *data, u64 val) +{ + long rc, ret; + + /* + * Input value: HTM buffer size in the power of 2 + * example: hex value 0x21 ( decimal: 33 ) is for + * 8GB + * Invoke H_HTM call with: + * - operation as htm start (H_HTM_OP_SETUP) + * - parameter 1 set to input value. + * - last two values are unused, hence set to zero + */ + rc = htm_hcall_wrapper(htmflags, nodeindex, nodalchipindex, coreindexonchip, + htmtype, H_HTM_OP_SETUP, val, 0, 0); + + ret = htm_return_check(rc); + if (ret <= 0) { + pr_debug("H_HTM hcall failed for op: H_HTM_OP_SETUP, returning %ld\n", ret); + return ret; + } + + /* Set htmsetup if H_HTM_OP_SETUP operation succeeds */ + htmsetup = val; + + return 0; +} + +static int htmsetup_get(void *data, u64 *val) +{ + *val = htmsetup; + return 0; +} + +static int htmflags_set(void *data, u64 val) +{ + /* + * Input value: + * Currently supported flag value is to enable/disable + * HTM buffer wrap. wrap is used along with "configure" + * to prevent HTM buffer from wrapping. 
+ * Writing 1 will set noWrap while configuring HTM + */ + if (val == HTM_NOWRAP) + htmflags = H_HTM_FLAGS_NOWRAP; + else if (val == HTM_WRAP) + htmflags = 0; + else + return -EINVAL; + + return 0; +} + +static int htmflags_get(void *data, u64 *val) +{ + *val = htmflags; + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(htmconfigure_fops, htmconfigure_get, htmconfigure_set, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(htmstart_fops, htmstart_get, htmstart_set, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(htmsetup_fops, htmsetup_get, htmsetup_set, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(htmflags_fops, htmflags_get, htmflags_set, "%llu\n"); + +static int htmdump_init_debugfs(void) +{ + htm_buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!htm_buf) { + pr_err("Failed to allocate htmdump buf\n"); + return -ENOMEM; + } + + htmdump_debugfs_dir = debugfs_create_dir("htmdump", + arch_debugfs_dir); + + debugfs_create_u32("nodeindex", 0600, + htmdump_debugfs_dir, &nodeindex); + debugfs_create_u32("nodalchipindex", 0600, + htmdump_debugfs_dir, &nodalchipindex); + debugfs_create_u32("coreindexonchip", 0600, + htmdump_debugfs_dir, &coreindexonchip); + debugfs_create_u32("htmtype", 0600, + htmdump_debugfs_dir, &htmtype); + debugfs_create_file("trace", 0400, htmdump_debugfs_dir, htm_buf, &htmdump_fops); + + /* + * Debugfs interface files to control HTM operations: + */ + debugfs_create_file("htmconfigure", 0600, htmdump_debugfs_dir, NULL, &htmconfigure_fops); + debugfs_create_file("htmstart", 0600, htmdump_debugfs_dir, NULL, &htmstart_fops); + debugfs_create_file("htmsetup", 0600, htmdump_debugfs_dir, NULL, &htmsetup_fops); + debugfs_create_file("htmflags", 0600, htmdump_debugfs_dir, NULL, &htmflags_fops); + + /* Debugfs interface file to present status of HTM */ + htm_status_buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!htm_status_buf) { + pr_err("Failed to allocate htmstatus buf\n"); + return -ENOMEM; + } + + /* Debugfs interface file to present System Processor Configuration */ + htm_info_buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!htm_info_buf) { + pr_err("Failed to allocate htm info buf\n"); + return -ENOMEM; + } + + /* Debugfs interface file to present HTM capabilities */ + htm_caps_buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!htm_caps_buf) { + pr_err("Failed to allocate htm caps buf\n"); + return -ENOMEM; + } + + debugfs_create_file("htmstatus", 0400, htmdump_debugfs_dir, htm_status_buf, &htmstatus_fops); + debugfs_create_file("htminfo", 0400, htmdump_debugfs_dir, htm_info_buf, &htminfo_fops); + debugfs_create_file("htmcaps", 0400, htmdump_debugfs_dir, htm_caps_buf, &htmcaps_fops); + + return 0; +} + +static int __init htmdump_init(void) +{ + /* Disable on kvm guest */ + if (is_kvm_guest()) { + pr_info("htmdump not supported inside KVM guest\n"); + return -EOPNOTSUPP; + } + + if (htmdump_init_debugfs()) + return -ENOMEM; + + return 0; +} + +static void __exit htmdump_exit(void) +{ + debugfs_remove_recursive(htmdump_debugfs_dir); + kfree(htm_buf); +} + +module_init(htmdump_init); +module_exit(htmdump_exit); +MODULE_DESCRIPTION("PHYP Hardware Trace Macro (HTM) data dumper"); +MODULE_LICENSE("GPL"); diff --git a/arch/powerpc/platforms/pseries/ibmebus.c b/arch/powerpc/platforms/pseries/ibmebus.c index b401282727a4..3436b0af795e 100644 --- a/arch/powerpc/platforms/pseries/ibmebus.c +++ b/arch/powerpc/platforms/pseries/ibmebus.c @@ -339,7 +339,7 @@ static struct attribute *ibmbus_bus_attrs[] = { }; ATTRIBUTE_GROUPS(ibmbus_bus); -static int ibmebus_bus_bus_match(struct device *dev, struct device_driver *drv) +static int 
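(Illustrative aside, not part of the diff.) The htmdump module above only exposes plain debugfs files, so a trace session is a sequence of writes and reads. A minimal userspace sketch, assuming debugfs is mounted at /sys/kernel/debug and the directory created under arch_debugfs_dir resolves to /sys/kernel/debug/powerpc/htmdump; the target-selection values are placeholders and the exact sequencing requirements are PowerVM-specific:

/*
 * Illustrative only -- drives the htmdump debugfs files added above.
 * The path and the target core/chip values are assumptions.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define HTMDIR "/sys/kernel/debug/powerpc/htmdump"

static int htm_write(const char *file, const char *val)
{
	char path[256];
	int fd, rc = -1;

	snprintf(path, sizeof(path), HTMDIR "/%s", file);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	if (write(fd, val, strlen(val)) >= 0)
		rc = 0;
	close(fd);
	return rc;
}

int main(void)
{
	char buf[4096];
	ssize_t n;
	int fd;

	/* Select the HTM target (placeholder values). */
	htm_write("nodeindex", "0");
	htm_write("nodalchipindex", "0");
	htm_write("coreindexonchip", "0");
	htm_write("htmtype", "2");	/* 2 is treated as core HTM in htmstatus_read() */

	htm_write("htmsetup", "33");	/* 2^33 = 8GB buffer, the example from htmsetup_set() */
	htm_write("htmconfigure", "1");	/* H_HTM_OP_CONFIGURE */
	htm_write("htmstart", "1");	/* H_HTM_OP_START */
	/* ... run the workload of interest ... */
	htm_write("htmstart", "0");	/* H_HTM_OP_STOP */

	/* First page of trace data via H_HTM_OP_DUMP_DATA. */
	fd = open(HTMDIR "/trace", O_RDONLY);
	if (fd >= 0) {
		n = read(fd, buf, sizeof(buf));
		printf("read %zd bytes of HTM data\n", n);
		close(fd);
	}

	htm_write("htmconfigure", "0");	/* H_HTM_OP_DECONFIGURE */
	return 0;
}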
ibmebus_bus_bus_match(struct device *dev, const struct device_driver *drv) { const struct of_device_id *matches = drv->of_match_table; diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index b1e6d275cda9..eec333dd2e59 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -21,6 +21,7 @@ #include <linux/dma-mapping.h> #include <linux/crash_dump.h> #include <linux/memory.h> +#include <linux/vmalloc.h> #include <linux/of.h> #include <linux/of_address.h> #include <linux/iommu.h> @@ -51,7 +52,8 @@ enum { enum { DDW_EXT_SIZE = 0, DDW_EXT_RESET_DMA_WIN = 1, - DDW_EXT_QUERY_OUT_SIZE = 2 + DDW_EXT_QUERY_OUT_SIZE = 2, + DDW_EXT_LIMITED_ADDR_MODE = 3 }; static struct iommu_table *iommu_pseries_alloc_table(int node) @@ -67,6 +69,10 @@ static struct iommu_table *iommu_pseries_alloc_table(int node) return tbl; } +#ifdef CONFIG_IOMMU_API +static struct iommu_table_group_ops spapr_tce_table_group_ops; +#endif + static struct iommu_table_group *iommu_pseries_alloc_group(int node) { struct iommu_table_group *table_group; @@ -102,7 +108,7 @@ static void iommu_pseries_free_group(struct iommu_table_group *table_group, #endif /* Default DMA window table is at index 0, while DDW at 1. SR-IOV - * adapters only have table on index 1. + * adapters only have table on index 0(if not direct mapped). */ if (table_group->tables[0]) iommu_tce_table_put(table_group->tables[0]); @@ -143,7 +149,7 @@ static int tce_build_pSeries(struct iommu_table *tbl, long index, } -static void tce_free_pSeries(struct iommu_table *tbl, long index, long npages) +static void tce_clear_pSeries(struct iommu_table *tbl, long index, long npages) { __be64 *tcep; @@ -162,6 +168,39 @@ static unsigned long tce_get_pseries(struct iommu_table *tbl, long index) return be64_to_cpu(*tcep); } +#ifdef CONFIG_IOMMU_API +static long pseries_tce_iommu_userspace_view_alloc(struct iommu_table *tbl) +{ + unsigned long cb = ALIGN(sizeof(tbl->it_userspace[0]) * tbl->it_size, PAGE_SIZE); + unsigned long *uas; + + if (tbl->it_indirect_levels) /* Impossible */ + return -EPERM; + + WARN_ON(tbl->it_userspace); + + uas = vzalloc(cb); + if (!uas) + return -ENOMEM; + + tbl->it_userspace = (__be64 *) uas; + + return 0; +} +#endif + +static void tce_iommu_userspace_view_free(struct iommu_table *tbl) +{ + vfree(tbl->it_userspace); + tbl->it_userspace = NULL; +} + +static void tce_free_pSeries(struct iommu_table *tbl) +{ + if (tbl->it_userspace) + tce_iommu_userspace_view_free(tbl); +} + static void tce_free_pSeriesLP(unsigned long liobn, long, long, long); static void tce_freemulti_pSeriesLP(struct iommu_table*, long, long); @@ -576,7 +615,7 @@ struct iommu_table_ops iommu_table_lpar_multi_ops; struct iommu_table_ops iommu_table_pseries_ops = { .set = tce_build_pSeries, - .clear = tce_free_pSeries, + .clear = tce_clear_pSeries, .get = tce_get_pseries }; @@ -685,17 +724,47 @@ static int tce_exchange_pseries(struct iommu_table *tbl, long index, unsigned return rc; } + +static __be64 *tce_useraddr_pSeriesLP(struct iommu_table *tbl, long index, + bool __always_unused alloc) +{ + return tbl->it_userspace ? 
&tbl->it_userspace[index - tbl->it_offset] : NULL; +} #endif struct iommu_table_ops iommu_table_lpar_multi_ops = { .set = tce_buildmulti_pSeriesLP, #ifdef CONFIG_IOMMU_API .xchg_no_kill = tce_exchange_pseries, + .useraddrptr = tce_useraddr_pSeriesLP, #endif .clear = tce_freemulti_pSeriesLP, - .get = tce_get_pSeriesLP + .get = tce_get_pSeriesLP, + .free = tce_free_pSeries }; +#ifdef CONFIG_IOMMU_API +/* + * When the DMA window properties might have been removed, + * the parent node has the table_group setup on it. + */ +static struct device_node *pci_dma_find_parent_node(struct pci_dev *dev, + struct iommu_table_group *table_group) +{ + struct device_node *dn = pci_device_to_OF_node(dev); + struct pci_dn *rpdn; + + for (; dn && PCI_DN(dn); dn = dn->parent) { + rpdn = PCI_DN(dn); + + if (table_group == rpdn->table_group) + return dn; + } + + return NULL; +} +#endif + /* * Find nearest ibm,dma-window (default DMA window) or direct DMA window or * dynamic 64bit DMA window, walking up the device tree. @@ -812,13 +881,6 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus) be32_to_cpu(prop.tce_shift), NULL, &iommu_table_lpar_multi_ops); - /* Only for normal boot with default window. Doesn't matter even - * if we set these with DDW which is 64bit during kdump, since - * these will not be used during kdump. - */ - ppci->table_group->tce32_start = be64_to_cpu(prop.dma_base); - ppci->table_group->tce32_size = 1 << be32_to_cpu(prop.window_shift); - if (!iommu_init_table(tbl, ppci->phb->node, 0, 0)) panic("Failed to initialize iommu table"); @@ -917,7 +979,7 @@ static void __remove_dma_window(struct device_node *np, u32 *ddw_avail, u64 liob } static void remove_dma_window(struct device_node *np, u32 *ddw_avail, - struct property *win) + struct property *win, bool cleanup) { struct dynamic_dma_window_prop *dwp; u64 liobn; @@ -925,11 +987,44 @@ static void remove_dma_window(struct device_node *np, u32 *ddw_avail, dwp = win->value; liobn = (u64)be32_to_cpu(dwp->liobn); - clean_dma_window(np, dwp); + if (cleanup) + clean_dma_window(np, dwp); __remove_dma_window(np, ddw_avail, liobn); } -static int remove_ddw(struct device_node *np, bool remove_prop, const char *win_name) +static void copy_property(struct device_node *pdn, const char *from, const char *to) +{ + struct property *src, *dst; + + src = of_find_property(pdn, from, NULL); + if (!src) + return; + + dst = kzalloc(sizeof(*dst), GFP_KERNEL); + if (!dst) + return; + + dst->name = kstrdup(to, GFP_KERNEL); + dst->value = kmemdup(src->value, src->length, GFP_KERNEL); + dst->length = src->length; + if (!dst->name || !dst->value) + return; + + if (of_add_property(pdn, dst)) { + pr_err("Unable to add DMA window property for %pOF", pdn); + goto free_prop; + } + + return; + +free_prop: + kfree(dst->name); + kfree(dst->value); + kfree(dst); +} + +static int remove_dma_window_named(struct device_node *np, bool remove_prop, const char *win_name, + bool cleanup) { struct property *win; u32 ddw_avail[DDW_APPLICABLE_SIZE]; @@ -944,13 +1039,20 @@ static int remove_ddw(struct device_node *np, bool remove_prop, const char *win_ if (ret) return 0; - if (win->length >= sizeof(struct dynamic_dma_window_prop)) - remove_dma_window(np, ddw_avail, win); + remove_dma_window(np, ddw_avail, win, cleanup); if (!remove_prop) return 0; + /* Default window property if removed is lost as reset-pe doesn't restore it. + * Though FDT has a copy of it, the DLPAR hotplugged devices will not have a + * node on FDT until next reboot. So, back it up. 
+ */ + if ((strcmp(win_name, "ibm,dma-window") == 0) && + !of_find_property(np, "ibm,dma-window-saved", NULL)) + copy_property(np, win_name, "ibm,dma-window-saved"); + ret = of_remove_property(np, win); if (ret) pr_warn("%pOF: failed to remove DMA window property: %d\n", @@ -1008,7 +1110,7 @@ static void find_existing_ddw_windows_named(const char *name) for_each_node_with_property(pdn, name) { dma64 = of_get_property(pdn, name, &len); if (!dma64 || len < sizeof(*dma64)) { - remove_ddw(pdn, true, name); + remove_dma_window_named(pdn, true, name, true); continue; } @@ -1183,17 +1285,13 @@ static LIST_HEAD(failed_ddw_pdn_list); static phys_addr_t ddw_memory_hotplug_max(void) { - resource_size_t max_addr = memory_hotplug_max(); - struct device_node *memory; - - for_each_node_by_type(memory, "memory") { - struct resource res; - - if (of_address_to_resource(memory, 0, &res)) - continue; + resource_size_t max_addr; - max_addr = max_t(resource_size_t, max_addr, res.end + 1); - } +#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG) + max_addr = hot_add_drconf_memory_max(); +#else + max_addr = memblock_end_of_DRAM(); +#endif return max_addr; } @@ -1230,6 +1328,54 @@ static void reset_dma_window(struct pci_dev *dev, struct device_node *par_dn) ret); } +/* + * Platforms support placing PHB in limited address mode starting with LoPAR + * level 2.13 implement. In this mode, the DMA address returned by DDW is over + * 4GB but, less than 64-bits. This benefits IO adapters that don't support + * 64-bits for DMA addresses. + */ +static int limited_dma_window(struct pci_dev *dev, struct device_node *par_dn) +{ + int ret; + u32 cfg_addr, reset_dma_win, las_supported; + u64 buid; + struct device_node *dn; + struct pci_dn *pdn; + + ret = ddw_read_ext(par_dn, DDW_EXT_RESET_DMA_WIN, &reset_dma_win); + if (ret) + goto out; + + ret = ddw_read_ext(par_dn, DDW_EXT_LIMITED_ADDR_MODE, &las_supported); + + /* Limited Address Space extension available on the platform but DDW in + * limited addressing mode not supported + */ + if (!ret && !las_supported) + ret = -EPROTO; + + if (ret) { + dev_info(&dev->dev, "Limited Address Space for DDW not Supported, err: %d", ret); + goto out; + } + + dn = pci_device_to_OF_node(dev); + pdn = PCI_DN(dn); + buid = pdn->phb->buid; + cfg_addr = (pdn->busno << 16) | (pdn->devfn << 8); + + ret = rtas_call(reset_dma_win, 4, 1, NULL, cfg_addr, BUID_HI(buid), + BUID_LO(buid), 1); + if (ret) + dev_info(&dev->dev, + "ibm,reset-pe-dma-windows(%x) for Limited Addr Support: %x %x %x returned %d ", + reset_dma_win, cfg_addr, BUID_HI(buid), BUID_LO(buid), + ret); + +out: + return ret; +} + /* Return largest page shift based on "IO Page Sizes" output of ibm,query-pe-dma-window. */ static int iommu_get_page_shift(u32 query_page_size) { @@ -1297,14 +1443,14 @@ static struct property *ddw_property_create(const char *propname, u32 liobn, u64 * * returns true if can map all pages (direct mapping), false otherwise.. 
*/ -static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn) +static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn, u64 dma_mask) { int len = 0, ret; int max_ram_len = order_base_2(ddw_memory_hotplug_max()); struct ddw_query_response query; struct ddw_create_response create; int page_shift; - u64 win_addr; + u64 win_addr, dynamic_offset = 0; const char *win_name; struct device_node *dn; u32 ddw_avail[DDW_APPLICABLE_SIZE]; @@ -1312,9 +1458,13 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn) struct property *win64; struct failed_ddw_pdn *fpdn; bool default_win_removed = false, direct_mapping = false; + bool dynamic_mapping = false; bool pmem_present; struct pci_dn *pci = PCI_DN(pdn); struct property *default_win = NULL; + bool limited_addr_req = false, limited_addr_enabled = false; + int dev_max_ddw; + int ddw_sz; dn = of_find_node_by_type(NULL, "ibm,pmemory"); pmem_present = dn != NULL; @@ -1341,7 +1491,6 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn) * the ibm,ddw-applicable property holds the tokens for: * ibm,query-pe-dma-window * ibm,create-pe-dma-window - * ibm,remove-pe-dma-window * for the given node in that order. * the property is actually in the parent, not the PE */ @@ -1361,6 +1510,20 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn) if (ret != 0) goto out_failed; + /* DMA Limited Addressing required? This is when the driver has + * requested to create DDW but supports mask which is less than 64-bits + */ + limited_addr_req = (dma_mask != DMA_BIT_MASK(64)); + + /* place the PHB in Limited Addressing mode */ + if (limited_addr_req) { + if (limited_dma_window(dev, pdn)) + goto out_failed; + + /* PHB is in Limited address mode */ + limited_addr_enabled = true; + } + /* * If there is no window available, remove the default DMA window, * if it's present. This will make all the resources available to the @@ -1385,7 +1548,7 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn) if (reset_win_ext) goto out_failed; - remove_dma_window(pdn, ddw_avail, default_win); + remove_dma_window(pdn, ddw_avail, default_win, true); default_win_removed = true; /* Query again, to check if the window is available */ @@ -1407,6 +1570,14 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn) goto out_failed; } + /* Maximum DMA window size that the device can address (in log2) */ + dev_max_ddw = fls64(dma_mask); + + /* If the device DMA mask is less than 64-bits, make sure the DMA window + * size is not bigger than what the device can access + */ + ddw_sz = min(order_base_2(query.largest_available_block << page_shift), + dev_max_ddw); /* * The "ibm,pmemory" can appear anywhere in the address space. 
@@ -1416,30 +1587,56 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn) */ len = max_ram_len; if (pmem_present) { - if (query.largest_available_block >= - (1ULL << (MAX_PHYSMEM_BITS - page_shift))) + if (ddw_sz >= MAX_PHYSMEM_BITS) len = MAX_PHYSMEM_BITS; else dev_info(&dev->dev, "Skipping ibm,pmemory"); } /* check if the available block * number of ptes will map everything */ - if (query.largest_available_block < (1ULL << (len - page_shift))) { + if (ddw_sz < len) { dev_dbg(&dev->dev, "can't map partition max 0x%llx with %llu %llu-sized pages\n", 1ULL << len, query.largest_available_block, 1ULL << page_shift); - len = order_base_2(query.largest_available_block << page_shift); - win_name = DMA64_PROPNAME; + len = ddw_sz; + dynamic_mapping = true; } else { direct_mapping = !default_win_removed || (len == MAX_PHYSMEM_BITS) || (!pmem_present && (len == max_ram_len)); - win_name = direct_mapping ? DIRECT64_PROPNAME : DMA64_PROPNAME; + + /* DDW is big enough to direct map RAM. If there is vPMEM, check + * if enough space is left in DDW where we can dynamically + * allocate TCEs for vPMEM. For now, this Hybrid sharing of DDW + * is only for SR-IOV devices. + */ + if (default_win_removed && pmem_present && !direct_mapping) { + /* DDW is big enough to be split */ + if ((1ULL << ddw_sz) >= + MIN_DDW_VPMEM_DMA_WINDOW + (1ULL << max_ram_len)) { + + direct_mapping = true; + + /* offset of the Dynamic part of DDW */ + dynamic_offset = 1ULL << max_ram_len; + } + + /* DDW will at least have dynamic allocation */ + dynamic_mapping = true; + + /* create max size DDW possible */ + len = ddw_sz; + } } + /* Even if the DDW is split into both direct mapped RAM and dynamically + * mapped vPMEM, the DDW property in OF will be marked as Direct. + */ + win_name = direct_mapping ? 
DIRECT64_PROPNAME : DMA64_PROPNAME; + ret = create_ddw(dev, ddw_avail, &create, page_shift, len); if (ret != 0) goto out_failed; @@ -1467,11 +1664,11 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn) if (!window) goto out_del_prop; - if (direct_mapping) { - window->direct = true; + window->direct = direct_mapping; + if (direct_mapping) { /* DDW maps the whole partition, so enable direct DMA mapping */ - ret = walk_system_ram_range(0, memblock_end_of_DRAM() >> PAGE_SHIFT, + ret = walk_system_ram_range(0, ddw_memory_hotplug_max() >> PAGE_SHIFT, win64->value, tce_setrange_multi_pSeriesLP_walk); if (ret) { dev_info(&dev->dev, "failed to map DMA window for %pOF: %d\n", @@ -1481,12 +1678,18 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn) clean_dma_window(pdn, win64->value); goto out_del_list; } - } else { + if (default_win_removed) { + iommu_tce_table_put(pci->table_group->tables[0]); + pci->table_group->tables[0] = NULL; + set_iommu_table_base(&dev->dev, NULL); + } + } + + if (dynamic_mapping) { struct iommu_table *newtbl; int i; unsigned long start = 0, end = 0; - - window->direct = false; + u64 dynamic_addr, dynamic_len; for (i = 0; i < ARRAY_SIZE(pci->phb->mem_resources); i++) { const unsigned long mask = IORESOURCE_MEM_64 | IORESOURCE_MEM; @@ -1506,20 +1709,27 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn) goto out_del_list; } - iommu_table_setparms_common(newtbl, pci->phb->bus->number, create.liobn, win_addr, - 1UL << len, page_shift, NULL, &iommu_table_lpar_multi_ops); - iommu_init_table(newtbl, pci->phb->node, start, end); + /* If the DDW is split between directly mapped RAM and Dynamic + * mapped for TCES, offset into the DDW where the dynamic part + * begins. + */ + dynamic_addr = win_addr + dynamic_offset; + dynamic_len = (1UL << len) - dynamic_offset; + iommu_table_setparms_common(newtbl, pci->phb->bus->number, create.liobn, + dynamic_addr, dynamic_len, page_shift, NULL, + &iommu_table_lpar_multi_ops); + iommu_init_table(newtbl, pci->phb->node, + start >> page_shift, end >> page_shift); - pci->table_group->tables[1] = newtbl; + pci->table_group->tables[default_win_removed ? 0 : 1] = newtbl; set_iommu_table_base(&dev->dev, newtbl); } if (default_win_removed) { - iommu_tce_table_put(pci->table_group->tables[0]); - pci->table_group->tables[0] = NULL; - /* default_win is valid here because default_win_removed == true */ + if (!of_find_property(pdn, "ibm,dma-window-saved", NULL)) + copy_property(pdn, "ibm,dma-window", "ibm,dma-window-saved"); of_remove_property(pdn, default_win); dev_info(&dev->dev, "Removed default DMA window for %pOF\n", pdn); } @@ -1547,7 +1757,7 @@ out_remove_win: __remove_dma_window(pdn, ddw_avail, create.liobn); out_failed: - if (default_win_removed) + if (default_win_removed || limited_addr_enabled) reset_dma_window(dev, pdn); fpdn = kzalloc(sizeof(*fpdn), GFP_KERNEL); @@ -1559,17 +1769,84 @@ out_failed: out_unlock: mutex_unlock(&dma_win_init_mutex); - /* - * If we have persistent memory and the window size is only as big - * as RAM, then we failed to create a window to cover persistent - * memory and need to set the DMA limit. + /* If we have persistent memory and the window size is not big enough + * to directly map both RAM and vPMEM, then we need to set DMA limit. 
*/ - if (pmem_present && direct_mapping && len == max_ram_len) - dev->dev.bus_dma_limit = dev->dev.archdata.dma_offset + (1ULL << len); + if (pmem_present && direct_mapping && len != MAX_PHYSMEM_BITS) + dev->dev.bus_dma_limit = dev->dev.archdata.dma_offset + + (1ULL << max_ram_len); + + dev_info(&dev->dev, "lsa_required: %x, lsa_enabled: %x, direct mapping: %x\n", + limited_addr_req, limited_addr_enabled, direct_mapping); return direct_mapping; } +static __u64 query_page_size_to_mask(u32 query_page_size) +{ + const long shift[] = { + (SZ_4K), (SZ_64K), (SZ_16M), + (SZ_32M), (SZ_64M), (SZ_128M), + (SZ_256M), (SZ_16G), (SZ_2M) + }; + int i, ret = 0; + + for (i = 0; i < ARRAY_SIZE(shift); i++) { + if (query_page_size & (1 << i)) + ret |= shift[i]; + } + + return ret; +} + +static void spapr_tce_init_table_group(struct pci_dev *pdev, + struct device_node *pdn, + struct dynamic_dma_window_prop prop) +{ + struct iommu_table_group *table_group = PCI_DN(pdn)->table_group; + u32 ddw_avail[DDW_APPLICABLE_SIZE]; + + struct ddw_query_response query; + int ret; + + /* Only for normal boot with default window. Doesn't matter during + * kdump, since these will not be used during kdump. + */ + if (is_kdump_kernel()) + return; + + if (table_group->max_dynamic_windows_supported != 0) + return; /* already initialized */ + + table_group->tce32_start = be64_to_cpu(prop.dma_base); + table_group->tce32_size = 1 << be32_to_cpu(prop.window_shift); + + if (!of_find_property(pdn, "ibm,dma-window", NULL)) + dev_err(&pdev->dev, "default dma window missing!\n"); + + ret = of_property_read_u32_array(pdn, "ibm,ddw-applicable", + &ddw_avail[0], DDW_APPLICABLE_SIZE); + if (ret) { + table_group->max_dynamic_windows_supported = -1; + return; + } + + ret = query_ddw(pdev, ddw_avail, &query, pdn); + if (ret) { + dev_err(&pdev->dev, "%s: query_ddw failed\n", __func__); + table_group->max_dynamic_windows_supported = -1; + return; + } + + if (query.windows_available == 0) + table_group->max_dynamic_windows_supported = 1; + else + table_group->max_dynamic_windows_supported = IOMMU_TABLE_GROUP_MAX_TABLES; + + table_group->max_levels = 1; + table_group->pgsizes |= query_page_size_to_mask(query.page_size); +} + static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) { struct device_node *pdn, *dn; @@ -1609,13 +1886,6 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) be32_to_cpu(prop.tce_shift), NULL, &iommu_table_lpar_multi_ops); - /* Only for normal boot with default window. Doesn't matter even - * if we set these with DDW which is 64bit during kdump, since - * these will not be used during kdump. - */ - pci->table_group->tce32_start = be64_to_cpu(prop.dma_base); - pci->table_group->tce32_size = 1 << be32_to_cpu(prop.window_shift); - iommu_init_table(tbl, pci->phb->node, 0, 0); iommu_register_group(pci->table_group, pci_domain_nr(pci->phb->bus), 0); @@ -1624,6 +1894,8 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) pr_debug(" found DMA window, table: %p\n", pci->table_group); } + spapr_tce_init_table_group(dev, pdn, prop); + set_iommu_table_base(&dev->dev, pci->table_group->tables[0]); iommu_add_device(pci->table_group, &dev->dev); } @@ -1632,8 +1904,11 @@ static bool iommu_bypass_supported_pSeriesLP(struct pci_dev *pdev, u64 dma_mask) { struct device_node *dn = pci_device_to_OF_node(pdev), *pdn; - /* only attempt to use a new window if 64-bit DMA is requested */ - if (dma_mask < DMA_BIT_MASK(64)) + /* For DDW, DMA mask should be more than 32-bits. 
For mask more then + * 32-bits but less then 64-bits, DMA addressing is supported in + * Limited Addressing mode. + */ + if (dma_mask <= DMA_BIT_MASK(32)) return false; dev_dbg(&pdev->dev, "node is %pOF\n", dn); @@ -1646,11 +1921,501 @@ static bool iommu_bypass_supported_pSeriesLP(struct pci_dev *pdev, u64 dma_mask) */ pdn = pci_dma_find(dn, NULL); if (pdn && PCI_DN(pdn)) - return enable_ddw(pdev, pdn); + return enable_ddw(pdev, pdn, dma_mask); + + return false; +} + +#ifdef CONFIG_IOMMU_API +/* + * A simple iommu_table_group_ops which only allows reusing the existing + * iommu_table. This handles VFIO for POWER7 or the nested KVM. + * The ops does not allow creating windows and only allows reusing the existing + * one if it matches table_group->tce32_start/tce32_size/page_shift. + */ +static unsigned long spapr_tce_get_table_size(__u32 page_shift, + __u64 window_size, __u32 levels) +{ + unsigned long size; + + if (levels > 1) + return ~0U; + size = window_size >> (page_shift - 3); + return size; +} + +static struct pci_dev *iommu_group_get_first_pci_dev(struct iommu_group *group) +{ + struct pci_dev *pdev = NULL; + int ret; + + /* No IOMMU group ? */ + if (!group) + return NULL; + + ret = iommu_group_for_each_dev(group, &pdev, dev_has_iommu_table); + if (!ret || !pdev) + return NULL; + return pdev; +} + +static void restore_default_dma_window(struct pci_dev *pdev, struct device_node *pdn) +{ + reset_dma_window(pdev, pdn); + copy_property(pdn, "ibm,dma-window-saved", "ibm,dma-window"); +} + +static long remove_dynamic_dma_windows(struct pci_dev *pdev, struct device_node *pdn) +{ + struct pci_dn *pci = PCI_DN(pdn); + struct dma_win *window; + bool direct_mapping; + int len; + + if (find_existing_ddw(pdn, &pdev->dev.archdata.dma_offset, &len, &direct_mapping)) { + remove_dma_window_named(pdn, true, direct_mapping ? + DIRECT64_PROPNAME : DMA64_PROPNAME, true); + if (!direct_mapping) { + WARN_ON(!pci->table_group->tables[0] && !pci->table_group->tables[1]); + + if (pci->table_group->tables[1]) { + iommu_tce_table_put(pci->table_group->tables[1]); + pci->table_group->tables[1] = NULL; + } else if (pci->table_group->tables[0]) { + /* Default window was removed and only the DDW exists */ + iommu_tce_table_put(pci->table_group->tables[0]); + pci->table_group->tables[0] = NULL; + } + } + spin_lock(&dma_win_list_lock); + list_for_each_entry(window, &dma_win_list, list) { + if (window->device == pdn) { + list_del(&window->list); + kfree(window); + break; + } + } + spin_unlock(&dma_win_list_lock); + } + + return 0; +} + +static long pseries_setup_default_iommu_config(struct iommu_table_group *table_group, + struct device *dev) +{ + struct pci_dev *pdev = to_pci_dev(dev); + const __be32 *default_prop; + long liobn, offset, size; + struct device_node *pdn; + struct iommu_table *tbl; + struct pci_dn *pci; + + pdn = pci_dma_find_parent_node(pdev, table_group); + if (!pdn || !PCI_DN(pdn)) { + dev_warn(&pdev->dev, "No table_group configured for the node %pOF\n", pdn); + return -1; + } + pci = PCI_DN(pdn); + + /* The default window is restored if not present already on removal of DDW. + * However, if used by VFIO SPAPR sub driver, the user's order of removal of + * windows might have been different to not leading to auto restoration, + * suppose the DDW was removed first followed by the default one. + * So, restore the default window with reset-pe-dma call explicitly. 
+ */ + restore_default_dma_window(pdev, pdn); + + default_prop = of_get_property(pdn, "ibm,dma-window", NULL); + of_parse_dma_window(pdn, default_prop, &liobn, &offset, &size); + tbl = iommu_pseries_alloc_table(pci->phb->node); + if (!tbl) { + dev_err(&pdev->dev, "couldn't create new IOMMU table\n"); + return -1; + } + + iommu_table_setparms_common(tbl, pci->phb->bus->number, liobn, offset, + size, IOMMU_PAGE_SHIFT_4K, NULL, + &iommu_table_lpar_multi_ops); + iommu_init_table(tbl, pci->phb->node, 0, 0); + + pci->table_group->tables[0] = tbl; + set_iommu_table_base(&pdev->dev, tbl); + + return 0; +} + +static bool is_default_window_request(struct iommu_table_group *table_group, __u32 page_shift, + __u64 window_size) +{ + if ((window_size <= table_group->tce32_size) && + (page_shift == IOMMU_PAGE_SHIFT_4K)) + return true; return false; } +static long spapr_tce_create_table(struct iommu_table_group *table_group, int num, + __u32 page_shift, __u64 window_size, __u32 levels, + struct iommu_table **ptbl) +{ + struct pci_dev *pdev = iommu_group_get_first_pci_dev(table_group->group); + u32 ddw_avail[DDW_APPLICABLE_SIZE]; + struct ddw_create_response create; + unsigned long liobn, offset, size; + unsigned long start = 0, end = 0; + struct ddw_query_response query; + const __be32 *default_prop; + struct failed_ddw_pdn *fpdn; + unsigned int window_shift; + struct device_node *pdn; + struct iommu_table *tbl; + struct dma_win *window; + struct property *win64; + struct pci_dn *pci; + u64 win_addr; + int len, i; + long ret; + + if (!is_power_of_2(window_size) || levels > 1) + return -EINVAL; + + window_shift = order_base_2(window_size); + + mutex_lock(&dma_win_init_mutex); + + ret = -ENODEV; + + pdn = pci_dma_find_parent_node(pdev, table_group); + if (!pdn || !PCI_DN(pdn)) { /* Niether of 32s|64-bit exist! */ + dev_warn(&pdev->dev, "No dma-windows exist for the node %pOF\n", pdn); + goto out_failed; + } + pci = PCI_DN(pdn); + + /* If the enable DDW failed for the pdn, dont retry! */ + list_for_each_entry(fpdn, &failed_ddw_pdn_list, list) { + if (fpdn->pdn == pdn) { + dev_info(&pdev->dev, "%pOF in failed DDW device list\n", pdn); + goto out_unlock; + } + } + + tbl = iommu_pseries_alloc_table(pci->phb->node); + if (!tbl) { + dev_dbg(&pdev->dev, "couldn't create new IOMMU table\n"); + goto out_unlock; + } + + if (num == 0) { + bool direct_mapping; + /* The request is not for default window? Ensure there is no DDW window already */ + if (!is_default_window_request(table_group, page_shift, window_size)) { + if (find_existing_ddw(pdn, &pdev->dev.archdata.dma_offset, &len, + &direct_mapping)) { + dev_warn(&pdev->dev, "%pOF: 64-bit window already present.", pdn); + ret = -EPERM; + goto out_unlock; + } + } else { + /* Request is for Default window, ensure there is no DDW if there is a + * need to reset. reset-pe otherwise removes the DDW also + */ + default_prop = of_get_property(pdn, "ibm,dma-window", NULL); + if (!default_prop) { + if (find_existing_ddw(pdn, &pdev->dev.archdata.dma_offset, &len, + &direct_mapping)) { + dev_warn(&pdev->dev, "%pOF: Attempt to create window#0 when 64-bit window is present. 
Preventing the attempt as that would destroy the 64-bit window", + pdn); + ret = -EPERM; + goto out_unlock; + } + + restore_default_dma_window(pdev, pdn); + + default_prop = of_get_property(pdn, "ibm,dma-window", NULL); + of_parse_dma_window(pdn, default_prop, &liobn, &offset, &size); + /* Limit the default window size to window_size */ + iommu_table_setparms_common(tbl, pci->phb->bus->number, liobn, + offset, 1UL << window_shift, + IOMMU_PAGE_SHIFT_4K, NULL, + &iommu_table_lpar_multi_ops); + iommu_init_table(tbl, pci->phb->node, + start >> IOMMU_PAGE_SHIFT_4K, + end >> IOMMU_PAGE_SHIFT_4K); + + table_group->tables[0] = tbl; + + mutex_unlock(&dma_win_init_mutex); + + goto exit; + } + } + } + + ret = of_property_read_u32_array(pdn, "ibm,ddw-applicable", + &ddw_avail[0], DDW_APPLICABLE_SIZE); + if (ret) { + dev_info(&pdev->dev, "ibm,ddw-applicable not found\n"); + goto out_failed; + } + ret = -ENODEV; + + pr_err("%s: Calling query %pOF\n", __func__, pdn); + ret = query_ddw(pdev, ddw_avail, &query, pdn); + if (ret) + goto out_failed; + ret = -ENODEV; + + len = window_shift; + if (query.largest_available_block < (1ULL << (len - page_shift))) { + dev_dbg(&pdev->dev, "can't map window 0x%llx with %llu %llu-sized pages\n", + 1ULL << len, query.largest_available_block, + 1ULL << page_shift); + ret = -EINVAL; /* Retry with smaller window size */ + goto out_unlock; + } + + if (create_ddw(pdev, ddw_avail, &create, page_shift, len)) { + pr_err("%s: Create ddw failed %pOF\n", __func__, pdn); + goto out_failed; + } + + win_addr = ((u64)create.addr_hi << 32) | create.addr_lo; + win64 = ddw_property_create(DMA64_PROPNAME, create.liobn, win_addr, page_shift, len); + if (!win64) + goto remove_window; + + ret = of_add_property(pdn, win64); + if (ret) { + dev_err(&pdev->dev, "unable to add DMA window property for %pOF: %ld", pdn, ret); + goto free_property; + } + ret = -ENODEV; + + window = ddw_list_new_entry(pdn, win64->value); + if (!window) + goto remove_property; + + window->direct = false; + + for (i = 0; i < ARRAY_SIZE(pci->phb->mem_resources); i++) { + const unsigned long mask = IORESOURCE_MEM_64 | IORESOURCE_MEM; + + /* Look for MMIO32 */ + if ((pci->phb->mem_resources[i].flags & mask) == IORESOURCE_MEM) { + start = pci->phb->mem_resources[i].start; + end = pci->phb->mem_resources[i].end; + break; + } + } + + /* New table for using DDW instead of the default DMA window */ + iommu_table_setparms_common(tbl, pci->phb->bus->number, create.liobn, win_addr, + 1UL << len, page_shift, NULL, &iommu_table_lpar_multi_ops); + iommu_init_table(tbl, pci->phb->node, start >> page_shift, end >> page_shift); + + pci->table_group->tables[num] = tbl; + set_iommu_table_base(&pdev->dev, tbl); + pdev->dev.archdata.dma_offset = win_addr; + + spin_lock(&dma_win_list_lock); + list_add(&window->list, &dma_win_list); + spin_unlock(&dma_win_list_lock); + + mutex_unlock(&dma_win_init_mutex); + + goto exit; + +remove_property: + of_remove_property(pdn, win64); +free_property: + kfree(win64->name); + kfree(win64->value); + kfree(win64); +remove_window: + __remove_dma_window(pdn, ddw_avail, create.liobn); + +out_failed: + fpdn = kzalloc(sizeof(*fpdn), GFP_KERNEL); + if (!fpdn) + goto out_unlock; + fpdn->pdn = pdn; + list_add(&fpdn->list, &failed_ddw_pdn_list); + +out_unlock: + mutex_unlock(&dma_win_init_mutex); + + return ret; +exit: + /* Allocate the userspace view */ + pseries_tce_iommu_userspace_view_alloc(tbl); + tbl->it_allocated_size = spapr_tce_get_table_size(page_shift, window_size, levels); + + *ptbl = 
iommu_tce_table_get(tbl); + + return 0; +} + +static bool is_default_window_table(struct iommu_table_group *table_group, struct iommu_table *tbl) +{ + if (((tbl->it_size << tbl->it_page_shift) <= table_group->tce32_size) && + (tbl->it_page_shift == IOMMU_PAGE_SHIFT_4K)) + return true; + + return false; +} + +static long spapr_tce_set_window(struct iommu_table_group *table_group, + int num, struct iommu_table *tbl) +{ + return tbl == table_group->tables[num] ? 0 : -EPERM; +} + +static long spapr_tce_unset_window(struct iommu_table_group *table_group, int num) +{ + struct pci_dev *pdev = iommu_group_get_first_pci_dev(table_group->group); + struct device_node *dn = pci_device_to_OF_node(pdev), *pdn; + struct iommu_table *tbl = table_group->tables[num]; + struct failed_ddw_pdn *fpdn; + struct dma_win *window; + const char *win_name; + int ret = -ENODEV; + + if (!tbl) /* The table was never created OR window was never opened */ + return 0; + + mutex_lock(&dma_win_init_mutex); + + if ((num == 0) && is_default_window_table(table_group, tbl)) + win_name = "ibm,dma-window"; + else + win_name = DMA64_PROPNAME; + + pdn = pci_dma_find(dn, NULL); + if (!pdn || !PCI_DN(pdn)) { /* Niether of 32s|64-bit exist! */ + dev_warn(&pdev->dev, "No dma-windows exist for the node %pOF\n", pdn); + goto out_failed; + } + + /* Dont clear the TCEs, User should have done it */ + if (remove_dma_window_named(pdn, true, win_name, false)) { + pr_err("%s: The existing DDW removal failed for node %pOF\n", __func__, pdn); + goto out_failed; /* Could not remove it either! */ + } + + if (strcmp(win_name, DMA64_PROPNAME) == 0) { + spin_lock(&dma_win_list_lock); + list_for_each_entry(window, &dma_win_list, list) { + if (window->device == pdn) { + list_del(&window->list); + kfree(window); + break; + } + } + spin_unlock(&dma_win_list_lock); + } + + iommu_tce_table_put(table_group->tables[num]); + table_group->tables[num] = NULL; + + ret = 0; + + goto out_unlock; + +out_failed: + fpdn = kzalloc(sizeof(*fpdn), GFP_KERNEL); + if (!fpdn) + goto out_unlock; + fpdn->pdn = pdn; + list_add(&fpdn->list, &failed_ddw_pdn_list); + +out_unlock: + mutex_unlock(&dma_win_init_mutex); + + return ret; +} + +static long spapr_tce_take_ownership(struct iommu_table_group *table_group, struct device *dev) +{ + struct iommu_table *tbl = table_group->tables[0]; + struct pci_dev *pdev = to_pci_dev(dev); + struct device_node *dn = pci_device_to_OF_node(pdev); + struct device_node *pdn; + + /* SRIOV VFs using direct map by the host driver OR multifunction devices + * where the ownership was taken on the attempt by the first function + */ + if (!tbl && (table_group->max_dynamic_windows_supported != 1)) + return 0; + + mutex_lock(&dma_win_init_mutex); + + pdn = pci_dma_find(dn, NULL); + if (!pdn || !PCI_DN(pdn)) { /* Niether of 32s|64-bit exist! */ + dev_warn(&pdev->dev, "No dma-windows exist for the node %pOF\n", pdn); + mutex_unlock(&dma_win_init_mutex); + return -1; + } + + /* + * Though rtas call reset-pe removes the DDW, it doesn't clear the entries on the table + * if there are any. In case of direct map, the entries will be left over, which + * is fine for PEs with 2 DMA windows where the second window is created with create-pe + * at which point the table is cleared. However, on VFs having only one DMA window, the + * default window would end up seeing the entries left over from the direct map done + * on the second window. So, remove the ddw explicitly so that clean_dma_window() + * cleans up the entries if any. 
+ */ + if (remove_dynamic_dma_windows(pdev, pdn)) { + dev_warn(&pdev->dev, "The existing DDW removal failed for node %pOF\n", pdn); + mutex_unlock(&dma_win_init_mutex); + return -1; + } + + /* The table_group->tables[0] is not null now, it must be the default window + * Remove it, let the userspace create it as it needs. + */ + if (table_group->tables[0]) { + remove_dma_window_named(pdn, true, "ibm,dma-window", true); + iommu_tce_table_put(tbl); + table_group->tables[0] = NULL; + } + set_iommu_table_base(dev, NULL); + + mutex_unlock(&dma_win_init_mutex); + + return 0; +} + +static void spapr_tce_release_ownership(struct iommu_table_group *table_group, struct device *dev) +{ + struct iommu_table *tbl = table_group->tables[0]; + + if (tbl) { /* Default window already restored */ + return; + } + + mutex_lock(&dma_win_init_mutex); + + /* Restore the default window */ + pseries_setup_default_iommu_config(table_group, dev); + + mutex_unlock(&dma_win_init_mutex); + + return; +} + +static struct iommu_table_group_ops spapr_tce_table_group_ops = { + .get_table_size = spapr_tce_get_table_size, + .create_table = spapr_tce_create_table, + .set_window = spapr_tce_set_window, + .unset_window = spapr_tce_unset_window, + .take_ownership = spapr_tce_take_ownership, + .release_ownership = spapr_tce_release_ownership, +}; +#endif + static int iommu_mem_notifier(struct notifier_block *nb, unsigned long action, void *data) { @@ -1658,11 +2423,17 @@ static int iommu_mem_notifier(struct notifier_block *nb, unsigned long action, struct memory_notify *arg = data; int ret = 0; + /* This notifier can get called when onlining persistent memory as well. + * TCEs are not pre-mapped for persistent memory. Persistent memory will + * always be above ddw_memory_hotplug_max() + */ + switch (action) { case MEM_GOING_ONLINE: spin_lock(&dma_win_list_lock); list_for_each_entry(window, &dma_win_list, list) { - if (window->direct) { + if (window->direct && (arg->start_pfn << PAGE_SHIFT) < + ddw_memory_hotplug_max()) { ret |= tce_setrange_multi_pSeriesLP(arg->start_pfn, arg->nr_pages, window->prop); } @@ -1674,7 +2445,8 @@ static int iommu_mem_notifier(struct notifier_block *nb, unsigned long action, case MEM_OFFLINE: spin_lock(&dma_win_list_lock); list_for_each_entry(window, &dma_win_list, list) { - if (window->direct) { + if (window->direct && (arg->start_pfn << PAGE_SHIFT) < + ddw_memory_hotplug_max()) { ret |= tce_clearrange_multi_pSeriesLP(arg->start_pfn, arg->nr_pages, window->prop); } @@ -1712,8 +2484,8 @@ static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long acti * we have to remove the property when releasing * the device node. 
*/ - if (remove_ddw(np, false, DIRECT64_PROPNAME)) - remove_ddw(np, false, DMA64_PROPNAME); + if (remove_dma_window_named(np, false, DIRECT64_PROPNAME, true)) + remove_dma_window_named(np, false, DMA64_PROPNAME, true); if (pci && pci->table_group) iommu_pseries_free_group(pci->table_group, diff --git a/arch/powerpc/platforms/pseries/kexec.c b/arch/powerpc/platforms/pseries/kexec.c index 096d09ed89f6..431be156ca9b 100644 --- a/arch/powerpc/platforms/pseries/kexec.c +++ b/arch/powerpc/platforms/pseries/kexec.c @@ -61,11 +61,3 @@ void pseries_kexec_cpu_down(int crash_shutdown, int secondary) } else xics_kexec_teardown_cpu(secondary); } - -void pseries_machine_kexec(struct kimage *image) -{ - if (firmware_has_feature(FW_FEATURE_SET_MODE)) - pseries_disable_reloc_on_exc(); - - default_machine_kexec(image); -} diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 4e9916bb03d7..6a415febc53b 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -16,6 +16,7 @@ #include <linux/export.h> #include <linux/jump_label.h> #include <linux/delay.h> +#include <linux/seq_file.h> #include <linux/stop_machine.h> #include <linux/spinlock.h> #include <linux/cpuhotplug.h> @@ -169,7 +170,7 @@ struct vcpu_dispatch_data { */ #define NR_CPUS_H NR_CPUS -DEFINE_RWLOCK(dtl_access_lock); +DECLARE_RWSEM(dtl_access_lock); static DEFINE_PER_CPU(struct vcpu_dispatch_data, vcpu_disp_data); static DEFINE_PER_CPU(u64, dtl_entry_ridx); static DEFINE_PER_CPU(struct dtl_worker, dtl_workers); @@ -463,7 +464,7 @@ static int dtl_worker_enable(unsigned long *time_limit) { int rc = 0, state; - if (!write_trylock(&dtl_access_lock)) { + if (!down_write_trylock(&dtl_access_lock)) { rc = -EBUSY; goto out; } @@ -479,7 +480,7 @@ static int dtl_worker_enable(unsigned long *time_limit) pr_err("vcpudispatch_stats: unable to setup workqueue for DTL processing\n"); free_dtl_buffers(time_limit); reset_global_dtl_mask(); - write_unlock(&dtl_access_lock); + up_write(&dtl_access_lock); rc = -EINVAL; goto out; } @@ -494,7 +495,7 @@ static void dtl_worker_disable(unsigned long *time_limit) cpuhp_remove_state(dtl_worker_state); free_dtl_buffers(time_limit); reset_global_dtl_mask(); - write_unlock(&dtl_access_lock); + up_write(&dtl_access_lock); } static ssize_t vcpudispatch_stats_write(struct file *file, const char __user *p, @@ -1886,10 +1887,10 @@ out: * h_get_mpp * H_GET_MPP hcall returns info in 7 parms */ -int h_get_mpp(struct hvcall_mpp_data *mpp_data) +long h_get_mpp(struct hvcall_mpp_data *mpp_data) { - int rc; - unsigned long retbuf[PLPAR_HCALL9_BUFSIZE]; + unsigned long retbuf[PLPAR_HCALL9_BUFSIZE] = {0}; + long rc; rc = plpar_hcall9(H_GET_MPP, retbuf); diff --git a/arch/powerpc/platforms/pseries/lparcfg.c b/arch/powerpc/platforms/pseries/lparcfg.c index f73c4d1c26af..cc22924f159f 100644 --- a/arch/powerpc/platforms/pseries/lparcfg.c +++ b/arch/powerpc/platforms/pseries/lparcfg.c @@ -29,7 +29,6 @@ #include <asm/firmware.h> #include <asm/rtas.h> #include <asm/time.h> -#include <asm/vdso_datapage.h> #include <asm/vio.h> #include <asm/mmu.h> #include <asm/machdep.h> @@ -113,8 +112,8 @@ struct hvcall_ppp_data { */ static unsigned int h_get_ppp(struct hvcall_ppp_data *ppp_data) { - unsigned long rc; - unsigned long retbuf[PLPAR_HCALL9_BUFSIZE]; + unsigned long retbuf[PLPAR_HCALL9_BUFSIZE] = {0}; + long rc; rc = plpar_hcall9(H_GET_PPP, retbuf); @@ -170,20 +169,24 @@ out: kfree(buf); } -static unsigned h_pic(unsigned long *pool_idle_time, - unsigned long *num_procs) 
+static long h_pic(unsigned long *pool_idle_time, + unsigned long *num_procs) { - unsigned long rc; - unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + long rc; + unsigned long retbuf[PLPAR_HCALL_BUFSIZE] = {0}; rc = plpar_hcall(H_PIC, retbuf); - *pool_idle_time = retbuf[0]; - *num_procs = retbuf[1]; + if (pool_idle_time) + *pool_idle_time = retbuf[0]; + if (num_procs) + *num_procs = retbuf[1]; return rc; } +unsigned long boot_pool_idle_time; + /* * parse_ppp_data * Parse out the data returned from h_get_ppp and h_pic @@ -193,7 +196,7 @@ static void parse_ppp_data(struct seq_file *m) struct hvcall_ppp_data ppp_data; struct device_node *root; const __be32 *perf_level; - int rc; + long rc; rc = h_get_ppp(&ppp_data); if (rc) @@ -215,9 +218,15 @@ static void parse_ppp_data(struct seq_file *m) seq_printf(m, "pool_capacity=%d\n", ppp_data.active_procs_in_pool * 100); - h_pic(&pool_idle_time, &pool_procs); - seq_printf(m, "pool_idle_time=%ld\n", pool_idle_time); - seq_printf(m, "pool_num_procs=%ld\n", pool_procs); + /* In case h_pic call is not successful, this would result in + * APP values being wrong in tools like lparstat. + */ + + if (h_pic(&pool_idle_time, &pool_procs) == H_SUCCESS) { + seq_printf(m, "pool_idle_time=%ld\n", pool_idle_time); + seq_printf(m, "pool_num_procs=%ld\n", pool_procs); + seq_printf(m, "boot_pool_idle_time=%ld\n", boot_pool_idle_time); + } } seq_printf(m, "unallocated_capacity_weight=%d\n", @@ -361,8 +370,8 @@ static int read_dt_lpar_name(struct seq_file *m) static void read_lpar_name(struct seq_file *m) { - if (read_rtas_lpar_name(m) && read_dt_lpar_name(m)) - pr_err_once("Error can't get the LPAR name"); + if (read_rtas_lpar_name(m)) + read_dt_lpar_name(m); } #define SPLPAR_MAXLENGTH 1026*(sizeof(char)) @@ -520,7 +529,7 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v) lrdrp = of_get_property(rtas_node, "ibm,lrdr-capacity", NULL); if (lrdrp == NULL) { - partition_potential_processors = vdso_data->processorCount; + partition_potential_processors = num_possible_cpus(); } else { partition_potential_processors = be32_to_cpup(lrdrp + 4); } @@ -543,7 +552,7 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v) } else { /* non SPLPAR case */ seq_printf(m, "system_active_processors=%d\n", - partition_potential_processors); + partition_active_processors); seq_printf(m, "system_potential_processors=%d\n", partition_potential_processors); @@ -792,6 +801,7 @@ static const struct proc_ops lparcfg_proc_ops = { static int __init lparcfg_init(void) { umode_t mode = 0444; + long retval; /* Allow writing if we have FW_FEATURE_SPLPAR */ if (firmware_has_feature(FW_FEATURE_SPLPAR)) @@ -801,6 +811,16 @@ static int __init lparcfg_init(void) printk(KERN_ERR "Failed to create powerpc/lparcfg\n"); return -EIO; } + + /* If this call fails, it would result in APP values + * being wrong for since boot reports of lparstat + */ + retval = h_pic(&boot_pool_idle_time, NULL); + + if (retval != H_SUCCESS) + pr_debug("H_PIC failed during lparcfg init retval: %ld\n", + retval); + return 0; } machine_device_initcall(pseries, lparcfg_init); diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c index 1798f0f14d58..62bd8e2d5d4c 100644 --- a/arch/powerpc/platforms/pseries/mobility.c +++ b/arch/powerpc/platforms/pseries/mobility.c @@ -53,7 +53,7 @@ struct update_props_workarea { static unsigned int nmi_wd_lpm_factor = 200; #ifdef CONFIG_SYSCTL -static struct ctl_table nmi_wd_lpm_factor_ctl_table[] = { +static const struct ctl_table 
nmi_wd_lpm_factor_ctl_table[] = { { .procname = "nmi_wd_lpm_factor", .data = &nmi_wd_lpm_factor, diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c index 6dfb55b52d36..ee1c8c6898a3 100644 --- a/arch/powerpc/platforms/pseries/msi.c +++ b/arch/powerpc/platforms/pseries/msi.c @@ -9,6 +9,7 @@ #include <linux/irq.h> #include <linux/irqdomain.h> #include <linux/msi.h> +#include <linux/seq_file.h> #include <asm/rtas.h> #include <asm/hw_irq.h> @@ -524,7 +525,12 @@ static struct msi_domain_info pseries_msi_domain_info = { static void pseries_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) { - __pci_read_msi_msg(irq_data_get_msi_desc(data), msg); + struct pci_dev *dev = msi_desc_to_pci_dev(irq_data_get_msi_desc(data)); + + if (dev->current_state == PCI_D0) + __pci_read_msi_msg(irq_data_get_msi_desc(data), msg); + else + get_cached_msi_msg(data->irq, msg); } static struct irq_chip pseries_msi_irq_chip = { @@ -610,7 +616,7 @@ static const struct irq_domain_ops pseries_irq_domain_ops = { static int __pseries_msi_allocate_domains(struct pci_controller *phb, unsigned int count) { - struct irq_domain *parent = irq_get_default_host(); + struct irq_domain *parent = irq_get_default_domain(); phb->fwnode = irq_domain_alloc_named_id_fwnode("pSeries-MSI", phb->global_number); @@ -627,7 +633,7 @@ static int __pseries_msi_allocate_domains(struct pci_controller *phb, return -ENOMEM; } - phb->msi_domain = pci_msi_create_irq_domain(of_node_to_fwnode(phb->dn), + phb->msi_domain = pci_msi_create_irq_domain(of_fwnode_handle(phb->dn), &pseries_msi_domain_info, phb->dev_domain); if (!phb->msi_domain) { diff --git a/arch/powerpc/platforms/pseries/papr-indices.c b/arch/powerpc/platforms/pseries/papr-indices.c new file mode 100644 index 000000000000..3c7545591c45 --- /dev/null +++ b/arch/powerpc/platforms/pseries/papr-indices.c @@ -0,0 +1,488 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#define pr_fmt(fmt) "papr-indices: " fmt + +#include <linux/build_bug.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/lockdep.h> +#include <linux/kernel.h> +#include <linux/miscdevice.h> +#include <linux/signal.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/string_helpers.h> +#include <linux/uaccess.h> +#include <asm/machdep.h> +#include <asm/rtas-work-area.h> +#include <asm/rtas.h> +#include <uapi/asm/papr-indices.h> +#include "papr-rtas-common.h" + +/* + * Function-specific return values for ibm,set-dynamic-indicator and + * ibm,get-dynamic-sensor-state RTAS calls. + * PAPR+ v2.13 7.3.18 and 7.3.19. + */ +#define RTAS_IBM_DYNAMIC_INDICE_NO_INDICATOR -3 + +/** + * struct rtas_get_indices_params - Parameters (in and out) for + * ibm,get-indices. + * @is_sensor: In: Caller-provided whether sensor or indicator. + * @indice_type:In: Caller-provided indice (sensor or indicator) token + * @work_area: In: Caller-provided work area buffer for results. + * @next: In: Sequence number. Out: Next sequence number. + * @status: Out: RTAS call status. + */ +struct rtas_get_indices_params { + u8 is_sensor; + u32 indice_type; + struct rtas_work_area *work_area; + u32 next; + s32 status; +}; + +/* + * rtas_ibm_get_indices() - Call ibm,get-indices to fill a work area buffer. + * @params: See &struct rtas_ibm_get_indices_params. + * + * Calls ibm,get-indices until it errors or successfully deposits data + * into the supplied work area. Handles RTAS retry statuses. Maps RTAS + * error statuses to reasonable errno values. 
+ * + * The caller is expected to invoke rtas_ibm_get_indices() multiple times + * to retrieve all indices data for the provided indice type. Only one + * sequence should be in progress at any time; starting a new sequence + * will disrupt any sequence already in progress. Serialization of + * indices retrieval sequences is the responsibility of the caller. + * + * The caller should inspect @params.status to determine whether more + * calls are needed to complete the sequence. + * + * Context: May sleep. + * Return: -ve on error, 0 otherwise. + */ +static int rtas_ibm_get_indices(struct rtas_get_indices_params *params) +{ + struct rtas_work_area *work_area = params->work_area; + const s32 token = rtas_function_token(RTAS_FN_IBM_GET_INDICES); + u32 rets; + s32 fwrc; + int ret; + + if (token == RTAS_UNKNOWN_SERVICE) + return -ENOENT; + + lockdep_assert_held(&rtas_ibm_get_indices_lock); + + do { + fwrc = rtas_call(token, 5, 2, &rets, params->is_sensor, + params->indice_type, + rtas_work_area_phys(work_area), + rtas_work_area_size(work_area), + params->next); + } while (rtas_busy_delay(fwrc)); + + switch (fwrc) { + case RTAS_HARDWARE_ERROR: + ret = -EIO; + break; + case RTAS_INVALID_PARAMETER: /* Indicator type is not supported */ + ret = -EINVAL; + break; + case RTAS_SEQ_START_OVER: + ret = -EAGAIN; + pr_info_ratelimited("Indices changed during retrieval, retrying\n"); + params->next = 1; + break; + case RTAS_SEQ_MORE_DATA: + params->next = rets; + ret = 0; + break; + case RTAS_SEQ_COMPLETE: + params->next = 0; + ret = 0; + break; + default: + ret = -EIO; + pr_err_ratelimited("unexpected ibm,get-indices status %d\n", fwrc); + break; + } + + params->status = fwrc; + return ret; +} + +/* + * Internal indices sequence APIs. A sequence is a series of calls to + * ibm,get-indices for a given location code. The sequence ends when + * an error is encountered or all indices for the input has been + * returned. + */ + +/* + * indices_sequence_begin() - Begin a indices retrieval sequence. + * + * Context: May sleep. + */ +static void indices_sequence_begin(struct papr_rtas_sequence *seq) +{ + struct rtas_get_indices_params *param; + + param = (struct rtas_get_indices_params *)seq->params; + /* + * We could allocate the work area before acquiring the + * function lock, but that would allow concurrent requests to + * exhaust the limited work area pool for no benefit. So + * allocate the work area under the lock. + */ + mutex_lock(&rtas_ibm_get_indices_lock); + param->work_area = rtas_work_area_alloc(RTAS_GET_INDICES_BUF_SIZE); + param->next = 1; + param->status = 0; +} + +/* + * indices_sequence_end() - Finalize a indices retrieval sequence. + * + * Releases resources obtained by indices_sequence_begin(). + */ +static void indices_sequence_end(struct papr_rtas_sequence *seq) +{ + struct rtas_get_indices_params *param; + + param = (struct rtas_get_indices_params *)seq->params; + rtas_work_area_free(param->work_area); + mutex_unlock(&rtas_ibm_get_indices_lock); +} + +/* + * Work function to be passed to papr_rtas_blob_generate(). + * + * ibm,get-indices RTAS call fills the work area with the certain + * format but does not return the bytes written in the buffer. So + * instead of kernel parsing this work area to determine the buffer + * length, copy the complete work area (RTAS_GET_INDICES_BUF_SIZE) + * to the blob and let the user space to obtain the data. + * Means RTAS_GET_INDICES_BUF_SIZE data will be returned for each + * read(). 
+ */ + +static const char *indices_sequence_fill_work_area(struct papr_rtas_sequence *seq, + size_t *len) +{ + struct rtas_get_indices_params *p; + bool init_state; + + p = (struct rtas_get_indices_params *)seq->params; + init_state = (p->next == 1) ? true : false; + + if (papr_rtas_sequence_should_stop(seq, p->status, init_state)) + return NULL; + if (papr_rtas_sequence_set_err(seq, rtas_ibm_get_indices(p))) + return NULL; + + *len = RTAS_GET_INDICES_BUF_SIZE; + return rtas_work_area_raw_buf(p->work_area); +} + +/* + * papr_indices_handle_read - returns indices blob data to the user space + * + * ibm,get-indices RTAS call fills the work area with the certian + * format but does not return the bytes written in the buffer and + * copied RTAS_GET_INDICES_BUF_SIZE data to the blob for each RTAS + * call. So send RTAS_GET_INDICES_BUF_SIZE buffer to the user space + * for each read(). + */ +static ssize_t papr_indices_handle_read(struct file *file, + char __user *buf, size_t size, loff_t *off) +{ + const struct papr_rtas_blob *blob = file->private_data; + + /* we should not instantiate a handle without any data attached. */ + if (!papr_rtas_blob_has_data(blob)) { + pr_err_once("handle without data\n"); + return -EIO; + } + + if (size < RTAS_GET_INDICES_BUF_SIZE) { + pr_err_once("Invalid buffer length %ld, expect %d\n", + size, RTAS_GET_INDICES_BUF_SIZE); + return -EINVAL; + } else if (size > RTAS_GET_INDICES_BUF_SIZE) + size = RTAS_GET_INDICES_BUF_SIZE; + + return simple_read_from_buffer(buf, size, off, blob->data, blob->len); +} + +static const struct file_operations papr_indices_handle_ops = { + .read = papr_indices_handle_read, + .llseek = papr_rtas_common_handle_seek, + .release = papr_rtas_common_handle_release, +}; + +/* + * papr_indices_create_handle() - Create a fd-based handle for reading + * indices data + * @ubuf: Input parameters to RTAS call such as whether sensor or indicator + * and indice type in user memory + * + * Handler for PAPR_INDICES_IOC_GET ioctl command. Validates @ubuf + * and instantiates an immutable indices "blob" for it. The blob is + * attached to a file descriptor for reading by user space. The memory + * backing the blob is freed when the file is released. + * + * The entire requested indices is retrieved by this call and all + * necessary RTAS interactions are performed before returning the fd + * to user space. This keeps the read handler simple and ensures that + * the kernel can prevent interleaving of ibm,get-indices call sequences. + * + * Return: The installed fd number if successful, -ve errno otherwise. + */ +static long papr_indices_create_handle(struct papr_indices_io_block __user *ubuf) +{ + struct papr_rtas_sequence seq = {}; + struct rtas_get_indices_params params = {}; + int fd; + + if (get_user(params.is_sensor, &ubuf->indices.is_sensor)) + return -EFAULT; + + if (get_user(params.indice_type, &ubuf->indices.indice_type)) + return -EFAULT; + + seq = (struct papr_rtas_sequence) { + .begin = indices_sequence_begin, + .end = indices_sequence_end, + .work = indices_sequence_fill_work_area, + }; + + seq.params = ¶ms; + fd = papr_rtas_setup_file_interface(&seq, + &papr_indices_handle_ops, "[papr-indices]"); + + return fd; +} + +/* + * Create work area with the input parameters. This function is used + * for both ibm,set-dynamic-indicator and ibm,get-dynamic-sensor-state + * RTAS Calls. 
+ */ +static struct rtas_work_area * +papr_dynamic_indice_buf_from_user(struct papr_indices_io_block __user *ubuf, + struct papr_indices_io_block *kbuf) +{ + struct rtas_work_area *work_area; + u32 length; + __be32 len_be; + + if (copy_from_user(kbuf, ubuf, sizeof(*kbuf))) + return ERR_PTR(-EFAULT); + + + if (!string_is_terminated(kbuf->dynamic_param.location_code_str, + ARRAY_SIZE(kbuf->dynamic_param.location_code_str))) + return ERR_PTR(-EINVAL); + + /* + * The input data in the work area should be as follows: + * - 32-bit integer length of the location code string, + * including NULL. + * - Location code string, NULL terminated, identifying the + * token (sensor or indicator). + * PAPR 2.13 - R1–7.3.18–5 ibm,set-dynamic-indicator + * - R1–7.3.19–5 ibm,get-dynamic-sensor-state + */ + /* + * Length that user space passed should also include NULL + * terminator. + */ + length = strlen(kbuf->dynamic_param.location_code_str) + 1; + if (length > LOC_CODE_SIZE) + return ERR_PTR(-EINVAL); + + len_be = cpu_to_be32(length); + + work_area = rtas_work_area_alloc(LOC_CODE_SIZE + sizeof(u32)); + memcpy(rtas_work_area_raw_buf(work_area), &len_be, sizeof(u32)); + memcpy((rtas_work_area_raw_buf(work_area) + sizeof(u32)), + &kbuf->dynamic_param.location_code_str, length); + + return work_area; +} + +/** + * papr_dynamic_indicator_ioc_set - ibm,set-dynamic-indicator RTAS Call + * PAPR 2.13 7.3.18 + * + * @ubuf: Input parameters to RTAS call such as indicator token and + * new state. + * + * Returns success or -errno. + */ +static long papr_dynamic_indicator_ioc_set(struct papr_indices_io_block __user *ubuf) +{ + struct papr_indices_io_block kbuf; + struct rtas_work_area *work_area; + s32 fwrc, token, ret; + + token = rtas_function_token(RTAS_FN_IBM_SET_DYNAMIC_INDICATOR); + if (token == RTAS_UNKNOWN_SERVICE) + return -ENOENT; + + mutex_lock(&rtas_ibm_set_dynamic_indicator_lock); + work_area = papr_dynamic_indice_buf_from_user(ubuf, &kbuf); + if (IS_ERR(work_area)) { + ret = PTR_ERR(work_area); + goto out; + } + + do { + fwrc = rtas_call(token, 3, 1, NULL, + kbuf.dynamic_param.token, + kbuf.dynamic_param.state, + rtas_work_area_phys(work_area)); + } while (rtas_busy_delay(fwrc)); + + rtas_work_area_free(work_area); + + switch (fwrc) { + case RTAS_SUCCESS: + ret = 0; + break; + case RTAS_IBM_DYNAMIC_INDICE_NO_INDICATOR: /* No such indicator */ + ret = -EOPNOTSUPP; + break; + default: + pr_err("unexpected ibm,set-dynamic-indicator result %d\n", + fwrc); + fallthrough; + case RTAS_HARDWARE_ERROR: /* Hardware/platform error */ + ret = -EIO; + break; + } + +out: + mutex_unlock(&rtas_ibm_set_dynamic_indicator_lock); + return ret; +} + +/** + * papr_dynamic_sensor_ioc_get - ibm,get-dynamic-sensor-state RTAS Call + * PAPR 2.13 7.3.19 + * + * @ubuf: Input parameters to RTAS call such as sensor token + * Copies the state in user space buffer. + * + * + * Returns success or -errno. 
+ */ + +static long papr_dynamic_sensor_ioc_get(struct papr_indices_io_block __user *ubuf) +{ + struct papr_indices_io_block kbuf; + struct rtas_work_area *work_area; + s32 fwrc, token, ret; + u32 rets; + + token = rtas_function_token(RTAS_FN_IBM_GET_DYNAMIC_SENSOR_STATE); + if (token == RTAS_UNKNOWN_SERVICE) + return -ENOENT; + + mutex_lock(&rtas_ibm_get_dynamic_sensor_state_lock); + work_area = papr_dynamic_indice_buf_from_user(ubuf, &kbuf); + if (IS_ERR(work_area)) { + ret = PTR_ERR(work_area); + goto out; + } + + do { + fwrc = rtas_call(token, 2, 2, &rets, + kbuf.dynamic_param.token, + rtas_work_area_phys(work_area)); + } while (rtas_busy_delay(fwrc)); + + rtas_work_area_free(work_area); + + switch (fwrc) { + case RTAS_SUCCESS: + if (put_user(rets, &ubuf->dynamic_param.state)) + ret = -EFAULT; + else + ret = 0; + break; + case RTAS_IBM_DYNAMIC_INDICE_NO_INDICATOR: /* No such indicator */ + ret = -EOPNOTSUPP; + break; + default: + pr_err("unexpected ibm,get-dynamic-sensor result %d\n", + fwrc); + fallthrough; + case RTAS_HARDWARE_ERROR: /* Hardware/platform error */ + ret = -EIO; + break; + } + +out: + mutex_unlock(&rtas_ibm_get_dynamic_sensor_state_lock); + return ret; +} + +/* + * Top-level ioctl handler for /dev/papr-indices. + */ +static long papr_indices_dev_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) +{ + void __user *argp = (__force void __user *)arg; + long ret; + + switch (ioctl) { + case PAPR_INDICES_IOC_GET: + ret = papr_indices_create_handle(argp); + break; + case PAPR_DYNAMIC_SENSOR_IOC_GET: + ret = papr_dynamic_sensor_ioc_get(argp); + break; + case PAPR_DYNAMIC_INDICATOR_IOC_SET: + if (filp->f_mode & FMODE_WRITE) + ret = papr_dynamic_indicator_ioc_set(argp); + else + ret = -EBADF; + break; + default: + ret = -ENOIOCTLCMD; + break; + } + + return ret; +} + +static const struct file_operations papr_indices_ops = { + .unlocked_ioctl = papr_indices_dev_ioctl, +}; + +static struct miscdevice papr_indices_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "papr-indices", + .fops = &papr_indices_ops, +}; + +static __init int papr_indices_init(void) +{ + if (!rtas_function_implemented(RTAS_FN_IBM_GET_INDICES)) + return -ENODEV; + + if (!rtas_function_implemented(RTAS_FN_IBM_SET_DYNAMIC_INDICATOR)) + return -ENODEV; + + if (!rtas_function_implemented(RTAS_FN_IBM_GET_DYNAMIC_SENSOR_STATE)) + return -ENODEV; + + return misc_register(&papr_indices_dev); +} +machine_device_initcall(pseries, papr_indices_init); diff --git a/arch/powerpc/platforms/pseries/papr-phy-attest.c b/arch/powerpc/platforms/pseries/papr-phy-attest.c new file mode 100644 index 000000000000..1907f2411567 --- /dev/null +++ b/arch/powerpc/platforms/pseries/papr-phy-attest.c @@ -0,0 +1,288 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#define pr_fmt(fmt) "papr-phy-attest: " fmt + +#include <linux/build_bug.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/lockdep.h> +#include <linux/kernel.h> +#include <linux/miscdevice.h> +#include <linux/signal.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/string_helpers.h> +#include <linux/uaccess.h> +#include <asm/machdep.h> +#include <asm/rtas-work-area.h> +#include <asm/rtas.h> +#include <uapi/asm/papr-physical-attestation.h> +#include "papr-rtas-common.h" + +/** + * struct rtas_phy_attest_params - Parameters (in and out) for + * ibm,physical-attestation. + * + * @cmd: In: Caller-provided attestation command buffer. Must be + * RTAS-addressable. 
+ * @work_area: In: Caller-provided work area buffer for attestation + * command structure + * Out: Caller-provided work area buffer for the response + * @cmd_len: In: Caller-provided attestation command structure + * length + * @sequence: In: Sequence number. Out: Next sequence number. + * @written: Out: Bytes written by ibm,physical-attestation to + * @work_area. + * @status: Out: RTAS call status. + */ +struct rtas_phy_attest_params { + struct papr_phy_attest_io_block cmd; + struct rtas_work_area *work_area; + u32 cmd_len; + u32 sequence; + u32 written; + s32 status; +}; + +/** + * rtas_physical_attestation() - Call ibm,physical-attestation to + * fill a work area buffer. + * @params: See &struct rtas_phy_attest_params. + * + * Calls ibm,physical-attestation until it errors or successfully + * deposits data into the supplied work area. Handles RTAS retry + * statuses. Maps RTAS error statuses to reasonable errno values. + * + * The caller is expected to invoke rtas_physical_attestation() + * multiple times to retrieve all the data for the provided + * attestation command. Only one sequence should be in progress at + * any time; starting a new sequence will disrupt any sequence + * already in progress. Serialization of attestation retrieval + * sequences is the responsibility of the caller. + * + * The caller should inspect @params.status to determine whether more + * calls are needed to complete the sequence. + * + * Context: May sleep. + * Return: -ve on error, 0 otherwise. + */ +static int rtas_physical_attestation(struct rtas_phy_attest_params *params) +{ + struct rtas_work_area *work_area; + s32 fwrc, token; + u32 rets[2]; + int ret; + + work_area = params->work_area; + token = rtas_function_token(RTAS_FN_IBM_PHYSICAL_ATTESTATION); + if (token == RTAS_UNKNOWN_SERVICE) + return -ENOENT; + + lockdep_assert_held(&rtas_ibm_physical_attestation_lock); + + do { + fwrc = rtas_call(token, 3, 3, rets, + rtas_work_area_phys(work_area), + params->cmd_len, + params->sequence); + } while (rtas_busy_delay(fwrc)); + + switch (fwrc) { + case RTAS_HARDWARE_ERROR: + ret = -EIO; + break; + case RTAS_INVALID_PARAMETER: + ret = -EINVAL; + break; + case RTAS_SEQ_MORE_DATA: + params->sequence = rets[0]; + fallthrough; + case RTAS_SEQ_COMPLETE: + params->written = rets[1]; + /* + * Kernel or firmware bug, do not continue. + */ + if (WARN(params->written > rtas_work_area_size(work_area), + "possible write beyond end of work area")) + ret = -EFAULT; + else + ret = 0; + break; + default: + ret = -EIO; + pr_err_ratelimited("unexpected ibm,get-phy_attest status %d\n", fwrc); + break; + } + + params->status = fwrc; + return ret; +} + +/* + * Internal physical-attestation sequence APIs. A physical-attestation + * sequence is a series of calls to get ibm,physical-attestation + * for a given attestation command. The sequence ends when an error + * is encountered or all data for the attestation command has been + * returned. + */ + +/** + * phy_attest_sequence_begin() - Begin a response data for attestation + * command retrieval sequence. + * @seq: user specified parameters for RTAS call from seq struct. + * + * Context: May sleep. + */ +static void phy_attest_sequence_begin(struct papr_rtas_sequence *seq) +{ + struct rtas_phy_attest_params *param; + + /* + * We could allocate the work area before acquiring the + * function lock, but that would allow concurrent requests to + * exhaust the limited work area pool for no benefit. So + * allocate the work area under the lock. 
+ */ + mutex_lock(&rtas_ibm_physical_attestation_lock); + param = (struct rtas_phy_attest_params *)seq->params; + param->work_area = rtas_work_area_alloc(SZ_4K); + memcpy(rtas_work_area_raw_buf(param->work_area), ¶m->cmd, + param->cmd_len); + param->sequence = 1; + param->status = 0; +} + +/** + * phy_attest_sequence_end() - Finalize a attestation command + * response retrieval sequence. + * @seq: Sequence state. + * + * Releases resources obtained by phy_attest_sequence_begin(). + */ +static void phy_attest_sequence_end(struct papr_rtas_sequence *seq) +{ + struct rtas_phy_attest_params *param; + + param = (struct rtas_phy_attest_params *)seq->params; + rtas_work_area_free(param->work_area); + mutex_unlock(&rtas_ibm_physical_attestation_lock); + kfree(param); +} + +/* + * Generator function to be passed to papr_rtas_blob_generate(). + */ +static const char *phy_attest_sequence_fill_work_area(struct papr_rtas_sequence *seq, + size_t *len) +{ + struct rtas_phy_attest_params *p; + bool init_state; + + p = (struct rtas_phy_attest_params *)seq->params; + init_state = (p->written == 0) ? true : false; + + if (papr_rtas_sequence_should_stop(seq, p->status, init_state)) + return NULL; + if (papr_rtas_sequence_set_err(seq, rtas_physical_attestation(p))) + return NULL; + *len = p->written; + return rtas_work_area_raw_buf(p->work_area); +} + +static const struct file_operations papr_phy_attest_handle_ops = { + .read = papr_rtas_common_handle_read, + .llseek = papr_rtas_common_handle_seek, + .release = papr_rtas_common_handle_release, +}; + +/** + * papr_phy_attest_create_handle() - Create a fd-based handle for + * reading the response for the given attestation command. + * @ulc: Attestation command in user memory; defines the scope of + * data for the attestation command to retrieve. + * + * Handler for PAPR_PHYSICAL_ATTESTATION_IOC_CREATE_HANDLE ioctl + * command. Validates @ulc and instantiates an immutable response + * "blob" for attestation command. The blob is attached to a file + * descriptor for reading by user space. The memory backing the blob + * is freed when the file is released. + * + * The entire requested response buffer for the attestation command + * retrieved by this call and all necessary RTAS interactions are + * performed before returning the fd to user space. This keeps the + * read handler simple and ensures that kernel can prevent + * interleaving ibm,physical-attestation call sequences. + * + * Return: The installed fd number if successful, -ve errno otherwise. + */ +static long papr_phy_attest_create_handle(struct papr_phy_attest_io_block __user *ulc) +{ + struct rtas_phy_attest_params *params; + struct papr_rtas_sequence seq = {}; + int fd; + + /* + * Freed in phy_attest_sequence_end(). + */ + params = kzalloc(sizeof(*params), GFP_KERNEL_ACCOUNT); + if (!params) + return -ENOMEM; + + if (copy_from_user(¶ms->cmd, ulc, + sizeof(struct papr_phy_attest_io_block))) + return -EFAULT; + + params->cmd_len = be32_to_cpu(params->cmd.length); + seq = (struct papr_rtas_sequence) { + .begin = phy_attest_sequence_begin, + .end = phy_attest_sequence_end, + .work = phy_attest_sequence_fill_work_area, + }; + + seq.params = (void *)params; + + fd = papr_rtas_setup_file_interface(&seq, + &papr_phy_attest_handle_ops, + "[papr-physical-attestation]"); + + return fd; +} + +/* + * Top-level ioctl handler for /dev/papr-physical-attestation. 
+ */ +static long papr_phy_attest_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) +{ + void __user *argp = (__force void __user *)arg; + long ret; + + switch (ioctl) { + case PAPR_PHY_ATTEST_IOC_HANDLE: + ret = papr_phy_attest_create_handle(argp); + break; + default: + ret = -ENOIOCTLCMD; + break; + } + return ret; +} + +static const struct file_operations papr_phy_attest_ops = { + .unlocked_ioctl = papr_phy_attest_dev_ioctl, +}; + +static struct miscdevice papr_phy_attest_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "papr-physical-attestation", + .fops = &papr_phy_attest_ops, +}; + +static __init int papr_phy_attest_init(void) +{ + if (!rtas_function_implemented(RTAS_FN_IBM_PHYSICAL_ATTESTATION)) + return -ENODEV; + + return misc_register(&papr_phy_attest_dev); +} +machine_device_initcall(pseries, papr_phy_attest_init); diff --git a/arch/powerpc/platforms/pseries/papr-platform-dump.c b/arch/powerpc/platforms/pseries/papr-platform-dump.c new file mode 100644 index 000000000000..f8d55eccdb6b --- /dev/null +++ b/arch/powerpc/platforms/pseries/papr-platform-dump.c @@ -0,0 +1,411 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#define pr_fmt(fmt) "papr-platform-dump: " fmt + +#include <linux/anon_inodes.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/miscdevice.h> +#include <asm/machdep.h> +#include <asm/rtas-work-area.h> +#include <asm/rtas.h> +#include <uapi/asm/papr-platform-dump.h> + +/* + * Function-specific return values for ibm,platform-dump, derived from + * PAPR+ v2.13 7.3.3.4.1 "ibm,platform-dump RTAS Call". + */ +#define RTAS_IBM_PLATFORM_DUMP_COMPLETE 0 /* Complete dump retrieved. */ +#define RTAS_IBM_PLATFORM_DUMP_CONTINUE 1 /* Continue dump */ +#define RTAS_NOT_AUTHORIZED -9002 /* Not Authorized */ + +#define RTAS_IBM_PLATFORM_DUMP_START 2 /* Linux status to start dump */ + +/** + * struct ibm_platform_dump_params - Parameters (in and out) for + * ibm,platform-dump + * @work_area: In: work area buffer for results. + * @buf_length: In: work area buffer length in bytes + * @dump_tag_hi: In: Most-significant 32 bits of a Dump_Tag representing + * an id of the dump being processed. + * @dump_tag_lo: In: Least-significant 32 bits of a Dump_Tag representing + * an id of the dump being processed. + * @sequence_hi: In: Sequence number in most-significant 32 bits. + * Out: Next sequence number in most-significant 32 bits. + * @sequence_lo: In: Sequence number in Least-significant 32 bits + * Out: Next sequence number in Least-significant 32 bits. + * @bytes_ret_hi: Out: Bytes written in most-significant 32 bits. + * @bytes_ret_lo: Out: Bytes written in Least-significant 32 bits. + * @status: Out: RTAS call status. + * @list: Maintain the list of dumps are in progress. Can + * retrieve multiple dumps with different dump IDs at + * the same time but not with the same dump ID. This list + * is used to determine whether the dump for the same ID + * is in progress. + */ +struct ibm_platform_dump_params { + struct rtas_work_area *work_area; + u32 buf_length; + u32 dump_tag_hi; + u32 dump_tag_lo; + u32 sequence_hi; + u32 sequence_lo; + u32 bytes_ret_hi; + u32 bytes_ret_lo; + s32 status; + struct list_head list; +}; + +/* + * Multiple dumps with different dump IDs can be retrieved at the same + * time, but not with dame dump ID. platform_dump_list_mutex and + * platform_dump_list are used to prevent this behavior. 
+ */ +static DEFINE_MUTEX(platform_dump_list_mutex); +static LIST_HEAD(platform_dump_list); + +/** + * rtas_ibm_platform_dump() - Call ibm,platform-dump to fill a work area + * buffer. + * @params: See &struct ibm_platform_dump_params. + * @buf_addr: Address of dump buffer (work_area) + * @buf_length: Length of the buffer in bytes (min. 1024) + * + * Calls ibm,platform-dump until it errors or successfully deposits data + * into the supplied work area. Handles RTAS retry statuses. Maps RTAS + * error statuses to reasonable errno values. + * + * Can request multiple dumps with different dump IDs at the same time, + * but not with the same dump ID which is prevented with the check in + * the ioctl code (papr_platform_dump_create_handle()). + * + * The caller should inspect @params.status to determine whether more + * calls are needed to complete the sequence. + * + * Context: May sleep. + * Return: -ve on error, 0 for dump complete and 1 for continue dump + */ +static int rtas_ibm_platform_dump(struct ibm_platform_dump_params *params, + phys_addr_t buf_addr, u32 buf_length) +{ + u32 rets[4]; + s32 fwrc; + int ret = 0; + + do { + fwrc = rtas_call(rtas_function_token(RTAS_FN_IBM_PLATFORM_DUMP), + 6, 5, + rets, + params->dump_tag_hi, + params->dump_tag_lo, + params->sequence_hi, + params->sequence_lo, + buf_addr, + buf_length); + } while (rtas_busy_delay(fwrc)); + + switch (fwrc) { + case RTAS_HARDWARE_ERROR: + ret = -EIO; + break; + case RTAS_NOT_AUTHORIZED: + ret = -EPERM; + break; + case RTAS_IBM_PLATFORM_DUMP_CONTINUE: + case RTAS_IBM_PLATFORM_DUMP_COMPLETE: + params->sequence_hi = rets[0]; + params->sequence_lo = rets[1]; + params->bytes_ret_hi = rets[2]; + params->bytes_ret_lo = rets[3]; + break; + default: + ret = -EIO; + pr_err_ratelimited("unexpected ibm,platform-dump status %d\n", + fwrc); + break; + } + + params->status = fwrc; + return ret; +} + +/* + * Platform dump is used with multiple RTAS calls to retrieve the + * complete dump for the provided dump ID. Once the complete dump is + * retrieved, the hypervisor returns dump complete status (0) for the + * last RTAS call and expects the caller issues one more call with + * NULL buffer to invalidate the dump so that the hypervisor can remove + * the dump. + * + * After the specific dump is invalidated in the hypervisor, expect the + * dump complete status for the new sequence - the user space initiates + * new request for the same dump ID. + */ +static ssize_t papr_platform_dump_handle_read(struct file *file, + char __user *buf, size_t size, loff_t *off) +{ + struct ibm_platform_dump_params *params = file->private_data; + u64 total_bytes; + s32 fwrc; + + /* + * Dump already completed with the previous read calls. + * In case if the user space issues further reads, returns + * -EINVAL. + */ + if (!params->buf_length) { + pr_warn_once("Platform dump completed for dump ID %llu\n", + (u64) (((u64)params->dump_tag_hi << 32) | + params->dump_tag_lo)); + return -EINVAL; + } + + /* + * The hypervisor returns status 0 if no more data available to + * download. The dump will be invalidated with ioctl (see below). + */ + if (params->status == RTAS_IBM_PLATFORM_DUMP_COMPLETE) { + params->buf_length = 0; + /* + * Returns 0 to the user space so that user + * space read stops. + */ + return 0; + } + + if (size < SZ_1K) { + pr_err_once("Buffer length should be minimum 1024 bytes\n"); + return -EINVAL; + } else if (size > params->buf_length) { + /* + * Allocate 4K work area. So if the user requests > 4K, + * resize the buffer length. 
+ */ + size = params->buf_length; + } + + fwrc = rtas_ibm_platform_dump(params, + rtas_work_area_phys(params->work_area), + size); + if (fwrc < 0) + return fwrc; + + total_bytes = (u64) (((u64)params->bytes_ret_hi << 32) | + params->bytes_ret_lo); + + /* + * Kernel or firmware bug, do not continue. + */ + if (WARN(total_bytes > size, "possible write beyond end of work area")) + return -EFAULT; + + if (copy_to_user(buf, rtas_work_area_raw_buf(params->work_area), + total_bytes)) + return -EFAULT; + + return total_bytes; +} + +static int papr_platform_dump_handle_release(struct inode *inode, + struct file *file) +{ + struct ibm_platform_dump_params *params = file->private_data; + + if (params->work_area) + rtas_work_area_free(params->work_area); + + mutex_lock(&platform_dump_list_mutex); + list_del(¶ms->list); + mutex_unlock(&platform_dump_list_mutex); + + kfree(params); + file->private_data = NULL; + return 0; +} + +/* + * This ioctl is used to invalidate the dump assuming the user space + * issue this ioctl after obtain the complete dump. + * Issue the last RTAS call with NULL buffer to invalidate the dump + * which means dump will be freed in the hypervisor. + */ +static long papr_platform_dump_invalidate_ioctl(struct file *file, + unsigned int ioctl, unsigned long arg) +{ + struct ibm_platform_dump_params *params; + u64 __user *argp = (void __user *)arg; + u64 param_dump_tag, dump_tag; + + if (ioctl != PAPR_PLATFORM_DUMP_IOC_INVALIDATE) + return -ENOIOCTLCMD; + + if (get_user(dump_tag, argp)) + return -EFAULT; + + /* + * private_data is freeded during release(), so should not + * happen. + */ + if (!file->private_data) { + pr_err("No valid FD to invalidate dump for the ID(%llu)\n", + dump_tag); + return -EINVAL; + } + + params = file->private_data; + param_dump_tag = (u64) (((u64)params->dump_tag_hi << 32) | + params->dump_tag_lo); + if (dump_tag != param_dump_tag) { + pr_err("Invalid dump ID(%llu) to invalidate dump\n", + dump_tag); + return -EINVAL; + } + + if (params->status != RTAS_IBM_PLATFORM_DUMP_COMPLETE) { + pr_err("Platform dump is not complete, but requested " + "to invalidate dump for ID(%llu)\n", + dump_tag); + return -EINPROGRESS; + } + + return rtas_ibm_platform_dump(params, 0, 0); +} + +static const struct file_operations papr_platform_dump_handle_ops = { + .read = papr_platform_dump_handle_read, + .release = papr_platform_dump_handle_release, + .unlocked_ioctl = papr_platform_dump_invalidate_ioctl, +}; + +/** + * papr_platform_dump_create_handle() - Create a fd-based handle for + * reading platform dump + * + * Handler for PAPR_PLATFORM_DUMP_IOC_CREATE_HANDLE ioctl command + * Allocates RTAS parameter struct and work area and attached to the + * file descriptor for reading by user space with the multiple RTAS + * calls until the dump is completed. This memory allocation is freed + * when the file is released. + * + * Multiple dump requests with different IDs are allowed at the same + * time, but not with the same dump ID. So if the user space is + * already opened file descriptor for the specific dump ID, return + * -EALREADY for the next request. + * + * @dump_tag: Dump ID for the dump requested to retrieve from the + * hypervisor + * + * Return: The installed fd number if successful, -ve errno otherwise. + */ +static long papr_platform_dump_create_handle(u64 dump_tag) +{ + struct ibm_platform_dump_params *params; + u64 param_dump_tag; + struct file *file; + long err; + int fd; + + /* + * Return failure if the user space is already opened FD for + * the specific dump ID. 
This check will prevent multiple dump + * requests for the same dump ID at the same time. Generally + * should not expect this, but in case. + */ + list_for_each_entry(params, &platform_dump_list, list) { + param_dump_tag = (u64) (((u64)params->dump_tag_hi << 32) | + params->dump_tag_lo); + if (dump_tag == param_dump_tag) { + pr_err("Platform dump for ID(%llu) is already in progress\n", + dump_tag); + return -EALREADY; + } + } + + params = kzalloc(sizeof(struct ibm_platform_dump_params), + GFP_KERNEL_ACCOUNT); + if (!params) + return -ENOMEM; + + params->work_area = rtas_work_area_alloc(SZ_4K); + params->buf_length = SZ_4K; + params->dump_tag_hi = (u32)(dump_tag >> 32); + params->dump_tag_lo = (u32)(dump_tag & 0x00000000ffffffffULL); + params->status = RTAS_IBM_PLATFORM_DUMP_START; + + fd = get_unused_fd_flags(O_RDONLY | O_CLOEXEC); + if (fd < 0) { + err = fd; + goto free_area; + } + + file = anon_inode_getfile_fmode("[papr-platform-dump]", + &papr_platform_dump_handle_ops, + (void *)params, O_RDONLY, + FMODE_LSEEK | FMODE_PREAD); + if (IS_ERR(file)) { + err = PTR_ERR(file); + goto put_fd; + } + + fd_install(fd, file); + + list_add(¶ms->list, &platform_dump_list); + + pr_info("%s (%d) initiated platform dump for dump tag %llu\n", + current->comm, current->pid, dump_tag); + return fd; +put_fd: + put_unused_fd(fd); +free_area: + rtas_work_area_free(params->work_area); + kfree(params); + return err; +} + +/* + * Top-level ioctl handler for /dev/papr-platform-dump. + */ +static long papr_platform_dump_dev_ioctl(struct file *filp, + unsigned int ioctl, + unsigned long arg) +{ + u64 __user *argp = (void __user *)arg; + u64 dump_tag; + long ret; + + if (get_user(dump_tag, argp)) + return -EFAULT; + + switch (ioctl) { + case PAPR_PLATFORM_DUMP_IOC_CREATE_HANDLE: + mutex_lock(&platform_dump_list_mutex); + ret = papr_platform_dump_create_handle(dump_tag); + mutex_unlock(&platform_dump_list_mutex); + break; + default: + ret = -ENOIOCTLCMD; + break; + } + return ret; +} + +static const struct file_operations papr_platform_dump_ops = { + .unlocked_ioctl = papr_platform_dump_dev_ioctl, +}; + +static struct miscdevice papr_platform_dump_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "papr-platform-dump", + .fops = &papr_platform_dump_ops, +}; + +static __init int papr_platform_dump_init(void) +{ + if (!rtas_function_implemented(RTAS_FN_IBM_PLATFORM_DUMP)) + return -ENODEV; + + return misc_register(&papr_platform_dump_dev); +} +machine_device_initcall(pseries, papr_platform_dump_init); diff --git a/arch/powerpc/platforms/pseries/papr-rtas-common.c b/arch/powerpc/platforms/pseries/papr-rtas-common.c new file mode 100644 index 000000000000..33c606e3378a --- /dev/null +++ b/arch/powerpc/platforms/pseries/papr-rtas-common.c @@ -0,0 +1,311 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#define pr_fmt(fmt) "papr-common: " fmt + +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/signal.h> +#include <linux/slab.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/anon_inodes.h> +#include <linux/sched/signal.h> +#include "papr-rtas-common.h" + +/* + * Sequence based RTAS HCALL has to issue multiple times to retrieve + * complete data from the hypervisor. For some of these RTAS calls, + * the OS should not interleave calls with different input until the + * sequence is completed. So data is collected for these calls during + * ioctl handle and export to user space with read() handle. 
+ * This file provides common functions needed for such sequence based + * RTAS calls Ex: ibm,get-vpd and ibm,get-indices. + */ + +bool papr_rtas_blob_has_data(const struct papr_rtas_blob *blob) +{ + return blob->data && blob->len; +} + +void papr_rtas_blob_free(const struct papr_rtas_blob *blob) +{ + if (blob) { + kvfree(blob->data); + kfree(blob); + } +} + +/** + * papr_rtas_blob_extend() - Append data to a &struct papr_rtas_blob. + * @blob: The blob to extend. + * @data: The new data to append to @blob. + * @len: The length of @data. + * + * Context: May sleep. + * Return: -ENOMEM on allocation failure, 0 otherwise. + */ +static int papr_rtas_blob_extend(struct papr_rtas_blob *blob, + const char *data, size_t len) +{ + const size_t new_len = blob->len + len; + const size_t old_len = blob->len; + const char *old_ptr = blob->data; + char *new_ptr; + + new_ptr = kvrealloc(old_ptr, new_len, GFP_KERNEL_ACCOUNT); + if (!new_ptr) + return -ENOMEM; + + memcpy(&new_ptr[old_len], data, len); + blob->data = new_ptr; + blob->len = new_len; + return 0; +} + +/** + * papr_rtas_blob_generate() - Construct a new &struct papr_rtas_blob. + * @seq: work function of the caller that is called to obtain + * data with the caller RTAS call. + * + * The @work callback is invoked until it returns NULL. @seq is + * passed to @work in its first argument on each call. When + * @work returns data, it should store the data length in its + * second argument. + * + * Context: May sleep. + * Return: A completely populated &struct papr_rtas_blob, or NULL on error. + */ +static const struct papr_rtas_blob * +papr_rtas_blob_generate(struct papr_rtas_sequence *seq) +{ + struct papr_rtas_blob *blob; + const char *buf; + size_t len; + int err = 0; + + blob = kzalloc(sizeof(*blob), GFP_KERNEL_ACCOUNT); + if (!blob) + return NULL; + + if (!seq->work) + return ERR_PTR(-EINVAL); + + + while (err == 0 && (buf = seq->work(seq, &len))) + err = papr_rtas_blob_extend(blob, buf, len); + + if (err != 0 || !papr_rtas_blob_has_data(blob)) + goto free_blob; + + return blob; +free_blob: + papr_rtas_blob_free(blob); + return NULL; +} + +int papr_rtas_sequence_set_err(struct papr_rtas_sequence *seq, int err) +{ + /* Preserve the first error recorded. */ + if (seq->error == 0) + seq->error = err; + + return seq->error; +} + +/* + * Higher-level retrieval code below. These functions use the + * papr_rtas_blob_* and sequence_* APIs defined above to create fd-based + * handles for consumption by user space. + */ + +/** + * papr_rtas_run_sequence() - Run a single retrieval sequence. + * @seq: Functions of the caller to complete the sequence + * + * Context: May sleep. Holds a mutex and an RTAS work area for its + * duration. Typically performs multiple sleepable slab + * allocations. + * + * Return: A populated &struct papr_rtas_blob on success. Encoded error + * pointer otherwise. + */ +static const struct papr_rtas_blob *papr_rtas_run_sequence(struct papr_rtas_sequence *seq) +{ + const struct papr_rtas_blob *blob; + + if (seq->begin) + seq->begin(seq); + + blob = papr_rtas_blob_generate(seq); + if (!blob) + papr_rtas_sequence_set_err(seq, -ENOMEM); + + if (seq->end) + seq->end(seq); + + + if (seq->error) { + papr_rtas_blob_free(blob); + return ERR_PTR(seq->error); + } + + return blob; +} + +/** + * papr_rtas_retrieve() - Return the data blob that is exposed to + * user space. + * @seq: RTAS call specific functions to be invoked until the + * sequence is completed. 
+ * + * Run sequences against @param until a blob is successfully + * instantiated, or a hard error is encountered, or a fatal signal is + * pending. + * + * Context: May sleep. + * Return: A fully populated data blob when successful. Encoded error + * pointer otherwise. + */ +const struct papr_rtas_blob *papr_rtas_retrieve(struct papr_rtas_sequence *seq) +{ + const struct papr_rtas_blob *blob; + + /* + * EAGAIN means the sequence returns error with a -4 (data + * changed and need to start the sequence) status from RTAS calls + * and we should attempt a new sequence. PAPR+ (v2.13 R1–7.3.20–5 + * - ibm,get-vpd, R1–7.3.17–6 - ibm,get-indices) indicates that + * this should be a transient condition, not something that + * happens continuously. But we'll stop trying on a fatal signal. + */ + do { + blob = papr_rtas_run_sequence(seq); + if (!IS_ERR(blob)) /* Success. */ + break; + if (PTR_ERR(blob) != -EAGAIN) /* Hard error. */ + break; + cond_resched(); + } while (!fatal_signal_pending(current)); + + return blob; +} + +/** + * papr_rtas_setup_file_interface - Complete the sequence and obtain + * the data and export to user space with fd-based handles. Then the + * user spave gets the data with read() handle. + * @seq: RTAS call specific functions to get the data. + * @fops: RTAS call specific file operations such as read(). + * @name: RTAS call specific char device node. + * + * Return: FD handle for consumption by user space + */ +long papr_rtas_setup_file_interface(struct papr_rtas_sequence *seq, + const struct file_operations *fops, + char *name) +{ + const struct papr_rtas_blob *blob; + struct file *file; + long ret; + int fd; + + blob = papr_rtas_retrieve(seq); + if (IS_ERR(blob)) + return PTR_ERR(blob); + + fd = get_unused_fd_flags(O_RDONLY | O_CLOEXEC); + if (fd < 0) { + ret = fd; + goto free_blob; + } + + file = anon_inode_getfile_fmode(name, fops, (void *)blob, + O_RDONLY, FMODE_LSEEK | FMODE_PREAD); + if (IS_ERR(file)) { + ret = PTR_ERR(file); + goto put_fd; + } + + fd_install(fd, file); + return fd; + +put_fd: + put_unused_fd(fd); +free_blob: + papr_rtas_blob_free(blob); + return ret; +} + +/* + * papr_rtas_sequence_should_stop() - Determine whether RTAS retrieval + * sequence should continue. + * + * Examines the sequence error state and outputs of the last call to + * the specific RTAS to determine whether the sequence in progress + * should continue or stop. + * + * Return: True if the sequence has encountered an error or if all data + * for this sequence has been retrieved. False otherwise. + */ +bool papr_rtas_sequence_should_stop(const struct papr_rtas_sequence *seq, + s32 status, bool init_state) +{ + bool done; + + if (seq->error) + return true; + + switch (status) { + case RTAS_SEQ_COMPLETE: + if (init_state) + done = false; /* Initial state. */ + else + done = true; /* All data consumed. */ + break; + case RTAS_SEQ_MORE_DATA: + done = false; /* More data available. */ + break; + default: + done = true; /* Error encountered. */ + break; + } + + return done; +} + +/* + * User space read to retrieve data for the corresponding RTAS call. + * papr_rtas_blob is filled with the data using the corresponding RTAS + * call sequence API. + */ +ssize_t papr_rtas_common_handle_read(struct file *file, + char __user *buf, size_t size, loff_t *off) +{ + const struct papr_rtas_blob *blob = file->private_data; + + /* We should not instantiate a handle without any data attached. 
*/ + if (!papr_rtas_blob_has_data(blob)) { + pr_err_once("handle without data\n"); + return -EIO; + } + + return simple_read_from_buffer(buf, size, off, blob->data, blob->len); +} + +int papr_rtas_common_handle_release(struct inode *inode, + struct file *file) +{ + const struct papr_rtas_blob *blob = file->private_data; + + papr_rtas_blob_free(blob); + + return 0; +} + +loff_t papr_rtas_common_handle_seek(struct file *file, loff_t off, + int whence) +{ + const struct papr_rtas_blob *blob = file->private_data; + + return fixed_size_llseek(file, off, whence, blob->len); +} diff --git a/arch/powerpc/platforms/pseries/papr-rtas-common.h b/arch/powerpc/platforms/pseries/papr-rtas-common.h new file mode 100644 index 000000000000..4ceabcaf4905 --- /dev/null +++ b/arch/powerpc/platforms/pseries/papr-rtas-common.h @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _ASM_POWERPC_PAPR_RTAS_COMMON_H +#define _ASM_POWERPC_PAPR_RTAS_COMMON_H + +#include <linux/types.h> + +/* + * Return codes for sequence-based RTAS calls. + * Not listed under PAPR+ v2.13 7.2.8: "Return Codes", but + * defined in the specific section of each RTAS call. + */ +#define RTAS_SEQ_COMPLETE 0 /* All data has been retrieved. */ +#define RTAS_SEQ_MORE_DATA 1 /* More data is available. */ +#define RTAS_SEQ_START_OVER -4 /* Data changed, restart call sequence. */ +/* + * Internal "blob" APIs for accumulating RTAS call results into + * an immutable buffer to be attached to a file descriptor. + */ +struct papr_rtas_blob { + const char *data; + size_t len; +}; + +/** + * struct papr_rtas_sequence - State for managing a sequence of RTAS calls. + * @error: Shall be zero as long as the sequence has not encountered an error, + * -ve errno otherwise. Use papr_rtas_sequence_set_err() to update. + * @params: Parameter block to pass to rtas_*() calls. + * @begin: Allocate the work area and initialize the parameter + * values passed to the RTAS call. + * @end: Free the allocated work area. + * @work: Obtain data with the RTAS call; invoked repeatedly until the + * sequence is completed.
+ * + */ +struct papr_rtas_sequence { + int error; + void *params; + void (*begin)(struct papr_rtas_sequence *seq); + void (*end)(struct papr_rtas_sequence *seq); + const char *(*work)(struct papr_rtas_sequence *seq, size_t *len); +}; + +extern bool papr_rtas_blob_has_data(const struct papr_rtas_blob *blob); +extern void papr_rtas_blob_free(const struct papr_rtas_blob *blob); +extern int papr_rtas_sequence_set_err(struct papr_rtas_sequence *seq, + int err); +extern const struct papr_rtas_blob *papr_rtas_retrieve(struct papr_rtas_sequence *seq); +extern long papr_rtas_setup_file_interface(struct papr_rtas_sequence *seq, + const struct file_operations *fops, char *name); +extern bool papr_rtas_sequence_should_stop(const struct papr_rtas_sequence *seq, + s32 status, bool init_state); +extern ssize_t papr_rtas_common_handle_read(struct file *file, + char __user *buf, size_t size, loff_t *off); +extern int papr_rtas_common_handle_release(struct inode *inode, + struct file *file); +extern loff_t papr_rtas_common_handle_seek(struct file *file, loff_t off, + int whence); +#endif /* _ASM_POWERPC_PAPR_RTAS_COMMON_H */ + diff --git a/arch/powerpc/platforms/pseries/papr-vpd.c b/arch/powerpc/platforms/pseries/papr-vpd.c index c29e85db5f35..f38c188fc4a1 100644 --- a/arch/powerpc/platforms/pseries/papr-vpd.c +++ b/arch/powerpc/platforms/pseries/papr-vpd.c @@ -2,7 +2,6 @@ #define pr_fmt(fmt) "papr-vpd: " fmt -#include <linux/anon_inodes.h> #include <linux/build_bug.h> #include <linux/file.h> #include <linux/fs.h> @@ -20,14 +19,7 @@ #include <asm/rtas-work-area.h> #include <asm/rtas.h> #include <uapi/asm/papr-vpd.h> - -/* - * Function-specific return values for ibm,get-vpd, derived from PAPR+ - * v2.13 7.3.20 "ibm,get-vpd RTAS Call". - */ -#define RTAS_IBM_GET_VPD_COMPLETE 0 /* All VPD has been retrieved. */ -#define RTAS_IBM_GET_VPD_MORE_DATA 1 /* More VPD is available. */ -#define RTAS_IBM_GET_VPD_START_OVER -4 /* VPD changed, restart call sequence. */ +#include "papr-rtas-common.h" /** * struct rtas_ibm_get_vpd_params - Parameters (in and out) for ibm,get-vpd. @@ -91,13 +83,14 @@ static int rtas_ibm_get_vpd(struct rtas_ibm_get_vpd_params *params) case RTAS_INVALID_PARAMETER: ret = -EINVAL; break; - case RTAS_IBM_GET_VPD_START_OVER: + case RTAS_SEQ_START_OVER: ret = -EAGAIN; + pr_info_ratelimited("VPD changed during retrieval, retrying\n"); break; - case RTAS_IBM_GET_VPD_MORE_DATA: + case RTAS_SEQ_MORE_DATA: params->sequence = rets[0]; fallthrough; - case RTAS_IBM_GET_VPD_COMPLETE: + case RTAS_SEQ_COMPLETE: params->written = rets[1]; /* * Kernel or firmware bug, do not continue. @@ -119,94 +112,6 @@ static int rtas_ibm_get_vpd(struct rtas_ibm_get_vpd_params *params) } /* - * Internal VPD "blob" APIs for accumulating ibm,get-vpd results into - * an immutable buffer to be attached to a file descriptor. - */ -struct vpd_blob { - const char *data; - size_t len; -}; - -static bool vpd_blob_has_data(const struct vpd_blob *blob) -{ - return blob->data && blob->len; -} - -static void vpd_blob_free(const struct vpd_blob *blob) -{ - if (blob) { - kvfree(blob->data); - kfree(blob); - } -} - -/** - * vpd_blob_extend() - Append data to a &struct vpd_blob. - * @blob: The blob to extend. - * @data: The new data to append to @blob. - * @len: The length of @data. - * - * Context: May sleep. - * Return: -ENOMEM on allocation failure, 0 otherwise. 
- */ -static int vpd_blob_extend(struct vpd_blob *blob, const char *data, size_t len) -{ - const size_t new_len = blob->len + len; - const size_t old_len = blob->len; - const char *old_ptr = blob->data; - char *new_ptr; - - new_ptr = old_ptr ? - kvrealloc(old_ptr, old_len, new_len, GFP_KERNEL_ACCOUNT) : - kvmalloc(len, GFP_KERNEL_ACCOUNT); - - if (!new_ptr) - return -ENOMEM; - - memcpy(&new_ptr[old_len], data, len); - blob->data = new_ptr; - blob->len = new_len; - return 0; -} - -/** - * vpd_blob_generate() - Construct a new &struct vpd_blob. - * @generator: Function that supplies the blob data. - * @arg: Context pointer supplied by caller, passed to @generator. - * - * The @generator callback is invoked until it returns NULL. @arg is - * passed to @generator in its first argument on each call. When - * @generator returns data, it should store the data length in its - * second argument. - * - * Context: May sleep. - * Return: A completely populated &struct vpd_blob, or NULL on error. - */ -static const struct vpd_blob * -vpd_blob_generate(const char * (*generator)(void *, size_t *), void *arg) -{ - struct vpd_blob *blob; - const char *buf; - size_t len; - int err = 0; - - blob = kzalloc(sizeof(*blob), GFP_KERNEL_ACCOUNT); - if (!blob) - return NULL; - - while (err == 0 && (buf = generator(arg, &len))) - err = vpd_blob_extend(blob, buf, len); - - if (err != 0 || !vpd_blob_has_data(blob)) - goto free_blob; - - return blob; -free_blob: - vpd_blob_free(blob); - return NULL; -} - -/* * Internal VPD sequence APIs. A VPD sequence is a series of calls to * ibm,get-vpd for a given location code. The sequence ends when an * error is encountered or all VPD for the location code has been @@ -214,30 +119,14 @@ free_blob: */ /** - * struct vpd_sequence - State for managing a VPD sequence. - * @error: Shall be zero as long as the sequence has not encountered an error, - * -ve errno otherwise. Use vpd_sequence_set_err() to update this. - * @params: Parameter block to pass to rtas_ibm_get_vpd(). - */ -struct vpd_sequence { - int error; - struct rtas_ibm_get_vpd_params params; -}; - -/** * vpd_sequence_begin() - Begin a VPD retrieval sequence. - * @seq: Uninitialized sequence state. - * @loc_code: Location code that defines the scope of the VPD to return. - * - * Initializes @seq with the resources necessary to carry out a VPD - * sequence. Callers must pass @seq to vpd_sequence_end() regardless - * of whether the sequence succeeds. + * @seq: vpd call parameters from sequence struct * * Context: May sleep. */ -static void vpd_sequence_begin(struct vpd_sequence *seq, - const struct papr_location_code *loc_code) +static void vpd_sequence_begin(struct papr_rtas_sequence *seq) { + struct rtas_ibm_get_vpd_params *vpd_params; /* * Use a static data structure for the location code passed to * RTAS to ensure it's in the RMA and avoid a separate work @@ -245,6 +134,7 @@ static void vpd_sequence_begin(struct vpd_sequence *seq, */ static struct papr_location_code static_loc_code; + vpd_params = (struct rtas_ibm_get_vpd_params *)seq->params; /* * We could allocate the work area before acquiring the * function lock, but that would allow concurrent requests to @@ -252,14 +142,12 @@ static void vpd_sequence_begin(struct vpd_sequence *seq, * allocate the work area under the lock. 
*/ mutex_lock(&rtas_ibm_get_vpd_lock); - static_loc_code = *loc_code; - *seq = (struct vpd_sequence) { - .params = { - .work_area = rtas_work_area_alloc(SZ_4K), - .loc_code = &static_loc_code, - .sequence = 1, - }, - }; + static_loc_code = *(struct papr_location_code *)vpd_params->loc_code; + vpd_params = (struct rtas_ibm_get_vpd_params *)seq->params; + vpd_params->work_area = rtas_work_area_alloc(SZ_4K); + vpd_params->loc_code = &static_loc_code; + vpd_params->sequence = 1; + vpd_params->status = 0; } /** @@ -268,180 +156,39 @@ static void vpd_sequence_begin(struct vpd_sequence *seq, * * Releases resources obtained by vpd_sequence_begin(). */ -static void vpd_sequence_end(struct vpd_sequence *seq) -{ - rtas_work_area_free(seq->params.work_area); - mutex_unlock(&rtas_ibm_get_vpd_lock); -} - -/** - * vpd_sequence_should_stop() - Determine whether a VPD retrieval sequence - * should continue. - * @seq: VPD sequence state. - * - * Examines the sequence error state and outputs of the last call to - * ibm,get-vpd to determine whether the sequence in progress should - * continue or stop. - * - * Return: True if the sequence has encountered an error or if all VPD for - * this sequence has been retrieved. False otherwise. - */ -static bool vpd_sequence_should_stop(const struct vpd_sequence *seq) +static void vpd_sequence_end(struct papr_rtas_sequence *seq) { - bool done; + struct rtas_ibm_get_vpd_params *vpd_params; - if (seq->error) - return true; - - switch (seq->params.status) { - case 0: - if (seq->params.written == 0) - done = false; /* Initial state. */ - else - done = true; /* All data consumed. */ - break; - case 1: - done = false; /* More data available. */ - break; - default: - done = true; /* Error encountered. */ - break; - } - - return done; -} - -static int vpd_sequence_set_err(struct vpd_sequence *seq, int err) -{ - /* Preserve the first error recorded. */ - if (seq->error == 0) - seq->error = err; - - return seq->error; + vpd_params = (struct rtas_ibm_get_vpd_params *)seq->params; + rtas_work_area_free(vpd_params->work_area); + mutex_unlock(&rtas_ibm_get_vpd_lock); } /* - * Generator function to be passed to vpd_blob_generate(). + * Generator function to be passed to papr_rtas_blob_generate(). */ -static const char *vpd_sequence_fill_work_area(void *arg, size_t *len) +static const char *vpd_sequence_fill_work_area(struct papr_rtas_sequence *seq, + size_t *len) { - struct vpd_sequence *seq = arg; - struct rtas_ibm_get_vpd_params *p = &seq->params; + struct rtas_ibm_get_vpd_params *p; + bool init_state; + + p = (struct rtas_ibm_get_vpd_params *)seq->params; + init_state = (p->written == 0) ? true : false; - if (vpd_sequence_should_stop(seq)) + if (papr_rtas_sequence_should_stop(seq, p->status, init_state)) return NULL; - if (vpd_sequence_set_err(seq, rtas_ibm_get_vpd(p))) + if (papr_rtas_sequence_set_err(seq, rtas_ibm_get_vpd(p))) return NULL; *len = p->written; return rtas_work_area_raw_buf(p->work_area); } -/* - * Higher-level VPD retrieval code below. These functions use the - * vpd_blob_* and vpd_sequence_* APIs defined above to create fd-based - * VPD handles for consumption by user space. - */ - -/** - * papr_vpd_run_sequence() - Run a single VPD retrieval sequence. - * @loc_code: Location code that defines the scope of VPD to return. - * - * Context: May sleep. Holds a mutex and an RTAS work area for its - * duration. Typically performs multiple sleepable slab - * allocations. - * - * Return: A populated &struct vpd_blob on success. Encoded error - * pointer otherwise. 
- */ -static const struct vpd_blob *papr_vpd_run_sequence(const struct papr_location_code *loc_code) -{ - const struct vpd_blob *blob; - struct vpd_sequence seq; - - vpd_sequence_begin(&seq, loc_code); - blob = vpd_blob_generate(vpd_sequence_fill_work_area, &seq); - if (!blob) - vpd_sequence_set_err(&seq, -ENOMEM); - vpd_sequence_end(&seq); - - if (seq.error) { - vpd_blob_free(blob); - return ERR_PTR(seq.error); - } - - return blob; -} - -/** - * papr_vpd_retrieve() - Return the VPD for a location code. - * @loc_code: Location code that defines the scope of VPD to return. - * - * Run VPD sequences against @loc_code until a blob is successfully - * instantiated, or a hard error is encountered, or a fatal signal is - * pending. - * - * Context: May sleep. - * Return: A fully populated VPD blob when successful. Encoded error - * pointer otherwise. - */ -static const struct vpd_blob *papr_vpd_retrieve(const struct papr_location_code *loc_code) -{ - const struct vpd_blob *blob; - - /* - * EAGAIN means the sequence errored with a -4 (VPD changed) - * status from ibm,get-vpd, and we should attempt a new - * sequence. PAPR+ v2.13 R1–7.3.20–5 indicates that this - * should be a transient condition, not something that happens - * continuously. But we'll stop trying on a fatal signal. - */ - do { - blob = papr_vpd_run_sequence(loc_code); - if (!IS_ERR(blob)) /* Success. */ - break; - if (PTR_ERR(blob) != -EAGAIN) /* Hard error. */ - break; - pr_info_ratelimited("VPD changed during retrieval, retrying\n"); - cond_resched(); - } while (!fatal_signal_pending(current)); - - return blob; -} - -static ssize_t papr_vpd_handle_read(struct file *file, char __user *buf, size_t size, loff_t *off) -{ - const struct vpd_blob *blob = file->private_data; - - /* bug: we should not instantiate a handle without any data attached. 
*/ - if (!vpd_blob_has_data(blob)) { - pr_err_once("handle without data\n"); - return -EIO; - } - - return simple_read_from_buffer(buf, size, off, blob->data, blob->len); -} - -static int papr_vpd_handle_release(struct inode *inode, struct file *file) -{ - const struct vpd_blob *blob = file->private_data; - - vpd_blob_free(blob); - - return 0; -} - -static loff_t papr_vpd_handle_seek(struct file *file, loff_t off, int whence) -{ - const struct vpd_blob *blob = file->private_data; - - return fixed_size_llseek(file, off, whence, blob->len); -} - - static const struct file_operations papr_vpd_handle_ops = { - .read = papr_vpd_handle_read, - .llseek = papr_vpd_handle_seek, - .release = papr_vpd_handle_release, + .read = papr_rtas_common_handle_read, + .llseek = papr_rtas_common_handle_seek, + .release = papr_rtas_common_handle_release, }; /** @@ -463,10 +210,9 @@ static const struct file_operations papr_vpd_handle_ops = { */ static long papr_vpd_create_handle(struct papr_location_code __user *ulc) { + struct rtas_ibm_get_vpd_params vpd_params = {}; + struct papr_rtas_sequence seq = {}; struct papr_location_code klc; - const struct vpd_blob *blob; - struct file *file; - long err; int fd; if (copy_from_user(&klc, ulc, sizeof(klc))) @@ -475,31 +221,19 @@ static long papr_vpd_create_handle(struct papr_location_code __user *ulc) if (!string_is_terminated(klc.str, ARRAY_SIZE(klc.str))) return -EINVAL; - blob = papr_vpd_retrieve(&klc); - if (IS_ERR(blob)) - return PTR_ERR(blob); + seq = (struct papr_rtas_sequence) { + .begin = vpd_sequence_begin, + .end = vpd_sequence_end, + .work = vpd_sequence_fill_work_area, + }; - fd = get_unused_fd_flags(O_RDONLY | O_CLOEXEC); - if (fd < 0) { - err = fd; - goto free_blob; - } + vpd_params.loc_code = &klc; + seq.params = (void *)&vpd_params; - file = anon_inode_getfile("[papr-vpd]", &papr_vpd_handle_ops, - (void *)blob, O_RDONLY); - if (IS_ERR(file)) { - err = PTR_ERR(file); - goto put_fd; - } + fd = papr_rtas_setup_file_interface(&seq, &papr_vpd_handle_ops, + "[papr-vpd]"); - file->f_mode |= FMODE_LSEEK | FMODE_PREAD; - fd_install(fd, file); return fd; -put_fd: - put_unused_fd(fd); -free_blob: - vpd_blob_free(blob); - return err; } /* diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c index c233f9db039b..f7c9271bda58 100644 --- a/arch/powerpc/platforms/pseries/papr_scm.c +++ b/arch/powerpc/platforms/pseries/papr_scm.c @@ -6,6 +6,7 @@ #include <linux/kernel.h> #include <linux/module.h> #include <linux/ioport.h> +#include <linux/seq_file.h> #include <linux/slab.h> #include <linux/ndctl.h> #include <linux/sched.h> @@ -16,9 +17,10 @@ #include <linux/nd.h> #include <asm/plpar_wrappers.h> -#include <asm/papr_pdsm.h> +#include <uapi/linux/papr_pdsm.h> +#include <linux/papr_scm.h> #include <asm/mce.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <linux/perf_event.h> #define BIND_ANY_ADDR (~0ul) @@ -29,46 +31,6 @@ (1ul << ND_CMD_SET_CONFIG_DATA) | \ (1ul << ND_CMD_CALL)) -/* DIMM health bitmap indicators */ -/* SCM device is unable to persist memory contents */ -#define PAPR_PMEM_UNARMED (1ULL << (63 - 0)) -/* SCM device failed to persist memory contents */ -#define PAPR_PMEM_SHUTDOWN_DIRTY (1ULL << (63 - 1)) -/* SCM device contents are persisted from previous IPL */ -#define PAPR_PMEM_SHUTDOWN_CLEAN (1ULL << (63 - 2)) -/* SCM device contents are not persisted from previous IPL */ -#define PAPR_PMEM_EMPTY (1ULL << (63 - 3)) -/* SCM device memory life remaining is critically low */ -#define 
PAPR_PMEM_HEALTH_CRITICAL (1ULL << (63 - 4)) -/* SCM device will be garded off next IPL due to failure */ -#define PAPR_PMEM_HEALTH_FATAL (1ULL << (63 - 5)) -/* SCM contents cannot persist due to current platform health status */ -#define PAPR_PMEM_HEALTH_UNHEALTHY (1ULL << (63 - 6)) -/* SCM device is unable to persist memory contents in certain conditions */ -#define PAPR_PMEM_HEALTH_NON_CRITICAL (1ULL << (63 - 7)) -/* SCM device is encrypted */ -#define PAPR_PMEM_ENCRYPTED (1ULL << (63 - 8)) -/* SCM device has been scrubbed and locked */ -#define PAPR_PMEM_SCRUBBED_AND_LOCKED (1ULL << (63 - 9)) - -/* Bits status indicators for health bitmap indicating unarmed dimm */ -#define PAPR_PMEM_UNARMED_MASK (PAPR_PMEM_UNARMED | \ - PAPR_PMEM_HEALTH_UNHEALTHY) - -/* Bits status indicators for health bitmap indicating unflushed dimm */ -#define PAPR_PMEM_BAD_SHUTDOWN_MASK (PAPR_PMEM_SHUTDOWN_DIRTY) - -/* Bits status indicators for health bitmap indicating unrestored dimm */ -#define PAPR_PMEM_BAD_RESTORE_MASK (PAPR_PMEM_EMPTY) - -/* Bit status indicators for smart event notification */ -#define PAPR_PMEM_SMART_EVENT_MASK (PAPR_PMEM_HEALTH_CRITICAL | \ - PAPR_PMEM_HEALTH_FATAL | \ - PAPR_PMEM_HEALTH_UNHEALTHY) - -#define PAPR_SCM_PERF_STATS_EYECATCHER __stringify(SCMSTATS) -#define PAPR_SCM_PERF_STATS_VERSION 0x1 - /* Struct holding a single performance metric */ struct papr_scm_perf_stat { u8 stat_id[8]; @@ -582,7 +544,7 @@ static int drc_pmem_query_health(struct papr_scm_priv *p) /* Jiffies offset for which the health data is assumed to be same */ cache_timeout = p->lasthealth_jiffies + - msecs_to_jiffies(MIN_HEALTH_QUERY_INTERVAL * 1000); + secs_to_jiffies(MIN_HEALTH_QUERY_INTERVAL); /* Fetch new health info is its older than MIN_HEALTH_QUERY_INTERVAL */ if (time_after(jiffies, cache_timeout)) @@ -1548,7 +1510,7 @@ static const struct of_device_id papr_scm_match[] = { static struct platform_driver papr_scm_driver = { .probe = papr_scm_probe, - .remove_new = papr_scm_remove, + .remove = papr_scm_remove, .driver = { .name = "papr_scm", .of_match_table = papr_scm_match, @@ -1575,5 +1537,6 @@ static void __exit papr_scm_exit(void) module_exit(papr_scm_exit); MODULE_DEVICE_TABLE(of, papr_scm_match); +MODULE_DESCRIPTION("PAPR Storage Class Memory interface driver"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("IBM Corporation"); diff --git a/arch/powerpc/platforms/pseries/pci.c b/arch/powerpc/platforms/pseries/pci.c index 1772ae3d193d..6dbc73eb2ca2 100644 --- a/arch/powerpc/platforms/pseries/pci.c +++ b/arch/powerpc/platforms/pseries/pci.c @@ -18,33 +18,6 @@ #include <asm/pci.h> #include "pseries.h" -#if 0 -void pcibios_name_device(struct pci_dev *dev) -{ - struct device_node *dn; - - /* - * Add IBM loc code (slot) as a prefix to the device names for service - */ - dn = pci_device_to_OF_node(dev); - if (dn) { - const char *loc_code = of_get_property(dn, "ibm,loc-code", - NULL); - if (loc_code) { - int loc_len = strlen(loc_code); - if (loc_len < sizeof(dev->dev.name)) { - memmove(dev->dev.name+loc_len+1, dev->dev.name, - sizeof(dev->dev.name)-loc_len-1); - memcpy(dev->dev.name, loc_code, loc_len); - dev->dev.name[loc_len] = ' '; - dev->dev.name[sizeof(dev->dev.name)-1] = '\0'; - } - } - } -} -DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pcibios_name_device); -#endif - #ifdef CONFIG_PCI_IOV #define MAX_VFS_FOR_MAP_PE 256 struct pe_map_bar_entry { diff --git a/arch/powerpc/platforms/pseries/pci_dlpar.c b/arch/powerpc/platforms/pseries/pci_dlpar.c index 4448386268d9..52e2623a741d 100644 --- 
a/arch/powerpc/platforms/pseries/pci_dlpar.c +++ b/arch/powerpc/platforms/pseries/pci_dlpar.c @@ -11,6 +11,7 @@ #include <linux/pci.h> #include <linux/export.h> +#include <linux/node.h> #include <asm/pci-bridge.h> #include <asm/ppc-pci.h> #include <asm/firmware.h> @@ -21,9 +22,22 @@ struct pci_controller *init_phb_dynamic(struct device_node *dn) { struct pci_controller *phb; + int nid; pr_debug("PCI: Initializing new hotplug PHB %pOF\n", dn); + nid = of_node_to_nid(dn); + if (likely((nid) >= 0)) { + if (!node_online(nid)) { + if (__register_one_node(nid)) { + pr_err("PCI: Failed to register node %d\n", nid); + } else { + update_numa_distance(dn); + node_set_online(nid); + } + } + } + phb = pcibios_alloc_controller(dn); if (!phb) return NULL; diff --git a/arch/powerpc/platforms/pseries/plpks.c b/arch/powerpc/platforms/pseries/plpks.c index 4a595493d28a..b1667ed05f98 100644 --- a/arch/powerpc/platforms/pseries/plpks.c +++ b/arch/powerpc/platforms/pseries/plpks.c @@ -683,7 +683,7 @@ void __init plpks_early_init_devtree(void) out: fdt_nop_property(fdt, chosen_node, "ibm,plpks-pw"); // Since we've cleared the password, we must update the FDT checksum - early_init_dt_verify(fdt); + early_init_dt_verify(fdt, __pa(fdt)); } static __init int pseries_plpks_init(void) diff --git a/arch/powerpc/platforms/pseries/pmem.c b/arch/powerpc/platforms/pseries/pmem.c index 3c290b9ed01b..0f1d45f32e4a 100644 --- a/arch/powerpc/platforms/pseries/pmem.c +++ b/arch/powerpc/platforms/pseries/pmem.c @@ -121,7 +121,7 @@ int dlpar_hp_pmem(struct pseries_hp_errorlog *hp_elog) return -EINVAL; } - drc_index = hp_elog->_drc_u.drc_index; + drc_index = be32_to_cpu(hp_elog->_drc_u.drc_index); lock_device_hotplug(); diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h index bba4ad192b0f..3968a6970fa8 100644 --- a/arch/powerpc/platforms/pseries/pseries.h +++ b/arch/powerpc/platforms/pseries/pseries.h @@ -38,7 +38,6 @@ static inline void smp_init_pseries(void) { } #endif extern void pseries_kexec_cpu_down(int crash_shutdown, int secondary); -void pseries_machine_kexec(struct kimage *image); extern void pSeries_final_fixup(void); diff --git a/arch/powerpc/platforms/pseries/rtas-fadump.c b/arch/powerpc/platforms/pseries/rtas-fadump.c index b5853e9fcc3c..eceb3289383e 100644 --- a/arch/powerpc/platforms/pseries/rtas-fadump.c +++ b/arch/powerpc/platforms/pseries/rtas-fadump.c @@ -18,6 +18,7 @@ #include <asm/page.h> #include <asm/rtas.h> +#include <asm/setup.h> #include <asm/fadump.h> #include <asm/fadump-internal.h> @@ -29,9 +30,6 @@ static const struct rtas_fadump_mem_struct *fdm_active; static void rtas_fadump_update_config(struct fw_dump *fadump_conf, const struct rtas_fadump_mem_struct *fdm) { - fadump_conf->boot_mem_dest_addr = - be64_to_cpu(fdm->rmr_region.destination_address); - fadump_conf->fadumphdr_addr = (fadump_conf->boot_mem_dest_addr + fadump_conf->boot_memory_size); } @@ -43,20 +41,56 @@ static void rtas_fadump_update_config(struct fw_dump *fadump_conf, static void __init rtas_fadump_get_config(struct fw_dump *fadump_conf, const struct rtas_fadump_mem_struct *fdm) { - fadump_conf->boot_mem_addr[0] = - be64_to_cpu(fdm->rmr_region.source_address); - fadump_conf->boot_mem_sz[0] = be64_to_cpu(fdm->rmr_region.source_len); - fadump_conf->boot_memory_size = fadump_conf->boot_mem_sz[0]; + unsigned long base, size, last_end, hole_size; - fadump_conf->boot_mem_top = fadump_conf->boot_memory_size; - fadump_conf->boot_mem_regs_cnt = 1; + last_end = 0; + hole_size = 0; + 
fadump_conf->boot_memory_size = 0; + fadump_conf->boot_mem_regs_cnt = 0; + pr_debug("Boot memory regions:\n"); + for (int i = 0; i < be16_to_cpu(fdm->header.dump_num_sections); i++) { + int type = be16_to_cpu(fdm->rgn[i].source_data_type); + u64 addr; - /* - * Start address of reserve dump area (permanent reservation) for - * re-registering FADump after dump capture. - */ - fadump_conf->reserve_dump_area_start = - be64_to_cpu(fdm->cpu_state_data.destination_address); + switch (type) { + case RTAS_FADUMP_CPU_STATE_DATA: + addr = be64_to_cpu(fdm->rgn[i].destination_address); + + fadump_conf->cpu_state_dest_vaddr = (u64)__va(addr); + /* + * Start address of reserve dump area (permanent reservation) for + * re-registering FADump after dump capture. + */ + fadump_conf->reserve_dump_area_start = addr; + break; + case RTAS_FADUMP_HPTE_REGION: + /* Not processed currently. */ + break; + case RTAS_FADUMP_REAL_MODE_REGION: + base = be64_to_cpu(fdm->rgn[i].source_address); + size = be64_to_cpu(fdm->rgn[i].source_len); + pr_debug("\t[%03d] base: 0x%lx, size: 0x%lx\n", i, base, size); + if (!base) { + fadump_conf->boot_mem_dest_addr = + be64_to_cpu(fdm->rgn[i].destination_address); + } + + fadump_conf->boot_mem_addr[fadump_conf->boot_mem_regs_cnt] = base; + fadump_conf->boot_mem_sz[fadump_conf->boot_mem_regs_cnt] = size; + fadump_conf->boot_memory_size += size; + hole_size += (base - last_end); + last_end = base + size; + fadump_conf->boot_mem_regs_cnt++; + break; + case RTAS_FADUMP_PARAM_AREA: + fadump_conf->param_area = be64_to_cpu(fdm->rgn[i].destination_address); + break; + default: + pr_warn("Section type %d unsupported on this kernel. Ignoring!\n", type); + break; + } + } + fadump_conf->boot_mem_top = fadump_conf->boot_memory_size + hole_size; rtas_fadump_update_config(fadump_conf, fdm); } @@ -64,16 +98,15 @@ static void __init rtas_fadump_get_config(struct fw_dump *fadump_conf, static u64 rtas_fadump_init_mem_struct(struct fw_dump *fadump_conf) { u64 addr = fadump_conf->reserve_dump_area_start; + u16 sec_cnt = 0; memset(&fdm, 0, sizeof(struct rtas_fadump_mem_struct)); addr = addr & PAGE_MASK; fdm.header.dump_format_version = cpu_to_be32(0x00000001); - fdm.header.dump_num_sections = cpu_to_be16(3); fdm.header.dump_status_flag = 0; fdm.header.offset_first_dump_section = - cpu_to_be32((u32)offsetof(struct rtas_fadump_mem_struct, - cpu_state_data)); + cpu_to_be32((u32)offsetof(struct rtas_fadump_mem_struct, rgn)); /* * Fields for disk dump option. @@ -89,25 +122,22 @@ static u64 rtas_fadump_init_mem_struct(struct fw_dump *fadump_conf) /* Kernel dump sections */ /* cpu state data section. 
*/ - fdm.cpu_state_data.request_flag = - cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG); - fdm.cpu_state_data.source_data_type = - cpu_to_be16(RTAS_FADUMP_CPU_STATE_DATA); - fdm.cpu_state_data.source_address = 0; - fdm.cpu_state_data.source_len = - cpu_to_be64(fadump_conf->cpu_state_data_size); - fdm.cpu_state_data.destination_address = cpu_to_be64(addr); + fdm.rgn[sec_cnt].request_flag = cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG); + fdm.rgn[sec_cnt].source_data_type = cpu_to_be16(RTAS_FADUMP_CPU_STATE_DATA); + fdm.rgn[sec_cnt].source_address = 0; + fdm.rgn[sec_cnt].source_len = cpu_to_be64(fadump_conf->cpu_state_data_size); + fdm.rgn[sec_cnt].destination_address = cpu_to_be64(addr); addr += fadump_conf->cpu_state_data_size; + sec_cnt++; /* hpte region section */ - fdm.hpte_region.request_flag = cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG); - fdm.hpte_region.source_data_type = - cpu_to_be16(RTAS_FADUMP_HPTE_REGION); - fdm.hpte_region.source_address = 0; - fdm.hpte_region.source_len = - cpu_to_be64(fadump_conf->hpte_region_size); - fdm.hpte_region.destination_address = cpu_to_be64(addr); + fdm.rgn[sec_cnt].request_flag = cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG); + fdm.rgn[sec_cnt].source_data_type = cpu_to_be16(RTAS_FADUMP_HPTE_REGION); + fdm.rgn[sec_cnt].source_address = 0; + fdm.rgn[sec_cnt].source_len = cpu_to_be64(fadump_conf->hpte_region_size); + fdm.rgn[sec_cnt].destination_address = cpu_to_be64(addr); addr += fadump_conf->hpte_region_size; + sec_cnt++; /* * Align boot memory area destination address to page boundary to @@ -115,14 +145,29 @@ static u64 rtas_fadump_init_mem_struct(struct fw_dump *fadump_conf) */ addr = PAGE_ALIGN(addr); - /* RMA region section */ - fdm.rmr_region.request_flag = cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG); - fdm.rmr_region.source_data_type = - cpu_to_be16(RTAS_FADUMP_REAL_MODE_REGION); - fdm.rmr_region.source_address = cpu_to_be64(0); - fdm.rmr_region.source_len = cpu_to_be64(fadump_conf->boot_memory_size); - fdm.rmr_region.destination_address = cpu_to_be64(addr); - addr += fadump_conf->boot_memory_size; + /* First boot memory region destination address */ + fadump_conf->boot_mem_dest_addr = addr; + for (int i = 0; i < fadump_conf->boot_mem_regs_cnt; i++) { + /* Boot memory regions */ + fdm.rgn[sec_cnt].request_flag = cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG); + fdm.rgn[sec_cnt].source_data_type = cpu_to_be16(RTAS_FADUMP_REAL_MODE_REGION); + fdm.rgn[sec_cnt].source_address = cpu_to_be64(fadump_conf->boot_mem_addr[i]); + fdm.rgn[sec_cnt].source_len = cpu_to_be64(fadump_conf->boot_mem_sz[i]); + fdm.rgn[sec_cnt].destination_address = cpu_to_be64(addr); + addr += fadump_conf->boot_mem_sz[i]; + sec_cnt++; + } + + /* Parameters area */ + if (fadump_conf->param_area) { + fdm.rgn[sec_cnt].request_flag = cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG); + fdm.rgn[sec_cnt].source_data_type = cpu_to_be16(RTAS_FADUMP_PARAM_AREA); + fdm.rgn[sec_cnt].source_address = cpu_to_be64(fadump_conf->param_area); + fdm.rgn[sec_cnt].source_len = cpu_to_be64(COMMAND_LINE_SIZE); + fdm.rgn[sec_cnt].destination_address = cpu_to_be64(fadump_conf->param_area); + sec_cnt++; + } + fdm.header.dump_num_sections = cpu_to_be16(sec_cnt); rtas_fadump_update_config(fadump_conf, &fdm); @@ -136,14 +181,21 @@ static u64 rtas_fadump_get_bootmem_min(void) static int rtas_fadump_register(struct fw_dump *fadump_conf) { - unsigned int wait_time; + unsigned int wait_time, fdm_size; int rc, err = -EIO; + /* + * Platform requires the exact size of the Dump Memory Structure. 
+ * Avoid including any unused rgns in the calculation, as this + * could result in a parameter error (-3) from the platform. + */ + fdm_size = sizeof(struct rtas_fadump_section_header); + fdm_size += be16_to_cpu(fdm.header.dump_num_sections) * sizeof(struct rtas_fadump_section); + /* TODO: Add upper time limit for the delay */ do { rc = rtas_call(fadump_conf->ibm_configure_kernel_dump, 3, 1, - NULL, FADUMP_REGISTER, &fdm, - sizeof(struct rtas_fadump_mem_struct)); + NULL, FADUMP_REGISTER, &fdm, fdm_size); wait_time = rtas_busy_delay_time(rc); if (wait_time) @@ -161,9 +213,7 @@ static int rtas_fadump_register(struct fw_dump *fadump_conf) pr_err("Failed to register. Hardware Error(%d).\n", rc); break; case -3: - if (!is_fadump_boot_mem_contiguous()) - pr_err("Can't have holes in boot memory area.\n"); - else if (!is_fadump_reserved_mem_contiguous()) + if (!is_fadump_reserved_mem_contiguous()) pr_err("Can't have holes in reserved memory area.\n"); pr_err("Failed to register. Parameter Error(%d).\n", rc); @@ -316,11 +366,9 @@ static int __init rtas_fadump_build_cpu_notes(struct fw_dump *fadump_conf) u32 num_cpus, *note_buf; int i, rc = 0, cpu = 0; struct pt_regs regs; - unsigned long addr; void *vaddr; - addr = be64_to_cpu(fdm_active->cpu_state_data.destination_address); - vaddr = __va(addr); + vaddr = (void *)fadump_conf->cpu_state_dest_vaddr; reg_header = vaddr; if (be64_to_cpu(reg_header->magic_number) != @@ -375,11 +423,8 @@ static int __init rtas_fadump_build_cpu_notes(struct fw_dump *fadump_conf) } final_note(note_buf); - if (fdh) { - pr_debug("Updating elfcore header (%llx) with cpu notes\n", - fdh->elfcorehdr_addr); - fadump_update_elfcore_header(__va(fdh->elfcorehdr_addr)); - } + pr_debug("Updating elfcore header (%llx) with cpu notes\n", fadump_conf->elfcorehdr_addr); + fadump_update_elfcore_header((char *)fadump_conf->elfcorehdr_addr); return 0; error_out: @@ -389,57 +434,66 @@ error_out: } /* - * Validate and process the dump data stored by firmware before exporting - * it through '/proc/vmcore'. + * Validate and process the dump data stored by the firmware, and update + * the CPU notes of elfcorehdr. */ static int __init rtas_fadump_process(struct fw_dump *fadump_conf) { - struct fadump_crash_info_header *fdh; - int rc = 0; - if (!fdm_active || !fadump_conf->fadumphdr_addr) return -EINVAL; /* Check if the dump data is valid. 
*/ - if ((be16_to_cpu(fdm_active->header.dump_status_flag) == - RTAS_FADUMP_ERROR_FLAG) || - (fdm_active->cpu_state_data.error_flags != 0) || - (fdm_active->rmr_region.error_flags != 0)) { - pr_err("Dump taken by platform is not valid\n"); - return -EINVAL; - } - if ((fdm_active->rmr_region.bytes_dumped != - fdm_active->rmr_region.source_len) || - !fdm_active->cpu_state_data.bytes_dumped) { - pr_err("Dump taken by platform is incomplete\n"); - return -EINVAL; - } + for (int i = 0; i < be16_to_cpu(fdm_active->header.dump_num_sections); i++) { + int type = be16_to_cpu(fdm_active->rgn[i].source_data_type); + int rc = 0; - /* Validate the fadump crash info header */ - fdh = __va(fadump_conf->fadumphdr_addr); - if (fdh->magic_number != FADUMP_CRASH_INFO_MAGIC) { - pr_err("Crash info header is not valid.\n"); - return -EINVAL; + switch (type) { + case RTAS_FADUMP_CPU_STATE_DATA: + case RTAS_FADUMP_HPTE_REGION: + case RTAS_FADUMP_REAL_MODE_REGION: + if (fdm_active->rgn[i].error_flags != 0) { + pr_err("Dump taken by platform is not valid (%d)\n", i); + rc = -EINVAL; + } + if (fdm_active->rgn[i].bytes_dumped != fdm_active->rgn[i].source_len) { + pr_err("Dump taken by platform is incomplete (%d)\n", i); + rc = -EINVAL; + } + if (rc) { + pr_warn("Region type: %u src addr: 0x%llx dest addr: 0x%llx\n", + be16_to_cpu(fdm_active->rgn[i].source_data_type), + be64_to_cpu(fdm_active->rgn[i].source_address), + be64_to_cpu(fdm_active->rgn[i].destination_address)); + return rc; + } + break; + case RTAS_FADUMP_PARAM_AREA: + if (fdm_active->rgn[i].bytes_dumped != fdm_active->rgn[i].source_len || + fdm_active->rgn[i].error_flags != 0) { + pr_warn("Failed to process additional parameters! Proceeding anyway..\n"); + fadump_conf->param_area = 0; + } + break; + default: + /* + * If the first/crashed kernel added a new region type that the + * second/fadump kernel doesn't recognize, skip it and process + * assuming backward compatibility. + */ + pr_warn("Unknown region found: type: %u src addr: 0x%llx dest addr: 0x%llx\n", + be16_to_cpu(fdm_active->rgn[i].source_data_type), + be64_to_cpu(fdm_active->rgn[i].source_address), + be64_to_cpu(fdm_active->rgn[i].destination_address)); + break; + } } - rc = rtas_fadump_build_cpu_notes(fadump_conf); - if (rc) - return rc; - - /* - * We are done validating dump info and elfcore header is now ready - * to be exported. set elfcorehdr_addr so that vmcore module will - * export the elfcore header through '/proc/vmcore'. 
- */ - elfcorehdr_addr = fdh->elfcorehdr_addr; - - return 0; + return rtas_fadump_build_cpu_notes(fadump_conf); } static void rtas_fadump_region_show(struct fw_dump *fadump_conf, struct seq_file *m) { - const struct rtas_fadump_section *cpu_data_section; const struct rtas_fadump_mem_struct *fdm_ptr; if (fdm_active) @@ -447,27 +501,49 @@ static void rtas_fadump_region_show(struct fw_dump *fadump_conf, else fdm_ptr = &fdm; - cpu_data_section = &(fdm_ptr->cpu_state_data); - seq_printf(m, "CPU :[%#016llx-%#016llx] %#llx bytes, Dumped: %#llx\n", - be64_to_cpu(cpu_data_section->destination_address), - be64_to_cpu(cpu_data_section->destination_address) + - be64_to_cpu(cpu_data_section->source_len) - 1, - be64_to_cpu(cpu_data_section->source_len), - be64_to_cpu(cpu_data_section->bytes_dumped)); - - seq_printf(m, "HPTE:[%#016llx-%#016llx] %#llx bytes, Dumped: %#llx\n", - be64_to_cpu(fdm_ptr->hpte_region.destination_address), - be64_to_cpu(fdm_ptr->hpte_region.destination_address) + - be64_to_cpu(fdm_ptr->hpte_region.source_len) - 1, - be64_to_cpu(fdm_ptr->hpte_region.source_len), - be64_to_cpu(fdm_ptr->hpte_region.bytes_dumped)); - - seq_printf(m, "DUMP: Src: %#016llx, Dest: %#016llx, ", - be64_to_cpu(fdm_ptr->rmr_region.source_address), - be64_to_cpu(fdm_ptr->rmr_region.destination_address)); - seq_printf(m, "Size: %#llx, Dumped: %#llx bytes\n", - be64_to_cpu(fdm_ptr->rmr_region.source_len), - be64_to_cpu(fdm_ptr->rmr_region.bytes_dumped)); + + for (int i = 0; i < be16_to_cpu(fdm_ptr->header.dump_num_sections); i++) { + int type = be16_to_cpu(fdm_ptr->rgn[i].source_data_type); + + switch (type) { + case RTAS_FADUMP_CPU_STATE_DATA: + seq_printf(m, "CPU :[%#016llx-%#016llx] %#llx bytes, Dumped: %#llx\n", + be64_to_cpu(fdm_ptr->rgn[i].destination_address), + be64_to_cpu(fdm_ptr->rgn[i].destination_address) + + be64_to_cpu(fdm_ptr->rgn[i].source_len) - 1, + be64_to_cpu(fdm_ptr->rgn[i].source_len), + be64_to_cpu(fdm_ptr->rgn[i].bytes_dumped)); + break; + case RTAS_FADUMP_HPTE_REGION: + seq_printf(m, "HPTE:[%#016llx-%#016llx] %#llx bytes, Dumped: %#llx\n", + be64_to_cpu(fdm_ptr->rgn[i].destination_address), + be64_to_cpu(fdm_ptr->rgn[i].destination_address) + + be64_to_cpu(fdm_ptr->rgn[i].source_len) - 1, + be64_to_cpu(fdm_ptr->rgn[i].source_len), + be64_to_cpu(fdm_ptr->rgn[i].bytes_dumped)); + break; + case RTAS_FADUMP_REAL_MODE_REGION: + seq_printf(m, "DUMP: Src: %#016llx, Dest: %#016llx, ", + be64_to_cpu(fdm_ptr->rgn[i].source_address), + be64_to_cpu(fdm_ptr->rgn[i].destination_address)); + seq_printf(m, "Size: %#llx, Dumped: %#llx bytes\n", + be64_to_cpu(fdm_ptr->rgn[i].source_len), + be64_to_cpu(fdm_ptr->rgn[i].bytes_dumped)); + break; + case RTAS_FADUMP_PARAM_AREA: + seq_printf(m, "\n[%#016llx-%#016llx]: cmdline append: '%s'\n", + be64_to_cpu(fdm_ptr->rgn[i].destination_address), + be64_to_cpu(fdm_ptr->rgn[i].destination_address) + + be64_to_cpu(fdm_ptr->rgn[i].source_len) - 1, + (char *)__va(be64_to_cpu(fdm_ptr->rgn[i].destination_address))); + break; + default: + seq_printf(m, "Unknown region type %d : Src: %#016llx, Dest: %#016llx, ", + type, be64_to_cpu(fdm_ptr->rgn[i].source_address), + be64_to_cpu(fdm_ptr->rgn[i].destination_address)); + break; + } + } /* Dump is active. Show preserved area start address. 
*/ if (fdm_active) { @@ -483,6 +559,20 @@ static void rtas_fadump_trigger(struct fadump_crash_info_header *fdh, rtas_os_term((char *)msg); } +/* FADUMP_MAX_MEM_REGS or lower */ +static int rtas_fadump_max_boot_mem_rgns(void) +{ + /* + * Version 1 of Kernel Assisted Dump Memory Structure (PAPR) supports 10 sections. + * With one each section taken for CPU state data & HPTE respectively, 8 sections + * can be used for boot memory regions. + * + * If new region(s) is(are) defined, maximum boot memory regions will decrease + * proportionally. + */ + return RTAS_FADUMP_MAX_BOOT_MEM_REGS; +} + static struct fadump_ops rtas_fadump_ops = { .fadump_init_mem_struct = rtas_fadump_init_mem_struct, .fadump_get_bootmem_min = rtas_fadump_get_bootmem_min, @@ -492,6 +582,7 @@ static struct fadump_ops rtas_fadump_ops = { .fadump_process = rtas_fadump_process, .fadump_region_show = rtas_fadump_region_show, .fadump_trigger = rtas_fadump_trigger, + .fadump_max_boot_mem_rgns = rtas_fadump_max_boot_mem_rgns, }; void __init rtas_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node) @@ -508,9 +599,10 @@ void __init rtas_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node) if (!token) return; - fadump_conf->ibm_configure_kernel_dump = be32_to_cpu(*token); - fadump_conf->ops = &rtas_fadump_ops; - fadump_conf->fadump_supported = 1; + fadump_conf->ibm_configure_kernel_dump = be32_to_cpu(*token); + fadump_conf->ops = &rtas_fadump_ops; + fadump_conf->fadump_supported = 1; + fadump_conf->param_area_supported = 1; /* Firmware supports 64-bit value for size, align it to pagesize. */ fadump_conf->max_copy_size = ALIGN_DOWN(U64_MAX, PAGE_SIZE); diff --git a/arch/powerpc/platforms/pseries/rtas-fadump.h b/arch/powerpc/platforms/pseries/rtas-fadump.h index fd59bd7ca9c3..c109abf6befd 100644 --- a/arch/powerpc/platforms/pseries/rtas-fadump.h +++ b/arch/powerpc/platforms/pseries/rtas-fadump.h @@ -23,12 +23,24 @@ #define RTAS_FADUMP_HPTE_REGION 0x0002 #define RTAS_FADUMP_REAL_MODE_REGION 0x0011 +/* OS defined sections */ +#define RTAS_FADUMP_PARAM_AREA 0x0100 + /* Dump request flag */ #define RTAS_FADUMP_REQUEST_FLAG 0x00000001 /* Dump status flag */ #define RTAS_FADUMP_ERROR_FLAG 0x2000 +/* + * The Firmware Assisted Dump Memory structure supports a maximum of 10 sections + * in the dump memory structure. Presently, three sections are used for + * CPU state data, HPTE & Parameters area, while the remaining seven sections + * can be used for boot memory regions. + */ +#define MAX_SECTIONS 10 +#define RTAS_FADUMP_MAX_BOOT_MEM_REGS 7 + /* Kernel Dump section info */ struct rtas_fadump_section { __be32 request_flag; @@ -61,20 +73,15 @@ struct rtas_fadump_section_header { * Firmware Assisted dump memory structure. This structure is required for * registering future kernel dump with power firmware through rtas call. * - * No disk dump option. Hence disk dump path string section is not included. + * In version 1, the platform permits one section header, dump-disk path + * and ten sections. + * + * Note: No disk dump option. Hence disk dump path string section is not + * included. */ struct rtas_fadump_mem_struct { struct rtas_fadump_section_header header; - - /* Kernel dump sections */ - struct rtas_fadump_section cpu_state_data; - struct rtas_fadump_section hpte_region; - - /* - * TODO: Extend multiple boot memory regions support in the kernel - * for this platform. 
- */ - struct rtas_fadump_section rmr_region; + struct rtas_fadump_section rgn[MAX_SECTIONS]; }; /* diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index 284a6fa04b0c..b10a25325238 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -343,8 +343,8 @@ static int alloc_dispatch_log_kmem_cache(void) { void (*ctor)(void *) = get_dtl_cache_ctor(); - dtl_cache = kmem_cache_create("dtl", DISPATCH_LOG_BYTES, - DISPATCH_LOG_BYTES, 0, ctor); + dtl_cache = kmem_cache_create_usercopy("dtl", DISPATCH_LOG_BYTES, + DISPATCH_LOG_BYTES, 0, 0, DISPATCH_LOG_BYTES, ctor); if (!dtl_cache) { pr_warn("Failed to create dispatch trace log buffer cache\n"); pr_warn("Stolen time statistics will be unreliable\n"); @@ -1159,7 +1159,6 @@ define_machine(pseries) { .machine_check_exception = pSeries_machine_check_exception, .machine_check_log_err = pSeries_machine_check_log_err, #ifdef CONFIG_KEXEC_CORE - .machine_kexec = pseries_machine_kexec, .kexec_cpu_down = pseries_kexec_cpu_down, #endif #ifdef CONFIG_MEMORY_HOTPLUG diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c index c597711ef20a..db99725e752b 100644 --- a/arch/powerpc/platforms/pseries/smp.c +++ b/arch/powerpc/platforms/pseries/smp.c @@ -39,7 +39,7 @@ #include <asm/xive.h> #include <asm/dbell.h> #include <asm/plpar_wrappers.h> -#include <asm/code-patching.h> +#include <asm/text-patching.h> #include <asm/svm.h> #include <asm/kvm_guest.h> diff --git a/arch/powerpc/platforms/pseries/svm.c b/arch/powerpc/platforms/pseries/svm.c index 3b4045d508ec..384c9dc1899a 100644 --- a/arch/powerpc/platforms/pseries/svm.c +++ b/arch/powerpc/platforms/pseries/svm.c @@ -8,6 +8,7 @@ #include <linux/mm.h> #include <linux/memblock.h> +#include <linux/mem_encrypt.h> #include <linux/cc_platform.h> #include <asm/machdep.h> #include <asm/svm.h> diff --git a/arch/powerpc/platforms/pseries/vas-sysfs.c b/arch/powerpc/platforms/pseries/vas-sysfs.c index f9f682724e77..9e05a0e99cad 100644 --- a/arch/powerpc/platforms/pseries/vas-sysfs.c +++ b/arch/powerpc/platforms/pseries/vas-sysfs.c @@ -162,13 +162,13 @@ static const struct sysfs_ops vas_sysfs_ops = { .store = vas_type_store, }; -static struct kobj_type vas_def_attr_type = { +static const struct kobj_type vas_def_attr_type = { .release = vas_type_release, .sysfs_ops = &vas_sysfs_ops, .default_groups = vas_def_capab_groups, }; -static struct kobj_type vas_qos_attr_type = { +static const struct kobj_type vas_qos_attr_type = { .release = vas_type_release, .sysfs_ops = &vas_sysfs_ops, .default_groups = vas_qos_capab_groups, diff --git a/arch/powerpc/platforms/pseries/vas.c b/arch/powerpc/platforms/pseries/vas.c index 71d52a670d95..c25eb1a38185 100644 --- a/arch/powerpc/platforms/pseries/vas.c +++ b/arch/powerpc/platforms/pseries/vas.c @@ -38,7 +38,27 @@ static long hcall_return_busy_check(long rc) { /* Check if we are stalled for some time */ if (H_IS_LONG_BUSY(rc)) { - msleep(get_longbusy_msecs(rc)); + unsigned int ms; + /* + * Allocate, Modify and Deallocate HCALLs returns + * H_LONG_BUSY_ORDER_1_MSEC or H_LONG_BUSY_ORDER_10_MSEC + * for the long delay. So the sleep time should always + * be either 1 or 10msecs, but in case if the HCALL + * returns the long delay > 10 msecs, clamp the sleep + * time to 10msecs. + */ + ms = clamp(get_longbusy_msecs(rc), 1, 10); + + /* + * msleep() will often sleep at least 20 msecs even + * though the hypervisor suggests that the OS reissue + * HCALLs after 1 or 10msecs. 
Also the delay hint from + * the HCALL is just a suggestion. So OK to pause for + * less time than the hinted delay. Use usleep_range() + * to ensure we don't sleep much longer than actually + * needed. + */ + usleep_range(ms * (USEC_PER_MSEC / 10), ms * USEC_PER_MSEC); rc = H_BUSY; } else if (rc == H_BUSY) { cond_resched(); @@ -228,7 +248,7 @@ static irqreturn_t pseries_vas_irq_handler(int irq, void *data) struct pseries_vas_window *txwin = data; /* - * The thread hanlder will process this interrupt if it is + * The thread handler will process this interrupt if it is * already running. */ atomic_inc(&txwin->pending_faults); diff --git a/arch/powerpc/platforms/pseries/vio.c b/arch/powerpc/platforms/pseries/vio.c index 90ff85c879bf..ac1d2d2c9a88 100644 --- a/arch/powerpc/platforms/pseries/vio.c +++ b/arch/powerpc/platforms/pseries/vio.c @@ -611,7 +611,7 @@ static const struct dma_map_ops vio_dma_mapping_ops = { .get_required_mask = dma_iommu_get_required_mask, .mmap = dma_common_mmap, .get_sgtable = dma_common_get_sgtable, - .alloc_pages = dma_common_alloc_pages, + .alloc_pages_op = dma_common_alloc_pages, .free_pages = dma_common_free_pages, }; @@ -1576,10 +1576,10 @@ void vio_unregister_device(struct vio_dev *viodev) } EXPORT_SYMBOL(vio_unregister_device); -static int vio_bus_match(struct device *dev, struct device_driver *drv) +static int vio_bus_match(struct device *dev, const struct device_driver *drv) { const struct vio_dev *vio_dev = to_vio_dev(dev); - struct vio_driver *vio_drv = to_vio_driver(drv); + const struct vio_driver *vio_drv = to_vio_driver(drv); const struct vio_device_id *ids = vio_drv->id_table; return (ids != NULL) && (vio_match_device(ids, vio_dev) != NULL); @@ -1592,13 +1592,9 @@ static int vio_hotplug(const struct device *dev, struct kobj_uevent_env *env) const char *cp; dn = dev->of_node; - if (!dn) - return -ENODEV; - cp = of_get_property(dn, "compatible", NULL); - if (!cp) - return -ENODEV; + if (dn && (cp = of_get_property(dn, "compatible", NULL))) + add_uevent_var(env, "MODALIAS=vio:T%sS%s", vio_dev->type, cp); - add_uevent_var(env, "MODALIAS=vio:T%sS%s", vio_dev->type, cp); return 0; } @@ -1693,7 +1689,7 @@ struct vio_dev *vio_find_node(struct device_node *vnode) /* construct the kobject name from the device node */ if (of_node_is_type(vnode_parent, "vdevice")) { const __be32 *prop; - + prop = of_get_property(vnode, "reg", NULL); if (!prop) goto out; |
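Note: the sketch below is illustrative only and is not part of the diff above. It shows, under stated assumptions, how another sequence-based RTAS retrieval could plug into the papr-rtas-common helpers introduced in this series, following the same begin/work/end pattern as the papr-vpd.c conversion shown earlier. Every my_* identifier and the my_rtas_call() wrapper are hypothetical placeholders, not existing kernel symbols.

/*
 * Hypothetical caller of the papr-rtas-common sequence API (sketch only).
 * Mirrors the wiring used by the papr-vpd.c conversion in this diff.
 */
#include <linux/fs.h>
#include <linux/sizes.h>
#include <asm/rtas-work-area.h>
#include "papr-rtas-common.h"

/* Placeholder parameter block, analogous to struct rtas_ibm_get_vpd_params. */
struct my_rtas_params {
	struct rtas_work_area *work_area;
	u32 sequence;
	u32 written;
	s32 status;
};

/* Placeholder RTAS-call wrapper (analogous to rtas_ibm_get_vpd()), assumed
 * to be implemented elsewhere; it fills the work area and updates status. */
int my_rtas_call(struct my_rtas_params *p);

/* begin(): allocate the work area and prime the parameter block. */
static void my_sequence_begin(struct papr_rtas_sequence *seq)
{
	struct my_rtas_params *p = seq->params;

	p->work_area = rtas_work_area_alloc(SZ_4K);
	p->sequence = 1;
	p->status = 0;
}

/* end(): release whatever begin() acquired. */
static void my_sequence_end(struct papr_rtas_sequence *seq)
{
	struct my_rtas_params *p = seq->params;

	rtas_work_area_free(p->work_area);
}

/* work(): called repeatedly by papr_rtas_blob_generate() until it returns NULL. */
static const char *my_sequence_fill_work_area(struct papr_rtas_sequence *seq,
					      size_t *len)
{
	struct my_rtas_params *p = seq->params;

	if (papr_rtas_sequence_should_stop(seq, p->status, p->written == 0))
		return NULL;

	if (papr_rtas_sequence_set_err(seq, my_rtas_call(p)))
		return NULL;

	*len = p->written;
	return rtas_work_area_raw_buf(p->work_area);
}

static const struct file_operations my_handle_ops = {
	.read		= papr_rtas_common_handle_read,
	.llseek		= papr_rtas_common_handle_seek,
	.release	= papr_rtas_common_handle_release,
};

/* Run the sequence and hand the accumulated blob to user space as an fd. */
static long my_create_handle(void)
{
	struct my_rtas_params params = {};
	struct papr_rtas_sequence seq = {
		.begin	= my_sequence_begin,
		.end	= my_sequence_end,
		.work	= my_sequence_fill_work_area,
		.params	= &params,
	};

	return papr_rtas_setup_file_interface(&seq, &my_handle_ops, "[my-rtas]");
}

A real caller would additionally serialize its RTAS call in the begin()/end() callbacks, as vpd_sequence_begin()/vpd_sequence_end() do with rtas_ibm_get_vpd_lock in the papr-vpd.c hunks above.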